Merge branch 'int' into feat/chatbot

commit 57118a633e
93 changed files with 13774 additions and 3691 deletions
@@ -29,4 +29,9 @@ Web_Search_MIN_RESULTS = 1
 # Web Crawl configuration
 Web_Crawl_TIMEOUT = 30
 Web_Crawl_MAX_RETRIES = 3
 Web_Crawl_RETRY_DELAY = 2
+
+# Web Research configuration
+Web_Research_MAX_DEPTH = 2
+Web_Research_MAX_LINKS_PER_DOMAIN = 4
+Web_Research_CRAWL_TIMEOUT_MINUTES = 10
@@ -66,14 +66,14 @@ Connector_AiAnthropic_MAX_TOKENS = 2000
 
 # Perplexity AI configuration
 Connector_AiPerplexity_API_URL = https://api.perplexity.ai/chat/completions
-Connector_AiPerplexity_API_SECRET = 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
+Connector_AiPerplexity_API_SECRET = DEV_ENC:Z0FBQUFBQm82Mzk2Q1MwZ0dNcUVBcUtuRDJIcTZkMXVvYnpjM3JEMzJiT1NKSHljX282ZDIyZTJYc09VSTdVNXAtOWU2UXp5S193NTk5dHJsWlFjRjhWektFOG1DVGY4ZUhHTXMzS0RPN1lNcF9nSlVWbW5BZ1hkZDVTejl6bVZNRFVvX29xamJidWRFMmtjQmkyRUQ2RUh6UTN1aWNPSUJBPT0=
 Connector_AiPerplexity_MODEL_NAME = sonar
 Connector_AiPerplexity_TEMPERATURE = 0.2
 Connector_AiPerplexity_MAX_TOKENS = 2000
 
 # Agent Mail configuration
 Service_MSFT_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c
-Service_MSFT_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpEQk4xYnpmbnItUEU3dHU4eHB5dzVYay1WT012RTRLUWJDTlBILVY5dC1FX3VMNjZmLThrbDRFNWFSNGprY3RRTlpYNGlubVBpNnY3MjNJcGtzVk9PMzRacl9LUlM2RU5vTVVZWHJvaUhWSHVfc1pNR0pfQmI5SEprOG5KdlB1QnQ=
+Service_MSFT_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQm83T29rV1pQelMtc1p1MXR4NTFpa19CTEhHQ0xfNmdPUmZqcWp5UHBMS0hYTGl4c1pPdmhTNTJVWUl5WnlnUUZhV0VTRzVCb0d5YjR1NnZPZk5CZ0dGazNGdUJVbjkxeVdrYlNiVjJUYzF2aVFtQnVxTHFqTTJqZlF0RTFGNmE1OGN1TEk=
 Service_MSFT_TENANT_ID = common
 
 # Google Service configuration
@@ -88,3 +88,7 @@ Connector_GoogleSpeech_API_KEY_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpETk5FWWM3Q0JKMzhI
 
 # Feature SyncDelta JIRA configuration
 Feature_SyncDelta_JIRA_DELTA_TOKEN_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpEbm0yRUJ6VUJKbUwyRW5kMnRaNW4wM2YxMkJUTXVXZUdmdVRCaUZIVHU2TTV2RWZLRmUtZkcwZE4yRUNlNDQ0aUJWYjNfdVg5YjV5c2JwMHhoUUYxZWdkeS11bXR0eGxRLWRVaVU3cUVQZWJlNDRtY1lWUDdqeDVFSlpXS0VFX21WajlRS3lHQjc0bS11akkybWV3QUFlR2hNWUNYLUdiRjZuN2dQODdDSExXWG1Dd2ZGclI2aUhlSWhETVZuY3hYdnhkb2c2LU1JTFBvWFpTNmZtMkNVOTZTejJwbDI2eGE0OS1xUlIwQnlCSmFxRFNCeVJNVzlOMDhTR1VUamx4RDRyV3p6Tk9qVHBrWWdySUM3TVRaYjd3N0JHMFhpdzFhZTNDLTFkRVQ2RVE4U19COXRhRWtNc0NVOHRqUS1CRDFpZ19xQmtFLU9YSDU3TXBZQXpVcld3PT0=
+
+# Debug Configuration
+APP_DEBUG_CHAT_WORKFLOW_ENABLED = True
+APP_DEBUG_CHAT_WORKFLOW_DIR = ./test-chat
@@ -66,14 +66,14 @@ Connector_AiAnthropic_MAX_TOKENS = 2000
 
 # Perplexity AI configuration
 Connector_AiPerplexity_API_URL = https://api.perplexity.ai/chat/completions
-Connector_AiPerplexity_API_SECRET = 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
+Connector_AiPerplexity_API_SECRET = INT_ENC:Z0FBQUFBQm82Mzk2UWZJdUFhSW8yc3RKc0tKRXphd0xWMkZOVlFpSGZ4SGhFWnk0cTF5VjlKQVZjdS1QSWdkS0pUSWw4OFU5MjUxdTVQel9aeWVIZTZ5TXRuVmFkZG0zWEdTOGdHMHpsTzI0TGlWYURKU1Q0VVpKTlhxUk5FTmN6SUJScDZ3ZldIaUJZcWpaQVRiSEpyQm9tRTNDWk9KTnZBPT0=
 Connector_AiPerplexity_MODEL_NAME = sonar
 Connector_AiPerplexity_TEMPERATURE = 0.2
 Connector_AiPerplexity_MAX_TOKENS = 2000
 
 # Agent Mail configuration
 Service_MSFT_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c
-Service_MSFT_CLIENT_SECRET = INT_ENC:Z0FBQUFBQm8xSVRjNzB2M3ZjaE1SVE9ON2FKam9yVURxcHl1Ym5VNVUtS0MyWUpNVXVlaWpWS2U3VVd3em9vQl9lcnVYay03bS04YjNBbDZZNTB4eUtjT3ppQjJjY3dOT0FNLW9LeDhIUU5iaTNqNURUWE5La3kzaHNGcU9yNVI0YjhWZTZRRFktcTk=
+Service_MSFT_CLIENT_SECRET = INT_ENC:Z0FBQUFBQm83T29rMDZvcV9qTG5xb1FzUkdqS1llbzRxSEJXbmpONFFtcUtfZXdtZjQybmJSMjBjMEpnRVhiOGRuczZvVFBFdVVTQV80SG9PSnRQTEpLdVViNm5wc2E5aGRLWjZ4TGF1QjVkNmdRSzBpNWNkYXVublFYclVEdEM5TVBBZWVVMW5RVWk=
 Service_MSFT_TENANT_ID = common
 
 # Google Service configuration
@@ -88,3 +88,7 @@ Connector_GoogleSpeech_API_KEY_SECRET = INT_ENC:Z0FBQUFBQm8xSVRkNmVXZ1pWcHcydTF2
 
 # Feature SyncDelta JIRA configuration
 Feature_SyncDelta_JIRA_DELTA_TOKEN_SECRET = INT_ENC:Z0FBQUFBQm8xSVRkTUNsWm4wX0p6eXFDZmJ4dFdHNEs1MV9MUzdrb3RzeC1jVWVYZ0REWHRyZkFiaGZLcUQtTXFBZzZkNzRmQ0gxbEhGbUNlVVFfR1JEQTc0aldkZkgyWnBOcjdlUlZxR0tDTEdKRExULXAyUEtsVmNTMkRKU1BJNnFiM0hlMXo4YndMcHlRMExtZDQ3Zm9vNFhMcEZCcHpBPT0=
+
+# Debug Configuration
+APP_DEBUG_CHAT_WORKFLOW_ENABLED = FALSE
+APP_DEBUG_CHAT_WORKFLOW_DIR = ./test-chat
@@ -66,14 +66,14 @@ Connector_AiAnthropic_MAX_TOKENS = 2000
 
 # Perplexity AI configuration
 Connector_AiPerplexity_API_URL = https://api.perplexity.ai/chat/completions
-Connector_AiPerplexity_API_SECRET = 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
+Connector_AiPerplexity_API_SECRET = PROD_ENC:Z0FBQUFBQm02Mzk2Q1FGRkJEUkI4LXlQbHYzT2RkdVJEcmM4WGdZTWpJTEhoeUF1NW5LUVpJdDBYN3k1WFN4a2FQSWJSQmd0U0xJbzZDTmFFN05FcXl0Z3V1OEpsZjYydV94TXVjVjVXRTRYSWdLMkd5XzZIbFV6emRCZHpuOUpQeThadE5xcDNDVGV1RHJrUEN0c1BBYXctZFNWcFRuVXhRPT0=
 Connector_AiPerplexity_MODEL_NAME = sonar
 Connector_AiPerplexity_TEMPERATURE = 0.2
 Connector_AiPerplexity_MAX_TOKENS = 2000
 
 # Agent Mail configuration
 Service_MSFT_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c
-Service_MSFT_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQm8xSU5pVEhHdlZHU3FNMmhuRGVwaGc3YzIxSjlZNzBCQjlOV2pSYVNXb0t1ZnVwQzZsQzY4cHMtVlZtNF85OEVaV1BMTzdXMmpzaGZpaG1DalJ0bkNPMHA5ZUcwZjNDdGk1TFdxYTJSZnVrVmhhZ2VRUEZxbjJOOGFhWk9EYlY3dmRVTnI=
+Service_MSFT_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQm83T29rSzdYLTRydXN5V3lQLXhmRjMyQ1FOaGpuek45QllaX1REN2s5aWNIUl81NGlrYlJTeFV0RlRZd0xPcm5uMDM4QlpibHJQbm5XZTlWeWxfcWNVdFpCUHI2amh0MVBnZ21IN2ptSkhWLTVfaHEwNmI5SEtiS05pQmt5eV8yMnhLMEc=
 Service_MSFT_TENANT_ID = common
 
 # Google Service configuration
@@ -88,3 +88,7 @@ Connector_GoogleSpeech_API_KEY_SECRET = PROD_ENC:Z0FBQUFBQm8xSU5pNjlJdmFMeERXUUQ
 
 # Feature SyncDelta JIRA configuration
 Feature_SyncDelta_JIRA_DELTA_TOKEN_SECRET = PROD_ENC:Z0FBQUFBQm8xSU5pTDhnTVNzRUhScU8wYnZsZk52bHFkSWxLc18xQmtCeC1HbnNwTzVBbXRNTmQzRjZYaGE2MVlCNGtnWDk1T2I5VXVKNHpKU1VRbXEyN2tRWUJnU2ltZE5qZ3lmNEF6Z1hMTTEwZkk2NUNBYjhmVTJEcWpRUW9HNEVpSGFWdjBWQXQ3eUtHUTFJS3U5QWpaeno0RFNhMUxnPT0=
+
+# Debug Configuration
+APP_DEBUG_CHAT_WORKFLOW_ENABLED = FALSE
+APP_DEBUG_CHAT_WORKFLOW_DIR = ./test-chat
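
Note: a minimal sketch of how the new Web Research and Debug settings could be read at runtime. It assumes APP_CONFIG exposes dict-style access with defaults, which matches the APP_CONFIG.get(...) usage added in the chat module further down; the helper name loadWebResearchSettings is illustrative only, not part of this change.

    from modules.shared.configuration import APP_CONFIG

    def loadWebResearchSettings() -> dict:
        # Defaults mirror the values added to the env files above.
        return {
            "max_depth": int(APP_CONFIG.get("Web_Research_MAX_DEPTH", 2)),
            "max_links_per_domain": int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", 4)),
            "crawl_timeout_minutes": int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", 10)),
            "debug_chat_workflow": APP_CONFIG.get("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False),
        }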
@@ -1,5 +1,6 @@
 import logging
 import httpx
+import os
 from typing import Dict, Any, List, Union
 from fastapi import HTTPException
 from modules.shared.configuration import APP_CONFIG
@@ -147,6 +148,11 @@ class AiAnthropic:
             # Direct content as string (in older API versions)
             content = anthropicResponse["content"]
 
+        # Debug logging for empty responses
+        if not content or content.strip() == "":
+            logger.warning(f"Anthropic API returned empty content. Full response: {anthropicResponse}")
+            content = "[Anthropic API returned empty response]"
+
         # Return in OpenAI format
         return {
             "id": anthropicResponse.get("id", ""),
@@ -182,14 +188,27 @@ class AiAnthropic:
             The analysis response as text
         """
         try:
+            # Debug logging
+            logger.info(f"callAiImage called with imageData type: {type(imageData)}, length: {len(imageData) if imageData else 0}, mimeType: {mimeType}")
+
             # Distinguish between file path and binary data
             if isinstance(imageData, str):
-                # It's a file path - import filehandling only when needed
-                from modules import agentserviceFilemanager as fileHandler
-                base64Data, autoMimeType = fileHandler.encodeFileToBase64(imageData)
-                mimeType = mimeType or autoMimeType
+                # Check if it's base64 encoded data or a file path
+                if len(imageData) > 100 and not os.path.exists(imageData):
+                    # It's likely base64 encoded data
+                    logger.info("Treating imageData as base64 encoded string")
+                    base64Data = imageData
+                    if not mimeType:
+                        mimeType = "image/png"
+                else:
+                    # It's a file path - import filehandling only when needed
+                    logger.info(f"Treating imageData as file path: {imageData}")
+                    from modules import agentserviceFilemanager as fileHandler
+                    base64Data, autoMimeType = fileHandler.encodeFileToBase64(imageData)
+                    mimeType = mimeType or autoMimeType
             else:
                 # It's binary data
+                logger.info("Treating imageData as binary data")
                 import base64
                 base64Data = base64.b64encode(imageData).decode('utf-8')
                 # MIME type must be specified for binary data
@@ -216,8 +235,16 @@ class AiAnthropic:
             # Use the existing callAiBasic function with the Vision model
             response = await self.callAiBasic(messages)
 
-            # Extract and return content
-            return response["choices"][0]["message"]["content"]
+            # Extract and return content with proper error handling
+            try:
+                content = response["choices"][0]["message"]["content"]
+                if content is None or content.strip() == "":
+                    return "[AI returned empty response for image analysis]"
+                return content
+            except (KeyError, IndexError, TypeError) as e:
+                logger.error(f"Error extracting content from AI response: {str(e)}")
+                logger.error(f"Response structure: {response}")
+                return f"[Error extracting AI response: {str(e)}]"
 
         except Exception as e:
             logger.error(f"Error during image analysis: {str(e)}", exc_info=True)
@@ -188,4 +188,83 @@ class AiOpenai:
 
         except Exception as e:
             logger.error(f"Error during image analysis: {str(e)}", exc_info=True)
             return f"[Error during image analysis: {str(e)}]"
+
+    async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid") -> Dict[str, Any]:
+        """
+        Generate an image using DALL-E 3.
+
+        Args:
+            prompt: The text prompt for image generation
+            size: Image size (1024x1024, 1792x1024, or 1024x1792)
+            quality: Image quality (standard or hd)
+            style: Image style (vivid or natural)
+
+        Returns:
+            Dictionary with success status and image data
+        """
+        try:
+            logger.debug(f"Starting image generation with prompt: '{prompt[:100]}...'")
+
+            # DALL-E 3 API endpoint
+            dalle_url = "https://api.openai.com/v1/images/generations"
+
+            payload = {
+                "model": "dall-e-3",
+                "prompt": prompt,
+                "size": size,
+                "quality": quality,
+                "style": style,
+                "n": 1,
+                "response_format": "b64_json"  # Get base64 data directly instead of URLs
+            }
+
+            # Create a separate client for DALL-E API calls
+            dalle_client = httpx.AsyncClient(
+                timeout=120.0,
+                headers={
+                    "Authorization": f"Bearer {self.apiKey}",
+                    "Content-Type": "application/json"
+                }
+            )
+
+            response = await dalle_client.post(
+                dalle_url,
+                json=payload
+            )
+
+            await dalle_client.aclose()
+
+            if response.status_code != 200:
+                logger.error(f"DALL-E API error: {response.status_code} - {response.text}")
+                return {
+                    "success": False,
+                    "error": f"DALL-E API error: {response.status_code} - {response.text}"
+                }
+
+            responseJson = response.json()
+
+            if "data" in responseJson and len(responseJson["data"]) > 0:
+                image_data = responseJson["data"][0]["b64_json"]
+
+                logger.info(f"Successfully generated image: {len(image_data)} characters")
+                return {
+                    "success": True,
+                    "image_data": image_data,
+                    "size": size,
+                    "quality": quality,
+                    "style": style
+                }
+            else:
+                logger.error("No image data in DALL-E response")
+                return {
+                    "success": False,
+                    "error": "No image data in DALL-E response"
+                }
+
+        except Exception as e:
+            logger.error(f"Error during image generation: {str(e)}", exc_info=True)
+            return {
+                "success": False,
+                "error": f"Error during image generation: {str(e)}"
+            }
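
Note: a hedged usage sketch for the new generateImage method. It assumes an AiOpenai instance is constructed elsewhere with a valid apiKey (the constructor is not part of this diff) and simply decodes the returned b64_json payload to disk.

    import base64

    async def demo_generate_image(openai_connector) -> None:
        # openai_connector is assumed to be an initialized AiOpenai instance.
        result = await openai_connector.generateImage(
            prompt="A watercolor map of a small harbour town",
            size="1024x1024",
            quality="standard",
            style="natural",
        )
        if result["success"]:
            # image_data is base64 (response_format="b64_json"), so decode before writing.
            with open("generated.png", "wb") as fh:
                fh.write(base64.b64decode(result["image_data"]))
        else:
            print(f"Image generation failed: {result['error']}")

    # Run with e.g. asyncio.run(demo_generate_image(connector)); connector setup is not shown in this diff.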
@@ -271,6 +271,7 @@ class ConnectorWeb:
         include_domains: list[str] | None = None,
         exclude_domains: list[str] | None = None,
         language: str | None = None,
+        country: str | None = None,
         include_answer: bool | None = None,
         include_raw_content: bool | None = None,
     ) -> list[WebSearchResult]:
@@ -290,17 +291,20 @@ class ConnectorWeb:
             kwargs["time_range"] = time_range
         if topic is not None:
             kwargs["topic"] = topic
-        if include_domains is not None:
+        if include_domains is not None and len(include_domains) > 0:
             kwargs["include_domains"] = include_domains
         if exclude_domains is not None:
             kwargs["exclude_domains"] = exclude_domains
         if language is not None:
             kwargs["language"] = language
+        if country is not None:
+            kwargs["country"] = country
         if include_answer is not None:
             kwargs["include_answer"] = include_answer
         if include_raw_content is not None:
             kwargs["include_raw_content"] = include_raw_content
 
+        logger.debug(f"Tavily.search kwargs: {kwargs}")
         response = await self.client.search(**kwargs)
 
         return [
@@ -135,3 +135,29 @@ class AiCallResponse(BaseModel):
     costEstimate: Optional[float] = Field(default=None, description="Estimated cost of the call")
 
 
+class EnhancedAiCallOptions(AiCallOptions):
+    """Enhanced options for improved document processing with chunk mapping."""
+
+    # Parallel processing
+    enableParallelProcessing: bool = Field(
+        default=True,
+        description="Enable parallel processing of chunks"
+    )
+    maxConcurrentChunks: int = Field(
+        default=5,
+        ge=1,
+        le=20,
+        description="Maximum number of chunks to process concurrently"
+    )
+
+    # Chunk mapping
+    preserveChunkMetadata: bool = Field(
+        default=True,
+        description="Preserve chunk metadata during processing"
+    )
+    chunkSeparator: str = Field(
+        default="\n\n---\n\n",
+        description="Separator between chunks in merged output"
+    )
+
+
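
Note: a hedged instantiation sketch for the new options model (imports omitted). It reuses the operationType keyword that the base class is constructed with elsewhere in this diff; no other AiCallOptions fields are assumed.

    # OperationType.ANALYSE_CONTENT is taken from the fallback mappings added below.
    options = EnhancedAiCallOptions(
        operationType=OperationType.ANALYSE_CONTENT,
        enableParallelProcessing=True,
        maxConcurrentChunks=8,            # must stay within ge=1 / le=20
        chunkSeparator="\n\n===\n\n",
    )
    print(options.preserveChunkMetadata)  # defaults to True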
modules/datamodels/datamodelDocument.py (new file, 130 lines)
@@ -0,0 +1,130 @@
+from typing import Any, Dict, List, Optional, Literal, Union
+from pydantic import BaseModel, Field
+from datetime import datetime
+
+
+class DocumentMetadata(BaseModel):
+    """Metadata for the entire document."""
+    title: str = Field(description="Document title")
+    author: Optional[str] = Field(default=None, description="Document author")
+    created_at: datetime = Field(default_factory=datetime.now, description="Creation timestamp")
+    source_documents: List[str] = Field(default_factory=list, description="Source document IDs")
+    extraction_method: str = Field(default="ai_extraction", description="Method used for extraction")
+    version: str = Field(default="1.0", description="Document version")
+
+
+class TableData(BaseModel):
+    """Structured table data."""
+    headers: List[str] = Field(description="Table column headers")
+    rows: List[List[str]] = Field(description="Table data rows")
+    caption: Optional[str] = Field(default=None, description="Table caption")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Table metadata")
+
+
+class ListItem(BaseModel):
+    """Individual list item with optional sub-items."""
+    text: str = Field(description="List item text")
+    subitems: Optional[List['ListItem']] = Field(default=None, description="Nested sub-items")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Item metadata")
+
+
+class BulletList(BaseModel):
+    """Bulleted or numbered list."""
+    items: List[ListItem] = Field(description="List items")
+    list_type: Literal["bullet", "numbered", "checklist"] = Field(default="bullet", description="List type")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="List metadata")
+
+
+class Paragraph(BaseModel):
+    """Text paragraph with optional formatting."""
+    text: str = Field(description="Paragraph text")
+    formatting: Optional[Dict[str, Any]] = Field(default=None, description="Text formatting (bold, italic, etc.)")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Paragraph metadata")
+
+
+class Heading(BaseModel):
+    """Document heading."""
+    text: str = Field(description="Heading text")
+    level: int = Field(ge=1, le=6, description="Heading level (1-6)")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Heading metadata")
+
+
+class CodeBlock(BaseModel):
+    """Code block with syntax highlighting."""
+    code: str = Field(description="Code content")
+    language: Optional[str] = Field(default=None, description="Programming language")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Code block metadata")
+
+
+class Image(BaseModel):
+    """Image with metadata."""
+    data: str = Field(description="Base64 encoded image data")
+    alt_text: Optional[str] = Field(default=None, description="Alternative text")
+    caption: Optional[str] = Field(default=None, description="Image caption")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Image metadata")
+
+
+class DocumentSection(BaseModel):
+    """A section of the document containing one or more content elements."""
+    id: str = Field(description="Unique section identifier")
+    title: Optional[str] = Field(default=None, description="Section title")
+    content_type: Literal["table", "list", "paragraph", "heading", "code", "image", "mixed"] = Field(description="Primary content type")
+    elements: List[Union[TableData, BulletList, Paragraph, Heading, CodeBlock, Image]] = Field(description="Content elements in this section")
+    order: int = Field(description="Section order in document")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Section metadata")
+
+
+class StructuredDocument(BaseModel):
+    """Complete structured document in JSON format."""
+    metadata: DocumentMetadata = Field(description="Document metadata")
+    sections: List[DocumentSection] = Field(description="Document sections")
+    summary: Optional[str] = Field(default=None, description="Document summary")
+    tags: List[str] = Field(default_factory=list, description="Document tags")
+
+    def get_sections_by_type(self, content_type: str) -> List[DocumentSection]:
+        """Get all sections of a specific content type."""
+        return [section for section in self.sections if section.content_type == content_type]
+
+    def get_all_tables(self) -> List[TableData]:
+        """Get all table data from the document."""
+        tables = []
+        for section in self.sections:
+            for element in section.elements:
+                if isinstance(element, TableData):
+                    tables.append(element)
+        return tables
+
+    def get_all_lists(self) -> List[BulletList]:
+        """Get all lists from the document."""
+        lists = []
+        for section in self.sections:
+            for element in section.elements:
+                if isinstance(element, BulletList):
+                    lists.append(element)
+        return lists
+
+
+class JsonChunkResult(BaseModel):
+    """Result from processing a single chunk with JSON output."""
+    chunk_id: str = Field(description="Chunk identifier")
+    document_section: DocumentSection = Field(description="Structured content from this chunk")
+    processing_time: float = Field(description="Processing time in seconds")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Chunk processing metadata")
+
+
+class JsonMergeResult(BaseModel):
+    """Result from merging multiple JSON chunks."""
+    merged_document: StructuredDocument = Field(description="Merged structured document")
+    merge_strategy: str = Field(description="Strategy used for merging")
+    chunks_processed: int = Field(description="Number of chunks processed")
+    merge_time: float = Field(description="Time taken to merge chunks")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Merge process metadata")
+
+
+# Update forward references (compatible with Pydantic v1 and v2)
+try:
+    # Pydantic v2
+    ListItem.model_rebuild()
+except AttributeError:
+    # Pydantic v1
+    ListItem.update_forward_refs()
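
Note: a minimal usage sketch of the new document models, using only classes defined in the file above.

    from modules.datamodels.datamodelDocument import (
        DocumentMetadata, DocumentSection, StructuredDocument, TableData,
    )

    doc = StructuredDocument(
        metadata=DocumentMetadata(title="Quarterly report"),
        sections=[
            DocumentSection(
                id="s1",
                title="Key figures",
                content_type="table",
                elements=[TableData(headers=["Metric", "Value"], rows=[["Revenue", "1.2M"]])],
                order=1,
            )
        ],
    )
    # Convenience accessors defined on StructuredDocument:
    assert len(doc.get_all_tables()) == 1
    assert doc.get_sections_by_type("table")[0].id == "s1"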
@@ -18,6 +18,16 @@ class ContentExtracted(BaseModel):
     summary: Optional[Dict[str, Any]] = Field(default=None, description="Optional extraction summary")
 
 
+class ChunkResult(BaseModel):
+    """Preserves the relationship between a chunk and its AI result."""
+    originalChunk: ContentPart
+    aiResult: str
+    chunkIndex: int
+    documentId: str
+    processingTime: float = 0.0
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+
+
 class MergeStrategy(BaseModel):
     """Strategy configuration for merging content parts and AI results."""
 
@@ -1,4 +1,5 @@
 import logging
+import asyncio
 from typing import Dict, Any, List, Union, Tuple, Optional
 from dataclasses import dataclass
 
@@ -260,6 +261,7 @@ class AiObjects:
         if not requiredTags:
             requiredTags = OPERATION_TAG_MAPPING.get(options.operationType, [ModelTags.TEXT, ModelTags.CHAT])
 
+
         # Override priority based on processing mode if not explicitly set
         effectivePriority = options.priority
         if options.priority == Priority.BALANCED:
@@ -268,6 +270,7 @@ class AiObjects:
         logger.info(f"Model selection - Operation: {options.operationType}, Required tags: {requiredTags}, Priority: {effectivePriority}")
 
         for name, info in aiModels.items():
+            logger.info(f"Checking model: {name}, tags: {info.get('tags', [])}, function: {info.get('function', 'unknown')}")
             # Check context length
             if info["contextLength"] > 0 and totalSize > info["contextLength"] * 0.8:
                 continue
@@ -279,8 +282,11 @@ class AiObjects:
 
             # Check required tags/capabilities
             modelTags = info.get("tags", [])
-            if requiredTags and not any(tag in modelTags for tag in requiredTags):
+            if requiredTags and not all(tag in modelTags for tag in requiredTags):
+                logger.info(f" -> Skipping {name}: missing required tags. Has: {modelTags}, needs: {requiredTags}")
                 continue
+            else:
+                logger.info(f" -> {name} passed tag check")
 
             # Check processing mode requirements
             if options.processingMode == ProcessingMode.DETAILED and ModelTags.FAST in modelTags:
@@ -288,16 +294,24 @@ class AiObjects:
                 continue
 
             candidates[name] = info
+            logger.info(f" -> {name} added to candidates")
+
+        logger.info(f"Final candidates: {list(candidates.keys())}")
 
         if not candidates:
+            logger.info("No candidates found, using fallback")
             # Fallback based on operation type
             if options.operationType == OperationType.IMAGE_ANALYSIS:
+                logger.info("Using fallback: openai_callAiImage")
                 return "openai_callAiImage"
             elif options.operationType == OperationType.IMAGE_GENERATION:
+                logger.info("Using fallback: openai_generateImage")
                 return "openai_generateImage"
             elif options.operationType == OperationType.WEB_RESEARCH:
+                logger.info("Using fallback: perplexity_callAiWithWebSearch")
                 return "perplexity_callAiWithWebSearch"
             else:
+                logger.info("Using fallback: openai_callAiBasic_gpt35")
                 return "openai_callAiBasic_gpt35"
 
         # Special handling for planning operations - use Claude for consistency
@@ -313,17 +327,60 @@ class AiObjects:
 
         # Select based on priority for other operations
         if effectivePriority == Priority.SPEED:
-            return max(candidates, key=lambda k: candidates[k]["speedRating"])
+            selected = max(candidates, key=lambda k: candidates[k]["speedRating"])
+            logger.info(f"Selected by SPEED: {selected}")
+            return selected
         elif effectivePriority == Priority.QUALITY:
-            return max(candidates, key=lambda k: candidates[k]["qualityRating"])
+            selected = max(candidates, key=lambda k: candidates[k]["qualityRating"])
+            logger.info(f"Selected by QUALITY: {selected}")
+            return selected
         elif effectivePriority == Priority.COST:
-            return min(candidates, key=lambda k: candidates[k]["costPer1kTokens"])
+            selected = min(candidates, key=lambda k: candidates[k]["costPer1kTokens"])
+            logger.info(f"Selected by COST: {selected}")
+            return selected
         else:  # BALANCED
             def balancedScore(name: str) -> float:
                 info = candidates[name]
                 return info["qualityRating"] * 0.4 + info["speedRating"] * 0.3 + (10 - info["costPer1kTokens"] * 1000) * 0.3
 
-            return max(candidates, key=balancedScore)
+            selected = max(candidates, key=balancedScore)
+            logger.info(f"Selected by BALANCED: {selected}")
+            return selected
+
+    def _getFallbackModels(self, operationType: str) -> List[str]:
+        """Get ordered list of fallback models for a given operation type."""
+        fallbackMappings = {
+            OperationType.GENERAL: [
+                "openai_callAiBasic_gpt35",   # Fast and reliable
+                "openai_callAiBasic",         # High quality
+                "anthropic_callAiBasic",      # Alternative high quality
+                "perplexity_callAiBasic"      # Cost effective
+            ],
+            OperationType.IMAGE_ANALYSIS: [
+                "openai_callAiImage",         # Primary image analysis
+                "anthropic_callAiImage"       # Alternative image analysis
+            ],
+            OperationType.IMAGE_GENERATION: [
+                "openai_generateImage"        # Only image generation model
+            ],
+            OperationType.WEB_RESEARCH: [
+                "perplexity_callAiWithWebSearch",  # Primary web research
+                "perplexity_callAiBasic",          # Alternative with web search
+                "openai_callAiBasic"               # Fallback to general model
+            ],
+            OperationType.GENERATE_PLAN: [
+                "anthropic_callAiBasic",      # Best for planning
+                "openai_callAiBasic",         # High quality alternative
+                "openai_callAiBasic_gpt35"    # Fast fallback
+            ],
+            OperationType.ANALYSE_CONTENT: [
+                "anthropic_callAiBasic",      # Best for analysis
+                "openai_callAiBasic",         # High quality alternative
+                "openai_callAiBasic_gpt35"    # Fast fallback
+            ]
+        }
+
+        return fallbackMappings.get(operationType, fallbackMappings[OperationType.GENERAL])
+
     def _connectorFor(self, modelName: str):
         """Get the appropriate connector for the model."""
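
Note: a quick worked example of the BALANCED score above, with illustrative numbers only. A candidate with qualityRating 8, speedRating 6 and costPer1kTokens 0.002 scores 8 * 0.4 + 6 * 0.3 + (10 - 0.002 * 1000) * 0.3 = 3.2 + 1.8 + 2.4 = 7.4, so quality carries the largest weight while cheap models pick up the remaining cost term.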
@@ -340,7 +397,7 @@ class AiObjects:
             raise ValueError(f"Unknown connector type: {connectorType}")
 
     async def call(self, request: AiCallRequest) -> AiCallResponse:
-        """Call AI model for text generation."""
+        """Call AI model for text generation with fallback mechanism."""
         prompt = request.prompt
         context = request.context or ""
         options = request.options
@@ -357,9 +414,6 @@ class AiObjects:
         if options.compressContext and len(context.encode("utf-8")) > 70000:
             context = maybeTruncate(context, 70000)
 
-        # Select model for text generation
-        modelName = self._selectModel(prompt, context, options)
-
         # Derive generation parameters
         temperature = getattr(options, "temperature", None)
         if temperature is None:
@@ -376,58 +430,112 @@ class AiObjects:
             messages.append({"role": "system", "content": f"Context from documents:\n{context}"})
         messages.append({"role": "user", "content": prompt})
 
-        connector = self._connectorFor(modelName)
-        functionName = aiModels[modelName]["function"]
-
-        # Call the appropriate function
-        if functionName == "callAiBasic":
-            if aiModels[modelName]["connector"] == "openai":
-                content = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens)
-            elif aiModels[modelName]["connector"] == "perplexity":
-                content = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens)
-            else:
-                response = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens)
-                content = response["choices"][0]["message"]["content"]
-        elif functionName == "callAiWithWebSearch":
-            # Perplexity web search function
-            query = prompt
-            if context:
-                query = f"Context: {context}\n\nQuery: {prompt}"
-            content = await connector.callAiWithWebSearch(query)
-        elif functionName == "researchTopic":
-            # Perplexity research function
-            content = await connector.researchTopic(prompt)
-        elif functionName == "answerQuestion":
-            # Perplexity question answering function
-            content = await connector.answerQuestion(prompt, context)
-        elif functionName == "getCurrentNews":
-            # Perplexity news function
-            content = await connector.getCurrentNews(prompt)
-        else:
-            raise ValueError(f"Function {functionName} not supported for text generation")
-
-        # Estimate cost/tokens
-        totalSize = len((prompt + context).encode("utf-8"))
-        cost = self._estimateCost(aiModels[modelName], totalSize)
-        usedTokens = int(totalSize / 4)
-
-        return AiCallResponse(content=content, modelName=modelName, usedTokens=usedTokens, costEstimate=cost)
+        # Get fallback models for this operation type
+        fallbackModels = self._getFallbackModels(options.operationType)
+
+        # Try primary model first, then fallbacks
+        lastError = None
+        for attempt, modelName in enumerate(fallbackModels):
+            try:
+                logger.info(f"Attempting AI call with model: {modelName} (attempt {attempt + 1}/{len(fallbackModels)})")
+
+                connector = self._connectorFor(modelName)
+                functionName = aiModels[modelName]["function"]
+
+                # Call the appropriate function
+                if functionName == "callAiBasic":
+                    if aiModels[modelName]["connector"] == "openai":
+                        content = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens)
+                    elif aiModels[modelName]["connector"] == "perplexity":
+                        content = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens)
+                    else:
+                        response = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens)
+                        content = response["choices"][0]["message"]["content"]
+                elif functionName == "callAiWithWebSearch":
+                    # Perplexity web search function
+                    query = prompt
+                    if context:
+                        query = f"Context: {context}\n\nQuery: {prompt}"
+                    content = await connector.callAiWithWebSearch(query)
+                elif functionName == "researchTopic":
+                    # Perplexity research function
+                    content = await connector.researchTopic(prompt)
+                elif functionName == "answerQuestion":
+                    # Perplexity question answering function
+                    content = await connector.answerQuestion(prompt, context)
+                elif functionName == "getCurrentNews":
+                    # Perplexity news function
+                    content = await connector.getCurrentNews(prompt)
+                else:
+                    raise ValueError(f"Function {functionName} not supported for text generation")
+
+                # Success! Estimate cost/tokens and return
+                totalSize = len((prompt + context).encode("utf-8"))
+                cost = self._estimateCost(aiModels[modelName], totalSize)
+                usedTokens = int(totalSize / 4)
+
+                logger.info(f"✅ AI call successful with model: {modelName}")
+                return AiCallResponse(content=content, modelName=modelName, usedTokens=usedTokens, costEstimate=cost)
+
+            except Exception as e:
+                lastError = e
+                logger.warning(f"❌ AI call failed with model {modelName}: {str(e)}")
+
+                # If this is not the last model, try the next one
+                if attempt < len(fallbackModels) - 1:
+                    logger.info(f"🔄 Trying next fallback model...")
+                    continue
+                else:
+                    # All models failed
+                    logger.error(f"💥 All {len(fallbackModels)} models failed for operation {options.operationType}")
+                    break
+
+        # All fallback attempts failed
+        errorMsg = f"All AI models failed for operation {options.operationType}. Last error: {str(lastError)}"
+        logger.error(errorMsg)
+        raise Exception(errorMsg)
 
     async def callImage(self, prompt: str, imageData: Union[str, bytes], mimeType: str = None, options: AiCallOptions = None) -> str:
-        """Call AI model for image analysis."""
+        """Call AI model for image analysis with fallback mechanism."""
         if options is None:
             options = AiCallOptions(operationType=OperationType.IMAGE_ANALYSIS)
 
-        # Select model for image analysis
-        modelName = self._selectModel(prompt, "", options)
-
-        connector = self._connectorFor(modelName)
-        functionName = aiModels[modelName]["function"]
-
-        if functionName == "callAiImage":
-            return await connector.callAiImage(prompt, imageData, mimeType)
-        else:
-            raise ValueError(f"Function {functionName} not supported for image analysis")
+        # Get fallback models for image analysis
+        fallbackModels = self._getFallbackModels(OperationType.IMAGE_ANALYSIS)
+
+        # Try primary model first, then fallbacks
+        lastError = None
+        for attempt, modelName in enumerate(fallbackModels):
+            try:
+                logger.info(f"Attempting image analysis with model: {modelName} (attempt {attempt + 1}/{len(fallbackModels)})")
+
+                connector = self._connectorFor(modelName)
+                functionName = aiModels[modelName]["function"]
+
+                if functionName == "callAiImage":
+                    content = await connector.callAiImage(prompt, imageData, mimeType)
+                    logger.info(f"✅ Image analysis successful with model: {modelName}")
+                    return content
+                else:
+                    raise ValueError(f"Function {functionName} not supported for image analysis")
+
+            except Exception as e:
+                lastError = e
+                logger.warning(f"❌ Image analysis failed with model {modelName}: {str(e)}")
+
+                # If this is not the last model, try the next one
+                if attempt < len(fallbackModels) - 1:
+                    logger.info(f"🔄 Trying next fallback model for image analysis...")
+                    continue
+                else:
+                    # All models failed
+                    logger.error(f"💥 All {len(fallbackModels)} models failed for image analysis")
+                    break
+
+        # All fallback attempts failed
+        errorMsg = f"All AI models failed for image analysis. Last error: {str(lastError)}"
+        logger.error(errorMsg)
+        raise Exception(errorMsg)
 
     async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid", options: AiCallOptions = None) -> Dict[str, Any]:
         """Generate an image using AI."""
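
Note: a hedged call sketch for the new fallback path. It assumes AiCallRequest accepts the prompt/context/options fields read above and that an AiObjects instance (here aiObjects) exists; neither construction is shown in this diff.

    # Hypothetical wiring: AiCallRequest/aiObjects construction is assumed, not part of this change.
    async def demo_call(aiObjects) -> None:
        request = AiCallRequest(
            prompt="Summarise the attached notes",
            context="Notes: ...",
            options=AiCallOptions(operationType=OperationType.GENERAL),
        )
        # call() now walks the GENERAL fallback list (gpt35 -> openai -> anthropic -> perplexity).
        response = await aiObjects.call(request)
        print(response.modelName, response.costEstimate)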
@@ -694,7 +802,22 @@ class AiObjects:
             logger.warning(f"Failed to extract links from content: {e}")
             return []
 
-    async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10) -> Dict[str, str]:
+    def _normalizeUrl(self, url: str) -> str:
+        """Normalize URL to handle variations that should be considered duplicates."""
+        if not url:
+            return url
+
+        # Remove trailing slashes and fragments
+        url = url.rstrip('/')
+        if '#' in url:
+            url = url.split('#')[0]
+
+        # Handle common URL variations
+        url = url.replace('http://', 'https://')  # Normalize protocol
+
+        return url
+
+    async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10, global_processed_urls: Optional[set] = None) -> Dict[str, str]:
         """
         Recursively crawl URLs up to specified depth.
 
@@ -703,76 +826,100 @@ class AiObjects:
             max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.)
             extract_depth: Tavily extract depth setting
             max_per_domain: Maximum URLs per domain per level
+            global_processed_urls: Optional global set to track processed URLs across sessions
 
         Returns:
             Dictionary mapping URL -> content for all crawled pages
         """
         logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")
 
-        # URL index to track all processed URLs
+        # URL index to track all processed URLs (local + global)
         processed_urls = set()
+        if global_processed_urls is not None:
+            # Use global index if provided, otherwise create local one
+            processed_urls = global_processed_urls
+            logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs")
+        else:
+            logger.info("Using local URL index for this crawl session")
+
         all_content = {}
 
         # Current level URLs to process
         current_level_urls = urls.copy()
 
-        for depth in range(1, max_depth + 1):
-            logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
-            logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")
-
-            # URLs found at this level (for next iteration)
-            next_level_urls = []
-
-            for url in current_level_urls:
-                if url in processed_urls:
-                    logger.debug(f"URL {url} already processed, skipping")
-                    continue
-
-                try:
-                    logger.info(f"Processing URL at depth {depth}: {url}")
-
-                    # Read page content
-                    content = await self.readPage(url, extract_depth)
-                    if content:
-                        all_content[url] = content
-                        processed_urls.add(url)
-                        logger.info(f"✓ Successfully processed {url}: {len(content)} chars")
-
-                        # Get URLs from this page for next level
-                        page_urls = await self.getUrlsFromPage(url, extract_depth)
-                        logger.info(f"Found {len(page_urls)} URLs on {url}")
-
-                        # Filter URLs and add to next level
-                        filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain)
-                        logger.info(f"Filtered to {len(filtered_urls)} valid URLs")
-
-                        # Add new URLs to next level (avoiding already processed ones)
-                        new_urls_count = 0
-                        for new_url in filtered_urls:
-                            if new_url not in processed_urls:
-                                next_level_urls.append(new_url)
-                                new_urls_count += 1
-
-                        logger.info(f"Added {new_urls_count} new URLs to next level from {url}")
-                    else:
-                        logger.warning(f"✗ No content extracted from {url}")
-                        processed_urls.add(url)  # Mark as processed to avoid retry
-
-                except Exception as e:
-                    logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
-                    processed_urls.add(url)  # Mark as processed to avoid retry
-
-            # Prepare for next iteration
-            current_level_urls = next_level_urls
-            logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")
-
-            # Stop if no more URLs to process
-            if not current_level_urls:
-                logger.info(f"No more URLs found at depth {depth}, stopping recursion")
-                break
-
-        logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
-        return all_content
+        try:
+            for depth in range(1, max_depth + 1):
+                logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
+                logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")
+
+                # URLs found at this level (for next iteration)
+                next_level_urls = []
+
+                for url in current_level_urls:
+                    # Normalize URL for duplicate checking
+                    normalized_url = self._normalizeUrl(url)
+                    if normalized_url in processed_urls:
+                        logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping")
+                        continue
+
+                    try:
+                        logger.info(f"Processing URL at depth {depth}: {url}")
+                        logger.debug(f"Total processed URLs so far: {len(processed_urls)}")
+
+                        # Read page content
+                        content = await self.readPage(url, extract_depth)
+                        if content:
+                            all_content[url] = content
+                            processed_urls.add(normalized_url)
+                            logger.info(f"✓ Successfully processed {url}: {len(content)} chars")
+
+                            # Get URLs from this page for next level
+                            page_urls = await self.getUrlsFromPage(url, extract_depth)
+                            logger.info(f"Found {len(page_urls)} URLs on {url}")
+
+                            # Filter URLs and add to next level
+                            filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain)
+                            logger.info(f"Filtered to {len(filtered_urls)} valid URLs")
+
+                            # Add new URLs to next level (avoiding already processed ones)
+                            new_urls_count = 0
+                            for new_url in filtered_urls:
+                                normalized_new_url = self._normalizeUrl(new_url)
+                                if normalized_new_url not in processed_urls:
+                                    next_level_urls.append(new_url)
+                                    new_urls_count += 1
+                                else:
+                                    logger.debug(f"URL {new_url} (normalized: {normalized_new_url}) already processed, skipping")
+
+                            logger.info(f"Added {new_urls_count} new URLs to next level from {url}")
+                        else:
+                            logger.warning(f"✗ No content extracted from {url}")
+                            processed_urls.add(normalized_url)  # Mark as processed to avoid retry
+
+                    except Exception as e:
+                        logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
+                        processed_urls.add(normalized_url)  # Mark as processed to avoid retry
+
+                # Prepare for next iteration
+                current_level_urls = next_level_urls
+                logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")
+
+                # Stop if no more URLs to process
+                if not current_level_urls:
+                    logger.info(f"No more URLs found at depth {depth}, stopping recursion")
+                    break
+
+            logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
+            logger.info(f"Total URLs processed (including skipped): {len(processed_urls)}")
+            logger.info(f"Unique URLs found: {len(all_content)}")
+            return all_content
+
+        except asyncio.TimeoutError:
+            logger.warning(f"Crawling timed out, returning partial results: {len(all_content)} pages crawled so far")
+            return all_content
+        except Exception as e:
+            logger.error(f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far")
+            return all_content
 
     async def webQuery(self, query: str, context: str = "", options: AiCallOptions = None) -> str:
         """Use Perplexity AI to provide the best answers for web-related queries."""
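
Note: a usage sketch for the new global_processed_urls parameter, assuming an AiObjects-style instance (here aiWeb) exposing crawlRecursively as defined above. Sharing one set lets a second crawl session skip pages the first one already normalized and stored.

    async def crawl_two_sessions(aiWeb) -> None:
        shared_index: set = set()
        # First session populates the shared URL index.
        first = await aiWeb.crawlRecursively(
            ["https://example.com"], max_depth=2, global_processed_urls=shared_index
        )
        # Second session skips everything already recorded in shared_index.
        second = await aiWeb.crawlRecursively(
            ["https://example.com/docs"], max_depth=2, global_processed_urls=shared_index
        )
        print(len(first), len(second), len(shared_index))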
@@ -571,8 +571,10 @@ class ChatObjects:
             actionName=createdMessage.get("actionName")
         )
 
-        # Debug: Store message and documents for debugging TODO REMOVE
-        self._storeDebugMessageAndDocuments(chat_message)
+        # Debug: Store message and documents for debugging - only if debug enabled
+        debug_enabled = APP_CONFIG.get("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
+        if debug_enabled:
+            self._storeDebugMessageAndDocuments(chat_message)
 
         return chat_message
 
@@ -1052,8 +1054,11 @@ class ChatObjects:
 
     def _storeDebugMessageAndDocuments(self, message: ChatMessage) -> None:
         """
-        Store message and documents for debugging purposes in fileshare.
-        Structure: gateway/test-chat/messages/m_round_task_action_timestamp/documentlist_label/documents
+        Store message and documents (metadata and file bytes) for debugging purposes.
+        Structure: gateway/test-chat/messages/m_round_task_action_timestamp/documentlist_label/
+            - message.json, message_text.txt
+            - document_###_metadata.json
+            - document_###_<original_filename> (actual file bytes)
 
         Args:
             message: ChatMessage object to store
@@ -1156,6 +1161,26 @@ class ChatObjects:
                     json.dump(doc_meta, f, indent=2, ensure_ascii=False, default=str)
 
                 logger.info(f"Debug: Stored document metadata for {doc.fileName}")
 
+                # Also store the actual file bytes next to metadata for debugging
+                try:
+                    # Lazy import to avoid circular deps at module load
+                    from modules.interfaces import interfaceDbComponentObjects as comp
+                    componentInterface = comp.getInterface(self.currentUser)
+                    file_bytes = componentInterface.getFileData(doc.fileId)
+                    if file_bytes:
+                        # Build a safe filename preserving original name
+                        safe_name = doc.fileName or f"document_{i+1:03d}"
+                        # Avoid path traversal
+                        safe_name = os.path.basename(safe_name)
+                        doc_file_path = os.path.join(label_folder, f"document_{i+1:03d}_" + safe_name)
+                        with open(doc_file_path, "wb") as df:
+                            df.write(file_bytes)
+                        logger.info(f"Debug: Stored document file bytes: {doc_file_path} ({len(file_bytes)} bytes)")
+                    else:
+                        logger.warning(f"Debug: No file bytes returned for fileId {doc.fileId}")
+                except Exception as e:
+                    logger.error(f"Debug: Failed to store document file for {doc.fileName} (fileId {doc.fileId}): {e}")
+
             logger.info(f"Debug: Stored message and documents in {message_path}")
 
@@ -95,8 +95,8 @@ async def update_prompt(
             detail=f"Prompt with ID {promptId} not found"
         )
 
-    # Convert Prompt to dict for interface
-    update_data = promptData.dict()
+    # Convert Prompt to dict for interface, excluding the id field
+    update_data = promptData.dict(exclude={'id'})
 
     # Update prompt
     updatedPrompt = managementInterface.updatePrompt(promptId, update_data)
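A minimal sketch of the behaviour relied on here, with Prompt fields assumed purely for illustration: Pydantic's dict(exclude=...) drops the named keys, so the id from the path parameter stays authoritative.

    prompt_data = Prompt(id="p-123", name="summarize", text="Summarize {{KEY:notes}}")
    prompt_data.dict(exclude={'id'})
    # -> {'name': 'summarize', 'text': 'Summarize {{KEY:notes}}'}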
@@ -14,7 +14,7 @@ from pydantic import BaseModel
 
 # Import auth modules
 from modules.security.auth import getCurrentUser, limiter, SECRET_KEY, ALGORITHM
-from modules.security.jwtService import createAccessToken, createRefreshToken, setAccessTokenCookie, setRefreshTokenCookie
+from modules.security.jwtService import createAccessToken, createRefreshToken, setAccessTokenCookie, setRefreshTokenCookie, clearAccessTokenCookie, clearRefreshTokenCookie
 from modules.interfaces.interfaceDbAppObjects import getInterface, getRootInterface
 from modules.datamodels.datamodelUam import User, UserInDB, AuthAuthority, UserPrivilege
 from modules.datamodels.datamodelSecurity import Token
@@ -263,8 +263,7 @@ async def read_user_me(
 @limiter.limit("60/minute")
 async def refresh_token(
     request: Request,
-    response: Response,
-    currentUser: User = Depends(getCurrentUser)
+    response: Response
 ) -> Dict[str, Any]:
     """Refresh access token using refresh token from cookie"""
     try:
@@ -283,12 +282,27 @@ async def refresh_token(
         except jwt.JWTError:
             raise HTTPException(status_code=401, detail="Invalid refresh token")
 
+        # Get user information from refresh token payload
+        user_id = payload.get("userId")
+        if not user_id:
+            raise HTTPException(status_code=401, detail="Invalid refresh token - missing user ID")
+
+        # Get user from database using the user ID from refresh token
+        try:
+            app_interface = getRootInterface()
+            current_user = app_interface.getUser(user_id)
+            if not current_user:
+                raise HTTPException(status_code=401, detail="User not found")
+        except Exception as e:
+            logger.error(f"Failed to get user from database: {str(e)}")
+            raise HTTPException(status_code=500, detail="Failed to validate user")
+
         # Create new token data
         token_data = {
-            "sub": currentUser.username,
-            "mandateId": str(currentUser.mandateId),
-            "userId": str(currentUser.id),
-            "authenticationAuthority": currentUser.authenticationAuthority
+            "sub": current_user.username,
+            "mandateId": str(current_user.mandateId),
+            "userId": str(current_user.id),
+            "authenticationAuthority": current_user.authenticationAuthority
         }
 
         # Create new access token + set cookie
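The payload used above is presumably produced by decoding the refresh-token cookie just before this hunk; a minimal sketch of that step, assuming a python-jose style jwt.decode and the refresh_token cookie name used elsewhere in this diff:

    refresh_token_value = request.cookies.get("refresh_token")
    payload = jwt.decode(refresh_token_value, SECRET_KEY, algorithms=[ALGORITHM])
    user_id = payload.get("userId")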
@@ -365,15 +379,18 @@ async def logout(request: Request, response: Response, currentUser: User = Depen
             # Don't fail if audit logging fails
             pass
 
-        # Clear httpOnly cookies
-        response.delete_cookie(key="auth_token", httponly=True, samesite="strict")
-        response.delete_cookie(key="refresh_token", httponly=True, samesite="strict")
-
-        return JSONResponse({
+        # Create the JSON response first
+        json_response = JSONResponse({
             "message": "Successfully logged out - cookies cleared",
             "revokedTokens": revoked
         })
 
+        # Clear httpOnly cookies on the response we're actually returning
+        clearAccessTokenCookie(json_response)
+        clearRefreshTokenCookie(json_response)
+
+        return json_response
+
     except Exception as e:
         logger.error(f"Error during logout: {str(e)}")
         raise HTTPException(
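The rework matters because FastAPI applies cookies set on the injected response parameter only when the handler returns plain data; once the handler returns its own JSONResponse, deletions have to be applied to that object. A tiny sketch of the failing pattern that the old code exhibited (values assumed):

    # cookies set on the injected `response` are ignored once the handler returns its own Response
    response.delete_cookie("auth_token")
    return JSONResponse({"message": "ok"})   # this object never carries the deletion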
@@ -17,6 +17,11 @@ ALGORITHM = APP_CONFIG.get("Auth_ALGORITHM")
 ACCESS_TOKEN_EXPIRE_MINUTES = int(APP_CONFIG.get("APP_TOKEN_EXPIRY"))
 REFRESH_TOKEN_EXPIRE_DAYS = int(APP_CONFIG.get("APP_REFRESH_TOKEN_EXPIRY", "7"))
 
+# Cookie security settings - use secure cookies based on whether API uses HTTPS
+# Cookies must have secure=True on HTTPS sites, secure=False on HTTP sites
+APP_API_URL = APP_CONFIG.get("APP_API_URL", "http://localhost:8000")
+USE_SECURE_COOKIES = APP_API_URL.startswith("https://") if APP_API_URL else False
+
 
 def createAccessToken(data: dict, expiresDelta: Optional[timedelta] = None) -> Tuple[str, "datetime"]:
     """Create a JWT access token and return (token, expiresAt)."""
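As an illustration of how the flag resolves (URLs assumed, not taken from the diff):

    APP_API_URL = "https://api.example.com"   # -> USE_SECURE_COOKIES = True
    APP_API_URL = "http://localhost:8000"     # -> USE_SECURE_COOKIES = False, so local development over plain HTTP still receives the cookies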
@@ -52,8 +57,9 @@ def setAccessTokenCookie(response: Response, token: str, expiresDelta: Optional[
         key="auth_token",
         value=token,
         httponly=True,
-        secure=True,
+        secure=USE_SECURE_COOKIES,  # Only secure in production (HTTPS)
         samesite="strict",
+        path="/",
         max_age=maxAge
     )
@@ -64,9 +70,46 @@ def setRefreshTokenCookie(response: Response, token: str) -> None:
         key="refresh_token",
         value=token,
         httponly=True,
-        secure=True,
+        secure=USE_SECURE_COOKIES,  # Only secure in production (HTTPS)
         samesite="strict",
+        path="/",
         max_age=REFRESH_TOKEN_EXPIRE_DAYS * 24 * 60 * 60
     )
 
+
+def clearAccessTokenCookie(response: Response) -> None:
+    """
+    Clear access token cookie by setting it to expire immediately.
+    Uses both raw header manipulation and FastAPI's delete_cookie for maximum browser compatibility.
+    """
+    # Build secure flag based on environment
+    secure_flag = "; Secure" if USE_SECURE_COOKIES else ""
+
+    # Primary method: Raw Set-Cookie header for guaranteed deletion
+    response.headers.append(
+        "Set-Cookie",
+        f"auth_token=deleted; Path=/; Max-Age=0; Expires=Thu, 01 Jan 1970 00:00:00 GMT; HttpOnly{secure_flag}; SameSite=Strict"
+    )
+
+    # Fallback: Also use FastAPI's built-in method
+    response.delete_cookie(key="auth_token", path="/")
+
+
+def clearRefreshTokenCookie(response: Response) -> None:
+    """
+    Clear refresh token cookie by setting it to expire immediately.
+    Uses both raw header manipulation and FastAPI's delete_cookie for maximum browser compatibility.
+    """
+    # Build secure flag based on environment
+    secure_flag = "; Secure" if USE_SECURE_COOKIES else ""
+
+    # Primary method: Raw Set-Cookie header for guaranteed deletion
+    response.headers.append(
+        "Set-Cookie",
+        f"refresh_token=deleted; Path=/; Max-Age=0; Expires=Thu, 01 Jan 1970 00:00:00 GMT; HttpOnly{secure_flag}; SameSite=Strict"
+    )
+
+    # Fallback: Also use FastAPI's built-in method
+    response.delete_cookie(key="refresh_token", path="/")
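A quick way to sanity-check the deletion behaviour, sketched under the assumption that the response headers expose Starlette's Headers.getlist (both deletion paths above write a Set-Cookie header):

    from fastapi.responses import JSONResponse

    resp = JSONResponse({"message": "ok"})
    clearAccessTokenCookie(resp)
    assert any("auth_token=deleted" in h for h in resp.headers.getlist("set-cookie"))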
File diff suppressed because it is too large

modules/services/serviceAi/subCoreAi.py (596 lines, new file)

@@ -0,0 +1,596 @@
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, List, Optional, Tuple, Union
|
||||||
|
from modules.datamodels.datamodelChat import PromptPlaceholder, ChatDocument
|
||||||
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, ModelCapabilities, OperationType, Priority
|
||||||
|
from modules.interfaces.interfaceAiObjects import AiObjects
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class SubCoreAi:
|
||||||
|
"""Core AI operations including image analysis, text generation, and planning calls."""
|
||||||
|
|
||||||
|
def __init__(self, services, aiObjects):
|
||||||
|
"""Initialize core AI operations.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
services: Service center instance for accessing other services
|
||||||
|
aiObjects: Initialized AiObjects instance
|
||||||
|
"""
|
||||||
|
self.services = services
|
||||||
|
self.aiObjects = aiObjects
|
||||||
|
|
||||||
|
# AI Processing Call
|
||||||
|
async def callAi(
|
||||||
|
self,
|
||||||
|
prompt: str,
|
||||||
|
documents: Optional[List[ChatDocument]] = None,
|
||||||
|
placeholders: Optional[List[PromptPlaceholder]] = None,
|
||||||
|
options: Optional[AiCallOptions] = None,
|
||||||
|
outputFormat: Optional[str] = None,
|
||||||
|
title: Optional[str] = None,
|
||||||
|
documentProcessor=None,
|
||||||
|
documentGenerator=None
|
||||||
|
) -> Union[str, Dict[str, Any]]:
|
||||||
|
"""
|
||||||
|
Unified AI call interface that automatically routes to appropriate handler.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
prompt: The main prompt for the AI call
|
||||||
|
documents: Optional list of documents to process
|
||||||
|
placeholders: Optional list of placeholder replacements for planning calls
|
||||||
|
options: AI call configuration options
|
||||||
|
outputFormat: Optional output format (html, pdf, docx, txt, md, json, csv, xlsx) for document generation
|
||||||
|
title: Optional title for generated documents
|
||||||
|
documentProcessor: Document processing service instance
|
||||||
|
documentGenerator: Document generation service instance
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
AI response as string, or dict with documents if outputFormat is specified
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Exception: If all available models fail
|
||||||
|
"""
|
||||||
|
if options is None:
|
||||||
|
options = AiCallOptions()
|
||||||
|
|
||||||
|
# Normalize placeholders from List[PromptPlaceholder]
|
||||||
|
placeholders_dict: Dict[str, str] = {}
|
||||||
|
placeholders_meta: Dict[str, bool] = {}
|
||||||
|
if placeholders:
|
||||||
|
placeholders_dict = {p.label: p.content for p in placeholders}
|
||||||
|
placeholders_meta = {p.label: bool(getattr(p, 'summaryAllowed', False)) for p in placeholders}
|
||||||
|
|
||||||
|
# Auto-determine call type based on documents and operation type
|
||||||
|
call_type = self._determineCallType(documents, options.operationType)
|
||||||
|
options.callType = call_type
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Build the full prompt that will be sent to AI
|
||||||
|
if placeholders:
|
||||||
|
full_prompt = prompt
|
||||||
|
for p in placeholders:
|
||||||
|
placeholder = f"{{{{KEY:{p.label}}}}}"
|
||||||
|
full_prompt = full_prompt.replace(placeholder, p.content)
|
||||||
|
else:
|
||||||
|
full_prompt = prompt
|
||||||
|
|
||||||
|
self._writeAiResponseDebug(
|
||||||
|
label='ai_prompt_debug',
|
||||||
|
content=full_prompt,
|
||||||
|
partIndex=1,
|
||||||
|
modelName=None,
|
||||||
|
continuation=False
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Handle document generation with specific output format
|
||||||
|
if outputFormat and documentGenerator:
|
||||||
|
result = await documentGenerator.callAiWithDocumentGeneration(prompt, documents, options, outputFormat, title)
|
||||||
|
# Log AI response for debugging
|
||||||
|
try:
|
||||||
|
if isinstance(result, dict) and 'content' in result:
|
||||||
|
self._writeAiResponseDebug(
|
||||||
|
label='ai_document_generation',
|
||||||
|
content=result['content'],
|
||||||
|
partIndex=1,
|
||||||
|
modelName=None, # Document generation doesn't return model info
|
||||||
|
continuation=False
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return result
|
||||||
|
|
||||||
|
if call_type == "planning":
|
||||||
|
result = await self._callAiPlanning(prompt, placeholders_dict, placeholders_meta, options)
|
||||||
|
# Log AI response for debugging
|
||||||
|
try:
|
||||||
|
self._writeAiResponseDebug(
|
||||||
|
label='ai_planning',
|
||||||
|
content=result or "",
|
||||||
|
partIndex=1,
|
||||||
|
modelName=None, # Planning doesn't return model info
|
||||||
|
continuation=False
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return result
|
||||||
|
else:
|
||||||
|
# Set processDocumentsIndividually from the legacy parameter if not set in options
|
||||||
|
if options.processDocumentsIndividually is None and documents:
|
||||||
|
options.processDocumentsIndividually = False # Default to batch processing
|
||||||
|
|
||||||
|
# For text calls, we need to build the full prompt with placeholders here
|
||||||
|
# since _callAiText doesn't handle placeholders directly
|
||||||
|
if placeholders_dict:
|
||||||
|
full_prompt = self._buildPromptWithPlaceholders(prompt, placeholders_dict)
|
||||||
|
else:
|
||||||
|
full_prompt = prompt
|
||||||
|
|
||||||
|
if documentProcessor and documents:
|
||||||
|
result = await documentProcessor.callAiText(full_prompt, documents, options)
|
||||||
|
else:
|
||||||
|
# Fallback to direct AI call if no document processor available
|
||||||
|
request = AiCallRequest(
|
||||||
|
prompt=full_prompt,
|
||||||
|
context="",
|
||||||
|
options=options
|
||||||
|
)
|
||||||
|
response = await self.aiObjects.call(request)
|
||||||
|
result = response.content
|
||||||
|
|
||||||
|
# Log AI response for debugging (additional logging for text calls)
|
||||||
|
try:
|
||||||
|
self._writeAiResponseDebug(
|
||||||
|
label='ai_text_main',
|
||||||
|
content=result or "",
|
||||||
|
partIndex=1,
|
||||||
|
modelName=None, # Text calls already log internally
|
||||||
|
continuation=False
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return result
|
||||||
|
|
||||||
|
# AI Image Analysis
|
||||||
|
async def readImage(
|
||||||
|
self,
|
||||||
|
prompt: str,
|
||||||
|
imageData: Union[str, bytes],
|
||||||
|
mimeType: str = None,
|
||||||
|
options: Optional[AiCallOptions] = None,
|
||||||
|
) -> str:
|
||||||
|
"""Call AI for image analysis using interface.callImage()."""
|
||||||
|
try:
|
||||||
|
# Check if imageData is valid
|
||||||
|
if not imageData:
|
||||||
|
error_msg = "No image data provided"
|
||||||
|
self.services.utils.debugLogToFile(f"Error in AI image analysis: {error_msg}", "AI_SERVICE")
|
||||||
|
logger.error(f"Error in AI image analysis: {error_msg}")
|
||||||
|
return f"Error: {error_msg}"
|
||||||
|
|
||||||
|
self.services.utils.debugLogToFile(f"readImage called with prompt, imageData type: {type(imageData)}, length: {len(imageData) if imageData else 0}, mimeType: {mimeType}", "AI_SERVICE")
|
||||||
|
logger.info(f"readImage called with prompt, imageData type: {type(imageData)}, length: {len(imageData) if imageData else 0}, mimeType: {mimeType}")
|
||||||
|
|
||||||
|
# Always use IMAGE_ANALYSIS operation type for image processing
|
||||||
|
if options is None:
|
||||||
|
options = AiCallOptions(operationType=OperationType.IMAGE_ANALYSIS)
|
||||||
|
else:
|
||||||
|
# Override the operation type to ensure image analysis
|
||||||
|
options.operationType = OperationType.IMAGE_ANALYSIS
|
||||||
|
|
||||||
|
self.services.utils.debugLogToFile(f"Calling aiObjects.callImage with operationType: {options.operationType}", "AI_SERVICE")
|
||||||
|
logger.info(f"Calling aiObjects.callImage with operationType: {options.operationType}")
|
||||||
|
result = await self.aiObjects.callImage(prompt, imageData, mimeType, options)
|
||||||
|
|
||||||
|
# Debug the result
|
||||||
|
self.services.utils.debugLogToFile(f"Raw AI result type: {type(result)}, value: {repr(result)}", "AI_SERVICE")
|
||||||
|
|
||||||
|
# Check if result is valid
|
||||||
|
if not result or (isinstance(result, str) and not result.strip()):
|
||||||
|
error_msg = f"No response from AI image analysis (result: {repr(result)})"
|
||||||
|
self.services.utils.debugLogToFile(f"Error in AI image analysis: {error_msg}", "AI_SERVICE")
|
||||||
|
logger.error(f"Error in AI image analysis: {error_msg}")
|
||||||
|
return f"Error: {error_msg}"
|
||||||
|
|
||||||
|
self.services.utils.debugLogToFile(f"callImage returned: {result[:200]}..." if len(result) > 200 else result, "AI_SERVICE")
|
||||||
|
logger.info(f"callImage returned: {result[:200]}..." if len(result) > 200 else result)
|
||||||
|
return result
|
||||||
|
except Exception as e:
|
||||||
|
self.services.utils.debugLogToFile(f"Error in AI image analysis: {str(e)}", "AI_SERVICE")
|
||||||
|
logger.error(f"Error in AI image analysis: {str(e)}")
|
||||||
|
return f"Error: {str(e)}"
|
||||||
|
|
||||||
|
# AI Image Generation
|
||||||
|
async def generateImage(
|
||||||
|
self,
|
||||||
|
prompt: str,
|
||||||
|
size: str = "1024x1024",
|
||||||
|
quality: str = "standard",
|
||||||
|
style: str = "vivid",
|
||||||
|
options: Optional[AiCallOptions] = None,
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""Generate an image using AI using interface.generateImage()."""
|
||||||
|
try:
|
||||||
|
return await self.aiObjects.generateImage(prompt, size, quality, style, options)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error in AI image generation: {str(e)}")
|
||||||
|
return {"success": False, "error": str(e)}
|
||||||
|
|
||||||
|
def _determineCallType(self, documents: Optional[List[ChatDocument]], operation_type: str) -> str:
|
||||||
|
"""
|
||||||
|
Determine call type based on documents and operation type.
|
||||||
|
|
||||||
|
Criteria: no documents AND operationType is "generate_plan" -> planning
|
||||||
|
All other cases -> text
|
||||||
|
"""
|
||||||
|
has_documents = documents is not None and len(documents) > 0
|
||||||
|
is_planning_operation = operation_type == OperationType.GENERATE_PLAN
|
||||||
|
|
||||||
|
if not has_documents and is_planning_operation:
|
||||||
|
return "planning"
|
||||||
|
else:
|
||||||
|
return "text"
|
||||||
|
|
||||||
|
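A minimal illustration of the routing rule above; core is an assumed SubCoreAi instance and some_chat_document a placeholder ChatDocument, neither taken from the diff:

    core._determineCallType(documents=None, operation_type=OperationType.GENERATE_PLAN)                  # -> "planning"
    core._determineCallType(documents=[some_chat_document], operation_type=OperationType.GENERATE_PLAN)  # -> "text"
    core._determineCallType(documents=None, operation_type=OperationType.GENERAL)                        # -> "text"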
async def _callAiPlanning(
|
||||||
|
self,
|
||||||
|
prompt: str,
|
||||||
|
placeholders: Optional[Dict[str, str]],
|
||||||
|
placeholdersMeta: Optional[Dict[str, bool]],
|
||||||
|
options: AiCallOptions
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
Handle planning calls with placeholder system and selective summarization.
|
||||||
|
"""
|
||||||
|
# Build full prompt with placeholders; if too large, summarize summaryAllowed placeholders proportionally
|
||||||
|
effective_placeholders = placeholders or {}
|
||||||
|
full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders)
|
||||||
|
|
||||||
|
if options.compressPrompt and placeholdersMeta:
|
||||||
|
# Determine model capacity
|
||||||
|
try:
|
||||||
|
caps = self._getModelCapabilitiesForContent(full_prompt, None, options)
|
||||||
|
max_bytes = caps.get("maxContextBytes", len(full_prompt.encode("utf-8")))
|
||||||
|
except Exception:
|
||||||
|
max_bytes = len(full_prompt.encode("utf-8"))
|
||||||
|
|
||||||
|
current_bytes = len(full_prompt.encode("utf-8"))
|
||||||
|
if current_bytes > max_bytes:
|
||||||
|
# Compute total bytes contributed by allowed placeholders (approximate by content length)
|
||||||
|
allowed_labels = [l for l, allow in placeholdersMeta.items() if allow]
|
||||||
|
allowed_sizes = {l: len((effective_placeholders.get(l) or "").encode("utf-8")) for l in allowed_labels}
|
||||||
|
total_allowed = sum(allowed_sizes.values())
|
||||||
|
|
||||||
|
overage = current_bytes - max_bytes
|
||||||
|
if total_allowed > 0 and overage > 0:
|
||||||
|
# Target total for allowed after reduction
|
||||||
|
target_allowed = max(total_allowed - overage, 0)
|
||||||
|
# Global ratio to apply across allowed placeholders
|
||||||
|
ratio = target_allowed / total_allowed if total_allowed > 0 else 1.0
|
||||||
|
ratio = max(0.0, min(1.0, ratio))
|
||||||
|
|
||||||
|
reduced: Dict[str, str] = {}
|
||||||
|
for label, content in effective_placeholders.items():
|
||||||
|
if label in allowed_labels and isinstance(content, str) and len(content) > 0:
|
||||||
|
old_len = len(content)
|
||||||
|
# Reduce by proportional ratio on characters (fallback if empty)
|
||||||
|
reduction_factor = ratio if old_len > 0 else 1.0
|
||||||
|
reduced[label] = self._reduceText(content, reduction_factor)
|
||||||
|
else:
|
||||||
|
reduced[label] = content
|
||||||
|
|
||||||
|
effective_placeholders = reduced
|
||||||
|
full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders)
|
||||||
|
|
||||||
|
# If still slightly over, perform a second-pass fine adjustment with updated ratio
|
||||||
|
current_bytes = len(full_prompt.encode("utf-8"))
|
||||||
|
if current_bytes > max_bytes and total_allowed > 0:
|
||||||
|
overage2 = current_bytes - max_bytes
|
||||||
|
# Recompute allowed sizes after first reduction
|
||||||
|
allowed_sizes2 = {l: len((effective_placeholders.get(l) or "").encode("utf-8")) for l in allowed_labels}
|
||||||
|
total_allowed2 = sum(allowed_sizes2.values())
|
||||||
|
if total_allowed2 > 0 and overage2 > 0:
|
||||||
|
target_allowed2 = max(total_allowed2 - overage2, 0)
|
||||||
|
ratio2 = target_allowed2 / total_allowed2
|
||||||
|
ratio2 = max(0.0, min(1.0, ratio2))
|
||||||
|
reduced2: Dict[str, str] = {}
|
||||||
|
for label, content in effective_placeholders.items():
|
||||||
|
if label in allowed_labels and isinstance(content, str) and len(content) > 0:
|
||||||
|
old_len = len(content)
|
||||||
|
reduction_factor = ratio2 if old_len > 0 else 1.0
|
||||||
|
reduced2[label] = self._reduceText(content, reduction_factor)
|
||||||
|
else:
|
||||||
|
reduced2[label] = content
|
||||||
|
effective_placeholders = reduced2
|
||||||
|
full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders)
|
||||||
|
|
||||||
|
|
||||||
|
# Make AI call using AiObjects (let it handle model selection)
|
||||||
|
request = AiCallRequest(
|
||||||
|
prompt=full_prompt,
|
||||||
|
context="", # Context is already included in the prompt
|
||||||
|
options=options
|
||||||
|
)
|
||||||
|
response = await self.aiObjects.call(request)
|
||||||
|
try:
|
||||||
|
logger.debug(f"AI model selected (planning): {getattr(response, 'modelName', 'unknown')}")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return response.content
|
||||||
|
|
||||||
|
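A worked example of the proportional reduction above, with numbers assumed for illustration: if the assembled prompt is 120,000 bytes, the selected model allows max_bytes of 100,000, and the summary-allowed placeholders together contribute 60,000 bytes, then the overage is 20,000, the target for those placeholders is 40,000, and the ratio is 40000 / 60000, roughly 0.67, so each summary-allowed placeholder is truncated to about two thirds of its length before the prompt is rebuilt; the second pass repeats the same calculation on whatever overage remains.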
async def _callAiDirect(
|
||||||
|
self,
|
||||||
|
prompt: str,
|
||||||
|
documents: Optional[List[ChatDocument]],
|
||||||
|
options: AiCallOptions,
|
||||||
|
documentProcessor=None
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Call AI directly with prompt and documents for JSON output.
|
||||||
|
Used for multi-file generation - uses the existing generation pipeline.
|
||||||
|
"""
|
||||||
|
# Use the existing generation pipeline that already works
|
||||||
|
# This ensures proper document processing and content extraction
|
||||||
|
logger.info(f"Using existing generation pipeline for {len(documents) if documents else 0} documents")
|
||||||
|
|
||||||
|
if documentProcessor:
|
||||||
|
# Process documents with JSON merging using the existing pipeline
|
||||||
|
result = await documentProcessor.processDocumentsPerChunkJson(documents, prompt, options)
|
||||||
|
else:
|
||||||
|
# Fallback to simple AI call
|
||||||
|
request = AiCallRequest(
|
||||||
|
prompt=prompt,
|
||||||
|
context="",
|
||||||
|
options=options
|
||||||
|
)
|
||||||
|
response = await self.aiObjects.call(request)
|
||||||
|
result = {"metadata": {"title": "AI Response"}, "sections": [{"id": "section_1", "content_type": "paragraph", "elements": [{"text": response.content}]}]}
|
||||||
|
|
||||||
|
# Convert single-file result to multi-file format if needed
|
||||||
|
if "sections" in result and "documents" not in result:
|
||||||
|
logger.info("Converting single-file result to multi-file format")
|
||||||
|
# This is a single-file result, convert it to multi-file format
|
||||||
|
return {
|
||||||
|
"metadata": result.get("metadata", {"title": "Converted Document"}),
|
||||||
|
"documents": [{
|
||||||
|
"id": "doc_1",
|
||||||
|
"title": result.get("metadata", {}).get("title", "Document"),
|
||||||
|
"filename": "document.txt",
|
||||||
|
"sections": result.get("sections", [])
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List[ChatDocument]], options: AiCallOptions) -> Dict[str, int]:
|
||||||
|
"""
|
||||||
|
Get model capabilities for content processing, including appropriate size limits for chunking.
|
||||||
|
"""
|
||||||
|
# Estimate total content size
|
||||||
|
prompt_size = len(prompt.encode('utf-8'))
|
||||||
|
document_size = 0
|
||||||
|
if documents:
|
||||||
|
# Rough estimate of document content size
|
||||||
|
for doc in documents:
|
||||||
|
document_size += doc.fileSize or 0
|
||||||
|
|
||||||
|
total_size = prompt_size + document_size
|
||||||
|
|
||||||
|
# Use AiObjects to select the best model for this content size
|
||||||
|
# We'll simulate the model selection by checking available models
|
||||||
|
from modules.interfaces.interfaceAiObjects import aiModels
|
||||||
|
|
||||||
|
# Find the best model for this content size and operation
|
||||||
|
best_model = None
|
||||||
|
best_context_length = 0
|
||||||
|
|
||||||
|
for model_name, model_info in aiModels.items():
|
||||||
|
context_length = model_info.get("contextLength", 0)
|
||||||
|
|
||||||
|
# Skip models with no context length or too small for content
|
||||||
|
if context_length == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check if model supports the operation type
|
||||||
|
capabilities = model_info.get("capabilities", [])
|
||||||
|
if options.operationType == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities:
|
||||||
|
continue
|
||||||
|
elif options.operationType == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities:
|
||||||
|
continue
|
||||||
|
elif options.operationType == OperationType.WEB_RESEARCH and "web_search" not in capabilities:
|
||||||
|
continue
|
||||||
|
elif "text_generation" not in capabilities:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Prefer models that can handle the content without chunking, but allow chunking if needed
|
||||||
|
if context_length >= total_size * 0.8: # 80% of content size
|
||||||
|
if context_length > best_context_length:
|
||||||
|
best_model = model_info
|
||||||
|
best_context_length = context_length
|
||||||
|
elif best_model is None: # Fallback to largest available model
|
||||||
|
if context_length > best_context_length:
|
||||||
|
best_model = model_info
|
||||||
|
best_context_length = context_length
|
||||||
|
|
||||||
|
# Fallback to a reasonable default if no model found
|
||||||
|
if best_model is None:
|
||||||
|
best_model = {
|
||||||
|
"contextLength": 128000, # GPT-4o default
|
||||||
|
"llmName": "gpt-4o"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Calculate appropriate sizes
|
||||||
|
# Convert tokens to bytes (rough estimate: 1 token ≈ 4 characters)
|
||||||
|
context_length_bytes = int(best_model["contextLength"] * 4)
|
||||||
|
max_context_bytes = int(context_length_bytes * 0.9) # 90% of context length
|
||||||
|
text_chunk_size = int(max_context_bytes * 0.7) # 70% of max context for text chunks
|
||||||
|
image_chunk_size = int(max_context_bytes * 0.8) # 80% of max context for image chunks
|
||||||
|
|
||||||
|
logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}")
|
||||||
|
logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes")
|
||||||
|
logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes")
|
||||||
|
|
||||||
|
return {
|
||||||
|
"maxContextBytes": max_context_bytes,
|
||||||
|
"textChunkSize": text_chunk_size,
|
||||||
|
"imageChunkSize": image_chunk_size
|
||||||
|
}
|
||||||
|
|
||||||
|
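Plugging in the 128,000-token GPT-4o fallback named in the code above makes the derived limits concrete:

    context_length_bytes = 128000 * 4          # 512,000 bytes (roughly 4 characters per token)
    max_context_bytes    = int(512000 * 0.9)   # 460,800 bytes
    text_chunk_size      = int(460800 * 0.7)   # 322,560 bytes
    image_chunk_size     = int(460800 * 0.8)   # 368,640 bytes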
def _getModelsForOperation(self, operation_type: str, options: AiCallOptions) -> List[ModelCapabilities]:
|
||||||
|
"""
|
||||||
|
Get models capable of handling the specific operation with capability filtering.
|
||||||
|
"""
|
||||||
|
# Use the actual AI objects model selection instead of hardcoded default
|
||||||
|
if hasattr(self, 'aiObjects') and self.aiObjects:
|
||||||
|
# Let AiObjects handle the model selection
|
||||||
|
return []
|
||||||
|
else:
|
||||||
|
# Fallback to default model if AiObjects not available
|
||||||
|
default_model = ModelCapabilities(
|
||||||
|
name="default",
|
||||||
|
maxTokens=4000,
|
||||||
|
capabilities=["text", "reasoning"] if operation_type == "planning" else ["text"],
|
||||||
|
costPerToken=0.001,
|
||||||
|
processingTime=1.0,
|
||||||
|
isAvailable=True
|
||||||
|
)
|
||||||
|
return [default_model]
|
||||||
|
|
||||||
|
def _buildPromptWithPlaceholders(self, prompt: str, placeholders: Optional[Dict[str, str]]) -> str:
|
||||||
|
"""
|
||||||
|
Build full prompt by replacing placeholders with their content.
|
||||||
|
Uses the new {{KEY:placeholder}} format.
|
||||||
|
"""
|
||||||
|
if not placeholders:
|
||||||
|
return prompt
|
||||||
|
|
||||||
|
full_prompt = prompt
|
||||||
|
for placeholder, content in placeholders.items():
|
||||||
|
# Replace both old format {{placeholder}} and new format {{KEY:placeholder}}
|
||||||
|
full_prompt = full_prompt.replace(f"{{{{{placeholder}}}}}", content)
|
||||||
|
full_prompt = full_prompt.replace(f"{{{{KEY:{placeholder}}}}}", content)
|
||||||
|
|
||||||
|
return full_prompt
|
||||||
|
|
||||||
|
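A small usage sketch of the replacement rule above, with strings assumed for illustration:

    prompt = "Summarize {{KEY:notes}} for {{audience}}."
    placeholders = {"notes": "Q3 status update", "audience": "the board"}
    # both the legacy {{label}} form and the {{KEY:label}} form are substituted
    # -> "Summarize Q3 status update for the board."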
def _writeAiResponseDebug(self, label: str, content: str, partIndex: int = 1, modelName: str = None, continuation: bool = None) -> None:
|
||||||
|
"""Persist raw AI response parts for debugging under test-chat/ai - only if debug enabled."""
|
||||||
|
try:
|
||||||
|
# Check if debug logging is enabled
|
||||||
|
debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
||||||
|
if not debug_enabled:
|
||||||
|
return
|
||||||
|
|
||||||
|
import os
|
||||||
|
from datetime import datetime, UTC
|
||||||
|
# Base dir: gateway/test-chat/ai (go up 4 levels from this file)
|
||||||
|
# .../gateway/modules/services/serviceAi/subCoreAi.py -> up to gateway root
|
||||||
|
gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
||||||
|
outDir = os.path.join(gatewayDir, 'test-chat', 'ai')
|
||||||
|
os.makedirs(outDir, exist_ok=True)
|
||||||
|
ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3]
|
||||||
|
suffix = []
|
||||||
|
if partIndex is not None:
|
||||||
|
suffix.append(f"part{partIndex}")
|
||||||
|
if continuation is not None:
|
||||||
|
suffix.append(f"cont_{str(continuation).lower()}")
|
||||||
|
if modelName:
|
||||||
|
safeModel = ''.join(c if c.isalnum() or c in ('-', '_') else '-' for c in modelName)
|
||||||
|
suffix.append(safeModel)
|
||||||
|
suffixStr = ('_' + '_'.join(suffix)) if suffix else ''
|
||||||
|
fname = f"{ts}_{label}{suffixStr}.txt"
|
||||||
|
fpath = os.path.join(outDir, fname)
|
||||||
|
with open(fpath, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(content or '')
|
||||||
|
except Exception:
|
||||||
|
# Do not raise; best-effort debug write
|
||||||
|
pass
|
||||||
|
|
||||||
|
def _exceedsTokenLimit(self, text: str, model: ModelCapabilities, safety_margin: float) -> bool:
|
||||||
|
"""
|
||||||
|
Check if text exceeds model token limit with safety margin.
|
||||||
|
"""
|
||||||
|
# Simple character-based estimation (4 chars per token)
|
||||||
|
estimated_tokens = len(text) // 4
|
||||||
|
max_tokens = int(model.maxTokens * (1 - safety_margin))
|
||||||
|
return estimated_tokens > max_tokens
|
||||||
|
|
||||||
|
def _reducePlanningPrompt(
|
||||||
|
self,
|
||||||
|
full_prompt: str,
|
||||||
|
placeholders: Optional[Dict[str, str]],
|
||||||
|
model: ModelCapabilities,
|
||||||
|
options: AiCallOptions
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
Reduce planning prompt size by summarizing placeholders while preserving prompt structure.
|
||||||
|
"""
|
||||||
|
if not placeholders:
|
||||||
|
return self._reduceText(full_prompt, 0.7)
|
||||||
|
|
||||||
|
# Reduce placeholders while preserving prompt
|
||||||
|
reduced_placeholders = {}
|
||||||
|
for placeholder, content in placeholders.items():
|
||||||
|
if len(content) > 1000: # Only reduce long content
|
||||||
|
reduction_factor = 0.7
|
||||||
|
reduced_content = self._reduceText(content, reduction_factor)
|
||||||
|
reduced_placeholders[placeholder] = reduced_content
|
||||||
|
else:
|
||||||
|
reduced_placeholders[placeholder] = content
|
||||||
|
|
||||||
|
return self._buildPromptWithPlaceholders(full_prompt, reduced_placeholders)
|
||||||
|
|
||||||
|
def _reduceTextPrompt(
|
||||||
|
self,
|
||||||
|
prompt: str,
|
||||||
|
context: str,
|
||||||
|
model: ModelCapabilities,
|
||||||
|
options: AiCallOptions
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
Reduce text prompt size using typeGroup-aware chunking and merging.
|
||||||
|
"""
|
||||||
|
max_size = int(model.maxTokens * (1 - options.safetyMargin))
|
||||||
|
|
||||||
|
if options.compressPrompt:
|
||||||
|
# Reduce both prompt and context
|
||||||
|
target_size = max_size
|
||||||
|
current_size = len(prompt) + len(context)
|
||||||
|
reduction_factor = (target_size * 0.7) / current_size
|
||||||
|
|
||||||
|
if reduction_factor < 1.0:
|
||||||
|
prompt = self._reduceText(prompt, reduction_factor)
|
||||||
|
context = self._reduceText(context, reduction_factor)
|
||||||
|
else:
|
||||||
|
# Only reduce context, preserve prompt integrity
|
||||||
|
max_context_size = max_size - len(prompt)
|
||||||
|
if len(context) > max_context_size:
|
||||||
|
reduction_factor = max_context_size / len(context)
|
||||||
|
context = self._reduceText(context, reduction_factor)
|
||||||
|
|
||||||
|
return prompt + "\n\n" + context if context else prompt
|
||||||
|
|
||||||
|
def _extractTextFromContentParts(self, extracted_content) -> str:
|
||||||
|
"""
|
||||||
|
Extract text content from ExtractionService ContentPart objects.
|
||||||
|
"""
|
||||||
|
if not extracted_content or not hasattr(extracted_content, 'parts'):
|
||||||
|
return ""
|
||||||
|
|
||||||
|
text_parts = []
|
||||||
|
for part in extracted_content.parts:
|
||||||
|
if hasattr(part, 'typeGroup') and part.typeGroup in ['text', 'table', 'structure']:
|
||||||
|
if hasattr(part, 'data') and part.data:
|
||||||
|
text_parts.append(part.data)
|
||||||
|
|
||||||
|
return "\n\n".join(text_parts)
|
||||||
|
|
||||||
|
def _reduceText(self, text: str, reduction_factor: float) -> str:
|
||||||
|
"""
|
||||||
|
Reduce text size by the specified factor.
|
||||||
|
"""
|
||||||
|
if reduction_factor >= 1.0:
|
||||||
|
return text
|
||||||
|
|
||||||
|
target_length = int(len(text) * reduction_factor)
|
||||||
|
return text[:target_length] + "... [reduced]"
|
||||||
modules/services/serviceAi/subDocumentGeneration.py (804 lines, new file)

@@ -0,0 +1,804 @@
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, List, Optional, Tuple, Union
|
||||||
|
from modules.datamodels.datamodelChat import ChatDocument
|
||||||
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class SubDocumentGeneration:
|
||||||
|
"""Document generation operations including single-file and multi-file generation."""
|
||||||
|
|
||||||
|
def __init__(self, services, aiObjects, documentProcessor):
|
||||||
|
"""Initialize document generation service.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
services: Service center instance for accessing other services
|
||||||
|
aiObjects: Initialized AiObjects instance
|
||||||
|
documentProcessor: Document processing service instance
|
||||||
|
"""
|
||||||
|
self.services = services
|
||||||
|
self.aiObjects = aiObjects
|
||||||
|
self.documentProcessor = documentProcessor
|
||||||
|
|
||||||
|
async def callAiWithDocumentGeneration(
|
||||||
|
self,
|
||||||
|
prompt: str,
|
||||||
|
documents: Optional[List[ChatDocument]],
|
||||||
|
options: AiCallOptions,
|
||||||
|
outputFormat: str,
|
||||||
|
title: Optional[str]
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Handle AI calls with document generation in specific output format.
|
||||||
|
Now supports both single-file and multi-file generation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
prompt: The main prompt for the AI call
|
||||||
|
documents: Optional list of documents to process
|
||||||
|
options: AI call configuration options
|
||||||
|
outputFormat: Target output format (html, pdf, docx, txt, md, json, csv, xlsx)
|
||||||
|
title: Optional title for generated documents
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with generated documents and metadata
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Use AI to analyze prompt intent
|
||||||
|
prompt_analysis = await self._analyzePromptIntent(prompt, self)
|
||||||
|
logger.info(f"Prompt analysis result: {prompt_analysis}")
|
||||||
|
|
||||||
|
if prompt_analysis.get("is_multi_file", False):
|
||||||
|
return await self._callAiWithMultiFileGeneration(
|
||||||
|
prompt, documents, options, outputFormat, title, prompt_analysis
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
return await self._callAiWithSingleFileGeneration(
|
||||||
|
prompt, documents, options, outputFormat, title
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error in document generation: {str(e)}")
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"error": str(e),
|
||||||
|
"content": "",
|
||||||
|
"rendered_content": "",
|
||||||
|
"mime_type": "text/plain",
|
||||||
|
"filename": f"error_{outputFormat}",
|
||||||
|
"format": outputFormat,
|
||||||
|
"title": title or "Error",
|
||||||
|
"documents": []
|
||||||
|
}
|
||||||
|
|
||||||
|
async def _callAiWithSingleFileGeneration(
|
||||||
|
self,
|
||||||
|
prompt: str,
|
||||||
|
documents: Optional[List[ChatDocument]],
|
||||||
|
options: AiCallOptions,
|
||||||
|
outputFormat: str,
|
||||||
|
title: Optional[str],
|
||||||
|
generationPrompt: Optional[str] = None
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""Handle single-file document generation (existing functionality)."""
|
||||||
|
try:
|
||||||
|
# Get format-specific extraction prompt from generation service
|
||||||
|
from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
|
||||||
|
generation_service = GenerationService(self.services)
|
||||||
|
|
||||||
|
# Use default title if not provided
|
||||||
|
if not title:
|
||||||
|
title = "AI Generated Document"
|
||||||
|
|
||||||
|
# Get format-specific extraction prompt
|
||||||
|
extractionPrompt = await generation_service.getExtractionPrompt(
|
||||||
|
outputFormat=outputFormat,
|
||||||
|
userPrompt=prompt,
|
||||||
|
title=title,
|
||||||
|
aiService=self
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process documents with format-specific prompt using JSON mode
|
||||||
|
# This ensures structured JSON output instead of text
|
||||||
|
aiResponseJson = await self._callAiJson(extractionPrompt, documents, options)
|
||||||
|
|
||||||
|
# Validate JSON response
|
||||||
|
if not isinstance(aiResponseJson, dict) or "sections" not in aiResponseJson:
|
||||||
|
raise Exception("AI response is not valid JSON document structure")
|
||||||
|
|
||||||
|
# Emit raw extracted data as a chat message attachment before rendering
|
||||||
|
try:
|
||||||
|
await self._postRawDataChatMessage(aiResponseJson, label="raw_extraction_single")
|
||||||
|
except Exception:
|
||||||
|
logger.warning("Failed to emit raw extraction chat message (single-file)")
|
||||||
|
|
||||||
|
# Generate filename from document metadata
|
||||||
|
parsedFilename = None
|
||||||
|
try:
|
||||||
|
if aiResponseJson.get("metadata", {}).get("title"):
|
||||||
|
title = aiResponseJson["metadata"]["title"]
|
||||||
|
# Clean title for filename
|
||||||
|
import re
|
||||||
|
parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", title)
|
||||||
|
parsed = re.sub(r"-+", "-", parsed).strip('-')
|
||||||
|
if parsed:
|
||||||
|
parsedFilename = f"{parsed}.{outputFormat}"
|
||||||
|
except Exception:
|
||||||
|
parsedFilename = None
|
||||||
|
|
||||||
|
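As an illustration of the title clean-up above (title assumed): "Q3 Report: Final (v2)" becomes "Q3-Report--Final--v2-" after the first re.sub, "Q3-Report-Final-v2-" after collapsing repeated dashes, and "Q3-Report-Final-v2" after strip('-'), so the parsed filename would be "Q3-Report-Final-v2.pdf" for a pdf outputFormat.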
# Use AI generation to enhance the extracted JSON before rendering
|
||||||
|
enhancedContent = aiResponseJson # Default to original
|
||||||
|
if prompt:
|
||||||
|
try:
|
||||||
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||||
|
|
||||||
|
# Get generation prompt
|
||||||
|
generationPrompt = await generation_service.getGenerationPrompt(
|
||||||
|
outputFormat=outputFormat,
|
||||||
|
userPrompt=prompt,
|
||||||
|
title=title,
|
||||||
|
aiService=self
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prepare the AI call
|
||||||
|
request_options = AiCallOptions()
|
||||||
|
request_options.operationType = OperationType.GENERAL
|
||||||
|
|
||||||
|
# Create context with the extracted JSON content
|
||||||
|
import json
|
||||||
|
context = f"Extracted JSON content:\n{json.dumps(aiResponseJson, indent=2)}"
|
||||||
|
|
||||||
|
request = AiCallRequest(
|
||||||
|
prompt=generationPrompt,
|
||||||
|
context=context,
|
||||||
|
options=request_options
|
||||||
|
)
|
||||||
|
|
||||||
|
# Call AI to enhance the content
|
||||||
|
response = await self.aiObjects.call(request)
|
||||||
|
|
||||||
|
if response and response.content:
|
||||||
|
# Parse the AI response as JSON
|
||||||
|
try:
|
||||||
|
import re
|
||||||
|
result = response.content.strip()
|
||||||
|
|
||||||
|
# Extract JSON from markdown if present
|
||||||
|
json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
|
||||||
|
if json_match:
|
||||||
|
result = json_match.group(1).strip()
|
||||||
|
elif result.startswith('```json'):
|
||||||
|
result = re.sub(r'^```json\s*', '', result)
|
||||||
|
result = re.sub(r'\s*```$', '', result)
|
||||||
|
elif result.startswith('```'):
|
||||||
|
result = re.sub(r'^```\s*', '', result)
|
||||||
|
result = re.sub(r'\s*```$', '', result)
|
||||||
|
|
||||||
|
# Try to parse JSON
|
||||||
|
enhancedContent = json.loads(result)
|
||||||
|
logger.info(f"AI enhanced JSON content successfully")
|
||||||
|
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
logger.warning(f"AI generation returned invalid JSON: {str(e)}, using original content")
|
||||||
|
enhancedContent = aiResponseJson
|
||||||
|
else:
|
||||||
|
logger.warning("AI generation returned empty response, using original content")
|
||||||
|
enhancedContent = aiResponseJson
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"AI generation failed: {str(e)}, using original content")
|
||||||
|
enhancedContent = aiResponseJson
|
||||||
|
|
||||||
|
# Render the enhanced JSON content
|
||||||
|
renderedContent, mimeType = await generation_service.renderReport(
|
||||||
|
extractedContent=enhancedContent,
|
||||||
|
outputFormat=outputFormat,
|
||||||
|
title=title,
|
||||||
|
userPrompt=prompt,
|
||||||
|
aiService=self
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate meaningful filename (use AI-provided if valid, else fallback)
|
||||||
|
from datetime import datetime, UTC
|
||||||
|
timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
||||||
|
if parsedFilename and parsedFilename.lower().endswith(f".{outputFormat.lower()}"):
|
||||||
|
filename = parsedFilename
|
||||||
|
else:
|
||||||
|
safeTitle = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
|
||||||
|
filename = f"{safeTitle or 'document'}-{timestamp}.{outputFormat}"
|
||||||
|
|
||||||
|
# Return structured result with document information
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"content": aiResponseJson, # Structured JSON document
|
||||||
|
"rendered_content": renderedContent, # Formatted content
|
||||||
|
"mime_type": mimeType,
|
||||||
|
"filename": filename,
|
||||||
|
"format": outputFormat,
|
||||||
|
"title": title,
|
||||||
|
"documents": [{
|
||||||
|
"documentName": filename,
|
||||||
|
"documentData": renderedContent,
|
||||||
|
"mimeType": mimeType
|
||||||
|
}],
|
||||||
|
"is_multi_file": False
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error in single-file document generation: {str(e)}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def _callAiWithMultiFileGeneration(
|
||||||
|
self,
|
||||||
|
prompt: str,
|
||||||
|
documents: Optional[List[ChatDocument]],
|
||||||
|
options: AiCallOptions,
|
||||||
|
outputFormat: str,
|
||||||
|
title: Optional[str],
|
||||||
|
prompt_analysis: Dict[str, Any]
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""Handle multi-file document generation using AI analysis."""
|
||||||
|
try:
|
||||||
|
# Get multi-file extraction prompt based on AI analysis
|
||||||
|
from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
|
||||||
|
generation_service = GenerationService(self.services)
|
||||||
|
|
||||||
|
# Use default title if not provided
|
||||||
|
if not title:
|
||||||
|
title = "AI Generated Documents"
|
||||||
|
|
||||||
|
# Get adaptive extraction prompt
|
||||||
|
extraction_prompt = await generation_service.getAdaptiveExtractionPrompt(
|
||||||
|
outputFormat=outputFormat,
|
||||||
|
userPrompt=prompt,
|
||||||
|
title=title,
|
||||||
|
promptAnalysis=prompt_analysis,
|
||||||
|
aiService=self
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(f"Adaptive extraction prompt length: {len(extraction_prompt)} characters")
|
||||||
|
logger.debug(f"Adaptive extraction prompt preview: {extraction_prompt[:500]}...")
|
||||||
|
|
||||||
|
# Process with adaptive JSON schema - use the existing pipeline but with adaptive prompt
|
||||||
|
logger.info(f"Using adaptive prompt with existing pipeline: {len(extraction_prompt)} chars")
|
||||||
|
logger.debug(f"Processing documents: {len(documents) if documents else 0} documents")
|
||||||
|
|
||||||
|
# Use the existing pipeline but replace the prompt with our adaptive one
|
||||||
|
# This ensures proper document processing while using the multi-file prompt
|
||||||
|
ai_response = await self.documentProcessor.processDocumentsPerChunkJsonWithPrompt(documents, extraction_prompt, options)
|
||||||
|
|
||||||
|
logger.info(f"AI response type: {type(ai_response)}")
|
||||||
|
logger.info(f"AI response keys: {list(ai_response.keys()) if isinstance(ai_response, dict) else 'Not a dict'}")
|
||||||
|
logger.debug(f"AI response preview: {str(ai_response)[:500]}...")
|
||||||
|
|
||||||
|
# Validate response structure
|
||||||
|
if not self._validateResponseStructure(ai_response, prompt_analysis):
|
||||||
|
# Fallback to single-file if multi-file fails
|
||||||
|
logger.warning(f"Multi-file processing failed - Invalid response structure. Expected multi-file but got: {list(ai_response.keys()) if isinstance(ai_response, dict) else type(ai_response)}")
|
||||||
|
logger.warning(f"Prompt analysis: {prompt_analysis}")
|
||||||
|
logger.warning("Falling back to single-file generation")
|
||||||
|
return await self._callAiWithSingleFileGeneration(
|
||||||
|
prompt, documents, options, outputFormat, title
|
||||||
|
)
|
||||||
|
|
||||||
|
# Emit raw extracted data as a chat message attachment before transformation/rendering
|
||||||
|
try:
|
||||||
|
await self._postRawDataChatMessage(ai_response, label="raw_extraction_multi")
|
||||||
|
except Exception:
|
||||||
|
logger.warning("Failed to emit raw extraction chat message (multi-file)")
|
||||||
|
|
||||||
|
# Process multiple documents
|
||||||
|
generated_documents = []
|
||||||
|
for i, doc_data in enumerate(ai_response.get("documents", [])):
|
||||||
|
# Transform AI-generated sections to renderer-compatible format
|
||||||
|
transformed_sections = []
|
||||||
|
for section in doc_data.get("sections", []):
|
||||||
|
# Convert AI format to renderer format
|
||||||
|
transformed_section = {
|
||||||
|
"id": section.get("id", f"section_{len(transformed_sections) + 1}"),
|
||||||
|
"content_type": section.get("content_type", "paragraph"),
|
||||||
|
"elements": section.get("elements", []),
|
||||||
|
"order": section.get("order", len(transformed_sections) + 1)
|
||||||
|
}
|
||||||
|
|
||||||
|
# Extract text from elements for simple text-based sections
|
||||||
|
if section.get("content_type") in ["paragraph", "heading"]:
|
||||||
|
text_parts = []
|
||||||
|
for element in section.get("elements", []):
|
||||||
|
if "text" in element:
|
||||||
|
text_parts.append(element["text"])
|
||||||
|
# Add text to the first element or create a new one
|
||||||
|
if transformed_section["elements"]:
|
||||||
|
transformed_section["elements"][0]["text"] = "\n".join(text_parts)
|
||||||
|
else:
|
||||||
|
transformed_section["elements"] = [{"text": "\n".join(text_parts)}]
|
||||||
|
|
||||||
|
transformed_sections.append(transformed_section)
|
||||||
|
|
||||||
|
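For orientation, a hypothetical section returned by the AI such as {"id": "s1", "content_type": "paragraph", "elements": [{"text": "Intro"}, {"text": "Scope"}]} keeps its id, content_type, elements and order in the transformed structure, and because it is a paragraph the element texts are joined, leaving the first element as {"text": "Intro\nScope"}; the field values here are assumed for illustration only.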
# Create complete document structure for rendering
|
||||||
|
complete_document = {
|
||||||
|
"metadata": {
|
||||||
|
"title": doc_data["title"],
|
||||||
|
"source_document": "multi_file_generation",
|
||||||
|
"document_id": doc_data.get("id", f"doc_{i+1}"),
|
||||||
|
"filename": doc_data.get("filename", f"document_{i+1}"),
|
||||||
|
"split_strategy": prompt_analysis.get("strategy", "custom")
|
||||||
|
},
|
||||||
|
"sections": transformed_sections,
|
||||||
|
"summary": f"Generated document: {doc_data['title']}",
|
||||||
|
"tags": ["multi_file", "ai_generated"]
|
||||||
|
}
|
||||||
|
|
||||||
|
# Use AI generation to enhance the extracted JSON before rendering
|
||||||
|
enhancedContent = complete_document # Default to original
|
||||||
|
if prompt:
|
||||||
|
try:
|
||||||
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||||
|
|
||||||
|
# Get generation prompt
|
||||||
|
generationPrompt = await generation_service.getGenerationPrompt(
|
||||||
|
outputFormat=outputFormat,
|
||||||
|
userPrompt=prompt,
|
||||||
|
title=doc_data["title"],
|
||||||
|
aiService=self
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prepare the AI call
|
||||||
|
request_options = AiCallOptions()
|
||||||
|
request_options.operationType = OperationType.GENERAL
|
||||||
|
|
||||||
|
# Create context with the extracted JSON content
|
||||||
|
import json
|
||||||
|
context = f"Extracted JSON content:\n{json.dumps(complete_document, indent=2)}"
|
||||||
|
|
||||||
|
request = AiCallRequest(
|
||||||
|
prompt=generationPrompt,
|
||||||
|
context=context,
|
||||||
|
options=request_options
|
||||||
|
)
|
||||||
|
|
||||||
|
# Call AI to enhance the content
|
||||||
|
response = await self.aiObjects.call(request)
|
||||||
|
|
||||||
|
if response and response.content:
|
||||||
|
# Parse the AI response as JSON
|
||||||
|
try:
|
||||||
|
import re
|
||||||
|
result = response.content.strip()
|
||||||
|
|
||||||
|
# Extract JSON from markdown if present
|
||||||
|
json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
|
||||||
|
if json_match:
|
||||||
|
result = json_match.group(1).strip()
|
||||||
|
elif result.startswith('```json'):
|
||||||
|
result = re.sub(r'^```json\s*', '', result)
|
||||||
|
result = re.sub(r'\s*```$', '', result)
|
||||||
|
elif result.startswith('```'):
|
||||||
|
result = re.sub(r'^```\s*', '', result)
|
||||||
|
result = re.sub(r'\s*```$', '', result)
|
||||||
|
|
||||||
|
# Try to parse JSON
|
||||||
|
enhancedContent = json.loads(result)
|
||||||
|
logger.info(f"AI enhanced JSON content successfully")
|
||||||
|
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
logger.warning(f"AI generation returned invalid JSON: {str(e)}, attempting to repair...")
|
||||||
|
# Try to repair common JSON issues
|
||||||
|
try:
|
||||||
|
repaired_result = self._repairJson(result)
|
||||||
|
enhancedContent = json.loads(repaired_result)
|
||||||
|
logger.info(f"Successfully repaired JSON content")
|
||||||
|
except (json.JSONDecodeError, Exception) as repair_error:
|
||||||
|
logger.warning(f"JSON repair failed: {str(repair_error)}, trying AI repair...")
|
||||||
|
# Try AI-powered JSON repair as last resort
|
||||||
|
try:
|
||||||
|
ai_repaired = await self._repairJsonWithAI(result)
|
||||||
|
enhancedContent = json.loads(ai_repaired)
|
||||||
|
logger.info(f"AI successfully repaired JSON content")
|
||||||
|
except Exception as ai_repair_error:
|
||||||
|
logger.warning(f"AI JSON repair also failed: {str(ai_repair_error)}, using original content")
|
||||||
|
enhancedContent = complete_document
|
||||||
|
else:
|
||||||
|
logger.warning("AI generation returned empty response, using original content")
|
||||||
|
enhancedContent = complete_document
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"AI generation failed: {str(e)}, using original content")
|
||||||
|
enhancedContent = complete_document
|
||||||
|
|
||||||
|
# Render the enhanced JSON content
|
||||||
|
rendered_content, mime_type = await generation_service.renderReport(
|
||||||
|
extractedContent=enhancedContent,
|
||||||
|
outputFormat=outputFormat,
|
||||||
|
title=doc_data["title"],
|
||||||
|
userPrompt=prompt,
|
||||||
|
aiService=self
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate proper filename with correct extension
|
||||||
|
base_filename = doc_data.get("filename", f"document_{i+1}")
|
||||||
|
# Remove any existing extension and add the correct one
|
||||||
|
if '.' in base_filename:
|
||||||
|
base_filename = base_filename.rsplit('.', 1)[0]
|
||||||
|
|
||||||
|
# Add proper extension based on output format
|
||||||
|
if outputFormat.lower() == "docx":
|
||||||
|
filename = f"{base_filename}.docx"
|
||||||
|
elif outputFormat.lower() == "pdf":
|
||||||
|
filename = f"{base_filename}.pdf"
|
||||||
|
elif outputFormat.lower() == "html":
|
||||||
|
filename = f"{base_filename}.html"
|
||||||
|
else:
|
||||||
|
filename = f"{base_filename}.{outputFormat}"
|
||||||
|
|
||||||
|
generated_documents.append({
|
||||||
|
"documentName": filename,
|
||||||
|
"documentData": rendered_content,
|
||||||
|
"mimeType": mime_type
|
||||||
|
})
|
||||||
|
|
||||||
|
# Save debug files for multi-file generation - only if debug enabled
|
||||||
|
debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
||||||
|
if debug_enabled:
|
||||||
|
try:
|
||||||
|
import os
|
||||||
|
from datetime import datetime, UTC
|
||||||
|
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
||||||
|
debug_root = "./test-chat/ai"
|
||||||
|
debug_dir = os.path.join(debug_root, f"multifile_output_{ts}")
|
||||||
|
os.makedirs(debug_dir, exist_ok=True)
|
||||||
|
|
||||||
|
# Save metadata
|
||||||
|
with open(os.path.join(debug_dir, "metadata.txt"), "w", encoding="utf-8") as f:
|
||||||
|
f.write(f"title: {title}\n")
|
||||||
|
f.write(f"format: {outputFormat}\n")
|
||||||
|
f.write(f"documents_count: {len(generated_documents)}\n")
|
||||||
|
f.write(f"split_strategy: {prompt_analysis.get('strategy', 'custom')}\n")
|
||||||
|
f.write(f"prompt_analysis: {prompt_analysis}\n")
|
||||||
|
|
||||||
|
# Save each generated document
|
||||||
|
for i, doc in enumerate(generated_documents):
|
||||||
|
doc_filename = doc["documentName"]
|
||||||
|
doc_data = doc["documentData"]
|
||||||
|
doc_mime = doc["mimeType"]
|
||||||
|
|
||||||
|
# Determine file extension
|
||||||
|
if outputFormat.lower() == "docx":
|
||||||
|
file_ext = ".docx"
|
||||||
|
elif outputFormat.lower() == "pdf":
|
||||||
|
file_ext = ".pdf"
|
||||||
|
elif outputFormat.lower() == "html":
|
||||||
|
file_ext = ".html"
|
||||||
|
else:
|
||||||
|
file_ext = f".{outputFormat}"
|
||||||
|
|
||||||
|
# Save the rendered document
|
||||||
|
output_path = os.path.join(debug_dir, f"document_{i+1}_{doc_filename}")
|
||||||
|
|
||||||
|
if file_ext in ['.md', '.txt', '.html', '.json', '.csv']:
|
||||||
|
# Text-based formats
|
||||||
|
with open(output_path, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(doc_data)
|
||||||
|
else:
|
||||||
|
# Binary formats - decode from base64 if needed
|
||||||
|
try:
|
||||||
|
import base64
|
||||||
|
doc_bytes = base64.b64decode(doc_data)
|
||||||
|
with open(output_path, 'wb') as f:
|
||||||
|
f.write(doc_bytes)
|
||||||
|
except Exception:
|
||||||
|
# If not base64, save as text
|
||||||
|
with open(output_path, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(doc_data)
|
||||||
|
|
||||||
|
logger.info(f"💾 Debug: Saved multi-file document {i+1}: {output_path}")
|
||||||
|
|
||||||
|
logger.info(f"💾 Debug: Multi-file output saved to: {debug_dir}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to save multi-file debug output: {e}")
|
||||||
|
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"content": ai_response,
|
||||||
|
"rendered_content": None, # Not applicable for multi-file
|
||||||
|
"mime_type": None, # Not applicable for multi-file
|
||||||
|
"filename": None, # Not applicable for multi-file
|
||||||
|
"format": outputFormat,
|
||||||
|
"title": title,
|
||||||
|
"documents": generated_documents,
|
||||||
|
"is_multi_file": True,
|
||||||
|
"split_strategy": prompt_analysis.get("strategy", "custom")
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error in multi-file document generation: {str(e)}")
|
||||||
|
# Fallback to single-file
|
||||||
|
return await self._callAiWithSingleFileGeneration(
|
||||||
|
prompt, documents, options, outputFormat, title
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _callAiJson(
|
||||||
|
self,
|
||||||
|
prompt: str,
|
||||||
|
documents: Optional[List[ChatDocument]],
|
||||||
|
options: AiCallOptions
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Handle AI calls with document processing for JSON output.
|
||||||
|
Returns structured JSON document instead of text.
|
||||||
|
"""
|
||||||
|
# Process documents with JSON merging
|
||||||
|
return await self.documentProcessor.processDocumentsPerChunkJson(documents, prompt, options)
|
||||||
|
|
||||||
|
async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]:
|
||||||
|
"""Use AI to analyze user prompt and determine processing requirements."""
|
||||||
|
if not ai_service:
|
||||||
|
return {"is_multi_file": False, "strategy": "single", "criteria": None}
|
||||||
|
|
||||||
|
try:
|
||||||
|
analysis_prompt = f"""
|
||||||
|
Analyze this user request and determine if it requires multiple file output or single file output.
|
||||||
|
|
||||||
|
User request: "{prompt}"
|
||||||
|
|
||||||
|
Respond with JSON only in this exact format:
|
||||||
|
{{
|
||||||
|
"is_multi_file": true/false,
|
||||||
|
"strategy": "single|per_entity|by_section|by_criteria|custom",
|
||||||
|
"criteria": "description of how to split content",
|
||||||
|
"file_naming_pattern": "suggested pattern for filenames",
|
||||||
|
"reasoning": "brief explanation of the analysis"
|
||||||
|
}}
|
||||||
|
|
||||||
|
Consider:
|
||||||
|
- Does the user want separate files for different entities (customers, products, etc.)?
|
||||||
|
- Does the user want to split content into multiple documents?
|
||||||
|
- What would be the most logical way to organize the content?
|
||||||
|
- What language is the request in? (analyze in the original language)
|
||||||
|
|
||||||
|
Return only the JSON response.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||||
|
request_options = AiCallOptions()
|
||||||
|
request_options.operationType = OperationType.GENERAL
|
||||||
|
|
||||||
|
request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
|
||||||
|
response = await ai_service.aiObjects.call(request)
|
||||||
|
|
||||||
|
if response and response.content:
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Extract JSON from response
|
||||||
|
result = response.content.strip()
|
||||||
|
json_match = re.search(r'\{.*\}', result, re.DOTALL)
|
||||||
|
if json_match:
|
||||||
|
result = json_match.group(0)
|
||||||
|
|
||||||
|
analysis = json.loads(result)
|
||||||
|
return analysis
|
||||||
|
else:
|
||||||
|
return {"is_multi_file": False, "strategy": "single", "criteria": None}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file")
|
||||||
|
return {"is_multi_file": False, "strategy": "single", "criteria": None}
|
||||||
|
|
||||||
|
def _validateResponseStructure(self, response: Dict[str, Any], prompt_analysis: Dict[str, Any]) -> bool:
|
||||||
|
"""Validate that AI response matches the expected structure."""
|
||||||
|
try:
|
||||||
|
if not isinstance(response, dict):
|
||||||
|
logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check for multi-file structure
|
||||||
|
if prompt_analysis.get("is_multi_file", False):
|
||||||
|
has_documents = "documents" in response
|
||||||
|
is_documents_list = isinstance(response.get("documents"), list)
|
||||||
|
logger.info(f"Multi-file validation: has_documents={has_documents}, is_documents_list={is_documents_list}")
|
||||||
|
if has_documents and is_documents_list:
|
||||||
|
logger.info(f"Multi-file validation passed: {len(response['documents'])} documents found")
|
||||||
|
else:
|
||||||
|
logger.warning(f"Multi-file validation failed: documents key present={has_documents}, documents is list={is_documents_list}")
|
||||||
|
logger.warning(f"Available keys: {list(response.keys())}")
|
||||||
|
return has_documents and is_documents_list
|
||||||
|
else:
|
||||||
|
has_sections = "sections" in response
|
||||||
|
is_sections_list = isinstance(response.get("sections"), list)
|
||||||
|
logger.info(f"Single-file validation: has_sections={has_sections}, is_sections_list={is_sections_list}")
|
||||||
|
return has_sections and is_sections_list
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Response validation failed with exception: {str(e)}")
|
||||||
|
return False
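# Illustration (assumption, not part of the diff): minimal payload shapes that would pass
# the validation above for the two modes; the inner keys are hypothetical examples.
multi_file_response = {"documents": [{"filename": "part_1", "sections": []}]}
single_file_response = {"sections": [{"title": "Overview", "content": "..."}]}
assert isinstance(multi_file_response["documents"], list)
assert isinstance(single_file_response["sections"], list)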
|
||||||
|
|
||||||
|
async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None:
|
||||||
|
"""
|
||||||
|
Create a ChatMessage with the extracted raw JSON attached as a file so the user
|
||||||
|
has access to the data even if downstream processing fails.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
services = self.services
|
||||||
|
workflow = services.currentWorkflow
|
||||||
|
|
||||||
|
# Serialize payload
|
||||||
|
import json as _json
|
||||||
|
from datetime import datetime, UTC
|
||||||
|
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
||||||
|
content_text = _json.dumps(payload, ensure_ascii=False, indent=2)
|
||||||
|
content_bytes = content_text.encode('utf-8')
|
||||||
|
|
||||||
|
# Store as file via component storage
|
||||||
|
file_name = f"{label}_{ts}.json"
|
||||||
|
file_item = services.interfaceDbComponent.createFile(
|
||||||
|
name=file_name,
|
||||||
|
mimeType="application/json",
|
||||||
|
content=content_bytes
|
||||||
|
)
|
||||||
|
services.interfaceDbComponent.createFileData(file_item.id, content_bytes)
|
||||||
|
|
||||||
|
# Lookup file info for ChatDocument
|
||||||
|
file_info = services.workflow.getFileInfo(file_item.id)
|
||||||
|
doc = ChatDocument(
|
||||||
|
messageId="", # set after message creation
|
||||||
|
fileId=file_item.id,
|
||||||
|
fileName=file_info.get("fileName", file_name) if file_info else file_name,
|
||||||
|
fileSize=file_info.get("size", len(content_bytes)) if file_info else len(content_bytes),
|
||||||
|
mimeType=file_info.get("mimeType", "application/json") if file_info else "application/json"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create message referencing the file
|
||||||
|
messageData = {
|
||||||
|
"workflowId": workflow.id,
|
||||||
|
"role": "assistant",
|
||||||
|
"message": "Raw extraction data saved",
|
||||||
|
"status": "data",
|
||||||
|
"sequenceNr": len(getattr(workflow, 'messages', []) or []) + 1,
|
||||||
|
"publishedAt": services.utils.getUtcTimestamp(),
|
||||||
|
"documentsLabel": label,
|
||||||
|
"documents": []
|
||||||
|
}
|
||||||
|
message = services.workflow.createMessage(messageData)
|
||||||
|
if not message:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Persist ChatDocument with messageId
|
||||||
|
doc.messageId = message.id
|
||||||
|
services.interfaceDbChat.createDocument(doc.to_dict())
|
||||||
|
|
||||||
|
# Update message to include document
|
||||||
|
try:
|
||||||
|
if not message.documents:
|
||||||
|
message.documents = []
|
||||||
|
message.documents.append(doc)
|
||||||
|
services.workflow.updateMessage(message.id, {"documents": [d.to_dict() for d in message.documents]})
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
# Non-fatal; ignore if storage or chat creation fails
|
||||||
|
return
|
||||||
|
|
||||||
|
def _repairJson(self, json_string: str) -> str:
|
||||||
|
"""Repair common JSON syntax errors efficiently for large JSON."""
|
||||||
|
try:
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Remove any leading/trailing whitespace
|
||||||
|
json_string = json_string.strip()
|
||||||
|
|
||||||
|
# For large JSON, skip substring extraction and go straight to targeted repairs
|
||||||
|
logger.info(f"Attempting JSON repair for {len(json_string)} characters...")
|
||||||
|
|
||||||
|
# Try to parse first to see what specific error we get
|
||||||
|
try:
|
||||||
|
json.loads(json_string)
|
||||||
|
return json_string # Already valid
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
error_msg = str(e)
|
||||||
|
logger.info(f"JSON error: {error_msg}")
|
||||||
|
|
||||||
|
# Apply targeted fixes based on the specific error
|
||||||
|
if "Expecting ',' delimiter" in error_msg:
|
||||||
|
# Fix missing commas between array elements
|
||||||
|
json_string = re.sub(r'\]\s*\[', '], [', json_string)
|
||||||
|
json_string = re.sub(r'\}\s*\{', '}, {', json_string)
|
||||||
|
# Fix missing commas between object properties
|
||||||
|
json_string = re.sub(r'("\s*:\s*[^,}]+)\s*(")', r'\1, \2', json_string)
|
||||||
|
|
||||||
|
if "Expecting value" in error_msg:
|
||||||
|
# Fix missing values (replace empty with null)
|
||||||
|
json_string = re.sub(r':\s*,', ': null,', json_string)
|
||||||
|
json_string = re.sub(r':\s*}', ': null}', json_string)
|
||||||
|
|
||||||
|
if "Expecting property name" in error_msg:
|
||||||
|
# Fix unquoted property names
|
||||||
|
json_string = re.sub(r'(\w+):', r'"\1":', json_string)
|
||||||
|
|
||||||
|
# Fix trailing commas before closing brackets/braces
|
||||||
|
json_string = re.sub(r',(\s*[}\]])', r'\1', json_string)
|
||||||
|
|
||||||
|
# Fix missing closing brackets/braces (only if reasonable)
|
||||||
|
open_braces = json_string.count('{')
|
||||||
|
close_braces = json_string.count('}')
|
||||||
|
open_brackets = json_string.count('[')
|
||||||
|
close_brackets = json_string.count(']')
|
||||||
|
|
||||||
|
# Only add missing brackets if the difference is small (avoid runaway)
|
||||||
|
if 0 < (open_braces - close_braces) <= 5:
|
||||||
|
missing_braces = open_braces - close_braces
|
||||||
|
json_string += '}' * missing_braces
|
||||||
|
|
||||||
|
if 0 < (open_brackets - close_brackets) <= 5:
|
||||||
|
missing_brackets = open_brackets - close_brackets
|
||||||
|
json_string += ']' * missing_brackets
|
||||||
|
|
||||||
|
# Try to parse again
|
||||||
|
try:
|
||||||
|
json.loads(json_string)
|
||||||
|
logger.info("JSON repair successful")
|
||||||
|
return json_string
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
logger.warning("JSON repair failed - will try AI repair")
|
||||||
|
return json_string
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"JSON repair failed: {str(e)}")
|
||||||
|
return json_string
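# Illustration (not part of the diff): a minimal standalone sketch of the targeted regex
# fixes applied above (missing commas between objects, missing values, trailing commas);
# the real method additionally balances unclosed brackets and braces.
import json
import re

def repair_json_sketch(s: str) -> str:
    s = re.sub(r'\}\s*\{', '}, {', s)       # insert comma between adjacent objects
    s = re.sub(r':\s*,', ': null,', s)      # fill a missing value before a comma
    s = re.sub(r',(\s*[}\]])', r'\1', s)    # drop a trailing comma before } or ]
    return s

assert json.loads(repair_json_sketch('{"a": 1,}')) == {"a": 1}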
|
||||||
|
|
||||||
|
async def _repairJsonWithAI(self, malformed_json: str) -> str:
|
||||||
|
"""Use AI to repair malformed JSON efficiently for large files."""
|
||||||
|
try:
|
||||||
|
# Limit JSON size for AI processing (max 50KB to avoid token limits)
|
||||||
|
max_json_size = 50000
|
||||||
|
json_to_repair = malformed_json
|
||||||
|
|
||||||
|
if len(malformed_json) > max_json_size:
|
||||||
|
logger.warning(f"JSON too large ({len(malformed_json)} chars), truncating to {max_json_size} chars for AI repair")
|
||||||
|
# Try to find a good truncation point (end of a complete object/array)
|
||||||
|
truncate_at = max_json_size
|
||||||
|
for i in range(max_json_size, max(0, max_json_size - 1000), -1):
|
||||||
|
if malformed_json[i] in ['}', ']']:
|
||||||
|
truncate_at = i + 1
|
||||||
|
break
|
||||||
|
json_to_repair = malformed_json[:truncate_at] + "..."
|
||||||
|
|
||||||
|
repair_prompt = f"""
|
||||||
|
You are a JSON repair expert. Fix the following malformed JSON and return ONLY the corrected JSON, no explanations.
|
||||||
|
|
||||||
|
Malformed JSON:
|
||||||
|
{json_to_repair}
|
||||||
|
|
||||||
|
Return only the valid JSON:
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Use AI to repair the JSON
|
||||||
|
repaired_json = await self.services.ai.callAi(
|
||||||
|
prompt=repair_prompt,
|
||||||
|
documents=None,
|
||||||
|
options={
|
||||||
|
"process_type": "text",
|
||||||
|
"operation_type": "generate_content",
|
||||||
|
"priority": "speed",
|
||||||
|
"max_cost": 0.01
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Clean up the response (remove any markdown formatting)
|
||||||
|
repaired_json = repaired_json.strip()
|
||||||
|
if repaired_json.startswith('```json'):
|
||||||
|
repaired_json = repaired_json[7:]
|
||||||
|
if repaired_json.endswith('```'):
|
||||||
|
repaired_json = repaired_json[:-3]
|
||||||
|
repaired_json = repaired_json.strip()
|
||||||
|
|
||||||
|
# Validate the repaired JSON
|
||||||
|
import json
|
||||||
|
json.loads(repaired_json)
|
||||||
|
logger.info("AI JSON repair successful")
|
||||||
|
return repaired_json
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"AI JSON repair failed: {str(e)}")
|
||||||
|
return malformed_json
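# Illustration (not part of the diff): the truncation step above walks backwards from the
# size limit to the nearest '}' or ']' so the AI sees mostly complete JSON structures.
sample = '{"items": [{"id": 1}, {"id": 2}, {"id": 3}]}'
limit = 30
cut = limit
for i in range(min(limit, len(sample) - 1), max(0, limit - 1000), -1):
    if sample[i] in ('}', ']'):
        cut = i + 1
        break
truncated = sample[:cut] + "..."
assert truncated.endswith("}...")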
|
||||||
modules/services/serviceAi/subDocumentProcessing.py (new file, 1132 lines; diff suppressed because it is too large)

modules/services/serviceAi/subUtilities.py (new file, 316 lines)
@@ -0,0 +1,316 @@
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, List, Optional, Tuple, Union
|
||||||
|
from modules.datamodels.datamodelAi import ModelCapabilities, AiCallOptions
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class SubUtilities:
|
||||||
|
"""Utility functions for text processing, debugging, and helper operations."""
|
||||||
|
|
||||||
|
def __init__(self, services):
|
||||||
|
"""Initialize utilities service.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
services: Service center instance for accessing other services
|
||||||
|
"""
|
||||||
|
self.services = services
|
||||||
|
|
||||||
|
def _writeTraceLog(self, contextText: str, data: Any) -> None:
|
||||||
|
"""Write raw data to the central trace log file without truncation."""
|
||||||
|
try:
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from datetime import datetime, UTC
|
||||||
|
# Only write if logger is in debug mode
|
||||||
|
if not logger.isEnabledFor(logging.DEBUG):
|
||||||
|
return
|
||||||
|
# Get log directory from configuration via service center if possible
|
||||||
|
logDir = None
|
||||||
|
try:
|
||||||
|
logDir = self.services.utils.configGet("APP_LOGGING_LOG_DIR", "./")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
if not logDir:
|
||||||
|
logDir = "./"
|
||||||
|
if not os.path.isabs(logDir):
|
||||||
|
# Make it relative to gateway directory
|
||||||
|
gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
||||||
|
logDir = os.path.join(gatewayDir, logDir)
|
||||||
|
os.makedirs(logDir, exist_ok=True)
|
||||||
|
traceFile = os.path.join(logDir, "log_trace.log")
|
||||||
|
timestamp = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
|
||||||
|
traceEntry = f"[{timestamp}] {contextText}\n" + ("=" * 80) + "\n"
|
||||||
|
if data is None:
|
||||||
|
traceEntry += "No data provided\n"
|
||||||
|
else:
|
||||||
|
# Prefer exact text; if dict/list, pretty print JSON
|
||||||
|
try:
|
||||||
|
if isinstance(data, (dict, list)):
|
||||||
|
traceEntry += f"JSON Data:\n{json.dumps(data, indent=2, ensure_ascii=False)}\n"
|
||||||
|
else:
|
||||||
|
text = str(data)
|
||||||
|
traceEntry += f"Text Data:\n{text}\n"
|
||||||
|
except Exception:
|
||||||
|
traceEntry += f"Data (fallback): {str(data)}\n"
|
||||||
|
traceEntry += ("=" * 80) + "\n\n"
|
||||||
|
with open(traceFile, "a", encoding="utf-8") as f:
|
||||||
|
f.write(traceEntry)
|
||||||
|
except Exception:
|
||||||
|
# Swallow to avoid recursive logging issues
|
||||||
|
pass
|
||||||
|
|
||||||
|
def _writeAiResponseDebug(self, label: str, content: str, partIndex: int = 1, modelName: Optional[str] = None, continuation: Optional[bool] = None) -> None:
|
||||||
|
"""Persist raw AI response parts for debugging under test-chat/ai - only if debug enabled."""
|
||||||
|
try:
|
||||||
|
# Check if debug logging is enabled
|
||||||
|
debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
||||||
|
if not debug_enabled:
|
||||||
|
return
|
||||||
|
|
||||||
|
import os
|
||||||
|
from datetime import datetime, UTC
|
||||||
|
# Base dir: gateway/test-chat/ai (go up 4 levels from this file)
|
||||||
|
# .../gateway/modules/services/serviceAi/subUtilities.py -> up to gateway root
|
||||||
|
gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
||||||
|
outDir = os.path.join(gatewayDir, 'test-chat', 'ai')
|
||||||
|
os.makedirs(outDir, exist_ok=True)
|
||||||
|
ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3]
|
||||||
|
suffix = []
|
||||||
|
if partIndex is not None:
|
||||||
|
suffix.append(f"part{partIndex}")
|
||||||
|
if continuation is not None:
|
||||||
|
suffix.append(f"cont_{str(continuation).lower()}")
|
||||||
|
if modelName:
|
||||||
|
safeModel = ''.join(c if c.isalnum() or c in ('-', '_') else '-' for c in modelName)
|
||||||
|
suffix.append(safeModel)
|
||||||
|
suffixStr = ('_' + '_'.join(suffix)) if suffix else ''
|
||||||
|
fname = f"{ts}_{label}{suffixStr}.txt"
|
||||||
|
fpath = os.path.join(outDir, fname)
|
||||||
|
with open(fpath, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(content or '')
|
||||||
|
except Exception:
|
||||||
|
# Do not raise; best-effort debug write
|
||||||
|
pass
|
||||||
|
|
||||||
|
def _exceedsTokenLimit(self, text: str, model: ModelCapabilities, safety_margin: float) -> bool:
|
||||||
|
"""
|
||||||
|
Check if text exceeds model token limit with safety margin.
|
||||||
|
"""
|
||||||
|
# Simple character-based estimation (4 chars per token)
|
||||||
|
estimated_tokens = len(text) // 4
|
||||||
|
max_tokens = int(model.maxTokens * (1 - safety_margin))
|
||||||
|
return estimated_tokens > max_tokens
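# Illustration (not part of the diff): with the 4-characters-per-token heuristic above, a
# model with maxTokens=8000 and safety_margin=0.2 accepts at most 6400 estimated tokens,
# i.e. roughly 25,600 characters of text.
text_length = 30000
estimated_tokens = text_length // 4        # 7500
allowed_tokens = int(8000 * (1 - 0.2))     # 6400
assert estimated_tokens > allowed_tokens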
|
||||||
|
|
||||||
|
def _reduceText(self, text: str, reduction_factor: float) -> str:
|
||||||
|
"""
|
||||||
|
Reduce text size by the specified factor.
|
||||||
|
"""
|
||||||
|
if reduction_factor >= 1.0:
|
||||||
|
return text
|
||||||
|
|
||||||
|
target_length = int(len(text) * reduction_factor)
|
||||||
|
return text[:target_length] + "... [reduced]"
|
||||||
|
|
||||||
|
def _extractTextFromContentParts(self, extracted_content) -> str:
|
||||||
|
"""
|
||||||
|
Extract text content from ExtractionService ContentPart objects.
|
||||||
|
"""
|
||||||
|
if not extracted_content or not hasattr(extracted_content, 'parts'):
|
||||||
|
return ""
|
||||||
|
|
||||||
|
text_parts = []
|
||||||
|
for part in extracted_content.parts:
|
||||||
|
if hasattr(part, 'typeGroup') and part.typeGroup in ['text', 'table', 'structure']:
|
||||||
|
if hasattr(part, 'data') and part.data:
|
||||||
|
text_parts.append(part.data)
|
||||||
|
|
||||||
|
return "\n\n".join(text_parts)
|
||||||
|
|
||||||
|
def _buildPromptWithPlaceholders(self, prompt: str, placeholders: Optional[Dict[str, str]]) -> str:
|
||||||
|
"""
|
||||||
|
Build full prompt by replacing placeholders with their content.
|
||||||
|
Uses the new {{KEY:placeholder}} format.
|
||||||
|
"""
|
||||||
|
if not placeholders:
|
||||||
|
return prompt
|
||||||
|
|
||||||
|
full_prompt = prompt
|
||||||
|
for placeholder, content in placeholders.items():
|
||||||
|
# Replace both old format {{placeholder}} and new format {{KEY:placeholder}}
|
||||||
|
full_prompt = full_prompt.replace(f"{{{{{placeholder}}}}}", content)
|
||||||
|
full_prompt = full_prompt.replace(f"{{{{KEY:{placeholder}}}}}", content)
|
||||||
|
|
||||||
|
return full_prompt
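# Illustration (not part of the diff): both the old {{report}} and the new {{KEY:report}}
# syntax are substituted with the same content, mirroring the replace calls above.
template = "Summarize {{KEY:report}} and then critique {{report}}."
placeholders = {"report": "Q3 sales report"}
resolved = template
for key, value in placeholders.items():
    resolved = resolved.replace(f"{{{{{key}}}}}", value)
    resolved = resolved.replace(f"{{{{KEY:{key}}}}}", value)
assert resolved == "Summarize Q3 sales report and then critique Q3 sales report."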
|
||||||
|
|
||||||
|
def _reducePlanningPrompt(
|
||||||
|
self,
|
||||||
|
full_prompt: str,
|
||||||
|
placeholders: Optional[Dict[str, str]],
|
||||||
|
model: ModelCapabilities,
|
||||||
|
options: AiCallOptions
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
Reduce planning prompt size by summarizing placeholders while preserving prompt structure.
|
||||||
|
"""
|
||||||
|
if not placeholders:
|
||||||
|
return self._reduceText(full_prompt, 0.7)
|
||||||
|
|
||||||
|
# Reduce placeholders while preserving prompt
|
||||||
|
reduced_placeholders = {}
|
||||||
|
for placeholder, content in placeholders.items():
|
||||||
|
if len(content) > 1000: # Only reduce long content
|
||||||
|
reduction_factor = 0.7
|
||||||
|
reduced_content = self._reduceText(content, reduction_factor)
|
||||||
|
reduced_placeholders[placeholder] = reduced_content
|
||||||
|
else:
|
||||||
|
reduced_placeholders[placeholder] = content
|
||||||
|
|
||||||
|
return self._buildPromptWithPlaceholders(full_prompt, reduced_placeholders)
|
||||||
|
|
||||||
|
def _reduceTextPrompt(
|
||||||
|
self,
|
||||||
|
prompt: str,
|
||||||
|
context: str,
|
||||||
|
model: ModelCapabilities,
|
||||||
|
options: AiCallOptions
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
Reduce text prompt size using typeGroup-aware chunking and merging.
|
||||||
|
"""
|
||||||
|
max_size = int(model.maxTokens * (1 - options.safetyMargin))
|
||||||
|
|
||||||
|
if options.compressPrompt:
|
||||||
|
# Reduce both prompt and context
|
||||||
|
target_size = max_size
|
||||||
|
current_size = len(prompt) + len(context)
|
||||||
|
reduction_factor = (target_size * 0.7) / current_size
|
||||||
|
|
||||||
|
if reduction_factor < 1.0:
|
||||||
|
prompt = self._reduceText(prompt, reduction_factor)
|
||||||
|
context = self._reduceText(context, reduction_factor)
|
||||||
|
else:
|
||||||
|
# Only reduce context, preserve prompt integrity
|
||||||
|
max_context_size = max_size - len(prompt)
|
||||||
|
if len(context) > max_context_size:
|
||||||
|
reduction_factor = max_context_size / len(context)
|
||||||
|
context = self._reduceText(context, reduction_factor)
|
||||||
|
|
||||||
|
return prompt + "\n\n" + context if context else prompt
|
||||||
|
|
||||||
|
async def _compressContent(self, content: str, targetSize: int, contentType: str) -> str:
|
||||||
|
"""Compress content to target size."""
|
||||||
|
if len(content.encode("utf-8")) <= targetSize:
|
||||||
|
return content
|
||||||
|
|
||||||
|
try:
|
||||||
|
compressionPrompt = f"""
|
||||||
|
Komprimiere den folgenden {contentType} auf maximal {targetSize} Zeichen,
|
||||||
|
behalte aber alle wichtigen Informationen bei:
|
||||||
|
|
||||||
|
{content}
|
||||||
|
|
||||||
|
Gib nur den komprimierten Inhalt zurück, ohne zusätzliche Erklärungen.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Service must not call connectors directly; use simple truncation fallback here
|
||||||
|
data = content.encode("utf-8")
|
||||||
|
return data[:targetSize].decode("utf-8", errors="ignore") + "... [truncated]"
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"AI compression failed, using truncation: {str(e)}")
|
||||||
|
return content[:targetSize] + "... [truncated]"
|
||||||
|
|
||||||
|
def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List], options: AiCallOptions) -> Dict[str, int]:
|
||||||
|
"""
|
||||||
|
Get model capabilities for content processing, including appropriate size limits for chunking.
|
||||||
|
"""
|
||||||
|
# Estimate total content size
|
||||||
|
prompt_size = len(prompt.encode('utf-8'))
|
||||||
|
document_size = 0
|
||||||
|
if documents:
|
||||||
|
# Rough estimate of document content size
|
||||||
|
for doc in documents:
|
||||||
|
document_size += getattr(doc, 'fileSize', 0) or 0
|
||||||
|
|
||||||
|
total_size = prompt_size + document_size
|
||||||
|
|
||||||
|
# Use AiObjects to select the best model for this content size
|
||||||
|
# We'll simulate the model selection by checking available models
|
||||||
|
from modules.interfaces.interfaceAiObjects import aiModels
|
||||||
|
|
||||||
|
# Find the best model for this content size and operation
|
||||||
|
best_model = None
|
||||||
|
best_context_length = 0
|
||||||
|
|
||||||
|
for model_name, model_info in aiModels.items():
|
||||||
|
context_length = model_info.get("contextLength", 0)
|
||||||
|
|
||||||
|
# Skip models with no context length or too small for content
|
||||||
|
if context_length == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check if model supports the operation type
|
||||||
|
capabilities = model_info.get("capabilities", [])
|
||||||
|
from modules.datamodels.datamodelAi import OperationType
|
||||||
|
if options.operationType == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities:
|
||||||
|
continue
|
||||||
|
elif options.operationType == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities:
|
||||||
|
continue
|
||||||
|
elif options.operationType == OperationType.WEB_RESEARCH and "web_search" not in capabilities:
|
||||||
|
continue
|
||||||
|
elif "text_generation" not in capabilities:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Prefer models that can handle the content without chunking, but allow chunking if needed
|
||||||
|
if context_length >= total_size * 0.8: # 80% of content size
|
||||||
|
if context_length > best_context_length:
|
||||||
|
best_model = model_info
|
||||||
|
best_context_length = context_length
|
||||||
|
elif best_model is None: # Fallback to largest available model
|
||||||
|
if context_length > best_context_length:
|
||||||
|
best_model = model_info
|
||||||
|
best_context_length = context_length
|
||||||
|
|
||||||
|
# Fallback to a reasonable default if no model found
|
||||||
|
if best_model is None:
|
||||||
|
best_model = {
|
||||||
|
"contextLength": 128000, # GPT-4o default
|
||||||
|
"llmName": "gpt-4o"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Calculate appropriate sizes
|
||||||
|
# Convert tokens to bytes (rough estimate: 1 token ≈ 4 characters)
|
||||||
|
context_length_bytes = int(best_model["contextLength"] * 4)
|
||||||
|
max_context_bytes = int(context_length_bytes * 0.9) # 90% of context length
|
||||||
|
text_chunk_size = int(max_context_bytes * 0.7) # 70% of max context for text chunks
|
||||||
|
image_chunk_size = int(max_context_bytes * 0.8) # 80% of max context for image chunks
|
||||||
|
|
||||||
|
logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}")
|
||||||
|
logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes")
|
||||||
|
logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes")
|
||||||
|
|
||||||
|
return {
|
||||||
|
"maxContextBytes": max_context_bytes,
|
||||||
|
"textChunkSize": text_chunk_size,
|
||||||
|
"imageChunkSize": image_chunk_size
|
||||||
|
}
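# Illustration (not part of the diff): the byte budgets derived above for the default
# 128k-token fallback model work out to roughly these values.
context_length_bytes = 128000 * 4                      # ~4 characters per token -> 512,000
max_context_bytes = int(context_length_bytes * 0.9)    # 460,800
text_chunk_size = int(max_context_bytes * 0.7)         # 322,560
image_chunk_size = int(max_context_bytes * 0.8)        # 368,640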
|
||||||
|
|
||||||
|
def _getModelsForOperation(self, operation_type: str, options: AiCallOptions) -> List[ModelCapabilities]:
|
||||||
|
"""
|
||||||
|
Get models capable of handling the specific operation with capability filtering.
|
||||||
|
"""
|
||||||
|
# Use the actual AI objects model selection instead of hardcoded default
|
||||||
|
if hasattr(self, 'aiObjects') and self.aiObjects:
|
||||||
|
# Let AiObjects handle the model selection
|
||||||
|
return []
|
||||||
|
else:
|
||||||
|
# Fallback to default model if AiObjects not available
|
||||||
|
default_model = ModelCapabilities(
|
||||||
|
name="default",
|
||||||
|
maxTokens=4000,
|
||||||
|
capabilities=["text", "reasoning"] if operation_type == "planning" else ["text"],
|
||||||
|
costPerToken=0.001,
|
||||||
|
processingTime=1.0,
|
||||||
|
isAvailable=True
|
||||||
|
)
|
||||||
|
return [default_model]
|
||||||
384
modules/services/serviceAi/subWebResearch.py
Normal file
384
modules/services/serviceAi/subWebResearch.py
Normal file
|
|
@ -0,0 +1,384 @@
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, List, Optional, Tuple, Union
|
||||||
|
from modules.datamodels.datamodelWeb import (
|
||||||
|
WebResearchRequest,
|
||||||
|
WebResearchActionResult,
|
||||||
|
WebResearchDocumentData,
|
||||||
|
WebResearchActionDocument,
|
||||||
|
WebSearchResultItem,
|
||||||
|
)
|
||||||
|
from modules.interfaces.interfaceAiObjects import AiObjects
|
||||||
|
from modules.shared.configuration import APP_CONFIG
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class SubWebResearch:
|
||||||
|
"""Web research operations including search, crawling, and analysis."""
|
||||||
|
|
||||||
|
def __init__(self, services, aiObjects):
|
||||||
|
"""Initialize web research service.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
services: Service center instance for accessing other services
|
||||||
|
aiObjects: Initialized AiObjects instance
|
||||||
|
"""
|
||||||
|
self.services = services
|
||||||
|
self.aiObjects = aiObjects
|
||||||
|
|
||||||
|
async def webResearch(self, request: WebResearchRequest) -> WebResearchActionResult:
|
||||||
|
"""Perform web research using interface functions."""
|
||||||
|
try:
|
||||||
|
logger.info(f"WEB RESEARCH STARTED")
|
||||||
|
logger.info(f"User Query: {request.user_prompt}")
|
||||||
|
logger.info(f"Max Results: {request.max_results}, Max Pages: {request.options.max_pages}")
|
||||||
|
|
||||||
|
# Global URL index to track all processed URLs across the entire research session
|
||||||
|
global_processed_urls = set()
|
||||||
|
|
||||||
|
# Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
|
||||||
|
logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===")
|
||||||
|
|
||||||
|
if request.urls:
|
||||||
|
# Use provided URLs as initial main URLs
|
||||||
|
websites = request.urls
|
||||||
|
logger.info(f"Using provided URLs ({len(websites)}):")
|
||||||
|
for i, url in enumerate(websites, 1):
|
||||||
|
logger.info(f" {i}. {url}")
|
||||||
|
else:
|
||||||
|
# Use AI to determine main URLs based on user's intention
|
||||||
|
logger.info(f"AI analyzing user intent: '{request.user_prompt}'")
|
||||||
|
|
||||||
|
# Use AI to generate optimized Tavily search query and search parameters
|
||||||
|
query_optimizer_prompt = f"""You are a search query optimizer.
|
||||||
|
|
||||||
|
USER QUERY: {request.user_prompt}
|
||||||
|
|
||||||
|
Your task: Create a search query and parameters for the USER QUERY given.
|
||||||
|
|
||||||
|
RULES:
|
||||||
|
1. The search query MUST be related to the user query above
|
||||||
|
2. Extract key terms from the user query
|
||||||
|
3. Determine appropriate country/language based on the query context
|
||||||
|
4. Keep search query short (2-6 words)
|
||||||
|
|
||||||
|
Return ONLY this JSON format:
|
||||||
|
{{
|
||||||
|
"user_prompt": "search query based on user query above",
|
||||||
|
"country": "Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries)",
|
||||||
|
"language": "language_code_or_null",
|
||||||
|
"topic": "general|news|academic_or_null",
|
||||||
|
"time_range": "d|w|m|y_or_null",
|
||||||
|
"selection_strategy": "single|multiple|specific_page",
|
||||||
|
"selection_criteria": "what URLs to prioritize",
|
||||||
|
"expected_url_patterns": ["pattern1", "pattern2"],
|
||||||
|
"estimated_result_count": number
|
||||||
|
}}"""
|
||||||
|
|
||||||
|
# Get AI response for query optimization
|
||||||
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions
|
||||||
|
ai_request = AiCallRequest(
|
||||||
|
prompt=query_optimizer_prompt,
|
||||||
|
options=AiCallOptions()
|
||||||
|
)
|
||||||
|
ai_response_obj = await self.aiObjects.call(ai_request)
|
||||||
|
ai_response = ai_response_obj.content
|
||||||
|
logger.debug(f"AI query optimizer response: {ai_response}")
|
||||||
|
|
||||||
|
# Parse AI response to extract search query
|
||||||
|
import json
|
||||||
|
try:
|
||||||
|
# Clean the response by removing markdown code blocks
|
||||||
|
cleaned_response = ai_response.strip()
|
||||||
|
if cleaned_response.startswith('```json'):
|
||||||
|
cleaned_response = cleaned_response[7:] # Remove ```json
|
||||||
|
if cleaned_response.endswith('```'):
|
||||||
|
cleaned_response = cleaned_response[:-3] # Remove ```
|
||||||
|
cleaned_response = cleaned_response.strip()
|
||||||
|
|
||||||
|
query_data = json.loads(cleaned_response)
|
||||||
|
search_query = query_data.get("user_prompt", request.user_prompt)
|
||||||
|
ai_country = query_data.get("country")
|
||||||
|
ai_language = query_data.get("language")
|
||||||
|
ai_topic = query_data.get("topic")
|
||||||
|
ai_time_range = query_data.get("time_range")
|
||||||
|
selection_strategy = query_data.get("selection_strategy", "multiple")
|
||||||
|
selection_criteria = query_data.get("selection_criteria", "relevant URLs")
|
||||||
|
expected_patterns = query_data.get("expected_url_patterns", [])
|
||||||
|
estimated_count = query_data.get("estimated_result_count", request.max_results)
|
||||||
|
|
||||||
|
logger.info(f"AI optimized search query: '{search_query}'")
|
||||||
|
logger.info(f"Selection strategy: {selection_strategy}")
|
||||||
|
logger.info(f"Selection criteria: {selection_criteria}")
|
||||||
|
logger.info(f"Expected URL patterns: {expected_patterns}")
|
||||||
|
logger.info(f"Estimated result count: {estimated_count}")
|
||||||
|
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
logger.warning("Failed to parse AI response as JSON, using original query")
|
||||||
|
search_query = request.user_prompt
|
||||||
|
ai_country = None
|
||||||
|
ai_language = None
|
||||||
|
ai_topic = None
|
||||||
|
ai_time_range = None
|
||||||
|
selection_strategy = "multiple"
|
||||||
|
|
||||||
|
# Perform the web search with AI-determined parameters
|
||||||
|
search_kwargs = {
|
||||||
|
"query": search_query,
|
||||||
|
"max_results": request.max_results,
|
||||||
|
"search_depth": request.options.search_depth,
|
||||||
|
"auto_parameters": False # Use explicit parameters
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add parameters only if they have valid values
|
||||||
|
def _normalizeCountry(c: Optional[str]) -> Optional[str]:
|
||||||
|
if not c:
|
||||||
|
return None
|
||||||
|
s = str(c).strip()
|
||||||
|
if not s or s.lower() in ['null', 'none', 'undefined']:
|
||||||
|
return None
|
||||||
|
# Map common codes to full English names when easy to do without extra deps
|
||||||
|
mapping = {
|
||||||
|
'ch': 'Switzerland', 'che': 'Switzerland',
|
||||||
|
'de': 'Germany', 'ger': 'Germany', 'deu': 'Germany',
|
||||||
|
'at': 'Austria', 'aut': 'Austria',
|
||||||
|
'us': 'United States', 'usa': 'United States', 'united states': 'United States',
|
||||||
|
'uk': 'United Kingdom', 'gb': 'United Kingdom', 'gbr': 'United Kingdom'
|
||||||
|
}
|
||||||
|
key = s.lower()
|
||||||
|
if key in mapping:
|
||||||
|
return mapping[key]
|
||||||
|
# If looks like full name, capitalize first letter only (Tavily accepts English names)
|
||||||
|
return s
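# Illustration (assumption, not part of the diff): if the optional pycountry package is
# installed, ISO code to full-name mapping can cover every country instead of the small
# hand-written table above.
try:
    import pycountry
    full_name = pycountry.countries.lookup("ch").name   # -> "Switzerland"
except (ImportError, LookupError):
    full_name = None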
|
||||||
|
|
||||||
|
norm_ai_country = _normalizeCountry(ai_country)
|
||||||
|
norm_req_country = _normalizeCountry(request.options.country)
|
||||||
|
if norm_ai_country:
|
||||||
|
search_kwargs["country"] = norm_ai_country
|
||||||
|
elif norm_req_country:
|
||||||
|
search_kwargs["country"] = norm_req_country
|
||||||
|
|
||||||
|
if ai_language and ai_language not in ['null', '', 'none', 'undefined']:
|
||||||
|
search_kwargs["language"] = ai_language
|
||||||
|
elif request.options.language and request.options.language not in ['null', '', 'none', 'undefined']:
|
||||||
|
search_kwargs["language"] = request.options.language
|
||||||
|
|
||||||
|
if ai_topic and ai_topic in ['general', 'news', 'academic']:
|
||||||
|
search_kwargs["topic"] = ai_topic
|
||||||
|
elif request.options.topic and request.options.topic in ['general', 'news', 'academic']:
|
||||||
|
search_kwargs["topic"] = request.options.topic
|
||||||
|
|
||||||
|
if ai_time_range and ai_time_range in ['d', 'w', 'm', 'y']:
|
||||||
|
search_kwargs["time_range"] = ai_time_range
|
||||||
|
elif request.options.time_range and request.options.time_range in ['d', 'w', 'm', 'y']:
|
||||||
|
search_kwargs["time_range"] = request.options.time_range
|
||||||
|
|
||||||
|
# Constrain by expected domains if provided by AI
|
||||||
|
try:
|
||||||
|
include_domains = []
|
||||||
|
for p in expected_patterns or []:
|
||||||
|
if not isinstance(p, str):
|
||||||
|
continue
|
||||||
|
# Extract bare domain from pattern or URL
|
||||||
|
import re
|
||||||
|
m = re.search(r"(?:https?://)?([^/\s]+)", p.strip())
|
||||||
|
if m:
|
||||||
|
domain = m.group(1).lower()
|
||||||
|
# strip leading www.
|
||||||
|
if domain.startswith('www.'):
|
||||||
|
domain = domain[4:]
|
||||||
|
include_domains.append(domain)
|
||||||
|
# Deduplicate
|
||||||
|
if include_domains:
|
||||||
|
seen = set()
|
||||||
|
uniq = []
|
||||||
|
for d in include_domains:
|
||||||
|
if d not in seen:
|
||||||
|
seen.add(d)
|
||||||
|
uniq.append(d)
|
||||||
|
search_kwargs["include_domains"] = uniq
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Log the parameters being used
|
||||||
|
logger.info(f"Search parameters: country={search_kwargs.get('country', 'not_set')}, language={search_kwargs.get('language', 'not_set')}, topic={search_kwargs.get('topic', 'not_set')}, time_range={search_kwargs.get('time_range', 'not_set')}, include_domains={search_kwargs.get('include_domains', [])}")
|
||||||
|
|
||||||
|
search_results = await self.aiObjects.search_websites(**search_kwargs)
|
||||||
|
|
||||||
|
logger.debug(f"Web search returned {len(search_results)} results:")
|
||||||
|
for i, result in enumerate(search_results, 1):
|
||||||
|
logger.debug(f" {i}. {result.url} - {result.title}")
|
||||||
|
|
||||||
|
# Deduplicate while preserving order
|
||||||
|
seen = set()
|
||||||
|
search_urls = []
|
||||||
|
for r in search_results:
|
||||||
|
u = str(r.url)
|
||||||
|
if u not in seen:
|
||||||
|
seen.add(u)
|
||||||
|
search_urls.append(u)
|
||||||
|
|
||||||
|
logger.info(f"After initial deduplication: {len(search_urls)} unique URLs from {len(search_results)} search results")
|
||||||
|
|
||||||
|
if not search_urls:
|
||||||
|
logger.error("No relevant websites found")
|
||||||
|
return WebResearchActionResult(success=False, error="No relevant websites found")
|
||||||
|
|
||||||
|
# Now use AI to determine the main URLs based on user's intention
|
||||||
|
logger.info(f"AI selecting main URLs from {len(search_urls)} search results based on user intent")
|
||||||
|
|
||||||
|
# Create a prompt for AI to identify main URLs based on user's intention
|
||||||
|
ai_prompt = f"""
|
||||||
|
Select the most relevant URLs from these search results:
|
||||||
|
|
||||||
|
{chr(10).join([f"{i+1}. {url}" for i, url in enumerate(search_urls)])}
|
||||||
|
|
||||||
|
Return only the URLs that are most relevant for the user's query.
|
||||||
|
One URL per line.
|
||||||
|
"""
|
||||||
|
# Create AI call request
|
||||||
|
ai_request = AiCallRequest(
|
||||||
|
prompt=ai_prompt,
|
||||||
|
options=AiCallOptions()
|
||||||
|
)
|
||||||
|
ai_response_obj = await self.aiObjects.call(ai_request)
|
||||||
|
ai_response = ai_response_obj.content
|
||||||
|
logger.debug(f"AI response for main URL selection: {ai_response}")
|
||||||
|
|
||||||
|
# Parse AI response to extract URLs
|
||||||
|
websites = []
|
||||||
|
for line in ai_response.strip().split('\n'):
|
||||||
|
line = line.strip()
|
||||||
|
if line and ('http://' in line or 'https://' in line):
|
||||||
|
# Extract URL from the line
|
||||||
|
for word in line.split():
|
||||||
|
if word.startswith('http://') or word.startswith('https://'):
|
||||||
|
websites.append(word.rstrip('.,;'))
|
||||||
|
break
|
||||||
|
|
||||||
|
if not websites:
|
||||||
|
logger.warning("AI did not identify any main URLs, using first few search results")
|
||||||
|
websites = search_urls[:3] # Fallback to first 3 search results
|
||||||
|
|
||||||
|
# Deduplicate while preserving order
|
||||||
|
seen = set()
|
||||||
|
unique_websites = []
|
||||||
|
for url in websites:
|
||||||
|
if url not in seen:
|
||||||
|
seen.add(url)
|
||||||
|
unique_websites.append(url)
|
||||||
|
|
||||||
|
logger.info(f"After AI selection deduplication: {len(unique_websites)} unique URLs from {len(websites)} AI-selected URLs")
websites = unique_websites
|
||||||
|
|
||||||
|
logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
|
||||||
|
for i, url in enumerate(websites, 1):
|
||||||
|
logger.info(f" {i}. {url}")
|
||||||
|
|
||||||
|
# Step 2: Smart website selection using AI interface
|
||||||
|
logger.info(f"=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===")
|
||||||
|
logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'")
|
||||||
|
|
||||||
|
selectedWebsites, aiResponse = await self.aiObjects.selectRelevantWebsites(websites, request.user_prompt)
|
||||||
|
|
||||||
|
logger.debug(f"AI Response: {aiResponse}")
|
||||||
|
logger.debug(f"AI selected {len(selectedWebsites)} most relevant URLs:")
|
||||||
|
for i, url in enumerate(selectedWebsites, 1):
|
||||||
|
logger.debug(f" {i}. {url}")
|
||||||
|
|
||||||
|
# Show which were filtered out
|
||||||
|
filtered_out = [url for url in websites if url not in selectedWebsites]
|
||||||
|
if filtered_out:
|
||||||
|
logger.debug(f"Filtered out {len(filtered_out)} less relevant URLs:")
|
||||||
|
for i, url in enumerate(filtered_out, 1):
|
||||||
|
logger.debug(f" {i}. {url}")
|
||||||
|
|
||||||
|
# Step 3+4+5: Recursive crawling with configurable depth
|
||||||
|
# Get configuration parameters
|
||||||
|
max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2"))
|
||||||
|
max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4"))
|
||||||
|
crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10"))
|
||||||
|
crawl_timeout_seconds = crawl_timeout_minutes * 60
|
||||||
|
|
||||||
|
# Use the configured max_depth or the request's pages_search_depth, whichever is smaller
|
||||||
|
effective_depth = min(max_depth, request.options.pages_search_depth)
|
||||||
|
|
||||||
|
logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {effective_depth}) ===")
|
||||||
|
logger.info(f"Starting recursive crawl of {len(selectedWebsites)} main websites...")
|
||||||
|
logger.info(f"Search depth: {effective_depth} levels (max configured: {max_depth})")
|
||||||
|
logger.info(f"Max links per domain: {max_links_per_domain}")
|
||||||
|
logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes")
|
||||||
|
|
||||||
|
# Use recursive crawling with URL index to avoid duplicates
|
||||||
|
import asyncio
|
||||||
|
try:
|
||||||
|
allContent = await asyncio.wait_for(
|
||||||
|
self.aiObjects.crawlRecursively(
|
||||||
|
urls=selectedWebsites,
|
||||||
|
max_depth=effective_depth,
|
||||||
|
extract_depth=request.options.extract_depth,
|
||||||
|
max_per_domain=max_links_per_domain,
|
||||||
|
global_processed_urls=global_processed_urls
|
||||||
|
),
|
||||||
|
timeout=crawl_timeout_seconds
|
||||||
|
)
|
||||||
|
logger.info(f"Crawling completed within timeout: {len(allContent)} pages crawled")
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
logger.warning(f"Crawling timed out after {crawl_timeout_minutes} minutes, using partial results")
|
||||||
|
# The timeout fired before crawlRecursively returned, so no partial results
# are available at this level; fall back to an empty result set.
allContent = {}
|
||||||
|
|
||||||
|
if not allContent:
|
||||||
|
logger.error("Could not extract content from any websites")
|
||||||
|
return WebResearchActionResult(success=False, error="Could not extract content from any websites")
|
||||||
|
|
||||||
|
logger.info(f"=== WEB RESEARCH COMPLETED ===")
|
||||||
|
logger.info(f"Successfully crawled {len(allContent)} URLs total")
|
||||||
|
logger.info(f"Crawl depth: {effective_depth} levels")
|
||||||
|
|
||||||
|
# Create simple result with raw content
|
||||||
|
sources = [WebSearchResultItem(title=url, url=url) for url in selectedWebsites]
|
||||||
|
|
||||||
|
# Get all additional links (all URLs except main ones)
|
||||||
|
additional_links = [url for url in allContent.keys() if url not in selectedWebsites]
|
||||||
|
|
||||||
|
# Combine all content into a single result
|
||||||
|
combinedContent = ""
|
||||||
|
for url, content in allContent.items():
|
||||||
|
combinedContent += f"\n\n=== {url} ===\n{content}\n"
|
||||||
|
|
||||||
|
documentData = WebResearchDocumentData(
|
||||||
|
user_prompt=request.user_prompt,
|
||||||
|
websites_analyzed=len(allContent),
|
||||||
|
additional_links_found=len(additional_links),
|
||||||
|
analysis_result=combinedContent, # Raw content, no analysis
|
||||||
|
sources=sources,
|
||||||
|
additional_links=additional_links,
|
||||||
|
individual_content=allContent, # Individual URL -> content mapping
|
||||||
|
debug_info={
|
||||||
|
"crawl_depth": effective_depth,
|
||||||
|
"max_configured_depth": max_depth,
|
||||||
|
"max_links_per_domain": max_links_per_domain,
|
||||||
|
"crawl_timeout_minutes": crawl_timeout_minutes,
|
||||||
|
"total_urls_crawled": len(allContent),
|
||||||
|
"main_urls": len(selectedWebsites),
|
||||||
|
"additional_urls": len(additional_links)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
document = WebResearchActionDocument(
|
||||||
|
documentName=f"web_research_{request.user_prompt[:50]}.json",
|
||||||
|
documentData=documentData,
|
||||||
|
mimeType="application/json"
|
||||||
|
)
|
||||||
|
|
||||||
|
return WebResearchActionResult(
|
||||||
|
success=True,
|
||||||
|
documents=[document],
|
||||||
|
resultLabel="web_research_results"
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error in web research: {str(e)}")
|
||||||
|
return WebResearchActionResult(success=False, error=str(e))
|
||||||
|
|
@ -7,8 +7,28 @@ from ..subRegistry import Extractor
|
||||||
|
|
||||||
|
|
||||||
class BinaryExtractor(Extractor):
|
||||||
|
"""
|
||||||
|
Fallback extractor for unsupported file types.
|
||||||
|
|
||||||
|
This extractor handles any file type that doesn't match other extractors.
|
||||||
|
It encodes the file as base64 and marks it as binary data.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- All file types (fallback)
|
||||||
|
- MIME types: application/octet-stream (default)
|
||||||
|
- File extensions: All (fallback)
|
||||||
|
"""
|
||||||
|
|
||||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return True
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
|
||||||
|
"""Return list of supported file extensions (all)."""
|
||||||
|
return [] # Accepts all extensions as fallback
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
|
||||||
|
"""Return list of supported MIME types (all)."""
|
||||||
|
return [] # Accepts all MIME types as fallback
|
||||||
|
|
||||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/octet-stream"
|
||||||
|
|
@ -6,8 +6,25 @@ from ..subRegistry import Extractor
|
||||||
|
|
||||||
|
|
||||||
class CsvExtractor(Extractor):
|
||||||
|
"""
|
||||||
|
Extractor for CSV files.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- MIME types: text/csv
|
||||||
|
- File extensions: .csv
|
||||||
|
- Special handling: Treats as table data
|
||||||
|
"""
|
||||||
|
|
||||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "text/csv" or (fileName or "").lower().endswith(".csv")
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
|
||||||
|
"""Return list of supported file extensions."""
|
||||||
|
return [".csv"]
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
|
||||||
|
"""Return list of supported MIME types."""
|
||||||
|
return ["text/csv"]
|
||||||
|
|
||||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
fileName = context.get("fileName")
|
||||||
|
|
@ -7,6 +7,16 @@ from ..subRegistry import Extractor
|
||||||
|
|
||||||
|
|
||||||
class DocxExtractor(Extractor):
|
||||||
|
"""
|
||||||
|
Extractor for Microsoft Word documents.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- MIME types: application/vnd.openxmlformats-officedocument.wordprocessingml.document
|
||||||
|
- File extensions: .docx
|
||||||
|
- Special handling: Extracts paragraphs and tables (converts tables to CSV)
|
||||||
|
- Dependencies: python-docx
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self):
self._loaded = False
self._haveLibs = False
|
||||||
|
|
@ -24,6 +34,14 @@ class DocxExtractor(Extractor):
|
||||||
|
|
||||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or (fileName or "").lower().endswith(".docx")
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
|
||||||
|
"""Return list of supported file extensions."""
|
||||||
|
return [".docx"]
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
|
||||||
|
"""Return list of supported MIME types."""
|
||||||
|
return ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
|
||||||
|
|
||||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load()
|
||||||
|
|
@ -7,8 +7,26 @@ from ..subRegistry import Extractor
|
||||||
|
|
||||||
|
|
||||||
class HtmlExtractor(Extractor):
|
||||||
|
"""
|
||||||
|
Extractor for HTML files.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- MIME types: text/html
|
||||||
|
- File extensions: .html, .htm
|
||||||
|
- Special handling: Uses BeautifulSoup for parsing
|
||||||
|
- Dependencies: beautifulsoup4
|
||||||
|
"""
|
||||||
|
|
||||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "text/html" or (fileName or "").lower().endswith((".html", ".htm"))
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
|
||||||
|
"""Return list of supported file extensions."""
|
||||||
|
return [".html", ".htm"]
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
|
||||||
|
"""Return list of supported MIME types."""
|
||||||
|
return ["text/html"]
|
||||||
|
|
||||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "text/html"
|
||||||
|
|
@ -0,0 +1,75 @@
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
import base64
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from ..subUtils import makeId
|
||||||
|
from modules.datamodels.datamodelExtraction import ContentPart
|
||||||
|
from ..subRegistry import Extractor
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ImageExtractor(Extractor):
|
||||||
|
"""
|
||||||
|
Extractor for image files.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- MIME types: image/jpeg, image/png, image/gif, image/webp, image/bmp, image/tiff
|
||||||
|
- File extensions: .jpg, .jpeg, .png, .gif, .webp, .bmp, .tiff
|
||||||
|
- Special handling: GIF files are converted to PNG during extraction
|
||||||
|
"""
|
||||||
|
|
||||||
|
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
|
||||||
|
return ((mimeType or "").startswith("image/") or
|
||||||
|
(fileName or "").lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff")))
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
|
||||||
|
"""Return list of supported file extensions."""
|
||||||
|
return [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"]
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
|
||||||
|
"""Return list of supported MIME types."""
|
||||||
|
return ["image/jpeg", "image/png", "image/gif", "image/webp", "image/bmp", "image/tiff"]
|
||||||
|
|
||||||
|
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||||
|
mimeType = context.get("mimeType") or "image/unknown"
|
||||||
|
fileName = context.get("fileName", "")
|
||||||
|
|
||||||
|
# Convert GIF to PNG during extraction
|
||||||
|
if mimeType.lower() == "image/gif":
|
||||||
|
try:
|
||||||
|
from PIL import Image
|
||||||
|
import io
|
||||||
|
|
||||||
|
# Open GIF and convert to PNG
|
||||||
|
with Image.open(io.BytesIO(fileBytes)) as img:
|
||||||
|
# Convert to RGB (removes animation)
|
||||||
|
if img.mode in ('RGBA', 'LA', 'P'):
|
||||||
|
img = img.convert('RGB')
|
||||||
|
|
||||||
|
# Save as PNG in memory
|
||||||
|
png_buffer = io.BytesIO()
|
||||||
|
img.save(png_buffer, format='PNG')
|
||||||
|
png_data = png_buffer.getvalue()
|
||||||
|
|
||||||
|
# Log sizes before replacing fileBytes, then update mimeType and fileBytes
logger.info(f"GIF converted to PNG during extraction: {fileName}, original={len(fileBytes)} bytes, converted={len(png_data)} bytes")
mimeType = "image/png"
fileBytes = png_data
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"GIF conversion failed during extraction for {fileName}: {str(e)}, using original")
|
||||||
|
# Keep original GIF data if conversion fails
|
||||||
|
|
||||||
|
return [ContentPart(
|
||||||
|
id=makeId(),
|
||||||
|
parentId=None,
|
||||||
|
label="image",
|
||||||
|
typeGroup="image",
|
||||||
|
mimeType=mimeType,
|
||||||
|
data=base64.b64encode(fileBytes).decode("utf-8"),
|
||||||
|
metadata={"size": len(fileBytes)}
|
||||||
|
)]
|
||||||
|
|
||||||
|
|
||||||
|
|
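Illustrative usage sketch, not part of the diff: exercising the GIF-to-PNG path of the new ImageExtractor above. The module path extractorImage follows the extractorXxx naming used for the other new files in this change and is an assumption; Pillow must be installed, and the tiny in-memory GIF stands in for real upload bytes.

import io
from PIL import Image
from modules.services.serviceExtraction.extractors.extractorImage import ImageExtractor  # assumed module name

buf = io.BytesIO()
Image.new("RGB", (4, 4), "red").save(buf, format="GIF")  # tiny stand-in GIF

parts = ImageExtractor().extract(buf.getvalue(), {"fileName": "sample.gif", "mimeType": "image/gif"})
print(parts[0].mimeType)          # "image/png" after conversion
print(parts[0].metadata["size"])  # size of the converted PNG bytes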
@@ -7,8 +7,25 @@ from ..subRegistry import Extractor
class JsonExtractor(Extractor):
    """
    Extractor for JSON files.

    Supported formats:
    - MIME types: application/json
    - File extensions: .json
    - Special handling: Validates JSON format, falls back to text if invalid
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return mimeType == "application/json" or (fileName or "").lower().endswith(".json")

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".json"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["application/json"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "application/json"
@@ -8,6 +8,16 @@ from ..subRegistry import Extractor
class PdfExtractor(Extractor):
    """
    Extractor for PDF files.

    Supported formats:
    - MIME types: application/pdf
    - File extensions: .pdf
    - Special handling: Extracts text per page and embedded images
    - Dependencies: PyPDF2, PyMuPDF (fitz)
    """

    def __init__(self):
        self._loaded = False
        self._haveLibs = False

@@ -26,6 +36,14 @@ class PdfExtractor(Extractor):
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return mimeType == "application/pdf" or (fileName or "").lower().endswith(".pdf")

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".pdf"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["application/pdf"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        self._load()
225  modules/services/serviceExtraction/extractors/extractorPptx.py  Normal file
@@ -0,0 +1,225 @@
import logging
import base64
from typing import List, Dict, Any, Optional
from modules.datamodels.datamodelExtraction import ContentPart, ContentExtracted
from ..subRegistry import Extractor

logger = logging.getLogger(__name__)


class PptxExtractor(Extractor):
    """
    Extractor for PowerPoint files.

    Supported formats:
    - MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint
    - File extensions: .pptx, .ppt
    - Special handling: Extracts slide content, tables, and images
    - Dependencies: python-pptx
    """

    def __init__(self):
        self._loaded = False
        self._haveLibs = False

    def _load(self):
        if self._loaded:
            return
        self._loaded = True
        try:
            global Presentation
            from pptx import Presentation
            self._haveLibs = True
        except Exception:
            self._haveLibs = False

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return (mimeType in [
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.ms-powerpoint"
        ]) or (fileName or "").lower().endswith((".pptx", ".ppt"))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".pptx", ".ppt"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return [
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.ms-powerpoint"
        ]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """
        Extract content from PowerPoint files.

        Args:
            fileBytes: Raw file data as bytes
            context: Context dictionary with file information

        Returns:
            List of ContentPart objects with extracted content
        """
        self._load()

        if not self._haveLibs:
            logger.error("python-pptx library not installed. Install with: pip install python-pptx")
            return [ContentPart(
                id="error",
                label="PowerPoint Extraction Error",
                typeGroup="text",
                mimeType="text/plain",
                data="Error: python-pptx library not installed",
                metadata={"error": True, "error_message": "python-pptx library not installed"}
            )]

        try:
            import io

            # Load presentation from bytes
            presentation = Presentation(io.BytesIO(fileBytes))

            parts = []
            slide_index = 0

            # Extract content from each slide
            for slide in presentation.slides:
                slide_index += 1
                slide_content = []

                # Extract text from slide
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        slide_content.append(shape.text.strip())

                # Extract table data
                for shape in slide.shapes:
                    if shape.has_table:
                        table = shape.table
                        table_data = []
                        for row in table.rows:
                            row_data = []
                            for cell in row.cells:
                                row_data.append(cell.text.strip())
                            table_data.append(row_data)

                        if table_data:
                            # Convert table to markdown format
                            table_md = self._table_to_markdown(table_data)
                            slide_content.append(table_md)

                # Extract images
                for shape in slide.shapes:
                    if shape.shape_type == 13:  # MSO_SHAPE_TYPE.PICTURE
                        try:
                            image = shape.image
                            image_bytes = image.blob
                            image_b64 = base64.b64encode(image_bytes).decode('utf-8')

                            # Create image part
                            image_part = ContentPart(
                                id=f"slide_{slide_index}_image_{len(parts)}",
                                label=f"Slide {slide_index} Image",
                                typeGroup="image",
                                mimeType="image/png",  # Default to PNG
                                data=image_b64,
                                metadata={
                                    "slide_number": slide_index,
                                    "shape_type": "image",
                                    "extracted_from": "powerpoint"
                                }
                            )
                            parts.append(image_part)
                        except Exception as e:
                            logger.warning(f"Failed to extract image from slide {slide_index}: {str(e)}")

                # Create slide content part
                if slide_content:
                    slide_text = f"# Slide {slide_index}\n\n" + "\n\n".join(slide_content)

                    slide_part = ContentPart(
                        id=f"slide_{slide_index}",
                        label=f"Slide {slide_index} Content",
                        typeGroup="structure",
                        mimeType="text/plain",
                        data=slide_text,
                        metadata={
                            "slide_number": slide_index,
                            "content_type": "slide",
                            "extracted_from": "powerpoint",
                            "text_length": len(slide_text)
                        }
                    )
                    parts.append(slide_part)

            # Create presentation overview
            file_name = context.get("fileName", "presentation.pptx")
            overview_text = f"# PowerPoint Presentation: {file_name}\n\n"
            overview_text += f"**Total Slides:** {len(presentation.slides)}\n\n"
            overview_text += f"**Content Parts:** {len(parts)}\n\n"

            # Add slide summaries
            for i, slide in enumerate(presentation.slides, 1):
                slide_text_parts = []
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        slide_text_parts.append(shape.text.strip())

                if slide_text_parts:
                    overview_text += f"## Slide {i}\n"
                    overview_text += "\n".join(slide_text_parts[:3])  # First 3 text elements
                    overview_text += "\n\n"

            # Create overview part
            overview_part = ContentPart(
                id="presentation_overview",
                label="Presentation Overview",
                typeGroup="text",
                mimeType="text/plain",
                data=overview_text,
                metadata={
                    "content_type": "overview",
                    "extracted_from": "powerpoint",
                    "total_slides": len(presentation.slides),
                    "text_length": len(overview_text)
                }
            )
            parts.insert(0, overview_part)  # Insert at beginning

            return parts

        except Exception as e:
            logger.error(f"Error extracting PowerPoint content: {str(e)}")
            return [ContentPart(
                id="error",
                label="PowerPoint Extraction Error",
                typeGroup="text",
                mimeType="text/plain",
                data=f"Error extracting PowerPoint content: {str(e)}",
                metadata={"error": True, "error_message": str(e)}
            )]

    def _table_to_markdown(self, table_data: List[List[str]]) -> str:
        """Convert table data to markdown format."""
        if not table_data:
            return ""

        markdown_lines = []

        # Header row
        if table_data:
            header = "| " + " | ".join(table_data[0]) + " |"
            markdown_lines.append(header)

            # Separator row
            separator = "| " + " | ".join(["---"] * len(table_data[0])) + " |"
            markdown_lines.append(separator)

            # Data rows
            for row in table_data[1:]:
                data_row = "| " + " | ".join(row) + " |"
                markdown_lines.append(data_row)

        return "\n".join(markdown_lines)
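A quick illustration, not part of the diff, of what the markdown conversion in _table_to_markdown produces; the import path assumes the extractorPptx.py location shown above.

from modules.services.serviceExtraction.extractors.extractorPptx import PptxExtractor

table_md = PptxExtractor()._table_to_markdown([
    ["Quarter", "Revenue"],
    ["Q1", "10"],
    ["Q2", "12"],
])
print(table_md)
# | Quarter | Revenue |
# | --- | --- |
# | Q1 | 10 |
# | Q2 | 12 |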
@@ -0,0 +1,56 @@
from typing import Any, Dict, List

from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor


class SqlExtractor(Extractor):
    """
    Extractor for SQL files.

    Supported formats:
    - MIME types: text/x-sql, application/sql
    - File extensions: .sql, .ddl, .dml, .dcl, .tcl
    - Special handling: Treats as structured text with SQL syntax
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return (mimeType in ("text/x-sql", "application/sql") or
                (fileName or "").lower().endswith((".sql", ".ddl", ".dml", ".dcl", ".tcl")))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".sql", ".ddl", ".dml", ".dcl", ".tcl"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["text/x-sql", "application/sql"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        fileName = context.get("fileName")
        mimeType = context.get("mimeType") or "text/x-sql"
        data = fileBytes.decode("utf-8", errors="replace")

        # Add SQL-specific metadata
        metadata = {
            "size": len(fileBytes),
            "file_type": "sql",
            "line_count": len(data.splitlines()),
            "has_select": "SELECT" in data.upper(),
            "has_insert": "INSERT" in data.upper(),
            "has_update": "UPDATE" in data.upper(),
            "has_delete": "DELETE" in data.upper(),
            "has_create": "CREATE" in data.upper(),
            "has_drop": "DROP" in data.upper()
        }

        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="structure",
            mimeType=mimeType,
            data=data,
            metadata=metadata
        )]
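A small sketch, not part of the diff, of the SQL-specific metadata the extractor above attaches; the import path assumes an extractorSql module alongside the other extractors, and the flag values follow directly from the keyword checks.

from modules.services.serviceExtraction.extractors.extractorSql import SqlExtractor  # assumed module name

parts = SqlExtractor().extract(
    b"CREATE TABLE users (id INT);\nINSERT INTO users VALUES (1);",
    {"fileName": "schema.sql", "mimeType": "text/x-sql"},
)
meta = parts[0].metadata
print(meta["line_count"], meta["has_create"], meta["has_insert"], meta["has_select"])
# 2 True True False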
103  modules/services/serviceExtraction/extractors/extractorText.py  Normal file
@@ -0,0 +1,103 @@
from typing import Any, Dict, List

from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor


class TextExtractor(Extractor):
    """
    Extractor for plain text files and code files.

    Supported formats:
    - MIME types: text/plain, text/markdown, text/x-python, text/x-java-source, text/javascript, etc.
    - File extensions: .txt, .md, .log, .java, .js, .jsx, .ts, .tsx, .py, .config, .ini, .cfg, .conf, .properties, .yaml, .yml, .toml, .sh, .bat, .ps1, .sql, .css, .scss, .sass, .less, .xml, .json, .csv, .tsv, .rtf, .tex, .rst, .adoc, .org, .pod, .man, .1, .2, .3, .4, .5, .6, .7, .8, .9, .n, .l, .m, .r, .t, .x, .y, .z
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        # Check MIME types
        if mimeType and mimeType.startswith("text/"):
            return True

        # Check file extensions
        if fileName:
            ext = fileName.lower()
            return ext.endswith((
                # Basic text files
                ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
                # Programming languages
                ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
                ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
                # Web technologies
                ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
                # Configuration files
                ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
                # Scripts and automation
                ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
                # Data files
                ".csv", ".tsv", ".tab", ".dat", ".data",
                # Documentation
                ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
                # Other text formats
                ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
                ".env", ".env.local", ".env.development", ".env.production", ".env.test",
                ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
            ))

        return False

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [
            # Basic text files
            ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
            # Programming languages
            ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
            ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
            # Web technologies
            ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
            # Configuration files
            ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
            # Scripts and automation
            ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
            # Data files
            ".csv", ".tsv", ".tab", ".dat", ".data",
            # Documentation
            ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
            # Other text formats
            ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
            ".env", ".env.local", ".env.development", ".env.production", ".env.test",
            ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
        ]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return [
            "text/plain", "text/markdown", "text/x-python", "text/x-java-source",
            "text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript",
            "text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby",
            "text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin",
            "text/x-sql", "text/x-sh", "text/x-shellscript", "text/x-yaml", "text/x-toml",
            "text/x-ini", "text/x-config", "text/x-properties", "text/x-log",
            "text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less",
            "text/xml", "text/csv", "text/tab-separated-values", "text/rtf",
            "text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org",
            "application/x-yaml", "application/x-toml", "application/x-ini",
            "application/x-config", "application/x-properties", "application/x-log"
        ]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        fileName = context.get("fileName")
        mimeType = context.get("mimeType") or "text/plain"
        data = fileBytes.decode("utf-8", errors="replace")
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="text",
            mimeType=mimeType,
            data=data,
            metadata={"size": len(fileBytes)}
        )]
@@ -8,6 +8,16 @@ from ..subRegistry import Extractor
class XlsxExtractor(Extractor):
    """
    Extractor for Microsoft Excel spreadsheets.

    Supported formats:
    - MIME types: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
    - File extensions: .xlsx, .xlsm
    - Special handling: Extracts all sheets as CSV data
    - Dependencies: openpyxl
    """

    def __init__(self):
        self._loaded = False
        self._haveLibs = False

@@ -26,6 +36,14 @@ class XlsxExtractor(Extractor):
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        mt = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        return mimeType == mt or (fileName or "").lower().endswith((".xlsx", ".xlsm"))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".xlsx", ".xlsm"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        self._load()
@@ -7,8 +7,25 @@ from ..subRegistry import Extractor
class XmlExtractor(Extractor):
    """
    Extractor for XML files.

    Supported formats:
    - MIME types: application/xml
    - File extensions: .xml, .rss, .atom
    - Special handling: Uses ElementTree for parsing
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return mimeType == "application/xml" or (fileName or "").lower().endswith((".xml", ".rss", ".atom"))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".xml", ".rss", ".atom"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["application/xml"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "application/xml"
@@ -1,25 +0,0 @@
from typing import Any, Dict, List
import base64

from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor


class ImageExtractor(Extractor):
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return (mimeType or "").startswith("image/")

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "image/unknown"
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="image",
            typeGroup="image",
            mimeType=mimeType,
            data=base64.b64encode(fileBytes).decode("utf-8"),
            metadata={"size": len(fileBytes)}
        )]
@@ -1,26 +0,0 @@
from typing import Any, Dict, List

from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor


class TextExtractor(Extractor):
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return mimeType in ("text/plain", "text/markdown")

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        fileName = context.get("fileName")
        mimeType = context.get("mimeType") or "text/plain"
        data = fileBytes.decode("utf-8", errors="replace")
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="text",
            mimeType=mimeType,
            data=data,
            metadata={"size": len(fileBytes)}
        )]
@@ -67,10 +67,12 @@ class ExtractionService:
            if part.metadata:
                logger.debug(f"  Metadata: {part.metadata}")

-       # Attach document id to parts if missing
+       # Attach document id and MIME type to parts if missing
        for p in ec.parts:
            if "documentId" not in p.metadata:
                p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4())
+           if "documentMimeType" not in p.metadata:
+               p.metadata["documentMimeType"] = documentData["mimeType"]

        # Log chunking information
        chunked_parts = [p for p in ec.parts if p.metadata.get("chunk", False)]
209  modules/services/serviceExtraction/subMerger.py  Normal file
@@ -0,0 +1,209 @@
"""
Intelligent Token-Aware Merger for optimizing AI calls based on LLM token limits.
"""
from typing import List, Dict, Any, Tuple
import logging
from modules.datamodels.datamodelExtraction import ContentPart
from .subUtils import makeId

logger = logging.getLogger(__name__)


class IntelligentTokenAwareMerger:
    """
    Intelligent merger that groups chunks based on LLM token limits to minimize AI calls.

    Strategy:
    1. Calculate token count for each chunk
    2. Group chunks to maximize token usage without exceeding limits
    3. Preserve document structure and semantic boundaries
    4. Minimize total number of AI calls
    """

    def __init__(self, model_capabilities: Dict[str, Any]):
        self.max_tokens = model_capabilities.get("maxTokens", 4000)
        self.safety_margin = model_capabilities.get("safetyMargin", 0.1)
        self.effective_max_tokens = int(self.max_tokens * (1 - self.safety_margin))
        self.chars_per_token = model_capabilities.get("charsPerToken", 4)  # Rough estimation

    def merge_chunks_intelligently(self, chunks: List[ContentPart], prompt: str = "") -> List[ContentPart]:
        """
        Merge chunks intelligently based on token limits.

        Args:
            chunks: List of ContentPart chunks to merge
            prompt: AI prompt to account for in token calculation

        Returns:
            List of optimally merged ContentPart objects
        """
        if not chunks:
            return chunks

        logger.info(f"🧠 Intelligent merging: {len(chunks)} chunks, max_tokens={self.effective_max_tokens}")

        # Calculate tokens for prompt
        prompt_tokens = self._estimate_tokens(prompt)
        available_tokens = self.effective_max_tokens - prompt_tokens

        logger.info(f"📊 Prompt tokens: {prompt_tokens}, Available for content: {available_tokens}")

        # Group chunks by document and type for semantic coherence
        grouped_chunks = self._group_chunks_by_document_and_type(chunks)

        merged_parts = []

        for group_key, group_chunks in grouped_chunks.items():
            logger.info(f"📁 Processing group: {group_key} ({len(group_chunks)} chunks)")

            # Merge chunks within this group optimally
            group_merged = self._merge_group_optimally(group_chunks, available_tokens)
            merged_parts.extend(group_merged)

        logger.info(f"✅ Intelligent merging complete: {len(chunks)} → {len(merged_parts)} parts")
        return merged_parts

    def _group_chunks_by_document_and_type(self, chunks: List[ContentPart]) -> Dict[str, List[ContentPart]]:
        """Group chunks by document and type for semantic coherence."""
        groups = {}

        for chunk in chunks:
            # Create group key: document_id + type_group
            doc_id = chunk.metadata.get("documentId", "unknown")
            type_group = chunk.typeGroup
            group_key = f"{doc_id}_{type_group}"

            if group_key not in groups:
                groups[group_key] = []
            groups[group_key].append(chunk)

        return groups

    def _merge_group_optimally(self, chunks: List[ContentPart], available_tokens: int) -> List[ContentPart]:
        """Merge chunks within a group optimally to minimize AI calls."""
        if not chunks:
            return []

        # Sort chunks by size (smallest first for better packing)
        sorted_chunks = sorted(chunks, key=lambda c: self._estimate_tokens(c.data))

        merged_parts = []
        current_group = []
        current_tokens = 0

        for chunk in sorted_chunks:
            chunk_tokens = self._estimate_tokens(chunk.data)

            # Special case: If single chunk is already at max size, process it alone
            if chunk_tokens >= available_tokens * 0.9:  # 90% of available tokens
                # Finalize current group if it exists
                if current_group:
                    merged_part = self._create_merged_part(current_group, current_tokens)
                    merged_parts.append(merged_part)
                    current_group = []
                    current_tokens = 0

                # Process large chunk individually
                merged_parts.append(chunk)
                logger.debug(f"🔍 Large chunk processed individually: {chunk_tokens} tokens")
                continue

            # If adding this chunk would exceed limit, finalize current group
            if current_tokens + chunk_tokens > available_tokens and current_group:
                merged_part = self._create_merged_part(current_group, current_tokens)
                merged_parts.append(merged_part)
                current_group = [chunk]
                current_tokens = chunk_tokens
            else:
                current_group.append(chunk)
                current_tokens += chunk_tokens

        # Finalize remaining group
        if current_group:
            merged_part = self._create_merged_part(current_group, current_tokens)
            merged_parts.append(merged_part)

        logger.info(f"📦 Group merged: {len(chunks)} → {len(merged_parts)} parts")
        return merged_parts

    def _create_merged_part(self, chunks: List[ContentPart], total_tokens: int) -> ContentPart:
        """Create a merged ContentPart from multiple chunks."""
        if len(chunks) == 1:
            return chunks[0]  # No need to merge single chunk

        # Combine data with semantic separators
        combined_data = self._combine_chunk_data(chunks)

        # Use metadata from first chunk as base
        base_chunk = chunks[0]
        merged_metadata = base_chunk.metadata.copy()
        merged_metadata.update({
            "merged": True,
            "originalChunkCount": len(chunks),
            "totalTokens": total_tokens,
            "originalChunkIds": [c.id for c in chunks],
            "size": len(combined_data.encode('utf-8'))
        })

        merged_part = ContentPart(
            id=makeId(),
            parentId=base_chunk.parentId,
            label=f"merged_{len(chunks)}_chunks",
            typeGroup=base_chunk.typeGroup,
            mimeType=base_chunk.mimeType,
            data=combined_data,
            metadata=merged_metadata
        )

        logger.debug(f"🔗 Created merged part: {len(chunks)} chunks, {total_tokens} tokens")
        return merged_part

    def _combine_chunk_data(self, chunks: List[ContentPart]) -> str:
        """Combine chunk data with appropriate separators."""
        if not chunks:
            return ""

        # Use different separators based on content type
        if chunks[0].typeGroup == "text":
            separator = "\n\n---\n\n"  # Clear text separation
        elif chunks[0].typeGroup == "table":
            separator = "\n\n[TABLE BREAK]\n\n"  # Table separation
        else:
            separator = "\n\n---\n\n"  # Default separation

        return separator.join([chunk.data for chunk in chunks])

    def _estimate_tokens(self, text: str) -> int:
        """Estimate token count for text."""
        if not text:
            return 0
        return len(text) // self.chars_per_token

    def calculate_optimization_stats(self, original_chunks: List[ContentPart], merged_parts: List[ContentPart]) -> Dict[str, Any]:
        """Calculate optimization statistics with detailed analysis."""
        original_calls = len(original_chunks)
        optimized_calls = len(merged_parts)
        reduction_percent = ((original_calls - optimized_calls) / original_calls * 100) if original_calls > 0 else 0

        # Analyze chunk sizes
        large_chunks = [c for c in original_chunks if self._estimate_tokens(c.data) >= self.effective_max_tokens * 0.9]
        small_chunks = [c for c in original_chunks if self._estimate_tokens(c.data) < self.effective_max_tokens * 0.9]

        # Calculate theoretical maximum optimization (if all small chunks could be merged)
        theoretical_min_calls = len(large_chunks) + max(1, len(small_chunks) // 3)  # Assume 3 small chunks per call
        theoretical_reduction = ((original_calls - theoretical_min_calls) / original_calls * 100) if original_calls > 0 else 0

        return {
            "original_ai_calls": original_calls,
            "optimized_ai_calls": optimized_calls,
            "reduction_percent": round(reduction_percent, 1),
            "cost_savings": f"{reduction_percent:.1f}%",
            "efficiency_gain": f"{original_calls / optimized_calls:.1f}x" if optimized_calls > 0 else "∞",
            "analysis": {
                "large_chunks": len(large_chunks),
                "small_chunks": len(small_chunks),
                "theoretical_min_calls": theoretical_min_calls,
                "theoretical_reduction": round(theoretical_reduction, 1),
                "optimization_potential": "high" if reduction_percent > 50 else "moderate" if reduction_percent > 20 else "low"
            }
        }
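Illustrative sketch, not part of the diff, of the merger's packing behaviour with the defaults above; the capability values are examples and the ContentPart fields follow the datamodel used throughout this change.

from modules.datamodels.datamodelExtraction import ContentPart
from modules.services.serviceExtraction.subMerger import IntelligentTokenAwareMerger

merger = IntelligentTokenAwareMerger({"maxTokens": 4000, "safetyMargin": 0.1, "charsPerToken": 4})

chunks = [
    ContentPart(id=str(i), parentId=None, label=f"chunk_{i}", typeGroup="text",
                mimeType="text/plain", data="lorem ipsum " * 50,
                metadata={"documentId": "doc-1"})
    for i in range(6)
]

merged = merger.merge_chunks_intelligently(chunks, prompt="Summarize the document.")
stats = merger.calculate_optimization_stats(chunks, merged)
# Six ~150-token chunks from one document fit into a single merged part here,
# so reduction_percent comes out around 83%.
print(len(merged), stats["reduction_percent"])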
@@ -3,11 +3,13 @@ import logging
import os

from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
+from modules.shared.configuration import APP_CONFIG
from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry
-from .merging.text_merger import TextMerger
-from .merging.table_merger import TableMerger
-from .merging.default_merger import DefaultMerger
+from .merging.mergerText import TextMerger
+from .merging.mergerTable import TableMerger
+from .merging.mergerDefault import DefaultMerger
+from .subMerger import IntelligentTokenAwareMerger

logger = logging.getLogger(__name__)
@@ -84,46 +86,55 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
    chunk_parts = [p for p in parts if p.metadata.get("chunk", False)]

    logger.debug(f"runExtraction: Preserving {len(chunk_parts)} chunks from merging")
+   logger.debug(f"runExtraction - non_chunk_parts: {len(non_chunk_parts)}, chunk_parts: {len(chunk_parts)}")

+   # Apply intelligent merging for small text parts
    if non_chunk_parts:
+       # Count text parts
+       text_parts = [p for p in non_chunk_parts if p.typeGroup == "text"]
+       if len(text_parts) > 5:  # If we have many small text parts, merge them
+           logger.info(f"🔧 Merging {len(text_parts)} small text parts for efficiency")
        non_chunk_parts = _mergeParts(non_chunk_parts, mergeStrategy)

    # Combine non-chunk parts with chunk parts (chunks stay separate)
    parts = non_chunk_parts + chunk_parts

    logger.debug(f"runExtraction: Final parts after merging: {len(parts)} (chunks: {len(chunk_parts)})")
-   # DEBUG: dump parts and chunks to files TODO TO REMOVE
+   logger.debug(f"runExtraction - Final parts: {len(parts)} (chunks: {len(chunk_parts)})")
+   # DEBUG: dump parts and chunks to files - only if debug enabled
    try:
-       base_dir = "./test-chat/ai"
-       os.makedirs(base_dir, exist_ok=True)
+       debug_enabled = APP_CONFIG.get("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
+       if debug_enabled:
+           base_dir = "./test-chat/ai"
+           os.makedirs(base_dir, exist_ok=True)

            # Generate timestamp for consistent naming
            from datetime import datetime, UTC
            ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3]

            # Write a summary file
            summary_lines: List[str] = [f"fileName: {fileName}", f"mimeType: {mimeType}", f"totalParts: {len(parts)}"]
            text_index = 0
            for idx, part in enumerate(parts):
                is_texty = part.typeGroup in ("text", "table", "structure")
                size = int(part.metadata.get("size", 0) or 0)
                is_chunk = bool(part.metadata.get("chunk", False))
                summary_lines.append(
                    f"part[{idx}]: typeGroup={part.typeGroup}, label={part.label}, size={size}, chunk={is_chunk}"
                )
                if is_texty and getattr(part, "data", None):
                    text_index += 1
                    fname = f"{ts}_extract_{fileName}_part_{idx:03d}_{'chunk' if is_chunk else 'full'}_{text_index:03d}.txt"
                    fpath = os.path.join(base_dir, fname)
                    with open(fpath, "w", encoding="utf-8") as f:
                        f.write(f"# typeGroup: {part.typeGroup}\n# label: {part.label}\n# chunk: {is_chunk}\n# size: {size}\n\n")
                        f.write(str(part.data))

            # Write summary file
            summary_fname = f"{ts}_extract_{fileName}_summary.txt"
            summary_fpath = os.path.join(base_dir, summary_fname)
            with open(summary_fpath, "w", encoding="utf-8") as f:
                f.write("\n".join(summary_lines))
    except Exception as _e:
        logger.debug(f"Debug dump skipped: {_e}")
@@ -146,13 +157,22 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
    kept: List[ContentPart] = []
    remaining: List[ContentPart] = []

-   for p in parts:
+   logger.debug(f"Starting poolAndLimit with {len(parts)} parts, maxSize={maxSize}")
+
+   for i, p in enumerate(parts):
        size = int(p.metadata.get("size", 0) or 0)
+       # Show first 50 characters of text content for debugging
+       content_preview = p.data[:50].replace('\n', '\\n') if p.data else ""
+       logger.debug(f"Part {i}: {p.typeGroup} - {size} bytes - '{content_preview}...' (current: {current})")
        if current + size <= maxSize:
            kept.append(p)
            current += size
+           logger.debug(f"Part {i} kept (total: {current})")
        else:
            remaining.append(p)
+           logger.debug(f"Part {i} moved to remaining")
+
+   logger.debug(f"Kept: {len(kept)}, Remaining: {len(remaining)}")

    # If we have remaining parts and chunking is allowed, try chunking
    if remaining and chunkAllowed:
@@ -160,12 +180,15 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
        logger.debug(f"Remaining parts to chunk: {len(remaining)}")
        logger.debug(f"Max size limit: {maxSize} bytes")
        logger.debug(f"Current size used: {current} bytes")
+       logger.debug(f"Chunking {len(remaining)} remaining parts")

        for p in remaining:
-           if p.typeGroup in ("text", "table", "structure", "image"):
+           if p.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
                logger.debug(f"Chunking {p.typeGroup} part: {len(p.data)} chars")
+               logger.debug(f"Chunking {p.typeGroup} part with {len(p.data)} chars")
                chunks = chunkerRegistry.resolve(p.typeGroup).chunk(p, options)
                logger.debug(f"Created {len(chunks)} chunks")
+               logger.debug(f"Created {len(chunks)} chunks")

                chunks_added = 0
                for ch in chunks:
@@ -197,12 +220,18 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
    logger.debug(f"Preserving {len(chunk_parts)} chunks from merging")

+   # Apply intelligent merging for small text parts
    if non_chunk_parts:
+       # Count text parts
+       text_parts = [p for p in non_chunk_parts if p.typeGroup == "text"]
+       if len(text_parts) > 5:  # If we have many small text parts, merge them
+           logger.info(f"🔧 Merging {len(text_parts)} small text parts for efficiency")
        non_chunk_parts = _applyMerging(non_chunk_parts, mergeStrategy)

    # Combine non-chunk parts with chunk parts (chunks stay separate)
    kept = non_chunk_parts + chunk_parts

+   logger.debug(f"Final parts after merging: {len(kept)} (chunks: {len(chunk_parts)})")
    logger.debug(f"Final parts after merging: {len(kept)} (chunks: {len(chunk_parts)})")

    # Re-check size after merging
@@ -211,11 +240,30 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
    # Apply size limit to merged parts
    kept = _applySizeLimit(kept, maxSize)

+   logger.debug(f"poolAndLimit returning {len(kept)} parts")
    return kept


def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
-   """Apply merging strategy to parts."""
+   """Apply merging strategy to parts with intelligent token-aware merging."""
+   logger.debug(f"_applyMerging called with {len(parts)} parts")
+
+   # Check if intelligent merging is enabled
+   if strategy.get("useIntelligentMerging", False):
+       model_capabilities = strategy.get("modelCapabilities", {})
+       subMerger = IntelligentTokenAwareMerger(model_capabilities)
+
+       # Use intelligent merging for all parts
+       merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
+
+       # Calculate and log optimization stats
+       stats = subMerger.calculate_optimization_stats(parts, merged)
+       logger.info(f"🧠 Intelligent merging stats: {stats}")
+       logger.debug(f"Intelligent merging: {stats['original_ai_calls']} → {stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")
+
+       return merged
+
+   # Fallback to traditional merging
    textMerger = TextMerger()
    tableMerger = TableMerger()
    defaultMerger = DefaultMerger()
@@ -226,18 +274,29 @@ def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[Co
    structureParts = [p for p in parts if p.typeGroup == "structure"]
    otherParts = [p for p in parts if p.typeGroup not in ("text", "table", "structure")]

+   logger.debug(f"Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}")
+
    merged: List[ContentPart] = []

    if textParts:
-       merged.extend(textMerger.merge(textParts, strategy))
+       textMerged = textMerger.merge(textParts, strategy)
+       logger.debug(f"TextMerger merged {len(textParts)} parts into {len(textMerged)} parts")
+       merged.extend(textMerged)
    if tableParts:
-       merged.extend(tableMerger.merge(tableParts, strategy))
+       tableMerged = tableMerger.merge(tableParts, strategy)
+       logger.debug(f"TableMerger merged {len(tableParts)} parts into {len(tableMerged)} parts")
+       merged.extend(tableMerged)
    if structureParts:
        # For now, treat structure like text
-       merged.extend(textMerger.merge(structureParts, strategy))
+       structureMerged = textMerger.merge(structureParts, strategy)
+       logger.debug(f"StructureMerger merged {len(structureParts)} parts into {len(structureMerged)} parts")
+       merged.extend(structureMerged)
    if otherParts:
-       merged.extend(defaultMerger.merge(otherParts, strategy))
+       otherMerged = defaultMerger.merge(otherParts, strategy)
+       logger.debug(f"DefaultMerger merged {len(otherParts)} parts into {len(otherMerged)} parts")
+       merged.extend(otherMerged)

+   logger.debug(f"_applyMerging returning {len(merged)} parts")
    return merged
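For reference, a sketch of a mergeStrategy dict that would take the new intelligent branch in _applyMerging above; the key names are exactly the ones the code reads, while the concrete values are only illustrative.

# Illustrative values; key names match what _applyMerging and IntelligentTokenAwareMerger read.
mergeStrategy = {
    "useIntelligentMerging": True,     # route parts through IntelligentTokenAwareMerger
    "modelCapabilities": {
        "maxTokens": 8000,             # model context budget
        "safetyMargin": 0.1,           # keep 10% headroom
        "charsPerToken": 4,            # rough chars-per-token estimate
    },
    "prompt": "Extract the key facts from the following content.",
}
# With "useIntelligentMerging" False (or absent), _applyMerging falls back to
# TextMerger / TableMerger / DefaultMerger as before.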
@@ -1,14 +1,37 @@
from typing import Any, Dict, Optional
import logging

from modules.datamodels.datamodelExtraction import ContentPart

logger = logging.getLogger(__name__)


class Extractor:
    """
    Base class for all document extractors.

    Each extractor should implement:
    - detect(): Check if this extractor can handle the given file
    - extract(): Extract content from the file
    - getSupportedExtensions(): Return supported file extensions
    - getSupportedMimeTypes(): Return supported MIME types
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Check if this extractor can handle the given file."""
        return False

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
        """Extract content from the file bytes."""
        raise NotImplementedError

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions (including dots)."""
        return []

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return []


class Chunker:
@ -20,50 +43,85 @@ class ExtractorRegistry:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._map: Dict[str, Extractor] = {}
|
self._map: Dict[str, Extractor] = {}
|
||||||
self._fallback: Optional[Extractor] = None
|
self._fallback: Optional[Extractor] = None
|
||||||
# Register built-ins
|
self._auto_discover_extractors()
|
||||||
|
|
||||||
|
def _auto_discover_extractors(self):
|
||||||
|
"""Auto-discover and register all extractors from the extractors directory."""
|
||||||
try:
|
try:
|
||||||
from .formats.text_extractor import TextExtractor
|
import os
|
||||||
from .formats.csv_extractor import CsvExtractor
|
import importlib
|
||||||
from .formats.json_extractor import JsonExtractor
|
from pathlib import Path
|
||||||
from .formats.xml_extractor import XmlExtractor
|
|
||||||
from .formats.html_extractor import HtmlExtractor
|
# Get the extractors directory
|
||||||
from .formats.pdf_extractor import PdfExtractor
|
current_dir = Path(__file__).parent
|
||||||
from .formats.docx_extractor import DocxExtractor
|
extractors_dir = current_dir / "extractors"
|
||||||
from .formats.xlsx_extractor import XlsxExtractor
|
|
||||||
from .formats.image_extractor import ImageExtractor
|
if not extractors_dir.exists():
|
||||||
from .formats.binary_extractor import BinaryExtractor
|
logger.error(f"Extractors directory not found: {extractors_dir}")
|
||||||
self.register("text/plain", TextExtractor())
|
return
|
||||||
self.register("text/markdown", TextExtractor())
|
|
||||||
self.register("text/csv", CsvExtractor())
|
# Import all extractor modules
|
||||||
self.register("application/json", JsonExtractor())
|
extractor_modules = []
|
||||||
self.register("application/xml", XmlExtractor())
|
for file_path in extractors_dir.glob("extractor*.py"):
|
||||||
self.register("text/html", HtmlExtractor())
|
if file_path.name == "__init__.py":
|
||||||
self.register("application/pdf", PdfExtractor())
|
continue
|
||||||
self.register("application/vnd.openxmlformats-officedocument.wordprocessingml.document", DocxExtractor())
|
|
||||||
self.register("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", XlsxExtractor())
|
module_name = file_path.stem
|
||||||
# images
|
try:
|
||||||
self.register("image/jpeg", ImageExtractor())
|
# Import the module
|
||||||
self.register("image/png", ImageExtractor())
|
module = importlib.import_module(f".{module_name}", package="modules.services.serviceExtraction.extractors")
|
||||||
self.register("image/gif", ImageExtractor())
|
|
||||||
# extension fallbacks
|
# Find all extractor classes in the module
|
||||||
self.register("txt", TextExtractor())
|
for attr_name in dir(module):
|
||||||
self.register("md", TextExtractor())
|
attr = getattr(module, attr_name)
|
||||||
self.register("csv", CsvExtractor())
|
if (isinstance(attr, type) and
|
||||||
self.register("json", JsonExtractor())
|
issubclass(attr, Extractor) and
|
||||||
self.register("xml", XmlExtractor())
|
attr != Extractor and
|
||||||
self.register("html", HtmlExtractor())
|
not attr_name.startswith('_')):
|
||||||
self.register("htm", HtmlExtractor())
|
|
||||||
self.register("pdf", PdfExtractor())
|
# Create instance and auto-register
|
||||||
self.register("docx", DocxExtractor())
|
extractor_instance = attr()
|
||||||
self.register("xlsx", XlsxExtractor())
|
self._auto_register_extractor(extractor_instance)
|
||||||
self.register("xlsm", XlsxExtractor())
|
extractor_modules.append(attr_name)
|
||||||
# fallback
|
|
||||||
self.setFallback(BinaryExtractor())
|
except Exception as e:
|
||||||
print(f"✅ ExtractorRegistry: Successfully registered {len(self._map)} extractors")
|
logger.warning(f"Failed to import {module_name}: {str(e)}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Set fallback extractor
|
||||||
|
try:
|
||||||
|
from .extractors.extractorBinary import BinaryExtractor
|
||||||
|
self.setFallback(BinaryExtractor())
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to set fallback extractor: {str(e)}")
|
||||||
|
|
||||||
|
logger.info(f"ExtractorRegistry: Auto-discovered and registered {len(extractor_modules)} extractor classes: {', '.join(extractor_modules)}")
|
||||||
|
logger.info(f"ExtractorRegistry: Total registered formats: {len(self._map)}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"❌ ExtractorRegistry: Failed to register extractors: {str(e)}")
|
logger.error(f"ExtractorRegistry: Failed to auto-discover extractors: {str(e)}")
|
||||||
import traceback
|
import traceback
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
+    def _auto_register_extractor(self, extractor: Extractor):
+        """Auto-register an extractor based on its declared supported formats."""
+        try:
+            # Register MIME types
+            mime_types = extractor.getSupportedMimeTypes()
+            for mime_type in mime_types:
+                self.register(mime_type, extractor)
+                logger.debug(f"Registered MIME type: {mime_type} → {extractor.__class__.__name__}")
+
+            # Register file extensions
+            extensions = extractor.getSupportedExtensions()
+            for ext in extensions:
+                # Remove leading dot for registry key
+                ext_key = ext.lstrip('.')
+                self.register(ext_key, extractor)
+                logger.debug(f"Registered extension: .{ext_key} → {extractor.__class__.__name__}")
+
+        except Exception as e:
+            logger.error(f"Failed to auto-register {extractor.__class__.__name__}: {str(e)}")
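For orientation, a minimal sketch of an extractor module that this auto-discovery and auto-registration would pick up. Only getSupportedMimeTypes()/getSupportedExtensions() are taken from the registry code above; the module path, class name, and formats are illustrative assumptions.

# modules/services/serviceExtraction/extractors/extractorExample.py — illustrative only
from .extractorBase import Extractor  # assumed location of the Extractor base class

class ExampleExtractor(Extractor):
    def getSupportedMimeTypes(self) -> list[str]:
        # MIME keys registered directly into the registry map
        return ["text/x-example"]

    def getSupportedExtensions(self) -> list[str]:
        # leading dots are stripped before registration
        return [".example"]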
    def register(self, key: str, extractor: Extractor):
        self._map[key] = extractor

@ -80,6 +138,43 @@ class ExtractorRegistry:
        if ext in self._map:
            return self._map[ext]
        return self._fallback
+    def getAllSupportedFormats(self) -> Dict[str, Dict[str, list[str]]]:
+        """
+        Get all supported formats from all registered extractors.
+
+        Returns:
+            Dictionary with format information:
+            {
+                "extensions": {
+                    "extractor_name": [".ext1", ".ext2", ...]
+                },
+                "mime_types": {
+                    "extractor_name": ["mime/type1", "mime/type2", ...]
+                }
+            }
+        """
+        formats = {"extensions": {}, "mime_types": {}}
+
+        # Get formats from registered extractors
+        for key, extractor in self._map.items():
+            if hasattr(extractor, 'getSupportedExtensions'):
+                extensions = extractor.getSupportedExtensions()
+                if extensions:
+                    formats["extensions"][key] = extensions
+
+            if hasattr(extractor, 'getSupportedMimeTypes'):
+                mime_types = extractor.getSupportedMimeTypes()
+                if mime_types:
+                    formats["mime_types"][key] = mime_types
+
+        # Add fallback extractor info
+        if self._fallback and hasattr(self._fallback, 'getSupportedExtensions'):
+            formats["extensions"]["fallback"] = self._fallback.getSupportedExtensions()
+        if self._fallback and hasattr(self._fallback, 'getSupportedMimeTypes'):
+            formats["mime_types"]["fallback"] = self._fallback.getSupportedMimeTypes()
+
+        return formats
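A short usage sketch of the structure getAllSupportedFormats() returns, as described in the docstring above; the registry variable name is illustrative.

formats = registry.getAllSupportedFormats()
# e.g. {"extensions": {"pdf": [".pdf"], "fallback": [...]},
#       "mime_types": {"application/pdf": ["application/pdf"], "fallback": [...]}}
for key, extensions in formats["extensions"].items():
    print(f"{key}: {extensions}")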
class ChunkerRegistry:

@ -88,17 +183,19 @@ class ChunkerRegistry:
        self._noop = Chunker()
        # Register default chunkers
        try:
-            from .chunking.text_chunker import TextChunker
-            from .chunking.table_chunker import TableChunker
-            from .chunking.structure_chunker import StructureChunker
-            # Skip ImageChunker for now to avoid PIL import hang
-            # from .chunking.image_chunker import ImageChunker
+            from .chunking.chunkerText import TextChunker
+            from .chunking.chunkerTable import TableChunker
+            from .chunking.chunkerStructure import StructureChunker
+            from .chunking.chunkerImage import ImageChunker
            self.register("text", TextChunker())
            self.register("table", TableChunker())
            self.register("structure", StructureChunker())
-            # self.register("image", ImageChunker())
+            self.register("image", ImageChunker())
+            # Use text chunker for container and binary content
+            self.register("container", TextChunker())
+            self.register("binary", TextChunker())
        except Exception as e:
-            print(f"❌ ChunkerRegistry: Failed to register chunkers: {str(e)}")
+            logger.error(f"ChunkerRegistry: Failed to register chunkers: {str(e)}")
            import traceback
            traceback.print_exc()
@ -1,6 +1,7 @@
import logging
import uuid
-from typing import Any, Dict, List, Optional
+import json
+from typing import Any, Dict, List, Optional, Union, Tuple
from datetime import datetime, UTC
import re
from modules.shared.timezoneUtils import get_utc_timestamp

@ -18,7 +19,7 @@ logger = logging.getLogger(__name__)
class GenerationService:
    def __init__(self, serviceCenter=None):
        # Directly use interfaces from the provided service center (no self.service calls)
-        self.serviceCenter = serviceCenter
+        self.services = serviceCenter
        self.interfaceDbComponent = getattr(serviceCenter, 'interfaceDbComponent', None) if serviceCenter else None
        self.interfaceDbChat = getattr(serviceCenter, 'interfaceDbChat', None) if serviceCenter else None
        self.workflow = getattr(serviceCenter, 'workflow', None) if serviceCenter else None
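A minimal construction sketch, assuming a service-center object that exposes the attributes read in __init__ above; the stub class is illustrative and not part of the codebase.

class _StubServiceCenter:
    interfaceDbComponent = None
    interfaceDbChat = None
    workflow = None
    # a real service center also exposes a utils helper, since
    # self.services.utils.configGet(...) is called further down

service = GenerationService(serviceCenter=_StubServiceCenter())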
@ -296,101 +297,237 @@ class GenerationService:
                'workflowId': 'unknown'
            }

-    async def renderReport(self, extracted_content: str, output_format: str, title: str) -> tuple[str, str]:
+    async def renderReport(self, extractedContent: Dict[str, Any], outputFormat: str, title: str, userPrompt: str = None, aiService=None) -> tuple[str, str]:
        """
-        Render extracted content to the specified output format.
+        Render extracted JSON content to the specified output format.

        Args:
-            extracted_content: Content extracted by AI using format-specific prompt
-            output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
+            extractedContent: Structured JSON document from AI extraction
+            outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
            title: Report title
+            userPrompt: User's original prompt for report generation
+            aiService: AI service instance for generation prompt creation

        Returns:
            tuple: (rendered_content, mime_type)
        """
        try:
-            # DEBUG: dump renderer input to diagnose JSON+HTML mixtures TODO REMOVE
+            # Validate JSON input
+            if not isinstance(extractedContent, dict):
+                raise ValueError("extractedContent must be a JSON dictionary")
+
+            if "sections" not in extractedContent:
+                raise ValueError("extractedContent must contain 'sections' field")
+
+            # DEBUG: Log renderer input metadata only (no verbose JSON) - only if debug enabled
            try:
-                import os
-                ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
-                debug_root = "./test-chat/ai"
-                debug_dir = os.path.join(debug_root, f"render_input_{ts}")
-                os.makedirs(debug_dir, exist_ok=True)
-                with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f:
-                    f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n")
-                with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f:
-                    f.write(extracted_content or "")
+                debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
+                if debug_enabled:
+                    import os
+                    ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
+                    debug_root = "./test-chat/ai"
+                    debug_dir = os.path.join(debug_root, f"render_input_{ts}")
+                    os.makedirs(debug_dir, exist_ok=True)
+                    with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f:
+                        f.write(f"title: {title}\nformat: {outputFormat}\ncontent_type: {type(extractedContent).__name__}\n")
+                        f.write(f"content_size: {len(str(extractedContent))} characters\n")
+                        f.write(f"sections_count: {len(extractedContent.get('sections', []))}\n")
            except Exception:
                pass

            # Get the appropriate renderer for the format
-            renderer = self._getFormatRenderer(output_format)
+            renderer = self._getFormatRenderer(outputFormat)
            if not renderer:
-                raise ValueError(f"Unsupported output format: {output_format}")
+                raise ValueError(f"Unsupported output format: {outputFormat}")

-            # Render the content
-            rendered_content, mime_type = await renderer.render(extracted_content, title)
+            # Render the JSON content directly (AI generation handled by main service)
+            renderedContent, mimeType = await renderer.render(extractedContent, title, userPrompt, aiService)
            # DEBUG: dump rendered output
            try:
                import os
                with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f:
-                    f.write(rendered_content or "")
+                    f.write(renderedContent or "")
            except Exception:
                pass

-            logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters")
-            return rendered_content, mime_type
+            logger.info(f"Successfully rendered JSON report to {outputFormat} format: {len(renderedContent)} characters")
+            return renderedContent, mimeType

        except Exception as e:
-            logger.error(f"Error rendering report to {output_format}: {str(e)}")
+            logger.error(f"Error rendering JSON report to {outputFormat}: {str(e)}")
            raise
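A minimal sketch of the input renderReport() now expects — a dict that passes the 'sections' validation above. The section shape mirrors the fallback structure used by the JSON renderer later in this diff; the values are illustrative.

extracted = {
    "title": "Quarterly Summary",  # optional, read by renderers that use it
    "sections": [
        {"type": "text", "content": "Revenue grew in Q2."}
    ]
}
# rendered, mime = await service.renderReport(extracted, "html", "Quarterly Summary")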
-    def getExtractionPrompt(self, output_format: str, user_prompt: str, title: str) -> str:
+    async def getAdaptiveExtractionPrompt(
+        self,
+        outputFormat: str,
+        userPrompt: str,
+        title: str,
+        promptAnalysis: Dict[str, Any],
+        aiService=None
+    ) -> str:
+        """Get adaptive extraction prompt based on AI analysis."""
+        from .subPromptBuilder import buildAdaptiveExtractionPrompt
+        return await buildAdaptiveExtractionPrompt(
+            outputFormat=outputFormat,
+            userPrompt=userPrompt,
+            title=title,
+            promptAnalysis=promptAnalysis,
+            aiService=aiService,
+            services=self.services
+        )
+
+    async def getGenerationPrompt(
+        self,
+        outputFormat: str,
+        userPrompt: str,
+        title: str,
+        aiService=None
+    ) -> str:
+        """Get generation prompt for enhancing extracted JSON content."""
+        from .subPromptBuilder import buildGenerationPrompt
+        return await buildGenerationPrompt(
+            outputFormat=outputFormat,
+            userPrompt=userPrompt,
+            title=title,
+            aiService=aiService,
+            services=self.services
+        )
+
+    async def getGenericExtractionPrompt(
+        self,
+        outputFormat: str,
+        userPrompt: str,
+        title: str,
+        aiService=None
+    ) -> str:
+        """Get generic extraction prompt that works for both single and multi-file."""
+        from .subPromptBuilder import buildGenericExtractionPrompt
+        return await buildGenericExtractionPrompt(
+            outputFormat=outputFormat,
+            userPrompt=userPrompt,
+            title=title,
+            aiService=aiService,
+            services=self.services
+        )
+    async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
        """
        Get the format-specific extraction prompt for AI content extraction.

        Args:
-            output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
-            user_prompt: User's original prompt for report generation
+            outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
+            userPrompt: User's original prompt for report generation
            title: Report title
+            aiService: AI service instance for intent extraction

        Returns:
            str: Format-specific prompt for AI extraction
        """
        try:
            # Get the appropriate renderer for the format
-            renderer = self._getFormatRenderer(output_format)
+            renderer = self._getFormatRenderer(outputFormat)
            if not renderer:
-                raise ValueError(f"Unsupported output format: {output_format}")
+                raise ValueError(f"Unsupported output format: {outputFormat}")

            # Build centralized prompt with generic rules + format-specific guidelines
-            from .prompt_builder import buildExtractionPrompt
-            extraction_prompt = buildExtractionPrompt(
-                output_format=output_format,
+            from .subPromptBuilder import buildExtractionPrompt
+            extractionPrompt = await buildExtractionPrompt(
+                outputFormat=outputFormat,
                renderer=renderer,
-                user_prompt=user_prompt,
-                title=title
+                userPrompt=userPrompt,
+                title=title,
+                aiService=aiService,
+                services=self.services
            )

-            logger.info(f"Generated {output_format}-specific extraction prompt: {len(extraction_prompt)} characters")
-            return extraction_prompt
+            logger.info(f"Generated {outputFormat}-specific extraction prompt: {len(extractionPrompt)} characters")
+            return extractionPrompt

        except Exception as e:
-            logger.error(f"Error getting extraction prompt for {output_format}: {str(e)}")
+            logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}")
            raise
+    async def renderAdaptiveReport(
+        self,
+        extractedContent: Dict[str, Any],
+        outputFormat: str,
+        title: str,
+        userPrompt: str = None,
+        aiService=None,
+        isMultiFile: bool = False
+    ) -> Union[Tuple[str, str], List[Dict[str, Any]]]:
+        """Render report adaptively based on content structure."""
+
+        if isMultiFile and "documents" in extractedContent:
+            return await self._renderMultiFileReport(
+                extractedContent, outputFormat, title, userPrompt, aiService
+            )
+        else:
+            return await self._renderSingleFileReport(
+                extractedContent, outputFormat, title, userPrompt, aiService
+            )
+
+    async def _renderMultiFileReport(
+        self,
+        extractedContent: Dict[str, Any],
+        outputFormat: str,
+        title: str,
+        userPrompt: str = None,
+        aiService=None
+    ) -> List[Dict[str, Any]]:
+        """Render multiple documents from extracted content."""
+
+        generated_documents = []
+
+        for doc_data in extractedContent.get("documents", []):
+            # Use existing single-file renderer for each document
+            renderer = self._getFormatRenderer(outputFormat)
+            if not renderer:
+                continue
+
+            # Render individual document
+            rendered_content, mime_type = await renderer.render(
+                extractedContent={"sections": doc_data["sections"]},
+                title=doc_data["title"],
+                userPrompt=userPrompt,
+                aiService=aiService
+            )
+
+            generated_documents.append({
+                "filename": doc_data["filename"],
+                "content": rendered_content,
+                "mime_type": mime_type,
+                "title": doc_data["title"]
+            })
+
+        return generated_documents
+
+    async def _renderSingleFileReport(
+        self,
+        extractedContent: Dict[str, Any],
+        outputFormat: str,
+        title: str,
+        userPrompt: str = None,
+        aiService=None
+    ) -> Tuple[str, str]:
+        """Render single file report (existing functionality)."""
+        # Use existing renderReport method
+        return await self.renderReport(
+            extractedContent, outputFormat, title, userPrompt, aiService
+        )
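A sketch of the multi-file payload _renderMultiFileReport() iterates over; the keys documents/filename/title/sections come from the loop above, while the concrete values are illustrative.

multi = {
    "documents": [
        {"filename": "summary.html", "title": "Summary",
         "sections": [{"type": "text", "content": "High-level findings."}]},
        {"filename": "details.html", "title": "Details",
         "sections": [{"type": "text", "content": "Per-source breakdown."}]},
    ]
}
# docs = await service.renderAdaptiveReport(multi, "html", "Report", isMultiFile=True)
# each entry in docs carries filename, content, mime_type, and title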
    def _getFormatRenderer(self, output_format: str):
        """Get the appropriate renderer for the specified format using auto-discovery."""
        try:
            from .renderers.registry import get_renderer
-            renderer = get_renderer(output_format)
+            renderer = get_renderer(output_format, services=self.services)

            if renderer:
                return renderer

            # Fallback to text renderer if no specific renderer found
            logger.warning(f"No renderer found for format {output_format}, falling back to text")
-            fallback_renderer = get_renderer('text')
+            fallback_renderer = get_renderer('text', services=self.services)
            if fallback_renderer:
                return fallback_renderer
@ -1,72 +0,0 @@
"""
Centralized prompt builder for document generation across formats.

Builds a robust prompt that:
- Accepts any user intent (no fixed structure assumptions)
- Injects format-specific guidelines from the selected renderer
- Adds a common policy section to always use real data from source docs
- Requires the AI to output a filename header that we can parse and use
"""

from typing import Protocol


class _RendererLike(Protocol):
    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:  # returns only format-specific guidelines
        ...


def buildExtractionPrompt(
    output_format: str,
    renderer: _RendererLike,
    user_prompt: str,
    title: str
) -> str:
    """
    Build the final extraction prompt by combining:
    - The raw user prompt (verbatim)
    - Generic cross-format instructions (filename header + real-data policy)
    - Format-specific guidelines snippet provided by the renderer

    The AI must place a single filename header at the very top:
        FILENAME: <safe-file-name-with-extension>
    followed by a blank line and then ONLY the document content according to the target format.
    """

    format_guidelines = renderer.getExtractionPrompt(user_prompt, title)

    # Generic block appears once for every format
    generic_intro = f"""
{user_prompt}

You are generating a document in {output_format.upper()} format for the title: "{title}".

Rules:
- The user's intent fully defines the structure. Do not assume a fixed template or headings.
- Use only factual information extracted from the supplied source documents.
- Do not invent, hallucinate, or include placeholders (e.g., "lorem ipsum", "TBD").
- The output must strictly follow the target format and be ready for saving without extra wrapping.
- At the VERY TOP output exactly one line with the filename header:
  FILENAME: <safe-file-name-with-extension>
- The base name should be short, descriptive, and kebab-case or snake-case without spaces.
- Include the correct extension for the requested format (e.g., .html, .pdf, .docx, .md, .txt, .json, .csv, .xlsx).
- Avoid special characters beyond [a-zA-Z0-9-_].
- After this header, insert a single blank line and then provide ONLY the document content.

Common policy:
- Use the actual data from the source documents to create the content.
- Do not generate placeholder text or templates.
- Extract and use the real data provided in the source documents to create meaningful content.
""".strip()

    # Final assembly
    final_prompt = (
        generic_intro
        + "\n\nFORMAT-SPECIFIC GUIDELINES:\n"
        + format_guidelines.strip()
        + "\n\nGenerate the complete document content now based on the source documents below:"
    )

    return final_prompt
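A minimal call sketch for the removed builder above; the renderer class and prompt text are illustrative, and the expected model reply is described in the comments.

prompt = buildExtractionPrompt(
    output_format="md",
    renderer=MarkdownRenderer(),  # any renderer exposing getExtractionPrompt()
    user_prompt="Summarize Q2 sales by region",
    title="Q2 Sales Summary"
)
# The model's reply is then expected to start with:
#   FILENAME: q2-sales-summary.md
# followed by a blank line and only the Markdown body.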
@ -1,86 +0,0 @@
"""
Base renderer class for all format renderers.
"""

from abc import ABC, abstractmethod
from typing import Dict, Any, Tuple, List
import logging

logger = logging.getLogger(__name__)

class BaseRenderer(ABC):
    """Base class for all format renderers."""

    def __init__(self):
        self.logger = logger

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """
        Return list of supported format names for this renderer.
        Override this method in subclasses to specify supported formats.
        """
        return []

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """
        Return list of format aliases for this renderer.
        Override this method in subclasses to specify format aliases.
        """
        return []

    @classmethod
    def get_priority(cls) -> int:
        """
        Return priority for this renderer (higher number = higher priority).
        Used when multiple renderers support the same format.
        """
        return 0

    @abstractmethod
    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """
        Get the format-specific extraction prompt for AI content extraction.

        Args:
            user_prompt: User's original prompt for report generation
            title: Report title

        Returns:
            str: Format-specific prompt for AI extraction
        """
        pass

    @abstractmethod
    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """
        Render extracted content to the target format.

        Args:
            extracted_content: Raw content extracted by AI using format-specific prompt
            title: Report title

        Returns:
            tuple: (rendered_content, mime_type)
        """
        pass

    def _extract_sections(self, report_data: Dict[str, Any]) -> list:
        """Extract sections from report data."""
        return report_data.get('sections', [])

    def _extract_metadata(self, report_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract metadata from report data."""
        return report_data.get('metadata', {})

    def _get_title(self, report_data: Dict[str, Any], fallback_title: str) -> str:
        """Get title from report data or use fallback."""
        return report_data.get('title', fallback_title)

    def _format_timestamp(self, timestamp: str = None) -> str:
        """Format timestamp for display."""
        if timestamp:
            return timestamp
        from datetime import datetime, UTC
        return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
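A minimal sketch of a concrete subclass that satisfies the two abstract methods of the removed base class above; the format name, priority, and prompt text are illustrative.

class PlainTextRenderer(BaseRenderer):
    @classmethod
    def get_supported_formats(cls) -> List[str]:
        return ['txt']

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        # only format-specific guidelines; the global prompt is built centrally
        return "TXT FORMAT GUIDELINES:\n- Return plain text only, no markup."

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        # pass the AI output through unchanged, with its MIME type
        return extracted_content.strip(), "text/plain"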
@ -1,64 +0,0 @@
"""
CSV renderer for report generation.
"""

from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
import csv
import io

class CsvRenderer(BaseRenderer):
    """Renders content to CSV format with format-specific extraction."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported CSV formats."""
        return ['csv']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['spreadsheet', 'table']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for CSV renderer."""
        return 70

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only CSV-specific guidelines; global prompt is built centrally."""
        return (
            "CSV FORMAT GUIDELINES:\n"
            "- Emit ONLY CSV text without fences or commentary.\n"
            "- Include a single header row with clear column names.\n"
            "- Quote fields containing commas, quotes, or newlines; escape quotes by doubling them.\n"
            "- Use rows to represent items/records derived from sources.\n"
            "- Keep cells concise; include units in headers when useful.\n"
            "OUTPUT: Return ONLY valid CSV content that can be imported."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to CSV format."""
        try:
            # The extracted content should already be CSV from the AI
            # Just clean it up
            csv_content = self._clean_csv_content(extracted_content, title)

            return csv_content, "text/csv"

        except Exception as e:
            self.logger.error(f"Error rendering CSV: {str(e)}")
            # Return minimal CSV fallback
            return f"Title,Content\n{title},Error rendering report: {str(e)}", "text/csv"

    def _clean_csv_content(self, content: str, title: str) -> str:
        """Clean and validate CSV content from AI."""
        content = content.strip()

        # Remove markdown code blocks if present
        if content.startswith("```") and content.endswith("```"):
            lines = content.split('\n')
            if len(lines) > 2:
                content = '\n'.join(lines[1:-1]).strip()

        return content
@ -1,249 +0,0 @@
|
||||||
"""
|
|
||||||
DOCX renderer for report generation using python-docx.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from .base_renderer import BaseRenderer
|
|
||||||
from typing import Dict, Any, Tuple, List
|
|
||||||
import io
|
|
||||||
import base64
|
|
||||||
from datetime import datetime, UTC
|
|
||||||
|
|
||||||
try:
|
|
||||||
from docx import Document
|
|
||||||
from docx.shared import Inches, Pt
|
|
||||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
||||||
from docx.enum.table import WD_TABLE_ALIGNMENT
|
|
||||||
from docx.oxml.shared import OxmlElement, qn
|
|
||||||
from docx.oxml.ns import nsdecls
|
|
||||||
from docx.oxml import parse_xml
|
|
||||||
DOCX_AVAILABLE = True
|
|
||||||
except ImportError:
|
|
||||||
DOCX_AVAILABLE = False
|
|
||||||
|
|
||||||
class DocxRenderer(BaseRenderer):
|
|
||||||
"""Renders content to DOCX format using python-docx."""
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_supported_formats(cls) -> List[str]:
|
|
||||||
"""Return supported DOCX formats."""
|
|
||||||
return ['docx', 'doc']
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_format_aliases(cls) -> List[str]:
|
|
||||||
"""Return format aliases."""
|
|
||||||
return ['word', 'document']
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_priority(cls) -> int:
|
|
||||||
"""Return priority for DOCX renderer."""
|
|
||||||
return 115
|
|
||||||
|
|
||||||
def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
|
|
||||||
"""Return only DOCX-specific guidelines; global prompt is built centrally."""
|
|
||||||
return (
|
|
||||||
"DOCX FORMAT GUIDELINES:\n"
|
|
||||||
"- Provide plain text content suitable for Word generation (no markdown/HTML).\n"
|
|
||||||
"- Use clear section hierarchy; bullet and numbered lists where needed.\n"
|
|
||||||
"- Include tables as simple pipe-delimited lines if tabular data is needed.\n"
|
|
||||||
"OUTPUT: Return ONLY the structured plain text to be converted into DOCX."
|
|
||||||
)
|
|
||||||
|
|
||||||
async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
|
|
||||||
"""Render extracted content to DOCX format."""
|
|
||||||
try:
|
|
||||||
if not DOCX_AVAILABLE:
|
|
||||||
# Fallback to HTML if python-docx not available
|
|
||||||
from .html_renderer import HtmlRenderer
|
|
||||||
html_renderer = HtmlRenderer()
|
|
||||||
html_content, _ = await html_renderer.render(extracted_content, title)
|
|
||||||
return html_content, "text/html"
|
|
||||||
|
|
||||||
# Generate DOCX using python-docx
|
|
||||||
docx_content = self._generate_docx(extracted_content, title)
|
|
||||||
|
|
||||||
return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error rendering DOCX: {str(e)}")
|
|
||||||
# Return minimal fallback
|
|
||||||
return f"DOCX Generation Error: {str(e)}", "text/plain"
|
|
||||||
|
|
||||||
def _generate_docx(self, content: str, title: str) -> str:
|
|
||||||
"""Generate DOCX content using python-docx."""
|
|
||||||
try:
|
|
||||||
# Create new document
|
|
||||||
doc = Document()
|
|
||||||
|
|
||||||
# Set up document styles
|
|
||||||
self._setup_document_styles(doc)
|
|
||||||
|
|
||||||
# Add title
|
|
||||||
title_para = doc.add_heading(title, 0)
|
|
||||||
title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
|
||||||
|
|
||||||
# Add generation date
|
|
||||||
date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}")
|
|
||||||
date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
|
||||||
|
|
||||||
# Add page break
|
|
||||||
doc.add_page_break()
|
|
||||||
|
|
||||||
# Process content
|
|
||||||
lines = content.split('\n')
|
|
||||||
current_section = []
|
|
||||||
|
|
||||||
for line in lines:
|
|
||||||
line = line.strip()
|
|
||||||
if not line:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check for ALL CAPS headings (major headings)
|
|
||||||
if line.isupper() and len(line) > 3 and not line.startswith('-') and not line.startswith('*'):
|
|
||||||
if current_section:
|
|
||||||
self._process_section(doc, current_section)
|
|
||||||
current_section = []
|
|
||||||
doc.add_heading(line, level=1)
|
|
||||||
# Check for Title Case headings (subheadings)
|
|
||||||
elif line.istitle() and len(line) > 5 and not line.startswith('-') and not line.startswith('*') and not line.startswith(('1.', '2.', '3.', '4.', '5.')):
|
|
||||||
if current_section:
|
|
||||||
self._process_section(doc, current_section)
|
|
||||||
current_section = []
|
|
||||||
doc.add_heading(line, level=2)
|
|
||||||
# Check for markdown headings (fallback)
|
|
||||||
elif line.startswith('# '):
|
|
||||||
# H1 heading
|
|
||||||
if current_section:
|
|
||||||
self._process_section(doc, current_section)
|
|
||||||
current_section = []
|
|
||||||
doc.add_heading(line[2:], level=1)
|
|
||||||
elif line.startswith('## '):
|
|
||||||
# H2 heading
|
|
||||||
if current_section:
|
|
||||||
self._process_section(doc, current_section)
|
|
||||||
current_section = []
|
|
||||||
doc.add_heading(line[3:], level=2)
|
|
||||||
elif line.startswith('### '):
|
|
||||||
# H3 heading
|
|
||||||
if current_section:
|
|
||||||
self._process_section(doc, current_section)
|
|
||||||
current_section = []
|
|
||||||
doc.add_heading(line[4:], level=3)
|
|
||||||
else:
|
|
||||||
current_section.append(line)
|
|
||||||
|
|
||||||
# Process remaining content
|
|
||||||
if current_section:
|
|
||||||
self._process_section(doc, current_section)
|
|
||||||
|
|
||||||
# Save to buffer
|
|
||||||
buffer = io.BytesIO()
|
|
||||||
doc.save(buffer)
|
|
||||||
buffer.seek(0)
|
|
||||||
|
|
||||||
# Convert to base64
|
|
||||||
docx_bytes = buffer.getvalue()
|
|
||||||
docx_base64 = base64.b64encode(docx_bytes).decode('utf-8')
|
|
||||||
|
|
||||||
return docx_base64
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error generating DOCX: {str(e)}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def _setup_document_styles(self, doc):
|
|
||||||
"""Set up document styles."""
|
|
||||||
try:
|
|
||||||
# Set default font
|
|
||||||
style = doc.styles['Normal']
|
|
||||||
font = style.font
|
|
||||||
font.name = 'Calibri'
|
|
||||||
font.size = Pt(11)
|
|
||||||
|
|
||||||
# Set heading styles
|
|
||||||
for i in range(1, 4):
|
|
||||||
heading_style = doc.styles[f'Heading {i}']
|
|
||||||
heading_font = heading_style.font
|
|
||||||
heading_font.name = 'Calibri'
|
|
||||||
heading_font.size = Pt(16 - i * 2)
|
|
||||||
heading_font.bold = True
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.warning(f"Could not set up document styles: {str(e)}")
|
|
||||||
|
|
||||||
def _process_section(self, doc, lines: list):
|
|
||||||
"""Process a section of content into DOCX elements."""
|
|
||||||
for line in lines:
|
|
||||||
if not line.strip():
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check for tables (lines with |)
|
|
||||||
if '|' in line and not line.startswith('|'):
|
|
||||||
# This might be part of a table, process as table
|
|
||||||
table_data = self._extract_table_data(lines)
|
|
||||||
if table_data:
|
|
||||||
self._add_table(doc, table_data)
|
|
||||||
return
|
|
||||||
|
|
||||||
# Check for lists
|
|
||||||
if line.startswith('- ') or line.startswith('* '):
|
|
||||||
# This is a list item
|
|
||||||
doc.add_paragraph(line[2:], style='List Bullet')
|
|
||||||
elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')):
|
|
||||||
# This is a numbered list item
|
|
||||||
doc.add_paragraph(line[3:], style='List Number')
|
|
||||||
else:
|
|
||||||
# Regular paragraph
|
|
||||||
doc.add_paragraph(line)
|
|
||||||
|
|
||||||
def _extract_table_data(self, lines: list) -> list:
|
|
||||||
"""Extract table data from lines."""
|
|
||||||
table_data = []
|
|
||||||
in_table = False
|
|
||||||
|
|
||||||
for line in lines:
|
|
||||||
if '|' in line:
|
|
||||||
if not in_table:
|
|
||||||
in_table = True
|
|
||||||
# Split by | and clean up
|
|
||||||
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
|
|
||||||
if cells:
|
|
||||||
table_data.append(cells)
|
|
||||||
elif in_table and not line.strip():
|
|
||||||
# Empty line, might be end of table
|
|
||||||
break
|
|
||||||
|
|
||||||
return table_data if len(table_data) > 1 else []
|
|
||||||
|
|
||||||
def _add_table(self, doc, table_data: list):
|
|
||||||
"""Add a table to the document."""
|
|
||||||
try:
|
|
||||||
if not table_data:
|
|
||||||
return
|
|
||||||
|
|
||||||
# Create table
|
|
||||||
table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
|
|
||||||
table.alignment = WD_TABLE_ALIGNMENT.CENTER
|
|
||||||
|
|
||||||
# Add data to table
|
|
||||||
for row_idx, row_data in enumerate(table_data):
|
|
||||||
for col_idx, cell_data in enumerate(row_data):
|
|
||||||
if col_idx < len(table.rows[row_idx].cells):
|
|
||||||
table.rows[row_idx].cells[col_idx].text = cell_data
|
|
||||||
|
|
||||||
# Style the table
|
|
||||||
self._style_table(table)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.warning(f"Could not add table: {str(e)}")
|
|
||||||
|
|
||||||
def _style_table(self, table):
|
|
||||||
"""Apply styling to the table."""
|
|
||||||
try:
|
|
||||||
# Style header row
|
|
||||||
if len(table.rows) > 0:
|
|
||||||
header_cells = table.rows[0].cells
|
|
||||||
for cell in header_cells:
|
|
||||||
for paragraph in cell.paragraphs:
|
|
||||||
for run in paragraph.runs:
|
|
||||||
run.bold = True
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.warning(f"Could not style table: {str(e)}")
|
|
||||||
|
|
@ -1,210 +0,0 @@
|
||||||
"""
|
|
||||||
Excel renderer for report generation using openpyxl.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from .base_renderer import BaseRenderer
|
|
||||||
from typing import Dict, Any, Tuple, List
|
|
||||||
import io
|
|
||||||
import base64
|
|
||||||
from datetime import datetime, UTC
|
|
||||||
|
|
||||||
try:
|
|
||||||
from openpyxl import Workbook
|
|
||||||
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
|
||||||
from openpyxl.utils import get_column_letter
|
|
||||||
from openpyxl.worksheet.table import Table, TableStyleInfo
|
|
||||||
OPENPYXL_AVAILABLE = True
|
|
||||||
except ImportError:
|
|
||||||
OPENPYXL_AVAILABLE = False
|
|
||||||
|
|
||||||
class ExcelRenderer(BaseRenderer):
|
|
||||||
"""Renders content to Excel format using openpyxl."""
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_supported_formats(cls) -> List[str]:
|
|
||||||
"""Return supported Excel formats."""
|
|
||||||
return ['xlsx', 'xls', 'excel']
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_format_aliases(cls) -> List[str]:
|
|
||||||
"""Return format aliases."""
|
|
||||||
return ['spreadsheet', 'workbook']
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_priority(cls) -> int:
|
|
||||||
"""Return priority for Excel renderer."""
|
|
||||||
return 110
|
|
||||||
|
|
||||||
def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
|
|
||||||
"""Return only Excel-specific guidelines; global prompt is built centrally."""
|
|
||||||
return (
|
|
||||||
"EXCEL FORMAT GUIDELINES:\n"
|
|
||||||
"- Output one or more pipe-delimited tables with a single header row.\n"
|
|
||||||
"- Let user intent define columns; use clear names and ISO dates.\n"
|
|
||||||
"- Separate multiple tables by a single blank line.\n"
|
|
||||||
"- No markdown/HTML/code fences; tables only unless user explicitly asks for notes.\n"
|
|
||||||
"OUTPUT: Return ONLY pipe-delimited tables suitable for import."
|
|
||||||
)
|
|
||||||
|
|
||||||
async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
|
|
||||||
"""Render extracted content to Excel format."""
|
|
||||||
try:
|
|
||||||
if not OPENPYXL_AVAILABLE:
|
|
||||||
# Fallback to CSV if openpyxl not available
|
|
||||||
from .csv_renderer import CsvRenderer
|
|
||||||
csv_renderer = CsvRenderer()
|
|
||||||
csv_content, _ = await csv_renderer.render(extracted_content, title)
|
|
||||||
return csv_content, "text/csv"
|
|
||||||
|
|
||||||
# Generate Excel using openpyxl
|
|
||||||
excel_content = self._generate_excel(extracted_content, title)
|
|
||||||
|
|
||||||
return excel_content, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error rendering Excel: {str(e)}")
|
|
||||||
# Return CSV fallback
|
|
||||||
return f"Title,Content\n{title},Error rendering Excel report: {str(e)}", "text/csv"
|
|
||||||
|
|
||||||
def _generate_excel(self, content: str, title: str) -> str:
|
|
||||||
"""Generate Excel content using openpyxl."""
|
|
||||||
try:
|
|
||||||
# Create workbook
|
|
||||||
wb = Workbook()
|
|
||||||
|
|
||||||
# Remove default sheet
|
|
||||||
wb.remove(wb.active)
|
|
||||||
|
|
||||||
# Create sheets
|
|
||||||
summary_sheet = wb.create_sheet("Summary", 0)
|
|
||||||
data_sheet = wb.create_sheet("Data", 1)
|
|
||||||
analysis_sheet = wb.create_sheet("Analysis", 2)
|
|
||||||
|
|
||||||
# Add content to sheets
|
|
||||||
self._populate_summary_sheet(summary_sheet, title)
|
|
||||||
self._populate_data_sheet(data_sheet, content)
|
|
||||||
self._populate_analysis_sheet(analysis_sheet, content)
|
|
||||||
|
|
||||||
# Save to buffer
|
|
||||||
buffer = io.BytesIO()
|
|
||||||
wb.save(buffer)
|
|
||||||
buffer.seek(0)
|
|
||||||
|
|
||||||
# Convert to base64
|
|
||||||
excel_bytes = buffer.getvalue()
|
|
||||||
excel_base64 = base64.b64encode(excel_bytes).decode('utf-8')
|
|
||||||
|
|
||||||
return excel_base64
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error generating Excel: {str(e)}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def _populate_summary_sheet(self, sheet, title: str):
|
|
||||||
"""Populate the summary sheet."""
|
|
||||||
try:
|
|
||||||
# Title
|
|
||||||
sheet['A1'] = title
|
|
||||||
sheet['A1'].font = Font(size=16, bold=True)
|
|
||||||
sheet['A1'].alignment = Alignment(horizontal='center')
|
|
||||||
|
|
||||||
# Generation info
|
|
||||||
sheet['A3'] = "Generated:"
|
|
||||||
sheet['B3'] = self._format_timestamp()
|
|
||||||
sheet['A4'] = "Status:"
|
|
||||||
sheet['B4'] = "Generated Successfully"
|
|
||||||
|
|
||||||
# Key metrics placeholder
|
|
||||||
sheet['A6'] = "Key Metrics:"
|
|
||||||
sheet['A6'].font = Font(bold=True)
|
|
||||||
sheet['A7'] = "Total Items:"
|
|
||||||
sheet['B7'] = "=COUNTA(Data!A:A)-1" # Count non-empty cells in Data sheet
|
|
||||||
|
|
||||||
# Auto-adjust column widths
|
|
||||||
sheet.column_dimensions['A'].width = 20
|
|
||||||
sheet.column_dimensions['B'].width = 30
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.warning(f"Could not populate summary sheet: {str(e)}")
|
|
||||||
|
|
||||||
def _populate_data_sheet(self, sheet, content: str):
|
|
||||||
"""Populate the data sheet."""
|
|
||||||
try:
|
|
||||||
# Headers
|
|
||||||
headers = ["Item/Category", "Value/Amount", "Percentage", "Source Document", "Notes/Comments"]
|
|
||||||
for col, header in enumerate(headers, 1):
|
|
||||||
cell = sheet.cell(row=1, column=col, value=header)
|
|
||||||
cell.font = Font(bold=True)
|
|
||||||
cell.fill = PatternFill(start_color="CCCCCC", end_color="CCCCCC", fill_type="solid")
|
|
||||||
|
|
||||||
# Process content
|
|
||||||
lines = content.split('\n')
|
|
||||||
row = 2
|
|
||||||
|
|
||||||
for line in lines:
|
|
||||||
line = line.strip()
|
|
||||||
if not line:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check for table data (lines with |)
|
|
||||||
if '|' in line:
|
|
||||||
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
|
|
||||||
for col, cell_data in enumerate(cells[:5], 1): # Limit to 5 columns
|
|
||||||
sheet.cell(row=row, column=col, value=cell_data)
|
|
||||||
row += 1
|
|
||||||
else:
|
|
||||||
# Regular content
|
|
||||||
sheet.cell(row=row, column=1, value=line)
|
|
||||||
row += 1
|
|
||||||
|
|
||||||
# Auto-adjust column widths
|
|
||||||
for col in range(1, 6):
|
|
||||||
sheet.column_dimensions[get_column_letter(col)].width = 20
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.warning(f"Could not populate data sheet: {str(e)}")
|
|
||||||
|
|
||||||
def _populate_analysis_sheet(self, sheet, content: str):
|
|
||||||
"""Populate the analysis sheet."""
|
|
||||||
try:
|
|
||||||
# Title
|
|
||||||
sheet['A1'] = "Analysis & Insights"
|
|
||||||
sheet['A1'].font = Font(size=14, bold=True)
|
|
||||||
|
|
||||||
# Content analysis
|
|
||||||
lines = content.split('\n')
|
|
||||||
row = 3
|
|
||||||
|
|
||||||
sheet['A3'] = "Content Analysis:"
|
|
||||||
sheet['A3'].font = Font(bold=True)
|
|
||||||
row += 1
|
|
||||||
|
|
||||||
# Count different types of content
|
|
||||||
table_lines = sum(1 for line in lines if '|' in line)
|
|
||||||
list_lines = sum(1 for line in lines if line.startswith(('- ', '* ')))
|
|
||||||
text_lines = len(lines) - table_lines - list_lines
|
|
||||||
|
|
||||||
sheet[f'A{row}'] = f"Total Lines: {len(lines)}"
|
|
||||||
row += 1
|
|
||||||
sheet[f'A{row}'] = f"Table Rows: {table_lines}"
|
|
||||||
row += 1
|
|
||||||
sheet[f'A{row}'] = f"List Items: {list_lines}"
|
|
||||||
row += 1
|
|
||||||
sheet[f'A{row}'] = f"Text Lines: {text_lines}"
|
|
||||||
row += 2
|
|
||||||
|
|
||||||
# Recommendations
|
|
||||||
sheet[f'A{row}'] = "Recommendations:"
|
|
||||||
sheet[f'A{row}'].font = Font(bold=True)
|
|
||||||
row += 1
|
|
||||||
sheet[f'A{row}'] = "1. Review data accuracy"
|
|
||||||
row += 1
|
|
||||||
sheet[f'A{row}'] = "2. Consider additional analysis"
|
|
||||||
row += 1
|
|
||||||
sheet[f'A{row}'] = "3. Update regularly"
|
|
||||||
|
|
||||||
# Auto-adjust column width
|
|
||||||
sheet.column_dimensions['A'].width = 30
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.warning(f"Could not populate analysis sheet: {str(e)}")
|
|
||||||
|
|
@ -1,69 +0,0 @@
"""
HTML renderer for report generation.
"""

from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List

class HtmlRenderer(BaseRenderer):
    """Renders content to HTML format with format-specific extraction."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported HTML formats."""
        return ['html', 'htm']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['web', 'webpage']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for HTML renderer."""
        return 100

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only HTML-specific guidelines; global prompt is built centrally."""
        return (
            "HTML FORMAT GUIDELINES:\n"
            "- Output a complete HTML5 document starting with <!DOCTYPE html>.\n"
            "- Include <html>, <head> with <meta charset=\"UTF-8\"> and <title>, and <body>.\n"
            "- Use semantic elements: <header>, <main>, <section>, <article>, <footer>.\n"
            "- Provide professional CSS in a <style> block; responsive, clean typography.\n"
            "- Use h1/h2/h3 for headings; tables and lists for structure.\n"
            "OUTPUT: Return ONLY valid HTML (no markdown, no code fences)."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to HTML format."""
        try:
            # The extracted content should already be HTML from the AI
            # Just clean it up and ensure it's valid
            html_content = self._clean_html_content(extracted_content, title)

            return html_content, "text/html"

        except Exception as e:
            self.logger.error(f"Error rendering HTML: {str(e)}")
            # Return minimal HTML fallback
            return f"<html><head><title>{title}</title></head><body><h1>{title}</h1><p>Error rendering report: {str(e)}</p></body></html>", "text/html"

    def _clean_html_content(self, content: str, title: str) -> str:
        """Clean and validate HTML content from AI."""
        content = content.strip()

        # Remove markdown code blocks if present
        if content.startswith("```") and content.endswith("```"):
            lines = content.split('\n')
            if len(lines) > 2:
                content = '\n'.join(lines[1:-1]).strip()

        # Ensure it starts with DOCTYPE
        if not content.startswith('<!DOCTYPE'):
            if content.startswith('<html'):
                content = '<!DOCTYPE html>\n' + content
            else:
                content = f'<!DOCTYPE html>\n<html>\n<head><meta charset="UTF-8"><title>{title}</title></head>\n<body>\n{content}\n</body>\n</html>'

        return content
@ -1,74 +0,0 @@
"""
JSON renderer for report generation.
"""

from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
import json

class JsonRenderer(BaseRenderer):
    """Renders content to JSON format with format-specific extraction."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported JSON formats."""
        return ['json']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['data']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for JSON renderer."""
        return 80

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only JSON-specific guidelines; global prompt is built centrally."""
        return (
            "JSON FORMAT GUIDELINES:\n"
            "- Output ONLY a single valid JSON object (no fences, no pre/post text).\n"
            "- Choose a structure that best fits the user's intent; include a top-level title and data.\n"
            "- Prefer arrays/objects that map cleanly to the extracted facts.\n"
            "- Include minimal metadata only if useful (e.g., generatedAt, sources).\n"
            "OUTPUT: Return ONLY valid, parseable JSON."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to JSON format."""
        try:
            # The extracted content should already be JSON from the AI
            # Just validate and format it
            json_content = self._clean_json_content(extracted_content, title)

            return json_content, "application/json"

        except Exception as e:
            self.logger.error(f"Error rendering JSON: {str(e)}")
            # Return minimal JSON fallback
            fallback_data = {
                "title": title,
                "sections": [{"type": "text", "content": f"Error rendering report: {str(e)}"}],
                "metadata": {"error": str(e)}
            }
            return json.dumps(fallback_data, indent=2), "application/json"

    def _clean_json_content(self, content: str, title: str) -> str:
        """Clean and validate JSON content from AI."""
        content = content.strip()

        # Remove markdown code blocks if present
        if content.startswith("```") and content.endswith("```"):
            lines = content.split('\n')
            if len(lines) > 2:
                content = '\n'.join(lines[1:-1]).strip()

        # Validate JSON
        try:
            parsed = json.loads(content)
            # Re-format with proper indentation
            return json.dumps(parsed, indent=2, ensure_ascii=False)
        except json.JSONDecodeError:
            # If not valid JSON, return as-is
            return content
@ -1,65 +0,0 @@
"""
Markdown renderer for report generation.
"""

from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List

class MarkdownRenderer(BaseRenderer):
    """Renders content to Markdown format with format-specific extraction."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported Markdown formats."""
        return ['md', 'markdown']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['mdown', 'mkd']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for markdown renderer."""
        return 95

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only Markdown-specific guidelines; global prompt is built centrally."""
        return (
            "MARKDOWN FORMAT GUIDELINES:\n"
            "- Use proper Markdown syntax only (no HTML wrappers).\n"
            "- # for main title, ## for sections, ### for subsections.\n"
            "- Tables with | separators and a header row.\n"
            "- Bullet lists with - or *.\n"
            "- Emphasis with **bold** and *italic*.\n"
            "- Code blocks with ```language.\n"
            "- Horizontal rules (---) to separate major sections when helpful.\n"
            "- Include links [text](url) and images ![alt](url) when referenced by sources.\n"
            "OUTPUT: Return ONLY raw Markdown content without code fences."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to Markdown format."""
        try:
            # The extracted content should already be Markdown from the AI
            # Just clean it up
            markdown_content = self._clean_markdown_content(extracted_content, title)

            return markdown_content, "text/markdown"

        except Exception as e:
            self.logger.error(f"Error rendering markdown: {str(e)}")
            # Return minimal markdown fallback
            return f"# {title}\n\nError rendering report: {str(e)}", "text/markdown"

    def _clean_markdown_content(self, content: str, title: str) -> str:
        """Clean and validate Markdown content from AI."""
        content = content.strip()

        # Remove markdown code blocks if present
        if content.startswith("```") and content.endswith("```"):
            lines = content.split('\n')
            if len(lines) > 2:
                content = '\n'.join(lines[1:-1]).strip()

        return content
@ -1,225 +0,0 @@
|
||||||
"""
|
|
||||||
PDF renderer for report generation using reportlab.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from .base_renderer import BaseRenderer
|
|
||||||
from typing import Dict, Any, Tuple, List
|
|
||||||
import io
|
|
||||||
import base64
|
|
||||||
from datetime import datetime, UTC
|
|
||||||
|
|
||||||
try:
|
|
||||||
from reportlab.lib.pagesizes import letter, A4
|
|
||||||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
|
|
||||||
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
|
||||||
from reportlab.lib.units import inch
|
|
||||||
from reportlab.lib import colors
|
|
||||||
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
|
|
||||||
REPORTLAB_AVAILABLE = True
|
|
||||||
except ImportError:
|
|
||||||
REPORTLAB_AVAILABLE = False
|
|
||||||
|
|
||||||
class PdfRenderer(BaseRenderer):
|
|
||||||
"""Renders content to PDF format using reportlab."""
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_supported_formats(cls) -> List[str]:
|
|
||||||
"""Return supported PDF formats."""
|
|
||||||
return ['pdf']
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_format_aliases(cls) -> List[str]:
|
|
||||||
"""Return format aliases."""
|
|
||||||
return ['document', 'print']
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_priority(cls) -> int:
|
|
||||||
"""Return priority for PDF renderer."""
|
|
||||||
return 120
|
|
||||||
|
|
||||||
def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
|
|
||||||
"""Return only PDF-specific guidelines; global prompt is built centrally."""
|
|
||||||
return (
|
|
||||||
"PDF FORMAT GUIDELINES:\n"
|
|
||||||
"- Provide structured content suitable for pagination and headings (H1/H2/H3-like).\n"
|
|
||||||
"- Use bullet lists and tables where useful; separate major sections clearly.\n"
|
|
||||||
"- Avoid markdown/HTML; produce clean, plain content that can be laid out as PDF.\n"
|
|
||||||
"OUTPUT: Return ONLY the PDF-ready textual content (no fences)."
|
|
||||||
)
|
|
||||||
|
|
||||||
async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
|
|
||||||
"""Render extracted content to PDF format."""
|
|
||||||
try:
|
|
||||||
if not REPORTLAB_AVAILABLE:
|
|
||||||
# Fallback to HTML if reportlab not available
|
|
||||||
from .html_renderer import HtmlRenderer
|
|
||||||
html_renderer = HtmlRenderer()
|
|
||||||
html_content, _ = await html_renderer.render(extracted_content, title)
|
|
||||||
return html_content, "text/html"
|
|
||||||
|
|
||||||
# Generate PDF using reportlab
|
|
||||||
pdf_content = self._generate_pdf(extracted_content, title)
|
|
||||||
|
|
||||||
return pdf_content, "application/pdf"
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error rendering PDF: {str(e)}")
|
|
||||||
# Return minimal fallback
|
|
||||||
return f"PDF Generation Error: {str(e)}", "text/plain"
|
|
||||||
|
|
||||||
def _generate_pdf(self, content: str, title: str) -> str:
|
|
||||||
"""Generate PDF content using reportlab."""
|
|
||||||
try:
|
|
||||||
# Create a buffer to hold the PDF
|
|
||||||
buffer = io.BytesIO()
|
|
||||||
|
|
||||||
# Create PDF document
|
|
||||||
doc = SimpleDocTemplate(
|
|
||||||
buffer,
|
|
||||||
pagesize=A4,
|
|
||||||
rightMargin=72,
|
|
||||||
leftMargin=72,
|
|
||||||
topMargin=72,
|
|
||||||
bottomMargin=18
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get styles
|
|
||||||
styles = getSampleStyleSheet()
|
|
||||||
|
|
||||||
# Create custom styles
|
|
||||||
title_style = ParagraphStyle(
|
|
||||||
'CustomTitle',
|
|
||||||
parent=styles['Heading1'],
|
|
||||||
fontSize=24,
|
|
||||||
spaceAfter=30,
|
|
||||||
alignment=TA_CENTER,
|
|
||||||
textColor=colors.darkblue
|
|
||||||
)
|
|
||||||
|
|
||||||
heading_style = ParagraphStyle(
|
|
||||||
'CustomHeading',
|
|
||||||
parent=styles['Heading2'],
|
|
||||||
fontSize=16,
|
|
||||||
spaceAfter=12,
|
|
||||||
spaceBefore=12,
|
|
||||||
textColor=colors.darkblue
|
|
||||||
)
|
|
||||||
|
|
||||||
# Build PDF content
|
|
||||||
story = []
|
|
||||||
|
|
||||||
# Title page
|
|
||||||
story.append(Paragraph(title, title_style))
|
|
||||||
story.append(Spacer(1, 20))
|
|
||||||
story.append(Paragraph(f"Generated: {self._format_timestamp()}", styles['Normal']))
|
|
||||||
story.append(PageBreak())
|
|
||||||
|
|
||||||
# Process content
|
|
||||||
lines = content.split('\n')
|
|
||||||
current_section = []
|
|
||||||
|
|
||||||
for line in lines:
|
|
||||||
line = line.strip()
|
|
||||||
if not line:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check for headings
|
|
||||||
if line.startswith('# '):
|
|
||||||
# H1 heading
|
|
||||||
if current_section:
|
|
||||||
story.extend(self._process_section(current_section, styles))
|
|
||||||
current_section = []
|
|
||||||
story.append(Paragraph(line[2:], title_style))
|
|
||||||
story.append(Spacer(1, 12))
|
|
||||||
elif line.startswith('## '):
|
|
||||||
# H2 heading
|
|
||||||
if current_section:
|
|
||||||
story.extend(self._process_section(current_section, styles))
|
|
||||||
current_section = []
|
|
||||||
story.append(Paragraph(line[3:], heading_style))
|
|
||||||
story.append(Spacer(1, 8))
|
|
||||||
elif line.startswith('### '):
|
|
||||||
# H3 heading
|
|
||||||
if current_section:
|
|
||||||
story.extend(self._process_section(current_section, styles))
|
|
||||||
current_section = []
|
|
||||||
story.append(Paragraph(line[4:], styles['Heading3']))
|
|
||||||
story.append(Spacer(1, 6))
|
|
||||||
else:
|
|
||||||
current_section.append(line)
|
|
||||||
|
|
||||||
# Process remaining content
|
|
||||||
if current_section:
|
|
||||||
story.extend(self._process_section(current_section, styles))
|
|
||||||
|
|
||||||
# Build PDF
|
|
||||||
doc.build(story)
|
|
||||||
|
|
||||||
# Get PDF content as base64
|
|
||||||
buffer.seek(0)
|
|
||||||
pdf_bytes = buffer.getvalue()
|
|
||||||
pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
|
|
||||||
|
|
||||||
return pdf_base64
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error generating PDF: {str(e)}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def _process_section(self, lines: list, styles) -> list:
|
|
||||||
"""Process a section of content into PDF elements."""
|
|
||||||
elements = []
|
|
||||||
|
|
||||||
for line in lines:
|
|
||||||
if not line.strip():
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check for tables (lines with |)
|
|
||||||
if '|' in line and not line.startswith('|'):
|
|
||||||
# This might be part of a table, process as table
|
|
||||||
table_data = self._extract_table_data(lines)
|
|
||||||
if table_data:
|
|
||||||
table = Table(table_data)
|
|
||||||
table.setStyle(TableStyle([
|
|
||||||
('BACKGROUND', (0, 0), (-1, 0), colors.grey),
|
|
||||||
('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
|
|
||||||
('ALIGN', (0, 0), (-1, -1), 'CENTER'),
|
|
||||||
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
|
||||||
('FONTSIZE', (0, 0), (-1, 0), 14),
|
|
||||||
('BOTTOMPADDING', (0, 0), (-1, 0), 12),
|
|
||||||
('BACKGROUND', (0, 1), (-1, -1), colors.beige),
|
|
||||||
('GRID', (0, 0), (-1, -1), 1, colors.black)
|
|
||||||
]))
|
|
||||||
elements.append(table)
|
|
||||||
elements.append(Spacer(1, 12))
|
|
||||||
return elements
|
|
||||||
|
|
||||||
# Check for lists
|
|
||||||
if line.startswith('- ') or line.startswith('* '):
|
|
||||||
# This is a list item
|
|
||||||
elements.append(Paragraph(f"• {line[2:]}", styles['Normal']))
|
|
||||||
else:
|
|
||||||
# Regular paragraph
|
|
||||||
elements.append(Paragraph(line, styles['Normal']))
|
|
||||||
|
|
||||||
elements.append(Spacer(1, 6))
|
|
||||||
return elements
|
|
||||||
|
|
||||||
def _extract_table_data(self, lines: list) -> list:
|
|
||||||
"""Extract table data from lines."""
|
|
||||||
table_data = []
|
|
||||||
in_table = False
|
|
||||||
|
|
||||||
for line in lines:
|
|
||||||
if '|' in line:
|
|
||||||
if not in_table:
|
|
||||||
in_table = True
|
|
||||||
# Split by | and clean up
|
|
||||||
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
|
|
||||||
if cells:
|
|
||||||
table_data.append(cells)
|
|
||||||
elif in_table and not line.strip():
|
|
||||||
# Empty line, might be end of table
|
|
||||||
break
|
|
||||||
|
|
||||||
return table_data if len(table_data) > 1 else []
|
|
||||||
|
|
@@ -6,7 +6,7 @@ import logging
 import importlib
 import pkgutil
 from typing import Dict, Type, List, Optional
-from .base_renderer import BaseRenderer
+from .rendererBaseTemplate import BaseRenderer
 
 logger = logging.getLogger(__name__)
 
@@ -37,7 +37,7 @@ class RendererRegistry:
 
         # Scan all Python files in the renderers directory
         for file_path in renderers_dir.glob("*.py"):
-            if file_path.name in ['registry.py', 'base_renderer.py', '__init__.py']:
+            if file_path.name in ['registry.py', 'rendererBaseTemplate.py', '__init__.py']:
                 continue
 
             # Extract module name from filename
@@ -92,7 +92,7 @@ class RendererRegistry:
         except Exception as e:
             logger.error(f"Error registering renderer {renderer_class.__name__}: {str(e)}")
 
-    def get_renderer(self, output_format: str) -> Optional[BaseRenderer]:
+    def get_renderer(self, output_format: str, services=None) -> Optional[BaseRenderer]:
         """Get a renderer instance for the specified format."""
         if not self._discovered:
             self.discover_renderers()
@@ -109,7 +109,7 @@ class RendererRegistry:
 
         if renderer_class:
             try:
-                return renderer_class()
+                return renderer_class(services=services)
             except Exception as e:
                 logger.error(f"Error creating renderer instance for {format_name}: {str(e)}")
                 return None
@@ -144,9 +144,9 @@ class RendererRegistry:
 # Global registry instance
 _registry = RendererRegistry()
 
-def get_renderer(output_format: str) -> Optional[BaseRenderer]:
+def get_renderer(output_format: str, services=None) -> Optional[BaseRenderer]:
     """Get a renderer instance for the specified format."""
-    return _registry.get_renderer(output_format)
+    return _registry.get_renderer(output_format, services)
 
 def get_supported_formats() -> List[str]:
     """Get list of all supported formats."""
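A minimal usage sketch of the updated registry API. The import path is inferred from the file paths in this change, and the "csv" format, report_json payload, and services object are assumptions for illustration:

    # Illustrative sketch only - not part of the diff.
    from modules.services.serviceGeneration.renderers.registry import get_renderer, get_supported_formats

    async def build_report(report_json, services):
        # services is now threaded through to the renderer constructor
        renderer = get_renderer("csv", services=services)
        if renderer is None:
            raise ValueError(f"Unsupported format; available: {get_supported_formats()}")
        content, mime_type = await renderer.render(report_json, "Quarterly Report")
        return content, mime_type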
@@ -0,0 +1,459 @@
"""
Base renderer class for all format renderers.
"""

from abc import ABC, abstractmethod
from typing import Dict, Any, Tuple, List
import logging
import json

logger = logging.getLogger(__name__)


class BaseRenderer(ABC):
    """Base class for all format renderers."""

    def __init__(self, services=None):
        self.logger = logger
        self.services = services  # Add services attribute

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """
        Return list of supported format names for this renderer.
        Override this method in subclasses to specify supported formats.
        """
        return []

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """
        Return list of format aliases for this renderer.
        Override this method in subclasses to specify format aliases.
        """
        return []

    @classmethod
    def get_priority(cls) -> int:
        """
        Return priority for this renderer (higher number = higher priority).
        Used when multiple renderers support the same format.
        """
        return 0

    @abstractmethod
    async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
        """
        Render extracted JSON content to the target format.

        Args:
            extracted_content: Structured JSON content with sections and metadata
            title: Report title
            user_prompt: Original user prompt for context
            ai_service: AI service instance for additional processing

        Returns:
            tuple: (rendered_content, mime_type)
        """
        pass

    def _extract_sections(self, report_data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Extract sections from report data."""
        return report_data.get('sections', [])

    def _extract_metadata(self, report_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract metadata from report data."""
        return report_data.get('metadata', {})

    def _get_title(self, report_data: Dict[str, Any], fallback_title: str) -> str:
        """Get title from report data or use fallback."""
        metadata = report_data.get('metadata', {})
        return metadata.get('title', fallback_title)

    def _validate_json_structure(self, json_content: Dict[str, Any]) -> bool:
        """Validate that JSON content has the expected structure."""
        if not isinstance(json_content, dict):
            return False

        if "sections" not in json_content:
            return False

        sections = json_content.get("sections", [])
        if not isinstance(sections, list):
            return False

        # Validate each section has content_type and elements
        for section in sections:
            if not isinstance(section, dict):
                return False
            if "content_type" not in section or "elements" not in section:
                return False

        return True

    def _get_section_type(self, section: Dict[str, Any]) -> str:
        """Get the type of a section."""
        return section.get("content_type", "paragraph")

    def _get_section_data(self, section: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Get the elements of a section."""
        return section.get("elements", [])

    def _get_section_id(self, section: Dict[str, Any]) -> str:
        """Get the ID of a section (if available)."""
        return section.get("id", "unknown")

    def _extract_table_data(self, section_data: Dict[str, Any]) -> Tuple[List[str], List[List[str]]]:
        """Extract table headers and rows from section data."""
        headers = section_data.get("headers", [])
        rows = section_data.get("rows", [])
        return headers, rows

    def _extract_bullet_list_items(self, section_data: Dict[str, Any]) -> List[str]:
        """Extract bullet list items from section data."""
        items = section_data.get("items", [])
        result = []
        for item in items:
            if isinstance(item, str):
                result.append(item)
            elif isinstance(item, dict) and "text" in item:
                result.append(item["text"])
        return result

    def _extract_heading_data(self, section_data: Dict[str, Any]) -> Tuple[int, str]:
        """Extract heading level and text from section data."""
        level = section_data.get("level", 1)
        text = section_data.get("text", "")
        return level, text

    def _extract_paragraph_text(self, section_data: Dict[str, Any]) -> str:
        """Extract paragraph text from section data."""
        return section_data.get("text", "")

    def _extract_code_block_data(self, section_data: Dict[str, Any]) -> Tuple[str, str]:
        """Extract code and language from section data."""
        code = section_data.get("code", "")
        language = section_data.get("language", "")
        return code, language

    def _extract_image_data(self, section_data: Dict[str, Any]) -> Tuple[str, str]:
        """Extract base64 data and alt text from section data."""
        base64_data = section_data.get("base64Data", "")
        alt_text = section_data.get("altText", "Image")
        return base64_data, alt_text

    def _render_image_section(self, section: Dict[str, Any], styles: Dict[str, Any] = None) -> Any:
        """
        Render an image section. This is a base implementation that should be overridden
        by format-specific renderers.

        Args:
            section: Image section data
            styles: Optional styling information

        Returns:
            Format-specific image representation
        """
        section_data = self._get_section_data(section)
        base64_data, alt_text = self._extract_image_data(section_data)

        # Base implementation returns a simple dict;
        # format-specific renderers should override this method.
        return {
            "content_type": "image",
            "base64Data": base64_data,
            "altText": alt_text,
            "width": section_data.get("width", None),
            "height": section_data.get("height", None),
            "caption": section_data.get("caption", "")
        }

    def _validate_image_data(self, base64_data: str, alt_text: str) -> bool:
        """Validate image data."""
        if not base64_data:
            self.logger.warning("Image section has no base64 data")
            return False

        if not alt_text:
            self.logger.warning("Image section has no alt text")
            return False

        # Basic base64 validation
        try:
            import base64
            base64.b64decode(base64_data, validate=True)
            return True
        except Exception as e:
            self.logger.warning(f"Invalid base64 image data: {str(e)}")
            return False

    def _get_image_dimensions(self, base64_data: str) -> Tuple[int, int]:
        """
        Get image dimensions from base64 data.
        This is a helper method that format-specific renderers can use.
        """
        try:
            import base64
            from PIL import Image
            import io

            # Decode base64 data
            image_data = base64.b64decode(base64_data)
            image = Image.open(io.BytesIO(image_data))

            return image.size  # Returns (width, height)

        except Exception as e:
            self.logger.warning(f"Could not determine image dimensions: {str(e)}")
            return (0, 0)

    def _resize_image_if_needed(self, base64_data: str, max_width: int = 800, max_height: int = 600) -> str:
        """
        Resize image if it exceeds maximum dimensions.
        Returns the resized image as base64 string.
        """
        try:
            import base64
            from PIL import Image
            import io

            # Decode base64 data
            image_data = base64.b64decode(base64_data)
            image = Image.open(io.BytesIO(image_data))

            # Check if resizing is needed
            width, height = image.size
            if width <= max_width and height <= max_height:
                return base64_data  # No resizing needed

            # Calculate new dimensions maintaining aspect ratio
            ratio = min(max_width / width, max_height / height)
            new_width = int(width * ratio)
            new_height = int(height * ratio)

            # Resize image
            resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

            # Convert back to base64
            buffer = io.BytesIO()
            resized_image.save(buffer, format=image.format or 'PNG')
            resized_data = buffer.getvalue()

            return base64.b64encode(resized_data).decode('utf-8')

        except Exception as e:
            self.logger.warning(f"Could not resize image: {str(e)}")
            return base64_data  # Return original if resize fails

    def _get_supported_section_types(self) -> List[str]:
        """Return list of supported section types."""
        return ["table", "bullet_list", "heading", "paragraph", "code_block", "image"]

    def _is_valid_section_type(self, section_type: str) -> bool:
        """Check if a section type is valid."""
        return section_type in self._get_supported_section_types()

    def _process_section_by_type(self, section: Dict[str, Any]) -> Dict[str, Any]:
        """Process a section and return structured data based on its type."""
        section_type = self._get_section_type(section)
        section_data = self._get_section_data(section)

        if section_type == "table":
            headers, rows = self._extract_table_data(section_data)
            return {"content_type": "table", "headers": headers, "rows": rows}
        elif section_type == "bullet_list":
            items = self._extract_bullet_list_items(section_data)
            return {"content_type": "bullet_list", "items": items}
        elif section_type == "heading":
            level, text = self._extract_heading_data(section_data)
            return {"content_type": "heading", "level": level, "text": text}
        elif section_type == "paragraph":
            text = self._extract_paragraph_text(section_data)
            return {"content_type": "paragraph", "text": text}
        elif section_type == "code_block":
            code, language = self._extract_code_block_data(section_data)
            return {"content_type": "code_block", "code": code, "language": language}
        elif section_type == "image":
            base64_data, alt_text = self._extract_image_data(section_data)
            # Validate image data
            if self._validate_image_data(base64_data, alt_text):
                return {
                    "content_type": "image",
                    "base64Data": base64_data,
                    "altText": alt_text,
                    "width": section_data.get("width"),
                    "height": section_data.get("height"),
                    "caption": section_data.get("caption", "")
                }
            else:
                # Return placeholder if image data is invalid
                return {"content_type": "paragraph", "text": f"[Image: {alt_text}]"}
        else:
            # Fallback to paragraph
            text = self._extract_paragraph_text(section_data)
            return {"content_type": "paragraph", "text": text}

    def _format_timestamp(self, timestamp: str = None) -> str:
        """Format timestamp for display."""
        if timestamp:
            return timestamp
        from datetime import datetime, UTC
        return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")

    # ===== GENERIC AI STYLING HELPERS =====

    async def _get_ai_styles(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
        """
        Generic AI styling method that can be used by all renderers.

        Args:
            ai_service: AI service instance
            style_template: Format-specific style template
            default_styles: Default styles to fall back to

        Returns:
            Dict with styling definitions
        """
        if not ai_service:
            return default_styles

        try:
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL

            request = AiCallRequest(prompt=style_template, context="", options=request_options)

            # DEBUG: Show the actual prompt being sent to AI
            self.logger.debug("AI Style Template Prompt:")
            self.logger.debug(f"{style_template}")

            response = await ai_service.aiObjects.call(request)

            import json
            import re

            # Clean and parse JSON
            result = response.content.strip() if response and response.content else ""

            # Check if result is empty
            if not result:
                self.logger.warning("AI styling returned empty response, using defaults")
                return default_styles

            # Extract JSON from markdown if present
            json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
            if json_match:
                result = json_match.group(1).strip()
            elif result.startswith('```json'):
                result = re.sub(r'^```json\s*', '', result)
                result = re.sub(r'\s*```$', '', result)
            elif result.startswith('```'):
                result = re.sub(r'^```\s*', '', result)
                result = re.sub(r'\s*```$', '', result)

            # Try to parse JSON
            try:
                styles = json.loads(result)
            except json.JSONDecodeError as json_error:
                self.logger.warning(f"AI styling returned invalid JSON: {json_error}")

                # Log the full response to a file to avoid log truncation
                self.services.utils.debugLogToFile(f"FULL AI RESPONSE THAT FAILED TO PARSE: {result}", "RENDERER")
                self.services.utils.debugLogToFile(f"RESPONSE LENGTH: {len(result)} characters", "RENDERER")

                self.logger.warning(f"Raw content that failed to parse: {result}")

                # Try to fix incomplete JSON by adding missing closing braces
                open_braces = result.count('{')
                close_braces = result.count('}')

                if open_braces > close_braces:
                    # JSON is incomplete, add missing closing braces
                    missing_braces = open_braces - close_braces
                    result = result + '}' * missing_braces
                    self.logger.info(f"Added {missing_braces} missing closing brace(s)")
                    self.logger.debug(f"Fixed JSON: {result}")

                    # Try parsing the fixed JSON
                    try:
                        styles = json.loads(result)
                        self.logger.info("Successfully fixed incomplete JSON")
                    except json.JSONDecodeError as fix_error:
                        self.logger.warning(f"Fixed JSON still invalid: {fix_error}")
                        self.logger.warning(f"Fixed JSON content: {result}")
                        # Try to extract just the JSON part if it's embedded in text
                        json_start = result.find('{')
                        json_end = result.rfind('}')
                        if json_start != -1 and json_end != -1 and json_end > json_start:
                            json_part = result[json_start:json_end+1]
                            try:
                                styles = json.loads(json_part)
                                self.logger.info("Successfully extracted JSON from explanatory text")
                            except json.JSONDecodeError:
                                self.logger.warning("Could not extract valid JSON from response, using defaults")
                                return default_styles
                        else:
                            return default_styles
                else:
                    # Try to extract just the JSON part if it's embedded in text
                    json_start = result.find('{')
                    json_end = result.rfind('}')
                    if json_start != -1 and json_end != -1 and json_end > json_start:
                        json_part = result[json_start:json_end+1]
                        try:
                            styles = json.loads(json_part)
                            self.logger.info("Successfully extracted JSON from explanatory text")
                        except json.JSONDecodeError:
                            self.logger.warning("Could not extract valid JSON from response, using defaults")
                            return default_styles
                    else:
                        return default_styles

            # Convert colors to appropriate format
            styles = self._convert_colors_format(styles)

            return styles

        except Exception as e:
            self.logger.warning(f"AI styling failed: {str(e)}, using defaults")
            return default_styles

    def _convert_colors_format(self, styles: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert colors to appropriate format based on renderer type.
        Override this method in subclasses for format-specific color handling.
        """
        return styles

    def _create_ai_style_template(self, format_name: str, user_prompt: str, style_schema: Dict[str, Any]) -> str:
        """
        Create a standardized AI style template for any format.

        Args:
            format_name: Name of the format (e.g., "docx", "xlsx", "pptx")
            user_prompt: User's original prompt
            style_schema: Format-specific style schema

        Returns:
            Formatted prompt string
        """
        schema_json = json.dumps(style_schema, indent=4)

        return f"""You are a professional document styling expert. Generate a complete JSON styling configuration for {format_name.upper()} documents.

Use this schema as a template and customize the values for professional document styling:

{schema_json}

Requirements:
- Return ONLY the complete JSON object (no markdown, no explanations)
- Customize colors, fonts, and spacing for professional appearance
- Ensure all objects are properly closed with closing braces
- Make the styling modern and professional

Return the complete JSON:"""
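To illustrate the contract the registry discovery relies on, here is a minimal hypothetical subclass. The class name, file name, and 'txt' format are invented for illustration and are not part of this change; only the method signatures mirror the base class above:

    # Hypothetical example, e.g. rendererTxt.py - not included in the diff.
    from .rendererBaseTemplate import BaseRenderer
    from typing import Dict, Any, Tuple, List

    class RendererTxt(BaseRenderer):
        """Illustrative plain-text renderer showing the minimal BaseRenderer contract."""

        @classmethod
        def get_supported_formats(cls) -> List[str]:
            return ['txt']

        @classmethod
        def get_priority(cls) -> int:
            return 10  # low priority; the real renderers in this change use 70-120

        async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
            # Walk the sections/elements structure using the base-class helpers.
            parts = [self._get_title(extracted_content, title)]
            for section in self._extract_sections(extracted_content):
                for element in self._get_section_data(section):
                    parts.append(element.get("text", ""))
            return "\n\n".join(p for p in parts if p), "text/plain"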
modules/services/serviceGeneration/renderers/rendererCsv.py  (Normal file, 260 lines)
@@ -0,0 +1,260 @@
"""
|
||||||
|
CSV renderer for report generation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .rendererBaseTemplate import BaseRenderer
|
||||||
|
from typing import Dict, Any, Tuple, List
|
||||||
|
import csv
|
||||||
|
import io
|
||||||
|
|
||||||
|
class RendererCsv(BaseRenderer):
|
||||||
|
"""Renders content to CSV format with format-specific extraction."""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_supported_formats(cls) -> List[str]:
|
||||||
|
"""Return supported CSV formats."""
|
||||||
|
return ['csv']
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_format_aliases(cls) -> List[str]:
|
||||||
|
"""Return format aliases."""
|
||||||
|
return ['spreadsheet', 'table']
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_priority(cls) -> int:
|
||||||
|
"""Return priority for CSV renderer."""
|
||||||
|
return 70
|
||||||
|
|
||||||
|
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
|
||||||
|
"""Render extracted JSON content to CSV format."""
|
||||||
|
try:
|
||||||
|
# Generate CSV directly from JSON (no styling needed for CSV)
|
||||||
|
csv_content = await self._generate_csv_from_json(extracted_content, title)
|
||||||
|
|
||||||
|
return csv_content, "text/csv"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error rendering CSV: {str(e)}")
|
||||||
|
# Return minimal CSV fallback
|
||||||
|
return f"Title,Content\n{title},Error rendering report: {str(e)}", "text/csv"
|
||||||
|
|
||||||
|
async def _generate_csv_from_json(self, json_content: Dict[str, Any], title: str) -> str:
|
||||||
|
"""Generate CSV content from structured JSON document."""
|
||||||
|
try:
|
||||||
|
# Validate JSON structure
|
||||||
|
if not isinstance(json_content, dict):
|
||||||
|
raise ValueError("JSON content must be a dictionary")
|
||||||
|
|
||||||
|
if "sections" not in json_content:
|
||||||
|
raise ValueError("JSON content must contain 'sections' field")
|
||||||
|
|
||||||
|
# Use title from JSON metadata if available, otherwise use provided title
|
||||||
|
document_title = json_content.get("metadata", {}).get("title", title)
|
||||||
|
|
||||||
|
# Generate CSV content
|
||||||
|
csv_rows = []
|
||||||
|
|
||||||
|
# Add title row
|
||||||
|
if document_title:
|
||||||
|
csv_rows.append([document_title])
|
||||||
|
csv_rows.append([]) # Empty row
|
||||||
|
|
||||||
|
# Process each section in order
|
||||||
|
sections = json_content.get("sections", [])
|
||||||
|
for section in sections:
|
||||||
|
section_csv = self._render_json_section_to_csv(section)
|
||||||
|
if section_csv:
|
||||||
|
csv_rows.extend(section_csv)
|
||||||
|
csv_rows.append([]) # Empty row between sections
|
||||||
|
|
||||||
|
# Convert to CSV string
|
||||||
|
csv_content = self._convert_rows_to_csv(csv_rows)
|
||||||
|
|
||||||
|
return csv_content
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error generating CSV from JSON: {str(e)}")
|
||||||
|
raise Exception(f"CSV generation failed: {str(e)}")
|
||||||
|
|
||||||
|
def _render_json_section_to_csv(self, section: Dict[str, Any]) -> List[List[str]]:
|
||||||
|
"""Render a single JSON section to CSV rows."""
|
||||||
|
try:
|
||||||
|
section_type = section.get("content_type", "paragraph")
|
||||||
|
elements = section.get("elements", [])
|
||||||
|
|
||||||
|
csv_rows = []
|
||||||
|
|
||||||
|
# Add section title if available
|
||||||
|
section_title = section.get("title")
|
||||||
|
if section_title:
|
||||||
|
csv_rows.append([f"# {section_title}"])
|
||||||
|
|
||||||
|
# Process each element in the section
|
||||||
|
for element in elements:
|
||||||
|
if section_type == "table":
|
||||||
|
csv_rows.extend(self._render_json_table_to_csv(element))
|
||||||
|
elif section_type == "list":
|
||||||
|
csv_rows.extend(self._render_json_list_to_csv(element))
|
||||||
|
elif section_type == "heading":
|
||||||
|
csv_rows.extend(self._render_json_heading_to_csv(element))
|
||||||
|
elif section_type == "paragraph":
|
||||||
|
csv_rows.extend(self._render_json_paragraph_to_csv(element))
|
||||||
|
elif section_type == "code":
|
||||||
|
csv_rows.extend(self._render_json_code_to_csv(element))
|
||||||
|
else:
|
||||||
|
# Fallback to paragraph for unknown types
|
||||||
|
csv_rows.extend(self._render_json_paragraph_to_csv(element))
|
||||||
|
|
||||||
|
return csv_rows
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}")
|
||||||
|
return [["[Error rendering section]"]]
|
||||||
|
|
||||||
|
def _render_json_table_to_csv(self, table_data: Dict[str, Any]) -> List[List[str]]:
|
||||||
|
"""Render a JSON table to CSV rows."""
|
||||||
|
try:
|
||||||
|
headers = table_data.get("headers", [])
|
||||||
|
rows = table_data.get("rows", [])
|
||||||
|
|
||||||
|
csv_rows = []
|
||||||
|
|
||||||
|
if headers:
|
||||||
|
csv_rows.append(headers)
|
||||||
|
|
||||||
|
if rows:
|
||||||
|
csv_rows.extend(rows)
|
||||||
|
|
||||||
|
return csv_rows
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error rendering table: {str(e)}")
|
||||||
|
return [["[Error rendering table]"]]
|
||||||
|
|
||||||
|
def _render_json_list_to_csv(self, list_data: Dict[str, Any]) -> List[List[str]]:
|
||||||
|
"""Render a JSON list to CSV rows."""
|
||||||
|
try:
|
||||||
|
items = list_data.get("items", [])
|
||||||
|
csv_rows = []
|
||||||
|
|
||||||
|
for item in items:
|
||||||
|
if isinstance(item, dict):
|
||||||
|
text = item.get("text", "")
|
||||||
|
subitems = item.get("subitems", [])
|
||||||
|
csv_rows.append([text])
|
||||||
|
|
||||||
|
# Add subitems as indented rows
|
||||||
|
for subitem in subitems:
|
||||||
|
if isinstance(subitem, dict):
|
||||||
|
csv_rows.append([f" - {subitem.get('text', '')}"])
|
||||||
|
else:
|
||||||
|
csv_rows.append([f" - {subitem}"])
|
||||||
|
else:
|
||||||
|
csv_rows.append([str(item)])
|
||||||
|
|
||||||
|
return csv_rows
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error rendering list: {str(e)}")
|
||||||
|
return [["[Error rendering list]"]]
|
||||||
|
|
||||||
|
def _render_json_heading_to_csv(self, heading_data: Dict[str, Any]) -> List[List[str]]:
|
||||||
|
"""Render a JSON heading to CSV rows."""
|
||||||
|
try:
|
||||||
|
text = heading_data.get("text", "")
|
||||||
|
level = heading_data.get("level", 1)
|
||||||
|
|
||||||
|
if text:
|
||||||
|
# Use # symbols for heading levels
|
||||||
|
heading_text = f"{'#' * level} {text}"
|
||||||
|
return [[heading_text]]
|
||||||
|
|
||||||
|
return []
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error rendering heading: {str(e)}")
|
||||||
|
return [["[Error rendering heading]"]]
|
||||||
|
|
||||||
|
def _render_json_paragraph_to_csv(self, paragraph_data: Dict[str, Any]) -> List[List[str]]:
|
||||||
|
"""Render a JSON paragraph to CSV rows."""
|
||||||
|
try:
|
||||||
|
text = paragraph_data.get("text", "")
|
||||||
|
|
||||||
|
if text:
|
||||||
|
# Split long paragraphs into multiple rows if needed
|
||||||
|
if len(text) > 100:
|
||||||
|
words = text.split()
|
||||||
|
rows = []
|
||||||
|
current_row = []
|
||||||
|
current_length = 0
|
||||||
|
|
||||||
|
for word in words:
|
||||||
|
if current_length + len(word) > 100 and current_row:
|
||||||
|
rows.append([" ".join(current_row)])
|
||||||
|
current_row = [word]
|
||||||
|
current_length = len(word)
|
||||||
|
else:
|
||||||
|
current_row.append(word)
|
||||||
|
current_length += len(word) + 1
|
||||||
|
|
||||||
|
if current_row:
|
||||||
|
rows.append([" ".join(current_row)])
|
||||||
|
|
||||||
|
return rows
|
||||||
|
else:
|
||||||
|
return [[text]]
|
||||||
|
|
||||||
|
return []
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error rendering paragraph: {str(e)}")
|
||||||
|
return [["[Error rendering paragraph]"]]
|
||||||
|
|
||||||
|
def _render_json_code_to_csv(self, code_data: Dict[str, Any]) -> List[List[str]]:
|
||||||
|
"""Render a JSON code block to CSV rows."""
|
||||||
|
try:
|
||||||
|
code = code_data.get("code", "")
|
||||||
|
language = code_data.get("language", "")
|
||||||
|
|
||||||
|
csv_rows = []
|
||||||
|
|
||||||
|
if language:
|
||||||
|
csv_rows.append([f"Code ({language}):"])
|
||||||
|
|
||||||
|
if code:
|
||||||
|
# Split code into lines
|
||||||
|
code_lines = code.split('\n')
|
||||||
|
for line in code_lines:
|
||||||
|
csv_rows.append([f" {line}"])
|
||||||
|
|
||||||
|
return csv_rows
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error rendering code block: {str(e)}")
|
||||||
|
return [["[Error rendering code block]"]]
|
||||||
|
|
||||||
|
def _convert_rows_to_csv(self, rows: List[List[str]]) -> str:
|
||||||
|
"""Convert rows to CSV string."""
|
||||||
|
import csv
|
||||||
|
import io
|
||||||
|
|
||||||
|
output = io.StringIO()
|
||||||
|
writer = csv.writer(output)
|
||||||
|
|
||||||
|
for row in rows:
|
||||||
|
if row: # Only write non-empty rows
|
||||||
|
writer.writerow(row)
|
||||||
|
|
||||||
|
return output.getvalue()
|
||||||
|
|
||||||
|
def _clean_csv_content(self, content: str, title: str) -> str:
|
||||||
|
"""Clean and validate CSV content from AI."""
|
||||||
|
content = content.strip()
|
||||||
|
|
||||||
|
# Remove markdown code blocks if present
|
||||||
|
if content.startswith("```") and content.endswith("```"):
|
||||||
|
lines = content.split('\n')
|
||||||
|
if len(lines) > 2:
|
||||||
|
content = '\n'.join(lines[1:-1]).strip()
|
||||||
|
|
||||||
|
return content
|
||||||
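For orientation, a sketch of the sections/elements structure this CSV renderer walks. Only the keys mirror what the code above reads; the ids, titles, and values are invented:

    # Hypothetical input document - illustrative only.
    report_json = {
        "metadata": {"title": "Team Capacity"},
        "sections": [
            {"id": "s1", "content_type": "heading",
             "elements": [{"level": 2, "text": "Summary"}]},
            {"id": "s2", "content_type": "table",
             "elements": [{"headers": ["Name", "Days"], "rows": [["Alice", "5"], ["Bob", "3"]]}]},
        ],
    }

    # Inside an async context:
    #   csv_text, mime = await RendererCsv().render(report_json, "Team Capacity")
    #   mime == "text/csv"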
modules/services/serviceGeneration/renderers/rendererDocx.py  (Normal file, 958 lines)
@@ -0,0 +1,958 @@
"""
|
||||||
|
DOCX renderer for report generation using python-docx.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .rendererBaseTemplate import BaseRenderer
|
||||||
|
from typing import Dict, Any, Tuple, List
|
||||||
|
import io
|
||||||
|
import base64
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
from datetime import datetime, UTC
|
||||||
|
|
||||||
|
try:
|
||||||
|
from docx import Document
|
||||||
|
from docx.shared import Inches, Pt, RGBColor
|
||||||
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||||
|
from docx.enum.table import WD_TABLE_ALIGNMENT
|
||||||
|
from docx.oxml.shared import OxmlElement, qn
|
||||||
|
from docx.oxml.ns import nsdecls
|
||||||
|
from docx.oxml import parse_xml
|
||||||
|
DOCX_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
DOCX_AVAILABLE = False
|
||||||
|
|
||||||
|
class RendererDocx(BaseRenderer):
|
||||||
|
"""Renders content to DOCX format using python-docx."""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_supported_formats(cls) -> List[str]:
|
||||||
|
"""Return supported DOCX formats."""
|
||||||
|
return ['docx', 'doc']
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_format_aliases(cls) -> List[str]:
|
||||||
|
"""Return format aliases."""
|
||||||
|
return ['word', 'document']
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_priority(cls) -> int:
|
||||||
|
"""Return priority for DOCX renderer."""
|
||||||
|
return 115
|
||||||
|
|
||||||
|
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
|
||||||
|
"""Render extracted JSON content to DOCX format using AI-analyzed styling."""
|
||||||
|
self.services.utils.debugLogToFile(f"DOCX RENDER CALLED: title={title}, user_prompt={user_prompt[:50] if user_prompt else 'None'}...", "DOCX_RENDERER")
|
||||||
|
try:
|
||||||
|
if not DOCX_AVAILABLE:
|
||||||
|
# Fallback to HTML if python-docx not available
|
||||||
|
from .rendererHtml import RendererHtml
|
||||||
|
html_renderer = RendererHtml()
|
||||||
|
html_content, _ = await html_renderer.render(extracted_content, title)
|
||||||
|
return html_content, "text/html"
|
||||||
|
|
||||||
|
# Generate DOCX using AI-analyzed styling
|
||||||
|
docx_content = await self._generate_docx_from_json(extracted_content, title, user_prompt, ai_service)
|
||||||
|
|
||||||
|
return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error rendering DOCX: {str(e)}")
|
||||||
|
# Return minimal fallback
|
||||||
|
return f"DOCX Generation Error: {str(e)}", "text/plain"
|
||||||
|
|
||||||
|
async def _generate_docx_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
|
||||||
|
"""Generate DOCX content from structured JSON document using AI-generated styling."""
|
||||||
|
try:
|
||||||
|
# Create new document
|
||||||
|
doc = Document()
|
||||||
|
|
||||||
|
# Get AI-generated styling definitions
|
||||||
|
self.logger.info(f"About to call AI styling with user_prompt: {user_prompt[:100] if user_prompt else 'None'}...")
|
||||||
|
styles = await self._get_docx_styles(user_prompt, ai_service)
|
||||||
|
|
||||||
|
# Apply basic document setup
|
||||||
|
self._setup_basic_document_styles(doc)
|
||||||
|
|
||||||
|
# Validate JSON structure
|
||||||
|
if not isinstance(json_content, dict):
|
||||||
|
raise ValueError("JSON content must be a dictionary")
|
||||||
|
|
||||||
|
if "sections" not in json_content:
|
||||||
|
raise ValueError("JSON content must contain 'sections' field")
|
||||||
|
|
||||||
|
# Use title from JSON metadata if available, otherwise use provided title
|
||||||
|
document_title = json_content.get("metadata", {}).get("title", title)
|
||||||
|
|
||||||
|
# Add document title using analyzed styles
|
||||||
|
if document_title:
|
||||||
|
title_heading = doc.add_heading(document_title, level=1)
|
||||||
|
title_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||||
|
|
||||||
|
# Process each section in order
|
||||||
|
sections = json_content.get("sections", [])
|
||||||
|
for section in sections:
|
||||||
|
self._render_json_section(doc, section, styles)
|
||||||
|
|
||||||
|
# Save to buffer
|
||||||
|
buffer = io.BytesIO()
|
||||||
|
doc.save(buffer)
|
||||||
|
buffer.seek(0)
|
||||||
|
|
||||||
|
# Convert to base64
|
||||||
|
docx_bytes = buffer.getvalue()
|
||||||
|
docx_base64 = base64.b64encode(docx_bytes).decode('utf-8')
|
||||||
|
|
||||||
|
return docx_base64
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error generating DOCX from JSON: {str(e)}")
|
||||||
|
raise Exception(f"DOCX generation failed: {str(e)}")
|
||||||
|
|
||||||
|
async def _get_docx_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]:
|
||||||
|
"""Get DOCX styling definitions using base template AI styling."""
|
||||||
|
style_schema = {
|
||||||
|
"title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center"},
|
||||||
|
"heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"},
|
||||||
|
"heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"},
|
||||||
|
"paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"},
|
||||||
|
"table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center"},
|
||||||
|
"table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left"},
|
||||||
|
"table_border": {"style": "horizontal_only", "color": "#000000", "thickness": "thin"},
|
||||||
|
"bullet_list": {"font_size": 11, "color": "#2F2F2F", "indent": 20},
|
||||||
|
"code_block": {"font": "Courier New", "font_size": 10, "color": "#2F2F2F", "background": "#F5F5F5"}
|
||||||
|
}
|
||||||
|
|
||||||
|
style_template = self._create_ai_style_template("docx", user_prompt, style_schema)
|
||||||
|
styles = await self._get_ai_styles(ai_service, style_template, self._get_default_styles())
|
||||||
|
|
||||||
|
# Validate and fix contrast issues
|
||||||
|
return self._validate_styles_contrast(styles)
|
||||||
|
|
||||||
|
def _validate_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""Validate and fix contrast issues in AI-generated styles."""
|
||||||
|
try:
|
||||||
|
# Fix table header contrast
|
||||||
|
if "table_header" in styles:
|
||||||
|
header = styles["table_header"]
|
||||||
|
bg_color = header.get("background", "#FFFFFF")
|
||||||
|
text_color = header.get("text_color", "#000000")
|
||||||
|
|
||||||
|
# If both are white or both are dark, fix it
|
||||||
|
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
|
||||||
|
header["background"] = "#4F4F4F"
|
||||||
|
header["text_color"] = "#FFFFFF"
|
||||||
|
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
|
||||||
|
header["background"] = "#4F4F4F"
|
||||||
|
header["text_color"] = "#FFFFFF"
|
||||||
|
|
||||||
|
# Fix table cell contrast
|
||||||
|
if "table_cell" in styles:
|
||||||
|
cell = styles["table_cell"]
|
||||||
|
bg_color = cell.get("background", "#FFFFFF")
|
||||||
|
text_color = cell.get("text_color", "#000000")
|
||||||
|
|
||||||
|
# If both are white or both are dark, fix it
|
||||||
|
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
|
||||||
|
cell["background"] = "#FFFFFF"
|
||||||
|
cell["text_color"] = "#2F2F2F"
|
||||||
|
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
|
||||||
|
cell["background"] = "#FFFFFF"
|
||||||
|
cell["text_color"] = "#2F2F2F"
|
||||||
|
|
||||||
|
return styles
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Style validation failed: {str(e)}")
|
||||||
|
return self._get_default_styles()
|
||||||
|
|
||||||
|
def _get_default_styles(self) -> Dict[str, Any]:
|
||||||
|
"""Default DOCX styles."""
|
||||||
|
return {
|
||||||
|
"title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center"},
|
||||||
|
"heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"},
|
||||||
|
"heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"},
|
||||||
|
"paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"},
|
||||||
|
"table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center"},
|
||||||
|
"table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left"},
|
||||||
|
"table_border": {"style": "horizontal_only", "color": "#000000", "thickness": "thin"},
|
||||||
|
"bullet_list": {"font_size": 11, "color": "#2F2F2F", "indent": 20},
|
||||||
|
"code_block": {"font": "Courier New", "font_size": 10, "color": "#2F2F2F", "background": "#F5F5F5"}
|
||||||
|
}
|
||||||
|
|
||||||
|
def _setup_basic_document_styles(self, doc: Document) -> None:
|
||||||
|
"""Set up basic document styles."""
|
||||||
|
try:
|
||||||
|
# Set default font
|
||||||
|
style = doc.styles['Normal']
|
||||||
|
font = style.font
|
||||||
|
font.name = 'Calibri'
|
||||||
|
font.size = Pt(11)
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Could not set up basic document styles: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _clear_template_content(self, doc: Document) -> None:
|
||||||
|
"""Clear template content while preserving styles."""
|
||||||
|
try:
|
||||||
|
# Remove all paragraphs except keep the styles
|
||||||
|
for paragraph in list(doc.paragraphs):
|
||||||
|
# Keep the paragraph but clear its content
|
||||||
|
paragraph.clear()
|
||||||
|
|
||||||
|
# Remove all tables
|
||||||
|
for table in list(doc.tables):
|
||||||
|
table._element.getparent().remove(table._element)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Could not clear template content: {str(e)}")
|
||||||
|
|
||||||
|
def _render_json_section(self, doc: Document, section: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||||
|
"""Render a single JSON section to DOCX using AI-generated styles."""
|
||||||
|
try:
|
||||||
|
section_type = section.get("content_type", "paragraph")
|
||||||
|
elements = section.get("elements", [])
|
||||||
|
|
||||||
|
# Process each element in the section
|
||||||
|
for element in elements:
|
||||||
|
if section_type == "table":
|
||||||
|
self._render_json_table(doc, element, styles)
|
||||||
|
elif section_type == "bullet_list":
|
||||||
|
self._render_json_bullet_list(doc, element, styles)
|
||||||
|
elif section_type == "heading":
|
||||||
|
self._render_json_heading(doc, element, styles)
|
||||||
|
elif section_type == "paragraph":
|
||||||
|
self._render_json_paragraph(doc, element, styles)
|
||||||
|
elif section_type == "code_block":
|
||||||
|
self._render_json_code_block(doc, element, styles)
|
||||||
|
elif section_type == "image":
|
||||||
|
self._render_json_image(doc, element, styles)
|
||||||
|
else:
|
||||||
|
# Fallback to paragraph for unknown types
|
||||||
|
self._render_json_paragraph(doc, element, styles)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}")
|
||||||
|
# Add error paragraph as fallback
|
||||||
|
error_para = doc.add_paragraph(f"[Error rendering section: {str(e)}]")
|
||||||
|
|
||||||
|
def _render_json_table(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||||
|
"""Render a JSON table to DOCX using AI-generated styles."""
|
||||||
|
try:
|
||||||
|
headers = table_data.get("headers", [])
|
||||||
|
rows = table_data.get("rows", [])
|
||||||
|
|
||||||
|
if not headers or not rows:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Create table
|
||||||
|
table = doc.add_table(rows=len(rows) + 1, cols=len(headers))
|
||||||
|
table.alignment = WD_TABLE_ALIGNMENT.CENTER
|
||||||
|
|
||||||
|
# Apply table borders based on AI style
|
||||||
|
border_style = styles["table_border"]["style"]
|
||||||
|
if border_style == "horizontal_only":
|
||||||
|
self._apply_horizontal_borders_only(table)
|
||||||
|
elif border_style == "grid":
|
||||||
|
table.style = 'Table Grid'
|
||||||
|
# else: no borders
|
||||||
|
|
||||||
|
# Add headers with AI-generated styling
|
||||||
|
header_row = table.rows[0]
|
||||||
|
header_style = styles["table_header"]
|
||||||
|
for i, header in enumerate(headers):
|
||||||
|
if i < len(header_row.cells):
|
||||||
|
cell = header_row.cells[i]
|
||||||
|
cell.text = str(header)
|
||||||
|
|
||||||
|
# Apply background color
|
||||||
|
bg_color = header_style["background"].lstrip('#')
|
||||||
|
self._set_cell_background(cell, RGBColor(int(bg_color[0:2], 16), int(bg_color[2:4], 16), int(bg_color[4:6], 16)))
|
||||||
|
|
||||||
|
# Apply text styling
|
||||||
|
for paragraph in cell.paragraphs:
|
||||||
|
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER if header_style["align"] == "center" else WD_ALIGN_PARAGRAPH.LEFT
|
||||||
|
for run in paragraph.runs:
|
||||||
|
run.bold = header_style["bold"]
|
||||||
|
run.font.size = Pt(11)
|
||||||
|
text_color = header_style["text_color"].lstrip('#')
|
||||||
|
run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16))
|
||||||
|
|
||||||
|
# Add data rows with AI-generated styling
|
||||||
|
cell_style = styles["table_cell"]
|
||||||
|
for row_idx, row_data in enumerate(rows):
|
||||||
|
if row_idx + 1 < len(table.rows):
|
||||||
|
table_row = table.rows[row_idx + 1]
|
||||||
|
for col_idx, cell_data in enumerate(row_data):
|
||||||
|
if col_idx < len(table_row.cells):
|
||||||
|
cell = table_row.cells[col_idx]
|
||||||
|
cell.text = str(cell_data)
|
||||||
|
|
||||||
|
# Apply text styling
|
||||||
|
for paragraph in cell.paragraphs:
|
||||||
|
paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
|
||||||
|
for run in paragraph.runs:
|
||||||
|
run.font.size = Pt(10)
|
||||||
|
text_color = cell_style["text_color"].lstrip('#')
|
||||||
|
run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16))
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error rendering table: {str(e)}")
|
||||||
|
|
||||||
|
def _apply_horizontal_borders_only(self, table) -> None:
|
||||||
|
"""Apply only horizontal borders to the table (no vertical borders)."""
|
||||||
|
try:
|
||||||
|
from docx.oxml.shared import OxmlElement, qn
|
||||||
|
|
||||||
|
# Get table properties
|
||||||
|
tbl_pr = table._element.find(qn('w:tblPr'))
|
||||||
|
if tbl_pr is None:
|
||||||
|
tbl_pr = OxmlElement('w:tblPr')
|
||||||
|
table._element.insert(0, tbl_pr)
|
||||||
|
|
||||||
|
# Remove existing borders
|
||||||
|
existing_borders = tbl_pr.find(qn('w:tblBorders'))
|
||||||
|
if existing_borders is not None:
|
||||||
|
tbl_pr.remove(existing_borders)
|
||||||
|
|
||||||
|
# Create new borders element
|
||||||
|
tbl_borders = OxmlElement('w:tblBorders')
|
||||||
|
|
||||||
|
# Top border
|
||||||
|
top_border = OxmlElement('w:top')
|
||||||
|
top_border.set(qn('w:val'), 'single')
|
||||||
|
top_border.set(qn('w:sz'), '4')
|
||||||
|
top_border.set(qn('w:space'), '0')
|
||||||
|
top_border.set(qn('w:color'), '000000')
|
||||||
|
tbl_borders.append(top_border)
|
||||||
|
|
||||||
|
# Bottom border
|
||||||
|
bottom_border = OxmlElement('w:bottom')
|
||||||
|
bottom_border.set(qn('w:val'), 'single')
|
||||||
|
bottom_border.set(qn('w:sz'), '4')
|
||||||
|
bottom_border.set(qn('w:space'), '0')
|
||||||
|
bottom_border.set(qn('w:color'), '000000')
|
||||||
|
tbl_borders.append(bottom_border)
|
||||||
|
|
||||||
|
# Left border - none
|
||||||
|
left_border = OxmlElement('w:left')
|
||||||
|
left_border.set(qn('w:val'), 'none')
|
||||||
|
tbl_borders.append(left_border)
|
||||||
|
|
||||||
|
# Right border - none
|
||||||
|
right_border = OxmlElement('w:right')
|
||||||
|
right_border.set(qn('w:val'), 'none')
|
||||||
|
tbl_borders.append(right_border)
|
||||||
|
|
||||||
|
# Inside horizontal border
|
||||||
|
inside_h_border = OxmlElement('w:insideH')
|
||||||
|
inside_h_border.set(qn('w:val'), 'single')
|
||||||
|
inside_h_border.set(qn('w:sz'), '4')
|
||||||
|
inside_h_border.set(qn('w:space'), '0')
|
||||||
|
inside_h_border.set(qn('w:color'), '000000')
|
||||||
|
tbl_borders.append(inside_h_border)
|
||||||
|
|
||||||
|
# Inside vertical border - none
|
||||||
|
inside_v_border = OxmlElement('w:insideV')
|
||||||
|
inside_v_border.set(qn('w:val'), 'none')
|
||||||
|
tbl_borders.append(inside_v_border)
|
||||||
|
|
||||||
|
tbl_pr.append(tbl_borders)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Could not apply horizontal borders: {str(e)}")
|
||||||
|
|
||||||
|
def _set_cell_background(self, cell, color: RGBColor) -> None:
|
||||||
|
"""Set the background color of a table cell."""
|
||||||
|
try:
|
||||||
|
from docx.oxml.shared import OxmlElement, qn
|
||||||
|
|
||||||
|
# Get cell properties
|
||||||
|
tc_pr = cell._element.find(qn('w:tcPr'))
|
||||||
|
if tc_pr is None:
|
||||||
|
tc_pr = OxmlElement('w:tcPr')
|
||||||
|
cell._element.insert(0, tc_pr)
|
||||||
|
|
||||||
|
# Remove existing shading
|
||||||
|
existing_shading = tc_pr.find(qn('w:shd'))
|
||||||
|
if existing_shading is not None:
|
||||||
|
tc_pr.remove(existing_shading)
|
||||||
|
|
||||||
|
# Create new shading element
|
||||||
|
shading = OxmlElement('w:shd')
|
||||||
|
shading.set(qn('w:val'), 'clear')
|
||||||
|
shading.set(qn('w:color'), 'auto')
|
||||||
|
# Convert RGBColor to hex string by unpacking RGB components
|
||||||
|
red, green, blue = color
|
||||||
|
hex_color = f"{red:02x}{green:02x}{blue:02x}"
|
||||||
|
shading.set(qn('w:fill'), hex_color)
|
||||||
|
tc_pr.append(shading)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Could not set cell background: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
    def _render_json_bullet_list(self, doc: Document, list_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
        """Render a JSON bullet list to DOCX using AI-generated styles."""
        try:
            items = list_data.get("items", [])
            bullet_style = styles["bullet_list"]

            for item in items:
                if isinstance(item, str):
                    para = doc.add_paragraph(item, style='List Bullet')
                elif isinstance(item, dict) and "text" in item:
                    para = doc.add_paragraph(item["text"], style='List Bullet')

        except Exception as e:
            self.logger.warning(f"Error rendering bullet list: {str(e)}")
    def _render_json_heading(self, doc: Document, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
        """Render a JSON heading to DOCX using AI-generated styles."""
        try:
            level = heading_data.get("level", 1)
            text = heading_data.get("text", "")

            if text:
                level = max(1, min(6, level))
                doc.add_heading(text, level=level)

        except Exception as e:
            self.logger.warning(f"Error rendering heading: {str(e)}")
    def _render_json_paragraph(self, doc: Document, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
        """Render a JSON paragraph to DOCX using AI-generated styles."""
        try:
            text = paragraph_data.get("text", "")

            if text:
                para = doc.add_paragraph(text)

        except Exception as e:
            self.logger.warning(f"Error rendering paragraph: {str(e)}")
    def _render_json_code_block(self, doc: Document, code_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
        """Render a JSON code block to DOCX using AI-generated styles."""
        try:
            code = code_data.get("code", "")
            language = code_data.get("language", "")

            if code:
                if language:
                    lang_para = doc.add_paragraph(f"Code ({language}):")
                    lang_para.runs[0].bold = True

                code_para = doc.add_paragraph(code)
                for run in code_para.runs:
                    run.font.name = 'Courier New'
                    run.font.size = Pt(10)

        except Exception as e:
            self.logger.warning(f"Error rendering code block: {str(e)}")
    def _render_json_image(self, doc: Document, image_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
        """Render a JSON image to DOCX."""
        try:
            base64_data = image_data.get("base64Data", "")
            alt_text = image_data.get("altText", "Image")

            if base64_data:
                image_bytes = base64.b64decode(base64_data)
                doc.add_picture(io.BytesIO(image_bytes), width=Inches(4))

                if alt_text:
                    caption_para = doc.add_paragraph(f"Figure: {alt_text}")
                    caption_para.runs[0].italic = True

        except Exception as e:
            self.logger.warning(f"Error rendering image: {str(e)}")
            doc.add_paragraph(f"[Image: {image_data.get('altText', 'Image')}]")
    def _extract_structure_from_prompt(self, user_prompt: str, title: str) -> Dict[str, Any]:
        """Extract document structure from user prompt."""
        structure = {
            'title': title,
            'sections': [],
            'format': 'standard'
        }

        if not user_prompt:
            return structure

        # Extract title from prompt if not provided
        if not title or title == "Generated Document":
            # Look for "create a ... document" or "generate a ... report"
            import re
            title_match = re.search(r'(?:create|generate|make)\s+a\s+([^,]+?)(?:\s+document|\s+report|\s+summary)', user_prompt.lower())
            if title_match:
                structure['title'] = title_match.group(1).strip().title()

        # Extract sections from numbered lists in prompt
        import re
        section_pattern = r'(\d+)\)?\s*([^,]+?)(?:\s*[,:]|\s*$)'
        sections = re.findall(section_pattern, user_prompt)

        for num, section_text in sections:
            structure['sections'].append({
                'number': int(num),
                'title': section_text.strip(),
                'level': 2  # H2 level
            })

        # If no numbered sections found, try to extract from "including:" patterns
        if not structure['sections']:
            including_match = re.search(r'including:\s*(.+?)(?:\.|$)', user_prompt, re.DOTALL)
            if including_match:
                including_text = including_match.group(1)
                # Split by common separators
                parts = re.split(r'[,;]\s*', including_text)
                for i, part in enumerate(parts, 1):
                    part = part.strip()
                    if part:
                        structure['sections'].append({
                            'number': i,
                            'title': part,
                            'level': 2
                        })

        # If still no sections, extract from any list-like patterns
        if not structure['sections']:
            # Look for bullet points or dashes
            bullet_pattern = r'[-•]\s*([^,\n]+?)(?:\s*[,:]|\s*$)'
            bullets = re.findall(bullet_pattern, user_prompt)
            for i, bullet in enumerate(bullets, 1):
                bullet = bullet.strip()
                if bullet and len(bullet) > 3:
                    structure['sections'].append({
                        'number': i,
                        'title': bullet,
                        'level': 2
                    })

        # If still no sections, extract from sentence structure
        if not structure['sections']:
            # Split prompt into sentences and use as sections
            sentences = re.split(r'[.!?]\s+', user_prompt)
            for i, sentence in enumerate(sentences[:5], 1):  # Max 5 sections
                sentence = sentence.strip()
                if sentence and len(sentence) > 10 and not sentence.startswith(('Analyze', 'Create', 'Generate')):
                    structure['sections'].append({
                        'number': i,
                        'title': sentence[:50] + "..." if len(sentence) > 50 else sentence,
                        'level': 2
                    })

        # Final fallback: create sections from prompt keywords
        if not structure['sections']:
            # Extract key action words from prompt
            action_words = ['analyze', 'summarize', 'review', 'assess', 'evaluate', 'examine', 'investigate']
            found_actions = []
            for action in action_words:
                if action in user_prompt.lower():
                    found_actions.append(action.title())

            if found_actions:
                for i, action in enumerate(found_actions[:3], 1):
                    structure['sections'].append({
                        'number': i,
                        'title': f"{action} Document Content",
                        'level': 2
                    })
            else:
                # Last resort: generic but meaningful sections
                structure['sections'] = [
                    {'number': 1, 'title': 'Document Analysis', 'level': 2},
                    {'number': 2, 'title': 'Key Information', 'level': 2},
                    {'number': 3, 'title': 'Summary and Conclusions', 'level': 2}
                ]

        return structure
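    # Example of what the extraction above produces (hypothetical prompt, for
    # illustration only): "Create a project status report including: overview, open risks, next steps"
    # would yield roughly:
    #   {'title': 'Project Status', 'format': 'standard',
    #    'sections': [{'number': 1, 'title': 'overview', 'level': 2},
    #                 {'number': 2, 'title': 'open risks', 'level': 2},
    #                 {'number': 3, 'title': 'next steps', 'level': 2}]}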
    def _generate_content_from_structure(self, doc, content: str, structure: Dict[str, Any]):
        """Generate DOCX content based on extracted structure."""
        # Add sections based on prompt structure
        for section in structure['sections']:
            # Add section heading
            doc.add_heading(f"{section['number']}) {section['title']}", level=section['level'])

            # Add AI-generated content for this section
            # Try to extract relevant content for this section from the AI response
            section_content = self._extract_section_content(content, section['title'])

            if section_content:
                doc.add_paragraph(section_content)
            else:
                # If no specific content found, add a note
                doc.add_paragraph(f"Content for {section['title']} based on document analysis.")

            # Add some spacing
            doc.add_paragraph()

        # Add the complete AI-generated content as additional analysis
        if content and content.strip():
            doc.add_heading("Complete Analysis", level=1)
            doc.add_paragraph(content)
    def _extract_section_content(self, content: str, section_title: str) -> str:
        """Extract relevant content for a specific section from AI response."""
        if not content or not section_title:
            return ""

        # Look for content that matches the section title
        section_keywords = section_title.lower().split()

        # Split content into paragraphs
        paragraphs = content.split('\n\n')

        relevant_paragraphs = []
        for paragraph in paragraphs:
            paragraph_lower = paragraph.lower()
            # Check if paragraph contains keywords from section title
            if any(keyword in paragraph_lower for keyword in section_keywords if len(keyword) > 3):
                relevant_paragraphs.append(paragraph.strip())

        if relevant_paragraphs:
            return '\n\n'.join(relevant_paragraphs[:2])  # Max 2 paragraphs per section

        return ""
    def _setup_document_styles(self, doc):
        """Set up document styles."""
        try:
            # Set default font
            style = doc.styles['Normal']
            font = style.font
            font.name = 'Calibri'
            font.size = Pt(11)

            # Set heading styles
            for i in range(1, 4):
                heading_style = doc.styles[f'Heading {i}']
                heading_font = heading_style.font
                heading_font.name = 'Calibri'
                heading_font.size = Pt(16 - i * 2)
                heading_font.bold = True
        except Exception as e:
            self.logger.warning(f"Could not set up document styles: {str(e)}")
    def _process_section(self, doc, lines: list):
        """Process a section of content into DOCX elements."""
        for line in lines:
            if not line.strip():
                continue

            # Check for tables (lines with |)
            if '|' in line and not line.startswith('|'):
                # This might be part of a table, process as table
                table_data = self._extract_table_data(lines)
                if table_data:
                    self._add_table(doc, table_data)
                    return

            # Check for lists
            if line.startswith('- ') or line.startswith('* '):
                # This is a list item
                doc.add_paragraph(line[2:], style='List Bullet')
            elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')):
                # This is a numbered list item
                doc.add_paragraph(line[3:], style='List Number')
            else:
                # Regular paragraph
                doc.add_paragraph(line)
    def _extract_table_data(self, lines: list) -> list:
        """Extract table data from lines."""
        table_data = []
        in_table = False

        for line in lines:
            if '|' in line:
                if not in_table:
                    in_table = True
                # Split by | and clean up
                cells = [cell.strip() for cell in line.split('|') if cell.strip()]
                if cells:
                    table_data.append(cells)
            elif in_table and not line.strip():
                # Empty line, might be end of table
                break

        return table_data if len(table_data) > 1 else []
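    # Illustrative input/output for the extraction above (hypothetical values):
    #   lines = ["Name | Role", "Alice | Engineer", "Bob | Analyst"]
    # returns
    #   [['Name', 'Role'], ['Alice', 'Engineer'], ['Bob', 'Analyst']]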
    def _add_table(self, doc, table_data: list):
        """Add a table to the document."""
        try:
            if not table_data:
                return

            # Create table
            table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
            table.alignment = WD_TABLE_ALIGNMENT.CENTER

            # Add data to table
            for row_idx, row_data in enumerate(table_data):
                for col_idx, cell_data in enumerate(row_data):
                    if col_idx < len(table.rows[row_idx].cells):
                        table.rows[row_idx].cells[col_idx].text = cell_data

            # Style the table
            self._style_table(table)

        except Exception as e:
            self.logger.warning(f"Could not add table: {str(e)}")
    def _style_table(self, table):
        """Apply styling to the table."""
        try:
            # Style header row
            if len(table.rows) > 0:
                header_cells = table.rows[0].cells
                for cell in header_cells:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.bold = True
        except Exception as e:
            self.logger.warning(f"Could not style table: {str(e)}")
    def _process_table_row(self, doc, line: str):
        """Process a table row and add it to the document."""
        if not line.strip():
            return

        # Split by pipe separator
        parts = [part.strip() for part in line.split('|')]

        if len(parts) >= 2:
            # This is a table row - create a table if it doesn't exist
            if not hasattr(self, '_current_table') or self._current_table is None:
                # Create new table
                self._current_table = doc.add_table(rows=1, cols=len(parts))
                self._current_table.style = 'Table Grid'

                # Add header row
                for i, part in enumerate(parts):
                    if i < len(self._current_table.rows[0].cells):
                        cell = self._current_table.rows[0].cells[i]
                        cell.text = part
                        # Make header bold
                        for paragraph in cell.paragraphs:
                            for run in paragraph.runs:
                                run.bold = True
            else:
                # Add data row to existing table
                row = self._current_table.add_row()
                for i, part in enumerate(parts):
                    if i < len(row.cells):
                        row.cells[i].text = part
        else:
            # Not a table row, treat as regular text
            doc.add_paragraph(line)
    def _clean_ai_content(self, content: str) -> str:
        """Clean AI-generated content by removing debug information and duplicates."""
        if not content:
            return ""

        # Remove debug information
        lines = content.split('\n')
        clean_lines = []

        for line in lines:
            # Skip debug lines and separators
            if (line.startswith('[Skipped ') or
                    line.startswith('=== DOCUMENT:') or
                    line.startswith('---') or
                    line.startswith('FILENAME:') or
                    line.strip() == '' or
                    line.strip() == '---'):
                continue
            clean_lines.append(line)

        # Join lines and remove duplicate content
        clean_content = '\n'.join(clean_lines)

        # Remove duplicate sections by keeping only the first occurrence
        sections = clean_content.split('\n\n')
        seen_sections = set()
        unique_sections = []

        for section in sections:
            section_key = section.strip()[:50]  # Use first 50 chars as key
            if section_key not in seen_sections and section.strip():
                seen_sections.add(section_key)
                unique_sections.append(section)

        return '\n\n'.join(unique_sections)
    def _process_tables(self, doc, content: str) -> str:
        """
        Process tables in the content (both CSV and pipe-separated) and convert them to Word tables.
        Returns the content with tables replaced by placeholders.
        """
        import csv
        import io

        lines = content.split('\n')
        processed_lines = []
        i = 0

        while i < len(lines):
            line = lines[i].strip()

            # Check if this line looks like a table (contains pipes or commas with multiple fields)
            is_pipe_table = '|' in line and len(line.split('|')) >= 2
            is_csv_table = ',' in line and len(line.split(',')) >= 2

            if is_pipe_table or is_csv_table:
                # Collect consecutive table lines
                table_lines = []
                j = i

                # Determine separator and collect lines
                separator = '|' if is_pipe_table else ','
                while j < len(lines):
                    current_line = lines[j].strip()
                    if separator in current_line and len(current_line.split(separator)) >= 2:
                        table_lines.append(current_line)
                        j += 1
                    else:
                        break

                if len(table_lines) >= 2:  # At least header + 1 data row
                    # Create Word table
                    try:
                        if separator == '|':
                            # Process pipe-separated table
                            rows = []
                            for table_line in table_lines:
                                # Split by pipe and clean up
                                cells = [cell.strip() for cell in table_line.split('|')]
                                rows.append(cells)
                        else:
                            # Process CSV table
                            csv_content = '\n'.join(table_lines)
                            csv_reader = csv.reader(io.StringIO(csv_content))
                            rows = list(csv_reader)

                        if rows and len(rows[0]) > 0:
                            # Create Word table
                            table = doc.add_table(rows=len(rows), cols=len(rows[0]))
                            table.style = 'Table Grid'

                            # Populate table
                            for row_idx, row_data in enumerate(rows):
                                for col_idx, cell_data in enumerate(row_data):
                                    if col_idx < len(table.rows[row_idx].cells):
                                        table.rows[row_idx].cells[col_idx].text = cell_data.strip()

                                # Make header row bold
                                if row_idx == 0:
                                    for cell in table.rows[row_idx].cells:
                                        for paragraph in cell.paragraphs:
                                            for run in paragraph.runs:
                                                run.bold = True

                            # Add placeholder to mark where table was inserted
                            processed_lines.append(f"[TABLE_INSERTED_{len(processed_lines)}]")

                            # Skip the table lines
                            i = j
                            continue
                    except Exception as e:
                        # If table parsing fails, treat as regular text
                        pass

            processed_lines.append(line)
            i += 1

        return '\n'.join(processed_lines)
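    # Sketch of the behaviour above (hypothetical content): a block such as
    #   "Metric | Value\nUptime | 99.9%\nErrors | 3"
    # becomes a 3x2 'Table Grid' Word table with a bold header row, and the returned
    # text contains a "[TABLE_INSERTED_n]" placeholder in place of those lines.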
    def _parse_and_format_content(self, doc, content: str, title: str):
        """Parse AI-generated content in standardized format and apply proper DOCX formatting."""
        if not content:
            return

        # Process tables and replace them with placeholders
        content = self._process_tables(doc, content)

        # Parse content line by line in exact sequence
        lines = content.split('\n')
        code_lines = None  # Holds the lines of an open fenced code block, if any

        for line in lines:
            line = line.strip()

            # If a fenced code block is open, collect lines until the closing ``` fence
            if code_lines is not None:
                if line.startswith('```'):
                    # End of code block - emit the collected lines as monospaced text
                    code_text = '\n'.join(code_lines)
                    para = doc.add_paragraph()
                    run = para.add_run(code_text)
                    run.font.name = 'Courier New'
                    code_lines = None
                else:
                    code_lines.append(line)
                continue

            if not line:
                # Empty line - add paragraph break
                doc.add_paragraph()
                continue

            # Skip table placeholders (already processed)
            if line.startswith('[TABLE_INSERTED_'):
                continue

            # Check if this is a Markdown heading (# ## ###)
            if line.startswith('#'):
                level = len(line) - len(line.lstrip('#'))
                heading_text = line.lstrip('# ').strip()
                doc.add_heading(heading_text, level=min(level, 3))

            # Check if this is a numbered heading (1) Title, 2) Title, etc.)
            elif re.match(r'^\d+\)\s+.+', line):
                heading_text = re.sub(r'^\d+\)\s+', '', line)
                doc.add_heading(heading_text, level=1)

            # Check if this is a Markdown list item
            elif line.startswith('- ') or re.match(r'^\d+\.\s+', line):
                bullet_text = re.sub(r'^[-•]\s+|\d+\.\s+', '', line)
                self._add_bullet_point(doc, bullet_text)

            # Check if this is a code block fence
            elif line.startswith('```'):
                # Start of code block - collect following lines until the closing fence
                code_lines = []

            # Regular paragraph
            else:
                self._add_paragraph_to_doc(doc, line)
    def _add_paragraph_to_doc(self, doc, text: str):
        """Add a paragraph to the document with proper formatting."""
        if not text.strip():
            return

        # Check for Markdown formatting (**bold**, *italic*)
        para = doc.add_paragraph()

        # Split by bold markers
        parts = text.split('**')
        for i, part in enumerate(parts):
            if i % 2 == 0:
                # Regular text - check for italic
                italic_parts = part.split('*')
                for j, italic_part in enumerate(italic_parts):
                    if j % 2 == 0:
                        # Regular text
                        if italic_part:
                            para.add_run(italic_part)
                    else:
                        # Italic text
                        if italic_part:
                            run = para.add_run(italic_part)
                            run.italic = True
            else:
                # Bold text
                if part:
                    run = para.add_run(part)
                    run.bold = True
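    # Formatting sketch for the run-splitting above (illustrative input only):
    #   self._add_paragraph_to_doc(doc, "The **total** cost is *estimated*.")
    # produces runs: "The ", "total" (bold), " cost is ", "estimated" (italic), "."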

modules/services/serviceGeneration/renderers/rendererHtml.py (new file, 424 lines)
@@ -0,0 +1,424 @@
"""
|
||||||
|
HTML renderer for report generation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .rendererBaseTemplate import BaseRenderer
|
||||||
|
from typing import Dict, Any, Tuple, List
|
||||||
|
|
||||||
|
class RendererHtml(BaseRenderer):
|
||||||
|
"""Renders content to HTML format with format-specific extraction."""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_supported_formats(cls) -> List[str]:
|
||||||
|
"""Return supported HTML formats."""
|
||||||
|
return ['html', 'htm']
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_format_aliases(cls) -> List[str]:
|
||||||
|
"""Return format aliases."""
|
||||||
|
return ['web', 'webpage']
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_priority(cls) -> int:
|
||||||
|
"""Return priority for HTML renderer."""
|
||||||
|
return 100
|
||||||
|
|
||||||
|
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
|
||||||
|
"""Render extracted JSON content to HTML format using AI-analyzed styling."""
|
||||||
|
try:
|
||||||
|
# Generate HTML using AI-analyzed styling
|
||||||
|
html_content = await self._generate_html_from_json(extracted_content, title, user_prompt, ai_service)
|
||||||
|
|
||||||
|
return html_content, "text/html"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error rendering HTML: {str(e)}")
|
||||||
|
# Return minimal HTML fallback
|
||||||
|
return f"<html><head><title>{title}</title></head><body><h1>{title}</h1><p>Error rendering report: {str(e)}</p></body></html>", "text/html"
|
||||||
|
|
||||||
|
    async def _generate_html_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
        """Generate HTML content from structured JSON document using AI-generated styling."""
        try:
            # Get AI-generated styling definitions
            styles = await self._get_html_styles(user_prompt, ai_service)

            # Validate JSON structure
            if not isinstance(json_content, dict):
                raise ValueError("JSON content must be a dictionary")

            if "sections" not in json_content:
                raise ValueError("JSON content must contain 'sections' field")

            # Use title from JSON metadata if available, otherwise use provided title
            document_title = json_content.get("metadata", {}).get("title", title)

            # Build HTML document
            html_parts = []

            # HTML document structure
            html_parts.append('<!DOCTYPE html>')
            html_parts.append('<html lang="en">')
            html_parts.append('<head>')
            html_parts.append('<meta charset="UTF-8">')
            html_parts.append('<meta name="viewport" content="width=device-width, initial-scale=1.0">')
            html_parts.append(f'<title>{document_title}</title>')
            html_parts.append('<style>')
            html_parts.append(self._generate_css_styles(styles))
            html_parts.append('</style>')
            html_parts.append('</head>')
            html_parts.append('<body>')

            # Document header
            html_parts.append(f'<header><h1 class="document-title">{document_title}</h1></header>')

            # Main content
            html_parts.append('<main>')

            # Process each section
            sections = json_content.get("sections", [])
            for section in sections:
                section_html = self._render_json_section(section, styles)
                if section_html:
                    html_parts.append(section_html)

            html_parts.append('</main>')

            # Footer
            html_parts.append('<footer>')
            html_parts.append(f'<p class="generated-info">Generated: {self._format_timestamp()}</p>')
            html_parts.append('</footer>')

            html_parts.append('</body>')
            html_parts.append('</html>')

            return '\n'.join(html_parts)

        except Exception as e:
            self.logger.error(f"Error generating HTML from JSON: {str(e)}")
            raise Exception(f"HTML generation failed: {str(e)}")
    async def _get_html_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]:
        """Get HTML styling definitions using base template AI styling."""
        style_schema = {
            "title": {"font_size": "2.5em", "color": "#1F4E79", "font_weight": "bold", "text_align": "center", "margin": "0 0 1em 0"},
            "heading1": {"font_size": "2em", "color": "#2F2F2F", "font_weight": "bold", "text_align": "left", "margin": "1.5em 0 0.5em 0"},
            "heading2": {"font_size": "1.5em", "color": "#4F4F4F", "font_weight": "bold", "text_align": "left", "margin": "1em 0 0.5em 0"},
            "paragraph": {"font_size": "1em", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "margin": "0 0 1em 0", "line_height": "1.6"},
            "table": {"border": "1px solid #ddd", "border_collapse": "collapse", "width": "100%", "margin": "1em 0"},
            "table_header": {"background": "#4F4F4F", "color": "#FFFFFF", "font_weight": "bold", "text_align": "center", "padding": "12px"},
            "table_cell": {"background": "#FFFFFF", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "padding": "8px", "border": "1px solid #ddd"},
            "bullet_list": {"font_size": "1em", "color": "#2F2F2F", "margin": "0 0 1em 0", "padding_left": "20px"},
            "code_block": {"font_family": "Courier New, monospace", "font_size": "0.9em", "color": "#2F2F2F", "background": "#F5F5F5", "padding": "1em", "border": "1px solid #ddd", "border_radius": "4px", "margin": "1em 0"},
            "image": {"max_width": "100%", "height": "auto", "margin": "1em 0", "border_radius": "4px"},
            "body": {"font_family": "Arial, sans-serif", "background": "#FFFFFF", "color": "#2F2F2F", "margin": "0", "padding": "20px"}
        }

        style_template = self._create_ai_style_template("html", user_prompt, style_schema)
        styles = await self._get_ai_styles(ai_service, style_template, self._get_default_html_styles())

        # Validate and fix contrast issues
        return self._validate_html_styles_contrast(styles)
    def _validate_html_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
        """Validate and fix contrast issues in AI-generated styles."""
        try:
            # Fix table header contrast
            if "table_header" in styles:
                header = styles["table_header"]
                bg_color = header.get("background", "#FFFFFF")
                text_color = header.get("color", "#000000")

                # If both are white or both are dark, fix it
                if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
                    header["background"] = "#4F4F4F"
                    header["color"] = "#FFFFFF"
                elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
                    header["background"] = "#4F4F4F"
                    header["color"] = "#FFFFFF"

            # Fix table cell contrast
            if "table_cell" in styles:
                cell = styles["table_cell"]
                bg_color = cell.get("background", "#FFFFFF")
                text_color = cell.get("color", "#000000")

                # If both are white or both are dark, fix it
                if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
                    cell["background"] = "#FFFFFF"
                    cell["color"] = "#2F2F2F"
                elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
                    cell["background"] = "#FFFFFF"
                    cell["color"] = "#2F2F2F"

            return styles

        except Exception as e:
            self.logger.warning(f"Style validation failed: {str(e)}")
            return self._get_default_html_styles()
    def _get_default_html_styles(self) -> Dict[str, Any]:
        """Default HTML styles."""
        return {
            "title": {"font_size": "2.5em", "color": "#1F4E79", "font_weight": "bold", "text_align": "center", "margin": "0 0 1em 0"},
            "heading1": {"font_size": "2em", "color": "#2F2F2F", "font_weight": "bold", "text_align": "left", "margin": "1.5em 0 0.5em 0"},
            "heading2": {"font_size": "1.5em", "color": "#4F4F4F", "font_weight": "bold", "text_align": "left", "margin": "1em 0 0.5em 0"},
            "paragraph": {"font_size": "1em", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "margin": "0 0 1em 0", "line_height": "1.6"},
            "table": {"border": "1px solid #ddd", "border_collapse": "collapse", "width": "100%", "margin": "1em 0"},
            "table_header": {"background": "#4F4F4F", "color": "#FFFFFF", "font_weight": "bold", "text_align": "center", "padding": "12px"},
            "table_cell": {"background": "#FFFFFF", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "padding": "8px", "border": "1px solid #ddd"},
            "bullet_list": {"font_size": "1em", "color": "#2F2F2F", "margin": "0 0 1em 0", "padding_left": "20px"},
            "code_block": {"font_family": "Courier New, monospace", "font_size": "0.9em", "color": "#2F2F2F", "background": "#F5F5F5", "padding": "1em", "border": "1px solid #ddd", "border_radius": "4px", "margin": "1em 0"},
            "image": {"max_width": "100%", "height": "auto", "margin": "1em 0", "border_radius": "4px"},
            "body": {"font_family": "Arial, sans-serif", "background": "#FFFFFF", "color": "#2F2F2F", "margin": "0", "padding": "20px"}
        }
    def _generate_css_styles(self, styles: Dict[str, Any]) -> str:
        """Generate CSS from style definitions."""
        css_parts = []

        # Body styles
        body_style = styles.get("body", {})
        css_parts.append("body {")
        for property_name, value in body_style.items():
            css_property = property_name.replace("_", "-")
            css_parts.append(f"  {css_property}: {value};")
        css_parts.append("}")

        # Document title
        title_style = styles.get("title", {})
        css_parts.append(".document-title {")
        for property_name, value in title_style.items():
            css_property = property_name.replace("_", "-")
            css_parts.append(f"  {css_property}: {value};")
        css_parts.append("}")

        # Headings
        for heading_level in ["heading1", "heading2"]:
            heading_style = styles.get(heading_level, {})
            css_class = f"h{heading_level[-1]}"
            css_parts.append(f"{css_class} {{")
            for property_name, value in heading_style.items():
                css_property = property_name.replace("_", "-")
                css_parts.append(f"  {css_property}: {value};")
            css_parts.append("}")

        # Paragraphs
        paragraph_style = styles.get("paragraph", {})
        css_parts.append("p {")
        for property_name, value in paragraph_style.items():
            css_property = property_name.replace("_", "-")
            css_parts.append(f"  {css_property}: {value};")
        css_parts.append("}")

        # Tables
        table_style = styles.get("table", {})
        css_parts.append("table {")
        for property_name, value in table_style.items():
            css_property = property_name.replace("_", "-")
            css_parts.append(f"  {css_property}: {value};")
        css_parts.append("}")

        # Table headers
        table_header_style = styles.get("table_header", {})
        css_parts.append("th {")
        for property_name, value in table_header_style.items():
            css_property = property_name.replace("_", "-")
            css_parts.append(f"  {css_property}: {value};")
        css_parts.append("}")

        # Table cells
        table_cell_style = styles.get("table_cell", {})
        css_parts.append("td {")
        for property_name, value in table_cell_style.items():
            css_property = property_name.replace("_", "-")
            css_parts.append(f"  {css_property}: {value};")
        css_parts.append("}")

        # Lists
        bullet_list_style = styles.get("bullet_list", {})
        css_parts.append("ul {")
        for property_name, value in bullet_list_style.items():
            css_property = property_name.replace("_", "-")
            css_parts.append(f"  {css_property}: {value};")
        css_parts.append("}")

        # Code blocks
        code_block_style = styles.get("code_block", {})
        css_parts.append("pre {")
        for property_name, value in code_block_style.items():
            css_property = property_name.replace("_", "-")
            css_parts.append(f"  {css_property}: {value};")
        css_parts.append("}")

        # Images
        image_style = styles.get("image", {})
        css_parts.append("img {")
        for property_name, value in image_style.items():
            css_property = property_name.replace("_", "-")
            css_parts.append(f"  {css_property}: {value};")
        css_parts.append("}")

        # Generated info
        css_parts.append(".generated-info {")
        css_parts.append("  font-size: 0.9em;")
        css_parts.append("  color: #666;")
        css_parts.append("  text-align: center;")
        css_parts.append("  margin-top: 2em;")
        css_parts.append("  padding-top: 1em;")
        css_parts.append("  border-top: 1px solid #ddd;")
        css_parts.append("}")

        return '\n'.join(css_parts)
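    # Example of the CSS emitted above for one entry (derived from the defaults):
    #   "table_header": {"background": "#4F4F4F", "color": "#FFFFFF", ...}
    # produces roughly
    #   th {
    #     background: #4F4F4F;
    #     color: #FFFFFF;
    #     ...
    #   }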
    def _render_json_section(self, section: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render a single JSON section to HTML using AI-generated styles."""
        try:
            section_type = self._get_section_type(section)
            section_data = self._get_section_data(section)

            if section_type == "table":
                # Process the section data to extract table structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_table(processed_data, styles)
            elif section_type == "bullet_list":
                # Process the section data to extract bullet list structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_bullet_list(processed_data, styles)
            elif section_type == "heading":
                return self._render_json_heading(section_data, styles)
            elif section_type == "paragraph":
                return self._render_json_paragraph(section_data, styles)
            elif section_type == "code_block":
                # Process the section data to extract code block structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_code_block(processed_data, styles)
            elif section_type == "image":
                # Process the section data to extract image structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_image(processed_data, styles)
            else:
                # Fallback to paragraph for unknown types
                return self._render_json_paragraph(section_data, styles)

        except Exception as e:
            self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
            return f'<div class="error">[Error rendering section: {str(e)}]</div>'
    def _render_json_table(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render a JSON table to HTML using AI-generated styles."""
        try:
            headers = table_data.get("headers", [])
            rows = table_data.get("rows", [])

            if not headers or not rows:
                return ""

            html_parts = ['<table>']

            # Table header
            html_parts.append('<thead><tr>')
            for header in headers:
                html_parts.append(f'<th>{header}</th>')
            html_parts.append('</tr></thead>')

            # Table body
            html_parts.append('<tbody>')
            for row in rows:
                html_parts.append('<tr>')
                for cell_data in row:
                    html_parts.append(f'<td>{cell_data}</td>')
                html_parts.append('</tr>')
            html_parts.append('</tbody>')

            html_parts.append('</table>')
            return '\n'.join(html_parts)

        except Exception as e:
            self.logger.warning(f"Error rendering table: {str(e)}")
            return ""
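    # Illustrative output of the table rendering above (hypothetical data), joined with newlines:
    #   {"headers": ["Metric", "Value"], "rows": [["Uptime", "99.9%"]]}
    # produces roughly
    #   <table><thead><tr><th>Metric</th><th>Value</th></tr></thead>
    #   <tbody><tr><td>Uptime</td><td>99.9%</td></tr></tbody></table>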
    def _render_json_bullet_list(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render a JSON bullet list to HTML using AI-generated styles."""
        try:
            items = list_data.get("items", [])

            if not items:
                return ""

            html_parts = ['<ul>']
            for item in items:
                if isinstance(item, str):
                    html_parts.append(f'<li>{item}</li>')
                elif isinstance(item, dict) and "text" in item:
                    html_parts.append(f'<li>{item["text"]}</li>')
            html_parts.append('</ul>')

            return '\n'.join(html_parts)

        except Exception as e:
            self.logger.warning(f"Error rendering bullet list: {str(e)}")
            return ""
    def _render_json_heading(self, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render a JSON heading to HTML using AI-generated styles."""
        try:
            level = heading_data.get("level", 1)
            text = heading_data.get("text", "")

            if text:
                level = max(1, min(6, level))
                return f'<h{level}>{text}</h{level}>'

            return ""

        except Exception as e:
            self.logger.warning(f"Error rendering heading: {str(e)}")
            return ""
    def _render_json_paragraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render a JSON paragraph to HTML using AI-generated styles."""
        try:
            text = paragraph_data.get("text", "")

            if text:
                return f'<p>{text}</p>'

            return ""

        except Exception as e:
            self.logger.warning(f"Error rendering paragraph: {str(e)}")
            return ""
    def _render_json_code_block(self, code_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render a JSON code block to HTML using AI-generated styles."""
        try:
            code = code_data.get("code", "")
            language = code_data.get("language", "")

            if code:
                if language:
                    return f'<pre><code class="language-{language}">{code}</code></pre>'
                else:
                    return f'<pre><code>{code}</code></pre>'

            return ""

        except Exception as e:
            self.logger.warning(f"Error rendering code block: {str(e)}")
            return ""
    def _render_json_image(self, image_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render a JSON image to HTML."""
        try:
            base64_data = image_data.get("base64Data", "")
            alt_text = image_data.get("altText", "Image")

            if base64_data:
                return f'<img src="data:image/png;base64,{base64_data}" alt="{alt_text}">'

            return ""

        except Exception as e:
            self.logger.warning(f"Error rendering image: {str(e)}")
            return f'<div class="error">[Image: {image_data.get("altText", "Image")}]</div>'

modules/services/serviceGeneration/renderers/rendererImage.py (new file, 281 lines)
@@ -0,0 +1,281 @@
"""
|
||||||
|
Image renderer for report generation using AI image generation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .rendererBaseTemplate import BaseRenderer
|
||||||
|
from typing import Dict, Any, Tuple, List
|
||||||
|
import base64
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class RendererImage(BaseRenderer):
|
||||||
|
"""Renders content to image format using AI image generation."""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_supported_formats(cls) -> List[str]:
|
||||||
|
"""Return supported image formats."""
|
||||||
|
return ['png', 'jpg', 'jpeg', 'image']
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_format_aliases(cls) -> List[str]:
|
||||||
|
"""Return format aliases."""
|
||||||
|
return ['img', 'picture', 'photo', 'graphic']
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_priority(cls) -> int:
|
||||||
|
"""Return priority for image renderer."""
|
||||||
|
return 90
|
||||||
|
|
||||||
|
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
|
||||||
|
"""Render extracted JSON content to image format using AI image generation."""
|
||||||
|
try:
|
||||||
|
# Generate AI image from content
|
||||||
|
image_content = await self._generate_ai_image(extracted_content, title, user_prompt, ai_service)
|
||||||
|
|
||||||
|
return image_content, "image/png"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error rendering image: {str(e)}")
|
||||||
|
# Re-raise the exception instead of using fallback
|
||||||
|
raise Exception(f"Image rendering failed: {str(e)}")
|
||||||
|
|
||||||
|
    async def _generate_ai_image(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
        """Generate AI image from extracted content."""
        try:
            if not ai_service:
                raise ValueError("AI service is required for image generation")

            # Validate JSON structure
            if not isinstance(extracted_content, dict):
                raise ValueError("Extracted content must be a dictionary")

            if "sections" not in extracted_content:
                raise ValueError("Extracted content must contain 'sections' field")

            # Use title from JSON metadata if available, otherwise use provided title
            document_title = extracted_content.get("metadata", {}).get("title", title)

            # Create AI prompt for image generation
            image_prompt = await self._create_image_generation_prompt(extracted_content, document_title, user_prompt, ai_service)

            # Generate image using AI
            image_result = await ai_service.aiObjects.generateImage(
                prompt=image_prompt,
                size="1024x1024",
                quality="standard",
                style="vivid"
            )

            # Extract base64 image data from result
            if image_result and image_result.get("success", False):
                image_data = image_result.get("image_data", "")
                if image_data:
                    return image_data
                else:
                    raise ValueError("No image data returned from AI")
            else:
                error_msg = image_result.get("error", "Unknown error") if image_result else "No result"
                raise ValueError(f"AI image generation failed: {error_msg}")

        except Exception as e:
            self.logger.error(f"Error generating AI image: {str(e)}")
            raise Exception(f"AI image generation failed: {str(e)}")
    async def _create_image_generation_prompt(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
        """Create a detailed prompt for AI image generation based on the content."""
        try:
            # Start with base prompt
            prompt_parts = []

            # Add user's original intent if available
            if user_prompt:
                prompt_parts.append(f"User Request: {user_prompt}")

            # Add document title
            prompt_parts.append(f"Document Title: {title}")

            # Analyze content and create visual description
            sections = extracted_content.get("sections", [])
            content_description = self._analyze_content_for_visual_description(sections)

            if content_description:
                prompt_parts.append(f"Content to Visualize: {content_description}")

            # Add style guidance
            style_guidance = self._get_style_guidance_from_content(extracted_content, user_prompt)
            if style_guidance:
                prompt_parts.append(f"Visual Style: {style_guidance}")

            # Combine all parts
            full_prompt = "Create a professional, informative image that visualizes the following content:\n\n" + "\n\n".join(prompt_parts)

            # Add technical requirements
            full_prompt += "\n\nTechnical Requirements:"
            full_prompt += "\n- High quality, professional appearance"
            full_prompt += "\n- Clear, readable text if any text is included"
            full_prompt += "\n- Appropriate colors and layout"
            full_prompt += "\n- Suitable for business/professional use"

            # Truncate prompt if it exceeds DALL-E's 4000 character limit
            if len(full_prompt) > 4000:
                # Use AI to compress the prompt intelligently
                compressed_prompt = await self._compress_prompt_with_ai(full_prompt, ai_service)
                if compressed_prompt and len(compressed_prompt) <= 4000:
                    return compressed_prompt

                # Fallback to minimal prompt if AI compression fails or is still too long
                minimal_prompt = f"Create a professional image representing: {title}"
                if user_prompt:
                    minimal_prompt += f" - {user_prompt}"

                # If even the minimal prompt is too long, truncate it
                if len(minimal_prompt) > 4000:
                    minimal_prompt = minimal_prompt[:3997] + "..."

                return minimal_prompt

            return full_prompt

        except Exception as e:
            self.logger.warning(f"Error creating image prompt: {str(e)}")
            # Fallback to simple prompt
            return f"Create a professional image representing: {title}"
    async def _compress_prompt_with_ai(self, long_prompt: str, ai_service=None) -> str:
        """Use AI to intelligently compress a long prompt while preserving key information."""
        try:
            if not ai_service:
                return None

            compression_prompt = f"""
You are an expert at creating concise, effective prompts for AI image generation.

The following prompt is too long for DALL-E (4000 character limit) and needs to be compressed to under 4000 characters while preserving the most important visual information.

Original prompt ({len(long_prompt)} characters):
{long_prompt}

Please create a compressed version that:
1. Keeps the most important visual elements and requirements
2. Maintains the core intent and style guidance
3. Preserves technical requirements
4. Stays under 4000 characters
5. Is optimized for DALL-E image generation

Return only the compressed prompt, no explanations.
"""

            # Use AI to compress the prompt - call the AI service correctly
            # The ai_service has an aiObjects attribute that contains the actual AI interface
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

            request = AiCallRequest(
                prompt=compression_prompt,
                options=AiCallOptions(
                    operationType=OperationType.GENERAL,
                    maxTokens=2000,
                    temperature=0.3  # Lower temperature for more consistent compression
                )
            )

            response = await ai_service.aiObjects.call(request)
            compressed = response.content.strip()

            # Validate the compressed prompt
            if compressed and len(compressed) <= 4000 and len(compressed) > 50:
                self.logger.info(f"Successfully compressed prompt from {len(long_prompt)} to {len(compressed)} characters")
                return compressed
            else:
                self.logger.warning(f"AI compression failed or produced invalid result: {len(compressed) if compressed else 0} chars")
                return None

        except Exception as e:
            self.logger.warning(f"Error compressing prompt with AI: {str(e)}")
            return None
    def _analyze_content_for_visual_description(self, sections: List[Dict[str, Any]]) -> str:
        """Analyze content sections and create a visual description for AI."""
        try:
            descriptions = []

            for section in sections:
                section_type = self._get_section_type(section)
                section_data = self._get_section_data(section)

                if section_type == "table":
                    headers = section_data.get("headers", [])
                    rows = section_data.get("rows", [])
                    if headers and rows:
                        descriptions.append(f"Data table with {len(headers)} columns and {len(rows)} rows: {', '.join(headers)}")

                elif section_type == "bullet_list":
                    items = section_data.get("items", [])
                    if items:
                        descriptions.append(f"List with {len(items)} items")

                elif section_type == "heading":
                    text = section_data.get("text", "")
                    level = section_data.get("level", 1)
                    if text:
                        descriptions.append(f"Heading {level}: {text}")

                elif section_type == "paragraph":
                    text = section_data.get("text", "")
                    if text and len(text) > 10:  # Only include substantial paragraphs
                        # Truncate long text
                        truncated = text[:100] + "..." if len(text) > 100 else text
                        descriptions.append(f"Text content: {truncated}")

                elif section_type == "code_block":
                    code = section_data.get("code", "")
                    language = section_data.get("language", "")
                    if code:
                        descriptions.append(f"Code block ({language}): {code[:50]}...")

            return "; ".join(descriptions) if descriptions else "General document content"

        except Exception as e:
            self.logger.warning(f"Error analyzing content: {str(e)}")
            return "Document content"
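    # Illustrative result of the analysis above (hypothetical sections): a table with
    # headers ["Quarter", "Revenue"] and four rows plus a heading "Summary" would be
    # described as
    #   "Data table with 2 columns and 4 rows: Quarter, Revenue; Heading 1: Summary"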
    def _get_style_guidance_from_content(self, extracted_content: Dict[str, Any], user_prompt: str = None) -> str:
        """Determine visual style guidance based on content and user prompt."""
        try:
            style_elements = []

            # Analyze user prompt for style hints
            if user_prompt:
                prompt_lower = user_prompt.lower()

                if any(word in prompt_lower for word in ["modern", "contemporary", "sleek"]):
                    style_elements.append("modern, clean design")
                elif any(word in prompt_lower for word in ["classic", "traditional", "formal"]):
                    style_elements.append("classic, formal design")
                elif any(word in prompt_lower for word in ["creative", "artistic", "colorful"]):
                    style_elements.append("creative, artistic design")
                elif any(word in prompt_lower for word in ["corporate", "business", "professional"]):
                    style_elements.append("corporate, professional design")

            # Analyze content type for additional style hints
            sections = extracted_content.get("sections", [])
            has_tables = any(self._get_section_type(s) == "table" for s in sections)
            has_lists = any(self._get_section_type(s) == "bullet_list" for s in sections)
            has_code = any(self._get_section_type(s) == "code_block" for s in sections)

            if has_tables:
                style_elements.append("data-focused layout")
            if has_lists:
                style_elements.append("organized, structured presentation")
            if has_code:
                style_elements.append("technical, developer-friendly")

            # Default style if no specific guidance
            if not style_elements:
                style_elements.append("professional, clean design")

            return ", ".join(style_elements)

        except Exception as e:
            self.logger.warning(f"Error determining style guidance: {str(e)}")
            return "professional design"

modules/services/serviceGeneration/renderers/rendererJson.py (new file, 79 lines)
@@ -0,0 +1,79 @@
"""
|
||||||
|
JSON renderer for report generation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .rendererBaseTemplate import BaseRenderer
|
||||||
|
from typing import Dict, Any, Tuple, List
|
||||||
|
import json
|
||||||
|
|
||||||
|
class RendererJson(BaseRenderer):
|
||||||
|
"""Renders content to JSON format with format-specific extraction."""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_supported_formats(cls) -> List[str]:
|
||||||
|
"""Return supported JSON formats."""
|
||||||
|
return ['json']
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_format_aliases(cls) -> List[str]:
|
||||||
|
"""Return format aliases."""
|
||||||
|
return ['data']
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_priority(cls) -> int:
|
||||||
|
"""Return priority for JSON renderer."""
|
||||||
|
return 80
|
||||||
|
|
||||||
|
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
|
||||||
|
"""Render extracted JSON content to JSON format."""
|
||||||
|
try:
|
||||||
|
# The extracted content should already be JSON from the AI
|
||||||
|
# Just validate and format it
|
||||||
|
json_content = self._clean_json_content(extracted_content, title)
|
||||||
|
|
||||||
|
return json_content, "application/json"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error rendering JSON: {str(e)}")
|
||||||
|
# Return minimal JSON fallback
|
||||||
|
fallback_data = {
|
||||||
|
"title": title,
|
||||||
|
"sections": [{"content_type": "paragraph", "elements": [{"text": f"Error rendering report: {str(e)}"}]}],
|
||||||
|
"metadata": {"error": str(e)}
|
||||||
|
}
|
||||||
|
return json.dumps(fallback_data, indent=2), "application/json"
|
||||||
|
|
||||||
|
def _clean_json_content(self, content: Dict[str, Any], title: str) -> str:
|
||||||
|
"""Clean and validate JSON content from AI."""
|
||||||
|
try:
|
||||||
|
# Validate JSON structure
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
raise ValueError("Content must be a dictionary")
|
||||||
|
|
||||||
|
# Ensure it has the expected structure
|
||||||
|
if "sections" not in content:
|
||||||
|
# Convert old format to new format
|
||||||
|
content = {
|
||||||
|
"sections": [{"content_type": "paragraph", "elements": [{"text": str(content)}]}],
|
||||||
|
"metadata": {"title": title}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Ensure metadata exists
|
||||||
|
if "metadata" not in content:
|
||||||
|
content["metadata"] = {}
|
||||||
|
|
||||||
|
# Set title in metadata if not present
|
||||||
|
if "title" not in content["metadata"]:
|
||||||
|
content["metadata"]["title"] = title
|
||||||
|
|
||||||
|
# Re-format with proper indentation
|
||||||
|
return json.dumps(content, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error cleaning JSON content: {str(e)}")
|
||||||
|
# Return minimal valid JSON
|
||||||
|
fallback_data = {
|
||||||
|
"sections": [{"content_type": "paragraph", "elements": [{"text": str(content)}]}],
|
||||||
|
"metadata": {"title": title, "error": str(e)}
|
||||||
|
}
|
||||||
|
return json.dumps(fallback_data, indent=2, ensure_ascii=False)
|
||||||
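    # Illustrative sketch (not part of the original diff): the shape every document
    # is normalized to by _clean_json_content before serializing. Field names mirror
    # the fallback paths above; the element shapes are assumptions for illustration.
    #
    #   example = {
    #       "sections": [
    #           {"content_type": "heading", "elements": [{"level": 1, "text": "Overview"}]},
    #           {"content_type": "paragraph", "elements": [{"text": "Body text..."}]}
    #       ]
    #   }
    #   RendererJson()._clean_json_content(example, "Example Report")
    #   # -> pretty-printed JSON string with "metadata" and "metadata.title" guaranteed to exist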
221 modules/services/serviceGeneration/renderers/rendererMarkdown.py Normal file
@@ -0,0 +1,221 @@
"""
Markdown renderer for report generation.
"""

from .rendererBaseTemplate import BaseRenderer
from typing import Dict, Any, Tuple, List


class RendererMarkdown(BaseRenderer):
    """Renders content to Markdown format with format-specific extraction."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported Markdown formats."""
        return ['md', 'markdown']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['mdown', 'mkd']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for markdown renderer."""
        return 95

    async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
        """Render extracted JSON content to Markdown format."""
        try:
            # Generate markdown from JSON structure
            markdown_content = self._generate_markdown_from_json(extracted_content, title)

            return markdown_content, "text/markdown"

        except Exception as e:
            self.logger.error(f"Error rendering markdown: {str(e)}")
            # Return minimal markdown fallback
            return f"# {title}\n\nError rendering report: {str(e)}", "text/markdown"

    def _generate_markdown_from_json(self, json_content: Dict[str, Any], title: str) -> str:
        """Generate markdown content from structured JSON document."""
        try:
            # Validate JSON structure
            if not isinstance(json_content, dict):
                raise ValueError("JSON content must be a dictionary")

            if "sections" not in json_content:
                raise ValueError("JSON content must contain 'sections' field")

            # Use title from JSON metadata if available, otherwise use provided title
            document_title = json_content.get("metadata", {}).get("title", title)

            # Build markdown content
            markdown_parts = []

            # Document title
            markdown_parts.append(f"# {document_title}")
            markdown_parts.append("")

            # Process each section
            sections = json_content.get("sections", [])
            for section in sections:
                section_markdown = self._render_json_section(section)
                if section_markdown:
                    markdown_parts.append(section_markdown)
                    markdown_parts.append("")  # Add spacing between sections

            # Add generation info
            markdown_parts.append("---")
            markdown_parts.append(f"*Generated: {self._format_timestamp()}*")

            return '\n'.join(markdown_parts)

        except Exception as e:
            self.logger.error(f"Error generating markdown from JSON: {str(e)}")
            raise Exception(f"Markdown generation failed: {str(e)}")
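    # Illustrative sketch (not part of the original diff): feeding a minimal
    # structured document through _generate_markdown_from_json. The exact element
    # shapes depend on the base-template helpers and are assumptions here.
    #
    #   doc = {
    #       "metadata": {"title": "Sprint Summary"},
    #       "sections": [
    #           {"content_type": "heading", "elements": [{"level": 2, "text": "Highlights"}]},
    #           {"content_type": "bullet_list", "elements": [{"items": ["Shipped search", "Fixed login bug"]}]}
    #       ]
    #   }
    #   RendererMarkdown()._generate_markdown_from_json(doc, "Sprint Summary")
    #   # -> markdown starting with "# Sprint Summary", an "## Highlights" heading,
    #   #    a two-item bullet list, and a trailing "*Generated: ...*" footer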
    def _render_json_section(self, section: Dict[str, Any]) -> str:
        """Render a single JSON section to markdown."""
        try:
            section_type = self._get_section_type(section)
            section_data = self._get_section_data(section)

            if section_type == "table":
                # Process the section data to extract table structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_table(processed_data)
            elif section_type == "bullet_list":
                # Process the section data to extract bullet list structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_bullet_list(processed_data)
            elif section_type == "heading":
                return self._render_json_heading(section_data)
            elif section_type == "paragraph":
                return self._render_json_paragraph(section_data)
            elif section_type == "code_block":
                # Process the section data to extract code block structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_code_block(processed_data)
            elif section_type == "image":
                # Process the section data to extract image structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_image(processed_data)
            else:
                # Fallback to paragraph for unknown types
                return self._render_json_paragraph(section_data)

        except Exception as e:
            self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
            return f"*[Error rendering section: {str(e)}]*"

    def _render_json_table(self, table_data: Dict[str, Any]) -> str:
        """Render a JSON table to markdown."""
        try:
            headers = table_data.get("headers", [])
            rows = table_data.get("rows", [])

            if not headers or not rows:
                return ""

            markdown_parts = []

            # Create table header
            header_line = " | ".join(str(header) for header in headers)
            markdown_parts.append(header_line)

            # Add separator line
            separator_line = " | ".join("---" for _ in headers)
            markdown_parts.append(separator_line)

            # Add data rows
            for row in rows:
                row_line = " | ".join(str(cell_data) for cell_data in row)
                markdown_parts.append(row_line)

            return '\n'.join(markdown_parts)

        except Exception as e:
            self.logger.warning(f"Error rendering table: {str(e)}")
            return ""
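    # Illustrative sketch (not part of the original diff): what _render_json_table
    # produces for a small table. Rows are pipe-delimited without leading or
    # trailing pipes, which most Markdown renderers accept.
    #
    #   RendererMarkdown()._render_json_table({
    #       "headers": ["Service", "Status"],
    #       "rows": [["API", "up"], ["Worker", "degraded"]]
    #   })
    #   # Service | Status
    #   # --- | ---
    #   # API | up
    #   # Worker | degraded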
    def _render_json_bullet_list(self, list_data: Dict[str, Any]) -> str:
        """Render a JSON bullet list to markdown."""
        try:
            items = list_data.get("items", [])

            if not items:
                return ""

            markdown_parts = []
            for item in items:
                if isinstance(item, str):
                    markdown_parts.append(f"- {item}")
                elif isinstance(item, dict) and "text" in item:
                    markdown_parts.append(f"- {item['text']}")

            return '\n'.join(markdown_parts)

        except Exception as e:
            self.logger.warning(f"Error rendering bullet list: {str(e)}")
            return ""

    def _render_json_heading(self, heading_data: Dict[str, Any]) -> str:
        """Render a JSON heading to markdown."""
        try:
            level = heading_data.get("level", 1)
            text = heading_data.get("text", "")

            if text:
                level = max(1, min(6, level))
                return f"{'#' * level} {text}"

            return ""

        except Exception as e:
            self.logger.warning(f"Error rendering heading: {str(e)}")
            return ""

    def _render_json_paragraph(self, paragraph_data: Dict[str, Any]) -> str:
        """Render a JSON paragraph to markdown."""
        try:
            text = paragraph_data.get("text", "")
            return text if text else ""

        except Exception as e:
            self.logger.warning(f"Error rendering paragraph: {str(e)}")
            return ""

    def _render_json_code_block(self, code_data: Dict[str, Any]) -> str:
        """Render a JSON code block to markdown."""
        try:
            code = code_data.get("code", "")
            language = code_data.get("language", "")

            if code:
                if language:
                    return f"```{language}\n{code}\n```"
                else:
                    return f"```\n{code}\n```"

            return ""

        except Exception as e:
            self.logger.warning(f"Error rendering code block: {str(e)}")
            return ""

    def _render_json_image(self, image_data: Dict[str, Any]) -> str:
        """Render a JSON image to markdown."""
        try:
            alt_text = image_data.get("altText", "Image")
            base64_data = image_data.get("base64Data", "")

            if base64_data:
                # For base64 images, we can't embed them directly in markdown
                # So we'll use a placeholder with the alt text
                return f"*[Image: {alt_text}]*"
            else:
                return f"*[Image: {alt_text} - no data]*"

        except Exception as e:
            self.logger.warning(f"Error rendering image: {str(e)}")
            return f"*[Image: {image_data.get('altText', 'Image')}]*"
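    # Illustrative sketch (not part of the original diff): the code-block and
    # heading helpers above produce standard fenced blocks and clamped heading
    # levels, for example:
    #
    #   RendererMarkdown()._render_json_code_block({"code": "print('hi')", "language": "python"})
    #   # ```python
    #   # print('hi')
    #   # ```
    #
    #   RendererMarkdown()._render_json_heading({"level": 9, "text": "Appendix"})
    #   # '###### Appendix'   (level is clamped to the 1-6 range)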
642 modules/services/serviceGeneration/renderers/rendererPdf.py Normal file
@@ -0,0 +1,642 @@
"""
PDF renderer for report generation using reportlab.
"""

from .rendererBaseTemplate import BaseRenderer
from typing import Dict, Any, Tuple, List
import io
import base64
from datetime import datetime, UTC

try:
    from reportlab.lib.pagesizes import letter, A4
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
    from reportlab.lib.units import inch
    from reportlab.lib import colors
    from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT, TA_JUSTIFY
    REPORTLAB_AVAILABLE = True
except ImportError:
    REPORTLAB_AVAILABLE = False


class RendererPdf(BaseRenderer):
    """Renders content to PDF format using reportlab."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported PDF formats."""
        return ['pdf']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['document', 'print']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for PDF renderer."""
        return 120

    async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
        """Render extracted JSON content to PDF format using AI-analyzed styling."""
        try:
            if not REPORTLAB_AVAILABLE:
                # Fallback to HTML if reportlab is not available
                from .rendererHtml import RendererHtml
                html_renderer = RendererHtml()
                html_content, _ = await html_renderer.render(extracted_content, title, user_prompt, ai_service)
                return html_content, "text/html"

            # Generate PDF using AI-analyzed styling
            pdf_content = await self._generate_pdf_from_json(extracted_content, title, user_prompt, ai_service)

            return pdf_content, "application/pdf"

        except Exception as e:
            self.logger.error(f"Error rendering PDF: {str(e)}")
            # Return minimal fallback
            return f"PDF Generation Error: {str(e)}", "text/plain"
    async def _generate_pdf_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
        """Generate PDF content from structured JSON document using AI-generated styling."""
        try:
            # Get AI-generated styling definitions
            styles = await self._get_pdf_styles(user_prompt, ai_service)

            # Validate JSON structure
            if not isinstance(json_content, dict):
                raise ValueError("JSON content must be a dictionary")

            if "sections" not in json_content:
                raise ValueError("JSON content must contain 'sections' field")

            # Use title from JSON metadata if available, otherwise use provided title
            document_title = json_content.get("metadata", {}).get("title", title)

            # Shorten overly long titles to prevent wrapping/overlapping
            if len(document_title) > 40:
                document_title = document_title[:37].rstrip() + "..."

            # Create a buffer to hold the PDF
            buffer = io.BytesIO()

            # Create PDF document
            doc = SimpleDocTemplate(
                buffer,
                pagesize=A4,
                rightMargin=72,
                leftMargin=72,
                topMargin=72,
                bottomMargin=18
            )

            # Build PDF content
            story = []

            # Title page
            title_style = self._create_title_style(styles)
            story.append(Paragraph(document_title, title_style))
            story.append(Spacer(1, 50))  # Increased spacing to prevent overlap
            story.append(Paragraph(f"Generated: {self._format_timestamp()}", self._create_normal_style(styles)))
            story.append(Spacer(1, 30))  # Add spacing before page break
            story.append(PageBreak())

            # Process each section
            sections = json_content.get("sections", [])
            self.services.utils.debugLogToFile(f"PDF SECTIONS TO PROCESS: {len(sections)} sections", "PDF_RENDERER")
            for i, section in enumerate(sections):
                self.services.utils.debugLogToFile(f"PDF SECTION {i}: content_type={section.get('content_type', 'unknown')}, id={section.get('id', 'unknown')}", "PDF_RENDERER")
                section_elements = self._render_json_section(section, styles)
                self.services.utils.debugLogToFile(f"PDF SECTION {i} ELEMENTS: {len(section_elements)} elements", "PDF_RENDERER")
                story.extend(section_elements)

            # Build PDF
            doc.build(story)

            # Get PDF content as base64
            buffer.seek(0)
            pdf_bytes = buffer.getvalue()
            pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')

            return pdf_base64

        except Exception as e:
            self.logger.error(f"Error generating PDF from JSON: {str(e)}")
            raise Exception(f"PDF generation failed: {str(e)}")
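    # Standalone sketch (not part of the original diff) of the reportlab flow the
    # method above relies on: flowables go into a story, the story is built into an
    # in-memory buffer, and the resulting bytes are base64-encoded for transport.
    #
    #   import io, base64
    #   from reportlab.lib.pagesizes import A4
    #   from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
    #   from reportlab.lib.styles import getSampleStyleSheet
    #
    #   buf = io.BytesIO()
    #   doc = SimpleDocTemplate(buf, pagesize=A4)
    #   sheet = getSampleStyleSheet()
    #   story = [Paragraph("Example Report", sheet["Title"]),
    #            Spacer(1, 24),
    #            Paragraph("Generated by the PDF renderer sketch.", sheet["Normal"])]
    #   doc.build(story)
    #   pdf_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")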
    async def _get_pdf_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]:
        """Get PDF styling definitions using base template AI styling."""
        style_schema = {
            "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center", "space_after": 30},
            "heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 12, "space_before": 12},
            "heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 8, "space_before": 8},
            "paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left", "space_after": 6, "line_height": 1.2},
            "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center", "font_size": 12},
            "table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left", "font_size": 10},
            "bullet_list": {"font_size": 11, "color": "#2F2F2F", "space_after": 3},
            "code_block": {"font": "Courier", "font_size": 9, "color": "#2F2F2F", "background": "#F5F5F5", "space_after": 6}
        }

        style_template = self._create_ai_style_template("pdf", user_prompt, style_schema)

        # Use base template method like DOCX does (this works!)
        styles = await self._get_ai_styles(ai_service, style_template, self._get_default_pdf_styles())

        if styles is None:
            return self._get_default_pdf_styles()

        # Convert colors to PDF format after getting styles
        styles = self._convert_colors_format(styles)

        # Validate and fix contrast issues
        return self._validate_pdf_styles_contrast(styles)
    async def _get_ai_styles_with_pdf_colors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
        """Get AI styles with proper PDF color conversion."""
        if not ai_service:
            return default_styles

        try:
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL

            request = AiCallRequest(prompt=style_template, context="", options=request_options)

            # Check if AI service is properly configured
            if not hasattr(ai_service, 'aiObjects') or not ai_service.aiObjects:
                self.logger.warning("AI service not properly configured, using defaults")
                return default_styles

            response = await ai_service.aiObjects.call(request)

            # Check if response is valid
            if not response:
                self.logger.warning("AI service returned no response, using defaults")
                return default_styles

            import json
            import re

            # Clean and parse JSON
            result = response.content.strip() if response and response.content else ""

            # Check if result is empty
            if not result:
                self.logger.warning("AI styling returned empty response, using defaults")
                return default_styles

            # Log the raw response for debugging
            self.logger.debug(f"AI styling raw response: {result[:200]}...")

            # Extract JSON from various formats
            json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
            if json_match:
                result = json_match.group(1).strip()
            elif result.startswith('```json'):
                result = re.sub(r'^```json\s*', '', result)
                result = re.sub(r'\s*```$', '', result)
            elif result.startswith('```'):
                result = re.sub(r'^```\s*', '', result)
                result = re.sub(r'\s*```$', '', result)

            # Try to extract JSON from explanatory text
            json_patterns = [
                r'\{[^{}]*"title"[^{}]*\}',  # Simple JSON object
                r'\{.*?"title".*?\}',  # JSON with title field
                r'\{.*?"font_size".*?\}',  # JSON with font_size field
            ]

            for pattern in json_patterns:
                json_match = re.search(pattern, result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)
                    break

            # Additional cleanup - remove any leading/trailing whitespace and newlines
            result = result.strip()

            # Check if result is still empty after cleanup
            if not result:
                self.logger.warning("AI styling returned empty content after cleanup, using defaults")
                return default_styles

            # Try to parse JSON
            try:
                styles = json.loads(result)
                self.logger.debug(f"Successfully parsed AI styles: {list(styles.keys())}")
            except json.JSONDecodeError as json_error:
                self.logger.warning(f"AI styling returned invalid JSON: {json_error}")

                # Write the full response to the debug log file to avoid log truncation
                self.services.utils.debugLogToFile(f"FULL AI RESPONSE THAT FAILED TO PARSE: {result}", "PDF_RENDERER")
                self.services.utils.debugLogToFile(f"RESPONSE LENGTH: {len(result)} characters", "PDF_RENDERER")

                self.logger.warning(f"Raw content that failed to parse: {result}")

                # Try to fix incomplete JSON by adding missing closing braces
                open_braces = result.count('{')
                close_braces = result.count('}')

                if open_braces > close_braces:
                    # JSON is incomplete, add missing closing braces
                    missing_braces = open_braces - close_braces
                    result = result + '}' * missing_braces
                    self.logger.info(f"Added {missing_braces} missing closing brace(s)")

                    # Try parsing the fixed JSON
                    try:
                        styles = json.loads(result)
                        self.logger.info("Successfully fixed incomplete JSON")
                    except json.JSONDecodeError as fix_error:
                        self.logger.warning(f"Fixed JSON still invalid: {fix_error}")
                        # Try to extract just the JSON part if it's embedded in text
                        json_start = result.find('{')
                        json_end = result.rfind('}')
                        if json_start != -1 and json_end != -1 and json_end > json_start:
                            json_part = result[json_start:json_end+1]
                            try:
                                styles = json.loads(json_part)
                                self.logger.info("Successfully extracted JSON from explanatory text")
                            except json.JSONDecodeError:
                                self.logger.warning("Could not extract valid JSON from response, using defaults")
                                return default_styles
                        else:
                            return default_styles
                else:
                    # Try to extract just the JSON part if it's embedded in text
                    json_start = result.find('{')
                    json_end = result.rfind('}')
                    if json_start != -1 and json_end != -1 and json_end > json_start:
                        json_part = result[json_start:json_end+1]
                        try:
                            styles = json.loads(json_part)
                            self.logger.info("Successfully extracted JSON from explanatory text")
                        except json.JSONDecodeError:
                            self.logger.warning("Could not extract valid JSON from response, using defaults")
                            return default_styles
                    else:
                        return default_styles

            # Convert colors to PDF format (keep as hex strings, PDF renderer will convert them)
            styles = self._convert_colors_format(styles)

            return styles

        except Exception as e:
            self.logger.warning(f"AI styling failed: {str(e)}, using defaults")
            return default_styles
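    # Standalone sketch (not part of the original diff) of the recovery strategy
    # used above: strip a ```json fence, then balance any missing closing braces
    # before handing the text to json.loads.
    #
    #   import json, re
    #
    #   raw = '```json\n{"title": {"font_size": 24, "color": "#1F4E79"}\n```'
    #   match = re.search(r'```json\s*\n(.*?)\n```', raw, re.DOTALL)
    #   candidate = match.group(1).strip() if match else raw.strip()
    #   candidate += '}' * (candidate.count('{') - candidate.count('}'))
    #   json.loads(candidate)
    #   # -> {'title': {'font_size': 24, 'color': '#1F4E79'}}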
    def _convert_colors_format(self, styles: Dict[str, Any]) -> Dict[str, Any]:
        """Convert colors to proper format for PDF compatibility."""
        try:
            for style_name, style_config in styles.items():
                if isinstance(style_config, dict):
                    for prop, value in style_config.items():
                        if isinstance(value, str) and value.startswith('#') and len(value) == 7:
                            # Convert #RRGGBB to #AARRGGBB (add FF alpha channel) for consistency
                            styles[style_name][prop] = f"FF{value[1:]}"
                        elif isinstance(value, str) and value.startswith('#') and len(value) == 9:
                            # Already aRGB format, keep as is
                            pass
            return styles
        except Exception as e:
            self.logger.warning(f"Color conversion failed: {str(e)}")
            return styles

    def _get_safe_color(self, color_value: str, default: str = "#000000") -> str:
        """Get a safe hex color value for PDF."""
        if isinstance(color_value, str) and color_value.startswith('#'):
            if len(color_value) == 7:
                return f"FF{color_value[1:]}"
            elif len(color_value) == 9:
                return color_value
        return default
    def _validate_pdf_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
        """Validate and fix contrast issues in AI-generated styles."""
        try:
            # Fix table header contrast
            if "table_header" in styles:
                header = styles["table_header"]
                bg_color = header.get("background", "#FFFFFF")
                text_color = header.get("text_color", "#000000")

                # If both are white or both are dark, fix it
                if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
                    header["background"] = "#4F4F4F"
                    header["text_color"] = "#FFFFFF"
                elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
                    header["background"] = "#4F4F4F"
                    header["text_color"] = "#FFFFFF"

            # Fix table cell contrast
            if "table_cell" in styles:
                cell = styles["table_cell"]
                bg_color = cell.get("background", "#FFFFFF")
                text_color = cell.get("text_color", "#000000")

                # If both are white or both are dark, fix it
                if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
                    cell["background"] = "#FFFFFF"
                    cell["text_color"] = "#2F2F2F"
                elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
                    cell["background"] = "#FFFFFF"
                    cell["text_color"] = "#2F2F2F"

            return styles

        except Exception as e:
            self.logger.warning(f"Style validation failed: {str(e)}")
            return self._get_default_pdf_styles()

    def _get_default_pdf_styles(self) -> Dict[str, Any]:
        """Default PDF styles."""
        return {
            "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center", "space_after": 30},
            "heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 12, "space_before": 12},
            "heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 8, "space_before": 8},
            "paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left", "space_after": 6, "line_height": 1.2},
            "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center", "font_size": 12},
            "table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left", "font_size": 10},
            "bullet_list": {"font_size": 11, "color": "#2F2F2F", "space_after": 3},
            "code_block": {"font": "Courier", "font_size": 9, "color": "#2F2F2F", "background": "#F5F5F5", "space_after": 6}
        }
    def _create_title_style(self, styles: Dict[str, Any]) -> ParagraphStyle:
        """Create title style from style definitions."""
        title_style_def = styles.get("title", {})

        # DEBUG: Show what color and spacing is being used for the title
        title_color = title_style_def.get("color", "#1F4E79")
        title_space_after = title_style_def.get("space_after", 30)
        self.services.utils.debugLogToFile(f"PDF TITLE COLOR: {title_color} -> {self._hex_to_color(title_color)}", "PDF_RENDERER")
        self.services.utils.debugLogToFile(f"PDF TITLE SPACE_AFTER: {title_space_after}", "PDF_RENDERER")

        return ParagraphStyle(
            'CustomTitle',
            fontSize=title_style_def.get("font_size", 20),  # Reduced from 24 to 20
            spaceAfter=title_style_def.get("space_after", 30),
            alignment=self._get_alignment(title_style_def.get("align", "center")),
            textColor=self._hex_to_color(title_color),
            leading=title_style_def.get("font_size", 20) * 1.4,  # Add line spacing for multi-line titles
            spaceBefore=0  # Ensure no space before title
        )

    def _create_heading_style(self, styles: Dict[str, Any], level: int) -> ParagraphStyle:
        """Create heading style from style definitions."""
        heading_key = f"heading{level}"
        heading_style_def = styles.get(heading_key, styles.get("heading1", {}))

        return ParagraphStyle(
            f'CustomHeading{level}',
            fontSize=heading_style_def.get("font_size", 18 - level * 2),
            spaceAfter=heading_style_def.get("space_after", 12),
            spaceBefore=heading_style_def.get("space_before", 12),
            alignment=self._get_alignment(heading_style_def.get("align", "left")),
            textColor=self._hex_to_color(heading_style_def.get("color", "#2F2F2F"))
        )

    def _create_normal_style(self, styles: Dict[str, Any]) -> ParagraphStyle:
        """Create normal paragraph style from style definitions."""
        paragraph_style_def = styles.get("paragraph", {})

        return ParagraphStyle(
            'CustomNormal',
            fontSize=paragraph_style_def.get("font_size", 11),
            spaceAfter=paragraph_style_def.get("space_after", 6),
            alignment=self._get_alignment(paragraph_style_def.get("align", "left")),
            textColor=self._hex_to_color(paragraph_style_def.get("color", "#2F2F2F")),
            leading=paragraph_style_def.get("line_height", 1.2) * paragraph_style_def.get("font_size", 11)
        )

    def _get_alignment(self, align: str) -> int:
        """Convert alignment string to reportlab alignment constant."""
        if not align or not isinstance(align, str):
            return TA_LEFT

        align_map = {
            "center": TA_CENTER,
            "left": TA_LEFT,
            "justify": TA_JUSTIFY,
            "right": TA_RIGHT,
            "0": TA_LEFT,  # Handle numeric strings
            "1": TA_CENTER,
            "2": TA_JUSTIFY
        }
        return align_map.get(align.lower().strip(), TA_LEFT)
    def _get_table_alignment(self, align: str) -> str:
        """Convert alignment string to ReportLab table alignment string."""
        if not align or not isinstance(align, str):
            return 'LEFT'

        align_map = {
            "center": 'CENTER',
            "left": 'LEFT',
            "justify": 'LEFT',  # Tables don't support justify, use LEFT
            "right": 'RIGHT',
            "0": 'LEFT',  # Handle numeric strings
            "1": 'CENTER',
            "2": 'LEFT'  # Tables don't support justify, use LEFT
        }
        return align_map.get(align.lower().strip(), 'LEFT')

    def _hex_to_color(self, hex_color: str) -> colors.Color:
        """Convert hex color to reportlab color."""
        try:
            hex_color = hex_color.lstrip('#')

            # Handle aRGB format (8 characters: alpha + RGB)
            if len(hex_color) == 8:
                # Skip the alpha channel (first 2 characters)
                hex_color = hex_color[2:]

            # Handle RGB format (6 characters)
            if len(hex_color) == 6:
                r = int(hex_color[0:2], 16) / 255.0
                g = int(hex_color[2:4], 16) / 255.0
                b = int(hex_color[4:6], 16) / 255.0
                return colors.Color(r, g, b)

            # Fallback for other formats
            return colors.black
        except Exception:
            return colors.black
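    # Illustrative sketch (not part of the original diff): how a style color moves
    # through the helpers above. _convert_colors_format turns "#1F4E79" into
    # "FF1F4E79" (alpha-prefixed, '#' dropped), and _hex_to_color strips the alpha
    # again before building the reportlab color.
    #
    #   renderer._get_safe_color("#1F4E79")   # -> "FF1F4E79"
    #   renderer._hex_to_color("FF1F4E79")    # -> Color(31/255, 78/255, 121/255)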
    def _render_json_section(self, section: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
        """Render a single JSON section to PDF elements using AI-generated styles."""
        try:
            section_type = self._get_section_type(section)
            elements = self._get_section_data(section)

            # Process each element in the section
            all_elements = []
            for element in elements:
                if section_type == "table":
                    all_elements.extend(self._render_json_table(element, styles))
                elif section_type == "bullet_list":
                    all_elements.extend(self._render_json_bullet_list(element, styles))
                elif section_type == "heading":
                    all_elements.extend(self._render_json_heading(element, styles))
                elif section_type == "paragraph":
                    all_elements.extend(self._render_json_paragraph(element, styles))
                elif section_type == "code_block":
                    all_elements.extend(self._render_json_code_block(element, styles))
                elif section_type == "image":
                    all_elements.extend(self._render_json_image(element, styles))
                else:
                    # Fallback to paragraph for unknown types
                    all_elements.extend(self._render_json_paragraph(element, styles))

            return all_elements

        except Exception as e:
            self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
            return [Paragraph(f"[Error rendering section: {str(e)}]", self._create_normal_style(styles))]

    def _render_json_table(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
        """Render a JSON table to PDF elements using AI-generated styles."""
        try:
            headers = table_data.get("headers", [])
            rows = table_data.get("rows", [])

            if not headers or not rows:
                return []

            # Prepare table data
            table_data_list = [headers] + rows

            # Create table
            table = Table(table_data_list)

            # Apply styling
            table_header_style = styles.get("table_header", {})
            table_cell_style = styles.get("table_cell", {})

            table_style = [
                ('BACKGROUND', (0, 0), (-1, 0), self._hex_to_color(table_header_style.get("background", "#4F4F4F"))),
                ('TEXTCOLOR', (0, 0), (-1, 0), self._hex_to_color(table_header_style.get("text_color", "#FFFFFF"))),
                ('ALIGN', (0, 0), (-1, -1), self._get_table_alignment(table_cell_style.get("align", "left"))),
                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold' if table_header_style.get("bold", True) else 'Helvetica'),
                ('FONTSIZE', (0, 0), (-1, 0), table_header_style.get("font_size", 12)),
                ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                ('BACKGROUND', (0, 1), (-1, -1), self._hex_to_color(table_cell_style.get("background", "#FFFFFF"))),
                ('FONTSIZE', (0, 1), (-1, -1), table_cell_style.get("font_size", 10)),
                ('GRID', (0, 0), (-1, -1), 1, colors.black)
            ]

            table.setStyle(TableStyle(table_style))

            return [table, Spacer(1, 12)]

        except Exception as e:
            self.logger.warning(f"Error rendering table: {str(e)}")
            return []

    def _render_json_bullet_list(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
        """Render a JSON bullet list to PDF elements using AI-generated styles."""
        try:
            items = list_data.get("items", [])
            bullet_style_def = styles.get("bullet_list", {})

            elements = []
            for item in items:
                if isinstance(item, str):
                    elements.append(Paragraph(f"• {item}", self._create_normal_style(styles)))
                elif isinstance(item, dict) and "text" in item:
                    elements.append(Paragraph(f"• {item['text']}", self._create_normal_style(styles)))

            if elements:
                elements.append(Spacer(1, bullet_style_def.get("space_after", 3)))

            return elements

        except Exception as e:
            self.logger.warning(f"Error rendering bullet list: {str(e)}")
            return []

    def _render_json_heading(self, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
        """Render a JSON heading to PDF elements using AI-generated styles."""
        try:
            level = heading_data.get("level", 1)
            text = heading_data.get("text", "")

            if text:
                level = max(1, min(6, level))
                heading_style = self._create_heading_style(styles, level)
                return [Paragraph(text, heading_style)]

            return []

        except Exception as e:
            self.logger.warning(f"Error rendering heading: {str(e)}")
            return []

    def _render_json_paragraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
        """Render a JSON paragraph to PDF elements using AI-generated styles."""
        try:
            text = paragraph_data.get("text", "")

            if text:
                return [Paragraph(text, self._create_normal_style(styles))]

            return []

        except Exception as e:
            self.logger.warning(f"Error rendering paragraph: {str(e)}")
            return []

    def _render_json_code_block(self, code_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
        """Render a JSON code block to PDF elements using AI-generated styles."""
        try:
            code = code_data.get("code", "")
            language = code_data.get("language", "")
            code_style_def = styles.get("code_block", {})

            if code:
                elements = []

                if language:
                    lang_style = ParagraphStyle(
                        'CodeLanguage',
                        fontSize=code_style_def.get("font_size", 9),
                        textColor=self._hex_to_color(code_style_def.get("color", "#2F2F2F")),
                        fontName='Helvetica-Bold'
                    )
                    elements.append(Paragraph(f"Code ({language}):", lang_style))

                code_style = ParagraphStyle(
                    'CodeBlock',
                    fontSize=code_style_def.get("font_size", 9),
                    textColor=self._hex_to_color(code_style_def.get("color", "#2F2F2F")),
                    fontName=code_style_def.get("font", "Courier"),
                    backColor=self._hex_to_color(code_style_def.get("background", "#F5F5F5")),
                    spaceAfter=code_style_def.get("space_after", 6)
                )
                elements.append(Paragraph(code, code_style))

                return elements

            return []

        except Exception as e:
            self.logger.warning(f"Error rendering code block: {str(e)}")
            return []

    def _render_json_image(self, image_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
        """Render a JSON image to PDF elements."""
        try:
            base64_data = image_data.get("base64Data", "")
            alt_text = image_data.get("altText", "Image")

            if base64_data:
                # For now, just add a placeholder since reportlab image handling is complex
                return [Paragraph(f"[Image: {alt_text}]", self._create_normal_style(styles))]

            return []

        except Exception as e:
            self.logger.warning(f"Error rendering image: {str(e)}")
            return [Paragraph(f"[Image: {image_data.get('altText', 'Image')}]", self._create_normal_style(styles))]
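    # Illustrative sketch (not part of the original diff): a table section in the
    # shape the PDF helpers above expect, assuming _get_section_data returns the
    # section's "elements" list so each element dict is rendered in turn.
    #
    #   section = {
    #       "content_type": "table",
    #       "id": "s1",
    #       "elements": [{"headers": ["Metric", "Value"], "rows": [["Uptime", "99.9%"]]}]
    #   }
    #   renderer._render_json_section(section, renderer._get_default_pdf_styles())
    #   # -> [Table(...), Spacer(1, 12)]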
885 modules/services/serviceGeneration/renderers/rendererPptx.py Normal file
@@ -0,0 +1,885 @@
import logging
import base64
import io
from typing import Dict, Any, Optional, Tuple, List
from .rendererBaseTemplate import BaseRenderer

logger = logging.getLogger(__name__)


class RendererPptx(BaseRenderer):
    """Renderer for PowerPoint (.pptx) files using the python-pptx library."""

    def __init__(self):
        super().__init__()
        self.supported_formats = ["pptx", "ppt"]
        self.output_mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"

    @classmethod
    def get_supported_formats(cls) -> list:
        """Get list of supported output formats."""
        return ["pptx", "ppt"]

    async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
        """
        Render content as a PowerPoint presentation from JSON data.

        Args:
            extracted_content: JSON content to render as presentation
            title: Title for the presentation
            user_prompt: User prompt for AI styling
            ai_service: AI service for styling

        Returns:
            Base64-encoded PowerPoint presentation and its MIME type
        """
        try:
            # Import python-pptx
            from pptx import Presentation
            from pptx.util import Inches, Pt
            from pptx.enum.text import PP_ALIGN
            from pptx.dml.color import RGBColor
            import re

            # Get AI-generated styling definitions first
            styles = await self._get_pptx_styles(user_prompt, ai_service)

            # Create new presentation
            prs = Presentation()

            # Set slide size based on user intent (default to 16:9)
            slide_size = styles.get("slide_size", "16:9")
            if slide_size == "4:3":
                prs.slide_width = Inches(10)
                prs.slide_height = Inches(7.5)
            else:  # Default to 16:9
                prs.slide_width = Inches(13.33)
                prs.slide_height = Inches(7.5)

            # Generate slides from JSON content
            slides_data = await self._parse_json_to_slides(extracted_content, title, styles)
            logger.info(f"Parsed {len(slides_data)} slides from JSON content")

            # Debug: Show first 200 chars of content
            logger.info(f"JSON content preview: {str(extracted_content)[:200]}...")

            for i, slide_data in enumerate(slides_data):
                logger.info(f"Slide {i+1}: '{slide_data.get('title', 'No title')}' - {len(slide_data.get('content', ''))} chars")
                # Debug: Show slide content preview
                slide_content = slide_data.get('content', '')
                if slide_content:
                    logger.info(f"  Content preview: '{slide_content[:100]}...'")
                else:
                    logger.warning(f"  ⚠️ Slide {i+1} has NO content!")

                # Create slide with appropriate layout based on content
                slide_layout_index = self._get_slide_layout_index(slide_data, styles)
                slide_layout = prs.slide_layouts[slide_layout_index]
                slide = prs.slides.add_slide(slide_layout)

                # Set title with AI-generated styling
                title_shape = slide.shapes.title
                title_shape.text = slide_data.get("title", "Slide")

                # Apply title styling
                title_style = styles.get("title", {})
                if title_shape.text_frame.paragraphs[0].font:
                    title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 44))
                    title_shape.text_frame.paragraphs[0].font.bold = title_style.get("bold", True)
                    title_color = self._get_safe_color(title_style.get("color", (31, 78, 121)))
                    title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color)

                # Set content with AI-generated styling
                content_shape = slide.placeholders[1]
                content_text = slide_data.get("content", "")

                # Format content text with AI styles
                text_frame = content_shape.text_frame
                text_frame.clear()

                # Split content into paragraphs
                paragraphs = content_text.split('\n\n')

                for j, paragraph in enumerate(paragraphs):
                    if paragraph.strip():
                        if j == 0:
                            p = text_frame.paragraphs[0]
                        else:
                            p = text_frame.add_paragraph()

                        p.text = paragraph.strip()

                        # Apply AI-generated styling based on content type
                        # (check '##' before '#' so subheaders are not swallowed by the header branch)
                        if paragraph.startswith('##'):
                            # Subheader
                            p.text = paragraph.lstrip('#').strip()
                            subheading_style = styles.get("subheading", {})
                            p.font.size = Pt(subheading_style.get("font_size", 24))
                            p.font.bold = subheading_style.get("bold", True)
                            subheading_color = self._get_safe_color(subheading_style.get("color", (79, 79, 79)))
                            p.font.color.rgb = RGBColor(*subheading_color)
                        elif paragraph.startswith('#'):
                            # Header
                            p.text = paragraph.lstrip('#').strip()
                            heading_style = styles.get("heading", {})
                            p.font.size = Pt(heading_style.get("font_size", 32))
                            p.font.bold = heading_style.get("bold", True)
                            heading_color = self._get_safe_color(heading_style.get("color", (47, 47, 47)))
                            p.font.color.rgb = RGBColor(*heading_color)
                        elif paragraph.startswith('*') and paragraph.endswith('*'):
                            # Bold text
                            p.text = paragraph.strip('*')
                            paragraph_style = styles.get("paragraph", {})
                            p.font.size = Pt(paragraph_style.get("font_size", 18))
                            p.font.bold = True
                            paragraph_color = self._get_safe_color(paragraph_style.get("color", (47, 47, 47)))
                            p.font.color.rgb = RGBColor(*paragraph_color)
                        else:
                            # Regular text
                            paragraph_style = styles.get("paragraph", {})
                            p.font.size = Pt(paragraph_style.get("font_size", 18))
                            p.font.bold = paragraph_style.get("bold", False)
                            paragraph_color = self._get_safe_color(paragraph_style.get("color", (47, 47, 47)))
                            p.font.color.rgb = RGBColor(*paragraph_color)

                        # Apply alignment (read it from the paragraph style so heading
                        # paragraphs are aligned consistently as well)
                        align = styles.get("paragraph", {}).get("align", "left")
                        if align == "center":
                            p.alignment = PP_ALIGN.CENTER
                        elif align == "right":
                            p.alignment = PP_ALIGN.RIGHT
                        else:
                            p.alignment = PP_ALIGN.LEFT

            # If no slides were created, create a default slide
            if not slides_data:
                slide_layout = prs.slide_layouts[0]  # Title slide layout
                slide = prs.slides.add_slide(slide_layout)

                title_shape = slide.shapes.title
                title_shape.text = title

                # Apply title styling to default slide
                title_style = styles.get("title", {})
                if title_shape.text_frame.paragraphs[0].font:
                    title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 48))
                    title_shape.text_frame.paragraphs[0].font.bold = title_style.get("bold", True)
                    title_color = self._get_safe_color(title_style.get("color", (31, 78, 121)))
                    title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color)

                subtitle_shape = slide.placeholders[1]
                subtitle_shape.text = "Generated by PowerOn AI System"

                # Apply subtitle styling
                paragraph_style = styles.get("paragraph", {})
                if subtitle_shape.text_frame.paragraphs[0].font:
                    subtitle_shape.text_frame.paragraphs[0].font.size = Pt(paragraph_style.get("font_size", 20))
                    subtitle_shape.text_frame.paragraphs[0].font.bold = paragraph_style.get("bold", False)
                    paragraph_color = self._get_safe_color(paragraph_style.get("color", (47, 47, 47)))
                    subtitle_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*paragraph_color)

            # Save to buffer
            buffer = io.BytesIO()
            prs.save(buffer)
            buffer.seek(0)

            # Convert to base64
            pptx_bytes = buffer.getvalue()
            pptx_base64 = base64.b64encode(pptx_bytes).decode('utf-8')

            logger.info(f"Successfully rendered PowerPoint presentation: {len(pptx_bytes)} bytes")
            return pptx_base64, "application/vnd.openxmlformats-officedocument.presentationml.presentation"

        except ImportError:
            logger.error("python-pptx library not installed. Install with: pip install python-pptx")
            return "python-pptx library not installed", "text/plain"
        except Exception as e:
            logger.error(f"Error rendering PowerPoint presentation: {str(e)}")
            return f"Error rendering PowerPoint presentation: {str(e)}", "text/plain"

    def _parse_content_to_slides(self, content: str, title: str) -> list:
        """
        Parse content into slide data structure.

        Args:
            content: Content to parse
            title: Presentation title

        Returns:
            List of slide data dictionaries
        """
        slides = []

        # Split content by slide markers or headers
        slide_sections = self._split_content_into_slides(content)

        for i, section in enumerate(slide_sections):
            if section.strip():
                slide_data = {
                    "title": f"Slide {i + 1}",
                    "content": section.strip()
                }

                # Extract title from content if it starts with #
                lines = section.strip().split('\n')
                if lines and lines[0].startswith('#'):
                    # Remove # symbols and clean up title
                    slide_title = lines[0].lstrip('#').strip()
                    slide_data["title"] = slide_title
                    slide_data["content"] = '\n'.join(lines[1:]).strip()
                elif lines and lines[0].strip():
                    # Use first line as title if it looks like a title
                    first_line = lines[0].strip()
                    if len(first_line) < 100 and not first_line.endswith('.'):
                        slide_data["title"] = first_line
                        slide_data["content"] = '\n'.join(lines[1:]).strip()

                slides.append(slide_data)

        return slides

    def _split_content_into_slides(self, content: str) -> list:
        """
        Split content into individual slides based on headers and structure.

        Args:
            content: Content to split

        Returns:
            List of slide content strings
        """
        import re

        # First, try to split by major headers (# or ##)
        # This is the most common case for AI-generated content
        header_pattern = r'^(#{1,2})\s+(.+)$'
        lines = content.split('\n')
        slides = []
        current_slide = []

        for line in lines:
            # Check if this line is a header
            header_match = re.match(header_pattern, line.strip())
            if header_match:
                # If we have content in current slide, save it
                if current_slide:
                    slide_content = '\n'.join(current_slide).strip()
                    if slide_content:
                        slides.append(slide_content)
                    current_slide = []

                # Start new slide with this header
                current_slide.append(line)
            else:
                # Add line to current slide
                current_slide.append(line)

        # Add the last slide
        if current_slide:
            slide_content = '\n'.join(current_slide).strip()
            if slide_content:
                slides.append(slide_content)

        # If we found slides with headers, return them
        if len(slides) > 1:
            return slides

        # Fallback: split by triple newlines
        sections = content.split('\n\n\n')
        if len(sections) > 1:
            return [s.strip() for s in sections if s.strip()]

        # Another fallback: split by double newlines
        sections = content.split('\n\n')
        if len(sections) > 1:
            return [s.strip() for s in sections if s.strip()]

        # Last resort: return as single slide
        return [content.strip()]
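    # Illustrative sketch (not part of the original diff): how _split_content_into_slides
    # breaks markdown-style text on level-1/level-2 headers.
    #
    #   text = "# Intro\nWelcome\n\n## Roadmap\n- Q1\n- Q2"
    #   RendererPptx()._split_content_into_slides(text)
    #   # -> ['# Intro\nWelcome', '## Roadmap\n- Q1\n- Q2']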
|
||||||
|
|
||||||
|
def get_output_mime_type(self) -> str:
|
||||||
|
"""Get MIME type for rendered output."""
|
||||||
|
return self.output_mime_type
|
||||||
|
|
||||||
|
async def _get_pptx_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]:
|
||||||
|
"""Get PowerPoint styling definitions using base template AI styling."""
|
||||||
|
style_schema = {
|
||||||
|
"title": {"font_size": 52, "color": "#1B365D", "bold": True, "align": "center"},
|
||||||
|
"heading": {"font_size": 36, "color": "#2C5F2D", "bold": True, "align": "left"},
|
||||||
|
"subheading": {"font_size": 28, "color": "#4A90E2", "bold": True, "align": "left"},
|
||||||
|
"paragraph": {"font_size": 20, "color": "#2F2F2F", "bold": False, "align": "left"},
|
||||||
|
"bullet_list": {"font_size": 20, "color": "#2F2F2F", "indent": 20},
|
||||||
|
"table_header": {"font_size": 18, "color": "#FFFFFF", "bold": True, "background": "#1B365D"},
|
||||||
|
"table_cell": {"font_size": 16, "color": "#2F2F2F", "bold": False, "background": "#F8F9FA"},
|
||||||
|
"slide_size": "16:9",
|
||||||
|
"content_per_slide": "concise",
|
||||||
|
"design_theme": "corporate",
|
||||||
|
"color_scheme": "professional",
|
||||||
|
"background_style": "clean",
|
||||||
|
"accent_colors": ["#1B365D", "#2C5F2D", "#4A90E2", "#6B7280"],
|
||||||
|
"professional_grade": True,
|
||||||
|
"executive_ready": True
|
||||||
|
}
|
||||||
|
|
||||||
|
style_template = self._create_professional_pptx_template(user_prompt, style_schema)
|
||||||
|
# Use our own _get_ai_styles_with_pptx_colors method to ensure proper color conversion
|
||||||
|
styles = await self._get_ai_styles_with_pptx_colors(ai_service, style_template, self._get_default_pptx_styles())
|
||||||
|
|
||||||
|
# Validate PowerPoint-specific requirements
|
||||||
|
return self._validate_pptx_styles_readability(styles)
|
||||||
|
|
||||||
|
    def _create_professional_pptx_template(self, user_prompt: str, style_schema: Dict[str, Any]) -> str:
        """Create a professional PowerPoint-specific AI style template for corporate-quality slides."""
        import json
        schema_json = json.dumps(style_schema, indent=4)

        return f"""Customize the JSON below for professional PowerPoint slides.

User Request: {user_prompt or "Create professional corporate slides"}

Rules:
- Use professional colors (blues, grays, deep greens)
- Large, readable font sizes
- High contrast
- Sophisticated color palettes

Return ONLY this JSON with your changes:

{schema_json}

JSON ONLY. NO OTHER TEXT."""

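    # Illustrative note (not in the original source): the template above asks the model to
    # echo the schema with only value changes, so a well-formed reply would mirror the
    # style_schema passed in, for example
    #   {"title": {"font_size": 52, "color": "#14213D", "bold": true, "align": "center"}, ...}
    # Anything wrapped around that JSON object (markdown fences, explanatory text) is
    # stripped out again by _get_ai_styles_with_pptx_colors below.
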
async def _get_ai_styles_with_pptx_colors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""Get AI styles with proper PowerPoint color conversion."""
|
||||||
|
if not ai_service:
|
||||||
|
return default_styles
|
||||||
|
|
||||||
|
try:
|
||||||
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||||
|
|
||||||
|
request_options = AiCallOptions()
|
||||||
|
request_options.operationType = OperationType.GENERAL
|
||||||
|
|
||||||
|
request = AiCallRequest(prompt=style_template, context="", options=request_options)
|
||||||
|
|
||||||
|
# Check if AI service is properly configured
|
||||||
|
if not hasattr(ai_service, 'aiObjects') or not ai_service.aiObjects:
|
||||||
|
self.logger.warning("AI service not properly configured, using defaults")
|
||||||
|
return default_styles
|
||||||
|
|
||||||
|
response = await ai_service.aiObjects.call(request)
|
||||||
|
|
||||||
|
# Check if response is valid
|
||||||
|
if not response:
|
||||||
|
self.logger.warning("AI service returned no response, using defaults")
|
||||||
|
return default_styles
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Clean and parse JSON
|
||||||
|
result = response.content.strip() if response and response.content else ""
|
||||||
|
|
||||||
|
# Check if result is empty
|
||||||
|
if not result:
|
||||||
|
self.logger.warning("AI styling returned empty response, using defaults")
|
||||||
|
return default_styles
|
||||||
|
|
||||||
|
# Log the raw response for debugging
|
||||||
|
self.logger.debug(f"AI styling raw response: {result[:200]}...")
|
||||||
|
|
||||||
|
# Extract JSON from various formats
|
||||||
|
json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
|
||||||
|
if json_match:
|
||||||
|
result = json_match.group(1).strip()
|
||||||
|
elif result.startswith('```json'):
|
||||||
|
result = re.sub(r'^```json\s*', '', result)
|
||||||
|
result = re.sub(r'\s*```$', '', result)
|
||||||
|
elif result.startswith('```'):
|
||||||
|
result = re.sub(r'^```\s*', '', result)
|
||||||
|
result = re.sub(r'\s*```$', '', result)
|
||||||
|
|
||||||
|
# Try to extract JSON from explanatory text
|
||||||
|
json_patterns = [
|
||||||
|
r'\{[^{}]*"title"[^{}]*\}', # Simple JSON object
|
||||||
|
r'\{.*?"title".*?\}', # JSON with title field
|
||||||
|
r'\{.*?"font_size".*?\}', # JSON with font_size field
|
||||||
|
]
|
||||||
|
|
||||||
|
for pattern in json_patterns:
|
||||||
|
json_match = re.search(pattern, result, re.DOTALL)
|
||||||
|
if json_match:
|
||||||
|
result = json_match.group(0)
|
||||||
|
break
|
||||||
|
|
||||||
|
# Additional cleanup - remove any leading/trailing whitespace and newlines
|
||||||
|
result = result.strip()
|
||||||
|
|
||||||
|
# Check if result is still empty after cleanup
|
||||||
|
if not result:
|
||||||
|
self.logger.warning("AI styling returned empty content after cleanup, using defaults")
|
||||||
|
return default_styles
|
||||||
|
|
||||||
|
# Try to parse JSON
|
||||||
|
try:
|
||||||
|
styles = json.loads(result)
|
||||||
|
self.logger.debug(f"Successfully parsed AI styles: {list(styles.keys())}")
|
||||||
|
except json.JSONDecodeError as json_error:
|
||||||
|
self.logger.warning(f"AI styling returned invalid JSON: {json_error}")
|
||||||
|
self.logger.warning(f"Raw content that failed to parse: {result[:100]}...")
|
||||||
|
# Try to extract just the JSON part if it's embedded in text
|
||||||
|
json_start = result.find('{')
|
||||||
|
json_end = result.rfind('}')
|
||||||
|
if json_start != -1 and json_end != -1 and json_end > json_start:
|
||||||
|
json_part = result[json_start:json_end+1]
|
||||||
|
try:
|
||||||
|
styles = json.loads(json_part)
|
||||||
|
self.logger.info("Successfully extracted JSON from explanatory text")
|
||||||
|
self.logger.debug(f"Extracted AI styles: {list(styles.keys())}")
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
self.logger.warning("Could not extract valid JSON from response, using defaults")
|
||||||
|
return default_styles
|
||||||
|
else:
|
||||||
|
return default_styles
|
||||||
|
|
||||||
|
# Convert colors to PowerPoint RGB format
|
||||||
|
styles = self._convert_colors_format(styles)
|
||||||
|
|
||||||
|
return styles
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"AI styling failed: {str(e)}, using defaults")
|
||||||
|
return default_styles
|
||||||
|
|
||||||
|
    def _convert_colors_format(self, styles: Dict[str, Any]) -> Dict[str, Any]:
        """Convert hex colors to RGB format for PowerPoint compatibility."""
        try:
            for style_name, style_config in styles.items():
                if isinstance(style_config, dict):
                    for prop, value in style_config.items():
                        if isinstance(value, str) and value.startswith('#'):
                            # Convert hex to RGB tuple for PowerPoint
                            hex_color = value.lstrip('#')
                            if len(hex_color) == 6:
                                r = int(hex_color[0:2], 16)
                                g = int(hex_color[2:4], 16)
                                b = int(hex_color[4:6], 16)
                                styles[style_name][prop] = (r, g, b)
                            elif len(hex_color) == 8:  # aRGB format
                                r = int(hex_color[2:4], 16)
                                g = int(hex_color[4:6], 16)
                                b = int(hex_color[6:8], 16)
                                styles[style_name][prop] = (r, g, b)
            return styles
        except Exception as e:
            self.logger.warning(f"Color conversion failed: {str(e)}")
            return styles

    def _get_safe_color(self, color_value, default=(0, 0, 0)) -> tuple:
        """Get a safe RGB color tuple for PowerPoint."""
        if isinstance(color_value, tuple) and len(color_value) == 3:
            return color_value
        elif isinstance(color_value, str) and color_value.startswith('#'):
            hex_color = color_value.lstrip('#')
            if len(hex_color) == 6:
                r = int(hex_color[0:2], 16)
                g = int(hex_color[2:4], 16)
                b = int(hex_color[4:6], 16)
                return (r, g, b)
            elif len(hex_color) == 8:  # aRGB format
                r = int(hex_color[2:4], 16)
                g = int(hex_color[4:6], 16)
                b = int(hex_color[6:8], 16)
                return (r, g, b)
        return default

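    # Illustrative note (not in the original source): both helpers above normalise colors
    # to RGB tuples, e.g.
    #   self._get_safe_color("#1B365D")   -> (27, 54, 93)
    #   self._get_safe_color("#FF1B365D") -> (27, 54, 93)   # leading alpha byte dropped
    #   self._get_safe_color(None)        -> (0, 0, 0)      # falls back to the default
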
    def _validate_pptx_styles_readability(self, styles: Dict[str, Any]) -> Dict[str, Any]:
        """Validate and fix readability issues in AI-generated styles."""
        try:
            # Ensure minimum font sizes for PowerPoint readability
            min_font_sizes = {
                "title": 36,
                "heading": 24,
                "subheading": 20,
                "paragraph": 14,
                "bullet_list": 14,
                "table_header": 12,
                "table_cell": 12
            }

            for style_name, min_size in min_font_sizes.items():
                if style_name in styles:
                    current_size = styles[style_name].get("font_size", 12)
                    if current_size < min_size:
                        styles[style_name]["font_size"] = min_size

            return styles

        except Exception as e:
            self.logger.warning(f"Style validation failed: {str(e)}")
            return self._get_default_pptx_styles()

    def _get_default_pptx_styles(self) -> Dict[str, Any]:
        """Default PowerPoint styles with corporate professional color scheme."""
        return {
            "title": {"font_size": 52, "color": (27, 54, 93), "bold": True, "align": "center"},
            "heading": {"font_size": 36, "color": (44, 95, 45), "bold": True, "align": "left"},
            "subheading": {"font_size": 28, "color": (74, 144, 226), "bold": True, "align": "left"},
            "paragraph": {"font_size": 20, "color": (47, 47, 47), "bold": False, "align": "left"},
            "bullet_list": {"font_size": 20, "color": (47, 47, 47), "indent": 20},
            "table_header": {"font_size": 18, "color": (255, 255, 255), "bold": True, "background": (27, 54, 93)},
            "table_cell": {"font_size": 16, "color": (47, 47, 47), "bold": False, "background": (248, 249, 250)},
            "slide_size": "16:9",
            "content_per_slide": "concise",
            "design_theme": "corporate",
            "color_scheme": "professional",
            "background_style": "clean",
            "accent_colors": [(27, 54, 93), (44, 95, 45), (74, 144, 226), (107, 114, 128)],
            "professional_grade": True,
            "executive_ready": True
        }

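    # Illustrative note (not in the original source): _validate_pptx_styles_readability
    # only raises font sizes, never lowers them. For example, an AI response containing
    #   {"paragraph": {"font_size": 10, ...}}
    # comes back with font_size 14 (the minimum), while the default of 20 is left unchanged.
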
async def _parse_json_to_slides(self, json_content: Dict[str, Any], title: str, styles: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||||
|
"""
|
||||||
|
Parse JSON content into slide data structure.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
json_content: JSON content to parse
|
||||||
|
title: Presentation title
|
||||||
|
styles: AI-generated styles
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of slide data dictionaries
|
||||||
|
"""
|
||||||
|
slides = []
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Validate JSON structure
|
||||||
|
if not isinstance(json_content, dict):
|
||||||
|
raise ValueError("JSON content must be a dictionary")
|
||||||
|
|
||||||
|
if "sections" not in json_content:
|
||||||
|
raise ValueError("JSON content must contain 'sections' field")
|
||||||
|
|
||||||
|
# Use title from JSON metadata if available, otherwise use provided title
|
||||||
|
document_title = json_content.get("metadata", {}).get("title", title)
|
||||||
|
|
||||||
|
# Create title slide
|
||||||
|
slides.append({
|
||||||
|
"title": document_title,
|
||||||
|
"content": "Generated by PowerOn AI System\n\n" + self._format_timestamp()
|
||||||
|
})
|
||||||
|
|
||||||
|
# Process sections into slides based on content and user intent
|
||||||
|
sections = json_content.get("sections", [])
|
||||||
|
slides.extend(self._create_slides_from_sections(sections, styles))
|
||||||
|
|
||||||
|
# If no content slides were created, create a default content slide
|
||||||
|
if len(slides) == 1: # Only title slide
|
||||||
|
slides.append({
|
||||||
|
"title": "Content Overview",
|
||||||
|
"content": "No structured content found in the source documents.\n\nPlease check the source documents and try again."
|
||||||
|
})
|
||||||
|
|
||||||
|
return slides
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error parsing JSON to slides: {str(e)}")
|
||||||
|
# Return minimal fallback slides
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"title": title,
|
||||||
|
"content": "Error parsing content for presentation"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
def _create_slide_from_section(self, section: Dict[str, Any], styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""Create a slide from a JSON section."""
|
||||||
|
try:
|
||||||
|
# Get section title from data or use default
|
||||||
|
section_title = "Untitled Section"
|
||||||
|
if section.get("content_type") == "heading":
|
||||||
|
# Extract text from elements array
|
||||||
|
for element in section.get("elements", []):
|
||||||
|
if isinstance(element, dict) and "text" in element:
|
||||||
|
section_title = element.get("text", "Untitled Section")
|
||||||
|
break
|
||||||
|
elif section.get("title"):
|
||||||
|
section_title = section.get("title")
|
||||||
|
|
||||||
|
content_type = section.get("content_type", "paragraph")
|
||||||
|
elements = section.get("elements", [])
|
||||||
|
|
||||||
|
# Build slide content based on section type
|
||||||
|
content_parts = []
|
||||||
|
|
||||||
|
if content_type == "table":
|
||||||
|
content_parts.append(self._format_table_for_slide(elements))
|
||||||
|
elif content_type == "list":
|
||||||
|
content_parts.append(self._format_list_for_slide(elements))
|
||||||
|
elif content_type == "heading":
|
||||||
|
content_parts.append(self._format_heading_for_slide(elements))
|
||||||
|
elif content_type == "paragraph":
|
||||||
|
content_parts.append(self._format_paragraph_for_slide(elements))
|
||||||
|
elif content_type == "code":
|
||||||
|
content_parts.append(self._format_code_for_slide(elements))
|
||||||
|
else:
|
||||||
|
content_parts.append(self._format_paragraph_for_slide(elements))
|
||||||
|
|
||||||
|
# Combine content parts
|
||||||
|
slide_content = "\n\n".join(filter(None, content_parts))
|
||||||
|
|
||||||
|
return {
|
||||||
|
"title": section_title,
|
||||||
|
"content": slide_content
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error creating slide from section: {str(e)}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _format_table_for_slide(self, elements: List[Dict[str, Any]]) -> str:
|
||||||
|
"""Format table data for slide presentation."""
|
||||||
|
try:
|
||||||
|
# Extract table data from elements array
|
||||||
|
headers = []
|
||||||
|
rows = []
|
||||||
|
for element in elements:
|
||||||
|
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
||||||
|
headers = element.get("headers", [])
|
||||||
|
rows = element.get("rows", [])
|
||||||
|
break
|
||||||
|
|
||||||
|
if not headers:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
# Create table representation
|
||||||
|
table_lines = []
|
||||||
|
|
||||||
|
# Add headers
|
||||||
|
header_line = " | ".join(str(h) for h in headers)
|
||||||
|
table_lines.append(header_line)
|
||||||
|
|
||||||
|
# Add separator
|
||||||
|
separator = "-" * len(header_line)
|
||||||
|
table_lines.append(separator)
|
||||||
|
|
||||||
|
# Add data rows (limit based on content density)
|
||||||
|
max_rows = 5 # Default limit
|
||||||
|
for row in rows[:max_rows]:
|
||||||
|
row_line = " | ".join(str(cell) for cell in row)
|
||||||
|
table_lines.append(row_line)
|
||||||
|
|
||||||
|
if len(rows) > max_rows:
|
||||||
|
table_lines.append(f"... and {len(rows) - max_rows} more rows")
|
||||||
|
|
||||||
|
return "\n".join(table_lines)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error formatting table for slide: {str(e)}")
|
||||||
|
return ""
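# Illustrative note (not in the original source): a table element such as
#   {"headers": ["Region", "Revenue"], "rows": [["EMEA", "1.2M"], ["APAC", "0.9M"]]}
# is rendered by the method above as pipe-separated lines:
#   Region | Revenue
#   ----------------
#   EMEA | 1.2M
#   APAC | 0.9M
# with at most five data rows shown and a "... and N more rows" suffix beyond that.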
|
||||||
|
|
||||||
|
def _format_list_for_slide(self, list_data: Dict[str, Any]) -> str:
|
||||||
|
"""Format list data for slide presentation."""
|
||||||
|
try:
|
||||||
|
items = list_data.get("items", [])
|
||||||
|
|
||||||
|
if not items:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
# Create list representation
|
||||||
|
list_lines = []
|
||||||
|
|
||||||
|
for item in items:
|
||||||
|
if isinstance(item, dict):
|
||||||
|
text = item.get("text", "")
|
||||||
|
list_lines.append(f"• {text}")
|
||||||
|
|
||||||
|
# Add subitems (limit to 3 for readability)
|
||||||
|
subitems = item.get("subitems", [])[:3]
|
||||||
|
for subitem in subitems:
|
||||||
|
if isinstance(subitem, dict):
|
||||||
|
list_lines.append(f" - {subitem.get('text', '')}")
|
||||||
|
else:
|
||||||
|
list_lines.append(f" - {subitem}")
|
||||||
|
else:
|
||||||
|
list_lines.append(f"• {str(item)}")
|
||||||
|
|
||||||
|
return "\n".join(list_lines)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error formatting list for slide: {str(e)}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def _format_heading_for_slide(self, heading_data: Dict[str, Any]) -> str:
|
||||||
|
"""Format heading data for slide presentation."""
|
||||||
|
try:
|
||||||
|
text = heading_data.get("text", "")
|
||||||
|
level = heading_data.get("level", 1)
|
||||||
|
|
||||||
|
if text:
|
||||||
|
return f"{'#' * level} {text}"
|
||||||
|
|
||||||
|
return ""
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error formatting heading for slide: {str(e)}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def _format_paragraph_for_slide(self, paragraph_data: Dict[str, Any]) -> str:
|
||||||
|
"""Format paragraph data for slide presentation."""
|
||||||
|
try:
|
||||||
|
text = paragraph_data.get("text", "")
|
||||||
|
|
||||||
|
if text:
|
||||||
|
# Limit paragraph length based on content density
|
||||||
|
max_length = 200 # Default limit
|
||||||
|
if len(text) > max_length:
|
||||||
|
text = text[:max_length] + "..."
|
||||||
|
|
||||||
|
return text
|
||||||
|
|
||||||
|
return ""
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error formatting paragraph for slide: {str(e)}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def _format_code_for_slide(self, code_data: Dict[str, Any]) -> str:
|
||||||
|
"""Format code data for slide presentation."""
|
||||||
|
try:
|
||||||
|
code = code_data.get("code", "")
|
||||||
|
language = code_data.get("language", "")
|
||||||
|
|
||||||
|
if code:
|
||||||
|
# Limit code length based on content density
|
||||||
|
max_length = 100 # Default limit
|
||||||
|
if len(code) > max_length:
|
||||||
|
code = code[:max_length] + "..."
|
||||||
|
|
||||||
|
if language:
|
||||||
|
return f"Code ({language}):\n{code}"
|
||||||
|
else:
|
||||||
|
return f"Code:\n{code}"
|
||||||
|
|
||||||
|
return ""
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error formatting code for slide: {str(e)}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def _get_slide_layout_index(self, slide_data: Dict[str, Any], styles: Dict[str, Any]) -> int:
|
||||||
|
"""Determine the best professional slide layout based on content."""
|
||||||
|
try:
|
||||||
|
content = slide_data.get("content", "")
|
||||||
|
title = slide_data.get("title", "")
|
||||||
|
|
||||||
|
# Check if it's a title slide (first slide)
|
||||||
|
if not content or "Generated by PowerOn AI System" in content:
|
||||||
|
return 0 # Title slide layout
|
||||||
|
|
||||||
|
# Professional layout selection based on content
|
||||||
|
if "|" in content and "-" in content:
|
||||||
|
# Has both tables and lists - use content with caption for professional look
|
||||||
|
return 2
|
||||||
|
elif "|" in content:
|
||||||
|
# Has tables - use content layout for clean table presentation
|
||||||
|
return 1
|
||||||
|
elif content.count("•") > 2:
|
||||||
|
# Has many bullet points - use content layout for better readability
|
||||||
|
return 1
|
||||||
|
elif len(content) > 200:
|
||||||
|
# Long content - use content layout for better text flow
|
||||||
|
return 1
|
||||||
|
elif title and len(title) > 20:
|
||||||
|
# Long title - use title and content layout
|
||||||
|
return 1
|
||||||
|
else:
|
||||||
|
# Default to title and content layout for professional appearance
|
||||||
|
return 1
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error determining slide layout: {str(e)}")
|
||||||
|
return 1 # Default to title and content layout
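# Illustrative note (not in the original source): layout selection is heuristic. A slide
# whose content contains "Generated by PowerOn AI System" maps to layout 0 (title slide);
# content containing both "|" and "-" maps to layout 2 (content with caption); a pipe
# table without dashes, three or more "•" bullets, or more than 200 characters maps to
# layout 1, which is also the default for everything else.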
|
||||||
|
|
||||||
|
def _create_slides_from_sections(self, sections: List[Dict[str, Any]], styles: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||||
|
"""Create slides from sections based on content density and user intent."""
|
||||||
|
try:
|
||||||
|
slides = []
|
||||||
|
content_per_slide = styles.get("content_per_slide", "concise")
|
||||||
|
|
||||||
|
# Group sections by type and create slides
|
||||||
|
current_slide_content = []
|
||||||
|
current_slide_title = "Content Overview"
|
||||||
|
|
||||||
|
for section in sections:
|
||||||
|
section_type = section.get("content_type", "paragraph")
|
||||||
|
elements = section.get("elements", [])
|
||||||
|
|
||||||
|
if section_type == "heading":
|
||||||
|
# If we have accumulated content, create a slide
|
||||||
|
if current_slide_content:
|
||||||
|
slides.append({
|
||||||
|
"title": current_slide_title,
|
||||||
|
"content": "\n\n".join(current_slide_content)
|
||||||
|
})
|
||||||
|
current_slide_content = []
|
||||||
|
|
||||||
|
# Start new slide with heading as title
|
||||||
|
for element in elements:
|
||||||
|
if isinstance(element, dict) and "text" in element:
|
||||||
|
current_slide_title = element.get("text", "Untitled Section")
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
# Add content to current slide
|
||||||
|
formatted_content = self._format_section_content(section)
|
||||||
|
if formatted_content:
|
||||||
|
current_slide_content.append(formatted_content)
|
||||||
|
|
||||||
|
# Add final slide if there's content
|
||||||
|
if current_slide_content:
|
||||||
|
slides.append({
|
||||||
|
"title": current_slide_title,
|
||||||
|
"content": "\n\n".join(current_slide_content)
|
||||||
|
})
|
||||||
|
|
||||||
|
return slides
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error creating slides from sections: {str(e)}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
    def _format_section_content(self, section: Dict[str, Any]) -> str:
        """Format section content for slide presentation."""
        try:
            content_type = section.get("content_type", "paragraph")
            elements = section.get("elements", [])

            # Process each element in the section
            content_parts = []
            for element in elements:
                if content_type == "table":
                    # The table formatter expects a list of table elements
                    content_parts.append(self._format_table_for_slide([element]))
                elif content_type == "list":
                    content_parts.append(self._format_list_for_slide(element))
                elif content_type == "heading":
                    content_parts.append(self._format_heading_for_slide(element))
                elif content_type == "paragraph":
                    content_parts.append(self._format_paragraph_for_slide(element))
                elif content_type == "code":
                    content_parts.append(self._format_code_for_slide(element))
                else:
                    content_parts.append(self._format_paragraph_for_slide(element))

            return "\n\n".join(filter(None, content_parts))

        except Exception as e:
            self.logger.warning(f"Error formatting section content: {str(e)}")
            return ""

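    # Illustrative note (not in the original source): a section like
    #   {"content_type": "list", "elements": [{"items": [{"text": "Fast"}, {"text": "Safe"}]}]}
    # is expected to come back from the method above as bullet text:
    #   • Fast
    #   • Safe
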
    def _format_timestamp(self) -> str:
        """Format current timestamp for presentation generation."""
        from datetime import datetime, UTC
        return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")

modules/services/serviceGeneration/renderers/rendererText.py (new file, 256 lines)
@@ -0,0 +1,256 @@
"""
|
||||||
|
Text renderer for report generation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .rendererBaseTemplate import BaseRenderer
|
||||||
|
from typing import Dict, Any, Tuple, List
|
||||||
|
|
||||||
|
class RendererText(BaseRenderer):
|
||||||
|
"""Renders content to plain text format with format-specific extraction."""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_supported_formats(cls) -> List[str]:
|
||||||
|
"""Return supported text formats (excluding formats with dedicated renderers)."""
|
||||||
|
return [
|
||||||
|
'txt', 'text', 'plain',
|
||||||
|
# Programming languages
|
||||||
|
'js', 'javascript', 'ts', 'typescript', 'jsx', 'tsx',
|
||||||
|
'py', 'python', 'java', 'cpp', 'c', 'h', 'hpp',
|
||||||
|
'cs', 'csharp', 'php', 'rb', 'ruby', 'go', 'rs', 'rust',
|
||||||
|
'swift', 'kt', 'kotlin', 'scala', 'r', 'm', 'objc',
|
||||||
|
'sh', 'bash', 'zsh', 'fish', 'ps1', 'bat', 'cmd',
|
||||||
|
# Web technologies (excluding html/htm which have dedicated renderer)
|
||||||
|
'css', 'scss', 'sass', 'less', 'xml', 'yaml', 'yml', 'toml', 'ini', 'cfg',
|
||||||
|
# Data formats (excluding csv, md/markdown which have dedicated renderers)
|
||||||
|
'tsv', 'log', 'rst', 'sql', 'dockerfile', 'dockerignore', 'gitignore',
|
||||||
|
# Configuration files
|
||||||
|
'env', 'properties', 'conf', 'config', 'rc',
|
||||||
|
'gitattributes', 'editorconfig', 'eslintrc',
|
||||||
|
# Documentation
|
||||||
|
'readme', 'changelog', 'license', 'authors',
|
||||||
|
'contributing', 'todo', 'notes', 'docs'
|
||||||
|
]
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_format_aliases(cls) -> List[str]:
|
||||||
|
"""Return format aliases."""
|
||||||
|
return [
|
||||||
|
'ascii', 'utf8', 'utf-8', 'code', 'source',
|
||||||
|
'script', 'program', 'file', 'document',
|
||||||
|
'raw', 'unformatted', 'plaintext'
|
||||||
|
]
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_priority(cls) -> int:
|
||||||
|
"""Return priority for text renderer."""
|
||||||
|
return 90
|
||||||
|
|
||||||
|
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
|
||||||
|
"""Render extracted JSON content to plain text format."""
|
||||||
|
try:
|
||||||
|
# Generate text from JSON structure
|
||||||
|
text_content = self._generate_text_from_json(extracted_content, title)
|
||||||
|
|
||||||
|
return text_content, "text/plain"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error rendering text: {str(e)}")
|
||||||
|
# Return minimal text fallback
|
||||||
|
return f"{title}\n\nError rendering report: {str(e)}", "text/plain"
|
||||||
|
|
||||||
|
def _generate_text_from_json(self, json_content: Dict[str, Any], title: str) -> str:
|
||||||
|
"""Generate text content from structured JSON document."""
|
||||||
|
try:
|
||||||
|
# Validate JSON structure
|
||||||
|
if not isinstance(json_content, dict):
|
||||||
|
raise ValueError("JSON content must be a dictionary")
|
||||||
|
|
||||||
|
if "sections" not in json_content:
|
||||||
|
raise ValueError("JSON content must contain 'sections' field")
|
||||||
|
|
||||||
|
# Use title from JSON metadata if available, otherwise use provided title
|
||||||
|
document_title = json_content.get("metadata", {}).get("title", title)
|
||||||
|
|
||||||
|
# Build text content
|
||||||
|
text_parts = []
|
||||||
|
|
||||||
|
# Document title
|
||||||
|
text_parts.append(document_title)
|
||||||
|
text_parts.append("=" * len(document_title))
|
||||||
|
text_parts.append("")
|
||||||
|
|
||||||
|
# Process each section
|
||||||
|
sections = json_content.get("sections", [])
|
||||||
|
for section in sections:
|
||||||
|
section_text = self._render_json_section(section)
|
||||||
|
if section_text:
|
||||||
|
text_parts.append(section_text)
|
||||||
|
text_parts.append("") # Add spacing between sections
|
||||||
|
|
||||||
|
# Add generation info
|
||||||
|
text_parts.append("")
|
||||||
|
text_parts.append(f"Generated: {self._format_timestamp()}")
|
||||||
|
|
||||||
|
return '\n'.join(text_parts)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error generating text from JSON: {str(e)}")
|
||||||
|
raise Exception(f"Text generation failed: {str(e)}")
|
||||||
|
|
||||||
|
def _render_json_section(self, section: Dict[str, Any]) -> str:
|
||||||
|
"""Render a single JSON section to text."""
|
||||||
|
try:
|
||||||
|
section_type = self._get_section_type(section)
|
||||||
|
section_data = self._get_section_data(section)
|
||||||
|
|
||||||
|
if section_type == "table":
|
||||||
|
# Process the section data to extract table structure
|
||||||
|
processed_data = self._process_section_by_type(section)
|
||||||
|
return self._render_json_table(processed_data)
|
||||||
|
elif section_type == "bullet_list":
|
||||||
|
# Process the section data to extract bullet list structure
|
||||||
|
processed_data = self._process_section_by_type(section)
|
||||||
|
return self._render_json_bullet_list(processed_data)
|
||||||
|
elif section_type == "heading":
|
||||||
|
# Render each heading element in the elements array
|
||||||
|
# section_data is already the elements array from _get_section_data
|
||||||
|
rendered_elements = []
|
||||||
|
for element in section_data:
|
||||||
|
rendered_elements.append(self._render_json_heading(element))
|
||||||
|
return "\n".join(rendered_elements)
|
||||||
|
elif section_type == "paragraph":
|
||||||
|
# Render each paragraph element in the elements array
|
||||||
|
# section_data is already the elements array from _get_section_data
|
||||||
|
rendered_elements = []
|
||||||
|
for element in section_data:
|
||||||
|
rendered_elements.append(self._render_json_paragraph(element))
|
||||||
|
return "\n".join(rendered_elements)
|
||||||
|
elif section_type == "code_block":
|
||||||
|
# Process the section data to extract code block structure
|
||||||
|
processed_data = self._process_section_by_type(section)
|
||||||
|
return self._render_json_code_block(processed_data)
|
||||||
|
elif section_type == "image":
|
||||||
|
# Process the section data to extract image structure
|
||||||
|
processed_data = self._process_section_by_type(section)
|
||||||
|
return self._render_json_image(processed_data)
|
||||||
|
else:
|
||||||
|
# Fallback to paragraph for unknown types - render each element
|
||||||
|
# section_data is already the elements array from _get_section_data
|
||||||
|
rendered_elements = []
|
||||||
|
for element in section_data:
|
||||||
|
rendered_elements.append(self._render_json_paragraph(element))
|
||||||
|
return "\n".join(rendered_elements)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
|
||||||
|
return f"[Error rendering section: {str(e)}]"
|
||||||
|
|
||||||
|
def _render_json_table(self, table_data: Dict[str, Any]) -> str:
|
||||||
|
"""Render a JSON table to text."""
|
||||||
|
try:
|
||||||
|
headers = table_data.get("headers", [])
|
||||||
|
rows = table_data.get("rows", [])
|
||||||
|
|
||||||
|
if not headers or not rows:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
text_parts = []
|
||||||
|
|
||||||
|
# Create table header
|
||||||
|
header_line = " | ".join(str(header) for header in headers)
|
||||||
|
text_parts.append(header_line)
|
||||||
|
|
||||||
|
# Add separator line
|
||||||
|
separator_line = " | ".join("-" * len(str(header)) for header in headers)
|
||||||
|
text_parts.append(separator_line)
|
||||||
|
|
||||||
|
# Add data rows
|
||||||
|
for row in rows:
|
||||||
|
row_line = " | ".join(str(cell_data) for cell_data in row)
|
||||||
|
text_parts.append(row_line)
|
||||||
|
|
||||||
|
return '\n'.join(text_parts)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error rendering table: {str(e)}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def _render_json_bullet_list(self, list_data: Dict[str, Any]) -> str:
|
||||||
|
"""Render a JSON bullet list to text."""
|
||||||
|
try:
|
||||||
|
items = list_data.get("items", [])
|
||||||
|
|
||||||
|
if not items:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
text_parts = []
|
||||||
|
for item in items:
|
||||||
|
if isinstance(item, str):
|
||||||
|
text_parts.append(f"- {item}")
|
||||||
|
elif isinstance(item, dict) and "text" in item:
|
||||||
|
text_parts.append(f"- {item['text']}")
|
||||||
|
|
||||||
|
return '\n'.join(text_parts)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error rendering bullet list: {str(e)}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def _render_json_heading(self, heading_data: Dict[str, Any]) -> str:
|
||||||
|
"""Render a JSON heading to text."""
|
||||||
|
try:
|
||||||
|
level = heading_data.get("level", 1)
|
||||||
|
text = heading_data.get("text", "")
|
||||||
|
|
||||||
|
if text:
|
||||||
|
level = max(1, min(6, level))
|
||||||
|
if level == 1:
|
||||||
|
return f"{text}\n{'=' * len(text)}"
|
||||||
|
elif level == 2:
|
||||||
|
return f"{text}\n{'-' * len(text)}"
|
||||||
|
else:
|
||||||
|
return f"{'#' * level} {text}"
|
||||||
|
|
||||||
|
return ""
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error rendering heading: {str(e)}")
|
||||||
|
return ""
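# Illustrative note (not in the original source): heading levels map to the usual
# plain-text conventions, e.g. a level 1 "Summary" becomes "Summary" underlined with "=",
# level 2 is underlined with "-", and level 3 and deeper falls back to a "### Summary"
# style prefix.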
|
||||||
|
|
||||||
|
def _render_json_paragraph(self, paragraph_data: Dict[str, Any]) -> str:
|
||||||
|
"""Render a JSON paragraph to text."""
|
||||||
|
try:
|
||||||
|
text = paragraph_data.get("text", "")
|
||||||
|
return text if text else ""
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error rendering paragraph: {str(e)}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def _render_json_code_block(self, code_data: Dict[str, Any]) -> str:
|
||||||
|
"""Render a JSON code block to text."""
|
||||||
|
try:
|
||||||
|
code = code_data.get("code", "")
|
||||||
|
language = code_data.get("language", "")
|
||||||
|
|
||||||
|
if code:
|
||||||
|
if language:
|
||||||
|
return f"Code ({language}):\n{code}"
|
||||||
|
else:
|
||||||
|
return code
|
||||||
|
|
||||||
|
return ""
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error rendering code block: {str(e)}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def _render_json_image(self, image_data: Dict[str, Any]) -> str:
|
||||||
|
"""Render a JSON image to text."""
|
||||||
|
try:
|
||||||
|
alt_text = image_data.get("altText", "Image")
|
||||||
|
return f"[Image: {alt_text}]"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error rendering image: {str(e)}")
|
||||||
|
return f"[Image: {image_data.get('altText', 'Image')}]"
|
||||||
modules/services/serviceGeneration/renderers/rendererXlsx.py (new file, 791 lines)
@@ -0,0 +1,791 @@
"""
|
||||||
|
Excel renderer for report generation using openpyxl.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .rendererBaseTemplate import BaseRenderer
|
||||||
|
from typing import Dict, Any, Tuple, List
|
||||||
|
import io
|
||||||
|
import base64
|
||||||
|
from datetime import datetime, UTC
|
||||||
|
|
||||||
|
try:
|
||||||
|
from openpyxl import Workbook
|
||||||
|
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
||||||
|
from openpyxl.utils import get_column_letter
|
||||||
|
from openpyxl.worksheet.table import Table, TableStyleInfo
|
||||||
|
OPENPYXL_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
OPENPYXL_AVAILABLE = False
|
||||||
|
|
||||||
|
class RendererXlsx(BaseRenderer):
|
||||||
|
"""Renders content to Excel format using openpyxl."""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_supported_formats(cls) -> List[str]:
|
||||||
|
"""Return supported Excel formats."""
|
||||||
|
return ['xlsx', 'xls', 'excel']
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_format_aliases(cls) -> List[str]:
|
||||||
|
"""Return format aliases."""
|
||||||
|
return ['spreadsheet', 'workbook']
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_priority(cls) -> int:
|
||||||
|
"""Return priority for Excel renderer."""
|
||||||
|
return 110
|
||||||
|
|
||||||
|
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
|
||||||
|
"""Render extracted JSON content to Excel format using AI-analyzed styling."""
|
||||||
|
try:
|
||||||
|
if not OPENPYXL_AVAILABLE:
|
||||||
|
# Fallback to CSV if openpyxl not available
|
||||||
|
from .rendererCsv import RendererCsv
|
||||||
|
csv_renderer = RendererCsv()
|
||||||
|
csv_content, _ = await csv_renderer.render(extracted_content, title, user_prompt, ai_service)
|
||||||
|
return csv_content, "text/csv"
|
||||||
|
|
||||||
|
# Generate Excel using AI-analyzed styling
|
||||||
|
excel_content = await self._generate_excel_from_json(extracted_content, title, user_prompt, ai_service)
|
||||||
|
|
||||||
|
return excel_content, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error rendering Excel: {str(e)}")
|
||||||
|
# Return CSV fallback
|
||||||
|
return f"Title,Content\n{title},Error rendering Excel report: {str(e)}", "text/csv"
|
||||||
|
|
||||||
|
def _generate_excel(self, content: str, title: str) -> str:
|
||||||
|
"""Generate Excel content using openpyxl."""
|
||||||
|
try:
|
||||||
|
# Create workbook
|
||||||
|
wb = Workbook()
|
||||||
|
|
||||||
|
# Remove default sheet
|
||||||
|
wb.remove(wb.active)
|
||||||
|
|
||||||
|
# Create sheets
|
||||||
|
summary_sheet = wb.create_sheet("Summary", 0)
|
||||||
|
data_sheet = wb.create_sheet("Data", 1)
|
||||||
|
analysis_sheet = wb.create_sheet("Analysis", 2)
|
||||||
|
|
||||||
|
# Add content to sheets
|
||||||
|
self._populate_summary_sheet(summary_sheet, title)
|
||||||
|
self._populate_data_sheet(data_sheet, content)
|
||||||
|
self._populate_analysis_sheet(analysis_sheet, content)
|
||||||
|
|
||||||
|
# Save to buffer
|
||||||
|
buffer = io.BytesIO()
|
||||||
|
wb.save(buffer)
|
||||||
|
buffer.seek(0)
|
||||||
|
|
||||||
|
# Convert to base64
|
||||||
|
excel_bytes = buffer.getvalue()
|
||||||
|
excel_base64 = base64.b64encode(excel_bytes).decode('utf-8')
|
||||||
|
|
||||||
|
return excel_base64
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error generating Excel: {str(e)}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
def _populate_summary_sheet(self, sheet, title: str):
|
||||||
|
"""Populate the summary sheet."""
|
||||||
|
try:
|
||||||
|
# Title
|
||||||
|
sheet['A1'] = title
|
||||||
|
sheet['A1'].font = Font(size=16, bold=True)
|
||||||
|
sheet['A1'].alignment = Alignment(horizontal='center')
|
||||||
|
|
||||||
|
# Generation info
|
||||||
|
sheet['A3'] = "Generated:"
|
||||||
|
sheet['B3'] = self._format_timestamp()
|
||||||
|
sheet['A4'] = "Status:"
|
||||||
|
sheet['B4'] = "Generated Successfully"
|
||||||
|
|
||||||
|
# Key metrics placeholder
|
||||||
|
sheet['A6'] = "Key Metrics:"
|
||||||
|
sheet['A6'].font = Font(bold=True)
|
||||||
|
sheet['A7'] = "Total Items:"
|
||||||
|
sheet['B7'] = "=COUNTA(Data!A:A)-1" # Count non-empty cells in Data sheet
|
||||||
|
|
||||||
|
# Auto-adjust column widths
|
||||||
|
sheet.column_dimensions['A'].width = 20
|
||||||
|
sheet.column_dimensions['B'].width = 30
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Could not populate summary sheet: {str(e)}")
|
||||||
|
|
||||||
|
def _populate_data_sheet(self, sheet, content: str):
|
||||||
|
"""Populate the data sheet."""
|
||||||
|
try:
|
||||||
|
# Headers
|
||||||
|
headers = ["Item/Category", "Value/Amount", "Percentage", "Source Document", "Notes/Comments"]
|
||||||
|
for col, header in enumerate(headers, 1):
|
||||||
|
cell = sheet.cell(row=1, column=col, value=header)
|
||||||
|
cell.font = Font(bold=True)
|
||||||
|
cell.fill = PatternFill(start_color="CCCCCC", end_color="CCCCCC", fill_type="solid")
|
||||||
|
|
||||||
|
# Process content
|
||||||
|
lines = content.split('\n')
|
||||||
|
row = 2
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check for table data (lines with |)
|
||||||
|
if '|' in line:
|
||||||
|
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
|
||||||
|
for col, cell_data in enumerate(cells[:5], 1): # Limit to 5 columns
|
||||||
|
sheet.cell(row=row, column=col, value=cell_data)
|
||||||
|
row += 1
|
||||||
|
else:
|
||||||
|
# Regular content
|
||||||
|
sheet.cell(row=row, column=1, value=line)
|
||||||
|
row += 1
|
||||||
|
|
||||||
|
# Auto-adjust column widths
|
||||||
|
for col in range(1, 6):
|
||||||
|
sheet.column_dimensions[get_column_letter(col)].width = 20
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Could not populate data sheet: {str(e)}")
|
||||||
|
|
||||||
|
def _populate_analysis_sheet(self, sheet, content: str):
|
||||||
|
"""Populate the analysis sheet."""
|
||||||
|
try:
|
||||||
|
# Title
|
||||||
|
sheet['A1'] = "Analysis & Insights"
|
||||||
|
sheet['A1'].font = Font(size=14, bold=True)
|
||||||
|
|
||||||
|
# Content analysis
|
||||||
|
lines = content.split('\n')
|
||||||
|
row = 3
|
||||||
|
|
||||||
|
sheet['A3'] = "Content Analysis:"
|
||||||
|
sheet['A3'].font = Font(bold=True)
|
||||||
|
row += 1
|
||||||
|
|
||||||
|
# Count different types of content
|
||||||
|
table_lines = sum(1 for line in lines if '|' in line)
|
||||||
|
list_lines = sum(1 for line in lines if line.startswith(('- ', '* ')))
|
||||||
|
text_lines = len(lines) - table_lines - list_lines
|
||||||
|
|
||||||
|
sheet[f'A{row}'] = f"Total Lines: {len(lines)}"
|
||||||
|
row += 1
|
||||||
|
sheet[f'A{row}'] = f"Table Rows: {table_lines}"
|
||||||
|
row += 1
|
||||||
|
sheet[f'A{row}'] = f"List Items: {list_lines}"
|
||||||
|
row += 1
|
||||||
|
sheet[f'A{row}'] = f"Text Lines: {text_lines}"
|
||||||
|
row += 2
|
||||||
|
|
||||||
|
# Recommendations
|
||||||
|
sheet[f'A{row}'] = "Recommendations:"
|
||||||
|
sheet[f'A{row}'].font = Font(bold=True)
|
||||||
|
row += 1
|
||||||
|
sheet[f'A{row}'] = "1. Review data accuracy"
|
||||||
|
row += 1
|
||||||
|
sheet[f'A{row}'] = "2. Consider additional analysis"
|
||||||
|
row += 1
|
||||||
|
sheet[f'A{row}'] = "3. Update regularly"
|
||||||
|
|
||||||
|
# Auto-adjust column width
|
||||||
|
sheet.column_dimensions['A'].width = 30
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Could not populate analysis sheet: {str(e)}")
|
||||||
|
|
||||||
|
async def _generate_excel_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
|
||||||
|
"""Generate Excel content from structured JSON document using AI-generated styling."""
|
||||||
|
try:
|
||||||
|
# Debug output
|
||||||
|
self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT TYPE: {type(json_content)}", "EXCEL_RENDERER")
|
||||||
|
self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT KEYS: {list(json_content.keys()) if isinstance(json_content, dict) else 'Not a dict'}", "EXCEL_RENDERER")
|
||||||
|
|
||||||
|
# Get AI-generated styling definitions
|
||||||
|
styles = await self._get_excel_styles(user_prompt, ai_service)
|
||||||
|
|
||||||
|
# Validate JSON structure
|
||||||
|
if not isinstance(json_content, dict):
|
||||||
|
raise ValueError("JSON content must be a dictionary")
|
||||||
|
|
||||||
|
if "sections" not in json_content:
|
||||||
|
raise ValueError("JSON content must contain 'sections' field")
|
||||||
|
|
||||||
|
# Use title from JSON metadata if available, otherwise use provided title
|
||||||
|
document_title = json_content.get("metadata", {}).get("title", title)
|
||||||
|
|
||||||
|
# Create workbook
|
||||||
|
wb = Workbook()
|
||||||
|
|
||||||
|
# Create sheets based on content
|
||||||
|
sheets = self._create_excel_sheets(wb, json_content, styles)
|
||||||
|
self.services.utils.debugLogToFile(f"EXCEL SHEETS CREATED: {list(sheets.keys()) if sheets else 'None'}", "EXCEL_RENDERER")
|
||||||
|
|
||||||
|
# Populate sheets with content
|
||||||
|
self._populate_excel_sheets(sheets, json_content, styles)
|
||||||
|
|
||||||
|
# Save to buffer
|
||||||
|
buffer = io.BytesIO()
|
||||||
|
wb.save(buffer)
|
||||||
|
buffer.seek(0)
|
||||||
|
|
||||||
|
# Convert to base64
|
||||||
|
excel_bytes = buffer.getvalue()
|
||||||
|
self.services.utils.debugLogToFile(f"EXCEL BYTES LENGTH: {len(excel_bytes)}", "EXCEL_RENDERER")
|
||||||
|
try:
|
||||||
|
excel_base64 = base64.b64encode(excel_bytes).decode('utf-8')
|
||||||
|
self.services.utils.debugLogToFile(f"EXCEL BASE64 LENGTH: {len(excel_base64)}", "EXCEL_RENDERER")
|
||||||
|
except Exception as b64_error:
|
||||||
|
self.services.utils.debugLogToFile(f"BASE64 ENCODING ERROR: {b64_error}", "EXCEL_RENDERER")
|
||||||
|
raise
|
||||||
|
|
||||||
|
return excel_base64
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error generating Excel from JSON: {str(e)}")
|
||||||
|
raise Exception(f"Excel generation failed: {str(e)}")
|
||||||
|
|
||||||
|
async def _get_excel_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]:
|
||||||
|
"""Get Excel styling definitions using base template AI styling."""
|
||||||
|
style_schema = {
|
||||||
|
"title": {"font_size": 16, "color": "#FF1F4E79", "bold": True, "align": "center"},
|
||||||
|
"heading": {"font_size": 14, "color": "#FF2F2F2F", "bold": True, "align": "left"},
|
||||||
|
"table_header": {"background": "#FF4F4F4F", "text_color": "#FFFFFFFF", "bold": True, "align": "center"},
|
||||||
|
"table_cell": {"background": "#FFFFFFFF", "text_color": "#FF2F2F2F", "bold": False, "align": "left"},
|
||||||
|
"bullet_list": {"font_size": 11, "color": "#FF2F2F2F", "indent": 2},
|
||||||
|
"paragraph": {"font_size": 11, "color": "#FF2F2F2F", "bold": False, "align": "left"},
|
||||||
|
"code_block": {"font": "Courier New", "font_size": 10, "color": "#FF2F2F2F", "background": "#FFF5F5F5"}
|
||||||
|
}
|
||||||
|
|
||||||
|
style_template = self._create_ai_style_template("xlsx", user_prompt, style_schema)
|
||||||
|
# Use our own _get_ai_styles_with_excel_colors method to ensure proper color conversion
|
||||||
|
styles = await self._get_ai_styles_with_excel_colors(ai_service, style_template, self._get_default_excel_styles())
|
||||||
|
|
||||||
|
# Validate and fix contrast issues
|
||||||
|
return self._validate_excel_styles_contrast(styles)
|
||||||
|
|
||||||
|
async def _get_ai_styles_with_excel_colors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""Get AI styles with proper Excel color conversion."""
|
||||||
|
if not ai_service:
|
||||||
|
return default_styles
|
||||||
|
|
||||||
|
try:
|
||||||
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||||
|
|
||||||
|
request_options = AiCallOptions()
|
||||||
|
request_options.operationType = OperationType.GENERAL
|
||||||
|
|
||||||
|
request = AiCallRequest(prompt=style_template, context="", options=request_options)
|
||||||
|
response = await ai_service.aiObjects.call(request)
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Clean and parse JSON
|
||||||
|
result = response.content.strip() if response and response.content else ""
|
||||||
|
|
||||||
|
# Check if result is empty
|
||||||
|
if not result:
|
||||||
|
self.logger.warning("AI styling returned empty response, using defaults")
|
||||||
|
return default_styles
|
||||||
|
|
||||||
|
# Extract JSON from markdown if present
|
||||||
|
json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
|
||||||
|
if json_match:
|
||||||
|
result = json_match.group(1).strip()
|
||||||
|
self.services.utils.debugLogToFile(f"EXTRACTED JSON FROM MARKDOWN: {result[:100]}...", "EXCEL_RENDERER")
|
||||||
|
elif result.startswith('```json'):
|
||||||
|
result = re.sub(r'^```json\s*', '', result)
|
||||||
|
result = re.sub(r'\s*```$', '', result)
|
||||||
|
self.services.utils.debugLogToFile(f"CLEANED JSON FROM MARKDOWN: {result[:100]}...", "EXCEL_RENDERER")
|
||||||
|
elif result.startswith('```'):
|
||||||
|
result = re.sub(r'^```\s*', '', result)
|
||||||
|
result = re.sub(r'\s*```$', '', result)
|
||||||
|
self.services.utils.debugLogToFile(f"CLEANED JSON FROM GENERIC MARKDOWN: {result[:100]}...", "EXCEL_RENDERER")
|
||||||
|
|
||||||
|
# Try to parse JSON
|
||||||
|
try:
|
||||||
|
styles = json.loads(result)
|
||||||
|
except json.JSONDecodeError as json_error:
|
||||||
|
self.logger.warning(f"AI styling returned invalid JSON: {json_error}, using defaults")
|
||||||
|
return default_styles
|
||||||
|
|
||||||
|
# Convert colors to Excel aRGB format
|
||||||
|
styles = self._convert_colors_format(styles)
|
||||||
|
|
||||||
|
return styles
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"AI styling failed: {str(e)}, using defaults")
|
||||||
|
return default_styles
|
||||||
|
|
||||||
|
def _get_safe_color(self, color_value: str, default: str = "FF000000") -> str:
|
||||||
|
"""Get a safe aRGB color value for Excel (without # prefix)."""
|
||||||
|
if not isinstance(color_value, str):
|
||||||
|
return default
|
||||||
|
|
||||||
|
# Remove # prefix if present
|
||||||
|
if color_value.startswith('#'):
|
||||||
|
color_value = color_value[1:]
|
||||||
|
|
||||||
|
if len(color_value) == 6:
|
||||||
|
# Convert RRGGBB to AARRGGBB
|
||||||
|
return f"FF{color_value}"
|
||||||
|
elif len(color_value) == 8:
|
||||||
|
# Already aRGB format
|
||||||
|
return color_value
|
||||||
|
else:
|
||||||
|
# Unexpected format, return default
|
||||||
|
return default
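# Illustrative note (not in the original source): unlike the PowerPoint renderer, Excel
# colors are kept as aRGB strings for openpyxl, e.g.
#   self._get_safe_color("#1F4E79")  -> "FF1F4E79"
#   self._get_safe_color("FF2F2F2F") -> "FF2F2F2F"   # already aRGB, returned unchanged
#   self._get_safe_color(1234)       -> "FF000000"   # non-string input falls back to default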
|
||||||
|
|
||||||
|
def _convert_colors_format(self, styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""Convert hex colors to aRGB format for Excel compatibility."""
|
||||||
|
try:
|
||||||
|
self.services.utils.debugLogToFile(f"CONVERTING COLORS IN STYLES: {styles}", "EXCEL_RENDERER")
|
||||||
|
for style_name, style_config in styles.items():
|
||||||
|
if isinstance(style_config, dict):
|
||||||
|
for prop, value in style_config.items():
|
||||||
|
if isinstance(value, str) and value.startswith('#') and len(value) == 7:
|
||||||
|
# Convert #RRGGBB to #AARRGGBB (add FF alpha channel)
|
||||||
|
styles[style_name][prop] = f"FF{value[1:]}"
|
||||||
|
elif isinstance(value, str) and value.startswith('#') and len(value) == 9:
|
||||||
|
pass # Already aRGB format
|
||||||
|
elif isinstance(value, str) and value.startswith('#'):
|
||||||
|
pass # Unexpected format, keep as is
|
||||||
|
return styles
|
||||||
|
except Exception as e:
|
||||||
|
return styles
|
||||||
|
|
||||||
|
def _validate_excel_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""Validate and fix contrast issues in AI-generated styles."""
|
||||||
|
try:
|
||||||
|
# Fix table header contrast
|
||||||
|
if "table_header" in styles:
|
||||||
|
header = styles["table_header"]
|
||||||
|
bg_color = header.get("background", "#FFFFFF")
|
||||||
|
text_color = header.get("text_color", "#000000")
|
||||||
|
|
||||||
|
# If both are white or both are dark, fix it
|
||||||
|
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
|
||||||
|
header["background"] = "#4F4F4F"
|
||||||
|
header["text_color"] = "#FFFFFF"
|
||||||
|
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
|
||||||
|
header["background"] = "#4F4F4F"
|
||||||
|
header["text_color"] = "#FFFFFF"
|
||||||
|
|
||||||
|
# Fix table cell contrast
|
||||||
|
if "table_cell" in styles:
|
||||||
|
cell = styles["table_cell"]
|
||||||
|
bg_color = cell.get("background", "#FFFFFF")
|
||||||
|
text_color = cell.get("text_color", "#000000")
|
||||||
|
|
||||||
|
# If both are white or both are dark, fix it
|
||||||
|
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
|
||||||
|
cell["background"] = "#FFFFFF"
|
||||||
|
cell["text_color"] = "#2F2F2F"
|
||||||
|
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
|
||||||
|
cell["background"] = "#FFFFFF"
|
||||||
|
cell["text_color"] = "#2F2F2F"
|
||||||
|
|
||||||
|
return styles
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Style validation failed: {str(e)}")
|
||||||
|
return self._get_default_excel_styles()
|
||||||
|
|
||||||
|
def _get_default_excel_styles(self) -> Dict[str, Any]:
|
||||||
|
"""Default Excel styles with aRGB color format."""
|
||||||
|
return {
|
||||||
|
"title": {"font_size": 16, "color": "#FF1F4E79", "bold": True, "align": "center"},
|
||||||
|
"heading": {"font_size": 14, "color": "#FF2F2F2F", "bold": True, "align": "left"},
|
||||||
|
"table_header": {"background": "#FF4F4F4F", "text_color": "#FFFFFFFF", "bold": True, "align": "center"},
|
||||||
|
"table_cell": {"background": "#FFFFFFFF", "text_color": "#FF2F2F2F", "bold": False, "align": "left"},
|
||||||
|
"bullet_list": {"font_size": 11, "color": "#FF2F2F2F", "indent": 2},
|
||||||
|
"paragraph": {"font_size": 11, "color": "#FF2F2F2F", "bold": False, "align": "left"},
|
||||||
|
"code_block": {"font": "Courier New", "font_size": 10, "color": "#FF2F2F2F", "background": "#FFF5F5F5"}
|
||||||
|
}
|
||||||
|
|
||||||
|
def _create_excel_sheets(self, wb: Workbook, json_content: Dict[str, Any], styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""Create Excel sheets based on content structure and user intent."""
|
||||||
|
sheets = {}
|
||||||
|
|
||||||
|
# Get sheet names from AI styles or generate based on content
|
||||||
|
sheet_names = styles.get("sheet_names", self._generate_sheet_names_from_content(json_content))
|
||||||
|
self.services.utils.debugLogToFile(f"EXCEL SHEET NAMES: {sheet_names}", "EXCEL_RENDERER")
|
||||||
|
|
||||||
|
# Create sheets
|
||||||
|
for i, sheet_name in enumerate(sheet_names):
|
||||||
|
if i == 0:
|
||||||
|
# Use the default sheet for the first sheet
|
||||||
|
sheet = wb.active
|
||||||
|
sheet.title = sheet_name
|
||||||
|
else:
|
||||||
|
# Create additional sheets
|
||||||
|
sheet = wb.create_sheet(sheet_name, i)
|
||||||
|
sheets[sheet_name.lower()] = sheet
|
||||||
|
|
||||||
|
return sheets
|
||||||
|
|
||||||
|
def _generate_sheet_names_from_content(self, json_content: Dict[str, Any]) -> List[str]:
|
||||||
|
"""Generate sheet names based on actual content structure."""
|
||||||
|
sections = json_content.get("sections", [])
|
||||||
|
|
||||||
|
# If no sections, create a single sheet
|
||||||
|
if not sections:
|
||||||
|
return ["Content"]
|
||||||
|
|
||||||
|
# Generate sheet names based on content structure
|
||||||
|
sheet_names = []
|
||||||
|
|
||||||
|
# Check if we have multiple table sections
|
||||||
|
table_sections = [s for s in sections if s.get("content_type") == "table"]
|
||||||
|
|
||||||
|
if len(table_sections) > 1:
|
||||||
|
# Create separate sheets for each table
|
||||||
|
for i, section in enumerate(table_sections, 1):
|
||||||
|
section_title = section.get("title", f"Table {i}")
|
||||||
|
sheet_names.append(section_title[:31]) # Excel sheet name limit
|
||||||
|
else:
|
||||||
|
# Single table or mixed content - create main sheet
|
||||||
|
document_title = json_content.get("metadata", {}).get("title", "Document")
|
||||||
|
sheet_names.append(document_title[:31]) # Excel sheet name limit
|
||||||
|
|
||||||
|
# Add additional sheets for other content types
|
||||||
|
content_types = set()
|
||||||
|
for section in sections:
|
||||||
|
content_type = section.get("content_type", "paragraph")
|
||||||
|
content_types.add(content_type)
|
||||||
|
|
||||||
|
if "table" in content_types and len(table_sections) == 1:
|
||||||
|
sheet_names.append("Table Data")
|
||||||
|
if "list" in content_types:
|
||||||
|
sheet_names.append("Lists")
|
||||||
|
if "paragraph" in content_types or "heading" in content_types:
|
||||||
|
sheet_names.append("Text")
|
||||||
|
|
||||||
|
# Limit to 4 sheets maximum
|
||||||
|
return sheet_names[:4]
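# Illustrative note (not in the original source): for a document titled "Quarterly Report"
# with one table section plus paragraph sections, the helper above yields
#   ["Quarterly Report", "Table Data", "Text"]
# while a document with several table sections gets one sheet per table title instead,
# always capped at four sheets and 31 characters per name (the Excel limit).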
|
||||||
|
|
||||||
|
def _populate_excel_sheets(self, sheets: Dict[str, Any], json_content: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||||
|
"""Populate Excel sheets with content from JSON based on actual sheet names."""
|
||||||
|
try:
|
||||||
|
# Get the actual sheet names that were created
|
||||||
|
sheet_names = list(sheets.keys())
|
||||||
|
|
||||||
|
if not sheet_names:
|
||||||
|
return
|
||||||
|
|
||||||
|
sections = json_content.get("sections", [])
|
||||||
|
table_sections = [s for s in sections if s.get("content_type") == "table"]
|
||||||
|
|
||||||
|
if len(table_sections) > 1:
|
||||||
|
# Multiple tables - populate each sheet with its corresponding table
|
||||||
|
for i, section in enumerate(table_sections):
|
||||||
|
if i < len(sheet_names):
|
||||||
|
sheet_name = sheet_names[i]
|
||||||
|
sheet = sheets[sheet_name]
|
||||||
|
self._populate_table_sheet(sheet, section, styles, f"Table {i+1}")
|
||||||
|
else:
|
||||||
|
# Single table or mixed content - use original logic
|
||||||
|
first_sheet_name = sheet_names[0]
|
||||||
|
self._populate_main_sheet(sheets[first_sheet_name], json_content, styles)
|
||||||
|
|
||||||
|
# If we have multiple sheets, distribute content by type
|
||||||
|
if len(sheet_names) > 1:
|
||||||
|
self._populate_content_type_sheets(sheets, json_content, styles, sheet_names[1:])
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Could not populate Excel sheets: {str(e)}")
|
||||||
|
|
||||||
|
    def _populate_table_sheet(self, sheet, section: Dict[str, Any], styles: Dict[str, Any], sheet_title: str):
        """Populate a sheet with a single table section."""
        try:
            # Sheet title
            sheet['A1'] = sheet_title
            sheet['A1'].font = Font(size=16, bold=True, color=self._get_safe_color(styles.get("title", {}).get("color", "FF1F4E79")))
            sheet['A1'].alignment = Alignment(horizontal="center")

            # Get table data from elements (canonical JSON format)
            elements = section.get("elements", [])
            if elements and isinstance(elements, list) and len(elements) > 0:
                table_data = elements[0]
                headers = table_data.get("headers", [])
                rows = table_data.get("rows", [])
            else:
                headers = []
                rows = []

            if not headers and not rows:
                sheet['A3'] = "No table data available"
                return

            # Add headers
            header_style = styles.get("table_header", {})
            for col, header in enumerate(headers, 1):
                cell = sheet.cell(row=3, column=col, value=header)
                if header_style.get("bold"):
                    cell.font = Font(bold=True, color=self._get_safe_color(header_style.get("text_color", "FF000000")))
                if header_style.get("background"):
                    cell.fill = PatternFill(start_color=self._get_safe_color(header_style["background"]), end_color=self._get_safe_color(header_style["background"]), fill_type="solid")

            # Add rows
            cell_style = styles.get("table_cell", {})
            for row_idx, row_data in enumerate(rows, 4):
                for col_idx, cell_value in enumerate(row_data, 1):
                    cell = sheet.cell(row=row_idx, column=col_idx, value=cell_value)
                    if cell_style.get("text_color"):
                        cell.font = Font(color=self._get_safe_color(cell_style["text_color"]))

            # Auto-adjust column widths
            for col in range(1, len(headers) + 1):
                sheet.column_dimensions[get_column_letter(col)].width = 20

        except Exception as e:
            self.logger.warning(f"Could not populate table sheet: {str(e)}")

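    # Resulting layout for a single-table sheet (sketch, assuming openpyxl as
    # implied by the Font/PatternFill/get_column_letter calls above): the title
    # goes in A1, headers start on row 3, and data rows start on row 4.
    #
    #     A1: "Table 1"             (bold, centered title)
    #     A3: headers[0]   B3: headers[1]   ...
    #     A4: rows[0][0]   B4: rows[0][1]   ...
    #     every used column is given a fixed width of 20
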
    def _populate_main_sheet(self, sheet, json_content: Dict[str, Any], styles: Dict[str, Any]):
        """Populate the main sheet with document overview and all content."""
        try:
            # Document title
            document_title = json_content.get("metadata", {}).get("title", "Generated Report")
            sheet['A1'] = document_title

            # Safety check for title style
            title_style = styles.get("title", {"font_size": 16, "bold": True, "color": "#FF1F4E79", "align": "center"})
            try:
                safe_color = self._get_safe_color(title_style["color"])
                sheet['A1'].font = Font(size=title_style["font_size"], bold=title_style["bold"], color=safe_color)
                sheet['A1'].alignment = Alignment(horizontal=title_style["align"])
            except Exception:
                # Try with a safe color
                sheet['A1'].font = Font(size=title_style["font_size"], bold=title_style["bold"], color="FF000000")
                sheet['A1'].alignment = Alignment(horizontal=title_style["align"])

            # Generation info
            sheet['A3'] = "Generated:"
            sheet['B3'] = self._format_timestamp()
            sheet['A4'] = "Status:"
            sheet['B4'] = "Generated Successfully"

            # Document metadata (initialize row first so it is defined even when
            # there is no metadata to list)
            metadata = json_content.get("metadata", {})
            row = 7
            if metadata:
                sheet['A6'] = "Document Information:"
                sheet['A6'].font = Font(bold=True)

                for key, value in metadata.items():
                    if key != "title":
                        sheet[f'A{row}'] = f"{key.title()}:"
                        sheet[f'B{row}'] = str(value)
                        row += 1

            # Content overview
            sections = json_content.get("sections", [])
            sheet[f'A{row + 1}'] = "Content Overview:"
            sheet[f'A{row + 1}'].font = Font(bold=True)

            row += 2
            sheet[f'A{row}'] = f"Total Sections: {len(sections)}"

            # Count different content types
            content_types = {}
            for section in sections:
                content_type = section.get("content_type", "unknown")
                content_types[content_type] = content_types.get(content_type, 0) + 1

            for content_type, count in content_types.items():
                row += 1
                sheet[f'A{row}'] = f"{content_type.title()} Sections: {count}"

            # Add all content to this sheet
            row += 2
            for section in sections:
                row = self._add_section_to_sheet(sheet, section, styles, row)
                row += 1  # Empty row between sections

            # Auto-adjust column widths
            sheet.column_dimensions['A'].width = 20
            sheet.column_dimensions['B'].width = 30

        except Exception as e:
            self.logger.warning(f"Could not populate main sheet: {str(e)}")

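    # Sketch of the resulting overview layout (row numbers assume at least one
    # non-title metadata entry; values are illustrative):
    #
    #     A1: document title       A3/B3: "Generated:" / timestamp
    #     A4/B4: "Status:" / "Generated Successfully"
    #     A6: "Document Information:" followed by one metadata key per row
    #     then "Content Overview:", per-type section counts, and finally every
    #     section appended via _add_section_to_sheet
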
    def _populate_content_type_sheets(self, sheets: Dict[str, Any], json_content: Dict[str, Any], styles: Dict[str, Any], sheet_names: List[str]):
        """Populate additional sheets based on content types."""
        try:
            sections = json_content.get("sections", [])

            for sheet_name in sheet_names:
                if sheet_name not in sheets:
                    continue

                sheet = sheets[sheet_name]
                sheet_title = sheet_name.title()
                sheet['A1'] = sheet_title
                sheet['A1'].font = Font(size=16, bold=True)

                row = 3

                # Filter sections by content type
                if sheet_name == "tables":
                    filtered_sections = [s for s in sections if s.get("content_type") == "table"]
                elif sheet_name == "lists":
                    filtered_sections = [s for s in sections if s.get("content_type") == "list"]
                elif sheet_name == "text":
                    filtered_sections = [s for s in sections if s.get("content_type") in ["paragraph", "heading"]]
                else:
                    filtered_sections = sections

                for section in filtered_sections:
                    row = self._add_section_to_sheet(sheet, section, styles, row)
                    row += 1  # Empty row between sections

                # Auto-adjust column widths
                for col in range(1, 6):
                    sheet.column_dimensions[get_column_letter(col)].width = 20

        except Exception as e:
            self.logger.warning(f"Could not populate content type sheets: {str(e)}")

    def _add_section_to_sheet(self, sheet, section: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
        """Add a section to a sheet and return the next row."""
        try:
            # Add section title
            section_title = section.get("title")
            if section_title:
                sheet[f'A{start_row}'] = f"# {section_title}"
                sheet[f'A{start_row}'].font = Font(bold=True)
                start_row += 1

            # Process section based on type
            section_type = section.get("content_type", "paragraph")

            # Handle all section types using elements array
            elements = section.get("elements", [])
            for element in elements:
                if section_type == "table":
                    start_row = self._add_table_to_excel(sheet, element, styles, start_row)
                elif section_type == "list":
                    start_row = self._add_list_to_excel(sheet, element, styles, start_row)
                elif section_type == "paragraph":
                    start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row)
                elif section_type == "heading":
                    start_row = self._add_heading_to_excel(sheet, element, styles, start_row)
                else:
                    start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row)

            return start_row

        except Exception as e:
            self.logger.warning(f"Could not add section to sheet: {str(e)}")
            return start_row + 1

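    # Example of a canonical section dict this dispatcher expects (hypothetical
    # values); content_type decides which _add_*_to_excel helper handles each
    # element, and unknown types fall back to the paragraph helper:
    #
    #     section = {
    #         "id": "section_3",
    #         "title": "Results",
    #         "content_type": "table",
    #         "elements": [{"headers": ["Metric", "Value"], "rows": [["Total", "42"]]}],
    #         "order": 3,
    #     }
    #     next_row = self._add_section_to_sheet(sheet, section, styles, start_row=10)
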
    def _add_table_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
        """Add a table element to Excel sheet."""
        try:
            # In canonical JSON format, table elements have headers and rows directly
            headers = element.get("headers", [])
            rows = element.get("rows", [])

            if not headers and not rows:
                return start_row

            # Add headers
            header_style = styles.get("table_header", {})
            for col, header in enumerate(headers, 1):
                cell = sheet.cell(row=start_row, column=col, value=header)
                if header_style.get("bold"):
                    cell.font = Font(bold=True, color=self._get_safe_color(header_style.get("text_color", "FF000000")))
                if header_style.get("background"):
                    cell.fill = PatternFill(start_color=self._get_safe_color(header_style["background"]), end_color=self._get_safe_color(header_style["background"]), fill_type="solid")

            start_row += 1

            # Add rows
            cell_style = styles.get("table_cell", {})
            for row_data in rows:
                for col, cell_value in enumerate(row_data, 1):
                    cell = sheet.cell(row=start_row, column=col, value=cell_value)
                    if cell_style.get("text_color"):
                        cell.font = Font(color=self._get_safe_color(cell_style["text_color"]))
                start_row += 1

            return start_row

        except Exception as e:
            self.logger.warning(f"Could not add table to Excel: {str(e)}")
            return start_row + 1

    def _add_list_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
        """Add a list element to Excel sheet."""
        try:
            list_items = element.get("items", [])

            list_style = styles.get("bullet_list", {})
            for item in list_items:
                sheet.cell(row=start_row, column=1, value=f"• {item}")
                if list_style.get("color"):
                    sheet.cell(row=start_row, column=1).font = Font(color=self._get_safe_color(list_style["color"]))
                start_row += 1

            return start_row

        except Exception as e:
            self.logger.warning(f"Could not add list to Excel: {str(e)}")
            return start_row + 1

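    # Sketch of the list element shape consumed above (hypothetical values);
    # each item is written to column 1 with a literal bullet prefix and the row
    # pointer advances by one per item:
    #
    #     element = {"items": ["First point", "Second point"]}
    #     next_row = self._add_list_to_excel(sheet, element, styles, start_row=5)
    #     # A5 == "• First point", A6 == "• Second point", next_row == 7
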
    def _add_paragraph_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
        """Add a paragraph element to Excel sheet."""
        try:
            text = element.get("text", "")
            if text:
                sheet.cell(row=start_row, column=1, value=text)

                paragraph_style = styles.get("paragraph", {})
                if paragraph_style.get("color"):
                    sheet.cell(row=start_row, column=1).font = Font(color=self._get_safe_color(paragraph_style["color"]))

                start_row += 1

            return start_row

        except Exception as e:
            self.logger.warning(f"Could not add paragraph to Excel: {str(e)}")
            return start_row + 1

    def _add_heading_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
        """Add a heading element to Excel sheet."""
        try:
            text = element.get("text", "")
            level = element.get("level", 1)

            if text:
                sheet.cell(row=start_row, column=1, value=text)

                heading_style = styles.get("heading", {})
                font_size = heading_style.get("font_size", 14)
                if level > 1:
                    font_size = max(10, font_size - (level - 1) * 2)

                sheet.cell(row=start_row, column=1).font = Font(
                    size=font_size,
                    bold=True,
                    color=self._get_safe_color(heading_style.get("color", "FF000000"))
                )

                start_row += 1

            return start_row

        except Exception as e:
            self.logger.warning(f"Could not add heading to Excel: {str(e)}")
            return start_row + 1

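    # Worked example of the heading size rule above (base size 14, shrinking by
    # 2 per level, floored at 10):
    #
    #     level 1 -> 14
    #     level 2 -> max(10, 14 - 2)  == 12
    #     level 3 -> max(10, 14 - 4)  == 10
    #     level 6 -> max(10, 14 - 10) == 10
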
    def _format_timestamp(self) -> str:
        """Format current timestamp for document generation."""
        return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
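
    # Example output of the timestamp format above (illustrative value):
    #
    #     self._format_timestamp()  # e.g. "2025-01-15 09:30:00 UTC"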
@@ -1,94 +0,0 @@
"""
|
|
||||||
Text renderer for report generation.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from .base_renderer import BaseRenderer
|
|
||||||
from typing import Dict, Any, Tuple, List
|
|
||||||
|
|
||||||
class TextRenderer(BaseRenderer):
|
|
||||||
"""Renders content to plain text format with format-specific extraction."""
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_supported_formats(cls) -> List[str]:
|
|
||||||
"""Return supported text formats (excluding formats with dedicated renderers)."""
|
|
||||||
return [
|
|
||||||
'txt', 'text', 'plain',
|
|
||||||
# Programming languages
|
|
||||||
'js', 'javascript', 'ts', 'typescript', 'jsx', 'tsx',
|
|
||||||
'py', 'python', 'java', 'cpp', 'c', 'h', 'hpp',
|
|
||||||
'cs', 'csharp', 'php', 'rb', 'ruby', 'go', 'rs', 'rust',
|
|
||||||
'swift', 'kt', 'kotlin', 'scala', 'r', 'm', 'objc',
|
|
||||||
'sh', 'bash', 'zsh', 'fish', 'ps1', 'bat', 'cmd',
|
|
||||||
# Web technologies (excluding html/htm which have dedicated renderer)
|
|
||||||
'css', 'scss', 'sass', 'less', 'xml', 'yaml', 'yml', 'toml', 'ini', 'cfg',
|
|
||||||
# Data formats (excluding csv, md/markdown which have dedicated renderers)
|
|
||||||
'tsv', 'log', 'rst', 'sql', 'dockerfile', 'dockerignore', 'gitignore',
|
|
||||||
# Configuration files
|
|
||||||
'env', 'properties', 'conf', 'config', 'rc',
|
|
||||||
'gitattributes', 'editorconfig', 'eslintrc',
|
|
||||||
# Documentation
|
|
||||||
'readme', 'changelog', 'license', 'authors',
|
|
||||||
'contributing', 'todo', 'notes', 'docs'
|
|
||||||
]
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_format_aliases(cls) -> List[str]:
|
|
||||||
"""Return format aliases."""
|
|
||||||
return [
|
|
||||||
'ascii', 'utf8', 'utf-8', 'code', 'source',
|
|
||||||
'script', 'program', 'file', 'document',
|
|
||||||
'raw', 'unformatted', 'plaintext'
|
|
||||||
]
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_priority(cls) -> int:
|
|
||||||
"""Return priority for text renderer."""
|
|
||||||
return 90
|
|
||||||
|
|
||||||
def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
|
|
||||||
"""Return only plain-text guidelines; global prompt is built centrally."""
|
|
||||||
return (
|
|
||||||
"TEXT FORMAT GUIDELINES:\n"
|
|
||||||
"- Output ONLY plain text (no markdown or HTML).\n"
|
|
||||||
"- Use clear headings (you may underline with === or --- when helpful).\n"
|
|
||||||
"- Use simple bullet lists with '-' and tables with '|' when needed.\n"
|
|
||||||
"- Preserve indentation for code-like content if present.\n"
|
|
||||||
"OUTPUT: Return ONLY the raw text content."
|
|
||||||
)
|
|
||||||
|
|
||||||
async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
|
|
||||||
"""Render extracted content to plain text format."""
|
|
||||||
try:
|
|
||||||
# The extracted content should already be formatted text from the AI
|
|
||||||
# Just clean it up
|
|
||||||
text_content = self._clean_text_content(extracted_content, title)
|
|
||||||
|
|
||||||
return text_content, "text/plain"
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error rendering text: {str(e)}")
|
|
||||||
# Return minimal text fallback
|
|
||||||
return f"{title}\n\nError rendering report: {str(e)}", "text/plain"
|
|
||||||
|
|
||||||
def _clean_text_content(self, content: str, title: str) -> str:
|
|
||||||
"""Clean and validate text content from AI."""
|
|
||||||
content = content.strip()
|
|
||||||
|
|
||||||
# Remove markdown code blocks if present
|
|
||||||
if content.startswith("```") and content.endswith("```"):
|
|
||||||
lines = content.split('\n')
|
|
||||||
if len(lines) > 2:
|
|
||||||
content = '\n'.join(lines[1:-1]).strip()
|
|
||||||
|
|
||||||
# Remove any remaining markdown formatting
|
|
||||||
content = content.replace('**', '').replace('*', '')
|
|
||||||
content = content.replace('__', '').replace('_', '')
|
|
||||||
|
|
||||||
# Clean up any HTML-like tags that might have slipped through
|
|
||||||
import re
|
|
||||||
content = re.sub(r'<[^>]+>', '', content)
|
|
||||||
|
|
||||||
# Ensure proper line endings
|
|
||||||
content = content.replace('\r\n', '\n').replace('\r', '\n')
|
|
||||||
|
|
||||||
return content
|
|
||||||
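
    # Sketch of the cleanup performed above (hypothetical input): fenced code
    # blocks are unwrapped, basic markdown emphasis and HTML-like tags are
    # stripped, and line endings are normalized to "\n".
    #
    #     raw = "```\n**Title**\n<b>Body</b>\r\nDone\n```"
    #     cleaned = self._clean_text_content(raw, "Report")
    #     # cleaned == "Title\nBody\nDone"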
517 modules/services/serviceGeneration/subJsonSchema.py Normal file
@@ -0,0 +1,517 @@
"""
|
||||||
|
JSON Schema definitions for AI-generated document structures.
|
||||||
|
This module provides schemas that guide AI to generate structured JSON output.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Dict, Any
|
||||||
|
|
||||||
|
|
||||||
|
def get_multi_document_subJsonSchema() -> Dict[str, Any]:
|
||||||
|
"""Get the JSON schema for multi-document generation."""
|
||||||
|
return {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["metadata", "documents"],
|
||||||
|
"properties": {
|
||||||
|
"metadata": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["title", "splitStrategy"],
|
||||||
|
"properties": {
|
||||||
|
"title": {"type": "string", "description": "Document title"},
|
||||||
|
"splitStrategy": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["per_entity", "by_section", "by_criteria", "by_data_type", "custom"],
|
||||||
|
"description": "Strategy for splitting content into multiple files"
|
||||||
|
},
|
||||||
|
"splitCriteria": {
|
||||||
|
"type": "object",
|
||||||
|
"description": "Custom criteria for splitting (e.g., entity_id, category, etc.)"
|
||||||
|
},
|
||||||
|
"fileNamingPattern": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Pattern for generating filenames (e.g., '{entity_name}_data.docx')"
|
||||||
|
},
|
||||||
|
"author": {"type": "string", "description": "Document author (optional)"},
|
||||||
|
"source_documents": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {"type": "string"},
|
||||||
|
"description": "List of source document IDs"
|
||||||
|
},
|
||||||
|
"extraction_method": {
|
||||||
|
"type": "string",
|
||||||
|
"default": "ai_extraction",
|
||||||
|
"description": "Method used for extraction"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"documents": {
|
||||||
|
"type": "array",
|
||||||
|
"description": "Array of individual documents to generate",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["id", "title", "sections", "filename"],
|
||||||
|
"properties": {
|
||||||
|
"id": {"type": "string", "description": "Unique document identifier"},
|
||||||
|
"title": {"type": "string", "description": "Document title"},
|
||||||
|
"filename": {"type": "string", "description": "Generated filename"},
|
||||||
|
"sections": {
|
||||||
|
"type": "array",
|
||||||
|
"description": "Document sections containing structured content",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["id", "content_type", "elements", "order"],
|
||||||
|
"properties": {
|
||||||
|
"id": {"type": "string", "description": "Unique section identifier"},
|
||||||
|
"title": {"type": "string", "description": "Section title (optional)"},
|
||||||
|
"content_type": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"],
|
||||||
|
"description": "Primary content type of this section"
|
||||||
|
},
|
||||||
|
"elements": {
|
||||||
|
"type": "array",
|
||||||
|
"description": "Content elements in this section",
|
||||||
|
"items": {
|
||||||
|
"oneOf": [
|
||||||
|
{"$ref": "#/definitions/table"},
|
||||||
|
{"$ref": "#/definitions/bullet_list"},
|
||||||
|
{"$ref": "#/definitions/paragraph"},
|
||||||
|
{"$ref": "#/definitions/heading"},
|
||||||
|
{"$ref": "#/definitions/code_block"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"order": {"type": "integer", "description": "Section order in document"},
|
||||||
|
"metadata": {
|
||||||
|
"type": "object",
|
||||||
|
"description": "Additional section metadata"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"metadata": {
|
||||||
|
"type": "object",
|
||||||
|
"description": "Document-specific metadata"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"definitions": {
|
||||||
|
"table": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["headers", "rows"],
|
||||||
|
"properties": {
|
||||||
|
"headers": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {"type": "string"},
|
||||||
|
"description": "Table column headers"
|
||||||
|
},
|
||||||
|
"rows": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {"type": "string"}
|
||||||
|
},
|
||||||
|
"description": "Table data rows"
|
||||||
|
},
|
||||||
|
"caption": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Table caption (optional)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"bullet_list": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["items"],
|
||||||
|
"properties": {
|
||||||
|
"items": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["text"],
|
||||||
|
"properties": {
|
||||||
|
"text": {"type": "string", "description": "List item text"},
|
||||||
|
"subitems": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {"$ref": "#/definitions/list_item"},
|
||||||
|
"description": "Nested sub-items (optional)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"description": "List items"
|
||||||
|
},
|
||||||
|
"list_type": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["bullet", "numbered", "checklist"],
|
||||||
|
"default": "bullet",
|
||||||
|
"description": "Type of list"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"list_item": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["text"],
|
||||||
|
"properties": {
|
||||||
|
"text": {"type": "string", "description": "List item text"},
|
||||||
|
"subitems": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {"$ref": "#/definitions/list_item"},
|
||||||
|
"description": "Nested sub-items (optional)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"paragraph": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["text"],
|
||||||
|
"properties": {
|
||||||
|
"text": {"type": "string", "description": "Paragraph text"},
|
||||||
|
"formatting": {
|
||||||
|
"type": "object",
|
||||||
|
"description": "Text formatting (bold, italic, etc.)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"heading": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["text", "level"],
|
||||||
|
"properties": {
|
||||||
|
"text": {"type": "string", "description": "Heading text"},
|
||||||
|
"level": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 1,
|
||||||
|
"maximum": 6,
|
||||||
|
"description": "Heading level (1-6)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"code_block": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["code"],
|
||||||
|
"properties": {
|
||||||
|
"code": {"type": "string", "description": "Code content"},
|
||||||
|
"language": {"type": "string", "description": "Programming language (optional)"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
def get_document_subJsonSchema() -> Dict[str, Any]:
|
||||||
|
"""Get the JSON schema for structured document generation (single document)."""
|
||||||
|
return {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["metadata", "sections"],
|
||||||
|
"properties": {
|
||||||
|
"metadata": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["title"],
|
||||||
|
"properties": {
|
||||||
|
"title": {"type": "string", "description": "Document title"},
|
||||||
|
"author": {"type": "string", "description": "Document author (optional)"},
|
||||||
|
"source_documents": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {"type": "string"},
|
||||||
|
"description": "List of source document IDs"
|
||||||
|
},
|
||||||
|
"extraction_method": {
|
||||||
|
"type": "string",
|
||||||
|
"default": "ai_extraction",
|
||||||
|
"description": "Method used for extraction"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"sections": {
|
||||||
|
"type": "array",
|
||||||
|
"description": "Document sections containing structured content",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["id", "content_type", "elements", "order"],
|
||||||
|
"properties": {
|
||||||
|
"id": {"type": "string", "description": "Unique section identifier"},
|
||||||
|
"title": {"type": "string", "description": "Section title (optional)"},
|
||||||
|
"content_type": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"],
|
||||||
|
"description": "Primary content type of this section"
|
||||||
|
},
|
||||||
|
"elements": {
|
||||||
|
"type": "array",
|
||||||
|
"description": "Content elements in this section",
|
||||||
|
"items": {
|
||||||
|
"oneOf": [
|
||||||
|
{"$ref": "#/definitions/table"},
|
||||||
|
{"$ref": "#/definitions/bullet_list"},
|
||||||
|
{"$ref": "#/definitions/paragraph"},
|
||||||
|
{"$ref": "#/definitions/heading"},
|
||||||
|
{"$ref": "#/definitions/code_block"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"order": {"type": "integer", "description": "Section order in document"},
|
||||||
|
"metadata": {
|
||||||
|
"type": "object",
|
||||||
|
"description": "Additional section metadata"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"summary": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Document summary (optional)"
|
||||||
|
},
|
||||||
|
"tags": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {"type": "string"},
|
||||||
|
"description": "Document tags for categorization"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"definitions": {
|
||||||
|
"table": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["headers", "rows"],
|
||||||
|
"properties": {
|
||||||
|
"headers": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {"type": "string"},
|
||||||
|
"description": "Table column headers"
|
||||||
|
},
|
||||||
|
"rows": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {"type": "string"}
|
||||||
|
},
|
||||||
|
"description": "Table data rows"
|
||||||
|
},
|
||||||
|
"caption": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Table caption (optional)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"bullet_list": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["items"],
|
||||||
|
"properties": {
|
||||||
|
"items": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["text"],
|
||||||
|
"properties": {
|
||||||
|
"text": {"type": "string", "description": "List item text"},
|
||||||
|
"subitems": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {"$ref": "#/definitions/list_item"},
|
||||||
|
"description": "Nested sub-items (optional)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"description": "List items"
|
||||||
|
},
|
||||||
|
"list_type": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["bullet", "numbered", "checklist"],
|
||||||
|
"default": "bullet",
|
||||||
|
"description": "Type of list"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"list_item": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["text"],
|
||||||
|
"properties": {
|
||||||
|
"text": {"type": "string", "description": "List item text"},
|
||||||
|
"subitems": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {"$ref": "#/definitions/list_item"},
|
||||||
|
"description": "Nested sub-items (optional)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"paragraph": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["text"],
|
||||||
|
"properties": {
|
||||||
|
"text": {"type": "string", "description": "Paragraph text"},
|
||||||
|
"formatting": {
|
||||||
|
"type": "object",
|
||||||
|
"description": "Text formatting (bold, italic, etc.)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"heading": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["text", "level"],
|
||||||
|
"properties": {
|
||||||
|
"text": {"type": "string", "description": "Heading text"},
|
||||||
|
"level": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 1,
|
||||||
|
"maximum": 6,
|
||||||
|
"description": "Heading level (1-6)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"code_block": {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["code"],
|
||||||
|
"properties": {
|
||||||
|
"code": {"type": "string", "description": "Code content"},
|
||||||
|
"language": {"type": "string", "description": "Programming language (optional)"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_extraction_prompt_template() -> str:
    """Get the template for AI extraction prompts that request JSON output."""
    return """
You are extracting structured content from documents. Your task is to analyze the provided content and generate a structured JSON document.

IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.

JSON Schema Requirements:
- Extract the actual data from the source documents
- If content is a table, extract it as a table with headers and rows
- If content is a list, extract it as a structured list with items
- If content is text, extract it as paragraphs or headings
- Preserve the original structure and data - do not summarize or interpret
- Use the exact JSON schema provided

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification

Return only the JSON structure following the schema. Do not include any text before or after the JSON.
"""

def get_generation_prompt_template() -> str:
    """Get the template for AI generation prompts that work with JSON input."""
    return """
You are generating a document from structured JSON data. Your task is to create a well-formatted document based on the provided structured content.

IMPORTANT: You must respond with valid JSON only, following the document schema.

Generation Guidelines:
- Use the provided JSON structure as the foundation
- Enhance the content with proper formatting and organization
- Ensure logical flow and readability
- Maintain the original data integrity
- Add appropriate headings and sections
- Organize content in a logical sequence

Content Enhancement:
- Tables: Ensure proper headers and data alignment
- Lists: Use appropriate list types (bullet, numbered, checklist)
- Headings: Use appropriate heading levels for hierarchy
- Paragraphs: Ensure proper text flow and formatting
- Code: Preserve code blocks with proper language identification

Return only the enhanced JSON structure following the schema. Do not include any text before or after the JSON.
"""

def get_adaptive_json_schema(prompt_analysis: Dict[str, Any] = None) -> Dict[str, Any]:
    """Automatically select appropriate schema based on prompt analysis."""
    if prompt_analysis and prompt_analysis.get("is_multi_file", False):
        return get_multi_document_subJsonSchema()
    else:
        return get_document_subJsonSchema()

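# Usage sketch (hypothetical analysis dict): the multi-document schema is only
# selected when the prompt analysis explicitly flags a multi-file request.
#
#     get_adaptive_json_schema({"is_multi_file": True})   # multi-document schema
#     get_adaptive_json_schema({"is_multi_file": False})  # single-document schema
#     get_adaptive_json_schema(None)                      # single-document schema
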
def validate_json_document(json_data: Dict[str, Any]) -> bool:
    """Validate that the JSON data follows the document schema."""
    try:
        # Basic validation - check required fields
        if not isinstance(json_data, dict):
            return False

        # Check if it's multi-document or single-document structure
        if "documents" in json_data:
            # Multi-document structure
            if "metadata" not in json_data:
                return False

            metadata = json_data["metadata"]
            if not isinstance(metadata, dict) or "title" not in metadata or "splitStrategy" not in metadata:
                return False

            documents = json_data["documents"]
            if not isinstance(documents, list):
                return False

            # Validate each document
            for doc in documents:
                if not isinstance(doc, dict):
                    return False

                required_fields = ["id", "title", "sections", "filename"]
                for field in required_fields:
                    if field not in doc:
                        return False

                # Validate sections in each document
                sections = doc.get("sections", [])
                if not isinstance(sections, list):
                    return False

                for section in sections:
                    if not isinstance(section, dict):
                        return False

                    section_required = ["id", "content_type", "elements", "order"]
                    for field in section_required:
                        if field not in section:
                            return False

                    # Validate content_type
                    valid_types = ["table", "list", "paragraph", "heading", "code", "image", "mixed"]
                    if section["content_type"] not in valid_types:
                        return False

                    # Validate elements
                    if not isinstance(section["elements"], list):
                        return False

        elif "sections" in json_data:
            # Single-document structure (existing validation)
            if "metadata" not in json_data:
                return False

            metadata = json_data["metadata"]
            if not isinstance(metadata, dict) or "title" not in metadata:
                return False

            sections = json_data["sections"]
            if not isinstance(sections, list):
                return False

            # Validate each section
            for i, section in enumerate(sections):
                if not isinstance(section, dict):
                    return False

                required_fields = ["id", "content_type", "elements", "order"]
                for field in required_fields:
                    if field not in section:
                        return False

                # Validate content_type
                valid_types = ["table", "list", "paragraph", "heading", "code", "image", "mixed"]
                if section["content_type"] not in valid_types:
                    return False

                # Validate elements
                if not isinstance(section["elements"], list):
                    return False
        else:
            return False

        return True

    except Exception:
        return False
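

# Illustrative self-check (not part of the original module): builds a minimal
# single-document payload and runs it through validate_json_document. The
# values are hypothetical; only the required fields from the schema are used.
if __name__ == "__main__":
    _sample = {
        "metadata": {"title": "Example Document"},
        "sections": [
            {
                "id": "section_1",
                "content_type": "paragraph",
                "elements": [{"text": "Hello world"}],
                "order": 1,
            }
        ],
    }
    assert validate_json_document(_sample) is True
    assert validate_json_document({"metadata": {"title": "x"}}) is False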
738 modules/services/serviceGeneration/subPromptBuilder.py Normal file
@@ -0,0 +1,738 @@
"""
|
||||||
|
Prompt builder for AI document generation and extraction.
|
||||||
|
This module builds prompts for AI services to extract and generate documents.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, Optional, List, TYPE_CHECKING
|
||||||
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||||
|
|
||||||
|
# Type hint for renderer parameter
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .renderers.rendererBaseTemplate import BaseRenderer
|
||||||
|
_RendererLike = BaseRenderer
|
||||||
|
else:
|
||||||
|
_RendererLike = Any
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
async def buildAdaptiveExtractionPrompt(
|
||||||
|
outputFormat: str,
|
||||||
|
userPrompt: str,
|
||||||
|
title: str,
|
||||||
|
promptAnalysis: Dict[str, Any],
|
||||||
|
aiService=None,
|
||||||
|
services=None
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
Build adaptive extraction prompt based on AI analysis.
|
||||||
|
Uses multi-file or single-file approach based on analysis.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Multi-file example data instead of schema
|
||||||
|
multi_file_example = {
|
||||||
|
"metadata": {
|
||||||
|
"title": "Multi-Document Example",
|
||||||
|
"splitStrategy": "by_section",
|
||||||
|
"source_documents": ["doc_001"],
|
||||||
|
"extraction_method": "ai_extraction"
|
||||||
|
},
|
||||||
|
"documents": [
|
||||||
|
{
|
||||||
|
"id": "doc_section_1",
|
||||||
|
"title": "Section 1 Title",
|
||||||
|
"filename": "section_1.xlsx",
|
||||||
|
"sections": [
|
||||||
|
{
|
||||||
|
"id": "section_1",
|
||||||
|
"content_type": "heading",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"level": 1,
|
||||||
|
"text": "1. SECTION TITLE"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"order": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "section_2",
|
||||||
|
"content_type": "paragraph",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"text": "This is the actual content that should be extracted from the document."
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"order": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "section_3",
|
||||||
|
"content_type": "table",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"headers": ["Column 1", "Column 2"],
|
||||||
|
"rows": [["Value 1", "Value 2"]]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"order": 3
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
# Single-file example data instead of schema
|
||||||
|
single_file_example = {
|
||||||
|
"metadata": {
|
||||||
|
"title": "Single Document Example",
|
||||||
|
"source_documents": ["doc_001"],
|
||||||
|
"extraction_method": "ai_extraction"
|
||||||
|
},
|
||||||
|
"sections": [
|
||||||
|
{
|
||||||
|
"id": "section_1",
|
||||||
|
"content_type": "heading",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"level": 1,
|
||||||
|
"text": "1. SECTION TITLE"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"order": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "section_2",
|
||||||
|
"content_type": "paragraph",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"text": "This is the actual content that should be extracted from the document."
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"order": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "section_3",
|
||||||
|
"content_type": "table",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"headers": ["Column 1", "Column 2"],
|
||||||
|
"rows": [["Value 1", "Value 2"]]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"order": 3
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
if promptAnalysis.get("is_multi_file", False):
|
||||||
|
# Multi-file prompt
|
||||||
|
adaptive_prompt = f"""
|
||||||
|
{userPrompt}
|
||||||
|
|
||||||
|
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
|
||||||
|
|
||||||
|
TASK: Extract the actual content from the document and organize it into separate sections, where each section will become a separate file.
|
||||||
|
|
||||||
|
REQUIREMENTS:
|
||||||
|
1. Analyze the document content provided in the context below
|
||||||
|
2. Identify distinct sections in the document (by headings, topics, or logical breaks)
|
||||||
|
3. Create one JSON document entry for each section found
|
||||||
|
4. Extract the real content from each section (headings, paragraphs, lists, etc.)
|
||||||
|
5. Generate appropriate filenames for each section
|
||||||
|
|
||||||
|
CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array.
|
||||||
|
|
||||||
|
OUTPUT FORMAT: Return only valid JSON in this exact structure:
|
||||||
|
{json.dumps(multi_file_example, indent=2)}
|
||||||
|
|
||||||
|
IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have:
|
||||||
|
- "id": unique identifier
|
||||||
|
- "title": section title from the document
|
||||||
|
- "filename": appropriate filename for the section
|
||||||
|
- "sections": array of content sections
|
||||||
|
|
||||||
|
DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level.
|
||||||
|
|
||||||
|
INSTRUCTIONS:
|
||||||
|
- Replace "REPLACE_WITH_ACTUAL_*" placeholders with real content from the document
|
||||||
|
- Use actual section titles, headings, and text from the document
|
||||||
|
- Create meaningful filenames based on section content
|
||||||
|
- Ensure each section contains the complete content for that part of the document
|
||||||
|
- Do not use generic placeholder text like "Section 1", "Section 2"
|
||||||
|
- Extract real headings, paragraphs, lists, and other content elements
|
||||||
|
- CRITICAL: Return JSON with "documents" array, not "sections" array
|
||||||
|
|
||||||
|
CONTEXT (Document Content):
|
||||||
|
|
||||||
|
Content Types to Extract:
|
||||||
|
1. Tables: Extract all rows and columns with proper headers
|
||||||
|
2. Lists: Extract all items with proper nesting
|
||||||
|
3. Headings: Extract with appropriate levels
|
||||||
|
4. Paragraphs: Extract as structured text
|
||||||
|
5. Code: Extract code blocks with language identification
|
||||||
|
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
|
||||||
|
|
||||||
|
Image Analysis Requirements:
|
||||||
|
- If you cannot analyze an image for any reason, explain why in the JSON response
|
||||||
|
- Describe everything you see in the image
|
||||||
|
- Include all text content, tables, logos, graphics, layout, and visual elements
|
||||||
|
- If the image is too small, corrupted, or unclear, explain this
|
||||||
|
- Always provide feedback - never return empty responses
|
||||||
|
|
||||||
|
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
|
||||||
|
|
||||||
|
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
|
||||||
|
""".strip()
|
||||||
|
else:
|
||||||
|
# Single-file prompt - use example data instead of schema
|
||||||
|
adaptive_prompt = f"""
|
||||||
|
{userPrompt}
|
||||||
|
|
||||||
|
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
|
||||||
|
|
||||||
|
TASK: Extract the actual content from the document and organize it into structured sections.
|
||||||
|
|
||||||
|
REQUIREMENTS:
|
||||||
|
1. Analyze the document content provided in the context below
|
||||||
|
2. Extract all content and organize it into logical sections
|
||||||
|
3. Create structured JSON with sections containing the extracted content
|
||||||
|
4. Preserve the original structure and data
|
||||||
|
|
||||||
|
OUTPUT FORMAT: Return only valid JSON in this exact structure:
|
||||||
|
{json.dumps(single_file_example, indent=2)}
|
||||||
|
|
||||||
|
INSTRUCTIONS:
|
||||||
|
- Replace example data with actual content from the document
|
||||||
|
- Use actual headings, paragraphs, and text from the document
|
||||||
|
- Ensure all content is properly structured
|
||||||
|
- Do not use generic placeholder text
|
||||||
|
- Extract real content from the documents
|
||||||
|
|
||||||
|
CONTEXT (Document Content):
|
||||||
|
|
||||||
|
Content Types to Extract:
|
||||||
|
1. Tables: Extract all rows and columns with proper headers
|
||||||
|
2. Lists: Extract all items with proper nesting
|
||||||
|
3. Headings: Extract with appropriate levels
|
||||||
|
4. Paragraphs: Extract as structured text
|
||||||
|
5. Code: Extract code blocks with language identification
|
||||||
|
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
|
||||||
|
|
||||||
|
Image Analysis Requirements:
|
||||||
|
- If you cannot analyze an image for any reason, explain why in the JSON response
|
||||||
|
- Describe everything you see in the image
|
||||||
|
- Include all text content, tables, logos, graphics, layout, and visual elements
|
||||||
|
- If the image is too small, corrupted, or unclear, explain this
|
||||||
|
- Always provide feedback - never return empty responses
|
||||||
|
|
||||||
|
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
|
||||||
|
|
||||||
|
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
return adaptive_prompt
|
||||||
|
|
||||||
|
async def buildGenericExtractionPrompt(
|
||||||
|
outputFormat: str,
|
||||||
|
userPrompt: str,
|
||||||
|
title: str,
|
||||||
|
aiService=None,
|
||||||
|
services=None
|
||||||
|
) -> str:
|
||||||
|
"""Build generic extraction prompt that works for both single and multi-file."""
|
||||||
|
|
||||||
|
# Use AI to determine the best approach
|
||||||
|
if aiService:
|
||||||
|
try:
|
||||||
|
analysis_prompt = f"""
|
||||||
|
Analyze this user request and determine the best JSON structure for document extraction.
|
||||||
|
|
||||||
|
User request: "{userPrompt}"
|
||||||
|
|
||||||
|
Respond with JSON only:
|
||||||
|
{{
|
||||||
|
"requires_multi_file": true/false,
|
||||||
|
"recommended_schema": "single_document|multi_document",
|
||||||
|
"split_approach": "description of how to organize content",
|
||||||
|
"file_naming": "suggested naming pattern"
|
||||||
|
}}
|
||||||
|
|
||||||
|
Consider the user's intent and the most logical way to organize the extracted content.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||||
|
request_options = AiCallOptions()
|
||||||
|
request_options.operationType = OperationType.GENERAL
|
||||||
|
|
||||||
|
request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
|
||||||
|
response = await aiService.aiObjects.call(request)
|
||||||
|
|
||||||
|
if response and response.content:
|
||||||
|
import re
|
||||||
|
|
||||||
|
result = response.content.strip()
|
||||||
|
json_match = re.search(r'\{.*\}', result, re.DOTALL)
|
||||||
|
if json_match:
|
||||||
|
result = json_match.group(0)
|
||||||
|
|
||||||
|
analysis = json.loads(result)
|
||||||
|
|
||||||
|
# Use analysis to build appropriate prompt
|
||||||
|
return await buildAdaptiveExtractionPrompt(
|
||||||
|
outputFormat, userPrompt, title, analysis, aiService, services
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER")
|
||||||
|
|
||||||
|
# Fallback to single-file prompt
|
||||||
|
example_data = {
|
||||||
|
"metadata": {
|
||||||
|
"title": "Example Document",
|
||||||
|
"author": "AI Assistant",
|
||||||
|
"source_documents": ["document_001"],
|
||||||
|
"extraction_method": "ai_extraction"
|
||||||
|
},
|
||||||
|
"sections": [
|
||||||
|
{
|
||||||
|
"id": "section_001",
|
||||||
|
"content_type": "heading",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"level": 1,
|
||||||
|
"text": "1. SECTION TITLE"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"order": 1,
|
||||||
|
"metadata": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"summary": "",
|
||||||
|
"tags": []
|
||||||
|
}
|
||||||
|
|
||||||
|
return f"""
|
||||||
|
{userPrompt}
|
||||||
|
|
||||||
|
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
|
||||||
|
|
||||||
|
TASK: Extract the actual content from the document and organize it into structured sections.
|
||||||
|
|
||||||
|
REQUIREMENTS:
|
||||||
|
1. Analyze the document content provided in the context below
|
||||||
|
2. Extract all content and organize it into logical sections
|
||||||
|
3. Create structured JSON with sections containing the extracted content
|
||||||
|
4. Preserve the original structure and data
|
||||||
|
|
||||||
|
OUTPUT FORMAT: Return only valid JSON in this exact structure:
|
||||||
|
{json.dumps(example_data, indent=2)}
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
- Preserve all original data - do not summarize or interpret
|
||||||
|
- Use the exact JSON format shown above
|
||||||
|
- Maintain data integrity and structure
|
||||||
|
|
||||||
|
Content Types to Extract:
|
||||||
|
1. Tables: Extract all rows and columns with proper headers
|
||||||
|
2. Lists: Extract all items with proper nesting
|
||||||
|
3. Headings: Extract with appropriate levels
|
||||||
|
4. Paragraphs: Extract as structured text
|
||||||
|
5. Code: Extract code blocks with language identification
|
||||||
|
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
|
||||||
|
|
||||||
|
Image Analysis Requirements:
|
||||||
|
- If you cannot analyze an image for any reason, explain why in the JSON response
|
||||||
|
- Describe everything you see in the image
|
||||||
|
- Include all text content, tables, logos, graphics, layout, and visual elements
|
||||||
|
- If the image is too small, corrupted, or unclear, explain this
|
||||||
|
- Always provide feedback - never return empty responses
|
||||||
|
|
||||||
|
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
|
||||||
|
|
||||||
|
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
|
||||||
|
|
||||||
|
DO NOT return a schema description - return actual extracted content in the JSON format shown above.
|
||||||
|
"""
|
||||||
|
|
||||||
|
async def buildExtractionPrompt(
|
||||||
|
outputFormat: str,
|
||||||
|
renderer: _RendererLike,
|
||||||
|
userPrompt: str,
|
||||||
|
title: str,
|
||||||
|
aiService=None,
|
||||||
|
services=None
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
Build the final extraction prompt by combining:
|
||||||
|
- Parsed extraction intent from user prompt (using AI)
|
||||||
|
- Generic cross-format instructions (filename header + real-data policy)
|
||||||
|
- Format-specific guidelines snippet provided by the renderer
|
||||||
|
|
||||||
|
The AI must place a single filename header at the very top:
|
||||||
|
FILENAME: <safe-file-name-with-extension>
|
||||||
|
followed by a blank line and then ONLY the document content according to the target format.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Parse user prompt to separate extraction intent from generation format using AI
|
||||||
|
extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services)
|
||||||
|
|
||||||
|
# Import JSON schema for structured output
|
||||||
|
from .subJsonSchema import get_document_subJsonSchema
|
||||||
|
jsonSchema = get_document_subJsonSchema()
|
||||||
|
|
||||||
|
# Generic block for JSON extraction - use mixed example data showing different content types
|
||||||
|
example_data = {
|
||||||
|
"metadata": {
|
||||||
|
"title": "Example Document",
|
||||||
|
"author": "AI Assistant",
|
||||||
|
"source_documents": ["document_001"],
|
||||||
|
"extraction_method": "ai_extraction"
|
||||||
|
},
|
||||||
|
"sections": [
|
||||||
|
{
|
||||||
|
"id": "section_001",
|
||||||
|
"content_type": "heading",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"level": 1,
|
||||||
|
"text": "1. INTRODUCTION"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"order": 1,
|
||||||
|
"metadata": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "section_002",
|
||||||
|
"content_type": "paragraph",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"text": "This is a sample paragraph with actual content that should be extracted from the document."
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"order": 2,
|
||||||
|
"metadata": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "section_003",
|
||||||
|
"content_type": "table",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"headers": ["Column 1", "Column 2", "Column 3"],
|
||||||
|
"rows": [
|
||||||
|
["Value 1", "Value 2", "Value 3"],
|
||||||
|
["Value 4", "Value 5", "Value 6"]
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"order": 3,
|
||||||
|
"metadata": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"summary": "",
|
||||||
|
"tags": []
|
||||||
|
}
|
||||||
|
|
||||||
|
genericIntro = f"""
|
||||||
|
{extractionIntent}
|
||||||
|
|
||||||
|
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
|
||||||
|
|
||||||
|
TASK: Extract the actual content from the document and organize it into structured sections.
|
||||||
|
|
||||||
|
REQUIREMENTS:
|
||||||
|
1. Analyze the document content provided in the context below
|
||||||
|
2. Extract all content and organize it into logical sections
|
||||||
|
3. Create structured JSON with sections containing the extracted content
|
||||||
|
4. Preserve the original structure and data
|
||||||
|
|
||||||
|
OUTPUT FORMAT: Return only valid JSON in this exact structure:
|
||||||
|
{json.dumps(example_data, indent=2)}
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
- Preserve all original data - do not summarize or interpret
|
||||||
|
- Use the exact JSON format shown above
|
||||||
|
- Maintain data integrity and structure
|
||||||
|
|
||||||
|
Content Types to Extract:
|
||||||
|
1. Tables: Extract all rows and columns with proper headers
|
||||||
|
2. Lists: Extract all items with proper nesting
|
||||||
|
3. Headings: Extract with appropriate levels
|
||||||
|
4. Paragraphs: Extract as structured text
|
||||||
|
5. Code: Extract code blocks with language identification
|
||||||
|
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
|
||||||
|
|
||||||
|
Image Analysis Requirements:
|
||||||
|
- If you cannot analyze an image for any reason, explain why in the JSON response
|
||||||
|
- Describe everything you see in the image
|
||||||
|
- Include all text content, tables, logos, graphics, layout, and visual elements
|
||||||
|
- If the image is too small, corrupted, or unclear, explain this
|
||||||
|
- Always provide feedback - never return empty responses
|
||||||
|
|
||||||
|
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
|
||||||
|
|
||||||
|
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
|
||||||
|
|
||||||
|
DO NOT return a schema description - return actual extracted content in the JSON format shown above.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Get format-specific guidelines from renderer
|
||||||
|
formatGuidelines = ""
|
||||||
|
try:
|
||||||
|
if hasattr(renderer, 'getExtractionGuidelines'):
|
||||||
|
formatGuidelines = renderer.getExtractionGuidelines()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Combine all parts
|
||||||
|
finalPrompt = f"{genericIntro}\n\n{formatGuidelines}".strip()
|
||||||
|
|
||||||
|
# Save extraction prompt to debug file - only if debug enabled
|
||||||
|
try:
|
||||||
|
debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
||||||
|
if debug_enabled:
|
||||||
|
import os
|
||||||
|
from datetime import datetime, UTC
|
||||||
|
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
||||||
|
debug_root = "./test-chat/ai"
|
||||||
|
os.makedirs(debug_root, exist_ok=True)
|
||||||
|
with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f:
|
||||||
|
f.write(finalPrompt)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return finalPrompt
|
||||||
|
|
||||||
|
|
||||||
|
async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """
    Use AI to build the generation prompt based on user intent and format requirements.
    Focus on what's important for the user and how to structure the content.
    """
    if not aiService:
        # Fallback if no AI service available
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."

    try:
        # Protect userPrompt from injection
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')

        # Debug output
        services.utils.debugLogToFile(f"GENERATION PROMPT REQUEST: buildGenerationPrompt called with outputFormat='{outputFormat}', title='{title}'", "PROMPT_BUILDER")

        # AI call to generate the appropriate generation prompt
        generationPromptRequest = f"""
You are creating instructions for an AI to generate JSON content in the CANONICAL FORMAT that will be converted to a {outputFormat} document.

User request: "{safeUserPrompt}"
Document title: "{title}"
Target format: {outputFormat}

Write clear, detailed instructions that tell the AI how to generate JSON content using the CANONICAL JSON FORMAT. Focus on:

1. What content is most important for the user
2. How to structure and organize the content using the canonical JSON format with 'sections'
3. Specific formatting requirements for the target format
4. Language requirements to preserve
5. How to ensure the JSON content meets the user's needs

CRITICAL: The AI MUST generate content using the CANONICAL JSON FORMAT with this exact structure:
{{
  "metadata": {{
    "title": "Document Title"
  }},
  "sections": [
    {{
      "id": "section_1",
      "content_type": "heading",
      "elements": [
        {{
          "level": 1,
          "text": "1. SECTION TITLE"
        }}
      ],
      "order": 1
    }},
    {{
      "id": "section_2",
      "content_type": "paragraph",
      "elements": [
        {{
          "text": "This is the actual content that should be extracted from the document."
        }}
      ],
      "order": 2
    }},
    {{
      "id": "section_3",
      "content_type": "table",
      "elements": [
        {{
          "headers": ["Column 1", "Column 2", "Column 3"],
          "rows": [
            ["Value 1", "Value 2", "Value 3"],
            ["Value 4", "Value 5", "Value 6"]
          ]
        }}
      ],
      "order": 3
    }}
  ]
}}

The AI should NOT create format-specific structures like "sheets" or "columns" - only use the canonical format with "sections" and "elements".

Write the instructions as plain text, not JSON. Start with "Generate JSON content that..." and provide clear, actionable instructions for creating structured JSON data in the canonical format.
"""

        # Call AI service to generate the prompt
        services.utils.debugLogToFile("GENERATION PROMPT REQUEST: Calling AI for generation prompt...", "PROMPT_BUILDER")

        # Import and set proper options for AI call
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL

        request = AiCallRequest(prompt=generationPromptRequest, context="", options=request_options)
        response = await aiService.aiObjects.call(request)
        result = response.content if response else ""

        # Replace the placeholder that the AI created with actual format rules
        if result:
            formatRules = _getFormatRules(outputFormat)
            result = result.replace("PLACEHOLDER_FOR_FORMAT_RULES", formatRules)

        # Debug output
        services.utils.debugLogToFile("GENERATION PROMPT: Generated successfully", "PROMPT_BUILDER")

        # Save full generation prompt and AI response to debug file - only if debug enabled
        try:
            debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
            if debug_enabled:
                import os
                from datetime import datetime, UTC
                ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
                debug_root = "./test-chat/ai"
                os.makedirs(debug_root, exist_ok=True)
                with open(os.path.join(debug_root, f"{ts}_generation_prompt.txt"), "w", encoding="utf-8") as f:
                    f.write(f"GENERATION PROMPT REQUEST:\n{generationPromptRequest}\n\n")
                    f.write(f"GENERATION PROMPT AI RESPONSE:\n{response.content if response else 'No response'}\n\n")
                    f.write(f"GENERATION PROMPT FINAL:\n{result if result else 'None'}\n")
        except Exception:
            pass

        return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."

    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions
        services.utils.debugLogToFile(f"DEBUG: AI generation prompt failed: {str(e)}", "PROMPT_BUILDER")
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}"

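A usage sketch, assuming an aiService/services wiring like the one used elsewhere in this file; without an aiService the static fallback string is returned:

# Sketch only - the surrounding service objects are assumptions, not part of this diff.
prompt = await buildGenerationPrompt(
    outputFormat="xlsx",
    userPrompt="Summarise the invoices by month",
    title="Invoice Summary",
    aiService=aiService,
    services=services,
)
# With aiService=None the call degrades to:
# "Generate a comprehensive xlsx document titled 'Invoice Summary' based on the extracted content."
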
def _getFormatRules(outputFormat: str) -> str:
    """
    Get format-specific rules for the generation prompt.
    """
    format_rules = {
        "xlsx": """
XLSX Format Rules:
- Create tables with clear headers and organized data
- Use appropriate column widths and formatting
- Include summary information if relevant
- Ensure data is properly structured for spreadsheet analysis
""",
        "pdf": """
PDF Format Rules:
- Create professional document layout
- Use appropriate headings and sections
- Include proper spacing and formatting
- Ensure content is well-organized and readable
""",
        "docx": """
DOCX Format Rules:
- Create professional document layout
- Use appropriate headings and sections
- Include proper spacing and formatting
- Ensure content is well-organized and readable
""",
        "html": """
HTML Format Rules:
- Create clean, semantic HTML structure
- Use appropriate tags for content organization
- Include proper styling classes
- Ensure content is accessible and well-formatted
""",
        "json": """
JSON Format Rules:
- Create well-structured JSON data
- Use appropriate nesting and organization
- Include metadata and context information
- Ensure data is properly formatted and valid
""",
        "csv": """
CSV Format Rules:
- Create clear, organized tabular data
- Use appropriate headers and data types
- Ensure proper CSV formatting
- Include all relevant data in structured format
""",
        "txt": """
TXT Format Rules:
- Create clean, readable text format
- Use appropriate spacing and organization
- Include clear headings and sections
- Ensure content is well-structured and easy to read
"""
    }

    return format_rules.get(outputFormat.lower(), f"""
{outputFormat.upper()} Format Rules:
- Create well-structured content appropriate for {outputFormat}
- Use appropriate formatting and organization
- Ensure content is clear and professional
- Include all relevant information in proper format
""")

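A minimal usage sketch; "pptx" is a hypothetical key not present in format_rules, so the generic fallback template is returned:

xlsx_rules = _getFormatRules("XLSX")   # matched via .lower() -> the xlsx rules block
pptx_rules = _getFormatRules("pptx")   # no entry -> generic "PPTX Format Rules: ..." fallback
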
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
    """
    Parse user prompt to extract the core extraction intent.
    """
    if not aiService:
        return f"Extract content from the provided documents and create a {outputFormat} report."

    try:
        analysis_prompt = f"""
Analyze this user request and extract the core extraction intent:

User request: "{userPrompt}"
Target format: {outputFormat}

Extract the main intent and requirements for document processing. Focus on:
1. What content needs to be extracted
2. How it should be organized
3. Any specific requirements or preferences

Respond with a clear, concise statement of the extraction intent.
"""

        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL

        request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
        response = await aiService.aiObjects.call(request)

        if response and response.content:
            return response.content.strip()
        else:
            return f"Extract content from the provided documents and create a {outputFormat} report."

    except Exception as e:
        services.utils.debugLogToFile(f"Extraction intent analysis failed: {str(e)}", "PROMPT_BUILDER")
        return f"Extract content from the provided documents and create a {outputFormat} report."

@@ -32,7 +32,7 @@ class NeutralizationService:
            serviceCenter: Service center instance for accessing other services
            NamesToParse: List of names to parse and replace (case-insensitive)
        """
-        self.serviceCenter = serviceCenter
+        self.services = serviceCenter
        self.interfaceDbApp = serviceCenter.interfaceDbApp

        # Initialize anonymization processors

@@ -0,0 +1,264 @@
import json
import os
from typing import Any, Dict, List, Set
from datetime import datetime, UTC


class NormalizationService:
    """
    Produces a single canonical table in merged JSON using an AI-provided header mapping
    and deterministic, in-code value normalization. No language heuristics in code.
    """

    def __init__(self, services):
        self.services = services

    # Public API
    def discoverStructures(self, mergedJson: Dict[str, Any]) -> Dict[str, Any]:
        headers: Set[str] = set()
        samples: Dict[str, List[str]] = {}

        sections = mergedJson.get("sections", []) if isinstance(mergedJson, dict) else []
        for section in sections:
            if not isinstance(section, dict):
                continue

            # Use only the fundamental agreed JSON structure: content_type/elements
            if section.get("content_type") != "table":
                continue

            # Extract table data from elements array
            hdrs = []
            rows = []
            for element in section.get("elements", []):
                if isinstance(element, dict) and "headers" in element and "rows" in element:
                    hdrs = element.get("headers") or []
                    rows = element.get("rows") or []
                    break

            if not hdrs or not rows:
                continue

            for h in hdrs:
                if not isinstance(h, str):
                    continue
                headers.add(h)

            # collect small value samples by column index
            for row in rows[:5]:
                if not isinstance(row, list):
                    continue
                for i, value in enumerate(row):
                    headerName = hdrs[i] if i < len(hdrs) else f"col_{i}"
                    if headerName not in samples:
                        samples[headerName] = []
                    if len(samples[headerName]) < 5:
                        samples[headerName].append(str(value))

        return {
            "tableHeaders": sorted(list(headers)),
            "headerSamples": samples,
        }

    async def requestHeaderMapping(self, inventory: Dict[str, Any], cacheKey: str, canonicalSpec: Dict[str, Any] | None = None, mergePrompt: str | None = None) -> Dict[str, Any]:

        # Allow caller to specify any canonical schema. If none provided, default to discovered headers.
        if canonicalSpec is None:
            canonicalSpec = {
                "canonicalHeaders": inventory.get("tableHeaders", []),
                "constraints": {}
            }

        # Protect merge prompt context by wrapping in single quotes and escaping internal quotes
        protectedMerge = None
        if mergePrompt:
            try:
                protectedMerge = str(mergePrompt).replace("'", "\\'")
            except Exception:
                protectedMerge = str(mergePrompt)

        prompt = (
            "You are a mapping generator. Return ONLY JSON.\n\n"
            "Given discovered headers and sample values, map them to the canonical headers.\n"
            "Do not invent fields. Use null if no mapping. Provide normalization policy.\n\n"
            f"CANONICAL_SPEC:\n{json.dumps(canonicalSpec, ensure_ascii=False, indent=2)}\n\n"
            f"HEADERS_DISCOVERED:\n{json.dumps(inventory, ensure_ascii=False, indent=2)}\n\n"
            + (f"MERGE_PROMPT_CONTEXT (protected):\n'{protectedMerge}'\n\n" if protectedMerge is not None else "") +
            "REPLY JSON SHAPE:\n(Example)\n"
            "{\n \"mappings\": {\"<sourceHeader>\": \"<Canonical>|null\"},\n"
            " \"normalizationPolicy\": {\n \"TotalAmount\": {\"decimalSeparator\": \",\"|\".\"},\n"
            " \"Currency\": {\"stripSymbols\": true},\n"
            " \"Date\": {\"formats\": [\"DD.MM.YYYY\",\"YYYY-MM-DD\"]}\n }\n}\n"
        )

        response = await self.services.ai.callAi(prompt=prompt)
        if not response:
            return {"mapping": {}, "normalizationPolicy": {}}

        # Extract JSON from response more safely
        start_idx = response.find('{')
        end_idx = response.rfind('}')
        if start_idx == -1 or end_idx == -1 or start_idx >= end_idx:
            return {"mapping": {}, "normalizationPolicy": {}}

        js = response[start_idx:end_idx + 1]
        try:
            mapping = json.loads(js)
        except json.JSONDecodeError:
            return {"mapping": {}, "normalizationPolicy": {}}

        # Normalize key naming from AI: prefer single key "mapping"
        if "mapping" not in mapping and "mappings" in mapping and isinstance(mapping["mappings"], dict):
            mapping["mapping"] = mapping["mappings"]
            try:
                del mapping["mappings"]
            except Exception:
                pass

        # Ensure canonicalHeaders present in mapping for downstream use
        if "canonicalHeaders" not in mapping:
            mapping["canonicalHeaders"] = canonicalSpec.get("canonicalHeaders", [])

        # debug artifact
        self._writeDebugArtifact("mapping.json", mapping)
        return mapping

    def applyMapping(self, mergedJson: Dict[str, Any], mappingSpec: Dict[str, Any]) -> Dict[str, Any]:
        mappings = (mappingSpec or {}).get("mapping", {})
        policy = (mappingSpec or {}).get("normalizationPolicy", {})

        # Prefer headers provided by mapping (generic across domains)
        canonicalHeaders = (mappingSpec or {}).get("canonicalHeaders") or []
        if not canonicalHeaders:
            # Fallback to union of mapped targets
            canonicalHeaders = sorted(list({t for t in mappings.values() if t}))

        rows: List[List[str]] = []
        sections = mergedJson.get("sections", []) if isinstance(mergedJson, dict) else []
        for section in sections:
            # Use only the fundamental agreed JSON structure: content_type/elements
            if section.get("content_type") != "table":
                continue

            # Extract table data from elements array
            sourceHeaders = []
            sourceRows = []
            for element in section.get("elements", []):
                if isinstance(element, dict) and "headers" in element and "rows" in element:
                    sourceHeaders = element.get("headers") or []
                    sourceRows = element.get("rows") or []
                    break

            if not sourceHeaders or not sourceRows:
                continue

            # Build index map: canonical -> source index or None
            indexMap: Dict[str, int] = {}
            for ci, ch in enumerate(canonicalHeaders):
                srcIndex = None
                for si, sh in enumerate(sourceHeaders):
                    # Prefer explicit mapping target; fallback to identity when names match
                    target = mappings.get(sh)
                    if target is None and sh == ch:
                        target = ch
                    if target == ch:
                        srcIndex = si
                        break
                indexMap[ch] = srcIndex

            # Transform rows
            for r in sourceRows:
                canonicalRow: List[str] = []
                for ch in canonicalHeaders:
                    idx = indexMap.get(ch)
                    try:
                        value = r[idx] if (idx is not None and idx < len(r)) else ""
                    except (IndexError, KeyError) as e:
                        # Handle corrupted data gracefully
                        value = ""
                    canonicalRow.append(self._normalizeValue(ch, value, policy))
                # consider as row if at least one non-empty meaningful field
                if any(v.strip() for v in canonicalRow):
                    rows.append(canonicalRow)

        canonical = {
            "metadata": {
                "title": mergedJson.get("metadata", {}).get("title", "Merged Document"),
                "source_documents": mergedJson.get("metadata", {}).get("source_documents", [])
            },
            "sections": [
                {
                    "id": "canonical_table_1",
                    "content_type": "table",
                    "elements": [
                        {
                            "headers": canonicalHeaders,
                            "rows": rows
                        }
                    ],
                    "order": 1
                }
            ]
        }

        # debug artifact
        self._writeDebugArtifact("canonical_merged.json", canonical)
        return canonical

    def validateCanonical(self, canonicalJson: Dict[str, Any]) -> Dict[str, Any]:
        rows = []
        try:
            sections = canonicalJson.get("sections", [])
            for s in sections:
                if s.get("content_type") == "table":
                    # Extract rows from elements array
                    for element in s.get("elements", []):
                        if isinstance(element, dict) and "rows" in element:
                            rows.extend(element.get("rows", []))
        except Exception:
            rows = []
        report = {
            "rowCount": len(rows),
            "success": len(rows) > 0
        }
        self._writeDebugArtifact("normalization_report.json", report)
        return report

    # Internal helpers
    def _normalizeValue(self, canonicalHeader: str, value: Any, policy: Dict[str, Any]) -> str:
        if value is None:
            return ""
        text = str(value).strip()
        # Generic normalization guided by policy; avoid domain specifics
        if canonicalHeader in (policy.get("numericFields", []) or []):
            dec = ((policy.get(canonicalHeader) or {}).get("decimalSeparator")
                   or (policy.get("numeric") or {}).get("decimalSeparator")
                   or ".")
            if dec == ",":
                text = text.replace(".", "").replace(",", ".") if "," in text else text
            text = ''.join(ch for ch in text if ch.isdigit() or ch in ['.', '-', '+'])
        elif (policy.get("text") or {}).get("stripSymbols") and canonicalHeader in (policy.get("text", {}).get("applyTo", []) or []):
            text = ''.join(ch for ch in text if ch.isalpha())
            text = text.upper()
        return text

    def _writeDebugArtifact(self, fileName: str, obj: Any) -> None:
        try:
            debugEnabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
            if not debugEnabled:
                return
            root = "./test-chat/ai"
            os.makedirs(root, exist_ok=True)
            # Prefix timestamp for files that are frequently overwritten
            ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            if fileName in ("mapping.json", "canonical_merged.json"):
                outName = f"{ts}_{fileName}"
            else:
                outName = fileName
            path = os.path.join(root, outName)
            with open(path, "w", encoding="utf-8") as f:
                if isinstance(obj, (dict, list)):
                    f.write(json.dumps(obj, ensure_ascii=False, indent=2))
                else:
                    f.write(str(obj))
        except Exception:
            pass

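A minimal end-to-end sketch of how this service is presumably driven; the merged_json input and the services wiring are assumptions for illustration only:

# Sketch only - wiring and input are hypothetical, not part of this diff.
normalizer = NormalizationService(services)
inventory = normalizer.discoverStructures(merged_json)              # collect headers + value samples
mapping = await normalizer.requestHeaderMapping(inventory, cacheKey="merge-1")
canonical = normalizer.applyMapping(merged_json, mapping)           # single canonical table section
report = normalizer.validateCanonical(canonical)                    # {"rowCount": ..., "success": ...}
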
@@ -21,7 +21,7 @@ class SharepointService:

        Use setAccessTokenFromConnection() method to configure the access token before making API calls.
        """
-        self.serviceCenter = serviceCenter
+        self.services = serviceCenter
        self.access_token = None
        self.base_url = "https://graph.microsoft.com/v1.0"

@@ -16,7 +16,7 @@ class TicketService:
        Args:
            serviceCenter: Service center instance for accessing other services
        """
-        self.serviceCenter = serviceCenter
+        self.services = serviceCenter

    async def _createTicketInterfaceByType(
        self,

@@ -4,6 +4,7 @@ Provides centralized access to configuration, events, and other utilities.
 """

 import logging
+import os
 from typing import Any, Optional, Dict, Callable
 from modules.shared.configuration import APP_CONFIG
 from modules.shared.eventManagement import eventManager

@@ -139,4 +140,43 @@ class UtilsService:
             return TokenManager().getFreshToken(connectionId)
         except Exception as e:
             logger.error(f"Error getting fresh token for connection {connectionId}: {str(e)}")
             return None
+
+    def debugLogToFile(self, message: str, context: str = "DEBUG"):
+        """
+        Log debug message to file if debug logging is enabled.
+
+        Args:
+            message: Debug message to log
+            context: Context identifier for the debug message
+        """
+        try:
+            # Check if debug logging is enabled
+            debug_enabled = self.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
+            if not debug_enabled:
+                return
+
+            # Get debug directory
+            debug_dir = self.configGet("APP_DEBUG_CHAT_WORKFLOW_DIR", "./test-chat")
+            if not os.path.isabs(debug_dir):
+                # If relative path, make it relative to the gateway directory
+                gateway_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+                debug_dir = os.path.join(gateway_dir, debug_dir)
+
+            # Ensure debug directory exists
+            os.makedirs(debug_dir, exist_ok=True)
+
+            # Create debug file path
+            debug_file = os.path.join(debug_dir, "debug_workflow.log")
+
+            # Format the debug entry
+            timestamp = self.getUtcTimestamp()
+            debug_entry = f"[{timestamp}] [{context}] {message}\n"
+
+            # Write to debug file
+            with open(debug_file, "a", encoding="utf-8") as f:
+                f.write(debug_entry)
+
+        except Exception as e:
+            # Don't log debug errors to avoid recursion
+            pass

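A usage sketch of the new helper, assuming the UtilsService instance is exposed as services.utils as elsewhere in this branch; the config keys shown are the ones the method actually reads:

# Sketch only: appends to <APP_DEBUG_CHAT_WORKFLOW_DIR>/debug_workflow.log,
# and only when APP_DEBUG_CHAT_WORKFLOW_ENABLED is truthy.
services.utils.debugLogToFile("Planning started", "TASK_PLANNER")
services.utils.debugLogToFile("Planning finished")  # context defaults to "DEBUG"
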
@@ -16,7 +16,7 @@ class WorkflowService:
     """Service class containing methods for document processing, chat operations, and workflow management"""

     def __init__(self, serviceCenter):
-        self.serviceCenter = serviceCenter
+        self.services = serviceCenter
         self.user = serviceCenter.user
         self.workflow = serviceCenter.workflow
         self.interfaceDbChat = serviceCenter.interfaceDbChat

@@ -78,11 +78,15 @@ class WorkflowService:
     def getChatDocumentsFromDocumentList(self, documentList: List[str]) -> List[ChatDocument]:
         """Get ChatDocuments from a list of document references using all three formats."""
         try:
-            # Get the current workflow from services (same pattern as setWorkflowContext)
-            workflow = self.services.currentWorkflow
-            if not workflow:
-                logger.error("No workflow available for document list resolution")
-                return []
+            workflow = getattr(self.serviceCenter, 'currentWorkflow', None) or self.workflow
+            # Reload workflow from database to ensure we have all messages
+            if hasattr(workflow, 'id'):
+                try:
+                    workflow = self.getWorkflow(workflow.id)
+                    logger.debug(f"Reloaded workflow {workflow.id} with {len(workflow.messages)} messages")
+                except Exception as e:
+                    logger.warning(f"Could not reload workflow from database: {str(e)}")

             all_documents = []
             for doc_ref in documentList:

@@ -125,7 +129,9 @@ class WorkflowService:
                         break

                 if not message_found:
-                    logger.warning(f"Message with ID {message_id} not found in workflow. Available message IDs: {[str(msg.id) for msg in workflow.messages]}")
+                    available_ids = [str(msg.id) for msg in workflow.messages]
+                    logger.error(f"Message with ID {message_id} not found in workflow. Available message IDs: {available_ids}")
+                    raise ValueError(f"Document reference not found: docList:{message_id}:{label}")
                 elif len(parts) >= 2:
                     # Format: docList:<label> - find message by documentsLabel
                     label = parts[1]

@@ -154,7 +160,8 @@ class WorkflowService:
                     else:
                         logger.debug(f"Found docList reference {doc_ref} but message has no documents")
                 else:
-                    logger.debug(f"No messages found with documentsLabel: {label}")
+                    logger.error(f"No messages found with documentsLabel: {label}")
+                    raise ValueError(f"Document reference not found: docList:{label}")
             else:
                 # Direct label reference (round1_task2_action3_contextinfo)
                 # Search for messages with matching documentsLabel to find the actual documents

@@ -198,30 +205,8 @@ class WorkflowService:
                     else:
                         logger.debug(f"No documents found in newest message {newest_message.id}")
                 else:
-                    logger.debug(f"No messages found with documentsLabel: {doc_ref}")
-                    # Fallback: also check if any message has this documentsLabel as a prefix
-                    logger.debug(f"Trying fallback search for messages with documentsLabel containing: {doc_ref}")
-                    fallback_messages = []
-                    for message in workflow.messages:
-                        msg_documents_label = getattr(message, 'documentsLabel', '')
-                        if msg_documents_label and msg_documents_label.startswith(doc_ref):
-                            fallback_messages.append(message)
-                            logger.debug(f"Found fallback message {message.id} with documentsLabel: {msg_documents_label}")
-
-                    if fallback_messages:
-                        # Sort by publishedAt descending (newest first)
-                        fallback_messages.sort(key=lambda msg: getattr(msg, 'publishedAt', 0), reverse=True)
-                        newest_fallback = fallback_messages[0]
-
-                        logger.debug(f"Using fallback message {newest_fallback.id} with documentsLabel: {getattr(newest_fallback, 'documentsLabel', 'unknown')}")
-                        if newest_fallback.documents:
-                            doc_names = [doc.fileName for doc in newest_fallback.documents if hasattr(doc, 'fileName')]
-                            logger.debug(f"Added {len(newest_fallback.documents)} documents from fallback message {newest_fallback.id}: {doc_names}")
-                            all_documents.extend(newest_fallback.documents)
-                        else:
-                            logger.debug(f"No documents found in fallback message {newest_fallback.id}")
-                    else:
-                        logger.debug(f"No fallback messages found either")
+                    logger.error(f"No messages found with documentsLabel: {doc_ref}")
+                    raise ValueError(f"Document reference not found: {doc_ref}")

             logger.debug(f"Resolved {len(all_documents)} documents from document list: {documentList}")
             return all_documents

@@ -260,7 +245,8 @@ class WorkflowService:
             token_status = f"error: {str(e)}"

         # Build enhanced reference with state information
-        base_ref = f"connection:{connection.authority.value}:{connection.externalUsername}:{connection.id}"
+        # Format: connection:msft:<username> (without UUID)
+        base_ref = f"connection:{connection.authority.value}:{connection.externalUsername}"
         state_info = f" [status:{connection.status.value}, token:{token_status}]"

         logger.debug(f"getConnectionReferenceFromUserConnection: Built reference: {base_ref + state_info}")

@@ -283,26 +269,25 @@ class WorkflowService:
             return None

     def getUserConnectionFromConnectionReference(self, connectionReference: str) -> Optional[UserConnection]:
-        """Get UserConnection from reference string (handles both old and enhanced formats)"""
+        """Get UserConnection from reference string (handles new format without UUID)"""
         try:
-            # Parse reference format: connection:{authority}:{username}:{id} [status:..., token:...]
+            # Parse reference format: connection:{authority}:{username} [status:..., token:...]
             # Remove state information if present
             base_reference = connectionReference.split(' [')[0]

             parts = base_reference.split(':')
-            if len(parts) != 4 or parts[0] != "connection":
+            if len(parts) != 3 or parts[0] != "connection":
                 return None

             authority = parts[1]
             username = parts[2]
-            conn_id = parts[3]

             # Get user connections through AppObjects interface
             user_connections = self.interfaceDbApp.getUserConnections(self.user.id)

-            # Find matching connection
+            # Find matching connection by authority and username (no UUID needed)
             for conn in user_connections:
-                if str(conn.id) == conn_id and conn.authority.value == authority and conn.externalUsername == username:
+                if conn.authority.value == authority and conn.externalUsername == username:
                     return conn
             return None

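A small sketch of the new UUID-less reference round-trip; the concrete values are illustrative only:

# Sketch only - e.g. "connection:msft:alice@contoso.com [status:active, token:ok]"
base_reference = "connection:msft:alice@contoso.com [status:active, token:ok]".split(' [')[0]
parts = base_reference.split(':')           # ["connection", "msft", "alice@contoso.com"]
authority, username = parts[1], parts[2]    # matched against conn.authority.value / conn.externalUsername
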
@@ -437,11 +422,7 @@ class WorkflowService:
     def setWorkflowContext(self, round_number: int = None, task_number: int = None, action_number: int = None):
         """Set current workflow context for document generation and routing"""
         try:
-            # Get the current workflow from services
-            workflow = self.services.currentWorkflow
-            if not workflow:
-                logger.error("No workflow available for context setting")
-                return
+            workflow = getattr(self.serviceCenter, 'currentWorkflow', None) or self.workflow

             # Prepare update data
             update_data = {}

@@ -548,10 +529,7 @@ class WorkflowService:
     def getDocumentCount(self) -> str:
         """Get document count for task planning (matching old handlingTasks.py logic)"""
         try:
-            # Get the current workflow from services
-            workflow = self.services.currentWorkflow
-            if not workflow:
-                return "No documents available"
+            workflow = getattr(self.serviceCenter, 'currentWorkflow', None) or self.workflow

             # Count documents from all messages in the workflow (like old system)
             total_docs = 0

@@ -570,10 +548,7 @@ class WorkflowService:
     def getWorkflowHistoryContext(self) -> str:
         """Get workflow history context for task planning (matching old handlingTasks.py logic)"""
         try:
-            # Get the current workflow from services
-            workflow = self.services.currentWorkflow
-            if not workflow:
-                return "No previous round context available"
+            workflow = getattr(self.serviceCenter, 'currentWorkflow', None) or self.workflow

             # Check if there are any previous rounds by looking for "first" messages
             has_previous_rounds = False

@@ -622,15 +597,26 @@ class WorkflowService:
         if not workflow or not hasattr(workflow, 'messages'):
             return "No documents available"

+        # Use the provided workflow object directly to avoid database reload issues
+        # that can cause filename truncation. The workflow object should already be up-to-date.
+        logger.debug(f"Using provided workflow object for getAvailableDocuments (ID: {workflow.id if hasattr(workflow, 'id') else 'unknown'})")
+
+        # Debug: Check document filenames in the workflow object
+        if hasattr(workflow, 'messages') and workflow.messages:
+            for message in workflow.messages:
+                if hasattr(message, 'documents') and message.documents:
+                    for doc in message.documents:
+                        logger.debug(f"Workflow document {doc.id}: fileName='{doc.fileName}' (length: {len(doc.fileName)})")
+
         # Get document reference list using the exact same logic as old system
         document_list = self._getDocumentReferenceList(workflow)

-        # Build technical context string for AI action planning (exact copy of old system)
-        context = "AVAILABLE DOCUMENTS:\n\n"
+        # Build index string for AI action planning
+        context = ""

-        # Process chat exchanges (current round) - exact copy of old system
+        # Process current round exchanges first
         if document_list["chat"]:
-            context += "CURRENT ROUND DOCUMENTS:\n"
+            context += "\nCurrent round documents:\n"
             for exchange in document_list["chat"]:
                 # Generate docList reference for the exchange (using message ID and label)
                 # Find the message that corresponds to this exchange

@@ -656,9 +642,9 @@ class WorkflowService:
                     context += f"  - docItem:{doc_ref}\n"
             context += "\n"

-        # Process history exchanges (previous rounds) - exact copy of old system
+        # Process previous rounds after
         if document_list["history"]:
-            context += "WORKFLOW HISTORY DOCUMENTS:\n"
+            context += "\nPast rounds documents:\n"
             for exchange in document_list["history"]:
                 # Generate docList reference for the exchange (using message ID and label)
                 # Find the message that corresponds to this exchange

@@ -685,7 +671,7 @@ class WorkflowService:
             context += "\n"

         if not document_list["chat"] and not document_list["history"]:
-            context += "NO DOCUMENTS AVAILABLE - This workflow has no documents to process.\n"
+            context += "\nNO DOCUMENTS AVAILABLE - This workflow has no documents to process.\n"

         return context

@@ -713,39 +699,23 @@ class WorkflowService:
         for message in reversed(workflow.messages):
             is_first = message.status == "first" if hasattr(message, 'status') else False

-            # Build a DocumentExchange if message has documents
+            # Build a DocumentExchange if message has documents and an explicit documentsLabel
             doc_exchange = None
             if message.documents:
-                if message.actionId and message.documentsLabel:
-                    # Validate that we use the same label as in the message
+                existing_label = getattr(message, 'documentsLabel', None)
+                if existing_label:
+                    # Validate and use the message's actual documentsLabel
                     validated_label = self._validateDocumentLabelConsistency(message)

-                    # Use the message's actual documentsLabel
                     doc_refs = []
                     for doc in message.documents:
                         doc_ref = self._getDocumentReferenceFromChatDocument(doc, message)
                         doc_refs.append(doc_ref)

                     doc_exchange = {
                         'documentsLabel': validated_label,
                         'documents': doc_refs
                     }
-                else:
-                    # Generate new labels for documents without explicit labels
-                    doc_refs = []
-                    for doc in message.documents:
-                        doc_ref = self._getDocumentReferenceFromChatDocument(doc, message)
-                        doc_refs.append(doc_ref)
-
-                    if doc_refs:
-                        # Create a label based on message context
-                        context_prefix = self._generateWorkflowContextPrefix(message)
-                        context_label = f"{context_prefix}_context"
-
-                        doc_exchange = {
-                            'documentsLabel': context_label,
-                            'documents': doc_refs
-                        }
+                # IMPORTANT: Never synthesize new labels here. If a message lacks
+                # a documentsLabel, we skip adding an exchange for it.

             # Append to appropriate container based on boundary
             if doc_exchange:

@@ -773,12 +743,22 @@ class WorkflowService:
         """Update file attributes (fileName, fileSize, mimeType) for documents"""
         for doc in documents:
             try:
+                # Debug: Log original filename before refresh
+                original_filename = doc.fileName
+                logger.debug(f"Before refresh - Document {doc.id}: fileName='{original_filename}' (length: {len(original_filename)})")
+
                 # Use the proper WorkflowService method to get file info
                 file_info = self.getFileInfo(doc.fileId)
                 if file_info:
+                    db_filename = file_info.get("fileName", doc.fileName)
+                    logger.debug(f"Database filename for {doc.id}: '{db_filename}' (length: {len(db_filename)})")
+
                     doc.fileName = file_info.get("fileName", doc.fileName)
                     doc.fileSize = file_info.get("size", doc.fileSize)
                     doc.mimeType = file_info.get("mimeType", doc.mimeType)
+
+                    # Debug: Log final filename after refresh
+                    logger.debug(f"After refresh - Document {doc.id}: fileName='{doc.fileName}' (length: {len(doc.fileName)})")
                 else:
                     logger.warning(f"File not found for document {doc.id}, fileId: {doc.fileId}")
             except Exception as e:

@@ -794,6 +774,8 @@ class WorkflowService:
     def _getDocumentReferenceFromChatDocument(self, document, message) -> str:
         """Get document reference using document ID and filename."""
         try:
+            # Debug logging to track filename truncation
+            logger.debug(f"Creating document reference for {document.id}: fileName='{document.fileName}' (length: {len(document.fileName)})")
             # Use document ID and filename for simple reference
             return f"docItem:{document.id}:{document.fileName}"
         except Exception as e:

@@ -844,14 +826,14 @@ class WorkflowService:
         """Get connection reference list (matching old handlingTasks.py logic)"""
         try:
             # Get connections from the database using the same logic as the old system
-            if hasattr(self.serviceCenter, 'interfaceDbApp') and hasattr(self.serviceCenter, 'user'):
-                userId = self.serviceCenter.user.id
-                connections = self.serviceCenter.interfaceDbApp.getUserConnections(userId)
+            if hasattr(self.services, 'interfaceDbApp') and hasattr(self.services, 'user'):
+                userId = self.services.user.id
+                connections = self.services.interfaceDbApp.getUserConnections(userId)
                 if connections:
                     # Format connections as reference strings using the same pattern as the old system
                     connectionRefs = []
                     for conn in connections:
-                        # Create reference string in format: connection:{authority}:{username}:{id} [status:..., token:...]
+                        # Create reference string in format: connection:{authority}:{username} [status:..., token:...]
                         # This matches the format expected by getUserConnectionFromConnectionReference()
                         ref = self.getConnectionReferenceFromUserConnection(conn)
                         connectionRefs.append(ref)

@@ -42,9 +42,7 @@ class MethodDocument(MethodBase):
         - operationType (str, optional): extract_content | analyze_document | summarize_content. Default: extract_content.
         - processDocumentsIndividually (bool, optional): Process each document separately. Default: True.
         - chunkAllowed (bool, optional): Allow chunking for large inputs. Default: True.
-        - mergeStrategy (dict, optional): Merge strategy for chunked content.
-        - expectedDocumentFormats (list, optional): Desired output format specs.
-        - includeMetadata (bool, optional): Include file metadata. Default: True.
+        - outputMimeType (str, optional): MIME type for output file. Options: "text/plain" (default), "application/json", "text/csv", "text/html". Default: "text/plain".
         """
         try:
             documentList = parameters.get("documentList")

@@ -54,13 +52,7 @@ class MethodDocument(MethodBase):
             operationType = parameters.get("operationType", "extract_content")
             processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
             chunkAllowed = parameters.get("chunkAllowed", True)
-            mergeStrategy = parameters.get("mergeStrategy", {
-                "groupBy": "typeGroup",
-                "orderBy": "id",
-                "mergeType": "concatenate"
-            })
-            expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
-            includeMetadata = parameters.get("includeMetadata", True)
+            outputMimeType = parameters.get("outputMimeType", "text/plain")

             if not documentList:
                 return ActionResult.isFailure(

@@ -87,19 +79,16 @@ class MethodDocument(MethodBase):
                 compressContext=not chunkAllowed
             )

-            # Add format instructions to prompt if expected formats are provided
+            # Add format instructions to prompt based on MIME type
             enhanced_prompt = prompt
-            if expectedDocumentFormats:
-                format_instructions = []
-                for fmt in expectedDocumentFormats:
-                    extension = fmt.get("extension", ".txt")
-                    mime_type = fmt.get("mimeType", "text/plain")
-                    description = fmt.get("description", "")
-                    format_instructions.append(f"- {extension} ({mime_type}): {description}")
-
-                if format_instructions:
-                    enhanced_prompt += f"\n\nPlease format the output as: {', '.join([fmt.get('extension', '.txt') for fmt in expectedDocumentFormats])}"
-                    enhanced_prompt += f"\nExpected formats:\n" + "\n".join(format_instructions)
+            mime_type_mapping = {
+                "text/plain": (".txt", "Plain text format"),
+                "application/json": (".json", "Structured JSON format"),
+                "text/csv": (".csv", "Table format"),
+                "text/html": (".html", "HTML format")
+            }
+            extension, description = mime_type_mapping.get(outputMimeType, (".txt", "Plain text format"))
+            enhanced_prompt += f"\n\nPlease format the output as {extension} ({outputMimeType}): {description}"

             # Use enhanced AI service for extraction
             ai_response = await self.services.ai.callAi(

@@ -125,8 +114,16 @@ class MethodDocument(MethodBase):
             for i, chatDocument in enumerate(chatDocuments):
                 # Use the AI response directly - it already contains processed content
                 final_content = ai_response
-                final_mime_type = "text/plain"
-                final_extension = ".txt"
+
+                # Determine output format based on MIME type
+                mime_type_mapping = {
+                    "text/plain": ".txt",
+                    "application/json": ".json",
+                    "text/csv": ".csv",
+                    "text/html": ".html"
+                }
+                final_extension = mime_type_mapping.get(outputMimeType, ".txt")
+                final_mime_type = outputMimeType

                 # Create meaningful output fileName with workflow context
                 original_fileName = chatDocument.fileName

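A quick sketch of the mapping behaviour; "image/png" is only a hypothetical unmapped type used to show the fallback branch:

mime_type_mapping = {"text/plain": ".txt", "application/json": ".json", "text/csv": ".csv", "text/html": ".html"}
print(mime_type_mapping.get("application/json", ".txt"))   # ".json"
print(mime_type_mapping.get("image/png", ".txt"))           # ".txt" - unmapped types fall back, final_mime_type keeps the caller's value
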
@@ -156,9 +153,6 @@ class MethodDocument(MethodBase):
                 error=str(e)
             )
-
-
-

    @action
    async def generate(self, parameters: Dict[str, Any]) -> ActionResult:
        """

@@ -175,8 +169,6 @@ class MethodDocument(MethodBase):
         - operationType (str, optional): generate_report | analyze_documents. Default: generate_report.
         - processDocumentsIndividually (bool, optional): Process per document. Default: True.
         - chunkAllowed (bool, optional): Allow chunking for large inputs. Default: True.
-        - mergeStrategy (dict, optional): Merging rules for multi-part generation.
-        - includeMetadata (bool, optional): Include file metadata. Default: True.
         """
         try:
             documentList = parameters.get("documentList")

@@ -188,12 +180,6 @@ class MethodDocument(MethodBase):
             operationType = parameters.get("operationType", "generate_report")
             processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
             chunkAllowed = parameters.get("chunkAllowed", True)
-            mergeStrategy = parameters.get("mergeStrategy", {
-                "groupBy": "typeGroup",
-                "orderBy": "id",
-                "mergeType": "concatenate"
-            })
-            includeMetadata = parameters.get("includeMetadata", True)

             if not documentList:
                 return ActionResult.isFailure(

@@ -31,14 +31,14 @@ class MethodAi(MethodBase):
     async def process(self, parameters: Dict[str, Any]) -> ActionResult:
         """
         GENERAL:
-        - Purpose: AI-based analysis and content generation with optional document context.
-        - Input requirements: aiPrompt (required); optional documentList, resultType, processingMode, includeMetadata, operationType, priority, maxCost, maxProcessingTime, requiredTags.
-        - Output format: Single or multiple documents in requested format.
+        - Purpose: Process a user prompt with any number of optional input documents to produce one or many output documents of the SAME format.
+        - Input requirements: aiPrompt (required); optional documentList.
+        - Output format: Exactly one file format per call; to produce multiple output formats, make separate calls.

         Parameters:
         - aiPrompt (str, required): Instruction for the AI.
         - documentList (list, optional): Document reference(s) for context.
-        - resultType (str, optional): Output extension (txt, json, md, csv, xml, html, pdf, docx, xlsx, png). Default: txt.
+        - resultType (str, optional): Output file extension - only one extension allowed (e.g. txt, json, md, csv, xml, html, pdf, docx, xlsx, png, ...). Default: txt.
         - processingMode (str, optional): basic | advanced | detailed. Default: basic.
         - includeMetadata (bool, optional): Include metadata when available. Default: True.
         - operationType (str, optional): general | generate_plan | analyse_content | generate_content | web_research | image_analysis | image_generation. Default: general.

@ -169,12 +169,12 @@ class MethodAi(MethodBase):
|
||||||
Parameters:
|
Parameters:
|
||||||
- user_prompt (str, required): Research question or topic.
|
- user_prompt (str, required): Research question or topic.
|
||||||
- urls (list, optional): Specific URLs to crawl.
|
- urls (list, optional): Specific URLs to crawl.
|
||||||
- max_results (int, optional): Max search results. Default: 10.
|
- max_results (int, optional): Max search results. Default: 5.
|
||||||
- max_pages (int, optional): Max pages to crawl per site. Default: 10.
|
- max_pages (int, optional): Max pages to crawl per site. Default: 5.
|
||||||
- search_depth (str, optional): basic | advanced. Default: basic.
|
- search_depth (str, optional): basic | advanced. Default: basic.
|
||||||
- extract_depth (str, optional): basic | advanced. Default: advanced.
|
- extract_depth (str, optional): basic | advanced. Default: advanced.
|
||||||
- pages_search_depth (int, optional): Crawl depth level. Default: 2.
|
- pages_search_depth (int, optional): Crawl depth level. Default: 2.
|
||||||
- country (str, optional): Country code for bias.
|
- country (str, optional): Full English country name (ISO-3166); country codes can be mapped to names via pycountry or i18n-iso-countries (see the sketch after this parameter list).
|
||||||
- time_range (str, optional): d | w | m | y.
|
- time_range (str, optional): d | w | m | y.
|
||||||
- topic (str, optional): general | news | academic.
|
- topic (str, optional): general | news | academic.
|
||||||
- language (str, optional): Language code (e.g., de, en, fr).
|
- language (str, optional): Language code (e.g., de, en, fr).
|
||||||
|
|
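A hedged sketch tying the parameters above together: an example web_research parameter set (values invented) plus a small helper that maps an ISO-3166 alpha-2 code to the full English country name, assuming the pycountry package mentioned in the country parameter is available.

import pycountry  # assumption: installed in the environment

def country_name_from_code(code: str) -> str:
    """Map an ISO-3166 alpha-2 code (e.g. 'de') to its English name (e.g. 'Germany')."""
    match = pycountry.countries.get(alpha_2=code.upper())
    return match.name if match else code

example_parameters = {
    "user_prompt": "Current EU rules on AI transparency",
    "max_results": 5,
    "max_pages": 5,
    "search_depth": "basic",
    "extract_depth": "advanced",
    "pages_search_depth": 2,
    "country": country_name_from_code("de"),  # -> "Germany"
    "time_range": "m",
    "topic": "news",
    "language": "de",
}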
@@ -182,8 +182,8 @@ class MethodAi(MethodBase):
|
||||||
try:
|
try:
|
||||||
user_prompt = parameters.get("user_prompt")
|
user_prompt = parameters.get("user_prompt")
|
||||||
urls = parameters.get("urls")
|
urls = parameters.get("urls")
|
||||||
max_results = parameters.get("max_results", 10)
|
max_results = parameters.get("max_results", 5)
|
||||||
max_pages = parameters.get("max_pages", 10)
|
max_pages = parameters.get("max_pages", 5)
|
||||||
search_depth = parameters.get("search_depth", "basic")
|
search_depth = parameters.get("search_depth", "basic")
|
||||||
extract_depth = parameters.get("extract_depth", "advanced")
|
extract_depth = parameters.get("extract_depth", "advanced")
|
||||||
pages_search_depth = parameters.get("pages_search_depth", 2)
|
pages_search_depth = parameters.get("pages_search_depth", 2)
|
||||||
|
|
|
||||||
|
|
@@ -154,7 +154,13 @@ class MethodOutlook(MethodBase):
|
||||||
if not query or not query.strip():
|
if not query or not query.strip():
|
||||||
# No query specified, just get emails from folder
|
# No query specified, just get emails from folder
|
||||||
if folder and folder.lower() != "all":
|
if folder and folder.lower() != "all":
|
||||||
params["$filter"] = f"parentFolderId eq '{folder}'"
|
# Use folder name directly for well-known folders, or get folder ID
|
||||||
|
if folder.lower() in ["inbox", "drafts", "sentitems", "deleteditems"]:
|
||||||
|
params["$filter"] = f"parentFolderId eq '{folder}'"
|
||||||
|
else:
|
||||||
|
# For custom folders, we need to get the folder ID first
|
||||||
|
# This will be handled by the calling method
|
||||||
|
params["$filter"] = f"parentFolderId eq '{folder}'"
|
||||||
# Add orderby for basic queries
|
# Add orderby for basic queries
|
||||||
params["$orderby"] = "receivedDateTime desc"
|
params["$orderby"] = "receivedDateTime desc"
|
||||||
return params
|
return params
|
||||||
|
|
@@ -191,11 +197,21 @@ class MethodOutlook(MethodBase):
|
||||||
|
|
||||||
|
|
||||||
# Use only subject search to keep filter simple
|
# Use only subject search to keep filter simple
|
||||||
params["$filter"] = f"contains(subject,'{clean_query}')"
|
# Handle wildcard queries specially
|
||||||
|
if clean_query == "*" or clean_query == "":
|
||||||
# Add folder filter if specified
|
# For wildcard or empty query, don't use contains filter
|
||||||
if folder and folder.lower() != "all":
|
# Just use folder filter if specified
|
||||||
params["$filter"] = f"{params['$filter']} and parentFolderId eq '{folder}'"
|
if folder and folder.lower() != "all":
|
||||||
|
params["$filter"] = f"parentFolderId eq '{folder}'"
|
||||||
|
else:
|
||||||
|
# No filter needed for wildcard search across all folders
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
params["$filter"] = f"contains(subject,'{clean_query}')"
|
||||||
|
|
||||||
|
# Add folder filter if specified
|
||||||
|
if folder and folder.lower() != "all":
|
||||||
|
params["$filter"] = f"{params['$filter']} and parentFolderId eq '{folder}'"
|
||||||
|
|
||||||
# Add orderby for basic queries
|
# Add orderby for basic queries
|
||||||
params["$orderby"] = "receivedDateTime desc"
|
params["$orderby"] = "receivedDateTime desc"
|
||||||
|
|
@@ -235,6 +251,10 @@ class MethodOutlook(MethodBase):
|
||||||
if '@' in filter_text and '.' in filter_text and ' ' not in filter_text and not filter_text.startswith('from:'):
|
if '@' in filter_text and '.' in filter_text and ' ' not in filter_text and not filter_text.startswith('from:'):
|
||||||
return {"$filter": f"from/fromAddress/address eq '{filter_text}'"}
|
return {"$filter": f"from/fromAddress/address eq '{filter_text}'"}
|
||||||
|
|
||||||
|
# Handle OData filter conditions (contains 'eq', 'ne', 'gt', 'lt', etc.)
|
||||||
|
if any(op in filter_text.lower() for op in [' eq ', ' ne ', ' gt ', ' lt ', ' ge ', ' le ', ' and ', ' or ']):
|
||||||
|
return {"$filter": filter_text}
|
||||||
|
|
||||||
# Handle text content - search in subject
|
# Handle text content - search in subject
|
||||||
return {"$filter": f"contains(subject,'{filter_text}')"}
|
return {"$filter": f"contains(subject,'{filter_text}')"}
|
||||||
|
|
||||||
|
|
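A minimal sketch, not part of the diff, of how a $filter built by the helper above might be sent to Microsoft Graph; it assumes the synchronous requests library and an already-acquired bearer token, purely for illustration.

import requests

def list_messages_by_subject(access_token: str, filter_text: str, top: int = 10) -> dict:
    # Build the same subject-contains filter the helper above returns
    params = {
        "$filter": f"contains(subject,'{filter_text}')",
        "$top": top,
    }
    response = requests.get(
        "https://graph.microsoft.com/v1.0/me/messages",
        headers={"Authorization": f"Bearer {access_token}"},
        params=params,
    )
    response.raise_for_status()
    return response.json()  # {"value": [...], "@odata.nextLink": ...}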
@@ -300,26 +320,31 @@ class MethodOutlook(MethodBase):
|
||||||
"""
|
"""
|
||||||
GENERAL:
|
GENERAL:
|
||||||
- Purpose: Read emails and metadata from a mailbox folder.
|
- Purpose: Read emails and metadata from a mailbox folder.
|
||||||
- Input requirements: connectionReference (required); optional folder, limit, filter, expectedDocumentFormats.
|
- Input requirements: connectionReference (required); optional folder, limit, filter, outputMimeType.
|
||||||
- Output format: JSON with emails and metadata.
|
- Output format: JSON with emails and metadata.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
- connectionReference (str, required): Microsoft connection label.
|
- connectionReference (str, required): Microsoft connection label.
|
||||||
- folder (str, optional): Folder to read from. Default: Inbox.
|
- folder (str, optional): Folder to read from. Default: Inbox.
|
||||||
- limit (int, optional): Maximum items to return. Default: 10.
|
- limit (int, optional): Maximum items to return. Must be > 0; values <= 0 fall back to 1000. Default: 10.
|
||||||
- filter (str, optional): Sender, query operators, or subject text.
|
- filter (str, optional): Sender, query operators, or subject text.
|
||||||
- expectedDocumentFormats (list, optional): Output format preferences.
|
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
connectionReference = parameters.get("connectionReference")
|
connectionReference = parameters.get("connectionReference")
|
||||||
folder = parameters.get("folder", "Inbox")
|
folder = parameters.get("folder", "Inbox")
|
||||||
limit = parameters.get("limit", 10)
|
limit = parameters.get("limit", 10)
|
||||||
filter = parameters.get("filter")
|
filter = parameters.get("filter")
|
||||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
outputMimeType = parameters.get("outputMimeType", "application/json")
|
||||||
|
|
||||||
if not connectionReference:
|
if not connectionReference:
|
||||||
return ActionResult.isFailure(error="Connection reference is required")
|
return ActionResult.isFailure(error="Connection reference is required")
|
||||||
|
|
||||||
|
# Validate limit parameter
|
||||||
|
if limit <= 0:
|
||||||
|
limit = 1000
|
||||||
|
logger.warning(f"Invalid limit value ({limit}), using default value 1000")
|
||||||
|
|
||||||
# Validate filter parameter if provided
|
# Validate filter parameter if provided
|
||||||
if filter:
|
if filter:
|
||||||
# Remove any potentially dangerous characters that could break the filter
|
# Remove any potentially dangerous characters that could break the filter
|
||||||
|
|
@@ -343,8 +368,16 @@ class MethodOutlook(MethodBase):
|
||||||
"Content-Type": "application/json"
|
"Content-Type": "application/json"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Build the API request
|
# Get the folder ID for the specified folder
|
||||||
api_url = f"{graph_url}/me/mailFolders/{folder}/messages"
|
folder_id = self._getFolderId(folder, connection)
|
||||||
|
|
||||||
|
if folder_id:
|
||||||
|
# Build the API request with folder ID
|
||||||
|
api_url = f"{graph_url}/me/mailFolders/{folder_id}/messages"
|
||||||
|
else:
|
||||||
|
# Fallback: use folder name directly (for well-known folders like "Inbox")
|
||||||
|
api_url = f"{graph_url}/me/mailFolders/{folder}/messages"
|
||||||
|
logger.warning(f"Could not find folder ID for '{folder}', using folder name directly")
|
||||||
params = {
|
params = {
|
||||||
"$top": limit,
|
"$top": limit,
|
||||||
"$orderby": "receivedDateTime desc"
|
"$orderby": "receivedDateTime desc"
|
||||||
|
|
@@ -380,7 +413,11 @@ class MethodOutlook(MethodBase):
|
||||||
"count": len(emails_data.get("value", [])),
|
"count": len(emails_data.get("value", [])),
|
||||||
"folder": folder,
|
"folder": folder,
|
||||||
"filter": filter,
|
"filter": filter,
|
||||||
"apiResponse": emails_data
|
"apiMetadata": {
|
||||||
|
"@odata.context": emails_data.get("@odata.context"),
|
||||||
|
"@odata.count": emails_data.get("@odata.count"),
|
||||||
|
"@odata.nextLink": emails_data.get("@odata.nextLink")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
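Since apiResponse is replaced by the trimmed apiMetadata block, the @odata.nextLink it keeps is what a caller would use to page through further results. A hedged sketch of that pattern (requests is assumed for illustration, not necessarily the project's actual HTTP client):

import requests

def fetch_all_pages(first_url: str, headers: dict, params=None) -> list:
    items, url = [], first_url
    while url:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()
        items.extend(data.get("value", []))
        url = data.get("@odata.nextLink")  # absolute URL of the next page, or None
        params = None  # nextLink already encodes the original query parameters
    return items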
@@ -405,18 +442,15 @@ class MethodOutlook(MethodBase):
|
||||||
logger.error(f"Error reading emails from Microsoft Graph API: {str(e)}")
|
logger.error(f"Error reading emails from Microsoft Graph API: {str(e)}")
|
||||||
return ActionResult.isFailure(error=f"Failed to read emails: {str(e)}")
|
return ActionResult.isFailure(error=f"Failed to read emails: {str(e)}")
|
||||||
|
|
||||||
# Determine output format based on expected formats
|
# Determine output format based on MIME type
|
||||||
output_extension = ".json" # Default
|
mime_type_mapping = {
|
||||||
output_mime_type = "application/json" # Default
|
"application/json": ".json",
|
||||||
|
"text/plain": ".txt",
|
||||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
"text/csv": ".csv"
|
||||||
# Use the first expected format
|
}
|
||||||
expected_format = expectedDocumentFormats[0]
|
output_extension = mime_type_mapping.get(outputMimeType, ".json")
|
||||||
output_extension = expected_format.get("extension", ".json")
|
output_mime_type = outputMimeType
|
||||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
|
||||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
|
||||||
else:
|
|
||||||
logger.info("No expected format specified, using default .json format")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
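The mapping above only picks a file extension; how the email payload is actually rendered for each outputMimeType is not shown in this hunk. A minimal sketch of one plausible serialization, assuming a flat list of email dicts with receivedDateTime and subject keys:

import csv
import io
import json

def serialize_emails(emails: list, output_mime_type: str) -> str:
    if output_mime_type == "text/csv":
        buffer = io.StringIO()
        writer = csv.DictWriter(buffer, fieldnames=["receivedDateTime", "subject"], extrasaction="ignore")
        writer.writeheader()
        writer.writerows(emails)
        return buffer.getvalue()
    if output_mime_type == "text/plain":
        return "\n".join(f"{e.get('receivedDateTime')}  {e.get('subject')}" for e in emails)
    return json.dumps(emails, indent=2)  # default: application/json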
@@ -454,27 +488,32 @@ class MethodOutlook(MethodBase):
|
||||||
"""
|
"""
|
||||||
GENERAL:
|
GENERAL:
|
||||||
- Purpose: Search emails by query and return matching items with metadata.
|
- Purpose: Search emails by query and return matching items with metadata.
|
||||||
- Input requirements: connectionReference (required); query (required); optional folder, limit, expectedDocumentFormats.
|
- Input requirements: connectionReference (required); query (required); optional folder, limit, outputMimeType.
|
||||||
- Output format: JSON with search results and metadata.
|
- Output format: JSON with search results and metadata.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
- connectionReference (str, required): Microsoft connection label.
|
- connectionReference (str, required): Microsoft connection label.
|
||||||
- query (str, required): Search expression.
|
- query (str, required): Search expression.
|
||||||
- folder (str, optional): Folder scope or All. Default: All.
|
- folder (str, optional): Folder scope or All. Default: All.
|
||||||
- limit (int, optional): Maximum items to return. Default: 20.
|
- limit (int, optional): Maximum items to return. Must be > 0. Default: 1000.
|
||||||
- expectedDocumentFormats (list, optional): Output format preferences.
|
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
connectionReference = parameters.get("connectionReference")
|
connectionReference = parameters.get("connectionReference")
|
||||||
query = parameters.get("query")
|
query = parameters.get("query")
|
||||||
folder = parameters.get("folder", "All")
|
folder = parameters.get("folder", "All")
|
||||||
limit = parameters.get("limit", 20)
|
limit = parameters.get("limit", 1000)
|
||||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
outputMimeType = parameters.get("outputMimeType", "application/json")
|
||||||
|
|
||||||
# Validate parameters
|
# Validate parameters
|
||||||
if not connectionReference:
|
if not connectionReference:
|
||||||
return ActionResult.isFailure(error="Connection reference is required")
|
return ActionResult.isFailure(error="Connection reference is required")
|
||||||
|
|
||||||
|
# Validate limit parameter
|
||||||
|
if limit <= 0:
|
||||||
|
limit = 1000
|
||||||
|
logger.warning(f"Invalid limit value ({limit}), using default value 1000")
|
||||||
|
|
||||||
if not query or not query.strip():
|
if not query or not query.strip():
|
||||||
return ActionResult.isFailure(error="Search query is required and cannot be empty")
|
return ActionResult.isFailure(error="Search query is required and cannot be empty")
|
||||||
|
|
||||||
|
|
@@ -488,12 +527,15 @@ class MethodOutlook(MethodBase):
|
||||||
# Validate limit
|
# Validate limit
|
||||||
try:
|
try:
|
||||||
limit = int(limit)
|
limit = int(limit)
|
||||||
if limit <= 0 or limit > 1000: # Microsoft Graph API has limits
|
if limit <= 0:
|
||||||
limit = 20
|
limit = 1000
|
||||||
logger.warning(f"Limit {limit} is out of range, using default value 20")
|
logger.warning(f"Invalid limit value (<=0), using default value 1000")
|
||||||
|
elif limit > 1000: # Microsoft Graph API has limits
|
||||||
|
limit = 1000
|
||||||
|
logger.warning(f"Limit {limit} exceeds maximum (1000), using 1000")
|
||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
limit = 20
|
limit = 1000
|
||||||
logger.warning(f"Invalid limit value, using default value 20")
|
logger.warning(f"Invalid limit value, using default value 1000")
|
||||||
|
|
||||||
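The limit handling above is spread over two checks; as a compact illustration (the helper name is invented and not part of the original code), the same rules can be expressed in one function that also logs the value that was actually rejected:

import logging

logger = logging.getLogger(__name__)

def normalize_limit(raw, default: int = 1000, maximum: int = 1000) -> int:
    try:
        value = int(raw)
    except (ValueError, TypeError):
        logger.warning(f"Invalid limit value ({raw!r}), using default value {default}")
        return default
    if value <= 0:
        logger.warning(f"Invalid limit value ({value}), using default value {default}")
        return default
    if value > maximum:
        logger.warning(f"Limit {value} exceeds maximum ({maximum}), using {maximum}")
        return maximum
    return value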
# Get Microsoft connection
|
# Get Microsoft connection
|
||||||
connection = self._getMicrosoftConnection(connectionReference)
|
connection = self._getMicrosoftConnection(connectionReference)
|
||||||
|
|
@@ -509,9 +551,18 @@ class MethodOutlook(MethodBase):
|
||||||
"Content-Type": "application/json"
|
"Content-Type": "application/json"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Get the folder ID for the specified folder if needed
|
||||||
|
folder_id = None
|
||||||
|
if folder and folder.lower() != "all":
|
||||||
|
folder_id = self._getFolderId(folder, connection)
|
||||||
|
if folder_id:
|
||||||
|
logger.debug(f"Found folder ID for '{folder}': {folder_id}")
|
||||||
|
else:
|
||||||
|
logger.warning(f"Could not find folder ID for '{folder}', using folder name directly")
|
||||||
|
|
||||||
# Build the search API request
|
# Build the search API request
|
||||||
api_url = f"{graph_url}/me/messages"
|
api_url = f"{graph_url}/me/messages"
|
||||||
params = self._buildSearchParameters(query, folder, limit)
|
params = self._buildSearchParameters(query, folder_id or folder, limit)
|
||||||
|
|
||||||
# Log search parameters for debugging
|
# Log search parameters for debugging
|
||||||
logger.debug(f"Search query: '{query}'")
|
logger.debug(f"Search query: '{query}'")
|
||||||
|
|
@@ -605,7 +656,11 @@ class MethodOutlook(MethodBase):
|
||||||
"count": len(emails),
|
"count": len(emails),
|
||||||
"folder": folder,
|
"folder": folder,
|
||||||
"limit": limit,
|
"limit": limit,
|
||||||
"apiResponse": search_data,
|
"apiMetadata": {
|
||||||
|
"@odata.context": search_data.get("@odata.context"),
|
||||||
|
"@odata.count": search_data.get("@odata.count"),
|
||||||
|
"@odata.nextLink": search_data.get("@odata.nextLink")
|
||||||
|
},
|
||||||
"searchParams": params
|
"searchParams": params
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@@ -618,18 +673,15 @@ class MethodOutlook(MethodBase):
|
||||||
logger.error(f"Error searching emails via Microsoft Graph API: {str(e)}")
|
logger.error(f"Error searching emails via Microsoft Graph API: {str(e)}")
|
||||||
return ActionResult.isFailure(error=f"Failed to search emails: {str(e)}")
|
return ActionResult.isFailure(error=f"Failed to search emails: {str(e)}")
|
||||||
|
|
||||||
# Determine output format based on expected formats
|
# Determine output format based on MIME type
|
||||||
output_extension = ".json" # Default
|
mime_type_mapping = {
|
||||||
output_mime_type = "application/json" # Default
|
"application/json": ".json",
|
||||||
|
"text/plain": ".txt",
|
||||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
"text/csv": ".csv"
|
||||||
# Use the first expected format
|
}
|
||||||
expected_format = expectedDocumentFormats[0]
|
output_extension = mime_type_mapping.get(outputMimeType, ".json")
|
||||||
output_extension = expected_format.get("extension", ".json")
|
output_mime_type = outputMimeType
|
||||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
|
||||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
|
||||||
else:
|
|
||||||
logger.info("No expected format specified, using default .json format")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@@ -664,20 +716,20 @@ class MethodOutlook(MethodBase):
|
||||||
"""
|
"""
|
||||||
GENERAL:
|
GENERAL:
|
||||||
- Purpose: List draft emails from a folder.
|
- Purpose: List draft emails from a folder.
|
||||||
- Input requirements: connectionReference (required); optional folder, limit, expectedDocumentFormats.
|
- Input requirements: connectionReference (required); optional folder, limit, outputMimeType.
|
||||||
- Output format: JSON with draft items and metadata.
|
- Output format: JSON with draft items and metadata.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
- connectionReference (str, required): Microsoft connection label.
|
- connectionReference (str, required): Microsoft connection label.
|
||||||
- folder (str, optional): Drafts folder to list. Default: Drafts.
|
- folder (str, optional): Drafts folder to list. Default: Drafts.
|
||||||
- limit (int, optional): Maximum items to return. Default: 20.
|
- limit (int, optional): Maximum items to return. Must be > 0. Default: 1000.
|
||||||
- expectedDocumentFormats (list, optional): Output format preferences.
|
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
connectionReference = parameters.get("connectionReference")
|
connectionReference = parameters.get("connectionReference")
|
||||||
folder = parameters.get("folder", "Drafts")
|
folder = parameters.get("folder", "Drafts")
|
||||||
limit = parameters.get("limit", 20)
|
limit = parameters.get("limit", 1000)
|
||||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
outputMimeType = parameters.get("outputMimeType", "application/json")
|
||||||
|
|
||||||
if not connectionReference:
|
if not connectionReference:
|
||||||
return ActionResult.isFailure(error="Connection reference is required")
|
return ActionResult.isFailure(error="Connection reference is required")
|
||||||
|
|
@@ -745,18 +797,15 @@ class MethodOutlook(MethodBase):
|
||||||
logger.error(f"Error listing drafts via Microsoft Graph API: {str(e)}")
|
logger.error(f"Error listing drafts via Microsoft Graph API: {str(e)}")
|
||||||
return ActionResult.isFailure(error=f"Failed to list drafts: {str(e)}")
|
return ActionResult.isFailure(error=f"Failed to list drafts: {str(e)}")
|
||||||
|
|
||||||
# Determine output format based on expected formats
|
# Determine output format based on MIME type
|
||||||
output_extension = ".json" # Default
|
mime_type_mapping = {
|
||||||
output_mime_type = "application/json" # Default
|
"application/json": ".json",
|
||||||
|
"text/plain": ".txt",
|
||||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
"text/csv": ".csv"
|
||||||
# Use the first expected format
|
}
|
||||||
expected_format = expectedDocumentFormats[0]
|
output_extension = mime_type_mapping.get(outputMimeType, ".json")
|
||||||
output_extension = expected_format.get("extension", ".json")
|
output_mime_type = outputMimeType
|
||||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
|
||||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
|
||||||
else:
|
|
||||||
logger.info("No expected format specified, using default .json format")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@@ -790,18 +839,18 @@ class MethodOutlook(MethodBase):
|
||||||
"""
|
"""
|
||||||
GENERAL:
|
GENERAL:
|
||||||
- Purpose: Find draft emails across folders.
|
- Purpose: Find draft emails across folders.
|
||||||
- Input requirements: connectionReference (required); optional limit, expectedDocumentFormats.
|
- Input requirements: connectionReference (required); optional limit, outputMimeType.
|
||||||
- Output format: JSON with drafts and metadata.
|
- Output format: JSON with drafts and metadata.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
- connectionReference (str, required): Microsoft connection label.
|
- connectionReference (str, required): Microsoft connection label.
|
||||||
- limit (int, optional): Maximum items to return. Default: 50.
|
- limit (int, optional): Maximum items to return. Default: 50.
|
||||||
- expectedDocumentFormats (list, optional): Output format preferences.
|
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
connectionReference = parameters.get("connectionReference")
|
connectionReference = parameters.get("connectionReference")
|
||||||
limit = parameters.get("limit", 50)
|
limit = parameters.get("limit", 50)
|
||||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
outputMimeType = parameters.get("outputMimeType", "application/json")
|
||||||
|
|
||||||
if not connectionReference:
|
if not connectionReference:
|
||||||
return ActionResult.isFailure(error="Connection reference is required")
|
return ActionResult.isFailure(error="Connection reference is required")
|
||||||
|
|
@@ -859,18 +908,15 @@ class MethodOutlook(MethodBase):
|
||||||
logger.error(f"Error finding drafts via Microsoft Graph API: {str(e)}")
|
logger.error(f"Error finding drafts via Microsoft Graph API: {str(e)}")
|
||||||
return ActionResult.isFailure(error=f"Failed to find drafts: {str(e)}")
|
return ActionResult.isFailure(error=f"Failed to find drafts: {str(e)}")
|
||||||
|
|
||||||
# Determine output format based on expected formats
|
# Determine output format based on MIME type
|
||||||
output_extension = ".json" # Default
|
mime_type_mapping = {
|
||||||
output_mime_type = "application/json" # Default
|
"application/json": ".json",
|
||||||
|
"text/plain": ".txt",
|
||||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
"text/csv": ".csv"
|
||||||
# Use the first expected format
|
}
|
||||||
expected_format = expectedDocumentFormats[0]
|
output_extension = mime_type_mapping.get(outputMimeType, ".json")
|
||||||
output_extension = expected_format.get("extension", ".json")
|
output_mime_type = outputMimeType
|
||||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
|
||||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
|
||||||
else:
|
|
||||||
logger.info("No expected format specified, using default .json format")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@@ -930,18 +976,18 @@ class MethodOutlook(MethodBase):
|
||||||
"""
|
"""
|
||||||
GENERAL:
|
GENERAL:
|
||||||
- Purpose: Check contents of the Drafts folder.
|
- Purpose: Check contents of the Drafts folder.
|
||||||
- Input requirements: connectionReference (required); optional limit, expectedDocumentFormats.
|
- Input requirements: connectionReference (required); optional limit, outputMimeType.
|
||||||
- Output format: JSON with drafts and metadata.
|
- Output format: JSON with drafts and metadata.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
- connectionReference (str, required): Microsoft connection label.
|
- connectionReference (str, required): Microsoft connection label.
|
||||||
- limit (int, optional): Maximum items to return. Default: 20.
|
- limit (int, optional): Maximum items to return. Default: 20.
|
||||||
- expectedDocumentFormats (list, optional): Output format preferences.
|
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
connectionReference = parameters.get("connectionReference")
|
connectionReference = parameters.get("connectionReference")
|
||||||
limit = parameters.get("limit", 20)
|
limit = parameters.get("limit", 20)
|
||||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
outputMimeType = parameters.get("outputMimeType", "application/json")
|
||||||
|
|
||||||
if not connectionReference:
|
if not connectionReference:
|
||||||
return ActionResult.isFailure(error="Connection reference is required")
|
return ActionResult.isFailure(error="Connection reference is required")
|
||||||
|
|
@@ -1003,18 +1049,15 @@ class MethodOutlook(MethodBase):
|
||||||
logger.error(f"Error checking Drafts folder via Microsoft Graph API: {str(e)}")
|
logger.error(f"Error checking Drafts folder via Microsoft Graph API: {str(e)}")
|
||||||
return ActionResult.isFailure(error=f"Failed to check Drafts folder: {str(e)}")
|
return ActionResult.isFailure(error=f"Failed to check Drafts folder: {str(e)}")
|
||||||
|
|
||||||
# Determine output format based on expected formats
|
# Determine output format based on MIME type
|
||||||
output_extension = ".json" # Default
|
mime_type_mapping = {
|
||||||
output_mime_type = "application/json" # Default
|
"application/json": ".json",
|
||||||
|
"text/plain": ".txt",
|
||||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
"text/csv": ".csv"
|
||||||
# Use the first expected format
|
}
|
||||||
expected_format = expectedDocumentFormats[0]
|
output_extension = mime_type_mapping.get(outputMimeType, ".json")
|
||||||
output_extension = expected_format.get("extension", ".json")
|
output_mime_type = outputMimeType
|
||||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
|
||||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
|
||||||
else:
|
|
||||||
logger.info("No expected format specified, using default .json format")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -931,7 +931,8 @@ class MethodSharepoint(MethodBase):
|
||||||
return ActionResult.isFailure(error="pathQuery must start with '/' and include site name with syntax /site:<Site Display Name>/... e.g. /site:KM LayerFinance/Documents/Work")
|
return ActionResult.isFailure(error="pathQuery must start with '/' and include site name with syntax /site:<Site Display Name>/... e.g. /site:KM LayerFinance/Documents/Work")
|
||||||
|
|
||||||
# Check if pathQuery contains search terms (words without proper path structure)
|
# Check if pathQuery contains search terms (words without proper path structure)
|
||||||
if not pathQuery.startswith('/site:') and not pathQuery.startswith('/Documents') and not pathQuery.startswith('/Shared Documents'):
|
valid_path_prefixes = ['/site:', '/Documents', '/documents', '/Shared Documents', '/shared documents']
|
||||||
|
if not any(pathQuery.startswith(prefix) for prefix in valid_path_prefixes):
|
||||||
return ActionResult.isFailure(error=f"Invalid pathQuery '{pathQuery}'. This appears to be search terms, not a valid SharePoint path. Use findDocumentPath action first to search for folders, then use the returned folder path as pathQuery.")
|
return ActionResult.isFailure(error=f"Invalid pathQuery '{pathQuery}'. This appears to be search terms, not a valid SharePoint path. Use findDocumentPath action first to search for folders, then use the returned folder path as pathQuery.")
|
||||||
|
|
||||||
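To make the accepted shapes concrete, a small usage sketch of the prefix check introduced above (the example paths reuse the syntax from the error message; the helper name is illustrative only):

valid_path_prefixes = ['/site:', '/Documents', '/documents', '/Shared Documents', '/shared documents']

def is_valid_path_query(path_query: str) -> bool:
    return any(path_query.startswith(prefix) for prefix in valid_path_prefixes)

assert is_valid_path_query('/site:KM LayerFinance/Documents/Work')   # proper SharePoint path
assert not is_valid_path_query('quarterly report 2024')              # search terms -> use findDocumentPath first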
# For pathQuery, we need to discover sites to find the specific one
|
# For pathQuery, we need to discover sites to find the specific one
|
||||||
|
|
@@ -1627,7 +1628,8 @@ class MethodSharepoint(MethodBase):
|
||||||
return ActionResult.isFailure(error="pathQuery must start with '/' and include site name with syntax /site:<Site Display Name>/... e.g. /site:KM LayerFinance/Documents/Work")
|
return ActionResult.isFailure(error="pathQuery must start with '/' and include site name with syntax /site:<Site Display Name>/... e.g. /site:KM LayerFinance/Documents/Work")
|
||||||
|
|
||||||
# Check if pathQuery contains search terms (words without proper path structure)
|
# Check if pathQuery contains search terms (words without proper path structure)
|
||||||
if not pathQuery.startswith('/site:') and not pathQuery.startswith('/Documents') and not pathQuery.startswith('/Shared Documents'):
|
valid_path_prefixes = ['/site:', '/Documents', '/documents', '/Shared Documents', '/shared documents']
|
||||||
|
if not any(pathQuery.startswith(prefix) for prefix in valid_path_prefixes):
|
||||||
return ActionResult.isFailure(error=f"Invalid pathQuery '{pathQuery}'. This appears to be search terms, not a valid SharePoint path. Use findDocumentPath action first to search for folders, then use the returned folder path as pathQuery.")
|
return ActionResult.isFailure(error=f"Invalid pathQuery '{pathQuery}'. This appears to be search terms, not a valid SharePoint path. Use findDocumentPath action first to search for folders, then use the returned folder path as pathQuery.")
|
||||||
|
|
||||||
# For pathQuery, we need to discover sites to find the specific one
|
# For pathQuery, we need to discover sites to find the specific one
|
||||||
|
|
|
||||||
|
|
@@ -1,9 +1,9 @@
|
||||||
# adaptive module for React mode
|
# adaptive module for React mode
|
||||||
# Provides adaptive learning capabilities
|
# Provides adaptive learning capabilities
|
||||||
|
|
||||||
from .intentAnalyzer import IntentAnalyzer, DataType, ExpectedFormat
|
from .intentAnalyzer import IntentAnalyzer
|
||||||
from .contentValidator import ContentValidator
|
from .contentValidator import ContentValidator
|
||||||
from .learningEngine import LearningEngine
|
from .learningEngine import LearningEngine
|
||||||
from .progressTracker import ProgressTracker
|
from .progressTracker import ProgressTracker
|
||||||
|
|
||||||
__all__ = ['IntentAnalyzer', 'ContentValidator', 'LearningEngine', 'ProgressTracker', 'DataType', 'ExpectedFormat']
|
__all__ = ['IntentAnalyzer', 'ContentValidator', 'LearningEngine', 'ProgressTracker']
|
||||||
|
|
|
||||||
|
|
@@ -1,8 +1,9 @@
|
||||||
# contentValidator.py
|
# contentValidator.py
|
||||||
# Content validation for adaptive React mode
|
# Content validation for adaptive React mode
|
||||||
|
|
||||||
import re
|
|
||||||
import logging
|
import logging
|
||||||
|
import json
|
||||||
|
import re
|
||||||
from typing import List, Dict, Any
|
from typing import List, Dict, Any
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@@ -10,34 +11,14 @@ logger = logging.getLogger(__name__)
|
||||||
class ContentValidator:
|
class ContentValidator:
|
||||||
"""Validates delivered content against user intent"""
|
"""Validates delivered content against user intent"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, services=None):
|
||||||
pass
|
self.services = services
|
||||||
|
|
||||||
def validateContent(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
|
async def validateContent(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
"""Validates delivered content against user intent"""
|
"""Validates delivered content against user intent using AI"""
|
||||||
try:
|
try:
|
||||||
validationDetails = []
|
# Use AI for comprehensive validation
|
||||||
|
return await self._validateWithAI(documents, intent)
|
||||||
for doc in documents:
|
|
||||||
content = self._extractContent(doc)
|
|
||||||
detail = self._validateSingleDocument(content, doc, intent)
|
|
||||||
validationDetails.append(detail)
|
|
||||||
|
|
||||||
# Calculate overall success
|
|
||||||
overallSuccess = all(detail.get("successCriteriaMet", [False]) for detail in validationDetails)
|
|
||||||
|
|
||||||
# Calculate quality score
|
|
||||||
qualityScore = self._calculateQualityScore(validationDetails)
|
|
||||||
|
|
||||||
# Generate improvement suggestions
|
|
||||||
improvementSuggestions = self._generateImprovementSuggestions(validationDetails, intent)
|
|
||||||
|
|
||||||
return {
|
|
||||||
"overallSuccess": overallSuccess,
|
|
||||||
"qualityScore": qualityScore,
|
|
||||||
"validationDetails": validationDetails,
|
|
||||||
"improvementSuggestions": improvementSuggestions
|
|
||||||
}
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error validating content: {str(e)}")
|
logger.error(f"Error validating content: {str(e)}")
|
||||||
|
|
@@ -56,253 +37,236 @@ class ContentValidator:
|
||||||
except Exception:
|
except Exception:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
def _validateSingleDocument(self, content: str, doc: Any, intent: Dict[str, Any]) -> Dict[str, Any]:
|
|
||||||
"""Validates a single document against intent"""
|
|
||||||
# Check data type match
|
|
||||||
dataTypeMatch = self._checkDataTypeMatch(content, intent.get("dataType", "unknown"))
|
|
||||||
|
|
||||||
# Check format match
|
|
||||||
formatMatch = self._checkFormatMatch(content, intent.get("expectedFormat", "unknown"))
|
|
||||||
|
|
||||||
# Calculate quality score
|
|
||||||
qualityScore = self._calculateDocumentQualityScore(content, intent)
|
|
||||||
|
|
||||||
# Check success criteria
|
|
||||||
successCriteriaMet = self._checkSuccessCriteria(content, intent)
|
|
||||||
|
|
||||||
# Identify specific issues
|
|
||||||
specificIssues = self._identifySpecificIssues(content, intent)
|
|
||||||
|
|
||||||
# Generate improvement suggestions
|
|
||||||
improvementSuggestions = self._generateDocumentImprovementSuggestions(content, intent)
|
|
||||||
|
|
||||||
return {
|
|
||||||
"documentName": getattr(doc, 'documentName', 'Unknown'),
|
|
||||||
"dataTypeMatch": dataTypeMatch,
|
|
||||||
"formatMatch": formatMatch,
|
|
||||||
"qualityScore": qualityScore,
|
|
||||||
"successCriteriaMet": successCriteriaMet,
|
|
||||||
"specificIssues": specificIssues,
|
|
||||||
"improvementSuggestions": improvementSuggestions
|
|
||||||
}
|
|
||||||
|
|
||||||
def _checkDataTypeMatch(self, content: str, dataType: str) -> bool:
|
|
||||||
"""Checks if content matches the expected data type"""
|
|
||||||
if dataType == "numbers":
|
|
||||||
return self._containsNumbers(content)
|
|
||||||
elif dataType == "text":
|
|
||||||
return self._containsText(content)
|
|
||||||
elif dataType == "documents":
|
|
||||||
return self._containsDocumentContent(content)
|
|
||||||
elif dataType == "analysis":
|
|
||||||
return self._containsAnalysis(content)
|
|
||||||
elif dataType == "code":
|
|
||||||
return self._containsCode(content)
|
|
||||||
else:
|
|
||||||
return True # Unknown type, assume match
|
|
||||||
|
|
||||||
def _containsNumbers(self, content: str) -> bool:
|
|
||||||
"""Checks if content contains actual numbers (not code)"""
|
|
||||||
# Look for actual numbers in the content
|
|
||||||
numbers = re.findall(r'\b\d+\b', content)
|
|
||||||
|
|
||||||
# Check if it's code (contains function definitions, etc.)
|
|
||||||
isCode = any(keyword in content.lower() for keyword in [
|
|
||||||
'def ', 'function', 'import ', 'class ', 'for ', 'while ', 'if ',
|
|
||||||
'return', 'print(', 'console.log', 'public ', 'private '
|
|
||||||
])
|
|
||||||
|
|
||||||
# If it's code, it doesn't contain actual numbers
|
|
||||||
if isCode:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# If it has numbers and it's not code, it contains actual numbers
|
|
||||||
return len(numbers) > 0
|
|
||||||
|
|
||||||
def _containsText(self, content: str) -> bool:
|
|
||||||
"""Checks if content contains readable text"""
|
|
||||||
# Remove numbers and special characters
|
|
||||||
textContent = re.sub(r'[^\w\s]', '', content)
|
|
||||||
words = textContent.split()
|
|
||||||
|
|
||||||
# Check if there are enough words to be considered text
|
|
||||||
return len(words) > 5
|
|
||||||
|
|
||||||
def _containsDocumentContent(self, content: str) -> bool:
|
|
||||||
"""Checks if content is suitable for document creation"""
|
|
||||||
# Check for structured content
|
|
||||||
hasStructure = any(indicator in content for indicator in [
|
|
||||||
'\n', '\t', '|', '-', '*', '1.', '2.', '•', '◦'
|
|
||||||
])
|
|
||||||
|
|
||||||
# Check for meaningful content
|
|
||||||
hasMeaningfulContent = len(content.strip()) > 50
|
|
||||||
|
|
||||||
return hasStructure and hasMeaningfulContent
|
|
||||||
|
|
||||||
def _containsAnalysis(self, content: str) -> bool:
|
|
||||||
"""Checks if content contains analysis"""
|
|
||||||
analysisIndicators = [
|
|
||||||
'analysis', 'findings', 'conclusion', 'summary', 'insights',
|
|
||||||
'trends', 'patterns', 'comparison', 'evaluation', 'assessment'
|
|
||||||
]
|
|
||||||
|
|
||||||
contentLower = content.lower()
|
|
||||||
return any(indicator in contentLower for indicator in analysisIndicators)
|
|
||||||
|
|
||||||
def _containsCode(self, content: str) -> bool:
|
|
||||||
"""Checks if content contains code"""
|
|
||||||
codeIndicators = [
|
|
||||||
'def ', 'function', 'import ', 'class ', 'for ', 'while ', 'if ',
|
|
||||||
'return', 'print(', 'console.log', 'public ', 'private ', 'void ',
|
|
||||||
'int ', 'string ', 'var ', 'let ', 'const '
|
|
||||||
]
|
|
||||||
|
|
||||||
contentLower = content.lower()
|
|
||||||
return any(indicator in contentLower for indicator in codeIndicators)
|
|
||||||
|
|
||||||
def _checkFormatMatch(self, content: str, expectedFormat: str) -> bool:
|
|
||||||
"""Checks if content matches expected format"""
|
|
||||||
if expectedFormat == "raw_data":
|
|
||||||
# Raw data should be simple, not heavily formatted
|
|
||||||
return not any(indicator in content for indicator in [
|
|
||||||
'<html>', '<div>', '<table>', '## ', '### ', '**', '__'
|
|
||||||
])
|
|
||||||
elif expectedFormat == "formatted":
|
|
||||||
# Formatted content should have structure
|
|
||||||
return any(indicator in content for indicator in [
|
|
||||||
'\n', '\t', '|', '-', '*', '1.', '2.', '•'
|
|
||||||
])
|
|
||||||
elif expectedFormat == "structured":
|
|
||||||
# Structured content should have clear organization
|
|
||||||
return any(indicator in content for indicator in [
|
|
||||||
'{', '}', '[', ']', '|', '\t', ' '
|
|
||||||
])
|
|
||||||
else:
|
|
||||||
return True # Unknown format, assume match
|
|
||||||
|
|
||||||
def _checkSuccessCriteria(self, content: str, intent: Dict[str, Any]) -> List[bool]:
|
|
||||||
"""Checks if content meets success criteria"""
|
|
||||||
criteriaMet = []
|
|
||||||
successCriteria = intent.get("successCriteria", [])
|
|
||||||
|
|
||||||
for criterion in successCriteria:
|
|
||||||
if 'prime numbers' in criterion.lower():
|
|
||||||
# Check if content contains actual prime numbers, not code
|
|
||||||
hasNumbers = bool(re.search(r'\b\d+\b', content))
|
|
||||||
isNotCode = not any(keyword in content.lower() for keyword in [
|
|
||||||
'def ', 'function', 'import ', 'class '
|
|
||||||
])
|
|
||||||
criteriaMet.append(hasNumbers and isNotCode)
|
|
||||||
elif 'document' in criterion.lower():
|
|
||||||
# Check if content is suitable for document creation
|
|
||||||
hasStructure = any(indicator in content for indicator in [
|
|
||||||
'\n', '\t', '|', '-', '*', '1.', '2.'
|
|
||||||
])
|
|
||||||
criteriaMet.append(hasStructure)
|
|
||||||
elif 'format' in criterion.lower():
|
|
||||||
# Check if content is properly formatted
|
|
||||||
hasFormatting = any(indicator in content for indicator in [
|
|
||||||
'\n', '\t', '|', '-', '*', '1.', '2.', '•'
|
|
||||||
])
|
|
||||||
criteriaMet.append(hasFormatting)
|
|
||||||
else:
|
|
||||||
# Generic check - content should not be empty
|
|
||||||
criteriaMet.append(len(content.strip()) > 0)
|
|
||||||
|
|
||||||
return criteriaMet
|
|
||||||
|
|
||||||
def _calculateDocumentQualityScore(self, content: str, intent: Dict[str, Any]) -> float:
|
|
||||||
"""Calculates quality score for a single document"""
|
|
||||||
score = 0.0
|
|
||||||
|
|
||||||
# Base score for having content
|
|
||||||
if len(content.strip()) > 0:
|
|
||||||
score += 0.2
|
|
||||||
|
|
||||||
# Score for data type match
|
|
||||||
if self._checkDataTypeMatch(content, intent.get("dataType", "unknown")):
|
|
||||||
score += 0.3
|
|
||||||
|
|
||||||
# Score for format match
|
|
||||||
if self._checkFormatMatch(content, intent.get("expectedFormat", "unknown")):
|
|
||||||
score += 0.2
|
|
||||||
|
|
||||||
# Score for success criteria
|
|
||||||
successCriteriaMet = self._checkSuccessCriteria(content, intent)
|
|
||||||
if successCriteriaMet:
|
|
||||||
successRate = sum(successCriteriaMet) / len(successCriteriaMet)
|
|
||||||
score += 0.3 * successRate
|
|
||||||
|
|
||||||
return min(score, 1.0)
|
|
||||||
|
|
||||||
def _calculateQualityScore(self, validationDetails: List[Dict[str, Any]]) -> float:
|
|
||||||
"""Calculates overall quality score from validation details"""
|
|
||||||
if not validationDetails:
|
|
||||||
return 0.0
|
|
||||||
|
|
||||||
totalScore = sum(detail.get("qualityScore", 0) for detail in validationDetails)
|
|
||||||
return totalScore / len(validationDetails)
|
|
||||||
|
|
||||||
def _identifySpecificIssues(self, content: str, intent: Dict[str, Any]) -> List[str]:
|
|
||||||
"""Identifies specific issues with the content"""
|
|
||||||
issues = []
|
|
||||||
|
|
||||||
# Check for common issues
|
|
||||||
if intent.get("dataType") == "numbers" and self._containsCode(content):
|
|
||||||
issues.append("Content contains code instead of actual numbers")
|
|
||||||
|
|
||||||
if intent.get("expectedFormat") == "raw_data" and any(indicator in content for indicator in ['<html>', '## ', '**']):
|
|
||||||
issues.append("Content is formatted when raw data was requested")
|
|
||||||
|
|
||||||
if len(content.strip()) == 0:
|
|
||||||
issues.append("Content is empty")
|
|
||||||
|
|
||||||
return issues
|
|
||||||
|
|
||||||
def _generateDocumentImprovementSuggestions(self, content: str, intent: Dict[str, Any]) -> List[str]:
|
|
||||||
"""Generates improvement suggestions for a single document"""
|
|
||||||
suggestions = []
|
|
||||||
|
|
||||||
dataType = intent.get("dataType", "unknown")
|
|
||||||
expectedFormat = intent.get("expectedFormat", "unknown")
|
|
||||||
|
|
||||||
if dataType == "numbers" and self._containsCode(content):
|
|
||||||
suggestions.append("Deliver actual numbers, not code to generate them")
|
|
||||||
|
|
||||||
if expectedFormat == "raw_data" and any(indicator in content for indicator in ['<html>', '## ']):
|
|
||||||
suggestions.append("Provide raw data without formatting")
|
|
||||||
|
|
||||||
if len(content.strip()) == 0:
|
|
||||||
suggestions.append("Provide actual content")
|
|
||||||
|
|
||||||
return suggestions
|
|
||||||
|
|
||||||
def _generateImprovementSuggestions(self, validationDetails: List[Dict[str, Any]],
|
|
||||||
intent: Dict[str, Any]) -> List[str]:
|
|
||||||
"""Generates improvement suggestions based on validation results"""
|
|
||||||
suggestions = []
|
|
||||||
|
|
||||||
# Check for common issues
|
|
||||||
if not any(detail.get("dataTypeMatch", False) for detail in validationDetails):
|
|
||||||
dataType = intent.get("dataType", "unknown")
|
|
||||||
suggestions.append(f"Content should contain {dataType} data, not code or other formats")
|
|
||||||
|
|
||||||
if not any(detail.get("formatMatch", False) for detail in validationDetails):
|
|
||||||
expectedFormat = intent.get("expectedFormat", "unknown")
|
|
||||||
suggestions.append(f"Content should be in {expectedFormat} format")
|
|
||||||
|
|
||||||
# Add specific suggestions from validation details
|
|
||||||
for detail in validationDetails:
|
|
||||||
suggestions.extend(detail.get("improvementSuggestions", []))
|
|
||||||
|
|
||||||
return list(set(suggestions)) # Remove duplicates
|
|
||||||
|
|
||||||
def _createFailedValidationResult(self, error: str) -> Dict[str, Any]:
|
def _createFailedValidationResult(self, error: str) -> Dict[str, Any]:
|
||||||
"""Creates a failed validation result"""
|
"""Creates a failed validation result"""
|
||||||
return {
|
return {
|
||||||
"overallSuccess": False,
|
"overallSuccess": False,
|
||||||
"qualityScore": 0.0,
|
"qualityScore": 0.0,
|
||||||
"validationDetails": [],
|
"validationDetails": [],
|
||||||
"improvementSuggestions": [f"Validation failed: {error}"]
|
"improvementSuggestions": [f"NEXT STEP: Fix validation error - {error}. Check system logs for more details and retry the operation."]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _isValidJsonResponse(self, response: str) -> bool:
|
||||||
|
"""Checks if response contains valid JSON structure"""
|
||||||
|
try:
|
||||||
|
import re
|
||||||
|
# Look for JSON with expected structure
|
||||||
|
json_match = re.search(r'\{[^{}]*"overallSuccess"[^{}]*\}', response, re.DOTALL)
|
||||||
|
if json_match:
|
||||||
|
json.loads(json_match.group(0))
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
except:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _extractFallbackValidationResult(self, response: str) -> Dict[str, Any]:
|
||||||
|
"""Extracts validation result from malformed AI response"""
|
||||||
|
try:
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Extract key values using regex patterns
|
||||||
|
overall_success = re.search(r'"overallSuccess"\s*:\s*(true|false)', response, re.IGNORECASE)
|
||||||
|
quality_score = re.search(r'"qualityScore"\s*:\s*([0-9.]+)', response)
|
||||||
|
gap_analysis = re.search(r'"gapAnalysis"\s*:\s*"([^"]*)"', response)
|
||||||
|
|
||||||
|
# Determine overall success from context if not found
|
||||||
|
if not overall_success:
|
||||||
|
# Look for positive/negative indicators in the text
|
||||||
|
if any(word in response.lower() for word in ['success', 'complete', 'fulfilled', 'satisfied']):
|
||||||
|
overall_success = True
|
||||||
|
elif any(word in response.lower() for word in ['failed', 'incomplete', 'missing', 'error']):
|
||||||
|
overall_success = False
|
||||||
|
else:
|
||||||
|
overall_success = False
|
||||||
|
|
||||||
|
return {
|
||||||
|
"overallSuccess": overall_success if isinstance(overall_success, bool) else (overall_success.group(1).lower() == 'true' if overall_success else False),
|
||||||
|
"qualityScore": float(quality_score.group(1)) if quality_score else 0.5,
|
||||||
|
"validationDetails": [{
|
||||||
|
"documentName": "AI Validation (Fallback)",
|
||||||
|
"gapAnalysis": gap_analysis.group(1) if gap_analysis else "Unable to parse detailed analysis",
|
||||||
|
"successCriteriaMet": [False] # Conservative fallback
|
||||||
|
}],
|
||||||
|
"improvementSuggestions": ["NEXT STEP: AI response was malformed - retry the operation for better results"]
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Fallback extraction failed: {str(e)}")
|
||||||
|
return None
|
||||||
|
|
||||||
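For reference, a tiny self-contained example (the sample response is invented) of the structural-JSON extraction strategy that _isValidJsonResponse above and _validateWithAI below rely on:

import json
import re

sample_response = 'Validation summary: {"overallSuccess": true, "qualityScore": 0.42} end of analysis.'

match = re.search(r'\{[^{}]*"overallSuccess"[^{}]*\}', sample_response, re.DOTALL)
if match:
    print(json.loads(match.group(0)))  # {'overallSuccess': True, 'qualityScore': 0.42}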
|
async def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""AI-based comprehensive validation - single main function"""
|
||||||
|
try:
|
||||||
|
if not hasattr(self, 'services') or not self.services or not hasattr(self.services, 'ai'):
|
||||||
|
return self._createFailedValidationResult("AI service not available")
|
||||||
|
|
||||||
|
# Extract content from all documents
|
||||||
|
documentContents = []
|
||||||
|
for doc in documents:
|
||||||
|
content = self._extractContent(doc)
|
||||||
|
documentContents.append({
|
||||||
|
"name": getattr(doc, 'documentName', 'Unknown'),
|
||||||
|
"content": content[:2000] # Limit content for AI processing
|
||||||
|
})
|
||||||
|
|
||||||
|
# Create comprehensive AI validation prompt
|
||||||
|
validationPrompt = f"""
|
||||||
|
You are a comprehensive task completion validator. Analyze if the delivered content fulfills the user's request.
|
||||||
|
|
||||||
|
USER REQUEST: {intent.get('primaryGoal', 'Unknown')}
|
||||||
|
EXPECTED DATA TYPE: {intent.get('dataType', 'unknown')}
|
||||||
|
EXPECTED FORMAT: {intent.get('expectedFormat', 'unknown')}
|
||||||
|
SUCCESS CRITERIA: {intent.get('successCriteria', [])}
|
||||||
|
|
||||||
|
DELIVERED CONTENT:
|
||||||
|
{json.dumps(documentContents, indent=2)}
|
||||||
|
|
||||||
|
Perform comprehensive validation:
|
||||||
|
1. Check if content matches expected data type
|
||||||
|
2. Check if content matches expected format
|
||||||
|
3. Verify success criteria are met
|
||||||
|
4. Assess overall quality and completeness
|
||||||
|
5. Identify specific gaps and issues
|
||||||
|
6. Provide actionable next steps
|
||||||
|
|
||||||
|
CRITICAL: You MUST respond with ONLY the JSON object below. NO TEXT ANALYSIS. NO EXPLANATIONS. NO OTHER CONTENT.
|
||||||
|
|
||||||
|
RESPOND WITH THIS EXACT JSON FORMAT:
|
||||||
|
|
||||||
|
{{
|
||||||
|
"overallSuccess": false,
|
||||||
|
"qualityScore": 0.5,
|
||||||
|
"dataTypeMatch": false,
|
||||||
|
"formatMatch": false,
|
||||||
|
"successCriteriaMet": [false, false],
|
||||||
|
"gapAnalysis": "Content does not match expected format and lacks required elements",
|
||||||
|
"improvementSuggestions": ["NEXT STEP: Create proper content in expected format", "NEXT STEP: Ensure all success criteria are met"],
|
||||||
|
"validationDetails": [
|
||||||
|
{{
|
||||||
|
"documentName": "Content Validation",
|
||||||
|
"issues": ["Format mismatch", "Missing required elements"],
|
||||||
|
"suggestions": ["NEXT STEP: Fix format", "NEXT STEP: Add missing elements"]
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}}
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Call AI service for validation
|
||||||
|
from modules.datamodels.datamodelAi import AiCallOptions, OperationType
|
||||||
|
request_options = AiCallOptions()
|
||||||
|
request_options.operationType = OperationType.GENERAL
|
||||||
|
|
||||||
|
response = await self.services.ai.callAi(
|
||||||
|
prompt=validationPrompt,
|
||||||
|
documents=None,
|
||||||
|
options=request_options
|
||||||
|
)
|
||||||
|
|
||||||
|
# If first attempt fails, try with more explicit prompt
|
||||||
|
if response and not self._isValidJsonResponse(response):
|
||||||
|
logger.debug("First AI validation attempt failed, retrying with explicit JSON-only prompt")
|
||||||
|
explicitPrompt = f"""
|
||||||
|
VALIDATE AND RETURN JSON ONLY - NO TEXT ANALYSIS
|
||||||
|
|
||||||
|
Request: {intent.get('primaryGoal', 'Unknown')}
|
||||||
|
Data Type: {intent.get('dataType', 'unknown')}
|
||||||
|
Format: {intent.get('expectedFormat', 'unknown')}
|
||||||
|
Criteria: {intent.get('successCriteria', [])}
|
||||||
|
|
||||||
|
Content: {json.dumps(documentContents, indent=2)}
|
||||||
|
|
||||||
|
RESPOND WITH THIS EXACT JSON FORMAT - NO OTHER TEXT:
|
||||||
|
|
||||||
|
{{
|
||||||
|
"overallSuccess": false,
|
||||||
|
"qualityScore": 0.3,
|
||||||
|
"dataTypeMatch": false,
|
||||||
|
"formatMatch": false,
|
||||||
|
"successCriteriaMet": [false, false],
|
||||||
|
"gapAnalysis": "Content does not match expected format and lacks required elements",
|
||||||
|
"improvementSuggestions": ["NEXT STEP: Create proper content in expected format", "NEXT STEP: Ensure all success criteria are met"],
|
||||||
|
"validationDetails": [
|
||||||
|
{{
|
||||||
|
"documentName": "Content Validation",
|
||||||
|
"issues": ["Format mismatch", "Missing required elements"],
|
||||||
|
"suggestions": ["NEXT STEP: Fix format", "NEXT STEP: Add missing elements"]
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}}
|
||||||
|
"""
|
||||||
|
response = await self.services.ai.callAi(
|
||||||
|
prompt=explicitPrompt,
|
||||||
|
documents=None,
|
||||||
|
options=request_options
|
||||||
|
)
|
||||||
|
|
||||||
|
if not response or not response.strip():
|
||||||
|
logger.warning("AI validation returned empty response")
|
||||||
|
return self._createFailedValidationResult("AI validation failed - empty response")
|
||||||
|
|
||||||
|
# Clean and extract JSON from response
|
||||||
|
result = response.strip()
|
||||||
|
logger.debug(f"AI validation response length: {len(result)}")
|
||||||
|
|
||||||
|
# Try to find JSON in the response with multiple strategies
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Strategy 1: Look for JSON in markdown code blocks
|
||||||
|
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', result, re.DOTALL)
|
||||||
|
if json_match:
|
||||||
|
result = json_match.group(1)
|
||||||
|
logger.debug(f"Extracted JSON from markdown code block: {result[:200]}...")
|
||||||
|
else:
|
||||||
|
# Strategy 2: Look for JSON object with proper structure
|
||||||
|
json_match = re.search(r'\{[^{}]*"overallSuccess"[^{}]*\}', result, re.DOTALL)
|
||||||
|
if not json_match:
|
||||||
|
# Strategy 3: Look for any JSON object
|
||||||
|
json_match = re.search(r'\{.*\}', result, re.DOTALL)
|
||||||
|
|
||||||
|
if json_match:
|
||||||
|
result = json_match.group(0)
|
||||||
|
logger.debug(f"Extracted JSON directly: {result[:200]}...")
|
||||||
|
else:
|
||||||
|
logger.debug(f"No JSON found in AI response, trying fallback extraction: {result[:200]}...")
|
||||||
|
logger.debug(f"Full AI response: {result}")
|
||||||
|
|
||||||
|
# Try fallback extraction for text responses
|
||||||
|
fallback_result = self._extractFallbackValidationResult(result)
|
||||||
|
if fallback_result:
|
||||||
|
logger.info("Using fallback text extraction for validation")
|
||||||
|
return fallback_result
|
||||||
|
|
||||||
|
logger.warning("All AI validation attempts failed - no JSON found and fallback extraction failed")
|
||||||
|
return self._createFailedValidationResult("AI validation failed - no JSON in response")
|
||||||
|
|
||||||
|
try:
|
||||||
|
aiResult = json.loads(result)
|
||||||
|
logger.info("AI validation JSON parsed successfully")
|
||||||
|
|
||||||
|
return {
|
||||||
|
"overallSuccess": aiResult.get("overallSuccess", False),
|
||||||
|
"qualityScore": aiResult.get("qualityScore", 0.0),
|
||||||
|
"validationDetails": aiResult.get("validationDetails", [{
|
||||||
|
"documentName": "AI Validation",
|
||||||
|
"gapAnalysis": aiResult.get("gapAnalysis", ""),
|
||||||
|
"successCriteriaMet": aiResult.get("successCriteriaMet", [False])
|
||||||
|
}]),
|
||||||
|
"improvementSuggestions": aiResult.get("improvementSuggestions", [])
|
||||||
|
}
|
||||||
|
|
||||||
|
except json.JSONDecodeError as json_error:
|
||||||
|
logger.warning(f"All AI validation attempts failed - invalid JSON: {str(json_error)}")
|
||||||
|
logger.debug(f"JSON content: {result}")
|
||||||
|
|
||||||
|
# Try to extract key information from malformed response
|
||||||
|
fallbackResult = self._extractFallbackValidationResult(result)
|
||||||
|
if fallbackResult:
|
||||||
|
logger.info("Using fallback validation result from malformed JSON")
|
||||||
|
return fallbackResult
|
||||||
|
|
||||||
|
return self._createFailedValidationResult(f"AI validation failed - invalid JSON: {str(json_error)}")
|
||||||
|
|
||||||
|
return self._createFailedValidationResult("AI validation failed - no response")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"AI validation failed: {str(e)}")
|
||||||
|
return self._createFailedValidationResult(f"AI validation error: {str(e)}")
|
||||||
|
|
@@ -1,228 +1,156 @@
|
||||||
# intentAnalyzer.py
|
# intentAnalyzer.py
|
||||||
# Intent analysis for adaptive React mode
|
# Intent analysis for adaptive React mode - AI-based, language-agnostic
|
||||||
|
|
||||||
import re
|
import json
|
||||||
import logging
|
import logging
|
||||||
from typing import Dict, Any, List
|
from typing import Dict, Any, List
|
||||||
from enum import Enum
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class DataType(Enum):
    NUMBERS = "numbers"
    TEXT = "text"
    DOCUMENTS = "documents"
    ANALYSIS = "analysis"
    CODE = "code"
    UNKNOWN = "unknown"


class ExpectedFormat(Enum):
    RAW_DATA = "raw_data"
    FORMATTED = "formatted"
    STRUCTURED = "structured"
    VISUAL = "visual"
    UNKNOWN = "unknown"

class IntentAnalyzer:
|
class IntentAnalyzer:
|
||||||
"""Analyzes user intent to understand what they actually want"""
|
"""Analyzes user intent using AI - language-agnostic and generic"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, services=None):
|
||||||
self.dataTypePatterns = {
|
self.services = services
|
||||||
DataType.NUMBERS: [
|
|
||||||
r'\b(numbers?|digits?|count|list|sequence)\b',
|
|
||||||
r'\b(prime|fibonacci|random|even|odd)\s+(numbers?)\b',
|
|
||||||
r'\b(calculate|compute|generate)\s+(numbers?)\b',
|
|
||||||
r'\b(first|last)\s+\d+\s+(numbers?)\b'
|
|
||||||
],
|
|
||||||
DataType.TEXT: [
|
|
||||||
r'\b(text|content|words?|sentences?|paragraphs?)\b',
|
|
||||||
r'\b(write|create|generate)\s+(text|content)\b',
|
|
||||||
r'\b(summary|description|explanation)\b',
|
|
||||||
r'\b(article|essay|report)\b'
|
|
||||||
],
|
|
||||||
DataType.DOCUMENTS: [
|
|
||||||
r'\b(document|file|report|pdf|word|excel)\b',
|
|
||||||
r'\b(create|generate|make)\s+(document|file|report)\b',
|
|
||||||
r'\b(format|structure|organize)\s+(document)\b',
|
|
||||||
r'\b(presentation|slides?)\b'
|
|
||||||
],
|
|
||||||
DataType.ANALYSIS: [
|
|
||||||
r'\b(analyze|analysis|examine|study|evaluate)\b',
|
|
||||||
r'\b(insights?|findings?|results?)\b',
|
|
||||||
r'\b(compare|contrast|evaluate)\b',
|
|
||||||
r'\b(trends?|patterns?)\b'
|
|
||||||
],
|
|
||||||
DataType.CODE: [
|
|
||||||
r'\b(code|program|script|algorithm|function)\b',
|
|
||||||
r'\b(write|create|develop)\s+(code|program|script)\b',
|
|
||||||
r'\b(implement|build|construct)\b',
|
|
||||||
r'\b(debug|fix|optimize)\s+(code)\b'
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
self.formatPatterns = {
|
|
||||||
ExpectedFormat.RAW_DATA: [
|
|
||||||
r'\b(raw|plain|simple|basic)\b',
|
|
||||||
r'\b(numbers?|data|list)\b(?!\s+(in|as|with))',
|
|
||||||
r'\b(just|only)\s+(numbers?|data)\b'
|
|
||||||
],
|
|
||||||
ExpectedFormat.FORMATTED: [
|
|
||||||
r'\b(formatted|structured|organized|presented)\b',
|
|
||||||
r'\b(table|chart|graph|visual)\b',
|
|
||||||
r'\b(pretty|nice|clean)\s+(format|presentation)\b',
|
|
||||||
r'\b(professional|polished)\b'
|
|
||||||
],
|
|
||||||
ExpectedFormat.STRUCTURED: [
|
|
||||||
r'\b(json|xml|csv|structured)\b',
|
|
||||||
r'\b(organized|categorized|grouped)\b',
|
|
||||||
r'\b(systematic|methodical)\b',
|
|
||||||
r'\b(database|spreadsheet)\b'
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
def analyzeUserIntent(self, userPrompt: str, context: Any) -> Dict[str, Any]:
|
async def analyzeUserIntent(self, userPrompt: str, context: Any) -> Dict[str, Any]:
|
||||||
"""Analyzes user intent from prompt and context"""
|
"""Analyzes user intent from prompt and context using AI"""
|
||||||
try:
|
try:
|
||||||
# Extract primary goal
|
# Use AI to analyze intent
|
||||||
primaryGoal = self._extractPrimaryGoal(userPrompt)
|
aiAnalysis = await self._analyzeIntentWithAI(userPrompt, context)
|
||||||
|
if aiAnalysis:
|
||||||
|
return aiAnalysis
|
||||||
|
|
||||||
# Classify data type
|
# Fallback to basic analysis if AI fails
|
||||||
dataType = self._classifyDataType(userPrompt)
|
return self._createBasicIntentAnalysis(userPrompt)
|
||||||
|
|
||||||
# Determine expected format
|
|
||||||
expectedFormat = self._determineExpectedFormat(userPrompt)
|
|
||||||
|
|
||||||
# Assess quality requirements
|
|
||||||
qualityRequirements = self._assessQualityRequirements(userPrompt, context)
|
|
||||||
|
|
||||||
# Extract success criteria
|
|
||||||
successCriteria = self._extractSuccessCriteria(userPrompt, context)
|
|
||||||
|
|
||||||
# Calculate confidence score
|
|
||||||
confidenceScore = self._calculateConfidenceScore(dataType, expectedFormat, successCriteria)
|
|
||||||
|
|
||||||
return {
|
|
||||||
"primaryGoal": primaryGoal,
|
|
||||||
"dataType": dataType.value,
|
|
||||||
"expectedFormat": expectedFormat.value,
|
|
||||||
"qualityRequirements": qualityRequirements,
|
|
||||||
"successCriteria": successCriteria,
|
|
||||||
"confidenceScore": confidenceScore
|
|
||||||
}
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error analyzing user intent: {str(e)}")
|
logger.error(f"Error analyzing user intent: {str(e)}")
|
||||||
return self._createDefaultIntentAnalysis(userPrompt)
|
return self._createDefaultIntentAnalysis(userPrompt)
|
||||||
|
|
||||||
def _extractPrimaryGoal(self, userPrompt: str) -> str:
|
async def _analyzeIntentWithAI(self, userPrompt: str, context: Any) -> Dict[str, Any]:
|
||||||
"""Extracts the primary goal from user prompt"""
|
"""Uses AI to analyze user intent - language-agnostic"""
|
||||||
# Simple extraction - can be enhanced
|
try:
|
||||||
return userPrompt.strip()
|
if not self.services or not hasattr(self.services, 'ai'):
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Create AI analysis prompt
|
||||||
|
analysisPrompt = f"""
|
||||||
|
You are an intent analyzer. Analyze the user's request to understand what they want delivered.
|
||||||
|
|
||||||
|
USER REQUEST: {userPrompt}
|
||||||
|
|
||||||
|
CONTEXT: {getattr(context.task_step, 'objective', '') if hasattr(context, 'task_step') and context.task_step else ''}
|
||||||
|
|
||||||
|
Analyze the user's intent and determine:
|
||||||
|
1. What type of data/content they want (numbers, text, documents, analysis, code, etc.)
|
||||||
|
2. What format they expect (raw data, formatted, structured, visual, etc.)
|
||||||
|
3. What quality requirements they have (accuracy, completeness, format)
|
||||||
|
4. What specific success criteria define completion
|
||||||
|
|
||||||
|
CRITICAL: Respond with ONLY the JSON object below. Do not include any explanatory text, analysis, or other content before or after the JSON.
|
||||||
|
|
||||||
|
{{
|
||||||
|
"primaryGoal": "The main objective the user wants to achieve",
|
||||||
|
"dataType": "numbers|text|documents|analysis|code|unknown",
|
||||||
|
"expectedFormat": "raw_data|formatted|structured|visual|unknown",
|
||||||
|
"qualityRequirements": {{
|
||||||
|
"accuracyThreshold": 0.0-1.0,
|
||||||
|
"completenessThreshold": 0.0-1.0,
|
||||||
|
"formatRequirement": "any|formatted|raw|structured"
|
||||||
|
}},
|
||||||
|
"successCriteria": ["specific criterion 1", "specific criterion 2"],
|
||||||
|
"confidenceScore": 0.0-1.0
|
||||||
|
}}
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Call AI service for analysis
|
||||||
|
from modules.datamodels.datamodelAi import AiCallOptions, OperationType
|
||||||
|
request_options = AiCallOptions()
|
||||||
|
request_options.operationType = OperationType.GENERAL
|
||||||
|
|
||||||
|
response = await self.services.ai.callAi(
|
||||||
|
prompt=analysisPrompt,
|
||||||
|
documents=None,
|
||||||
|
options=request_options
|
||||||
|
)
|
||||||
|
|
||||||
|
# If first attempt fails, try with more explicit prompt
|
||||||
|
if response and not self._isValidJsonResponse(response):
|
||||||
|
logger.debug("First AI intent analysis attempt failed, retrying with explicit JSON-only prompt")
|
||||||
|
explicitPrompt = f"""
|
||||||
|
{analysisPrompt}
|
||||||
|
|
||||||
|
IMPORTANT: You must respond with ONLY valid JSON. No explanations, no analysis, no text before or after. Just the JSON object.
|
||||||
|
"""
|
||||||
|
response = await self.services.ai.callAi(
|
||||||
|
prompt=explicitPrompt,
|
||||||
|
documents=None,
|
||||||
|
options=request_options
|
||||||
|
)
|
||||||
|
|
||||||
|
if not response or not response.strip():
|
||||||
|
logger.warning("AI intent analysis returned empty response")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Clean and extract JSON from response
|
||||||
|
result = response.strip()
|
||||||
|
logger.debug(f"AI intent analysis response length: {len(result)}")
|
||||||
|
|
||||||
|
# Try to find JSON in the response with multiple strategies
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Strategy 1: Look for JSON in markdown code blocks
|
||||||
|
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', result, re.DOTALL)
|
||||||
|
if json_match:
|
||||||
|
result = json_match.group(1)
|
||||||
|
logger.debug(f"Extracted JSON from markdown code block: {result[:200]}...")
|
||||||
|
else:
|
||||||
|
# Strategy 2: Look for JSON object with proper structure
|
||||||
|
json_match = re.search(r'\{[^{}]*"primaryGoal"[^{}]*\}', result, re.DOTALL)
|
||||||
|
if not json_match:
|
||||||
|
# Strategy 3: Look for any JSON object
|
||||||
|
json_match = re.search(r'\{.*\}', result, re.DOTALL)
|
||||||
|
|
||||||
|
if not json_match:
|
||||||
|
logger.warning(f"All AI intent analysis attempts failed - no JSON found in response: {result[:200]}...")
|
||||||
|
logger.debug(f"Full AI response: {result}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
result = json_match.group(0)
|
||||||
|
logger.debug(f"Extracted JSON directly: {result[:200]}...")
|
||||||
|
|
||||||
|
try:
|
||||||
|
aiResult = json.loads(result)
|
||||||
|
logger.info("AI intent analysis JSON parsed successfully")
|
||||||
|
return aiResult
|
||||||
|
|
||||||
|
except json.JSONDecodeError as json_error:
|
||||||
|
logger.warning(f"All AI intent analysis attempts failed - invalid JSON: {str(json_error)}")
|
||||||
|
logger.debug(f"JSON content: {result}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"AI intent analysis failed: {str(e)}")
|
||||||
|
return None
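
Illustrative sketch (assumed example values; the keys mirror the JSON schema embedded in analysisPrompt above) of the dict _analyzeIntentWithAI returns on success:

# Assumed example only; the actual values come from the model's JSON response.
example_intent = {
    "primaryGoal": "Generate the first 20 prime numbers as a plain list",
    "dataType": "numbers",
    "expectedFormat": "raw_data",
    "qualityRequirements": {
        "accuracyThreshold": 0.95,
        "completenessThreshold": 0.9,
        "formatRequirement": "raw"
    },
    "successCriteria": ["Contains exactly 20 prime numbers", "No surrounding prose"],
    "confidenceScore": 0.9
}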
|
||||||
|
|
||||||
def _classifyDataType(self, userPrompt: str) -> DataType:
|
def _createBasicIntentAnalysis(self, userPrompt: str) -> Dict[str, Any]:
|
||||||
"""Classifies the type of data the user wants"""
|
"""Creates basic intent analysis without AI"""
|
||||||
promptLower = userPrompt.lower()
|
|
||||||
|
|
||||||
for dataType, patterns in self.dataTypePatterns.items():
|
|
||||||
for pattern in patterns:
|
|
||||||
if re.search(pattern, promptLower):
|
|
||||||
return dataType
|
|
||||||
|
|
||||||
return DataType.UNKNOWN
|
|
||||||
|
|
||||||
def _determineExpectedFormat(self, userPrompt: str) -> ExpectedFormat:
|
|
||||||
"""Determines the expected format of the output"""
|
|
||||||
promptLower = userPrompt.lower()
|
|
||||||
|
|
||||||
for formatType, patterns in self.formatPatterns.items():
|
|
||||||
for pattern in patterns:
|
|
||||||
if re.search(pattern, promptLower):
|
|
||||||
return formatType
|
|
||||||
|
|
||||||
return ExpectedFormat.UNKNOWN
|
|
||||||
|
|
||||||
def _assessQualityRequirements(self, userPrompt: str, context: Any) -> Dict[str, Any]:
|
|
||||||
"""Assesses quality requirements from prompt and context"""
|
|
||||||
promptLower = userPrompt.lower()
|
|
||||||
|
|
||||||
# Check for accuracy requirements
|
|
||||||
accuracyThreshold = 0.8
|
|
||||||
if any(word in promptLower for word in ['exact', 'precise', 'accurate', 'correct']):
|
|
||||||
accuracyThreshold = 0.95
|
|
||||||
elif any(word in promptLower for word in ['approximate', 'rough', 'estimate']):
|
|
||||||
accuracyThreshold = 0.7
|
|
||||||
|
|
||||||
# Check for completeness requirements
|
|
||||||
completenessThreshold = 0.8
|
|
||||||
if any(word in promptLower for word in ['complete', 'full', 'comprehensive', 'all']):
|
|
||||||
completenessThreshold = 0.95
|
|
||||||
elif any(word in promptLower for word in ['summary', 'brief', 'overview']):
|
|
||||||
completenessThreshold = 0.6
|
|
||||||
|
|
||||||
# Check for format requirements
|
|
||||||
formatRequirement = "any"
|
|
||||||
if any(word in promptLower for word in ['formatted', 'structured', 'organized']):
|
|
||||||
formatRequirement = "formatted"
|
|
||||||
elif any(word in promptLower for word in ['raw', 'plain', 'simple']):
|
|
||||||
formatRequirement = "raw"
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"accuracyThreshold": accuracyThreshold,
|
"primaryGoal": userPrompt.strip(),
|
||||||
"completenessThreshold": completenessThreshold,
|
"dataType": "unknown",
|
||||||
"formatRequirement": formatRequirement
|
"expectedFormat": "unknown",
|
||||||
|
"qualityRequirements": {
|
||||||
|
"accuracyThreshold": 0.8,
|
||||||
|
"completenessThreshold": 0.8,
|
||||||
|
"formatRequirement": "any"
|
||||||
|
},
|
||||||
|
"successCriteria": ["Delivers what the user requested"],
|
||||||
|
"confidenceScore": 0.5
|
||||||
}
|
}
|
||||||
|
|
||||||
def _extractSuccessCriteria(self, userPrompt: str, context: Any) -> List[str]:
|
|
||||||
"""Extracts success criteria from prompt and context"""
|
|
||||||
criteria = []
|
|
||||||
promptLower = userPrompt.lower()
|
|
||||||
|
|
||||||
# Extract explicit criteria
|
|
||||||
if 'first' in promptLower and 'numbers' in promptLower:
|
|
||||||
criteria.append("Contains the first N numbers as requested")
|
|
||||||
|
|
||||||
if 'prime' in promptLower:
|
|
||||||
criteria.append("Contains actual prime numbers, not code to generate them")
|
|
||||||
|
|
||||||
if 'document' in promptLower:
|
|
||||||
criteria.append("Creates a properly formatted document")
|
|
||||||
|
|
||||||
if 'format' in promptLower:
|
|
||||||
criteria.append("Content is properly formatted as requested")
|
|
||||||
|
|
||||||
# Add context-based criteria
|
|
||||||
if hasattr(context, 'task_step') and context.task_step:
|
|
||||||
taskObjective = context.task_step.objective.lower()
|
|
||||||
if 'word' in taskObjective:
|
|
||||||
criteria.append("Creates a Word document")
|
|
||||||
if 'excel' in taskObjective:
|
|
||||||
criteria.append("Creates an Excel spreadsheet")
|
|
||||||
|
|
||||||
return criteria if criteria else ["Delivers what the user requested"]
|
|
||||||
|
|
||||||
def _calculateConfidenceScore(self, dataType: DataType, expectedFormat: ExpectedFormat,
|
|
||||||
successCriteria: List[str]) -> float:
|
|
||||||
"""Calculates confidence score for the intent analysis"""
|
|
||||||
score = 0.0
|
|
||||||
|
|
||||||
# Data type confidence
|
|
||||||
if dataType != DataType.UNKNOWN:
|
|
||||||
score += 0.3
|
|
||||||
|
|
||||||
# Format confidence
|
|
||||||
if expectedFormat != ExpectedFormat.UNKNOWN:
|
|
||||||
score += 0.2
|
|
||||||
|
|
||||||
# Success criteria confidence
|
|
||||||
if len(successCriteria) > 0:
|
|
||||||
score += 0.3
|
|
||||||
|
|
||||||
# Additional confidence for specific patterns
|
|
||||||
if len(successCriteria) > 1:
|
|
||||||
score += 0.2
|
|
||||||
|
|
||||||
return min(score, 1.0)
|
|
||||||
|
|
||||||
def _createDefaultIntentAnalysis(self, userPrompt: str) -> Dict[str, Any]:
|
def _createDefaultIntentAnalysis(self, userPrompt: str) -> Dict[str, Any]:
|
||||||
"""Creates a default intent analysis when analysis fails"""
|
"""Creates a default intent analysis when analysis fails"""
|
||||||
return {
|
return {
|
||||||
|
|
@ -237,3 +165,16 @@ class IntentAnalyzer:
|
||||||
"successCriteria": ["Delivers what the user requested"],
|
"successCriteria": ["Delivers what the user requested"],
|
||||||
"confidenceScore": 0.1
|
"confidenceScore": 0.1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _isValidJsonResponse(self, response: str) -> bool:
    """Checks if response contains valid JSON structure"""
    try:
        import re
        # Look for JSON with expected structure
        json_match = re.search(r'\{[^{}]*"primaryGoal"[^{}]*\}', response, re.DOTALL)
        if json_match:
            json.loads(json_match.group(0))
            return True
        return False
    except Exception:
        return False
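
A quick sketch (assumed input strings) of how this check drives the retry branch in _analyzeIntentWithAI: the explicit JSON-only prompt is sent only when the first response fails this test.

# Assumed examples, not from the source:
#   self._isValidJsonResponse('{"primaryGoal": "Summarize the attached report"}')  -> True
#   self._isValidJsonResponse('Sure! Here is my analysis of the request...')       -> False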
|
||||||
|
|
|
||||||
|
|
@ -31,8 +31,8 @@ class ReactMode(BaseMode):
|
||||||
def __init__(self, services, workflow):
|
def __init__(self, services, workflow):
|
||||||
super().__init__(services, workflow)
|
super().__init__(services, workflow)
|
||||||
# Initialize adaptive components
|
# Initialize adaptive components
|
||||||
self.intentAnalyzer = IntentAnalyzer()
|
self.intentAnalyzer = IntentAnalyzer(services)
|
||||||
self.contentValidator = ContentValidator()
|
self.contentValidator = ContentValidator(services)
|
||||||
self.learningEngine = LearningEngine()
|
self.learningEngine = LearningEngine()
|
||||||
self.progressTracker = ProgressTracker()
|
self.progressTracker = ProgressTracker()
|
||||||
self.currentIntent = None
|
self.currentIntent = None
|
||||||
|
|
@ -49,13 +49,14 @@ class ReactMode(BaseMode):
|
||||||
"""Execute task using React mode - iterative plan-act-observe-refine loop"""
|
"""Execute task using React mode - iterative plan-act-observe-refine loop"""
|
||||||
logger.info(f"=== STARTING TASK {taskIndex or '?'}: {taskStep.objective} ===")
|
logger.info(f"=== STARTING TASK {taskIndex or '?'}: {taskStep.objective} ===")
|
||||||
|
|
||||||
# NEW: Analyze user intent with both original prompt and task objective
|
# NEW: Analyze intents separately for proper validation vs task completion
|
||||||
# Get original user prompt from services (clean and reliable)
|
# Workflow-level intent from cleaned original user prompt
|
||||||
original_prompt = self.services.currentUserPrompt if self.services and hasattr(self.services, 'currentUserPrompt') else taskStep.objective
|
original_prompt = self.services.currentUserPrompt if self.services and hasattr(self.services, 'currentUserPrompt') else taskStep.objective
|
||||||
combined_context = f"Original request: {original_prompt}\n\nCurrent task: {taskStep.objective}"
|
self.workflowIntent = await self.intentAnalyzer.analyzeUserIntent(original_prompt, context)
|
||||||
|
# Task-level intent from current task objective (used only for task-scoped checks)
|
||||||
self.currentIntent = self.intentAnalyzer.analyzeUserIntent(combined_context, context)
|
self.taskIntent = await self.intentAnalyzer.analyzeUserIntent(taskStep.objective, context)
|
||||||
logger.info(f"Intent analysis (original + task): {self.currentIntent}")
|
logger.info(f"Intent analysis — workflow: {self.workflowIntent}")
|
||||||
|
logger.info(f"Intent analysis — task: {self.taskIntent}")
|
||||||
|
|
||||||
# NEW: Reset progress tracking for new task
|
# NEW: Reset progress tracking for new task
|
||||||
self.progressTracker.reset()
|
self.progressTracker.reset()
|
||||||
|
|
@ -99,18 +100,18 @@ class ReactMode(BaseMode):
|
||||||
# Attach deterministic label for clarity
|
# Attach deterministic label for clarity
|
||||||
observation['resultLabel'] = result.resultLabel
|
observation['resultLabel'] = result.resultLabel
|
||||||
|
|
||||||
# NEW: Add content validation
|
# NEW: Add content validation (against original cleaned user prompt / workflow intent)
|
||||||
if self.currentIntent and result.documents:
|
if getattr(self, 'workflowIntent', None) and result.documents:
|
||||||
validationResult = self.contentValidator.validateContent(result.documents, self.currentIntent)
|
validationResult = await self.contentValidator.validateContent(result.documents, self.workflowIntent)
|
||||||
observation['contentValidation'] = validationResult
|
observation['contentValidation'] = validationResult
|
||||||
logger.info(f"Content validation: {validationResult['overallSuccess']} (quality: {validationResult['qualityScore']:.2f})")
|
logger.info(f"Content validation: {validationResult['overallSuccess']} (quality: {validationResult['qualityScore']:.2f})")
|
||||||
|
|
||||||
# NEW: Learn from feedback
|
# NEW: Learn from feedback
|
||||||
feedback = self._collectFeedback(result, validationResult, self.currentIntent)
|
feedback = self._collectFeedback(result, validationResult, self.workflowIntent)
|
||||||
self.learningEngine.learnFromFeedback(feedback, context, self.currentIntent)
|
self.learningEngine.learnFromFeedback(feedback, context, self.workflowIntent)
|
||||||
|
|
||||||
# NEW: Update progress
|
# NEW: Update progress
|
||||||
self.progressTracker.updateProgress(result, validationResult, self.currentIntent)
|
self.progressTracker.updateProgress(result, validationResult, self.workflowIntent)
|
||||||
|
|
||||||
decision = await self._refineDecide(context, observation)
|
decision = await self._refineDecide(context, observation)
|
||||||
|
|
||||||
|
|
@ -204,6 +205,11 @@ class ReactMode(BaseMode):
|
||||||
selection = json.loads(response[jsonStart:jsonEnd])
|
selection = json.loads(response[jsonStart:jsonEnd])
|
||||||
if 'action' not in selection or not isinstance(selection['action'], str):
|
if 'action' not in selection or not isinstance(selection['action'], str):
|
||||||
raise ValueError("Selection missing 'action' as string")
|
raise ValueError("Selection missing 'action' as string")
|
||||||
|
|
||||||
|
# Validate document references - prevent AI from inventing Message IDs
|
||||||
|
if 'requiredInputDocuments' in selection:
|
||||||
|
self._validateDocumentReferences(selection['requiredInputDocuments'], context)
|
||||||
|
|
||||||
# Enforce spec: Stage 1 must NOT include 'parameters'
|
# Enforce spec: Stage 1 must NOT include 'parameters'
|
||||||
if 'parameters' in selection:
|
if 'parameters' in selection:
|
||||||
# Remove to avoid accidental carryover
|
# Remove to avoid accidental carryover
|
||||||
|
|
@ -213,6 +219,38 @@ class ReactMode(BaseMode):
|
||||||
selection['parameters'] = None
|
selection['parameters'] = None
|
||||||
return selection
|
return selection
|
||||||
|
|
||||||
|
def _validateDocumentReferences(self, document_refs: List[str], context: TaskContext) -> None:
    """Validate that document references exist in the current workflow"""
    if not document_refs:
        return

    # Get available documents from the current workflow
    try:
        available_docs = self.services.workflow.getAvailableDocuments(self.services.currentWorkflow)
        if not available_docs or available_docs == "No documents available":
            logger.warning("No documents available for validation")
            return

        # Extract all valid references from available documents
        valid_refs = []
        for line in available_docs.split('\n'):
            if 'docList:' in line or 'docItem:' in line:
                # Extract reference from line like " - docList:msg_xxx:label" or " - docItem:xxx:filename with spaces"
                ref_match = re.search(r'(docList:[^\s]+|docItem:[^\s]+(?:\s+[^\s]+)*)', line)
                if ref_match:
                    valid_refs.append(ref_match.group(1))

        # Check if all provided references are valid
        for ref in document_refs:
            if ref not in valid_refs:
                logger.error(f"Invalid document reference: {ref}")
                logger.error(f"Available references: {valid_refs}")
                raise ValueError(f"Document reference '{ref}' not found in available documents. Use only exact references from AVAILABLE_DOCUMENTS_INDEX.")

    except Exception as e:
        logger.error(f"Error validating document references: {str(e)}")
        raise ValueError(f"Failed to validate document references: {str(e)}")
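
A small sketch (assumed index lines, modeled on the inline comment above) of what the reference regex extracts; the real index text comes from getAvailableDocuments():

# Assumed example lines; real entries are produced by the workflow service.
import re

index_lines = [
    "  - docList:msg_47a7a578-e8f2-4ba8-ac66-0dbff40605e0:round8_task1_action1_results",
    "  - docItem:5d8b7aee-b546-4487-b6a8-835c86f7b186:AI_Generated_Document_20251006-104256.docx",
]
pattern = r'(docList:[^\s]+|docItem:[^\s]+(?:\s+[^\s]+)*)'
valid_refs = [m.group(1) for line in index_lines if (m := re.search(pattern, line))]
# valid_refs now holds the exact strings the model must copy into requiredInputDocuments.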
|
||||||
|
|
||||||
async def _actExecute(self, context: TaskContext, selection: Dict[str, Any], taskStep: TaskStep,
|
async def _actExecute(self, context: TaskContext, selection: Dict[str, Any], taskStep: TaskStep,
|
||||||
workflow: ChatWorkflow, stepIndex: int) -> ActionResult:
|
workflow: ChatWorkflow, stepIndex: int) -> ActionResult:
|
||||||
"""Act: request minimal parameters then execute selected action"""
|
"""Act: request minimal parameters then execute selected action"""
|
||||||
|
|
|
||||||
|
|
@ -42,35 +42,38 @@ def extractUserPrompt(context: Any) -> str:
|
||||||
Fallback to the task_step objective.
|
Fallback to the task_step objective.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# Prefer services.currentUserPrompt when accessible through context
|
|
||||||
services = getattr(context, 'services', None)
|
services = getattr(context, 'services', None)
|
||||||
if services and getattr(services, 'currentUserPrompt', None):
|
|
||||||
return services.currentUserPrompt
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
if hasattr(context, 'task_step') and context.task_step:
|
# Determine raw user prompt from services or task_step
|
||||||
return context.task_step.objective or 'No request specified'
|
rawPrompt = None
|
||||||
return 'No request specified'
|
if services and getattr(services, 'currentUserPrompt', None):
|
||||||
|
rawPrompt = services.currentUserPrompt
|
||||||
|
elif hasattr(context, 'task_step') and context.task_step:
|
||||||
|
rawPrompt = context.task_step.objective or 'No request specified'
|
||||||
|
else:
|
||||||
|
rawPrompt = 'No request specified'
|
||||||
|
|
||||||
|
# Prefer values computed at workflow start by WorkflowManager analyzer
|
||||||
|
normalized = getattr(services, 'currentUserPromptNormalized', None) if services else None
|
||||||
|
if normalized:
|
||||||
|
return normalized
|
||||||
|
return rawPrompt
|
||||||
|
except Exception:
|
||||||
|
# Robust fallback behavior
|
||||||
|
if hasattr(context, 'task_step') and context.task_step:
|
||||||
|
return context.task_step.objective or 'No request specified'
|
||||||
|
return 'No request specified'
|
||||||
|
|
||||||
def extractWorkflowHistory(service: Any, context: Any) -> str:
|
def extractWorkflowHistory(service: Any, context: Any) -> str:
|
||||||
"""Extract workflow history from context. Maps to {{KEY:WORKFLOW_HISTORY}}
|
"""Extract workflow history from context. Maps to {{KEY:WORKFLOW_HISTORY}}
|
||||||
Reverse-chronological, enriched with message summaries and document labels.
|
Reverse-chronological, enriched with message summaries and document labels.
|
||||||
"""
|
"""
|
||||||
# Prefer explicit workflow on context; else fall back to services.workflow
|
|
||||||
workflow = None
|
|
||||||
try:
|
try:
|
||||||
if hasattr(context, 'workflow') and context.workflow:
|
history = getPreviousRoundContext(service, service.currentWorkflow)
|
||||||
workflow = context.workflow
|
|
||||||
elif hasattr(service, 'workflow') and service.workflow:
|
|
||||||
workflow = service.workflow
|
|
||||||
except Exception:
|
|
||||||
workflow = None
|
|
||||||
|
|
||||||
if workflow:
|
|
||||||
history = getPreviousRoundContext(service, workflow)
|
|
||||||
return history or "No previous workflow rounds available"
|
return history or "No previous workflow rounds available"
|
||||||
return "No previous workflow rounds available"
|
except Exception as e:
|
||||||
|
logger.error(f"Error getting workflow history: {str(e)}")
|
||||||
|
return "No previous workflow rounds available"
|
||||||
|
|
||||||
def extractAvailableMethods(service: Any) -> str:
|
def extractAvailableMethods(service: Any) -> str:
|
||||||
"""Extract available methods for action planning. Maps to {{KEY:AVAILABLE_METHODS}}"""
|
"""Extract available methods for action planning. Maps to {{KEY:AVAILABLE_METHODS}}"""
|
||||||
|
|
@ -99,7 +102,15 @@ def extractAvailableMethods(service: Any) -> str:
|
||||||
|
|
||||||
def extractUserLanguage(service: Any) -> str:
|
def extractUserLanguage(service: Any) -> str:
|
||||||
"""Extract user language from service. Maps to {{KEY:USER_LANGUAGE}}"""
|
"""Extract user language from service. Maps to {{KEY:USER_LANGUAGE}}"""
|
||||||
return service.user.language if service and service.user else 'en'
|
try:
|
||||||
|
# Prefer detected language if available
|
||||||
|
if service and getattr(service, 'currentUserLanguage', None):
|
||||||
|
return service.currentUserLanguage
|
||||||
|
return service.user.language if service and service.user else 'en'
|
||||||
|
except Exception:
|
||||||
|
return 'en'
|
||||||
|
|
||||||
|
# Normalization now happens centrally in WorkflowManager._sendFirstMessage; no AI call here.
|
||||||
|
|
||||||
|
|
||||||
def _computeMessageSummary(msg) -> str:
|
def _computeMessageSummary(msg) -> str:
|
||||||
|
|
@ -371,9 +382,10 @@ def extractLatestRefinementFeedback(context: Any) -> str:
|
||||||
def extractAvailableDocumentsSummary(service: Any, context: Any) -> str:
|
def extractAvailableDocumentsSummary(service: Any, context: Any) -> str:
|
||||||
"""Summary of available documents (count only)."""
|
"""Summary of available documents (count only)."""
|
||||||
try:
|
try:
|
||||||
documents = service.workflow.getAvailableDocuments(context.workflow)
|
documents = service.workflow.getAvailableDocuments(service.currentWorkflow)
|
||||||
if documents and documents != "No documents available":
|
if documents and documents != "No documents available":
|
||||||
doc_count = documents.count("docList:") + documents.count("docItem:")
|
# Count only actual documents, not list labels
|
||||||
|
doc_count = documents.count("docItem:")
|
||||||
return f"{doc_count} documents available from previous tasks"
|
return f"{doc_count} documents available from previous tasks"
|
||||||
return "No documents available"
|
return "No documents available"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
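A quick sketch (assumed index text) of why the count switched to docItem only: list labels would otherwise inflate the reported number of documents.

# Assumed example index; counting "docItem:" yields 2, matching the actual documents,
# while the old "docList:" + "docItem:" count would have reported 3.
index_text = (
    "docList:msg_1:round1_task1_results\n"
    "  - docItem:aaa:report.docx\n"
    "  - docItem:bbb:data.csv\n"
)
doc_count = index_text.count("docItem:")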
@ -383,7 +395,7 @@ def extractAvailableDocumentsSummary(service: Any, context: Any) -> str:
|
||||||
def extractAvailableDocumentsIndex(service: Any, context: Any) -> str:
|
def extractAvailableDocumentsIndex(service: Any, context: Any) -> str:
|
||||||
"""Index of available documents with detailed references for parameter generation."""
|
"""Index of available documents with detailed references for parameter generation."""
|
||||||
try:
|
try:
|
||||||
return service.workflow.getAvailableDocuments(context.workflow)
|
return service.workflow.getAvailableDocuments(service.currentWorkflow)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error getting document index: {str(e)}")
|
logger.error(f"Error getting document index: {str(e)}")
|
||||||
return "No documents available"
|
return "No documents available"
|
||||||
|
|
|
||||||
|
|
@ -32,7 +32,7 @@ def generateReactPlanSelectionPrompt(services, context: Any) -> PromptBundle:
|
||||||
PromptPlaceholder(label="AVAILABLE_CONNECTIONS_INDEX", content=extractAvailableConnectionsIndex(services), summaryAllowed=False),
|
PromptPlaceholder(label="AVAILABLE_CONNECTIONS_INDEX", content=extractAvailableConnectionsIndex(services), summaryAllowed=False),
|
||||||
]
|
]
|
||||||
|
|
||||||
template = """Select exactly one action to advance the task.
|
template = """Select exactly one next action to advance the task incrementally.
|
||||||
|
|
||||||
OBJECTIVE:
|
OBJECTIVE:
|
||||||
{{KEY:USER_PROMPT}}
|
{{KEY:USER_PROMPT}}
|
||||||
|
|
@ -52,7 +52,11 @@ AVAILABLE_DOCUMENTS_INDEX:
|
||||||
AVAILABLE_CONNECTIONS_INDEX:
|
AVAILABLE_CONNECTIONS_INDEX:
|
||||||
{{KEY:AVAILABLE_CONNECTIONS_INDEX}}
|
{{KEY:AVAILABLE_CONNECTIONS_INDEX}}
|
||||||
|
|
||||||
REPLY: Return ONLY a JSON object with the following structure (no comments, no extra text):
|
REPLY: Return ONLY a JSON object with the following structure (no comments, no extra text). The chosen action MUST:
|
||||||
|
- be the next logical incremental step toward fulfilling the objective
|
||||||
|
- not attempt to complete the entire objective in one step
|
||||||
|
- if producing files, target exactly one output format for this step
|
||||||
|
- reference ONLY existing document IDs/labels from AVAILABLE_DOCUMENTS_INDEX
|
||||||
{{
|
{{
|
||||||
"action": "method.action_name",
|
"action": "method.action_name",
|
||||||
"actionObjective": "...",
|
"actionObjective": "...",
|
||||||
|
|
@ -64,7 +68,7 @@ REPLY: Return ONLY a JSON object with the following structure (no comments, no e
|
||||||
|
|
||||||
EXAMPLE how to assign references from AVAILABLE_DOCUMENTS_INDEX and AVAILABLE_CONNECTIONS_INDEX:
|
EXAMPLE how to assign references from AVAILABLE_DOCUMENTS_INDEX and AVAILABLE_CONNECTIONS_INDEX:
|
||||||
"requiredInputDocuments": ["docList:msg_47a7a578-e8f2-4ba8-ac66-0dbff40605e0:round8_task1_action1_results","docItem:5d8b7aee-b546-4487-b6a8-835c86f7b186:AI_Generated_Document_20251006-104256.docx"],
|
"requiredInputDocuments": ["docList:msg_47a7a578-e8f2-4ba8-ac66-0dbff40605e0:round8_task1_action1_results","docItem:5d8b7aee-b546-4487-b6a8-835c86f7b186:AI_Generated_Document_20251006-104256.docx"],
|
||||||
"requiredConnection": "connection:msft:p.motsch@valueon.ch:1ae8b8e5-128b-49b8-b1cb-7c632669eeae",
|
"requiredConnection": "connection:msft:p.motsch@valueon.ch",
|
||||||
|
|
||||||
RULES:
|
RULES:
|
||||||
1. Use EXACT action names from AVAILABLE_METHODS
|
1. Use EXACT action names from AVAILABLE_METHODS
|
||||||
|
|
@ -72,7 +76,11 @@ RULES:
|
||||||
3. parametersContext must be short and sufficient for Stage 2
|
3. parametersContext must be short and sufficient for Stage 2
|
||||||
4. Return ONLY JSON - no markdown, no explanations
|
4. Return ONLY JSON - no markdown, no explanations
|
||||||
5. For requiredInputDocuments, use ONLY exact references from AVAILABLE_DOCUMENTS_INDEX (docList:... or docItem:...)
|
5. For requiredInputDocuments, use ONLY exact references from AVAILABLE_DOCUMENTS_INDEX (docList:... or docItem:...)
|
||||||
|
- DO NOT invent or modify Message IDs
|
||||||
|
- DO NOT create new references
|
||||||
|
- Copy references EXACTLY as shown in AVAILABLE_DOCUMENTS_INDEX
|
||||||
6. For requiredConnection, use ONLY an exact label from AVAILABLE_CONNECTIONS_INDEX
|
6. For requiredConnection, use ONLY an exact label from AVAILABLE_CONNECTIONS_INDEX
|
||||||
|
7. Plan incrementally: if the overall intent needs multiple output formats (e.g., CSV and HTML), choose one format in this step and leave the other(s) for subsequent steps
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return PromptBundle(prompt=template, placeholders=placeholders)
|
return PromptBundle(prompt=template, placeholders=placeholders)
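
A sketch of a Stage 1 selection that satisfies these rules; the field values are assumed, the action name is hypothetical, and the part of the reply schema elided by the hunk above is not reproduced:

# Assumed example; document and connection references are copied verbatim from the EXAMPLE above.
selection_example = {
    "action": "documents.create_file",  # hypothetical action name, not taken from AVAILABLE_METHODS
    "actionObjective": "Produce a CSV file with the rows extracted from the referenced DOCX",
    "requiredInputDocuments": [
        "docItem:5d8b7aee-b546-4487-b6a8-835c86f7b186:AI_Generated_Document_20251006-104256.docx"
    ],
    "requiredConnection": "connection:msft:p.motsch@valueon.ch",
    "parametersContext": "Extract the table from the DOCX and write it as CSV; one output format for this step"
}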
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,8 @@ def generateTaskPlanningPrompt(services, context: Any) -> PromptBundle:
|
||||||
|
|
||||||
Break down user requests into logical, executable task steps.
|
Break down user requests into logical, executable task steps.
|
||||||
|
|
||||||
|
**IMPORTANT**: If the user asks for ONE complete business objective, create ONLY ONE task that accomplishes the entire objective. Do NOT split it into multiple micro-tasks.
|
||||||
|
|
||||||
## 📋 Context
|
## 📋 Context
|
||||||
|
|
||||||
### User Request
|
### User Request
|
||||||
|
|
@ -46,12 +48,20 @@ Break down user requests into logical, executable task steps.
|
||||||
- **ONE TOPIC PER TASK** - Each task should handle one complete business objective
|
- **ONE TOPIC PER TASK** - Each task should handle one complete business objective
|
||||||
- **HIGH-LEVEL FOCUS** - Plan strategic outcomes, not implementation steps
|
- **HIGH-LEVEL FOCUS** - Plan strategic outcomes, not implementation steps
|
||||||
- **AVOID MICRO-TASKS** - Don't create separate tasks for each small action
|
- **AVOID MICRO-TASKS** - Don't create separate tasks for each small action
|
||||||
|
- **CRITICAL**: If the user asks for ONE thing (like "analyse document list and produce summary"), create ONLY ONE task that does the complete job
|
||||||
|
|
||||||
### Task Grouping Examples
|
### Task Grouping Examples
|
||||||
- **Research + Analysis + Report** → ONE task: "Web research report"
|
- **Research + Analysis + Report** → ONE task: "Web research report"
|
||||||
- **Data Collection + Processing + Visualization** → ONE task: "Collect and present data"
|
- **Data Collection + Processing + Visualization** → ONE task: "Collect and present data"
|
||||||
|
- **Document splitting** (analyze + extract + create files) → ONE task: "Split document into separate files"
|
||||||
- **Different topics** (email + flowers) → SEPARATE tasks: "Send formal email..." + "Order flowers from Fleurop for delivery to 123 Main St, include card message"
|
- **Different topics** (email + flowers) → SEPARATE tasks: "Send formal email..." + "Order flowers from Fleurop for delivery to 123 Main St, include card message"
|
||||||
|
|
||||||
|
### Common Single-Task Scenarios
|
||||||
|
- **"Split document into sections"** → ONE task: "Split document into separate files"
|
||||||
|
- **"Extract data and create report"** → ONE task: "Extract data and create report"
|
||||||
|
- **"Analyze and summarize document"** → ONE task: "Analyze and summarize document"
|
||||||
|
- **"Convert file to different format"** → ONE task: "Convert file to different format"
|
||||||
|
|
||||||
### Retry Handling
|
### Retry Handling
|
||||||
- **If retry request**: Analyze previous rounds to understand what failed
|
- **If retry request**: Analyze previous rounds to understand what failed
|
||||||
- **Learn from mistakes**: Improve the plan based on previous failures
|
- **Learn from mistakes**: Improve the plan based on previous failures
|
||||||
|
|
|
||||||
|
|
@ -216,23 +216,23 @@ class WorkflowManager:
|
||||||
# Update the message with documents in database
|
# Update the message with documents in database
|
||||||
self.services.workflow.updateMessage(message.id, {"documents": [doc.to_dict() for doc in documents]})
|
self.services.workflow.updateMessage(message.id, {"documents": [doc.to_dict() for doc in documents]})
|
||||||
|
|
||||||
# Analyze the user's input to extract intent and offload bulky context into documents
|
# Analyze the user's input to detect language, normalize request, extract intent, and offload bulky context into documents
|
||||||
try:
|
try:
|
||||||
analyzerPrompt = (
|
analyzerPrompt = (
|
||||||
"You are an input analyzer. Split the user's message into:\n"
|
"You are an input analyzer. From the user's message, perform ALL of the following in one pass:\n"
|
||||||
"1) intent: the user's core request in one concise paragraph, normalized to the user's language.\n"
|
"1) detectedLanguage: detect ISO 639-1 language code (e.g., de, en).\n"
|
||||||
"2) contextItems: supportive data to attach as separate documents if significantly larger than the intent. "
|
"2) normalizedRequest: full, explicit restatement of the user's request in the detected language; do NOT summarize; preserve ALL constraints and details.\n"
|
||||||
"Include large literal data blocks, long lists/tables, code/JSON blocks, quoted transcripts, CSV fragments, or detailed specs. "
|
"3) intent: concise single-paragraph core request in the detected language for high-level routing.\n"
|
||||||
"Keep URLs in the intent unless they include large pasted content.\n\n"
|
"4) contextItems: supportive data blocks to attach as separate documents if significantly larger than the intent (large literal content, long lists/tables, code/JSON blocks, transcripts, CSV fragments, detailed specs). Keep URLs in the intent unless they embed large pasted content.\n\n"
|
||||||
"Rules:\n"
|
"Rules:\n"
|
||||||
"- If total content length (intent + data) is less than 10% of the model's max tokens, do not extract; "
|
"- If total content (intent + data) is < 10% of model max tokens, do not extract; return empty contextItems and keep intent compact and self-contained.\n"
|
||||||
"return an empty contextItems and keep a compact, self-contained intent.\n"
|
"- If content exceeds that threshold, move bulky parts into contextItems; keep intent short and clear.\n"
|
||||||
"- If content exceeds that, move bulky parts into contextItems, keeping the intent short and clear.\n"
|
"- Preserve critical references (URLs, filenames) in intent.\n"
|
||||||
"- Preserve critical references (URLs, filenames) in the intent.\n"
|
"- Normalize to the primary detected language if mixed-language.\n\n"
|
||||||
"- Normalize the intent to the detected language. If mixed-language, use the primary detected language and normalize.\n\n"
|
"Return ONLY JSON (no markdown) with this shape:\n"
|
||||||
"Output JSON only (no markdown):\n"
|
|
||||||
"{\n"
|
"{\n"
|
||||||
" \"detectedLanguage\": \"en\",\n"
|
" \"detectedLanguage\": \"de|en|fr|it|...\",\n"
|
||||||
|
" \"normalizedRequest\": \"Full explicit instruction in detected language\",\n"
|
||||||
" \"intent\": \"Concise normalized request...\",\n"
|
" \"intent\": \"Concise normalized request...\",\n"
|
||||||
" \"contextItems\": [\n"
|
" \"contextItems\": [\n"
|
||||||
" {\n"
|
" {\n"
|
||||||
|
|
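For orientation, a sketch (assumed values; the contextItems item shape is elided by the hunk boundary above) of a response that the parsing below accepts:

# Assumed example analyzer output; only keys visible in the schema above are shown.
example_analyzer_response = {
    "detectedLanguage": "de",
    "normalizedRequest": "Erstelle aus der angehängten Tabelle eine CSV-Datei und eine HTML-Übersicht, behalte alle Spalten bei.",
    "intent": "Tabelle in eine CSV-Datei und eine HTML-Übersicht umwandeln",
    "contextItems": []  # item structure omitted here because it is cut off in the hunk above
}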
@ -249,6 +249,7 @@ class WorkflowManager:
|
||||||
aiResponse = await self.services.ai.callAi(prompt=analyzerPrompt)
|
aiResponse = await self.services.ai.callAi(prompt=analyzerPrompt)
|
||||||
|
|
||||||
detectedLanguage = None
|
detectedLanguage = None
|
||||||
|
normalizedRequest = None
|
||||||
intentText = userInput.prompt
|
intentText = userInput.prompt
|
||||||
contextItems = []
|
contextItems = []
|
||||||
|
|
||||||
|
|
@ -260,6 +261,7 @@ class WorkflowManager:
|
||||||
if jsonStart != -1 and jsonEnd > jsonStart:
|
if jsonStart != -1 and jsonEnd > jsonStart:
|
||||||
parsed = json.loads(aiResponse[jsonStart:jsonEnd])
|
parsed = json.loads(aiResponse[jsonStart:jsonEnd])
|
||||||
detectedLanguage = parsed.get('detectedLanguage') or None
|
detectedLanguage = parsed.get('detectedLanguage') or None
|
||||||
|
normalizedRequest = parsed.get('normalizedRequest') or None
|
||||||
if parsed.get('intent'):
|
if parsed.get('intent'):
|
||||||
intentText = parsed.get('intent')
|
intentText = parsed.get('intent')
|
||||||
contextItems = parsed.get('contextItems') or []
|
contextItems = parsed.get('contextItems') or []
|
||||||
|
|
@ -269,7 +271,18 @@ class WorkflowManager:
|
||||||
# Update services state
|
# Update services state
|
||||||
if detectedLanguage and isinstance(detectedLanguage, str):
|
if detectedLanguage and isinstance(detectedLanguage, str):
|
||||||
self._setUserLanguage(detectedLanguage)
|
self._setUserLanguage(detectedLanguage)
|
||||||
|
try:
|
||||||
|
setattr(self.services, 'currentUserLanguage', detectedLanguage)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
self.services.currentUserPrompt = intentText or userInput.prompt
|
self.services.currentUserPrompt = intentText or userInput.prompt
|
||||||
|
try:
|
||||||
|
if normalizedRequest:
|
||||||
|
setattr(self.services, 'currentUserPromptNormalized', normalizedRequest)
|
||||||
|
if contextItems is not None:
|
||||||
|
setattr(self.services, 'currentUserContextItems', contextItems)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Telemetry (sizes and counts)
|
# Telemetry (sizes and counts)
|
||||||
try:
|
try:
|
||||||
|
|
@ -329,8 +342,6 @@ class WorkflowManager:
|
||||||
if not message.documents:
|
if not message.documents:
|
||||||
message.documents = []
|
message.documents = []
|
||||||
message.documents.extend(created_docs)
|
message.documents.extend(created_docs)
|
||||||
# Ensure label is user_context for discoverability
|
|
||||||
message.documentsLabel = context_label
|
|
||||||
self.services.workflow.updateMessage(message.id, {
|
self.services.workflow.updateMessage(message.id, {
|
||||||
"documents": [d.to_dict() for d in message.documents],
|
"documents": [d.to_dict() for d in message.documents],
|
||||||
"documentsLabel": context_label
|
"documentsLabel": context_label
|
||||||
|
|
|
||||||
|
|
@ -41,6 +41,7 @@ markdown
## Web Scraping & HTTP
beautifulsoup4==4.12.2 # Required for HTML/XML parsing
requests==2.31.0
requests-oauthlib==1.3.1 # Required for Google OAuth2Session
chardet>=5.0.0 # Required for character-set detection of web content
aiohttp>=3.8.0 # Required for SharePoint operations (async HTTP)
selenium>=4.15.0 # Required for web automation and JavaScript-heavy pages

test_document_processing.py (new file, 555 lines)
@ -0,0 +1,555 @@
|
||||||
|
"""
|
||||||
|
Test script for document processing and DOCX generation.
|
||||||
|
Calls the main AI service directly to process PDF documents and generate DOCX summaries.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
import base64
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add the gateway module to the path
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'modules'))
|
||||||
|
|
||||||
|
from modules.datamodels.datamodelChat import ChatDocument
|
||||||
|
from modules.datamodels.datamodelAi import EnhancedAiCallOptions
|
||||||
|
from modules.services.serviceAi.mainServiceAi import AiService
|
||||||
|
from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
|
||||||
|
|
||||||
|
# Set up logging
|
||||||
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def process_documents_and_generate_summary():
|
||||||
|
"""Process documents using the main AI service with intelligent chunk integration."""
|
||||||
|
logger.info("🚀 Starting intelligent chunk integration test...")
|
||||||
|
|
||||||
|
# Find testdata directory
|
||||||
|
testdata_path = Path("../wiki/poweron/testdata")
|
||||||
|
if not testdata_path.exists():
|
||||||
|
# Try relative to current directory
|
||||||
|
testdata_path = Path("wiki/poweron/testdata")
|
||||||
|
if not testdata_path.exists():
|
||||||
|
# Try relative to parent directory
|
||||||
|
testdata_path = Path("../wiki/poweron/testdata")
|
||||||
|
if not testdata_path.exists():
|
||||||
|
logger.error(f"❌ Testdata path not found. Tried:")
|
||||||
|
logger.error(f" - ../wiki/poweron/testdata")
|
||||||
|
logger.error(f" - wiki/poweron/testdata")
|
||||||
|
logger.error(f" - ../wiki/poweron/testdata")
|
||||||
|
logger.info("Please ensure the testdata folder exists with PDF documents")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Find all supported document files
|
||||||
|
supported_extensions = [
|
||||||
|
# Document formats
|
||||||
|
"*.pdf", "*.docx", "*.xlsx", "*.pptx", "*.ppt",
|
||||||
|
# Image formats
|
||||||
|
"*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp", "*.bmp", "*.tiff",
|
||||||
|
# Text and code files
|
||||||
|
"*.txt", "*.md", "*.log", "*.rtf", "*.tex", "*.rst", "*.adoc", "*.org", "*.pod",
|
||||||
|
"*.java", "*.js", "*.jsx", "*.ts", "*.tsx", "*.py", "*.rb", "*.go", "*.rs", "*.cpp", "*.c", "*.h", "*.hpp", "*.cc", "*.cxx",
|
||||||
|
"*.cs", "*.php", "*.swift", "*.kt", "*.scala", "*.clj", "*.hs", "*.ml", "*.fs", "*.vb", "*.dart", "*.r", "*.m", "*.pl", "*.sh",
|
||||||
|
"*.html", "*.htm", "*.css", "*.scss", "*.sass", "*.less", "*.vue", "*.svelte",
|
||||||
|
"*.config", "*.ini", "*.cfg", "*.conf", "*.properties", "*.yaml", "*.yml", "*.toml", "*.json", "*.xml",
|
||||||
|
"*.bat", "*.ps1", "*.psm1", "*.psd1", "*.vbs", "*.wsf", "*.cmd", "*.com",
|
||||||
|
"*.csv", "*.tsv", "*.tab", "*.dat", "*.data",
|
||||||
|
"*.man", "*.1", "*.2", "*.3", "*.4", "*.5", "*.6", "*.7", "*.8", "*.9", "*.n", "*.l", "*.m", "*.r", "*.t", "*.x", "*.y", "*.z",
|
||||||
|
"*.diff", "*.patch", "*.gitignore", "*.dockerignore", "*.editorconfig", "*.gitattributes",
|
||||||
|
"*.env", "*.env.local", "*.env.development", "*.env.production", "*.env.test",
|
||||||
|
"*.lock", "*.lockb", "*.lockfile", "*.pkg-lock", "*.yarn-lock"
|
||||||
|
]
|
||||||
|
document_files = []
|
||||||
|
for ext in supported_extensions:
|
||||||
|
document_files.extend(list(testdata_path.glob(ext)))
|
||||||
|
|
||||||
|
logger.info(f"Found {len(document_files)} document files in testdata:")
|
||||||
|
for doc_file in document_files:
|
||||||
|
logger.info(f" - {doc_file.name}")
|
||||||
|
|
||||||
|
if not document_files:
|
||||||
|
logger.error("❌ No supported document files found in testdata folder")
|
||||||
|
return False
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Mock the database interface to provide our file data BEFORE creating AI service
|
||||||
|
class TestDbInterface:
|
||||||
|
def __init__(self, file_data_map):
|
||||||
|
self.file_data_map = file_data_map
|
||||||
|
|
||||||
|
def getFileData(self, file_id):
|
||||||
|
logger.info(f"TestDbInterface.getFileData called with file_id: {file_id}")
|
||||||
|
data = self.file_data_map.get(file_id)
|
||||||
|
if data:
|
||||||
|
logger.info(f"✅ Found file data for {file_id}: {len(data)} bytes")
|
||||||
|
else:
|
||||||
|
logger.warning(f"❌ No file data found for {file_id}")
|
||||||
|
return data
|
||||||
|
|
||||||
|
# Create file data mapping
|
||||||
|
file_data_map = {}
|
||||||
|
for i, doc_file in enumerate(document_files):
|
||||||
|
with open(doc_file, 'rb') as f:
|
||||||
|
file_data_map[f"test_doc_{i+1}"] = f.read()
|
||||||
|
logger.info(f"📁 Loaded {doc_file.name} as test_doc_{i+1}: {len(file_data_map[f'test_doc_{i+1}'])} bytes")
|
||||||
|
|
||||||
|
# Mock the database interface BEFORE creating AI service
|
||||||
|
import modules.interfaces.interfaceDbComponentObjects as db_interface_module
|
||||||
|
original_get_interface = db_interface_module.getInterface
|
||||||
|
db_interface_module.getInterface = lambda: TestDbInterface(file_data_map)
|
||||||
|
logger.info("🔧 Database interface mocked successfully")
|
||||||
|
|
||||||
|
# Create a mock service center with utils
|
||||||
|
class MockServiceCenter:
|
||||||
|
def __init__(self):
|
||||||
|
self.utils = MockUtils()
|
||||||
|
|
||||||
|
class MockUtils:
|
||||||
|
def debugLogToFile(self, message, label):
|
||||||
|
logger.debug(f"[{label}] {message}")
|
||||||
|
print(f"DEBUG [{label}]: {message}") # Also print to console for visibility
|
||||||
|
|
||||||
|
# Only write to debug file if debug logging is enabled (matching real implementation)
|
||||||
|
debug_enabled = self.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
||||||
|
if debug_enabled:
|
||||||
|
try:
|
||||||
|
import os
|
||||||
|
from datetime import datetime, UTC
|
||||||
|
debug_dir = self.configGet("APP_DEBUG_CHAT_WORKFLOW_DIR", "./test-chat")
|
||||||
|
if not os.path.isabs(debug_dir):
|
||||||
|
# If relative path, make it relative to the gateway directory
|
||||||
|
gateway_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
||||||
|
debug_dir = os.path.join(gateway_dir, debug_dir)
|
||||||
|
|
||||||
|
os.makedirs(debug_dir, exist_ok=True)
|
||||||
|
debug_file = os.path.join(debug_dir, "debug_workflow.log")
|
||||||
|
timestamp = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
|
||||||
|
debug_entry = f"[{timestamp}] [{label}] {message}\n"
|
||||||
|
with open(debug_file, "a", encoding="utf-8") as f:
|
||||||
|
f.write(debug_entry)
|
||||||
|
except Exception:
|
||||||
|
pass # Don't fail on debug logging errors
|
||||||
|
|
||||||
|
def configGet(self, key, default):
|
||||||
|
# Return debug settings
|
||||||
|
if key == "APP_DEBUG_CHAT_WORKFLOW_ENABLED":
|
||||||
|
return True
|
||||||
|
elif key == "APP_DEBUG_CHAT_WORKFLOW_DIR":
|
||||||
|
return "./test-chat"
|
||||||
|
return default
|
||||||
|
|
||||||
|
mock_service_center = MockServiceCenter()
|
||||||
|
|
||||||
|
# Initialize the main AI service - let it handle everything
|
||||||
|
logger.info("🔧 Initializing main AI service...")
|
||||||
|
ai_service = await AiService.create(mock_service_center)
|
||||||
|
|
||||||
|
# Create test documents - the AI service will handle file access internally
|
||||||
|
documents = []
|
||||||
|
logger.info(f"📁 Found {len(document_files)} document files")
|
||||||
|
for i, doc_file in enumerate(document_files):
|
||||||
|
logger.info(f"📄 Processing file {i+1}/{len(document_files)}: {doc_file.name}")
|
||||||
|
# Determine MIME type based on file extension
|
||||||
|
mime_type = "application/octet-stream" # default
|
||||||
|
if doc_file.suffix.lower() == '.pdf':
|
||||||
|
mime_type = "application/pdf"
|
||||||
|
elif doc_file.suffix.lower() in ['.jpg', '.jpeg']:
|
||||||
|
mime_type = "image/jpeg"
|
||||||
|
elif doc_file.suffix.lower() == '.png':
|
||||||
|
mime_type = "image/png"
|
||||||
|
elif doc_file.suffix.lower() == '.gif':
|
||||||
|
mime_type = "image/gif"
|
||||||
|
elif doc_file.suffix.lower() == '.docx':
|
||||||
|
mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||||
|
elif doc_file.suffix.lower() == '.xlsx':
|
||||||
|
mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
|
elif doc_file.suffix.lower() == '.pptx':
|
||||||
|
mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||||
|
elif doc_file.suffix.lower() == '.ppt':
|
||||||
|
mime_type = "application/vnd.ms-powerpoint"
|
||||||
|
elif doc_file.suffix.lower() == '.html':
|
||||||
|
mime_type = "text/html"
|
||||||
|
elif doc_file.suffix.lower() == '.csv':
|
||||||
|
mime_type = "text/csv"
|
||||||
|
elif doc_file.suffix.lower() == '.json':
|
||||||
|
mime_type = "application/json"
|
||||||
|
elif doc_file.suffix.lower() in ['.txt', '.md']:
|
||||||
|
mime_type = "text/plain"
|
||||||
|
|
||||||
|
chat_doc = ChatDocument(
|
||||||
|
fileId=f"test_doc_{i+1}",
|
||||||
|
messageId=f"test_message_{i+1}",
|
||||||
|
fileName=doc_file.name,
|
||||||
|
mimeType=mime_type,
|
||||||
|
fileSize=doc_file.stat().st_size,
|
||||||
|
roundNumber=1,
|
||||||
|
taskNumber=1,
|
||||||
|
actionNumber=1,
|
||||||
|
actionId=f"test_action_{i+1}"
|
||||||
|
)
|
||||||
|
documents.append(chat_doc)
|
||||||
|
logger.info(f"✅ Created ChatDocument: {chat_doc.fileName} ({chat_doc.mimeType}) - {chat_doc.fileSize} bytes")
|
||||||
|
|
||||||
|
logger.info(f"📄 Created {len(documents)} document objects")
|
||||||
|
|
||||||
|
# Create enhanced AI call options for intelligent chunked processing
|
||||||
|
ai_options = EnhancedAiCallOptions(
|
||||||
|
operationType="general",
|
||||||
|
enableParallelProcessing=True,
|
||||||
|
maxConcurrentChunks=5, # Increased for better testing
|
||||||
|
preserveChunkMetadata=True,
|
||||||
|
chunkSeparator="\n\n---\n\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Call the main AI service directly - let it handle everything including DOCX generation
|
||||||
|
logger.info("🤖 Calling main AI service with intelligent merging...")
|
||||||
|
|
||||||
|
|
||||||
|
# Run a single end-to-end test to avoid the loop issue
|
||||||
|
logger.info("🧪 Running single end-to-end test...")
|
||||||
|
|
||||||
|
userPrompt = "Analyze the document containing mails for customer use cases. Can you create one file for each email in plain text format?"
|
||||||
|
|
||||||
|
# userPrompt = "Can you create one file for each section in the document"
|
||||||
|
|
||||||
|
# userPrompt = "Analyze these documents and create a fitting image for the content"
|
||||||
|
|
||||||
|
# userPrompt = "Extract the table from file and produce 2 lists in excel. one list with all entries, one list only with entries that are yellow highlighted."
|
||||||
|
|
||||||
|
# userPrompt = "Create a docx file containing a summary and the COMPLETE list from the pdf file, having one additional column with a 'x' marker for all items, which are yellow highlighted."
|
||||||
|
|
||||||
|
# userPrompt = "Create a docx file containing the combined documents in french language."
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Single AI call with DOCX generation
|
||||||
|
ai_response = await ai_service.callAi(
|
||||||
|
prompt=userPrompt,
|
||||||
|
documents=documents,
|
||||||
|
options=ai_options,
|
||||||
|
outputFormat="txt",
|
||||||
|
title="Kunden und Use Cases"
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(f"✅ End-to-end test completed successfully")
|
||||||
|
logger.info(f"📊 Response type: {type(ai_response)}")
|
||||||
|
logger.info(f"📊 Response length: {len(str(ai_response))} characters")
|
||||||
|
|
||||||
|
# Single test result
|
||||||
|
test_results = [{
|
||||||
|
"test_name": "End-to-End DOCX Generation",
|
||||||
|
"success": True,
|
||||||
|
"response_type": type(ai_response).__name__,
|
||||||
|
"response_length": len(str(ai_response)),
|
||||||
|
"response": ai_response
|
||||||
|
}]
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ End-to-end test failed: {str(e)}")
|
||||||
|
test_results = [{
|
||||||
|
"test_name": "End-to-End DOCX Generation",
|
||||||
|
"success": False,
|
||||||
|
"error": str(e),
|
||||||
|
"response": None
|
||||||
|
}]
|
||||||
|
|
||||||
|
logger.info(f"🎯 Completed 1 end-to-end test")
|
||||||
|
|
||||||
|
# Process all test results and save outputs
|
||||||
|
logger.info("📊 Processing test results...")
|
||||||
|
|
||||||
|
successful_tests = [r for r in test_results if r['success']]
|
||||||
|
failed_tests = [r for r in test_results if not r['success']]
|
||||||
|
|
||||||
|
logger.info(f"✅ Successful tests: {len(successful_tests)}")
|
||||||
|
logger.info(f"❌ Failed tests: {len(failed_tests)}")
|
||||||
|
|
||||||
|
# Display test results summary
|
||||||
|
logger.info("=" * 80)
|
||||||
|
logger.info("END-TO-END TEST RESULTS SUMMARY")
|
||||||
|
logger.info("=" * 80)
|
||||||
|
for i, result in enumerate(test_results, 1):
|
||||||
|
status = "✅ PASS" if result['success'] else "❌ FAIL"
|
||||||
|
logger.info(f"Test {i}: {result['test_name']} - {status}")
|
||||||
|
if result['success']:
|
||||||
|
logger.info(f" Response Type: {result['response_type']}")
|
||||||
|
logger.info(f" Response Length: {result['response_length']} characters")
|
||||||
|
else:
|
||||||
|
logger.info(f" Error: {result['error']}")
|
||||||
|
logger.info("=" * 80)
|
||||||
|
|
||||||
|
# Create output directory if it doesn't exist
|
||||||
|
output_dir = Path("test-chat/unittestoutput")
|
||||||
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Save all test results and generated files
|
||||||
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
|
||||||
|
logger.info("💾 Saving test results and generated files...")
|
||||||
|
|
||||||
|
try:
|
||||||
|
for i, result in enumerate(successful_tests, 1):
|
||||||
|
test_name = result['test_name'].replace(' ', '_').lower()
|
||||||
|
response = result['response']
|
||||||
|
|
||||||
|
logger.info(f"💾 Saving Test {i}: {result['test_name']}")
|
||||||
|
|
||||||
|
# Handle different response types
|
||||||
|
if isinstance(response, dict):
|
||||||
|
# Document generation response
|
||||||
|
if 'documents' in response and response['documents']:
|
||||||
|
logger.info(f"📄 Found {len(response['documents'])} documents in response")
|
||||||
|
|
||||||
|
for j, doc in enumerate(response['documents']):
|
||||||
|
doc_name = doc.get('documentName', f'{test_name}_document_{j+1}')
|
||||||
|
doc_data = doc.get('documentData', '')
|
||||||
|
doc_mime = doc.get('mimeType', 'application/octet-stream')
|
||||||
|
|
||||||
|
logger.info(f"📄 Document {j+1}: {doc_name}")
|
||||||
|
logger.info(f"📄 MIME Type: {doc_mime}")
|
||||||
|
logger.info(f"📄 Data length: {len(doc_data)} characters")
|
||||||
|
|
||||||
|
# Determine file extension with better MIME type detection
|
||||||
|
file_ext = '.bin' # Default fallback
|
||||||
|
|
||||||
|
if doc_mime:
|
||||||
|
if 'docx' in doc_mime.lower() or 'wordprocessingml' in doc_mime.lower():
|
||||||
|
file_ext = '.docx'
|
||||||
|
elif 'pdf' in doc_mime.lower():
|
||||||
|
file_ext = '.pdf'
|
||||||
|
elif 'txt' in doc_mime.lower() or 'plain' in doc_mime.lower():
|
||||||
|
file_ext = '.txt'
|
||||||
|
elif 'html' in doc_mime.lower():
|
||||||
|
file_ext = '.html'
|
||||||
|
elif 'json' in doc_mime.lower():
|
||||||
|
file_ext = '.json'
|
||||||
|
elif 'csv' in doc_mime.lower():
|
||||||
|
file_ext = '.csv'
|
||||||
|
elif 'xlsx' in doc_mime.lower() or 'spreadsheetml' in doc_mime.lower():
|
||||||
|
file_ext = '.xlsx'
|
||||||
|
elif 'pptx' in doc_mime.lower() or 'presentationml' in doc_mime.lower():
|
||||||
|
file_ext = '.pptx'
|
||||||
|
elif 'markdown' in doc_mime.lower() or 'md' in doc_mime.lower():
|
||||||
|
file_ext = '.md'
|
||||||
|
elif 'png' in doc_mime.lower() or 'image' in doc_mime.lower():
|
||||||
|
file_ext = '.png'
|
||||||
|
elif 'jpg' in doc_mime.lower() or 'jpeg' in doc_mime.lower():
|
||||||
|
file_ext = '.jpg'
|
||||||
|
else:
|
||||||
|
logger.warning(f"⚠️ Unknown MIME type: {doc_mime}, using .bin")
|
||||||
|
|
||||||
|
# Also check filename for hints
|
||||||
|
if doc_name and '.' in doc_name:
|
||||||
|
name_ext = '.' + doc_name.split('.')[-1].lower()
|
||||||
|
if name_ext in ['.docx', '.pdf', '.txt', '.html', '.json', '.csv', '.xlsx', '.pptx', '.md', '.png', '.jpg', '.jpeg']:
|
||||||
|
file_ext = name_ext
|
||||||
|
logger.info(f"📄 Using extension from filename: {file_ext}")
|
||||||
|
|
||||||
|
logger.info(f"📄 Final file extension: {file_ext}")
|
||||||
|
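
                            # Illustrative mapping only (example MIME types assumed, not taken from a real response):
                            #   "application/vnd.openxmlformats-officedocument.wordprocessingml.document" -> ".docx"
                            #   "text/plain" -> ".txt", "image/png" -> ".png"; unrecognised types fall back to ".bin"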

                            # Save document
                            output_path = output_dir / f"{test_name}_{timestamp}{file_ext}"

                            # Handle different content types
                            if file_ext in ['.md', '.txt', '.html', '.json', '.csv']:
                                # Text-based formats - save directly as text
                                with open(output_path, 'w', encoding='utf-8') as f:
                                    f.write(doc_data)
                                logger.info(f"✅ Document saved as text: {output_path} ({len(doc_data)} characters)")
                            elif file_ext in ['.png', '.jpg', '.jpeg']:
                                # Image formats - decode from base64
                                try:
                                    doc_bytes = base64.b64decode(doc_data)
                                    with open(output_path, 'wb') as f:
                                        f.write(doc_bytes)
                                    logger.info(f"✅ Image saved: {output_path} ({len(doc_bytes)} bytes)")
                                except Exception as e:
                                    logger.warning(f"⚠️ Failed to decode image as base64: {e}")
                                    # Save as text if base64 decoding fails
                                    with open(output_path, 'w', encoding='utf-8') as f:
                                        f.write(doc_data)
                                    logger.info(f"✅ Image saved as text (fallback): {output_path}")
                            else:
                                # Other binary formats - decode from base64
                                try:
                                    doc_bytes = base64.b64decode(doc_data)
                                    with open(output_path, 'wb') as f:
                                        f.write(doc_bytes)
                                    logger.info(f"✅ Document saved as binary: {output_path} ({len(doc_bytes)} bytes)")
                                except Exception as e:
                                    logger.warning(f"⚠️ Failed to decode document as base64: {e}")
                                    # Save as text if base64 decoding fails
                                    with open(output_path, 'w', encoding='utf-8') as f:
                                        f.write(doc_data)
                                    logger.info(f"✅ Document saved as text (fallback): {output_path}")

                    # Also save raw content as text
                    content = response.get('content', '')
                    if content:
                        text_path = output_dir / f"{test_name}_content_{timestamp}.txt"
                        with open(text_path, 'w', encoding='utf-8') as f:
                            # Handle both string and dictionary content
                            if isinstance(content, dict):
                                import json
                                f.write(json.dumps(content, indent=2, ensure_ascii=False))
                            else:
                                f.write(str(content))
                        logger.info(f"✅ Content saved: {text_path}")

                elif isinstance(response, str):
                    # Text response
                    text_path = output_dir / f"{test_name}_response_{timestamp}.txt"
                    with open(text_path, 'w', encoding='utf-8') as f:
                        f.write(response)
                    logger.info(f"✅ Text response saved: {text_path}")

                else:
                    logger.warning(f"⚠️ Unknown response type for {result['test_name']}: {type(response)}")

            # Save failed test details
            if failed_tests:
                error_path = output_dir / f"failed_tests_{timestamp}.txt"
                with open(error_path, 'w', encoding='utf-8') as f:
                    f.write("# Failed Test Details\n\n")
                    for i, result in enumerate(failed_tests, 1):
                        f.write(f"## Test {i}: {result['test_name']}\n")
                        f.write(f"**Error:** {result['error']}\n\n")
                logger.info(f"✅ Failed test details saved: {error_path}")

        except Exception as e:
            logger.error(f"❌ Error saving test results: {str(e)}")
            return False

        # Save comprehensive test report
        report_path = output_dir / f"end_to_end_test_report_{timestamp}.txt"
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(f"# End-to-End AI Service Test Report\n")
            f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

            f.write(f"## Test Configuration\n")
            f.write(f"- Documents processed: {len(documents)}\n")
            f.write(f"- Processing method: Intelligent Token-Aware Merging\n")
            f.write(f"- Parallel processing: {ai_options.enableParallelProcessing}\n")
            f.write(f"- Max concurrent chunks: {ai_options.maxConcurrentChunks}\n")
            f.write(f"- Chunk metadata preserved: {ai_options.preserveChunkMetadata}\n")
            f.write(f"- Chunk separator: '{ai_options.chunkSeparator}'\n\n")

            f.write(f"## Document Inventory\n")
            for i, doc in enumerate(documents, 1):
                f.write(f"{i}. **{doc.fileName}**\n")
                f.write(f"   - MIME Type: {doc.mimeType}\n")
                f.write(f"   - File Size: {doc.fileSize:,} bytes\n")
                f.write(f"   - File ID: {doc.fileId}\n\n")

            f.write(f"## Test Results Summary\n")
            f.write(f"- Total Tests: {len(test_results)}\n")
            f.write(f"- Successful: {len(successful_tests)}\n")
            f.write(f"- Failed: {len(failed_tests)}\n")
            f.write(f"- Success Rate: {len(successful_tests)/len(test_results)*100:.1f}%\n\n")

            f.write(f"## Detailed Test Results\n")
            for i, result in enumerate(test_results, 1):
                f.write(f"### Test {i}: {result['test_name']}\n")
                f.write(f"**Status:** {'✅ PASS' if result['success'] else '❌ FAIL'}\n")

                if result['success']:
                    f.write(f"**Response Type:** {result['response_type']}\n")
                    f.write(f"**Response Length:** {result['response_length']} characters\n")

                    # Show response preview
                    response_preview = str(result['response'])[:500]
                    f.write(f"**Response Preview:**\n```\n{response_preview}...\n```\n\n")
                else:
                    f.write(f"**Error:** {result['error']}\n\n")

            f.write(f"## Technical Implementation Details\n")
            f.write(f"This test validates the complete AI service pipeline:\n\n")
            f.write(f"### Tested Components:\n")
            f.write(f"- **Document Extraction**: PDF, DOCX, images, etc.\n")
            f.write(f"- **Intelligent Chunking**: Token-aware merging\n")
            f.write(f"- **Model Selection**: Automatic AI model choice\n")
            f.write(f"- **Parallel Processing**: Concurrent chunk processing\n")
            f.write(f"- **Document Generation**: DOCX, PDF, text output\n")
            f.write(f"- **Error Handling**: Graceful failure management\n\n")

            f.write(f"### Performance Metrics:\n")
            f.write(f"- **Chunk Optimization**: Intelligent merging reduces AI calls\n")
            f.write(f"- **Processing Speed**: Parallel execution\n")
            f.write(f"- **Memory Efficiency**: Token-aware chunking\n")
            f.write(f"- **Output Quality**: Multiple format support\n\n")

            f.write(f"## Generated Files\n")
            for i, result in enumerate(successful_tests, 1):
                test_name = result['test_name'].replace(' ', '_').lower()
                f.write(f"- **Test {i}**: {result['test_name']} → `{test_name}_*_{timestamp}.*`\n")

            if failed_tests:
                f.write(f"- **Failed Tests**: `failed_tests_{timestamp}.txt`\n")

            f.write(f"- **This Report**: `end_to_end_test_report_{timestamp}.txt`\n\n")

            f.write(f"The end-to-end test successfully validates the complete AI service\n")
            f.write(f"pipeline from document input to formatted output generation.\n")

        logger.info(f"✅ Comprehensive test report saved: {report_path}")

        # Show debug file locations
        debug_files = []
        try:
            debug_dir = Path("test-chat")
            if debug_dir.exists():
                debug_files.extend(list(debug_dir.glob("*.log")))
                debug_files.extend(list(debug_dir.glob("ai/*.txt")))

            if debug_files:
                logger.info("📁 Debug files created:")
                for debug_file in debug_files:
                    logger.info(f"  - {debug_file}")
            else:
                logger.info("📁 No debug files found in test-chat directory")
        except Exception as e:
            logger.warning(f"Could not list debug files: {e}")

        # Restore original database interface
        db_interface_module.getInterface = original_get_interface

        return True

    except Exception as e:
        logger.error(f"❌ Error during document processing: {str(e)}")
        import traceback
        logger.error(f"Traceback: {traceback.format_exc()}")

        # Restore original database interface in case of error
        try:
            db_interface_module.getInterface = original_get_interface
        except:
            pass

        return False


async def main():
    """Main function to run the intelligent chunk integration test."""
    logger.info("🎯 Starting Intelligent Chunk Integration Test")
    logger.info("=" * 60)

    success = await process_documents_and_generate_summary()

    if success:
        logger.info("🎉 Intelligent chunk integration test completed successfully!")
        logger.info("✅ Main AI service handled all processing internally")
        logger.info("✅ Intelligent token-aware merging activated")
        logger.info("✅ DOCX document generated directly by AI service")
        logger.info("✅ Detailed chunk integration analysis saved")
        logger.info("✅ Performance optimization achieved")
    else:
        logger.error("❌ Test failed!")
        logger.error("Please check the error messages above for details")

    logger.info("=" * 60)


if __name__ == "__main__":
    asyncio.run(main())
422
tool_security_encrypt_all_env_files.py
Normal file
@ -0,0 +1,422 @@
#!/usr/bin/env python3
"""
Tool for encrypting all *_SECRET variables in all environment files.

This tool automatically processes all three environment files (dev, int, prod)
and encrypts any unencrypted *_SECRET variables using the appropriate encryption
keys for each environment.

Usage:
    # Encrypt all secrets in all environment files
    python tool_security_encrypt_all_env_files.py

    # Dry run - show what would be changed without making changes
    python tool_security_encrypt_all_env_files.py --dry-run

    # Skip backup creation
    python tool_security_encrypt_all_env_files.py --no-backup

    # Process only specific environment files
    python tool_security_encrypt_all_env_files.py --files env_dev.env env_prod.env
"""

import sys
import os
import argparse
import shutil
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any

# Add the modules directory to the Python path
current_dir = Path(__file__).parent
modules_dir = current_dir / 'modules'
if modules_dir.exists():
    sys.path.insert(0, str(modules_dir))
else:
    print(f"Error: Modules directory not found: {modules_dir}")
    print(f"Make sure you're running this script from the gateway directory")
    sys.exit(1)

# Import encryption functions
try:
    from modules.shared.configuration import encrypt_value
except ImportError as e:
    print(f"Error: Could not import encryption functions from shared.configuration: {e}")
    print(f"Make sure you're running this script from the gateway directory")
    print(f"Modules directory: {modules_dir}")
    sys.exit(1)


def get_env_type_from_file(file_path: Path) -> str:
    """
    Read the APP_ENV_TYPE from the environment file.

    Args:
        file_path: Path to the environment file

    Returns:
        str: The environment type (dev, int, prod) or 'dev' as default
    """
    if not file_path.exists():
        return 'dev'

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('APP_ENV_TYPE') and '=' in line:
                    _, value = line.split('=', 1)
                    return value.strip().lower()
    except Exception as e:
        print(f"Warning: Could not read APP_ENV_TYPE from {file_path}: {e}")

    return 'dev'
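
# Example (illustrative env file contents, not taken from a real project file):
#   a line such as "APP_ENV_TYPE = int" makes get_env_type_from_file() return "int";
#   if the file is missing or has no APP_ENV_TYPE entry, the function falls back to 'dev'.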


def is_any_encrypted_value(value: str) -> bool:
    """
    Check if a value has any encryption prefix (DEV_ENC:, INT_ENC:, PROD_ENC:, etc.).

    Args:
        value: The value to check

    Returns:
        bool: True if the value has any encryption prefix, False otherwise
    """
    if not value or not isinstance(value, str):
        return False

    # Check for any environment-specific encryption prefixes
    return (value.startswith('DEV_ENC:') or
            value.startswith('INT_ENC:') or
            value.startswith('PROD_ENC:') or
            value.startswith('TEST_ENC:') or
            value.startswith('STAGING_ENC:'))
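
# Examples (values are made up for illustration):
#   is_any_encrypted_value("DEV_ENC:Z0FBQUFB...")  -> True   (already encrypted, skipped)
#   is_any_encrypted_value("my-plain-secret")      -> False  (picked up for encryption)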


def find_secret_keys_in_file(file_path: Path) -> list:
    """
    Find all *_SECRET keys in an environment file that are not encrypted.

    Args:
        file_path: Path to the environment file

    Returns:
        list: List of tuples (line_number, key, value, full_line)
    """
    secret_keys = []

    if not file_path.exists():
        return secret_keys

    # Get the environment type from the file itself
    file_env_type = get_env_type_from_file(file_path)

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        i = 0
        while i < len(lines):
            line = lines[i].strip()

            # Skip empty lines and comments
            if not line or line.startswith('#'):
                i += 1
                continue

            # Check if line contains a key-value pair
            if '=' in line:
                key, value = line.split('=', 1)
                key = key.strip()
                value = value.strip()

                # Check if it's a secret key and not already encrypted with ANY prefix
                if key.endswith('_SECRET') and value and not is_any_encrypted_value(value):
                    # Check if value starts with { (JSON object)
                    if value.startswith('{'):
                        # Collect all lines until we find the closing }
                        json_lines = [value]
                        start_line = i + 1
                        i += 1
                        brace_count = value.count('{') - value.count('}')

                        while i < len(lines) and brace_count > 0:
                            json_lines.append(lines[i].rstrip('\n'))
                            brace_count += lines[i].count('{') - lines[i].count('}')
                            i += 1

                        # Join all lines and create the full JSON value
                        full_json_value = '\n'.join(json_lines)
                        secret_keys.append((start_line, key, full_json_value, line))
                        i -= 1  # Adjust for the loop increment
                    else:
                        # Single line value
                        secret_keys.append((i + 1, key, value, line))
                # Check if it's a secret key with multiline JSON (value is just "{")
                elif key.endswith('_SECRET') and value == '{' and not is_any_encrypted_value(value):
                    # Collect all lines until we find the closing }
                    json_lines = [value]
                    start_line = i + 1
                    i += 1
                    brace_count = 1  # We already have one opening brace

                    while i < len(lines) and brace_count > 0:
                        json_lines.append(lines[i].rstrip('\n'))
                        brace_count += lines[i].count('{') - lines[i].count('}')
                        i += 1

                    # Join all lines and create the full JSON value
                    full_json_value = '\n'.join(json_lines)
                    secret_keys.append((start_line, key, full_json_value, line))
                    i -= 1  # Adjust for the loop increment

            i += 1

    except Exception as e:
        print(f"Error reading {file_path}: {e}")

    return secret_keys
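
# Example return value (hypothetical file contents):
#   a line "Connector_Foo_API_SECRET = plain-value" on file line 12 yields the tuple
#   (12, 'Connector_Foo_API_SECRET', 'plain-value', 'Connector_Foo_API_SECRET = plain-value');
#   a multiline JSON secret (value starting with "{") is joined with '\n' into a single value.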


def backup_file(file_path: Path) -> Path:
    """
    Create a backup of the file before modification.

    Args:
        file_path: Path to the file to backup

    Returns:
        Path: Path to the backup file
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_path = file_path.with_suffix(f'.{timestamp}.backup')
    shutil.copy2(file_path, backup_path)
    return backup_path
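
# Example (timestamp is illustrative): backing up Path('env_dev.env') at 2025-01-01 12:00:00
# produces a sibling file named 'env_dev.20250101_120000.backup'.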


def encrypt_all_secrets_in_file(file_path: Path, dry_run: bool = False, create_backup: bool = True) -> Dict[str, Any]:
    """
    Encrypt all non-encrypted secrets in a file.

    Args:
        file_path: Path to the environment file
        dry_run: If True, only show what would be changed
        create_backup: If True, create a backup before modifying

    Returns:
        dict: Results of the encryption process
    """
    # Get the environment type from the file itself
    file_env_type = get_env_type_from_file(file_path)

    results = {
        'file': str(file_path),
        'env_type': file_env_type,
        'secrets_found': 0,
        'secrets_encrypted': 0,
        'errors': [],
        'backup_created': None
    }

    # Find all secret keys
    secret_keys = find_secret_keys_in_file(file_path)
    results['secrets_found'] = len(secret_keys)

    if not secret_keys:
        print(f"  ✅ No unencrypted secrets found - all values already have encryption prefixes")
        return results

    print(f"  Found {len(secret_keys)} non-encrypted secrets")

    if dry_run:
        print("  [DRY RUN] Would encrypt the following secrets:")
        for line_num, key, value, full_line in secret_keys:
            print(f"    Line {line_num}: {key} = {value[:50]}{'...' if len(value) > 50 else ''}")
        return results

    # Create backup if requested
    if create_backup:
        try:
            backup_path = backup_file(file_path)
            results['backup_created'] = str(backup_path)
            print(f"  📋 Backup created: {backup_path.name}")
        except Exception as e:
            results['errors'].append(f"Failed to create backup: {e}")
            print(f"  ⚠️ Warning: Could not create backup: {e}")

    # Read the file content
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except Exception as e:
        results['errors'].append(f"Failed to read file: {e}")
        return results

    # Process each secret key
    for line_num, key, value, full_line in secret_keys:
        try:
            print(f"  🔐 Encrypting {key}...")

            # Encrypt the value using the environment type from the file
            encrypted_value = encrypt_value(value, file_env_type)

            # Replace the line in the file content
            new_line = f"{key} = {encrypted_value}\n"
            lines[line_num - 1] = new_line

            # If this was a multiline JSON, we need to remove the remaining lines
            if value.startswith('{') and '\n' in value:
                # Count how many lines the original JSON spanned
                json_lines = value.split('\n')
                lines_to_remove = len(json_lines) - 1  # -1 because we already replaced the first line

                # Remove the remaining lines
                for i in range(line_num, line_num + lines_to_remove):
                    if i < len(lines):
                        lines[i] = ""

            results['secrets_encrypted'] += 1
            print(f"  ✓ Encrypted successfully")

        except Exception as e:
            error_msg = f"Failed to encrypt {key}: {e}"
            results['errors'].append(error_msg)
            print(f"  ✗ {error_msg}")

    # Write the modified content back to the file
    if results['secrets_encrypted'] > 0:
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.writelines(lines)
            print(f"  💾 File updated successfully")
        except Exception as e:
            results['errors'].append(f"Failed to write file: {e}")
            print(f"  ✗ Failed to write file: {e}")

    return results
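
# Illustrative before/after for a single line (key and ciphertext are made up, and the exact
# prefix depends on what encrypt_value() returns for the file's APP_ENV_TYPE):
#   before: Connector_Foo_API_SECRET = my-plain-secret
#   after:  Connector_Foo_API_SECRET = DEV_ENC:Z0FBQUFB...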


def process_all_env_files(env_files: List[str] = None, dry_run: bool = False, create_backup: bool = True) -> Dict[str, Any]:
    """
    Process all environment files and encrypt unencrypted secrets.

    Args:
        env_files: List of specific files to process (if None, processes all three default files)
        dry_run: If True, only show what would be changed
        create_backup: If True, create backups before modifying

    Returns:
        dict: Summary of all processing results
    """
    # Default environment files if none specified
    if env_files is None:
        env_files = ['env_dev.env', 'env_int.env', 'env_prod.env']

    # Convert to Path objects and check if they exist
    env_paths = []
    for env_file in env_files:
        env_path = Path(env_file)
        if not env_path.exists():
            print(f"⚠️ Warning: Environment file not found: {env_file}")
            continue
        env_paths.append(env_path)

    if not env_paths:
        print("❌ No valid environment files found to process")
        return {'total_files': 0, 'total_secrets_found': 0, 'total_secrets_encrypted': 0, 'total_errors': 0, 'files': []}

    print("🔐 PowerOn Batch Secret Encryption Tool")
    print("=" * 60)
    print("⚠️ IMPORTANT: The tool will read APP_ENV_TYPE from each file itself")
    print("⚠️ Each file will be processed with its own environment-specific encryption")
    print()

    if dry_run:
        print("🔍 DRY RUN MODE - No changes will be made")
        print()

    # Process each file
    all_results = []
    total_secrets_found = 0
    total_secrets_encrypted = 0
    total_errors = 0

    for env_path in env_paths:
        print(f"\n📁 Processing {env_path.name}:")
        results = encrypt_all_secrets_in_file(env_path, dry_run, create_backup)
        all_results.append(results)

        total_secrets_found += results['secrets_found']
        total_secrets_encrypted += results['secrets_encrypted']
        total_errors += len(results['errors'])

    # Summary
    print("\n" + "=" * 60)
    print("📊 SUMMARY")
    print("=" * 60)
    print(f"Files processed: {len(env_paths)}")
    print(f"Total secrets found: {total_secrets_found}")

    if not dry_run:
        print(f"Total secrets encrypted: {total_secrets_encrypted}")
        print(f"Total errors: {total_errors}")

        if total_errors == 0 and total_secrets_encrypted > 0:
            print("\n🎉 All secrets encrypted successfully!")
        elif total_errors > 0:
            print(f"\n⚠️ Completed with {total_errors} errors")
        else:
            print("\n✅ No secrets needed encryption")
    else:
        print(f"Secrets that would be encrypted: {total_secrets_found}")

    # Show backup information
    backups_created = [r['backup_created'] for r in all_results if r['backup_created']]
    if backups_created:
        print(f"\n📋 Backups created: {len(backups_created)}")
        for backup in backups_created:
            print(f"  - {Path(backup).name}")

    # Show errors if any
    all_errors = []
    for results in all_results:
        all_errors.extend(results['errors'])

    if all_errors:
        print(f"\n❌ Errors encountered:")
        for error in all_errors:
            print(f"  - {error}")

    return {
        'total_files': len(env_paths),
        'total_secrets_found': total_secrets_found,
        'total_secrets_encrypted': total_secrets_encrypted,
        'total_errors': total_errors,
        'files': all_results
    }
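
# Programmatic use is also possible (mirrors what main() does below; file names are examples):
#   summary = process_all_env_files(env_files=['env_dev.env'], dry_run=True, create_backup=False)
#   print(summary['total_secrets_found'], summary['total_errors'])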


def main():
    parser = argparse.ArgumentParser(description='Encrypt all *_SECRET variables in all environment files')
    parser.add_argument('--files', '-f', nargs='+',
                        help='Specific environment files to process (default: all three env files)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be changed without making changes')
    parser.add_argument('--no-backup', action='store_true',
                        help='Skip creating backup files')

    args = parser.parse_args()

    try:
        results = process_all_env_files(
            env_files=args.files,
            dry_run=args.dry_run,
            create_backup=not args.no_backup
        )

        # Return appropriate exit code
        if results['total_errors'] > 0:
            return 1
        return 0

    except Exception as e:
        print(f"Error: {e}")
        return 1


if __name__ == '__main__':
    sys.exit(main())