Merge branch 'int' into feat/chatbot
This commit is contained in:
commit
57118a633e
93 changed files with 13774 additions and 3691 deletions
|
|
@ -30,3 +30,8 @@ Web_Search_MIN_RESULTS = 1
|
|||
Web_Crawl_TIMEOUT = 30
|
||||
Web_Crawl_MAX_RETRIES = 3
|
||||
Web_Crawl_RETRY_DELAY = 2
|
||||
|
||||
# Web Research configuration
|
||||
Web_Research_MAX_DEPTH = 2
|
||||
Web_Research_MAX_LINKS_PER_DOMAIN = 4
|
||||
Web_Research_CRAWL_TIMEOUT_MINUTES = 10
|
||||
|
|
@ -66,14 +66,14 @@ Connector_AiAnthropic_MAX_TOKENS = 2000
|
|||
|
||||
# Perplexity AI configuration
|
||||
Connector_AiPerplexity_API_URL = https://api.perplexity.ai/chat/completions
|
||||
# SECURITY(review): a plaintext Perplexity API key was committed on this line and
# has been redacted — rotate that credential. The encrypted DEV_ENC value defined
# immediately below is the effective setting for this key.
# Connector_AiPerplexity_API_SECRET = pplx-<REDACTED-ROTATE-ME>
|
||||
Connector_AiPerplexity_API_SECRET = DEV_ENC:Z0FBQUFBQm82Mzk2Q1MwZ0dNcUVBcUtuRDJIcTZkMXVvYnpjM3JEMzJiT1NKSHljX282ZDIyZTJYc09VSTdVNXAtOWU2UXp5S193NTk5dHJsWlFjRjhWektFOG1DVGY4ZUhHTXMzS0RPN1lNcF9nSlVWbW5BZ1hkZDVTejl6bVZNRFVvX29xamJidWRFMmtjQmkyRUQ2RUh6UTN1aWNPSUJBPT0=
|
||||
Connector_AiPerplexity_MODEL_NAME = sonar
|
||||
Connector_AiPerplexity_TEMPERATURE = 0.2
|
||||
Connector_AiPerplexity_MAX_TOKENS = 2000
|
||||
|
||||
# Agent Mail configuration
|
||||
Service_MSFT_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c
|
||||
Service_MSFT_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpEQk4xYnpmbnItUEU3dHU4eHB5dzVYay1WT012RTRLUWJDTlBILVY5dC1FX3VMNjZmLThrbDRFNWFSNGprY3RRTlpYNGlubVBpNnY3MjNJcGtzVk9PMzRacl9LUlM2RU5vTVVZWHJvaUhWSHVfc1pNR0pfQmI5SEprOG5KdlB1QnQ=
|
||||
Service_MSFT_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQm83T29rV1pQelMtc1p1MXR4NTFpa19CTEhHQ0xfNmdPUmZqcWp5UHBMS0hYTGl4c1pPdmhTNTJVWUl5WnlnUUZhV0VTRzVCb0d5YjR1NnZPZk5CZ0dGazNGdUJVbjkxeVdrYlNiVjJUYzF2aVFtQnVxTHFqTTJqZlF0RTFGNmE1OGN1TEk=
|
||||
Service_MSFT_TENANT_ID = common
|
||||
|
||||
# Google Service configuration
|
||||
|
|
@ -88,3 +88,7 @@ Connector_GoogleSpeech_API_KEY_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpETk5FWWM3Q0JKMzhI
|
|||
|
||||
# Feature SyncDelta JIRA configuration
|
||||
Feature_SyncDelta_JIRA_DELTA_TOKEN_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpEbm0yRUJ6VUJKbUwyRW5kMnRaNW4wM2YxMkJUTXVXZUdmdVRCaUZIVHU2TTV2RWZLRmUtZkcwZE4yRUNlNDQ0aUJWYjNfdVg5YjV5c2JwMHhoUUYxZWdkeS11bXR0eGxRLWRVaVU3cUVQZWJlNDRtY1lWUDdqeDVFSlpXS0VFX21WajlRS3lHQjc0bS11akkybWV3QUFlR2hNWUNYLUdiRjZuN2dQODdDSExXWG1Dd2ZGclI2aUhlSWhETVZuY3hYdnhkb2c2LU1JTFBvWFpTNmZtMkNVOTZTejJwbDI2eGE0OS1xUlIwQnlCSmFxRFNCeVJNVzlOMDhTR1VUamx4RDRyV3p6Tk9qVHBrWWdySUM3TVRaYjd3N0JHMFhpdzFhZTNDLTFkRVQ2RVE4U19COXRhRWtNc0NVOHRqUS1CRDFpZ19xQmtFLU9YSDU3TXBZQXpVcld3PT0=
|
||||
|
||||
# Debug Configuration
|
||||
APP_DEBUG_CHAT_WORKFLOW_ENABLED = True
|
||||
APP_DEBUG_CHAT_WORKFLOW_DIR = ./test-chat
|
||||
|
|
@ -66,14 +66,14 @@ Connector_AiAnthropic_MAX_TOKENS = 2000
|
|||
|
||||
# Perplexity AI configuration
|
||||
Connector_AiPerplexity_API_URL = https://api.perplexity.ai/chat/completions
|
||||
# SECURITY(review): a plaintext Perplexity API key was committed on this line and
# has been redacted — rotate that credential. The encrypted INT_ENC value defined
# immediately below is the effective setting for this key.
# Connector_AiPerplexity_API_SECRET = pplx-<REDACTED-ROTATE-ME>
|
||||
Connector_AiPerplexity_API_SECRET = INT_ENC:Z0FBQUFBQm82Mzk2UWZJdUFhSW8yc3RKc0tKRXphd0xWMkZOVlFpSGZ4SGhFWnk0cTF5VjlKQVZjdS1QSWdkS0pUSWw4OFU5MjUxdTVQel9aeWVIZTZ5TXRuVmFkZG0zWEdTOGdHMHpsTzI0TGlWYURKU1Q0VVpKTlhxUk5FTmN6SUJScDZ3ZldIaUJZcWpaQVRiSEpyQm9tRTNDWk9KTnZBPT0=
|
||||
Connector_AiPerplexity_MODEL_NAME = sonar
|
||||
Connector_AiPerplexity_TEMPERATURE = 0.2
|
||||
Connector_AiPerplexity_MAX_TOKENS = 2000
|
||||
|
||||
# Agent Mail configuration
|
||||
Service_MSFT_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c
|
||||
Service_MSFT_CLIENT_SECRET = INT_ENC:Z0FBQUFBQm8xSVRjNzB2M3ZjaE1SVE9ON2FKam9yVURxcHl1Ym5VNVUtS0MyWUpNVXVlaWpWS2U3VVd3em9vQl9lcnVYay03bS04YjNBbDZZNTB4eUtjT3ppQjJjY3dOT0FNLW9LeDhIUU5iaTNqNURUWE5La3kzaHNGcU9yNVI0YjhWZTZRRFktcTk=
|
||||
Service_MSFT_CLIENT_SECRET = INT_ENC:Z0FBQUFBQm83T29rMDZvcV9qTG5xb1FzUkdqS1llbzRxSEJXbmpONFFtcUtfZXdtZjQybmJSMjBjMEpnRVhiOGRuczZvVFBFdVVTQV80SG9PSnRQTEpLdVViNm5wc2E5aGRLWjZ4TGF1QjVkNmdRSzBpNWNkYXVublFYclVEdEM5TVBBZWVVMW5RVWk=
|
||||
Service_MSFT_TENANT_ID = common
|
||||
|
||||
# Google Service configuration
|
||||
|
|
@ -88,3 +88,7 @@ Connector_GoogleSpeech_API_KEY_SECRET = INT_ENC:Z0FBQUFBQm8xSVRkNmVXZ1pWcHcydTF2
|
|||
|
||||
# Feature SyncDelta JIRA configuration
|
||||
Feature_SyncDelta_JIRA_DELTA_TOKEN_SECRET = INT_ENC:Z0FBQUFBQm8xSVRkTUNsWm4wX0p6eXFDZmJ4dFdHNEs1MV9MUzdrb3RzeC1jVWVYZ0REWHRyZkFiaGZLcUQtTXFBZzZkNzRmQ0gxbEhGbUNlVVFfR1JEQTc0aldkZkgyWnBOcjdlUlZxR0tDTEdKRExULXAyUEtsVmNTMkRKU1BJNnFiM0hlMXo4YndMcHlRMExtZDQ3Zm9vNFhMcEZCcHpBPT0=
|
||||
|
||||
# Debug Configuration
|
||||
APP_DEBUG_CHAT_WORKFLOW_ENABLED = FALSE
|
||||
APP_DEBUG_CHAT_WORKFLOW_DIR = ./test-chat
|
||||
|
|
@ -66,14 +66,14 @@ Connector_AiAnthropic_MAX_TOKENS = 2000
|
|||
|
||||
# Perplexity AI configuration
|
||||
Connector_AiPerplexity_API_URL = https://api.perplexity.ai/chat/completions
|
||||
# SECURITY(review): a plaintext Perplexity API key was committed on this line and
# has been redacted — rotate that credential. The encrypted PROD_ENC value defined
# immediately below is the effective setting for this key.
# Connector_AiPerplexity_API_SECRET = pplx-<REDACTED-ROTATE-ME>
|
||||
Connector_AiPerplexity_API_SECRET = PROD_ENC:Z0FBQUFBQm82Mzk2Q1FGRkJEUkI4LXlQbHYzT2RkdVJEcmM4WGdZTWpJTEhoeUF1NW5LUVpJdDBYN3k1WFN4a2FQSWJSQmd0U0xJbzZDTmFFN05FcXl0Z3V1OEpsZjYydV94TXVjVjVXRTRYSWdLMkd5XzZIbFV6emRCZHpuOUpQeThadE5xcDNDVGV1RHJrUEN0c1BBYXctZFNWcFRuVXhRPT0=
|
||||
Connector_AiPerplexity_MODEL_NAME = sonar
|
||||
Connector_AiPerplexity_TEMPERATURE = 0.2
|
||||
Connector_AiPerplexity_MAX_TOKENS = 2000
|
||||
|
||||
# Agent Mail configuration
|
||||
Service_MSFT_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c
|
||||
Service_MSFT_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQm8xSU5pVEhHdlZHU3FNMmhuRGVwaGc3YzIxSjlZNzBCQjlOV2pSYVNXb0t1ZnVwQzZsQzY4cHMtVlZtNF85OEVaV1BMTzdXMmpzaGZpaG1DalJ0bkNPMHA5ZUcwZjNDdGk1TFdxYTJSZnVrVmhhZ2VRUEZxbjJOOGFhWk9EYlY3dmRVTnI=
|
||||
Service_MSFT_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQm83T29rSzdYLTRydXN5V3lQLXhmRjMyQ1FOaGpuek45QllaX1REN2s5aWNIUl81NGlrYlJTeFV0RlRZd0xPcm5uMDM4QlpibHJQbm5XZTlWeWxfcWNVdFpCUHI2amh0MVBnZ21IN2ptSkhWLTVfaHEwNmI5SEtiS05pQmt5eV8yMnhLMEc=
|
||||
Service_MSFT_TENANT_ID = common
|
||||
|
||||
# Google Service configuration
|
||||
|
|
@ -88,3 +88,7 @@ Connector_GoogleSpeech_API_KEY_SECRET = PROD_ENC:Z0FBQUFBQm8xSU5pNjlJdmFMeERXUUQ
|
|||
|
||||
# Feature SyncDelta JIRA configuration
|
||||
Feature_SyncDelta_JIRA_DELTA_TOKEN_SECRET = PROD_ENC:Z0FBQUFBQm8xSU5pTDhnTVNzRUhScU8wYnZsZk52bHFkSWxLc18xQmtCeC1HbnNwTzVBbXRNTmQzRjZYaGE2MVlCNGtnWDk1T2I5VXVKNHpKU1VRbXEyN2tRWUJnU2ltZE5qZ3lmNEF6Z1hMTTEwZkk2NUNBYjhmVTJEcWpRUW9HNEVpSGFWdjBWQXQ3eUtHUTFJS3U5QWpaeno0RFNhMUxnPT0=
|
||||
|
||||
# Debug Configuration
|
||||
APP_DEBUG_CHAT_WORKFLOW_ENABLED = FALSE
|
||||
APP_DEBUG_CHAT_WORKFLOW_DIR = ./test-chat
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
import logging
|
||||
import httpx
|
||||
import os
|
||||
from typing import Dict, Any, List, Union
|
||||
from fastapi import HTTPException
|
||||
from modules.shared.configuration import APP_CONFIG
|
||||
|
|
@ -147,6 +148,11 @@ class AiAnthropic:
|
|||
# Direct content as string (in older API versions)
|
||||
content = anthropicResponse["content"]
|
||||
|
||||
# Debug logging for empty responses
|
||||
if not content or content.strip() == "":
|
||||
logger.warning(f"Anthropic API returned empty content. Full response: {anthropicResponse}")
|
||||
content = "[Anthropic API returned empty response]"
|
||||
|
||||
# Return in OpenAI format
|
||||
return {
|
||||
"id": anthropicResponse.get("id", ""),
|
||||
|
|
@ -182,14 +188,27 @@ class AiAnthropic:
|
|||
The analysis response as text
|
||||
"""
|
||||
try:
|
||||
# Debug logging
|
||||
logger.info(f"callAiImage called with imageData type: {type(imageData)}, length: {len(imageData) if imageData else 0}, mimeType: {mimeType}")
|
||||
|
||||
# Distinguish between file path and binary data
|
||||
if isinstance(imageData, str):
|
||||
# Check if it's base64 encoded data or a file path
|
||||
if len(imageData) > 100 and not os.path.exists(imageData):
|
||||
# It's likely base64 encoded data
|
||||
logger.info("Treating imageData as base64 encoded string")
|
||||
base64Data = imageData
|
||||
if not mimeType:
|
||||
mimeType = "image/png"
|
||||
else:
|
||||
# It's a file path - import filehandling only when needed
|
||||
logger.info(f"Treating imageData as file path: {imageData}")
|
||||
from modules import agentserviceFilemanager as fileHandler
|
||||
base64Data, autoMimeType = fileHandler.encodeFileToBase64(imageData)
|
||||
mimeType = mimeType or autoMimeType
|
||||
else:
|
||||
# It's binary data
|
||||
logger.info("Treating imageData as binary data")
|
||||
import base64
|
||||
base64Data = base64.b64encode(imageData).decode('utf-8')
|
||||
# MIME type must be specified for binary data
|
||||
|
|
@ -216,8 +235,16 @@ class AiAnthropic:
|
|||
# Use the existing callAiBasic function with the Vision model
|
||||
response = await self.callAiBasic(messages)
|
||||
|
||||
# Extract and return content
|
||||
return response["choices"][0]["message"]["content"]
|
||||
# Extract and return content with proper error handling
|
||||
try:
|
||||
content = response["choices"][0]["message"]["content"]
|
||||
if content is None or content.strip() == "":
|
||||
return "[AI returned empty response for image analysis]"
|
||||
return content
|
||||
except (KeyError, IndexError, TypeError) as e:
|
||||
logger.error(f"Error extracting content from AI response: {str(e)}")
|
||||
logger.error(f"Response structure: {response}")
|
||||
return f"[Error extracting AI response: {str(e)}]"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during image analysis: {str(e)}", exc_info=True)
|
||||
|
|
|
|||
|
|
@ -189,3 +189,82 @@ class AiOpenai:
|
|||
except Exception as e:
|
||||
logger.error(f"Error during image analysis: {str(e)}", exc_info=True)
|
||||
return f"[Error during image analysis: {str(e)}]"
|
||||
|
||||
async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid") -> Dict[str, Any]:
    """
    Generate an image using DALL-E 3.

    Args:
        prompt: The text prompt for image generation
        size: Image size (1024x1024, 1792x1024, or 1024x1792)
        quality: Image quality (standard or hd)
        style: Image style (vivid or natural)

    Returns:
        Dictionary with success status and image data. On any failure the dict
        carries "success": False plus an "error" message — this method never
        raises to the caller.
    """
    try:
        logger.debug(f"Starting image generation with prompt: '{prompt[:100]}...'")

        # DALL-E 3 API endpoint
        dalle_url = "https://api.openai.com/v1/images/generations"

        payload = {
            "model": "dall-e-3",
            "prompt": prompt,
            "size": size,
            "quality": quality,
            "style": style,
            "n": 1,
            "response_format": "b64_json"  # Get base64 data directly instead of URLs
        }

        # FIX: use the client as an async context manager so the connection pool
        # is always released. The original called aclose() only on the success
        # path and leaked the client whenever post() raised (timeout, DNS, ...).
        async with httpx.AsyncClient(
            timeout=120.0,
            headers={
                "Authorization": f"Bearer {self.apiKey}",
                "Content-Type": "application/json"
            }
        ) as dalle_client:
            response = await dalle_client.post(
                dalle_url,
                json=payload
            )

        if response.status_code != 200:
            logger.error(f"DALL-E API error: {response.status_code} - {response.text}")
            return {
                "success": False,
                "error": f"DALL-E API error: {response.status_code} - {response.text}"
            }

        responseJson = response.json()

        if "data" in responseJson and len(responseJson["data"]) > 0:
            # A missing "b64_json" key raises KeyError and is reported through
            # the generic except-branch below, matching the original behavior.
            image_data = responseJson["data"][0]["b64_json"]

            logger.info(f"Successfully generated image: {len(image_data)} characters")
            return {
                "success": True,
                "image_data": image_data,
                "size": size,
                "quality": quality,
                "style": style
            }
        else:
            logger.error("No image data in DALL-E response")
            return {
                "success": False,
                "error": "No image data in DALL-E response"
            }

    except Exception as e:
        logger.error(f"Error during image generation: {str(e)}", exc_info=True)
        return {
            "success": False,
            "error": f"Error during image generation: {str(e)}"
        }
|
||||
|
|
@ -271,6 +271,7 @@ class ConnectorWeb:
|
|||
include_domains: list[str] | None = None,
|
||||
exclude_domains: list[str] | None = None,
|
||||
language: str | None = None,
|
||||
country: str | None = None,
|
||||
include_answer: bool | None = None,
|
||||
include_raw_content: bool | None = None,
|
||||
) -> list[WebSearchResult]:
|
||||
|
|
@ -290,17 +291,20 @@ class ConnectorWeb:
|
|||
kwargs["time_range"] = time_range
|
||||
if topic is not None:
|
||||
kwargs["topic"] = topic
|
||||
if include_domains is not None:
|
||||
if include_domains is not None and len(include_domains) > 0:
|
||||
kwargs["include_domains"] = include_domains
|
||||
if exclude_domains is not None:
|
||||
kwargs["exclude_domains"] = exclude_domains
|
||||
if language is not None:
|
||||
kwargs["language"] = language
|
||||
if country is not None:
|
||||
kwargs["country"] = country
|
||||
if include_answer is not None:
|
||||
kwargs["include_answer"] = include_answer
|
||||
if include_raw_content is not None:
|
||||
kwargs["include_raw_content"] = include_raw_content
|
||||
|
||||
logger.debug(f"Tavily.search kwargs: {kwargs}")
|
||||
response = await self.client.search(**kwargs)
|
||||
|
||||
return [
|
||||
|
|
|
|||
|
|
@ -135,3 +135,29 @@ class AiCallResponse(BaseModel):
|
|||
costEstimate: Optional[float] = Field(default=None, description="Estimated cost of the call")
|
||||
|
||||
|
||||
class EnhancedAiCallOptions(AiCallOptions):
    """Enhanced options for improved document processing with chunk mapping.

    Extends AiCallOptions with knobs for parallel chunk processing and for
    preserving chunk/metadata relationships when merging results.
    """

    # Parallel processing
    enableParallelProcessing: bool = Field(
        default=True,
        description="Enable parallel processing of chunks"
    )
    # Bounded 1..20 so a caller cannot fan out unboundedly.
    maxConcurrentChunks: int = Field(
        default=5,
        ge=1,
        le=20,
        description="Maximum number of chunks to process concurrently"
    )

    # Chunk mapping
    preserveChunkMetadata: bool = Field(
        default=True,
        description="Preserve chunk metadata during processing"
    )
    # Default is a Markdown-style horizontal rule between chunks.
    chunkSeparator: str = Field(
        default="\n\n---\n\n",
        description="Separator between chunks in merged output"
    )
|
||||
|
||||
|
||||
|
|
|
|||
130
modules/datamodels/datamodelDocument.py
Normal file
130
modules/datamodels/datamodelDocument.py
Normal file
|
|
@ -0,0 +1,130 @@
|
|||
from typing import Any, Dict, List, Optional, Literal, Union
|
||||
from pydantic import BaseModel, Field
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class DocumentMetadata(BaseModel):
    """Metadata for the entire document."""
    title: str = Field(description="Document title")
    author: Optional[str] = Field(default=None, description="Document author")
    # NOTE(review): datetime.now is naive local time — confirm UTC was not intended.
    created_at: datetime = Field(default_factory=datetime.now, description="Creation timestamp")
    source_documents: List[str] = Field(default_factory=list, description="Source document IDs")
    extraction_method: str = Field(default="ai_extraction", description="Method used for extraction")
    version: str = Field(default="1.0", description="Document version")
|
||||
|
||||
|
||||
class TableData(BaseModel):
    """Structured table data."""
    headers: List[str] = Field(description="Table column headers")
    # NOTE(review): row widths are not validated against len(headers); ragged
    # tables pass validation as written.
    rows: List[List[str]] = Field(description="Table data rows")
    caption: Optional[str] = Field(default=None, description="Table caption")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Table metadata")
|
||||
|
||||
|
||||
class ListItem(BaseModel):
    """Individual list item with optional sub-items."""
    text: str = Field(description="List item text")
    # Recursive type: the 'ListItem' forward reference is resolved at module
    # bottom via model_rebuild()/update_forward_refs().
    subitems: Optional[List['ListItem']] = Field(default=None, description="Nested sub-items")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Item metadata")
|
||||
|
||||
|
||||
class BulletList(BaseModel):
    """Bulleted or numbered list."""
    items: List[ListItem] = Field(description="List items")
    # Closed set; "checklist" is accepted alongside bullet/numbered.
    list_type: Literal["bullet", "numbered", "checklist"] = Field(default="bullet", description="List type")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="List metadata")
|
||||
|
||||
|
||||
class Paragraph(BaseModel):
    """Text paragraph with optional formatting."""
    text: str = Field(description="Paragraph text")
    # Free-form formatting map (e.g. bold/italic flags); no schema is enforced here.
    formatting: Optional[Dict[str, Any]] = Field(default=None, description="Text formatting (bold, italic, etc.)")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Paragraph metadata")
|
||||
|
||||
|
||||
class Heading(BaseModel):
    """Document heading."""
    text: str = Field(description="Heading text")
    # Constrained to HTML-style levels h1..h6.
    level: int = Field(ge=1, le=6, description="Heading level (1-6)")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Heading metadata")
|
||||
|
||||
|
||||
class CodeBlock(BaseModel):
    """Code block with syntax highlighting."""
    code: str = Field(description="Code content")
    # None presumably means "plain text / unknown language" — renderer decides.
    language: Optional[str] = Field(default=None, description="Programming language")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Code block metadata")
|
||||
|
||||
|
||||
class Image(BaseModel):
    """Image with metadata."""
    # Inline base64 payload; NOTE(review): can be large — consider a URL/reference
    # for big images.
    data: str = Field(description="Base64 encoded image data")
    alt_text: Optional[str] = Field(default=None, description="Alternative text")
    caption: Optional[str] = Field(default=None, description="Image caption")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Image metadata")
|
||||
|
||||
|
||||
class DocumentSection(BaseModel):
    """A section of the document containing one or more content elements."""
    id: str = Field(description="Unique section identifier")
    title: Optional[str] = Field(default=None, description="Section title")
    content_type: Literal["table", "list", "paragraph", "heading", "code", "image", "mixed"] = Field(description="Primary content type")
    # NOTE(review): under Pydantic v1 (without smart unions) Union members are
    # tried left-to-right, so dict input may coerce to the first structurally
    # matching model — verify element round-tripping if v1 is in use.
    elements: List[Union[TableData, BulletList, Paragraph, Heading, CodeBlock, Image]] = Field(description="Content elements in this section")
    order: int = Field(description="Section order in document")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Section metadata")
|
||||
|
||||
|
||||
class StructuredDocument(BaseModel):
    """Complete structured document in JSON format.

    Holds document-level metadata plus an ordered list of sections; the
    accessor methods flatten sections to pull out elements of one kind,
    preserving section order.
    """
    metadata: DocumentMetadata = Field(description="Document metadata")
    sections: List[DocumentSection] = Field(description="Document sections")
    summary: Optional[str] = Field(default=None, description="Document summary")
    tags: List[str] = Field(default_factory=list, description="Document tags")

    def get_sections_by_type(self, content_type: str) -> List[DocumentSection]:
        """Get all sections of a specific content type."""
        return [section for section in self.sections if section.content_type == content_type]

    def get_all_tables(self) -> List[TableData]:
        """Get all table data from the document, in section order."""
        # Flatten sections -> elements, keeping only TableData instances.
        return [
            element
            for section in self.sections
            for element in section.elements
            if isinstance(element, TableData)
        ]

    def get_all_lists(self) -> List[BulletList]:
        """Get all lists from the document, in section order."""
        return [
            element
            for section in self.sections
            for element in section.elements
            if isinstance(element, BulletList)
        ]
|
||||
|
||||
|
||||
class JsonChunkResult(BaseModel):
    """Result from processing a single chunk with JSON output."""
    chunk_id: str = Field(description="Chunk identifier")
    # One DocumentSection per chunk; merging into a document happens elsewhere.
    document_section: DocumentSection = Field(description="Structured content from this chunk")
    processing_time: float = Field(description="Processing time in seconds")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Chunk processing metadata")
|
||||
|
||||
|
||||
class JsonMergeResult(BaseModel):
    """Result from merging multiple JSON chunks."""
    merged_document: StructuredDocument = Field(description="Merged structured document")
    # Free-form strategy label; not constrained to an enum here.
    merge_strategy: str = Field(description="Strategy used for merging")
    chunks_processed: int = Field(description="Number of chunks processed")
    merge_time: float = Field(description="Time taken to merge chunks")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Merge process metadata")
|
||||
|
||||
|
||||
# Resolve the self-referential 'ListItem' forward reference (subitems),
# supporting both Pydantic major versions.
if hasattr(ListItem, "model_rebuild"):
    # Pydantic v2
    ListItem.model_rebuild()
else:
    # Pydantic v1
    ListItem.update_forward_refs()
|
||||
|
|
@ -18,6 +18,16 @@ class ContentExtracted(BaseModel):
|
|||
summary: Optional[Dict[str, Any]] = Field(default=None, description="Optional extraction summary")
|
||||
|
||||
|
||||
class ChunkResult(BaseModel):
    """Preserves the relationship between a chunk and its AI result."""
    originalChunk: ContentPart           # the chunk as it was sent to the model
    aiResult: str                        # raw text the model returned for this chunk
    chunkIndex: int                      # position of the chunk in the source document
    documentId: str                      # identifier of the owning document
    processingTime: float = 0.0          # presumably seconds; 0.0 = not measured
    metadata: Dict[str, Any] = Field(default_factory=dict)  # free-form processing metadata
|
||||
|
||||
|
||||
class MergeStrategy(BaseModel):
|
||||
"""Strategy configuration for merging content parts and AI results."""
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
import logging
|
||||
import asyncio
|
||||
from typing import Dict, Any, List, Union, Tuple, Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
|
@ -260,6 +261,7 @@ class AiObjects:
|
|||
if not requiredTags:
|
||||
requiredTags = OPERATION_TAG_MAPPING.get(options.operationType, [ModelTags.TEXT, ModelTags.CHAT])
|
||||
|
||||
|
||||
# Override priority based on processing mode if not explicitly set
|
||||
effectivePriority = options.priority
|
||||
if options.priority == Priority.BALANCED:
|
||||
|
|
@ -268,6 +270,7 @@ class AiObjects:
|
|||
logger.info(f"Model selection - Operation: {options.operationType}, Required tags: {requiredTags}, Priority: {effectivePriority}")
|
||||
|
||||
for name, info in aiModels.items():
|
||||
logger.info(f"Checking model: {name}, tags: {info.get('tags', [])}, function: {info.get('function', 'unknown')}")
|
||||
# Check context length
|
||||
if info["contextLength"] > 0 and totalSize > info["contextLength"] * 0.8:
|
||||
continue
|
||||
|
|
@ -279,8 +282,11 @@ class AiObjects:
|
|||
|
||||
# Check required tags/capabilities
|
||||
modelTags = info.get("tags", [])
|
||||
if requiredTags and not any(tag in modelTags for tag in requiredTags):
|
||||
if requiredTags and not all(tag in modelTags for tag in requiredTags):
|
||||
logger.info(f" -> Skipping {name}: missing required tags. Has: {modelTags}, needs: {requiredTags}")
|
||||
continue
|
||||
else:
|
||||
logger.info(f" -> {name} passed tag check")
|
||||
|
||||
# Check processing mode requirements
|
||||
if options.processingMode == ProcessingMode.DETAILED and ModelTags.FAST in modelTags:
|
||||
|
|
@ -288,16 +294,24 @@ class AiObjects:
|
|||
continue
|
||||
|
||||
candidates[name] = info
|
||||
logger.info(f" -> {name} added to candidates")
|
||||
|
||||
logger.info(f"Final candidates: {list(candidates.keys())}")
|
||||
|
||||
if not candidates:
|
||||
logger.info("No candidates found, using fallback")
|
||||
# Fallback based on operation type
|
||||
if options.operationType == OperationType.IMAGE_ANALYSIS:
|
||||
logger.info("Using fallback: openai_callAiImage")
|
||||
return "openai_callAiImage"
|
||||
elif options.operationType == OperationType.IMAGE_GENERATION:
|
||||
logger.info("Using fallback: openai_generateImage")
|
||||
return "openai_generateImage"
|
||||
elif options.operationType == OperationType.WEB_RESEARCH:
|
||||
logger.info("Using fallback: perplexity_callAiWithWebSearch")
|
||||
return "perplexity_callAiWithWebSearch"
|
||||
else:
|
||||
logger.info("Using fallback: openai_callAiBasic_gpt35")
|
||||
return "openai_callAiBasic_gpt35"
|
||||
|
||||
# Special handling for planning operations - use Claude for consistency
|
||||
|
|
@ -313,17 +327,60 @@ class AiObjects:
|
|||
|
||||
# Select based on priority for other operations
|
||||
if effectivePriority == Priority.SPEED:
|
||||
return max(candidates, key=lambda k: candidates[k]["speedRating"])
|
||||
selected = max(candidates, key=lambda k: candidates[k]["speedRating"])
|
||||
logger.info(f"Selected by SPEED: {selected}")
|
||||
return selected
|
||||
elif effectivePriority == Priority.QUALITY:
|
||||
return max(candidates, key=lambda k: candidates[k]["qualityRating"])
|
||||
selected = max(candidates, key=lambda k: candidates[k]["qualityRating"])
|
||||
logger.info(f"Selected by QUALITY: {selected}")
|
||||
return selected
|
||||
elif effectivePriority == Priority.COST:
|
||||
return min(candidates, key=lambda k: candidates[k]["costPer1kTokens"])
|
||||
selected = min(candidates, key=lambda k: candidates[k]["costPer1kTokens"])
|
||||
logger.info(f"Selected by COST: {selected}")
|
||||
return selected
|
||||
else: # BALANCED
|
||||
def balancedScore(name: str) -> float:
|
||||
info = candidates[name]
|
||||
return info["qualityRating"] * 0.4 + info["speedRating"] * 0.3 + (10 - info["costPer1kTokens"] * 1000) * 0.3
|
||||
|
||||
return max(candidates, key=balancedScore)
|
||||
selected = max(candidates, key=balancedScore)
|
||||
logger.info(f"Selected by BALANCED: {selected}")
|
||||
return selected
|
||||
|
||||
def _getFallbackModels(self, operationType: str) -> List[str]:
    """Get ordered list of fallback models for a given operation type.

    Unknown operation types fall back to the GENERAL chain.
    """
    chains: Dict[str, List[str]] = {}

    chains[OperationType.GENERAL] = [
        "openai_callAiBasic_gpt35",   # Fast and reliable
        "openai_callAiBasic",         # High quality
        "anthropic_callAiBasic",      # Alternative high quality
        "perplexity_callAiBasic",     # Cost effective
    ]
    chains[OperationType.IMAGE_ANALYSIS] = [
        "openai_callAiImage",         # Primary image analysis
        "anthropic_callAiImage",      # Alternative image analysis
    ]
    chains[OperationType.IMAGE_GENERATION] = [
        "openai_generateImage",       # Only image generation model
    ]
    chains[OperationType.WEB_RESEARCH] = [
        "perplexity_callAiWithWebSearch",  # Primary web research
        "perplexity_callAiBasic",          # Alternative with web search
        "openai_callAiBasic",              # Fallback to general model
    ]
    chains[OperationType.GENERATE_PLAN] = [
        "anthropic_callAiBasic",      # Best for planning
        "openai_callAiBasic",         # High quality alternative
        "openai_callAiBasic_gpt35",   # Fast fallback
    ]
    chains[OperationType.ANALYSE_CONTENT] = [
        "anthropic_callAiBasic",      # Best for analysis
        "openai_callAiBasic",         # High quality alternative
        "openai_callAiBasic_gpt35",   # Fast fallback
    ]

    if operationType in chains:
        return chains[operationType]
    return chains[OperationType.GENERAL]
|
||||
|
||||
def _connectorFor(self, modelName: str):
|
||||
"""Get the appropriate connector for the model."""
|
||||
|
|
@ -340,7 +397,7 @@ class AiObjects:
|
|||
raise ValueError(f"Unknown connector type: {connectorType}")
|
||||
|
||||
async def call(self, request: AiCallRequest) -> AiCallResponse:
|
||||
"""Call AI model for text generation."""
|
||||
"""Call AI model for text generation with fallback mechanism."""
|
||||
prompt = request.prompt
|
||||
context = request.context or ""
|
||||
options = request.options
|
||||
|
|
@ -357,9 +414,6 @@ class AiObjects:
|
|||
if options.compressContext and len(context.encode("utf-8")) > 70000:
|
||||
context = maybeTruncate(context, 70000)
|
||||
|
||||
# Select model for text generation
|
||||
modelName = self._selectModel(prompt, context, options)
|
||||
|
||||
# Derive generation parameters
|
||||
temperature = getattr(options, "temperature", None)
|
||||
if temperature is None:
|
||||
|
|
@ -376,6 +430,15 @@ class AiObjects:
|
|||
messages.append({"role": "system", "content": f"Context from documents:\n{context}"})
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
# Get fallback models for this operation type
|
||||
fallbackModels = self._getFallbackModels(options.operationType)
|
||||
|
||||
# Try primary model first, then fallbacks
|
||||
lastError = None
|
||||
for attempt, modelName in enumerate(fallbackModels):
|
||||
try:
|
||||
logger.info(f"Attempting AI call with model: {modelName} (attempt {attempt + 1}/{len(fallbackModels)})")
|
||||
|
||||
connector = self._connectorFor(modelName)
|
||||
functionName = aiModels[modelName]["function"]
|
||||
|
||||
|
|
@ -406,29 +469,74 @@ class AiObjects:
|
|||
else:
|
||||
raise ValueError(f"Function {functionName} not supported for text generation")
|
||||
|
||||
# Estimate cost/tokens
|
||||
# Success! Estimate cost/tokens and return
|
||||
totalSize = len((prompt + context).encode("utf-8"))
|
||||
cost = self._estimateCost(aiModels[modelName], totalSize)
|
||||
usedTokens = int(totalSize / 4)
|
||||
|
||||
logger.info(f"✅ AI call successful with model: {modelName}")
|
||||
return AiCallResponse(content=content, modelName=modelName, usedTokens=usedTokens, costEstimate=cost)
|
||||
|
||||
except Exception as e:
|
||||
lastError = e
|
||||
logger.warning(f"❌ AI call failed with model {modelName}: {str(e)}")
|
||||
|
||||
# If this is not the last model, try the next one
|
||||
if attempt < len(fallbackModels) - 1:
|
||||
logger.info(f"🔄 Trying next fallback model...")
|
||||
continue
|
||||
else:
|
||||
# All models failed
|
||||
logger.error(f"💥 All {len(fallbackModels)} models failed for operation {options.operationType}")
|
||||
break
|
||||
|
||||
# All fallback attempts failed
|
||||
errorMsg = f"All AI models failed for operation {options.operationType}. Last error: {str(lastError)}"
|
||||
logger.error(errorMsg)
|
||||
raise Exception(errorMsg)
|
||||
|
||||
async def callImage(self, prompt: str, imageData: Union[str, bytes], mimeType: str = None, options: AiCallOptions = None) -> str:
|
||||
"""Call AI model for image analysis."""
|
||||
"""Call AI model for image analysis with fallback mechanism."""
|
||||
if options is None:
|
||||
options = AiCallOptions(operationType=OperationType.IMAGE_ANALYSIS)
|
||||
|
||||
# Select model for image analysis
|
||||
modelName = self._selectModel(prompt, "", options)
|
||||
# Get fallback models for image analysis
|
||||
fallbackModels = self._getFallbackModels(OperationType.IMAGE_ANALYSIS)
|
||||
|
||||
# Try primary model first, then fallbacks
|
||||
lastError = None
|
||||
for attempt, modelName in enumerate(fallbackModels):
|
||||
try:
|
||||
logger.info(f"Attempting image analysis with model: {modelName} (attempt {attempt + 1}/{len(fallbackModels)})")
|
||||
|
||||
connector = self._connectorFor(modelName)
|
||||
functionName = aiModels[modelName]["function"]
|
||||
|
||||
if functionName == "callAiImage":
|
||||
return await connector.callAiImage(prompt, imageData, mimeType)
|
||||
content = await connector.callAiImage(prompt, imageData, mimeType)
|
||||
logger.info(f"✅ Image analysis successful with model: {modelName}")
|
||||
return content
|
||||
else:
|
||||
raise ValueError(f"Function {functionName} not supported for image analysis")
|
||||
|
||||
except Exception as e:
|
||||
lastError = e
|
||||
logger.warning(f"❌ Image analysis failed with model {modelName}: {str(e)}")
|
||||
|
||||
# If this is not the last model, try the next one
|
||||
if attempt < len(fallbackModels) - 1:
|
||||
logger.info(f"🔄 Trying next fallback model for image analysis...")
|
||||
continue
|
||||
else:
|
||||
# All models failed
|
||||
logger.error(f"💥 All {len(fallbackModels)} models failed for image analysis")
|
||||
break
|
||||
|
||||
# All fallback attempts failed
|
||||
errorMsg = f"All AI models failed for image analysis. Last error: {str(lastError)}"
|
||||
logger.error(errorMsg)
|
||||
raise Exception(errorMsg)
|
||||
|
||||
async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid", options: AiCallOptions = None) -> Dict[str, Any]:
|
||||
"""Generate an image using AI."""
|
||||
if options is None:
|
||||
|
|
@ -694,7 +802,22 @@ class AiObjects:
|
|||
logger.warning(f"Failed to extract links from content: {e}")
|
||||
return []
|
||||
|
||||
async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10) -> Dict[str, str]:
|
||||
def _normalizeUrl(self, url: str) -> str:
|
||||
"""Normalize URL to handle variations that should be considered duplicates."""
|
||||
if not url:
|
||||
return url
|
||||
|
||||
# Remove trailing slashes and fragments
|
||||
url = url.rstrip('/')
|
||||
if '#' in url:
|
||||
url = url.split('#')[0]
|
||||
|
||||
# Handle common URL variations
|
||||
url = url.replace('http://', 'https://') # Normalize protocol
|
||||
|
||||
return url
|
||||
|
||||
async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10, global_processed_urls: Optional[set] = None) -> Dict[str, str]:
|
||||
"""
|
||||
Recursively crawl URLs up to specified depth.
|
||||
|
||||
|
|
@ -703,19 +826,28 @@ class AiObjects:
|
|||
max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.)
|
||||
extract_depth: Tavily extract depth setting
|
||||
max_per_domain: Maximum URLs per domain per level
|
||||
global_processed_urls: Optional global set to track processed URLs across sessions
|
||||
|
||||
Returns:
|
||||
Dictionary mapping URL -> content for all crawled pages
|
||||
"""
|
||||
logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")
|
||||
|
||||
# URL index to track all processed URLs
|
||||
# URL index to track all processed URLs (local + global)
|
||||
processed_urls = set()
|
||||
if global_processed_urls is not None:
|
||||
# Use global index if provided, otherwise create local one
|
||||
processed_urls = global_processed_urls
|
||||
logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs")
|
||||
else:
|
||||
logger.info("Using local URL index for this crawl session")
|
||||
|
||||
all_content = {}
|
||||
|
||||
# Current level URLs to process
|
||||
current_level_urls = urls.copy()
|
||||
|
||||
try:
|
||||
for depth in range(1, max_depth + 1):
|
||||
logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
|
||||
logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")
|
||||
|
|
@ -724,18 +856,21 @@ class AiObjects:
|
|||
next_level_urls = []
|
||||
|
||||
for url in current_level_urls:
|
||||
if url in processed_urls:
|
||||
logger.debug(f"URL {url} already processed, skipping")
|
||||
# Normalize URL for duplicate checking
|
||||
normalized_url = self._normalizeUrl(url)
|
||||
if normalized_url in processed_urls:
|
||||
logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping")
|
||||
continue
|
||||
|
||||
try:
|
||||
logger.info(f"Processing URL at depth {depth}: {url}")
|
||||
logger.debug(f"Total processed URLs so far: {len(processed_urls)}")
|
||||
|
||||
# Read page content
|
||||
content = await self.readPage(url, extract_depth)
|
||||
if content:
|
||||
all_content[url] = content
|
||||
processed_urls.add(url)
|
||||
processed_urls.add(normalized_url)
|
||||
logger.info(f"✓ Successfully processed {url}: {len(content)} chars")
|
||||
|
||||
# Get URLs from this page for next level
|
||||
|
|
@ -749,18 +884,21 @@ class AiObjects:
|
|||
# Add new URLs to next level (avoiding already processed ones)
|
||||
new_urls_count = 0
|
||||
for new_url in filtered_urls:
|
||||
if new_url not in processed_urls:
|
||||
normalized_new_url = self._normalizeUrl(new_url)
|
||||
if normalized_new_url not in processed_urls:
|
||||
next_level_urls.append(new_url)
|
||||
new_urls_count += 1
|
||||
else:
|
||||
logger.debug(f"URL {new_url} (normalized: {normalized_new_url}) already processed, skipping")
|
||||
|
||||
logger.info(f"Added {new_urls_count} new URLs to next level from {url}")
|
||||
else:
|
||||
logger.warning(f"✗ No content extracted from {url}")
|
||||
processed_urls.add(url) # Mark as processed to avoid retry
|
||||
processed_urls.add(normalized_url) # Mark as processed to avoid retry
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
|
||||
processed_urls.add(url) # Mark as processed to avoid retry
|
||||
processed_urls.add(normalized_url) # Mark as processed to avoid retry
|
||||
|
||||
# Prepare for next iteration
|
||||
current_level_urls = next_level_urls
|
||||
|
|
@ -772,6 +910,15 @@ class AiObjects:
|
|||
break
|
||||
|
||||
logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
|
||||
logger.info(f"Total URLs processed (including skipped): {len(processed_urls)}")
|
||||
logger.info(f"Unique URLs found: {len(all_content)}")
|
||||
return all_content
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(f"Crawling timed out, returning partial results: {len(all_content)} pages crawled so far")
|
||||
return all_content
|
||||
except Exception as e:
|
||||
logger.error(f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far")
|
||||
return all_content
|
||||
|
||||
async def webQuery(self, query: str, context: str = "", options: AiCallOptions = None) -> str:
|
||||
|
|
|
|||
|
|
@ -571,7 +571,9 @@ class ChatObjects:
|
|||
actionName=createdMessage.get("actionName")
|
||||
)
|
||||
|
||||
# Debug: Store message and documents for debugging TODO REMOVE
|
||||
# Debug: Store message and documents for debugging - only if debug enabled
|
||||
debug_enabled = APP_CONFIG.get("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
||||
if debug_enabled:
|
||||
self._storeDebugMessageAndDocuments(chat_message)
|
||||
|
||||
return chat_message
|
||||
|
|
@ -1052,8 +1054,11 @@ class ChatObjects:
|
|||
|
||||
def _storeDebugMessageAndDocuments(self, message: ChatMessage) -> None:
|
||||
"""
|
||||
Store message and documents for debugging purposes in fileshare.
|
||||
Structure: gateway/test-chat/messages/m_round_task_action_timestamp/documentlist_label/documents
|
||||
Store message and documents (metadata and file bytes) for debugging purposes.
|
||||
Structure: gateway/test-chat/messages/m_round_task_action_timestamp/documentlist_label/
|
||||
- message.json, message_text.txt
|
||||
- document_###_metadata.json
|
||||
- document_###_<original_filename> (actual file bytes)
|
||||
|
||||
Args:
|
||||
message: ChatMessage object to store
|
||||
|
|
@ -1157,6 +1162,26 @@ class ChatObjects:
|
|||
|
||||
logger.info(f"Debug: Stored document metadata for {doc.fileName}")
|
||||
|
||||
# Also store the actual file bytes next to metadata for debugging
|
||||
try:
|
||||
# Lazy import to avoid circular deps at module load
|
||||
from modules.interfaces import interfaceDbComponentObjects as comp
|
||||
componentInterface = comp.getInterface(self.currentUser)
|
||||
file_bytes = componentInterface.getFileData(doc.fileId)
|
||||
if file_bytes:
|
||||
# Build a safe filename preserving original name
|
||||
safe_name = doc.fileName or f"document_{i+1:03d}"
|
||||
# Avoid path traversal
|
||||
safe_name = os.path.basename(safe_name)
|
||||
doc_file_path = os.path.join(label_folder, f"document_{i+1:03d}_" + safe_name)
|
||||
with open(doc_file_path, "wb") as df:
|
||||
df.write(file_bytes)
|
||||
logger.info(f"Debug: Stored document file bytes: {doc_file_path} ({len(file_bytes)} bytes)")
|
||||
else:
|
||||
logger.warning(f"Debug: No file bytes returned for fileId {doc.fileId}")
|
||||
except Exception as e:
|
||||
logger.error(f"Debug: Failed to store document file for {doc.fileName} (fileId {doc.fileId}): {e}")
|
||||
|
||||
logger.info(f"Debug: Stored message and documents in {message_path}")
|
||||
|
||||
except Exception as e:
|
||||
|
|
|
|||
|
|
@ -95,8 +95,8 @@ async def update_prompt(
|
|||
detail=f"Prompt with ID {promptId} not found"
|
||||
)
|
||||
|
||||
# Convert Prompt to dict for interface
|
||||
update_data = promptData.dict()
|
||||
# Convert Prompt to dict for interface, excluding the id field
|
||||
update_data = promptData.dict(exclude={'id'})
|
||||
|
||||
# Update prompt
|
||||
updatedPrompt = managementInterface.updatePrompt(promptId, update_data)
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ from pydantic import BaseModel
|
|||
|
||||
# Import auth modules
|
||||
from modules.security.auth import getCurrentUser, limiter, SECRET_KEY, ALGORITHM
|
||||
from modules.security.jwtService import createAccessToken, createRefreshToken, setAccessTokenCookie, setRefreshTokenCookie
|
||||
from modules.security.jwtService import createAccessToken, createRefreshToken, setAccessTokenCookie, setRefreshTokenCookie, clearAccessTokenCookie, clearRefreshTokenCookie
|
||||
from modules.interfaces.interfaceDbAppObjects import getInterface, getRootInterface
|
||||
from modules.datamodels.datamodelUam import User, UserInDB, AuthAuthority, UserPrivilege
|
||||
from modules.datamodels.datamodelSecurity import Token
|
||||
|
|
@ -263,8 +263,7 @@ async def read_user_me(
|
|||
@limiter.limit("60/minute")
|
||||
async def refresh_token(
|
||||
request: Request,
|
||||
response: Response,
|
||||
currentUser: User = Depends(getCurrentUser)
|
||||
response: Response
|
||||
) -> Dict[str, Any]:
|
||||
"""Refresh access token using refresh token from cookie"""
|
||||
try:
|
||||
|
|
@ -283,12 +282,27 @@ async def refresh_token(
|
|||
except jwt.JWTError:
|
||||
raise HTTPException(status_code=401, detail="Invalid refresh token")
|
||||
|
||||
# Get user information from refresh token payload
|
||||
user_id = payload.get("userId")
|
||||
if not user_id:
|
||||
raise HTTPException(status_code=401, detail="Invalid refresh token - missing user ID")
|
||||
|
||||
# Get user from database using the user ID from refresh token
|
||||
try:
|
||||
app_interface = getRootInterface()
|
||||
current_user = app_interface.getUser(user_id)
|
||||
if not current_user:
|
||||
raise HTTPException(status_code=401, detail="User not found")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get user from database: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail="Failed to validate user")
|
||||
|
||||
# Create new token data
|
||||
token_data = {
|
||||
"sub": currentUser.username,
|
||||
"mandateId": str(currentUser.mandateId),
|
||||
"userId": str(currentUser.id),
|
||||
"authenticationAuthority": currentUser.authenticationAuthority
|
||||
"sub": current_user.username,
|
||||
"mandateId": str(current_user.mandateId),
|
||||
"userId": str(current_user.id),
|
||||
"authenticationAuthority": current_user.authenticationAuthority
|
||||
}
|
||||
|
||||
# Create new access token + set cookie
|
||||
|
|
@ -365,15 +379,18 @@ async def logout(request: Request, response: Response, currentUser: User = Depen
|
|||
# Don't fail if audit logging fails
|
||||
pass
|
||||
|
||||
# Clear httpOnly cookies
|
||||
response.delete_cookie(key="auth_token", httponly=True, samesite="strict")
|
||||
response.delete_cookie(key="refresh_token", httponly=True, samesite="strict")
|
||||
|
||||
return JSONResponse({
|
||||
# Create the JSON response first
|
||||
json_response = JSONResponse({
|
||||
"message": "Successfully logged out - cookies cleared",
|
||||
"revokedTokens": revoked
|
||||
})
|
||||
|
||||
# Clear httpOnly cookies on the response we're actually returning
|
||||
clearAccessTokenCookie(json_response)
|
||||
clearRefreshTokenCookie(json_response)
|
||||
|
||||
return json_response
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during logout: {str(e)}")
|
||||
raise HTTPException(
|
||||
|
|
|
|||
|
|
@ -17,6 +17,11 @@ ALGORITHM = APP_CONFIG.get("Auth_ALGORITHM")
|
|||
ACCESS_TOKEN_EXPIRE_MINUTES = int(APP_CONFIG.get("APP_TOKEN_EXPIRY"))
|
||||
REFRESH_TOKEN_EXPIRE_DAYS = int(APP_CONFIG.get("APP_REFRESH_TOKEN_EXPIRY", "7"))
|
||||
|
||||
# Cookie security settings - use secure cookies based on whether API uses HTTPS
|
||||
# Cookies must have secure=True on HTTPS sites, secure=False on HTTP sites
|
||||
APP_API_URL = APP_CONFIG.get("APP_API_URL", "http://localhost:8000")
|
||||
USE_SECURE_COOKIES = APP_API_URL.startswith("https://") if APP_API_URL else False
|
||||
|
||||
|
||||
def createAccessToken(data: dict, expiresDelta: Optional[timedelta] = None) -> Tuple[str, "datetime"]:
|
||||
"""Create a JWT access token and return (token, expiresAt)."""
|
||||
|
|
@ -52,8 +57,9 @@ def setAccessTokenCookie(response: Response, token: str, expiresDelta: Optional[
|
|||
key="auth_token",
|
||||
value=token,
|
||||
httponly=True,
|
||||
secure=True,
|
||||
secure=USE_SECURE_COOKIES, # Only secure in production (HTTPS)
|
||||
samesite="strict",
|
||||
path="/",
|
||||
max_age=maxAge
|
||||
)
|
||||
|
||||
|
|
@ -64,9 +70,46 @@ def setRefreshTokenCookie(response: Response, token: str) -> None:
|
|||
key="refresh_token",
|
||||
value=token,
|
||||
httponly=True,
|
||||
secure=True,
|
||||
secure=USE_SECURE_COOKIES, # Only secure in production (HTTPS)
|
||||
samesite="strict",
|
||||
path="/",
|
||||
max_age=REFRESH_TOKEN_EXPIRE_DAYS * 24 * 60 * 60
|
||||
)
|
||||
|
||||
|
||||
def clearAccessTokenCookie(response: Response) -> None:
|
||||
"""
|
||||
Clear access token cookie by setting it to expire immediately.
|
||||
Uses both raw header manipulation and FastAPI's delete_cookie for maximum browser compatibility.
|
||||
"""
|
||||
# Build secure flag based on environment
|
||||
secure_flag = "; Secure" if USE_SECURE_COOKIES else ""
|
||||
|
||||
# Primary method: Raw Set-Cookie header for guaranteed deletion
|
||||
response.headers.append(
|
||||
"Set-Cookie",
|
||||
f"auth_token=deleted; Path=/; Max-Age=0; Expires=Thu, 01 Jan 1970 00:00:00 GMT; HttpOnly{secure_flag}; SameSite=Strict"
|
||||
)
|
||||
|
||||
# Fallback: Also use FastAPI's built-in method
|
||||
response.delete_cookie(key="auth_token", path="/")
|
||||
|
||||
|
||||
def clearRefreshTokenCookie(response: Response) -> None:
|
||||
"""
|
||||
Clear refresh token cookie by setting it to expire immediately.
|
||||
Uses both raw header manipulation and FastAPI's delete_cookie for maximum browser compatibility.
|
||||
"""
|
||||
# Build secure flag based on environment
|
||||
secure_flag = "; Secure" if USE_SECURE_COOKIES else ""
|
||||
|
||||
# Primary method: Raw Set-Cookie header for guaranteed deletion
|
||||
response.headers.append(
|
||||
"Set-Cookie",
|
||||
f"refresh_token=deleted; Path=/; Max-Age=0; Expires=Thu, 01 Jan 1970 00:00:00 GMT; HttpOnly{secure_flag}; SameSite=Strict"
|
||||
)
|
||||
|
||||
# Fallback: Also use FastAPI's built-in method
|
||||
response.delete_cookie(key="refresh_token", path="/")
|
||||
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
596
modules/services/serviceAi/subCoreAi.py
Normal file
596
modules/services/serviceAi/subCoreAi.py
Normal file
|
|
@ -0,0 +1,596 @@
|
|||
import logging
|
||||
from typing import Dict, Any, List, Optional, Tuple, Union
|
||||
from modules.datamodels.datamodelChat import PromptPlaceholder, ChatDocument
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, ModelCapabilities, OperationType, Priority
|
||||
from modules.interfaces.interfaceAiObjects import AiObjects
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SubCoreAi:
|
||||
"""Core AI operations including image analysis, text generation, and planning calls."""
|
||||
|
||||
def __init__(self, services, aiObjects):
|
||||
"""Initialize core AI operations.
|
||||
|
||||
Args:
|
||||
services: Service center instance for accessing other services
|
||||
aiObjects: Initialized AiObjects instance
|
||||
"""
|
||||
self.services = services
|
||||
self.aiObjects = aiObjects
|
||||
|
||||
# AI Processing Call
|
||||
async def callAi(
|
||||
self,
|
||||
prompt: str,
|
||||
documents: Optional[List[ChatDocument]] = None,
|
||||
placeholders: Optional[List[PromptPlaceholder]] = None,
|
||||
options: Optional[AiCallOptions] = None,
|
||||
outputFormat: Optional[str] = None,
|
||||
title: Optional[str] = None,
|
||||
documentProcessor=None,
|
||||
documentGenerator=None
|
||||
) -> Union[str, Dict[str, Any]]:
|
||||
"""
|
||||
Unified AI call interface that automatically routes to appropriate handler.
|
||||
|
||||
Args:
|
||||
prompt: The main prompt for the AI call
|
||||
documents: Optional list of documents to process
|
||||
placeholders: Optional list of placeholder replacements for planning calls
|
||||
options: AI call configuration options
|
||||
outputFormat: Optional output format (html, pdf, docx, txt, md, json, csv, xlsx) for document generation
|
||||
title: Optional title for generated documents
|
||||
documentProcessor: Document processing service instance
|
||||
documentGenerator: Document generation service instance
|
||||
|
||||
Returns:
|
||||
AI response as string, or dict with documents if outputFormat is specified
|
||||
|
||||
Raises:
|
||||
Exception: If all available models fail
|
||||
"""
|
||||
if options is None:
|
||||
options = AiCallOptions()
|
||||
|
||||
# Normalize placeholders from List[PromptPlaceholder]
|
||||
placeholders_dict: Dict[str, str] = {}
|
||||
placeholders_meta: Dict[str, bool] = {}
|
||||
if placeholders:
|
||||
placeholders_dict = {p.label: p.content for p in placeholders}
|
||||
placeholders_meta = {p.label: bool(getattr(p, 'summaryAllowed', False)) for p in placeholders}
|
||||
|
||||
# Auto-determine call type based on documents and operation type
|
||||
call_type = self._determineCallType(documents, options.operationType)
|
||||
options.callType = call_type
|
||||
|
||||
try:
|
||||
# Build the full prompt that will be sent to AI
|
||||
if placeholders:
|
||||
full_prompt = prompt
|
||||
for p in placeholders:
|
||||
placeholder = f"{{{{KEY:{p.label}}}}}"
|
||||
full_prompt = full_prompt.replace(placeholder, p.content)
|
||||
else:
|
||||
full_prompt = prompt
|
||||
|
||||
self._writeAiResponseDebug(
|
||||
label='ai_prompt_debug',
|
||||
content=full_prompt,
|
||||
partIndex=1,
|
||||
modelName=None,
|
||||
continuation=False
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Handle document generation with specific output format
|
||||
if outputFormat and documentGenerator:
|
||||
result = await documentGenerator.callAiWithDocumentGeneration(prompt, documents, options, outputFormat, title)
|
||||
# Log AI response for debugging
|
||||
try:
|
||||
if isinstance(result, dict) and 'content' in result:
|
||||
self._writeAiResponseDebug(
|
||||
label='ai_document_generation',
|
||||
content=result['content'],
|
||||
partIndex=1,
|
||||
modelName=None, # Document generation doesn't return model info
|
||||
continuation=False
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
return result
|
||||
|
||||
if call_type == "planning":
|
||||
result = await self._callAiPlanning(prompt, placeholders_dict, placeholders_meta, options)
|
||||
# Log AI response for debugging
|
||||
try:
|
||||
self._writeAiResponseDebug(
|
||||
label='ai_planning',
|
||||
content=result or "",
|
||||
partIndex=1,
|
||||
modelName=None, # Planning doesn't return model info
|
||||
continuation=False
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
return result
|
||||
else:
|
||||
# Set processDocumentsIndividually from the legacy parameter if not set in options
|
||||
if options.processDocumentsIndividually is None and documents:
|
||||
options.processDocumentsIndividually = False # Default to batch processing
|
||||
|
||||
# For text calls, we need to build the full prompt with placeholders here
|
||||
# since _callAiText doesn't handle placeholders directly
|
||||
if placeholders_dict:
|
||||
full_prompt = self._buildPromptWithPlaceholders(prompt, placeholders_dict)
|
||||
else:
|
||||
full_prompt = prompt
|
||||
|
||||
if documentProcessor and documents:
|
||||
result = await documentProcessor.callAiText(full_prompt, documents, options)
|
||||
else:
|
||||
# Fallback to direct AI call if no document processor available
|
||||
request = AiCallRequest(
|
||||
prompt=full_prompt,
|
||||
context="",
|
||||
options=options
|
||||
)
|
||||
response = await self.aiObjects.call(request)
|
||||
result = response.content
|
||||
|
||||
# Log AI response for debugging (additional logging for text calls)
|
||||
try:
|
||||
self._writeAiResponseDebug(
|
||||
label='ai_text_main',
|
||||
content=result or "",
|
||||
partIndex=1,
|
||||
modelName=None, # Text calls already log internally
|
||||
continuation=False
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
return result
|
||||
|
||||
# AI Image Analysis
|
||||
async def readImage(
|
||||
self,
|
||||
prompt: str,
|
||||
imageData: Union[str, bytes],
|
||||
mimeType: str = None,
|
||||
options: Optional[AiCallOptions] = None,
|
||||
) -> str:
|
||||
"""Call AI for image analysis using interface.callImage()."""
|
||||
try:
|
||||
# Check if imageData is valid
|
||||
if not imageData:
|
||||
error_msg = "No image data provided"
|
||||
self.services.utils.debugLogToFile(f"Error in AI image analysis: {error_msg}", "AI_SERVICE")
|
||||
logger.error(f"Error in AI image analysis: {error_msg}")
|
||||
return f"Error: {error_msg}"
|
||||
|
||||
self.services.utils.debugLogToFile(f"readImage called with prompt, imageData type: {type(imageData)}, length: {len(imageData) if imageData else 0}, mimeType: {mimeType}", "AI_SERVICE")
|
||||
logger.info(f"readImage called with prompt, imageData type: {type(imageData)}, length: {len(imageData) if imageData else 0}, mimeType: {mimeType}")
|
||||
|
||||
# Always use IMAGE_ANALYSIS operation type for image processing
|
||||
if options is None:
|
||||
options = AiCallOptions(operationType=OperationType.IMAGE_ANALYSIS)
|
||||
else:
|
||||
# Override the operation type to ensure image analysis
|
||||
options.operationType = OperationType.IMAGE_ANALYSIS
|
||||
|
||||
self.services.utils.debugLogToFile(f"Calling aiObjects.callImage with operationType: {options.operationType}", "AI_SERVICE")
|
||||
logger.info(f"Calling aiObjects.callImage with operationType: {options.operationType}")
|
||||
result = await self.aiObjects.callImage(prompt, imageData, mimeType, options)
|
||||
|
||||
# Debug the result
|
||||
self.services.utils.debugLogToFile(f"Raw AI result type: {type(result)}, value: {repr(result)}", "AI_SERVICE")
|
||||
|
||||
# Check if result is valid
|
||||
if not result or (isinstance(result, str) and not result.strip()):
|
||||
error_msg = f"No response from AI image analysis (result: {repr(result)})"
|
||||
self.services.utils.debugLogToFile(f"Error in AI image analysis: {error_msg}", "AI_SERVICE")
|
||||
logger.error(f"Error in AI image analysis: {error_msg}")
|
||||
return f"Error: {error_msg}"
|
||||
|
||||
self.services.utils.debugLogToFile(f"callImage returned: {result[:200]}..." if len(result) > 200 else result, "AI_SERVICE")
|
||||
logger.info(f"callImage returned: {result[:200]}..." if len(result) > 200 else result)
|
||||
return result
|
||||
except Exception as e:
|
||||
self.services.utils.debugLogToFile(f"Error in AI image analysis: {str(e)}", "AI_SERVICE")
|
||||
logger.error(f"Error in AI image analysis: {str(e)}")
|
||||
return f"Error: {str(e)}"
|
||||
|
||||
# AI Image Generation
|
||||
async def generateImage(
|
||||
self,
|
||||
prompt: str,
|
||||
size: str = "1024x1024",
|
||||
quality: str = "standard",
|
||||
style: str = "vivid",
|
||||
options: Optional[AiCallOptions] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate an image using AI using interface.generateImage()."""
|
||||
try:
|
||||
return await self.aiObjects.generateImage(prompt, size, quality, style, options)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in AI image generation: {str(e)}")
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
def _determineCallType(self, documents: Optional[List[ChatDocument]], operation_type: str) -> str:
|
||||
"""
|
||||
Determine call type based on documents and operation type.
|
||||
|
||||
Criteria: no documents AND operationType is "generate_plan" -> planning
|
||||
All other cases -> text
|
||||
"""
|
||||
has_documents = documents is not None and len(documents) > 0
|
||||
is_planning_operation = operation_type == OperationType.GENERATE_PLAN
|
||||
|
||||
if not has_documents and is_planning_operation:
|
||||
return "planning"
|
||||
else:
|
||||
return "text"
|
||||
|
||||
async def _callAiPlanning(
|
||||
self,
|
||||
prompt: str,
|
||||
placeholders: Optional[Dict[str, str]],
|
||||
placeholdersMeta: Optional[Dict[str, bool]],
|
||||
options: AiCallOptions
|
||||
) -> str:
|
||||
"""
|
||||
Handle planning calls with placeholder system and selective summarization.
|
||||
"""
|
||||
# Build full prompt with placeholders; if too large, summarize summaryAllowed placeholders proportionally
|
||||
effective_placeholders = placeholders or {}
|
||||
full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders)
|
||||
|
||||
if options.compressPrompt and placeholdersMeta:
|
||||
# Determine model capacity
|
||||
try:
|
||||
caps = self._getModelCapabilitiesForContent(full_prompt, None, options)
|
||||
max_bytes = caps.get("maxContextBytes", len(full_prompt.encode("utf-8")))
|
||||
except Exception:
|
||||
max_bytes = len(full_prompt.encode("utf-8"))
|
||||
|
||||
current_bytes = len(full_prompt.encode("utf-8"))
|
||||
if current_bytes > max_bytes:
|
||||
# Compute total bytes contributed by allowed placeholders (approximate by content length)
|
||||
allowed_labels = [l for l, allow in placeholdersMeta.items() if allow]
|
||||
allowed_sizes = {l: len((effective_placeholders.get(l) or "").encode("utf-8")) for l in allowed_labels}
|
||||
total_allowed = sum(allowed_sizes.values())
|
||||
|
||||
overage = current_bytes - max_bytes
|
||||
if total_allowed > 0 and overage > 0:
|
||||
# Target total for allowed after reduction
|
||||
target_allowed = max(total_allowed - overage, 0)
|
||||
# Global ratio to apply across allowed placeholders
|
||||
ratio = target_allowed / total_allowed if total_allowed > 0 else 1.0
|
||||
ratio = max(0.0, min(1.0, ratio))
|
||||
|
||||
reduced: Dict[str, str] = {}
|
||||
for label, content in effective_placeholders.items():
|
||||
if label in allowed_labels and isinstance(content, str) and len(content) > 0:
|
||||
old_len = len(content)
|
||||
# Reduce by proportional ratio on characters (fallback if empty)
|
||||
reduction_factor = ratio if old_len > 0 else 1.0
|
||||
reduced[label] = self._reduceText(content, reduction_factor)
|
||||
else:
|
||||
reduced[label] = content
|
||||
|
||||
effective_placeholders = reduced
|
||||
full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders)
|
||||
|
||||
# If still slightly over, perform a second-pass fine adjustment with updated ratio
|
||||
current_bytes = len(full_prompt.encode("utf-8"))
|
||||
if current_bytes > max_bytes and total_allowed > 0:
|
||||
overage2 = current_bytes - max_bytes
|
||||
# Recompute allowed sizes after first reduction
|
||||
allowed_sizes2 = {l: len((effective_placeholders.get(l) or "").encode("utf-8")) for l in allowed_labels}
|
||||
total_allowed2 = sum(allowed_sizes2.values())
|
||||
if total_allowed2 > 0 and overage2 > 0:
|
||||
target_allowed2 = max(total_allowed2 - overage2, 0)
|
||||
ratio2 = target_allowed2 / total_allowed2
|
||||
ratio2 = max(0.0, min(1.0, ratio2))
|
||||
reduced2: Dict[str, str] = {}
|
||||
for label, content in effective_placeholders.items():
|
||||
if label in allowed_labels and isinstance(content, str) and len(content) > 0:
|
||||
old_len = len(content)
|
||||
reduction_factor = ratio2 if old_len > 0 else 1.0
|
||||
reduced2[label] = self._reduceText(content, reduction_factor)
|
||||
else:
|
||||
reduced2[label] = content
|
||||
effective_placeholders = reduced2
|
||||
full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders)
|
||||
|
||||
|
||||
# Make AI call using AiObjects (let it handle model selection)
|
||||
request = AiCallRequest(
|
||||
prompt=full_prompt,
|
||||
context="", # Context is already included in the prompt
|
||||
options=options
|
||||
)
|
||||
response = await self.aiObjects.call(request)
|
||||
try:
|
||||
logger.debug(f"AI model selected (planning): {getattr(response, 'modelName', 'unknown')}")
|
||||
except Exception:
|
||||
pass
|
||||
return response.content
|
||||
|
||||
async def _callAiDirect(
|
||||
self,
|
||||
prompt: str,
|
||||
documents: Optional[List[ChatDocument]],
|
||||
options: AiCallOptions,
|
||||
documentProcessor=None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Call AI directly with prompt and documents for JSON output.
|
||||
Used for multi-file generation - uses the existing generation pipeline.
|
||||
"""
|
||||
# Use the existing generation pipeline that already works
|
||||
# This ensures proper document processing and content extraction
|
||||
logger.info(f"Using existing generation pipeline for {len(documents) if documents else 0} documents")
|
||||
|
||||
if documentProcessor:
|
||||
# Process documents with JSON merging using the existing pipeline
|
||||
result = await documentProcessor.processDocumentsPerChunkJson(documents, prompt, options)
|
||||
else:
|
||||
# Fallback to simple AI call
|
||||
request = AiCallRequest(
|
||||
prompt=prompt,
|
||||
context="",
|
||||
options=options
|
||||
)
|
||||
response = await self.aiObjects.call(request)
|
||||
result = {"metadata": {"title": "AI Response"}, "sections": [{"id": "section_1", "content_type": "paragraph", "elements": [{"text": response.content}]}]}
|
||||
|
||||
# Convert single-file result to multi-file format if needed
|
||||
if "sections" in result and "documents" not in result:
|
||||
logger.info("Converting single-file result to multi-file format")
|
||||
# This is a single-file result, convert it to multi-file format
|
||||
return {
|
||||
"metadata": result.get("metadata", {"title": "Converted Document"}),
|
||||
"documents": [{
|
||||
"id": "doc_1",
|
||||
"title": result.get("metadata", {}).get("title", "Document"),
|
||||
"filename": "document.txt",
|
||||
"sections": result.get("sections", [])
|
||||
}]
|
||||
}
|
||||
|
||||
return result
|
||||
|
||||
def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List[ChatDocument]], options: AiCallOptions) -> Dict[str, int]:
|
||||
"""
|
||||
Get model capabilities for content processing, including appropriate size limits for chunking.
|
||||
"""
|
||||
# Estimate total content size
|
||||
prompt_size = len(prompt.encode('utf-8'))
|
||||
document_size = 0
|
||||
if documents:
|
||||
# Rough estimate of document content size
|
||||
for doc in documents:
|
||||
document_size += doc.fileSize or 0
|
||||
|
||||
total_size = prompt_size + document_size
|
||||
|
||||
# Use AiObjects to select the best model for this content size
|
||||
# We'll simulate the model selection by checking available models
|
||||
from modules.interfaces.interfaceAiObjects import aiModels
|
||||
|
||||
# Find the best model for this content size and operation
|
||||
best_model = None
|
||||
best_context_length = 0
|
||||
|
||||
for model_name, model_info in aiModels.items():
|
||||
context_length = model_info.get("contextLength", 0)
|
||||
|
||||
# Skip models with no context length or too small for content
|
||||
if context_length == 0:
|
||||
continue
|
||||
|
||||
# Check if model supports the operation type
|
||||
capabilities = model_info.get("capabilities", [])
|
||||
if options.operationType == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities:
|
||||
continue
|
||||
elif options.operationType == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities:
|
||||
continue
|
||||
elif options.operationType == OperationType.WEB_RESEARCH and "web_search" not in capabilities:
|
||||
continue
|
||||
elif "text_generation" not in capabilities:
|
||||
continue
|
||||
|
||||
# Prefer models that can handle the content without chunking, but allow chunking if needed
|
||||
if context_length >= total_size * 0.8: # 80% of content size
|
||||
if context_length > best_context_length:
|
||||
best_model = model_info
|
||||
best_context_length = context_length
|
||||
elif best_model is None: # Fallback to largest available model
|
||||
if context_length > best_context_length:
|
||||
best_model = model_info
|
||||
best_context_length = context_length
|
||||
|
||||
# Fallback to a reasonable default if no model found
|
||||
if best_model is None:
|
||||
best_model = {
|
||||
"contextLength": 128000, # GPT-4o default
|
||||
"llmName": "gpt-4o"
|
||||
}
|
||||
|
||||
# Calculate appropriate sizes
|
||||
# Convert tokens to bytes (rough estimate: 1 token ≈ 4 characters)
|
||||
context_length_bytes = int(best_model["contextLength"] * 4)
|
||||
max_context_bytes = int(context_length_bytes * 0.9) # 90% of context length
|
||||
text_chunk_size = int(max_context_bytes * 0.7) # 70% of max context for text chunks
|
||||
image_chunk_size = int(max_context_bytes * 0.8) # 80% of max context for image chunks
|
||||
|
||||
logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}")
|
||||
logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes")
|
||||
logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes")
|
||||
|
||||
return {
|
||||
"maxContextBytes": max_context_bytes,
|
||||
"textChunkSize": text_chunk_size,
|
||||
"imageChunkSize": image_chunk_size
|
||||
}
|
||||
|
||||
def _getModelsForOperation(self, operation_type: str, options: AiCallOptions) -> List[ModelCapabilities]:
|
||||
"""
|
||||
Get models capable of handling the specific operation with capability filtering.
|
||||
"""
|
||||
# Use the actual AI objects model selection instead of hardcoded default
|
||||
if hasattr(self, 'aiObjects') and self.aiObjects:
|
||||
# Let AiObjects handle the model selection
|
||||
return []
|
||||
else:
|
||||
# Fallback to default model if AiObjects not available
|
||||
default_model = ModelCapabilities(
|
||||
name="default",
|
||||
maxTokens=4000,
|
||||
capabilities=["text", "reasoning"] if operation_type == "planning" else ["text"],
|
||||
costPerToken=0.001,
|
||||
processingTime=1.0,
|
||||
isAvailable=True
|
||||
)
|
||||
return [default_model]
|
||||
|
||||
def _buildPromptWithPlaceholders(self, prompt: str, placeholders: Optional[Dict[str, str]]) -> str:
|
||||
"""
|
||||
Build full prompt by replacing placeholders with their content.
|
||||
Uses the new {{KEY:placeholder}} format.
|
||||
"""
|
||||
if not placeholders:
|
||||
return prompt
|
||||
|
||||
full_prompt = prompt
|
||||
for placeholder, content in placeholders.items():
|
||||
# Replace both old format {{placeholder}} and new format {{KEY:placeholder}}
|
||||
full_prompt = full_prompt.replace(f"{{{{{placeholder}}}}}", content)
|
||||
full_prompt = full_prompt.replace(f"{{{{KEY:{placeholder}}}}}", content)
|
||||
|
||||
return full_prompt
|
||||
|
||||
    def _writeAiResponseDebug(self, label: str, content: str, partIndex: int = 1, modelName: str = None, continuation: bool = None) -> None:
        """Persist raw AI response parts for debugging under test-chat/ai - only if debug enabled.

        Args:
            label: Logical name embedded in the output filename.
            content: Raw response text to persist (None is written as '').
            partIndex: 1-based part number for multi-part responses; included in the filename.
            modelName: Optional model identifier; sanitized and appended to the filename.
            continuation: Optional flag recorded in the filename as ``cont_true``/``cont_false``.

        Best-effort: every failure is swallowed so debug output can never break the caller.
        """
        try:
            # Check if debug logging is enabled
            debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
            if not debug_enabled:
                return

            import os
            from datetime import datetime, UTC
            # Base dir: gateway/test-chat/ai (go up 4 levels from this file)
            # .../gateway/modules/services/serviceAi/subCoreAi.py -> up to gateway root
            gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
            outDir = os.path.join(gatewayDir, 'test-chat', 'ai')
            os.makedirs(outDir, exist_ok=True)
            # Millisecond-resolution UTC timestamp keeps filenames unique and sortable.
            ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3]
            suffix = []
            if partIndex is not None:
                suffix.append(f"part{partIndex}")
            if continuation is not None:
                suffix.append(f"cont_{str(continuation).lower()}")
            if modelName:
                # Sanitize the model name so it is safe to embed in a filename.
                safeModel = ''.join(c if c.isalnum() or c in ('-', '_') else '-' for c in modelName)
                suffix.append(safeModel)
            suffixStr = ('_' + '_'.join(suffix)) if suffix else ''
            fname = f"{ts}_{label}{suffixStr}.txt"
            fpath = os.path.join(outDir, fname)
            with open(fpath, 'w', encoding='utf-8') as f:
                f.write(content or '')
        except Exception:
            # Do not raise; best-effort debug write
            pass
|
||||
|
||||
def _exceedsTokenLimit(self, text: str, model: ModelCapabilities, safety_margin: float) -> bool:
|
||||
"""
|
||||
Check if text exceeds model token limit with safety margin.
|
||||
"""
|
||||
# Simple character-based estimation (4 chars per token)
|
||||
estimated_tokens = len(text) // 4
|
||||
max_tokens = int(model.maxTokens * (1 - safety_margin))
|
||||
return estimated_tokens > max_tokens
|
||||
|
||||
def _reducePlanningPrompt(
|
||||
self,
|
||||
full_prompt: str,
|
||||
placeholders: Optional[Dict[str, str]],
|
||||
model: ModelCapabilities,
|
||||
options: AiCallOptions
|
||||
) -> str:
|
||||
"""
|
||||
Reduce planning prompt size by summarizing placeholders while preserving prompt structure.
|
||||
"""
|
||||
if not placeholders:
|
||||
return self._reduceText(full_prompt, 0.7)
|
||||
|
||||
# Reduce placeholders while preserving prompt
|
||||
reduced_placeholders = {}
|
||||
for placeholder, content in placeholders.items():
|
||||
if len(content) > 1000: # Only reduce long content
|
||||
reduction_factor = 0.7
|
||||
reduced_content = self._reduceText(content, reduction_factor)
|
||||
reduced_placeholders[placeholder] = reduced_content
|
||||
else:
|
||||
reduced_placeholders[placeholder] = content
|
||||
|
||||
return self._buildPromptWithPlaceholders(full_prompt, reduced_placeholders)
|
||||
|
||||
def _reduceTextPrompt(
|
||||
self,
|
||||
prompt: str,
|
||||
context: str,
|
||||
model: ModelCapabilities,
|
||||
options: AiCallOptions
|
||||
) -> str:
|
||||
"""
|
||||
Reduce text prompt size using typeGroup-aware chunking and merging.
|
||||
"""
|
||||
max_size = int(model.maxTokens * (1 - options.safetyMargin))
|
||||
|
||||
if options.compressPrompt:
|
||||
# Reduce both prompt and context
|
||||
target_size = max_size
|
||||
current_size = len(prompt) + len(context)
|
||||
reduction_factor = (target_size * 0.7) / current_size
|
||||
|
||||
if reduction_factor < 1.0:
|
||||
prompt = self._reduceText(prompt, reduction_factor)
|
||||
context = self._reduceText(context, reduction_factor)
|
||||
else:
|
||||
# Only reduce context, preserve prompt integrity
|
||||
max_context_size = max_size - len(prompt)
|
||||
if len(context) > max_context_size:
|
||||
reduction_factor = max_context_size / len(context)
|
||||
context = self._reduceText(context, reduction_factor)
|
||||
|
||||
return prompt + "\n\n" + context if context else prompt
|
||||
|
||||
def _extractTextFromContentParts(self, extracted_content) -> str:
|
||||
"""
|
||||
Extract text content from ExtractionService ContentPart objects.
|
||||
"""
|
||||
if not extracted_content or not hasattr(extracted_content, 'parts'):
|
||||
return ""
|
||||
|
||||
text_parts = []
|
||||
for part in extracted_content.parts:
|
||||
if hasattr(part, 'typeGroup') and part.typeGroup in ['text', 'table', 'structure']:
|
||||
if hasattr(part, 'data') and part.data:
|
||||
text_parts.append(part.data)
|
||||
|
||||
return "\n\n".join(text_parts)
|
||||
|
||||
def _reduceText(self, text: str, reduction_factor: float) -> str:
|
||||
"""
|
||||
Reduce text size by the specified factor.
|
||||
"""
|
||||
if reduction_factor >= 1.0:
|
||||
return text
|
||||
|
||||
target_length = int(len(text) * reduction_factor)
|
||||
return text[:target_length] + "... [reduced]"
|
||||
804
modules/services/serviceAi/subDocumentGeneration.py
Normal file
804
modules/services/serviceAi/subDocumentGeneration.py
Normal file
|
|
@ -0,0 +1,804 @@
|
|||
import logging
|
||||
from typing import Dict, Any, List, Optional, Tuple, Union
|
||||
from modules.datamodels.datamodelChat import ChatDocument
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SubDocumentGeneration:
|
||||
"""Document generation operations including single-file and multi-file generation."""
|
||||
|
||||
    def __init__(self, services, aiObjects, documentProcessor):
        """Initialize document generation service.

        Args:
            services: Service center instance for accessing other services
            aiObjects: Initialized AiObjects instance
            documentProcessor: Document processing service instance
        """
        # Service locator used for config access and sibling services.
        self.services = services
        # Pre-initialized AI caller used for enhancement/analysis calls.
        self.aiObjects = aiObjects
        # Pipeline that chunks documents and merges per-chunk JSON results.
        self.documentProcessor = documentProcessor
|
||||
|
||||
async def callAiWithDocumentGeneration(
|
||||
self,
|
||||
prompt: str,
|
||||
documents: Optional[List[ChatDocument]],
|
||||
options: AiCallOptions,
|
||||
outputFormat: str,
|
||||
title: Optional[str]
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Handle AI calls with document generation in specific output format.
|
||||
Now supports both single-file and multi-file generation.
|
||||
|
||||
Args:
|
||||
prompt: The main prompt for the AI call
|
||||
documents: Optional list of documents to process
|
||||
options: AI call configuration options
|
||||
outputFormat: Target output format (html, pdf, docx, txt, md, json, csv, xlsx)
|
||||
title: Optional title for generated documents
|
||||
|
||||
Returns:
|
||||
Dict with generated documents and metadata
|
||||
"""
|
||||
try:
|
||||
# Use AI to analyze prompt intent
|
||||
prompt_analysis = await self._analyzePromptIntent(prompt, self)
|
||||
logger.info(f"Prompt analysis result: {prompt_analysis}")
|
||||
|
||||
if prompt_analysis.get("is_multi_file", False):
|
||||
return await self._callAiWithMultiFileGeneration(
|
||||
prompt, documents, options, outputFormat, title, prompt_analysis
|
||||
)
|
||||
else:
|
||||
return await self._callAiWithSingleFileGeneration(
|
||||
prompt, documents, options, outputFormat, title
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in document generation: {str(e)}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"content": "",
|
||||
"rendered_content": "",
|
||||
"mime_type": "text/plain",
|
||||
"filename": f"error_{outputFormat}",
|
||||
"format": outputFormat,
|
||||
"title": title or "Error",
|
||||
"documents": []
|
||||
}
|
||||
|
||||
    async def _callAiWithSingleFileGeneration(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions,
        outputFormat: str,
        title: Optional[str],
        generationPrompt: Optional[str] = None
    ) -> Dict[str, Any]:
        """Handle single-file document generation (existing functionality).

        Pipeline: build a format-specific extraction prompt, run the per-chunk
        JSON pipeline over the documents, optionally let the AI enhance the
        extracted JSON, then render it to *outputFormat* and wrap the result
        in the standard generation payload.

        Args:
            prompt: The user's generation request.
            documents: Optional source documents fed to the extraction pipeline.
            options: AI call configuration options.
            outputFormat: Target format/extension (html, pdf, docx, ...).
            title: Optional document title; defaulted when missing.
            generationPrompt: NOTE(review): accepted but never read - the local
                name is overwritten by getGenerationPrompt() below; confirm
                callers before removing the parameter.

        Returns:
            Dict with the structured JSON content, rendered content, mime type,
            filename and a one-element "documents" list.

        Raises:
            Exception: re-raised after logging when any stage fails.
        """
        try:
            # Get format-specific extraction prompt from generation service
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generation_service = GenerationService(self.services)

            # Use default title if not provided
            if not title:
                title = "AI Generated Document"

            # Get format-specific extraction prompt
            extractionPrompt = await generation_service.getExtractionPrompt(
                outputFormat=outputFormat,
                userPrompt=prompt,
                title=title,
                aiService=self
            )

            # Process documents with format-specific prompt using JSON mode
            # This ensures structured JSON output instead of text
            aiResponseJson = await self._callAiJson(extractionPrompt, documents, options)

            # Validate JSON response
            if not isinstance(aiResponseJson, dict) or "sections" not in aiResponseJson:
                raise Exception("AI response is not valid JSON document structure")

            # Emit raw extracted data as a chat message attachment before rendering
            try:
                await self._postRawDataChatMessage(aiResponseJson, label="raw_extraction_single")
            except Exception:
                logger.warning("Failed to emit raw extraction chat message (single-file)")

            # Generate filename from document metadata
            parsedFilename = None
            try:
                if aiResponseJson.get("metadata", {}).get("title"):
                    # Prefer the AI-provided title for both display and filename.
                    title = aiResponseJson["metadata"]["title"]
                    # Clean title for filename
                    import re
                    parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", title)
                    parsed = re.sub(r"-+", "-", parsed).strip('-')
                    if parsed:
                        parsedFilename = f"{parsed}.{outputFormat}"
            except Exception:
                parsedFilename = None

            # Use AI generation to enhance the extracted JSON before rendering
            enhancedContent = aiResponseJson  # Default to original
            if prompt:
                try:
                    from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

                    # Get generation prompt
                    generationPrompt = await generation_service.getGenerationPrompt(
                        outputFormat=outputFormat,
                        userPrompt=prompt,
                        title=title,
                        aiService=self
                    )

                    # Prepare the AI call
                    request_options = AiCallOptions()
                    request_options.operationType = OperationType.GENERAL

                    # Create context with the extracted JSON content
                    import json
                    context = f"Extracted JSON content:\n{json.dumps(aiResponseJson, indent=2)}"

                    request = AiCallRequest(
                        prompt=generationPrompt,
                        context=context,
                        options=request_options
                    )

                    # Call AI to enhance the content
                    response = await self.aiObjects.call(request)

                    if response and response.content:
                        # Parse the AI response as JSON
                        try:
                            import re
                            result = response.content.strip()

                            # Extract JSON from markdown if present
                            json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
                            if json_match:
                                result = json_match.group(1).strip()
                            elif result.startswith('```json'):
                                result = re.sub(r'^```json\s*', '', result)
                                result = re.sub(r'\s*```$', '', result)
                            elif result.startswith('```'):
                                result = re.sub(r'^```\s*', '', result)
                                result = re.sub(r'\s*```$', '', result)

                            # Try to parse JSON
                            enhancedContent = json.loads(result)
                            logger.info(f"AI enhanced JSON content successfully")

                        except json.JSONDecodeError as e:
                            # Enhancement is best-effort: fall back to the raw extraction.
                            logger.warning(f"AI generation returned invalid JSON: {str(e)}, using original content")
                            enhancedContent = aiResponseJson
                    else:
                        logger.warning("AI generation returned empty response, using original content")
                        enhancedContent = aiResponseJson

                except Exception as e:
                    logger.warning(f"AI generation failed: {str(e)}, using original content")
                    enhancedContent = aiResponseJson

            # Render the enhanced JSON content
            renderedContent, mimeType = await generation_service.renderReport(
                extractedContent=enhancedContent,
                outputFormat=outputFormat,
                title=title,
                userPrompt=prompt,
                aiService=self
            )

            # Generate meaningful filename (use AI-provided if valid, else fallback)
            from datetime import datetime, UTC
            timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            if parsedFilename and parsedFilename.lower().endswith(f".{outputFormat.lower()}"):
                filename = parsedFilename
            else:
                safeTitle = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
                filename = f"{safeTitle or 'document'}-{timestamp}.{outputFormat}"

            # Return structured result with document information
            return {
                "success": True,
                "content": aiResponseJson,  # Structured JSON document
                "rendered_content": renderedContent,  # Formatted content
                "mime_type": mimeType,
                "filename": filename,
                "format": outputFormat,
                "title": title,
                "documents": [{
                    "documentName": filename,
                    "documentData": renderedContent,
                    "mimeType": mimeType
                }],
                "is_multi_file": False
            }

        except Exception as e:
            logger.error(f"Error in single-file document generation: {str(e)}")
            raise
|
||||
|
||||
    async def _callAiWithMultiFileGeneration(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions,
        outputFormat: str,
        title: Optional[str],
        prompt_analysis: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Handle multi-file document generation using AI analysis.

        Pipeline: build an adaptive extraction prompt from *prompt_analysis*,
        run the per-chunk JSON pipeline, validate the multi-file structure
        (falling back to single-file generation on mismatch or on any error),
        then for each returned document: normalize its sections, optionally
        AI-enhance the JSON (with plain and AI-powered JSON repair fallbacks),
        render it to *outputFormat* and collect the rendered file.

        Args mirror _callAiWithSingleFileGeneration plus *prompt_analysis*
        (output of _analyzePromptIntent). The result carries one entry per
        generated document under "documents"; per-file fields
        (filename/mime_type/rendered_content) are None.
        """
        try:
            # Get multi-file extraction prompt based on AI analysis
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generation_service = GenerationService(self.services)

            # Use default title if not provided
            if not title:
                title = "AI Generated Documents"

            # Get adaptive extraction prompt
            extraction_prompt = await generation_service.getAdaptiveExtractionPrompt(
                outputFormat=outputFormat,
                userPrompt=prompt,
                title=title,
                promptAnalysis=prompt_analysis,
                aiService=self
            )

            logger.info(f"Adaptive extraction prompt length: {len(extraction_prompt)} characters")
            logger.debug(f"Adaptive extraction prompt preview: {extraction_prompt[:500]}...")

            # Process with adaptive JSON schema - use the existing pipeline but with adaptive prompt
            logger.info(f"Using adaptive prompt with existing pipeline: {len(extraction_prompt)} chars")
            logger.debug(f"Processing documents: {len(documents) if documents else 0} documents")

            # Use the existing pipeline but replace the prompt with our adaptive one
            # This ensures proper document processing while using the multi-file prompt
            ai_response = await self.documentProcessor.processDocumentsPerChunkJsonWithPrompt(documents, extraction_prompt, options)

            logger.info(f"AI response type: {type(ai_response)}")
            logger.info(f"AI response keys: {list(ai_response.keys()) if isinstance(ai_response, dict) else 'Not a dict'}")
            logger.debug(f"AI response preview: {str(ai_response)[:500]}...")

            # Validate response structure
            if not self._validateResponseStructure(ai_response, prompt_analysis):
                # Fallback to single-file if multi-file fails
                logger.warning(f"Multi-file processing failed - Invalid response structure. Expected multi-file but got: {list(ai_response.keys()) if isinstance(ai_response, dict) else type(ai_response)}")
                logger.warning(f"Prompt analysis: {prompt_analysis}")
                logger.warning("Falling back to single-file generation")
                return await self._callAiWithSingleFileGeneration(
                    prompt, documents, options, outputFormat, title
                )

            # Emit raw extracted data as a chat message attachment before transformation/rendering
            try:
                await self._postRawDataChatMessage(ai_response, label="raw_extraction_multi")
            except Exception:
                logger.warning("Failed to emit raw extraction chat message (multi-file)")

            # Process multiple documents
            generated_documents = []
            for i, doc_data in enumerate(ai_response.get("documents", [])):
                # Transform AI-generated sections to renderer-compatible format
                transformed_sections = []
                for section in doc_data.get("sections", []):
                    # Convert AI format to renderer format
                    transformed_section = {
                        "id": section.get("id", f"section_{len(transformed_sections) + 1}"),
                        "content_type": section.get("content_type", "paragraph"),
                        "elements": section.get("elements", []),
                        "order": section.get("order", len(transformed_sections) + 1)
                    }

                    # Extract text from elements for simple text-based sections
                    if section.get("content_type") in ["paragraph", "heading"]:
                        text_parts = []
                        for element in section.get("elements", []):
                            if "text" in element:
                                text_parts.append(element["text"])
                        # Add text to the first element or create a new one
                        if transformed_section["elements"]:
                            transformed_section["elements"][0]["text"] = "\n".join(text_parts)
                        else:
                            transformed_section["elements"] = [{"text": "\n".join(text_parts)}]

                    transformed_sections.append(transformed_section)

                # Create complete document structure for rendering
                # NOTE(review): doc_data["title"] is accessed without a default -
                # a document entry missing "title" raises KeyError and trips the
                # outer single-file fallback; confirm the schema guarantees it.
                complete_document = {
                    "metadata": {
                        "title": doc_data["title"],
                        "source_document": "multi_file_generation",
                        "document_id": doc_data.get("id", f"doc_{i+1}"),
                        "filename": doc_data.get("filename", f"document_{i+1}"),
                        "split_strategy": prompt_analysis.get("strategy", "custom")
                    },
                    "sections": transformed_sections,
                    "summary": f"Generated document: {doc_data['title']}",
                    "tags": ["multi_file", "ai_generated"]
                }

                # Use AI generation to enhance the extracted JSON before rendering
                enhancedContent = complete_document  # Default to original
                if prompt:
                    try:
                        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

                        # Get generation prompt
                        generationPrompt = await generation_service.getGenerationPrompt(
                            outputFormat=outputFormat,
                            userPrompt=prompt,
                            title=doc_data["title"],
                            aiService=self
                        )

                        # Prepare the AI call
                        request_options = AiCallOptions()
                        request_options.operationType = OperationType.GENERAL

                        # Create context with the extracted JSON content
                        import json
                        context = f"Extracted JSON content:\n{json.dumps(complete_document, indent=2)}"

                        request = AiCallRequest(
                            prompt=generationPrompt,
                            context=context,
                            options=request_options
                        )

                        # Call AI to enhance the content
                        response = await self.aiObjects.call(request)

                        if response and response.content:
                            # Parse the AI response as JSON
                            try:
                                import re
                                result = response.content.strip()

                                # Extract JSON from markdown if present
                                json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
                                if json_match:
                                    result = json_match.group(1).strip()
                                elif result.startswith('```json'):
                                    result = re.sub(r'^```json\s*', '', result)
                                    result = re.sub(r'\s*```$', '', result)
                                elif result.startswith('```'):
                                    result = re.sub(r'^```\s*', '', result)
                                    result = re.sub(r'\s*```$', '', result)

                                # Try to parse JSON
                                enhancedContent = json.loads(result)
                                logger.info(f"AI enhanced JSON content successfully")

                            except json.JSONDecodeError as e:
                                logger.warning(f"AI generation returned invalid JSON: {str(e)}, attempting to repair...")
                                # Try to repair common JSON issues
                                try:
                                    repaired_result = self._repairJson(result)
                                    enhancedContent = json.loads(repaired_result)
                                    logger.info(f"Successfully repaired JSON content")
                                # NOTE(review): Exception already subsumes JSONDecodeError here.
                                except (json.JSONDecodeError, Exception) as repair_error:
                                    logger.warning(f"JSON repair failed: {str(repair_error)}, trying AI repair...")
                                    # Try AI-powered JSON repair as last resort
                                    try:
                                        ai_repaired = await self._repairJsonWithAI(result)
                                        enhancedContent = json.loads(ai_repaired)
                                        logger.info(f"AI successfully repaired JSON content")
                                    except Exception as ai_repair_error:
                                        logger.warning(f"AI JSON repair also failed: {str(ai_repair_error)}, using original content")
                                        enhancedContent = complete_document
                        else:
                            logger.warning("AI generation returned empty response, using original content")
                            enhancedContent = complete_document

                    except Exception as e:
                        logger.warning(f"AI generation failed: {str(e)}, using original content")
                        enhancedContent = complete_document

                # Render the enhanced JSON content
                rendered_content, mime_type = await generation_service.renderReport(
                    extractedContent=enhancedContent,
                    outputFormat=outputFormat,
                    title=doc_data["title"],
                    userPrompt=prompt,
                    aiService=self
                )

                # Generate proper filename with correct extension
                base_filename = doc_data.get("filename", f"document_{i+1}")
                # Remove any existing extension and add the correct one
                if '.' in base_filename:
                    base_filename = base_filename.rsplit('.', 1)[0]

                # Add proper extension based on output format
                if outputFormat.lower() == "docx":
                    filename = f"{base_filename}.docx"
                elif outputFormat.lower() == "pdf":
                    filename = f"{base_filename}.pdf"
                elif outputFormat.lower() == "html":
                    filename = f"{base_filename}.html"
                else:
                    filename = f"{base_filename}.{outputFormat}"

                generated_documents.append({
                    "documentName": filename,
                    "documentData": rendered_content,
                    "mimeType": mime_type
                })

            # Save debug files for multi-file generation - only if debug enabled
            debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
            if debug_enabled:
                try:
                    import os
                    from datetime import datetime, UTC
                    ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
                    debug_root = "./test-chat/ai"
                    debug_dir = os.path.join(debug_root, f"multifile_output_{ts}")
                    os.makedirs(debug_dir, exist_ok=True)

                    # Save metadata
                    with open(os.path.join(debug_dir, "metadata.txt"), "w", encoding="utf-8") as f:
                        f.write(f"title: {title}\n")
                        f.write(f"format: {outputFormat}\n")
                        f.write(f"documents_count: {len(generated_documents)}\n")
                        f.write(f"split_strategy: {prompt_analysis.get('strategy', 'custom')}\n")
                        f.write(f"prompt_analysis: {prompt_analysis}\n")

                    # Save each generated document
                    # NOTE(review): this loop reuses the names i/doc_data from the
                    # generation loop above; harmless now (that loop has finished)
                    # but fragile if code is reordered.
                    for i, doc in enumerate(generated_documents):
                        doc_filename = doc["documentName"]
                        doc_data = doc["documentData"]
                        doc_mime = doc["mimeType"]

                        # Determine file extension
                        if outputFormat.lower() == "docx":
                            file_ext = ".docx"
                        elif outputFormat.lower() == "pdf":
                            file_ext = ".pdf"
                        elif outputFormat.lower() == "html":
                            file_ext = ".html"
                        else:
                            file_ext = f".{outputFormat}"

                        # Save the rendered document
                        output_path = os.path.join(debug_dir, f"document_{i+1}_{doc_filename}")

                        if file_ext in ['.md', '.txt', '.html', '.json', '.csv']:
                            # Text-based formats
                            with open(output_path, 'w', encoding='utf-8') as f:
                                f.write(doc_data)
                        else:
                            # Binary formats - decode from base64 if needed
                            try:
                                import base64
                                doc_bytes = base64.b64decode(doc_data)
                                with open(output_path, 'wb') as f:
                                    f.write(doc_bytes)
                            except Exception:
                                # If not base64, save as text
                                with open(output_path, 'w', encoding='utf-8') as f:
                                    f.write(doc_data)

                        logger.info(f"💾 Debug: Saved multi-file document {i+1}: {output_path}")

                    logger.info(f"💾 Debug: Multi-file output saved to: {debug_dir}")

                except Exception as e:
                    logger.warning(f"Failed to save multi-file debug output: {e}")

            return {
                "success": True,
                "content": ai_response,
                "rendered_content": None,  # Not applicable for multi-file
                "mime_type": None,  # Not applicable for multi-file
                "filename": None,  # Not applicable for multi-file
                "format": outputFormat,
                "title": title,
                "documents": generated_documents,
                "is_multi_file": True,
                "split_strategy": prompt_analysis.get("strategy", "custom")
            }

        except Exception as e:
            logger.error(f"Error in multi-file document generation: {str(e)}")
            # Fallback to single-file
            return await self._callAiWithSingleFileGeneration(
                prompt, documents, options, outputFormat, title
            )
|
||||
|
||||
async def _callAiJson(
|
||||
self,
|
||||
prompt: str,
|
||||
documents: Optional[List[ChatDocument]],
|
||||
options: AiCallOptions
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Handle AI calls with document processing for JSON output.
|
||||
Returns structured JSON document instead of text.
|
||||
"""
|
||||
# Process documents with JSON merging
|
||||
return await self.documentProcessor.processDocumentsPerChunkJson(documents, prompt, options)
|
||||
|
||||
    async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]:
        """Use AI to analyze user prompt and determine processing requirements.

        Args:
            prompt: The user's original request, embedded verbatim in the
                analysis prompt.
            ai_service: Object exposing ``aiObjects.call``; when None the
                single-file default is returned without an AI call.

        Returns:
            Dict with at least "is_multi_file", "strategy" and "criteria";
            on any failure the safe single-file default is returned.
        """
        if not ai_service:
            return {"is_multi_file": False, "strategy": "single", "criteria": None}

        try:
            # Runtime prompt text - the doubled braces render as literal { }.
            analysis_prompt = f"""
Analyze this user request and determine if it requires multiple file output or single file output.

User request: "{prompt}"

Respond with JSON only in this exact format:
{{
"is_multi_file": true/false,
"strategy": "single|per_entity|by_section|by_criteria|custom",
"criteria": "description of how to split content",
"file_naming_pattern": "suggested pattern for filenames",
"reasoning": "brief explanation of the analysis"
}}

Consider:
- Does the user want separate files for different entities (customers, products, etc.)?
- Does the user want to split content into multiple documents?
- What would be the most logical way to organize the content?
- What language is the request in? (analyze in the original language)

Return only the JSON response.
"""

            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL

            request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
            response = await ai_service.aiObjects.call(request)

            if response and response.content:
                import json
                import re

                # Extract JSON from response
                # Greedy match grabs from the first '{' to the last '}'.
                result = response.content.strip()
                json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)

                analysis = json.loads(result)
                return analysis
            else:
                return {"is_multi_file": False, "strategy": "single", "criteria": None}

        except Exception as e:
            logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file")
            return {"is_multi_file": False, "strategy": "single", "criteria": None}
|
||||
|
||||
def _validateResponseStructure(self, response: Dict[str, Any], prompt_analysis: Dict[str, Any]) -> bool:
    """Check that an AI response dict matches the structure the prompt analysis expects.

    Multi-file prompts require a ``documents`` list in the response;
    single-file prompts require a ``sections`` list.

    Args:
        response: Parsed AI response payload.
        prompt_analysis: Analysis dict carrying the ``is_multi_file`` flag.

    Returns:
        True when the expected key is present and holds a list, else False.
    """
    try:
        # Anything that is not a dict can satisfy neither structure.
        if not isinstance(response, dict):
            logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}")
            return False

        if prompt_analysis.get("is_multi_file", False):
            docs_present = "documents" in response
            docs_are_list = isinstance(response.get("documents"), list)
            logger.info(f"Multi-file validation: has_documents={docs_present}, is_documents_list={docs_are_list}")
            if docs_present and docs_are_list:
                logger.info(f"Multi-file validation passed: {len(response['documents'])} documents found")
            else:
                logger.warning(f"Multi-file validation failed: documents key present={docs_present}, documents is list={docs_are_list}")
                logger.warning(f"Available keys: {list(response.keys())}")
            return docs_present and docs_are_list

        # Single-file path: a "sections" list is required instead.
        sections_present = "sections" in response
        sections_are_list = isinstance(response.get("sections"), list)
        logger.info(f"Single-file validation: has_sections={sections_present}, is_sections_list={sections_are_list}")
        return sections_present and sections_are_list
    except Exception as e:
        # Validation must never raise into the caller; treat errors as invalid.
        logger.warning(f"Response validation failed with exception: {str(e)}")
        return False
|
||||
async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None:
    """
    Create a ChatMessage with the extracted raw JSON attached as a file so the user
    has access to the data even if downstream processing fails.

    Args:
        payload: JSON-serializable data to persist (dict/list/str/...).
        label: Prefix used for the generated filename and the message's
            documentsLabel field.

    Side effects:
        Stores a file plus its data via interfaceDbComponent, creates a chat
        message on the current workflow, and links a ChatDocument to it.
        All failures are swallowed -- this is a best-effort safety net.
    """
    try:
        services = self.services
        workflow = services.currentWorkflow

        # Serialize payload; the aliased import avoids clashing with any local 'json'
        import json as _json
        from datetime import datetime, UTC
        ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
        content_text = _json.dumps(payload, ensure_ascii=False, indent=2)
        content_bytes = content_text.encode('utf-8')

        # Store as file via component storage
        file_name = f"{label}_{ts}.json"
        file_item = services.interfaceDbComponent.createFile(
            name=file_name,
            mimeType="application/json",
            content=content_bytes
        )
        services.interfaceDbComponent.createFileData(file_item.id, content_bytes)

        # Lookup file info for ChatDocument; fall back to locally known values
        # when the lookup returns nothing
        file_info = services.workflow.getFileInfo(file_item.id)
        doc = ChatDocument(
            messageId="",  # set after message creation
            fileId=file_item.id,
            fileName=file_info.get("fileName", file_name) if file_info else file_name,
            fileSize=file_info.get("size", len(content_bytes)) if file_info else len(content_bytes),
            mimeType=file_info.get("mimeType", "application/json") if file_info else "application/json"
        )

        # Create message referencing the file
        messageData = {
            "workflowId": workflow.id,
            "role": "assistant",
            "message": "Raw extraction data saved",
            "status": "data",
            # Next position in the conversation; 'messages' may be absent or None
            "sequenceNr": len(getattr(workflow, 'messages', []) or []) + 1,
            "publishedAt": services.utils.getUtcTimestamp(),
            "documentsLabel": label,
            "documents": []
        }
        message = services.workflow.createMessage(messageData)
        if not message:
            return

        # Persist ChatDocument with the now-known messageId
        doc.messageId = message.id
        services.interfaceDbChat.createDocument(doc.to_dict())

        # Update message to include document
        try:
            if not message.documents:
                message.documents = []
            message.documents.append(doc)
            services.workflow.updateMessage(message.id, {"documents": [d.to_dict() for d in message.documents]})
        except Exception:
            # Linking is optional -- the document row already exists above
            pass
    except Exception:
        # Non-fatal; ignore if storage or chat creation fails
        return
||||
|
||||
def _repairJson(self, json_string: str) -> str:
    """Repair common JSON syntax errors efficiently for large JSON.

    Parses once to learn the specific json.JSONDecodeError, applies
    targeted regex fixes for that error class, then balances slightly-off
    brace/bracket counts.  Best effort: the (possibly still broken)
    string is returned either way so the caller can fall back to an
    AI-based repair.

    Args:
        json_string: Raw, potentially malformed JSON text.

    Returns:
        A repaired JSON string when possible, otherwise the partially
        repaired input.
    """
    try:
        import re
        import json

        # Remove any leading/trailing whitespace
        json_string = json_string.strip()

        # For large JSON, skip substring extraction and go straight to targeted repairs
        logger.info(f"Attempting JSON repair for {len(json_string)} characters...")

        # Try to parse first to see what specific error we get
        try:
            json.loads(json_string)
            return json_string  # Already valid
        except json.JSONDecodeError as e:
            error_msg = str(e)
            logger.info(f"JSON error: {error_msg}")

        # Apply targeted fixes based on the specific error
        if "Expecting ',' delimiter" in error_msg:
            # Fix missing commas between array elements
            json_string = re.sub(r'\]\s*\[', '], [', json_string)
            json_string = re.sub(r'\}\s*\{', '}, {', json_string)
            # Fix missing commas between object properties
            json_string = re.sub(r'("\s*:\s*[^,}]+)\s*(")', r'\1, \2', json_string)

        if "Expecting value" in error_msg:
            # Fix missing values (replace empty with null)
            json_string = re.sub(r':\s*,', ': null,', json_string)
            json_string = re.sub(r':\s*}', ': null}', json_string)

        if "Expecting property name" in error_msg:
            # Quote bare property names.  Anchor on a preceding '{' or ','
            # so colons inside string values (URLs like "https://...",
            # times like "12:30") are not mangled -- the previous pattern
            # r'(\w+):' rewrote those too and corrupted valid content.
            json_string = re.sub(r'([{,]\s*)([A-Za-z_]\w*)\s*:', r'\1"\2":', json_string)

        # Fix trailing commas before closing brackets/braces
        json_string = re.sub(r',(\s*[}\]])', r'\1', json_string)

        # Fix missing closing brackets/braces (only if reasonable)
        open_braces = json_string.count('{')
        close_braces = json_string.count('}')
        open_brackets = json_string.count('[')
        close_brackets = json_string.count(']')

        # Only add missing brackets if the difference is small (avoid runaway)
        if 0 < (open_braces - close_braces) <= 5:
            json_string += '}' * (open_braces - close_braces)

        if 0 < (open_brackets - close_brackets) <= 5:
            json_string += ']' * (open_brackets - close_brackets)

        # Try to parse again
        try:
            json.loads(json_string)
            logger.info("JSON repair successful")
            return json_string
        except json.JSONDecodeError:
            logger.warning("JSON repair failed - will try AI repair")
            return json_string

    except Exception as e:
        logger.warning(f"JSON repair failed: {str(e)}")
        return json_string
||||
|
||||
async def _repairJsonWithAI(self, malformed_json: str) -> str:
    """Use AI to repair malformed JSON efficiently for large files.

    Args:
        malformed_json: JSON text that failed to parse.

    Returns:
        The AI-repaired and locally validated JSON string, or the original
        input unchanged when the repair fails or yields invalid JSON.

    NOTE(review): inputs above 50KB are truncated before being sent to the
    AI, so the repaired result will be missing the tail of the original
    data -- confirm callers can tolerate that.
    """
    try:
        # Limit JSON size for AI processing (max 50KB to avoid token limits)
        max_json_size = 50000
        json_to_repair = malformed_json

        if len(malformed_json) > max_json_size:
            logger.warning(f"JSON too large ({len(malformed_json)} chars), truncating to {max_json_size} chars for AI repair")
            # Try to find a good truncation point (end of a complete object/array)
            # by scanning up to 1000 chars backwards for a closing brace/bracket
            truncate_at = max_json_size
            for i in range(max_json_size, max(0, max_json_size - 1000), -1):
                if malformed_json[i] in ['}', ']']:
                    truncate_at = i + 1
                    break
            json_to_repair = malformed_json[:truncate_at] + "..."

        repair_prompt = f"""
You are a JSON repair expert. Fix the following malformed JSON and return ONLY the corrected JSON, no explanations.

Malformed JSON:
{json_to_repair}

Return only the valid JSON:
"""

        # Use AI to repair the JSON
        repaired_json = await self.services.ai.callAi(
            prompt=repair_prompt,
            documents=None,
            options={
                "process_type": "text",
                "operation_type": "generate_content",
                "priority": "speed",
                "max_cost": 0.01
            }
        )

        # Clean up the response (remove any markdown formatting)
        repaired_json = repaired_json.strip()
        if repaired_json.startswith('```json'):
            repaired_json = repaired_json[7:]
        if repaired_json.endswith('```'):
            repaired_json = repaired_json[:-3]
        repaired_json = repaired_json.strip()

        # Validate the repaired JSON; json.loads raises on failure and we
        # fall through to return the original input
        import json
        json.loads(repaired_json)
        logger.info("AI JSON repair successful")
        return repaired_json

    except Exception as e:
        logger.warning(f"AI JSON repair failed: {str(e)}")
        return malformed_json
||||
1132
modules/services/serviceAi/subDocumentProcessing.py
Normal file
1132
modules/services/serviceAi/subDocumentProcessing.py
Normal file
File diff suppressed because it is too large
Load diff
316
modules/services/serviceAi/subUtilities.py
Normal file
316
modules/services/serviceAi/subUtilities.py
Normal file
|
|
@ -0,0 +1,316 @@
|
|||
import logging
|
||||
from typing import Dict, Any, List, Optional, Tuple, Union
|
||||
from modules.datamodels.datamodelAi import ModelCapabilities, AiCallOptions
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SubUtilities:
    """Utility helpers for text processing, debugging, and model-capability lookups."""

    def __init__(self, services):
        """Store the service center used to reach configuration and sibling services.

        Args:
            services: Service center instance for accessing other services
        """
        self.services = services
||||
def _writeTraceLog(self, contextText: str, data: Any) -> None:
    """Write raw data to the central trace log file without truncation.

    Args:
        contextText: Short label written before the data block.
        data: Payload to record; dicts/lists are pretty-printed as JSON,
            anything else is stringified.

    Side effects:
        Appends to ``log_trace.log`` inside the configured log directory.
        Writes nothing unless the module logger is at DEBUG level.
        All errors are swallowed (best-effort tracing).
    """
    try:
        import os
        import json
        from datetime import datetime, UTC
        # Only write if logger is in debug mode
        if logger.level > logging.DEBUG:
            return
        # Get log directory from configuration via service center if possible
        logDir = None
        try:
            logDir = self.services.utils.configGet("APP_LOGGING_LOG_DIR", "./")
        except Exception:
            pass
        if not logDir:
            logDir = "./"
        if not os.path.isabs(logDir):
            # Make it relative to gateway directory (four levels above this file)
            gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
            logDir = os.path.join(gatewayDir, logDir)
        os.makedirs(logDir, exist_ok=True)
        traceFile = os.path.join(logDir, "log_trace.log")
        # Millisecond-precision timestamp ([:-3] trims microseconds to ms)
        timestamp = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        traceEntry = f"[{timestamp}] {contextText}\n" + ("=" * 80) + "\n"
        if data is None:
            traceEntry += "No data provided\n"
        else:
            # Prefer exact text; if dict/list, pretty print JSON
            try:
                if isinstance(data, (dict, list)):
                    traceEntry += f"JSON Data:\n{json.dumps(data, indent=2, ensure_ascii=False)}\n"
                else:
                    text = str(data)
                    traceEntry += f"Text Data:\n{text}\n"
            except Exception:
                traceEntry += f"Data (fallback): {str(data)}\n"
        traceEntry += ("=" * 80) + "\n\n"
        with open(traceFile, "a", encoding="utf-8") as f:
            f.write(traceEntry)
    except Exception:
        # Swallow to avoid recursive logging issues
        pass
||||
|
||||
def _writeAiResponseDebug(self, label: str, content: str, partIndex: int = 1, modelName: str = None, continuation: bool = None) -> None:
    """Persist raw AI response parts for debugging under test-chat/ai - only if debug enabled.

    Args:
        label: Logical name included in the output filename.
        content: Raw AI response text to persist (empty string when falsy).
        partIndex: 1-based part index, recorded as ``partN`` in the filename.
        modelName: Optional model name, sanitized for filesystem use.
        continuation: Optional flag recorded as ``cont_true``/``cont_false``.

    Side effects:
        Writes one .txt file per call; errors are swallowed (best-effort).
    """
    try:
        # Check if debug logging is enabled
        debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
        if not debug_enabled:
            return

        import os
        from datetime import datetime, UTC
        # Base dir: gateway/test-chat/ai (go up 4 levels from this file)
        # .../gateway/modules/services/serviceAi/subUtilities.py -> up to gateway root
        gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
        outDir = os.path.join(gatewayDir, 'test-chat', 'ai')
        os.makedirs(outDir, exist_ok=True)
        # Millisecond-precision timestamp keeps successive parts ordered and unique
        ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3]
        suffix = []
        if partIndex is not None:
            suffix.append(f"part{partIndex}")
        if continuation is not None:
            suffix.append(f"cont_{str(continuation).lower()}")
        if modelName:
            # Keep only filesystem-safe characters from the model name
            safeModel = ''.join(c if c.isalnum() or c in ('-', '_') else '-' for c in modelName)
            suffix.append(safeModel)
        suffixStr = ('_' + '_'.join(suffix)) if suffix else ''
        fname = f"{ts}_{label}{suffixStr}.txt"
        fpath = os.path.join(outDir, fname)
        with open(fpath, 'w', encoding='utf-8') as f:
            f.write(content or '')
    except Exception:
        # Do not raise; best-effort debug write
        pass
|
||||
def _exceedsTokenLimit(self, text: str, model: ModelCapabilities, safety_margin: float) -> bool:
|
||||
"""
|
||||
Check if text exceeds model token limit with safety margin.
|
||||
"""
|
||||
# Simple character-based estimation (4 chars per token)
|
||||
estimated_tokens = len(text) // 4
|
||||
max_tokens = int(model.maxTokens * (1 - safety_margin))
|
||||
return estimated_tokens > max_tokens
|
||||
|
||||
def _reduceText(self, text: str, reduction_factor: float) -> str:
|
||||
"""
|
||||
Reduce text size by the specified factor.
|
||||
"""
|
||||
if reduction_factor >= 1.0:
|
||||
return text
|
||||
|
||||
target_length = int(len(text) * reduction_factor)
|
||||
return text[:target_length] + "... [reduced]"
|
||||
|
||||
def _extractTextFromContentParts(self, extracted_content) -> str:
|
||||
"""
|
||||
Extract text content from ExtractionService ContentPart objects.
|
||||
"""
|
||||
if not extracted_content or not hasattr(extracted_content, 'parts'):
|
||||
return ""
|
||||
|
||||
text_parts = []
|
||||
for part in extracted_content.parts:
|
||||
if hasattr(part, 'typeGroup') and part.typeGroup in ['text', 'table', 'structure']:
|
||||
if hasattr(part, 'data') and part.data:
|
||||
text_parts.append(part.data)
|
||||
|
||||
return "\n\n".join(text_parts)
|
||||
|
||||
def _buildPromptWithPlaceholders(self, prompt: str, placeholders: Optional[Dict[str, str]]) -> str:
|
||||
"""
|
||||
Build full prompt by replacing placeholders with their content.
|
||||
Uses the new {{KEY:placeholder}} format.
|
||||
"""
|
||||
if not placeholders:
|
||||
return prompt
|
||||
|
||||
full_prompt = prompt
|
||||
for placeholder, content in placeholders.items():
|
||||
# Replace both old format {{placeholder}} and new format {{KEY:placeholder}}
|
||||
full_prompt = full_prompt.replace(f"{{{{{placeholder}}}}}", content)
|
||||
full_prompt = full_prompt.replace(f"{{{{KEY:{placeholder}}}}}", content)
|
||||
|
||||
return full_prompt
|
||||
|
||||
def _reducePlanningPrompt(
|
||||
self,
|
||||
full_prompt: str,
|
||||
placeholders: Optional[Dict[str, str]],
|
||||
model: ModelCapabilities,
|
||||
options: AiCallOptions
|
||||
) -> str:
|
||||
"""
|
||||
Reduce planning prompt size by summarizing placeholders while preserving prompt structure.
|
||||
"""
|
||||
if not placeholders:
|
||||
return self._reduceText(full_prompt, 0.7)
|
||||
|
||||
# Reduce placeholders while preserving prompt
|
||||
reduced_placeholders = {}
|
||||
for placeholder, content in placeholders.items():
|
||||
if len(content) > 1000: # Only reduce long content
|
||||
reduction_factor = 0.7
|
||||
reduced_content = self._reduceText(content, reduction_factor)
|
||||
reduced_placeholders[placeholder] = reduced_content
|
||||
else:
|
||||
reduced_placeholders[placeholder] = content
|
||||
|
||||
return self._buildPromptWithPlaceholders(full_prompt, reduced_placeholders)
|
||||
|
||||
def _reduceTextPrompt(
|
||||
self,
|
||||
prompt: str,
|
||||
context: str,
|
||||
model: ModelCapabilities,
|
||||
options: AiCallOptions
|
||||
) -> str:
|
||||
"""
|
||||
Reduce text prompt size using typeGroup-aware chunking and merging.
|
||||
"""
|
||||
max_size = int(model.maxTokens * (1 - options.safetyMargin))
|
||||
|
||||
if options.compressPrompt:
|
||||
# Reduce both prompt and context
|
||||
target_size = max_size
|
||||
current_size = len(prompt) + len(context)
|
||||
reduction_factor = (target_size * 0.7) / current_size
|
||||
|
||||
if reduction_factor < 1.0:
|
||||
prompt = self._reduceText(prompt, reduction_factor)
|
||||
context = self._reduceText(context, reduction_factor)
|
||||
else:
|
||||
# Only reduce context, preserve prompt integrity
|
||||
max_context_size = max_size - len(prompt)
|
||||
if len(context) > max_context_size:
|
||||
reduction_factor = max_context_size / len(context)
|
||||
context = self._reduceText(context, reduction_factor)
|
||||
|
||||
return prompt + "\n\n" + context if context else prompt
|
||||
|
||||
async def _compressContent(self, content: str, targetSize: int, contentType: str) -> str:
|
||||
"""Compress content to target size."""
|
||||
if len(content.encode("utf-8")) <= targetSize:
|
||||
return content
|
||||
|
||||
try:
|
||||
compressionPrompt = f"""
|
||||
Komprimiere den folgenden {contentType} auf maximal {targetSize} Zeichen,
|
||||
behalte aber alle wichtigen Informationen bei:
|
||||
|
||||
{content}
|
||||
|
||||
Gib nur den komprimierten Inhalt zurück, ohne zusätzliche Erklärungen.
|
||||
"""
|
||||
|
||||
# Service must not call connectors directly; use simple truncation fallback here
|
||||
data = content.encode("utf-8")
|
||||
return data[:targetSize].decode("utf-8", errors="ignore") + "... [truncated]"
|
||||
except Exception as e:
|
||||
logger.warning(f"AI compression failed, using truncation: {str(e)}")
|
||||
return content[:targetSize] + "... [truncated]"
|
||||
|
||||
def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List], options: AiCallOptions) -> Dict[str, int]:
    """
    Get model capabilities for content processing, including appropriate size limits for chunking.

    Args:
        prompt: Prompt text; its UTF-8 byte size contributes to content size.
        documents: Optional documents; their fileSize attributes are summed.
        options: Call options; operationType filters candidate models.

    Returns:
        Dict of byte budgets: "maxContextBytes", "textChunkSize", "imageChunkSize".
    """
    # Estimate total content size
    prompt_size = len(prompt.encode('utf-8'))
    document_size = 0
    if documents:
        # Rough estimate of document content size
        for doc in documents:
            document_size += getattr(doc, 'fileSize', 0) or 0

    total_size = prompt_size + document_size

    # Use AiObjects to select the best model for this content size
    # We'll simulate the model selection by checking available models
    from modules.interfaces.interfaceAiObjects import aiModels

    # Find the best model for this content size and operation
    best_model = None
    best_context_length = 0

    for model_name, model_info in aiModels.items():
        context_length = model_info.get("contextLength", 0)

        # Skip models with no context length or too small for content
        if context_length == 0:
            continue

        # Check if model supports the operation type
        capabilities = model_info.get("capabilities", [])
        from modules.datamodels.datamodelAi import OperationType
        if options.operationType == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities:
            continue
        elif options.operationType == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities:
            continue
        elif options.operationType == OperationType.WEB_RESEARCH and "web_search" not in capabilities:
            continue
        elif "text_generation" not in capabilities:
            continue

        # Prefer models that can handle the content without chunking, but allow chunking if needed
        # NOTE(review): total_size is in bytes while contextLength is presumably
        # in tokens -- this comparison mixes units; confirm intended.
        if context_length >= total_size * 0.8:  # 80% of content size
            if context_length > best_context_length:
                best_model = model_info
                best_context_length = context_length
        elif best_model is None:  # Fallback to largest available model
            if context_length > best_context_length:
                best_model = model_info
                best_context_length = context_length

    # Fallback to a reasonable default if no model found
    if best_model is None:
        best_model = {
            "contextLength": 128000,  # GPT-4o default
            "llmName": "gpt-4o"
        }

    # Calculate appropriate sizes
    # Convert tokens to bytes (rough estimate: 1 token ≈ 4 characters)
    context_length_bytes = int(best_model["contextLength"] * 4)
    max_context_bytes = int(context_length_bytes * 0.9)  # 90% of context length
    text_chunk_size = int(max_context_bytes * 0.7)  # 70% of max context for text chunks
    image_chunk_size = int(max_context_bytes * 0.8)  # 80% of max context for image chunks

    logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}")
    logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes")
    logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes")

    return {
        "maxContextBytes": max_context_bytes,
        "textChunkSize": text_chunk_size,
        "imageChunkSize": image_chunk_size
    }
||||
|
||||
def _getModelsForOperation(self, operation_type: str, options: AiCallOptions) -> List[ModelCapabilities]:
    """
    Get models capable of handling the specific operation with capability filtering.

    Args:
        operation_type: Operation label; "planning" adds a "reasoning" capability
            to the fallback model.
        options: AI call options (not consulted in this implementation).

    Returns:
        An empty list when AiObjects is available (delegation to AiObjects),
        otherwise a single hardcoded default model.
    """
    # Use the actual AI objects model selection instead of hardcoded default
    # NOTE(review): SubUtilities.__init__ only assigns self.services, so unless
    # another component sets self.aiObjects later, this hasattr check is always
    # False and the fallback below is what actually runs -- confirm.
    if hasattr(self, 'aiObjects') and self.aiObjects:
        # Let AiObjects handle the model selection
        return []
    else:
        # Fallback to default model if AiObjects not available
        default_model = ModelCapabilities(
            name="default",
            maxTokens=4000,
            capabilities=["text", "reasoning"] if operation_type == "planning" else ["text"],
            costPerToken=0.001,
            processingTime=1.0,
            isAvailable=True
        )
        return [default_model]
||||
384
modules/services/serviceAi/subWebResearch.py
Normal file
384
modules/services/serviceAi/subWebResearch.py
Normal file
|
|
@ -0,0 +1,384 @@
|
|||
import logging
|
||||
from typing import Dict, Any, List, Optional, Tuple, Union
|
||||
from modules.datamodels.datamodelWeb import (
|
||||
WebResearchRequest,
|
||||
WebResearchActionResult,
|
||||
WebResearchDocumentData,
|
||||
WebResearchActionDocument,
|
||||
WebSearchResultItem,
|
||||
)
|
||||
from modules.interfaces.interfaceAiObjects import AiObjects
|
||||
from modules.shared.configuration import APP_CONFIG
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SubWebResearch:
    """Web research operations: search, crawling, and result analysis."""

    def __init__(self, services, aiObjects):
        """Keep references to the service center and the AI objects layer.

        Args:
            services: Service center instance for accessing other services
            aiObjects: Initialized AiObjects instance
        """
        self.services = services
        self.aiObjects = aiObjects
||||
async def webResearch(self, request: WebResearchRequest) -> WebResearchActionResult:
|
||||
"""Perform web research using interface functions."""
|
||||
try:
|
||||
logger.info(f"WEB RESEARCH STARTED")
|
||||
logger.info(f"User Query: {request.user_prompt}")
|
||||
logger.info(f"Max Results: {request.max_results}, Max Pages: {request.options.max_pages}")
|
||||
|
||||
# Global URL index to track all processed URLs across the entire research session
|
||||
global_processed_urls = set()
|
||||
|
||||
# Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
|
||||
logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===")
|
||||
|
||||
if request.urls:
|
||||
# Use provided URLs as initial main URLs
|
||||
websites = request.urls
|
||||
logger.info(f"Using provided URLs ({len(websites)}):")
|
||||
for i, url in enumerate(websites, 1):
|
||||
logger.info(f" {i}. {url}")
|
||||
else:
|
||||
# Use AI to determine main URLs based on user's intention
|
||||
logger.info(f"AI analyzing user intent: '{request.user_prompt}'")
|
||||
|
||||
# Use AI to generate optimized Tavily search query and search parameters
|
||||
query_optimizer_prompt = f"""You are a search query optimizer.
|
||||
|
||||
USER QUERY: {request.user_prompt}
|
||||
|
||||
Your task: Create a search query and parameters for the USER QUERY given.
|
||||
|
||||
RULES:
|
||||
1. The search query MUST be related to the user query above
|
||||
2. Extract key terms from the user query
|
||||
3. Determine appropriate country/language based on the query context
|
||||
4. Keep search query short (2-6 words)
|
||||
|
||||
Return ONLY this JSON format:
|
||||
{{
|
||||
"user_prompt": "search query based on user query above",
|
||||
"country": "Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries)",
|
||||
"language": "language_code_or_null",
|
||||
"topic": "general|news|academic_or_null",
|
||||
"time_range": "d|w|m|y_or_null",
|
||||
"selection_strategy": "single|multiple|specific_page",
|
||||
"selection_criteria": "what URLs to prioritize",
|
||||
"expected_url_patterns": ["pattern1", "pattern2"],
|
||||
"estimated_result_count": number
|
||||
}}"""
|
||||
|
||||
# Get AI response for query optimization
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions
|
||||
ai_request = AiCallRequest(
|
||||
prompt=query_optimizer_prompt,
|
||||
options=AiCallOptions()
|
||||
)
|
||||
ai_response_obj = await self.aiObjects.call(ai_request)
|
||||
ai_response = ai_response_obj.content
|
||||
logger.debug(f"AI query optimizer response: {ai_response}")
|
||||
|
||||
# Parse AI response to extract search query
|
||||
import json
|
||||
try:
|
||||
# Clean the response by removing markdown code blocks
|
||||
cleaned_response = ai_response.strip()
|
||||
if cleaned_response.startswith('```json'):
|
||||
cleaned_response = cleaned_response[7:] # Remove ```json
|
||||
if cleaned_response.endswith('```'):
|
||||
cleaned_response = cleaned_response[:-3] # Remove ```
|
||||
cleaned_response = cleaned_response.strip()
|
||||
|
||||
query_data = json.loads(cleaned_response)
|
||||
search_query = query_data.get("user_prompt", request.user_prompt)
|
||||
ai_country = query_data.get("country")
|
||||
ai_language = query_data.get("language")
|
||||
ai_topic = query_data.get("topic")
|
||||
ai_time_range = query_data.get("time_range")
|
||||
selection_strategy = query_data.get("selection_strategy", "multiple")
|
||||
selection_criteria = query_data.get("selection_criteria", "relevant URLs")
|
||||
expected_patterns = query_data.get("expected_url_patterns", [])
|
||||
estimated_count = query_data.get("estimated_result_count", request.max_results)
|
||||
|
||||
logger.info(f"AI optimized search query: '{search_query}'")
|
||||
logger.info(f"Selection strategy: {selection_strategy}")
|
||||
logger.info(f"Selection criteria: {selection_criteria}")
|
||||
logger.info(f"Expected URL patterns: {expected_patterns}")
|
||||
logger.info(f"Estimated result count: {estimated_count}")
|
||||
|
||||
except json.JSONDecodeError:
|
||||
logger.warning("Failed to parse AI response as JSON, using original query")
|
||||
search_query = request.user_prompt
|
||||
ai_country = None
|
||||
ai_language = None
|
||||
ai_topic = None
|
||||
ai_time_range = None
|
||||
selection_strategy = "multiple"
|
||||
|
||||
# Perform the web search with AI-determined parameters
|
||||
search_kwargs = {
|
||||
"query": search_query,
|
||||
"max_results": request.max_results,
|
||||
"search_depth": request.options.search_depth,
|
||||
"auto_parameters": False # Use explicit parameters
|
||||
}
|
||||
|
||||
# Add parameters only if they have valid values
|
||||
def _normalizeCountry(c: Optional[str]) -> Optional[str]:
|
||||
if not c:
|
||||
return None
|
||||
s = str(c).strip()
|
||||
if not s or s.lower() in ['null', 'none', 'undefined']:
|
||||
return None
|
||||
# Map common codes to full English names when easy to do without extra deps
|
||||
mapping = {
|
||||
'ch': 'Switzerland', 'che': 'Switzerland',
|
||||
'de': 'Germany', 'ger': 'Germany', 'deu': 'Germany',
|
||||
'at': 'Austria', 'aut': 'Austria',
|
||||
'us': 'United States', 'usa': 'United States', 'uni ted states': 'United States',
|
||||
'uk': 'United Kingdom', 'gb': 'United Kingdom', 'gbr': 'United Kingdom'
|
||||
}
|
||||
key = s.lower()
|
||||
if key in mapping:
|
||||
return mapping[key]
|
||||
# If looks like full name, capitalize first letter only (Tavily accepts English names)
|
||||
return s
|
||||
|
||||
norm_ai_country = _normalizeCountry(ai_country)
|
||||
norm_req_country = _normalizeCountry(request.options.country)
|
||||
if norm_ai_country:
|
||||
search_kwargs["country"] = norm_ai_country
|
||||
elif norm_req_country:
|
||||
search_kwargs["country"] = norm_req_country
|
||||
|
||||
if ai_language and ai_language not in ['null', '', 'none', 'undefined']:
|
||||
search_kwargs["language"] = ai_language
|
||||
elif request.options.language and request.options.language not in ['null', '', 'none', 'undefined']:
|
||||
search_kwargs["language"] = request.options.language
|
||||
|
||||
if ai_topic and ai_topic in ['general', 'news', 'academic']:
|
||||
search_kwargs["topic"] = ai_topic
|
||||
elif request.options.topic and request.options.topic in ['general', 'news', 'academic']:
|
||||
search_kwargs["topic"] = request.options.topic
|
||||
|
||||
if ai_time_range and ai_time_range in ['d', 'w', 'm', 'y']:
|
||||
search_kwargs["time_range"] = ai_time_range
|
||||
elif request.options.time_range and request.options.time_range in ['d', 'w', 'm', 'y']:
|
||||
search_kwargs["time_range"] = request.options.time_range
|
||||
|
||||
# Constrain by expected domains if provided by AI
|
||||
try:
|
||||
include_domains = []
|
||||
for p in expected_patterns or []:
|
||||
if not isinstance(p, str):
|
||||
continue
|
||||
# Extract bare domain from pattern or URL
|
||||
import re
|
||||
m = re.search(r"(?:https?://)?([^/\s]+)", p.strip())
|
||||
if m:
|
||||
domain = m.group(1).lower()
|
||||
# strip leading www.
|
||||
if domain.startswith('www.'):
|
||||
domain = domain[4:]
|
||||
include_domains.append(domain)
|
||||
# Deduplicate
|
||||
if include_domains:
|
||||
seen = set()
|
||||
uniq = []
|
||||
for d in include_domains:
|
||||
if d not in seen:
|
||||
seen.add(d)
|
||||
uniq.append(d)
|
||||
search_kwargs["include_domains"] = uniq
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Log the parameters being used
|
||||
logger.info(f"Search parameters: country={search_kwargs.get('country', 'not_set')}, language={search_kwargs.get('language', 'not_set')}, topic={search_kwargs.get('topic', 'not_set')}, time_range={search_kwargs.get('time_range', 'not_set')}, include_domains={search_kwargs.get('include_domains', [])}")
|
||||
|
||||
search_results = await self.aiObjects.search_websites(**search_kwargs)
|
||||
|
||||
logger.debug(f"Web search returned {len(search_results)} results:")
|
||||
for i, result in enumerate(search_results, 1):
|
||||
logger.debug(f" {i}. {result.url} - {result.title}")
|
||||
|
||||
# Deduplicate while preserving order
|
||||
seen = set()
|
||||
search_urls = []
|
||||
for r in search_results:
|
||||
u = str(r.url)
|
||||
if u not in seen:
|
||||
seen.add(u)
|
||||
search_urls.append(u)
|
||||
|
||||
logger.info(f"After initial deduplication: {len(search_urls)} unique URLs from {len(search_results)} search results")
|
||||
|
||||
if not search_urls:
|
||||
logger.error("No relevant websites found")
|
||||
return WebResearchActionResult(success=False, error="No relevant websites found")
|
||||
|
||||
# Now use AI to determine the main URLs based on user's intention
|
||||
logger.info(f"AI selecting main URLs from {len(search_urls)} search results based on user intent")
|
||||
|
||||
# Create a prompt for AI to identify main URLs based on user's intention
|
||||
ai_prompt = f"""
|
||||
Select the most relevant URLs from these search results:
|
||||
|
||||
{chr(10).join([f"{i+1}. {url}" for i, url in enumerate(search_urls)])}
|
||||
|
||||
Return only the URLs that are most relevant for the user's query.
|
||||
One URL per line.
|
||||
"""
|
||||
# Create AI call request
|
||||
ai_request = AiCallRequest(
|
||||
prompt=ai_prompt,
|
||||
options=AiCallOptions()
|
||||
)
|
||||
ai_response_obj = await self.aiObjects.call(ai_request)
|
||||
ai_response = ai_response_obj.content
|
||||
logger.debug(f"AI response for main URL selection: {ai_response}")
|
||||
|
||||
# Parse AI response to extract URLs
|
||||
websites = []
|
||||
for line in ai_response.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line and ('http://' in line or 'https://' in line):
|
||||
# Extract URL from the line
|
||||
for word in line.split():
|
||||
if word.startswith('http://') or word.startswith('https://'):
|
||||
websites.append(word.rstrip('.,;'))
|
||||
break
|
||||
|
||||
if not websites:
|
||||
logger.warning("AI did not identify any main URLs, using first few search results")
|
||||
websites = search_urls[:3] # Fallback to first 3 search results
|
||||
|
||||
# Deduplicate while preserving order
|
||||
seen = set()
|
||||
unique_websites = []
|
||||
for url in websites:
|
||||
if url not in seen:
|
||||
seen.add(url)
|
||||
unique_websites.append(url)
|
||||
|
||||
websites = unique_websites
|
||||
logger.info(f"After AI selection deduplication: {len(websites)} unique URLs from {len(websites)} AI-selected URLs")
|
||||
|
||||
logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
|
||||
for i, url in enumerate(websites, 1):
|
||||
logger.info(f" {i}. {url}")
|
||||
|
||||
# Step 2: Smart website selection using AI interface
|
||||
logger.info(f"=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===")
|
||||
logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'")
|
||||
|
||||
selectedWebsites, aiResponse = await self.aiObjects.selectRelevantWebsites(websites, request.user_prompt)
|
||||
|
||||
logger.debug(f"AI Response: {aiResponse}")
|
||||
logger.debug(f"AI selected {len(selectedWebsites)} most relevant URLs:")
|
||||
for i, url in enumerate(selectedWebsites, 1):
|
||||
logger.debug(f" {i}. {url}")
|
||||
|
||||
# Show which were filtered out
|
||||
filtered_out = [url for url in websites if url not in selectedWebsites]
|
||||
if filtered_out:
|
||||
logger.debug(f"Filtered out {len(filtered_out)} less relevant URLs:")
|
||||
for i, url in enumerate(filtered_out, 1):
|
||||
logger.debug(f" {i}. {url}")
|
||||
|
||||
# Step 3+4+5: Recursive crawling with configurable depth
|
||||
# Get configuration parameters
|
||||
max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2"))
|
||||
max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4"))
|
||||
crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10"))
|
||||
crawl_timeout_seconds = crawl_timeout_minutes * 60
|
||||
|
||||
# Use the configured max_depth or the request's pages_search_depth, whichever is smaller
|
||||
effective_depth = min(max_depth, request.options.pages_search_depth)
|
||||
|
||||
logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {effective_depth}) ===")
|
||||
logger.info(f"Starting recursive crawl of {len(selectedWebsites)} main websites...")
|
||||
logger.info(f"Search depth: {effective_depth} levels (max configured: {max_depth})")
|
||||
logger.info(f"Max links per domain: {max_links_per_domain}")
|
||||
logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes")
|
||||
|
||||
# Use recursive crawling with URL index to avoid duplicates
|
||||
import asyncio
|
||||
try:
|
||||
allContent = await asyncio.wait_for(
|
||||
self.aiObjects.crawlRecursively(
|
||||
urls=selectedWebsites,
|
||||
max_depth=effective_depth,
|
||||
extract_depth=request.options.extract_depth,
|
||||
max_per_domain=max_links_per_domain,
|
||||
global_processed_urls=global_processed_urls
|
||||
),
|
||||
timeout=crawl_timeout_seconds
|
||||
)
|
||||
logger.info(f"Crawling completed within timeout: {len(allContent)} pages crawled")
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(f"Crawling timed out after {crawl_timeout_minutes} minutes, using partial results")
|
||||
# crawlRecursively now handles timeouts gracefully and returns partial results
|
||||
# Try to get the partial results that were collected
|
||||
allContent = {}
|
||||
|
||||
if not allContent:
|
||||
logger.error("Could not extract content from any websites")
|
||||
return WebResearchActionResult(success=False, error="Could not extract content from any websites")
|
||||
|
||||
logger.info(f"=== WEB RESEARCH COMPLETED ===")
|
||||
logger.info(f"Successfully crawled {len(allContent)} URLs total")
|
||||
logger.info(f"Crawl depth: {effective_depth} levels")
|
||||
|
||||
# Create simple result with raw content
|
||||
sources = [WebSearchResultItem(title=url, url=url) for url in selectedWebsites]
|
||||
|
||||
# Get all additional links (all URLs except main ones)
|
||||
additional_links = [url for url in allContent.keys() if url not in selectedWebsites]
|
||||
|
||||
# Combine all content into a single result
|
||||
combinedContent = ""
|
||||
for url, content in allContent.items():
|
||||
combinedContent += f"\n\n=== {url} ===\n{content}\n"
|
||||
|
||||
documentData = WebResearchDocumentData(
|
||||
user_prompt=request.user_prompt,
|
||||
websites_analyzed=len(allContent),
|
||||
additional_links_found=len(additional_links),
|
||||
analysis_result=combinedContent, # Raw content, no analysis
|
||||
sources=sources,
|
||||
additional_links=additional_links,
|
||||
individual_content=allContent, # Individual URL -> content mapping
|
||||
debug_info={
|
||||
"crawl_depth": effective_depth,
|
||||
"max_configured_depth": max_depth,
|
||||
"max_links_per_domain": max_links_per_domain,
|
||||
"crawl_timeout_minutes": crawl_timeout_minutes,
|
||||
"total_urls_crawled": len(allContent),
|
||||
"main_urls": len(selectedWebsites),
|
||||
"additional_urls": len(additional_links)
|
||||
}
|
||||
)
|
||||
|
||||
document = WebResearchActionDocument(
|
||||
documentName=f"web_research_{request.user_prompt[:50]}.json",
|
||||
documentData=documentData,
|
||||
mimeType="application/json"
|
||||
)
|
||||
|
||||
return WebResearchActionResult(
|
||||
success=True,
|
||||
documents=[document],
|
||||
resultLabel="web_research_results"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in web research: {str(e)}")
|
||||
return WebResearchActionResult(success=False, error=str(e))
|
||||
|
|
@ -7,9 +7,29 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class BinaryExtractor(Extractor):
|
||||
"""
|
||||
Fallback extractor for unsupported file types.
|
||||
|
||||
This extractor handles any file type that doesn't match other extractors.
|
||||
It encodes the file as base64 and marks it as binary data.
|
||||
|
||||
Supported formats:
|
||||
- All file types (fallback)
|
||||
- MIME types: application/octet-stream (default)
|
||||
- File extensions: All (fallback)
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Catch-all fallback: matches every file, regardless of name or MIME type."""
    return True
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
    """Empty list: the fallback applies no extension filter and accepts all files."""
    return []
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
    """Empty list: the fallback applies no MIME filter and accepts all types."""
    return []
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
mimeType = context.get("mimeType") or "application/octet-stream"
|
||||
return [ContentPart(
|
||||
|
|
@ -6,9 +6,26 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class CsvExtractor(Extractor):
|
||||
"""
|
||||
Extractor for CSV files.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: text/csv
|
||||
- File extensions: .csv
|
||||
- Special handling: Treats as table data
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Accept files declared as text/csv or named *.csv (case-insensitive)."""
    if mimeType == "text/csv":
        return True
    return (fileName or "").lower().endswith(".csv")
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
    """File extensions this extractor handles: CSV only."""
    return [".csv"]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
    """MIME types this extractor handles: text/csv only."""
    return ["text/csv"]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
fileName = context.get("fileName")
|
||||
mimeType = context.get("mimeType") or "text/csv"
|
||||
|
|
@ -7,6 +7,16 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class DocxExtractor(Extractor):
|
||||
"""
|
||||
Extractor for Microsoft Word documents.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: application/vnd.openxmlformats-officedocument.wordprocessingml.document
|
||||
- File extensions: .docx
|
||||
- Special handling: Extracts paragraphs and tables (converts tables to CSV)
|
||||
- Dependencies: python-docx
|
||||
"""
|
||||
|
||||
def __init__(self):
    """Initialize lazy-import bookkeeping; python-docx is loaded on first use."""
    self._loaded = False    # whether _load() has been attempted
    self._haveLibs = False  # whether the third-party libs imported successfully
|
||||
|
|
@ -25,6 +35,14 @@ class DocxExtractor(Extractor):
|
|||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Accept the OOXML Word MIME type or *.docx file names."""
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    if mimeType == docx_mime:
        return True
    return (fileName or "").lower().endswith(".docx")
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
    """File extensions this extractor handles: modern Word documents only."""
    return [".docx"]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
    """MIME types this extractor handles: the OOXML Word document type."""
    return ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
self._load()
|
||||
parts: List[ContentPart] = []
|
||||
|
|
@ -7,9 +7,27 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class HtmlExtractor(Extractor):
|
||||
"""
|
||||
Extractor for HTML files.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: text/html
|
||||
- File extensions: .html, .htm
|
||||
- Special handling: Uses BeautifulSoup for parsing
|
||||
- Dependencies: beautifulsoup4
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Accept files declared as text/html or named *.html / *.htm."""
    if mimeType == "text/html":
        return True
    return (fileName or "").lower().endswith((".html", ".htm"))
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
    """File extensions this extractor handles: the two HTML variants."""
    return [".html", ".htm"]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
    """MIME types this extractor handles: text/html only."""
    return ["text/html"]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
mimeType = context.get("mimeType") or "text/html"
|
||||
text = fileBytes.decode("utf-8", errors="replace")
|
||||
|
|
@ -0,0 +1,75 @@
|
|||
from typing import Any, Dict, List
|
||||
import base64
|
||||
import logging
|
||||
|
||||
from ..subUtils import makeId
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from ..subRegistry import Extractor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ImageExtractor(Extractor):
    """
    Extractor for image files.

    Supported formats:
    - MIME types: image/jpeg, image/png, image/gif, image/webp, image/bmp, image/tiff
    - File extensions: .jpg, .jpeg, .png, .gif, .webp, .bmp, .tiff
    - Special handling: GIF files are converted to a static PNG during extraction
      (animation frames are dropped); on conversion failure the original GIF
      bytes are kept unchanged.
    - Dependencies: Pillow (only needed for the GIF -> PNG path)
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True when the file looks like an image by MIME type or extension."""
        return ((mimeType or "").startswith("image/") or
                (fileName or "").lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff")))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["image/jpeg", "image/png", "image/gif", "image/webp", "image/bmp", "image/tiff"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Return one base64-encoded image part.

        Args:
            fileBytes: Raw image data.
            context: Dict carrying at least "mimeType" and "fileName".

        Returns:
            A single-element list with the (possibly PNG-converted) image
            encoded as base64 text; metadata carries the final byte size.
        """
        mimeType = context.get("mimeType") or "image/unknown"
        fileName = context.get("fileName", "")

        # Convert GIF to PNG during extraction
        if mimeType.lower() == "image/gif":
            # BUGFIX: capture the size BEFORE fileBytes is replaced below.
            # Previously the log read len(fileBytes) after the reassignment,
            # so "original" always reported the converted size.
            original_size = len(fileBytes)
            try:
                from PIL import Image
                import io

                # Open GIF and convert to PNG
                with Image.open(io.BytesIO(fileBytes)) as img:
                    # Flatten to RGB (this also drops any animation)
                    if img.mode in ('RGBA', 'LA', 'P'):
                        img = img.convert('RGB')

                    # Save as PNG in memory
                    png_buffer = io.BytesIO()
                    img.save(png_buffer, format='PNG')
                    png_data = png_buffer.getvalue()

                # Update mimeType and fileBytes
                mimeType = "image/png"
                fileBytes = png_data

                logger.info(f"GIF converted to PNG during extraction: {fileName}, original={original_size} bytes, converted={len(png_data)} bytes")

            except Exception as e:
                # Best-effort conversion: keep the original GIF data on failure.
                logger.warning(f"GIF conversion failed during extraction for {fileName}: {str(e)}, using original")

        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="image",
            typeGroup="image",
            mimeType=mimeType,
            data=base64.b64encode(fileBytes).decode("utf-8"),
            metadata={"size": len(fileBytes)}
        )]
|
||||
|
||||
|
||||
|
|
@ -7,9 +7,26 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class JsonExtractor(Extractor):
|
||||
"""
|
||||
Extractor for JSON files.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: application/json
|
||||
- File extensions: .json
|
||||
- Special handling: Validates JSON format, falls back to text if invalid
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Accept files declared as application/json or named *.json."""
    if mimeType == "application/json":
        return True
    return (fileName or "").lower().endswith(".json")
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
    """File extensions this extractor handles: JSON only."""
    return [".json"]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
    """MIME types this extractor handles: application/json only."""
    return ["application/json"]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
mimeType = context.get("mimeType") or "application/json"
|
||||
text = fileBytes.decode("utf-8", errors="replace")
|
||||
|
|
@ -8,6 +8,16 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class PdfExtractor(Extractor):
|
||||
"""
|
||||
Extractor for PDF files.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: application/pdf
|
||||
- File extensions: .pdf
|
||||
- Special handling: Extracts text per page and embedded images
|
||||
- Dependencies: PyPDF2, PyMuPDF (fitz)
|
||||
"""
|
||||
|
||||
def __init__(self):
    """Initialize lazy-import bookkeeping; PDF libs are loaded on first use."""
    self._loaded = False    # whether _load() has been attempted
    self._haveLibs = False  # whether the third-party libs imported successfully
|
||||
|
|
@ -27,6 +37,14 @@ class PdfExtractor(Extractor):
|
|||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Accept files declared as application/pdf or named *.pdf."""
    if mimeType == "application/pdf":
        return True
    return (fileName or "").lower().endswith(".pdf")
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
    """File extensions this extractor handles: PDF only."""
    return [".pdf"]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
    """MIME types this extractor handles: application/pdf only."""
    return ["application/pdf"]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
self._load()
|
||||
parts: List[ContentPart] = []
|
||||
225
modules/services/serviceExtraction/extractors/extractorPptx.py
Normal file
225
modules/services/serviceExtraction/extractors/extractorPptx.py
Normal file
|
|
@ -0,0 +1,225 @@
|
|||
import logging
|
||||
import base64
|
||||
from typing import List, Dict, Any, Optional
|
||||
from modules.datamodels.datamodelExtraction import ContentPart, ContentExtracted
|
||||
from ..subRegistry import Extractor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PptxExtractor(Extractor):
    """
    Extractor for PowerPoint files.

    Supported formats:
    - MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint
    - File extensions: .pptx, .ppt
    - Special handling: Extracts slide content, tables, and images
    - Dependencies: python-pptx

    NOTE(review): python-pptx can only open OOXML (.pptx) files; a legacy
    binary .ppt will raise inside extract() and surface as the error part —
    confirm whether .ppt support is actually intended here.
    """

    def __init__(self):
        """Initialize lazy-import bookkeeping; python-pptx is loaded on first use."""
        self._loaded = False    # whether _load() has been attempted
        self._haveLibs = False  # whether python-pptx imported successfully

    def _load(self):
        """Import python-pptx once and record availability in self._haveLibs."""
        if self._loaded:
            return
        self._loaded = True
        try:
            global Presentation
            from pptx import Presentation
            self._haveLibs = True
        except Exception:
            self._haveLibs = False

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Accept PowerPoint MIME types or *.pptx / *.ppt file names."""
        return (mimeType in [
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.ms-powerpoint"
        ]) or (fileName or "").lower().endswith((".pptx", ".ppt"))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".pptx", ".ppt"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return [
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.ms-powerpoint"
        ]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """
        Extract content from PowerPoint files.

        Args:
            fileBytes: Raw file data as bytes
            context: Context dictionary with file information

        Returns:
            List of ContentPart objects: a presentation overview first, then
            per-slide image parts and per-slide text/table parts. Errors
            (missing library, parse failure) are reported as a single
            text part with metadata["error"] = True rather than raised.
        """
        self._load()

        if not self._haveLibs:
            logger.error("python-pptx library not installed. Install with: pip install python-pptx")
            return [ContentPart(
                id="error",
                label="PowerPoint Extraction Error",
                typeGroup="text",
                mimeType="text/plain",
                data="Error: python-pptx library not installed",
                metadata={"error": True, "error_message": "python-pptx library not installed"}
            )]

        try:
            import io

            # Load presentation from bytes
            presentation = Presentation(io.BytesIO(fileBytes))

            parts = []

            # Extract content from each slide (1-based numbering for labels)
            for slide_index, slide in enumerate(presentation.slides, start=1):
                slide_content = []

                # Pass 1: plain text from shapes
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        slide_content.append(shape.text.strip())

                # Pass 2: table data (appended after all text, preserving order)
                for shape in slide.shapes:
                    if shape.has_table:
                        table = shape.table
                        table_data = []
                        for row in table.rows:
                            row_data = []
                            for cell in row.cells:
                                row_data.append(cell.text.strip())
                            table_data.append(row_data)

                        if table_data:
                            # Convert table to markdown format
                            table_md = self._table_to_markdown(table_data)
                            slide_content.append(table_md)

                # Pass 3: embedded pictures, emitted as separate image parts
                for shape in slide.shapes:
                    if shape.shape_type == 13:  # MSO_SHAPE_TYPE.PICTURE
                        try:
                            image = shape.image
                            image_bytes = image.blob
                            image_b64 = base64.b64encode(image_bytes).decode('utf-8')

                            # BUGFIX: use the MIME type reported by python-pptx
                            # (e.g. image/jpeg) instead of always claiming
                            # image/png, which mislabeled non-PNG images.
                            image_mime = getattr(image, "content_type", None) or "image/png"

                            image_part = ContentPart(
                                id=f"slide_{slide_index}_image_{len(parts)}",
                                label=f"Slide {slide_index} Image",
                                typeGroup="image",
                                mimeType=image_mime,
                                data=image_b64,
                                metadata={
                                    "slide_number": slide_index,
                                    "shape_type": "image",
                                    "extracted_from": "powerpoint"
                                }
                            )
                            parts.append(image_part)
                        except Exception as e:
                            # Best-effort: a bad picture shape should not abort the slide
                            logger.warning(f"Failed to extract image from slide {slide_index}: {str(e)}")

                # Create slide content part (text + tables), after the slide's images
                if slide_content:
                    slide_text = f"# Slide {slide_index}\n\n" + "\n\n".join(slide_content)

                    slide_part = ContentPart(
                        id=f"slide_{slide_index}",
                        label=f"Slide {slide_index} Content",
                        typeGroup="structure",
                        mimeType="text/plain",
                        data=slide_text,
                        metadata={
                            "slide_number": slide_index,
                            "content_type": "slide",
                            "extracted_from": "powerpoint",
                            "text_length": len(slide_text)
                        }
                    )
                    parts.append(slide_part)

            # Create presentation overview
            file_name = context.get("fileName", "presentation.pptx")
            overview_text = f"# PowerPoint Presentation: {file_name}\n\n"
            overview_text += f"**Total Slides:** {len(presentation.slides)}\n\n"
            overview_text += f"**Content Parts:** {len(parts)}\n\n"

            # Add slide summaries (first few text elements per slide)
            for i, slide in enumerate(presentation.slides, 1):
                slide_text_parts = []
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        slide_text_parts.append(shape.text.strip())

                if slide_text_parts:
                    overview_text += f"## Slide {i}\n"
                    overview_text += "\n".join(slide_text_parts[:3])  # First 3 text elements
                    overview_text += "\n\n"

            # Create overview part
            overview_part = ContentPart(
                id="presentation_overview",
                label="Presentation Overview",
                typeGroup="text",
                mimeType="text/plain",
                data=overview_text,
                metadata={
                    "content_type": "overview",
                    "extracted_from": "powerpoint",
                    "total_slides": len(presentation.slides),
                    "text_length": len(overview_text)
                }
            )
            parts.insert(0, overview_part)  # Overview always comes first

            return parts

        except Exception as e:
            logger.error(f"Error extracting PowerPoint content: {str(e)}")
            return [ContentPart(
                id="error",
                label="PowerPoint Extraction Error",
                typeGroup="text",
                mimeType="text/plain",
                data=f"Error extracting PowerPoint content: {str(e)}",
                metadata={"error": True, "error_message": str(e)}
            )]

    def _table_to_markdown(self, table_data: List[List[str]]) -> str:
        """Convert table data (first row = header) to a markdown table string."""
        if not table_data:
            return ""

        markdown_lines = []

        # Header row
        if table_data:
            header = "| " + " | ".join(table_data[0]) + " |"
            markdown_lines.append(header)

            # Separator row
            separator = "| " + " | ".join(["---"] * len(table_data[0])) + " |"
            markdown_lines.append(separator)

            # Data rows
            for row in table_data[1:]:
                data_row = "| " + " | ".join(row) + " |"
                markdown_lines.append(data_row)

        return "\n".join(markdown_lines)
|
||||
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
from typing import Any, Dict, List
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from ..subUtils import makeId
|
||||
from ..subRegistry import Extractor
|
||||
|
||||
|
||||
class SqlExtractor(Extractor):
    """
    Extractor for SQL files.

    Supported formats:
    - MIME types: text/x-sql, application/sql
    - File extensions: .sql, .ddl, .dml, .dcl, .tcl
    - Special handling: Treats as structured text with SQL syntax
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Accept SQL MIME types or any SQL-family file extension."""
        return (mimeType in ("text/x-sql", "application/sql") or
                (fileName or "").lower().endswith((".sql", ".ddl", ".dml", ".dcl", ".tcl")))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".sql", ".ddl", ".dml", ".dcl", ".tcl"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["text/x-sql", "application/sql"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Decode as UTF-8 and return one structured part with SQL keyword metadata.

        The has_* flags are plain substring checks on the uppercased text, so
        keywords appearing inside comments or string literals also register.
        """
        mimeType = context.get("mimeType") or "text/x-sql"
        data = fileBytes.decode("utf-8", errors="replace")

        # Uppercase once; previously data.upper() was recomputed for each of
        # the six keyword checks below.
        upper = data.upper()

        # Add SQL-specific metadata
        metadata = {
            "size": len(fileBytes),
            "file_type": "sql",
            "line_count": len(data.splitlines()),
            "has_select": "SELECT" in upper,
            "has_insert": "INSERT" in upper,
            "has_update": "UPDATE" in upper,
            "has_delete": "DELETE" in upper,
            "has_create": "CREATE" in upper,
            "has_drop": "DROP" in upper
        }

        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="structure",
            mimeType=mimeType,
            data=data,
            metadata=metadata
        )]
|
||||
103
modules/services/serviceExtraction/extractors/extractorText.py
Normal file
103
modules/services/serviceExtraction/extractors/extractorText.py
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
from typing import Any, Dict, List
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from ..subUtils import makeId
|
||||
from ..subRegistry import Extractor
|
||||
|
||||
|
||||
class TextExtractor(Extractor):
|
||||
"""
|
||||
Extractor for plain text files and code files.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: text/plain, text/markdown, text/x-python, text/x-java-source, text/javascript, etc.
|
||||
- File extensions: .txt, .md, .log, .java, .js, .jsx, .ts, .tsx, .py, .config, .ini, .cfg, .conf, .properties, .yaml, .yml, .toml, .sh, .bat, .ps1, .sql, .css, .scss, .sass, .less, .xml, .json, .csv, .tsv, .rtf, .tex, .rst, .adoc, .org, .pod, .man, .1, .2, .3, .4, .5, .6, .7, .8, .9, .n, .l, .m, .r, .t, .x, .y, .z
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
|
||||
# Check MIME types
|
||||
if mimeType and mimeType.startswith("text/"):
|
||||
return True
|
||||
|
||||
# Check file extensions
|
||||
if fileName:
|
||||
ext = fileName.lower()
|
||||
return ext.endswith((
|
||||
# Basic text files
|
||||
".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
|
||||
# Programming languages
|
||||
".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
|
||||
".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
|
||||
# Web technologies
|
||||
".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
|
||||
# Configuration files
|
||||
".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
|
||||
# Scripts and automation
|
||||
".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
|
||||
# Data files
|
||||
".csv", ".tsv", ".tab", ".dat", ".data",
|
||||
# Documentation
|
||||
".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
|
||||
# Other text formats
|
||||
".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
|
||||
".env", ".env.local", ".env.development", ".env.production", ".env.test",
|
||||
".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
|
||||
))
|
||||
|
||||
return False
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
|
||||
"""Return list of supported file extensions."""
|
||||
return [
|
||||
# Basic text files
|
||||
".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
|
||||
# Programming languages
|
||||
".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
|
||||
".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
|
||||
# Web technologies
|
||||
".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
|
||||
# Configuration files
|
||||
".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
|
||||
# Scripts and automation
|
||||
".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
|
||||
# Data files
|
||||
".csv", ".tsv", ".tab", ".dat", ".data",
|
||||
# Documentation
|
||||
".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
|
||||
# Other text formats
|
||||
".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
|
||||
".env", ".env.local", ".env.development", ".env.production", ".env.test",
|
||||
".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
|
||||
]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
|
||||
"""Return list of supported MIME types."""
|
||||
return [
|
||||
"text/plain", "text/markdown", "text/x-python", "text/x-java-source",
|
||||
"text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript",
|
||||
"text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby",
|
||||
"text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin",
|
||||
"text/x-sql", "text/x-sh", "text/x-shellscript", "text/x-yaml", "text/x-toml",
|
||||
"text/x-ini", "text/x-config", "text/x-properties", "text/x-log",
|
||||
"text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less",
|
||||
"text/xml", "text/csv", "text/tab-separated-values", "text/rtf",
|
||||
"text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org",
|
||||
"application/x-yaml", "application/x-toml", "application/x-ini",
|
||||
"application/x-config", "application/x-properties", "application/x-log"
|
||||
]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
    """Decode the raw bytes as UTF-8 text and wrap them in a single ContentPart.

    Args:
        fileBytes: Raw file content.
        context: Extraction context; only "mimeType" is read here (defaults
            to "text/plain" when absent or falsy).

    Returns:
        A one-element list holding the decoded text part. Undecodable byte
        sequences are substituted via errors="replace" so extraction never
        raises on a bad encoding.
    """
    # The previous version also read context["fileName"] into an unused local.
    mimeType = context.get("mimeType") or "text/plain"
    data = fileBytes.decode("utf-8", errors="replace")
    return [ContentPart(
        id=makeId(),
        parentId=None,
        label="main",
        typeGroup="text",
        mimeType=mimeType,
        data=data,
        metadata={"size": len(fileBytes)},  # raw byte length, not decoded char count
    )]
|
||||
|
||||
|
||||
|
|
@ -8,6 +8,16 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class XlsxExtractor(Extractor):
|
||||
"""
|
||||
Extractor for Microsoft Excel spreadsheets.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
|
||||
- File extensions: .xlsx, .xlsm
|
||||
- Special handling: Extracts all sheets as CSV data
|
||||
- Dependencies: openpyxl
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._loaded = False
|
||||
self._haveLibs = False
|
||||
|
|
@ -27,6 +37,14 @@ class XlsxExtractor(Extractor):
|
|||
mt = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
return mimeType == mt or (fileName or "").lower().endswith((".xlsx", ".xlsm"))
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
    """File extensions handled by this extractor."""
    # .xlsm is the macro-enabled variant of .xlsx; both use the OOXML container.
    supported = (".xlsx", ".xlsm")
    return list(supported)
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
    """MIME types handled by this extractor."""
    # Canonical OOXML spreadsheet MIME type.
    sheetMime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    return [sheetMime]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
self._load()
|
||||
parts: List[ContentPart] = []
|
||||
|
|
@ -7,9 +7,26 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class XmlExtractor(Extractor):
|
||||
"""
|
||||
Extractor for XML files.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: application/xml
|
||||
- File extensions: .xml, .rss, .atom
|
||||
- Special handling: Uses ElementTree for parsing
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True when the file looks like XML by MIME type or by extension."""
    if mimeType == "application/xml":
        return True
    # Fall back to the file name; .rss/.atom feeds are XML dialects.
    loweredName = (fileName or "").lower()
    return loweredName.endswith((".xml", ".rss", ".atom"))
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
    """File extensions handled by this extractor (XML plus feed dialects)."""
    xmlFamily = (".xml", ".rss", ".atom")
    return list(xmlFamily)
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
    """MIME types handled by this extractor."""
    xmlMime = "application/xml"
    return [xmlMime]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
mimeType = context.get("mimeType") or "application/xml"
|
||||
text = fileBytes.decode("utf-8", errors="replace")
|
||||
|
|
@ -1,25 +0,0 @@
|
|||
from typing import Any, Dict, List
|
||||
import base64
|
||||
|
||||
from ..subUtils import makeId
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from ..subRegistry import Extractor
|
||||
|
||||
|
||||
class ImageExtractor(Extractor):
    """Extractor for raster images: emits one base64-encoded image part."""

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Accept any file whose MIME type belongs to the image/ family."""
        return (mimeType or "").startswith("image/")

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Encode the raw bytes as base64 text inside a single image part."""
        resolvedMime = context.get("mimeType") or "image/unknown"
        encodedPayload = base64.b64encode(fileBytes).decode("utf-8")
        part = ContentPart(
            id=makeId(),
            parentId=None,
            label="image",
            typeGroup="image",
            mimeType=resolvedMime,
            data=encodedPayload,
            metadata={"size": len(fileBytes)},  # size of the raw bytes, not the base64 text
        )
        return [part]
|
||||
|
||||
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
from typing import Any, Dict, List
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from ..subUtils import makeId
|
||||
from ..subRegistry import Extractor
|
||||
|
||||
|
||||
class TextExtractor(Extractor):
    """Extractor for plain-text and Markdown payloads."""

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Match exactly the two plain-text MIME types this extractor supports."""
        return mimeType in ("text/plain", "text/markdown")

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Decode the bytes as UTF-8 (replacing bad sequences) into one text part."""
        resolvedMime = context.get("mimeType") or "text/plain"
        decodedText = fileBytes.decode("utf-8", errors="replace")
        return [
            ContentPart(
                id=makeId(),
                parentId=None,
                label="main",
                typeGroup="text",
                mimeType=resolvedMime,
                data=decodedText,
                metadata={"size": len(fileBytes)},  # raw byte count
            )
        ]
|
||||
|
||||
|
||||
|
|
@ -67,10 +67,12 @@ class ExtractionService:
|
|||
if part.metadata:
|
||||
logger.debug(f" Metadata: {part.metadata}")
|
||||
|
||||
# Attach document id to parts if missing
|
||||
# Attach document id and MIME type to parts if missing
|
||||
for p in ec.parts:
|
||||
if "documentId" not in p.metadata:
|
||||
p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4())
|
||||
if "documentMimeType" not in p.metadata:
|
||||
p.metadata["documentMimeType"] = documentData["mimeType"]
|
||||
|
||||
# Log chunking information
|
||||
chunked_parts = [p for p in ec.parts if p.metadata.get("chunk", False)]
|
||||
|
|
|
|||
209
modules/services/serviceExtraction/subMerger.py
Normal file
209
modules/services/serviceExtraction/subMerger.py
Normal file
|
|
@ -0,0 +1,209 @@
|
|||
"""
|
||||
Intelligent Token-Aware Merger for optimizing AI calls based on LLM token limits.
|
||||
"""
|
||||
from typing import List, Dict, Any, Tuple
|
||||
import logging
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from .subUtils import makeId
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class IntelligentTokenAwareMerger:
    """
    Intelligent merger that groups chunks based on LLM token limits to minimize AI calls.

    Strategy:
    1. Estimate a token count for each chunk (character-based heuristic)
    2. Group chunks to maximize token usage without exceeding limits
    3. Preserve semantic coherence by grouping per (documentId, typeGroup)
    4. Minimize the total number of AI calls
    """

    def __init__(self, model_capabilities: Dict[str, Any]):
        """Configure limits from a model-capabilities dict.

        Recognized keys (all optional):
            maxTokens: hard context limit (default 4000)
            safetyMargin: fraction of maxTokens kept as head-room (default 0.1)
            charsPerToken: rough chars-per-token ratio for estimation (default 4)
        """
        self.max_tokens = model_capabilities.get("maxTokens", 4000)
        self.safety_margin = model_capabilities.get("safetyMargin", 0.1)
        self.effective_max_tokens = int(self.max_tokens * (1 - self.safety_margin))
        # Guard: a zero (or sub-1) ratio would make _estimate_tokens divide by
        # zero or blow up the estimates; clamp to at least 1 char per token.
        self.chars_per_token = max(1, model_capabilities.get("charsPerToken", 4))

    def merge_chunks_intelligently(self, chunks: List[ContentPart], prompt: str = "") -> List[ContentPart]:
        """
        Merge chunks intelligently based on token limits.

        Args:
            chunks: List of ContentPart chunks to merge
            prompt: AI prompt to account for in the token budget

        Returns:
            List of optimally merged ContentPart objects
        """
        if not chunks:
            return chunks

        logger.info("🧠 Intelligent merging: %d chunks, max_tokens=%d",
                    len(chunks), self.effective_max_tokens)

        prompt_tokens = self._estimate_tokens(prompt)
        # Clamp at zero: an oversized prompt must not produce a negative
        # budget (the packing loop then sends every chunk through alone,
        # exactly as an unclamped negative value would).
        available_tokens = max(0, self.effective_max_tokens - prompt_tokens)

        logger.info("📊 Prompt tokens: %d, Available for content: %d",
                    prompt_tokens, available_tokens)

        # Group by document and type so merged parts stay semantically coherent.
        grouped_chunks = self._group_chunks_by_document_and_type(chunks)

        merged_parts: List[ContentPart] = []
        for group_key, group_chunks in grouped_chunks.items():
            logger.info("📁 Processing group: %s (%d chunks)", group_key, len(group_chunks))
            merged_parts.extend(self._merge_group_optimally(group_chunks, available_tokens))

        logger.info("✅ Intelligent merging complete: %d → %d parts",
                    len(chunks), len(merged_parts))
        return merged_parts

    def _group_chunks_by_document_and_type(self, chunks: List[ContentPart]) -> Dict[str, List[ContentPart]]:
        """Group chunks by (documentId, typeGroup) for semantic coherence."""
        groups: Dict[str, List[ContentPart]] = {}
        for chunk in chunks:
            doc_id = chunk.metadata.get("documentId", "unknown")
            group_key = f"{doc_id}_{chunk.typeGroup}"
            groups.setdefault(group_key, []).append(chunk)
        return groups

    def _merge_group_optimally(self, chunks: List[ContentPart], available_tokens: int) -> List[ContentPart]:
        """Pack a group's chunks into as few merged parts as possible.

        NOTE(review): chunks are packed smallest-first, so merged output may
        not preserve the original document order — confirm downstream
        consumers do not rely on ordering.
        """
        if not chunks:
            return []

        # Smallest-first gives a better first-fit packing.
        sorted_chunks = sorted(chunks, key=lambda c: self._estimate_tokens(c.data))

        merged_parts: List[ContentPart] = []
        current_group: List[ContentPart] = []
        current_tokens = 0

        for chunk in sorted_chunks:
            chunk_tokens = self._estimate_tokens(chunk.data)

            # A chunk already at >= 90% of the budget is processed alone.
            if chunk_tokens >= available_tokens * 0.9:
                if current_group:
                    merged_parts.append(self._create_merged_part(current_group, current_tokens))
                    current_group = []
                    current_tokens = 0
                merged_parts.append(chunk)
                logger.debug("🔍 Large chunk processed individually: %d tokens", chunk_tokens)
                continue

            # Close the current group when this chunk would overflow it.
            if current_tokens + chunk_tokens > available_tokens and current_group:
                merged_parts.append(self._create_merged_part(current_group, current_tokens))
                current_group = [chunk]
                current_tokens = chunk_tokens
            else:
                current_group.append(chunk)
                current_tokens += chunk_tokens

        # Finalize whatever is left in the open group.
        if current_group:
            merged_parts.append(self._create_merged_part(current_group, current_tokens))

        logger.info("📦 Group merged: %d → %d parts", len(chunks), len(merged_parts))
        return merged_parts

    def _create_merged_part(self, chunks: List[ContentPart], total_tokens: int) -> ContentPart:
        """Fold several chunks into one ContentPart; a single chunk passes through untouched."""
        if len(chunks) == 1:
            return chunks[0]

        combined_data = self._combine_chunk_data(chunks)

        # Inherit identity/typing from the first chunk; record merge provenance.
        base_chunk = chunks[0]
        merged_metadata = base_chunk.metadata.copy()
        merged_metadata.update({
            "merged": True,
            "originalChunkCount": len(chunks),
            "totalTokens": total_tokens,
            "originalChunkIds": [c.id for c in chunks],
            "size": len(combined_data.encode('utf-8')),
        })

        merged_part = ContentPart(
            id=makeId(),
            parentId=base_chunk.parentId,
            label=f"merged_{len(chunks)}_chunks",
            typeGroup=base_chunk.typeGroup,
            mimeType=base_chunk.mimeType,
            data=combined_data,
            metadata=merged_metadata,
        )

        logger.debug("🔗 Created merged part: %d chunks, %d tokens", len(chunks), total_tokens)
        return merged_part

    def _combine_chunk_data(self, chunks: List[ContentPart]) -> str:
        """Join chunk payloads with a separator suited to the content type."""
        if not chunks:
            return ""

        if chunks[0].typeGroup == "table":
            separator = "\n\n[TABLE BREAK]\n\n"
        else:
            # Text and every other type use a horizontal-rule style divider.
            separator = "\n\n---\n\n"

        return separator.join(chunk.data for chunk in chunks)

    def _estimate_tokens(self, text: str) -> int:
        """Rough token estimate: character count divided by chars_per_token."""
        if not text:
            return 0
        return len(text) // self.chars_per_token

    def calculate_optimization_stats(self, original_chunks: List[ContentPart], merged_parts: List[ContentPart]) -> Dict[str, Any]:
        """Summarize how many AI calls the merge saved, plus a packing analysis."""
        original_calls = len(original_chunks)
        optimized_calls = len(merged_parts)
        reduction_percent = ((original_calls - optimized_calls) / original_calls * 100) if original_calls > 0 else 0

        # Chunks at >= 90% of the budget cannot be packed with anything else.
        large_chunks = [c for c in original_chunks if self._estimate_tokens(c.data) >= self.effective_max_tokens * 0.9]
        small_chunks = [c for c in original_chunks if self._estimate_tokens(c.data) < self.effective_max_tokens * 0.9]

        # Theoretical floor assumes ~3 small chunks could share each call.
        theoretical_min_calls = len(large_chunks) + max(1, len(small_chunks) // 3)
        theoretical_reduction = ((original_calls - theoretical_min_calls) / original_calls * 100) if original_calls > 0 else 0

        return {
            "original_ai_calls": original_calls,
            "optimized_ai_calls": optimized_calls,
            "reduction_percent": round(reduction_percent, 1),
            "cost_savings": f"{reduction_percent:.1f}%",
            "efficiency_gain": f"{original_calls / optimized_calls:.1f}x" if optimized_calls > 0 else "∞",
            "analysis": {
                "large_chunks": len(large_chunks),
                "small_chunks": len(small_chunks),
                "theoretical_min_calls": theoretical_min_calls,
                "theoretical_reduction": round(theoretical_reduction, 1),
                "optimization_potential": "high" if reduction_percent > 50 else "moderate" if reduction_percent > 20 else "low",
            },
        }
|
||||
|
|
@ -3,11 +3,13 @@ import logging
|
|||
import os
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
|
||||
from modules.shared.configuration import APP_CONFIG
|
||||
from .subUtils import makeId
|
||||
from .subRegistry import ExtractorRegistry, ChunkerRegistry
|
||||
from .merging.text_merger import TextMerger
|
||||
from .merging.table_merger import TableMerger
|
||||
from .merging.default_merger import DefaultMerger
|
||||
from .merging.mergerText import TextMerger
|
||||
from .merging.mergerTable import TableMerger
|
||||
from .merging.mergerDefault import DefaultMerger
|
||||
from .subMerger import IntelligentTokenAwareMerger
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -84,16 +86,25 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
|
|||
chunk_parts = [p for p in parts if p.metadata.get("chunk", False)]
|
||||
|
||||
logger.debug(f"runExtraction: Preserving {len(chunk_parts)} chunks from merging")
|
||||
logger.debug(f"runExtraction - non_chunk_parts: {len(non_chunk_parts)}, chunk_parts: {len(chunk_parts)}")
|
||||
|
||||
# Apply intelligent merging for small text parts
|
||||
if non_chunk_parts:
|
||||
# Count text parts
|
||||
text_parts = [p for p in non_chunk_parts if p.typeGroup == "text"]
|
||||
if len(text_parts) > 5: # If we have many small text parts, merge them
|
||||
logger.info(f"🔧 Merging {len(text_parts)} small text parts for efficiency")
|
||||
non_chunk_parts = _mergeParts(non_chunk_parts, mergeStrategy)
|
||||
|
||||
# Combine non-chunk parts with chunk parts (chunks stay separate)
|
||||
parts = non_chunk_parts + chunk_parts
|
||||
|
||||
logger.debug(f"runExtraction: Final parts after merging: {len(parts)} (chunks: {len(chunk_parts)})")
|
||||
# DEBUG: dump parts and chunks to files TODO TO REMOVE
|
||||
logger.debug(f"runExtraction - Final parts: {len(parts)} (chunks: {len(chunk_parts)})")
|
||||
# DEBUG: dump parts and chunks to files - only if debug enabled
|
||||
try:
|
||||
debug_enabled = APP_CONFIG.get("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
||||
if debug_enabled:
|
||||
base_dir = "./test-chat/ai"
|
||||
os.makedirs(base_dir, exist_ok=True)
|
||||
|
||||
|
|
@ -146,13 +157,22 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
|
|||
kept: List[ContentPart] = []
|
||||
remaining: List[ContentPart] = []
|
||||
|
||||
for p in parts:
|
||||
logger.debug(f"Starting poolAndLimit with {len(parts)} parts, maxSize={maxSize}")
|
||||
|
||||
for i, p in enumerate(parts):
|
||||
size = int(p.metadata.get("size", 0) or 0)
|
||||
# Show first 50 characters of text content for debugging
|
||||
content_preview = p.data[:50].replace('\n', '\\n') if p.data else ""
|
||||
logger.debug(f"Part {i}: {p.typeGroup} - {size} bytes - '{content_preview}...' (current: {current})")
|
||||
if current + size <= maxSize:
|
||||
kept.append(p)
|
||||
current += size
|
||||
logger.debug(f"Part {i} kept (total: {current})")
|
||||
else:
|
||||
remaining.append(p)
|
||||
logger.debug(f"Part {i} moved to remaining")
|
||||
|
||||
logger.debug(f"Kept: {len(kept)}, Remaining: {len(remaining)}")
|
||||
|
||||
# If we have remaining parts and chunking is allowed, try chunking
|
||||
if remaining and chunkAllowed:
|
||||
|
|
@ -160,12 +180,15 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
|
|||
logger.debug(f"Remaining parts to chunk: {len(remaining)}")
|
||||
logger.debug(f"Max size limit: {maxSize} bytes")
|
||||
logger.debug(f"Current size used: {current} bytes")
|
||||
logger.debug(f"Chunking {len(remaining)} remaining parts")
|
||||
|
||||
for p in remaining:
|
||||
if p.typeGroup in ("text", "table", "structure", "image"):
|
||||
if p.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
|
||||
logger.debug(f"Chunking {p.typeGroup} part: {len(p.data)} chars")
|
||||
logger.debug(f"Chunking {p.typeGroup} part with {len(p.data)} chars")
|
||||
chunks = chunkerRegistry.resolve(p.typeGroup).chunk(p, options)
|
||||
logger.debug(f"Created {len(chunks)} chunks")
|
||||
logger.debug(f"Created {len(chunks)} chunks")
|
||||
|
||||
chunks_added = 0
|
||||
for ch in chunks:
|
||||
|
|
@ -197,12 +220,18 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
|
|||
|
||||
logger.debug(f"Preserving {len(chunk_parts)} chunks from merging")
|
||||
|
||||
# Apply intelligent merging for small text parts
|
||||
if non_chunk_parts:
|
||||
# Count text parts
|
||||
text_parts = [p for p in non_chunk_parts if p.typeGroup == "text"]
|
||||
if len(text_parts) > 5: # If we have many small text parts, merge them
|
||||
logger.info(f"🔧 Merging {len(text_parts)} small text parts for efficiency")
|
||||
non_chunk_parts = _applyMerging(non_chunk_parts, mergeStrategy)
|
||||
|
||||
# Combine non-chunk parts with chunk parts (chunks stay separate)
|
||||
kept = non_chunk_parts + chunk_parts
|
||||
|
||||
logger.debug(f"Final parts after merging: {len(kept)} (chunks: {len(chunk_parts)})")
|
||||
logger.debug(f"Final parts after merging: {len(kept)} (chunks: {len(chunk_parts)})")
|
||||
|
||||
# Re-check size after merging
|
||||
|
|
@ -211,11 +240,30 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
|
|||
# Apply size limit to merged parts
|
||||
kept = _applySizeLimit(kept, maxSize)
|
||||
|
||||
logger.debug(f"poolAndLimit returning {len(kept)} parts")
|
||||
return kept
|
||||
|
||||
|
||||
def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
|
||||
"""Apply merging strategy to parts."""
|
||||
"""Apply merging strategy to parts with intelligent token-aware merging."""
|
||||
logger.debug(f"_applyMerging called with {len(parts)} parts")
|
||||
|
||||
# Check if intelligent merging is enabled
|
||||
if strategy.get("useIntelligentMerging", False):
|
||||
model_capabilities = strategy.get("modelCapabilities", {})
|
||||
subMerger = IntelligentTokenAwareMerger(model_capabilities)
|
||||
|
||||
# Use intelligent merging for all parts
|
||||
merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
|
||||
|
||||
# Calculate and log optimization stats
|
||||
stats = subMerger.calculate_optimization_stats(parts, merged)
|
||||
logger.info(f"🧠 Intelligent merging stats: {stats}")
|
||||
logger.debug(f"Intelligent merging: {stats['original_ai_calls']} → {stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")
|
||||
|
||||
return merged
|
||||
|
||||
# Fallback to traditional merging
|
||||
textMerger = TextMerger()
|
||||
tableMerger = TableMerger()
|
||||
defaultMerger = DefaultMerger()
|
||||
|
|
@ -226,18 +274,29 @@ def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[Co
|
|||
structureParts = [p for p in parts if p.typeGroup == "structure"]
|
||||
otherParts = [p for p in parts if p.typeGroup not in ("text", "table", "structure")]
|
||||
|
||||
logger.debug(f"Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}")
|
||||
|
||||
merged: List[ContentPart] = []
|
||||
|
||||
if textParts:
|
||||
merged.extend(textMerger.merge(textParts, strategy))
|
||||
textMerged = textMerger.merge(textParts, strategy)
|
||||
logger.debug(f"TextMerger merged {len(textParts)} parts into {len(textMerged)} parts")
|
||||
merged.extend(textMerged)
|
||||
if tableParts:
|
||||
merged.extend(tableMerger.merge(tableParts, strategy))
|
||||
tableMerged = tableMerger.merge(tableParts, strategy)
|
||||
logger.debug(f"TableMerger merged {len(tableParts)} parts into {len(tableMerged)} parts")
|
||||
merged.extend(tableMerged)
|
||||
if structureParts:
|
||||
# For now, treat structure like text
|
||||
merged.extend(textMerger.merge(structureParts, strategy))
|
||||
structureMerged = textMerger.merge(structureParts, strategy)
|
||||
logger.debug(f"StructureMerger merged {len(structureParts)} parts into {len(structureMerged)} parts")
|
||||
merged.extend(structureMerged)
|
||||
if otherParts:
|
||||
merged.extend(defaultMerger.merge(otherParts, strategy))
|
||||
otherMerged = defaultMerger.merge(otherParts, strategy)
|
||||
logger.debug(f"DefaultMerger merged {len(otherParts)} parts into {len(otherMerged)} parts")
|
||||
merged.extend(otherMerged)
|
||||
|
||||
logger.debug(f"_applyMerging returning {len(merged)} parts")
|
||||
return merged
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,15 +1,38 @@
|
|||
from typing import Any, Dict, Optional
|
||||
import logging
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Extractor:
    """
    Base class for all document extractors.

    Subclasses override:
    - detect(): decide whether this extractor can handle a given file
    - extract(): pull content out of the file bytes
    - getSupportedExtensions(): advertise handled file extensions
    - getSupportedMimeTypes(): advertise handled MIME types
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Default: claim nothing; concrete extractors opt in."""
        return False

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
        """Extract content from the file bytes; must be overridden."""
        raise NotImplementedError

    def getSupportedExtensions(self) -> list[str]:
        """Extensions (with leading dots) handled by this extractor; none by default."""
        return []

    def getSupportedMimeTypes(self) -> list[str]:
        """MIME types handled by this extractor; none by default."""
        return []
|
||||
|
||||
|
||||
class Chunker:
|
||||
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
|
||||
|
|
@ -20,51 +43,86 @@ class ExtractorRegistry:
|
|||
def __init__(self):
|
||||
self._map: Dict[str, Extractor] = {}
|
||||
self._fallback: Optional[Extractor] = None
|
||||
# Register built-ins
|
||||
self._auto_discover_extractors()
|
||||
|
||||
def _auto_discover_extractors(self):
|
||||
"""Auto-discover and register all extractors from the extractors directory."""
|
||||
try:
|
||||
from .formats.text_extractor import TextExtractor
|
||||
from .formats.csv_extractor import CsvExtractor
|
||||
from .formats.json_extractor import JsonExtractor
|
||||
from .formats.xml_extractor import XmlExtractor
|
||||
from .formats.html_extractor import HtmlExtractor
|
||||
from .formats.pdf_extractor import PdfExtractor
|
||||
from .formats.docx_extractor import DocxExtractor
|
||||
from .formats.xlsx_extractor import XlsxExtractor
|
||||
from .formats.image_extractor import ImageExtractor
|
||||
from .formats.binary_extractor import BinaryExtractor
|
||||
self.register("text/plain", TextExtractor())
|
||||
self.register("text/markdown", TextExtractor())
|
||||
self.register("text/csv", CsvExtractor())
|
||||
self.register("application/json", JsonExtractor())
|
||||
self.register("application/xml", XmlExtractor())
|
||||
self.register("text/html", HtmlExtractor())
|
||||
self.register("application/pdf", PdfExtractor())
|
||||
self.register("application/vnd.openxmlformats-officedocument.wordprocessingml.document", DocxExtractor())
|
||||
self.register("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", XlsxExtractor())
|
||||
# images
|
||||
self.register("image/jpeg", ImageExtractor())
|
||||
self.register("image/png", ImageExtractor())
|
||||
self.register("image/gif", ImageExtractor())
|
||||
# extension fallbacks
|
||||
self.register("txt", TextExtractor())
|
||||
self.register("md", TextExtractor())
|
||||
self.register("csv", CsvExtractor())
|
||||
self.register("json", JsonExtractor())
|
||||
self.register("xml", XmlExtractor())
|
||||
self.register("html", HtmlExtractor())
|
||||
self.register("htm", HtmlExtractor())
|
||||
self.register("pdf", PdfExtractor())
|
||||
self.register("docx", DocxExtractor())
|
||||
self.register("xlsx", XlsxExtractor())
|
||||
self.register("xlsm", XlsxExtractor())
|
||||
# fallback
|
||||
self.setFallback(BinaryExtractor())
|
||||
print(f"✅ ExtractorRegistry: Successfully registered {len(self._map)} extractors")
|
||||
import os
|
||||
import importlib
|
||||
from pathlib import Path
|
||||
|
||||
# Get the extractors directory
|
||||
current_dir = Path(__file__).parent
|
||||
extractors_dir = current_dir / "extractors"
|
||||
|
||||
if not extractors_dir.exists():
|
||||
logger.error(f"Extractors directory not found: {extractors_dir}")
|
||||
return
|
||||
|
||||
# Import all extractor modules
|
||||
extractor_modules = []
|
||||
for file_path in extractors_dir.glob("extractor*.py"):
|
||||
if file_path.name == "__init__.py":
|
||||
continue
|
||||
|
||||
module_name = file_path.stem
|
||||
try:
|
||||
# Import the module
|
||||
module = importlib.import_module(f".{module_name}", package="modules.services.serviceExtraction.extractors")
|
||||
|
||||
# Find all extractor classes in the module
|
||||
for attr_name in dir(module):
|
||||
attr = getattr(module, attr_name)
|
||||
if (isinstance(attr, type) and
|
||||
issubclass(attr, Extractor) and
|
||||
attr != Extractor and
|
||||
not attr_name.startswith('_')):
|
||||
|
||||
# Create instance and auto-register
|
||||
extractor_instance = attr()
|
||||
self._auto_register_extractor(extractor_instance)
|
||||
extractor_modules.append(attr_name)
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ ExtractorRegistry: Failed to register extractors: {str(e)}")
|
||||
logger.warning(f"Failed to import {module_name}: {str(e)}")
|
||||
continue
|
||||
|
||||
# Set fallback extractor
|
||||
try:
|
||||
from .extractors.extractorBinary import BinaryExtractor
|
||||
self.setFallback(BinaryExtractor())
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to set fallback extractor: {str(e)}")
|
||||
|
||||
logger.info(f"ExtractorRegistry: Auto-discovered and registered {len(extractor_modules)} extractor classes: {', '.join(extractor_modules)}")
|
||||
logger.info(f"ExtractorRegistry: Total registered formats: {len(self._map)}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"ExtractorRegistry: Failed to auto-discover extractors: {str(e)}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
def _auto_register_extractor(self, extractor: Extractor):
    """Register an extractor under each format it declares.

    Maps every advertised MIME type and every advertised extension (with
    the leading dot stripped — registry keys are bare extensions) to the
    instance via self.register(). Failures are logged, never raised, so one
    faulty extractor cannot break registry start-up.
    """
    try:
        for mime_type in extractor.getSupportedMimeTypes():
            self.register(mime_type, extractor)
            logger.debug(f"Registered MIME type: {mime_type} → {extractor.__class__.__name__}")

        for ext in extractor.getSupportedExtensions():
            ext_key = ext.lstrip('.')
            self.register(ext_key, extractor)
            logger.debug(f"Registered extension: .{ext_key} → {extractor.__class__.__name__}")

    except Exception as e:
        logger.error(f"Failed to auto-register {extractor.__class__.__name__}: {str(e)}")
|
||||
|
||||
def register(self, key: str, extractor: Extractor):
    """Map a lookup key (MIME type or bare extension) to an extractor instance."""
    # Last registration wins for duplicate keys.
    self._map[key] = extractor
|
||||
|
||||
|
|
@ -81,6 +139,43 @@ class ExtractorRegistry:
|
|||
return self._map[ext]
|
||||
return self._fallback
|
||||
|
||||
def getAllSupportedFormats(self) -> Dict[str, Dict[str, list[str]]]:
    """
    Collect every format advertised by the registered extractors.

    Returns:
        {
            "extensions": {registry_key: [".ext1", ...]},
            "mime_types": {registry_key: ["mime/type1", ...]}
        }
        An entry appears only when the extractor reports a non-empty list.
        The fallback extractor (if configured) is listed under "fallback"
        even when its lists are empty, mirroring the registered-extractor
        scan's hasattr-guarded access.
    """
    formats: Dict[str, Dict[str, list[str]]] = {"extensions": {}, "mime_types": {}}

    for key, extractor in self._map.items():
        if hasattr(extractor, 'getSupportedExtensions'):
            ext_list = extractor.getSupportedExtensions()
            if ext_list:
                formats["extensions"][key] = ext_list
        if hasattr(extractor, 'getSupportedMimeTypes'):
            mime_list = extractor.getSupportedMimeTypes()
            if mime_list:
                formats["mime_types"][key] = mime_list

    fallback = self._fallback
    if fallback and hasattr(fallback, 'getSupportedExtensions'):
        formats["extensions"]["fallback"] = fallback.getSupportedExtensions()
    if fallback and hasattr(fallback, 'getSupportedMimeTypes'):
        formats["mime_types"]["fallback"] = fallback.getSupportedMimeTypes()

    return formats
|
||||
|
||||
|
||||
class ChunkerRegistry:
|
||||
def __init__(self):
|
||||
|
|
@ -88,17 +183,19 @@ class ChunkerRegistry:
|
|||
self._noop = Chunker()
|
||||
# Register default chunkers
|
||||
try:
|
||||
from .chunking.text_chunker import TextChunker
|
||||
from .chunking.table_chunker import TableChunker
|
||||
from .chunking.structure_chunker import StructureChunker
|
||||
# Skip ImageChunker for now to avoid PIL import hang
|
||||
# from .chunking.image_chunker import ImageChunker
|
||||
from .chunking.chunkerText import TextChunker
|
||||
from .chunking.chunkerTable import TableChunker
|
||||
from .chunking.chunkerStructure import StructureChunker
|
||||
from .chunking.chunkerImage import ImageChunker
|
||||
self.register("text", TextChunker())
|
||||
self.register("table", TableChunker())
|
||||
self.register("structure", StructureChunker())
|
||||
# self.register("image", ImageChunker())
|
||||
self.register("image", ImageChunker())
|
||||
# Use text chunker for container and binary content
|
||||
self.register("container", TextChunker())
|
||||
self.register("binary", TextChunker())
|
||||
except Exception as e:
|
||||
print(f"❌ ChunkerRegistry: Failed to register chunkers: {str(e)}")
|
||||
logger.error(f"ChunkerRegistry: Failed to register chunkers: {str(e)}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import logging
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional
|
||||
import json
|
||||
from typing import Any, Dict, List, Optional, Union, Tuple
|
||||
from datetime import datetime, UTC
|
||||
import re
|
||||
from modules.shared.timezoneUtils import get_utc_timestamp
|
||||
|
|
@ -18,7 +19,7 @@ logger = logging.getLogger(__name__)
|
|||
class GenerationService:
|
||||
def __init__(self, serviceCenter=None):
|
||||
# Directly use interfaces from the provided service center (no self.service calls)
|
||||
self.serviceCenter = serviceCenter
|
||||
self.services = serviceCenter
|
||||
self.interfaceDbComponent = getattr(serviceCenter, 'interfaceDbComponent', None) if serviceCenter else None
|
||||
self.interfaceDbChat = getattr(serviceCenter, 'interfaceDbChat', None) if serviceCenter else None
|
||||
self.workflow = getattr(serviceCenter, 'workflow', None) if serviceCenter else None
|
||||
|
|
@ -296,101 +297,237 @@ class GenerationService:
|
|||
'workflowId': 'unknown'
|
||||
}
|
||||
|
||||
async def renderReport(self, extracted_content: str, output_format: str, title: str) -> tuple[str, str]:
|
||||
async def renderReport(self, extractedContent: Dict[str, Any], outputFormat: str, title: str, userPrompt: str = None, aiService=None) -> tuple[str, str]:
|
||||
"""
|
||||
Render extracted content to the specified output format.
|
||||
Render extracted JSON content to the specified output format.
|
||||
|
||||
Args:
|
||||
extracted_content: Content extracted by AI using format-specific prompt
|
||||
output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
|
||||
extractedContent: Structured JSON document from AI extraction
|
||||
outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
|
||||
title: Report title
|
||||
userPrompt: User's original prompt for report generation
|
||||
aiService: AI service instance for generation prompt creation
|
||||
|
||||
Returns:
|
||||
tuple: (rendered_content, mime_type)
|
||||
"""
|
||||
try:
|
||||
# DEBUG: dump renderer input to diagnose JSON+HTML mixtures TODO REMOVE
|
||||
# Validate JSON input
|
||||
if not isinstance(extractedContent, dict):
|
||||
raise ValueError("extractedContent must be a JSON dictionary")
|
||||
|
||||
if "sections" not in extractedContent:
|
||||
raise ValueError("extractedContent must contain 'sections' field")
|
||||
|
||||
# DEBUG: Log renderer input metadata only (no verbose JSON) - only if debug enabled
|
||||
try:
|
||||
debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
||||
if debug_enabled:
|
||||
import os
|
||||
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
||||
debug_root = "./test-chat/ai"
|
||||
debug_dir = os.path.join(debug_root, f"render_input_{ts}")
|
||||
os.makedirs(debug_dir, exist_ok=True)
|
||||
with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n")
|
||||
with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(extracted_content or "")
|
||||
f.write(f"title: {title}\nformat: {outputFormat}\ncontent_type: {type(extractedContent).__name__}\n")
|
||||
f.write(f"content_size: {len(str(extractedContent))} characters\n")
|
||||
f.write(f"sections_count: {len(extractedContent.get('sections', []))}\n")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Get the appropriate renderer for the format
|
||||
renderer = self._getFormatRenderer(output_format)
|
||||
renderer = self._getFormatRenderer(outputFormat)
|
||||
if not renderer:
|
||||
raise ValueError(f"Unsupported output format: {output_format}")
|
||||
raise ValueError(f"Unsupported output format: {outputFormat}")
|
||||
|
||||
# Render the content
|
||||
rendered_content, mime_type = await renderer.render(extracted_content, title)
|
||||
# Render the JSON content directly (AI generation handled by main service)
|
||||
renderedContent, mimeType = await renderer.render(extractedContent, title, userPrompt, aiService)
|
||||
# DEBUG: dump rendered output
|
||||
try:
|
||||
import os
|
||||
with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(rendered_content or "")
|
||||
f.write(renderedContent or "")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters")
|
||||
return rendered_content, mime_type
|
||||
logger.info(f"Successfully rendered JSON report to {outputFormat} format: {len(renderedContent)} characters")
|
||||
return renderedContent, mimeType
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error rendering report to {output_format}: {str(e)}")
|
||||
logger.error(f"Error rendering JSON report to {outputFormat}: {str(e)}")
|
||||
raise
|
||||
|
||||
def getExtractionPrompt(self, output_format: str, user_prompt: str, title: str) -> str:
|
||||
async def getAdaptiveExtractionPrompt(
|
||||
self,
|
||||
outputFormat: str,
|
||||
userPrompt: str,
|
||||
title: str,
|
||||
promptAnalysis: Dict[str, Any],
|
||||
aiService=None
|
||||
) -> str:
|
||||
"""Get adaptive extraction prompt based on AI analysis."""
|
||||
from .subPromptBuilder import buildAdaptiveExtractionPrompt
|
||||
return await buildAdaptiveExtractionPrompt(
|
||||
outputFormat=outputFormat,
|
||||
userPrompt=userPrompt,
|
||||
title=title,
|
||||
promptAnalysis=promptAnalysis,
|
||||
aiService=aiService,
|
||||
services=self.services
|
||||
)
|
||||
|
||||
async def getGenerationPrompt(
|
||||
self,
|
||||
outputFormat: str,
|
||||
userPrompt: str,
|
||||
title: str,
|
||||
aiService=None
|
||||
) -> str:
|
||||
"""Get generation prompt for enhancing extracted JSON content."""
|
||||
from .subPromptBuilder import buildGenerationPrompt
|
||||
return await buildGenerationPrompt(
|
||||
outputFormat=outputFormat,
|
||||
userPrompt=userPrompt,
|
||||
title=title,
|
||||
aiService=aiService,
|
||||
services=self.services
|
||||
)
|
||||
|
||||
async def getGenericExtractionPrompt(
|
||||
self,
|
||||
outputFormat: str,
|
||||
userPrompt: str,
|
||||
title: str,
|
||||
aiService=None
|
||||
) -> str:
|
||||
"""Get generic extraction prompt that works for both single and multi-file."""
|
||||
from .subPromptBuilder import buildGenericExtractionPrompt
|
||||
return await buildGenericExtractionPrompt(
|
||||
outputFormat=outputFormat,
|
||||
userPrompt=userPrompt,
|
||||
title=title,
|
||||
aiService=aiService,
|
||||
services=self.services
|
||||
)
|
||||
|
||||
async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
|
||||
"""
|
||||
Get the format-specific extraction prompt for AI content extraction.
|
||||
|
||||
Args:
|
||||
output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
|
||||
user_prompt: User's original prompt for report generation
|
||||
outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
|
||||
userPrompt: User's original prompt for report generation
|
||||
title: Report title
|
||||
aiService: AI service instance for intent extraction
|
||||
|
||||
Returns:
|
||||
str: Format-specific prompt for AI extraction
|
||||
"""
|
||||
try:
|
||||
# Get the appropriate renderer for the format
|
||||
renderer = self._getFormatRenderer(output_format)
|
||||
renderer = self._getFormatRenderer(outputFormat)
|
||||
if not renderer:
|
||||
raise ValueError(f"Unsupported output format: {output_format}")
|
||||
raise ValueError(f"Unsupported output format: {outputFormat}")
|
||||
|
||||
# Build centralized prompt with generic rules + format-specific guidelines
|
||||
from .prompt_builder import buildExtractionPrompt
|
||||
extraction_prompt = buildExtractionPrompt(
|
||||
output_format=output_format,
|
||||
from .subPromptBuilder import buildExtractionPrompt
|
||||
extractionPrompt = await buildExtractionPrompt(
|
||||
outputFormat=outputFormat,
|
||||
renderer=renderer,
|
||||
user_prompt=user_prompt,
|
||||
title=title
|
||||
userPrompt=userPrompt,
|
||||
title=title,
|
||||
aiService=aiService,
|
||||
services=self.services
|
||||
)
|
||||
|
||||
logger.info(f"Generated {output_format}-specific extraction prompt: {len(extraction_prompt)} characters")
|
||||
return extraction_prompt
|
||||
logger.info(f"Generated {outputFormat}-specific extraction prompt: {len(extractionPrompt)} characters")
|
||||
return extractionPrompt
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting extraction prompt for {output_format}: {str(e)}")
|
||||
logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}")
|
||||
raise
|
||||
|
||||
async def renderAdaptiveReport(
|
||||
self,
|
||||
extractedContent: Dict[str, Any],
|
||||
outputFormat: str,
|
||||
title: str,
|
||||
userPrompt: str = None,
|
||||
aiService=None,
|
||||
isMultiFile: bool = False
|
||||
) -> Union[Tuple[str, str], List[Dict[str, Any]]]:
|
||||
"""Render report adaptively based on content structure."""
|
||||
|
||||
if isMultiFile and "documents" in extractedContent:
|
||||
return await self._renderMultiFileReport(
|
||||
extractedContent, outputFormat, title, userPrompt, aiService
|
||||
)
|
||||
else:
|
||||
return await self._renderSingleFileReport(
|
||||
extractedContent, outputFormat, title, userPrompt, aiService
|
||||
)
|
||||
|
||||
async def _renderMultiFileReport(
|
||||
self,
|
||||
extractedContent: Dict[str, Any],
|
||||
outputFormat: str,
|
||||
title: str,
|
||||
userPrompt: str = None,
|
||||
aiService=None
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Render multiple documents from extracted content."""
|
||||
|
||||
generated_documents = []
|
||||
|
||||
for doc_data in extractedContent.get("documents", []):
|
||||
# Use existing single-file renderer for each document
|
||||
renderer = self._getFormatRenderer(outputFormat)
|
||||
if not renderer:
|
||||
continue
|
||||
|
||||
# Render individual document
|
||||
rendered_content, mime_type = await renderer.render(
|
||||
extractedContent={"sections": doc_data["sections"]},
|
||||
title=doc_data["title"],
|
||||
userPrompt=userPrompt,
|
||||
aiService=aiService
|
||||
)
|
||||
|
||||
generated_documents.append({
|
||||
"filename": doc_data["filename"],
|
||||
"content": rendered_content,
|
||||
"mime_type": mime_type,
|
||||
"title": doc_data["title"]
|
||||
})
|
||||
|
||||
return generated_documents
|
||||
|
||||
async def _renderSingleFileReport(
|
||||
self,
|
||||
extractedContent: Dict[str, Any],
|
||||
outputFormat: str,
|
||||
title: str,
|
||||
userPrompt: str = None,
|
||||
aiService=None
|
||||
) -> Tuple[str, str]:
|
||||
"""Render single file report (existing functionality)."""
|
||||
# Use existing renderReport method
|
||||
return await self.renderReport(
|
||||
extractedContent, outputFormat, title, userPrompt, aiService
|
||||
)
|
||||
|
||||
def _getFormatRenderer(self, output_format: str):
|
||||
"""Get the appropriate renderer for the specified format using auto-discovery."""
|
||||
try:
|
||||
from .renderers.registry import get_renderer
|
||||
renderer = get_renderer(output_format)
|
||||
renderer = get_renderer(output_format, services=self.services)
|
||||
|
||||
if renderer:
|
||||
return renderer
|
||||
|
||||
# Fallback to text renderer if no specific renderer found
|
||||
logger.warning(f"No renderer found for format {output_format}, falling back to text")
|
||||
fallback_renderer = get_renderer('text')
|
||||
fallback_renderer = get_renderer('text', services=self.services)
|
||||
if fallback_renderer:
|
||||
return fallback_renderer
|
||||
|
||||
|
|
|
|||
|
|
@ -1,72 +0,0 @@
|
|||
"""
|
||||
Centralized prompt builder for document generation across formats.
|
||||
|
||||
Builds a robust prompt that:
|
||||
- Accepts any user intent (no fixed structure assumptions)
|
||||
- Injects format-specific guidelines from the selected renderer
|
||||
- Adds a common policy section to always use real data from source docs
|
||||
- Requires the AI to output a filename header that we can parse and use
|
||||
"""
|
||||
|
||||
from typing import Protocol
|
||||
|
||||
|
||||
class _RendererLike(Protocol):
|
||||
def getExtractionPrompt(self, user_prompt: str, title: str) -> str: # returns only format-specific guidelines
|
||||
...
|
||||
|
||||
|
||||
def buildExtractionPrompt(
|
||||
output_format: str,
|
||||
renderer: _RendererLike,
|
||||
user_prompt: str,
|
||||
title: str
|
||||
) -> str:
|
||||
"""
|
||||
Build the final extraction prompt by combining:
|
||||
- The raw user prompt (verbatim)
|
||||
- Generic cross-format instructions (filename header + real-data policy)
|
||||
- Format-specific guidelines snippet provided by the renderer
|
||||
|
||||
The AI must place a single filename header at the very top:
|
||||
FILENAME: <safe-file-name-with-extension>
|
||||
followed by a blank line and then ONLY the document content according to the target format.
|
||||
"""
|
||||
|
||||
format_guidelines = renderer.getExtractionPrompt(user_prompt, title)
|
||||
|
||||
# Generic block appears once for every format
|
||||
generic_intro = f"""
|
||||
{user_prompt}
|
||||
|
||||
You are generating a document in {output_format.upper()} format for the title: "{title}".
|
||||
|
||||
Rules:
|
||||
- The user's intent fully defines the structure. Do not assume a fixed template or headings.
|
||||
- Use only factual information extracted from the supplied source documents.
|
||||
- Do not invent, hallucinate, or include placeholders (e.g., "lorem ipsum", "TBD").
|
||||
- The output must strictly follow the target format and be ready for saving without extra wrapping.
|
||||
- At the VERY TOP output exactly one line with the filename header:
|
||||
FILENAME: <safe-file-name-with-extension>
|
||||
- The base name should be short, descriptive, and kebab-case or snake-case without spaces.
|
||||
- Include the correct extension for the requested format (e.g., .html, .pdf, .docx, .md, .txt, .json, .csv, .xlsx).
|
||||
- Avoid special characters beyond [a-zA-Z0-9-_].
|
||||
- After this header, insert a single blank line and then provide ONLY the document content.
|
||||
|
||||
Common policy:
|
||||
- Use the actual data from the source documents to create the content.
|
||||
- Do not generate placeholder text or templates.
|
||||
- Extract and use the real data provided in the source documents to create meaningful content.
|
||||
""".strip()
|
||||
|
||||
# Final assembly
|
||||
final_prompt = (
|
||||
generic_intro
|
||||
+ "\n\nFORMAT-SPECIFIC GUIDELINES:\n"
|
||||
+ format_guidelines.strip()
|
||||
+ "\n\nGenerate the complete document content now based on the source documents below:"
|
||||
)
|
||||
|
||||
return final_prompt
|
||||
|
||||
|
||||
|
|
@ -1,86 +0,0 @@
|
|||
"""
|
||||
Base renderer class for all format renderers.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any, Tuple, List
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class BaseRenderer(ABC):
|
||||
"""Base class for all format renderers."""
|
||||
|
||||
def __init__(self):
|
||||
self.logger = logger
|
||||
|
||||
@classmethod
|
||||
def get_supported_formats(cls) -> List[str]:
|
||||
"""
|
||||
Return list of supported format names for this renderer.
|
||||
Override this method in subclasses to specify supported formats.
|
||||
"""
|
||||
return []
|
||||
|
||||
@classmethod
|
||||
def get_format_aliases(cls) -> List[str]:
|
||||
"""
|
||||
Return list of format aliases for this renderer.
|
||||
Override this method in subclasses to specify format aliases.
|
||||
"""
|
||||
return []
|
||||
|
||||
@classmethod
|
||||
def get_priority(cls) -> int:
|
||||
"""
|
||||
Return priority for this renderer (higher number = higher priority).
|
||||
Used when multiple renderers support the same format.
|
||||
"""
|
||||
return 0
|
||||
|
||||
@abstractmethod
|
||||
def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
|
||||
"""
|
||||
Get the format-specific extraction prompt for AI content extraction.
|
||||
|
||||
Args:
|
||||
user_prompt: User's original prompt for report generation
|
||||
title: Report title
|
||||
|
||||
Returns:
|
||||
str: Format-specific prompt for AI extraction
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
|
||||
"""
|
||||
Render extracted content to the target format.
|
||||
|
||||
Args:
|
||||
extracted_content: Raw content extracted by AI using format-specific prompt
|
||||
title: Report title
|
||||
|
||||
Returns:
|
||||
tuple: (rendered_content, mime_type)
|
||||
"""
|
||||
pass
|
||||
|
||||
def _extract_sections(self, report_data: Dict[str, Any]) -> list:
|
||||
"""Extract sections from report data."""
|
||||
return report_data.get('sections', [])
|
||||
|
||||
def _extract_metadata(self, report_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Extract metadata from report data."""
|
||||
return report_data.get('metadata', {})
|
||||
|
||||
def _get_title(self, report_data: Dict[str, Any], fallback_title: str) -> str:
|
||||
"""Get title from report data or use fallback."""
|
||||
return report_data.get('title', fallback_title)
|
||||
|
||||
def _format_timestamp(self, timestamp: str = None) -> str:
|
||||
"""Format timestamp for display."""
|
||||
if timestamp:
|
||||
return timestamp
|
||||
from datetime import datetime, UTC
|
||||
return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
|
||||
|
|
@ -1,64 +0,0 @@
|
|||
"""
|
||||
CSV renderer for report generation.
|
||||
"""
|
||||
|
||||
from .base_renderer import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
import csv
|
||||
import io
|
||||
|
||||
class CsvRenderer(BaseRenderer):
|
||||
"""Renders content to CSV format with format-specific extraction."""
|
||||
|
||||
@classmethod
|
||||
def get_supported_formats(cls) -> List[str]:
|
||||
"""Return supported CSV formats."""
|
||||
return ['csv']
|
||||
|
||||
@classmethod
|
||||
def get_format_aliases(cls) -> List[str]:
|
||||
"""Return format aliases."""
|
||||
return ['spreadsheet', 'table']
|
||||
|
||||
@classmethod
|
||||
def get_priority(cls) -> int:
|
||||
"""Return priority for CSV renderer."""
|
||||
return 70
|
||||
|
||||
def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
|
||||
"""Return only CSV-specific guidelines; global prompt is built centrally."""
|
||||
return (
|
||||
"CSV FORMAT GUIDELINES:\n"
|
||||
"- Emit ONLY CSV text without fences or commentary.\n"
|
||||
"- Include a single header row with clear column names.\n"
|
||||
"- Quote fields containing commas, quotes, or newlines; escape quotes by doubling them.\n"
|
||||
"- Use rows to represent items/records derived from sources.\n"
|
||||
"- Keep cells concise; include units in headers when useful.\n"
|
||||
"OUTPUT: Return ONLY valid CSV content that can be imported."
|
||||
)
|
||||
|
||||
async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
|
||||
"""Render extracted content to CSV format."""
|
||||
try:
|
||||
# The extracted content should already be CSV from the AI
|
||||
# Just clean it up
|
||||
csv_content = self._clean_csv_content(extracted_content, title)
|
||||
|
||||
return csv_content, "text/csv"
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error rendering CSV: {str(e)}")
|
||||
# Return minimal CSV fallback
|
||||
return f"Title,Content\n{title},Error rendering report: {str(e)}", "text/csv"
|
||||
|
||||
def _clean_csv_content(self, content: str, title: str) -> str:
|
||||
"""Clean and validate CSV content from AI."""
|
||||
content = content.strip()
|
||||
|
||||
# Remove markdown code blocks if present
|
||||
if content.startswith("```") and content.endswith("```"):
|
||||
lines = content.split('\n')
|
||||
if len(lines) > 2:
|
||||
content = '\n'.join(lines[1:-1]).strip()
|
||||
|
||||
return content
|
||||
|
|
@ -1,249 +0,0 @@
|
|||
"""
|
||||
DOCX renderer for report generation using python-docx.
|
||||
"""
|
||||
|
||||
from .base_renderer import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
import io
|
||||
import base64
|
||||
from datetime import datetime, UTC
|
||||
|
||||
try:
|
||||
from docx import Document
|
||||
from docx.shared import Inches, Pt
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from docx.enum.table import WD_TABLE_ALIGNMENT
|
||||
from docx.oxml.shared import OxmlElement, qn
|
||||
from docx.oxml.ns import nsdecls
|
||||
from docx.oxml import parse_xml
|
||||
DOCX_AVAILABLE = True
|
||||
except ImportError:
|
||||
DOCX_AVAILABLE = False
|
||||
|
||||
class DocxRenderer(BaseRenderer):
|
||||
"""Renders content to DOCX format using python-docx."""
|
||||
|
||||
@classmethod
|
||||
def get_supported_formats(cls) -> List[str]:
|
||||
"""Return supported DOCX formats."""
|
||||
return ['docx', 'doc']
|
||||
|
||||
@classmethod
|
||||
def get_format_aliases(cls) -> List[str]:
|
||||
"""Return format aliases."""
|
||||
return ['word', 'document']
|
||||
|
||||
@classmethod
|
||||
def get_priority(cls) -> int:
|
||||
"""Return priority for DOCX renderer."""
|
||||
return 115
|
||||
|
||||
def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
|
||||
"""Return only DOCX-specific guidelines; global prompt is built centrally."""
|
||||
return (
|
||||
"DOCX FORMAT GUIDELINES:\n"
|
||||
"- Provide plain text content suitable for Word generation (no markdown/HTML).\n"
|
||||
"- Use clear section hierarchy; bullet and numbered lists where needed.\n"
|
||||
"- Include tables as simple pipe-delimited lines if tabular data is needed.\n"
|
||||
"OUTPUT: Return ONLY the structured plain text to be converted into DOCX."
|
||||
)
|
||||
|
||||
async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
|
||||
"""Render extracted content to DOCX format."""
|
||||
try:
|
||||
if not DOCX_AVAILABLE:
|
||||
# Fallback to HTML if python-docx not available
|
||||
from .html_renderer import HtmlRenderer
|
||||
html_renderer = HtmlRenderer()
|
||||
html_content, _ = await html_renderer.render(extracted_content, title)
|
||||
return html_content, "text/html"
|
||||
|
||||
# Generate DOCX using python-docx
|
||||
docx_content = self._generate_docx(extracted_content, title)
|
||||
|
||||
return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error rendering DOCX: {str(e)}")
|
||||
# Return minimal fallback
|
||||
return f"DOCX Generation Error: {str(e)}", "text/plain"
|
||||
|
||||
def _generate_docx(self, content: str, title: str) -> str:
|
||||
"""Generate DOCX content using python-docx."""
|
||||
try:
|
||||
# Create new document
|
||||
doc = Document()
|
||||
|
||||
# Set up document styles
|
||||
self._setup_document_styles(doc)
|
||||
|
||||
# Add title
|
||||
title_para = doc.add_heading(title, 0)
|
||||
title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
|
||||
# Add generation date
|
||||
date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}")
|
||||
date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
|
||||
# Add page break
|
||||
doc.add_page_break()
|
||||
|
||||
# Process content
|
||||
lines = content.split('\n')
|
||||
current_section = []
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
# Check for ALL CAPS headings (major headings)
|
||||
if line.isupper() and len(line) > 3 and not line.startswith('-') and not line.startswith('*'):
|
||||
if current_section:
|
||||
self._process_section(doc, current_section)
|
||||
current_section = []
|
||||
doc.add_heading(line, level=1)
|
||||
# Check for Title Case headings (subheadings)
|
||||
elif line.istitle() and len(line) > 5 and not line.startswith('-') and not line.startswith('*') and not line.startswith(('1.', '2.', '3.', '4.', '5.')):
|
||||
if current_section:
|
||||
self._process_section(doc, current_section)
|
||||
current_section = []
|
||||
doc.add_heading(line, level=2)
|
||||
# Check for markdown headings (fallback)
|
||||
elif line.startswith('# '):
|
||||
# H1 heading
|
||||
if current_section:
|
||||
self._process_section(doc, current_section)
|
||||
current_section = []
|
||||
doc.add_heading(line[2:], level=1)
|
||||
elif line.startswith('## '):
|
||||
# H2 heading
|
||||
if current_section:
|
||||
self._process_section(doc, current_section)
|
||||
current_section = []
|
||||
doc.add_heading(line[3:], level=2)
|
||||
elif line.startswith('### '):
|
||||
# H3 heading
|
||||
if current_section:
|
||||
self._process_section(doc, current_section)
|
||||
current_section = []
|
||||
doc.add_heading(line[4:], level=3)
|
||||
else:
|
||||
current_section.append(line)
|
||||
|
||||
# Process remaining content
|
||||
if current_section:
|
||||
self._process_section(doc, current_section)
|
||||
|
||||
# Save to buffer
|
||||
buffer = io.BytesIO()
|
||||
doc.save(buffer)
|
||||
buffer.seek(0)
|
||||
|
||||
# Convert to base64
|
||||
docx_bytes = buffer.getvalue()
|
||||
docx_base64 = base64.b64encode(docx_bytes).decode('utf-8')
|
||||
|
||||
return docx_base64
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error generating DOCX: {str(e)}")
|
||||
raise
|
||||
|
||||
def _setup_document_styles(self, doc):
|
||||
"""Set up document styles."""
|
||||
try:
|
||||
# Set default font
|
||||
style = doc.styles['Normal']
|
||||
font = style.font
|
||||
font.name = 'Calibri'
|
||||
font.size = Pt(11)
|
||||
|
||||
# Set heading styles
|
||||
for i in range(1, 4):
|
||||
heading_style = doc.styles[f'Heading {i}']
|
||||
heading_font = heading_style.font
|
||||
heading_font.name = 'Calibri'
|
||||
heading_font.size = Pt(16 - i * 2)
|
||||
heading_font.bold = True
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not set up document styles: {str(e)}")
|
||||
|
||||
def _process_section(self, doc, lines: list):
|
||||
"""Process a section of content into DOCX elements."""
|
||||
for line in lines:
|
||||
if not line.strip():
|
||||
continue
|
||||
|
||||
# Check for tables (lines with |)
|
||||
if '|' in line and not line.startswith('|'):
|
||||
# This might be part of a table, process as table
|
||||
table_data = self._extract_table_data(lines)
|
||||
if table_data:
|
||||
self._add_table(doc, table_data)
|
||||
return
|
||||
|
||||
# Check for lists
|
||||
if line.startswith('- ') or line.startswith('* '):
|
||||
# This is a list item
|
||||
doc.add_paragraph(line[2:], style='List Bullet')
|
||||
elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')):
|
||||
# This is a numbered list item
|
||||
doc.add_paragraph(line[3:], style='List Number')
|
||||
else:
|
||||
# Regular paragraph
|
||||
doc.add_paragraph(line)
|
||||
|
||||
def _extract_table_data(self, lines: list) -> list:
|
||||
"""Extract table data from lines."""
|
||||
table_data = []
|
||||
in_table = False
|
||||
|
||||
for line in lines:
|
||||
if '|' in line:
|
||||
if not in_table:
|
||||
in_table = True
|
||||
# Split by | and clean up
|
||||
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
|
||||
if cells:
|
||||
table_data.append(cells)
|
||||
elif in_table and not line.strip():
|
||||
# Empty line, might be end of table
|
||||
break
|
||||
|
||||
return table_data if len(table_data) > 1 else []
|
||||
|
||||
def _add_table(self, doc, table_data: list):
|
||||
"""Add a table to the document."""
|
||||
try:
|
||||
if not table_data:
|
||||
return
|
||||
|
||||
# Create table
|
||||
table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
|
||||
table.alignment = WD_TABLE_ALIGNMENT.CENTER
|
||||
|
||||
# Add data to table
|
||||
for row_idx, row_data in enumerate(table_data):
|
||||
for col_idx, cell_data in enumerate(row_data):
|
||||
if col_idx < len(table.rows[row_idx].cells):
|
||||
table.rows[row_idx].cells[col_idx].text = cell_data
|
||||
|
||||
# Style the table
|
||||
self._style_table(table)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not add table: {str(e)}")
|
||||
|
||||
def _style_table(self, table):
|
||||
"""Apply styling to the table."""
|
||||
try:
|
||||
# Style header row
|
||||
if len(table.rows) > 0:
|
||||
header_cells = table.rows[0].cells
|
||||
for cell in header_cells:
|
||||
for paragraph in cell.paragraphs:
|
||||
for run in paragraph.runs:
|
||||
run.bold = True
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not style table: {str(e)}")
|
||||
|
|
@ -1,210 +0,0 @@
|
|||
"""
|
||||
Excel renderer for report generation using openpyxl.
|
||||
"""
|
||||
|
||||
from .base_renderer import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
import io
|
||||
import base64
|
||||
from datetime import datetime, UTC
|
||||
|
||||
try:
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
||||
from openpyxl.utils import get_column_letter
|
||||
from openpyxl.worksheet.table import Table, TableStyleInfo
|
||||
OPENPYXL_AVAILABLE = True
|
||||
except ImportError:
|
||||
OPENPYXL_AVAILABLE = False
|
||||
|
||||
class ExcelRenderer(BaseRenderer):
|
||||
"""Renders content to Excel format using openpyxl."""
|
||||
|
||||
@classmethod
|
||||
def get_supported_formats(cls) -> List[str]:
|
||||
"""Return supported Excel formats."""
|
||||
return ['xlsx', 'xls', 'excel']
|
||||
|
||||
@classmethod
|
||||
def get_format_aliases(cls) -> List[str]:
|
||||
"""Return format aliases."""
|
||||
return ['spreadsheet', 'workbook']
|
||||
|
||||
@classmethod
|
||||
def get_priority(cls) -> int:
|
||||
"""Return priority for Excel renderer."""
|
||||
return 110
|
||||
|
||||
def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
|
||||
"""Return only Excel-specific guidelines; global prompt is built centrally."""
|
||||
return (
|
||||
"EXCEL FORMAT GUIDELINES:\n"
|
||||
"- Output one or more pipe-delimited tables with a single header row.\n"
|
||||
"- Let user intent define columns; use clear names and ISO dates.\n"
|
||||
"- Separate multiple tables by a single blank line.\n"
|
||||
"- No markdown/HTML/code fences; tables only unless user explicitly asks for notes.\n"
|
||||
"OUTPUT: Return ONLY pipe-delimited tables suitable for import."
|
||||
)
|
||||
|
||||
async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
|
||||
"""Render extracted content to Excel format."""
|
||||
try:
|
||||
if not OPENPYXL_AVAILABLE:
|
||||
# Fallback to CSV if openpyxl not available
|
||||
from .csv_renderer import CsvRenderer
|
||||
csv_renderer = CsvRenderer()
|
||||
csv_content, _ = await csv_renderer.render(extracted_content, title)
|
||||
return csv_content, "text/csv"
|
||||
|
||||
# Generate Excel using openpyxl
|
||||
excel_content = self._generate_excel(extracted_content, title)
|
||||
|
||||
return excel_content, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error rendering Excel: {str(e)}")
|
||||
# Return CSV fallback
|
||||
return f"Title,Content\n{title},Error rendering Excel report: {str(e)}", "text/csv"
|
||||
|
||||
def _generate_excel(self, content: str, title: str) -> str:
|
||||
"""Generate Excel content using openpyxl."""
|
||||
try:
|
||||
# Create workbook
|
||||
wb = Workbook()
|
||||
|
||||
# Remove default sheet
|
||||
wb.remove(wb.active)
|
||||
|
||||
# Create sheets
|
||||
summary_sheet = wb.create_sheet("Summary", 0)
|
||||
data_sheet = wb.create_sheet("Data", 1)
|
||||
analysis_sheet = wb.create_sheet("Analysis", 2)
|
||||
|
||||
# Add content to sheets
|
||||
self._populate_summary_sheet(summary_sheet, title)
|
||||
self._populate_data_sheet(data_sheet, content)
|
||||
self._populate_analysis_sheet(analysis_sheet, content)
|
||||
|
||||
# Save to buffer
|
||||
buffer = io.BytesIO()
|
||||
wb.save(buffer)
|
||||
buffer.seek(0)
|
||||
|
||||
# Convert to base64
|
||||
excel_bytes = buffer.getvalue()
|
||||
excel_base64 = base64.b64encode(excel_bytes).decode('utf-8')
|
||||
|
||||
return excel_base64
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error generating Excel: {str(e)}")
|
||||
raise
|
||||
|
||||
def _populate_summary_sheet(self, sheet, title: str):
|
||||
"""Populate the summary sheet."""
|
||||
try:
|
||||
# Title
|
||||
sheet['A1'] = title
|
||||
sheet['A1'].font = Font(size=16, bold=True)
|
||||
sheet['A1'].alignment = Alignment(horizontal='center')
|
||||
|
||||
# Generation info
|
||||
sheet['A3'] = "Generated:"
|
||||
sheet['B3'] = self._format_timestamp()
|
||||
sheet['A4'] = "Status:"
|
||||
sheet['B4'] = "Generated Successfully"
|
||||
|
||||
# Key metrics placeholder
|
||||
sheet['A6'] = "Key Metrics:"
|
||||
sheet['A6'].font = Font(bold=True)
|
||||
sheet['A7'] = "Total Items:"
|
||||
sheet['B7'] = "=COUNTA(Data!A:A)-1" # Count non-empty cells in Data sheet
|
||||
|
||||
# Auto-adjust column widths
|
||||
sheet.column_dimensions['A'].width = 20
|
||||
sheet.column_dimensions['B'].width = 30
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not populate summary sheet: {str(e)}")
|
||||
|
||||
def _populate_data_sheet(self, sheet, content: str):
|
||||
"""Populate the data sheet."""
|
||||
try:
|
||||
# Headers
|
||||
headers = ["Item/Category", "Value/Amount", "Percentage", "Source Document", "Notes/Comments"]
|
||||
for col, header in enumerate(headers, 1):
|
||||
cell = sheet.cell(row=1, column=col, value=header)
|
||||
cell.font = Font(bold=True)
|
||||
cell.fill = PatternFill(start_color="CCCCCC", end_color="CCCCCC", fill_type="solid")
|
||||
|
||||
# Process content
|
||||
lines = content.split('\n')
|
||||
row = 2
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
# Check for table data (lines with |)
|
||||
if '|' in line:
|
||||
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
|
||||
for col, cell_data in enumerate(cells[:5], 1): # Limit to 5 columns
|
||||
sheet.cell(row=row, column=col, value=cell_data)
|
||||
row += 1
|
||||
else:
|
||||
# Regular content
|
||||
sheet.cell(row=row, column=1, value=line)
|
||||
row += 1
|
||||
|
||||
# Auto-adjust column widths
|
||||
for col in range(1, 6):
|
||||
sheet.column_dimensions[get_column_letter(col)].width = 20
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not populate data sheet: {str(e)}")
|
||||
|
||||
def _populate_analysis_sheet(self, sheet, content: str):
|
||||
"""Populate the analysis sheet."""
|
||||
try:
|
||||
# Title
|
||||
sheet['A1'] = "Analysis & Insights"
|
||||
sheet['A1'].font = Font(size=14, bold=True)
|
||||
|
||||
# Content analysis
|
||||
lines = content.split('\n')
|
||||
row = 3
|
||||
|
||||
sheet['A3'] = "Content Analysis:"
|
||||
sheet['A3'].font = Font(bold=True)
|
||||
row += 1
|
||||
|
||||
# Count different types of content
|
||||
table_lines = sum(1 for line in lines if '|' in line)
|
||||
list_lines = sum(1 for line in lines if line.startswith(('- ', '* ')))
|
||||
text_lines = len(lines) - table_lines - list_lines
|
||||
|
||||
sheet[f'A{row}'] = f"Total Lines: {len(lines)}"
|
||||
row += 1
|
||||
sheet[f'A{row}'] = f"Table Rows: {table_lines}"
|
||||
row += 1
|
||||
sheet[f'A{row}'] = f"List Items: {list_lines}"
|
||||
row += 1
|
||||
sheet[f'A{row}'] = f"Text Lines: {text_lines}"
|
||||
row += 2
|
||||
|
||||
# Recommendations
|
||||
sheet[f'A{row}'] = "Recommendations:"
|
||||
sheet[f'A{row}'].font = Font(bold=True)
|
||||
row += 1
|
||||
sheet[f'A{row}'] = "1. Review data accuracy"
|
||||
row += 1
|
||||
sheet[f'A{row}'] = "2. Consider additional analysis"
|
||||
row += 1
|
||||
sheet[f'A{row}'] = "3. Update regularly"
|
||||
|
||||
# Auto-adjust column width
|
||||
sheet.column_dimensions['A'].width = 30
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not populate analysis sheet: {str(e)}")
|
||||
|
|
@ -1,69 +0,0 @@
|
|||
"""
|
||||
HTML renderer for report generation.
|
||||
"""
|
||||
|
||||
from .base_renderer import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
|
||||
class HtmlRenderer(BaseRenderer):
|
||||
"""Renders content to HTML format with format-specific extraction."""
|
||||
|
||||
@classmethod
|
||||
def get_supported_formats(cls) -> List[str]:
|
||||
"""Return supported HTML formats."""
|
||||
return ['html', 'htm']
|
||||
|
||||
@classmethod
|
||||
def get_format_aliases(cls) -> List[str]:
|
||||
"""Return format aliases."""
|
||||
return ['web', 'webpage']
|
||||
|
||||
@classmethod
|
||||
def get_priority(cls) -> int:
|
||||
"""Return priority for HTML renderer."""
|
||||
return 100
|
||||
|
||||
def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
|
||||
"""Return only HTML-specific guidelines; global prompt is built centrally."""
|
||||
return (
|
||||
"HTML FORMAT GUIDELINES:\n"
|
||||
"- Output a complete HTML5 document starting with <!DOCTYPE html>.\n"
|
||||
"- Include <html>, <head> with <meta charset=\"UTF-8\"> and <title>, and <body>.\n"
|
||||
"- Use semantic elements: <header>, <main>, <section>, <article>, <footer>.\n"
|
||||
"- Provide professional CSS in a <style> block; responsive, clean typography.\n"
|
||||
"- Use h1/h2/h3 for headings; tables and lists for structure.\n"
|
||||
"OUTPUT: Return ONLY valid HTML (no markdown, no code fences)."
|
||||
)
|
||||
|
||||
async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
|
||||
"""Render extracted content to HTML format."""
|
||||
try:
|
||||
# The extracted content should already be HTML from the AI
|
||||
# Just clean it up and ensure it's valid
|
||||
html_content = self._clean_html_content(extracted_content, title)
|
||||
|
||||
return html_content, "text/html"
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error rendering HTML: {str(e)}")
|
||||
# Return minimal HTML fallback
|
||||
return f"<html><head><title>{title}</title></head><body><h1>{title}</h1><p>Error rendering report: {str(e)}</p></body></html>", "text/html"
|
||||
|
||||
def _clean_html_content(self, content: str, title: str) -> str:
|
||||
"""Clean and validate HTML content from AI."""
|
||||
content = content.strip()
|
||||
|
||||
# Remove markdown code blocks if present
|
||||
if content.startswith("```") and content.endswith("```"):
|
||||
lines = content.split('\n')
|
||||
if len(lines) > 2:
|
||||
content = '\n'.join(lines[1:-1]).strip()
|
||||
|
||||
# Ensure it starts with DOCTYPE
|
||||
if not content.startswith('<!DOCTYPE'):
|
||||
if content.startswith('<html'):
|
||||
content = '<!DOCTYPE html>\n' + content
|
||||
else:
|
||||
content = f'<!DOCTYPE html>\n<html>\n<head><meta charset="UTF-8"><title>{title}</title></head>\n<body>\n{content}\n</body>\n</html>'
|
||||
|
||||
return content
|
||||
|
|
@ -1,74 +0,0 @@
|
|||
"""
|
||||
JSON renderer for report generation.
|
||||
"""
|
||||
|
||||
from .base_renderer import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
import json
|
||||
|
||||
class JsonRenderer(BaseRenderer):
|
||||
"""Renders content to JSON format with format-specific extraction."""
|
||||
|
||||
@classmethod
|
||||
def get_supported_formats(cls) -> List[str]:
|
||||
"""Return supported JSON formats."""
|
||||
return ['json']
|
||||
|
||||
@classmethod
|
||||
def get_format_aliases(cls) -> List[str]:
|
||||
"""Return format aliases."""
|
||||
return ['data']
|
||||
|
||||
@classmethod
|
||||
def get_priority(cls) -> int:
|
||||
"""Return priority for JSON renderer."""
|
||||
return 80
|
||||
|
||||
def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
|
||||
"""Return only JSON-specific guidelines; global prompt is built centrally."""
|
||||
return (
|
||||
"JSON FORMAT GUIDELINES:\n"
|
||||
"- Output ONLY a single valid JSON object (no fences, no pre/post text).\n"
|
||||
"- Choose a structure that best fits the user's intent; include a top-level title and data.\n"
|
||||
"- Prefer arrays/objects that map cleanly to the extracted facts.\n"
|
||||
"- Include minimal metadata only if useful (e.g., generatedAt, sources).\n"
|
||||
"OUTPUT: Return ONLY valid, parseable JSON."
|
||||
)
|
||||
|
||||
async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
|
||||
"""Render extracted content to JSON format."""
|
||||
try:
|
||||
# The extracted content should already be JSON from the AI
|
||||
# Just validate and format it
|
||||
json_content = self._clean_json_content(extracted_content, title)
|
||||
|
||||
return json_content, "application/json"
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error rendering JSON: {str(e)}")
|
||||
# Return minimal JSON fallback
|
||||
fallback_data = {
|
||||
"title": title,
|
||||
"sections": [{"type": "text", "content": f"Error rendering report: {str(e)}"}],
|
||||
"metadata": {"error": str(e)}
|
||||
}
|
||||
return json.dumps(fallback_data, indent=2), "application/json"
|
||||
|
||||
def _clean_json_content(self, content: str, title: str) -> str:
|
||||
"""Clean and validate JSON content from AI."""
|
||||
content = content.strip()
|
||||
|
||||
# Remove markdown code blocks if present
|
||||
if content.startswith("```") and content.endswith("```"):
|
||||
lines = content.split('\n')
|
||||
if len(lines) > 2:
|
||||
content = '\n'.join(lines[1:-1]).strip()
|
||||
|
||||
# Validate JSON
|
||||
try:
|
||||
parsed = json.loads(content)
|
||||
# Re-format with proper indentation
|
||||
return json.dumps(parsed, indent=2, ensure_ascii=False)
|
||||
except json.JSONDecodeError:
|
||||
# If not valid JSON, return as-is
|
||||
return content
|
||||
|
|
@ -1,65 +0,0 @@
|
|||
"""
|
||||
Markdown renderer for report generation.
|
||||
"""
|
||||
|
||||
from .base_renderer import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
|
||||
class MarkdownRenderer(BaseRenderer):
|
||||
"""Renders content to Markdown format with format-specific extraction."""
|
||||
|
||||
@classmethod
|
||||
def get_supported_formats(cls) -> List[str]:
|
||||
"""Return supported Markdown formats."""
|
||||
return ['md', 'markdown']
|
||||
|
||||
@classmethod
|
||||
def get_format_aliases(cls) -> List[str]:
|
||||
"""Return format aliases."""
|
||||
return ['mdown', 'mkd']
|
||||
|
||||
@classmethod
|
||||
def get_priority(cls) -> int:
|
||||
"""Return priority for markdown renderer."""
|
||||
return 95
|
||||
|
||||
def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
|
||||
"""Return only Markdown-specific guidelines; global prompt is built centrally."""
|
||||
return (
|
||||
"MARKDOWN FORMAT GUIDELINES:\n"
|
||||
"- Use proper Markdown syntax only (no HTML wrappers).\n"
|
||||
"- # for main title, ## for sections, ### for subsections.\n"
|
||||
"- Tables with | separators and a header row.\n"
|
||||
"- Bullet lists with - or *.\n"
|
||||
"- Emphasis with **bold** and *italic*.\n"
|
||||
"- Code blocks with ```language.\n"
|
||||
"- Horizontal rules (---) to separate major sections when helpful.\n"
|
||||
"- Include links [text](url) and images  when referenced by sources.\n"
|
||||
"OUTPUT: Return ONLY raw Markdown content without code fences."
|
||||
)
|
||||
|
||||
async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
|
||||
"""Render extracted content to Markdown format."""
|
||||
try:
|
||||
# The extracted content should already be Markdown from the AI
|
||||
# Just clean it up
|
||||
markdown_content = self._clean_markdown_content(extracted_content, title)
|
||||
|
||||
return markdown_content, "text/markdown"
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error rendering markdown: {str(e)}")
|
||||
# Return minimal markdown fallback
|
||||
return f"# {title}\n\nError rendering report: {str(e)}", "text/markdown"
|
||||
|
||||
def _clean_markdown_content(self, content: str, title: str) -> str:
|
||||
"""Clean and validate Markdown content from AI."""
|
||||
content = content.strip()
|
||||
|
||||
# Remove markdown code blocks if present
|
||||
if content.startswith("```") and content.endswith("```"):
|
||||
lines = content.split('\n')
|
||||
if len(lines) > 2:
|
||||
content = '\n'.join(lines[1:-1]).strip()
|
||||
|
||||
return content
|
||||
|
|
@ -1,225 +0,0 @@
|
|||
"""
|
||||
PDF renderer for report generation using reportlab.
|
||||
"""
|
||||
|
||||
from .base_renderer import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
import io
|
||||
import base64
|
||||
from datetime import datetime, UTC
|
||||
|
||||
try:
|
||||
from reportlab.lib.pagesizes import letter, A4
|
||||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
|
||||
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
||||
from reportlab.lib.units import inch
|
||||
from reportlab.lib import colors
|
||||
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
|
||||
REPORTLAB_AVAILABLE = True
|
||||
except ImportError:
|
||||
REPORTLAB_AVAILABLE = False
|
||||
|
||||
class PdfRenderer(BaseRenderer):
|
||||
"""Renders content to PDF format using reportlab."""
|
||||
|
||||
@classmethod
|
||||
def get_supported_formats(cls) -> List[str]:
|
||||
"""Return supported PDF formats."""
|
||||
return ['pdf']
|
||||
|
||||
@classmethod
|
||||
def get_format_aliases(cls) -> List[str]:
|
||||
"""Return format aliases."""
|
||||
return ['document', 'print']
|
||||
|
||||
@classmethod
|
||||
def get_priority(cls) -> int:
|
||||
"""Return priority for PDF renderer."""
|
||||
return 120
|
||||
|
||||
def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
|
||||
"""Return only PDF-specific guidelines; global prompt is built centrally."""
|
||||
return (
|
||||
"PDF FORMAT GUIDELINES:\n"
|
||||
"- Provide structured content suitable for pagination and headings (H1/H2/H3-like).\n"
|
||||
"- Use bullet lists and tables where useful; separate major sections clearly.\n"
|
||||
"- Avoid markdown/HTML; produce clean, plain content that can be laid out as PDF.\n"
|
||||
"OUTPUT: Return ONLY the PDF-ready textual content (no fences)."
|
||||
)
|
||||
|
||||
async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
|
||||
"""Render extracted content to PDF format."""
|
||||
try:
|
||||
if not REPORTLAB_AVAILABLE:
|
||||
# Fallback to HTML if reportlab not available
|
||||
from .html_renderer import HtmlRenderer
|
||||
html_renderer = HtmlRenderer()
|
||||
html_content, _ = await html_renderer.render(extracted_content, title)
|
||||
return html_content, "text/html"
|
||||
|
||||
# Generate PDF using reportlab
|
||||
pdf_content = self._generate_pdf(extracted_content, title)
|
||||
|
||||
return pdf_content, "application/pdf"
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error rendering PDF: {str(e)}")
|
||||
# Return minimal fallback
|
||||
return f"PDF Generation Error: {str(e)}", "text/plain"
|
||||
|
||||
def _generate_pdf(self, content: str, title: str) -> str:
|
||||
"""Generate PDF content using reportlab."""
|
||||
try:
|
||||
# Create a buffer to hold the PDF
|
||||
buffer = io.BytesIO()
|
||||
|
||||
# Create PDF document
|
||||
doc = SimpleDocTemplate(
|
||||
buffer,
|
||||
pagesize=A4,
|
||||
rightMargin=72,
|
||||
leftMargin=72,
|
||||
topMargin=72,
|
||||
bottomMargin=18
|
||||
)
|
||||
|
||||
# Get styles
|
||||
styles = getSampleStyleSheet()
|
||||
|
||||
# Create custom styles
|
||||
title_style = ParagraphStyle(
|
||||
'CustomTitle',
|
||||
parent=styles['Heading1'],
|
||||
fontSize=24,
|
||||
spaceAfter=30,
|
||||
alignment=TA_CENTER,
|
||||
textColor=colors.darkblue
|
||||
)
|
||||
|
||||
heading_style = ParagraphStyle(
|
||||
'CustomHeading',
|
||||
parent=styles['Heading2'],
|
||||
fontSize=16,
|
||||
spaceAfter=12,
|
||||
spaceBefore=12,
|
||||
textColor=colors.darkblue
|
||||
)
|
||||
|
||||
# Build PDF content
|
||||
story = []
|
||||
|
||||
# Title page
|
||||
story.append(Paragraph(title, title_style))
|
||||
story.append(Spacer(1, 20))
|
||||
story.append(Paragraph(f"Generated: {self._format_timestamp()}", styles['Normal']))
|
||||
story.append(PageBreak())
|
||||
|
||||
# Process content
|
||||
lines = content.split('\n')
|
||||
current_section = []
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
# Check for headings
|
||||
if line.startswith('# '):
|
||||
# H1 heading
|
||||
if current_section:
|
||||
story.extend(self._process_section(current_section, styles))
|
||||
current_section = []
|
||||
story.append(Paragraph(line[2:], title_style))
|
||||
story.append(Spacer(1, 12))
|
||||
elif line.startswith('## '):
|
||||
# H2 heading
|
||||
if current_section:
|
||||
story.extend(self._process_section(current_section, styles))
|
||||
current_section = []
|
||||
story.append(Paragraph(line[3:], heading_style))
|
||||
story.append(Spacer(1, 8))
|
||||
elif line.startswith('### '):
|
||||
# H3 heading
|
||||
if current_section:
|
||||
story.extend(self._process_section(current_section, styles))
|
||||
current_section = []
|
||||
story.append(Paragraph(line[4:], styles['Heading3']))
|
||||
story.append(Spacer(1, 6))
|
||||
else:
|
||||
current_section.append(line)
|
||||
|
||||
# Process remaining content
|
||||
if current_section:
|
||||
story.extend(self._process_section(current_section, styles))
|
||||
|
||||
# Build PDF
|
||||
doc.build(story)
|
||||
|
||||
# Get PDF content as base64
|
||||
buffer.seek(0)
|
||||
pdf_bytes = buffer.getvalue()
|
||||
pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
|
||||
|
||||
return pdf_base64
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error generating PDF: {str(e)}")
|
||||
raise
|
||||
|
||||
def _process_section(self, lines: list, styles) -> list:
|
||||
"""Process a section of content into PDF elements."""
|
||||
elements = []
|
||||
|
||||
for line in lines:
|
||||
if not line.strip():
|
||||
continue
|
||||
|
||||
# Check for tables (lines with |)
|
||||
if '|' in line and not line.startswith('|'):
|
||||
# This might be part of a table, process as table
|
||||
table_data = self._extract_table_data(lines)
|
||||
if table_data:
|
||||
table = Table(table_data)
|
||||
table.setStyle(TableStyle([
|
||||
('BACKGROUND', (0, 0), (-1, 0), colors.grey),
|
||||
('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
|
||||
('ALIGN', (0, 0), (-1, -1), 'CENTER'),
|
||||
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, 0), 14),
|
||||
('BOTTOMPADDING', (0, 0), (-1, 0), 12),
|
||||
('BACKGROUND', (0, 1), (-1, -1), colors.beige),
|
||||
('GRID', (0, 0), (-1, -1), 1, colors.black)
|
||||
]))
|
||||
elements.append(table)
|
||||
elements.append(Spacer(1, 12))
|
||||
return elements
|
||||
|
||||
# Check for lists
|
||||
if line.startswith('- ') or line.startswith('* '):
|
||||
# This is a list item
|
||||
elements.append(Paragraph(f"• {line[2:]}", styles['Normal']))
|
||||
else:
|
||||
# Regular paragraph
|
||||
elements.append(Paragraph(line, styles['Normal']))
|
||||
|
||||
elements.append(Spacer(1, 6))
|
||||
return elements
|
||||
|
||||
def _extract_table_data(self, lines: list) -> list:
|
||||
"""Extract table data from lines."""
|
||||
table_data = []
|
||||
in_table = False
|
||||
|
||||
for line in lines:
|
||||
if '|' in line:
|
||||
if not in_table:
|
||||
in_table = True
|
||||
# Split by | and clean up
|
||||
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
|
||||
if cells:
|
||||
table_data.append(cells)
|
||||
elif in_table and not line.strip():
|
||||
# Empty line, might be end of table
|
||||
break
|
||||
|
||||
return table_data if len(table_data) > 1 else []
|
||||
|
|
@ -6,7 +6,7 @@ import logging
|
|||
import importlib
|
||||
import pkgutil
|
||||
from typing import Dict, Type, List, Optional
|
||||
from .base_renderer import BaseRenderer
|
||||
from .rendererBaseTemplate import BaseRenderer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -37,7 +37,7 @@ class RendererRegistry:
|
|||
|
||||
# Scan all Python files in the renderers directory
|
||||
for file_path in renderers_dir.glob("*.py"):
|
||||
if file_path.name in ['registry.py', 'base_renderer.py', '__init__.py']:
|
||||
if file_path.name in ['registry.py', 'rendererBaseTemplate.py', '__init__.py']:
|
||||
continue
|
||||
|
||||
# Extract module name from filename
|
||||
|
|
@ -92,7 +92,7 @@ class RendererRegistry:
|
|||
except Exception as e:
|
||||
logger.error(f"Error registering renderer {renderer_class.__name__}: {str(e)}")
|
||||
|
||||
def get_renderer(self, output_format: str) -> Optional[BaseRenderer]:
|
||||
def get_renderer(self, output_format: str, services=None) -> Optional[BaseRenderer]:
|
||||
"""Get a renderer instance for the specified format."""
|
||||
if not self._discovered:
|
||||
self.discover_renderers()
|
||||
|
|
@ -109,7 +109,7 @@ class RendererRegistry:
|
|||
|
||||
if renderer_class:
|
||||
try:
|
||||
return renderer_class()
|
||||
return renderer_class(services=services)
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating renderer instance for {format_name}: {str(e)}")
|
||||
return None
|
||||
|
|
@ -144,9 +144,9 @@ class RendererRegistry:
|
|||
# Global registry instance
|
||||
_registry = RendererRegistry()
|
||||
|
||||
def get_renderer(output_format: str) -> Optional[BaseRenderer]:
|
||||
def get_renderer(output_format: str, services=None) -> Optional[BaseRenderer]:
|
||||
"""Get a renderer instance for the specified format."""
|
||||
return _registry.get_renderer(output_format)
|
||||
return _registry.get_renderer(output_format, services)
|
||||
|
||||
def get_supported_formats() -> List[str]:
|
||||
"""Get list of all supported formats."""
|
||||
|
|
|
|||
|
|
@ -0,0 +1,459 @@
|
|||
"""
|
||||
Base renderer class for all format renderers.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any, Tuple, List
|
||||
import logging
|
||||
import json
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class BaseRenderer(ABC):
|
||||
"""Base class for all format renderers."""
|
||||
|
||||
def __init__(self, services=None):
|
||||
self.logger = logger
|
||||
self.services = services # Add services attribute
|
||||
|
||||
@classmethod
|
||||
def get_supported_formats(cls) -> List[str]:
|
||||
"""
|
||||
Return list of supported format names for this renderer.
|
||||
Override this method in subclasses to specify supported formats.
|
||||
"""
|
||||
return []
|
||||
|
||||
@classmethod
|
||||
def get_format_aliases(cls) -> List[str]:
|
||||
"""
|
||||
Return list of format aliases for this renderer.
|
||||
Override this method in subclasses to specify format aliases.
|
||||
"""
|
||||
return []
|
||||
|
||||
@classmethod
|
||||
def get_priority(cls) -> int:
|
||||
"""
|
||||
Return priority for this renderer (higher number = higher priority).
|
||||
Used when multiple renderers support the same format.
|
||||
"""
|
||||
return 0
|
||||
|
||||
@abstractmethod
|
||||
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
|
||||
"""
|
||||
Render extracted JSON content to the target format.
|
||||
|
||||
Args:
|
||||
extracted_content: Structured JSON content with sections and metadata
|
||||
title: Report title
|
||||
user_prompt: Original user prompt for context
|
||||
ai_service: AI service instance for additional processing
|
||||
|
||||
Returns:
|
||||
tuple: (rendered_content, mime_type)
|
||||
"""
|
||||
pass
|
||||
|
||||
def _extract_sections(self, report_data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||
"""Extract sections from report data."""
|
||||
return report_data.get('sections', [])
|
||||
|
||||
def _extract_metadata(self, report_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Extract metadata from report data."""
|
||||
return report_data.get('metadata', {})
|
||||
|
||||
def _get_title(self, report_data: Dict[str, Any], fallback_title: str) -> str:
|
||||
"""Get title from report data or use fallback."""
|
||||
metadata = report_data.get('metadata', {})
|
||||
return metadata.get('title', fallback_title)
|
||||
|
||||
def _validate_json_structure(self, json_content: Dict[str, Any]) -> bool:
|
||||
"""Validate that JSON content has the expected structure."""
|
||||
if not isinstance(json_content, dict):
|
||||
return False
|
||||
|
||||
if "sections" not in json_content:
|
||||
return False
|
||||
|
||||
sections = json_content.get("sections", [])
|
||||
if not isinstance(sections, list):
|
||||
return False
|
||||
|
||||
# Validate each section has content_type and elements
|
||||
for section in sections:
|
||||
if not isinstance(section, dict):
|
||||
return False
|
||||
if "content_type" not in section or "elements" not in section:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _get_section_type(self, section: Dict[str, Any]) -> str:
|
||||
"""Get the type of a section."""
|
||||
return section.get("content_type", "paragraph")
|
||||
|
||||
def _get_section_data(self, section: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||
"""Get the elements of a section."""
|
||||
return section.get("elements", [])
|
||||
|
||||
def _get_section_id(self, section: Dict[str, Any]) -> str:
|
||||
"""Get the ID of a section (if available)."""
|
||||
return section.get("id", "unknown")
|
||||
|
||||
def _extract_table_data(self, section_data: Dict[str, Any]) -> Tuple[List[str], List[List[str]]]:
|
||||
"""Extract table headers and rows from section data."""
|
||||
headers = section_data.get("headers", [])
|
||||
rows = section_data.get("rows", [])
|
||||
return headers, rows
|
||||
|
||||
def _extract_bullet_list_items(self, section_data: Dict[str, Any]) -> List[str]:
|
||||
"""Extract bullet list items from section data."""
|
||||
items = section_data.get("items", [])
|
||||
result = []
|
||||
for item in items:
|
||||
if isinstance(item, str):
|
||||
result.append(item)
|
||||
elif isinstance(item, dict) and "text" in item:
|
||||
result.append(item["text"])
|
||||
return result
|
||||
|
||||
def _extract_heading_data(self, section_data: Dict[str, Any]) -> Tuple[int, str]:
|
||||
"""Extract heading level and text from section data."""
|
||||
level = section_data.get("level", 1)
|
||||
text = section_data.get("text", "")
|
||||
return level, text
|
||||
|
||||
def _extract_paragraph_text(self, section_data: Dict[str, Any]) -> str:
|
||||
"""Extract paragraph text from section data."""
|
||||
return section_data.get("text", "")
|
||||
|
||||
def _extract_code_block_data(self, section_data: Dict[str, Any]) -> Tuple[str, str]:
|
||||
"""Extract code and language from section data."""
|
||||
code = section_data.get("code", "")
|
||||
language = section_data.get("language", "")
|
||||
return code, language
|
||||
|
||||
def _extract_image_data(self, section_data: Dict[str, Any]) -> Tuple[str, str]:
|
||||
"""Extract base64 data and alt text from section data."""
|
||||
base64_data = section_data.get("base64Data", "")
|
||||
alt_text = section_data.get("altText", "Image")
|
||||
return base64_data, alt_text
|
||||
|
||||
def _render_image_section(self, section: Dict[str, Any], styles: Dict[str, Any] = None) -> Any:
|
||||
"""
|
||||
Render an image section. This is a base implementation that should be overridden
|
||||
by format-specific renderers.
|
||||
|
||||
Args:
|
||||
section: Image section data
|
||||
styles: Optional styling information
|
||||
|
||||
Returns:
|
||||
Format-specific image representation
|
||||
"""
|
||||
section_data = self._get_section_data(section)
|
||||
base64_data, alt_text = self._extract_image_data(section_data)
|
||||
|
||||
# Base implementation returns a simple dict
|
||||
# Format-specific renderers should override this method
|
||||
return {
|
||||
"content_type": "image",
|
||||
"base64Data": base64_data,
|
||||
"altText": alt_text,
|
||||
"width": section_data.get("width", None),
|
||||
"height": section_data.get("height", None),
|
||||
"caption": section_data.get("caption", "")
|
||||
}
|
||||
|
||||
def _validate_image_data(self, base64_data: str, alt_text: str) -> bool:
|
||||
"""Validate image data."""
|
||||
if not base64_data:
|
||||
self.logger.warning("Image section has no base64 data")
|
||||
return False
|
||||
|
||||
if not alt_text:
|
||||
self.logger.warning("Image section has no alt text")
|
||||
return False
|
||||
|
||||
# Basic base64 validation
|
||||
try:
|
||||
import base64
|
||||
base64.b64decode(base64_data, validate=True)
|
||||
return True
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Invalid base64 image data: {str(e)}")
|
||||
return False
|
||||
|
||||
def _get_image_dimensions(self, base64_data: str) -> Tuple[int, int]:
|
||||
"""
|
||||
Get image dimensions from base64 data.
|
||||
This is a helper method that format-specific renderers can use.
|
||||
"""
|
||||
try:
|
||||
import base64
|
||||
from PIL import Image
|
||||
import io
|
||||
|
||||
# Decode base64 data
|
||||
image_data = base64.b64decode(base64_data)
|
||||
image = Image.open(io.BytesIO(image_data))
|
||||
|
||||
return image.size # Returns (width, height)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not determine image dimensions: {str(e)}")
|
||||
return (0, 0)
|
||||
|
||||
def _resize_image_if_needed(self, base64_data: str, max_width: int = 800, max_height: int = 600) -> str:
|
||||
"""
|
||||
Resize image if it exceeds maximum dimensions.
|
||||
Returns the resized image as base64 string.
|
||||
"""
|
||||
try:
|
||||
import base64
|
||||
from PIL import Image
|
||||
import io
|
||||
|
||||
# Decode base64 data
|
||||
image_data = base64.b64decode(base64_data)
|
||||
image = Image.open(io.BytesIO(image_data))
|
||||
|
||||
# Check if resizing is needed
|
||||
width, height = image.size
|
||||
if width <= max_width and height <= max_height:
|
||||
return base64_data # No resizing needed
|
||||
|
||||
# Calculate new dimensions maintaining aspect ratio
|
||||
ratio = min(max_width / width, max_height / height)
|
||||
new_width = int(width * ratio)
|
||||
new_height = int(height * ratio)
|
||||
|
||||
# Resize image
|
||||
resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
|
||||
|
||||
# Convert back to base64
|
||||
buffer = io.BytesIO()
|
||||
resized_image.save(buffer, format=image.format or 'PNG')
|
||||
resized_data = buffer.getvalue()
|
||||
|
||||
return base64.b64encode(resized_data).decode('utf-8')
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not resize image: {str(e)}")
|
||||
return base64_data # Return original if resize fails
|
||||
|
||||
def _get_supported_section_types(self) -> List[str]:
|
||||
"""Return list of supported section types."""
|
||||
return ["table", "bullet_list", "heading", "paragraph", "code_block", "image"]
|
||||
|
||||
def _is_valid_section_type(self, section_type: str) -> bool:
|
||||
"""Check if a section type is valid."""
|
||||
return section_type in self._get_supported_section_types()
|
||||
|
||||
def _process_section_by_type(self, section: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Process a section and return structured data based on its type."""
|
||||
section_type = self._get_section_type(section)
|
||||
section_data = self._get_section_data(section)
|
||||
|
||||
if section_type == "table":
|
||||
headers, rows = self._extract_table_data(section_data)
|
||||
return {"content_type": "table", "headers": headers, "rows": rows}
|
||||
elif section_type == "bullet_list":
|
||||
items = self._extract_bullet_list_items(section_data)
|
||||
return {"content_type": "bullet_list", "items": items}
|
||||
elif section_type == "heading":
|
||||
level, text = self._extract_heading_data(section_data)
|
||||
return {"content_type": "heading", "level": level, "text": text}
|
||||
elif section_type == "paragraph":
|
||||
text = self._extract_paragraph_text(section_data)
|
||||
return {"content_type": "paragraph", "text": text}
|
||||
elif section_type == "code_block":
|
||||
code, language = self._extract_code_block_data(section_data)
|
||||
return {"content_type": "code_block", "code": code, "language": language}
|
||||
elif section_type == "image":
|
||||
base64_data, alt_text = self._extract_image_data(section_data)
|
||||
# Validate image data
|
||||
if self._validate_image_data(base64_data, alt_text):
|
||||
return {
|
||||
"content_type": "image",
|
||||
"base64Data": base64_data,
|
||||
"altText": alt_text,
|
||||
"width": section_data.get("width"),
|
||||
"height": section_data.get("height"),
|
||||
"caption": section_data.get("caption", "")
|
||||
}
|
||||
else:
|
||||
# Return placeholder if image data is invalid
|
||||
return {"content_type": "paragraph", "text": f"[Image: {alt_text}]"}
|
||||
else:
|
||||
# Fallback to paragraph
|
||||
text = self._extract_paragraph_text(section_data)
|
||||
return {"content_type": "paragraph", "text": text}
|
||||
|
||||
def _format_timestamp(self, timestamp: str = None) -> str:
|
||||
"""Format timestamp for display."""
|
||||
if timestamp:
|
||||
return timestamp
|
||||
from datetime import datetime, UTC
|
||||
return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
|
||||
|
||||
# ===== GENERIC AI STYLING HELPERS =====
|
||||
|
||||
async def _get_ai_styles(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Generic AI styling method that can be used by all renderers.
|
||||
|
||||
Args:
|
||||
ai_service: AI service instance
|
||||
style_template: Format-specific style template
|
||||
default_styles: Default styles to fall back to
|
||||
|
||||
Returns:
|
||||
Dict with styling definitions
|
||||
"""
|
||||
# DEBUG: Show which renderer is calling this method
|
||||
|
||||
if not ai_service:
|
||||
return default_styles
|
||||
|
||||
try:
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||
|
||||
request_options = AiCallOptions()
|
||||
request_options.operationType = OperationType.GENERAL
|
||||
|
||||
request = AiCallRequest(prompt=style_template, context="", options=request_options)
|
||||
|
||||
# DEBUG: Show the actual prompt being sent to AI
|
||||
self.logger.debug(f"AI Style Template Prompt:")
|
||||
self.logger.debug(f"{style_template}")
|
||||
|
||||
response = await ai_service.aiObjects.call(request)
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
# Clean and parse JSON
|
||||
result = response.content.strip() if response and response.content else ""
|
||||
|
||||
# Check if result is empty
|
||||
if not result:
|
||||
self.logger.warning("AI styling returned empty response, using defaults")
|
||||
return default_styles
|
||||
|
||||
# Extract JSON from markdown if present
|
||||
json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
|
||||
if json_match:
|
||||
result = json_match.group(1).strip()
|
||||
elif result.startswith('```json'):
|
||||
result = re.sub(r'^```json\s*', '', result)
|
||||
result = re.sub(r'\s*```$', '', result)
|
||||
elif result.startswith('```'):
|
||||
result = re.sub(r'^```\s*', '', result)
|
||||
result = re.sub(r'\s*```$', '', result)
|
||||
|
||||
# Try to parse JSON
|
||||
try:
|
||||
styles = json.loads(result)
|
||||
except json.JSONDecodeError as json_error:
|
||||
self.logger.warning(f"AI styling returned invalid JSON: {json_error}")
|
||||
|
||||
# Use print instead of logger to avoid truncation
|
||||
self.services.utils.debugLogToFile(f"FULL AI RESPONSE THAT FAILED TO PARSE: {result}", "RENDERER")
|
||||
self.services.utils.debugLogToFile(f"RESPONSE LENGTH: {len(result)} characters", "RENDERER")
|
||||
|
||||
self.logger.warning(f"Raw content that failed to parse: {result}")
|
||||
|
||||
# Try to fix incomplete JSON by adding missing closing braces
|
||||
open_braces = result.count('{')
|
||||
close_braces = result.count('}')
|
||||
|
||||
if open_braces > close_braces:
|
||||
# JSON is incomplete, add missing closing braces
|
||||
missing_braces = open_braces - close_braces
|
||||
result = result + '}' * missing_braces
|
||||
self.logger.info(f"Added {missing_braces} missing closing brace(s)")
|
||||
self.logger.debug(f"Fixed JSON: {result}")
|
||||
|
||||
# Try parsing the fixed JSON
|
||||
try:
|
||||
styles = json.loads(result)
|
||||
self.logger.info("Successfully fixed incomplete JSON")
|
||||
except json.JSONDecodeError as fix_error:
|
||||
self.logger.warning(f"Fixed JSON still invalid: {fix_error}")
|
||||
self.logger.warning(f"Fixed JSON content: {result}")
|
||||
# Try to extract just the JSON part if it's embedded in text
|
||||
json_start = result.find('{')
|
||||
json_end = result.rfind('}')
|
||||
if json_start != -1 and json_end != -1 and json_end > json_start:
|
||||
json_part = result[json_start:json_end+1]
|
||||
try:
|
||||
styles = json.loads(json_part)
|
||||
self.logger.info("Successfully extracted JSON from explanatory text")
|
||||
except json.JSONDecodeError:
|
||||
self.logger.warning("Could not extract valid JSON from response, using defaults")
|
||||
return default_styles
|
||||
else:
|
||||
return default_styles
|
||||
else:
|
||||
# Try to extract just the JSON part if it's embedded in text
|
||||
json_start = result.find('{')
|
||||
json_end = result.rfind('}')
|
||||
if json_start != -1 and json_end != -1 and json_end > json_start:
|
||||
json_part = result[json_start:json_end+1]
|
||||
try:
|
||||
styles = json.loads(json_part)
|
||||
self.logger.info("Successfully extracted JSON from explanatory text")
|
||||
except json.JSONDecodeError:
|
||||
self.logger.warning("Could not extract valid JSON from response, using defaults")
|
||||
return default_styles
|
||||
else:
|
||||
return default_styles
|
||||
|
||||
# Convert colors to appropriate format
|
||||
styles = self._convert_colors_format(styles)
|
||||
|
||||
return styles
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"AI styling failed: {str(e)}, using defaults")
|
||||
return default_styles
|
||||
|
||||
def _convert_colors_format(self, styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Convert colors to appropriate format based on renderer type.
|
||||
Override this method in subclasses for format-specific color handling.
|
||||
"""
|
||||
return styles
|
||||
|
||||
def _create_ai_style_template(self, format_name: str, user_prompt: str, style_schema: Dict[str, Any]) -> str:
|
||||
"""
|
||||
Create a standardized AI style template for any format.
|
||||
|
||||
Args:
|
||||
format_name: Name of the format (e.g., "docx", "xlsx", "pptx")
|
||||
user_prompt: User's original prompt
|
||||
style_schema: Format-specific style schema
|
||||
|
||||
Returns:
|
||||
Formatted prompt string
|
||||
"""
|
||||
schema_json = json.dumps(style_schema, indent=4)
|
||||
|
||||
# DEBUG: Show the schema being sent
|
||||
|
||||
return f"""You are a professional document styling expert. Generate a complete JSON styling configuration for {format_name.upper()} documents.
|
||||
|
||||
Use this schema as a template and customize the values for professional document styling:
|
||||
|
||||
{schema_json}
|
||||
|
||||
Requirements:
|
||||
- Return ONLY the complete JSON object (no markdown, no explanations)
|
||||
- Customize colors, fonts, and spacing for professional appearance
|
||||
- Ensure all objects are properly closed with closing braces
|
||||
- Make the styling modern and professional
|
||||
|
||||
Return the complete JSON:"""
|
||||
260
modules/services/serviceGeneration/renderers/rendererCsv.py
Normal file
260
modules/services/serviceGeneration/renderers/rendererCsv.py
Normal file
|
|
@ -0,0 +1,260 @@
|
|||
"""
|
||||
CSV renderer for report generation.
|
||||
"""
|
||||
|
||||
from .rendererBaseTemplate import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
import csv
|
||||
import io
|
||||
|
||||
class RendererCsv(BaseRenderer):
    """Renders content to CSV format with format-specific extraction.

    Consumes the structured JSON document produced by the extraction
    pipeline ({"metadata": ..., "sections": [...]}) and emits a flat CSV
    string. Tables map directly onto rows; headings, lists, paragraphs and
    code blocks are serialized as single-column rows.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported CSV formats."""
        return ['csv']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['spreadsheet', 'table']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for CSV renderer."""
        return 70

    async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
        """Render extracted JSON content to CSV format.

        Returns:
            A (content, mime_type) tuple. On failure a minimal two-cell
            CSV describing the error is returned instead of raising.
        """
        try:
            # CSV carries no styling, so user_prompt/ai_service are unused here.
            csv_content = await self._generate_csv_from_json(extracted_content, title)

            return csv_content, "text/csv"

        except Exception as e:
            self.logger.error(f"Error rendering CSV: {str(e)}")
            # Return minimal CSV fallback
            return f"Title,Content\n{title},Error rendering report: {str(e)}", "text/csv"

    async def _generate_csv_from_json(self, json_content: Dict[str, Any], title: str) -> str:
        """Generate CSV content from a structured JSON document.

        Raises:
            Exception: wraps any underlying failure with a
                "CSV generation failed" message for render() to report.
        """
        try:
            # Validate JSON structure
            if not isinstance(json_content, dict):
                raise ValueError("JSON content must be a dictionary")

            if "sections" not in json_content:
                raise ValueError("JSON content must contain 'sections' field")

            # Use title from JSON metadata if available, otherwise use provided title
            document_title = json_content.get("metadata", {}).get("title", title)

            csv_rows = []

            # Add title row
            if document_title:
                csv_rows.append([document_title])
                csv_rows.append([])  # Empty row

            # Process each section in order
            sections = json_content.get("sections", [])
            for section in sections:
                section_csv = self._render_json_section_to_csv(section)
                if section_csv:
                    csv_rows.extend(section_csv)
                    csv_rows.append([])  # Empty row between sections

            # Convert to CSV string
            return self._convert_rows_to_csv(csv_rows)

        except Exception as e:
            self.logger.error(f"Error generating CSV from JSON: {str(e)}")
            raise Exception(f"CSV generation failed: {str(e)}")

    def _render_json_section_to_csv(self, section: Dict[str, Any]) -> List[List[str]]:
        """Render a single JSON section to CSV rows."""
        try:
            section_type = section.get("content_type", "paragraph")
            elements = section.get("elements", [])

            csv_rows = []

            # Add section title if available
            section_title = section.get("title")
            if section_title:
                csv_rows.append([f"# {section_title}"])

            # Process each element in the section.
            # NOTE: the shared extraction pipeline emits "bullet_list" and
            # "code_block" content types, while older CSV payloads used
            # "list"/"code" -- both spellings are accepted below.
            for element in elements:
                if section_type == "table":
                    csv_rows.extend(self._render_json_table_to_csv(element))
                elif section_type in ("list", "bullet_list"):
                    csv_rows.extend(self._render_json_list_to_csv(element))
                elif section_type == "heading":
                    csv_rows.extend(self._render_json_heading_to_csv(element))
                elif section_type == "paragraph":
                    csv_rows.extend(self._render_json_paragraph_to_csv(element))
                elif section_type in ("code", "code_block"):
                    csv_rows.extend(self._render_json_code_to_csv(element))
                else:
                    # Fallback to paragraph for unknown types
                    csv_rows.extend(self._render_json_paragraph_to_csv(element))

            return csv_rows

        except Exception as e:
            self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}")
            return [["[Error rendering section]"]]

    def _render_json_table_to_csv(self, table_data: Dict[str, Any]) -> List[List[str]]:
        """Render a JSON table ({"headers": [...], "rows": [...]}) to CSV rows."""
        try:
            headers = table_data.get("headers", [])
            rows = table_data.get("rows", [])

            csv_rows = []

            if headers:
                csv_rows.append(headers)

            if rows:
                csv_rows.extend(rows)

            return csv_rows

        except Exception as e:
            self.logger.warning(f"Error rendering table: {str(e)}")
            return [["[Error rendering table]"]]

    def _render_json_list_to_csv(self, list_data: Dict[str, Any]) -> List[List[str]]:
        """Render a JSON list to CSV rows, marking subitems with ' - '."""
        try:
            items = list_data.get("items", [])
            csv_rows = []

            for item in items:
                if isinstance(item, dict):
                    csv_rows.append([item.get("text", "")])

                    # Add subitems as indented rows
                    for subitem in item.get("subitems", []):
                        if isinstance(subitem, dict):
                            csv_rows.append([f" - {subitem.get('text', '')}"])
                        else:
                            csv_rows.append([f" - {subitem}"])
                else:
                    csv_rows.append([str(item)])

            return csv_rows

        except Exception as e:
            self.logger.warning(f"Error rendering list: {str(e)}")
            return [["[Error rendering list]"]]

    def _render_json_heading_to_csv(self, heading_data: Dict[str, Any]) -> List[List[str]]:
        """Render a JSON heading to CSV rows using markdown-style '#' markers."""
        try:
            text = heading_data.get("text", "")
            level = heading_data.get("level", 1)

            if text:
                # Use # symbols for heading levels
                return [[f"{'#' * level} {text}"]]

            return []

        except Exception as e:
            self.logger.warning(f"Error rendering heading: {str(e)}")
            return [["[Error rendering heading]"]]

    def _render_json_paragraph_to_csv(self, paragraph_data: Dict[str, Any]) -> List[List[str]]:
        """Render a JSON paragraph, word-wrapping text longer than 100 chars."""
        try:
            text = paragraph_data.get("text", "")

            if not text:
                return []

            if len(text) <= 100:
                return [[text]]

            # Split long paragraphs into multiple rows
            rows = []
            current_row = []
            current_length = 0

            for word in text.split():
                if current_length + len(word) > 100 and current_row:
                    rows.append([" ".join(current_row)])
                    current_row = [word]
                    current_length = len(word)
                else:
                    current_row.append(word)
                    current_length += len(word) + 1

            if current_row:
                rows.append([" ".join(current_row)])

            return rows

        except Exception as e:
            self.logger.warning(f"Error rendering paragraph: {str(e)}")
            return [["[Error rendering paragraph]"]]

    def _render_json_code_to_csv(self, code_data: Dict[str, Any]) -> List[List[str]]:
        """Render a JSON code block to CSV rows, one row per code line."""
        try:
            code = code_data.get("code", "")
            language = code_data.get("language", "")

            csv_rows = []

            if language:
                csv_rows.append([f"Code ({language}):"])

            if code:
                # Split code into lines
                for line in code.split('\n'):
                    csv_rows.append([f" {line}"])

            return csv_rows

        except Exception as e:
            self.logger.warning(f"Error rendering code block: {str(e)}")
            return [["[Error rendering code block]"]]

    def _convert_rows_to_csv(self, rows: List[List[str]]) -> str:
        """Convert rows to a CSV string via the csv module's quoting rules.

        Empty rows (used upstream as section separators) are skipped, so
        the output contains no blank lines.
        """
        # Uses the module-level csv/io imports; no local re-import needed.
        output = io.StringIO()
        writer = csv.writer(output)

        for row in rows:
            if row:  # Only write non-empty rows
                writer.writerow(row)

        return output.getvalue()

    def _clean_csv_content(self, content: str, title: str) -> str:
        """Clean and validate CSV content from AI (strips markdown fences)."""
        content = content.strip()

        # Remove markdown code blocks if present
        if content.startswith("```") and content.endswith("```"):
            lines = content.split('\n')
            if len(lines) > 2:
                content = '\n'.join(lines[1:-1]).strip()

        return content
||||
958
modules/services/serviceGeneration/renderers/rendererDocx.py
Normal file
958
modules/services/serviceGeneration/renderers/rendererDocx.py
Normal file
|
|
@ -0,0 +1,958 @@
|
|||
"""
|
||||
DOCX renderer for report generation using python-docx.
|
||||
"""
|
||||
|
||||
from .rendererBaseTemplate import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
import io
|
||||
import base64
|
||||
import re
|
||||
import os
|
||||
from datetime import datetime, UTC
|
||||
|
||||
# python-docx is an optional dependency: when it is missing the renderer
# falls back to HTML output (see RendererDocx.render), so the import is
# guarded instead of failing at module load time.
try:
    from docx import Document
    from docx.shared import Inches, Pt, RGBColor
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    from docx.enum.table import WD_TABLE_ALIGNMENT
    from docx.oxml.shared import OxmlElement, qn
    from docx.oxml.ns import nsdecls
    from docx.oxml import parse_xml
    DOCX_AVAILABLE = True  # flag consulted before any python-docx usage
except ImportError:
    DOCX_AVAILABLE = False
||||
|
||||
class RendererDocx(BaseRenderer):
|
||||
"""Renders content to DOCX format using python-docx."""
|
||||
|
||||
@classmethod
def get_supported_formats(cls) -> List[str]:
    """File extensions this renderer can produce."""
    formats = ['docx', 'doc']
    return formats
||||
@classmethod
def get_format_aliases(cls) -> List[str]:
    """Alternate names that resolve to the DOCX renderer."""
    aliases = ['word', 'document']
    return aliases
||||
@classmethod
def get_priority(cls) -> int:
    """Selection priority of the DOCX renderer."""
    priority = 115
    return priority
|
||||
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
    """
    Render extracted JSON content to DOCX format using AI-analyzed styling.

    Returns a (content, mime_type) pair. When python-docx is unavailable
    output degrades to HTML; on error a plain-text message is returned.
    """
    self.services.utils.debugLogToFile(f"DOCX RENDER CALLED: title={title}, user_prompt={user_prompt[:50] if user_prompt else 'None'}...", "DOCX_RENDERER")
    try:
        if DOCX_AVAILABLE:
            # Normal path: build the document and report the DOCX mime type.
            payload = await self._generate_docx_from_json(extracted_content, title, user_prompt, ai_service)
            return payload, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

        # python-docx missing: degrade gracefully via the HTML renderer.
        from .rendererHtml import RendererHtml
        fallback = RendererHtml()
        html_content, _ = await fallback.render(extracted_content, title)
        return html_content, "text/html"

    except Exception as e:
        self.logger.error(f"Error rendering DOCX: {str(e)}")
        # Return minimal fallback
        return f"DOCX Generation Error: {str(e)}", "text/plain"
|
||||
async def _generate_docx_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
    """Generate DOCX content from structured JSON document using AI-generated styling.

    Returns:
        The finished .docx file as a base64-encoded string, so callers
        can persist or transmit it as text.

    Raises:
        Exception: wraps any underlying failure with a "DOCX generation
            failed" message for render() to report.
    """
    try:
        # Create new document
        doc = Document()

        # Get AI-generated styling definitions
        self.logger.info(f"About to call AI styling with user_prompt: {user_prompt[:100] if user_prompt else 'None'}...")
        styles = await self._get_docx_styles(user_prompt, ai_service)

        # Apply basic document setup
        self._setup_basic_document_styles(doc)

        # Validate JSON structure
        if not isinstance(json_content, dict):
            raise ValueError("JSON content must be a dictionary")

        if "sections" not in json_content:
            raise ValueError("JSON content must contain 'sections' field")

        # Use title from JSON metadata if available, otherwise use provided title
        document_title = json_content.get("metadata", {}).get("title", title)

        # Add document title using analyzed styles
        if document_title:
            title_heading = doc.add_heading(document_title, level=1)
            title_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER

        # Process each section in order
        sections = json_content.get("sections", [])
        for section in sections:
            self._render_json_section(doc, section, styles)

        # Save to buffer
        buffer = io.BytesIO()
        doc.save(buffer)
        buffer.seek(0)

        # Convert to base64
        docx_bytes = buffer.getvalue()
        docx_base64 = base64.b64encode(docx_bytes).decode('utf-8')

        return docx_base64

    except Exception as e:
        self.logger.error(f"Error generating DOCX from JSON: {str(e)}")
        raise Exception(f"DOCX generation failed: {str(e)}")
|
||||
async def _get_docx_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]:
|
||||
"""Get DOCX styling definitions using base template AI styling."""
|
||||
style_schema = {
|
||||
"title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center"},
|
||||
"heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"},
|
||||
"heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"},
|
||||
"paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"},
|
||||
"table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center"},
|
||||
"table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left"},
|
||||
"table_border": {"style": "horizontal_only", "color": "#000000", "thickness": "thin"},
|
||||
"bullet_list": {"font_size": 11, "color": "#2F2F2F", "indent": 20},
|
||||
"code_block": {"font": "Courier New", "font_size": 10, "color": "#2F2F2F", "background": "#F5F5F5"}
|
||||
}
|
||||
|
||||
style_template = self._create_ai_style_template("docx", user_prompt, style_schema)
|
||||
styles = await self._get_ai_styles(ai_service, style_template, self._get_default_styles())
|
||||
|
||||
# Validate and fix contrast issues
|
||||
return self._validate_styles_contrast(styles)
|
||||
|
||||
def _validate_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Validate and fix contrast issues in AI-generated styles."""
|
||||
try:
|
||||
# Fix table header contrast
|
||||
if "table_header" in styles:
|
||||
header = styles["table_header"]
|
||||
bg_color = header.get("background", "#FFFFFF")
|
||||
text_color = header.get("text_color", "#000000")
|
||||
|
||||
# If both are white or both are dark, fix it
|
||||
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
|
||||
header["background"] = "#4F4F4F"
|
||||
header["text_color"] = "#FFFFFF"
|
||||
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
|
||||
header["background"] = "#4F4F4F"
|
||||
header["text_color"] = "#FFFFFF"
|
||||
|
||||
# Fix table cell contrast
|
||||
if "table_cell" in styles:
|
||||
cell = styles["table_cell"]
|
||||
bg_color = cell.get("background", "#FFFFFF")
|
||||
text_color = cell.get("text_color", "#000000")
|
||||
|
||||
# If both are white or both are dark, fix it
|
||||
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
|
||||
cell["background"] = "#FFFFFF"
|
||||
cell["text_color"] = "#2F2F2F"
|
||||
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
|
||||
cell["background"] = "#FFFFFF"
|
||||
cell["text_color"] = "#2F2F2F"
|
||||
|
||||
return styles
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Style validation failed: {str(e)}")
|
||||
return self._get_default_styles()
|
||||
|
||||
def _get_default_styles(self) -> Dict[str, Any]:
|
||||
"""Default DOCX styles."""
|
||||
return {
|
||||
"title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center"},
|
||||
"heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"},
|
||||
"heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"},
|
||||
"paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"},
|
||||
"table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center"},
|
||||
"table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left"},
|
||||
"table_border": {"style": "horizontal_only", "color": "#000000", "thickness": "thin"},
|
||||
"bullet_list": {"font_size": 11, "color": "#2F2F2F", "indent": 20},
|
||||
"code_block": {"font": "Courier New", "font_size": 10, "color": "#2F2F2F", "background": "#F5F5F5"}
|
||||
}
|
||||
|
||||
def _setup_basic_document_styles(self, doc: Document) -> None:
|
||||
"""Set up basic document styles."""
|
||||
try:
|
||||
# Set default font
|
||||
style = doc.styles['Normal']
|
||||
font = style.font
|
||||
font.name = 'Calibri'
|
||||
font.size = Pt(11)
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not set up basic document styles: {str(e)}")
|
||||
|
||||
|
||||
|
||||
|
||||
def _clear_template_content(self, doc: Document) -> None:
|
||||
"""Clear template content while preserving styles."""
|
||||
try:
|
||||
# Remove all paragraphs except keep the styles
|
||||
for paragraph in list(doc.paragraphs):
|
||||
# Keep the paragraph but clear its content
|
||||
paragraph.clear()
|
||||
|
||||
# Remove all tables
|
||||
for table in list(doc.tables):
|
||||
table._element.getparent().remove(table._element)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not clear template content: {str(e)}")
|
||||
|
||||
def _render_json_section(self, doc: Document, section: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||
"""Render a single JSON section to DOCX using AI-generated styles."""
|
||||
try:
|
||||
section_type = section.get("content_type", "paragraph")
|
||||
elements = section.get("elements", [])
|
||||
|
||||
# Process each element in the section
|
||||
for element in elements:
|
||||
if section_type == "table":
|
||||
self._render_json_table(doc, element, styles)
|
||||
elif section_type == "bullet_list":
|
||||
self._render_json_bullet_list(doc, element, styles)
|
||||
elif section_type == "heading":
|
||||
self._render_json_heading(doc, element, styles)
|
||||
elif section_type == "paragraph":
|
||||
self._render_json_paragraph(doc, element, styles)
|
||||
elif section_type == "code_block":
|
||||
self._render_json_code_block(doc, element, styles)
|
||||
elif section_type == "image":
|
||||
self._render_json_image(doc, element, styles)
|
||||
else:
|
||||
# Fallback to paragraph for unknown types
|
||||
self._render_json_paragraph(doc, element, styles)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}")
|
||||
# Add error paragraph as fallback
|
||||
error_para = doc.add_paragraph(f"[Error rendering section: {str(e)}]")
|
||||
|
||||
def _render_json_table(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a JSON table to DOCX using AI-generated styles.

    Expects table_data like {"headers": [...], "rows": [[...], ...]}.
    Styling comes from styles["table_border"], styles["table_header"] and
    styles["table_cell"]; colors are "#RRGGBB" strings. Errors are logged
    and the table is skipped.

    NOTE(review): the cell_style background is never applied to data
    cells (only its text_color is used) -- confirm whether that is
    intentional.
    """
    try:
        headers = table_data.get("headers", [])
        rows = table_data.get("rows", [])

        # Nothing to draw without both headers and data.
        if not headers or not rows:
            return

        # Create table: one extra row reserved for the header line.
        table = doc.add_table(rows=len(rows) + 1, cols=len(headers))
        table.alignment = WD_TABLE_ALIGNMENT.CENTER

        # Apply table borders based on AI style
        border_style = styles["table_border"]["style"]
        if border_style == "horizontal_only":
            self._apply_horizontal_borders_only(table)
        elif border_style == "grid":
            table.style = 'Table Grid'
        # else: no borders

        # Add headers with AI-generated styling
        header_row = table.rows[0]
        header_style = styles["table_header"]
        for i, header in enumerate(headers):
            if i < len(header_row.cells):
                cell = header_row.cells[i]
                cell.text = str(header)

                # Apply background color ("#RRGGBB" -> RGBColor components)
                bg_color = header_style["background"].lstrip('#')
                self._set_cell_background(cell, RGBColor(int(bg_color[0:2], 16), int(bg_color[2:4], 16), int(bg_color[4:6], 16)))

                # Apply text styling
                for paragraph in cell.paragraphs:
                    paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER if header_style["align"] == "center" else WD_ALIGN_PARAGRAPH.LEFT
                    for run in paragraph.runs:
                        run.bold = header_style["bold"]
                        run.font.size = Pt(11)
                        text_color = header_style["text_color"].lstrip('#')
                        run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16))

        # Add data rows with AI-generated styling
        cell_style = styles["table_cell"]
        for row_idx, row_data in enumerate(rows):
            if row_idx + 1 < len(table.rows):
                table_row = table.rows[row_idx + 1]
                for col_idx, cell_data in enumerate(row_data):
                    if col_idx < len(table_row.cells):
                        cell = table_row.cells[col_idx]
                        cell.text = str(cell_data)

                        # Apply text styling
                        for paragraph in cell.paragraphs:
                            paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
                            for run in paragraph.runs:
                                run.font.size = Pt(10)
                                text_color = cell_style["text_color"].lstrip('#')
                                run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16))

    except Exception as e:
        self.logger.warning(f"Error rendering table: {str(e)}")
|
||||
def _apply_horizontal_borders_only(self, table) -> None:
    """Apply only horizontal borders to the table (no vertical borders).

    Works at the raw WordprocessingML level: rebuilds the table's
    w:tblBorders element with single top/bottom/insideH lines and
    explicit 'none' left/right/insideV edges. Failures are logged and
    leave the table with its previous borders.
    """
    try:
        from docx.oxml.shared import OxmlElement, qn

        # Get table properties (w:tblPr), creating the element if absent.
        tbl_pr = table._element.find(qn('w:tblPr'))
        if tbl_pr is None:
            tbl_pr = OxmlElement('w:tblPr')
            table._element.insert(0, tbl_pr)

        # Remove existing borders so the new set fully replaces them.
        existing_borders = tbl_pr.find(qn('w:tblBorders'))
        if existing_borders is not None:
            tbl_pr.remove(existing_borders)

        # Create new borders element
        tbl_borders = OxmlElement('w:tblBorders')

        # Top border: single thin black line
        top_border = OxmlElement('w:top')
        top_border.set(qn('w:val'), 'single')
        top_border.set(qn('w:sz'), '4')
        top_border.set(qn('w:space'), '0')
        top_border.set(qn('w:color'), '000000')
        tbl_borders.append(top_border)

        # Bottom border
        bottom_border = OxmlElement('w:bottom')
        bottom_border.set(qn('w:val'), 'single')
        bottom_border.set(qn('w:sz'), '4')
        bottom_border.set(qn('w:space'), '0')
        bottom_border.set(qn('w:color'), '000000')
        tbl_borders.append(bottom_border)

        # Left border - none
        left_border = OxmlElement('w:left')
        left_border.set(qn('w:val'), 'none')
        tbl_borders.append(left_border)

        # Right border - none
        right_border = OxmlElement('w:right')
        right_border.set(qn('w:val'), 'none')
        tbl_borders.append(right_border)

        # Inside horizontal border (between data rows)
        inside_h_border = OxmlElement('w:insideH')
        inside_h_border.set(qn('w:val'), 'single')
        inside_h_border.set(qn('w:sz'), '4')
        inside_h_border.set(qn('w:space'), '0')
        inside_h_border.set(qn('w:color'), '000000')
        tbl_borders.append(inside_h_border)

        # Inside vertical border - none
        inside_v_border = OxmlElement('w:insideV')
        inside_v_border.set(qn('w:val'), 'none')
        tbl_borders.append(inside_v_border)

        tbl_pr.append(tbl_borders)

    except Exception as e:
        self.logger.warning(f"Could not apply horizontal borders: {str(e)}")
||||
|
||||
def _set_cell_background(self, cell, color: RGBColor) -> None:
    """Apply a solid background fill to a table cell.

    Replaces any existing ``<w:shd>`` element on the cell's properties with
    a new one whose fill is the hex form of *color*. Failures are logged,
    never raised, so document rendering continues with the default fill.
    """
    try:
        from docx.oxml.shared import OxmlElement, qn

        # Ensure the cell has a <w:tcPr> properties element to attach to.
        cell_props = cell._element.find(qn('w:tcPr'))
        if cell_props is None:
            cell_props = OxmlElement('w:tcPr')
            cell._element.insert(0, cell_props)

        # Drop any previous shading so duplicates do not stack up.
        old_shading = cell_props.find(qn('w:shd'))
        if old_shading is not None:
            cell_props.remove(old_shading)

        # Build the replacement shading element; RGBColor unpacks into
        # its three channel values.
        r, g, b = color
        shading = OxmlElement('w:shd')
        shading.set(qn('w:val'), 'clear')
        shading.set(qn('w:color'), 'auto')
        shading.set(qn('w:fill'), f"{r:02x}{g:02x}{b:02x}")
        cell_props.append(shading)

    except Exception as e:
        self.logger.warning(f"Could not set cell background: {str(e)}")
|
||||
|
||||
|
||||
def _render_json_bullet_list(self, doc: Document, list_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a JSON bullet list to DOCX.

    Accepts items that are plain strings or dicts with a "text" key;
    anything else is silently skipped. `styles` is kept for interface
    symmetry with the other _render_json_* methods.

    Fixes: the previous version bound each paragraph to an unused local
    and performed an unused ``styles["bullet_list"]`` lookup whose
    KeyError (when the key was absent) aborted the whole list via the
    except clause.
    """
    try:
        for item in list_data.get("items", []):
            if isinstance(item, str):
                doc.add_paragraph(item, style='List Bullet')
            elif isinstance(item, dict) and "text" in item:
                doc.add_paragraph(item["text"], style='List Bullet')
    except Exception as e:
        self.logger.warning(f"Error rendering bullet list: {str(e)}")
|
||||
|
||||
def _render_json_heading(self, doc: Document, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a JSON heading node to DOCX.

    The requested level is clamped to 1..6; empty text produces nothing.
    Errors are logged and swallowed so one bad node cannot stop rendering.
    """
    try:
        text = heading_data.get("text", "")
        if not text:
            return
        requested = heading_data.get("level", 1)
        # Clamp into Word's supported heading range.
        clamped = min(6, max(1, requested))
        doc.add_heading(text, level=clamped)
    except Exception as e:
        self.logger.warning(f"Error rendering heading: {str(e)}")
|
||||
|
||||
def _render_json_paragraph(self, doc: Document, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a JSON paragraph node to DOCX.

    `styles` is accepted for interface symmetry with the other
    _render_json_* methods but is not used here.

    Fixes: the created paragraph was previously bound to an unused local.
    """
    try:
        text = paragraph_data.get("text", "")
        if text:
            doc.add_paragraph(text)
    except Exception as e:
        self.logger.warning(f"Error rendering paragraph: {str(e)}")
|
||||
|
||||
def _render_json_code_block(self, doc: Document, code_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a JSON code block to DOCX as a monospace paragraph.

    An optional language tag produces a bold "Code (<lang>):" label above
    the snippet. Empty code produces nothing.
    """
    try:
        snippet = code_data.get("code", "")
        lang = code_data.get("language", "")
        if not snippet:
            return

        if lang:
            label = doc.add_paragraph(f"Code ({lang}):")
            label.runs[0].bold = True

        block = doc.add_paragraph(snippet)
        # Monospace styling for every run of the code paragraph.
        for run in block.runs:
            run.font.name = 'Courier New'
            run.font.size = Pt(10)
    except Exception as e:
        self.logger.warning(f"Error rendering code block: {str(e)}")
|
||||
|
||||
def _render_json_image(self, doc: Document, image_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a base64-encoded JSON image node to DOCX with a caption.

    On any failure (bad base64, unsupported format, ...) a plain
    "[Image: <alt>]" placeholder paragraph is emitted instead.
    """
    try:
        encoded = image_data.get("base64Data", "")
        alt_text = image_data.get("altText", "Image")
        if encoded:
            raw = base64.b64decode(encoded)
            # Fixed 4-inch width; height scales automatically.
            doc.add_picture(io.BytesIO(raw), width=Inches(4))
            if alt_text:
                caption = doc.add_paragraph(f"Figure: {alt_text}")
                caption.runs[0].italic = True
    except Exception as e:
        self.logger.warning(f"Error rendering image: {str(e)}")
        doc.add_paragraph(f"[Image: {image_data.get('altText', 'Image')}]")
|
||||
|
||||
def _extract_structure_from_prompt(self, user_prompt: str, title: str) -> Dict[str, Any]:
|
||||
"""Extract document structure from user prompt."""
|
||||
structure = {
|
||||
'title': title,
|
||||
'sections': [],
|
||||
'format': 'standard'
|
||||
}
|
||||
|
||||
if not user_prompt:
|
||||
return structure
|
||||
|
||||
# Extract title from prompt if not provided
|
||||
if not title or title == "Generated Document":
|
||||
# Look for "create a ... document" or "generate a ... report"
|
||||
import re
|
||||
title_match = re.search(r'(?:create|generate|make)\s+a\s+([^,]+?)(?:\s+document|\s+report|\s+summary)', user_prompt.lower())
|
||||
if title_match:
|
||||
structure['title'] = title_match.group(1).strip().title()
|
||||
|
||||
# Extract sections from numbered lists in prompt
|
||||
import re
|
||||
section_pattern = r'(\d+)\)?\s*([^,]+?)(?:\s*[,:]|\s*$)'
|
||||
sections = re.findall(section_pattern, user_prompt)
|
||||
|
||||
for num, section_text in sections:
|
||||
structure['sections'].append({
|
||||
'number': int(num),
|
||||
'title': section_text.strip(),
|
||||
'level': 2 # H2 level
|
||||
})
|
||||
|
||||
# If no numbered sections found, try to extract from "including:" patterns
|
||||
if not structure['sections']:
|
||||
including_match = re.search(r'including:\s*(.+?)(?:\.|$)', user_prompt, re.DOTALL)
|
||||
if including_match:
|
||||
including_text = including_match.group(1)
|
||||
# Split by common separators
|
||||
parts = re.split(r'[,;]\s*', including_text)
|
||||
for i, part in enumerate(parts, 1):
|
||||
part = part.strip()
|
||||
if part:
|
||||
structure['sections'].append({
|
||||
'number': i,
|
||||
'title': part,
|
||||
'level': 2
|
||||
})
|
||||
|
||||
# If still no sections, extract from any list-like patterns
|
||||
if not structure['sections']:
|
||||
# Look for bullet points or dashes
|
||||
bullet_pattern = r'[-•]\s*([^,\n]+?)(?:\s*[,:]|\s*$)'
|
||||
bullets = re.findall(bullet_pattern, user_prompt)
|
||||
for i, bullet in enumerate(bullets, 1):
|
||||
bullet = bullet.strip()
|
||||
if bullet and len(bullet) > 3:
|
||||
structure['sections'].append({
|
||||
'number': i,
|
||||
'title': bullet,
|
||||
'level': 2
|
||||
})
|
||||
|
||||
# If still no sections, extract from sentence structure
|
||||
if not structure['sections']:
|
||||
# Split prompt into sentences and use as sections
|
||||
sentences = re.split(r'[.!?]\s+', user_prompt)
|
||||
for i, sentence in enumerate(sentences[:5], 1): # Max 5 sections
|
||||
sentence = sentence.strip()
|
||||
if sentence and len(sentence) > 10 and not sentence.startswith(('Analyze', 'Create', 'Generate')):
|
||||
structure['sections'].append({
|
||||
'number': i,
|
||||
'title': sentence[:50] + "..." if len(sentence) > 50 else sentence,
|
||||
'level': 2
|
||||
})
|
||||
|
||||
# Final fallback: create sections from prompt keywords
|
||||
if not structure['sections']:
|
||||
# Extract key action words from prompt
|
||||
action_words = ['analyze', 'summarize', 'review', 'assess', 'evaluate', 'examine', 'investigate']
|
||||
found_actions = []
|
||||
for action in action_words:
|
||||
if action in user_prompt.lower():
|
||||
found_actions.append(action.title())
|
||||
|
||||
if found_actions:
|
||||
for i, action in enumerate(found_actions[:3], 1):
|
||||
structure['sections'].append({
|
||||
'number': i,
|
||||
'title': f"{action} Document Content",
|
||||
'level': 2
|
||||
})
|
||||
else:
|
||||
# Last resort: generic but meaningful sections
|
||||
structure['sections'] = [
|
||||
{'number': 1, 'title': 'Document Analysis', 'level': 2},
|
||||
{'number': 2, 'title': 'Key Information', 'level': 2},
|
||||
{'number': 3, 'title': 'Summary and Conclusions', 'level': 2}
|
||||
]
|
||||
|
||||
return structure
|
||||
|
||||
def _generate_content_from_structure(self, doc, content: str, structure: Dict[str, Any]):
    """Emit headings and per-section text into *doc* from an outline.

    For each outline section a "N) Title" heading is added, followed by
    whatever AI content matches that section (via
    _extract_section_content) or a placeholder note. The full AI content
    is then appended under a "Complete Analysis" heading.
    """
    for section in structure['sections']:
        doc.add_heading(f"{section['number']}) {section['title']}", level=section['level'])

        # Pull the paragraphs of the AI response relevant to this section.
        body = self._extract_section_content(content, section['title'])
        if body:
            doc.add_paragraph(body)
        else:
            doc.add_paragraph(f"Content for {section['title']} based on document analysis.")

        # Blank paragraph as spacing between sections.
        doc.add_paragraph()

    if content and content.strip():
        doc.add_heading("Complete Analysis", level=1)
        doc.add_paragraph(content)
|
||||
|
||||
def _extract_section_content(self, content: str, section_title: str) -> str:
|
||||
"""Extract relevant content for a specific section from AI response."""
|
||||
if not content or not section_title:
|
||||
return ""
|
||||
|
||||
# Look for content that matches the section title
|
||||
section_keywords = section_title.lower().split()
|
||||
|
||||
# Split content into paragraphs
|
||||
paragraphs = content.split('\n\n')
|
||||
|
||||
relevant_paragraphs = []
|
||||
for paragraph in paragraphs:
|
||||
paragraph_lower = paragraph.lower()
|
||||
# Check if paragraph contains keywords from section title
|
||||
if any(keyword in paragraph_lower for keyword in section_keywords if len(keyword) > 3):
|
||||
relevant_paragraphs.append(paragraph.strip())
|
||||
|
||||
if relevant_paragraphs:
|
||||
return '\n\n'.join(relevant_paragraphs[:2]) # Max 2 paragraphs per section
|
||||
|
||||
return ""
|
||||
|
||||
def _setup_document_styles(self, doc):
    """Configure base fonts: Calibri 11pt body, bold Calibri headings 1-3.

    Heading sizes step down (14/12/10pt). Failures are logged and
    swallowed so rendering continues with python-docx defaults.
    """
    try:
        normal_font = doc.styles['Normal'].font
        normal_font.name = 'Calibri'
        normal_font.size = Pt(11)

        for level in (1, 2, 3):
            heading_font = doc.styles[f'Heading {level}'].font
            heading_font.name = 'Calibri'
            heading_font.size = Pt(16 - level * 2)
            heading_font.bold = True
    except Exception as e:
        self.logger.warning(f"Could not set up document styles: {str(e)}")
|
||||
|
||||
def _process_section(self, doc, lines: list):
    """Convert a section's lines into DOCX elements.

    The first line containing a pipe (but not starting with one) is taken
    as the start of a table: the whole section is handed to the table
    extractor and processing stops. Otherwise lines become bullet items,
    numbered items, or plain paragraphs.
    """
    for entry in lines:
        if not entry.strip():
            continue

        # Possible table content — delegate the whole section and stop.
        if '|' in entry and not entry.startswith('|'):
            rows = self._extract_table_data(lines)
            if rows:
                self._add_table(doc, rows)
            return

        if entry.startswith('- ') or entry.startswith('* '):
            # Bullet item: drop the two-character marker.
            doc.add_paragraph(entry[2:], style='List Bullet')
        elif entry.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')):
            # Numbered item: drop the "N. " prefix.
            doc.add_paragraph(entry[3:], style='List Number')
        else:
            doc.add_paragraph(entry)
|
||||
|
||||
def _extract_table_data(self, lines: list) -> list:
|
||||
"""Extract table data from lines."""
|
||||
table_data = []
|
||||
in_table = False
|
||||
|
||||
for line in lines:
|
||||
if '|' in line:
|
||||
if not in_table:
|
||||
in_table = True
|
||||
# Split by | and clean up
|
||||
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
|
||||
if cells:
|
||||
table_data.append(cells)
|
||||
elif in_table and not line.strip():
|
||||
# Empty line, might be end of table
|
||||
break
|
||||
|
||||
return table_data if len(table_data) > 1 else []
|
||||
|
||||
def _add_table(self, doc, table_data: list):
    """Add a centred Word table populated from *table_data* (rows of cells).

    Column count comes from the first row; overlong rows are truncated to
    fit. Errors are logged and swallowed.
    """
    try:
        if not table_data:
            return

        word_table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
        word_table.alignment = WD_TABLE_ALIGNMENT.CENTER

        for row_idx, row_values in enumerate(table_data):
            row_cells = word_table.rows[row_idx].cells
            for col_idx, value in enumerate(row_values):
                # Guard against rows wider than the header row.
                if col_idx < len(row_cells):
                    row_cells[col_idx].text = value

        self._style_table(word_table)
    except Exception as e:
        self.logger.warning(f"Could not add table: {str(e)}")
|
||||
|
||||
def _style_table(self, table):
    """Bold every run in the table's first (header) row.

    Errors are logged and swallowed so a styling failure cannot break
    table insertion.
    """
    try:
        if len(table.rows) == 0:
            return
        for cell in table.rows[0].cells:
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.bold = True
    except Exception as e:
        self.logger.warning(f"Could not style table: {str(e)}")
|
||||
|
||||
def _process_table_row(self, doc, line: str):
    """Append one pipe-delimited line to the in-progress table on *doc*.

    The first row seen creates ``self._current_table`` (Table Grid style)
    and becomes the bold header; subsequent rows are appended as data.
    Lines with fewer than two cells are emitted as plain paragraphs.
    NOTE(review): nothing here resets ``_current_table`` between tables —
    presumably the caller clears it; confirm.
    """
    if not line.strip():
        return

    cells = [part.strip() for part in line.split('|')]

    if len(cells) < 2:
        # Not enough columns to be a table row; treat as regular text.
        doc.add_paragraph(line)
        return

    current = getattr(self, '_current_table', None)
    if current is None:
        # First row: create the table and fill the header.
        current = doc.add_table(rows=1, cols=len(cells))
        current.style = 'Table Grid'
        self._current_table = current

        header_cells = current.rows[0].cells
        for idx, value in enumerate(cells):
            if idx < len(header_cells):
                header_cell = header_cells[idx]
                header_cell.text = value
                # Bold every run so the header stands out.
                for paragraph in header_cell.paragraphs:
                    for run in paragraph.runs:
                        run.bold = True
    else:
        # Append a data row, truncating to the table's column count.
        new_row = current.add_row()
        for idx, value in enumerate(cells):
            if idx < len(new_row.cells):
                new_row.cells[idx].text = value
|
||||
|
||||
def _clean_ai_content(self, content: str) -> str:
|
||||
"""Clean AI-generated content by removing debug information and duplicates."""
|
||||
if not content:
|
||||
return ""
|
||||
|
||||
# Remove debug information
|
||||
lines = content.split('\n')
|
||||
clean_lines = []
|
||||
|
||||
for line in lines:
|
||||
# Skip debug lines and separators
|
||||
if (line.startswith('[Skipped ') or
|
||||
line.startswith('=== DOCUMENT:') or
|
||||
line.startswith('---') or
|
||||
line.startswith('FILENAME:') or
|
||||
line.strip() == '' or
|
||||
line.strip() == '---'):
|
||||
continue
|
||||
clean_lines.append(line)
|
||||
|
||||
# Join lines and remove duplicate content
|
||||
clean_content = '\n'.join(clean_lines)
|
||||
|
||||
# Remove duplicate sections by keeping only the first occurrence
|
||||
sections = clean_content.split('\n\n')
|
||||
seen_sections = set()
|
||||
unique_sections = []
|
||||
|
||||
for section in sections:
|
||||
section_key = section.strip()[:50] # Use first 50 chars as key
|
||||
if section_key not in seen_sections and section.strip():
|
||||
seen_sections.add(section_key)
|
||||
unique_sections.append(section)
|
||||
|
||||
return '\n\n'.join(unique_sections)
|
||||
|
||||
def _process_tables(self, doc, content: str) -> str:
    """
    Process tables in the content (both CSV and pipe-separated) and convert them to Word tables.
    Returns the content with tables replaced by placeholders.

    Scans the text line by line; any run of two or more consecutive lines
    sharing a separator ('|' preferred over ',') is rendered as a Word
    table appended to *doc*, and replaced in the returned text by a
    single "[TABLE_INSERTED_<n>]" placeholder line. Non-table lines are
    returned stripped, one per line.

    NOTE(review): a prose line containing a single comma is mis-detected
    as a CSV table row — presumably upstream content is table-like here;
    confirm with callers.
    """
    import csv
    import io

    lines = content.split('\n')
    processed_lines = []
    i = 0

    while i < len(lines):
        line = lines[i].strip()

        # Check if this line looks like a table (contains pipes or commas with multiple fields)
        is_pipe_table = '|' in line and len(line.split('|')) >= 2
        is_csv_table = ',' in line and len(line.split(',')) >= 2

        if is_pipe_table or is_csv_table:
            # Collect consecutive table lines
            table_lines = []
            j = i

            # Determine separator and collect lines; pipe wins when both
            # separators are present on the first line.
            separator = '|' if is_pipe_table else ','
            while j < len(lines):
                current_line = lines[j].strip()
                if separator in current_line and len(current_line.split(separator)) >= 2:
                    table_lines.append(current_line)
                    j += 1
                else:
                    break

            if len(table_lines) >= 2:  # At least header + 1 data row
                # Create Word table
                try:
                    if separator == '|':
                        # Process pipe-separated table
                        rows = []
                        for table_line in table_lines:
                            # Split by pipe and clean up
                            cells = [cell.strip() for cell in table_line.split('|')]
                            rows.append(cells)
                    else:
                        # Process CSV table (handles quoted fields etc.)
                        csv_content = '\n'.join(table_lines)
                        csv_reader = csv.reader(io.StringIO(csv_content))
                        rows = list(csv_reader)

                    if rows and len(rows[0]) > 0:
                        # Create Word table sized by the header row.
                        table = doc.add_table(rows=len(rows), cols=len(rows[0]))
                        table.style = 'Table Grid'

                        # Populate table, truncating overlong rows.
                        for row_idx, row_data in enumerate(rows):
                            for col_idx, cell_data in enumerate(row_data):
                                if col_idx < len(table.rows[row_idx].cells):
                                    table.rows[row_idx].cells[col_idx].text = cell_data.strip()

                            # Make header row bold
                            if row_idx == 0:
                                for cell in table.rows[row_idx].cells:
                                    for paragraph in cell.paragraphs:
                                        for run in paragraph.runs:
                                            run.bold = True

                        # Add placeholder to mark where table was inserted
                        processed_lines.append(f"[TABLE_INSERTED_{len(processed_lines)}]")

                        # Skip the table lines
                        i = j
                        continue
                except Exception as e:
                    # If table parsing fails, treat as regular text
                    pass

        processed_lines.append(line)
        i += 1

    return '\n'.join(processed_lines)
|
||||
|
||||
def _parse_and_format_content(self, doc, content: str, title: str):
    """Parse AI-generated text (light Markdown) and emit formatted DOCX
    elements onto *doc*.

    Handles, in order: fenced ``` code blocks, blank lines (paragraph
    breaks), table placeholders from _process_tables, Markdown headings,
    "N) Title" numbered headings, bullet/numbered list items, and plain
    paragraphs. *title* is accepted for interface compatibility but is
    not used here — the caller is expected to have added it already.

    Fixes: the previous version never accumulated the interior lines of a
    fenced code block (it relied on a ``'code_lines' in locals()`` hack),
    so code was rendered as ordinary prose; and the list-marker strip
    regex had an unanchored alternative that removed "N. " anywhere in
    the line.
    """
    if not content:
        return

    # Convert table-like runs into real Word tables; they are replaced in
    # the text by [TABLE_INSERTED_*] placeholders which we skip below.
    content = self._process_tables(doc, content)

    code_lines = None  # non-None while inside a fenced ``` block

    for raw_line in content.split('\n'):
        line = raw_line.strip()

        # Fenced-code handling comes first so code content is never
        # mistaken for headings or list items.
        if code_lines is not None:
            if line.startswith('```'):
                # Closing fence: flush the collected code.
                self._emit_code_paragraph(doc, '\n'.join(code_lines))
                code_lines = None
            else:
                code_lines.append(raw_line)
            continue
        if line.startswith('```'):
            if len(line) > 3 and line.endswith('```'):
                # Single-line fenced snippet, e.g. ```x = 1```
                self._emit_code_paragraph(doc, line.strip('`'))
            else:
                code_lines = []  # opening fence; language tag ignored
            continue

        if not line:
            # Empty line - add paragraph break.
            doc.add_paragraph()
            continue

        # Skip table placeholders (tables already emitted above).
        if line.startswith('[TABLE_INSERTED_'):
            continue

        if line.startswith('#'):
            # Markdown heading; depth capped at 3.
            level = len(line) - len(line.lstrip('#'))
            doc.add_heading(line.lstrip('# ').strip(), level=min(level, 3))
        elif re.match(r'^\d+\)\s+.+', line):
            # Numbered heading such as "1) Overview".
            doc.add_heading(re.sub(r'^\d+\)\s+', '', line), level=1)
        elif line.startswith('- ') or re.match(r'^\d+\.\s+', line):
            # Anchored strip of the leading list marker only.
            self._add_bullet_point(doc, re.sub(r'^(?:[-•]\s+|\d+\.\s+)', '', line))
        else:
            self._add_paragraph_to_doc(doc, line)

    # Unterminated code fence at end of content: flush what we collected.
    if code_lines:
        self._emit_code_paragraph(doc, '\n'.join(code_lines))

def _emit_code_paragraph(self, doc, code_text: str):
    """Add *code_text* to *doc* as a monospace (Courier New) paragraph."""
    para = doc.add_paragraph()
    para.add_run(code_text).font.name = 'Courier New'
|
||||
|
||||
def _add_paragraph_to_doc(self, doc, text: str):
    """Add *text* as a paragraph, honouring **bold** and *italic* markers.

    The text is first split on '**': odd-indexed segments are emitted as
    bold runs. Even-indexed segments are further split on '*', with
    odd-indexed pieces emitted as italic runs. Whitespace-only text is
    ignored.
    """
    if not text.strip():
        return

    para = doc.add_paragraph()

    for bold_idx, segment in enumerate(text.split('**')):
        if bold_idx % 2:
            # Between ** markers -> bold run.
            if segment:
                para.add_run(segment).bold = True
        else:
            # Outside bold: look for single-* italics.
            for italic_idx, piece in enumerate(segment.split('*')):
                if not piece:
                    continue
                if italic_idx % 2:
                    para.add_run(piece).italic = True
                else:
                    para.add_run(piece)
|
||||
424
modules/services/serviceGeneration/renderers/rendererHtml.py
Normal file
424
modules/services/serviceGeneration/renderers/rendererHtml.py
Normal file
|
|
@ -0,0 +1,424 @@
|
|||
"""
|
||||
HTML renderer for report generation.
|
||||
"""
|
||||
|
||||
import html
from typing import Dict, Any, Tuple, List

from .rendererBaseTemplate import BaseRenderer
|
||||
|
||||
class RendererHtml(BaseRenderer):
|
||||
"""Renders content to HTML format with format-specific extraction."""
|
||||
|
||||
@classmethod
def get_supported_formats(cls) -> List[str]:
    """List the HTML file extensions this renderer produces."""
    formats = ['html', 'htm']
    return formats
|
||||
|
||||
@classmethod
def get_format_aliases(cls) -> List[str]:
    """Alternative names that resolve to the HTML renderer."""
    aliases = ['web', 'webpage']
    return aliases
|
||||
|
||||
@classmethod
def get_priority(cls) -> int:
    """Selection priority of this renderer among registered renderers."""
    return 100
|
||||
|
||||
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
    """Render extracted JSON content to HTML using AI-analyzed styling.

    Returns a ``(html_string, mime_type)`` tuple; on failure a minimal
    error page is returned instead of raising.

    Fixes: the fallback page previously interpolated *title* and the
    exception text unescaped, so a value containing markup could break
    (or inject into) the error page.
    """
    try:
        html_content = await self._generate_html_from_json(extracted_content, title, user_prompt, ai_service)
        return html_content, "text/html"

    except Exception as e:
        self.logger.error(f"Error rendering HTML: {str(e)}")
        # Escape interpolated values before embedding them in markup.
        safe_title = html.escape(str(title))
        safe_error = html.escape(str(e))
        return (
            f"<html><head><title>{safe_title}</title></head>"
            f"<body><h1>{safe_title}</h1>"
            f"<p>Error rendering report: {safe_error}</p></body></html>",
            "text/html",
        )
|
||||
|
||||
async def _generate_html_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
    """Build a complete standalone HTML page from the structured JSON
    document.

    Validates that *json_content* is a dict with a "sections" field,
    prefers the title found in its metadata over the *title* argument,
    and emits head (with inline CSS from AI-generated styles), header,
    one block per section (via _render_json_section), and a footer with
    the generation timestamp.

    Raises Exception (chained to the original cause) on any failure.

    Fixes: the document title previously went into <title>/<h1>
    unescaped; it originates from user/AI input and must not inject
    markup.
    """
    try:
        # Get AI-generated styling definitions.
        styles = await self._get_html_styles(user_prompt, ai_service)

        # Validate JSON structure before rendering anything.
        if not isinstance(json_content, dict):
            raise ValueError("JSON content must be a dictionary")
        if "sections" not in json_content:
            raise ValueError("JSON content must contain 'sections' field")

        # Prefer the title from JSON metadata, if present.
        document_title = json_content.get("metadata", {}).get("title", title)
        safe_title = html.escape(str(document_title))

        html_parts = []

        # Document head.
        html_parts.append('<!DOCTYPE html>')
        html_parts.append('<html lang="en">')
        html_parts.append('<head>')
        html_parts.append('<meta charset="UTF-8">')
        html_parts.append('<meta name="viewport" content="width=device-width, initial-scale=1.0">')
        html_parts.append(f'<title>{safe_title}</title>')
        html_parts.append('<style>')
        html_parts.append(self._generate_css_styles(styles))
        html_parts.append('</style>')
        html_parts.append('</head>')
        html_parts.append('<body>')

        # Document header.
        html_parts.append(f'<header><h1 class="document-title">{safe_title}</h1></header>')

        # Main content: one rendered block per section.
        html_parts.append('<main>')
        for section in json_content.get("sections", []):
            section_html = self._render_json_section(section, styles)
            if section_html:
                html_parts.append(section_html)
        html_parts.append('</main>')

        # Footer with generation timestamp.
        html_parts.append('<footer>')
        html_parts.append(f'<p class="generated-info">Generated: {self._format_timestamp()}</p>')
        html_parts.append('</footer>')

        html_parts.append('</body>')
        html_parts.append('</html>')

        return '\n'.join(html_parts)

    except Exception as e:
        self.logger.error(f"Error generating HTML from JSON: {str(e)}")
        # Chain the original cause for easier upstream debugging.
        raise Exception(f"HTML generation failed: {str(e)}") from e
|
||||
|
||||
async def _get_html_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]:
    """Obtain HTML styling definitions, preferring AI-generated ones.

    Builds a schema of default style values, asks the AI service to fill
    it (falling back to the built-in defaults), and finally runs the
    result through a contrast sanity check.
    """
    schema = {
        "title": {"font_size": "2.5em", "color": "#1F4E79", "font_weight": "bold", "text_align": "center", "margin": "0 0 1em 0"},
        "heading1": {"font_size": "2em", "color": "#2F2F2F", "font_weight": "bold", "text_align": "left", "margin": "1.5em 0 0.5em 0"},
        "heading2": {"font_size": "1.5em", "color": "#4F4F4F", "font_weight": "bold", "text_align": "left", "margin": "1em 0 0.5em 0"},
        "paragraph": {"font_size": "1em", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "margin": "0 0 1em 0", "line_height": "1.6"},
        "table": {"border": "1px solid #ddd", "border_collapse": "collapse", "width": "100%", "margin": "1em 0"},
        "table_header": {"background": "#4F4F4F", "color": "#FFFFFF", "font_weight": "bold", "text_align": "center", "padding": "12px"},
        "table_cell": {"background": "#FFFFFF", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "padding": "8px", "border": "1px solid #ddd"},
        "bullet_list": {"font_size": "1em", "color": "#2F2F2F", "margin": "0 0 1em 0", "padding_left": "20px"},
        "code_block": {"font_family": "Courier New, monospace", "font_size": "0.9em", "color": "#2F2F2F", "background": "#F5F5F5", "padding": "1em", "border": "1px solid #ddd", "border_radius": "4px", "margin": "1em 0"},
        "image": {"max_width": "100%", "height": "auto", "margin": "1em 0", "border_radius": "4px"},
        "body": {"font_family": "Arial, sans-serif", "background": "#FFFFFF", "color": "#2F2F2F", "margin": "0", "padding": "20px"}
    }

    prompt_template = self._create_ai_style_template("html", user_prompt, schema)
    ai_styles = await self._get_ai_styles(ai_service, prompt_template, self._get_default_html_styles())

    # Guard against unreadable AI-chosen colour pairs.
    return self._validate_html_styles_contrast(ai_styles)
|
||||
|
||||
def _validate_html_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
    """Repair obviously unreadable table colour pairs in *styles*.

    Only the exact white-on-white and black-on-black combinations are
    detected; headers are forced to white-on-grey and cells to
    dark-on-white. On any error the built-in defaults are returned.
    """
    try:
        if "table_header" in styles:
            header = styles["table_header"]
            bg = header.get("background", "#FFFFFF").upper()
            fg = header.get("color", "#000000").upper()
            # Same-colour text/background -> force a readable pair.
            if (bg, fg) in {("#FFFFFF", "#FFFFFF"), ("#000000", "#000000")}:
                header["background"] = "#4F4F4F"
                header["color"] = "#FFFFFF"

        if "table_cell" in styles:
            cell = styles["table_cell"]
            bg = cell.get("background", "#FFFFFF").upper()
            fg = cell.get("color", "#000000").upper()
            if (bg, fg) in {("#FFFFFF", "#FFFFFF"), ("#000000", "#000000")}:
                cell["background"] = "#FFFFFF"
                cell["color"] = "#2F2F2F"

        return styles

    except Exception as e:
        self.logger.warning(f"Style validation failed: {str(e)}")
        return self._get_default_html_styles()
|
||||
|
||||
|
||||
def _get_default_html_styles(self) -> Dict[str, Any]:
    """Built-in fallback styling used when AI style generation is
    unavailable or fails."""
    defaults = {
        "title": {"font_size": "2.5em", "color": "#1F4E79", "font_weight": "bold", "text_align": "center", "margin": "0 0 1em 0"},
        "heading1": {"font_size": "2em", "color": "#2F2F2F", "font_weight": "bold", "text_align": "left", "margin": "1.5em 0 0.5em 0"},
        "heading2": {"font_size": "1.5em", "color": "#4F4F4F", "font_weight": "bold", "text_align": "left", "margin": "1em 0 0.5em 0"},
        "paragraph": {"font_size": "1em", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "margin": "0 0 1em 0", "line_height": "1.6"},
        "table": {"border": "1px solid #ddd", "border_collapse": "collapse", "width": "100%", "margin": "1em 0"},
        "table_header": {"background": "#4F4F4F", "color": "#FFFFFF", "font_weight": "bold", "text_align": "center", "padding": "12px"},
        "table_cell": {"background": "#FFFFFF", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "padding": "8px", "border": "1px solid #ddd"},
        "bullet_list": {"font_size": "1em", "color": "#2F2F2F", "margin": "0 0 1em 0", "padding_left": "20px"},
        "code_block": {"font_family": "Courier New, monospace", "font_size": "0.9em", "color": "#2F2F2F", "background": "#F5F5F5", "padding": "1em", "border": "1px solid #ddd", "border_radius": "4px", "margin": "1em 0"},
        "image": {"max_width": "100%", "height": "auto", "margin": "1em 0", "border_radius": "4px"},
        "body": {"font_family": "Arial, sans-serif", "background": "#FFFFFF", "color": "#2F2F2F", "margin": "0", "padding": "20px"}
    }
    return defaults
|
||||
|
||||
def _generate_css_styles(self, styles: Dict[str, Any]) -> str:
    """Generate a CSS stylesheet from AI-provided style definitions.

    The original implementation repeated the same selector/property emission
    block eleven times; this version drives it from a selector table, which
    produces byte-identical output with a single loop.

    Args:
        styles: Mapping of logical element names (e.g. "body", "title",
            "table_header") to dicts of snake_case CSS properties.

    Returns:
        CSS text with one rule per supported element, in a fixed order,
        ending with a hard-coded rule for the generated-info footer.
    """
    # (selector, key-in-`styles`) pairs; order fixes rule order in output.
    selector_map = [
        ("body", "body"),
        (".document-title", "title"),
        ("h1", "heading1"),
        ("h2", "heading2"),
        ("p", "paragraph"),
        ("table", "table"),
        ("th", "table_header"),
        ("td", "table_cell"),
        ("ul", "bullet_list"),
        ("pre", "code_block"),
        ("img", "image"),
    ]

    css_parts = []
    for selector, style_key in selector_map:
        css_parts.append(f"{selector} {{")
        for property_name, value in styles.get(style_key, {}).items():
            # Style dicts use snake_case keys; CSS wants kebab-case.
            css_property = property_name.replace("_", "-")
            css_parts.append(f" {css_property}: {value};")
        css_parts.append("}")

    # Fixed footer style for the "Generated: ..." line.
    css_parts.append(".generated-info {")
    css_parts.append(" font-size: 0.9em;")
    css_parts.append(" color: #666;")
    css_parts.append(" text-align: center;")
    css_parts.append(" margin-top: 2em;")
    css_parts.append(" padding-top: 1em;")
    css_parts.append(" border-top: 1px solid #ddd;")
    css_parts.append("}")

    return '\n'.join(css_parts)
|
||||
|
||||
def _render_json_section(self, section: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render one JSON section to an HTML fragment using AI-generated styles.

    Dispatches on the section's content type; unknown types fall back to
    paragraph rendering, and any error yields an inline error <div>.
    """
    try:
        section_type = self._get_section_type(section)
        section_data = self._get_section_data(section)

        # These kinds need their payload normalized by
        # _process_section_by_type before the type-specific renderer runs.
        if section_type == "table":
            return self._render_json_table(self._process_section_by_type(section), styles)
        if section_type == "bullet_list":
            return self._render_json_bullet_list(self._process_section_by_type(section), styles)
        if section_type == "code_block":
            return self._render_json_code_block(self._process_section_by_type(section), styles)
        if section_type == "image":
            return self._render_json_image(self._process_section_by_type(section), styles)

        # Headings and paragraphs render straight from the raw section data.
        if section_type == "heading":
            return self._render_json_heading(section_data, styles)
        # Paragraph, plus the fallback for any unknown section type.
        return self._render_json_paragraph(section_data, styles)

    except Exception as e:
        self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
        return f'<div class="error">[Error rendering section: {str(e)}]</div>'
|
||||
|
||||
def _render_json_table(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render a JSON table definition to an HTML <table> fragment.

    Returns an empty string when either headers or rows are missing, or
    when rendering fails.
    """
    try:
        headers = table_data.get("headers", [])
        rows = table_data.get("rows", [])

        # Nothing to render without both a header row and at least one body row.
        if not headers or not rows:
            return ""

        head_cells = [f'<th>{header}</th>' for header in headers]

        body_lines = []
        for row in rows:
            body_lines.append('<tr>')
            body_lines.extend(f'<td>{cell}</td>' for cell in row)
            body_lines.append('</tr>')

        pieces = ['<table>', '<thead><tr>', *head_cells, '</tr></thead>',
                  '<tbody>', *body_lines, '</tbody>', '</table>']
        return '\n'.join(pieces)

    except Exception as e:
        self.logger.warning(f"Error rendering table: {str(e)}")
        return ""
|
||||
|
||||
def _render_json_bullet_list(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render a JSON bullet list to an HTML <ul> fragment.

    Items may be plain strings or dicts with a "text" key; anything else
    is silently skipped. Empty input yields an empty string.
    """
    try:
        items = list_data.get("items", [])
        if not items:
            return ""

        lines = ['<ul>']
        for entry in items:
            if isinstance(entry, str):
                lines.append(f'<li>{entry}</li>')
            elif isinstance(entry, dict) and "text" in entry:
                lines.append(f'<li>{entry["text"]}</li>')
        lines.append('</ul>')

        return '\n'.join(lines)

    except Exception as e:
        self.logger.warning(f"Error rendering bullet list: {str(e)}")
        return ""
|
||||
|
||||
def _render_json_heading(self, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render a JSON heading to an HTML <hN> element.

    The level is clamped to the valid HTML range 1..6; empty text yields
    an empty string.
    """
    try:
        text = heading_data.get("text", "")
        if not text:
            return ""

        # Clamp to HTML's h1..h6 range.
        clamped = min(max(heading_data.get("level", 1), 1), 6)
        return f'<h{clamped}>{text}</h{clamped}>'

    except Exception as e:
        self.logger.warning(f"Error rendering heading: {str(e)}")
        return ""
|
||||
|
||||
def _render_json_paragraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render a JSON paragraph to an HTML <p> element; empty text yields ""."""
    try:
        text = paragraph_data.get("text", "")
        return f'<p>{text}</p>' if text else ""

    except Exception as e:
        self.logger.warning(f"Error rendering paragraph: {str(e)}")
        return ""
|
||||
|
||||
def _render_json_code_block(self, code_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render a JSON code block to an HTML <pre><code> fragment.

    When a language is provided it is emitted as a `language-*` class
    (the convention used by syntax highlighters). Empty code yields "".
    """
    try:
        code = code_data.get("code", "")
        language = code_data.get("language", "")

        if not code:
            return ""

        lang_attr = f' class="language-{language}"' if language else ''
        return f'<pre><code{lang_attr}>{code}</code></pre>'

    except Exception as e:
        self.logger.warning(f"Error rendering code block: {str(e)}")
        return ""
|
||||
|
||||
def _render_json_image(self, image_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render a JSON image to an HTML <img> with an inline base64 data URI.

    NOTE(review): the data URI assumes the payload is PNG — confirm that
    upstream always produces PNG data. Missing data yields "".
    """
    try:
        base64_data = image_data.get("base64Data", "")
        alt_text = image_data.get("altText", "Image")

        if not base64_data:
            return ""

        return f'<img src="data:image/png;base64,{base64_data}" alt="{alt_text}">'

    except Exception as e:
        self.logger.warning(f"Error rendering image: {str(e)}")
        return f'<div class="error">[Image: {image_data.get("altText", "Image")}]</div>'
|
||||
281
modules/services/serviceGeneration/renderers/rendererImage.py
Normal file
281
modules/services/serviceGeneration/renderers/rendererImage.py
Normal file
|
|
@ -0,0 +1,281 @@
|
|||
"""
|
||||
Image renderer for report generation using AI image generation.
|
||||
"""
|
||||
|
||||
from .rendererBaseTemplate import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
import base64
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class RendererImage(BaseRenderer):
    """Renders content to image format using AI image generation.

    Converts the structured JSON document produced by extraction into a text
    prompt and asks the configured AI service to generate an image. There is
    deliberately no local fallback: any failure is raised to the caller.

    # assumes ai_service exposes `aiObjects.generateImage(...)` and
    # `aiObjects.call(AiCallRequest)` — TODO confirm against the AI service API.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return the output format names this renderer handles."""
        return ['png', 'jpg', 'jpeg', 'image']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return alternative names that map to this renderer."""
        return ['img', 'picture', 'photo', 'graphic']

    @classmethod
    def get_priority(cls) -> int:
        """Return selection priority for the image renderer."""
        return 90

    async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
        """Render extracted JSON content to image format using AI image generation.

        Returns:
            Tuple of (image content, "image/png"). The content is whatever
            `_generate_ai_image` returns — the `image_data` field of the AI
            result (presumably base64-encoded PNG; verify against the AI
            service contract).

        Raises:
            Exception: wrapping any underlying failure; no fallback image.
        """
        try:
            # Generate AI image from content
            image_content = await self._generate_ai_image(extracted_content, title, user_prompt, ai_service)

            return image_content, "image/png"

        except Exception as e:
            self.logger.error(f"Error rendering image: {str(e)}")
            # Re-raise the exception instead of using fallback
            raise Exception(f"Image rendering failed: {str(e)}")

    async def _generate_ai_image(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
        """Generate AI image from extracted content.

        Validates the JSON document shape, builds the generation prompt, calls
        the AI image endpoint, and returns the raw `image_data` field from the
        result. Raises ValueError on validation/generation failure, re-wrapped
        as Exception by the outer handler.
        """
        try:
            if not ai_service:
                raise ValueError("AI service is required for image generation")

            # Validate JSON structure
            if not isinstance(extracted_content, dict):
                raise ValueError("Extracted content must be a dictionary")

            if "sections" not in extracted_content:
                raise ValueError("Extracted content must contain 'sections' field")

            # Use title from JSON metadata if available, otherwise use provided title
            document_title = extracted_content.get("metadata", {}).get("title", title)

            # Create AI prompt for image generation
            image_prompt = await self._create_image_generation_prompt(extracted_content, document_title, user_prompt, ai_service)

            # Generate image using AI
            image_result = await ai_service.aiObjects.generateImage(
                prompt=image_prompt,
                size="1024x1024",
                quality="standard",
                style="vivid"
            )

            # Extract base64 image data from result
            if image_result and image_result.get("success", False):
                image_data = image_result.get("image_data", "")
                if image_data:
                    return image_data
                else:
                    raise ValueError("No image data returned from AI")
            else:
                # Distinguish "call returned an error" from "call returned nothing".
                error_msg = image_result.get("error", "Unknown error") if image_result else "No result"
                raise ValueError(f"AI image generation failed: {error_msg}")

        except Exception as e:
            self.logger.error(f"Error generating AI image: {str(e)}")
            raise Exception(f"AI image generation failed: {str(e)}")

    async def _create_image_generation_prompt(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
        """Create a detailed prompt for AI image generation based on the content.

        Assembles user intent, title, a content summary, and style guidance,
        then enforces DALL-E's 4000-character prompt limit: first by asking the
        AI to compress the prompt, then by falling back to a minimal (possibly
        truncated) prompt. Any error degrades to a simple title-only prompt.
        """
        try:
            # Start with base prompt
            prompt_parts = []

            # Add user's original intent if available
            if user_prompt:
                prompt_parts.append(f"User Request: {user_prompt}")

            # Add document title
            prompt_parts.append(f"Document Title: {title}")

            # Analyze content and create visual description
            sections = extracted_content.get("sections", [])
            content_description = self._analyze_content_for_visual_description(sections)

            if content_description:
                prompt_parts.append(f"Content to Visualize: {content_description}")

            # Add style guidance
            style_guidance = self._get_style_guidance_from_content(extracted_content, user_prompt)
            if style_guidance:
                prompt_parts.append(f"Visual Style: {style_guidance}")

            # Combine all parts
            full_prompt = "Create a professional, informative image that visualizes the following content:\n\n" + "\n\n".join(prompt_parts)

            # Add technical requirements
            full_prompt += "\n\nTechnical Requirements:"
            full_prompt += "\n- High quality, professional appearance"
            full_prompt += "\n- Clear, readable text if any text is included"
            full_prompt += "\n- Appropriate colors and layout"
            full_prompt += "\n- Suitable for business/professional use"

            # Truncate prompt if it exceeds DALL-E's 4000 character limit
            if len(full_prompt) > 4000:
                # Use AI to compress the prompt intelligently
                compressed_prompt = await self._compress_prompt_with_ai(full_prompt, ai_service)
                if compressed_prompt and len(compressed_prompt) <= 4000:
                    return compressed_prompt

                # Fallback to minimal prompt if AI compression fails or is still too long
                minimal_prompt = f"Create a professional image representing: {title}"
                if user_prompt:
                    minimal_prompt += f" - {user_prompt}"

                # If even the minimal prompt is too long, truncate it
                if len(minimal_prompt) > 4000:
                    minimal_prompt = minimal_prompt[:3997] + "..."

                return minimal_prompt

            # Prompt fits within the limit — use it as-is.
            return full_prompt

        except Exception as e:
            self.logger.warning(f"Error creating image prompt: {str(e)}")
            # Fallback to simple prompt
            return f"Create a professional image representing: {title}"

    async def _compress_prompt_with_ai(self, long_prompt: str, ai_service=None) -> str:
        """Use AI to intelligently compress a long prompt while preserving key information.

        Returns the compressed prompt, or None when no AI service is available,
        the compressed result is invalid (over 4000 chars or under 50 chars —
        an implausibly short compression), or the call fails.
        """
        try:
            if not ai_service:
                return None

            compression_prompt = f"""
            You are an expert at creating concise, effective prompts for AI image generation.

            The following prompt is too long for DALL-E (4000 character limit) and needs to be compressed to under 4000 characters while preserving the most important visual information.

            Original prompt ({len(long_prompt)} characters):
            {long_prompt}

            Please create a compressed version that:
            1. Keeps the most important visual elements and requirements
            2. Maintains the core intent and style guidance
            3. Preserves technical requirements
            4. Stays under 4000 characters
            5. Is optimized for DALL-E image generation

            Return only the compressed prompt, no explanations.
            """

            # Use AI to compress the prompt - call the AI service correctly
            # The ai_service has an aiObjects attribute that contains the actual AI interface
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

            request = AiCallRequest(
                prompt=compression_prompt,
                options=AiCallOptions(
                    operationType=OperationType.GENERAL,
                    maxTokens=2000,
                    temperature=0.3  # Lower temperature for more consistent compression
                )
            )

            response = await ai_service.aiObjects.call(request)
            compressed = response.content.strip()

            # Validate the compressed prompt
            if compressed and len(compressed) <= 4000 and len(compressed) > 50:
                self.logger.info(f"Successfully compressed prompt from {len(long_prompt)} to {len(compressed)} characters")
                return compressed
            else:
                self.logger.warning(f"AI compression failed or produced invalid result: {len(compressed) if compressed else 0} chars")
                return None

        except Exception as e:
            self.logger.warning(f"Error compressing prompt with AI: {str(e)}")
            return None

    def _analyze_content_for_visual_description(self, sections: List[Dict[str, Any]]) -> str:
        """Analyze content sections and create a visual description for AI.

        Produces a short semicolon-separated summary of each section (table
        dimensions, list sizes, heading text, truncated paragraph/code text).
        Falls back to generic descriptions when sections are empty or on error.
        """
        try:
            descriptions = []

            for section in sections:
                section_type = self._get_section_type(section)
                section_data = self._get_section_data(section)

                if section_type == "table":
                    headers = section_data.get("headers", [])
                    rows = section_data.get("rows", [])
                    if headers and rows:
                        descriptions.append(f"Data table with {len(headers)} columns and {len(rows)} rows: {', '.join(headers)}")

                elif section_type == "bullet_list":
                    items = section_data.get("items", [])
                    if items:
                        descriptions.append(f"List with {len(items)} items")

                elif section_type == "heading":
                    text = section_data.get("text", "")
                    level = section_data.get("level", 1)
                    if text:
                        descriptions.append(f"Heading {level}: {text}")

                elif section_type == "paragraph":
                    text = section_data.get("text", "")
                    if text and len(text) > 10:  # Only include substantial paragraphs
                        # Truncate long text
                        truncated = text[:100] + "..." if len(text) > 100 else text
                        descriptions.append(f"Text content: {truncated}")

                elif section_type == "code_block":
                    code = section_data.get("code", "")
                    language = section_data.get("language", "")
                    if code:
                        descriptions.append(f"Code block ({language}): {code[:50]}...")

            return "; ".join(descriptions) if descriptions else "General document content"

        except Exception as e:
            self.logger.warning(f"Error analyzing content: {str(e)}")
            return "Document content"

    def _get_style_guidance_from_content(self, extracted_content: Dict[str, Any], user_prompt: str = None) -> str:
        """Determine visual style guidance based on content and user prompt.

        Scans the user prompt for style keywords and the sections for content
        kinds (tables/lists/code) to build a comma-separated style hint string;
        defaults to "professional, clean design" when nothing matches.
        """
        try:
            style_elements = []

            # Analyze user prompt for style hints
            if user_prompt:
                prompt_lower = user_prompt.lower()

                if any(word in prompt_lower for word in ["modern", "contemporary", "sleek"]):
                    style_elements.append("modern, clean design")
                elif any(word in prompt_lower for word in ["classic", "traditional", "formal"]):
                    style_elements.append("classic, formal design")
                elif any(word in prompt_lower for word in ["creative", "artistic", "colorful"]):
                    style_elements.append("creative, artistic design")
                elif any(word in prompt_lower for word in ["corporate", "business", "professional"]):
                    style_elements.append("corporate, professional design")

            # Analyze content type for additional style hints
            sections = extracted_content.get("sections", [])
            has_tables = any(self._get_section_type(s) == "table" for s in sections)
            has_lists = any(self._get_section_type(s) == "bullet_list" for s in sections)
            has_code = any(self._get_section_type(s) == "code_block" for s in sections)

            if has_tables:
                style_elements.append("data-focused layout")
            if has_lists:
                style_elements.append("organized, structured presentation")
            if has_code:
                style_elements.append("technical, developer-friendly")

            # Default style if no specific guidance
            if not style_elements:
                style_elements.append("professional, clean design")

            return ", ".join(style_elements)

        except Exception as e:
            self.logger.warning(f"Error determining style guidance: {str(e)}")
            return "professional design"
|
||||
79
modules/services/serviceGeneration/renderers/rendererJson.py
Normal file
79
modules/services/serviceGeneration/renderers/rendererJson.py
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
"""
|
||||
JSON renderer for report generation.
|
||||
"""
|
||||
|
||||
from .rendererBaseTemplate import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
import json
|
||||
|
||||
class RendererJson(BaseRenderer):
    """Renders content to JSON format with format-specific extraction.

    The extracted content is already structured JSON from the AI; this
    renderer validates/normalizes it and serializes it with stable
    formatting (indent=2, ensure_ascii=False).
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported JSON formats."""
        return ['json']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['data']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for JSON renderer."""
        return 80

    async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
        """Render extracted JSON content to JSON format.

        Returns:
            Tuple of (JSON text, "application/json"). Never raises: any
            failure produces a minimal fallback document that records the
            error in its metadata.
        """
        try:
            # The extracted content should already be JSON from the AI;
            # just validate and format it.
            json_content = self._clean_json_content(extracted_content, title)

            return json_content, "application/json"

        except Exception as e:
            self.logger.error(f"Error rendering JSON: {str(e)}")
            # Return minimal JSON fallback
            fallback_data = {
                "title": title,
                "sections": [{"content_type": "paragraph", "elements": [{"text": f"Error rendering report: {str(e)}"}]}],
                "metadata": {"error": str(e)}
            }
            # ensure_ascii=False keeps this fallback's encoding consistent
            # with _clean_json_content's normal output (it was missing here).
            return json.dumps(fallback_data, indent=2, ensure_ascii=False), "application/json"

    def _clean_json_content(self, content: Dict[str, Any], title: str) -> str:
        """Clean and validate JSON content from AI.

        Ensures the document is a dict with "sections" and "metadata" (the
        metadata title defaults to the provided title), then serializes it.
        On failure, returns a minimal valid document embedding the error.
        """
        try:
            # Validate JSON structure
            if not isinstance(content, dict):
                raise ValueError("Content must be a dictionary")

            # Ensure it has the expected structure
            if "sections" not in content:
                # Convert old format to new format
                content = {
                    "sections": [{"content_type": "paragraph", "elements": [{"text": str(content)}]}],
                    "metadata": {"title": title}
                }

            # Ensure metadata exists
            if "metadata" not in content:
                content["metadata"] = {}

            # Set title in metadata if not present
            if "title" not in content["metadata"]:
                content["metadata"]["title"] = title

            # Re-format with proper indentation
            return json.dumps(content, indent=2, ensure_ascii=False)

        except Exception as e:
            self.logger.warning(f"Error cleaning JSON content: {str(e)}")
            # Return minimal valid JSON
            fallback_data = {
                "sections": [{"content_type": "paragraph", "elements": [{"text": str(content)}]}],
                "metadata": {"title": title, "error": str(e)}
            }
            return json.dumps(fallback_data, indent=2, ensure_ascii=False)
|
||||
221
modules/services/serviceGeneration/renderers/rendererMarkdown.py
Normal file
221
modules/services/serviceGeneration/renderers/rendererMarkdown.py
Normal file
|
|
@ -0,0 +1,221 @@
|
|||
"""
|
||||
Markdown renderer for report generation.
|
||||
"""
|
||||
|
||||
from .rendererBaseTemplate import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
|
||||
class RendererMarkdown(BaseRenderer):
    """Renders content to Markdown format with format-specific extraction.

    Walks the structured JSON document section by section and emits GitHub-
    flavored markdown (pipe tables, fenced code blocks, `#` headings).
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported Markdown formats."""
        return ['md', 'markdown']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['mdown', 'mkd']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for markdown renderer."""
        return 95

    async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
        """Render extracted JSON content to Markdown format.

        Returns:
            Tuple of (markdown text, "text/markdown"). Never raises: any
            failure yields a minimal document containing the error message.
        """
        try:
            # Generate markdown from JSON structure
            markdown_content = self._generate_markdown_from_json(extracted_content, title)

            return markdown_content, "text/markdown"

        except Exception as e:
            self.logger.error(f"Error rendering markdown: {str(e)}")
            # Return minimal markdown fallback
            return f"# {title}\n\nError rendering report: {str(e)}", "text/markdown"

    def _generate_markdown_from_json(self, json_content: Dict[str, Any], title: str) -> str:
        """Generate markdown content from structured JSON document.

        Raises:
            Exception: when the document shape is invalid or rendering fails;
            the caller (render) converts this into a fallback document.
        """
        try:
            # Validate JSON structure
            if not isinstance(json_content, dict):
                raise ValueError("JSON content must be a dictionary")

            if "sections" not in json_content:
                raise ValueError("JSON content must contain 'sections' field")

            # Use title from JSON metadata if available, otherwise use provided title
            document_title = json_content.get("metadata", {}).get("title", title)

            # Build markdown content
            markdown_parts = []

            # Document title
            markdown_parts.append(f"# {document_title}")
            markdown_parts.append("")

            # Process each section
            sections = json_content.get("sections", [])
            for section in sections:
                section_markdown = self._render_json_section(section)
                if section_markdown:
                    markdown_parts.append(section_markdown)
                    markdown_parts.append("")  # Add spacing between sections

            # Add generation info
            markdown_parts.append("---")
            markdown_parts.append(f"*Generated: {self._format_timestamp()}*")

            return '\n'.join(markdown_parts)

        except Exception as e:
            self.logger.error(f"Error generating markdown from JSON: {str(e)}")
            raise Exception(f"Markdown generation failed: {str(e)}")

    def _render_json_section(self, section: Dict[str, Any]) -> str:
        """Render a single JSON section to markdown.

        Dispatches on content type; unknown types fall back to paragraph
        rendering, and any error yields an italicized inline error marker.
        """
        try:
            section_type = self._get_section_type(section)
            section_data = self._get_section_data(section)

            if section_type == "table":
                # Process the section data to extract table structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_table(processed_data)
            elif section_type == "bullet_list":
                # Process the section data to extract bullet list structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_bullet_list(processed_data)
            elif section_type == "heading":
                return self._render_json_heading(section_data)
            elif section_type == "paragraph":
                return self._render_json_paragraph(section_data)
            elif section_type == "code_block":
                # Process the section data to extract code block structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_code_block(processed_data)
            elif section_type == "image":
                # Process the section data to extract image structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_image(processed_data)
            else:
                # Fallback to paragraph for unknown types
                return self._render_json_paragraph(section_data)

        except Exception as e:
            self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
            return f"*[Error rendering section: {str(e)}]*"

    def _render_json_table(self, table_data: Dict[str, Any]) -> str:
        """Render a JSON table to a markdown pipe table.

        Returns an empty string when headers or rows are missing.
        """
        try:
            headers = table_data.get("headers", [])
            rows = table_data.get("rows", [])

            if not headers or not rows:
                return ""

            markdown_parts = []

            # Create table header
            header_line = " | ".join(str(header) for header in headers)
            markdown_parts.append(header_line)

            # Add separator line
            separator_line = " | ".join("---" for _ in headers)
            markdown_parts.append(separator_line)

            # Add data rows
            for row in rows:
                row_line = " | ".join(str(cell_data) for cell_data in row)
                markdown_parts.append(row_line)

            return '\n'.join(markdown_parts)

        except Exception as e:
            self.logger.warning(f"Error rendering table: {str(e)}")
            return ""

    def _render_json_bullet_list(self, list_data: Dict[str, Any]) -> str:
        """Render a JSON bullet list to markdown `- item` lines.

        Items may be strings or dicts with a "text" key; others are skipped.
        """
        try:
            items = list_data.get("items", [])

            if not items:
                return ""

            markdown_parts = []
            for item in items:
                if isinstance(item, str):
                    markdown_parts.append(f"- {item}")
                elif isinstance(item, dict) and "text" in item:
                    markdown_parts.append(f"- {item['text']}")

            return '\n'.join(markdown_parts)

        except Exception as e:
            self.logger.warning(f"Error rendering bullet list: {str(e)}")
            return ""

    def _render_json_heading(self, heading_data: Dict[str, Any]) -> str:
        """Render a JSON heading to markdown, clamping the level to 1..6."""
        try:
            level = heading_data.get("level", 1)
            text = heading_data.get("text", "")

            if text:
                level = max(1, min(6, level))
                return f"{'#' * level} {text}"

            return ""

        except Exception as e:
            self.logger.warning(f"Error rendering heading: {str(e)}")
            return ""

    def _render_json_paragraph(self, paragraph_data: Dict[str, Any]) -> str:
        """Render a JSON paragraph to markdown (plain text passthrough)."""
        try:
            text = paragraph_data.get("text", "")
            return text if text else ""

        except Exception as e:
            self.logger.warning(f"Error rendering paragraph: {str(e)}")
            return ""

    def _render_json_code_block(self, code_data: Dict[str, Any]) -> str:
        """Render a JSON code block to a fenced markdown block.

        The language (if any) is used as the fence's info string.
        """
        try:
            code = code_data.get("code", "")
            language = code_data.get("language", "")

            if code:
                if language:
                    return f"```{language}\n{code}\n```"
                else:
                    return f"```\n{code}\n```"

            return ""

        except Exception as e:
            self.logger.warning(f"Error rendering code block: {str(e)}")
            return ""

    def _render_json_image(self, image_data: Dict[str, Any]) -> str:
        """Render a JSON image reference to markdown.

        BUG FIX: the previous implementation returned empty f-strings in every
        branch, so images silently disappeared from markdown output even
        though its comment promised an alt-text placeholder. Markdown has no
        portable way to embed base64 data inline, so emit a markdown image
        tag whose alt text carries the meaningful information and whose
        target is a placeholder.
        """
        try:
            alt_text = image_data.get("altText", "Image")
            base64_data = image_data.get("base64Data", "")

            if base64_data:
                # Data is present but not embeddable in plain markdown;
                # keep the alt text visible and mark the target.
                return f"![{alt_text}](embedded-image)"
            else:
                return f"![{alt_text}](missing-image)"

        except Exception as e:
            self.logger.warning(f"Error rendering image: {str(e)}")
            return "![Image](error)"
|
||||
642
modules/services/serviceGeneration/renderers/rendererPdf.py
Normal file
642
modules/services/serviceGeneration/renderers/rendererPdf.py
Normal file
|
|
@ -0,0 +1,642 @@
|
|||
"""
|
||||
PDF renderer for report generation using reportlab.
|
||||
"""
|
||||
|
||||
from .rendererBaseTemplate import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
import io
|
||||
import base64
|
||||
from datetime import datetime, UTC
|
||||
|
||||
try:
|
||||
from reportlab.lib.pagesizes import letter, A4
|
||||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
|
||||
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
||||
from reportlab.lib.units import inch
|
||||
from reportlab.lib import colors
|
||||
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
|
||||
REPORTLAB_AVAILABLE = True
|
||||
except ImportError:
|
||||
REPORTLAB_AVAILABLE = False
|
||||
|
||||
class RendererPdf(BaseRenderer):
|
||||
"""Renders content to PDF format using reportlab."""
|
||||
|
||||
@classmethod
def get_supported_formats(cls) -> List[str]:
    """Formats this renderer can produce."""
    return ["pdf"]

@classmethod
def get_format_aliases(cls) -> List[str]:
    """Alternative format names that resolve to PDF."""
    return ["document", "print"]

@classmethod
def get_priority(cls) -> int:
    """Selection priority of this renderer."""
    return 120
|
||||
|
||||
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
    """Render extracted JSON content to PDF with AI-analyzed styling.

    Returns a (content, mime_type) pair: base64 PDF on success, HTML when
    reportlab is unavailable, or a plain-text error message on failure.
    """
    try:
        if REPORTLAB_AVAILABLE:
            # Normal path: build the PDF using AI-analyzed styling.
            payload = await self._generate_pdf_from_json(extracted_content, title, user_prompt, ai_service)
            return payload, "application/pdf"

        # reportlab missing: degrade gracefully to the HTML renderer.
        from .rendererHtml import RendererHtml
        fallback = RendererHtml()
        html_content, _ = await fallback.render(extracted_content, title, user_prompt, ai_service)
        return html_content, "text/html"

    except Exception as e:
        self.logger.error(f"Error rendering PDF: {str(e)}")
        # Minimal plain-text fallback so callers always get a payload.
        return f"PDF Generation Error: {str(e)}", "text/plain"
|
||||
|
||||
async def _generate_pdf_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
    """Generate base64-encoded PDF bytes from a structured JSON document.

    Validates the document shape, builds a title page plus one flowable
    group per section, and returns the finished PDF as a base64 string.
    Raises Exception (chained) when generation fails.

    Defect fixed: titles longer than 40 characters were replaced by the
    hard-coded string "PowerOn - Consent Agreement" regardless of the
    document's actual title; they are now truncated instead.
    """
    try:
        # Get AI-generated styling definitions
        styles = await self._get_pdf_styles(user_prompt, ai_service)

        # Validate JSON structure
        if not isinstance(json_content, dict):
            raise ValueError("JSON content must be a dictionary")

        if "sections" not in json_content:
            raise ValueError("JSON content must contain 'sections' field")

        # Use title from JSON metadata if available, otherwise use provided title
        document_title = json_content.get("metadata", {}).get("title", title)

        # Shorten long titles to prevent wrapping/overlapping on the title
        # page — truncate the real title rather than substituting a fixed one.
        if len(document_title) > 40:
            document_title = document_title[:37] + "..."

        # Create a buffer to hold the PDF
        buffer = io.BytesIO()

        # Create PDF document (A4, 1-inch side/top margins)
        doc = SimpleDocTemplate(
            buffer,
            pagesize=A4,
            rightMargin=72,
            leftMargin=72,
            topMargin=72,
            bottomMargin=18
        )

        # Build PDF content
        story = []

        # Title page
        title_style = self._create_title_style(styles)
        story.append(Paragraph(document_title, title_style))
        story.append(Spacer(1, 50))  # Increased spacing to prevent overlap
        story.append(Paragraph(f"Generated: {self._format_timestamp()}", self._create_normal_style(styles)))
        story.append(Spacer(1, 30))  # Add spacing before page break
        story.append(PageBreak())

        # Process each section into flowables
        sections = json_content.get("sections", [])
        self.services.utils.debugLogToFile(f"PDF SECTIONS TO PROCESS: {len(sections)} sections", "PDF_RENDERER")
        for i, section in enumerate(sections):
            self.services.utils.debugLogToFile(f"PDF SECTION {i}: content_type={section.get('content_type', 'unknown')}, id={section.get('id', 'unknown')}", "PDF_RENDERER")
            section_elements = self._render_json_section(section, styles)
            self.services.utils.debugLogToFile(f"PDF SECTION {i} ELEMENTS: {len(section_elements)} elements", "PDF_RENDERER")
            story.extend(section_elements)

        # Build PDF
        doc.build(story)

        # Get PDF content as base64
        buffer.seek(0)
        pdf_bytes = buffer.getvalue()
        pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')

        return pdf_base64

    except Exception as e:
        self.logger.error(f"Error generating PDF from JSON: {str(e)}")
        # Chain the cause so the original traceback is preserved.
        raise Exception(f"PDF generation failed: {str(e)}") from e
|
||||
|
||||
async def _get_pdf_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]:
    """Get PDF styling definitions using base template AI styling.

    Builds a style schema, asks the AI service for a customized version via
    the base-template helper, then validates contrast and normalizes colors.

    Defects fixed: (1) the schema literal duplicated _get_default_pdf_styles()
    verbatim — reuse that method; (2) colors were converted to aRGB form
    BEFORE contrast validation, whose "#FFFFFF"/"#000000" comparisons could
    therefore never match — validation now runs first.
    """
    # The schema sent to the AI is exactly the default style set.
    style_schema = self._get_default_pdf_styles()

    style_template = self._create_ai_style_template("pdf", user_prompt, style_schema)

    # Use base template method like DOCX does (this works!)
    styles = await self._get_ai_styles(ai_service, style_template, self._get_default_pdf_styles())

    if styles is None:
        return self._get_default_pdf_styles()

    # Fix contrast issues while colors are still in "#RRGGBB" form, then
    # convert to the aRGB form expected downstream.
    styles = self._validate_pdf_styles_contrast(styles)
    return self._convert_colors_format(styles)
|
||||
|
||||
async def _get_ai_styles_with_pdf_colors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
    """Get AI styles with proper PDF color conversion.

    Sends *style_template* to the AI service, then applies several layers of
    recovery to coax valid JSON out of the response (fence stripping, brace
    balancing, substring extraction). Falls back to *default_styles* whenever
    the service is absent, misconfigured, or the response cannot be parsed.

    NOTE(review): _get_pdf_styles calls the base _get_ai_styles instead of
    this method — this may be unused duplication; confirm before removing.
    """
    if not ai_service:
        return default_styles

    try:
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL

        request = AiCallRequest(prompt=style_template, context="", options=request_options)

        # Check if AI service is properly configured
        if not hasattr(ai_service, 'aiObjects') or not ai_service.aiObjects:
            self.logger.warning("AI service not properly configured, using defaults")
            return default_styles

        response = await ai_service.aiObjects.call(request)

        # Check if response is valid
        if not response:
            self.logger.warning("AI service returned no response, using defaults")
            return default_styles

        import json
        import re

        # Clean and parse JSON
        result = response.content.strip() if response and response.content else ""

        # Check if result is empty
        if not result:
            self.logger.warning("AI styling returned empty response, using defaults")
            return default_styles

        # Log the raw response for debugging
        self.logger.debug(f"AI styling raw response: {result[:200]}...")

        # Extract JSON from various formats (fenced ```json blocks first,
        # then bare fences without a trailing newline match).
        json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
        if json_match:
            result = json_match.group(1).strip()
        elif result.startswith('```json'):
            result = re.sub(r'^```json\s*', '', result)
            result = re.sub(r'\s*```$', '', result)
        elif result.startswith('```'):
            result = re.sub(r'^```\s*', '', result)
            result = re.sub(r'\s*```$', '', result)

        # Try to extract JSON from explanatory text
        json_patterns = [
            r'\{[^{}]*"title"[^{}]*\}',  # Simple JSON object
            r'\{.*?"title".*?\}',  # JSON with title field
            r'\{.*?"font_size".*?\}',  # JSON with font_size field
        ]

        # First pattern that matches wins.
        for pattern in json_patterns:
            json_match = re.search(pattern, result, re.DOTALL)
            if json_match:
                result = json_match.group(0)
                break

        # Additional cleanup - remove any leading/trailing whitespace and newlines
        result = result.strip()

        # Check if result is still empty after cleanup
        if not result:
            self.logger.warning("AI styling returned empty content after cleanup, using defaults")
            return default_styles

        # Try to parse JSON
        try:
            styles = json.loads(result)
            self.logger.debug(f"Successfully parsed AI styles: {list(styles.keys())}")
        except json.JSONDecodeError as json_error:
            self.logger.warning(f"AI styling returned invalid JSON: {json_error}")

            # Use print instead of logger to avoid truncation
            self.services.utils.debugLogToFile(f"FULL AI RESPONSE THAT FAILED TO PARSE: {result}", "PDF_RENDERER")
            self.services.utils.debugLogToFile(f"RESPONSE LENGTH: {len(result)} characters", "PDF_RENDERER")

            self.logger.warning(f"Raw content that failed to parse: {result}")

            # Try to fix incomplete JSON by adding missing closing braces
            open_braces = result.count('{')
            close_braces = result.count('}')

            if open_braces > close_braces:
                # JSON is incomplete, add missing closing braces
                missing_braces = open_braces - close_braces
                result = result + '}' * missing_braces
                self.logger.info(f"Added {missing_braces} missing closing brace(s)")

                # Try parsing the fixed JSON
                try:
                    styles = json.loads(result)
                    self.logger.info("Successfully fixed incomplete JSON")
                except json.JSONDecodeError as fix_error:
                    self.logger.warning(f"Fixed JSON still invalid: {fix_error}")
                    # Try to extract just the JSON part if it's embedded in text
                    json_start = result.find('{')
                    json_end = result.rfind('}')
                    if json_start != -1 and json_end != -1 and json_end > json_start:
                        json_part = result[json_start:json_end+1]
                        try:
                            styles = json.loads(json_part)
                            self.logger.info("Successfully extracted JSON from explanatory text")
                        except json.JSONDecodeError:
                            self.logger.warning("Could not extract valid JSON from response, using defaults")
                            return default_styles
                    else:
                        return default_styles
            else:
                # Try to extract just the JSON part if it's embedded in text
                json_start = result.find('{')
                json_end = result.rfind('}')
                if json_start != -1 and json_end != -1 and json_end > json_start:
                    json_part = result[json_start:json_end+1]
                    try:
                        styles = json.loads(json_part)
                        self.logger.info("Successfully extracted JSON from explanatory text")
                    except json.JSONDecodeError:
                        self.logger.warning("Could not extract valid JSON from response, using defaults")
                        return default_styles
                else:
                    return default_styles

        # Convert colors to PDF format (keep as hex strings, PDF renderer will convert them)
        styles = self._convert_colors_format(styles)

        return styles

    except Exception as e:
        self.logger.warning(f"AI styling failed: {str(e)}, using defaults")
        return default_styles
|
||||
|
||||
def _convert_colors_format(self, styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Convert colors to proper format for PDF compatibility."""
|
||||
try:
|
||||
for style_name, style_config in styles.items():
|
||||
if isinstance(style_config, dict):
|
||||
for prop, value in style_config.items():
|
||||
if isinstance(value, str) and value.startswith('#') and len(value) == 7:
|
||||
# Convert #RRGGBB to #AARRGGBB (add FF alpha channel) for consistency
|
||||
styles[style_name][prop] = f"FF{value[1:]}"
|
||||
elif isinstance(value, str) and value.startswith('#') and len(value) == 9:
|
||||
# Already aRGB format, keep as is
|
||||
pass
|
||||
return styles
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Color conversion failed: {str(e)}")
|
||||
return styles
|
||||
|
||||
def _get_safe_color(self, color_value: str, default: str = "#000000") -> str:
|
||||
"""Get a safe hex color value for PDF."""
|
||||
if isinstance(color_value, str) and color_value.startswith('#'):
|
||||
if len(color_value) == 7:
|
||||
return f"FF{color_value[1:]}"
|
||||
elif len(color_value) == 9:
|
||||
return color_value
|
||||
return default
|
||||
|
||||
def _validate_pdf_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Validate and fix contrast issues in AI-generated styles."""
|
||||
try:
|
||||
# Fix table header contrast
|
||||
if "table_header" in styles:
|
||||
header = styles["table_header"]
|
||||
bg_color = header.get("background", "#FFFFFF")
|
||||
text_color = header.get("text_color", "#000000")
|
||||
|
||||
# If both are white or both are dark, fix it
|
||||
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
|
||||
header["background"] = "#4F4F4F"
|
||||
header["text_color"] = "#FFFFFF"
|
||||
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
|
||||
header["background"] = "#4F4F4F"
|
||||
header["text_color"] = "#FFFFFF"
|
||||
|
||||
# Fix table cell contrast
|
||||
if "table_cell" in styles:
|
||||
cell = styles["table_cell"]
|
||||
bg_color = cell.get("background", "#FFFFFF")
|
||||
text_color = cell.get("text_color", "#000000")
|
||||
|
||||
# If both are white or both are dark, fix it
|
||||
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
|
||||
cell["background"] = "#FFFFFF"
|
||||
cell["text_color"] = "#2F2F2F"
|
||||
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
|
||||
cell["background"] = "#FFFFFF"
|
||||
cell["text_color"] = "#2F2F2F"
|
||||
|
||||
return styles
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Style validation failed: {str(e)}")
|
||||
return self._get_default_pdf_styles()
|
||||
|
||||
def _get_default_pdf_styles(self) -> Dict[str, Any]:
|
||||
"""Default PDF styles."""
|
||||
return {
|
||||
"title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center", "space_after": 30},
|
||||
"heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 12, "space_before": 12},
|
||||
"heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 8, "space_before": 8},
|
||||
"paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left", "space_after": 6, "line_height": 1.2},
|
||||
"table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center", "font_size": 12},
|
||||
"table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left", "font_size": 10},
|
||||
"bullet_list": {"font_size": 11, "color": "#2F2F2F", "space_after": 3},
|
||||
"code_block": {"font": "Courier", "font_size": 9, "color": "#2F2F2F", "background": "#F5F5F5", "space_after": 6}
|
||||
}
|
||||
|
||||
def _create_title_style(self, styles: Dict[str, Any]) -> ParagraphStyle:
    """Build the ReportLab paragraph style for the document title."""
    spec = styles.get("title", {})
    color = spec.get("color", "#1F4E79")
    space_after = spec.get("space_after", 30)
    size = spec.get("font_size", 20)  # default reduced from 24 to 20

    # Debug trace of the resolved title color and spacing.
    self.services.utils.debugLogToFile(f"PDF TITLE COLOR: {color} -> {self._hex_to_color(color)}", "PDF_RENDERER")
    self.services.utils.debugLogToFile(f"PDF TITLE SPACE_AFTER: {space_after}", "PDF_RENDERER")

    return ParagraphStyle(
        'CustomTitle',
        fontSize=size,
        spaceAfter=space_after,
        alignment=self._get_alignment(spec.get("align", "center")),
        textColor=self._hex_to_color(color),
        leading=size * 1.4,  # extra line spacing for multi-line titles
        spaceBefore=0,  # no gap above the title
    )
|
||||
|
||||
def _create_heading_style(self, styles: Dict[str, Any], level: int) -> ParagraphStyle:
    """Build the ReportLab style for a heading of the given level.

    Falls back to the "heading1" definition when the exact level has no
    entry, and scales the default font size down as the level deepens.
    """
    spec = styles.get(f"heading{level}", styles.get("heading1", {}))
    fallback_size = 18 - level * 2
    return ParagraphStyle(
        f'CustomHeading{level}',
        fontSize=spec.get("font_size", fallback_size),
        spaceAfter=spec.get("space_after", 12),
        spaceBefore=spec.get("space_before", 12),
        alignment=self._get_alignment(spec.get("align", "left")),
        textColor=self._hex_to_color(spec.get("color", "#2F2F2F")),
    )
|
||||
|
||||
def _create_normal_style(self, styles: Dict[str, Any]) -> ParagraphStyle:
    """Build the ReportLab style for body paragraphs."""
    spec = styles.get("paragraph", {})
    size = spec.get("font_size", 11)
    return ParagraphStyle(
        'CustomNormal',
        fontSize=size,
        spaceAfter=spec.get("space_after", 6),
        alignment=self._get_alignment(spec.get("align", "left")),
        textColor=self._hex_to_color(spec.get("color", "#2F2F2F")),
        # Leading derived from the relative line-height factor.
        leading=spec.get("line_height", 1.2) * size,
    )
|
||||
|
||||
def _get_alignment(self, align: str) -> int:
    """Map an alignment string to a ReportLab paragraph-alignment constant.

    Unknown values, non-strings, and empty input all resolve to TA_LEFT.
    """
    if not isinstance(align, str) or not align:
        return TA_LEFT

    mapping = {
        "center": TA_CENTER,
        "left": TA_LEFT,
        "justify": TA_JUSTIFY,
        "right": TA_LEFT,  # 'right' deliberately falls back to LEFT (TA_RIGHT not imported here)
        "0": TA_LEFT,  # tolerate numeric strings
        "1": TA_CENTER,
        "2": TA_JUSTIFY,
    }
    return mapping.get(align.lower().strip(), TA_LEFT)
|
||||
|
||||
def _get_table_alignment(self, align: str) -> str:
|
||||
"""Convert alignment string to ReportLab table alignment string."""
|
||||
if not align or not isinstance(align, str):
|
||||
return 'LEFT'
|
||||
|
||||
align_map = {
|
||||
"center": 'CENTER',
|
||||
"left": 'LEFT',
|
||||
"justify": 'LEFT', # Tables don't support justify, use LEFT
|
||||
"right": 'RIGHT',
|
||||
"0": 'LEFT', # Handle numeric strings
|
||||
"1": 'CENTER',
|
||||
"2": 'LEFT' # Tables don't support justify, use LEFT
|
||||
}
|
||||
return align_map.get(align.lower().strip(), 'LEFT')
|
||||
|
||||
def _hex_to_color(self, hex_color: str) -> colors.Color:
    """Convert a hex color ("#RRGGBB" or aRGB "AARRGGBB") to a ReportLab Color.

    The alpha byte of 8-digit values is ignored; any unparseable input
    falls back to black.

    Defect fixed: the bare ``except:`` also swallowed SystemExit and
    KeyboardInterrupt; narrowed to ``except Exception``.
    """
    try:
        hex_color = hex_color.lstrip('#')

        # aRGB form (8 hex digits): drop the leading alpha byte.
        if len(hex_color) == 8:
            hex_color = hex_color[2:]

        # Plain RGB form (6 hex digits).
        if len(hex_color) == 6:
            r = int(hex_color[0:2], 16) / 255.0
            g = int(hex_color[2:4], 16) / 255.0
            b = int(hex_color[4:6], 16) / 255.0
            return colors.Color(r, g, b)

        # Unrecognized length/format.
        return colors.black
    except Exception:
        # Non-string input or invalid hex digits.
        return colors.black
|
||||
|
||||
def _render_json_section(self, section: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render one JSON section into a list of ReportLab flowables.

    Dispatches each element of the section to the renderer matching the
    section's content type; unknown types fall back to paragraph rendering.
    On error, returns a single visible error paragraph instead of raising.
    """
    try:
        section_type = self._get_section_type(section)
        elements = self._get_section_data(section)

        # Dispatch table keyed by section type.
        dispatch = {
            "table": self._render_json_table,
            "bullet_list": self._render_json_bullet_list,
            "heading": self._render_json_heading,
            "paragraph": self._render_json_paragraph,
            "code_block": self._render_json_code_block,
            "image": self._render_json_image,
        }
        render_element = dispatch.get(section_type, self._render_json_paragraph)

        rendered: List[Any] = []
        for element in elements:
            rendered.extend(render_element(element, styles))
        return rendered

    except Exception as e:
        self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
        return [Paragraph(f"[Error rendering section: {str(e)}]", self._create_normal_style(styles))]
|
||||
|
||||
def _render_json_table(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON table to PDF elements using AI-generated styles.

    Expects "headers" (list of cell values) and "rows" (list of row lists);
    returns [] when either is missing/empty or on any rendering error.
    Row 0 of the built table is the header row and gets the
    "table_header" styling; remaining rows use "table_cell".
    """
    try:
        headers = table_data.get("headers", [])
        rows = table_data.get("rows", [])

        if not headers or not rows:
            return []

        # Prepare table data (header row first, then body rows)
        table_data_list = [headers] + rows

        # Create table
        table = Table(table_data_list)

        # Apply styling
        table_header_style = styles.get("table_header", {})
        table_cell_style = styles.get("table_cell", {})

        # (0, 0)-(-1, 0) addresses the header row; (0, 1)-(-1, -1) the body.
        table_style = [
            ('BACKGROUND', (0, 0), (-1, 0), self._hex_to_color(table_header_style.get("background", "#4F4F4F"))),
            ('TEXTCOLOR', (0, 0), (-1, 0), self._hex_to_color(table_header_style.get("text_color", "#FFFFFF"))),
            ('ALIGN', (0, 0), (-1, -1), self._get_table_alignment(table_cell_style.get("align", "left"))),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold' if table_header_style.get("bold", True) else 'Helvetica'),
            ('FONTSIZE', (0, 0), (-1, 0), table_header_style.get("font_size", 12)),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), self._hex_to_color(table_cell_style.get("background", "#FFFFFF"))),
            ('FONTSIZE', (0, 1), (-1, -1), table_cell_style.get("font_size", 10)),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]

        table.setStyle(TableStyle(table_style))

        # Trailing spacer separates the table from the next flowable.
        return [table, Spacer(1, 12)]

    except Exception as e:
        self.logger.warning(f"Error rendering table: {str(e)}")
        return []
|
||||
|
||||
def _render_json_bullet_list(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON bullet list into bulleted Paragraph flowables.

    Items may be strings or dicts with a "text" key; other items are
    skipped. A trailing Spacer is appended when anything was rendered.
    """
    try:
        spacing = styles.get("bullet_list", {}).get("space_after", 3)

        flowables: List[Any] = []
        for entry in list_data.get("items", []):
            if isinstance(entry, str):
                label = entry
            elif isinstance(entry, dict) and "text" in entry:
                label = entry["text"]
            else:
                continue
            flowables.append(Paragraph(f"• {label}", self._create_normal_style(styles)))

        if flowables:
            flowables.append(Spacer(1, spacing))
        return flowables

    except Exception as e:
        self.logger.warning(f"Error rendering bullet list: {str(e)}")
        return []
|
||||
|
||||
def _render_json_heading(self, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON heading into a single styled Paragraph.

    The heading level is clamped to 1..6; empty text yields [].
    """
    try:
        text = heading_data.get("text", "")
        if not text:
            return []
        depth = min(6, max(1, heading_data.get("level", 1)))
        return [Paragraph(text, self._create_heading_style(styles, depth))]
    except Exception as e:
        self.logger.warning(f"Error rendering heading: {str(e)}")
        return []
|
||||
|
||||
def _render_json_paragraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON paragraph into a single body-styled Paragraph ([] when empty)."""
    try:
        text = paragraph_data.get("text", "")
        return [Paragraph(text, self._create_normal_style(styles))] if text else []
    except Exception as e:
        self.logger.warning(f"Error rendering paragraph: {str(e)}")
        return []
|
||||
|
||||
def _render_json_code_block(self, code_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON code block to PDF elements using AI-generated styles.

    When a language is given, a bold "Code (lang):" caption precedes the
    monospaced code paragraph. Returns [] when there is no code or on error.
    """
    try:
        code = code_data.get("code", "")
        language = code_data.get("language", "")
        code_style_def = styles.get("code_block", {})

        if code:
            elements = []

            # Optional language caption above the code body.
            if language:
                lang_style = ParagraphStyle(
                    'CodeLanguage',
                    fontSize=code_style_def.get("font_size", 9),
                    textColor=self._hex_to_color(code_style_def.get("color", "#2F2F2F")),
                    fontName='Helvetica-Bold'
                )
                elements.append(Paragraph(f"Code ({language}):", lang_style))

            # Monospaced, background-shaded style for the code itself.
            code_style = ParagraphStyle(
                'CodeBlock',
                fontSize=code_style_def.get("font_size", 9),
                textColor=self._hex_to_color(code_style_def.get("color", "#2F2F2F")),
                fontName=code_style_def.get("font", "Courier"),
                backColor=self._hex_to_color(code_style_def.get("background", "#F5F5F5")),
                spaceAfter=code_style_def.get("space_after", 6)
            )
            elements.append(Paragraph(code, code_style))

            return elements

        return []

    except Exception as e:
        self.logger.warning(f"Error rendering code block: {str(e)}")
        return []
|
||||
|
||||
def _render_json_image(self, image_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON image as a text placeholder (no inline image embedding)."""
    try:
        if not image_data.get("base64Data", ""):
            return []
        caption = image_data.get("altText", "Image")
        # Placeholder only: embedding the base64 payload via reportlab is
        # not implemented here.
        return [Paragraph(f"[Image: {caption}]", self._create_normal_style(styles))]
    except Exception as e:
        self.logger.warning(f"Error rendering image: {str(e)}")
        return [Paragraph(f"[Image: {image_data.get('altText', 'Image')}]", self._create_normal_style(styles))]
|
||||
885
modules/services/serviceGeneration/renderers/rendererPptx.py
Normal file
885
modules/services/serviceGeneration/renderers/rendererPptx.py
Normal file
|
|
@ -0,0 +1,885 @@
|
|||
import logging
|
||||
import base64
|
||||
import io
|
||||
from typing import Dict, Any, Optional, Tuple, List
|
||||
from .rendererBaseTemplate import BaseRenderer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RendererPptx(BaseRenderer):
|
||||
"""Renderer for PowerPoint (.pptx) files using python-pptx library."""
|
||||
|
||||
def __init__(self):
    """Initialize renderer state: supported formats and output MIME type."""
    super().__init__()
    # Output format identifiers accepted by this renderer.
    self.supported_formats = ["pptx", "ppt"]
    # MIME type reported for the generated .pptx payload.
    self.output_mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
|
||||
@classmethod
def get_supported_formats(cls) -> list:
    """Output formats this renderer can produce."""
    return ["pptx", "ppt"]
|
||||
|
||||
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
    """Render structured JSON content as a PowerPoint presentation.

    Args:
        extracted_content: JSON content to render (expects a "sections" list;
            see ``_parse_json_to_slides``).
        title: Title for the presentation (used when the JSON has no metadata title).
        user_prompt: Optional user prompt used to drive AI styling.
        ai_service: Optional AI service used to generate style definitions.

    Returns:
        Tuple of (base64-encoded .pptx bytes, MIME type). On failure the first
        element is a plain-text error message and the MIME type is "text/plain".
    """
    try:
        # python-pptx is an optional dependency; import lazily so the module
        # loads even when it is absent (handled by the ImportError branch).
        from pptx import Presentation
        from pptx.util import Inches, Pt
        from pptx.enum.text import PP_ALIGN
        from pptx.dml.color import RGBColor

        # AI-generated (or default) styling definitions drive all formatting below.
        styles = await self._get_pptx_styles(user_prompt, ai_service)

        prs = Presentation()

        # Slide size: 4:3 only on explicit request, otherwise widescreen 16:9.
        if styles.get("slide_size", "16:9") == "4:3":
            prs.slide_width = Inches(10)
        else:
            prs.slide_width = Inches(13.33)
        prs.slide_height = Inches(7.5)

        slides_data = await self._parse_json_to_slides(extracted_content, title, styles)
        logger.info(f"Parsed {len(slides_data)} slides from JSON content")
        logger.info(f"JSON content preview: {str(extracted_content)[:200]}...")

        # NOTE: the original code reused loop variable `i` for both the slide
        # loop and the paragraph loop; distinct names avoid the shadowing.
        for slide_index, slide_data in enumerate(slides_data):
            logger.info(f"Slide {slide_index + 1}: '{slide_data.get('title', 'No title')}' - {len(slide_data.get('content', ''))} chars")
            content_text = slide_data.get('content', '')
            if content_text:
                logger.info(f"  Content preview: '{content_text[:100]}...'")
            else:
                logger.warning(f"  ⚠️ Slide {slide_index + 1} has NO content!")

            # Pick a layout based on what the slide contains (tables, bullets, ...).
            slide_layout = prs.slide_layouts[self._get_slide_layout_index(slide_data, styles)]
            slide = prs.slides.add_slide(slide_layout)

            # Title styling — guarded: some layouts expose no title placeholder.
            title_style = styles.get("title", {})
            title_shape = slide.shapes.title
            if title_shape is not None:
                title_shape.text = slide_data.get("title", "Slide")
                title_font = title_shape.text_frame.paragraphs[0].font
                if title_font:
                    title_font.size = Pt(title_style.get("font_size", 44))
                    title_font.bold = title_style.get("bold", True)
                    title_font.color.rgb = RGBColor(*self._get_safe_color(title_style.get("color", (31, 78, 121))))

            content_shape = slide.placeholders[1]
            text_frame = content_shape.text_frame
            text_frame.clear()

            for para_index, paragraph in enumerate(content_text.split('\n\n')):
                if not paragraph.strip():
                    continue
                # First paragraph reuses the frame's implicit empty paragraph.
                p = text_frame.paragraphs[0] if para_index == 0 else text_frame.add_paragraph()
                p.text = paragraph.strip()

                # BUGFIX: test '##' BEFORE '#' — the original order made the
                # subheading branch unreachable. Also track which style we
                # applied so the alignment lookup below cannot hit an unbound
                # variable (previously a NameError for heading paragraphs).
                if paragraph.startswith('##'):
                    applied_style = styles.get("subheading", {})
                    p.text = paragraph.lstrip('#').strip()
                    p.font.size = Pt(applied_style.get("font_size", 24))
                    p.font.bold = applied_style.get("bold", True)
                    p.font.color.rgb = RGBColor(*self._get_safe_color(applied_style.get("color", (79, 79, 79))))
                elif paragraph.startswith('#'):
                    applied_style = styles.get("heading", {})
                    p.text = paragraph.lstrip('#').strip()
                    p.font.size = Pt(applied_style.get("font_size", 32))
                    p.font.bold = applied_style.get("bold", True)
                    p.font.color.rgb = RGBColor(*self._get_safe_color(applied_style.get("color", (47, 47, 47))))
                elif paragraph.startswith('*') and paragraph.endswith('*'):
                    # *emphasis* markers render as bold body text.
                    applied_style = styles.get("paragraph", {})
                    p.text = paragraph.strip('*')
                    p.font.size = Pt(applied_style.get("font_size", 18))
                    p.font.bold = True
                    p.font.color.rgb = RGBColor(*self._get_safe_color(applied_style.get("color", (47, 47, 47))))
                else:
                    applied_style = styles.get("paragraph", {})
                    p.font.size = Pt(applied_style.get("font_size", 18))
                    p.font.bold = applied_style.get("bold", False)
                    p.font.color.rgb = RGBColor(*self._get_safe_color(applied_style.get("color", (47, 47, 47))))

                align = applied_style.get("align", "left")
                if align == "center":
                    p.alignment = PP_ALIGN.CENTER
                elif align == "right":
                    p.alignment = PP_ALIGN.RIGHT
                else:
                    p.alignment = PP_ALIGN.LEFT

        # No parsed slides at all: emit a single branded title slide.
        if not slides_data:
            slide = prs.slides.add_slide(prs.slide_layouts[0])  # title slide layout
            title_style = styles.get("title", {})
            title_shape = slide.shapes.title
            if title_shape is not None:
                title_shape.text = title
                title_font = title_shape.text_frame.paragraphs[0].font
                if title_font:
                    title_font.size = Pt(title_style.get("font_size", 48))
                    title_font.bold = title_style.get("bold", True)
                    title_font.color.rgb = RGBColor(*self._get_safe_color(title_style.get("color", (31, 78, 121))))

            subtitle_shape = slide.placeholders[1]
            subtitle_shape.text = "Generated by PowerOn AI System"
            paragraph_style = styles.get("paragraph", {})
            subtitle_font = subtitle_shape.text_frame.paragraphs[0].font
            if subtitle_font:
                subtitle_font.size = Pt(paragraph_style.get("font_size", 20))
                subtitle_font.bold = paragraph_style.get("bold", False)
                subtitle_font.color.rgb = RGBColor(*self._get_safe_color(paragraph_style.get("color", (47, 47, 47))))

        # Serialize the presentation to base64 for transport.
        buffer = io.BytesIO()
        prs.save(buffer)
        pptx_bytes = buffer.getvalue()
        pptx_base64 = base64.b64encode(pptx_bytes).decode('utf-8')

        logger.info(f"Successfully rendered PowerPoint presentation: {len(pptx_bytes)} bytes")
        return pptx_base64, "application/vnd.openxmlformats-officedocument.presentationml.presentation"

    except ImportError:
        logger.error("python-pptx library not installed. Install with: pip install python-pptx")
        return "python-pptx library not installed", "text/plain"
    except Exception as e:
        logger.error(f"Error rendering PowerPoint presentation: {str(e)}")
        return f"Error rendering PowerPoint presentation: {str(e)}", "text/plain"
|
||||
|
||||
def _parse_content_to_slides(self, content: str, title: str) -> list:
    """Parse raw text content into a list of slide dicts.

    Each returned dict has "title" and "content" keys. Slide titles come
    from a leading markdown header, from a short non-sentence first line,
    or default to "Slide N" (N is the section's position, 1-based).
    """
    slides = []

    for index, section in enumerate(self._split_content_into_slides(content)):
        body = section.strip()
        if not body:
            continue

        slide = {"title": f"Slide {index + 1}", "content": body}
        lines = body.split('\n')

        if lines and lines[0].startswith('#'):
            # Markdown header: strip the '#' markers and promote it to title.
            slide["title"] = lines[0].lstrip('#').strip()
            slide["content"] = '\n'.join(lines[1:]).strip()
        elif lines and lines[0].strip():
            # Heuristic: a short first line that is not a sentence is a title.
            candidate = lines[0].strip()
            if len(candidate) < 100 and not candidate.endswith('.'):
                slide["title"] = candidate
                slide["content"] = '\n'.join(lines[1:]).strip()

        slides.append(slide)

    return slides
|
||||
|
||||
def _split_content_into_slides(self, content: str) -> list:
|
||||
"""
|
||||
Split content into individual slides based on headers and structure.
|
||||
|
||||
Args:
|
||||
content: Content to split
|
||||
|
||||
Returns:
|
||||
List of slide content strings
|
||||
"""
|
||||
import re
|
||||
|
||||
# First, try to split by major headers (# or ##)
|
||||
# This is the most common case for AI-generated content
|
||||
header_pattern = r'^(#{1,2})\s+(.+)$'
|
||||
lines = content.split('\n')
|
||||
slides = []
|
||||
current_slide = []
|
||||
|
||||
for line in lines:
|
||||
# Check if this line is a header
|
||||
header_match = re.match(header_pattern, line.strip())
|
||||
if header_match:
|
||||
# If we have content in current slide, save it
|
||||
if current_slide:
|
||||
slide_content = '\n'.join(current_slide).strip()
|
||||
if slide_content:
|
||||
slides.append(slide_content)
|
||||
current_slide = []
|
||||
|
||||
# Start new slide with this header
|
||||
current_slide.append(line)
|
||||
else:
|
||||
# Add line to current slide
|
||||
current_slide.append(line)
|
||||
|
||||
# Add the last slide
|
||||
if current_slide:
|
||||
slide_content = '\n'.join(current_slide).strip()
|
||||
if slide_content:
|
||||
slides.append(slide_content)
|
||||
|
||||
# If we found slides with headers, return them
|
||||
if len(slides) > 1:
|
||||
return slides
|
||||
|
||||
# Fallback: Split by double newlines
|
||||
sections = content.split('\n\n\n')
|
||||
if len(sections) > 1:
|
||||
return [s.strip() for s in sections if s.strip()]
|
||||
|
||||
# Another fallback: Split by double newlines
|
||||
sections = content.split('\n\n')
|
||||
if len(sections) > 1:
|
||||
return [s.strip() for s in sections if s.strip()]
|
||||
|
||||
# Last resort: return as single slide
|
||||
return [content.strip()]
|
||||
|
||||
|
||||
def get_output_mime_type(self) -> str:
    """Return the MIME type reported for the rendered artifact."""
    return self.output_mime_type
|
||||
|
||||
async def _get_pptx_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]:
    """Resolve PowerPoint styling definitions.

    Builds the default style schema, asks the AI service to customize it
    (falling back to the built-in defaults when unavailable), then clamps
    the result to readable minimum font sizes.
    """
    # Schema presented to the AI; colors are hex strings here and are
    # converted to RGB tuples by _get_ai_styles_with_pptx_colors.
    schema: Dict[str, Any] = {
        "title": {"font_size": 52, "color": "#1B365D", "bold": True, "align": "center"},
        "heading": {"font_size": 36, "color": "#2C5F2D", "bold": True, "align": "left"},
        "subheading": {"font_size": 28, "color": "#4A90E2", "bold": True, "align": "left"},
        "paragraph": {"font_size": 20, "color": "#2F2F2F", "bold": False, "align": "left"},
        "bullet_list": {"font_size": 20, "color": "#2F2F2F", "indent": 20},
        "table_header": {"font_size": 18, "color": "#FFFFFF", "bold": True, "background": "#1B365D"},
        "table_cell": {"font_size": 16, "color": "#2F2F2F", "bold": False, "background": "#F8F9FA"},
        "slide_size": "16:9",
        "content_per_slide": "concise",
        "design_theme": "corporate",
        "color_scheme": "professional",
        "background_style": "clean",
        "accent_colors": ["#1B365D", "#2C5F2D", "#4A90E2", "#6B7280"],
        "professional_grade": True,
        "executive_ready": True,
    }

    prompt = self._create_professional_pptx_template(user_prompt, schema)
    # This helper performs the hex -> RGB-tuple conversion PowerPoint needs.
    resolved = await self._get_ai_styles_with_pptx_colors(ai_service, prompt, self._get_default_pptx_styles())

    # Enforce PowerPoint-specific readability floors before use.
    return self._validate_pptx_styles_readability(resolved)
|
||||
|
||||
def _create_professional_pptx_template(self, user_prompt: str, style_schema: Dict[str, Any]) -> str:
|
||||
"""Create a professional PowerPoint-specific AI style template for corporate-quality slides."""
|
||||
import json
|
||||
schema_json = json.dumps(style_schema, indent=4)
|
||||
|
||||
return f"""Customize the JSON below for professional PowerPoint slides.
|
||||
|
||||
User Request: {user_prompt or "Create professional corporate slides"}
|
||||
|
||||
Rules:
|
||||
- Use professional colors (blues, grays, deep greens)
|
||||
- Large, readable font sizes
|
||||
- High contrast
|
||||
- Sophisticated color palettes
|
||||
|
||||
Return ONLY this JSON with your changes:
|
||||
|
||||
{schema_json}
|
||||
|
||||
JSON ONLY. NO OTHER TEXT."""
|
||||
|
||||
async def _get_ai_styles_with_pptx_colors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
    """Ask the AI service for style JSON and convert its colors for PowerPoint.

    Sends `style_template` to the AI service, sanitizes the response (code
    fences, explanatory prose) down to a JSON object, parses it, and converts
    hex colors to RGB tuples via `_convert_colors_format`. Any failure at any
    stage falls back to `default_styles` — this method never raises.

    Args:
        ai_service: Service exposing `aiObjects.call(...)`; may be None.
        style_template: Prompt instructing the model to return JSON only.
        default_styles: Fallback styles returned on any failure.

    Returns:
        Parsed (and color-converted) style dict, or `default_styles`.
    """
    if not ai_service:
        return default_styles

    try:
        # Project-local import kept inside the method so the renderer module
        # loads even when the AI datamodels are unavailable.
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL

        request = AiCallRequest(prompt=style_template, context="", options=request_options)

        # Check if AI service is properly configured
        if not hasattr(ai_service, 'aiObjects') or not ai_service.aiObjects:
            self.logger.warning("AI service not properly configured, using defaults")
            return default_styles

        response = await ai_service.aiObjects.call(request)

        # Check if response is valid
        if not response:
            self.logger.warning("AI service returned no response, using defaults")
            return default_styles

        import json
        import re

        # Clean and parse JSON
        result = response.content.strip() if response and response.content else ""

        # Check if result is empty
        if not result:
            self.logger.warning("AI styling returned empty response, using defaults")
            return default_styles

        # Log the raw response for debugging
        self.logger.debug(f"AI styling raw response: {result[:200]}...")

        # Strip markdown code fences in their common variants; the ordering
        # matters (full fenced block, then ```json prefix, then bare ```).
        json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
        if json_match:
            result = json_match.group(1).strip()
        elif result.startswith('```json'):
            result = re.sub(r'^```json\s*', '', result)
            result = re.sub(r'\s*```$', '', result)
        elif result.startswith('```'):
            result = re.sub(r'^```\s*', '', result)
            result = re.sub(r'\s*```$', '', result)

        # Try to extract JSON from explanatory text; first matching pattern
        # wins, ordered from most to least specific.
        json_patterns = [
            r'\{[^{}]*"title"[^{}]*\}',  # Simple JSON object
            r'\{.*?"title".*?\}',  # JSON with title field
            r'\{.*?"font_size".*?\}',  # JSON with font_size field
        ]

        for pattern in json_patterns:
            json_match = re.search(pattern, result, re.DOTALL)
            if json_match:
                result = json_match.group(0)
                break

        # Additional cleanup - remove any leading/trailing whitespace and newlines
        result = result.strip()

        # Check if result is still empty after cleanup
        if not result:
            self.logger.warning("AI styling returned empty content after cleanup, using defaults")
            return default_styles

        # Try to parse JSON
        try:
            styles = json.loads(result)
            self.logger.debug(f"Successfully parsed AI styles: {list(styles.keys())}")
        except json.JSONDecodeError as json_error:
            self.logger.warning(f"AI styling returned invalid JSON: {json_error}")
            self.logger.warning(f"Raw content that failed to parse: {result[:100]}...")
            # Last-chance recovery: take the outermost {...} span and retry.
            json_start = result.find('{')
            json_end = result.rfind('}')
            if json_start != -1 and json_end != -1 and json_end > json_start:
                json_part = result[json_start:json_end+1]
                try:
                    styles = json.loads(json_part)
                    self.logger.info("Successfully extracted JSON from explanatory text")
                    self.logger.debug(f"Extracted AI styles: {list(styles.keys())}")
                except json.JSONDecodeError:
                    self.logger.warning("Could not extract valid JSON from response, using defaults")
                    return default_styles
            else:
                return default_styles

        # Convert hex colors to the RGB tuples python-pptx expects.
        styles = self._convert_colors_format(styles)

        return styles

    except Exception as e:
        # Broad catch is deliberate: styling must never break rendering.
        self.logger.warning(f"AI styling failed: {str(e)}, using defaults")
        return default_styles
|
||||
|
||||
def _convert_colors_format(self, styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Convert hex colors to RGB format for PowerPoint compatibility."""
|
||||
try:
|
||||
for style_name, style_config in styles.items():
|
||||
if isinstance(style_config, dict):
|
||||
for prop, value in style_config.items():
|
||||
if isinstance(value, str) and value.startswith('#'):
|
||||
# Convert hex to RGB tuple for PowerPoint
|
||||
hex_color = value.lstrip('#')
|
||||
if len(hex_color) == 6:
|
||||
r = int(hex_color[0:2], 16)
|
||||
g = int(hex_color[2:4], 16)
|
||||
b = int(hex_color[4:6], 16)
|
||||
styles[style_name][prop] = (r, g, b)
|
||||
elif len(hex_color) == 8: # aRGB format
|
||||
r = int(hex_color[2:4], 16)
|
||||
g = int(hex_color[4:6], 16)
|
||||
b = int(hex_color[6:8], 16)
|
||||
styles[style_name][prop] = (r, g, b)
|
||||
return styles
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Color conversion failed: {str(e)}")
|
||||
return styles
|
||||
|
||||
def _get_safe_color(self, color_value, default=(0, 0, 0)) -> tuple:
|
||||
"""Get a safe RGB color tuple for PowerPoint."""
|
||||
if isinstance(color_value, tuple) and len(color_value) == 3:
|
||||
return color_value
|
||||
elif isinstance(color_value, str) and color_value.startswith('#'):
|
||||
hex_color = color_value.lstrip('#')
|
||||
if len(hex_color) == 6:
|
||||
r = int(hex_color[0:2], 16)
|
||||
g = int(hex_color[2:4], 16)
|
||||
b = int(hex_color[4:6], 16)
|
||||
return (r, g, b)
|
||||
elif len(hex_color) == 8: # aRGB format
|
||||
r = int(hex_color[2:4], 16)
|
||||
g = int(hex_color[4:6], 16)
|
||||
b = int(hex_color[6:8], 16)
|
||||
return (r, g, b)
|
||||
return default
|
||||
|
||||
def _validate_pptx_styles_readability(self, styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Validate and fix readability issues in AI-generated styles."""
|
||||
try:
|
||||
# Ensure minimum font sizes for PowerPoint readability
|
||||
min_font_sizes = {
|
||||
"title": 36,
|
||||
"heading": 24,
|
||||
"subheading": 20,
|
||||
"paragraph": 14,
|
||||
"bullet_list": 14,
|
||||
"table_header": 12,
|
||||
"table_cell": 12
|
||||
}
|
||||
|
||||
for style_name, min_size in min_font_sizes.items():
|
||||
if style_name in styles:
|
||||
current_size = styles[style_name].get("font_size", 12)
|
||||
if current_size < min_size:
|
||||
styles[style_name]["font_size"] = min_size
|
||||
|
||||
return styles
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Style validation failed: {str(e)}")
|
||||
return self._get_default_pptx_styles()
|
||||
|
||||
def _get_default_pptx_styles(self) -> Dict[str, Any]:
|
||||
"""Default PowerPoint styles with corporate professional color scheme."""
|
||||
return {
|
||||
"title": {"font_size": 52, "color": (27, 54, 93), "bold": True, "align": "center"},
|
||||
"heading": {"font_size": 36, "color": (44, 95, 45), "bold": True, "align": "left"},
|
||||
"subheading": {"font_size": 28, "color": (74, 144, 226), "bold": True, "align": "left"},
|
||||
"paragraph": {"font_size": 20, "color": (47, 47, 47), "bold": False, "align": "left"},
|
||||
"bullet_list": {"font_size": 20, "color": (47, 47, 47), "indent": 20},
|
||||
"table_header": {"font_size": 18, "color": (255, 255, 255), "bold": True, "background": (27, 54, 93)},
|
||||
"table_cell": {"font_size": 16, "color": (47, 47, 47), "bold": False, "background": (248, 249, 250)},
|
||||
"slide_size": "16:9",
|
||||
"content_per_slide": "concise",
|
||||
"design_theme": "corporate",
|
||||
"color_scheme": "professional",
|
||||
"background_style": "clean",
|
||||
"accent_colors": [(27, 54, 93), (44, 95, 45), (74, 144, 226), (107, 114, 128)],
|
||||
"professional_grade": True,
|
||||
"executive_ready": True
|
||||
}
|
||||
|
||||
async def _parse_json_to_slides(self, json_content: Dict[str, Any], title: str, styles: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Parse structured JSON content into slide dicts.

    Produces a title slide followed by content slides derived from the
    JSON "sections" list. Falls back to a single error slide if parsing
    fails, and to a placeholder content slide if no sections produced
    anything.

    Args:
        json_content: Dict with a "sections" list (and optional "metadata").
        title: Fallback presentation title.
        styles: AI-generated styles (forwarded to slide creation).

    Returns:
        List of {"title", "content"} dicts.
    """
    try:
        if not isinstance(json_content, dict):
            raise ValueError("JSON content must be a dictionary")
        if "sections" not in json_content:
            raise ValueError("JSON content must contain 'sections' field")

        # Metadata title takes priority over the caller-supplied one.
        document_title = json_content.get("metadata", {}).get("title", title)

        slides: List[Dict[str, Any]] = [{
            "title": document_title,
            "content": "Generated by PowerOn AI System\n\n" + self._format_timestamp(),
        }]

        slides.extend(self._create_slides_from_sections(json_content.get("sections", []), styles))

        # Only the title slide present -> add an explanatory placeholder.
        if len(slides) == 1:
            slides.append({
                "title": "Content Overview",
                "content": "No structured content found in the source documents.\n\nPlease check the source documents and try again.",
            })

        return slides

    except Exception as e:
        logger.error(f"Error parsing JSON to slides: {str(e)}")
        return [{
            "title": title,
            "content": "Error parsing content for presentation",
        }]
|
||||
|
||||
def _create_slide_from_section(self, section: Dict[str, Any], styles: Dict[str, Any]) -> Dict[str, Any]:
    """Build one {"title", "content"} slide dict from a JSON section.

    Title: the first text element for heading sections, else the section's
    own "title", else "Untitled Section". Content is produced by the
    formatter matching the section's content_type. Returns None on error.
    """
    try:
        section_title = "Untitled Section"
        if section.get("content_type") == "heading":
            # Heading sections take their title from the first text element.
            for element in section.get("elements", []):
                if isinstance(element, dict) and "text" in element:
                    section_title = element.get("text", "Untitled Section")
                    break
        elif section.get("title"):
            section_title = section.get("title")

        content_type = section.get("content_type", "paragraph")
        elements = section.get("elements", [])

        # Dispatch to the content-type-specific formatter (paragraph default).
        formatters = {
            "table": self._format_table_for_slide,
            "list": self._format_list_for_slide,
            "heading": self._format_heading_for_slide,
            "paragraph": self._format_paragraph_for_slide,
            "code": self._format_code_for_slide,
        }
        formatter = formatters.get(content_type, self._format_paragraph_for_slide)
        slide_content = "\n\n".join(filter(None, [formatter(elements)]))

        return {
            "title": section_title,
            "content": slide_content,
        }

    except Exception as e:
        logger.warning(f"Error creating slide from section: {str(e)}")
        return None
|
||||
|
||||
def _format_table_for_slide(self, elements: List[Dict[str, Any]]) -> str:
|
||||
"""Format table data for slide presentation."""
|
||||
try:
|
||||
# Extract table data from elements array
|
||||
headers = []
|
||||
rows = []
|
||||
for element in elements:
|
||||
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
||||
headers = element.get("headers", [])
|
||||
rows = element.get("rows", [])
|
||||
break
|
||||
|
||||
if not headers:
|
||||
return ""
|
||||
|
||||
# Create table representation
|
||||
table_lines = []
|
||||
|
||||
# Add headers
|
||||
header_line = " | ".join(str(h) for h in headers)
|
||||
table_lines.append(header_line)
|
||||
|
||||
# Add separator
|
||||
separator = "-" * len(header_line)
|
||||
table_lines.append(separator)
|
||||
|
||||
# Add data rows (limit based on content density)
|
||||
max_rows = 5 # Default limit
|
||||
for row in rows[:max_rows]:
|
||||
row_line = " | ".join(str(cell) for cell in row)
|
||||
table_lines.append(row_line)
|
||||
|
||||
if len(rows) > max_rows:
|
||||
table_lines.append(f"... and {len(rows) - max_rows} more rows")
|
||||
|
||||
return "\n".join(table_lines)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error formatting table for slide: {str(e)}")
|
||||
return ""
|
||||
|
||||
def _format_list_for_slide(self, list_data: Dict[str, Any]) -> str:
|
||||
"""Format list data for slide presentation."""
|
||||
try:
|
||||
items = list_data.get("items", [])
|
||||
|
||||
if not items:
|
||||
return ""
|
||||
|
||||
# Create list representation
|
||||
list_lines = []
|
||||
|
||||
for item in items:
|
||||
if isinstance(item, dict):
|
||||
text = item.get("text", "")
|
||||
list_lines.append(f"• {text}")
|
||||
|
||||
# Add subitems (limit to 3 for readability)
|
||||
subitems = item.get("subitems", [])[:3]
|
||||
for subitem in subitems:
|
||||
if isinstance(subitem, dict):
|
||||
list_lines.append(f" - {subitem.get('text', '')}")
|
||||
else:
|
||||
list_lines.append(f" - {subitem}")
|
||||
else:
|
||||
list_lines.append(f"• {str(item)}")
|
||||
|
||||
return "\n".join(list_lines)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error formatting list for slide: {str(e)}")
|
||||
return ""
|
||||
|
||||
def _format_heading_for_slide(self, heading_data: Dict[str, Any]) -> str:
|
||||
"""Format heading data for slide presentation."""
|
||||
try:
|
||||
text = heading_data.get("text", "")
|
||||
level = heading_data.get("level", 1)
|
||||
|
||||
if text:
|
||||
return f"{'#' * level} {text}"
|
||||
|
||||
return ""
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error formatting heading for slide: {str(e)}")
|
||||
return ""
|
||||
|
||||
def _format_paragraph_for_slide(self, paragraph_data: Dict[str, Any]) -> str:
|
||||
"""Format paragraph data for slide presentation."""
|
||||
try:
|
||||
text = paragraph_data.get("text", "")
|
||||
|
||||
if text:
|
||||
# Limit paragraph length based on content density
|
||||
max_length = 200 # Default limit
|
||||
if len(text) > max_length:
|
||||
text = text[:max_length] + "..."
|
||||
|
||||
return text
|
||||
|
||||
return ""
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error formatting paragraph for slide: {str(e)}")
|
||||
return ""
|
||||
|
||||
def _format_code_for_slide(self, code_data: Dict[str, Any]) -> str:
|
||||
"""Format code data for slide presentation."""
|
||||
try:
|
||||
code = code_data.get("code", "")
|
||||
language = code_data.get("language", "")
|
||||
|
||||
if code:
|
||||
# Limit code length based on content density
|
||||
max_length = 100 # Default limit
|
||||
if len(code) > max_length:
|
||||
code = code[:max_length] + "..."
|
||||
|
||||
if language:
|
||||
return f"Code ({language}):\n{code}"
|
||||
else:
|
||||
return f"Code:\n{code}"
|
||||
|
||||
return ""
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error formatting code for slide: {str(e)}")
|
||||
return ""
|
||||
|
||||
def _get_slide_layout_index(self, slide_data: Dict[str, Any], styles: Dict[str, Any]) -> int:
|
||||
"""Determine the best professional slide layout based on content."""
|
||||
try:
|
||||
content = slide_data.get("content", "")
|
||||
title = slide_data.get("title", "")
|
||||
|
||||
# Check if it's a title slide (first slide)
|
||||
if not content or "Generated by PowerOn AI System" in content:
|
||||
return 0 # Title slide layout
|
||||
|
||||
# Professional layout selection based on content
|
||||
if "|" in content and "-" in content:
|
||||
# Has both tables and lists - use content with caption for professional look
|
||||
return 2
|
||||
elif "|" in content:
|
||||
# Has tables - use content layout for clean table presentation
|
||||
return 1
|
||||
elif content.count("•") > 2:
|
||||
# Has many bullet points - use content layout for better readability
|
||||
return 1
|
||||
elif len(content) > 200:
|
||||
# Long content - use content layout for better text flow
|
||||
return 1
|
||||
elif title and len(title) > 20:
|
||||
# Long title - use title and content layout
|
||||
return 1
|
||||
else:
|
||||
# Default to title and content layout for professional appearance
|
||||
return 1
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error determining slide layout: {str(e)}")
|
||||
return 1 # Default to title and content layout
|
||||
|
||||
def _create_slides_from_sections(self, sections: List[Dict[str, Any]], styles: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Group JSON sections into slides, breaking at heading sections.

    Each heading starts a new slide titled by the heading's first text
    element; non-heading sections accumulate formatted content under the
    current title ("Content Overview" until the first heading). Returns []
    on error.
    """
    try:
        slides: List[Dict[str, Any]] = []
        # NOTE(review): read but not used yet — presumably intended to tune
        # how much content lands on each slide.
        content_per_slide = styles.get("content_per_slide", "concise")

        pending: List[str] = []
        current_title = "Content Overview"

        def _flush():
            # Emit the accumulated content as one slide, if any.
            if pending:
                slides.append({
                    "title": current_title,
                    "content": "\n\n".join(pending),
                })
                pending.clear()

        for section in sections:
            if section.get("content_type", "paragraph") == "heading":
                # Close the previous slide (under its old title) first.
                _flush()
                for element in section.get("elements", []):
                    if isinstance(element, dict) and "text" in element:
                        current_title = element.get("text", "Untitled Section")
                        break
            else:
                formatted = self._format_section_content(section)
                if formatted:
                    pending.append(formatted)

        _flush()
        return slides

    except Exception as e:
        logger.warning(f"Error creating slides from sections: {str(e)}")
        return []
|
||||
|
||||
def _format_section_content(self, section: Dict[str, Any]) -> str:
    """Format all elements of a section into slide-ready text.

    Each element is run through the formatter matching the section's
    content_type (paragraph by default) and the non-empty results are
    joined with blank lines. Returns "" on error.
    """
    try:
        content_type = section.get("content_type", "paragraph")

        # content_type -> formatter; unknown types fall back to paragraph.
        dispatch = {
            "table": self._format_table_for_slide,
            "list": self._format_list_for_slide,
            "heading": self._format_heading_for_slide,
            "paragraph": self._format_paragraph_for_slide,
            "code": self._format_code_for_slide,
        }
        formatter = dispatch.get(content_type, self._format_paragraph_for_slide)

        # Each element is formatted individually (wrapped in a one-item list).
        parts = [formatter([element]) for element in section.get("elements", [])]
        return "\n\n".join(filter(None, parts))

    except Exception as e:
        logger.warning(f"Error formatting section content: {str(e)}")
        return ""
|
||||
|
||||
def _format_timestamp(self) -> str:
|
||||
"""Format current timestamp for presentation generation."""
|
||||
from datetime import datetime, UTC
|
||||
return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
|
||||
# ===== new file (256 lines): modules/services/serviceGeneration/renderers/rendererText.py =====
|
|||
"""
|
||||
Text renderer for report generation.
|
||||
"""
|
||||
|
||||
from .rendererBaseTemplate import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
|
||||
class RendererText(BaseRenderer):
    """Renders content to plain text format with format-specific extraction.

    Consumes the canonical extracted-JSON document shape
    ({"metadata": {...}, "sections": [...]}) and emits a plain-text report.
    A failure in any single section degrades to an inline error marker so
    one bad section cannot abort the whole render.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported text formats (excluding formats with dedicated renderers)."""
        return [
            'txt', 'text', 'plain',
            # Programming languages
            'js', 'javascript', 'ts', 'typescript', 'jsx', 'tsx',
            'py', 'python', 'java', 'cpp', 'c', 'h', 'hpp',
            'cs', 'csharp', 'php', 'rb', 'ruby', 'go', 'rs', 'rust',
            'swift', 'kt', 'kotlin', 'scala', 'r', 'm', 'objc',
            'sh', 'bash', 'zsh', 'fish', 'ps1', 'bat', 'cmd',
            # Web technologies (excluding html/htm which have dedicated renderer)
            'css', 'scss', 'sass', 'less', 'xml', 'yaml', 'yml', 'toml', 'ini', 'cfg',
            # Data formats (excluding csv, md/markdown which have dedicated renderers)
            'tsv', 'log', 'rst', 'sql', 'dockerfile', 'dockerignore', 'gitignore',
            # Configuration files
            'env', 'properties', 'conf', 'config', 'rc',
            'gitattributes', 'editorconfig', 'eslintrc',
            # Documentation
            'readme', 'changelog', 'license', 'authors',
            'contributing', 'todo', 'notes', 'docs'
        ]

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases that also resolve to this renderer."""
        return [
            'ascii', 'utf8', 'utf-8', 'code', 'source',
            'script', 'program', 'file', 'document',
            'raw', 'unformatted', 'plaintext'
        ]

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for text renderer."""
        return 90

    async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
        """Render extracted JSON content to plain text format.

        Args:
            extracted_content: Canonical JSON document (dict with "sections").
            title: Fallback report title (a metadata title wins).
            user_prompt: Unused here; kept for renderer-interface parity.
            ai_service: Unused here; kept for renderer-interface parity.

        Returns:
            Tuple of (text content, "text/plain"). On failure a minimal
            error document is returned instead of raising.
        """
        try:
            # Generate text from JSON structure
            text_content = self._generate_text_from_json(extracted_content, title)
            return text_content, "text/plain"
        except Exception as e:
            self.logger.error(f"Error rendering text: {str(e)}")
            # Return minimal text fallback
            return f"{title}\n\nError rendering report: {str(e)}", "text/plain"

    def _generate_text_from_json(self, json_content: Dict[str, Any], title: str) -> str:
        """Generate the full text document from structured JSON.

        Raises:
            Exception: Wrapping any validation or rendering failure.
        """
        try:
            # Validate JSON structure
            if not isinstance(json_content, dict):
                raise ValueError("JSON content must be a dictionary")
            if "sections" not in json_content:
                raise ValueError("JSON content must contain 'sections' field")

            # Metadata title takes precedence over the caller-supplied one.
            document_title = json_content.get("metadata", {}).get("title", title)

            text_parts = []

            # Document title, underlined with '=' to its full width.
            text_parts.append(document_title)
            text_parts.append("=" * len(document_title))
            text_parts.append("")

            # Render each section; empty results are dropped so no stray
            # blank blocks are emitted.
            for section in json_content.get("sections", []):
                section_text = self._render_json_section(section)
                if section_text:
                    text_parts.append(section_text)
                    text_parts.append("")  # spacing between sections

            # Trailing generation stamp.
            text_parts.append("")
            text_parts.append(f"Generated: {self._format_timestamp()}")

            return '\n'.join(text_parts)
        except Exception as e:
            self.logger.error(f"Error generating text from JSON: {str(e)}")
            raise Exception(f"Text generation failed: {str(e)}")

    def _render_json_section(self, section: Dict[str, Any]) -> str:
        """Render a single JSON section to text, dispatching on its type.

        Unknown types fall back to paragraph rendering; errors yield an
        inline "[Error rendering section: ...]" marker.
        """
        try:
            section_type = self._get_section_type(section)
            # section_data is already the elements array for element-wise types.
            section_data = self._get_section_data(section)

            if section_type == "table":
                return self._render_json_table(self._process_section_by_type(section))
            elif section_type == "bullet_list":
                return self._render_json_bullet_list(self._process_section_by_type(section))
            elif section_type == "heading":
                return "\n".join(self._render_json_heading(element) for element in section_data)
            elif section_type == "paragraph":
                return "\n".join(self._render_json_paragraph(element) for element in section_data)
            elif section_type == "code_block":
                return self._render_json_code_block(self._process_section_by_type(section))
            elif section_type == "image":
                return self._render_json_image(self._process_section_by_type(section))
            else:
                # Fallback to paragraph rendering for unknown types.
                return "\n".join(self._render_json_paragraph(element) for element in section_data)
        except Exception as e:
            self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
            return f"[Error rendering section: {str(e)}]"

    def _render_json_table(self, table_data: Dict[str, Any]) -> str:
        """Render a processed table (headers + rows) as pipe-separated text."""
        try:
            headers = table_data.get("headers", [])
            rows = table_data.get("rows", [])
            if not headers or not rows:
                return ""

            text_parts = [" | ".join(str(header) for header in headers)]
            # Dashed separator line sized to each header.
            text_parts.append(" | ".join("-" * len(str(header)) for header in headers))
            for row in rows:
                text_parts.append(" | ".join(str(cell_data) for cell_data in row))
            return '\n'.join(text_parts)
        except Exception as e:
            self.logger.warning(f"Error rendering table: {str(e)}")
            return ""

    def _render_json_bullet_list(self, list_data: Dict[str, Any]) -> str:
        """Render list items as "- item" lines.

        Items may be plain strings or {"text": ...} dicts; anything else
        is silently skipped.
        """
        try:
            items = list_data.get("items", [])
            if not items:
                return ""
            text_parts = []
            for item in items:
                if isinstance(item, str):
                    text_parts.append(f"- {item}")
                elif isinstance(item, dict) and "text" in item:
                    text_parts.append(f"- {item['text']}")
            return '\n'.join(text_parts)
        except Exception as e:
            self.logger.warning(f"Error rendering bullet list: {str(e)}")
            return ""

    def _render_json_heading(self, heading_data: Dict[str, Any]) -> str:
        """Render a heading: '=' underline for level 1, '-' underline for
        level 2, markdown-style '#' prefixes for levels 3-6."""
        try:
            level = heading_data.get("level", 1)
            text = heading_data.get("text", "")
            if text:
                level = max(1, min(6, level))  # clamp to the valid 1..6 range
                if level == 1:
                    return f"{text}\n{'=' * len(text)}"
                elif level == 2:
                    return f"{text}\n{'-' * len(text)}"
                else:
                    return f"{'#' * level} {text}"
            return ""
        except Exception as e:
            self.logger.warning(f"Error rendering heading: {str(e)}")
            return ""

    def _render_json_paragraph(self, paragraph_data: Dict[str, Any]) -> str:
        """Render a paragraph element: its "text" field, or "" when absent."""
        try:
            text = paragraph_data.get("text", "")
            return text if text else ""
        except Exception as e:
            self.logger.warning(f"Error rendering paragraph: {str(e)}")
            return ""

    def _render_json_code_block(self, code_data: Dict[str, Any]) -> str:
        """Render a code block, prefixed with its language when known."""
        try:
            code = code_data.get("code", "")
            language = code_data.get("language", "")
            if code:
                if language:
                    return f"Code ({language}):\n{code}"
                else:
                    return code
            return ""
        except Exception as e:
            self.logger.warning(f"Error rendering code block: {str(e)}")
            return ""

    def _render_json_image(self, image_data: Dict[str, Any]) -> str:
        """Render an image element as a "[Image: alt]" placeholder.

        Fix: the previous except handler re-read image_data.get(...), which
        raised a fresh TypeError/AttributeError whenever image_data was not
        a dict -- the very failure most likely to have triggered the
        handler. Alt text is now extracted defensively instead.
        """
        try:
            alt_text = image_data.get("altText", "Image")
        except AttributeError as e:
            # image_data is not a dict-like object; degrade gracefully.
            self.logger.warning(f"Error rendering image: {str(e)}")
            alt_text = "Image"
        return f"[Image: {alt_text}]"
|
||||
# ===== new file (791 lines): modules/services/serviceGeneration/renderers/rendererXlsx.py =====
|
|||
"""
|
||||
Excel renderer for report generation using openpyxl.
|
||||
"""
|
||||
|
||||
from .rendererBaseTemplate import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
import io
|
||||
import base64
|
||||
from datetime import datetime, UTC
|
||||
|
||||
# Optional dependency: Excel output requires openpyxl. When it is absent,
# OPENPYXL_AVAILABLE is False and render() falls back to CSV output.
try:
    from openpyxl import Workbook
    from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
    from openpyxl.utils import get_column_letter
    from openpyxl.worksheet.table import Table, TableStyleInfo
    OPENPYXL_AVAILABLE = True
except ImportError:
    OPENPYXL_AVAILABLE = False
|
||||
|
||||
class RendererXlsx(BaseRenderer):
|
||||
"""Renders content to Excel format using openpyxl."""
|
||||
|
||||
@classmethod
def get_supported_formats(cls) -> List[str]:
    """List the spreadsheet extensions handled by this renderer."""
    formats = ['xlsx', 'xls', 'excel']
    return formats
|
||||
|
||||
@classmethod
def get_format_aliases(cls) -> List[str]:
    """List alternative names that also resolve to the Excel renderer."""
    aliases = ['spreadsheet', 'workbook']
    return aliases
|
||||
|
||||
@classmethod
def get_priority(cls) -> int:
    """Selection priority of the Excel renderer among registered renderers."""
    priority = 110
    return priority
|
||||
|
||||
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
    """Render extracted JSON content to an Excel workbook.

    Returns (base64 xlsx, xlsx mimetype) on success. When openpyxl is not
    installed the content is delegated to the CSV renderer; on any other
    failure a minimal CSV error payload is returned instead of raising.
    """
    try:
        if not OPENPYXL_AVAILABLE:
            # Degrade to CSV output when openpyxl is missing.
            from .rendererCsv import RendererCsv
            fallback = RendererCsv()
            csv_text, _ = await fallback.render(extracted_content, title, user_prompt, ai_service)
            return csv_text, "text/csv"

        # Build the workbook with AI-analyzed styling.
        workbook_b64 = await self._generate_excel_from_json(extracted_content, title, user_prompt, ai_service)
        return workbook_b64, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    except Exception as e:
        self.logger.error(f"Error rendering Excel: {str(e)}")
        return f"Title,Content\n{title},Error rendering Excel report: {str(e)}", "text/csv"
|
||||
|
||||
def _generate_excel(self, content: str, title: str) -> str:
    """Build a three-sheet workbook (Summary / Data / Analysis) from text.

    Returns the serialized .xlsx file as a base64 string; logs and
    re-raises on failure.
    """
    try:
        workbook = Workbook()
        workbook.remove(workbook.active)  # drop the implicit default sheet

        # One populate step per sheet, in tab order.
        self._populate_summary_sheet(workbook.create_sheet("Summary", 0), title)
        self._populate_data_sheet(workbook.create_sheet("Data", 1), content)
        self._populate_analysis_sheet(workbook.create_sheet("Analysis", 2), content)

        # Serialize in memory and base64-encode for transport.
        stream = io.BytesIO()
        workbook.save(stream)
        stream.seek(0)
        return base64.b64encode(stream.getvalue()).decode('utf-8')
    except Exception as e:
        self.logger.error(f"Error generating Excel: {str(e)}")
        raise
|
||||
|
||||
def _populate_summary_sheet(self, sheet, title: str):
    """Fill the Summary sheet: title banner, generation info, key metrics."""
    try:
        # Centered bold title banner.
        sheet['A1'] = title
        sheet['A1'].font = Font(size=16, bold=True)
        sheet['A1'].alignment = Alignment(horizontal='center')

        # Generation metadata block.
        for ref, value in {
            'A3': "Generated:",
            'B3': self._format_timestamp(),
            'A4': "Status:",
            'B4': "Generated Successfully",
        }.items():
            sheet[ref] = value

        # Key metrics: a live formula counting Data-sheet rows.
        sheet['A6'] = "Key Metrics:"
        sheet['A6'].font = Font(bold=True)
        sheet['A7'] = "Total Items:"
        sheet['B7'] = "=COUNTA(Data!A:A)-1"

        # Fixed widths keep labels and values readable.
        sheet.column_dimensions['A'].width = 20
        sheet.column_dimensions['B'].width = 30
    except Exception as e:
        self.logger.warning(f"Could not populate summary sheet: {str(e)}")
|
||||
|
||||
def _populate_data_sheet(self, sheet, content: str):
    """Fill the Data sheet: a fixed header row plus rows parsed from content.

    Lines containing '|' fan out into up to five columns; every other
    non-blank line lands in column A.
    """
    try:
        # Fixed header row with bold text on a gray fill.
        header_font = Font(bold=True)
        header_fill = PatternFill(start_color="CCCCCC", end_color="CCCCCC", fill_type="solid")
        column_titles = ["Item/Category", "Value/Amount", "Percentage", "Source Document", "Notes/Comments"]
        for col, header in enumerate(column_titles, 1):
            header_cell = sheet.cell(row=1, column=col, value=header)
            header_cell.font = header_font
            header_cell.fill = header_fill

        row = 2
        for raw_line in content.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            if '|' in line:
                # Pipe-delimited table row.
                values = [part.strip() for part in line.split('|') if part.strip()]
                for col, value in enumerate(values[:5], 1):  # cap at 5 columns
                    sheet.cell(row=row, column=col, value=value)
            else:
                sheet.cell(row=row, column=1, value=line)
            row += 1

        for col in range(1, 6):
            sheet.column_dimensions[get_column_letter(col)].width = 20
    except Exception as e:
        self.logger.warning(f"Could not populate data sheet: {str(e)}")
|
||||
|
||||
def _populate_analysis_sheet(self, sheet, content: str):
    """Fill the Analysis sheet: line-type counts plus canned recommendations."""
    try:
        sheet['A1'] = "Analysis & Insights"
        sheet['A1'].font = Font(size=14, bold=True)

        sheet['A3'] = "Content Analysis:"
        sheet['A3'].font = Font(bold=True)

        # Classify lines: pipe rows are tables, dash/star prefixes are
        # lists, everything else counts as plain text.
        lines = content.split('\n')
        table_count = sum(1 for line in lines if '|' in line)
        list_count = sum(1 for line in lines if line.startswith(('- ', '* ')))
        stats = [
            f"Total Lines: {len(lines)}",
            f"Table Rows: {table_count}",
            f"List Items: {list_count}",
            f"Text Lines: {len(lines) - table_count - list_count}",
        ]
        row = 4
        for stat in stats:
            sheet[f'A{row}'] = stat
            row += 1
        row += 1  # blank row before recommendations

        sheet[f'A{row}'] = "Recommendations:"
        sheet[f'A{row}'].font = Font(bold=True)
        for advice in ("1. Review data accuracy",
                       "2. Consider additional analysis",
                       "3. Update regularly"):
            row += 1
            sheet[f'A{row}'] = advice

        sheet.column_dimensions['A'].width = 30
    except Exception as e:
        self.logger.warning(f"Could not populate analysis sheet: {str(e)}")
|
||||
|
||||
async def _generate_excel_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
    """Generate Excel content from a structured JSON document with AI styling.

    Args:
        json_content: Canonical extracted document; must be a dict with a
            "sections" list (optionally a "metadata" dict).
        title: Fallback report title.
        user_prompt: Original user request, forwarded to the AI styler.
        ai_service: Optional AI service used to derive workbook styling.

    Returns:
        The serialized .xlsx workbook as a base64-encoded string.

    Raises:
        Exception: Wrapping any validation or generation failure.
    """
    try:
        # Debug output
        self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT TYPE: {type(json_content)}", "EXCEL_RENDERER")
        self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT KEYS: {list(json_content.keys()) if isinstance(json_content, dict) else 'Not a dict'}", "EXCEL_RENDERER")

        # Get AI-generated styling definitions
        styles = await self._get_excel_styles(user_prompt, ai_service)

        # Validate JSON structure before touching openpyxl.
        if not isinstance(json_content, dict):
            raise ValueError("JSON content must be a dictionary")
        if "sections" not in json_content:
            raise ValueError("JSON content must contain 'sections' field")

        # NOTE: the previous version computed a document title here but never
        # used it -- sheet naming derives its own title (see
        # _generate_sheet_names_from_content) -- so the dead local was removed.

        # Create workbook and its content-driven sheets.
        wb = Workbook()
        sheets = self._create_excel_sheets(wb, json_content, styles)
        self.services.utils.debugLogToFile(f"EXCEL SHEETS CREATED: {list(sheets.keys()) if sheets else 'None'}", "EXCEL_RENDERER")

        # Populate sheets with content
        self._populate_excel_sheets(sheets, json_content, styles)

        # Serialize the workbook to an in-memory buffer.
        buffer = io.BytesIO()
        wb.save(buffer)
        buffer.seek(0)

        excel_bytes = buffer.getvalue()
        self.services.utils.debugLogToFile(f"EXCEL BYTES LENGTH: {len(excel_bytes)}", "EXCEL_RENDERER")
        try:
            excel_base64 = base64.b64encode(excel_bytes).decode('utf-8')
            self.services.utils.debugLogToFile(f"EXCEL BASE64 LENGTH: {len(excel_base64)}", "EXCEL_RENDERER")
        except Exception as b64_error:
            self.services.utils.debugLogToFile(f"BASE64 ENCODING ERROR: {b64_error}", "EXCEL_RENDERER")
            raise

        return excel_base64

    except Exception as e:
        self.logger.error(f"Error generating Excel from JSON: {str(e)}")
        # Chain the original exception so the root cause survives in tracebacks.
        raise Exception(f"Excel generation failed: {str(e)}") from e
|
||||
|
||||
async def _get_excel_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]:
    """Resolve workbook styling for this render.

    Builds the style prompt from the schema below, asks the AI via the
    Excel-specific helper (which converts colors to aRGB), and finally
    enforces readable header/cell contrast.
    """
    style_schema = {
        "title": {"font_size": 16, "color": "#FF1F4E79", "bold": True, "align": "center"},
        "heading": {"font_size": 14, "color": "#FF2F2F2F", "bold": True, "align": "left"},
        "table_header": {"background": "#FF4F4F4F", "text_color": "#FFFFFFFF", "bold": True, "align": "center"},
        "table_cell": {"background": "#FFFFFFFF", "text_color": "#FF2F2F2F", "bold": False, "align": "left"},
        "bullet_list": {"font_size": 11, "color": "#FF2F2F2F", "indent": 2},
        "paragraph": {"font_size": 11, "color": "#FF2F2F2F", "bold": False, "align": "left"},
        "code_block": {"font": "Courier New", "font_size": 10, "color": "#FF2F2F2F", "background": "#FFF5F5F5"}
    }

    prompt = self._create_ai_style_template("xlsx", user_prompt, style_schema)
    # Route through the Excel-specific AI helper (not the generic base one)
    # so colors come back in Excel's aRGB format.
    resolved = await self._get_ai_styles_with_excel_colors(ai_service, prompt, self._get_default_excel_styles())

    # Validate and fix contrast issues before use.
    return self._validate_excel_styles_contrast(resolved)
|
||||
|
||||
async def _get_ai_styles_with_excel_colors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
    """Ask the AI for style JSON and convert its colors to Excel aRGB.

    Falls back to default_styles whenever the service is missing, returns
    an empty or non-JSON payload, or raises.
    """
    if not ai_service:
        return default_styles

    try:
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

        options = AiCallOptions()
        options.operationType = OperationType.GENERAL
        response = await ai_service.aiObjects.call(
            AiCallRequest(prompt=style_template, context="", options=options)
        )

        import json
        import re

        payload = response.content.strip() if response and response.content else ""
        if not payload:
            self.logger.warning("AI styling returned empty response, using defaults")
            return default_styles

        # The model often wraps JSON in markdown fences; unwrap in order of
        # specificity: full ```json fence, leading ```json, then bare ```.
        fenced = re.search(r'```json\s*\n(.*?)\n```', payload, re.DOTALL)
        if fenced:
            payload = fenced.group(1).strip()
            self.services.utils.debugLogToFile(f"EXTRACTED JSON FROM MARKDOWN: {payload[:100]}...", "EXCEL_RENDERER")
        elif payload.startswith('```json'):
            payload = re.sub(r'^```json\s*', '', payload)
            payload = re.sub(r'\s*```$', '', payload)
            self.services.utils.debugLogToFile(f"CLEANED JSON FROM MARKDOWN: {payload[:100]}...", "EXCEL_RENDERER")
        elif payload.startswith('```'):
            payload = re.sub(r'^```\s*', '', payload)
            payload = re.sub(r'\s*```$', '', payload)
            self.services.utils.debugLogToFile(f"CLEANED JSON FROM GENERIC MARKDOWN: {payload[:100]}...", "EXCEL_RENDERER")

        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError as json_error:
            self.logger.warning(f"AI styling returned invalid JSON: {json_error}, using defaults")
            return default_styles

        # Excel requires aRGB ('FFRRGGBB') color strings.
        return self._convert_colors_format(parsed)

    except Exception as e:
        self.logger.warning(f"AI styling failed: {str(e)}, using defaults")
        return default_styles
|
||||
|
||||
def _get_safe_color(self, color_value: str, default: str = "FF000000") -> str:
|
||||
"""Get a safe aRGB color value for Excel (without # prefix)."""
|
||||
if not isinstance(color_value, str):
|
||||
return default
|
||||
|
||||
# Remove # prefix if present
|
||||
if color_value.startswith('#'):
|
||||
color_value = color_value[1:]
|
||||
|
||||
if len(color_value) == 6:
|
||||
# Convert RRGGBB to AARRGGBB
|
||||
return f"FF{color_value}"
|
||||
elif len(color_value) == 8:
|
||||
# Already aRGB format
|
||||
return color_value
|
||||
else:
|
||||
# Unexpected format, return default
|
||||
return default
|
||||
|
||||
def _convert_colors_format(self, styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Convert hex colors to aRGB format for Excel compatibility."""
|
||||
try:
|
||||
self.services.utils.debugLogToFile(f"CONVERTING COLORS IN STYLES: {styles}", "EXCEL_RENDERER")
|
||||
for style_name, style_config in styles.items():
|
||||
if isinstance(style_config, dict):
|
||||
for prop, value in style_config.items():
|
||||
if isinstance(value, str) and value.startswith('#') and len(value) == 7:
|
||||
# Convert #RRGGBB to #AARRGGBB (add FF alpha channel)
|
||||
styles[style_name][prop] = f"FF{value[1:]}"
|
||||
elif isinstance(value, str) and value.startswith('#') and len(value) == 9:
|
||||
pass # Already aRGB format
|
||||
elif isinstance(value, str) and value.startswith('#'):
|
||||
pass # Unexpected format, keep as is
|
||||
return styles
|
||||
except Exception as e:
|
||||
return styles
|
||||
|
||||
def _validate_excel_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Validate and fix contrast issues in AI-generated styles."""
|
||||
try:
|
||||
# Fix table header contrast
|
||||
if "table_header" in styles:
|
||||
header = styles["table_header"]
|
||||
bg_color = header.get("background", "#FFFFFF")
|
||||
text_color = header.get("text_color", "#000000")
|
||||
|
||||
# If both are white or both are dark, fix it
|
||||
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
|
||||
header["background"] = "#4F4F4F"
|
||||
header["text_color"] = "#FFFFFF"
|
||||
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
|
||||
header["background"] = "#4F4F4F"
|
||||
header["text_color"] = "#FFFFFF"
|
||||
|
||||
# Fix table cell contrast
|
||||
if "table_cell" in styles:
|
||||
cell = styles["table_cell"]
|
||||
bg_color = cell.get("background", "#FFFFFF")
|
||||
text_color = cell.get("text_color", "#000000")
|
||||
|
||||
# If both are white or both are dark, fix it
|
||||
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
|
||||
cell["background"] = "#FFFFFF"
|
||||
cell["text_color"] = "#2F2F2F"
|
||||
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
|
||||
cell["background"] = "#FFFFFF"
|
||||
cell["text_color"] = "#2F2F2F"
|
||||
|
||||
return styles
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Style validation failed: {str(e)}")
|
||||
return self._get_default_excel_styles()
|
||||
|
||||
def _get_default_excel_styles(self) -> Dict[str, Any]:
|
||||
"""Default Excel styles with aRGB color format."""
|
||||
return {
|
||||
"title": {"font_size": 16, "color": "#FF1F4E79", "bold": True, "align": "center"},
|
||||
"heading": {"font_size": 14, "color": "#FF2F2F2F", "bold": True, "align": "left"},
|
||||
"table_header": {"background": "#FF4F4F4F", "text_color": "#FFFFFFFF", "bold": True, "align": "center"},
|
||||
"table_cell": {"background": "#FFFFFFFF", "text_color": "#FF2F2F2F", "bold": False, "align": "left"},
|
||||
"bullet_list": {"font_size": 11, "color": "#FF2F2F2F", "indent": 2},
|
||||
"paragraph": {"font_size": 11, "color": "#FF2F2F2F", "bold": False, "align": "left"},
|
||||
"code_block": {"font": "Courier New", "font_size": 10, "color": "#FF2F2F2F", "background": "#FFF5F5F5"}
|
||||
}
|
||||
|
||||
def _create_excel_sheets(self, wb: Workbook, json_content: Dict[str, Any], styles: Dict[str, Any]) -> Dict[str, Any]:
    """Create the workbook's sheets and return them keyed by lowercased name.

    Sheet names come from the AI style payload when present, otherwise
    they are derived from the document structure. The workbook's implicit
    default sheet is recycled for the first name; the rest are appended
    in order.
    """
    sheet_names = styles.get("sheet_names", self._generate_sheet_names_from_content(json_content))
    self.services.utils.debugLogToFile(f"EXCEL SHEET NAMES: {sheet_names}", "EXCEL_RENDERER")

    sheets = {}
    for index, name in enumerate(sheet_names):
        if index == 0:
            worksheet = wb.active  # recycle the default sheet
            worksheet.title = name
        else:
            worksheet = wb.create_sheet(name, index)
        sheets[name.lower()] = worksheet
    return sheets
|
||||
|
||||
def _generate_sheet_names_from_content(self, json_content: Dict[str, Any]) -> List[str]:
|
||||
"""Generate sheet names based on actual content structure."""
|
||||
sections = json_content.get("sections", [])
|
||||
|
||||
# If no sections, create a single sheet
|
||||
if not sections:
|
||||
return ["Content"]
|
||||
|
||||
# Generate sheet names based on content structure
|
||||
sheet_names = []
|
||||
|
||||
# Check if we have multiple table sections
|
||||
table_sections = [s for s in sections if s.get("content_type") == "table"]
|
||||
|
||||
if len(table_sections) > 1:
|
||||
# Create separate sheets for each table
|
||||
for i, section in enumerate(table_sections, 1):
|
||||
section_title = section.get("title", f"Table {i}")
|
||||
sheet_names.append(section_title[:31]) # Excel sheet name limit
|
||||
else:
|
||||
# Single table or mixed content - create main sheet
|
||||
document_title = json_content.get("metadata", {}).get("title", "Document")
|
||||
sheet_names.append(document_title[:31]) # Excel sheet name limit
|
||||
|
||||
# Add additional sheets for other content types
|
||||
content_types = set()
|
||||
for section in sections:
|
||||
content_type = section.get("content_type", "paragraph")
|
||||
content_types.add(content_type)
|
||||
|
||||
if "table" in content_types and len(table_sections) == 1:
|
||||
sheet_names.append("Table Data")
|
||||
if "list" in content_types:
|
||||
sheet_names.append("Lists")
|
||||
if "paragraph" in content_types or "heading" in content_types:
|
||||
sheet_names.append("Text")
|
||||
|
||||
# Limit to 4 sheets maximum
|
||||
return sheet_names[:4]
|
||||
|
||||
def _populate_excel_sheets(self, sheets: Dict[str, Any], json_content: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Route document sections into the already-created sheets.

    Multi-table documents pair each table with its sheet positionally;
    otherwise the first sheet carries the overview and any remaining
    sheets receive content grouped by type. Failures are logged, never
    raised.
    """
    try:
        names = list(sheets.keys())
        if not names:
            return

        sections = json_content.get("sections", [])
        tables = [s for s in sections if s.get("content_type") == "table"]

        if len(tables) > 1:
            # One table per sheet, matched by position.
            for index, section in enumerate(tables):
                if index < len(names):
                    self._populate_table_sheet(sheets[names[index]], section, styles, f"Table {index+1}")
        else:
            # Single table or mixed content: overview on the first sheet.
            self._populate_main_sheet(sheets[names[0]], json_content, styles)
            if len(names) > 1:
                # Remaining sheets get content split out by content type.
                self._populate_content_type_sheets(sheets, json_content, styles, names[1:])
    except Exception as e:
        self.logger.warning(f"Could not populate Excel sheets: {str(e)}")
|
||||
|
||||
def _populate_table_sheet(self, sheet, section: Dict[str, Any], styles: Dict[str, Any], sheet_title: str):
    """Write one table section onto a sheet: title in A1, headers on row 3,
    data from row 4 down, styled from the AI style payload."""
    try:
        # Centered title banner.
        sheet['A1'] = sheet_title
        sheet['A1'].font = Font(size=16, bold=True, color=self._get_safe_color(styles.get("title", {}).get("color", "FF1F4E79")))
        sheet['A1'].alignment = Alignment(horizontal="center")

        # Canonical JSON keeps the table grid in the first element.
        elements = section.get("elements", [])
        if elements and isinstance(elements, list) and len(elements) > 0:
            grid = elements[0]
            headers = grid.get("headers", [])
            rows = grid.get("rows", [])
        else:
            headers, rows = [], []

        if not headers and not rows:
            sheet['A3'] = "No table data available"
            return

        # Header row with AI-styled font/fill.
        header_style = styles.get("table_header", {})
        for col, header in enumerate(headers, 1):
            cell = sheet.cell(row=3, column=col, value=header)
            if header_style.get("bold"):
                cell.font = Font(bold=True, color=self._get_safe_color(header_style.get("text_color", "FF000000")))
            if header_style.get("background"):
                bg = self._get_safe_color(header_style["background"])
                cell.fill = PatternFill(start_color=bg, end_color=bg, fill_type="solid")

        # Data rows.
        cell_style = styles.get("table_cell", {})
        for row_num, row_values in enumerate(rows, 4):
            for col_num, value in enumerate(row_values, 1):
                data_cell = sheet.cell(row=row_num, column=col_num, value=value)
                if cell_style.get("text_color"):
                    data_cell.font = Font(color=self._get_safe_color(cell_style["text_color"]))

        # Uniform column widths across the table.
        for col in range(1, len(headers) + 1):
            sheet.column_dimensions[get_column_letter(col)].width = 20
    except Exception as e:
        self.logger.warning(f"Could not populate table sheet: {str(e)}")
|
||||
|
||||
def _populate_main_sheet(self, sheet, json_content: Dict[str, Any], styles: Dict[str, Any]):
    """Populate the main sheet with document overview and all content.

    Layout: document title (A1), generation info (rows 3-4), optional
    metadata block (from row 6), a content-type overview, then every
    section rendered in order via ``_add_section_to_sheet``.
    Failures are logged, never raised.
    """
    try:
        # Document title
        document_title = json_content.get("metadata", {}).get("title", "Generated Report")
        sheet['A1'] = document_title

        # Style the title; fall back to plain black if the configured
        # color is rejected by the spreadsheet backend.
        title_style = styles.get("title", {"font_size": 16, "bold": True, "color": "#FF1F4E79", "align": "center"})
        try:
            safe_color = self._get_safe_color(title_style["color"])
            sheet['A1'].font = Font(size=title_style["font_size"], bold=title_style["bold"], color=safe_color)
            sheet['A1'].alignment = Alignment(horizontal=title_style["align"])
        except Exception:
            sheet['A1'].font = Font(size=title_style["font_size"], bold=title_style["bold"], color="FF000000")
            sheet['A1'].alignment = Alignment(horizontal=title_style["align"])

        # Generation info
        sheet['A3'] = "Generated:"
        sheet['B3'] = self._format_timestamp()
        sheet['A4'] = "Status:"
        sheet['B4'] = "Generated Successfully"

        # Document metadata. BUGFIX: `row` must be initialized even when
        # metadata is empty/missing — previously it was only assigned
        # inside the `if metadata:` branch, so the overview code below
        # raised NameError (swallowed by the except) for documents
        # without metadata, silently producing an empty sheet.
        row = 7
        metadata = json_content.get("metadata", {})
        if metadata:
            sheet['A6'] = "Document Information:"
            sheet['A6'].font = Font(bold=True)
            for key, value in metadata.items():
                if key != "title":  # title is already shown in A1
                    sheet[f'A{row}'] = f"{key.title()}:"
                    sheet[f'B{row}'] = str(value)
                    row += 1

        # Content overview
        sections = json_content.get("sections", [])
        sheet[f'A{row + 1}'] = "Content Overview:"
        sheet[f'A{row + 1}'].font = Font(bold=True)

        row += 2
        sheet[f'A{row}'] = f"Total Sections: {len(sections)}"

        # Tally sections per content type for the overview block.
        content_types = {}
        for section in sections:
            content_type = section.get("content_type", "unknown")
            content_types[content_type] = content_types.get(content_type, 0) + 1

        for content_type, count in content_types.items():
            row += 1
            sheet[f'A{row}'] = f"{content_type.title()} Sections: {count}"

        # Render every section below the overview, one blank row between.
        row += 2
        for section in sections:
            row = self._add_section_to_sheet(sheet, section, styles, row)
            row += 1  # Empty row between sections

        # Fixed widths for the label/value columns.
        sheet.column_dimensions['A'].width = 20
        sheet.column_dimensions['B'].width = 30

    except Exception as e:
        self.logger.warning(f"Could not populate main sheet: {str(e)}")
|
||||
|
||||
def _populate_content_type_sheets(self, sheets: Dict[str, Any], json_content: Dict[str, Any], styles: Dict[str, Any], sheet_names: List[str]):
    """Populate per-content-type sheets ("tables", "lists", "text", ...).

    Each named sheet receives a title plus only the sections whose
    content_type matches that sheet name; unrecognized sheet names
    receive every section. Failures are logged, never raised.
    """
    try:
        sections = json_content.get("sections", [])

        # Predicates selecting which sections belong on each known sheet.
        selectors = {
            "tables": lambda s: s.get("content_type") == "table",
            "lists": lambda s: s.get("content_type") == "list",
            "text": lambda s: s.get("content_type") in ["paragraph", "heading"],
        }

        for sheet_name in sheet_names:
            if sheet_name not in sheets:
                continue

            target = sheets[sheet_name]
            target['A1'] = sheet_name.title()
            target['A1'].font = Font(size=16, bold=True)

            selector = selectors.get(sheet_name)
            chosen = [s for s in sections if selector(s)] if selector else sections

            next_row = 3
            for section in chosen:
                next_row = self._add_section_to_sheet(target, section, styles, next_row)
                next_row += 1  # blank row between sections

            # Uniform width for the first five columns.
            for col in range(1, 6):
                target.column_dimensions[get_column_letter(col)].width = 20

    except Exception as e:
        self.logger.warning(f"Could not populate content type sheets: {str(e)}")
|
||||
|
||||
def _add_section_to_sheet(self, sheet, section: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
    """Render one section (optional title + its elements) into the sheet.

    Returns the first free row after the rendered content; on failure,
    logs a warning and skips a single row instead of raising.
    """
    try:
        # Optional section title, rendered as a bold markdown-style header.
        section_title = section.get("title")
        if section_title:
            sheet[f'A{start_row}'] = f"# {section_title}"
            sheet[f'A{start_row}'].font = Font(bold=True)
            start_row += 1

        # Pick the per-element writer from the section's declared type;
        # anything unrecognized is rendered as plain paragraphs.
        writers = {
            "table": self._add_table_to_excel,
            "list": self._add_list_to_excel,
            "paragraph": self._add_paragraph_to_excel,
            "heading": self._add_heading_to_excel,
        }
        section_type = section.get("content_type", "paragraph")
        write_element = writers.get(section_type, self._add_paragraph_to_excel)

        # Each writer returns the next free row, so the cursor advances.
        for element in section.get("elements", []):
            start_row = write_element(sheet, element, styles, start_row)

        return start_row

    except Exception as e:
        self.logger.warning(f"Could not add section to sheet: {str(e)}")
        return start_row + 1
|
||||
|
||||
def _add_table_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
|
||||
"""Add a table element to Excel sheet."""
|
||||
try:
|
||||
# In canonical JSON format, table elements have headers and rows directly
|
||||
headers = element.get("headers", [])
|
||||
rows = element.get("rows", [])
|
||||
|
||||
if not headers and not rows:
|
||||
return start_row
|
||||
|
||||
# Add headers
|
||||
header_style = styles.get("table_header", {})
|
||||
for col, header in enumerate(headers, 1):
|
||||
cell = sheet.cell(row=start_row, column=col, value=header)
|
||||
if header_style.get("bold"):
|
||||
cell.font = Font(bold=True, color=self._get_safe_color(header_style.get("text_color", "FF000000")))
|
||||
if header_style.get("background"):
|
||||
cell.fill = PatternFill(start_color=self._get_safe_color(header_style["background"]), end_color=self._get_safe_color(header_style["background"]), fill_type="solid")
|
||||
|
||||
start_row += 1
|
||||
|
||||
# Add rows
|
||||
cell_style = styles.get("table_cell", {})
|
||||
for row_data in rows:
|
||||
for col, cell_value in enumerate(row_data, 1):
|
||||
cell = sheet.cell(row=start_row, column=col, value=cell_value)
|
||||
if cell_style.get("text_color"):
|
||||
cell.font = Font(color=self._get_safe_color(cell_style["text_color"]))
|
||||
start_row += 1
|
||||
|
||||
return start_row
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not add table to Excel: {str(e)}")
|
||||
return start_row + 1
|
||||
|
||||
def _add_list_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
|
||||
"""Add a list element to Excel sheet."""
|
||||
try:
|
||||
list_items = element.get("items", [])
|
||||
|
||||
list_style = styles.get("bullet_list", {})
|
||||
for item in list_items:
|
||||
sheet.cell(row=start_row, column=1, value=f"• {item}")
|
||||
if list_style.get("color"):
|
||||
sheet.cell(row=start_row, column=1).font = Font(color=self._get_safe_color(list_style["color"]))
|
||||
start_row += 1
|
||||
|
||||
return start_row
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not add list to Excel: {str(e)}")
|
||||
return start_row + 1
|
||||
|
||||
def _add_paragraph_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
|
||||
"""Add a paragraph element to Excel sheet."""
|
||||
try:
|
||||
text = element.get("text", "")
|
||||
if text:
|
||||
sheet.cell(row=start_row, column=1, value=text)
|
||||
|
||||
paragraph_style = styles.get("paragraph", {})
|
||||
if paragraph_style.get("color"):
|
||||
sheet.cell(row=start_row, column=1).font = Font(color=self._get_safe_color(paragraph_style["color"]))
|
||||
|
||||
start_row += 1
|
||||
|
||||
return start_row
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not add paragraph to Excel: {str(e)}")
|
||||
return start_row + 1
|
||||
|
||||
def _add_heading_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
    """Write a heading element into column 1, sized by its level.

    Font size starts from the "heading" style's font_size (default 14)
    and shrinks 2pt per level below H1, never going under 10pt.
    Returns the next free row.
    """
    try:
        text = element.get("text", "")
        level = element.get("level", 1)

        if text:
            sheet.cell(row=start_row, column=1, value=text)
            heading_style = styles.get("heading", {})
            size = heading_style.get("font_size", 14)
            if level > 1:
                size = max(10, size - (level - 1) * 2)
            sheet.cell(row=start_row, column=1).font = Font(
                size=size,
                bold=True,
                color=self._get_safe_color(heading_style.get("color", "FF000000")),
            )
            start_row += 1

        return start_row

    except Exception as e:
        self.logger.warning(f"Could not add heading to Excel: {str(e)}")
        return start_row + 1
|
||||
|
||||
def _format_timestamp(self) -> str:
|
||||
"""Format current timestamp for document generation."""
|
||||
return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
|
||||
|
|
@ -1,94 +0,0 @@
|
|||
"""
|
||||
Text renderer for report generation.
|
||||
"""
|
||||
|
||||
from .base_renderer import BaseRenderer
|
||||
from typing import Dict, Any, Tuple, List
|
||||
|
||||
class TextRenderer(BaseRenderer):
    """Renders content to plain text format with format-specific extraction."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported text formats (excluding formats with dedicated renderers)."""
        plain = ['txt', 'text', 'plain']
        languages = [
            'js', 'javascript', 'ts', 'typescript', 'jsx', 'tsx',
            'py', 'python', 'java', 'cpp', 'c', 'h', 'hpp',
            'cs', 'csharp', 'php', 'rb', 'ruby', 'go', 'rs', 'rust',
            'swift', 'kt', 'kotlin', 'scala', 'r', 'm', 'objc',
            'sh', 'bash', 'zsh', 'fish', 'ps1', 'bat', 'cmd',
        ]
        # html/htm, csv, and md/markdown are deliberately absent — they are
        # handled by their own dedicated renderers.
        web = ['css', 'scss', 'sass', 'less', 'xml', 'yaml', 'yml', 'toml', 'ini', 'cfg']
        data = ['tsv', 'log', 'rst', 'sql', 'dockerfile', 'dockerignore', 'gitignore']
        config = [
            'env', 'properties', 'conf', 'config', 'rc',
            'gitattributes', 'editorconfig', 'eslintrc',
        ]
        docs = [
            'readme', 'changelog', 'license', 'authors',
            'contributing', 'todo', 'notes', 'docs',
        ]
        return plain + languages + web + data + config + docs

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases that also resolve to this renderer."""
        return [
            'ascii', 'utf8', 'utf-8', 'code', 'source',
            'script', 'program', 'file', 'document',
            'raw', 'unformatted', 'plaintext',
        ]

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for text renderer."""
        return 90

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only plain-text guidelines; global prompt is built centrally."""
        guidelines = (
            "TEXT FORMAT GUIDELINES:\n"
            "- Output ONLY plain text (no markdown or HTML).\n"
            "- Use clear headings (you may underline with === or --- when helpful).\n"
            "- Use simple bullet lists with '-' and tables with '|' when needed.\n"
            "- Preserve indentation for code-like content if present.\n"
            "OUTPUT: Return ONLY the raw text content."
        )
        return guidelines

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to plain text format."""
        try:
            # Content arrives pre-formatted from the AI; only cleanup is needed.
            return self._clean_text_content(extracted_content, title), "text/plain"
        except Exception as e:
            self.logger.error(f"Error rendering text: {str(e)}")
            # Minimal text fallback so callers always get a payload.
            return f"{title}\n\nError rendering report: {str(e)}", "text/plain"

    def _clean_text_content(self, content: str, title: str) -> str:
        """Clean and validate text content from AI."""
        import re

        content = content.strip()

        # Unwrap a single surrounding markdown code fence, if any.
        if content.startswith("```") and content.endswith("```"):
            fence_lines = content.split('\n')
            if len(fence_lines) > 2:
                content = '\n'.join(fence_lines[1:-1]).strip()

        # Strip leftover markdown emphasis markers (order matters: the
        # two-character markers must go before their single-char forms).
        for marker in ('**', '*', '__', '_'):
            content = content.replace(marker, '')

        # Drop any HTML-like tags that slipped through.
        content = re.sub(r'<[^>]+>', '', content)

        # Normalize line endings to LF.
        return content.replace('\r\n', '\n').replace('\r', '\n')
|
||||
517
modules/services/serviceGeneration/subJsonSchema.py
Normal file
517
modules/services/serviceGeneration/subJsonSchema.py
Normal file
|
|
@ -0,0 +1,517 @@
|
|||
"""
|
||||
JSON Schema definitions for AI-generated document structures.
|
||||
This module provides schemas that guide AI to generate structured JSON output.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
def get_multi_document_subJsonSchema() -> Dict[str, Any]:
    """Get the JSON schema for multi-document generation."""
    # Element definitions shared across sections. bullet_list items have
    # exactly the list_item shape, so the same structure is reused.
    list_item = {
        "type": "object",
        "required": ["text"],
        "properties": {
            "text": {"type": "string", "description": "List item text"},
            "subitems": {
                "type": "array",
                "items": {"$ref": "#/definitions/list_item"},
                "description": "Nested sub-items (optional)",
            },
        },
    }
    definitions = {
        "table": {
            "type": "object",
            "required": ["headers", "rows"],
            "properties": {
                "headers": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Table column headers",
                },
                "rows": {
                    "type": "array",
                    "items": {"type": "array", "items": {"type": "string"}},
                    "description": "Table data rows",
                },
                "caption": {"type": "string", "description": "Table caption (optional)"},
            },
        },
        "bullet_list": {
            "type": "object",
            "required": ["items"],
            "properties": {
                "items": {
                    "type": "array",
                    "items": list_item,
                    "description": "List items",
                },
                "list_type": {
                    "type": "string",
                    "enum": ["bullet", "numbered", "checklist"],
                    "default": "bullet",
                    "description": "Type of list",
                },
            },
        },
        "list_item": list_item,
        "paragraph": {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "Paragraph text"},
                "formatting": {
                    "type": "object",
                    "description": "Text formatting (bold, italic, etc.)",
                },
            },
        },
        "heading": {
            "type": "object",
            "required": ["text", "level"],
            "properties": {
                "text": {"type": "string", "description": "Heading text"},
                "level": {
                    "type": "integer",
                    "minimum": 1,
                    "maximum": 6,
                    "description": "Heading level (1-6)",
                },
            },
        },
        "code_block": {
            "type": "object",
            "required": ["code"],
            "properties": {
                "code": {"type": "string", "description": "Code content"},
                "language": {"type": "string", "description": "Programming language (optional)"},
            },
        },
    }

    # A section may hold any mix of the element types defined above.
    section_schema = {
        "type": "object",
        "required": ["id", "content_type", "elements", "order"],
        "properties": {
            "id": {"type": "string", "description": "Unique section identifier"},
            "title": {"type": "string", "description": "Section title (optional)"},
            "content_type": {
                "type": "string",
                "enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"],
                "description": "Primary content type of this section",
            },
            "elements": {
                "type": "array",
                "description": "Content elements in this section",
                "items": {
                    "oneOf": [
                        {"$ref": "#/definitions/table"},
                        {"$ref": "#/definitions/bullet_list"},
                        {"$ref": "#/definitions/paragraph"},
                        {"$ref": "#/definitions/heading"},
                        {"$ref": "#/definitions/code_block"},
                    ]
                },
            },
            "order": {"type": "integer", "description": "Section order in document"},
            "metadata": {
                "type": "object",
                "description": "Additional section metadata",
            },
        },
    }

    metadata_schema = {
        "type": "object",
        "required": ["title", "splitStrategy"],
        "properties": {
            "title": {"type": "string", "description": "Document title"},
            "splitStrategy": {
                "type": "string",
                "enum": ["per_entity", "by_section", "by_criteria", "by_data_type", "custom"],
                "description": "Strategy for splitting content into multiple files",
            },
            "splitCriteria": {
                "type": "object",
                "description": "Custom criteria for splitting (e.g., entity_id, category, etc.)",
            },
            "fileNamingPattern": {
                "type": "string",
                "description": "Pattern for generating filenames (e.g., '{entity_name}_data.docx')",
            },
            "author": {"type": "string", "description": "Document author (optional)"},
            "source_documents": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of source document IDs",
            },
            "extraction_method": {
                "type": "string",
                "default": "ai_extraction",
                "description": "Method used for extraction",
            },
        },
    }

    document_schema = {
        "type": "object",
        "required": ["id", "title", "sections", "filename"],
        "properties": {
            "id": {"type": "string", "description": "Unique document identifier"},
            "title": {"type": "string", "description": "Document title"},
            "filename": {"type": "string", "description": "Generated filename"},
            "sections": {
                "type": "array",
                "description": "Document sections containing structured content",
                "items": section_schema,
            },
            "metadata": {
                "type": "object",
                "description": "Document-specific metadata",
            },
        },
    }

    return {
        "type": "object",
        "required": ["metadata", "documents"],
        "properties": {
            "metadata": metadata_schema,
            "documents": {
                "type": "array",
                "description": "Array of individual documents to generate",
                "items": document_schema,
            },
        },
        "definitions": definitions,
    }
|
||||
|
||||
def get_document_subJsonSchema() -> Dict[str, Any]:
    """Get the JSON schema for structured document generation (single document)."""
    # Element definitions shared across sections. bullet_list items have
    # exactly the list_item shape, so the same structure is reused.
    list_item = {
        "type": "object",
        "required": ["text"],
        "properties": {
            "text": {"type": "string", "description": "List item text"},
            "subitems": {
                "type": "array",
                "items": {"$ref": "#/definitions/list_item"},
                "description": "Nested sub-items (optional)",
            },
        },
    }
    definitions = {
        "table": {
            "type": "object",
            "required": ["headers", "rows"],
            "properties": {
                "headers": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Table column headers",
                },
                "rows": {
                    "type": "array",
                    "items": {"type": "array", "items": {"type": "string"}},
                    "description": "Table data rows",
                },
                "caption": {"type": "string", "description": "Table caption (optional)"},
            },
        },
        "bullet_list": {
            "type": "object",
            "required": ["items"],
            "properties": {
                "items": {
                    "type": "array",
                    "items": list_item,
                    "description": "List items",
                },
                "list_type": {
                    "type": "string",
                    "enum": ["bullet", "numbered", "checklist"],
                    "default": "bullet",
                    "description": "Type of list",
                },
            },
        },
        "list_item": list_item,
        "paragraph": {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "Paragraph text"},
                "formatting": {
                    "type": "object",
                    "description": "Text formatting (bold, italic, etc.)",
                },
            },
        },
        "heading": {
            "type": "object",
            "required": ["text", "level"],
            "properties": {
                "text": {"type": "string", "description": "Heading text"},
                "level": {
                    "type": "integer",
                    "minimum": 1,
                    "maximum": 6,
                    "description": "Heading level (1-6)",
                },
            },
        },
        "code_block": {
            "type": "object",
            "required": ["code"],
            "properties": {
                "code": {"type": "string", "description": "Code content"},
                "language": {"type": "string", "description": "Programming language (optional)"},
            },
        },
    }

    # A section may hold any mix of the element types defined above.
    section_schema = {
        "type": "object",
        "required": ["id", "content_type", "elements", "order"],
        "properties": {
            "id": {"type": "string", "description": "Unique section identifier"},
            "title": {"type": "string", "description": "Section title (optional)"},
            "content_type": {
                "type": "string",
                "enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"],
                "description": "Primary content type of this section",
            },
            "elements": {
                "type": "array",
                "description": "Content elements in this section",
                "items": {
                    "oneOf": [
                        {"$ref": "#/definitions/table"},
                        {"$ref": "#/definitions/bullet_list"},
                        {"$ref": "#/definitions/paragraph"},
                        {"$ref": "#/definitions/heading"},
                        {"$ref": "#/definitions/code_block"},
                    ]
                },
            },
            "order": {"type": "integer", "description": "Section order in document"},
            "metadata": {
                "type": "object",
                "description": "Additional section metadata",
            },
        },
    }

    return {
        "type": "object",
        "required": ["metadata", "sections"],
        "properties": {
            "metadata": {
                "type": "object",
                "required": ["title"],
                "properties": {
                    "title": {"type": "string", "description": "Document title"},
                    "author": {"type": "string", "description": "Document author (optional)"},
                    "source_documents": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of source document IDs",
                    },
                    "extraction_method": {
                        "type": "string",
                        "default": "ai_extraction",
                        "description": "Method used for extraction",
                    },
                },
            },
            "sections": {
                "type": "array",
                "description": "Document sections containing structured content",
                "items": section_schema,
            },
            "summary": {
                "type": "string",
                "description": "Document summary (optional)",
            },
            "tags": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Document tags for categorization",
            },
        },
        "definitions": definitions,
    }
|
||||
|
||||
|
||||
def get_extraction_prompt_template() -> str:
    """Get the template for AI extraction prompts that request JSON output.

    Returns:
        A prompt preamble instructing the model to emit schema-conforming
        JSON only. NOTE(review): the literal's leading/trailing newlines
        are part of the returned prompt text — do not reformat.
    """
    return """
You are extracting structured content from documents. Your task is to analyze the provided content and generate a structured JSON document.

IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.

JSON Schema Requirements:
- Extract the actual data from the source documents
- If content is a table, extract it as a table with headers and rows
- If content is a list, extract it as a structured list with items
- If content is text, extract it as paragraphs or headings
- Preserve the original structure and data - do not summarize or interpret
- Use the exact JSON schema provided

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification

Return only the JSON structure following the schema. Do not include any text before or after the JSON.
"""
|
||||
|
||||
|
||||
def get_generation_prompt_template() -> str:
    """Get the template for AI generation prompts that work with JSON input.

    Returns:
        A prompt preamble instructing the model to enhance structured JSON
        content while keeping output valid JSON. NOTE(review): the literal's
        leading/trailing newlines are part of the returned prompt text —
        do not reformat.
    """
    return """
You are generating a document from structured JSON data. Your task is to create a well-formatted document based on the provided structured content.

IMPORTANT: You must respond with valid JSON only, following the document schema.

Generation Guidelines:
- Use the provided JSON structure as the foundation
- Enhance the content with proper formatting and organization
- Ensure logical flow and readability
- Maintain the original data integrity
- Add appropriate headings and sections
- Organize content in a logical sequence

Content Enhancement:
- Tables: Ensure proper headers and data alignment
- Lists: Use appropriate list types (bullet, numbered, checklist)
- Headings: Use appropriate heading levels for hierarchy
- Paragraphs: Ensure proper text flow and formatting
- Code: Preserve code blocks with proper language identification

Return only the enhanced JSON structure following the schema. Do not include any text before or after the JSON.
"""
|
||||
|
||||
|
||||
def get_adaptive_json_schema(prompt_analysis: Dict[str, Any] = None) -> Dict[str, Any]:
    """Automatically select appropriate schema based on prompt analysis."""
    # Multi-file prompts get the multi-document schema; everything else
    # (including a missing analysis) falls back to the single-document one.
    wants_multi = bool(prompt_analysis) and prompt_analysis.get("is_multi_file", False)
    return get_multi_document_subJsonSchema() if wants_multi else get_document_subJsonSchema()
|
||||
|
||||
def _validate_sections(sections: Any) -> bool:
    """Return True if *sections* is a list of structurally valid section dicts.

    A valid section is a dict with "id", "content_type", "elements" and
    "order", a recognized content_type, and a list for "elements".
    """
    if not isinstance(sections, list):
        return False
    valid_types = {"table", "list", "paragraph", "heading", "code", "image", "mixed"}
    for section in sections:
        if not isinstance(section, dict):
            return False
        if any(field not in section for field in ("id", "content_type", "elements", "order")):
            return False
        if section["content_type"] not in valid_types:
            return False
        if not isinstance(section["elements"], list):
            return False
    return True


def validate_json_document(json_data: Dict[str, Any]) -> bool:
    """Validate that the JSON data follows the document schema.

    Accepts either the multi-document shape ({"metadata", "documents"})
    or the single-document shape ({"metadata", "sections"}).

    Returns:
        True only when all required fields are present and well-typed;
        False otherwise. Never raises.
    """
    try:
        if not isinstance(json_data, dict):
            return False

        # Both shapes require dict metadata with a title.
        metadata = json_data.get("metadata")
        if not isinstance(metadata, dict) or "title" not in metadata:
            return False

        if "documents" in json_data:
            # Multi-document structure additionally requires a split strategy.
            if "splitStrategy" not in metadata:
                return False

            documents = json_data["documents"]
            if not isinstance(documents, list):
                return False

            for doc in documents:
                if not isinstance(doc, dict):
                    return False
                if any(field not in doc for field in ("id", "title", "sections", "filename")):
                    return False
                # Sections inside each document follow the same rules as
                # the single-document shape.
                if not _validate_sections(doc.get("sections", [])):
                    return False
            return True

        if "sections" in json_data:
            # Single-document structure.
            return _validate_sections(json_data["sections"])

        # Neither "documents" nor "sections" present.
        return False

    except Exception:
        # Defensive: malformed input must yield False, never an exception.
        return False
|
||||
738
modules/services/serviceGeneration/subPromptBuilder.py
Normal file
738
modules/services/serviceGeneration/subPromptBuilder.py
Normal file
|
|
@ -0,0 +1,738 @@
|
|||
"""
|
||||
Prompt builder for AI document generation and extraction.
|
||||
This module builds prompts for AI services to extract and generate documents.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, Any, Optional, List, TYPE_CHECKING
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||
|
||||
# Type hint for renderer parameter
|
||||
if TYPE_CHECKING:
|
||||
from .renderers.rendererBaseTemplate import BaseRenderer
|
||||
_RendererLike = BaseRenderer
|
||||
else:
|
||||
_RendererLike = Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
async def buildAdaptiveExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    promptAnalysis: Dict[str, Any],
    aiService=None,
    services=None
) -> str:
    """
    Build adaptive extraction prompt based on AI analysis.
    Uses multi-file or single-file approach based on analysis.

    Only ``promptAnalysis["is_multi_file"]`` is consulted; ``outputFormat``,
    ``title``, ``aiService`` and ``services`` are accepted for signature
    parity with the sibling prompt builders but are not used here.

    Args:
        outputFormat: Target output format (unused by this builder).
        userPrompt: Raw user request; embedded verbatim at the top of the prompt.
        title: Document title (unused by this builder).
        promptAnalysis: Analysis result; the boolean "is_multi_file" key
            selects between the multi-document and single-document template.
        aiService: Unused.
        services: Unused.

    Returns:
        The assembled prompt string, stripped of surrounding whitespace.
    """

    # Multi-file example data instead of schema: embedded verbatim in the
    # prompt so the model mirrors the expected shape ("documents" at root).
    multi_file_example = {
        "metadata": {
            "title": "Multi-Document Example",
            "splitStrategy": "by_section",
            "source_documents": ["doc_001"],
            "extraction_method": "ai_extraction"
        },
        "documents": [
            {
                "id": "doc_section_1",
                "title": "Section 1 Title",
                "filename": "section_1.xlsx",
                "sections": [
                    {
                        "id": "section_1",
                        "content_type": "heading",
                        "elements": [
                            {
                                "level": 1,
                                "text": "1. SECTION TITLE"
                            }
                        ],
                        "order": 1
                    },
                    {
                        "id": "section_2",
                        "content_type": "paragraph",
                        "elements": [
                            {
                                "text": "This is the actual content that should be extracted from the document."
                            }
                        ],
                        "order": 2
                    },
                    {
                        "id": "section_3",
                        "content_type": "table",
                        "elements": [
                            {
                                "headers": ["Column 1", "Column 2"],
                                "rows": [["Value 1", "Value 2"]]
                            }
                        ],
                        "order": 3
                    }
                ]
            }
        ]
    }

    # Single-file example data instead of schema ("sections" at root).
    single_file_example = {
        "metadata": {
            "title": "Single Document Example",
            "source_documents": ["doc_001"],
            "extraction_method": "ai_extraction"
        },
        "sections": [
            {
                "id": "section_1",
                "content_type": "heading",
                "elements": [
                    {
                        "level": 1,
                        "text": "1. SECTION TITLE"
                    }
                ],
                "order": 1
            },
            {
                "id": "section_2",
                "content_type": "paragraph",
                "elements": [
                    {
                        "text": "This is the actual content that should be extracted from the document."
                    }
                ],
                "order": 2
            },
            {
                "id": "section_3",
                "content_type": "table",
                "elements": [
                    {
                        "headers": ["Column 1", "Column 2"],
                        "rows": [["Value 1", "Value 2"]]
                    }
                ],
                "order": 3
            }
        ]
    }

    if promptAnalysis.get("is_multi_file", False):
        # Multi-file prompt.
        # NOTE(review): the INSTRUCTIONS line below mentions
        # "REPLACE_WITH_ACTUAL_*" placeholders, but the embedded example
        # contains none -- the wording looks stale; confirm and update.
        adaptive_prompt = f"""
{userPrompt}

You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.

TASK: Extract the actual content from the document and organize it into separate sections, where each section will become a separate file.

REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Identify distinct sections in the document (by headings, topics, or logical breaks)
3. Create one JSON document entry for each section found
4. Extract the real content from each section (headings, paragraphs, lists, etc.)
5. Generate appropriate filenames for each section

CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array.

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(multi_file_example, indent=2)}

IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have:
- "id": unique identifier
- "title": section title from the document
- "filename": appropriate filename for the section
- "sections": array of content sections

DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level.

INSTRUCTIONS:
- Replace "REPLACE_WITH_ACTUAL_*" placeholders with real content from the document
- Use actual section titles, headings, and text from the document
- Create meaningful filenames based on section content
- Ensure each section contains the complete content for that part of the document
- Do not use generic placeholder text like "Section 1", "Section 2"
- Extract real headings, paragraphs, lists, and other content elements
- CRITICAL: Return JSON with "documents" array, not "sections" array

CONTEXT (Document Content):

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()
    else:
        # Single-file prompt - use example data instead of schema
        adaptive_prompt = f"""
{userPrompt}

You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.

TASK: Extract the actual content from the document and organize it into structured sections.

REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Extract all content and organize it into logical sections
3. Create structured JSON with sections containing the extracted content
4. Preserve the original structure and data

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(single_file_example, indent=2)}

INSTRUCTIONS:
- Replace example data with actual content from the document
- Use actual headings, paragraphs, and text from the document
- Ensure all content is properly structured
- Do not use generic placeholder text
- Extract real content from the documents

CONTEXT (Document Content):

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()

    return adaptive_prompt
|
||||
|
||||
async def buildGenericExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """Build generic extraction prompt that works for both single and multi-file.

    Asks the AI to analyse the user's request and decide between a
    single-document and a multi-document structure, then delegates to
    buildAdaptiveExtractionPrompt. On any failure in that path -- or when no
    AI service is available -- falls back to a static single-file prompt.

    Args:
        outputFormat: Target output format (e.g. "xlsx", "pdf").
        userPrompt: Raw user request; embedded verbatim in the prompts.
        title: Document title (forwarded to the adaptive builder).
        aiService: Optional AI service used for the structure analysis.
        services: Optional service center used for best-effort debug logging.

    Returns:
        A prompt string for the extraction AI call.
    """

    # Use AI to determine the best approach
    if aiService:
        try:
            analysis_prompt = f"""
Analyze this user request and determine the best JSON structure for document extraction.

User request: "{userPrompt}"

Respond with JSON only:
{{
    "requires_multi_file": true/false,
    "recommended_schema": "single_document|multi_document",
    "split_approach": "description of how to organize content",
    "file_naming": "suggested naming pattern"
}}

Consider the user's intent and the most logical way to organize the extracted content.
"""

            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL

            request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
            response = await aiService.aiObjects.call(request)

            if response and response.content:
                import re

                # The model may wrap the JSON in prose; keep only the
                # outermost {...} span before parsing.
                result = response.content.strip()
                json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)

                analysis = json.loads(result)

                # Use analysis to build appropriate prompt
                return await buildAdaptiveExtractionPrompt(
                    outputFormat, userPrompt, title, analysis, aiService, services
                )
        except Exception as e:
            # BUGFIX: guard the debug log. `services` defaults to None, and the
            # original unconditional attribute access raised AttributeError
            # inside this handler, masking the real failure and skipping the
            # fallback below.
            if services is not None:
                services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER")

    # Fallback to single-file prompt
    example_data = {
        "metadata": {
            "title": "Example Document",
            "author": "AI Assistant",
            "source_documents": ["document_001"],
            "extraction_method": "ai_extraction"
        },
        "sections": [
            {
                "id": "section_001",
                "content_type": "heading",
                "elements": [
                    {
                        "level": 1,
                        "text": "1. SECTION TITLE"
                    }
                ],
                "order": 1,
                "metadata": {}
            }
        ],
        "summary": "",
        "tags": []
    }

    return f"""
{userPrompt}

You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.

TASK: Extract the actual content from the document and organize it into structured sections.

REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Extract all content and organize it into logical sections
3. Create structured JSON with sections containing the extracted content
4. Preserve the original structure and data

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(example_data, indent=2)}

Requirements:
- Preserve all original data - do not summarize or interpret
- Use the exact JSON format shown above
- Maintain data integrity and structure

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.

DO NOT return a schema description - return actual extracted content in the JSON format shown above.
"""
|
||||
|
||||
async def buildExtractionPrompt(
    outputFormat: str,
    renderer: _RendererLike,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """
    Build the final extraction prompt by combining:
    - Parsed extraction intent from user prompt (using AI)
    - Generic cross-format instructions (filename header + real-data policy)
    - Format-specific guidelines snippet provided by the renderer

    The AI must place a single filename header at the very top:
    FILENAME: <safe-file-name-with-extension>
    followed by a blank line and then ONLY the document content according to the target format.

    Args:
        outputFormat: Target output format, forwarded to the intent parser.
        renderer: Renderer instance; its getExtractionGuidelines() (if present)
            contributes a format-specific snippet.
        userPrompt: Raw user request, distilled into an extraction intent.
        title: Document title (not referenced in this function body).
        aiService: Optional AI service, forwarded to the intent parser.
        services: Optional service center for config lookup / debug dumps.

    Returns:
        The combined, whitespace-stripped prompt string.
    """

    # Parse user prompt to separate extraction intent from generation format using AI
    extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services)

    # Import JSON schema for structured output
    # NOTE(review): `jsonSchema` is never used below -- the prompt embeds
    # `example_data` instead. Dead code unless get_document_subJsonSchema()
    # has side effects; confirm and remove.
    from .subJsonSchema import get_document_subJsonSchema
    jsonSchema = get_document_subJsonSchema()

    # Generic block for JSON extraction - use mixed example data showing different content types
    example_data = {
        "metadata": {
            "title": "Example Document",
            "author": "AI Assistant",
            "source_documents": ["document_001"],
            "extraction_method": "ai_extraction"
        },
        "sections": [
            {
                "id": "section_001",
                "content_type": "heading",
                "elements": [
                    {
                        "level": 1,
                        "text": "1. INTRODUCTION"
                    }
                ],
                "order": 1,
                "metadata": {}
            },
            {
                "id": "section_002",
                "content_type": "paragraph",
                "elements": [
                    {
                        "text": "This is a sample paragraph with actual content that should be extracted from the document."
                    }
                ],
                "order": 2,
                "metadata": {}
            },
            {
                "id": "section_003",
                "content_type": "table",
                "elements": [
                    {
                        "headers": ["Column 1", "Column 2", "Column 3"],
                        "rows": [
                            ["Value 1", "Value 2", "Value 3"],
                            ["Value 4", "Value 5", "Value 6"]
                        ]
                    }
                ],
                "order": 3,
                "metadata": {}
            }
        ],
        "summary": "",
        "tags": []
    }

    genericIntro = f"""
{extractionIntent}

You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.

TASK: Extract the actual content from the document and organize it into structured sections.

REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Extract all content and organize it into logical sections
3. Create structured JSON with sections containing the extracted content
4. Preserve the original structure and data

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(example_data, indent=2)}

Requirements:
- Preserve all original data - do not summarize or interpret
- Use the exact JSON format shown above
- Maintain data integrity and structure

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.

DO NOT return a schema description - return actual extracted content in the JSON format shown above.
"""

    # Get format-specific guidelines from renderer; best-effort, a renderer
    # without the hook (or one that raises) simply contributes nothing.
    formatGuidelines = ""
    try:
        if hasattr(renderer, 'getExtractionGuidelines'):
            formatGuidelines = renderer.getExtractionGuidelines()
    except Exception:
        pass

    # Combine all parts
    finalPrompt = f"{genericIntro}\n\n{formatGuidelines}".strip()

    # Save extraction prompt to debug file - only if debug enabled.
    # Wrapped in try/except so debug I/O (or services being None) can never
    # break prompt building.
    try:
        debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
        if debug_enabled:
            import os
            from datetime import datetime, UTC
            ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            debug_root = "./test-chat/ai"
            os.makedirs(debug_root, exist_ok=True)
            with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f:
                f.write(finalPrompt)
    except Exception:
        pass

    return finalPrompt
|
||||
|
||||
|
||||
async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """
    Use AI to build the generation prompt based on user intent and format requirements.
    Focus on what's important for the user and how to structure the content.

    Args:
        outputFormat: Target document format (e.g. "pdf", "xlsx").
        userPrompt: Raw user request; sanitized before embedding in the meta-prompt.
        title: Document title, embedded in both the meta-prompt and fallbacks.
        aiService: Optional AI service; when absent a static fallback is returned.
        services: Optional service center for best-effort debug logging/dumps.

    Returns:
        AI-generated generation instructions, or a static fallback prompt on
        any failure.
    """
    if not aiService:
        # Fallback if no AI service available
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."

    def _debug(message: str) -> None:
        # BUGFIX: best-effort debug logging. `services` defaults to None and the
        # original called services.utils.debugLogToFile unconditionally -- both
        # in the try body and in the except handler -- so a None `services`
        # crashed the except handler and the documented fallback was never
        # reached.
        if services is not None:
            services.utils.debugLogToFile(message, "PROMPT_BUILDER")

    try:
        # Protect userPrompt from injection
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')

        # Debug output
        _debug(f"GENERATION PROMPT REQUEST: buildGenerationPrompt called with outputFormat='{outputFormat}', title='{title}'")

        # AI call to generate the appropriate generation prompt
        generationPromptRequest = f"""
You are creating instructions for an AI to generate JSON content in the CANONICAL FORMAT that will be converted to a {outputFormat} document.

User request: "{safeUserPrompt}"
Document title: "{title}"
Target format: {outputFormat}

Write clear, detailed instructions that tell the AI how to generate JSON content using the CANONICAL JSON FORMAT. Focus on:

1. What content is most important for the user
2. How to structure and organize the content using the canonical JSON format with 'sections'
3. Specific formatting requirements for the target format
4. Language requirements to preserve
5. How to ensure the JSON content meets the user's needs

CRITICAL: The AI MUST generate content using the CANONICAL JSON FORMAT with this exact structure:
{{
  "metadata": {{
    "title": "Document Title"
  }},
  "sections": [
    {{
      "id": "section_1",
      "content_type": "heading",
      "elements": [
        {{
          "level": 1,
          "text": "1. SECTION TITLE"
        }}
      ],
      "order": 1
    }},
    {{
      "id": "section_2",
      "content_type": "paragraph",
      "elements": [
        {{
          "text": "This is the actual content that should be extracted from the document."
        }}
      ],
      "order": 2
    }},
    {{
      "id": "section_3",
      "content_type": "table",
      "elements": [
        {{
          "headers": ["Column 1", "Column 2", "Column 3"],
          "rows": [
            ["Value 1", "Value 2", "Value 3"],
            ["Value 4", "Value 5", "Value 6"]
          ]
        }}
      ],
      "order": 3
    }}
  ]
}}

The AI should NOT create format-specific structures like "sheets" or "columns" - only use the canonical format with "sections" and "elements".

Write the instructions as plain text, not JSON. Start with "Generate JSON content that..." and provide clear, actionable instructions for creating structured JSON data in the canonical format.
"""

        # Call AI service to generate the prompt
        _debug("GENERATION PROMPT REQUEST: Calling AI for generation prompt...")

        # Import and set proper options for AI call
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL

        request = AiCallRequest(prompt=generationPromptRequest, context="", options=request_options)
        response = await aiService.aiObjects.call(request)
        result = response.content if response else ""

        # Replace the placeholder that the AI created with actual format rules.
        # NOTE(review): the meta-prompt above never instructs the AI to emit
        # "PLACEHOLDER_FOR_FORMAT_RULES", so this replace is usually a no-op;
        # confirm intent.
        if result:
            formatRules = _getFormatRules(outputFormat)
            result = result.replace("PLACEHOLDER_FOR_FORMAT_RULES", formatRules)

        # Debug output
        _debug("GENERATION PROMPT: Generated successfully")

        # Save full generation prompt and AI response to debug file - only if debug enabled
        try:
            debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
            if debug_enabled:
                import os
                from datetime import datetime, UTC
                ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
                debug_root = "./test-chat/ai"
                os.makedirs(debug_root, exist_ok=True)
                with open(os.path.join(debug_root, f"{ts}_generation_prompt.txt"), "w", encoding="utf-8") as f:
                    f.write(f"GENERATION PROMPT REQUEST:\n{generationPromptRequest}\n\n")
                    f.write(f"GENERATION PROMPT AI RESPONSE:\n{response.content if response else 'No response'}\n\n")
                    f.write(f"GENERATION PROMPT FINAL:\n{result if result else 'None'}\n")
        except Exception:
            pass

        return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."

    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions
        _debug(f"DEBUG: AI generation prompt failed: {str(e)}")
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}"
|
||||
|
||||
|
||||
def _getFormatRules(outputFormat: str) -> str:
    """
    Get format-specific rules for the generation prompt.

    Args:
        outputFormat: Target format key, matched case-insensitively; known
            keys are "xlsx", "pdf", "docx", "html", "json", "csv", "txt".

    Returns:
        A multi-line rules snippet for the given format, or a generic
        template built from the format name when the format is unknown.
    """
    # Static lookup table of per-format prompt snippets.
    format_rules = {
        "xlsx": """
XLSX Format Rules:
- Create tables with clear headers and organized data
- Use appropriate column widths and formatting
- Include summary information if relevant
- Ensure data is properly structured for spreadsheet analysis
""",
        "pdf": """
PDF Format Rules:
- Create professional document layout
- Use appropriate headings and sections
- Include proper spacing and formatting
- Ensure content is well-organized and readable
""",
        "docx": """
DOCX Format Rules:
- Create professional document layout
- Use appropriate headings and sections
- Include proper spacing and formatting
- Ensure content is well-organized and readable
""",
        "html": """
HTML Format Rules:
- Create clean, semantic HTML structure
- Use appropriate tags for content organization
- Include proper styling classes
- Ensure content is accessible and well-formatted
""",
        "json": """
JSON Format Rules:
- Create well-structured JSON data
- Use appropriate nesting and organization
- Include metadata and context information
- Ensure data is properly formatted and valid
""",
        "csv": """
CSV Format Rules:
- Create clear, organized tabular data
- Use appropriate headers and data types
- Ensure proper CSV formatting
- Include all relevant data in structured format
""",
        "txt": """
TXT Format Rules:
- Create clean, readable text format
- Use appropriate spacing and organization
- Include clear headings and sections
- Ensure content is well-structured and easy to read
"""
    }

    # Unknown formats fall back to a generic template interpolating the name.
    return format_rules.get(outputFormat.lower(), f"""
{outputFormat.upper()} Format Rules:
- Create well-structured content appropriate for {outputFormat}
- Use appropriate formatting and organization
- Ensure content is clear and professional
- Include all relevant information in proper format
""")
|
||||
|
||||
|
||||
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
|
||||
"""
|
||||
Parse user prompt to extract the core extraction intent.
|
||||
"""
|
||||
if not aiService:
|
||||
return f"Extract content from the provided documents and create a {outputFormat} report."
|
||||
|
||||
try:
|
||||
analysis_prompt = f"""
|
||||
Analyze this user request and extract the core extraction intent:
|
||||
|
||||
User request: "{userPrompt}"
|
||||
Target format: {outputFormat}
|
||||
|
||||
Extract the main intent and requirements for document processing. Focus on:
|
||||
1. What content needs to be extracted
|
||||
2. How it should be organized
|
||||
3. Any specific requirements or preferences
|
||||
|
||||
Respond with a clear, concise statement of the extraction intent.
|
||||
"""
|
||||
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||
request_options = AiCallOptions()
|
||||
request_options.operationType = OperationType.GENERAL
|
||||
|
||||
request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
|
||||
response = await aiService.aiObjects.call(request)
|
||||
|
||||
if response and response.content:
|
||||
return response.content.strip()
|
||||
else:
|
||||
return f"Extract content from the provided documents and create a {outputFormat} report."
|
||||
|
||||
except Exception as e:
|
||||
services.utils.debugLogToFile(f"Extraction intent analysis failed: {str(e)}", "PROMPT_BUILDER")
|
||||
return f"Extract content from the provided documents and create a {outputFormat} report."
|
||||
|
||||
|
|
@ -32,7 +32,7 @@ class NeutralizationService:
|
|||
serviceCenter: Service center instance for accessing other services
|
||||
NamesToParse: List of names to parse and replace (case-insensitive)
|
||||
"""
|
||||
self.serviceCenter = serviceCenter
|
||||
self.services = serviceCenter
|
||||
self.interfaceDbApp = serviceCenter.interfaceDbApp
|
||||
|
||||
# Initialize anonymization processors
|
||||
|
|
|
|||
|
|
@ -0,0 +1,264 @@
|
|||
import json
|
||||
import os
|
||||
from typing import Any, Dict, List, Set
|
||||
from datetime import datetime, UTC
|
||||
|
||||
|
||||
class NormalizationService:
|
||||
"""
|
||||
Produces a single canonical table in merged JSON using an AI-provided header mapping
|
||||
and deterministic, in-code value normalization. No language heuristics in code.
|
||||
"""
|
||||
|
||||
    def __init__(self, services):
        """
        Args:
            services: Service center; provides the AI interface
                (``self.services.ai``) used by requestHeaderMapping.
        """
        self.services = services
||||
# Public API
|
||||
def discoverStructures(self, mergedJson: Dict[str, Any]) -> Dict[str, Any]:
|
||||
headers: Set[str] = set()
|
||||
samples: Dict[str, List[str]] = {}
|
||||
|
||||
sections = mergedJson.get("sections", []) if isinstance(mergedJson, dict) else []
|
||||
for section in sections:
|
||||
if not isinstance(section, dict):
|
||||
continue
|
||||
|
||||
# Use only the fundamental agreed JSON structure: content_type/elements
|
||||
if section.get("content_type") != "table":
|
||||
continue
|
||||
|
||||
# Extract table data from elements array
|
||||
hdrs = []
|
||||
rows = []
|
||||
for element in section.get("elements", []):
|
||||
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
||||
hdrs = element.get("headers") or []
|
||||
rows = element.get("rows") or []
|
||||
break
|
||||
|
||||
if not hdrs or not rows:
|
||||
continue
|
||||
|
||||
for h in hdrs:
|
||||
if not isinstance(h, str):
|
||||
continue
|
||||
headers.add(h)
|
||||
# collect small value samples by column index
|
||||
for row in rows[:5]:
|
||||
if not isinstance(row, list):
|
||||
continue
|
||||
for i, value in enumerate(row):
|
||||
headerName = hdrs[i] if i < len(hdrs) else f"col_{i}"
|
||||
if headerName not in samples:
|
||||
samples[headerName] = []
|
||||
if len(samples[headerName]) < 5:
|
||||
samples[headerName].append(str(value))
|
||||
|
||||
return {
|
||||
"tableHeaders": sorted(list(headers)),
|
||||
"headerSamples": samples,
|
||||
}
|
||||
|
||||
async def requestHeaderMapping(self, inventory: Dict[str, Any], cacheKey: str, canonicalSpec: Dict[str, Any] | None = None, mergePrompt: str | None = None) -> Dict[str, Any]:
|
||||
|
||||
# Allow caller to specify any canonical schema. If none provided, default to discovered headers.
|
||||
if canonicalSpec is None:
|
||||
canonicalSpec = {
|
||||
"canonicalHeaders": inventory.get("tableHeaders", []),
|
||||
"constraints": {}
|
||||
}
|
||||
|
||||
# Protect merge prompt context by wrapping in single quotes and escaping internal quotes
|
||||
protectedMerge = None
|
||||
if mergePrompt:
|
||||
try:
|
||||
protectedMerge = str(mergePrompt).replace("'", "\\'")
|
||||
except Exception:
|
||||
protectedMerge = str(mergePrompt)
|
||||
|
||||
prompt = (
|
||||
"You are a mapping generator. Return ONLY JSON.\n\n"
|
||||
"Given discovered headers and sample values, map them to the canonical headers.\n"
|
||||
"Do not invent fields. Use null if no mapping. Provide normalization policy.\n\n"
|
||||
f"CANONICAL_SPEC:\n{json.dumps(canonicalSpec, ensure_ascii=False, indent=2)}\n\n"
|
||||
f"HEADERS_DISCOVERED:\n{json.dumps(inventory, ensure_ascii=False, indent=2)}\n\n"
|
||||
+ (f"MERGE_PROMPT_CONTEXT (protected):\n'{protectedMerge}'\n\n" if protectedMerge is not None else "") +
|
||||
"REPLY JSON SHAPE:\n(Example)\n"
|
||||
"{\n \"mappings\": {\"<sourceHeader>\": \"<Canonical>|null\"},\n"
|
||||
" \"normalizationPolicy\": {\n \"TotalAmount\": {\"decimalSeparator\": \",\"|\".\"},\n"
|
||||
" \"Currency\": {\"stripSymbols\": true},\n"
|
||||
" \"Date\": {\"formats\": [\"DD.MM.YYYY\",\"YYYY-MM-DD\"]}\n }\n}\n"
|
||||
)
|
||||
|
||||
response = await self.services.ai.callAi(prompt=prompt)
|
||||
if not response:
|
||||
return {"mapping": {}, "normalizationPolicy": {}}
|
||||
|
||||
# Extract JSON from response more safely
|
||||
start_idx = response.find('{')
|
||||
end_idx = response.rfind('}')
|
||||
if start_idx == -1 or end_idx == -1 or start_idx >= end_idx:
|
||||
return {"mapping": {}, "normalizationPolicy": {}}
|
||||
|
||||
js = response[start_idx:end_idx + 1]
|
||||
try:
|
||||
mapping = json.loads(js)
|
||||
except json.JSONDecodeError:
|
||||
return {"mapping": {}, "normalizationPolicy": {}}
|
||||
# Normalize key naming from AI: prefer single key "mapping"
|
||||
if "mapping" not in mapping and "mappings" in mapping and isinstance(mapping["mappings"], dict):
|
||||
mapping["mapping"] = mapping["mappings"]
|
||||
try:
|
||||
del mapping["mappings"]
|
||||
except Exception:
|
||||
pass
|
||||
# Ensure canonicalHeaders present in mapping for downstream use
|
||||
if "canonicalHeaders" not in mapping:
|
||||
mapping["canonicalHeaders"] = canonicalSpec.get("canonicalHeaders", [])
|
||||
|
||||
# debug artifact
|
||||
self._writeDebugArtifact("mapping.json", mapping)
|
||||
return mapping
|
||||
|
||||
def applyMapping(self, mergedJson: Dict[str, Any], mappingSpec: Dict[str, Any]) -> Dict[str, Any]:
|
||||
mappings = (mappingSpec or {}).get("mapping", {})
|
||||
policy = (mappingSpec or {}).get("normalizationPolicy", {})
|
||||
|
||||
# Prefer headers provided by mapping (generic across domains)
|
||||
canonicalHeaders = (mappingSpec or {}).get("canonicalHeaders") or []
|
||||
if not canonicalHeaders:
|
||||
# Fallback to union of mapped targets
|
||||
canonicalHeaders = sorted(list({t for t in mappings.values() if t}))
|
||||
|
||||
rows: List[List[str]] = []
|
||||
sections = mergedJson.get("sections", []) if isinstance(mergedJson, dict) else []
|
||||
for section in sections:
|
||||
# Use only the fundamental agreed JSON structure: content_type/elements
|
||||
if section.get("content_type") != "table":
|
||||
continue
|
||||
|
||||
# Extract table data from elements array
|
||||
sourceHeaders = []
|
||||
sourceRows = []
|
||||
for element in section.get("elements", []):
|
||||
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
||||
sourceHeaders = element.get("headers") or []
|
||||
sourceRows = element.get("rows") or []
|
||||
break
|
||||
|
||||
if not sourceHeaders or not sourceRows:
|
||||
continue
|
||||
|
||||
# Build index map: canonical -> source index or None
|
||||
indexMap: Dict[str, int] = {}
|
||||
for ci, ch in enumerate(canonicalHeaders):
|
||||
srcIndex = None
|
||||
for si, sh in enumerate(sourceHeaders):
|
||||
# Prefer explicit mapping target; fallback to identity when names match
|
||||
target = mappings.get(sh)
|
||||
if target is None and sh == ch:
|
||||
target = ch
|
||||
if target == ch:
|
||||
srcIndex = si
|
||||
break
|
||||
indexMap[ch] = srcIndex
|
||||
|
||||
# Transform rows
|
||||
for r in sourceRows:
|
||||
canonicalRow: List[str] = []
|
||||
for ch in canonicalHeaders:
|
||||
idx = indexMap.get(ch)
|
||||
try:
|
||||
value = r[idx] if (idx is not None and idx < len(r)) else ""
|
||||
except (IndexError, KeyError) as e:
|
||||
# Handle corrupted data gracefully
|
||||
value = ""
|
||||
canonicalRow.append(self._normalizeValue(ch, value, policy))
|
||||
# consider as row if at least one non-empty meaningful field
|
||||
if any(v.strip() for v in canonicalRow):
|
||||
rows.append(canonicalRow)
|
||||
|
||||
canonical = {
|
||||
"metadata": {
|
||||
"title": mergedJson.get("metadata", {}).get("title", "Merged Document"),
|
||||
"source_documents": mergedJson.get("metadata", {}).get("source_documents", [])
|
||||
},
|
||||
"sections": [
|
||||
{
|
||||
"id": "canonical_table_1",
|
||||
"content_type": "table",
|
||||
"elements": [
|
||||
{
|
||||
"headers": canonicalHeaders,
|
||||
"rows": rows
|
||||
}
|
||||
],
|
||||
"order": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
# debug artifact
|
||||
self._writeDebugArtifact("canonical_merged.json", canonical)
|
||||
return canonical
|
||||
|
||||
def validateCanonical(self, canonicalJson: Dict[str, Any]) -> Dict[str, Any]:
|
||||
rows = []
|
||||
try:
|
||||
sections = canonicalJson.get("sections", [])
|
||||
for s in sections:
|
||||
if s.get("content_type") == "table":
|
||||
# Extract rows from elements array
|
||||
for element in s.get("elements", []):
|
||||
if isinstance(element, dict) and "rows" in element:
|
||||
rows.extend(element.get("rows", []))
|
||||
except Exception:
|
||||
rows = []
|
||||
report = {
|
||||
"rowCount": len(rows),
|
||||
"success": len(rows) > 0
|
||||
}
|
||||
self._writeDebugArtifact("normalization_report.json", report)
|
||||
return report
|
||||
|
||||
# Internal helpers
|
||||
def _normalizeValue(self, canonicalHeader: str, value: Any, policy: Dict[str, Any]) -> str:
|
||||
if value is None:
|
||||
return ""
|
||||
text = str(value).strip()
|
||||
# Generic normalization guided by policy; avoid domain specifics
|
||||
if canonicalHeader in (policy.get("numericFields", []) or []):
|
||||
dec = ((policy.get(canonicalHeader) or {}).get("decimalSeparator")
|
||||
or (policy.get("numeric") or {}).get("decimalSeparator")
|
||||
or ".")
|
||||
if dec == ",":
|
||||
text = text.replace(".", "").replace(",", ".") if "," in text else text
|
||||
text = ''.join(ch for ch in text if ch.isdigit() or ch in ['.', '-', '+'])
|
||||
elif (policy.get("text") or {}).get("stripSymbols") and canonicalHeader in (policy.get("text", {}).get("applyTo", []) or []):
|
||||
text = ''.join(ch for ch in text if ch.isalpha())
|
||||
text = text.upper()
|
||||
return text
|
||||
|
||||
def _writeDebugArtifact(self, fileName: str, obj: Any) -> None:
|
||||
try:
|
||||
debugEnabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
||||
if not debugEnabled:
|
||||
return
|
||||
root = "./test-chat/ai"
|
||||
os.makedirs(root, exist_ok=True)
|
||||
# Prefix timestamp for files that are frequently overwritten
|
||||
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
||||
if fileName in ("mapping.json", "canonical_merged.json"):
|
||||
outName = f"{ts}_{fileName}"
|
||||
else:
|
||||
outName = fileName
|
||||
path = os.path.join(root, outName)
|
||||
with open(path, "w", encoding="utf-8") as f:
|
||||
if isinstance(obj, (dict, list)):
|
||||
f.write(json.dumps(obj, ensure_ascii=False, indent=2))
|
||||
else:
|
||||
f.write(str(obj))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
|
|
@ -21,7 +21,7 @@ class SharepointService:
|
|||
|
||||
Use setAccessTokenFromConnection() method to configure the access token before making API calls.
|
||||
"""
|
||||
self.serviceCenter = serviceCenter
|
||||
self.services = serviceCenter
|
||||
self.access_token = None
|
||||
self.base_url = "https://graph.microsoft.com/v1.0"
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ class TicketService:
|
|||
Args:
|
||||
serviceCenter: Service center instance for accessing other services
|
||||
"""
|
||||
self.serviceCenter = serviceCenter
|
||||
self.services = serviceCenter
|
||||
|
||||
async def _createTicketInterfaceByType(
|
||||
self,
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ Provides centralized access to configuration, events, and other utilities.
|
|||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Optional, Dict, Callable
|
||||
from modules.shared.configuration import APP_CONFIG
|
||||
from modules.shared.eventManagement import eventManager
|
||||
|
|
@ -140,3 +141,42 @@ class UtilsService:
|
|||
except Exception as e:
|
||||
logger.error(f"Error getting fresh token for connection {connectionId}: {str(e)}")
|
||||
return None
|
||||
|
||||
def debugLogToFile(self, message: str, context: str = "DEBUG"):
    """
    Append a debug message to the workflow debug log file, if enabled.

    No-op unless APP_DEBUG_CHAT_WORKFLOW_ENABLED is truthy. All errors are
    swallowed deliberately so debug logging can never break the caller
    (and to avoid recursion through the logging machinery).

    Args:
        message: Debug message to log.
        context: Context identifier prefixed to the log entry.
    """
    try:
        # Bail out early when debug logging is disabled.
        if not self.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False):
            return

        # Resolve the debug directory; relative paths are anchored at the
        # gateway directory (four levels above this file).
        debug_dir = self.configGet("APP_DEBUG_CHAT_WORKFLOW_DIR", "./test-chat")
        if not os.path.isabs(debug_dir):
            gateway_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
            debug_dir = os.path.join(gateway_dir, debug_dir)

        # Ensure the debug directory exists.
        os.makedirs(debug_dir, exist_ok=True)

        debug_file = os.path.join(debug_dir, "debug_workflow.log")

        # One line per entry: [timestamp] [context] message
        timestamp = self.getUtcTimestamp()
        debug_entry = f"[{timestamp}] [{context}] {message}\n"

        with open(debug_file, "a", encoding="utf-8") as f:
            f.write(debug_entry)

    except Exception:
        # Don't log debug errors to avoid recursion.
        pass
|
||||
|
|
@ -16,7 +16,7 @@ class WorkflowService:
|
|||
"""Service class containing methods for document processing, chat operations, and workflow management"""
|
||||
|
||||
def __init__(self, serviceCenter):
|
||||
self.serviceCenter = serviceCenter
|
||||
self.services = serviceCenter
|
||||
self.user = serviceCenter.user
|
||||
self.workflow = serviceCenter.workflow
|
||||
self.interfaceDbChat = serviceCenter.interfaceDbChat
|
||||
|
|
@ -78,11 +78,15 @@ class WorkflowService:
|
|||
def getChatDocumentsFromDocumentList(self, documentList: List[str]) -> List[ChatDocument]:
|
||||
"""Get ChatDocuments from a list of document references using all three formats."""
|
||||
try:
|
||||
# Get the current workflow from services (same pattern as setWorkflowContext)
|
||||
workflow = getattr(self.serviceCenter, 'currentWorkflow', None) or self.workflow
|
||||
if not workflow:
|
||||
logger.error("No workflow available for document list resolution")
|
||||
return []
|
||||
workflow = self.services.currentWorkflow
|
||||
|
||||
# Reload workflow from database to ensure we have all messages
|
||||
if hasattr(workflow, 'id'):
|
||||
try:
|
||||
workflow = self.getWorkflow(workflow.id)
|
||||
logger.debug(f"Reloaded workflow {workflow.id} with {len(workflow.messages)} messages")
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not reload workflow from database: {str(e)}")
|
||||
|
||||
all_documents = []
|
||||
for doc_ref in documentList:
|
||||
|
|
@ -125,7 +129,9 @@ class WorkflowService:
|
|||
break
|
||||
|
||||
if not message_found:
|
||||
logger.warning(f"Message with ID {message_id} not found in workflow. Available message IDs: {[str(msg.id) for msg in workflow.messages]}")
|
||||
available_ids = [str(msg.id) for msg in workflow.messages]
|
||||
logger.error(f"Message with ID {message_id} not found in workflow. Available message IDs: {available_ids}")
|
||||
raise ValueError(f"Document reference not found: docList:{message_id}:{label}")
|
||||
elif len(parts) >= 2:
|
||||
# Format: docList:<label> - find message by documentsLabel
|
||||
label = parts[1]
|
||||
|
|
@ -154,7 +160,8 @@ class WorkflowService:
|
|||
else:
|
||||
logger.debug(f"Found docList reference {doc_ref} but message has no documents")
|
||||
else:
|
||||
logger.debug(f"No messages found with documentsLabel: {label}")
|
||||
logger.error(f"No messages found with documentsLabel: {label}")
|
||||
raise ValueError(f"Document reference not found: docList:{label}")
|
||||
else:
|
||||
# Direct label reference (round1_task2_action3_contextinfo)
|
||||
# Search for messages with matching documentsLabel to find the actual documents
|
||||
|
|
@ -198,30 +205,8 @@ class WorkflowService:
|
|||
else:
|
||||
logger.debug(f"No documents found in newest message {newest_message.id}")
|
||||
else:
|
||||
logger.debug(f"No messages found with documentsLabel: {doc_ref}")
|
||||
# Fallback: also check if any message has this documentsLabel as a prefix
|
||||
logger.debug(f"Trying fallback search for messages with documentsLabel containing: {doc_ref}")
|
||||
fallback_messages = []
|
||||
for message in workflow.messages:
|
||||
msg_documents_label = getattr(message, 'documentsLabel', '')
|
||||
if msg_documents_label and msg_documents_label.startswith(doc_ref):
|
||||
fallback_messages.append(message)
|
||||
logger.debug(f"Found fallback message {message.id} with documentsLabel: {msg_documents_label}")
|
||||
|
||||
if fallback_messages:
|
||||
# Sort by publishedAt descending (newest first)
|
||||
fallback_messages.sort(key=lambda msg: getattr(msg, 'publishedAt', 0), reverse=True)
|
||||
newest_fallback = fallback_messages[0]
|
||||
|
||||
logger.debug(f"Using fallback message {newest_fallback.id} with documentsLabel: {getattr(newest_fallback, 'documentsLabel', 'unknown')}")
|
||||
if newest_fallback.documents:
|
||||
doc_names = [doc.fileName for doc in newest_fallback.documents if hasattr(doc, 'fileName')]
|
||||
logger.debug(f"Added {len(newest_fallback.documents)} documents from fallback message {newest_fallback.id}: {doc_names}")
|
||||
all_documents.extend(newest_fallback.documents)
|
||||
else:
|
||||
logger.debug(f"No documents found in fallback message {newest_fallback.id}")
|
||||
else:
|
||||
logger.debug(f"No fallback messages found either")
|
||||
logger.error(f"No messages found with documentsLabel: {doc_ref}")
|
||||
raise ValueError(f"Document reference not found: {doc_ref}")
|
||||
|
||||
logger.debug(f"Resolved {len(all_documents)} documents from document list: {documentList}")
|
||||
return all_documents
|
||||
|
|
@ -260,7 +245,8 @@ class WorkflowService:
|
|||
token_status = f"error: {str(e)}"
|
||||
|
||||
# Build enhanced reference with state information
|
||||
base_ref = f"connection:{connection.authority.value}:{connection.externalUsername}:{connection.id}"
|
||||
# Format: connection:msft:<username> (without UUID)
|
||||
base_ref = f"connection:{connection.authority.value}:{connection.externalUsername}"
|
||||
state_info = f" [status:{connection.status.value}, token:{token_status}]"
|
||||
|
||||
logger.debug(f"getConnectionReferenceFromUserConnection: Built reference: {base_ref + state_info}")
|
||||
|
|
@ -283,26 +269,25 @@ class WorkflowService:
|
|||
return None
|
||||
|
||||
def getUserConnectionFromConnectionReference(self, connectionReference: str) -> Optional[UserConnection]:
|
||||
"""Get UserConnection from reference string (handles both old and enhanced formats)"""
|
||||
"""Get UserConnection from reference string (handles new format without UUID)"""
|
||||
try:
|
||||
# Parse reference format: connection:{authority}:{username}:{id} [status:..., token:...]
|
||||
# Parse reference format: connection:{authority}:{username} [status:..., token:...]
|
||||
# Remove state information if present
|
||||
base_reference = connectionReference.split(' [')[0]
|
||||
|
||||
parts = base_reference.split(':')
|
||||
if len(parts) != 4 or parts[0] != "connection":
|
||||
if len(parts) != 3 or parts[0] != "connection":
|
||||
return None
|
||||
|
||||
authority = parts[1]
|
||||
username = parts[2]
|
||||
conn_id = parts[3]
|
||||
|
||||
# Get user connections through AppObjects interface
|
||||
user_connections = self.interfaceDbApp.getUserConnections(self.user.id)
|
||||
|
||||
# Find matching connection
|
||||
# Find matching connection by authority and username (no UUID needed)
|
||||
for conn in user_connections:
|
||||
if str(conn.id) == conn_id and conn.authority.value == authority and conn.externalUsername == username:
|
||||
if conn.authority.value == authority and conn.externalUsername == username:
|
||||
return conn
|
||||
return None
|
||||
|
||||
|
|
@ -437,11 +422,7 @@ class WorkflowService:
|
|||
def setWorkflowContext(self, round_number: int = None, task_number: int = None, action_number: int = None):
|
||||
"""Set current workflow context for document generation and routing"""
|
||||
try:
|
||||
# Get the current workflow from services
|
||||
workflow = getattr(self.serviceCenter, 'currentWorkflow', None) or self.workflow
|
||||
if not workflow:
|
||||
logger.error("No workflow available for context setting")
|
||||
return
|
||||
workflow = self.services.currentWorkflow
|
||||
|
||||
# Prepare update data
|
||||
update_data = {}
|
||||
|
|
@ -548,10 +529,7 @@ class WorkflowService:
|
|||
def getDocumentCount(self) -> str:
|
||||
"""Get document count for task planning (matching old handlingTasks.py logic)"""
|
||||
try:
|
||||
# Get the current workflow from services
|
||||
workflow = getattr(self.serviceCenter, 'currentWorkflow', None) or self.workflow
|
||||
if not workflow:
|
||||
return "No documents available"
|
||||
workflow = self.services.currentWorkflow
|
||||
|
||||
# Count documents from all messages in the workflow (like old system)
|
||||
total_docs = 0
|
||||
|
|
@ -570,10 +548,7 @@ class WorkflowService:
|
|||
def getWorkflowHistoryContext(self) -> str:
|
||||
"""Get workflow history context for task planning (matching old handlingTasks.py logic)"""
|
||||
try:
|
||||
# Get the current workflow from services
|
||||
workflow = getattr(self.serviceCenter, 'currentWorkflow', None) or self.workflow
|
||||
if not workflow:
|
||||
return "No previous round context available"
|
||||
workflow = self.services.currentWorkflow
|
||||
|
||||
# Check if there are any previous rounds by looking for "first" messages
|
||||
has_previous_rounds = False
|
||||
|
|
@ -622,15 +597,26 @@ class WorkflowService:
|
|||
if not workflow or not hasattr(workflow, 'messages'):
|
||||
return "No documents available"
|
||||
|
||||
# Use the provided workflow object directly to avoid database reload issues
|
||||
# that can cause filename truncation. The workflow object should already be up-to-date.
|
||||
logger.debug(f"Using provided workflow object for getAvailableDocuments (ID: {workflow.id if hasattr(workflow, 'id') else 'unknown'})")
|
||||
|
||||
# Debug: Check document filenames in the workflow object
|
||||
if hasattr(workflow, 'messages') and workflow.messages:
|
||||
for message in workflow.messages:
|
||||
if hasattr(message, 'documents') and message.documents:
|
||||
for doc in message.documents:
|
||||
logger.debug(f"Workflow document {doc.id}: fileName='{doc.fileName}' (length: {len(doc.fileName)})")
|
||||
|
||||
# Get document reference list using the exact same logic as old system
|
||||
document_list = self._getDocumentReferenceList(workflow)
|
||||
|
||||
# Build technical context string for AI action planning (exact copy of old system)
|
||||
context = "AVAILABLE DOCUMENTS:\n\n"
|
||||
# Build index string for AI action planning
|
||||
context = ""
|
||||
|
||||
# Process chat exchanges (current round) - exact copy of old system
|
||||
# Process current round exchanges first
|
||||
if document_list["chat"]:
|
||||
context += "CURRENT ROUND DOCUMENTS:\n"
|
||||
context += "\nCurrent round documents:\n"
|
||||
for exchange in document_list["chat"]:
|
||||
# Generate docList reference for the exchange (using message ID and label)
|
||||
# Find the message that corresponds to this exchange
|
||||
|
|
@ -656,9 +642,9 @@ class WorkflowService:
|
|||
context += f" - docItem:{doc_ref}\n"
|
||||
context += "\n"
|
||||
|
||||
# Process history exchanges (previous rounds) - exact copy of old system
|
||||
# Process previous rounds after
|
||||
if document_list["history"]:
|
||||
context += "WORKFLOW HISTORY DOCUMENTS:\n"
|
||||
context += "\nPast rounds documents:\n"
|
||||
for exchange in document_list["history"]:
|
||||
# Generate docList reference for the exchange (using message ID and label)
|
||||
# Find the message that corresponds to this exchange
|
||||
|
|
@ -685,7 +671,7 @@ class WorkflowService:
|
|||
context += "\n"
|
||||
|
||||
if not document_list["chat"] and not document_list["history"]:
|
||||
context += "NO DOCUMENTS AVAILABLE - This workflow has no documents to process.\n"
|
||||
context += "\nNO DOCUMENTS AVAILABLE - This workflow has no documents to process.\n"
|
||||
|
||||
return context
|
||||
|
||||
|
|
@ -713,39 +699,23 @@ class WorkflowService:
|
|||
for message in reversed(workflow.messages):
|
||||
is_first = message.status == "first" if hasattr(message, 'status') else False
|
||||
|
||||
# Build a DocumentExchange if message has documents
|
||||
# Build a DocumentExchange if message has documents and an explicit documentsLabel
|
||||
doc_exchange = None
|
||||
if message.documents:
|
||||
if message.actionId and message.documentsLabel:
|
||||
# Validate that we use the same label as in the message
|
||||
existing_label = getattr(message, 'documentsLabel', None)
|
||||
if existing_label:
|
||||
# Validate and use the message's actual documentsLabel
|
||||
validated_label = self._validateDocumentLabelConsistency(message)
|
||||
|
||||
# Use the message's actual documentsLabel
|
||||
doc_refs = []
|
||||
for doc in message.documents:
|
||||
doc_ref = self._getDocumentReferenceFromChatDocument(doc, message)
|
||||
doc_refs.append(doc_ref)
|
||||
|
||||
doc_exchange = {
|
||||
'documentsLabel': validated_label,
|
||||
'documents': doc_refs
|
||||
}
|
||||
else:
|
||||
# Generate new labels for documents without explicit labels
|
||||
doc_refs = []
|
||||
for doc in message.documents:
|
||||
doc_ref = self._getDocumentReferenceFromChatDocument(doc, message)
|
||||
doc_refs.append(doc_ref)
|
||||
|
||||
if doc_refs:
|
||||
# Create a label based on message context
|
||||
context_prefix = self._generateWorkflowContextPrefix(message)
|
||||
context_label = f"{context_prefix}_context"
|
||||
|
||||
doc_exchange = {
|
||||
'documentsLabel': context_label,
|
||||
'documents': doc_refs
|
||||
}
|
||||
# IMPORTANT: Never synthesize new labels here. If a message lacks
|
||||
# a documentsLabel, we skip adding an exchange for it.
|
||||
|
||||
# Append to appropriate container based on boundary
|
||||
if doc_exchange:
|
||||
|
|
@ -773,12 +743,22 @@ class WorkflowService:
|
|||
"""Update file attributes (fileName, fileSize, mimeType) for documents"""
|
||||
for doc in documents:
|
||||
try:
|
||||
# Debug: Log original filename before refresh
|
||||
original_filename = doc.fileName
|
||||
logger.debug(f"Before refresh - Document {doc.id}: fileName='{original_filename}' (length: {len(original_filename)})")
|
||||
|
||||
# Use the proper WorkflowService method to get file info
|
||||
file_info = self.getFileInfo(doc.fileId)
|
||||
if file_info:
|
||||
db_filename = file_info.get("fileName", doc.fileName)
|
||||
logger.debug(f"Database filename for {doc.id}: '{db_filename}' (length: {len(db_filename)})")
|
||||
|
||||
doc.fileName = file_info.get("fileName", doc.fileName)
|
||||
doc.fileSize = file_info.get("size", doc.fileSize)
|
||||
doc.mimeType = file_info.get("mimeType", doc.mimeType)
|
||||
|
||||
# Debug: Log final filename after refresh
|
||||
logger.debug(f"After refresh - Document {doc.id}: fileName='{doc.fileName}' (length: {len(doc.fileName)})")
|
||||
else:
|
||||
logger.warning(f"File not found for document {doc.id}, fileId: {doc.fileId}")
|
||||
except Exception as e:
|
||||
|
|
@ -794,6 +774,8 @@ class WorkflowService:
|
|||
def _getDocumentReferenceFromChatDocument(self, document, message) -> str:
|
||||
"""Get document reference using document ID and filename."""
|
||||
try:
|
||||
# Debug logging to track filename truncation
|
||||
logger.debug(f"Creating document reference for {document.id}: fileName='{document.fileName}' (length: {len(document.fileName)})")
|
||||
# Use document ID and filename for simple reference
|
||||
return f"docItem:{document.id}:{document.fileName}"
|
||||
except Exception as e:
|
||||
|
|
@ -844,14 +826,14 @@ class WorkflowService:
|
|||
"""Get connection reference list (matching old handlingTasks.py logic)"""
|
||||
try:
|
||||
# Get connections from the database using the same logic as the old system
|
||||
if hasattr(self.serviceCenter, 'interfaceDbApp') and hasattr(self.serviceCenter, 'user'):
|
||||
userId = self.serviceCenter.user.id
|
||||
connections = self.serviceCenter.interfaceDbApp.getUserConnections(userId)
|
||||
if hasattr(self.services, 'interfaceDbApp') and hasattr(self.services, 'user'):
|
||||
userId = self.services.user.id
|
||||
connections = self.services.interfaceDbApp.getUserConnections(userId)
|
||||
if connections:
|
||||
# Format connections as reference strings using the same pattern as the old system
|
||||
connectionRefs = []
|
||||
for conn in connections:
|
||||
# Create reference string in format: connection:{authority}:{username}:{id} [status:..., token:...]
|
||||
# Create reference string in format: connection:{authority}:{username} [status:..., token:...]
|
||||
# This matches the format expected by getUserConnectionFromConnectionReference()
|
||||
ref = self.getConnectionReferenceFromUserConnection(conn)
|
||||
connectionRefs.append(ref)
|
||||
|
|
|
|||
|
|
@ -42,9 +42,7 @@ class MethodDocument(MethodBase):
|
|||
- operationType (str, optional): extract_content | analyze_document | summarize_content. Default: extract_content.
|
||||
- processDocumentsIndividually (bool, optional): Process each document separately. Default: True.
|
||||
- chunkAllowed (bool, optional): Allow chunking for large inputs. Default: True.
|
||||
- mergeStrategy (dict, optional): Merge strategy for chunked content.
|
||||
- expectedDocumentFormats (list, optional): Desired output format specs.
|
||||
- includeMetadata (bool, optional): Include file metadata. Default: True.
|
||||
- outputMimeType (str, optional): MIME type for output file. Options: "text/plain" (default), "application/json", "text/csv", "text/html". Default: "text/plain".
|
||||
"""
|
||||
try:
|
||||
documentList = parameters.get("documentList")
|
||||
|
|
@ -54,13 +52,7 @@ class MethodDocument(MethodBase):
|
|||
operationType = parameters.get("operationType", "extract_content")
|
||||
processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
|
||||
chunkAllowed = parameters.get("chunkAllowed", True)
|
||||
mergeStrategy = parameters.get("mergeStrategy", {
|
||||
"groupBy": "typeGroup",
|
||||
"orderBy": "id",
|
||||
"mergeType": "concatenate"
|
||||
})
|
||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
||||
includeMetadata = parameters.get("includeMetadata", True)
|
||||
outputMimeType = parameters.get("outputMimeType", "text/plain")
|
||||
|
||||
if not documentList:
|
||||
return ActionResult.isFailure(
|
||||
|
|
@ -87,19 +79,16 @@ class MethodDocument(MethodBase):
|
|||
compressContext=not chunkAllowed
|
||||
)
|
||||
|
||||
# Add format instructions to prompt if expected formats are provided
|
||||
# Add format instructions to prompt based on MIME type
|
||||
enhanced_prompt = prompt
|
||||
if expectedDocumentFormats:
|
||||
format_instructions = []
|
||||
for fmt in expectedDocumentFormats:
|
||||
extension = fmt.get("extension", ".txt")
|
||||
mime_type = fmt.get("mimeType", "text/plain")
|
||||
description = fmt.get("description", "")
|
||||
format_instructions.append(f"- {extension} ({mime_type}): {description}")
|
||||
|
||||
if format_instructions:
|
||||
enhanced_prompt += f"\n\nPlease format the output as: {', '.join([fmt.get('extension', '.txt') for fmt in expectedDocumentFormats])}"
|
||||
enhanced_prompt += f"\nExpected formats:\n" + "\n".join(format_instructions)
|
||||
mime_type_mapping = {
|
||||
"text/plain": (".txt", "Plain text format"),
|
||||
"application/json": (".json", "Structured JSON format"),
|
||||
"text/csv": (".csv", "Table format"),
|
||||
"text/html": (".html", "HTML format")
|
||||
}
|
||||
extension, description = mime_type_mapping.get(outputMimeType, (".txt", "Plain text format"))
|
||||
enhanced_prompt += f"\n\nPlease format the output as {extension} ({outputMimeType}): {description}"
|
||||
|
||||
# Use enhanced AI service for extraction
|
||||
ai_response = await self.services.ai.callAi(
|
||||
|
|
@ -125,8 +114,16 @@ class MethodDocument(MethodBase):
|
|||
for i, chatDocument in enumerate(chatDocuments):
|
||||
# Use the AI response directly - it already contains processed content
|
||||
final_content = ai_response
|
||||
final_mime_type = "text/plain"
|
||||
final_extension = ".txt"
|
||||
|
||||
# Determine output format based on MIME type
|
||||
mime_type_mapping = {
|
||||
"text/plain": ".txt",
|
||||
"application/json": ".json",
|
||||
"text/csv": ".csv",
|
||||
"text/html": ".html"
|
||||
}
|
||||
final_extension = mime_type_mapping.get(outputMimeType, ".txt")
|
||||
final_mime_type = outputMimeType
|
||||
|
||||
# Create meaningful output fileName with workflow context
|
||||
original_fileName = chatDocument.fileName
|
||||
|
|
@ -156,9 +153,6 @@ class MethodDocument(MethodBase):
|
|||
error=str(e)
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
@action
|
||||
async def generate(self, parameters: Dict[str, Any]) -> ActionResult:
|
||||
"""
|
||||
|
|
@ -175,8 +169,6 @@ class MethodDocument(MethodBase):
|
|||
- operationType (str, optional): generate_report | analyze_documents. Default: generate_report.
|
||||
- processDocumentsIndividually (bool, optional): Process per document. Default: True.
|
||||
- chunkAllowed (bool, optional): Allow chunking for large inputs. Default: True.
|
||||
- mergeStrategy (dict, optional): Merging rules for multi-part generation.
|
||||
- includeMetadata (bool, optional): Include file metadata. Default: True.
|
||||
"""
|
||||
try:
|
||||
documentList = parameters.get("documentList")
|
||||
|
|
@ -188,12 +180,6 @@ class MethodDocument(MethodBase):
|
|||
operationType = parameters.get("operationType", "generate_report")
|
||||
processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
|
||||
chunkAllowed = parameters.get("chunkAllowed", True)
|
||||
mergeStrategy = parameters.get("mergeStrategy", {
|
||||
"groupBy": "typeGroup",
|
||||
"orderBy": "id",
|
||||
"mergeType": "concatenate"
|
||||
})
|
||||
includeMetadata = parameters.get("includeMetadata", True)
|
||||
|
||||
if not documentList:
|
||||
return ActionResult.isFailure(
|
||||
|
|
@ -31,14 +31,14 @@ class MethodAi(MethodBase):
|
|||
async def process(self, parameters: Dict[str, Any]) -> ActionResult:
|
||||
"""
|
||||
GENERAL:
|
||||
- Purpose: AI-based analysis and content generation with optional document context.
|
||||
- Input requirements: aiPrompt (required); optional documentList, resultType, processingMode, includeMetadata, operationType, priority, maxCost, maxProcessingTime, requiredTags.
|
||||
- Output format: Single or multiple documents in requested format.
|
||||
- Purpose: Process a user prompt with optional unlimited input documents to produce one or many output documents of the SAME format.
|
||||
- Input requirements: aiPrompt (required); optional documentList.
|
||||
- Output format: Exactly one file format to select. For multiple output file formats to do different calls.
|
||||
|
||||
Parameters:
|
||||
- aiPrompt (str, required): Instruction for the AI.
|
||||
- documentList (list, optional): Document reference(s) for context.
|
||||
- resultType (str, optional): Output extension (txt, json, md, csv, xml, html, pdf, docx, xlsx, png). Default: txt.
|
||||
- resultType (str, optional): Output file extension - only one extension allowed (e.g. txt, json, md, csv, xml, html, pdf, docx, xlsx, png, ...). Default: txt.
|
||||
- processingMode (str, optional): basic | advanced | detailed. Default: basic.
|
||||
- includeMetadata (bool, optional): Include metadata when available. Default: True.
|
||||
- operationType (str, optional): general | generate_plan | analyse_content | generate_content | web_research | image_analysis | image_generation. Default: general.
|
||||
|
|
@ -169,12 +169,12 @@ class MethodAi(MethodBase):
|
|||
Parameters:
|
||||
- user_prompt (str, required): Research question or topic.
|
||||
- urls (list, optional): Specific URLs to crawl.
|
||||
- max_results (int, optional): Max search results. Default: 10.
|
||||
- max_pages (int, optional): Max pages to crawl per site. Default: 10.
|
||||
- max_results (int, optional): Max search results. Default: 5.
|
||||
- max_pages (int, optional): Max pages to crawl per site. Default: 5.
|
||||
- search_depth (str, optional): basic | advanced. Default: basic.
|
||||
- extract_depth (str, optional): basic | advanced. Default: advanced.
|
||||
- pages_search_depth (int, optional): Crawl depth level. Default: 2.
|
||||
- country (str, optional): Country code for bias.
|
||||
- country (str, optional): Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries).
|
||||
- time_range (str, optional): d | w | m | y.
|
||||
- topic (str, optional): general | news | academic.
|
||||
- language (str, optional): Language code (e.g., de, en, fr).
|
||||
|
|
@ -182,8 +182,8 @@ class MethodAi(MethodBase):
|
|||
try:
|
||||
user_prompt = parameters.get("user_prompt")
|
||||
urls = parameters.get("urls")
|
||||
max_results = parameters.get("max_results", 10)
|
||||
max_pages = parameters.get("max_pages", 10)
|
||||
max_results = parameters.get("max_results", 5)
|
||||
max_pages = parameters.get("max_pages", 5)
|
||||
search_depth = parameters.get("search_depth", "basic")
|
||||
extract_depth = parameters.get("extract_depth", "advanced")
|
||||
pages_search_depth = parameters.get("pages_search_depth", 2)
|
||||
|
|
|
|||
|
|
@ -154,6 +154,12 @@ class MethodOutlook(MethodBase):
|
|||
if not query or not query.strip():
|
||||
# No query specified, just get emails from folder
|
||||
if folder and folder.lower() != "all":
|
||||
# Use folder name directly for well-known folders, or get folder ID
|
||||
if folder.lower() in ["inbox", "drafts", "sentitems", "deleteditems"]:
|
||||
params["$filter"] = f"parentFolderId eq '{folder}'"
|
||||
else:
|
||||
# For custom folders, we need to get the folder ID first
|
||||
# This will be handled by the calling method
|
||||
params["$filter"] = f"parentFolderId eq '{folder}'"
|
||||
# Add orderby for basic queries
|
||||
params["$orderby"] = "receivedDateTime desc"
|
||||
|
|
@ -191,6 +197,16 @@ class MethodOutlook(MethodBase):
|
|||
|
||||
|
||||
# Use only subject search to keep filter simple
|
||||
# Handle wildcard queries specially
|
||||
if clean_query == "*" or clean_query == "":
|
||||
# For wildcard or empty query, don't use contains filter
|
||||
# Just use folder filter if specified
|
||||
if folder and folder.lower() != "all":
|
||||
params["$filter"] = f"parentFolderId eq '{folder}'"
|
||||
else:
|
||||
# No filter needed for wildcard search across all folders
|
||||
pass
|
||||
else:
|
||||
params["$filter"] = f"contains(subject,'{clean_query}')"
|
||||
|
||||
# Add folder filter if specified
|
||||
|
|
@ -235,6 +251,10 @@ class MethodOutlook(MethodBase):
|
|||
if '@' in filter_text and '.' in filter_text and ' ' not in filter_text and not filter_text.startswith('from:'):
|
||||
return {"$filter": f"from/fromAddress/address eq '{filter_text}'"}
|
||||
|
||||
# Handle OData filter conditions (contains 'eq', 'ne', 'gt', 'lt', etc.)
|
||||
if any(op in filter_text.lower() for op in [' eq ', ' ne ', ' gt ', ' lt ', ' ge ', ' le ', ' and ', ' or ']):
|
||||
return {"$filter": filter_text}
|
||||
|
||||
# Handle text content - search in subject
|
||||
return {"$filter": f"contains(subject,'{filter_text}')"}
|
||||
|
||||
|
|
@ -300,26 +320,31 @@ class MethodOutlook(MethodBase):
|
|||
"""
|
||||
GENERAL:
|
||||
- Purpose: Read emails and metadata from a mailbox folder.
|
||||
- Input requirements: connectionReference (required); optional folder, limit, filter, expectedDocumentFormats.
|
||||
- Input requirements: connectionReference (required); optional folder, limit, filter, outputMimeType.
|
||||
- Output format: JSON with emails and metadata.
|
||||
|
||||
Parameters:
|
||||
- connectionReference (str, required): Microsoft connection label.
|
||||
- folder (str, optional): Folder to read from. Default: Inbox.
|
||||
- limit (int, optional): Maximum items to return. Default: 10.
|
||||
- limit (int, optional): Maximum items to return. Must be > 0. Default: 1000.
|
||||
- filter (str, optional): Sender, query operators, or subject text.
|
||||
- expectedDocumentFormats (list, optional): Output format preferences.
|
||||
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
|
||||
"""
|
||||
try:
|
||||
connectionReference = parameters.get("connectionReference")
|
||||
folder = parameters.get("folder", "Inbox")
|
||||
limit = parameters.get("limit", 10)
|
||||
filter = parameters.get("filter")
|
||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
||||
outputMimeType = parameters.get("outputMimeType", "application/json")
|
||||
|
||||
if not connectionReference:
|
||||
return ActionResult.isFailure(error="Connection reference is required")
|
||||
|
||||
# Validate limit parameter
|
||||
if limit <= 0:
|
||||
limit = 1000
|
||||
logger.warning(f"Invalid limit value ({limit}), using default value 1000")
|
||||
|
||||
# Validate filter parameter if provided
|
||||
if filter:
|
||||
# Remove any potentially dangerous characters that could break the filter
|
||||
|
|
@ -343,8 +368,16 @@ class MethodOutlook(MethodBase):
|
|||
"Content-Type": "application/json"
|
||||
}
|
||||
|
||||
# Build the API request
|
||||
# Get the folder ID for the specified folder
|
||||
folder_id = self._getFolderId(folder, connection)
|
||||
|
||||
if folder_id:
|
||||
# Build the API request with folder ID
|
||||
api_url = f"{graph_url}/me/mailFolders/{folder_id}/messages"
|
||||
else:
|
||||
# Fallback: use folder name directly (for well-known folders like "Inbox")
|
||||
api_url = f"{graph_url}/me/mailFolders/{folder}/messages"
|
||||
logger.warning(f"Could not find folder ID for '{folder}', using folder name directly")
|
||||
params = {
|
||||
"$top": limit,
|
||||
"$orderby": "receivedDateTime desc"
|
||||
|
|
@ -380,7 +413,11 @@ class MethodOutlook(MethodBase):
|
|||
"count": len(emails_data.get("value", [])),
|
||||
"folder": folder,
|
||||
"filter": filter,
|
||||
"apiResponse": emails_data
|
||||
"apiMetadata": {
|
||||
"@odata.context": emails_data.get("@odata.context"),
|
||||
"@odata.count": emails_data.get("@odata.count"),
|
||||
"@odata.nextLink": emails_data.get("@odata.nextLink")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -405,18 +442,15 @@ class MethodOutlook(MethodBase):
|
|||
logger.error(f"Error reading emails from Microsoft Graph API: {str(e)}")
|
||||
return ActionResult.isFailure(error=f"Failed to read emails: {str(e)}")
|
||||
|
||||
# Determine output format based on expected formats
|
||||
output_extension = ".json" # Default
|
||||
output_mime_type = "application/json" # Default
|
||||
|
||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
||||
# Use the first expected format
|
||||
expected_format = expectedDocumentFormats[0]
|
||||
output_extension = expected_format.get("extension", ".json")
|
||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
||||
else:
|
||||
logger.info("No expected format specified, using default .json format")
|
||||
# Determine output format based on MIME type
|
||||
mime_type_mapping = {
|
||||
"application/json": ".json",
|
||||
"text/plain": ".txt",
|
||||
"text/csv": ".csv"
|
||||
}
|
||||
output_extension = mime_type_mapping.get(outputMimeType, ".json")
|
||||
output_mime_type = outputMimeType
|
||||
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
|
||||
|
||||
|
||||
|
||||
|
|
@ -454,27 +488,32 @@ class MethodOutlook(MethodBase):
|
|||
"""
|
||||
GENERAL:
|
||||
- Purpose: Search emails by query and return matching items with metadata.
|
||||
- Input requirements: connectionReference (required); query (required); optional folder, limit, expectedDocumentFormats.
|
||||
- Input requirements: connectionReference (required); query (required); optional folder, limit, outputMimeType.
|
||||
- Output format: JSON with search results and metadata.
|
||||
|
||||
Parameters:
|
||||
- connectionReference (str, required): Microsoft connection label.
|
||||
- query (str, required): Search expression.
|
||||
- folder (str, optional): Folder scope or All. Default: All.
|
||||
- limit (int, optional): Maximum items to return. Default: 20.
|
||||
- expectedDocumentFormats (list, optional): Output format preferences.
|
||||
- limit (int, optional): Maximum items to return. Must be > 0. Default: 1000.
|
||||
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
|
||||
"""
|
||||
try:
|
||||
connectionReference = parameters.get("connectionReference")
|
||||
query = parameters.get("query")
|
||||
folder = parameters.get("folder", "All")
|
||||
limit = parameters.get("limit", 20)
|
||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
||||
limit = parameters.get("limit", 1000)
|
||||
outputMimeType = parameters.get("outputMimeType", "application/json")
|
||||
|
||||
# Validate parameters
|
||||
if not connectionReference:
|
||||
return ActionResult.isFailure(error="Connection reference is required")
|
||||
|
||||
# Validate limit parameter
|
||||
if limit <= 0:
|
||||
limit = 1000
|
||||
logger.warning(f"Invalid limit value ({limit}), using default value 1000")
|
||||
|
||||
if not query or not query.strip():
|
||||
return ActionResult.isFailure(error="Search query is required and cannot be empty")
|
||||
|
||||
|
|
@ -488,12 +527,15 @@ class MethodOutlook(MethodBase):
|
|||
# Validate limit
|
||||
try:
|
||||
limit = int(limit)
|
||||
if limit <= 0 or limit > 1000: # Microsoft Graph API has limits
|
||||
limit = 20
|
||||
logger.warning(f"Limit {limit} is out of range, using default value 20")
|
||||
if limit <= 0:
|
||||
limit = 1000
|
||||
logger.warning(f"Invalid limit value (<=0), using default value 1000")
|
||||
elif limit > 1000: # Microsoft Graph API has limits
|
||||
limit = 1000
|
||||
logger.warning(f"Limit {limit} exceeds maximum (1000), using 1000")
|
||||
except (ValueError, TypeError):
|
||||
limit = 20
|
||||
logger.warning(f"Invalid limit value, using default value 20")
|
||||
limit = 1000
|
||||
logger.warning(f"Invalid limit value, using default value 1000")
|
||||
|
||||
# Get Microsoft connection
|
||||
connection = self._getMicrosoftConnection(connectionReference)
|
||||
|
|
@ -509,9 +551,18 @@ class MethodOutlook(MethodBase):
|
|||
"Content-Type": "application/json"
|
||||
}
|
||||
|
||||
# Get the folder ID for the specified folder if needed
|
||||
folder_id = None
|
||||
if folder and folder.lower() != "all":
|
||||
folder_id = self._getFolderId(folder, connection)
|
||||
if folder_id:
|
||||
logger.debug(f"Found folder ID for '{folder}': {folder_id}")
|
||||
else:
|
||||
logger.warning(f"Could not find folder ID for '{folder}', using folder name directly")
|
||||
|
||||
# Build the search API request
|
||||
api_url = f"{graph_url}/me/messages"
|
||||
params = self._buildSearchParameters(query, folder, limit)
|
||||
params = self._buildSearchParameters(query, folder_id or folder, limit)
|
||||
|
||||
# Log search parameters for debugging
|
||||
logger.debug(f"Search query: '{query}'")
|
||||
|
|
@ -605,7 +656,11 @@ class MethodOutlook(MethodBase):
|
|||
"count": len(emails),
|
||||
"folder": folder,
|
||||
"limit": limit,
|
||||
"apiResponse": search_data,
|
||||
"apiMetadata": {
|
||||
"@odata.context": search_data.get("@odata.context"),
|
||||
"@odata.count": search_data.get("@odata.count"),
|
||||
"@odata.nextLink": search_data.get("@odata.nextLink")
|
||||
},
|
||||
"searchParams": params
|
||||
}
|
||||
|
||||
|
|
@ -618,18 +673,15 @@ class MethodOutlook(MethodBase):
|
|||
logger.error(f"Error searching emails via Microsoft Graph API: {str(e)}")
|
||||
return ActionResult.isFailure(error=f"Failed to search emails: {str(e)}")
|
||||
|
||||
# Determine output format based on expected formats
|
||||
output_extension = ".json" # Default
|
||||
output_mime_type = "application/json" # Default
|
||||
|
||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
||||
# Use the first expected format
|
||||
expected_format = expectedDocumentFormats[0]
|
||||
output_extension = expected_format.get("extension", ".json")
|
||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
||||
else:
|
||||
logger.info("No expected format specified, using default .json format")
|
||||
# Determine output format based on MIME type
|
||||
mime_type_mapping = {
|
||||
"application/json": ".json",
|
||||
"text/plain": ".txt",
|
||||
"text/csv": ".csv"
|
||||
}
|
||||
output_extension = mime_type_mapping.get(outputMimeType, ".json")
|
||||
output_mime_type = outputMimeType
|
||||
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
|
||||
|
||||
|
||||
|
||||
|
|
@ -664,20 +716,20 @@ class MethodOutlook(MethodBase):
|
|||
"""
|
||||
GENERAL:
|
||||
- Purpose: List draft emails from a folder.
|
||||
- Input requirements: connectionReference (required); optional folder, limit, expectedDocumentFormats.
|
||||
- Input requirements: connectionReference (required); optional folder, limit, outputMimeType.
|
||||
- Output format: JSON with draft items and metadata.
|
||||
|
||||
Parameters:
|
||||
- connectionReference (str, required): Microsoft connection label.
|
||||
- folder (str, optional): Drafts folder to list. Default: Drafts.
|
||||
- limit (int, optional): Maximum items to return. Default: 20.
|
||||
- expectedDocumentFormats (list, optional): Output format preferences.
|
||||
- limit (int, optional): Maximum items to return. Must be > 0. Default: 1000.
|
||||
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
|
||||
"""
|
||||
try:
|
||||
connectionReference = parameters.get("connectionReference")
|
||||
folder = parameters.get("folder", "Drafts")
|
||||
limit = parameters.get("limit", 20)
|
||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
||||
limit = parameters.get("limit", 1000)
|
||||
outputMimeType = parameters.get("outputMimeType", "application/json")
|
||||
|
||||
if not connectionReference:
|
||||
return ActionResult.isFailure(error="Connection reference is required")
|
||||
|
|
@ -745,18 +797,15 @@ class MethodOutlook(MethodBase):
|
|||
logger.error(f"Error listing drafts via Microsoft Graph API: {str(e)}")
|
||||
return ActionResult.isFailure(error=f"Failed to list drafts: {str(e)}")
|
||||
|
||||
# Determine output format based on expected formats
|
||||
output_extension = ".json" # Default
|
||||
output_mime_type = "application/json" # Default
|
||||
|
||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
||||
# Use the first expected format
|
||||
expected_format = expectedDocumentFormats[0]
|
||||
output_extension = expected_format.get("extension", ".json")
|
||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
||||
else:
|
||||
logger.info("No expected format specified, using default .json format")
|
||||
# Determine output format based on MIME type
|
||||
mime_type_mapping = {
|
||||
"application/json": ".json",
|
||||
"text/plain": ".txt",
|
||||
"text/csv": ".csv"
|
||||
}
|
||||
output_extension = mime_type_mapping.get(outputMimeType, ".json")
|
||||
output_mime_type = outputMimeType
|
||||
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
|
||||
|
||||
|
||||
|
||||
|
|
@ -790,18 +839,18 @@ class MethodOutlook(MethodBase):
|
|||
"""
|
||||
GENERAL:
|
||||
- Purpose: Find draft emails across folders.
|
||||
- Input requirements: connectionReference (required); optional limit, expectedDocumentFormats.
|
||||
- Input requirements: connectionReference (required); optional limit, outputMimeType.
|
||||
- Output format: JSON with drafts and metadata.
|
||||
|
||||
Parameters:
|
||||
- connectionReference (str, required): Microsoft connection label.
|
||||
- limit (int, optional): Maximum items to return. Default: 50.
|
||||
- expectedDocumentFormats (list, optional): Output format preferences.
|
||||
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
|
||||
"""
|
||||
try:
|
||||
connectionReference = parameters.get("connectionReference")
|
||||
limit = parameters.get("limit", 50)
|
||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
||||
outputMimeType = parameters.get("outputMimeType", "application/json")
|
||||
|
||||
if not connectionReference:
|
||||
return ActionResult.isFailure(error="Connection reference is required")
|
||||
|
|
@ -859,18 +908,15 @@ class MethodOutlook(MethodBase):
|
|||
logger.error(f"Error finding drafts via Microsoft Graph API: {str(e)}")
|
||||
return ActionResult.isFailure(error=f"Failed to find drafts: {str(e)}")
|
||||
|
||||
# Determine output format based on expected formats
|
||||
output_extension = ".json" # Default
|
||||
output_mime_type = "application/json" # Default
|
||||
|
||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
||||
# Use the first expected format
|
||||
expected_format = expectedDocumentFormats[0]
|
||||
output_extension = expected_format.get("extension", ".json")
|
||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
||||
else:
|
||||
logger.info("No expected format specified, using default .json format")
|
||||
# Determine output format based on MIME type
|
||||
mime_type_mapping = {
|
||||
"application/json": ".json",
|
||||
"text/plain": ".txt",
|
||||
"text/csv": ".csv"
|
||||
}
|
||||
output_extension = mime_type_mapping.get(outputMimeType, ".json")
|
||||
output_mime_type = outputMimeType
|
||||
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
|
||||
|
||||
|
||||
|
||||
|
|
@ -930,18 +976,18 @@ class MethodOutlook(MethodBase):
|
|||
"""
|
||||
GENERAL:
|
||||
- Purpose: Check contents of the Drafts folder.
|
||||
- Input requirements: connectionReference (required); optional limit, expectedDocumentFormats.
|
||||
- Input requirements: connectionReference (required); optional limit, outputMimeType.
|
||||
- Output format: JSON with drafts and metadata.
|
||||
|
||||
Parameters:
|
||||
- connectionReference (str, required): Microsoft connection label.
|
||||
- limit (int, optional): Maximum items to return. Default: 20.
|
||||
- expectedDocumentFormats (list, optional): Output format preferences.
|
||||
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
|
||||
"""
|
||||
try:
|
||||
connectionReference = parameters.get("connectionReference")
|
||||
limit = parameters.get("limit", 20)
|
||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
||||
outputMimeType = parameters.get("outputMimeType", "application/json")
|
||||
|
||||
if not connectionReference:
|
||||
return ActionResult.isFailure(error="Connection reference is required")
|
||||
|
|
@ -1003,18 +1049,15 @@ class MethodOutlook(MethodBase):
|
|||
logger.error(f"Error checking Drafts folder via Microsoft Graph API: {str(e)}")
|
||||
return ActionResult.isFailure(error=f"Failed to check Drafts folder: {str(e)}")
|
||||
|
||||
# Determine output format based on expected formats
|
||||
output_extension = ".json" # Default
|
||||
output_mime_type = "application/json" # Default
|
||||
|
||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
||||
# Use the first expected format
|
||||
expected_format = expectedDocumentFormats[0]
|
||||
output_extension = expected_format.get("extension", ".json")
|
||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
||||
else:
|
||||
logger.info("No expected format specified, using default .json format")
|
||||
# Determine output format based on MIME type
|
||||
mime_type_mapping = {
|
||||
"application/json": ".json",
|
||||
"text/plain": ".txt",
|
||||
"text/csv": ".csv"
|
||||
}
|
||||
output_extension = mime_type_mapping.get(outputMimeType, ".json")
|
||||
output_mime_type = outputMimeType
|
||||
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -931,7 +931,8 @@ class MethodSharepoint(MethodBase):
|
|||
return ActionResult.isFailure(error="pathQuery must start with '/' and include site name with syntax /site:<Site Display Name>/... e.g. /site:KM LayerFinance/Documents/Work")
|
||||
|
||||
# Check if pathQuery contains search terms (words without proper path structure)
|
||||
if not pathQuery.startswith('/site:') and not pathQuery.startswith('/Documents') and not pathQuery.startswith('/Shared Documents'):
|
||||
valid_path_prefixes = ['/site:', '/Documents', '/documents', '/Shared Documents', '/shared documents']
|
||||
if not any(pathQuery.startswith(prefix) for prefix in valid_path_prefixes):
|
||||
return ActionResult.isFailure(error=f"Invalid pathQuery '{pathQuery}'. This appears to be search terms, not a valid SharePoint path. Use findDocumentPath action first to search for folders, then use the returned folder path as pathQuery.")
|
||||
|
||||
# For pathQuery, we need to discover sites to find the specific one
|
||||
|
|
@ -1627,7 +1628,8 @@ class MethodSharepoint(MethodBase):
|
|||
return ActionResult.isFailure(error="pathQuery must start with '/' and include site name with syntax /site:<Site Display Name>/... e.g. /site:KM LayerFinance/Documents/Work")
|
||||
|
||||
# Check if pathQuery contains search terms (words without proper path structure)
|
||||
if not pathQuery.startswith('/site:') and not pathQuery.startswith('/Documents') and not pathQuery.startswith('/Shared Documents'):
|
||||
valid_path_prefixes = ['/site:', '/Documents', '/documents', '/Shared Documents', '/shared documents']
|
||||
if not any(pathQuery.startswith(prefix) for prefix in valid_path_prefixes):
|
||||
return ActionResult.isFailure(error=f"Invalid pathQuery '{pathQuery}'. This appears to be search terms, not a valid SharePoint path. Use findDocumentPath action first to search for folders, then use the returned folder path as pathQuery.")
|
||||
|
||||
# For pathQuery, we need to discover sites to find the specific one
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
# adaptive module for React mode
|
||||
# Provides adaptive learning capabilities
|
||||
|
||||
from .intentAnalyzer import IntentAnalyzer, DataType, ExpectedFormat
|
||||
from .intentAnalyzer import IntentAnalyzer
|
||||
from .contentValidator import ContentValidator
|
||||
from .learningEngine import LearningEngine
|
||||
from .progressTracker import ProgressTracker
|
||||
|
||||
__all__ = ['IntentAnalyzer', 'ContentValidator', 'LearningEngine', 'ProgressTracker', 'DataType', 'ExpectedFormat']
|
||||
__all__ = ['IntentAnalyzer', 'ContentValidator', 'LearningEngine', 'ProgressTracker']
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
# contentValidator.py
|
||||
# Content validation for adaptive React mode
|
||||
|
||||
import re
|
||||
import logging
|
||||
import json
|
||||
import re
|
||||
from typing import List, Dict, Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -10,34 +11,14 @@ logger = logging.getLogger(__name__)
|
|||
class ContentValidator:
|
||||
"""Validates delivered content against user intent"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
def __init__(self, services=None):
|
||||
self.services = services
|
||||
|
||||
def validateContent(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Validates delivered content against user intent"""
|
||||
async def validateContent(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Validates delivered content against user intent using AI"""
|
||||
try:
|
||||
validationDetails = []
|
||||
|
||||
for doc in documents:
|
||||
content = self._extractContent(doc)
|
||||
detail = self._validateSingleDocument(content, doc, intent)
|
||||
validationDetails.append(detail)
|
||||
|
||||
# Calculate overall success
|
||||
overallSuccess = all(detail.get("successCriteriaMet", [False]) for detail in validationDetails)
|
||||
|
||||
# Calculate quality score
|
||||
qualityScore = self._calculateQualityScore(validationDetails)
|
||||
|
||||
# Generate improvement suggestions
|
||||
improvementSuggestions = self._generateImprovementSuggestions(validationDetails, intent)
|
||||
|
||||
return {
|
||||
"overallSuccess": overallSuccess,
|
||||
"qualityScore": qualityScore,
|
||||
"validationDetails": validationDetails,
|
||||
"improvementSuggestions": improvementSuggestions
|
||||
}
|
||||
# Use AI for comprehensive validation
|
||||
return await self._validateWithAI(documents, intent)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error validating content: {str(e)}")
|
||||
|
|
@ -56,253 +37,236 @@ class ContentValidator:
|
|||
except Exception:
|
||||
return ""
|
||||
|
||||
def _validateSingleDocument(self, content: str, doc: Any, intent: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Validates a single document against intent"""
|
||||
# Check data type match
|
||||
dataTypeMatch = self._checkDataTypeMatch(content, intent.get("dataType", "unknown"))
|
||||
|
||||
# Check format match
|
||||
formatMatch = self._checkFormatMatch(content, intent.get("expectedFormat", "unknown"))
|
||||
|
||||
# Calculate quality score
|
||||
qualityScore = self._calculateDocumentQualityScore(content, intent)
|
||||
|
||||
# Check success criteria
|
||||
successCriteriaMet = self._checkSuccessCriteria(content, intent)
|
||||
|
||||
# Identify specific issues
|
||||
specificIssues = self._identifySpecificIssues(content, intent)
|
||||
|
||||
# Generate improvement suggestions
|
||||
improvementSuggestions = self._generateDocumentImprovementSuggestions(content, intent)
|
||||
|
||||
return {
|
||||
"documentName": getattr(doc, 'documentName', 'Unknown'),
|
||||
"dataTypeMatch": dataTypeMatch,
|
||||
"formatMatch": formatMatch,
|
||||
"qualityScore": qualityScore,
|
||||
"successCriteriaMet": successCriteriaMet,
|
||||
"specificIssues": specificIssues,
|
||||
"improvementSuggestions": improvementSuggestions
|
||||
}
|
||||
|
||||
def _checkDataTypeMatch(self, content: str, dataType: str) -> bool:
|
||||
"""Checks if content matches the expected data type"""
|
||||
if dataType == "numbers":
|
||||
return self._containsNumbers(content)
|
||||
elif dataType == "text":
|
||||
return self._containsText(content)
|
||||
elif dataType == "documents":
|
||||
return self._containsDocumentContent(content)
|
||||
elif dataType == "analysis":
|
||||
return self._containsAnalysis(content)
|
||||
elif dataType == "code":
|
||||
return self._containsCode(content)
|
||||
else:
|
||||
return True # Unknown type, assume match
|
||||
|
||||
def _containsNumbers(self, content: str) -> bool:
|
||||
"""Checks if content contains actual numbers (not code)"""
|
||||
# Look for actual numbers in the content
|
||||
numbers = re.findall(r'\b\d+\b', content)
|
||||
|
||||
# Check if it's code (contains function definitions, etc.)
|
||||
isCode = any(keyword in content.lower() for keyword in [
|
||||
'def ', 'function', 'import ', 'class ', 'for ', 'while ', 'if ',
|
||||
'return', 'print(', 'console.log', 'public ', 'private '
|
||||
])
|
||||
|
||||
# If it's code, it doesn't contain actual numbers
|
||||
if isCode:
|
||||
return False
|
||||
|
||||
# If it has numbers and it's not code, it contains actual numbers
|
||||
return len(numbers) > 0
|
||||
|
||||
def _containsText(self, content: str) -> bool:
|
||||
"""Checks if content contains readable text"""
|
||||
# Remove numbers and special characters
|
||||
textContent = re.sub(r'[^\w\s]', '', content)
|
||||
words = textContent.split()
|
||||
|
||||
# Check if there are enough words to be considered text
|
||||
return len(words) > 5
|
||||
|
||||
def _containsDocumentContent(self, content: str) -> bool:
|
||||
"""Checks if content is suitable for document creation"""
|
||||
# Check for structured content
|
||||
hasStructure = any(indicator in content for indicator in [
|
||||
'\n', '\t', '|', '-', '*', '1.', '2.', '•', '◦'
|
||||
])
|
||||
|
||||
# Check for meaningful content
|
||||
hasMeaningfulContent = len(content.strip()) > 50
|
||||
|
||||
return hasStructure and hasMeaningfulContent
|
||||
|
||||
def _containsAnalysis(self, content: str) -> bool:
|
||||
"""Checks if content contains analysis"""
|
||||
analysisIndicators = [
|
||||
'analysis', 'findings', 'conclusion', 'summary', 'insights',
|
||||
'trends', 'patterns', 'comparison', 'evaluation', 'assessment'
|
||||
]
|
||||
|
||||
contentLower = content.lower()
|
||||
return any(indicator in contentLower for indicator in analysisIndicators)
|
||||
|
||||
def _containsCode(self, content: str) -> bool:
|
||||
"""Checks if content contains code"""
|
||||
codeIndicators = [
|
||||
'def ', 'function', 'import ', 'class ', 'for ', 'while ', 'if ',
|
||||
'return', 'print(', 'console.log', 'public ', 'private ', 'void ',
|
||||
'int ', 'string ', 'var ', 'let ', 'const '
|
||||
]
|
||||
|
||||
contentLower = content.lower()
|
||||
return any(indicator in contentLower for indicator in codeIndicators)
|
||||
|
||||
def _checkFormatMatch(self, content: str, expectedFormat: str) -> bool:
|
||||
"""Checks if content matches expected format"""
|
||||
if expectedFormat == "raw_data":
|
||||
# Raw data should be simple, not heavily formatted
|
||||
return not any(indicator in content for indicator in [
|
||||
'<html>', '<div>', '<table>', '## ', '### ', '**', '__'
|
||||
])
|
||||
elif expectedFormat == "formatted":
|
||||
# Formatted content should have structure
|
||||
return any(indicator in content for indicator in [
|
||||
'\n', '\t', '|', '-', '*', '1.', '2.', '•'
|
||||
])
|
||||
elif expectedFormat == "structured":
|
||||
# Structured content should have clear organization
|
||||
return any(indicator in content for indicator in [
|
||||
'{', '}', '[', ']', '|', '\t', ' '
|
||||
])
|
||||
else:
|
||||
return True # Unknown format, assume match
|
||||
|
||||
def _checkSuccessCriteria(self, content: str, intent: Dict[str, Any]) -> List[bool]:
|
||||
"""Checks if content meets success criteria"""
|
||||
criteriaMet = []
|
||||
successCriteria = intent.get("successCriteria", [])
|
||||
|
||||
for criterion in successCriteria:
|
||||
if 'prime numbers' in criterion.lower():
|
||||
# Check if content contains actual prime numbers, not code
|
||||
hasNumbers = bool(re.search(r'\b\d+\b', content))
|
||||
isNotCode = not any(keyword in content.lower() for keyword in [
|
||||
'def ', 'function', 'import ', 'class '
|
||||
])
|
||||
criteriaMet.append(hasNumbers and isNotCode)
|
||||
elif 'document' in criterion.lower():
|
||||
# Check if content is suitable for document creation
|
||||
hasStructure = any(indicator in content for indicator in [
|
||||
'\n', '\t', '|', '-', '*', '1.', '2.'
|
||||
])
|
||||
criteriaMet.append(hasStructure)
|
||||
elif 'format' in criterion.lower():
|
||||
# Check if content is properly formatted
|
||||
hasFormatting = any(indicator in content for indicator in [
|
||||
'\n', '\t', '|', '-', '*', '1.', '2.', '•'
|
||||
])
|
||||
criteriaMet.append(hasFormatting)
|
||||
else:
|
||||
# Generic check - content should not be empty
|
||||
criteriaMet.append(len(content.strip()) > 0)
|
||||
|
||||
return criteriaMet
|
||||
|
||||
def _calculateDocumentQualityScore(self, content: str, intent: Dict[str, Any]) -> float:
    """Score a single document between 0.0 and 1.0 against *intent*.

    Weighting: 0.2 for non-empty content, 0.3 for matching data type,
    0.2 for matching format, and up to 0.3 proportional to the share of
    success criteria met.
    """
    score = 0.2 if content.strip() else 0.0

    if self._checkDataTypeMatch(content, intent.get("dataType", "unknown")):
        score += 0.3
    if self._checkFormatMatch(content, intent.get("expectedFormat", "unknown")):
        score += 0.2

    criteria = self._checkSuccessCriteria(content, intent)
    if criteria:
        # Proportional credit for the fraction of criteria satisfied.
        score += 0.3 * (sum(criteria) / len(criteria))

    # Clamp defensively; the weights already cap at 1.0.
    return min(score, 1.0)
|
||||
|
||||
def _calculateQualityScore(self, validationDetails: List[Dict[str, Any]]) -> float:
|
||||
"""Calculates overall quality score from validation details"""
|
||||
if not validationDetails:
|
||||
return 0.0
|
||||
|
||||
totalScore = sum(detail.get("qualityScore", 0) for detail in validationDetails)
|
||||
return totalScore / len(validationDetails)
|
||||
|
||||
def _identifySpecificIssues(self, content: str, intent: Dict[str, Any]) -> List[str]:
    """List concrete problems found in *content* relative to *intent*."""
    issues: List[str] = []

    # Numbers were requested but the payload looks like source code.
    if intent.get("dataType") == "numbers" and self._containsCode(content):
        issues.append("Content contains code instead of actual numbers")

    # Raw data was requested but markup/markdown styling is present.
    markupMarkers = ('<html>', '## ', '**')
    if intent.get("expectedFormat") == "raw_data" and any(m in content for m in markupMarkers):
        issues.append("Content is formatted when raw data was requested")

    if not content.strip():
        issues.append("Content is empty")

    return issues
|
||||
|
||||
def _generateDocumentImprovementSuggestions(self, content: str, intent: Dict[str, Any]) -> List[str]:
    """Suggest fixes for a single document that missed the user's intent."""
    suggestions: List[str] = []
    dataType = intent.get("dataType", "unknown")
    expectedFormat = intent.get("expectedFormat", "unknown")

    # Numbers were requested but code was delivered instead.
    if dataType == "numbers" and self._containsCode(content):
        suggestions.append("Deliver actual numbers, not code to generate them")

    # Raw data was requested but markup styling crept in.
    if expectedFormat == "raw_data" and any(m in content for m in ('<html>', '## ')):
        suggestions.append("Provide raw data without formatting")

    if not content.strip():
        suggestions.append("Provide actual content")

    return suggestions
|
||||
|
||||
def _generateImprovementSuggestions(self, validationDetails: List[Dict[str, Any]],
|
||||
intent: Dict[str, Any]) -> List[str]:
|
||||
"""Generates improvement suggestions based on validation results"""
|
||||
suggestions = []
|
||||
|
||||
# Check for common issues
|
||||
if not any(detail.get("dataTypeMatch", False) for detail in validationDetails):
|
||||
dataType = intent.get("dataType", "unknown")
|
||||
suggestions.append(f"Content should contain {dataType} data, not code or other formats")
|
||||
|
||||
if not any(detail.get("formatMatch", False) for detail in validationDetails):
|
||||
expectedFormat = intent.get("expectedFormat", "unknown")
|
||||
suggestions.append(f"Content should be in {expectedFormat} format")
|
||||
|
||||
# Add specific suggestions from validation details
|
||||
for detail in validationDetails:
|
||||
suggestions.extend(detail.get("improvementSuggestions", []))
|
||||
|
||||
return list(set(suggestions)) # Remove duplicates
|
||||
|
||||
def _createFailedValidationResult(self, error: str) -> Dict[str, Any]:
|
||||
"""Creates a failed validation result"""
|
||||
return {
|
||||
"overallSuccess": False,
|
||||
"qualityScore": 0.0,
|
||||
"validationDetails": [],
|
||||
"improvementSuggestions": [f"Validation failed: {error}"]
|
||||
"improvementSuggestions": [f"NEXT STEP: Fix validation error - {error}. Check system logs for more details and retry the operation."]
|
||||
}
|
||||
|
||||
def _isValidJsonResponse(self, response: str) -> bool:
|
||||
"""Checks if response contains valid JSON structure"""
|
||||
try:
|
||||
import re
|
||||
# Look for JSON with expected structure
|
||||
json_match = re.search(r'\{[^{}]*"overallSuccess"[^{}]*\}', response, re.DOTALL)
|
||||
if json_match:
|
||||
json.loads(json_match.group(0))
|
||||
return True
|
||||
return False
|
||||
except:
|
||||
return False
|
||||
|
||||
def _extractFallbackValidationResult(self, response: str) -> Dict[str, Any]:
    """Extracts validation result from malformed AI response.

    Best-effort salvage: regex-scans the raw text for the key fields of the
    expected JSON schema; when "overallSuccess" cannot be found, infers it
    from sentiment keywords in the text. Returns None when even the salvage
    attempt raises.
    """
    try:
        import re

        # Extract key values using regex patterns.
        # Each of these is either a re.Match or None at this point.
        overall_success = re.search(r'"overallSuccess"\s*:\s*(true|false)', response, re.IGNORECASE)
        quality_score = re.search(r'"qualityScore"\s*:\s*([0-9.]+)', response)
        gap_analysis = re.search(r'"gapAnalysis"\s*:\s*"([^"]*)"', response)

        # Determine overall success from context if not found.
        # NOTE: in this branch overall_success is rebound to a plain bool,
        # so the variable holds either a bool or a re.Match below.
        if not overall_success:
            # Look for positive/negative indicators in the text
            if any(word in response.lower() for word in ['success', 'complete', 'fulfilled', 'satisfied']):
                overall_success = True
            elif any(word in response.lower() for word in ['failed', 'incomplete', 'missing', 'error']):
                overall_success = False
            else:
                # No signal either way: be conservative and report failure.
                overall_success = False

        return {
            # Normalise the bool-or-Match shape established above.
            "overallSuccess": overall_success if isinstance(overall_success, bool) else (overall_success.group(1).lower() == 'true' if overall_success else False),
            # Neutral 0.5 default when no score could be scraped.
            "qualityScore": float(quality_score.group(1)) if quality_score else 0.5,
            "validationDetails": [{
                "documentName": "AI Validation (Fallback)",
                "gapAnalysis": gap_analysis.group(1) if gap_analysis else "Unable to parse detailed analysis",
                "successCriteriaMet": [False]  # Conservative fallback
            }],
            "improvementSuggestions": ["NEXT STEP: AI response was malformed - retry the operation for better results"]
        }
    except Exception as e:
        logger.error(f"Fallback extraction failed: {str(e)}")
        # None signals to the caller that salvage was impossible.
        return None
|
||||
|
||||
async def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
    """AI-based comprehensive validation - single main function.

    Sends the extracted document contents plus the analysed intent to the
    AI service and asks for a strict-JSON validation verdict. Retries once
    with a more explicit JSON-only prompt, then tries several JSON
    extraction strategies and finally a regex-based fallback before giving
    up with a failed-validation result.
    """
    try:
        # Guard: without an AI service there is nothing to validate with.
        if not hasattr(self, 'services') or not self.services or not hasattr(self.services, 'ai'):
            return self._createFailedValidationResult("AI service not available")

        # Extract content from all documents
        documentContents = []
        for doc in documents:
            content = self._extractContent(doc)
            documentContents.append({
                "name": getattr(doc, 'documentName', 'Unknown'),
                "content": content[:2000]  # Limit content for AI processing
            })

        # Create comprehensive AI validation prompt
        validationPrompt = f"""
You are a comprehensive task completion validator. Analyze if the delivered content fulfills the user's request.

USER REQUEST: {intent.get('primaryGoal', 'Unknown')}
EXPECTED DATA TYPE: {intent.get('dataType', 'unknown')}
EXPECTED FORMAT: {intent.get('expectedFormat', 'unknown')}
SUCCESS CRITERIA: {intent.get('successCriteria', [])}

DELIVERED CONTENT:
{json.dumps(documentContents, indent=2)}

Perform comprehensive validation:
1. Check if content matches expected data type
2. Check if content matches expected format
3. Verify success criteria are met
4. Assess overall quality and completeness
5. Identify specific gaps and issues
6. Provide actionable next steps

CRITICAL: You MUST respond with ONLY the JSON object below. NO TEXT ANALYSIS. NO EXPLANATIONS. NO OTHER CONTENT.

RESPOND WITH THIS EXACT JSON FORMAT:

{{
"overallSuccess": false,
"qualityScore": 0.5,
"dataTypeMatch": false,
"formatMatch": false,
"successCriteriaMet": [false, false],
"gapAnalysis": "Content does not match expected format and lacks required elements",
"improvementSuggestions": ["NEXT STEP: Create proper content in expected format", "NEXT STEP: Ensure all success criteria are met"],
"validationDetails": [
{{
"documentName": "Content Validation",
"issues": ["Format mismatch", "Missing required elements"],
"suggestions": ["NEXT STEP: Fix format", "NEXT STEP: Add missing elements"]
}}
]
}}
"""

        # Call AI service for validation
        from modules.datamodels.datamodelAi import AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL

        response = await self.services.ai.callAi(
            prompt=validationPrompt,
            documents=None,
            options=request_options
        )

        # If first attempt fails, try with more explicit prompt
        if response and not self._isValidJsonResponse(response):
            logger.debug("First AI validation attempt failed, retrying with explicit JSON-only prompt")
            explicitPrompt = f"""
VALIDATE AND RETURN JSON ONLY - NO TEXT ANALYSIS

Request: {intent.get('primaryGoal', 'Unknown')}
Data Type: {intent.get('dataType', 'unknown')}
Format: {intent.get('expectedFormat', 'unknown')}
Criteria: {intent.get('successCriteria', [])}

Content: {json.dumps(documentContents, indent=2)}

RESPOND WITH THIS EXACT JSON FORMAT - NO OTHER TEXT:

{{
"overallSuccess": false,
"qualityScore": 0.3,
"dataTypeMatch": false,
"formatMatch": false,
"successCriteriaMet": [false, false],
"gapAnalysis": "Content does not match expected format and lacks required elements",
"improvementSuggestions": ["NEXT STEP: Create proper content in expected format", "NEXT STEP: Ensure all success criteria are met"],
"validationDetails": [
{{
"documentName": "Content Validation",
"issues": ["Format mismatch", "Missing required elements"],
"suggestions": ["NEXT STEP: Fix format", "NEXT STEP: Add missing elements"]
}}
]
}}
"""
            response = await self.services.ai.callAi(
                prompt=explicitPrompt,
                documents=None,
                options=request_options
            )

        if not response or not response.strip():
            logger.warning("AI validation returned empty response")
            return self._createFailedValidationResult("AI validation failed - empty response")

        # Clean and extract JSON from response
        result = response.strip()
        logger.debug(f"AI validation response length: {len(result)}")

        # Try to find JSON in the response with multiple strategies
        import re

        # Strategy 1: Look for JSON in markdown code blocks
        json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', result, re.DOTALL)
        if json_match:
            result = json_match.group(1)
            logger.debug(f"Extracted JSON from markdown code block: {result[:200]}...")
        else:
            # Strategy 2: Look for JSON object with proper structure
            json_match = re.search(r'\{[^{}]*"overallSuccess"[^{}]*\}', result, re.DOTALL)
            if not json_match:
                # Strategy 3: Look for any JSON object
                json_match = re.search(r'\{.*\}', result, re.DOTALL)

            if json_match:
                result = json_match.group(0)
                logger.debug(f"Extracted JSON directly: {result[:200]}...")
            else:
                logger.debug(f"No JSON found in AI response, trying fallback extraction: {result[:200]}...")
                logger.debug(f"Full AI response: {result}")

                # Try fallback extraction for text responses
                fallback_result = self._extractFallbackValidationResult(result)
                if fallback_result:
                    logger.info("Using fallback text extraction for validation")
                    return fallback_result

                logger.warning("All AI validation attempts failed - no JSON found and fallback extraction failed")
                return self._createFailedValidationResult("AI validation failed - no JSON in response")

        try:
            aiResult = json.loads(result)
            logger.info("AI validation JSON parsed successfully")

            # Normalise the AI reply into the shape the callers expect,
            # synthesising a single validation detail when none is given.
            return {
                "overallSuccess": aiResult.get("overallSuccess", False),
                "qualityScore": aiResult.get("qualityScore", 0.0),
                "validationDetails": aiResult.get("validationDetails", [{
                    "documentName": "AI Validation",
                    "gapAnalysis": aiResult.get("gapAnalysis", ""),
                    "successCriteriaMet": aiResult.get("successCriteriaMet", [False])
                }]),
                "improvementSuggestions": aiResult.get("improvementSuggestions", [])
            }

        except json.JSONDecodeError as json_error:
            logger.warning(f"All AI validation attempts failed - invalid JSON: {str(json_error)}")
            logger.debug(f"JSON content: {result}")

            # Try to extract key information from malformed response
            fallbackResult = self._extractFallbackValidationResult(result)
            if fallbackResult:
                logger.info("Using fallback validation result from malformed JSON")
                return fallbackResult

            return self._createFailedValidationResult(f"AI validation failed - invalid JSON: {str(json_error)}")

        # NOTE(review): unreachable — every path through the try/except above
        # already returns; looks like a leftover from the merge. Left as-is
        # in this documentation-only pass.
        return self._createFailedValidationResult("AI validation failed - no response")

    except Exception as e:
        logger.error(f"AI validation failed: {str(e)}")
        return self._createFailedValidationResult(f"AI validation error: {str(e)}")
|
||||
|
|
@ -1,228 +1,156 @@
|
|||
# intentAnalyzer.py
|
||||
# Intent analysis for adaptive React mode
|
||||
# Intent analysis for adaptive React mode - AI-based, language-agnostic
|
||||
|
||||
import re
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, Any, List
|
||||
from enum import Enum
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DataType(Enum):
    """Category of payload a user request is asking for.

    The string values are exchanged verbatim with the AI prompts and with
    intent dictionaries (e.g. intent["dataType"]).
    """
    NUMBERS = "numbers"      # numeric output (counts, sequences, ...)
    TEXT = "text"            # free-form prose
    DOCUMENTS = "documents"  # generated files / reports
    ANALYSIS = "analysis"    # findings / insights style output
    CODE = "code"            # source code
    UNKNOWN = "unknown"      # could not be classified
|
||||
|
||||
class ExpectedFormat(Enum):
    """Presentation style the user expects for the delivered content.

    The string values are exchanged verbatim with the AI prompts and with
    intent dictionaries (e.g. intent["expectedFormat"]).
    """
    RAW_DATA = "raw_data"      # plain, unformatted values
    FORMATTED = "formatted"    # human-readable layout (lists, tables, ...)
    STRUCTURED = "structured"  # machine-readable structure (JSON/CSV-like)
    VISUAL = "visual"          # charts or other visual output
    UNKNOWN = "unknown"        # could not be determined
|
||||
|
||||
class IntentAnalyzer:
|
||||
"""Analyzes user intent to understand what they actually want"""
|
||||
"""Analyzes user intent using AI - language-agnostic and generic"""
|
||||
|
||||
def __init__(self):
|
||||
self.dataTypePatterns = {
|
||||
DataType.NUMBERS: [
|
||||
r'\b(numbers?|digits?|count|list|sequence)\b',
|
||||
r'\b(prime|fibonacci|random|even|odd)\s+(numbers?)\b',
|
||||
r'\b(calculate|compute|generate)\s+(numbers?)\b',
|
||||
r'\b(first|last)\s+\d+\s+(numbers?)\b'
|
||||
],
|
||||
DataType.TEXT: [
|
||||
r'\b(text|content|words?|sentences?|paragraphs?)\b',
|
||||
r'\b(write|create|generate)\s+(text|content)\b',
|
||||
r'\b(summary|description|explanation)\b',
|
||||
r'\b(article|essay|report)\b'
|
||||
],
|
||||
DataType.DOCUMENTS: [
|
||||
r'\b(document|file|report|pdf|word|excel)\b',
|
||||
r'\b(create|generate|make)\s+(document|file|report)\b',
|
||||
r'\b(format|structure|organize)\s+(document)\b',
|
||||
r'\b(presentation|slides?)\b'
|
||||
],
|
||||
DataType.ANALYSIS: [
|
||||
r'\b(analyze|analysis|examine|study|evaluate)\b',
|
||||
r'\b(insights?|findings?|results?)\b',
|
||||
r'\b(compare|contrast|evaluate)\b',
|
||||
r'\b(trends?|patterns?)\b'
|
||||
],
|
||||
DataType.CODE: [
|
||||
r'\b(code|program|script|algorithm|function)\b',
|
||||
r'\b(write|create|develop)\s+(code|program|script)\b',
|
||||
r'\b(implement|build|construct)\b',
|
||||
r'\b(debug|fix|optimize)\s+(code)\b'
|
||||
]
|
||||
}
|
||||
def __init__(self, services=None):
|
||||
self.services = services
|
||||
|
||||
self.formatPatterns = {
|
||||
ExpectedFormat.RAW_DATA: [
|
||||
r'\b(raw|plain|simple|basic)\b',
|
||||
r'\b(numbers?|data|list)\b(?!\s+(in|as|with))',
|
||||
r'\b(just|only)\s+(numbers?|data)\b'
|
||||
],
|
||||
ExpectedFormat.FORMATTED: [
|
||||
r'\b(formatted|structured|organized|presented)\b',
|
||||
r'\b(table|chart|graph|visual)\b',
|
||||
r'\b(pretty|nice|clean)\s+(format|presentation)\b',
|
||||
r'\b(professional|polished)\b'
|
||||
],
|
||||
ExpectedFormat.STRUCTURED: [
|
||||
r'\b(json|xml|csv|structured)\b',
|
||||
r'\b(organized|categorized|grouped)\b',
|
||||
r'\b(systematic|methodical)\b',
|
||||
r'\b(database|spreadsheet)\b'
|
||||
]
|
||||
}
|
||||
|
||||
def analyzeUserIntent(self, userPrompt: str, context: Any) -> Dict[str, Any]:
|
||||
"""Analyzes user intent from prompt and context"""
|
||||
async def analyzeUserIntent(self, userPrompt: str, context: Any) -> Dict[str, Any]:
|
||||
"""Analyzes user intent from prompt and context using AI"""
|
||||
try:
|
||||
# Extract primary goal
|
||||
primaryGoal = self._extractPrimaryGoal(userPrompt)
|
||||
# Use AI to analyze intent
|
||||
aiAnalysis = await self._analyzeIntentWithAI(userPrompt, context)
|
||||
if aiAnalysis:
|
||||
return aiAnalysis
|
||||
|
||||
# Classify data type
|
||||
dataType = self._classifyDataType(userPrompt)
|
||||
|
||||
# Determine expected format
|
||||
expectedFormat = self._determineExpectedFormat(userPrompt)
|
||||
|
||||
# Assess quality requirements
|
||||
qualityRequirements = self._assessQualityRequirements(userPrompt, context)
|
||||
|
||||
# Extract success criteria
|
||||
successCriteria = self._extractSuccessCriteria(userPrompt, context)
|
||||
|
||||
# Calculate confidence score
|
||||
confidenceScore = self._calculateConfidenceScore(dataType, expectedFormat, successCriteria)
|
||||
|
||||
return {
|
||||
"primaryGoal": primaryGoal,
|
||||
"dataType": dataType.value,
|
||||
"expectedFormat": expectedFormat.value,
|
||||
"qualityRequirements": qualityRequirements,
|
||||
"successCriteria": successCriteria,
|
||||
"confidenceScore": confidenceScore
|
||||
}
|
||||
# Fallback to basic analysis if AI fails
|
||||
return self._createBasicIntentAnalysis(userPrompt)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error analyzing user intent: {str(e)}")
|
||||
return self._createDefaultIntentAnalysis(userPrompt)
|
||||
|
||||
def _extractPrimaryGoal(self, userPrompt: str) -> str:
|
||||
"""Extracts the primary goal from user prompt"""
|
||||
# Simple extraction - can be enhanced
|
||||
return userPrompt.strip()
|
||||
async def _analyzeIntentWithAI(self, userPrompt: str, context: Any) -> Dict[str, Any]:
|
||||
"""Uses AI to analyze user intent - language-agnostic"""
|
||||
try:
|
||||
if not self.services or not hasattr(self.services, 'ai'):
|
||||
return None
|
||||
|
||||
def _classifyDataType(self, userPrompt: str) -> DataType:
|
||||
"""Classifies the type of data the user wants"""
|
||||
promptLower = userPrompt.lower()
|
||||
# Create AI analysis prompt
|
||||
analysisPrompt = f"""
|
||||
You are an intent analyzer. Analyze the user's request to understand what they want delivered.
|
||||
|
||||
for dataType, patterns in self.dataTypePatterns.items():
|
||||
for pattern in patterns:
|
||||
if re.search(pattern, promptLower):
|
||||
return dataType
|
||||
USER REQUEST: {userPrompt}
|
||||
|
||||
return DataType.UNKNOWN
|
||||
CONTEXT: {getattr(context.task_step, 'objective', '') if hasattr(context, 'task_step') and context.task_step else ''}
|
||||
|
||||
def _determineExpectedFormat(self, userPrompt: str) -> ExpectedFormat:
|
||||
"""Determines the expected format of the output"""
|
||||
promptLower = userPrompt.lower()
|
||||
Analyze the user's intent and determine:
|
||||
1. What type of data/content they want (numbers, text, documents, analysis, code, etc.)
|
||||
2. What format they expect (raw data, formatted, structured, visual, etc.)
|
||||
3. What quality requirements they have (accuracy, completeness, format)
|
||||
4. What specific success criteria define completion
|
||||
|
||||
for formatType, patterns in self.formatPatterns.items():
|
||||
for pattern in patterns:
|
||||
if re.search(pattern, promptLower):
|
||||
return formatType
|
||||
CRITICAL: Respond with ONLY the JSON object below. Do not include any explanatory text, analysis, or other content before or after the JSON.
|
||||
|
||||
return ExpectedFormat.UNKNOWN
|
||||
{{
|
||||
"primaryGoal": "The main objective the user wants to achieve",
|
||||
"dataType": "numbers|text|documents|analysis|code|unknown",
|
||||
"expectedFormat": "raw_data|formatted|structured|visual|unknown",
|
||||
"qualityRequirements": {{
|
||||
"accuracyThreshold": 0.0-1.0,
|
||||
"completenessThreshold": 0.0-1.0,
|
||||
"formatRequirement": "any|formatted|raw|structured"
|
||||
}},
|
||||
"successCriteria": ["specific criterion 1", "specific criterion 2"],
|
||||
"confidenceScore": 0.0-1.0
|
||||
}}
|
||||
"""
|
||||
|
||||
def _assessQualityRequirements(self, userPrompt: str, context: Any) -> Dict[str, Any]:
|
||||
"""Assesses quality requirements from prompt and context"""
|
||||
promptLower = userPrompt.lower()
|
||||
# Call AI service for analysis
|
||||
from modules.datamodels.datamodelAi import AiCallOptions, OperationType
|
||||
request_options = AiCallOptions()
|
||||
request_options.operationType = OperationType.GENERAL
|
||||
|
||||
# Check for accuracy requirements
|
||||
accuracyThreshold = 0.8
|
||||
if any(word in promptLower for word in ['exact', 'precise', 'accurate', 'correct']):
|
||||
accuracyThreshold = 0.95
|
||||
elif any(word in promptLower for word in ['approximate', 'rough', 'estimate']):
|
||||
accuracyThreshold = 0.7
|
||||
response = await self.services.ai.callAi(
|
||||
prompt=analysisPrompt,
|
||||
documents=None,
|
||||
options=request_options
|
||||
)
|
||||
|
||||
# Check for completeness requirements
|
||||
completenessThreshold = 0.8
|
||||
if any(word in promptLower for word in ['complete', 'full', 'comprehensive', 'all']):
|
||||
completenessThreshold = 0.95
|
||||
elif any(word in promptLower for word in ['summary', 'brief', 'overview']):
|
||||
completenessThreshold = 0.6
|
||||
# If first attempt fails, try with more explicit prompt
|
||||
if response and not self._isValidJsonResponse(response):
|
||||
logger.debug("First AI intent analysis attempt failed, retrying with explicit JSON-only prompt")
|
||||
explicitPrompt = f"""
|
||||
{analysisPrompt}
|
||||
|
||||
# Check for format requirements
|
||||
formatRequirement = "any"
|
||||
if any(word in promptLower for word in ['formatted', 'structured', 'organized']):
|
||||
formatRequirement = "formatted"
|
||||
elif any(word in promptLower for word in ['raw', 'plain', 'simple']):
|
||||
formatRequirement = "raw"
|
||||
IMPORTANT: You must respond with ONLY valid JSON. No explanations, no analysis, no text before or after. Just the JSON object.
|
||||
"""
|
||||
response = await self.services.ai.callAi(
|
||||
prompt=explicitPrompt,
|
||||
documents=None,
|
||||
options=request_options
|
||||
)
|
||||
|
||||
if not response or not response.strip():
|
||||
logger.warning("AI intent analysis returned empty response")
|
||||
return None
|
||||
|
||||
# Clean and extract JSON from response
|
||||
result = response.strip()
|
||||
logger.debug(f"AI intent analysis response length: {len(result)}")
|
||||
|
||||
# Try to find JSON in the response with multiple strategies
|
||||
import re
|
||||
|
||||
# Strategy 1: Look for JSON in markdown code blocks
|
||||
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', result, re.DOTALL)
|
||||
if json_match:
|
||||
result = json_match.group(1)
|
||||
logger.debug(f"Extracted JSON from markdown code block: {result[:200]}...")
|
||||
else:
|
||||
# Strategy 2: Look for JSON object with proper structure
|
||||
json_match = re.search(r'\{[^{}]*"primaryGoal"[^{}]*\}', result, re.DOTALL)
|
||||
if not json_match:
|
||||
# Strategy 3: Look for any JSON object
|
||||
json_match = re.search(r'\{.*\}', result, re.DOTALL)
|
||||
|
||||
if not json_match:
|
||||
logger.warning(f"All AI intent analysis attempts failed - no JSON found in response: {result[:200]}...")
|
||||
logger.debug(f"Full AI response: {result}")
|
||||
return None
|
||||
|
||||
result = json_match.group(0)
|
||||
logger.debug(f"Extracted JSON directly: {result[:200]}...")
|
||||
|
||||
try:
|
||||
aiResult = json.loads(result)
|
||||
logger.info("AI intent analysis JSON parsed successfully")
|
||||
return aiResult
|
||||
|
||||
except json.JSONDecodeError as json_error:
|
||||
logger.warning(f"All AI intent analysis attempts failed - invalid JSON: {str(json_error)}")
|
||||
logger.debug(f"JSON content: {result}")
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"AI intent analysis failed: {str(e)}")
|
||||
return None
|
||||
|
||||
def _createBasicIntentAnalysis(self, userPrompt: str) -> Dict[str, Any]:
|
||||
"""Creates basic intent analysis without AI"""
|
||||
return {
|
||||
"accuracyThreshold": accuracyThreshold,
|
||||
"completenessThreshold": completenessThreshold,
|
||||
"formatRequirement": formatRequirement
|
||||
"primaryGoal": userPrompt.strip(),
|
||||
"dataType": "unknown",
|
||||
"expectedFormat": "unknown",
|
||||
"qualityRequirements": {
|
||||
"accuracyThreshold": 0.8,
|
||||
"completenessThreshold": 0.8,
|
||||
"formatRequirement": "any"
|
||||
},
|
||||
"successCriteria": ["Delivers what the user requested"],
|
||||
"confidenceScore": 0.5
|
||||
}
|
||||
|
||||
def _extractSuccessCriteria(self, userPrompt: str, context: Any) -> List[str]:
|
||||
"""Extracts success criteria from prompt and context"""
|
||||
criteria = []
|
||||
promptLower = userPrompt.lower()
|
||||
|
||||
# Extract explicit criteria
|
||||
if 'first' in promptLower and 'numbers' in promptLower:
|
||||
criteria.append("Contains the first N numbers as requested")
|
||||
|
||||
if 'prime' in promptLower:
|
||||
criteria.append("Contains actual prime numbers, not code to generate them")
|
||||
|
||||
if 'document' in promptLower:
|
||||
criteria.append("Creates a properly formatted document")
|
||||
|
||||
if 'format' in promptLower:
|
||||
criteria.append("Content is properly formatted as requested")
|
||||
|
||||
# Add context-based criteria
|
||||
if hasattr(context, 'task_step') and context.task_step:
|
||||
taskObjective = context.task_step.objective.lower()
|
||||
if 'word' in taskObjective:
|
||||
criteria.append("Creates a Word document")
|
||||
if 'excel' in taskObjective:
|
||||
criteria.append("Creates an Excel spreadsheet")
|
||||
|
||||
return criteria if criteria else ["Delivers what the user requested"]
|
||||
|
||||
def _calculateConfidenceScore(self, dataType: DataType, expectedFormat: ExpectedFormat,
                             successCriteria: List[str]) -> float:
    """Combine classification results into a confidence score in [0, 1].

    Weights: 0.3 for a recognised data type, 0.2 for a recognised format,
    0.3 for having any success criteria, plus 0.2 for having several.
    """
    confidence = 0.0

    if dataType is not DataType.UNKNOWN:
        confidence += 0.3
    if expectedFormat is not ExpectedFormat.UNKNOWN:
        confidence += 0.2
    if successCriteria:
        confidence += 0.3
    # Multiple specific criteria raise confidence further.
    if len(successCriteria) > 1:
        confidence += 0.2

    return min(confidence, 1.0)
|
||||
|
||||
def _createDefaultIntentAnalysis(self, userPrompt: str) -> Dict[str, Any]:
|
||||
"""Creates a default intent analysis when analysis fails"""
|
||||
return {
|
||||
|
|
@ -237,3 +165,16 @@ class IntentAnalyzer:
|
|||
"successCriteria": ["Delivers what the user requested"],
|
||||
"confidenceScore": 0.1
|
||||
}
|
||||
|
||||
def _isValidJsonResponse(self, response: str) -> bool:
|
||||
"""Checks if response contains valid JSON structure"""
|
||||
try:
|
||||
import re
|
||||
# Look for JSON with expected structure
|
||||
json_match = re.search(r'\{[^{}]*"primaryGoal"[^{}]*\}', response, re.DOTALL)
|
||||
if json_match:
|
||||
json.loads(json_match.group(0))
|
||||
return True
|
||||
return False
|
||||
except:
|
||||
return False
|
||||
|
|
|
|||
|
|
@ -31,8 +31,8 @@ class ReactMode(BaseMode):
|
|||
def __init__(self, services, workflow):
|
||||
super().__init__(services, workflow)
|
||||
# Initialize adaptive components
|
||||
self.intentAnalyzer = IntentAnalyzer()
|
||||
self.contentValidator = ContentValidator()
|
||||
self.intentAnalyzer = IntentAnalyzer(services)
|
||||
self.contentValidator = ContentValidator(services)
|
||||
self.learningEngine = LearningEngine()
|
||||
self.progressTracker = ProgressTracker()
|
||||
self.currentIntent = None
|
||||
|
|
@ -49,13 +49,14 @@ class ReactMode(BaseMode):
|
|||
"""Execute task using React mode - iterative plan-act-observe-refine loop"""
|
||||
logger.info(f"=== STARTING TASK {taskIndex or '?'}: {taskStep.objective} ===")
|
||||
|
||||
# NEW: Analyze user intent with both original prompt and task objective
|
||||
# Get original user prompt from services (clean and reliable)
|
||||
# NEW: Analyze intents separately for proper validation vs task completion
|
||||
# Workflow-level intent from cleaned original user prompt
|
||||
original_prompt = self.services.currentUserPrompt if self.services and hasattr(self.services, 'currentUserPrompt') else taskStep.objective
|
||||
combined_context = f"Original request: {original_prompt}\n\nCurrent task: {taskStep.objective}"
|
||||
|
||||
self.currentIntent = self.intentAnalyzer.analyzeUserIntent(combined_context, context)
|
||||
logger.info(f"Intent analysis (original + task): {self.currentIntent}")
|
||||
self.workflowIntent = await self.intentAnalyzer.analyzeUserIntent(original_prompt, context)
|
||||
# Task-level intent from current task objective (used only for task-scoped checks)
|
||||
self.taskIntent = await self.intentAnalyzer.analyzeUserIntent(taskStep.objective, context)
|
||||
logger.info(f"Intent analysis — workflow: {self.workflowIntent}")
|
||||
logger.info(f"Intent analysis — task: {self.taskIntent}")
|
||||
|
||||
# NEW: Reset progress tracking for new task
|
||||
self.progressTracker.reset()
|
||||
|
|
@ -99,18 +100,18 @@ class ReactMode(BaseMode):
|
|||
# Attach deterministic label for clarity
|
||||
observation['resultLabel'] = result.resultLabel
|
||||
|
||||
# NEW: Add content validation
|
||||
if self.currentIntent and result.documents:
|
||||
validationResult = self.contentValidator.validateContent(result.documents, self.currentIntent)
|
||||
# NEW: Add content validation (against original cleaned user prompt / workflow intent)
|
||||
if getattr(self, 'workflowIntent', None) and result.documents:
|
||||
validationResult = await self.contentValidator.validateContent(result.documents, self.workflowIntent)
|
||||
observation['contentValidation'] = validationResult
|
||||
logger.info(f"Content validation: {validationResult['overallSuccess']} (quality: {validationResult['qualityScore']:.2f})")
|
||||
|
||||
# NEW: Learn from feedback
|
||||
feedback = self._collectFeedback(result, validationResult, self.currentIntent)
|
||||
self.learningEngine.learnFromFeedback(feedback, context, self.currentIntent)
|
||||
feedback = self._collectFeedback(result, validationResult, self.workflowIntent)
|
||||
self.learningEngine.learnFromFeedback(feedback, context, self.workflowIntent)
|
||||
|
||||
# NEW: Update progress
|
||||
self.progressTracker.updateProgress(result, validationResult, self.currentIntent)
|
||||
self.progressTracker.updateProgress(result, validationResult, self.workflowIntent)
|
||||
|
||||
decision = await self._refineDecide(context, observation)
|
||||
|
||||
|
|
@ -204,6 +205,11 @@ class ReactMode(BaseMode):
|
|||
selection = json.loads(response[jsonStart:jsonEnd])
|
||||
if 'action' not in selection or not isinstance(selection['action'], str):
|
||||
raise ValueError("Selection missing 'action' as string")
|
||||
|
||||
# Validate document references - prevent AI from inventing Message IDs
|
||||
if 'requiredInputDocuments' in selection:
|
||||
self._validateDocumentReferences(selection['requiredInputDocuments'], context)
|
||||
|
||||
# Enforce spec: Stage 1 must NOT include 'parameters'
|
||||
if 'parameters' in selection:
|
||||
# Remove to avoid accidental carryover
|
||||
|
|
@ -213,6 +219,38 @@ class ReactMode(BaseMode):
|
|||
selection['parameters'] = None
|
||||
return selection
|
||||
|
||||
def _validateDocumentReferences(self, document_refs: List[str], context: TaskContext) -> None:
|
||||
"""Validate that document references exist in the current workflow"""
|
||||
if not document_refs:
|
||||
return
|
||||
|
||||
# Get available documents from the current workflow
|
||||
try:
|
||||
available_docs = self.services.workflow.getAvailableDocuments(self.services.currentWorkflow)
|
||||
if not available_docs or available_docs == "No documents available":
|
||||
logger.warning("No documents available for validation")
|
||||
return
|
||||
|
||||
# Extract all valid references from available documents
|
||||
valid_refs = []
|
||||
for line in available_docs.split('\n'):
|
||||
if 'docList:' in line or 'docItem:' in line:
|
||||
# Extract reference from line like " - docList:msg_xxx:label" or " - docItem:xxx:filename with spaces"
|
||||
ref_match = re.search(r'(docList:[^\s]+|docItem:[^\s]+(?:\s+[^\s]+)*)', line)
|
||||
if ref_match:
|
||||
valid_refs.append(ref_match.group(1))
|
||||
|
||||
# Check if all provided references are valid
|
||||
for ref in document_refs:
|
||||
if ref not in valid_refs:
|
||||
logger.error(f"Invalid document reference: {ref}")
|
||||
logger.error(f"Available references: {valid_refs}")
|
||||
raise ValueError(f"Document reference '{ref}' not found in available documents. Use only exact references from AVAILABLE_DOCUMENTS_INDEX.")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error validating document references: {str(e)}")
|
||||
raise ValueError(f"Failed to validate document references: {str(e)}")
|
||||
|
||||
async def _actExecute(self, context: TaskContext, selection: Dict[str, Any], taskStep: TaskStep,
|
||||
workflow: ChatWorkflow, stepIndex: int) -> ActionResult:
|
||||
"""Act: request minimal parameters then execute selected action"""
|
||||
|
|
|
|||
|
|
@ -42,13 +42,24 @@ def extractUserPrompt(context: Any) -> str:
|
|||
Fallback to the task_step objective.
|
||||
"""
|
||||
try:
|
||||
# Prefer services.currentUserPrompt when accessible through context
|
||||
services = getattr(context, 'services', None)
|
||||
if services and getattr(services, 'currentUserPrompt', None):
|
||||
return services.currentUserPrompt
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Determine raw user prompt from services or task_step
|
||||
rawPrompt = None
|
||||
if services and getattr(services, 'currentUserPrompt', None):
|
||||
rawPrompt = services.currentUserPrompt
|
||||
elif hasattr(context, 'task_step') and context.task_step:
|
||||
rawPrompt = context.task_step.objective or 'No request specified'
|
||||
else:
|
||||
rawPrompt = 'No request specified'
|
||||
|
||||
# Prefer values computed at workflow start by WorkflowManager analyzer
|
||||
normalized = getattr(services, 'currentUserPromptNormalized', None) if services else None
|
||||
if normalized:
|
||||
return normalized
|
||||
return rawPrompt
|
||||
except Exception:
|
||||
# Robust fallback behavior
|
||||
if hasattr(context, 'task_step') and context.task_step:
|
||||
return context.task_step.objective or 'No request specified'
|
||||
return 'No request specified'
|
||||
|
|
@ -57,19 +68,11 @@ def extractWorkflowHistory(service: Any, context: Any) -> str:
|
|||
"""Extract workflow history from context. Maps to {{KEY:WORKFLOW_HISTORY}}
|
||||
Reverse-chronological, enriched with message summaries and document labels.
|
||||
"""
|
||||
# Prefer explicit workflow on context; else fall back to services.workflow
|
||||
workflow = None
|
||||
try:
|
||||
if hasattr(context, 'workflow') and context.workflow:
|
||||
workflow = context.workflow
|
||||
elif hasattr(service, 'workflow') and service.workflow:
|
||||
workflow = service.workflow
|
||||
except Exception:
|
||||
workflow = None
|
||||
|
||||
if workflow:
|
||||
history = getPreviousRoundContext(service, workflow)
|
||||
history = getPreviousRoundContext(service, service.currentWorkflow)
|
||||
return history or "No previous workflow rounds available"
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting workflow history: {str(e)}")
|
||||
return "No previous workflow rounds available"
|
||||
|
||||
def extractAvailableMethods(service: Any) -> str:
|
||||
|
|
@ -99,7 +102,15 @@ def extractAvailableMethods(service: Any) -> str:
|
|||
|
||||
def extractUserLanguage(service: Any) -> str:
|
||||
"""Extract user language from service. Maps to {{KEY:USER_LANGUAGE}}"""
|
||||
try:
|
||||
# Prefer detected language if available
|
||||
if service and getattr(service, 'currentUserLanguage', None):
|
||||
return service.currentUserLanguage
|
||||
return service.user.language if service and service.user else 'en'
|
||||
except Exception:
|
||||
return 'en'
|
||||
|
||||
# Normalization now happens centrally in WorkflowManager._sendFirstMessage; no AI call here.
|
||||
|
||||
|
||||
def _computeMessageSummary(msg) -> str:
|
||||
|
|
@ -371,9 +382,10 @@ def extractLatestRefinementFeedback(context: Any) -> str:
|
|||
def extractAvailableDocumentsSummary(service: Any, context: Any) -> str:
|
||||
"""Summary of available documents (count only)."""
|
||||
try:
|
||||
documents = service.workflow.getAvailableDocuments(context.workflow)
|
||||
documents = service.workflow.getAvailableDocuments(service.currentWorkflow)
|
||||
if documents and documents != "No documents available":
|
||||
doc_count = documents.count("docList:") + documents.count("docItem:")
|
||||
# Count only actual documents, not list labels
|
||||
doc_count = documents.count("docItem:")
|
||||
return f"{doc_count} documents available from previous tasks"
|
||||
return "No documents available"
|
||||
except Exception as e:
|
||||
|
|
@ -383,7 +395,7 @@ def extractAvailableDocumentsSummary(service: Any, context: Any) -> str:
|
|||
def extractAvailableDocumentsIndex(service: Any, context: Any) -> str:
|
||||
"""Index of available documents with detailed references for parameter generation."""
|
||||
try:
|
||||
return service.workflow.getAvailableDocuments(context.workflow)
|
||||
return service.workflow.getAvailableDocuments(service.currentWorkflow)
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting document index: {str(e)}")
|
||||
return "No documents available"
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ def generateReactPlanSelectionPrompt(services, context: Any) -> PromptBundle:
|
|||
PromptPlaceholder(label="AVAILABLE_CONNECTIONS_INDEX", content=extractAvailableConnectionsIndex(services), summaryAllowed=False),
|
||||
]
|
||||
|
||||
template = """Select exactly one action to advance the task.
|
||||
template = """Select exactly one next action to advance the task incrementally.
|
||||
|
||||
OBJECTIVE:
|
||||
{{KEY:USER_PROMPT}}
|
||||
|
|
@ -52,7 +52,11 @@ AVAILABLE_DOCUMENTS_INDEX:
|
|||
AVAILABLE_CONNECTIONS_INDEX:
|
||||
{{KEY:AVAILABLE_CONNECTIONS_INDEX}}
|
||||
|
||||
REPLY: Return ONLY a JSON object with the following structure (no comments, no extra text):
|
||||
REPLY: Return ONLY a JSON object with the following structure (no comments, no extra text). The chosen action MUST:
|
||||
- be the next logical incremental step toward fulfilling the objective
|
||||
- not attempt to complete the entire objective in one step
|
||||
- if producing files, target exactly one output format for this step
|
||||
- reference ONLY existing document IDs/labels from AVAILABLE_DOCUMENTS_INDEX
|
||||
{{
|
||||
"action": "method.action_name",
|
||||
"actionObjective": "...",
|
||||
|
|
@ -64,7 +68,7 @@ REPLY: Return ONLY a JSON object with the following structure (no comments, no e
|
|||
|
||||
EXAMPLE how to assign references from AVAILABLE_DOCUMENTS_INDEX and AVAILABLE_CONNECTIONS_INDEX:
|
||||
"requiredInputDocuments": ["docList:msg_47a7a578-e8f2-4ba8-ac66-0dbff40605e0:round8_task1_action1_results","docItem:5d8b7aee-b546-4487-b6a8-835c86f7b186:AI_Generated_Document_20251006-104256.docx"],
|
||||
"requiredConnection": "connection:msft:p.motsch@valueon.ch:1ae8b8e5-128b-49b8-b1cb-7c632669eeae",
|
||||
"requiredConnection": "connection:msft:p.motsch@valueon.ch",
|
||||
|
||||
RULES:
|
||||
1. Use EXACT action names from AVAILABLE_METHODS
|
||||
|
|
@ -72,7 +76,11 @@ RULES:
|
|||
3. parametersContext must be short and sufficient for Stage 2
|
||||
4. Return ONLY JSON - no markdown, no explanations
|
||||
5. For requiredInputDocuments, use ONLY exact references from AVAILABLE_DOCUMENTS_INDEX (docList:... or docItem:...)
|
||||
- DO NOT invent or modify Message IDs
|
||||
- DO NOT create new references
|
||||
- Copy references EXACTLY as shown in AVAILABLE_DOCUMENTS_INDEX
|
||||
6. For requiredConnection, use ONLY an exact label from AVAILABLE_CONNECTIONS_INDEX
|
||||
7. Plan incrementally: if the overall intent needs multiple output formats (e.g., CSV and HTML), choose one format in this step and leave the other(s) for subsequent steps
|
||||
"""
|
||||
|
||||
return PromptBundle(prompt=template, placeholders=placeholders)
|
||||
|
|
|
|||
|
|
@ -28,6 +28,8 @@ def generateTaskPlanningPrompt(services, context: Any) -> PromptBundle:
|
|||
|
||||
Break down user requests into logical, executable task steps.
|
||||
|
||||
**IMPORTANT**: If the user asks for ONE complete business objective, create ONLY ONE task that accomplishes the entire objective. Do NOT split it into multiple micro-tasks.
|
||||
|
||||
## 📋 Context
|
||||
|
||||
### User Request
|
||||
|
|
@ -46,12 +48,20 @@ Break down user requests into logical, executable task steps.
|
|||
- **ONE TOPIC PER TASK** - Each task should handle one complete business objective
|
||||
- **HIGH-LEVEL FOCUS** - Plan strategic outcomes, not implementation steps
|
||||
- **AVOID MICRO-TASKS** - Don't create separate tasks for each small action
|
||||
- **CRITICAL**: If the user asks for ONE thing (like "analyse document list and produce summary"), create ONLY ONE task that does the complete job
|
||||
|
||||
### Task Grouping Examples
|
||||
- **Research + Analysis + Report** → ONE task: "Web research report"
|
||||
- **Data Collection + Processing + Visualization** → ONE task: "Collect and present data"
|
||||
- **Document splitting** (analyze + extract + create files) → ONE task: "Split document into separate files"
|
||||
- **Different topics** (email + flowers) → SEPARATE tasks: "Send formal email..." + "Order flowers from Fleurop for delivery to 123 Main St, include card message"
|
||||
|
||||
### Common Single-Task Scenarios
|
||||
- **"Split document into sections"** → ONE task: "Split document into separate files"
|
||||
- **"Extract data and create report"** → ONE task: "Extract data and create report"
|
||||
- **"Analyze and summarize document"** → ONE task: "Analyze and summarize document"
|
||||
- **"Convert file to different format"** → ONE task: "Convert file to different format"
|
||||
|
||||
### Retry Handling
|
||||
- **If retry request**: Analyze previous rounds to understand what failed
|
||||
- **Learn from mistakes**: Improve the plan based on previous failures
|
||||
|
|
|
|||
|
|
@ -216,23 +216,23 @@ class WorkflowManager:
|
|||
# Update the message with documents in database
|
||||
self.services.workflow.updateMessage(message.id, {"documents": [doc.to_dict() for doc in documents]})
|
||||
|
||||
# Analyze the user's input to extract intent and offload bulky context into documents
|
||||
# Analyze the user's input to detect language, normalize request, extract intent, and offload bulky context into documents
|
||||
try:
|
||||
analyzerPrompt = (
|
||||
"You are an input analyzer. Split the user's message into:\n"
|
||||
"1) intent: the user's core request in one concise paragraph, normalized to the user's language.\n"
|
||||
"2) contextItems: supportive data to attach as separate documents if significantly larger than the intent. "
|
||||
"Include large literal data blocks, long lists/tables, code/JSON blocks, quoted transcripts, CSV fragments, or detailed specs. "
|
||||
"Keep URLs in the intent unless they include large pasted content.\n\n"
|
||||
"You are an input analyzer. From the user's message, perform ALL of the following in one pass:\n"
|
||||
"1) detectedLanguage: detect ISO 639-1 language code (e.g., de, en).\n"
|
||||
"2) normalizedRequest: full, explicit restatement of the user's request in the detected language; do NOT summarize; preserve ALL constraints and details.\n"
|
||||
"3) intent: concise single-paragraph core request in the detected language for high-level routing.\n"
|
||||
"4) contextItems: supportive data blocks to attach as separate documents if significantly larger than the intent (large literal content, long lists/tables, code/JSON blocks, transcripts, CSV fragments, detailed specs). Keep URLs in the intent unless they embed large pasted content.\n\n"
|
||||
"Rules:\n"
|
||||
"- If total content length (intent + data) is less than 10% of the model's max tokens, do not extract; "
|
||||
"return an empty contextItems and keep a compact, self-contained intent.\n"
|
||||
"- If content exceeds that, move bulky parts into contextItems, keeping the intent short and clear.\n"
|
||||
"- Preserve critical references (URLs, filenames) in the intent.\n"
|
||||
"- Normalize the intent to the detected language. If mixed-language, use the primary detected language and normalize.\n\n"
|
||||
"Output JSON only (no markdown):\n"
|
||||
"- If total content (intent + data) is < 10% of model max tokens, do not extract; return empty contextItems and keep intent compact and self-contained.\n"
|
||||
"- If content exceeds that threshold, move bulky parts into contextItems; keep intent short and clear.\n"
|
||||
"- Preserve critical references (URLs, filenames) in intent.\n"
|
||||
"- Normalize to the primary detected language if mixed-language.\n\n"
|
||||
"Return ONLY JSON (no markdown) with this shape:\n"
|
||||
"{\n"
|
||||
" \"detectedLanguage\": \"en\",\n"
|
||||
" \"detectedLanguage\": \"de|en|fr|it|...\",\n"
|
||||
" \"normalizedRequest\": \"Full explicit instruction in detected language\",\n"
|
||||
" \"intent\": \"Concise normalized request...\",\n"
|
||||
" \"contextItems\": [\n"
|
||||
" {\n"
|
||||
|
|
@ -249,6 +249,7 @@ class WorkflowManager:
|
|||
aiResponse = await self.services.ai.callAi(prompt=analyzerPrompt)
|
||||
|
||||
detectedLanguage = None
|
||||
normalizedRequest = None
|
||||
intentText = userInput.prompt
|
||||
contextItems = []
|
||||
|
||||
|
|
@ -260,6 +261,7 @@ class WorkflowManager:
|
|||
if jsonStart != -1 and jsonEnd > jsonStart:
|
||||
parsed = json.loads(aiResponse[jsonStart:jsonEnd])
|
||||
detectedLanguage = parsed.get('detectedLanguage') or None
|
||||
normalizedRequest = parsed.get('normalizedRequest') or None
|
||||
if parsed.get('intent'):
|
||||
intentText = parsed.get('intent')
|
||||
contextItems = parsed.get('contextItems') or []
|
||||
|
|
@ -269,7 +271,18 @@ class WorkflowManager:
|
|||
# Update services state
|
||||
if detectedLanguage and isinstance(detectedLanguage, str):
|
||||
self._setUserLanguage(detectedLanguage)
|
||||
try:
|
||||
setattr(self.services, 'currentUserLanguage', detectedLanguage)
|
||||
except Exception:
|
||||
pass
|
||||
self.services.currentUserPrompt = intentText or userInput.prompt
|
||||
try:
|
||||
if normalizedRequest:
|
||||
setattr(self.services, 'currentUserPromptNormalized', normalizedRequest)
|
||||
if contextItems is not None:
|
||||
setattr(self.services, 'currentUserContextItems', contextItems)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Telemetry (sizes and counts)
|
||||
try:
|
||||
|
|
@ -329,8 +342,6 @@ class WorkflowManager:
|
|||
if not message.documents:
|
||||
message.documents = []
|
||||
message.documents.extend(created_docs)
|
||||
# Ensure label is user_context for discoverability
|
||||
message.documentsLabel = context_label
|
||||
self.services.workflow.updateMessage(message.id, {
|
||||
"documents": [d.to_dict() for d in message.documents],
|
||||
"documentsLabel": context_label
|
||||
|
|
|
|||
|
|
@ -41,6 +41,7 @@ markdown
|
|||
## Web Scraping & HTTP
|
||||
beautifulsoup4==4.12.2 # Required for HTML/XML parsing
|
||||
requests==2.31.0
|
||||
requests-oauthlib==1.3.1 # Required for Google OAuth2Session
|
||||
chardet>=5.0.0 # Für Zeichensatzerkennung bei Webinhalten
|
||||
aiohttp>=3.8.0 # Required for SharePoint operations (async HTTP)
|
||||
selenium>=4.15.0 # Required for web automation and JavaScript-heavy pages
|
||||
|
|
|
|||
555
test_document_processing.py
Normal file
555
test_document_processing.py
Normal file
|
|
@ -0,0 +1,555 @@
|
|||
"""
|
||||
Test script for document processing and DOCX generation.
|
||||
Calls the main AI service directly to process PDF documents and generate DOCX summaries.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
import os
|
||||
import logging
|
||||
import base64
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Add the gateway module to the path
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'modules'))
|
||||
|
||||
from modules.datamodels.datamodelChat import ChatDocument
|
||||
from modules.datamodels.datamodelAi import EnhancedAiCallOptions
|
||||
from modules.services.serviceAi.mainServiceAi import AiService
|
||||
from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def process_documents_and_generate_summary():
|
||||
"""Process documents using the main AI service with intelligent chunk integration."""
|
||||
logger.info("🚀 Starting intelligent chunk integration test...")
|
||||
|
||||
# Find testdata directory
|
||||
testdata_path = Path("../wiki/poweron/testdata")
|
||||
if not testdata_path.exists():
|
||||
# Try relative to current directory
|
||||
testdata_path = Path("wiki/poweron/testdata")
|
||||
if not testdata_path.exists():
|
||||
# Try relative to parent directory
|
||||
testdata_path = Path("../wiki/poweron/testdata")
|
||||
if not testdata_path.exists():
|
||||
logger.error(f"❌ Testdata path not found. Tried:")
|
||||
logger.error(f" - ../wiki/poweron/testdata")
|
||||
logger.error(f" - wiki/poweron/testdata")
|
||||
logger.error(f" - ../wiki/poweron/testdata")
|
||||
logger.info("Please ensure the testdata folder exists with PDF documents")
|
||||
return False
|
||||
|
||||
# Find all supported document files
|
||||
supported_extensions = [
|
||||
# Document formats
|
||||
"*.pdf", "*.docx", "*.xlsx", "*.pptx", "*.ppt",
|
||||
# Image formats
|
||||
"*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp", "*.bmp", "*.tiff",
|
||||
# Text and code files
|
||||
"*.txt", "*.md", "*.log", "*.rtf", "*.tex", "*.rst", "*.adoc", "*.org", "*.pod",
|
||||
"*.java", "*.js", "*.jsx", "*.ts", "*.tsx", "*.py", "*.rb", "*.go", "*.rs", "*.cpp", "*.c", "*.h", "*.hpp", "*.cc", "*.cxx",
|
||||
"*.cs", "*.php", "*.swift", "*.kt", "*.scala", "*.clj", "*.hs", "*.ml", "*.fs", "*.vb", "*.dart", "*.r", "*.m", "*.pl", "*.sh",
|
||||
"*.html", "*.htm", "*.css", "*.scss", "*.sass", "*.less", "*.vue", "*.svelte",
|
||||
"*.config", "*.ini", "*.cfg", "*.conf", "*.properties", "*.yaml", "*.yml", "*.toml", "*.json", "*.xml",
|
||||
"*.bat", "*.ps1", "*.psm1", "*.psd1", "*.vbs", "*.wsf", "*.cmd", "*.com",
|
||||
"*.csv", "*.tsv", "*.tab", "*.dat", "*.data",
|
||||
"*.man", "*.1", "*.2", "*.3", "*.4", "*.5", "*.6", "*.7", "*.8", "*.9", "*.n", "*.l", "*.m", "*.r", "*.t", "*.x", "*.y", "*.z",
|
||||
"*.diff", "*.patch", "*.gitignore", "*.dockerignore", "*.editorconfig", "*.gitattributes",
|
||||
"*.env", "*.env.local", "*.env.development", "*.env.production", "*.env.test",
|
||||
"*.lock", "*.lockb", "*.lockfile", "*.pkg-lock", "*.yarn-lock"
|
||||
]
|
||||
document_files = []
|
||||
for ext in supported_extensions:
|
||||
document_files.extend(list(testdata_path.glob(ext)))
|
||||
|
||||
logger.info(f"Found {len(document_files)} document files in testdata:")
|
||||
for doc_file in document_files:
|
||||
logger.info(f" - {doc_file.name}")
|
||||
|
||||
if not document_files:
|
||||
logger.error("❌ No supported document files found in testdata folder")
|
||||
return False
|
||||
|
||||
try:
|
||||
# Mock the database interface to provide our file data BEFORE creating AI service
|
||||
class TestDbInterface:
|
||||
def __init__(self, file_data_map):
|
||||
self.file_data_map = file_data_map
|
||||
|
||||
def getFileData(self, file_id):
|
||||
logger.info(f"TestDbInterface.getFileData called with file_id: {file_id}")
|
||||
data = self.file_data_map.get(file_id)
|
||||
if data:
|
||||
logger.info(f"✅ Found file data for {file_id}: {len(data)} bytes")
|
||||
else:
|
||||
logger.warning(f"❌ No file data found for {file_id}")
|
||||
return data
|
||||
|
||||
# Create file data mapping
|
||||
file_data_map = {}
|
||||
for i, doc_file in enumerate(document_files):
|
||||
with open(doc_file, 'rb') as f:
|
||||
file_data_map[f"test_doc_{i+1}"] = f.read()
|
||||
logger.info(f"📁 Loaded {doc_file.name} as test_doc_{i+1}: {len(file_data_map[f'test_doc_{i+1}'])} bytes")
|
||||
|
||||
# Mock the database interface BEFORE creating AI service
|
||||
import modules.interfaces.interfaceDbComponentObjects as db_interface_module
|
||||
original_get_interface = db_interface_module.getInterface
|
||||
db_interface_module.getInterface = lambda: TestDbInterface(file_data_map)
|
||||
logger.info("🔧 Database interface mocked successfully")
|
||||
|
||||
# Create a mock service center with utils
|
||||
class MockServiceCenter:
|
||||
def __init__(self):
|
||||
self.utils = MockUtils()
|
||||
|
||||
class MockUtils:
|
||||
def debugLogToFile(self, message, label):
|
||||
logger.debug(f"[{label}] {message}")
|
||||
print(f"DEBUG [{label}]: {message}") # Also print to console for visibility
|
||||
|
||||
# Only write to debug file if debug logging is enabled (matching real implementation)
|
||||
debug_enabled = self.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
||||
if debug_enabled:
|
||||
try:
|
||||
import os
|
||||
from datetime import datetime, UTC
|
||||
debug_dir = self.configGet("APP_DEBUG_CHAT_WORKFLOW_DIR", "./test-chat")
|
||||
if not os.path.isabs(debug_dir):
|
||||
# If relative path, make it relative to the gateway directory
|
||||
gateway_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
||||
debug_dir = os.path.join(gateway_dir, debug_dir)
|
||||
|
||||
os.makedirs(debug_dir, exist_ok=True)
|
||||
debug_file = os.path.join(debug_dir, "debug_workflow.log")
|
||||
timestamp = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
|
||||
debug_entry = f"[{timestamp}] [{label}] {message}\n"
|
||||
with open(debug_file, "a", encoding="utf-8") as f:
|
||||
f.write(debug_entry)
|
||||
except Exception:
|
||||
pass # Don't fail on debug logging errors
|
||||
|
||||
def configGet(self, key, default):
|
||||
# Return debug settings
|
||||
if key == "APP_DEBUG_CHAT_WORKFLOW_ENABLED":
|
||||
return True
|
||||
elif key == "APP_DEBUG_CHAT_WORKFLOW_DIR":
|
||||
return "./test-chat"
|
||||
return default
|
||||
|
||||
mock_service_center = MockServiceCenter()
|
||||
|
||||
# Initialize the main AI service - let it handle everything
|
||||
logger.info("🔧 Initializing main AI service...")
|
||||
ai_service = await AiService.create(mock_service_center)
|
||||
|
||||
# Create test documents - the AI service will handle file access internally
|
||||
documents = []
|
||||
logger.info(f"📁 Found {len(document_files)} document files")
|
||||
for i, doc_file in enumerate(document_files):
|
||||
logger.info(f"📄 Processing file {i+1}/{len(document_files)}: {doc_file.name}")
|
||||
# Determine MIME type based on file extension
|
||||
mime_type = "application/octet-stream" # default
|
||||
if doc_file.suffix.lower() == '.pdf':
|
||||
mime_type = "application/pdf"
|
||||
elif doc_file.suffix.lower() in ['.jpg', '.jpeg']:
|
||||
mime_type = "image/jpeg"
|
||||
elif doc_file.suffix.lower() == '.png':
|
||||
mime_type = "image/png"
|
||||
elif doc_file.suffix.lower() == '.gif':
|
||||
mime_type = "image/gif"
|
||||
elif doc_file.suffix.lower() == '.docx':
|
||||
mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
elif doc_file.suffix.lower() == '.xlsx':
|
||||
mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
elif doc_file.suffix.lower() == '.pptx':
|
||||
mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
elif doc_file.suffix.lower() == '.ppt':
|
||||
mime_type = "application/vnd.ms-powerpoint"
|
||||
elif doc_file.suffix.lower() == '.html':
|
||||
mime_type = "text/html"
|
||||
elif doc_file.suffix.lower() == '.csv':
|
||||
mime_type = "text/csv"
|
||||
elif doc_file.suffix.lower() == '.json':
|
||||
mime_type = "application/json"
|
||||
elif doc_file.suffix.lower() in ['.txt', '.md']:
|
||||
mime_type = "text/plain"
|
||||
|
||||
chat_doc = ChatDocument(
|
||||
fileId=f"test_doc_{i+1}",
|
||||
messageId=f"test_message_{i+1}",
|
||||
fileName=doc_file.name,
|
||||
mimeType=mime_type,
|
||||
fileSize=doc_file.stat().st_size,
|
||||
roundNumber=1,
|
||||
taskNumber=1,
|
||||
actionNumber=1,
|
||||
actionId=f"test_action_{i+1}"
|
||||
)
|
||||
documents.append(chat_doc)
|
||||
logger.info(f"✅ Created ChatDocument: {chat_doc.fileName} ({chat_doc.mimeType}) - {chat_doc.fileSize} bytes")
|
||||
|
||||
logger.info(f"📄 Created {len(documents)} document objects")
|
||||
|
||||
# Create enhanced AI call options for intelligent chunked processing
|
||||
ai_options = EnhancedAiCallOptions(
|
||||
operationType="general",
|
||||
enableParallelProcessing=True,
|
||||
maxConcurrentChunks=5, # Increased for better testing
|
||||
preserveChunkMetadata=True,
|
||||
chunkSeparator="\n\n---\n\n"
|
||||
)
|
||||
|
||||
# Call the main AI service directly - let it handle everything including DOCX generation
|
||||
logger.info("🤖 Calling main AI service with intelligent merging...")
|
||||
|
||||
|
||||
# Run a single end-to-end test to avoid the loop issue
|
||||
logger.info("🧪 Running single end-to-end test...")
|
||||
|
||||
userPrompt = "Analyze the document containing mails for customer use cases. Can you create one file for each email in plain text format?"
|
||||
|
||||
# userPrompt = "Can you create one file for each section in the document"
|
||||
|
||||
# userPrompt = "Analyze these documents and create a fitting image for the content"
|
||||
|
||||
# userPrompt = "Extract the table from file and produce 2 lists in excel. one list with all entries, one list only with entries that are yellow highlighted."
|
||||
|
||||
# userPrompt = "Create a docx file containing a summary and the COMPLETE list from the pdf file, having one additional column with a 'x' marker for all items, which are yellow highlighted."
|
||||
|
||||
# userPrompt = "Create a docx file containing the combined documents in french language."
|
||||
|
||||
try:
|
||||
# Single AI call with DOCX generation
|
||||
ai_response = await ai_service.callAi(
|
||||
prompt=userPrompt,
|
||||
documents=documents,
|
||||
options=ai_options,
|
||||
outputFormat="txt",
|
||||
title="Kunden und Use Cases"
|
||||
)
|
||||
|
||||
logger.info(f"✅ End-to-end test completed successfully")
|
||||
logger.info(f"📊 Response type: {type(ai_response)}")
|
||||
logger.info(f"📊 Response length: {len(str(ai_response))} characters")
|
||||
|
||||
# Single test result
|
||||
test_results = [{
|
||||
"test_name": "End-to-End DOCX Generation",
|
||||
"success": True,
|
||||
"response_type": type(ai_response).__name__,
|
||||
"response_length": len(str(ai_response)),
|
||||
"response": ai_response
|
||||
}]
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ End-to-end test failed: {str(e)}")
|
||||
test_results = [{
|
||||
"test_name": "End-to-End DOCX Generation",
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"response": None
|
||||
}]
|
||||
|
||||
logger.info(f"🎯 Completed 1 end-to-end test")
|
||||
|
||||
# Process all test results and save outputs
|
||||
logger.info("📊 Processing test results...")
|
||||
|
||||
successful_tests = [r for r in test_results if r['success']]
|
||||
failed_tests = [r for r in test_results if not r['success']]
|
||||
|
||||
logger.info(f"✅ Successful tests: {len(successful_tests)}")
|
||||
logger.info(f"❌ Failed tests: {len(failed_tests)}")
|
||||
|
||||
# Display test results summary
|
||||
logger.info("=" * 80)
|
||||
logger.info("END-TO-END TEST RESULTS SUMMARY")
|
||||
logger.info("=" * 80)
|
||||
for i, result in enumerate(test_results, 1):
|
||||
status = "✅ PASS" if result['success'] else "❌ FAIL"
|
||||
logger.info(f"Test {i}: {result['test_name']} - {status}")
|
||||
if result['success']:
|
||||
logger.info(f" Response Type: {result['response_type']}")
|
||||
logger.info(f" Response Length: {result['response_length']} characters")
|
||||
else:
|
||||
logger.info(f" Error: {result['error']}")
|
||||
logger.info("=" * 80)
|
||||
|
||||
# Create output directory if it doesn't exist
|
||||
output_dir = Path("test-chat/unittestoutput")
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save all test results and generated files
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
|
||||
logger.info("💾 Saving test results and generated files...")
|
||||
|
||||
try:
|
||||
for i, result in enumerate(successful_tests, 1):
|
||||
test_name = result['test_name'].replace(' ', '_').lower()
|
||||
response = result['response']
|
||||
|
||||
logger.info(f"💾 Saving Test {i}: {result['test_name']}")
|
||||
|
||||
# Handle different response types
|
||||
if isinstance(response, dict):
|
||||
# Document generation response
|
||||
if 'documents' in response and response['documents']:
|
||||
logger.info(f"📄 Found {len(response['documents'])} documents in response")
|
||||
|
||||
for j, doc in enumerate(response['documents']):
|
||||
doc_name = doc.get('documentName', f'{test_name}_document_{j+1}')
|
||||
doc_data = doc.get('documentData', '')
|
||||
doc_mime = doc.get('mimeType', 'application/octet-stream')
|
||||
|
||||
logger.info(f"📄 Document {j+1}: {doc_name}")
|
||||
logger.info(f"📄 MIME Type: {doc_mime}")
|
||||
logger.info(f"📄 Data length: {len(doc_data)} characters")
|
||||
|
||||
# Determine file extension with better MIME type detection
|
||||
file_ext = '.bin' # Default fallback
|
||||
|
||||
if doc_mime:
|
||||
if 'docx' in doc_mime.lower() or 'wordprocessingml' in doc_mime.lower():
|
||||
file_ext = '.docx'
|
||||
elif 'pdf' in doc_mime.lower():
|
||||
file_ext = '.pdf'
|
||||
elif 'txt' in doc_mime.lower() or 'plain' in doc_mime.lower():
|
||||
file_ext = '.txt'
|
||||
elif 'html' in doc_mime.lower():
|
||||
file_ext = '.html'
|
||||
elif 'json' in doc_mime.lower():
|
||||
file_ext = '.json'
|
||||
elif 'csv' in doc_mime.lower():
|
||||
file_ext = '.csv'
|
||||
elif 'xlsx' in doc_mime.lower() or 'spreadsheetml' in doc_mime.lower():
|
||||
file_ext = '.xlsx'
|
||||
elif 'pptx' in doc_mime.lower() or 'presentationml' in doc_mime.lower():
|
||||
file_ext = '.pptx'
|
||||
elif 'markdown' in doc_mime.lower() or 'md' in doc_mime.lower():
|
||||
file_ext = '.md'
|
||||
elif 'png' in doc_mime.lower() or 'image' in doc_mime.lower():
|
||||
file_ext = '.png'
|
||||
elif 'jpg' in doc_mime.lower() or 'jpeg' in doc_mime.lower():
|
||||
file_ext = '.jpg'
|
||||
else:
|
||||
logger.warning(f"⚠️ Unknown MIME type: {doc_mime}, using .bin")
|
||||
|
||||
# Also check filename for hints
|
||||
if doc_name and '.' in doc_name:
|
||||
name_ext = '.' + doc_name.split('.')[-1].lower()
|
||||
if name_ext in ['.docx', '.pdf', '.txt', '.html', '.json', '.csv', '.xlsx', '.pptx', '.md', '.png', '.jpg', '.jpeg']:
|
||||
file_ext = name_ext
|
||||
logger.info(f"📄 Using extension from filename: {file_ext}")
|
||||
|
||||
logger.info(f"📄 Final file extension: {file_ext}")
|
||||
|
||||
# Save document
|
||||
output_path = output_dir / f"{test_name}_{timestamp}{file_ext}"
|
||||
|
||||
# Handle different content types
|
||||
if file_ext in ['.md', '.txt', '.html', '.json', '.csv']:
|
||||
# Text-based formats - save directly as text
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
f.write(doc_data)
|
||||
logger.info(f"✅ Document saved as text: {output_path} ({len(doc_data)} characters)")
|
||||
elif file_ext in ['.png', '.jpg', '.jpeg']:
|
||||
# Image formats - decode from base64
|
||||
try:
|
||||
doc_bytes = base64.b64decode(doc_data)
|
||||
with open(output_path, 'wb') as f:
|
||||
f.write(doc_bytes)
|
||||
logger.info(f"✅ Image saved: {output_path} ({len(doc_bytes)} bytes)")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to decode image as base64: {e}")
|
||||
# Save as text if base64 decoding fails
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
f.write(doc_data)
|
||||
logger.info(f"✅ Image saved as text (fallback): {output_path}")
|
||||
else:
|
||||
# Other binary formats - decode from base64
|
||||
try:
|
||||
doc_bytes = base64.b64decode(doc_data)
|
||||
with open(output_path, 'wb') as f:
|
||||
f.write(doc_bytes)
|
||||
logger.info(f"✅ Document saved as binary: {output_path} ({len(doc_bytes)} bytes)")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to decode document as base64: {e}")
|
||||
# Save as text if base64 decoding fails
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
f.write(doc_data)
|
||||
logger.info(f"✅ Document saved as text (fallback): {output_path}")
|
||||
|
||||
# Also save raw content as text
|
||||
content = response.get('content', '')
|
||||
if content:
|
||||
text_path = output_dir / f"{test_name}_content_{timestamp}.txt"
|
||||
with open(text_path, 'w', encoding='utf-8') as f:
|
||||
# Handle both string and dictionary content
|
||||
if isinstance(content, dict):
|
||||
import json
|
||||
f.write(json.dumps(content, indent=2, ensure_ascii=False))
|
||||
else:
|
||||
f.write(str(content))
|
||||
logger.info(f"✅ Content saved: {text_path}")
|
||||
|
||||
elif isinstance(response, str):
|
||||
# Text response
|
||||
text_path = output_dir / f"{test_name}_response_{timestamp}.txt"
|
||||
with open(text_path, 'w', encoding='utf-8') as f:
|
||||
f.write(response)
|
||||
logger.info(f"✅ Text response saved: {text_path}")
|
||||
|
||||
else:
|
||||
logger.warning(f"⚠️ Unknown response type for {result['test_name']}: {type(response)}")
|
||||
|
||||
# Save failed test details
|
||||
if failed_tests:
|
||||
error_path = output_dir / f"failed_tests_{timestamp}.txt"
|
||||
with open(error_path, 'w', encoding='utf-8') as f:
|
||||
f.write("# Failed Test Details\n\n")
|
||||
for i, result in enumerate(failed_tests, 1):
|
||||
f.write(f"## Test {i}: {result['test_name']}\n")
|
||||
f.write(f"**Error:** {result['error']}\n\n")
|
||||
logger.info(f"✅ Failed test details saved: {error_path}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Error saving test results: {str(e)}")
|
||||
return False
|
||||
|
||||
# Save comprehensive test report
|
||||
report_path = output_dir / f"end_to_end_test_report_{timestamp}.txt"
|
||||
with open(report_path, 'w', encoding='utf-8') as f:
|
||||
f.write(f"# End-to-End AI Service Test Report\n")
|
||||
f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
|
||||
|
||||
f.write(f"## Test Configuration\n")
|
||||
f.write(f"- Documents processed: {len(documents)}\n")
|
||||
f.write(f"- Processing method: Intelligent Token-Aware Merging\n")
|
||||
f.write(f"- Parallel processing: {ai_options.enableParallelProcessing}\n")
|
||||
f.write(f"- Max concurrent chunks: {ai_options.maxConcurrentChunks}\n")
|
||||
f.write(f"- Chunk metadata preserved: {ai_options.preserveChunkMetadata}\n")
|
||||
f.write(f"- Chunk separator: '{ai_options.chunkSeparator}'\n\n")
|
||||
|
||||
f.write(f"## Document Inventory\n")
|
||||
for i, doc in enumerate(documents, 1):
|
||||
f.write(f"{i}. **{doc.fileName}**\n")
|
||||
f.write(f" - MIME Type: {doc.mimeType}\n")
|
||||
f.write(f" - File Size: {doc.fileSize:,} bytes\n")
|
||||
f.write(f" - File ID: {doc.fileId}\n\n")
|
||||
|
||||
f.write(f"## Test Results Summary\n")
|
||||
f.write(f"- Total Tests: {len(test_results)}\n")
|
||||
f.write(f"- Successful: {len(successful_tests)}\n")
|
||||
f.write(f"- Failed: {len(failed_tests)}\n")
|
||||
f.write(f"- Success Rate: {len(successful_tests)/len(test_results)*100:.1f}%\n\n")
|
||||
|
||||
f.write(f"## Detailed Test Results\n")
|
||||
for i, result in enumerate(test_results, 1):
|
||||
f.write(f"### Test {i}: {result['test_name']}\n")
|
||||
f.write(f"**Status:** {'✅ PASS' if result['success'] else '❌ FAIL'}\n")
|
||||
|
||||
if result['success']:
|
||||
f.write(f"**Response Type:** {result['response_type']}\n")
|
||||
f.write(f"**Response Length:** {result['response_length']} characters\n")
|
||||
|
||||
# Show response preview
|
||||
response_preview = str(result['response'])[:500]
|
||||
f.write(f"**Response Preview:**\n```\n{response_preview}...\n```\n\n")
|
||||
else:
|
||||
f.write(f"**Error:** {result['error']}\n\n")
|
||||
|
||||
f.write(f"## Technical Implementation Details\n")
|
||||
f.write(f"This test validates the complete AI service pipeline:\n\n")
|
||||
f.write(f"### Tested Components:\n")
|
||||
f.write(f"- **Document Extraction**: PDF, DOCX, images, etc.\n")
|
||||
f.write(f"- **Intelligent Chunking**: Token-aware merging\n")
|
||||
f.write(f"- **Model Selection**: Automatic AI model choice\n")
|
||||
f.write(f"- **Parallel Processing**: Concurrent chunk processing\n")
|
||||
f.write(f"- **Document Generation**: DOCX, PDF, text output\n")
|
||||
f.write(f"- **Error Handling**: Graceful failure management\n\n")
|
||||
|
||||
f.write(f"### Performance Metrics:\n")
|
||||
f.write(f"- **Chunk Optimization**: Intelligent merging reduces AI calls\n")
|
||||
f.write(f"- **Processing Speed**: Parallel execution\n")
|
||||
f.write(f"- **Memory Efficiency**: Token-aware chunking\n")
|
||||
f.write(f"- **Output Quality**: Multiple format support\n\n")
|
||||
|
||||
f.write(f"## Generated Files\n")
|
||||
for i, result in enumerate(successful_tests, 1):
|
||||
test_name = result['test_name'].replace(' ', '_').lower()
|
||||
f.write(f"- **Test {i}**: {result['test_name']} → `{test_name}_*_{timestamp}.*`\n")
|
||||
|
||||
if failed_tests:
|
||||
f.write(f"- **Failed Tests**: `failed_tests_{timestamp}.txt`\n")
|
||||
|
||||
f.write(f"- **This Report**: `end_to_end_test_report_{timestamp}.txt`\n\n")
|
||||
|
||||
f.write(f"The end-to-end test successfully validates the complete AI service\n")
|
||||
f.write(f"pipeline from document input to formatted output generation.\n")
|
||||
|
||||
logger.info(f"✅ Comprehensive test report saved: {report_path}")
|
||||
|
||||
# Show debug file locations
|
||||
debug_files = []
|
||||
try:
|
||||
debug_dir = Path("test-chat")
|
||||
if debug_dir.exists():
|
||||
debug_files.extend(list(debug_dir.glob("*.log")))
|
||||
debug_files.extend(list(debug_dir.glob("ai/*.txt")))
|
||||
|
||||
if debug_files:
|
||||
logger.info("📁 Debug files created:")
|
||||
for debug_file in debug_files:
|
||||
logger.info(f" - {debug_file}")
|
||||
else:
|
||||
logger.info("📁 No debug files found in test-chat directory")
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not list debug files: {e}")
|
||||
|
||||
# Restore original database interface
|
||||
db_interface_module.getInterface = original_get_interface
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Error during document processing: {str(e)}")
|
||||
import traceback
|
||||
logger.error(f"Traceback: {traceback.format_exc()}")
|
||||
|
||||
# Restore original database interface in case of error
|
||||
try:
|
||||
db_interface_module.getInterface = original_get_interface
|
||||
except:
|
||||
pass
|
||||
|
||||
return False
|
||||
|
||||
async def main():
    """Main function to run the intelligent chunk integration test."""
    logger.info("🎯 Starting Intelligent Chunk Integration Test")
    logger.info("=" * 60)

    success = await process_documents_and_generate_summary()

    if success:
        # Report each validated capability of the pipeline.
        for message in (
            "🎉 Intelligent chunk integration test completed successfully!",
            "✅ Main AI service handled all processing internally",
            "✅ Intelligent token-aware merging activated",
            "✅ DOCX document generated directly by AI service",
            "✅ Detailed chunk integration analysis saved",
            "✅ Performance optimization achieved",
        ):
            logger.info(message)
    else:
        logger.error("❌ Test failed!")
        logger.error("Please check the error messages above for details")

    logger.info("=" * 60)
||||
if __name__ == "__main__":
    # Entry point: asyncio.run creates an event loop, drives the async
    # test driver to completion, and closes the loop afterwards.
    asyncio.run(main())
|
||||
422
tool_security_encrypt_all_env_files.py
Normal file
422
tool_security_encrypt_all_env_files.py
Normal file
|
|
@ -0,0 +1,422 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tool for encrypting all *_SECRET variables in all environment files.
|
||||
|
||||
This tool automatically processes all three environment files (dev, int, prod)
|
||||
and encrypts any unencrypted *_SECRET variables using the appropriate encryption
|
||||
keys for each environment.
|
||||
|
||||
Usage:
|
||||
# Encrypt all secrets in all environment files
|
||||
python tool_security_encrypt_all_env_files.py
|
||||
|
||||
# Dry run - show what would be changed without making changes
|
||||
python tool_security_encrypt_all_env_files.py --dry-run
|
||||
|
||||
# Skip backup creation
|
||||
python tool_security_encrypt_all_env_files.py --no-backup
|
||||
|
||||
# Process only specific environment files
|
||||
python tool_security_encrypt_all_env_files.py --files env_dev.env env_prod.env
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Any
|
||||
|
||||
# Add the modules directory to the Python path.
# The tool expects to run from the gateway directory, which contains a
# 'modules' folder with the shared configuration package; abort early
# with a clear message when launched from anywhere else.
current_dir = Path(__file__).parent
modules_dir = current_dir / 'modules'
if modules_dir.exists():
    sys.path.insert(0, str(modules_dir))
else:
    print(f"Error: Modules directory not found: {modules_dir}")
    print(f"Make sure you're running this script from the gateway directory")
    sys.exit(1)

# Import encryption functions (done after the sys.path tweak above so the
# local 'modules' package is importable).
try:
    from modules.shared.configuration import encrypt_value
except ImportError as e:
    print(f"Error: Could not import encryption functions from shared.configuration: {e}")
    print(f"Make sure you're running this script from the gateway directory")
    print(f"Modules directory: {modules_dir}")
    sys.exit(1)
|
||||
|
||||
def get_env_type_from_file(file_path: Path) -> str:
    """
    Read the APP_ENV_TYPE from the environment file.

    Args:
        file_path: Path to the environment file

    Returns:
        str: The environment type (dev, int, prod) or 'dev' as default
        when the file is missing, unreadable, or has no APP_ENV_TYPE entry.
    """
    if not file_path.exists():
        return 'dev'

    try:
        # Scan line by line for the first APP_ENV_TYPE assignment.
        for raw_line in file_path.read_text(encoding='utf-8').splitlines():
            entry = raw_line.strip()
            if entry.startswith('APP_ENV_TYPE') and '=' in entry:
                _, env_value = entry.split('=', 1)
                return env_value.strip().lower()
    except Exception as e:
        print(f"Warning: Could not read APP_ENV_TYPE from {file_path}: {e}")

    return 'dev'
|
||||
def is_any_encrypted_value(value: str) -> bool:
    """
    Check if a value has any encryption prefix (DEV_ENC:, INT_ENC:, PROD_ENC:, etc.).

    Args:
        value: The value to check

    Returns:
        bool: True if the value has any encryption prefix, False otherwise
        (also False for empty values and non-string inputs)
    """
    if not isinstance(value, str) or not value:
        return False

    # str.startswith accepts a tuple, so one call covers every
    # environment-specific encryption prefix.
    known_prefixes = (
        'DEV_ENC:',
        'INT_ENC:',
        'PROD_ENC:',
        'TEST_ENC:',
        'STAGING_ENC:',
    )
    return value.startswith(known_prefixes)
|
||||
def find_secret_keys_in_file(file_path: Path) -> list:
    """
    Find all *_SECRET keys in an environment file that are not encrypted.

    Values already carrying any environment encryption prefix (see
    is_any_encrypted_value) are skipped.  A value opening with '{' is
    treated as a multiline JSON object: subsequent lines are collected
    until the braces balance and joined with newlines into one value.

    Args:
        file_path: Path to the environment file

    Returns:
        list: List of tuples (line_number, key, value, full_line) where
            line_number is 1-based and refers to the line the key is on.
    """
    secret_keys = []

    if not file_path.exists():
        return secret_keys

    # NOTE(review): the previous version also computed the file's
    # APP_ENV_TYPE here but never used it, and carried a second
    # `value == '{'` branch that was unreachable (the first branch
    # already accepts any truthy unencrypted value, including '{', and
    # handles it identically).  Both have been removed.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        i = 0
        while i < len(lines):
            line = lines[i].strip()

            # Skip empty lines and comments
            if not line or line.startswith('#'):
                i += 1
                continue

            # Check if line contains a key-value pair
            if '=' in line:
                key, value = line.split('=', 1)
                key = key.strip()
                value = value.strip()

                # Only *_SECRET keys with a non-empty, not-yet-encrypted value
                if key.endswith('_SECRET') and value and not is_any_encrypted_value(value):
                    if value.startswith('{'):
                        # Multiline JSON: collect lines until braces balance
                        json_lines = [value]
                        start_line = i + 1
                        i += 1
                        brace_count = value.count('{') - value.count('}')

                        while i < len(lines) and brace_count > 0:
                            json_lines.append(lines[i].rstrip('\n'))
                            brace_count += lines[i].count('{') - lines[i].count('}')
                            i += 1

                        # Join all lines and create the full JSON value
                        full_json_value = '\n'.join(json_lines)
                        secret_keys.append((start_line, key, full_json_value, line))
                        i -= 1  # Adjust for the loop increment below
                    else:
                        # Single line value
                        secret_keys.append((i + 1, key, value, line))

            i += 1

    except Exception as e:
        print(f"Error reading {file_path}: {e}")

    return secret_keys
|
||||
def backup_file(file_path: Path) -> Path:
    """
    Create a backup of the file before modification.

    Note that with_suffix() *replaces* the original extension, so
    'env_dev.env' becomes 'env_dev.<timestamp>.backup'.

    Args:
        file_path: Path to the file to backup

    Returns:
        Path: Path to the backup file
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = file_path.with_suffix(f'.{stamp}.backup')
    # copy2 preserves metadata (timestamps) along with the contents.
    shutil.copy2(file_path, destination)
    return destination
|
||||
def encrypt_all_secrets_in_file(file_path: Path, dry_run: bool = False, create_backup: bool = True) -> Dict[str, Any]:
    """
    Encrypt all non-encrypted secrets in a file.

    The environment type is read from the file's own APP_ENV_TYPE entry,
    so every file is encrypted with its own environment-specific key.
    Failures on individual secrets are recorded in results['errors'] and
    do not stop the remaining secrets from being processed.

    Args:
        file_path: Path to the environment file
        dry_run: If True, only show what would be changed
        create_backup: If True, create a backup before modifying

    Returns:
        dict: Results of the encryption process with keys 'file',
            'env_type', 'secrets_found', 'secrets_encrypted', 'errors'
            and 'backup_created'.
    """
    # Get the environment type from the file itself
    file_env_type = get_env_type_from_file(file_path)

    results = {
        'file': str(file_path),
        'env_type': file_env_type,
        'secrets_found': 0,
        'secrets_encrypted': 0,
        'errors': [],
        'backup_created': None
    }

    # Find all secret keys (plain *_SECRET values without an ENC prefix)
    secret_keys = find_secret_keys_in_file(file_path)
    results['secrets_found'] = len(secret_keys)

    if not secret_keys:
        print(f" ✅ No unencrypted secrets found - all values already have encryption prefixes")
        return results

    print(f" Found {len(secret_keys)} non-encrypted secrets")

    if dry_run:
        print(" [DRY RUN] Would encrypt the following secrets:")
        for line_num, key, value, full_line in secret_keys:
            # Truncate the preview to 50 characters
            print(f" Line {line_num}: {key} = {value[:50]}{'...' if len(value) > 50 else ''}")
        return results

    # Create backup if requested; a failed backup is a warning, not fatal
    if create_backup:
        try:
            backup_path = backup_file(file_path)
            results['backup_created'] = str(backup_path)
            print(f" 📋 Backup created: {backup_path.name}")
        except Exception as e:
            results['errors'].append(f"Failed to create backup: {e}")
            print(f" ⚠️ Warning: Could not create backup: {e}")

    # Read the file content
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except Exception as e:
        results['errors'].append(f"Failed to read file: {e}")
        return results

    # Process each secret key (line_num values are 1-based)
    for line_num, key, value, full_line in secret_keys:
        try:
            print(f" 🔐 Encrypting {key}...")

            # Encrypt the value using the environment type from the file
            encrypted_value = encrypt_value(value, file_env_type)

            # Replace the line in the file content
            new_line = f"{key} = {encrypted_value}\n"
            lines[line_num - 1] = new_line

            # If this was a multiline JSON, we need to remove the remaining lines
            if value.startswith('{') and '\n' in value:
                # Count how many lines the original JSON spanned
                json_lines = value.split('\n')
                lines_to_remove = len(json_lines) - 1  # -1 because we already replaced the first line

                # Blank out the remaining lines; empty strings write nothing
                # in writelines(), so indices of later secrets stay valid.
                for i in range(line_num, line_num + lines_to_remove):
                    if i < len(lines):
                        lines[i] = ""

            results['secrets_encrypted'] += 1
            print(f" ✓ Encrypted successfully")

        except Exception as e:
            error_msg = f"Failed to encrypt {key}: {e}"
            results['errors'].append(error_msg)
            print(f" ✗ {error_msg}")

    # Write the modified content back to the file (only if something changed)
    if results['secrets_encrypted'] > 0:
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.writelines(lines)
            print(f" 💾 File updated successfully")
        except Exception as e:
            results['errors'].append(f"Failed to write file: {e}")
            print(f" ✗ Failed to write file: {e}")

    return results
|
||||
def process_all_env_files(env_files: List[str] = None, dry_run: bool = False, create_backup: bool = True) -> Dict[str, Any]:
    """
    Process all environment files and encrypt unencrypted secrets.

    Args:
        env_files: List of specific files to process (if None, processes all three default files)
        dry_run: If True, only show what would be changed
        create_backup: If True, create backups before modifying

    Returns:
        dict: Summary with keys 'total_files', 'total_secrets_found',
            'total_secrets_encrypted', 'total_errors' and 'files'
            (the per-file result dicts).
    """
    # Default environment files if none specified
    if env_files is None:
        env_files = ['env_dev.env', 'env_int.env', 'env_prod.env']

    # Convert to Path objects, dropping missing files with a warning
    env_paths = []
    for env_file in env_files:
        env_path = Path(env_file)
        if not env_path.exists():
            print(f"⚠️ Warning: Environment file not found: {env_file}")
            continue
        env_paths.append(env_path)

    if not env_paths:
        print("❌ No valid environment files found to process")
        return {'total_files': 0, 'total_secrets_found': 0, 'total_secrets_encrypted': 0, 'total_errors': 0, 'files': []}

    print("🔐 PowerOn Batch Secret Encryption Tool")
    print("=" * 60)
    print("⚠️ IMPORTANT: The tool will read APP_ENV_TYPE from each file itself")
    print("⚠️ Each file will be processed with its own environment-specific encryption")
    print()

    if dry_run:
        print("🔍 DRY RUN MODE - No changes will be made")
        print()

    # Process each file, accumulating totals for the summary
    all_results = []
    total_secrets_found = 0
    total_secrets_encrypted = 0
    total_errors = 0

    for env_path in env_paths:
        print(f"\n📁 Processing {env_path.name}:")
        results = encrypt_all_secrets_in_file(env_path, dry_run, create_backup)
        all_results.append(results)

        total_secrets_found += results['secrets_found']
        total_secrets_encrypted += results['secrets_encrypted']
        total_errors += len(results['errors'])

    # Summary
    print("\n" + "=" * 60)
    print("📊 SUMMARY")
    print("=" * 60)
    print(f"Files processed: {len(env_paths)}")
    print(f"Total secrets found: {total_secrets_found}")

    if not dry_run:
        print(f"Total secrets encrypted: {total_secrets_encrypted}")
        print(f"Total errors: {total_errors}")

        if total_errors == 0 and total_secrets_encrypted > 0:
            print("\n🎉 All secrets encrypted successfully!")
        elif total_errors > 0:
            print(f"\n⚠️ Completed with {total_errors} errors")
        else:
            print("\n✅ No secrets needed encryption")
    else:
        print(f"Secrets that would be encrypted: {total_secrets_found}")

    # Show backup information
    backups_created = [r['backup_created'] for r in all_results if r['backup_created']]
    if backups_created:
        print(f"\n📋 Backups created: {len(backups_created)}")
        for backup in backups_created:
            print(f" - {Path(backup).name}")

    # Show errors if any (flattened across all files)
    all_errors = []
    for results in all_results:
        all_errors.extend(results['errors'])

    if all_errors:
        print(f"\n❌ Errors encountered:")
        for error in all_errors:
            print(f" - {error}")

    return {
        'total_files': len(env_paths),
        'total_secrets_found': total_secrets_found,
        'total_secrets_encrypted': total_secrets_encrypted,
        'total_errors': total_errors,
        'files': all_results
    }
|
||||
def main():
    """Command-line entry point: parse arguments and run batch encryption.

    Returns 0 on success and 1 when any file reported errors or an
    unexpected exception occurred.
    """
    parser = argparse.ArgumentParser(description='Encrypt all *_SECRET variables in all environment files')
    parser.add_argument('--files', '-f', nargs='+',
                        help='Specific environment files to process (default: all three env files)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be changed without making changes')
    parser.add_argument('--no-backup', action='store_true',
                        help='Skip creating backup files')
    args = parser.parse_args()

    try:
        summary = process_all_env_files(
            env_files=args.files,
            dry_run=args.dry_run,
            create_backup=not args.no_backup
        )
    except Exception as e:
        print(f"Error: {e}")
        return 1

    # Map the result to a shell-friendly exit code
    return 1 if summary['total_errors'] > 0 else 0
|
||||
if __name__ == '__main__':
    # Propagate main()'s return code (0 on success, 1 on error) to the shell.
    sys.exit(main())
|
||||
Loading…
Reference in a new issue