Merge branch 'int' into feat/chatbot

This commit is contained in:
ValueOn AG 2025-10-15 12:38:42 +02:00 committed by GitHub
commit 57118a633e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
93 changed files with 13774 additions and 3691 deletions

View file

@ -30,3 +30,8 @@ Web_Search_MIN_RESULTS = 1
Web_Crawl_TIMEOUT = 30
Web_Crawl_MAX_RETRIES = 3
Web_Crawl_RETRY_DELAY = 2
# Web Research configuration
Web_Research_MAX_DEPTH = 2
Web_Research_MAX_LINKS_PER_DOMAIN = 4
Web_Research_CRAWL_TIMEOUT_MINUTES = 10

View file

@ -66,14 +66,14 @@ Connector_AiAnthropic_MAX_TOKENS = 2000
# Perplexity AI configuration
Connector_AiPerplexity_API_URL = https://api.perplexity.ai/chat/completions
Connector_AiPerplexity_API_SECRET = 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
Connector_AiPerplexity_API_SECRET = DEV_ENC:Z0FBQUFBQm82Mzk2Q1MwZ0dNcUVBcUtuRDJIcTZkMXVvYnpjM3JEMzJiT1NKSHljX282ZDIyZTJYc09VSTdVNXAtOWU2UXp5S193NTk5dHJsWlFjRjhWektFOG1DVGY4ZUhHTXMzS0RPN1lNcF9nSlVWbW5BZ1hkZDVTejl6bVZNRFVvX29xamJidWRFMmtjQmkyRUQ2RUh6UTN1aWNPSUJBPT0=
Connector_AiPerplexity_MODEL_NAME = sonar
Connector_AiPerplexity_TEMPERATURE = 0.2
Connector_AiPerplexity_MAX_TOKENS = 2000
# Agent Mail configuration
Service_MSFT_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c
Service_MSFT_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpEQk4xYnpmbnItUEU3dHU4eHB5dzVYay1WT012RTRLUWJDTlBILVY5dC1FX3VMNjZmLThrbDRFNWFSNGprY3RRTlpYNGlubVBpNnY3MjNJcGtzVk9PMzRacl9LUlM2RU5vTVVZWHJvaUhWSHVfc1pNR0pfQmI5SEprOG5KdlB1QnQ=
Service_MSFT_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQm83T29rV1pQelMtc1p1MXR4NTFpa19CTEhHQ0xfNmdPUmZqcWp5UHBMS0hYTGl4c1pPdmhTNTJVWUl5WnlnUUZhV0VTRzVCb0d5YjR1NnZPZk5CZ0dGazNGdUJVbjkxeVdrYlNiVjJUYzF2aVFtQnVxTHFqTTJqZlF0RTFGNmE1OGN1TEk=
Service_MSFT_TENANT_ID = common
# Google Service configuration
@ -88,3 +88,7 @@ Connector_GoogleSpeech_API_KEY_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpETk5FWWM3Q0JKMzhI
# Feature SyncDelta JIRA configuration
Feature_SyncDelta_JIRA_DELTA_TOKEN_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpEbm0yRUJ6VUJKbUwyRW5kMnRaNW4wM2YxMkJUTXVXZUdmdVRCaUZIVHU2TTV2RWZLRmUtZkcwZE4yRUNlNDQ0aUJWYjNfdVg5YjV5c2JwMHhoUUYxZWdkeS11bXR0eGxRLWRVaVU3cUVQZWJlNDRtY1lWUDdqeDVFSlpXS0VFX21WajlRS3lHQjc0bS11akkybWV3QUFlR2hNWUNYLUdiRjZuN2dQODdDSExXWG1Dd2ZGclI2aUhlSWhETVZuY3hYdnhkb2c2LU1JTFBvWFpTNmZtMkNVOTZTejJwbDI2eGE0OS1xUlIwQnlCSmFxRFNCeVJNVzlOMDhTR1VUamx4RDRyV3p6Tk9qVHBrWWdySUM3TVRaYjd3N0JHMFhpdzFhZTNDLTFkRVQ2RVE4U19COXRhRWtNc0NVOHRqUS1CRDFpZ19xQmtFLU9YSDU3TXBZQXpVcld3PT0=
# Debug Configuration
APP_DEBUG_CHAT_WORKFLOW_ENABLED = True
APP_DEBUG_CHAT_WORKFLOW_DIR = ./test-chat

View file

@ -66,14 +66,14 @@ Connector_AiAnthropic_MAX_TOKENS = 2000
# Perplexity AI configuration
Connector_AiPerplexity_API_URL = https://api.perplexity.ai/chat/completions
Connector_AiPerplexity_API_SECRET = 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
Connector_AiPerplexity_API_SECRET = INT_ENC:Z0FBQUFBQm82Mzk2UWZJdUFhSW8yc3RKc0tKRXphd0xWMkZOVlFpSGZ4SGhFWnk0cTF5VjlKQVZjdS1QSWdkS0pUSWw4OFU5MjUxdTVQel9aeWVIZTZ5TXRuVmFkZG0zWEdTOGdHMHpsTzI0TGlWYURKU1Q0VVpKTlhxUk5FTmN6SUJScDZ3ZldIaUJZcWpaQVRiSEpyQm9tRTNDWk9KTnZBPT0=
Connector_AiPerplexity_MODEL_NAME = sonar
Connector_AiPerplexity_TEMPERATURE = 0.2
Connector_AiPerplexity_MAX_TOKENS = 2000
# Agent Mail configuration
Service_MSFT_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c
Service_MSFT_CLIENT_SECRET = INT_ENC:Z0FBQUFBQm8xSVRjNzB2M3ZjaE1SVE9ON2FKam9yVURxcHl1Ym5VNVUtS0MyWUpNVXVlaWpWS2U3VVd3em9vQl9lcnVYay03bS04YjNBbDZZNTB4eUtjT3ppQjJjY3dOT0FNLW9LeDhIUU5iaTNqNURUWE5La3kzaHNGcU9yNVI0YjhWZTZRRFktcTk=
Service_MSFT_CLIENT_SECRET = INT_ENC:Z0FBQUFBQm83T29rMDZvcV9qTG5xb1FzUkdqS1llbzRxSEJXbmpONFFtcUtfZXdtZjQybmJSMjBjMEpnRVhiOGRuczZvVFBFdVVTQV80SG9PSnRQTEpLdVViNm5wc2E5aGRLWjZ4TGF1QjVkNmdRSzBpNWNkYXVublFYclVEdEM5TVBBZWVVMW5RVWk=
Service_MSFT_TENANT_ID = common
# Google Service configuration
@ -88,3 +88,7 @@ Connector_GoogleSpeech_API_KEY_SECRET = INT_ENC:Z0FBQUFBQm8xSVRkNmVXZ1pWcHcydTF2
# Feature SyncDelta JIRA configuration
Feature_SyncDelta_JIRA_DELTA_TOKEN_SECRET = INT_ENC:Z0FBQUFBQm8xSVRkTUNsWm4wX0p6eXFDZmJ4dFdHNEs1MV9MUzdrb3RzeC1jVWVYZ0REWHRyZkFiaGZLcUQtTXFBZzZkNzRmQ0gxbEhGbUNlVVFfR1JEQTc0aldkZkgyWnBOcjdlUlZxR0tDTEdKRExULXAyUEtsVmNTMkRKU1BJNnFiM0hlMXo4YndMcHlRMExtZDQ3Zm9vNFhMcEZCcHpBPT0=
# Debug Configuration
APP_DEBUG_CHAT_WORKFLOW_ENABLED = False
APP_DEBUG_CHAT_WORKFLOW_DIR = ./test-chat

View file

@ -66,14 +66,14 @@ Connector_AiAnthropic_MAX_TOKENS = 2000
# Perplexity AI configuration
Connector_AiPerplexity_API_URL = https://api.perplexity.ai/chat/completions
Connector_AiPerplexity_API_SECRET = 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
Connector_AiPerplexity_API_SECRET = PROD_ENC:Z0FBQUFBQm82Mzk2Q1FGRkJEUkI4LXlQbHYzT2RkdVJEcmM4WGdZTWpJTEhoeUF1NW5LUVpJdDBYN3k1WFN4a2FQSWJSQmd0U0xJbzZDTmFFN05FcXl0Z3V1OEpsZjYydV94TXVjVjVXRTRYSWdLMkd5XzZIbFV6emRCZHpuOUpQeThadE5xcDNDVGV1RHJrUEN0c1BBYXctZFNWcFRuVXhRPT0=
Connector_AiPerplexity_MODEL_NAME = sonar
Connector_AiPerplexity_TEMPERATURE = 0.2
Connector_AiPerplexity_MAX_TOKENS = 2000
# Agent Mail configuration
Service_MSFT_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c
Service_MSFT_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQm8xSU5pVEhHdlZHU3FNMmhuRGVwaGc3YzIxSjlZNzBCQjlOV2pSYVNXb0t1ZnVwQzZsQzY4cHMtVlZtNF85OEVaV1BMTzdXMmpzaGZpaG1DalJ0bkNPMHA5ZUcwZjNDdGk1TFdxYTJSZnVrVmhhZ2VRUEZxbjJOOGFhWk9EYlY3dmRVTnI=
Service_MSFT_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQm83T29rSzdYLTRydXN5V3lQLXhmRjMyQ1FOaGpuek45QllaX1REN2s5aWNIUl81NGlrYlJTeFV0RlRZd0xPcm5uMDM4QlpibHJQbm5XZTlWeWxfcWNVdFpCUHI2amh0MVBnZ21IN2ptSkhWLTVfaHEwNmI5SEtiS05pQmt5eV8yMnhLMEc=
Service_MSFT_TENANT_ID = common
# Google Service configuration
@ -88,3 +88,7 @@ Connector_GoogleSpeech_API_KEY_SECRET = PROD_ENC:Z0FBQUFBQm8xSU5pNjlJdmFMeERXUUQ
# Feature SyncDelta JIRA configuration
Feature_SyncDelta_JIRA_DELTA_TOKEN_SECRET = PROD_ENC:Z0FBQUFBQm8xSU5pTDhnTVNzRUhScU8wYnZsZk52bHFkSWxLc18xQmtCeC1HbnNwTzVBbXRNTmQzRjZYaGE2MVlCNGtnWDk1T2I5VXVKNHpKU1VRbXEyN2tRWUJnU2ltZE5qZ3lmNEF6Z1hMTTEwZkk2NUNBYjhmVTJEcWpRUW9HNEVpSGFWdjBWQXQ3eUtHUTFJS3U5QWpaeno0RFNhMUxnPT0=
# Debug Configuration
APP_DEBUG_CHAT_WORKFLOW_ENABLED = False
APP_DEBUG_CHAT_WORKFLOW_DIR = ./test-chat

View file

@ -1,5 +1,6 @@
import logging
import httpx
import os
from typing import Dict, Any, List, Union
from fastapi import HTTPException
from modules.shared.configuration import APP_CONFIG
@ -147,6 +148,11 @@ class AiAnthropic:
# Direct content as string (in older API versions)
content = anthropicResponse["content"]
# Debug logging for empty responses
if not content or content.strip() == "":
logger.warning(f"Anthropic API returned empty content. Full response: {anthropicResponse}")
content = "[Anthropic API returned empty response]"
# Return in OpenAI format
return {
"id": anthropicResponse.get("id", ""),
@ -182,14 +188,27 @@ class AiAnthropic:
The analysis response as text
"""
try:
# Debug logging
logger.info(f"callAiImage called with imageData type: {type(imageData)}, length: {len(imageData) if imageData else 0}, mimeType: {mimeType}")
# Distinguish between file path and binary data
if isinstance(imageData, str):
# Check if it's base64 encoded data or a file path
if len(imageData) > 100 and not os.path.exists(imageData):
# It's likely base64 encoded data
logger.info("Treating imageData as base64 encoded string")
base64Data = imageData
if not mimeType:
mimeType = "image/png"
else:
# It's a file path - import filehandling only when needed
logger.info(f"Treating imageData as file path: {imageData}")
from modules import agentserviceFilemanager as fileHandler
base64Data, autoMimeType = fileHandler.encodeFileToBase64(imageData)
mimeType = mimeType or autoMimeType
else:
# It's binary data
logger.info("Treating imageData as binary data")
import base64
base64Data = base64.b64encode(imageData).decode('utf-8')
# MIME type must be specified for binary data
@ -216,8 +235,16 @@ class AiAnthropic:
# Use the existing callAiBasic function with the Vision model
response = await self.callAiBasic(messages)
# Extract and return content
return response["choices"][0]["message"]["content"]
# Extract and return content with proper error handling
try:
content = response["choices"][0]["message"]["content"]
if content is None or content.strip() == "":
return "[AI returned empty response for image analysis]"
return content
except (KeyError, IndexError, TypeError) as e:
logger.error(f"Error extracting content from AI response: {str(e)}")
logger.error(f"Response structure: {response}")
return f"[Error extracting AI response: {str(e)}]"
except Exception as e:
logger.error(f"Error during image analysis: {str(e)}", exc_info=True)

View file

@ -189,3 +189,82 @@ class AiOpenai:
except Exception as e:
logger.error(f"Error during image analysis: {str(e)}", exc_info=True)
return f"[Error during image analysis: {str(e)}]"
async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid") -> Dict[str, Any]:
    """
    Generate an image using DALL-E 3.

    Args:
        prompt: The text prompt for image generation
        size: Image size (1024x1024, 1792x1024, or 1024x1792)
        quality: Image quality (standard or hd)
        style: Image style (vivid or natural)

    Returns:
        Dictionary with "success" (bool) and either "image_data" (base64 str)
        plus the size/quality/style echoed back, or an "error" message.
        Never raises: all failures are reported via the returned dict.
    """
    try:
        logger.debug(f"Starting image generation with prompt: '{prompt[:100]}...'")
        # DALL-E 3 API endpoint
        dalle_url = "https://api.openai.com/v1/images/generations"
        payload = {
            "model": "dall-e-3",
            "prompt": prompt,
            "size": size,
            "quality": quality,
            "style": style,
            "n": 1,
            "response_format": "b64_json"  # Get base64 data directly instead of URLs
        }
        # Use the client as an async context manager so it is always closed,
        # even when the request raises; the previous manual create/aclose
        # leaked the client on any exception from post().
        async with httpx.AsyncClient(
            timeout=120.0,
            headers={
                "Authorization": f"Bearer {self.apiKey}",
                "Content-Type": "application/json"
            }
        ) as dalle_client:
            response = await dalle_client.post(dalle_url, json=payload)
        if response.status_code != 200:
            logger.error(f"DALL-E API error: {response.status_code} - {response.text}")
            return {
                "success": False,
                "error": f"DALL-E API error: {response.status_code} - {response.text}"
            }
        responseJson = response.json()
        if "data" in responseJson and len(responseJson["data"]) > 0:
            image_data = responseJson["data"][0]["b64_json"]
            logger.info(f"Successfully generated image: {len(image_data)} characters")
            return {
                "success": True,
                "image_data": image_data,
                "size": size,
                "quality": quality,
                "style": style
            }
        logger.error("No image data in DALL-E response")
        return {
            "success": False,
            "error": "No image data in DALL-E response"
        }
    except Exception as e:
        logger.error(f"Error during image generation: {str(e)}", exc_info=True)
        return {
            "success": False,
            "error": f"Error during image generation: {str(e)}"
        }

View file

@ -271,6 +271,7 @@ class ConnectorWeb:
include_domains: list[str] | None = None,
exclude_domains: list[str] | None = None,
language: str | None = None,
country: str | None = None,
include_answer: bool | None = None,
include_raw_content: bool | None = None,
) -> list[WebSearchResult]:
@ -290,17 +291,20 @@ class ConnectorWeb:
kwargs["time_range"] = time_range
if topic is not None:
kwargs["topic"] = topic
if include_domains is not None:
if include_domains is not None and len(include_domains) > 0:
kwargs["include_domains"] = include_domains
if exclude_domains is not None:
kwargs["exclude_domains"] = exclude_domains
if language is not None:
kwargs["language"] = language
if country is not None:
kwargs["country"] = country
if include_answer is not None:
kwargs["include_answer"] = include_answer
if include_raw_content is not None:
kwargs["include_raw_content"] = include_raw_content
logger.debug(f"Tavily.search kwargs: {kwargs}")
response = await self.client.search(**kwargs)
return [

View file

@ -135,3 +135,29 @@ class AiCallResponse(BaseModel):
costEstimate: Optional[float] = Field(default=None, description="Estimated cost of the call")
class EnhancedAiCallOptions(AiCallOptions):
    """Enhanced options for improved document processing with chunk mapping."""
    # Parallel processing
    enableParallelProcessing: bool = Field(
        default=True,
        description="Enable parallel processing of chunks"
    )
    # Bounded to 1..20 so callers can neither disable concurrency via 0
    # nor flood the backend with an unbounded fan-out.
    maxConcurrentChunks: int = Field(
        default=5,
        ge=1,
        le=20,
        description="Maximum number of chunks to process concurrently"
    )
    # Chunk mapping
    preserveChunkMetadata: bool = Field(
        default=True,
        description="Preserve chunk metadata during processing"
    )
    # Inserted between chunk results when they are merged into one output.
    chunkSeparator: str = Field(
        default="\n\n---\n\n",
        description="Separator between chunks in merged output"
    )

View file

@ -0,0 +1,130 @@
from typing import Any, Dict, List, Optional, Literal, Union
from pydantic import BaseModel, Field
from datetime import datetime
class DocumentMetadata(BaseModel):
    """Metadata for the entire document."""
    title: str = Field(description="Document title")
    author: Optional[str] = Field(default=None, description="Document author")
    # default_factory=datetime.now: stamped when the model instance is created.
    created_at: datetime = Field(default_factory=datetime.now, description="Creation timestamp")
    source_documents: List[str] = Field(default_factory=list, description="Source document IDs")
    extraction_method: str = Field(default="ai_extraction", description="Method used for extraction")
    version: str = Field(default="1.0", description="Document version")


class TableData(BaseModel):
    """Structured table data."""
    headers: List[str] = Field(description="Table column headers")
    # Each inner list is one row; cell values are kept as strings.
    rows: List[List[str]] = Field(description="Table data rows")
    caption: Optional[str] = Field(default=None, description="Table caption")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Table metadata")


class ListItem(BaseModel):
    """Individual list item with optional sub-items."""
    text: str = Field(description="List item text")
    # Self-referencing type; the forward reference is resolved at module bottom
    # via model_rebuild()/update_forward_refs().
    subitems: Optional[List['ListItem']] = Field(default=None, description="Nested sub-items")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Item metadata")


class BulletList(BaseModel):
    """Bulleted or numbered list."""
    items: List[ListItem] = Field(description="List items")
    list_type: Literal["bullet", "numbered", "checklist"] = Field(default="bullet", description="List type")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="List metadata")


class Paragraph(BaseModel):
    """Text paragraph with optional formatting."""
    text: str = Field(description="Paragraph text")
    formatting: Optional[Dict[str, Any]] = Field(default=None, description="Text formatting (bold, italic, etc.)")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Paragraph metadata")


class Heading(BaseModel):
    """Document heading."""
    text: str = Field(description="Heading text")
    # Constrained to the HTML-style range h1..h6.
    level: int = Field(ge=1, le=6, description="Heading level (1-6)")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Heading metadata")


class CodeBlock(BaseModel):
    """Code block with syntax highlighting."""
    code: str = Field(description="Code content")
    language: Optional[str] = Field(default=None, description="Programming language")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Code block metadata")


class Image(BaseModel):
    """Image with metadata."""
    # Image bytes are stored inline as base64 text, not as a URL or path.
    data: str = Field(description="Base64 encoded image data")
    alt_text: Optional[str] = Field(default=None, description="Alternative text")
    caption: Optional[str] = Field(default=None, description="Image caption")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Image metadata")
class DocumentSection(BaseModel):
    """A section of the document containing one or more content elements."""
    id: str = Field(description="Unique section identifier")
    title: Optional[str] = Field(default=None, description="Section title")
    # NOTE(review): "mixed" presumably marks sections combining several
    # element types — confirm against the extraction prompt/producer.
    content_type: Literal["table", "list", "paragraph", "heading", "code", "image", "mixed"] = Field(description="Primary content type")
    elements: List[Union[TableData, BulletList, Paragraph, Heading, CodeBlock, Image]] = Field(description="Content elements in this section")
    # Position of the section within the document.
    order: int = Field(description="Section order in document")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Section metadata")
class StructuredDocument(BaseModel):
    """Complete structured document in JSON format."""
    metadata: DocumentMetadata = Field(description="Document metadata")
    sections: List[DocumentSection] = Field(description="Document sections")
    summary: Optional[str] = Field(default=None, description="Document summary")
    tags: List[str] = Field(default_factory=list, description="Document tags")

    def get_sections_by_type(self, content_type: str) -> List[DocumentSection]:
        """Get all sections of a specific content type."""
        return [section for section in self.sections if section.content_type == content_type]

    def get_all_tables(self) -> List[TableData]:
        """Get all table data from the document, in section/element order."""
        # Flattened nested append-loop into a comprehension; behavior unchanged.
        return [
            element
            for section in self.sections
            for element in section.elements
            if isinstance(element, TableData)
        ]

    def get_all_lists(self) -> List[BulletList]:
        """Get all lists from the document, in section/element order."""
        return [
            element
            for section in self.sections
            for element in section.elements
            if isinstance(element, BulletList)
        ]
class JsonChunkResult(BaseModel):
    """Result from processing a single chunk with JSON output."""
    chunk_id: str = Field(description="Chunk identifier")
    document_section: DocumentSection = Field(description="Structured content from this chunk")
    processing_time: float = Field(description="Processing time in seconds")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Chunk processing metadata")


class JsonMergeResult(BaseModel):
    """Result from merging multiple JSON chunks."""
    merged_document: StructuredDocument = Field(description="Merged structured document")
    merge_strategy: str = Field(description="Strategy used for merging")
    chunks_processed: int = Field(description="Number of chunks processed")
    merge_time: float = Field(description="Time taken to merge chunks")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Merge process metadata")


# Resolve the self-referencing ListItem.subitems forward reference
# (compatible with Pydantic v1 and v2).
try:
    # Pydantic v2
    ListItem.model_rebuild()
except AttributeError:
    # Pydantic v1
    ListItem.update_forward_refs()

View file

@ -18,6 +18,16 @@ class ContentExtracted(BaseModel):
summary: Optional[Dict[str, Any]] = Field(default=None, description="Optional extraction summary")
class ChunkResult(BaseModel):
    """Preserves the relationship between a chunk and its AI result."""
    # The chunk exactly as it was submitted for processing.
    originalChunk: ContentPart
    # Text returned by the model for this chunk.
    aiResult: str
    # Position of the chunk within its source document — presumably
    # zero-based; confirm against the chunking code.
    chunkIndex: int
    # Identifier of the document the chunk came from.
    documentId: str
    # Seconds spent processing this chunk; 0.0 when not measured.
    processingTime: float = 0.0
    metadata: Dict[str, Any] = Field(default_factory=dict)
class MergeStrategy(BaseModel):
"""Strategy configuration for merging content parts and AI results."""

View file

@ -1,4 +1,5 @@
import logging
import asyncio
from typing import Dict, Any, List, Union, Tuple, Optional
from dataclasses import dataclass
@ -260,6 +261,7 @@ class AiObjects:
if not requiredTags:
requiredTags = OPERATION_TAG_MAPPING.get(options.operationType, [ModelTags.TEXT, ModelTags.CHAT])
# Override priority based on processing mode if not explicitly set
effectivePriority = options.priority
if options.priority == Priority.BALANCED:
@ -268,6 +270,7 @@ class AiObjects:
logger.info(f"Model selection - Operation: {options.operationType}, Required tags: {requiredTags}, Priority: {effectivePriority}")
for name, info in aiModels.items():
logger.info(f"Checking model: {name}, tags: {info.get('tags', [])}, function: {info.get('function', 'unknown')}")
# Check context length
if info["contextLength"] > 0 and totalSize > info["contextLength"] * 0.8:
continue
@ -279,8 +282,11 @@ class AiObjects:
# Check required tags/capabilities
modelTags = info.get("tags", [])
if requiredTags and not any(tag in modelTags for tag in requiredTags):
if requiredTags and not all(tag in modelTags for tag in requiredTags):
logger.info(f" -> Skipping {name}: missing required tags. Has: {modelTags}, needs: {requiredTags}")
continue
else:
logger.info(f" -> {name} passed tag check")
# Check processing mode requirements
if options.processingMode == ProcessingMode.DETAILED and ModelTags.FAST in modelTags:
@ -288,16 +294,24 @@ class AiObjects:
continue
candidates[name] = info
logger.info(f" -> {name} added to candidates")
logger.info(f"Final candidates: {list(candidates.keys())}")
if not candidates:
logger.info("No candidates found, using fallback")
# Fallback based on operation type
if options.operationType == OperationType.IMAGE_ANALYSIS:
logger.info("Using fallback: openai_callAiImage")
return "openai_callAiImage"
elif options.operationType == OperationType.IMAGE_GENERATION:
logger.info("Using fallback: openai_generateImage")
return "openai_generateImage"
elif options.operationType == OperationType.WEB_RESEARCH:
logger.info("Using fallback: perplexity_callAiWithWebSearch")
return "perplexity_callAiWithWebSearch"
else:
logger.info("Using fallback: openai_callAiBasic_gpt35")
return "openai_callAiBasic_gpt35"
# Special handling for planning operations - use Claude for consistency
@ -313,17 +327,60 @@ class AiObjects:
# Select based on priority for other operations
if effectivePriority == Priority.SPEED:
return max(candidates, key=lambda k: candidates[k]["speedRating"])
selected = max(candidates, key=lambda k: candidates[k]["speedRating"])
logger.info(f"Selected by SPEED: {selected}")
return selected
elif effectivePriority == Priority.QUALITY:
return max(candidates, key=lambda k: candidates[k]["qualityRating"])
selected = max(candidates, key=lambda k: candidates[k]["qualityRating"])
logger.info(f"Selected by QUALITY: {selected}")
return selected
elif effectivePriority == Priority.COST:
return min(candidates, key=lambda k: candidates[k]["costPer1kTokens"])
selected = min(candidates, key=lambda k: candidates[k]["costPer1kTokens"])
logger.info(f"Selected by COST: {selected}")
return selected
else: # BALANCED
def balancedScore(name: str) -> float:
info = candidates[name]
return info["qualityRating"] * 0.4 + info["speedRating"] * 0.3 + (10 - info["costPer1kTokens"] * 1000) * 0.3
return max(candidates, key=balancedScore)
selected = max(candidates, key=balancedScore)
logger.info(f"Selected by BALANCED: {selected}")
return selected
def _getFallbackModels(self, operationType: str) -> List[str]:
    """Return the ordered chain of fallback model names for an operation type.

    The first entry is tried first; unknown operation types resolve to the
    GENERAL chain.
    """
    generalChain = [
        "openai_callAiBasic_gpt35",   # fast and reliable
        "openai_callAiBasic",         # high quality
        "anthropic_callAiBasic",      # alternative high quality
        "perplexity_callAiBasic",     # cost effective
    ]
    chainsByOperation = {
        OperationType.GENERAL: generalChain,
        OperationType.IMAGE_ANALYSIS: [
            "openai_callAiImage",     # primary image analysis
            "anthropic_callAiImage",  # alternative image analysis
        ],
        OperationType.IMAGE_GENERATION: [
            "openai_generateImage",   # only image generation model
        ],
        OperationType.WEB_RESEARCH: [
            "perplexity_callAiWithWebSearch",  # primary web research
            "perplexity_callAiBasic",          # alternative with web search
            "openai_callAiBasic",              # fallback to general model
        ],
        OperationType.GENERATE_PLAN: [
            "anthropic_callAiBasic",      # best for planning
            "openai_callAiBasic",         # high quality alternative
            "openai_callAiBasic_gpt35",   # fast fallback
        ],
        OperationType.ANALYSE_CONTENT: [
            "anthropic_callAiBasic",      # best for analysis
            "openai_callAiBasic",         # high quality alternative
            "openai_callAiBasic_gpt35",   # fast fallback
        ],
    }
    return chainsByOperation.get(operationType, generalChain)
def _connectorFor(self, modelName: str):
"""Get the appropriate connector for the model."""
@ -340,7 +397,7 @@ class AiObjects:
raise ValueError(f"Unknown connector type: {connectorType}")
async def call(self, request: AiCallRequest) -> AiCallResponse:
"""Call AI model for text generation."""
"""Call AI model for text generation with fallback mechanism."""
prompt = request.prompt
context = request.context or ""
options = request.options
@ -357,9 +414,6 @@ class AiObjects:
if options.compressContext and len(context.encode("utf-8")) > 70000:
context = maybeTruncate(context, 70000)
# Select model for text generation
modelName = self._selectModel(prompt, context, options)
# Derive generation parameters
temperature = getattr(options, "temperature", None)
if temperature is None:
@ -376,6 +430,15 @@ class AiObjects:
messages.append({"role": "system", "content": f"Context from documents:\n{context}"})
messages.append({"role": "user", "content": prompt})
# Get fallback models for this operation type
fallbackModels = self._getFallbackModels(options.operationType)
# Try primary model first, then fallbacks
lastError = None
for attempt, modelName in enumerate(fallbackModels):
try:
logger.info(f"Attempting AI call with model: {modelName} (attempt {attempt + 1}/{len(fallbackModels)})")
connector = self._connectorFor(modelName)
functionName = aiModels[modelName]["function"]
@ -406,29 +469,74 @@ class AiObjects:
else:
raise ValueError(f"Function {functionName} not supported for text generation")
# Estimate cost/tokens
# Success! Estimate cost/tokens and return
totalSize = len((prompt + context).encode("utf-8"))
cost = self._estimateCost(aiModels[modelName], totalSize)
usedTokens = int(totalSize / 4)
logger.info(f"✅ AI call successful with model: {modelName}")
return AiCallResponse(content=content, modelName=modelName, usedTokens=usedTokens, costEstimate=cost)
except Exception as e:
lastError = e
logger.warning(f"❌ AI call failed with model {modelName}: {str(e)}")
# If this is not the last model, try the next one
if attempt < len(fallbackModels) - 1:
logger.info(f"🔄 Trying next fallback model...")
continue
else:
# All models failed
logger.error(f"💥 All {len(fallbackModels)} models failed for operation {options.operationType}")
break
# All fallback attempts failed
errorMsg = f"All AI models failed for operation {options.operationType}. Last error: {str(lastError)}"
logger.error(errorMsg)
raise Exception(errorMsg)
async def callImage(self, prompt: str, imageData: Union[str, bytes], mimeType: str = None, options: AiCallOptions = None) -> str:
"""Call AI model for image analysis."""
"""Call AI model for image analysis with fallback mechanism."""
if options is None:
options = AiCallOptions(operationType=OperationType.IMAGE_ANALYSIS)
# Select model for image analysis
modelName = self._selectModel(prompt, "", options)
# Get fallback models for image analysis
fallbackModels = self._getFallbackModels(OperationType.IMAGE_ANALYSIS)
# Try primary model first, then fallbacks
lastError = None
for attempt, modelName in enumerate(fallbackModels):
try:
logger.info(f"Attempting image analysis with model: {modelName} (attempt {attempt + 1}/{len(fallbackModels)})")
connector = self._connectorFor(modelName)
functionName = aiModels[modelName]["function"]
if functionName == "callAiImage":
return await connector.callAiImage(prompt, imageData, mimeType)
content = await connector.callAiImage(prompt, imageData, mimeType)
logger.info(f"✅ Image analysis successful with model: {modelName}")
return content
else:
raise ValueError(f"Function {functionName} not supported for image analysis")
except Exception as e:
lastError = e
logger.warning(f"❌ Image analysis failed with model {modelName}: {str(e)}")
# If this is not the last model, try the next one
if attempt < len(fallbackModels) - 1:
logger.info(f"🔄 Trying next fallback model for image analysis...")
continue
else:
# All models failed
logger.error(f"💥 All {len(fallbackModels)} models failed for image analysis")
break
# All fallback attempts failed
errorMsg = f"All AI models failed for image analysis. Last error: {str(lastError)}"
logger.error(errorMsg)
raise Exception(errorMsg)
async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid", options: AiCallOptions = None) -> Dict[str, Any]:
"""Generate an image using AI."""
if options is None:
@ -694,7 +802,22 @@ class AiObjects:
logger.warning(f"Failed to extract links from content: {e}")
return []
async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10) -> Dict[str, str]:
def _normalizeUrl(self, url: str) -> str:
"""Normalize URL to handle variations that should be considered duplicates."""
if not url:
return url
# Remove trailing slashes and fragments
url = url.rstrip('/')
if '#' in url:
url = url.split('#')[0]
# Handle common URL variations
url = url.replace('http://', 'https://') # Normalize protocol
return url
async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10, global_processed_urls: Optional[set] = None) -> Dict[str, str]:
"""
Recursively crawl URLs up to specified depth.
@ -703,19 +826,28 @@ class AiObjects:
max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.)
extract_depth: Tavily extract depth setting
max_per_domain: Maximum URLs per domain per level
global_processed_urls: Optional global set to track processed URLs across sessions
Returns:
Dictionary mapping URL -> content for all crawled pages
"""
logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")
# URL index to track all processed URLs
# URL index to track all processed URLs (local + global)
processed_urls = set()
if global_processed_urls is not None:
# Use global index if provided, otherwise create local one
processed_urls = global_processed_urls
logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs")
else:
logger.info("Using local URL index for this crawl session")
all_content = {}
# Current level URLs to process
current_level_urls = urls.copy()
try:
for depth in range(1, max_depth + 1):
logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")
@ -724,18 +856,21 @@ class AiObjects:
next_level_urls = []
for url in current_level_urls:
if url in processed_urls:
logger.debug(f"URL {url} already processed, skipping")
# Normalize URL for duplicate checking
normalized_url = self._normalizeUrl(url)
if normalized_url in processed_urls:
logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping")
continue
try:
logger.info(f"Processing URL at depth {depth}: {url}")
logger.debug(f"Total processed URLs so far: {len(processed_urls)}")
# Read page content
content = await self.readPage(url, extract_depth)
if content:
all_content[url] = content
processed_urls.add(url)
processed_urls.add(normalized_url)
logger.info(f"✓ Successfully processed {url}: {len(content)} chars")
# Get URLs from this page for next level
@ -749,18 +884,21 @@ class AiObjects:
# Add new URLs to next level (avoiding already processed ones)
new_urls_count = 0
for new_url in filtered_urls:
if new_url not in processed_urls:
normalized_new_url = self._normalizeUrl(new_url)
if normalized_new_url not in processed_urls:
next_level_urls.append(new_url)
new_urls_count += 1
else:
logger.debug(f"URL {new_url} (normalized: {normalized_new_url}) already processed, skipping")
logger.info(f"Added {new_urls_count} new URLs to next level from {url}")
else:
logger.warning(f"✗ No content extracted from {url}")
processed_urls.add(url) # Mark as processed to avoid retry
processed_urls.add(normalized_url) # Mark as processed to avoid retry
except Exception as e:
logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
processed_urls.add(url) # Mark as processed to avoid retry
processed_urls.add(normalized_url) # Mark as processed to avoid retry
# Prepare for next iteration
current_level_urls = next_level_urls
@ -772,6 +910,15 @@ class AiObjects:
break
logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
logger.info(f"Total URLs processed (including skipped): {len(processed_urls)}")
logger.info(f"Unique URLs found: {len(all_content)}")
return all_content
except asyncio.TimeoutError:
logger.warning(f"Crawling timed out, returning partial results: {len(all_content)} pages crawled so far")
return all_content
except Exception as e:
logger.error(f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far")
return all_content
async def webQuery(self, query: str, context: str = "", options: AiCallOptions = None) -> str:

View file

@ -571,7 +571,9 @@ class ChatObjects:
actionName=createdMessage.get("actionName")
)
# Debug: Store message and documents for debugging TODO REMOVE
# Debug: Store message and documents for debugging - only if debug enabled
debug_enabled = APP_CONFIG.get("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
if debug_enabled:
self._storeDebugMessageAndDocuments(chat_message)
return chat_message
@ -1052,8 +1054,11 @@ class ChatObjects:
def _storeDebugMessageAndDocuments(self, message: ChatMessage) -> None:
"""
Store message and documents for debugging purposes in fileshare.
Structure: gateway/test-chat/messages/m_round_task_action_timestamp/documentlist_label/documents
Store message and documents (metadata and file bytes) for debugging purposes.
Structure: gateway/test-chat/messages/m_round_task_action_timestamp/documentlist_label/
- message.json, message_text.txt
- document_###_metadata.json
- document_###_<original_filename> (actual file bytes)
Args:
message: ChatMessage object to store
@ -1157,6 +1162,26 @@ class ChatObjects:
logger.info(f"Debug: Stored document metadata for {doc.fileName}")
# Also store the actual file bytes next to metadata for debugging
try:
# Lazy import to avoid circular deps at module load
from modules.interfaces import interfaceDbComponentObjects as comp
componentInterface = comp.getInterface(self.currentUser)
file_bytes = componentInterface.getFileData(doc.fileId)
if file_bytes:
# Build a safe filename preserving original name
safe_name = doc.fileName or f"document_{i+1:03d}"
# Avoid path traversal
safe_name = os.path.basename(safe_name)
doc_file_path = os.path.join(label_folder, f"document_{i+1:03d}_" + safe_name)
with open(doc_file_path, "wb") as df:
df.write(file_bytes)
logger.info(f"Debug: Stored document file bytes: {doc_file_path} ({len(file_bytes)} bytes)")
else:
logger.warning(f"Debug: No file bytes returned for fileId {doc.fileId}")
except Exception as e:
logger.error(f"Debug: Failed to store document file for {doc.fileName} (fileId {doc.fileId}): {e}")
logger.info(f"Debug: Stored message and documents in {message_path}")
except Exception as e:

View file

@ -95,8 +95,8 @@ async def update_prompt(
detail=f"Prompt with ID {promptId} not found"
)
# Convert Prompt to dict for interface
update_data = promptData.dict()
# Convert Prompt to dict for interface, excluding the id field
update_data = promptData.dict(exclude={'id'})
# Update prompt
updatedPrompt = managementInterface.updatePrompt(promptId, update_data)

View file

@ -14,7 +14,7 @@ from pydantic import BaseModel
# Import auth modules
from modules.security.auth import getCurrentUser, limiter, SECRET_KEY, ALGORITHM
from modules.security.jwtService import createAccessToken, createRefreshToken, setAccessTokenCookie, setRefreshTokenCookie
from modules.security.jwtService import createAccessToken, createRefreshToken, setAccessTokenCookie, setRefreshTokenCookie, clearAccessTokenCookie, clearRefreshTokenCookie
from modules.interfaces.interfaceDbAppObjects import getInterface, getRootInterface
from modules.datamodels.datamodelUam import User, UserInDB, AuthAuthority, UserPrivilege
from modules.datamodels.datamodelSecurity import Token
@ -263,8 +263,7 @@ async def read_user_me(
@limiter.limit("60/minute")
async def refresh_token(
request: Request,
response: Response,
currentUser: User = Depends(getCurrentUser)
response: Response
) -> Dict[str, Any]:
"""Refresh access token using refresh token from cookie"""
try:
@ -283,12 +282,27 @@ async def refresh_token(
except jwt.JWTError:
raise HTTPException(status_code=401, detail="Invalid refresh token")
# Get user information from refresh token payload
user_id = payload.get("userId")
if not user_id:
raise HTTPException(status_code=401, detail="Invalid refresh token - missing user ID")
# Get user from database using the user ID from refresh token
try:
app_interface = getRootInterface()
current_user = app_interface.getUser(user_id)
if not current_user:
raise HTTPException(status_code=401, detail="User not found")
except Exception as e:
logger.error(f"Failed to get user from database: {str(e)}")
raise HTTPException(status_code=500, detail="Failed to validate user")
# Create new token data
token_data = {
"sub": currentUser.username,
"mandateId": str(currentUser.mandateId),
"userId": str(currentUser.id),
"authenticationAuthority": currentUser.authenticationAuthority
"sub": current_user.username,
"mandateId": str(current_user.mandateId),
"userId": str(current_user.id),
"authenticationAuthority": current_user.authenticationAuthority
}
# Create new access token + set cookie
@ -365,15 +379,18 @@ async def logout(request: Request, response: Response, currentUser: User = Depen
# Don't fail if audit logging fails
pass
# Clear httpOnly cookies
response.delete_cookie(key="auth_token", httponly=True, samesite="strict")
response.delete_cookie(key="refresh_token", httponly=True, samesite="strict")
return JSONResponse({
# Create the JSON response first
json_response = JSONResponse({
"message": "Successfully logged out - cookies cleared",
"revokedTokens": revoked
})
# Clear httpOnly cookies on the response we're actually returning
clearAccessTokenCookie(json_response)
clearRefreshTokenCookie(json_response)
return json_response
except Exception as e:
logger.error(f"Error during logout: {str(e)}")
raise HTTPException(

View file

@ -17,6 +17,11 @@ ALGORITHM = APP_CONFIG.get("Auth_ALGORITHM")
ACCESS_TOKEN_EXPIRE_MINUTES = int(APP_CONFIG.get("APP_TOKEN_EXPIRY"))
REFRESH_TOKEN_EXPIRE_DAYS = int(APP_CONFIG.get("APP_REFRESH_TOKEN_EXPIRY", "7"))
# Cookie security settings - use secure cookies based on whether API uses HTTPS
# Cookies must have secure=True on HTTPS sites, secure=False on HTTP sites
APP_API_URL = APP_CONFIG.get("APP_API_URL", "http://localhost:8000")
USE_SECURE_COOKIES = APP_API_URL.startswith("https://") if APP_API_URL else False
def createAccessToken(data: dict, expiresDelta: Optional[timedelta] = None) -> Tuple[str, "datetime"]:
"""Create a JWT access token and return (token, expiresAt)."""
@ -52,8 +57,9 @@ def setAccessTokenCookie(response: Response, token: str, expiresDelta: Optional[
key="auth_token",
value=token,
httponly=True,
secure=True,
secure=USE_SECURE_COOKIES, # Only secure in production (HTTPS)
samesite="strict",
path="/",
max_age=maxAge
)
@ -64,9 +70,46 @@ def setRefreshTokenCookie(response: Response, token: str) -> None:
key="refresh_token",
value=token,
httponly=True,
secure=True,
secure=USE_SECURE_COOKIES, # Only secure in production (HTTPS)
samesite="strict",
path="/",
max_age=REFRESH_TOKEN_EXPIRE_DAYS * 24 * 60 * 60
)
def clearAccessTokenCookie(response: Response) -> None:
    """
    Clear the access token cookie by setting it to expire immediately.

    Uses both raw header manipulation and FastAPI's delete_cookie for maximum
    browser compatibility. The fallback now mirrors the attributes used by
    setAccessTokenCookie (httponly/samesite/secure) so the deletion cookie is
    consistent with how the cookie was originally set.

    Args:
        response: The response object the expired cookie headers are written to.
    """
    # Build secure flag based on environment
    secure_flag = "; Secure" if USE_SECURE_COOKIES else ""
    # Primary method: Raw Set-Cookie header for guaranteed deletion
    response.headers.append(
        "Set-Cookie",
        f"auth_token=deleted; Path=/; Max-Age=0; Expires=Thu, 01 Jan 1970 00:00:00 GMT; HttpOnly{secure_flag}; SameSite=Strict"
    )
    # Fallback: FastAPI's built-in method, with attributes matching the
    # original set_cookie call so the deletion cookie is unambiguous.
    response.delete_cookie(
        key="auth_token",
        path="/",
        httponly=True,
        secure=USE_SECURE_COOKIES,
        samesite="strict",
    )
def clearRefreshTokenCookie(response: Response) -> None:
    """
    Clear the refresh token cookie by setting it to expire immediately.

    Uses both raw header manipulation and FastAPI's delete_cookie for maximum
    browser compatibility. The fallback now mirrors the attributes used by
    setRefreshTokenCookie (httponly/samesite/secure) so the deletion cookie is
    consistent with how the cookie was originally set.

    Args:
        response: The response object the expired cookie headers are written to.
    """
    # Build secure flag based on environment
    secure_flag = "; Secure" if USE_SECURE_COOKIES else ""
    # Primary method: Raw Set-Cookie header for guaranteed deletion
    response.headers.append(
        "Set-Cookie",
        f"refresh_token=deleted; Path=/; Max-Age=0; Expires=Thu, 01 Jan 1970 00:00:00 GMT; HttpOnly{secure_flag}; SameSite=Strict"
    )
    # Fallback: FastAPI's built-in method, with attributes matching the
    # original set_cookie call so the deletion cookie is unambiguous.
    response.delete_cookie(
        key="refresh_token",
        path="/",
        httponly=True,
        secure=USE_SECURE_COOKIES,
        samesite="strict",
    )

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,596 @@
import logging
from typing import Dict, Any, List, Optional, Tuple, Union
from modules.datamodels.datamodelChat import PromptPlaceholder, ChatDocument
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, ModelCapabilities, OperationType, Priority
from modules.interfaces.interfaceAiObjects import AiObjects
logger = logging.getLogger(__name__)
class SubCoreAi:
"""Core AI operations including image analysis, text generation, and planning calls."""
def __init__(self, services, aiObjects):
    """Initialize core AI operations.

    Args:
        services: Service center instance for accessing other services
            (used here for debug logging and config lookups).
        aiObjects: Initialized AiObjects instance that performs the actual
            model calls.
    """
    # Service locator shared across the service layer.
    self.services = services
    # Low-level AI interface; model selection and calls are delegated to it.
    self.aiObjects = aiObjects
# AI Processing Call
async def callAi(
    self,
    prompt: str,
    documents: Optional[List[ChatDocument]] = None,
    placeholders: Optional[List[PromptPlaceholder]] = None,
    options: Optional[AiCallOptions] = None,
    outputFormat: Optional[str] = None,
    title: Optional[str] = None,
    documentProcessor=None,
    documentGenerator=None
) -> Union[str, Dict[str, Any]]:
    """
    Unified AI call interface that automatically routes to appropriate handler.

    Routing order: document generation (when outputFormat and a generator are
    supplied) -> planning (no documents + GENERATE_PLAN operation) -> plain
    text call (optionally via the document processor).

    Args:
        prompt: The main prompt for the AI call
        documents: Optional list of documents to process
        placeholders: Optional list of placeholder replacements for planning calls
        options: AI call configuration options
        outputFormat: Optional output format (html, pdf, docx, txt, md, json, csv, xlsx) for document generation
        title: Optional title for generated documents
        documentProcessor: Document processing service instance
        documentGenerator: Document generation service instance

    Returns:
        AI response as string, or dict with documents if outputFormat is specified

    Raises:
        Exception: If all available models fail
    """
    if options is None:
        options = AiCallOptions()
    # Normalize placeholders from List[PromptPlaceholder]
    placeholders_dict: Dict[str, str] = {}
    placeholders_meta: Dict[str, bool] = {}
    if placeholders:
        placeholders_dict = {p.label: p.content for p in placeholders}
        # summaryAllowed marks placeholders whose content may be shrunk later.
        placeholders_meta = {p.label: bool(getattr(p, 'summaryAllowed', False)) for p in placeholders}
    # Auto-determine call type based on documents and operation type
    call_type = self._determineCallType(documents, options.operationType)
    options.callType = call_type
    try:
        # Build the full prompt that will be sent to AI (debug copy only;
        # each branch below builds its own prompt for the actual call).
        if placeholders:
            full_prompt = prompt
            for p in placeholders:
                placeholder = f"{{{{KEY:{p.label}}}}}"
                full_prompt = full_prompt.replace(placeholder, p.content)
        else:
            full_prompt = prompt
        self._writeAiResponseDebug(
            label='ai_prompt_debug',
            content=full_prompt,
            partIndex=1,
            modelName=None,
            continuation=False
        )
    except Exception:
        # Best-effort debug write; never block the call path.
        pass
    # Handle document generation with specific output format
    if outputFormat and documentGenerator:
        result = await documentGenerator.callAiWithDocumentGeneration(prompt, documents, options, outputFormat, title)
        # Log AI response for debugging
        try:
            if isinstance(result, dict) and 'content' in result:
                self._writeAiResponseDebug(
                    label='ai_document_generation',
                    content=result['content'],
                    partIndex=1,
                    modelName=None,  # Document generation doesn't return model info
                    continuation=False
                )
        except Exception:
            pass
        return result
    if call_type == "planning":
        result = await self._callAiPlanning(prompt, placeholders_dict, placeholders_meta, options)
        # Log AI response for debugging
        try:
            self._writeAiResponseDebug(
                label='ai_planning',
                content=result or "",
                partIndex=1,
                modelName=None,  # Planning doesn't return model info
                continuation=False
            )
        except Exception:
            pass
        return result
    else:
        # Set processDocumentsIndividually from the legacy parameter if not set in options
        if options.processDocumentsIndividually is None and documents:
            options.processDocumentsIndividually = False  # Default to batch processing
        # For text calls, we need to build the full prompt with placeholders here
        # since _callAiText doesn't handle placeholders directly
        if placeholders_dict:
            full_prompt = self._buildPromptWithPlaceholders(prompt, placeholders_dict)
        else:
            full_prompt = prompt
        if documentProcessor and documents:
            result = await documentProcessor.callAiText(full_prompt, documents, options)
        else:
            # Fallback to direct AI call if no document processor available
            request = AiCallRequest(
                prompt=full_prompt,
                context="",
                options=options
            )
            response = await self.aiObjects.call(request)
            result = response.content
        # Log AI response for debugging (additional logging for text calls)
        try:
            self._writeAiResponseDebug(
                label='ai_text_main',
                content=result or "",
                partIndex=1,
                modelName=None,  # Text calls already log internally
                continuation=False
            )
        except Exception:
            pass
        return result
# AI Image Analysis
async def readImage(
    self,
    prompt: str,
    imageData: Union[str, bytes],
    mimeType: str = None,
    options: Optional[AiCallOptions] = None,
) -> str:
    """Call AI for image analysis using interface.callImage().

    Args:
        prompt: Instruction describing what to extract from the image.
        imageData: Image payload as bytes or str (presumably base64 or a
            data URL when str — TODO confirm against AiObjects.callImage).
        mimeType: Optional MIME type hint forwarded to the backend.
        options: Call options; operationType is always forced to IMAGE_ANALYSIS.

    Returns:
        The model's textual answer, or an "Error: ..." string on failure.
        This method never raises; all exceptions become error strings.
    """
    try:
        # Check if imageData is valid
        if not imageData:
            error_msg = "No image data provided"
            self.services.utils.debugLogToFile(f"Error in AI image analysis: {error_msg}", "AI_SERVICE")
            logger.error(f"Error in AI image analysis: {error_msg}")
            return f"Error: {error_msg}"
        self.services.utils.debugLogToFile(f"readImage called with prompt, imageData type: {type(imageData)}, length: {len(imageData) if imageData else 0}, mimeType: {mimeType}", "AI_SERVICE")
        logger.info(f"readImage called with prompt, imageData type: {type(imageData)}, length: {len(imageData) if imageData else 0}, mimeType: {mimeType}")
        # Always use IMAGE_ANALYSIS operation type for image processing
        if options is None:
            options = AiCallOptions(operationType=OperationType.IMAGE_ANALYSIS)
        else:
            # Override the operation type to ensure image analysis
            options.operationType = OperationType.IMAGE_ANALYSIS
        self.services.utils.debugLogToFile(f"Calling aiObjects.callImage with operationType: {options.operationType}", "AI_SERVICE")
        logger.info(f"Calling aiObjects.callImage with operationType: {options.operationType}")
        result = await self.aiObjects.callImage(prompt, imageData, mimeType, options)
        # Debug the result
        self.services.utils.debugLogToFile(f"Raw AI result type: {type(result)}, value: {repr(result)}", "AI_SERVICE")
        # Check if result is valid (empty or whitespace-only counts as failure)
        if not result or (isinstance(result, str) and not result.strip()):
            error_msg = f"No response from AI image analysis (result: {repr(result)})"
            self.services.utils.debugLogToFile(f"Error in AI image analysis: {error_msg}", "AI_SERVICE")
            logger.error(f"Error in AI image analysis: {error_msg}")
            return f"Error: {error_msg}"
        # NOTE(review): the conditional expression is the first argument, so a
        # short result is logged without the "callImage returned:" prefix.
        self.services.utils.debugLogToFile(f"callImage returned: {result[:200]}..." if len(result) > 200 else result, "AI_SERVICE")
        logger.info(f"callImage returned: {result[:200]}..." if len(result) > 200 else result)
        return result
    except Exception as e:
        self.services.utils.debugLogToFile(f"Error in AI image analysis: {str(e)}", "AI_SERVICE")
        logger.error(f"Error in AI image analysis: {str(e)}")
        return f"Error: {str(e)}"
# AI Image Generation
async def generateImage(
    self,
    prompt: str,
    size: str = "1024x1024",
    quality: str = "standard",
    style: str = "vivid",
    options: Optional[AiCallOptions] = None,
) -> Dict[str, Any]:
    """Generate an image with the configured AI backend.

    Delegates directly to ``aiObjects.generateImage``. Any exception is
    converted into a ``{"success": False, "error": ...}`` payload instead of
    propagating to the caller.
    """
    try:
        result = await self.aiObjects.generateImage(
            prompt, size, quality, style, options
        )
    except Exception as exc:
        logger.error(f"Error in AI image generation: {str(exc)}")
        return {"success": False, "error": str(exc)}
    return result
def _determineCallType(self, documents: Optional[List[ChatDocument]], operation_type: str) -> str:
    """
    Decide which pipeline a request belongs to.

    A request is a "planning" call only when it carries no documents AND its
    operation type is GENERATE_PLAN; every other combination is a plain
    "text" call.
    """
    has_documents = bool(documents)
    wants_plan = operation_type == OperationType.GENERATE_PLAN
    return "planning" if (wants_plan and not has_documents) else "text"
async def _callAiPlanning(
    self,
    prompt: str,
    placeholders: Optional[Dict[str, str]],
    placeholdersMeta: Optional[Dict[str, bool]],
    options: AiCallOptions
) -> str:
    """
    Handle planning calls with placeholder system and selective summarization.

    Args:
        prompt: Prompt template containing {{KEY:label}} / {{label}} markers.
        placeholders: label -> content substitutions.
        placeholdersMeta: label -> summaryAllowed flag; only labels flagged
            True may be shrunk when the prompt exceeds the model budget.
        options: Call options; compressPrompt enables the reduction pass.

    Returns:
        The model's response content.
    """
    # Build full prompt with placeholders; if too large, summarize summaryAllowed placeholders proportionally
    effective_placeholders = placeholders or {}
    full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders)
    if options.compressPrompt and placeholdersMeta:
        # Determine model capacity; on failure, fall back to the current size
        # (which disables reduction since current_bytes == max_bytes).
        try:
            caps = self._getModelCapabilitiesForContent(full_prompt, None, options)
            max_bytes = caps.get("maxContextBytes", len(full_prompt.encode("utf-8")))
        except Exception:
            max_bytes = len(full_prompt.encode("utf-8"))
        current_bytes = len(full_prompt.encode("utf-8"))
        if current_bytes > max_bytes:
            # Compute total bytes contributed by allowed placeholders (approximate by content length)
            allowed_labels = [l for l, allow in placeholdersMeta.items() if allow]
            allowed_sizes = {l: len((effective_placeholders.get(l) or "").encode("utf-8")) for l in allowed_labels}
            total_allowed = sum(allowed_sizes.values())
            overage = current_bytes - max_bytes
            if total_allowed > 0 and overage > 0:
                # Target total for allowed after reduction
                target_allowed = max(total_allowed - overage, 0)
                # Global ratio to apply across allowed placeholders
                ratio = target_allowed / total_allowed if total_allowed > 0 else 1.0
                ratio = max(0.0, min(1.0, ratio))
                reduced: Dict[str, str] = {}
                for label, content in effective_placeholders.items():
                    if label in allowed_labels and isinstance(content, str) and len(content) > 0:
                        old_len = len(content)
                        # Reduce by proportional ratio on characters (fallback if empty)
                        reduction_factor = ratio if old_len > 0 else 1.0
                        reduced[label] = self._reduceText(content, reduction_factor)
                    else:
                        # Non-reducible placeholders pass through untouched.
                        reduced[label] = content
                effective_placeholders = reduced
                full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders)
                # If still slightly over, perform a second-pass fine adjustment with updated ratio
                current_bytes = len(full_prompt.encode("utf-8"))
                if current_bytes > max_bytes and total_allowed > 0:
                    overage2 = current_bytes - max_bytes
                    # Recompute allowed sizes after first reduction
                    allowed_sizes2 = {l: len((effective_placeholders.get(l) or "").encode("utf-8")) for l in allowed_labels}
                    total_allowed2 = sum(allowed_sizes2.values())
                    if total_allowed2 > 0 and overage2 > 0:
                        target_allowed2 = max(total_allowed2 - overage2, 0)
                        ratio2 = target_allowed2 / total_allowed2
                        ratio2 = max(0.0, min(1.0, ratio2))
                        reduced2: Dict[str, str] = {}
                        for label, content in effective_placeholders.items():
                            if label in allowed_labels and isinstance(content, str) and len(content) > 0:
                                old_len = len(content)
                                reduction_factor = ratio2 if old_len > 0 else 1.0
                                reduced2[label] = self._reduceText(content, reduction_factor)
                            else:
                                reduced2[label] = content
                        effective_placeholders = reduced2
                        full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders)
    # Make AI call using AiObjects (let it handle model selection)
    request = AiCallRequest(
        prompt=full_prompt,
        context="",  # Context is already included in the prompt
        options=options
    )
    response = await self.aiObjects.call(request)
    try:
        logger.debug(f"AI model selected (planning): {getattr(response, 'modelName', 'unknown')}")
    except Exception:
        pass
    return response.content
async def _callAiDirect(
    self,
    prompt: str,
    documents: Optional[List[ChatDocument]],
    options: AiCallOptions,
    documentProcessor=None
) -> Dict[str, Any]:
    """
    Call AI directly with prompt and documents for JSON output.
    Used for multi-file generation - uses the existing generation pipeline.

    Args:
        prompt: Prompt forwarded to the processor or the direct AI call.
        documents: Documents handed to the processor (may be None).
        options: AI call configuration options.
        documentProcessor: Optional processor; when absent a plain AI call is
            wrapped into a minimal single-section document structure.

    Returns:
        A multi-file dict: {"metadata": ..., "documents": [...]} — single-file
        results (with top-level "sections") are converted on the way out.
    """
    # Use the existing generation pipeline that already works
    # This ensures proper document processing and content extraction
    logger.info(f"Using existing generation pipeline for {len(documents) if documents else 0} documents")
    if documentProcessor:
        # Process documents with JSON merging using the existing pipeline
        result = await documentProcessor.processDocumentsPerChunkJson(documents, prompt, options)
    else:
        # Fallback to simple AI call
        request = AiCallRequest(
            prompt=prompt,
            context="",
            options=options
        )
        response = await self.aiObjects.call(request)
        # Wrap the raw text in the minimal single-file document shape.
        result = {"metadata": {"title": "AI Response"}, "sections": [{"id": "section_1", "content_type": "paragraph", "elements": [{"text": response.content}]}]}
    # Convert single-file result to multi-file format if needed
    if "sections" in result and "documents" not in result:
        logger.info("Converting single-file result to multi-file format")
        # This is a single-file result, convert it to multi-file format
        return {
            "metadata": result.get("metadata", {"title": "Converted Document"}),
            "documents": [{
                "id": "doc_1",
                "title": result.get("metadata", {}).get("title", "Document"),
                "filename": "document.txt",
                "sections": result.get("sections", [])
            }]
        }
    return result
def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List[ChatDocument]], options: AiCallOptions) -> Dict[str, int]:
    """
    Get model capabilities for content processing, including appropriate size limits for chunking.

    Estimates total content size (prompt bytes + declared document sizes),
    then scans the registered aiModels for the best candidate:
      - prefer the largest capable model whose context covers >= 80% of the content,
      - otherwise fall back to the largest capable model overall,
      - otherwise use a GPT-4o-sized default.

    Args:
        prompt: Prompt text; its UTF-8 size counts toward the estimate.
        documents: Optional documents; fileSize values are added to the estimate.
        options: Call options; operationType filters models by capability.

    Returns:
        Dict with byte budgets: maxContextBytes, textChunkSize, imageChunkSize.
    """
    # Estimate total content size
    prompt_size = len(prompt.encode('utf-8'))
    document_size = 0
    if documents:
        # Rough estimate of document content size
        for doc in documents:
            document_size += doc.fileSize or 0
    total_size = prompt_size + document_size
    # Use AiObjects' model registry to pick the best model for this content size.
    from modules.interfaces.interfaceAiObjects import aiModels
    best_model = None            # largest model satisfying the 80% fit rule
    best_context_length = 0
    fallback_model = None        # largest capable model overall
    fallback_context_length = 0
    for model_name, model_info in aiModels.items():
        context_length = model_info.get("contextLength", 0)
        # Skip models with no context length
        if context_length == 0:
            continue
        # Check if model supports the operation type
        capabilities = model_info.get("capabilities", [])
        if options.operationType == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities:
            continue
        elif options.operationType == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities:
            continue
        elif options.operationType == OperationType.WEB_RESEARCH and "web_search" not in capabilities:
            continue
        elif "text_generation" not in capabilities:
            continue
        # Track the largest model that can handle the content without chunking.
        if context_length >= total_size * 0.8 and context_length > best_context_length:
            best_model = model_info
            best_context_length = context_length
        # Independently track the largest capable model as a fallback.
        # (Bug fix: the old elif-chain latched onto the FIRST capable model
        # as the fallback and could then reject a genuinely fitting model
        # whose context was smaller than that fallback's.)
        if context_length > fallback_context_length:
            fallback_model = model_info
            fallback_context_length = context_length
    if best_model is None:
        best_model = fallback_model
    # Fallback to a reasonable default if no model found
    if best_model is None:
        best_model = {
            "contextLength": 128000,  # GPT-4o default
            "llmName": "gpt-4o"
        }
    # Calculate appropriate sizes
    # Convert tokens to bytes (rough estimate: 1 token ≈ 4 characters)
    context_length_bytes = int(best_model["contextLength"] * 4)
    max_context_bytes = int(context_length_bytes * 0.9)  # 90% of context length
    text_chunk_size = int(max_context_bytes * 0.7)       # 70% of max context for text chunks
    image_chunk_size = int(max_context_bytes * 0.8)      # 80% of max context for image chunks
    logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}")
    logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes")
    logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes")
    return {
        "maxContextBytes": max_context_bytes,
        "textChunkSize": text_chunk_size,
        "imageChunkSize": image_chunk_size
    }
def _getModelsForOperation(self, operation_type: str, options: AiCallOptions) -> List[ModelCapabilities]:
    """
    Return candidate models for an operation.

    When an AiObjects instance is attached, model selection is delegated to
    it downstream and an empty list is returned here; otherwise a single
    hardcoded default model is offered as a fallback.
    """
    if getattr(self, 'aiObjects', None):
        # AiObjects performs its own model selection; nothing to advertise.
        return []
    # No AiObjects available: offer one conservative default model.
    fallback_capabilities = ["text", "reasoning"] if operation_type == "planning" else ["text"]
    return [
        ModelCapabilities(
            name="default",
            maxTokens=4000,
            capabilities=fallback_capabilities,
            costPerToken=0.001,
            processingTime=1.0,
            isAvailable=True
        )
    ]
def _buildPromptWithPlaceholders(self, prompt: str, placeholders: Optional[Dict[str, str]]) -> str:
"""
Build full prompt by replacing placeholders with their content.
Uses the new {{KEY:placeholder}} format.
"""
if not placeholders:
return prompt
full_prompt = prompt
for placeholder, content in placeholders.items():
# Replace both old format {{placeholder}} and new format {{KEY:placeholder}}
full_prompt = full_prompt.replace(f"{{{{{placeholder}}}}}", content)
full_prompt = full_prompt.replace(f"{{{{KEY:{placeholder}}}}}", content)
return full_prompt
def _writeAiResponseDebug(self, label: str, content: str, partIndex: int = 1, modelName: str = None, continuation: bool = None) -> None:
    """Persist raw AI response parts for debugging under test-chat/ai - only if debug enabled.

    Args:
        label: Short tag identifying the call site; becomes part of the filename.
        content: Raw text to persist; None is written as an empty file.
        partIndex: 1-based part index for multi-part responses.
        modelName: Optional model name, sanitized and appended to the filename.
        continuation: Optional flag recorded as cont_true/cont_false in the name.

    Best-effort: any failure (config lookup, filesystem) is swallowed so the
    debug write can never break the main call path.
    """
    try:
        # Check if debug logging is enabled
        debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
        if not debug_enabled:
            return
        import os
        from datetime import datetime, UTC
        # Base dir: gateway/test-chat/ai (go up 4 levels from this file)
        # .../gateway/modules/services/serviceAi/subCoreAi.py -> up to gateway root
        gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
        outDir = os.path.join(gatewayDir, 'test-chat', 'ai')
        os.makedirs(outDir, exist_ok=True)
        # Millisecond-precision UTC timestamp keeps filenames unique and sortable.
        ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3]
        suffix = []
        if partIndex is not None:
            suffix.append(f"part{partIndex}")
        if continuation is not None:
            suffix.append(f"cont_{str(continuation).lower()}")
        if modelName:
            # Sanitize the model name so it is filesystem-safe.
            safeModel = ''.join(c if c.isalnum() or c in ('-', '_') else '-' for c in modelName)
            suffix.append(safeModel)
        suffixStr = ('_' + '_'.join(suffix)) if suffix else ''
        fname = f"{ts}_{label}{suffixStr}.txt"
        fpath = os.path.join(outDir, fname)
        with open(fpath, 'w', encoding='utf-8') as f:
            f.write(content or '')
    except Exception:
        # Do not raise; best-effort debug write
        pass
def _exceedsTokenLimit(self, text: str, model: ModelCapabilities, safety_margin: float) -> bool:
    """
    Estimate whether the text would overflow the model's token budget.

    Uses the rough 4-characters-per-token heuristic and shrinks the model
    limit by ``safety_margin`` before comparing.
    """
    budget = int(model.maxTokens * (1 - safety_margin))
    approx_tokens = len(text) // 4
    return approx_tokens > budget
def _reducePlanningPrompt(
    self,
    full_prompt: str,
    placeholders: Optional[Dict[str, str]],
    model: ModelCapabilities,
    options: AiCallOptions
) -> str:
    """
    Reduce planning prompt size by summarizing placeholders while preserving
    the prompt structure.

    Without placeholders the whole prompt is truncated to 70%. With
    placeholders, only long values (> 1000 chars) are reduced and the prompt
    is rebuilt with the trimmed substitutions.
    """
    if not placeholders:
        return self._reduceText(full_prompt, 0.7)
    # Trim only long placeholder values; short ones pass through untouched.
    trimmed: Dict[str, str] = {}
    for label, content in placeholders.items():
        trimmed[label] = (
            self._reduceText(content, 0.7) if len(content) > 1000 else content
        )
    return self._buildPromptWithPlaceholders(full_prompt, trimmed)
def _reduceTextPrompt(
    self,
    prompt: str,
    context: str,
    model: ModelCapabilities,
    options: AiCallOptions
) -> str:
    """
    Reduce text prompt size using typeGroup-aware chunking and merging.

    In compress mode both prompt and context are shrunk proportionally toward
    70% of the model budget; otherwise only the context is reduced so the
    prompt itself stays intact.

    Args:
        prompt: Instruction prompt (kept verbatim unless compressPrompt is set).
        context: Supplementary text appended after the prompt.
        model: Model whose maxTokens bounds the combined size.
        options: Carries safetyMargin and compressPrompt settings.

    Returns:
        The (possibly reduced) prompt, with context appended after a blank
        line when any context remains.
    """
    max_size = int(model.maxTokens * (1 - options.safetyMargin))
    if options.compressPrompt:
        # Reduce both prompt and context proportionally.
        target_size = max_size
        current_size = len(prompt) + len(context)
        # Bug fix: guard against ZeroDivisionError when both are empty.
        if current_size > 0:
            reduction_factor = (target_size * 0.7) / current_size
            if reduction_factor < 1.0:
                prompt = self._reduceText(prompt, reduction_factor)
                context = self._reduceText(context, reduction_factor)
    else:
        # Only reduce context, preserve prompt integrity
        max_context_size = max_size - len(prompt)
        # Bug fix: require non-empty context (avoids division by zero when the
        # prompt alone exceeds the budget) and clamp the budget to >= 0 so an
        # oversized prompt cannot yield a negative factor (which sliced the
        # context from the wrong end).
        if context and len(context) > max_context_size:
            reduction_factor = max(max_context_size, 0) / len(context)
            context = self._reduceText(context, reduction_factor)
    return prompt + "\n\n" + context if context else prompt
def _extractTextFromContentParts(self, extracted_content) -> str:
"""
Extract text content from ExtractionService ContentPart objects.
"""
if not extracted_content or not hasattr(extracted_content, 'parts'):
return ""
text_parts = []
for part in extracted_content.parts:
if hasattr(part, 'typeGroup') and part.typeGroup in ['text', 'table', 'structure']:
if hasattr(part, 'data') and part.data:
text_parts.append(part.data)
return "\n\n".join(text_parts)
def _reduceText(self, text: str, reduction_factor: float) -> str:
"""
Reduce text size by the specified factor.
"""
if reduction_factor >= 1.0:
return text
target_length = int(len(text) * reduction_factor)
return text[:target_length] + "... [reduced]"

View file

@ -0,0 +1,804 @@
import logging
from typing import Dict, Any, List, Optional, Tuple, Union
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
logger = logging.getLogger(__name__)
class SubDocumentGeneration:
"""Document generation operations including single-file and multi-file generation."""
def __init__(self, services, aiObjects, documentProcessor):
    """Initialize document generation service.

    Args:
        services: Service center instance for accessing other services
        aiObjects: Initialized AiObjects instance used for model calls
        documentProcessor: Document processing service instance used for
            per-chunk / JSON document handling
    """
    # Shared service locator.
    self.services = services
    # Low-level AI interface for direct calls.
    self.aiObjects = aiObjects
    # Handles chunked document processing and JSON merging.
    self.documentProcessor = documentProcessor
async def callAiWithDocumentGeneration(
    self,
    prompt: str,
    documents: Optional[List[ChatDocument]],
    options: AiCallOptions,
    outputFormat: str,
    title: Optional[str]
) -> Dict[str, Any]:
    """
    Handle AI calls with document generation in specific output format.

    The prompt is first analyzed for intent; a multi-file intent routes to
    the multi-file pipeline, everything else to single-file generation.

    Args:
        prompt: The main prompt for the AI call
        documents: Optional list of documents to process
        options: AI call configuration options
        outputFormat: Target output format (html, pdf, docx, txt, md, json, csv, xlsx)
        title: Optional title for generated documents

    Returns:
        Dict with generated documents and metadata; on failure, an error
        envelope with ``success=False`` mirroring the success shape.
    """
    try:
        # Use AI to analyze prompt intent
        analysis = await self._analyzePromptIntent(prompt, self)
        logger.info(f"Prompt analysis result: {analysis}")
        if analysis.get("is_multi_file", False):
            return await self._callAiWithMultiFileGeneration(
                prompt, documents, options, outputFormat, title, analysis
            )
        return await self._callAiWithSingleFileGeneration(
            prompt, documents, options, outputFormat, title
        )
    except Exception as e:
        logger.error(f"Error in document generation: {str(e)}")
        # Best-effort error envelope with the same keys as a success result.
        return {
            "success": False,
            "error": str(e),
            "content": "",
            "rendered_content": "",
            "mime_type": "text/plain",
            "filename": f"error_{outputFormat}",
            "format": outputFormat,
            "title": title or "Error",
            "documents": []
        }
    async def _callAiWithSingleFileGeneration(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions,
        outputFormat: str,
        title: Optional[str],
        generationPrompt: Optional[str] = None
    ) -> Dict[str, Any]:
        """Handle single-file document generation (existing functionality).

        Pipeline: format-specific extraction prompt -> structured JSON
        extraction from the documents -> optional AI enhancement of that
        JSON -> rendering to the requested output format.

        Args:
            prompt: User request driving extraction and enhancement.
            documents: Optional source documents to extract content from.
            options: AI call options forwarded to the JSON extraction step.
            outputFormat: Target format/extension (e.g. "docx", "pdf", "html").
            title: Optional document title; defaults to "AI Generated Document"
                and may be overwritten by the AI-provided metadata title below.
            generationPrompt: NOTE(review): unused on entry — it is
                unconditionally reassigned below when ``prompt`` is set;
                confirm whether any caller passes it, otherwise remove.

        Returns:
            Dict with success flag, the extracted JSON ("content"), the
            rendered output ("rendered_content"), mime type, filename,
            format, title and a one-element "documents" list.

        Raises:
            Exception: Re-raised (after logging) on any pipeline failure,
                including an extraction result without a "sections" key.
        """
        try:
            # Get format-specific extraction prompt from generation service
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generation_service = GenerationService(self.services)
            # Use default title if not provided
            if not title:
                title = "AI Generated Document"
            # Get format-specific extraction prompt
            extractionPrompt = await generation_service.getExtractionPrompt(
                outputFormat=outputFormat,
                userPrompt=prompt,
                title=title,
                aiService=self
            )
            # Process documents with format-specific prompt using JSON mode
            # This ensures structured JSON output instead of text
            aiResponseJson = await self._callAiJson(extractionPrompt, documents, options)
            # Validate JSON response
            if not isinstance(aiResponseJson, dict) or "sections" not in aiResponseJson:
                raise Exception("AI response is not valid JSON document structure")
            # Emit raw extracted data as a chat message attachment before rendering
            # (best-effort: failures only log a warning)
            try:
                await self._postRawDataChatMessage(aiResponseJson, label="raw_extraction_single")
            except Exception:
                logger.warning("Failed to emit raw extraction chat message (single-file)")
            # Generate filename from document metadata
            parsedFilename = None
            try:
                if aiResponseJson.get("metadata", {}).get("title"):
                    title = aiResponseJson["metadata"]["title"]
                    # Clean title for filename
                    import re
                    parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", title)
                    parsed = re.sub(r"-+", "-", parsed).strip('-')
                    if parsed:
                        parsedFilename = f"{parsed}.{outputFormat}"
            except Exception:
                parsedFilename = None
            # Use AI generation to enhance the extracted JSON before rendering
            enhancedContent = aiResponseJson  # Default to original
            if prompt:
                try:
                    from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
                    # Get generation prompt (overwrites the unused parameter of the same name)
                    generationPrompt = await generation_service.getGenerationPrompt(
                        outputFormat=outputFormat,
                        userPrompt=prompt,
                        title=title,
                        aiService=self
                    )
                    # Prepare the AI call
                    request_options = AiCallOptions()
                    request_options.operationType = OperationType.GENERAL
                    # Create context with the extracted JSON content
                    import json
                    context = f"Extracted JSON content:\n{json.dumps(aiResponseJson, indent=2)}"
                    request = AiCallRequest(
                        prompt=generationPrompt,
                        context=context,
                        options=request_options
                    )
                    # Call AI to enhance the content
                    response = await self.aiObjects.call(request)
                    if response and response.content:
                        # Parse the AI response as JSON
                        try:
                            import re
                            result = response.content.strip()
                            # Extract JSON from markdown if present
                            # (fenced ```json block first, then bare fences)
                            json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
                            if json_match:
                                result = json_match.group(1).strip()
                            elif result.startswith('```json'):
                                result = re.sub(r'^```json\s*', '', result)
                                result = re.sub(r'\s*```$', '', result)
                            elif result.startswith('```'):
                                result = re.sub(r'^```\s*', '', result)
                                result = re.sub(r'\s*```$', '', result)
                            # Try to parse JSON
                            enhancedContent = json.loads(result)
                            logger.info(f"AI enhanced JSON content successfully")
                        except json.JSONDecodeError as e:
                            logger.warning(f"AI generation returned invalid JSON: {str(e)}, using original content")
                            enhancedContent = aiResponseJson
                    else:
                        logger.warning("AI generation returned empty response, using original content")
                        enhancedContent = aiResponseJson
                except Exception as e:
                    logger.warning(f"AI generation failed: {str(e)}, using original content")
                    enhancedContent = aiResponseJson
            # Render the enhanced JSON content
            renderedContent, mimeType = await generation_service.renderReport(
                extractedContent=enhancedContent,
                outputFormat=outputFormat,
                title=title,
                userPrompt=prompt,
                aiService=self
            )
            # Generate meaningful filename (use AI-provided if valid, else fallback)
            from datetime import datetime, UTC
            timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            if parsedFilename and parsedFilename.lower().endswith(f".{outputFormat.lower()}"):
                filename = parsedFilename
            else:
                safeTitle = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
                filename = f"{safeTitle or 'document'}-{timestamp}.{outputFormat}"
            # Return structured result with document information
            # NOTE(review): "content" carries the pre-enhancement extraction
            # (aiResponseJson) while "rendered_content" is produced from
            # enhancedContent — confirm this asymmetry is intentional.
            return {
                "success": True,
                "content": aiResponseJson,  # Structured JSON document
                "rendered_content": renderedContent,  # Formatted content
                "mime_type": mimeType,
                "filename": filename,
                "format": outputFormat,
                "title": title,
                "documents": [{
                    "documentName": filename,
                    "documentData": renderedContent,
                    "mimeType": mimeType
                }],
                "is_multi_file": False
            }
        except Exception as e:
            logger.error(f"Error in single-file document generation: {str(e)}")
            raise
    async def _callAiWithMultiFileGeneration(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions,
        outputFormat: str,
        title: Optional[str],
        prompt_analysis: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Handle multi-file document generation using AI analysis.

        Pipeline: adaptive extraction prompt (built from ``prompt_analysis``)
        -> per-chunk JSON extraction -> structure validation (falls back to
        single-file generation on mismatch or on any exception) -> for each
        extracted document: section transformation, optional AI enhancement
        with two-stage JSON repair, rendering -> optional debug dump.

        Args:
            prompt: User request driving extraction and enhancement.
            documents: Optional source documents to extract content from.
            options: AI call options forwarded to the extraction pipeline.
            outputFormat: Target format/extension (e.g. "docx", "pdf", "html").
            title: Optional overall title; defaults to "AI Generated Documents".
            prompt_analysis: Result of _analyzePromptIntent (is_multi_file,
                strategy, criteria, ...).

        Returns:
            Dict with success flag, the raw extraction ("content"), one
            rendered entry per file in "documents", is_multi_file=True and
            the split strategy; or the single-file result when falling back.
        """
        try:
            # Get multi-file extraction prompt based on AI analysis
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generation_service = GenerationService(self.services)
            # Use default title if not provided
            if not title:
                title = "AI Generated Documents"
            # Get adaptive extraction prompt
            extraction_prompt = await generation_service.getAdaptiveExtractionPrompt(
                outputFormat=outputFormat,
                userPrompt=prompt,
                title=title,
                promptAnalysis=prompt_analysis,
                aiService=self
            )
            logger.info(f"Adaptive extraction prompt length: {len(extraction_prompt)} characters")
            logger.debug(f"Adaptive extraction prompt preview: {extraction_prompt[:500]}...")
            # Process with adaptive JSON schema - use the existing pipeline but with adaptive prompt
            logger.info(f"Using adaptive prompt with existing pipeline: {len(extraction_prompt)} chars")
            logger.debug(f"Processing documents: {len(documents) if documents else 0} documents")
            # Use the existing pipeline but replace the prompt with our adaptive one
            # This ensures proper document processing while using the multi-file prompt
            ai_response = await self.documentProcessor.processDocumentsPerChunkJsonWithPrompt(documents, extraction_prompt, options)
            logger.info(f"AI response type: {type(ai_response)}")
            logger.info(f"AI response keys: {list(ai_response.keys()) if isinstance(ai_response, dict) else 'Not a dict'}")
            logger.debug(f"AI response preview: {str(ai_response)[:500]}...")
            # Validate response structure
            if not self._validateResponseStructure(ai_response, prompt_analysis):
                # Fallback to single-file if multi-file fails
                logger.warning(f"Multi-file processing failed - Invalid response structure. Expected multi-file but got: {list(ai_response.keys()) if isinstance(ai_response, dict) else type(ai_response)}")
                logger.warning(f"Prompt analysis: {prompt_analysis}")
                logger.warning("Falling back to single-file generation")
                return await self._callAiWithSingleFileGeneration(
                    prompt, documents, options, outputFormat, title
                )
            # Emit raw extracted data as a chat message attachment before transformation/rendering
            try:
                await self._postRawDataChatMessage(ai_response, label="raw_extraction_multi")
            except Exception:
                logger.warning("Failed to emit raw extraction chat message (multi-file)")
            # Process multiple documents
            generated_documents = []
            for i, doc_data in enumerate(ai_response.get("documents", [])):
                # Transform AI-generated sections to renderer-compatible format
                transformed_sections = []
                for section in doc_data.get("sections", []):
                    # Convert AI format to renderer format
                    transformed_section = {
                        "id": section.get("id", f"section_{len(transformed_sections) + 1}"),
                        "content_type": section.get("content_type", "paragraph"),
                        "elements": section.get("elements", []),
                        "order": section.get("order", len(transformed_sections) + 1)
                    }
                    # Extract text from elements for simple text-based sections
                    if section.get("content_type") in ["paragraph", "heading"]:
                        text_parts = []
                        for element in section.get("elements", []):
                            if "text" in element:
                                text_parts.append(element["text"])
                        # Add text to the first element or create a new one
                        if transformed_section["elements"]:
                            transformed_section["elements"][0]["text"] = "\n".join(text_parts)
                        else:
                            transformed_section["elements"] = [{"text": "\n".join(text_parts)}]
                    transformed_sections.append(transformed_section)
                # Create complete document structure for rendering
                # NOTE(review): doc_data["title"] is accessed without .get();
                # an extracted document missing "title" raises KeyError and
                # triggers the single-file fallback — confirm that is intended.
                complete_document = {
                    "metadata": {
                        "title": doc_data["title"],
                        "source_document": "multi_file_generation",
                        "document_id": doc_data.get("id", f"doc_{i+1}"),
                        "filename": doc_data.get("filename", f"document_{i+1}"),
                        "split_strategy": prompt_analysis.get("strategy", "custom")
                    },
                    "sections": transformed_sections,
                    "summary": f"Generated document: {doc_data['title']}",
                    "tags": ["multi_file", "ai_generated"]
                }
                # Use AI generation to enhance the extracted JSON before rendering
                enhancedContent = complete_document  # Default to original
                if prompt:
                    try:
                        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
                        # Get generation prompt
                        generationPrompt = await generation_service.getGenerationPrompt(
                            outputFormat=outputFormat,
                            userPrompt=prompt,
                            title=doc_data["title"],
                            aiService=self
                        )
                        # Prepare the AI call
                        request_options = AiCallOptions()
                        request_options.operationType = OperationType.GENERAL
                        # Create context with the extracted JSON content
                        import json
                        context = f"Extracted JSON content:\n{json.dumps(complete_document, indent=2)}"
                        request = AiCallRequest(
                            prompt=generationPrompt,
                            context=context,
                            options=request_options
                        )
                        # Call AI to enhance the content
                        response = await self.aiObjects.call(request)
                        if response and response.content:
                            # Parse the AI response as JSON
                            try:
                                import re
                                result = response.content.strip()
                                # Extract JSON from markdown if present
                                json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
                                if json_match:
                                    result = json_match.group(1).strip()
                                elif result.startswith('```json'):
                                    result = re.sub(r'^```json\s*', '', result)
                                    result = re.sub(r'\s*```$', '', result)
                                elif result.startswith('```'):
                                    result = re.sub(r'^```\s*', '', result)
                                    result = re.sub(r'\s*```$', '', result)
                                # Try to parse JSON
                                enhancedContent = json.loads(result)
                                logger.info(f"AI enhanced JSON content successfully")
                            except json.JSONDecodeError as e:
                                logger.warning(f"AI generation returned invalid JSON: {str(e)}, attempting to repair...")
                                # Try to repair common JSON issues (regex-based first)
                                try:
                                    repaired_result = self._repairJson(result)
                                    enhancedContent = json.loads(repaired_result)
                                    logger.info(f"Successfully repaired JSON content")
                                except (json.JSONDecodeError, Exception) as repair_error:
                                    logger.warning(f"JSON repair failed: {str(repair_error)}, trying AI repair...")
                                    # Try AI-powered JSON repair as last resort
                                    try:
                                        ai_repaired = await self._repairJsonWithAI(result)
                                        enhancedContent = json.loads(ai_repaired)
                                        logger.info(f"AI successfully repaired JSON content")
                                    except Exception as ai_repair_error:
                                        logger.warning(f"AI JSON repair also failed: {str(ai_repair_error)}, using original content")
                                        enhancedContent = complete_document
                        else:
                            logger.warning("AI generation returned empty response, using original content")
                            enhancedContent = complete_document
                    except Exception as e:
                        logger.warning(f"AI generation failed: {str(e)}, using original content")
                        enhancedContent = complete_document
                # Render the enhanced JSON content
                rendered_content, mime_type = await generation_service.renderReport(
                    extractedContent=enhancedContent,
                    outputFormat=outputFormat,
                    title=doc_data["title"],
                    userPrompt=prompt,
                    aiService=self
                )
                # Generate proper filename with correct extension
                base_filename = doc_data.get("filename", f"document_{i+1}")
                # Remove any existing extension and add the correct one
                if '.' in base_filename:
                    base_filename = base_filename.rsplit('.', 1)[0]
                # Add proper extension based on output format
                if outputFormat.lower() == "docx":
                    filename = f"{base_filename}.docx"
                elif outputFormat.lower() == "pdf":
                    filename = f"{base_filename}.pdf"
                elif outputFormat.lower() == "html":
                    filename = f"{base_filename}.html"
                else:
                    filename = f"{base_filename}.{outputFormat}"
                generated_documents.append({
                    "documentName": filename,
                    "documentData": rendered_content,
                    "mimeType": mime_type
                })
            # Save debug files for multi-file generation - only if debug enabled
            debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
            if debug_enabled:
                try:
                    import os
                    from datetime import datetime, UTC
                    ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
                    debug_root = "./test-chat/ai"
                    debug_dir = os.path.join(debug_root, f"multifile_output_{ts}")
                    os.makedirs(debug_dir, exist_ok=True)
                    # Save metadata
                    with open(os.path.join(debug_dir, "metadata.txt"), "w", encoding="utf-8") as f:
                        f.write(f"title: {title}\n")
                        f.write(f"format: {outputFormat}\n")
                        f.write(f"documents_count: {len(generated_documents)}\n")
                        f.write(f"split_strategy: {prompt_analysis.get('strategy', 'custom')}\n")
                        f.write(f"prompt_analysis: {prompt_analysis}\n")
                    # Save each generated document
                    # (reuses loop variable name `i`; safe here because the
                    # generation loop above has already completed)
                    for i, doc in enumerate(generated_documents):
                        doc_filename = doc["documentName"]
                        doc_data = doc["documentData"]
                        doc_mime = doc["mimeType"]
                        # Determine file extension
                        if outputFormat.lower() == "docx":
                            file_ext = ".docx"
                        elif outputFormat.lower() == "pdf":
                            file_ext = ".pdf"
                        elif outputFormat.lower() == "html":
                            file_ext = ".html"
                        else:
                            file_ext = f".{outputFormat}"
                        # Save the rendered document
                        output_path = os.path.join(debug_dir, f"document_{i+1}_{doc_filename}")
                        if file_ext in ['.md', '.txt', '.html', '.json', '.csv']:
                            # Text-based formats
                            with open(output_path, 'w', encoding='utf-8') as f:
                                f.write(doc_data)
                        else:
                            # Binary formats - decode from base64 if needed
                            try:
                                import base64
                                doc_bytes = base64.b64decode(doc_data)
                                with open(output_path, 'wb') as f:
                                    f.write(doc_bytes)
                            except Exception:
                                # If not base64, save as text
                                with open(output_path, 'w', encoding='utf-8') as f:
                                    f.write(doc_data)
                        logger.info(f"💾 Debug: Saved multi-file document {i+1}: {output_path}")
                    logger.info(f"💾 Debug: Multi-file output saved to: {debug_dir}")
                except Exception as e:
                    logger.warning(f"Failed to save multi-file debug output: {e}")
            return {
                "success": True,
                "content": ai_response,
                "rendered_content": None,  # Not applicable for multi-file
                "mime_type": None,  # Not applicable for multi-file
                "filename": None,  # Not applicable for multi-file
                "format": outputFormat,
                "title": title,
                "documents": generated_documents,
                "is_multi_file": True,
                "split_strategy": prompt_analysis.get("strategy", "custom")
            }
        except Exception as e:
            logger.error(f"Error in multi-file document generation: {str(e)}")
            # Fallback to single-file
            return await self._callAiWithSingleFileGeneration(
                prompt, documents, options, outputFormat, title
            )
async def _callAiJson(
self,
prompt: str,
documents: Optional[List[ChatDocument]],
options: AiCallOptions
) -> Dict[str, Any]:
"""
Handle AI calls with document processing for JSON output.
Returns structured JSON document instead of text.
"""
# Process documents with JSON merging
return await self.documentProcessor.processDocumentsPerChunkJson(documents, prompt, options)
async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]:
"""Use AI to analyze user prompt and determine processing requirements."""
if not ai_service:
return {"is_multi_file": False, "strategy": "single", "criteria": None}
try:
analysis_prompt = f"""
Analyze this user request and determine if it requires multiple file output or single file output.
User request: "{prompt}"
Respond with JSON only in this exact format:
{{
"is_multi_file": true/false,
"strategy": "single|per_entity|by_section|by_criteria|custom",
"criteria": "description of how to split content",
"file_naming_pattern": "suggested pattern for filenames",
"reasoning": "brief explanation of the analysis"
}}
Consider:
- Does the user want separate files for different entities (customers, products, etc.)?
- Does the user want to split content into multiple documents?
- What would be the most logical way to organize the content?
- What language is the request in? (analyze in the original language)
Return only the JSON response.
"""
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
request_options = AiCallOptions()
request_options.operationType = OperationType.GENERAL
request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
response = await ai_service.aiObjects.call(request)
if response and response.content:
import json
import re
# Extract JSON from response
result = response.content.strip()
json_match = re.search(r'\{.*\}', result, re.DOTALL)
if json_match:
result = json_match.group(0)
analysis = json.loads(result)
return analysis
else:
return {"is_multi_file": False, "strategy": "single", "criteria": None}
except Exception as e:
logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file")
return {"is_multi_file": False, "strategy": "single", "criteria": None}
def _validateResponseStructure(self, response: Dict[str, Any], prompt_analysis: Dict[str, Any]) -> bool:
"""Validate that AI response matches the expected structure."""
try:
if not isinstance(response, dict):
logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}")
return False
# Check for multi-file structure
if prompt_analysis.get("is_multi_file", False):
has_documents = "documents" in response
is_documents_list = isinstance(response.get("documents"), list)
logger.info(f"Multi-file validation: has_documents={has_documents}, is_documents_list={is_documents_list}")
if has_documents and is_documents_list:
logger.info(f"Multi-file validation passed: {len(response['documents'])} documents found")
else:
logger.warning(f"Multi-file validation failed: documents key present={has_documents}, documents is list={is_documents_list}")
logger.warning(f"Available keys: {list(response.keys())}")
return has_documents and is_documents_list
else:
has_sections = "sections" in response
is_sections_list = isinstance(response.get("sections"), list)
logger.info(f"Single-file validation: has_sections={has_sections}, is_sections_list={is_sections_list}")
return has_sections and is_sections_list
except Exception as e:
logger.warning(f"Response validation failed with exception: {str(e)}")
return False
    async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None:
        """
        Create a ChatMessage with the extracted raw JSON attached as a file so the user
        has access to the data even if downstream processing fails.

        Args:
            payload: JSON-serializable extraction result to attach.
            label: Prefix for the attachment filename and the message's
                documentsLabel field.

        Best-effort: every failure (serialization, storage, message or
        document creation) is swallowed so the generation pipeline is
        never interrupted by this side channel.
        """
        try:
            services = self.services
            workflow = services.currentWorkflow
            # Serialize payload
            import json as _json
            from datetime import datetime, UTC
            ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            content_text = _json.dumps(payload, ensure_ascii=False, indent=2)
            content_bytes = content_text.encode('utf-8')
            # Store as file via component storage
            file_name = f"{label}_{ts}.json"
            file_item = services.interfaceDbComponent.createFile(
                name=file_name,
                mimeType="application/json",
                content=content_bytes
            )
            services.interfaceDbComponent.createFileData(file_item.id, content_bytes)
            # Lookup file info for ChatDocument (fall back to local values
            # if the lookup returns nothing)
            file_info = services.workflow.getFileInfo(file_item.id)
            doc = ChatDocument(
                messageId="",  # set after message creation
                fileId=file_item.id,
                fileName=file_info.get("fileName", file_name) if file_info else file_name,
                fileSize=file_info.get("size", len(content_bytes)) if file_info else len(content_bytes),
                mimeType=file_info.get("mimeType", "application/json") if file_info else "application/json"
            )
            # Create message referencing the file
            messageData = {
                "workflowId": workflow.id,
                "role": "assistant",
                "message": "Raw extraction data saved",
                "status": "data",
                "sequenceNr": len(getattr(workflow, 'messages', []) or []) + 1,
                "publishedAt": services.utils.getUtcTimestamp(),
                "documentsLabel": label,
                "documents": []
            }
            message = services.workflow.createMessage(messageData)
            if not message:
                return
            # Persist ChatDocument with messageId
            doc.messageId = message.id
            services.interfaceDbChat.createDocument(doc.to_dict())
            # Update message to include document
            try:
                if not message.documents:
                    message.documents = []
                message.documents.append(doc)
                services.workflow.updateMessage(message.id, {"documents": [d.to_dict() for d in message.documents]})
            except Exception:
                pass
        except Exception:
            # Non-fatal; ignore if storage or chat creation fails
            return
def _repairJson(self, json_string: str) -> str:
"""Repair common JSON syntax errors efficiently for large JSON."""
try:
import re
import json
# Remove any leading/trailing whitespace
json_string = json_string.strip()
# For large JSON, skip substring extraction and go straight to targeted repairs
logger.info(f"Attempting JSON repair for {len(json_string)} characters...")
# Try to parse first to see what specific error we get
try:
json.loads(json_string)
return json_string # Already valid
except json.JSONDecodeError as e:
error_msg = str(e)
logger.info(f"JSON error: {error_msg}")
# Apply targeted fixes based on the specific error
if "Expecting ',' delimiter" in error_msg:
# Fix missing commas between array elements
json_string = re.sub(r'\]\s*\[', '], [', json_string)
json_string = re.sub(r'\}\s*\{', '}, {', json_string)
# Fix missing commas between object properties
json_string = re.sub(r'("\s*:\s*[^,}]+)\s*(")', r'\1, \2', json_string)
if "Expecting value" in error_msg:
# Fix missing values (replace empty with null)
json_string = re.sub(r':\s*,', ': null,', json_string)
json_string = re.sub(r':\s*}', ': null}', json_string)
if "Expecting property name" in error_msg:
# Fix unquoted property names
json_string = re.sub(r'(\w+):', r'"\1":', json_string)
# Fix trailing commas before closing brackets/braces
json_string = re.sub(r',(\s*[}\]])', r'\1', json_string)
# Fix missing closing brackets/braces (only if reasonable)
open_braces = json_string.count('{')
close_braces = json_string.count('}')
open_brackets = json_string.count('[')
close_brackets = json_string.count(']')
# Only add missing brackets if the difference is small (avoid runaway)
if 0 < (open_braces - close_braces) <= 5:
missing_braces = open_braces - close_braces
json_string += '}' * missing_braces
if 0 < (open_brackets - close_brackets) <= 5:
missing_brackets = open_brackets - close_brackets
json_string += ']' * missing_brackets
# Try to parse again
try:
json.loads(json_string)
logger.info("JSON repair successful")
return json_string
except json.JSONDecodeError:
logger.warning("JSON repair failed - will try AI repair")
return json_string
except Exception as e:
logger.warning(f"JSON repair failed: {str(e)}")
return json_string
async def _repairJsonWithAI(self, malformed_json: str) -> str:
"""Use AI to repair malformed JSON efficiently for large files."""
try:
# Limit JSON size for AI processing (max 50KB to avoid token limits)
max_json_size = 50000
json_to_repair = malformed_json
if len(malformed_json) > max_json_size:
logger.warning(f"JSON too large ({len(malformed_json)} chars), truncating to {max_json_size} chars for AI repair")
# Try to find a good truncation point (end of a complete object/array)
truncate_at = max_json_size
for i in range(max_json_size, max(0, max_json_size - 1000), -1):
if malformed_json[i] in ['}', ']']:
truncate_at = i + 1
break
json_to_repair = malformed_json[:truncate_at] + "..."
repair_prompt = f"""
You are a JSON repair expert. Fix the following malformed JSON and return ONLY the corrected JSON, no explanations.
Malformed JSON:
{json_to_repair}
Return only the valid JSON:
"""
# Use AI to repair the JSON
repaired_json = await self.services.ai.callAi(
prompt=repair_prompt,
documents=None,
options={
"process_type": "text",
"operation_type": "generate_content",
"priority": "speed",
"max_cost": 0.01
}
)
# Clean up the response (remove any markdown formatting)
repaired_json = repaired_json.strip()
if repaired_json.startswith('```json'):
repaired_json = repaired_json[7:]
if repaired_json.endswith('```'):
repaired_json = repaired_json[:-3]
repaired_json = repaired_json.strip()
# Validate the repaired JSON
import json
json.loads(repaired_json)
logger.info("AI JSON repair successful")
return repaired_json
except Exception as e:
logger.warning(f"AI JSON repair failed: {str(e)}")
return malformed_json

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,316 @@
import logging
from typing import Dict, Any, List, Optional, Tuple, Union
from modules.datamodels.datamodelAi import ModelCapabilities, AiCallOptions
logger = logging.getLogger(__name__)
class SubUtilities:
    """Utility functions for text processing, debugging, and helper operations.

    Holds only a reference to the service center; all methods are helpers
    used by the AI service for tracing, size estimation, prompt reduction
    and model-capability lookups.
    """

    def __init__(self, services):
        """Initialize utilities service.

        Args:
            services: Service center instance for accessing other services
        """
        self.services = services

    def _writeTraceLog(self, contextText: str, data: Any) -> None:
        """Write raw data to the central trace log file without truncation.

        No-op unless the module logger is at DEBUG level or lower. All
        failures are swallowed so tracing can never break the caller.

        Args:
            contextText: Short label written before the payload.
            data: Text, dict or list payload; dict/list is pretty-printed.
        """
        try:
            import os
            import json
            from datetime import datetime, UTC
            # Only write if logger is in debug mode
            if logger.level > logging.DEBUG:
                return
            # Get log directory from configuration via service center if possible
            logDir = None
            try:
                logDir = self.services.utils.configGet("APP_LOGGING_LOG_DIR", "./")
            except Exception:
                pass
            if not logDir:
                logDir = "./"
            if not os.path.isabs(logDir):
                # Make a relative path resolve against the gateway directory
                # (four levels up from this file)
                gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
                logDir = os.path.join(gatewayDir, logDir)
            os.makedirs(logDir, exist_ok=True)
            traceFile = os.path.join(logDir, "log_trace.log")
            timestamp = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
            traceEntry = f"[{timestamp}] {contextText}\n" + ("=" * 80) + "\n"
            if data is None:
                traceEntry += "No data provided\n"
            else:
                # Prefer exact text; if dict/list, pretty print JSON
                try:
                    if isinstance(data, (dict, list)):
                        traceEntry += f"JSON Data:\n{json.dumps(data, indent=2, ensure_ascii=False)}\n"
                    else:
                        text = str(data)
                        traceEntry += f"Text Data:\n{text}\n"
                except Exception:
                    traceEntry += f"Data (fallback): {str(data)}\n"
            traceEntry += ("=" * 80) + "\n\n"
            with open(traceFile, "a", encoding="utf-8") as f:
                f.write(traceEntry)
        except Exception:
            # Swallow to avoid recursive logging issues
            pass

    def _writeAiResponseDebug(self, label: str, content: str, partIndex: int = 1, modelName: str = None, continuation: bool = None) -> None:
        """Persist raw AI response parts for debugging under test-chat/ai - only if debug enabled.

        Args:
            label: Logical name of the response (used in the filename).
            content: Raw response text; None is written as an empty file.
            partIndex: 1-based part number for multi-part responses.
            modelName: Optional model identifier, sanitized for filenames.
            continuation: Optional flag marking continuation responses.
        """
        try:
            # Check if debug logging is enabled
            debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
            if not debug_enabled:
                return
            import os
            from datetime import datetime, UTC
            # Base dir: gateway/test-chat/ai (go up 4 levels from this file)
            # .../gateway/modules/services/serviceAi/subUtilities.py -> up to gateway root
            gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
            outDir = os.path.join(gatewayDir, 'test-chat', 'ai')
            os.makedirs(outDir, exist_ok=True)
            ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3]
            suffix = []
            if partIndex is not None:
                suffix.append(f"part{partIndex}")
            if continuation is not None:
                suffix.append(f"cont_{str(continuation).lower()}")
            if modelName:
                safeModel = ''.join(c if c.isalnum() or c in ('-', '_') else '-' for c in modelName)
                suffix.append(safeModel)
            suffixStr = ('_' + '_'.join(suffix)) if suffix else ''
            fname = f"{ts}_{label}{suffixStr}.txt"
            fpath = os.path.join(outDir, fname)
            with open(fpath, 'w', encoding='utf-8') as f:
                f.write(content or '')
        except Exception:
            # Do not raise; best-effort debug write
            pass

    def _exceedsTokenLimit(self, text: str, model: ModelCapabilities, safety_margin: float) -> bool:
        """Check if text exceeds the model token limit with a safety margin.

        Uses a rough 4-characters-per-token estimate; no tokenizer is run.
        """
        # Simple character-based estimation (4 chars per token)
        estimated_tokens = len(text) // 4
        max_tokens = int(model.maxTokens * (1 - safety_margin))
        return estimated_tokens > max_tokens

    def _reduceText(self, text: str, reduction_factor: float) -> str:
        """Reduce text size by the specified factor.

        A factor >= 1.0 returns the text unchanged; otherwise the text is
        truncated to ``len(text) * reduction_factor`` characters and marked
        with a "... [reduced]" suffix.
        """
        if reduction_factor >= 1.0:
            return text
        target_length = int(len(text) * reduction_factor)
        return text[:target_length] + "... [reduced]"

    def _extractTextFromContentParts(self, extracted_content) -> str:
        """Extract text content from ExtractionService ContentPart objects.

        Only parts whose typeGroup is text/table/structure contribute; the
        parts' data fields are joined with blank lines. Returns "" for
        missing input or input without a ``parts`` attribute.
        """
        if not extracted_content or not hasattr(extracted_content, 'parts'):
            return ""
        text_parts = []
        for part in extracted_content.parts:
            if hasattr(part, 'typeGroup') and part.typeGroup in ['text', 'table', 'structure']:
                if hasattr(part, 'data') and part.data:
                    text_parts.append(part.data)
        return "\n\n".join(text_parts)

    def _buildPromptWithPlaceholders(self, prompt: str, placeholders: Optional[Dict[str, str]]) -> str:
        """Build full prompt by replacing placeholders with their content.

        Supports both the legacy {{placeholder}} form and the newer
        {{KEY:placeholder}} form; unknown placeholders are left untouched.
        """
        if not placeholders:
            return prompt
        full_prompt = prompt
        for placeholder, content in placeholders.items():
            # Replace both old format {{placeholder}} and new format {{KEY:placeholder}}
            full_prompt = full_prompt.replace(f"{{{{{placeholder}}}}}", content)
            full_prompt = full_prompt.replace(f"{{{{KEY:{placeholder}}}}}", content)
        return full_prompt

    def _reducePlanningPrompt(
        self,
        full_prompt: str,
        placeholders: Optional[Dict[str, str]],
        model: ModelCapabilities,
        options: AiCallOptions
    ) -> str:
        """Reduce planning prompt size by shrinking placeholder content while preserving the prompt template.

        NOTE(review): ``model`` and ``options`` are currently unused here;
        they are kept for interface parity with the other reduction helpers.
        """
        if not placeholders:
            return self._reduceText(full_prompt, 0.7)
        # Reduce placeholders while preserving prompt
        reduced_placeholders = {}
        for placeholder, content in placeholders.items():
            if len(content) > 1000:  # Only reduce long content
                reduced_placeholders[placeholder] = self._reduceText(content, 0.7)
            else:
                reduced_placeholders[placeholder] = content
        return self._buildPromptWithPlaceholders(full_prompt, reduced_placeholders)

    def _reduceTextPrompt(
        self,
        prompt: str,
        context: str,
        model: ModelCapabilities,
        options: AiCallOptions
    ) -> str:
        """Reduce a prompt+context pair to fit the model's budget.

        With options.compressPrompt both prompt and context shrink toward
        70% of the budget; otherwise only the context is reduced so the
        prompt text stays intact. NOTE(review): model.maxTokens is compared
        against character counts here — confirm the intended token/char
        conversion is consistent with _exceedsTokenLimit.
        """
        max_size = int(model.maxTokens * (1 - options.safetyMargin))
        if options.compressPrompt:
            # Reduce both prompt and context
            target_size = max_size
            current_size = len(prompt) + len(context)
            # Guard against division by zero when both inputs are empty
            if current_size > 0:
                reduction_factor = (target_size * 0.7) / current_size
                if reduction_factor < 1.0:
                    prompt = self._reduceText(prompt, reduction_factor)
                    context = self._reduceText(context, reduction_factor)
        else:
            # Only reduce context, preserve prompt integrity
            max_context_size = max_size - len(prompt)
            # `context` truthiness guard also prevents division by zero when
            # the prompt alone already exceeds the budget (negative max size)
            if context and len(context) > max_context_size:
                reduction_factor = max_context_size / len(context)
                context = self._reduceText(context, reduction_factor)
        return (prompt + "\n\n" + context) if context else prompt

    async def _compressContent(self, content: str, targetSize: int, contentType: str) -> str:
        """Compress content to at most ``targetSize`` bytes (UTF-8).

        Services must not call AI connectors directly, so this performs a
        plain byte-safe truncation. The previously built (and never used)
        AI compression prompt was removed as dead code; ``contentType`` is
        kept for interface compatibility.
        """
        if len(content.encode("utf-8")) <= targetSize:
            return content
        try:
            # Byte-safe truncation: cut the encoded bytes, then decode while
            # ignoring any partial multi-byte sequence at the cut point.
            data = content.encode("utf-8")
            return data[:targetSize].decode("utf-8", errors="ignore") + "... [truncated]"
        except Exception as e:
            logger.warning(f"AI compression failed, using truncation: {str(e)}")
            return content[:targetSize] + "... [truncated]"

    def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List], options: AiCallOptions) -> Dict[str, int]:
        """
        Get model capabilities for content processing, including appropriate size limits for chunking.

        Returns:
            Dict with maxContextBytes, textChunkSize and imageChunkSize
            (all in bytes, derived from the selected model's context length
            via the rough 1-token-≈-4-characters estimate).
        """
        # Estimate total content size
        prompt_size = len(prompt.encode('utf-8'))
        document_size = 0
        if documents:
            # Rough estimate of document content size
            for doc in documents:
                document_size += getattr(doc, 'fileSize', 0) or 0
        total_size = prompt_size + document_size
        # Use AiObjects to select the best model for this content size
        # We'll simulate the model selection by checking available models
        from modules.interfaces.interfaceAiObjects import aiModels
        # Find the best model for this content size and operation
        best_model = None
        best_context_length = 0
        for model_name, model_info in aiModels.items():
            context_length = model_info.get("contextLength", 0)
            # Skip models with no context length or too small for content
            if context_length == 0:
                continue
            # Check if model supports the operation type
            capabilities = model_info.get("capabilities", [])
            from modules.datamodels.datamodelAi import OperationType
            if options.operationType == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities:
                continue
            elif options.operationType == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities:
                continue
            elif options.operationType == OperationType.WEB_RESEARCH and "web_search" not in capabilities:
                continue
            elif "text_generation" not in capabilities:
                continue
            # Prefer models that can handle the content without chunking, but allow chunking if needed
            if context_length >= total_size * 0.8:  # 80% of content size
                if context_length > best_context_length:
                    best_model = model_info
                    best_context_length = context_length
            elif best_model is None:  # Fallback to largest available model
                if context_length > best_context_length:
                    best_model = model_info
                    best_context_length = context_length
        # Fallback to a reasonable default if no model found
        if best_model is None:
            best_model = {
                "contextLength": 128000,  # GPT-4o default
                "llmName": "gpt-4o"
            }
        # Calculate appropriate sizes
        # Convert tokens to bytes (rough estimate: 1 token ≈ 4 characters)
        context_length_bytes = int(best_model["contextLength"] * 4)
        max_context_bytes = int(context_length_bytes * 0.9)  # 90% of context length
        text_chunk_size = int(max_context_bytes * 0.7)  # 70% of max context for text chunks
        image_chunk_size = int(max_context_bytes * 0.8)  # 80% of max context for image chunks
        logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}")
        logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes")
        logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes")
        return {
            "maxContextBytes": max_context_bytes,
            "textChunkSize": text_chunk_size,
            "imageChunkSize": image_chunk_size
        }

    def _getModelsForOperation(self, operation_type: str, options: AiCallOptions) -> List[ModelCapabilities]:
        """
        Get models capable of handling the specific operation with capability filtering.

        Returns an empty list when AiObjects is available (model selection is
        delegated to it), otherwise a single hard-coded default model.
        """
        # Use the actual AI objects model selection instead of hardcoded default
        if hasattr(self, 'aiObjects') and self.aiObjects:
            # Let AiObjects handle the model selection
            return []
        else:
            # Fallback to default model if AiObjects not available
            default_model = ModelCapabilities(
                name="default",
                maxTokens=4000,
                capabilities=["text", "reasoning"] if operation_type == "planning" else ["text"],
                costPerToken=0.001,
                processingTime=1.0,
                isAvailable=True
            )
            return [default_model]

View file

@ -0,0 +1,384 @@
import logging
from typing import Dict, Any, List, Optional, Tuple, Union
from modules.datamodels.datamodelWeb import (
WebResearchRequest,
WebResearchActionResult,
WebResearchDocumentData,
WebResearchActionDocument,
WebSearchResultItem,
)
from modules.interfaces.interfaceAiObjects import AiObjects
from modules.shared.configuration import APP_CONFIG
logger = logging.getLogger(__name__)
class SubWebResearch:
    """Web research operations including search, crawling, and analysis."""
    def __init__(self, services, aiObjects):
        """Initialize web research service.
        Args:
            services: Service center instance for accessing other services
            aiObjects: Initialized AiObjects instance
        """
        self.services = services
        self.aiObjects = aiObjects
    async def webResearch(self, request: WebResearchRequest) -> WebResearchActionResult:
        """Perform web research using interface functions.

        Pipeline:
            1. Determine main URLs: caller-provided, or an AI-optimized web search.
            2. AI-filter the URL list for relevance to the user's prompt.
            3. Recursively crawl the selected sites with configurable depth,
               per-domain link limit and an overall timeout.

        Args:
            request: WebResearchRequest with the user prompt, optional seed URLs
                and search/crawl options.
        Returns:
            WebResearchActionResult with one JSON document holding the raw
            crawled content, or success=False with an error message.
        """
        try:
            logger.info("WEB RESEARCH STARTED")
            logger.info(f"User Query: {request.user_prompt}")
            logger.info(f"Max Results: {request.max_results}, Max Pages: {request.options.max_pages}")
            # Global URL index to track all processed URLs across the entire research session
            global_processed_urls = set()
            # Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
            logger.info("=== STEP 1: INITIAL MAIN URLS LIST ===")
            if request.urls:
                # Use provided URLs as initial main URLs
                websites = request.urls
                logger.info(f"Using provided URLs ({len(websites)}):")
                for i, url in enumerate(websites, 1):
                    logger.info(f" {i}. {url}")
            else:
                # Use AI to determine main URLs based on user's intention
                logger.info(f"AI analyzing user intent: '{request.user_prompt}'")
                # Use AI to generate optimized Tavily search query and search parameters
                query_optimizer_prompt = f"""You are a search query optimizer.
USER QUERY: {request.user_prompt}
Your task: Create a search query and parameters for the USER QUERY given.
RULES:
1. The search query MUST be related to the user query above
2. Extract key terms from the user query
3. Determine appropriate country/language based on the query context
4. Keep search query short (2-6 words)
Return ONLY this JSON format:
{{
"user_prompt": "search query based on user query above",
"country": "Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries)",
"language": "language_code_or_null",
"topic": "general|news|academic_or_null",
"time_range": "d|w|m|y_or_null",
"selection_strategy": "single|multiple|specific_page",
"selection_criteria": "what URLs to prioritize",
"expected_url_patterns": ["pattern1", "pattern2"],
"estimated_result_count": number
}}"""
                # Get AI response for query optimization
                from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions
                ai_request = AiCallRequest(
                    prompt=query_optimizer_prompt,
                    options=AiCallOptions()
                )
                ai_response_obj = await self.aiObjects.call(ai_request)
                ai_response = ai_response_obj.content
                logger.debug(f"AI query optimizer response: {ai_response}")
                # Parse AI response to extract search query
                import json
                try:
                    # Clean the response by removing markdown code blocks
                    cleaned_response = ai_response.strip()
                    if cleaned_response.startswith('```json'):
                        cleaned_response = cleaned_response[7:]  # Remove ```json
                    if cleaned_response.endswith('```'):
                        cleaned_response = cleaned_response[:-3]  # Remove ```
                    cleaned_response = cleaned_response.strip()
                    query_data = json.loads(cleaned_response)
                    search_query = query_data.get("user_prompt", request.user_prompt)
                    ai_country = query_data.get("country")
                    ai_language = query_data.get("language")
                    ai_topic = query_data.get("topic")
                    ai_time_range = query_data.get("time_range")
                    selection_strategy = query_data.get("selection_strategy", "multiple")
                    selection_criteria = query_data.get("selection_criteria", "relevant URLs")
                    expected_patterns = query_data.get("expected_url_patterns", [])
                    estimated_count = query_data.get("estimated_result_count", request.max_results)
                    logger.info(f"AI optimized search query: '{search_query}'")
                    logger.info(f"Selection strategy: {selection_strategy}")
                    logger.info(f"Selection criteria: {selection_criteria}")
                    logger.info(f"Expected URL patterns: {expected_patterns}")
                    logger.info(f"Estimated result count: {estimated_count}")
                except json.JSONDecodeError:
                    logger.warning("Failed to parse AI response as JSON, using original query")
                    search_query = request.user_prompt
                    ai_country = None
                    ai_language = None
                    ai_topic = None
                    ai_time_range = None
                    selection_strategy = "multiple"
                    # BUGFIX: these were previously left undefined on this path, making
                    # the include_domains block below fail with a NameError that was
                    # silently swallowed by its try/except.
                    selection_criteria = "relevant URLs"
                    expected_patterns = []
                    estimated_count = request.max_results
                # Perform the web search with AI-determined parameters
                search_kwargs = {
                    "query": search_query,
                    "max_results": request.max_results,
                    "search_depth": request.options.search_depth,
                    "auto_parameters": False  # Use explicit parameters
                }
                # Add parameters only if they have valid values
                def _normalizeCountry(c: Optional[str]) -> Optional[str]:
                    """Map country codes/aliases to the full English name Tavily expects."""
                    if not c:
                        return None
                    s = str(c).strip()
                    if not s or s.lower() in ['null', 'none', 'undefined']:
                        return None
                    # Map common codes to full English names when easy to do without extra deps
                    mapping = {
                        'ch': 'Switzerland', 'che': 'Switzerland',
                        'de': 'Germany', 'ger': 'Germany', 'deu': 'Germany',
                        'at': 'Austria', 'aut': 'Austria',
                        # BUGFIX: key was misspelled 'uni ted states' and could never match
                        'us': 'United States', 'usa': 'United States', 'united states': 'United States',
                        'uk': 'United Kingdom', 'gb': 'United Kingdom', 'gbr': 'United Kingdom'
                    }
                    key = s.lower()
                    if key in mapping:
                        return mapping[key]
                    # If looks like full name, capitalize first letter only (Tavily accepts English names)
                    return s
                norm_ai_country = _normalizeCountry(ai_country)
                norm_req_country = _normalizeCountry(request.options.country)
                if norm_ai_country:
                    search_kwargs["country"] = norm_ai_country
                elif norm_req_country:
                    search_kwargs["country"] = norm_req_country
                if ai_language and ai_language not in ['null', '', 'none', 'undefined']:
                    search_kwargs["language"] = ai_language
                elif request.options.language and request.options.language not in ['null', '', 'none', 'undefined']:
                    search_kwargs["language"] = request.options.language
                if ai_topic and ai_topic in ['general', 'news', 'academic']:
                    search_kwargs["topic"] = ai_topic
                elif request.options.topic and request.options.topic in ['general', 'news', 'academic']:
                    search_kwargs["topic"] = request.options.topic
                if ai_time_range and ai_time_range in ['d', 'w', 'm', 'y']:
                    search_kwargs["time_range"] = ai_time_range
                elif request.options.time_range and request.options.time_range in ['d', 'w', 'm', 'y']:
                    search_kwargs["time_range"] = request.options.time_range
                # Constrain by expected domains if provided by AI
                try:
                    include_domains = []
                    for p in expected_patterns or []:
                        if not isinstance(p, str):
                            continue
                        # Extract bare domain from pattern or URL
                        import re
                        m = re.search(r"(?:https?://)?([^/\s]+)", p.strip())
                        if m:
                            domain = m.group(1).lower()
                            # strip leading www.
                            if domain.startswith('www.'):
                                domain = domain[4:]
                            include_domains.append(domain)
                    # Deduplicate
                    if include_domains:
                        seen = set()
                        uniq = []
                        for d in include_domains:
                            if d not in seen:
                                seen.add(d)
                                uniq.append(d)
                        search_kwargs["include_domains"] = uniq
                except Exception:
                    # Best-effort only: a malformed pattern must not abort the search.
                    pass
                # Log the parameters being used
                logger.info(f"Search parameters: country={search_kwargs.get('country', 'not_set')}, language={search_kwargs.get('language', 'not_set')}, topic={search_kwargs.get('topic', 'not_set')}, time_range={search_kwargs.get('time_range', 'not_set')}, include_domains={search_kwargs.get('include_domains', [])}")
                search_results = await self.aiObjects.search_websites(**search_kwargs)
                logger.debug(f"Web search returned {len(search_results)} results:")
                for i, result in enumerate(search_results, 1):
                    logger.debug(f" {i}. {result.url} - {result.title}")
                # Deduplicate while preserving order
                seen = set()
                search_urls = []
                for r in search_results:
                    u = str(r.url)
                    if u not in seen:
                        seen.add(u)
                        search_urls.append(u)
                logger.info(f"After initial deduplication: {len(search_urls)} unique URLs from {len(search_results)} search results")
                if not search_urls:
                    logger.error("No relevant websites found")
                    return WebResearchActionResult(success=False, error="No relevant websites found")
                # Now use AI to determine the main URLs based on user's intention
                logger.info(f"AI selecting main URLs from {len(search_urls)} search results based on user intent")
                # Create a prompt for AI to identify main URLs based on user's intention
                ai_prompt = f"""
Select the most relevant URLs from these search results:
{chr(10).join([f"{i+1}. {url}" for i, url in enumerate(search_urls)])}
Return only the URLs that are most relevant for the user's query.
One URL per line.
"""
                # Create AI call request
                ai_request = AiCallRequest(
                    prompt=ai_prompt,
                    options=AiCallOptions()
                )
                ai_response_obj = await self.aiObjects.call(ai_request)
                ai_response = ai_response_obj.content
                logger.debug(f"AI response for main URL selection: {ai_response}")
                # Parse AI response to extract URLs
                websites = []
                for line in ai_response.strip().split('\n'):
                    line = line.strip()
                    if line and ('http://' in line or 'https://' in line):
                        # Extract URL from the line
                        for word in line.split():
                            if word.startswith('http://') or word.startswith('https://'):
                                websites.append(word.rstrip('.,;'))
                                break
                if not websites:
                    logger.warning("AI did not identify any main URLs, using first few search results")
                    websites = search_urls[:3]  # Fallback to first 3 search results
                # Deduplicate while preserving order
                pre_dedup_count = len(websites)  # BUGFIX: remember count before dedup for accurate logging
                seen = set()
                unique_websites = []
                for url in websites:
                    if url not in seen:
                        seen.add(url)
                        unique_websites.append(url)
                websites = unique_websites
                logger.info(f"After AI selection deduplication: {len(websites)} unique URLs from {pre_dedup_count} AI-selected URLs")
                logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
                for i, url in enumerate(websites, 1):
                    logger.info(f" {i}. {url}")
            # Step 2: Smart website selection using AI interface
            logger.info("=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===")
            logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'")
            selectedWebsites, aiResponse = await self.aiObjects.selectRelevantWebsites(websites, request.user_prompt)
            logger.debug(f"AI Response: {aiResponse}")
            logger.debug(f"AI selected {len(selectedWebsites)} most relevant URLs:")
            for i, url in enumerate(selectedWebsites, 1):
                logger.debug(f" {i}. {url}")
            # Show which were filtered out
            filtered_out = [url for url in websites if url not in selectedWebsites]
            if filtered_out:
                logger.debug(f"Filtered out {len(filtered_out)} less relevant URLs:")
                for i, url in enumerate(filtered_out, 1):
                    logger.debug(f" {i}. {url}")
            # Step 3+4+5: Recursive crawling with configurable depth
            # Get configuration parameters
            max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2"))
            max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4"))
            crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10"))
            crawl_timeout_seconds = crawl_timeout_minutes * 60
            # Use the configured max_depth or the request's pages_search_depth, whichever is smaller
            effective_depth = min(max_depth, request.options.pages_search_depth)
            logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {effective_depth}) ===")
            logger.info(f"Starting recursive crawl of {len(selectedWebsites)} main websites...")
            logger.info(f"Search depth: {effective_depth} levels (max configured: {max_depth})")
            logger.info(f"Max links per domain: {max_links_per_domain}")
            logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes")
            # Use recursive crawling with URL index to avoid duplicates
            import asyncio
            try:
                allContent = await asyncio.wait_for(
                    self.aiObjects.crawlRecursively(
                        urls=selectedWebsites,
                        max_depth=effective_depth,
                        extract_depth=request.options.extract_depth,
                        max_per_domain=max_links_per_domain,
                        global_processed_urls=global_processed_urls
                    ),
                    timeout=crawl_timeout_seconds
                )
                logger.info(f"Crawling completed within timeout: {len(allContent)} pages crawled")
            except asyncio.TimeoutError:
                logger.warning(f"Crawling timed out after {crawl_timeout_minutes} minutes, using partial results")
                # NOTE(review): asyncio.wait_for cancels the inner task on timeout,
                # so partial results collected inside crawlRecursively are NOT
                # reachable here; the empty dict below makes the run fail with a
                # clear error instead of pretending partial data exists.
                allContent = {}
            if not allContent:
                logger.error("Could not extract content from any websites")
                return WebResearchActionResult(success=False, error="Could not extract content from any websites")
            logger.info("=== WEB RESEARCH COMPLETED ===")
            logger.info(f"Successfully crawled {len(allContent)} URLs total")
            logger.info(f"Crawl depth: {effective_depth} levels")
            # Create simple result with raw content
            sources = [WebSearchResultItem(title=url, url=url) for url in selectedWebsites]
            # Get all additional links (all URLs except main ones)
            additional_links = [url for url in allContent.keys() if url not in selectedWebsites]
            # Combine all content into a single result
            combinedContent = ""
            for url, content in allContent.items():
                combinedContent += f"\n\n=== {url} ===\n{content}\n"
            documentData = WebResearchDocumentData(
                user_prompt=request.user_prompt,
                websites_analyzed=len(allContent),
                additional_links_found=len(additional_links),
                analysis_result=combinedContent,  # Raw content, no analysis
                sources=sources,
                additional_links=additional_links,
                individual_content=allContent,  # Individual URL -> content mapping
                debug_info={
                    "crawl_depth": effective_depth,
                    "max_configured_depth": max_depth,
                    "max_links_per_domain": max_links_per_domain,
                    "crawl_timeout_minutes": crawl_timeout_minutes,
                    "total_urls_crawled": len(allContent),
                    "main_urls": len(selectedWebsites),
                    "additional_urls": len(additional_links)
                }
            )
            document = WebResearchActionDocument(
                documentName=f"web_research_{request.user_prompt[:50]}.json",
                documentData=documentData,
                mimeType="application/json"
            )
            return WebResearchActionResult(
                success=True,
                documents=[document],
                resultLabel="web_research_results"
            )
        except Exception as e:
            logger.error(f"Error in web research: {str(e)}")
            return WebResearchActionResult(success=False, error=str(e))

View file

@ -7,9 +7,29 @@ from ..subRegistry import Extractor
class BinaryExtractor(Extractor):
"""
Fallback extractor for unsupported file types.
This extractor handles any file type that doesn't match other extractors.
It encodes the file as base64 and marks it as binary data.
Supported formats:
- All file types (fallback)
- MIME types: application/octet-stream (default)
- File extensions: All (fallback)
"""
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Always claim the file: this extractor is the catch-all fallback."""
        return True
    def getSupportedExtensions(self) -> list[str]:
        """Return supported extensions; empty list means any (fallback)."""
        return []  # Accepts all extensions as fallback
    def getSupportedMimeTypes(self) -> list[str]:
        """Return supported MIME types; empty list means any (fallback)."""
        return []  # Accepts all MIME types as fallback
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/octet-stream"
return [ContentPart(

View file

@ -6,9 +6,26 @@ from ..subRegistry import Extractor
class CsvExtractor(Extractor):
"""
Extractor for CSV files.
Supported formats:
- MIME types: text/csv
- File extensions: .csv
- Special handling: Treats as table data
"""
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for a text/csv MIME type or a .csv file extension."""
        return mimeType == "text/csv" or (fileName or "").lower().endswith(".csv")
    def getSupportedExtensions(self) -> list[str]:
        """Return the file extensions handled by this extractor."""
        return [".csv"]
    def getSupportedMimeTypes(self) -> list[str]:
        """Return the MIME types handled by this extractor."""
        return ["text/csv"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
fileName = context.get("fileName")
mimeType = context.get("mimeType") or "text/csv"

View file

@ -7,6 +7,16 @@ from ..subRegistry import Extractor
class DocxExtractor(Extractor):
"""
Extractor for Microsoft Word documents.
Supported formats:
- MIME types: application/vnd.openxmlformats-officedocument.wordprocessingml.document
- File extensions: .docx
- Special handling: Extracts paragraphs and tables (converts tables to CSV)
- Dependencies: python-docx
"""
    def __init__(self):
        # Deferred-import flags: _load() imports python-docx on first use and
        # records whether the dependency is available.
        self._loaded = False
        self._haveLibs = False
@ -25,6 +35,14 @@ class DocxExtractor(Extractor):
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for the .docx MIME type or a .docx file extension."""
        return mimeType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or (fileName or "").lower().endswith(".docx")
    def getSupportedExtensions(self) -> list[str]:
        """Return the file extensions handled by this extractor."""
        return [".docx"]
    def getSupportedMimeTypes(self) -> list[str]:
        """Return the MIME types handled by this extractor."""
        return ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load()
parts: List[ContentPart] = []

View file

@ -7,9 +7,27 @@ from ..subRegistry import Extractor
class HtmlExtractor(Extractor):
"""
Extractor for HTML files.
Supported formats:
- MIME types: text/html
- File extensions: .html, .htm
- Special handling: Uses BeautifulSoup for parsing
- Dependencies: beautifulsoup4
"""
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for a text/html MIME type or an .html/.htm extension."""
        return mimeType == "text/html" or (fileName or "").lower().endswith((".html", ".htm"))
    def getSupportedExtensions(self) -> list[str]:
        """Return the file extensions handled by this extractor."""
        return [".html", ".htm"]
    def getSupportedMimeTypes(self) -> list[str]:
        """Return the MIME types handled by this extractor."""
        return ["text/html"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "text/html"
text = fileBytes.decode("utf-8", errors="replace")

View file

@ -0,0 +1,75 @@
from typing import Any, Dict, List
import base64
import logging
from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor
logger = logging.getLogger(__name__)
class ImageExtractor(Extractor):
    """
    Extractor for image files.
    Supported formats:
    - MIME types: image/jpeg, image/png, image/gif, image/webp, image/bmp, image/tiff
    - File extensions: .jpg, .jpeg, .png, .gif, .webp, .bmp, .tiff
    - Special handling: GIF files are converted to PNG during extraction
    """
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True when the MIME type or file extension marks an image."""
        return ((mimeType or "").startswith("image/") or
                (fileName or "").lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff")))
    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"]
    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["image/jpeg", "image/png", "image/gif", "image/webp", "image/bmp", "image/tiff"]
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Wrap the image in a single base64-encoded ContentPart.

        GIF input is converted to a static RGB PNG (animation dropped); any
        other format is passed through unchanged. On conversion failure the
        original GIF bytes are kept.
        """
        mimeType = context.get("mimeType") or "image/unknown"
        fileName = context.get("fileName", "")
        # Convert GIF to PNG during extraction
        if mimeType.lower() == "image/gif":
            # BUGFIX: capture the original size before fileBytes is replaced,
            # otherwise the log below reports the converted size twice.
            original_size = len(fileBytes)
            try:
                from PIL import Image
                import io
                # Open GIF and convert to PNG
                with Image.open(io.BytesIO(fileBytes)) as img:
                    # Convert to RGB (removes animation)
                    if img.mode in ('RGBA', 'LA', 'P'):
                        img = img.convert('RGB')
                    # Save as PNG in memory
                    png_buffer = io.BytesIO()
                    img.save(png_buffer, format='PNG')
                    png_data = png_buffer.getvalue()
                # Update mimeType and fileBytes
                mimeType = "image/png"
                fileBytes = png_data
                logger.info(f"GIF converted to PNG during extraction: {fileName}, original={original_size} bytes, converted={len(png_data)} bytes")
            except Exception as e:
                logger.warning(f"GIF conversion failed during extraction for {fileName}: {str(e)}, using original")
                # Keep original GIF data if conversion fails
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="image",
            typeGroup="image",
            mimeType=mimeType,
            data=base64.b64encode(fileBytes).decode("utf-8"),
            metadata={"size": len(fileBytes)}
        )]

View file

@ -7,9 +7,26 @@ from ..subRegistry import Extractor
class JsonExtractor(Extractor):
"""
Extractor for JSON files.
Supported formats:
- MIME types: application/json
- File extensions: .json
- Special handling: Validates JSON format, falls back to text if invalid
"""
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for an application/json MIME type or a .json extension."""
        return mimeType == "application/json" or (fileName or "").lower().endswith(".json")
    def getSupportedExtensions(self) -> list[str]:
        """Return the file extensions handled by this extractor."""
        return [".json"]
    def getSupportedMimeTypes(self) -> list[str]:
        """Return the MIME types handled by this extractor."""
        return ["application/json"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/json"
text = fileBytes.decode("utf-8", errors="replace")

View file

@ -8,6 +8,16 @@ from ..subRegistry import Extractor
class PdfExtractor(Extractor):
"""
Extractor for PDF files.
Supported formats:
- MIME types: application/pdf
- File extensions: .pdf
- Special handling: Extracts text per page and embedded images
- Dependencies: PyPDF2, PyMuPDF (fitz)
"""
    def __init__(self):
        # Deferred-import flags: _load() imports the PDF libraries on first use
        # and records whether the dependencies are available.
        self._loaded = False
        self._haveLibs = False
@ -27,6 +37,14 @@ class PdfExtractor(Extractor):
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for an application/pdf MIME type or a .pdf extension."""
        return mimeType == "application/pdf" or (fileName or "").lower().endswith(".pdf")
    def getSupportedExtensions(self) -> list[str]:
        """Return the file extensions handled by this extractor."""
        return [".pdf"]
    def getSupportedMimeTypes(self) -> list[str]:
        """Return the MIME types handled by this extractor."""
        return ["application/pdf"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load()
parts: List[ContentPart] = []

View file

@ -0,0 +1,225 @@
import logging
import base64
from typing import List, Dict, Any, Optional
from modules.datamodels.datamodelExtraction import ContentPart, ContentExtracted
from ..subRegistry import Extractor
logger = logging.getLogger(__name__)
class PptxExtractor(Extractor):
    """
    Extractor for PowerPoint files.
    Supported formats:
    - MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint
    - File extensions: .pptx, .ppt
    - Special handling: Extracts slide content, tables, and images
    - Dependencies: python-pptx
    """
    def __init__(self):
        # Deferred-import flags: _load() imports python-pptx on first use and
        # records whether the dependency is available.
        self._loaded = False
        self._haveLibs = False
    def _load(self):
        """Import python-pptx on first call; record availability in _haveLibs."""
        if self._loaded:
            return
        self._loaded = True
        try:
            global Presentation
            from pptx import Presentation
            self._haveLibs = True
        except Exception:
            self._haveLibs = False
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True when the MIME type or extension marks a PowerPoint file."""
        return (mimeType in [
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.ms-powerpoint"
        ]) or (fileName or "").lower().endswith((".pptx", ".ppt"))
    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".pptx", ".ppt"]
    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return [
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.ms-powerpoint"
        ]
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """
        Extract content from PowerPoint files.
        Args:
            fileBytes: Raw file data as bytes
            context: Context dictionary with file information
        Returns:
            List of ContentPart objects with extracted content: an overview
            part first, then per-slide image and text parts; a single error
            part when python-pptx is missing or parsing fails.
        """
        self._load()
        if not self._haveLibs:
            logger.error("python-pptx library not installed. Install with: pip install python-pptx")
            return [ContentPart(
                id="error",
                label="PowerPoint Extraction Error",
                typeGroup="text",
                mimeType="text/plain",
                data="Error: python-pptx library not installed",
                metadata={"error": True, "error_message": "python-pptx library not installed"}
            )]
        try:
            import io
            # Load presentation from bytes
            presentation = Presentation(io.BytesIO(fileBytes))
            parts = []
            slide_index = 0
            # Extract content from each slide
            for slide in presentation.slides:
                slide_index += 1
                slide_content = []
                # Extract text from slide
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        slide_content.append(shape.text.strip())
                # Extract table data
                for shape in slide.shapes:
                    if shape.has_table:
                        table = shape.table
                        table_data = []
                        for row in table.rows:
                            row_data = []
                            for cell in row.cells:
                                row_data.append(cell.text.strip())
                            table_data.append(row_data)
                        if table_data:
                            # Convert table to markdown format
                            table_md = self._table_to_markdown(table_data)
                            slide_content.append(table_md)
                # Extract images
                for shape in slide.shapes:
                    if shape.shape_type == 13:  # MSO_SHAPE_TYPE.PICTURE
                        try:
                            image = shape.image
                            image_bytes = image.blob
                            image_b64 = base64.b64encode(image_bytes).decode('utf-8')
                            # Create image part
                            # NOTE(review): mimeType is hard-coded to PNG, but the
                            # blob may be JPEG or another format — confirm whether
                            # the real content type should be propagated here.
                            image_part = ContentPart(
                                id=f"slide_{slide_index}_image_{len(parts)}",
                                label=f"Slide {slide_index} Image",
                                typeGroup="image",
                                mimeType="image/png",  # Default to PNG
                                data=image_b64,
                                metadata={
                                    "slide_number": slide_index,
                                    "shape_type": "image",
                                    "extracted_from": "powerpoint"
                                }
                            )
                            parts.append(image_part)
                        except Exception as e:
                            logger.warning(f"Failed to extract image from slide {slide_index}: {str(e)}")
                # Create slide content part
                if slide_content:
                    slide_text = f"# Slide {slide_index}\n\n" + "\n\n".join(slide_content)
                    slide_part = ContentPart(
                        id=f"slide_{slide_index}",
                        label=f"Slide {slide_index} Content",
                        typeGroup="structure",
                        mimeType="text/plain",
                        data=slide_text,
                        metadata={
                            "slide_number": slide_index,
                            "content_type": "slide",
                            "extracted_from": "powerpoint",
                            "text_length": len(slide_text)
                        }
                    )
                    parts.append(slide_part)
            # Create presentation overview
            file_name = context.get("fileName", "presentation.pptx")
            overview_text = f"# PowerPoint Presentation: {file_name}\n\n"
            overview_text += f"**Total Slides:** {len(presentation.slides)}\n\n"
            overview_text += f"**Content Parts:** {len(parts)}\n\n"
            # Add slide summaries
            for i, slide in enumerate(presentation.slides, 1):
                slide_text_parts = []
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        slide_text_parts.append(shape.text.strip())
                if slide_text_parts:
                    overview_text += f"## Slide {i}\n"
                    overview_text += "\n".join(slide_text_parts[:3])  # First 3 text elements
                    overview_text += "\n\n"
            # Create overview part
            overview_part = ContentPart(
                id="presentation_overview",
                label="Presentation Overview",
                typeGroup="text",
                mimeType="text/plain",
                data=overview_text,
                metadata={
                    "content_type": "overview",
                    "extracted_from": "powerpoint",
                    "total_slides": len(presentation.slides),
                    "text_length": len(overview_text)
                }
            )
            parts.insert(0, overview_part)  # Insert at beginning
            return parts
        except Exception as e:
            logger.error(f"Error extracting PowerPoint content: {str(e)}")
            return [ContentPart(
                id="error",
                label="PowerPoint Extraction Error",
                typeGroup="text",
                mimeType="text/plain",
                data=f"Error extracting PowerPoint content: {str(e)}",
                metadata={"error": True, "error_message": str(e)}
            )]
    def _table_to_markdown(self, table_data: List[List[str]]) -> str:
        """Convert table data to markdown format (first row becomes the header)."""
        if not table_data:
            return ""
        markdown_lines = []
        # Header row
        if table_data:
            header = "| " + " | ".join(table_data[0]) + " |"
            markdown_lines.append(header)
            # Separator row
            separator = "| " + " | ".join(["---"] * len(table_data[0])) + " |"
            markdown_lines.append(separator)
        # Data rows
        for row in table_data[1:]:
            data_row = "| " + " | ".join(row) + " |"
            markdown_lines.append(data_row)
        return "\n".join(markdown_lines)

View file

@ -0,0 +1,56 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
class SqlExtractor(Extractor):
    """
    Extractor for SQL files.
    Supported formats:
    - MIME types: text/x-sql, application/sql
    - File extensions: .sql, .ddl, .dml, .dcl, .tcl
    - Special handling: Treats as structured text with SQL syntax
    """
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for SQL MIME types or SQL-family file extensions."""
        lowered_name = (fileName or "").lower()
        return (mimeType in ("text/x-sql", "application/sql") or
                lowered_name.endswith((".sql", ".ddl", ".dml", ".dcl", ".tcl")))
    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".sql", ".ddl", ".dml", ".dcl", ".tcl"]
    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["text/x-sql", "application/sql"]
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Decode the SQL script and wrap it in a single 'structure' ContentPart.

        Metadata records size, line count and which common SQL statement
        keywords appear anywhere in the (upper-cased) script text.
        """
        fileName = context.get("fileName")
        mimeType = context.get("mimeType") or "text/x-sql"
        script = fileBytes.decode("utf-8", errors="replace")
        upper_script = script.upper()  # scan once instead of per keyword
        metadata = {
            "size": len(fileBytes),
            "file_type": "sql",
            "line_count": len(script.splitlines()),
        }
        for keyword in ("select", "insert", "update", "delete", "create", "drop"):
            metadata[f"has_{keyword}"] = keyword.upper() in upper_script
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="structure",
            mimeType=mimeType,
            data=script,
            metadata=metadata
        )]

View file

@ -0,0 +1,103 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
class TextExtractor(Extractor):
    """
    Extractor for plain text, source-code, configuration, and data files.

    Detection matches any ``text/*`` MIME type, or one of the file
    extensions in ``_EXTENSIONS``. Extraction decodes the bytes as UTF-8
    (replacing undecodable sequences) and emits a single text ContentPart.
    """

    # Single source of truth for extension-based detection and for
    # getSupportedExtensions(); previously the same list was duplicated in
    # detect() and getSupportedExtensions(), which risked drift.
    _EXTENSIONS = (
        # Basic text files
        ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
        # Programming languages
        ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
        ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
        # Web technologies
        ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
        # Configuration files
        ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
        # Scripts and automation
        # NOTE(review): ".com" usually denotes a binary DOS executable, not text — confirm it belongs here.
        ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
        # Data files
        ".csv", ".tsv", ".tab", ".dat", ".data",
        # Documentation (man-page section suffixes and friends)
        ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
        # Other text formats
        ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
        ".env", ".env.local", ".env.development", ".env.production", ".env.test",
        ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock",
    )

    # Declared MIME types, used by getSupportedMimeTypes() for registry
    # auto-registration; detect() additionally accepts any "text/*" type.
    _MIME_TYPES = (
        "text/plain", "text/markdown", "text/x-python", "text/x-java-source",
        "text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript",
        "text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby",
        "text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin",
        "text/x-sql", "text/x-sh", "text/x-shellscript", "text/x-yaml", "text/x-toml",
        "text/x-ini", "text/x-config", "text/x-properties", "text/x-log",
        "text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less",
        "text/xml", "text/csv", "text/tab-separated-values", "text/rtf",
        "text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org",
        "application/x-yaml", "application/x-toml", "application/x-ini",
        "application/x-config", "application/x-properties", "application/x-log",
    )

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for any text/* MIME type or a known text file extension."""
        if mimeType and mimeType.startswith("text/"):
            return True
        if fileName:
            # str.endswith accepts a tuple of suffixes, so one call covers all.
            return fileName.lower().endswith(self._EXTENSIONS)
        return False

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions (with leading dot)."""
        return list(self._EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return list(self._MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """
        Decode the file as UTF-8 (undecodable bytes replaced) into one text part.

        Args:
            fileBytes: Raw file content.
            context: Extraction context; only "mimeType" is read here.

        Returns:
            A single-element list with one text ContentPart; metadata records
            the original byte size.
        """
        mimeType = context.get("mimeType") or "text/plain"
        data = fileBytes.decode("utf-8", errors="replace")
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="text",
            mimeType=mimeType,
            data=data,
            metadata={"size": len(fileBytes)},
        )]

View file

@ -8,6 +8,16 @@ from ..subRegistry import Extractor
class XlsxExtractor(Extractor):
"""
Extractor for Microsoft Excel spreadsheets.
Supported formats:
- MIME types: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
- File extensions: .xlsx, .xlsm
- Special handling: Extracts all sheets as CSV data
- Dependencies: openpyxl
"""
def __init__(self):
self._loaded = False
self._haveLibs = False
@ -27,6 +37,14 @@ class XlsxExtractor(Extractor):
mt = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
return mimeType == mt or (fileName or "").lower().endswith((".xlsx", ".xlsm"))
def getSupportedExtensions(self) -> list[str]:
    """File extensions (with leading dot) accepted by this extractor: Excel workbooks."""
    extensions = [".xlsx", ".xlsm"]
    return extensions
def getSupportedMimeTypes(self) -> list[str]:
    """MIME types accepted by this extractor (OOXML spreadsheet)."""
    mimeTypes = ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
    return mimeTypes
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load()
parts: List[ContentPart] = []

View file

@ -7,9 +7,26 @@ from ..subRegistry import Extractor
class XmlExtractor(Extractor):
"""
Extractor for XML files.
Supported formats:
- MIME types: application/xml
- File extensions: .xml, .rss, .atom
- Special handling: Uses ElementTree for parsing
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Match XML content either by exact MIME type or by an XML-family extension."""
    if mimeType == "application/xml":
        return True
    lowered = (fileName or "").lower()
    return lowered.endswith((".xml", ".rss", ".atom"))
def getSupportedExtensions(self) -> list[str]:
    """File extensions (with leading dot) handled by this extractor."""
    extensions = [".xml", ".rss", ".atom"]
    return extensions
def getSupportedMimeTypes(self) -> list[str]:
    """MIME types handled by this extractor."""
    mimeTypes = ["application/xml"]
    return mimeTypes
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/xml"
text = fileBytes.decode("utf-8", errors="replace")

View file

@ -1,25 +0,0 @@
from typing import Any, Dict, List
import base64
from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor
class ImageExtractor(Extractor):
    """Extractor for image files: wraps the raw bytes as one base64-encoded part."""

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Handle any file whose MIME type belongs to the image/ family."""
        return (mimeType or "").startswith("image/")

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Return a single base64-encoded image ContentPart; metadata keeps the byte size."""
        resolvedMime = context.get("mimeType") or "image/unknown"
        encoded = base64.b64encode(fileBytes).decode("utf-8")
        part = ContentPart(
            id=makeId(),
            parentId=None,
            label="image",
            typeGroup="image",
            mimeType=resolvedMime,
            data=encoded,
            metadata={"size": len(fileBytes)},
        )
        return [part]

View file

@ -1,26 +0,0 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
class TextExtractor(Extractor):
    """Extractor for plain-text and Markdown content."""

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Only exact plain-text or Markdown MIME types are accepted."""
        return mimeType in ("text/plain", "text/markdown")

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Decode the bytes as UTF-8 (errors replaced) into a single text part."""
        resolvedMime = context.get("mimeType") or "text/plain"
        decoded = fileBytes.decode("utf-8", errors="replace")
        part = ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="text",
            mimeType=resolvedMime,
            data=decoded,
            metadata={"size": len(fileBytes)},
        )
        return [part]

View file

@ -67,10 +67,12 @@ class ExtractionService:
if part.metadata:
logger.debug(f" Metadata: {part.metadata}")
# Attach document id to parts if missing
# Attach document id and MIME type to parts if missing
for p in ec.parts:
if "documentId" not in p.metadata:
p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4())
if "documentMimeType" not in p.metadata:
p.metadata["documentMimeType"] = documentData["mimeType"]
# Log chunking information
chunked_parts = [p for p in ec.parts if p.metadata.get("chunk", False)]

View file

@ -0,0 +1,209 @@
"""
Intelligent Token-Aware Merger for optimizing AI calls based on LLM token limits.
"""
from typing import List, Dict, Any, Tuple
import logging
from modules.datamodels.datamodelExtraction import ContentPart
from .subUtils import makeId
logger = logging.getLogger(__name__)
class IntelligentTokenAwareMerger:
    """
    Intelligent merger that groups chunks based on LLM token limits to minimize AI calls.

    Strategy:
    1. Estimate a token count for each chunk
    2. Group chunks to maximize token usage without exceeding limits
    3. Keep chunks from the same document/type together for semantic coherence
    4. Minimize the total number of AI calls
    """

    def __init__(self, model_capabilities: Dict[str, Any]):
        """
        Args:
            model_capabilities: Optional keys:
                - maxTokens: model token limit (default 4000)
                - safetyMargin: fraction of maxTokens reserved as headroom (default 0.1)
                - charsPerToken: rough characters-per-token ratio (default 4)
        """
        self.max_tokens = model_capabilities.get("maxTokens", 4000)
        self.safety_margin = model_capabilities.get("safetyMargin", 0.1)
        # Token budget actually usable after reserving the safety margin.
        self.effective_max_tokens = int(self.max_tokens * (1 - self.safety_margin))
        self.chars_per_token = model_capabilities.get("charsPerToken", 4)  # Rough estimation

    def merge_chunks_intelligently(self, chunks: "List[ContentPart]", prompt: str = "") -> "List[ContentPart]":
        """
        Merge chunks intelligently based on token limits.

        Args:
            chunks: List of ContentPart chunks to merge
            prompt: AI prompt to account for in token calculation

        Returns:
            List of optimally merged ContentPart objects
        """
        if not chunks:
            return chunks
        logger.info(f"🧠 Intelligent merging: {len(chunks)} chunks, max_tokens={self.effective_max_tokens}")
        # Reserve room for the prompt itself before packing content.
        prompt_tokens = self._estimate_tokens(prompt)
        available_tokens = self.effective_max_tokens - prompt_tokens
        logger.info(f"📊 Prompt tokens: {prompt_tokens}, Available for content: {available_tokens}")
        # Group chunks by document and type so merges never mix documents.
        grouped_chunks = self._group_chunks_by_document_and_type(chunks)
        merged_parts = []
        for group_key, group_chunks in grouped_chunks.items():
            logger.info(f"📁 Processing group: {group_key} ({len(group_chunks)} chunks)")
            group_merged = self._merge_group_optimally(group_chunks, available_tokens)
            merged_parts.extend(group_merged)
        logger.info(f"✅ Intelligent merging complete: {len(chunks)} → {len(merged_parts)} parts")
        return merged_parts

    def _group_chunks_by_document_and_type(self, chunks: "List[ContentPart]") -> "Dict[str, List[ContentPart]]":
        """Group chunks by (documentId, typeGroup) key for semantic coherence."""
        groups: "Dict[str, List[ContentPart]]" = {}
        for chunk in chunks:
            # Chunks with no documentId metadata all fall into one "unknown" bucket.
            doc_id = chunk.metadata.get("documentId", "unknown")
            group_key = f"{doc_id}_{chunk.typeGroup}"
            groups.setdefault(group_key, []).append(chunk)
        return groups

    def _merge_group_optimally(self, chunks: "List[ContentPart]", available_tokens: int) -> "List[ContentPart]":
        """Greedily pack a group's chunks into as few merged parts as possible."""
        if not chunks:
            return []
        # Sort chunks by estimated size (smallest first) for denser packing.
        # NOTE(review): sorting discards the original chunk order within the
        # group — confirm downstream consumers don't rely on document order.
        sorted_chunks = sorted(chunks, key=lambda c: self._estimate_tokens(c.data))
        merged_parts = []
        current_group = []
        current_tokens = 0
        for chunk in sorted_chunks:
            chunk_tokens = self._estimate_tokens(chunk.data)
            # A chunk at >= 90% of the available budget is processed alone.
            if chunk_tokens >= available_tokens * 0.9:
                # Finalize any in-progress group first.
                if current_group:
                    merged_parts.append(self._create_merged_part(current_group, current_tokens))
                    current_group = []
                    current_tokens = 0
                merged_parts.append(chunk)
                logger.debug(f"🔍 Large chunk processed individually: {chunk_tokens} tokens")
                continue
            # If adding this chunk would overflow the budget, close the group.
            if current_tokens + chunk_tokens > available_tokens and current_group:
                merged_parts.append(self._create_merged_part(current_group, current_tokens))
                current_group = [chunk]
                current_tokens = chunk_tokens
            else:
                current_group.append(chunk)
                current_tokens += chunk_tokens
        # Finalize the trailing group, if any.
        if current_group:
            merged_parts.append(self._create_merged_part(current_group, current_tokens))
        logger.info(f"📦 Group merged: {len(chunks)} → {len(merged_parts)} parts")
        return merged_parts

    def _create_merged_part(self, chunks: "List[ContentPart]", total_tokens: int) -> "ContentPart":
        """Combine several chunks into one ContentPart, recording merge provenance in metadata."""
        if len(chunks) == 1:
            return chunks[0]  # Nothing to merge for a single chunk.
        combined_data = self._combine_chunk_data(chunks)
        # Use the first chunk's metadata/identity fields as the base.
        base_chunk = chunks[0]
        merged_metadata = base_chunk.metadata.copy()
        merged_metadata.update({
            "merged": True,
            "originalChunkCount": len(chunks),
            "totalTokens": total_tokens,
            "originalChunkIds": [c.id for c in chunks],
            "size": len(combined_data.encode('utf-8'))
        })
        merged_part = ContentPart(
            id=makeId(),
            parentId=base_chunk.parentId,
            label=f"merged_{len(chunks)}_chunks",
            typeGroup=base_chunk.typeGroup,
            mimeType=base_chunk.mimeType,
            data=combined_data,
            metadata=merged_metadata
        )
        logger.debug(f"🔗 Created merged part: {len(chunks)} chunks, {total_tokens} tokens")
        return merged_part

    def _combine_chunk_data(self, chunks: "List[ContentPart]") -> str:
        """Join chunk payloads with a separator suited to the content type."""
        if not chunks:
            return ""
        # Tables get a distinct separator; text and everything else share one.
        if chunks[0].typeGroup == "table":
            separator = "\n\n[TABLE BREAK]\n\n"
        else:
            separator = "\n\n---\n\n"
        return separator.join([chunk.data for chunk in chunks])

    def _estimate_tokens(self, text: str) -> int:
        """Estimate the token count of *text* (ceiling of chars / chars_per_token)."""
        if not text:
            return 0
        # Round UP: floor division under-counted short texts (e.g. 3 chars -> 0
        # tokens), which could let a packed group exceed the real token budget.
        return -(-len(text) // self.chars_per_token)

    def calculate_optimization_stats(self, original_chunks: "List[ContentPart]", merged_parts: "List[ContentPart]") -> Dict[str, Any]:
        """Calculate optimization statistics comparing original vs merged AI-call counts."""
        original_calls = len(original_chunks)
        optimized_calls = len(merged_parts)
        reduction_percent = ((original_calls - optimized_calls) / original_calls * 100) if original_calls > 0 else 0
        # Split chunks into "large" (processed alone) and "small" (mergeable).
        large_chunks = [c for c in original_chunks if self._estimate_tokens(c.data) >= self.effective_max_tokens * 0.9]
        small_chunks = [c for c in original_chunks if self._estimate_tokens(c.data) < self.effective_max_tokens * 0.9]
        # Theoretical floor: large chunks alone, assuming ~3 small chunks per call.
        theoretical_min_calls = len(large_chunks) + max(1, len(small_chunks) // 3)
        theoretical_reduction = ((original_calls - theoretical_min_calls) / original_calls * 100) if original_calls > 0 else 0
        return {
            "original_ai_calls": original_calls,
            "optimized_ai_calls": optimized_calls,
            "reduction_percent": round(reduction_percent, 1),
            "cost_savings": f"{reduction_percent:.1f}%",
            # NOTE(review): the empty-string fallback when optimized_calls == 0
            # looks like a placeholder (possibly a lost "∞") — confirm intent.
            "efficiency_gain": f"{original_calls / optimized_calls:.1f}x" if optimized_calls > 0 else "",
            "analysis": {
                "large_chunks": len(large_chunks),
                "small_chunks": len(small_chunks),
                "theoretical_min_calls": theoretical_min_calls,
                "theoretical_reduction": round(theoretical_reduction, 1),
                "optimization_potential": "high" if reduction_percent > 50 else "moderate" if reduction_percent > 20 else "low"
            }
        }

View file

@ -3,11 +3,13 @@ import logging
import os
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
from modules.shared.configuration import APP_CONFIG
from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .merging.text_merger import TextMerger
from .merging.table_merger import TableMerger
from .merging.default_merger import DefaultMerger
from .merging.mergerText import TextMerger
from .merging.mergerTable import TableMerger
from .merging.mergerDefault import DefaultMerger
from .subMerger import IntelligentTokenAwareMerger
logger = logging.getLogger(__name__)
@ -84,16 +86,25 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
chunk_parts = [p for p in parts if p.metadata.get("chunk", False)]
logger.debug(f"runExtraction: Preserving {len(chunk_parts)} chunks from merging")
logger.debug(f"runExtraction - non_chunk_parts: {len(non_chunk_parts)}, chunk_parts: {len(chunk_parts)}")
# Apply intelligent merging for small text parts
if non_chunk_parts:
# Count text parts
text_parts = [p for p in non_chunk_parts if p.typeGroup == "text"]
if len(text_parts) > 5: # If we have many small text parts, merge them
logger.info(f"🔧 Merging {len(text_parts)} small text parts for efficiency")
non_chunk_parts = _mergeParts(non_chunk_parts, mergeStrategy)
# Combine non-chunk parts with chunk parts (chunks stay separate)
parts = non_chunk_parts + chunk_parts
logger.debug(f"runExtraction: Final parts after merging: {len(parts)} (chunks: {len(chunk_parts)})")
# DEBUG: dump parts and chunks to files TODO TO REMOVE
logger.debug(f"runExtraction - Final parts: {len(parts)} (chunks: {len(chunk_parts)})")
# DEBUG: dump parts and chunks to files - only if debug enabled
try:
debug_enabled = APP_CONFIG.get("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
if debug_enabled:
base_dir = "./test-chat/ai"
os.makedirs(base_dir, exist_ok=True)
@ -146,13 +157,22 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
kept: List[ContentPart] = []
remaining: List[ContentPart] = []
for p in parts:
logger.debug(f"Starting poolAndLimit with {len(parts)} parts, maxSize={maxSize}")
for i, p in enumerate(parts):
size = int(p.metadata.get("size", 0) or 0)
# Show first 50 characters of text content for debugging
content_preview = p.data[:50].replace('\n', '\\n') if p.data else ""
logger.debug(f"Part {i}: {p.typeGroup} - {size} bytes - '{content_preview}...' (current: {current})")
if current + size <= maxSize:
kept.append(p)
current += size
logger.debug(f"Part {i} kept (total: {current})")
else:
remaining.append(p)
logger.debug(f"Part {i} moved to remaining")
logger.debug(f"Kept: {len(kept)}, Remaining: {len(remaining)}")
# If we have remaining parts and chunking is allowed, try chunking
if remaining and chunkAllowed:
@ -160,12 +180,15 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
logger.debug(f"Remaining parts to chunk: {len(remaining)}")
logger.debug(f"Max size limit: {maxSize} bytes")
logger.debug(f"Current size used: {current} bytes")
logger.debug(f"Chunking {len(remaining)} remaining parts")
for p in remaining:
if p.typeGroup in ("text", "table", "structure", "image"):
if p.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
logger.debug(f"Chunking {p.typeGroup} part: {len(p.data)} chars")
logger.debug(f"Chunking {p.typeGroup} part with {len(p.data)} chars")
chunks = chunkerRegistry.resolve(p.typeGroup).chunk(p, options)
logger.debug(f"Created {len(chunks)} chunks")
logger.debug(f"Created {len(chunks)} chunks")
chunks_added = 0
for ch in chunks:
@ -197,12 +220,18 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
logger.debug(f"Preserving {len(chunk_parts)} chunks from merging")
# Apply intelligent merging for small text parts
if non_chunk_parts:
# Count text parts
text_parts = [p for p in non_chunk_parts if p.typeGroup == "text"]
if len(text_parts) > 5: # If we have many small text parts, merge them
logger.info(f"🔧 Merging {len(text_parts)} small text parts for efficiency")
non_chunk_parts = _applyMerging(non_chunk_parts, mergeStrategy)
# Combine non-chunk parts with chunk parts (chunks stay separate)
kept = non_chunk_parts + chunk_parts
logger.debug(f"Final parts after merging: {len(kept)} (chunks: {len(chunk_parts)})")
logger.debug(f"Final parts after merging: {len(kept)} (chunks: {len(chunk_parts)})")
# Re-check size after merging
@ -211,11 +240,30 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
# Apply size limit to merged parts
kept = _applySizeLimit(kept, maxSize)
logger.debug(f"poolAndLimit returning {len(kept)} parts")
return kept
def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
"""Apply merging strategy to parts."""
"""Apply merging strategy to parts with intelligent token-aware merging."""
logger.debug(f"_applyMerging called with {len(parts)} parts")
# Check if intelligent merging is enabled
if strategy.get("useIntelligentMerging", False):
model_capabilities = strategy.get("modelCapabilities", {})
subMerger = IntelligentTokenAwareMerger(model_capabilities)
# Use intelligent merging for all parts
merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
# Calculate and log optimization stats
stats = subMerger.calculate_optimization_stats(parts, merged)
logger.info(f"🧠 Intelligent merging stats: {stats}")
logger.debug(f"Intelligent merging: {stats['original_ai_calls']}{stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")
return merged
# Fallback to traditional merging
textMerger = TextMerger()
tableMerger = TableMerger()
defaultMerger = DefaultMerger()
@ -226,18 +274,29 @@ def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[Co
structureParts = [p for p in parts if p.typeGroup == "structure"]
otherParts = [p for p in parts if p.typeGroup not in ("text", "table", "structure")]
logger.debug(f"Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}")
merged: List[ContentPart] = []
if textParts:
merged.extend(textMerger.merge(textParts, strategy))
textMerged = textMerger.merge(textParts, strategy)
logger.debug(f"TextMerger merged {len(textParts)} parts into {len(textMerged)} parts")
merged.extend(textMerged)
if tableParts:
merged.extend(tableMerger.merge(tableParts, strategy))
tableMerged = tableMerger.merge(tableParts, strategy)
logger.debug(f"TableMerger merged {len(tableParts)} parts into {len(tableMerged)} parts")
merged.extend(tableMerged)
if structureParts:
# For now, treat structure like text
merged.extend(textMerger.merge(structureParts, strategy))
structureMerged = textMerger.merge(structureParts, strategy)
logger.debug(f"StructureMerger merged {len(structureParts)} parts into {len(structureMerged)} parts")
merged.extend(structureMerged)
if otherParts:
merged.extend(defaultMerger.merge(otherParts, strategy))
otherMerged = defaultMerger.merge(otherParts, strategy)
logger.debug(f"DefaultMerger merged {len(otherParts)} parts into {len(otherMerged)} parts")
merged.extend(otherMerged)
logger.debug(f"_applyMerging returning {len(merged)} parts")
return merged

View file

@ -1,15 +1,38 @@
from typing import Any, Dict, Optional
import logging
from modules.datamodels.datamodelExtraction import ContentPart
logger = logging.getLogger(__name__)
class Extractor:
    """
    Base class for all document extractors.

    Subclasses are expected to override:
    - detect(): decide whether this extractor can handle a given file
    - getSupportedExtensions() / getSupportedMimeTypes(): declare formats
      so the registry can auto-register the extractor
    - extract(): turn raw file bytes into ContentPart objects
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """The base implementation never matches; subclasses override."""
        return False

    def getSupportedExtensions(self) -> list[str]:
        """File extensions (including the leading dot); none by default."""
        return []

    def getSupportedMimeTypes(self) -> list[str]:
        """MIME types this extractor supports; none by default."""
        return []

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
        """Extract content parts from the file bytes; must be overridden."""
        raise NotImplementedError
class Chunker:
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
@ -20,51 +43,86 @@ class ExtractorRegistry:
def __init__(self):
self._map: Dict[str, Extractor] = {}
self._fallback: Optional[Extractor] = None
# Register built-ins
self._auto_discover_extractors()
def _auto_discover_extractors(self):
"""Auto-discover and register all extractors from the extractors directory."""
try:
from .formats.text_extractor import TextExtractor
from .formats.csv_extractor import CsvExtractor
from .formats.json_extractor import JsonExtractor
from .formats.xml_extractor import XmlExtractor
from .formats.html_extractor import HtmlExtractor
from .formats.pdf_extractor import PdfExtractor
from .formats.docx_extractor import DocxExtractor
from .formats.xlsx_extractor import XlsxExtractor
from .formats.image_extractor import ImageExtractor
from .formats.binary_extractor import BinaryExtractor
self.register("text/plain", TextExtractor())
self.register("text/markdown", TextExtractor())
self.register("text/csv", CsvExtractor())
self.register("application/json", JsonExtractor())
self.register("application/xml", XmlExtractor())
self.register("text/html", HtmlExtractor())
self.register("application/pdf", PdfExtractor())
self.register("application/vnd.openxmlformats-officedocument.wordprocessingml.document", DocxExtractor())
self.register("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", XlsxExtractor())
# images
self.register("image/jpeg", ImageExtractor())
self.register("image/png", ImageExtractor())
self.register("image/gif", ImageExtractor())
# extension fallbacks
self.register("txt", TextExtractor())
self.register("md", TextExtractor())
self.register("csv", CsvExtractor())
self.register("json", JsonExtractor())
self.register("xml", XmlExtractor())
self.register("html", HtmlExtractor())
self.register("htm", HtmlExtractor())
self.register("pdf", PdfExtractor())
self.register("docx", DocxExtractor())
self.register("xlsx", XlsxExtractor())
self.register("xlsm", XlsxExtractor())
# fallback
self.setFallback(BinaryExtractor())
print(f"✅ ExtractorRegistry: Successfully registered {len(self._map)} extractors")
import os
import importlib
from pathlib import Path
# Get the extractors directory
current_dir = Path(__file__).parent
extractors_dir = current_dir / "extractors"
if not extractors_dir.exists():
logger.error(f"Extractors directory not found: {extractors_dir}")
return
# Import all extractor modules
extractor_modules = []
for file_path in extractors_dir.glob("extractor*.py"):
if file_path.name == "__init__.py":
continue
module_name = file_path.stem
try:
# Import the module
module = importlib.import_module(f".{module_name}", package="modules.services.serviceExtraction.extractors")
# Find all extractor classes in the module
for attr_name in dir(module):
attr = getattr(module, attr_name)
if (isinstance(attr, type) and
issubclass(attr, Extractor) and
attr != Extractor and
not attr_name.startswith('_')):
# Create instance and auto-register
extractor_instance = attr()
self._auto_register_extractor(extractor_instance)
extractor_modules.append(attr_name)
except Exception as e:
print(f"❌ ExtractorRegistry: Failed to register extractors: {str(e)}")
logger.warning(f"Failed to import {module_name}: {str(e)}")
continue
# Set fallback extractor
try:
from .extractors.extractorBinary import BinaryExtractor
self.setFallback(BinaryExtractor())
except Exception as e:
logger.warning(f"Failed to set fallback extractor: {str(e)}")
logger.info(f"ExtractorRegistry: Auto-discovered and registered {len(extractor_modules)} extractor classes: {', '.join(extractor_modules)}")
logger.info(f"ExtractorRegistry: Total registered formats: {len(self._map)}")
except Exception as e:
logger.error(f"ExtractorRegistry: Failed to auto-discover extractors: {str(e)}")
import traceback
traceback.print_exc()
def _auto_register_extractor(self, extractor: Extractor):
    """Register one extractor under every MIME type and extension it declares."""
    try:
        # MIME-type keys, exactly as declared.
        for mime_type in extractor.getSupportedMimeTypes():
            self.register(mime_type, extractor)
            logger.debug(f"Registered MIME type: {mime_type} → {extractor.__class__.__name__}")
        # Extension keys are stored without the leading dot.
        for ext in extractor.getSupportedExtensions():
            ext_key = ext.lstrip('.')
            self.register(ext_key, extractor)
            logger.debug(f"Registered extension: .{ext_key} → {extractor.__class__.__name__}")
    except Exception as e:
        logger.error(f"Failed to auto-register {extractor.__class__.__name__}: {str(e)}")
def register(self, key: str, extractor: Extractor):
    """Map a MIME-type or bare-extension key to an extractor instance (last registration wins)."""
    self._map[key] = extractor
@ -81,6 +139,43 @@ class ExtractorRegistry:
return self._map[ext]
return self._fallback
def getAllSupportedFormats(self) -> Dict[str, Dict[str, list[str]]]:
    """
    Collect the formats declared by every registered extractor.

    Returns:
        {
            "extensions": {registry_key: [".ext1", ".ext2", ...], ...},
            "mime_types": {registry_key: ["mime/type1", ...], ...}
        }
        The fallback extractor (if set) appears under the key "fallback".
    """
    formats = {"extensions": {}, "mime_types": {}}
    for key, extractor in self._map.items():
        # Only record non-empty declarations from extractors that expose them.
        if hasattr(extractor, 'getSupportedExtensions'):
            declared_exts = extractor.getSupportedExtensions()
            if declared_exts:
                formats["extensions"][key] = declared_exts
        if hasattr(extractor, 'getSupportedMimeTypes'):
            declared_mimes = extractor.getSupportedMimeTypes()
            if declared_mimes:
                formats["mime_types"][key] = declared_mimes
    # The fallback is reported even when its declarations are empty.
    fallback = self._fallback
    if fallback and hasattr(fallback, 'getSupportedExtensions'):
        formats["extensions"]["fallback"] = fallback.getSupportedExtensions()
    if fallback and hasattr(fallback, 'getSupportedMimeTypes'):
        formats["mime_types"]["fallback"] = fallback.getSupportedMimeTypes()
    return formats
class ChunkerRegistry:
def __init__(self):
@ -88,17 +183,19 @@ class ChunkerRegistry:
self._noop = Chunker()
# Register default chunkers
try:
from .chunking.text_chunker import TextChunker
from .chunking.table_chunker import TableChunker
from .chunking.structure_chunker import StructureChunker
# Skip ImageChunker for now to avoid PIL import hang
# from .chunking.image_chunker import ImageChunker
from .chunking.chunkerText import TextChunker
from .chunking.chunkerTable import TableChunker
from .chunking.chunkerStructure import StructureChunker
from .chunking.chunkerImage import ImageChunker
self.register("text", TextChunker())
self.register("table", TableChunker())
self.register("structure", StructureChunker())
# self.register("image", ImageChunker())
self.register("image", ImageChunker())
# Use text chunker for container and binary content
self.register("container", TextChunker())
self.register("binary", TextChunker())
except Exception as e:
print(f"❌ ChunkerRegistry: Failed to register chunkers: {str(e)}")
logger.error(f"ChunkerRegistry: Failed to register chunkers: {str(e)}")
import traceback
traceback.print_exc()

View file

@ -1,6 +1,7 @@
import logging
import uuid
from typing import Any, Dict, List, Optional
import json
from typing import Any, Dict, List, Optional, Union, Tuple
from datetime import datetime, UTC
import re
from modules.shared.timezoneUtils import get_utc_timestamp
@ -18,7 +19,7 @@ logger = logging.getLogger(__name__)
class GenerationService:
def __init__(self, serviceCenter=None):
# Directly use interfaces from the provided service center (no self.service calls)
self.serviceCenter = serviceCenter
self.services = serviceCenter
self.interfaceDbComponent = getattr(serviceCenter, 'interfaceDbComponent', None) if serviceCenter else None
self.interfaceDbChat = getattr(serviceCenter, 'interfaceDbChat', None) if serviceCenter else None
self.workflow = getattr(serviceCenter, 'workflow', None) if serviceCenter else None
@ -296,101 +297,237 @@ class GenerationService:
'workflowId': 'unknown'
}
async def renderReport(self, extracted_content: str, output_format: str, title: str) -> tuple[str, str]:
async def renderReport(self, extractedContent: Dict[str, Any], outputFormat: str, title: str, userPrompt: str = None, aiService=None) -> tuple[str, str]:
"""
Render extracted content to the specified output format.
Render extracted JSON content to the specified output format.
Args:
extracted_content: Content extracted by AI using format-specific prompt
output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
extractedContent: Structured JSON document from AI extraction
outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
title: Report title
userPrompt: User's original prompt for report generation
aiService: AI service instance for generation prompt creation
Returns:
tuple: (rendered_content, mime_type)
"""
try:
# DEBUG: dump renderer input to diagnose JSON+HTML mixtures TODO REMOVE
# Validate JSON input
if not isinstance(extractedContent, dict):
raise ValueError("extractedContent must be a JSON dictionary")
if "sections" not in extractedContent:
raise ValueError("extractedContent must contain 'sections' field")
# DEBUG: Log renderer input metadata only (no verbose JSON) - only if debug enabled
try:
debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
if debug_enabled:
import os
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
debug_root = "./test-chat/ai"
debug_dir = os.path.join(debug_root, f"render_input_{ts}")
os.makedirs(debug_dir, exist_ok=True)
with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f:
f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n")
with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f:
f.write(extracted_content or "")
f.write(f"title: {title}\nformat: {outputFormat}\ncontent_type: {type(extractedContent).__name__}\n")
f.write(f"content_size: {len(str(extractedContent))} characters\n")
f.write(f"sections_count: {len(extractedContent.get('sections', []))}\n")
except Exception:
pass
# Get the appropriate renderer for the format
renderer = self._getFormatRenderer(output_format)
renderer = self._getFormatRenderer(outputFormat)
if not renderer:
raise ValueError(f"Unsupported output format: {output_format}")
raise ValueError(f"Unsupported output format: {outputFormat}")
# Render the content
rendered_content, mime_type = await renderer.render(extracted_content, title)
# Render the JSON content directly (AI generation handled by main service)
renderedContent, mimeType = await renderer.render(extractedContent, title, userPrompt, aiService)
# DEBUG: dump rendered output
try:
import os
with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f:
f.write(rendered_content or "")
f.write(renderedContent or "")
except Exception:
pass
logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters")
return rendered_content, mime_type
logger.info(f"Successfully rendered JSON report to {outputFormat} format: {len(renderedContent)} characters")
return renderedContent, mimeType
except Exception as e:
logger.error(f"Error rendering report to {output_format}: {str(e)}")
logger.error(f"Error rendering JSON report to {outputFormat}: {str(e)}")
raise
def getExtractionPrompt(self, output_format: str, user_prompt: str, title: str) -> str:
async def getAdaptiveExtractionPrompt(
    self,
    outputFormat: str,
    userPrompt: str,
    title: str,
    promptAnalysis: Dict[str, Any],
    aiService=None
) -> str:
    """Build an extraction prompt tuned to the prior AI analysis of the request.

    Thin async delegator: all assembly logic lives in
    subPromptBuilder.buildAdaptiveExtractionPrompt; this method only forwards
    its arguments plus the shared services container.
    """
    # Local import keeps the builder dependency lazy.
    from .subPromptBuilder import buildAdaptiveExtractionPrompt

    builderKwargs = {
        "outputFormat": outputFormat,
        "userPrompt": userPrompt,
        "title": title,
        "promptAnalysis": promptAnalysis,
        "aiService": aiService,
        "services": self.services,
    }
    return await buildAdaptiveExtractionPrompt(**builderKwargs)
async def getGenerationPrompt(
    self,
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None
) -> str:
    """Produce the prompt used to enhance already-extracted JSON content.

    Delegates to subPromptBuilder.buildGenerationPrompt, forwarding the
    caller's arguments together with the shared services container.
    """
    # Local import keeps the builder dependency lazy.
    from .subPromptBuilder import buildGenerationPrompt

    return await buildGenerationPrompt(
        outputFormat=outputFormat,
        userPrompt=userPrompt,
        title=title,
        aiService=aiService,
        services=self.services,
    )
async def getGenericExtractionPrompt(
    self,
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None
) -> str:
    """Return the generic extraction prompt shared by single- and multi-file flows.

    Pure delegation to subPromptBuilder.buildGenericExtractionPrompt with the
    shared services container appended.
    """
    # Local import keeps the builder dependency lazy.
    from .subPromptBuilder import buildGenericExtractionPrompt

    return await buildGenericExtractionPrompt(
        outputFormat=outputFormat,
        userPrompt=userPrompt,
        title=title,
        aiService=aiService,
        services=self.services,
    )
async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
    """
    Get the format-specific extraction prompt for AI content extraction.

    Defect fixed: this span contained interleaved old/new unified-diff lines
    (duplicate renderer lookups, mixed snake_case/camelCase variables); this
    is the coherent new-side implementation.

    Args:
        outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
        userPrompt: User's original prompt for report generation
        title: Report title
        aiService: AI service instance for intent extraction

    Returns:
        str: Format-specific prompt for AI extraction

    Raises:
        ValueError: If no renderer is registered for ``outputFormat``.
    """
    try:
        # Get the appropriate renderer for the format
        renderer = self._getFormatRenderer(outputFormat)
        if not renderer:
            raise ValueError(f"Unsupported output format: {outputFormat}")
        # Build centralized prompt with generic rules + format-specific guidelines
        from .subPromptBuilder import buildExtractionPrompt
        extractionPrompt = await buildExtractionPrompt(
            outputFormat=outputFormat,
            renderer=renderer,
            userPrompt=userPrompt,
            title=title,
            aiService=aiService,
            services=self.services
        )
        logger.info(f"Generated {outputFormat}-specific extraction prompt: {len(extractionPrompt)} characters")
        return extractionPrompt
    except Exception as e:
        logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}")
        raise
async def renderAdaptiveReport(
    self,
    extractedContent: Dict[str, Any],
    outputFormat: str,
    title: str,
    userPrompt: str = None,
    aiService=None,
    isMultiFile: bool = False
) -> Union[Tuple[str, str], List[Dict[str, Any]]]:
    """Dispatch rendering to the single- or multi-file pipeline.

    The multi-file path is taken only when the caller asks for it AND the
    extracted payload actually carries a "documents" collection; everything
    else goes through the single-file path.
    """
    useMultiFile = isMultiFile and "documents" in extractedContent
    handler = self._renderMultiFileReport if useMultiFile else self._renderSingleFileReport
    return await handler(extractedContent, outputFormat, title, userPrompt, aiService)
async def _renderMultiFileReport(
    self,
    extractedContent: Dict[str, Any],
    outputFormat: str,
    title: str,
    userPrompt: str = None,
    aiService=None
) -> List[Dict[str, Any]]:
    """Render every entry in ``extractedContent["documents"]`` independently.

    Each document is rendered through the format renderer; entries are
    silently skipped when no renderer exists for ``outputFormat``. Returns a
    list of dicts with filename/content/mime_type/title keys.
    """
    results: List[Dict[str, Any]] = []
    for document in extractedContent.get("documents", []):
        # Renderer is resolved per document, mirroring the original behavior.
        formatRenderer = self._getFormatRenderer(outputFormat)
        if not formatRenderer:
            continue
        content, mimeType = await formatRenderer.render(
            extractedContent={"sections": document["sections"]},
            title=document["title"],
            userPrompt=userPrompt,
            aiService=aiService
        )
        results.append({
            "filename": document["filename"],
            "content": content,
            "mime_type": mimeType,
            "title": document["title"],
        })
    return results
async def _renderSingleFileReport(
    self,
    extractedContent: Dict[str, Any],
    outputFormat: str,
    title: str,
    userPrompt: str = None,
    aiService=None
) -> Tuple[str, str]:
    """Render a single-document report by delegating to renderReport.

    Kept as a separate method so the adaptive dispatcher has symmetrical
    single/multi entry points; carries no logic of its own.
    """
    return await self.renderReport(
        extractedContent,
        outputFormat,
        title,
        userPrompt,
        aiService,
    )
def _getFormatRenderer(self, output_format: str):
"""Get the appropriate renderer for the specified format using auto-discovery."""
try:
from .renderers.registry import get_renderer
renderer = get_renderer(output_format)
renderer = get_renderer(output_format, services=self.services)
if renderer:
return renderer
# Fallback to text renderer if no specific renderer found
logger.warning(f"No renderer found for format {output_format}, falling back to text")
fallback_renderer = get_renderer('text')
fallback_renderer = get_renderer('text', services=self.services)
if fallback_renderer:
return fallback_renderer

View file

@ -1,72 +0,0 @@
"""
Centralized prompt builder for document generation across formats.
Builds a robust prompt that:
- Accepts any user intent (no fixed structure assumptions)
- Injects format-specific guidelines from the selected renderer
- Adds a common policy section to always use real data from source docs
- Requires the AI to output a filename header that we can parse and use
"""
from typing import Protocol
class _RendererLike(Protocol):
    """Structural type for renderers accepted by buildExtractionPrompt.

    Any object exposing getExtractionPrompt(user_prompt, title) -> str,
    returning only the format-specific guideline text, satisfies it.
    """

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:  # returns only format-specific guidelines
        ...


def buildExtractionPrompt(
    output_format: str,
    renderer: _RendererLike,
    user_prompt: str,
    title: str
) -> str:
    """Assemble the final extraction prompt for one output format.

    Concatenates, in order: the user's prompt verbatim, the generic
    cross-format rules (FILENAME header + real-data policy), the renderer's
    format-specific guideline snippet, and a closing generation instruction.
    The AI must emit exactly one ``FILENAME:`` header line at the very top.
    """
    guidelines = renderer.getExtractionPrompt(user_prompt, title)

    # Generic block appears once for every format.
    intro = f"""
{user_prompt}
You are generating a document in {output_format.upper()} format for the title: "{title}".
Rules:
- The user's intent fully defines the structure. Do not assume a fixed template or headings.
- Use only factual information extracted from the supplied source documents.
- Do not invent, hallucinate, or include placeholders (e.g., "lorem ipsum", "TBD").
- The output must strictly follow the target format and be ready for saving without extra wrapping.
- At the VERY TOP output exactly one line with the filename header:
FILENAME: <safe-file-name-with-extension>
- The base name should be short, descriptive, and kebab-case or snake-case without spaces.
- Include the correct extension for the requested format (e.g., .html, .pdf, .docx, .md, .txt, .json, .csv, .xlsx).
- Avoid special characters beyond [a-zA-Z0-9-_].
- After this header, insert a single blank line and then provide ONLY the document content.
Common policy:
- Use the actual data from the source documents to create the content.
- Do not generate placeholder text or templates.
- Extract and use the real data provided in the source documents to create meaningful content.
""".strip()

    # Final assembly: generic intro, then format guidelines, then the call to action.
    pieces = [
        intro,
        "\n\nFORMAT-SPECIFIC GUIDELINES:\n",
        guidelines.strip(),
        "\n\nGenerate the complete document content now based on the source documents below:",
    ]
    return "".join(pieces)

View file

@ -1,86 +0,0 @@
"""
Base renderer class for all format renderers.
"""
from abc import ABC, abstractmethod
from typing import Dict, Any, Tuple, List
import logging
logger = logging.getLogger(__name__)
class BaseRenderer(ABC):
"""Base class for all format renderers."""
def __init__(self):
self.logger = logger
@classmethod
def get_supported_formats(cls) -> List[str]:
"""
Return list of supported format names for this renderer.
Override this method in subclasses to specify supported formats.
"""
return []
@classmethod
def get_format_aliases(cls) -> List[str]:
"""
Return list of format aliases for this renderer.
Override this method in subclasses to specify format aliases.
"""
return []
@classmethod
def get_priority(cls) -> int:
"""
Return priority for this renderer (higher number = higher priority).
Used when multiple renderers support the same format.
"""
return 0
@abstractmethod
def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
"""
Get the format-specific extraction prompt for AI content extraction.
Args:
user_prompt: User's original prompt for report generation
title: Report title
Returns:
str: Format-specific prompt for AI extraction
"""
pass
@abstractmethod
async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
"""
Render extracted content to the target format.
Args:
extracted_content: Raw content extracted by AI using format-specific prompt
title: Report title
Returns:
tuple: (rendered_content, mime_type)
"""
pass
def _extract_sections(self, report_data: Dict[str, Any]) -> list:
"""Extract sections from report data."""
return report_data.get('sections', [])
def _extract_metadata(self, report_data: Dict[str, Any]) -> Dict[str, Any]:
"""Extract metadata from report data."""
return report_data.get('metadata', {})
def _get_title(self, report_data: Dict[str, Any], fallback_title: str) -> str:
"""Get title from report data or use fallback."""
return report_data.get('title', fallback_title)
def _format_timestamp(self, timestamp: str = None) -> str:
"""Format timestamp for display."""
if timestamp:
return timestamp
from datetime import datetime, UTC
return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")

View file

@ -1,64 +0,0 @@
"""
CSV renderer for report generation.
"""
from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
import csv
import io
class CsvRenderer(BaseRenderer):
    """Renders content to CSV format with format-specific extraction.

    The AI is instructed to emit raw CSV directly; this renderer only strips
    whitespace and an optional surrounding code fence.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """CSV is the only native format."""
        return ['csv']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Generic tabular aliases routed to the CSV renderer."""
        return ['spreadsheet', 'table']

    @classmethod
    def get_priority(cls) -> int:
        """Selection priority when several renderers claim a format/alias."""
        return 70

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only CSV-specific guidelines; global prompt is built centrally."""
        return (
            "CSV FORMAT GUIDELINES:\n"
            "- Emit ONLY CSV text without fences or commentary.\n"
            "- Include a single header row with clear column names.\n"
            "- Quote fields containing commas, quotes, or newlines; escape quotes by doubling them.\n"
            "- Use rows to represent items/records derived from sources.\n"
            "- Keep cells concise; include units in headers when useful.\n"
            "OUTPUT: Return ONLY valid CSV content that can be imported."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Pass AI-produced CSV through after cleanup; never propagates errors."""
        try:
            cleaned = self._clean_csv_content(extracted_content, title)
            return cleaned, "text/csv"
        except Exception as e:
            self.logger.error(f"Error rendering CSV: {str(e)}")
            # Minimal two-column fallback so callers still receive CSV.
            return f"Title,Content\n{title},Error rendering report: {str(e)}", "text/csv"

    def _clean_csv_content(self, content: str, title: str) -> str:
        """Trim whitespace and unwrap a single surrounding ``` fence, if any."""
        body = content.strip()
        fenced = body.startswith("```") and body.endswith("```")
        if fenced:
            rows = body.split('\n')
            if len(rows) > 2:
                body = '\n'.join(rows[1:-1]).strip()
        return body

View file

@ -1,249 +0,0 @@
"""
DOCX renderer for report generation using python-docx.
"""
from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
import io
import base64
from datetime import datetime, UTC
try:
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml.shared import OxmlElement, qn
from docx.oxml.ns import nsdecls
from docx.oxml import parse_xml
DOCX_AVAILABLE = True
except ImportError:
DOCX_AVAILABLE = False
class DocxRenderer(BaseRenderer):
    """Renders content to DOCX format using python-docx.

    Note: on the success path render() returns the document as a
    base64-encoded string (see _generate_docx), not raw bytes.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported DOCX formats."""
        return ['docx', 'doc']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['word', 'document']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for DOCX renderer (higher number wins contested formats)."""
        return 115

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only DOCX-specific guidelines; global prompt is built centrally."""
        return (
            "DOCX FORMAT GUIDELINES:\n"
            "- Provide plain text content suitable for Word generation (no markdown/HTML).\n"
            "- Use clear section hierarchy; bullet and numbered lists where needed.\n"
            "- Include tables as simple pipe-delimited lines if tabular data is needed.\n"
            "OUTPUT: Return ONLY the structured plain text to be converted into DOCX."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to DOCX format.

        Returns (base64_docx, docx_mime) on success; falls back to the HTML
        renderer when python-docx is missing, or to a plain-text error
        message when generation fails (never raises to the caller).
        """
        try:
            if not DOCX_AVAILABLE:
                # Fallback to HTML if python-docx not available
                from .html_renderer import HtmlRenderer
                html_renderer = HtmlRenderer()
                html_content, _ = await html_renderer.render(extracted_content, title)
                return html_content, "text/html"
            # Generate DOCX using python-docx
            docx_content = self._generate_docx(extracted_content, title)
            return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        except Exception as e:
            self.logger.error(f"Error rendering DOCX: {str(e)}")
            # Return minimal fallback
            return f"DOCX Generation Error: {str(e)}", "text/plain"

    def _generate_docx(self, content: str, title: str) -> str:
        """Generate DOCX content using python-docx.

        Maps plain-text lines onto Word elements heuristically:
        ALL-CAPS lines become H1, Title Case lines become H2, and markdown
        #/##/### prefixes are honored as a fallback; everything else is
        buffered into the current section. Returns the document base64-encoded.
        """
        try:
            # Create new document
            doc = Document()
            # Set up document styles
            self._setup_document_styles(doc)
            # Add title
            title_para = doc.add_heading(title, 0)
            title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
            # Add generation date
            date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}")
            date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
            # Add page break
            doc.add_page_break()
            # Process content
            lines = content.split('\n')
            current_section = []  # buffered non-heading lines awaiting flush
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                # Check for ALL CAPS headings (major headings)
                if line.isupper() and len(line) > 3 and not line.startswith('-') and not line.startswith('*'):
                    if current_section:
                        self._process_section(doc, current_section)
                        current_section = []
                    doc.add_heading(line, level=1)
                # Check for Title Case headings (subheadings)
                elif line.istitle() and len(line) > 5 and not line.startswith('-') and not line.startswith('*') and not line.startswith(('1.', '2.', '3.', '4.', '5.')):
                    if current_section:
                        self._process_section(doc, current_section)
                        current_section = []
                    doc.add_heading(line, level=2)
                # Check for markdown headings (fallback)
                elif line.startswith('# '):
                    # H1 heading
                    if current_section:
                        self._process_section(doc, current_section)
                        current_section = []
                    doc.add_heading(line[2:], level=1)
                elif line.startswith('## '):
                    # H2 heading
                    if current_section:
                        self._process_section(doc, current_section)
                        current_section = []
                    doc.add_heading(line[3:], level=2)
                elif line.startswith('### '):
                    # H3 heading
                    if current_section:
                        self._process_section(doc, current_section)
                        current_section = []
                    doc.add_heading(line[4:], level=3)
                else:
                    current_section.append(line)
            # Process remaining content
            if current_section:
                self._process_section(doc, current_section)
            # Save to buffer
            buffer = io.BytesIO()
            doc.save(buffer)
            buffer.seek(0)
            # Convert to base64
            docx_bytes = buffer.getvalue()
            docx_base64 = base64.b64encode(docx_bytes).decode('utf-8')
            return docx_base64
        except Exception as e:
            self.logger.error(f"Error generating DOCX: {str(e)}")
            raise

    def _setup_document_styles(self, doc):
        """Set up document styles (Calibri body, bold Calibri headings)."""
        try:
            # Set default font
            style = doc.styles['Normal']
            font = style.font
            font.name = 'Calibri'
            font.size = Pt(11)
            # Set heading styles
            for i in range(1, 4):
                heading_style = doc.styles[f'Heading {i}']
                heading_font = heading_style.font
                heading_font.name = 'Calibri'
                # Sizes step down: H1=14pt, H2=12pt, H3=10pt.
                heading_font.size = Pt(16 - i * 2)
                heading_font.bold = True
        except Exception as e:
            self.logger.warning(f"Could not set up document styles: {str(e)}")

    def _process_section(self, doc, lines: list):
        """Process a section of content into DOCX elements.

        NOTE(review): on the first line containing '|' the WHOLE section is
        re-scanned as one table and processing stops (unconditional return),
        so non-table lines after the table are dropped — confirm intended.
        """
        for line in lines:
            if not line.strip():
                continue
            # Check for tables (lines with |)
            if '|' in line and not line.startswith('|'):
                # This might be part of a table, process as table
                table_data = self._extract_table_data(lines)
                if table_data:
                    self._add_table(doc, table_data)
                return
            # Check for lists
            if line.startswith('- ') or line.startswith('* '):
                # This is a list item
                doc.add_paragraph(line[2:], style='List Bullet')
            elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')):
                # This is a numbered list item
                doc.add_paragraph(line[3:], style='List Number')
            else:
                # Regular paragraph
                doc.add_paragraph(line)

    def _extract_table_data(self, lines: list) -> list:
        """Extract table data from lines.

        Collects '|'-delimited rows; requires at least two rows (header +
        data) to count as a table, otherwise returns [].
        """
        table_data = []
        in_table = False
        for line in lines:
            if '|' in line:
                if not in_table:
                    in_table = True
                # Split by | and clean up
                cells = [cell.strip() for cell in line.split('|') if cell.strip()]
                if cells:
                    table_data.append(cells)
            elif in_table and not line.strip():
                # Empty line, might be end of table
                break
        return table_data if len(table_data) > 1 else []

    def _add_table(self, doc, table_data: list):
        """Add a table to the document (column count taken from first row)."""
        try:
            if not table_data:
                return
            # Create table
            table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
            table.alignment = WD_TABLE_ALIGNMENT.CENTER
            # Add data to table
            for row_idx, row_data in enumerate(table_data):
                for col_idx, cell_data in enumerate(row_data):
                    # Guard against ragged rows wider than the first row.
                    if col_idx < len(table.rows[row_idx].cells):
                        table.rows[row_idx].cells[col_idx].text = cell_data
            # Style the table
            self._style_table(table)
        except Exception as e:
            self.logger.warning(f"Could not add table: {str(e)}")

    def _style_table(self, table):
        """Apply styling to the table (bolds every run in the header row)."""
        try:
            # Style header row
            if len(table.rows) > 0:
                header_cells = table.rows[0].cells
                for cell in header_cells:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.bold = True
        except Exception as e:
            self.logger.warning(f"Could not style table: {str(e)}")

View file

@ -1,210 +0,0 @@
"""
Excel renderer for report generation using openpyxl.
"""
from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
import io
import base64
from datetime import datetime, UTC
try:
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter
from openpyxl.worksheet.table import Table, TableStyleInfo
OPENPYXL_AVAILABLE = True
except ImportError:
OPENPYXL_AVAILABLE = False
class ExcelRenderer(BaseRenderer):
    """Renders content to Excel format using openpyxl.

    Note: on the success path render() returns the workbook as a
    base64-encoded string (see _generate_excel), not raw bytes.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported Excel formats."""
        return ['xlsx', 'xls', 'excel']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['spreadsheet', 'workbook']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for Excel renderer (higher number wins contested formats)."""
        return 110

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only Excel-specific guidelines; global prompt is built centrally."""
        return (
            "EXCEL FORMAT GUIDELINES:\n"
            "- Output one or more pipe-delimited tables with a single header row.\n"
            "- Let user intent define columns; use clear names and ISO dates.\n"
            "- Separate multiple tables by a single blank line.\n"
            "- No markdown/HTML/code fences; tables only unless user explicitly asks for notes.\n"
            "OUTPUT: Return ONLY pipe-delimited tables suitable for import."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to Excel format.

        Returns (base64_xlsx, xlsx_mime) on success; degrades to CSV when
        openpyxl is missing or generation fails (never raises to the caller).
        """
        try:
            if not OPENPYXL_AVAILABLE:
                # Fallback to CSV if openpyxl not available
                from .csv_renderer import CsvRenderer
                csv_renderer = CsvRenderer()
                csv_content, _ = await csv_renderer.render(extracted_content, title)
                return csv_content, "text/csv"
            # Generate Excel using openpyxl
            excel_content = self._generate_excel(extracted_content, title)
            return excel_content, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        except Exception as e:
            self.logger.error(f"Error rendering Excel: {str(e)}")
            # Return CSV fallback
            return f"Title,Content\n{title},Error rendering Excel report: {str(e)}", "text/csv"

    def _generate_excel(self, content: str, title: str) -> str:
        """Generate Excel content using openpyxl.

        Builds a fixed three-sheet workbook (Summary, Data, Analysis) and
        returns it base64-encoded.
        """
        try:
            # Create workbook
            wb = Workbook()
            # Remove default sheet
            wb.remove(wb.active)
            # Create sheets
            summary_sheet = wb.create_sheet("Summary", 0)
            data_sheet = wb.create_sheet("Data", 1)
            analysis_sheet = wb.create_sheet("Analysis", 2)
            # Add content to sheets
            self._populate_summary_sheet(summary_sheet, title)
            self._populate_data_sheet(data_sheet, content)
            self._populate_analysis_sheet(analysis_sheet, content)
            # Save to buffer
            buffer = io.BytesIO()
            wb.save(buffer)
            buffer.seek(0)
            # Convert to base64
            excel_bytes = buffer.getvalue()
            excel_base64 = base64.b64encode(excel_bytes).decode('utf-8')
            return excel_base64
        except Exception as e:
            self.logger.error(f"Error generating Excel: {str(e)}")
            raise

    def _populate_summary_sheet(self, sheet, title: str):
        """Populate the summary sheet (title, timestamp, live item-count formula)."""
        try:
            # Title
            sheet['A1'] = title
            sheet['A1'].font = Font(size=16, bold=True)
            sheet['A1'].alignment = Alignment(horizontal='center')
            # Generation info
            sheet['A3'] = "Generated:"
            sheet['B3'] = self._format_timestamp()
            sheet['A4'] = "Status:"
            sheet['B4'] = "Generated Successfully"
            # Key metrics placeholder
            sheet['A6'] = "Key Metrics:"
            sheet['A6'].font = Font(bold=True)
            sheet['A7'] = "Total Items:"
            sheet['B7'] = "=COUNTA(Data!A:A)-1"  # Count non-empty cells in Data sheet
            # Auto-adjust column widths
            sheet.column_dimensions['A'].width = 20
            sheet.column_dimensions['B'].width = 30
        except Exception as e:
            self.logger.warning(f"Could not populate summary sheet: {str(e)}")

    def _populate_data_sheet(self, sheet, content: str):
        """Populate the data sheet from pipe-delimited AI output.

        Fixed 5-column layout; pipe rows are split across columns (extra
        cells beyond 5 are discarded), plain lines go into column A.
        """
        try:
            # Headers
            headers = ["Item/Category", "Value/Amount", "Percentage", "Source Document", "Notes/Comments"]
            for col, header in enumerate(headers, 1):
                cell = sheet.cell(row=1, column=col, value=header)
                cell.font = Font(bold=True)
                cell.fill = PatternFill(start_color="CCCCCC", end_color="CCCCCC", fill_type="solid")
            # Process content
            lines = content.split('\n')
            row = 2
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                # Check for table data (lines with |)
                if '|' in line:
                    cells = [cell.strip() for cell in line.split('|') if cell.strip()]
                    for col, cell_data in enumerate(cells[:5], 1):  # Limit to 5 columns
                        sheet.cell(row=row, column=col, value=cell_data)
                    row += 1
                else:
                    # Regular content
                    sheet.cell(row=row, column=1, value=line)
                    row += 1
            # Auto-adjust column widths
            for col in range(1, 6):
                sheet.column_dimensions[get_column_letter(col)].width = 20
        except Exception as e:
            self.logger.warning(f"Could not populate data sheet: {str(e)}")

    def _populate_analysis_sheet(self, sheet, content: str):
        """Populate the analysis sheet with simple line-type statistics."""
        try:
            # Title
            sheet['A1'] = "Analysis & Insights"
            sheet['A1'].font = Font(size=14, bold=True)
            # Content analysis
            lines = content.split('\n')
            row = 3
            sheet['A3'] = "Content Analysis:"
            sheet['A3'].font = Font(bold=True)
            row += 1
            # Count different types of content
            table_lines = sum(1 for line in lines if '|' in line)
            list_lines = sum(1 for line in lines if line.startswith(('- ', '* ')))
            text_lines = len(lines) - table_lines - list_lines
            sheet[f'A{row}'] = f"Total Lines: {len(lines)}"
            row += 1
            sheet[f'A{row}'] = f"Table Rows: {table_lines}"
            row += 1
            sheet[f'A{row}'] = f"List Items: {list_lines}"
            row += 1
            sheet[f'A{row}'] = f"Text Lines: {text_lines}"
            row += 2
            # Recommendations
            sheet[f'A{row}'] = "Recommendations:"
            sheet[f'A{row}'].font = Font(bold=True)
            row += 1
            sheet[f'A{row}'] = "1. Review data accuracy"
            row += 1
            sheet[f'A{row}'] = "2. Consider additional analysis"
            row += 1
            sheet[f'A{row}'] = "3. Update regularly"
            # Auto-adjust column width
            sheet.column_dimensions['A'].width = 30
        except Exception as e:
            self.logger.warning(f"Could not populate analysis sheet: {str(e)}")

View file

@ -1,69 +0,0 @@
"""
HTML renderer for report generation.
"""
from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
class HtmlRenderer(BaseRenderer):
    """Renders content to HTML format with format-specific extraction.

    The AI emits a full HTML document; this renderer only strips fences and
    guarantees a DOCTYPE-prefixed result.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Formats handled natively by this renderer."""
        return ['html', 'htm']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Alias names that resolve to the HTML renderer."""
        return ['web', 'webpage']

    @classmethod
    def get_priority(cls) -> int:
        """Selection priority when several renderers claim a format/alias."""
        return 100

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only HTML-specific guidelines; global prompt is built centrally."""
        return (
            "HTML FORMAT GUIDELINES:\n"
            "- Output a complete HTML5 document starting with <!DOCTYPE html>.\n"
            "- Include <html>, <head> with <meta charset=\"UTF-8\"> and <title>, and <body>.\n"
            "- Use semantic elements: <header>, <main>, <section>, <article>, <footer>.\n"
            "- Provide professional CSS in a <style> block; responsive, clean typography.\n"
            "- Use h1/h2/h3 for headings; tables and lists for structure.\n"
            "OUTPUT: Return ONLY valid HTML (no markdown, no code fences)."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Sanitize the AI-produced HTML and return it with its mime type."""
        try:
            cleaned = self._clean_html_content(extracted_content, title)
            return cleaned, "text/html"
        except Exception as e:
            self.logger.error(f"Error rendering HTML: {str(e)}")
            # Minimal self-contained error page so callers still receive HTML.
            return f"<html><head><title>{title}</title></head><body><h1>{title}</h1><p>Error rendering report: {str(e)}</p></body></html>", "text/html"

    def _clean_html_content(self, content: str, title: str) -> str:
        """Strip code fences and guarantee a DOCTYPE-prefixed document."""
        body = content.strip()
        # Unwrap a single surrounding markdown code fence, if present.
        if body.startswith("```") and body.endswith("```"):
            fence_lines = body.split('\n')
            if len(fence_lines) > 2:
                body = '\n'.join(fence_lines[1:-1]).strip()
        # Guarantee the document opens with a DOCTYPE declaration.
        if not body.startswith('<!DOCTYPE'):
            if body.startswith('<html'):
                body = '<!DOCTYPE html>\n' + body
            else:
                body = f'<!DOCTYPE html>\n<html>\n<head><meta charset="UTF-8"><title>{title}</title></head>\n<body>\n{body}\n</body>\n</html>'
        return body

View file

@ -1,74 +0,0 @@
"""
JSON renderer for report generation.
"""
from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
import json
class JsonRenderer(BaseRenderer):
    """Renders content to JSON format with format-specific extraction.

    The AI emits JSON directly; this renderer strips fences, validates, and
    pretty-prints the payload.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """JSON is the only native format."""
        return ['json']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Alias names that resolve to the JSON renderer."""
        return ['data']

    @classmethod
    def get_priority(cls) -> int:
        """Selection priority when several renderers claim a format/alias."""
        return 80

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only JSON-specific guidelines; global prompt is built centrally."""
        return (
            "JSON FORMAT GUIDELINES:\n"
            "- Output ONLY a single valid JSON object (no fences, no pre/post text).\n"
            "- Choose a structure that best fits the user's intent; include a top-level title and data.\n"
            "- Prefer arrays/objects that map cleanly to the extracted facts.\n"
            "- Include minimal metadata only if useful (e.g., generatedAt, sources).\n"
            "OUTPUT: Return ONLY valid, parseable JSON."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Validate and format the AI-produced JSON; never propagates errors."""
        try:
            cleaned = self._clean_json_content(extracted_content, title)
            return cleaned, "application/json"
        except Exception as e:
            self.logger.error(f"Error rendering JSON: {str(e)}")
            # Minimal structured fallback so callers still receive JSON.
            fallback_data = {
                "title": title,
                "sections": [{"type": "text", "content": f"Error rendering report: {str(e)}"}],
                "metadata": {"error": str(e)}
            }
            return json.dumps(fallback_data, indent=2), "application/json"

    def _clean_json_content(self, content: str, title: str) -> str:
        """Unwrap fences, then validate and pretty-print the JSON payload."""
        body = content.strip()
        # Unwrap a single surrounding markdown code fence, if present.
        if body.startswith("```") and body.endswith("```"):
            fence_lines = body.split('\n')
            if len(fence_lines) > 2:
                body = '\n'.join(fence_lines[1:-1]).strip()
        # Validate; on failure hand back the raw text unchanged.
        try:
            parsed = json.loads(body)
        except json.JSONDecodeError:
            return body
        return json.dumps(parsed, indent=2, ensure_ascii=False)

View file

@ -1,65 +0,0 @@
"""
Markdown renderer for report generation.
"""
from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
class MarkdownRenderer(BaseRenderer):
    """Renders content to Markdown format with format-specific extraction.

    The AI emits Markdown directly; this renderer only strips whitespace and
    an optional surrounding code fence.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Markdown format names handled natively."""
        return ['md', 'markdown']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Alias names that resolve to the Markdown renderer."""
        return ['mdown', 'mkd']

    @classmethod
    def get_priority(cls) -> int:
        """Selection priority when several renderers claim a format/alias."""
        return 95

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only Markdown-specific guidelines; global prompt is built centrally."""
        return (
            "MARKDOWN FORMAT GUIDELINES:\n"
            "- Use proper Markdown syntax only (no HTML wrappers).\n"
            "- # for main title, ## for sections, ### for subsections.\n"
            "- Tables with | separators and a header row.\n"
            "- Bullet lists with - or *.\n"
            "- Emphasis with **bold** and *italic*.\n"
            "- Code blocks with ```language.\n"
            "- Horizontal rules (---) to separate major sections when helpful.\n"
            "- Include links [text](url) and images ![alt](url) when referenced by sources.\n"
            "OUTPUT: Return ONLY raw Markdown content without code fences."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Pass AI-produced Markdown through after cleanup; never propagates errors."""
        try:
            cleaned = self._clean_markdown_content(extracted_content, title)
            return cleaned, "text/markdown"
        except Exception as e:
            self.logger.error(f"Error rendering markdown: {str(e)}")
            # Minimal fallback so callers still receive Markdown.
            return f"# {title}\n\nError rendering report: {str(e)}", "text/markdown"

    def _clean_markdown_content(self, content: str, title: str) -> str:
        """Trim whitespace and unwrap a single surrounding ``` fence, if any."""
        body = content.strip()
        fenced = body.startswith("```") and body.endswith("```")
        if fenced:
            fence_lines = body.split('\n')
            if len(fence_lines) > 2:
                body = '\n'.join(fence_lines[1:-1]).strip()
        return body

View file

@ -1,225 +0,0 @@
"""
PDF renderer for report generation using reportlab.
"""
from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
import io
import base64
from datetime import datetime, UTC
try:
from reportlab.lib.pagesizes import letter, A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
REPORTLAB_AVAILABLE = True
except ImportError:
REPORTLAB_AVAILABLE = False
class PdfRenderer(BaseRenderer):
    """Renders content to PDF format using reportlab.

    The finished PDF is returned as a base64-encoded string with the
    "application/pdf" MIME type. When reportlab is not installed the
    renderer falls back to HTML output.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported PDF formats."""
        return ['pdf']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['document', 'print']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for PDF renderer."""
        return 120

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only PDF-specific guidelines; global prompt is built centrally."""
        return (
            "PDF FORMAT GUIDELINES:\n"
            "- Provide structured content suitable for pagination and headings (H1/H2/H3-like).\n"
            "- Use bullet lists and tables where useful; separate major sections clearly.\n"
            "- Avoid markdown/HTML; produce clean, plain content that can be laid out as PDF.\n"
            "OUTPUT: Return ONLY the PDF-ready textual content (no fences)."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to PDF format.

        Returns:
            tuple: (base64-encoded PDF, "application/pdf") on success; an
            HTML fallback when reportlab is missing, or a plain-text error
            message if generation fails.
        """
        try:
            if not REPORTLAB_AVAILABLE:
                # Fallback to HTML if reportlab not available
                from .html_renderer import HtmlRenderer
                html_renderer = HtmlRenderer()
                html_content, _ = await html_renderer.render(extracted_content, title)
                return html_content, "text/html"
            # Generate PDF using reportlab
            pdf_content = self._generate_pdf(extracted_content, title)
            return pdf_content, "application/pdf"
        except Exception as e:
            self.logger.error(f"Error rendering PDF: {str(e)}")
            # Return minimal fallback
            return f"PDF Generation Error: {str(e)}", "text/plain"

    def _generate_pdf(self, content: str, title: str) -> str:
        """Generate PDF content using reportlab.

        Walks the content line by line; markdown-style headings flush the
        buffered section so paragraphs, lists and tables stay grouped.
        Returns the PDF bytes base64-encoded.
        """
        try:
            # Create a buffer to hold the PDF
            buffer = io.BytesIO()
            doc = SimpleDocTemplate(
                buffer,
                pagesize=A4,
                rightMargin=72,
                leftMargin=72,
                topMargin=72,
                bottomMargin=18
            )
            styles = getSampleStyleSheet()
            # Custom styles for document title and section headings
            title_style = ParagraphStyle(
                'CustomTitle',
                parent=styles['Heading1'],
                fontSize=24,
                spaceAfter=30,
                alignment=TA_CENTER,
                textColor=colors.darkblue
            )
            heading_style = ParagraphStyle(
                'CustomHeading',
                parent=styles['Heading2'],
                fontSize=16,
                spaceAfter=12,
                spaceBefore=12,
                textColor=colors.darkblue
            )
            story = []
            # Title page
            story.append(Paragraph(title, title_style))
            story.append(Spacer(1, 20))
            story.append(Paragraph(f"Generated: {self._format_timestamp()}", styles['Normal']))
            story.append(PageBreak())
            lines = content.split('\n')
            current_section = []
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                if line.startswith('# '):
                    # H1 heading: flush buffered section first
                    if current_section:
                        story.extend(self._process_section(current_section, styles))
                        current_section = []
                    story.append(Paragraph(line[2:], title_style))
                    story.append(Spacer(1, 12))
                elif line.startswith('## '):
                    # H2 heading
                    if current_section:
                        story.extend(self._process_section(current_section, styles))
                        current_section = []
                    story.append(Paragraph(line[3:], heading_style))
                    story.append(Spacer(1, 8))
                elif line.startswith('### '):
                    # H3 heading
                    if current_section:
                        story.extend(self._process_section(current_section, styles))
                        current_section = []
                    story.append(Paragraph(line[4:], styles['Heading3']))
                    story.append(Spacer(1, 6))
                else:
                    current_section.append(line)
            # Process remaining content
            if current_section:
                story.extend(self._process_section(current_section, styles))
            doc.build(story)
            # Get PDF content as base64
            buffer.seek(0)
            pdf_bytes = buffer.getvalue()
            return base64.b64encode(pdf_bytes).decode('utf-8')
        except Exception as e:
            self.logger.error(f"Error generating PDF: {str(e)}")
            raise

    def _process_section(self, lines: list, styles) -> list:
        """Process a section of content into PDF elements.

        Bug fix: the previous implementation executed ``return elements``
        from inside the loop immediately after rendering the first table,
        silently dropping every remaining line of the section. Table lines
        are now consumed in place and the loop keeps processing the rest.
        """
        elements = []
        table_rendered = False
        for line in lines:
            if not line.strip():
                continue
            if '|' in line:
                # Render the section's table exactly once from all
                # pipe-delimited lines; later table lines are skipped.
                if not table_rendered:
                    table_rendered = True
                    table_data = self._extract_table_data(lines)
                    if table_data:
                        table = Table(table_data)
                        table.setStyle(TableStyle([
                            ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
                            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
                            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
                            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                            ('FONTSIZE', (0, 0), (-1, 0), 14),
                            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
                            ('GRID', (0, 0), (-1, -1), 1, colors.black)
                        ]))
                        elements.append(table)
                        elements.append(Spacer(1, 12))
                continue
            if line.startswith('- ') or line.startswith('* '):
                # List item (rendered as a plain paragraph)
                elements.append(Paragraph(f"{line[2:]}", styles['Normal']))
            else:
                # Regular paragraph
                elements.append(Paragraph(line, styles['Normal']))
            elements.append(Spacer(1, 6))
        return elements

    def _extract_table_data(self, lines: list) -> list:
        """Extract table rows from pipe-delimited lines.

        Stops at the first blank line after the table starts. Returns the
        rows only when there is more than one (header + data), else [].
        """
        table_data = []
        in_table = False
        for line in lines:
            if '|' in line:
                if not in_table:
                    in_table = True
                # Split by | and clean up
                cells = [cell.strip() for cell in line.split('|') if cell.strip()]
                if cells:
                    table_data.append(cells)
            elif in_table and not line.strip():
                # Empty line, might be end of table
                break
        return table_data if len(table_data) > 1 else []

View file

@ -6,7 +6,7 @@ import logging
import importlib
import pkgutil
from typing import Dict, Type, List, Optional
from .base_renderer import BaseRenderer
from .rendererBaseTemplate import BaseRenderer
logger = logging.getLogger(__name__)
@ -37,7 +37,7 @@ class RendererRegistry:
# Scan all Python files in the renderers directory
for file_path in renderers_dir.glob("*.py"):
if file_path.name in ['registry.py', 'base_renderer.py', '__init__.py']:
if file_path.name in ['registry.py', 'rendererBaseTemplate.py', '__init__.py']:
continue
# Extract module name from filename
@ -92,7 +92,7 @@ class RendererRegistry:
except Exception as e:
logger.error(f"Error registering renderer {renderer_class.__name__}: {str(e)}")
def get_renderer(self, output_format: str) -> Optional[BaseRenderer]:
def get_renderer(self, output_format: str, services=None) -> Optional[BaseRenderer]:
"""Get a renderer instance for the specified format."""
if not self._discovered:
self.discover_renderers()
@ -109,7 +109,7 @@ class RendererRegistry:
if renderer_class:
try:
return renderer_class()
return renderer_class(services=services)
except Exception as e:
logger.error(f"Error creating renderer instance for {format_name}: {str(e)}")
return None
@ -144,9 +144,9 @@ class RendererRegistry:
# Global registry instance
_registry = RendererRegistry()
def get_renderer(output_format: str) -> Optional[BaseRenderer]:
def get_renderer(output_format: str, services=None) -> Optional[BaseRenderer]:
"""Get a renderer instance for the specified format."""
return _registry.get_renderer(output_format)
return _registry.get_renderer(output_format, services)
def get_supported_formats() -> List[str]:
"""Get list of all supported formats."""

View file

@ -0,0 +1,459 @@
"""
Base renderer class for all format renderers.
"""
from abc import ABC, abstractmethod
from typing import Dict, Any, Tuple, List
import logging
import json
logger = logging.getLogger(__name__)
class BaseRenderer(ABC):
"""Base class for all format renderers."""
def __init__(self, services=None):
self.logger = logger
self.services = services # Add services attribute
@classmethod
def get_supported_formats(cls) -> List[str]:
"""
Return list of supported format names for this renderer.
Override this method in subclasses to specify supported formats.
"""
return []
@classmethod
def get_format_aliases(cls) -> List[str]:
"""
Return list of format aliases for this renderer.
Override this method in subclasses to specify format aliases.
"""
return []
@classmethod
def get_priority(cls) -> int:
"""
Return priority for this renderer (higher number = higher priority).
Used when multiple renderers support the same format.
"""
return 0
@abstractmethod
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
"""
Render extracted JSON content to the target format.
Args:
extracted_content: Structured JSON content with sections and metadata
title: Report title
user_prompt: Original user prompt for context
ai_service: AI service instance for additional processing
Returns:
tuple: (rendered_content, mime_type)
"""
pass
def _extract_sections(self, report_data: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Extract sections from report data."""
return report_data.get('sections', [])
def _extract_metadata(self, report_data: Dict[str, Any]) -> Dict[str, Any]:
"""Extract metadata from report data."""
return report_data.get('metadata', {})
def _get_title(self, report_data: Dict[str, Any], fallback_title: str) -> str:
"""Get title from report data or use fallback."""
metadata = report_data.get('metadata', {})
return metadata.get('title', fallback_title)
def _validate_json_structure(self, json_content: Dict[str, Any]) -> bool:
"""Validate that JSON content has the expected structure."""
if not isinstance(json_content, dict):
return False
if "sections" not in json_content:
return False
sections = json_content.get("sections", [])
if not isinstance(sections, list):
return False
# Validate each section has content_type and elements
for section in sections:
if not isinstance(section, dict):
return False
if "content_type" not in section or "elements" not in section:
return False
return True
def _get_section_type(self, section: Dict[str, Any]) -> str:
"""Get the type of a section."""
return section.get("content_type", "paragraph")
def _get_section_data(self, section: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Get the elements of a section."""
return section.get("elements", [])
def _get_section_id(self, section: Dict[str, Any]) -> str:
"""Get the ID of a section (if available)."""
return section.get("id", "unknown")
def _extract_table_data(self, section_data: Dict[str, Any]) -> Tuple[List[str], List[List[str]]]:
"""Extract table headers and rows from section data."""
headers = section_data.get("headers", [])
rows = section_data.get("rows", [])
return headers, rows
def _extract_bullet_list_items(self, section_data: Dict[str, Any]) -> List[str]:
"""Extract bullet list items from section data."""
items = section_data.get("items", [])
result = []
for item in items:
if isinstance(item, str):
result.append(item)
elif isinstance(item, dict) and "text" in item:
result.append(item["text"])
return result
def _extract_heading_data(self, section_data: Dict[str, Any]) -> Tuple[int, str]:
"""Extract heading level and text from section data."""
level = section_data.get("level", 1)
text = section_data.get("text", "")
return level, text
def _extract_paragraph_text(self, section_data: Dict[str, Any]) -> str:
"""Extract paragraph text from section data."""
return section_data.get("text", "")
def _extract_code_block_data(self, section_data: Dict[str, Any]) -> Tuple[str, str]:
"""Extract code and language from section data."""
code = section_data.get("code", "")
language = section_data.get("language", "")
return code, language
def _extract_image_data(self, section_data: Dict[str, Any]) -> Tuple[str, str]:
"""Extract base64 data and alt text from section data."""
base64_data = section_data.get("base64Data", "")
alt_text = section_data.get("altText", "Image")
return base64_data, alt_text
def _render_image_section(self, section: Dict[str, Any], styles: Dict[str, Any] = None) -> Any:
"""
Render an image section. This is a base implementation that should be overridden
by format-specific renderers.
Args:
section: Image section data
styles: Optional styling information
Returns:
Format-specific image representation
"""
section_data = self._get_section_data(section)
base64_data, alt_text = self._extract_image_data(section_data)
# Base implementation returns a simple dict
# Format-specific renderers should override this method
return {
"content_type": "image",
"base64Data": base64_data,
"altText": alt_text,
"width": section_data.get("width", None),
"height": section_data.get("height", None),
"caption": section_data.get("caption", "")
}
def _validate_image_data(self, base64_data: str, alt_text: str) -> bool:
"""Validate image data."""
if not base64_data:
self.logger.warning("Image section has no base64 data")
return False
if not alt_text:
self.logger.warning("Image section has no alt text")
return False
# Basic base64 validation
try:
import base64
base64.b64decode(base64_data, validate=True)
return True
except Exception as e:
self.logger.warning(f"Invalid base64 image data: {str(e)}")
return False
def _get_image_dimensions(self, base64_data: str) -> Tuple[int, int]:
"""
Get image dimensions from base64 data.
This is a helper method that format-specific renderers can use.
"""
try:
import base64
from PIL import Image
import io
# Decode base64 data
image_data = base64.b64decode(base64_data)
image = Image.open(io.BytesIO(image_data))
return image.size # Returns (width, height)
except Exception as e:
self.logger.warning(f"Could not determine image dimensions: {str(e)}")
return (0, 0)
def _resize_image_if_needed(self, base64_data: str, max_width: int = 800, max_height: int = 600) -> str:
"""
Resize image if it exceeds maximum dimensions.
Returns the resized image as base64 string.
"""
try:
import base64
from PIL import Image
import io
# Decode base64 data
image_data = base64.b64decode(base64_data)
image = Image.open(io.BytesIO(image_data))
# Check if resizing is needed
width, height = image.size
if width <= max_width and height <= max_height:
return base64_data # No resizing needed
# Calculate new dimensions maintaining aspect ratio
ratio = min(max_width / width, max_height / height)
new_width = int(width * ratio)
new_height = int(height * ratio)
# Resize image
resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
# Convert back to base64
buffer = io.BytesIO()
resized_image.save(buffer, format=image.format or 'PNG')
resized_data = buffer.getvalue()
return base64.b64encode(resized_data).decode('utf-8')
except Exception as e:
self.logger.warning(f"Could not resize image: {str(e)}")
return base64_data # Return original if resize fails
def _get_supported_section_types(self) -> List[str]:
"""Return list of supported section types."""
return ["table", "bullet_list", "heading", "paragraph", "code_block", "image"]
def _is_valid_section_type(self, section_type: str) -> bool:
"""Check if a section type is valid."""
return section_type in self._get_supported_section_types()
def _process_section_by_type(self, section: Dict[str, Any]) -> Dict[str, Any]:
"""Process a section and return structured data based on its type."""
section_type = self._get_section_type(section)
section_data = self._get_section_data(section)
if section_type == "table":
headers, rows = self._extract_table_data(section_data)
return {"content_type": "table", "headers": headers, "rows": rows}
elif section_type == "bullet_list":
items = self._extract_bullet_list_items(section_data)
return {"content_type": "bullet_list", "items": items}
elif section_type == "heading":
level, text = self._extract_heading_data(section_data)
return {"content_type": "heading", "level": level, "text": text}
elif section_type == "paragraph":
text = self._extract_paragraph_text(section_data)
return {"content_type": "paragraph", "text": text}
elif section_type == "code_block":
code, language = self._extract_code_block_data(section_data)
return {"content_type": "code_block", "code": code, "language": language}
elif section_type == "image":
base64_data, alt_text = self._extract_image_data(section_data)
# Validate image data
if self._validate_image_data(base64_data, alt_text):
return {
"content_type": "image",
"base64Data": base64_data,
"altText": alt_text,
"width": section_data.get("width"),
"height": section_data.get("height"),
"caption": section_data.get("caption", "")
}
else:
# Return placeholder if image data is invalid
return {"content_type": "paragraph", "text": f"[Image: {alt_text}]"}
else:
# Fallback to paragraph
text = self._extract_paragraph_text(section_data)
return {"content_type": "paragraph", "text": text}
def _format_timestamp(self, timestamp: str = None) -> str:
"""Format timestamp for display."""
if timestamp:
return timestamp
from datetime import datetime, UTC
return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
# ===== GENERIC AI STYLING HELPERS =====
async def _get_ai_styles(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
"""
Generic AI styling method that can be used by all renderers.
Args:
ai_service: AI service instance
style_template: Format-specific style template
default_styles: Default styles to fall back to
Returns:
Dict with styling definitions
"""
# DEBUG: Show which renderer is calling this method
if not ai_service:
return default_styles
try:
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
request_options = AiCallOptions()
request_options.operationType = OperationType.GENERAL
request = AiCallRequest(prompt=style_template, context="", options=request_options)
# DEBUG: Show the actual prompt being sent to AI
self.logger.debug(f"AI Style Template Prompt:")
self.logger.debug(f"{style_template}")
response = await ai_service.aiObjects.call(request)
import json
import re
# Clean and parse JSON
result = response.content.strip() if response and response.content else ""
# Check if result is empty
if not result:
self.logger.warning("AI styling returned empty response, using defaults")
return default_styles
# Extract JSON from markdown if present
json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
if json_match:
result = json_match.group(1).strip()
elif result.startswith('```json'):
result = re.sub(r'^```json\s*', '', result)
result = re.sub(r'\s*```$', '', result)
elif result.startswith('```'):
result = re.sub(r'^```\s*', '', result)
result = re.sub(r'\s*```$', '', result)
# Try to parse JSON
try:
styles = json.loads(result)
except json.JSONDecodeError as json_error:
self.logger.warning(f"AI styling returned invalid JSON: {json_error}")
# Use print instead of logger to avoid truncation
self.services.utils.debugLogToFile(f"FULL AI RESPONSE THAT FAILED TO PARSE: {result}", "RENDERER")
self.services.utils.debugLogToFile(f"RESPONSE LENGTH: {len(result)} characters", "RENDERER")
self.logger.warning(f"Raw content that failed to parse: {result}")
# Try to fix incomplete JSON by adding missing closing braces
open_braces = result.count('{')
close_braces = result.count('}')
if open_braces > close_braces:
# JSON is incomplete, add missing closing braces
missing_braces = open_braces - close_braces
result = result + '}' * missing_braces
self.logger.info(f"Added {missing_braces} missing closing brace(s)")
self.logger.debug(f"Fixed JSON: {result}")
# Try parsing the fixed JSON
try:
styles = json.loads(result)
self.logger.info("Successfully fixed incomplete JSON")
except json.JSONDecodeError as fix_error:
self.logger.warning(f"Fixed JSON still invalid: {fix_error}")
self.logger.warning(f"Fixed JSON content: {result}")
# Try to extract just the JSON part if it's embedded in text
json_start = result.find('{')
json_end = result.rfind('}')
if json_start != -1 and json_end != -1 and json_end > json_start:
json_part = result[json_start:json_end+1]
try:
styles = json.loads(json_part)
self.logger.info("Successfully extracted JSON from explanatory text")
except json.JSONDecodeError:
self.logger.warning("Could not extract valid JSON from response, using defaults")
return default_styles
else:
return default_styles
else:
# Try to extract just the JSON part if it's embedded in text
json_start = result.find('{')
json_end = result.rfind('}')
if json_start != -1 and json_end != -1 and json_end > json_start:
json_part = result[json_start:json_end+1]
try:
styles = json.loads(json_part)
self.logger.info("Successfully extracted JSON from explanatory text")
except json.JSONDecodeError:
self.logger.warning("Could not extract valid JSON from response, using defaults")
return default_styles
else:
return default_styles
# Convert colors to appropriate format
styles = self._convert_colors_format(styles)
return styles
except Exception as e:
self.logger.warning(f"AI styling failed: {str(e)}, using defaults")
return default_styles
def _convert_colors_format(self, styles: Dict[str, Any]) -> Dict[str, Any]:
"""
Convert colors to appropriate format based on renderer type.
Override this method in subclasses for format-specific color handling.
"""
return styles
def _create_ai_style_template(self, format_name: str, user_prompt: str, style_schema: Dict[str, Any]) -> str:
"""
Create a standardized AI style template for any format.
Args:
format_name: Name of the format (e.g., "docx", "xlsx", "pptx")
user_prompt: User's original prompt
style_schema: Format-specific style schema
Returns:
Formatted prompt string
"""
schema_json = json.dumps(style_schema, indent=4)
# DEBUG: Show the schema being sent
return f"""You are a professional document styling expert. Generate a complete JSON styling configuration for {format_name.upper()} documents.
Use this schema as a template and customize the values for professional document styling:
{schema_json}
Requirements:
- Return ONLY the complete JSON object (no markdown, no explanations)
- Customize colors, fonts, and spacing for professional appearance
- Ensure all objects are properly closed with closing braces
- Make the styling modern and professional
Return the complete JSON:"""

View file

@ -0,0 +1,260 @@
"""
CSV renderer for report generation.
"""
from .rendererBaseTemplate import BaseRenderer
from typing import Dict, Any, Tuple, List
import csv
import io
class RendererCsv(BaseRenderer):
    """Renders content to CSV format with format-specific extraction."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported CSV formats."""
        return ['csv']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['spreadsheet', 'table']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for CSV renderer."""
        return 70

    async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
        """Render extracted JSON content to CSV format.

        Returns:
            tuple: (csv_text, "text/csv"); a minimal error CSV on failure.
        """
        try:
            # Generate CSV directly from JSON (no styling needed for CSV)
            csv_content = await self._generate_csv_from_json(extracted_content, title)
            return csv_content, "text/csv"
        except Exception as e:
            self.logger.error(f"Error rendering CSV: {str(e)}")
            # Return minimal CSV fallback
            return f"Title,Content\n{title},Error rendering report: {str(e)}", "text/csv"

    async def _generate_csv_from_json(self, json_content: Dict[str, Any], title: str) -> str:
        """Generate CSV content from structured JSON document.

        Sections are separated by blank rows; a title row (from metadata
        or the fallback title) is emitted first.
        """
        try:
            if not isinstance(json_content, dict):
                raise ValueError("JSON content must be a dictionary")
            if "sections" not in json_content:
                raise ValueError("JSON content must contain 'sections' field")
            # Use title from JSON metadata if available, otherwise use provided title
            document_title = json_content.get("metadata", {}).get("title", title)
            csv_rows = []
            if document_title:
                csv_rows.append([document_title])
                csv_rows.append([])  # Empty row
            sections = json_content.get("sections", [])
            for section in sections:
                section_csv = self._render_json_section_to_csv(section)
                if section_csv:
                    csv_rows.extend(section_csv)
                    csv_rows.append([])  # Empty row between sections
            return self._convert_rows_to_csv(csv_rows)
        except Exception as e:
            self.logger.error(f"Error generating CSV from JSON: {str(e)}")
            raise Exception(f"CSV generation failed: {str(e)}")

    def _render_json_section_to_csv(self, section: Dict[str, Any]) -> List[List[str]]:
        """Render a single JSON section to CSV rows, dispatching on the
        section's content_type (unknown types fall back to paragraph)."""
        try:
            section_type = section.get("content_type", "paragraph")
            elements = section.get("elements", [])
            csv_rows = []
            section_title = section.get("title")
            if section_title:
                csv_rows.append([f"# {section_title}"])
            for element in elements:
                if section_type == "table":
                    csv_rows.extend(self._render_json_table_to_csv(element))
                elif section_type == "list":
                    csv_rows.extend(self._render_json_list_to_csv(element))
                elif section_type == "heading":
                    csv_rows.extend(self._render_json_heading_to_csv(element))
                elif section_type == "paragraph":
                    csv_rows.extend(self._render_json_paragraph_to_csv(element))
                elif section_type == "code":
                    csv_rows.extend(self._render_json_code_to_csv(element))
                else:
                    # Fallback to paragraph for unknown types
                    csv_rows.extend(self._render_json_paragraph_to_csv(element))
            return csv_rows
        except Exception as e:
            self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}")
            return [["[Error rendering section]"]]

    def _render_json_table_to_csv(self, table_data: Dict[str, Any]) -> List[List[str]]:
        """Render a JSON table (headers + rows) to CSV rows."""
        try:
            headers = table_data.get("headers", [])
            rows = table_data.get("rows", [])
            csv_rows = []
            if headers:
                csv_rows.append(headers)
            if rows:
                csv_rows.extend(rows)
            return csv_rows
        except Exception as e:
            self.logger.warning(f"Error rendering table: {str(e)}")
            return [["[Error rendering table]"]]

    def _render_json_list_to_csv(self, list_data: Dict[str, Any]) -> List[List[str]]:
        """Render a JSON list to CSV rows; subitems become indented rows."""
        try:
            items = list_data.get("items", [])
            csv_rows = []
            for item in items:
                if isinstance(item, dict):
                    text = item.get("text", "")
                    subitems = item.get("subitems", [])
                    csv_rows.append([text])
                    for subitem in subitems:
                        if isinstance(subitem, dict):
                            csv_rows.append([f"  - {subitem.get('text', '')}"])
                        else:
                            csv_rows.append([f"  - {subitem}"])
                else:
                    csv_rows.append([str(item)])
            return csv_rows
        except Exception as e:
            self.logger.warning(f"Error rendering list: {str(e)}")
            return [["[Error rendering list]"]]

    def _render_json_heading_to_csv(self, heading_data: Dict[str, Any]) -> List[List[str]]:
        """Render a JSON heading to CSV rows ('#' repeated per level)."""
        try:
            text = heading_data.get("text", "")
            level = heading_data.get("level", 1)
            if text:
                heading_text = f"{'#' * level} {text}"
                return [[heading_text]]
            return []
        except Exception as e:
            self.logger.warning(f"Error rendering heading: {str(e)}")
            return [["[Error rendering heading]"]]

    def _render_json_paragraph_to_csv(self, paragraph_data: Dict[str, Any]) -> List[List[str]]:
        """Render a JSON paragraph to CSV rows, word-wrapping text longer
        than 100 characters across multiple rows."""
        try:
            text = paragraph_data.get("text", "")
            if text:
                if len(text) > 100:
                    words = text.split()
                    rows = []
                    current_row = []
                    current_length = 0
                    for word in words:
                        if current_length + len(word) > 100 and current_row:
                            rows.append([" ".join(current_row)])
                            current_row = [word]
                            current_length = len(word)
                        else:
                            current_row.append(word)
                            current_length += len(word) + 1
                    if current_row:
                        rows.append([" ".join(current_row)])
                    return rows
                else:
                    return [[text]]
            return []
        except Exception as e:
            self.logger.warning(f"Error rendering paragraph: {str(e)}")
            return [["[Error rendering paragraph]"]]

    def _render_json_code_to_csv(self, code_data: Dict[str, Any]) -> List[List[str]]:
        """Render a JSON code block to CSV rows, one row per code line."""
        try:
            code = code_data.get("code", "")
            language = code_data.get("language", "")
            csv_rows = []
            if language:
                csv_rows.append([f"Code ({language}):"])
            if code:
                code_lines = code.split('\n')
                for line in code_lines:
                    csv_rows.append([f"  {line}"])
            return csv_rows
        except Exception as e:
            self.logger.warning(f"Error rendering code block: {str(e)}")
            return [["[Error rendering code block]"]]

    def _convert_rows_to_csv(self, rows: List[List[str]]) -> str:
        """Convert rows to CSV string.

        Bug fix: empty rows are now written as blank lines instead of being
        skipped — _generate_csv_from_json inserts them deliberately as
        title/section separators, and skipping them defeated that spacing.
        """
        import csv
        import io
        output = io.StringIO()
        writer = csv.writer(output)
        for row in rows:
            writer.writerow(row)  # empty row -> blank separator line
        return output.getvalue()

    def _clean_csv_content(self, content: str, title: str) -> str:
        """Clean and validate CSV content from AI (strips a ``` fence)."""
        content = content.strip()
        # Remove markdown code blocks if present
        if content.startswith("```") and content.endswith("```"):
            lines = content.split('\n')
            if len(lines) > 2:
                content = '\n'.join(lines[1:-1]).strip()
        return content

View file

@ -0,0 +1,958 @@
"""
DOCX renderer for report generation using python-docx.
"""
from .rendererBaseTemplate import BaseRenderer
from typing import Dict, Any, Tuple, List
import io
import base64
import re
import os
from datetime import datetime, UTC
try:
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml.shared import OxmlElement, qn
from docx.oxml.ns import nsdecls
from docx.oxml import parse_xml
DOCX_AVAILABLE = True
except ImportError:
DOCX_AVAILABLE = False
class RendererDocx(BaseRenderer):
"""Renders content to DOCX format using python-docx."""
@classmethod
def get_supported_formats(cls) -> List[str]:
"""Return supported DOCX formats."""
return ['docx', 'doc']
@classmethod
def get_format_aliases(cls) -> List[str]:
"""Return format aliases."""
return ['word', 'document']
@classmethod
def get_priority(cls) -> int:
"""Return priority for DOCX renderer."""
return 115
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
"""Render extracted JSON content to DOCX format using AI-analyzed styling."""
self.services.utils.debugLogToFile(f"DOCX RENDER CALLED: title={title}, user_prompt={user_prompt[:50] if user_prompt else 'None'}...", "DOCX_RENDERER")
try:
if not DOCX_AVAILABLE:
# Fallback to HTML if python-docx not available
from .rendererHtml import RendererHtml
html_renderer = RendererHtml()
html_content, _ = await html_renderer.render(extracted_content, title)
return html_content, "text/html"
# Generate DOCX using AI-analyzed styling
docx_content = await self._generate_docx_from_json(extracted_content, title, user_prompt, ai_service)
return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
except Exception as e:
self.logger.error(f"Error rendering DOCX: {str(e)}")
# Return minimal fallback
return f"DOCX Generation Error: {str(e)}", "text/plain"
async def _generate_docx_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
"""Generate DOCX content from structured JSON document using AI-generated styling."""
try:
# Create new document
doc = Document()
# Get AI-generated styling definitions
self.logger.info(f"About to call AI styling with user_prompt: {user_prompt[:100] if user_prompt else 'None'}...")
styles = await self._get_docx_styles(user_prompt, ai_service)
# Apply basic document setup
self._setup_basic_document_styles(doc)
# Validate JSON structure
if not isinstance(json_content, dict):
raise ValueError("JSON content must be a dictionary")
if "sections" not in json_content:
raise ValueError("JSON content must contain 'sections' field")
# Use title from JSON metadata if available, otherwise use provided title
document_title = json_content.get("metadata", {}).get("title", title)
# Add document title using analyzed styles
if document_title:
title_heading = doc.add_heading(document_title, level=1)
title_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
# Process each section in order
sections = json_content.get("sections", [])
for section in sections:
self._render_json_section(doc, section, styles)
# Save to buffer
buffer = io.BytesIO()
doc.save(buffer)
buffer.seek(0)
# Convert to base64
docx_bytes = buffer.getvalue()
docx_base64 = base64.b64encode(docx_bytes).decode('utf-8')
return docx_base64
except Exception as e:
self.logger.error(f"Error generating DOCX from JSON: {str(e)}")
raise Exception(f"DOCX generation failed: {str(e)}")
async def _get_docx_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]:
"""Get DOCX styling definitions using base template AI styling."""
style_schema = {
"title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center"},
"heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"},
"heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"},
"paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"},
"table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center"},
"table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left"},
"table_border": {"style": "horizontal_only", "color": "#000000", "thickness": "thin"},
"bullet_list": {"font_size": 11, "color": "#2F2F2F", "indent": 20},
"code_block": {"font": "Courier New", "font_size": 10, "color": "#2F2F2F", "background": "#F5F5F5"}
}
style_template = self._create_ai_style_template("docx", user_prompt, style_schema)
styles = await self._get_ai_styles(ai_service, style_template, self._get_default_styles())
# Validate and fix contrast issues
return self._validate_styles_contrast(styles)
def _validate_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
"""Validate and fix contrast issues in AI-generated styles."""
try:
# Fix table header contrast
if "table_header" in styles:
header = styles["table_header"]
bg_color = header.get("background", "#FFFFFF")
text_color = header.get("text_color", "#000000")
# If both are white or both are dark, fix it
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
header["background"] = "#4F4F4F"
header["text_color"] = "#FFFFFF"
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
header["background"] = "#4F4F4F"
header["text_color"] = "#FFFFFF"
# Fix table cell contrast
if "table_cell" in styles:
cell = styles["table_cell"]
bg_color = cell.get("background", "#FFFFFF")
text_color = cell.get("text_color", "#000000")
# If both are white or both are dark, fix it
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
cell["background"] = "#FFFFFF"
cell["text_color"] = "#2F2F2F"
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
cell["background"] = "#FFFFFF"
cell["text_color"] = "#2F2F2F"
return styles
except Exception as e:
self.logger.warning(f"Style validation failed: {str(e)}")
return self._get_default_styles()
def _get_default_styles(self) -> Dict[str, Any]:
"""Default DOCX styles."""
return {
"title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center"},
"heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"},
"heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"},
"paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"},
"table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center"},
"table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left"},
"table_border": {"style": "horizontal_only", "color": "#000000", "thickness": "thin"},
"bullet_list": {"font_size": 11, "color": "#2F2F2F", "indent": 20},
"code_block": {"font": "Courier New", "font_size": 10, "color": "#2F2F2F", "background": "#F5F5F5"}
}
def _setup_basic_document_styles(self, doc: Document) -> None:
"""Set up basic document styles."""
try:
# Set default font
style = doc.styles['Normal']
font = style.font
font.name = 'Calibri'
font.size = Pt(11)
except Exception as e:
self.logger.warning(f"Could not set up basic document styles: {str(e)}")
def _clear_template_content(self, doc: Document) -> None:
"""Clear template content while preserving styles."""
try:
# Remove all paragraphs except keep the styles
for paragraph in list(doc.paragraphs):
# Keep the paragraph but clear its content
paragraph.clear()
# Remove all tables
for table in list(doc.tables):
table._element.getparent().remove(table._element)
except Exception as e:
self.logger.warning(f"Could not clear template content: {str(e)}")
def _render_json_section(self, doc: Document, section: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a single JSON section to DOCX using AI-generated styles."""
try:
section_type = section.get("content_type", "paragraph")
elements = section.get("elements", [])
# Process each element in the section
for element in elements:
if section_type == "table":
self._render_json_table(doc, element, styles)
elif section_type == "bullet_list":
self._render_json_bullet_list(doc, element, styles)
elif section_type == "heading":
self._render_json_heading(doc, element, styles)
elif section_type == "paragraph":
self._render_json_paragraph(doc, element, styles)
elif section_type == "code_block":
self._render_json_code_block(doc, element, styles)
elif section_type == "image":
self._render_json_image(doc, element, styles)
else:
# Fallback to paragraph for unknown types
self._render_json_paragraph(doc, element, styles)
except Exception as e:
self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}")
# Add error paragraph as fallback
error_para = doc.add_paragraph(f"[Error rendering section: {str(e)}]")
    def _render_json_table(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
        """Render a JSON table element into the document using AI-generated styles.

        Expects ``table_data`` with 'headers' (list) and 'rows' (list of row
        lists); renders nothing when either is missing. Header cells get the
        'table_header' style (background fill, bold, alignment, 11pt); data
        cells get the 'table_cell' text color, left-aligned at 10pt.
        """
        try:
            headers = table_data.get("headers", [])
            rows = table_data.get("rows", [])
            if not headers or not rows:
                return
            # One extra row on top for the header line.
            table = doc.add_table(rows=len(rows) + 1, cols=len(headers))
            table.alignment = WD_TABLE_ALIGNMENT.CENTER
            # Border scheme chosen by the AI style: horizontal-only, full grid,
            # or (any other value) no explicit borders.
            border_style = styles["table_border"]["style"]
            if border_style == "horizontal_only":
                self._apply_horizontal_borders_only(table)
            elif border_style == "grid":
                table.style = 'Table Grid'
            # Header row: background shading plus per-run text styling.
            header_row = table.rows[0]
            header_style = styles["table_header"]
            for i, header in enumerate(headers):
                if i < len(header_row.cells):
                    cell = header_row.cells[i]
                    cell.text = str(header)
                    # '#RRGGBB' -> RGBColor for the cell shading.
                    bg_color = header_style["background"].lstrip('#')
                    self._set_cell_background(cell, RGBColor(int(bg_color[0:2], 16), int(bg_color[2:4], 16), int(bg_color[4:6], 16)))
                    for paragraph in cell.paragraphs:
                        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER if header_style["align"] == "center" else WD_ALIGN_PARAGRAPH.LEFT
                        for run in paragraph.runs:
                            run.bold = header_style["bold"]
                            run.font.size = Pt(11)
                            text_color = header_style["text_color"].lstrip('#')
                            run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16))
            # Data rows (offset by one because row 0 is the header).
            cell_style = styles["table_cell"]
            for row_idx, row_data in enumerate(rows):
                if row_idx + 1 < len(table.rows):
                    table_row = table.rows[row_idx + 1]
                    for col_idx, cell_data in enumerate(row_data):
                        if col_idx < len(table_row.cells):
                            cell = table_row.cells[col_idx]
                            cell.text = str(cell_data)
                            for paragraph in cell.paragraphs:
                                paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
                                for run in paragraph.runs:
                                    run.font.size = Pt(10)
                                    text_color = cell_style["text_color"].lstrip('#')
                                    run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16))
        except Exception as e:
            self.logger.warning(f"Error rendering table: {str(e)}")
def _apply_horizontal_borders_only(self, table) -> None:
"""Apply only horizontal borders to the table (no vertical borders)."""
try:
from docx.oxml.shared import OxmlElement, qn
# Get table properties
tbl_pr = table._element.find(qn('w:tblPr'))
if tbl_pr is None:
tbl_pr = OxmlElement('w:tblPr')
table._element.insert(0, tbl_pr)
# Remove existing borders
existing_borders = tbl_pr.find(qn('w:tblBorders'))
if existing_borders is not None:
tbl_pr.remove(existing_borders)
# Create new borders element
tbl_borders = OxmlElement('w:tblBorders')
# Top border
top_border = OxmlElement('w:top')
top_border.set(qn('w:val'), 'single')
top_border.set(qn('w:sz'), '4')
top_border.set(qn('w:space'), '0')
top_border.set(qn('w:color'), '000000')
tbl_borders.append(top_border)
# Bottom border
bottom_border = OxmlElement('w:bottom')
bottom_border.set(qn('w:val'), 'single')
bottom_border.set(qn('w:sz'), '4')
bottom_border.set(qn('w:space'), '0')
bottom_border.set(qn('w:color'), '000000')
tbl_borders.append(bottom_border)
# Left border - none
left_border = OxmlElement('w:left')
left_border.set(qn('w:val'), 'none')
tbl_borders.append(left_border)
# Right border - none
right_border = OxmlElement('w:right')
right_border.set(qn('w:val'), 'none')
tbl_borders.append(right_border)
# Inside horizontal border
inside_h_border = OxmlElement('w:insideH')
inside_h_border.set(qn('w:val'), 'single')
inside_h_border.set(qn('w:sz'), '4')
inside_h_border.set(qn('w:space'), '0')
inside_h_border.set(qn('w:color'), '000000')
tbl_borders.append(inside_h_border)
# Inside vertical border - none
inside_v_border = OxmlElement('w:insideV')
inside_v_border.set(qn('w:val'), 'none')
tbl_borders.append(inside_v_border)
tbl_pr.append(tbl_borders)
except Exception as e:
self.logger.warning(f"Could not apply horizontal borders: {str(e)}")
def _set_cell_background(self, cell, color: RGBColor) -> None:
"""Set the background color of a table cell."""
try:
from docx.oxml.shared import OxmlElement, qn
# Get cell properties
tc_pr = cell._element.find(qn('w:tcPr'))
if tc_pr is None:
tc_pr = OxmlElement('w:tcPr')
cell._element.insert(0, tc_pr)
# Remove existing shading
existing_shading = tc_pr.find(qn('w:shd'))
if existing_shading is not None:
tc_pr.remove(existing_shading)
# Create new shading element
shading = OxmlElement('w:shd')
shading.set(qn('w:val'), 'clear')
shading.set(qn('w:color'), 'auto')
# Convert RGBColor to hex string by unpacking RGB components
red, green, blue = color
hex_color = f"{red:02x}{green:02x}{blue:02x}"
shading.set(qn('w:fill'), hex_color)
tc_pr.append(shading)
except Exception as e:
self.logger.warning(f"Could not set cell background: {str(e)}")
def _render_json_bullet_list(self, doc: Document, list_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a JSON bullet list to DOCX using AI-generated styles."""
try:
items = list_data.get("items", [])
bullet_style = styles["bullet_list"]
for item in items:
if isinstance(item, str):
para = doc.add_paragraph(item, style='List Bullet')
elif isinstance(item, dict) and "text" in item:
para = doc.add_paragraph(item["text"], style='List Bullet')
except Exception as e:
self.logger.warning(f"Error rendering bullet list: {str(e)}")
def _render_json_heading(self, doc: Document, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a JSON heading to DOCX using AI-generated styles."""
try:
level = heading_data.get("level", 1)
text = heading_data.get("text", "")
if text:
level = max(1, min(6, level))
doc.add_heading(text, level=level)
except Exception as e:
self.logger.warning(f"Error rendering heading: {str(e)}")
def _render_json_paragraph(self, doc: Document, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a JSON paragraph to DOCX using AI-generated styles."""
try:
text = paragraph_data.get("text", "")
if text:
para = doc.add_paragraph(text)
except Exception as e:
self.logger.warning(f"Error rendering paragraph: {str(e)}")
def _render_json_code_block(self, doc: Document, code_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a JSON code block to DOCX using AI-generated styles."""
try:
code = code_data.get("code", "")
language = code_data.get("language", "")
if code:
if language:
lang_para = doc.add_paragraph(f"Code ({language}):")
lang_para.runs[0].bold = True
code_para = doc.add_paragraph(code)
for run in code_para.runs:
run.font.name = 'Courier New'
run.font.size = Pt(10)
except Exception as e:
self.logger.warning(f"Error rendering code block: {str(e)}")
def _render_json_image(self, doc: Document, image_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a JSON image to DOCX."""
try:
base64_data = image_data.get("base64Data", "")
alt_text = image_data.get("altText", "Image")
if base64_data:
image_bytes = base64.b64decode(base64_data)
doc.add_picture(io.BytesIO(image_bytes), width=Inches(4))
if alt_text:
caption_para = doc.add_paragraph(f"Figure: {alt_text}")
caption_para.runs[0].italic = True
except Exception as e:
self.logger.warning(f"Error rendering image: {str(e)}")
doc.add_paragraph(f"[Image: {image_data.get('altText', 'Image')}]")
def _extract_structure_from_prompt(self, user_prompt: str, title: str) -> Dict[str, Any]:
"""Extract document structure from user prompt."""
structure = {
'title': title,
'sections': [],
'format': 'standard'
}
if not user_prompt:
return structure
# Extract title from prompt if not provided
if not title or title == "Generated Document":
# Look for "create a ... document" or "generate a ... report"
import re
title_match = re.search(r'(?:create|generate|make)\s+a\s+([^,]+?)(?:\s+document|\s+report|\s+summary)', user_prompt.lower())
if title_match:
structure['title'] = title_match.group(1).strip().title()
# Extract sections from numbered lists in prompt
import re
section_pattern = r'(\d+)\)?\s*([^,]+?)(?:\s*[,:]|\s*$)'
sections = re.findall(section_pattern, user_prompt)
for num, section_text in sections:
structure['sections'].append({
'number': int(num),
'title': section_text.strip(),
'level': 2 # H2 level
})
# If no numbered sections found, try to extract from "including:" patterns
if not structure['sections']:
including_match = re.search(r'including:\s*(.+?)(?:\.|$)', user_prompt, re.DOTALL)
if including_match:
including_text = including_match.group(1)
# Split by common separators
parts = re.split(r'[,;]\s*', including_text)
for i, part in enumerate(parts, 1):
part = part.strip()
if part:
structure['sections'].append({
'number': i,
'title': part,
'level': 2
})
# If still no sections, extract from any list-like patterns
if not structure['sections']:
# Look for bullet points or dashes
bullet_pattern = r'[-•]\s*([^,\n]+?)(?:\s*[,:]|\s*$)'
bullets = re.findall(bullet_pattern, user_prompt)
for i, bullet in enumerate(bullets, 1):
bullet = bullet.strip()
if bullet and len(bullet) > 3:
structure['sections'].append({
'number': i,
'title': bullet,
'level': 2
})
# If still no sections, extract from sentence structure
if not structure['sections']:
# Split prompt into sentences and use as sections
sentences = re.split(r'[.!?]\s+', user_prompt)
for i, sentence in enumerate(sentences[:5], 1): # Max 5 sections
sentence = sentence.strip()
if sentence and len(sentence) > 10 and not sentence.startswith(('Analyze', 'Create', 'Generate')):
structure['sections'].append({
'number': i,
'title': sentence[:50] + "..." if len(sentence) > 50 else sentence,
'level': 2
})
# Final fallback: create sections from prompt keywords
if not structure['sections']:
# Extract key action words from prompt
action_words = ['analyze', 'summarize', 'review', 'assess', 'evaluate', 'examine', 'investigate']
found_actions = []
for action in action_words:
if action in user_prompt.lower():
found_actions.append(action.title())
if found_actions:
for i, action in enumerate(found_actions[:3], 1):
structure['sections'].append({
'number': i,
'title': f"{action} Document Content",
'level': 2
})
else:
# Last resort: generic but meaningful sections
structure['sections'] = [
{'number': 1, 'title': 'Document Analysis', 'level': 2},
{'number': 2, 'title': 'Key Information', 'level': 2},
{'number': 3, 'title': 'Summary and Conclusions', 'level': 2}
]
return structure
def _generate_content_from_structure(self, doc, content: str, structure: Dict[str, Any]):
"""Generate DOCX content based on extracted structure."""
# Add sections based on prompt structure
for section in structure['sections']:
# Add section heading
doc.add_heading(f"{section['number']}) {section['title']}", level=section['level'])
# Add AI-generated content for this section
# Try to extract relevant content for this section from the AI response
section_content = self._extract_section_content(content, section['title'])
if section_content:
doc.add_paragraph(section_content)
else:
# If no specific content found, add a note
doc.add_paragraph(f"Content for {section['title']} based on document analysis.")
# Add some spacing
doc.add_paragraph()
# Add the complete AI-generated content as additional analysis
if content and content.strip():
doc.add_heading("Complete Analysis", level=1)
doc.add_paragraph(content)
def _extract_section_content(self, content: str, section_title: str) -> str:
"""Extract relevant content for a specific section from AI response."""
if not content or not section_title:
return ""
# Look for content that matches the section title
section_keywords = section_title.lower().split()
# Split content into paragraphs
paragraphs = content.split('\n\n')
relevant_paragraphs = []
for paragraph in paragraphs:
paragraph_lower = paragraph.lower()
# Check if paragraph contains keywords from section title
if any(keyword in paragraph_lower for keyword in section_keywords if len(keyword) > 3):
relevant_paragraphs.append(paragraph.strip())
if relevant_paragraphs:
return '\n\n'.join(relevant_paragraphs[:2]) # Max 2 paragraphs per section
return ""
def _setup_document_styles(self, doc):
"""Set up document styles."""
try:
# Set default font
style = doc.styles['Normal']
font = style.font
font.name = 'Calibri'
font.size = Pt(11)
# Set heading styles
for i in range(1, 4):
heading_style = doc.styles[f'Heading {i}']
heading_font = heading_style.font
heading_font.name = 'Calibri'
heading_font.size = Pt(16 - i * 2)
heading_font.bold = True
except Exception as e:
self.logger.warning(f"Could not set up document styles: {str(e)}")
def _process_section(self, doc, lines: list):
"""Process a section of content into DOCX elements."""
for line in lines:
if not line.strip():
continue
# Check for tables (lines with |)
if '|' in line and not line.startswith('|'):
# This might be part of a table, process as table
table_data = self._extract_table_data(lines)
if table_data:
self._add_table(doc, table_data)
return
# Check for lists
if line.startswith('- ') or line.startswith('* '):
# This is a list item
doc.add_paragraph(line[2:], style='List Bullet')
elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')):
# This is a numbered list item
doc.add_paragraph(line[3:], style='List Number')
else:
# Regular paragraph
doc.add_paragraph(line)
def _extract_table_data(self, lines: list) -> list:
"""Extract table data from lines."""
table_data = []
in_table = False
for line in lines:
if '|' in line:
if not in_table:
in_table = True
# Split by | and clean up
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
if cells:
table_data.append(cells)
elif in_table and not line.strip():
# Empty line, might be end of table
break
return table_data if len(table_data) > 1 else []
def _add_table(self, doc, table_data: list):
"""Add a table to the document."""
try:
if not table_data:
return
# Create table
table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
table.alignment = WD_TABLE_ALIGNMENT.CENTER
# Add data to table
for row_idx, row_data in enumerate(table_data):
for col_idx, cell_data in enumerate(row_data):
if col_idx < len(table.rows[row_idx].cells):
table.rows[row_idx].cells[col_idx].text = cell_data
# Style the table
self._style_table(table)
except Exception as e:
self.logger.warning(f"Could not add table: {str(e)}")
def _style_table(self, table):
"""Apply styling to the table."""
try:
# Style header row
if len(table.rows) > 0:
header_cells = table.rows[0].cells
for cell in header_cells:
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.bold = True
except Exception as e:
self.logger.warning(f"Could not style table: {str(e)}")
    def _process_table_row(self, doc, line: str):
        """Append one pipe-separated line to a running table, creating it on first use.

        State: the table under construction is cached on ``self._current_table``;
        the first row processed becomes a bold header, later rows are data.
        NOTE(review): nothing visible here ever resets _current_table back to
        None, so consecutive tables in one document would be merged — confirm
        the caller clears it between tables.
        """
        if not line.strip():
            return
        parts = [part.strip() for part in line.split('|')]
        if len(parts) >= 2:
            # At least two cells: treat as a table row.
            if not hasattr(self, '_current_table') or self._current_table is None:
                # First row: create the table and use this line as its header.
                self._current_table = doc.add_table(rows=1, cols=len(parts))
                self._current_table.style = 'Table Grid'
                for i, part in enumerate(parts):
                    if i < len(self._current_table.rows[0].cells):
                        cell = self._current_table.rows[0].cells[i]
                        cell.text = part
                        # Header cells are rendered bold.
                        for paragraph in cell.paragraphs:
                            for run in paragraph.runs:
                                run.bold = True
            else:
                # Subsequent rows: append as data to the existing table.
                row = self._current_table.add_row()
                for i, part in enumerate(parts):
                    if i < len(row.cells):
                        row.cells[i].text = part
        else:
            # Fewer than two cells: not a table row after all — plain text.
            doc.add_paragraph(line)
def _clean_ai_content(self, content: str) -> str:
"""Clean AI-generated content by removing debug information and duplicates."""
if not content:
return ""
# Remove debug information
lines = content.split('\n')
clean_lines = []
for line in lines:
# Skip debug lines and separators
if (line.startswith('[Skipped ') or
line.startswith('=== DOCUMENT:') or
line.startswith('---') or
line.startswith('FILENAME:') or
line.strip() == '' or
line.strip() == '---'):
continue
clean_lines.append(line)
# Join lines and remove duplicate content
clean_content = '\n'.join(clean_lines)
# Remove duplicate sections by keeping only the first occurrence
sections = clean_content.split('\n\n')
seen_sections = set()
unique_sections = []
for section in sections:
section_key = section.strip()[:50] # Use first 50 chars as key
if section_key not in seen_sections and section.strip():
seen_sections.add(section_key)
unique_sections.append(section)
return '\n\n'.join(unique_sections)
    def _process_tables(self, doc, content: str) -> str:
        """
        Detect pipe- or comma-separated table runs in *content*, insert them
        into *doc* as Word tables, and return the text with each converted run
        replaced by a ``[TABLE_INSERTED_n]`` placeholder line.

        A run must be at least two consecutive lines (header + one data row)
        using the same separator; on any parsing failure the lines are kept
        as plain text.
        """
        import csv
        import io
        lines = content.split('\n')
        processed_lines = []
        i = 0
        while i < len(lines):
            line = lines[i].strip()
            # Heuristic: a table line splits into >= 2 fields on '|' or ','.
            is_pipe_table = '|' in line and len(line.split('|')) >= 2
            is_csv_table = ',' in line and len(line.split(',')) >= 2
            if is_pipe_table or is_csv_table:
                # Collect the consecutive run of lines with the same separator.
                table_lines = []
                j = i
                separator = '|' if is_pipe_table else ','
                while j < len(lines):
                    current_line = lines[j].strip()
                    if separator in current_line and len(current_line.split(separator)) >= 2:
                        table_lines.append(current_line)
                        j += 1
                    else:
                        break
                if len(table_lines) >= 2:  # at least header + 1 data row
                    try:
                        if separator == '|':
                            # Pipe table: simple split per line.
                            rows = []
                            for table_line in table_lines:
                                cells = [cell.strip() for cell in table_line.split('|')]
                                rows.append(cells)
                        else:
                            # Comma table: let csv handle quoting/escapes.
                            csv_content = '\n'.join(table_lines)
                            csv_reader = csv.reader(io.StringIO(csv_content))
                            rows = list(csv_reader)
                        if rows and len(rows[0]) > 0:
                            table = doc.add_table(rows=len(rows), cols=len(rows[0]))
                            table.style = 'Table Grid'
                            for row_idx, row_data in enumerate(rows):
                                for col_idx, cell_data in enumerate(row_data):
                                    if col_idx < len(table.rows[row_idx].cells):
                                        table.rows[row_idx].cells[col_idx].text = cell_data.strip()
                                # First row acts as the (bold) header.
                                if row_idx == 0:
                                    for cell in table.rows[row_idx].cells:
                                        for paragraph in cell.paragraphs:
                                            for run in paragraph.runs:
                                                run.bold = True
                            # Leave a marker where the table text used to be.
                            processed_lines.append(f"[TABLE_INSERTED_{len(processed_lines)}]")
                            i = j  # skip past the consumed table lines
                            continue
                    except Exception as e:
                        # Parsing failed: fall through and keep the line as text.
                        pass
            processed_lines.append(line)
            i += 1
        return '\n'.join(processed_lines)
def _parse_and_format_content(self, doc, content: str, title: str):
"""Parse AI-generated content in standardized format and apply proper DOCX formatting."""
if not content:
return
# Process tables and replace them with placeholders
content = self._process_tables(doc, content)
# Parse content line by line in exact sequence
lines = content.split('\n')
for line in lines:
line = line.strip()
if not line:
# Empty line - add paragraph break
doc.add_paragraph()
continue
# Skip table placeholders (already processed)
if line.startswith('[TABLE_INSERTED_'):
continue
# Check if this is a Markdown heading (# ## ###)
if line.startswith('#'):
level = len(line) - len(line.lstrip('#'))
heading_text = line.lstrip('# ').strip()
doc.add_heading(heading_text, level=min(level, 3))
# Check if this is a numbered heading (1) Title, 2) Title, etc.)
elif re.match(r'^\d+\)\s+.+', line):
heading_text = re.sub(r'^\d+\)\s+', '', line)
doc.add_heading(heading_text, level=1)
# Check if this is a Markdown list item
elif line.startswith('- ') or re.match(r'^\d+\.\s+', line):
bullet_text = re.sub(r'^[-•]\s+|\d+\.\s+', '', line)
self._add_bullet_point(doc, bullet_text)
# Check if this is a code block
elif line.startswith('```'):
if not line.endswith('```'):
# Start of code block - collect until end
code_lines = [line]
continue
else:
# End of code block
if 'code_lines' in locals():
code_lines.append(line)
code_text = '\n'.join(code_lines)
para = doc.add_paragraph()
run = para.add_run(code_text)
run.font.name = 'Courier New'
del code_lines
# Regular paragraph
else:
self._add_paragraph_to_doc(doc, line)
def _add_paragraph_to_doc(self, doc, text: str):
"""Add a paragraph to the document with proper formatting."""
if not text.strip():
return
# Check for Markdown formatting (**bold**, *italic*)
para = doc.add_paragraph()
# Split by bold markers
parts = text.split('**')
for i, part in enumerate(parts):
if i % 2 == 0:
# Regular text - check for italic
italic_parts = part.split('*')
for j, italic_part in enumerate(italic_parts):
if j % 2 == 0:
# Regular text
if italic_part:
para.add_run(italic_part)
else:
# Italic text
if italic_part:
run = para.add_run(italic_part)
run.italic = True
else:
# Bold text
if part:
run = para.add_run(part)
run.bold = True

View file

@ -0,0 +1,424 @@
"""
HTML renderer for report generation.
"""
from .rendererBaseTemplate import BaseRenderer
from typing import Dict, Any, Tuple, List
class RendererHtml(BaseRenderer):
"""Renders content to HTML format with format-specific extraction."""
@classmethod
def get_supported_formats(cls) -> List[str]:
"""Return supported HTML formats."""
return ['html', 'htm']
@classmethod
def get_format_aliases(cls) -> List[str]:
"""Return format aliases."""
return ['web', 'webpage']
@classmethod
def get_priority(cls) -> int:
"""Return priority for HTML renderer."""
return 100
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
"""Render extracted JSON content to HTML format using AI-analyzed styling."""
try:
# Generate HTML using AI-analyzed styling
html_content = await self._generate_html_from_json(extracted_content, title, user_prompt, ai_service)
return html_content, "text/html"
except Exception as e:
self.logger.error(f"Error rendering HTML: {str(e)}")
# Return minimal HTML fallback
return f"<html><head><title>{title}</title></head><body><h1>{title}</h1><p>Error rendering report: {str(e)}</p></body></html>", "text/html"
async def _generate_html_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
"""Generate HTML content from structured JSON document using AI-generated styling."""
try:
# Get AI-generated styling definitions
styles = await self._get_html_styles(user_prompt, ai_service)
# Validate JSON structure
if not isinstance(json_content, dict):
raise ValueError("JSON content must be a dictionary")
if "sections" not in json_content:
raise ValueError("JSON content must contain 'sections' field")
# Use title from JSON metadata if available, otherwise use provided title
document_title = json_content.get("metadata", {}).get("title", title)
# Build HTML document
html_parts = []
# HTML document structure
html_parts.append('<!DOCTYPE html>')
html_parts.append('<html lang="en">')
html_parts.append('<head>')
html_parts.append('<meta charset="UTF-8">')
html_parts.append('<meta name="viewport" content="width=device-width, initial-scale=1.0">')
html_parts.append(f'<title>{document_title}</title>')
html_parts.append('<style>')
html_parts.append(self._generate_css_styles(styles))
html_parts.append('</style>')
html_parts.append('</head>')
html_parts.append('<body>')
# Document header
html_parts.append(f'<header><h1 class="document-title">{document_title}</h1></header>')
# Main content
html_parts.append('<main>')
# Process each section
sections = json_content.get("sections", [])
for section in sections:
section_html = self._render_json_section(section, styles)
if section_html:
html_parts.append(section_html)
html_parts.append('</main>')
# Footer
html_parts.append('<footer>')
html_parts.append(f'<p class="generated-info">Generated: {self._format_timestamp()}</p>')
html_parts.append('</footer>')
html_parts.append('</body>')
html_parts.append('</html>')
return '\n'.join(html_parts)
except Exception as e:
self.logger.error(f"Error generating HTML from JSON: {str(e)}")
raise Exception(f"HTML generation failed: {str(e)}")
async def _get_html_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]:
"""Get HTML styling definitions using base template AI styling."""
style_schema = {
"title": {"font_size": "2.5em", "color": "#1F4E79", "font_weight": "bold", "text_align": "center", "margin": "0 0 1em 0"},
"heading1": {"font_size": "2em", "color": "#2F2F2F", "font_weight": "bold", "text_align": "left", "margin": "1.5em 0 0.5em 0"},
"heading2": {"font_size": "1.5em", "color": "#4F4F4F", "font_weight": "bold", "text_align": "left", "margin": "1em 0 0.5em 0"},
"paragraph": {"font_size": "1em", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "margin": "0 0 1em 0", "line_height": "1.6"},
"table": {"border": "1px solid #ddd", "border_collapse": "collapse", "width": "100%", "margin": "1em 0"},
"table_header": {"background": "#4F4F4F", "color": "#FFFFFF", "font_weight": "bold", "text_align": "center", "padding": "12px"},
"table_cell": {"background": "#FFFFFF", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "padding": "8px", "border": "1px solid #ddd"},
"bullet_list": {"font_size": "1em", "color": "#2F2F2F", "margin": "0 0 1em 0", "padding_left": "20px"},
"code_block": {"font_family": "Courier New, monospace", "font_size": "0.9em", "color": "#2F2F2F", "background": "#F5F5F5", "padding": "1em", "border": "1px solid #ddd", "border_radius": "4px", "margin": "1em 0"},
"image": {"max_width": "100%", "height": "auto", "margin": "1em 0", "border_radius": "4px"},
"body": {"font_family": "Arial, sans-serif", "background": "#FFFFFF", "color": "#2F2F2F", "margin": "0", "padding": "20px"}
}
style_template = self._create_ai_style_template("html", user_prompt, style_schema)
styles = await self._get_ai_styles(ai_service, style_template, self._get_default_html_styles())
# Validate and fix contrast issues
return self._validate_html_styles_contrast(styles)
def _validate_html_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
"""Validate and fix contrast issues in AI-generated styles."""
try:
# Fix table header contrast
if "table_header" in styles:
header = styles["table_header"]
bg_color = header.get("background", "#FFFFFF")
text_color = header.get("color", "#000000")
# If both are white or both are dark, fix it
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
header["background"] = "#4F4F4F"
header["color"] = "#FFFFFF"
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
header["background"] = "#4F4F4F"
header["color"] = "#FFFFFF"
# Fix table cell contrast
if "table_cell" in styles:
cell = styles["table_cell"]
bg_color = cell.get("background", "#FFFFFF")
text_color = cell.get("color", "#000000")
# If both are white or both are dark, fix it
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
cell["background"] = "#FFFFFF"
cell["color"] = "#2F2F2F"
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
cell["background"] = "#FFFFFF"
cell["color"] = "#2F2F2F"
return styles
except Exception as e:
self.logger.warning(f"Style validation failed: {str(e)}")
return self._get_default_html_styles()
def _get_default_html_styles(self) -> Dict[str, Any]:
"""Default HTML styles."""
return {
"title": {"font_size": "2.5em", "color": "#1F4E79", "font_weight": "bold", "text_align": "center", "margin": "0 0 1em 0"},
"heading1": {"font_size": "2em", "color": "#2F2F2F", "font_weight": "bold", "text_align": "left", "margin": "1.5em 0 0.5em 0"},
"heading2": {"font_size": "1.5em", "color": "#4F4F4F", "font_weight": "bold", "text_align": "left", "margin": "1em 0 0.5em 0"},
"paragraph": {"font_size": "1em", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "margin": "0 0 1em 0", "line_height": "1.6"},
"table": {"border": "1px solid #ddd", "border_collapse": "collapse", "width": "100%", "margin": "1em 0"},
"table_header": {"background": "#4F4F4F", "color": "#FFFFFF", "font_weight": "bold", "text_align": "center", "padding": "12px"},
"table_cell": {"background": "#FFFFFF", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "padding": "8px", "border": "1px solid #ddd"},
"bullet_list": {"font_size": "1em", "color": "#2F2F2F", "margin": "0 0 1em 0", "padding_left": "20px"},
"code_block": {"font_family": "Courier New, monospace", "font_size": "0.9em", "color": "#2F2F2F", "background": "#F5F5F5", "padding": "1em", "border": "1px solid #ddd", "border_radius": "4px", "margin": "1em 0"},
"image": {"max_width": "100%", "height": "auto", "margin": "1em 0", "border_radius": "4px"},
"body": {"font_family": "Arial, sans-serif", "background": "#FFFFFF", "color": "#2F2F2F", "margin": "0", "padding": "20px"}
}
def _generate_css_styles(self, styles: Dict[str, Any]) -> str:
"""Generate CSS from style definitions."""
css_parts = []
# Body styles
body_style = styles.get("body", {})
css_parts.append("body {")
for property_name, value in body_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Document title
title_style = styles.get("title", {})
css_parts.append(".document-title {")
for property_name, value in title_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Headings
for heading_level in ["heading1", "heading2"]:
heading_style = styles.get(heading_level, {})
css_class = f"h{heading_level[-1]}"
css_parts.append(f"{css_class} {{")
for property_name, value in heading_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Paragraphs
paragraph_style = styles.get("paragraph", {})
css_parts.append("p {")
for property_name, value in paragraph_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Tables
table_style = styles.get("table", {})
css_parts.append("table {")
for property_name, value in table_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Table headers
table_header_style = styles.get("table_header", {})
css_parts.append("th {")
for property_name, value in table_header_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Table cells
table_cell_style = styles.get("table_cell", {})
css_parts.append("td {")
for property_name, value in table_cell_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Lists
bullet_list_style = styles.get("bullet_list", {})
css_parts.append("ul {")
for property_name, value in bullet_list_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Code blocks
code_block_style = styles.get("code_block", {})
css_parts.append("pre {")
for property_name, value in code_block_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Images
image_style = styles.get("image", {})
css_parts.append("img {")
for property_name, value in image_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Generated info
css_parts.append(".generated-info {")
css_parts.append(" font-size: 0.9em;")
css_parts.append(" color: #666;")
css_parts.append(" text-align: center;")
css_parts.append(" margin-top: 2em;")
css_parts.append(" padding-top: 1em;")
css_parts.append(" border-top: 1px solid #ddd;")
css_parts.append("}")
return '\n'.join(css_parts)
def _render_json_section(self, section: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render one JSON section to an HTML fragment.

    Structured payloads (tables, bullet lists, code blocks, images) are
    first normalised via ``_process_section_by_type``; headings and
    paragraphs render directly from the raw section data. Unknown types
    fall back to paragraph rendering; failures degrade to an error div.
    """
    try:
        section_type = self._get_section_type(section)
        section_data = self._get_section_data(section)
        # Section types whose payload needs normalisation before rendering.
        processed_renderers = {
            "table": self._render_json_table,
            "bullet_list": self._render_json_bullet_list,
            "code_block": self._render_json_code_block,
            "image": self._render_json_image,
        }
        renderer = processed_renderers.get(section_type)
        if renderer is not None:
            return renderer(self._process_section_by_type(section), styles)
        if section_type == "heading":
            return self._render_json_heading(section_data, styles)
        # "paragraph" and any unknown type both render as a paragraph.
        return self._render_json_paragraph(section_data, styles)
    except Exception as e:
        self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
        return f'<div class="error">[Error rendering section: {str(e)}]</div>'
def _render_json_table(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON table to HTML using AI-generated styles."""
try:
headers = table_data.get("headers", [])
rows = table_data.get("rows", [])
if not headers or not rows:
return ""
html_parts = ['<table>']
# Table header
html_parts.append('<thead><tr>')
for header in headers:
html_parts.append(f'<th>{header}</th>')
html_parts.append('</tr></thead>')
# Table body
html_parts.append('<tbody>')
for row in rows:
html_parts.append('<tr>')
for cell_data in row:
html_parts.append(f'<td>{cell_data}</td>')
html_parts.append('</tr>')
html_parts.append('</tbody>')
html_parts.append('</table>')
return '\n'.join(html_parts)
except Exception as e:
self.logger.warning(f"Error rendering table: {str(e)}")
return ""
def _render_json_bullet_list(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON bullet list to HTML using AI-generated styles."""
try:
items = list_data.get("items", [])
if not items:
return ""
html_parts = ['<ul>']
for item in items:
if isinstance(item, str):
html_parts.append(f'<li>{item}</li>')
elif isinstance(item, dict) and "text" in item:
html_parts.append(f'<li>{item["text"]}</li>')
html_parts.append('</ul>')
return '\n'.join(html_parts)
except Exception as e:
self.logger.warning(f"Error rendering bullet list: {str(e)}")
return ""
def _render_json_heading(self, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON heading to HTML using AI-generated styles."""
try:
level = heading_data.get("level", 1)
text = heading_data.get("text", "")
if text:
level = max(1, min(6, level))
return f'<h{level}>{text}</h{level}>'
return ""
except Exception as e:
self.logger.warning(f"Error rendering heading: {str(e)}")
return ""
def _render_json_paragraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON paragraph to HTML using AI-generated styles."""
try:
text = paragraph_data.get("text", "")
if text:
return f'<p>{text}</p>'
return ""
except Exception as e:
self.logger.warning(f"Error rendering paragraph: {str(e)}")
return ""
def _render_json_code_block(self, code_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON code block to HTML using AI-generated styles."""
try:
code = code_data.get("code", "")
language = code_data.get("language", "")
if code:
if language:
return f'<pre><code class="language-{language}">{code}</code></pre>'
else:
return f'<pre><code>{code}</code></pre>'
return ""
except Exception as e:
self.logger.warning(f"Error rendering code block: {str(e)}")
return ""
def _render_json_image(self, image_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON image to HTML."""
try:
base64_data = image_data.get("base64Data", "")
alt_text = image_data.get("altText", "Image")
if base64_data:
return f'<img src="data:image/png;base64,{base64_data}" alt="{alt_text}">'
return ""
except Exception as e:
self.logger.warning(f"Error rendering image: {str(e)}")
return f'<div class="error">[Image: {image_data.get("altText", "Image")}]</div>'

View file

@ -0,0 +1,281 @@
"""
Image renderer for report generation using AI image generation.
"""
from .rendererBaseTemplate import BaseRenderer
from typing import Dict, Any, Tuple, List
import base64
import logging
logger = logging.getLogger(__name__)
class RendererImage(BaseRenderer):
    """Renders content to image format using AI image generation.

    Unlike the text renderers, this one has no offline fallback: any
    failure in the AI pipeline is re-raised to the caller.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported image formats."""
        return ['png', 'jpg', 'jpeg', 'image']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['img', 'picture', 'photo', 'graphic']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for image renderer."""
        return 90

    async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
        """Render extracted JSON content to image format using AI image generation.

        Args:
            extracted_content: Structured JSON document; must contain "sections".
            title: Fallback document title when metadata carries none.
            user_prompt: The user's original request, folded into the prompt.
            ai_service: Service exposing ``aiObjects.generateImage``; required.

        Returns:
            Tuple of (image payload, "image/png"). The payload is whatever
            ``generateImage`` returns in "image_data" — presumably base64;
            verify against the AI connector.

        Raises:
            Exception: On any generation failure (deliberately no fallback).
        """
        try:
            # Generate AI image from content
            image_content = await self._generate_ai_image(extracted_content, title, user_prompt, ai_service)
            return image_content, "image/png"
        except Exception as e:
            self.logger.error(f"Error rendering image: {str(e)}")
            # Re-raise the exception instead of using fallback
            raise Exception(f"Image rendering failed: {str(e)}")

    async def _generate_ai_image(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
        """Generate AI image from extracted content.

        Validates the document, builds a size-limited prompt and calls the
        AI service's image generator.

        Raises:
            Exception: If validation fails or the service reports an error.
        """
        try:
            if not ai_service:
                raise ValueError("AI service is required for image generation")
            # Validate JSON structure
            if not isinstance(extracted_content, dict):
                raise ValueError("Extracted content must be a dictionary")
            if "sections" not in extracted_content:
                raise ValueError("Extracted content must contain 'sections' field")
            # Use title from JSON metadata if available, otherwise use provided title
            document_title = extracted_content.get("metadata", {}).get("title", title)
            # Create AI prompt for image generation
            image_prompt = await self._create_image_generation_prompt(extracted_content, document_title, user_prompt, ai_service)
            # Generate image using AI
            image_result = await ai_service.aiObjects.generateImage(
                prompt=image_prompt,
                size="1024x1024",
                quality="standard",
                style="vivid"
            )
            # Extract base64 image data from result.
            # NOTE(review): the result schema ({"success", "image_data",
            # "error"}) is assumed from this usage — confirm against the
            # AI connector's contract.
            if image_result and image_result.get("success", False):
                image_data = image_result.get("image_data", "")
                if image_data:
                    return image_data
                else:
                    raise ValueError("No image data returned from AI")
            else:
                error_msg = image_result.get("error", "Unknown error") if image_result else "No result"
                raise ValueError(f"AI image generation failed: {error_msg}")
        except Exception as e:
            self.logger.error(f"Error generating AI image: {str(e)}")
            raise Exception(f"AI image generation failed: {str(e)}")

    async def _create_image_generation_prompt(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
        """Create a detailed prompt for AI image generation based on the content.

        Assembles the user's intent, the document title, a textual summary
        of the sections and style guidance, then enforces DALL-E's
        4000-character prompt limit: AI-assisted compression first, a
        minimal title-based prompt as the last resort. Never raises — the
        worst case is a generic prompt derived from the title.
        """
        try:
            # Start with base prompt
            prompt_parts = []
            # Add user's original intent if available
            if user_prompt:
                prompt_parts.append(f"User Request: {user_prompt}")
            # Add document title
            prompt_parts.append(f"Document Title: {title}")
            # Analyze content and create visual description
            sections = extracted_content.get("sections", [])
            content_description = self._analyze_content_for_visual_description(sections)
            if content_description:
                prompt_parts.append(f"Content to Visualize: {content_description}")
            # Add style guidance
            style_guidance = self._get_style_guidance_from_content(extracted_content, user_prompt)
            if style_guidance:
                prompt_parts.append(f"Visual Style: {style_guidance}")
            # Combine all parts
            full_prompt = "Create a professional, informative image that visualizes the following content:\n\n" + "\n\n".join(prompt_parts)
            # Add technical requirements
            full_prompt += "\n\nTechnical Requirements:"
            full_prompt += "\n- High quality, professional appearance"
            full_prompt += "\n- Clear, readable text if any text is included"
            full_prompt += "\n- Appropriate colors and layout"
            full_prompt += "\n- Suitable for business/professional use"
            # Truncate prompt if it exceeds DALL-E's 4000 character limit
            if len(full_prompt) > 4000:
                # Use AI to compress the prompt intelligently
                compressed_prompt = await self._compress_prompt_with_ai(full_prompt, ai_service)
                if compressed_prompt and len(compressed_prompt) <= 4000:
                    return compressed_prompt
                # Fallback to minimal prompt if AI compression fails or is still too long
                minimal_prompt = f"Create a professional image representing: {title}"
                if user_prompt:
                    minimal_prompt += f" - {user_prompt}"
                # If even the minimal prompt is too long, truncate it
                if len(minimal_prompt) > 4000:
                    # 3997 + "..." keeps the result at exactly 4000 chars.
                    minimal_prompt = minimal_prompt[:3997] + "..."
                return minimal_prompt
            return full_prompt
        except Exception as e:
            self.logger.warning(f"Error creating image prompt: {str(e)}")
            # Fallback to simple prompt
            return f"Create a professional image representing: {title}"

    async def _compress_prompt_with_ai(self, long_prompt: str, ai_service=None) -> str:
        """Use AI to intelligently compress a long prompt while preserving key information.

        Returns:
            The compressed prompt, or ``None`` when no service is
            available, the call fails, or the result is over 4000 /
            not over 50 characters (too-short results are treated as
            a failed compression).
        """
        try:
            if not ai_service:
                return None
            compression_prompt = f"""
            You are an expert at creating concise, effective prompts for AI image generation.
            The following prompt is too long for DALL-E (4000 character limit) and needs to be compressed to under 4000 characters while preserving the most important visual information.
            Original prompt ({len(long_prompt)} characters):
            {long_prompt}
            Please create a compressed version that:
            1. Keeps the most important visual elements and requirements
            2. Maintains the core intent and style guidance
            3. Preserves technical requirements
            4. Stays under 4000 characters
            5. Is optimized for DALL-E image generation
            Return only the compressed prompt, no explanations.
            """
            # Use AI to compress the prompt - call the AI service correctly
            # The ai_service has an aiObjects attribute that contains the actual AI interface
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
            request = AiCallRequest(
                prompt=compression_prompt,
                options=AiCallOptions(
                    operationType=OperationType.GENERAL,
                    maxTokens=2000,
                    temperature=0.3  # Lower temperature for more consistent compression
                )
            )
            response = await ai_service.aiObjects.call(request)
            compressed = response.content.strip()
            # Validate the compressed prompt
            if compressed and len(compressed) <= 4000 and len(compressed) > 50:
                self.logger.info(f"Successfully compressed prompt from {len(long_prompt)} to {len(compressed)} characters")
                return compressed
            else:
                self.logger.warning(f"AI compression failed or produced invalid result: {len(compressed) if compressed else 0} chars")
                return None
        except Exception as e:
            self.logger.warning(f"Error compressing prompt with AI: {str(e)}")
            return None

    def _analyze_content_for_visual_description(self, sections: List[Dict[str, Any]]) -> str:
        """Analyze content sections and create a visual description for AI.

        Produces one short, semicolon-joined sentence per meaningful
        section (table dimensions, list length, headings, truncated text
        and code excerpts). Falls back to a generic description when
        nothing substantial is found or on error.
        """
        try:
            descriptions = []
            for section in sections:
                section_type = self._get_section_type(section)
                section_data = self._get_section_data(section)
                if section_type == "table":
                    headers = section_data.get("headers", [])
                    rows = section_data.get("rows", [])
                    if headers and rows:
                        descriptions.append(f"Data table with {len(headers)} columns and {len(rows)} rows: {', '.join(headers)}")
                elif section_type == "bullet_list":
                    items = section_data.get("items", [])
                    if items:
                        descriptions.append(f"List with {len(items)} items")
                elif section_type == "heading":
                    text = section_data.get("text", "")
                    level = section_data.get("level", 1)
                    if text:
                        descriptions.append(f"Heading {level}: {text}")
                elif section_type == "paragraph":
                    text = section_data.get("text", "")
                    if text and len(text) > 10:  # Only include substantial paragraphs
                        # Truncate long text
                        truncated = text[:100] + "..." if len(text) > 100 else text
                        descriptions.append(f"Text content: {truncated}")
                elif section_type == "code_block":
                    code = section_data.get("code", "")
                    language = section_data.get("language", "")
                    if code:
                        descriptions.append(f"Code block ({language}): {code[:50]}...")
            return "; ".join(descriptions) if descriptions else "General document content"
        except Exception as e:
            self.logger.warning(f"Error analyzing content: {str(e)}")
            return "Document content"

    def _get_style_guidance_from_content(self, extracted_content: Dict[str, Any], user_prompt: str = None) -> str:
        """Determine visual style guidance based on content and user prompt.

        Matches keyword families in the user prompt (first match wins)
        and inspects the section types present (tables, lists, code) to
        build a comma-joined style hint; defaults to a professional look.
        """
        try:
            style_elements = []
            # Analyze user prompt for style hints
            if user_prompt:
                prompt_lower = user_prompt.lower()
                if any(word in prompt_lower for word in ["modern", "contemporary", "sleek"]):
                    style_elements.append("modern, clean design")
                elif any(word in prompt_lower for word in ["classic", "traditional", "formal"]):
                    style_elements.append("classic, formal design")
                elif any(word in prompt_lower for word in ["creative", "artistic", "colorful"]):
                    style_elements.append("creative, artistic design")
                elif any(word in prompt_lower for word in ["corporate", "business", "professional"]):
                    style_elements.append("corporate, professional design")
            # Analyze content type for additional style hints
            sections = extracted_content.get("sections", [])
            has_tables = any(self._get_section_type(s) == "table" for s in sections)
            has_lists = any(self._get_section_type(s) == "bullet_list" for s in sections)
            has_code = any(self._get_section_type(s) == "code_block" for s in sections)
            if has_tables:
                style_elements.append("data-focused layout")
            if has_lists:
                style_elements.append("organized, structured presentation")
            if has_code:
                style_elements.append("technical, developer-friendly")
            # Default style if no specific guidance
            if not style_elements:
                style_elements.append("professional, clean design")
            return ", ".join(style_elements)
        except Exception as e:
            self.logger.warning(f"Error determining style guidance: {str(e)}")
            return "professional design"

View file

@ -0,0 +1,79 @@
"""
JSON renderer for report generation.
"""
from .rendererBaseTemplate import BaseRenderer
from typing import Dict, Any, Tuple, List
import json
class RendererJson(BaseRenderer):
    """Renders content to JSON format with format-specific extraction."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported JSON formats."""
        return ['json']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['data']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for JSON renderer."""
        return 80

    async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
        """Render extracted JSON content to JSON format.

        The extracted content is already JSON-shaped from the AI; it is
        only validated, normalised and pretty-printed here.

        Returns:
            Tuple of (JSON text, "application/json"); on failure a minimal
            error document is returned instead of raising.
        """
        try:
            json_content = self._clean_json_content(extracted_content, title)
            return json_content, "application/json"
        except Exception as e:
            self.logger.error(f"Error rendering JSON: {str(e)}")
            # Return minimal JSON fallback
            fallback_data = {
                "title": title,
                "sections": [{"content_type": "paragraph", "elements": [{"text": f"Error rendering report: {str(e)}"}]}],
                "metadata": {"error": str(e)}
            }
            return json.dumps(fallback_data, indent=2), "application/json"

    def _clean_json_content(self, content: Dict[str, Any], title: str) -> str:
        """Clean and validate JSON content from AI.

        Fix: normalisation now works on a shallow copy (with ``metadata``
        also copied), so default metadata/title entries are no longer
        written back into the caller's dictionary. A non-dict ``metadata``
        value is replaced with a fresh dict instead of crashing into the
        fallback path.

        Returns:
            Pretty-printed JSON text; a minimal valid document on error.
        """
        try:
            # Validate JSON structure
            if not isinstance(content, dict):
                raise ValueError("Content must be a dictionary")
            if "sections" not in content:
                # Convert old format to new format
                content = {
                    "sections": [{"content_type": "paragraph", "elements": [{"text": str(content)}]}],
                    "metadata": {"title": title}
                }
            else:
                # Copy before filling defaults so the input stays untouched.
                content = dict(content)
                metadata = content.get("metadata")
                content["metadata"] = dict(metadata) if isinstance(metadata, dict) else {}
            # Set title in metadata if not present
            if "title" not in content["metadata"]:
                content["metadata"]["title"] = title
            # Re-format with proper indentation
            return json.dumps(content, indent=2, ensure_ascii=False)
        except Exception as e:
            self.logger.warning(f"Error cleaning JSON content: {str(e)}")
            # Return minimal valid JSON
            fallback_data = {
                "sections": [{"content_type": "paragraph", "elements": [{"text": str(content)}]}],
                "metadata": {"title": title, "error": str(e)}
            }
            return json.dumps(fallback_data, indent=2, ensure_ascii=False)

View file

@ -0,0 +1,221 @@
"""
Markdown renderer for report generation.
"""
from .rendererBaseTemplate import BaseRenderer
from typing import Dict, Any, Tuple, List
class RendererMarkdown(BaseRenderer):
    """Renders content to Markdown format with format-specific extraction."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported Markdown formats."""
        return ['md', 'markdown']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['mdown', 'mkd']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for markdown renderer."""
        return 95

    async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
        """Render extracted JSON content to Markdown format.

        Returns:
            Tuple of (markdown text, "text/markdown"); on failure a
            minimal error document is returned instead of raising.
        """
        try:
            # Generate markdown from JSON structure
            markdown_content = self._generate_markdown_from_json(extracted_content, title)
            return markdown_content, "text/markdown"
        except Exception as e:
            self.logger.error(f"Error rendering markdown: {str(e)}")
            # Return minimal markdown fallback
            return f"# {title}\n\nError rendering report: {str(e)}", "text/markdown"

    def _generate_markdown_from_json(self, json_content: Dict[str, Any], title: str) -> str:
        """Generate markdown content from structured JSON document.

        Raises:
            Exception: If the document is not a dict or lacks "sections".
        """
        try:
            # Validate JSON structure
            if not isinstance(json_content, dict):
                raise ValueError("JSON content must be a dictionary")
            if "sections" not in json_content:
                raise ValueError("JSON content must contain 'sections' field")
            # Use title from JSON metadata if available, otherwise use provided title
            document_title = json_content.get("metadata", {}).get("title", title)
            # Build markdown content
            markdown_parts = [f"# {document_title}", ""]
            # Process each section; blank line between sections
            for section in json_content.get("sections", []):
                section_markdown = self._render_json_section(section)
                if section_markdown:
                    markdown_parts.append(section_markdown)
                    markdown_parts.append("")
            # Add generation info
            markdown_parts.append("---")
            markdown_parts.append(f"*Generated: {self._format_timestamp()}*")
            return '\n'.join(markdown_parts)
        except Exception as e:
            self.logger.error(f"Error generating markdown from JSON: {str(e)}")
            raise Exception(f"Markdown generation failed: {str(e)}")

    def _render_json_section(self, section: Dict[str, Any]) -> str:
        """Render a single JSON section to markdown.

        Structured payloads (tables, lists, code, images) are normalised
        via ``_process_section_by_type`` first; unknown types fall back to
        paragraph rendering; failures degrade to an italic error marker.
        """
        try:
            section_type = self._get_section_type(section)
            section_data = self._get_section_data(section)
            if section_type == "table":
                return self._render_json_table(self._process_section_by_type(section))
            elif section_type == "bullet_list":
                return self._render_json_bullet_list(self._process_section_by_type(section))
            elif section_type == "heading":
                return self._render_json_heading(section_data)
            elif section_type == "paragraph":
                return self._render_json_paragraph(section_data)
            elif section_type == "code_block":
                return self._render_json_code_block(self._process_section_by_type(section))
            elif section_type == "image":
                return self._render_json_image(self._process_section_by_type(section))
            else:
                # Fallback to paragraph for unknown types
                return self._render_json_paragraph(section_data)
        except Exception as e:
            self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
            return f"*[Error rendering section: {str(e)}]*"

    @staticmethod
    def _escape_md_table_cell(value: Any) -> str:
        """Escape characters that would break a Markdown table cell.

        Pipes delimit columns and raw newlines terminate the row, so both
        must be neutralised; previously a cell containing ``|`` corrupted
        the whole table layout.
        """
        return str(value).replace("|", "\\|").replace("\n", "<br>")

    def _render_json_table(self, table_data: Dict[str, Any]) -> str:
        """Render a JSON table to a pipe-delimited markdown table."""
        try:
            headers = table_data.get("headers", [])
            rows = table_data.get("rows", [])
            if not headers or not rows:
                return ""
            markdown_parts = []
            # Create table header (cells escaped so "|" cannot break columns)
            markdown_parts.append(" | ".join(self._escape_md_table_cell(header) for header in headers))
            # Add separator line
            markdown_parts.append(" | ".join("---" for _ in headers))
            # Add data rows
            for row in rows:
                markdown_parts.append(" | ".join(self._escape_md_table_cell(cell_data) for cell_data in row))
            return '\n'.join(markdown_parts)
        except Exception as e:
            self.logger.warning(f"Error rendering table: {str(e)}")
            return ""

    def _render_json_bullet_list(self, list_data: Dict[str, Any]) -> str:
        """Render a JSON bullet list to markdown ("- item" lines)."""
        try:
            items = list_data.get("items", [])
            if not items:
                return ""
            markdown_parts = []
            for item in items:
                if isinstance(item, str):
                    markdown_parts.append(f"- {item}")
                elif isinstance(item, dict) and "text" in item:
                    markdown_parts.append(f"- {item['text']}")
            return '\n'.join(markdown_parts)
        except Exception as e:
            self.logger.warning(f"Error rendering bullet list: {str(e)}")
            return ""

    def _render_json_heading(self, heading_data: Dict[str, Any]) -> str:
        """Render a JSON heading to markdown; level clamped to 1..6."""
        try:
            level = heading_data.get("level", 1)
            text = heading_data.get("text", "")
            if text:
                level = max(1, min(6, level))
                return f"{'#' * level} {text}"
            return ""
        except Exception as e:
            self.logger.warning(f"Error rendering heading: {str(e)}")
            return ""

    def _render_json_paragraph(self, paragraph_data: Dict[str, Any]) -> str:
        """Render a JSON paragraph to markdown (plain text)."""
        try:
            text = paragraph_data.get("text", "")
            return text if text else ""
        except Exception as e:
            self.logger.warning(f"Error rendering paragraph: {str(e)}")
            return ""

    def _render_json_code_block(self, code_data: Dict[str, Any]) -> str:
        """Render a JSON code block to a fenced markdown block."""
        try:
            code = code_data.get("code", "")
            language = code_data.get("language", "")
            if code:
                if language:
                    return f"```{language}\n{code}\n```"
                else:
                    return f"```\n{code}\n```"
            return ""
        except Exception as e:
            self.logger.warning(f"Error rendering code block: {str(e)}")
            return ""

    def _render_json_image(self, image_data: Dict[str, Any]) -> str:
        """Render a JSON image to markdown.

        Base64 payloads cannot be embedded verbatim, so a truncated
        data-URI placeholder is emitted with the alt text.
        """
        try:
            alt_text = image_data.get("altText", "Image")
            base64_data = image_data.get("base64Data", "")
            if base64_data:
                # For base64 images, we can't embed them directly in markdown
                # So we'll use a placeholder with the alt text
                return f"![{alt_text}](data:image/png;base64,{base64_data[:50]}...)"
            else:
                return f"![{alt_text}](image-placeholder)"
        except Exception as e:
            self.logger.warning(f"Error rendering image: {str(e)}")
            return f"![{image_data.get('altText', 'Image')}](image-error)"

View file

@ -0,0 +1,642 @@
"""
PDF renderer for report generation using reportlab.
"""
from .rendererBaseTemplate import BaseRenderer
from typing import Dict, Any, Tuple, List
import io
import base64
from datetime import datetime, UTC
try:
from reportlab.lib.pagesizes import letter, A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
REPORTLAB_AVAILABLE = True
except ImportError:
REPORTLAB_AVAILABLE = False
class RendererPdf(BaseRenderer):
"""Renders content to PDF format using reportlab."""
@classmethod
def get_supported_formats(cls) -> List[str]:
"""Return supported PDF formats."""
return ['pdf']
@classmethod
def get_format_aliases(cls) -> List[str]:
"""Return format aliases."""
return ['document', 'print']
@classmethod
def get_priority(cls) -> int:
"""Return priority for PDF renderer."""
return 120
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
"""Render extracted JSON content to PDF format using AI-analyzed styling."""
try:
if not REPORTLAB_AVAILABLE:
# Fallback to HTML if reportlab not available
from .rendererHtml import RendererHtml
html_renderer = RendererHtml()
html_content, _ = await html_renderer.render(extracted_content, title, user_prompt, ai_service)
return html_content, "text/html"
# Generate PDF using AI-analyzed styling
pdf_content = await self._generate_pdf_from_json(extracted_content, title, user_prompt, ai_service)
return pdf_content, "application/pdf"
except Exception as e:
self.logger.error(f"Error rendering PDF: {str(e)}")
# Return minimal fallback
return f"PDF Generation Error: {str(e)}", "text/plain"
async def _generate_pdf_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
    """Generate base64-encoded PDF bytes from a structured JSON document.

    Args:
        json_content: Structured document dict; must contain a "sections" list.
        title: Fallback document title when metadata carries none.
        user_prompt: Original user request, used to steer AI styling.
        ai_service: Optional AI service used for style generation.

    Returns:
        Base64-encoded PDF content as a string.

    Raises:
        RuntimeError: If validation or PDF generation fails (still an
            Exception subclass, so existing callers keep working).
    """
    try:
        # Get AI-generated styling definitions (falls back to defaults).
        styles = await self._get_pdf_styles(user_prompt, ai_service)
        # Validate JSON structure
        if not isinstance(json_content, dict):
            raise ValueError("JSON content must be a dictionary")
        if "sections" not in json_content:
            raise ValueError("JSON content must contain 'sections' field")
        # Prefer the title from document metadata, else the caller's title.
        document_title = json_content.get("metadata", {}).get("title", title)
        # Truncate overlong titles so they don't wrap/overlap on the title
        # page. (Previously any long title was silently replaced by the
        # hard-coded string "PowerOn - Consent Agreement".)
        if len(document_title) > 40:
            document_title = document_title[:37].rstrip() + "..."
        # Create a buffer to hold the PDF
        buffer = io.BytesIO()
        doc = SimpleDocTemplate(
            buffer,
            pagesize=A4,
            rightMargin=72,
            leftMargin=72,
            topMargin=72,
            bottomMargin=18
        )
        # Build PDF content
        story = []
        # Title page
        title_style = self._create_title_style(styles)
        story.append(Paragraph(document_title, title_style))
        story.append(Spacer(1, 50))  # generous spacing to prevent overlap
        story.append(Paragraph(f"Generated: {self._format_timestamp()}", self._create_normal_style(styles)))
        story.append(Spacer(1, 30))  # spacing before page break
        story.append(PageBreak())
        # Process each section
        sections = json_content.get("sections", [])
        self.services.utils.debugLogToFile(f"PDF SECTIONS TO PROCESS: {len(sections)} sections", "PDF_RENDERER")
        for i, section in enumerate(sections):
            self.services.utils.debugLogToFile(f"PDF SECTION {i}: content_type={section.get('content_type', 'unknown')}, id={section.get('id', 'unknown')}", "PDF_RENDERER")
            section_elements = self._render_json_section(section, styles)
            self.services.utils.debugLogToFile(f"PDF SECTION {i} ELEMENTS: {len(section_elements)} elements", "PDF_RENDERER")
            story.extend(section_elements)
        # Build PDF and return it as base64
        doc.build(story)
        buffer.seek(0)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')
    except Exception as e:
        self.logger.error(f"Error generating PDF from JSON: {str(e)}")
        raise RuntimeError(f"PDF generation failed: {str(e)}") from e
async def _get_pdf_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]:
    """Resolve PDF styling definitions, preferring AI customization."""
    # Schema offered to the AI as the customization baseline.
    style_schema = {
        "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center", "space_after": 30},
        "heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 12, "space_before": 12},
        "heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 8, "space_before": 8},
        "paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left", "space_after": 6, "line_height": 1.2},
        "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center", "font_size": 12},
        "table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left", "font_size": 10},
        "bullet_list": {"font_size": 11, "color": "#2F2F2F", "space_after": 3},
        "code_block": {"font": "Courier", "font_size": 9, "color": "#2F2F2F", "background": "#F5F5F5", "space_after": 6}
    }
    prompt = self._create_ai_style_template("pdf", user_prompt, style_schema)
    # Shared base-template helper (same path the DOCX renderer uses).
    resolved = await self._get_ai_styles(ai_service, prompt, self._get_default_pdf_styles())
    if resolved is None:
        return self._get_default_pdf_styles()
    # Normalize hex colors, then repair unreadable fg/bg pairings.
    resolved = self._convert_colors_format(resolved)
    return self._validate_pdf_styles_contrast(resolved)
async def _get_ai_styles_with_pdf_colors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
    """Get AI styles with proper PDF color conversion.

    Sends *style_template* to the AI service, defensively parses the reply
    as JSON (tolerating code fences, chatty prefixes/suffixes and truncated
    output), then converts color values for PDF use. Any failure at any
    stage falls back to *default_styles*.
    """
    if not ai_service:
        return default_styles
    try:
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL
        request = AiCallRequest(prompt=style_template, context="", options=request_options)
        # Check if AI service is properly configured
        if not hasattr(ai_service, 'aiObjects') or not ai_service.aiObjects:
            self.logger.warning("AI service not properly configured, using defaults")
            return default_styles
        response = await ai_service.aiObjects.call(request)
        # Check if response is valid
        if not response:
            self.logger.warning("AI service returned no response, using defaults")
            return default_styles
        import json
        import re
        # Clean and parse JSON
        result = response.content.strip() if response and response.content else ""
        # Check if result is empty
        if not result:
            self.logger.warning("AI styling returned empty response, using defaults")
            return default_styles
        # Log the raw response for debugging
        self.logger.debug(f"AI styling raw response: {result[:200]}...")
        # Extract JSON from various formats: fenced ```json blocks first,
        # then bare fences without a closing newline match.
        json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
        if json_match:
            result = json_match.group(1).strip()
        elif result.startswith('```json'):
            result = re.sub(r'^```json\s*', '', result)
            result = re.sub(r'\s*```$', '', result)
        elif result.startswith('```'):
            result = re.sub(r'^```\s*', '', result)
            result = re.sub(r'\s*```$', '', result)
        # Try to extract JSON from explanatory text
        # NOTE(review): the first pattern only matches flat objects; nested
        # style dicts are handled by the brace-scan fallback further below.
        json_patterns = [
            r'\{[^{}]*"title"[^{}]*\}',  # Simple JSON object
            r'\{.*?"title".*?\}',  # JSON with title field
            r'\{.*?"font_size".*?\}',  # JSON with font_size field
        ]
        for pattern in json_patterns:
            json_match = re.search(pattern, result, re.DOTALL)
            if json_match:
                result = json_match.group(0)
                break
        # Additional cleanup - remove any leading/trailing whitespace and newlines
        result = result.strip()
        # Check if result is still empty after cleanup
        if not result:
            self.logger.warning("AI styling returned empty content after cleanup, using defaults")
            return default_styles
        # Try to parse JSON
        try:
            styles = json.loads(result)
            self.logger.debug(f"Successfully parsed AI styles: {list(styles.keys())}")
        except json.JSONDecodeError as json_error:
            self.logger.warning(f"AI styling returned invalid JSON: {json_error}")
            # Use print instead of logger to avoid truncation
            self.services.utils.debugLogToFile(f"FULL AI RESPONSE THAT FAILED TO PARSE: {result}", "PDF_RENDERER")
            self.services.utils.debugLogToFile(f"RESPONSE LENGTH: {len(result)} characters", "PDF_RENDERER")
            self.logger.warning(f"Raw content that failed to parse: {result}")
            # Try to fix incomplete (truncated) JSON by adding missing closing braces
            open_braces = result.count('{')
            close_braces = result.count('}')
            if open_braces > close_braces:
                # JSON is incomplete, add missing closing braces
                missing_braces = open_braces - close_braces
                result = result + '}' * missing_braces
                self.logger.info(f"Added {missing_braces} missing closing brace(s)")
                # Try parsing the fixed JSON
                try:
                    styles = json.loads(result)
                    self.logger.info("Successfully fixed incomplete JSON")
                except json.JSONDecodeError as fix_error:
                    self.logger.warning(f"Fixed JSON still invalid: {fix_error}")
                    # Try to extract just the JSON part if it's embedded in text
                    json_start = result.find('{')
                    json_end = result.rfind('}')
                    if json_start != -1 and json_end != -1 and json_end > json_start:
                        json_part = result[json_start:json_end+1]
                        try:
                            styles = json.loads(json_part)
                            self.logger.info("Successfully extracted JSON from explanatory text")
                        except json.JSONDecodeError:
                            self.logger.warning("Could not extract valid JSON from response, using defaults")
                            return default_styles
                    else:
                        return default_styles
            else:
                # Braces are balanced but parsing still failed: try to
                # extract just the JSON part if it's embedded in text.
                json_start = result.find('{')
                json_end = result.rfind('}')
                if json_start != -1 and json_end != -1 and json_end > json_start:
                    json_part = result[json_start:json_end+1]
                    try:
                        styles = json.loads(json_part)
                        self.logger.info("Successfully extracted JSON from explanatory text")
                    except json.JSONDecodeError:
                        self.logger.warning("Could not extract valid JSON from response, using defaults")
                        return default_styles
                else:
                    return default_styles
        # Convert colors to PDF format (keep as hex strings, PDF renderer will convert them)
        styles = self._convert_colors_format(styles)
        return styles
    except Exception as e:
        self.logger.warning(f"AI styling failed: {str(e)}, using defaults")
        return default_styles
def _convert_colors_format(self, styles: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize hex color strings so downstream PDF helpers can consume them."""
    try:
        for style_name in styles:
            config = styles[style_name]
            if not isinstance(config, dict):
                continue
            for prop in config:
                value = config[prop]
                if not isinstance(value, str) or not value.startswith('#'):
                    continue
                if len(value) == 7:
                    # '#RRGGBB' -> 'FFRRGGBB': opaque alpha prepended, '#' dropped
                    config[prop] = f"FF{value[1:]}"
                # 9-char '#AARRGGBB' values are already in the expected form
        return styles
    except Exception as e:
        self.logger.warning(f"Color conversion failed: {str(e)}")
        return styles
def _get_safe_color(self, color_value: str, default: str = "#000000") -> str:
    """Return a hex color string safe to feed to the PDF color helpers."""
    if not isinstance(color_value, str) or not color_value.startswith('#'):
        return default
    if len(color_value) == 7:
        # '#RRGGBB' -> opaque 'FFRRGGBB'
        return f"FF{color_value[1:]}"
    if len(color_value) == 9:
        # Already in '#AARRGGBB' form
        return color_value
    return default
def _validate_pdf_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
    """Repair AI-generated table styles whose fg/bg would be unreadable."""
    try:
        # Replacement (background, text) pairs applied when a style has
        # both colors fully white or both fully black.
        corrections = {
            "table_header": ("#4F4F4F", "#FFFFFF"),
            "table_cell": ("#FFFFFF", "#2F2F2F"),
        }
        for style_key, (fixed_bg, fixed_text) in corrections.items():
            if style_key not in styles:
                continue
            entry = styles[style_key]
            bg = entry.get("background", "#FFFFFF").upper()
            fg = entry.get("text_color", "#000000").upper()
            if bg == fg and bg in ("#FFFFFF", "#000000"):
                entry["background"] = fixed_bg
                entry["text_color"] = fixed_text
        return styles
    except Exception as e:
        self.logger.warning(f"Style validation failed: {str(e)}")
        return self._get_default_pdf_styles()
def _get_default_pdf_styles(self) -> Dict[str, Any]:
    """Built-in fallback styling used when AI styling is unavailable or invalid."""
    body_color = "#2F2F2F"
    return {
        "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center", "space_after": 30},
        "heading1": {"font_size": 18, "color": body_color, "bold": True, "align": "left", "space_after": 12, "space_before": 12},
        "heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 8, "space_before": 8},
        "paragraph": {"font_size": 11, "color": body_color, "bold": False, "align": "left", "space_after": 6, "line_height": 1.2},
        "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center", "font_size": 12},
        "table_cell": {"background": "#FFFFFF", "text_color": body_color, "bold": False, "align": "left", "font_size": 10},
        "bullet_list": {"font_size": 11, "color": body_color, "space_after": 3},
        "code_block": {"font": "Courier", "font_size": 9, "color": body_color, "background": "#F5F5F5", "space_after": 6},
    }
def _create_title_style(self, styles: Dict[str, Any]) -> ParagraphStyle:
    """Build the ParagraphStyle used for the document title."""
    spec = styles.get("title", {})
    title_color = spec.get("color", "#1F4E79")
    space_after = spec.get("space_after", 30)
    font_size = spec.get("font_size", 20)  # smaller than schema default to limit wrapping
    # Trace the effective title styling for debugging.
    self.services.utils.debugLogToFile(f"PDF TITLE COLOR: {title_color} -> {self._hex_to_color(title_color)}", "PDF_RENDERER")
    self.services.utils.debugLogToFile(f"PDF TITLE SPACE_AFTER: {space_after}", "PDF_RENDERER")
    return ParagraphStyle(
        'CustomTitle',
        fontSize=font_size,
        spaceAfter=space_after,
        alignment=self._get_alignment(spec.get("align", "center")),
        textColor=self._hex_to_color(title_color),
        leading=font_size * 1.4,  # room for multi-line titles
        spaceBefore=0  # no space before the title
    )
def _create_heading_style(self, styles: Dict[str, Any], level: int) -> ParagraphStyle:
    """Build a ParagraphStyle for a heading of the given level (1-based)."""
    # Fall back to the heading1 definition when the level has no entry.
    spec = styles.get(f"heading{level}", styles.get("heading1", {}))
    return ParagraphStyle(
        f'CustomHeading{level}',
        fontSize=spec.get("font_size", 18 - level * 2),
        spaceAfter=spec.get("space_after", 12),
        spaceBefore=spec.get("space_before", 12),
        alignment=self._get_alignment(spec.get("align", "left")),
        textColor=self._hex_to_color(spec.get("color", "#2F2F2F"))
    )
def _create_normal_style(self, styles: Dict[str, Any]) -> ParagraphStyle:
    """Build the ParagraphStyle used for body paragraphs."""
    spec = styles.get("paragraph", {})
    font_size = spec.get("font_size", 11)
    return ParagraphStyle(
        'CustomNormal',
        fontSize=font_size,
        spaceAfter=spec.get("space_after", 6),
        alignment=self._get_alignment(spec.get("align", "left")),
        textColor=self._hex_to_color(spec.get("color", "#2F2F2F")),
        # leading = line-height multiplier times the font size
        leading=spec.get("line_height", 1.2) * font_size
    )
def _get_alignment(self, align: str) -> int:
    """Convert an alignment keyword (or legacy numeric string) to a reportlab constant.

    Unknown or non-string values fall back to TA_LEFT.
    """
    if not align or not isinstance(align, str):
        return TA_LEFT
    align_map = {
        "center": TA_CENTER,
        "left": TA_LEFT,
        "justify": TA_JUSTIFY,
        # reportlab.lib.enums does provide TA_RIGHT; the previous mapping
        # silently rendered right-aligned content left-aligned.
        "right": TA_RIGHT,
        "0": TA_LEFT,  # handle numeric strings
        "1": TA_CENTER,
        "2": TA_JUSTIFY
    }
    return align_map.get(align.lower().strip(), TA_LEFT)
def _get_table_alignment(self, align: str) -> str:
    """Map an alignment keyword (or numeric string) to a ReportLab table alignment."""
    if not isinstance(align, str) or not align:
        return 'LEFT'
    key = align.lower().strip()
    if key in ("center", "1"):
        return 'CENTER'
    if key == "right":
        return 'RIGHT'
    # "justify"/"2" and anything unknown fall back to LEFT:
    # reportlab tables don't support justified cell text.
    return 'LEFT'
def _hex_to_color(self, hex_color: str) -> colors.Color:
    """Convert a hex color string ('#RRGGBB' or '[#]AARRGGBB') to a reportlab Color.

    Malformed or non-string input falls back to black.
    """
    try:
        hex_color = hex_color.lstrip('#')
        # aRGB form (8 chars): drop the alpha byte, only RGB is used here.
        if len(hex_color) == 8:
            hex_color = hex_color[2:]
        if len(hex_color) == 6:
            r = int(hex_color[0:2], 16) / 255.0
            g = int(hex_color[2:4], 16) / 255.0
            b = int(hex_color[4:6], 16) / 255.0
            return colors.Color(r, g, b)
        # Fallback for other formats
        return colors.black
    except (AttributeError, TypeError, ValueError):
        # Previously a bare `except:` which also swallowed
        # SystemExit/KeyboardInterrupt; only conversion errors belong here.
        return colors.black
def _render_json_section(self, section: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a single JSON section to PDF flowables using AI-generated styles."""
    try:
        section_type = self._get_section_type(section)
        elements = self._get_section_data(section)
        # Content-type dispatch; unknown types render as paragraphs.
        renderers = {
            "table": self._render_json_table,
            "bullet_list": self._render_json_bullet_list,
            "heading": self._render_json_heading,
            "paragraph": self._render_json_paragraph,
            "code_block": self._render_json_code_block,
            "image": self._render_json_image,
        }
        render_element = renderers.get(section_type, self._render_json_paragraph)
        rendered = []
        for element in elements:
            rendered.extend(render_element(element, styles))
        return rendered
    except Exception as e:
        self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
        return [Paragraph(f"[Error rendering section: {str(e)}]", self._create_normal_style(styles))]
def _render_json_table(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON table definition into reportlab flowables."""
    try:
        headers = table_data.get("headers", [])
        rows = table_data.get("rows", [])
        # A table needs both a header row and at least one body row.
        if not headers or not rows:
            return []
        table = Table([headers] + rows)
        header_spec = styles.get("table_header", {})
        cell_spec = styles.get("table_cell", {})
        header_font = 'Helvetica-Bold' if header_spec.get("bold", True) else 'Helvetica'
        table.setStyle(TableStyle([
            # Header row
            ('BACKGROUND', (0, 0), (-1, 0), self._hex_to_color(header_spec.get("background", "#4F4F4F"))),
            ('TEXTCOLOR', (0, 0), (-1, 0), self._hex_to_color(header_spec.get("text_color", "#FFFFFF"))),
            ('ALIGN', (0, 0), (-1, -1), self._get_table_alignment(cell_spec.get("align", "left"))),
            ('FONTNAME', (0, 0), (-1, 0), header_font),
            ('FONTSIZE', (0, 0), (-1, 0), header_spec.get("font_size", 12)),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            # Body rows
            ('BACKGROUND', (0, 1), (-1, -1), self._hex_to_color(cell_spec.get("background", "#FFFFFF"))),
            ('FONTSIZE', (0, 1), (-1, -1), cell_spec.get("font_size", 10)),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))
        return [table, Spacer(1, 12)]
    except Exception as e:
        self.logger.warning(f"Error rendering table: {str(e)}")
        return []
def _render_json_bullet_list(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON bullet list (plain strings or {"text": ...} dicts)."""
    try:
        bullet_spec = styles.get("bullet_list", {})
        rendered = []
        for entry in list_data.get("items", []):
            if isinstance(entry, str):
                rendered.append(Paragraph(entry, self._create_normal_style(styles)))
            elif isinstance(entry, dict) and "text" in entry:
                rendered.append(Paragraph(str(entry["text"]), self._create_normal_style(styles)))
        # Trailing spacer only when at least one item rendered.
        if rendered:
            rendered.append(Spacer(1, bullet_spec.get("space_after", 3)))
        return rendered
    except Exception as e:
        self.logger.warning(f"Error rendering bullet list: {str(e)}")
        return []
def _render_json_heading(self, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON heading element."""
    try:
        text = heading_data.get("text", "")
        if not text:
            return []
        # Clamp the heading level to the supported h1..h6 range.
        level = max(1, min(6, heading_data.get("level", 1)))
        return [Paragraph(text, self._create_heading_style(styles, level))]
    except Exception as e:
        self.logger.warning(f"Error rendering heading: {str(e)}")
        return []
def _render_json_paragraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON paragraph element; empty text yields no flowables."""
    try:
        text = paragraph_data.get("text", "")
        return [Paragraph(text, self._create_normal_style(styles))] if text else []
    except Exception as e:
        self.logger.warning(f"Error rendering paragraph: {str(e)}")
        return []
def _render_json_code_block(self, code_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON code block: optional language label plus monospace body."""
    try:
        code = code_data.get("code", "")
        if not code:
            return []
        language = code_data.get("language", "")
        spec = styles.get("code_block", {})
        font_size = spec.get("font_size", 9)
        text_color = self._hex_to_color(spec.get("color", "#2F2F2F"))
        flowables = []
        if language:
            label_style = ParagraphStyle(
                'CodeLanguage',
                fontSize=font_size,
                textColor=text_color,
                fontName='Helvetica-Bold'
            )
            flowables.append(Paragraph(f"Code ({language}):", label_style))
        body_style = ParagraphStyle(
            'CodeBlock',
            fontSize=font_size,
            textColor=text_color,
            fontName=spec.get("font", "Courier"),
            backColor=self._hex_to_color(spec.get("background", "#F5F5F5")),
            spaceAfter=spec.get("space_after", 6)
        )
        flowables.append(Paragraph(code, body_style))
        return flowables
    except Exception as e:
        self.logger.warning(f"Error rendering code block: {str(e)}")
        return []
def _render_json_image(self, image_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON image element as a textual placeholder."""
    try:
        alt_text = image_data.get("altText", "Image")
        # Placeholder only: embedding raw image bytes via reportlab is
        # intentionally not implemented here.
        if image_data.get("base64Data", ""):
            return [Paragraph(f"[Image: {alt_text}]", self._create_normal_style(styles))]
        return []
    except Exception as e:
        self.logger.warning(f"Error rendering image: {str(e)}")
        return [Paragraph(f"[Image: {image_data.get('altText', 'Image')}]", self._create_normal_style(styles))]

View file

@ -0,0 +1,885 @@
import logging
import base64
import io
from typing import Dict, Any, Optional, Tuple, List
from .rendererBaseTemplate import BaseRenderer
logger = logging.getLogger(__name__)
class RendererPptx(BaseRenderer):
    """Renderer for PowerPoint (.pptx) files using python-pptx library."""

    def __init__(self):
        super().__init__()
        # Formats handled by this renderer and the MIME type it emits.
        self.supported_formats = ["pptx", "ppt"]
        self.output_mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"

    @classmethod
    def get_supported_formats(cls) -> list:
        """Get list of supported output formats."""
        return ["pptx", "ppt"]
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
    """
    Render content as a PowerPoint presentation from JSON data.

    Args:
        extracted_content: JSON content to render as presentation
        title: Title for the presentation
        user_prompt: User prompt for AI styling
        ai_service: AI service for styling

    Returns:
        Tuple of (base64-encoded .pptx data, MIME type). On failure a
        (plain-text error message, "text/plain") tuple is returned instead.
    """
    try:
        from pptx import Presentation
        from pptx.util import Inches, Pt
        from pptx.enum.text import PP_ALIGN
        from pptx.dml.color import RGBColor
        # Get AI-generated styling definitions first (slide size depends on them).
        styles = await self._get_pptx_styles(user_prompt, ai_service)
        prs = Presentation()
        # Set slide size based on user intent (default to 16:9).
        if styles.get("slide_size", "16:9") == "4:3":
            prs.slide_width = Inches(10)
            prs.slide_height = Inches(7.5)
        else:
            prs.slide_width = Inches(13.33)
            prs.slide_height = Inches(7.5)
        # Generate slides from JSON content.
        slides_data = await self._parse_json_to_slides(extracted_content, title, styles)
        logger.info(f"Parsed {len(slides_data)} slides from JSON content")
        logger.info(f"JSON content preview: {str(extracted_content)[:200]}...")
        for i, slide_data in enumerate(slides_data):
            logger.info(f"Slide {i+1}: '{slide_data.get('title', 'No title')}' - {len(slide_data.get('content', ''))} chars")
            slide_content = slide_data.get('content', '')
            if slide_content:
                logger.info(f" Content preview: '{slide_content[:100]}...'")
            else:
                logger.warning(f" ⚠️ Slide {i+1} has NO content!")
            # Create slide with appropriate layout based on content.
            slide_layout_index = self._get_slide_layout_index(slide_data, styles)
            slide = prs.slides.add_slide(prs.slide_layouts[slide_layout_index])
            # Title placeholder with AI-generated styling.
            title_shape = slide.shapes.title
            title_shape.text = slide_data.get("title", "Slide")
            title_style = styles.get("title", {})
            if title_shape.text_frame.paragraphs[0].font:
                title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 44))
                title_shape.text_frame.paragraphs[0].font.bold = title_style.get("bold", True)
                title_color = self._get_safe_color(title_style.get("color", (31, 78, 121)))
                title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color)
            # Body placeholder; assumes the chosen layout exposes
            # placeholder index 1 — TODO confirm for custom layouts.
            content_shape = slide.placeholders[1]
            text_frame = content_shape.text_frame
            text_frame.clear()
            # Bind the paragraph style up front: the alignment logic below
            # uses it for every branch (previously it was only bound in two
            # of the four branches and could raise NameError when a slide
            # started with a heading).
            paragraph_style = styles.get("paragraph", {})
            paragraphs = slide_data.get("content", "").split('\n\n')
            for j, paragraph in enumerate(paragraphs):
                if not paragraph.strip():
                    continue
                p = text_frame.paragraphs[0] if j == 0 else text_frame.add_paragraph()
                p.text = paragraph.strip()
                # NOTE: '##' must be tested before '#': every '##' string
                # also startswith '#', so the old ordering made the
                # subheading branch unreachable.
                if paragraph.startswith('##'):
                    # Subheader
                    p.text = paragraph.lstrip('#').strip()
                    subheading_style = styles.get("subheading", {})
                    p.font.size = Pt(subheading_style.get("font_size", 24))
                    p.font.bold = subheading_style.get("bold", True)
                    subheading_color = self._get_safe_color(subheading_style.get("color", (79, 79, 79)))
                    p.font.color.rgb = RGBColor(*subheading_color)
                elif paragraph.startswith('#'):
                    # Header
                    p.text = paragraph.lstrip('#').strip()
                    heading_style = styles.get("heading", {})
                    p.font.size = Pt(heading_style.get("font_size", 32))
                    p.font.bold = heading_style.get("bold", True)
                    heading_color = self._get_safe_color(heading_style.get("color", (47, 47, 47)))
                    p.font.color.rgb = RGBColor(*heading_color)
                elif paragraph.startswith('*') and paragraph.endswith('*'):
                    # Bold text
                    p.text = paragraph.strip('*')
                    p.font.size = Pt(paragraph_style.get("font_size", 18))
                    p.font.bold = True
                    paragraph_color = self._get_safe_color(paragraph_style.get("color", (47, 47, 47)))
                    p.font.color.rgb = RGBColor(*paragraph_color)
                else:
                    # Regular text
                    p.font.size = Pt(paragraph_style.get("font_size", 18))
                    p.font.bold = paragraph_style.get("bold", False)
                    paragraph_color = self._get_safe_color(paragraph_style.get("color", (47, 47, 47)))
                    p.font.color.rgb = RGBColor(*paragraph_color)
                # Alignment always follows the paragraph style.
                align = paragraph_style.get("align", "left")
                if align == "center":
                    p.alignment = PP_ALIGN.CENTER
                elif align == "right":
                    p.alignment = PP_ALIGN.RIGHT
                else:
                    p.alignment = PP_ALIGN.LEFT
        # If no slides were created, create a default title slide.
        if not slides_data:
            slide = prs.slides.add_slide(prs.slide_layouts[0])
            title_shape = slide.shapes.title
            title_shape.text = title
            title_style = styles.get("title", {})
            if title_shape.text_frame.paragraphs[0].font:
                title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 48))
                title_shape.text_frame.paragraphs[0].font.bold = title_style.get("bold", True)
                title_color = self._get_safe_color(title_style.get("color", (31, 78, 121)))
                title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color)
            subtitle_shape = slide.placeholders[1]
            subtitle_shape.text = "Generated by PowerOn AI System"
            paragraph_style = styles.get("paragraph", {})
            if subtitle_shape.text_frame.paragraphs[0].font:
                subtitle_shape.text_frame.paragraphs[0].font.size = Pt(paragraph_style.get("font_size", 20))
                subtitle_shape.text_frame.paragraphs[0].font.bold = paragraph_style.get("bold", False)
                paragraph_color = self._get_safe_color(paragraph_style.get("color", (47, 47, 47)))
                subtitle_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*paragraph_color)
        # Serialize the presentation and return it base64-encoded.
        buffer = io.BytesIO()
        prs.save(buffer)
        buffer.seek(0)
        pptx_bytes = buffer.getvalue()
        pptx_base64 = base64.b64encode(pptx_bytes).decode('utf-8')
        logger.info(f"Successfully rendered PowerPoint presentation: {len(pptx_bytes)} bytes")
        return pptx_base64, "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    except ImportError:
        logger.error("python-pptx library not installed. Install with: pip install python-pptx")
        return "python-pptx library not installed", "text/plain"
    except Exception as e:
        logger.error(f"Error rendering PowerPoint presentation: {str(e)}")
        return f"Error rendering PowerPoint presentation: {str(e)}", "text/plain"
def _parse_content_to_slides(self, content: str, title: str) -> list:
    """
    Parse raw text content into slide data dictionaries.

    Args:
        content: Content to parse
        title: Presentation title (unused placeholder for symmetry with callers)

    Returns:
        List of {"title": ..., "content": ...} dictionaries
    """
    slides = []
    for index, raw_section in enumerate(self._split_content_into_slides(content)):
        section = raw_section.strip()
        if not section:
            continue
        entry = {"title": f"Slide {index + 1}", "content": section}
        lines = section.split('\n')
        if lines and lines[0].startswith('#'):
            # Markdown header: strip the '#' marks and promote it to the slide title.
            entry["title"] = lines[0].lstrip('#').strip()
            entry["content"] = '\n'.join(lines[1:]).strip()
        elif lines and lines[0].strip():
            # Heuristic: a short first line without a trailing period reads
            # like a title, so promote it.
            candidate = lines[0].strip()
            if len(candidate) < 100 and not candidate.endswith('.'):
                entry["title"] = candidate
                entry["content"] = '\n'.join(lines[1:]).strip()
        slides.append(entry)
    return slides
def _split_content_into_slides(self, content: str) -> list:
    """
    Split content into per-slide chunks, preferring markdown headers.

    Args:
        content: Content to split

    Returns:
        List of slide content strings
    """
    import re
    # '#' or '##' headers are the usual slide boundaries in AI output.
    header_re = re.compile(r'^(#{1,2})\s+(.+)$')
    chunks = []
    pending = []
    for raw_line in content.split('\n'):
        if header_re.match(raw_line.strip()):
            # A header opens a new slide: flush whatever was collected.
            collected = '\n'.join(pending).strip()
            if collected:
                chunks.append(collected)
            pending = [raw_line]
        else:
            pending.append(raw_line)
    collected = '\n'.join(pending).strip()
    if collected:
        chunks.append(collected)
    # Header splitting worked: done.
    if len(chunks) > 1:
        return chunks
    # Fallbacks: triple-newline, then blank-line separated sections.
    for separator in ('\n\n\n', '\n\n'):
        parts = [part.strip() for part in content.split(separator) if part.strip()]
        if len(parts) > 1:
            return parts
    # Last resort: the whole content as a single slide.
    return [content.strip()]
def get_output_mime_type(self) -> str:
    """Get MIME type for rendered output.

    Returns the OOXML presentation MIME type assigned in ``__init__``.
    """
    return self.output_mime_type
async def _get_pptx_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]:
    """Resolve PowerPoint styling definitions, preferring AI customization."""
    # Schema offered to the AI as the customization baseline.
    style_schema = {
        "title": {"font_size": 52, "color": "#1B365D", "bold": True, "align": "center"},
        "heading": {"font_size": 36, "color": "#2C5F2D", "bold": True, "align": "left"},
        "subheading": {"font_size": 28, "color": "#4A90E2", "bold": True, "align": "left"},
        "paragraph": {"font_size": 20, "color": "#2F2F2F", "bold": False, "align": "left"},
        "bullet_list": {"font_size": 20, "color": "#2F2F2F", "indent": 20},
        "table_header": {"font_size": 18, "color": "#FFFFFF", "bold": True, "background": "#1B365D"},
        "table_cell": {"font_size": 16, "color": "#2F2F2F", "bold": False, "background": "#F8F9FA"},
        "slide_size": "16:9",
        "content_per_slide": "concise",
        "design_theme": "corporate",
        "color_scheme": "professional",
        "background_style": "clean",
        "accent_colors": ["#1B365D", "#2C5F2D", "#4A90E2", "#6B7280"],
        "professional_grade": True,
        "executive_ready": True
    }
    prompt = self._create_professional_pptx_template(user_prompt, style_schema)
    # pptx-specific helper ensures proper color conversion for python-pptx.
    resolved = await self._get_ai_styles_with_pptx_colors(ai_service, prompt, self._get_default_pptx_styles())
    # Enforce PowerPoint-specific readability constraints before use.
    return self._validate_pptx_styles_readability(resolved)
def _create_professional_pptx_template(self, user_prompt: str, style_schema: Dict[str, Any]) -> str:
"""Create a professional PowerPoint-specific AI style template for corporate-quality slides."""
import json
schema_json = json.dumps(style_schema, indent=4)
return f"""Customize the JSON below for professional PowerPoint slides.
User Request: {user_prompt or "Create professional corporate slides"}
Rules:
- Use professional colors (blues, grays, deep greens)
- Large, readable font sizes
- High contrast
- Sophisticated color palettes
Return ONLY this JSON with your changes:
{schema_json}
JSON ONLY. NO OTHER TEXT."""
async def _get_ai_styles_with_pptx_colors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
    """Request style JSON from the AI service and convert colors for PowerPoint.

    Args:
        ai_service: Service exposing ``aiObjects.call``; when absent or not
            configured, ``default_styles`` is returned unchanged.
        style_template: Prompt instructing the model to emit style JSON.
        default_styles: Fallback style dictionary used on any failure.

    Returns:
        Parsed style dictionary with hex colors converted to RGB tuples, or
        ``default_styles`` whenever the response cannot be parsed.
    """
    if not ai_service:
        return default_styles
    try:
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL
        request = AiCallRequest(prompt=style_template, context="", options=request_options)
        # Check if AI service is properly configured
        if not hasattr(ai_service, 'aiObjects') or not ai_service.aiObjects:
            self.logger.warning("AI service not properly configured, using defaults")
            return default_styles
        response = await ai_service.aiObjects.call(request)
        # Check if response is valid
        if not response:
            self.logger.warning("AI service returned no response, using defaults")
            return default_styles
        import json
        import re
        # Clean and parse JSON
        result = response.content.strip() if response and response.content else ""
        # Check if result is empty
        if not result:
            self.logger.warning("AI styling returned empty response, using defaults")
            return default_styles
        # Log the raw response for debugging
        self.logger.debug(f"AI styling raw response: {result[:200]}...")
        # Extract JSON from various formats (fenced ```json blocks, plain fences,
        # or raw text).
        json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
        if json_match:
            result = json_match.group(1).strip()
        elif result.startswith('```json'):
            result = re.sub(r'^```json\s*', '', result)
            result = re.sub(r'\s*```$', '', result)
        elif result.startswith('```'):
            result = re.sub(r'^```\s*', '', result)
            result = re.sub(r'\s*```$', '', result)
        # Try to extract JSON from explanatory text.
        # NOTE(review): these non-greedy patterns run even when the fence
        # extraction above already produced valid JSON, and can truncate a
        # multi-key object just past "title"/"font_size". The find('{') /
        # rfind('}') rescue below usually recovers, but the patterns look
        # over-eager — confirm intent before tightening.
        json_patterns = [
            r'\{[^{}]*"title"[^{}]*\}',  # Simple JSON object
            r'\{.*?"title".*?\}',  # JSON with title field
            r'\{.*?"font_size".*?\}',  # JSON with font_size field
        ]
        for pattern in json_patterns:
            json_match = re.search(pattern, result, re.DOTALL)
            if json_match:
                result = json_match.group(0)
                break
        # Additional cleanup - remove any leading/trailing whitespace and newlines
        result = result.strip()
        # Check if result is still empty after cleanup
        if not result:
            self.logger.warning("AI styling returned empty content after cleanup, using defaults")
            return default_styles
        # Try to parse JSON
        try:
            styles = json.loads(result)
            self.logger.debug(f"Successfully parsed AI styles: {list(styles.keys())}")
        except json.JSONDecodeError as json_error:
            self.logger.warning(f"AI styling returned invalid JSON: {json_error}")
            self.logger.warning(f"Raw content that failed to parse: {result[:100]}...")
            # Try to extract just the JSON part if it's embedded in text:
            # take the outermost brace span and re-parse.
            json_start = result.find('{')
            json_end = result.rfind('}')
            if json_start != -1 and json_end != -1 and json_end > json_start:
                json_part = result[json_start:json_end+1]
                try:
                    styles = json.loads(json_part)
                    self.logger.info("Successfully extracted JSON from explanatory text")
                    self.logger.debug(f"Extracted AI styles: {list(styles.keys())}")
                except json.JSONDecodeError:
                    self.logger.warning("Could not extract valid JSON from response, using defaults")
                    return default_styles
            else:
                return default_styles
        # Convert colors to PowerPoint RGB format
        styles = self._convert_colors_format(styles)
        return styles
    except Exception as e:
        self.logger.warning(f"AI styling failed: {str(e)}, using defaults")
        return default_styles
def _convert_colors_format(self, styles: Dict[str, Any]) -> Dict[str, Any]:
"""Convert hex colors to RGB format for PowerPoint compatibility."""
try:
for style_name, style_config in styles.items():
if isinstance(style_config, dict):
for prop, value in style_config.items():
if isinstance(value, str) and value.startswith('#'):
# Convert hex to RGB tuple for PowerPoint
hex_color = value.lstrip('#')
if len(hex_color) == 6:
r = int(hex_color[0:2], 16)
g = int(hex_color[2:4], 16)
b = int(hex_color[4:6], 16)
styles[style_name][prop] = (r, g, b)
elif len(hex_color) == 8: # aRGB format
r = int(hex_color[2:4], 16)
g = int(hex_color[4:6], 16)
b = int(hex_color[6:8], 16)
styles[style_name][prop] = (r, g, b)
return styles
except Exception as e:
self.logger.warning(f"Color conversion failed: {str(e)}")
return styles
def _get_safe_color(self, color_value, default=(0, 0, 0)) -> tuple:
"""Get a safe RGB color tuple for PowerPoint."""
if isinstance(color_value, tuple) and len(color_value) == 3:
return color_value
elif isinstance(color_value, str) and color_value.startswith('#'):
hex_color = color_value.lstrip('#')
if len(hex_color) == 6:
r = int(hex_color[0:2], 16)
g = int(hex_color[2:4], 16)
b = int(hex_color[4:6], 16)
return (r, g, b)
elif len(hex_color) == 8: # aRGB format
r = int(hex_color[2:4], 16)
g = int(hex_color[4:6], 16)
b = int(hex_color[6:8], 16)
return (r, g, b)
return default
def _validate_pptx_styles_readability(self, styles: Dict[str, Any]) -> Dict[str, Any]:
"""Validate and fix readability issues in AI-generated styles."""
try:
# Ensure minimum font sizes for PowerPoint readability
min_font_sizes = {
"title": 36,
"heading": 24,
"subheading": 20,
"paragraph": 14,
"bullet_list": 14,
"table_header": 12,
"table_cell": 12
}
for style_name, min_size in min_font_sizes.items():
if style_name in styles:
current_size = styles[style_name].get("font_size", 12)
if current_size < min_size:
styles[style_name]["font_size"] = min_size
return styles
except Exception as e:
logger.warning(f"Style validation failed: {str(e)}")
return self._get_default_pptx_styles()
def _get_default_pptx_styles(self) -> Dict[str, Any]:
"""Default PowerPoint styles with corporate professional color scheme."""
return {
"title": {"font_size": 52, "color": (27, 54, 93), "bold": True, "align": "center"},
"heading": {"font_size": 36, "color": (44, 95, 45), "bold": True, "align": "left"},
"subheading": {"font_size": 28, "color": (74, 144, 226), "bold": True, "align": "left"},
"paragraph": {"font_size": 20, "color": (47, 47, 47), "bold": False, "align": "left"},
"bullet_list": {"font_size": 20, "color": (47, 47, 47), "indent": 20},
"table_header": {"font_size": 18, "color": (255, 255, 255), "bold": True, "background": (27, 54, 93)},
"table_cell": {"font_size": 16, "color": (47, 47, 47), "bold": False, "background": (248, 249, 250)},
"slide_size": "16:9",
"content_per_slide": "concise",
"design_theme": "corporate",
"color_scheme": "professional",
"background_style": "clean",
"accent_colors": [(27, 54, 93), (44, 95, 45), (74, 144, 226), (107, 114, 128)],
"professional_grade": True,
"executive_ready": True
}
async def _parse_json_to_slides(self, json_content: Dict[str, Any], title: str, styles: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Parse JSON content into slide data structure.

    Builds a title slide, then delegates section grouping to
    _create_slides_from_sections; guarantees at least one content slide.

    Args:
        json_content: JSON content to parse (must contain a "sections" list)
        title: Presentation title (overridden by metadata.title when present)
        styles: AI-generated styles
    Returns:
        List of slide data dictionaries ({"title", "content"})
    """
    slides = []
    try:
        # Validate JSON structure
        if not isinstance(json_content, dict):
            raise ValueError("JSON content must be a dictionary")
        if "sections" not in json_content:
            raise ValueError("JSON content must contain 'sections' field")
        # Use title from JSON metadata if available, otherwise use provided title
        document_title = json_content.get("metadata", {}).get("title", title)
        # Create title slide
        slides.append({
            "title": document_title,
            "content": "Generated by PowerOn AI System\n\n" + self._format_timestamp()
        })
        # Process sections into slides based on content and user intent
        sections = json_content.get("sections", [])
        slides.extend(self._create_slides_from_sections(sections, styles))
        # If no content slides were created, create a default content slide
        if len(slides) == 1:  # Only title slide
            slides.append({
                "title": "Content Overview",
                "content": "No structured content found in the source documents.\n\nPlease check the source documents and try again."
            })
        return slides
    except Exception as e:
        # NOTE(review): module-level `logger` assumed here (other methods in
        # this class use self.logger) — confirm it exists in this module.
        logger.error(f"Error parsing JSON to slides: {str(e)}")
        # Return minimal fallback slides
        return [
            {
                "title": title,
                "content": "Error parsing content for presentation"
            }
        ]
def _create_slide_from_section(self, section: Dict[str, Any], styles: Dict[str, Any]) -> Dict[str, Any]:
    """Build one slide dict ({"title", "content"}) from a JSON section.

    The slide title comes from the first heading element (for heading
    sections) or the section's own "title" field. Elements are formatted
    according to the section's content_type.

    Returns:
        Slide dictionary, or None when formatting raises.
    """
    try:
        # Resolve the slide title.
        section_title = "Untitled Section"
        if section.get("content_type") == "heading":
            # Extract text from elements array
            for element in section.get("elements", []):
                if isinstance(element, dict) and "text" in element:
                    section_title = element.get("text", "Untitled Section")
                    break
        elif section.get("title"):
            section_title = section.get("title")

        content_type = section.get("content_type", "paragraph")
        elements = section.get("elements", [])

        # Format the section body. The table formatter consumes the whole
        # element list; the other formatters each expect a single element
        # dict, so they are applied per element. (Previously the raw list
        # was passed to them, which made their internal `.get` calls fail
        # and silently produced empty slide content.)
        content_parts = []
        if content_type == "table":
            content_parts.append(self._format_table_for_slide(elements))
        else:
            if content_type == "list":
                formatter = self._format_list_for_slide
            elif content_type == "heading":
                formatter = self._format_heading_for_slide
            elif content_type == "code":
                formatter = self._format_code_for_slide
            else:  # "paragraph" and unknown types
                formatter = self._format_paragraph_for_slide
            for element in elements:
                content_parts.append(formatter(element))

        # Combine content parts, dropping empties.
        slide_content = "\n\n".join(filter(None, content_parts))
        return {
            "title": section_title,
            "content": slide_content
        }
    except Exception as e:
        logger.warning(f"Error creating slide from section: {str(e)}")
        return None
def _format_table_for_slide(self, elements: List[Dict[str, Any]]) -> str:
"""Format table data for slide presentation."""
try:
# Extract table data from elements array
headers = []
rows = []
for element in elements:
if isinstance(element, dict) and "headers" in element and "rows" in element:
headers = element.get("headers", [])
rows = element.get("rows", [])
break
if not headers:
return ""
# Create table representation
table_lines = []
# Add headers
header_line = " | ".join(str(h) for h in headers)
table_lines.append(header_line)
# Add separator
separator = "-" * len(header_line)
table_lines.append(separator)
# Add data rows (limit based on content density)
max_rows = 5 # Default limit
for row in rows[:max_rows]:
row_line = " | ".join(str(cell) for cell in row)
table_lines.append(row_line)
if len(rows) > max_rows:
table_lines.append(f"... and {len(rows) - max_rows} more rows")
return "\n".join(table_lines)
except Exception as e:
logger.warning(f"Error formatting table for slide: {str(e)}")
return ""
def _format_list_for_slide(self, list_data: Dict[str, Any]) -> str:
"""Format list data for slide presentation."""
try:
items = list_data.get("items", [])
if not items:
return ""
# Create list representation
list_lines = []
for item in items:
if isinstance(item, dict):
text = item.get("text", "")
list_lines.append(f"{text}")
# Add subitems (limit to 3 for readability)
subitems = item.get("subitems", [])[:3]
for subitem in subitems:
if isinstance(subitem, dict):
list_lines.append(f" - {subitem.get('text', '')}")
else:
list_lines.append(f" - {subitem}")
else:
list_lines.append(f"{str(item)}")
return "\n".join(list_lines)
except Exception as e:
logger.warning(f"Error formatting list for slide: {str(e)}")
return ""
def _format_heading_for_slide(self, heading_data: Dict[str, Any]) -> str:
"""Format heading data for slide presentation."""
try:
text = heading_data.get("text", "")
level = heading_data.get("level", 1)
if text:
return f"{'#' * level} {text}"
return ""
except Exception as e:
logger.warning(f"Error formatting heading for slide: {str(e)}")
return ""
def _format_paragraph_for_slide(self, paragraph_data: Dict[str, Any]) -> str:
"""Format paragraph data for slide presentation."""
try:
text = paragraph_data.get("text", "")
if text:
# Limit paragraph length based on content density
max_length = 200 # Default limit
if len(text) > max_length:
text = text[:max_length] + "..."
return text
return ""
except Exception as e:
logger.warning(f"Error formatting paragraph for slide: {str(e)}")
return ""
def _format_code_for_slide(self, code_data: Dict[str, Any]) -> str:
"""Format code data for slide presentation."""
try:
code = code_data.get("code", "")
language = code_data.get("language", "")
if code:
# Limit code length based on content density
max_length = 100 # Default limit
if len(code) > max_length:
code = code[:max_length] + "..."
if language:
return f"Code ({language}):\n{code}"
else:
return f"Code:\n{code}"
return ""
except Exception as e:
logger.warning(f"Error formatting code for slide: {str(e)}")
return ""
def _get_slide_layout_index(self, slide_data: Dict[str, Any], styles: Dict[str, Any]) -> int:
    """Determine the best professional slide layout based on content.

    Returns 0 (title layout) for the generated title slide, 2 (content with
    caption) for mixed table+list content, and 1 (title and content) for
    everything else.
    """
    try:
        content = slide_data.get("content", "")
        title = slide_data.get("title", "")
        # Check if it's a title slide (first slide)
        if not content or "Generated by PowerOn AI System" in content:
            return 0  # Title slide layout
        # Professional layout selection based on content
        if "|" in content and "-" in content:
            # Has both tables and lists - use content with caption for professional look
            return 2
        elif "|" in content:
            # Has tables - use content layout for clean table presentation
            return 1
        elif content.count("") > 2:
            # Has many bullet points - use content layout for better readability
            # NOTE(review): str.count("") equals len(content) + 1, so this
            # branch fires for any content of length >= 2 — the argument was
            # presumably a bullet glyph lost in transit. Harmless in effect,
            # since this and every following branch return 1, but confirm the
            # intended character before relying on the distinctions below.
            return 1
        elif len(content) > 200:
            # Long content - use content layout for better text flow
            return 1
        elif title and len(title) > 20:
            # Long title - use title and content layout
            return 1
        else:
            # Default to title and content layout for professional appearance
            return 1
    except Exception as e:
        logger.warning(f"Error determining slide layout: {str(e)}")
        return 1  # Default to title and content layout
def _create_slides_from_sections(self, sections: List[Dict[str, Any]], styles: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Create slides from sections based on content density and user intent."""
try:
slides = []
content_per_slide = styles.get("content_per_slide", "concise")
# Group sections by type and create slides
current_slide_content = []
current_slide_title = "Content Overview"
for section in sections:
section_type = section.get("content_type", "paragraph")
elements = section.get("elements", [])
if section_type == "heading":
# If we have accumulated content, create a slide
if current_slide_content:
slides.append({
"title": current_slide_title,
"content": "\n\n".join(current_slide_content)
})
current_slide_content = []
# Start new slide with heading as title
for element in elements:
if isinstance(element, dict) and "text" in element:
current_slide_title = element.get("text", "Untitled Section")
break
else:
# Add content to current slide
formatted_content = self._format_section_content(section)
if formatted_content:
current_slide_content.append(formatted_content)
# Add final slide if there's content
if current_slide_content:
slides.append({
"title": current_slide_title,
"content": "\n\n".join(current_slide_content)
})
return slides
except Exception as e:
logger.warning(f"Error creating slides from sections: {str(e)}")
return []
def _format_section_content(self, section: Dict[str, Any]) -> str:
"""Format section content for slide presentation."""
try:
content_type = section.get("content_type", "paragraph")
elements = section.get("elements", [])
# Process each element in the section
content_parts = []
for element in elements:
if content_type == "table":
content_parts.append(self._format_table_for_slide([element]))
elif content_type == "list":
content_parts.append(self._format_list_for_slide([element]))
elif content_type == "heading":
content_parts.append(self._format_heading_for_slide([element]))
elif content_type == "paragraph":
content_parts.append(self._format_paragraph_for_slide([element]))
elif content_type == "code":
content_parts.append(self._format_code_for_slide([element]))
else:
content_parts.append(self._format_paragraph_for_slide([element]))
return "\n\n".join(filter(None, content_parts))
except Exception as e:
logger.warning(f"Error formatting section content: {str(e)}")
return ""
def _format_timestamp(self) -> str:
"""Format current timestamp for presentation generation."""
from datetime import datetime, UTC
return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")

View file

@ -0,0 +1,256 @@
"""
Text renderer for report generation.
"""
from .rendererBaseTemplate import BaseRenderer
from typing import Dict, Any, Tuple, List
class RendererText(BaseRenderer):
    """Renders content to plain text format with format-specific extraction.

    Consumes the structured JSON document produced upstream (metadata +
    "sections") and emits a readable plain-text report: underlined title,
    one rendered chunk per section, and a generation timestamp footer.
    Section type/data helpers (_get_section_type, _get_section_data,
    _process_section_by_type, _get_section_id) are inherited from
    BaseRenderer — presumably; confirm against rendererBaseTemplate.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported text formats (excluding formats with dedicated renderers)."""
        return [
            'txt', 'text', 'plain',
            # Programming languages
            'js', 'javascript', 'ts', 'typescript', 'jsx', 'tsx',
            'py', 'python', 'java', 'cpp', 'c', 'h', 'hpp',
            'cs', 'csharp', 'php', 'rb', 'ruby', 'go', 'rs', 'rust',
            'swift', 'kt', 'kotlin', 'scala', 'r', 'm', 'objc',
            'sh', 'bash', 'zsh', 'fish', 'ps1', 'bat', 'cmd',
            # Web technologies (excluding html/htm which have dedicated renderer)
            'css', 'scss', 'sass', 'less', 'xml', 'yaml', 'yml', 'toml', 'ini', 'cfg',
            # Data formats (excluding csv, md/markdown which have dedicated renderers)
            'tsv', 'log', 'rst', 'sql', 'dockerfile', 'dockerignore', 'gitignore',
            # Configuration files
            'env', 'properties', 'conf', 'config', 'rc',
            'gitattributes', 'editorconfig', 'eslintrc',
            # Documentation
            'readme', 'changelog', 'license', 'authors',
            'contributing', 'todo', 'notes', 'docs'
        ]

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases that also resolve to this renderer."""
        return [
            'ascii', 'utf8', 'utf-8', 'code', 'source',
            'script', 'program', 'file', 'document',
            'raw', 'unformatted', 'plaintext'
        ]

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for text renderer (used by renderer selection)."""
        return 90

    async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
        """Render extracted JSON content to plain text format.

        Args:
            extracted_content: Structured JSON document with a "sections" list.
            title: Report title (may be overridden by metadata.title).
            user_prompt: Unused here; kept for renderer interface parity.
            ai_service: Unused here; kept for renderer interface parity.

        Returns:
            Tuple of (text content, "text/plain"); on failure a minimal
            error report is returned instead of raising.
        """
        try:
            # Generate text from JSON structure
            text_content = self._generate_text_from_json(extracted_content, title)
            return text_content, "text/plain"
        except Exception as e:
            self.logger.error(f"Error rendering text: {str(e)}")
            # Return minimal text fallback
            return f"{title}\n\nError rendering report: {str(e)}", "text/plain"

    def _generate_text_from_json(self, json_content: Dict[str, Any], title: str) -> str:
        """Generate text content from structured JSON document.

        Raises:
            Exception: When the JSON is not a dict, lacks "sections", or any
                other generation failure occurs (wrapped with context).
        """
        try:
            # Validate JSON structure
            if not isinstance(json_content, dict):
                raise ValueError("JSON content must be a dictionary")
            if "sections" not in json_content:
                raise ValueError("JSON content must contain 'sections' field")
            # Use title from JSON metadata if available, otherwise use provided title
            document_title = json_content.get("metadata", {}).get("title", title)
            # Build text content
            text_parts = []
            # Document title, underlined to full width
            text_parts.append(document_title)
            text_parts.append("=" * len(document_title))
            text_parts.append("")
            # Process each section
            sections = json_content.get("sections", [])
            for section in sections:
                section_text = self._render_json_section(section)
                if section_text:
                    text_parts.append(section_text)
                    text_parts.append("")  # Add spacing between sections
            # Add generation info
            text_parts.append("")
            text_parts.append(f"Generated: {self._format_timestamp()}")
            return '\n'.join(text_parts)
        except Exception as e:
            self.logger.error(f"Error generating text from JSON: {str(e)}")
            raise Exception(f"Text generation failed: {str(e)}")

    def _render_json_section(self, section: Dict[str, Any]) -> str:
        """Render a single JSON section to text, dispatching on section type.

        Failures are contained: an "[Error rendering section: ...]" marker is
        returned instead of propagating the exception.
        """
        try:
            section_type = self._get_section_type(section)
            section_data = self._get_section_data(section)
            if section_type == "table":
                # Process the section data to extract table structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_table(processed_data)
            elif section_type == "bullet_list":
                # Process the section data to extract bullet list structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_bullet_list(processed_data)
            elif section_type == "heading":
                # Render each heading element in the elements array
                # section_data is already the elements array from _get_section_data
                rendered_elements = []
                for element in section_data:
                    rendered_elements.append(self._render_json_heading(element))
                return "\n".join(rendered_elements)
            elif section_type == "paragraph":
                # Render each paragraph element in the elements array
                # section_data is already the elements array from _get_section_data
                rendered_elements = []
                for element in section_data:
                    rendered_elements.append(self._render_json_paragraph(element))
                return "\n".join(rendered_elements)
            elif section_type == "code_block":
                # Process the section data to extract code block structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_code_block(processed_data)
            elif section_type == "image":
                # Process the section data to extract image structure
                processed_data = self._process_section_by_type(section)
                return self._render_json_image(processed_data)
            else:
                # Fallback to paragraph for unknown types - render each element
                # section_data is already the elements array from _get_section_data
                rendered_elements = []
                for element in section_data:
                    rendered_elements.append(self._render_json_paragraph(element))
                return "\n".join(rendered_elements)
        except Exception as e:
            self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
            return f"[Error rendering section: {str(e)}]"

    def _render_json_table(self, table_data: Dict[str, Any]) -> str:
        """Render a JSON table (headers + rows) as pipe-separated text."""
        try:
            headers = table_data.get("headers", [])
            rows = table_data.get("rows", [])
            if not headers or not rows:
                return ""
            text_parts = []
            # Create table header
            header_line = " | ".join(str(header) for header in headers)
            text_parts.append(header_line)
            # Add separator line (one dash-run per column, matching header width)
            separator_line = " | ".join("-" * len(str(header)) for header in headers)
            text_parts.append(separator_line)
            # Add data rows
            for row in rows:
                row_line = " | ".join(str(cell_data) for cell_data in row)
                text_parts.append(row_line)
            return '\n'.join(text_parts)
        except Exception as e:
            self.logger.warning(f"Error rendering table: {str(e)}")
            return ""

    def _render_json_bullet_list(self, list_data: Dict[str, Any]) -> str:
        """Render a JSON bullet list to dash-prefixed text lines."""
        try:
            items = list_data.get("items", [])
            if not items:
                return ""
            text_parts = []
            for item in items:
                # Items may be plain strings or {"text": ...} dicts.
                if isinstance(item, str):
                    text_parts.append(f"- {item}")
                elif isinstance(item, dict) and "text" in item:
                    text_parts.append(f"- {item['text']}")
            return '\n'.join(text_parts)
        except Exception as e:
            self.logger.warning(f"Error rendering bullet list: {str(e)}")
            return ""

    def _render_json_heading(self, heading_data: Dict[str, Any]) -> str:
        """Render a JSON heading: '=' / '-' underlines for levels 1-2, '#' prefixes otherwise."""
        try:
            level = heading_data.get("level", 1)
            text = heading_data.get("text", "")
            if text:
                # Clamp the level into the 1..6 range before formatting.
                level = max(1, min(6, level))
                if level == 1:
                    return f"{text}\n{'=' * len(text)}"
                elif level == 2:
                    return f"{text}\n{'-' * len(text)}"
                else:
                    return f"{'#' * level} {text}"
            return ""
        except Exception as e:
            self.logger.warning(f"Error rendering heading: {str(e)}")
            return ""

    def _render_json_paragraph(self, paragraph_data: Dict[str, Any]) -> str:
        """Render a JSON paragraph: the raw text, or "" when absent."""
        try:
            text = paragraph_data.get("text", "")
            return text if text else ""
        except Exception as e:
            self.logger.warning(f"Error rendering paragraph: {str(e)}")
            return ""

    def _render_json_code_block(self, code_data: Dict[str, Any]) -> str:
        """Render a JSON code block, prefixed with its language when known."""
        try:
            code = code_data.get("code", "")
            language = code_data.get("language", "")
            if code:
                if language:
                    return f"Code ({language}):\n{code}"
                else:
                    return code
            return ""
        except Exception as e:
            self.logger.warning(f"Error rendering code block: {str(e)}")
            return ""

    def _render_json_image(self, image_data: Dict[str, Any]) -> str:
        """Render a JSON image as a text placeholder using its alt text."""
        try:
            alt_text = image_data.get("altText", "Image")
            return f"[Image: {alt_text}]"
        except Exception as e:
            self.logger.warning(f"Error rendering image: {str(e)}")
            return f"[Image: {image_data.get('altText', 'Image')}]"

View file

@ -0,0 +1,791 @@
"""
Excel renderer for report generation using openpyxl.
"""
from .rendererBaseTemplate import BaseRenderer
from typing import Dict, Any, Tuple, List
import io
import base64
from datetime import datetime, UTC
# Optional-dependency guard: openpyxl powers real .xlsx output. When it is
# missing, OPENPYXL_AVAILABLE is False and the renderer degrades to CSV.
try:
    from openpyxl import Workbook
    from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
    from openpyxl.utils import get_column_letter
    from openpyxl.worksheet.table import Table, TableStyleInfo
    OPENPYXL_AVAILABLE = True
except ImportError:
    OPENPYXL_AVAILABLE = False
class RendererXlsx(BaseRenderer):
"""Renders content to Excel format using openpyxl."""
@classmethod
def get_supported_formats(cls) -> List[str]:
    """Excel file formats handled by this renderer."""
    supported = ['xlsx', 'xls', 'excel']
    return supported
@classmethod
def get_format_aliases(cls) -> List[str]:
    """Alias names that also resolve to the Excel renderer."""
    aliases = ['spreadsheet', 'workbook']
    return aliases
@classmethod
def get_priority(cls) -> int:
    """Selection priority of the Excel renderer."""
    priority = 110
    return priority
async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
    """Render extracted JSON content to Excel format using AI-analyzed styling.

    Returns:
        (base64-encoded .xlsx bytes, xlsx MIME type) on success. When
        openpyxl is unavailable, delegates to the CSV renderer and returns
        (csv text, "text/csv"); on error returns a one-row CSV fallback.
    """
    try:
        if not OPENPYXL_AVAILABLE:
            # Fallback to CSV if openpyxl not available
            from .rendererCsv import RendererCsv
            csv_renderer = RendererCsv()
            csv_content, _ = await csv_renderer.render(extracted_content, title, user_prompt, ai_service)
            return csv_content, "text/csv"
        # Generate Excel using AI-analyzed styling
        excel_content = await self._generate_excel_from_json(extracted_content, title, user_prompt, ai_service)
        return excel_content, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    except Exception as e:
        self.logger.error(f"Error rendering Excel: {str(e)}")
        # Return CSV fallback
        return f"Title,Content\n{title},Error rendering Excel report: {str(e)}", "text/csv"
def _generate_excel(self, content: str, title: str) -> str:
    """Generate Excel content using openpyxl.

    Builds a three-sheet workbook (Summary, Data, Analysis) from plain text
    *content* and returns the serialized .xlsx as a base64 string.
    Re-raises any failure after logging.
    """
    try:
        # Create workbook
        wb = Workbook()
        # Remove default sheet so our named sheets start at index 0
        wb.remove(wb.active)
        # Create sheets
        summary_sheet = wb.create_sheet("Summary", 0)
        data_sheet = wb.create_sheet("Data", 1)
        analysis_sheet = wb.create_sheet("Analysis", 2)
        # Add content to sheets
        self._populate_summary_sheet(summary_sheet, title)
        self._populate_data_sheet(data_sheet, content)
        self._populate_analysis_sheet(analysis_sheet, content)
        # Save to buffer
        buffer = io.BytesIO()
        wb.save(buffer)
        buffer.seek(0)
        # Convert to base64 for transport
        excel_bytes = buffer.getvalue()
        excel_base64 = base64.b64encode(excel_bytes).decode('utf-8')
        return excel_base64
    except Exception as e:
        self.logger.error(f"Error generating Excel: {str(e)}")
        raise
def _populate_summary_sheet(self, sheet, title: str):
    """Populate the summary sheet with title, timestamps and a live count formula.

    Failures are logged and swallowed so a partial summary never aborts the
    whole workbook generation.
    """
    try:
        # Title
        sheet['A1'] = title
        sheet['A1'].font = Font(size=16, bold=True)
        sheet['A1'].alignment = Alignment(horizontal='center')
        # Generation info
        sheet['A3'] = "Generated:"
        sheet['B3'] = self._format_timestamp()
        sheet['A4'] = "Status:"
        sheet['B4'] = "Generated Successfully"
        # Key metrics placeholder
        sheet['A6'] = "Key Metrics:"
        sheet['A6'].font = Font(bold=True)
        sheet['A7'] = "Total Items:"
        # Excel formula evaluated at open time, not at generation time
        sheet['B7'] = "=COUNTA(Data!A:A)-1"  # Count non-empty cells in Data sheet
        # Auto-adjust column widths
        sheet.column_dimensions['A'].width = 20
        sheet.column_dimensions['B'].width = 30
    except Exception as e:
        self.logger.warning(f"Could not populate summary sheet: {str(e)}")
def _populate_data_sheet(self, sheet, content: str):
    """Populate the data sheet from plain-text *content*.

    Pipe-delimited lines are split across up to five columns; all other
    non-empty lines land in column A. Failures are logged and swallowed.
    """
    try:
        # Headers
        headers = ["Item/Category", "Value/Amount", "Percentage", "Source Document", "Notes/Comments"]
        for col, header in enumerate(headers, 1):
            cell = sheet.cell(row=1, column=col, value=header)
            cell.font = Font(bold=True)
            cell.fill = PatternFill(start_color="CCCCCC", end_color="CCCCCC", fill_type="solid")
        # Process content line by line
        lines = content.split('\n')
        row = 2
        for line in lines:
            line = line.strip()
            if not line:
                continue
            # Check for table data (lines with |)
            if '|' in line:
                cells = [cell.strip() for cell in line.split('|') if cell.strip()]
                for col, cell_data in enumerate(cells[:5], 1):  # Limit to 5 columns
                    sheet.cell(row=row, column=col, value=cell_data)
                row += 1
            else:
                # Regular content goes into the first column
                sheet.cell(row=row, column=1, value=line)
                row += 1
        # Auto-adjust column widths
        for col in range(1, 6):
            sheet.column_dimensions[get_column_letter(col)].width = 20
    except Exception as e:
        self.logger.warning(f"Could not populate data sheet: {str(e)}")
def _populate_analysis_sheet(self, sheet, content: str):
    """Populate the analysis sheet with simple line-type counts and static tips.

    Failures are logged and swallowed so the workbook still saves.
    """
    try:
        # Title
        sheet['A1'] = "Analysis & Insights"
        sheet['A1'].font = Font(size=14, bold=True)
        # Content analysis
        lines = content.split('\n')
        row = 3
        sheet['A3'] = "Content Analysis:"
        sheet['A3'].font = Font(bold=True)
        row += 1
        # Count different types of content: '|' marks table rows, leading
        # "- " / "* " marks list items, the remainder counts as plain text.
        table_lines = sum(1 for line in lines if '|' in line)
        list_lines = sum(1 for line in lines if line.startswith(('- ', '* ')))
        text_lines = len(lines) - table_lines - list_lines
        sheet[f'A{row}'] = f"Total Lines: {len(lines)}"
        row += 1
        sheet[f'A{row}'] = f"Table Rows: {table_lines}"
        row += 1
        sheet[f'A{row}'] = f"List Items: {list_lines}"
        row += 1
        sheet[f'A{row}'] = f"Text Lines: {text_lines}"
        row += 2
        # Recommendations (static boilerplate)
        sheet[f'A{row}'] = "Recommendations:"
        sheet[f'A{row}'].font = Font(bold=True)
        row += 1
        sheet[f'A{row}'] = "1. Review data accuracy"
        row += 1
        sheet[f'A{row}'] = "2. Consider additional analysis"
        row += 1
        sheet[f'A{row}'] = "3. Update regularly"
        # Auto-adjust column width
        sheet.column_dimensions['A'].width = 30
    except Exception as e:
        self.logger.warning(f"Could not populate analysis sheet: {str(e)}")
async def _generate_excel_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
    """Generate Excel content from structured JSON document using AI-generated styling.

    Validates the JSON envelope, fetches AI styles, builds and populates the
    workbook via sheet helpers, and returns the .xlsx as a base64 string.

    Raises:
        Exception: Wrapping any failure with an "Excel generation failed"
            message.
    """
    try:
        # Debug output (file-based debug log via the shared services object)
        self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT TYPE: {type(json_content)}", "EXCEL_RENDERER")
        self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT KEYS: {list(json_content.keys()) if isinstance(json_content, dict) else 'Not a dict'}", "EXCEL_RENDERER")
        # Get AI-generated styling definitions
        styles = await self._get_excel_styles(user_prompt, ai_service)
        # Validate JSON structure
        if not isinstance(json_content, dict):
            raise ValueError("JSON content must be a dictionary")
        if "sections" not in json_content:
            raise ValueError("JSON content must contain 'sections' field")
        # Use title from JSON metadata if available, otherwise use provided title
        # NOTE(review): document_title is computed but not passed on below —
        # confirm whether _create_excel_sheets is expected to consume it.
        document_title = json_content.get("metadata", {}).get("title", title)
        # Create workbook
        wb = Workbook()
        # Create sheets based on content
        sheets = self._create_excel_sheets(wb, json_content, styles)
        self.services.utils.debugLogToFile(f"EXCEL SHEETS CREATED: {list(sheets.keys()) if sheets else 'None'}", "EXCEL_RENDERER")
        # Populate sheets with content
        self._populate_excel_sheets(sheets, json_content, styles)
        # Save to buffer
        buffer = io.BytesIO()
        wb.save(buffer)
        buffer.seek(0)
        # Convert to base64
        excel_bytes = buffer.getvalue()
        self.services.utils.debugLogToFile(f"EXCEL BYTES LENGTH: {len(excel_bytes)}", "EXCEL_RENDERER")
        try:
            excel_base64 = base64.b64encode(excel_bytes).decode('utf-8')
            self.services.utils.debugLogToFile(f"EXCEL BASE64 LENGTH: {len(excel_base64)}", "EXCEL_RENDERER")
        except Exception as b64_error:
            self.services.utils.debugLogToFile(f"BASE64 ENCODING ERROR: {b64_error}", "EXCEL_RENDERER")
            raise
        return excel_base64
    except Exception as e:
        self.logger.error(f"Error generating Excel from JSON: {str(e)}")
        raise Exception(f"Excel generation failed: {str(e)}")
async def _get_excel_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]:
    """Get Excel styling definitions using base template AI styling.

    Builds the xlsx style-schema prompt from *user_prompt*, asks the AI to
    fill it (falling back to defaults on any failure), then repairs
    unreadable foreground/background combinations.
    """
    # Style roles the AI is asked to fill in; colors here are aRGB "#AARRGGBB".
    style_schema = {
        "title": {"font_size": 16, "color": "#FF1F4E79", "bold": True, "align": "center"},
        "heading": {"font_size": 14, "color": "#FF2F2F2F", "bold": True, "align": "left"},
        "table_header": {"background": "#FF4F4F4F", "text_color": "#FFFFFFFF", "bold": True, "align": "center"},
        "table_cell": {"background": "#FFFFFFFF", "text_color": "#FF2F2F2F", "bold": False, "align": "left"},
        "bullet_list": {"font_size": 11, "color": "#FF2F2F2F", "indent": 2},
        "paragraph": {"font_size": 11, "color": "#FF2F2F2F", "bold": False, "align": "left"},
        "code_block": {"font": "Courier New", "font_size": 10, "color": "#FF2F2F2F", "background": "#FFF5F5F5"}
    }
    style_template = self._create_ai_style_template("xlsx", user_prompt, style_schema)
    # Use our own _get_ai_styles_with_excel_colors method to ensure proper color conversion
    styles = await self._get_ai_styles_with_excel_colors(ai_service, style_template, self._get_default_excel_styles())
    # Validate and fix contrast issues
    return self._validate_excel_styles_contrast(styles)
async def _get_ai_styles_with_excel_colors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
    """Get AI styles with proper Excel color conversion.

    Sends *style_template* to the AI, parses the JSON reply (unwrapping an
    optional markdown code fence first), and converts hex colors to Excel
    aRGB. Any failure — missing service, empty reply, bad JSON, or an
    exception — falls back to *default_styles*.
    """
    if not ai_service:
        return default_styles
    try:
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL
        request = AiCallRequest(prompt=style_template, context="", options=request_options)
        response = await ai_service.aiObjects.call(request)
        import json
        import re
        # Clean and parse JSON
        result = response.content.strip() if response and response.content else ""
        # Check if result is empty
        if not result:
            self.logger.warning("AI styling returned empty response, using defaults")
            return default_styles
        # Extract JSON from a well-formed ```json fence if present
        json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
        if json_match:
            result = json_match.group(1).strip()
            self.services.utils.debugLogToFile(f"EXTRACTED JSON FROM MARKDOWN: {result[:100]}...", "EXCEL_RENDERER")
        elif result.startswith('```json'):
            # Fence without a matching regex hit -- strip the markers manually
            result = re.sub(r'^```json\s*', '', result)
            result = re.sub(r'\s*```$', '', result)
            self.services.utils.debugLogToFile(f"CLEANED JSON FROM MARKDOWN: {result[:100]}...", "EXCEL_RENDERER")
        elif result.startswith('```'):
            result = re.sub(r'^```\s*', '', result)
            result = re.sub(r'\s*```$', '', result)
            self.services.utils.debugLogToFile(f"CLEANED JSON FROM GENERIC MARKDOWN: {result[:100]}...", "EXCEL_RENDERER")
        # Try to parse JSON
        try:
            styles = json.loads(result)
        except json.JSONDecodeError as json_error:
            self.logger.warning(f"AI styling returned invalid JSON: {json_error}, using defaults")
            return default_styles
        # Convert colors to Excel aRGB format
        styles = self._convert_colors_format(styles)
        return styles
    except Exception as e:
        self.logger.warning(f"AI styling failed: {str(e)}, using defaults")
        return default_styles
def _get_safe_color(self, color_value: str, default: str = "FF000000") -> str:
"""Get a safe aRGB color value for Excel (without # prefix)."""
if not isinstance(color_value, str):
return default
# Remove # prefix if present
if color_value.startswith('#'):
color_value = color_value[1:]
if len(color_value) == 6:
# Convert RRGGBB to AARRGGBB
return f"FF{color_value}"
elif len(color_value) == 8:
# Already aRGB format
return color_value
else:
# Unexpected format, return default
return default
def _convert_colors_format(self, styles: Dict[str, Any]) -> Dict[str, Any]:
"""Convert hex colors to aRGB format for Excel compatibility."""
try:
self.services.utils.debugLogToFile(f"CONVERTING COLORS IN STYLES: {styles}", "EXCEL_RENDERER")
for style_name, style_config in styles.items():
if isinstance(style_config, dict):
for prop, value in style_config.items():
if isinstance(value, str) and value.startswith('#') and len(value) == 7:
# Convert #RRGGBB to #AARRGGBB (add FF alpha channel)
styles[style_name][prop] = f"FF{value[1:]}"
elif isinstance(value, str) and value.startswith('#') and len(value) == 9:
pass # Already aRGB format
elif isinstance(value, str) and value.startswith('#'):
pass # Unexpected format, keep as is
return styles
except Exception as e:
return styles
def _validate_excel_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
"""Validate and fix contrast issues in AI-generated styles."""
try:
# Fix table header contrast
if "table_header" in styles:
header = styles["table_header"]
bg_color = header.get("background", "#FFFFFF")
text_color = header.get("text_color", "#000000")
# If both are white or both are dark, fix it
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
header["background"] = "#4F4F4F"
header["text_color"] = "#FFFFFF"
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
header["background"] = "#4F4F4F"
header["text_color"] = "#FFFFFF"
# Fix table cell contrast
if "table_cell" in styles:
cell = styles["table_cell"]
bg_color = cell.get("background", "#FFFFFF")
text_color = cell.get("text_color", "#000000")
# If both are white or both are dark, fix it
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
cell["background"] = "#FFFFFF"
cell["text_color"] = "#2F2F2F"
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
cell["background"] = "#FFFFFF"
cell["text_color"] = "#2F2F2F"
return styles
except Exception as e:
self.logger.warning(f"Style validation failed: {str(e)}")
return self._get_default_excel_styles()
def _get_default_excel_styles(self) -> Dict[str, Any]:
"""Default Excel styles with aRGB color format."""
return {
"title": {"font_size": 16, "color": "#FF1F4E79", "bold": True, "align": "center"},
"heading": {"font_size": 14, "color": "#FF2F2F2F", "bold": True, "align": "left"},
"table_header": {"background": "#FF4F4F4F", "text_color": "#FFFFFFFF", "bold": True, "align": "center"},
"table_cell": {"background": "#FFFFFFFF", "text_color": "#FF2F2F2F", "bold": False, "align": "left"},
"bullet_list": {"font_size": 11, "color": "#FF2F2F2F", "indent": 2},
"paragraph": {"font_size": 11, "color": "#FF2F2F2F", "bold": False, "align": "left"},
"code_block": {"font": "Courier New", "font_size": 10, "color": "#FF2F2F2F", "background": "#FFF5F5F5"}
}
def _create_excel_sheets(self, wb: Workbook, json_content: Dict[str, Any], styles: Dict[str, Any]) -> Dict[str, Any]:
    """Create Excel sheets based on content structure and user intent.

    Sheet names come from the AI styles ("sheet_names") when provided,
    otherwise they are derived from the document content. Returns a mapping
    of lowercased sheet name -> worksheet object.
    """
    sheets = {}
    # Get sheet names from AI styles or generate based on content
    sheet_names = styles.get("sheet_names", self._generate_sheet_names_from_content(json_content))
    self.services.utils.debugLogToFile(f"EXCEL SHEET NAMES: {sheet_names}", "EXCEL_RENDERER")
    # Create sheets
    for i, sheet_name in enumerate(sheet_names):
        if i == 0:
            # Use the default sheet for the first sheet (a new Workbook always has one)
            sheet = wb.active
            sheet.title = sheet_name
        else:
            # Create additional sheets
            sheet = wb.create_sheet(sheet_name, i)
        sheets[sheet_name.lower()] = sheet
    return sheets
def _generate_sheet_names_from_content(self, json_content: Dict[str, Any]) -> List[str]:
"""Generate sheet names based on actual content structure."""
sections = json_content.get("sections", [])
# If no sections, create a single sheet
if not sections:
return ["Content"]
# Generate sheet names based on content structure
sheet_names = []
# Check if we have multiple table sections
table_sections = [s for s in sections if s.get("content_type") == "table"]
if len(table_sections) > 1:
# Create separate sheets for each table
for i, section in enumerate(table_sections, 1):
section_title = section.get("title", f"Table {i}")
sheet_names.append(section_title[:31]) # Excel sheet name limit
else:
# Single table or mixed content - create main sheet
document_title = json_content.get("metadata", {}).get("title", "Document")
sheet_names.append(document_title[:31]) # Excel sheet name limit
# Add additional sheets for other content types
content_types = set()
for section in sections:
content_type = section.get("content_type", "paragraph")
content_types.add(content_type)
if "table" in content_types and len(table_sections) == 1:
sheet_names.append("Table Data")
if "list" in content_types:
sheet_names.append("Lists")
if "paragraph" in content_types or "heading" in content_types:
sheet_names.append("Text")
# Limit to 4 sheets maximum
return sheet_names[:4]
def _populate_excel_sheets(self, sheets: Dict[str, Any], json_content: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Populate Excel sheets with content from JSON based on actual sheet names.

    With multiple table sections, each table goes to its own sheet (in the
    dict-insertion order of *sheets*); otherwise the first sheet receives
    everything and any extra sheets are filled by content type. Errors are
    logged, never raised.
    """
    try:
        # Get the actual sheet names that were created
        sheet_names = list(sheets.keys())
        if not sheet_names:
            return
        sections = json_content.get("sections", [])
        table_sections = [s for s in sections if s.get("content_type") == "table"]
        if len(table_sections) > 1:
            # Multiple tables - populate each sheet with its corresponding table
            for i, section in enumerate(table_sections):
                if i < len(sheet_names):
                    sheet_name = sheet_names[i]
                    sheet = sheets[sheet_name]
                    self._populate_table_sheet(sheet, section, styles, f"Table {i+1}")
        else:
            # Single table or mixed content - use original logic
            first_sheet_name = sheet_names[0]
            self._populate_main_sheet(sheets[first_sheet_name], json_content, styles)
            # If we have multiple sheets, distribute content by type
            if len(sheet_names) > 1:
                self._populate_content_type_sheets(sheets, json_content, styles, sheet_names[1:])
    except Exception as e:
        self.logger.warning(f"Could not populate Excel sheets: {str(e)}")
def _populate_table_sheet(self, sheet, section: Dict[str, Any], styles: Dict[str, Any], sheet_title: str):
    """Populate a sheet with a single table section.

    Layout: title in A1, headers on row 3, data from row 4. Only the first
    element of the section is rendered. Errors are logged, never raised.
    """
    try:
        # Sheet title
        sheet['A1'] = sheet_title
        sheet['A1'].font = Font(size=16, bold=True, color=self._get_safe_color(styles.get("title", {}).get("color", "FF1F4E79")))
        sheet['A1'].alignment = Alignment(horizontal="center")
        # Get table data from elements (canonical JSON format)
        elements = section.get("elements", [])
        if elements and isinstance(elements, list) and len(elements) > 0:
            # Only the first table element of the section is used here.
            table_data = elements[0]
            headers = table_data.get("headers", [])
            rows = table_data.get("rows", [])
        else:
            headers = []
            rows = []
        if not headers and not rows:
            sheet['A3'] = "No table data available"
            return
        # Add headers
        header_style = styles.get("table_header", {})
        for col, header in enumerate(headers, 1):
            cell = sheet.cell(row=3, column=col, value=header)
            if header_style.get("bold"):
                cell.font = Font(bold=True, color=self._get_safe_color(header_style.get("text_color", "FF000000")))
            if header_style.get("background"):
                cell.fill = PatternFill(start_color=self._get_safe_color(header_style["background"]), end_color=self._get_safe_color(header_style["background"]), fill_type="solid")
        # Add rows
        cell_style = styles.get("table_cell", {})
        for row_idx, row_data in enumerate(rows, 4):
            for col_idx, cell_value in enumerate(row_data, 1):
                cell = sheet.cell(row=row_idx, column=col_idx, value=cell_value)
                if cell_style.get("text_color"):
                    cell.font = Font(color=self._get_safe_color(cell_style["text_color"]))
        # Fixed width per header column (not content-measured, despite the name)
        for col in range(1, len(headers) + 1):
            sheet.column_dimensions[get_column_letter(col)].width = 20
    except Exception as e:
        self.logger.warning(f"Could not populate table sheet: {str(e)}")
def _populate_main_sheet(self, sheet, json_content: Dict[str, Any], styles: Dict[str, Any]):
    """Populate the main sheet with document overview and all content.

    Layout: title (A1), generation info (rows 3-4), optional metadata block
    (from row 6), a content-overview summary, then every section rendered in
    order. Failures are logged and leave the sheet partially populated.
    """
    try:
        # Document title
        document_title = json_content.get("metadata", {}).get("title", "Generated Report")
        sheet['A1'] = document_title
        # Safety check for title style
        title_style = styles.get("title", {"font_size": 16, "bold": True, "color": "#FF1F4E79", "align": "center"})
        try:
            safe_color = self._get_safe_color(title_style["color"])
            sheet['A1'].font = Font(size=title_style["font_size"], bold=title_style["bold"], color=safe_color)
            sheet['A1'].alignment = Alignment(horizontal=title_style["align"])
        except Exception:
            # Fall back to plain black if the AI-provided color is unusable.
            sheet['A1'].font = Font(size=title_style["font_size"], bold=title_style["bold"], color="FF000000")
            sheet['A1'].alignment = Alignment(horizontal=title_style["align"])
        # Generation info
        sheet['A3'] = "Generated:"
        sheet['B3'] = self._format_timestamp()
        sheet['A4'] = "Status:"
        sheet['B4'] = "Generated Successfully"
        # Document metadata.
        # Fix: initialize `row` before the conditional -- previously it was
        # assigned only inside `if metadata:`, so an empty metadata dict
        # raised UnboundLocalError at the "Content Overview" block below
        # (silently swallowed by the outer except, leaving the sheet
        # half-populated).
        row = 5
        metadata = json_content.get("metadata", {})
        if metadata:
            sheet['A6'] = "Document Information:"
            sheet['A6'].font = Font(bold=True)
            row = 7
            for key, value in metadata.items():
                if key != "title":
                    sheet[f'A{row}'] = f"{key.title()}:"
                    sheet[f'B{row}'] = str(value)
                    row += 1
        # Content overview
        sections = json_content.get("sections", [])
        sheet[f'A{row + 1}'] = "Content Overview:"
        sheet[f'A{row + 1}'].font = Font(bold=True)
        row += 2
        sheet[f'A{row}'] = f"Total Sections: {len(sections)}"
        # Count different content types
        content_types = {}
        for section in sections:
            content_type = section.get("content_type", "unknown")
            content_types[content_type] = content_types.get(content_type, 0) + 1
        for content_type, count in content_types.items():
            row += 1
            sheet[f'A{row}'] = f"{content_type.title()} Sections: {count}"
        # Add all content to this sheet
        row += 2
        for section in sections:
            row = self._add_section_to_sheet(sheet, section, styles, row)
            row += 1  # Empty row between sections
        # Fixed column widths for the label/value columns
        sheet.column_dimensions['A'].width = 20
        sheet.column_dimensions['B'].width = 30
    except Exception as e:
        self.logger.warning(f"Could not populate main sheet: {str(e)}")
def _populate_content_type_sheets(self, sheets: Dict[str, Any], json_content: Dict[str, Any], styles: Dict[str, Any], sheet_names: List[str]):
    """Populate additional sheets based on content types.

    Each named sheet gets a heading in A1 and the sections whose
    content_type matches the sheet's purpose. Errors are logged, never raised.
    """
    try:
        sections = json_content.get("sections", [])
        for sheet_name in sheet_names:
            if sheet_name not in sheets:
                continue
            sheet = sheets[sheet_name]
            sheet_title = sheet_name.title()
            sheet['A1'] = sheet_title
            sheet['A1'].font = Font(size=16, bold=True)
            row = 3
            # Filter sections by content type.
            # NOTE(review): keys in `sheets` are the lowercased titles from
            # _generate_sheet_names_from_content ("table data", "lists",
            # "text"); the "tables" branch below never matches those keys, so
            # a "table data" sheet falls through to the `else` and receives
            # ALL sections. Confirm the intended key before changing.
            if sheet_name == "tables":
                filtered_sections = [s for s in sections if s.get("content_type") == "table"]
            elif sheet_name == "lists":
                filtered_sections = [s for s in sections if s.get("content_type") == "list"]
            elif sheet_name == "text":
                filtered_sections = [s for s in sections if s.get("content_type") in ["paragraph", "heading"]]
            else:
                filtered_sections = sections
            for section in filtered_sections:
                row = self._add_section_to_sheet(sheet, section, styles, row)
                row += 1  # Empty row between sections
            # Fixed width for the first five columns
            for col in range(1, 6):
                sheet.column_dimensions[get_column_letter(col)].width = 20
    except Exception as e:
        self.logger.warning(f"Could not populate content type sheets: {str(e)}")
def _add_section_to_sheet(self, sheet, section: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
    """Add a section to a sheet and return the next free row.

    Writes an optional "# title" line, then renders every element of the
    section according to the section-level content_type (unknown types fall
    back to paragraph rendering).
    """
    try:
        # Add section title
        section_title = section.get("title")
        if section_title:
            sheet[f'A{start_row}'] = f"# {section_title}"
            sheet[f'A{start_row}'].font = Font(bold=True)
            start_row += 1
        # Process section based on type
        section_type = section.get("content_type", "paragraph")
        # Handle all section types using elements array; each element is
        # dispatched by the SECTION's content_type, not a per-element type.
        elements = section.get("elements", [])
        for element in elements:
            if section_type == "table":
                start_row = self._add_table_to_excel(sheet, element, styles, start_row)
            elif section_type == "list":
                start_row = self._add_list_to_excel(sheet, element, styles, start_row)
            elif section_type == "paragraph":
                start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row)
            elif section_type == "heading":
                start_row = self._add_heading_to_excel(sheet, element, styles, start_row)
            else:
                # Unknown types fall back to plain paragraph rendering
                start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row)
        return start_row
    except Exception as e:
        self.logger.warning(f"Could not add section to sheet: {str(e)}")
        return start_row + 1
def _add_table_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
    """Add a table element to an Excel sheet and return the next free row.

    Headers go on *start_row*, data rows follow. An element with neither
    headers nor rows writes nothing.
    """
    try:
        # In canonical JSON format, table elements have headers and rows directly
        headers = element.get("headers", [])
        rows = element.get("rows", [])
        if not headers and not rows:
            return start_row
        # Add headers
        header_style = styles.get("table_header", {})
        for col, header in enumerate(headers, 1):
            cell = sheet.cell(row=start_row, column=col, value=header)
            if header_style.get("bold"):
                cell.font = Font(bold=True, color=self._get_safe_color(header_style.get("text_color", "FF000000")))
            if header_style.get("background"):
                cell.fill = PatternFill(start_color=self._get_safe_color(header_style["background"]), end_color=self._get_safe_color(header_style["background"]), fill_type="solid")
        start_row += 1
        # Add rows
        cell_style = styles.get("table_cell", {})
        for row_data in rows:
            for col, cell_value in enumerate(row_data, 1):
                cell = sheet.cell(row=start_row, column=col, value=cell_value)
                if cell_style.get("text_color"):
                    cell.font = Font(color=self._get_safe_color(cell_style["text_color"]))
            start_row += 1
        return start_row
    except Exception as e:
        self.logger.warning(f"Could not add table to Excel: {str(e)}")
        return start_row + 1
def _add_list_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
"""Add a list element to Excel sheet."""
try:
list_items = element.get("items", [])
list_style = styles.get("bullet_list", {})
for item in list_items:
sheet.cell(row=start_row, column=1, value=f"{item}")
if list_style.get("color"):
sheet.cell(row=start_row, column=1).font = Font(color=self._get_safe_color(list_style["color"]))
start_row += 1
return start_row
except Exception as e:
self.logger.warning(f"Could not add list to Excel: {str(e)}")
return start_row + 1
def _add_paragraph_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
"""Add a paragraph element to Excel sheet."""
try:
text = element.get("text", "")
if text:
sheet.cell(row=start_row, column=1, value=text)
paragraph_style = styles.get("paragraph", {})
if paragraph_style.get("color"):
sheet.cell(row=start_row, column=1).font = Font(color=self._get_safe_color(paragraph_style["color"]))
start_row += 1
return start_row
except Exception as e:
self.logger.warning(f"Could not add paragraph to Excel: {str(e)}")
return start_row + 1
def _add_heading_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
"""Add a heading element to Excel sheet."""
try:
text = element.get("text", "")
level = element.get("level", 1)
if text:
sheet.cell(row=start_row, column=1, value=text)
heading_style = styles.get("heading", {})
font_size = heading_style.get("font_size", 14)
if level > 1:
font_size = max(10, font_size - (level - 1) * 2)
sheet.cell(row=start_row, column=1).font = Font(
size=font_size,
bold=True,
color=self._get_safe_color(heading_style.get("color", "FF000000"))
)
start_row += 1
return start_row
except Exception as e:
self.logger.warning(f"Could not add heading to Excel: {str(e)}")
return start_row + 1
def _format_timestamp(self) -> str:
"""Format current timestamp for document generation."""
return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")

View file

@ -1,94 +0,0 @@
"""
Text renderer for report generation.
"""
from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
class TextRenderer(BaseRenderer):
    """Renders content to plain text format with format-specific extraction."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported text formats (excluding formats with dedicated renderers)."""
        plain = ['txt', 'text', 'plain']
        programming = [
            'js', 'javascript', 'ts', 'typescript', 'jsx', 'tsx',
            'py', 'python', 'java', 'cpp', 'c', 'h', 'hpp',
            'cs', 'csharp', 'php', 'rb', 'ruby', 'go', 'rs', 'rust',
            'swift', 'kt', 'kotlin', 'scala', 'r', 'm', 'objc',
            'sh', 'bash', 'zsh', 'fish', 'ps1', 'bat', 'cmd',
        ]
        # Web technologies (html/htm have a dedicated renderer and are excluded)
        web = ['css', 'scss', 'sass', 'less', 'xml', 'yaml', 'yml', 'toml', 'ini', 'cfg']
        # Data formats (csv and md/markdown have dedicated renderers)
        data = ['tsv', 'log', 'rst', 'sql', 'dockerfile', 'dockerignore', 'gitignore']
        config = [
            'env', 'properties', 'conf', 'config', 'rc',
            'gitattributes', 'editorconfig', 'eslintrc',
        ]
        documentation = [
            'readme', 'changelog', 'license', 'authors',
            'contributing', 'todo', 'notes', 'docs',
        ]
        return plain + programming + web + data + config + documentation

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return [
            'ascii', 'utf8', 'utf-8', 'code', 'source',
            'script', 'program', 'file', 'document',
            'raw', 'unformatted', 'plaintext',
        ]

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for text renderer."""
        return 90

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only plain-text guidelines; global prompt is built centrally."""
        guidelines = [
            "TEXT FORMAT GUIDELINES:",
            "- Output ONLY plain text (no markdown or HTML).",
            "- Use clear headings (you may underline with === or --- when helpful).",
            "- Use simple bullet lists with '-' and tables with '|' when needed.",
            "- Preserve indentation for code-like content if present.",
            "OUTPUT: Return ONLY the raw text content.",
        ]
        return "\n".join(guidelines)

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to plain text format."""
        try:
            # The AI already emits (mostly) plain text; only cleanup is needed.
            return self._clean_text_content(extracted_content, title), "text/plain"
        except Exception as e:
            self.logger.error(f"Error rendering text: {str(e)}")
            # Return minimal text fallback
            return f"{title}\n\nError rendering report: {str(e)}", "text/plain"

    def _clean_text_content(self, content: str, title: str) -> str:
        """Clean and validate text content from AI."""
        import re
        content = content.strip()
        # Unwrap a single surrounding markdown code fence, if any.
        if content.startswith("```") and content.endswith("```"):
            fence_lines = content.split('\n')
            if len(fence_lines) > 2:
                content = '\n'.join(fence_lines[1:-1]).strip()
        # Strip residual markdown emphasis markers (order matters: ** before *).
        for marker in ('**', '*', '__', '_'):
            content = content.replace(marker, '')
        # Drop any HTML-like tags that slipped through.
        content = re.sub(r'<[^>]+>', '', content)
        # Normalize line endings to \n.
        return content.replace('\r\n', '\n').replace('\r', '\n')

View file

@ -0,0 +1,517 @@
"""
JSON Schema definitions for AI-generated document structures.
This module provides schemas that guide AI to generate structured JSON output.
"""
from typing import Dict, Any
def get_multi_document_subJsonSchema() -> Dict[str, Any]:
    """Get the JSON schema for multi-document generation.

    Top level: "metadata" (title + split strategy) and "documents", an array
    where each entry is a full single document (id, title, filename,
    sections). Element shapes live under "definitions" and are referenced
    with "#/definitions/..." $refs.
    """
    return {
        "type": "object",
        "required": ["metadata", "documents"],
        "properties": {
            "metadata": {
                "type": "object",
                "required": ["title", "splitStrategy"],
                "properties": {
                    "title": {"type": "string", "description": "Document title"},
                    "splitStrategy": {
                        "type": "string",
                        "enum": ["per_entity", "by_section", "by_criteria", "by_data_type", "custom"],
                        "description": "Strategy for splitting content into multiple files"
                    },
                    "splitCriteria": {
                        "type": "object",
                        "description": "Custom criteria for splitting (e.g., entity_id, category, etc.)"
                    },
                    "fileNamingPattern": {
                        "type": "string",
                        "description": "Pattern for generating filenames (e.g., '{entity_name}_data.docx')"
                    },
                    "author": {"type": "string", "description": "Document author (optional)"},
                    "source_documents": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of source document IDs"
                    },
                    "extraction_method": {
                        "type": "string",
                        "default": "ai_extraction",
                        "description": "Method used for extraction"
                    }
                }
            },
            "documents": {
                "type": "array",
                "description": "Array of individual documents to generate",
                "items": {
                    "type": "object",
                    "required": ["id", "title", "sections", "filename"],
                    "properties": {
                        "id": {"type": "string", "description": "Unique document identifier"},
                        "title": {"type": "string", "description": "Document title"},
                        "filename": {"type": "string", "description": "Generated filename"},
                        "sections": {
                            "type": "array",
                            "description": "Document sections containing structured content",
                            "items": {
                                "type": "object",
                                "required": ["id", "content_type", "elements", "order"],
                                "properties": {
                                    "id": {"type": "string", "description": "Unique section identifier"},
                                    "title": {"type": "string", "description": "Section title (optional)"},
                                    "content_type": {
                                        "type": "string",
                                        "enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"],
                                        "description": "Primary content type of this section"
                                    },
                                    "elements": {
                                        "type": "array",
                                        "description": "Content elements in this section",
                                        "items": {
                                            "oneOf": [
                                                {"$ref": "#/definitions/table"},
                                                {"$ref": "#/definitions/bullet_list"},
                                                {"$ref": "#/definitions/paragraph"},
                                                {"$ref": "#/definitions/heading"},
                                                {"$ref": "#/definitions/code_block"}
                                            ]
                                        }
                                    },
                                    "order": {"type": "integer", "description": "Section order in document"},
                                    "metadata": {
                                        "type": "object",
                                        "description": "Additional section metadata"
                                    }
                                }
                            }
                        },
                        "metadata": {
                            "type": "object",
                            "description": "Document-specific metadata"
                        }
                    }
                }
            }
        },
        # Shared element shapes targeted by the $refs above.
        "definitions": {
            "table": {
                "type": "object",
                "required": ["headers", "rows"],
                "properties": {
                    "headers": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Table column headers"
                    },
                    "rows": {
                        "type": "array",
                        "items": {
                            "type": "array",
                            "items": {"type": "string"}
                        },
                        "description": "Table data rows"
                    },
                    "caption": {
                        "type": "string",
                        "description": "Table caption (optional)"
                    }
                }
            },
            "bullet_list": {
                "type": "object",
                "required": ["items"],
                "properties": {
                    # NOTE: item shape duplicates "list_item" inline rather than
                    # using a $ref; keep both in sync when editing.
                    "items": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "required": ["text"],
                            "properties": {
                                "text": {"type": "string", "description": "List item text"},
                                "subitems": {
                                    "type": "array",
                                    "items": {"$ref": "#/definitions/list_item"},
                                    "description": "Nested sub-items (optional)"
                                }
                            }
                        },
                        "description": "List items"
                    },
                    "list_type": {
                        "type": "string",
                        "enum": ["bullet", "numbered", "checklist"],
                        "default": "bullet",
                        "description": "Type of list"
                    }
                }
            },
            "list_item": {
                "type": "object",
                "required": ["text"],
                "properties": {
                    "text": {"type": "string", "description": "List item text"},
                    "subitems": {
                        "type": "array",
                        "items": {"$ref": "#/definitions/list_item"},
                        "description": "Nested sub-items (optional)"
                    }
                }
            },
            "paragraph": {
                "type": "object",
                "required": ["text"],
                "properties": {
                    "text": {"type": "string", "description": "Paragraph text"},
                    "formatting": {
                        "type": "object",
                        "description": "Text formatting (bold, italic, etc.)"
                    }
                }
            },
            "heading": {
                "type": "object",
                "required": ["text", "level"],
                "properties": {
                    "text": {"type": "string", "description": "Heading text"},
                    "level": {
                        "type": "integer",
                        "minimum": 1,
                        "maximum": 6,
                        "description": "Heading level (1-6)"
                    }
                }
            },
            "code_block": {
                "type": "object",
                "required": ["code"],
                "properties": {
                    "code": {"type": "string", "description": "Code content"},
                    "language": {"type": "string", "description": "Programming language (optional)"}
                }
            }
        }
    }
def get_document_subJsonSchema() -> Dict[str, Any]:
    """Get the JSON schema for structured document generation (single document).

    Built from named sub-schemas for readability; the returned structure is
    the canonical schema with element shapes under "definitions" referenced
    via "#/definitions/..." $refs.
    """
    # Shape of a (possibly nested) list item; also used inline by bullet_list.
    list_item_def = {
        "type": "object",
        "required": ["text"],
        "properties": {
            "text": {"type": "string", "description": "List item text"},
            "subitems": {
                "type": "array",
                "items": {"$ref": "#/definitions/list_item"},
                "description": "Nested sub-items (optional)"
            }
        }
    }
    definitions = {
        "table": {
            "type": "object",
            "required": ["headers", "rows"],
            "properties": {
                "headers": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Table column headers"
                },
                "rows": {
                    "type": "array",
                    "items": {
                        "type": "array",
                        "items": {"type": "string"}
                    },
                    "description": "Table data rows"
                },
                "caption": {
                    "type": "string",
                    "description": "Table caption (optional)"
                }
            }
        },
        "bullet_list": {
            "type": "object",
            "required": ["items"],
            "properties": {
                "items": {
                    "type": "array",
                    "items": list_item_def,
                    "description": "List items"
                },
                "list_type": {
                    "type": "string",
                    "enum": ["bullet", "numbered", "checklist"],
                    "default": "bullet",
                    "description": "Type of list"
                }
            }
        },
        "list_item": list_item_def,
        "paragraph": {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "Paragraph text"},
                "formatting": {
                    "type": "object",
                    "description": "Text formatting (bold, italic, etc.)"
                }
            }
        },
        "heading": {
            "type": "object",
            "required": ["text", "level"],
            "properties": {
                "text": {"type": "string", "description": "Heading text"},
                "level": {
                    "type": "integer",
                    "minimum": 1,
                    "maximum": 6,
                    "description": "Heading level (1-6)"
                }
            }
        },
        "code_block": {
            "type": "object",
            "required": ["code"],
            "properties": {
                "code": {"type": "string", "description": "Code content"},
                "language": {"type": "string", "description": "Programming language (optional)"}
            }
        }
    }
    # One document section: typed container of ordered elements.
    section_schema = {
        "type": "object",
        "required": ["id", "content_type", "elements", "order"],
        "properties": {
            "id": {"type": "string", "description": "Unique section identifier"},
            "title": {"type": "string", "description": "Section title (optional)"},
            "content_type": {
                "type": "string",
                "enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"],
                "description": "Primary content type of this section"
            },
            "elements": {
                "type": "array",
                "description": "Content elements in this section",
                "items": {
                    "oneOf": [
                        {"$ref": "#/definitions/table"},
                        {"$ref": "#/definitions/bullet_list"},
                        {"$ref": "#/definitions/paragraph"},
                        {"$ref": "#/definitions/heading"},
                        {"$ref": "#/definitions/code_block"}
                    ]
                }
            },
            "order": {"type": "integer", "description": "Section order in document"},
            "metadata": {
                "type": "object",
                "description": "Additional section metadata"
            }
        }
    }
    metadata_schema = {
        "type": "object",
        "required": ["title"],
        "properties": {
            "title": {"type": "string", "description": "Document title"},
            "author": {"type": "string", "description": "Document author (optional)"},
            "source_documents": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of source document IDs"
            },
            "extraction_method": {
                "type": "string",
                "default": "ai_extraction",
                "description": "Method used for extraction"
            }
        }
    }
    return {
        "type": "object",
        "required": ["metadata", "sections"],
        "properties": {
            "metadata": metadata_schema,
            "sections": {
                "type": "array",
                "description": "Document sections containing structured content",
                "items": section_schema
            },
            "summary": {
                "type": "string",
                "description": "Document summary (optional)"
            },
            "tags": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Document tags for categorization"
            }
        },
        "definitions": definitions
    }
def get_extraction_prompt_template() -> str:
    """Get the template for AI extraction prompts that request JSON output.

    The returned text is injected verbatim into the extraction prompt;
    the surrounding blank lines are part of the template.
    """
    return """
You are extracting structured content from documents. Your task is to analyze the provided content and generate a structured JSON document.
IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.
JSON Schema Requirements:
- Extract the actual data from the source documents
- If content is a table, extract it as a table with headers and rows
- If content is a list, extract it as a structured list with items
- If content is text, extract it as paragraphs or headings
- Preserve the original structure and data - do not summarize or interpret
- Use the exact JSON schema provided
Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
Return only the JSON structure following the schema. Do not include any text before or after the JSON.
"""
def get_generation_prompt_template() -> str:
    """Get the template for AI generation prompts that work with JSON input.

    Returns:
        A static instruction block telling the model to transform structured
        JSON input into an enhanced JSON document following the same schema,
        with guidance for tables, lists, headings, paragraphs and code.
    """
    return """
You are generating a document from structured JSON data. Your task is to create a well-formatted document based on the provided structured content.
IMPORTANT: You must respond with valid JSON only, following the document schema.
Generation Guidelines:
- Use the provided JSON structure as the foundation
- Enhance the content with proper formatting and organization
- Ensure logical flow and readability
- Maintain the original data integrity
- Add appropriate headings and sections
- Organize content in a logical sequence
Content Enhancement:
- Tables: Ensure proper headers and data alignment
- Lists: Use appropriate list types (bullet, numbered, checklist)
- Headings: Use appropriate heading levels for hierarchy
- Paragraphs: Ensure proper text flow and formatting
- Code: Preserve code blocks with proper language identification
Return only the enhanced JSON structure following the schema. Do not include any text before or after the JSON.
"""
def get_adaptive_json_schema(prompt_analysis: Dict[str, Any] = None) -> Dict[str, Any]:
    """Automatically select appropriate schema based on prompt analysis.

    Args:
        prompt_analysis: Analysis dict produced upstream; only the boolean
            flag ``is_multi_file`` is inspected here. May be None.
            NOTE(review): the annotation should be ``Optional[Dict[str, Any]]``
            since ``None`` is the default — confirm typing imports and fix.

    Returns:
        The multi-document schema when the analysis flags a multi-file
        request, otherwise the single-document schema.
    """
    if prompt_analysis and prompt_analysis.get("is_multi_file", False):
        return get_multi_document_subJsonSchema()
    else:
        return get_document_subJsonSchema()
def _validate_section_list(sections: Any) -> bool:
    """Check that *sections* is a list of structurally valid section dicts.

    A valid section is a dict containing "id", "content_type", "elements"
    and "order", where "content_type" is one of the known types and
    "elements" is a list. (Shared by single- and multi-document validation.)
    """
    if not isinstance(sections, list):
        return False
    valid_types = {"table", "list", "paragraph", "heading", "code", "image", "mixed"}
    for section in sections:
        if not isinstance(section, dict):
            return False
        for field in ("id", "content_type", "elements", "order"):
            if field not in section:
                return False
        if section["content_type"] not in valid_types:
            return False
        if not isinstance(section["elements"], list):
            return False
    return True


def validate_json_document(json_data: Dict[str, Any]) -> bool:
    """Validate that the JSON data follows the document schema.

    Accepts either the multi-document shape (top-level "documents" list with
    "metadata" carrying "title" and "splitStrategy") or the single-document
    shape (top-level "sections" list with "metadata" carrying "title").

    Args:
        json_data: Parsed JSON object to validate.

    Returns:
        True when the structure matches one of the two shapes, else False.
        Never raises; any unexpected error is reported as invalid.
    """
    try:
        if not isinstance(json_data, dict):
            return False
        if "documents" in json_data:
            # Multi-document structure
            metadata = json_data.get("metadata")
            if not isinstance(metadata, dict) or "title" not in metadata or "splitStrategy" not in metadata:
                return False
            documents = json_data["documents"]
            if not isinstance(documents, list):
                return False
            for doc in documents:
                if not isinstance(doc, dict):
                    return False
                for field in ("id", "title", "sections", "filename"):
                    if field not in doc:
                        return False
                # Per-section checks are shared with the single-document path.
                if not _validate_section_list(doc.get("sections", [])):
                    return False
            return True
        if "sections" in json_data:
            # Single-document structure
            metadata = json_data.get("metadata")
            if not isinstance(metadata, dict) or "title" not in metadata:
                return False
            return _validate_section_list(json_data["sections"])
        # Neither recognized shape
        return False
    except Exception:
        # Deliberate best-effort: any unexpected structure error means invalid.
        return False

View file

@ -0,0 +1,738 @@
"""
Prompt builder for AI document generation and extraction.
This module builds prompts for AI services to extract and generate documents.
"""
import json
import logging
from typing import Dict, Any, Optional, List, TYPE_CHECKING
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
# Type hint for renderer parameter
if TYPE_CHECKING:
from .renderers.rendererBaseTemplate import BaseRenderer
_RendererLike = BaseRenderer
else:
_RendererLike = Any
logger = logging.getLogger(__name__)
async def buildAdaptiveExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    promptAnalysis: Dict[str, Any],
    aiService=None,
    services=None
) -> str:
    """
    Build adaptive extraction prompt based on AI analysis.
    Uses multi-file or single-file approach based on analysis.

    Args:
        outputFormat: Target output format (not used in the prompt text; kept
            for API symmetry with the other builders).
        userPrompt: Raw user request; prepended verbatim to the prompt.
        title: Document title (not used here; kept for API symmetry).
        promptAnalysis: Analysis dict; only ``is_multi_file`` is read.
        aiService: Optional AI service (unused here; kept for API symmetry).
        services: Optional service center (unused here; kept for API symmetry).

    Returns:
        The fully assembled extraction prompt string.
    """
    # Multi-file example data instead of schema
    multi_file_example = {
        "metadata": {
            "title": "Multi-Document Example",
            "splitStrategy": "by_section",
            "source_documents": ["doc_001"],
            "extraction_method": "ai_extraction"
        },
        "documents": [
            {
                "id": "doc_section_1",
                "title": "Section 1 Title",
                "filename": "section_1.xlsx",
                "sections": [
                    {
                        "id": "section_1",
                        "content_type": "heading",
                        "elements": [
                            {
                                "level": 1,
                                "text": "1. SECTION TITLE"
                            }
                        ],
                        "order": 1
                    },
                    {
                        "id": "section_2",
                        "content_type": "paragraph",
                        "elements": [
                            {
                                "text": "This is the actual content that should be extracted from the document."
                            }
                        ],
                        "order": 2
                    },
                    {
                        "id": "section_3",
                        "content_type": "table",
                        "elements": [
                            {
                                "headers": ["Column 1", "Column 2"],
                                "rows": [["Value 1", "Value 2"]]
                            }
                        ],
                        "order": 3
                    }
                ]
            }
        ]
    }
    # Single-file example data instead of schema
    single_file_example = {
        "metadata": {
            "title": "Single Document Example",
            "source_documents": ["doc_001"],
            "extraction_method": "ai_extraction"
        },
        "sections": [
            {
                "id": "section_1",
                "content_type": "heading",
                "elements": [
                    {
                        "level": 1,
                        "text": "1. SECTION TITLE"
                    }
                ],
                "order": 1
            },
            {
                "id": "section_2",
                "content_type": "paragraph",
                "elements": [
                    {
                        "text": "This is the actual content that should be extracted from the document."
                    }
                ],
                "order": 2
            },
            {
                "id": "section_3",
                "content_type": "table",
                "elements": [
                    {
                        "headers": ["Column 1", "Column 2"],
                        "rows": [["Value 1", "Value 2"]]
                    }
                ],
                "order": 3
            }
        ]
    }
    if promptAnalysis.get("is_multi_file", False):
        # Multi-file prompt.
        # BUGFIX: the instructions previously told the model to replace
        # "REPLACE_WITH_ACTUAL_*" placeholders, but the embedded example JSON
        # contains no such placeholders — the stale reference could confuse the
        # model. The instruction now refers to the example values instead.
        adaptive_prompt = f"""
{userPrompt}
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
TASK: Extract the actual content from the document and organize it into separate sections, where each section will become a separate file.
REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Identify distinct sections in the document (by headings, topics, or logical breaks)
3. Create one JSON document entry for each section found
4. Extract the real content from each section (headings, paragraphs, lists, etc.)
5. Generate appropriate filenames for each section
CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array.
OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(multi_file_example, indent=2)}
IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have:
- "id": unique identifier
- "title": section title from the document
- "filename": appropriate filename for the section
- "sections": array of content sections
DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level.
INSTRUCTIONS:
- Replace the example values with real content from the document
- Use actual section titles, headings, and text from the document
- Create meaningful filenames based on section content
- Ensure each section contains the complete content for that part of the document
- Do not use generic placeholder text like "Section 1", "Section 2"
- Extract real headings, paragraphs, lists, and other content elements
- CRITICAL: Return JSON with "documents" array, not "sections" array
CONTEXT (Document Content):
Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()
    else:
        # Single-file prompt - use example data instead of schema
        adaptive_prompt = f"""
{userPrompt}
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
TASK: Extract the actual content from the document and organize it into structured sections.
REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Extract all content and organize it into logical sections
3. Create structured JSON with sections containing the extracted content
4. Preserve the original structure and data
OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(single_file_example, indent=2)}
INSTRUCTIONS:
- Replace example data with actual content from the document
- Use actual headings, paragraphs, and text from the document
- Ensure all content is properly structured
- Do not use generic placeholder text
- Extract real content from the documents
CONTEXT (Document Content):
Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()
    return adaptive_prompt
async def buildGenericExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """Build generic extraction prompt that works for both single and multi-file.

    When an AI service is available, it is asked whether the request needs a
    multi-file or single-file structure and the adaptive builder is used;
    on any failure (or without an AI service) a single-file prompt is built.

    Args:
        outputFormat: Target format, forwarded to the adaptive builder.
        userPrompt: Raw user request; prepended verbatim to the prompt.
        title: Document title, forwarded to the adaptive builder.
        aiService: Optional AI service used for the structure analysis.
        services: Optional service center; only used for debug logging.

    Returns:
        The assembled extraction prompt string.
    """
    # Use AI to determine the best approach
    if aiService:
        try:
            analysis_prompt = f"""
Analyze this user request and determine the best JSON structure for document extraction.
User request: "{userPrompt}"
Respond with JSON only:
{{
"requires_multi_file": true/false,
"recommended_schema": "single_document|multi_document",
"split_approach": "description of how to organize content",
"file_naming": "suggested naming pattern"
}}
Consider the user's intent and the most logical way to organize the extracted content.
"""
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL
            request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
            response = await aiService.aiObjects.call(request)
            if response and response.content:
                import re
                result = response.content.strip()
                # Tolerate extra prose around the JSON object in the AI reply.
                json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)
                analysis = json.loads(result)
                # Use analysis to build appropriate prompt
                return await buildAdaptiveExtractionPrompt(
                    outputFormat, userPrompt, title, analysis, aiService, services
                )
        except Exception as e:
            # BUGFIX: guard the debug call — `services` defaults to None and the
            # original unguarded call raised AttributeError from inside the
            # handler instead of falling back to the single-file prompt.
            if services:
                services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER")
    # Fallback to single-file prompt
    example_data = {
        "metadata": {
            "title": "Example Document",
            "author": "AI Assistant",
            "source_documents": ["document_001"],
            "extraction_method": "ai_extraction"
        },
        "sections": [
            {
                "id": "section_001",
                "content_type": "heading",
                "elements": [
                    {
                        "level": 1,
                        "text": "1. SECTION TITLE"
                    }
                ],
                "order": 1,
                "metadata": {}
            }
        ],
        "summary": "",
        "tags": []
    }
    return f"""
{userPrompt}
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
TASK: Extract the actual content from the document and organize it into structured sections.
REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Extract all content and organize it into logical sections
3. Create structured JSON with sections containing the extracted content
4. Preserve the original structure and data
OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(example_data, indent=2)}
Requirements:
- Preserve all original data - do not summarize or interpret
- Use the exact JSON format shown above
- Maintain data integrity and structure
Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
DO NOT return a schema description - return actual extracted content in the JSON format shown above.
"""
async def buildExtractionPrompt(
    outputFormat: str,
    renderer: _RendererLike,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """
    Build the final extraction prompt by combining:
    - Parsed extraction intent from user prompt (using AI)
    - Generic cross-format instructions (filename header + real-data policy)
    - Format-specific guidelines snippet provided by the renderer
    The AI must place a single filename header at the very top:
    FILENAME: <safe-file-name-with-extension>
    followed by a blank line and then ONLY the document content according to the target format.

    Args:
        outputFormat: Target output format name.
        renderer: Renderer that may expose ``getExtractionGuidelines()``.
        userPrompt: Raw user request; its intent is parsed via the AI service.
        title: Document title (not used here; kept for API symmetry).
        aiService: Optional AI service for intent parsing.
        services: Optional service center for config lookup / debug artifacts.

    Returns:
        The combined extraction prompt string.
    """
    # Parse user prompt to separate extraction intent from generation format using AI
    extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services)
    # BUGFIX (dead code removed): the original imported get_document_subJsonSchema
    # and computed a `jsonSchema` value that was never used anywhere below.
    # Generic block for JSON extraction - use mixed example data showing different content types
    example_data = {
        "metadata": {
            "title": "Example Document",
            "author": "AI Assistant",
            "source_documents": ["document_001"],
            "extraction_method": "ai_extraction"
        },
        "sections": [
            {
                "id": "section_001",
                "content_type": "heading",
                "elements": [
                    {
                        "level": 1,
                        "text": "1. INTRODUCTION"
                    }
                ],
                "order": 1,
                "metadata": {}
            },
            {
                "id": "section_002",
                "content_type": "paragraph",
                "elements": [
                    {
                        "text": "This is a sample paragraph with actual content that should be extracted from the document."
                    }
                ],
                "order": 2,
                "metadata": {}
            },
            {
                "id": "section_003",
                "content_type": "table",
                "elements": [
                    {
                        "headers": ["Column 1", "Column 2", "Column 3"],
                        "rows": [
                            ["Value 1", "Value 2", "Value 3"],
                            ["Value 4", "Value 5", "Value 6"]
                        ]
                    }
                ],
                "order": 3,
                "metadata": {}
            }
        ],
        "summary": "",
        "tags": []
    }
    genericIntro = f"""
{extractionIntent}
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
TASK: Extract the actual content from the document and organize it into structured sections.
REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Extract all content and organize it into logical sections
3. Create structured JSON with sections containing the extracted content
4. Preserve the original structure and data
OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(example_data, indent=2)}
Requirements:
- Preserve all original data - do not summarize or interpret
- Use the exact JSON format shown above
- Maintain data integrity and structure
Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
DO NOT return a schema description - return actual extracted content in the JSON format shown above.
"""
    # Get format-specific guidelines from renderer
    formatGuidelines = ""
    try:
        if hasattr(renderer, 'getExtractionGuidelines'):
            formatGuidelines = renderer.getExtractionGuidelines()
    except Exception:
        # Renderer guidelines are best-effort; a failure must not break the prompt.
        pass
    # Combine all parts
    finalPrompt = f"{genericIntro}\n\n{formatGuidelines}".strip()
    # Save extraction prompt to debug file - only if debug enabled
    try:
        debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
        if debug_enabled:
            import os
            from datetime import datetime, UTC
            ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            debug_root = "./test-chat/ai"
            os.makedirs(debug_root, exist_ok=True)
            with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f:
                f.write(finalPrompt)
    except Exception:
        # Debug artifact writing is best-effort (services may be None).
        pass
    return finalPrompt
async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """
    Use AI to build the generation prompt based on user intent and format requirements.
    Focus on what's important for the user and how to structure the content.

    Args:
        outputFormat: Target document format (e.g. "xlsx", "pdf").
        userPrompt: Raw user request; quoted into the meta-prompt.
        title: Document title.
        aiService: Optional AI service; without it a static fallback is returned.
        services: Optional service center for debug logging; may be None.

    Returns:
        Generation instructions produced by the AI, or a static fallback string.
    """
    if not aiService:
        # Fallback if no AI service available
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."

    def _log(message: str) -> None:
        # BUGFIX: best-effort debug logging — `services` defaults to None and
        # the original unguarded services.utils.debugLogToFile calls raised
        # AttributeError, including inside the exception handler below, which
        # masked the intended fallback return.
        if services:
            services.utils.debugLogToFile(message, "PROMPT_BUILDER")

    try:
        # Protect userPrompt from injection
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
        # Debug output
        _log(f"GENERATION PROMPT REQUEST: buildGenerationPrompt called with outputFormat='{outputFormat}', title='{title}'")
        # AI call to generate the appropriate generation prompt
        generationPromptRequest = f"""
You are creating instructions for an AI to generate JSON content in the CANONICAL FORMAT that will be converted to a {outputFormat} document.
User request: "{safeUserPrompt}"
Document title: "{title}"
Target format: {outputFormat}
Write clear, detailed instructions that tell the AI how to generate JSON content using the CANONICAL JSON FORMAT. Focus on:
1. What content is most important for the user
2. How to structure and organize the content using the canonical JSON format with 'sections'
3. Specific formatting requirements for the target format
4. Language requirements to preserve
5. How to ensure the JSON content meets the user's needs
CRITICAL: The AI MUST generate content using the CANONICAL JSON FORMAT with this exact structure:
{{
"metadata": {{
"title": "Document Title"
}},
"sections": [
{{
"id": "section_1",
"content_type": "heading",
"elements": [
{{
"level": 1,
"text": "1. SECTION TITLE"
}}
],
"order": 1
}},
{{
"id": "section_2",
"content_type": "paragraph",
"elements": [
{{
"text": "This is the actual content that should be extracted from the document."
}}
],
"order": 2
}},
{{
"id": "section_3",
"content_type": "table",
"elements": [
{{
"headers": ["Column 1", "Column 2", "Column 3"],
"rows": [
["Value 1", "Value 2", "Value 3"],
["Value 4", "Value 5", "Value 6"]
]
}}
],
"order": 3
}}
]
}}
The AI should NOT create format-specific structures like "sheets" or "columns" - only use the canonical format with "sections" and "elements".
Write the instructions as plain text, not JSON. Start with "Generate JSON content that..." and provide clear, actionable instructions for creating structured JSON data in the canonical format.
"""
        # Call AI service to generate the prompt
        _log("GENERATION PROMPT REQUEST: Calling AI for generation prompt...")
        # Import and set proper options for AI call
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL
        request = AiCallRequest(prompt=generationPromptRequest, context="", options=request_options)
        response = await aiService.aiObjects.call(request)
        result = response.content if response else ""
        # Replace the placeholder that the AI created with actual format rules
        if result:
            formatRules = _getFormatRules(outputFormat)
            result = result.replace("PLACEHOLDER_FOR_FORMAT_RULES", formatRules)
        # Debug output
        _log("GENERATION PROMPT: Generated successfully")
        # Save full generation prompt and AI response to debug file - only if debug enabled
        try:
            debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) if services else False
            if debug_enabled:
                import os
                from datetime import datetime, UTC
                ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
                debug_root = "./test-chat/ai"
                os.makedirs(debug_root, exist_ok=True)
                with open(os.path.join(debug_root, f"{ts}_generation_prompt.txt"), "w", encoding="utf-8") as f:
                    f.write(f"GENERATION PROMPT REQUEST:\n{generationPromptRequest}\n\n")
                    f.write(f"GENERATION PROMPT AI RESPONSE:\n{response.content if response else 'No response'}\n\n")
                    f.write(f"GENERATION PROMPT FINAL:\n{result if result else 'None'}\n")
        except Exception:
            # Debug artifact writing is best-effort.
            pass
        return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."
    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions
        _log(f"DEBUG: AI generation prompt failed: {str(e)}")
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}"
def _getFormatRules(outputFormat: str) -> str:
    """
    Get format-specific rules for the generation prompt.

    Args:
        outputFormat: Target format key, matched case-insensitively
            (e.g. "xlsx", "pdf", "docx", "html", "json", "csv", "txt").

    Returns:
        A rules text block for the given format; for unknown formats a
        generic template built from the format name is returned.
    """
    # Static rules text per supported output format; looked up below via
    # outputFormat.lower() so callers may pass any casing.
    format_rules = {
        "xlsx": """
XLSX Format Rules:
- Create tables with clear headers and organized data
- Use appropriate column widths and formatting
- Include summary information if relevant
- Ensure data is properly structured for spreadsheet analysis
""",
        "pdf": """
PDF Format Rules:
- Create professional document layout
- Use appropriate headings and sections
- Include proper spacing and formatting
- Ensure content is well-organized and readable
""",
        "docx": """
DOCX Format Rules:
- Create professional document layout
- Use appropriate headings and sections
- Include proper spacing and formatting
- Ensure content is well-organized and readable
""",
        "html": """
HTML Format Rules:
- Create clean, semantic HTML structure
- Use appropriate tags for content organization
- Include proper styling classes
- Ensure content is accessible and well-formatted
""",
        "json": """
JSON Format Rules:
- Create well-structured JSON data
- Use appropriate nesting and organization
- Include metadata and context information
- Ensure data is properly formatted and valid
""",
        "csv": """
CSV Format Rules:
- Create clear, organized tabular data
- Use appropriate headers and data types
- Ensure proper CSV formatting
- Include all relevant data in structured format
""",
        "txt": """
TXT Format Rules:
- Create clean, readable text format
- Use appropriate spacing and organization
- Include clear headings and sections
- Ensure content is well-structured and easy to read
"""
    }
    # Fall back to a generic rules block derived from the format name.
    return format_rules.get(outputFormat.lower(), f"""
{outputFormat.upper()} Format Rules:
- Create well-structured content appropriate for {outputFormat}
- Use appropriate formatting and organization
- Ensure content is clear and professional
- Include all relevant information in proper format
""")
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
"""
Parse user prompt to extract the core extraction intent.
"""
if not aiService:
return f"Extract content from the provided documents and create a {outputFormat} report."
try:
analysis_prompt = f"""
Analyze this user request and extract the core extraction intent:
User request: "{userPrompt}"
Target format: {outputFormat}
Extract the main intent and requirements for document processing. Focus on:
1. What content needs to be extracted
2. How it should be organized
3. Any specific requirements or preferences
Respond with a clear, concise statement of the extraction intent.
"""
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
request_options = AiCallOptions()
request_options.operationType = OperationType.GENERAL
request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
response = await aiService.aiObjects.call(request)
if response and response.content:
return response.content.strip()
else:
return f"Extract content from the provided documents and create a {outputFormat} report."
except Exception as e:
services.utils.debugLogToFile(f"Extraction intent analysis failed: {str(e)}", "PROMPT_BUILDER")
return f"Extract content from the provided documents and create a {outputFormat} report."

View file

@ -32,7 +32,7 @@ class NeutralizationService:
serviceCenter: Service center instance for accessing other services
NamesToParse: List of names to parse and replace (case-insensitive)
"""
self.serviceCenter = serviceCenter
self.services = serviceCenter
self.interfaceDbApp = serviceCenter.interfaceDbApp
# Initialize anonymization processors

View file

@ -0,0 +1,264 @@
import json
import os
from typing import Any, Dict, List, Set
from datetime import datetime, UTC
class NormalizationService:
    """
    Produces a single canonical table in merged JSON using an AI-provided header mapping
    and deterministic, in-code value normalization. No language heuristics in code.
    """
    def __init__(self, services):
        # Service center giving access to shared services; this class uses
        # services.ai.callAi for header-mapping requests.
        self.services = services
    # Public API
def discoverStructures(self, mergedJson: Dict[str, Any]) -> Dict[str, Any]:
headers: Set[str] = set()
samples: Dict[str, List[str]] = {}
sections = mergedJson.get("sections", []) if isinstance(mergedJson, dict) else []
for section in sections:
if not isinstance(section, dict):
continue
# Use only the fundamental agreed JSON structure: content_type/elements
if section.get("content_type") != "table":
continue
# Extract table data from elements array
hdrs = []
rows = []
for element in section.get("elements", []):
if isinstance(element, dict) and "headers" in element and "rows" in element:
hdrs = element.get("headers") or []
rows = element.get("rows") or []
break
if not hdrs or not rows:
continue
for h in hdrs:
if not isinstance(h, str):
continue
headers.add(h)
# collect small value samples by column index
for row in rows[:5]:
if not isinstance(row, list):
continue
for i, value in enumerate(row):
headerName = hdrs[i] if i < len(hdrs) else f"col_{i}"
if headerName not in samples:
samples[headerName] = []
if len(samples[headerName]) < 5:
samples[headerName].append(str(value))
return {
"tableHeaders": sorted(list(headers)),
"headerSamples": samples,
}
    async def requestHeaderMapping(self, inventory: Dict[str, Any], cacheKey: str, canonicalSpec: Dict[str, Any] | None = None, mergePrompt: str | None = None) -> Dict[str, Any]:
        """Ask the AI service to map discovered table headers to canonical headers.

        Args:
            inventory: Output of discoverStructures (discovered headers + samples).
            cacheKey: Cache identifier. NOTE(review): currently unused in this
                method — confirm whether caching was intended here.
            canonicalSpec: Optional canonical schema; defaults to a spec built
                from the discovered headers.
            mergePrompt: Optional merge-prompt context, quoted into the AI prompt.

        Returns:
            Parsed mapping dict (keys "mapping", "normalizationPolicy",
            "canonicalHeaders"); empty mapping/policy on any AI or parse failure.
        """
        # Allow caller to specify any canonical schema. If none provided, default to discovered headers.
        if canonicalSpec is None:
            canonicalSpec = {
                "canonicalHeaders": inventory.get("tableHeaders", []),
                "constraints": {}
            }
        # Protect merge prompt context by wrapping in single quotes and escaping internal quotes
        protectedMerge = None
        if mergePrompt:
            try:
                protectedMerge = str(mergePrompt).replace("'", "\\'")
            except Exception:
                protectedMerge = str(mergePrompt)
        prompt = (
            "You are a mapping generator. Return ONLY JSON.\n\n"
            "Given discovered headers and sample values, map them to the canonical headers.\n"
            "Do not invent fields. Use null if no mapping. Provide normalization policy.\n\n"
            f"CANONICAL_SPEC:\n{json.dumps(canonicalSpec, ensure_ascii=False, indent=2)}\n\n"
            f"HEADERS_DISCOVERED:\n{json.dumps(inventory, ensure_ascii=False, indent=2)}\n\n"
            + (f"MERGE_PROMPT_CONTEXT (protected):\n'{protectedMerge}'\n\n" if protectedMerge is not None else "") +
            "REPLY JSON SHAPE:\n(Example)\n"
            "{\n \"mappings\": {\"<sourceHeader>\": \"<Canonical>|null\"},\n"
            " \"normalizationPolicy\": {\n \"TotalAmount\": {\"decimalSeparator\": \",\"|\".\"},\n"
            " \"Currency\": {\"stripSymbols\": true},\n"
            " \"Date\": {\"formats\": [\"DD.MM.YYYY\",\"YYYY-MM-DD\"]}\n }\n}\n"
        )
        response = await self.services.ai.callAi(prompt=prompt)
        if not response:
            return {"mapping": {}, "normalizationPolicy": {}}
        # Extract JSON from response more safely
        # (first "{" to last "}" — tolerates prose around the JSON object).
        start_idx = response.find('{')
        end_idx = response.rfind('}')
        if start_idx == -1 or end_idx == -1 or start_idx >= end_idx:
            return {"mapping": {}, "normalizationPolicy": {}}
        js = response[start_idx:end_idx + 1]
        try:
            mapping = json.loads(js)
        except json.JSONDecodeError:
            return {"mapping": {}, "normalizationPolicy": {}}
        # Normalize key naming from AI: prefer single key "mapping"
        if "mapping" not in mapping and "mappings" in mapping and isinstance(mapping["mappings"], dict):
            mapping["mapping"] = mapping["mappings"]
            try:
                del mapping["mappings"]
            except Exception:
                pass
        # Ensure canonicalHeaders present in mapping for downstream use
        if "canonicalHeaders" not in mapping:
            mapping["canonicalHeaders"] = canonicalSpec.get("canonicalHeaders", [])
        # debug artifact
        self._writeDebugArtifact("mapping.json", mapping)
        return mapping
def applyMapping(self, mergedJson: Dict[str, Any], mappingSpec: Dict[str, Any]) -> Dict[str, Any]:
    """Project merged table sections onto the canonical header set.

    Walks every ``content_type == "table"`` section of *mergedJson*, resolves
    each canonical header to a source column via *mappingSpec*'s ``mapping``
    (falling back to an identity match when names are equal), normalizes each
    cell through ``_normalizeValue`` and returns a canonical document with a
    single canonical table section. Rows whose cells are all empty after
    normalization are dropped.

    Args:
        mergedJson: Merged document shaped as ``{"metadata": ..., "sections":
            [{"content_type": "table", "elements": [{"headers": [...],
            "rows": [...]}]}]}``. Non-dict input yields an empty canonical
            table with default metadata.
        mappingSpec: Dict with ``mapping`` (source header -> canonical header
            or ``None``), ``normalizationPolicy`` and optionally
            ``canonicalHeaders``.

    Returns:
        Canonical document dict (metadata plus one canonical table section).
    """
    mappings = (mappingSpec or {}).get("mapping", {})
    policy = (mappingSpec or {}).get("normalizationPolicy", {})
    # Prefer headers provided by mapping (generic across domains);
    # fall back to the union of mapped targets.
    canonicalHeaders = (mappingSpec or {}).get("canonicalHeaders") or []
    if not canonicalHeaders:
        canonicalHeaders = sorted({t for t in mappings.values() if t})
    rows: List[List[str]] = []
    # Guard both sections AND metadata access: the original guarded only
    # sections, then crashed on non-dict input when reading metadata below.
    isDict = isinstance(mergedJson, dict)
    sections = mergedJson.get("sections", []) if isDict else []
    metadata = mergedJson.get("metadata", {}) if isDict else {}
    for section in sections:
        # Use only the fundamental agreed JSON structure: content_type/elements
        if section.get("content_type") != "table":
            continue
        # Extract table data from the first element carrying headers + rows
        sourceHeaders = []
        sourceRows = []
        for element in section.get("elements", []):
            if isinstance(element, dict) and "headers" in element and "rows" in element:
                sourceHeaders = element.get("headers") or []
                sourceRows = element.get("rows") or []
                break
        if not sourceHeaders or not sourceRows:
            continue
        # Build index map: canonical header -> source column index (or None)
        indexMap: Dict[str, int] = {}
        for ch in canonicalHeaders:
            srcIndex = None
            for si, sh in enumerate(sourceHeaders):
                # Prefer explicit mapping target; fallback to identity when names match
                target = mappings.get(sh)
                if target is None and sh == ch:
                    target = ch
                if target == ch:
                    srcIndex = si
                    break
            indexMap[ch] = srcIndex
        # Transform rows into canonical column order
        for r in sourceRows:
            canonicalRow: List[str] = []
            for ch in canonicalHeaders:
                idx = indexMap.get(ch)
                try:
                    value = r[idx] if (idx is not None and idx < len(r)) else ""
                except (IndexError, KeyError, TypeError):
                    # Corrupted or non-indexable row data: degrade to empty cell
                    value = ""
                canonicalRow.append(self._normalizeValue(ch, value, policy))
            # Keep the row only if at least one non-empty meaningful field remains
            if any(v.strip() for v in canonicalRow):
                rows.append(canonicalRow)
    canonical = {
        "metadata": {
            "title": metadata.get("title", "Merged Document"),
            "source_documents": metadata.get("source_documents", [])
        },
        "sections": [
            {
                "id": "canonical_table_1",
                "content_type": "table",
                "elements": [
                    {
                        "headers": canonicalHeaders,
                        "rows": rows
                    }
                ],
                "order": 1
            }
        ]
    }
    # debug artifact
    self._writeDebugArtifact("canonical_merged.json", canonical)
    return canonical
def validateCanonical(self, canonicalJson: Dict[str, Any]) -> Dict[str, Any]:
    """Validate a canonical document by counting its table rows.

    Collects the rows of every ``content_type == "table"`` section and
    reports success when at least one row was found. Malformed input is
    treated as an empty document rather than raising.

    Args:
        canonicalJson: Canonical document produced by ``applyMapping``.

    Returns:
        Dict with ``rowCount`` (int) and ``success`` (bool).
    """
    collected: List[Any] = []
    try:
        for section in canonicalJson.get("sections", []):
            if section.get("content_type") != "table":
                continue
            # Rows live inside the elements array of each table section
            for element in section.get("elements", []):
                if isinstance(element, dict) and "rows" in element:
                    collected.extend(element.get("rows", []))
    except Exception:
        # Malformed input: treat as an empty (failed) canonical document
        collected = []
    report = {"rowCount": len(collected), "success": bool(collected)}
    self._writeDebugArtifact("normalization_report.json", report)
    return report
# Internal helpers
def _normalizeValue(self, canonicalHeader: str, value: Any, policy: Dict[str, Any]) -> str:
if value is None:
return ""
text = str(value).strip()
# Generic normalization guided by policy; avoid domain specifics
if canonicalHeader in (policy.get("numericFields", []) or []):
dec = ((policy.get(canonicalHeader) or {}).get("decimalSeparator")
or (policy.get("numeric") or {}).get("decimalSeparator")
or ".")
if dec == ",":
text = text.replace(".", "").replace(",", ".") if "," in text else text
text = ''.join(ch for ch in text if ch.isdigit() or ch in ['.', '-', '+'])
elif (policy.get("text") or {}).get("stripSymbols") and canonicalHeader in (policy.get("text", {}).get("applyTo", []) or []):
text = ''.join(ch for ch in text if ch.isalpha())
text = text.upper()
return text
def _writeDebugArtifact(self, fileName: str, obj: Any) -> None:
try:
debugEnabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
if not debugEnabled:
return
root = "./test-chat/ai"
os.makedirs(root, exist_ok=True)
# Prefix timestamp for files that are frequently overwritten
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
if fileName in ("mapping.json", "canonical_merged.json"):
outName = f"{ts}_{fileName}"
else:
outName = fileName
path = os.path.join(root, outName)
with open(path, "w", encoding="utf-8") as f:
if isinstance(obj, (dict, list)):
f.write(json.dumps(obj, ensure_ascii=False, indent=2))
else:
f.write(str(obj))
except Exception:
pass

View file

@ -21,7 +21,7 @@ class SharepointService:
Use setAccessTokenFromConnection() method to configure the access token before making API calls.
"""
self.serviceCenter = serviceCenter
self.services = serviceCenter
self.access_token = None
self.base_url = "https://graph.microsoft.com/v1.0"

View file

@ -16,7 +16,7 @@ class TicketService:
Args:
serviceCenter: Service center instance for accessing other services
"""
self.serviceCenter = serviceCenter
self.services = serviceCenter
async def _createTicketInterfaceByType(
self,

View file

@ -4,6 +4,7 @@ Provides centralized access to configuration, events, and other utilities.
"""
import logging
import os
from typing import Any, Optional, Dict, Callable
from modules.shared.configuration import APP_CONFIG
from modules.shared.eventManagement import eventManager
@ -140,3 +141,42 @@ class UtilsService:
except Exception as e:
logger.error(f"Error getting fresh token for connection {connectionId}: {str(e)}")
return None
def debugLogToFile(self, message: str, context: str = "DEBUG"):
    """
    Append a debug message to ``debug_workflow.log`` when debug logging is enabled.

    Reads ``APP_DEBUG_CHAT_WORKFLOW_ENABLED`` and ``APP_DEBUG_CHAT_WORKFLOW_DIR``
    via ``configGet``; a relative directory is resolved against the gateway
    directory (four levels above this module). Best-effort: all errors are
    swallowed to avoid logging recursion.

    Args:
        message: Debug message to log
        context: Context identifier for the debug message
    """
    try:
        # Check if debug logging is enabled
        if not self.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False):
            return
        # Get debug directory
        debug_dir = self.configGet("APP_DEBUG_CHAT_WORKFLOW_DIR", "./test-chat")
        if not os.path.isabs(debug_dir):
            # Relative paths are resolved against the gateway directory
            # (four levels above this module file)
            gateway_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
            debug_dir = os.path.join(gateway_dir, debug_dir)
        # Ensure debug directory exists
        os.makedirs(debug_dir, exist_ok=True)
        debug_file = os.path.join(debug_dir, "debug_workflow.log")
        # Format and append the debug entry
        timestamp = self.getUtcTimestamp()
        debug_entry = f"[{timestamp}] [{context}] {message}\n"
        with open(debug_file, "a", encoding="utf-8") as f:
            f.write(debug_entry)
    except Exception:
        # Don't log debug errors to avoid recursion (fixed: dropped the
        # unused `as e` binding from the original handler)
        pass

View file

@ -16,7 +16,7 @@ class WorkflowService:
"""Service class containing methods for document processing, chat operations, and workflow management"""
def __init__(self, serviceCenter):
self.serviceCenter = serviceCenter
self.services = serviceCenter
self.user = serviceCenter.user
self.workflow = serviceCenter.workflow
self.interfaceDbChat = serviceCenter.interfaceDbChat
@ -78,11 +78,15 @@ class WorkflowService:
def getChatDocumentsFromDocumentList(self, documentList: List[str]) -> List[ChatDocument]:
"""Get ChatDocuments from a list of document references using all three formats."""
try:
# Get the current workflow from services (same pattern as setWorkflowContext)
workflow = getattr(self.serviceCenter, 'currentWorkflow', None) or self.workflow
if not workflow:
logger.error("No workflow available for document list resolution")
return []
workflow = self.services.currentWorkflow
# Reload workflow from database to ensure we have all messages
if hasattr(workflow, 'id'):
try:
workflow = self.getWorkflow(workflow.id)
logger.debug(f"Reloaded workflow {workflow.id} with {len(workflow.messages)} messages")
except Exception as e:
logger.warning(f"Could not reload workflow from database: {str(e)}")
all_documents = []
for doc_ref in documentList:
@ -125,7 +129,9 @@ class WorkflowService:
break
if not message_found:
logger.warning(f"Message with ID {message_id} not found in workflow. Available message IDs: {[str(msg.id) for msg in workflow.messages]}")
available_ids = [str(msg.id) for msg in workflow.messages]
logger.error(f"Message with ID {message_id} not found in workflow. Available message IDs: {available_ids}")
raise ValueError(f"Document reference not found: docList:{message_id}:{label}")
elif len(parts) >= 2:
# Format: docList:<label> - find message by documentsLabel
label = parts[1]
@ -154,7 +160,8 @@ class WorkflowService:
else:
logger.debug(f"Found docList reference {doc_ref} but message has no documents")
else:
logger.debug(f"No messages found with documentsLabel: {label}")
logger.error(f"No messages found with documentsLabel: {label}")
raise ValueError(f"Document reference not found: docList:{label}")
else:
# Direct label reference (round1_task2_action3_contextinfo)
# Search for messages with matching documentsLabel to find the actual documents
@ -198,30 +205,8 @@ class WorkflowService:
else:
logger.debug(f"No documents found in newest message {newest_message.id}")
else:
logger.debug(f"No messages found with documentsLabel: {doc_ref}")
# Fallback: also check if any message has this documentsLabel as a prefix
logger.debug(f"Trying fallback search for messages with documentsLabel containing: {doc_ref}")
fallback_messages = []
for message in workflow.messages:
msg_documents_label = getattr(message, 'documentsLabel', '')
if msg_documents_label and msg_documents_label.startswith(doc_ref):
fallback_messages.append(message)
logger.debug(f"Found fallback message {message.id} with documentsLabel: {msg_documents_label}")
if fallback_messages:
# Sort by publishedAt descending (newest first)
fallback_messages.sort(key=lambda msg: getattr(msg, 'publishedAt', 0), reverse=True)
newest_fallback = fallback_messages[0]
logger.debug(f"Using fallback message {newest_fallback.id} with documentsLabel: {getattr(newest_fallback, 'documentsLabel', 'unknown')}")
if newest_fallback.documents:
doc_names = [doc.fileName for doc in newest_fallback.documents if hasattr(doc, 'fileName')]
logger.debug(f"Added {len(newest_fallback.documents)} documents from fallback message {newest_fallback.id}: {doc_names}")
all_documents.extend(newest_fallback.documents)
else:
logger.debug(f"No documents found in fallback message {newest_fallback.id}")
else:
logger.debug(f"No fallback messages found either")
logger.error(f"No messages found with documentsLabel: {doc_ref}")
raise ValueError(f"Document reference not found: {doc_ref}")
logger.debug(f"Resolved {len(all_documents)} documents from document list: {documentList}")
return all_documents
@ -260,7 +245,8 @@ class WorkflowService:
token_status = f"error: {str(e)}"
# Build enhanced reference with state information
base_ref = f"connection:{connection.authority.value}:{connection.externalUsername}:{connection.id}"
# Format: connection:msft:<username> (without UUID)
base_ref = f"connection:{connection.authority.value}:{connection.externalUsername}"
state_info = f" [status:{connection.status.value}, token:{token_status}]"
logger.debug(f"getConnectionReferenceFromUserConnection: Built reference: {base_ref + state_info}")
@ -283,26 +269,25 @@ class WorkflowService:
return None
def getUserConnectionFromConnectionReference(self, connectionReference: str) -> Optional[UserConnection]:
"""Get UserConnection from reference string (handles both old and enhanced formats)"""
"""Get UserConnection from reference string (handles new format without UUID)"""
try:
# Parse reference format: connection:{authority}:{username}:{id} [status:..., token:...]
# Parse reference format: connection:{authority}:{username} [status:..., token:...]
# Remove state information if present
base_reference = connectionReference.split(' [')[0]
parts = base_reference.split(':')
if len(parts) != 4 or parts[0] != "connection":
if len(parts) != 3 or parts[0] != "connection":
return None
authority = parts[1]
username = parts[2]
conn_id = parts[3]
# Get user connections through AppObjects interface
user_connections = self.interfaceDbApp.getUserConnections(self.user.id)
# Find matching connection
# Find matching connection by authority and username (no UUID needed)
for conn in user_connections:
if str(conn.id) == conn_id and conn.authority.value == authority and conn.externalUsername == username:
if conn.authority.value == authority and conn.externalUsername == username:
return conn
return None
@ -437,11 +422,7 @@ class WorkflowService:
def setWorkflowContext(self, round_number: int = None, task_number: int = None, action_number: int = None):
"""Set current workflow context for document generation and routing"""
try:
# Get the current workflow from services
workflow = getattr(self.serviceCenter, 'currentWorkflow', None) or self.workflow
if not workflow:
logger.error("No workflow available for context setting")
return
workflow = self.services.currentWorkflow
# Prepare update data
update_data = {}
@ -548,10 +529,7 @@ class WorkflowService:
def getDocumentCount(self) -> str:
"""Get document count for task planning (matching old handlingTasks.py logic)"""
try:
# Get the current workflow from services
workflow = getattr(self.serviceCenter, 'currentWorkflow', None) or self.workflow
if not workflow:
return "No documents available"
workflow = self.services.currentWorkflow
# Count documents from all messages in the workflow (like old system)
total_docs = 0
@ -570,10 +548,7 @@ class WorkflowService:
def getWorkflowHistoryContext(self) -> str:
"""Get workflow history context for task planning (matching old handlingTasks.py logic)"""
try:
# Get the current workflow from services
workflow = getattr(self.serviceCenter, 'currentWorkflow', None) or self.workflow
if not workflow:
return "No previous round context available"
workflow = self.services.currentWorkflow
# Check if there are any previous rounds by looking for "first" messages
has_previous_rounds = False
@ -622,15 +597,26 @@ class WorkflowService:
if not workflow or not hasattr(workflow, 'messages'):
return "No documents available"
# Use the provided workflow object directly to avoid database reload issues
# that can cause filename truncation. The workflow object should already be up-to-date.
logger.debug(f"Using provided workflow object for getAvailableDocuments (ID: {workflow.id if hasattr(workflow, 'id') else 'unknown'})")
# Debug: Check document filenames in the workflow object
if hasattr(workflow, 'messages') and workflow.messages:
for message in workflow.messages:
if hasattr(message, 'documents') and message.documents:
for doc in message.documents:
logger.debug(f"Workflow document {doc.id}: fileName='{doc.fileName}' (length: {len(doc.fileName)})")
# Get document reference list using the exact same logic as old system
document_list = self._getDocumentReferenceList(workflow)
# Build technical context string for AI action planning (exact copy of old system)
context = "AVAILABLE DOCUMENTS:\n\n"
# Build index string for AI action planning
context = ""
# Process chat exchanges (current round) - exact copy of old system
# Process current round exchanges first
if document_list["chat"]:
context += "CURRENT ROUND DOCUMENTS:\n"
context += "\nCurrent round documents:\n"
for exchange in document_list["chat"]:
# Generate docList reference for the exchange (using message ID and label)
# Find the message that corresponds to this exchange
@ -656,9 +642,9 @@ class WorkflowService:
context += f" - docItem:{doc_ref}\n"
context += "\n"
# Process history exchanges (previous rounds) - exact copy of old system
# Process previous rounds after
if document_list["history"]:
context += "WORKFLOW HISTORY DOCUMENTS:\n"
context += "\nPast rounds documents:\n"
for exchange in document_list["history"]:
# Generate docList reference for the exchange (using message ID and label)
# Find the message that corresponds to this exchange
@ -685,7 +671,7 @@ class WorkflowService:
context += "\n"
if not document_list["chat"] and not document_list["history"]:
context += "NO DOCUMENTS AVAILABLE - This workflow has no documents to process.\n"
context += "\nNO DOCUMENTS AVAILABLE - This workflow has no documents to process.\n"
return context
@ -713,39 +699,23 @@ class WorkflowService:
for message in reversed(workflow.messages):
is_first = message.status == "first" if hasattr(message, 'status') else False
# Build a DocumentExchange if message has documents
# Build a DocumentExchange if message has documents and an explicit documentsLabel
doc_exchange = None
if message.documents:
if message.actionId and message.documentsLabel:
# Validate that we use the same label as in the message
existing_label = getattr(message, 'documentsLabel', None)
if existing_label:
# Validate and use the message's actual documentsLabel
validated_label = self._validateDocumentLabelConsistency(message)
# Use the message's actual documentsLabel
doc_refs = []
for doc in message.documents:
doc_ref = self._getDocumentReferenceFromChatDocument(doc, message)
doc_refs.append(doc_ref)
doc_exchange = {
'documentsLabel': validated_label,
'documents': doc_refs
}
else:
# Generate new labels for documents without explicit labels
doc_refs = []
for doc in message.documents:
doc_ref = self._getDocumentReferenceFromChatDocument(doc, message)
doc_refs.append(doc_ref)
if doc_refs:
# Create a label based on message context
context_prefix = self._generateWorkflowContextPrefix(message)
context_label = f"{context_prefix}_context"
doc_exchange = {
'documentsLabel': context_label,
'documents': doc_refs
}
# IMPORTANT: Never synthesize new labels here. If a message lacks
# a documentsLabel, we skip adding an exchange for it.
# Append to appropriate container based on boundary
if doc_exchange:
@ -773,12 +743,22 @@ class WorkflowService:
"""Update file attributes (fileName, fileSize, mimeType) for documents"""
for doc in documents:
try:
# Debug: Log original filename before refresh
original_filename = doc.fileName
logger.debug(f"Before refresh - Document {doc.id}: fileName='{original_filename}' (length: {len(original_filename)})")
# Use the proper WorkflowService method to get file info
file_info = self.getFileInfo(doc.fileId)
if file_info:
db_filename = file_info.get("fileName", doc.fileName)
logger.debug(f"Database filename for {doc.id}: '{db_filename}' (length: {len(db_filename)})")
doc.fileName = file_info.get("fileName", doc.fileName)
doc.fileSize = file_info.get("size", doc.fileSize)
doc.mimeType = file_info.get("mimeType", doc.mimeType)
# Debug: Log final filename after refresh
logger.debug(f"After refresh - Document {doc.id}: fileName='{doc.fileName}' (length: {len(doc.fileName)})")
else:
logger.warning(f"File not found for document {doc.id}, fileId: {doc.fileId}")
except Exception as e:
@ -794,6 +774,8 @@ class WorkflowService:
def _getDocumentReferenceFromChatDocument(self, document, message) -> str:
"""Get document reference using document ID and filename."""
try:
# Debug logging to track filename truncation
logger.debug(f"Creating document reference for {document.id}: fileName='{document.fileName}' (length: {len(document.fileName)})")
# Use document ID and filename for simple reference
return f"docItem:{document.id}:{document.fileName}"
except Exception as e:
@ -844,14 +826,14 @@ class WorkflowService:
"""Get connection reference list (matching old handlingTasks.py logic)"""
try:
# Get connections from the database using the same logic as the old system
if hasattr(self.serviceCenter, 'interfaceDbApp') and hasattr(self.serviceCenter, 'user'):
userId = self.serviceCenter.user.id
connections = self.serviceCenter.interfaceDbApp.getUserConnections(userId)
if hasattr(self.services, 'interfaceDbApp') and hasattr(self.services, 'user'):
userId = self.services.user.id
connections = self.services.interfaceDbApp.getUserConnections(userId)
if connections:
# Format connections as reference strings using the same pattern as the old system
connectionRefs = []
for conn in connections:
# Create reference string in format: connection:{authority}:{username}:{id} [status:..., token:...]
# Create reference string in format: connection:{authority}:{username} [status:..., token:...]
# This matches the format expected by getUserConnectionFromConnectionReference()
ref = self.getConnectionReferenceFromUserConnection(conn)
connectionRefs.append(ref)

View file

@ -42,9 +42,7 @@ class MethodDocument(MethodBase):
- operationType (str, optional): extract_content | analyze_document | summarize_content. Default: extract_content.
- processDocumentsIndividually (bool, optional): Process each document separately. Default: True.
- chunkAllowed (bool, optional): Allow chunking for large inputs. Default: True.
- mergeStrategy (dict, optional): Merge strategy for chunked content.
- expectedDocumentFormats (list, optional): Desired output format specs.
- includeMetadata (bool, optional): Include file metadata. Default: True.
- outputMimeType (str, optional): MIME type for output file. Options: "text/plain" (default), "application/json", "text/csv", "text/html". Default: "text/plain".
"""
try:
documentList = parameters.get("documentList")
@ -54,13 +52,7 @@ class MethodDocument(MethodBase):
operationType = parameters.get("operationType", "extract_content")
processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
chunkAllowed = parameters.get("chunkAllowed", True)
mergeStrategy = parameters.get("mergeStrategy", {
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
})
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
includeMetadata = parameters.get("includeMetadata", True)
outputMimeType = parameters.get("outputMimeType", "text/plain")
if not documentList:
return ActionResult.isFailure(
@ -87,19 +79,16 @@ class MethodDocument(MethodBase):
compressContext=not chunkAllowed
)
# Add format instructions to prompt if expected formats are provided
# Add format instructions to prompt based on MIME type
enhanced_prompt = prompt
if expectedDocumentFormats:
format_instructions = []
for fmt in expectedDocumentFormats:
extension = fmt.get("extension", ".txt")
mime_type = fmt.get("mimeType", "text/plain")
description = fmt.get("description", "")
format_instructions.append(f"- {extension} ({mime_type}): {description}")
if format_instructions:
enhanced_prompt += f"\n\nPlease format the output as: {', '.join([fmt.get('extension', '.txt') for fmt in expectedDocumentFormats])}"
enhanced_prompt += f"\nExpected formats:\n" + "\n".join(format_instructions)
mime_type_mapping = {
"text/plain": (".txt", "Plain text format"),
"application/json": (".json", "Structured JSON format"),
"text/csv": (".csv", "Table format"),
"text/html": (".html", "HTML format")
}
extension, description = mime_type_mapping.get(outputMimeType, (".txt", "Plain text format"))
enhanced_prompt += f"\n\nPlease format the output as {extension} ({outputMimeType}): {description}"
# Use enhanced AI service for extraction
ai_response = await self.services.ai.callAi(
@ -125,8 +114,16 @@ class MethodDocument(MethodBase):
for i, chatDocument in enumerate(chatDocuments):
# Use the AI response directly - it already contains processed content
final_content = ai_response
final_mime_type = "text/plain"
final_extension = ".txt"
# Determine output format based on MIME type
mime_type_mapping = {
"text/plain": ".txt",
"application/json": ".json",
"text/csv": ".csv",
"text/html": ".html"
}
final_extension = mime_type_mapping.get(outputMimeType, ".txt")
final_mime_type = outputMimeType
# Create meaningful output fileName with workflow context
original_fileName = chatDocument.fileName
@ -156,9 +153,6 @@ class MethodDocument(MethodBase):
error=str(e)
)
@action
async def generate(self, parameters: Dict[str, Any]) -> ActionResult:
"""
@ -175,8 +169,6 @@ class MethodDocument(MethodBase):
- operationType (str, optional): generate_report | analyze_documents. Default: generate_report.
- processDocumentsIndividually (bool, optional): Process per document. Default: True.
- chunkAllowed (bool, optional): Allow chunking for large inputs. Default: True.
- mergeStrategy (dict, optional): Merging rules for multi-part generation.
- includeMetadata (bool, optional): Include file metadata. Default: True.
"""
try:
documentList = parameters.get("documentList")
@ -188,12 +180,6 @@ class MethodDocument(MethodBase):
operationType = parameters.get("operationType", "generate_report")
processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
chunkAllowed = parameters.get("chunkAllowed", True)
mergeStrategy = parameters.get("mergeStrategy", {
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
})
includeMetadata = parameters.get("includeMetadata", True)
if not documentList:
return ActionResult.isFailure(

View file

@ -31,14 +31,14 @@ class MethodAi(MethodBase):
async def process(self, parameters: Dict[str, Any]) -> ActionResult:
"""
GENERAL:
- Purpose: AI-based analysis and content generation with optional document context.
- Input requirements: aiPrompt (required); optional documentList, resultType, processingMode, includeMetadata, operationType, priority, maxCost, maxProcessingTime, requiredTags.
- Output format: Single or multiple documents in requested format.
- Purpose: Process a user prompt with optional unlimited input documents to produce one or many output documents of the SAME format.
- Input requirements: aiPrompt (required); optional documentList.
- Output format: Exactly one file format to select. For multiple output file formats to do different calls.
Parameters:
- aiPrompt (str, required): Instruction for the AI.
- documentList (list, optional): Document reference(s) for context.
- resultType (str, optional): Output extension (txt, json, md, csv, xml, html, pdf, docx, xlsx, png). Default: txt.
- resultType (str, optional): Output file extension - only one extension allowed (e.g. txt, json, md, csv, xml, html, pdf, docx, xlsx, png, ...). Default: txt.
- processingMode (str, optional): basic | advanced | detailed. Default: basic.
- includeMetadata (bool, optional): Include metadata when available. Default: True.
- operationType (str, optional): general | generate_plan | analyse_content | generate_content | web_research | image_analysis | image_generation. Default: general.
@ -169,12 +169,12 @@ class MethodAi(MethodBase):
Parameters:
- user_prompt (str, required): Research question or topic.
- urls (list, optional): Specific URLs to crawl.
- max_results (int, optional): Max search results. Default: 10.
- max_pages (int, optional): Max pages to crawl per site. Default: 10.
- max_results (int, optional): Max search results. Default: 5.
- max_pages (int, optional): Max pages to crawl per site. Default: 5.
- search_depth (str, optional): basic | advanced. Default: basic.
- extract_depth (str, optional): basic | advanced. Default: advanced.
- pages_search_depth (int, optional): Crawl depth level. Default: 2.
- country (str, optional): Country code for bias.
- country (str, optional): Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries).
- time_range (str, optional): d | w | m | y.
- topic (str, optional): general | news | academic.
- language (str, optional): Language code (e.g., de, en, fr).
@ -182,8 +182,8 @@ class MethodAi(MethodBase):
try:
user_prompt = parameters.get("user_prompt")
urls = parameters.get("urls")
max_results = parameters.get("max_results", 10)
max_pages = parameters.get("max_pages", 10)
max_results = parameters.get("max_results", 5)
max_pages = parameters.get("max_pages", 5)
search_depth = parameters.get("search_depth", "basic")
extract_depth = parameters.get("extract_depth", "advanced")
pages_search_depth = parameters.get("pages_search_depth", 2)

View file

@ -154,6 +154,12 @@ class MethodOutlook(MethodBase):
if not query or not query.strip():
# No query specified, just get emails from folder
if folder and folder.lower() != "all":
# Use folder name directly for well-known folders, or get folder ID
if folder.lower() in ["inbox", "drafts", "sentitems", "deleteditems"]:
params["$filter"] = f"parentFolderId eq '{folder}'"
else:
# For custom folders, we need to get the folder ID first
# This will be handled by the calling method
params["$filter"] = f"parentFolderId eq '{folder}'"
# Add orderby for basic queries
params["$orderby"] = "receivedDateTime desc"
@ -191,6 +197,16 @@ class MethodOutlook(MethodBase):
# Use only subject search to keep filter simple
# Handle wildcard queries specially
if clean_query == "*" or clean_query == "":
# For wildcard or empty query, don't use contains filter
# Just use folder filter if specified
if folder and folder.lower() != "all":
params["$filter"] = f"parentFolderId eq '{folder}'"
else:
# No filter needed for wildcard search across all folders
pass
else:
params["$filter"] = f"contains(subject,'{clean_query}')"
# Add folder filter if specified
@ -235,6 +251,10 @@ class MethodOutlook(MethodBase):
if '@' in filter_text and '.' in filter_text and ' ' not in filter_text and not filter_text.startswith('from:'):
return {"$filter": f"from/fromAddress/address eq '{filter_text}'"}
# Handle OData filter conditions (contains 'eq', 'ne', 'gt', 'lt', etc.)
if any(op in filter_text.lower() for op in [' eq ', ' ne ', ' gt ', ' lt ', ' ge ', ' le ', ' and ', ' or ']):
return {"$filter": filter_text}
# Handle text content - search in subject
return {"$filter": f"contains(subject,'{filter_text}')"}
@ -300,26 +320,31 @@ class MethodOutlook(MethodBase):
"""
GENERAL:
- Purpose: Read emails and metadata from a mailbox folder.
- Input requirements: connectionReference (required); optional folder, limit, filter, expectedDocumentFormats.
- Input requirements: connectionReference (required); optional folder, limit, filter, outputMimeType.
- Output format: JSON with emails and metadata.
Parameters:
- connectionReference (str, required): Microsoft connection label.
- folder (str, optional): Folder to read from. Default: Inbox.
- limit (int, optional): Maximum items to return. Default: 10.
- limit (int, optional): Maximum items to return. Must be > 0. Default: 1000.
- filter (str, optional): Sender, query operators, or subject text.
- expectedDocumentFormats (list, optional): Output format preferences.
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
"""
try:
connectionReference = parameters.get("connectionReference")
folder = parameters.get("folder", "Inbox")
limit = parameters.get("limit", 10)
filter = parameters.get("filter")
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
outputMimeType = parameters.get("outputMimeType", "application/json")
if not connectionReference:
return ActionResult.isFailure(error="Connection reference is required")
# Validate limit parameter
if limit <= 0:
limit = 1000
logger.warning(f"Invalid limit value ({limit}), using default value 1000")
# Validate filter parameter if provided
if filter:
# Remove any potentially dangerous characters that could break the filter
@ -343,8 +368,16 @@ class MethodOutlook(MethodBase):
"Content-Type": "application/json"
}
# Build the API request
# Get the folder ID for the specified folder
folder_id = self._getFolderId(folder, connection)
if folder_id:
# Build the API request with folder ID
api_url = f"{graph_url}/me/mailFolders/{folder_id}/messages"
else:
# Fallback: use folder name directly (for well-known folders like "Inbox")
api_url = f"{graph_url}/me/mailFolders/{folder}/messages"
logger.warning(f"Could not find folder ID for '{folder}', using folder name directly")
params = {
"$top": limit,
"$orderby": "receivedDateTime desc"
@ -380,7 +413,11 @@ class MethodOutlook(MethodBase):
"count": len(emails_data.get("value", [])),
"folder": folder,
"filter": filter,
"apiResponse": emails_data
"apiMetadata": {
"@odata.context": emails_data.get("@odata.context"),
"@odata.count": emails_data.get("@odata.count"),
"@odata.nextLink": emails_data.get("@odata.nextLink")
}
}
@ -405,18 +442,15 @@ class MethodOutlook(MethodBase):
logger.error(f"Error reading emails from Microsoft Graph API: {str(e)}")
return ActionResult.isFailure(error=f"Failed to read emails: {str(e)}")
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
# Determine output format based on MIME type
mime_type_mapping = {
"application/json": ".json",
"text/plain": ".txt",
"text/csv": ".csv"
}
output_extension = mime_type_mapping.get(outputMimeType, ".json")
output_mime_type = outputMimeType
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
@ -454,27 +488,32 @@ class MethodOutlook(MethodBase):
"""
GENERAL:
- Purpose: Search emails by query and return matching items with metadata.
- Input requirements: connectionReference (required); query (required); optional folder, limit, expectedDocumentFormats.
- Input requirements: connectionReference (required); query (required); optional folder, limit, outputMimeType.
- Output format: JSON with search results and metadata.
Parameters:
- connectionReference (str, required): Microsoft connection label.
- query (str, required): Search expression.
- folder (str, optional): Folder scope or All. Default: All.
- limit (int, optional): Maximum items to return. Default: 20.
- expectedDocumentFormats (list, optional): Output format preferences.
- limit (int, optional): Maximum items to return. Must be > 0. Default: 1000.
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
"""
try:
connectionReference = parameters.get("connectionReference")
query = parameters.get("query")
folder = parameters.get("folder", "All")
limit = parameters.get("limit", 20)
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
limit = parameters.get("limit", 1000)
outputMimeType = parameters.get("outputMimeType", "application/json")
# Validate parameters
if not connectionReference:
return ActionResult.isFailure(error="Connection reference is required")
# Validate limit parameter
if limit <= 0:
limit = 1000
logger.warning(f"Invalid limit value ({limit}), using default value 1000")
if not query or not query.strip():
return ActionResult.isFailure(error="Search query is required and cannot be empty")
@ -488,12 +527,15 @@ class MethodOutlook(MethodBase):
# Validate limit
try:
limit = int(limit)
if limit <= 0 or limit > 1000: # Microsoft Graph API has limits
limit = 20
logger.warning(f"Limit {limit} is out of range, using default value 20")
if limit <= 0:
limit = 1000
logger.warning(f"Invalid limit value (<=0), using default value 1000")
elif limit > 1000: # Microsoft Graph API has limits
limit = 1000
logger.warning(f"Limit {limit} exceeds maximum (1000), using 1000")
except (ValueError, TypeError):
limit = 20
logger.warning(f"Invalid limit value, using default value 20")
limit = 1000
logger.warning(f"Invalid limit value, using default value 1000")
# Get Microsoft connection
connection = self._getMicrosoftConnection(connectionReference)
@ -509,9 +551,18 @@ class MethodOutlook(MethodBase):
"Content-Type": "application/json"
}
# Get the folder ID for the specified folder if needed
folder_id = None
if folder and folder.lower() != "all":
folder_id = self._getFolderId(folder, connection)
if folder_id:
logger.debug(f"Found folder ID for '{folder}': {folder_id}")
else:
logger.warning(f"Could not find folder ID for '{folder}', using folder name directly")
# Build the search API request
api_url = f"{graph_url}/me/messages"
params = self._buildSearchParameters(query, folder, limit)
params = self._buildSearchParameters(query, folder_id or folder, limit)
# Log search parameters for debugging
logger.debug(f"Search query: '{query}'")
@ -605,7 +656,11 @@ class MethodOutlook(MethodBase):
"count": len(emails),
"folder": folder,
"limit": limit,
"apiResponse": search_data,
"apiMetadata": {
"@odata.context": search_data.get("@odata.context"),
"@odata.count": search_data.get("@odata.count"),
"@odata.nextLink": search_data.get("@odata.nextLink")
},
"searchParams": params
}
@ -618,18 +673,15 @@ class MethodOutlook(MethodBase):
logger.error(f"Error searching emails via Microsoft Graph API: {str(e)}")
return ActionResult.isFailure(error=f"Failed to search emails: {str(e)}")
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
# Determine output format based on MIME type
mime_type_mapping = {
"application/json": ".json",
"text/plain": ".txt",
"text/csv": ".csv"
}
output_extension = mime_type_mapping.get(outputMimeType, ".json")
output_mime_type = outputMimeType
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
@ -664,20 +716,20 @@ class MethodOutlook(MethodBase):
"""
GENERAL:
- Purpose: List draft emails from a folder.
- Input requirements: connectionReference (required); optional folder, limit, expectedDocumentFormats.
- Input requirements: connectionReference (required); optional folder, limit, outputMimeType.
- Output format: JSON with draft items and metadata.
Parameters:
- connectionReference (str, required): Microsoft connection label.
- folder (str, optional): Drafts folder to list. Default: Drafts.
- limit (int, optional): Maximum items to return. Default: 20.
- expectedDocumentFormats (list, optional): Output format preferences.
- limit (int, optional): Maximum items to return. Must be > 0. Default: 1000.
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
"""
try:
connectionReference = parameters.get("connectionReference")
folder = parameters.get("folder", "Drafts")
limit = parameters.get("limit", 20)
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
limit = parameters.get("limit", 1000)
outputMimeType = parameters.get("outputMimeType", "application/json")
if not connectionReference:
return ActionResult.isFailure(error="Connection reference is required")
@ -745,18 +797,15 @@ class MethodOutlook(MethodBase):
logger.error(f"Error listing drafts via Microsoft Graph API: {str(e)}")
return ActionResult.isFailure(error=f"Failed to list drafts: {str(e)}")
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
# Determine output format based on MIME type
mime_type_mapping = {
"application/json": ".json",
"text/plain": ".txt",
"text/csv": ".csv"
}
output_extension = mime_type_mapping.get(outputMimeType, ".json")
output_mime_type = outputMimeType
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
@ -790,18 +839,18 @@ class MethodOutlook(MethodBase):
"""
GENERAL:
- Purpose: Find draft emails across folders.
- Input requirements: connectionReference (required); optional limit, expectedDocumentFormats.
- Input requirements: connectionReference (required); optional limit, outputMimeType.
- Output format: JSON with drafts and metadata.
Parameters:
- connectionReference (str, required): Microsoft connection label.
- limit (int, optional): Maximum items to return. Default: 50.
- expectedDocumentFormats (list, optional): Output format preferences.
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
"""
try:
connectionReference = parameters.get("connectionReference")
limit = parameters.get("limit", 50)
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
outputMimeType = parameters.get("outputMimeType", "application/json")
if not connectionReference:
return ActionResult.isFailure(error="Connection reference is required")
@ -859,18 +908,15 @@ class MethodOutlook(MethodBase):
logger.error(f"Error finding drafts via Microsoft Graph API: {str(e)}")
return ActionResult.isFailure(error=f"Failed to find drafts: {str(e)}")
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
# Determine output format based on MIME type
mime_type_mapping = {
"application/json": ".json",
"text/plain": ".txt",
"text/csv": ".csv"
}
output_extension = mime_type_mapping.get(outputMimeType, ".json")
output_mime_type = outputMimeType
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
@ -930,18 +976,18 @@ class MethodOutlook(MethodBase):
"""
GENERAL:
- Purpose: Check contents of the Drafts folder.
- Input requirements: connectionReference (required); optional limit, expectedDocumentFormats.
- Input requirements: connectionReference (required); optional limit, outputMimeType.
- Output format: JSON with drafts and metadata.
Parameters:
- connectionReference (str, required): Microsoft connection label.
- limit (int, optional): Maximum items to return. Default: 20.
- expectedDocumentFormats (list, optional): Output format preferences.
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
"""
try:
connectionReference = parameters.get("connectionReference")
limit = parameters.get("limit", 20)
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
outputMimeType = parameters.get("outputMimeType", "application/json")
if not connectionReference:
return ActionResult.isFailure(error="Connection reference is required")
@ -1003,18 +1049,15 @@ class MethodOutlook(MethodBase):
logger.error(f"Error checking Drafts folder via Microsoft Graph API: {str(e)}")
return ActionResult.isFailure(error=f"Failed to check Drafts folder: {str(e)}")
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
# Determine output format based on MIME type
mime_type_mapping = {
"application/json": ".json",
"text/plain": ".txt",
"text/csv": ".csv"
}
output_extension = mime_type_mapping.get(outputMimeType, ".json")
output_mime_type = outputMimeType
logger.info(f"Using output format: {output_extension} ({output_mime_type})")

View file

@ -931,7 +931,8 @@ class MethodSharepoint(MethodBase):
return ActionResult.isFailure(error="pathQuery must start with '/' and include site name with syntax /site:<Site Display Name>/... e.g. /site:KM LayerFinance/Documents/Work")
# Check if pathQuery contains search terms (words without proper path structure)
if not pathQuery.startswith('/site:') and not pathQuery.startswith('/Documents') and not pathQuery.startswith('/Shared Documents'):
valid_path_prefixes = ['/site:', '/Documents', '/documents', '/Shared Documents', '/shared documents']
if not any(pathQuery.startswith(prefix) for prefix in valid_path_prefixes):
return ActionResult.isFailure(error=f"Invalid pathQuery '{pathQuery}'. This appears to be search terms, not a valid SharePoint path. Use findDocumentPath action first to search for folders, then use the returned folder path as pathQuery.")
# For pathQuery, we need to discover sites to find the specific one
@ -1627,7 +1628,8 @@ class MethodSharepoint(MethodBase):
return ActionResult.isFailure(error="pathQuery must start with '/' and include site name with syntax /site:<Site Display Name>/... e.g. /site:KM LayerFinance/Documents/Work")
# Check if pathQuery contains search terms (words without proper path structure)
if not pathQuery.startswith('/site:') and not pathQuery.startswith('/Documents') and not pathQuery.startswith('/Shared Documents'):
valid_path_prefixes = ['/site:', '/Documents', '/documents', '/Shared Documents', '/shared documents']
if not any(pathQuery.startswith(prefix) for prefix in valid_path_prefixes):
return ActionResult.isFailure(error=f"Invalid pathQuery '{pathQuery}'. This appears to be search terms, not a valid SharePoint path. Use findDocumentPath action first to search for folders, then use the returned folder path as pathQuery.")
# For pathQuery, we need to discover sites to find the specific one

View file

# adaptive module for React mode
# Provides adaptive learning capabilities
#
# Fix: a merge artifact left both the old and the new version of the import
# and __all__ lines in place. The old import pulled in DataType and
# ExpectedFormat, which were removed from intentAnalyzer; only the new lines
# are kept.
from .intentAnalyzer import IntentAnalyzer
from .contentValidator import ContentValidator
from .learningEngine import LearningEngine
from .progressTracker import ProgressTracker

# Public API of the adaptive package.
__all__ = ['IntentAnalyzer', 'ContentValidator', 'LearningEngine', 'ProgressTracker']

View file

@ -1,8 +1,9 @@
# contentValidator.py
# Content validation for adaptive React mode
import re
import logging
import json
import re
from typing import List, Dict, Any
logger = logging.getLogger(__name__)
@ -10,34 +11,14 @@ logger = logging.getLogger(__name__)
class ContentValidator:
"""Validates delivered content against user intent"""
def __init__(self):
pass
def __init__(self, services=None):
self.services = services
def validateContent(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
"""Validates delivered content against user intent"""
async def validateContent(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
"""Validates delivered content against user intent using AI"""
try:
validationDetails = []
for doc in documents:
content = self._extractContent(doc)
detail = self._validateSingleDocument(content, doc, intent)
validationDetails.append(detail)
# Calculate overall success
overallSuccess = all(detail.get("successCriteriaMet", [False]) for detail in validationDetails)
# Calculate quality score
qualityScore = self._calculateQualityScore(validationDetails)
# Generate improvement suggestions
improvementSuggestions = self._generateImprovementSuggestions(validationDetails, intent)
return {
"overallSuccess": overallSuccess,
"qualityScore": qualityScore,
"validationDetails": validationDetails,
"improvementSuggestions": improvementSuggestions
}
# Use AI for comprehensive validation
return await self._validateWithAI(documents, intent)
except Exception as e:
logger.error(f"Error validating content: {str(e)}")
@ -56,253 +37,236 @@ class ContentValidator:
except Exception:
return ""
def _validateSingleDocument(self, content: str, doc: Any, intent: Dict[str, Any]) -> Dict[str, Any]:
    """Run every per-document check against *intent* and bundle the results.

    Returns a dict with the document name, boolean type/format matches, a
    quality score, the per-criterion success flags, detected issues, and
    document-level improvement suggestions.
    """
    expected_type = intent.get("dataType", "unknown")
    expected_format = intent.get("expectedFormat", "unknown")
    return {
        "documentName": getattr(doc, 'documentName', 'Unknown'),
        "dataTypeMatch": self._checkDataTypeMatch(content, expected_type),
        "formatMatch": self._checkFormatMatch(content, expected_format),
        "qualityScore": self._calculateDocumentQualityScore(content, intent),
        "successCriteriaMet": self._checkSuccessCriteria(content, intent),
        "specificIssues": self._identifySpecificIssues(content, intent),
        "improvementSuggestions": self._generateDocumentImprovementSuggestions(content, intent),
    }
def _checkDataTypeMatch(self, content: str, dataType: str) -> bool:
    """Return True when *content* looks like the expected data type.

    Dispatches to the matching _contains* helper; unknown data types are
    treated permissively (assumed to match).
    """
    checkers = {
        "numbers": self._containsNumbers,
        "text": self._containsText,
        "documents": self._containsDocumentContent,
        "analysis": self._containsAnalysis,
        "code": self._containsCode,
    }
    checker = checkers.get(dataType)
    return checker(content) if checker is not None else True
def _containsNumbers(self, content: str) -> bool:
"""Checks if content contains actual numbers (not code)"""
# Look for actual numbers in the content
numbers = re.findall(r'\b\d+\b', content)
# Check if it's code (contains function definitions, etc.)
isCode = any(keyword in content.lower() for keyword in [
'def ', 'function', 'import ', 'class ', 'for ', 'while ', 'if ',
'return', 'print(', 'console.log', 'public ', 'private '
])
# If it's code, it doesn't contain actual numbers
if isCode:
return False
# If it has numbers and it's not code, it contains actual numbers
return len(numbers) > 0
def _containsText(self, content: str) -> bool:
"""Checks if content contains readable text"""
# Remove numbers and special characters
textContent = re.sub(r'[^\w\s]', '', content)
words = textContent.split()
# Check if there are enough words to be considered text
return len(words) > 5
def _containsDocumentContent(self, content: str) -> bool:
"""Checks if content is suitable for document creation"""
# Check for structured content
hasStructure = any(indicator in content for indicator in [
'\n', '\t', '|', '-', '*', '1.', '2.', '', ''
])
# Check for meaningful content
hasMeaningfulContent = len(content.strip()) > 50
return hasStructure and hasMeaningfulContent
def _containsAnalysis(self, content: str) -> bool:
"""Checks if content contains analysis"""
analysisIndicators = [
'analysis', 'findings', 'conclusion', 'summary', 'insights',
'trends', 'patterns', 'comparison', 'evaluation', 'assessment'
]
contentLower = content.lower()
return any(indicator in contentLower for indicator in analysisIndicators)
def _containsCode(self, content: str) -> bool:
"""Checks if content contains code"""
codeIndicators = [
'def ', 'function', 'import ', 'class ', 'for ', 'while ', 'if ',
'return', 'print(', 'console.log', 'public ', 'private ', 'void ',
'int ', 'string ', 'var ', 'let ', 'const '
]
contentLower = content.lower()
return any(indicator in contentLower for indicator in codeIndicators)
def _checkFormatMatch(self, content: str, expectedFormat: str) -> bool:
"""Checks if content matches expected format"""
if expectedFormat == "raw_data":
# Raw data should be simple, not heavily formatted
return not any(indicator in content for indicator in [
'<html>', '<div>', '<table>', '## ', '### ', '**', '__'
])
elif expectedFormat == "formatted":
# Formatted content should have structure
return any(indicator in content for indicator in [
'\n', '\t', '|', '-', '*', '1.', '2.', ''
])
elif expectedFormat == "structured":
# Structured content should have clear organization
return any(indicator in content for indicator in [
'{', '}', '[', ']', '|', '\t', ' '
])
else:
return True # Unknown format, assume match
def _checkSuccessCriteria(self, content: str, intent: Dict[str, Any]) -> List[bool]:
"""Checks if content meets success criteria"""
criteriaMet = []
successCriteria = intent.get("successCriteria", [])
for criterion in successCriteria:
if 'prime numbers' in criterion.lower():
# Check if content contains actual prime numbers, not code
hasNumbers = bool(re.search(r'\b\d+\b', content))
isNotCode = not any(keyword in content.lower() for keyword in [
'def ', 'function', 'import ', 'class '
])
criteriaMet.append(hasNumbers and isNotCode)
elif 'document' in criterion.lower():
# Check if content is suitable for document creation
hasStructure = any(indicator in content for indicator in [
'\n', '\t', '|', '-', '*', '1.', '2.'
])
criteriaMet.append(hasStructure)
elif 'format' in criterion.lower():
# Check if content is properly formatted
hasFormatting = any(indicator in content for indicator in [
'\n', '\t', '|', '-', '*', '1.', '2.', ''
])
criteriaMet.append(hasFormatting)
else:
# Generic check - content should not be empty
criteriaMet.append(len(content.strip()) > 0)
return criteriaMet
def _calculateDocumentQualityScore(self, content: str, intent: Dict[str, Any]) -> float:
    """Score a single document in [0, 1].

    Weights: 0.2 for non-empty content, 0.3 for a data-type match, 0.2 for a
    format match, and up to 0.3 proportional to the fraction of success
    criteria met.
    """
    score = 0.2 if content.strip() else 0.0  # base credit for having content
    if self._checkDataTypeMatch(content, intent.get("dataType", "unknown")):
        score += 0.3
    if self._checkFormatMatch(content, intent.get("expectedFormat", "unknown")):
        score += 0.2
    criteria_met = self._checkSuccessCriteria(content, intent)
    if criteria_met:
        score += 0.3 * (sum(criteria_met) / len(criteria_met))
    return min(score, 1.0)
def _calculateQualityScore(self, validationDetails: List[Dict[str, Any]]) -> float:
"""Calculates overall quality score from validation details"""
if not validationDetails:
return 0.0
totalScore = sum(detail.get("qualityScore", 0) for detail in validationDetails)
return totalScore / len(validationDetails)
def _identifySpecificIssues(self, content: str, intent: Dict[str, Any]) -> List[str]:
    """List concrete problems found in *content* relative to *intent*."""
    issues: List[str] = []
    # Code delivered where literal numbers were requested.
    if intent.get("dataType") == "numbers" and self._containsCode(content):
        issues.append("Content contains code instead of actual numbers")
    # Markup present even though raw data was requested.
    formatting_markers = ('<html>', '## ', '**')
    if intent.get("expectedFormat") == "raw_data" and any(
            marker in content for marker in formatting_markers):
        issues.append("Content is formatted when raw data was requested")
    if not content.strip():
        issues.append("Content is empty")
    return issues
def _generateDocumentImprovementSuggestions(self, content: str, intent: Dict[str, Any]) -> List[str]:
    """Suggest fixes for a single document based on its mismatch with *intent*."""
    suggestions: List[str] = []
    if intent.get("dataType", "unknown") == "numbers" and self._containsCode(content):
        suggestions.append("Deliver actual numbers, not code to generate them")
    if intent.get("expectedFormat", "unknown") == "raw_data" and any(
            marker in content for marker in ('<html>', '## ')):
        suggestions.append("Provide raw data without formatting")
    if not content.strip():
        suggestions.append("Provide actual content")
    return suggestions
def _generateImprovementSuggestions(self, validationDetails: List[Dict[str, Any]],
intent: Dict[str, Any]) -> List[str]:
"""Generates improvement suggestions based on validation results"""
suggestions = []
# Check for common issues
if not any(detail.get("dataTypeMatch", False) for detail in validationDetails):
dataType = intent.get("dataType", "unknown")
suggestions.append(f"Content should contain {dataType} data, not code or other formats")
if not any(detail.get("formatMatch", False) for detail in validationDetails):
expectedFormat = intent.get("expectedFormat", "unknown")
suggestions.append(f"Content should be in {expectedFormat} format")
# Add specific suggestions from validation details
for detail in validationDetails:
suggestions.extend(detail.get("improvementSuggestions", []))
return list(set(suggestions)) # Remove duplicates
def _createFailedValidationResult(self, error: str) -> Dict[str, Any]:
"""Creates a failed validation result"""
return {
"overallSuccess": False,
"qualityScore": 0.0,
"validationDetails": [],
"improvementSuggestions": [f"Validation failed: {error}"]
"improvementSuggestions": [f"NEXT STEP: Fix validation error - {error}. Check system logs for more details and retry the operation."]
}
def _isValidJsonResponse(self, response: str) -> bool:
"""Checks if response contains valid JSON structure"""
try:
import re
# Look for JSON with expected structure
json_match = re.search(r'\{[^{}]*"overallSuccess"[^{}]*\}', response, re.DOTALL)
if json_match:
json.loads(json_match.group(0))
return True
return False
except:
return False
def _extractFallbackValidationResult(self, response: str) -> Dict[str, Any]:
    """Extracts validation result from malformed AI response.

    Best-effort salvage path used when the AI did not return clean JSON:
    individual fields are pulled out with regexes and missing ones fall back
    to conservative defaults. Returns None when even that fails.
    """
    try:
        import re
        # Extract key values using regex patterns
        overall_success = re.search(r'"overallSuccess"\s*:\s*(true|false)', response, re.IGNORECASE)
        quality_score = re.search(r'"qualityScore"\s*:\s*([0-9.]+)', response)
        gap_analysis = re.search(r'"gapAnalysis"\s*:\s*"([^"]*)"', response)
        # Determine overall success from context if not found.
        # NOTE: overall_success is rebound from a Match object (or None) to a
        # plain bool here; the isinstance check in the return handles both.
        if not overall_success:
            # Look for positive/negative indicators in the text
            if any(word in response.lower() for word in ['success', 'complete', 'fulfilled', 'satisfied']):
                overall_success = True
            elif any(word in response.lower() for word in ['failed', 'incomplete', 'missing', 'error']):
                overall_success = False
            else:
                overall_success = False
        return {
            # Either the contextual bool from above, or the captured
            # "true"/"false" literal from the regex match.
            "overallSuccess": overall_success if isinstance(overall_success, bool) else (overall_success.group(1).lower() == 'true' if overall_success else False),
            # 0.5 is the neutral default when no score could be parsed.
            "qualityScore": float(quality_score.group(1)) if quality_score else 0.5,
            "validationDetails": [{
                "documentName": "AI Validation (Fallback)",
                "gapAnalysis": gap_analysis.group(1) if gap_analysis else "Unable to parse detailed analysis",
                "successCriteriaMet": [False]  # Conservative fallback
            }],
            "improvementSuggestions": ["NEXT STEP: AI response was malformed - retry the operation for better results"]
        }
    except Exception as e:
        # Salvage failed entirely; caller treats None as "no fallback available".
        logger.error(f"Fallback extraction failed: {str(e)}")
        return None
async def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
"""AI-based comprehensive validation - single main function"""
try:
if not hasattr(self, 'services') or not self.services or not hasattr(self.services, 'ai'):
return self._createFailedValidationResult("AI service not available")
# Extract content from all documents
documentContents = []
for doc in documents:
content = self._extractContent(doc)
documentContents.append({
"name": getattr(doc, 'documentName', 'Unknown'),
"content": content[:2000] # Limit content for AI processing
})
# Create comprehensive AI validation prompt
validationPrompt = f"""
You are a comprehensive task completion validator. Analyze if the delivered content fulfills the user's request.
USER REQUEST: {intent.get('primaryGoal', 'Unknown')}
EXPECTED DATA TYPE: {intent.get('dataType', 'unknown')}
EXPECTED FORMAT: {intent.get('expectedFormat', 'unknown')}
SUCCESS CRITERIA: {intent.get('successCriteria', [])}
DELIVERED CONTENT:
{json.dumps(documentContents, indent=2)}
Perform comprehensive validation:
1. Check if content matches expected data type
2. Check if content matches expected format
3. Verify success criteria are met
4. Assess overall quality and completeness
5. Identify specific gaps and issues
6. Provide actionable next steps
CRITICAL: You MUST respond with ONLY the JSON object below. NO TEXT ANALYSIS. NO EXPLANATIONS. NO OTHER CONTENT.
RESPOND WITH THIS EXACT JSON FORMAT:
{{
"overallSuccess": false,
"qualityScore": 0.5,
"dataTypeMatch": false,
"formatMatch": false,
"successCriteriaMet": [false, false],
"gapAnalysis": "Content does not match expected format and lacks required elements",
"improvementSuggestions": ["NEXT STEP: Create proper content in expected format", "NEXT STEP: Ensure all success criteria are met"],
"validationDetails": [
{{
"documentName": "Content Validation",
"issues": ["Format mismatch", "Missing required elements"],
"suggestions": ["NEXT STEP: Fix format", "NEXT STEP: Add missing elements"]
}}
]
}}
"""
# Call AI service for validation
from modules.datamodels.datamodelAi import AiCallOptions, OperationType
request_options = AiCallOptions()
request_options.operationType = OperationType.GENERAL
response = await self.services.ai.callAi(
prompt=validationPrompt,
documents=None,
options=request_options
)
# If first attempt fails, try with more explicit prompt
if response and not self._isValidJsonResponse(response):
logger.debug("First AI validation attempt failed, retrying with explicit JSON-only prompt")
explicitPrompt = f"""
VALIDATE AND RETURN JSON ONLY - NO TEXT ANALYSIS
Request: {intent.get('primaryGoal', 'Unknown')}
Data Type: {intent.get('dataType', 'unknown')}
Format: {intent.get('expectedFormat', 'unknown')}
Criteria: {intent.get('successCriteria', [])}
Content: {json.dumps(documentContents, indent=2)}
RESPOND WITH THIS EXACT JSON FORMAT - NO OTHER TEXT:
{{
"overallSuccess": false,
"qualityScore": 0.3,
"dataTypeMatch": false,
"formatMatch": false,
"successCriteriaMet": [false, false],
"gapAnalysis": "Content does not match expected format and lacks required elements",
"improvementSuggestions": ["NEXT STEP: Create proper content in expected format", "NEXT STEP: Ensure all success criteria are met"],
"validationDetails": [
{{
"documentName": "Content Validation",
"issues": ["Format mismatch", "Missing required elements"],
"suggestions": ["NEXT STEP: Fix format", "NEXT STEP: Add missing elements"]
}}
]
}}
"""
response = await self.services.ai.callAi(
prompt=explicitPrompt,
documents=None,
options=request_options
)
if not response or not response.strip():
logger.warning("AI validation returned empty response")
return self._createFailedValidationResult("AI validation failed - empty response")
# Clean and extract JSON from response
result = response.strip()
logger.debug(f"AI validation response length: {len(result)}")
# Try to find JSON in the response with multiple strategies
import re
# Strategy 1: Look for JSON in markdown code blocks
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', result, re.DOTALL)
if json_match:
result = json_match.group(1)
logger.debug(f"Extracted JSON from markdown code block: {result[:200]}...")
else:
# Strategy 2: Look for JSON object with proper structure
json_match = re.search(r'\{[^{}]*"overallSuccess"[^{}]*\}', result, re.DOTALL)
if not json_match:
# Strategy 3: Look for any JSON object
json_match = re.search(r'\{.*\}', result, re.DOTALL)
if json_match:
result = json_match.group(0)
logger.debug(f"Extracted JSON directly: {result[:200]}...")
else:
logger.debug(f"No JSON found in AI response, trying fallback extraction: {result[:200]}...")
logger.debug(f"Full AI response: {result}")
# Try fallback extraction for text responses
fallback_result = self._extractFallbackValidationResult(result)
if fallback_result:
logger.info("Using fallback text extraction for validation")
return fallback_result
logger.warning("All AI validation attempts failed - no JSON found and fallback extraction failed")
return self._createFailedValidationResult("AI validation failed - no JSON in response")
try:
aiResult = json.loads(result)
logger.info("AI validation JSON parsed successfully")
return {
"overallSuccess": aiResult.get("overallSuccess", False),
"qualityScore": aiResult.get("qualityScore", 0.0),
"validationDetails": aiResult.get("validationDetails", [{
"documentName": "AI Validation",
"gapAnalysis": aiResult.get("gapAnalysis", ""),
"successCriteriaMet": aiResult.get("successCriteriaMet", [False])
}]),
"improvementSuggestions": aiResult.get("improvementSuggestions", [])
}
except json.JSONDecodeError as json_error:
logger.warning(f"All AI validation attempts failed - invalid JSON: {str(json_error)}")
logger.debug(f"JSON content: {result}")
# Try to extract key information from malformed response
fallbackResult = self._extractFallbackValidationResult(result)
if fallbackResult:
logger.info("Using fallback validation result from malformed JSON")
return fallbackResult
return self._createFailedValidationResult(f"AI validation failed - invalid JSON: {str(json_error)}")
return self._createFailedValidationResult("AI validation failed - no response")
except Exception as e:
logger.error(f"AI validation failed: {str(e)}")
return self._createFailedValidationResult(f"AI validation error: {str(e)}")

View file

@ -1,228 +1,156 @@
# intentAnalyzer.py
# Intent analysis for adaptive React mode
# Intent analysis for adaptive React mode - AI-based, language-agnostic
import re
import json
import logging
from typing import Dict, Any, List
from enum import Enum
logger = logging.getLogger(__name__)
class DataType(Enum):
    """Kind of deliverable a user request asks for.

    Used by IntentAnalyzer to label the expected output; UNKNOWN marks
    prompts that match no known category.
    """
    NUMBERS = "numbers"      # numeric output (counts, lists, sequences)
    TEXT = "text"            # free-form textual content
    DOCUMENTS = "documents"  # generated files (reports, presentations, ...)
    ANALYSIS = "analysis"    # analytical findings / insights
    CODE = "code"            # program code or scripts
    UNKNOWN = "unknown"      # no category could be determined
class ExpectedFormat(Enum):
    """Presentation format the user expects for the result.

    UNKNOWN marks prompts where no format preference was detected.
    """
    RAW_DATA = "raw_data"      # plain, unformatted values
    FORMATTED = "formatted"    # human-readable presentation (tables, charts)
    STRUCTURED = "structured"  # machine-readable structure (JSON, CSV, ...)
    VISUAL = "visual"          # visual output
    UNKNOWN = "unknown"        # no format preference detected
class IntentAnalyzer:
"""Analyzes user intent to understand what they actually want"""
"""Analyzes user intent using AI - language-agnostic and generic"""
def __init__(self):
self.dataTypePatterns = {
DataType.NUMBERS: [
r'\b(numbers?|digits?|count|list|sequence)\b',
r'\b(prime|fibonacci|random|even|odd)\s+(numbers?)\b',
r'\b(calculate|compute|generate)\s+(numbers?)\b',
r'\b(first|last)\s+\d+\s+(numbers?)\b'
],
DataType.TEXT: [
r'\b(text|content|words?|sentences?|paragraphs?)\b',
r'\b(write|create|generate)\s+(text|content)\b',
r'\b(summary|description|explanation)\b',
r'\b(article|essay|report)\b'
],
DataType.DOCUMENTS: [
r'\b(document|file|report|pdf|word|excel)\b',
r'\b(create|generate|make)\s+(document|file|report)\b',
r'\b(format|structure|organize)\s+(document)\b',
r'\b(presentation|slides?)\b'
],
DataType.ANALYSIS: [
r'\b(analyze|analysis|examine|study|evaluate)\b',
r'\b(insights?|findings?|results?)\b',
r'\b(compare|contrast|evaluate)\b',
r'\b(trends?|patterns?)\b'
],
DataType.CODE: [
r'\b(code|program|script|algorithm|function)\b',
r'\b(write|create|develop)\s+(code|program|script)\b',
r'\b(implement|build|construct)\b',
r'\b(debug|fix|optimize)\s+(code)\b'
]
}
def __init__(self, services=None):
self.services = services
self.formatPatterns = {
ExpectedFormat.RAW_DATA: [
r'\b(raw|plain|simple|basic)\b',
r'\b(numbers?|data|list)\b(?!\s+(in|as|with))',
r'\b(just|only)\s+(numbers?|data)\b'
],
ExpectedFormat.FORMATTED: [
r'\b(formatted|structured|organized|presented)\b',
r'\b(table|chart|graph|visual)\b',
r'\b(pretty|nice|clean)\s+(format|presentation)\b',
r'\b(professional|polished)\b'
],
ExpectedFormat.STRUCTURED: [
r'\b(json|xml|csv|structured)\b',
r'\b(organized|categorized|grouped)\b',
r'\b(systematic|methodical)\b',
r'\b(database|spreadsheet)\b'
]
}
def analyzeUserIntent(self, userPrompt: str, context: Any) -> Dict[str, Any]:
"""Analyzes user intent from prompt and context"""
async def analyzeUserIntent(self, userPrompt: str, context: Any) -> Dict[str, Any]:
"""Analyzes user intent from prompt and context using AI"""
try:
# Extract primary goal
primaryGoal = self._extractPrimaryGoal(userPrompt)
# Use AI to analyze intent
aiAnalysis = await self._analyzeIntentWithAI(userPrompt, context)
if aiAnalysis:
return aiAnalysis
# Classify data type
dataType = self._classifyDataType(userPrompt)
# Determine expected format
expectedFormat = self._determineExpectedFormat(userPrompt)
# Assess quality requirements
qualityRequirements = self._assessQualityRequirements(userPrompt, context)
# Extract success criteria
successCriteria = self._extractSuccessCriteria(userPrompt, context)
# Calculate confidence score
confidenceScore = self._calculateConfidenceScore(dataType, expectedFormat, successCriteria)
return {
"primaryGoal": primaryGoal,
"dataType": dataType.value,
"expectedFormat": expectedFormat.value,
"qualityRequirements": qualityRequirements,
"successCriteria": successCriteria,
"confidenceScore": confidenceScore
}
# Fallback to basic analysis if AI fails
return self._createBasicIntentAnalysis(userPrompt)
except Exception as e:
logger.error(f"Error analyzing user intent: {str(e)}")
return self._createDefaultIntentAnalysis(userPrompt)
def _extractPrimaryGoal(self, userPrompt: str) -> str:
"""Extracts the primary goal from user prompt"""
# Simple extraction - can be enhanced
return userPrompt.strip()
async def _analyzeIntentWithAI(self, userPrompt: str, context: Any) -> Dict[str, Any]:
"""Uses AI to analyze user intent - language-agnostic"""
try:
if not self.services or not hasattr(self.services, 'ai'):
return None
def _classifyDataType(self, userPrompt: str) -> DataType:
"""Classifies the type of data the user wants"""
promptLower = userPrompt.lower()
# Create AI analysis prompt
analysisPrompt = f"""
You are an intent analyzer. Analyze the user's request to understand what they want delivered.
for dataType, patterns in self.dataTypePatterns.items():
for pattern in patterns:
if re.search(pattern, promptLower):
return dataType
USER REQUEST: {userPrompt}
return DataType.UNKNOWN
CONTEXT: {getattr(context.task_step, 'objective', '') if hasattr(context, 'task_step') and context.task_step else ''}
def _determineExpectedFormat(self, userPrompt: str) -> ExpectedFormat:
"""Determines the expected format of the output"""
promptLower = userPrompt.lower()
Analyze the user's intent and determine:
1. What type of data/content they want (numbers, text, documents, analysis, code, etc.)
2. What format they expect (raw data, formatted, structured, visual, etc.)
3. What quality requirements they have (accuracy, completeness, format)
4. What specific success criteria define completion
for formatType, patterns in self.formatPatterns.items():
for pattern in patterns:
if re.search(pattern, promptLower):
return formatType
CRITICAL: Respond with ONLY the JSON object below. Do not include any explanatory text, analysis, or other content before or after the JSON.
return ExpectedFormat.UNKNOWN
{{
"primaryGoal": "The main objective the user wants to achieve",
"dataType": "numbers|text|documents|analysis|code|unknown",
"expectedFormat": "raw_data|formatted|structured|visual|unknown",
"qualityRequirements": {{
"accuracyThreshold": 0.0-1.0,
"completenessThreshold": 0.0-1.0,
"formatRequirement": "any|formatted|raw|structured"
}},
"successCriteria": ["specific criterion 1", "specific criterion 2"],
"confidenceScore": 0.0-1.0
}}
"""
def _assessQualityRequirements(self, userPrompt: str, context: Any) -> Dict[str, Any]:
"""Assesses quality requirements from prompt and context"""
promptLower = userPrompt.lower()
# Call AI service for analysis
from modules.datamodels.datamodelAi import AiCallOptions, OperationType
request_options = AiCallOptions()
request_options.operationType = OperationType.GENERAL
# Check for accuracy requirements
accuracyThreshold = 0.8
if any(word in promptLower for word in ['exact', 'precise', 'accurate', 'correct']):
accuracyThreshold = 0.95
elif any(word in promptLower for word in ['approximate', 'rough', 'estimate']):
accuracyThreshold = 0.7
response = await self.services.ai.callAi(
prompt=analysisPrompt,
documents=None,
options=request_options
)
# Check for completeness requirements
completenessThreshold = 0.8
if any(word in promptLower for word in ['complete', 'full', 'comprehensive', 'all']):
completenessThreshold = 0.95
elif any(word in promptLower for word in ['summary', 'brief', 'overview']):
completenessThreshold = 0.6
# If first attempt fails, try with more explicit prompt
if response and not self._isValidJsonResponse(response):
logger.debug("First AI intent analysis attempt failed, retrying with explicit JSON-only prompt")
explicitPrompt = f"""
{analysisPrompt}
# Check for format requirements
formatRequirement = "any"
if any(word in promptLower for word in ['formatted', 'structured', 'organized']):
formatRequirement = "formatted"
elif any(word in promptLower for word in ['raw', 'plain', 'simple']):
formatRequirement = "raw"
IMPORTANT: You must respond with ONLY valid JSON. No explanations, no analysis, no text before or after. Just the JSON object.
"""
response = await self.services.ai.callAi(
prompt=explicitPrompt,
documents=None,
options=request_options
)
if not response or not response.strip():
logger.warning("AI intent analysis returned empty response")
return None
# Clean and extract JSON from response
result = response.strip()
logger.debug(f"AI intent analysis response length: {len(result)}")
# Try to find JSON in the response with multiple strategies
import re
# Strategy 1: Look for JSON in markdown code blocks
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', result, re.DOTALL)
if json_match:
result = json_match.group(1)
logger.debug(f"Extracted JSON from markdown code block: {result[:200]}...")
else:
# Strategy 2: Look for JSON object with proper structure
json_match = re.search(r'\{[^{}]*"primaryGoal"[^{}]*\}', result, re.DOTALL)
if not json_match:
# Strategy 3: Look for any JSON object
json_match = re.search(r'\{.*\}', result, re.DOTALL)
if not json_match:
logger.warning(f"All AI intent analysis attempts failed - no JSON found in response: {result[:200]}...")
logger.debug(f"Full AI response: {result}")
return None
result = json_match.group(0)
logger.debug(f"Extracted JSON directly: {result[:200]}...")
try:
aiResult = json.loads(result)
logger.info("AI intent analysis JSON parsed successfully")
return aiResult
except json.JSONDecodeError as json_error:
logger.warning(f"All AI intent analysis attempts failed - invalid JSON: {str(json_error)}")
logger.debug(f"JSON content: {result}")
return None
return None
except Exception as e:
logger.error(f"AI intent analysis failed: {str(e)}")
return None
def _createBasicIntentAnalysis(self, userPrompt: str) -> Dict[str, Any]:
"""Creates basic intent analysis without AI"""
return {
"accuracyThreshold": accuracyThreshold,
"completenessThreshold": completenessThreshold,
"formatRequirement": formatRequirement
"primaryGoal": userPrompt.strip(),
"dataType": "unknown",
"expectedFormat": "unknown",
"qualityRequirements": {
"accuracyThreshold": 0.8,
"completenessThreshold": 0.8,
"formatRequirement": "any"
},
"successCriteria": ["Delivers what the user requested"],
"confidenceScore": 0.5
}
def _extractSuccessCriteria(self, userPrompt: str, context: Any) -> List[str]:
"""Extracts success criteria from prompt and context"""
criteria = []
promptLower = userPrompt.lower()
# Extract explicit criteria
if 'first' in promptLower and 'numbers' in promptLower:
criteria.append("Contains the first N numbers as requested")
if 'prime' in promptLower:
criteria.append("Contains actual prime numbers, not code to generate them")
if 'document' in promptLower:
criteria.append("Creates a properly formatted document")
if 'format' in promptLower:
criteria.append("Content is properly formatted as requested")
# Add context-based criteria
if hasattr(context, 'task_step') and context.task_step:
taskObjective = context.task_step.objective.lower()
if 'word' in taskObjective:
criteria.append("Creates a Word document")
if 'excel' in taskObjective:
criteria.append("Creates an Excel spreadsheet")
return criteria if criteria else ["Delivers what the user requested"]
def _calculateConfidenceScore(self, dataType: DataType, expectedFormat: ExpectedFormat,
successCriteria: List[str]) -> float:
"""Calculates confidence score for the intent analysis"""
score = 0.0
# Data type confidence
if dataType != DataType.UNKNOWN:
score += 0.3
# Format confidence
if expectedFormat != ExpectedFormat.UNKNOWN:
score += 0.2
# Success criteria confidence
if len(successCriteria) > 0:
score += 0.3
# Additional confidence for specific patterns
if len(successCriteria) > 1:
score += 0.2
return min(score, 1.0)
def _createDefaultIntentAnalysis(self, userPrompt: str) -> Dict[str, Any]:
"""Creates a default intent analysis when analysis fails"""
return {
@ -237,3 +165,16 @@ class IntentAnalyzer:
"successCriteria": ["Delivers what the user requested"],
"confidenceScore": 0.1
}
def _isValidJsonResponse(self, response: str) -> bool:
"""Checks if response contains valid JSON structure"""
try:
import re
# Look for JSON with expected structure
json_match = re.search(r'\{[^{}]*"primaryGoal"[^{}]*\}', response, re.DOTALL)
if json_match:
json.loads(json_match.group(0))
return True
return False
except:
return False

View file

@ -31,8 +31,8 @@ class ReactMode(BaseMode):
def __init__(self, services, workflow):
super().__init__(services, workflow)
# Initialize adaptive components
self.intentAnalyzer = IntentAnalyzer()
self.contentValidator = ContentValidator()
self.intentAnalyzer = IntentAnalyzer(services)
self.contentValidator = ContentValidator(services)
self.learningEngine = LearningEngine()
self.progressTracker = ProgressTracker()
self.currentIntent = None
@ -49,13 +49,14 @@ class ReactMode(BaseMode):
"""Execute task using React mode - iterative plan-act-observe-refine loop"""
logger.info(f"=== STARTING TASK {taskIndex or '?'}: {taskStep.objective} ===")
# NEW: Analyze user intent with both original prompt and task objective
# Get original user prompt from services (clean and reliable)
# NEW: Analyze intents separately for proper validation vs task completion
# Workflow-level intent from cleaned original user prompt
original_prompt = self.services.currentUserPrompt if self.services and hasattr(self.services, 'currentUserPrompt') else taskStep.objective
combined_context = f"Original request: {original_prompt}\n\nCurrent task: {taskStep.objective}"
self.currentIntent = self.intentAnalyzer.analyzeUserIntent(combined_context, context)
logger.info(f"Intent analysis (original + task): {self.currentIntent}")
self.workflowIntent = await self.intentAnalyzer.analyzeUserIntent(original_prompt, context)
# Task-level intent from current task objective (used only for task-scoped checks)
self.taskIntent = await self.intentAnalyzer.analyzeUserIntent(taskStep.objective, context)
logger.info(f"Intent analysis — workflow: {self.workflowIntent}")
logger.info(f"Intent analysis — task: {self.taskIntent}")
# NEW: Reset progress tracking for new task
self.progressTracker.reset()
@ -99,18 +100,18 @@ class ReactMode(BaseMode):
# Attach deterministic label for clarity
observation['resultLabel'] = result.resultLabel
# NEW: Add content validation
if self.currentIntent and result.documents:
validationResult = self.contentValidator.validateContent(result.documents, self.currentIntent)
# NEW: Add content validation (against original cleaned user prompt / workflow intent)
if getattr(self, 'workflowIntent', None) and result.documents:
validationResult = await self.contentValidator.validateContent(result.documents, self.workflowIntent)
observation['contentValidation'] = validationResult
logger.info(f"Content validation: {validationResult['overallSuccess']} (quality: {validationResult['qualityScore']:.2f})")
# NEW: Learn from feedback
feedback = self._collectFeedback(result, validationResult, self.currentIntent)
self.learningEngine.learnFromFeedback(feedback, context, self.currentIntent)
feedback = self._collectFeedback(result, validationResult, self.workflowIntent)
self.learningEngine.learnFromFeedback(feedback, context, self.workflowIntent)
# NEW: Update progress
self.progressTracker.updateProgress(result, validationResult, self.currentIntent)
self.progressTracker.updateProgress(result, validationResult, self.workflowIntent)
decision = await self._refineDecide(context, observation)
@ -204,6 +205,11 @@ class ReactMode(BaseMode):
selection = json.loads(response[jsonStart:jsonEnd])
if 'action' not in selection or not isinstance(selection['action'], str):
raise ValueError("Selection missing 'action' as string")
# Validate document references - prevent AI from inventing Message IDs
if 'requiredInputDocuments' in selection:
self._validateDocumentReferences(selection['requiredInputDocuments'], context)
# Enforce spec: Stage 1 must NOT include 'parameters'
if 'parameters' in selection:
# Remove to avoid accidental carryover
@ -213,6 +219,38 @@ class ReactMode(BaseMode):
selection['parameters'] = None
return selection
def _validateDocumentReferences(self, document_refs: List[str], context: TaskContext) -> None:
"""Validate that document references exist in the current workflow"""
if not document_refs:
return
# Get available documents from the current workflow
try:
available_docs = self.services.workflow.getAvailableDocuments(self.services.currentWorkflow)
if not available_docs or available_docs == "No documents available":
logger.warning("No documents available for validation")
return
# Extract all valid references from available documents
valid_refs = []
for line in available_docs.split('\n'):
if 'docList:' in line or 'docItem:' in line:
# Extract reference from line like " - docList:msg_xxx:label" or " - docItem:xxx:filename with spaces"
ref_match = re.search(r'(docList:[^\s]+|docItem:[^\s]+(?:\s+[^\s]+)*)', line)
if ref_match:
valid_refs.append(ref_match.group(1))
# Check if all provided references are valid
for ref in document_refs:
if ref not in valid_refs:
logger.error(f"Invalid document reference: {ref}")
logger.error(f"Available references: {valid_refs}")
raise ValueError(f"Document reference '{ref}' not found in available documents. Use only exact references from AVAILABLE_DOCUMENTS_INDEX.")
except Exception as e:
logger.error(f"Error validating document references: {str(e)}")
raise ValueError(f"Failed to validate document references: {str(e)}")
async def _actExecute(self, context: TaskContext, selection: Dict[str, Any], taskStep: TaskStep,
workflow: ChatWorkflow, stepIndex: int) -> ActionResult:
"""Act: request minimal parameters then execute selected action"""

View file

@ -42,13 +42,24 @@ def extractUserPrompt(context: Any) -> str:
Fallback to the task_step objective.
"""
try:
# Prefer services.currentUserPrompt when accessible through context
services = getattr(context, 'services', None)
if services and getattr(services, 'currentUserPrompt', None):
return services.currentUserPrompt
except Exception:
pass
# Determine raw user prompt from services or task_step
rawPrompt = None
if services and getattr(services, 'currentUserPrompt', None):
rawPrompt = services.currentUserPrompt
elif hasattr(context, 'task_step') and context.task_step:
rawPrompt = context.task_step.objective or 'No request specified'
else:
rawPrompt = 'No request specified'
# Prefer values computed at workflow start by WorkflowManager analyzer
normalized = getattr(services, 'currentUserPromptNormalized', None) if services else None
if normalized:
return normalized
return rawPrompt
except Exception:
# Robust fallback behavior
if hasattr(context, 'task_step') and context.task_step:
return context.task_step.objective or 'No request specified'
return 'No request specified'
@ -57,19 +68,11 @@ def extractWorkflowHistory(service: Any, context: Any) -> str:
"""Extract workflow history from context. Maps to {{KEY:WORKFLOW_HISTORY}}
Reverse-chronological, enriched with message summaries and document labels.
"""
# Prefer explicit workflow on context; else fall back to services.workflow
workflow = None
try:
if hasattr(context, 'workflow') and context.workflow:
workflow = context.workflow
elif hasattr(service, 'workflow') and service.workflow:
workflow = service.workflow
except Exception:
workflow = None
if workflow:
history = getPreviousRoundContext(service, workflow)
history = getPreviousRoundContext(service, service.currentWorkflow)
return history or "No previous workflow rounds available"
except Exception as e:
logger.error(f"Error getting workflow history: {str(e)}")
return "No previous workflow rounds available"
def extractAvailableMethods(service: Any) -> str:
@ -99,7 +102,15 @@ def extractAvailableMethods(service: Any) -> str:
def extractUserLanguage(service: Any) -> str:
    """Extract user language from service. Maps to {{KEY:USER_LANGUAGE}}

    Preference order: the detected language (service.currentUserLanguage),
    then the user's profile language, then the 'en' default.
    """
    fallback = 'en'
    try:
        if not service:
            return fallback
        detected = getattr(service, 'currentUserLanguage', None)
        if detected:
            return detected
        user = service.user
        if user:
            return user.language
        return fallback
    except Exception:
        # Best-effort extraction: any missing attribute falls back to English.
        return fallback
# Normalization now happens centrally in WorkflowManager._sendFirstMessage; no AI call here.
def _computeMessageSummary(msg) -> str:
@ -371,9 +382,10 @@ def extractLatestRefinementFeedback(context: Any) -> str:
def extractAvailableDocumentsSummary(service: Any, context: Any) -> str:
"""Summary of available documents (count only)."""
try:
documents = service.workflow.getAvailableDocuments(context.workflow)
documents = service.workflow.getAvailableDocuments(service.currentWorkflow)
if documents and documents != "No documents available":
doc_count = documents.count("docList:") + documents.count("docItem:")
# Count only actual documents, not list labels
doc_count = documents.count("docItem:")
return f"{doc_count} documents available from previous tasks"
return "No documents available"
except Exception as e:
@ -383,7 +395,7 @@ def extractAvailableDocumentsSummary(service: Any, context: Any) -> str:
def extractAvailableDocumentsIndex(service: Any, context: Any) -> str:
"""Index of available documents with detailed references for parameter generation."""
try:
return service.workflow.getAvailableDocuments(context.workflow)
return service.workflow.getAvailableDocuments(service.currentWorkflow)
except Exception as e:
logger.error(f"Error getting document index: {str(e)}")
return "No documents available"

View file

@ -32,7 +32,7 @@ def generateReactPlanSelectionPrompt(services, context: Any) -> PromptBundle:
PromptPlaceholder(label="AVAILABLE_CONNECTIONS_INDEX", content=extractAvailableConnectionsIndex(services), summaryAllowed=False),
]
template = """Select exactly one action to advance the task.
template = """Select exactly one next action to advance the task incrementally.
OBJECTIVE:
{{KEY:USER_PROMPT}}
@ -52,7 +52,11 @@ AVAILABLE_DOCUMENTS_INDEX:
AVAILABLE_CONNECTIONS_INDEX:
{{KEY:AVAILABLE_CONNECTIONS_INDEX}}
REPLY: Return ONLY a JSON object with the following structure (no comments, no extra text):
REPLY: Return ONLY a JSON object with the following structure (no comments, no extra text). The chosen action MUST:
- be the next logical incremental step toward fulfilling the objective
- not attempt to complete the entire objective in one step
- if producing files, target exactly one output format for this step
- reference ONLY existing document IDs/labels from AVAILABLE_DOCUMENTS_INDEX
{{
"action": "method.action_name",
"actionObjective": "...",
@ -64,7 +68,7 @@ REPLY: Return ONLY a JSON object with the following structure (no comments, no e
EXAMPLE how to assign references from AVAILABLE_DOCUMENTS_INDEX and AVAILABLE_CONNECTIONS_INDEX:
"requiredInputDocuments": ["docList:msg_47a7a578-e8f2-4ba8-ac66-0dbff40605e0:round8_task1_action1_results","docItem:5d8b7aee-b546-4487-b6a8-835c86f7b186:AI_Generated_Document_20251006-104256.docx"],
"requiredConnection": "connection:msft:p.motsch@valueon.ch:1ae8b8e5-128b-49b8-b1cb-7c632669eeae",
"requiredConnection": "connection:msft:p.motsch@valueon.ch",
RULES:
1. Use EXACT action names from AVAILABLE_METHODS
@ -72,7 +76,11 @@ RULES:
3. parametersContext must be short and sufficient for Stage 2
4. Return ONLY JSON - no markdown, no explanations
5. For requiredInputDocuments, use ONLY exact references from AVAILABLE_DOCUMENTS_INDEX (docList:... or docItem:...)
- DO NOT invent or modify Message IDs
- DO NOT create new references
- Copy references EXACTLY as shown in AVAILABLE_DOCUMENTS_INDEX
6. For requiredConnection, use ONLY an exact label from AVAILABLE_CONNECTIONS_INDEX
7. Plan incrementally: if the overall intent needs multiple output formats (e.g., CSV and HTML), choose one format in this step and leave the other(s) for subsequent steps
"""
return PromptBundle(prompt=template, placeholders=placeholders)

View file

@ -28,6 +28,8 @@ def generateTaskPlanningPrompt(services, context: Any) -> PromptBundle:
Break down user requests into logical, executable task steps.
**IMPORTANT**: If the user asks for ONE complete business objective, create ONLY ONE task that accomplishes the entire objective. Do NOT split it into multiple micro-tasks.
## 📋 Context
### User Request
@ -46,12 +48,20 @@ Break down user requests into logical, executable task steps.
- **ONE TOPIC PER TASK** - Each task should handle one complete business objective
- **HIGH-LEVEL FOCUS** - Plan strategic outcomes, not implementation steps
- **AVOID MICRO-TASKS** - Don't create separate tasks for each small action
- **CRITICAL**: If the user asks for ONE thing (like "analyse document list and produce summary"), create ONLY ONE task that does the complete job
### Task Grouping Examples
- **Research + Analysis + Report** → ONE task: "Web research report"
- **Data Collection + Processing + Visualization** → ONE task: "Collect and present data"
- **Document splitting** (analyze + extract + create files) → ONE task: "Split document into separate files"
- **Different topics** (email + flowers) → SEPARATE tasks: "Send formal email..." + "Order flowers from Fleurop for delivery to 123 Main St, include card message"
### Common Single-Task Scenarios
- **"Split document into sections"** → ONE task: "Split document into separate files"
- **"Extract data and create report"** → ONE task: "Extract data and create report"
- **"Analyze and summarize document"** → ONE task: "Analyze and summarize document"
- **"Convert file to different format"** → ONE task: "Convert file to different format"
### Retry Handling
- **If retry request**: Analyze previous rounds to understand what failed
- **Learn from mistakes**: Improve the plan based on previous failures

View file

@ -216,23 +216,23 @@ class WorkflowManager:
# Update the message with documents in database
self.services.workflow.updateMessage(message.id, {"documents": [doc.to_dict() for doc in documents]})
# Analyze the user's input to extract intent and offload bulky context into documents
# Analyze the user's input to detect language, normalize request, extract intent, and offload bulky context into documents
try:
analyzerPrompt = (
"You are an input analyzer. Split the user's message into:\n"
"1) intent: the user's core request in one concise paragraph, normalized to the user's language.\n"
"2) contextItems: supportive data to attach as separate documents if significantly larger than the intent. "
"Include large literal data blocks, long lists/tables, code/JSON blocks, quoted transcripts, CSV fragments, or detailed specs. "
"Keep URLs in the intent unless they include large pasted content.\n\n"
"You are an input analyzer. From the user's message, perform ALL of the following in one pass:\n"
"1) detectedLanguage: detect ISO 639-1 language code (e.g., de, en).\n"
"2) normalizedRequest: full, explicit restatement of the user's request in the detected language; do NOT summarize; preserve ALL constraints and details.\n"
"3) intent: concise single-paragraph core request in the detected language for high-level routing.\n"
"4) contextItems: supportive data blocks to attach as separate documents if significantly larger than the intent (large literal content, long lists/tables, code/JSON blocks, transcripts, CSV fragments, detailed specs). Keep URLs in the intent unless they embed large pasted content.\n\n"
"Rules:\n"
"- If total content length (intent + data) is less than 10% of the model's max tokens, do not extract; "
"return an empty contextItems and keep a compact, self-contained intent.\n"
"- If content exceeds that, move bulky parts into contextItems, keeping the intent short and clear.\n"
"- Preserve critical references (URLs, filenames) in the intent.\n"
"- Normalize the intent to the detected language. If mixed-language, use the primary detected language and normalize.\n\n"
"Output JSON only (no markdown):\n"
"- If total content (intent + data) is < 10% of model max tokens, do not extract; return empty contextItems and keep intent compact and self-contained.\n"
"- If content exceeds that threshold, move bulky parts into contextItems; keep intent short and clear.\n"
"- Preserve critical references (URLs, filenames) in intent.\n"
"- Normalize to the primary detected language if mixed-language.\n\n"
"Return ONLY JSON (no markdown) with this shape:\n"
"{\n"
" \"detectedLanguage\": \"en\",\n"
" \"detectedLanguage\": \"de|en|fr|it|...\",\n"
" \"normalizedRequest\": \"Full explicit instruction in detected language\",\n"
" \"intent\": \"Concise normalized request...\",\n"
" \"contextItems\": [\n"
" {\n"
@ -249,6 +249,7 @@ class WorkflowManager:
aiResponse = await self.services.ai.callAi(prompt=analyzerPrompt)
detectedLanguage = None
normalizedRequest = None
intentText = userInput.prompt
contextItems = []
@ -260,6 +261,7 @@ class WorkflowManager:
if jsonStart != -1 and jsonEnd > jsonStart:
parsed = json.loads(aiResponse[jsonStart:jsonEnd])
detectedLanguage = parsed.get('detectedLanguage') or None
normalizedRequest = parsed.get('normalizedRequest') or None
if parsed.get('intent'):
intentText = parsed.get('intent')
contextItems = parsed.get('contextItems') or []
@ -269,7 +271,18 @@ class WorkflowManager:
# Update services state
if detectedLanguage and isinstance(detectedLanguage, str):
self._setUserLanguage(detectedLanguage)
try:
setattr(self.services, 'currentUserLanguage', detectedLanguage)
except Exception:
pass
self.services.currentUserPrompt = intentText or userInput.prompt
try:
if normalizedRequest:
setattr(self.services, 'currentUserPromptNormalized', normalizedRequest)
if contextItems is not None:
setattr(self.services, 'currentUserContextItems', contextItems)
except Exception:
pass
# Telemetry (sizes and counts)
try:
@ -329,8 +342,6 @@ class WorkflowManager:
if not message.documents:
message.documents = []
message.documents.extend(created_docs)
# Ensure label is user_context for discoverability
message.documentsLabel = context_label
self.services.workflow.updateMessage(message.id, {
"documents": [d.to_dict() for d in message.documents],
"documentsLabel": context_label

View file

@ -41,6 +41,7 @@ markdown
## Web Scraping & HTTP
beautifulsoup4==4.12.2 # Required for HTML/XML parsing
requests==2.31.0
requests-oauthlib==1.3.1 # Required for Google OAuth2Session
chardet>=5.0.0 # Required for character-set detection of web content
aiohttp>=3.8.0 # Required for SharePoint operations (async HTTP)
selenium>=4.15.0 # Required for web automation and JavaScript-heavy pages

555
test_document_processing.py Normal file
View file

@ -0,0 +1,555 @@
"""
Test script for document processing and DOCX generation.
Calls the main AI service directly to process PDF documents and generate DOCX summaries.
"""
import asyncio
import sys
import os
import logging
import base64
from datetime import datetime
from pathlib import Path
# Add the gateway module to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'modules'))
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import EnhancedAiCallOptions
from modules.services.serviceAi.mainServiceAi import AiService
from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
async def process_documents_and_generate_summary() -> bool:
    """Process documents using the main AI service with intelligent chunk integration.

    End-to-end driver: locates test documents under a ``testdata`` folder,
    monkey-patches the DB interface so the AI service can resolve file ids to
    raw bytes, issues a single ``AiService.callAi`` with all documents, then
    persists every generated artifact plus a human-readable report under
    ``test-chat/unittestoutput``.

    Returns:
        bool: True when the end-to-end run and result persistence succeeded,
        False on any setup or processing failure.
    """
    logger.info("🚀 Starting intelligent chunk integration test...")

    # Find testdata directory
    testdata_path = Path("../wiki/poweron/testdata")
    if not testdata_path.exists():
        # Try relative to current directory
        testdata_path = Path("wiki/poweron/testdata")
        if not testdata_path.exists():
            # Try relative to parent directory
            # NOTE(review): this retries the same "../" path as the first
            # attempt — presumably a different location was intended; confirm.
            testdata_path = Path("../wiki/poweron/testdata")
            if not testdata_path.exists():
                logger.error(f"❌ Testdata path not found. Tried:")
                logger.error(f" - ../wiki/poweron/testdata")
                logger.error(f" - wiki/poweron/testdata")
                logger.error(f" - ../wiki/poweron/testdata")
                logger.info("Please ensure the testdata folder exists with PDF documents")
                return False

    # Find all supported document files
    supported_extensions = [
        # Document formats
        "*.pdf", "*.docx", "*.xlsx", "*.pptx", "*.ppt",
        # Image formats
        "*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp", "*.bmp", "*.tiff",
        # Text and code files
        "*.txt", "*.md", "*.log", "*.rtf", "*.tex", "*.rst", "*.adoc", "*.org", "*.pod",
        "*.java", "*.js", "*.jsx", "*.ts", "*.tsx", "*.py", "*.rb", "*.go", "*.rs", "*.cpp", "*.c", "*.h", "*.hpp", "*.cc", "*.cxx",
        "*.cs", "*.php", "*.swift", "*.kt", "*.scala", "*.clj", "*.hs", "*.ml", "*.fs", "*.vb", "*.dart", "*.r", "*.m", "*.pl", "*.sh",
        "*.html", "*.htm", "*.css", "*.scss", "*.sass", "*.less", "*.vue", "*.svelte",
        "*.config", "*.ini", "*.cfg", "*.conf", "*.properties", "*.yaml", "*.yml", "*.toml", "*.json", "*.xml",
        "*.bat", "*.ps1", "*.psm1", "*.psd1", "*.vbs", "*.wsf", "*.cmd", "*.com",
        "*.csv", "*.tsv", "*.tab", "*.dat", "*.data",
        # NOTE(review): "*.m" and "*.r" repeat earlier entries; harmless duplicates.
        "*.man", "*.1", "*.2", "*.3", "*.4", "*.5", "*.6", "*.7", "*.8", "*.9", "*.n", "*.l", "*.m", "*.r", "*.t", "*.x", "*.y", "*.z",
        "*.diff", "*.patch", "*.gitignore", "*.dockerignore", "*.editorconfig", "*.gitattributes",
        "*.env", "*.env.local", "*.env.development", "*.env.production", "*.env.test",
        "*.lock", "*.lockb", "*.lockfile", "*.pkg-lock", "*.yarn-lock"
    ]

    document_files = []
    for ext in supported_extensions:
        document_files.extend(list(testdata_path.glob(ext)))

    logger.info(f"Found {len(document_files)} document files in testdata:")
    for doc_file in document_files:
        logger.info(f" - {doc_file.name}")

    if not document_files:
        logger.error("❌ No supported document files found in testdata folder")
        return False

    try:
        # Mock the database interface to provide our file data BEFORE creating AI service
        class TestDbInterface:
            # Maps file_id -> raw bytes so the AI service can "load" documents
            # without a real database behind it.
            def __init__(self, file_data_map):
                self.file_data_map = file_data_map

            def getFileData(self, file_id):
                logger.info(f"TestDbInterface.getFileData called with file_id: {file_id}")
                data = self.file_data_map.get(file_id)
                if data:
                    logger.info(f"✅ Found file data for {file_id}: {len(data)} bytes")
                else:
                    logger.warning(f"❌ No file data found for {file_id}")
                return data

        # Create file data mapping
        file_data_map = {}
        for i, doc_file in enumerate(document_files):
            with open(doc_file, 'rb') as f:
                file_data_map[f"test_doc_{i+1}"] = f.read()
            logger.info(f"📁 Loaded {doc_file.name} as test_doc_{i+1}: {len(file_data_map[f'test_doc_{i+1}'])} bytes")

        # Mock the database interface BEFORE creating AI service
        import modules.interfaces.interfaceDbComponentObjects as db_interface_module
        original_get_interface = db_interface_module.getInterface
        db_interface_module.getInterface = lambda: TestDbInterface(file_data_map)
        logger.info("🔧 Database interface mocked successfully")

        # Create a mock service center with utils
        class MockServiceCenter:
            def __init__(self):
                self.utils = MockUtils()

        class MockUtils:
            # Minimal stand-in for the real service-center utils: console +
            # optional file-based debug logging, and a config lookup stub.
            def debugLogToFile(self, message, label):
                logger.debug(f"[{label}] {message}")
                print(f"DEBUG [{label}]: {message}")  # Also print to console for visibility
                # Only write to debug file if debug logging is enabled (matching real implementation)
                debug_enabled = self.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
                if debug_enabled:
                    try:
                        import os
                        from datetime import datetime, UTC
                        debug_dir = self.configGet("APP_DEBUG_CHAT_WORKFLOW_DIR", "./test-chat")
                        if not os.path.isabs(debug_dir):
                            # If relative path, make it relative to the gateway directory
                            gateway_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
                            debug_dir = os.path.join(gateway_dir, debug_dir)
                        os.makedirs(debug_dir, exist_ok=True)
                        debug_file = os.path.join(debug_dir, "debug_workflow.log")
                        timestamp = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
                        debug_entry = f"[{timestamp}] [{label}] {message}\n"
                        with open(debug_file, "a", encoding="utf-8") as f:
                            f.write(debug_entry)
                    except Exception:
                        pass  # Don't fail on debug logging errors

            def configGet(self, key, default):
                # Return debug settings
                if key == "APP_DEBUG_CHAT_WORKFLOW_ENABLED":
                    return True
                elif key == "APP_DEBUG_CHAT_WORKFLOW_DIR":
                    return "./test-chat"
                return default

        mock_service_center = MockServiceCenter()

        # Initialize the main AI service - let it handle everything
        logger.info("🔧 Initializing main AI service...")
        ai_service = await AiService.create(mock_service_center)

        # Create test documents - the AI service will handle file access internally
        documents = []
        logger.info(f"📁 Found {len(document_files)} document files")
        for i, doc_file in enumerate(document_files):
            logger.info(f"📄 Processing file {i+1}/{len(document_files)}: {doc_file.name}")

            # Determine MIME type based on file extension
            mime_type = "application/octet-stream"  # default
            if doc_file.suffix.lower() == '.pdf':
                mime_type = "application/pdf"
            elif doc_file.suffix.lower() in ['.jpg', '.jpeg']:
                mime_type = "image/jpeg"
            elif doc_file.suffix.lower() == '.png':
                mime_type = "image/png"
            elif doc_file.suffix.lower() == '.gif':
                mime_type = "image/gif"
            elif doc_file.suffix.lower() == '.docx':
                mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            elif doc_file.suffix.lower() == '.xlsx':
                mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            elif doc_file.suffix.lower() == '.pptx':
                mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
            elif doc_file.suffix.lower() == '.ppt':
                mime_type = "application/vnd.ms-powerpoint"
            elif doc_file.suffix.lower() == '.html':
                mime_type = "text/html"
            elif doc_file.suffix.lower() == '.csv':
                mime_type = "text/csv"
            elif doc_file.suffix.lower() == '.json':
                mime_type = "application/json"
            elif doc_file.suffix.lower() in ['.txt', '.md']:
                mime_type = "text/plain"

            chat_doc = ChatDocument(
                fileId=f"test_doc_{i+1}",
                messageId=f"test_message_{i+1}",
                fileName=doc_file.name,
                mimeType=mime_type,
                fileSize=doc_file.stat().st_size,
                roundNumber=1,
                taskNumber=1,
                actionNumber=1,
                actionId=f"test_action_{i+1}"
            )
            documents.append(chat_doc)
            logger.info(f"✅ Created ChatDocument: {chat_doc.fileName} ({chat_doc.mimeType}) - {chat_doc.fileSize} bytes")

        logger.info(f"📄 Created {len(documents)} document objects")

        # Create enhanced AI call options for intelligent chunked processing
        ai_options = EnhancedAiCallOptions(
            operationType="general",
            enableParallelProcessing=True,
            maxConcurrentChunks=5,  # Increased for better testing
            preserveChunkMetadata=True,
            chunkSeparator="\n\n---\n\n"
        )

        # Call the main AI service directly - let it handle everything including DOCX generation
        logger.info("🤖 Calling main AI service with intelligent merging...")

        # Run a single end-to-end test to avoid the loop issue
        logger.info("🧪 Running single end-to-end test...")
        userPrompt = "Analyze the document containing mails for customer use cases. Can you create one file for each email in plain text format?"
        # userPrompt = "Can you create one file for each section in the document"
        # userPrompt = "Analyze these documents and create a fitting image for the content"
        # userPrompt = "Extract the table from file and produce 2 lists in excel. one list with all entries, one list only with entries that are yellow highlighted."
        # userPrompt = "Create a docx file containing a summary and the COMPLETE list from the pdf file, having one additional column with a 'x' marker for all items, which are yellow highlighted."
        # userPrompt = "Create a docx file containing the combined documents in french language."

        try:
            # Single AI call with DOCX generation
            ai_response = await ai_service.callAi(
                prompt=userPrompt,
                documents=documents,
                options=ai_options,
                outputFormat="txt",
                title="Kunden und Use Cases"
            )
            logger.info(f"✅ End-to-end test completed successfully")
            logger.info(f"📊 Response type: {type(ai_response)}")
            logger.info(f"📊 Response length: {len(str(ai_response))} characters")

            # Single test result
            test_results = [{
                "test_name": "End-to-End DOCX Generation",
                "success": True,
                "response_type": type(ai_response).__name__,
                "response_length": len(str(ai_response)),
                "response": ai_response
            }]
        except Exception as e:
            logger.error(f"❌ End-to-end test failed: {str(e)}")
            test_results = [{
                "test_name": "End-to-End DOCX Generation",
                "success": False,
                "error": str(e),
                "response": None
            }]

        logger.info(f"🎯 Completed 1 end-to-end test")

        # Process all test results and save outputs
        logger.info("📊 Processing test results...")
        successful_tests = [r for r in test_results if r['success']]
        failed_tests = [r for r in test_results if not r['success']]

        logger.info(f"✅ Successful tests: {len(successful_tests)}")
        logger.info(f"❌ Failed tests: {len(failed_tests)}")

        # Display test results summary
        logger.info("=" * 80)
        logger.info("END-TO-END TEST RESULTS SUMMARY")
        logger.info("=" * 80)
        for i, result in enumerate(test_results, 1):
            status = "✅ PASS" if result['success'] else "❌ FAIL"
            logger.info(f"Test {i}: {result['test_name']} - {status}")
            if result['success']:
                logger.info(f" Response Type: {result['response_type']}")
                logger.info(f" Response Length: {result['response_length']} characters")
            else:
                logger.info(f" Error: {result['error']}")
        logger.info("=" * 80)

        # Create output directory if it doesn't exist
        output_dir = Path("test-chat/unittestoutput")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save all test results and generated files
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        logger.info("💾 Saving test results and generated files...")

        try:
            for i, result in enumerate(successful_tests, 1):
                test_name = result['test_name'].replace(' ', '_').lower()
                response = result['response']
                logger.info(f"💾 Saving Test {i}: {result['test_name']}")

                # Handle different response types
                if isinstance(response, dict):
                    # Document generation response
                    if 'documents' in response and response['documents']:
                        logger.info(f"📄 Found {len(response['documents'])} documents in response")
                        for j, doc in enumerate(response['documents']):
                            doc_name = doc.get('documentName', f'{test_name}_document_{j+1}')
                            doc_data = doc.get('documentData', '')
                            doc_mime = doc.get('mimeType', 'application/octet-stream')
                            logger.info(f"📄 Document {j+1}: {doc_name}")
                            logger.info(f"📄 MIME Type: {doc_mime}")
                            logger.info(f"📄 Data length: {len(doc_data)} characters")

                            # Determine file extension with better MIME type detection
                            file_ext = '.bin'  # Default fallback
                            if doc_mime:
                                if 'docx' in doc_mime.lower() or 'wordprocessingml' in doc_mime.lower():
                                    file_ext = '.docx'
                                elif 'pdf' in doc_mime.lower():
                                    file_ext = '.pdf'
                                elif 'txt' in doc_mime.lower() or 'plain' in doc_mime.lower():
                                    file_ext = '.txt'
                                elif 'html' in doc_mime.lower():
                                    file_ext = '.html'
                                elif 'json' in doc_mime.lower():
                                    file_ext = '.json'
                                elif 'csv' in doc_mime.lower():
                                    file_ext = '.csv'
                                elif 'xlsx' in doc_mime.lower() or 'spreadsheetml' in doc_mime.lower():
                                    file_ext = '.xlsx'
                                elif 'pptx' in doc_mime.lower() or 'presentationml' in doc_mime.lower():
                                    file_ext = '.pptx'
                                elif 'markdown' in doc_mime.lower() or 'md' in doc_mime.lower():
                                    file_ext = '.md'
                                elif 'png' in doc_mime.lower() or 'image' in doc_mime.lower():
                                    file_ext = '.png'
                                elif 'jpg' in doc_mime.lower() or 'jpeg' in doc_mime.lower():
                                    file_ext = '.jpg'
                                else:
                                    logger.warning(f"⚠️ Unknown MIME type: {doc_mime}, using .bin")

                            # Also check filename for hints
                            if doc_name and '.' in doc_name:
                                name_ext = '.' + doc_name.split('.')[-1].lower()
                                if name_ext in ['.docx', '.pdf', '.txt', '.html', '.json', '.csv', '.xlsx', '.pptx', '.md', '.png', '.jpg', '.jpeg']:
                                    file_ext = name_ext
                                    logger.info(f"📄 Using extension from filename: {file_ext}")

                            logger.info(f"📄 Final file extension: {file_ext}")

                            # Save document
                            output_path = output_dir / f"{test_name}_{timestamp}{file_ext}"

                            # Handle different content types
                            if file_ext in ['.md', '.txt', '.html', '.json', '.csv']:
                                # Text-based formats - save directly as text
                                with open(output_path, 'w', encoding='utf-8') as f:
                                    f.write(doc_data)
                                logger.info(f"✅ Document saved as text: {output_path} ({len(doc_data)} characters)")
                            elif file_ext in ['.png', '.jpg', '.jpeg']:
                                # Image formats - decode from base64
                                try:
                                    doc_bytes = base64.b64decode(doc_data)
                                    with open(output_path, 'wb') as f:
                                        f.write(doc_bytes)
                                    logger.info(f"✅ Image saved: {output_path} ({len(doc_bytes)} bytes)")
                                except Exception as e:
                                    logger.warning(f"⚠️ Failed to decode image as base64: {e}")
                                    # Save as text if base64 decoding fails
                                    with open(output_path, 'w', encoding='utf-8') as f:
                                        f.write(doc_data)
                                    logger.info(f"✅ Image saved as text (fallback): {output_path}")
                            else:
                                # Other binary formats - decode from base64
                                try:
                                    doc_bytes = base64.b64decode(doc_data)
                                    with open(output_path, 'wb') as f:
                                        f.write(doc_bytes)
                                    logger.info(f"✅ Document saved as binary: {output_path} ({len(doc_bytes)} bytes)")
                                except Exception as e:
                                    logger.warning(f"⚠️ Failed to decode document as base64: {e}")
                                    # Save as text if base64 decoding fails
                                    with open(output_path, 'w', encoding='utf-8') as f:
                                        f.write(doc_data)
                                    logger.info(f"✅ Document saved as text (fallback): {output_path}")

                    # Also save raw content as text
                    content = response.get('content', '')
                    if content:
                        text_path = output_dir / f"{test_name}_content_{timestamp}.txt"
                        with open(text_path, 'w', encoding='utf-8') as f:
                            # Handle both string and dictionary content
                            if isinstance(content, dict):
                                import json
                                f.write(json.dumps(content, indent=2, ensure_ascii=False))
                            else:
                                f.write(str(content))
                        logger.info(f"✅ Content saved: {text_path}")

                elif isinstance(response, str):
                    # Text response
                    text_path = output_dir / f"{test_name}_response_{timestamp}.txt"
                    with open(text_path, 'w', encoding='utf-8') as f:
                        f.write(response)
                    logger.info(f"✅ Text response saved: {text_path}")
                else:
                    logger.warning(f"⚠️ Unknown response type for {result['test_name']}: {type(response)}")

            # Save failed test details
            if failed_tests:
                error_path = output_dir / f"failed_tests_{timestamp}.txt"
                with open(error_path, 'w', encoding='utf-8') as f:
                    f.write("# Failed Test Details\n\n")
                    for i, result in enumerate(failed_tests, 1):
                        f.write(f"## Test {i}: {result['test_name']}\n")
                        f.write(f"**Error:** {result['error']}\n\n")
                logger.info(f"✅ Failed test details saved: {error_path}")
        except Exception as e:
            logger.error(f"❌ Error saving test results: {str(e)}")
            return False

        # Save comprehensive test report
        report_path = output_dir / f"end_to_end_test_report_{timestamp}.txt"
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(f"# End-to-End AI Service Test Report\n")
            f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            f.write(f"## Test Configuration\n")
            f.write(f"- Documents processed: {len(documents)}\n")
            f.write(f"- Processing method: Intelligent Token-Aware Merging\n")
            f.write(f"- Parallel processing: {ai_options.enableParallelProcessing}\n")
            f.write(f"- Max concurrent chunks: {ai_options.maxConcurrentChunks}\n")
            f.write(f"- Chunk metadata preserved: {ai_options.preserveChunkMetadata}\n")
            f.write(f"- Chunk separator: '{ai_options.chunkSeparator}'\n\n")
            f.write(f"## Document Inventory\n")
            for i, doc in enumerate(documents, 1):
                f.write(f"{i}. **{doc.fileName}**\n")
                f.write(f" - MIME Type: {doc.mimeType}\n")
                f.write(f" - File Size: {doc.fileSize:,} bytes\n")
                f.write(f" - File ID: {doc.fileId}\n\n")
            f.write(f"## Test Results Summary\n")
            f.write(f"- Total Tests: {len(test_results)}\n")
            f.write(f"- Successful: {len(successful_tests)}\n")
            f.write(f"- Failed: {len(failed_tests)}\n")
            f.write(f"- Success Rate: {len(successful_tests)/len(test_results)*100:.1f}%\n\n")
            f.write(f"## Detailed Test Results\n")
            for i, result in enumerate(test_results, 1):
                f.write(f"### Test {i}: {result['test_name']}\n")
                f.write(f"**Status:** {'✅ PASS' if result['success'] else '❌ FAIL'}\n")
                if result['success']:
                    f.write(f"**Response Type:** {result['response_type']}\n")
                    f.write(f"**Response Length:** {result['response_length']} characters\n")
                    # Show response preview
                    response_preview = str(result['response'])[:500]
                    f.write(f"**Response Preview:**\n```\n{response_preview}...\n```\n\n")
                else:
                    f.write(f"**Error:** {result['error']}\n\n")
            f.write(f"## Technical Implementation Details\n")
            f.write(f"This test validates the complete AI service pipeline:\n\n")
            f.write(f"### Tested Components:\n")
            f.write(f"- **Document Extraction**: PDF, DOCX, images, etc.\n")
            f.write(f"- **Intelligent Chunking**: Token-aware merging\n")
            f.write(f"- **Model Selection**: Automatic AI model choice\n")
            f.write(f"- **Parallel Processing**: Concurrent chunk processing\n")
            f.write(f"- **Document Generation**: DOCX, PDF, text output\n")
            f.write(f"- **Error Handling**: Graceful failure management\n\n")
            f.write(f"### Performance Metrics:\n")
            f.write(f"- **Chunk Optimization**: Intelligent merging reduces AI calls\n")
            f.write(f"- **Processing Speed**: Parallel execution\n")
            f.write(f"- **Memory Efficiency**: Token-aware chunking\n")
            f.write(f"- **Output Quality**: Multiple format support\n\n")
            f.write(f"## Generated Files\n")
            for i, result in enumerate(successful_tests, 1):
                test_name = result['test_name'].replace(' ', '_').lower()
                f.write(f"- **Test {i}**: {result['test_name']} → `{test_name}_*_{timestamp}.*`\n")
            if failed_tests:
                f.write(f"- **Failed Tests**: `failed_tests_{timestamp}.txt`\n")
            f.write(f"- **This Report**: `end_to_end_test_report_{timestamp}.txt`\n\n")
            f.write(f"The end-to-end test successfully validates the complete AI service\n")
            f.write(f"pipeline from document input to formatted output generation.\n")

        logger.info(f"✅ Comprehensive test report saved: {report_path}")

        # Show debug file locations
        debug_files = []
        try:
            debug_dir = Path("test-chat")
            if debug_dir.exists():
                debug_files.extend(list(debug_dir.glob("*.log")))
                debug_files.extend(list(debug_dir.glob("ai/*.txt")))
            if debug_files:
                logger.info("📁 Debug files created:")
                for debug_file in debug_files:
                    logger.info(f" - {debug_file}")
            else:
                logger.info("📁 No debug files found in test-chat directory")
        except Exception as e:
            logger.warning(f"Could not list debug files: {e}")

        # Restore original database interface
        db_interface_module.getInterface = original_get_interface

        return True

    except Exception as e:
        logger.error(f"❌ Error during document processing: {str(e)}")
        import traceback
        logger.error(f"Traceback: {traceback.format_exc()}")
        # Restore original database interface in case of error
        try:
            db_interface_module.getInterface = original_get_interface
        except:
            pass
        return False
async def main():
    """Drive the intelligent chunk integration test and log a summary banner."""
    banner = "=" * 60
    logger.info("🎯 Starting Intelligent Chunk Integration Test")
    logger.info(banner)

    if await process_documents_and_generate_summary():
        # Success: report each validated aspect of the run.
        for line in (
            "🎉 Intelligent chunk integration test completed successfully!",
            "✅ Main AI service handled all processing internally",
            "✅ Intelligent token-aware merging activated",
            "✅ DOCX document generated directly by AI service",
            "✅ Detailed chunk integration analysis saved",
            "✅ Performance optimization achieved",
        ):
            logger.info(line)
    else:
        logger.error("❌ Test failed!")
        logger.error("Please check the error messages above for details")

    logger.info(banner)
if __name__ == "__main__":
    # Run the async test driver on a fresh event loop.
    asyncio.run(main())

View file

@ -0,0 +1,422 @@
#!/usr/bin/env python3
"""
Tool for encrypting all *_SECRET variables in all environment files.
This tool automatically processes all three environment files (dev, int, prod)
and encrypts any unencrypted *_SECRET variables using the appropriate encryption
keys for each environment.
Usage:
# Encrypt all secrets in all environment files
python tool_security_encrypt_all_env_files.py
# Dry run - show what would be changed without making changes
python tool_security_encrypt_all_env_files.py --dry-run
# Skip backup creation
python tool_security_encrypt_all_env_files.py --no-backup
# Process only specific environment files
python tool_security_encrypt_all_env_files.py --files env_dev.env env_prod.env
"""
import sys
import os
import argparse
import shutil
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any
# Add the modules directory to the Python path
current_dir = Path(__file__).parent
modules_dir = current_dir / 'modules'
if modules_dir.exists():
sys.path.insert(0, str(modules_dir))
else:
print(f"Error: Modules directory not found: {modules_dir}")
print(f"Make sure you're running this script from the gateway directory")
sys.exit(1)
# Import encryption functions
try:
from modules.shared.configuration import encrypt_value
except ImportError as e:
print(f"Error: Could not import encryption functions from shared.configuration: {e}")
print(f"Make sure you're running this script from the gateway directory")
print(f"Modules directory: {modules_dir}")
sys.exit(1)
def get_env_type_from_file(file_path: Path) -> str:
    """
    Read the APP_ENV_TYPE from the environment file.

    Only an exact ``APP_ENV_TYPE`` key matches; keys that merely share the
    prefix (e.g. ``APP_ENV_TYPE_OVERRIDE``) are ignored. The previous
    ``startswith`` check wrongly returned the value of any such longer key.

    Args:
        file_path: Path to the environment file

    Returns:
        str: The environment type (dev, int, prod) or 'dev' as default
    """
    if not file_path.exists():
        return 'dev'

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                # Skip blanks, comments, and lines without a key-value pair.
                if not line or line.startswith('#') or '=' not in line:
                    continue
                key, value = line.split('=', 1)
                if key.strip() == 'APP_ENV_TYPE':
                    return value.strip().lower()
    except Exception as e:
        print(f"Warning: Could not read APP_ENV_TYPE from {file_path}: {e}")

    return 'dev'
def is_any_encrypted_value(value: str) -> bool:
    """
    Check if a value has any encryption prefix (DEV_ENC:, INT_ENC:, PROD_ENC:, etc.).

    Generalized to match any ``<ENV>_ENC:`` prefix (uppercase letters followed
    by ``_ENC:``) instead of a hard-coded list, so values encrypted for a
    future environment (e.g. ``QA_ENC:``) are not encrypted a second time.

    Args:
        value: The value to check

    Returns:
        bool: True if the value has any encryption prefix, False otherwise
    """
    import re  # local import keeps this helper self-contained

    if not value or not isinstance(value, str):
        return False
    # re.match anchors at the start of the string, so this is a prefix test.
    return re.match(r'[A-Z]+_ENC:', value) is not None
def find_secret_keys_in_file(file_path: Path) -> list:
    """
    Find all *_SECRET keys in an environment file that are not encrypted.

    Handles both single-line values and multiline JSON values: a value that
    opens with '{' is collected line by line until its braces balance.

    Removed from the original: an unused ``file_env_type`` lookup (which also
    re-read the file) and an unreachable ``elif value == '{'`` branch — the
    preceding ``if`` already captures that case, since ``'{'`` is truthy,
    unencrypted, and starts with '{'.

    Args:
        file_path: Path to the environment file

    Returns:
        list: List of tuples (line_number, key, value, full_line); line_number
        is 1-based and, for multiline JSON, refers to the key's own line.
    """
    secret_keys = []

    if not file_path.exists():
        return secret_keys

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        i = 0
        while i < len(lines):
            line = lines[i].strip()

            # Skip empty lines and comments
            if not line or line.startswith('#'):
                i += 1
                continue

            # Check if line contains a key-value pair
            if '=' in line:
                key, value = line.split('=', 1)
                key = key.strip()
                value = value.strip()

                # Secret key that is not already encrypted with ANY prefix
                if key.endswith('_SECRET') and value and not is_any_encrypted_value(value):
                    if value.startswith('{'):
                        # Multiline JSON value: collect lines until braces balance
                        json_lines = [value]
                        start_line = i + 1
                        i += 1
                        brace_count = value.count('{') - value.count('}')
                        while i < len(lines) and brace_count > 0:
                            json_lines.append(lines[i].rstrip('\n'))
                            brace_count += lines[i].count('{') - lines[i].count('}')
                            i += 1
                        # Join all lines and create the full JSON value
                        full_json_value = '\n'.join(json_lines)
                        secret_keys.append((start_line, key, full_json_value, line))
                        i -= 1  # Adjust for the loop increment
                    else:
                        # Single line value
                        secret_keys.append((i + 1, key, value, line))
            i += 1
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

    return secret_keys
def backup_file(file_path: Path) -> Path:
    """
    Create a timestamped backup copy of the file before modification.

    The backup name appends '.<timestamp>.backup' to the FULL original file
    name (e.g. 'env_dev.env' -> 'env_dev.env.20250101_120000.backup').
    The previous ``with_suffix`` approach replaced the original '.env'
    suffix, losing the extension and making restores ambiguous.

    Args:
        file_path: Path to the file to backup

    Returns:
        Path: Path to the backup file
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # with_name keeps the original name (including its suffix) intact.
    backup_path = file_path.with_name(f"{file_path.name}.{timestamp}.backup")
    shutil.copy2(file_path, backup_path)
    return backup_path
def encrypt_all_secrets_in_file(file_path: Path, dry_run: bool = False, create_backup: bool = True) -> Dict[str, Any]:
    """
    Encrypt all non-encrypted secrets in a file.

    The environment type is read from the file's own APP_ENV_TYPE entry, so
    each file is encrypted with its own environment-specific key.

    Args:
        file_path: Path to the environment file
        dry_run: If True, only show what would be changed
        create_backup: If True, create a backup before modifying

    Returns:
        dict: Results of the encryption process with keys 'file', 'env_type',
        'secrets_found', 'secrets_encrypted', 'errors', 'backup_created'.
    """
    # Get the environment type from the file itself
    file_env_type = get_env_type_from_file(file_path)

    results = {
        'file': str(file_path),
        'env_type': file_env_type,
        'secrets_found': 0,
        'secrets_encrypted': 0,
        'errors': [],
        'backup_created': None
    }

    # Find all secret keys
    secret_keys = find_secret_keys_in_file(file_path)
    results['secrets_found'] = len(secret_keys)

    if not secret_keys:
        print(f" ✅ No unencrypted secrets found - all values already have encryption prefixes")
        return results

    print(f" Found {len(secret_keys)} non-encrypted secrets")

    if dry_run:
        # Report only; never touch the file in dry-run mode.
        print(" [DRY RUN] Would encrypt the following secrets:")
        for line_num, key, value, full_line in secret_keys:
            print(f" Line {line_num}: {key} = {value[:50]}{'...' if len(value) > 50 else ''}")
        return results

    # Create backup if requested
    if create_backup:
        try:
            backup_path = backup_file(file_path)
            results['backup_created'] = str(backup_path)
            print(f" 📋 Backup created: {backup_path.name}")
        except Exception as e:
            # Backup failure is recorded but does not abort encryption.
            results['errors'].append(f"Failed to create backup: {e}")
            print(f" ⚠️ Warning: Could not create backup: {e}")

    # Read the file content
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except Exception as e:
        results['errors'].append(f"Failed to read file: {e}")
        return results

    # Process each secret key
    for line_num, key, value, full_line in secret_keys:
        try:
            print(f" 🔐 Encrypting {key}...")

            # Encrypt the value using the environment type from the file
            encrypted_value = encrypt_value(value, file_env_type)

            # Replace the line in the file content (line_num is 1-based)
            new_line = f"{key} = {encrypted_value}\n"
            lines[line_num - 1] = new_line

            # If this was a multiline JSON, we need to remove the remaining lines
            if value.startswith('{') and '\n' in value:
                # Count how many lines the original JSON spanned
                json_lines = value.split('\n')
                lines_to_remove = len(json_lines) - 1  # -1 because we already replaced the first line

                # Remove the remaining lines
                # NOTE(review): continuation lines are blanked in place rather
                # than deleted, so line numbers of later secrets stay valid.
                for i in range(line_num, line_num + lines_to_remove):
                    if i < len(lines):
                        lines[i] = ""

            results['secrets_encrypted'] += 1
            print(f" ✓ Encrypted successfully")

        except Exception as e:
            error_msg = f"Failed to encrypt {key}: {e}"
            results['errors'].append(error_msg)
            print(f" ✗ {error_msg}")

    # Write the modified content back to the file
    if results['secrets_encrypted'] > 0:
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.writelines(lines)
            print(f" 💾 File updated successfully")
        except Exception as e:
            results['errors'].append(f"Failed to write file: {e}")
            print(f" ✗ Failed to write file: {e}")

    return results
def process_all_env_files(env_files: List[str] = None, dry_run: bool = False, create_backup: bool = True) -> Dict[str, Any]:
    """
    Process all environment files and encrypt unencrypted secrets.

    Args:
        env_files: List of specific files to process (if None, processes all three default files)
        dry_run: If True, only show what would be changed
        create_backup: If True, create backups before modifying

    Returns:
        dict: Summary of all processing results with keys 'total_files',
        'total_secrets_found', 'total_secrets_encrypted', 'total_errors',
        and 'files' (the per-file result dicts).
    """
    # Default environment files if none specified
    if env_files is None:
        env_files = ['env_dev.env', 'env_int.env', 'env_prod.env']

    # Convert to Path objects and check if they exist; missing files are
    # reported and skipped rather than treated as fatal.
    env_paths = []
    for env_file in env_files:
        env_path = Path(env_file)
        if not env_path.exists():
            print(f"⚠️ Warning: Environment file not found: {env_file}")
            continue
        env_paths.append(env_path)

    if not env_paths:
        print("❌ No valid environment files found to process")
        return {'total_files': 0, 'total_secrets_found': 0, 'total_secrets_encrypted': 0, 'total_errors': 0, 'files': []}

    print("🔐 PowerOn Batch Secret Encryption Tool")
    print("=" * 60)
    print("⚠️ IMPORTANT: The tool will read APP_ENV_TYPE from each file itself")
    print("⚠️ Each file will be processed with its own environment-specific encryption")
    print()

    if dry_run:
        print("🔍 DRY RUN MODE - No changes will be made")
        print()

    # Process each file and accumulate totals for the summary
    all_results = []
    total_secrets_found = 0
    total_secrets_encrypted = 0
    total_errors = 0

    for env_path in env_paths:
        print(f"\n📁 Processing {env_path.name}:")
        results = encrypt_all_secrets_in_file(env_path, dry_run, create_backup)
        all_results.append(results)
        total_secrets_found += results['secrets_found']
        total_secrets_encrypted += results['secrets_encrypted']
        total_errors += len(results['errors'])

    # Summary
    print("\n" + "=" * 60)
    print("📊 SUMMARY")
    print("=" * 60)
    print(f"Files processed: {len(env_paths)}")
    print(f"Total secrets found: {total_secrets_found}")
    if not dry_run:
        print(f"Total secrets encrypted: {total_secrets_encrypted}")
        print(f"Total errors: {total_errors}")
        if total_errors == 0 and total_secrets_encrypted > 0:
            print("\n🎉 All secrets encrypted successfully!")
        elif total_errors > 0:
            print(f"\n⚠️ Completed with {total_errors} errors")
        else:
            print("\n✅ No secrets needed encryption")
    else:
        print(f"Secrets that would be encrypted: {total_secrets_found}")

    # Show backup information
    backups_created = [r['backup_created'] for r in all_results if r['backup_created']]
    if backups_created:
        print(f"\n📋 Backups created: {len(backups_created)}")
        for backup in backups_created:
            print(f" - {Path(backup).name}")

    # Show errors if any
    all_errors = []
    for results in all_results:
        all_errors.extend(results['errors'])
    if all_errors:
        print(f"\n❌ Errors encountered:")
        for error in all_errors:
            print(f" - {error}")

    return {
        'total_files': len(env_paths),
        'total_secrets_found': total_secrets_found,
        'total_secrets_encrypted': total_secrets_encrypted,
        'total_errors': total_errors,
        'files': all_results
    }
def main():
    """Command-line entry point: parse arguments and run the batch encryption."""
    parser = argparse.ArgumentParser(description='Encrypt all *_SECRET variables in all environment files')
    parser.add_argument('--files', '-f', nargs='+',
                        help='Specific environment files to process (default: all three env files)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be changed without making changes')
    parser.add_argument('--no-backup', action='store_true',
                        help='Skip creating backup files')
    options = parser.parse_args()

    try:
        summary = process_all_env_files(
            env_files=options.files,
            dry_run=options.dry_run,
            create_backup=not options.no_backup,
        )
        # Non-zero exit code signals that at least one secret failed.
        return 1 if summary['total_errors'] > 0 else 0
    except Exception as exc:
        print(f"Error: {exc}")
        return 1
if __name__ == '__main__':
    # Propagate the CLI exit code (0 = success, 1 = errors) to the shell.
    sys.exit(main())