From dedee0ecdae423d2b4a635340c5224a8814d447e Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Sun, 12 Oct 2025 02:53:37 +0200
Subject: [PATCH] Image generation rendering tested and fixed - all renderers
 ready
---
modules/connectors/connectorAiOpenai.py | 81 ++++-
.../renderers/rendererBaseTemplate.py | 117 +++++++-
.../renderers/rendererImage.py | 281 ++++++++++++++++++
test_document_processing.py | 18 +-
4 files changed, 491 insertions(+), 6 deletions(-)
create mode 100644 modules/services/serviceGeneration/renderers/rendererImage.py
diff --git a/modules/connectors/connectorAiOpenai.py b/modules/connectors/connectorAiOpenai.py
index 4a9f4888..692fe422 100644
--- a/modules/connectors/connectorAiOpenai.py
+++ b/modules/connectors/connectorAiOpenai.py
@@ -188,4 +188,83 @@ class AiOpenai:
except Exception as e:
logger.error(f"Error during image analysis: {str(e)}", exc_info=True)
- return f"[Error during image analysis: {str(e)}]"
\ No newline at end of file
+ return f"[Error during image analysis: {str(e)}]"
+
async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid") -> Dict[str, Any]:
    """
    Generate an image using DALL-E 3.

    Args:
        prompt: The text prompt for image generation
        size: Image size (1024x1024, 1792x1024, or 1024x1792)
        quality: Image quality (standard or hd)
        style: Image style (vivid or natural)

    Returns:
        Dictionary with success status and image data. On success the dict
        contains "image_data" (base64 string) plus the size/quality/style
        actually requested; on failure it contains an "error" message.
    """
    try:
        logger.debug(f"Starting image generation with prompt: '{prompt[:100]}...'")

        # DALL-E 3 API endpoint
        dalle_url = "https://api.openai.com/v1/images/generations"

        payload = {
            "model": "dall-e-3",
            "prompt": prompt,
            "size": size,
            "quality": quality,
            "style": style,
            "n": 1,
            "response_format": "b64_json"  # Get base64 data directly instead of URLs
        }

        # Use an async context manager so the client is always closed, even
        # when the POST raises (the previous version only called aclose() on
        # the success path and leaked the connection pool on network errors).
        async with httpx.AsyncClient(
            timeout=120.0,
            headers={
                "Authorization": f"Bearer {self.apiKey}",
                "Content-Type": "application/json"
            }
        ) as dalle_client:
            response = await dalle_client.post(
                dalle_url,
                json=payload
            )

        if response.status_code != 200:
            logger.error(f"DALL-E API error: {response.status_code} - {response.text}")
            return {
                "success": False,
                "error": f"DALL-E API error: {response.status_code} - {response.text}"
            }

        responseJson = response.json()

        if "data" in responseJson and len(responseJson["data"]) > 0:
            image_data = responseJson["data"][0]["b64_json"]

            logger.info(f"Successfully generated image: {len(image_data)} characters")
            return {
                "success": True,
                "image_data": image_data,
                "size": size,
                "quality": quality,
                "style": style
            }
        else:
            logger.error("No image data in DALL-E response")
            return {
                "success": False,
                "error": "No image data in DALL-E response"
            }

    except Exception as e:
        logger.error(f"Error during image generation: {str(e)}", exc_info=True)
        return {
            "success": False,
            "error": f"Error during image generation: {str(e)}"
        }
diff --git a/modules/services/serviceGeneration/renderers/rendererBaseTemplate.py b/modules/services/serviceGeneration/renderers/rendererBaseTemplate.py
index 34c7387c..4c6b7001 100644
--- a/modules/services/serviceGeneration/renderers/rendererBaseTemplate.py
+++ b/modules/services/serviceGeneration/renderers/rendererBaseTemplate.py
@@ -140,6 +140,109 @@ class BaseRenderer(ABC):
alt_text = section_data.get("altText", "Image")
return base64_data, alt_text
+ def _render_image_section(self, section: Dict[str, Any], styles: Dict[str, Any] = None) -> Any:
+ """
+ Render an image section. This is a base implementation that should be overridden
+ by format-specific renderers.
+
+ Args:
+ section: Image section data
+ styles: Optional styling information
+
+ Returns:
+ Format-specific image representation
+ """
+ section_data = self._get_section_data(section)
+ base64_data, alt_text = self._extract_image_data(section_data)
+
+ # Base implementation returns a simple dict
+ # Format-specific renderers should override this method
+ return {
+ "type": "image",
+ "base64Data": base64_data,
+ "altText": alt_text,
+ "width": section_data.get("width", None),
+ "height": section_data.get("height", None),
+ "caption": section_data.get("caption", "")
+ }
+
+ def _validate_image_data(self, base64_data: str, alt_text: str) -> bool:
+ """Validate image data."""
+ if not base64_data:
+ self.logger.warning("Image section has no base64 data")
+ return False
+
+ if not alt_text:
+ self.logger.warning("Image section has no alt text")
+ return False
+
+ # Basic base64 validation
+ try:
+ import base64
+ base64.b64decode(base64_data, validate=True)
+ return True
+ except Exception as e:
+ self.logger.warning(f"Invalid base64 image data: {str(e)}")
+ return False
+
+ def _get_image_dimensions(self, base64_data: str) -> Tuple[int, int]:
+ """
+ Get image dimensions from base64 data.
+ This is a helper method that format-specific renderers can use.
+ """
+ try:
+ import base64
+ from PIL import Image
+ import io
+
+ # Decode base64 data
+ image_data = base64.b64decode(base64_data)
+ image = Image.open(io.BytesIO(image_data))
+
+ return image.size # Returns (width, height)
+
+ except Exception as e:
+ self.logger.warning(f"Could not determine image dimensions: {str(e)}")
+ return (0, 0)
+
+ def _resize_image_if_needed(self, base64_data: str, max_width: int = 800, max_height: int = 600) -> str:
+ """
+ Resize image if it exceeds maximum dimensions.
+ Returns the resized image as base64 string.
+ """
+ try:
+ import base64
+ from PIL import Image
+ import io
+
+ # Decode base64 data
+ image_data = base64.b64decode(base64_data)
+ image = Image.open(io.BytesIO(image_data))
+
+ # Check if resizing is needed
+ width, height = image.size
+ if width <= max_width and height <= max_height:
+ return base64_data # No resizing needed
+
+ # Calculate new dimensions maintaining aspect ratio
+ ratio = min(max_width / width, max_height / height)
+ new_width = int(width * ratio)
+ new_height = int(height * ratio)
+
+ # Resize image
+ resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+ # Convert back to base64
+ buffer = io.BytesIO()
+ resized_image.save(buffer, format=image.format or 'PNG')
+ resized_data = buffer.getvalue()
+
+ return base64.b64encode(resized_data).decode('utf-8')
+
+ except Exception as e:
+ self.logger.warning(f"Could not resize image: {str(e)}")
+ return base64_data # Return original if resize fails
+
def _get_supported_section_types(self) -> List[str]:
"""Return list of supported section types."""
return ["table", "bullet_list", "heading", "paragraph", "code_block", "image"]
@@ -170,7 +273,19 @@ class BaseRenderer(ABC):
return {"type": "code_block", "code": code, "language": language}
elif section_type == "image":
base64_data, alt_text = self._extract_image_data(section_data)
- return {"type": "image", "base64Data": base64_data, "altText": alt_text}
+ # Validate image data
+ if self._validate_image_data(base64_data, alt_text):
+ return {
+ "type": "image",
+ "base64Data": base64_data,
+ "altText": alt_text,
+ "width": section_data.get("width"),
+ "height": section_data.get("height"),
+ "caption": section_data.get("caption", "")
+ }
+ else:
+ # Return placeholder if image data is invalid
+ return {"type": "paragraph", "text": f"[Image: {alt_text}]"}
else:
# Fallback to paragraph
text = self._extract_paragraph_text(section_data)
diff --git a/modules/services/serviceGeneration/renderers/rendererImage.py b/modules/services/serviceGeneration/renderers/rendererImage.py
new file mode 100644
index 00000000..863a52e2
--- /dev/null
+++ b/modules/services/serviceGeneration/renderers/rendererImage.py
@@ -0,0 +1,281 @@
+"""
+Image renderer for report generation using AI image generation.
+"""
+
+from .rendererBaseTemplate import BaseRenderer
+from typing import Dict, Any, Tuple, List
+import base64
+import logging
+
+logger = logging.getLogger(__name__)
+
class RendererImage(BaseRenderer):
    """Renders content to image format using AI image generation (DALL-E)."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported image formats."""
        return ['png', 'jpg', 'jpeg', 'image']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['img', 'picture', 'photo', 'graphic']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for image renderer."""
        return 90

    async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]:
        """Render extracted JSON content to image format using AI image generation.

        Args:
            extracted_content: Structured document JSON (must contain "sections").
            title: Fallback document title.
            user_prompt: The user's original request, used to steer the image.
            ai_service: Service exposing aiObjects.generateImage / aiObjects.call.

        Returns:
            Tuple of (base64 image data, "image/png").

        Raises:
            Exception: When generation fails; the original cause is chained.
        """
        try:
            # Generate AI image from content
            image_content = await self._generate_ai_image(extracted_content, title, user_prompt, ai_service)

            return image_content, "image/png"

        except Exception as e:
            self.logger.error(f"Error rendering image: {str(e)}")
            # Re-raise the exception instead of using fallback; chain the
            # original so the full traceback survives for callers/logs.
            raise Exception(f"Image rendering failed: {str(e)}") from e

    async def _generate_ai_image(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
        """Generate AI image from extracted content.

        Validates the input structure, builds a DALL-E prompt, calls the AI
        service, and returns the base64 image payload.
        """
        try:
            if not ai_service:
                raise ValueError("AI service is required for image generation")

            # Validate JSON structure
            if not isinstance(extracted_content, dict):
                raise ValueError("Extracted content must be a dictionary")

            if "sections" not in extracted_content:
                raise ValueError("Extracted content must contain 'sections' field")

            # Use title from JSON metadata if available, otherwise use provided title
            document_title = extracted_content.get("metadata", {}).get("title", title)

            # Create AI prompt for image generation
            image_prompt = await self._create_image_generation_prompt(extracted_content, document_title, user_prompt, ai_service)

            # Generate image using AI
            image_result = await ai_service.aiObjects.generateImage(
                prompt=image_prompt,
                size="1024x1024",
                quality="standard",
                style="vivid"
            )

            # Extract base64 image data from result
            if image_result and image_result.get("success", False):
                image_data = image_result.get("image_data", "")
                if image_data:
                    return image_data
                else:
                    raise ValueError("No image data returned from AI")
            else:
                error_msg = image_result.get("error", "Unknown error") if image_result else "No result"
                raise ValueError(f"AI image generation failed: {error_msg}")

        except Exception as e:
            self.logger.error(f"Error generating AI image: {str(e)}")
            # Chain the cause so the ValueError details are not lost.
            raise Exception(f"AI image generation failed: {str(e)}") from e

    async def _create_image_generation_prompt(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str:
        """Create a detailed prompt for AI image generation based on the content.

        Assembles user intent, title, a visual description of the sections,
        and style guidance. Prompts over DALL-E's 4000-character limit are
        compressed via the AI service, with a minimal prompt as fallback.
        """
        try:
            # Start with base prompt
            prompt_parts = []

            # Add user's original intent if available
            if user_prompt:
                prompt_parts.append(f"User Request: {user_prompt}")

            # Add document title
            prompt_parts.append(f"Document Title: {title}")

            # Analyze content and create visual description
            sections = extracted_content.get("sections", [])
            content_description = self._analyze_content_for_visual_description(sections)

            if content_description:
                prompt_parts.append(f"Content to Visualize: {content_description}")

            # Add style guidance
            style_guidance = self._get_style_guidance_from_content(extracted_content, user_prompt)
            if style_guidance:
                prompt_parts.append(f"Visual Style: {style_guidance}")

            # Combine all parts
            full_prompt = "Create a professional, informative image that visualizes the following content:\n\n" + "\n\n".join(prompt_parts)

            # Add technical requirements
            full_prompt += "\n\nTechnical Requirements:"
            full_prompt += "\n- High quality, professional appearance"
            full_prompt += "\n- Clear, readable text if any text is included"
            full_prompt += "\n- Appropriate colors and layout"
            full_prompt += "\n- Suitable for business/professional use"

            # Truncate prompt if it exceeds DALL-E's 4000 character limit
            if len(full_prompt) > 4000:
                # Use AI to compress the prompt intelligently
                compressed_prompt = await self._compress_prompt_with_ai(full_prompt, ai_service)
                if compressed_prompt and len(compressed_prompt) <= 4000:
                    return compressed_prompt

                # Fallback to minimal prompt if AI compression fails or is still too long
                minimal_prompt = f"Create a professional image representing: {title}"
                if user_prompt:
                    minimal_prompt += f" - {user_prompt}"

                # If even the minimal prompt is too long, truncate it
                if len(minimal_prompt) > 4000:
                    minimal_prompt = minimal_prompt[:3997] + "..."

                return minimal_prompt

            return full_prompt

        except Exception as e:
            self.logger.warning(f"Error creating image prompt: {str(e)}")
            # Fallback to simple prompt
            return f"Create a professional image representing: {title}"

    async def _compress_prompt_with_ai(self, long_prompt: str, ai_service=None) -> str:
        """Use AI to intelligently compress a long prompt while preserving key information.

        Returns the compressed prompt, or None when no service is available,
        the call fails, or the result is implausibly short/long.
        """
        try:
            if not ai_service:
                return None

            compression_prompt = f"""
You are an expert at creating concise, effective prompts for AI image generation.

The following prompt is too long for DALL-E (4000 character limit) and needs to be compressed to under 4000 characters while preserving the most important visual information.

Original prompt ({len(long_prompt)} characters):
{long_prompt}

Please create a compressed version that:
1. Keeps the most important visual elements and requirements
2. Maintains the core intent and style guidance
3. Preserves technical requirements
4. Stays under 4000 characters
5. Is optimized for DALL-E image generation

Return only the compressed prompt, no explanations.
"""

            # Use AI to compress the prompt - call the AI service correctly
            # The ai_service has an aiObjects attribute that contains the actual AI interface
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

            request = AiCallRequest(
                prompt=compression_prompt,
                options=AiCallOptions(
                    operationType=OperationType.GENERAL,
                    maxTokens=2000,
                    temperature=0.3  # Lower temperature for more consistent compression
                )
            )

            response = await ai_service.aiObjects.call(request)
            compressed = response.content.strip()

            # Validate the compressed prompt: long enough to be meaningful,
            # short enough for DALL-E.
            if compressed and len(compressed) <= 4000 and len(compressed) > 50:
                self.logger.info(f"Successfully compressed prompt from {len(long_prompt)} to {len(compressed)} characters")
                return compressed
            else:
                self.logger.warning(f"AI compression failed or produced invalid result: {len(compressed) if compressed else 0} chars")
                return None

        except Exception as e:
            self.logger.warning(f"Error compressing prompt with AI: {str(e)}")
            return None

    def _analyze_content_for_visual_description(self, sections: List[Dict[str, Any]]) -> str:
        """Analyze content sections and create a visual description for AI.

        Produces a semicolon-joined summary of each section (tables, lists,
        headings, paragraphs, code blocks) for use in the image prompt.
        """
        try:
            descriptions = []

            for section in sections:
                section_type = self._get_section_type(section)
                section_data = self._get_section_data(section)

                if section_type == "table":
                    headers = section_data.get("headers", [])
                    rows = section_data.get("rows", [])
                    if headers and rows:
                        descriptions.append(f"Data table with {len(headers)} columns and {len(rows)} rows: {', '.join(headers)}")

                elif section_type == "bullet_list":
                    items = section_data.get("items", [])
                    if items:
                        descriptions.append(f"List with {len(items)} items")

                elif section_type == "heading":
                    text = section_data.get("text", "")
                    level = section_data.get("level", 1)
                    if text:
                        descriptions.append(f"Heading {level}: {text}")

                elif section_type == "paragraph":
                    text = section_data.get("text", "")
                    if text and len(text) > 10:  # Only include substantial paragraphs
                        # Truncate long text
                        truncated = text[:100] + "..." if len(text) > 100 else text
                        descriptions.append(f"Text content: {truncated}")

                elif section_type == "code_block":
                    code = section_data.get("code", "")
                    language = section_data.get("language", "")
                    if code:
                        # Only append an ellipsis when the code was actually
                        # truncated (previously "..." was added unconditionally).
                        snippet = code[:50] + "..." if len(code) > 50 else code
                        descriptions.append(f"Code block ({language}): {snippet}")

            return "; ".join(descriptions) if descriptions else "General document content"

        except Exception as e:
            self.logger.warning(f"Error analyzing content: {str(e)}")
            return "Document content"

    def _get_style_guidance_from_content(self, extracted_content: Dict[str, Any], user_prompt: str = None) -> str:
        """Determine visual style guidance based on content and user prompt.

        Keyword-matches the user prompt for a style family, then adds hints
        derived from the section types present (tables, lists, code).
        """
        try:
            style_elements = []

            # Analyze user prompt for style hints
            if user_prompt:
                prompt_lower = user_prompt.lower()

                if any(word in prompt_lower for word in ["modern", "contemporary", "sleek"]):
                    style_elements.append("modern, clean design")
                elif any(word in prompt_lower for word in ["classic", "traditional", "formal"]):
                    style_elements.append("classic, formal design")
                elif any(word in prompt_lower for word in ["creative", "artistic", "colorful"]):
                    style_elements.append("creative, artistic design")
                elif any(word in prompt_lower for word in ["corporate", "business", "professional"]):
                    style_elements.append("corporate, professional design")

            # Analyze content type for additional style hints
            sections = extracted_content.get("sections", [])
            has_tables = any(self._get_section_type(s) == "table" for s in sections)
            has_lists = any(self._get_section_type(s) == "bullet_list" for s in sections)
            has_code = any(self._get_section_type(s) == "code_block" for s in sections)

            if has_tables:
                style_elements.append("data-focused layout")
            if has_lists:
                style_elements.append("organized, structured presentation")
            if has_code:
                style_elements.append("technical, developer-friendly")

            # Default style if no specific guidance
            if not style_elements:
                style_elements.append("professional, clean design")

            return ", ".join(style_elements)

        except Exception as e:
            self.logger.warning(f"Error determining style guidance: {str(e)}")
            return "professional design"
diff --git a/test_document_processing.py b/test_document_processing.py
index 53fbd80d..c16add06 100644
--- a/test_document_processing.py
+++ b/test_document_processing.py
@@ -154,7 +154,7 @@ async def process_documents_and_generate_summary():
# userPrompt = "Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations."
- userPrompt = "Analyze these documents and create a comprehensive form for a user to fill out"
+ userPrompt = "Analyze these documents and create a fitting image for the content"
# userPrompt = "Extract the table from file and produce 2 lists in excel. one list with all entries, one list only with entries that are yellow highlighted."
@@ -168,7 +168,7 @@ async def process_documents_and_generate_summary():
prompt=userPrompt,
documents=documents,
options=ai_options,
- outputFormat="html",
+ outputFormat="txt",
title="Formulaire"
)
@@ -272,13 +272,17 @@ async def process_documents_and_generate_summary():
file_ext = '.pptx'
elif 'markdown' in doc_mime.lower() or 'md' in doc_mime.lower():
file_ext = '.md'
+ elif 'png' in doc_mime.lower() or 'image' in doc_mime.lower():
+ file_ext = '.png'
+ elif 'jpg' in doc_mime.lower() or 'jpeg' in doc_mime.lower():
+ file_ext = '.jpg'
else:
logger.warning(f"⚠️ Unknown MIME type: {doc_mime}, using .bin")
# Also check filename for hints
if doc_name and '.' in doc_name:
name_ext = '.' + doc_name.split('.')[-1].lower()
- if name_ext in ['.docx', '.pdf', '.txt', '.html', '.json', '.csv', '.xlsx', '.pptx', '.md']:
+ if name_ext in ['.docx', '.pdf', '.txt', '.html', '.json', '.csv', '.xlsx', '.pptx', '.md', '.png', '.jpg', '.jpeg']:
file_ext = name_ext
logger.info(f"📄 Using extension from filename: {file_ext}")
@@ -293,8 +297,14 @@ async def process_documents_and_generate_summary():
with open(output_path, 'w', encoding='utf-8') as f:
f.write(doc_data)
logger.info(f"✅ Document saved as text: {output_path} ({len(doc_data)} characters)")
+ elif file_ext in ['.png', '.jpg', '.jpeg']:
+ # Image formats - decode from base64
+ doc_bytes = base64.b64decode(doc_data)
+ with open(output_path, 'wb') as f:
+ f.write(doc_bytes)
+ logger.info(f"✅ Image saved: {output_path} ({len(doc_bytes)} bytes)")
else:
- # Binary formats - decode from base64
+ # Other binary formats - decode from base64
doc_bytes = base64.b64decode(doc_data)
with open(output_path, 'wb') as f:
f.write(doc_bytes)