gateway/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py
2026-03-22 11:09:48 +01:00

1978 lines
92 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import logging
import base64
import io
import json
import re
from datetime import datetime, UTC
from typing import Dict, Any, Optional, List
from .documentRendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
logger = logging.getLogger(__name__)
# Matches one inline Markdown-style span. The alternatives are tried in order:
#   groups 1/2: **text**  (group 1 is the whole span, group 2 the inner text)
#   groups 3/4: __text__  (group 3 is the whole span, group 4 the inner text)
#   group 5:    *text*    (single asterisks that are not part of a ** pair)
#   group 6:    _text_    (underscores not adjacent to a word character or "/")
#   group 7:    `text`
# NOTE(review): the code that consumes this pattern is outside this view;
# confirm the group numbering against it before changing the pattern.
_PPTX_MD_INLINE_RE = re.compile(
    r"(\*\*(.+?)\*\*)"
    r"|(__(.+?)__)"
    r"|(?<!\*)\*([^*\n]+?)\*(?!\*)"
    r"|(?<![\w/])_([^_\n]+?)_(?![\w/])"
    r"|`([^`]+)`"
)
class RendererPptx(BaseRenderer):
"""Renderer for PowerPoint (.pptx) files using python-pptx library."""
def __init__(self, services=None):
    """Initialise the base renderer, then record the PPTX MIME type and format names."""
    super().__init__(services=services)
    self.outputMimeType = (
        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    )
    self.supportedFormats = ["pptx", "ppt"]
@classmethod
def getSupportedFormats(cls) -> list:
"""Get list of supported output formats."""
return ["pptx", "ppt"]
@classmethod
def getFormatAliases(cls) -> List[str]:
"""Return format aliases."""
return []
@classmethod
def getPriority(cls) -> int:
"""Return priority for PowerPoint renderer."""
return 105
@classmethod
def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
"""Return output style classification: PowerPoint presentations are formatted documents."""
return 'document'
@classmethod
def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]:
    """
    Return the section content types this renderer accepts.

    Every type listed in ``supportedSectionTypes`` is accepted; ``formatName``
    is not consulted. A fresh list is returned on each call.
    """
    # Imported here rather than at module level, as in the original code.
    from modules.datamodels.datamodelJson import supportedSectionTypes

    acceptedTypes = [*supportedSectionTypes]
    return acceptedTypes
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
    """
    Render content as a PowerPoint presentation from JSON data.

    Args:
        extractedContent: JSON content to render as presentation
        title: Title for the presentation
        userPrompt: User prompt for AI styling
        aiService: AI service for styling

    Returns:
        A one-element list holding the rendered document. On success it carries
        the raw .pptx bytes. If python-pptx cannot be imported, or rendering
        fails, it carries a plain-text error message instead; the error is
        logged and not re-raised.
    """
    pptxMimeType = "application/vnd.openxmlformats-officedocument.presentationml.presentation"

    def _textFallback(message: str) -> List[RenderedDocument]:
        # Wrap an error message in a text/plain document so callers always get a result.
        metadata = extractedContent.get("metadata", {}) if extractedContent else {}
        documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
        return [
            RenderedDocument(
                documentData=message.encode('utf-8'),
                mimeType="text/plain",
                filename=self._determineFilename(title, "text/plain"),
                documentType=documentType,
                metadata=metadata if isinstance(metadata, dict) else None
            )
        ]

    def _blankLayoutIndex(prs) -> int:
        # Pick the layout used for content slides: prefer index 6 or 5 when it has
        # no placeholders, otherwise the layout with the fewest placeholders.
        # NOTE(review): in python-pptx's default template index 6 is "Blank" - confirm.
        layouts = prs.slide_layouts
        for idx in (6, 5):
            if idx < len(layouts):
                try:
                    if len(layouts[idx].placeholders) == 0:
                        return idx
                except (AttributeError, IndexError):
                    continue
        bestIndex = None
        fewestPlaceholders = float('inf')
        for idx in range(len(layouts)):
            try:
                layout = layouts[idx]
                phCount = len(layout.placeholders) if hasattr(layout, 'placeholders') else 0
            except Exception:
                continue
            if phCount < fewestPlaceholders:
                fewestPlaceholders = phCount
                bestIndex = idx
        return 0 if bestIndex is None else bestIndex

    try:
        # Import python-pptx lazily so a missing library yields the text fallback
        from pptx import Presentation
        from pptx.util import Inches, Pt
        from pptx.enum.text import PP_ALIGN
        from pptx.dml.color import RGBColor

        # Get style set: use styles from metadata if available, otherwise enhance with AI
        styles = await self._getStyleSet(extractedContent, userPrompt, aiService)

        prs = Presentation()
        # Slide size based on user intent (default 16:9); the height is the same for both
        slide_size = styles.get("slide_size", "16:9")
        prs.slide_width = Inches(10) if slide_size == "4:3" else Inches(13.33)
        prs.slide_height = Inches(7.5)

        slidesData = await self._parseJsonToSlides(extractedContent, title, styles)
        logger.info(f"Parsed {len(slidesData)} slides from JSON content")
        # str() of the whole content can be very large (embedded images), so only
        # build the preview when it will actually be logged.
        if logger.isEnabledFor(logging.INFO):
            logger.info(f"JSON content preview: {str(extractedContent)[:200]}...")

        # Store prs reference for image methods
        self._currentPresentation = prs

        contentLayoutIndex = None  # resolved once, on the first content slide

        for i, slide_data in enumerate(slidesData):
            slide_sections = slide_data.get("sections", [])
            slide_images = list(slide_data.get("images", []))
            slide_content = slide_data.get('content', '')
            hasSections = bool(slide_sections)
            hasImages = len(slide_images) > 0
            isTitleSlide = slide_data.get("_isTitleSlide", False)
            logger.info(f"Slide {i+1}: '{slide_data.get('title', 'No title')}' - sections: {len(slide_sections)}, images: {len(slide_images)}, content: {len(slide_content)} chars, titleSlide={isTitleSlide}")

            # Title slide uses the built-in Title Slide layout (index 0)
            if isTitleSlide:
                slide = prs.slides.add_slide(prs.slide_layouts[0])
                try:
                    titleShape = slide.shapes.title
                    titleShape.text = slide_data.get("title", "")
                    titleStyle = styles.get("title", {})
                    tf = titleShape.text_frame
                    if tf.paragraphs:
                        p = tf.paragraphs[0]
                        p.font.size = Pt(titleStyle.get("font_size", 36))
                        p.font.bold = titleStyle.get("bold", True)
                        tColor = self._getSafeColor(titleStyle.get("color", (31, 78, 121)))
                        p.font.color.rgb = RGBColor(*tColor)
                except Exception as titleErr:
                    logger.warning(f"Could not style title slide: {titleErr}")
                # Clear subtitle placeholder
                try:
                    slide.placeholders[1].text = ""
                except (KeyError, IndexError):
                    pass
                continue

            # Content slides: use the (nearly) blank layout
            if contentLayoutIndex is None:
                contentLayoutIndex = _blankLayoutIndex(prs)
            slide = prs.slides.add_slide(prs.slide_layouts[contentLayoutIndex])

            # Best effort: empty any placeholders the chosen layout still carries
            try:
                for shape in slide.shapes:
                    if hasattr(shape, 'is_placeholder') and shape.is_placeholder:
                        try:
                            if hasattr(shape, 'text_frame'):
                                shape.text_frame.clear()
                                if len(shape.text_frame.paragraphs) > 0:
                                    shape.text_frame.paragraphs[0].text = ""
                        except Exception as clearErr:
                            logger.debug(f"Could not clear placeholder: {clearErr}")
            except Exception as placeholder_error:
                logger.warning(f"Could not clear placeholders: {str(placeholder_error)}")

            # Add title as textbox
            titleBox = slide.shapes.add_textbox(Inches(0.5), Inches(0.2), prs.slide_width - Inches(1), Inches(0.6))
            titleFrame = titleBox.text_frame
            titleFrame.text = slide_data.get("title", "Slide")
            title_style = styles.get("title", {})
            # Content-slide titles are clamped to 10-32pt regardless of the style value
            title_font_size = max(10, min(title_style.get("font_size", 32), 32))
            titleParagraph = titleFrame.paragraphs[0]
            titleParagraph.font.size = Pt(title_font_size)
            titleParagraph.font.bold = title_style.get("bold", True)
            title_color = self._getSafeColor(title_style.get("color", (31, 78, 121)))
            titleParagraph.font.color.rgb = RGBColor(*title_color)
            titleParagraph.alignment = PP_ALIGN.LEFT
            titleFrame.word_wrap = True

            if hasSections:
                # Sections (and their images) are laid out by the frame renderer
                self._renderSlideContentWithFrames(slide, slide_sections, slide_images, styles, prs)
            elif slide_content and not hasImages:
                # Fallback: no sections but plain content text - render it in one textbox
                title_height_used = Inches(1.0)  # Title height for blank slides
                content_left = Inches(0.5)
                content_top = title_height_used + Inches(0.3)
                content_width = prs.slide_width - Inches(1)
                content_height = prs.slide_height - content_top - Inches(0.5)
                content_textbox = slide.shapes.add_textbox(content_left, content_top, content_width, content_height)
                text_frame = content_textbox.text_frame
                text_frame.word_wrap = True
                text_frame.auto_size = None

                # A new text frame already holds one empty paragraph; reuse it for the
                # first block so the textbox does not start with a blank line.
                useExistingParagraph = True
                for paragraph in slide_content.split('\n\n'):
                    if not paragraph.strip():
                        continue
                    if useExistingParagraph:
                        p = text_frame.paragraphs[0]
                        useExistingParagraph = False
                    else:
                        p = text_frame.add_paragraph()
                    p.text = paragraph.strip()

                    paragraph_style = styles.get("paragraph", {})
                    base_font_size = paragraph_style.get("font_size", 18)
                    # Shrink the font (down to 60%, never below 6pt) when the text is
                    # estimated not to fit into the box at the base size
                    try:
                        total_chars = len(slide_content)
                        chars_per_line = max(1, int(content_width / Pt(10)))
                        lines_needed = total_chars / chars_per_line
                        available_lines = max(1, int(content_height / Pt(14)))
                        font_multiplier = 1.0
                        if available_lines > 0 and lines_needed > available_lines:
                            font_multiplier = max(0.6, min(1.0, (available_lines / lines_needed) * 1.1))
                        calculated_size = max(6, int(base_font_size * font_multiplier))
                    except (ZeroDivisionError, ValueError, TypeError):
                        calculated_size = max(6, base_font_size)
                    p.font.size = Pt(calculated_size)
                    p.font.bold = paragraph_style.get("bold", False)
                    paragraph_color = self._getSafeColor(paragraph_style.get("color", (47, 47, 47)))
                    p.font.color.rgb = RGBColor(*paragraph_color)

                    align = paragraph_style.get("align", "left")
                    if align == "center":
                        p.alignment = PP_ALIGN.CENTER
                    elif align == "right":
                        p.alignment = PP_ALIGN.RIGHT
                    else:
                        p.alignment = PP_ALIGN.LEFT

        # If no slides were created, create a single slide with the document title
        if not slidesData:
            slide = prs.slides.add_slide(prs.slide_layouts[0])
            title_shape = slide.shapes.title
            title_shape.text = title
            title_style = styles.get("title", {})
            firstParagraph = title_shape.text_frame.paragraphs[0]
            if firstParagraph.font:
                firstParagraph.font.size = Pt(title_style.get("font_size", 48))
                firstParagraph.font.bold = title_style.get("bold", True)
                title_color = self._getSafeColor(title_style.get("color", (31, 78, 121)))
                firstParagraph.font.color.rgb = RGBColor(*title_color)
            # Clear subtitle placeholder instead of adding filler text
            try:
                slide.placeholders[1].text = ""
            except (KeyError, IndexError):
                pass

        # Serialise the presentation in memory
        buffer = io.BytesIO()
        prs.save(buffer)
        pptx_bytes = buffer.getvalue()
        logger.info(f"Successfully rendered PowerPoint presentation: {len(pptx_bytes)} bytes")

        # Filename: taken from the first document entry if present, else derived from the title
        documents = extractedContent.get("documents", [])
        filename = None
        if documents and isinstance(documents[0], dict):
            filename = documents[0].get("filename")
        if not filename:
            filename = self._determineFilename(title, pptxMimeType)

        # Extract metadata for document type and other info
        metadata = extractedContent.get("metadata", {}) if extractedContent else {}
        documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
        return [
            RenderedDocument(
                documentData=pptx_bytes,
                mimeType=pptxMimeType,
                filename=filename,
                documentType=documentType,
                metadata=metadata if isinstance(metadata, dict) else None
            )
        ]
    except ImportError:
        logger.error("python-pptx library not installed. Install with: pip install python-pptx")
        return _textFallback("python-pptx library not installed")
    except Exception as e:
        # logger.exception keeps the traceback, which the plain error message loses
        logger.exception(f"Error rendering PowerPoint presentation: {str(e)}")
        return _textFallback(f"Error rendering PowerPoint presentation: {str(e)}")
def _parseContentToSlides(self, content: str, title: str) -> list:
"""
Parse content into slide data structure.
Args:
content: Content to parse
title: Presentation title
Returns:
List of slide data dictionaries
"""
slides = []
# Split content by slide markers or headers
slide_sections = self._splitContentIntoSlides(content)
for i, section in enumerate(slide_sections):
if section.strip():
slide_data = {
"title": f"Slide {i + 1}",
"content": section.strip()
}
# Extract title from content if it starts with #
lines = section.strip().split('\n')
if lines and lines[0].startswith('#'):
# Remove # symbols and clean up title
slide_title = lines[0].lstrip('#').strip()
slide_data["title"] = slide_title
slide_data["content"] = '\n'.join(lines[1:]).strip()
elif lines and lines[0].strip():
# Use first line as title if it looks like a title
first_line = lines[0].strip()
if len(first_line) < 100 and not first_line.endswith('.'):
slide_data["title"] = first_line
slide_data["content"] = '\n'.join(lines[1:]).strip()
slides.append(slide_data)
return slides
def _splitContentIntoSlides(self, content: str) -> list:
"""
Split content into individual slides based on headers and structure.
Args:
content: Content to split
Returns:
List of slide content strings
"""
# re is already imported at module level
# First, try to split by major headers (# or ##)
# This is the most common case for AI-generated content
header_pattern = r'^(#{1,2})\s+(.+)$'
lines = content.split('\n')
slides = []
current_slide = []
for line in lines:
# Check if this line is a header
header_match = re.match(header_pattern, line.strip())
if header_match:
# If we have content in current slide, save it
if current_slide:
slide_content = '\n'.join(current_slide).strip()
if slide_content:
slides.append(slide_content)
current_slide = []
# Start new slide with this header
current_slide.append(line)
else:
# Add line to current slide
current_slide.append(line)
# Add the last slide
if current_slide:
slide_content = '\n'.join(current_slide).strip()
if slide_content:
slides.append(slide_content)
# If we found slides with headers, return them
if len(slides) > 1:
return slides
# Fallback: Split by double newlines
sections = content.split('\n\n\n')
if len(sections) > 1:
return [s.strip() for s in sections if s.strip()]
# Another fallback: Split by double newlines
sections = content.split('\n\n')
if len(sections) > 1:
return [s.strip() for s in sections if s.strip()]
# Last resort: return as single slide
return [content.strip()]
def getOutputMimeType(self) -> str:
"""Get MIME type for rendered output."""
return self.outputMimeType
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
"""Get style set - use styles from document generation metadata if available,
otherwise enhance default styles with AI if userPrompt provided.
WICHTIG: In a dynamic scalable AI system, styling should come from document generation,
not be generated separately by renderers. Only fall back to AI if styles not provided.
Args:
extractedContent: Document content with metadata (may contain styles)
userPrompt: User's prompt (AI will detect style instructions in any language)
aiService: AI service (used only if styles not in metadata and userPrompt provided)
templateName: Name of template style set (None = default)
Returns:
Dict with style definitions for all document styles
"""
# Get default style set
defaultStyleSet = self._getDefaultStyleSet()
# FIRST: Check if styles are provided in document generation metadata (preferred approach)
if extractedContent:
metadata = extractedContent.get("metadata", {})
if isinstance(metadata, dict):
styles = metadata.get("styles")
if styles and isinstance(styles, dict):
self.logger.debug("Using styles from document generation metadata")
enhancedStyleSet = self._convertColorsFormat(styles)
return self._validateStylesReadability(enhancedStyleSet)
# FALLBACK: Enhance with AI if userPrompt provided (only if styles not in metadata)
if userPrompt and aiService:
self.logger.info(f"Styles not in metadata, enhancing with AI based on user prompt...")
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
# Colors already converted in _getAiStylesWithPptxColors
return self._validateStylesReadability(enhancedStyleSet)
else:
# Use default styles only
return defaultStyleSet
async def _enhanceStylesWithAI(self, userPrompt: str, defaultStyleSet: Dict[str, Any], aiService) -> Dict[str, Any]:
"""Enhance default styles with AI based on user prompt."""
try:
style_template = self._createProfessionalPptxTemplate(userPrompt, defaultStyleSet)
enhanced_styles = await self._getAiStylesWithPptxColors(aiService, style_template, defaultStyleSet)
return enhanced_styles
except Exception as e:
self.logger.warning(f"AI style enhancement failed: {str(e)}, using default styles")
return defaultStyleSet
def _validateStylesReadability(self, styles: Dict[str, Any]) -> Dict[str, Any]:
"""Validate and fix readability issues in AI-generated styles."""
try:
# Ensure minimum font sizes for PowerPoint readability
min_font_sizes = {
"title": 36,
"heading": 24,
"subheading": 20,
"paragraph": 14,
"bullet_list": 14,
"table_header": 12,
"table_cell": 12
}
for style_name, min_size in min_font_sizes.items():
if style_name in styles:
current_size = styles[style_name].get("font_size", 12)
if current_size < min_size:
styles[style_name]["font_size"] = min_size
return styles
except Exception as e:
logger.warning(f"Style validation failed: {str(e)}")
return self._getDefaultStyleSet()
def _getDefaultStyleSet(self) -> Dict[str, Any]:
"""Default PowerPoint style set - used when no style instructions present."""
return {
"title": {"font_size": 32, "color": "#1B365D", "bold": True, "align": "left"},
"heading": {"font_size": 24, "color": "#1B365D", "bold": True, "align": "left"},
"subheading": {"font_size": 20, "color": "#4A90E2", "bold": True, "align": "left"},
"paragraph": {"font_size": 14, "color": "#2F2F2F", "bold": False, "align": "left"},
"bullet_list": {"font_size": 14, "color": "#2F2F2F", "indent": 20},
"table_header": {"font_size": 18, "color": "#FFFFFF", "bold": True, "background": "#1B365D"},
"table_cell": {"font_size": 16, "color": "#2F2F2F", "bold": False, "background": "#F8F9FA"},
"slide_size": "16:9",
"content_per_slide": "concise",
"design_theme": "corporate",
"color_scheme": "professional",
"background_style": "clean",
"accent_colors": ["#1B365D", "#2C5F2D", "#4A90E2", "#6B7280"],
"professional_grade": True,
"executive_ready": True
}
def _createProfessionalPptxTemplate(self, userPrompt: str, style_schema: Dict[str, Any]) -> str:
    """Build the AI prompt that asks for a customised copy of a style schema.

    Args:
        userPrompt: The user's request. When it is empty or None the text
            "Create professional corporate slides" is used in its place.
        style_schema: Style dictionary; it is serialised to JSON and embedded
            in the prompt as the structure the AI is asked to return.

    Returns:
        The complete prompt text.
    """
    # json is already imported at module level
    # The schema is pretty-printed with a 4-space indent before being embedded.
    schema_json = json.dumps(style_schema, indent=4)
    # The prompt below is runtime text sent to the AI service; its wording and
    # line layout are part of the program's behaviour.
    return f"""Customize the JSON below for professional PowerPoint slides.
User Request: {userPrompt or "Create professional corporate slides"}
Rules:
- Use professional colors (blues, grays, deep greens)
- Large, readable font sizes
- High contrast
- Sophisticated color palettes
Return ONLY this JSON with your changes:
{schema_json}
JSON ONLY. NO OTHER TEXT."""
async def _getAiStylesWithPptxColors(self, aiService, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
"""Get AI styles with proper PowerPoint color conversion. Uses base _getAiStyles for debug file writing."""
if not aiService:
return default_styles
try:
# Use base template method which handles debug file writing
enhanced_styles = await self._getAiStyles(aiService, style_template, default_styles)
# Convert colors to PPTX format (RGB tuples)
return self._convertColorsFormat(enhanced_styles)
except Exception as e:
self.logger.warning(f"AI style enhancement failed: {str(e)}, using defaults")
return default_styles
def _convertColorsFormat(self, styles: Dict[str, Any]) -> Dict[str, Any]:
"""Convert hex colors to RGB format for PowerPoint compatibility."""
try:
for style_name, style_config in styles.items():
if isinstance(style_config, dict):
for prop, value in style_config.items():
if isinstance(value, str) and value.startswith('#'):
# Convert hex to RGB tuple for PowerPoint
hex_color = value.lstrip('#')
if len(hex_color) == 6:
r = int(hex_color[0:2], 16)
g = int(hex_color[2:4], 16)
b = int(hex_color[4:6], 16)
styles[style_name][prop] = (r, g, b)
elif len(hex_color) == 8: # aRGB format
r = int(hex_color[2:4], 16)
g = int(hex_color[4:6], 16)
b = int(hex_color[6:8], 16)
styles[style_name][prop] = (r, g, b)
return styles
except Exception as e:
self.logger.warning(f"Color conversion failed: {str(e)}")
return styles
def _getSafeColor(self, color_value, default=(0, 0, 0)) -> tuple:
"""Get a safe RGB color tuple for PowerPoint."""
if isinstance(color_value, tuple) and len(color_value) == 3:
return color_value
elif isinstance(color_value, str) and color_value.startswith('#'):
hex_color = color_value.lstrip('#')
if len(hex_color) == 6:
r = int(hex_color[0:2], 16)
g = int(hex_color[2:4], 16)
b = int(hex_color[4:6], 16)
return (r, g, b)
elif len(hex_color) == 8: # aRGB format
r = int(hex_color[2:4], 16)
g = int(hex_color[4:6], 16)
b = int(hex_color[6:8], 16)
return (r, g, b)
return default
async def _parseJsonToSlides(self, json_content: Dict[str, Any], title: str, styles: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Parse JSON content into slide data structure.

    The result starts with a title slide (marked ``_isTitleSlide``), followed
    by the slides produced by ``_createSlidesFromSections``, or by one empty
    "Content Overview" slide when there are none. If anything fails, the error
    is logged and a single error slide is returned instead.

    Args:
        json_content: JSON content to parse
        title: Presentation title; when empty the metadata title is used
        styles: Style set, passed through to the section splitter

    Returns:
        List of slide data dictionaries
    """
    try:
        # Standardized schema: {metadata: {...}, documents: [{sections: [...]}]}
        if not self._validateJsonStructure(json_content):
            raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")

        sections = self._extractSections(json_content)
        metadata = self._extractMetadata(json_content)

        # Title slide first (clean - just the document title, no filler text)
        deck = [{
            "title": title or metadata.get("title", "Generated Document"),
            "content": "",
            "_isTitleSlide": True,
        }]

        # Content slides split by chapter headings, or a single placeholder slide
        contentSlides = self._createSlidesFromSections(sections, styles)
        deck.extend(contentSlides or [{"title": "Content Overview", "content": ""}])
        return deck
    except Exception as err:
        logger.error(f"Error parsing JSON to slides: {str(err)}")
        # Minimal fallback deck
        return [{"title": title, "content": "Error parsing content for presentation"}]
def _createSlideFromSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> Dict[str, Any]:
"""Create a slide from a JSON section."""
try:
# Get section title from data or use default
section_title = "Untitled Section"
if section.get("content_type") == "heading":
# Extract text from elements array - use nested content structure
for element in section.get("elements", []):
if isinstance(element, dict):
content = element.get("content", {})
if isinstance(content, dict):
text = content.get("text", "")
if text:
section_title = text
break
elif section.get("title"):
section_title = section.get("title")
content_type = section.get("content_type", "paragraph")
elements = section.get("elements", [])
# Check for three content formats from Phase 5D in elements
content_parts = []
for element in elements:
if not isinstance(element, dict):
continue
element_type = element.get("type", "")
# Support three content formats from Phase 5D
if element_type == "reference":
# Document reference format
doc_ref = element.get("documentReference", "")
label = element.get("label", "Reference")
content_parts.append(f"[Reference: {label}]")
continue
elif element_type == "extracted_text":
# Extracted text format
content = element.get("content", "")
source = element.get("source", "")
if content:
source_text = f" (Source: {source})" if source else ""
content_parts.append(f"{content}{source_text}")
continue
# Handle image sections specially
if content_type == "image":
# Extract image data from nested content structure
images = []
for element in elements:
if isinstance(element, dict):
# Extract from nested content structure
content = element.get("content", {})
if isinstance(content, dict):
base64Data = content.get("base64Data")
altText = content.get("altText", "Image")
caption = content.get("caption", "")
else:
# Fallback to direct element fields
base64Data = element.get("base64Data")
altText = element.get("altText", "Image")
caption = element.get("caption", "")
if base64Data:
images.append({
"base64Data": base64Data,
"altText": altText,
"caption": caption
})
return {
"title": section_title or (elements[0].get("content", {}).get("altText", "Image") if elements and isinstance(elements[0], dict) else "Image"),
"content": "\n\n".join(content_parts) if content_parts else "", # Include reference/extracted_text if present
"images": images
}
# Build slide content based on section type - iterate over elements and format each
if not content_parts: # Only if we didn't process reference/extracted_text above
for element in elements:
if not isinstance(element, dict):
continue
element_type = element.get("type", "")
# Use element type if available, otherwise fall back to section content_type
if not element_type:
element_type = content_type
if element_type == "table":
formatted = self._formatTableForSlide(element)
if formatted:
content_parts.append(formatted)
elif element_type == "bullet_list" or element_type == "list":
formatted = self._formatListForSlide(element)
if formatted:
content_parts.append(formatted)
elif element_type == "heading":
formatted = self._formatHeadingForSlide(element)
if formatted:
content_parts.append(formatted)
elif element_type == "paragraph":
formatted = self._formatParagraphForSlide(element)
if formatted:
content_parts.append(formatted)
elif element_type == "code_block" or element_type == "code":
formatted = self._formatCodeForSlide(element)
if formatted:
content_parts.append(formatted)
else:
# Fallback to paragraph formatting
formatted = self._formatParagraphForSlide(element)
if formatted:
content_parts.append(formatted)
# Combine content parts
slide_content = "\n\n".join(filter(None, content_parts))
return {
"title": section_title,
"content": slide_content,
"images": [] # No images for non-image sections
}
except Exception as e:
logger.warning(f"Error creating slide from section: {str(e)}")
return None
def _formatTableForSlide(self, element: Dict[str, Any]) -> str:
"""Format table data for slide presentation."""
try:
# Extract table data from element - handle nested content structure
if not isinstance(element, dict):
return ""
# Extract from nested content structure
content = element.get("content", {})
if not isinstance(content, dict):
return ""
headers = content.get("headers", [])
rows = content.get("rows", [])
if not headers:
return ""
# Create table representation
table_lines = []
# Add headers
header_line = " | ".join(str(h) for h in headers)
table_lines.append(header_line)
# Add separator
separator = "-" * len(header_line)
table_lines.append(separator)
# Add data rows (limit based on content density)
max_rows = 5 # Default limit
for row in rows[:max_rows]:
row_line = " | ".join(str(cell) for cell in row)
table_lines.append(row_line)
if len(rows) > max_rows:
table_lines.append(f"... and {len(rows) - max_rows} more rows")
return "\n".join(table_lines)
except Exception as e:
logger.warning(f"Error formatting table for slide: {str(e)}")
return ""
def _formatListForSlide(self, list_data: Dict[str, Any]) -> str:
"""Format list data for slide presentation."""
try:
# Extract from nested content structure
content = list_data.get("content", {})
if not isinstance(content, dict):
return ""
items = content.get("items", [])
if not items:
return ""
# Create list representation
list_lines = []
for item in items:
if isinstance(item, dict):
text = item.get("text", "")
list_lines.append(f"{text}")
# Add subitems (limit to 3 for readability)
subitems = item.get("subitems", [])[:3]
for subitem in subitems:
if isinstance(subitem, dict):
list_lines.append(f" - {subitem.get('text', '')}")
else:
list_lines.append(f" - {subitem}")
else:
list_lines.append(f"{str(item)}")
return "\n".join(list_lines)
except Exception as e:
logger.warning(f"Error formatting list for slide: {str(e)}")
return ""
def _formatHeadingForSlide(self, heading_data: Dict[str, Any]) -> str:
"""Format heading data for slide presentation."""
try:
# Extract from nested content structure
content = heading_data.get("content", {})
if not isinstance(content, dict):
return ""
text = content.get("text", "")
level = content.get("level", 1)
if text:
return f"{'#' * level} {text}"
return ""
except Exception as e:
logger.warning(f"Error formatting heading for slide: {str(e)}")
return ""
def _formatParagraphForSlide(self, paragraph_data: Dict[str, Any]) -> str:
"""Format paragraph data for slide presentation."""
try:
# Extract from nested content structure
content = paragraph_data.get("content", {})
if isinstance(content, dict):
text = content.get("text", "")
elif isinstance(content, str):
text = content
else:
text = ""
if text:
# Limit paragraph length based on content density
max_length = 200 # Default limit
if len(text) > max_length:
text = text[:max_length] + "..."
return text
return ""
except Exception as e:
logger.warning(f"Error formatting paragraph for slide: {str(e)}")
return ""
def _formatCodeForSlide(self, code_data: Dict[str, Any]) -> str:
"""Format code data for slide presentation."""
try:
# Extract from nested content structure
content = code_data.get("content", {})
if not isinstance(content, dict):
return ""
code = content.get("code", "")
language = content.get("language", "")
if code:
# Limit code length based on content density
max_length = 100 # Default limit
if len(code) > max_length:
code = code[:max_length] + "..."
if language:
return f"Code ({language}):\n{code}"
else:
return f"Code:\n{code}"
return ""
except Exception as e:
logger.warning(f"Error formatting code for slide: {str(e)}")
return ""
def _getSlideLayoutIndex(self, slide_data: Dict[str, Any], styles: Dict[str, Any]) -> int:
"""Determine the best professional slide layout based on content."""
try:
content = slide_data.get("content", "")
title = slide_data.get("title", "")
if not content:
return 0
# Professional layout selection based on content
if "|" in content and "-" in content:
# Has both tables and lists - use content with caption for professional look
return 2
elif "|" in content:
# Has tables - use content layout for clean table presentation
return 1
elif content.count("") > 2:
# Has many bullet points - use content layout for better readability
return 1
elif len(content) > 200:
# Long content - use content layout for better text flow
return 1
elif title and len(title) > 20:
# Long title - use title and content layout
return 1
else:
# Default to title and content layout for professional appearance
return 1
except Exception as e:
logger.warning(f"Error determining slide layout: {str(e)}")
return 1 # Default to title and content layout
def _createSlidesFromSections(self, sections: List[Dict[str, Any]], styles: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Create slides from sections: each top-level heading creates a new slide.
The split level is determined dynamically: if there is exactly one H1 (the
document title), chapters are H2; otherwise chapters are H1.
"""
try:
# First pass: discover heading levels to choose the split level
headingLevels: List[int] = []
for section in sections:
if section.get("content_type") == "heading":
for el in section.get("elements", []):
if isinstance(el, dict):
c = el.get("content", {})
if isinstance(c, dict):
headingLevels.append(c.get("level", 1))
h1Count = headingLevels.count(1)
h2Count = headingLevels.count(2)
# If there's at most one H1 but multiple H2s, split on H2
splitLevel = 2 if h1Count <= 1 and h2Count > 1 else 1
slides = []
currentSlideSections = []
currentSlideTitle = "Content Overview"
for section in sections:
sectionType = section.get("content_type", "paragraph")
elements = section.get("elements", [])
if not elements and sectionType != "heading":
continue
if sectionType == "heading":
level = 1
headingText = ""
for element in elements:
if isinstance(element, dict):
content = element.get("content", {})
if isinstance(content, dict):
headingText = content.get("text", "")
level = content.get("level", 1)
elif isinstance(content, str):
headingText = content
level = 1
if level <= splitLevel:
if currentSlideSections:
slides.append({
"title": currentSlideTitle,
"sections": currentSlideSections.copy(),
"images": []
})
currentSlideSections = []
currentSlideTitle = headingText or section.get("id", "Untitled Section")
else:
currentSlideSections.append(section)
elif sectionType == "image":
currentSlideSections.append(section)
else:
currentSlideSections.append(section)
if currentSlideSections:
slides.append({
"title": currentSlideTitle,
"sections": currentSlideSections.copy(),
"images": []
})
return slides
except Exception as e:
logger.warning(f"Error creating slides from sections: {str(e)}")
return []
def _formatSectionContent(self, section: Dict[str, Any]) -> str:
"""Format section content for slide presentation."""
try:
content_type = section.get("content_type", "paragraph")
elements = section.get("elements", [])
# Image sections return empty content (handled separately)
if content_type == "image":
return ""
# Process each element in the section - use element type, not section type
content_parts = []
for element in elements:
if not isinstance(element, dict):
continue
element_type = element.get("type", "")
# Use element type if available, otherwise fall back to section content_type
if not element_type:
element_type = content_type
if element_type == "table":
formatted = self._formatTableForSlide(element)
if formatted:
content_parts.append(formatted)
elif element_type == "bullet_list" or element_type == "list":
formatted = self._formatListForSlide(element)
if formatted:
content_parts.append(formatted)
elif element_type == "heading":
formatted = self._formatHeadingForSlide(element)
if formatted:
content_parts.append(formatted)
elif element_type == "paragraph":
formatted = self._formatParagraphForSlide(element)
if formatted:
content_parts.append(formatted)
elif element_type == "code_block" or element_type == "code":
formatted = self._formatCodeForSlide(element)
if formatted:
content_parts.append(formatted)
else:
# Fallback to paragraph formatting
formatted = self._formatParagraphForSlide(element)
if formatted:
content_parts.append(formatted)
return "\n\n".join(filter(None, content_parts))
except Exception as e:
logger.warning(f"Error formatting section content: {str(e)}")
return ""
def _addImagesToSlide(self, slide, images: List[Dict[str, Any]], styles: Dict[str, Any]) -> None:
    """Embed images on a slide in the area below the title.

    A single image is scaled down (never up) to fit 90% of the available
    area while keeping its aspect ratio, centred, and given a caption when
    ``caption`` / ``altText`` is present and not the placeholder ``"Image"``.
    Several images are laid out in a grid (2 columns for up to 4 images,
    3 columns otherwise), each stretched to its grid cell.

    All sizes handed to python-pptx are EMU. Pixel sizes reported by PIL are
    converted assuming 96 pixels per inch.

    Args:
        slide: Target slide; its ``shapes`` collection receives the pictures.
        images: Dicts carrying ``base64Data`` and optional ``caption`` /
            ``altText``.
        styles: Style dictionary (not used by this method).

    Errors are logged and swallowed; nothing is raised to the caller.
    """
    try:
        from pptx.util import Inches, Pt
        from pptx.enum.text import PP_ALIGN
        import base64
        import io

        if not images:
            return

        # Slide dimensions come from the cached presentation when available
        if hasattr(self, '_currentPresentation'):
            prs = self._currentPresentation
        else:
            prs = slide.presentation

        slideWidth = prs.slide_width
        slideHeight = prs.slide_height
        titleHeight = Inches(1.5)  # Approximate title height

        # Available area for images
        availableWidth = slideWidth - Inches(1)  # Margins
        availableHeight = slideHeight - titleHeight - Inches(1)  # Title + margins

        # 914400 EMU per inch at 96 pixels per inch -> 9525 EMU per pixel
        emuPerPixel = int(Inches(1)) // 96

        if len(images) == 1:
            # Single image: center it
            img = images[0]
            base64Data = img.get("base64Data")

            if not base64Data or not isinstance(base64Data, str) or len(base64Data.strip()) == 0:
                dataLength = len(base64Data) if isinstance(base64Data, str) else 0
                logger.error(f"Invalid base64Data: present={bool(base64Data)}, type={type(base64Data)}, length={dataLength}")
                return

            try:
                imageBytes = base64.b64decode(base64Data)
                if len(imageBytes) == 0:
                    logger.error("Decoded image bytes are empty")
                    return
                imageStream = io.BytesIO(imageBytes)
            except Exception as decode_error:
                logger.error(f"Failed to decode base64 image data: {str(decode_error)}")
                return

            try:
                from PIL import Image as PILImage
                imgWidth, imgHeight = PILImage.open(imageStream).size

                # Native size in EMU, so it is comparable with slide lengths
                nativeWidth = imgWidth * emuPerPixel
                nativeHeight = imgHeight * emuPerPixel

                # Fit into 90% of the available area; never enlarge the image
                maxWidth = availableWidth * 0.9
                maxHeight = availableHeight * 0.9
                scale = min(maxWidth / nativeWidth, maxHeight / nativeHeight, 1.0)

                finalWidth = int(nativeWidth * scale)
                finalHeight = int(nativeHeight * scale)

                # Center image
                left = int((slideWidth - finalWidth) / 2)
                top = int(titleHeight + (availableHeight - finalHeight) / 2)
            except Exception:
                # PIL missing, unreadable image or zero-sized image: default size
                finalWidth = Inches(6)
                finalHeight = Inches(4.5)
                left = int((slideWidth - finalWidth) / 2)
                top = titleHeight + Inches(1)

            imageStream.seek(0)
            slide.shapes.add_picture(imageStream, left, top, width=finalWidth, height=finalHeight)

            # Add caption if available
            caption = img.get("caption") or img.get("altText")
            if caption and caption != "Image":
                captionTop = top + finalHeight + Inches(0.2)
                captionBox = slide.shapes.add_textbox(
                    Inches(1),
                    captionTop,
                    slideWidth - Inches(2),
                    Inches(0.5)
                )
                captionFrame = captionBox.text_frame
                captionFrame.text = caption
                captionFrame.paragraphs[0].font.size = Pt(12)
                captionFrame.paragraphs[0].font.italic = True
                captionFrame.paragraphs[0].alignment = PP_ALIGN.CENTER
        else:
            # Multiple images: arrange in grid
            cols = 2 if len(images) <= 4 else 3
            rows = (len(images) + cols - 1) // cols
            gap = Inches(0.5)

            cellWidth = int((availableWidth - gap * (cols - 1)) / cols)
            cellHeight = int((availableHeight - gap * (rows - 1)) / rows)

            for idx, img in enumerate(images):
                base64Data = img.get("base64Data")
                if not base64Data:
                    continue

                row, col = divmod(idx, cols)
                left = Inches(0.5) + col * (cellWidth + gap)
                top = titleHeight + Inches(0.5) + row * (cellHeight + gap)

                # One broken image must not prevent the others from rendering
                try:
                    imageStream = io.BytesIO(base64.b64decode(base64Data))
                    slide.shapes.add_picture(imageStream, left, top, width=cellWidth, height=cellHeight)
                except Exception as img_error:
                    logger.error(f"Error embedding image {idx} in PPTX slide: {str(img_error)}")

    except Exception as e:
        logger.error(f"Error embedding images in PPTX slide: {str(e)}")
        import traceback
        logger.error(f"Traceback: {traceback.format_exc()}")
def _addMarkdownInlineRuns(self, paragraph, text: str, fontSize=None, fontColor=None, fontBold=None) -> None:
    """Replace a paragraph's content with runs built from inline markdown.

    ``**bold**`` / ``__bold__``, ``*italic*`` / ``_italic_`` and `` `code` ``
    spans (as matched by ``_PPTX_MD_INLINE_RE``) become separate runs; the
    text between them becomes plain runs. Every run gets its font properties
    set explicitly from ``fontSize`` / ``fontColor`` / ``fontBold`` when those
    are given. Code runs use "Courier New" at 85% of ``fontSize`` (at least
    8pt) when ``fontSize`` exposes ``.pt``.
    """
    from pptx.util import Pt

    paragraph.text = ""

    def _appendRun(runText):
        newRun = paragraph.add_run()
        newRun.text = runText
        return newRun

    def _applyDefaults(run, bold=None):
        if fontSize:
            run.font.size = fontSize
        if fontColor:
            run.font.color.rgb = fontColor
        # An explicit bold flag wins over the paragraph-wide fontBold
        effectiveBold = bold if bold is not None else fontBold
        if effectiveBold is not None:
            run.font.bold = effectiveBold

    cursor = 0
    for match in _PPTX_MD_INLINE_RE.finditer(text):
        # Plain text between the previous match and this one
        if match.start() > cursor:
            _applyDefaults(_appendRun(text[cursor:match.start()]))

        boldText = match.group(2) or match.group(4)
        italicText = match.group(5) or match.group(6)
        codeText = match.group(7)

        if boldText:
            _applyDefaults(_appendRun(boldText), bold=True)
        elif italicText:
            italicRun = _appendRun(italicText)
            italicRun.font.italic = True
            _applyDefaults(italicRun)
        elif codeText:
            codeRun = _appendRun(codeText)
            codeRun.font.name = "Courier New"
            if fontSize and hasattr(fontSize, 'pt'):
                codeRun.font.size = Pt(max(8, int(fontSize.pt * 0.85)))
            elif fontSize:
                codeRun.font.size = fontSize
            if fontColor:
                codeRun.font.color.rgb = fontColor

        cursor = match.end()

    # Remaining tail (or the entire string when nothing matched)
    if cursor < len(text):
        _applyDefaults(_appendRun(text[cursor:]))
def _addTableToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], top: float = None, max_width: float = None) -> None:
    """Add a table shape built from ``element["content"]`` to the slide.

    ``content`` must be a dict with ``headers`` (list) and optional ``rows``
    (list of lists/tuples). Nothing is added when headers are missing. Rows
    that are not lists or tuples are ignored; cells beyond the number of
    headers are dropped.

    Args:
        slide: Target slide.
        element: Table element with nested ``content``.
        styles: Style dictionary; ``table_header`` and ``table_cell`` are read.
        top: Top position (EMU). When ``None`` the table is placed 0.15in
            below the lowest existing shape, but not above 1.5in.
        max_width: Table width (EMU). Defaults to the slide width minus 1in.

    Errors are logged as warnings and swallowed.
    """
    try:
        from pptx.util import Inches, Pt
        from pptx.enum.text import PP_ALIGN
        from pptx.dml.color import RGBColor

        content = element.get("content", {})
        if not isinstance(content, dict):
            return

        headers = content.get("headers", [])
        if not headers:
            return
        # Only sequence rows can be laid out cell by cell
        rows = [r for r in (content.get("rows") or []) if isinstance(r, (list, tuple))]

        num_cols = len(headers)
        num_rows = len(rows) + 1  # +1 for the header row

        left = Inches(0.5)
        if hasattr(self, '_currentPresentation'):
            prs = self._currentPresentation
        else:
            prs = slide.presentation
        width = max_width if max_width is not None else (prs.slide_width - Inches(1))
        row_height = Inches(0.4)

        # Auto-calculate top from existing shapes when not specified
        if top is None:
            maxBottom = Inches(1.5)
            for shape in slide.shapes:
                shapeBottom = shape.top + shape.height
                if shapeBottom > maxBottom:
                    maxBottom = shapeBottom
            top = maxBottom + Inches(0.15)

        table_height = row_height * num_rows
        table_shape = slide.shapes.add_table(num_rows, num_cols, left, top, width, table_height)
        table = table_shape.table

        # Column widths are EMU integers; split the table width evenly
        col_width_emu = int(width) // num_cols
        for col_idx in range(num_cols):
            table.columns[col_idx].width = col_width_emu

        alignments = {"left": PP_ALIGN.LEFT, "center": PP_ALIGN.CENTER, "right": PP_ALIGN.RIGHT}

        def _fillCell(cell, text, bgRgb, textRgb, sizePt, bold, alignment):
            # Write text and apply fill, font and alignment to one table cell
            cell.text_frame.clear()
            cell.text = text
            if len(cell.text_frame.paragraphs) == 0:
                cell.text_frame.add_paragraph()
            cell.fill.solid()
            cell.fill.fore_color.rgb = bgRgb
            para = cell.text_frame.paragraphs[0]
            para.font.bold = bold
            para.font.size = sizePt
            para.font.color.rgb = textRgb
            para.alignment = alignment
            if not para.text:
                para.text = text

        # Header row: resolve style objects once, outside the loop
        header_style = styles.get("table_header", {})
        header_bg_rgb = RGBColor(*self._getSafeColor(header_style.get("background", (31, 78, 121))))
        header_text_rgb = RGBColor(*self._getSafeColor(header_style.get("text_color", (255, 255, 255))))
        header_font_size_pt = Pt(header_style.get("font_size", 18))
        header_bold = header_style.get("bold", True)
        header_alignment = alignments.get(header_style.get("align", "center"), PP_ALIGN.CENTER)

        for col_idx, header in enumerate(headers):
            # "is None" keeps valid falsy headers such as 0
            header_text = "" if header is None else str(header)
            _fillCell(table.cell(0, col_idx), header_text, header_bg_rgb, header_text_rgb,
                      header_font_size_pt, header_bold, header_alignment)

        # Data rows
        cell_style = styles.get("table_cell", {})
        cell_bg_rgb = RGBColor(*self._getSafeColor(cell_style.get("background", (255, 255, 255))))
        cell_text_rgb = RGBColor(*self._getSafeColor(cell_style.get("text_color", (47, 47, 47))))
        cell_font_size_pt = Pt(cell_style.get("font_size", 16))
        cell_bold = cell_style.get("bold", False)
        cell_alignment = alignments.get(cell_style.get("align", "left"), PP_ALIGN.LEFT)

        for row_idx, row_data in enumerate(rows, 1):
            for col_idx, cell_data in enumerate(row_data[:num_cols]):
                cell_text = "" if cell_data is None else str(cell_data)
                _fillCell(table.cell(row_idx, col_idx), cell_text, cell_bg_rgb, cell_text_rgb,
                          cell_font_size_pt, cell_bold, cell_alignment)

    except Exception as e:
        logger.warning(f"Error adding table to slide: {str(e)}")
def _addBulletListToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], text_frame, font_size_multiplier: float = 1.0) -> None:
    """Append the items of a list element to ``text_frame`` as bullet lines.

    Each item becomes its own paragraph prefixed with a bullet character;
    sub-items follow their parent, indented and prefixed with an en dash.
    The prefix is part of the text because the paragraphs are added to a
    plain text frame. Items (and sub-items) with empty or whitespace-only
    text are skipped; a skipped item also skips its sub-items.

    Args:
        slide: Target slide (not used by this method).
        element: List element whose ``content["items"]`` holds strings or
            dicts with ``text`` and optional ``subitems``.
        styles: Style dictionary; the ``paragraph`` style supplies font size
            and color.
        text_frame: Text frame receiving the paragraphs.
        font_size_multiplier: Scale factor applied to the style font size
            (result is at least 10pt).

    Errors are logged as warnings and swallowed.
    """
    try:
        from pptx.util import Pt
        from pptx.dml.color import RGBColor
        from pptx.enum.text import PP_ALIGN

        content = element.get("content", {})
        if not isinstance(content, dict):
            return
        items = content.get("items", [])
        if not items:
            return

        listStyle = styles.get("paragraph", {})
        fontSize = Pt(max(10, int(listStyle.get("font_size", 14) * font_size_multiplier)))
        fontColor = RGBColor(*self._getSafeColor(listStyle.get("color", (47, 47, 47))))

        # Written as escapes so the glyphs survive any re-encoding of this file
        itemPrefix = "\u2022 "      # bullet
        subPrefix = "    \u2013 "   # indented en dash

        def _entryText(entry) -> str:
            # Items may be plain values or dicts carrying a "text" field
            raw = entry.get("text", "") if isinstance(entry, dict) else entry
            return "" if raw is None else str(raw)

        def _addLine(lineText: str, spacingPt: int) -> None:
            p = text_frame.add_paragraph()
            p.level = 0
            p.alignment = PP_ALIGN.LEFT
            p.space_before = Pt(spacingPt)
            p.space_after = Pt(spacingPt)
            self._addMarkdownInlineRuns(p, lineText, fontSize=fontSize, fontColor=fontColor, fontBold=False)

        for item in items:
            itemText = _entryText(item)
            if not itemText.strip():
                continue

            # Do not double up when the text already carries a bullet
            if itemText.lstrip().startswith("\u2022"):
                _addLine(itemText, 2)
            else:
                _addLine(f"{itemPrefix}{itemText}", 2)

            if isinstance(item, dict):
                for sub in item.get("subitems") or []:
                    subText = _entryText(sub)
                    if not subText.strip():
                        continue
                    _addLine(f"{subPrefix}{subText}", 1)

    except Exception as e:
        logger.warning(f"Error adding bullet list to slide: {str(e)}")
def _addHeadingToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], text_frame, font_size_multiplier: float = 1.0) -> None:
    """Append a heading paragraph to ``text_frame``.

    Text and level come from the element's nested ``content`` dict; nothing
    is added when the content is not a dict or the text is empty. The font
    size is the ``heading`` style's ``font_size`` (or 28/22/18/16 for levels
    1/2/3/other when the style has none), scaled by ``font_size_multiplier``
    and never below 12pt. The heading is rendered bold in the style's color.

    Errors are logged as warnings and swallowed.
    """
    try:
        from pptx.util import Pt
        from pptx.dml.color import RGBColor

        payload = element.get("content", {})
        if not isinstance(payload, dict):
            return

        headingText = payload.get("text", "")
        headingLevel = payload.get("level", 1)
        if not headingText:
            return

        para = text_frame.add_paragraph()
        para.level = 0

        style = styles.get("heading", {})
        # Per-level fallback size, only used when the style defines no font_size
        fallbackSize = next(
            (size for lvl, size in ((1, 28), (2, 22), (3, 18)) if headingLevel == lvl),
            16,
        )
        baseSize = style.get("font_size", fallbackSize)
        sizePt = Pt(max(12, int(baseSize * font_size_multiplier)))
        colorRgb = RGBColor(*self._getSafeColor(style.get("color", (31, 78, 121))))

        self._addMarkdownInlineRuns(para, headingText, fontSize=sizePt, fontColor=colorRgb, fontBold=True)

        # H1 gets more room above it than deeper headings
        para.space_before = Pt(12 if headingLevel == 1 else 8)
        para.space_after = Pt(6)
    except Exception as e:
        logger.warning(f"Error adding heading to slide: {str(e)}")
def _addParagraphToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], text_frame, font_size_multiplier: float = 1.0) -> None:
    """Append a body paragraph to ``text_frame``.

    The text is ``content["text"]`` when the element's content is a dict, or
    the content itself when it is a string; nothing is added for empty text.
    Font size, color, bold flag and alignment come from the ``paragraph``
    style; the size is scaled by ``font_size_multiplier`` and never below
    10pt. Inline markdown in the text is rendered as formatted runs.

    Errors are logged as warnings and swallowed.
    """
    try:
        from pptx.util import Pt
        from pptx.dml.color import RGBColor
        from pptx.enum.text import PP_ALIGN

        raw = element.get("content", {})
        if isinstance(raw, dict):
            body = raw.get("text", "")
        else:
            body = raw if isinstance(raw, str) else ""

        if not body:
            return

        para = text_frame.add_paragraph()
        para.level = 0
        # Best effort: switch off bullets where the paragraph object offers
        # a paragraph_format; silently skipped otherwise
        try:
            if hasattr(para, 'paragraph_format'):
                para.paragraph_format.bullet.type = None
        except (AttributeError, TypeError):
            pass

        style = styles.get("paragraph", {})
        scaledSize = max(10, int(style.get("font_size", 14) * font_size_multiplier))
        sizePt = Pt(scaledSize)
        colorRgb = RGBColor(*self._getSafeColor(style.get("color", (47, 47, 47))))
        boldFlag = style.get("bold", False)
        self._addMarkdownInlineRuns(para, body, fontSize=sizePt, fontColor=colorRgb, fontBold=boldFlag)

        # Spacing for readability
        para.space_before = Pt(6)
        para.space_after = Pt(6)
        para.line_spacing = 1.2

        alignName = style.get("align", "left")
        if alignName == "center":
            para.alignment = PP_ALIGN.CENTER
        elif alignName == "right":
            para.alignment = PP_ALIGN.RIGHT
        else:
            para.alignment = PP_ALIGN.LEFT
    except Exception as e:
        logger.warning(f"Error adding paragraph to slide: {str(e)}")
def _addCodeBlockToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], text_frame, font_size_multiplier: float = 1.0) -> None:
    """Append a code block to ``text_frame``.

    When the element names a language, a bold ``Code (<language>):`` label
    paragraph is added first. The code itself follows in its own paragraph
    using the ``code_block`` style (font, size, color). No label paragraph
    is created when there is no language, so no empty paragraph precedes
    the code. The font size is scaled by ``font_size_multiplier`` and never
    drops below 6pt.

    Nothing is added when the content is not a dict or holds no code.
    Errors are logged as warnings and swallowed.
    """
    try:
        from pptx.util import Pt
        from pptx.dml.color import RGBColor

        content = element.get("content", {})
        if not isinstance(content, dict):
            return

        code = content.get("code", "")
        language = content.get("language", "")
        if not code:
            return

        code_style = styles.get("code_block", {})
        code_font = code_style.get("font", "Courier New")
        base_code_font_size = code_style.get("font_size", 9)
        code_font_size = max(6, int(base_code_font_size * font_size_multiplier))  # Minimum 6pt for code
        code_color = self._getSafeColor(code_style.get("color", (47, 47, 47)))

        if language:
            label = text_frame.add_paragraph()
            label.text = f"Code ({language}):"
            label.font.bold = True
            label.font.size = Pt(code_font_size)

        p = text_frame.add_paragraph()
        p.text = code
        p.font.name = code_font
        p.font.size = Pt(code_font_size)
        p.font.color.rgb = RGBColor(*code_color)
    except Exception as e:
        logger.warning(f"Error adding code block to slide: {str(e)}")
def _formatTimestamp(self) -> str:
"""Format current timestamp for presentation generation."""
# datetime and UTC are already imported at module level
return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
def _renderSlideContentWithFrames(self, slide, slide_sections: List[Dict[str, Any]], slide_images: List[Dict[str, Any]], styles: Dict[str, Any], prs) -> None:
    """Render a slide's sections and images into the body area.

    One textbox covering the body area (0.5in side margins, starting 1.3in
    from the top, ending 0.3in above the bottom edge) is created, and every
    section is rendered through ``_renderSectionToTextFrame`` with that
    textbox's frame. Images passed in ``slide_images`` are then placed in
    the same area via ``_addImagesToSlideInFrame``.

    NOTE(review): tables are emitted by ``_renderSectionToTextFrame`` as
    separate shapes without an explicit position. Confirm they end up on the
    visible slide, given that this textbox already spans the body height.

    Errors are logged and swallowed.
    """
    try:
        from pptx.util import Inches

        sideMargin = Inches(0.5)
        bodyTop = Inches(1.3)
        bodyWidth = prs.slide_width - Inches(1)
        bodyHeight = prs.slide_height - bodyTop - Inches(0.3)

        # Single fixed-size textbox shared by all text-like content
        bodyFrame = slide.shapes.add_textbox(sideMargin, bodyTop, bodyWidth, bodyHeight).text_frame
        bodyFrame.word_wrap = True
        bodyFrame.auto_size = None

        for part in slide_sections:
            self._renderSectionToTextFrame(slide, part, styles, bodyFrame, font_size_multiplier=1.0)

        # Standalone images that were passed alongside the sections
        if slide_images:
            self._addImagesToSlideInFrame(slide, slide_images, styles, sideMargin, bodyTop, bodyWidth, bodyHeight)
    except Exception as e:
        logger.error(f"Error rendering slide content: {str(e)}")
def _renderTextSectionsInFrame(self, slide, text_sections: List[Dict[str, Any]], styles: Dict[str, Any], left: float, top: float, width: float, height: float, adaptiveFontSize: bool = False) -> None:
    """Render text sections (paragraphs, lists, headings) into a new textbox.

    Args:
        slide: Target slide.
        text_sections: Sections to render, in order.
        styles: Style dictionary handed on to the element renderers.
        left, top, width, height: Textbox geometry in EMU (the values are
            passed straight to ``add_textbox``).
        adaptiveFontSize: When true, a font size multiplier between 0.5 and
            1.1 is estimated from the amount of text and the frame size;
            otherwise the multiplier is 1.0.

    The estimate assumes a 14pt base font: about 8.4pt per character and
    16.8pt per line. Frame dimensions are converted from EMU to points
    before they are compared with those figures.

    Errors are logged as warnings and swallowed.
    """
    try:
        from pptx.util import Pt

        # 914400 EMU per inch / 72 points per inch
        emuPerPoint = 12700.0

        # Total amount of text, used for adaptive font sizing
        total_text_length = 0
        if adaptiveFontSize:
            for section in text_sections:
                for element in section.get("elements", []):
                    if not isinstance(element, dict):
                        continue
                    if element.get("type", "") not in ("paragraph", "bullet_list", "list", "heading"):
                        continue
                    content = element.get("content", "")
                    if isinstance(content, dict):
                        if "text" in content:
                            total_text_length += len(str(content["text"]))
                        elif "items" in content:
                            for item in content.get("items", []):
                                # Measure the item's text, not the dict's repr
                                itemText = item.get("text", "") if isinstance(item, dict) else item
                                total_text_length += len(str(itemText))
                    elif isinstance(content, str):
                        total_text_length += len(content)

        font_size_multiplier = 1.0
        if adaptiveFontSize and total_text_length > 0:
            try:
                width_pt = float(width) / emuPerPoint
                height_pt = float(height) / emuPerPoint

                avg_char_width_pt = 8.4   # ~0.6 * 14pt
                line_height_pt = 16.8     # 14pt * 1.2 line spacing

                chars_per_line = max(1, int(width_pt / avg_char_width_pt))
                lines_needed = total_text_length / chars_per_line
                available_lines = max(1, int(height_pt / line_height_pt))

                if lines_needed > available_lines:
                    # Shrink to fit with a 10% safety buffer, but stay readable (>= 50%)
                    scale_needed = available_lines / lines_needed
                    font_size_multiplier = max(0.5, min(1.0, scale_needed * 0.9))
                elif lines_needed <= available_lines * 0.7:
                    # Plenty of room: allow a slightly larger font (<= 110%)
                    font_size_multiplier = min(1.1, (available_lines / lines_needed) * 0.8)
            except (ZeroDivisionError, ValueError, TypeError) as calc_error:
                logger.debug(f"Font size calculation error: {str(calc_error)}")
                font_size_multiplier = 1.0

        textbox = slide.shapes.add_textbox(left, top, width, height)
        text_frame = textbox.text_frame
        text_frame.word_wrap = True
        text_frame.auto_size = None  # Disable auto-size for fixed frame

        text_frame.margin_left = Pt(0)
        text_frame.margin_right = Pt(0)
        text_frame.margin_top = Pt(0)
        text_frame.margin_bottom = Pt(0)

        for section in text_sections:
            self._renderSectionToTextFrame(slide, section, styles, text_frame, font_size_multiplier)

    except Exception as e:
        logger.warning(f"Error rendering text sections in frame: {str(e)}")
@staticmethod
def _isHorizontalRule(element: Dict[str, Any]) -> bool:
"""Detect markdown horizontal rules (---, ***, ___) that should be skipped on slides."""
content = element.get("content", {})
text = content.get("text", "") if isinstance(content, dict) else (content if isinstance(content, str) else "")
stripped = text.strip()
return bool(stripped) and all(c in "-*_ " for c in stripped) and len(stripped.replace(" ", "")) >= 3
def _renderSectionToTextFrame(self, slide, section: Dict[str, Any], styles: Dict[str, Any], text_frame, font_size_multiplier: float = 1.0) -> None:
    """Render every element of one section.

    Each dict element is dispatched on its own ``type`` (falling back to the
    section's ``content_type``). Images and markdown horizontal rules are
    skipped. Tables are added to the slide as separate shapes. Lists,
    headings, paragraphs, code blocks, extracted text and references are
    appended to ``text_frame``. Unknown types are rendered as paragraphs.

    For ``extracted_text`` the content may be a string or a dict carrying a
    ``text`` entry; any other non-empty value is converted with ``str``.

    Errors are logged as warnings and swallowed.
    """
    try:
        from pptx.util import Pt
        from pptx.enum.text import PP_ALIGN
        from pptx.dml.color import RGBColor

        sectionType = section.get("content_type", "paragraph")
        elements = section.get("elements", [])
        if not elements:
            return

        for element in elements:
            if not isinstance(element, dict):
                continue

            elementType = element.get("type", "") or sectionType

            if elementType == "image":
                continue

            # Skip horizontal rules (---, ***, ___)
            if elementType == "paragraph" and self._isHorizontalRule(element):
                continue

            if elementType == "table":
                self._addTableToSlide(slide, element, styles)
            elif elementType in ("bullet_list", "list"):
                self._addBulletListToSlide(slide, element, styles, text_frame, font_size_multiplier)
            elif elementType == "heading":
                self._addHeadingToSlide(slide, element, styles, text_frame, font_size_multiplier)
            elif elementType == "paragraph":
                self._addParagraphToSlide(slide, element, styles, text_frame, font_size_multiplier)
            elif elementType in ("code_block", "code"):
                self._addCodeBlockToSlide(slide, element, styles, text_frame, font_size_multiplier)
            elif elementType == "extracted_text":
                content = element.get("content", "")
                # Accept the nested {"text": ...} shape used by other elements
                if isinstance(content, dict):
                    content = content.get("text", "")
                if content and not isinstance(content, str):
                    content = str(content)
                if content:
                    p = text_frame.add_paragraph()
                    pStyle = styles.get("paragraph", {})
                    fSize = Pt(max(10, int(pStyle.get("font_size", 14) * font_size_multiplier)))
                    fColor = RGBColor(*self._getSafeColor(pStyle.get("color", (47, 47, 47))))
                    self._addMarkdownInlineRuns(p, content, fontSize=fSize, fontColor=fColor)
                    p.alignment = PP_ALIGN.LEFT
            elif elementType == "reference":
                label = element.get("label", "Reference")
                p = text_frame.add_paragraph()
                p.text = f"[Reference: {label}]"
                p.font.italic = True
                p.alignment = PP_ALIGN.LEFT
            else:
                # Unknown element types are rendered as plain paragraphs
                self._addParagraphToSlide(slide, element, styles, text_frame, font_size_multiplier)
    except Exception as e:
        logger.warning(f"Error rendering section to text frame: {str(e)}")
def _addImagesToSlideInFrame(self, slide, images: List[Dict[str, Any]], styles: Dict[str, Any], left: float, top: float, width: float, height: float) -> None:
    """Add images to the slide inside the frame ``(left, top, width, height)``.

    A single image is scaled down (never up) to fit the frame while keeping
    its aspect ratio, kept at least 1in on each side where the frame allows,
    centred, and captioned when ``caption`` / ``altText`` is present and not
    the placeholder ``"Image"``. Several images are laid out in a grid
    (2 columns for up to 4 images, 3 otherwise), each scaled down to fit its
    cell and centred in it; when the image size cannot be determined the
    image is stretched to the cell.

    Base64 payloads may carry a ``data:image/...`` URI prefix and embedded
    whitespace; both are removed before strict decoding. Pixel sizes from
    PIL are converted to EMU assuming 96 pixels per inch.

    Args:
        slide: Target slide.
        images: Dicts with ``base64Data`` and optional ``caption`` /
            ``altText``.
        styles: Style dictionary (not used by this method).
        left, top, width, height: Frame geometry in EMU.

    Errors are logged and swallowed; a failing image does not prevent the
    remaining grid images from being added.
    """
    try:
        from pptx.util import Inches, Pt
        from pptx.enum.text import PP_ALIGN
        import base64
        import binascii
        import io

        if not images:
            logger.debug("No images to render in frame")
            return

        logger.info(f"Rendering {len(images)} image(s) in frame at ({left}, {top}), size ({width}, {height})")

        # 914400 EMU per inch at 96 pixels per inch -> 9525 EMU per pixel
        emuPerPixel = int(Inches(1)) // 96

        def _decodeBase64(rawData):
            # Strip a data-URI prefix and all whitespace, then decode strictly
            if isinstance(rawData, str):
                if rawData.startswith("data:image/"):
                    rawData = rawData.split(",", 1)[1]
                rawData = "".join(rawData.split())
            return base64.b64decode(rawData, validate=True)

        if len(images) == 1:
            # Single image: fit to frame
            img = images[0]
            base64Data = img.get("base64Data")
            if not base64Data:
                logger.warning("Image has no base64Data")
                return

            try:
                imageBytes = _decodeBase64(base64Data)
                if len(imageBytes) == 0:
                    logger.error("Decoded image bytes are empty")
                    return

                imageStream = io.BytesIO(imageBytes)

                # Pixel dimensions via PIL, with 800x600 as the fallback
                try:
                    from PIL import Image as PILImage
                    imgWidth, imgHeight = PILImage.open(imageStream).size
                    if imgWidth <= 1 or imgHeight <= 1:
                        logger.warning(f"Image has invalid dimensions: {imgWidth}x{imgHeight}, using default size")
                        imgWidth, imgHeight = 800, 600
                    elif imgWidth < 100 or imgHeight < 100:
                        logger.warning(f"Image dimensions very small: {imgWidth}x{imgHeight}, may appear tiny")
                except ImportError:
                    logger.warning("PIL not available, using default image size")
                    imgWidth, imgHeight = 800, 600
                except Exception as pil_error:
                    logger.warning(f"Error getting image dimensions with PIL: {str(pil_error)}, using default size")
                    imgWidth, imgHeight = 800, 600

                nativeWidth = imgWidth * emuPerPixel
                nativeHeight = imgHeight * emuPerPixel

                try:
                    # Don't scale up, only down
                    scale = min(width / nativeWidth, height / nativeHeight, 1.0)
                    finalWidth = nativeWidth * scale
                    finalHeight = nativeHeight * scale

                    # Ensure minimum size (at least 1 inch) to prevent tiny rendering
                    minSize = Inches(1)
                    if finalWidth < minSize or finalHeight < minSize:
                        min_scale = max(minSize / nativeWidth, minSize / nativeHeight)
                        finalWidth = max(minSize, nativeWidth * min_scale)
                        finalHeight = max(minSize, nativeHeight * min_scale)

                    # The minimum size must not push the image out of the frame
                    if finalWidth > width:
                        finalWidth = width
                        finalHeight = nativeHeight * (width / nativeWidth)
                    if finalHeight > height:
                        finalHeight = height
                        finalWidth = nativeWidth * (height / nativeHeight)
                except (ZeroDivisionError, TypeError, AttributeError) as calc_error:
                    logger.warning(f"Error calculating image size: {str(calc_error)}, using frame size")
                    finalWidth = width * 0.9
                    finalHeight = height * 0.9

                finalWidth = int(finalWidth)
                finalHeight = int(finalHeight)

                # Center in frame
                frame_left = int(left + (width - finalWidth) / 2)
                frame_top = int(top + (height - finalHeight) / 2)

                imageStream.seek(0)
                slide.shapes.add_picture(imageStream, frame_left, frame_top, width=finalWidth, height=finalHeight)
                logger.info(f"Successfully added image to slide at ({frame_left}, {frame_top}), size ({finalWidth}, {finalHeight})")

                # Add caption if available
                caption = img.get("caption") or img.get("altText")
                if caption and caption != "Image":
                    captionTop = frame_top + finalHeight + Inches(0.1)
                    captionBox = slide.shapes.add_textbox(left, captionTop, width, Inches(0.4))
                    captionFrame = captionBox.text_frame
                    captionFrame.text = caption
                    captionFrame.paragraphs[0].font.size = Pt(10)
                    captionFrame.paragraphs[0].font.italic = True
                    captionFrame.paragraphs[0].alignment = PP_ALIGN.CENTER

            except binascii.Error as b64_error:
                logger.error(f"Invalid base64 data: {str(b64_error)}")
            except Exception as img_error:
                logger.error(f"Error adding image to frame: {str(img_error)}", exc_info=True)
        else:
            # Multiple images: grid layout
            cols = 2 if len(images) <= 4 else 3
            rows = (len(images) + cols - 1) // cols
            gap = Inches(0.2)

            cellWidth = (width - gap * (cols - 1)) / cols
            cellHeight = (height - gap * (rows - 1)) / rows

            for idx, img in enumerate(images):
                base64Data = img.get("base64Data")
                if not base64Data:
                    logger.warning(f"Image {idx} has no base64Data")
                    continue

                row, col = divmod(idx, cols)
                img_left = left + col * (cellWidth + gap)
                img_top = top + row * (cellHeight + gap)

                try:
                    imageBytes = _decodeBase64(base64Data)
                    if len(imageBytes) == 0:
                        logger.error(f"Decoded image {idx} bytes are empty")
                        continue

                    imageStream = io.BytesIO(imageBytes)

                    try:
                        from PIL import Image as PILImage
                        imgW, imgH = PILImage.open(imageStream).size

                        # Scale down to the grid cell, keeping the aspect ratio
                        nativeW = imgW * emuPerPixel
                        nativeH = imgH * emuPerPixel
                        scale = min(cellWidth / nativeW, cellHeight / nativeH, 1.0)
                        finalW = int(nativeW * scale)
                        finalH = int(nativeH * scale)

                        # Center in grid cell
                        cell_left = int(img_left + (cellWidth - finalW) / 2)
                        cell_top = int(img_top + (cellHeight - finalH) / 2)

                        imageStream.seek(0)
                        slide.shapes.add_picture(imageStream, cell_left, cell_top, width=finalW, height=finalH)
                    except Exception:
                        # Size unknown (PIL missing or unreadable): fill the cell
                        imageStream.seek(0)
                        slide.shapes.add_picture(imageStream, int(img_left), int(img_top), width=int(cellWidth), height=int(cellHeight))

                    logger.info(f"Successfully added image {idx+1}/{len(images)} to slide grid")
                except binascii.Error as b64_error:
                    logger.error(f"Invalid base64 data for image {idx}: {str(b64_error)}")
                except Exception as img_error:
                    logger.error(f"Error adding image {idx} to frame: {str(img_error)}", exc_info=True)

    except Exception as e:
        logger.error(f"Error adding images to slide frame: {str(e)}", exc_info=True)