# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import logging
import base64
from typing import List, Dict, Any, Optional
from modules.datamodels.datamodelExtraction import ContentPart, ContentExtracted
from ..subRegistry import Extractor

logger = logging.getLogger(__name__)


class PptxExtractor(Extractor):
    """
    Extractor for PowerPoint files.

    Supported formats:
    - MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint
    - File extensions: .pptx, .ppt
    - Special handling: Extracts slide content, tables, and images
    - Dependencies: python-pptx

    NOTE(review): python-pptx documents support for the OOXML (.pptx) format;
    legacy binary .ppt input is accepted by detect() but is expected to end
    up as the error part returned by extract() - confirm whether .ppt should
    stay advertised.
    """

    _SUPPORTED_MIME_TYPES = (
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.ms-powerpoint",
    )
    _SUPPORTED_EXTENSIONS = (".pptx", ".ppt")

    # Numeric value of MSO_SHAPE_TYPE.PICTURE; kept as a plain int so that the
    # class body does not need python-pptx to be importable.
    _PICTURE_SHAPE_TYPE = 13

    # Used only when the embedded image does not report its own content type.
    _DEFAULT_IMAGE_MIME_TYPE = "image/png"

    def __init__(self):
        self._loaded = False
        self._haveLibs = False

    def _load(self):
        """Import python-pptx lazily, once, and record whether it is available."""
        global Presentation
        if self._loaded:
            return
        self._loaded = True
        try:
            from pptx import Presentation
            self._haveLibs = True
        except Exception:
            # Best effort: any import-time failure means "library unavailable";
            # extract() reports that to the caller as an error part.
            logger.debug("python-pptx could not be imported", exc_info=True)
            self._haveLibs = False

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True when the MIME type or the file extension is a PowerPoint one.

        Args:
            fileName: File name; may be None or empty.
            mimeType: MIME type reported for the file.
            headBytes: Leading bytes of the file (not inspected here).
        """
        if mimeType in self._SUPPORTED_MIME_TYPES:
            return True
        return (fileName or "").lower().endswith(self._SUPPORTED_EXTENSIONS)

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return list(self._SUPPORTED_EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return list(self._SUPPORTED_MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """
        Extract content from PowerPoint files.

        The result starts with an overview part, followed per slide by that
        slide's image parts and then its text/table part (if it has any text
        or table content).

        Args:
            fileBytes: Raw file data as bytes
            context: Context dictionary with file information; only the
                "fileName" key is read here.

        Returns:
            List of ContentPart objects with extracted content. On failure a
            single-element list holding an error part (id="error") is
            returned instead of raising.
        """
        self._load()
        if not self._haveLibs:
            logger.error("python-pptx library not installed. Install with: pip install python-pptx")
            return [self._error_part(
                "Error: python-pptx library not installed",
                "python-pptx library not installed",
            )]

        try:
            import io

            # Load presentation from bytes
            presentation = Presentation(io.BytesIO(fileBytes))
            parts: List[ContentPart] = []
            # (slide number, text blocks) for slides that contain text; reused
            # for the overview so the deck is only walked once.
            slide_summaries = []

            for slide_index, slide in enumerate(presentation.slides, 1):
                text_blocks = self._collect_slide_text(slide)
                slide_content = text_blocks + self._collect_slide_tables(slide)

                # Image parts precede the slide's own content part.
                self._append_slide_images(slide, slide_index, parts)

                if slide_content:
                    slide_text = f"# Slide {slide_index}\n\n" + "\n\n".join(slide_content)
                    parts.append(ContentPart(
                        id=f"slide_{slide_index}",
                        label=f"Slide {slide_index} Content",
                        typeGroup="structure",
                        mimeType="text/plain",
                        data=slide_text,
                        metadata={
                            "slide_number": slide_index,
                            "content_type": "slide",
                            "extracted_from": "powerpoint",
                            "text_length": len(slide_text),
                        },
                    ))

                if text_blocks:
                    slide_summaries.append((slide_index, text_blocks))

            # Fall back to the default name for a missing, None or empty fileName.
            file_name = (context or {}).get("fileName") or "presentation.pptx"
            total_slides = len(presentation.slides)
            overview_text = self._build_overview(
                file_name, total_slides, len(parts), slide_summaries
            )

            overview_part = ContentPart(
                id="presentation_overview",
                label="Presentation Overview",
                typeGroup="text",
                mimeType="text/plain",
                data=overview_text,
                metadata={
                    "content_type": "overview",
                    "extracted_from": "powerpoint",
                    "total_slides": total_slides,
                    "text_length": len(overview_text),
                },
            )
            parts.insert(0, overview_part)  # Overview always comes first
            return parts

        except Exception as e:
            # Top-level boundary: callers get an error part rather than an exception.
            logger.error("Error extracting PowerPoint content: %s", e)
            return [self._error_part(
                f"Error extracting PowerPoint content: {str(e)}",
                str(e),
            )]

    def _error_part(self, data: str, error_message: str) -> ContentPart:
        """Build the single text part used to report an extraction failure."""
        return ContentPart(
            id="error",
            label="PowerPoint Extraction Error",
            typeGroup="text",
            mimeType="text/plain",
            data=data,
            metadata={"error": True, "error_message": error_message},
        )

    def _collect_slide_text(self, slide) -> List[str]:
        """Return the stripped, non-empty text of every shape exposing ``text``."""
        return [
            shape.text.strip()
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text.strip()
        ]

    def _collect_slide_tables(self, slide) -> List[str]:
        """Return one markdown table per table shape that has at least one row."""
        tables = []
        for shape in slide.shapes:
            if not getattr(shape, "has_table", False):
                continue
            table_data = [
                [cell.text.strip() for cell in row.cells]
                for row in shape.table.rows
            ]
            if table_data:
                tables.append(self._table_to_markdown(table_data))
        return tables

    def _append_slide_images(self, slide, slide_index: int, parts: List[ContentPart]) -> None:
        """Append one base64 image part to ``parts`` per picture shape on the slide."""
        for shape in slide.shapes:
            try:
                is_picture = shape.shape_type == self._PICTURE_SHAPE_TYPE
            except NotImplementedError:
                # python-pptx raises this for shapes it cannot classify; such a
                # shape is not a picture and must not abort the whole extraction.
                continue
            if not is_picture:
                continue

            try:
                image = shape.image
                image_b64 = base64.b64encode(image.blob).decode('utf-8')
                # Report the real format (e.g. image/jpeg) instead of always PNG.
                mime_type = getattr(image, "content_type", None) or self._DEFAULT_IMAGE_MIME_TYPE
                parts.append(ContentPart(
                    # len(parts) keeps ids unique across the whole presentation.
                    id=f"slide_{slide_index}_image_{len(parts)}",
                    label=f"Slide {slide_index} Image",
                    typeGroup="image",
                    mimeType=mime_type,
                    data=image_b64,
                    metadata={
                        "slide_number": slide_index,
                        "shape_type": "image",
                        "extracted_from": "powerpoint",
                    },
                ))
            except Exception as e:
                # Best effort: a broken or linked-only picture is skipped.
                logger.warning("Failed to extract image from slide %s: %s", slide_index, e)

    def _build_overview(self, file_name: str, total_slides: int, part_count: int,
                        slide_summaries) -> str:
        """Build the overview text: header, counts, and up to 3 text blocks per slide."""
        chunks = [
            f"# PowerPoint Presentation: {file_name}\n\n",
            f"**Total Slides:** {total_slides}\n\n",
            f"**Content Parts:** {part_count}\n\n",
        ]
        for slide_number, text_blocks in slide_summaries:
            chunks.append(f"## Slide {slide_number}\n")
            chunks.append("\n".join(text_blocks[:3]))  # First 3 text elements
            chunks.append("\n\n")
        return "".join(chunks)

    @staticmethod
    def _escape_markdown_cell(value) -> str:
        """Make a cell value safe for a single-line markdown table cell."""
        text = "" if value is None else str(value)
        # Normalise every line-break flavour (including the vertical tab that
        # presentation text may use for soft breaks) before replacing it.
        text = text.replace("\r\n", "\n").replace("\r", "\n").replace("\v", "\n")
        # A raw pipe would start a new column; a raw newline would end the row.
        return text.replace("|", "\\|").replace("\n", "<br>")

    def _table_to_markdown(self, table_data: List[List[str]]) -> str:
        """Convert table data to markdown format.

        The first row is used as the header. Pipes and line breaks inside
        cells are escaped, and rows shorter than the header are padded with
        empty cells so every row has at least as many columns as the header.

        Args:
            table_data: Rows of cell texts; the first row is the header.

        Returns:
            The markdown table, or an empty string when there are no rows.
        """
        if not table_data:
            return ""

        width = len(table_data[0])

        def render(row) -> str:
            cells = [self._escape_markdown_cell(cell) for cell in row]
            cells.extend([""] * (width - len(cells)))
            return "| " + " | ".join(cells) + " |"

        markdown_lines = [
            render(table_data[0]),
            "| " + " | ".join(["---"] * width) + " |",
        ]
        markdown_lines.extend(render(row) for row in table_data[1:])
        return "\n".join(markdown_lines)