227 lines
8.8 KiB
Python
227 lines
8.8 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
import logging
|
|
import base64
|
|
from typing import List, Dict, Any, Optional
|
|
from modules.datamodels.datamodelExtraction import ContentPart, ContentExtracted
|
|
from ..subRegistry import Extractor
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PptxExtractor(Extractor):
    """
    Extractor for PowerPoint files.

    Supported formats:
    - MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint
    - File extensions: .pptx, .ppt
    - Special handling: Extracts slide content, tables, and images
    - Dependencies: python-pptx

    Output of extract(), in order:
    1. one "presentation_overview" text part,
    2. per slide: one image part per picture shape, followed by one
       "slide_<n>" part holding the slide's text and Markdown tables
       (only when the slide has any).

    python-pptx is imported lazily on the first extract() call, so this
    module can be imported when the dependency is missing.

    NOTE(review): python-pptx documents support for Open XML (.pptx) only.
    Legacy binary .ppt files are still claimed by detect() and are expected
    to end up as the error part returned by extract() - confirm whether
    .ppt should be routed to a different extractor.
    """

    # Single source of truth for detect() and the getSupported*() accessors.
    _MIME_TYPES = (
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.ms-powerpoint",
    )
    _EXTENSIONS = (".pptx", ".ppt")

    # Integer value of MSO_SHAPE_TYPE.PICTURE; kept numeric so the enum does
    # not have to be imported before python-pptx is known to be available.
    _PICTURE_SHAPE_TYPE = 13

    # Used only when python-pptx does not report a content type for an image.
    _FALLBACK_IMAGE_MIME = "image/png"

    # Number of text elements per slide quoted in the overview part.
    _OVERVIEW_TEXTS_PER_SLIDE = 3

    def __init__(self):
        # _loaded: the import of python-pptx has been attempted.
        # _haveLibs: that import succeeded.
        self._loaded = False
        self._haveLibs = False

    def _load(self):
        """Import python-pptx once per instance and record whether it worked."""
        global Presentation
        if self._loaded:
            return
        self._loaded = True
        try:
            from pptx import Presentation
            self._haveLibs = True
        except Exception:
            # Best effort: any import-time failure means "library unavailable";
            # extract() reports it to the caller as an error part.
            logger.debug("python-pptx could not be imported", exc_info=True)
            self._haveLibs = False

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """
        Tell whether this extractor handles the given file.

        Args:
            fileName: File name; may be empty or None.
            mimeType: Declared MIME type of the file.
            headBytes: Leading bytes of the file (not inspected here).

        Returns:
            True when the MIME type is a supported PowerPoint type or the
            file name ends with a supported extension (case-insensitive).
        """
        if mimeType in self._MIME_TYPES:
            return True
        return (fileName or "").lower().endswith(self._EXTENSIONS)

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return list(self._EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return list(self._MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """
        Extract content from PowerPoint files.

        Args:
            fileBytes: Raw file data as bytes
            context: Context dictionary with file information; only the
                "fileName" key is read (defaults to "presentation.pptx")

        Returns:
            List of ContentPart objects with extracted content. On failure
            (python-pptx missing, unreadable file, ...) the list holds a
            single part with id "error" and metadata["error"] set to True;
            no exception is raised to the caller.
        """
        self._load()

        if not self._haveLibs:
            logger.error("python-pptx library not installed. Install with: pip install python-pptx")
            return [self._errorPart(
                "Error: python-pptx library not installed",
                "python-pptx library not installed",
            )]

        try:
            import io

            # Load presentation from bytes
            presentation = Presentation(io.BytesIO(fileBytes))
            totalSlides = len(presentation.slides)

            parts: List[ContentPart] = []
            overviewSections: List[str] = []

            for slideIndex, slide in enumerate(presentation.slides, 1):
                texts: List[str] = []
                tables: List[str] = []

                # One pass over the shapes; text and tables are collected
                # separately so the slide part keeps "all text, then all
                # tables" ordering.
                for shape in slide.shapes:
                    text = (getattr(shape, "text", None) or "").strip()
                    if text:
                        texts.append(text)

                    if getattr(shape, "has_table", False):
                        rows = [
                            [cell.text.strip() for cell in row.cells]
                            for row in shape.table.rows
                        ]
                        if rows:
                            tables.append(self._table_to_markdown(rows))

                    if self._isPicture(shape):
                        try:
                            parts.append(self._imagePart(shape, slideIndex, len(parts)))
                        except Exception as e:
                            # A single unreadable image (e.g. a linked, not
                            # embedded, picture) must not lose the slide.
                            logger.warning(
                                "Failed to extract image from slide %s: %s", slideIndex, e
                            )

                slideContent = texts + tables
                if slideContent:
                    slideText = f"# Slide {slideIndex}\n\n" + "\n\n".join(slideContent)
                    parts.append(ContentPart(
                        id=f"slide_{slideIndex}",
                        label=f"Slide {slideIndex} Content",
                        typeGroup="structure",
                        mimeType="text/plain",
                        data=slideText,
                        metadata={
                            "slide_number": slideIndex,
                            "content_type": "slide",
                            "extracted_from": "powerpoint",
                            "text_length": len(slideText),
                        },
                    ))

                # Overview summary: the first few text elements of the slide.
                if texts:
                    summary = "\n".join(texts[:self._OVERVIEW_TEXTS_PER_SLIDE])
                    overviewSections.append(f"## Slide {slideIndex}\n{summary}\n\n")

            # Create presentation overview
            fileName = (context or {}).get("fileName", "presentation.pptx")
            overviewText = (
                f"# PowerPoint Presentation: {fileName}\n\n"
                f"**Total Slides:** {totalSlides}\n\n"
                f"**Content Parts:** {len(parts)}\n\n"
                + "".join(overviewSections)
            )

            overviewPart = ContentPart(
                id="presentation_overview",
                label="Presentation Overview",
                typeGroup="text",
                mimeType="text/plain",
                data=overviewText,
                metadata={
                    "content_type": "overview",
                    "extracted_from": "powerpoint",
                    "total_slides": totalSlides,
                    "text_length": len(overviewText),
                },
            )
            parts.insert(0, overviewPart)  # Overview always comes first

            return parts

        except Exception as e:
            # Top-level boundary: callers get an error part, never an exception.
            logger.exception("Error extracting PowerPoint content: %s", e)
            return [self._errorPart(
                f"Error extracting PowerPoint content: {str(e)}",
                str(e),
            )]

    def _errorPart(self, data: str, errorMessage: str) -> ContentPart:
        """Build the single text part that reports a failed extraction."""
        return ContentPart(
            id="error",
            label="PowerPoint Extraction Error",
            typeGroup="text",
            mimeType="text/plain",
            data=data,
            metadata={"error": True, "error_message": errorMessage},
        )

    def _isPicture(self, shape: Any) -> bool:
        """Return True when the shape reports the PICTURE shape type."""
        try:
            return shape.shape_type == self._PICTURE_SHAPE_TYPE
        except (NotImplementedError, AttributeError):
            # python-pptx can raise NotImplementedError from shape_type for
            # shapes it cannot classify; such a shape is simply not a picture
            # and must not abort the whole extraction.
            return False

    def _imagePart(self, shape: Any, slideIndex: int, partIndex: int) -> ContentPart:
        """Build a base64 image part from a picture shape; may raise if the image is unreadable."""
        image = shape.image
        encoded = base64.b64encode(image.blob).decode("utf-8")
        # Report the image's real MIME type instead of always claiming PNG.
        mimeType = getattr(image, "content_type", None) or self._FALLBACK_IMAGE_MIME
        return ContentPart(
            id=f"slide_{slideIndex}_image_{partIndex}",
            label=f"Slide {slideIndex} Image",
            typeGroup="image",
            mimeType=mimeType,
            data=encoded,
            metadata={
                "slide_number": slideIndex,
                "shape_type": "image",
                "extracted_from": "powerpoint",
            },
        )

    def _table_to_markdown(self, table_data: List[List[str]]) -> str:
        """
        Convert table data to markdown format.

        The first row becomes the header, followed by a "---" separator row
        sized to the header, then the remaining rows.

        Args:
            table_data: Rows of cell texts; the first row is the header.

        Returns:
            The Markdown table, or "" when table_data is empty.
        """
        if not table_data:
            return ""

        def cleanCell(value: Any) -> str:
            # A literal pipe or a line break inside a cell would split the
            # Markdown row, so escape the former and flatten the latter.
            text = "" if value is None else str(value)
            text = " ".join(segment for segment in text.splitlines() if segment.strip())
            return text.replace("|", "\\|")

        def renderRow(cells: List[str]) -> str:
            return "| " + " | ".join(cells) + " |"

        headerCells = [cleanCell(cell) for cell in table_data[0]]
        markdownLines = [
            renderRow(headerCells),
            renderRow(["---"] * len(headerCells)),
        ]
        markdownLines.extend(
            renderRow([cleanCell(cell) for cell in row]) for row in table_data[1:]
        )
        return "\n".join(markdownLines)
|
|
|