gateway/modules/services/serviceExtraction/extractors/extractorPptx.py
2025-10-13 22:03:28 +02:00

225 lines
8.8 KiB
Python

import logging
import base64
from typing import List, Dict, Any, Optional
from modules.datamodels.datamodelExtraction import ContentPart, ContentExtracted
from ..subRegistry import Extractor
logger = logging.getLogger(__name__)
class PptxExtractor(Extractor):
    """
    Extractor for PowerPoint files.

    Supported formats:
    - MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint
    - File extensions: .pptx, .ppt
    - Special handling: Extracts slide content, tables, and images
    - Dependencies: python-pptx (imported lazily on the first extract() call)

    extract() returns, in order: one overview part, then for every slide its
    image parts followed by one text part holding the slide's text shapes and
    its tables rendered as markdown.

    NOTE(review): legacy binary .ppt files are accepted by detect(), but
    python-pptx presumably only parses the Open XML (.pptx) format, in which
    case they end up as an error part - confirm.
    """

    # Single source of truth for detect() and the getSupported*() accessors.
    _MIME_TYPES = (
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.ms-powerpoint",
    )
    _EXTENSIONS = (".pptx", ".ppt")

    # Numeric value of MSO_SHAPE_TYPE.PICTURE; kept numeric so that no pptx
    # import is needed at class-definition time.
    _PICTURE_SHAPE_TYPE = 13

    # Used only when the library cannot tell us the real image MIME type.
    _DEFAULT_IMAGE_MIME = "image/png"

    def __init__(self):
        self._loaded = False
        self._haveLibs = False

    def _load(self):
        """Import python-pptx once and remember whether it is available."""
        if self._loaded:
            return
        self._loaded = True
        try:
            global Presentation
            from pptx import Presentation
            self._haveLibs = True
        except Exception:
            # Deliberately broad: any import-time failure means "not usable".
            self._haveLibs = False

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True when the MIME type or the file extension is a PowerPoint one."""
        if mimeType in self._MIME_TYPES:
            return True
        return (fileName or "").lower().endswith(self._EXTENSIONS)

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return list(self._EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return list(self._MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """
        Extract content from PowerPoint files.

        Args:
            fileBytes: Raw file data as bytes
            context: Context dictionary with file information; only the
                "fileName" key is read. None is treated as an empty dict.

        Returns:
            List of ContentPart objects with extracted content. On failure
            (library missing or unparsable file) a single part with
            metadata["error"] set to True is returned instead of raising.
        """
        self._load()
        if not self._haveLibs:
            logger.error("python-pptx library not installed. Install with: pip install python-pptx")
            return [self._error_part(
                "Error: python-pptx library not installed",
                "python-pptx library not installed",
            )]

        try:
            import io

            # Load presentation from bytes
            presentation = Presentation(io.BytesIO(fileBytes))
            parts = []
            # (slide number, text snippets) pairs reused for the overview so
            # the deck does not have to be walked a second time.
            slide_summaries = []

            for slide_index, slide in enumerate(presentation.slides, 1):
                texts = []
                tables = []

                for shape in slide.shapes:
                    text = getattr(shape, "text", None)
                    if isinstance(text, str) and text.strip():
                        texts.append(text.strip())

                    if getattr(shape, "has_table", False):
                        table_data = [
                            [cell.text.strip() for cell in row.cells]
                            for row in shape.table.rows
                        ]
                        if table_data:
                            tables.append(self._table_to_markdown(table_data))

                    if self._is_picture(shape):
                        image_part = self._image_part(shape, slide_index, len(parts))
                        if image_part is not None:
                            parts.append(image_part)

                # Text shapes first, then tables: same ordering as before.
                slide_content = texts + tables
                if slide_content:
                    slide_text = f"# Slide {slide_index}\n\n" + "\n\n".join(slide_content)
                    parts.append(ContentPart(
                        id=f"slide_{slide_index}",
                        label=f"Slide {slide_index} Content",
                        typeGroup="structure",
                        mimeType="text/plain",
                        data=slide_text,
                        metadata={
                            "slide_number": slide_index,
                            "content_type": "slide",
                            "extracted_from": "powerpoint",
                            "text_length": len(slide_text)
                        }
                    ))

                if texts:
                    slide_summaries.append((slide_index, texts))

            # Create presentation overview
            total_slides = len(presentation.slides)
            file_name = (context or {}).get("fileName") or "presentation.pptx"
            overview_chunks = [
                f"# PowerPoint Presentation: {file_name}\n\n",
                f"**Total Slides:** {total_slides}\n\n",
                # Counted before the overview itself is inserted.
                f"**Content Parts:** {len(parts)}\n\n",
            ]
            for slide_number, texts in slide_summaries:
                overview_chunks.append(f"## Slide {slide_number}\n")
                overview_chunks.append("\n".join(texts[:3]))  # First 3 text elements
                overview_chunks.append("\n\n")
            overview_text = "".join(overview_chunks)

            overview_part = ContentPart(
                id="presentation_overview",
                label="Presentation Overview",
                typeGroup="text",
                mimeType="text/plain",
                data=overview_text,
                metadata={
                    "content_type": "overview",
                    "extracted_from": "powerpoint",
                    "total_slides": total_slides,
                    "text_length": len(overview_text)
                }
            )
            parts.insert(0, overview_part)  # Insert at beginning
            return parts
        except Exception as e:
            logger.error("Error extracting PowerPoint content: %s", e)
            return [self._error_part(
                f"Error extracting PowerPoint content: {str(e)}",
                str(e),
            )]

    def _error_part(self, data: str, message: str) -> ContentPart:
        """Build the single error part returned when extraction cannot proceed."""
        return ContentPart(
            id="error",
            label="PowerPoint Extraction Error",
            typeGroup="text",
            mimeType="text/plain",
            data=data,
            metadata={"error": True, "error_message": message}
        )

    def _is_picture(self, shape) -> bool:
        """Return True when the shape reports the picture shape type."""
        try:
            return shape.shape_type == self._PICTURE_SHAPE_TYPE
        except NotImplementedError:
            # A shape kind the library cannot classify must not abort the
            # extraction of the whole presentation.
            return False

    def _image_part(self, shape, slide_index: int, sequence: int) -> Optional[ContentPart]:
        """Build a base64 image part for a picture shape, or None if it cannot be read."""
        try:
            image = shape.image
            image_b64 = base64.b64encode(image.blob).decode('utf-8')
            # Report the real format instead of labelling every image as PNG.
            mime_type = getattr(image, "content_type", None) or self._DEFAULT_IMAGE_MIME
            return ContentPart(
                id=f"slide_{slide_index}_image_{sequence}",
                label=f"Slide {slide_index} Image",
                typeGroup="image",
                mimeType=mime_type,
                data=image_b64,
                metadata={
                    "slide_number": slide_index,
                    "shape_type": "image",
                    "extracted_from": "powerpoint"
                }
            )
        except Exception as e:
            # Best effort: one unreadable image should not lose the slide.
            logger.warning("Failed to extract image from slide %s: %s", slide_index, e)
            return None

    @staticmethod
    def _escape_markdown_cell(value) -> str:
        """Make a cell value safe to embed in a single markdown table row."""
        text = "" if value is None else str(value)
        # A raw pipe would be read as a column separator.
        text = text.replace("|", "\\|")
        # Any line break (including the vertical tab) would end the row.
        for line_break in ("\r\n", "\n", "\r", "\v"):
            text = text.replace(line_break, "<br>")
        return text

    def _table_to_markdown(self, table_data: List[List[str]]) -> str:
        """
        Convert table data to markdown format.

        The first row is used as the header. Pipes and line breaks inside
        cells are escaped, and short rows are padded with empty cells so that
        every row has the same number of columns.

        Args:
            table_data: Rows of cell texts; the first row is the header.

        Returns:
            The markdown table, or an empty string for empty input.
        """
        if not table_data:
            return ""

        width = max(len(row) for row in table_data)

        def render(row):
            cells = [self._escape_markdown_cell(cell) for cell in row]
            cells.extend([""] * (width - len(cells)))
            return "| " + " | ".join(cells) + " |"

        markdown_lines = [
            render(table_data[0]),
            "| " + " | ".join(["---"] * width) + " |",
        ]
        markdown_lines.extend(render(row) for row in table_data[1:])
        return "\n".join(markdown_lines)