# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import base64
import io
import logging
from typing import Any, Dict, List

from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor

logger = logging.getLogger(__name__)


class PdfExtractor(Extractor):
    """
    Extractor for PDF files.

    Supported formats:
    - MIME types: application/pdf
    - File extensions: .pdf
    - Special handling: Extracts text per page and embedded images
    - Dependencies: PyPDF2, PyMuPDF (fitz)

    The PDF libraries are imported lazily on the first call to ``extract``.
    When they cannot be imported, the file is returned as a single
    base64-encoded binary part instead of being parsed.
    """

    def __init__(self):
        # _loaded: the lazy import has been attempted (successfully or not).
        # _haveLibs: both PyPDF2 and fitz were imported successfully.
        self._loaded = False
        self._haveLibs = False

    def _load(self):
        """Import PyPDF2 and PyMuPDF once and record whether both are usable."""
        if self._loaded:
            return
        self._loaded = True
        try:
            # Bound as module globals so the extraction helpers can use them.
            global PyPDF2, fitz
            import PyPDF2
            import fitz  # PyMuPDF
            self._haveLibs = True
        except Exception:
            # Deliberately broad: any import-time failure means "no PDF libs".
            logger.debug("PDF libraries could not be imported", exc_info=True)
            self._haveLibs = False

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for the PDF MIME type or a ``.pdf`` file name.

        ``headBytes`` is accepted for interface compatibility but not inspected.
        """
        return mimeType == "application/pdf" or (fileName or "").lower().endswith(".pdf")

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".pdf"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["application/pdf"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Split a PDF into a container part, per-page text parts and image parts.

        Args:
            fileBytes: Raw bytes of the PDF file.
            context: Extraction context; only the optional ``"fileName"`` key
                is read (defaults to ``"document.pdf"``). ``None`` is
                tolerated and treated as an empty context.

        Returns:
            A list that always starts with the root ``container`` part,
            followed by either a single ``binary`` part (PDF libraries
            unavailable) or the ``text`` parts and then the ``image`` parts.
            Extraction is best-effort: pages or images that fail to parse are
            skipped and logged rather than raised.
        """
        self._load()
        containerPath = (context or {}).get("fileName", "document.pdf")
        totalSize = len(fileBytes)

        rootId = makeId()
        parts: List[ContentPart] = [ContentPart(
            id=rootId,
            parentId=None,
            label="pdf",
            typeGroup="container",
            mimeType="application/pdf",
            data="",
            metadata={"size": totalSize},
        )]

        if not self._haveLibs:
            parts.append(ContentPart(
                id=makeId(),
                parentId=rootId,
                label="binary",
                typeGroup="binary",
                mimeType="application/pdf",
                data=base64.b64encode(fileBytes).decode("utf-8"),
                metadata={"size": totalSize, "warning": "PDF libs not available"},
            ))
            return parts

        # Extract text per page with PyMuPDF (same lib as in-place search -
        # ensures extraction matches PDF text layer).
        textParts = self._extractTextWithFitz(fileBytes, rootId, containerPath)
        if not textParts:
            # Fallback to PyPDF2 if PyMuPDF text extraction failed or returned nothing.
            textParts = self._extractTextWithPyPdf2(fileBytes, rootId, containerPath)
        parts.extend(textParts)

        parts.extend(self._extractImages(fileBytes, rootId, containerPath))
        return parts

    @staticmethod
    def _makeTextPart(rootId: str, containerPath: str, pageIndex: int, text: str) -> ContentPart:
        """Build the text part for one page (``pageIndex`` is zero-based)."""
        return ContentPart(
            id=makeId(),
            parentId=rootId,
            label=f"page_{pageIndex+1}",
            typeGroup="text",
            mimeType="text/plain",
            data=text,
            metadata={
                "pages": 1,
                "pageIndex": pageIndex,
                "size": len(text.encode('utf-8')),
                "contextRef": {
                    "containerPath": containerPath,
                    "location": f"page:{pageIndex+1}",
                    "pageIndex": pageIndex,
                },
            },
        )

    def _extractTextWithFitz(self, fileBytes: bytes, rootId: str, containerPath: str) -> List[ContentPart]:
        """Return one text part per non-blank page, read with PyMuPDF."""
        found: List[ContentPart] = []
        try:
            doc = fitz.open(stream=fileBytes, filetype="pdf")
            try:
                # Index-based loop so that a page that fails to load is
                # skipped individually instead of ending the iteration.
                for i in range(len(doc)):
                    try:
                        text = doc[i].get_text() or ""
                        if text.strip():
                            found.append(self._makeTextPart(rootId, containerPath, i, text))
                    except Exception:
                        logger.debug("PyMuPDF text extraction failed on page %d", i + 1, exc_info=True)
                        continue
            finally:
                # Always release the document, even when a page blew up.
                doc.close()
        except Exception:
            logger.warning("PyMuPDF text extraction failed", exc_info=True)
        return found

    def _extractTextWithPyPdf2(self, fileBytes: bytes, rootId: str, containerPath: str) -> List[ContentPart]:
        """Return one text part per non-blank page, read with PyPDF2."""
        found: List[ContentPart] = []
        try:
            with io.BytesIO(fileBytes) as buf:
                reader = PyPDF2.PdfReader(buf)
                for i, page in enumerate(reader.pages):
                    try:
                        text = page.extract_text() or ""
                        if text.strip():
                            found.append(self._makeTextPart(rootId, containerPath, i, text))
                    except Exception:
                        logger.debug("PyPDF2 text extraction failed on page %d", i + 1, exc_info=True)
                        continue
        except Exception:
            logger.warning("PyPDF2 text extraction failed", exc_info=True)
        return found

    def _extractImages(self, fileBytes: bytes, rootId: str, containerPath: str) -> List[ContentPart]:
        """Return one base64-encoded image part per embedded image, read with PyMuPDF."""
        found: List[ContentPart] = []
        try:
            doc = fitz.open(stream=fileBytes, filetype="pdf")
            try:
                for i in range(len(doc)):
                    try:
                        images = doc[i].get_images(full=True)
                    except Exception:
                        # A broken page must not cost us the images of the
                        # remaining pages.
                        logger.debug("Listing images failed on page %d", i + 1, exc_info=True)
                        continue
                    for j, img in enumerate(images):
                        try:
                            # First field of an image entry is its xref.
                            baseImage = doc.extract_image(img[0])
                            if not baseImage:
                                continue
                            imgBytes = baseImage.get("image", b"")
                            if not imgBytes:
                                continue
                            ext = baseImage.get("ext", "png")
                            found.append(ContentPart(
                                id=makeId(),
                                parentId=rootId,
                                label=f"image_{i+1}_{j}",
                                typeGroup="image",
                                mimeType=f"image/{ext}",
                                data=base64.b64encode(imgBytes).decode("utf-8"),
                                metadata={
                                    "pageIndex": i,
                                    "size": len(imgBytes),
                                    "contextRef": {
                                        "containerPath": containerPath,
                                        "location": f"page:{i+1}/image:{j}",
                                        "pageIndex": i,
                                    },
                                },
                            ))
                        except Exception:
                            logger.debug("Extracting image %d on page %d failed", j, i + 1, exc_info=True)
                            continue
            finally:
                # Always release the document, even when a page blew up.
                doc.close()
        except Exception:
            logger.warning("PyMuPDF image extraction failed", exc_info=True)
        return found