# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
PDF in-place neutralization using PyMuPDF.
Removes original text completely and inserts full UUID placeholders.
PyMuPDF uses insert_textbox which wraps long placeholders to preserve layout.
"""

import io
import logging
from typing import Any, Dict, List, Optional, Sequence, Tuple

logger = logging.getLogger(__name__)

# Appearance of the replacement text drawn over each redacted area.
_FILL_COLOR = (1, 1, 1)
_TEXT_COLOR = (0, 0, 0)
_FONT_NAME = "helv"
_DEFAULT_FONT_SIZE = 8
# Policy and address placeholders are rendered smaller than the default.
_COMPACT_FONT_SIZE = 5
_COMPACT_PLACEHOLDER_PREFIXES = ("[policy.", "[address.")

# Labels that may precede a policy number. Redacting label + number yields a
# wider rectangle, leaving more room for the placeholder text.
_POLICY_PLACEHOLDER_PREFIX = "[policy."
_POLICY_LABELS = (
    "Police Nr. ",
    "Police Nr.: ",
    "Polizzenr. ",
    "Policy no. ",
    "Policy No. ",
)


def _is_covered(rect: Any, claimed: Sequence[Any]) -> bool:
    """Return True if *rect* lies inside any rectangle in *claimed*."""
    return any(area.contains(rect) for area in claimed)


def _search(page: Any, needle: str) -> List[Any]:
    """Return the hit rectangles for *needle* on *page* ([] if the search fails)."""
    try:
        return list(page.search_for(needle, quads=False) or [])
    except Exception as e:
        # Best-effort, as before: a failing search is treated as "no hits".
        # Only the exception is logged; the needle is sensitive text.
        logger.debug("search_for failed: %s", e)
        return []


def _find_targets(page: Any, original_text: str, placeholder: str) -> List[Any]:
    """
    Collect the rectangles to redact for one mapping entry on one page.

    For policy placeholders the label-prefixed forms are searched first so
    that the wider label+number rectangle is used where a label is present.
    The bare text is always searched as well, so occurrences without a label
    (or with a different label) are not left behind. Hits already covered by
    an earlier, wider hit are dropped.
    """
    needles: List[str] = []
    if placeholder.startswith(_POLICY_PLACEHOLDER_PREFIX):
        needles.extend(label + original_text for label in _POLICY_LABELS)
    needles.append(original_text)

    targets: List[Any] = []
    for needle in needles:
        for rect in _search(page, needle):
            if not _is_covered(rect, targets):
                targets.append(rect)
    return targets


def _redact_page(page: Any, items: Sequence[Tuple[str, str]]) -> None:
    """
    Add redaction annotations for every mapping entry and apply them.

    *items* must be ordered longest original text first: a hit that lies
    inside an already accepted rectangle is skipped, so a short entry nested
    in a longer one does not draw a second placeholder over the first.
    A failure of ``apply_redactions`` propagates to the caller, because the
    page would otherwise still contain the original text.
    """
    claimed: List[Any] = []
    for original_text, placeholder in items:
        if placeholder.startswith(_COMPACT_PLACEHOLDER_PREFIXES):
            font_size = _COMPACT_FONT_SIZE
        else:
            font_size = _DEFAULT_FONT_SIZE

        for rect in _find_targets(page, original_text, placeholder):
            if _is_covered(rect, claimed):
                continue
            try:
                # Only the placeholder is inserted (never label + placeholder),
                # so it fits into the redacted rectangle.
                page.add_redact_annot(
                    rect,
                    text=placeholder,
                    fill=_FILL_COLOR,
                    text_color=_TEXT_COLOR,
                    fontname=_FONT_NAME,
                    fontsize=font_size,
                )
            except Exception as e:
                # Log the placeholder, not the original text: the original is
                # exactly the sensitive data this module is meant to remove.
                logger.warning("Redact failed for %s: %s", placeholder, e)
                continue
            claimed.append(rect)

    page.apply_redactions()


def neutralize_pdf_in_place(
    pdf_bytes: bytes,
    mapping: Dict[str, str],
) -> Optional[bytes]:
    """
    Remove sensitive text and replace with UUID placeholders in-place.
    Content is fully removed (not just covered) so it cannot be copied.

    Args:
        pdf_bytes: Original PDF file content
        mapping: Dict of original_text -> placeholder (e.g. [address.uuid]).
            Entries with an empty key or value are ignored.

    Returns:
        Modified PDF bytes, or None on failure. An empty mapping returns
        ``pdf_bytes`` unchanged. None is returned when PyMuPDF is missing,
        the PDF cannot be opened, applying the redactions fails on any page,
        or the document cannot be saved.
    """
    if not mapping:
        return pdf_bytes

    try:
        import fitz  # PyMuPDF
    except ImportError:
        logger.warning("PyMuPDF (fitz) not available for PDF in-place neutralization")
        return None

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        logger.error(f"Failed to open PDF: {e}")
        return None

    # Longest originals first so that wider matches claim their area before
    # shorter matches nested inside them are considered.
    items = sorted(
        ((original, placeholder) for original, placeholder in mapping.items()
         if original and placeholder),
        key=lambda item: -len(item[0]),
    )

    try:
        for page in doc:
            _redact_page(page, items)

        buf = io.BytesIO()
        doc.save(buf, garbage=4, deflate=True)
        return buf.getvalue()
    except Exception as e:
        logger.error(f"PDF in-place neutralization failed: {e}", exc_info=True)
        return None
    finally:
        # Closing is best-effort; the result (or failure) is already decided.
        try:
            doc.close()
        except Exception:
            pass