# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
|
|
"""
PDF in-place neutralization using PyMuPDF.

Removes the original text completely and inserts full UUID placeholders.
PyMuPDF uses insert_textbox, which wraps long placeholders to preserve layout.
"""
|
|
|
|
import io
|
|
import logging
|
|
from typing import Dict, Optional
|
|
|
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Label variants that may precede a policy number on the page. Matching the
# label together with the number yields a wider rectangle for the placeholder.
_POLICY_LABEL_PREFIXES = (
    "Police Nr. ",
    "Police Nr.: ",
    "Polizzenr. ",
    "Policy no. ",
    "Policy No. ",
)
# Placeholder families that are rendered with the smaller font size.
_COMPACT_PLACEHOLDER_PREFIXES = ("[policy.", "[address.")
_DEFAULT_FONTSIZE = 8
_COMPACT_FONTSIZE = 5
_FILL_COLOR = (1, 1, 1)  # white box drawn over the redacted area
_TEXT_COLOR = (0, 0, 0)  # black placeholder text
_FONTNAME = "helv"


def _search_rects(page, text):
    """Return the rectangles where ``text`` occurs on ``page`` ([] on error)."""
    try:
        return list(page.search_for(text, quads=False))
    except Exception:
        # The searched text is sensitive, so it is deliberately not logged.
        logger.debug("search_for failed", exc_info=True)
        return []


def _is_covered(rect, claimed, tol=1e-3):
    """Return True if ``rect`` lies inside any rectangle in ``claimed``."""
    return any(
        outer.x0 - tol <= rect.x0
        and outer.y0 - tol <= rect.y0
        and rect.x1 <= outer.x1 + tol
        and rect.y1 <= outer.y1 + tol
        for outer in claimed
    )


def _annotate_page(page, items):
    """Add one redaction annotation per occurrence of each mapping key.

    ``items`` must be ordered longest key first. A match whose rectangle lies
    inside an area that already received an annotation is skipped: its text is
    removed by the enclosing redaction anyway, and a second annotation would
    only draw a conflicting placeholder on top of the first one.
    """
    claimed = []
    for original_text, placeholder in items:
        if placeholder.startswith(_COMPACT_PLACEHOLDER_PREFIXES):
            fontsize = _COMPACT_FONTSIZE
        else:
            fontsize = _DEFAULT_FONTSIZE

        rects = []
        if placeholder.startswith("[policy."):
            # Prefer "label + number" hits: the wider rectangle gives the
            # placeholder more room. Only the placeholder itself is inserted
            # (label + placeholder would overflow the rectangle).
            for prefix in _POLICY_LABEL_PREFIXES:
                rects.extend(_search_rects(page, prefix + original_text))
        # Always search for the bare text as well, so occurrences without a
        # known label are not left behind. Bare hits that sit inside a
        # labelled hit are filtered out by the coverage check below.
        rects.extend(_search_rects(page, original_text))

        for rect in rects:
            if _is_covered(rect, claimed):
                continue
            try:
                page.add_redact_annot(
                    rect,
                    text=placeholder,
                    fill=_FILL_COLOR,
                    text_color=_TEXT_COLOR,
                    fontname=_FONTNAME,
                    fontsize=fontsize,
                )
            except Exception as exc:
                # Log the placeholder, never the sensitive original text.
                logger.warning(
                    "Redact failed for placeholder %r: %s", placeholder, exc
                )
                continue
            claimed.append(rect)


def neutralize_pdf_in_place(
    pdf_bytes: bytes,
    mapping: Dict[str, str],
) -> Optional[bytes]:
    """
    Remove sensitive text and replace it with UUID placeholders in-place.

    Content is removed through PDF redactions (not just covered), so it
    cannot be copied out of the result.

    Args:
        pdf_bytes: Original PDF file content.
        mapping: Dict of original_text -> placeholder (e.g. [address.uuid]).
            Entries with an empty key or an empty placeholder are ignored.
            Longer keys take precedence over shorter keys that match inside
            them.

    Returns:
        Modified PDF bytes. ``pdf_bytes`` is returned unchanged when
        ``mapping`` is empty. ``None`` is returned when PyMuPDF is missing,
        the PDF cannot be opened, the redactions of any page cannot be
        applied, or saving fails.
    """
    if not mapping:
        return pdf_bytes

    try:
        import fitz  # PyMuPDF
    except ImportError:
        logger.warning("PyMuPDF (fitz) not available for PDF in-place neutralization")
        return None

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        logger.error("Failed to open PDF: %s", e)
        return None

    try:
        # Filter before sorting so a falsy key (e.g. None) cannot break len().
        # reverse=True keeps the relative order of equally long keys.
        items = sorted(
            ((original, placeholder)
             for original, placeholder in mapping.items()
             if original and placeholder),
            key=lambda item: len(item[0]),
            reverse=True,
        )

        for page_number, page in enumerate(doc, start=1):
            _annotate_page(page, items)
            try:
                page.apply_redactions()
            except Exception:
                # Fail closed: returning this document would hand back a page
                # that still contains the text it was supposed to lose.
                logger.error(
                    "apply_redactions failed on page %d; aborting neutralization",
                    page_number,
                    exc_info=True,
                )
                return None

        buf = io.BytesIO()
        doc.save(buf, garbage=4, deflate=True)
        return buf.getvalue()
    except Exception as e:
        logger.error("PDF in-place neutralization failed: %s", e, exc_info=True)
        return None
    finally:
        # Single cleanup point for both the success and the failure paths.
        try:
            doc.close()
        except Exception:
            logger.debug("Closing PDF document failed", exc_info=True)
|