145 lines
5.1 KiB
Python
145 lines
5.1 KiB
Python
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Inline emoji-font fallback for the ReportLab-based PDF renderer.

The default ReportLab core fonts (Helvetica, Times, Courier) only cover
WinAnsi (Latin-1 + a handful of typographic glyphs). Codepoints from the
Unicode Symbols / Pictographs / Emoji blocks render as a missing-glyph
square ("tofu") or are dropped entirely.

This module bundles a single TrueType emoji font (Noto Emoji, monochrome,
SIL Open Font License) and exposes `wrapEmojiSpansInXml`, which rewrites
already-built ReportLab mini-XML so that any character the emoji font can
draw is wrapped in `<font name="NotoEmoji">...</font>`. ReportLab's
Paragraph parser supports nested <font> tags, so emoji spans nest cleanly
inside <b>, <i>, and <font name="Courier"> markup produced elsewhere.

ReportLab cannot render color emoji (CBDT/COLR/SBIX glyph tables are
not honoured by its TTF backend) — Noto Emoji is intentionally a
monochrome outline font, which is the only flavour that will render at all.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
from typing import FrozenSet, Optional
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Name under which the emoji face is registered with ReportLab and referenced
# from the generated <font name="..."> markup.
EMOJI_FONT_NAME = "NotoEmoji"

# The font file is looked up under <gateway root>/assets/fonts, where the
# gateway root is resolved five directory levels above this file's directory.
_RENDERER_DIR = os.path.dirname(os.path.abspath(__file__))
_GATEWAY_ROOT = os.path.abspath(
    os.path.join(_RENDERER_DIR, *([".."] * 5))
)
_FONT_PATH = os.path.join(
    _GATEWAY_ROOT,
    "assets",
    "fonts",
    "NotoEmoji-Regular.ttf",
)

# Codepoints below U+2000 (Latin-1, common diacritics, basic punctuation) are
# left to the WinAnsi-style core fonts so that umlauts, the copyright sign,
# NBSP, etc. keep the look of the surrounding body text. Only codepoints at
# or above this threshold are ever routed to the emoji font.
_EMOJI_RANGE_START = 0x2000

# Lazy-initialisation state, written by `_initialize`:
#   _supportedCodepoints - codepoints the emoji font can draw, or None while
#                          unregistered / after a failed registration.
#   _initAttempted       - True once registration has been tried (whether or
#                          not it succeeded), so it is never repeated.
_supportedCodepoints: Optional[FrozenSet[int]] = None
_initAttempted: bool = False
|
|
|
|
|
|
def _initialize() -> bool:
    """Register the emoji TTF with ReportLab and capture its cmap.

    Lazy + idempotent: the renderer may instantiate before reportlab is
    imported in the worker process, and tests that don't generate PDFs
    must not pay the registration cost.

    Only the first call does any work. Every later call just reports the
    outcome of that first attempt; a failed attempt is not retried.

    Returns:
        True when the font was registered and the set of codepoints it can
        draw (restricted to ``>= _EMOJI_RANGE_START``) was stored in
        ``_supportedCodepoints``. False when the font file is missing,
        reportlab cannot be imported, or registration / cmap extraction
        raised.
    """
    global _initAttempted, _supportedCodepoints
    if _initAttempted:
        return _supportedCodepoints is not None
    # Flag the attempt up front so every early-return failure path below is
    # remembered and not repeated on the next call.
    _initAttempted = True

    if not os.path.exists(_FONT_PATH):
        logger.warning(
            "Emoji font not found at %s — emoji codepoints in PDFs will render as tofu",
            _FONT_PATH,
        )
        return False

    try:
        from reportlab.pdfbase import pdfmetrics
        from reportlab.pdfbase.ttfonts import TTFont
    except ImportError:
        logger.warning("reportlab not installed; cannot register emoji font")
        return False

    try:
        font = TTFont(EMOJI_FONT_NAME, _FONT_PATH)
        pdfmetrics.registerFont(font)
        # Prefer the codepoint -> glyph mapping exposed on the ReportLab
        # face. If it is missing or empty, read the cmap straight from the
        # font file with fontTools instead.
        cmap = getattr(font.face, "charToGlyph", None) or {}
        if not cmap:
            from fontTools.ttLib import TTFont as FtTTFont

            ftFont = FtTTFont(_FONT_PATH)
            try:
                cmap = ftFont.getBestCmap()
            finally:
                # Release the file handle fontTools keeps open on the font.
                ftFont.close()
        _supportedCodepoints = frozenset(
            cp for cp in cmap if cp >= _EMOJI_RANGE_START
        )
        logger.info(
            "Registered emoji font '%s' with %d renderable codepoints (>= U+%04X)",
            EMOJI_FONT_NAME,
            len(_supportedCodepoints),
            _EMOJI_RANGE_START,
        )
        return True
    except Exception as exc:
        # Best-effort feature: any failure degrades to "no emoji wrapping"
        # rather than breaking PDF generation.
        logger.warning("Failed to register emoji font: %s", exc)
        _supportedCodepoints = None
        return False
|
|
|
|
|
|
# Matches one markup tag: "<", then one or more non-">" characters, then ">".
# NOTE(review): nothing in the visible part of this module references this
# pattern — confirm whether it is still used before relying on or removing it.
_TAG_RE = re.compile(r"<[^>]+>")
|
|
|
|
|
|
def wrapEmojiSpansInXml(xml: str) -> str:
    """Wrap consecutive emoji codepoints with <font name="NotoEmoji">…</font>.

    Expects ReportLab markup that has already been XML-escaped. Anything
    between a `<` and the next `>` is copied through verbatim, so a font
    tag is never inserted into another tag's attribute list. Characters
    the emoji font cannot draw are left untouched so the default body font
    still gets a chance at them (e.g. U+200D zero-width-joiner has no glyph
    in Noto Emoji and would otherwise render as tofu inside a forced
    <font> span).

    The input is returned unchanged when it is empty, when the emoji font
    could not be initialised, or when the font covers no codepoints.
    """
    if not xml:
        return xml
    if not (_initialize() and _supportedCodepoints):
        return xml

    drawable = _supportedCodepoints
    openTag = f'<font name="{EMOJI_FONT_NAME}">'
    pieces: list[str] = []
    pos = 0
    length = len(xml)
    while pos < length:
        ch = xml[pos]
        if ch == "<":
            # Copy the whole `<...>` token as-is; an unterminated tag takes
            # the remainder of the input with it.
            closeAt = xml.find(">", pos)
            if closeAt == -1:
                pieces.append(xml[pos:])
                break
            afterTag = closeAt + 1
            pieces.append(xml[pos:afterTag])
            pos = afterTag
        elif ord(ch) in drawable:
            # Grow the run while the following characters are drawable and
            # do not start a tag, then emit it as a single font span.
            runEnd = pos + 1
            while (
                runEnd < length
                and xml[runEnd] != "<"
                and ord(xml[runEnd]) in drawable
            ):
                runEnd += 1
            pieces.extend((openTag, xml[pos:runEnd], "</font>"))
            pos = runEnd
        else:
            pieces.append(ch)
            pos += 1
    return "".join(pieces)
|