113 lines
4 KiB
Python
113 lines
4 KiB
Python
# Copyright (c) 2026 PowerOn AG
|
|
# All rights reserved.
|
|
"""
|
|
Document utility functions (Layer L0 - shared).
|
|
Pure text-processing helpers with zero internal dependencies.
|
|
"""
|
|
|
|
import base64
|
|
import binascii
|
|
import re
|
|
from typing import Any, Optional
|
|
|
|
|
|
def parseInlineRuns(text: str) -> list:
    """
    Tokenize inline markdown into an ordered list of InlineRun dicts.

    Recognized constructs, tried in this order at each position: images
    (``![alt](src)`` with an optional ``"<N>pt"`` width), links, inline code,
    bold (``**``), and italic (``*`` or ``_``). Text between tokens is emitted
    as plain ``text`` runs. Markers are not parsed recursively: the content of
    a bold/italic/link run is returned verbatim.

    Every run has ``type`` and ``value`` keys. Link runs add ``href``; image
    runs add ``fileId`` (source starts with ``file:``) or ``href`` (any other
    source), plus ``widthPt`` when a width was given. Empty input yields a
    single empty text run.
    """
    if not text:
        return [{"type": "text", "value": ""}]

    tokenPattern = re.compile(
        r'!\[(?P<imgAlt>[^\]]*)\]\((?P<imgSrc>[^)"]+)(?:\s+"(?P<imgWidth>\d+)pt")?\)'
        r'|\[(?P<linkText>[^\]]+)\]\((?P<linkHref>[^)]+)\)'
        r'|`(?P<code>[^`]+)`'
        r'|\*\*(?P<bold>.+?)\*\*'
        r'|(?<!\w)\*(?P<italic1>.+?)\*(?!\w)'
        r'|(?<!\w)_(?P<italic2>.+?)_(?!\w)'
    )

    # Regex group name -> run type, for tokens that carry a single value.
    simpleKinds = (
        ("code", "code"),
        ("bold", "bold"),
        ("italic1", "italic"),
        ("italic2", "italic"),
    )

    parsed = []
    cursor = 0  # index just past the previously consumed token

    for match in tokenPattern.finditer(text):
        tokenStart = match.start()
        if tokenStart > cursor:
            # Plain text sitting between the previous token and this one.
            parsed.append({"type": "text", "value": text[cursor:tokenStart]})

        groups = match.groupdict()

        if groups["imgAlt"] is not None or groups["imgSrc"] is not None:
            altText = (groups["imgAlt"] or "").strip() or "Image"
            source = (groups["imgSrc"] or "").strip()
            imageRun = {"type": "image", "value": altText}
            if source.startswith("file:"):
                # "file:<id>" refers to a stored file rather than a URL.
                imageRun["fileId"] = source[5:]
            else:
                imageRun["href"] = source
            width = groups["imgWidth"]
            if width:
                imageRun["widthPt"] = int(width)
            parsed.append(imageRun)
        elif groups["linkText"] is not None:
            parsed.append(
                {"type": "link", "value": groups["linkText"], "href": groups["linkHref"]}
            )
        else:
            for groupName, runType in simpleKinds:
                captured = groups[groupName]
                if captured is not None:
                    parsed.append({"type": runType, "value": captured})
                    break

        cursor = match.end()

    if cursor < len(text):
        # Trailing text after the last token (or the whole string if no token).
        parsed.append({"type": "text", "value": text[cursor:]})

    if parsed:
        return parsed
    return [{"type": "text", "value": text}]
|
|
|
|
|
|
def _looksLikeAsciiBase64Payload(s: str) -> bool:
|
|
"""Heuristic: ActionDocument binary payloads use standard ASCII base64; markdown/text uses other chars."""
|
|
t = "".join(s.split())
|
|
if len(t) < 8:
|
|
return False
|
|
if not t.isascii():
|
|
return False
|
|
return bool(re.fullmatch(r"[A-Za-z0-9+/]+=*", t)) and len(t) % 4 == 0
|
|
|
|
|
|
def coerceDocumentDataToBytes(raw: Any) -> Optional[bytes]:
    """Turn a documentData value into the bytes to persist, or None.

    ActionDocument conventions (see methodFile.create): binary bodies travel
    as ASCII base64 strings, while plain markdown/text stays as Unicode. A
    base64 literal must therefore be decoded, not UTF-8-encoded — otherwise
    the stored file contains the base64 characters themselves.

    Handling by input type:

    * ``bytes`` / ``bytearray`` / ``memoryview``: returned as ``bytes``.
    * ``str``: surrounding whitespace is stripped; if the text looks like a
      base64 payload and decodes to something non-empty, the decoded bytes
      are returned, otherwise the stripped text is UTF-8 encoded.
    * ``None`` or any other type: ``None``.

    Empty results (zero-length buffers, blank strings) are reported as None.
    """
    if isinstance(raw, str):
        text = raw.strip()
        if not text:
            return None

        if _looksLikeAsciiBase64Payload(text):
            try:
                payload = base64.b64decode(text, validate=True)
            except (TypeError, binascii.Error, ValueError):
                # Strict decoding rejects e.g. embedded line breaks; retry
                # leniently before giving up on the base64 interpretation.
                try:
                    payload = base64.b64decode(text)
                except (binascii.Error, ValueError):
                    payload = b""
            if payload:
                return payload

        # Not base64 (or it decoded to nothing): persist the text itself.
        encoded = text.encode("utf-8")
        return encoded if len(encoded) else None

    if isinstance(raw, bytes):
        blob = raw
    elif isinstance(raw, bytearray):
        blob = bytes(raw)
    elif isinstance(raw, memoryview):
        blob = raw.tobytes()
    else:
        # None and unsupported types carry no persistable content.
        return None

    return blob if len(blob) else None
|