platform-core/modules/shared/documentUtils.py
ValueOn AG 4a60086c80
Some checks failed
Deploy Plattform-Core (Int) / test (push) Failing after 15s
Deploy Plattform-Core (Int) / deploy (push) Has been skipped
cp adapted to 2026 poweron
2026-06-09 09:53:31 +02:00

113 lines
4 KiB
Python

# Copyright (c) 2026 PowerOn AG
# All rights reserved.
"""
Document utility functions (Layer L0 - shared).
Pure text-processing helpers with zero internal dependencies.
"""
import base64
import binascii
import re
from typing import Any, Optional
# Inline-markdown tokenizer, compiled once at import time instead of on every call.
# At each position the alternatives are tried in this order:
# image, link, inline code, bold, *italic*, _italic_.
_INLINE_TOKEN_RE = re.compile(
    r'!\[(?P<imgAlt>[^\]]*)\]\((?P<imgSrc>[^)"]+)(?:\s+"(?P<imgWidth>\d+)pt")?\)'
    r'|\[(?P<linkText>[^\]]+)\]\((?P<linkHref>[^)]+)\)'
    r'|`(?P<code>[^`]+)`'
    r'|\*\*(?P<bold>.+?)\*\*'
    r'|(?<!\w)\*(?P<italic1>.+?)\*(?!\w)'
    r'|(?<!\w)_(?P<italic2>.+?)_(?!\w)'
)


def _inlineRunFromMatch(match) -> dict:
    """Build the InlineRun dict for a single match of ``_INLINE_TOKEN_RE``."""
    groups = match.groupdict()

    # imgAlt may be the empty string but is never None when the image alternative matched.
    if groups["imgAlt"] is not None:
        src = (groups["imgSrc"] or "").strip()
        run = {"type": "image", "value": groups["imgAlt"].strip() or "Image"}
        if src.startswith("file:"):
            # "file:<id>" sources are exposed as fileId (prefix removed), not as href.
            run["fileId"] = src[5:]
        else:
            run["href"] = src
        if groups["imgWidth"]:
            run["widthPt"] = int(groups["imgWidth"])
        return run

    if groups["linkText"] is not None:
        return {"type": "link", "value": groups["linkText"], "href": groups["linkHref"]}

    for kind in ("code", "bold"):
        if groups[kind] is not None:
            return {"type": kind, "value": groups[kind]}

    # Only the two italic alternatives remain; exactly one of them matched.
    italic = groups["italic1"] if groups["italic1"] is not None else groups["italic2"]
    return {"type": "italic", "value": italic}


def parseInlineRuns(text: str) -> list:
    """
    Parse inline markdown formatting into a list of InlineRun dicts.

    Recognized tokens: images (``![alt](src "NNpt")``), links, inline code,
    bold (``**x**``) and italic (``*x*`` / ``_x_``). Everything between tokens
    is emitted as ``{"type": "text", ...}`` runs, in left-to-right order.

    Args:
        text: The markdown source line. Falsy input (``""`` or ``None``)
            yields a single empty text run.

    Returns:
        A non-empty list of dicts. Every dict has ``type`` and ``value``;
        links add ``href``; images add ``fileId`` (for ``file:`` sources)
        or ``href``, plus ``widthPt`` when a ``"NNpt"`` width is given.
    """
    if not text:
        return [{"type": "text", "value": ""}]

    runs = []
    lastEnd = 0
    for match in _INLINE_TOKEN_RE.finditer(text):
        if match.start() > lastEnd:
            # Plain text between the previous token and this one.
            runs.append({"type": "text", "value": text[lastEnd:match.start()]})
        runs.append(_inlineRunFromMatch(match))
        lastEnd = match.end()

    if lastEnd < len(text):
        runs.append({"type": "text", "value": text[lastEnd:]})

    # text is non-empty here, so at least one run was always produced above.
    return runs
# Shape of standard padded base64: alphabet characters followed by at most two '='.
# The character class is ASCII-only, so no separate isascii() check is needed.
_BASE64_SHAPE_RE = re.compile(r"[A-Za-z0-9+/]+={0,2}")


def _looksLikeAsciiBase64Payload(s: str) -> bool:
    """Heuristic: return True when ``s`` has the shape of a standard base64 payload.

    Whitespace anywhere in ``s`` is ignored. The remaining text must be at
    least 8 characters long, a multiple of 4 in length, and consist only of
    the standard base64 alphabet followed by at most two ``=`` padding
    characters. Markdown/plain text normally fails this because it contains
    other characters.

    Note: this is a shape check only; short all-alphanumeric text whose
    length is a multiple of 4 also passes.
    """
    compact = "".join(s.split())
    if len(compact) < 8 or len(compact) % 4 != 0:
        return False
    # With length % 4 == 0, allowing only 0-2 '=' rules out impossible
    # encodings such as "AAAAA===" or "AAAA====".
    return _BASE64_SHAPE_RE.fullmatch(compact) is not None
def coerceDocumentDataToBytes(raw: Any) -> Optional[bytes]:
    """Normalize documentData for DB file persistence.

    ActionDocument conventions (see methodFile.create): binary bodies are carried as ASCII
    base64 strings; plain markdown/text stays as Unicode. Do not UTF-8-encode a base64
    literal — that persists the ASCII of the encoding (file looks like base64 gibberish).

    Args:
        raw: ``bytes``, ``bytearray``, ``memoryview``, ``str`` or ``None``.

    Returns:
        - bytes-like input: its content as ``bytes``;
        - ``str`` that looks like base64 and decodes strictly: the decoded bytes;
        - any other non-blank ``str``: the stripped text encoded as UTF-8;
        - ``None`` for ``None``, empty/blank input, or unsupported types.
    """
    if raw is None:
        return None
    if isinstance(raw, bytes):
        return raw or None
    if isinstance(raw, bytearray):
        return bytes(raw) or None
    if isinstance(raw, memoryview):
        return raw.tobytes() or None
    if not isinstance(raw, str):
        return None

    stripped = raw.strip()
    if not stripped:
        return None

    if _looksLikeAsciiBase64Payload(stripped):
        # The heuristic ignores whitespace, so remove it before decoding: strict
        # validation rejects embedded line breaks (e.g. line-wrapped base64).
        compact = "".join(stripped.split())
        try:
            decoded = base64.b64decode(compact, validate=True)
        except (binascii.Error, ValueError):
            # Malformed despite its shape: treat as ordinary text below
            # rather than leniently decoding it into arbitrary bytes.
            decoded = b""
        if decoded:
            return decoded

    # stripped is non-empty here, so the encoded result is non-empty too.
    return stripped.encode("utf-8")