# platform-core/modules/shared/documentUtils.py

# Copyright (c) 2026 PowerOn AG
# All rights reserved.
"""
Document utility functions (Layer L0 - shared).
Pure text-processing helpers with zero internal dependencies.
"""

import base64
import binascii
import re
from typing import Any, Optional


def parseInlineRuns(text: str) -> list:
    """
    Split inline markdown into an ordered list of run dicts.

    At each position the tokenizer tries, in this order: images
    (``![alt](src "NNpt")``), links, inline code, bold, ``*italic*`` and
    ``_italic_``. Anything between recognised tokens becomes a "text" run.

    Every run has "type" and "value". Link runs add "href". Image runs add
    "fileId" when the source starts with ``file:`` (prefix removed),
    otherwise "href", plus "widthPt" when a width hint is present; a blank
    alt text falls back to "Image". Falsy input yields one empty text run.
    """
    if not text:
        return [{"type": "text", "value": ""}]

    tokenPattern = re.compile(
        r'!\[(?P<imgAlt>[^\]]*)\]\((?P<imgSrc>[^)"]+)(?:\s+"(?P<imgWidth>\d+)pt")?\)'
        r'|\[(?P<linkText>[^\]]+)\]\((?P<linkHref>[^)]+)\)'
        r'|`(?P<code>[^`]+)`'
        r'|\*\*(?P<bold>.+?)\*\*'
        r'|(?<!\w)\*(?P<italic1>.+?)\*(?!\w)'
        r'|(?<!\w)_(?P<italic2>.+?)_(?!\w)'
    )
    # Groups whose captured text maps straight onto a run of the given type.
    simpleGroups = (
        ("code", "code"),
        ("bold", "bold"),
        ("italic1", "italic"),
        ("italic2", "italic"),
    )

    parsed = []
    cursor = 0  # index just past the previously consumed token

    for match in tokenPattern.finditer(text):
        tokenStart, tokenEnd = match.span()

        # Emit any plain text sitting between the previous token and this one.
        if cursor < tokenStart:
            parsed.append({"type": "text", "value": text[cursor:tokenStart]})

        found = match.groupdict()
        if found["imgAlt"] is not None or found["imgSrc"] is not None:
            source = (found["imgSrc"] or "").strip()
            imageRun = {
                "type": "image",
                "value": (found["imgAlt"] or "").strip() or "Image",
            }
            if source.startswith("file:"):
                imageRun["fileId"] = source[5:]
            else:
                imageRun["href"] = source
            if found["imgWidth"]:
                imageRun["widthPt"] = int(found["imgWidth"])
            parsed.append(imageRun)
        elif found["linkText"] is not None:
            parsed.append(
                {"type": "link", "value": found["linkText"], "href": found["linkHref"]}
            )
        else:
            for groupName, runType in simpleGroups:
                captured = found[groupName]
                if captured is not None:
                    parsed.append({"type": runType, "value": captured})
                    break

        cursor = tokenEnd

    # Trailing plain text after the final token (or the whole input if no token matched).
    if cursor < len(text):
        parsed.append({"type": "text", "value": text[cursor:]})

    return parsed or [{"type": "text", "value": text}]


def _looksLikeAsciiBase64Payload(s: str) -> bool:
    """Heuristic: does *s* have the shape of a standard ASCII base64 payload?

    All whitespace (including line breaks inside the string) is ignored.
    What remains must be at least 8 characters long, consist only of the
    standard base64 alphabet ``A-Z a-z 0-9 + /``, end in at most two ``=``
    padding characters, and have a total length that is a multiple of 4.

    This is a shape check only: short alphanumeric text such as "password"
    also satisfies it, so a True result does not prove the input was
    produced by a base64 encoder.
    """
    compact = "".join(s.split())
    if len(compact) < 8 or len(compact) % 4 != 0:
        return False
    if not compact.isascii():
        return False
    # Standard base64 padding is zero, one or two "=" characters; a longer
    # run of "=" is malformed and must not be classified as a payload.
    return re.fullmatch(r"[A-Za-z0-9+/]+={0,2}", compact) is not None


def coerceDocumentDataToBytes(raw: Any) -> Optional[bytes]:
    """Turn a documentData value into the bytes to persist, or None.

    ActionDocument conventions (see methodFile.create): binary bodies travel as
    ASCII base64 strings, while markdown/plain text stays Unicode. A string that
    looks like base64 is therefore decoded rather than UTF-8-encoded; encoding
    it would persist the base64 literal itself instead of the binary content.

    Behaviour by input type:
      * str: surrounding whitespace is stripped. If the remainder looks like
        base64 and decodes to non-empty bytes, those bytes are returned;
        otherwise the stripped text is returned UTF-8-encoded.
      * bytes: returned as-is.
      * bytearray / memoryview: copied into a bytes object.
      * anything else (including None): None.

    Empty content of any supported type also yields None.
    """
    if isinstance(raw, str):
        text = raw.strip()
        if not text:
            return None
        if _looksLikeAsciiBase64Payload(text):
            try:
                payload = base64.b64decode(text, validate=True)
            except (TypeError, binascii.Error, ValueError):
                # Strict mode rejects non-alphabet characters such as embedded
                # line breaks; retry with the lenient decoder before giving up.
                try:
                    payload = base64.b64decode(text)
                except (binascii.Error, ValueError):
                    payload = b""
            if payload:
                return payload
        # Not base64 (or it decoded to nothing): persist the text itself.
        return text.encode("utf-8") or None

    if isinstance(raw, bytes):
        content = raw
    elif isinstance(raw, bytearray):
        content = bytes(raw)
    elif isinstance(raw, memoryview):
        content = raw.tobytes()
    else:
        return None
    return content or None