# Copyright (c) 2026 PowerOn AG
# All rights reserved.
"""
Document utility functions (Layer L0 - shared).

Pure text-processing helpers with zero internal dependencies.
"""

import base64
import binascii
import re
from typing import Any, Optional

# Tokenizer for inline markdown. The alternatives are tried in order at each
# position, so images (leading "!") win over links, and "**bold**" wins over
# "*italic*". The lookarounds on the italic alternatives keep intra-word
# markers such as "snake_case_name" or "2*3*4" from being read as emphasis.
_INLINE_TOKEN_RE = re.compile(
    r'!\[(?P<imgAlt>[^\]]*)\]\((?P<imgSrc>[^)"]+)(?:\s+"(?P<imgWidth>\d+)pt")?\)'
    r'|\[(?P<linkText>[^\]]+)\]\((?P<linkHref>[^)]+)\)'
    r'|`(?P<code>[^`]+)`'
    r'|\*\*(?P<bold>.+?)\*\*'
    r'|(?<!\w)\*(?P<italic1>.+?)\*(?!\w)'
    r'|(?<!\w)_(?P<italic2>.+?)_(?!\w)'
)

# Body of a standard-alphabet base64 string: data characters, then padding.
_BASE64_BODY_RE = re.compile(r"[A-Za-z0-9+/]+=*")


def parseInlineRuns(text: str) -> list:
    """
    Parse inline markdown formatting into a list of InlineRun dicts.

    Handles: images, links, bold, italic, inline code, plain text.
    Uses a regex-based tokenizer that processes tokens left-to-right.

    Args:
        text: One line/paragraph of markdown. Falsy input yields a single
            empty text run.

    Returns:
        A non-empty list of dicts, each with "type" and "value" keys:
          - {"type": "text" | "code" | "bold" | "italic", "value": str}
          - {"type": "link", "value": str, "href": str}
          - {"type": "image", "value": alt-or-"Image", plus either
             "fileId" (source written as "file:<id>") or "href",
             and "widthPt" (int) when a '"<n>pt"' title is present}
    """
    if not text:
        return [{"type": "text", "value": ""}]

    runs = []
    lastEnd = 0
    for m in _INLINE_TOKEN_RE.finditer(text):
        # Anything between the previous token and this one is plain text.
        if m.start() > lastEnd:
            runs.append({"type": "text", "value": text[lastEnd:m.start()]})

        if m.group("imgAlt") is not None or m.group("imgSrc") is not None:
            alt = (m.group("imgAlt") or "").strip() or "Image"
            src = (m.group("imgSrc") or "").strip()
            widthStr = m.group("imgWidth")
            run = {"type": "image", "value": alt}
            if src.startswith("file:"):
                # "file:<id>" sources carry an id instead of a URL.
                run["fileId"] = src[5:]
            else:
                run["href"] = src
            if widthStr:
                run["widthPt"] = int(widthStr)
            runs.append(run)
        elif m.group("linkText") is not None:
            runs.append({
                "type": "link",
                "value": m.group("linkText"),
                "href": m.group("linkHref"),
            })
        elif m.group("code") is not None:
            runs.append({"type": "code", "value": m.group("code")})
        elif m.group("bold") is not None:
            runs.append({"type": "bold", "value": m.group("bold")})
        elif m.group("italic1") is not None:
            runs.append({"type": "italic", "value": m.group("italic1")})
        elif m.group("italic2") is not None:
            runs.append({"type": "italic", "value": m.group("italic2")})

        lastEnd = m.end()

    # Trailing plain text after the last token (or the whole string when
    # nothing matched).
    if lastEnd < len(text):
        runs.append({"type": "text", "value": text[lastEnd:]})

    return runs if runs else [{"type": "text", "value": text}]


def _looksLikeAsciiBase64Payload(s: str) -> bool:
    """Heuristic: ActionDocument binary payloads use standard ASCII base64; markdown/text uses other chars.

    Whitespace is ignored before checking. Returns True only when the
    remaining text is at least 8 characters long, pure ASCII, made solely of
    the standard base64 alphabet followed by optional "=" padding, and a
    multiple of 4 in length.
    """
    t = "".join(s.split())
    if len(t) < 8 or not t.isascii():
        return False
    return bool(_BASE64_BODY_RE.fullmatch(t)) and len(t) % 4 == 0


def coerceDocumentDataToBytes(raw: Any) -> Optional[bytes]:
    """Normalize documentData for DB file persistence.

    ActionDocument conventions (see methodFile.create): binary bodies are carried as
    ASCII base64 strings; plain markdown/text stays as Unicode. Do not UTF-8-encode a
    base64 literal — that persists the ASCII of the encoding (file looks like base64 gibberish).

    Args:
        raw: None, bytes, bytearray, memoryview or str. Any other type is
            treated as "no data".

    Returns:
        The payload as bytes, or None when the input is None, empty,
        whitespace-only, or of an unsupported type. Strings that look like
        base64 are decoded; all other strings are stripped and UTF-8 encoded.
    """
    if raw is None:
        return None

    if isinstance(raw, bytes):
        return raw if len(raw) > 0 else None
    if isinstance(raw, bytearray):
        b = bytes(raw)
        return b if len(b) > 0 else None
    if isinstance(raw, memoryview):
        b = raw.tobytes()
        return b if len(b) > 0 else None

    if not isinstance(raw, str):
        return None

    stripped = raw.strip()
    if not stripped:
        return None

    if _looksLikeAsciiBase64Payload(stripped):
        try:
            # Strict decode first; it rejects embedded whitespace/newlines.
            decoded = base64.b64decode(stripped, validate=True)
        except (TypeError, binascii.Error, ValueError):
            try:
                # Lenient decode discards non-alphabet characters, which
                # covers line-wrapped base64.
                decoded = base64.b64decode(stripped)
            except (binascii.Error, ValueError):
                decoded = b""
        if decoded:
            return decoded
        # An empty decode falls through and is persisted as text.

    b = stripped.encode("utf-8")
    return b if len(b) > 0 else None