gateway/modules/datamodels/datamodelJson.py
2026-04-29 23:12:46 +02:00

130 lines
3.9 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Unified JSON document schema and helpers used by both generation prompts and renderers.
This defines a single canonical template and the supported section types.
"""
from typing import List, Literal, TypedDict
# Every section type the system supports, kept in one canonical list.
supportedSectionTypes: List[str] = [
    "table", "bullet_list", "heading",
    "paragraph", "code_block", "image",
]
class InlineRun(TypedDict, total=False):
    """Single inline content run.

    Every paragraph, table cell and list item is a ``List[InlineRun]``.

    The class is declared with ``total=False``, so every key is optional.
    Which keys are meaningful depends on ``type``; see the comment on
    each key below.
    """

    # Discriminator naming the kind of run.
    type: Literal["text", "image", "link", "bold", "italic", "code"]
    # Text content (for text/bold/italic/code runs and the link label).
    value: str
    # For type=image: reference to a FileItem.
    fileId: str
    # For type=image: resolved base64 payload (filled in by post-processing).
    base64Data: str
    # For type=image: MIME type, e.g. "image/png".
    mimeType: str
    # For type=image: optional render width.
    widthPt: int
    # For type=link: URL target.
    href: str
# Runtime list of the inline run kinds; mirrors the Literal on InlineRun.type.
supportedInlineRunTypes: List[str] = [
    "text",
    "image",
    "link",
    "bold",
    "italic",
    "code",
]
# Canonical JSON template used for AI generation (documents array + sections).
# This template is used for STRUCTURE generation - sections have empty elements arrays.
# For content generation, elements arrays will be populated later.
#
# Layout of the template:
# - "metadata": split strategy, source documents, extraction method and title.
# - "documents": a single document ("doc_1") holding nine example sections,
#   numbered by "order" from 1 to 9. Together they use each content type
#   listed in supportedSectionTypes (heading, paragraph, bullet_list, image,
#   table, code_block).
# - The image section is the only one with "complexity": "complex" and the
#   only one carrying an extra "image_prompt" key.
#
# The placeholder {{DOCUMENT_TITLE}} occurs twice (metadata title and document
# title). NOTE(review): the substitution is not visible in this part of the
# file; presumably the caller replaces it - confirm.
jsonTemplateDocument: str = """{
"metadata": {
"split_strategy": "single_document",
"source_documents": [],
"extraction_method": "ai_generation",
"title": "{{DOCUMENT_TITLE}}"
},
"documents": [
{
"id": "doc_1",
"title": "{{DOCUMENT_TITLE}}",
"filename": "document.json",
"sections": [
{
"id": "section_heading_main_title",
"content_type": "heading",
"complexity": "simple",
"generation_hint": "Main document title heading",
"order": 1,
"elements": []
},
{
"id": "section_paragraph_introduction",
"content_type": "paragraph",
"complexity": "simple",
"generation_hint": "Introduction paragraph",
"order": 2,
"elements": []
},
{
"id": "section_heading_section_1",
"content_type": "heading",
"complexity": "simple",
"generation_hint": "Section heading for topic 1",
"order": 3,
"elements": []
},
{
"id": "section_paragraph_section_1",
"content_type": "paragraph",
"complexity": "simple",
"generation_hint": "Content paragraph for section 1",
"order": 4,
"elements": []
},
{
"id": "section_bullet_list_example",
"content_type": "bullet_list",
"complexity": "simple",
"generation_hint": "Bullet list items",
"order": 5,
"elements": []
},
{
"id": "section_image_example",
"content_type": "image",
"complexity": "complex",
"generation_hint": "Illustration for document",
"image_prompt": "A detailed description for image generation",
"order": 6,
"elements": []
},
{
"id": "section_table_example",
"content_type": "table",
"complexity": "simple",
"generation_hint": "Data table with relevant information",
"order": 7,
"elements": []
},
{
"id": "section_code_example",
"content_type": "code_block",
"complexity": "simple",
"generation_hint": "Code example or snippet",
"order": 8,
"elements": []
},
{
"id": "section_paragraph_conclusion",
"content_type": "paragraph",
"complexity": "simple",
"generation_hint": "Conclusion paragraph",
"order": 9,
"elements": []
}
]
}
]
}"""