270 lines
11 KiB
Python
270 lines
11 KiB
Python
"""
|
|
JSON Schema definitions for AI-generated document structures.

This module provides the document schema, prompt templates, and a basic validator that guide AI to generate structured JSON output.
|
|
"""
|
|
|
|
from typing import Dict, Any
|
|
|
|
|
|
def get_document_subJsonSchema() -> Dict[str, Any]:
    """Get the JSON schema for structured document generation.

    The schema describes a document object with two required keys,
    ``metadata`` (which must contain a ``title``) and ``sections``, plus the
    optional ``summary`` and ``tags``. Every section requires ``id``,
    ``content_type``, ``elements`` and ``order``; the allowed element shapes
    live under ``definitions`` (``table``, ``bullet_list``, ``list_item``,
    ``paragraph``, ``heading`` and ``code_block``).

    Returns:
        A schema dictionary built fresh on every call, so a caller may
        modify the result without affecting later calls.
    """
    return {
        "type": "object",
        "required": ["metadata", "sections"],
        "properties": {
            "metadata": {
                "type": "object",
                "required": ["title"],
                "properties": {
                    "title": {"type": "string", "description": "Document title"},
                    "author": {"type": "string", "description": "Document author (optional)"},
                    "source_documents": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of source document IDs"
                    },
                    "extraction_method": {
                        "type": "string",
                        "default": "ai_extraction",
                        "description": "Method used for extraction"
                    }
                }
            },
            "sections": {
                "type": "array",
                "description": "Document sections containing structured content",
                "items": {
                    "type": "object",
                    "required": ["id", "content_type", "elements", "order"],
                    "properties": {
                        "id": {"type": "string", "description": "Unique section identifier"},
                        "title": {"type": "string", "description": "Section title (optional)"},
                        "content_type": {
                            "type": "string",
                            "enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"],
                            "description": "Primary content type of this section"
                        },
                        "elements": {
                            "type": "array",
                            "description": "Content elements in this section",
                            "items": {
                                # anyOf, not oneOf: none of the element
                                # definitions forbids additional properties,
                                # so a heading ({"text", "level"}) also
                                # satisfies "paragraph", which only requires
                                # "text". oneOf needs exactly one match and
                                # would therefore reject every heading.
                                "anyOf": [
                                    {"$ref": "#/definitions/table"},
                                    {"$ref": "#/definitions/bullet_list"},
                                    {"$ref": "#/definitions/paragraph"},
                                    {"$ref": "#/definitions/heading"},
                                    {"$ref": "#/definitions/code_block"}
                                ]
                            }
                        },
                        "order": {"type": "integer", "description": "Section order in document"},
                        "metadata": {
                            "type": "object",
                            "description": "Additional section metadata"
                        }
                    }
                }
            },
            "summary": {
                "type": "string",
                "description": "Document summary (optional)"
            },
            "tags": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Document tags for categorization"
            }
        },
        "definitions": {
            "table": {
                "type": "object",
                "required": ["headers", "rows"],
                "properties": {
                    "headers": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Table column headers"
                    },
                    "rows": {
                        "type": "array",
                        "items": {
                            "type": "array",
                            "items": {"type": "string"}
                        },
                        "description": "Table data rows"
                    },
                    "caption": {
                        "type": "string",
                        "description": "Table caption (optional)"
                    }
                }
            },
            "bullet_list": {
                "type": "object",
                "required": ["items"],
                "properties": {
                    "items": {
                        "type": "array",
                        # Top-level items are spelled out inline; nested
                        # sub-items recurse through definitions/list_item.
                        "items": {
                            "type": "object",
                            "required": ["text"],
                            "properties": {
                                "text": {"type": "string", "description": "List item text"},
                                "subitems": {
                                    "type": "array",
                                    "items": {"$ref": "#/definitions/list_item"},
                                    "description": "Nested sub-items (optional)"
                                }
                            }
                        },
                        "description": "List items"
                    },
                    "list_type": {
                        "type": "string",
                        "enum": ["bullet", "numbered", "checklist"],
                        "default": "bullet",
                        "description": "Type of list"
                    }
                }
            },
            "list_item": {
                "type": "object",
                "required": ["text"],
                "properties": {
                    "text": {"type": "string", "description": "List item text"},
                    "subitems": {
                        "type": "array",
                        "items": {"$ref": "#/definitions/list_item"},
                        "description": "Nested sub-items (optional)"
                    }
                }
            },
            "paragraph": {
                "type": "object",
                "required": ["text"],
                "properties": {
                    "text": {"type": "string", "description": "Paragraph text"},
                    "formatting": {
                        "type": "object",
                        "description": "Text formatting (bold, italic, etc.)"
                    }
                }
            },
            "heading": {
                "type": "object",
                "required": ["text", "level"],
                "properties": {
                    "text": {"type": "string", "description": "Heading text"},
                    "level": {
                        "type": "integer",
                        "minimum": 1,
                        "maximum": 6,
                        "description": "Heading level (1-6)"
                    }
                }
            },
            "code_block": {
                "type": "object",
                "required": ["code"],
                "properties": {
                    "code": {"type": "string", "description": "Code content"},
                    "language": {"type": "string", "description": "Programming language (optional)"}
                }
            }
        }
    }
|
|
|
|
|
|
def get_extraction_prompt_template() -> str:
    """Return the fixed prompt text that asks the AI to extract content as JSON.

    The text tells the model to answer with JSON only, lists the extraction
    rules and enumerates the content types to extract (tables, lists,
    headings, paragraphs, code). It contains no placeholders, and the schema
    itself is not embedded in it.

    Returns:
        The prompt as a single string that begins and ends with a newline.
    """
    # The empty first and last entries reproduce the leading and trailing
    # newline of the prompt; the other empty entries are blank separator lines.
    prompt_lines = (
        "",
        "You are extracting structured content from documents. Your task is to analyze the provided content and generate a structured JSON document.",
        "",
        "IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.",
        "",
        "JSON Schema Requirements:",
        "- Extract the actual data from the source documents",
        "- If content is a table, extract it as a table with headers and rows",
        "- If content is a list, extract it as a structured list with items",
        "- If content is text, extract it as paragraphs or headings",
        "- Preserve the original structure and data - do not summarize or interpret",
        "- Use the exact JSON schema provided",
        "",
        "Content Types to Extract:",
        "1. Tables: Extract all rows and columns with proper headers",
        "2. Lists: Extract all items with proper nesting",
        "3. Headings: Extract with appropriate levels",
        "4. Paragraphs: Extract as structured text",
        "5. Code: Extract code blocks with language identification",
        "",
        "Return only the JSON structure following the schema. Do not include any text before or after the JSON.",
        "",
    )
    return "\n".join(prompt_lines)
|
|
|
|
|
|
def get_generation_prompt_template() -> str:
    """Return the fixed prompt text for generating a document from JSON input.

    The text tells the model to build on the supplied JSON structure, lists
    generation guidelines and per-content-type enhancement hints, and asks
    for a JSON-only reply. It contains no placeholders.

    Returns:
        The prompt as a single string that begins and ends with a newline.
    """
    # The empty first and last entries reproduce the leading and trailing
    # newline of the prompt; the other empty entries are blank separator lines.
    prompt_lines = (
        "",
        "You are generating a document from structured JSON data. Your task is to create a well-formatted document based on the provided structured content.",
        "",
        "IMPORTANT: You must respond with valid JSON only, following the document schema.",
        "",
        "Generation Guidelines:",
        "- Use the provided JSON structure as the foundation",
        "- Enhance the content with proper formatting and organization",
        "- Ensure logical flow and readability",
        "- Maintain the original data integrity",
        "- Add appropriate headings and sections",
        "- Organize content in a logical sequence",
        "",
        "Content Enhancement:",
        "- Tables: Ensure proper headers and data alignment",
        "- Lists: Use appropriate list types (bullet, numbered, checklist)",
        "- Headings: Use appropriate heading levels for hierarchy",
        "- Paragraphs: Ensure proper text flow and formatting",
        "- Code: Preserve code blocks with proper language identification",
        "",
        "Return only the enhanced JSON structure following the schema. Do not include any text before or after the JSON.",
        "",
    )
    return "\n".join(prompt_lines)
|
|
|
|
|
|
def validate_json_document(json_data: Dict[str, Any]) -> bool:
    """Run a basic structural check of a document against the expected layout.

    Only the presence and container types of the required parts are checked;
    element contents and field value types (other than ``content_type``) are
    not inspected.

    Args:
        json_data: The candidate document.

    Returns:
        True when ``json_data`` is a dict with a ``metadata`` dict containing
        ``title`` and a ``sections`` list in which every entry is a dict that
        has ``id``, ``content_type``, ``elements`` and ``order``, an allowed
        ``content_type`` and a list for ``elements``. False otherwise,
        including when any unexpected error occurs during the check.
    """
    section_keys = ("id", "content_type", "elements", "order")
    allowed_content_types = ("table", "list", "paragraph", "heading", "code", "image", "mixed")

    def section_is_valid(candidate: Any) -> bool:
        # Checks run left to right and stop at the first failure, so the
        # subscripts are only reached once the keys are known to exist.
        return (
            isinstance(candidate, dict)
            and all(key in candidate for key in section_keys)
            and candidate["content_type"] in allowed_content_types
            and isinstance(candidate["elements"], list)
        )

    try:
        if not isinstance(json_data, dict):
            return False

        if not ("metadata" in json_data and "sections" in json_data):
            return False

        doc_meta = json_data["metadata"]
        if not (isinstance(doc_meta, dict) and "title" in doc_meta):
            return False

        doc_sections = json_data["sections"]
        if not isinstance(doc_sections, list):
            return False

        return all(section_is_valid(entry) for entry in doc_sections)
    except Exception:
        # Best-effort validator: any unexpected failure counts as "invalid".
        return False
|