"""
JSON Schema definitions for AI-generated document structures.

This module provides schemas that guide AI to generate structured JSON output.
"""

from typing import Any, Dict

# Single source of truth shared by the schema and the lightweight validator,
# so the two cannot drift apart.
_SECTION_CONTENT_TYPES = (
    "table",
    "list",
    "paragraph",
    "heading",
    "code",
    "image",
    "mixed",
)
_REQUIRED_SECTION_FIELDS = ("id", "content_type", "elements", "order")


def get_document_subJsonSchema() -> Dict[str, Any]:
    """Get the JSON schema for structured document generation.

    Returns:
        A freshly built schema dictionary (safe for the caller to mutate).
        The top-level object requires ``metadata`` (with a ``title``) and
        ``sections``; each section requires ``id``, ``content_type``,
        ``elements`` and ``order``. Element shapes live under
        ``definitions`` and are referenced via ``$ref``.
    """
    return {
        "type": "object",
        "required": ["metadata", "sections"],
        "properties": {
            "metadata": {
                "type": "object",
                "required": ["title"],
                "properties": {
                    "title": {"type": "string", "description": "Document title"},
                    "author": {"type": "string", "description": "Document author (optional)"},
                    "source_documents": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of source document IDs",
                    },
                    "extraction_method": {
                        "type": "string",
                        "default": "ai_extraction",
                        "description": "Method used for extraction",
                    },
                },
            },
            "sections": {
                "type": "array",
                "description": "Document sections containing structured content",
                "items": {
                    "type": "object",
                    "required": list(_REQUIRED_SECTION_FIELDS),
                    "properties": {
                        "id": {"type": "string", "description": "Unique section identifier"},
                        "title": {"type": "string", "description": "Section title (optional)"},
                        "content_type": {
                            "type": "string",
                            "enum": list(_SECTION_CONTENT_TYPES),
                            "description": "Primary content type of this section",
                        },
                        "elements": {
                            "type": "array",
                            "description": "Content elements in this section",
                            "items": {
                                # anyOf, not oneOf: the element shapes are not
                                # mutually exclusive. A heading (text + level)
                                # also satisfies "paragraph" (which only
                                # requires text), so oneOf would reject every
                                # valid heading.
                                "anyOf": [
                                    {"$ref": "#/definitions/table"},
                                    {"$ref": "#/definitions/bullet_list"},
                                    {"$ref": "#/definitions/paragraph"},
                                    {"$ref": "#/definitions/heading"},
                                    {"$ref": "#/definitions/code_block"},
                                ]
                            },
                        },
                        "order": {"type": "integer", "description": "Section order in document"},
                        "metadata": {
                            "type": "object",
                            "description": "Additional section metadata",
                        },
                    },
                },
            },
            "summary": {
                "type": "string",
                "description": "Document summary (optional)",
            },
            "tags": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Document tags for categorization",
            },
        },
        "definitions": {
            "table": {
                "type": "object",
                "required": ["headers", "rows"],
                "properties": {
                    "headers": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Table column headers",
                    },
                    "rows": {
                        "type": "array",
                        "items": {
                            "type": "array",
                            "items": {"type": "string"},
                        },
                        "description": "Table data rows",
                    },
                    "caption": {
                        "type": "string",
                        "description": "Table caption (optional)",
                    },
                },
            },
            "bullet_list": {
                "type": "object",
                "required": ["items"],
                "properties": {
                    "items": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "required": ["text"],
                            "properties": {
                                "text": {"type": "string", "description": "List item text"},
                                "subitems": {
                                    "type": "array",
                                    "items": {"$ref": "#/definitions/list_item"},
                                    "description": "Nested sub-items (optional)",
                                },
                            },
                        },
                        "description": "List items",
                    },
                    "list_type": {
                        "type": "string",
                        "enum": ["bullet", "numbered", "checklist"],
                        "default": "bullet",
                        "description": "Type of list",
                    },
                },
            },
            # Recursive definition: sub-items may themselves carry sub-items.
            "list_item": {
                "type": "object",
                "required": ["text"],
                "properties": {
                    "text": {"type": "string", "description": "List item text"},
                    "subitems": {
                        "type": "array",
                        "items": {"$ref": "#/definitions/list_item"},
                        "description": "Nested sub-items (optional)",
                    },
                },
            },
            "paragraph": {
                "type": "object",
                "required": ["text"],
                "properties": {
                    "text": {"type": "string", "description": "Paragraph text"},
                    "formatting": {
                        "type": "object",
                        "description": "Text formatting (bold, italic, etc.)",
                    },
                },
            },
            "heading": {
                "type": "object",
                "required": ["text", "level"],
                "properties": {
                    "text": {"type": "string", "description": "Heading text"},
                    "level": {
                        "type": "integer",
                        "minimum": 1,
                        "maximum": 6,
                        "description": "Heading level (1-6)",
                    },
                },
            },
            "code_block": {
                "type": "object",
                "required": ["code"],
                "properties": {
                    "code": {"type": "string", "description": "Code content"},
                    "language": {"type": "string", "description": "Programming language (optional)"},
                },
            },
        },
    }


def get_extraction_prompt_template() -> str:
    """Get the template for AI extraction prompts that request JSON output.

    Returns:
        The static instruction text. It takes no placeholders; the schema
        itself is not embedded in the returned string.
    """
    return """
You are extracting structured content from documents. Your task is to analyze the provided content and generate a structured JSON document.

IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.

JSON Schema Requirements:
- Extract the actual data from the source documents
- If content is a table, extract it as a table with headers and rows
- If content is a list, extract it as a structured list with items
- If content is text, extract it as paragraphs or headings
- Preserve the original structure and data - do not summarize or interpret
- Use the exact JSON schema provided

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification

Return only the JSON structure following the schema. Do not include any text before or after the JSON.
"""


def get_generation_prompt_template() -> str:
    """Get the template for AI generation prompts that work with JSON input.

    Returns:
        The static instruction text. It takes no placeholders.
    """
    return """
You are generating a document from structured JSON data. Your task is to create a well-formatted document based on the provided structured content.

IMPORTANT: You must respond with valid JSON only, following the document schema.

Generation Guidelines:
- Use the provided JSON structure as the foundation
- Enhance the content with proper formatting and organization
- Ensure logical flow and readability
- Maintain the original data integrity
- Add appropriate headings and sections
- Organize content in a logical sequence

Content Enhancement:
- Tables: Ensure proper headers and data alignment
- Lists: Use appropriate list types (bullet, numbered, checklist)
- Headings: Use appropriate heading levels for hierarchy
- Paragraphs: Ensure proper text flow and formatting
- Code: Preserve code blocks with proper language identification

Return only the enhanced JSON structure following the schema. Do not include any text before or after the JSON.
"""


def _is_valid_section(section: Any) -> bool:
    """Return True when *section* has the required keys and basic shapes."""
    if not isinstance(section, dict):
        return False
    if any(field not in section for field in _REQUIRED_SECTION_FIELDS):
        return False
    # Tuple membership compares by equality, so an unhashable value
    # (e.g. a list) is simply rejected rather than raising.
    if section["content_type"] not in _SECTION_CONTENT_TYPES:
        return False
    return isinstance(section["elements"], list)


def validate_json_document(json_data: Dict[str, Any]) -> bool:
    """Validate that the JSON data follows the document schema.

    This is a shallow structural check, not full JSON Schema validation:
    it verifies the presence of required keys, the container types of
    ``metadata``, ``sections`` and each section's ``elements``, and that
    ``content_type`` is one of the known values. Element contents and the
    types of ``id``/``order``/``title`` are not inspected.

    Args:
        json_data: The decoded JSON document. Any non-dict value is
            rejected rather than raising.

    Returns:
        True if the document passes the checks above, False otherwise.
        The checks consist only of ``isinstance`` and membership tests, so
        no exception handling is needed for JSON-decoded input.
    """
    if not isinstance(json_data, dict):
        return False

    metadata = json_data.get("metadata")
    if not isinstance(metadata, dict) or "title" not in metadata:
        return False

    sections = json_data.get("sections")
    if not isinstance(sections, list):
        return False

    return all(_is_valid_section(section) for section in sections)