"""
JSON Schema definitions for AI-generated document structures.
This module provides schemas that guide AI to generate structured JSON output.
"""

from typing import Any, Dict, Optional

# Allowed values for a section's "content_type". Shared by the schema builders
# and by validate_json_document so the two cannot drift apart.
_SECTION_CONTENT_TYPES = (
    "table",
    "list",
    "paragraph",
    "heading",
    "code",
    "image",
    "mixed",
)

# Keys every section object must carry.
_SECTION_REQUIRED_FIELDS = ("id", "content_type", "elements", "order")

# Keys every entry of a multi-document "documents" array must carry.
_DOCUMENT_REQUIRED_FIELDS = ("id", "title", "sections", "filename")


def _common_metadata_properties() -> Dict[str, Any]:
    """Build the metadata properties shared by both schemas.

    A new dict is built on every call so callers may mutate the result
    without affecting later schema requests.
    """
    return {
        "author": {"type": "string", "description": "Document author (optional)"},
        "source_documents": {
            "type": "array",
            "items": {"type": "string"},
            "description": "List of source document IDs",
        },
        "extraction_method": {
            "type": "string",
            "default": "ai_extraction",
            "description": "Method used for extraction",
        },
    }


def _sections_schema() -> Dict[str, Any]:
    """Build the schema of a ``sections`` array (identical in both schemas)."""
    # NOTE(review): "image" is an allowed content_type, but no image element
    # definition exists in the ``oneOf`` list below - confirm this is intended.
    # NOTE(review): a heading object ({"text", "level"}) also satisfies the
    # ``paragraph`` definition (which only requires "text"), so a strict
    # JSON Schema validator would reject headings under ``oneOf``. Left as-is
    # because the schema is emitted unchanged for existing consumers.
    return {
        "type": "array",
        "description": "Document sections containing structured content",
        "items": {
            "type": "object",
            "required": list(_SECTION_REQUIRED_FIELDS),
            "properties": {
                "id": {"type": "string", "description": "Unique section identifier"},
                "title": {"type": "string", "description": "Section title (optional)"},
                "content_type": {
                    "type": "string",
                    "enum": list(_SECTION_CONTENT_TYPES),
                    "description": "Primary content type of this section",
                },
                "elements": {
                    "type": "array",
                    "description": "Content elements in this section",
                    "items": {
                        "oneOf": [
                            {"$ref": "#/definitions/table"},
                            {"$ref": "#/definitions/bullet_list"},
                            {"$ref": "#/definitions/paragraph"},
                            {"$ref": "#/definitions/heading"},
                            {"$ref": "#/definitions/code_block"},
                        ]
                    },
                },
                "order": {"type": "integer", "description": "Section order in document"},
                "metadata": {
                    "type": "object",
                    "description": "Additional section metadata",
                },
            },
        },
    }


def _list_item_schema() -> Dict[str, Any]:
    """Build the schema of one (possibly nested) list item."""
    return {
        "type": "object",
        "required": ["text"],
        "properties": {
            "text": {"type": "string", "description": "List item text"},
            "subitems": {
                "type": "array",
                "items": {"$ref": "#/definitions/list_item"},
                "description": "Nested sub-items (optional)",
            },
        },
    }


def _element_definitions() -> Dict[str, Any]:
    """Build the ``definitions`` table referenced by section elements."""
    return {
        "table": {
            "type": "object",
            "required": ["headers", "rows"],
            "properties": {
                "headers": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Table column headers",
                },
                "rows": {
                    "type": "array",
                    "items": {"type": "array", "items": {"type": "string"}},
                    "description": "Table data rows",
                },
                "caption": {
                    "type": "string",
                    "description": "Table caption (optional)",
                },
            },
        },
        "bullet_list": {
            "type": "object",
            "required": ["items"],
            "properties": {
                "items": {
                    "type": "array",
                    # Top-level items have the same shape as nested list items.
                    "items": _list_item_schema(),
                    "description": "List items",
                },
                "list_type": {
                    "type": "string",
                    "enum": ["bullet", "numbered", "checklist"],
                    "default": "bullet",
                    "description": "Type of list",
                },
            },
        },
        "list_item": _list_item_schema(),
        "paragraph": {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "Paragraph text"},
                "formatting": {
                    "type": "object",
                    "description": "Text formatting (bold, italic, etc.)",
                },
            },
        },
        "heading": {
            "type": "object",
            "required": ["text", "level"],
            "properties": {
                "text": {"type": "string", "description": "Heading text"},
                "level": {
                    "type": "integer",
                    "minimum": 1,
                    "maximum": 6,
                    "description": "Heading level (1-6)",
                },
            },
        },
        "code_block": {
            "type": "object",
            "required": ["code"],
            "properties": {
                "code": {"type": "string", "description": "Code content"},
                "language": {
                    "type": "string",
                    "description": "Programming language (optional)",
                },
            },
        },
    }


def get_multi_document_subJsonSchema() -> Dict[str, Any]:
    """Get the JSON schema for multi-document generation.

    Returns:
        A freshly built schema dict whose root requires ``metadata`` (with
        ``title`` and ``splitStrategy``) and a ``documents`` array; each
        document requires ``id``, ``title``, ``sections`` and ``filename``.
    """
    return {
        "type": "object",
        "required": ["metadata", "documents"],
        "properties": {
            "metadata": {
                "type": "object",
                "required": ["title", "splitStrategy"],
                "properties": {
                    "title": {"type": "string", "description": "Document title"},
                    "splitStrategy": {
                        "type": "string",
                        "enum": [
                            "per_entity",
                            "by_section",
                            "by_criteria",
                            "by_data_type",
                            "custom",
                        ],
                        "description": "Strategy for splitting content into multiple files",
                    },
                    "splitCriteria": {
                        "type": "object",
                        "description": "Custom criteria for splitting (e.g., entity_id, category, etc.)",
                    },
                    "fileNamingPattern": {
                        "type": "string",
                        "description": "Pattern for generating filenames (e.g., '{entity_name}_data.docx')",
                    },
                    **_common_metadata_properties(),
                },
            },
            "documents": {
                "type": "array",
                "description": "Array of individual documents to generate",
                "items": {
                    "type": "object",
                    "required": list(_DOCUMENT_REQUIRED_FIELDS),
                    "properties": {
                        "id": {"type": "string", "description": "Unique document identifier"},
                        "title": {"type": "string", "description": "Document title"},
                        "filename": {"type": "string", "description": "Generated filename"},
                        "sections": _sections_schema(),
                        "metadata": {
                            "type": "object",
                            "description": "Document-specific metadata",
                        },
                    },
                },
            },
        },
        "definitions": _element_definitions(),
    }


def get_document_subJsonSchema() -> Dict[str, Any]:
    """Get the JSON schema for structured document generation (single document).

    Returns:
        A freshly built schema dict whose root requires ``metadata`` (with
        ``title``) and ``sections``; ``summary`` and ``tags`` are optional.
    """
    return {
        "type": "object",
        "required": ["metadata", "sections"],
        "properties": {
            "metadata": {
                "type": "object",
                "required": ["title"],
                "properties": {
                    "title": {"type": "string", "description": "Document title"},
                    **_common_metadata_properties(),
                },
            },
            "sections": _sections_schema(),
            "summary": {
                "type": "string",
                "description": "Document summary (optional)",
            },
            "tags": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Document tags for categorization",
            },
        },
        "definitions": _element_definitions(),
    }


def get_extraction_prompt_template() -> str:
    """Get the template for AI extraction prompts that request JSON output."""
    return """
You are extracting structured content from documents. Your task is to analyze the provided content and generate a structured JSON document.

IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.

JSON Schema Requirements:
- Extract the actual data from the source documents
- If content is a table, extract it as a table with headers and rows
- If content is a list, extract it as a structured list with items
- If content is text, extract it as paragraphs or headings
- Preserve the original structure and data - do not summarize or interpret
- Use the exact JSON schema provided

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification

Return only the JSON structure following the schema. Do not include any text before or after the JSON.
"""


def get_generation_prompt_template() -> str:
    """Get the template for AI generation prompts that work with JSON input."""
    return """
You are generating a document from structured JSON data. Your task is to create a well-formatted document based on the provided structured content.

IMPORTANT: You must respond with valid JSON only, following the document schema.

Generation Guidelines:
- Use the provided JSON structure as the foundation
- Enhance the content with proper formatting and organization
- Ensure logical flow and readability
- Maintain the original data integrity
- Add appropriate headings and sections
- Organize content in a logical sequence

Content Enhancement:
- Tables: Ensure proper headers and data alignment
- Lists: Use appropriate list types (bullet, numbered, checklist)
- Headings: Use appropriate heading levels for hierarchy
- Paragraphs: Ensure proper text flow and formatting
- Code: Preserve code blocks with proper language identification

Return only the enhanced JSON structure following the schema. Do not include any text before or after the JSON.
"""


def get_adaptive_json_schema(
    prompt_analysis: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Automatically select appropriate schema based on prompt analysis.

    Args:
        prompt_analysis: Optional analysis dict. Only its ``"is_multi_file"``
            entry is read here.

    Returns:
        The multi-document schema when ``prompt_analysis`` is non-empty and
        its ``"is_multi_file"`` value is truthy; otherwise the
        single-document schema.
    """
    if prompt_analysis and prompt_analysis.get("is_multi_file", False):
        return get_multi_document_subJsonSchema()
    return get_document_subJsonSchema()


def _has_keys(candidate: Any, keys) -> bool:
    """Return True when *candidate* is a dict containing every key in *keys*."""
    return isinstance(candidate, dict) and all(key in candidate for key in keys)


def _sections_are_valid(sections: Any) -> bool:
    """Return True when *sections* is a list of structurally valid sections.

    A section is accepted when it is a dict with all required keys, its
    ``content_type`` is one of the known types and its ``elements`` is a
    list. The contents of ``elements`` are not inspected.
    """
    if not isinstance(sections, list):
        return False
    for section in sections:
        if not _has_keys(section, _SECTION_REQUIRED_FIELDS):
            return False
        # Tuple membership compares by equality, so an unhashable value
        # (e.g. a list) is simply "not found" instead of raising.
        if section["content_type"] not in _SECTION_CONTENT_TYPES:
            return False
        if not isinstance(section["elements"], list):
            return False
    return True


def validate_json_document(json_data: Dict[str, Any]) -> bool:
    """Validate that the JSON data follows the document schema.

    This is a lightweight structural check (required keys and container
    types), not a full JSON Schema validation.

    Args:
        json_data: Parsed JSON. A top-level ``"documents"`` key selects the
            multi-document rules; otherwise a ``"sections"`` key selects the
            single-document rules.

    Returns:
        True when the structure passes the checks, False otherwise. Any
        unexpected error during checking is also reported as False.
    """
    try:
        if not isinstance(json_data, dict):
            return False

        if "documents" in json_data:
            # Multi-document structure
            if not _has_keys(json_data.get("metadata"), ("title", "splitStrategy")):
                return False
            documents = json_data["documents"]
            if not isinstance(documents, list):
                return False
            return all(
                _has_keys(doc, _DOCUMENT_REQUIRED_FIELDS)
                and _sections_are_valid(doc["sections"])
                for doc in documents
            )

        if "sections" in json_data:
            # Single-document structure
            if not _has_keys(json_data.get("metadata"), ("title",)):
                return False
            return _sections_are_valid(json_data["sections"])

        return False
    except Exception:
        # Callers rely on a boolean verdict; the validator must never raise
        # on malformed or exotic input.
        return False