517 lines
22 KiB
Python
517 lines
22 KiB
Python
"""
JSON Schema definitions for AI-generated document structures.

This module provides schemas that guide AI to generate structured JSON output.
"""
|
|
|
|
from typing import Any, Dict, Optional
|
|
|
|
|
|
def get_multi_document_subJsonSchema() -> Dict[str, Any]:
    """Return the JSON schema for multi-document generation.

    The schema describes a top-level object with two required keys:

    * ``metadata`` - requires ``title`` and ``split_strategy``; may also carry
      ``splitCriteria``, ``fileNamingPattern``, ``author``,
      ``source_documents`` and ``extraction_method``.
    * ``documents`` - an array of documents, each requiring ``id``, ``title``,
      ``sections`` and ``filename``.

    Section elements point at the reusable shapes listed under
    ``definitions`` (table, bullet_list, paragraph, heading, code_block).

    Returns:
        A schema dictionary that is built anew on every call.
    """
    # ``anyOf`` rather than ``oneOf``: none of the definitions forbids
    # additional properties, so a heading object (``text`` + ``level``) also
    # satisfies ``paragraph`` (which only requires ``text``). ``oneOf`` demands
    # exactly one matching sub-schema and would therefore reject every heading.
    element_schema = {
        "anyOf": [
            {"$ref": "#/definitions/table"},
            {"$ref": "#/definitions/bullet_list"},
            {"$ref": "#/definitions/paragraph"},
            {"$ref": "#/definitions/heading"},
            {"$ref": "#/definitions/code_block"},
        ]
    }

    # One section of a document.
    section_schema = {
        "type": "object",
        "required": ["id", "content_type", "elements", "order"],
        "properties": {
            "id": {"type": "string", "description": "Unique section identifier"},
            "title": {"type": "string", "description": "Section title (optional)"},
            "content_type": {
                "type": "string",
                "enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"],
                "description": "Primary content type of this section",
            },
            "elements": {
                "type": "array",
                "description": "Content elements in this section",
                "items": element_schema,
            },
            "order": {"type": "integer", "description": "Section order in document"},
            "metadata": {
                "type": "object",
                "description": "Additional section metadata",
            },
        },
    }

    # One entry of the top-level "documents" array.
    document_schema = {
        "type": "object",
        "required": ["id", "title", "sections", "filename"],
        "properties": {
            "id": {"type": "string", "description": "Unique document identifier"},
            "title": {"type": "string", "description": "Document title"},
            "filename": {"type": "string", "description": "Generated filename"},
            "sections": {
                "type": "array",
                "description": "Document sections containing structured content",
                "items": section_schema,
            },
            "metadata": {
                "type": "object",
                "description": "Document-specific metadata",
            },
        },
    }

    # Metadata shared by the whole multi-document result.
    metadata_schema = {
        "type": "object",
        "required": ["title", "split_strategy"],
        "properties": {
            "title": {"type": "string", "description": "Document title"},
            "split_strategy": {
                "type": "string",
                "enum": ["per_entity", "by_section", "by_criteria", "by_data_type", "custom"],
                "description": "Strategy for splitting content into multiple files",
            },
            "splitCriteria": {
                "type": "object",
                "description": "Custom criteria for splitting (e.g., entity_id, category, etc.)",
            },
            "fileNamingPattern": {
                "type": "string",
                "description": "Pattern for generating filenames (e.g., '{entity_name}_data.docx')",
            },
            "author": {"type": "string", "description": "Document author (optional)"},
            "source_documents": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of source document IDs",
            },
            "extraction_method": {
                "type": "string",
                "default": "ai_extraction",
                "description": "Method used for extraction",
            },
        },
    }

    # Reusable element shapes referenced through "#/definitions/<name>".
    definitions = {
        "table": {
            "type": "object",
            "required": ["headers", "rows"],
            "properties": {
                "headers": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Table column headers",
                },
                "rows": {
                    "type": "array",
                    "items": {
                        "type": "array",
                        "items": {"type": "string"},
                    },
                    "description": "Table data rows",
                },
                "caption": {
                    "type": "string",
                    "description": "Table caption (optional)",
                },
            },
        },
        "bullet_list": {
            "type": "object",
            "required": ["items"],
            "properties": {
                "items": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "required": ["text"],
                        "properties": {
                            "text": {"type": "string", "description": "List item text"},
                            "subitems": {
                                "type": "array",
                                "items": {"$ref": "#/definitions/list_item"},
                                "description": "Nested sub-items (optional)",
                            },
                        },
                    },
                    "description": "List items",
                },
                "list_type": {
                    "type": "string",
                    "enum": ["bullet", "numbered", "checklist"],
                    "default": "bullet",
                    "description": "Type of list",
                },
            },
        },
        # Recursive: a list item may nest further list items.
        "list_item": {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "List item text"},
                "subitems": {
                    "type": "array",
                    "items": {"$ref": "#/definitions/list_item"},
                    "description": "Nested sub-items (optional)",
                },
            },
        },
        "paragraph": {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "Paragraph text"},
                "formatting": {
                    "type": "object",
                    "description": "Text formatting (bold, italic, etc.)",
                },
            },
        },
        "heading": {
            "type": "object",
            "required": ["text", "level"],
            "properties": {
                "text": {"type": "string", "description": "Heading text"},
                "level": {
                    "type": "integer",
                    "minimum": 1,
                    "maximum": 6,
                    "description": "Heading level (1-6)",
                },
            },
        },
        "code_block": {
            "type": "object",
            "required": ["code"],
            "properties": {
                "code": {"type": "string", "description": "Code content"},
                "language": {"type": "string", "description": "Programming language (optional)"},
            },
        },
    }

    return {
        "type": "object",
        "required": ["metadata", "documents"],
        "properties": {
            "metadata": metadata_schema,
            "documents": {
                "type": "array",
                "description": "Array of individual documents to generate",
                "items": document_schema,
            },
        },
        "definitions": definitions,
    }
|
|
|
|
def get_document_subJsonSchema() -> Dict[str, Any]:
    """Return the JSON schema for structured single-document generation.

    The schema describes a top-level object that requires ``metadata`` (with a
    mandatory ``title``) and ``sections``; ``summary`` and ``tags`` are
    optional. Section elements point at the reusable shapes listed under
    ``definitions`` (table, bullet_list, paragraph, heading, code_block).

    Returns:
        A schema dictionary that is built anew on every call.
    """
    # ``anyOf`` rather than ``oneOf``: none of the definitions forbids
    # additional properties, so a heading object (``text`` + ``level``) also
    # satisfies ``paragraph`` (which only requires ``text``). ``oneOf`` demands
    # exactly one matching sub-schema and would therefore reject every heading.
    element_schema = {
        "anyOf": [
            {"$ref": "#/definitions/table"},
            {"$ref": "#/definitions/bullet_list"},
            {"$ref": "#/definitions/paragraph"},
            {"$ref": "#/definitions/heading"},
            {"$ref": "#/definitions/code_block"},
        ]
    }

    # Document-level metadata; only the title is mandatory.
    metadata_schema = {
        "type": "object",
        "required": ["title"],
        "properties": {
            "title": {"type": "string", "description": "Document title"},
            "author": {"type": "string", "description": "Document author (optional)"},
            "source_documents": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of source document IDs",
            },
            "extraction_method": {
                "type": "string",
                "default": "ai_extraction",
                "description": "Method used for extraction",
            },
        },
    }

    # The ordered list of sections that make up the document.
    sections_schema = {
        "type": "array",
        "description": "Document sections containing structured content",
        "items": {
            "type": "object",
            "required": ["id", "content_type", "elements", "order"],
            "properties": {
                "id": {"type": "string", "description": "Unique section identifier"},
                "title": {"type": "string", "description": "Section title (optional)"},
                "content_type": {
                    "type": "string",
                    "enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"],
                    "description": "Primary content type of this section",
                },
                "elements": {
                    "type": "array",
                    "description": "Content elements in this section",
                    "items": element_schema,
                },
                "order": {"type": "integer", "description": "Section order in document"},
                "metadata": {
                    "type": "object",
                    "description": "Additional section metadata",
                },
            },
        },
    }

    # Reusable element shapes referenced through "#/definitions/<name>".
    definitions = {
        "table": {
            "type": "object",
            "required": ["headers", "rows"],
            "properties": {
                "headers": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Table column headers",
                },
                "rows": {
                    "type": "array",
                    "items": {
                        "type": "array",
                        "items": {"type": "string"},
                    },
                    "description": "Table data rows",
                },
                "caption": {
                    "type": "string",
                    "description": "Table caption (optional)",
                },
            },
        },
        "bullet_list": {
            "type": "object",
            "required": ["items"],
            "properties": {
                "items": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "required": ["text"],
                        "properties": {
                            "text": {"type": "string", "description": "List item text"},
                            "subitems": {
                                "type": "array",
                                "items": {"$ref": "#/definitions/list_item"},
                                "description": "Nested sub-items (optional)",
                            },
                        },
                    },
                    "description": "List items",
                },
                "list_type": {
                    "type": "string",
                    "enum": ["bullet", "numbered", "checklist"],
                    "default": "bullet",
                    "description": "Type of list",
                },
            },
        },
        # Recursive: a list item may nest further list items.
        "list_item": {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "List item text"},
                "subitems": {
                    "type": "array",
                    "items": {"$ref": "#/definitions/list_item"},
                    "description": "Nested sub-items (optional)",
                },
            },
        },
        "paragraph": {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "Paragraph text"},
                "formatting": {
                    "type": "object",
                    "description": "Text formatting (bold, italic, etc.)",
                },
            },
        },
        "heading": {
            "type": "object",
            "required": ["text", "level"],
            "properties": {
                "text": {"type": "string", "description": "Heading text"},
                "level": {
                    "type": "integer",
                    "minimum": 1,
                    "maximum": 6,
                    "description": "Heading level (1-6)",
                },
            },
        },
        "code_block": {
            "type": "object",
            "required": ["code"],
            "properties": {
                "code": {"type": "string", "description": "Code content"},
                "language": {"type": "string", "description": "Programming language (optional)"},
            },
        },
    }

    return {
        "type": "object",
        "required": ["metadata", "sections"],
        "properties": {
            "metadata": metadata_schema,
            "sections": sections_schema,
            "summary": {
                "type": "string",
                "description": "Document summary (optional)",
            },
            "tags": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Document tags for categorization",
            },
        },
        "definitions": definitions,
    }
|
|
|
|
|
|
def get_extraction_prompt_template() -> str:
    """Return the prompt text that asks the AI to extract content as JSON.

    The text opens and closes with a newline; its paragraphs are separated by
    blank lines. It contains no placeholders.
    """
    # Each tuple entry is one line of the prompt; empty strings are the blank
    # lines (including the leading and trailing newline of the template).
    prompt_lines = (
        "",
        "You are extracting structured content from documents. Your task is to analyze the provided content and generate a structured JSON document.",
        "",
        "IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.",
        "",
        "JSON Schema Requirements:",
        "- Extract the actual data from the source documents",
        "- If content is a table, extract it as a table with headers and rows",
        "- If content is a list, extract it as a structured list with items",
        "- If content is text, extract it as paragraphs or headings",
        "- Preserve the original structure and data - do not summarize or interpret",
        "- Use the exact JSON schema provided",
        "",
        "Content Types to Extract:",
        "1. Tables: Extract all rows and columns with proper headers",
        "2. Lists: Extract all items with proper nesting",
        "3. Headings: Extract with appropriate levels",
        "4. Paragraphs: Extract as structured text",
        "5. Code: Extract code blocks with language identification",
        "",
        "Return only the JSON structure following the schema. Do not include any text before or after the JSON.",
        "",
    )
    return "\n".join(prompt_lines)
|
|
|
|
|
|
def get_generation_prompt_template() -> str:
    """Return the prompt text that asks the AI to generate a document from JSON.

    The text opens and closes with a newline; its paragraphs are separated by
    blank lines. It contains no placeholders.
    """
    # Each tuple entry is one line of the prompt; empty strings are the blank
    # lines (including the leading and trailing newline of the template).
    prompt_lines = (
        "",
        "You are generating a document from structured JSON data. Your task is to create a well-formatted document based on the provided structured content.",
        "",
        "IMPORTANT: You must respond with valid JSON only, following the document schema.",
        "",
        "Generation Guidelines:",
        "- Use the provided JSON structure as the foundation",
        "- Enhance the content with proper formatting and organization",
        "- Ensure logical flow and readability",
        "- Maintain the original data integrity",
        "- Add appropriate headings and sections",
        "- Organize content in a logical sequence",
        "",
        "Content Enhancement:",
        "- Tables: Ensure proper headers and data alignment",
        "- Lists: Use appropriate list types (bullet, numbered, checklist)",
        "- Headings: Use appropriate heading levels for hierarchy",
        "- Paragraphs: Ensure proper text flow and formatting",
        "- Code: Preserve code blocks with proper language identification",
        "",
        "Return only the enhanced JSON structure following the schema. Do not include any text before or after the JSON.",
        "",
    )
    return "\n".join(prompt_lines)
|
|
|
|
|
|
def get_adaptive_json_schema(prompt_analysis: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Select the schema that matches the analysed prompt.

    Args:
        prompt_analysis: Optional analysis result. Only its ``"is_multi_file"``
            entry is read; a missing entry counts as ``False``.

    Returns:
        The multi-document schema when ``prompt_analysis`` is non-empty and
        its ``"is_multi_file"`` value is truthy; otherwise (including ``None``
        or an empty dict) the single-document schema.
    """
    if prompt_analysis and prompt_analysis.get("is_multi_file", False):
        return get_multi_document_subJsonSchema()
    return get_document_subJsonSchema()
|
|
|
|
def validate_json_document(json_data: Dict[str, Any]) -> bool:
    """Check that ``json_data`` has the basic shape of a generated document.

    Two layouts are recognised, chosen by which top-level key is present:

    * ``"documents"`` - multi-document layout. ``metadata`` must be a dict
      holding ``title`` and ``split_strategy``; ``documents`` must be a list
      of dicts, each with ``id``, ``title``, ``sections`` and ``filename``.
    * ``"sections"`` - single-document layout. ``metadata`` must be a dict
      holding ``title``.

    In both layouts every section list is checked structurally: each section
    must be a dict with ``id``, ``content_type``, ``elements`` and ``order``,
    a known ``content_type`` and a list for ``elements``. Only presence and
    container types are checked, not the element contents.

    Returns:
        ``True`` when all checks pass, ``False`` otherwise. Any exception
        raised during the checks is swallowed and reported as ``False``.
    """
    try:
        if not isinstance(json_data, dict):
            return False

        if "documents" in json_data:
            # Multi-document layout.
            if "metadata" not in json_data:
                return False
            header = json_data["metadata"]
            if not isinstance(header, dict):
                return False
            if not ("title" in header and "split_strategy" in header):
                return False

            doc_list = json_data["documents"]
            if not isinstance(doc_list, list):
                return False

            for entry in doc_list:
                if not isinstance(entry, dict):
                    return False
                if any(key not in entry for key in ("id", "title", "sections", "filename")):
                    return False
                if not _section_list_is_valid(entry.get("sections", [])):
                    return False
            return True

        if "sections" in json_data:
            # Single-document layout.
            if "metadata" not in json_data:
                return False
            header = json_data["metadata"]
            if not (isinstance(header, dict) and "title" in header):
                return False
            return _section_list_is_valid(json_data["sections"])

        # Neither layout marker is present.
        return False
    except Exception:
        return False


def _section_list_is_valid(candidate: Any) -> bool:
    """Return True when ``candidate`` is a list of structurally valid sections."""
    if not isinstance(candidate, list):
        return False

    needed_keys = ("id", "content_type", "elements", "order")
    # Kept as a tuple (not a set) so an unhashable content_type simply fails
    # the membership test instead of raising.
    known_types = ("table", "list", "paragraph", "heading", "code", "image", "mixed")

    for part in candidate:
        if not isinstance(part, dict):
            return False
        if any(key not in part for key in needed_keys):
            return False
        if part["content_type"] not in known_types:
            return False
        if not isinstance(part["elements"], list):
            return False
    return True
|