gateway/modules/services/serviceGeneration/subJsonSchema.py
2025-10-28 00:14:24 +01:00

517 lines
22 KiB
Python

"""
JSON Schema definitions for AI-generated document structures.
This module provides schemas that guide AI to generate structured JSON output.
"""
from typing import Dict, Any
def get_multi_document_subJsonSchema() -> Dict[str, Any]:
    """Get the JSON schema for multi-document generation.

    The schema describes an object with a top-level ``metadata`` object
    (``title`` and ``split_strategy`` required) and a ``documents`` array.
    Each document requires ``id``, ``title``, ``sections`` and ``filename``;
    each section requires ``id``, ``content_type``, ``elements`` and ``order``.

    Section elements are matched with ``anyOf`` rather than ``oneOf``. The
    ``paragraph`` and ``heading`` definitions both accept any object carrying
    a string ``text`` (neither forbids additional properties), so a heading
    satisfies two branches at once and would be rejected by ``oneOf``.

    Returns:
        A newly built schema dictionary on every call.
    """

    def _list_item() -> Dict[str, Any]:
        # Built fresh per use so the inline bullet-list item schema and the
        # "list_item" definition never share one dict object.
        return {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "List item text"},
                "subitems": {
                    "type": "array",
                    "items": {"$ref": "#/definitions/list_item"},
                    "description": "Nested sub-items (optional)",
                },
            },
        }

    metadata_schema = {
        "type": "object",
        "required": ["title", "split_strategy"],
        "properties": {
            "title": {"type": "string", "description": "Document title"},
            "split_strategy": {
                "type": "string",
                "enum": ["per_entity", "by_section", "by_criteria", "by_data_type", "custom"],
                "description": "Strategy for splitting content into multiple files",
            },
            "splitCriteria": {
                "type": "object",
                "description": "Custom criteria for splitting (e.g., entity_id, category, etc.)",
            },
            "fileNamingPattern": {
                "type": "string",
                "description": "Pattern for generating filenames (e.g., '{entity_name}_data.docx')",
            },
            "author": {"type": "string", "description": "Document author (optional)"},
            "source_documents": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of source document IDs",
            },
            "extraction_method": {
                "type": "string",
                "default": "ai_extraction",
                "description": "Method used for extraction",
            },
        },
    }

    section_schema = {
        "type": "object",
        "required": ["id", "content_type", "elements", "order"],
        "properties": {
            "id": {"type": "string", "description": "Unique section identifier"},
            "title": {"type": "string", "description": "Section title (optional)"},
            "content_type": {
                "type": "string",
                "enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"],
                "description": "Primary content type of this section",
            },
            "elements": {
                "type": "array",
                "description": "Content elements in this section",
                "items": {
                    # anyOf, not oneOf: a heading also satisfies the paragraph
                    # definition, so "exactly one match" can never hold for it.
                    "anyOf": [
                        {"$ref": "#/definitions/table"},
                        {"$ref": "#/definitions/bullet_list"},
                        {"$ref": "#/definitions/paragraph"},
                        {"$ref": "#/definitions/heading"},
                        {"$ref": "#/definitions/code_block"},
                    ]
                },
            },
            "order": {"type": "integer", "description": "Section order in document"},
            "metadata": {
                "type": "object",
                "description": "Additional section metadata",
            },
        },
    }

    document_schema = {
        "type": "object",
        "required": ["id", "title", "sections", "filename"],
        "properties": {
            "id": {"type": "string", "description": "Unique document identifier"},
            "title": {"type": "string", "description": "Document title"},
            "filename": {"type": "string", "description": "Generated filename"},
            "sections": {
                "type": "array",
                "description": "Document sections containing structured content",
                "items": section_schema,
            },
            "metadata": {
                "type": "object",
                "description": "Document-specific metadata",
            },
        },
    }

    definitions = {
        "table": {
            "type": "object",
            "required": ["headers", "rows"],
            "properties": {
                "headers": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Table column headers",
                },
                "rows": {
                    "type": "array",
                    "items": {
                        "type": "array",
                        "items": {"type": "string"},
                    },
                    "description": "Table data rows",
                },
                "caption": {
                    "type": "string",
                    "description": "Table caption (optional)",
                },
            },
        },
        "bullet_list": {
            "type": "object",
            "required": ["items"],
            "properties": {
                "items": {
                    "type": "array",
                    "items": _list_item(),
                    "description": "List items",
                },
                "list_type": {
                    "type": "string",
                    "enum": ["bullet", "numbered", "checklist"],
                    "default": "bullet",
                    "description": "Type of list",
                },
            },
        },
        "list_item": _list_item(),
        "paragraph": {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "Paragraph text"},
                "formatting": {
                    "type": "object",
                    "description": "Text formatting (bold, italic, etc.)",
                },
            },
        },
        "heading": {
            "type": "object",
            "required": ["text", "level"],
            "properties": {
                "text": {"type": "string", "description": "Heading text"},
                "level": {
                    "type": "integer",
                    "minimum": 1,
                    "maximum": 6,
                    "description": "Heading level (1-6)",
                },
            },
        },
        "code_block": {
            "type": "object",
            "required": ["code"],
            "properties": {
                "code": {"type": "string", "description": "Code content"},
                "language": {"type": "string", "description": "Programming language (optional)"},
            },
        },
    }

    return {
        "type": "object",
        "required": ["metadata", "documents"],
        "properties": {
            "metadata": metadata_schema,
            "documents": {
                "type": "array",
                "description": "Array of individual documents to generate",
                "items": document_schema,
            },
        },
        "definitions": definitions,
    }
def get_document_subJsonSchema() -> Dict[str, Any]:
    """Get the JSON schema for structured document generation (single document).

    The schema describes an object with a required ``metadata`` object
    (``title`` required) and a required ``sections`` array, plus optional
    ``summary`` and ``tags``. Each section requires ``id``, ``content_type``,
    ``elements`` and ``order``.

    Section elements are matched with ``anyOf`` rather than ``oneOf``. The
    ``paragraph`` and ``heading`` definitions both accept any object carrying
    a string ``text`` (neither forbids additional properties), so a heading
    satisfies two branches at once and would be rejected by ``oneOf``.

    Returns:
        A newly built schema dictionary on every call.
    """

    def _list_item() -> Dict[str, Any]:
        # Built fresh per use so the inline bullet-list item schema and the
        # "list_item" definition never share one dict object.
        return {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "List item text"},
                "subitems": {
                    "type": "array",
                    "items": {"$ref": "#/definitions/list_item"},
                    "description": "Nested sub-items (optional)",
                },
            },
        }

    metadata_schema = {
        "type": "object",
        "required": ["title"],
        "properties": {
            "title": {"type": "string", "description": "Document title"},
            "author": {"type": "string", "description": "Document author (optional)"},
            "source_documents": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of source document IDs",
            },
            "extraction_method": {
                "type": "string",
                "default": "ai_extraction",
                "description": "Method used for extraction",
            },
        },
    }

    section_schema = {
        "type": "object",
        "required": ["id", "content_type", "elements", "order"],
        "properties": {
            "id": {"type": "string", "description": "Unique section identifier"},
            "title": {"type": "string", "description": "Section title (optional)"},
            "content_type": {
                "type": "string",
                "enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"],
                "description": "Primary content type of this section",
            },
            "elements": {
                "type": "array",
                "description": "Content elements in this section",
                "items": {
                    # anyOf, not oneOf: a heading also satisfies the paragraph
                    # definition, so "exactly one match" can never hold for it.
                    "anyOf": [
                        {"$ref": "#/definitions/table"},
                        {"$ref": "#/definitions/bullet_list"},
                        {"$ref": "#/definitions/paragraph"},
                        {"$ref": "#/definitions/heading"},
                        {"$ref": "#/definitions/code_block"},
                    ]
                },
            },
            "order": {"type": "integer", "description": "Section order in document"},
            "metadata": {
                "type": "object",
                "description": "Additional section metadata",
            },
        },
    }

    definitions = {
        "table": {
            "type": "object",
            "required": ["headers", "rows"],
            "properties": {
                "headers": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Table column headers",
                },
                "rows": {
                    "type": "array",
                    "items": {
                        "type": "array",
                        "items": {"type": "string"},
                    },
                    "description": "Table data rows",
                },
                "caption": {
                    "type": "string",
                    "description": "Table caption (optional)",
                },
            },
        },
        "bullet_list": {
            "type": "object",
            "required": ["items"],
            "properties": {
                "items": {
                    "type": "array",
                    "items": _list_item(),
                    "description": "List items",
                },
                "list_type": {
                    "type": "string",
                    "enum": ["bullet", "numbered", "checklist"],
                    "default": "bullet",
                    "description": "Type of list",
                },
            },
        },
        "list_item": _list_item(),
        "paragraph": {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "Paragraph text"},
                "formatting": {
                    "type": "object",
                    "description": "Text formatting (bold, italic, etc.)",
                },
            },
        },
        "heading": {
            "type": "object",
            "required": ["text", "level"],
            "properties": {
                "text": {"type": "string", "description": "Heading text"},
                "level": {
                    "type": "integer",
                    "minimum": 1,
                    "maximum": 6,
                    "description": "Heading level (1-6)",
                },
            },
        },
        "code_block": {
            "type": "object",
            "required": ["code"],
            "properties": {
                "code": {"type": "string", "description": "Code content"},
                "language": {"type": "string", "description": "Programming language (optional)"},
            },
        },
    }

    return {
        "type": "object",
        "required": ["metadata", "sections"],
        "properties": {
            "metadata": metadata_schema,
            "sections": {
                "type": "array",
                "description": "Document sections containing structured content",
                "items": section_schema,
            },
            "summary": {
                "type": "string",
                "description": "Document summary (optional)",
            },
            "tags": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Document tags for categorization",
            },
        },
        "definitions": definitions,
    }
def get_extraction_prompt_template() -> str:
    """Return the prompt text asking an AI to extract content as JSON.

    Returns:
        The multi-line template. It starts and ends with a newline and
        contains no placeholders.
    """
    prompt_lines = (
        "You are extracting structured content from documents. Your task is to analyze the provided content and generate a structured JSON document.",
        "IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.",
        "JSON Schema Requirements:",
        "- Extract the actual data from the source documents",
        "- If content is a table, extract it as a table with headers and rows",
        "- If content is a list, extract it as a structured list with items",
        "- If content is text, extract it as paragraphs or headings",
        "- Preserve the original structure and data - do not summarize or interpret",
        "- Use the exact JSON schema provided",
        "Content Types to Extract:",
        "1. Tables: Extract all rows and columns with proper headers",
        "2. Lists: Extract all items with proper nesting",
        "3. Headings: Extract with appropriate levels",
        "4. Paragraphs: Extract as structured text",
        "5. Code: Extract code blocks with language identification",
        "Return only the JSON structure following the schema. Do not include any text before or after the JSON.",
    )
    # The empty first and last entries yield the template's leading and
    # trailing newline once joined.
    return "\n".join(("", *prompt_lines, ""))
def get_generation_prompt_template() -> str:
    """Return the prompt text asking an AI to generate a document from JSON.

    Returns:
        The multi-line template. It starts and ends with a newline and
        contains no placeholders.
    """
    prompt_lines = (
        "You are generating a document from structured JSON data. Your task is to create a well-formatted document based on the provided structured content.",
        "IMPORTANT: You must respond with valid JSON only, following the document schema.",
        "Generation Guidelines:",
        "- Use the provided JSON structure as the foundation",
        "- Enhance the content with proper formatting and organization",
        "- Ensure logical flow and readability",
        "- Maintain the original data integrity",
        "- Add appropriate headings and sections",
        "- Organize content in a logical sequence",
        "Content Enhancement:",
        "- Tables: Ensure proper headers and data alignment",
        "- Lists: Use appropriate list types (bullet, numbered, checklist)",
        "- Headings: Use appropriate heading levels for hierarchy",
        "- Paragraphs: Ensure proper text flow and formatting",
        "- Code: Preserve code blocks with proper language identification",
        "Return only the enhanced JSON structure following the schema. Do not include any text before or after the JSON.",
    )
    # The empty first and last entries yield the template's leading and
    # trailing newline once joined.
    return "\n".join(("", *prompt_lines, ""))
def get_adaptive_json_schema(prompt_analysis: Dict[str, Any] = None) -> Dict[str, Any]:
    """Pick the schema that matches the prompt analysis.

    Args:
        prompt_analysis: Optional analysis mapping. A truthy
            ``is_multi_file`` entry selects the multi-document schema.

    Returns:
        The multi-document schema when ``is_multi_file`` is truthy; the
        single-document schema otherwise, including when no analysis (or an
        empty one) is supplied.
    """
    wants_multi_file = bool(prompt_analysis) and prompt_analysis.get("is_multi_file", False)
    build_schema = get_multi_document_subJsonSchema if wants_multi_file else get_document_subJsonSchema
    return build_schema()
def validate_json_document(json_data: Dict[str, Any]) -> bool:
    """Check a parsed JSON payload against the required document structure.

    Two layouts are recognised:

    * Multi-document: the payload has a ``documents`` key. ``metadata`` must
      be a dict with ``title`` and ``split_strategy``; ``documents`` must be a
      list of dicts that each carry ``id``, ``title``, ``sections`` and
      ``filename``.
    * Single-document: the payload has a ``sections`` key (and no
      ``documents`` key). ``metadata`` must be a dict with ``title``.

    In both layouts ``sections`` must be a list of dicts that each carry
    ``id``, ``content_type``, ``elements`` and ``order``, with a recognised
    ``content_type`` and a list for ``elements``. Only key presence,
    container types and the ``content_type`` value are checked; the contents
    of individual elements are not inspected.

    Args:
        json_data: Parsed JSON payload to check.

    Returns:
        True when every structural check passes; False otherwise, including
        when a check raises.
    """
    allowed_content_types = ("table", "list", "paragraph", "heading", "code", "image", "mixed")
    section_keys = ("id", "content_type", "elements", "order")
    document_keys = ("id", "title", "sections", "filename")

    def has_keys(candidate, keys):
        # A non-dict can never satisfy a required-key check.
        return isinstance(candidate, dict) and all(key in candidate for key in keys)

    def section_ok(section):
        return (
            has_keys(section, section_keys)
            and section["content_type"] in allowed_content_types
            and isinstance(section["elements"], list)
        )

    def sections_ok(sections):
        return isinstance(sections, list) and all(section_ok(entry) for entry in sections)

    def document_ok(document):
        return has_keys(document, document_keys) and sections_ok(document["sections"])

    try:
        if not isinstance(json_data, dict):
            return False

        # "documents" takes precedence when both layout keys are present.
        if "documents" in json_data:
            documents = json_data["documents"]
            return (
                has_keys(json_data.get("metadata"), ("title", "split_strategy"))
                and isinstance(documents, list)
                and all(document_ok(entry) for entry in documents)
            )

        if "sections" in json_data:
            return (
                has_keys(json_data.get("metadata"), ("title",))
                and sections_ok(json_data["sections"])
            )

        # Neither layout key is present.
        return False
    except Exception:
        # Any unexpected failure while probing the payload counts as invalid.
        return False