# gateway/modules/services/serviceGeneration/subJsonSchema.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
JSON Schema definitions for AI-generated document structures (unified).
This module provides schemas that guide AI to generate structured JSON output
that matches the master template in modules.datamodels.datamodelJson.
"""

from typing import Any, Dict, Optional


# Element kinds that have an entry under "definitions" and may appear inside a
# section's "elements" array. Order matters: it drives both the content_type
# enum and the list of "$ref" alternatives.
_ELEMENT_KINDS = ("table", "bullet_list", "paragraph", "heading", "code_block", "image")


def _buildListItemSchema() -> Dict[str, Any]:
    """Build the schema of one list item; sub-items recurse through a `$ref`."""
    return {
        "type": "object",
        "required": ["text"],
        "properties": {
            "text": {"type": "string", "description": "List item text"},
            "subitems": {
                "type": "array",
                "items": {"$ref": "#/definitions/list_item"},
                "description": "Nested sub-items (optional)"
            }
        }
    }


def _buildElementDefinitions() -> Dict[str, Any]:
    """Build the `definitions` map shared by the single- and multi-document schemas.

    A new dict is created on every call so callers may mutate the result
    without affecting later calls.
    """
    return {
        "table": {
            "type": "object",
            "required": ["headers", "rows"],
            "properties": {
                "headers": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Table column headers"
                },
                "rows": {
                    "type": "array",
                    "items": {
                        "type": "array",
                        "items": {"type": "string"}
                    },
                    "description": "Table data rows"
                },
                "caption": {
                    "type": "string",
                    "description": "Table caption (optional)"
                }
            }
        },
        "bullet_list": {
            "type": "object",
            "required": ["items"],
            "properties": {
                "items": {
                    "type": "array",
                    # Top-level items are spelled out inline (same shape as
                    # the "list_item" definition) rather than referenced.
                    "items": _buildListItemSchema(),
                    "description": "List items"
                },
                "list_type": {
                    "type": "string",
                    "enum": ["bullet", "numbered", "checklist"],
                    "default": "bullet",
                    "description": "Type of list"
                }
            }
        },
        "list_item": _buildListItemSchema(),
        "paragraph": {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "Paragraph text"},
                "formatting": {
                    "type": "object",
                    "description": "Text formatting (bold, italic, etc.)"
                }
            }
        },
        "heading": {
            "type": "object",
            "required": ["text", "level"],
            "properties": {
                "text": {"type": "string", "description": "Heading text"},
                "level": {
                    "type": "integer",
                    "minimum": 1,
                    "maximum": 6,
                    "description": "Heading level (1-6)"
                }
            }
        },
        "code_block": {
            "type": "object",
            "required": ["code"],
            "properties": {
                "code": {"type": "string", "description": "Code content"},
                "language": {"type": "string", "description": "Programming language (optional)"}
            }
        },
        "image": {
            "type": "object",
            "required": ["url"],
            "properties": {
                "url": {"type": "string", "description": "Image URL or data URI"},
                "caption": {"type": "string", "description": "Image caption (optional)"},
                "alt": {"type": "string", "description": "Alt text (optional)"}
            }
        }
    }


def _buildSectionsSchema() -> Dict[str, Any]:
    """Build the schema of a `sections` array (identical in both document schemas)."""
    return {
        "type": "array",
        "description": "Document sections containing structured content",
        "items": {
            "type": "object",
            "required": ["id", "content_type", "elements", "order"],
            "properties": {
                "id": {"type": "string", "description": "Unique section identifier"},
                "title": {"type": "string", "description": "Section title (optional)"},
                "content_type": {
                    "type": "string",
                    "enum": list(_ELEMENT_KINDS) + ["mixed"],
                    "description": "Primary content type of this section"
                },
                "elements": {
                    "type": "array",
                    "description": "Content elements in this section",
                    "items": {
                        # anyOf, not oneOf: a heading (text + level) also
                        # satisfies the paragraph schema (text only, extra
                        # properties allowed), so oneOf's "exactly one match"
                        # rule would reject every heading element.
                        "anyOf": [
                            {"$ref": "#/definitions/" + kind} for kind in _ELEMENT_KINDS
                        ]
                    }
                },
                "order": {"type": "integer", "description": "Section order in document"},
                "metadata": {
                    "type": "object",
                    "description": "Additional section metadata"
                }
            }
        }
    }


def _buildProvenanceProperties() -> Dict[str, Any]:
    """Build the metadata properties common to both top-level `metadata` objects."""
    return {
        "source_documents": {
            "type": "array",
            "items": {"type": "string"},
            "description": "List of source document IDs"
        },
        "extraction_method": {
            "type": "string",
            "default": "ai_generation",
            "description": "Method used for extraction"
        }
    }


def getMultiDocumentSchema() -> Dict[str, Any]:
    """Get the JSON schema for multi-document generation (unified).

    The schema requires a top-level ``metadata`` object (with a mandatory
    ``split_strategy``) and a ``documents`` array. Each document requires
    ``id``, ``title``, ``sections`` and ``filename``; sections and element
    definitions are the same ones used by :func:`getDocumentSchema`.

    Returns:
        A freshly built schema dict; mutating it does not affect later calls.
    """
    return {
        "type": "object",
        "required": ["metadata", "documents"],
        "properties": {
            "metadata": {
                "type": "object",
                "required": ["split_strategy"],
                "properties": {
                    "split_strategy": {
                        "type": "string",
                        "enum": [
                            "single_document",
                            "per_entity",
                            "by_section",
                            "by_criteria",
                            "by_data_type",
                            "custom"
                        ],
                        "description": "Strategy for splitting content into multiple files"
                    },
                    "splitCriteria": {
                        "type": "object",
                        "description": "Custom criteria for splitting (e.g., entity_id, category, etc.)"
                    },
                    "fileNamingPattern": {
                        "type": "string",
                        "description": "Pattern for generating filenames (e.g., '{entity_name}_data.docx')"
                    },
                    **_buildProvenanceProperties()
                }
            },
            "documents": {
                "type": "array",
                "description": "Array of individual documents to generate",
                "items": {
                    "type": "object",
                    "required": ["id", "title", "sections", "filename"],
                    "properties": {
                        "id": {"type": "string", "description": "Unique document identifier"},
                        "title": {"type": "string", "description": "Document title"},
                        "filename": {"type": "string", "description": "Generated filename"},
                        "sections": _buildSectionsSchema(),
                        "metadata": {
                            "type": "object",
                            "description": "Document-specific metadata"
                        }
                    }
                }
            }
        },
        "definitions": _buildElementDefinitions()
    }


def getDocumentSchema() -> Dict[str, Any]:
    """Get the JSON schema for structured document generation (single document).

    The schema requires a top-level ``metadata`` object (with a mandatory
    ``title``) and a ``sections`` array; ``summary`` and ``tags`` are
    optional. Sections and element definitions are the same ones used by
    :func:`getMultiDocumentSchema`.

    Returns:
        A freshly built schema dict; mutating it does not affect later calls.
    """
    return {
        "type": "object",
        "required": ["metadata", "sections"],
        "properties": {
            "metadata": {
                "type": "object",
                "required": ["title"],
                "properties": {
                    "title": {"type": "string", "description": "Document title"},
                    **_buildProvenanceProperties()
                }
            },
            "sections": _buildSectionsSchema(),
            "summary": {
                "type": "string",
                "description": "Document summary (optional)"
            },
            "tags": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Document tags for categorization"
            }
        },
        "definitions": _buildElementDefinitions()
    }


def getExtractionPromptTemplate() -> str:
    """Return the instruction text used to ask the AI for a JSON extraction.

    The text starts and ends with a newline and tells the model to answer
    with JSON only, following the supplied schema.
    """
    promptLines = (
        "You are extracting structured content from documents. Your task is to analyze the provided content and generate a structured JSON document.",
        "",
        "IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.",
        "",
        "JSON Schema Requirements:",
        "- Extract the actual data from the source documents",
        "- If content is a table, extract it as a table with headers and rows",
        "- If content is a list, extract it as a structured list with items",
        "- If content is text, extract it as paragraphs or headings",
        "- Preserve the original structure and data - do not summarize or interpret",
        "- Use the exact JSON schema provided",
        "",
        "Content Types to Extract:",
        "1. Tables: Extract all rows and columns with proper headers",
        "2. Lists: Extract all items with proper nesting",
        "3. Headings: Extract with appropriate levels",
        "4. Paragraphs: Extract as structured text",
        "5. Code: Extract code blocks with language identification",
        "",
        "Return only the JSON structure following the schema. Do not include any text before or after the JSON.",
    )
    # Leading and trailing newline mirror a triple-quoted block literal.
    return "\n" + "\n".join(promptLines) + "\n"


def getGenerationPromptTemplate() -> str:
    """Return the instruction text used to ask the AI to enhance JSON input.

    The text starts and ends with a newline and tells the model to answer
    with JSON only, following the document schema.
    """
    promptLines = (
        "You are generating a document from structured JSON data. Your task is to create a well-formatted document based on the provided structured content.",
        "",
        "IMPORTANT: You must respond with valid JSON only, following the document schema.",
        "",
        "Generation Guidelines:",
        "- Use the provided JSON structure as the foundation",
        "- Enhance the content with proper formatting and organization",
        "- Ensure logical flow and readability",
        "- Maintain the original data integrity",
        "- Add appropriate headings and sections",
        "- Organize content in a logical sequence",
        "",
        "Content Enhancement:",
        "- Tables: Ensure proper headers and data alignment",
        "- Lists: Use appropriate list types (bullet, numbered, checklist)",
        "- Headings: Use appropriate heading levels for hierarchy",
        "- Paragraphs: Ensure proper text flow and formatting",
        "- Code: Preserve code blocks with proper language identification",
        "",
        "Return only the enhanced JSON structure following the schema. Do not include any text before or after the JSON.",
    )
    # Leading and trailing newline mirror a triple-quoted block literal.
    return "\n" + "\n".join(promptLines) + "\n"


def getAdaptiveJsonSchema(promptAnalysis: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Select the schema that fits the analysed prompt.

    Args:
        promptAnalysis: Optional analysis result. Only its ``is_multi_file``
            entry is read; a missing or empty analysis counts as single-file.

    Returns:
        The multi-document schema when ``is_multi_file`` is truthy,
        otherwise the single-document schema.
    """
    if promptAnalysis and promptAnalysis.get("is_multi_file", False):
        return getMultiDocumentSchema()
    return getDocumentSchema()

# Keys every section / document must carry, and the accepted content types.
# Tuples keep the same `in` semantics (equality comparison) as the list
# literals they replace, so unhashable values are still simply "not a member".
_SECTION_REQUIRED_FIELDS = ("id", "content_type", "elements", "order")
_DOCUMENT_REQUIRED_FIELDS = ("id", "title", "sections", "filename")
_VALID_CONTENT_TYPES = ("table", "bullet_list", "paragraph", "heading", "code_block", "image", "mixed")


def _metadataHasKey(jsonData: Dict[str, Any], key: str) -> bool:
    """Return True when ``jsonData["metadata"]`` is a dict that contains ``key``."""
    metadata = jsonData.get("metadata")
    return isinstance(metadata, dict) and key in metadata


def _isValidSection(section: Any) -> bool:
    """Return True when ``section`` has all required keys, a known type and a list of elements."""
    if not isinstance(section, dict):
        return False
    if any(field not in section for field in _SECTION_REQUIRED_FIELDS):
        return False
    if section["content_type"] not in _VALID_CONTENT_TYPES:
        return False
    return isinstance(section["elements"], list)


def _areValidSections(sections: Any) -> bool:
    """Return True when ``sections`` is a list whose entries are all valid sections."""
    return isinstance(sections, list) and all(_isValidSection(section) for section in sections)


def _isValidDocument(doc: Any) -> bool:
    """Return True when ``doc`` has all required document keys and valid sections."""
    if not isinstance(doc, dict):
        return False
    if any(field not in doc for field in _DOCUMENT_REQUIRED_FIELDS):
        return False
    return _areValidSections(doc["sections"])


def validateJsonDocument(jsonData: Dict[str, Any]) -> bool:
    """Validate that the JSON data follows the unified document schema.

    This is a structural check only: it verifies container types and the
    presence of required keys, not the contents of individual elements.

    A payload with a ``documents`` key is treated as multi-document
    (``metadata.split_strategy`` required, every document needs ``id``,
    ``title``, ``sections``, ``filename``). Otherwise a payload with a
    ``sections`` key is treated as single-document (``metadata.title``
    required). Every section needs ``id``, ``content_type``, ``elements``
    and ``order``, a recognised ``content_type`` and a list of ``elements``.

    Args:
        jsonData: Parsed JSON payload to check.

    Returns:
        True when the payload passes all checks, False otherwise. The
        function never raises; unexpected errors are reported as False.
    """
    try:
        if not isinstance(jsonData, dict):
            return False

        # "documents" takes precedence when both layouts' keys are present.
        if "documents" in jsonData:
            if not _metadataHasKey(jsonData, "split_strategy"):
                return False
            documents = jsonData["documents"]
            return isinstance(documents, list) and all(_isValidDocument(doc) for doc in documents)

        if "sections" in jsonData:
            return _metadataHasKey(jsonData, "title") and _areValidSections(jsonData["sections"])

        return False

    except Exception:
        # Deliberate boundary: callers rely on a plain bool, so any surprise
        # from exotic mapping/sequence subclasses counts as "invalid".
        return False