# gateway/modules/services/serviceGeneration/subJsonSchema.py

"""
JSON Schema definitions for AI-generated document structures.
This module provides schemas that guide AI to generate structured JSON output.
"""

from typing import Dict, Any


def _list_item_schema() -> Dict[str, Any]:
    """Build the schema of a single list item.

    Sub-items point back at ``#/definitions/list_item`` so lists can nest to
    any depth. A fresh dict is returned on every call so callers may mutate
    the result without affecting other schemas.
    """
    return {
        "type": "object",
        "required": ["text"],
        "properties": {
            "text": {"type": "string", "description": "List item text"},
            "subitems": {
                "type": "array",
                "items": {"$ref": "#/definitions/list_item"},
                "description": "Nested sub-items (optional)"
            }
        }
    }


def _element_definitions() -> Dict[str, Any]:
    """Build the ``definitions`` table shared by both document schemas.

    The table holds the reusable content elements that the section schema
    refers to via ``$ref``: table, bullet_list, list_item, paragraph, heading
    and code_block.
    """
    return {
        "table": {
            "type": "object",
            "required": ["headers", "rows"],
            "properties": {
                "headers": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Table column headers"
                },
                "rows": {
                    "type": "array",
                    "items": {
                        "type": "array",
                        "items": {"type": "string"}
                    },
                    "description": "Table data rows"
                },
                "caption": {
                    "type": "string",
                    "description": "Table caption (optional)"
                }
            }
        },
        "bullet_list": {
            "type": "object",
            "required": ["items"],
            "properties": {
                "items": {
                    "type": "array",
                    # Top-level items are spelled out inline (not via $ref) but
                    # have exactly the same shape as the list_item definition.
                    "items": _list_item_schema(),
                    "description": "List items"
                },
                "list_type": {
                    "type": "string",
                    "enum": ["bullet", "numbered", "checklist"],
                    "default": "bullet",
                    "description": "Type of list"
                }
            }
        },
        "list_item": _list_item_schema(),
        "paragraph": {
            "type": "object",
            "required": ["text"],
            "properties": {
                "text": {"type": "string", "description": "Paragraph text"},
                "formatting": {
                    "type": "object",
                    "description": "Text formatting (bold, italic, etc.)"
                }
            }
        },
        "heading": {
            "type": "object",
            "required": ["text", "level"],
            "properties": {
                "text": {"type": "string", "description": "Heading text"},
                "level": {
                    "type": "integer",
                    "minimum": 1,
                    "maximum": 6,
                    "description": "Heading level (1-6)"
                }
            }
        },
        "code_block": {
            "type": "object",
            "required": ["code"],
            "properties": {
                "code": {"type": "string", "description": "Code content"},
                "language": {"type": "string", "description": "Programming language (optional)"}
            }
        }
    }


def _sections_schema() -> Dict[str, Any]:
    """Build the schema of a ``sections`` array (identical in both document schemas)."""
    return {
        "type": "array",
        "description": "Document sections containing structured content",
        "items": {
            "type": "object",
            "required": ["id", "content_type", "elements", "order"],
            "properties": {
                "id": {"type": "string", "description": "Unique section identifier"},
                "title": {"type": "string", "description": "Section title (optional)"},
                "content_type": {
                    "type": "string",
                    "enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"],
                    "description": "Primary content type of this section"
                },
                "elements": {
                    "type": "array",
                    "description": "Content elements in this section",
                    "items": {
                        # NOTE(review): a heading object ({"text", "level"}) also
                        # satisfies the paragraph definition, which only requires
                        # "text"; a strict validator would therefore see two
                        # oneOf matches. Confirm whether anyOf was intended.
                        "oneOf": [
                            {"$ref": "#/definitions/table"},
                            {"$ref": "#/definitions/bullet_list"},
                            {"$ref": "#/definitions/paragraph"},
                            {"$ref": "#/definitions/heading"},
                            {"$ref": "#/definitions/code_block"}
                        ]
                    }
                },
                "order": {"type": "integer", "description": "Section order in document"},
                "metadata": {
                    "type": "object",
                    "description": "Additional section metadata"
                }
            }
        }
    }


def _shared_metadata_properties() -> Dict[str, Any]:
    """Build the metadata properties that both document schemas end with."""
    return {
        "author": {"type": "string", "description": "Document author (optional)"},
        "source_documents": {
            "type": "array",
            "items": {"type": "string"},
            "description": "List of source document IDs"
        },
        "extraction_method": {
            "type": "string",
            "default": "ai_extraction",
            "description": "Method used for extraction"
        }
    }


def get_multi_document_subJsonSchema() -> Dict[str, Any]:
    """Get the JSON schema for multi-document generation.

    Returns:
        A newly built schema dict. The top level requires ``metadata``
        (with ``title`` and ``split_strategy``) and ``documents``, an array of
        documents that each require ``id``, ``title``, ``sections`` and
        ``filename``. Content elements are resolved through ``definitions``.
    """
    metadata_properties = {
        "title": {"type": "string", "description": "Document title"},
        "split_strategy": {
            "type": "string",
            "enum": ["per_entity", "by_section", "by_criteria", "by_data_type", "custom"],
            "description": "Strategy for splitting content into multiple files"
        },
        "splitCriteria": {
            "type": "object",
            "description": "Custom criteria for splitting (e.g., entity_id, category, etc.)"
        },
        "fileNamingPattern": {
            "type": "string",
            "description": "Pattern for generating filenames (e.g., '{entity_name}_data.docx')"
        },
        # Appended last so the key order of the emitted schema stays stable.
        **_shared_metadata_properties(),
    }

    document_schema = {
        "type": "object",
        "required": ["id", "title", "sections", "filename"],
        "properties": {
            "id": {"type": "string", "description": "Unique document identifier"},
            "title": {"type": "string", "description": "Document title"},
            "filename": {"type": "string", "description": "Generated filename"},
            "sections": _sections_schema(),
            "metadata": {
                "type": "object",
                "description": "Document-specific metadata"
            }
        }
    }

    return {
        "type": "object",
        "required": ["metadata", "documents"],
        "properties": {
            "metadata": {
                "type": "object",
                "required": ["title", "split_strategy"],
                "properties": metadata_properties
            },
            "documents": {
                "type": "array",
                "description": "Array of individual documents to generate",
                "items": document_schema
            }
        },
        "definitions": _element_definitions()
    }


def get_document_subJsonSchema() -> Dict[str, Any]:
    """Get the JSON schema for structured document generation (single document).

    Returns:
        A newly built schema dict. The top level requires ``metadata``
        (with ``title``) and ``sections``; ``summary`` and ``tags`` are
        optional. Content elements are resolved through ``definitions``.
    """
    metadata_properties = {
        "title": {"type": "string", "description": "Document title"},
        **_shared_metadata_properties(),
    }

    return {
        "type": "object",
        "required": ["metadata", "sections"],
        "properties": {
            "metadata": {
                "type": "object",
                "required": ["title"],
                "properties": metadata_properties
            },
            "sections": _sections_schema(),
            "summary": {
                "type": "string",
                "description": "Document summary (optional)"
            },
            "tags": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Document tags for categorization"
            }
        },
        "definitions": _element_definitions()
    }


def get_extraction_prompt_template() -> str:
    """Return the instruction text for prompts that ask the AI to extract content as JSON.

    The text begins and ends with a newline; empty entries below are the
    blank lines separating the paragraphs of the prompt.
    """
    prompt_lines = (
        "",
        "You are extracting structured content from documents. Your task is to analyze the provided content and generate a structured JSON document.",
        "",
        "IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.",
        "",
        "JSON Schema Requirements:",
        "- Extract the actual data from the source documents",
        "- If content is a table, extract it as a table with headers and rows",
        "- If content is a list, extract it as a structured list with items",
        "- If content is text, extract it as paragraphs or headings",
        "- Preserve the original structure and data - do not summarize or interpret",
        "- Use the exact JSON schema provided",
        "",
        "Content Types to Extract:",
        "1. Tables: Extract all rows and columns with proper headers",
        "2. Lists: Extract all items with proper nesting",
        "3. Headings: Extract with appropriate levels",
        "4. Paragraphs: Extract as structured text",
        "5. Code: Extract code blocks with language identification",
        "",
        "Return only the JSON structure following the schema. Do not include any text before or after the JSON.",
        "",
    )
    return "\n".join(prompt_lines)


def get_generation_prompt_template() -> str:
    """Return the instruction text for prompts that ask the AI to enhance JSON input.

    The text begins and ends with a newline; empty entries below are the
    blank lines separating the paragraphs of the prompt.
    """
    prompt_lines = (
        "",
        "You are generating a document from structured JSON data. Your task is to create a well-formatted document based on the provided structured content.",
        "",
        "IMPORTANT: You must respond with valid JSON only, following the document schema.",
        "",
        "Generation Guidelines:",
        "- Use the provided JSON structure as the foundation",
        "- Enhance the content with proper formatting and organization",
        "- Ensure logical flow and readability",
        "- Maintain the original data integrity",
        "- Add appropriate headings and sections",
        "- Organize content in a logical sequence",
        "",
        "Content Enhancement:",
        "- Tables: Ensure proper headers and data alignment",
        "- Lists: Use appropriate list types (bullet, numbered, checklist)",
        "- Headings: Use appropriate heading levels for hierarchy",
        "- Paragraphs: Ensure proper text flow and formatting",
        "- Code: Preserve code blocks with proper language identification",
        "",
        "Return only the enhanced JSON structure following the schema. Do not include any text before or after the JSON.",
        "",
    )
    return "\n".join(prompt_lines)


def get_adaptive_json_schema(prompt_analysis: Dict[str, Any] = None) -> Dict[str, Any]:
    """Pick the document schema that matches the analysed prompt.

    Args:
        prompt_analysis: Optional mapping describing the prompt. Only its
            "is_multi_file" entry is read.

    Returns:
        The multi-document schema when ``prompt_analysis`` is non-empty and its
        "is_multi_file" entry is truthy; otherwise (including ``None`` or an
        empty mapping) the single-document schema.
    """
    # Short-circuit first so .get() is never called on None / an empty mapping.
    wants_multiple_files = bool(prompt_analysis) and bool(prompt_analysis.get("is_multi_file", False))
    build_schema = get_multi_document_subJsonSchema if wants_multiple_files else get_document_subJsonSchema
    return build_schema()

# Keys every section / document must carry, and the accepted section content
# types. Kept as tuples so membership tests use plain equality and therefore
# never fail on unhashable values coming from untrusted JSON.
_SECTION_REQUIRED_FIELDS = ("id", "content_type", "elements", "order")
_DOCUMENT_REQUIRED_FIELDS = ("id", "title", "sections", "filename")
_SECTION_CONTENT_TYPES = ("table", "list", "paragraph", "heading", "code", "image", "mixed")


def _is_dict_with_fields(candidate: Any, fields) -> bool:
    """Return True when *candidate* is a dict containing every key in *fields*."""
    return isinstance(candidate, dict) and all(name in candidate for name in fields)


def _sections_are_valid(sections: Any) -> bool:
    """Return True when *sections* is a list of structurally valid section dicts."""
    if not isinstance(sections, list):
        return False
    for section in sections:
        if not _is_dict_with_fields(section, _SECTION_REQUIRED_FIELDS):
            return False
        if section["content_type"] not in _SECTION_CONTENT_TYPES:
            return False
        if not isinstance(section["elements"], list):
            return False
    return True


def _documents_are_valid(documents: Any) -> bool:
    """Return True when *documents* is a list of structurally valid document dicts."""
    if not isinstance(documents, list):
        return False
    return all(
        _is_dict_with_fields(doc, _DOCUMENT_REQUIRED_FIELDS) and _sections_are_valid(doc["sections"])
        for doc in documents
    )


def validate_json_document(json_data: Dict[str, Any]) -> bool:
    """Validate that the JSON data follows the document schema.

    This is a structural check only: it verifies required keys, container
    types and the section ``content_type`` values, not the individual content
    elements.

    Args:
        json_data: Parsed JSON. A top-level "documents" key selects the
            multi-document rules (metadata needs "title" and
            "split_strategy"); otherwise a top-level "sections" key selects
            the single-document rules (metadata needs "title").

    Returns:
        True when the structure is valid, False otherwise. The function never
        raises; any unexpected error is reported as invalid.
    """
    try:
        if not isinstance(json_data, dict):
            return False

        # "documents" takes precedence when both layouts' keys are present.
        if "documents" in json_data:
            return (
                _is_dict_with_fields(json_data.get("metadata"), ("title", "split_strategy"))
                and _documents_are_valid(json_data["documents"])
            )

        if "sections" in json_data:
            return (
                _is_dict_with_fields(json_data.get("metadata"), ("title",))
                and _sections_are_valid(json_data["sections"])
            )

        return False
    except Exception:
        # Deliberate: callers rely on a boolean answer even for exotic input
        # whose comparisons or lookups raise.
        return False