# gateway/modules/services/serviceGeneration/subPromptBuilder.py

"""
Centralized prompt builder for document generation across formats.

Builds a robust prompt that:
- Accepts any user intent (no fixed structure assumptions)
- Injects format-specific guidelines from the selected renderer
- Adds a common policy section to always use real data from source docs
- Requires the AI to output a filename header that we can parse and use
"""

import json
from typing import Protocol, Dict, Any


class _RendererLike(Protocol):
    """Structural type for renderer objects that expose ``getExtractionPrompt``."""

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only the format-specific guidelines."""


async def buildAdaptiveExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    promptAnalysis: Dict[str, Any],
    aiService=None,
    services=None
) -> str:
    """
    Build the extraction prompt that matches a prior analysis of the user request.

    Args:
        outputFormat: Accepted for signature parity with the other builders; not read here.
        userPrompt: Raw user request; placed verbatim at the top of the prompt.
        title: Accepted for signature parity; not read here.
        promptAnalysis: Analysis dict. Only the "is_multi_file" key is read
            directly; the whole dict is handed to ``get_adaptive_json_schema``
            when the single-file prompt is built.
        aiService: Accepted for signature parity; not read here.
        services: Accepted for signature parity; not read here.

    Returns:
        The stripped prompt text. When ``promptAnalysis["is_multi_file"]`` is
        truthy the prompt demands a root-level "documents" array (one entry per
        output file); otherwise it embeds the adaptive JSON schema.
    """
    # Guidance shared verbatim by the multi-file and the single-file prompt.
    extraction_guidelines = """
Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()

    if promptAnalysis.get("is_multi_file", False):
        # Multi-file prompt: show a concrete example instead of a schema so the
        # expected "documents" root is unambiguous.
        multi_file_example = {
            "metadata": {
                "title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE",
                "splitStrategy": "by_section"
            },
            "documents": [
                {
                    "id": "doc_1",
                    "title": "REPLACE_WITH_ACTUAL_SECTION_TITLE",
                    "filename": "REPLACE_WITH_ACTUAL_FILENAME",
                    "sections": [
                        {
                            "id": "section_1",
                            "content_type": "heading",
                            "elements": [
                                {
                                    "text": "REPLACE_WITH_ACTUAL_HEADING_TEXT",
                                    "level": 1
                                }
                            ],
                            "order": 1
                        },
                        {
                            "id": "section_2",
                            "content_type": "paragraph",
                            "elements": [
                                {
                                    "text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT"
                                }
                            ],
                            "order": 2
                        }
                    ]
                }
            ]
        }

        return f"""
{userPrompt}

You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.

TASK: Extract the actual content from the document and organize it into separate sections, where each section will become a separate file.

REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Identify distinct sections in the document (by headings, topics, or logical breaks)
3. Create one JSON document entry for each section found
4. Extract the real content from each section (headings, paragraphs, lists, etc.)
5. Generate appropriate filenames for each section

CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array.

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(multi_file_example, indent=2)}

IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have:
- "id": unique identifier
- "title": section title from the document
- "filename": appropriate filename for the section
- "sections": array of content sections

DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level.

INSTRUCTIONS:
- Replace "REPLACE_WITH_ACTUAL_*" placeholders with real content from the document
- Use actual section titles, headings, and text from the document
- Create meaningful filenames based on section content
- Ensure each section contains the complete content for that part of the document
- Do not use generic placeholder text like "Section 1", "Section 2"
- Extract real headings, paragraphs, lists, and other content elements
- CRITICAL: Return JSON with "documents" array, not "sections" array

CONTEXT (Document Content):

{extraction_guidelines}
""".strip()

    # Single-file prompt. The adaptive schema is only needed on this path, so
    # it is resolved here rather than before the branch.
    from .subJsonSchema import get_adaptive_json_schema
    json_schema = get_adaptive_json_schema(promptAnalysis)

    return f"""
{userPrompt}

You are extracting structured content from documents and must respond with valid JSON only.

IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.

Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(json_schema, indent=2)}

{extraction_guidelines}
""".strip()

async def buildGenericExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """
    Build an extraction prompt that works for both single and multi-file output.

    When ``aiService`` is given, the user request is first analysed by the AI
    and the resulting JSON is forwarded to ``buildAdaptiveExtractionPrompt``.
    If no AI service is available, the AI returns no content, or anything in
    that step fails, a single-document prompt based on
    ``get_document_subJsonSchema`` is returned instead.

    Args:
        outputFormat: Forwarded to ``buildAdaptiveExtractionPrompt``.
        userPrompt: Raw user request; embedded in the analysis prompt and
            placed at the top of the final prompt.
        title: Forwarded to ``buildAdaptiveExtractionPrompt``.
        aiService: Optional object exposing ``aiObjects.call(request)``.
        services: Optional object exposing ``utils.debugLogToFile``; only used
            to log a failed analysis.

    Returns:
        The extraction prompt text.
    """

    # Use AI to determine the best approach
    if aiService:
        try:
            analysis_prompt = f"""
Analyze this user request and determine the best JSON structure for document extraction.

User request: "{userPrompt}"

Respond with JSON only:
{{
    "requires_multi_file": true/false,
    "recommended_schema": "single_document|multi_document",
    "split_approach": "description of how to organize content",
    "file_naming": "suggested naming pattern"
}}

Consider the user's intent and the most logical way to organize the extracted content.
"""

            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL

            request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
            response = await aiService.aiObjects.call(request)

            if response and response.content:
                import re

                result = response.content.strip()
                # Tolerate prose or code fences around the JSON object.
                json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)

                analysis = json.loads(result)
                if not isinstance(analysis, dict):
                    raise ValueError("analysis response is not a JSON object")

                # The analysis prompt above asks for "requires_multi_file",
                # while buildAdaptiveExtractionPrompt branches on
                # "is_multi_file". Bridge the two so a multi-file verdict is
                # not silently dropped; an explicit "is_multi_file" wins.
                multi_flag = analysis.get("requires_multi_file", False)
                if isinstance(multi_flag, str):
                    multi_flag = multi_flag.strip().lower() == "true"
                analysis.setdefault("is_multi_file", bool(multi_flag))

                # Use analysis to build appropriate prompt
                return await buildAdaptiveExtractionPrompt(
                    outputFormat, userPrompt, title, analysis, aiService, services
                )
        except Exception as e:
            # Analysis is optional: log when a logger is available and fall
            # through to the single-file prompt. ``services`` defaults to None,
            # so it must not be dereferenced unconditionally here.
            if services is not None:
                services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER")

    # Fallback to single-file prompt
    from .subJsonSchema import get_document_subJsonSchema
    json_schema = get_document_subJsonSchema()

    return f"""
{userPrompt}

You are extracting structured content from documents and must respond with valid JSON only.

CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.

Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(json_schema, indent=2)}

Requirements:
- Preserve all original data - do not summarize or interpret
- Use the exact JSON schema provided
- Maintain data integrity and structure

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.

DO NOT return a schema description - return actual extracted content in the JSON format shown above.
"""

async def buildExtractionPrompt(
    outputFormat: str,
    renderer: _RendererLike,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """
    Build the final JSON extraction prompt.

    The prompt consists of:
    - the extraction intent derived from the user prompt by
      ``_parseExtractionIntent`` (AI-assisted when ``aiService`` is given),
    - generic instructions to answer with JSON only, and
    - the document JSON schema from ``get_document_subJsonSchema``.

    Args:
        outputFormat: Target format; forwarded to ``_parseExtractionIntent``.
        renderer: Accepted for interface compatibility; not read here.
        userPrompt: Raw user request.
        title: Accepted for interface compatibility; not read here.
        aiService: Optional AI service forwarded to ``_parseExtractionIntent``.
        services: Optional object exposing ``utils.debugLogToFile`` and
            ``utils.configGet``. When omitted, logging and the debug dump are
            skipped.

    Returns:
        The stripped extraction prompt.

    Side effects:
        If the config value "APP_DEBUG_CHAT_WORKFLOW_ENABLED" is truthy, the
        prompt and the intent are written to a timestamped file under
        ``./test-chat/ai``. Failures while writing are ignored.
    """

    # Parse user prompt to separate extraction intent from generation format using AI
    extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services)

    # Import JSON schema for structured output
    from .subJsonSchema import get_document_subJsonSchema
    jsonSchema = get_document_subJsonSchema()

    # Generic block for JSON extraction - use proper schema instead of hardcoded template
    finalPrompt = f"""
{extractionIntent}

You are extracting structured content from documents and must respond with valid JSON only.

CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.

Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(jsonSchema, indent=2)}

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.

DO NOT return a schema description - return actual extracted content in the JSON format shown above.
""".strip()

    # ``services`` defaults to None; diagnostics must never break prompt building.
    if services is not None:
        services.utils.debugLogToFile("EXTRACTION INTENT: Processed", "PROMPT_BUILDER")

        # Save full extraction prompt to debug file - only if debug enabled
        try:
            debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
            if debug_enabled:
                import os
                # timezone.utc instead of datetime.UTC: the latter only exists
                # on Python 3.11+, and the ImportError would be swallowed below,
                # silently disabling the dump on older interpreters.
                from datetime import datetime, timezone
                ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
                debug_root = "./test-chat/ai"
                os.makedirs(debug_root, exist_ok=True)
                with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f:
                    f.write(f"EXTRACTION PROMPT:\n{finalPrompt}\n\n")
                    f.write(f"EXTRACTION INTENT:\n{extractionIntent}\n")
        except Exception:
            # Deliberately best-effort: a failed debug dump must not fail the request.
            pass

    return finalPrompt


async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """
    Use AI to build the generation prompt based on user intent and format requirements.

    Args:
        outputFormat: Target format name; interpolated into the request and the
            fallback prompts.
        userPrompt: Raw user request.
        title: Document title; interpolated into the request and the fallbacks.
        aiService: Optional object exposing ``aiObjects.call(request)``. Without
            it a static prompt (format and title only) is returned.
        services: Optional object exposing ``utils.debugLogToFile`` and
            ``utils.configGet``. When omitted, logging and the debug dump are
            skipped.

    Returns:
        The AI-written generation prompt with "PLACEHOLDER_FOR_FORMAT_RULES"
        replaced by ``_getFormatRules(outputFormat)``. If the AI returns nothing
        or any step raises, a static fallback prompt that includes the user
        requirements is returned instead.

    Side effects:
        If the config value "APP_DEBUG_CHAT_WORKFLOW_ENABLED" is truthy, the
        request, raw AI response and final prompt are written to a timestamped
        file under ``./test-chat/ai``. Failures while writing are ignored.
    """
    if not aiService:
        # Fallback if no AI service available
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."

    # Shared fallback for "AI returned nothing" and "something raised";
    # keeps the user prompt so language instructions survive.
    fallbackPrompt = f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}"

    try:
        # Escape quotes and flatten newlines so the request cannot break out of
        # the quoted 'User request: "..."' line below.
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')

        # ``services`` defaults to None, so every diagnostic call is guarded.
        if services is not None:
            services.utils.debugLogToFile(f"GENERATION PROMPT REQUEST: buildGenerationPrompt called with outputFormat='{outputFormat}', title='{title}'", "PROMPT_BUILDER")

        # AI call to generate the appropriate generation prompt
        generationPromptRequest = f"""
Based on this user request, create a detailed generation prompt for creating a {outputFormat} document.

User request: "{safeUserPrompt}"
Document title: "{title}"
Output format: {outputFormat}

Create a generation prompt that:
1. Identifies what content is most important for the user
2. Specifies how to structure and organize the content
3. Includes any specific formatting or presentation requirements
4. Preserves any language requirements
5. Ensures the document meets the user's needs

IMPORTANT: Always generate content in STANDARDIZED JSON FORMAT. In your response, include the exact text "PLACEHOLDER_FOR_FORMAT_RULES" where specific format rules will be inserted afterwards automatically.

CRITICAL: You MUST start your response with exactly "Generate a {outputFormat} document that:" - do NOT use "docx" or any other format. Use the exact format specified: {outputFormat}

Return only the generation prompt, starting with "Generate a {outputFormat} document that..."
"""

        # Call AI service to generate the prompt
        if services is not None:
            services.utils.debugLogToFile("GENERATION PROMPT REQUEST: Calling AI for generation prompt...", "PROMPT_BUILDER")

        # Import and set proper options for AI call
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL

        request = AiCallRequest(prompt=generationPromptRequest, context="", options=request_options)
        response = await aiService.aiObjects.call(request)
        result = response.content if response else ""

        # Replace the placeholder that the AI created with actual format rules
        if result:
            formatRules = _getFormatRules(outputFormat)
            result = result.replace("PLACEHOLDER_FOR_FORMAT_RULES", formatRules)

        if services is not None:
            services.utils.debugLogToFile("GENERATION PROMPT: Generated successfully", "PROMPT_BUILDER")

            # Save full generation prompt and AI response to debug file - only if debug enabled
            try:
                debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
                if debug_enabled:
                    import os
                    # timezone.utc instead of datetime.UTC: the latter only
                    # exists on Python 3.11+, and the ImportError would be
                    # swallowed below, silently disabling the dump.
                    from datetime import datetime, timezone
                    ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
                    debug_root = "./test-chat/ai"
                    os.makedirs(debug_root, exist_ok=True)
                    with open(os.path.join(debug_root, f"{ts}_generation_prompt.txt"), "w", encoding="utf-8") as f:
                        f.write(f"GENERATION PROMPT REQUEST:\n{generationPromptRequest}\n\n")
                        f.write(f"GENERATION PROMPT AI RESPONSE:\n{response.content if response else 'No response'}\n\n")
                        f.write(f"GENERATION PROMPT FINAL:\n{result if result else 'None'}\n")
            except Exception:
                # Deliberately best-effort: a failed debug dump must not fail the request.
                pass

        return result if result else fallbackPrompt

    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions.
        # The log call is guarded so the handler itself cannot raise when
        # ``services`` is None.
        if services is not None:
            services.utils.debugLogToFile(f"DEBUG: AI generation prompt failed: {str(e)}", "PROMPT_BUILDER")
        return fallbackPrompt


def _getFormatRules(outputFormat: str) -> str:
    """
    Return the rule list inserted into generation prompts.

    Generation is JSON-based for every target, so ``outputFormat`` does not
    influence the result; the same rules are returned for any value.
    """
    rule_lines = (
        "- Generate content in standardized JSON format following the document schema",
        "- Tables: Use JSON table format with headers and rows arrays",
        "- Lists: Use JSON list format with items array",
        "- Text: Use JSON paragraph format with text field",
        "- Headings: Use JSON heading format with level field",
        "- Structure: Follow the document JSON schema exactly",
    )
    return "\n".join(rule_lines)


async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
    """
    Use AI to extract a rich, structured extraction intent from the user prompt.

    The AI is asked for a JSON object covering language, normalized request,
    structure needs, headers, formats, row strategy, and multi-file guidance.

    Args:
        userPrompt: Raw user request.
        outputFormat: Accepted for interface compatibility; not read here.
        aiService: Optional object exposing ``aiObjects.call(request)``.
        services: Optional object exposing ``utils.debugLogToFile``. When
            omitted, logging is skipped.

    Returns:
        - a generic instruction sentence when ``aiService`` is falsy;
        - the AI's JSON object, re-serialized with ``indent=2`` and
          ``ensure_ascii=False``, when one can be parsed from the response;
        - ``"Extract: <userPrompt>"`` (unmodified user text) when the response
          contains no parseable JSON object or any step raises.
    """
    if not aiService:
        # Fallback if no AI service available
        return "Extract all relevant content from the document according to the user's requirements"

    try:
        # Escape quotes and flatten newlines so the request cannot break out of
        # the quoted 'User request: "..."' line below. Only used inside the
        # analysis prompt, never in a value returned to the caller.
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')

        # Rich analysis to derive a complete extraction intent and structure guidance
        extractionPrompt = f"""
Analyze the user's request and produce a RICH extraction intent. Return ONLY JSON.

Goals:
- Detect language and normalize the request into a full, explicit instruction (no summary; preserve all constraints and details).
- Decide if structured data is required; if so, define the target structure precisely (headers, order, formats, row strategy).
- Identify if multi-file output is appropriate and how to split/files name.

User request: "{safeUserPrompt}"

Return JSON in this exact shape:
{{
  "detectedLanguage": "de|en|fr|it|...",
  "normalizedRequest": "Full explicit instruction in detected language",
  "requiresStructuredData": true|false,
  "targetStructure": "table|list|mixed|unstructured",
  "table": {{
    "headers": ["Header1", "Header2", "..."],
    "headerOrderStrict": true|false,
    "rowStrategy": "one_row_per_document|one_row_per_entity|one_row_per_vat_rate|custom",
    "formats": {{
      "dateFormat": "DD.MM.YYYY|YYYY-MM-DD|...",
      "amountDecimals": 2,
      "currencyFormat": "code|symbol",
      "idMasking": "none|last4|custom"
    }}
  }},
  "multiFile": true|false,
  "fileSplitStrategy": "single|per_entity|by_section|by_criteria|custom",
  "fileNamingPattern": "suggested pattern for filenames",
  "constraints": ["List of critical constraints to enforce"],
  "reasoning": "Brief justification (one sentence)"
}}

Rules:
- Preserve user terminology and language in normalizedRequest.
- If the user listed columns/fields, copy them exactly into table.headers and set headerOrderStrict=true.
- If the user implies separate rows for rates/entities, set an appropriate rowStrategy (e.g., one_row_per_vat_rate).
- If no structure is required, set requiresStructuredData=false and targetStructure="unstructured".
"""

        # ``services`` defaults to None, so every diagnostic call is guarded.
        if services is not None:
            services.utils.debugLogToFile("DEBUG: Calling AI for extraction intent...", "PROMPT_BUILDER")

        # Import and set proper options for AI call
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL

        request = AiCallRequest(prompt=extractionPrompt, context="", options=request_options)
        response = await aiService.aiObjects.call(request)
        result = response.content if response else ""
        if services is not None:
            services.utils.debugLogToFile("DEBUG: Extraction intent processed", "PROMPT_BUILDER")

        # Try to extract and pretty print JSON
        if result:
            import re
            # Greedy match from the first "{" to the last "}" tolerates prose
            # or code fences around the object.
            match = re.search(r'\{[\s\S]*\}', result)
            if match:
                try:
                    obj = json.loads(match.group(0))
                    return json.dumps(obj, ensure_ascii=False, indent=2)
                except Exception:
                    # Not valid JSON after all - use the plain fallback below.
                    pass

        # Fallback to the simple format. Uses the raw prompt (as the error path
        # does): the escaped variant would leak backslashes and flattened
        # newlines into the final extraction prompt.
        return f"Extract: {userPrompt}"

    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions.
        # The log call is guarded so the handler itself cannot raise when
        # ``services`` is None.
        if services is not None:
            services.utils.debugLogToFile(f"DEBUG: AI extraction intent failed: {str(e)}", "PROMPT_BUILDER")
        return f"Extract: {userPrompt}"