# gateway/modules/services/serviceGeneration/subPromptBuilder.py

"""
Centralized prompt builder for document generation across formats.

Builds a robust prompt that:
- Accepts any user intent (no fixed structure assumptions)
- Injects format-specific guidelines from the selected renderer
- Adds a common policy section to always use real data from source docs
- Requires the AI to output a filename header that we can parse and use
"""

from typing import Protocol


class _RendererLike(Protocol):
    """Structural type for renderer objects handed to the prompt builder.

    Any object that exposes a compatible ``getExtractionPrompt`` method
    satisfies this protocol; no inheritance is required.
    """

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only the format-specific guidelines."""
        ...


async def buildExtractionPrompt(
    outputFormat: str,
    renderer: _RendererLike,
    userPrompt: str,
    title: str,
    aiService=None
) -> str:
    """
    Build the extraction prompt sent to the AI for pulling content out of source documents.

    The prompt consists of:
    - The extraction intent derived from the user prompt by ``_parseExtractionIntent``
    - A generic instruction block that demands a JSON-only answer and shows the
      expected document structure (metadata + typed sections)

    Args:
        outputFormat: Target format; only forwarded to ``_parseExtractionIntent``.
        renderer: Accepted for interface compatibility; not consulted here.
        userPrompt: Raw user request the extraction intent is derived from.
        title: Accepted for interface compatibility; not used here.
        aiService: Optional AI service forwarded to ``_parseExtractionIntent``.

    Returns:
        The assembled prompt text with surrounding whitespace stripped.

    Side effects:
        Best-effort write of the prompt to a timestamped file under
        ``./test-chat/ai``; a failure there is logged and never raised.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Parse user prompt to separate extraction intent from generation format using AI
    extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService)

    # Generic block for JSON extraction (doubled braces are literal braces in the f-string)
    finalPrompt = f"""
{extractionIntent}

You are extracting structured content from documents and must respond with valid JSON only.

IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.

Extract the actual data from the source documents and structure it as JSON with this format:
{{
    "metadata": {{
        "title": "Document Title",
        "version": "1.0"
    }},
    "sections": [
        {{
            "id": "section_1",
            "type": "heading",
            "data": {{
                "level": 1,
                "text": "Heading Text"
            }}
        }},
        {{
            "id": "section_2",
            "type": "table",
            "data": {{
                "headers": ["Column1", "Column2"],
                "rows": [["Data1", "Data2"], ["Data3", "Data4"]]
            }}
        }},
        {{
            "id": "section_3",
            "type": "bullet_list",
            "data": {{
                "items": ["Item 1", "Item 2", "Item 3"]
            }}
        }},
        {{
            "id": "section_4",
            "type": "paragraph",
            "data": {{
                "text": "Paragraph content here"
            }}
        }}
    ]
}}

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
""".strip()

    # Logging instead of print: a stream that cannot encode the text must not
    # raise out of this function.
    logger.debug("EXTRACTION INTENT: %s", extractionIntent)

    # Save full extraction prompt to debug file (best effort only)
    try:
        import os
        from datetime import datetime, timezone

        # timezone.utc yields the same timestamp as datetime.UTC but also exists before Python 3.11
        ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
        debug_root = "./test-chat/ai"
        os.makedirs(debug_root, exist_ok=True)
        with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f:
            f.write(f"EXTRACTION PROMPT:\n{finalPrompt}\n\n")
            f.write(f"EXTRACTION INTENT:\n{extractionIntent}\n")
    except Exception:
        # The dump is a debugging aid; record the failure but never let it break prompt building
        logger.debug("Could not write extraction prompt debug file", exc_info=True)

    return finalPrompt


async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None
) -> str:
    """
    Use AI to build the generation prompt based on user intent and format requirements.

    The AI is asked to write a prompt that starts with
    "Generate a <outputFormat> document that:" and contains a placeholder, which is
    then replaced by the rules from ``_getFormatRules``. If the AI answer lacks the
    placeholder, the rules are appended so they are never lost.

    Args:
        outputFormat: Target document format named in the prompt.
        userPrompt: Raw user request; quotes are escaped and line breaks flattened
            before it is embedded in the AI request.
        title: Document title embedded in the prompt.
        aiService: Object exposing ``aiObjects.call(request)``; when falsy, a static
            prompt is returned without any AI call.

    Returns:
        The AI-written generation prompt with format rules inserted, or a static
        fallback prompt when no service is given, the call fails, or the answer
        is empty.

    Side effects:
        Best-effort write of request, raw response and final prompt to a
        timestamped file under ``./test-chat/ai``.
    """
    import logging

    logger = logging.getLogger(__name__)

    if not aiService:
        # Fallback if no AI service available
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."

    # Used on any error or empty answer - keeps the raw user prompt so language instructions survive
    fallbackPrompt = f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}"
    formatRulesPlaceholder = "PLACEHOLDER_FOR_FORMAT_RULES"

    try:
        # Protect userPrompt from injection
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')

        # Logging instead of print: an stdout encoding error raised here would be
        # swallowed by the except below and silently skip the AI call.
        logger.debug("GENERATION PROMPT REQUEST: outputFormat='%s', title='%s'", outputFormat, title)

        # AI call to generate the appropriate generation prompt
        generationPromptRequest = f"""
Based on this user request, create a detailed generation prompt for creating a {outputFormat} document.

User request: "{safeUserPrompt}"
Document title: "{title}"
Output format: {outputFormat}

Create a generation prompt that:
1. Identifies what content is most important for the user
2. Specifies how to structure and organize the content
3. Includes any specific formatting or presentation requirements
4. Preserves any language requirements
5. Ensures the document meets the user's needs

IMPORTANT: Always generate content in STANDARDIZED JSON FORMAT. In your response, include the exact text "{formatRulesPlaceholder}" where specific format rules will be inserted afterwards automatically.

CRITICAL: You MUST start your response with exactly "Generate a {outputFormat} document that:" - do NOT use "docx" or any other format. Use the exact format specified: {outputFormat}

Return only the generation prompt, starting with "Generate a {outputFormat} document that..."
"""

        # Import and set proper options for AI call
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL

        request = AiCallRequest(prompt=generationPromptRequest, context="", options=request_options)
        response = await aiService.aiObjects.call(request)
        rawContent = response.content if response else ""
        logger.debug("GENERATION PROMPT AI RESPONSE: '%s'", rawContent)

        result = rawContent
        if result:
            formatRules = _getFormatRules(outputFormat)
            if formatRulesPlaceholder in result:
                result = result.replace(formatRulesPlaceholder, formatRules)
            else:
                # The model ignored the placeholder instruction; append the rules instead of dropping them
                result = f"{result.rstrip()}\n\n{formatRules}"

        logger.debug("GENERATION PROMPT FINAL: %s", result if result else "None")

        # Save full generation prompt and AI response to debug file (best effort only)
        try:
            import os
            from datetime import datetime, timezone

            # timezone.utc yields the same timestamp as datetime.UTC but also exists before Python 3.11
            ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
            debug_root = "./test-chat/ai"
            os.makedirs(debug_root, exist_ok=True)
            with open(os.path.join(debug_root, f"{ts}_generation_prompt.txt"), "w", encoding="utf-8") as f:
                f.write(f"GENERATION PROMPT REQUEST:\n{generationPromptRequest}\n\n")
                f.write(f"GENERATION PROMPT AI RESPONSE:\n{response.content if response else 'No response'}\n\n")
                f.write(f"GENERATION PROMPT FINAL:\n{result if result else 'None'}\n")
        except Exception:
            # The dump is a debugging aid; a failure must not discard a valid AI result
            logger.debug("Could not write generation prompt debug file", exc_info=True)

        return result if result else fallbackPrompt

    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions
        logger.warning("AI generation prompt failed: %s", e)
        return fallbackPrompt


def _getFormatRules(outputFormat: str) -> str:
    """
    Return the bullet list of rules for JSON-based generation.

    ``outputFormat`` is accepted but not consulted: with standardized JSON
    output every format shares the same rule set.
    """
    ruleLines = (
        "- Generate content in standardized JSON format following the document schema",
        "- Tables: Use JSON table format with headers and rows arrays",
        "- Lists: Use JSON list format with items array",
        "- Text: Use JSON paragraph format with text field",
        "- Headings: Use JSON heading format with level field",
        "- Structure: Follow the document JSON schema exactly",
    )
    # One rule per line, no leading or trailing whitespace
    return "\n".join(ruleLines)


async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None) -> str:
    """
    Use AI to extract the core content intention from the user prompt.
    Focus on WHAT the user wants to extract, not HOW to format it.

    Args:
        userPrompt: Raw user request; quotes are escaped and line breaks flattened
            before it is embedded in the AI request.
        outputFormat: Accepted for interface compatibility; not used here.
        aiService: Object exposing ``aiObjects.call(request)``; when falsy, a generic
            intent is returned without any AI call.

    Returns:
        The AI-provided intent text, or a generic "extract all relevant content"
        instruction when no service is given, the call fails, or the answer is
        empty. The error/empty fallback carries the raw user prompt.
    """
    import logging

    logger = logging.getLogger(__name__)

    if not aiService:
        # Fallback if no AI service available
        return "Extract all relevant content from the document according to the user's requirements"

    # Used on any error or empty answer - keeps the raw user prompt so language instructions survive
    fallbackIntent = f"Extract all relevant content from the document according to the user's requirements: {userPrompt}"

    try:
        # Protect userPrompt from injection by escaping quotes and newlines
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')

        # Simple AI call to extract the intention
        extractionPrompt = f"""
Extract the core content intention from this user request. Focus on WHAT raw data/content they want extracted.

User request: "{safeUserPrompt}"

Return only the content intention in a simple format like "Extract: [content description]"
Focus on extracting raw data, tables, lists, and factual content - NOT summaries or analysis.
If the user mentions a table, extract the actual table data with rows and columns.
If the user mentions a list, extract the actual list items.
IMPORTANT: Preserve any language requirements in your response.
Do not include formatting instructions, file types, or output methods.
"""

        # Logging instead of print: an stdout encoding error raised here would be
        # swallowed by the except below and silently skip the AI call.
        logger.debug("Calling AI for extraction intent...")

        # Import and set proper options for AI call
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL

        request = AiCallRequest(prompt=extractionPrompt, context="", options=request_options)
        response = await aiService.aiObjects.call(request)
        result = response.content if response else ""
        logger.debug("AI extraction intent result: '%s'", result)

        return result if result else fallbackIntent

    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions
        logger.warning("AI extraction intent failed: %s", e)
        return fallbackIntent