# gateway/modules/serviceCenter/services/serviceGeneration/subPromptBuilderGeneration.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Prompt builder for document generation.
This module builds prompts for generating documents from extracted content.
"""

import logging
from typing import Dict, Any
from modules.datamodels.datamodelJson import jsonTemplateDocument

logger = logging.getLogger(__name__)


def _resolveUserLanguage(services: Any, default: str = "en") -> Any:
    """Return the language to request from the model, falling back to ``default``.

    Lookup order: ``services.currentUserLanguage`` first, then
    ``services.user.language``. A missing, ``None`` or empty value at either
    step is skipped so that it can never end up in the prompt text.

    Args:
        services: Optional services object; may be ``None``.
        default: Value returned when no usable language is found.

    Returns:
        The first truthy language value found, otherwise ``default``. The value
        is inserted verbatim into the prompt (presumably a language code such
        as 'en' - confirm against the services implementation).
    """
    if not services:
        return default
    try:
        # Prefer the detected language if available
        detectedLanguage = getattr(services, "currentUserLanguage", None)
        if detectedLanguage:
            return detectedLanguage

        user = getattr(services, "user", None)
        profileLanguage = getattr(user, "language", None) if user else None
        if profileLanguage:
            return profileLanguage
    except Exception:
        # Best effort: the language lookup must never break prompt building,
        # but leave a trace instead of swallowing the error silently.
        logger.debug("Could not resolve user language; falling back to %r", default, exc_info=True)
    return default


def _buildContinuationText(continuationContext: Dict[str, Any]) -> str:
    """Build the "where generation stopped" section of a continuation prompt.

    Reads ``delivered_summary``, ``hierarchy_context`` and ``overlap_context``
    from ``continuationContext`` (the latter two come from the centralized
    jsonContinuation contexts, per the original design comment). Missing or
    ``None`` values are omitted rather than rendered as the text "None".

    Args:
        continuationContext: Context dict of the previous, cut-off generation.

    Returns:
        The continuation section, ending with a blank line.
    """
    # `or ""` also covers a key that is present but explicitly None
    deliveredSummary = continuationContext.get("delivered_summary") or ""
    hierarchyContext = continuationContext.get("hierarchy_context")
    overlapContext = continuationContext.get("overlap_context")

    parts = [
        f"{deliveredSummary}\n\n",
        "⚠️ CONTINUATION: Response was cut off. Generate ONLY the remaining content that comes AFTER the reference elements below.\n\n",
    ]

    # Cut-off information is shown ONLY as REFERENCE so the model knows where
    # generation stopped; it is always included when available.
    if hierarchyContext:
        parts.append("# REFERENCE: Structure context (already delivered - DO NOT repeat):\n")
        parts.append(f"{hierarchyContext}\n\n")

    if overlapContext:
        parts.append("# REFERENCE: Overlap context - incomplete element at cut point (DO NOT repeat):\n")
        parts.append(f"{overlapContext}\n\n")

    parts.append("⚠️ CRITICAL: The elements above are REFERENCE ONLY. They are already delivered.\n")
    parts.append("Generate ONLY what comes AFTER these elements. DO NOT regenerate the entire JSON structure.\n")
    parts.append("Start directly with the next element/section that should follow.\n\n")
    return "".join(parts)


async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    extracted_content: str = None,
    continuationContext: Dict[str, Any] = None,
    services: Any = None,
    useContentParts: bool = False
) -> str:
    """
    Build the unified generation prompt using a single JSON template.

    Three prompt variants are produced:
      1. Continuation prompt - when ``continuationContext`` carries a non-empty
         ``last_raw_json`` other than ``{}``.
      2. First call with extracted content - when ``extracted_content`` is given
         and ``useContentParts`` is False.
      3. First call without embedded content - in every other case.

    Args:
        outputFormat: Target output format (html, pdf, docx, etc.) - not used in prompt
        userPrompt: User's original prompt for document generation
        title: Title for the document; "Generated Document" is used when empty
        extracted_content: Optional extracted content from documents; embedded
            after the user request as the data source
        continuationContext: Optional context from previous generation for continuation
        services: Optional services instance for accessing user language
        useContentParts: If True, the extracted content is NOT embedded in the
            prompt (per the original design note, ContentParts are then passed
            to the AI call directly) and variant 3 is used instead of variant 2

    Returns:
        Complete generation prompt string, stripped of leading/trailing whitespace
    """
    userLanguage = _resolveUserLanguage(services)

    # Fill the template's title placeholder; a fixed fallback title is used when none is given.
    # NOTE(review): the title is inserted verbatim - if the placeholder sits inside a JSON
    # string, a title containing quotes would make the example template invalid JSON.
    titleValue = title if title else "Generated Document"
    jsonTemplate = jsonTemplateDocument.replace("{{DOCUMENT_TITLE}}", titleValue)

    # The user-request banner is identical in all three prompt variants
    separator = "=" * 80
    userRequestBlock = (
        f"{separator}\n"
        "USER REQUEST / USER PROMPT:\n"
        f"{separator}\n"
        f"{userPrompt}\n"
        f"{separator}\n"
        "END OF USER REQUEST / USER PROMPT\n"
        f"{separator}"
    )

    # CRITICAL: Continuation is allowed whenever a raw JSON fragment exists, even if no
    # sections could be parsed from it (JSON too broken) - only "no fragment" and a bare
    # "{}" count as "nothing delivered yet".
    lastRawJson = continuationContext.get("last_raw_json", "") if continuationContext else ""
    hasContinuation = bool(lastRawJson) and lastRawJson.strip() != "{}"

    if hasContinuation:
        # PROMPT FOR CONTINUATION
        continuationText = _buildContinuationText(continuationContext)
        generationPrompt = f"""{userRequestBlock}

⚠️ CONTINUATION MODE: Response was incomplete. Generate ONLY the remaining content.

LANGUAGE REQUIREMENT: All generated content must be in the language '{userLanguage}'. Generate all text, headings, paragraphs, and content in this language.

{continuationText}

JSON structure template:
{jsonTemplate}

Rules:
- Return ONLY valid JSON (no comments, no trailing commas, double quotes only).
- Reference elements shown above are ALREADY DELIVERED - DO NOT repeat them.
- Generate ONLY the remaining content that comes AFTER the reference elements.
- DO NOT regenerate the entire JSON structure - start directly with what comes next.
- All content must be in the language '{userLanguage}'.
- Output JSON only; no markdown fences or extra text.

Continue generating the remaining content now.
"""
    elif extracted_content and not useContentParts:
        # PROMPT FOR FIRST CALL (with extracted content)
        # Structure: user request, then the extracted content marked clearly as the
        # source data, then the JSON template, then the instructions.
        generationPrompt = f"""{userRequestBlock}

{separator}
⚠️ CRITICAL: USE THIS EXTRACTED CONTENT AS YOUR DATA SOURCE ⚠️
{separator}
The content below contains the ACTUAL DATA extracted from the source documents.
You MUST use this data - DO NOT generate fake or example data.
{separator}
EXTRACTED CONTENT FROM DOCUMENTS:
{separator}
{extracted_content}
{separator}
END OF EXTRACTED CONTENT
{separator}

LANGUAGE REQUIREMENT: All generated content must be in the language '{userLanguage}'. Generate all text, headings, paragraphs, and content in this language. If the extracted content is in a different language, translate it to '{userLanguage}' while preserving the structure and meaning.

Generate a VALID JSON response using the EXTRACTED CONTENT above as your data source.
The JSON structure template below shows ONLY the structure pattern - the example values are NOT real data.
You MUST use the actual data from EXTRACTED CONTENT above, NOT the example values from the template.

JSON structure template (structure only - use data from EXTRACTED CONTENT above):
{jsonTemplate}

Instructions:
- Return ONLY valid JSON (strict). No comments. No trailing commas. Use double quotes.
- Do NOT reuse example section IDs; create your own.
- CRITICAL: Use the ACTUAL DATA from EXTRACTED CONTENT above, NOT the example values from the template.
- Generate complete content based on the user request and the extracted content. Do NOT just give an instruction or comments. Deliver the complete response.
- All content must be in the language '{userLanguage}'.
- IMPORTANT: Set a meaningful "filename" in each document with appropriate file extension (e.g., "prime_numbers.txt", "report.docx", "data.json"). The filename should reflect the content and task objective.
- Output JSON only; no markdown fences or extra text.

Generate your complete response using the extracted content data.
"""
    else:
        # PROMPT FOR FIRST CALL (no embedded content)
        # Used when there is no extracted content, or when useContentParts=True and the
        # content is therefore deliberately kept out of the prompt text.
        generationPrompt = f"""{userRequestBlock}

LANGUAGE REQUIREMENT: All generated content must be in the language '{userLanguage}'. Generate all text, headings, paragraphs, and content in this language.

Generate a VALID JSON response for the user request. The template below shows ONLY the structure pattern - it is NOT existing content.

JSON structure template:
{jsonTemplate}

Instructions:
- Return ONLY valid JSON (strict). No comments. No trailing commas. Use double quotes.
- Do NOT reuse example section IDs; create your own.
- Generate complete content based on the user request. Do NOT just give an instruction or comments. Deliver the complete response.
- All content must be in the language '{userLanguage}'.
- IMPORTANT: Set a meaningful "filename" in each document with appropriate file extension (e.g., "prime_numbers.txt", "report.docx", "data.json"). The filename should reflect the content and task objective.
- Output JSON only; no markdown fences or extra text.

Generate your complete response.
"""

    return generationPrompt.strip()