""" Centralized prompt builder for document generation across formats. Builds a robust prompt that: - Accepts any user intent (no fixed structure assumptions) - Injects format-specific guidelines from the selected renderer - Adds a common policy section to always use real data from source docs - Requires the AI to output a filename header that we can parse and use """ from typing import Protocol class _RendererLike(Protocol): def getExtractionPrompt(self, user_prompt: str, title: str) -> str: # returns only format-specific guidelines ... async def buildExtractionPrompt( outputFormat: str, renderer: _RendererLike, userPrompt: str, title: str, aiService=None, services=None ) -> str: """ Build the final extraction prompt by combining: - Parsed extraction intent from user prompt (using AI) - Generic cross-format instructions (filename header + real-data policy) - Format-specific guidelines snippet provided by the renderer The AI must place a single filename header at the very top: FILENAME: followed by a blank line and then ONLY the document content according to the target format. """ # Parse user prompt to separate extraction intent from generation format using AI extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services) # Import JSON schema for structured output from .subJsonSchema import get_document_subJsonSchema jsonSchema = get_document_subJsonSchema() # Generic block for JSON extraction genericIntro = f""" {extractionIntent} You are extracting structured content from documents and must respond with valid JSON only. IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure. Extract the actual data from the source documents and structure it as JSON with this format: {{ "metadata": {{ "title": "Document Title", "version": "1.0" }}, "sections": [ {{ "id": "section_1", "type": "heading", "data": {{ "level": 1, "text": "Heading Text" }} }}, {{ "id": "section_2", "type": "table", "data": {{ "headers": ["Column1", "Column2"], "rows": [["Data1", "Data2"], ["Data3", "Data4"]] }} }}, {{ "id": "section_3", "type": "bullet_list", "data": {{ "items": ["Item 1", "Item 2", "Item 3"] }} }}, {{ "id": "section_4", "type": "paragraph", "data": {{ "text": "Paragraph content here" }} }} ] }} Content Types to Extract: 1. Tables: Extract all rows and columns with proper headers 2. Lists: Extract all items with proper nesting 3. Headings: Extract with appropriate levels 4. Paragraphs: Extract as structured text 5. Code: Extract code blocks with language identification 6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements Image Analysis Requirements: - If you cannot analyze an image for any reason, explain why in the JSON response - Describe everything you see in the image - Include all text content, tables, logos, graphics, layout, and visual elements - If the image is too small, corrupted, or unclear, explain this - Always provide feedback - never return empty responses Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON. """.strip() # Final assembly finalPrompt = genericIntro # Debug output services.utils.debugLogToFile(f"EXTRACTION INTENT: Processed", "PROMPT_BUILDER") # Save full extraction prompt to debug file - only if debug enabled try: debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) if debug_enabled: import os from datetime import datetime, UTC ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") debug_root = "./test-chat/ai" os.makedirs(debug_root, exist_ok=True) with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f: f.write(f"EXTRACTION PROMPT:\n{finalPrompt}\n\n") f.write(f"EXTRACTION INTENT:\n{extractionIntent}\n") except Exception: pass return finalPrompt async def buildGenerationPrompt( outputFormat: str, userPrompt: str, title: str, aiService=None, services=None ) -> str: """ Use AI to build the generation prompt based on user intent and format requirements. Focus on what's important for the user and how to structure the content. """ if not aiService: # Fallback if no AI service available return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content." try: # Protect userPrompt from injection safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ') # Debug output services.utils.debugLogToFile(f"GENERATION PROMPT REQUEST: buildGenerationPrompt called with outputFormat='{outputFormat}', title='{title}'", "PROMPT_BUILDER") # AI call to generate the appropriate generation prompt generationPromptRequest = f""" Based on this user request, create a detailed generation prompt for creating a {outputFormat} document. User request: "{safeUserPrompt}" Document title: "{title}" Output format: {outputFormat} Create a generation prompt that: 1. Identifies what content is most important for the user 2. Specifies how to structure and organize the content 3. Includes any specific formatting or presentation requirements 4. Preserves any language requirements 5. Ensures the document meets the user's needs IMPORTANT: Always generate content in STANDARDIZED JSON FORMAT. In your response, include the exact text "PLACEHOLDER_FOR_FORMAT_RULES" where specific format rules will be inserted afterwards automatically. CRITICAL: You MUST start your response with exactly "Generate a {outputFormat} document that:" - do NOT use "docx" or any other format. Use the exact format specified: {outputFormat} Return only the generation prompt, starting with "Generate a {outputFormat} document that..." """ # Call AI service to generate the prompt services.utils.debugLogToFile("GENERATION PROMPT REQUEST: Calling AI for generation prompt...", "PROMPT_BUILDER") # Import and set proper options for AI call from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType request_options = AiCallOptions() request_options.operationType = OperationType.GENERAL request = AiCallRequest(prompt=generationPromptRequest, context="", options=request_options) response = await aiService.aiObjects.call(request) result = response.content if response else "" # Replace the placeholder that the AI created with actual format rules if result: formatRules = _getFormatRules(outputFormat) result = result.replace("PLACEHOLDER_FOR_FORMAT_RULES", formatRules) # Debug output services.utils.debugLogToFile(f"GENERATION PROMPT: Generated successfully", "PROMPT_BUILDER") # Save full generation prompt and AI response to debug file - only if debug enabled try: debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) if debug_enabled: import os from datetime import datetime, UTC ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") debug_root = "./test-chat/ai" os.makedirs(debug_root, exist_ok=True) with open(os.path.join(debug_root, f"{ts}_generation_prompt.txt"), "w", encoding="utf-8") as f: f.write(f"GENERATION PROMPT REQUEST:\n{generationPromptRequest}\n\n") f.write(f"GENERATION PROMPT AI RESPONSE:\n{response.content if response else 'No response'}\n\n") f.write(f"GENERATION PROMPT FINAL:\n{result if result else 'None'}\n") except Exception: pass return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}" except Exception as e: # Fallback on any error - preserve user prompt for language instructions services.utils.debugLogToFile(f"DEBUG: AI generation prompt failed: {str(e)}", "PROMPT_BUILDER") return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}" def _getFormatRules(outputFormat: str) -> str: """ Get format-specific rules for JSON-based generation. Since we now use standardized JSON, all formats follow the same rules. """ return """ - Generate content in standardized JSON format following the document schema - Tables: Use JSON table format with headers and rows arrays - Lists: Use JSON list format with items array - Text: Use JSON paragraph format with text field - Headings: Use JSON heading format with level field - Structure: Follow the document JSON schema exactly """.strip() async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str: """ Use AI to extract the core content intention from the user prompt. Focus on WHAT the user wants to extract, not HOW to format it. """ if not aiService: # Fallback if no AI service available return "Extract all relevant content from the document according to the user's requirements" try: # Protect userPrompt from injection by escaping quotes and newlines safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ') # Simple AI call to extract the intention extractionPrompt = f""" Extract the core content intention from this user request. Focus on WHAT raw data/content they want extracted. User request: "{safeUserPrompt}" Return only the content intention in a simple format like "Extract: [content description]" Focus on extracting raw data, tables, lists, and factual content - NOT summaries or analysis. If the user mentions a table, extract the actual table data with rows and columns. If the user mentions a list, extract the actual list items. IMPORTANT: Preserve any language requirements in your response. Do not include formatting instructions, file types, or output methods. """ # Call AI service to extract intention services.utils.debugLogToFile("DEBUG: Calling AI for extraction intent...", "PROMPT_BUILDER") # Import and set proper options for AI call from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType request_options = AiCallOptions() request_options.operationType = OperationType.GENERAL request = AiCallRequest(prompt=extractionPrompt, context="", options=request_options) response = await aiService.aiObjects.call(request) result = response.content if response else "" services.utils.debugLogToFile(f"DEBUG: Extraction intent processed", "PROMPT_BUILDER") return result if result else f"Extract all relevant content from the document according to the user's requirements: {userPrompt}" except Exception as e: # Fallback on any error - preserve user prompt for language instructions services.utils.debugLogToFile(f"DEBUG: AI extraction intent failed: {str(e)}", "PROMPT_BUILDER") return f"Extract all relevant content from the document according to the user's requirements: {userPrompt}"