"""
Centralized prompt builder for document extraction and generation.

The builders in this module assemble the text prompts sent to the AI service:

- Extraction prompts ask the model to return the source content as JSON
  (single-document "sections" layout or multi-document "documents" layout).
- Generation prompts are themselves produced by an AI call and then have the
  shared JSON format rules spliced in.

Every builder accepts arbitrary user intent and degrades to a static fallback
prompt when no AI service is available or an AI call fails.
"""

import json
import re
from typing import Protocol, Dict, Any


# Tag passed to ``services.utils.debugLogToFile`` for every message from here.
_LOG_TAG = "PROMPT_BUILDER"

# Substrings of the lower-cased user prompt that select the e-mail oriented
# multi-file example instead of the generic one.
_EMAIL_KEYWORDS = ('email', 'mail', 'json', 'message', 'conversation')

# One-pass equivalent of the chained ``replace`` calls used to neutralise
# quotes and line breaks before a user prompt is embedded in another prompt.
_PROMPT_ESCAPES = str.maketrans({'"': '\\"', "'": "\\'", '\n': ' ', '\r': ' '})

# Instruction text shared verbatim by all extraction prompts.
_EXTRACTION_RULES = """
Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()

# Opening shared by the strict single-document prompts.
_STRICT_JSON_PREAMBLE = """
You are extracting structured content from documents and must respond with valid JSON only.

CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.
""".strip()

# Closing line shared by the strict single-document prompts.
_NO_SCHEMA_ECHO = (
    "DO NOT return a schema description - return actual extracted content "
    "in the JSON format shown above."
)

# Hand-written example of the single-document layout shown to the model by
# buildExtractionPrompt (kept as a plain string so braces need no escaping).
_INLINE_SCHEMA_EXAMPLE = """
{
  "metadata": {
    "title": "Document Title",
    "version": "1.0"
  },
  "sections": [
    {
      "id": "section_1",
      "type": "heading",
      "data": {
        "level": 1,
        "text": "Heading Text"
      }
    },
    {
      "id": "section_2",
      "type": "table",
      "data": {
        "headers": ["Column1", "Column2"],
        "rows": [["Data1", "Data2"], ["Data3", "Data4"]]
      }
    },
    {
      "id": "section_3",
      "type": "bullet_list",
      "data": {
        "items": ["Item 1", "Item 2", "Item 3"]
      }
    },
    {
      "id": "section_4",
      "type": "paragraph",
      "data": {
        "text": "Paragraph content here"
      }
    }
  ]
}
""".strip()

# Example payload for multi-file output when the request looks e-mail related.
_EMAIL_MULTI_FILE_EXAMPLE = {
    "metadata": {
        "title": "Email Conversations",
        "splitStrategy": "per_entity",
    },
    "documents": [
        {
            "id": "doc_1",
            "title": "Email from SENDER to RECIPIENT",
            "filename": "email_sender_to_recipient.txt",
            "sections": [
                {
                    "id": "section_1",
                    "content_type": "heading",
                    "elements": [
                        {"text": "Email from SENDER to RECIPIENT", "level": 1}
                    ],
                    "order": 1,
                },
                {
                    "id": "section_2",
                    "content_type": "paragraph",
                    "elements": [{"text": "FULL_EMAIL_CONTENT_HERE"}],
                    "order": 2,
                },
            ],
        }
    ],
}

# Example payload for every other multi-file request.
_GENERIC_MULTI_FILE_EXAMPLE = {
    "metadata": {
        "title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE",
        "splitStrategy": "by_section",
    },
    "documents": [
        {
            "id": "doc_1",
            "title": "REPLACE_WITH_ACTUAL_SECTION_TITLE",
            "filename": "REPLACE_WITH_ACTUAL_FILENAME",
            "sections": [
                {
                    "id": "section_1",
                    "content_type": "heading",
                    "elements": [
                        {"text": "REPLACE_WITH_ACTUAL_HEADING_TEXT", "level": 1}
                    ],
                    "order": 1,
                },
                {
                    "id": "section_2",
                    "content_type": "paragraph",
                    "elements": [
                        {"text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT"}
                    ],
                    "order": 2,
                },
            ],
        }
    ],
}


class _RendererLike(Protocol):
    """Structural type of the renderer argument of buildExtractionPrompt."""

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        # returns only format-specific guidelines
        ...


def _debugLog(services, message: str) -> None:
    """Send *message* to the debug log; do nothing when *services* is None.

    All builders default ``services`` to None, so logging must never be the
    reason a prompt (or its fallback) cannot be produced.
    """
    if services is None:
        return
    services.utils.debugLogToFile(message, _LOG_TAG)


def _writeDebugDump(services, fileSuffix: str, parts) -> None:
    """Best-effort dump of labelled prompt texts to ``./test-chat/ai``.

    Args:
        services: Service container or None. The dump is written only when
            ``services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)``
            is truthy.
        fileSuffix: File name tail; the file is ``<timestamp>_<fileSuffix>.txt``.
        parts: Iterable of ``(label, text)`` pairs, written as
            ``LABEL:\\n<text>\\n`` blocks separated by one blank line.

    Failures are logged and swallowed: debugging output must not break
    prompt building.
    """
    if services is None:
        return
    try:
        if not services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False):
            return
        import os
        from datetime import datetime, UTC
        ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
        debug_root = "./test-chat/ai"
        os.makedirs(debug_root, exist_ok=True)
        target = os.path.join(debug_root, f"{ts}_{fileSuffix}.txt")
        with open(target, "w", encoding="utf-8") as f:
            f.write("\n".join(f"{label}:\n{text}\n" for label, text in parts))
    except Exception as e:
        _debugLog(services, f"Debug dump '{fileSuffix}' failed: {str(e)}")


def _sanitizeUserPrompt(userPrompt: str) -> str:
    """Backslash-escape quotes and flatten line breaks to spaces.

    Used before the user's text is placed between double quotes inside a
    prompt sent to the AI.
    """
    return userPrompt.translate(_PROMPT_ESCAPES)


async def _callAi(aiService, prompt: str):
    """Send *prompt* with an empty context as a GENERAL operation.

    Returns whatever ``aiService.aiObjects.call`` returns; callers read
    ``.content`` from it and tolerate a falsy response.
    """
    # Imported lazily, as in the rest of this module, so the module can be
    # imported without the project data models being importable.
    from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
    options = AiCallOptions()
    options.operationType = OperationType.GENERAL
    request = AiCallRequest(prompt=prompt, context="", options=options)
    return await aiService.aiObjects.call(request)


def _isMultiFile(promptAnalysis: Dict[str, Any]) -> bool:
    """Tell whether the analysis asks for multi-file output.

    ``is_multi_file`` is the key this module has always read;
    ``requires_multi_file`` is the key the analysis prompt in
    buildGenericExtractionPrompt asks the model to return. Either one counts.
    """
    return bool(
        promptAnalysis.get("is_multi_file", False)
        or promptAnalysis.get("requires_multi_file", False)
    )


async def buildAdaptiveExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    promptAnalysis: Dict[str, Any],
    aiService=None,
    services=None
) -> str:
    """Build an extraction prompt shaped by a prior prompt analysis.

    Args:
        outputFormat: Target format; not used when composing the text.
        userPrompt: The user's request, placed at the top of the prompt.
        title: Document title; not used when composing the text.
        promptAnalysis: Analysis dict. A truthy ``is_multi_file`` or
            ``requires_multi_file`` selects the multi-document prompt.
        aiService: Unused here; accepted for signature parity.
        services: Unused here; accepted for signature parity.

    Returns:
        The stripped prompt text. The multi-file variant embeds an example
        with a root-level "documents" array; the single-file variant embeds
        the schema returned by ``get_adaptive_json_schema(promptAnalysis)``.
    """
    if _isMultiFile(promptAnalysis):
        # Keyword heuristic on the request text picks the e-mail flavoured example.
        lowered = userPrompt.lower()
        if any(keyword in lowered for keyword in _EMAIL_KEYWORDS):
            multi_file_example = _EMAIL_MULTI_FILE_EXAMPLE
        else:
            multi_file_example = _GENERIC_MULTI_FILE_EXAMPLE

        return f"""
{userPrompt}

You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.

TASK: Extract the actual content from the document and organize it into separate sections, where each section will become a separate file.

REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Identify distinct sections in the document (by headings, topics, or logical breaks)
3. Create one JSON document entry for each section found
4. Extract the real content from each section (headings, paragraphs, lists, etc.)
5. Generate appropriate filenames for each section

CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array.

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(multi_file_example, indent=2)}

IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have:
- "id": unique identifier
- "title": section title from the document
- "filename": appropriate filename for the section
- "sections": array of content sections

DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level.

INSTRUCTIONS:
- Replace "REPLACE_WITH_ACTUAL_*" placeholders with real content from the document
- Use actual section titles, headings, and text from the document
- Create meaningful filenames based on section content
- Ensure each section contains the complete content for that part of the document
- Do not use generic placeholder text like "Section 1", "Section 2"
- Extract real headings, paragraphs, lists, and other content elements
- CRITICAL: Return JSON with "documents" array, not "sections" array

CONTEXT (Document Content):

{_EXTRACTION_RULES}
""".strip()

    # Single-file prompt. The schema is only needed (and only fetched) here.
    from .subJsonSchema import get_adaptive_json_schema
    json_schema = get_adaptive_json_schema(promptAnalysis)

    return f"""
{userPrompt}

You are extracting structured content from documents and must respond with valid JSON only.

IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.

Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(json_schema, indent=2)}

{_EXTRACTION_RULES}
""".strip()


async def buildGenericExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """Build an extraction prompt that works for single and multi-file output.

    When *aiService* is given, the AI is first asked to classify the request;
    a parsable JSON-object answer is forwarded to
    buildAdaptiveExtractionPrompt. In every other case (no service, empty
    response, unparsable answer, any exception) the static single-document
    prompt is returned.

    Args:
        outputFormat: Target format, forwarded to the adaptive builder.
        userPrompt: The user's request.
        title: Document title, forwarded to the adaptive builder.
        aiService: Optional AI service exposing ``aiObjects.call``.
        services: Optional service container used for debug logging.

    Returns:
        The prompt text. The static fallback is returned unstripped, i.e.
        with a leading and trailing newline.
    """
    if aiService:
        try:
            analysis_prompt = f"""
Analyze this user request and determine the best JSON structure for document extraction.

User request: "{userPrompt}"

Respond with JSON only:
{{
    "requires_multi_file": true/false,
    "recommended_schema": "single_document|multi_document",
    "split_approach": "description of how to organize content",
    "file_naming": "suggested naming pattern"
}}

Consider the user's intent and the most logical way to organize the extracted content.
"""
            response = await _callAi(aiService, analysis_prompt)

            if response and response.content:
                result = response.content.strip()
                # Tolerate text around the JSON: keep first "{" through last "}".
                json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)

                analysis = json.loads(result)
                if not isinstance(analysis, dict):
                    # The adaptive builder needs key lookups; treat any other
                    # JSON value as a failed analysis.
                    raise ValueError("analysis response is not a JSON object")

                return await buildAdaptiveExtractionPrompt(
                    outputFormat, userPrompt, title, analysis, aiService, services
                )
        except Exception as e:
            _debugLog(services, f"Generic prompt analysis failed: {str(e)}")

    # Fallback to single-file prompt
    from .subJsonSchema import get_document_subJsonSchema
    json_schema = get_document_subJsonSchema()

    return f"""
{userPrompt}

{_STRICT_JSON_PREAMBLE}

Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(json_schema, indent=2)}

Requirements:
- Preserve all original data - do not summarize or interpret
- Use the exact JSON schema provided
- Maintain data integrity and structure

{_EXTRACTION_RULES}

{_NO_SCHEMA_ECHO}
"""


async def buildExtractionPrompt(
    outputFormat: str,
    renderer: _RendererLike,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """Build the final single-document JSON extraction prompt.

    The prompt consists of the extraction intent distilled from *userPrompt*
    (see _parseExtractionIntent), the strict JSON-only instructions, an inline
    example of the "metadata"/"sections" layout and the shared content rules.

    Args:
        outputFormat: Target format, forwarded to the intent parser.
        renderer: Accepted for interface compatibility; this builder does not
            call it.
        userPrompt: The user's request.
        title: Document title; not used when composing the text.
        aiService: Optional AI service used to distil the extraction intent.
        services: Optional service container for debug logging and the
            optional debug dump.

    Returns:
        The stripped prompt text.
    """
    # Separate WHAT to extract from HOW the result should be formatted.
    extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services)

    finalPrompt = f"""
{extractionIntent}

{_STRICT_JSON_PREAMBLE}

Extract the actual data from the source documents and structure it as JSON with this format:
{_INLINE_SCHEMA_EXAMPLE}

{_EXTRACTION_RULES}

{_NO_SCHEMA_ECHO}
""".strip()

    _debugLog(services, "EXTRACTION INTENT: Processed")

    # Save full extraction prompt to debug file - only if debug enabled
    _writeDebugDump(services, "extraction_prompt", [
        ("EXTRACTION PROMPT", finalPrompt),
        ("EXTRACTION INTENT", extractionIntent),
    ])

    return finalPrompt


async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """Ask the AI to write the generation prompt for the requested document.

    The AI is told to include the literal ``PLACEHOLDER_FOR_FORMAT_RULES``,
    which is then replaced with the text from _getFormatRules.

    Args:
        outputFormat: Target format named in the request and in fallbacks.
        userPrompt: The user's request; embedded in escaped form.
        title: Document title.
        aiService: Optional AI service exposing ``aiObjects.call``.
        services: Optional service container for debug logging and the
            optional debug dump.

    Returns:
        The AI-written prompt with format rules inserted. Without an AI
        service a static prompt is returned; on an empty response or any
        exception the static prompt is returned with the user's requirements
        appended.
    """
    if not aiService:
        # Fallback if no AI service available
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."

    # Keeps the raw user prompt so language instructions survive a failure.
    fallback = (
        f"Generate a comprehensive {outputFormat} document titled '{title}' "
        f"based on the extracted content. User requirements: {userPrompt}"
    )

    try:
        safeUserPrompt = _sanitizeUserPrompt(userPrompt)

        _debugLog(
            services,
            f"GENERATION PROMPT REQUEST: buildGenerationPrompt called with outputFormat='{outputFormat}', title='{title}'",
        )

        generationPromptRequest = f"""
Based on this user request, create a detailed generation prompt for creating a {outputFormat} document.

User request: "{safeUserPrompt}"
Document title: "{title}"
Output format: {outputFormat}

Create a generation prompt that:
1. Identifies what content is most important for the user
2. Specifies how to structure and organize the content
3. Includes any specific formatting or presentation requirements
4. Preserves any language requirements
5. Ensures the document meets the user's needs

IMPORTANT: Always generate content in STANDARDIZED JSON FORMAT. In your response, include the exact text "PLACEHOLDER_FOR_FORMAT_RULES" where specific format rules will be inserted afterwards automatically.

CRITICAL: You MUST start your response with exactly "Generate a {outputFormat} document that:" - do NOT use "docx" or any other format. Use the exact format specified: {outputFormat}

Return only the generation prompt, starting with "Generate a {outputFormat} document that..."
"""

        _debugLog(services, "GENERATION PROMPT REQUEST: Calling AI for generation prompt...")
        response = await _callAi(aiService, generationPromptRequest)
        result = response.content if response else ""

        # Replace the placeholder that the AI created with actual format rules
        if result:
            result = result.replace("PLACEHOLDER_FOR_FORMAT_RULES", _getFormatRules(outputFormat))

        _debugLog(services, "GENERATION PROMPT: Generated successfully")

        # Save full generation prompt and AI response - only if debug enabled
        _writeDebugDump(services, "generation_prompt", [
            ("GENERATION PROMPT REQUEST", generationPromptRequest),
            ("GENERATION PROMPT AI RESPONSE", response.content if response else 'No response'),
            ("GENERATION PROMPT FINAL", result if result else 'None'),
        ])

        return result if result else fallback

    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions
        _debugLog(services, f"DEBUG: AI generation prompt failed: {str(e)}")
        return fallback


def _getFormatRules(outputFormat: str) -> str:
    """Return the JSON generation rules inserted into generation prompts.

    *outputFormat* is accepted but does not influence the result: the same
    rule text is returned for every format.
    """
    return """
- Generate content in standardized JSON format following the document schema
- Tables: Use JSON table format with headers and rows arrays
- Lists: Use JSON list format with items array
- Text: Use JSON paragraph format with text field
- Headings: Use JSON heading format with level field
- Structure: Follow the document JSON schema exactly
""".strip()


async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
    """Use the AI to reduce the user prompt to its content intention.

    The request focuses on WHAT should be extracted, not HOW it is formatted.

    Args:
        userPrompt: The user's request; embedded in escaped form.
        outputFormat: Accepted for signature parity; not used.
        aiService: Optional AI service exposing ``aiObjects.call``.
        services: Optional service container used for debug logging.

    Returns:
        The AI's answer. Without an AI service a generic instruction is
        returned; on an empty response or any exception the generic
        instruction is returned with the raw user prompt appended.
    """
    if not aiService:
        # Fallback if no AI service available
        return "Extract all relevant content from the document according to the user's requirements"

    # Keeps the raw user prompt so language instructions survive a failure.
    fallback = (
        "Extract all relevant content from the document according to the "
        f"user's requirements: {userPrompt}"
    )

    try:
        safeUserPrompt = _sanitizeUserPrompt(userPrompt)

        extractionPrompt = f"""
Extract the core content intention from this user request. Focus on WHAT raw data/content they want extracted.

User request: "{safeUserPrompt}"

Return only the content intention in a simple format like "Extract: [content description]"

Focus on extracting raw data, tables, lists, and factual content - NOT summaries or analysis.
If the user mentions a table, extract the actual table data with rows and columns.
If the user mentions a list, extract the actual list items.

IMPORTANT: Preserve any language requirements in your response.
Do not include formatting instructions, file types, or output methods.
"""

        _debugLog(services, "DEBUG: Calling AI for extraction intent...")
        response = await _callAi(aiService, extractionPrompt)
        result = response.content if response else ""

        _debugLog(services, "DEBUG: Extraction intent processed")
        return result if result else fallback

    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions
        _debugLog(services, f"DEBUG: AI extraction intent failed: {str(e)}")
        return fallback