""" Prompt builder for AI document generation and extraction. This module builds prompts for AI services to extract and generate documents. """ import json import logging from typing import Dict, Any, Optional, List, TYPE_CHECKING from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType # Type hint for renderer parameter if TYPE_CHECKING: from .renderers.rendererBaseTemplate import BaseRenderer _RendererLike = BaseRenderer else: _RendererLike = Any logger = logging.getLogger(__name__) async def buildAdaptiveExtractionPrompt( outputFormat: str, userPrompt: str, title: str, promptAnalysis: Dict[str, Any], aiService=None, services=None ) -> str: """ Build adaptive extraction prompt based on AI analysis. Uses multi-file or single-file approach based on analysis. """ # Multi-file example data instead of schema multi_file_example = { "metadata": { "title": "Multi-Document Example", "splitStrategy": "by_section", "source_documents": ["doc_001"], "extraction_method": "ai_extraction" }, "documents": [ { "id": "doc_section_1", "title": "Section 1 Title", "filename": "section_1.xlsx", "sections": [ { "id": "table_1", "content_type": "table", "elements": [ { "headers": ["Column 1", "Column 2"], "rows": [["Value 1", "Value 2"]] } ], "order": 1 } ] } ] } # Single-file example data instead of schema single_file_example = { "metadata": { "title": "Single Document Example", "source_documents": ["doc_001"], "extraction_method": "ai_extraction" }, "sections": [ { "id": "table_1", "content_type": "table", "elements": [ { "headers": ["Column 1", "Column 2"], "rows": [["Value 1", "Value 2"]] } ], "order": 1 } ] } if promptAnalysis.get("is_multi_file", False): # Multi-file prompt adaptive_prompt = f""" {userPrompt} You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output. 
TASK: Extract the actual content from the document and organize it into separate sections, where each section will become a separate file. REQUIREMENTS: 1. Analyze the document content provided in the context below 2. Identify distinct sections in the document (by headings, topics, or logical breaks) 3. Create one JSON document entry for each section found 4. Extract the real content from each section (headings, paragraphs, lists, etc.) 5. Generate appropriate filenames for each section CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array. OUTPUT FORMAT: Return only valid JSON in this exact structure: {json.dumps(multi_file_example, indent=2)} IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have: - "id": unique identifier - "title": section title from the document - "filename": appropriate filename for the section - "sections": array of content sections DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level. INSTRUCTIONS: - Replace "REPLACE_WITH_ACTUAL_*" placeholders with real content from the document - Use actual section titles, headings, and text from the document - Create meaningful filenames based on section content - Ensure each section contains the complete content for that part of the document - Do not use generic placeholder text like "Section 1", "Section 2" - Extract real headings, paragraphs, lists, and other content elements - CRITICAL: Return JSON with "documents" array, not "sections" array CONTEXT (Document Content): Content Types to Extract: 1. Tables: Extract all rows and columns with proper headers 2. Lists: Extract all items with proper nesting 3. Headings: Extract with appropriate levels 4. Paragraphs: Extract as structured text 5. Code: Extract code blocks with language identification 6. 
Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements Image Analysis Requirements: - If you cannot analyze an image for any reason, explain why in the JSON response - Describe everything you see in the image - Include all text content, tables, logos, graphics, layout, and visual elements - If the image is too small, corrupted, or unclear, explain this - Always provide feedback - never return empty responses Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON. Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents. """.strip() else: # Single-file prompt - use example data instead of schema adaptive_prompt = f""" {userPrompt} You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output. TASK: Extract the actual content from the document and organize it into structured sections. REQUIREMENTS: 1. Analyze the document content provided in the context below 2. Extract all content and organize it into logical sections 3. Create structured JSON with sections containing the extracted content 4. Preserve the original structure and data OUTPUT FORMAT: Return only valid JSON in this exact structure: {json.dumps(single_file_example, indent=2)} INSTRUCTIONS: - Replace example data with actual content from the document - Use actual headings, paragraphs, and text from the document - Ensure all content is properly structured - Do not use generic placeholder text - Extract real content from the documents CONTEXT (Document Content): Content Types to Extract: 1. Tables: Extract all rows and columns with proper headers 2. Lists: Extract all items with proper nesting 3. 
Headings: Extract with appropriate levels 4. Paragraphs: Extract as structured text 5. Code: Extract code blocks with language identification 6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements Image Analysis Requirements: - If you cannot analyze an image for any reason, explain why in the JSON response - Describe everything you see in the image - Include all text content, tables, logos, graphics, layout, and visual elements - If the image is too small, corrupted, or unclear, explain this - Always provide feedback - never return empty responses Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON. Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents. """.strip() return adaptive_prompt async def buildGenericExtractionPrompt( outputFormat: str, userPrompt: str, title: str, aiService=None, services=None ) -> str: """Build generic extraction prompt that works for both single and multi-file.""" # Use AI to determine the best approach if aiService: try: analysis_prompt = f""" Analyze this user request and determine the best JSON structure for document extraction. User request: "{userPrompt}" Respond with JSON only: {{ "requires_multi_file": true/false, "recommended_schema": "single_document|multi_document", "split_approach": "description of how to organize content", "file_naming": "suggested naming pattern" }} Consider the user's intent and the most logical way to organize the extracted content. 
""" from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType request_options = AiCallOptions() request_options.operationType = OperationType.GENERAL request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options) response = await aiService.aiObjects.call(request) if response and response.content: import re result = response.content.strip() json_match = re.search(r'\{.*\}', result, re.DOTALL) if json_match: result = json_match.group(0) analysis = json.loads(result) # Use analysis to build appropriate prompt return await buildAdaptiveExtractionPrompt( outputFormat, userPrompt, title, analysis, aiService, services ) except Exception as e: services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER") # Fallback to single-file prompt example_data = { "metadata": { "title": "Example Document", "author": "AI Assistant", "source_documents": ["document_001"], "extraction_method": "ai_extraction" }, "sections": [ { "id": "section_001", "content_type": "table", "elements": [ { "headers": ["Column 1", "Column 2", "Column 3"], "rows": [ ["Value 1", "Value 2", "Value 3"], ["Value 4", "Value 5", "Value 6"] ] } ], "order": 1, "metadata": {} } ], "summary": "", "tags": [] } return f""" {userPrompt} You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output. TASK: Extract the actual content from the document and organize it into structured sections. REQUIREMENTS: 1. Analyze the document content provided in the context below 2. Extract all content and organize it into logical sections 3. Create structured JSON with sections containing the extracted content 4. 
Preserve the original structure and data OUTPUT FORMAT: Return only valid JSON in this exact structure: {json.dumps(example_data, indent=2)} Requirements: - Preserve all original data - do not summarize or interpret - Use the exact JSON format shown above - Maintain data integrity and structure Content Types to Extract: 1. Tables: Extract all rows and columns with proper headers 2. Lists: Extract all items with proper nesting 3. Headings: Extract with appropriate levels 4. Paragraphs: Extract as structured text 5. Code: Extract code blocks with language identification 6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements Image Analysis Requirements: - If you cannot analyze an image for any reason, explain why in the JSON response - Describe everything you see in the image - Include all text content, tables, logos, graphics, layout, and visual elements - If the image is too small, corrupted, or unclear, explain this - Always provide feedback - never return empty responses Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON. Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents. DO NOT return a schema description - return actual extracted content in the JSON format shown above. 
""" async def buildExtractionPrompt( outputFormat: str, renderer: _RendererLike, userPrompt: str, title: str, aiService=None, services=None ) -> str: """ Build the final extraction prompt by combining: - Parsed extraction intent from user prompt (using AI) - Generic cross-format instructions (filename header + real-data policy) - Format-specific guidelines snippet provided by the renderer The AI must place a single filename header at the very top: FILENAME: followed by a blank line and then ONLY the document content according to the target format. """ # Parse user prompt to separate extraction intent from generation format using AI extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services) # Import JSON schema for structured output from .subJsonSchema import get_document_subJsonSchema jsonSchema = get_document_subJsonSchema() # Generic block for JSON extraction - use example data instead of schema example_data = { "metadata": { "title": "Example Document", "author": "AI Assistant", "source_documents": ["document_001"], "extraction_method": "ai_extraction" }, "sections": [ { "id": "section_001", "content_type": "table", "elements": [ { "headers": ["Column 1", "Column 2", "Column 3"], "rows": [ ["Value 1", "Value 2", "Value 3"], ["Value 4", "Value 5", "Value 6"] ] } ], "order": 1, "metadata": {} } ], "summary": "", "tags": [] } genericIntro = f""" {extractionIntent} You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output. TASK: Extract the actual content from the document and organize it into structured sections. REQUIREMENTS: 1. Analyze the document content provided in the context below 2. Extract all content and organize it into logical sections 3. Create structured JSON with sections containing the extracted content 4. 
Preserve the original structure and data OUTPUT FORMAT: Return only valid JSON in this exact structure: {json.dumps(example_data, indent=2)} Requirements: - Preserve all original data - do not summarize or interpret - Use the exact JSON format shown above - Maintain data integrity and structure Content Types to Extract: 1. Tables: Extract all rows and columns with proper headers 2. Lists: Extract all items with proper nesting 3. Headings: Extract with appropriate levels 4. Paragraphs: Extract as structured text 5. Code: Extract code blocks with language identification 6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements Image Analysis Requirements: - If you cannot analyze an image for any reason, explain why in the JSON response - Describe everything you see in the image - Include all text content, tables, logos, graphics, layout, and visual elements - If the image is too small, corrupted, or unclear, explain this - Always provide feedback - never return empty responses Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON. Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents. DO NOT return a schema description - return actual extracted content in the JSON format shown above. 
""" # Get format-specific guidelines from renderer formatGuidelines = "" try: if hasattr(renderer, 'getExtractionGuidelines'): formatGuidelines = renderer.getExtractionGuidelines() except Exception: pass # Combine all parts finalPrompt = f"{genericIntro}\n\n{formatGuidelines}".strip() # Save extraction prompt to debug file - only if debug enabled try: debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) if debug_enabled: import os from datetime import datetime, UTC ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") debug_root = "./test-chat/ai" os.makedirs(debug_root, exist_ok=True) with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f: f.write(finalPrompt) except Exception: pass return finalPrompt async def buildGenerationPrompt( outputFormat: str, userPrompt: str, title: str, aiService=None, services=None ) -> str: """ Use AI to build the generation prompt based on user intent and format requirements. Focus on what's important for the user and how to structure the content. """ if not aiService: # Fallback if no AI service available return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content." try: # Protect userPrompt from injection safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ') # Debug output services.utils.debugLogToFile(f"GENERATION PROMPT REQUEST: buildGenerationPrompt called with outputFormat='{outputFormat}', title='{title}'", "PROMPT_BUILDER") # AI call to generate the appropriate generation prompt generationPromptRequest = f""" You are creating instructions for an AI to generate JSON content in the CANONICAL FORMAT that will be converted to a {outputFormat} document. User request: "{safeUserPrompt}" Document title: "{title}" Target format: {outputFormat} Write clear, detailed instructions that tell the AI how to generate JSON content using the CANONICAL JSON FORMAT. Focus on: 1. 
async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """
    Use AI to build the generation prompt based on user intent and format requirements.
    Focus on what's important for the user and how to structure the content.

    Args:
        outputFormat: Target output format (e.g. "xlsx", "pdf").
        userPrompt: Raw user request; lightly escaped before embedding.
        title: Document title, embedded in the meta-prompt.
        aiService: Optional AI service exposing ``aiObjects.call``; without it
            a static fallback prompt is returned.
        services: Optional service container for debug logging / config.

    Returns:
        The generation prompt produced by the AI, or a fallback string.
    """
    if not aiService:
        # Fallback if no AI service available
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."

    def _debug(message: str) -> None:
        # FIX: `services` may be None — previously the except handler below
        # dereferenced it unconditionally, turning the fallback path into an
        # AttributeError crash.
        if services:
            services.utils.debugLogToFile(message, "PROMPT_BUILDER")

    try:
        # Protect userPrompt from injection (quote/newline escaping only).
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')

        _debug(f"GENERATION PROMPT REQUEST: buildGenerationPrompt called with outputFormat='{outputFormat}', title='{title}'")

        # AI call to generate the appropriate generation prompt
        generationPromptRequest = f"""
You are creating instructions for an AI to generate JSON content in the CANONICAL FORMAT that will be converted to a {outputFormat} document.

User request: "{safeUserPrompt}"
Document title: "{title}"
Target format: {outputFormat}

Write clear, detailed instructions that tell the AI how to generate JSON content using the CANONICAL JSON FORMAT. Focus on:
1. What content is most important for the user
2. How to structure and organize the content using the canonical JSON format with 'sections'
3. Specific formatting requirements for the target format
4. Language requirements to preserve
5. How to ensure the JSON content meets the user's needs

CRITICAL: The AI MUST generate content using the CANONICAL JSON FORMAT with this exact structure:
{{
  "metadata": {{
    "title": "Document Title"
  }},
  "sections": [
    {{
      "id": "section_1",
      "content_type": "table",
      "elements": [
        {{
          "headers": ["Column1", "Column2", "Column3"],
          "rows": [
            ["Value1", "Value2", "Value3"],
            ["Value4", "Value5", "Value6"]
          ]
        }}
      ],
      "order": 1
    }}
  ]
}}

The AI should NOT create format-specific structures like "sheets" or "columns" - only use the canonical format with "sections" and "elements".

Write the instructions as plain text, not JSON.
Start with "Generate JSON content that..." and provide clear, actionable instructions for creating structured JSON data in the canonical format.
"""

        _debug("GENERATION PROMPT REQUEST: Calling AI for generation prompt...")

        # Import and set proper options for AI call
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL
        request = AiCallRequest(prompt=generationPromptRequest, context="", options=request_options)
        response = await aiService.aiObjects.call(request)
        result = response.content if response else ""

        # Replace the placeholder that the AI created with actual format rules.
        # NOTE(review): the meta-prompt above never mentions
        # PLACEHOLDER_FOR_FORMAT_RULES — confirm whether this replace is still needed.
        if result:
            formatRules = _getFormatRules(outputFormat)
            result = result.replace("PLACEHOLDER_FOR_FORMAT_RULES", formatRules)

        _debug("GENERATION PROMPT: Generated successfully")

        # Save full generation prompt and AI response to debug file - only if debug enabled
        try:
            debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) if services else False
            if debug_enabled:
                import os
                from datetime import datetime, UTC
                ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
                debug_root = "./test-chat/ai"
                os.makedirs(debug_root, exist_ok=True)
                with open(os.path.join(debug_root, f"{ts}_generation_prompt.txt"), "w", encoding="utf-8") as f:
                    f.write(f"GENERATION PROMPT REQUEST:\n{generationPromptRequest}\n\n")
                    f.write(f"GENERATION PROMPT AI RESPONSE:\n{response.content if response else 'No response'}\n\n")
                    f.write(f"GENERATION PROMPT FINAL:\n{result if result else 'None'}\n")
        except Exception:
            # Debug dump is best-effort; never let it break prompt building.
            pass

        return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."

    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions
        _debug(f"DEBUG: AI generation prompt failed: {str(e)}")
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}"
""" format_rules = { "xlsx": """ XLSX Format Rules: - Create tables with clear headers and organized data - Use appropriate column widths and formatting - Include summary information if relevant - Ensure data is properly structured for spreadsheet analysis """, "pdf": """ PDF Format Rules: - Create professional document layout - Use appropriate headings and sections - Include proper spacing and formatting - Ensure content is well-organized and readable """, "docx": """ DOCX Format Rules: - Create professional document layout - Use appropriate headings and sections - Include proper spacing and formatting - Ensure content is well-organized and readable """, "html": """ HTML Format Rules: - Create clean, semantic HTML structure - Use appropriate tags for content organization - Include proper styling classes - Ensure content is accessible and well-formatted """, "json": """ JSON Format Rules: - Create well-structured JSON data - Use appropriate nesting and organization - Include metadata and context information - Ensure data is properly formatted and valid """, "csv": """ CSV Format Rules: - Create clear, organized tabular data - Use appropriate headers and data types - Ensure proper CSV formatting - Include all relevant data in structured format """, "txt": """ TXT Format Rules: - Create clean, readable text format - Use appropriate spacing and organization - Include clear headings and sections - Ensure content is well-structured and easy to read """ } return format_rules.get(outputFormat.lower(), f""" {outputFormat.upper()} Format Rules: - Create well-structured content appropriate for {outputFormat} - Use appropriate formatting and organization - Ensure content is clear and professional - Include all relevant information in proper format """) async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str: """ Parse user prompt to extract the core extraction intent. 
""" if not aiService: return f"Extract content from the provided documents and create a {outputFormat} report." try: analysis_prompt = f""" Analyze this user request and extract the core extraction intent: User request: "{userPrompt}" Target format: {outputFormat} Extract the main intent and requirements for document processing. Focus on: 1. What content needs to be extracted 2. How it should be organized 3. Any specific requirements or preferences Respond with a clear, concise statement of the extraction intent. """ from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType request_options = AiCallOptions() request_options.operationType = OperationType.GENERAL request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options) response = await aiService.aiObjects.call(request) if response and response.content: return response.content.strip() else: return f"Extract content from the provided documents and create a {outputFormat} report." except Exception as e: services.utils.debugLogToFile(f"Extraction intent analysis failed: {str(e)}", "PROMPT_BUILDER") return f"Extract content from the provided documents and create a {outputFormat} report."