# gateway/modules/services/serviceGeneration/subPromptBuilder.py
# 537 lines · 23 KiB · Python

"""
Centralized prompt builder for document generation across formats.
Builds a robust prompt that:
- Accepts any user intent (no fixed structure assumptions)
- Injects format-specific guidelines from the selected renderer
- Adds a common policy section to always use real data from source docs
- Requires the AI to output a filename header that we can parse and use
"""
import json
from typing import Protocol, Dict, Any
class _RendererLike(Protocol):
def getExtractionPrompt(self, user_prompt: str, title: str) -> str: # returns only format-specific guidelines
...
async def buildAdaptiveExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    promptAnalysis: Dict[str, Any],
    aiService=None,
    services=None
) -> str:
    """Build an adaptive extraction prompt based on a prior AI prompt analysis.

    Chooses between a multi-document and a single-document prompt layout based
    on ``promptAnalysis["is_multi_file"]``.

    Args:
        outputFormat: Target output format (unused here; kept for interface parity).
        userPrompt: Raw user request; prepended verbatim to the generated prompt.
        title: Document title (unused here; kept for interface parity).
        promptAnalysis: Analysis dict; only the "is_multi_file" flag is consulted.
        aiService: Unused; kept for signature compatibility with sibling builders.
        services: Unused; kept for signature compatibility with sibling builders.

    Returns:
        The assembled extraction prompt (stripped of surrounding whitespace).
    """
    if promptAnalysis.get("is_multi_file", False):
        # Multi-file prompt - use simple example format like single-file
        multi_file_example = {
            "metadata": {
                "title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE",
                "splitStrategy": "by_section"
            },
            "documents": [
                {
                    "id": "doc_1",
                    "title": "REPLACE_WITH_ACTUAL_SECTION_TITLE",
                    "filename": "REPLACE_WITH_ACTUAL_FILENAME",
                    "sections": [
                        {
                            "id": "section_1",
                            "content_type": "heading",
                            "elements": [
                                {
                                    "text": "REPLACE_WITH_ACTUAL_HEADING_TEXT",
                                    "level": 1
                                }
                            ],
                            "order": 1
                        },
                        {
                            "id": "section_2",
                            "content_type": "paragraph",
                            "elements": [
                                {
                                    "text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT"
                                }
                            ],
                            "order": 2
                        }
                    ]
                }
            ]
        }
        adaptive_prompt = f"""
{userPrompt}
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
TASK: Extract the actual content from the document and organize it into separate sections, where each section will become a separate file.
REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Identify distinct sections in the document (by headings, topics, or logical breaks)
3. Create one JSON document entry for each section found
4. Extract the real content from each section (headings, paragraphs, lists, etc.)
5. Generate appropriate filenames for each section
CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array.
OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(multi_file_example, indent=2)}
IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have:
- "id": unique identifier
- "title": section title from the document
- "filename": appropriate filename for the section
- "sections": array of content sections
DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level.
INSTRUCTIONS:
- Replace "REPLACE_WITH_ACTUAL_*" placeholders with real content from the document
- Use actual section titles, headings, and text from the document
- Create meaningful filenames based on section content
- Ensure each section contains the complete content for that part of the document
- Do not use generic placeholder text like "Section 1", "Section 2"
- Extract real headings, paragraphs, lists, and other content elements
- CRITICAL: Return JSON with "documents" array, not "sections" array
CONTEXT (Document Content):
Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()
    else:
        # Single-file prompt - use original style. The schema import is deferred
        # into this branch because only the single-document prompt embeds it.
        from .subJsonSchema import get_adaptive_json_schema
        json_schema = get_adaptive_json_schema(promptAnalysis)
        adaptive_prompt = f"""
{userPrompt}
You are extracting structured content from documents and must respond with valid JSON only.
IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.
Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(json_schema, indent=2)}
Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()
    return adaptive_prompt
async def buildGenericExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """Build a generic extraction prompt that works for both single and multi-file.

    When an AI service is available, it is first asked how the extracted content
    should be organized; that analysis is then delegated to
    buildAdaptiveExtractionPrompt. On any failure (or without an AI service)
    a single-file extraction prompt is returned instead.

    Args:
        outputFormat: Target output format, forwarded to the adaptive builder.
        userPrompt: Raw user request; embedded in the analysis and final prompts.
        title: Document title, forwarded to the adaptive builder.
        aiService: Optional AI service used for the organization analysis.
        services: Optional service container used only for debug logging.

    Returns:
        The extraction prompt string.
    """
    if aiService:
        try:
            analysis_prompt = f"""
Analyze this user request and determine the best JSON structure for document extraction.
User request: "{userPrompt}"
Respond with JSON only:
{{
"requires_multi_file": true/false,
"recommended_schema": "single_document|multi_document",
"split_approach": "description of how to organize content",
"file_naming": "suggested naming pattern"
}}
Consider the user's intent and the most logical way to organize the extracted content.
"""
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL
            request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
            response = await aiService.aiObjects.call(request)
            if response and response.content:
                import re
                result = response.content.strip()
                # Tolerate models that wrap the JSON in extra text: keep only
                # the outermost brace-delimited span.
                json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)
                analysis = json.loads(result)
                # Use analysis to build appropriate prompt
                return await buildAdaptiveExtractionPrompt(
                    outputFormat, userPrompt, title, analysis, aiService, services
                )
        except Exception as e:
            # Guard: services may be None; dereferencing it here would raise
            # and mask the original analysis failure.
            if services:
                services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER")
    # Fallback to single-file prompt
    from .subJsonSchema import get_document_subJsonSchema
    json_schema = get_document_subJsonSchema()
    # .strip() for consistency with the other prompt builders in this module.
    return f"""
{userPrompt}
You are extracting structured content from documents and must respond with valid JSON only.
CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.
Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(json_schema, indent=2)}
Requirements:
- Preserve all original data - do not summarize or interpret
- Use the exact JSON schema provided
- Maintain data integrity and structure
Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
DO NOT return a schema description - return actual extracted content in the JSON format shown above.
""".strip()
async def buildExtractionPrompt(
    outputFormat: str,
    renderer: _RendererLike,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """
    Build the final extraction prompt by combining:
    - The parsed extraction intent derived from the user prompt (via AI)
    - Generic cross-format JSON extraction instructions and the document schema

    The AI consuming this prompt must answer with valid JSON only, matching the
    schema from get_document_subJsonSchema().

    NOTE(review): despite the parameters, renderer.getExtractionPrompt() is
    never called here, and outputFormat/title are unused; they are kept for
    interface compatibility with callers. The previous docstring described a
    "FILENAME:" header contract that this code does not implement.
    """
    # Parse user prompt to separate extraction intent from generation format using AI
    extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services)
    # Import JSON schema for structured output
    from .subJsonSchema import get_document_subJsonSchema
    jsonSchema = get_document_subJsonSchema()
    # Generic block for JSON extraction - use proper schema instead of hardcoded template
    genericIntro = f"""
{extractionIntent}
You are extracting structured content from documents and must respond with valid JSON only.
CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.
Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(jsonSchema, indent=2)}
Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
DO NOT return a schema description - return actual extracted content in the JSON format shown above.
""".strip()
    # Final assembly
    finalPrompt = genericIntro
    # Debug output - guard against services being None (previously crashed here)
    if services:
        services.utils.debugLogToFile("EXTRACTION INTENT: Processed", "PROMPT_BUILDER")
    # Save full extraction prompt to debug file - only if debug enabled
    try:
        debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
        if debug_enabled:
            import os
            from datetime import datetime, UTC
            ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            debug_root = "./test-chat/ai"
            os.makedirs(debug_root, exist_ok=True)
            with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f:
                f.write(f"EXTRACTION PROMPT:\n{finalPrompt}\n\n")
                f.write(f"EXTRACTION INTENT:\n{extractionIntent}\n")
    except Exception:
        # Best-effort debug dump; never let diagnostics break prompt building.
        pass
    return finalPrompt
async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """
    Use AI to build the generation prompt based on user intent and format requirements.
    Focus on what's important for the user and how to structure the content.

    Args:
        outputFormat: Target document format embedded in the generated prompt.
        userPrompt: Raw user request; quoted into the AI meta-prompt.
        title: Document title embedded in the prompt.
        aiService: Optional AI service; without it a static fallback is returned.
        services: Optional service container used only for debug logging.

    Returns:
        A generation prompt string (AI-produced, or a static fallback on error).
    """
    if not aiService:
        # Fallback if no AI service available
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."
    try:
        # Protect userPrompt from injection: neutralize quotes and flatten newlines
        # before embedding it inside the quoted meta-prompt below.
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
        # Debug output - services may legitimately be None, so guard every use.
        # Previously an unguarded call here raised AttributeError, and the
        # except handler below raised again, defeating the fallback entirely.
        if services:
            services.utils.debugLogToFile(f"GENERATION PROMPT REQUEST: buildGenerationPrompt called with outputFormat='{outputFormat}', title='{title}'", "PROMPT_BUILDER")
        # AI call to generate the appropriate generation prompt
        generationPromptRequest = f"""
Based on this user request, create a detailed generation prompt for creating a {outputFormat} document.
User request: "{safeUserPrompt}"
Document title: "{title}"
Output format: {outputFormat}
Create a generation prompt that:
1. Identifies what content is most important for the user
2. Specifies how to structure and organize the content
3. Includes any specific formatting or presentation requirements
4. Preserves any language requirements
5. Ensures the document meets the user's needs
IMPORTANT: Always generate content in STANDARDIZED JSON FORMAT. In your response, include the exact text "PLACEHOLDER_FOR_FORMAT_RULES" where specific format rules will be inserted afterwards automatically.
CRITICAL: You MUST start your response with exactly "Generate a {outputFormat} document that:" - do NOT use "docx" or any other format. Use the exact format specified: {outputFormat}
Return only the generation prompt, starting with "Generate a {outputFormat} document that..."
"""
        # Call AI service to generate the prompt
        if services:
            services.utils.debugLogToFile("GENERATION PROMPT REQUEST: Calling AI for generation prompt...", "PROMPT_BUILDER")
        # Import and set proper options for AI call
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL
        request = AiCallRequest(prompt=generationPromptRequest, context="", options=request_options)
        response = await aiService.aiObjects.call(request)
        result = response.content if response else ""
        # Replace the placeholder that the AI created with actual format rules
        if result:
            formatRules = _getFormatRules(outputFormat)
            result = result.replace("PLACEHOLDER_FOR_FORMAT_RULES", formatRules)
        # Debug output
        if services:
            services.utils.debugLogToFile("GENERATION PROMPT: Generated successfully", "PROMPT_BUILDER")
        # Save full generation prompt and AI response to debug file - only if debug enabled
        try:
            debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
            if debug_enabled:
                import os
                from datetime import datetime, UTC
                ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
                debug_root = "./test-chat/ai"
                os.makedirs(debug_root, exist_ok=True)
                with open(os.path.join(debug_root, f"{ts}_generation_prompt.txt"), "w", encoding="utf-8") as f:
                    f.write(f"GENERATION PROMPT REQUEST:\n{generationPromptRequest}\n\n")
                    f.write(f"GENERATION PROMPT AI RESPONSE:\n{response.content if response else 'No response'}\n\n")
                    f.write(f"GENERATION PROMPT FINAL:\n{result if result else 'None'}\n")
        except Exception:
            # Best-effort debug dump; never let diagnostics break prompt building.
            pass
        return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}"
    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions
        if services:
            services.utils.debugLogToFile(f"DEBUG: AI generation prompt failed: {str(e)}", "PROMPT_BUILDER")
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}"
def _getFormatRules(outputFormat: str) -> str:
"""
Get format-specific rules for JSON-based generation.
Since we now use standardized JSON, all formats follow the same rules.
"""
return """
- Generate content in standardized JSON format following the document schema
- Tables: Use JSON table format with headers and rows arrays
- Lists: Use JSON list format with items array
- Text: Use JSON paragraph format with text field
- Headings: Use JSON heading format with level field
- Structure: Follow the document JSON schema exactly
""".strip()
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
"""
Use AI to extract a rich, structured extraction intent from the user prompt.
Include language, normalization, structure needs, headers, formats, row strategy, and multi-file guidance.
"""
if not aiService:
# Fallback if no AI service available
return "Extract all relevant content from the document according to the user's requirements"
try:
# Protect userPrompt from injection by escaping quotes and newlines
safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
# Rich analysis to derive a complete extraction intent and structure guidance
extractionPrompt = f"""
Analyze the user's request and produce a RICH extraction intent. Return ONLY JSON.
Goals:
- Detect language and normalize the request into a full, explicit instruction (no summary; preserve all constraints and details).
- Decide if structured data is required; if so, define the target structure precisely (headers, order, formats, row strategy).
- Identify if multi-file output is appropriate and how to split/files name.
User request: "{safeUserPrompt}"
Return JSON in this exact shape:
{{
"detectedLanguage": "de|en|fr|it|...",
"normalizedRequest": "Full explicit instruction in detected language",
"requiresStructuredData": true|false,
"targetStructure": "table|list|mixed|unstructured",
"table": {{
"headers": ["Header1", "Header2", "..."],
"headerOrderStrict": true|false,
"rowStrategy": "one_row_per_document|one_row_per_entity|one_row_per_vat_rate|custom",
"formats": {{
"dateFormat": "DD.MM.YYYY|YYYY-MM-DD|...",
"amountDecimals": 2,
"currencyFormat": "code|symbol",
"idMasking": "none|last4|custom"
}}
}},
"multiFile": true|false,
"fileSplitStrategy": "single|per_entity|by_section|by_criteria|custom",
"fileNamingPattern": "suggested pattern for filenames",
"constraints": ["List of critical constraints to enforce"],
"reasoning": "Brief justification (one sentence)"
}}
Rules:
- Preserve user terminology and language in normalizedRequest.
- If the user listed columns/fields, copy them exactly into table.headers and set headerOrderStrict=true.
- If the user implies separate rows for rates/entities, set an appropriate rowStrategy (e.g., one_row_per_vat_rate).
- If no structure is required, set requiresStructuredData=false and targetStructure="unstructured".
"""
# Call AI service to extract intention
services.utils.debugLogToFile("DEBUG: Calling AI for extraction intent...", "PROMPT_BUILDER")
# Import and set proper options for AI call
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
request_options = AiCallOptions()
request_options.operationType = OperationType.GENERAL
request = AiCallRequest(prompt=extractionPrompt, context="", options=request_options)
response = await aiService.aiObjects.call(request)
result = response.content if response else ""
services.utils.debugLogToFile(f"DEBUG: Extraction intent processed", "PROMPT_BUILDER")
# Try to extract and pretty print JSON
if result:
import re, json as _json
match = re.search(r'\{[\s\S]*\}', result)
if match:
try:
obj = _json.loads(match.group(0))
return _json.dumps(obj, ensure_ascii=False, indent=2)
except Exception:
pass
# Fallback to previous simple format
return f"Extract: {safeUserPrompt}"
except Exception as e:
# Fallback on any error - preserve user prompt for language instructions
services.utils.debugLogToFile(f"DEBUG: AI extraction intent failed: {str(e)}", "PROMPT_BUILDER")
return f"Extract: {userPrompt}"