"""
Centralized prompt builder for document generation across formats.

Builds a robust prompt that:
- Accepts any user intent (no fixed structure assumptions)
- Injects format-specific guidelines from the selected renderer
- Adds a common policy section to always use real data from source docs
- Requires the AI to output a filename header that we can parse and use
"""
|
|
|
|
from typing import Protocol
|
|
|
|
|
|
class _RendererLike(Protocol):
    """Structural type for renderer objects accepted by the prompt builders.

    Any object exposing a compatible ``getExtractionPrompt`` method satisfies
    this protocol; no inheritance is required.
    """

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only the format-specific guidelines."""
        ...
|
|
|
|
|
|
async def buildExtractionPrompt(
    outputFormat: str,
    renderer: _RendererLike,
    userPrompt: str,
    title: str,
    aiService=None
) -> str:
    """
    Build the prompt that asks the AI to extract document content as JSON.

    The user's request is first reduced to an extraction intent by
    ``_parseExtractionIntent``. That intent is placed in front of a fixed
    instruction block that describes the expected JSON layout (a ``metadata``
    object plus a ``sections`` array of heading / table / bullet_list /
    paragraph entries) and demands a JSON-only answer.

    Args:
        outputFormat: Target output format; forwarded to ``_parseExtractionIntent``.
        renderer: Accepted for interface compatibility; not used by this function.
        userPrompt: Raw user request.
        title: Accepted for interface compatibility; not used by this function.
        aiService: Optional AI service, forwarded to ``_parseExtractionIntent``.

    Returns:
        The assembled extraction prompt.

    Side effects:
        Prints the extraction intent and, on a best-effort basis, writes the
        prompt and the intent to a timestamped file under ``./test-chat/ai``.
    """
    # Parse user prompt to separate extraction intent from generation format using AI
    extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService)

    # Generic block for JSON extraction
    genericIntro = f"""
{extractionIntent}

You are extracting structured content from documents and must respond with valid JSON only.

IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.

Extract the actual data from the source documents and structure it as JSON with this format:
{{
"metadata": {{
"title": "Document Title",
"version": "1.0"
}},
"sections": [
{{
"id": "section_1",
"type": "heading",
"data": {{
"level": 1,
"text": "Heading Text"
}}
}},
{{
"id": "section_2",
"type": "table",
"data": {{
"headers": ["Column1", "Column2"],
"rows": [["Data1", "Data2"], ["Data3", "Data4"]]
}}
}},
{{
"id": "section_3",
"type": "bullet_list",
"data": {{
"items": ["Item 1", "Item 2", "Item 3"]
}}
}},
{{
"id": "section_4",
"type": "paragraph",
"data": {{
"text": "Paragraph content here"
}}
}}
]
}}

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
""".strip()

    # Final assembly
    finalPrompt = genericIntro

    # Debug output
    print(f"🔍 EXTRACTION INTENT: {extractionIntent}")

    # Save full extraction prompt to debug file. This is best-effort: a failure
    # here must never prevent the prompt from being returned.
    try:
        import os
        # timezone.utc instead of datetime.UTC: the latter only exists on
        # Python 3.11+, where its absence would silently disable this dump.
        from datetime import datetime, timezone

        ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
        debug_root = "./test-chat/ai"
        os.makedirs(debug_root, exist_ok=True)
        with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f:
            f.write(f"EXTRACTION PROMPT:\n{finalPrompt}\n\n")
            f.write(f"EXTRACTION INTENT:\n{extractionIntent}\n")
    except Exception as dumpError:
        # Report instead of swallowing silently, but keep going.
        print(f"🔍 DEBUG: Could not write extraction prompt debug file: {dumpError}")

    return finalPrompt
|
|
|
|
|
|
async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None
) -> str:
    """
    Use AI to build the generation prompt based on user intent and format requirements.

    The AI is asked to write a generation prompt for ``outputFormat`` and to
    include the literal marker ``PLACEHOLDER_FOR_FORMAT_RULES``. The marker is
    then replaced with the text from ``_getFormatRules``. If the AI leaves the
    marker out, the format rules are appended so they are never lost.

    Args:
        outputFormat: Target output format named in the prompt.
        userPrompt: Raw user request.
        title: Document title named in the prompt.
        aiService: Optional AI service; must expose ``aiObjects.call(request)``.

    Returns:
        The AI-written generation prompt with format rules inserted, or a
        generic fallback prompt (which carries the raw user request) when no
        AI service is given, the AI returns nothing usable, or any error occurs.

    Side effects:
        Prints debug output and, on a best-effort basis, writes the request,
        the raw AI response and the final prompt to a timestamped file under
        ``./test-chat/ai``.
    """
    # One fallback for every degraded path. It keeps the raw user request so
    # that language and other requirements are preserved.
    fallbackPrompt = (
        f"Generate a comprehensive {outputFormat} document titled '{title}' "
        f"based on the extracted content. User requirements: {userPrompt}"
    )

    if not aiService:
        # Fallback if no AI service available
        return fallbackPrompt

    placeholder = "PLACEHOLDER_FOR_FORMAT_RULES"

    try:
        # Protect userPrompt from injection
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')

        # Debug output
        print(f"🔍 GENERATION PROMPT REQUEST: buildGenerationPrompt called with outputFormat='{outputFormat}', title='{title}'")

        # AI call to generate the appropriate generation prompt
        generationPromptRequest = f"""
Based on this user request, create a detailed generation prompt for creating a {outputFormat} document.

User request: "{safeUserPrompt}"
Document title: "{title}"
Output format: {outputFormat}

Create a generation prompt that:
1. Identifies what content is most important for the user
2. Specifies how to structure and organize the content
3. Includes any specific formatting or presentation requirements
4. Preserves any language requirements
5. Ensures the document meets the user's needs

IMPORTANT: Always generate content in STANDARDIZED JSON FORMAT. In your response, include the exact text "{placeholder}" where specific format rules will be inserted afterwards automatically.

CRITICAL: You MUST start your response with exactly "Generate a {outputFormat} document that:" - do NOT use "docx" or any other format. Use the exact format specified: {outputFormat}

Return only the generation prompt, starting with "Generate a {outputFormat} document that..."
"""

        # Call AI service to generate the prompt
        print("🔍 GENERATION PROMPT REQUEST: Calling AI for generation prompt...")

        # Import and set proper options for AI call
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL

        request = AiCallRequest(prompt=generationPromptRequest, context="", options=request_options)
        response = await aiService.aiObjects.call(request)
        rawResult = response.content if response else ""
        print(f"🔍 GENERATION PROMPT AI RESPONSE: '{rawResult}'")

        # A whitespace-only answer is as useless as an empty one.
        result = rawResult if rawResult and rawResult.strip() else ""

        if result:
            formatRules = _getFormatRules(outputFormat)
            if placeholder in result:
                # Replace the placeholder that the AI created with actual format rules
                result = result.replace(placeholder, formatRules)
            else:
                # The AI ignored the placeholder instruction; append the rules
                # so they still reach the generation step.
                result = f"{result.rstrip()}\n\n{formatRules}"

        # Debug output
        print(f"🔍 GENERATION PROMPT FINAL: {result if result else 'None'}")

        # Save full generation prompt and AI response to debug file. This is
        # best-effort: a failure here must not discard the computed prompt.
        try:
            import os
            # timezone.utc instead of datetime.UTC: the latter only exists on
            # Python 3.11+, where its absence would silently disable this dump.
            from datetime import datetime, timezone

            ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
            debug_root = "./test-chat/ai"
            os.makedirs(debug_root, exist_ok=True)
            with open(os.path.join(debug_root, f"{ts}_generation_prompt.txt"), "w", encoding="utf-8") as f:
                f.write(f"GENERATION PROMPT REQUEST:\n{generationPromptRequest}\n\n")
                f.write(f"GENERATION PROMPT AI RESPONSE:\n{response.content if response else 'No response'}\n\n")
                f.write(f"GENERATION PROMPT FINAL:\n{result if result else 'None'}\n")
        except Exception as dumpError:
            # Report instead of swallowing silently, but keep going.
            print(f"🔍 DEBUG: Could not write generation prompt debug file: {dumpError}")

        return result if result else fallbackPrompt

    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions
        print(f"🔍 DEBUG: AI generation prompt failed: {str(e)}")
        return fallbackPrompt
|
|
|
|
|
|
def _getFormatRules(outputFormat: str) -> str:
    """
    Return the rule text that is substituted into generation prompts.

    Every format shares the same standardized-JSON rules, so ``outputFormat``
    does not influence the result.
    """
    ruleLines = (
        "- Generate content in standardized JSON format following the document schema",
        "- Tables: Use JSON table format with headers and rows arrays",
        "- Lists: Use JSON list format with items array",
        "- Text: Use JSON paragraph format with text field",
        "- Headings: Use JSON heading format with level field",
        "- Structure: Follow the document JSON schema exactly",
    )
    # One rule per line, with no leading or trailing whitespace.
    return "\n".join(ruleLines)
|
|
|
|
|
|
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None) -> str:
    """
    Use AI to extract the core content intention from the user prompt.

    Focus on WHAT the user wants to extract, not HOW to format it.

    Args:
        userPrompt: Raw user request.
        outputFormat: Accepted for interface compatibility; not used by this function.
        aiService: Optional AI service; must expose ``aiObjects.call(request)``.

    Returns:
        The AI's description of the extraction intent, or a generic fallback
        sentence (which carries the raw user request) when no AI service is
        given, the AI returns nothing usable, or any error occurs.
    """
    # One fallback for every degraded path. It keeps the raw user request so
    # that language and other requirements are preserved.
    fallbackIntent = (
        "Extract all relevant content from the document according to the "
        f"user's requirements: {userPrompt}"
    )

    if not aiService:
        # Fallback if no AI service available
        return fallbackIntent

    try:
        # Protect userPrompt from injection by escaping quotes and newlines
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')

        # Simple AI call to extract the intention
        extractionPrompt = f"""
Extract the core content intention from this user request. Focus on WHAT raw data/content they want extracted.

User request: "{safeUserPrompt}"

Return only the content intention in a simple format like "Extract: [content description]"
Focus on extracting raw data, tables, lists, and factual content - NOT summaries or analysis.
If the user mentions a table, extract the actual table data with rows and columns.
If the user mentions a list, extract the actual list items.
IMPORTANT: Preserve any language requirements in your response.
Do not include formatting instructions, file types, or output methods.
"""

        # Call AI service to extract intention
        print("🔍 DEBUG: Calling AI for extraction intent...")

        # Import and set proper options for AI call
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL

        request = AiCallRequest(prompt=extractionPrompt, context="", options=request_options)
        response = await aiService.aiObjects.call(request)
        result = response.content if response else ""
        print(f"🔍 DEBUG: AI extraction intent result: '{result}'")

        # A whitespace-only answer is as useless as an empty one.
        if result and result.strip():
            return result
        return fallbackIntent

    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions
        print(f"🔍 DEBUG: AI extraction intent failed: {str(e)}")
        return fallbackIntent
|
|
|
|
|
|
|