537 lines
23 KiB
Python
537 lines
23 KiB
Python
"""
|
|
Centralized prompt builder for document generation across formats.
|
|
|
|
Builds a robust prompt that:
|
|
- Accepts any user intent (no fixed structure assumptions)
|
|
- Injects format-specific guidelines from the selected renderer
|
|
- Adds a common policy section to always use real data from source docs
|
|
- Requires the AI to output a filename header that we can parse and use
|
|
"""
|
|
|
|
import json
|
|
from typing import Protocol, Dict, Any
|
|
|
|
|
|
class _RendererLike(Protocol):
    """Structural type describing the renderer objects handed to this module.

    Any object that exposes a compatible ``getExtractionPrompt`` method
    satisfies this protocol; inheriting from it is not required.
    """

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only the format-specific guidelines."""
        ...
|
|
|
|
|
|
async def buildAdaptiveExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    promptAnalysis: Dict[str, Any],
    aiService=None,
    services=None
) -> str:
    """Build the extraction prompt selected by a prior prompt analysis.

    When ``promptAnalysis["is_multi_file"]`` is truthy, the prompt asks the
    model for a root-level ``"documents"`` array and embeds a hard-coded
    example structure. Otherwise the prompt embeds the schema returned by
    ``get_adaptive_json_schema(promptAnalysis)``.

    Args:
        outputFormat: Not used by this function; kept for interface parity.
        userPrompt: The user's request; placed verbatim at the top of the prompt.
        title: Not used by this function; kept for interface parity.
        promptAnalysis: Analysis dict; only the ``is_multi_file`` key is read
            here (the whole dict is forwarded to the schema helper).
        aiService: Not used by this function.
        services: Not used by this function.

    Returns:
        The assembled prompt with surrounding whitespace stripped.
    """
    # Guidance shared verbatim by the multi-file and the single-file prompt.
    content_guidelines = """Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents."""

    if promptAnalysis.get("is_multi_file", False):
        # Multi-file prompt: show a small concrete example instead of a schema.
        multi_file_example = {
            "metadata": {
                "title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE",
                "splitStrategy": "by_section",
            },
            "documents": [
                {
                    "id": "doc_1",
                    "title": "REPLACE_WITH_ACTUAL_SECTION_TITLE",
                    "filename": "REPLACE_WITH_ACTUAL_FILENAME",
                    "sections": [
                        {
                            "id": "section_1",
                            "content_type": "heading",
                            "elements": [
                                {
                                    "text": "REPLACE_WITH_ACTUAL_HEADING_TEXT",
                                    "level": 1,
                                }
                            ],
                            "order": 1,
                        },
                        {
                            "id": "section_2",
                            "content_type": "paragraph",
                            "elements": [
                                {
                                    "text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT",
                                }
                            ],
                            "order": 2,
                        },
                    ],
                }
            ],
        }

        return f"""
{userPrompt}

You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.

TASK: Extract the actual content from the document and organize it into separate sections, where each section will become a separate file.

REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Identify distinct sections in the document (by headings, topics, or logical breaks)
3. Create one JSON document entry for each section found
4. Extract the real content from each section (headings, paragraphs, lists, etc.)
5. Generate appropriate filenames for each section

CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array.

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(multi_file_example, indent=2)}

IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have:
- "id": unique identifier
- "title": section title from the document
- "filename": appropriate filename for the section
- "sections": array of content sections

DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level.

INSTRUCTIONS:
- Replace "REPLACE_WITH_ACTUAL_*" placeholders with real content from the document
- Use actual section titles, headings, and text from the document
- Create meaningful filenames based on section content
- Ensure each section contains the complete content for that part of the document
- Do not use generic placeholder text like "Section 1", "Section 2"
- Extract real headings, paragraphs, lists, and other content elements
- CRITICAL: Return JSON with "documents" array, not "sections" array

CONTEXT (Document Content):

{content_guidelines}
""".strip()

    # Single-file prompt: the schema is only needed (and only loaded) here.
    from .subJsonSchema import get_adaptive_json_schema
    json_schema = get_adaptive_json_schema(promptAnalysis)

    return f"""
{userPrompt}

You are extracting structured content from documents and must respond with valid JSON only.

IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.

Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(json_schema, indent=2)}

{content_guidelines}
""".strip()
|
|
|
|
async def buildGenericExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """Build an extraction prompt for either single-file or multi-file output.

    When ``aiService`` is given, the model is first asked which JSON structure
    fits the request, and the parsed answer is forwarded to
    ``buildAdaptiveExtractionPrompt``. If no service is given, the model
    returns nothing, or any step of that analysis fails, a fixed single-file
    prompt built from ``get_document_subJsonSchema()`` is returned instead.

    Args:
        outputFormat: Forwarded to ``buildAdaptiveExtractionPrompt``.
        userPrompt: The user's request; embedded in the analysis request and
            placed at the top of the resulting prompt.
        title: Forwarded to ``buildAdaptiveExtractionPrompt``.
        aiService: Optional service exposing ``aiObjects.call(request)``.
        services: Optional service hub; only ``services.utils.debugLogToFile``
            is used, and only to report an analysis failure.

    Returns:
        The extraction prompt. The fallback prompt is returned unstripped.
    """
    # Use AI to determine the best approach
    if aiService:
        try:
            analysis_prompt = f"""
Analyze this user request and determine the best JSON structure for document extraction.

User request: "{userPrompt}"

Respond with JSON only:
{{
"requires_multi_file": true/false,
"recommended_schema": "single_document|multi_document",
"split_approach": "description of how to organize content",
"file_naming": "suggested naming pattern"
}}

Consider the user's intent and the most logical way to organize the extracted content.
"""

            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL

            request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
            response = await aiService.aiObjects.call(request)

            if response and response.content:
                import re

                # The model may wrap the JSON in prose; keep the outermost {...}.
                result = response.content.strip()
                json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)

                analysis = json.loads(result)
                if not isinstance(analysis, dict):
                    raise ValueError("analysis response is not a JSON object")

                # The analysis prompt above asks for "requires_multi_file", but
                # buildAdaptiveExtractionPrompt branches on "is_multi_file".
                # Bridge the two so a multi-file verdict is actually honoured.
                analysis.setdefault(
                    "is_multi_file",
                    analysis.get("requires_multi_file") is True,
                )

                # Use analysis to build appropriate prompt
                return await buildAdaptiveExtractionPrompt(
                    outputFormat, userPrompt, title, analysis, aiService, services
                )
        except Exception as e:
            # services defaults to None; logging must never block the fallback.
            if services is not None:
                services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER")

    # Fallback to single-file prompt
    from .subJsonSchema import get_document_subJsonSchema
    json_schema = get_document_subJsonSchema()

    return f"""
{userPrompt}

You are extracting structured content from documents and must respond with valid JSON only.

CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.

Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(json_schema, indent=2)}

Requirements:
- Preserve all original data - do not summarize or interpret
- Use the exact JSON schema provided
- Maintain data integrity and structure

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.

DO NOT return a schema description - return actual extracted content in the JSON format shown above.
"""
|
|
|
|
async def buildExtractionPrompt(
    outputFormat: str,
    renderer: _RendererLike,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """Build the final JSON-only extraction prompt.

    The prompt consists of the extraction intent derived from ``userPrompt``
    (via ``_parseExtractionIntent``), followed by generic instructions that
    require a JSON-only answer matching ``get_document_subJsonSchema()``.

    Args:
        outputFormat: Forwarded to ``_parseExtractionIntent``.
        renderer: Not used by this function; kept for interface compatibility.
        userPrompt: The user's request to derive the extraction intent from.
        title: Not used by this function; kept for interface compatibility.
        aiService: Optional AI service, forwarded to ``_parseExtractionIntent``.
        services: Optional service hub. When given, ``services.utils`` is used
            for debug logging and, if ``APP_DEBUG_CHAT_WORKFLOW_ENABLED`` is
            set, the prompt is dumped to ``./test-chat/ai``.

    Returns:
        The assembled prompt with surrounding whitespace stripped.
    """
    # Parse user prompt to separate extraction intent from generation format using AI
    extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services)

    # Import JSON schema for structured output
    from .subJsonSchema import get_document_subJsonSchema
    jsonSchema = get_document_subJsonSchema()

    # Generic block for JSON extraction - use proper schema instead of hardcoded template
    finalPrompt = f"""
{extractionIntent}

You are extracting structured content from documents and must respond with valid JSON only.

CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.

Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(jsonSchema, indent=2)}

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.

DO NOT return a schema description - return actual extracted content in the JSON format shown above.
""".strip()

    # services defaults to None; all debug output is optional.
    if services is not None:
        services.utils.debugLogToFile("EXTRACTION INTENT: Processed", "PROMPT_BUILDER")

        # Save full extraction prompt to debug file - only if debug enabled
        try:
            debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
            if debug_enabled:
                import os
                from datetime import datetime, UTC
                ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
                debug_root = "./test-chat/ai"
                os.makedirs(debug_root, exist_ok=True)
                with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f:
                    f.write(f"EXTRACTION PROMPT:\n{finalPrompt}\n\n")
                    f.write(f"EXTRACTION INTENT:\n{extractionIntent}\n")
        except Exception as e:
            # Best effort: a failed debug dump must not affect the result,
            # but it should be visible in the debug log.
            services.utils.debugLogToFile(f"Extraction prompt debug dump failed: {str(e)}", "PROMPT_BUILDER")

    return finalPrompt
|
|
|
|
|
|
async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """Ask the AI service to write the generation prompt for a document.

    The model is told to include the literal marker
    ``PLACEHOLDER_FOR_FORMAT_RULES``; every occurrence in its answer is then
    replaced with the text from ``_getFormatRules(outputFormat)``.

    Args:
        outputFormat: Target format name, interpolated into the request.
        userPrompt: The user's request. Quotes are backslash-escaped and line
            breaks replaced by spaces before it is embedded in the request.
        title: Document title, interpolated into the request.
        aiService: Optional service exposing ``aiObjects.call(request)``.
        services: Optional service hub. When given, ``services.utils`` is used
            for debug logging and, if ``APP_DEBUG_CHAT_WORKFLOW_ENABLED`` is
            set, request and response are dumped to ``./test-chat/ai``.

    Returns:
        The AI-written prompt, or a fixed fallback sentence when no service is
        available, the answer is empty, or any step raises.
    """
    def _debug(message: str) -> None:
        # services defaults to None; logging must never raise because of that.
        if services is not None:
            services.utils.debugLogToFile(message, "PROMPT_BUILDER")

    if not aiService:
        # Fallback if no AI service available
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."

    try:
        # Protect userPrompt from injection
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')

        _debug(f"GENERATION PROMPT REQUEST: buildGenerationPrompt called with outputFormat='{outputFormat}', title='{title}'")

        # AI call to generate the appropriate generation prompt
        generationPromptRequest = f"""
Based on this user request, create a detailed generation prompt for creating a {outputFormat} document.

User request: "{safeUserPrompt}"
Document title: "{title}"
Output format: {outputFormat}

Create a generation prompt that:
1. Identifies what content is most important for the user
2. Specifies how to structure and organize the content
3. Includes any specific formatting or presentation requirements
4. Preserves any language requirements
5. Ensures the document meets the user's needs

IMPORTANT: Always generate content in STANDARDIZED JSON FORMAT. In your response, include the exact text "PLACEHOLDER_FOR_FORMAT_RULES" where specific format rules will be inserted afterwards automatically.

CRITICAL: You MUST start your response with exactly "Generate a {outputFormat} document that:" - do NOT use "docx" or any other format. Use the exact format specified: {outputFormat}

Return only the generation prompt, starting with "Generate a {outputFormat} document that..."
"""

        # Call AI service to generate the prompt
        _debug("GENERATION PROMPT REQUEST: Calling AI for generation prompt...")

        # Import and set proper options for AI call
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL

        request = AiCallRequest(prompt=generationPromptRequest, context="", options=request_options)
        response = await aiService.aiObjects.call(request)
        result = response.content if response else ""

        # Replace the placeholder that the AI created with actual format rules
        if result:
            formatRules = _getFormatRules(outputFormat)
            result = result.replace("PLACEHOLDER_FOR_FORMAT_RULES", formatRules)

        _debug("GENERATION PROMPT: Generated successfully")

        # Save full generation prompt and AI response to debug file - only if debug enabled
        if services is not None:
            try:
                debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
                if debug_enabled:
                    import os
                    from datetime import datetime, UTC
                    ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
                    debug_root = "./test-chat/ai"
                    os.makedirs(debug_root, exist_ok=True)
                    with open(os.path.join(debug_root, f"{ts}_generation_prompt.txt"), "w", encoding="utf-8") as f:
                        f.write(f"GENERATION PROMPT REQUEST:\n{generationPromptRequest}\n\n")
                        f.write(f"GENERATION PROMPT AI RESPONSE:\n{response.content if response else 'No response'}\n\n")
                        f.write(f"GENERATION PROMPT FINAL:\n{result if result else 'None'}\n")
            except Exception as e:
                # Best effort: a failed debug dump must not affect the result.
                _debug(f"Generation prompt debug dump failed: {str(e)}")

        return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}"

    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions
        _debug(f"DEBUG: AI generation prompt failed: {str(e)}")
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}"
|
|
|
|
|
|
def _getFormatRules(outputFormat: str) -> str:
    """Return the rule list inserted into AI-written generation prompts.

    The same rules are returned for every ``outputFormat``; the argument is
    accepted but not consulted.
    """
    rule_lines = (
        "- Generate content in standardized JSON format following the document schema",
        "- Tables: Use JSON table format with headers and rows arrays",
        "- Lists: Use JSON list format with items array",
        "- Text: Use JSON paragraph format with text field",
        "- Headings: Use JSON heading format with level field",
        "- Structure: Follow the document JSON schema exactly",
    )
    return "\n".join(rule_lines)
|
|
|
|
|
|
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
    """Derive a structured extraction intent from the user prompt via the AI service.

    The model is asked for a JSON object (language, normalized request, target
    structure, table details, multi-file guidance). If the answer contains a
    parseable JSON object, it is returned pretty-printed.

    Args:
        userPrompt: The user's request. Quotes are backslash-escaped and line
            breaks replaced by spaces before it is embedded in the request.
        outputFormat: Not used by this function.
        aiService: Optional service exposing ``aiObjects.call(request)``.
        services: Optional service hub; only ``services.utils.debugLogToFile``
            is used, for debug logging.

    Returns:
        Pretty-printed JSON on success; a fixed generic sentence when no
        service is given; ``"Extract: <escaped prompt>"`` when the answer has
        no usable JSON; ``"Extract: <raw prompt>"`` when any step raises.
    """
    def _debug(message: str) -> None:
        # services defaults to None; logging must never raise because of that.
        if services is not None:
            services.utils.debugLogToFile(message, "PROMPT_BUILDER")

    if not aiService:
        # Fallback if no AI service available
        return "Extract all relevant content from the document according to the user's requirements"

    try:
        # Protect userPrompt from injection by escaping quotes and newlines
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')

        # Rich analysis to derive a complete extraction intent and structure guidance
        extractionPrompt = f"""
Analyze the user's request and produce a RICH extraction intent. Return ONLY JSON.

Goals:
- Detect language and normalize the request into a full, explicit instruction (no summary; preserve all constraints and details).
- Decide if structured data is required; if so, define the target structure precisely (headers, order, formats, row strategy).
- Identify if multi-file output is appropriate and how to split/files name.

User request: "{safeUserPrompt}"

Return JSON in this exact shape:
{{
"detectedLanguage": "de|en|fr|it|...",
"normalizedRequest": "Full explicit instruction in detected language",
"requiresStructuredData": true|false,
"targetStructure": "table|list|mixed|unstructured",
"table": {{
"headers": ["Header1", "Header2", "..."],
"headerOrderStrict": true|false,
"rowStrategy": "one_row_per_document|one_row_per_entity|one_row_per_vat_rate|custom",
"formats": {{
"dateFormat": "DD.MM.YYYY|YYYY-MM-DD|...",
"amountDecimals": 2,
"currencyFormat": "code|symbol",
"idMasking": "none|last4|custom"
}}
}},
"multiFile": true|false,
"fileSplitStrategy": "single|per_entity|by_section|by_criteria|custom",
"fileNamingPattern": "suggested pattern for filenames",
"constraints": ["List of critical constraints to enforce"],
"reasoning": "Brief justification (one sentence)"
}}

Rules:
- Preserve user terminology and language in normalizedRequest.
- If the user listed columns/fields, copy them exactly into table.headers and set headerOrderStrict=true.
- If the user implies separate rows for rates/entities, set an appropriate rowStrategy (e.g., one_row_per_vat_rate).
- If no structure is required, set requiresStructuredData=false and targetStructure="unstructured".
"""

        # Call AI service to extract intention
        _debug("DEBUG: Calling AI for extraction intent...")

        # Import and set proper options for AI call
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL

        request = AiCallRequest(prompt=extractionPrompt, context="", options=request_options)
        response = await aiService.aiObjects.call(request)
        result = response.content if response else ""
        _debug("DEBUG: Extraction intent processed")

        # Try to extract and pretty print JSON
        if result:
            import re
            # Greedy match: first "{" through last "}" so prose around the
            # JSON object is dropped.
            match = re.search(r'\{[\s\S]*\}', result)
            if match:
                try:
                    obj = json.loads(match.group(0))
                    return json.dumps(obj, ensure_ascii=False, indent=2)
                except Exception:
                    # Unparseable answer: use the simple format below.
                    pass

        # Fallback to previous simple format
        return f"Extract: {safeUserPrompt}"

    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions
        _debug(f"DEBUG: AI extraction intent failed: {str(e)}")
        return f"Extract: {userPrompt}"
|
|
|
|
|
|
|