614 lines
22 KiB
Python
614 lines
22 KiB
Python
"""
|
|
Prompt builder for AI document generation and extraction.
|
|
This module builds prompts for AI services to extract and generate documents.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from typing import Dict, Any, Optional, List, TYPE_CHECKING
|
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
|
|
|
# Type hint for renderer parameter
|
|
if TYPE_CHECKING:
|
|
from .renderers.rendererBaseTemplate import BaseRenderer
|
|
_RendererLike = BaseRenderer
|
|
else:
|
|
_RendererLike = Any
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
async def buildAdaptiveExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    promptAnalysis: Dict[str, Any],
    aiService=None,
    services=None
) -> str:
    """
    Build the extraction prompt in the unified multi-document JSON format.

    The prompt is made of the user prompt, a fixed set of extraction
    instructions and a JSON example whose root holds a "documents" array.
    A single-document request is expressed as that array with one entry.

    Args:
        outputFormat: Accepted for signature compatibility; not used here.
        userPrompt: User request, placed at the very top of the prompt.
        title: Accepted for signature compatibility; not used here.
        promptAnalysis: Accepted for signature compatibility; not used here.
        aiService: Accepted for signature compatibility; not used here.
        services: Optional service container. When it exposes
            ``ai.sanitizePromptContent``, that callable is applied to
            ``userPrompt`` with the content type ``'userinput'``; otherwise
            the prompt is embedded as given.

    Returns:
        The complete prompt text with surrounding whitespace stripped.
    """
    # Multi-file example data instead of schema
    # NOTE(review): the example filename is always ".xlsx", whatever
    # outputFormat is - confirm whether that is intended.
    multi_file_example = {
        "metadata": {
            "title": "Multi-Document Example",
            "splitStrategy": "by_section",
            "source_documents": ["doc_001"],
            "extraction_method": "ai_extraction"
        },
        "documents": [
            {
                "id": "doc_section_1",
                "title": "Section 1 Title",
                "filename": "section_1.xlsx",
                "sections": [
                    {
                        "id": "section_1",
                        "content_type": "heading",
                        "elements": [
                            {
                                "level": 1,
                                "text": "1. SECTION TITLE"
                            }
                        ],
                        "order": 1
                    },
                    {
                        "id": "section_2",
                        "content_type": "paragraph",
                        "elements": [
                            {
                                "text": "This is the actual content that should be extracted from the document."
                            }
                        ],
                        "order": 2
                    },
                    {
                        "id": "section_3",
                        "content_type": "table",
                        "elements": [
                            {
                                "headers": ["Column 1", "Column 2"],
                                "rows": [["Value 1", "Value 2"]]
                            }
                        ],
                        "order": 3
                    }
                ]
            }
        ]
    }

    # `services` defaults to None, so the sanitizer must be looked up
    # defensively; without one the prompt is used unchanged.
    sanitizer = getattr(getattr(services, "ai", None), "sanitizePromptContent", None)
    if callable(sanitizer):
        safe_prompt = sanitizer(userPrompt, 'userinput')
    else:
        safe_prompt = userPrompt

    # UNIFIED APPROACH: Always use multi-document format (single doc = multi with n=1)
    adaptive_prompt = f"""
{safe_prompt}

You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.

TASK: Extract the actual content from the document and organize it into documents. For single documents, create one document entry. For multi-document requests, create multiple document entries.

REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Identify distinct sections in the document (by headings, topics, or logical breaks)
3. Create one or more JSON document entries based on the content structure
4. Extract the real content from each section (headings, paragraphs, lists, etc.)
5. Generate appropriate filenames for each document

CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array.

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(multi_file_example, indent=2)}

IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have:
- "id": unique identifier
- "title": document title
- "filename": appropriate filename for the document
- "sections": array of content sections

DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level.

INSTRUCTIONS:
- For single document requests: Create one document with all content in its sections
- For multi-document requests: Create multiple documents, each with relevant sections
- Use actual section titles, headings, and text from the document
- Create meaningful filenames based on content
- Ensure each section contains the complete content for that part
- Do not use generic placeholder text like "Section 1", "Section 2"
- Extract real headings, paragraphs, lists, and other content elements
- CRITICAL: Return JSON with "documents" array, not "sections" array

CONTEXT (Document Content):

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()

    return adaptive_prompt
|
|
|
|
async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """
    Build a prompt for a document request, preferring an AI-guided variant.

    NOTE: this module defines ``buildGenerationPrompt`` a second time further
    down; at import time that later definition replaces this one.

    When ``aiService`` is given, it is first asked to analyze the request.
    If it answers with parseable JSON, the result of
    ``buildAdaptiveExtractionPrompt`` is returned. In every other case
    (no service, empty answer, any exception) the static JSON-generation
    template is returned.

    Args:
        outputFormat: Target format; used in the template as
            ``TARGET FORMAT`` and as the example filename extension.
        userPrompt: The user's request, embedded verbatim.
        title: Document title, embedded verbatim.
        aiService: Optional AI service exposing ``aiObjects.call``.
        services: Optional service container used for debug logging via
            ``utils.debugLogToFile``.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    import re

    # Use AI to determine the best approach
    if aiService:
        try:
            analysis_prompt = f"""
Analyze this user request and determine the best JSON structure for document extraction.

User request: "{userPrompt}"

Respond with JSON only:
{{
"requires_multi_file": true/false,
"recommended_schema": "single_document|multi_document",
"split_approach": "description of how to organize content",
"file_naming": "suggested naming pattern"
}}

Consider the user's intent and the most logical way to organize the extracted content.
"""

            # AiCallRequest / AiCallOptions / OperationType come from the
            # module-level import; no need to re-import them here.
            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL

            request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
            response = await aiService.aiObjects.call(request)

            if response and response.content:
                result = response.content.strip()
                # The model may wrap the JSON in prose; keep only the span
                # from the first "{" to the last "}".
                json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)

                analysis = json.loads(result)

                # Use analysis to build appropriate prompt
                return await buildAdaptiveExtractionPrompt(
                    outputFormat, userPrompt, title, analysis, aiService, services
                )
        except Exception as e:
            # The analysis is best-effort: report the failure and fall through
            # to the static template. `services` may be None, so the report
            # itself must not raise.
            message = f"Generic prompt analysis failed: {e}"
            if services:
                services.utils.debugLogToFile(message, "PROMPT_BUILDER")
            else:
                logging.getLogger(__name__).warning(message)

    # Always use the proper generation prompt template with LOOP_INSTRUCTION
    result = f"""You are an AI assistant that generates structured JSON content for document creation.

USER REQUEST: "{userPrompt}"
DOCUMENT TITLE: "{title}"
TARGET FORMAT: {outputFormat}

TASK: Generate JSON content that fulfills the user's request.

CRITICAL: You MUST return ONLY valid JSON in this exact structure:
{{
"metadata": {{
"title": "{title}",
"splitStrategy": "single_document",
"source_documents": [],
"extraction_method": "ai_generation"
}},
"documents": [
{{
"id": "doc_1",
"title": "{title}",
"filename": "document.{outputFormat}",
"sections": [
{{
"id": "section_1",
"content_type": "heading",
"elements": [
{{
"level": 1,
"text": "1. SECTION TITLE"
}}
],
"order": 1
}},
{{
"id": "section_2",
"content_type": "paragraph",
"elements": [
{{
"text": "This is the actual content that should be generated."
}}
],
"order": 2
}}
]
}}
]
}}

IMPORTANT:
- Return ONLY the JSON structure above
- Do NOT include any text before or after the JSON
- Fill in the actual content based on the user request: {userPrompt}
- If the content is too large, you can split it into multiple sections
- Each section should have a unique id and appropriate content_type
- LOOP_INSTRUCTION
"""

    # Debug output
    if services:
        services.utils.debugLogToFile("GENERATION PROMPT: Generated successfully", "PROMPT_BUILDER")

    return result.strip()
|
|
|
|
async def buildExtractionPrompt(
    outputFormat: str,
    renderer: _RendererLike,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """
    Build the final extraction prompt by combining:
    - The extraction intent derived from the user prompt (via
      ``_parseExtractionIntent``)
    - Generic extraction instructions with a JSON example showing heading,
      paragraph and table sections
    - Format-specific guidelines provided by the renderer, when it has a
      ``getExtractionGuidelines`` method

    When the ``APP_DEBUG_CHAT_WORKFLOW_ENABLED`` setting is truthy, the
    finished prompt is also written to a timestamped file in the ``debug``
    folder of the configured log directory (best effort).

    Args:
        outputFormat: Target format, forwarded to the intent parser.
        renderer: Renderer that may supply extraction guidelines.
        userPrompt: The user's request.
        title: Accepted for signature compatibility; not used here.
        aiService: Optional AI service, forwarded to the intent parser.
        services: Optional service container; used for the debug setting
            lookup and forwarded to the intent parser.

    Returns:
        The combined prompt with surrounding whitespace stripped.
    """
    # Parse user prompt to separate extraction intent from generation format using AI
    extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services)

    # Generic block for JSON extraction - use mixed example data showing different content types
    example_data = {
        "metadata": {
            "title": "Example Document",
            "author": "AI Assistant",
            "source_documents": ["document_001"],
            "extraction_method": "ai_extraction"
        },
        "sections": [
            {
                "id": "section_001",
                "content_type": "heading",
                "elements": [
                    {
                        "level": 1,
                        "text": "1. INTRODUCTION"
                    }
                ],
                "order": 1,
                "metadata": {}
            },
            {
                "id": "section_002",
                "content_type": "paragraph",
                "elements": [
                    {
                        "text": "This is a sample paragraph with actual content that should be extracted from the document."
                    }
                ],
                "order": 2,
                "metadata": {}
            },
            {
                "id": "section_003",
                "content_type": "table",
                "elements": [
                    {
                        "headers": ["Column 1", "Column 2", "Column 3"],
                        "rows": [
                            ["Value 1", "Value 2", "Value 3"],
                            ["Value 4", "Value 5", "Value 6"]
                        ]
                    }
                ],
                "order": 3,
                "metadata": {}
            }
        ],
        "summary": "",
        "tags": []
    }

    genericIntro = f"""
{extractionIntent}

You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.

TASK: Extract the actual content from the document and organize it into structured sections.

REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Extract all content and organize it into logical sections
3. Create structured JSON with sections containing the extracted content
4. Preserve the original structure and data

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(example_data, indent=2)}

Requirements:
- Preserve all original data - do not summarize or interpret
- Use the exact JSON format shown above
- Maintain data integrity and structure

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.

DO NOT return a schema description - return actual extracted content in the JSON format shown above.
"""

    # Get format-specific guidelines from renderer
    formatGuidelines = ""
    try:
        if hasattr(renderer, 'getExtractionGuidelines'):
            # A renderer returning None must not put the text "None" into the prompt.
            formatGuidelines = renderer.getExtractionGuidelines() or ""
    except Exception:
        # Guidelines are optional: continue without them, but leave a trace.
        logging.getLogger(__name__).warning(
            "Renderer extraction guidelines unavailable; continuing without them",
            exc_info=True,
        )

    # Combine all parts
    finalPrompt = f"{genericIntro}\n\n{formatGuidelines}".strip()

    _saveExtractionPromptDebug(finalPrompt, services)

    return finalPrompt


def _saveExtractionPromptDebug(finalPrompt: str, services) -> None:
    """Write the prompt to a timestamped debug file when debug is enabled; never raises."""
    if services is None:
        return
    try:
        if not services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False):
            return

        import os
        from datetime import datetime, timezone
        # Use configured log directory instead of hardcoded test-chat
        from modules.shared.configuration import APP_CONFIG

        ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
        logDir = APP_CONFIG.get("APP_LOGGING_LOG_DIR", "./")
        if not os.path.isabs(logDir):
            # Relative log directories are resolved against the directory
            # three levels above this file.
            gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            logDir = os.path.join(gatewayDir, logDir)

        debug_root = os.path.join(logDir, 'debug')
        os.makedirs(debug_root, exist_ok=True)
        with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f:
            f.write(finalPrompt)
    except Exception:
        # The dump is a debugging aid only; a failure must not affect the caller.
        logging.getLogger(__name__).debug(
            "Could not write extraction prompt debug file", exc_info=True
        )
|
|
|
|
|
|
async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """
    Build the static JSON-generation prompt for a document request.

    NOTE: a function with the same name is defined earlier in this module;
    being the later definition, this one is the one bound at import time.

    No AI call is made here. ``aiService`` only acts as a switch: when it is
    falsy, a one-sentence fallback prompt is returned instead of the template.

    Args:
        outputFormat: Target format; used as ``TARGET FORMAT`` and as the
            example filename extension.
        userPrompt: The user's request. Quotes are backslash-escaped and line
            breaks replaced by spaces before it is embedded.
        title: Document title. Escaped as a JSON string where it appears
            inside the JSON example.
        aiService: Presence enables the full template (see above).
        services: Optional service container used for debug logging via
            ``utils.debugLogToFile``; logging is skipped when it is None.

    Returns:
        The prompt text with surrounding whitespace stripped, or a
        one-sentence fallback prompt when ``aiService`` is missing or
        building the template fails.
    """
    if not aiService:
        # Fallback if no AI service available
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."

    def debugLog(message: str) -> None:
        # `services` defaults to None; debug logging must never break prompt building.
        if services is not None:
            services.utils.debugLogToFile(message, "PROMPT_BUILDER")

    try:
        # Protect userPrompt from injection
        safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')

        # Escape the title for use inside the JSON example so that quotes or
        # backslashes in it cannot turn the example into invalid JSON.
        jsonTitle = json.dumps(str(title), ensure_ascii=False)[1:-1]

        # Debug output
        debugLog(f"GENERATION PROMPT REQUEST: buildGenerationPrompt called with outputFormat='{outputFormat}', title='{title}'")

        # Return static generation prompt template instead of calling AI
        debugLog("GENERATION PROMPT REQUEST: Using static template instead of AI call")

        result = f"""You are an AI assistant that generates structured JSON content for document creation.

USER REQUEST: "{safeUserPrompt}"
DOCUMENT TITLE: "{title}"
TARGET FORMAT: {outputFormat}

TASK: Generate JSON content that fulfills the user's request.

CRITICAL: You MUST return ONLY valid JSON in this exact structure:
{{
"metadata": {{
"title": "{jsonTitle}",
"splitStrategy": "single_document",
"source_documents": [],
"extraction_method": "ai_generation"
}},
"documents": [
{{
"id": "doc_1",
"title": "{jsonTitle}",
"filename": "document.{outputFormat}",
"sections": [
{{
"id": "section_1",
"content_type": "heading",
"elements": [
{{
"level": 1,
"text": "1. SECTION TITLE"
}}
],
"order": 1
}},
{{
"id": "section_2",
"content_type": "paragraph",
"elements": [
{{
"text": "This is the actual content that should be generated."
}}
],
"order": 2
}}
]
}}
]
}}

IMPORTANT:
- Return ONLY the JSON structure above
- Do NOT include any text before or after the JSON
- Fill in the actual content based on the user request: {safeUserPrompt}
- If the content is too large, you can split it into multiple sections
- Each section should have a unique id and appropriate content_type

LOOP_INSTRUCTION
"""

        # Debug output
        debugLog("GENERATION PROMPT: Generated successfully")

        return result.strip()

    except Exception as e:
        # Fallback on any error - preserve user prompt for language instructions
        debugLog(f"DEBUG: AI generation prompt failed: {str(e)}")
        return f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}"
|
|
|
|
|
|
def _getFormatRules(outputFormat: str) -> str:
    """
    Return the rules text for the given output format.

    The format name is matched case-insensitively against the known formats
    (xlsx, pdf, docx, html, json, csv, txt). Any other name yields a generic
    rule set that mentions the name as given. The text starts with a line
    break, followed by an "<FORMAT> Format Rules:" heading and four "- "
    bullet lines.
    """
    # pdf and docx share the same four rules.
    layout_rules = (
        "Create professional document layout",
        "Use appropriate headings and sections",
        "Include proper spacing and formatting",
        "Ensure content is well-organized and readable",
    )
    rules_by_format = {
        "xlsx": (
            "Create tables with clear headers and organized data",
            "Use appropriate column widths and formatting",
            "Include summary information if relevant",
            "Ensure data is properly structured for spreadsheet analysis",
        ),
        "pdf": layout_rules,
        "docx": layout_rules,
        "html": (
            "Create clean, semantic HTML structure",
            "Use appropriate tags for content organization",
            "Include proper styling classes",
            "Ensure content is accessible and well-formatted",
        ),
        "json": (
            "Create well-structured JSON data",
            "Use appropriate nesting and organization",
            "Include metadata and context information",
            "Ensure data is properly formatted and valid",
        ),
        "csv": (
            "Create clear, organized tabular data",
            "Use appropriate headers and data types",
            "Ensure proper CSV formatting",
            "Include all relevant data in structured format",
        ),
        "txt": (
            "Create clean, readable text format",
            "Use appropriate spacing and organization",
            "Include clear headings and sections",
            "Ensure content is well-structured and easy to read",
        ),
    }

    format_key = outputFormat.lower()
    if format_key in rules_by_format:
        heading = format_key.upper()
        bullets = rules_by_format[format_key]
    else:
        heading = outputFormat.upper()
        bullets = (
            f"Create well-structured content appropriate for {outputFormat}",
            "Use appropriate formatting and organization",
            "Ensure content is clear and professional",
            "Include all relevant information in proper format",
        )

    bullet_lines = "".join(f"- {rule}\n" for rule in bullets)
    return f"\n{heading} Format Rules:\n{bullet_lines}"
|
|
|
|
|
|
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
    """
    Parse user prompt to extract the core extraction intent.

    Asks ``aiService`` to condense the user request into a short statement.
    A generic sentence naming ``outputFormat`` is returned instead when no
    AI service is given, when the reply is empty or blank, or when the call
    fails for any reason.

    Args:
        userPrompt: The user's request, embedded verbatim in the analysis prompt.
        outputFormat: Target format, named in the analysis prompt and in the
            generic fallback sentence.
        aiService: Optional AI service exposing ``aiObjects.call``.
        services: Optional service container used to report failures via
            ``utils.debugLogToFile``.

    Returns:
        The stripped AI reply, or the generic fallback sentence.
    """
    fallbackIntent = f"Extract content from the provided documents and create a {outputFormat} report."

    if not aiService:
        return fallbackIntent

    try:
        analysis_prompt = f"""
Analyze this user request and extract the core extraction intent:

User request: "{userPrompt}"
Target format: {outputFormat}

Extract the main intent and requirements for document processing. Focus on:
1. What content needs to be extracted
2. How it should be organized
3. Any specific requirements or preferences

Respond with a clear, concise statement of the extraction intent.
"""

        # AiCallRequest / AiCallOptions / OperationType come from the
        # module-level import; no need to re-import them here.
        request_options = AiCallOptions()
        request_options.operationType = OperationType.GENERAL

        request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
        response = await aiService.aiObjects.call(request)

        if response and response.content:
            intent = response.content.strip()
            # A whitespace-only reply carries no intent; use the fallback.
            if intent:
                return intent
        return fallbackIntent

    except Exception as e:
        # `services` may be None, so reporting the failure must not raise.
        message = f"Extraction intent analysis failed: {str(e)}"
        if services is not None:
            services.utils.debugLogToFile(message, "PROMPT_BUILDER")
        else:
            logging.getLogger(__name__).warning(message)
        return fallbackIntent
|
|
|