"""
Prompt builder for AI document generation and extraction.

This module builds prompts for AI services to extract and generate documents.
"""
|
|
|
|
import json
|
|
import logging
|
|
from typing import Dict, Any, Optional, List, TYPE_CHECKING
|
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
|
|
|
# Type hint for renderer parameter: bind the real BaseRenderer class only
# during static type checking so the renderer module (and whatever it pulls
# in) is not imported at runtime.
if TYPE_CHECKING:
    from .renderers.rendererBaseTemplate import BaseRenderer
    _RendererLike = BaseRenderer
else:
    # At runtime the alias degrades to Any, keeping annotations valid
    # without the import.
    _RendererLike = Any

# Module-level logger, standard logging.getLogger(__name__) pattern.
logger = logging.getLogger(__name__)
|
|
|
|
# Centralized JSON structure template for document generation.
# The "// ..." lines are element-shape hints for the AI model; stripping
# lines that start with "//" yields valid JSON. The trailing comma that
# previously followed "continuation": null was removed — it made the
# template invalid JSON even after the hint lines were stripped.
JSON_STRUCTURE_TEMPLATE = """{
  "metadata": {
    "title": "{{DOCUMENT_TITLE}}",
    "splitStrategy": "single_document",
    "source_documents": [],
    "extraction_method": "ai_generation"
  },
  "documents": [{
    "id": "doc_1",
    "title": "{{DOCUMENT_TITLE}}",
    "filename": "document.json",
    "sections": [
      {
        "id": "section_1",
        "content_type": "heading|paragraph|table|list|code",
        "elements": [
          // heading: {"level": 1, "text": "..."}
          // paragraph: {"text": "..."}
          // table: {"headers": [...], "rows": [[...]], "caption": "..."}
          // list: {"items": [{"text": "...", "subitems": [...]}], "list_type": "bullet|numbered"}
          // code: {"code": "...", "language": "..."}
        ],
        "order": 1
      }
    ]
  }],
  "continuation": null
}"""
|
|
|
|
async def buildAdaptiveExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    promptAnalysis: Dict[str, Any],
    aiService=None,
    services=None
) -> str:
    """
    Build adaptive extraction prompt based on AI analysis.

    Uses a unified multi-document JSON format: a single document is simply
    the n=1 case of the "documents" array.

    Args:
        outputFormat: Target output format (not embedded in the prompt text;
            kept for interface compatibility).
        userPrompt: Raw user request; sanitized before being embedded when a
            service container is available.
        title: Document title (kept for interface compatibility).
        promptAnalysis: AI analysis of the prompt (kept for interface
            compatibility).
        aiService: Optional AI service handle (unused here).
        services: Optional service container providing
            ``ai.sanitizePromptContent``.

    Returns:
        The complete adaptive extraction prompt string.
    """
    # Multi-file example data instead of schema
    multi_file_example = {
        "metadata": {
            "title": "Multi-Document Example",
            "splitStrategy": "by_section",
            "source_documents": ["doc_001"],
            "extraction_method": "ai_extraction"
        },
        "documents": [
            {
                "id": "doc_section_1",
                "title": "Section 1 Title",
                "filename": "section_1.xlsx",
                "sections": [
                    {
                        "id": "section_1",
                        "content_type": "heading",
                        "elements": [
                            {
                                "level": 1,
                                "text": "1. SECTION TITLE"
                            }
                        ],
                        "order": 1
                    },
                    {
                        "id": "section_2",
                        "content_type": "paragraph",
                        "elements": [
                            {
                                "text": "This is the actual content that should be extracted from the document."
                            }
                        ],
                        "order": 2
                    },
                    {
                        "id": "section_3",
                        "content_type": "table",
                        "elements": [
                            {
                                "headers": ["Column 1", "Column 2"],
                                "rows": [["Value 1", "Value 2"]]
                            }
                        ],
                        "order": 3
                    }
                ]
            }
        ]
    }

    # Sanitize the user prompt when the service container is available; fall
    # back to the raw prompt otherwise. Bug fix: previously this crashed with
    # AttributeError because `services` defaults to None.
    if services is not None:
        sanitizedUserPrompt = services.ai.sanitizePromptContent(userPrompt, 'userinput')
    else:
        sanitizedUserPrompt = userPrompt

    # UNIFIED APPROACH: Always use multi-document format (single doc = multi with n=1)
    adaptive_prompt = f"""
{sanitizedUserPrompt}

You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.

TASK: Extract the actual content from the document and organize it into documents. For single documents, create one document entry. For multi-document requests, create multiple document entries.

REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Identify distinct sections in the document (by headings, topics, or logical breaks)
3. Create one or more JSON document entries based on the content structure
4. Extract the real content from each section (headings, paragraphs, lists, etc.)
5. Generate appropriate filenames for each document

CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array.

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(multi_file_example, indent=2)}

IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have:
- "id": unique identifier
- "title": document title
- "filename": appropriate filename for the document
- "sections": array of content sections

DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level.

INSTRUCTIONS:
- For single document requests: Create one document with all content in its sections
- For multi-document requests: Create multiple documents, each with relevant sections
- Use actual section titles, headings, and text from the document
- Create meaningful filenames based on content
- Ensure each section contains the complete content for that part
- Do not use generic placeholder text like "Section 1", "Section 2"
- Extract real headings, paragraphs, lists, and other content elements
- CRITICAL: Return JSON with "documents" array, not "sections" array

CONTEXT (Document Content):

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()

    return adaptive_prompt
|
|
|
|
async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str
) -> str:
    """Build the unified generation prompt using a single JSON template."""
    # Substitute the requested title into the shared JSON template.
    filledTemplate = JSON_STRUCTURE_TEMPLATE.replace("{{DOCUMENT_TITLE}}", title)

    # Assemble the prompt line by line; LOOP_INSTRUCTION is a placeholder
    # replaced downstream.
    promptLines = [
        "Generate structured JSON content for document creation.",
        "",
        f'USER CONTEXT: "{userPrompt}"',
        f'DOCUMENT TITLE: "{title}"',
        f"TARGET FORMAT: {outputFormat}",
        "",
        "LOOP_INSTRUCTION",
        "",
        "RULES:",
        "- Follow the template structure below exactly; emit only one JSON object in the response",
        "- Fill sections with content based on the user request",
        "- Use appropriate content_type",
        "",
        "Return ONLY valid JSON matching this structure (template below). Do not include any prose before/after. Use this as the single template reference for your output:",
        filledTemplate,
    ]
    return "\n".join(promptLines).strip()
|
|
|
|
async def buildExtractionPrompt(
    outputFormat: str,
    renderer: _RendererLike,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """
    Build the final extraction prompt by combining:
    - Parsed extraction intent from user prompt (using AI)
    - Generic cross-format instructions (filename header + real-data policy)
    - Format-specific guidelines snippet provided by the renderer

    The AI must place a single filename header at the very top:
    FILENAME: <safe-file-name-with-extension>
    followed by a blank line and then ONLY the document content according to the target format.

    Args:
        outputFormat: Target document format name.
        renderer: Renderer that may expose ``getExtractionGuidelines()``.
        userPrompt: Raw user request text.
        title: Document title (not embedded in the prompt text; kept for
            interface compatibility).
        aiService: Optional AI service used for intent parsing.
        services: Optional service container (config lookup, debug logging).

    Returns:
        The complete extraction prompt string.
    """
    # Parse user prompt to separate extraction intent from generation format using AI
    extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services)

    # The unused fetch of get_document_subJsonSchema() was removed here:
    # its result was never referenced in the prompt (dead code).

    # Mixed example data showing the different content types the model must emit.
    example_data = {
        "metadata": {
            "title": "Example Document",
            "author": "AI Assistant",
            "source_documents": ["document_001"],
            "extraction_method": "ai_extraction"
        },
        "sections": [
            {
                "id": "section_001",
                "content_type": "heading",
                "elements": [
                    {
                        "level": 1,
                        "text": "1. INTRODUCTION"
                    }
                ],
                "order": 1,
                "metadata": {}
            },
            {
                "id": "section_002",
                "content_type": "paragraph",
                "elements": [
                    {
                        "text": "This is a sample paragraph with actual content that should be extracted from the document."
                    }
                ],
                "order": 2,
                "metadata": {}
            },
            {
                "id": "section_003",
                "content_type": "table",
                "elements": [
                    {
                        "headers": ["Column 1", "Column 2", "Column 3"],
                        "rows": [
                            ["Value 1", "Value 2", "Value 3"],
                            ["Value 4", "Value 5", "Value 6"]
                        ]
                    }
                ],
                "order": 3,
                "metadata": {}
            }
        ],
        "summary": "",
        "tags": []
    }

    genericIntro = f"""
{extractionIntent}

You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.

TASK: Extract the actual content from the document and organize it into structured sections.

REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Extract all content and organize it into logical sections
3. Create structured JSON with sections containing the extracted content
4. Preserve the original structure and data

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(example_data, indent=2)}

Requirements:
- Preserve all original data - do not summarize or interpret
- Use the exact JSON format shown above
- Maintain data integrity and structure

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.

DO NOT return a schema description - return actual extracted content in the JSON format shown above.
"""

    # Ask the renderer for format-specific guidance; best-effort only — a
    # broken renderer must not prevent prompt construction.
    formatGuidelines = ""
    try:
        if hasattr(renderer, 'getExtractionGuidelines'):
            formatGuidelines = renderer.getExtractionGuidelines()
    except Exception:
        pass

    # Combine all parts
    finalPrompt = f"{genericIntro}\n\n{formatGuidelines}".strip()

    # Save extraction prompt to debug file — only if debug enabled.
    # Deliberately best-effort: debugging must never break prompt building,
    # so every failure (including services being None) is swallowed.
    try:
        debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
        if debug_enabled:
            import os
            from datetime import datetime, UTC
            ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            # Use configured log directory instead of hardcoded test-chat
            from modules.shared.configuration import APP_CONFIG
            logDir = APP_CONFIG.get("APP_LOGGING_LOG_DIR", "./")
            if not os.path.isabs(logDir):
                # Resolve relative log dirs against the gateway root
                # (three directory levels above this module).
                gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
                logDir = os.path.join(gatewayDir, logDir)
            debug_root = os.path.join(logDir, 'debug')
            os.makedirs(debug_root, exist_ok=True)
            with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f:
                f.write(finalPrompt)
    except Exception:
        pass

    return finalPrompt
|
|
|
|
|
|
|
|
|
|
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
|
|
"""
|
|
Parse user prompt to extract the core extraction intent.
|
|
"""
|
|
if not aiService:
|
|
return f"Extract content from the provided documents and create a {outputFormat} report."
|
|
|
|
try:
|
|
analysis_prompt = f"""
|
|
Analyze this user request and extract the core extraction intent:
|
|
|
|
User request: "{userPrompt}"
|
|
Target format: {outputFormat}
|
|
|
|
Extract the main intent and requirements for document processing. Focus on:
|
|
1. What content needs to be extracted
|
|
2. How it should be organized
|
|
3. Any specific requirements or preferences
|
|
|
|
Respond with a clear, concise statement of the extraction intent.
|
|
"""
|
|
|
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
|
request_options = AiCallOptions()
|
|
request_options.operationType = OperationType.GENERAL
|
|
|
|
request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
|
|
response = await aiService.aiObjects.call(request)
|
|
|
|
if response and response.content:
|
|
return response.content.strip()
|
|
else:
|
|
return f"Extract content from the provided documents and create a {outputFormat} report."
|
|
|
|
except Exception as e:
|
|
services.utils.debugLogToFile(f"Extraction intent analysis failed: {str(e)}", "PROMPT_BUILDER")
|
|
return f"Extract content from the provided documents and create a {outputFormat} report."
|
|
|