# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Prompt builder for document extraction.
This module builds prompts for extracting content from documents.
"""
|
|
|
|
import json
|
|
import logging
|
|
from typing import Dict, Any, Optional
|
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
|
|
|
|
# Type hint for renderer parameter
|
|
from typing import TYPE_CHECKING
|
|
if TYPE_CHECKING:
    # Only static type checkers take this branch (TYPE_CHECKING is False at
    # runtime), so the renderer module is never imported by this file when
    # the program actually runs.
    from modules.services.serviceGeneration.renderers.rendererBaseTemplate import BaseRenderer
    _RendererLike = BaseRenderer
else:
    # Runtime placeholder: the alias only has to exist so that annotations
    # referring to it can be evaluated.
    _RendererLike = Any

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def buildExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None,
    renderer: Optional["_RendererLike"] = None,
) -> str:
    """
    Build the unified extraction prompt for extracting content from documents.

    The prompt asks the model for ONE flat JSON object per ContentPart
    (text, tables, headings, lists, images) so that results from several
    ContentParts can be merged afterwards.

    Args:
        outputFormat: Target output format. Only forwarded to the intent
            analysis; it does not appear in the prompt otherwise.
        userPrompt: User's prompt describing what to extract.
        title: Document title. Accepted for interface compatibility; it is
            not used when building the prompt.
        aiService: Optional AI service. When given, the user prompt is first
            condensed via _parseExtractionIntent; otherwise the user prompt
            itself is embedded as the extraction intent.
        services: Optional services instance. Read for the user language
            (``currentUserLanguage`` first, then ``user.language``) and for
            ``utils.sanitizePromptContent``.
        renderer: Optional renderer. If it offers ``getExtractionGuidelines()``
            and that returns non-empty text, the text is appended to the prompt.

    Returns:
        Complete extraction prompt string
    """
    # Loggers are cached by name, so this is the same object as the
    # module-level `logger`; resolving it here keeps the function self-contained.
    log = logging.getLogger(__name__)

    # Flat extraction format - returns extracted content as structured data, not documents/sections
    # This format allows merging multiple contentParts into one response
    json_example = {
        "extracted_content": {
            "text": "Extracted text content from the document...",
            "tables": [
                {
                    "headers": ["Column 1", "Column 2"],
                    "rows": [
                        ["Value 1", "Value 2"],
                        ["Value 3", "Value 4"],
                    ],
                }
            ],
            "headings": [
                {"level": 1, "text": "Main Heading"},
                {"level": 2, "text": "Subheading"},
            ],
            "lists": [
                {
                    "type": "bullet",
                    "items": ["Item 1", "Item 2", "Item 3"],
                }
            ],
            "images": [
                {
                    "description": "Description of image content, including all visible text, tables, and visual elements"
                }
            ],
        }
    }

    structure_instruction = """CRITICAL EXTRACTION REQUIREMENTS:
1. Extract content from the provided ContentPart(s) - process what is provided in this call
2. If this ContentPart contains tables, extract them with proper structure (headers and rows)
3. If this ContentPart contains text, extract it as structured text
4. Return ONE JSON object with extracted content from this ContentPart
5. Preserve all original data - do not summarize or interpret
6. The system will merge results from multiple ContentParts automatically - focus on extracting this ContentPart's content accurately"""

    # Parse extraction intent if AI service is available.
    # NOTE(review): without an aiService the raw (unsanitized) user prompt is
    # embedded as the intent, outside the USER PROMPT markers - confirm intended.
    if aiService:
        extraction_intent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services)
    else:
        extraction_intent = userPrompt

    # Extract user language for the language instruction.
    userLanguage = 'en'  # Default fallback
    if services:
        try:
            # Prefer detected language if available, then the user's profile language.
            detected_language = getattr(services, 'currentUserLanguage', None)
            if not detected_language:
                detected_language = getattr(getattr(services, 'user', None), 'language', None)
            # Only override the default with a real value; a missing/None/empty
            # language must not end up in the prompt as the text 'None'.
            if detected_language:
                userLanguage = detected_language
        except Exception:
            # Best effort: language detection must never block prompt building.
            log.debug("Could not determine user language, using '%s'", userLanguage, exc_info=True)

    # Build base prompt with clear user prompt markers
    sanitized_user_prompt = services.utils.sanitizePromptContent(userPrompt, 'userinput') if services else userPrompt
    separator = '=' * 80
    adaptive_prompt = f"""
{separator}
USER REQUEST / USER PROMPT:
{separator}
{sanitized_user_prompt}
{separator}
END OF USER REQUEST / USER PROMPT
{separator}

You are a document processing assistant that extracts content from documents. Your task is to analyze the provided ContentPart(s) and extract their content into a structured JSON format.

TASK: Extract content from the provided ContentPart(s). Extract all tables, text, headings, lists, and other content types accurately. The system processes ContentParts individually and merges results automatically.

LANGUAGE REQUIREMENT: All extracted content must be in the language '{userLanguage}'. Extract and preserve content in this language.

{extraction_intent}

{structure_instruction}

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(json_example, indent=2)}

CRITICAL EXTRACTION RULES:
- Extract only content that is ACTUALLY PRESENT in the ContentPart - never create fake or placeholder data
- Return empty arrays [] or empty strings "" when content is missing - this is normal and expected
- Extract all tables, text, headings, lists accurately with proper structure
- Preserve all original data - do not summarize or interpret
- Return ONE JSON object per ContentPart (the system merges multiple ContentParts automatically)

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract only actual content from the ContentPart. Return empty arrays/strings when content is missing - never create fake data.
""".strip()

    # Add renderer-specific guidelines if provided
    if renderer:
        try:
            get_guidelines = getattr(renderer, 'getExtractionGuidelines', None)
            formatGuidelines = get_guidelines() if get_guidelines else None
        except Exception:
            # Guidelines are optional; keep the base prompt but leave a trace.
            log.warning("Renderer extraction guidelines could not be obtained", exc_info=True)
        else:
            # Skip None/empty guidelines so the text "None" is never appended.
            if formatGuidelines:
                adaptive_prompt = f"{adaptive_prompt}\n\n{formatGuidelines}".strip()

    # Save extraction prompt to debug file. This is a diagnostic side channel,
    # so a failure here is logged and must not prevent returning the prompt.
    try:
        from modules.shared.debugLogger import writeDebugFile
        writeDebugFile(adaptive_prompt, "extraction_prompt")
    except Exception:
        log.warning("Could not write extraction prompt debug file", exc_info=True)

    return adaptive_prompt
|
|
|
|
|
|
|
|
|
|
|
|
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
    """
    Parse user prompt to extract the core extraction intent.

    Sends an analysis prompt built from the user request and the target
    format to ``aiService.aiObjects.call`` and returns the stripped answer.

    Args:
        userPrompt: The user's original request text.
        outputFormat: Target output format; named in the analysis prompt and
            in the generic fallback intent.
        aiService: Optional AI service. Without it no call is made.
        services: Optional services instance; if it provides
            ``utils.debugLogToFile`` a failure is also reported there.

    Returns:
        The intent statement produced by the AI service, or a generic
        "Extract content ... create a <outputFormat> report." intent when no
        service is given, the answer is empty, or the analysis fails.
    """
    fallback_intent = f"Extract content from the provided documents and create a {outputFormat} report."

    if not aiService:
        return fallback_intent

    try:
        analysis_prompt = f"""
Analyze this user request and extract the core extraction intent:

User request: "{userPrompt}"
Target format: {outputFormat}

Extract the main intent and requirements for document processing. Focus on:
1. What content needs to be extracted
2. How it should be organized
3. Any specific requirements or preferences

Respond with a clear, concise statement of the extraction intent.
"""
        request_options = AiCallOptions()
        request_options.operationType = OperationTypeEnum.DATA_GENERATE

        request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
        response = await aiService.aiObjects.call(request)

        # Treat a missing or whitespace-only answer like no answer at all.
        intent = response.content.strip() if response and response.content else ""
        return intent or fallback_intent

    except Exception as e:
        message = f"Extraction intent analysis failed: {str(e)}"
        log = logging.getLogger(__name__)
        log.warning(message)

        # `services` is optional (defaults to None); reporting the failure
        # must never raise from inside this handler, otherwise the fallback
        # below would be lost.
        debug_log = getattr(getattr(services, "utils", None), "debugLogToFile", None)
        if callable(debug_log):
            try:
                debug_log(message, "PROMPT_BUILDER")
            except Exception:
                log.debug("debugLogToFile failed", exc_info=True)

        return fallback_intent
|
|
|