gateway/modules/aichat/serviceExtraction/subPromptBuilderExtraction.py
2026-01-22 21:11:25 +01:00

214 lines
8.2 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Prompt builder for document extraction.
This module builds prompts for extracting content from documents.
"""
import json
import logging
from typing import Dict, Any, Optional
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
# Type hint for renderer parameter
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from modules.aichat.serviceGeneration.renderers.documentRendererBaseTemplate import BaseRenderer
_RendererLike = BaseRenderer
else:
_RendererLike = Any
logger = logging.getLogger(__name__)
def _resolveUserLanguage(services) -> str:
    """
    Determine the language code inserted into the prompt's language requirement.

    Lookup order: ``services.currentUserLanguage`` (detected language), then
    ``services.user.language``, then the default ``'en'``. Missing, ``None`` or
    empty values are skipped so the prompt never contains e.g. ``'None'``.

    Args:
        services: Services instance or None

    Returns:
        A non-empty language string
    """
    default_language = 'en'
    if not services:
        return default_language
    try:
        # Prefer detected language if available
        detected_language = getattr(services, 'currentUserLanguage', None)
        if detected_language:
            return detected_language
        user = getattr(services, 'user', None)
        profile_language = getattr(user, 'language', None) if user else None
        if profile_language:
            return profile_language
    except Exception:
        # Attribute access on the services object may raise; language lookup
        # is best-effort, but the failure should be visible in the logs.
        logging.getLogger(__name__).warning(
            "Could not determine user language, falling back to '%s'",
            default_language,
            exc_info=True,
        )
    return default_language


async def buildExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None,
    renderer: _RendererLike = None
) -> str:
    """
    Build the unified extraction prompt for extracting content from documents.

    The prompt asks the model for one flat JSON object per ContentPart
    (text, tables, headings, lists, images) so results can be merged later.

    Args:
        outputFormat: Target output format (only forwarded to intent parsing)
        userPrompt: User's prompt describing what to extract
        title: Document title (currently not used when building the prompt)
        aiService: Optional AI service; when given, the extraction intent is
            derived via _parseExtractionIntent, otherwise the (sanitized)
            user prompt itself is used as the intent
        services: Optional services instance; provides
            utils.sanitizePromptContent and the user language
        renderer: Optional renderer; if it offers getExtractionGuidelines(),
            non-empty guidelines are appended to the prompt

    Returns:
        Complete extraction prompt string
    """
    log = logging.getLogger(__name__)

    # Flat extraction format - returns extracted content as structured data, not documents/sections
    # This format allows merging multiple contentParts into one response
    json_example = {
        "extracted_content": {
            "text": "Extracted text content from the document...",
            "tables": [
                {
                    "headers": ["Column 1", "Column 2"],
                    "rows": [
                        ["Value 1", "Value 2"],
                        ["Value 3", "Value 4"]
                    ]
                }
            ],
            "headings": [
                {
                    "level": 1,
                    "text": "Main Heading"
                },
                {
                    "level": 2,
                    "text": "Subheading"
                }
            ],
            "lists": [
                {
                    "type": "bullet",
                    "items": ["Item 1", "Item 2", "Item 3"]
                }
            ],
            "images": [
                {
                    "description": "Description of image content, including all visible text, tables, and visual elements"
                }
            ]
        }
    }
    structure_instruction = """CRITICAL EXTRACTION REQUIREMENTS:
1. Extract content from the provided ContentPart(s) - process what is provided in this call
2. If this ContentPart contains tables, extract them with proper structure (headers and rows)
3. If this ContentPart contains text, extract it as structured text
4. Return ONE JSON object with extracted content from this ContentPart
5. Preserve all original data - do not summarize or interpret
6. The system will merge results from multiple ContentParts automatically - focus on extracting this ContentPart's content accurately"""

    # Sanitize once and reuse: the user text is embedded in the prompt both in
    # the marked USER REQUEST block and (without an AI service) as the intent.
    sanitized_user_prompt = services.utils.sanitizePromptContent(userPrompt, 'userinput') if services else userPrompt

    # Parse extraction intent if AI service is available; otherwise fall back
    # to the sanitized prompt so raw user input never bypasses sanitization.
    if aiService:
        extraction_intent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services)
    else:
        extraction_intent = sanitized_user_prompt

    # Extract user language for document language instruction
    userLanguage = _resolveUserLanguage(services)

    # Build base prompt with clear user prompt markers
    adaptive_prompt = f"""
{'='*80}
USER REQUEST / USER PROMPT:
{'='*80}
{sanitized_user_prompt}
{'='*80}
END OF USER REQUEST / USER PROMPT
{'='*80}
You are a document processing assistant that extracts content from documents. Your task is to analyze the provided ContentPart(s) and extract their content into a structured JSON format.
TASK: Extract content from the provided ContentPart(s). Extract all tables, text, headings, lists, and other content types accurately. The system processes ContentParts individually and merges results automatically.
LANGUAGE REQUIREMENT: All extracted content must be in the language '{userLanguage}'. Extract and preserve content in this language.
{extraction_intent}
{structure_instruction}
OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(json_example, indent=2)}
CRITICAL EXTRACTION RULES:
- Extract only content that is ACTUALLY PRESENT in the ContentPart - never create fake or placeholder data
- Return empty arrays [] or empty strings "" when content is missing - this is normal and expected
- Extract all tables, text, headings, lists accurately with proper structure
- Preserve all original data - do not summarize or interpret
- Return ONE JSON object per ContentPart (the system merges multiple ContentParts automatically)
Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
Extract only actual content from the ContentPart. Return empty arrays/strings when content is missing - never create fake data.
""".strip()

    # Add renderer-specific guidelines if provided
    if renderer:
        try:
            if hasattr(renderer, 'getExtractionGuidelines'):
                formatGuidelines = renderer.getExtractionGuidelines()
                # Skip None/empty guidelines so the prompt does not end in "None"
                if formatGuidelines:
                    adaptive_prompt = f"{adaptive_prompt}\n\n{formatGuidelines}".strip()
        except Exception:
            # Guidelines are optional; continue with the base prompt but log it.
            log.warning("Renderer extraction guidelines could not be added", exc_info=True)

    # Save extraction prompt to debug file - only if debug enabled.
    # This is a diagnostic side effect: a failure here must not prevent the
    # caller from receiving the prompt, so it is logged instead of raised.
    try:
        from modules.shared.debugLogger import writeDebugFile
        writeDebugFile(adaptive_prompt, "extraction_prompt")
    except Exception:
        log.warning("Could not write extraction prompt debug file", exc_info=True)

    return adaptive_prompt
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
    """
    Parse user prompt to extract the core extraction intent.

    Sends an analysis prompt to the AI service and returns its stripped answer.
    A generic fallback sentence is returned when no AI service is given, when
    the response is missing/empty, or when the call fails.

    Args:
        userPrompt: User's request text (embedded verbatim in the analysis prompt)
        outputFormat: Target output format, named in the prompt and the fallback
        aiService: Optional AI service exposing aiObjects.call(request)
        services: Optional services instance; its utils.debugLogToFile is used
            to record failures when available

    Returns:
        Extraction intent statement (never raises on AI call failure)
    """
    fallback_intent = f"Extract content from the provided documents and create a {outputFormat} report."
    if not aiService:
        return fallback_intent
    try:
        # NOTE(review): userPrompt is inserted unsanitized here - confirm whether
        # services.utils.sanitizePromptContent should be applied as well.
        analysis_prompt = f"""
Analyze this user request and extract the core extraction intent:
User request: "{userPrompt}"
Target format: {outputFormat}
Extract the main intent and requirements for document processing. Focus on:
1. What content needs to be extracted
2. How it should be organized
3. Any specific requirements or preferences
Respond with a clear, concise statement of the extraction intent.
"""
        request_options = AiCallOptions()
        request_options.operationType = OperationTypeEnum.DATA_GENERATE
        request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
        response = await aiService.aiObjects.call(request)
        if response and response.content:
            intent = response.content.strip()
            # A whitespace-only answer carries no intent; use the fallback instead
            if intent:
                return intent
        return fallback_intent
    except Exception as e:
        failure_message = f"Extraction intent analysis failed: {str(e)}"
        # services defaults to None, so it must not be dereferenced blindly:
        # doing so would raise inside this handler and defeat the fallback.
        if services:
            services.utils.debugLogToFile(failure_message, "PROMPT_BUILDER")
        else:
            logging.getLogger(__name__).warning(failure_message)
        return fallback_intent