"""
|
|
Prompt builder for document extraction.
|
|
This module builds prompts for extracting content from documents.
|
|
"""
|
|
|
|
import json
import logging
from typing import Dict, Any, Optional

from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum

# Type hint for renderer parameter
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Resolved only by static type checkers; keeps the renderer import
    # out of the runtime import graph (avoids a potential import cycle).
    from modules.services.serviceGeneration.renderers.rendererBaseTemplate import BaseRenderer
    _RendererLike = BaseRenderer
else:
    # At runtime the renderer parameter is effectively untyped.
    _RendererLike = Any

# Module-level logger following the standard getLogger(__name__) convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def buildExtractionPrompt(
|
|
outputFormat: str,
|
|
userPrompt: str,
|
|
title: str,
|
|
aiService=None,
|
|
services=None,
|
|
renderer: _RendererLike = None
|
|
) -> str:
|
|
"""
|
|
Build unified extraction prompt for extracting content from documents.
|
|
Always uses multi-file format (single doc = multi with n=1).
|
|
|
|
Args:
|
|
outputFormat: Target output format
|
|
userPrompt: User's prompt describing what to extract
|
|
title: Document title
|
|
aiService: Optional AI service for intent parsing
|
|
services: Services instance
|
|
renderer: Optional renderer for format-specific guidelines
|
|
|
|
Returns:
|
|
Complete extraction prompt string
|
|
"""
|
|
|
|
# Unified multi-file example (single doc = multi with n=1)
|
|
json_example = {
|
|
"metadata": {
|
|
"title": "Multi-Document Example",
|
|
"split_strategy": "by_section",
|
|
"source_documents": ["doc_001"],
|
|
"extraction_method": "ai_extraction"
|
|
},
|
|
"documents": [
|
|
{
|
|
"id": "doc_section_1",
|
|
"title": "Section 1 Title",
|
|
"filename": "section_1.xlsx",
|
|
"sections": [
|
|
{
|
|
"id": "section_1",
|
|
"content_type": "heading",
|
|
"elements": [
|
|
{
|
|
"level": 1,
|
|
"text": "1. SECTION TITLE"
|
|
}
|
|
],
|
|
"order": 1
|
|
},
|
|
{
|
|
"id": "section_2",
|
|
"content_type": "paragraph",
|
|
"elements": [
|
|
{
|
|
"text": "This is the actual content that should be extracted from the document."
|
|
}
|
|
],
|
|
"order": 2
|
|
},
|
|
{
|
|
"id": "section_3",
|
|
"content_type": "table",
|
|
"elements": [
|
|
{
|
|
"headers": ["Column 1", "Column 2"],
|
|
"rows": [["Value 1", "Value 2"]]
|
|
}
|
|
],
|
|
"order": 3
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
|
|
structure_instruction = "CRITICAL: You MUST return a JSON structure with a \"documents\" array. For single documents, create one document entry with all sections."
|
|
|
|
# Parse extraction intent if AI service is available
|
|
extraction_intent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services) if aiService else userPrompt
|
|
|
|
# Build base prompt
|
|
adaptive_prompt = f"""
|
|
{services.ai.sanitizePromptContent(userPrompt, 'userinput') if services else userPrompt}
|
|
|
|
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
|
|
|
|
TASK: Extract the actual content from the document and organize it into documents. For single documents, create one document entry. For multi-document requests, create multiple document entries.
|
|
|
|
{extraction_intent}
|
|
|
|
REQUIREMENTS:
|
|
1. Analyze the document content provided in the context below
|
|
2. Identify distinct sections in the document (by headings, topics, or logical breaks)
|
|
3. Create one or more JSON document entries based on the content structure
|
|
4. Extract the real content from each section (headings, paragraphs, lists, etc.)
|
|
5. Generate appropriate filenames for each document
|
|
|
|
{structure_instruction}
|
|
|
|
OUTPUT FORMAT: Return only valid JSON in this exact structure:
|
|
{json.dumps(json_example, indent=2)}
|
|
|
|
Requirements:
|
|
- Preserve all original data - do not summarize or interpret
|
|
- Use the exact JSON format shown above
|
|
- Maintain data integrity and structure
|
|
|
|
Content Types to Extract:
|
|
1. Tables: Extract all rows and columns with proper headers
|
|
2. Lists: Extract all items with proper nesting
|
|
3. Headings: Extract with appropriate levels
|
|
4. Paragraphs: Extract as structured text
|
|
5. Code: Extract code blocks with language identification
|
|
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
|
|
|
|
Image Analysis Requirements:
|
|
- If you cannot analyze an image for any reason, explain why in the JSON response
|
|
- Describe everything you see in the image
|
|
- Include all text content, tables, logos, graphics, layout, and visual elements
|
|
- If the image is too small, corrupted, or unclear, explain this
|
|
- Always provide feedback - never return empty responses
|
|
|
|
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
|
|
|
|
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
|
|
""".strip()
|
|
|
|
# Add renderer-specific guidelines if provided
|
|
if renderer:
|
|
try:
|
|
if hasattr(renderer, 'getExtractionGuidelines'):
|
|
formatGuidelines = renderer.getExtractionGuidelines()
|
|
adaptive_prompt = f"{adaptive_prompt}\n\n{formatGuidelines}".strip()
|
|
except Exception:
|
|
pass
|
|
|
|
# Save extraction prompt to debug file - only if debug enabled
|
|
if services:
|
|
try:
|
|
debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
|
if debug_enabled:
|
|
import os
|
|
from datetime import datetime, UTC
|
|
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
|
from modules.shared.configuration import APP_CONFIG
|
|
logDir = APP_CONFIG.get("APP_LOGGING_LOG_DIR", "./")
|
|
if not os.path.isabs(logDir):
|
|
gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
logDir = os.path.join(gatewayDir, logDir)
|
|
debug_root = os.path.join(logDir, 'debug')
|
|
os.makedirs(debug_root, exist_ok=True)
|
|
with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f:
|
|
f.write(adaptive_prompt)
|
|
except Exception:
|
|
pass
|
|
|
|
return adaptive_prompt
|
|
|
|
|
|
|
|
|
|
|
|
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
|
|
"""
|
|
Parse user prompt to extract the core extraction intent.
|
|
"""
|
|
if not aiService:
|
|
return f"Extract content from the provided documents and create a {outputFormat} report."
|
|
|
|
try:
|
|
analysis_prompt = f"""
|
|
Analyze this user request and extract the core extraction intent:
|
|
|
|
User request: "{userPrompt}"
|
|
Target format: {outputFormat}
|
|
|
|
Extract the main intent and requirements for document processing. Focus on:
|
|
1. What content needs to be extracted
|
|
2. How it should be organized
|
|
3. Any specific requirements or preferences
|
|
|
|
Respond with a clear, concise statement of the extraction intent.
|
|
"""
|
|
request_options = AiCallOptions()
|
|
request_options.operationType = OperationTypeEnum.DATA_GENERATE
|
|
|
|
request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
|
|
response = await aiService.aiObjects.call(request)
|
|
|
|
if response and response.content:
|
|
return response.content.strip()
|
|
else:
|
|
return f"Extract content from the provided documents and create a {outputFormat} report."
|
|
|
|
except Exception as e:
|
|
services.utils.debugLogToFile(f"Extraction intent analysis failed: {str(e)}", "PROMPT_BUILDER")
|
|
return f"Extract content from the provided documents and create a {outputFormat} report."
|
|
|