gateway/modules/services/serviceExtraction/subPromptBuilderExtraction.py
2025-10-28 00:14:24 +01:00

219 lines
8.4 KiB
Python

"""
Prompt builder for document extraction.
This module builds prompts for extracting content from documents.
"""
import json
import logging
from typing import Dict, Any, Optional
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
# Type hint for renderer parameter
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    # Only evaluated by static type checkers; this import is skipped at runtime.
    from modules.services.serviceGeneration.renderers.rendererBaseTemplate import BaseRenderer
    _RendererLike = BaseRenderer
else:
    # At runtime the alias is plain Any, so the renderer module is not imported here.
    _RendererLike = Any
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
async def buildExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None,
    renderer: _RendererLike = None
) -> str:
    """
    Build unified extraction prompt for extracting content from documents.

    Always uses multi-file format (single doc = multi with n=1).

    Args:
        outputFormat: Target output format; only forwarded to the intent parser.
        userPrompt: User's prompt describing what to extract.
        title: Document title (accepted for interface compatibility; not used
            when building the prompt).
        aiService: Optional AI service. When truthy, the user prompt is condensed
            into an extraction intent via ``_parseExtractionIntent``.
        services: Optional services instance. When truthy it is used to sanitize
            the user prompt and to decide whether the prompt is dumped to a
            debug file.
        renderer: Optional renderer; if it exposes ``getExtractionGuidelines``
            the returned guidelines are appended to the prompt.

    Returns:
        Complete extraction prompt string.
    """
    log = logging.getLogger(__name__)

    # Unified multi-file example (single doc = multi with n=1)
    json_example = {
        "metadata": {
            "title": "Multi-Document Example",
            "split_strategy": "by_section",
            "source_documents": ["doc_001"],
            "extraction_method": "ai_extraction"
        },
        "documents": [
            {
                "id": "doc_section_1",
                "title": "Section 1 Title",
                "filename": "section_1.xlsx",
                "sections": [
                    {
                        "id": "section_1",
                        "content_type": "heading",
                        "elements": [
                            {
                                "level": 1,
                                "text": "1. SECTION TITLE"
                            }
                        ],
                        "order": 1
                    },
                    {
                        "id": "section_2",
                        "content_type": "paragraph",
                        "elements": [
                            {
                                "text": "This is the actual content that should be extracted from the document."
                            }
                        ],
                        "order": 2
                    },
                    {
                        "id": "section_3",
                        "content_type": "table",
                        "elements": [
                            {
                                "headers": ["Column 1", "Column 2"],
                                "rows": [["Value 1", "Value 2"]]
                            }
                        ],
                        "order": 3
                    }
                ]
            }
        ]
    }
    structure_instruction = "CRITICAL: You MUST return a JSON structure with a \"documents\" array. For single documents, create one document entry with all sections."

    # Sanitize the user input once so that every place it is embedded in the
    # prompt uses the same, sanitized text.
    safe_user_prompt = services.ai.sanitizePromptContent(userPrompt, 'userinput') if services else userPrompt

    # Parse extraction intent if AI service is available; otherwise fall back to
    # the (sanitized) user prompt instead of re-inserting the raw input.
    if aiService:
        extraction_intent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services)
    else:
        extraction_intent = safe_user_prompt

    # Build base prompt
    adaptive_prompt = f"""
{safe_user_prompt}
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
TASK: Extract the actual content from the document and organize it into documents. For single documents, create one document entry. For multi-document requests, create multiple document entries.
{extraction_intent}
REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Identify distinct sections in the document (by headings, topics, or logical breaks)
3. Create one or more JSON document entries based on the content structure
4. Extract the real content from each section (headings, paragraphs, lists, etc.)
5. Generate appropriate filenames for each document
{structure_instruction}
OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(json_example, indent=2)}
Requirements:
- Preserve all original data - do not summarize or interpret
- Use the exact JSON format shown above
- Maintain data integrity and structure
Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()

    # Add renderer-specific guidelines if provided
    if renderer:
        try:
            if hasattr(renderer, 'getExtractionGuidelines'):
                formatGuidelines = renderer.getExtractionGuidelines()
                # Skip None/empty guidelines so the literal text "None" is never
                # appended to the prompt.
                if formatGuidelines:
                    adaptive_prompt = f"{adaptive_prompt}\n\n{formatGuidelines}".strip()
        except Exception as exc:
            # Guidelines are optional: keep the base prompt, but leave a trace.
            log.warning("Could not obtain renderer extraction guidelines: %s", exc)

    # Save extraction prompt to debug file - only if debug enabled
    if services:
        _writeExtractionPromptDebugFile(services, adaptive_prompt)

    return adaptive_prompt


def _writeExtractionPromptDebugFile(services, prompt: str) -> None:
    """
    Best-effort dump of the extraction prompt to a timestamped debug file.

    Does nothing unless ``APP_DEBUG_CHAT_WORKFLOW_ENABLED`` is truthy in the
    services configuration. Any failure is logged and swallowed so that prompt
    building never fails because of debug output.
    """
    try:
        if not services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False):
            return
        import os
        from datetime import datetime, timezone
        from modules.shared.configuration import APP_CONFIG

        ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
        logDir = APP_CONFIG.get("APP_LOGGING_LOG_DIR", "./")
        if not os.path.isabs(logDir):
            # Relative log dirs are resolved against the directory three levels
            # above this file.
            # NOTE(review): the variable is called gatewayDir - confirm that
            # three levels up is really the intended gateway root.
            gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            logDir = os.path.join(gatewayDir, logDir)
        debug_root = os.path.join(logDir, 'debug')
        os.makedirs(debug_root, exist_ok=True)
        with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f:
            f.write(prompt)
    except Exception as exc:
        logging.getLogger(__name__).warning("Could not write extraction prompt debug file: %s", exc)
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
    """
    Parse user prompt to extract the core extraction intent.

    Args:
        userPrompt: The user's original request.
        outputFormat: Target output format, mentioned in the analysis prompt and
            in the fallback intent.
        aiService: Optional AI service; its ``aiObjects.call`` is awaited with
            the analysis request.
        services: Optional services instance; when given, failures are reported
            through ``services.utils.debugLogToFile``.

    Returns:
        The stripped AI response, or a generic fallback intent when no AI
        service is available, the response is empty, or the call fails.
    """
    fallback_intent = f"Extract content from the provided documents and create a {outputFormat} report."
    if not aiService:
        return fallback_intent
    try:
        analysis_prompt = f"""
Analyze this user request and extract the core extraction intent:
User request: "{userPrompt}"
Target format: {outputFormat}
Extract the main intent and requirements for document processing. Focus on:
1. What content needs to be extracted
2. How it should be organized
3. Any specific requirements or preferences
Respond with a clear, concise statement of the extraction intent.
"""
        request_options = AiCallOptions()
        request_options.operationType = OperationTypeEnum.DATA_GENERATE
        request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
        response = await aiService.aiObjects.call(request)
        if response and response.content:
            intent = response.content.strip()
            # A whitespace-only answer carries no intent; use the fallback.
            if intent:
                return intent
        return fallback_intent
    except Exception as e:
        message = f"Extraction intent analysis failed: {str(e)}"
        # services is optional: without this guard the handler itself raised
        # AttributeError and the fallback intent was never returned.
        if services is not None:
            services.utils.debugLogToFile(message, "PROMPT_BUILDER")
        else:
            logging.getLogger(__name__).warning(message)
        return fallback_intent