Workflow end 2 End Validated - Start Variant Testing
This commit is contained in:
parent
bdc87eb5c6
commit
e0afc72e13
13 changed files with 541 additions and 285 deletions
|
|
@ -263,8 +263,7 @@ async def read_user_me(
|
||||||
@limiter.limit("60/minute")
|
@limiter.limit("60/minute")
|
||||||
async def refresh_token(
|
async def refresh_token(
|
||||||
request: Request,
|
request: Request,
|
||||||
response: Response,
|
response: Response
|
||||||
currentUser: User = Depends(getCurrentUser)
|
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""Refresh access token using refresh token from cookie"""
|
"""Refresh access token using refresh token from cookie"""
|
||||||
try:
|
try:
|
||||||
|
|
@ -283,12 +282,27 @@ async def refresh_token(
|
||||||
except jwt.JWTError:
|
except jwt.JWTError:
|
||||||
raise HTTPException(status_code=401, detail="Invalid refresh token")
|
raise HTTPException(status_code=401, detail="Invalid refresh token")
|
||||||
|
|
||||||
|
# Get user information from refresh token payload
|
||||||
|
user_id = payload.get("userId")
|
||||||
|
if not user_id:
|
||||||
|
raise HTTPException(status_code=401, detail="Invalid refresh token - missing user ID")
|
||||||
|
|
||||||
|
# Get user from database using the user ID from refresh token
|
||||||
|
try:
|
||||||
|
app_interface = getRootInterface()
|
||||||
|
current_user = app_interface.getUser(user_id)
|
||||||
|
if not current_user:
|
||||||
|
raise HTTPException(status_code=401, detail="User not found")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to get user from database: {str(e)}")
|
||||||
|
raise HTTPException(status_code=500, detail="Failed to validate user")
|
||||||
|
|
||||||
# Create new token data
|
# Create new token data
|
||||||
token_data = {
|
token_data = {
|
||||||
"sub": currentUser.username,
|
"sub": current_user.username,
|
||||||
"mandateId": str(currentUser.mandateId),
|
"mandateId": str(current_user.mandateId),
|
||||||
"userId": str(currentUser.id),
|
"userId": str(current_user.id),
|
||||||
"authenticationAuthority": currentUser.authenticationAuthority
|
"authenticationAuthority": current_user.authenticationAuthority
|
||||||
}
|
}
|
||||||
|
|
||||||
# Create new access token + set cookie
|
# Create new access token + set cookie
|
||||||
|
|
|
||||||
|
|
@ -345,7 +345,7 @@ class SubCoreAi:
|
||||||
options=options
|
options=options
|
||||||
)
|
)
|
||||||
response = await self.aiObjects.call(request)
|
response = await self.aiObjects.call(request)
|
||||||
result = {"metadata": {"title": "AI Response"}, "sections": [{"id": "section_1", "type": "paragraph", "data": {"text": response.content}}]}
|
result = {"metadata": {"title": "AI Response"}, "sections": [{"id": "section_1", "content_type": "paragraph", "elements": [{"text": response.content}]}]}
|
||||||
|
|
||||||
# Convert single-file result to multi-file format if needed
|
# Convert single-file result to multi-file format if needed
|
||||||
if "sections" in result and "documents" not in result:
|
if "sections" in result and "documents" not in result:
|
||||||
|
|
|
||||||
|
|
@ -77,7 +77,8 @@ class SubDocumentGeneration:
|
||||||
documents: Optional[List[ChatDocument]],
|
documents: Optional[List[ChatDocument]],
|
||||||
options: AiCallOptions,
|
options: AiCallOptions,
|
||||||
outputFormat: str,
|
outputFormat: str,
|
||||||
title: Optional[str]
|
title: Optional[str],
|
||||||
|
generationPrompt: Optional[str] = None
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""Handle single-file document generation (existing functionality)."""
|
"""Handle single-file document generation (existing functionality)."""
|
||||||
try:
|
try:
|
||||||
|
|
@ -125,9 +126,72 @@ class SubDocumentGeneration:
|
||||||
except Exception:
|
except Exception:
|
||||||
parsedFilename = None
|
parsedFilename = None
|
||||||
|
|
||||||
# Render the JSON content to the specified format
|
# Use AI generation to enhance the extracted JSON before rendering
|
||||||
|
enhancedContent = aiResponseJson # Default to original
|
||||||
|
if prompt:
|
||||||
|
try:
|
||||||
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||||
|
|
||||||
|
# Get generation prompt
|
||||||
|
generationPrompt = await generation_service.getGenerationPrompt(
|
||||||
|
outputFormat=outputFormat,
|
||||||
|
userPrompt=prompt,
|
||||||
|
title=title,
|
||||||
|
aiService=self
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prepare the AI call
|
||||||
|
request_options = AiCallOptions()
|
||||||
|
request_options.operationType = OperationType.GENERAL
|
||||||
|
|
||||||
|
# Create context with the extracted JSON content
|
||||||
|
import json
|
||||||
|
context = f"Extracted JSON content:\n{json.dumps(aiResponseJson, indent=2)}"
|
||||||
|
|
||||||
|
request = AiCallRequest(
|
||||||
|
prompt=generationPrompt,
|
||||||
|
context=context,
|
||||||
|
options=request_options
|
||||||
|
)
|
||||||
|
|
||||||
|
# Call AI to enhance the content
|
||||||
|
response = await self.aiObjects.call(request)
|
||||||
|
|
||||||
|
if response and response.content:
|
||||||
|
# Parse the AI response as JSON
|
||||||
|
try:
|
||||||
|
import re
|
||||||
|
result = response.content.strip()
|
||||||
|
|
||||||
|
# Extract JSON from markdown if present
|
||||||
|
json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
|
||||||
|
if json_match:
|
||||||
|
result = json_match.group(1).strip()
|
||||||
|
elif result.startswith('```json'):
|
||||||
|
result = re.sub(r'^```json\s*', '', result)
|
||||||
|
result = re.sub(r'\s*```$', '', result)
|
||||||
|
elif result.startswith('```'):
|
||||||
|
result = re.sub(r'^```\s*', '', result)
|
||||||
|
result = re.sub(r'\s*```$', '', result)
|
||||||
|
|
||||||
|
# Try to parse JSON
|
||||||
|
enhancedContent = json.loads(result)
|
||||||
|
logger.info(f"AI enhanced JSON content successfully")
|
||||||
|
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
logger.warning(f"AI generation returned invalid JSON: {str(e)}, using original content")
|
||||||
|
enhancedContent = aiResponseJson
|
||||||
|
else:
|
||||||
|
logger.warning("AI generation returned empty response, using original content")
|
||||||
|
enhancedContent = aiResponseJson
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"AI generation failed: {str(e)}, using original content")
|
||||||
|
enhancedContent = aiResponseJson
|
||||||
|
|
||||||
|
# Render the enhanced JSON content
|
||||||
renderedContent, mimeType = await generation_service.renderReport(
|
renderedContent, mimeType = await generation_service.renderReport(
|
||||||
extractedContent=aiResponseJson,
|
extractedContent=enhancedContent,
|
||||||
outputFormat=outputFormat,
|
outputFormat=outputFormat,
|
||||||
title=title,
|
title=title,
|
||||||
userPrompt=prompt,
|
userPrompt=prompt,
|
||||||
|
|
@ -232,11 +296,8 @@ class SubDocumentGeneration:
|
||||||
# Convert AI format to renderer format
|
# Convert AI format to renderer format
|
||||||
transformed_section = {
|
transformed_section = {
|
||||||
"id": section.get("id", f"section_{len(transformed_sections) + 1}"),
|
"id": section.get("id", f"section_{len(transformed_sections) + 1}"),
|
||||||
"type": section.get("content_type", "paragraph"),
|
"content_type": section.get("content_type", "paragraph"),
|
||||||
"data": {
|
"elements": section.get("elements", []),
|
||||||
"text": "",
|
|
||||||
"elements": section.get("elements", [])
|
|
||||||
},
|
|
||||||
"order": section.get("order", len(transformed_sections) + 1)
|
"order": section.get("order", len(transformed_sections) + 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -246,7 +307,11 @@ class SubDocumentGeneration:
|
||||||
for element in section.get("elements", []):
|
for element in section.get("elements", []):
|
||||||
if "text" in element:
|
if "text" in element:
|
||||||
text_parts.append(element["text"])
|
text_parts.append(element["text"])
|
||||||
transformed_section["data"]["text"] = "\n".join(text_parts)
|
# Add text to the first element or create a new one
|
||||||
|
if transformed_section["elements"]:
|
||||||
|
transformed_section["elements"][0]["text"] = "\n".join(text_parts)
|
||||||
|
else:
|
||||||
|
transformed_section["elements"] = [{"text": "\n".join(text_parts)}]
|
||||||
|
|
||||||
transformed_sections.append(transformed_section)
|
transformed_sections.append(transformed_section)
|
||||||
|
|
||||||
|
|
@ -264,8 +329,72 @@ class SubDocumentGeneration:
|
||||||
"tags": ["multi_file", "ai_generated"]
|
"tags": ["multi_file", "ai_generated"]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Use AI generation to enhance the extracted JSON before rendering
|
||||||
|
enhancedContent = complete_document # Default to original
|
||||||
|
if prompt:
|
||||||
|
try:
|
||||||
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||||
|
|
||||||
|
# Get generation prompt
|
||||||
|
generationPrompt = await generation_service.getGenerationPrompt(
|
||||||
|
outputFormat=outputFormat,
|
||||||
|
userPrompt=prompt,
|
||||||
|
title=doc_data["title"],
|
||||||
|
aiService=self
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prepare the AI call
|
||||||
|
request_options = AiCallOptions()
|
||||||
|
request_options.operationType = OperationType.GENERAL
|
||||||
|
|
||||||
|
# Create context with the extracted JSON content
|
||||||
|
import json
|
||||||
|
context = f"Extracted JSON content:\n{json.dumps(complete_document, indent=2)}"
|
||||||
|
|
||||||
|
request = AiCallRequest(
|
||||||
|
prompt=generationPrompt,
|
||||||
|
context=context,
|
||||||
|
options=request_options
|
||||||
|
)
|
||||||
|
|
||||||
|
# Call AI to enhance the content
|
||||||
|
response = await self.aiObjects.call(request)
|
||||||
|
|
||||||
|
if response and response.content:
|
||||||
|
# Parse the AI response as JSON
|
||||||
|
try:
|
||||||
|
import re
|
||||||
|
result = response.content.strip()
|
||||||
|
|
||||||
|
# Extract JSON from markdown if present
|
||||||
|
json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
|
||||||
|
if json_match:
|
||||||
|
result = json_match.group(1).strip()
|
||||||
|
elif result.startswith('```json'):
|
||||||
|
result = re.sub(r'^```json\s*', '', result)
|
||||||
|
result = re.sub(r'\s*```$', '', result)
|
||||||
|
elif result.startswith('```'):
|
||||||
|
result = re.sub(r'^```\s*', '', result)
|
||||||
|
result = re.sub(r'\s*```$', '', result)
|
||||||
|
|
||||||
|
# Try to parse JSON
|
||||||
|
enhancedContent = json.loads(result)
|
||||||
|
logger.info(f"AI enhanced JSON content successfully")
|
||||||
|
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
logger.warning(f"AI generation returned invalid JSON: {str(e)}, using original content")
|
||||||
|
enhancedContent = complete_document
|
||||||
|
else:
|
||||||
|
logger.warning("AI generation returned empty response, using original content")
|
||||||
|
enhancedContent = complete_document
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"AI generation failed: {str(e)}, using original content")
|
||||||
|
enhancedContent = complete_document
|
||||||
|
|
||||||
|
# Render the enhanced JSON content
|
||||||
rendered_content, mime_type = await generation_service.renderReport(
|
rendered_content, mime_type = await generation_service.renderReport(
|
||||||
extractedContent=complete_document,
|
extractedContent=enhancedContent,
|
||||||
outputFormat=outputFormat,
|
outputFormat=outputFormat,
|
||||||
title=doc_data["title"],
|
title=doc_data["title"],
|
||||||
userPrompt=prompt,
|
userPrompt=prompt,
|
||||||
|
|
@ -477,9 +606,7 @@ Return only the JSON response.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
services = self.services
|
services = self.services
|
||||||
workflow = getattr(services, 'currentWorkflow', None)
|
workflow = services.currentWorkflow
|
||||||
if not workflow:
|
|
||||||
return
|
|
||||||
|
|
||||||
# Serialize payload
|
# Serialize payload
|
||||||
import json as _json
|
import json as _json
|
||||||
|
|
|
||||||
|
|
@ -181,9 +181,8 @@ class SubDocumentProcessing:
|
||||||
from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService
|
from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService
|
||||||
normalizer = NormalizationService(self.services)
|
normalizer = NormalizationService(self.services)
|
||||||
inventory = normalizer.discoverStructures(mergedJsonDocument)
|
inventory = normalizer.discoverStructures(mergedJsonDocument)
|
||||||
# Use workflow id if available as cache key, else default
|
# Use workflow id as cache key
|
||||||
cacheKey = getattr(self.services, 'currentWorkflow', None)
|
cacheKey = self.services.currentWorkflow.id
|
||||||
cacheKey = getattr(cacheKey, 'id', 'workflow_run') if cacheKey else 'workflow_run'
|
|
||||||
# Provide the extraction/merge prompt context when available to help mapping
|
# Provide the extraction/merge prompt context when available to help mapping
|
||||||
mergePrompt = prompt
|
mergePrompt = prompt
|
||||||
mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt)
|
mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt)
|
||||||
|
|
@ -476,8 +475,8 @@ class SubDocumentProcessing:
|
||||||
"metadata": {"title": f"Image Analysis - Chunk {chunk_index}"},
|
"metadata": {"title": f"Image Analysis - Chunk {chunk_index}"},
|
||||||
"sections": [{
|
"sections": [{
|
||||||
"id": f"image_section_{chunk_index}",
|
"id": f"image_section_{chunk_index}",
|
||||||
"type": "paragraph",
|
"content_type": "paragraph",
|
||||||
"data": {"text": fallback_content}
|
"elements": [{"text": fallback_content}]
|
||||||
}]
|
}]
|
||||||
})
|
})
|
||||||
self.services.utils.debugLogToFile(f"Created fallback JSON for image chunk {chunk_index} with actual content", "AI_SERVICE")
|
self.services.utils.debugLogToFile(f"Created fallback JSON for image chunk {chunk_index} with actual content", "AI_SERVICE")
|
||||||
|
|
@ -583,8 +582,8 @@ class SubDocumentProcessing:
|
||||||
"metadata": {"title": f"Document Analysis - Chunk {chunk_index}"},
|
"metadata": {"title": f"Document Analysis - Chunk {chunk_index}"},
|
||||||
"sections": [{
|
"sections": [{
|
||||||
"id": f"analysis_section_{chunk_index}",
|
"id": f"analysis_section_{chunk_index}",
|
||||||
"type": "paragraph",
|
"content_type": "paragraph",
|
||||||
"data": {"text": fallback_content}
|
"elements": [{"text": fallback_content}]
|
||||||
}]
|
}]
|
||||||
})
|
})
|
||||||
self.services.utils.debugLogToFile(f"Created fallback JSON for container chunk {chunk_index} with actual content", "AI_SERVICE")
|
self.services.utils.debugLogToFile(f"Created fallback JSON for container chunk {chunk_index} with actual content", "AI_SERVICE")
|
||||||
|
|
@ -676,8 +675,8 @@ class SubDocumentProcessing:
|
||||||
"metadata": {"title": "Error Section"},
|
"metadata": {"title": "Error Section"},
|
||||||
"sections": [{
|
"sections": [{
|
||||||
"id": f"error_section_{chunk_index}",
|
"id": f"error_section_{chunk_index}",
|
||||||
"type": "paragraph",
|
"content_type": "paragraph",
|
||||||
"data": {"text": f"Error parsing JSON: {str(e)}"}
|
"elements": [{"text": f"Error parsing JSON: {str(e)}"}]
|
||||||
}]
|
}]
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
import logging
|
import logging
|
||||||
import uuid
|
import uuid
|
||||||
|
import json
|
||||||
from typing import Any, Dict, List, Optional, Union, Tuple
|
from typing import Any, Dict, List, Optional, Union, Tuple
|
||||||
from datetime import datetime, UTC
|
from datetime import datetime, UTC
|
||||||
import re
|
import re
|
||||||
|
|
@ -339,24 +340,8 @@ class GenerationService:
|
||||||
if not renderer:
|
if not renderer:
|
||||||
raise ValueError(f"Unsupported output format: {outputFormat}")
|
raise ValueError(f"Unsupported output format: {outputFormat}")
|
||||||
|
|
||||||
# Generate AI-based generation prompt if AI service is available
|
# Render the JSON content directly (AI generation handled by main service)
|
||||||
generationPrompt = userPrompt # Default to user prompt
|
renderedContent, mimeType = await renderer.render(extractedContent, title, userPrompt, aiService)
|
||||||
if aiService and userPrompt:
|
|
||||||
try:
|
|
||||||
from .subPromptBuilder import buildGenerationPrompt
|
|
||||||
generationPrompt = await buildGenerationPrompt(
|
|
||||||
outputFormat=outputFormat,
|
|
||||||
userPrompt=userPrompt,
|
|
||||||
title=title,
|
|
||||||
aiService=aiService,
|
|
||||||
services=self.services
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Failed to generate AI-based generation prompt: {str(e)}, using user prompt")
|
|
||||||
generationPrompt = userPrompt
|
|
||||||
|
|
||||||
# Render the JSON content with AI-generated prompt
|
|
||||||
renderedContent, mimeType = await renderer.render(extractedContent, title, generationPrompt, aiService)
|
|
||||||
# DEBUG: dump rendered output
|
# DEBUG: dump rendered output
|
||||||
try:
|
try:
|
||||||
import os
|
import os
|
||||||
|
|
@ -391,6 +376,23 @@ class GenerationService:
|
||||||
services=self.services
|
services=self.services
|
||||||
)
|
)
|
||||||
|
|
||||||
|
async def getGenerationPrompt(
|
||||||
|
self,
|
||||||
|
outputFormat: str,
|
||||||
|
userPrompt: str,
|
||||||
|
title: str,
|
||||||
|
aiService=None
|
||||||
|
) -> str:
|
||||||
|
"""Get generation prompt for enhancing extracted JSON content."""
|
||||||
|
from .subPromptBuilder import buildGenerationPrompt
|
||||||
|
return await buildGenerationPrompt(
|
||||||
|
outputFormat=outputFormat,
|
||||||
|
userPrompt=userPrompt,
|
||||||
|
title=title,
|
||||||
|
aiService=aiService,
|
||||||
|
services=self.services
|
||||||
|
)
|
||||||
|
|
||||||
async def getGenericExtractionPrompt(
|
async def getGenericExtractionPrompt(
|
||||||
self,
|
self,
|
||||||
outputFormat: str,
|
outputFormat: str,
|
||||||
|
|
|
||||||
|
|
@ -81,11 +81,11 @@ class BaseRenderer(ABC):
|
||||||
if not isinstance(sections, list):
|
if not isinstance(sections, list):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Validate each section has type and data
|
# Validate each section has content_type and elements
|
||||||
for section in sections:
|
for section in sections:
|
||||||
if not isinstance(section, dict):
|
if not isinstance(section, dict):
|
||||||
return False
|
return False
|
||||||
if "type" not in section or "data" not in section:
|
if "content_type" not in section or "elements" not in section:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
@ -159,7 +159,7 @@ class BaseRenderer(ABC):
|
||||||
# Base implementation returns a simple dict
|
# Base implementation returns a simple dict
|
||||||
# Format-specific renderers should override this method
|
# Format-specific renderers should override this method
|
||||||
return {
|
return {
|
||||||
"type": "image",
|
"content_type": "image",
|
||||||
"base64Data": base64_data,
|
"base64Data": base64_data,
|
||||||
"altText": alt_text,
|
"altText": alt_text,
|
||||||
"width": section_data.get("width", None),
|
"width": section_data.get("width", None),
|
||||||
|
|
@ -259,25 +259,25 @@ class BaseRenderer(ABC):
|
||||||
|
|
||||||
if section_type == "table":
|
if section_type == "table":
|
||||||
headers, rows = self._extract_table_data(section_data)
|
headers, rows = self._extract_table_data(section_data)
|
||||||
return {"type": "table", "headers": headers, "rows": rows}
|
return {"content_type": "table", "headers": headers, "rows": rows}
|
||||||
elif section_type == "bullet_list":
|
elif section_type == "bullet_list":
|
||||||
items = self._extract_bullet_list_items(section_data)
|
items = self._extract_bullet_list_items(section_data)
|
||||||
return {"type": "bullet_list", "items": items}
|
return {"content_type": "bullet_list", "items": items}
|
||||||
elif section_type == "heading":
|
elif section_type == "heading":
|
||||||
level, text = self._extract_heading_data(section_data)
|
level, text = self._extract_heading_data(section_data)
|
||||||
return {"type": "heading", "level": level, "text": text}
|
return {"content_type": "heading", "level": level, "text": text}
|
||||||
elif section_type == "paragraph":
|
elif section_type == "paragraph":
|
||||||
text = self._extract_paragraph_text(section_data)
|
text = self._extract_paragraph_text(section_data)
|
||||||
return {"type": "paragraph", "text": text}
|
return {"content_type": "paragraph", "text": text}
|
||||||
elif section_type == "code_block":
|
elif section_type == "code_block":
|
||||||
code, language = self._extract_code_block_data(section_data)
|
code, language = self._extract_code_block_data(section_data)
|
||||||
return {"type": "code_block", "code": code, "language": language}
|
return {"content_type": "code_block", "code": code, "language": language}
|
||||||
elif section_type == "image":
|
elif section_type == "image":
|
||||||
base64_data, alt_text = self._extract_image_data(section_data)
|
base64_data, alt_text = self._extract_image_data(section_data)
|
||||||
# Validate image data
|
# Validate image data
|
||||||
if self._validate_image_data(base64_data, alt_text):
|
if self._validate_image_data(base64_data, alt_text):
|
||||||
return {
|
return {
|
||||||
"type": "image",
|
"content_type": "image",
|
||||||
"base64Data": base64_data,
|
"base64Data": base64_data,
|
||||||
"altText": alt_text,
|
"altText": alt_text,
|
||||||
"width": section_data.get("width"),
|
"width": section_data.get("width"),
|
||||||
|
|
@ -286,11 +286,11 @@ class BaseRenderer(ABC):
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
# Return placeholder if image data is invalid
|
# Return placeholder if image data is invalid
|
||||||
return {"type": "paragraph", "text": f"[Image: {alt_text}]"}
|
return {"content_type": "paragraph", "text": f"[Image: {alt_text}]"}
|
||||||
else:
|
else:
|
||||||
# Fallback to paragraph
|
# Fallback to paragraph
|
||||||
text = self._extract_paragraph_text(section_data)
|
text = self._extract_paragraph_text(section_data)
|
||||||
return {"type": "paragraph", "text": text}
|
return {"content_type": "paragraph", "text": text}
|
||||||
|
|
||||||
def _format_timestamp(self, timestamp: str = None) -> str:
|
def _format_timestamp(self, timestamp: str = None) -> str:
|
||||||
"""Format timestamp for display."""
|
"""Format timestamp for display."""
|
||||||
|
|
|
||||||
|
|
@ -38,7 +38,7 @@ class RendererJson(BaseRenderer):
|
||||||
# Return minimal JSON fallback
|
# Return minimal JSON fallback
|
||||||
fallback_data = {
|
fallback_data = {
|
||||||
"title": title,
|
"title": title,
|
||||||
"sections": [{"type": "paragraph", "data": {"text": f"Error rendering report: {str(e)}"}}],
|
"sections": [{"content_type": "paragraph", "elements": [{"text": f"Error rendering report: {str(e)}"}]}],
|
||||||
"metadata": {"error": str(e)}
|
"metadata": {"error": str(e)}
|
||||||
}
|
}
|
||||||
return json.dumps(fallback_data, indent=2), "application/json"
|
return json.dumps(fallback_data, indent=2), "application/json"
|
||||||
|
|
@ -54,7 +54,7 @@ class RendererJson(BaseRenderer):
|
||||||
if "sections" not in content:
|
if "sections" not in content:
|
||||||
# Convert old format to new format
|
# Convert old format to new format
|
||||||
content = {
|
content = {
|
||||||
"sections": [{"type": "paragraph", "data": {"text": str(content)}}],
|
"sections": [{"content_type": "paragraph", "elements": [{"text": str(content)}]}],
|
||||||
"metadata": {"title": title}
|
"metadata": {"title": title}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -73,7 +73,7 @@ class RendererJson(BaseRenderer):
|
||||||
self.logger.warning(f"Error cleaning JSON content: {str(e)}")
|
self.logger.warning(f"Error cleaning JSON content: {str(e)}")
|
||||||
# Return minimal valid JSON
|
# Return minimal valid JSON
|
||||||
fallback_data = {
|
fallback_data = {
|
||||||
"sections": [{"type": "paragraph", "data": {"text": str(content)}}],
|
"sections": [{"content_type": "paragraph", "elements": [{"text": str(content)}]}],
|
||||||
"metadata": {"title": title, "error": str(e)}
|
"metadata": {"title": title, "error": str(e)}
|
||||||
}
|
}
|
||||||
return json.dumps(fallback_data, indent=2, ensure_ascii=False)
|
return json.dumps(fallback_data, indent=2, ensure_ascii=False)
|
||||||
|
|
|
||||||
|
|
@ -442,7 +442,7 @@ class RendererXlsx(BaseRenderer):
|
||||||
sheet_names = []
|
sheet_names = []
|
||||||
|
|
||||||
# Check if we have multiple table sections
|
# Check if we have multiple table sections
|
||||||
table_sections = [s for s in sections if s.get("type") == "table"]
|
table_sections = [s for s in sections if s.get("content_type") == "table"]
|
||||||
|
|
||||||
if len(table_sections) > 1:
|
if len(table_sections) > 1:
|
||||||
# Create separate sheets for each table
|
# Create separate sheets for each table
|
||||||
|
|
@ -480,7 +480,7 @@ class RendererXlsx(BaseRenderer):
|
||||||
return
|
return
|
||||||
|
|
||||||
sections = json_content.get("sections", [])
|
sections = json_content.get("sections", [])
|
||||||
table_sections = [s for s in sections if s.get("type") == "table"]
|
table_sections = [s for s in sections if s.get("content_type") == "table"]
|
||||||
|
|
||||||
if len(table_sections) > 1:
|
if len(table_sections) > 1:
|
||||||
# Multiple tables - populate each sheet with its corresponding table
|
# Multiple tables - populate each sheet with its corresponding table
|
||||||
|
|
@ -509,10 +509,15 @@ class RendererXlsx(BaseRenderer):
|
||||||
sheet['A1'].font = Font(size=16, bold=True, color=self._get_safe_color(styles.get("title", {}).get("color", "FF1F4E79")))
|
sheet['A1'].font = Font(size=16, bold=True, color=self._get_safe_color(styles.get("title", {}).get("color", "FF1F4E79")))
|
||||||
sheet['A1'].alignment = Alignment(horizontal="center")
|
sheet['A1'].alignment = Alignment(horizontal="center")
|
||||||
|
|
||||||
# Get table data
|
# Get table data from elements (canonical JSON format)
|
||||||
table_data = section.get("data", {})
|
elements = section.get("elements", [])
|
||||||
headers = table_data.get("headers", [])
|
if elements and isinstance(elements, list) and len(elements) > 0:
|
||||||
rows = table_data.get("rows", [])
|
table_data = elements[0]
|
||||||
|
headers = table_data.get("headers", [])
|
||||||
|
rows = table_data.get("rows", [])
|
||||||
|
else:
|
||||||
|
headers = []
|
||||||
|
rows = []
|
||||||
|
|
||||||
if not headers and not rows:
|
if not headers and not rows:
|
||||||
sheet['A3'] = "No table data available"
|
sheet['A3'] = "No table data available"
|
||||||
|
|
@ -683,9 +688,9 @@ class RendererXlsx(BaseRenderer):
|
||||||
def _add_table_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
|
def _add_table_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
|
||||||
"""Add a table element to Excel sheet."""
|
"""Add a table element to Excel sheet."""
|
||||||
try:
|
try:
|
||||||
table_data = element.get("data", {})
|
# In canonical JSON format, table elements have headers and rows directly
|
||||||
headers = table_data.get("headers", [])
|
headers = element.get("headers", [])
|
||||||
rows = table_data.get("rows", [])
|
rows = element.get("rows", [])
|
||||||
|
|
||||||
if not headers and not rows:
|
if not headers and not rows:
|
||||||
return start_row
|
return start_row
|
||||||
|
|
@ -697,7 +702,7 @@ class RendererXlsx(BaseRenderer):
|
||||||
if header_style.get("bold"):
|
if header_style.get("bold"):
|
||||||
cell.font = Font(bold=True, color=self._get_safe_color(header_style.get("text_color", "FF000000")))
|
cell.font = Font(bold=True, color=self._get_safe_color(header_style.get("text_color", "FF000000")))
|
||||||
if header_style.get("background"):
|
if header_style.get("background"):
|
||||||
cell.fill = PatternFill(start_color=header_style["background"], end_color=header_style["background"], fill_type="solid")
|
cell.fill = PatternFill(start_color=self._get_safe_color(header_style["background"]), end_color=self._get_safe_color(header_style["background"]), fill_type="solid")
|
||||||
|
|
||||||
start_row += 1
|
start_row += 1
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,21 +1,21 @@
|
||||||
"""
|
"""
|
||||||
Centralized prompt builder for document generation across formats.
|
Prompt builder for AI document generation and extraction.
|
||||||
|
This module builds prompts for AI services to extract and generate documents.
|
||||||
Builds a robust prompt that:
|
|
||||||
- Accepts any user intent (no fixed structure assumptions)
|
|
||||||
- Injects format-specific guidelines from the selected renderer
|
|
||||||
- Adds a common policy section to always use real data from source docs
|
|
||||||
- Requires the AI to output a filename header that we can parse and use
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
from typing import Protocol, Dict, Any
|
import logging
|
||||||
|
from typing import Dict, Any, Optional, List, TYPE_CHECKING
|
||||||
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||||
|
|
||||||
|
# Type hint for renderer parameter
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .renderers.rendererBaseTemplate import BaseRenderer
|
||||||
|
_RendererLike = BaseRenderer
|
||||||
|
else:
|
||||||
|
_RendererLike = Any
|
||||||
|
|
||||||
class _RendererLike(Protocol):
|
logger = logging.getLogger(__name__)
|
||||||
def getExtractionPrompt(self, user_prompt: str, title: str) -> str: # returns only format-specific guidelines
|
|
||||||
...
|
|
||||||
|
|
||||||
|
|
||||||
async def buildAdaptiveExtractionPrompt(
|
async def buildAdaptiveExtractionPrompt(
|
||||||
outputFormat: str,
|
outputFormat: str,
|
||||||
|
|
@ -25,57 +25,65 @@ async def buildAdaptiveExtractionPrompt(
|
||||||
aiService=None,
|
aiService=None,
|
||||||
services=None
|
services=None
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Build adaptive extraction prompt based on AI analysis."""
|
"""
|
||||||
|
Build adaptive extraction prompt based on AI analysis.
|
||||||
|
Uses multi-file or single-file approach based on analysis.
|
||||||
|
"""
|
||||||
|
|
||||||
# Get appropriate JSON schema based on analysis
|
# Multi-file example data instead of schema
|
||||||
from .subJsonSchema import get_adaptive_json_schema
|
multi_file_example = {
|
||||||
json_schema = get_adaptive_json_schema(promptAnalysis)
|
"metadata": {
|
||||||
|
"title": "Multi-Document Example",
|
||||||
|
"splitStrategy": "by_section",
|
||||||
|
"source_documents": ["doc_001"],
|
||||||
|
"extraction_method": "ai_extraction"
|
||||||
|
},
|
||||||
|
"documents": [
|
||||||
|
{
|
||||||
|
"id": "doc_section_1",
|
||||||
|
"title": "Section 1 Title",
|
||||||
|
"filename": "section_1.xlsx",
|
||||||
|
"sections": [
|
||||||
|
{
|
||||||
|
"id": "table_1",
|
||||||
|
"content_type": "table",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"headers": ["Column 1", "Column 2"],
|
||||||
|
"rows": [["Value 1", "Value 2"]]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"order": 1
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
# Single-file example data instead of schema
|
||||||
|
single_file_example = {
|
||||||
|
"metadata": {
|
||||||
|
"title": "Single Document Example",
|
||||||
|
"source_documents": ["doc_001"],
|
||||||
|
"extraction_method": "ai_extraction"
|
||||||
|
},
|
||||||
|
"sections": [
|
||||||
|
{
|
||||||
|
"id": "table_1",
|
||||||
|
"content_type": "table",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"headers": ["Column 1", "Column 2"],
|
||||||
|
"rows": [["Value 1", "Value 2"]]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"order": 1
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
if promptAnalysis.get("is_multi_file", False):
|
if promptAnalysis.get("is_multi_file", False):
|
||||||
schema_type = "multi-document"
|
# Multi-file prompt
|
||||||
else:
|
|
||||||
schema_type = "single-document"
|
|
||||||
|
|
||||||
# Build adaptive prompt using AI analysis - match single-file style
|
|
||||||
if promptAnalysis.get("is_multi_file", False):
|
|
||||||
# Multi-file prompt - use simple example format like single-file
|
|
||||||
multi_file_example = {
|
|
||||||
"metadata": {
|
|
||||||
"title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE",
|
|
||||||
"splitStrategy": "by_section"
|
|
||||||
},
|
|
||||||
"documents": [
|
|
||||||
{
|
|
||||||
"id": "doc_1",
|
|
||||||
"title": "REPLACE_WITH_ACTUAL_SECTION_TITLE",
|
|
||||||
"filename": "REPLACE_WITH_ACTUAL_FILENAME",
|
|
||||||
"sections": [
|
|
||||||
{
|
|
||||||
"id": "section_1",
|
|
||||||
"content_type": "heading",
|
|
||||||
"elements": [
|
|
||||||
{
|
|
||||||
"text": "REPLACE_WITH_ACTUAL_HEADING_TEXT",
|
|
||||||
"level": 1
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"order": 1
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "section_2",
|
|
||||||
"content_type": "paragraph",
|
|
||||||
"elements": [
|
|
||||||
{
|
|
||||||
"text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"order": 2
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
adaptive_prompt = f"""
|
adaptive_prompt = f"""
|
||||||
{userPrompt}
|
{userPrompt}
|
||||||
|
|
||||||
|
|
@ -134,16 +142,31 @@ Return only the JSON structure with actual data from the documents. Do not inclu
|
||||||
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
|
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
|
||||||
""".strip()
|
""".strip()
|
||||||
else:
|
else:
|
||||||
# Single-file prompt - use original style
|
# Single-file prompt - use example data instead of schema
|
||||||
adaptive_prompt = f"""
|
adaptive_prompt = f"""
|
||||||
{userPrompt}
|
{userPrompt}
|
||||||
|
|
||||||
You are extracting structured content from documents and must respond with valid JSON only.
|
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
|
||||||
|
|
||||||
IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.
|
TASK: Extract the actual content from the document and organize it into structured sections.
|
||||||
|
|
||||||
Extract the actual data from the source documents and structure it as JSON with this format:
|
REQUIREMENTS:
|
||||||
{json.dumps(json_schema, indent=2)}
|
1. Analyze the document content provided in the context below
|
||||||
|
2. Extract all content and organize it into logical sections
|
||||||
|
3. Create structured JSON with sections containing the extracted content
|
||||||
|
4. Preserve the original structure and data
|
||||||
|
|
||||||
|
OUTPUT FORMAT: Return only valid JSON in this exact structure:
|
||||||
|
{json.dumps(single_file_example, indent=2)}
|
||||||
|
|
||||||
|
INSTRUCTIONS:
|
||||||
|
- Replace example data with actual content from the document
|
||||||
|
- Use actual headings, paragraphs, and text from the document
|
||||||
|
- Ensure all content is properly structured
|
||||||
|
- Do not use generic placeholder text
|
||||||
|
- Extract real content from the documents
|
||||||
|
|
||||||
|
CONTEXT (Document Content):
|
||||||
|
|
||||||
Content Types to Extract:
|
Content Types to Extract:
|
||||||
1. Tables: Extract all rows and columns with proper headers
|
1. Tables: Extract all rows and columns with proper headers
|
||||||
|
|
@ -220,22 +243,53 @@ Consider the user's intent and the most logical way to organize the extracted co
|
||||||
services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER")
|
services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER")
|
||||||
|
|
||||||
# Fallback to single-file prompt
|
# Fallback to single-file prompt
|
||||||
from .subJsonSchema import get_document_subJsonSchema
|
example_data = {
|
||||||
json_schema = get_document_subJsonSchema()
|
"metadata": {
|
||||||
|
"title": "Example Document",
|
||||||
|
"author": "AI Assistant",
|
||||||
|
"source_documents": ["document_001"],
|
||||||
|
"extraction_method": "ai_extraction"
|
||||||
|
},
|
||||||
|
"sections": [
|
||||||
|
{
|
||||||
|
"id": "section_001",
|
||||||
|
"content_type": "table",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"headers": ["Column 1", "Column 2", "Column 3"],
|
||||||
|
"rows": [
|
||||||
|
["Value 1", "Value 2", "Value 3"],
|
||||||
|
["Value 4", "Value 5", "Value 6"]
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"order": 1,
|
||||||
|
"metadata": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"summary": "",
|
||||||
|
"tags": []
|
||||||
|
}
|
||||||
|
|
||||||
return f"""
|
return f"""
|
||||||
{userPrompt}
|
{userPrompt}
|
||||||
|
|
||||||
You are extracting structured content from documents and must respond with valid JSON only.
|
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
|
||||||
|
|
||||||
CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.
|
TASK: Extract the actual content from the document and organize it into structured sections.
|
||||||
|
|
||||||
Extract the actual data from the source documents and structure it as JSON with this format:
|
REQUIREMENTS:
|
||||||
{json.dumps(json_schema, indent=2)}
|
1. Analyze the document content provided in the context below
|
||||||
|
2. Extract all content and organize it into logical sections
|
||||||
|
3. Create structured JSON with sections containing the extracted content
|
||||||
|
4. Preserve the original structure and data
|
||||||
|
|
||||||
|
OUTPUT FORMAT: Return only valid JSON in this exact structure:
|
||||||
|
{json.dumps(example_data, indent=2)}
|
||||||
|
|
||||||
Requirements:
|
Requirements:
|
||||||
- Preserve all original data - do not summarize or interpret
|
- Preserve all original data - do not summarize or interpret
|
||||||
- Use the exact JSON schema provided
|
- Use the exact JSON format shown above
|
||||||
- Maintain data integrity and structure
|
- Maintain data integrity and structure
|
||||||
|
|
||||||
Content Types to Extract:
|
Content Types to Extract:
|
||||||
|
|
@ -286,16 +340,55 @@ async def buildExtractionPrompt(
|
||||||
from .subJsonSchema import get_document_subJsonSchema
|
from .subJsonSchema import get_document_subJsonSchema
|
||||||
jsonSchema = get_document_subJsonSchema()
|
jsonSchema = get_document_subJsonSchema()
|
||||||
|
|
||||||
# Generic block for JSON extraction - use proper schema instead of hardcoded template
|
# Generic block for JSON extraction - use example data instead of schema
|
||||||
|
example_data = {
|
||||||
|
"metadata": {
|
||||||
|
"title": "Example Document",
|
||||||
|
"author": "AI Assistant",
|
||||||
|
"source_documents": ["document_001"],
|
||||||
|
"extraction_method": "ai_extraction"
|
||||||
|
},
|
||||||
|
"sections": [
|
||||||
|
{
|
||||||
|
"id": "section_001",
|
||||||
|
"content_type": "table",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"headers": ["Column 1", "Column 2", "Column 3"],
|
||||||
|
"rows": [
|
||||||
|
["Value 1", "Value 2", "Value 3"],
|
||||||
|
["Value 4", "Value 5", "Value 6"]
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"order": 1,
|
||||||
|
"metadata": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"summary": "",
|
||||||
|
"tags": []
|
||||||
|
}
|
||||||
|
|
||||||
genericIntro = f"""
|
genericIntro = f"""
|
||||||
{extractionIntent}
|
{extractionIntent}
|
||||||
|
|
||||||
You are extracting structured content from documents and must respond with valid JSON only.
|
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
|
||||||
|
|
||||||
CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.
|
TASK: Extract the actual content from the document and organize it into structured sections.
|
||||||
|
|
||||||
Extract the actual data from the source documents and structure it as JSON with this format:
|
REQUIREMENTS:
|
||||||
{json.dumps(jsonSchema, indent=2)}
|
1. Analyze the document content provided in the context below
|
||||||
|
2. Extract all content and organize it into logical sections
|
||||||
|
3. Create structured JSON with sections containing the extracted content
|
||||||
|
4. Preserve the original structure and data
|
||||||
|
|
||||||
|
OUTPUT FORMAT: Return only valid JSON in this exact structure:
|
||||||
|
{json.dumps(example_data, indent=2)}
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
- Preserve all original data - do not summarize or interpret
|
||||||
|
- Use the exact JSON format shown above
|
||||||
|
- Maintain data integrity and structure
|
||||||
|
|
||||||
Content Types to Extract:
|
Content Types to Extract:
|
||||||
1. Tables: Extract all rows and columns with proper headers
|
1. Tables: Extract all rows and columns with proper headers
|
||||||
|
|
@ -317,15 +410,20 @@ Return only the JSON structure with actual data from the documents. Do not inclu
|
||||||
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
|
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
|
||||||
|
|
||||||
DO NOT return a schema description - return actual extracted content in the JSON format shown above.
|
DO NOT return a schema description - return actual extracted content in the JSON format shown above.
|
||||||
""".strip()
|
"""
|
||||||
|
|
||||||
# Final assembly
|
# Get format-specific guidelines from renderer
|
||||||
finalPrompt = genericIntro
|
formatGuidelines = ""
|
||||||
|
try:
|
||||||
|
if hasattr(renderer, 'getExtractionGuidelines'):
|
||||||
|
formatGuidelines = renderer.getExtractionGuidelines()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Combine all parts
|
||||||
|
finalPrompt = f"{genericIntro}\n\n{formatGuidelines}".strip()
|
||||||
|
|
||||||
# Debug output
|
# Save extraction prompt to debug file - only if debug enabled
|
||||||
services.utils.debugLogToFile(f"EXTRACTION INTENT: Processed", "PROMPT_BUILDER")
|
|
||||||
|
|
||||||
# Save full extraction prompt to debug file - only if debug enabled
|
|
||||||
try:
|
try:
|
||||||
debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
||||||
if debug_enabled:
|
if debug_enabled:
|
||||||
|
|
@ -335,8 +433,7 @@ DO NOT return a schema description - return actual extracted content in the JSON
|
||||||
debug_root = "./test-chat/ai"
|
debug_root = "./test-chat/ai"
|
||||||
os.makedirs(debug_root, exist_ok=True)
|
os.makedirs(debug_root, exist_ok=True)
|
||||||
with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f:
|
with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f:
|
||||||
f.write(f"EXTRACTION PROMPT:\n{finalPrompt}\n\n")
|
f.write(finalPrompt)
|
||||||
f.write(f"EXTRACTION INTENT:\n{extractionIntent}\n")
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
@ -367,24 +464,46 @@ async def buildGenerationPrompt(
|
||||||
|
|
||||||
# AI call to generate the appropriate generation prompt
|
# AI call to generate the appropriate generation prompt
|
||||||
generationPromptRequest = f"""
|
generationPromptRequest = f"""
|
||||||
Based on this user request, create a detailed generation prompt for creating a {outputFormat} document.
|
You are creating instructions for an AI to generate JSON content in the CANONICAL FORMAT that will be converted to a {outputFormat} document.
|
||||||
|
|
||||||
User request: "{safeUserPrompt}"
|
User request: "{safeUserPrompt}"
|
||||||
Document title: "{title}"
|
Document title: "{title}"
|
||||||
Output format: {outputFormat}
|
Target format: {outputFormat}
|
||||||
|
|
||||||
Create a generation prompt that:
|
Write clear, detailed instructions that tell the AI how to generate JSON content using the CANONICAL JSON FORMAT. Focus on:
|
||||||
1. Identifies what content is most important for the user
|
|
||||||
2. Specifies how to structure and organize the content
|
|
||||||
3. Includes any specific formatting or presentation requirements
|
|
||||||
4. Preserves any language requirements
|
|
||||||
5. Ensures the document meets the user's needs
|
|
||||||
|
|
||||||
IMPORTANT: Always generate content in STANDARDIZED JSON FORMAT. In your response, include the exact text "PLACEHOLDER_FOR_FORMAT_RULES" where specific format rules will be inserted afterwards automatically.
|
1. What content is most important for the user
|
||||||
|
2. How to structure and organize the content using the canonical JSON format with 'sections'
|
||||||
|
3. Specific formatting requirements for the target format
|
||||||
|
4. Language requirements to preserve
|
||||||
|
5. How to ensure the JSON content meets the user's needs
|
||||||
|
|
||||||
CRITICAL: You MUST start your response with exactly "Generate a {outputFormat} document that:" - do NOT use "docx" or any other format. Use the exact format specified: {outputFormat}
|
CRITICAL: The AI MUST generate content using the CANONICAL JSON FORMAT with this exact structure:
|
||||||
|
{{
|
||||||
|
"metadata": {{
|
||||||
|
"title": "Document Title"
|
||||||
|
}},
|
||||||
|
"sections": [
|
||||||
|
{{
|
||||||
|
"id": "section_1",
|
||||||
|
"content_type": "table",
|
||||||
|
"elements": [
|
||||||
|
{{
|
||||||
|
"headers": ["Column1", "Column2", "Column3"],
|
||||||
|
"rows": [
|
||||||
|
["Value1", "Value2", "Value3"],
|
||||||
|
["Value4", "Value5", "Value6"]
|
||||||
|
]
|
||||||
|
}}
|
||||||
|
],
|
||||||
|
"order": 1
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}}
|
||||||
|
|
||||||
Return only the generation prompt, starting with "Generate a {outputFormat} document that..."
|
The AI should NOT create format-specific structures like "sheets" or "columns" - only use the canonical format with "sections" and "elements".
|
||||||
|
|
||||||
|
Write the instructions as plain text, not JSON. Start with "Generate JSON content that..." and provide clear, actionable instructions for creating structured JSON data in the canonical format.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Call AI service to generate the prompt
|
# Call AI service to generate the prompt
|
||||||
|
|
@ -423,7 +542,7 @@ Return only the generation prompt, starting with "Generate a {outputFormat} docu
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}"
|
return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Fallback on any error - preserve user prompt for language instructions
|
# Fallback on any error - preserve user prompt for language instructions
|
||||||
|
|
@ -433,105 +552,104 @@ Return only the generation prompt, starting with "Generate a {outputFormat} docu
|
||||||
|
|
||||||
def _getFormatRules(outputFormat: str) -> str:
|
def _getFormatRules(outputFormat: str) -> str:
|
||||||
"""
|
"""
|
||||||
Get format-specific rules for JSON-based generation.
|
Get format-specific rules for the generation prompt.
|
||||||
Since we now use standardized JSON, all formats follow the same rules.
|
|
||||||
"""
|
"""
|
||||||
return """
|
format_rules = {
|
||||||
- Generate content in standardized JSON format following the document schema
|
"xlsx": """
|
||||||
- Tables: Use JSON table format with headers and rows arrays
|
XLSX Format Rules:
|
||||||
- Lists: Use JSON list format with items array
|
- Create tables with clear headers and organized data
|
||||||
- Text: Use JSON paragraph format with text field
|
- Use appropriate column widths and formatting
|
||||||
- Headings: Use JSON heading format with level field
|
- Include summary information if relevant
|
||||||
- Structure: Follow the document JSON schema exactly
|
- Ensure data is properly structured for spreadsheet analysis
|
||||||
""".strip()
|
""",
|
||||||
|
"pdf": """
|
||||||
|
PDF Format Rules:
|
||||||
|
- Create professional document layout
|
||||||
|
- Use appropriate headings and sections
|
||||||
|
- Include proper spacing and formatting
|
||||||
|
- Ensure content is well-organized and readable
|
||||||
|
""",
|
||||||
|
"docx": """
|
||||||
|
DOCX Format Rules:
|
||||||
|
- Create professional document layout
|
||||||
|
- Use appropriate headings and sections
|
||||||
|
- Include proper spacing and formatting
|
||||||
|
- Ensure content is well-organized and readable
|
||||||
|
""",
|
||||||
|
"html": """
|
||||||
|
HTML Format Rules:
|
||||||
|
- Create clean, semantic HTML structure
|
||||||
|
- Use appropriate tags for content organization
|
||||||
|
- Include proper styling classes
|
||||||
|
- Ensure content is accessible and well-formatted
|
||||||
|
""",
|
||||||
|
"json": """
|
||||||
|
JSON Format Rules:
|
||||||
|
- Create well-structured JSON data
|
||||||
|
- Use appropriate nesting and organization
|
||||||
|
- Include metadata and context information
|
||||||
|
- Ensure data is properly formatted and valid
|
||||||
|
""",
|
||||||
|
"csv": """
|
||||||
|
CSV Format Rules:
|
||||||
|
- Create clear, organized tabular data
|
||||||
|
- Use appropriate headers and data types
|
||||||
|
- Ensure proper CSV formatting
|
||||||
|
- Include all relevant data in structured format
|
||||||
|
""",
|
||||||
|
"txt": """
|
||||||
|
TXT Format Rules:
|
||||||
|
- Create clean, readable text format
|
||||||
|
- Use appropriate spacing and organization
|
||||||
|
- Include clear headings and sections
|
||||||
|
- Ensure content is well-structured and easy to read
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
return format_rules.get(outputFormat.lower(), f"""
|
||||||
|
{outputFormat.upper()} Format Rules:
|
||||||
|
- Create well-structured content appropriate for {outputFormat}
|
||||||
|
- Use appropriate formatting and organization
|
||||||
|
- Ensure content is clear and professional
|
||||||
|
- Include all relevant information in proper format
|
||||||
|
""")
|
||||||
|
|
||||||
|
|
||||||
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
|
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
|
||||||
"""
|
"""
|
||||||
Use AI to extract a rich, structured extraction intent from the user prompt.
|
Parse user prompt to extract the core extraction intent.
|
||||||
Include language, normalization, structure needs, headers, formats, row strategy, and multi-file guidance.
|
|
||||||
"""
|
"""
|
||||||
if not aiService:
|
if not aiService:
|
||||||
# Fallback if no AI service available
|
return f"Extract content from the provided documents and create a {outputFormat} report."
|
||||||
return "Extract all relevant content from the document according to the user's requirements"
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Protect userPrompt from injection by escaping quotes and newlines
|
analysis_prompt = f"""
|
||||||
safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
|
Analyze this user request and extract the core extraction intent:
|
||||||
|
|
||||||
# Rich analysis to derive a complete extraction intent and structure guidance
|
|
||||||
extractionPrompt = f"""
|
|
||||||
Analyze the user's request and produce a RICH extraction intent. Return ONLY JSON.
|
|
||||||
|
|
||||||
Goals:
|
User request: "{userPrompt}"
|
||||||
- Detect language and normalize the request into a full, explicit instruction (no summary; preserve all constraints and details).
|
Target format: {outputFormat}
|
||||||
- Decide if structured data is required; if so, define the target structure precisely (headers, order, formats, row strategy).
|
|
||||||
- Identify if multi-file output is appropriate and how to split/files name.
|
|
||||||
|
|
||||||
User request: "{safeUserPrompt}"
|
Extract the main intent and requirements for document processing. Focus on:
|
||||||
|
1. What content needs to be extracted
|
||||||
|
2. How it should be organized
|
||||||
|
3. Any specific requirements or preferences
|
||||||
|
|
||||||
Return JSON in this exact shape:
|
Respond with a clear, concise statement of the extraction intent.
|
||||||
{{
|
|
||||||
"detectedLanguage": "de|en|fr|it|...",
|
|
||||||
"normalizedRequest": "Full explicit instruction in detected language",
|
|
||||||
"requiresStructuredData": true|false,
|
|
||||||
"targetStructure": "table|list|mixed|unstructured",
|
|
||||||
"table": {{
|
|
||||||
"headers": ["Header1", "Header2", "..."],
|
|
||||||
"headerOrderStrict": true|false,
|
|
||||||
"rowStrategy": "one_row_per_document|one_row_per_entity|one_row_per_vat_rate|custom",
|
|
||||||
"formats": {{
|
|
||||||
"dateFormat": "DD.MM.YYYY|YYYY-MM-DD|...",
|
|
||||||
"amountDecimals": 2,
|
|
||||||
"currencyFormat": "code|symbol",
|
|
||||||
"idMasking": "none|last4|custom"
|
|
||||||
}}
|
|
||||||
}},
|
|
||||||
"multiFile": true|false,
|
|
||||||
"fileSplitStrategy": "single|per_entity|by_section|by_criteria|custom",
|
|
||||||
"fileNamingPattern": "suggested pattern for filenames",
|
|
||||||
"constraints": ["List of critical constraints to enforce"],
|
|
||||||
"reasoning": "Brief justification (one sentence)"
|
|
||||||
}}
|
|
||||||
|
|
||||||
Rules:
|
|
||||||
- Preserve user terminology and language in normalizedRequest.
|
|
||||||
- If the user listed columns/fields, copy them exactly into table.headers and set headerOrderStrict=true.
|
|
||||||
- If the user implies separate rows for rates/entities, set an appropriate rowStrategy (e.g., one_row_per_vat_rate).
|
|
||||||
- If no structure is required, set requiresStructuredData=false and targetStructure="unstructured".
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Call AI service to extract intention
|
|
||||||
services.utils.debugLogToFile("DEBUG: Calling AI for extraction intent...", "PROMPT_BUILDER")
|
|
||||||
|
|
||||||
# Import and set proper options for AI call
|
|
||||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||||
request_options = AiCallOptions()
|
request_options = AiCallOptions()
|
||||||
request_options.operationType = OperationType.GENERAL
|
request_options.operationType = OperationType.GENERAL
|
||||||
|
|
||||||
request = AiCallRequest(prompt=extractionPrompt, context="", options=request_options)
|
request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
|
||||||
response = await aiService.aiObjects.call(request)
|
response = await aiService.aiObjects.call(request)
|
||||||
result = response.content if response else ""
|
|
||||||
services.utils.debugLogToFile(f"DEBUG: Extraction intent processed", "PROMPT_BUILDER")
|
|
||||||
|
|
||||||
# Try to extract and pretty print JSON
|
|
||||||
if result:
|
|
||||||
import re, json as _json
|
|
||||||
match = re.search(r'\{[\s\S]*\}', result)
|
|
||||||
if match:
|
|
||||||
try:
|
|
||||||
obj = _json.loads(match.group(0))
|
|
||||||
return _json.dumps(obj, ensure_ascii=False, indent=2)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Fallback to previous simple format
|
|
||||||
return f"Extract: {safeUserPrompt}"
|
|
||||||
|
|
||||||
|
if response and response.content:
|
||||||
|
return response.content.strip()
|
||||||
|
else:
|
||||||
|
return f"Extract content from the provided documents and create a {outputFormat} report."
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Fallback on any error - preserve user prompt for language instructions
|
services.utils.debugLogToFile(f"Extraction intent analysis failed: {str(e)}", "PROMPT_BUILDER")
|
||||||
services.utils.debugLogToFile(f"DEBUG: AI extraction intent failed: {str(e)}", "PROMPT_BUILDER")
|
return f"Extract content from the provided documents and create a {outputFormat} report."
|
||||||
return f"Extract: {userPrompt}"
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -28,13 +28,17 @@ class NormalizationService:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Extract table data from elements array
|
# Extract table data from elements array
|
||||||
|
hdrs = []
|
||||||
|
rows = []
|
||||||
for element in section.get("elements", []):
|
for element in section.get("elements", []):
|
||||||
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
||||||
hdrs = element.get("headers") or []
|
hdrs = element.get("headers") or []
|
||||||
rows = element.get("rows") or []
|
rows = element.get("rows") or []
|
||||||
break
|
break
|
||||||
else:
|
|
||||||
|
if not hdrs or not rows:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
for h in hdrs:
|
for h in hdrs:
|
||||||
if not isinstance(h, str):
|
if not isinstance(h, str):
|
||||||
continue
|
continue
|
||||||
|
|
@ -122,13 +126,14 @@ class NormalizationService:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Extract table data from elements array
|
# Extract table data from elements array
|
||||||
|
sourceHeaders = []
|
||||||
|
sourceRows = []
|
||||||
for element in section.get("elements", []):
|
for element in section.get("elements", []):
|
||||||
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
||||||
sourceHeaders = element.get("headers") or []
|
sourceHeaders = element.get("headers") or []
|
||||||
sourceRows = element.get("rows") or []
|
sourceRows = element.get("rows") or []
|
||||||
break
|
break
|
||||||
else:
|
|
||||||
continue
|
|
||||||
if not sourceHeaders or not sourceRows:
|
if not sourceHeaders or not sourceRows:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -78,11 +78,15 @@ class WorkflowService:
|
||||||
def getChatDocumentsFromDocumentList(self, documentList: List[str]) -> List[ChatDocument]:
|
def getChatDocumentsFromDocumentList(self, documentList: List[str]) -> List[ChatDocument]:
|
||||||
"""Get ChatDocuments from a list of document references using all three formats."""
|
"""Get ChatDocuments from a list of document references using all three formats."""
|
||||||
try:
|
try:
|
||||||
# Get the current workflow from services (same pattern as setWorkflowContext)
|
workflow = self.services.currentWorkflow
|
||||||
workflow = getattr(self.services, 'currentWorkflow', None) or self.workflow
|
|
||||||
if not workflow:
|
# Reload workflow from database to ensure we have all messages
|
||||||
logger.error("No workflow available for document list resolution")
|
if hasattr(workflow, 'id'):
|
||||||
return []
|
try:
|
||||||
|
workflow = self.getWorkflow(workflow.id)
|
||||||
|
logger.debug(f"Reloaded workflow {workflow.id} with {len(workflow.messages)} messages")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Could not reload workflow from database: {str(e)}")
|
||||||
|
|
||||||
all_documents = []
|
all_documents = []
|
||||||
for doc_ref in documentList:
|
for doc_ref in documentList:
|
||||||
|
|
@ -418,11 +422,7 @@ class WorkflowService:
|
||||||
def setWorkflowContext(self, round_number: int = None, task_number: int = None, action_number: int = None):
|
def setWorkflowContext(self, round_number: int = None, task_number: int = None, action_number: int = None):
|
||||||
"""Set current workflow context for document generation and routing"""
|
"""Set current workflow context for document generation and routing"""
|
||||||
try:
|
try:
|
||||||
# Get the current workflow from services
|
workflow = self.services.currentWorkflow
|
||||||
workflow = getattr(self.services, 'currentWorkflow', None) or self.workflow
|
|
||||||
if not workflow:
|
|
||||||
logger.error("No workflow available for context setting")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Prepare update data
|
# Prepare update data
|
||||||
update_data = {}
|
update_data = {}
|
||||||
|
|
@ -529,10 +529,7 @@ class WorkflowService:
|
||||||
def getDocumentCount(self) -> str:
|
def getDocumentCount(self) -> str:
|
||||||
"""Get document count for task planning (matching old handlingTasks.py logic)"""
|
"""Get document count for task planning (matching old handlingTasks.py logic)"""
|
||||||
try:
|
try:
|
||||||
# Get the current workflow from services
|
workflow = self.services.currentWorkflow
|
||||||
workflow = getattr(self.services, 'currentWorkflow', None) or self.workflow
|
|
||||||
if not workflow:
|
|
||||||
return "No documents available"
|
|
||||||
|
|
||||||
# Count documents from all messages in the workflow (like old system)
|
# Count documents from all messages in the workflow (like old system)
|
||||||
total_docs = 0
|
total_docs = 0
|
||||||
|
|
@ -551,10 +548,7 @@ class WorkflowService:
|
||||||
def getWorkflowHistoryContext(self) -> str:
|
def getWorkflowHistoryContext(self) -> str:
|
||||||
"""Get workflow history context for task planning (matching old handlingTasks.py logic)"""
|
"""Get workflow history context for task planning (matching old handlingTasks.py logic)"""
|
||||||
try:
|
try:
|
||||||
# Get the current workflow from services
|
workflow = self.services.currentWorkflow
|
||||||
workflow = getattr(self.services, 'currentWorkflow', None) or self.workflow
|
|
||||||
if not workflow:
|
|
||||||
return "No previous round context available"
|
|
||||||
|
|
||||||
# Check if there are any previous rounds by looking for "first" messages
|
# Check if there are any previous rounds by looking for "first" messages
|
||||||
has_previous_rounds = False
|
has_previous_rounds = False
|
||||||
|
|
|
||||||
|
|
@ -226,7 +226,7 @@ class ReactMode(BaseMode):
|
||||||
|
|
||||||
# Get available documents from the current workflow
|
# Get available documents from the current workflow
|
||||||
try:
|
try:
|
||||||
available_docs = self.services.workflow.getAvailableDocuments(context.workflow)
|
available_docs = self.services.workflow.getAvailableDocuments(self.services.currentWorkflow)
|
||||||
if not available_docs or available_docs == "No documents available":
|
if not available_docs or available_docs == "No documents available":
|
||||||
logger.warning("No documents available for validation")
|
logger.warning("No documents available for validation")
|
||||||
return
|
return
|
||||||
|
|
|
||||||
|
|
@ -68,20 +68,12 @@ def extractWorkflowHistory(service: Any, context: Any) -> str:
|
||||||
"""Extract workflow history from context. Maps to {{KEY:WORKFLOW_HISTORY}}
|
"""Extract workflow history from context. Maps to {{KEY:WORKFLOW_HISTORY}}
|
||||||
Reverse-chronological, enriched with message summaries and document labels.
|
Reverse-chronological, enriched with message summaries and document labels.
|
||||||
"""
|
"""
|
||||||
# Prefer explicit workflow on context; else fall back to services.workflow
|
|
||||||
workflow = None
|
|
||||||
try:
|
try:
|
||||||
if hasattr(context, 'workflow') and context.workflow:
|
history = getPreviousRoundContext(service, service.currentWorkflow)
|
||||||
workflow = context.workflow
|
|
||||||
elif hasattr(service, 'workflow') and service.workflow:
|
|
||||||
workflow = service.workflow
|
|
||||||
except Exception:
|
|
||||||
workflow = None
|
|
||||||
|
|
||||||
if workflow:
|
|
||||||
history = getPreviousRoundContext(service, workflow)
|
|
||||||
return history or "No previous workflow rounds available"
|
return history or "No previous workflow rounds available"
|
||||||
return "No previous workflow rounds available"
|
except Exception as e:
|
||||||
|
logger.error(f"Error getting workflow history: {str(e)}")
|
||||||
|
return "No previous workflow rounds available"
|
||||||
|
|
||||||
def extractAvailableMethods(service: Any) -> str:
|
def extractAvailableMethods(service: Any) -> str:
|
||||||
"""Extract available methods for action planning. Maps to {{KEY:AVAILABLE_METHODS}}"""
|
"""Extract available methods for action planning. Maps to {{KEY:AVAILABLE_METHODS}}"""
|
||||||
|
|
@ -390,7 +382,7 @@ def extractLatestRefinementFeedback(context: Any) -> str:
|
||||||
def extractAvailableDocumentsSummary(service: Any, context: Any) -> str:
|
def extractAvailableDocumentsSummary(service: Any, context: Any) -> str:
|
||||||
"""Summary of available documents (count only)."""
|
"""Summary of available documents (count only)."""
|
||||||
try:
|
try:
|
||||||
documents = service.workflow.getAvailableDocuments(context.workflow)
|
documents = service.workflow.getAvailableDocuments(service.currentWorkflow)
|
||||||
if documents and documents != "No documents available":
|
if documents and documents != "No documents available":
|
||||||
# Count only actual documents, not list labels
|
# Count only actual documents, not list labels
|
||||||
doc_count = documents.count("docItem:")
|
doc_count = documents.count("docItem:")
|
||||||
|
|
@ -403,7 +395,7 @@ def extractAvailableDocumentsSummary(service: Any, context: Any) -> str:
|
||||||
def extractAvailableDocumentsIndex(service: Any, context: Any) -> str:
|
def extractAvailableDocumentsIndex(service: Any, context: Any) -> str:
|
||||||
"""Index of available documents with detailed references for parameter generation."""
|
"""Index of available documents with detailed references for parameter generation."""
|
||||||
try:
|
try:
|
||||||
return service.workflow.getAvailableDocuments(context.workflow)
|
return service.workflow.getAvailableDocuments(service.currentWorkflow)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error getting document index: {str(e)}")
|
logger.error(f"Error getting document index: {str(e)}")
|
||||||
return "No documents available"
|
return "No documents available"
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue