Workflow end-to-end validated - start variant testing
This commit is contained in:
parent
bdc87eb5c6
commit
e0afc72e13
13 changed files with 541 additions and 285 deletions
|
|
@ -263,8 +263,7 @@ async def read_user_me(
|
|||
@limiter.limit("60/minute")
|
||||
async def refresh_token(
|
||||
request: Request,
|
||||
response: Response,
|
||||
currentUser: User = Depends(getCurrentUser)
|
||||
response: Response
|
||||
) -> Dict[str, Any]:
|
||||
"""Refresh access token using refresh token from cookie"""
|
||||
try:
|
||||
|
|
@ -283,12 +282,27 @@ async def refresh_token(
|
|||
except jwt.JWTError:
|
||||
raise HTTPException(status_code=401, detail="Invalid refresh token")
|
||||
|
||||
# Get user information from refresh token payload
|
||||
user_id = payload.get("userId")
|
||||
if not user_id:
|
||||
raise HTTPException(status_code=401, detail="Invalid refresh token - missing user ID")
|
||||
|
||||
# Get user from database using the user ID from refresh token
|
||||
try:
|
||||
app_interface = getRootInterface()
|
||||
current_user = app_interface.getUser(user_id)
|
||||
if not current_user:
|
||||
raise HTTPException(status_code=401, detail="User not found")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get user from database: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail="Failed to validate user")
|
||||
|
||||
# Create new token data
|
||||
token_data = {
|
||||
"sub": currentUser.username,
|
||||
"mandateId": str(currentUser.mandateId),
|
||||
"userId": str(currentUser.id),
|
||||
"authenticationAuthority": currentUser.authenticationAuthority
|
||||
"sub": current_user.username,
|
||||
"mandateId": str(current_user.mandateId),
|
||||
"userId": str(current_user.id),
|
||||
"authenticationAuthority": current_user.authenticationAuthority
|
||||
}
|
||||
|
||||
# Create new access token + set cookie
|
||||
|
|
|
|||
|
|
@ -345,7 +345,7 @@ class SubCoreAi:
|
|||
options=options
|
||||
)
|
||||
response = await self.aiObjects.call(request)
|
||||
result = {"metadata": {"title": "AI Response"}, "sections": [{"id": "section_1", "type": "paragraph", "data": {"text": response.content}}]}
|
||||
result = {"metadata": {"title": "AI Response"}, "sections": [{"id": "section_1", "content_type": "paragraph", "elements": [{"text": response.content}]}]}
|
||||
|
||||
# Convert single-file result to multi-file format if needed
|
||||
if "sections" in result and "documents" not in result:
|
||||
|
|
|
|||
|
|
@ -77,7 +77,8 @@ class SubDocumentGeneration:
|
|||
documents: Optional[List[ChatDocument]],
|
||||
options: AiCallOptions,
|
||||
outputFormat: str,
|
||||
title: Optional[str]
|
||||
title: Optional[str],
|
||||
generationPrompt: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Handle single-file document generation (existing functionality)."""
|
||||
try:
|
||||
|
|
@ -125,9 +126,72 @@ class SubDocumentGeneration:
|
|||
except Exception:
|
||||
parsedFilename = None
|
||||
|
||||
# Render the JSON content to the specified format
|
||||
# Use AI generation to enhance the extracted JSON before rendering
|
||||
enhancedContent = aiResponseJson # Default to original
|
||||
if prompt:
|
||||
try:
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||
|
||||
# Get generation prompt
|
||||
generationPrompt = await generation_service.getGenerationPrompt(
|
||||
outputFormat=outputFormat,
|
||||
userPrompt=prompt,
|
||||
title=title,
|
||||
aiService=self
|
||||
)
|
||||
|
||||
# Prepare the AI call
|
||||
request_options = AiCallOptions()
|
||||
request_options.operationType = OperationType.GENERAL
|
||||
|
||||
# Create context with the extracted JSON content
|
||||
import json
|
||||
context = f"Extracted JSON content:\n{json.dumps(aiResponseJson, indent=2)}"
|
||||
|
||||
request = AiCallRequest(
|
||||
prompt=generationPrompt,
|
||||
context=context,
|
||||
options=request_options
|
||||
)
|
||||
|
||||
# Call AI to enhance the content
|
||||
response = await self.aiObjects.call(request)
|
||||
|
||||
if response and response.content:
|
||||
# Parse the AI response as JSON
|
||||
try:
|
||||
import re
|
||||
result = response.content.strip()
|
||||
|
||||
# Extract JSON from markdown if present
|
||||
json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
|
||||
if json_match:
|
||||
result = json_match.group(1).strip()
|
||||
elif result.startswith('```json'):
|
||||
result = re.sub(r'^```json\s*', '', result)
|
||||
result = re.sub(r'\s*```$', '', result)
|
||||
elif result.startswith('```'):
|
||||
result = re.sub(r'^```\s*', '', result)
|
||||
result = re.sub(r'\s*```$', '', result)
|
||||
|
||||
# Try to parse JSON
|
||||
enhancedContent = json.loads(result)
|
||||
logger.info(f"AI enhanced JSON content successfully")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"AI generation returned invalid JSON: {str(e)}, using original content")
|
||||
enhancedContent = aiResponseJson
|
||||
else:
|
||||
logger.warning("AI generation returned empty response, using original content")
|
||||
enhancedContent = aiResponseJson
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"AI generation failed: {str(e)}, using original content")
|
||||
enhancedContent = aiResponseJson
|
||||
|
||||
# Render the enhanced JSON content
|
||||
renderedContent, mimeType = await generation_service.renderReport(
|
||||
extractedContent=aiResponseJson,
|
||||
extractedContent=enhancedContent,
|
||||
outputFormat=outputFormat,
|
||||
title=title,
|
||||
userPrompt=prompt,
|
||||
|
|
@ -232,11 +296,8 @@ class SubDocumentGeneration:
|
|||
# Convert AI format to renderer format
|
||||
transformed_section = {
|
||||
"id": section.get("id", f"section_{len(transformed_sections) + 1}"),
|
||||
"type": section.get("content_type", "paragraph"),
|
||||
"data": {
|
||||
"text": "",
|
||||
"elements": section.get("elements", [])
|
||||
},
|
||||
"content_type": section.get("content_type", "paragraph"),
|
||||
"elements": section.get("elements", []),
|
||||
"order": section.get("order", len(transformed_sections) + 1)
|
||||
}
|
||||
|
||||
|
|
@ -246,7 +307,11 @@ class SubDocumentGeneration:
|
|||
for element in section.get("elements", []):
|
||||
if "text" in element:
|
||||
text_parts.append(element["text"])
|
||||
transformed_section["data"]["text"] = "\n".join(text_parts)
|
||||
# Add text to the first element or create a new one
|
||||
if transformed_section["elements"]:
|
||||
transformed_section["elements"][0]["text"] = "\n".join(text_parts)
|
||||
else:
|
||||
transformed_section["elements"] = [{"text": "\n".join(text_parts)}]
|
||||
|
||||
transformed_sections.append(transformed_section)
|
||||
|
||||
|
|
@ -264,8 +329,72 @@ class SubDocumentGeneration:
|
|||
"tags": ["multi_file", "ai_generated"]
|
||||
}
|
||||
|
||||
# Use AI generation to enhance the extracted JSON before rendering
|
||||
enhancedContent = complete_document # Default to original
|
||||
if prompt:
|
||||
try:
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||
|
||||
# Get generation prompt
|
||||
generationPrompt = await generation_service.getGenerationPrompt(
|
||||
outputFormat=outputFormat,
|
||||
userPrompt=prompt,
|
||||
title=doc_data["title"],
|
||||
aiService=self
|
||||
)
|
||||
|
||||
# Prepare the AI call
|
||||
request_options = AiCallOptions()
|
||||
request_options.operationType = OperationType.GENERAL
|
||||
|
||||
# Create context with the extracted JSON content
|
||||
import json
|
||||
context = f"Extracted JSON content:\n{json.dumps(complete_document, indent=2)}"
|
||||
|
||||
request = AiCallRequest(
|
||||
prompt=generationPrompt,
|
||||
context=context,
|
||||
options=request_options
|
||||
)
|
||||
|
||||
# Call AI to enhance the content
|
||||
response = await self.aiObjects.call(request)
|
||||
|
||||
if response and response.content:
|
||||
# Parse the AI response as JSON
|
||||
try:
|
||||
import re
|
||||
result = response.content.strip()
|
||||
|
||||
# Extract JSON from markdown if present
|
||||
json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
|
||||
if json_match:
|
||||
result = json_match.group(1).strip()
|
||||
elif result.startswith('```json'):
|
||||
result = re.sub(r'^```json\s*', '', result)
|
||||
result = re.sub(r'\s*```$', '', result)
|
||||
elif result.startswith('```'):
|
||||
result = re.sub(r'^```\s*', '', result)
|
||||
result = re.sub(r'\s*```$', '', result)
|
||||
|
||||
# Try to parse JSON
|
||||
enhancedContent = json.loads(result)
|
||||
logger.info(f"AI enhanced JSON content successfully")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"AI generation returned invalid JSON: {str(e)}, using original content")
|
||||
enhancedContent = complete_document
|
||||
else:
|
||||
logger.warning("AI generation returned empty response, using original content")
|
||||
enhancedContent = complete_document
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"AI generation failed: {str(e)}, using original content")
|
||||
enhancedContent = complete_document
|
||||
|
||||
# Render the enhanced JSON content
|
||||
rendered_content, mime_type = await generation_service.renderReport(
|
||||
extractedContent=complete_document,
|
||||
extractedContent=enhancedContent,
|
||||
outputFormat=outputFormat,
|
||||
title=doc_data["title"],
|
||||
userPrompt=prompt,
|
||||
|
|
@ -477,9 +606,7 @@ Return only the JSON response.
|
|||
"""
|
||||
try:
|
||||
services = self.services
|
||||
workflow = getattr(services, 'currentWorkflow', None)
|
||||
if not workflow:
|
||||
return
|
||||
workflow = services.currentWorkflow
|
||||
|
||||
# Serialize payload
|
||||
import json as _json
|
||||
|
|
|
|||
|
|
@ -181,9 +181,8 @@ class SubDocumentProcessing:
|
|||
from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService
|
||||
normalizer = NormalizationService(self.services)
|
||||
inventory = normalizer.discoverStructures(mergedJsonDocument)
|
||||
# Use workflow id if available as cache key, else default
|
||||
cacheKey = getattr(self.services, 'currentWorkflow', None)
|
||||
cacheKey = getattr(cacheKey, 'id', 'workflow_run') if cacheKey else 'workflow_run'
|
||||
# Use workflow id as cache key
|
||||
cacheKey = self.services.currentWorkflow.id
|
||||
# Provide the extraction/merge prompt context when available to help mapping
|
||||
mergePrompt = prompt
|
||||
mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt)
|
||||
|
|
@ -476,8 +475,8 @@ class SubDocumentProcessing:
|
|||
"metadata": {"title": f"Image Analysis - Chunk {chunk_index}"},
|
||||
"sections": [{
|
||||
"id": f"image_section_{chunk_index}",
|
||||
"type": "paragraph",
|
||||
"data": {"text": fallback_content}
|
||||
"content_type": "paragraph",
|
||||
"elements": [{"text": fallback_content}]
|
||||
}]
|
||||
})
|
||||
self.services.utils.debugLogToFile(f"Created fallback JSON for image chunk {chunk_index} with actual content", "AI_SERVICE")
|
||||
|
|
@ -583,8 +582,8 @@ class SubDocumentProcessing:
|
|||
"metadata": {"title": f"Document Analysis - Chunk {chunk_index}"},
|
||||
"sections": [{
|
||||
"id": f"analysis_section_{chunk_index}",
|
||||
"type": "paragraph",
|
||||
"data": {"text": fallback_content}
|
||||
"content_type": "paragraph",
|
||||
"elements": [{"text": fallback_content}]
|
||||
}]
|
||||
})
|
||||
self.services.utils.debugLogToFile(f"Created fallback JSON for container chunk {chunk_index} with actual content", "AI_SERVICE")
|
||||
|
|
@ -676,8 +675,8 @@ class SubDocumentProcessing:
|
|||
"metadata": {"title": "Error Section"},
|
||||
"sections": [{
|
||||
"id": f"error_section_{chunk_index}",
|
||||
"type": "paragraph",
|
||||
"data": {"text": f"Error parsing JSON: {str(e)}"}
|
||||
"content_type": "paragraph",
|
||||
"elements": [{"text": f"Error parsing JSON: {str(e)}"}]
|
||||
}]
|
||||
})
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import logging
|
||||
import uuid
|
||||
import json
|
||||
from typing import Any, Dict, List, Optional, Union, Tuple
|
||||
from datetime import datetime, UTC
|
||||
import re
|
||||
|
|
@ -339,24 +340,8 @@ class GenerationService:
|
|||
if not renderer:
|
||||
raise ValueError(f"Unsupported output format: {outputFormat}")
|
||||
|
||||
# Generate AI-based generation prompt if AI service is available
|
||||
generationPrompt = userPrompt # Default to user prompt
|
||||
if aiService and userPrompt:
|
||||
try:
|
||||
from .subPromptBuilder import buildGenerationPrompt
|
||||
generationPrompt = await buildGenerationPrompt(
|
||||
outputFormat=outputFormat,
|
||||
userPrompt=userPrompt,
|
||||
title=title,
|
||||
aiService=aiService,
|
||||
services=self.services
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to generate AI-based generation prompt: {str(e)}, using user prompt")
|
||||
generationPrompt = userPrompt
|
||||
|
||||
# Render the JSON content with AI-generated prompt
|
||||
renderedContent, mimeType = await renderer.render(extractedContent, title, generationPrompt, aiService)
|
||||
# Render the JSON content directly (AI generation handled by main service)
|
||||
renderedContent, mimeType = await renderer.render(extractedContent, title, userPrompt, aiService)
|
||||
# DEBUG: dump rendered output
|
||||
try:
|
||||
import os
|
||||
|
|
@ -391,6 +376,23 @@ class GenerationService:
|
|||
services=self.services
|
||||
)
|
||||
|
||||
async def getGenerationPrompt(
    self,
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
) -> str:
    """Build the generation prompt used to enhance extracted JSON content.

    Thin wrapper that delegates to ``buildGenerationPrompt`` from the sibling
    ``subPromptBuilder`` module, forwarding every argument unchanged together
    with this instance's ``services`` object.

    Args:
        outputFormat: Target output format, forwarded as-is.
        userPrompt: The user's prompt text, forwarded as-is.
        title: Document title, forwarded as-is.
        aiService: Optional AI service handle, forwarded as-is
            (defaults to ``None``).

    Returns:
        Whatever ``buildGenerationPrompt`` resolves to (annotated as ``str``).
    """
    # Imported at call time rather than at module load, as in the original.
    from .subPromptBuilder import buildGenerationPrompt

    # Collect the keyword arguments first so the delegation reads as one call.
    builderArgs = {
        "outputFormat": outputFormat,
        "userPrompt": userPrompt,
        "title": title,
        "aiService": aiService,
        "services": self.services,
    }
    generationPrompt = await buildGenerationPrompt(**builderArgs)
    return generationPrompt
|
||||
|
||||
async def getGenericExtractionPrompt(
|
||||
self,
|
||||
outputFormat: str,
|
||||
|
|
|
|||
|
|
@ -81,11 +81,11 @@ class BaseRenderer(ABC):
|
|||
if not isinstance(sections, list):
|
||||
return False
|
||||
|
||||
# Validate each section has type and data
|
||||
# Validate each section has content_type and elements
|
||||
for section in sections:
|
||||
if not isinstance(section, dict):
|
||||
return False
|
||||
if "type" not in section or "data" not in section:
|
||||
if "content_type" not in section or "elements" not in section:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
|
@ -159,7 +159,7 @@ class BaseRenderer(ABC):
|
|||
# Base implementation returns a simple dict
|
||||
# Format-specific renderers should override this method
|
||||
return {
|
||||
"type": "image",
|
||||
"content_type": "image",
|
||||
"base64Data": base64_data,
|
||||
"altText": alt_text,
|
||||
"width": section_data.get("width", None),
|
||||
|
|
@ -259,25 +259,25 @@ class BaseRenderer(ABC):
|
|||
|
||||
if section_type == "table":
|
||||
headers, rows = self._extract_table_data(section_data)
|
||||
return {"type": "table", "headers": headers, "rows": rows}
|
||||
return {"content_type": "table", "headers": headers, "rows": rows}
|
||||
elif section_type == "bullet_list":
|
||||
items = self._extract_bullet_list_items(section_data)
|
||||
return {"type": "bullet_list", "items": items}
|
||||
return {"content_type": "bullet_list", "items": items}
|
||||
elif section_type == "heading":
|
||||
level, text = self._extract_heading_data(section_data)
|
||||
return {"type": "heading", "level": level, "text": text}
|
||||
return {"content_type": "heading", "level": level, "text": text}
|
||||
elif section_type == "paragraph":
|
||||
text = self._extract_paragraph_text(section_data)
|
||||
return {"type": "paragraph", "text": text}
|
||||
return {"content_type": "paragraph", "text": text}
|
||||
elif section_type == "code_block":
|
||||
code, language = self._extract_code_block_data(section_data)
|
||||
return {"type": "code_block", "code": code, "language": language}
|
||||
return {"content_type": "code_block", "code": code, "language": language}
|
||||
elif section_type == "image":
|
||||
base64_data, alt_text = self._extract_image_data(section_data)
|
||||
# Validate image data
|
||||
if self._validate_image_data(base64_data, alt_text):
|
||||
return {
|
||||
"type": "image",
|
||||
"content_type": "image",
|
||||
"base64Data": base64_data,
|
||||
"altText": alt_text,
|
||||
"width": section_data.get("width"),
|
||||
|
|
@ -286,11 +286,11 @@ class BaseRenderer(ABC):
|
|||
}
|
||||
else:
|
||||
# Return placeholder if image data is invalid
|
||||
return {"type": "paragraph", "text": f"[Image: {alt_text}]"}
|
||||
return {"content_type": "paragraph", "text": f"[Image: {alt_text}]"}
|
||||
else:
|
||||
# Fallback to paragraph
|
||||
text = self._extract_paragraph_text(section_data)
|
||||
return {"type": "paragraph", "text": text}
|
||||
return {"content_type": "paragraph", "text": text}
|
||||
|
||||
def _format_timestamp(self, timestamp: str = None) -> str:
|
||||
"""Format timestamp for display."""
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ class RendererJson(BaseRenderer):
|
|||
# Return minimal JSON fallback
|
||||
fallback_data = {
|
||||
"title": title,
|
||||
"sections": [{"type": "paragraph", "data": {"text": f"Error rendering report: {str(e)}"}}],
|
||||
"sections": [{"content_type": "paragraph", "elements": [{"text": f"Error rendering report: {str(e)}"}]}],
|
||||
"metadata": {"error": str(e)}
|
||||
}
|
||||
return json.dumps(fallback_data, indent=2), "application/json"
|
||||
|
|
@ -54,7 +54,7 @@ class RendererJson(BaseRenderer):
|
|||
if "sections" not in content:
|
||||
# Convert old format to new format
|
||||
content = {
|
||||
"sections": [{"type": "paragraph", "data": {"text": str(content)}}],
|
||||
"sections": [{"content_type": "paragraph", "elements": [{"text": str(content)}]}],
|
||||
"metadata": {"title": title}
|
||||
}
|
||||
|
||||
|
|
@ -73,7 +73,7 @@ class RendererJson(BaseRenderer):
|
|||
self.logger.warning(f"Error cleaning JSON content: {str(e)}")
|
||||
# Return minimal valid JSON
|
||||
fallback_data = {
|
||||
"sections": [{"type": "paragraph", "data": {"text": str(content)}}],
|
||||
"sections": [{"content_type": "paragraph", "elements": [{"text": str(content)}]}],
|
||||
"metadata": {"title": title, "error": str(e)}
|
||||
}
|
||||
return json.dumps(fallback_data, indent=2, ensure_ascii=False)
|
||||
|
|
|
|||
|
|
@ -442,7 +442,7 @@ class RendererXlsx(BaseRenderer):
|
|||
sheet_names = []
|
||||
|
||||
# Check if we have multiple table sections
|
||||
table_sections = [s for s in sections if s.get("type") == "table"]
|
||||
table_sections = [s for s in sections if s.get("content_type") == "table"]
|
||||
|
||||
if len(table_sections) > 1:
|
||||
# Create separate sheets for each table
|
||||
|
|
@ -480,7 +480,7 @@ class RendererXlsx(BaseRenderer):
|
|||
return
|
||||
|
||||
sections = json_content.get("sections", [])
|
||||
table_sections = [s for s in sections if s.get("type") == "table"]
|
||||
table_sections = [s for s in sections if s.get("content_type") == "table"]
|
||||
|
||||
if len(table_sections) > 1:
|
||||
# Multiple tables - populate each sheet with its corresponding table
|
||||
|
|
@ -509,10 +509,15 @@ class RendererXlsx(BaseRenderer):
|
|||
sheet['A1'].font = Font(size=16, bold=True, color=self._get_safe_color(styles.get("title", {}).get("color", "FF1F4E79")))
|
||||
sheet['A1'].alignment = Alignment(horizontal="center")
|
||||
|
||||
# Get table data
|
||||
table_data = section.get("data", {})
|
||||
headers = table_data.get("headers", [])
|
||||
rows = table_data.get("rows", [])
|
||||
# Get table data from elements (canonical JSON format)
|
||||
elements = section.get("elements", [])
|
||||
if elements and isinstance(elements, list) and len(elements) > 0:
|
||||
table_data = elements[0]
|
||||
headers = table_data.get("headers", [])
|
||||
rows = table_data.get("rows", [])
|
||||
else:
|
||||
headers = []
|
||||
rows = []
|
||||
|
||||
if not headers and not rows:
|
||||
sheet['A3'] = "No table data available"
|
||||
|
|
@ -683,9 +688,9 @@ class RendererXlsx(BaseRenderer):
|
|||
def _add_table_to_excel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], start_row: int) -> int:
|
||||
"""Add a table element to Excel sheet."""
|
||||
try:
|
||||
table_data = element.get("data", {})
|
||||
headers = table_data.get("headers", [])
|
||||
rows = table_data.get("rows", [])
|
||||
# In canonical JSON format, table elements have headers and rows directly
|
||||
headers = element.get("headers", [])
|
||||
rows = element.get("rows", [])
|
||||
|
||||
if not headers and not rows:
|
||||
return start_row
|
||||
|
|
@ -697,7 +702,7 @@ class RendererXlsx(BaseRenderer):
|
|||
if header_style.get("bold"):
|
||||
cell.font = Font(bold=True, color=self._get_safe_color(header_style.get("text_color", "FF000000")))
|
||||
if header_style.get("background"):
|
||||
cell.fill = PatternFill(start_color=header_style["background"], end_color=header_style["background"], fill_type="solid")
|
||||
cell.fill = PatternFill(start_color=self._get_safe_color(header_style["background"]), end_color=self._get_safe_color(header_style["background"]), fill_type="solid")
|
||||
|
||||
start_row += 1
|
||||
|
||||
|
|
|
|||
|
|
@ -1,21 +1,21 @@
|
|||
"""
|
||||
Centralized prompt builder for document generation across formats.
|
||||
|
||||
Builds a robust prompt that:
|
||||
- Accepts any user intent (no fixed structure assumptions)
|
||||
- Injects format-specific guidelines from the selected renderer
|
||||
- Adds a common policy section to always use real data from source docs
|
||||
- Requires the AI to output a filename header that we can parse and use
|
||||
Prompt builder for AI document generation and extraction.
|
||||
This module builds prompts for AI services to extract and generate documents.
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Protocol, Dict, Any
|
||||
import logging
|
||||
from typing import Dict, Any, Optional, List, TYPE_CHECKING
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||
|
||||
# Type hint for renderer parameter
|
||||
if TYPE_CHECKING:
|
||||
from .renderers.rendererBaseTemplate import BaseRenderer
|
||||
_RendererLike = BaseRenderer
|
||||
else:
|
||||
_RendererLike = Any
|
||||
|
||||
class _RendererLike(Protocol):
|
||||
def getExtractionPrompt(self, user_prompt: str, title: str) -> str: # returns only format-specific guidelines
|
||||
...
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
async def buildAdaptiveExtractionPrompt(
|
||||
outputFormat: str,
|
||||
|
|
@ -25,57 +25,65 @@ async def buildAdaptiveExtractionPrompt(
|
|||
aiService=None,
|
||||
services=None
|
||||
) -> str:
|
||||
"""Build adaptive extraction prompt based on AI analysis."""
|
||||
"""
|
||||
Build adaptive extraction prompt based on AI analysis.
|
||||
Uses multi-file or single-file approach based on analysis.
|
||||
"""
|
||||
|
||||
# Get appropriate JSON schema based on analysis
|
||||
from .subJsonSchema import get_adaptive_json_schema
|
||||
json_schema = get_adaptive_json_schema(promptAnalysis)
|
||||
# Multi-file example data instead of schema
|
||||
multi_file_example = {
|
||||
"metadata": {
|
||||
"title": "Multi-Document Example",
|
||||
"splitStrategy": "by_section",
|
||||
"source_documents": ["doc_001"],
|
||||
"extraction_method": "ai_extraction"
|
||||
},
|
||||
"documents": [
|
||||
{
|
||||
"id": "doc_section_1",
|
||||
"title": "Section 1 Title",
|
||||
"filename": "section_1.xlsx",
|
||||
"sections": [
|
||||
{
|
||||
"id": "table_1",
|
||||
"content_type": "table",
|
||||
"elements": [
|
||||
{
|
||||
"headers": ["Column 1", "Column 2"],
|
||||
"rows": [["Value 1", "Value 2"]]
|
||||
}
|
||||
],
|
||||
"order": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
# Single-file example data instead of schema
|
||||
single_file_example = {
|
||||
"metadata": {
|
||||
"title": "Single Document Example",
|
||||
"source_documents": ["doc_001"],
|
||||
"extraction_method": "ai_extraction"
|
||||
},
|
||||
"sections": [
|
||||
{
|
||||
"id": "table_1",
|
||||
"content_type": "table",
|
||||
"elements": [
|
||||
{
|
||||
"headers": ["Column 1", "Column 2"],
|
||||
"rows": [["Value 1", "Value 2"]]
|
||||
}
|
||||
],
|
||||
"order": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
if promptAnalysis.get("is_multi_file", False):
|
||||
schema_type = "multi-document"
|
||||
else:
|
||||
schema_type = "single-document"
|
||||
|
||||
# Build adaptive prompt using AI analysis - match single-file style
|
||||
if promptAnalysis.get("is_multi_file", False):
|
||||
# Multi-file prompt - use simple example format like single-file
|
||||
multi_file_example = {
|
||||
"metadata": {
|
||||
"title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE",
|
||||
"splitStrategy": "by_section"
|
||||
},
|
||||
"documents": [
|
||||
{
|
||||
"id": "doc_1",
|
||||
"title": "REPLACE_WITH_ACTUAL_SECTION_TITLE",
|
||||
"filename": "REPLACE_WITH_ACTUAL_FILENAME",
|
||||
"sections": [
|
||||
{
|
||||
"id": "section_1",
|
||||
"content_type": "heading",
|
||||
"elements": [
|
||||
{
|
||||
"text": "REPLACE_WITH_ACTUAL_HEADING_TEXT",
|
||||
"level": 1
|
||||
}
|
||||
],
|
||||
"order": 1
|
||||
},
|
||||
{
|
||||
"id": "section_2",
|
||||
"content_type": "paragraph",
|
||||
"elements": [
|
||||
{
|
||||
"text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT"
|
||||
}
|
||||
],
|
||||
"order": 2
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
# Multi-file prompt
|
||||
adaptive_prompt = f"""
|
||||
{userPrompt}
|
||||
|
||||
|
|
@ -134,16 +142,31 @@ Return only the JSON structure with actual data from the documents. Do not inclu
|
|||
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
|
||||
""".strip()
|
||||
else:
|
||||
# Single-file prompt - use original style
|
||||
# Single-file prompt - use example data instead of schema
|
||||
adaptive_prompt = f"""
|
||||
{userPrompt}
|
||||
|
||||
You are extracting structured content from documents and must respond with valid JSON only.
|
||||
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
|
||||
|
||||
IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.
|
||||
TASK: Extract the actual content from the document and organize it into structured sections.
|
||||
|
||||
Extract the actual data from the source documents and structure it as JSON with this format:
|
||||
{json.dumps(json_schema, indent=2)}
|
||||
REQUIREMENTS:
|
||||
1. Analyze the document content provided in the context below
|
||||
2. Extract all content and organize it into logical sections
|
||||
3. Create structured JSON with sections containing the extracted content
|
||||
4. Preserve the original structure and data
|
||||
|
||||
OUTPUT FORMAT: Return only valid JSON in this exact structure:
|
||||
{json.dumps(single_file_example, indent=2)}
|
||||
|
||||
INSTRUCTIONS:
|
||||
- Replace example data with actual content from the document
|
||||
- Use actual headings, paragraphs, and text from the document
|
||||
- Ensure all content is properly structured
|
||||
- Do not use generic placeholder text
|
||||
- Extract real content from the documents
|
||||
|
||||
CONTEXT (Document Content):
|
||||
|
||||
Content Types to Extract:
|
||||
1. Tables: Extract all rows and columns with proper headers
|
||||
|
|
@ -220,22 +243,53 @@ Consider the user's intent and the most logical way to organize the extracted co
|
|||
services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER")
|
||||
|
||||
# Fallback to single-file prompt
|
||||
from .subJsonSchema import get_document_subJsonSchema
|
||||
json_schema = get_document_subJsonSchema()
|
||||
example_data = {
|
||||
"metadata": {
|
||||
"title": "Example Document",
|
||||
"author": "AI Assistant",
|
||||
"source_documents": ["document_001"],
|
||||
"extraction_method": "ai_extraction"
|
||||
},
|
||||
"sections": [
|
||||
{
|
||||
"id": "section_001",
|
||||
"content_type": "table",
|
||||
"elements": [
|
||||
{
|
||||
"headers": ["Column 1", "Column 2", "Column 3"],
|
||||
"rows": [
|
||||
["Value 1", "Value 2", "Value 3"],
|
||||
["Value 4", "Value 5", "Value 6"]
|
||||
]
|
||||
}
|
||||
],
|
||||
"order": 1,
|
||||
"metadata": {}
|
||||
}
|
||||
],
|
||||
"summary": "",
|
||||
"tags": []
|
||||
}
|
||||
|
||||
return f"""
|
||||
{userPrompt}
|
||||
|
||||
You are extracting structured content from documents and must respond with valid JSON only.
|
||||
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
|
||||
|
||||
CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.
|
||||
TASK: Extract the actual content from the document and organize it into structured sections.
|
||||
|
||||
Extract the actual data from the source documents and structure it as JSON with this format:
|
||||
{json.dumps(json_schema, indent=2)}
|
||||
REQUIREMENTS:
|
||||
1. Analyze the document content provided in the context below
|
||||
2. Extract all content and organize it into logical sections
|
||||
3. Create structured JSON with sections containing the extracted content
|
||||
4. Preserve the original structure and data
|
||||
|
||||
OUTPUT FORMAT: Return only valid JSON in this exact structure:
|
||||
{json.dumps(example_data, indent=2)}
|
||||
|
||||
Requirements:
|
||||
- Preserve all original data - do not summarize or interpret
|
||||
- Use the exact JSON schema provided
|
||||
- Use the exact JSON format shown above
|
||||
- Maintain data integrity and structure
|
||||
|
||||
Content Types to Extract:
|
||||
|
|
@ -286,16 +340,55 @@ async def buildExtractionPrompt(
|
|||
from .subJsonSchema import get_document_subJsonSchema
|
||||
jsonSchema = get_document_subJsonSchema()
|
||||
|
||||
# Generic block for JSON extraction - use proper schema instead of hardcoded template
|
||||
# Generic block for JSON extraction - use example data instead of schema
|
||||
example_data = {
|
||||
"metadata": {
|
||||
"title": "Example Document",
|
||||
"author": "AI Assistant",
|
||||
"source_documents": ["document_001"],
|
||||
"extraction_method": "ai_extraction"
|
||||
},
|
||||
"sections": [
|
||||
{
|
||||
"id": "section_001",
|
||||
"content_type": "table",
|
||||
"elements": [
|
||||
{
|
||||
"headers": ["Column 1", "Column 2", "Column 3"],
|
||||
"rows": [
|
||||
["Value 1", "Value 2", "Value 3"],
|
||||
["Value 4", "Value 5", "Value 6"]
|
||||
]
|
||||
}
|
||||
],
|
||||
"order": 1,
|
||||
"metadata": {}
|
||||
}
|
||||
],
|
||||
"summary": "",
|
||||
"tags": []
|
||||
}
|
||||
|
||||
genericIntro = f"""
|
||||
{extractionIntent}
|
||||
|
||||
You are extracting structured content from documents and must respond with valid JSON only.
|
||||
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
|
||||
|
||||
CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.
|
||||
TASK: Extract the actual content from the document and organize it into structured sections.
|
||||
|
||||
Extract the actual data from the source documents and structure it as JSON with this format:
|
||||
{json.dumps(jsonSchema, indent=2)}
|
||||
REQUIREMENTS:
|
||||
1. Analyze the document content provided in the context below
|
||||
2. Extract all content and organize it into logical sections
|
||||
3. Create structured JSON with sections containing the extracted content
|
||||
4. Preserve the original structure and data
|
||||
|
||||
OUTPUT FORMAT: Return only valid JSON in this exact structure:
|
||||
{json.dumps(example_data, indent=2)}
|
||||
|
||||
Requirements:
|
||||
- Preserve all original data - do not summarize or interpret
|
||||
- Use the exact JSON format shown above
|
||||
- Maintain data integrity and structure
|
||||
|
||||
Content Types to Extract:
|
||||
1. Tables: Extract all rows and columns with proper headers
|
||||
|
|
@ -317,15 +410,20 @@ Return only the JSON structure with actual data from the documents. Do not inclu
|
|||
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
|
||||
|
||||
DO NOT return a schema description - return actual extracted content in the JSON format shown above.
|
||||
""".strip()
|
||||
"""
|
||||
|
||||
# Final assembly
|
||||
finalPrompt = genericIntro
|
||||
# Get format-specific guidelines from renderer
|
||||
formatGuidelines = ""
|
||||
try:
|
||||
if hasattr(renderer, 'getExtractionGuidelines'):
|
||||
formatGuidelines = renderer.getExtractionGuidelines()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Combine all parts
|
||||
finalPrompt = f"{genericIntro}\n\n{formatGuidelines}".strip()
|
||||
|
||||
# Debug output
|
||||
services.utils.debugLogToFile(f"EXTRACTION INTENT: Processed", "PROMPT_BUILDER")
|
||||
|
||||
# Save full extraction prompt to debug file - only if debug enabled
|
||||
# Save extraction prompt to debug file - only if debug enabled
|
||||
try:
|
||||
debug_enabled = services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
||||
if debug_enabled:
|
||||
|
|
@ -335,8 +433,7 @@ DO NOT return a schema description - return actual extracted content in the JSON
|
|||
debug_root = "./test-chat/ai"
|
||||
os.makedirs(debug_root, exist_ok=True)
|
||||
with open(os.path.join(debug_root, f"{ts}_extraction_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(f"EXTRACTION PROMPT:\n{finalPrompt}\n\n")
|
||||
f.write(f"EXTRACTION INTENT:\n{extractionIntent}\n")
|
||||
f.write(finalPrompt)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
|
@ -367,24 +464,46 @@ async def buildGenerationPrompt(
|
|||
|
||||
# AI call to generate the appropriate generation prompt
|
||||
generationPromptRequest = f"""
|
||||
Based on this user request, create a detailed generation prompt for creating a {outputFormat} document.
|
||||
You are creating instructions for an AI to generate JSON content in the CANONICAL FORMAT that will be converted to a {outputFormat} document.
|
||||
|
||||
User request: "{safeUserPrompt}"
|
||||
Document title: "{title}"
|
||||
Output format: {outputFormat}
|
||||
Target format: {outputFormat}
|
||||
|
||||
Create a generation prompt that:
|
||||
1. Identifies what content is most important for the user
|
||||
2. Specifies how to structure and organize the content
|
||||
3. Includes any specific formatting or presentation requirements
|
||||
4. Preserves any language requirements
|
||||
5. Ensures the document meets the user's needs
|
||||
Write clear, detailed instructions that tell the AI how to generate JSON content using the CANONICAL JSON FORMAT. Focus on:
|
||||
|
||||
IMPORTANT: Always generate content in STANDARDIZED JSON FORMAT. In your response, include the exact text "PLACEHOLDER_FOR_FORMAT_RULES" where specific format rules will be inserted afterwards automatically.
|
||||
1. What content is most important for the user
|
||||
2. How to structure and organize the content using the canonical JSON format with 'sections'
|
||||
3. Specific formatting requirements for the target format
|
||||
4. Language requirements to preserve
|
||||
5. How to ensure the JSON content meets the user's needs
|
||||
|
||||
CRITICAL: You MUST start your response with exactly "Generate a {outputFormat} document that:" - do NOT use "docx" or any other format. Use the exact format specified: {outputFormat}
|
||||
CRITICAL: The AI MUST generate content using the CANONICAL JSON FORMAT with this exact structure:
|
||||
{{
|
||||
"metadata": {{
|
||||
"title": "Document Title"
|
||||
}},
|
||||
"sections": [
|
||||
{{
|
||||
"id": "section_1",
|
||||
"content_type": "table",
|
||||
"elements": [
|
||||
{{
|
||||
"headers": ["Column1", "Column2", "Column3"],
|
||||
"rows": [
|
||||
["Value1", "Value2", "Value3"],
|
||||
["Value4", "Value5", "Value6"]
|
||||
]
|
||||
}}
|
||||
],
|
||||
"order": 1
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
Return only the generation prompt, starting with "Generate a {outputFormat} document that..."
|
||||
The AI should NOT create format-specific structures like "sheets" or "columns" - only use the canonical format with "sections" and "elements".
|
||||
|
||||
Write the instructions as plain text, not JSON. Start with "Generate JSON content that..." and provide clear, actionable instructions for creating structured JSON data in the canonical format.
|
||||
"""
|
||||
|
||||
# Call AI service to generate the prompt
|
||||
|
|
@ -423,7 +542,7 @@ Return only the generation prompt, starting with "Generate a {outputFormat} docu
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content. User requirements: {userPrompt}"
|
||||
return result if result else f"Generate a comprehensive {outputFormat} document titled '{title}' based on the extracted content."
|
||||
|
||||
except Exception as e:
|
||||
# Fallback on any error - preserve user prompt for language instructions
|
||||
|
|
@ -433,105 +552,104 @@ Return only the generation prompt, starting with "Generate a {outputFormat} docu
|
|||
|
||||
def _getFormatRules(outputFormat: str) -> str:
|
||||
"""
|
||||
Get format-specific rules for JSON-based generation.
|
||||
Since we now use standardized JSON, all formats follow the same rules.
|
||||
Get format-specific rules for the generation prompt.
|
||||
"""
|
||||
return """
|
||||
- Generate content in standardized JSON format following the document schema
|
||||
- Tables: Use JSON table format with headers and rows arrays
|
||||
- Lists: Use JSON list format with items array
|
||||
- Text: Use JSON paragraph format with text field
|
||||
- Headings: Use JSON heading format with level field
|
||||
- Structure: Follow the document JSON schema exactly
|
||||
""".strip()
|
||||
format_rules = {
|
||||
"xlsx": """
|
||||
XLSX Format Rules:
|
||||
- Create tables with clear headers and organized data
|
||||
- Use appropriate column widths and formatting
|
||||
- Include summary information if relevant
|
||||
- Ensure data is properly structured for spreadsheet analysis
|
||||
""",
|
||||
"pdf": """
|
||||
PDF Format Rules:
|
||||
- Create professional document layout
|
||||
- Use appropriate headings and sections
|
||||
- Include proper spacing and formatting
|
||||
- Ensure content is well-organized and readable
|
||||
""",
|
||||
"docx": """
|
||||
DOCX Format Rules:
|
||||
- Create professional document layout
|
||||
- Use appropriate headings and sections
|
||||
- Include proper spacing and formatting
|
||||
- Ensure content is well-organized and readable
|
||||
""",
|
||||
"html": """
|
||||
HTML Format Rules:
|
||||
- Create clean, semantic HTML structure
|
||||
- Use appropriate tags for content organization
|
||||
- Include proper styling classes
|
||||
- Ensure content is accessible and well-formatted
|
||||
""",
|
||||
"json": """
|
||||
JSON Format Rules:
|
||||
- Create well-structured JSON data
|
||||
- Use appropriate nesting and organization
|
||||
- Include metadata and context information
|
||||
- Ensure data is properly formatted and valid
|
||||
""",
|
||||
"csv": """
|
||||
CSV Format Rules:
|
||||
- Create clear, organized tabular data
|
||||
- Use appropriate headers and data types
|
||||
- Ensure proper CSV formatting
|
||||
- Include all relevant data in structured format
|
||||
""",
|
||||
"txt": """
|
||||
TXT Format Rules:
|
||||
- Create clean, readable text format
|
||||
- Use appropriate spacing and organization
|
||||
- Include clear headings and sections
|
||||
- Ensure content is well-structured and easy to read
|
||||
"""
|
||||
}
|
||||
|
||||
return format_rules.get(outputFormat.lower(), f"""
|
||||
{outputFormat.upper()} Format Rules:
|
||||
- Create well-structured content appropriate for {outputFormat}
|
||||
- Use appropriate formatting and organization
|
||||
- Ensure content is clear and professional
|
||||
- Include all relevant information in proper format
|
||||
""")
|
||||
|
||||
|
||||
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
|
||||
"""
|
||||
Use AI to extract a rich, structured extraction intent from the user prompt.
|
||||
Include language, normalization, structure needs, headers, formats, row strategy, and multi-file guidance.
|
||||
Parse user prompt to extract the core extraction intent.
|
||||
"""
|
||||
if not aiService:
|
||||
# Fallback if no AI service available
|
||||
return "Extract all relevant content from the document according to the user's requirements"
|
||||
return f"Extract content from the provided documents and create a {outputFormat} report."
|
||||
|
||||
try:
|
||||
# Protect userPrompt from injection by escaping quotes and newlines
|
||||
safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
|
||||
|
||||
# Rich analysis to derive a complete extraction intent and structure guidance
|
||||
extractionPrompt = f"""
|
||||
Analyze the user's request and produce a RICH extraction intent. Return ONLY JSON.
|
||||
analysis_prompt = f"""
|
||||
Analyze this user request and extract the core extraction intent:
|
||||
|
||||
Goals:
|
||||
- Detect language and normalize the request into a full, explicit instruction (no summary; preserve all constraints and details).
|
||||
- Decide if structured data is required; if so, define the target structure precisely (headers, order, formats, row strategy).
|
||||
- Identify if multi-file output is appropriate and how to split/files name.
|
||||
User request: "{userPrompt}"
|
||||
Target format: {outputFormat}
|
||||
|
||||
User request: "{safeUserPrompt}"
|
||||
Extract the main intent and requirements for document processing. Focus on:
|
||||
1. What content needs to be extracted
|
||||
2. How it should be organized
|
||||
3. Any specific requirements or preferences
|
||||
|
||||
Return JSON in this exact shape:
|
||||
{{
|
||||
"detectedLanguage": "de|en|fr|it|...",
|
||||
"normalizedRequest": "Full explicit instruction in detected language",
|
||||
"requiresStructuredData": true|false,
|
||||
"targetStructure": "table|list|mixed|unstructured",
|
||||
"table": {{
|
||||
"headers": ["Header1", "Header2", "..."],
|
||||
"headerOrderStrict": true|false,
|
||||
"rowStrategy": "one_row_per_document|one_row_per_entity|one_row_per_vat_rate|custom",
|
||||
"formats": {{
|
||||
"dateFormat": "DD.MM.YYYY|YYYY-MM-DD|...",
|
||||
"amountDecimals": 2,
|
||||
"currencyFormat": "code|symbol",
|
||||
"idMasking": "none|last4|custom"
|
||||
}}
|
||||
}},
|
||||
"multiFile": true|false,
|
||||
"fileSplitStrategy": "single|per_entity|by_section|by_criteria|custom",
|
||||
"fileNamingPattern": "suggested pattern for filenames",
|
||||
"constraints": ["List of critical constraints to enforce"],
|
||||
"reasoning": "Brief justification (one sentence)"
|
||||
}}
|
||||
|
||||
Rules:
|
||||
- Preserve user terminology and language in normalizedRequest.
|
||||
- If the user listed columns/fields, copy them exactly into table.headers and set headerOrderStrict=true.
|
||||
- If the user implies separate rows for rates/entities, set an appropriate rowStrategy (e.g., one_row_per_vat_rate).
|
||||
- If no structure is required, set requiresStructuredData=false and targetStructure="unstructured".
|
||||
Respond with a clear, concise statement of the extraction intent.
|
||||
"""
|
||||
|
||||
# Call AI service to extract intention
|
||||
services.utils.debugLogToFile("DEBUG: Calling AI for extraction intent...", "PROMPT_BUILDER")
|
||||
|
||||
# Import and set proper options for AI call
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||
request_options = AiCallOptions()
|
||||
request_options.operationType = OperationType.GENERAL
|
||||
|
||||
request = AiCallRequest(prompt=extractionPrompt, context="", options=request_options)
|
||||
request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
|
||||
response = await aiService.aiObjects.call(request)
|
||||
result = response.content if response else ""
|
||||
services.utils.debugLogToFile(f"DEBUG: Extraction intent processed", "PROMPT_BUILDER")
|
||||
|
||||
# Try to extract and pretty print JSON
|
||||
if result:
|
||||
import re, json as _json
|
||||
match = re.search(r'\{[\s\S]*\}', result)
|
||||
if match:
|
||||
try:
|
||||
obj = _json.loads(match.group(0))
|
||||
return _json.dumps(obj, ensure_ascii=False, indent=2)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to previous simple format
|
||||
return f"Extract: {safeUserPrompt}"
|
||||
|
||||
if response and response.content:
|
||||
return response.content.strip()
|
||||
else:
|
||||
return f"Extract content from the provided documents and create a {outputFormat} report."
|
||||
|
||||
except Exception as e:
|
||||
# Fallback on any error - preserve user prompt for language instructions
|
||||
services.utils.debugLogToFile(f"DEBUG: AI extraction intent failed: {str(e)}", "PROMPT_BUILDER")
|
||||
return f"Extract: {userPrompt}"
|
||||
|
||||
|
||||
services.utils.debugLogToFile(f"Extraction intent analysis failed: {str(e)}", "PROMPT_BUILDER")
|
||||
return f"Extract content from the provided documents and create a {outputFormat} report."
|
||||
|
||||
|
|
|
|||
|
|
@ -28,13 +28,17 @@ class NormalizationService:
|
|||
continue
|
||||
|
||||
# Extract table data from elements array
|
||||
hdrs = []
|
||||
rows = []
|
||||
for element in section.get("elements", []):
|
||||
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
||||
hdrs = element.get("headers") or []
|
||||
rows = element.get("rows") or []
|
||||
break
|
||||
else:
|
||||
|
||||
if not hdrs or not rows:
|
||||
continue
|
||||
|
||||
for h in hdrs:
|
||||
if not isinstance(h, str):
|
||||
continue
|
||||
|
|
@ -122,13 +126,14 @@ class NormalizationService:
|
|||
continue
|
||||
|
||||
# Extract table data from elements array
|
||||
sourceHeaders = []
|
||||
sourceRows = []
|
||||
for element in section.get("elements", []):
|
||||
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
||||
sourceHeaders = element.get("headers") or []
|
||||
sourceRows = element.get("rows") or []
|
||||
break
|
||||
else:
|
||||
continue
|
||||
|
||||
if not sourceHeaders or not sourceRows:
|
||||
continue
|
||||
|
||||
|
|
|
|||
|
|
@ -78,11 +78,15 @@ class WorkflowService:
|
|||
def getChatDocumentsFromDocumentList(self, documentList: List[str]) -> List[ChatDocument]:
|
||||
"""Get ChatDocuments from a list of document references using all three formats."""
|
||||
try:
|
||||
# Get the current workflow from services (same pattern as setWorkflowContext)
|
||||
workflow = getattr(self.services, 'currentWorkflow', None) or self.workflow
|
||||
if not workflow:
|
||||
logger.error("No workflow available for document list resolution")
|
||||
return []
|
||||
workflow = self.services.currentWorkflow
|
||||
|
||||
# Reload workflow from database to ensure we have all messages
|
||||
if hasattr(workflow, 'id'):
|
||||
try:
|
||||
workflow = self.getWorkflow(workflow.id)
|
||||
logger.debug(f"Reloaded workflow {workflow.id} with {len(workflow.messages)} messages")
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not reload workflow from database: {str(e)}")
|
||||
|
||||
all_documents = []
|
||||
for doc_ref in documentList:
|
||||
|
|
@ -418,11 +422,7 @@ class WorkflowService:
|
|||
def setWorkflowContext(self, round_number: int = None, task_number: int = None, action_number: int = None):
|
||||
"""Set current workflow context for document generation and routing"""
|
||||
try:
|
||||
# Get the current workflow from services
|
||||
workflow = getattr(self.services, 'currentWorkflow', None) or self.workflow
|
||||
if not workflow:
|
||||
logger.error("No workflow available for context setting")
|
||||
return
|
||||
workflow = self.services.currentWorkflow
|
||||
|
||||
# Prepare update data
|
||||
update_data = {}
|
||||
|
|
@ -529,10 +529,7 @@ class WorkflowService:
|
|||
def getDocumentCount(self) -> str:
|
||||
"""Get document count for task planning (matching old handlingTasks.py logic)"""
|
||||
try:
|
||||
# Get the current workflow from services
|
||||
workflow = getattr(self.services, 'currentWorkflow', None) or self.workflow
|
||||
if not workflow:
|
||||
return "No documents available"
|
||||
workflow = self.services.currentWorkflow
|
||||
|
||||
# Count documents from all messages in the workflow (like old system)
|
||||
total_docs = 0
|
||||
|
|
@ -551,10 +548,7 @@ class WorkflowService:
|
|||
def getWorkflowHistoryContext(self) -> str:
|
||||
"""Get workflow history context for task planning (matching old handlingTasks.py logic)"""
|
||||
try:
|
||||
# Get the current workflow from services
|
||||
workflow = getattr(self.services, 'currentWorkflow', None) or self.workflow
|
||||
if not workflow:
|
||||
return "No previous round context available"
|
||||
workflow = self.services.currentWorkflow
|
||||
|
||||
# Check if there are any previous rounds by looking for "first" messages
|
||||
has_previous_rounds = False
|
||||
|
|
|
|||
|
|
@ -226,7 +226,7 @@ class ReactMode(BaseMode):
|
|||
|
||||
# Get available documents from the current workflow
|
||||
try:
|
||||
available_docs = self.services.workflow.getAvailableDocuments(context.workflow)
|
||||
available_docs = self.services.workflow.getAvailableDocuments(self.services.currentWorkflow)
|
||||
if not available_docs or available_docs == "No documents available":
|
||||
logger.warning("No documents available for validation")
|
||||
return
|
||||
|
|
|
|||
|
|
@ -68,20 +68,12 @@ def extractWorkflowHistory(service: Any, context: Any) -> str:
|
|||
"""Extract workflow history from context. Maps to {{KEY:WORKFLOW_HISTORY}}
|
||||
Reverse-chronological, enriched with message summaries and document labels.
|
||||
"""
|
||||
# Prefer explicit workflow on context; else fall back to services.workflow
|
||||
workflow = None
|
||||
try:
|
||||
if hasattr(context, 'workflow') and context.workflow:
|
||||
workflow = context.workflow
|
||||
elif hasattr(service, 'workflow') and service.workflow:
|
||||
workflow = service.workflow
|
||||
except Exception:
|
||||
workflow = None
|
||||
|
||||
if workflow:
|
||||
history = getPreviousRoundContext(service, workflow)
|
||||
history = getPreviousRoundContext(service, service.currentWorkflow)
|
||||
return history or "No previous workflow rounds available"
|
||||
return "No previous workflow rounds available"
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting workflow history: {str(e)}")
|
||||
return "No previous workflow rounds available"
|
||||
|
||||
def extractAvailableMethods(service: Any) -> str:
|
||||
"""Extract available methods for action planning. Maps to {{KEY:AVAILABLE_METHODS}}"""
|
||||
|
|
@ -390,7 +382,7 @@ def extractLatestRefinementFeedback(context: Any) -> str:
|
|||
def extractAvailableDocumentsSummary(service: Any, context: Any) -> str:
|
||||
"""Summary of available documents (count only)."""
|
||||
try:
|
||||
documents = service.workflow.getAvailableDocuments(context.workflow)
|
||||
documents = service.workflow.getAvailableDocuments(service.currentWorkflow)
|
||||
if documents and documents != "No documents available":
|
||||
# Count only actual documents, not list labels
|
||||
doc_count = documents.count("docItem:")
|
||||
|
|
@ -403,7 +395,7 @@ def extractAvailableDocumentsSummary(service: Any, context: Any) -> str:
|
|||
def extractAvailableDocumentsIndex(service: Any, context: Any) -> str:
|
||||
"""Index of available documents with detailed references for parameter generation."""
|
||||
try:
|
||||
return service.workflow.getAvailableDocuments(context.workflow)
|
||||
return service.workflow.getAvailableDocuments(service.currentWorkflow)
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting document index: {str(e)}")
|
||||
return "No documents available"
|
||||
|
|
|
|||
Loading…
Reference in a new issue