"""Generation service: converts AI action-result documents into stored files and renders reports."""
import logging
|
|
import uuid
|
|
from typing import Any, Dict, List, Optional, Union, Tuple
|
|
from datetime import datetime, UTC
|
|
import re
|
|
from modules.shared.timezoneUtils import get_utc_timestamp
|
|
from modules.datamodels.datamodelChat import ChatDocument
|
|
from modules.services.serviceGeneration.subDocumentUtility import (
|
|
getFileExtension,
|
|
getMimeTypeFromExtension,
|
|
detectMimeTypeFromContent,
|
|
detectMimeTypeFromData,
|
|
convertDocumentDataToString
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class GenerationService:
    """Service that turns AI action results into stored documents and rendered reports."""

    def __init__(self, serviceCenter=None):
        """Wire interfaces straight off the provided service center (no self.service indirection)."""
        self.services = serviceCenter
        # Pull each interface from the center when one is supplied; otherwise leave them unset.
        if serviceCenter:
            self.interfaceDbComponent = getattr(serviceCenter, 'interfaceDbComponent', None)
            self.interfaceDbChat = getattr(serviceCenter, 'interfaceDbChat', None)
            self.workflow = getattr(serviceCenter, 'workflow', None)
        else:
            self.interfaceDbComponent = None
            self.interfaceDbChat = None
            self.workflow = None
|
|
|
def processActionResultDocuments(self, action_result, action, workflow) -> List[Dict[str, Any]]:
|
|
"""
|
|
Process documents produced by AI actions and convert them to ChatDocument format.
|
|
This function handles AI-generated document data, not document references.
|
|
Returns a list of processed document dictionaries.
|
|
"""
|
|
try:
|
|
# Read documents from the standard documents field (not data.documents)
|
|
documents = action_result.documents if action_result and hasattr(action_result, 'documents') else []
|
|
|
|
if not documents:
|
|
logger.info(f"No documents found in action_result.documents for {action.execMethod}.{action.execAction}")
|
|
return []
|
|
|
|
logger.info(f"Processing {len(documents)} documents from action_result.documents")
|
|
|
|
# Process each document from the AI action result
|
|
processed_documents = []
|
|
for doc in documents:
|
|
processed_doc = self.processSingleDocument(doc, action)
|
|
if processed_doc:
|
|
processed_documents.append(processed_doc)
|
|
|
|
logger.info(f"Successfully processed {len(processed_documents)} documents")
|
|
return processed_documents
|
|
except Exception as e:
|
|
logger.error(f"Error processing action result documents: {str(e)}")
|
|
return []
|
|
|
|
def processSingleDocument(self, doc: Any, action) -> Optional[Dict[str, Any]]:
|
|
"""Process a single document from action result with simplified logic"""
|
|
try:
|
|
# ActionDocument objects have documentName, documentData, and mimeType
|
|
mime_type = doc.mimeType
|
|
if mime_type == "application/octet-stream":
|
|
content = doc.documentData
|
|
# Detect MIME without relying on a service center
|
|
mime_type = detectMimeTypeFromContent(content, doc.documentName)
|
|
|
|
return {
|
|
'fileName': doc.documentName,
|
|
'fileSize': len(str(doc.documentData)),
|
|
'mimeType': mime_type,
|
|
'content': doc.documentData,
|
|
'document': doc
|
|
}
|
|
except Exception as e:
|
|
logger.error(f"Error processing single document: {str(e)}")
|
|
return None
|
|
|
|
def createDocumentsFromActionResult(self, action_result, action, workflow, message_id=None) -> List[Any]:
|
|
"""
|
|
Create actual document objects from action result and store them in the system.
|
|
Returns a list of created document objects with proper workflow context.
|
|
"""
|
|
try:
|
|
logger.info(f"Creating documents from action result for {action.execMethod}.{action.execAction}")
|
|
logger.info(f"Action result documents count: {len(action_result.documents) if action_result.documents else 0}")
|
|
|
|
processed_docs = self.processActionResultDocuments(action_result, action, workflow)
|
|
logger.info(f"Processed {len(processed_docs)} documents")
|
|
|
|
created_documents = []
|
|
for i, doc_data in enumerate(processed_docs):
|
|
try:
|
|
document_name = doc_data['fileName']
|
|
document_data = doc_data['content']
|
|
mime_type = doc_data['mimeType']
|
|
|
|
logger.info(f"Creating document {i+1}: {document_name} (mime: {mime_type}, content length: {len(str(document_data))})")
|
|
|
|
# Convert document data to string content
|
|
content = convertDocumentDataToString(document_data, getFileExtension(document_name))
|
|
|
|
# Skip empty or minimal content
|
|
minimal_content_patterns = ['{}', '[]', 'null', '""', "''"]
|
|
if not content or content.strip() == "" or content.strip() in minimal_content_patterns:
|
|
logger.warning(f"Empty or minimal content for document {document_name}, skipping")
|
|
continue
|
|
|
|
logger.info(f"Document {document_name} has content: {len(content)} characters")
|
|
|
|
# Normalize file extension based on mime type if missing or incorrect
|
|
try:
|
|
mime_to_ext = {
|
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
|
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
|
|
"application/pdf": ".pdf",
|
|
"text/html": ".html",
|
|
"text/markdown": ".md",
|
|
"text/plain": ".txt",
|
|
"application/json": ".json",
|
|
}
|
|
expected_ext = mime_to_ext.get(mime_type)
|
|
if expected_ext:
|
|
if not document_name.lower().endswith(expected_ext):
|
|
# Append/replace extension to match mime type
|
|
if "." in document_name:
|
|
document_name = document_name.rsplit(".", 1)[0] + expected_ext
|
|
else:
|
|
document_name = document_name + expected_ext
|
|
except Exception:
|
|
pass
|
|
|
|
# Decide if content is base64-encoded binary (e.g., docx/pdf) or plain text
|
|
base64encoded = False
|
|
try:
|
|
binary_mime_types = {
|
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
"application/pdf",
|
|
}
|
|
if isinstance(document_data, str) and mime_type in binary_mime_types:
|
|
base64encoded = True
|
|
except Exception:
|
|
base64encoded = False
|
|
|
|
# Create document with file in one step using interfaces directly
|
|
document = self._createDocument(
|
|
fileName=document_name,
|
|
mimeType=mime_type,
|
|
content=content,
|
|
base64encoded=base64encoded,
|
|
messageId=message_id
|
|
)
|
|
if document:
|
|
# Set workflow context on the document if possible
|
|
self._setDocumentWorkflowContext(document, action, workflow)
|
|
created_documents.append(document)
|
|
logger.info(f"Successfully created ChatDocument: {document_name} (ID: {document.id if hasattr(document, 'id') else 'N/A'}, fileId: {document.fileId if hasattr(document, 'fileId') else 'N/A'})")
|
|
else:
|
|
logger.error(f"Failed to create ChatDocument object for {document_name}")
|
|
except Exception as e:
|
|
logger.error(f"Error creating document {doc_data.get('fileName', 'unknown')}: {str(e)}")
|
|
continue
|
|
|
|
logger.info(f"Successfully created {len(created_documents)} documents")
|
|
return created_documents
|
|
except Exception as e:
|
|
logger.error(f"Error creating documents from action result: {str(e)}")
|
|
return []
|
|
|
|
def _setDocumentWorkflowContext(self, document, action, workflow):
|
|
"""Set workflow context on a document for proper routing and labeling"""
|
|
try:
|
|
# Get current workflow context directly from workflow object
|
|
workflow_context = self._getWorkflowContext(workflow)
|
|
workflow_stats = self._getWorkflowStats(workflow)
|
|
|
|
current_round = workflow_context.get('currentRound', 0)
|
|
current_task = workflow_context.get('currentTask', 0)
|
|
current_action = workflow_context.get('currentAction', 0)
|
|
|
|
# Try to set workflow context attributes if they exist
|
|
if hasattr(document, 'roundNumber'):
|
|
document.roundNumber = current_round
|
|
if hasattr(document, 'taskNumber'):
|
|
document.taskNumber = current_task
|
|
if hasattr(document, 'actionNumber'):
|
|
document.actionNumber = current_action
|
|
if hasattr(document, 'actionId'):
|
|
document.actionId = action.id if hasattr(action, 'id') else None
|
|
|
|
# Set additional workflow metadata if available
|
|
if hasattr(document, 'workflowId'):
|
|
document.workflowId = workflow_stats.get('workflowId', workflow.id if hasattr(workflow, 'id') else None)
|
|
if hasattr(document, 'workflowStatus'):
|
|
document.workflowStatus = workflow_stats.get('workflowStatus', workflow.status if hasattr(workflow, 'status') else 'unknown')
|
|
|
|
logger.debug(f"Set workflow context on document: Round {current_round}, Task {current_task}, Action {current_action}")
|
|
logger.debug(f"Document workflow metadata: ID={document.workflowId if hasattr(document, 'workflowId') else 'N/A'}, Status={document.workflowStatus if hasattr(document, 'workflowStatus') else 'N/A'}")
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Could not set workflow context on document: {str(e)}")
|
|
|
|
def _createDocument(self, fileName: str, mimeType: str, content: str, base64encoded: bool = True, messageId: str = None) -> Optional[ChatDocument]:
|
|
"""Create file and ChatDocument using interfaces without service indirection."""
|
|
try:
|
|
if not self.interfaceDbComponent:
|
|
logger.error("Component interface not available for document creation")
|
|
return None
|
|
# Convert content to bytes
|
|
if base64encoded:
|
|
import base64
|
|
content_bytes = base64.b64decode(content)
|
|
else:
|
|
content_bytes = content.encode('utf-8')
|
|
# Create file and store data
|
|
file_item = self.interfaceDbComponent.createFile(
|
|
name=fileName,
|
|
mimeType=mimeType,
|
|
content=content_bytes
|
|
)
|
|
self.interfaceDbComponent.createFileData(file_item.id, content_bytes)
|
|
# Collect file info
|
|
file_info = self._getFileInfo(file_item.id)
|
|
if not file_info:
|
|
logger.error(f"Could not get file info for fileId: {file_item.id}")
|
|
return None
|
|
# Build ChatDocument
|
|
document = ChatDocument(
|
|
id=str(uuid.uuid4()),
|
|
messageId=messageId or "",
|
|
fileId=file_item.id,
|
|
fileName=file_info.get("fileName", fileName),
|
|
fileSize=file_info.get("size", 0),
|
|
mimeType=file_info.get("mimeType", mimeType)
|
|
)
|
|
# Ensure document can access component interface later
|
|
if hasattr(document, 'setComponentInterface') and self.interfaceDbComponent:
|
|
try:
|
|
document.setComponentInterface(self.interfaceDbComponent)
|
|
except Exception:
|
|
pass
|
|
return document
|
|
except Exception as e:
|
|
logger.error(f"Error creating document: {str(e)}")
|
|
return None
|
|
|
|
def _getFileInfo(self, fileId: str) -> Optional[Dict[str, Any]]:
|
|
try:
|
|
if not self.interfaceDbComponent:
|
|
return None
|
|
file_item = self.interfaceDbComponent.getFile(fileId)
|
|
if file_item:
|
|
return {
|
|
"id": file_item.id,
|
|
"fileName": file_item.fileName,
|
|
"size": file_item.fileSize,
|
|
"mimeType": file_item.mimeType,
|
|
"fileHash": getattr(file_item, 'fileHash', None),
|
|
"creationDate": getattr(file_item, 'creationDate', None)
|
|
}
|
|
return None
|
|
except Exception as e:
|
|
logger.error(f"Error getting file info for {fileId}: {str(e)}")
|
|
return None
|
|
|
|
def _getWorkflowContext(self, workflow) -> Dict[str, int]:
|
|
try:
|
|
return {
|
|
'currentRound': getattr(workflow, 'currentRound', 0),
|
|
'currentTask': getattr(workflow, 'currentTask', 0),
|
|
'currentAction': getattr(workflow, 'currentAction', 0)
|
|
}
|
|
except Exception:
|
|
return {'currentRound': 0, 'currentTask': 0, 'currentAction': 0}
|
|
|
|
def _getWorkflowStats(self, workflow) -> Dict[str, Any]:
|
|
try:
|
|
context = self._getWorkflowContext(workflow)
|
|
return {
|
|
'currentRound': context['currentRound'],
|
|
'currentTask': context['currentTask'],
|
|
'currentAction': context['currentAction'],
|
|
'totalTasks': getattr(workflow, 'totalTasks', 0),
|
|
'totalActions': getattr(workflow, 'totalActions', 0),
|
|
'workflowStatus': getattr(workflow, 'status', 'unknown'),
|
|
'workflowId': getattr(workflow, 'id', 'unknown')
|
|
}
|
|
except Exception:
|
|
return {
|
|
'currentRound': 0,
|
|
'currentTask': 0,
|
|
'currentAction': 0,
|
|
'totalTasks': 0,
|
|
'totalActions': 0,
|
|
'workflowStatus': 'unknown',
|
|
'workflowId': 'unknown'
|
|
}
|
|
|
|
async def renderReport(self, extractedContent: Dict[str, Any], outputFormat: str, title: str, userPrompt: str = None, aiService=None) -> tuple[str, str]:
|
|
"""
|
|
Render extracted JSON content to the specified output format.
|
|
|
|
Args:
|
|
extractedContent: Structured JSON document from AI extraction
|
|
outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
|
|
title: Report title
|
|
userPrompt: User's original prompt for report generation
|
|
aiService: AI service instance for generation prompt creation
|
|
|
|
Returns:
|
|
tuple: (rendered_content, mime_type)
|
|
"""
|
|
try:
|
|
# Validate JSON input
|
|
if not isinstance(extractedContent, dict):
|
|
raise ValueError("extractedContent must be a JSON dictionary")
|
|
|
|
if "sections" not in extractedContent:
|
|
raise ValueError("extractedContent must contain 'sections' field")
|
|
|
|
# DEBUG: Log renderer input metadata only (no verbose JSON) - only if debug enabled
|
|
try:
|
|
debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
|
if debug_enabled:
|
|
import os
|
|
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
|
debug_root = "./test-chat/ai"
|
|
debug_dir = os.path.join(debug_root, f"render_input_{ts}")
|
|
os.makedirs(debug_dir, exist_ok=True)
|
|
with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f:
|
|
f.write(f"title: {title}\nformat: {outputFormat}\ncontent_type: {type(extractedContent).__name__}\n")
|
|
f.write(f"content_size: {len(str(extractedContent))} characters\n")
|
|
f.write(f"sections_count: {len(extractedContent.get('sections', []))}\n")
|
|
except Exception:
|
|
pass
|
|
|
|
# Get the appropriate renderer for the format
|
|
renderer = self._getFormatRenderer(outputFormat)
|
|
if not renderer:
|
|
raise ValueError(f"Unsupported output format: {outputFormat}")
|
|
|
|
# Generate AI-based generation prompt if AI service is available
|
|
generationPrompt = userPrompt # Default to user prompt
|
|
if aiService and userPrompt:
|
|
try:
|
|
from .subPromptBuilder import buildGenerationPrompt
|
|
generationPrompt = await buildGenerationPrompt(
|
|
outputFormat=outputFormat,
|
|
userPrompt=userPrompt,
|
|
title=title,
|
|
aiService=aiService,
|
|
services=self.services
|
|
)
|
|
except Exception as e:
|
|
logger.warning(f"Failed to generate AI-based generation prompt: {str(e)}, using user prompt")
|
|
generationPrompt = userPrompt
|
|
|
|
# Render the JSON content with AI-generated prompt
|
|
renderedContent, mimeType = await renderer.render(extractedContent, title, generationPrompt, aiService)
|
|
# DEBUG: dump rendered output
|
|
try:
|
|
import os
|
|
with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f:
|
|
f.write(renderedContent or "")
|
|
except Exception:
|
|
pass
|
|
|
|
logger.info(f"Successfully rendered JSON report to {outputFormat} format: {len(renderedContent)} characters")
|
|
return renderedContent, mimeType
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error rendering JSON report to {outputFormat}: {str(e)}")
|
|
raise
|
|
|
|
async def getAdaptiveExtractionPrompt(
|
|
self,
|
|
outputFormat: str,
|
|
userPrompt: str,
|
|
title: str,
|
|
promptAnalysis: Dict[str, Any],
|
|
aiService=None
|
|
) -> str:
|
|
"""Get adaptive extraction prompt based on AI analysis."""
|
|
from .subPromptBuilder import buildAdaptiveExtractionPrompt
|
|
return await buildAdaptiveExtractionPrompt(
|
|
outputFormat=outputFormat,
|
|
userPrompt=userPrompt,
|
|
title=title,
|
|
promptAnalysis=promptAnalysis,
|
|
aiService=aiService,
|
|
services=self.services
|
|
)
|
|
|
|
async def getGenericExtractionPrompt(
|
|
self,
|
|
outputFormat: str,
|
|
userPrompt: str,
|
|
title: str,
|
|
aiService=None
|
|
) -> str:
|
|
"""Get generic extraction prompt that works for both single and multi-file."""
|
|
from .subPromptBuilder import buildGenericExtractionPrompt
|
|
return await buildGenericExtractionPrompt(
|
|
outputFormat=outputFormat,
|
|
userPrompt=userPrompt,
|
|
title=title,
|
|
aiService=aiService,
|
|
services=self.services
|
|
)
|
|
|
|
async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
|
|
"""
|
|
Get the format-specific extraction prompt for AI content extraction.
|
|
|
|
Args:
|
|
outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
|
|
userPrompt: User's original prompt for report generation
|
|
title: Report title
|
|
aiService: AI service instance for intent extraction
|
|
|
|
Returns:
|
|
str: Format-specific prompt for AI extraction
|
|
"""
|
|
try:
|
|
# Get the appropriate renderer for the format
|
|
renderer = self._getFormatRenderer(outputFormat)
|
|
if not renderer:
|
|
raise ValueError(f"Unsupported output format: {outputFormat}")
|
|
|
|
# Build centralized prompt with generic rules + format-specific guidelines
|
|
from .subPromptBuilder import buildExtractionPrompt
|
|
extractionPrompt = await buildExtractionPrompt(
|
|
outputFormat=outputFormat,
|
|
renderer=renderer,
|
|
userPrompt=userPrompt,
|
|
title=title,
|
|
aiService=aiService,
|
|
services=self.services
|
|
)
|
|
|
|
logger.info(f"Generated {outputFormat}-specific extraction prompt: {len(extractionPrompt)} characters")
|
|
return extractionPrompt
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}")
|
|
raise
|
|
|
|
async def renderAdaptiveReport(
|
|
self,
|
|
extractedContent: Dict[str, Any],
|
|
outputFormat: str,
|
|
title: str,
|
|
userPrompt: str = None,
|
|
aiService=None,
|
|
isMultiFile: bool = False
|
|
) -> Union[Tuple[str, str], List[Dict[str, Any]]]:
|
|
"""Render report adaptively based on content structure."""
|
|
|
|
if isMultiFile and "documents" in extractedContent:
|
|
return await self._renderMultiFileReport(
|
|
extractedContent, outputFormat, title, userPrompt, aiService
|
|
)
|
|
else:
|
|
return await self._renderSingleFileReport(
|
|
extractedContent, outputFormat, title, userPrompt, aiService
|
|
)
|
|
|
|
async def _renderMultiFileReport(
|
|
self,
|
|
extractedContent: Dict[str, Any],
|
|
outputFormat: str,
|
|
title: str,
|
|
userPrompt: str = None,
|
|
aiService=None
|
|
) -> List[Dict[str, Any]]:
|
|
"""Render multiple documents from extracted content."""
|
|
|
|
generated_documents = []
|
|
|
|
for doc_data in extractedContent.get("documents", []):
|
|
# Use existing single-file renderer for each document
|
|
renderer = self._getFormatRenderer(outputFormat)
|
|
if not renderer:
|
|
continue
|
|
|
|
# Render individual document
|
|
rendered_content, mime_type = await renderer.render(
|
|
extractedContent={"sections": doc_data["sections"]},
|
|
title=doc_data["title"],
|
|
userPrompt=userPrompt,
|
|
aiService=aiService
|
|
)
|
|
|
|
generated_documents.append({
|
|
"filename": doc_data["filename"],
|
|
"content": rendered_content,
|
|
"mime_type": mime_type,
|
|
"title": doc_data["title"]
|
|
})
|
|
|
|
return generated_documents
|
|
|
|
async def _renderSingleFileReport(
|
|
self,
|
|
extractedContent: Dict[str, Any],
|
|
outputFormat: str,
|
|
title: str,
|
|
userPrompt: str = None,
|
|
aiService=None
|
|
) -> Tuple[str, str]:
|
|
"""Render single file report (existing functionality)."""
|
|
# Use existing renderReport method
|
|
return await self.renderReport(
|
|
extractedContent, outputFormat, title, userPrompt, aiService
|
|
)
|
|
|
|
def _getFormatRenderer(self, output_format: str):
|
|
"""Get the appropriate renderer for the specified format using auto-discovery."""
|
|
try:
|
|
from .renderers.registry import get_renderer
|
|
renderer = get_renderer(output_format, services=self.services)
|
|
|
|
if renderer:
|
|
return renderer
|
|
|
|
# Fallback to text renderer if no specific renderer found
|
|
logger.warning(f"No renderer found for format {output_format}, falling back to text")
|
|
fallback_renderer = get_renderer('text', services=self.services)
|
|
if fallback_renderer:
|
|
return fallback_renderer
|
|
|
|
logger.error("Even text renderer fallback failed")
|
|
return None
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting renderer for {output_format}: {str(e)}")
|
|
return None |