# gateway/modules/chat/documents/documentGeneration.py
# Listing metadata: 272 lines, 15 KiB, Python

import logging
from typing import Any, Dict, List, Optional
from datetime import datetime, UTC
from .documentUtility import (
getFileExtension,
getMimeTypeFromExtension,
detectMimeTypeFromContent,
detectMimeTypeFromData,
convertDocumentDataToString
)
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class DocumentGenerator:
def __init__(self, service):
self.service = service
def processActionResultDocuments(self, action_result, action, workflow) -> List[Dict[str, Any]]:
"""
Main function to process documents from an action result.
Returns a list of processed document dictionaries.
"""
try:
# Read documents from the standard documents field (not data.documents)
documents = action_result.documents if hasattr(action_result, 'documents') else []
if not documents:
logger.info(f"No documents found in action_result.documents for {action.execMethod}.{action.execAction}")
return []
logger.info(f"Processing {len(documents)} documents from action_result.documents")
# Check if documents are references (strings starting with "docItem:") or actual document objects
if documents and isinstance(documents[0], str) and documents[0].startswith("docItem:"):
# These are document references, resolve them to actual documents
logger.info(f"Resolving {len(documents)} document references to actual documents")
try:
actual_documents = self.service.getChatDocumentsFromDocumentList(documents)
logger.info(f"Resolved {len(actual_documents)} actual documents from references")
documents = actual_documents
except Exception as e:
logger.error(f"Error resolving document references: {str(e)}")
return []
processed_documents = []
for doc in documents:
processed_doc = self.processSingleDocument(doc, action)
if processed_doc:
processed_documents.append(processed_doc)
logger.info(f"Successfully processed {len(processed_documents)} documents")
return processed_documents
except Exception as e:
logger.error(f"Error processing action result documents: {str(e)}")
return []
def processSingleDocument(self, doc: Any, action) -> Optional[Dict[str, Any]]:
"""Process a single document from action result"""
try:
if hasattr(doc, 'filename') and doc.filename:
# Document object with filename attribute
mime_type = getattr(doc, 'mimeType', 'application/octet-stream')
if mime_type == "application/octet-stream":
content = getattr(doc, 'content', '')
mime_type = detectMimeTypeFromContent(content, doc.filename, self.service)
# Add result label to filename for document objects too
base_filename = doc.filename
if hasattr(action, 'execResultLabel') and action.execResultLabel:
result_label = action.execResultLabel.strip()
if result_label:
# Check if filename already starts with resultLabel to avoid duplication
if not base_filename.startswith(f"{result_label}-"):
base_filename = f"{result_label}-{base_filename}"
logger.info(f"Added resultLabel '{result_label}' as prefix to document object filename: {base_filename}")
else:
logger.info(f"Document object filename already has resultLabel prefix: {base_filename}")
return {
'filename': base_filename,
'fileSize': getattr(doc, 'fileSize', 0),
'mimeType': mime_type,
'content': getattr(doc, 'content', ''),
'document': doc
}
elif hasattr(doc, 'documentName') and doc.documentName:
# ActionDocument object with documentName attribute
base_filename = doc.documentName
mime_type = getattr(doc, 'mimeType', 'application/octet-stream')
content = getattr(doc, 'documentData', '')
# Add result label to filename for ActionDocument objects
if hasattr(action, 'execResultLabel') and action.execResultLabel:
result_label = action.execResultLabel.strip()
if result_label:
# Check if filename already starts with resultLabel to avoid duplication
if not base_filename.startswith(f"{result_label}-"):
base_filename = f"{result_label}-{base_filename}"
logger.info(f"Added resultLabel '{result_label}' as prefix to ActionDocument filename: {base_filename}")
else:
logger.info(f"ActionDocument filename already has resultLabel prefix: {base_filename}")
# Calculate file size from actual content
fileSize = len(str(content)) if content else 0
logger.info(f"Processed ActionDocument: {base_filename}, content length: {len(str(content))}, mimeType: {mime_type}")
return {
'filename': base_filename,
'fileSize': fileSize,
'mimeType': mime_type,
'content': content,
'document': doc
}
elif isinstance(doc, dict):
# Dictionary format document - handle both 'documentName' and 'filename' keys
base_filename = doc.get('documentName', doc.get('filename', ''))
# Debug logging for resultLabel
if hasattr(action, 'execResultLabel'):
logger.info(f"Action {action.execMethod}.{action.execAction} has execResultLabel: '{action.execResultLabel}' (type: {type(action.execResultLabel)})")
else:
logger.info(f"Action {action.execMethod}.{action.execAction} has NO execResultLabel attribute")
# If no filename provided, generate one with action info
if not base_filename:
base_filename = f"{action.execMethod}_{action.execAction}_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}"
# ALWAYS add result label to filename for better document selection
# This ensures consistent naming regardless of whether filename was provided or generated
if hasattr(action, 'execResultLabel') and action.execResultLabel:
result_label = action.execResultLabel.strip()
if result_label:
# Check if filename already starts with resultLabel to avoid duplication
if not base_filename.startswith(f"{result_label}-"):
base_filename = f"{result_label}-{base_filename}"
logger.info(f"Added resultLabel '{result_label}' as prefix to filename: {base_filename}")
else:
logger.info(f"Filename already has resultLabel prefix: {base_filename}")
else:
logger.info(f"No resultLabel available for action {action.execMethod}.{action.execAction}")
filename = base_filename
mimeType = doc.get('mimeType', 'application/octet-stream')
# Handle documentData structure - it might be a dict with 'content' key or direct content
document_data = doc.get('documentData', '')
if isinstance(document_data, dict) and 'content' in document_data:
# This is the structure returned by extract action: documentData.content
content = document_data['content']
# Also check for other potential content fields
if not content and 'data' in document_data:
content = document_data['data']
else:
# Direct content (fallback)
content = document_data
# Calculate file size from actual content
fileSize = len(str(content)) if content else 0
# Detect mime type if not specified
if mimeType == "application/octet-stream":
mimeType = detectMimeTypeFromContent(content, filename, self.service)
logger.info(f"Processed document: {filename}, content length: {len(str(content))}, mimeType: {mimeType}")
return {
'filename': filename,
'fileSize': fileSize,
'mimeType': mimeType,
'content': content,
'document': doc
}
else:
# Unknown document type
logger.warning(f"Unknown document type for action {action.execMethod}.{action.execAction}: {type(doc)}")
base_filename = f"{action.execMethod}_{action.execAction}_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}"
# ALWAYS add result label to filename for better document selection
# This ensures consistent naming regardless of document type
if hasattr(action, 'execResultLabel') and action.execResultLabel:
result_label = action.execResultLabel.strip()
if result_label:
# Check if filename already starts with resultLabel to avoid duplication
if not base_filename.startswith(f"{result_label}-"):
base_filename = f"{result_label}-{base_filename}"
logger.info(f"Added resultLabel '{result_label}' as prefix to fallback filename: {base_filename}")
else:
logger.info(f"Fallback filename already has resultLabel prefix: {base_filename}")
else:
logger.info(f"No resultLabel available for action {action.execMethod}.{action.execAction}")
filename = base_filename
mimeType = detectMimeTypeFromContent(doc, filename, self.service)
return {
'filename': filename,
'fileSize': 0,
'mimeType': mimeType,
'content': str(doc),
'document': doc
}
except Exception as e:
logger.error(f"Error processing single document: {str(e)}")
return None
def createDocumentsFromActionResult(self, action_result, action, workflow) -> List[Any]:
"""
Create actual document objects from action result and store them in the system.
Returns a list of created document objects.
"""
try:
logger.info(f"Creating documents from action result for {action.execMethod}.{action.execAction}")
logger.info(f"Action result documents count: {len(action_result.documents) if action_result.documents else 0}")
processed_docs = self.processActionResultDocuments(action_result, action, workflow)
logger.info(f"Processed {len(processed_docs)} documents")
created_documents = []
for i, doc_data in enumerate(processed_docs):
try:
document_name = doc_data['filename']
document_data = doc_data['content']
mime_type = doc_data['mimeType']
logger.info(f"Creating document {i+1}: {document_name} (mime: {mime_type}, content length: {len(str(document_data))})")
# Convert document data to string content
content = convertDocumentDataToString(document_data, getFileExtension(document_name))
# Skip empty or minimal content
minimal_content_patterns = ['{}', '[]', 'null', '""', "''"]
if not content or content.strip() == "" or content.strip() in minimal_content_patterns:
logger.warning(f"Empty or minimal content for document {document_name}, skipping")
continue
logger.info(f"Document {document_name} has content: {len(content)} characters")
# Create file in system
file_id = self.service.createFile(
fileName=document_name,
mimeType=mime_type,
content=content,
base64encoded=False
)
if not file_id:
logger.error(f"Failed to create file for document {document_name}")
continue
logger.info(f"Created file with ID: {file_id}")
# Create document object using existing file ID
document = self.service.createDocument(
fileName=document_name,
mimeType=mime_type,
content=content,
base64encoded=False,
existing_file_id=file_id
)
if document:
created_documents.append(document)
logger.info(f"Successfully created ChatDocument: {document_name} (ID: {getattr(document, 'id', 'N/A')}, fileId: {getattr(document, 'fileId', 'N/A')})")
else:
logger.error(f"Failed to create ChatDocument object for {document_name}")
except Exception as e:
logger.error(f"Error creating document {doc_data.get('filename', 'unknown')}: {str(e)}")
continue
logger.info(f"Successfully created {len(created_documents)} documents")
return created_documents
except Exception as e:
logger.error(f"Error creating documents from action result: {str(e)}")
return []