# gateway/modules/workflows/methods/methodAi/actions/generateDocument.py
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Generate Document action for AI operations.
Generates documents from scratch or based on templates/inputs using hierarchical approach.
"""

import base64
import logging
import time
from typing import Dict, Any
from modules.workflows.methods.methodBase import action
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy
from modules.services.serviceGeneration.subStructureGenerator import StructureGenerator
from modules.services.serviceGeneration.subContentGenerator import ContentGenerator
from modules.services.serviceGeneration.subDocumentPurposeAnalyzer import DocumentPurposeAnalyzer
logger = logging.getLogger(__name__)


@action
async def generateDocument(self, parameters: Dict[str, Any]) -> ActionResult:
"""
GENERAL:
- Purpose: Generate documents from scratch or based on templates/inputs using hierarchical approach.
- Input requirements: prompt or description (required); optional documentList (for templates/references).
- Output format: Document in specified format. Any format supported by dynamically registered renderers is acceptable (default: txt).
Parameters:
- prompt (str, required): Description of the document to generate.
- documentList (list, optional): Template documents or reference documents to use as a guide.
- documentType (str, optional): Type of document - letter, memo, proposal, contract, etc.
- resultType (str, optional): Output format. Any format supported by dynamically registered renderers is acceptable (formats are discovered automatically from renderer registry). Common formats: txt, html, pdf, docx, md, json, csv, xlsx, pptx, png, jpg. Default: txt.
- maxSectionLength (int, optional): Maximum words for simple sections. Default: 500.
- parallelGeneration (bool, optional): Enable parallel section generation. Default: True.
- progressLogging (bool, optional): Send ChatLog progress updates. Default: True.
"""
    prompt = parameters.get("prompt")
    if not prompt:
        return ActionResult.isFailure(error="prompt is required")
    documentList = parameters.get("documentList", [])
    documentType = parameters.get("documentType")
    resultType = parameters.get("resultType", "txt")

    # Auto-detect format from prompt if not explicitly provided
    if resultType == "txt" and prompt:
        promptLower = prompt.lower()
        if "html" in promptLower or "html5" in promptLower:
            resultType = "html"
            logger.info("Auto-detected HTML format from prompt")
        elif "pdf" in promptLower:
            resultType = "pdf"
            logger.info("Auto-detected PDF format from prompt")
        elif "markdown" in promptLower or " md " in promptLower or promptLower.endswith(" md"):
            resultType = "md"
            logger.info("Auto-detected Markdown format from prompt")
        elif ("text" in promptLower or "txt" in promptLower) and "html" not in promptLower:
            resultType = "txt"
            logger.info("Auto-detected Text format from prompt")

    maxSectionLength = parameters.get("maxSectionLength", 500)
    parallelGeneration = parameters.get("parallelGeneration", True)
    progressLogging = parameters.get("progressLogging", True)

    # Create operation ID for progress tracking
    workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
    operationId = f"doc_gen_{workflowId}_{int(time.time())}"
    parentOperationId = parameters.get("parentOperationId")
    try:
        # Phase 1: Structure Generation
        if progressLogging:
            self.services.chat.progressLogStart(
                operationId,
                "Document",
                "Structure Generation",
                "Generating document structure...",
                parentOperationId=parentOperationId
            )
        structureGenerator = StructureGenerator(self.services)

        # Analyze document purposes and process documents accordingly
        cachedContent = None
        imageDocuments = []
        documentPurposes = {}
        if documentList:
            if progressLogging:
                self.services.chat.progressLogUpdate(operationId, 0.1, "Analyzing document purposes...")
            # Convert documentList to DocumentReferenceList
            from modules.datamodels.datamodelDocref import DocumentReferenceList
            if isinstance(documentList, DocumentReferenceList):
                docRefList = documentList
            elif isinstance(documentList, str):
                docRefList = DocumentReferenceList.from_string_list([documentList])
            elif isinstance(documentList, list):
                docRefList = DocumentReferenceList.from_string_list(documentList)
            else:
                docRefList = DocumentReferenceList(references=[])
            # Get ChatDocuments
            chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(docRefList)
            if chatDocuments:
                logger.info(f"Analyzing purposes for {len(chatDocuments)} documents")
                # Analyze document purposes using AI
                purposeAnalyzer = DocumentPurposeAnalyzer(self.services)
                purposeAnalysis = await purposeAnalyzer.analyzeDocumentPurposes(
                    userPrompt=prompt,
                    chatDocuments=chatDocuments,
                    actionContext="generateDocument"
                )
                documentPurposes = {dp["document_id"]: dp for dp in purposeAnalysis.get("document_purposes", [])}
                logger.info(f"Purpose analysis complete: {purposeAnalysis.get('overall_intent', 'N/A')}")
                # Separate documents by purpose
                textDocs = []
                imageDocsToInclude = []
                imageDocsToAnalyze = []
                for doc in chatDocuments:
                    docPurpose = documentPurposes.get(doc.id, {})
                    purpose = docPurpose.get("purpose", "extract_text_content")
                    if purpose == "include_image":
                        imageDocsToInclude.append(doc)
                    elif purpose == "analyze_image_vision":
                        imageDocsToAnalyze.append(doc)
                    elif purpose in ["extract_text_content", "use_as_template", "use_as_reference", "extract_data"]:
                        textDocs.append(doc)
                    # Skip "attach" purpose - don't process

                # Process text documents (extract content)
                extractedResults = []
                if textDocs:
                    if progressLogging:
                        self.services.chat.progressLogUpdate(operationId, 0.15, f"Extracting content from {len(textDocs)} text document(s)...")
                    # Prepare extraction options with purpose-specific prompts
                    extractionOptionsList = []
                    for doc in textDocs:
                        docPurpose = documentPurposes.get(doc.id, {})
                        extractionPrompt = docPurpose.get("extractionPrompt") or "Extract all content from the document"
                        extractionOptions = ExtractionOptions(
                            prompt=extractionPrompt,
                            mergeStrategy=MergeStrategy(
                                mergeType="concatenate",
                                groupBy="typeGroup",
                                orderBy="id"
                            ),
                            processDocumentsIndividually=True
                        )
                        extractionOptionsList.append((doc, extractionOptions))
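                    # The MergeStrategy above is assumed to concatenate
                    # per-document results grouped by type group and ordered by
                    # id; processDocumentsIndividually=True keeps each
                    # purpose-specific prompt scoped to its own document.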
                    # Extract content from text documents
                    for doc, extractionOptions in extractionOptionsList:
                        try:
                            docResults = self.services.extraction.extractContent(
                                [doc],
                                extractionOptions,
                                parentOperationId=operationId
                            )
                            extractedResults.extend(docResults)
                        except Exception as e:
                            logger.error(f"Error extracting content from {doc.fileName}: {str(e)}")
                    logger.info(f"Extracted content from {len(extractedResults)} text document(s)")

                # Process images to analyze (vision call)
                if imageDocsToAnalyze:
                    if progressLogging:
                        self.services.chat.progressLogUpdate(operationId, 0.2, f"Analyzing {len(imageDocsToAnalyze)} image(s) with vision AI...")
                    # Extract content from images using vision analysis
                    for doc in imageDocsToAnalyze:
                        try:
                            docPurpose = documentPurposes.get(doc.id, {})
                            extractionPrompt = docPurpose.get("extractionPrompt") or "Extract all text and information from this image"
                            extractionOptions = ExtractionOptions(
                                prompt=extractionPrompt,
                                mergeStrategy=MergeStrategy(
                                    mergeType="concatenate",
                                    groupBy="typeGroup",
                                    orderBy="id"
                                ),
                                processDocumentsIndividually=True
                            )
                            docResults = self.services.extraction.extractContent(
                                [doc],
                                extractionOptions,
                                parentOperationId=operationId
                            )
                            extractedResults.extend(docResults)
                        except Exception as e:
                            logger.error(f"Error analyzing image {doc.fileName}: {str(e)}")
                    logger.info(f"Analyzed {len(imageDocsToAnalyze)} image(s) with vision AI")
                # Process images to include (store image data)
                if imageDocsToInclude:
                    if progressLogging:
                        self.services.chat.progressLogUpdate(operationId, 0.25, f"Preparing {len(imageDocsToInclude)} image(s) for inclusion...")
                    # Get image data for inclusion
                    from modules.interfaces.interfaceDbComponentObjects import getInterface
                    dbInterface = getInterface()
                    for doc in imageDocsToInclude:
                        try:
                            # Get image bytes
                            imageBytes = dbInterface.getFileData(doc.fileId)
                            if imageBytes:
                                # Encode to base64
                                base64Data = base64.b64encode(imageBytes).decode('utf-8')
                                # Create image document entry
                                imageDoc = {
                                    "id": doc.id,
                                    "fileName": doc.fileName,
                                    "mimeType": doc.mimeType,
                                    "base64Data": base64Data,
                                    "altText": doc.fileName or "Image",
                                    "fileSize": doc.fileSize
                                }
                                imageDocuments.append(imageDoc)
                                logger.debug(f"Prepared image {doc.fileName} for inclusion ({len(base64Data)} chars base64)")
                            else:
                                logger.warning(f"Could not retrieve image data for {doc.fileName}")
                        except Exception as e:
                            logger.error(f"Error preparing image {doc.fileName} for inclusion: {str(e)}")
                    logger.info(f"Prepared {len(imageDocuments)} image(s) for inclusion")

                # Build cachedContent with all information
                cachedContent = {
                    "extractedContent": extractedResults,
                    "imageDocuments": imageDocuments,
                    "documentPurposes": documentPurposes,
                    "extractionTimestamp": time.time(),
                    "sourceDocuments": [doc.id for doc in chatDocuments]
                }
                logger.info(f"Document processing complete: {len(extractedResults)} extracted, {len(imageDocuments)} images to include")
        # Generate structure
        if progressLogging:
            self.services.chat.progressLogUpdate(operationId, 0.3, "Generating document structure...")
        structure = await structureGenerator.generateStructure(
            userPrompt=prompt,
            documentList=documentList if documentList else None,
            cachedContent=cachedContent,
            maxSectionLength=maxSectionLength,
            existingImages=imageDocuments  # Pass existing images for structure generation
        )
        if progressLogging:
            self.services.chat.progressLogUpdate(operationId, 0.33, "Structure generated")
        # Phase 2: Content Generation
        if progressLogging:
            self.services.chat.progressLogUpdate(
                operationId,
                0.34,
                "Starting content generation..."
            )
        contentGenerator = ContentGenerator(self.services)

        # Create enhanced progress callback
        def progressCallback(sectionIndex: int, totalSections: int, message: str):
            if progressLogging:
                # Calculate progress: 34% to 90% for content generation phase
                if totalSections > 0:
                    progress = 0.34 + (0.56 * (sectionIndex / totalSections))
                else:
                    progress = 0.34
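                # e.g. section 3 of 8 reports progress 0.34 + 0.56 * (3 / 8) = 0.55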
                # Format message
                if sectionIndex > 0 and totalSections > 0:
                    progressMessage = f"Section {sectionIndex}/{totalSections}: {message}"
                else:
                    progressMessage = message
                self.services.chat.progressLogUpdate(
                    operationId,
                    progress,
                    progressMessage
                )

        completeStructure = await contentGenerator.generateContent(
            structure=structure,
            cachedContent=cachedContent,
            userPrompt=prompt,
            progressCallback=progressCallback,
            parallelGeneration=parallelGeneration
        )
        if progressLogging:
            self.services.chat.progressLogUpdate(operationId, 0.90, "Content generated")
        # Phase 3: Integration & Rendering
        if progressLogging:
            self.services.chat.progressLogUpdate(
                operationId,
                0.91,
                "Rendering final document..."
            )
        # Use the existing renderReport method
        title = structure.get("metadata", {}).get("title", "Generated Document")
        if documentType:
            title = f"{title} ({documentType})"
        renderedContent, mimeType, images = await self.services.generation.renderReport(
            extractedContent=completeStructure,
            outputFormat=resultType,
            title=title,
            userPrompt=prompt,
            aiService=self.services.ai
        )
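        # renderReport is expected to return (content, mimeType, images), where
        # each images entry is a dict read below via keys like "base64Data",
        # "altText", "caption", "sectionId" and optionally "filename".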
        # Build list of documents to return
        documents = [
            ActionDocument(
                documentName=f"document.{resultType}",
                documentData=renderedContent,
                mimeType=mimeType
            )
        ]
        # Add images as separate documents
        if images:
            logger.info(f"Processing {len(images)} image(s) from renderer")
            for idx, imageData in enumerate(images):
                try:
                    base64Data = imageData.get("base64Data", "")
                    altText = imageData.get("altText", f"image_{idx + 1}")
                    caption = imageData.get("caption", "")
                    sectionId = imageData.get("sectionId", f"section_{idx + 1}")
                    if base64Data:
                        # Decode base64 to bytes
                        imageBytes = base64.b64decode(base64Data)
                        # Determine filename and mime type
                        filename = imageData.get("filename", f"image_{idx + 1}.png")
                        if not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')):
                            filename = f"image_{idx + 1}.png"
                        # Determine mime type from filename
                        if filename.lower().endswith('.png'):
                            imageMimeType = "image/png"
                        elif filename.lower().endswith(('.jpg', '.jpeg')):
                            imageMimeType = "image/jpeg"
                        elif filename.lower().endswith('.gif'):
                            imageMimeType = "image/gif"
                        elif filename.lower().endswith('.webp'):
                            imageMimeType = "image/webp"
                        else:
                            imageMimeType = "image/png"  # Default
                        # Add image document
                        documents.append(ActionDocument(
                            documentName=filename,
                            documentData=imageBytes,
                            mimeType=imageMimeType
                        ))
                        logger.info(f"Added image document: {filename} (section: {sectionId}, {len(imageBytes)} bytes, alt: {altText})")
                    else:
                        logger.warning(f"Image {idx + 1} (section: {sectionId}) has no base64Data, skipping")
                except Exception as e:
                    logger.error(f"Error adding image document {idx + 1}: {str(e)}", exc_info=True)
                    continue
        else:
            logger.debug("No images returned from renderer")

        # Note: Document creation is handled by the workflow system.
        # We just return the rendered content and images in the ActionResult.
        if progressLogging:
            self.services.chat.progressLogFinish(operationId, True)
        return ActionResult.isSuccess(documents=documents)
    except Exception as e:
        logger.error(f"Error in hierarchical document generation: {str(e)}")
        if progressLogging:
            self.services.chat.progressLogFinish(operationId, False)
        return ActionResult.isFailure(error=str(e))