gateway/tests/functional/test10_document_generation_formats.py
2026-01-22 17:00:29 +01:00

555 lines
23 KiB
Python

#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Document Generation Formats Test 10 - Tests document generation in DOCX, XLSX, PPTX, PDF, and HTML formats.
Tests professional document formats with various content types, including tables, images, and structured data.
"""
import asyncio
import json
import sys
import os
import time
import base64
from typing import Dict, Any, List, Optional
# Make the gateway root importable: it lies two directory levels above this
# file (tests/functional/ -> tests/ -> gateway root). The path is only
# prepended when it is not already present, so repeated imports are harmless.
_gateway_path = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)
if _gateway_path not in sys.path:
    sys.path.insert(0, _gateway_path)
# Import the service initialization
from modules.services import getInterface as getServices
from modules.features.aichat.datamodelFeatureAiChat import UserInputRequest, WorkflowModeEnum
from modules.datamodels.datamodelUam import User
from modules.workflows.automation import chatStart
import modules.features.aichat.interfaceFeatureAiChat as interfaceFeatureAiChat
class DocumentGenerationFormatsTester10:
def __init__(self):
    """Bind the tester to the root user and create empty result holders.

    The user comes from the application's root DB interface; the service
    container is then built for that user with no workflow attached.
    """
    # Function-scope import, kept local exactly as in the original code.
    from modules.interfaces.interfaceDbApp import getRootInterface

    # Use root user for testing (has full access to everything)
    self.testUser = getRootInterface().currentUser

    # Second argument is the workflow; none exists at construction time.
    self.services = getServices(self.testUser, None)

    self.workflow = None
    self.testResults = {}
    self.generatedDocuments = {}
    # File ID of the uploaded sample PDF, kept so every format run can reuse it.
    self.pdfFileId = None
async def initialize(self):
    """Prepare the test run.

    Switches on the chat-workflow debug flag in ``APP_CONFIG``, sets the root
    logger to INFO so workflow progress is visible, prints the test user's
    identifiers and finally uploads the sample PDF.
    """
    import logging
    from modules.shared.configuration import APP_CONFIG

    debugFlag = "APP_DEBUG_CHAT_WORKFLOW_ENABLED"

    # Enable debug file logging for tests
    APP_CONFIG.set(debugFlag, True)

    # INFO level is enough to follow workflow progress
    logging.getLogger().setLevel(logging.INFO)

    user = self.testUser
    print(f"Initialized test with user: {user.id}")
    print(f"Mandate ID: {user.mandateId}")
    # Read the flag back from the config rather than echoing the constant.
    print(f"Debug logging enabled: {APP_CONFIG.get(debugFlag, False)}")

    # Upload PDF file for testing
    await self.uploadPdfFile()
async def uploadPdfFile(self):
    """Upload the sample PDF and store its file ID in ``self.pdfFileId``.

    The upload is best-effort: a missing file, a missing
    ``interfaceDbComponent`` service or any exception raised while uploading
    only prints a warning, and ``self.pdfFileId`` is left untouched.
    """
    skipNotice = " Test will continue without PDF attachment"

    # The sample lives in <three levels up>/local/temp relative to this file.
    here = os.path.dirname(__file__)
    pdfPath = os.path.abspath(
        os.path.join(here, "..", "..", "..", "local", "temp", "B2025-02c.pdf")
    )

    if not os.path.exists(pdfPath):
        print(f"⚠️ Warning: PDF file not found at {pdfPath}")
        print(skipNotice)
        return

    try:
        with open(pdfPath, "rb") as pdfFile:
            payload = pdfFile.read()

        # Covers both "attribute missing" and "attribute present but falsy".
        dbComponent = getattr(self.services, 'interfaceDbComponent', None)
        if not dbComponent:
            print("⚠️ Warning: interfaceDbComponent not available in services")
            print(skipNotice)
            return

        created = dbComponent.createFile(
            name="B2025-02c.pdf",
            mimeType="application/pdf",
            content=payload,
        )
        # The file record and its binary data are stored in two separate calls.
        dbComponent.createFileData(created.id, payload)

        self.pdfFileId = created.id
        print(f"✅ Uploaded PDF file: {created.fileName} (ID: {self.pdfFileId}, Size: {len(payload)} bytes)")
    except Exception as e:
        import traceback
        print(f"⚠️ Warning: Failed to upload PDF file: {str(e)}")
        print(f" Traceback: {traceback.format_exc()}")
        print(skipNotice)
def createTestPrompt(self, format: str) -> str:
    """Return the document-generation prompt for ``format``.

    The lookup is case-insensitive. Prompts exist for docx, xlsx, pptx, pdf
    and html; any other value yields the DOCX prompt. Each prompt asks for a
    structured document built from the attached PDF (B2025-02c.pdf).
    """
    # Every prompt is a list of lines; an empty entry produces the blank line
    # that separates the numbered requirements from the closing instruction.
    asText = "\n".join

    prompts = {
        "docx": asText([
            "Create a professional Word document about 'Fuel Station Receipt Analysis' with:",
            "1) A main title",
            "2) An executive summary paragraph",
            "3) Extract and include the image from the attached PDF document (B2025-02c.pdf)",
            "4) A detailed analysis section with:",
            " - Bullet points of key findings",
            " - A table summarizing transaction details",
            "5) A conclusion section with recommendations",
            "",
            "Format as a professional DOCX document with proper headings and structure.",
        ]),
        "xlsx": asText([
            "Create an Excel spreadsheet analyzing the fuel station receipt from the attached PDF (B2025-02c.pdf).",
            "Include:",
            "1) A summary sheet with key metrics",
            "2) A detailed data sheet with:",
            " - Transaction details in rows",
            " - Columns for: Date, Item, Quantity, Price, Total",
            " - Proper formatting and headers",
            "3) A calculations sheet with:",
            " - VAT calculations",
            " - Net and gross totals",
            "",
            "Format as a professional XLSX spreadsheet with formulas and formatting.",
        ]),
        "pptx": asText([
            "Create a PowerPoint presentation about 'Fuel Station Receipt Analysis' with:",
            "1) Title slide with main title",
            "2) Overview slide explaining the receipt analysis",
            "3) Extract and include the image from the attached PDF document (B2025-02c.pdf)",
            "4) Analysis slides with:",
            " - Bullet points of key findings",
            " - Visual representation of data",
            "5) Conclusion slide with recommendations",
            "",
            "Format as a professional PPTX presentation with consistent styling.",
        ]),
        "pdf": asText([
            "Create a professional PDF document about 'Fuel Station Receipt Analysis' with:",
            "1) A main title",
            "2) An introduction paragraph explaining the receipt analysis",
            "3) Extract and include the image from the attached PDF document (B2025-02c.pdf)",
            "4) A section analyzing the receipt data with:",
            " - Bullet points of key findings",
            " - A table summarizing transaction details",
            "5) A conclusion paragraph with recommendations",
            "",
            "Format as a professional PDF document suitable for printing.",
        ]),
        "html": asText([
            "Create a professional HTML document about 'Fuel Station Receipt Analysis' with:",
            "1) A main title",
            "2) An introduction paragraph explaining the receipt analysis",
            "3) Extract and include the image from the attached PDF document (B2025-02c.pdf)",
            "4) A section analyzing the receipt data with:",
            " - Bullet points of key findings",
            " - A table summarizing transaction details",
            "5) A conclusion paragraph with recommendations",
            "",
            "Format as a professional HTML document with proper styling, responsive design, and embedded CSS.",
        ]),
    }

    key = format.lower()
    if key not in prompts:
        # Unknown formats fall back to the Word prompt.
        key = "docx"
    return prompts[key]
async def generateDocumentInFormat(self, format: str) -> Dict[str, Any]:
    """Generate a document in the specified format using a chat workflow.

    Builds the prompt for ``format``, attaches the uploaded PDF when one is
    available, starts a dynamic workflow and waits (without timeout) for it
    to finish, then collects the documents whose file name matches the format.

    Args:
        format: Target format name, e.g. "docx", "xlsx", "pptx", "pdf", "html".

    Returns:
        A dict with ``success``. On success it also carries ``format``,
        ``workflowId``, ``status``, ``documentCount``, ``documents`` and the
        full analysis under ``results``. On failure it carries ``error`` and,
        once a workflow exists, ``workflowId`` (plus ``status`` when the
        workflow did not complete).
    """
    print("\n" + "="*80)
    print(f"GENERATING DOCUMENT IN {format.upper()} FORMAT")
    print("="*80)

    prompt = self.createTestPrompt(format)
    print(f"Prompt: {prompt[:200]}...")

    # Attach the sample PDF when the earlier upload succeeded.
    listFileId = []
    if self.pdfFileId:
        listFileId = [self.pdfFileId]
        print(f"Attaching PDF file (ID: {self.pdfFileId})")
    else:
        print("⚠️ No PDF file attached (file upload may have failed)")

    userInput = UserInputRequest(
        prompt=prompt,
        listFileId=listFileId,
        userLanguage="en"
    )

    print(f"\nStarting workflow for {format.upper()} generation...")
    workflow = await chatStart(
        currentUser=self.testUser,
        userInput=userInput,
        workflowMode=WorkflowModeEnum.WORKFLOW_DYNAMIC,
        workflowId=None
    )

    if not workflow:
        return {
            "success": False,
            "error": "Failed to start workflow"
        }

    self.workflow = workflow
    print(f"Workflow started: {workflow.id}")

    # Wait for workflow completion (no timeout - wait indefinitely)
    print(f"Waiting for workflow completion...")
    completed = await self.waitForWorkflowCompletion(timeout=None)

    if not completed:
        # waitForWorkflowCompletion swaps self.workflow for the refreshed
        # record once a terminal status is seen, so read the status from
        # there; the local ``workflow`` still holds the start-up snapshot.
        latest = self.workflow or workflow
        return {
            "success": False,
            "error": "Workflow did not complete",
            "workflowId": workflow.id,
            "status": latest.status
        }

    results = self.analyzeWorkflowResults()

    # analyzeWorkflowResults signals its own failures through an "error" key;
    # do not report such a run as a successful generation.
    if results.get("error"):
        return {
            "success": False,
            "error": results["error"],
            "workflowId": workflow.id
        }

    # Match file names case-insensitively and accept the same extensions that
    # verifyDocumentFormat accepts (".htm" as well as ".html" for HTML).
    formatKey = format.lower()
    suffixes = (".html", ".htm") if formatKey == "html" else (f".{formatKey}",)
    documents = results.get("documents", [])
    formatDocuments = [
        d for d in documents
        # ``or ""`` guards against a fileName that is present but None.
        if (d.get("fileName") or "").lower().endswith(suffixes)
    ]

    return {
        "success": True,
        "format": format,
        "workflowId": workflow.id,
        "status": results.get("status"),
        "documentCount": len(formatDocuments),
        "documents": formatDocuments,
        "results": results
    }
async def waitForWorkflowCompletion(self, timeout: Optional[int] = None, checkInterval: int = 2) -> bool:
    """Poll the current workflow until it reaches a terminal status.

    Args:
        timeout: Maximum number of seconds to wait; ``None`` waits indefinitely.
        checkInterval: Seconds to sleep between two status checks.

    Returns:
        ``True`` only when the workflow ends with status "completed".
        ``False`` when there is no workflow, the workflow cannot be found,
        the timeout expires, or it ends as "stopped" or "failed".
    """
    if not self.workflow:
        return False

    startTime = time.time()
    lastStatus = None

    # Fix: the original assigned ``interfaceDbChat = interfaceDbChat.getInterface(...)``.
    # That makes the name local to this function, so reading it on the right-hand
    # side raised UnboundLocalError before the first poll.
    # NOTE(review): assumes the module imported at file level as
    # ``interfaceFeatureAiChat`` is the intended provider of getInterface() - confirm.
    chatInterface = interfaceFeatureAiChat.getInterface(self.testUser)

    if timeout is None:
        print("Waiting indefinitely (no timeout)")

    terminalStates = ("completed", "stopped", "failed")

    while True:
        # Check timeout only if specified
        if timeout is not None and time.time() - startTime > timeout:
            print(f"\n⏱️ Timeout after {timeout} seconds")
            return False

        try:
            currentWorkflow = chatInterface.getWorkflow(self.workflow.id)
            if not currentWorkflow:
                print("\n❌ Workflow not found")
                return False

            currentStatus = currentWorkflow.status
            elapsed = int(time.time() - startTime)

            # Only report the status when it changes, to keep the output short.
            if currentStatus != lastStatus:
                print(f"Workflow status: {currentStatus} (elapsed: {elapsed}s)")
                lastStatus = currentStatus

            if currentStatus in terminalStates:
                # Keep the refreshed record so callers see the final state.
                self.workflow = currentWorkflow
                # NOTE(review): both icon branches are empty strings in this file.
                statusIcon = "" if currentStatus == "completed" else ""
                print(f"\n{statusIcon} Workflow finished with status: {currentStatus} (elapsed: {elapsed}s)")
                return currentStatus == "completed"

            await asyncio.sleep(checkInterval)
        except Exception as e:
            # A failed status check is reported and retried after the same interval.
            print(f"\n⚠️ Error checking workflow status: {str(e)}")
            await asyncio.sleep(checkInterval)
def analyzeWorkflowResults(self) -> Dict[str, Any]:
    """Reload the current workflow and summarize its chat data.

    Returns:
        ``{"error": ...}`` when there is no workflow or it cannot be found.
        Otherwise a dict with the workflow's status and counters, the counts
        of messages, documents and logs, and the raw ``documents`` and
        ``logs`` lists. The summary is also printed.
    """
    if not self.workflow:
        return {"error": "No workflow to analyze"}

    # Fix: the original assigned ``interfaceDbChat = interfaceDbChat.getInterface(...)``.
    # That makes the name local to this function, so the call always raised
    # UnboundLocalError.
    # NOTE(review): assumes the module imported at file level as
    # ``interfaceFeatureAiChat`` is the intended provider of getInterface() - confirm.
    chatInterface = interfaceFeatureAiChat.getInterface(self.testUser)
    workflow = chatInterface.getWorkflow(self.workflow.id)

    if not workflow:
        return {"error": "Workflow not found"}

    # Get unified chat data
    chatData = chatInterface.getUnifiedChatData(workflow.id, None)

    messages = chatData.get("messages", [])
    userMessages = [m for m in messages if m.get("role") == "user"]
    assistantMessages = [m for m in messages if m.get("role") == "assistant"]
    documents = chatData.get("documents", [])
    logs = chatData.get("logs", [])

    results = {
        "workflowId": workflow.id,
        "status": workflow.status,
        "workflowMode": str(workflow.workflowMode) if hasattr(workflow, 'workflowMode') else None,
        "currentRound": workflow.currentRound,
        "totalTasks": workflow.totalTasks,
        "totalActions": workflow.totalActions,
        "messageCount": len(messages),
        "userMessageCount": len(userMessages),
        "assistantMessageCount": len(assistantMessages),
        "documentCount": len(documents),
        "logCount": len(logs),
        "documents": documents,
        "logs": logs
    }

    print(f"\nWorkflow Results:")
    print(f" Status: {results['status']}")
    print(f" Tasks: {results['totalTasks']}")
    print(f" Actions: {results['totalActions']}")
    print(f" Messages: {results['messageCount']}")
    print(f" Documents: {results['documentCount']}")

    # Print document details
    if documents:
        print(f"\nGenerated Documents:")
        for doc in documents:
            fileName = doc.get("fileName", "unknown")
            fileSize = doc.get("fileSize", 0)
            mimeType = doc.get("mimeType", "unknown")
            documentType = doc.get("documentType", "N/A")
            print(f" - {fileName} ({fileSize} bytes, {mimeType}, type: {documentType})")

    return results
def verifyDocumentFormat(self, document: Dict[str, Any], expectedFormat: str) -> Dict[str, Any]:
    """Check a generated document against the expected format.

    Args:
        document: Document record; the keys read are ``fileName``,
            ``mimeType``, ``fileSize``, ``documentType`` and ``metadata``.
        expectedFormat: Format name (case-insensitive): pdf, docx, xlsx,
            pptx or html. Any other value makes the extension and MIME
            checks fail.

    Returns:
        A dict echoing the inspected values together with the individual
        check results. ``isValid`` requires a correct extension, a positive
        size and a correct MIME type. ``isComplete`` additionally requires
        a document type and a metadata dict.

    Missing or ``None`` values simply fail their check instead of raising.
    MIME types are compared without parameters, so
    ``text/html; charset=utf-8`` counts as ``text/html``.
    """
    fileName = document.get("fileName", "")
    mimeType = document.get("mimeType", "")
    fileSize = document.get("fileSize", 0)
    documentType = document.get("documentType")
    metadata = document.get("metadata")

    # Expected MIME types (all lowercase; input is lowercased before comparing)
    expectedMimeTypes = {
        "pdf": ("application/pdf",),
        "docx": ("application/vnd.openxmlformats-officedocument.wordprocessingml.document",),
        "xlsx": ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",),
        "pptx": ("application/vnd.openxmlformats-officedocument.presentationml.presentation",),
        "html": ("text/html", "application/xhtml+xml"),
    }

    # Expected file extensions
    expectedExtensions = {
        "pdf": (".pdf",),
        "docx": (".docx",),
        "xlsx": (".xlsx",),
        "pptx": (".pptx",),
        "html": (".html", ".htm"),
    }

    formatLower = expectedFormat.lower()
    expectedMimes = expectedMimeTypes.get(formatLower, ())
    expectedExts = expectedExtensions.get(formatLower, ())

    # A key that is present but None (or otherwise not a string) must fail
    # the check rather than raise AttributeError on .lower().
    normalizedName = fileName.lower() if isinstance(fileName, str) else ""
    # str.endswith with an empty tuple is False, which covers unknown formats.
    hasCorrectExtension = normalizedName.endswith(expectedExts)

    # Drop MIME parameters such as "; charset=utf-8" before comparing.
    if isinstance(mimeType, str):
        mimeEssence = mimeType.split(";", 1)[0].strip().lower()
    else:
        mimeEssence = ""
    hasCorrectMimeType = mimeEssence in expectedMimes

    # Size must be a positive number; None or other types count as invalid.
    hasValidSize = isinstance(fileSize, (int, float)) and fileSize > 0

    hasDocumentType = documentType is not None
    hasMetadata = isinstance(metadata, dict)

    isValid = hasCorrectExtension and hasValidSize and hasCorrectMimeType

    return {
        "format": expectedFormat,
        "fileName": fileName,
        "mimeType": mimeType,
        "fileSize": fileSize,
        "documentType": documentType,
        "hasMetadata": hasMetadata,
        "hasCorrectExtension": hasCorrectExtension,
        "hasCorrectMimeType": hasCorrectMimeType,
        "hasValidSize": hasValidSize,
        "hasDocumentType": hasDocumentType,
        "isValid": isValid,
        "isComplete": isValid and hasDocumentType and hasMetadata
    }
async def testAllFormats(self) -> Dict[str, Any]:
"""Test document generation in DOCX, XLSX, PPTX, PDF, and HTML formats."""
print("\n" + "="*80)
print("TESTING DOCUMENT GENERATION IN ALL FORMATS")
print("="*80)
# Test all document formats
formats = ["docx", "xlsx", "pptx", "pdf", "html"]
results = {}
for format in formats:
try:
print(f"\n{'='*80}")
print(f"Testing {format.upper()} format...")
print(f"{'='*80}")
result = await self.generateDocumentInFormat(format)
results[format] = result
if result.get("success"):
documents = result.get("documents", [])
if documents:
# Verify first document
verification = self.verifyDocumentFormat(documents[0], format)
result["verification"] = verification
print(f"\n{format.upper()} generation successful!")
print(f" Documents: {len(documents)}")
print(f" Verification: {'✅ PASS' if verification['isValid'] else '❌ FAIL'}")
print(f" Complete (with metadata): {'✅ YES' if verification['isComplete'] else '❌ NO'}")
if verification.get("fileName"):
print(f" File: {verification['fileName']}")
print(f" Size: {verification['fileSize']} bytes")
print(f" MIME: {verification['mimeType']}")
print(f" Document Type: {verification.get('documentType', 'N/A')}")
print(f" Has Metadata: {'' if verification.get('hasMetadata') else ''}")
else:
print(f"\n⚠️ {format.upper()} generation completed but no documents found")
else:
error = result.get("error", "Unknown error")
print(f"\n{format.upper()} generation failed: {error}")
# Small delay between tests
await asyncio.sleep(2)
except Exception as e:
import traceback
print(f"\n❌ Error testing {format.upper()}: {str(e)}")
print(traceback.format_exc())
results[format] = {
"success": False,
"error": str(e),
"traceback": traceback.format_exc()
}
return results
async def runTest(self):
    """Run the complete test and return the aggregated result.

    Initializes the environment, runs every format test, prints a summary
    and stores the outcome in ``self.testResults``. Any exception is caught
    and reported as ``{"success": False, "error": ..., "traceback": ...}``.

    Returns:
        The dict that was stored in ``self.testResults``.
    """
    import traceback

    rule = "=" * 80
    print("\n" + rule)
    print("DOCUMENT GENERATION FORMATS TEST 10 - ALL FORMATS")
    print(rule)

    try:
        await self.initialize()
        formatResults = await self.testAllFormats()

        print("\n" + rule)
        print("TEST SUMMARY")
        print(rule)
        print("\nFormat Tests:")

        # completeCount tracks runs whose document also carried metadata.
        successCount = failCount = completeCount = 0
        for fmt, run in formatResults.items():
            label = fmt.upper()

            if not run.get("success"):
                failCount += 1
                error = run.get("error", "Unknown error")
                print(f"{label:6s}: FAIL - {error}")
                continue

            successCount += 1
            # A successful run without documents has no verification entry,
            # so both flags default to False.
            check = run.get("verification", {})
            isValid = check.get("isValid", False)
            isComplete = check.get("isComplete", False)
            if isComplete:
                completeCount += 1

            statusIcon = "" if isValid else "⚠️"
            completeIcon = "" if isComplete else ""
            docCount = run.get("documentCount", 0)
            print(f"{statusIcon} {label:6s}: {'PASS' if isValid else 'FAIL'} - {docCount} document(s) - Metadata: {completeIcon}")

        totalFormats = len(formatResults)
        print(f"\nFormat Tests: {successCount} passed, {failCount} failed out of {totalFormats} formats")
        print(f"Complete Documents (with metadata): {completeCount} out of {successCount} successful generations")

        self.testResults = {
            "success": failCount == 0,
            "formatTests": {
                "successCount": successCount,
                "failCount": failCount,
                "completeCount": completeCount,
                "totalFormats": totalFormats,
                "results": formatResults
            },
            "totalSuccess": successCount,
            "totalFail": failCount
        }
    except Exception as e:
        trace = traceback.format_exc()
        print(f"\n❌ Test failed with error: {type(e).__name__}: {str(e)}")
        print(f"Traceback:\n{trace}")
        self.testResults = {
            "success": False,
            "error": str(e),
            "traceback": trace
        }

    return self.testResults
async def main():
    """Run document generation formats test 10 and print the result as JSON."""
    outcome = await DocumentGenerationFormatsTester10().runTest()

    # Print final results as JSON for easy parsing; default=str turns any
    # value json cannot serialize into its string form.
    rule = "=" * 80
    print("\n" + rule)
    print("FINAL RESULTS (JSON)")
    print(rule)
    print(json.dumps(outcome, indent=2, default=str))


if __name__ == "__main__":
    asyncio.run(main())