gateway/test_document_extraction.py
2025-07-10 16:13:05 +02:00

288 lines
No EOL
10 KiB
Python

#!/usr/bin/env python3
"""
Test procedure for DocumentManager document extraction functionality.
"""
import asyncio
import sys
import os
import json
import argparse
from datetime import datetime, UTC
from pathlib import Path
import logging
print("Starting test_document_extraction.py...")
# Configure logging FIRST, before the project modules further down are
# imported, so that the handlers below are already in place by then.
# NOTE: `logging` is already imported at the top of the file; this second
# import is a no-op kept from the original.
import logging
# Clear any existing handlers to avoid duplicate logs. Iterate over a copy
# ([:]) because removeHandler() mutates the list being walked.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
# Root logger: DEBUG and above, mirrored to stdout and to a log file.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('test_document_extraction.log', mode='w', encoding='utf-8') # 'w' mode clears the file
    ],
    force=True # Force reconfiguration even if already configured
)
# Filter out httpcore/httpx messages below WARNING
logging.getLogger('httpcore').setLevel(logging.WARNING)
logging.getLogger('httpx').setLevel(logging.WARNING)
# Module-level logger used by every function in this script
logger = logging.getLogger(__name__)
# Set up test configuration. The variable is set before the project imports
# below -- presumably those modules read it at import time (TODO confirm).
os.environ['POWERON_CONFIG_FILE'] = 'test_config.ini'
print("Set POWERON_CONFIG_FILE environment variable")
try:
    # Import required project modules; the script cannot run without them
    from modules.interfaces.interfaceAppObjects import User, UserConnection
    from modules.interfaces.interfaceChatModel import ChatWorkflow
    from modules.workflow.managerDocument import DocumentManager
    from modules.workflow.serviceContainer import ServiceContainer
    print("All imports successful")
except Exception as e:
    # Any failure while importing: print the traceback and exit with status 1
    print(f"Import error: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
def log_extraction_debug(message: str, data: dict | None = None) -> None:
    """Log a debug message, optionally followed by a JSON dump of *data*.

    Args:
        message: Text to log. It is prefixed with the current UTC time in
            ISO 8601 format.
        data: Optional mapping that is pretty-printed as JSON on the lines
            following the message. ``None`` or an empty dict logs the
            message on its own.
    """
    timestamp = datetime.now(UTC).isoformat()
    if not data:
        logger.debug(f"[{timestamp}] {message}")
        return
    # default=str is deliberate: this helper is also called from exception
    # handlers with arbitrary values (e.g. exception args). A value that is
    # not JSON-serializable must not raise here and hide the real error, so
    # such values are logged via their str() form instead.
    payload = json.dumps(data, indent=2, ensure_ascii=False, default=str)
    logger.debug(f"[{timestamp}] {message}\n{payload}")
def create_test_user() -> User:
    """Build the fixed User instance under which the extraction test runs.

    Returns:
        An enabled, English-language ``User`` with privilege ``"user"`` and
        authentication authority ``"local"``, using hard-coded test
        identifiers.
    """
    user_fields = {
        "id": "test-user-doc-001",
        "mandateId": "test-mandate-doc-001",
        "username": "testuser_doc",
        "email": "test_doc@example.com",
        "fullName": "Test Document User",
        "enabled": True,
        "language": "en",
        "privilege": "user",
        "authenticationAuthority": "local",
    }
    return User(**user_fields)
def create_test_workflow() -> ChatWorkflow:
    """Build the fixed ChatWorkflow instance used by the extraction test.

    Returns:
        A ``ChatWorkflow`` in status ``"running"`` at round 1, with empty
        logs, messages and tasks, no stats, and ``startedAt`` /
        ``lastActivity`` both set to the current UTC time.
    """
    # Capture the time once. With two separate now() calls the keyword
    # arguments are evaluated in order, so lastActivity ended up slightly
    # *earlier* than startedAt.
    now_iso = datetime.now(UTC).isoformat()
    return ChatWorkflow(
        id="test-workflow-doc-001",
        mandateId="test-mandate-doc-001",
        status="running",
        name="Document Extraction Test Workflow",
        currentRound=1,
        lastActivity=now_iso,
        startedAt=now_iso,
        logs=[],
        messages=[],
        stats=None,
        tasks=[],
    )
# Lower-case file extension (including the leading dot) -> MIME type.
# Defined once at module level so the table is not rebuilt on every call.
_MIME_TYPES_BY_EXTENSION = {
    '.txt': 'text/plain',
    '.md': 'text/markdown',
    '.csv': 'text/csv',
    '.json': 'application/json',
    '.xml': 'application/xml',
    '.js': 'application/javascript',
    '.py': 'application/x-python',
    '.svg': 'image/svg+xml',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.pdf': 'application/pdf',
    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    '.doc': 'application/msword',
    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    '.xls': 'application/vnd.ms-excel',
    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    '.ppt': 'application/vnd.ms-powerpoint',
    '.html': 'text/html',
    '.htm': 'text/html',
}


def detect_mime_type(file_path: str, *, default: str = 'application/octet-stream') -> str:
    """Return the MIME type implied by the extension of *file_path*.

    Only the final suffix of the path is examined, case-insensitively; the
    file itself is never opened.

    Args:
        file_path: Path or bare file name to classify.
        default: Value returned when the extension is missing or not in the
            lookup table. Defaults to ``'application/octet-stream'``.

    Returns:
        The MIME type string for the extension, or *default*.
    """
    ext = Path(file_path).suffix.lower()
    return _MIME_TYPES_BY_EXTENSION.get(ext, default)
async def test_document_extraction(file_path: str):
    """Run one document extraction against *file_path* and log every step.

    The file is read as raw bytes, a test user, workflow and service
    container are created, and ``DocumentManager.extractContentFromFileData``
    is awaited with a fixed table-to-CSV prompt. File details, each
    extracted content item and the combined content are written to the
    debug log.

    Args:
        file_path: Path of the file to extract content from.

    Returns:
        The object returned by ``extractContentFromFileData``.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
        Exception: Any error raised while reading the file or extracting
            content is logged and re-raised unchanged.
    """
    try:
        # The log file is deliberately NOT truncated here. The FileHandler
        # set up at module level already opens test_document_extraction.log
        # with mode='w', which empties it at start-up. Re-opening the file
        # with 'w' while that handler still holds it open drops the records
        # written so far and makes the handler's next write land at its old
        # offset, leaving a run of NUL bytes at the top of the log.
        logger.info("=== STARTING DOCUMENT EXTRACTION TEST ===")

        # Validate file path
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Get file info
        file_path_obj = Path(file_path)
        filename = file_path_obj.name
        mime_type = detect_mime_type(file_path)
        file_size = file_path_obj.stat().st_size
        log_extraction_debug("File information", {
            "file_path": file_path,
            "filename": filename,
            "mime_type": mime_type,
            "file_size_bytes": file_size,
            "file_size_mb": round(file_size / (1024 * 1024), 2)
        })

        # Read file data
        try:
            with open(file_path, 'rb') as f:
                file_data = f.read()
            log_extraction_debug("File read successfully", {
                "bytes_read": len(file_data),
                "file_encoding": "binary"
            })
        except Exception as e:
            logger.error(f"Error reading file: {str(e)}")
            raise

        # Create test user and workflow
        test_user = create_test_user()
        test_workflow = create_test_workflow()

        # Create service container
        service_container = ServiceContainer(test_user, test_workflow)
        log_extraction_debug("Service container created", {
            "user_id": test_user.id,
            "workflow_id": test_workflow.id
        })

        # Create document manager
        document_manager = DocumentManager(service_container)
        log_extraction_debug("Document manager created")

        # Define extraction prompt
        extraction_prompt = "extract the table and convert it to a csv table"
        log_extraction_debug("Starting document extraction", {
            "prompt": extraction_prompt,
            "filename": filename,
            "mime_type": mime_type
        })

        # Extract content from file data
        try:
            extracted_content = await document_manager.extractContentFromFileData(
                prompt=extraction_prompt,
                fileData=file_data,
                filename=filename,
                mimeType=mime_type,
                base64Encoded=False,
                documentId=f"test-doc-{datetime.now(UTC).timestamp()}"
            )

            # Log extraction results
            extraction_result = {
                "extracted_content_id": extracted_content.id,
                "content_items_count": len(extracted_content.contents)
            }
            # Add objectId and objectType if they exist (set by DocumentManager)
            if hasattr(extracted_content, 'objectId'):
                extraction_result["object_id"] = extracted_content.objectId
            if hasattr(extracted_content, 'objectType'):
                extraction_result["object_type"] = extracted_content.objectType
            log_extraction_debug("Document extraction completed successfully", extraction_result)

            # Log detailed content information
            for i, content_item in enumerate(extracted_content.contents):
                item_data = content_item.data
                # Keep the log readable: show at most the first 500 characters
                if item_data and len(item_data) > 500:
                    data_preview = item_data[:500] + "..."
                else:
                    data_preview = item_data
                content_info = {
                    "label": content_item.label,
                    "data_length": len(item_data) if item_data else 0,
                    "data_preview": data_preview
                }
                # Add metadata if available
                if content_item.metadata:
                    content_info["metadata"] = {
                        "size": content_item.metadata.size,
                        "mime_type": content_item.metadata.mimeType,
                        "base64_encoded": content_item.metadata.base64Encoded,
                        "pages": content_item.metadata.pages
                    }
                log_extraction_debug(f"CONTENT ITEM {i+1}:", content_info)

            # Log summary of all extracted content (items without data are skipped)
            all_content = "\n\n".join(item.data for item in extracted_content.contents if item.data)
            log_extraction_debug("COMPLETE EXTRACTED CONTENT:", {
                "total_length": len(all_content),
                "content": all_content
            })

            # Logged before returning: placed after this try/except it could
            # never run, because the try returns and the except re-raises.
            logger.info("=== DOCUMENT EXTRACTION TEST COMPLETED ===")
            return extracted_content
        except Exception as e:
            log_extraction_debug("DOCUMENT EXTRACTION EXCEPTION:", {
                "error_type": type(e).__name__,
                "error_message": str(e),
                "error_args": e.args
            })
            raise
    except Exception as e:
        logger.error(f"❌ Document extraction test failed with error: {str(e)}")
        log_extraction_debug("Full error details", {
            "error_type": type(e).__name__,
            "error_message": str(e)
        })
        raise
async def main():
    """Parse the command line, run the extraction test and log the outcome.

    Takes a single positional ``file_path`` argument from the command line
    and passes it to ``test_document_extraction``. A banner is logged at
    start, and a success or failure banner at the end.

    Returns:
        Whatever ``test_document_extraction`` returns.

    Raises:
        Exception: Any exception from the test is re-raised after the
            failure banner has been logged.
    """
    print("Inside main()")
    rule = "=" * 50
    for line in (rule, "DOCUMENT EXTRACTION TEST", rule):
        logger.info(line)

    # Parse command line arguments
    cli = argparse.ArgumentParser(description='Test document extraction functionality')
    cli.add_argument('file_path', help='Path to the file to extract content from')
    target_path = cli.parse_args().file_path

    try:
        result = await test_document_extraction(target_path)
        for line in (rule, "TEST COMPLETED SUCCESSFULLY", rule):
            logger.info(line)
        return result
    except Exception:
        for line in (rule, "TEST FAILED", rule):
            logger.error(line)
        raise
# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    print("About to run main()")
    # Drive the main() coroutine to completion
    asyncio.run(main())
    # Only reached when main() returned without raising (main re-raises on failure)
    print("main() finished")