#!/usr/bin/env python3
|
|
"""
|
|
Test script for DocumentExtraction class.
|
|
Processes all files in the input folder (default: d:/temp/test-extraction) and stores extracted content in an "extracted" (AI) or "extracted-raw" (content-only) subfolder.
|
|
|
|
Features:
|
|
- Option to extract content WITH AI processing (default)
|
|
- Option to extract content WITHOUT AI processing (content-only mode)
|
|
- Supports all document types: text, images, PDFs, Office documents, etc.
|
|
- Detailed logging and progress tracking
|
|
- Separate output directories for AI vs content-only modes
|
|
|
|
Usage:
|
|
- Interactive mode: python test_documentExtraction.py
|
|
- Content-only mode: python test_documentExtraction.py --no-ai  (alias: --content-only)
|
|
- Specify custom input/output: python test_documentExtraction.py --input-dir /path/to/input --output-dir /path/to/output --no-ai
|
|
"""
|
|
|
|
import os
|
|
import asyncio
|
|
import logging
|
|
import sys
|
|
import argparse
|
|
from pathlib import Path
|
|
from typing import List, Optional
|
|
from datetime import datetime, UTC
|
|
|
|
# Configure logging for the whole test run (timestamp, level, message).
logging.basicConfig(
    level=logging.DEBUG,  # Changed from INFO to DEBUG
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used throughout this script.
# NOTE(review): main() later resets the root level based on --verbose,
# so this DEBUG default only applies before argument parsing.
logger = logging.getLogger(__name__)
|
|
|
|
# Filter out specific unwanted log messages
|
|
class LogFilter(logging.Filter):
    """Suppress noisy log records that would clutter the test output."""

    # Each tuple lists substrings that must ALL be present in the message
    # for the record to be dropped.
    _SUPPRESS = (
        ("Workflow", "not found for stats update"),
        ("HTTP Request:", "POST https://api.openai.com"),
        ("HTTP/1.1 200 OK",),
    )

    def filter(self, record):
        """Return False (drop the record) when the message matches a known noise pattern."""
        message = record.getMessage()
        return not any(
            all(fragment in message for fragment in fragments)
            for fragments in self._SUPPRESS
        )
|
|
|
|
# Apply the filter to the root logger AND to its handlers.
# BUG FIX: a filter attached only to the root logger is not consulted for
# records that arrive via propagation from child loggers (e.g. the HTTP
# client loggers this filter targets) — per the logging docs, only the
# handlers' filters run for propagated records. Attaching to the handlers
# makes the suppression actually take effect.
root_logger = logging.getLogger()
root_logger.addFilter(LogFilter())  # still filters records logged directly on root
for _handler in root_logger.handlers:
    _handler.addFilter(LogFilter())
|
|
|
|
def check_dependencies():
    """Check that all required third-party packages can be imported.

    Logs one status line per dependency and, when anything is missing,
    prints installation instructions.

    Returns:
        True when every dependency imported successfully, False otherwise.
    """
    import importlib

    # (importable module, display name for log lines, pip package name).
    # The original code had one copy-pasted try/except per package; this
    # table drives a single loop and produces the exact same log output.
    required = [
        ("bs4", "beautifulsoup4", "beautifulsoup4"),
        ("PyPDF2", "PyPDF2", "PyPDF2"),
        ("fitz", "PyMuPDF (fitz)", "PyMuPDF"),
        ("docx", "python-docx", "python-docx"),
        ("openpyxl", "openpyxl", "openpyxl"),
        ("pptx", "python-pptx", "python-pptx"),
        ("PIL.Image", "Pillow (PIL)", "Pillow"),
    ]

    missing_deps = []
    for module_name, display_name, pip_name in required:
        try:
            importlib.import_module(module_name)
            logger.info(f"✓ {display_name} is available")
        except ImportError:
            missing_deps.append(pip_name)
            logger.error(f"✗ {display_name} is missing")

    if missing_deps:
        logger.error("\n" + "="*60)
        logger.error("MISSING DEPENDENCIES DETECTED!")
        logger.error("="*60)
        logger.error("The following packages are required but not installed:")
        for dep in missing_deps:
            logger.error(f"  - {dep}")
        logger.error("\nTo install all dependencies, run:")
        logger.error("pip install -r requirements.txt")
        logger.error("\nOr install individual packages:")
        # The original if/elif chain emitted the identical line for every
        # package, so it collapses to a single loop.
        for dep in missing_deps:
            logger.error(f"  pip install {dep}")
        logger.error("="*60)
        return False

    logger.info("✓ All required dependencies are available!")
    return True
|
|
|
|
def check_module_imports():
    """Verify that the project modules needed by this script are importable.

    Returns:
        True when all project modules import cleanly, False otherwise.
    """
    try:
        # Make the gateway package root importable regardless of CWD.
        gateway_root = os.path.join(os.path.dirname(__file__), '..', '..', '..')
        sys.path.append(gateway_root)

        from modules.chat.documents.documentExtraction import DocumentExtraction
        from modules.chat.serviceCenter import ServiceCenter
        from modules.interfaces.interfaceAppModel import User, UserConnection
        from modules.interfaces.interfaceChatModel import ChatWorkflow, TaskItem

        logger.info("✓ All required modules imported successfully")
        return True
    except ImportError as e:
        logger.error(f"✗ Failed to import required modules: {e}")
        logger.error("Make sure you're running this script from the gateway directory")
        return False
    except Exception as e:
        logger.error(f"✗ Unexpected error importing modules: {e}")
        return False
|
|
|
|
def create_mock_service_center():
    """Build a ServiceCenter wired to a fully-populated test User and ChatWorkflow.

    Returns:
        A ServiceCenter instance, or None when construction fails.
    """
    try:
        from modules.chat.serviceCenter import ServiceCenter
        from modules.interfaces.interfaceAppModel import User, UserPrivilege, AuthAuthority
        from modules.interfaces.interfaceChatModel import ChatWorkflow, TaskItem, TaskStatus
        from modules.interfaces.interfaceChatModel import ChatLog, ChatMessage, ChatStat

        # A user record with every required field populated.
        user = User(
            id="test_user_001",
            username="testuser",
            email="test@example.com",
            fullName="Test User",
            language="en",
            enabled=True,
            privilege=UserPrivilege.USER,
            authenticationAuthority=AuthAuthority.LOCAL,
            mandateId="test_mandate_001"
        )

        # A minimal but complete workflow; timestamps are timezone-aware UTC.
        now = datetime.now(UTC).isoformat()
        workflow = ChatWorkflow(
            id="test_workflow_001",
            mandateId="test_mandate_001",
            status="active",
            name="Test Document Extraction Workflow",
            currentRound=1,
            lastActivity=now,
            startedAt=now,
            logs=[],
            messages=[],
            stats=None,
            tasks=[]
        )

        service_center = ServiceCenter(user, workflow)
        logger.info("✓ ServiceCenter created successfully with proper objects")
        return service_center
    except Exception as e:
        logger.error(f"✗ Failed to create ServiceCenter: {e}")
        return None
|
|
|
|
class DocumentExtractionTester:
|
|
"""Test class for DocumentExtraction functionality."""
|
|
|
|
def __init__(self, input_dir: str = "d:/temp/test-extraction", output_dir: Optional[str] = None, enable_ai: bool = True):
    """
    Initialize the tester.

    Args:
        input_dir: Directory containing files to process
        output_dir: Directory to store extracted content (auto-generated if None)
        enable_ai: Whether to enable AI processing (default: True)
    """
    self.input_dir = Path(input_dir)

    # Auto-generate output directory if not specified: AI output and raw
    # output go to separate subfolders so the two modes don't overwrite
    # each other's results.
    if output_dir is None:
        if enable_ai:
            self.output_dir = Path(input_dir) / "extracted"
        else:
            self.output_dir = Path(input_dir) / "extracted-raw"
    else:
        self.output_dir = Path(output_dir)

    self.extractor = None        # DocumentExtraction; created in initialize_extractor()
    self.service_center = None   # ServiceCenter; created in initialize_extractor()
    self.enable_ai = enable_ai

    if enable_ai:
        self.prompt = "Make a summary of each sentence for each page or chapter of the document"
    else:
        self.prompt = None  # No prompt needed for content-only extraction

    # Track per-file processing results for the final summary.
    self.processing_results = []

    # Ensure output directory exists
    logger.info(f"Creating output directory: {self.output_dir}")
    self.output_dir.mkdir(parents=True, exist_ok=True)

    # Verify directory was created
    if self.output_dir.exists():
        logger.info(f"✓ Output directory created/verified: {self.output_dir}")
        logger.info(f"Output directory absolute path: {self.output_dir.absolute()}")
    else:
        logger.error(f"✗ Failed to create output directory: {self.output_dir}")

    # Log configuration
    logger.info(f"Configuration: AI processing = {'ENABLED' if self.enable_ai else 'DISABLED'}")
    logger.info(f"Input directory: {self.input_dir}")
    logger.info(f"Output directory: {self.output_dir}")

    # Smoke-test that files can actually be written to the output dir
    # (extracted from the constructor for readability).
    self._verify_write_capability()

    # Supported file extensions for content extraction.
    # BUG FIX: the original set literal listed '.r' twice; harmless in a
    # set, but the duplicate is removed here.
    self.supported_extensions = {
        # Text and data files
        '.txt', '.csv', '.json', '.xml', '.html', '.htm', '.svg',
        '.md', '.markdown', '.rst', '.log', '.ini', '.cfg', '.conf',

        # Programming languages
        '.js', '.ts', '.jsx', '.tsx', '.py', '.java', '.c', '.cpp', '.cc', '.cxx',
        '.h', '.hpp', '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala',
        '.r', '.m', '.pl', '.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd',
        '.vbs', '.lua', '.sql', '.dart', '.elm', '.clj', '.hs', '.fs', '.ml',

        # Web technologies
        '.css', '.scss', '.sass', '.less', '.vue', '.svelte', '.astro',

        # Configuration and build files
        '.yaml', '.yml', '.toml', '.env', '.gitignore', '.dockerfile', '.dockerignore',
        '.makefile', '.cmake', '.gradle', '.maven', '.pom', '.sln', '.vcxproj',
        '.csproj', '.fsproj', '.vbproj', '.xcodeproj', '.pbxproj',

        # Documentation and markup
        '.tex', '.bib', '.adoc', '.asciidoc', '.wiki', '.creole',

        # Images
        '.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.ico',

        # Documents
        '.pdf', '.docx', '.xlsx', '.pptx', '.odt', '.ods', '.odp',

        # Legacy Office formats
        '.doc', '.xls', '.ppt',

        # Archives and binaries
        '.zip', '.tar', '.gz', '.7z', '.rar', '.exe', '.dll', '.so', '.dylib'
    }

def _verify_write_capability(self) -> None:
    """Write, read back, and delete a small probe file in the output directory.

    Purely diagnostic: failures are logged but never raised, matching the
    original best-effort behavior.
    """
    test_file = self.output_dir / "test_write_capability.txt"
    try:
        logger.info(f"Testing file write capability to: {test_file}")
        logger.info(f"Absolute path: {test_file.absolute()}")

        with open(test_file, 'w', encoding='utf-8') as f:
            f.write("Test file to verify write capability")

        if test_file.exists():
            actual_size = test_file.stat().st_size
            logger.info(f"✓ Basic file writing test passed: {test_file} (size: {actual_size} bytes)")

            # Test reading the file back
            with open(test_file, 'r', encoding='utf-8') as f:
                content = f.read()
            logger.info(f"✓ File read test passed: content length = {len(content)}")

            # Clean up test file
            test_file.unlink()
            logger.info("✓ Test file cleaned up")
        else:
            logger.error(f"✗ Basic file writing test failed: {test_file}")
    except Exception as e:
        logger.error(f"✗ Basic file writing test failed with error: {e}")
        import traceback
        traceback.print_exc()
|
|
|
|
def initialize_extractor(self):
    """Create the ServiceCenter and a DocumentExtraction bound to it.

    Returns:
        True on success, False when either object could not be created.
    """
    try:
        # The extractor requires a ServiceCenter, so build that first.
        self.service_center = create_mock_service_center()
        if not self.service_center:
            logger.error("Failed to create ServiceCenter!")
            return False

        from modules.chat.documents.documentExtraction import DocumentExtraction

        self.extractor = DocumentExtraction(self.service_center)
        logger.info("✓ DocumentExtraction initialized successfully with ServiceCenter")
        return True
    except Exception as e:
        logger.error(f"✗ Failed to initialize DocumentExtraction: {e}")
        return False
|
|
|
|
def get_files_to_process(self) -> List[Path]:
    """Return the supported files found in the input directory.

    If the input directory is missing it is created and seeded with a small
    UTF-8 text file so there is always something to process.

    Returns:
        List of Paths whose (lower-cased) extension is in self.supported_extensions.
    """
    if not self.input_dir.exists():
        logger.error(f"Input directory {self.input_dir} does not exist!")
        logger.info("Creating input directory and adding a test file...")
        self.input_dir.mkdir(parents=True, exist_ok=True)

        # Create a test file if none exist.
        # BUG FIX: encoding='utf-8' is required here — the content contains
        # non-ASCII characters ('äöüß') and writing with the platform default
        # encoding can raise UnicodeEncodeError on non-UTF-8 locales.
        test_file = self.input_dir / "test.txt"
        with open(test_file, 'w', encoding='utf-8') as f:
            f.write("This is a test file for document extraction.\nIt contains multiple lines.\nAnd some special characters: äöüß")
        logger.info(f"Created test file: {test_file}")

    files = []
    all_files = list(self.input_dir.iterdir())
    logger.info(f"All files in directory: {[f.name for f in all_files]}")

    for file_path in all_files:
        if file_path.is_file():
            logger.debug(f"Checking file: {file_path.name} (extension: {file_path.suffix})")
            if file_path.suffix.lower() in self.supported_extensions:
                files.append(file_path)
                logger.debug(f"Added file: {file_path.name}")
            else:
                logger.debug(f"Skipped file: {file_path.name} (unsupported extension)")

    logger.info(f"Found {len(files)} supported files to process")
    if files:
        logger.info(f"Files to process: {[f.name for f in files]}")
    return files
|
|
|
|
async def process_single_file(self, file_path: Path) -> bool:
    """
    Process a single file and extract its content.

    Reads the file as bytes, runs it through the extractor (with or
    without AI depending on self.enable_ai), writes each extracted
    content item to its own .txt file in self.output_dir, and appends a
    result dict to self.processing_results.

    Args:
        file_path: Path to the file to process

    Returns:
        True if successful, False otherwise
    """
    if not self.extractor:
        logger.error("DocumentExtraction not initialized!")
        return False

    try:
        logger.info(f"Processing file: {file_path.name}")

        # Read file data
        with open(file_path, 'rb') as f:
            file_data = f.read()

        logger.debug(f"File size: {len(file_data)} bytes")

        # Determine MIME type based on extension
        mime_type = self._get_mime_type(file_path.suffix)
        logger.debug(f"MIME type: {mime_type}")

        # Process the file with or without AI based on configuration
        extracted_content = await self.extractor.processFileData(
            fileData=file_data,
            fileName=file_path.name,
            mimeType=mime_type,
            base64Encoded=False,
            prompt=self.prompt,
            enableAI=self.enable_ai
        )

        logger.debug(f"Extracted {len(extracted_content.contents)} content items")

        # Debug: Show content details
        for i, content_item in enumerate(extracted_content.contents):
            logger.debug(f"Content item {i+1}: label='{content_item.label}', has_data={content_item.data is not None}, data_length={len(content_item.data) if content_item.data else 0}")

        # Special logging for JavaScript files: compare extracted size
        # against the original to detect truncation.
        if mime_type == "application/javascript":
            logger.debug(f"JavaScript file detected: {file_path.name}")
            logger.debug(f"Original file size: {len(file_data)} bytes")
            for i, content_item in enumerate(extracted_content.contents):
                if content_item.data:
                    content_size = len(content_item.data.encode('utf-8'))
                    logger.debug(f"JavaScript content item {i+1}: {content_size} bytes")
                    # Check if content was truncated
                    if content_size < len(file_data) * 0.9:  # If less than 90% of original
                        logger.warning(f"JavaScript content may be truncated: {content_size} bytes vs {len(file_data)} bytes original")

        # Track processing result (accumulated below, appended at the end).
        result = {
            'fileName': file_path.name,
            'status': 'OK',
            'content_items': 0,
            'output_files': [],
            'total_content_size': 0
        }

        # Save each content item as a separate file
        if extracted_content.contents:
            for i, content_item in enumerate(extracted_content.contents):
                if content_item.data:
                    # Size measured in UTF-8 bytes, matching how the file is written.
                    content_size = len(content_item.data.encode('utf-8'))
                    result['total_content_size'] += content_size
                    logger.debug(f"Content item {i+1}: {content_item.label}, size: {content_size} bytes")

                    # Generate fileName with new naming convention:
                    # "<stem> - <label> <n>.txt"
                    if len(extracted_content.contents) == 1:
                        # Single content item
                        output_fileName = f"{file_path.stem} - {content_item.label} 1.txt"
                    else:
                        # Multiple content items - add sequence number
                        output_fileName = f"{file_path.stem} - {content_item.label} {i+1}.txt"

                    output_file = self.output_dir / output_fileName

                    # Write only the raw extracted content
                    logger.debug(f"Attempting to write to: {output_file}")
                    try:
                        with open(output_file, 'w', encoding='utf-8') as f:
                            f.write(content_item.data)

                        # Verify file was created
                        if output_file.exists():
                            actual_size = output_file.stat().st_size
                            logger.info(f"✓ File created successfully: {output_fileName} (expected: {content_size} bytes, actual: {actual_size} bytes)")
                        else:
                            logger.error(f"✗ File was not created: {output_file}")

                        result['output_files'].append(output_fileName)
                        result['content_items'] += 1
                    except Exception as write_error:
                        # Per-item write failure: log and continue with the
                        # remaining content items.
                        logger.error(f"✗ Error writing file {output_fileName}: {write_error}")
                        import traceback
                        traceback.print_exc()
                else:
                    logger.warning(f"Content item {i+1} has no data, skipping")
        else:
            logger.warning(f"No content extracted from {file_path.name}")
            result['status'] = 'FAIL'
            result['error'] = 'No content extracted'

        # Add result to tracking list
        self.processing_results.append(result)

        # NOTE(review): this returns True (and logs "Successfully processed")
        # even when status was set to 'FAIL' above because no content was
        # extracted — confirm whether callers rely on that behavior.
        logger.info(f"Successfully processed {file_path.name} - Total content: {result['total_content_size']} bytes")
        return True

    except Exception as e:
        error_msg = str(e)
        logger.error(f"Error processing {file_path.name}: {error_msg}")

        # Track failed result
        result = {
            'fileName': file_path.name,
            'status': 'FAIL',
            'content_items': 0,
            'output_files': [],
            'error': error_msg,
            'total_content_size': 0
        }
        self.processing_results.append(result)

        return False
|
|
|
|
def _get_mime_type(self, extension: str) -> str:
|
|
"""Get MIME type based on file extension."""
|
|
mime_types = {
|
|
# Text and data files
|
|
'.txt': 'text/plain',
|
|
'.csv': 'text/csv',
|
|
'.json': 'application/json',
|
|
'.xml': 'application/xml',
|
|
'.html': 'text/html',
|
|
'.htm': 'text/html',
|
|
'.svg': 'image/svg+xml',
|
|
'.md': 'text/markdown',
|
|
'.markdown': 'text/markdown',
|
|
'.rst': 'text/x-rst',
|
|
'.log': 'text/plain',
|
|
'.ini': 'text/plain',
|
|
'.cfg': 'text/plain',
|
|
'.conf': 'text/plain',
|
|
|
|
# Programming languages
|
|
'.js': 'application/javascript',
|
|
'.ts': 'application/typescript',
|
|
'.jsx': 'text/jsx',
|
|
'.tsx': 'text/tsx',
|
|
'.py': 'text/x-python',
|
|
'.java': 'text/x-java-source',
|
|
'.c': 'text/x-c',
|
|
'.cpp': 'text/x-c++src',
|
|
'.cc': 'text/x-c++src',
|
|
'.cxx': 'text/x-c++src',
|
|
'.h': 'text/x-c',
|
|
'.hpp': 'text/x-c++hdr',
|
|
'.cs': 'text/x-csharp',
|
|
'.php': 'application/x-httpd-php',
|
|
'.rb': 'text/x-ruby',
|
|
'.go': 'text/x-go',
|
|
'.rs': 'text/x-rust',
|
|
'.swift': 'text/x-swift',
|
|
'.kt': 'text/x-kotlin',
|
|
'.scala': 'text/x-scala',
|
|
'.r': 'text/x-r',
|
|
'.m': 'text/x-matlab',
|
|
'.pl': 'text/x-perl',
|
|
'.sh': 'application/x-sh',
|
|
'.bash': 'application/x-sh',
|
|
'.zsh': 'application/x-sh',
|
|
'.fish': 'application/x-sh',
|
|
'.ps1': 'application/x-powershell',
|
|
'.bat': 'application/x-msdos-program',
|
|
'.cmd': 'application/x-msdos-program',
|
|
'.vbs': 'text/vbscript',
|
|
'.lua': 'text/x-lua',
|
|
'.sql': 'application/sql',
|
|
'.dart': 'application/dart',
|
|
'.elm': 'text/x-elm',
|
|
'.clj': 'text/x-clojure',
|
|
'.hs': 'text/x-haskell',
|
|
'.fs': 'text/x-fsharp',
|
|
'.ml': 'text/x-ocaml',
|
|
|
|
# Web technologies
|
|
'.css': 'text/css',
|
|
'.scss': 'text/x-scss',
|
|
'.sass': 'text/x-sass',
|
|
'.less': 'text/x-less',
|
|
'.vue': 'text/x-vue',
|
|
'.svelte': 'text/x-svelte',
|
|
'.astro': 'text/x-astro',
|
|
|
|
# Configuration and build files
|
|
'.yaml': 'application/x-yaml',
|
|
'.yml': 'application/x-yaml',
|
|
'.toml': 'application/toml',
|
|
'.env': 'text/plain',
|
|
'.gitignore': 'text/plain',
|
|
'.dockerfile': 'text/x-dockerfile',
|
|
'.dockerignore': 'text/plain',
|
|
'.makefile': 'text/x-makefile',
|
|
'.cmake': 'text/x-cmake',
|
|
'.gradle': 'text/x-gradle',
|
|
'.maven': 'text/x-maven',
|
|
'.pom': 'application/xml',
|
|
'.sln': 'text/plain',
|
|
'.vcxproj': 'application/xml',
|
|
'.csproj': 'application/xml',
|
|
'.fsproj': 'application/xml',
|
|
'.vbproj': 'application/xml',
|
|
'.xcodeproj': 'text/plain',
|
|
'.pbxproj': 'text/plain',
|
|
|
|
# Documentation and markup
|
|
'.tex': 'application/x-tex',
|
|
'.bib': 'text/x-bibtex',
|
|
'.adoc': 'text/asciidoc',
|
|
'.asciidoc': 'text/asciidoc',
|
|
'.wiki': 'text/x-wiki',
|
|
'.creole': 'text/x-wiki',
|
|
|
|
# Images
|
|
'.jpg': 'image/jpeg',
|
|
'.jpeg': 'image/jpeg',
|
|
'.png': 'image/png',
|
|
'.gif': 'image/gif',
|
|
'.webp': 'image/webp',
|
|
'.bmp': 'image/bmp',
|
|
'.tiff': 'image/tiff',
|
|
'.ico': 'image/x-icon',
|
|
|
|
# Documents
|
|
'.pdf': 'application/pdf',
|
|
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
|
'.odt': 'application/vnd.oasis.opendocument.text',
|
|
'.ods': 'application/vnd.oasis.opendocument.spreadsheet',
|
|
'.odp': 'application/vnd.oasis.opendocument.presentation',
|
|
|
|
# Legacy Office formats
|
|
'.doc': 'application/msword',
|
|
'.xls': 'application/vnd.ms-excel',
|
|
'.ppt': 'application/vnd.ms-powerpoint',
|
|
|
|
# Archives and binaries (will be processed as binary)
|
|
'.zip': 'application/zip',
|
|
'.tar': 'application/x-tar',
|
|
'.gz': 'application/gzip',
|
|
'.7z': 'application/x-7z-compressed',
|
|
'.rar': 'application/vnd.rar',
|
|
'.exe': 'application/x-msdownload',
|
|
'.dll': 'application/x-msdownload',
|
|
'.so': 'application/x-sharedlib',
|
|
'.dylib': 'application/x-mach-binary'
|
|
}
|
|
return mime_types.get(extension.lower(), 'application/octet-stream')
|
|
|
|
async def run_tests(self) -> None:
    """Run the document extraction tests on all files.

    Orchestration: initialize the extractor, collect input files, process
    each file, then log a detailed per-file summary built from
    self.processing_results. Failures are logged, never raised.
    """
    mode = "WITH AI" if self.enable_ai else "CONTENT ONLY (No AI)"
    logger.info(f"Starting document extraction tests - {mode}")
    logger.info(f"Input directory: {self.input_dir}")
    logger.info(f"Output directory: {self.output_dir}")
    if self.enable_ai:
        logger.info(f"Processing prompt: {self.prompt}")
    else:
        logger.info("AI processing: DISABLED - Raw content extraction only")

    # Initialize the extractor
    if not self.initialize_extractor():
        logger.error("Cannot proceed without DocumentExtraction!")
        return

    # Get files to process
    files = self.get_files_to_process()

    if not files:
        logger.warning("No files found to process!")
        return

    # Process each file, counting successes and failures.
    successful = 0
    failed = 0

    logger.info(f"Starting to process {len(files)} files...")
    for i, file_path in enumerate(files):
        logger.info(f"Processing file {i+1}/{len(files)}: {file_path.name}")
        try:
            if await self.process_single_file(file_path):
                successful += 1
                logger.info(f"✓ File {i+1} processed successfully")
            else:
                failed += 1
                logger.error(f"✗ File {i+1} processing failed")
        except Exception as e:
            # process_single_file catches its own errors; this guard keeps
            # an unexpected exception from aborting the remaining files.
            failed += 1
            logger.error(f"✗ Exception processing file {i+1}: {e}")
            import traceback
            traceback.print_exc()

    # Print detailed summary
    mode = "WITH AI" if self.enable_ai else "CONTENT ONLY (No AI)"
    logger.info("\n" + "=" * 80)
    logger.info(f"DETAILED TEST SUMMARY - {mode}")
    logger.info("=" * 80)
    logger.info(f"Total files processed: {len(files)}")
    logger.info(f"Successful: {successful}")
    logger.info(f"Failed: {failed}")
    logger.info(f"Output directory: {self.output_dir}")
    if self.enable_ai:
        logger.info("AI processing: ENABLED")
    else:
        logger.info("AI processing: DISABLED")
    logger.info("=" * 80)

    # List all processed documents with results
    logger.info("\nPROCESSING RESULTS:")
    logger.info("-" * 80)

    for result in self.processing_results:
        status_icon = "✅" if result['status'] == 'OK' else "❌"
        logger.info(f"{status_icon} {result['fileName']} - {result['status']}")

        if result['status'] == 'OK':
            if result['content_items'] == 1:
                logger.info(f"   └─ Generated: {result['output_files'][0]} ({result['total_content_size']} bytes)")
            else:
                logger.info(f"   └─ Generated {result['content_items']} files ({result['total_content_size']} total bytes):")
                for output_file in result['output_files']:
                    logger.info(f"      └─ {output_file}")
        else:
            # 'error' is only present on FAIL results, hence the .get().
            error_msg = result.get('error', 'Unknown error')
            logger.info(f"   └─ Error: {error_msg}")

    logger.info("-" * 80)
    logger.info("=" * 80)
|
|
|
|
def parse_arguments():
    """Define and parse the command-line interface.

    Returns:
        argparse.Namespace with attributes no_ai, input_dir, output_dir, verbose.
    """
    parser = argparse.ArgumentParser(description='Document Extraction Test Script')
    parser.add_argument(
        '--no-ai', '--content-only',
        action='store_true',
        help='Run in content-only mode without AI processing',
    )
    parser.add_argument(
        '--input-dir',
        type=str,
        default='d:/temp/test-extraction',
        help='Input directory containing files to process (default: d:/temp/test-extraction)',
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        help='Output directory for extracted content (auto-generated if not specified)',
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Enable verbose logging',
    )
    return parser.parse_args()
|
|
|
|
async def main():
    """Entry point: parse arguments, pick the extraction mode, run the tests.

    Mode selection:
      * --no-ai / --content-only on the command line forces content-only mode.
      * Otherwise the user is prompted interactively; AI mode is the default
        and is also used when stdin is closed or the prompt is interrupted.
    """
    # Parse command line arguments
    args = parse_arguments()

    # Set logging level based on verbosity
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.getLogger().setLevel(logging.INFO)

    logger.info("DocumentExtraction Test Script")
    logger.info("=" * 50)
    logger.info(f"Source: {args.input_dir}")

    # Determine output directory: an explicit --output-dir always wins,
    # otherwise derive it from the input dir and the (current) mode.
    if args.output_dir:
        output_dir = args.output_dir
    elif args.no_ai:
        output_dir = f"{args.input_dir}/extracted-raw"
    else:
        output_dir = f"{args.input_dir}/extracted"

    logger.info(f"Output: {output_dir}")
    logger.info("=" * 50)

    # Check dependencies first
    if not check_dependencies():
        logger.error("Please install missing dependencies before running tests.")
        return

    # Check module imports
    if not check_module_imports():
        logger.error("Cannot import required modules. Please check your setup.")
        return

    # Determine mode based on command line arguments
    if args.no_ai:
        enable_ai = False
        logger.info("Running in CONTENT ONLY mode (no AI processing)")
    else:
        # Interactive mode: ask user for choice
        print("\n" + "=" * 50)
        print("SELECT EXTRACTION MODE:")
        print("=" * 50)
        print("1. With AI processing (default)")
        print("2. Content only (no AI processing)")
        print("=" * 50)

        try:
            choice = input("Enter your choice (1 or 2, default is 1): ").strip()
            if choice == "2":
                enable_ai = False
                logger.info("Selected: Content only mode (no AI processing)")
            else:
                enable_ai = True
                logger.info("Selected: AI processing mode")
        except (EOFError, KeyboardInterrupt):
            # Default to AI mode if input fails
            enable_ai = True
            logger.info("Defaulting to AI processing mode")

        # BUG FIX: the interactive branch previously overwrote output_dir
        # unconditionally, silently discarding an explicit --output-dir.
        # Re-derive the default only when the user did not specify one.
        if not args.output_dir:
            if enable_ai:
                output_dir = f"{args.input_dir}/extracted"
            else:
                output_dir = f"{args.input_dir}/extracted-raw"

    # Run tests with selected mode
    tester = DocumentExtractionTester(
        input_dir=args.input_dir,
        output_dir=output_dir,
        enable_ai=enable_ai
    )
    await tester.run_tests()
|
|
|
|
if __name__ == "__main__":
    # Both branches of the original if len(sys.argv) > 1 / else invoked
    # asyncio.run(main()) identically; main() already distinguishes CLI
    # from interactive use via parse_arguments(), so one call suffices.
    asyncio.run(main())
|
|
|
|
# Convenience function for easy content-only extraction
|
|
async def extract_documents_content_only(input_folder: str, output_folder: str = None):
    """
    Convenience helper: extract documents without any AI processing.

    Args:
        input_folder: Folder containing the documents to extract.
        output_folder: Destination folder; defaults to '<input_folder>/extracted-raw'.

    Returns:
        True when the run completed, False when prerequisites were missing.

    Example:
        # Extract from d:/temp to d:/temp/extracted-raw
        asyncio.run(extract_documents_content_only("d:/temp"))

        # Extract from custom folders
        asyncio.run(extract_documents_content_only("c:/my_docs", "c:/my_docs/extracted"))
    """
    if output_folder is None:
        output_folder = f"{input_folder}/extracted-raw"

    logger.info(f"Running content-only extraction from {input_folder} to {output_folder}")

    # Abort early when third-party packages or project modules are unavailable.
    if not check_dependencies():
        logger.error("Missing dependencies. Please install required packages.")
        return False
    if not check_module_imports():
        logger.error("Cannot import required modules. Please check your setup.")
        return False

    runner = DocumentExtractionTester(
        input_dir=input_folder,
        output_dir=output_folder,
        enable_ai=False,
    )
    await runner.run_tests()
    return True
|
|
|
|
# Example usage (uncomment to use):
|
|
# if __name__ == "__main__":
|
|
# # For content-only extraction from d:/temp to d:/temp/extracted-raw
|
|
# asyncio.run(extract_documents_content_only("d:/temp"))
|