#!/usr/bin/env python3
# gateway/test_documentExtraction.py
"""
Test script for DocumentExtraction class.
Processes all files in d:/temp folder and stores extracted content in d:/temp/extracted.
Features:
- Option to extract content WITH AI processing (default)
- Option to extract content WITHOUT AI processing (content-only mode)
- Supports all document types: text, images, PDFs, Office documents, etc.
- Detailed logging and progress tracking
- Separate output directories for AI vs content-only modes
Usage:
- Interactive mode: python test_documentExtraction.py
- Content-only mode: python test_documentExtraction.py --no-ai
- Content-only mode: python test_documentExtraction.py --content-only
- Specify custom input/output: python test_documentExtraction.py --input-dir /path/to/input --output-dir /path/to/output --no-ai
"""
import os
import asyncio
import logging
import sys
import argparse
from pathlib import Path
from typing import List, Optional
from datetime import datetime, UTC
# Configure logging for the whole test run. DEBUG is the default so that
# extraction internals are visible; main() lowers this to INFO unless
# --verbose is passed.
logging.basicConfig(
    level=logging.DEBUG,  # Changed from INFO to DEBUG
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Filter out specific unwanted log messages
class LogFilter(logging.Filter):
    """Filter that hides known-noisy log records.

    Suppresses:
    - workflow stats-update errors,
    - httpx/openai "HTTP Request:" lines,
    - "HTTP/1.1 200 OK" response lines.
    """

    def filter(self, record):
        # Format the message once instead of re-formatting per check.
        message = record.getMessage()
        # Hide workflow stats update errors
        if "Workflow" in message and "not found for stats update" in message:
            return False
        # Hide HTTP request info messages
        if "HTTP Request:" in message and "POST https://api.openai.com" in message:
            return False
        # Hide HTTP response info messages
        if "HTTP/1.1 200 OK" in message:
            return False
        return True


# Apply the filter to the root logger
root_logger = logging.getLogger()
root_logger.addFilter(LogFilter())
def check_dependencies():
    """Check that all third-party extraction dependencies are importable.

    Logs a ✓/✗ line per dependency and, when anything is missing, prints
    installation instructions.

    Returns:
        True when every dependency is available, False otherwise.
    """
    import importlib

    # (module to import, pip package name, display name for log lines).
    # The original file repeated an identical try/except block per package.
    required = [
        ("bs4", "beautifulsoup4", "beautifulsoup4"),
        ("PyPDF2", "PyPDF2", "PyPDF2"),
        ("fitz", "PyMuPDF", "PyMuPDF (fitz)"),
        ("docx", "python-docx", "python-docx"),
        ("openpyxl", "openpyxl", "openpyxl"),
        ("pptx", "python-pptx", "python-pptx"),
        ("PIL.Image", "Pillow", "Pillow (PIL)"),
    ]
    missing_deps = []
    for module_name, package_name, display_name in required:
        try:
            importlib.import_module(module_name)
            logger.info(f"✓ {display_name} is available")
        except ImportError:
            missing_deps.append(package_name)
            logger.error(f"✗ {display_name} is missing")
    if missing_deps:
        logger.error("\n" + "=" * 60)
        logger.error("MISSING DEPENDENCIES DETECTED!")
        logger.error("=" * 60)
        logger.error("The following packages are required but not installed:")
        for dep in missing_deps:
            logger.error(f" - {dep}")
        logger.error("\nTo install all dependencies, run:")
        logger.error("pip install -r requirements.txt")
        logger.error("\nOr install individual packages:")
        # The original if/elif chain emitted the identical command for
        # every package name, so a single loop is equivalent.
        for dep in missing_deps:
            logger.error(f" pip install {dep}")
        logger.error("=" * 60)
        return False
    logger.info("✓ All required dependencies are available!")
    return True
def check_module_imports():
    """Check that the project-local modules used by this tester import cleanly.

    Returns:
        True when all imports succeed, False otherwise.
    """
    try:
        # Add the gateway directory's ancestors to the path so we can
        # import our modules.
        # NOTE(review): three levels up from this file — confirm this
        # matches the repository layout.
        sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
        from modules.chat.documents.documentExtraction import DocumentExtraction
        from modules.chat.serviceCenter import ServiceCenter
        from modules.interfaces.interfaceAppModel import User, UserConnection
        from modules.interfaces.interfaceChatModel import ChatWorkflow, TaskItem
        logger.info("✓ All required modules imported successfully")
        return True
    except ImportError as e:
        logger.error(f"✗ Failed to import required modules: {e}")
        logger.error("Make sure you're running this script from the gateway directory")
        return False
    except Exception as e:
        logger.error(f"✗ Unexpected error importing modules: {e}")
        return False
def create_mock_service_center():
    """Create a ServiceCenter for testing with all required fields populated.

    Builds a mock User and ChatWorkflow and wires them into a ServiceCenter.

    Returns:
        The ServiceCenter instance, or None when construction failed.
    """
    try:
        # Only the names actually used below are imported.
        from modules.chat.serviceCenter import ServiceCenter
        from modules.interfaces.interfaceAppModel import User, UserPrivilege, AuthAuthority
        from modules.interfaces.interfaceChatModel import ChatWorkflow
        # Create proper user with all required fields
        mock_user = User(
            id="test_user_001",
            username="testuser",
            email="test@example.com",
            fullName="Test User",
            language="en",
            enabled=True,
            privilege=UserPrivilege.USER,
            authenticationAuthority=AuthAuthority.LOCAL,
            mandateId="test_mandate_001"
        )
        # Create proper workflow with all required fields
        current_time = datetime.now(UTC).isoformat()
        mock_workflow = ChatWorkflow(
            id="test_workflow_001",
            mandateId="test_mandate_001",
            status="active",
            name="Test Document Extraction Workflow",
            currentRound=1,
            lastActivity=current_time,
            startedAt=current_time,
            logs=[],
            messages=[],
            stats=None,
            tasks=[]
        )
        # Create service center
        service_center = ServiceCenter(mock_user, mock_workflow)
        logger.info("✓ ServiceCenter created successfully with proper objects")
        return service_center
    except Exception as e:
        logger.error(f"✗ Failed to create ServiceCenter: {e}")
        return None
class DocumentExtractionTester:
    """Test class for DocumentExtraction functionality."""

    def __init__(self, input_dir: str = "d:/temp/test-extraction", output_dir: Optional[str] = None, enable_ai: bool = True):
        """
        Initialize the tester.

        Args:
            input_dir: Directory containing files to process
            output_dir: Directory to store extracted content (auto-generated if None)
            enable_ai: Whether to enable AI processing (default: True)
        """
        self.input_dir = Path(input_dir)
        # Auto-generate output directory if not specified: "extracted" for
        # AI-processed output, "extracted-raw" for content-only output.
        if output_dir is None:
            if enable_ai:
                self.output_dir = Path(input_dir) / "extracted"
            else:
                self.output_dir = Path(input_dir) / "extracted-raw"
        else:
            self.output_dir = Path(output_dir)
        self.extractor = None       # set by initialize_extractor()
        self.service_center = None  # set by initialize_extractor()
        self.enable_ai = enable_ai
        if enable_ai:
            self.prompt = "Make a summary of each sentence for each page or chapter of the document"
        else:
            self.prompt = None  # No prompt needed for content-only extraction
        # Track processing results for summary
        self.processing_results = []
        # Ensure output directory exists
        logger.info(f"Creating output directory: {self.output_dir}")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Verify directory was created
        if self.output_dir.exists():
            logger.info(f"✓ Output directory created/verified: {self.output_dir}")
            logger.info(f"Output directory absolute path: {self.output_dir.absolute()}")
        else:
            logger.error(f"✗ Failed to create output directory: {self.output_dir}")
        # Log configuration
        logger.info(f"Configuration: AI processing = {'ENABLED' if self.enable_ai else 'DISABLED'}")
        logger.info(f"Input directory: {self.input_dir}")
        logger.info(f"Output directory: {self.output_dir}")
        # Test basic file writing capability
        test_file = self.output_dir / "test_write_capability.txt"
        try:
            logger.info(f"Testing file write capability to: {test_file}")
            logger.info(f"Absolute path: {test_file.absolute()}")
            with open(test_file, 'w', encoding='utf-8') as f:
                f.write("Test file to verify write capability")
            if test_file.exists():
                actual_size = test_file.stat().st_size
                logger.info(f"✓ Basic file writing test passed: {test_file} (size: {actual_size} bytes)")
                # Test reading the file back
                with open(test_file, 'r', encoding='utf-8') as f:
                    content = f.read()
                logger.info(f"✓ File read test passed: content length = {len(content)}")
                # Clean up test file
                test_file.unlink()
                logger.info("✓ Test file cleaned up")
            else:
                logger.error(f"✗ Basic file writing test failed: {test_file}")
        except Exception as e:
            logger.error(f"✗ Basic file writing test failed with error: {e}")
            import traceback
            traceback.print_exc()
        # Supported file extensions for content extraction.
        # (The original literal listed '.r' twice; a set keeps one copy.)
        self.supported_extensions = {
            # Text and data files
            '.txt', '.csv', '.json', '.xml', '.html', '.htm', '.svg',
            '.md', '.markdown', '.rst', '.log', '.ini', '.cfg', '.conf',
            # Programming languages
            '.js', '.ts', '.jsx', '.tsx', '.py', '.java', '.c', '.cpp', '.cc', '.cxx',
            '.h', '.hpp', '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala',
            '.r', '.m', '.pl', '.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd',
            '.vbs', '.lua', '.sql', '.dart', '.elm', '.clj', '.hs', '.fs', '.ml',
            # Web technologies
            '.css', '.scss', '.sass', '.less', '.vue', '.svelte', '.astro',
            # Configuration and build files
            '.yaml', '.yml', '.toml', '.env', '.gitignore', '.dockerfile', '.dockerignore',
            '.makefile', '.cmake', '.gradle', '.maven', '.pom', '.sln', '.vcxproj',
            '.csproj', '.fsproj', '.vbproj', '.xcodeproj', '.pbxproj',
            # Documentation and markup
            '.tex', '.bib', '.adoc', '.asciidoc', '.wiki', '.creole',
            # Images
            '.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.ico',
            # Documents
            '.pdf', '.docx', '.xlsx', '.pptx', '.odt', '.ods', '.odp',
            # Legacy Office formats
            '.doc', '.xls', '.ppt',
            # Archives and binaries
            '.zip', '.tar', '.gz', '.7z', '.rar', '.exe', '.dll', '.so', '.dylib'
        }
def initialize_extractor(self):
"""Initialize the DocumentExtraction instance with a proper ServiceCenter."""
try:
# First create the service center
self.service_center = create_mock_service_center()
if not self.service_center:
logger.error("Failed to create ServiceCenter!")
return False
# Now create DocumentExtraction with the service center
from modules.chat.documents.documentExtraction import DocumentExtraction
self.extractor = DocumentExtraction(self.service_center)
logger.info("✓ DocumentExtraction initialized successfully with ServiceCenter")
return True
except Exception as e:
logger.error(f"✗ Failed to initialize DocumentExtraction: {e}")
return False
def get_files_to_process(self) -> List[Path]:
"""Get list of files to process from input directory."""
if not self.input_dir.exists():
logger.error(f"Input directory {self.input_dir} does not exist!")
logger.info("Creating input directory and adding a test file...")
self.input_dir.mkdir(parents=True, exist_ok=True)
# Create a test file if none exist
test_file = self.input_dir / "test.txt"
with open(test_file, 'w') as f:
f.write("This is a test file for document extraction.\nIt contains multiple lines.\nAnd some special characters: äöüß")
logger.info(f"Created test file: {test_file}")
files = []
all_files = list(self.input_dir.iterdir())
logger.info(f"All files in directory: {[f.name for f in all_files]}")
for file_path in all_files:
if file_path.is_file():
logger.debug(f"Checking file: {file_path.name} (extension: {file_path.suffix})")
if file_path.suffix.lower() in self.supported_extensions:
files.append(file_path)
logger.debug(f"Added file: {file_path.name}")
else:
logger.debug(f"Skipped file: {file_path.name} (unsupported extension)")
logger.info(f"Found {len(files)} supported files to process")
if files:
logger.info(f"Files to process: {[f.name for f in files]}")
return files
async def process_single_file(self, file_path: Path) -> bool:
"""
Process a single file and extract its content.
Args:
file_path: Path to the file to process
Returns:
True if successful, False otherwise
"""
if not self.extractor:
logger.error("DocumentExtraction not initialized!")
return False
try:
logger.info(f"Processing file: {file_path.name}")
# Read file data
with open(file_path, 'rb') as f:
file_data = f.read()
logger.debug(f"File size: {len(file_data)} bytes")
# Determine MIME type based on extension
mime_type = self._get_mime_type(file_path.suffix)
logger.debug(f"MIME type: {mime_type}")
# Process the file with or without AI based on configuration
extracted_content = await self.extractor.processFileData(
fileData=file_data,
fileName=file_path.name,
mimeType=mime_type,
base64Encoded=False,
prompt=self.prompt,
enableAI=self.enable_ai
)
logger.debug(f"Extracted {len(extracted_content.contents)} content items")
# Debug: Show content details
for i, content_item in enumerate(extracted_content.contents):
logger.debug(f"Content item {i+1}: label='{content_item.label}', has_data={content_item.data is not None}, data_length={len(content_item.data) if content_item.data else 0}")
# Special logging for JavaScript files
if mime_type == "application/javascript":
logger.debug(f"JavaScript file detected: {file_path.name}")
logger.debug(f"Original file size: {len(file_data)} bytes")
for i, content_item in enumerate(extracted_content.contents):
if content_item.data:
content_size = len(content_item.data.encode('utf-8'))
logger.debug(f"JavaScript content item {i+1}: {content_size} bytes")
# Check if content was truncated
if content_size < len(file_data) * 0.9: # If less than 90% of original
logger.warning(f"JavaScript content may be truncated: {content_size} bytes vs {len(file_data)} bytes original")
# Track processing result
result = {
'fileName': file_path.name,
'status': 'OK',
'content_items': 0,
'output_files': [],
'total_content_size': 0
}
# Save each content item as a separate file
if extracted_content.contents:
for i, content_item in enumerate(extracted_content.contents):
if content_item.data:
content_size = len(content_item.data.encode('utf-8'))
result['total_content_size'] += content_size
logger.debug(f"Content item {i+1}: {content_item.label}, size: {content_size} bytes")
# Generate fileName with new naming convention
if len(extracted_content.contents) == 1:
# Single content item
output_fileName = f"{file_path.stem} - {content_item.label} 1.txt"
else:
# Multiple content items - add sequence number
output_fileName = f"{file_path.stem} - {content_item.label} {i+1}.txt"
output_file = self.output_dir / output_fileName
# Write only the raw extracted content
logger.debug(f"Attempting to write to: {output_file}")
try:
with open(output_file, 'w', encoding='utf-8') as f:
f.write(content_item.data)
# Verify file was created
if output_file.exists():
actual_size = output_file.stat().st_size
logger.info(f"✓ File created successfully: {output_fileName} (expected: {content_size} bytes, actual: {actual_size} bytes)")
else:
logger.error(f"✗ File was not created: {output_file}")
result['output_files'].append(output_fileName)
result['content_items'] += 1
except Exception as write_error:
logger.error(f"✗ Error writing file {output_fileName}: {write_error}")
import traceback
traceback.print_exc()
else:
logger.warning(f"Content item {i+1} has no data, skipping")
else:
logger.warning(f"No content extracted from {file_path.name}")
result['status'] = 'FAIL'
result['error'] = 'No content extracted'
# Add result to tracking list
self.processing_results.append(result)
logger.info(f"Successfully processed {file_path.name} - Total content: {result['total_content_size']} bytes")
return True
except Exception as e:
error_msg = str(e)
logger.error(f"Error processing {file_path.name}: {error_msg}")
# Track failed result
result = {
'fileName': file_path.name,
'status': 'FAIL',
'content_items': 0,
'output_files': [],
'error': error_msg,
'total_content_size': 0
}
self.processing_results.append(result)
return False
def _get_mime_type(self, extension: str) -> str:
"""Get MIME type based on file extension."""
mime_types = {
# Text and data files
'.txt': 'text/plain',
'.csv': 'text/csv',
'.json': 'application/json',
'.xml': 'application/xml',
'.html': 'text/html',
'.htm': 'text/html',
'.svg': 'image/svg+xml',
'.md': 'text/markdown',
'.markdown': 'text/markdown',
'.rst': 'text/x-rst',
'.log': 'text/plain',
'.ini': 'text/plain',
'.cfg': 'text/plain',
'.conf': 'text/plain',
# Programming languages
'.js': 'application/javascript',
'.ts': 'application/typescript',
'.jsx': 'text/jsx',
'.tsx': 'text/tsx',
'.py': 'text/x-python',
'.java': 'text/x-java-source',
'.c': 'text/x-c',
'.cpp': 'text/x-c++src',
'.cc': 'text/x-c++src',
'.cxx': 'text/x-c++src',
'.h': 'text/x-c',
'.hpp': 'text/x-c++hdr',
'.cs': 'text/x-csharp',
'.php': 'application/x-httpd-php',
'.rb': 'text/x-ruby',
'.go': 'text/x-go',
'.rs': 'text/x-rust',
'.swift': 'text/x-swift',
'.kt': 'text/x-kotlin',
'.scala': 'text/x-scala',
'.r': 'text/x-r',
'.m': 'text/x-matlab',
'.pl': 'text/x-perl',
'.sh': 'application/x-sh',
'.bash': 'application/x-sh',
'.zsh': 'application/x-sh',
'.fish': 'application/x-sh',
'.ps1': 'application/x-powershell',
'.bat': 'application/x-msdos-program',
'.cmd': 'application/x-msdos-program',
'.vbs': 'text/vbscript',
'.lua': 'text/x-lua',
'.sql': 'application/sql',
'.dart': 'application/dart',
'.elm': 'text/x-elm',
'.clj': 'text/x-clojure',
'.hs': 'text/x-haskell',
'.fs': 'text/x-fsharp',
'.ml': 'text/x-ocaml',
# Web technologies
'.css': 'text/css',
'.scss': 'text/x-scss',
'.sass': 'text/x-sass',
'.less': 'text/x-less',
'.vue': 'text/x-vue',
'.svelte': 'text/x-svelte',
'.astro': 'text/x-astro',
# Configuration and build files
'.yaml': 'application/x-yaml',
'.yml': 'application/x-yaml',
'.toml': 'application/toml',
'.env': 'text/plain',
'.gitignore': 'text/plain',
'.dockerfile': 'text/x-dockerfile',
'.dockerignore': 'text/plain',
'.makefile': 'text/x-makefile',
'.cmake': 'text/x-cmake',
'.gradle': 'text/x-gradle',
'.maven': 'text/x-maven',
'.pom': 'application/xml',
'.sln': 'text/plain',
'.vcxproj': 'application/xml',
'.csproj': 'application/xml',
'.fsproj': 'application/xml',
'.vbproj': 'application/xml',
'.xcodeproj': 'text/plain',
'.pbxproj': 'text/plain',
# Documentation and markup
'.tex': 'application/x-tex',
'.bib': 'text/x-bibtex',
'.adoc': 'text/asciidoc',
'.asciidoc': 'text/asciidoc',
'.wiki': 'text/x-wiki',
'.creole': 'text/x-wiki',
# Images
'.jpg': 'image/jpeg',
'.jpeg': 'image/jpeg',
'.png': 'image/png',
'.gif': 'image/gif',
'.webp': 'image/webp',
'.bmp': 'image/bmp',
'.tiff': 'image/tiff',
'.ico': 'image/x-icon',
# Documents
'.pdf': 'application/pdf',
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'.odt': 'application/vnd.oasis.opendocument.text',
'.ods': 'application/vnd.oasis.opendocument.spreadsheet',
'.odp': 'application/vnd.oasis.opendocument.presentation',
# Legacy Office formats
'.doc': 'application/msword',
'.xls': 'application/vnd.ms-excel',
'.ppt': 'application/vnd.ms-powerpoint',
# Archives and binaries (will be processed as binary)
'.zip': 'application/zip',
'.tar': 'application/x-tar',
'.gz': 'application/gzip',
'.7z': 'application/x-7z-compressed',
'.rar': 'application/vnd.rar',
'.exe': 'application/x-msdownload',
'.dll': 'application/x-msdownload',
'.so': 'application/x-sharedlib',
'.dylib': 'application/x-mach-binary'
}
return mime_types.get(extension.lower(), 'application/octet-stream')
async def run_tests(self) -> None:
"""Run the document extraction tests on all files."""
mode = "WITH AI" if self.enable_ai else "CONTENT ONLY (No AI)"
logger.info(f"Starting document extraction tests - {mode}")
logger.info(f"Input directory: {self.input_dir}")
logger.info(f"Output directory: {self.output_dir}")
if self.enable_ai:
logger.info(f"Processing prompt: {self.prompt}")
else:
logger.info("AI processing: DISABLED - Raw content extraction only")
# Initialize the extractor
if not self.initialize_extractor():
logger.error("Cannot proceed without DocumentExtraction!")
return
# Get files to process
files = self.get_files_to_process()
if not files:
logger.warning("No files found to process!")
return
# Process each file
successful = 0
failed = 0
logger.info(f"Starting to process {len(files)} files...")
for i, file_path in enumerate(files):
logger.info(f"Processing file {i+1}/{len(files)}: {file_path.name}")
try:
if await self.process_single_file(file_path):
successful += 1
logger.info(f"✓ File {i+1} processed successfully")
else:
failed += 1
logger.error(f"✗ File {i+1} processing failed")
except Exception as e:
failed += 1
logger.error(f"✗ Exception processing file {i+1}: {e}")
import traceback
traceback.print_exc()
# Print detailed summary
mode = "WITH AI" if self.enable_ai else "CONTENT ONLY (No AI)"
logger.info("\n" + "=" * 80)
logger.info(f"DETAILED TEST SUMMARY - {mode}")
logger.info("=" * 80)
logger.info(f"Total files processed: {len(files)}")
logger.info(f"Successful: {successful}")
logger.info(f"Failed: {failed}")
logger.info(f"Output directory: {self.output_dir}")
if self.enable_ai:
logger.info("AI processing: ENABLED")
else:
logger.info("AI processing: DISABLED")
logger.info("=" * 80)
# List all processed documents with results
logger.info("\nPROCESSING RESULTS:")
logger.info("-" * 80)
for result in self.processing_results:
status_icon = "" if result['status'] == 'OK' else ""
logger.info(f"{status_icon} {result['fileName']} - {result['status']}")
if result['status'] == 'OK':
if result['content_items'] == 1:
logger.info(f" └─ Generated: {result['output_files'][0]} ({result['total_content_size']} bytes)")
else:
logger.info(f" └─ Generated {result['content_items']} files ({result['total_content_size']} total bytes):")
for output_file in result['output_files']:
logger.info(f" └─ {output_file}")
else:
error_msg = result.get('error', 'Unknown error')
logger.info(f" └─ Error: {error_msg}")
logger.info("-" * 80)
logger.info("=" * 80)
def parse_arguments():
    """Parse command line arguments.

    Returns:
        argparse.Namespace with attributes: no_ai (bool), input_dir (str),
        output_dir (str | None), verbose (bool).
    """
    parser = argparse.ArgumentParser(description='Document Extraction Test Script')
    parser.add_argument('--no-ai', '--content-only', action='store_true',
                        help='Run in content-only mode without AI processing')
    parser.add_argument('--input-dir', type=str, default='d:/temp/test-extraction',
                        help='Input directory containing files to process (default: d:/temp/test-extraction)')
    parser.add_argument('--output-dir', type=str,
                        help='Output directory for extracted content (auto-generated if not specified)')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Enable verbose logging')
    return parser.parse_args()
async def main():
    """Main entry point: parse arguments, select a mode, and run the tests.

    When --no-ai/--content-only is absent the user is prompted
    interactively; EOF or Ctrl-C during the prompt falls back to AI mode.
    """
    # Parse command line arguments
    args = parse_arguments()
    # Set logging level based on verbosity
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.getLogger().setLevel(logging.INFO)
    logger.info("DocumentExtraction Test Script")
    logger.info("=" * 50)
    logger.info(f"Source: {args.input_dir}")
    # Determine output directory
    if args.output_dir:
        output_dir = args.output_dir
    else:
        if args.no_ai:
            output_dir = f"{args.input_dir}/extracted-raw"
        else:
            output_dir = f"{args.input_dir}/extracted"
    logger.info(f"Output: {output_dir}")
    logger.info("=" * 50)
    # Check dependencies first
    if not check_dependencies():
        logger.error("Please install missing dependencies before running tests.")
        return
    # Check module imports
    if not check_module_imports():
        logger.error("Cannot import required modules. Please check your setup.")
        return
    # Determine mode based on command line arguments
    if args.no_ai:
        enable_ai = False
        logger.info("Running in CONTENT ONLY mode (no AI processing)")
    else:
        # Interactive mode: ask user for choice
        print("\n" + "=" * 50)
        print("SELECT EXTRACTION MODE:")
        print("=" * 50)
        print("1. With AI processing (default)")
        print("2. Content only (no AI processing)")
        print("=" * 50)
        try:
            choice = input("Enter your choice (1 or 2, default is 1): ").strip()
            if choice == "2":
                enable_ai = False
                logger.info("Selected: Content only mode (no AI processing)")
            else:
                enable_ai = True
                logger.info("Selected: AI processing mode")
        except (EOFError, KeyboardInterrupt):
            # Default to AI mode if input fails
            enable_ai = True
            logger.info("Defaulting to AI processing mode")
        # BUG FIX: the interactive branch used to recompute output_dir
        # unconditionally, silently discarding an explicit --output-dir.
        # Only auto-derive it when the user did not specify one.
        if not args.output_dir:
            if enable_ai:
                output_dir = f"{args.input_dir}/extracted"
            else:
                output_dir = f"{args.input_dir}/extracted-raw"
    # Run tests with selected mode
    tester = DocumentExtractionTester(
        input_dir=args.input_dir,
        output_dir=output_dir,
        enable_ai=enable_ai
    )
    await tester.run_tests()
if __name__ == "__main__":
    # main() inspects sys.argv itself, so the same entry point serves both
    # flag-driven (automated) and interactive runs. (The original if/else
    # executed the identical call in both branches.)
    asyncio.run(main())
# Convenience function for easy content-only extraction
async def extract_documents_content_only(input_folder: str, output_folder: Optional[str] = None):
    """
    Convenience function to extract documents without AI processing.

    Args:
        input_folder: Path to folder containing documents to extract
        output_folder: Path to folder where extracted content will be stored
            (defaults to "<input_folder>/extracted-raw")
    Returns:
        True when the run completed; False when dependencies or module
        imports are missing.
    Example:
        # Extract from d:/temp to d:/temp/extracted-raw
        asyncio.run(extract_documents_content_only("d:/temp"))
        # Extract from custom folders
        asyncio.run(extract_documents_content_only("c:/my_docs", "c:/my_docs/extracted"))
    """
    if output_folder is None:
        output_folder = f"{input_folder}/extracted-raw"
    logger.info(f"Running content-only extraction from {input_folder} to {output_folder}")
    # Check dependencies and imports
    if not check_dependencies():
        logger.error("Missing dependencies. Please install required packages.")
        return False
    if not check_module_imports():
        logger.error("Cannot import required modules. Please check your setup.")
        return False
    # Create tester and run
    tester = DocumentExtractionTester(
        input_dir=input_folder,
        output_dir=output_folder,
        enable_ai=False
    )
    await tester.run_tests()
    return True
# Example usage (uncomment to use):
# if __name__ == "__main__":
# # For content-only extraction from d:/temp to d:/temp/extracted-raw
# asyncio.run(extract_documents_content_only("d:/temp"))