#!/usr/bin/env python3 """ Test script for DocumentExtraction class. Processes all files in d:/temp folder and stores extracted content in d:/temp/extracted. Features: - Option to extract content WITH AI processing (default) - Option to extract content WITHOUT AI processing (content-only mode) - Supports all document types: text, images, PDFs, Office documents, etc. - Detailed logging and progress tracking - Separate output directories for AI vs content-only modes Usage: - Interactive mode: python test_documentExtraction.py - Content-only mode: python test_documentExtraction.py --no-ai - Content-only mode: python test_documentExtraction.py --content-only - Specify custom input/output: python test_documentExtraction.py --input-dir /path/to/input --output-dir /path/to/output --no-ai """ import os import asyncio import logging import sys import argparse from pathlib import Path from typing import List, Optional from datetime import datetime, UTC # Configure logging logging.basicConfig( level=logging.DEBUG, # Changed from INFO to DEBUG format='%(asctime)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) # Filter out specific unwanted log messages class LogFilter(logging.Filter): """Filter to hide specific unwanted log messages.""" def filter(self, record): # Hide workflow stats update errors if "Workflow" in record.getMessage() and "not found for stats update" in record.getMessage(): return False # Hide HTTP request info messages if "HTTP Request:" in record.getMessage() and "POST https://api.openai.com" in record.getMessage(): return False # Hide HTTP response info messages if "HTTP/1.1 200 OK" in record.getMessage(): return False return True # Apply the filter to the root logger root_logger = logging.getLogger() root_logger.addFilter(LogFilter()) def check_dependencies(): """Check if required dependencies are available and provide installation instructions.""" missing_deps = [] # Check for required dependencies try: import bs4 logger.info("✓ 
beautifulsoup4 is available") except ImportError: missing_deps.append("beautifulsoup4") logger.error("✗ beautifulsoup4 is missing") try: import PyPDF2 logger.info("✓ PyPDF2 is available") except ImportError: missing_deps.append("PyPDF2") logger.error("✗ PyPDF2 is missing") try: import fitz logger.info("✓ PyMuPDF (fitz) is available") except ImportError: missing_deps.append("PyMuPDF") logger.error("✗ PyMuPDF (fitz) is missing") try: import docx logger.info("✓ python-docx is available") except ImportError: missing_deps.append("python-docx") logger.error("✗ python-docx is missing") try: import openpyxl logger.info("✓ openpyxl is available") except ImportError: missing_deps.append("openpyxl") logger.error("✗ openpyxl is missing") try: import pptx logger.info("✓ python-pptx is available") except ImportError: missing_deps.append("python-pptx") logger.error("✗ python-pptx is missing") try: from PIL import Image logger.info("✓ Pillow (PIL) is available") except ImportError: missing_deps.append("Pillow") logger.error("✗ Pillow (PIL) is missing") if missing_deps: logger.error("\n" + "="*60) logger.error("MISSING DEPENDENCIES DETECTED!") logger.error("="*60) logger.error("The following packages are required but not installed:") for dep in missing_deps: logger.error(f" - {dep}") logger.error("\nTo install all dependencies, run:") logger.error("pip install -r requirements.txt") logger.error("\nOr install individual packages:") for dep in missing_deps: if dep == "beautifulsoup4": logger.error(f" pip install {dep}") elif dep == "PyMuPDF": logger.error(f" pip install {dep}") elif dep == "Pillow": logger.error(f" pip install {dep}") else: logger.error(f" pip install {dep}") logger.error("="*60) return False logger.info("✓ All required dependencies are available!") return True def check_module_imports(): """Check if we can import the required modules.""" try: # Add the gateway directory to the path so we can import our modules sys.path.append(os.path.join(os.path.dirname(__file__), 
'..', '..', '..')) from modules.chat.documents.documentExtraction import DocumentExtraction from modules.chat.serviceCenter import ServiceCenter from modules.interfaces.interfaceAppModel import User, UserConnection from modules.interfaces.interfaceChatModel import ChatWorkflow, TaskItem logger.info("✓ All required modules imported successfully") return True except ImportError as e: logger.error(f"✗ Failed to import required modules: {e}") logger.error("Make sure you're running this script from the gateway directory") return False except Exception as e: logger.error(f"✗ Unexpected error importing modules: {e}") return False def create_mock_service_center(): """Create a proper ServiceCenter for testing purposes with all required fields.""" try: from modules.chat.serviceCenter import ServiceCenter from modules.interfaces.interfaceAppModel import User, UserPrivilege, AuthAuthority from modules.interfaces.interfaceChatModel import ChatWorkflow, TaskItem, TaskStatus from modules.interfaces.interfaceChatModel import ChatLog, ChatMessage, ChatStat # Create proper user with all required fields mock_user = User( id="test_user_001", username="testuser", email="test@example.com", fullName="Test User", language="en", enabled=True, privilege=UserPrivilege.USER, authenticationAuthority=AuthAuthority.LOCAL, mandateId="test_mandate_001" ) # Create proper workflow with all required fields current_time = datetime.now(UTC).isoformat() mock_workflow = ChatWorkflow( id="test_workflow_001", mandateId="test_mandate_001", status="active", name="Test Document Extraction Workflow", currentRound=1, lastActivity=current_time, startedAt=current_time, logs=[], messages=[], stats=None, tasks=[] ) # Create service center service_center = ServiceCenter(mock_user, mock_workflow) logger.info("✓ ServiceCenter created successfully with proper objects") return service_center except Exception as e: logger.error(f"✗ Failed to create ServiceCenter: {e}") return None class DocumentExtractionTester: """Test 
class for DocumentExtraction functionality.""" def __init__(self, input_dir: str = "d:/temp/test-extraction", output_dir: str = None, enable_ai: bool = True): """ Initialize the tester. Args: input_dir: Directory containing files to process output_dir: Directory to store extracted content (auto-generated if None) enable_ai: Whether to enable AI processing (default: True) """ self.input_dir = Path(input_dir) # Auto-generate output directory if not specified if output_dir is None: if enable_ai: self.output_dir = Path(input_dir) / "extracted" else: self.output_dir = Path(input_dir) / "extracted-raw" else: self.output_dir = Path(output_dir) self.extractor = None self.service_center = None self.enable_ai = enable_ai if enable_ai: self.prompt = "Make a summary of each sentence for each page or chapter of the document" else: self.prompt = None # No prompt needed for content-only extraction # Track processing results for summary self.processing_results = [] # Ensure output directory exists logger.info(f"Creating output directory: {self.output_dir}") self.output_dir.mkdir(parents=True, exist_ok=True) # Verify directory was created if self.output_dir.exists(): logger.info(f"✓ Output directory created/verified: {self.output_dir}") logger.info(f"Output directory absolute path: {self.output_dir.absolute()}") else: logger.error(f"✗ Failed to create output directory: {self.output_dir}") # Log configuration logger.info(f"Configuration: AI processing = {'ENABLED' if self.enable_ai else 'DISABLED'}") logger.info(f"Input directory: {self.input_dir}") logger.info(f"Output directory: {self.output_dir}") # Test basic file writing capability test_file = self.output_dir / "test_write_capability.txt" try: logger.info(f"Testing file write capability to: {test_file}") logger.info(f"Absolute path: {test_file.absolute()}") with open(test_file, 'w', encoding='utf-8') as f: f.write("Test file to verify write capability") if test_file.exists(): actual_size = test_file.stat().st_size 
logger.info(f"✓ Basic file writing test passed: {test_file} (size: {actual_size} bytes)") # Test reading the file back with open(test_file, 'r', encoding='utf-8') as f: content = f.read() logger.info(f"✓ File read test passed: content length = {len(content)}") # Clean up test file test_file.unlink() logger.info("✓ Test file cleaned up") else: logger.error(f"✗ Basic file writing test failed: {test_file}") except Exception as e: logger.error(f"✗ Basic file writing test failed with error: {e}") import traceback traceback.print_exc() # Supported file extensions for content extraction self.supported_extensions = { # Text and data files '.txt', '.csv', '.json', '.xml', '.html', '.htm', '.svg', '.md', '.markdown', '.rst', '.log', '.ini', '.cfg', '.conf', # Programming languages '.js', '.ts', '.jsx', '.tsx', '.py', '.java', '.c', '.cpp', '.cc', '.cxx', '.h', '.hpp', '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala', '.r', '.m', '.pl', '.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd', '.vbs', '.lua', '.sql', '.r', '.dart', '.elm', '.clj', '.hs', '.fs', '.ml', # Web technologies '.css', '.scss', '.sass', '.less', '.vue', '.svelte', '.astro', # Configuration and build files '.yaml', '.yml', '.toml', '.env', '.gitignore', '.dockerfile', '.dockerignore', '.makefile', '.cmake', '.gradle', '.maven', '.pom', '.sln', '.vcxproj', '.csproj', '.fsproj', '.vbproj', '.xcodeproj', '.pbxproj', # Documentation and markup '.tex', '.bib', '.adoc', '.asciidoc', '.wiki', '.creole', # Images '.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.ico', # Documents '.pdf', '.docx', '.xlsx', '.pptx', '.odt', '.ods', '.odp', # Legacy Office formats '.doc', '.xls', '.ppt', # Archives and binaries '.zip', '.tar', '.gz', '.7z', '.rar', '.exe', '.dll', '.so', '.dylib' } def initialize_extractor(self): """Initialize the DocumentExtraction instance with a proper ServiceCenter.""" try: # First create the service center self.service_center = create_mock_service_center() if not 
self.service_center: logger.error("Failed to create ServiceCenter!") return False # Now create DocumentExtraction with the service center from modules.chat.documents.documentExtraction import DocumentExtraction self.extractor = DocumentExtraction(self.service_center) logger.info("✓ DocumentExtraction initialized successfully with ServiceCenter") return True except Exception as e: logger.error(f"✗ Failed to initialize DocumentExtraction: {e}") return False def get_files_to_process(self) -> List[Path]: """Get list of files to process from input directory.""" if not self.input_dir.exists(): logger.error(f"Input directory {self.input_dir} does not exist!") logger.info("Creating input directory and adding a test file...") self.input_dir.mkdir(parents=True, exist_ok=True) # Create a test file if none exist test_file = self.input_dir / "test.txt" with open(test_file, 'w') as f: f.write("This is a test file for document extraction.\nIt contains multiple lines.\nAnd some special characters: äöüß") logger.info(f"Created test file: {test_file}") files = [] all_files = list(self.input_dir.iterdir()) logger.info(f"All files in directory: {[f.name for f in all_files]}") for file_path in all_files: if file_path.is_file(): logger.debug(f"Checking file: {file_path.name} (extension: {file_path.suffix})") if file_path.suffix.lower() in self.supported_extensions: files.append(file_path) logger.debug(f"Added file: {file_path.name}") else: logger.debug(f"Skipped file: {file_path.name} (unsupported extension)") logger.info(f"Found {len(files)} supported files to process") if files: logger.info(f"Files to process: {[f.name for f in files]}") return files async def process_single_file(self, file_path: Path) -> bool: """ Process a single file and extract its content. 
Args: file_path: Path to the file to process Returns: True if successful, False otherwise """ if not self.extractor: logger.error("DocumentExtraction not initialized!") return False try: logger.info(f"Processing file: {file_path.name}") # Read file data with open(file_path, 'rb') as f: file_data = f.read() logger.debug(f"File size: {len(file_data)} bytes") # Determine MIME type based on extension mime_type = self._get_mime_type(file_path.suffix) logger.debug(f"MIME type: {mime_type}") # Process the file with or without AI based on configuration extracted_content = await self.extractor.processFileData( fileData=file_data, filename=file_path.name, mimeType=mime_type, base64Encoded=False, prompt=self.prompt, enableAI=self.enable_ai ) logger.debug(f"Extracted {len(extracted_content.contents)} content items") # Debug: Show content details for i, content_item in enumerate(extracted_content.contents): logger.debug(f"Content item {i+1}: label='{content_item.label}', has_data={content_item.data is not None}, data_length={len(content_item.data) if content_item.data else 0}") # Special logging for JavaScript files if mime_type == "application/javascript": logger.debug(f"JavaScript file detected: {file_path.name}") logger.debug(f"Original file size: {len(file_data)} bytes") for i, content_item in enumerate(extracted_content.contents): if content_item.data: content_size = len(content_item.data.encode('utf-8')) logger.debug(f"JavaScript content item {i+1}: {content_size} bytes") # Check if content was truncated if content_size < len(file_data) * 0.9: # If less than 90% of original logger.warning(f"JavaScript content may be truncated: {content_size} bytes vs {len(file_data)} bytes original") # Track processing result result = { 'filename': file_path.name, 'status': 'OK', 'content_items': 0, 'output_files': [], 'total_content_size': 0 } # Save each content item as a separate file if extracted_content.contents: for i, content_item in enumerate(extracted_content.contents): if 
content_item.data: content_size = len(content_item.data.encode('utf-8')) result['total_content_size'] += content_size logger.debug(f"Content item {i+1}: {content_item.label}, size: {content_size} bytes") # Generate filename with new naming convention if len(extracted_content.contents) == 1: # Single content item output_filename = f"{file_path.stem} - {content_item.label} 1.txt" else: # Multiple content items - add sequence number output_filename = f"{file_path.stem} - {content_item.label} {i+1}.txt" output_file = self.output_dir / output_filename # Write only the raw extracted content logger.debug(f"Attempting to write to: {output_file}") try: with open(output_file, 'w', encoding='utf-8') as f: f.write(content_item.data) # Verify file was created if output_file.exists(): actual_size = output_file.stat().st_size logger.info(f"✓ File created successfully: {output_filename} (expected: {content_size} bytes, actual: {actual_size} bytes)") else: logger.error(f"✗ File was not created: {output_file}") result['output_files'].append(output_filename) result['content_items'] += 1 except Exception as write_error: logger.error(f"✗ Error writing file {output_filename}: {write_error}") import traceback traceback.print_exc() else: logger.warning(f"Content item {i+1} has no data, skipping") else: logger.warning(f"No content extracted from {file_path.name}") result['status'] = 'FAIL' result['error'] = 'No content extracted' # Add result to tracking list self.processing_results.append(result) logger.info(f"Successfully processed {file_path.name} - Total content: {result['total_content_size']} bytes") return True except Exception as e: error_msg = str(e) logger.error(f"Error processing {file_path.name}: {error_msg}") # Track failed result result = { 'filename': file_path.name, 'status': 'FAIL', 'content_items': 0, 'output_files': [], 'error': error_msg, 'total_content_size': 0 } self.processing_results.append(result) return False def _get_mime_type(self, extension: str) -> str: """Get 
MIME type based on file extension.""" mime_types = { # Text and data files '.txt': 'text/plain', '.csv': 'text/csv', '.json': 'application/json', '.xml': 'application/xml', '.html': 'text/html', '.htm': 'text/html', '.svg': 'image/svg+xml', '.md': 'text/markdown', '.markdown': 'text/markdown', '.rst': 'text/x-rst', '.log': 'text/plain', '.ini': 'text/plain', '.cfg': 'text/plain', '.conf': 'text/plain', # Programming languages '.js': 'application/javascript', '.ts': 'application/typescript', '.jsx': 'text/jsx', '.tsx': 'text/tsx', '.py': 'text/x-python', '.java': 'text/x-java-source', '.c': 'text/x-c', '.cpp': 'text/x-c++src', '.cc': 'text/x-c++src', '.cxx': 'text/x-c++src', '.h': 'text/x-c', '.hpp': 'text/x-c++hdr', '.cs': 'text/x-csharp', '.php': 'application/x-httpd-php', '.rb': 'text/x-ruby', '.go': 'text/x-go', '.rs': 'text/x-rust', '.swift': 'text/x-swift', '.kt': 'text/x-kotlin', '.scala': 'text/x-scala', '.r': 'text/x-r', '.m': 'text/x-matlab', '.pl': 'text/x-perl', '.sh': 'application/x-sh', '.bash': 'application/x-sh', '.zsh': 'application/x-sh', '.fish': 'application/x-sh', '.ps1': 'application/x-powershell', '.bat': 'application/x-msdos-program', '.cmd': 'application/x-msdos-program', '.vbs': 'text/vbscript', '.lua': 'text/x-lua', '.sql': 'application/sql', '.dart': 'application/dart', '.elm': 'text/x-elm', '.clj': 'text/x-clojure', '.hs': 'text/x-haskell', '.fs': 'text/x-fsharp', '.ml': 'text/x-ocaml', # Web technologies '.css': 'text/css', '.scss': 'text/x-scss', '.sass': 'text/x-sass', '.less': 'text/x-less', '.vue': 'text/x-vue', '.svelte': 'text/x-svelte', '.astro': 'text/x-astro', # Configuration and build files '.yaml': 'application/x-yaml', '.yml': 'application/x-yaml', '.toml': 'application/toml', '.env': 'text/plain', '.gitignore': 'text/plain', '.dockerfile': 'text/x-dockerfile', '.dockerignore': 'text/plain', '.makefile': 'text/x-makefile', '.cmake': 'text/x-cmake', '.gradle': 'text/x-gradle', '.maven': 'text/x-maven', '.pom': 
'application/xml', '.sln': 'text/plain', '.vcxproj': 'application/xml', '.csproj': 'application/xml', '.fsproj': 'application/xml', '.vbproj': 'application/xml', '.xcodeproj': 'text/plain', '.pbxproj': 'text/plain', # Documentation and markup '.tex': 'application/x-tex', '.bib': 'text/x-bibtex', '.adoc': 'text/asciidoc', '.asciidoc': 'text/asciidoc', '.wiki': 'text/x-wiki', '.creole': 'text/x-wiki', # Images '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png', '.gif': 'image/gif', '.webp': 'image/webp', '.bmp': 'image/bmp', '.tiff': 'image/tiff', '.ico': 'image/x-icon', # Documents '.pdf': 'application/pdf', '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', '.odt': 'application/vnd.oasis.opendocument.text', '.ods': 'application/vnd.oasis.opendocument.spreadsheet', '.odp': 'application/vnd.oasis.opendocument.presentation', # Legacy Office formats '.doc': 'application/msword', '.xls': 'application/vnd.ms-excel', '.ppt': 'application/vnd.ms-powerpoint', # Archives and binaries (will be processed as binary) '.zip': 'application/zip', '.tar': 'application/x-tar', '.gz': 'application/gzip', '.7z': 'application/x-7z-compressed', '.rar': 'application/vnd.rar', '.exe': 'application/x-msdownload', '.dll': 'application/x-msdownload', '.so': 'application/x-sharedlib', '.dylib': 'application/x-mach-binary' } return mime_types.get(extension.lower(), 'application/octet-stream') async def run_tests(self) -> None: """Run the document extraction tests on all files.""" mode = "WITH AI" if self.enable_ai else "CONTENT ONLY (No AI)" logger.info(f"Starting document extraction tests - {mode}") logger.info(f"Input directory: {self.input_dir}") logger.info(f"Output directory: {self.output_dir}") if self.enable_ai: logger.info(f"Processing prompt: {self.prompt}") else: 
logger.info("AI processing: DISABLED - Raw content extraction only") # Initialize the extractor if not self.initialize_extractor(): logger.error("Cannot proceed without DocumentExtraction!") return # Get files to process files = self.get_files_to_process() if not files: logger.warning("No files found to process!") return # Process each file successful = 0 failed = 0 logger.info(f"Starting to process {len(files)} files...") for i, file_path in enumerate(files): logger.info(f"Processing file {i+1}/{len(files)}: {file_path.name}") try: if await self.process_single_file(file_path): successful += 1 logger.info(f"✓ File {i+1} processed successfully") else: failed += 1 logger.error(f"✗ File {i+1} processing failed") except Exception as e: failed += 1 logger.error(f"✗ Exception processing file {i+1}: {e}") import traceback traceback.print_exc() # Print detailed summary mode = "WITH AI" if self.enable_ai else "CONTENT ONLY (No AI)" logger.info("\n" + "=" * 80) logger.info(f"DETAILED TEST SUMMARY - {mode}") logger.info("=" * 80) logger.info(f"Total files processed: {len(files)}") logger.info(f"Successful: {successful}") logger.info(f"Failed: {failed}") logger.info(f"Output directory: {self.output_dir}") if self.enable_ai: logger.info("AI processing: ENABLED") else: logger.info("AI processing: DISABLED") logger.info("=" * 80) # List all processed documents with results logger.info("\nPROCESSING RESULTS:") logger.info("-" * 80) for result in self.processing_results: status_icon = "✅" if result['status'] == 'OK' else "❌" logger.info(f"{status_icon} {result['filename']} - {result['status']}") if result['status'] == 'OK': if result['content_items'] == 1: logger.info(f" └─ Generated: {result['output_files'][0]} ({result['total_content_size']} bytes)") else: logger.info(f" └─ Generated {result['content_items']} files ({result['total_content_size']} total bytes):") for output_file in result['output_files']: logger.info(f" └─ {output_file}") else: error_msg = result.get('error', 
'Unknown error') logger.info(f" └─ Error: {error_msg}") logger.info("-" * 80) logger.info("=" * 80) def parse_arguments(): """Parse command line arguments.""" parser = argparse.ArgumentParser(description='Document Extraction Test Script') parser.add_argument('--no-ai', '--content-only', action='store_true', help='Run in content-only mode without AI processing') parser.add_argument('--input-dir', type=str, default='d:/temp/test-extraction', help='Input directory containing files to process (default: d:/temp/test-extraction)') parser.add_argument('--output-dir', type=str, help='Output directory for extracted content (auto-generated if not specified)') parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose logging') return parser.parse_args() async def main(): """Main function to run the tests.""" # Parse command line arguments args = parse_arguments() # Set logging level based on verbosity if args.verbose: logging.getLogger().setLevel(logging.DEBUG) else: logging.getLogger().setLevel(logging.INFO) logger.info("DocumentExtraction Test Script") logger.info("=" * 50) logger.info(f"Source: {args.input_dir}") # Determine output directory if args.output_dir: output_dir = args.output_dir else: if args.no_ai: output_dir = f"{args.input_dir}/extracted-raw" else: output_dir = f"{args.input_dir}/extracted" logger.info(f"Output: {output_dir}") logger.info("=" * 50) # Check dependencies first if not check_dependencies(): logger.error("Please install missing dependencies before running tests.") return # Check module imports if not check_module_imports(): logger.error("Cannot import required modules. Please check your setup.") return # Determine mode based on command line arguments if args.no_ai: enable_ai = False logger.info("Running in CONTENT ONLY mode (no AI processing)") else: # Interactive mode: ask user for choice print("\n" + "=" * 50) print("SELECT EXTRACTION MODE:") print("=" * 50) print("1. With AI processing (default)") print("2. 
async def main():
    """Main function to run the tests.

    Parses CLI arguments, checks dependencies and module imports, resolves
    the extraction mode (flag or interactive prompt), then runs the tester.
    """
    # Parse command line arguments
    args = parse_arguments()

    # Set logging level based on verbosity
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.getLogger().setLevel(logging.INFO)

    logger.info("DocumentExtraction Test Script")
    logger.info("=" * 50)
    logger.info(f"Source: {args.input_dir}")

    # Determine output directory (an explicit --output-dir always wins)
    if args.output_dir:
        output_dir = args.output_dir
    elif args.no_ai:
        output_dir = f"{args.input_dir}/extracted-raw"
    else:
        output_dir = f"{args.input_dir}/extracted"
    logger.info(f"Output: {output_dir}")
    logger.info("=" * 50)

    # Check dependencies first
    if not check_dependencies():
        logger.error("Please install missing dependencies before running tests.")
        return

    # Check module imports
    if not check_module_imports():
        logger.error("Cannot import required modules. Please check your setup.")
        return

    # Determine mode based on command line arguments
    if args.no_ai:
        enable_ai = False
        logger.info("Running in CONTENT ONLY mode (no AI processing)")
    else:
        # Interactive mode: ask user for choice
        print("\n" + "=" * 50)
        print("SELECT EXTRACTION MODE:")
        print("=" * 50)
        print("1. With AI processing (default)")
        print("2. Content only (no AI processing)")
        print("=" * 50)
        try:
            choice = input("Enter your choice (1 or 2, default is 1): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Default to AI mode if input fails
            choice = "1"
            logger.info("Defaulting to AI processing mode")
        if choice == "2":
            enable_ai = False
            logger.info("Selected: Content only mode (no AI processing)")
            # BUG FIX: only derive the mode-specific folder when the user did
            # not pass --output-dir; the original unconditionally clobbered it.
            if not args.output_dir:
                output_dir = f"{args.input_dir}/extracted-raw"
        else:
            enable_ai = True
            logger.info("Selected: AI processing mode")
            if not args.output_dir:
                output_dir = f"{args.input_dir}/extracted"

    # Run tests with selected mode
    tester = DocumentExtractionTester(
        input_dir=args.input_dir,
        output_dir=output_dir,
        enable_ai=enable_ai
    )
    await tester.run_tests()


if __name__ == "__main__":
    # BUG FIX: the original if/else on len(sys.argv) executed the identical
    # statement in both branches; a single call covers the automated and
    # interactive paths (main() itself inspects the parsed arguments).
    asyncio.run(main())


# Convenience function for easy content-only extraction
async def extract_documents_content_only(input_folder: str, output_folder: str = None):
    """
    Convenience function to extract documents without AI processing.

    Args:
        input_folder: Path to folder containing documents to extract
        output_folder: Path to folder where extracted content will be stored (optional)

    Returns:
        True when the test run completed, False when a precondition failed.

    Example:
        # Extract from d:/temp to d:/temp/extracted-raw
        asyncio.run(extract_documents_content_only("d:/temp"))

        # Extract from custom folders
        asyncio.run(extract_documents_content_only("c:/my_docs", "c:/my_docs/extracted"))
    """
    if output_folder is None:
        output_folder = f"{input_folder}/extracted-raw"
    logger.info(f"Running content-only extraction from {input_folder} to {output_folder}")

    # Check dependencies and imports
    if not check_dependencies():
        logger.error("Missing dependencies. Please install required packages.")
        return False
    if not check_module_imports():
        logger.error("Cannot import required modules. Please check your setup.")
        return False

    # Create tester and run
    tester = DocumentExtractionTester(
        input_dir=input_folder,
        output_dir=output_folder,
        enable_ai=False
    )
    await tester.run_tests()
    return True

# Example usage (uncomment to use):
# if __name__ == "__main__":
#     # For content-only extraction from d:/temp to d:/temp/extracted-raw
#     asyncio.run(extract_documents_content_only("d:/temp"))