#!/usr/bin/env python3
|
|
"""
|
|
Test script for DocumentExtraction class.
|
|
Processes all files in the input folder (default: d:/temp/test-extraction) and stores extracted content in an "extracted" (AI) or "extracted-raw" (content-only) subfolder.
|
|
|
|
Features:
|
|
- Option to extract content WITH AI processing (default)
|
|
- Option to extract content WITHOUT AI processing (content-only mode)
|
|
- Supports all document types: text, images, PDFs, Office documents, etc.
|
|
- Detailed logging and progress tracking
|
|
- Separate output directories for AI vs content-only modes
|
|
|
|
Usage:
|
|
- Interactive mode: python test_documentExtraction.py
|
|
- Content-only mode: python test_documentExtraction.py --no-ai  (alias: --content-only)
|
|
- Specify custom input/output: python test_documentExtraction.py --input-dir /path/to/input --output-dir /path/to/output --no-ai
|
|
"""
|
|
|
|
import os
|
|
import asyncio
|
|
import logging
|
|
import sys
|
|
import argparse
|
|
from pathlib import Path
|
|
from typing import List, Optional
|
|
from datetime import datetime, UTC
|
|
|
|
# Configure logging for the whole test run (timestamp, level, message).
logging.basicConfig(
    level=logging.DEBUG,  # Changed from INFO to DEBUG
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used throughout this script.
# NOTE(review): main() later resets the root level based on --verbose,
# so this DEBUG default only applies before argument parsing.
logger = logging.getLogger(__name__)
|
|
|
|
# Filter out specific unwanted log messages
|
|
class LogFilter(logging.Filter):
    """Suppress noisy log records that would clutter the test output."""

    # Each tuple lists substrings that must ALL be present in the message
    # for the record to be dropped.
    _SUPPRESS = (
        ("Workflow", "not found for stats update"),
        ("HTTP Request:", "POST https://api.openai.com"),
        ("HTTP/1.1 200 OK",),
    )

    def filter(self, record):
        """Return False (drop the record) when the message matches a known noise pattern."""
        message = record.getMessage()
        return not any(
            all(fragment in message for fragment in fragments)
            for fragments in self._SUPPRESS
        )
|
|
|
|
# Apply the filter to the root logger AND to its handlers.
# BUG FIX: a filter attached only to the root logger is not consulted for
# records that arrive via propagation from child loggers (e.g. the HTTP
# client loggers this filter targets) — per the logging docs, only the
# handlers' filters run for propagated records. Attaching to the handlers
# makes the suppression actually take effect.
root_logger = logging.getLogger()
root_logger.addFilter(LogFilter())  # still filters records logged directly on root
for _handler in root_logger.handlers:
    _handler.addFilter(LogFilter())
|
|
|
|
def check_dependencies():
    """Check that all required third-party packages can be imported.

    Logs one status line per dependency and, when anything is missing,
    prints installation instructions.

    Returns:
        True when every dependency imported successfully, False otherwise.
    """
    import importlib

    # (importable module, display name for log lines, pip package name).
    # The original code had one copy-pasted try/except per package; this
    # table drives a single loop and produces the exact same log output.
    required = [
        ("bs4", "beautifulsoup4", "beautifulsoup4"),
        ("PyPDF2", "PyPDF2", "PyPDF2"),
        ("fitz", "PyMuPDF (fitz)", "PyMuPDF"),
        ("docx", "python-docx", "python-docx"),
        ("openpyxl", "openpyxl", "openpyxl"),
        ("pptx", "python-pptx", "python-pptx"),
        ("PIL.Image", "Pillow (PIL)", "Pillow"),
    ]

    missing_deps = []
    for module_name, display_name, pip_name in required:
        try:
            importlib.import_module(module_name)
            logger.info(f"✓ {display_name} is available")
        except ImportError:
            missing_deps.append(pip_name)
            logger.error(f"✗ {display_name} is missing")

    if missing_deps:
        logger.error("\n" + "="*60)
        logger.error("MISSING DEPENDENCIES DETECTED!")
        logger.error("="*60)
        logger.error("The following packages are required but not installed:")
        for dep in missing_deps:
            logger.error(f"  - {dep}")
        logger.error("\nTo install all dependencies, run:")
        logger.error("pip install -r requirements.txt")
        logger.error("\nOr install individual packages:")
        # The original if/elif chain emitted the identical line for every
        # package, so it collapses to a single loop.
        for dep in missing_deps:
            logger.error(f"  pip install {dep}")
        logger.error("="*60)
        return False

    logger.info("✓ All required dependencies are available!")
    return True
|
|
|
|
def check_module_imports():
    """Verify that the project modules needed by this script are importable.

    Returns:
        True when all project modules import cleanly, False otherwise.
    """
    try:
        # Make the gateway package root importable regardless of CWD.
        gateway_root = os.path.join(os.path.dirname(__file__), '..', '..', '..')
        sys.path.append(gateway_root)

        from modules.chat.documents.documentExtraction import DocumentExtraction
        from modules.chat.serviceCenter import ServiceCenter
        from modules.interfaces.interfaceAppModel import User, UserConnection
        from modules.interfaces.interfaceChatModel import ChatWorkflow, TaskItem

        logger.info("✓ All required modules imported successfully")
        return True
    except ImportError as e:
        logger.error(f"✗ Failed to import required modules: {e}")
        logger.error("Make sure you're running this script from the gateway directory")
        return False
    except Exception as e:
        logger.error(f"✗ Unexpected error importing modules: {e}")
        return False
|
|
|
|
def create_mock_service_center():
    """Build a ServiceCenter wired to a fully-populated test User and ChatWorkflow.

    Returns:
        A ServiceCenter instance, or None when construction fails.
    """
    try:
        from modules.chat.serviceCenter import ServiceCenter
        from modules.interfaces.interfaceAppModel import User, UserPrivilege, AuthAuthority
        from modules.interfaces.interfaceChatModel import ChatWorkflow, TaskItem, TaskStatus
        from modules.interfaces.interfaceChatModel import ChatLog, ChatMessage, ChatStat

        # A user record with every required field populated.
        user = User(
            id="test_user_001",
            username="testuser",
            email="test@example.com",
            fullName="Test User",
            language="en",
            enabled=True,
            privilege=UserPrivilege.USER,
            authenticationAuthority=AuthAuthority.LOCAL,
            mandateId="test_mandate_001"
        )

        # A minimal but complete workflow; timestamps are timezone-aware UTC.
        now = datetime.now(UTC).isoformat()
        workflow = ChatWorkflow(
            id="test_workflow_001",
            mandateId="test_mandate_001",
            status="active",
            name="Test Document Extraction Workflow",
            currentRound=1,
            lastActivity=now,
            startedAt=now,
            logs=[],
            messages=[],
            stats=None,
            tasks=[]
        )

        service_center = ServiceCenter(user, workflow)
        logger.info("✓ ServiceCenter created successfully with proper objects")
        return service_center
    except Exception as e:
        logger.error(f"✗ Failed to create ServiceCenter: {e}")
        return None
|
|
|
|
class DocumentExtractionTester:
|
|
"""Test class for DocumentExtraction functionality."""
|
|
|
|
def __init__(self, input_dir: str = "d:/temp/test-extraction", output_dir: Optional[str] = None, enable_ai: bool = True):
    """
    Initialize the tester.

    Args:
        input_dir: Directory containing files to process
        output_dir: Directory to store extracted content (auto-generated if None)
        enable_ai: Whether to enable AI processing (default: True)
    """
    self.input_dir = Path(input_dir)

    # Auto-generate output directory if not specified: AI output and raw
    # output go to separate subfolders so the two modes don't overwrite
    # each other's results.
    if output_dir is None:
        if enable_ai:
            self.output_dir = Path(input_dir) / "extracted"
        else:
            self.output_dir = Path(input_dir) / "extracted-raw"
    else:
        self.output_dir = Path(output_dir)

    self.extractor = None        # DocumentExtraction; created in initialize_extractor()
    self.service_center = None   # ServiceCenter; created in initialize_extractor()
    self.enable_ai = enable_ai

    if enable_ai:
        self.prompt = "Make a summary of each sentence for each page or chapter of the document"
    else:
        self.prompt = None  # No prompt needed for content-only extraction

    # Track per-file processing results for the final summary.
    self.processing_results = []

    # Ensure output directory exists
    logger.info(f"Creating output directory: {self.output_dir}")
    self.output_dir.mkdir(parents=True, exist_ok=True)

    # Verify directory was created
    if self.output_dir.exists():
        logger.info(f"✓ Output directory created/verified: {self.output_dir}")
        logger.info(f"Output directory absolute path: {self.output_dir.absolute()}")
    else:
        logger.error(f"✗ Failed to create output directory: {self.output_dir}")

    # Log configuration
    logger.info(f"Configuration: AI processing = {'ENABLED' if self.enable_ai else 'DISABLED'}")
    logger.info(f"Input directory: {self.input_dir}")
    logger.info(f"Output directory: {self.output_dir}")

    # Smoke-test that files can actually be written to the output dir
    # (extracted from the constructor for readability).
    self._verify_write_capability()

    # Supported file extensions for content extraction.
    # BUG FIX: the original set literal listed '.r' twice; harmless in a
    # set, but the duplicate is removed here.
    self.supported_extensions = {
        # Text and data files
        '.txt', '.csv', '.json', '.xml', '.html', '.htm', '.svg',
        '.md', '.markdown', '.rst', '.log', '.ini', '.cfg', '.conf',

        # Programming languages
        '.js', '.ts', '.jsx', '.tsx', '.py', '.java', '.c', '.cpp', '.cc', '.cxx',
        '.h', '.hpp', '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala',
        '.r', '.m', '.pl', '.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd',
        '.vbs', '.lua', '.sql', '.dart', '.elm', '.clj', '.hs', '.fs', '.ml',

        # Web technologies
        '.css', '.scss', '.sass', '.less', '.vue', '.svelte', '.astro',

        # Configuration and build files
        '.yaml', '.yml', '.toml', '.env', '.gitignore', '.dockerfile', '.dockerignore',
        '.makefile', '.cmake', '.gradle', '.maven', '.pom', '.sln', '.vcxproj',
        '.csproj', '.fsproj', '.vbproj', '.xcodeproj', '.pbxproj',

        # Documentation and markup
        '.tex', '.bib', '.adoc', '.asciidoc', '.wiki', '.creole',

        # Images
        '.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.ico',

        # Documents
        '.pdf', '.docx', '.xlsx', '.pptx', '.odt', '.ods', '.odp',

        # Legacy Office formats
        '.doc', '.xls', '.ppt',

        # Archives and binaries
        '.zip', '.tar', '.gz', '.7z', '.rar', '.exe', '.dll', '.so', '.dylib'
    }

def _verify_write_capability(self) -> None:
    """Write, read back, and delete a small probe file in the output directory.

    Purely diagnostic: failures are logged but never raised, matching the
    original best-effort behavior.
    """
    test_file = self.output_dir / "test_write_capability.txt"
    try:
        logger.info(f"Testing file write capability to: {test_file}")
        logger.info(f"Absolute path: {test_file.absolute()}")

        with open(test_file, 'w', encoding='utf-8') as f:
            f.write("Test file to verify write capability")

        if test_file.exists():
            actual_size = test_file.stat().st_size
            logger.info(f"✓ Basic file writing test passed: {test_file} (size: {actual_size} bytes)")

            # Test reading the file back
            with open(test_file, 'r', encoding='utf-8') as f:
                content = f.read()
            logger.info(f"✓ File read test passed: content length = {len(content)}")

            # Clean up test file
            test_file.unlink()
            logger.info("✓ Test file cleaned up")
        else:
            logger.error(f"✗ Basic file writing test failed: {test_file}")
    except Exception as e:
        logger.error(f"✗ Basic file writing test failed with error: {e}")
        import traceback
        traceback.print_exc()
|
|
|
|
def initialize_extractor(self):
    """Create the ServiceCenter and a DocumentExtraction bound to it.

    Returns:
        True on success, False when either object could not be created.
    """
    try:
        # The extractor requires a ServiceCenter, so build that first.
        self.service_center = create_mock_service_center()
        if not self.service_center:
            logger.error("Failed to create ServiceCenter!")
            return False

        from modules.chat.documents.documentExtraction import DocumentExtraction

        self.extractor = DocumentExtraction(self.service_center)
        logger.info("✓ DocumentExtraction initialized successfully with ServiceCenter")
        return True
    except Exception as e:
        logger.error(f"✗ Failed to initialize DocumentExtraction: {e}")
        return False
|
|
|
|
def get_files_to_process(self) -> List[Path]:
    """Return the supported files found in the input directory.

    If the input directory is missing it is created and seeded with a small
    UTF-8 text file so there is always something to process.

    Returns:
        List of Paths whose (lower-cased) extension is in self.supported_extensions.
    """
    if not self.input_dir.exists():
        logger.error(f"Input directory {self.input_dir} does not exist!")
        logger.info("Creating input directory and adding a test file...")
        self.input_dir.mkdir(parents=True, exist_ok=True)

        # Create a test file if none exist.
        # BUG FIX: encoding='utf-8' is required here — the content contains
        # non-ASCII characters ('äöüß') and writing with the platform default
        # encoding can raise UnicodeEncodeError on non-UTF-8 locales.
        test_file = self.input_dir / "test.txt"
        with open(test_file, 'w', encoding='utf-8') as f:
            f.write("This is a test file for document extraction.\nIt contains multiple lines.\nAnd some special characters: äöüß")
        logger.info(f"Created test file: {test_file}")

    files = []
    all_files = list(self.input_dir.iterdir())
    logger.info(f"All files in directory: {[f.name for f in all_files]}")

    for file_path in all_files:
        if file_path.is_file():
            logger.debug(f"Checking file: {file_path.name} (extension: {file_path.suffix})")
            if file_path.suffix.lower() in self.supported_extensions:
                files.append(file_path)
                logger.debug(f"Added file: {file_path.name}")
            else:
                logger.debug(f"Skipped file: {file_path.name} (unsupported extension)")

    logger.info(f"Found {len(files)} supported files to process")
    if files:
        logger.info(f"Files to process: {[f.name for f in files]}")
    return files
|
|
|
|
async def process_single_file(self, file_path: Path) -> bool:
    """
    Process a single file and extract its content.

    Reads the file as bytes, runs it through the extractor (with or
    without AI depending on self.enable_ai), writes each extracted
    content item to its own .txt file in self.output_dir, and appends a
    result dict to self.processing_results.

    Args:
        file_path: Path to the file to process

    Returns:
        True if successful, False otherwise
    """
    if not self.extractor:
        logger.error("DocumentExtraction not initialized!")
        return False

    try:
        logger.info(f"Processing file: {file_path.name}")

        # Read file data
        with open(file_path, 'rb') as f:
            file_data = f.read()

        logger.debug(f"File size: {len(file_data)} bytes")

        # Determine MIME type based on extension
        mime_type = self._get_mime_type(file_path.suffix)
        logger.debug(f"MIME type: {mime_type}")

        # Process the file with or without AI based on configuration
        extracted_content = await self.extractor.processFileData(
            fileData=file_data,
            fileName=file_path.name,
            mimeType=mime_type,
            base64Encoded=False,
            prompt=self.prompt,
            enableAI=self.enable_ai
        )

        logger.debug(f"Extracted {len(extracted_content.contents)} content items")

        # Debug: Show content details
        for i, content_item in enumerate(extracted_content.contents):
            logger.debug(f"Content item {i+1}: label='{content_item.label}', has_data={content_item.data is not None}, data_length={len(content_item.data) if content_item.data else 0}")

        # Special logging for JavaScript files: compare extracted size
        # against the original to detect truncation.
        if mime_type == "application/javascript":
            logger.debug(f"JavaScript file detected: {file_path.name}")
            logger.debug(f"Original file size: {len(file_data)} bytes")
            for i, content_item in enumerate(extracted_content.contents):
                if content_item.data:
                    content_size = len(content_item.data.encode('utf-8'))
                    logger.debug(f"JavaScript content item {i+1}: {content_size} bytes")
                    # Check if content was truncated
                    if content_size < len(file_data) * 0.9:  # If less than 90% of original
                        logger.warning(f"JavaScript content may be truncated: {content_size} bytes vs {len(file_data)} bytes original")

        # Track processing result (accumulated below, appended at the end).
        result = {
            'fileName': file_path.name,
            'status': 'OK',
            'content_items': 0,
            'output_files': [],
            'total_content_size': 0
        }

        # Save each content item as a separate file
        if extracted_content.contents:
            for i, content_item in enumerate(extracted_content.contents):
                if content_item.data:
                    # Size measured in UTF-8 bytes, matching how the file is written.
                    content_size = len(content_item.data.encode('utf-8'))
                    result['total_content_size'] += content_size
                    logger.debug(f"Content item {i+1}: {content_item.label}, size: {content_size} bytes")

                    # Generate fileName with new naming convention:
                    # "<stem> - <label> <n>.txt"
                    if len(extracted_content.contents) == 1:
                        # Single content item
                        output_fileName = f"{file_path.stem} - {content_item.label} 1.txt"
                    else:
                        # Multiple content items - add sequence number
                        output_fileName = f"{file_path.stem} - {content_item.label} {i+1}.txt"

                    output_file = self.output_dir / output_fileName

                    # Write only the raw extracted content
                    logger.debug(f"Attempting to write to: {output_file}")
                    try:
                        with open(output_file, 'w', encoding='utf-8') as f:
                            f.write(content_item.data)

                        # Verify file was created
                        if output_file.exists():
                            actual_size = output_file.stat().st_size
                            logger.info(f"✓ File created successfully: {output_fileName} (expected: {content_size} bytes, actual: {actual_size} bytes)")
                        else:
                            logger.error(f"✗ File was not created: {output_file}")

                        result['output_files'].append(output_fileName)
                        result['content_items'] += 1
                    except Exception as write_error:
                        # Per-item write failure: log and continue with the
                        # remaining content items.
                        logger.error(f"✗ Error writing file {output_fileName}: {write_error}")
                        import traceback
                        traceback.print_exc()
                else:
                    logger.warning(f"Content item {i+1} has no data, skipping")
        else:
            logger.warning(f"No content extracted from {file_path.name}")
            result['status'] = 'FAIL'
            result['error'] = 'No content extracted'

        # Add result to tracking list
        self.processing_results.append(result)

        # NOTE(review): this returns True (and logs "Successfully processed")
        # even when status was set to 'FAIL' above because no content was
        # extracted — confirm whether callers rely on that behavior.
        logger.info(f"Successfully processed {file_path.name} - Total content: {result['total_content_size']} bytes")
        return True

    except Exception as e:
        error_msg = str(e)
        logger.error(f"Error processing {file_path.name}: {error_msg}")

        # Track failed result
        result = {
            'fileName': file_path.name,
            'status': 'FAIL',
            'content_items': 0,
            'output_files': [],
            'error': error_msg,
            'total_content_size': 0
        }
        self.processing_results.append(result)

        return False
|
|
|
|
def _get_mime_type(self, extension: str) -> str:
|
|
"""Get MIME type based on file extension."""
|
|
mime_types = {
|
|
# Text and data files
|
|
'.txt': 'text/plain',
|
|
'.csv': 'text/csv',
|
|
'.json': 'application/json',
|
|
'.xml': 'application/xml',
|
|
'.html': 'text/html',
|
|
'.htm': 'text/html',
|
|
'.svg': 'image/svg+xml',
|
|
'.md': 'text/markdown',
|
|
'.markdown': 'text/markdown',
|
|
'.rst': 'text/x-rst',
|
|
'.log': 'text/plain',
|
|
'.ini': 'text/plain',
|
|
'.cfg': 'text/plain',
|
|
'.conf': 'text/plain',
|
|
|
|
# Programming languages
|
|
'.js': 'application/javascript',
|
|
'.ts': 'application/typescript',
|
|
'.jsx': 'text/jsx',
|
|
'.tsx': 'text/tsx',
|
|
'.py': 'text/x-python',
|
|
'.java': 'text/x-java-source',
|
|
'.c': 'text/x-c',
|
|
'.cpp': 'text/x-c++src',
|
|
'.cc': 'text/x-c++src',
|
|
'.cxx': 'text/x-c++src',
|
|
'.h': 'text/x-c',
|
|
'.hpp': 'text/x-c++hdr',
|
|
'.cs': 'text/x-csharp',
|
|
'.php': 'application/x-httpd-php',
|
|
'.rb': 'text/x-ruby',
|
|
'.go': 'text/x-go',
|
|
'.rs': 'text/x-rust',
|
|
'.swift': 'text/x-swift',
|
|
'.kt': 'text/x-kotlin',
|
|
'.scala': 'text/x-scala',
|
|
'.r': 'text/x-r',
|
|
'.m': 'text/x-matlab',
|
|
'.pl': 'text/x-perl',
|
|
'.sh': 'application/x-sh',
|
|
'.bash': 'application/x-sh',
|
|
'.zsh': 'application/x-sh',
|
|
'.fish': 'application/x-sh',
|
|
'.ps1': 'application/x-powershell',
|
|
'.bat': 'application/x-msdos-program',
|
|
'.cmd': 'application/x-msdos-program',
|
|
'.vbs': 'text/vbscript',
|
|
'.lua': 'text/x-lua',
|
|
'.sql': 'application/sql',
|
|
'.dart': 'application/dart',
|
|
'.elm': 'text/x-elm',
|
|
'.clj': 'text/x-clojure',
|
|
'.hs': 'text/x-haskell',
|
|
'.fs': 'text/x-fsharp',
|
|
'.ml': 'text/x-ocaml',
|
|
|
|
# Web technologies
|
|
'.css': 'text/css',
|
|
'.scss': 'text/x-scss',
|
|
'.sass': 'text/x-sass',
|
|
'.less': 'text/x-less',
|
|
'.vue': 'text/x-vue',
|
|
'.svelte': 'text/x-svelte',
|
|
'.astro': 'text/x-astro',
|
|
|
|
# Configuration and build files
|
|
'.yaml': 'application/x-yaml',
|
|
'.yml': 'application/x-yaml',
|
|
'.toml': 'application/toml',
|
|
'.env': 'text/plain',
|
|
'.gitignore': 'text/plain',
|
|
'.dockerfile': 'text/x-dockerfile',
|
|
'.dockerignore': 'text/plain',
|
|
'.makefile': 'text/x-makefile',
|
|
'.cmake': 'text/x-cmake',
|
|
'.gradle': 'text/x-gradle',
|
|
'.maven': 'text/x-maven',
|
|
'.pom': 'application/xml',
|
|
'.sln': 'text/plain',
|
|
'.vcxproj': 'application/xml',
|
|
'.csproj': 'application/xml',
|
|
'.fsproj': 'application/xml',
|
|
'.vbproj': 'application/xml',
|
|
'.xcodeproj': 'text/plain',
|
|
'.pbxproj': 'text/plain',
|
|
|
|
# Documentation and markup
|
|
'.tex': 'application/x-tex',
|
|
'.bib': 'text/x-bibtex',
|
|
'.adoc': 'text/asciidoc',
|
|
'.asciidoc': 'text/asciidoc',
|
|
'.wiki': 'text/x-wiki',
|
|
'.creole': 'text/x-wiki',
|
|
|
|
# Images
|
|
'.jpg': 'image/jpeg',
|
|
'.jpeg': 'image/jpeg',
|
|
'.png': 'image/png',
|
|
'.gif': 'image/gif',
|
|
'.webp': 'image/webp',
|
|
'.bmp': 'image/bmp',
|
|
'.tiff': 'image/tiff',
|
|
'.ico': 'image/x-icon',
|
|
|
|
# Documents
|
|
'.pdf': 'application/pdf',
|
|
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
|
'.odt': 'application/vnd.oasis.opendocument.text',
|
|
'.ods': 'application/vnd.oasis.opendocument.spreadsheet',
|
|
'.odp': 'application/vnd.oasis.opendocument.presentation',
|
|
|
|
# Legacy Office formats
|
|
'.doc': 'application/msword',
|
|
'.xls': 'application/vnd.ms-excel',
|
|
'.ppt': 'application/vnd.ms-powerpoint',
|
|
|
|
# Archives and binaries (will be processed as binary)
|
|
'.zip': 'application/zip',
|
|
'.tar': 'application/x-tar',
|
|
'.gz': 'application/gzip',
|
|
'.7z': 'application/x-7z-compressed',
|
|
'.rar': 'application/vnd.rar',
|
|
'.exe': 'application/x-msdownload',
|
|
'.dll': 'application/x-msdownload',
|
|
'.so': 'application/x-sharedlib',
|
|
'.dylib': 'application/x-mach-binary'
|
|
}
|
|
return mime_types.get(extension.lower(), 'application/octet-stream')
|
|
|
|
async def run_tests(self) -> None:
    """Run the document extraction tests on all files.

    Orchestration: initialize the extractor, collect input files, process
    each file, then log a detailed per-file summary built from
    self.processing_results. Failures are logged, never raised.
    """
    mode = "WITH AI" if self.enable_ai else "CONTENT ONLY (No AI)"
    logger.info(f"Starting document extraction tests - {mode}")
    logger.info(f"Input directory: {self.input_dir}")
    logger.info(f"Output directory: {self.output_dir}")
    if self.enable_ai:
        logger.info(f"Processing prompt: {self.prompt}")
    else:
        logger.info("AI processing: DISABLED - Raw content extraction only")

    # Initialize the extractor
    if not self.initialize_extractor():
        logger.error("Cannot proceed without DocumentExtraction!")
        return

    # Get files to process
    files = self.get_files_to_process()

    if not files:
        logger.warning("No files found to process!")
        return

    # Process each file, counting successes and failures.
    successful = 0
    failed = 0

    logger.info(f"Starting to process {len(files)} files...")
    for i, file_path in enumerate(files):
        logger.info(f"Processing file {i+1}/{len(files)}: {file_path.name}")
        try:
            if await self.process_single_file(file_path):
                successful += 1
                logger.info(f"✓ File {i+1} processed successfully")
            else:
                failed += 1
                logger.error(f"✗ File {i+1} processing failed")
        except Exception as e:
            # process_single_file catches its own errors; this guard keeps
            # an unexpected exception from aborting the remaining files.
            failed += 1
            logger.error(f"✗ Exception processing file {i+1}: {e}")
            import traceback
            traceback.print_exc()

    # Print detailed summary
    mode = "WITH AI" if self.enable_ai else "CONTENT ONLY (No AI)"
    logger.info("\n" + "=" * 80)
    logger.info(f"DETAILED TEST SUMMARY - {mode}")
    logger.info("=" * 80)
    logger.info(f"Total files processed: {len(files)}")
    logger.info(f"Successful: {successful}")
    logger.info(f"Failed: {failed}")
    logger.info(f"Output directory: {self.output_dir}")
    if self.enable_ai:
        logger.info("AI processing: ENABLED")
    else:
        logger.info("AI processing: DISABLED")
    logger.info("=" * 80)

    # List all processed documents with results
    logger.info("\nPROCESSING RESULTS:")
    logger.info("-" * 80)

    for result in self.processing_results:
        status_icon = "✅" if result['status'] == 'OK' else "❌"
        logger.info(f"{status_icon} {result['fileName']} - {result['status']}")

        if result['status'] == 'OK':
            if result['content_items'] == 1:
                logger.info(f"   └─ Generated: {result['output_files'][0]} ({result['total_content_size']} bytes)")
            else:
                logger.info(f"   └─ Generated {result['content_items']} files ({result['total_content_size']} total bytes):")
                for output_file in result['output_files']:
                    logger.info(f"      └─ {output_file}")
        else:
            # 'error' is only present on FAIL results, hence the .get().
            error_msg = result.get('error', 'Unknown error')
            logger.info(f"   └─ Error: {error_msg}")

    logger.info("-" * 80)
    logger.info("=" * 80)
|
|
|
|
def parse_arguments():
    """Define and parse the command-line interface.

    Returns:
        argparse.Namespace with attributes no_ai, input_dir, output_dir, verbose.
    """
    parser = argparse.ArgumentParser(description='Document Extraction Test Script')
    parser.add_argument(
        '--no-ai', '--content-only',
        action='store_true',
        help='Run in content-only mode without AI processing',
    )
    parser.add_argument(
        '--input-dir',
        type=str,
        default='d:/temp/test-extraction',
        help='Input directory containing files to process (default: d:/temp/test-extraction)',
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        help='Output directory for extracted content (auto-generated if not specified)',
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Enable verbose logging',
    )
    return parser.parse_args()
|
|
|
|
async def main():
    """Entry point: parse arguments, pick the extraction mode, run the tests.

    Mode selection:
      * --no-ai / --content-only on the command line forces content-only mode.
      * Otherwise the user is prompted interactively; AI mode is the default
        and is also used when stdin is closed or the prompt is interrupted.
    """
    # Parse command line arguments
    args = parse_arguments()

    # Set logging level based on verbosity
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.getLogger().setLevel(logging.INFO)

    logger.info("DocumentExtraction Test Script")
    logger.info("=" * 50)
    logger.info(f"Source: {args.input_dir}")

    # Determine output directory: an explicit --output-dir always wins,
    # otherwise derive it from the input dir and the (current) mode.
    if args.output_dir:
        output_dir = args.output_dir
    elif args.no_ai:
        output_dir = f"{args.input_dir}/extracted-raw"
    else:
        output_dir = f"{args.input_dir}/extracted"

    logger.info(f"Output: {output_dir}")
    logger.info("=" * 50)

    # Check dependencies first
    if not check_dependencies():
        logger.error("Please install missing dependencies before running tests.")
        return

    # Check module imports
    if not check_module_imports():
        logger.error("Cannot import required modules. Please check your setup.")
        return

    # Determine mode based on command line arguments
    if args.no_ai:
        enable_ai = False
        logger.info("Running in CONTENT ONLY mode (no AI processing)")
    else:
        # Interactive mode: ask user for choice
        print("\n" + "=" * 50)
        print("SELECT EXTRACTION MODE:")
        print("=" * 50)
        print("1. With AI processing (default)")
        print("2. Content only (no AI processing)")
        print("=" * 50)

        try:
            choice = input("Enter your choice (1 or 2, default is 1): ").strip()
            if choice == "2":
                enable_ai = False
                logger.info("Selected: Content only mode (no AI processing)")
            else:
                enable_ai = True
                logger.info("Selected: AI processing mode")
        except (EOFError, KeyboardInterrupt):
            # Default to AI mode if input fails
            enable_ai = True
            logger.info("Defaulting to AI processing mode")

        # BUG FIX: the interactive branch previously overwrote output_dir
        # unconditionally, silently discarding an explicit --output-dir.
        # Re-derive the default only when the user did not specify one.
        if not args.output_dir:
            if enable_ai:
                output_dir = f"{args.input_dir}/extracted"
            else:
                output_dir = f"{args.input_dir}/extracted-raw"

    # Run tests with selected mode
    tester = DocumentExtractionTester(
        input_dir=args.input_dir,
        output_dir=output_dir,
        enable_ai=enable_ai
    )
    await tester.run_tests()
|
|
|
|
if __name__ == "__main__":
    # Both branches of the original if len(sys.argv) > 1 / else invoked
    # asyncio.run(main()) identically; main() already distinguishes CLI
    # from interactive use via parse_arguments(), so one call suffices.
    asyncio.run(main())
|
|
|
|
# Convenience function for easy content-only extraction
|
|
async def extract_documents_content_only(input_folder: str, output_folder: str = None):
    """
    Convenience helper: extract documents without any AI processing.

    Args:
        input_folder: Folder containing the documents to extract.
        output_folder: Destination folder; defaults to '<input_folder>/extracted-raw'.

    Returns:
        True when the run completed, False when prerequisites were missing.

    Example:
        # Extract from d:/temp to d:/temp/extracted-raw
        asyncio.run(extract_documents_content_only("d:/temp"))

        # Extract from custom folders
        asyncio.run(extract_documents_content_only("c:/my_docs", "c:/my_docs/extracted"))
    """
    if output_folder is None:
        output_folder = f"{input_folder}/extracted-raw"

    logger.info(f"Running content-only extraction from {input_folder} to {output_folder}")

    # Abort early when third-party packages or project modules are unavailable.
    if not check_dependencies():
        logger.error("Missing dependencies. Please install required packages.")
        return False
    if not check_module_imports():
        logger.error("Cannot import required modules. Please check your setup.")
        return False

    runner = DocumentExtractionTester(
        input_dir=input_folder,
        output_dir=output_folder,
        enable_ai=False,
    )
    await runner.run_tests()
    return True
|
|
|
|
# Example usage (uncomment to use):
|
|
# if __name__ == "__main__":
|
|
# # For content-only extraction from d:/temp to d:/temp/extracted-raw
|
|
# asyncio.run(extract_documents_content_only("d:/temp"))
|