# wiki/offerings/generate_complete_pdf.py

#!/usr/bin/env python3
"""
Complete PDF Generation Script
1. Opens HTML in headless browser
2. Waits for JavaScript to complete (TOC + footer)
3. Extracts complete HTML with TOC and footer
4. Generates PDF from final HTML

Usage:
    python generate_complete_pdf.py           # Auto-cleanup temp files
    python generate_complete_pdf.py --keep-temp  # Keep temp files for debugging
"""

import os
import sys
import time
import subprocess
import platform
import json
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def find_chrome_executable():
    """Locate a Chrome/Chromium executable for the current platform.

    Well-known installation paths are probed first. If none of them exists,
    the executable search path (``PATH``) is consulted for common binary
    names.

    Returns:
        The path of the first executable found, as a string, or ``None``
        when no candidate could be located.
    """
    import shutil  # local import: only this lookup needs it

    system = platform.system()
    if system == "Windows":
        possible_paths = [
            r"C:\Program Files\Google\Chrome\Application\chrome.exe",
            r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
        ]
        # Per-user installation; skip it when USERNAME is unset instead of
        # formatting the literal "None" into the path.
        username = os.getenv('USERNAME')
        if username:
            possible_paths.append(
                r"C:\Users\{}\AppData\Local\Google\Chrome\Application\chrome.exe".format(username)
            )
    elif system == "Darwin":  # macOS
        possible_paths = [
            "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
        ]
    else:  # Linux
        possible_paths = [
            "/usr/bin/google-chrome",
            "/usr/bin/chromium-browser",
            "/usr/bin/chromium",
        ]

    for path in possible_paths:
        if os.path.exists(path):
            return path

    # Fall back to PATH. shutil.which is portable, whereas spawning the
    # external `which` command fails on Windows.
    for binary_name in (
        "google-chrome",
        "google-chrome-stable",
        "chromium-browser",
        "chromium",
        "chrome",
    ):
        located = shutil.which(binary_name)
        if located:
            return located

    return None

def setup_chrome_options():
    """Build the Selenium ``Options`` object used to launch Chrome headlessly.

    Returns:
        An ``Options`` instance carrying the command-line switches listed
        below, added in this order.
    """
    switches = (
        "--headless=new",  # Use new headless mode
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--disable-web-security",
        "--allow-running-insecure-content",
        "--disable-features=VizDisplayCompositor",
        "--window-size=1920,1080",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    )

    options = Options()
    for switch in switches:
        options.add_argument(switch)
    return options

def wait_for_toc_completion(driver, timeout=30):
    """Block until the page's table of contents has been rendered.

    Two conditions are awaited in turn, each for up to ``timeout`` seconds:
    the ``toc-title`` element reads "Inhaltsverzeichnis" (instead of the
    loading placeholder), and at least one ``toc-entry`` element exists.
    A fixed two-second pause follows to let remaining entries appear.

    Args:
        driver: Selenium WebDriver showing the document.
        timeout: Maximum seconds to wait for each condition.

    Returns:
        True when both conditions were met, False on timeout or any other
        error.
    """
    print("⏳ Waiting for TOC generation to complete...")

    def title_is_final(d):
        # The placeholder title is "Lade Inhaltsverzeichnis..."
        return d.find_element(By.CLASS_NAME, "toc-title").text == "Inhaltsverzeichnis"

    def entries_present(d):
        return len(d.find_elements(By.CLASS_NAME, "toc-entry")) > 0

    try:
        for condition in (title_is_final, entries_present):
            WebDriverWait(driver, timeout).until(condition)

        # Grace period so that late entries are included as well
        time.sleep(2)

        print("✅ TOC generation completed!")
        return True

    except TimeoutException:
        print("❌ Timeout waiting for TOC generation")
        return False
    except Exception as exc:
        print(f"⚠️ Error during TOC wait: {exc}")
        return False

def check_driver_health(driver):
    """Report whether the driver still answers a simple query.

    The check reads ``driver.current_url``; any exception raised by that
    access is printed and treated as "unhealthy".

    Returns:
        True if the attribute could be read, False otherwise.
    """
    try:
        getattr(driver, "current_url")
    except Exception as exc:
        print(f"⚠️ Driver health check failed: {exc}")
        return False
    return True

def wait_for_page_counter_completion(driver, timeout=30):
    """Wait until the page source no longer contains the page-count placeholder.

    The page is considered updated once the text ``var(--total-pages`` has
    disappeared from ``driver.page_source``; a two-second pause follows so
    the footer can settle.

    Args:
        driver: Selenium WebDriver showing the document.
        timeout: Maximum seconds to wait for the placeholder to vanish.

    Returns:
        True when the placeholder disappeared in time, False on timeout.
    """
    print("⏳ Waiting for page counter to complete...")

    placeholder = "var(--total-pages"
    waiter = WebDriverWait(driver, timeout)

    try:
        # Footer still shows the raw CSS variable until the counter has run
        waiter.until(lambda d: placeholder not in d.page_source)

        # Give the footer a moment to stabilize
        time.sleep(2)

        print("✅ Page counter completed!")
        return True

    except TimeoutException:
        print("❌ Timeout waiting for page counter")
        return False

def _section_keywords(section):
    """Return the words of *section* usable for loose matching ([] if unsuitable)."""
    # Loose matching is only attempted for longer titles with 3+ words
    if len(section) <= 10:
        return []
    words = section.split()
    if len(words) < 3:
        return []
    key_words = [word for word in words if len(word) > 3]  # Skip short words
    return key_words if len(key_words) >= 2 else []


def _find_section_page(section, page_texts):
    """Return ``(page, is_partial)`` for the first page mentioning *section*, or None.

    ``page`` is 1-based. Exact matches on any page take priority over the
    looser keyword match.
    """
    for index, text in enumerate(page_texts):
        if section in text:
            return index + 1, False

    key_words = _section_keywords(section)
    if key_words:
        for index, text in enumerate(page_texts):
            if sum(1 for word in key_words if word in text) >= 2:
                return index + 1, True

    return None


def analyze_pdf_page_numbers(pdf_path):
    """Determine on which PDF page each known section title first appears.

    The text of every page is extracted once with PyMuPDF. Each title from
    the built-in section list is then searched: first as an exact substring
    across all pages, and only if that fails via a loose match (at least two
    of its longer words on the same page). Titles that are not found are
    assigned page 1. The result is also written to
    ``_tmp/page_numbers_updated.json`` (relative to the working directory).

    Args:
        pdf_path: Path of the PDF file to analyze.

    Returns:
        A dict ``{"total_pages": int, "page_numbers": {title: page}}`` with
        1-based page numbers; the result of ``fallback_pdf_analysis`` when
        PyMuPDF is not installed; or ``None`` if the analysis failed.
    """
    print(f"📊 Analyzing PDF: {pdf_path}")

    try:
        import fitz  # PyMuPDF
    except ImportError:
        print("❌ PyMuPDF not available, using fallback method...")
        return fallback_pdf_analysis(pdf_path)

    try:
        doc = fitz.open(pdf_path)
        try:
            # Extract each page's text once instead of once per section
            page_texts = [doc[page_num].get_text() for page_num in range(len(doc))]
        finally:
            # Release the document even if text extraction fails
            doc.close()

        total_pages = len(page_texts)
        print(f"📄 Total pages in PDF: {total_pages}")

        # Define the sections to look for (both with and without numbering)
        sections = [
            # Main sections (level 1)
            "Offerte Übersicht",
            "Gesamtübersicht & Zahlungsmodalitäten",
            "Zusammenarbeitsmodell",
            "Investitionsschätzung",
            "Kontaktdaten",
            "Rahmenbedingungen & Konditionen",
            "Unterschriften",

            # Sub-sections (level 2)
            "Struktur der Offerte",
            "Detaillierte Lösungsbeschreibung",
            "Projektphasen",

            # Sub-sub-sections (level 3)
            "Kernkomponenten des KI-Systems",
            "Customer Journey - Vollständiger Prozess",
            "Nutzen für Vertriebsprozess durch KI-Automatisierung",
            "User Interfaces & Touchpoints",
            "Technische Integration & Compliance",
            "Organisatorische Verantwortlichkeiten",
            "Kundenseitige Mitwirkung (FTE)",
            "Phase 1: MVP (2-3 Monate)",
            "Phase 2: Erweiterung (Monate 4-6)",
            "Phase 3: Vollintegration (Monate 7-9)",
            "Phase 4: Skalierung (Monate 10-12)"
        ]

        page_numbers = {}

        for section in sections:
            hit = _find_section_page(section, page_texts)
            if hit is None:
                print(f"⚠️ Section '{section}' not found in any page")
                # Assign to page 1 as fallback
                page_numbers[section] = 1
                continue

            page_no, is_partial = hit
            if is_partial:
                print(f"📍 '{section}' (partial match) found on page {page_no}")
            else:
                print(f"📍 '{section}' found on page {page_no}")
            page_numbers[section] = page_no

        # Save page numbers to JSON
        output_data = {
            "total_pages": total_pages,
            "page_numbers": page_numbers
        }

        json_path = Path("_tmp/page_numbers_updated.json")
        json_path.parent.mkdir(parents=True, exist_ok=True)
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print(f"✅ Page numbers saved to: {json_path}")
        return output_data

    except Exception as e:
        print(f"❌ PDF analysis error: {e}")
        return None

def fallback_pdf_analysis(pdf_path):
    """Return placeholder analysis data when PyMuPDF is unavailable.

    No text is actually extracted from the PDF: the function only verifies
    that a Chrome executable can be located and then returns fixed default
    values. The previous implementation additionally built an unused output
    path and command line from ``pdf_path``, which made the call fail for
    non-string paths; that dead code has been removed.

    Args:
        pdf_path: Path of the PDF (currently not inspected).

    Returns:
        ``{"total_pages": 22, "page_numbers": {}}`` as default values, or
        ``None`` when Chrome cannot be found or an error occurs.
    """
    print("🔄 Using Chrome headless for PDF analysis...")

    try:
        if not find_chrome_executable():
            return None

        # This is a simplified fallback - real text extraction is not
        # implemented here, so the values below are placeholders only.
        print("⚠️ Fallback analysis limited - consider installing PyMuPDF")
        return {"total_pages": 22, "page_numbers": {}}  # Default values

    except Exception as e:
        print(f"❌ Fallback analysis failed: {e}")
        return None

def wait_for_footer_update(driver, timeout=10):
    """Wait (best effort) for the footer's page-count placeholder to vanish.

    The footer counts as updated once ``var(--total-pages`` no longer occurs
    in ``driver.page_source``. A timeout is reported but not treated as a
    failure.

    Args:
        driver: Selenium WebDriver showing the document.
        timeout: Maximum seconds to wait.

    Returns:
        Always True, whether or not the update was observed in time.
    """
    print("⏳ Waiting for footer update...")

    def placeholder_gone(d):
        # e.g. "var(--total-pages, 16)" is still present before the update
        return "var(--total-pages" not in d.page_source

    try:
        WebDriverWait(driver, timeout).until(placeholder_gone)
    except TimeoutException:
        print("⚠️ Footer update timeout - continuing anyway")
    else:
        print("✅ Footer updated!")

    return True

def extract_complete_html(driver, output_filename=None, tmp_dir=None):
    """Dump the browser's current page source to an HTML file.

    Args:
        driver: Object exposing the rendered markup as ``page_source``.
        output_filename: Name of the file to write; defaults to
            "energie360-offerte-complete.html".
        tmp_dir: Optional ``Path`` of the directory to write into. It is
            created when missing. Without it the file name is used as the
            path directly.

    Returns:
        The page source that was written, as a string.
    """
    filename = "energie360-offerte-complete.html" if output_filename is None else output_filename

    if tmp_dir:
        # Make sure the target directory is there before writing
        tmp_dir.mkdir(parents=True, exist_ok=True)
        target = tmp_dir / filename
    else:
        target = Path(filename)

    print(f"📄 Extracting complete HTML to: {target}")

    # Snapshot of the DOM after JavaScript processing, kept for inspection
    markup = driver.page_source
    target.write_text(markup, encoding="utf-8")

    print(f"✅ Complete HTML saved to: {target}")
    return markup

def clean_and_number_headings_professionally(driver):
    """Renumber the document's content headings with outline numbers.

    A JavaScript snippet is run in the page. For every ``h1``/``h2``/``h3``
    element with class ``content-heading`` it strips an existing leading
    number, derives the level from the ``heading-level-N`` class and writes
    back a "1", "1.1" or "1.1.1" style prefix. The cleaned and the numbered
    text are also stored in ``data-original-text`` / ``data-numbered-text``.

    Args:
        driver: Selenium WebDriver showing the document.

    Returns:
        True when the script ran without raising, False otherwise.
    """
    print("🔧 Cleaning and adding professional outline numbering to headings...")

    try:
        numbering_script = """
        (function() {
            console.log('🔧 Cleaning and adding professional outline numbering...');

            // Function to clean and add professional outline numbering
            function cleanAndNumberHeadings() {
                let level1Count = 0;
                let level2Count = 0;
                let level3Count = 0;

                const headings = document.querySelectorAll('h1.content-heading, h2.content-heading, h3.content-heading');

                headings.forEach(heading => {
                    const level = parseInt(heading.className.match(/heading-level-(\\d+)/)?.[1] || '1');

                    // Get the original text and clean it completely
                    let originalText = heading.textContent.trim();

                    // Remove ALL existing numbering patterns (1, 1.1, 1.1.1, etc.)
                    originalText = originalText.replace(/^\\d+(\\.\\d+)*\\s*/, '');

                    // Store the clean original text
                    heading.setAttribute('data-original-text', originalText);

                    // Add appropriate professional numbering
                    let numberedText = '';
                    if (level === 1) {
                        level1Count++;
                        level2Count = 0;
                        level3Count = 0;
                        numberedText = level1Count + ' ' + originalText;
                    } else if (level === 2) {
                        level2Count++;
                        level3Count = 0;
                        numberedText = level1Count + '.' + level2Count + ' ' + originalText;
                    } else if (level === 3) {
                        level3Count++;
                        numberedText = level1Count + '.' + level2Count + '.' + level3Count + ' ' + originalText;
                    }

                    // Update heading text with clean numbering
                    heading.textContent = numberedText;
                    heading.setAttribute('data-numbered-text', numberedText);

                    console.log(`✅ Heading ${level}: "${originalText}" -> "${numberedText}"`);
                });

                console.log('✅ Professional outline numbering completed');
                return true;
            }

            return cleanAndNumberHeadings();
        })();
        """

        # Run the snippet in the page, then pause briefly before continuing
        driver.execute_script(numbering_script)
        time.sleep(1)

        print("✅ Professional outline numbering added successfully")
        return True

    except Exception as exc:
        print(f"❌ Error adding professional outline numbering: {exc}")
        return False

def create_professional_toc_with_real_page_numbers(driver, page_numbers):
    """Rebuild the in-page table of contents using analyzed page numbers.

    A JavaScript snippet is executed in the page. It empties the
    ``.toc-auto`` container, adds a title and then one styled entry per
    ``content-heading`` element, looking up each heading's page in
    ``page_numbers`` (exact title match, then containment, then a
    word-based match; page 1 when nothing matches).

    The script is now executed with a top-level ``return`` so that its
    boolean result reaches Python: previously the result was always ``None``
    and a missing TOC container or missing headings was reported as success.

    Args:
        driver: Selenium WebDriver showing the document.
        page_numbers: JSON-serializable dict whose ``"page_numbers"`` entry
            maps section titles to page numbers (as produced by
            ``analyze_pdf_page_numbers``).

    Returns:
        True when the TOC was built; False when the script reported failure
        or raised.
    """
    print("🔧 Creating professional TOC with real page numbers...")

    try:
        # Create JavaScript to build a professional TOC. The single %s
        # placeholder receives the page-number data as a JSON literal.
        js_code = """
        return (function() {
            console.log('🔧 Creating professional TOC with real page numbers...');

            // Get the page numbers data
            const pageNumbers = %s;

            // Function to find the best matching section for a heading text
            function findBestSectionMatch(headingText) {
                const cleanHeading = headingText.replace(/^\\d+(\\.\\d+)*\\s*/, '').trim();

                console.log(`🔍 Looking for match for: "${cleanHeading}"`);
                console.log(`📋 Available sections:`, Object.keys(pageNumbers.page_numbers));

                // Try exact match first
                for (let [section, page] of Object.entries(pageNumbers.page_numbers)) {
                    if (cleanHeading === section) {
                        console.log(`✅ Exact match: "${section}" -> page ${page}`);
                        return { section, page };
                    }
                }

                // Try partial match (more flexible)
                for (let [section, page] of Object.entries(pageNumbers.page_numbers)) {
                    const cleanSection = section.replace(/^\\d+(\\.\\d+)*\\s*/, '').trim();

                    // Check if either contains the other
                    if (cleanHeading.includes(cleanSection) || cleanSection.includes(cleanHeading)) {
                        console.log(`✅ Partial match: "${cleanSection}" -> page ${page}`);
                        return { section, page };
                    }
                }

                // Try word-based matching for longer headings
                if (cleanHeading.length > 10) {
                    const headingWords = cleanHeading.split(' ').filter(word => word.length > 3);

                    for (let [section, page] of Object.entries(pageNumbers.page_numbers)) {
                        const cleanSection = section.replace(/^\\d+(\\.\\d+)*\\s*/, '').trim();
                        const sectionWords = cleanSection.split(' ').filter(word => word.length > 3);

                        // Count matching words
                        const matches = headingWords.filter(word =>
                            sectionWords.some(sword =>
                                word.toLowerCase() === sword.toLowerCase() ||
                                word.toLowerCase().includes(sword.toLowerCase()) ||
                                sword.toLowerCase().includes(word.toLowerCase())
                            )
                        );

                        if (matches.length >= 2) {
                            console.log(`✅ Word-based match: "${cleanSection}" -> page ${page} (${matches.length} words)`);
                            return { section, page };
                        }
                    }
                }

                console.warn(`⚠️ No match found for: "${cleanHeading}"`);
                return null;
            }

            // Function to create professional TOC
            function createProfessionalTOC() {
                // Find the TOC container
                const tocContainer = document.querySelector('.toc-auto');
                if (!tocContainer) {
                    console.error('❌ TOC container not found');
                    return false;
                }

                // Clear existing TOC completely
                tocContainer.innerHTML = '';

                // Create professional TOC title
                const title = document.createElement('h1');
                title.className = 'toc-title';
                title.textContent = 'Inhaltsverzeichnis';
                title.style.textAlign = 'center';
                title.style.marginBottom = '2rem';
                title.style.fontSize = '1.5rem';
                title.style.fontWeight = 'bold';
                title.style.color = '#2c3e50';
                tocContainer.appendChild(title);

                // Get all numbered headings
                const headings = document.querySelectorAll('h1.content-heading, h2.content-heading, h3.content-heading');

                if (headings.length === 0) {
                    console.error('❌ No content headings found');
                    return false;
                }

                console.log(`📋 Found ${headings.length} headings to process`);

                headings.forEach((heading, index) => {
                    const headingText = heading.textContent.trim();
                    const level = parseInt(heading.className.match(/heading-level-(\\d+)/)?.[1] || '1');

                    // Find page number for this heading
                    const match = findBestSectionMatch(headingText);
                    const pageNumber = match ? match.page : 1; // Default to page 1 if no match

                    // Create TOC entry with professional styling
                    const entry = document.createElement('div');
                    entry.className = 'toc-entry';
                    entry.classList.add(`toc-level-${level}`);

                    // Set professional styling
                    entry.style.display = 'flex';
                    entry.style.justifyContent = 'space-between';
                    entry.style.alignItems = 'center';
                    entry.style.marginBottom = '0.5rem';
                    entry.style.cursor = 'pointer';
                    entry.style.transition = 'background-color 0.2s';

                    // Add hover effect
                    entry.addEventListener('mouseenter', () => {
                        entry.style.backgroundColor = '#f8f9fa';
                    });
                    entry.addEventListener('mouseleave', () => {
                        entry.style.backgroundColor = 'transparent';
                    });

                    // Create text container
                    const textContainer = document.createElement('div');
                    textContainer.style.flex = '1';
                    textContainer.style.display = 'flex';
                    textContainer.style.alignItems = 'center';

                    // Add level-based indentation
                    const indent = (level - 1) * 1.5;
                    textContainer.style.marginLeft = indent + 'rem';

                    // Create text span
                    const textSpan = document.createElement('span');
                    textSpan.className = 'toc-text';
                    textSpan.textContent = headingText;
                    textSpan.style.fontSize = level === 1 ? '1.1rem' : level === 2 ? '1rem' : '0.9rem';
                    textSpan.style.fontWeight = level === 1 ? 'bold' : 'normal';
                    textSpan.style.color = level === 1 ? '#2c3e50' : level === 2 ? '#34495e' : '#7f8c8d';

                    textContainer.appendChild(textSpan);

                    // Create right-aligned page number
                    const pageSpan = document.createElement('span');
                    pageSpan.className = 'toc-page';
                    pageSpan.textContent = pageNumber;
                    pageSpan.style.fontWeight = 'bold';
                    pageSpan.style.color = '#e74c3c';
                    pageSpan.style.minWidth = '2rem';
                    pageSpan.style.textAlign = 'right';

                    // Create dots (leader)
                    const dotsSpan = document.createElement('span');
                    dotsSpan.className = 'toc-dots';
                    dotsSpan.textContent = '.'.repeat(20);
                    dotsSpan.style.flex = '1';
                    dotsSpan.style.margin = '0 0.5rem';
                    dotsSpan.style.color = '#bdc3c7';
                    dotsSpan.style.overflow = 'hidden';
                    dotsSpan.style.whiteSpace = 'nowrap';

                    // Assemble entry
                    entry.appendChild(textContainer);
                    entry.appendChild(dotsSpan);
                    entry.appendChild(pageSpan);

                    // Add click functionality
                    entry.addEventListener('click', () => {
                        heading.scrollIntoView({
                            behavior: 'smooth',
                            block: 'start'
                        });
                    });

                    // Add to TOC
                    tocContainer.appendChild(entry);

                    console.log(`✅ TOC entry ${index + 1}: "${headingText}" -> page ${pageNumber} (level ${level})`);
                });

                console.log('✅ Professional TOC created successfully');
                return true;
            }

            // Execute the function
            return createProfessionalTOC();
        })();
        """ % json.dumps(page_numbers)

        # Execute the JavaScript; thanks to the top-level `return` the
        # script's true/false outcome is handed back to Python.
        result = driver.execute_script(js_code)

        if not result:
            print("❌ Professional TOC could not be built (TOC container or content headings missing)")
            return False

        # Wait a bit before the caller continues with the updated page
        time.sleep(2)

        print("✅ Professional TOC created successfully")
        return True

    except Exception as e:
        print(f"❌ Error creating professional TOC: {e}")
        return False

def generate_pdf_from_complete_html(html_file_path, output_filename=None):
    """Print an HTML file to PDF using headless Chrome.

    Chrome is started as a subprocess with ``--print-to-pdf`` and given up
    to 60 seconds. Success requires both a zero exit status and the output
    file being present afterwards.

    The HTML location is passed as a proper ``file:`` URI built with
    ``Path.as_uri()``; plain string concatenation produced malformed URLs
    for Windows paths and for paths containing spaces or special characters.

    Args:
        html_file_path: Path of the HTML file to print.
        output_filename: Path of the PDF to create; defaults to
            "energie360-offerte-complete.pdf" in the working directory.

    Returns:
        True when the PDF was written, False otherwise.
    """
    if output_filename is None:
        output_filename = "energie360-offerte-complete.pdf"

    print(f"🔄 Generating PDF: {output_filename}")

    chrome_path = find_chrome_executable()
    if not chrome_path:
        print("❌ Chrome not found!")
        return False

    try:
        # Work with absolute paths so Chrome's working directory is irrelevant
        output_abs_path = os.path.abspath(output_filename)
        html_abs_path = os.path.abspath(html_file_path)
        html_uri = Path(html_abs_path).as_uri()

        # Chrome does not create missing output directories itself
        Path(output_abs_path).parent.mkdir(parents=True, exist_ok=True)

        print(f"📁 Output will be saved to: {output_abs_path}")
        print(f"📄 HTML source: {html_abs_path}")

        # Use Chrome to generate PDF from the complete HTML
        cmd = [
            chrome_path,
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            f"--print-to-pdf={output_abs_path}",
            "--print-to-pdf-no-header",
            html_uri
        ]

        print(f"🔧 Running command: {' '.join(cmd)}")

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)

        if result.returncode != 0:
            print(f"❌ PDF generation failed with return code: {result.returncode}")
            print(f"📋 Chrome stdout: {result.stdout}")
            print(f"📋 Chrome stderr: {result.stderr}")
            return False

        # A zero exit status alone does not prove that the file was written
        if not os.path.exists(output_abs_path):
            print(f"⚠️ PDF generation reported success but file not found: {output_filename}")
            print(f"🔍 Current working directory: {os.getcwd()}")
            print(f"🔍 Expected file location: {output_abs_path}")
            print(f"📋 Chrome stdout: {result.stdout}")
            print(f"📋 Chrome stderr: {result.stderr}")
            return False

        print(f"✅ PDF generated successfully: {output_filename}")
        print(f"📁 File location: {output_abs_path}")
        return True

    except subprocess.TimeoutExpired:
        print("❌ PDF generation timeout")
        return False
    except Exception as e:
        print(f"❌ PDF generation error: {e}")
        return False

def cleanup_old_files(tmp_dir, final_pdf_path):
    """Remove leftovers of a previous run before a new one starts.

    Args:
        tmp_dir: ``Path`` of the temporary directory; deleted recursively
            if it exists.
        final_pdf_path: ``Path`` of the final PDF; deleted if it exists.
    """
    import shutil

    print("🧹 Cleaning up old files...")

    # A stale final PDF must not be mistaken for the result of this run
    stale_pdf_present = final_pdf_path.exists()
    if stale_pdf_present:
        print(f"🗑️ Removing existing final PDF: {final_pdf_path}")
        final_pdf_path.unlink()

    # Same for intermediate files of the previous run
    stale_tmp_present = tmp_dir.exists()
    if stale_tmp_present:
        print(f"🗑️ Cleaning up old temporary directory: {tmp_dir}")
        shutil.rmtree(tmp_dir)

    print("✅ Cleanup completed")

def cleanup_temp_files(tmp_dir):
    """Delete the temporary directory once the run has finished.

    Failures are reported as a warning together with a hint to delete the
    folder manually; they are never raised to the caller.

    Args:
        tmp_dir: ``Path`` of the directory to remove recursively.
    """
    try:
        if not tmp_dir.exists():
            print("ℹ️ No temporary files to clean up")
            return

        import shutil
        shutil.rmtree(tmp_dir)
        print(f"✅ Temporary files cleaned up: {tmp_dir}")
    except Exception as exc:
        print(f"⚠️ Warning: Could not clean up temporary files: {exc}")
        print(f"💡 You can manually delete the folder: {tmp_dir}")

def main():
    """Run the complete multi-pass PDF generation.

    Steps:
        1. Open ``energie360/energie360-offerte.html`` in headless Chrome
           and wait for the page's own TOC generation.
        2. Save the rendered HTML, print it to an intermediate PDF and
           analyze that PDF for the real page numbers of the sections.
        3. Renumber the headings, rebuild the TOC with those page numbers
           and wait for the footer page counter.
        4. Save the final HTML and print it to
           ``energie360/energie360-offerte-final.pdf``.

    All paths are relative to the current working directory. Intermediate
    files go to ``_tmp`` and are removed afterwards unless ``--keep-temp``
    is present in ``sys.argv``.

    Returns:
        True when the final PDF was generated, False otherwise.
    """
    # Parse command line arguments
    keep_temp = "--keep-temp" in sys.argv
    if keep_temp:
        print("📝 Note: Temporary files will be kept (--keep-temp flag detected)")

    print("🚀 Starting Complete PDF Generation Process...")

    # Check if HTML file exists
    html_file = Path("energie360/energie360-offerte.html")
    if not html_file.exists():
        print(f"❌ HTML file not found: {html_file}")
        return False

    # Temporary directory for all intermediate files
    tmp_dir = Path("_tmp")

    # Define output paths
    source_dir = html_file.parent
    final_pdf_path = source_dir / "energie360-offerte-final.pdf"

    # Clean up old files
    cleanup_old_files(tmp_dir, final_pdf_path)

    # Create fresh temporary directory
    tmp_dir.mkdir(exist_ok=True)
    print(f"📁 Temporary files will be saved to: {tmp_dir.absolute()}")
    print("📁 Temporary files include: HTML extracts, intermediate PDFs, and analysis data")

    print(f"📁 Final PDF will be saved to: {final_pdf_path}")

    # Setup Chrome options
    chrome_options = setup_chrome_options()

    # Find Chrome executable
    chrome_path = find_chrome_executable()
    if not chrome_path:
        print("❌ Chrome not found! Please install Google Chrome.")
        return False

    print(f"✅ Chrome found at: {chrome_path}")

    # Start Chrome driver with retry logic
    driver = None
    max_retries = 3

    for attempt in range(max_retries):
        try:
            print(f"🔄 Attempt {attempt + 1}/{max_retries} to start Chrome driver...")
            driver = webdriver.Chrome(options=chrome_options)
            print("✅ Chrome driver started successfully")
            break
        except Exception as e:
            print(f"❌ Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print("⏳ Waiting 2 seconds before retry...")
                time.sleep(2)
            else:
                print("❌ All attempts to start Chrome driver failed")
                return False

    try:
        # ===== FIRST PASS: Generate TOC with estimated page numbers =====
        print("\n🔄 FIRST PASS: Generating TOC with estimated page numbers...")

        # Open the HTML file. as_uri() yields a well-formed file: URL on
        # every platform and percent-encodes spaces/special characters.
        html_url = Path(os.path.abspath(html_file)).as_uri()
        print(f"🌐 Opening HTML: {html_url}")
        driver.get(html_url)

        # Wait for page to load
        time.sleep(3)

        # Wait for TOC generation with estimated numbers
        try:
            if not check_driver_health(driver):
                print("❌ Driver unhealthy after page load")
                return False

            if not wait_for_toc_completion(driver):
                print("❌ First pass TOC generation failed")
                return False
        except Exception as e:
            print(f"❌ Error during first pass TOC generation: {e}")
            return False

        print("✅ First pass TOC completed with estimated page numbers")

        # ===== SECOND PASS: Extract HTML with estimated TOC and generate PDF =====
        print("\n🔄 SECOND PASS: Extracting HTML with estimated TOC and generating PDF...")

        # Check driver health before extraction
        if not check_driver_health(driver):
            print("❌ Driver unhealthy before HTML extraction")
            return False

        # Extract HTML with estimated TOC for analysis
        print("📄 Extracting HTML with estimated TOC...")
        extract_complete_html(driver, "energie360-offerte-estimated.html", tmp_dir)

        # Generate PDF from HTML with estimated TOC
        estimated_html_path = tmp_dir / "energie360-offerte-estimated.html"
        estimated_pdf_path = tmp_dir / "energie360-offerte-estimated.pdf"
        if not generate_pdf_from_complete_html(str(estimated_html_path), str(estimated_pdf_path)):
            print("❌ PDF generation for analysis failed")
            return False

        # Analyze PDF to get real page numbers
        print("📊 Analyzing PDF for real page numbers...")
        # Use absolute path for PDF analysis
        pdf_path = estimated_pdf_path.absolute()
        print(f"📄 Looking for PDF at: {pdf_path}")
        page_numbers = analyze_pdf_page_numbers(str(pdf_path))

        if not page_numbers:
            print("❌ PDF analysis failed")
            return False

        # ===== THIRD PASS: Update TOC with real page numbers =====
        print("\n🔄 THIRD PASS: Updating TOC with real page numbers...")

        # First, clean existing numbering and add professional outline numbering
        print("🔧 Cleaning and adding professional outline numbering to headings...")
        if not clean_and_number_headings_professionally(driver):
            print("❌ Failed to add professional outline numbering")
            return False

        # Now create a professional TOC with real page numbers
        print("🔧 Creating professional TOC with real page numbers...")
        if not create_professional_toc_with_real_page_numbers(driver, page_numbers):
            print("❌ Failed to create professional TOC")
            return False

        # Wait for page counter to update footer
        if not wait_for_page_counter_completion(driver):
            print("❌ Page counter update failed")
            return False

        print("✅ Third pass TOC completed with real page numbers")

        # ===== FINAL: Extract complete HTML and generate final PDF =====
        print("\n🔄 FINAL: Generating final PDF with accurate TOC and footer...")

        # Extract complete HTML with accurate TOC and footer
        extract_complete_html(driver, "energie360-offerte-final.html", tmp_dir)

        # Generate final PDF from complete HTML
        final_html_path = tmp_dir / "energie360-offerte-final.html"
        if not generate_pdf_from_complete_html(str(final_html_path), str(final_pdf_path)):
            print("❌ Final PDF generation failed")
            return False

        print("🎉 Final PDF generation successful!")
        print(f"📁 Final PDF saved to: {final_pdf_path}")

        # Handle temporary files based on command line flag
        if keep_temp:
            print(f"📁 Temporary files kept in: {tmp_dir.absolute()}")
            print("💡 Omit the '--keep-temp' flag to automatically clean up temp files")
        else:
            print(f"📁 Temporary files are available in: {tmp_dir.absolute()}")
            print("🧹 Cleaning up temporary files...")
            cleanup_temp_files(tmp_dir)

        return True

    except Exception as e:
        print(f"❌ Error during process: {e}")
        return False
    finally:
        # A failing shutdown must not replace the result determined above
        try:
            driver.quit()
            print("✅ Chrome driver closed")
        except Exception as e:
            print(f"⚠️ Could not close Chrome driver cleanly: {e}")

if __name__ == "__main__":
    # Translate main()'s boolean outcome into a process exit status:
    # 0 for success, 1 for failure.
    exit_status = 0 if main() else 1
    sys.exit(exit_status)