#!/usr/bin/env python3
"""
Complete PDF Generation Script

1. Opens HTML in headless browser
2. Waits for JavaScript to complete (TOC + footer)
3. Extracts complete HTML with TOC and footer
4. Generates PDF from final HTML

Usage:
    python generate_complete_pdf.py              # Auto-cleanup temp files
    python generate_complete_pdf.py --keep-temp  # Keep temp files for debugging
"""

import json
import os
import platform
import shutil
import subprocess
import sys
import time
from pathlib import Path

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Executable names probed on PATH when no well-known install location exists.
CHROME_BINARY_NAMES = (
    "google-chrome",
    "google-chrome-stable",
    "chromium-browser",
    "chromium",
    "chrome",
)

# Where analyze_pdf_page_numbers() persists its result (relative to the CWD).
PAGE_NUMBERS_JSON = Path("_tmp/page_numbers_updated.json")

# Placeholder page count reported by fallback_pdf_analysis(); it is NOT
# measured from the PDF.
FALLBACK_TOTAL_PAGES = 22

# Heading titles that analyze_pdf_page_numbers() looks up in the rendered PDF.
SECTION_TITLES = (
    # Main sections (level 1)
    "Offerte Übersicht",
    "Gesamtübersicht & Zahlungsmodalitäten",
    "Zusammenarbeitsmodell",
    "Investitionsschätzung",
    "Kontaktdaten",
    "Rahmenbedingungen & Konditionen",
    "Unterschriften",
    # Sub-sections (level 2)
    "Struktur der Offerte",
    "Detaillierte Lösungsbeschreibung",
    "Projektphasen",
    # Sub-sub-sections (level 3)
    "Kernkomponenten des KI-Systems",
    "Customer Journey - Vollständiger Prozess",
    "Nutzen für Vertriebsprozess durch KI-Automatisierung",
    "User Interfaces & Touchpoints",
    "Technische Integration & Compliance",
    "Organisatorische Verantwortlichkeiten",
    "Kundenseitige Mitwirkung (FTE)",
    "Phase 1: MVP (2-3 Monate)",
    "Phase 2: Erweiterung (Monate 4-6)",
    "Phase 3: Vollintegration (Monate 7-9)",
    "Phase 4: Skalierung (Monate 10-12)",
)


def find_chrome_executable():
    """Find the Chrome executable for the current platform.

    Well-known install locations are checked first; if none exists, the
    names in CHROME_BINARY_NAMES are looked up on PATH.

    Returns:
        The path to the executable as a string, or None if nothing was found.
    """
    system = platform.system()
    if system == "Windows":
        possible_paths = [
            r"C:\Program Files\Google\Chrome\Application\chrome.exe",
            r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
        ]
        username = os.getenv("USERNAME")
        if username:  # avoid probing a bogus "C:\Users\None\..." path
            possible_paths.append(
                r"C:\Users\{}\AppData\Local\Google\Chrome\Application\chrome.exe".format(username)
            )
    elif system == "Darwin":  # macOS
        possible_paths = [
            "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
        ]
    else:  # Linux
        possible_paths = [
            "/usr/bin/google-chrome",
            "/usr/bin/chromium-browser",
            "/usr/bin/chromium",
        ]

    for path in possible_paths:
        if os.path.exists(path):
            return path

    # Portable PATH lookup (works on Windows too, unlike spawning `which`).
    for name in CHROME_BINARY_NAMES:
        located = shutil.which(name)
        if located:
            return located

    return None


def setup_chrome_options():
    """Build the Selenium Chrome options used for headless operation.

    Returns:
        A configured selenium ``Options`` instance.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Use new headless mode
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-web-security")
    chrome_options.add_argument("--allow-running-insecure-content")
    chrome_options.add_argument("--disable-features=VizDisplayCompositor")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    )
    return chrome_options


def wait_for_toc_completion(driver, timeout=30):
    """Wait until the page's own JavaScript has rendered the TOC.

    Args:
        driver: Selenium WebDriver with the document loaded.
        timeout: Seconds to wait for each of the two conditions.

    Returns:
        True when the TOC title and at least one entry are present,
        False on timeout or any other error.
    """
    print("⏳ Waiting for TOC generation to complete...")

    try:
        # Wait for TOC title to appear (not "Lade Inhaltsverzeichnis...")
        WebDriverWait(driver, timeout).until(
            lambda d: d.find_element(By.CLASS_NAME, "toc-title").text == "Inhaltsverzeichnis"
        )

        # Wait for TOC entries to appear
        WebDriverWait(driver, timeout).until(
            lambda d: len(d.find_elements(By.CLASS_NAME, "toc-entry")) > 0
        )

        # Wait a bit more for all entries to load
        time.sleep(2)

        print("✅ TOC generation completed!")
        return True

    except TimeoutException:
        print("❌ Timeout waiting for TOC generation")
        return False
    except Exception as e:
        print(f"⚠️ Error during TOC wait: {e}")
        return False


def check_driver_health(driver):
    """Check whether the Chrome driver still responds.

    Returns:
        True if reading the current URL succeeds, False otherwise.
    """
    try:
        # Reading any property forces a round trip to the browser.
        driver.current_url
        return True
    except Exception as e:
        print(f"⚠️ Driver health check failed: {e}")
        return False


def wait_for_page_counter_completion(driver, timeout=30):
    """Wait for the page counter to replace the footer's CSS placeholder.

    Args:
        driver: Selenium WebDriver with the document loaded.
        timeout: Seconds to wait.

    Returns:
        True once "var(--total-pages" no longer occurs in the page source,
        False on timeout.
    """
    print("⏳ Waiting for page counter to complete...")

    try:
        # Wait for footer to show actual page count (not CSS variable)
        WebDriverWait(driver, timeout).until(
            lambda d: "var(--total-pages" not in d.page_source
        )

        # Wait a bit more for footer to stabilize
        time.sleep(2)

        print("✅ Page counter completed!")
        return True

    except TimeoutException:
        print("❌ Timeout waiting for page counter")
        return False


def _partial_match_keywords(section):
    """Return the key words used for partial matching, or [] if not eligible.

    Only titles longer than 10 characters with at least 3 words qualify, and
    only words longer than 3 characters count; at least 2 such words are needed.
    """
    if len(section) <= 10:
        return []
    words = section.split()
    if len(words) < 3:
        return []
    key_words = [word for word in words if len(word) > 3]
    return key_words if len(key_words) >= 2 else []


def _find_section_page(section, page_texts):
    """Locate *section* in *page_texts* (one string per page, in page order).

    Exact substring matches are searched across ALL pages first, so a loose
    keyword hit on an early page cannot shadow the real heading further back.

    Returns:
        ``(page_number, is_exact)`` with a 1-based page number, or None.
    """
    for number, text in enumerate(page_texts, start=1):
        if section in text:
            return number, True

    key_words = _partial_match_keywords(section)
    if key_words:
        for number, text in enumerate(page_texts, start=1):
            if sum(1 for word in key_words if word in text) >= 2:
                return number, False

    # NOTE(review): if the rendered TOC page itself lists the heading titles,
    # it will be the first page to match -- confirm against a real PDF.
    return None


def analyze_pdf_page_numbers(pdf_path):
    """Analyze a PDF to find the page on which each known section appears.

    The result is also written to PAGE_NUMBERS_JSON.

    Args:
        pdf_path: Path of the PDF to inspect.

    Returns:
        ``{"total_pages": int, "page_numbers": {title: page}}``. Sections that
        are not found are assigned page 1. Falls back to
        fallback_pdf_analysis() when PyMuPDF is missing; returns None on any
        other error.
    """
    print(f"📊 Analyzing PDF: {pdf_path}")

    try:
        import fitz  # PyMuPDF
    except ImportError:
        print("❌ PyMuPDF not available, using fallback method...")
        return fallback_pdf_analysis(pdf_path)

    try:
        doc = fitz.open(pdf_path)
        try:
            # Extract every page's text once instead of once per section.
            page_texts = [doc[index].get_text() for index in range(len(doc))]
        finally:
            doc.close()

        total_pages = len(page_texts)
        print(f"📄 Total pages in PDF: {total_pages}")

        page_numbers = {}
        for section in SECTION_TITLES:
            hit = _find_section_page(section, page_texts)
            if hit is None:
                print(f"⚠️ Section '{section}' not found in any page")
                # Assign to page 1 as fallback
                page_numbers[section] = 1
                continue

            page_number, is_exact = hit
            page_numbers[section] = page_number
            if is_exact:
                print(f"📍 '{section}' found on page {page_number}")
            else:
                print(f"📍 '{section}' (partial match) found on page {page_number}")

        # Save page numbers to JSON
        output_data = {
            "total_pages": total_pages,
            "page_numbers": page_numbers,
        }

        PAGE_NUMBERS_JSON.parent.mkdir(parents=True, exist_ok=True)
        with open(PAGE_NUMBERS_JSON, "w", encoding="utf-8") as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print(f"✅ Page numbers saved to: {PAGE_NUMBERS_JSON}")
        return output_data

    except Exception as e:
        print(f"❌ PDF analysis error: {e}")
        return None


def fallback_pdf_analysis(pdf_path):
    """Fallback used when PyMuPDF is not installed.

    No text is extracted: the result carries a placeholder page count and an
    empty section map, so every TOC entry ends up on page 1.

    Args:
        pdf_path: Path of the PDF (currently not inspected).

    Returns:
        ``{"total_pages": FALLBACK_TOTAL_PAGES, "page_numbers": {}}``, or None
        if Chrome cannot be found or an error occurs.
    """
    print("🔄 Using Chrome headless for PDF analysis...")

    try:
        if not find_chrome_executable():
            return None

        # This is a simplified fallback - in practice, you'd want more
        # sophisticated text extraction
        print("⚠️ Fallback analysis limited - consider installing PyMuPDF")
        return {"total_pages": FALLBACK_TOTAL_PAGES, "page_numbers": {}}  # Default values

    except Exception as e:
        print(f"❌ Fallback analysis failed: {e}")
        return None


def wait_for_footer_update(driver, timeout=10):
    """Wait for the footer to drop its CSS placeholder; never fails.

    Args:
        driver: Selenium WebDriver with the document loaded.
        timeout: Seconds to wait.

    Returns:
        Always True; a timeout only produces a warning.
    """
    print("⏳ Waiting for footer update...")

    try:
        # Check if footer shows correct page count (not "var(--total-pages, 16)")
        WebDriverWait(driver, timeout).until(
            lambda d: "var(--total-pages" not in d.page_source
        )
        print("✅ Footer updated!")
        return True
    except TimeoutException:
        print("⚠️ Footer update timeout - continuing anyway")
        return True


def extract_complete_html(driver, output_filename=None, tmp_dir=None):
    """Save the DOM as it is after JavaScript processing and return it.

    Args:
        driver: Selenium WebDriver with the document loaded.
        output_filename: File name to write; defaults to
            "energie360-offerte-complete.html".
        tmp_dir: Optional ``Path`` of the directory to write into (created if
            missing). Without it the file name is used relative to the CWD.

    Returns:
        The page source as a string.
    """
    if output_filename is None:
        output_filename = "energie360-offerte-complete.html"

    # Ensure tmp_dir exists
    if tmp_dir:
        tmp_dir.mkdir(parents=True, exist_ok=True)
        output_path = tmp_dir / output_filename
    else:
        output_path = Path(output_filename)

    print(f"📄 Extracting complete HTML to: {output_path}")

    # Get the complete HTML
    complete_html = driver.page_source

    # Save the complete HTML for inspection
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(complete_html)

    print(f"✅ Complete HTML saved to: {output_path}")
    return complete_html


def clean_and_number_headings_professionally(driver):
    """Strip existing numbering from content headings and renumber them.

    Headings (h1-h3 with class ``content-heading``) get outline numbers
    ("1", "1.1", "1.1.1") based on their ``heading-level-N`` class.

    Returns:
        True if the script ran and reported success, False otherwise.
    """
    print("🔧 Cleaning and adding professional outline numbering to headings...")

    try:
        # The leading `return` hands the IIFE's result back to Python.
        js_code = """
        return (function() {
            console.log('🔧 Cleaning and adding professional outline numbering...');

            // Function to clean and add professional outline numbering
            function cleanAndNumberHeadings() {
                let level1Count = 0;
                let level2Count = 0;
                let level3Count = 0;

                const headings = document.querySelectorAll('h1.content-heading, h2.content-heading, h3.content-heading');

                headings.forEach(heading => {
                    const level = parseInt(heading.className.match(/heading-level-(\\d+)/)?.[1] || '1');

                    // Get the original text and clean it completely
                    let originalText = heading.textContent.trim();

                    // Remove ALL existing numbering patterns (1, 1.1, 1.1.1, etc.)
                    originalText = originalText.replace(/^\\d+(\\.\\d+)*\\s*/, '');

                    // Store the clean original text
                    heading.setAttribute('data-original-text', originalText);

                    // Add appropriate professional numbering
                    let numberedText = '';
                    if (level === 1) {
                        level1Count++;
                        level2Count = 0;
                        level3Count = 0;
                        numberedText = level1Count + ' ' + originalText;
                    } else if (level === 2) {
                        level2Count++;
                        level3Count = 0;
                        numberedText = level1Count + '.' + level2Count + ' ' + originalText;
                    } else if (level === 3) {
                        level3Count++;
                        numberedText = level1Count + '.' + level2Count + '.' + level3Count + ' ' + originalText;
                    }

                    // Update heading text with clean numbering
                    heading.textContent = numberedText;
                    heading.setAttribute('data-numbered-text', numberedText);

                    console.log(`✅ Heading ${level}: "${originalText}" -> "${numberedText}"`);
                });

                console.log('✅ Professional outline numbering completed');
                return true;
            }

            return cleanAndNumberHeadings();
        })();
        """

        # Execute the JavaScript
        result = driver.execute_script(js_code)

        # Wait a bit for the JavaScript to complete
        time.sleep(1)

        if not result:
            print("❌ Error adding professional outline numbering: script reported failure")
            return False

        print("✅ Professional outline numbering added successfully")
        return True

    except Exception as e:
        print(f"❌ Error adding professional outline numbering: {e}")
        return False


def create_professional_toc_with_real_page_numbers(driver, page_numbers):
    """Rebuild the TOC inside ``.toc-auto`` using measured page numbers.

    Args:
        driver: Selenium WebDriver with the document loaded.
        page_numbers: Dict as returned by analyze_pdf_page_numbers(); its
            "page_numbers" mapping (title -> page) is what the script reads.

    Returns:
        True if the TOC was rebuilt, False if the script raised or reported
        that the TOC container or the content headings are missing.
    """
    print("🔧 Creating professional TOC with real page numbers...")

    try:
        # The data is passed as a script argument rather than spliced into the
        # source text, so no quoting/formatting of the JavaScript is involved.
        js_code = """
        return (function(pageNumbers) {
            console.log('🔧 Creating professional TOC with real page numbers...');

            // Function to find the best matching section for a heading text
            function findBestSectionMatch(headingText) {
                const cleanHeading = headingText.replace(/^\\d+(\\.\\d+)*\\s*/, '').trim();

                console.log(`🔍 Looking for match for: "${cleanHeading}"`);
                console.log(`📋 Available sections:`, Object.keys(pageNumbers.page_numbers));

                // Try exact match first
                for (let [section, page] of Object.entries(pageNumbers.page_numbers)) {
                    if (cleanHeading === section) {
                        console.log(`✅ Exact match: "${section}" -> page ${page}`);
                        return { section, page };
                    }
                }

                // Try partial match (more flexible)
                for (let [section, page] of Object.entries(pageNumbers.page_numbers)) {
                    const cleanSection = section.replace(/^\\d+(\\.\\d+)*\\s*/, '').trim();

                    // Check if either contains the other
                    if (cleanHeading.includes(cleanSection) || cleanSection.includes(cleanHeading)) {
                        console.log(`✅ Partial match: "${cleanSection}" -> page ${page}`);
                        return { section, page };
                    }
                }

                // Try word-based matching for longer headings
                if (cleanHeading.length > 10) {
                    const headingWords = cleanHeading.split(' ').filter(word => word.length > 3);

                    for (let [section, page] of Object.entries(pageNumbers.page_numbers)) {
                        const cleanSection = section.replace(/^\\d+(\\.\\d+)*\\s*/, '').trim();
                        const sectionWords = cleanSection.split(' ').filter(word => word.length > 3);

                        // Count matching words
                        const matches = headingWords.filter(word =>
                            sectionWords.some(sword =>
                                word.toLowerCase() === sword.toLowerCase() ||
                                word.toLowerCase().includes(sword.toLowerCase()) ||
                                sword.toLowerCase().includes(word.toLowerCase())
                            )
                        );

                        if (matches.length >= 2) {
                            console.log(`✅ Word-based match: "${cleanSection}" -> page ${page} (${matches.length} words)`);
                            return { section, page };
                        }
                    }
                }

                console.warn(`⚠️ No match found for: "${cleanHeading}"`);
                return null;
            }

            // Function to create professional TOC
            function createProfessionalTOC() {
                // Find the TOC container
                const tocContainer = document.querySelector('.toc-auto');
                if (!tocContainer) {
                    console.error('❌ TOC container not found');
                    return false;
                }

                // Clear existing TOC completely
                tocContainer.innerHTML = '';

                // Create professional TOC title
                const title = document.createElement('h1');
                title.className = 'toc-title';
                title.textContent = 'Inhaltsverzeichnis';
                title.style.textAlign = 'center';
                title.style.marginBottom = '2rem';
                title.style.fontSize = '1.5rem';
                title.style.fontWeight = 'bold';
                title.style.color = '#2c3e50';
                tocContainer.appendChild(title);

                // Get all numbered headings
                const headings = document.querySelectorAll('h1.content-heading, h2.content-heading, h3.content-heading');

                if (headings.length === 0) {
                    console.error('❌ No content headings found');
                    return false;
                }

                console.log(`📋 Found ${headings.length} headings to process`);

                headings.forEach((heading, index) => {
                    const headingText = heading.textContent.trim();
                    const level = parseInt(heading.className.match(/heading-level-(\\d+)/)?.[1] || '1');

                    // Find page number for this heading
                    const match = findBestSectionMatch(headingText);
                    const pageNumber = match ? match.page : 1; // Default to page 1 if no match

                    // Create TOC entry with professional styling
                    const entry = document.createElement('div');
                    entry.className = 'toc-entry';
                    entry.classList.add(`toc-level-${level}`);

                    // Set professional styling
                    entry.style.display = 'flex';
                    entry.style.justifyContent = 'space-between';
                    entry.style.alignItems = 'center';
                    entry.style.marginBottom = '0.5rem';
                    entry.style.cursor = 'pointer';
                    entry.style.transition = 'background-color 0.2s';

                    // Add hover effect
                    entry.addEventListener('mouseenter', () => {
                        entry.style.backgroundColor = '#f8f9fa';
                    });
                    entry.addEventListener('mouseleave', () => {
                        entry.style.backgroundColor = 'transparent';
                    });

                    // Create text container
                    const textContainer = document.createElement('div');
                    textContainer.style.flex = '1';
                    textContainer.style.display = 'flex';
                    textContainer.style.alignItems = 'center';

                    // Add level-based indentation
                    const indent = (level - 1) * 1.5;
                    textContainer.style.marginLeft = indent + 'rem';

                    // Create text span
                    const textSpan = document.createElement('span');
                    textSpan.className = 'toc-text';
                    textSpan.textContent = headingText;
                    textSpan.style.fontSize = level === 1 ? '1.1rem' : level === 2 ? '1rem' : '0.9rem';
                    textSpan.style.fontWeight = level === 1 ? 'bold' : 'normal';
                    textSpan.style.color = level === 1 ? '#2c3e50' : level === 2 ? '#34495e' : '#7f8c8d';

                    textContainer.appendChild(textSpan);

                    // Create right-aligned page number
                    const pageSpan = document.createElement('span');
                    pageSpan.className = 'toc-page';
                    pageSpan.textContent = pageNumber;
                    pageSpan.style.fontWeight = 'bold';
                    pageSpan.style.color = '#e74c3c';
                    pageSpan.style.minWidth = '2rem';
                    pageSpan.style.textAlign = 'right';

                    // Create dots (leader)
                    const dotsSpan = document.createElement('span');
                    dotsSpan.className = 'toc-dots';
                    dotsSpan.textContent = '.'.repeat(20);
                    dotsSpan.style.flex = '1';
                    dotsSpan.style.margin = '0 0.5rem';
                    dotsSpan.style.color = '#bdc3c7';
                    dotsSpan.style.overflow = 'hidden';
                    dotsSpan.style.whiteSpace = 'nowrap';

                    // Assemble entry
                    entry.appendChild(textContainer);
                    entry.appendChild(dotsSpan);
                    entry.appendChild(pageSpan);

                    // Add click functionality
                    entry.addEventListener('click', () => {
                        heading.scrollIntoView({ behavior: 'smooth', block: 'start' });
                    });

                    // Add to TOC
                    tocContainer.appendChild(entry);

                    console.log(`✅ TOC entry ${index + 1}: "${headingText}" -> page ${pageNumber} (level ${level})`);
                });

                console.log('✅ Professional TOC created successfully');
                return true;
            }

            // Execute the function
            return createProfessionalTOC();
        })(arguments[0]);
        """

        # Execute the JavaScript
        result = driver.execute_script(js_code, page_numbers)

        # Wait a bit for the JavaScript to complete
        time.sleep(2)

        if not result:
            # The script logs the precise reason to the browser console.
            print("❌ Error creating professional TOC: TOC container or content headings not found")
            return False

        print("✅ Professional TOC created successfully")
        return True

    except Exception as e:
        print(f"❌ Error creating professional TOC: {e}")
        return False


def generate_pdf_from_complete_html(html_file_path, output_filename=None):
    """Render an HTML file to PDF with headless Chrome.

    Args:
        html_file_path: Path of the HTML file to print.
        output_filename: Target PDF path; defaults to
            "energie360-offerte-complete.pdf" in the CWD.

    Returns:
        True if Chrome exited with code 0 and the PDF exists afterwards,
        False otherwise (Chrome missing, non-zero exit, timeout, other error).
    """
    if output_filename is None:
        output_filename = "energie360-offerte-complete.pdf"

    print(f"🔄 Generating PDF: {output_filename}")

    chrome_path = find_chrome_executable()
    if not chrome_path:
        print("❌ Chrome not found!")
        return False

    try:
        # Ensure output filename has absolute path
        output_abs_path = os.path.abspath(output_filename)
        html_abs_path = os.path.abspath(html_file_path)
        print(f"📁 Output will be saved to: {output_abs_path}")
        print(f"📄 HTML source: {html_abs_path}")

        # Use Chrome to generate PDF from the complete HTML
        cmd = [
            chrome_path,
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            f"--print-to-pdf={output_abs_path}",
            # Legacy and current spelling of "no header/footer"; Chrome
            # ignores whichever switch it does not know.
            "--print-to-pdf-no-header",
            "--no-pdf-header-footer",
            # as_uri() yields a valid, percent-encoded file URL on every OS.
            Path(html_abs_path).as_uri(),
        ]

        print(f"🔧 Running command: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)

        if result.returncode != 0:
            print(f"❌ PDF generation failed with return code: {result.returncode}")
            print(f"📋 Chrome stdout: {result.stdout}")
            print(f"📋 Chrome stderr: {result.stderr}")
            return False

        # Check if file was actually created
        if os.path.exists(output_abs_path):
            print(f"✅ PDF generated successfully: {output_filename}")
            print(f"📁 File location: {output_abs_path}")
            return True

        print(f"⚠️ PDF generation reported success but file not found: {output_filename}")
        print(f"🔍 Current working directory: {os.getcwd()}")
        print(f"🔍 Expected file location: {output_abs_path}")
        print(f"📋 Chrome stdout: {result.stdout}")
        print(f"📋 Chrome stderr: {result.stderr}")
        return False

    except subprocess.TimeoutExpired:
        print("❌ PDF generation timeout")
        return False
    except Exception as e:
        print(f"❌ PDF generation error: {e}")
        return False


def cleanup_old_files(tmp_dir, final_pdf_path):
    """Remove the previous final PDF and temp directory before a run.

    Args:
        tmp_dir: ``Path`` of the temporary directory.
        final_pdf_path: ``Path`` of the final PDF.
    """
    print("🧹 Cleaning up old files...")

    # Remove existing final PDF if it exists
    if final_pdf_path.exists():
        print(f"🗑️ Removing existing final PDF: {final_pdf_path}")
        final_pdf_path.unlink()

    # Clean up old temporary directory if it exists
    if tmp_dir.exists():
        print(f"🗑️ Cleaning up old temporary directory: {tmp_dir}")
        shutil.rmtree(tmp_dir)

    print("✅ Cleanup completed")


def cleanup_temp_files(tmp_dir):
    """Delete the temp directory after a successful run (best effort).

    Args:
        tmp_dir: ``Path`` of the temporary directory. Errors are reported as
            warnings and never raised.
    """
    try:
        if tmp_dir.exists():
            shutil.rmtree(tmp_dir)
            print(f"✅ Temporary files cleaned up: {tmp_dir}")
        else:
            print("ℹ️ No temporary files to clean up")
    except Exception as e:
        print(f"⚠️ Warning: Could not clean up temporary files: {e}")
        print(f"💡 You can manually delete the folder: {tmp_dir}")


def main():
    """Run the full pipeline: render, measure page numbers, rebuild TOC, print.

    Reads ``--keep-temp`` from ``sys.argv`` to decide whether the temporary
    directory is kept.

    Returns:
        True on success, False on any failure.
    """
    # Parse command line arguments
    keep_temp = "--keep-temp" in sys.argv

    if keep_temp:
        print("📝 Note: Temporary files will be kept (--keep-temp flag detected)")

    print("🚀 Starting Complete PDF Generation Process...")

    # Check if HTML file exists
    html_file = Path("energie360/energie360-offerte.html")
    if not html_file.exists():
        print(f"❌ HTML file not found: {html_file}")
        return False

    # Create temporary directory for all intermediate files
    tmp_dir = Path("_tmp")

    # Define output paths
    source_dir = html_file.parent
    final_pdf_path = source_dir / "energie360-offerte-final.pdf"

    # Clean up old files
    cleanup_old_files(tmp_dir, final_pdf_path)

    # Create fresh temporary directory
    tmp_dir.mkdir(exist_ok=True)
    print(f"📁 Temporary files will be saved to: {tmp_dir.absolute()}")
    print("📁 Temporary files include: HTML extracts, intermediate PDFs, and analysis data")
    print(f"📁 Final PDF will be saved to: {final_pdf_path}")

    # Setup Chrome options
    chrome_options = setup_chrome_options()

    # Find Chrome executable
    chrome_path = find_chrome_executable()
    if not chrome_path:
        print("❌ Chrome not found! Please install Google Chrome.")
        return False

    print(f"✅ Chrome found at: {chrome_path}")

    # Start Chrome driver with retry logic
    driver = None
    max_retries = 3

    for attempt in range(max_retries):
        try:
            print(f"🔄 Attempt {attempt + 1}/{max_retries} to start Chrome driver...")
            driver = webdriver.Chrome(options=chrome_options)
            print("✅ Chrome driver started successfully")
            break
        except Exception as e:
            print(f"❌ Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print("⏳ Waiting 2 seconds before retry...")
                time.sleep(2)
            else:
                print("❌ All attempts to start Chrome driver failed")
                return False

    try:
        # ===== FIRST PASS: Generate TOC with estimated page numbers =====
        print("\n🔄 FIRST PASS: Generating TOC with estimated page numbers...")

        # Open the HTML file (as_uri() gives a valid file URL on every OS)
        html_url = html_file.resolve().as_uri()
        print(f"🌐 Opening HTML: {html_url}")
        driver.get(html_url)

        # Wait for page to load
        time.sleep(3)

        # Wait for TOC generation with estimated numbers
        if not check_driver_health(driver):
            print("❌ Driver unhealthy after page load")
            return False

        if not wait_for_toc_completion(driver):
            print("❌ First pass TOC generation failed")
            return False

        print("✅ First pass TOC completed with estimated page numbers")

        # ===== SECOND PASS: Extract HTML with estimated TOC and generate PDF =====
        print("\n🔄 SECOND PASS: Extracting HTML with estimated TOC and generating PDF...")

        # Check driver health before extraction
        if not check_driver_health(driver):
            print("❌ Driver unhealthy before HTML extraction")
            return False

        # Extract HTML with estimated TOC for analysis
        print("📄 Extracting HTML with estimated TOC...")
        extract_complete_html(driver, "energie360-offerte-estimated.html", tmp_dir)

        # Generate PDF from HTML with estimated TOC
        estimated_html_path = tmp_dir / "energie360-offerte-estimated.html"
        estimated_pdf_path = tmp_dir / "energie360-offerte-estimated.pdf"

        if not generate_pdf_from_complete_html(str(estimated_html_path), str(estimated_pdf_path)):
            print("❌ PDF generation for analysis failed")
            return False

        # Analyze PDF to get real page numbers
        print("📊 Analyzing PDF for real page numbers...")

        # Use absolute path for PDF analysis
        pdf_path = estimated_pdf_path.absolute()
        print(f"📄 Looking for PDF at: {pdf_path}")

        page_numbers = analyze_pdf_page_numbers(str(pdf_path))
        if not page_numbers:
            print("❌ PDF analysis failed")
            return False

        # ===== THIRD PASS: Update TOC with real page numbers =====
        print("\n🔄 THIRD PASS: Updating TOC with real page numbers...")

        # First, clean existing numbering and add professional outline numbering
        print("🔧 Cleaning and adding professional outline numbering to headings...")
        if not clean_and_number_headings_professionally(driver):
            print("❌ Failed to add professional outline numbering")
            return False

        # Now create a professional TOC with real page numbers
        print("🔧 Creating professional TOC with real page numbers...")
        if not create_professional_toc_with_real_page_numbers(driver, page_numbers):
            print("❌ Failed to create professional TOC")
            return False

        # Wait for page counter to update footer
        if not wait_for_page_counter_completion(driver):
            print("❌ Page counter update failed")
            return False

        print("✅ Third pass TOC completed with real page numbers")

        # ===== FINAL: Extract complete HTML and generate final PDF =====
        print("\n🔄 FINAL: Generating final PDF with accurate TOC and footer...")

        # Extract complete HTML with accurate TOC and footer
        extract_complete_html(driver, "energie360-offerte-final.html", tmp_dir)

        # Generate final PDF from complete HTML
        final_html_path = tmp_dir / "energie360-offerte-final.html"

        if not generate_pdf_from_complete_html(str(final_html_path), str(final_pdf_path)):
            print("❌ Final PDF generation failed")
            return False

        print("🎉 Final PDF generation successful!")
        print(f"📁 Final PDF saved to: {final_pdf_path}")

        # Handle temporary files based on command line flag
        if keep_temp:
            print(f"📁 Temporary files kept in: {tmp_dir.absolute()}")
            print("💡 Run without '--keep-temp' to automatically clean up temp files")
        else:
            print(f"📁 Temporary files are available in: {tmp_dir.absolute()}")
            print("🧹 Cleaning up temporary files...")
            cleanup_temp_files(tmp_dir)

        return True

    except Exception as e:
        print(f"❌ Error during process: {e}")
        return False

    finally:
        driver.quit()
        print("✅ Chrome driver closed")


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)