853 lines
34 KiB
Python
853 lines
34 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Complete PDF Generation Script
|
||
1. Opens HTML in headless browser
|
||
2. Waits for JavaScript to complete (TOC + footer)
|
||
3. Extracts complete HTML with TOC and footer
|
||
4. Generates PDF from final HTML
|
||
|
||
Usage:
|
||
python generate_complete_pdf.py # Auto-cleanup temp files
|
||
python generate_complete_pdf.py --keep-temp # Keep temp files for debugging
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import time
|
||
import subprocess
|
||
import platform
|
||
import json
|
||
from pathlib import Path
|
||
from selenium import webdriver
|
||
from selenium.webdriver.chrome.options import Options
|
||
from selenium.webdriver.common.by import By
|
||
from selenium.webdriver.support.ui import WebDriverWait
|
||
from selenium.webdriver.support import expected_conditions as EC
|
||
from selenium.common.exceptions import TimeoutException
|
||
|
||
def find_chrome_executable():
    """Locate a Chrome/Chromium executable for the current platform.

    Well-known installation paths for the detected operating system are
    probed first. If none of them exists, ``PATH`` is searched for the usual
    Chrome/Chromium binary names.

    Returns:
        The path of the first executable found (``str``), or ``None`` when
        no browser could be located.
    """
    # Function-scope import keeps this block self-contained.
    import shutil

    system = platform.system()
    if system == "Windows":
        possible_paths = [
            r"C:\Program Files\Google\Chrome\Application\chrome.exe",
            r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
        ]
        # USERNAME can be unset (services, stripped environments); only add
        # the per-user install location when it is known.
        username = os.getenv('USERNAME')
        if username:
            possible_paths.append(
                r"C:\Users\{}\AppData\Local\Google\Chrome\Application\chrome.exe".format(username)
            )
    elif system == "Darwin":  # macOS
        possible_paths = [
            "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
        ]
    else:  # Linux
        possible_paths = [
            "/usr/bin/google-chrome",
            "/usr/bin/chromium-browser",
            "/usr/bin/chromium",
        ]

    for path in possible_paths:
        if os.path.exists(path):
            return path

    # Fall back to PATH. shutil.which works on every platform (the external
    # ``which`` command does not exist on Windows) and simply returns None
    # for a missing program, so no blanket exception handler is needed.
    for name in (
        "google-chrome",
        "google-chrome-stable",
        "chromium-browser",
        "chromium",
        "chrome",
    ):
        found = shutil.which(name)
        if found:
            return found

    return None
|
||
|
||
def setup_chrome_options():
    """Build the Chrome ``Options`` object used for the headless session.

    Returns:
        An ``Options`` instance with every switch listed below added, in
        order.
    """
    switches = (
        "--headless=new",  # Use new headless mode
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--disable-web-security",
        "--allow-running-insecure-content",
        "--disable-features=VizDisplayCompositor",
        "--window-size=1920,1080",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    )

    opts = Options()
    for switch in switches:
        opts.add_argument(switch)
    return opts
|
||
|
||
def wait_for_toc_completion(driver, timeout=30):
    """Block until the page's JavaScript has rendered the table of contents.

    Two conditions are awaited in turn, each with its own ``timeout``: the
    element with class ``toc-title`` must read exactly "Inhaltsverzeichnis",
    and at least one element with class ``toc-entry`` must exist. A fixed
    two-second pause follows so remaining entries can be added.

    Args:
        driver: Selenium driver with the document loaded.
        timeout: Seconds to wait for each condition.

    Returns:
        ``True`` when both conditions were met, ``False`` on timeout or any
        other error.
    """
    def title_is_final(d):
        # The title shows a loading text until the TOC script has finished.
        return d.find_element(By.CLASS_NAME, "toc-title").text == "Inhaltsverzeichnis"

    def has_entries(d):
        return len(d.find_elements(By.CLASS_NAME, "toc-entry")) > 0

    print("⏳ Waiting for TOC generation to complete...")

    try:
        for condition in (title_is_final, has_entries):
            WebDriverWait(driver, timeout).until(condition)

        # Grace period for entries that are still being appended
        time.sleep(2)

        print("✅ TOC generation completed!")
        return True
    except TimeoutException:
        print("❌ Timeout waiting for TOC generation")
        return False
    except Exception as e:
        print(f"⚠️ Error during TOC wait: {e}")
        return False
|
||
|
||
def check_driver_health(driver):
    """Report whether the browser driver still responds.

    Args:
        driver: Object exposing a ``current_url`` attribute (the Selenium
            driver in this script).

    Returns:
        ``True`` if ``current_url`` could be read, ``False`` (after printing
        the error) if reading it raised any exception.
    """
    try:
        # Only the attribute access matters; the URL itself is not needed.
        _ = driver.current_url
    except Exception as e:
        print(f"⚠️ Driver health check failed: {e}")
        return False
    return True
|
||
|
||
def wait_for_page_counter_completion(driver, timeout=30):
    """Block until the footer no longer contains the page-count placeholder.

    The page is considered ready once the text ``var(--total-pages`` has
    disappeared from ``driver.page_source``; a two-second pause follows.

    Args:
        driver: Selenium driver with the document loaded.
        timeout: Seconds to wait for the placeholder to vanish.

    Returns:
        ``True`` once the placeholder is gone, ``False`` on timeout.
    """
    def placeholder_gone(d):
        return "var(--total-pages" not in d.page_source

    print("⏳ Waiting for page counter to complete...")

    try:
        WebDriverWait(driver, timeout).until(placeholder_gone)

        # Give the footer a moment to stabilize
        time.sleep(2)

        print("✅ Page counter completed!")
        return True
    except TimeoutException:
        print("❌ Timeout waiting for page counter")
        return False
|
||
|
||
def _match_section_page(section, page_texts):
    """Find the 1-based page on which *section* appears.

    An exact substring match anywhere in the document takes precedence over
    a fuzzy match, so a loose keyword hit on an early page can no longer
    shadow the real heading further back.

    Args:
        section: Heading text to look for.
        page_texts: Extracted text of every page, in page order.

    Returns:
        ``(page_number, kind)`` where ``kind`` is ``"exact"`` or
        ``"partial"``, or ``(None, None)`` when nothing matched.
    """
    for page_number, text in enumerate(page_texts, start=1):
        if section in text:
            return page_number, "exact"

    # Fuzzy fallback: only for longer, multi-word headings, and only when at
    # least two "key" words (longer than 3 characters) occur on a page.
    if len(section) > 10:
        words = section.split()
        if len(words) >= 3:
            key_words = [word for word in words if len(word) > 3]
            if len(key_words) >= 2:
                for page_number, text in enumerate(page_texts, start=1):
                    if sum(1 for word in key_words if word in text) >= 2:
                        return page_number, "partial"

    return None, None


def analyze_pdf_page_numbers(pdf_path):
    """Determine on which PDF page each known section heading appears.

    The PDF is opened with PyMuPDF, the text of every page is extracted
    once, and each heading from the built-in section list is located. The
    result is also written to ``_tmp/page_numbers_updated.json``.

    Args:
        pdf_path: Path of the PDF to analyse.

    Returns:
        ``{"total_pages": int, "page_numbers": {section: page}}`` on
        success. Sections that are not found are mapped to page 1. When
        PyMuPDF is not installed the result of ``fallback_pdf_analysis`` is
        returned instead; on any other error ``None`` is returned.
    """
    print(f"📊 Analyzing PDF: {pdf_path}")

    try:
        # Use PyMuPDF to analyze the PDF
        import fitz  # PyMuPDF
    except ImportError:
        print("❌ PyMuPDF not available, using fallback method...")
        return fallback_pdf_analysis(pdf_path)

    # Define the sections to look for (headings without outline numbering)
    sections = [
        # Main sections (level 1)
        "Offerte Übersicht",
        "Gesamtübersicht & Zahlungsmodalitäten",
        "Zusammenarbeitsmodell",
        "Investitionsschätzung",
        "Kontaktdaten",
        "Rahmenbedingungen & Konditionen",
        "Unterschriften",

        # Sub-sections (level 2)
        "Struktur der Offerte",
        "Detaillierte Lösungsbeschreibung",
        "Projektphasen",

        # Sub-sub-sections (level 3)
        "Kernkomponenten des KI-Systems",
        "Customer Journey - Vollständiger Prozess",
        "Nutzen für Vertriebsprozess durch KI-Automatisierung",
        "User Interfaces & Touchpoints",
        "Technische Integration & Compliance",
        "Organisatorische Verantwortlichkeiten",
        "Kundenseitige Mitwirkung (FTE)",
        "Phase 1: MVP (2-3 Monate)",
        "Phase 2: Erweiterung (Monate 4-6)",
        "Phase 3: Vollintegration (Monate 7-9)",
        "Phase 4: Skalierung (Monate 10-12)",
    ]

    try:
        doc = fitz.open(pdf_path)
        try:
            total_pages = len(doc)
            print(f"📄 Total pages in PDF: {total_pages}")
            # Extract each page's text exactly once instead of once per
            # section, then release the document even if extraction fails.
            page_texts = [doc[index].get_text() for index in range(total_pages)]
        finally:
            doc.close()

        page_numbers = {}
        for section in sections:
            page_number, kind = _match_section_page(section, page_texts)
            if kind == "exact":
                print(f"📍 '{section}' found on page {page_number}")
            elif kind == "partial":
                print(f"📍 '{section}' (partial match) found on page {page_number}")
            else:
                print(f"⚠️ Section '{section}' not found in any page")
                # Assign to page 1 as fallback
                page_number = 1
            page_numbers[section] = page_number

        # Save page numbers to JSON
        output_data = {
            "total_pages": total_pages,
            "page_numbers": page_numbers,
        }

        json_path = Path("_tmp/page_numbers_updated.json")
        json_path.parent.mkdir(parents=True, exist_ok=True)
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print(f"✅ Page numbers saved to: {json_path}")
        return output_data

    except Exception as e:
        # Callers treat None as "analysis failed"
        print(f"❌ PDF analysis error: {e}")
        return None
|
||
|
||
def fallback_pdf_analysis(pdf_path):
    """Placeholder analysis used when PyMuPDF is not installed.

    No text is extracted from the PDF. The function only verifies that a
    Chrome executable can be located and then returns fixed default values,
    so callers get a structurally valid result without real page numbers.

    Args:
        pdf_path: Path of the PDF; currently unused and kept so the
            signature matches the primary analysis function.

    Returns:
        ``None`` if Chrome cannot be found or an error occurs, otherwise
        ``{"total_pages": 22, "page_numbers": {}}``.
    """
    print("🔄 Using Chrome headless for PDF analysis...")

    try:
        if not find_chrome_executable():
            return None

        # NOTE: the previous version assembled a Chrome command line and an
        # HTML output path here but never executed or used either; that dead
        # code was removed. Real text extraction still requires PyMuPDF.
        print("⚠️ Fallback analysis limited - consider installing PyMuPDF")
        return {"total_pages": 22, "page_numbers": {}}  # Default values

    except Exception as e:
        print(f"❌ Fallback analysis failed: {e}")
        return None
|
||
|
||
def wait_for_footer_update(driver, timeout=10):
    """Wait briefly for the footer's page-count placeholder to be replaced.

    The footer counts as updated once ``var(--total-pages`` no longer occurs
    in ``driver.page_source``. A timeout is reported but deliberately not
    treated as a failure.

    Args:
        driver: Selenium driver with the document loaded.
        timeout: Seconds to wait for the placeholder to vanish.

    Returns:
        Always ``True``, whether the footer updated or the wait timed out.
    """
    def placeholder_replaced(d):
        return "var(--total-pages" not in d.page_source

    print("⏳ Waiting for footer update...")

    try:
        WebDriverWait(driver, timeout).until(placeholder_replaced)
    except TimeoutException:
        print("⚠️ Footer update timeout - continuing anyway")
        return True

    print("✅ Footer updated!")
    return True
|
||
|
||
def extract_complete_html(driver, output_filename=None, tmp_dir=None):
    """Save the browser's current DOM as an HTML file and return it.

    Args:
        driver: Object exposing ``page_source`` (the Selenium driver).
        output_filename: File name to write; defaults to
            "energie360-offerte-complete.html".
        tmp_dir: Optional ``Path`` of the directory to write into. It is
            created if missing. Without it the file name is used as given.

    Returns:
        The HTML text that was written.
    """
    filename = (
        output_filename
        if output_filename is not None
        else "energie360-offerte-complete.html"
    )

    if tmp_dir:
        # Make sure the target directory is there before writing into it
        tmp_dir.mkdir(parents=True, exist_ok=True)
        target = tmp_dir / filename
    else:
        target = Path(filename)

    print(f"📄 Extracting complete HTML to: {target}")

    rendered = driver.page_source

    # Persist a copy so the rendered document can be inspected later
    with target.open("w", encoding="utf-8") as handle:
        handle.write(rendered)

    print(f"✅ Complete HTML saved to: {target}")
    return rendered
|
||
|
||
def clean_and_number_headings_professionally(driver, settle_seconds=1):
    """Strip existing numbering from content headings and renumber them.

    A script is run in the page that visits every ``h1/h2/h3`` element with
    class ``content-heading``, removes a leading ``1``/``1.1``/``1.1.1``
    style prefix, and writes a fresh outline number derived from the
    ``heading-level-N`` class. The cleaned and the numbered text are stored
    in ``data-original-text`` and ``data-numbered-text``. Headings whose
    level is not 1, 2 or 3 keep their cleaned text without a number.

    Args:
        driver: Selenium driver with the document loaded.
        settle_seconds: Pause after the script has run (default 1 second,
            as before); pass 0 to skip it.

    Returns:
        ``True`` if the script ran and reported success, ``False`` if it
        reported failure or raised.
    """
    print("🔧 Cleaning and adding professional outline numbering to headings...")

    try:
        # The leading ``return`` matters: Selenium runs the snippet as the
        # body of an anonymous function, so without it the value produced by
        # the inner function would be discarded and Python would get None.
        js_code = """
        return (function() {
            console.log('🔧 Cleaning and adding professional outline numbering...');

            // Function to clean and add professional outline numbering
            function cleanAndNumberHeadings() {
                let level1Count = 0;
                let level2Count = 0;
                let level3Count = 0;

                const headings = document.querySelectorAll('h1.content-heading, h2.content-heading, h3.content-heading');

                headings.forEach(heading => {
                    const level = parseInt(heading.className.match(/heading-level-(\\d+)/)?.[1] || '1');

                    // Get the original text and clean it completely
                    let originalText = heading.textContent.trim();

                    // Remove ALL existing numbering patterns (1, 1.1, 1.1.1, etc.)
                    originalText = originalText.replace(/^\\d+(\\.\\d+)*\\s*/, '');

                    // Store the clean original text
                    heading.setAttribute('data-original-text', originalText);

                    // Add appropriate professional numbering. Default to the
                    // cleaned text so an unexpected level never blanks a heading.
                    let numberedText = originalText;
                    if (level === 1) {
                        level1Count++;
                        level2Count = 0;
                        level3Count = 0;
                        numberedText = level1Count + ' ' + originalText;
                    } else if (level === 2) {
                        level2Count++;
                        level3Count = 0;
                        numberedText = level1Count + '.' + level2Count + ' ' + originalText;
                    } else if (level === 3) {
                        level3Count++;
                        numberedText = level1Count + '.' + level2Count + '.' + level3Count + ' ' + originalText;
                    }

                    // Update heading text with clean numbering
                    heading.textContent = numberedText;
                    heading.setAttribute('data-numbered-text', numberedText);

                    console.log(`✅ Heading ${level}: "${originalText}" -> "${numberedText}"`);
                });

                console.log('✅ Professional outline numbering completed');
                return true;
            }

            return cleanAndNumberHeadings();
        })();
        """

        # Execute the JavaScript (synchronous: returns when the script is done)
        result = driver.execute_script(js_code)

        if not result:
            print("❌ Heading numbering script did not report success")
            return False

        # Optional pause kept from the original flow
        if settle_seconds:
            time.sleep(settle_seconds)

        print("✅ Professional outline numbering added successfully")
        return True

    except Exception as e:
        print(f"❌ Error adding professional outline numbering: {e}")
        return False
|
||
|
||
def create_professional_toc_with_real_page_numbers(driver, page_numbers, settle_seconds=2):
    """Rebuild the table of contents in the page using analysed page numbers.

    A script is run in the page that empties the ``.toc-auto`` container,
    adds an "Inhaltsverzeichnis" title and appends one styled entry per
    ``h1/h2/h3.content-heading`` element. Each entry's page number is looked
    up in ``page_numbers["page_numbers"]`` by exact, then substring, then
    word-based matching; headings without a match get page 1.

    Args:
        driver: Selenium driver with the document loaded.
        page_numbers: JSON-serialisable dict with a ``"page_numbers"``
            mapping of section title to page number. It is embedded into
            the script as a JSON literal.
        settle_seconds: Pause after the script has run (default 2 seconds,
            as before); pass 0 to skip it.

    Returns:
        ``True`` if the script built the TOC, ``False`` if it reported
        failure (TOC container or headings missing) or raised.
    """
    print("🔧 Creating professional TOC with real page numbers...")

    try:
        # Create JavaScript to build a professional TOC.
        # The leading ``return`` matters: Selenium runs the snippet as the
        # body of an anonymous function, so without it the script's
        # true/false outcome would never reach Python.
        js_code = """
        return (function() {
            console.log('🔧 Creating professional TOC with real page numbers...');

            // Get the page numbers data
            const pageNumbers = %s;

            // Function to find the best matching section for a heading text
            function findBestSectionMatch(headingText) {
                const cleanHeading = headingText.replace(/^\\d+(\\.\\d+)*\\s*/, '').trim();

                console.log(`🔍 Looking for match for: "${cleanHeading}"`);
                console.log(`📋 Available sections:`, Object.keys(pageNumbers.page_numbers));

                // Try exact match first
                for (let [section, page] of Object.entries(pageNumbers.page_numbers)) {
                    if (cleanHeading === section) {
                        console.log(`✅ Exact match: "${section}" -> page ${page}`);
                        return { section, page };
                    }
                }

                // Try partial match (more flexible)
                for (let [section, page] of Object.entries(pageNumbers.page_numbers)) {
                    const cleanSection = section.replace(/^\\d+(\\.\\d+)*\\s*/, '').trim();

                    // Check if either contains the other
                    if (cleanHeading.includes(cleanSection) || cleanSection.includes(cleanHeading)) {
                        console.log(`✅ Partial match: "${cleanSection}" -> page ${page}`);
                        return { section, page };
                    }
                }

                // Try word-based matching for longer headings
                if (cleanHeading.length > 10) {
                    const headingWords = cleanHeading.split(' ').filter(word => word.length > 3);

                    for (let [section, page] of Object.entries(pageNumbers.page_numbers)) {
                        const cleanSection = section.replace(/^\\d+(\\.\\d+)*\\s*/, '').trim();
                        const sectionWords = cleanSection.split(' ').filter(word => word.length > 3);

                        // Count matching words
                        const matches = headingWords.filter(word =>
                            sectionWords.some(sword =>
                                word.toLowerCase() === sword.toLowerCase() ||
                                word.toLowerCase().includes(sword.toLowerCase()) ||
                                sword.toLowerCase().includes(word.toLowerCase())
                            )
                        );

                        if (matches.length >= 2) {
                            console.log(`✅ Word-based match: "${cleanSection}" -> page ${page} (${matches.length} words)`);
                            return { section, page };
                        }
                    }
                }

                console.warn(`⚠️ No match found for: "${cleanHeading}"`);
                return null;
            }

            // Function to create professional TOC
            function createProfessionalTOC() {
                // Find the TOC container
                const tocContainer = document.querySelector('.toc-auto');
                if (!tocContainer) {
                    console.error('❌ TOC container not found');
                    return false;
                }

                // Clear existing TOC completely
                tocContainer.innerHTML = '';

                // Create professional TOC title
                const title = document.createElement('h1');
                title.className = 'toc-title';
                title.textContent = 'Inhaltsverzeichnis';
                title.style.textAlign = 'center';
                title.style.marginBottom = '2rem';
                title.style.fontSize = '1.5rem';
                title.style.fontWeight = 'bold';
                title.style.color = '#2c3e50';
                tocContainer.appendChild(title);

                // Get all numbered headings
                const headings = document.querySelectorAll('h1.content-heading, h2.content-heading, h3.content-heading');

                if (headings.length === 0) {
                    console.error('❌ No content headings found');
                    return false;
                }

                console.log(`📋 Found ${headings.length} headings to process`);

                headings.forEach((heading, index) => {
                    const headingText = heading.textContent.trim();
                    const level = parseInt(heading.className.match(/heading-level-(\\d+)/)?.[1] || '1');

                    // Find page number for this heading
                    const match = findBestSectionMatch(headingText);
                    const pageNumber = match ? match.page : 1; // Default to page 1 if no match

                    // Create TOC entry with professional styling
                    const entry = document.createElement('div');
                    entry.className = 'toc-entry';
                    entry.classList.add(`toc-level-${level}`);

                    // Set professional styling
                    entry.style.display = 'flex';
                    entry.style.justifyContent = 'space-between';
                    entry.style.alignItems = 'center';
                    entry.style.marginBottom = '0.5rem';
                    entry.style.cursor = 'pointer';
                    entry.style.transition = 'background-color 0.2s';

                    // Add hover effect
                    entry.addEventListener('mouseenter', () => {
                        entry.style.backgroundColor = '#f8f9fa';
                    });
                    entry.addEventListener('mouseleave', () => {
                        entry.style.backgroundColor = 'transparent';
                    });

                    // Create text container
                    const textContainer = document.createElement('div');
                    textContainer.style.flex = '1';
                    textContainer.style.display = 'flex';
                    textContainer.style.alignItems = 'center';

                    // Add level-based indentation
                    const indent = (level - 1) * 1.5;
                    textContainer.style.marginLeft = indent + 'rem';

                    // Create text span
                    const textSpan = document.createElement('span');
                    textSpan.className = 'toc-text';
                    textSpan.textContent = headingText;
                    textSpan.style.fontSize = level === 1 ? '1.1rem' : level === 2 ? '1rem' : '0.9rem';
                    textSpan.style.fontWeight = level === 1 ? 'bold' : 'normal';
                    textSpan.style.color = level === 1 ? '#2c3e50' : level === 2 ? '#34495e' : '#7f8c8d';

                    textContainer.appendChild(textSpan);

                    // Create right-aligned page number
                    const pageSpan = document.createElement('span');
                    pageSpan.className = 'toc-page';
                    pageSpan.textContent = pageNumber;
                    pageSpan.style.fontWeight = 'bold';
                    pageSpan.style.color = '#e74c3c';
                    pageSpan.style.minWidth = '2rem';
                    pageSpan.style.textAlign = 'right';

                    // Create dots (leader)
                    const dotsSpan = document.createElement('span');
                    dotsSpan.className = 'toc-dots';
                    dotsSpan.textContent = '.'.repeat(20);
                    dotsSpan.style.flex = '1';
                    dotsSpan.style.margin = '0 0.5rem';
                    dotsSpan.style.color = '#bdc3c7';
                    dotsSpan.style.overflow = 'hidden';
                    dotsSpan.style.whiteSpace = 'nowrap';

                    // Assemble entry
                    entry.appendChild(textContainer);
                    entry.appendChild(dotsSpan);
                    entry.appendChild(pageSpan);

                    // Add click functionality
                    entry.addEventListener('click', () => {
                        heading.scrollIntoView({
                            behavior: 'smooth',
                            block: 'start'
                        });
                    });

                    // Add to TOC
                    tocContainer.appendChild(entry);

                    console.log(`✅ TOC entry ${index + 1}: "${headingText}" -> page ${pageNumber} (level ${level})`);
                });

                console.log('✅ Professional TOC created successfully');
                return true;
            }

            // Execute the function
            return createProfessionalTOC();
        })();
        """ % json.dumps(page_numbers)

        # Execute the JavaScript (synchronous: returns when the script is done)
        result = driver.execute_script(js_code)

        # The script returns false when the TOC container or the headings are
        # missing; surface that instead of reporting success unconditionally.
        if not result:
            print("❌ TOC script reported failure (TOC container or headings missing)")
            return False

        # Optional pause kept from the original flow
        if settle_seconds:
            time.sleep(settle_seconds)

        print("✅ Professional TOC created successfully")
        return True

    except Exception as e:
        print(f"❌ Error creating professional TOC: {e}")
        return False
|
||
|
||
def generate_pdf_from_complete_html(html_file_path, output_filename=None):
    """Render an HTML file to PDF with headless Chrome.

    Chrome is started as a subprocess with ``--print-to-pdf`` and given up
    to 60 seconds to finish.

    Args:
        html_file_path: Path of the HTML file to print.
        output_filename: Path of the PDF to create; defaults to
            "energie360-offerte-complete.pdf".

    Returns:
        ``True`` if Chrome exited with code 0 and the PDF exists afterwards,
        ``False`` otherwise (Chrome missing, non-zero exit, missing output,
        timeout or any other error).
    """
    if output_filename is None:
        output_filename = "energie360-offerte-complete.pdf"

    print(f"🔄 Generating PDF: {output_filename}")

    chrome_path = find_chrome_executable()
    if not chrome_path:
        print("❌ Chrome not found!")
        return False

    try:
        # Ensure output filename has absolute path
        output_abs_path = os.path.abspath(output_filename)
        html_abs_path = os.path.abspath(html_file_path)

        # Build a well-formed file URI. Plain "file://" + path is malformed
        # for Windows drive paths and breaks on characters such as '#', '%'
        # or spaces; as_uri() handles both.
        html_url = Path(html_abs_path).as_uri()

        print(f"📁 Output will be saved to: {output_abs_path}")
        print(f"📄 HTML source: {html_abs_path}")

        # Use Chrome to generate PDF from the complete HTML
        cmd = [
            chrome_path,
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            f"--print-to-pdf={output_abs_path}",
            "--print-to-pdf-no-header",
            html_url,
        ]

        print(f"🔧 Running command: {' '.join(cmd)}")

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)

        if result.returncode != 0:
            print(f"❌ PDF generation failed with return code: {result.returncode}")
            print(f"📋 Chrome stdout: {result.stdout}")
            print(f"📋 Chrome stderr: {result.stderr}")
            return False

        # Exit code 0 alone is not trusted: check the file was really created
        if not os.path.exists(output_abs_path):
            print(f"⚠️ PDF generation reported success but file not found: {output_filename}")
            print(f"🔍 Current working directory: {os.getcwd()}")
            print(f"🔍 Expected file location: {output_abs_path}")
            print(f"📋 Chrome stdout: {result.stdout}")
            print(f"📋 Chrome stderr: {result.stderr}")
            return False

        print(f"✅ PDF generated successfully: {output_filename}")
        print(f"📁 File location: {output_abs_path}")
        return True

    except subprocess.TimeoutExpired:
        print("❌ PDF generation timeout")
        return False
    except Exception as e:
        print(f"❌ PDF generation error: {e}")
        return False
|
||
|
||
def cleanup_old_files(tmp_dir, final_pdf_path):
    """Remove leftovers of a previous run before a new one starts.

    Args:
        tmp_dir: ``Path`` of the temporary directory; removed recursively
            if it exists.
        final_pdf_path: ``Path`` of the final PDF; deleted if it exists.
    """
    import shutil

    print("🧹 Cleaning up old files...")

    # (path, announcement, removal action) — handled in this order
    stale_items = (
        (
            final_pdf_path,
            f"🗑️ Removing existing final PDF: {final_pdf_path}",
            lambda: final_pdf_path.unlink(),
        ),
        (
            tmp_dir,
            f"🗑️ Cleaning up old temporary directory: {tmp_dir}",
            lambda: shutil.rmtree(tmp_dir),
        ),
    )

    for target, notice, remove in stale_items:
        if target.exists():
            print(notice)
            remove()

    print("✅ Cleanup completed")
|
||
|
||
def cleanup_temp_files(tmp_dir):
    """Delete the temporary directory once the run has finished.

    Failures are reported as warnings rather than raised.

    Args:
        tmp_dir: ``Path`` of the temporary directory to remove.
    """
    try:
        if not tmp_dir.exists():
            print("ℹ️ No temporary files to clean up")
            return

        import shutil

        shutil.rmtree(tmp_dir)
        print(f"✅ Temporary files cleaned up: {tmp_dir}")
    except Exception as e:
        print(f"⚠️ Warning: Could not clean up temporary files: {e}")
        print(f"💡 You can manually delete the folder: {tmp_dir}")
|
||
|
||
def _start_chrome_driver(chrome_options, max_retries=3, retry_delay=2):
    """Start a Selenium Chrome driver, retrying on failure.

    Returns the started driver, or ``None`` after ``max_retries`` failures.
    """
    for attempt in range(1, max_retries + 1):
        try:
            print(f"🔄 Attempt {attempt}/{max_retries} to start Chrome driver...")
            driver = webdriver.Chrome(options=chrome_options)
            print("✅ Chrome driver started successfully")
            return driver
        except Exception as e:
            print(f"❌ Attempt {attempt} failed: {e}")
            if attempt < max_retries:
                print(f"⏳ Waiting {retry_delay} seconds before retry...")
                time.sleep(retry_delay)

    print("❌ All attempts to start Chrome driver failed")
    return None


def main():
    """Run the complete multi-pass PDF generation.

    Steps:
        1. Load ``energie360/energie360-offerte.html`` in headless Chrome and
           wait for the page's own TOC to render.
        2. Save that DOM, print it to an intermediate PDF and analyse the
           PDF for the real page number of each section.
        3. Renumber the headings, rebuild the TOC with the real page
           numbers and wait for the footer page counter.
        4. Save the final DOM and print it to
           ``energie360/energie360-offerte-final.pdf``.

    Intermediate files go to ``_tmp`` and are removed afterwards unless
    ``--keep-temp`` was passed on the command line.

    Returns:
        ``True`` if the final PDF was generated, ``False`` otherwise.
    """
    # Parse command line arguments
    keep_temp = "--keep-temp" in sys.argv
    if keep_temp:
        print("📝 Note: Temporary files will be kept (--keep-temp flag detected)")

    print("🚀 Starting Complete PDF Generation Process...")

    # Check if HTML file exists
    html_file = Path("energie360/energie360-offerte.html")
    if not html_file.exists():
        print(f"❌ HTML file not found: {html_file}")
        return False

    # Directory for all intermediate files
    tmp_dir = Path("_tmp")

    # Define output paths
    source_dir = html_file.parent
    final_pdf_path = source_dir / "energie360-offerte-final.pdf"

    # Clean up old files
    cleanup_old_files(tmp_dir, final_pdf_path)

    # Create fresh temporary directory
    tmp_dir.mkdir(exist_ok=True)
    print(f"📁 Temporary files will be saved to: {tmp_dir.absolute()}")
    print("📁 Temporary files include: HTML extracts, intermediate PDFs, and analysis data")

    print(f"📁 Final PDF will be saved to: {final_pdf_path}")

    # Setup Chrome options
    chrome_options = setup_chrome_options()

    # Find Chrome executable (needed later for PDF printing; fail early)
    chrome_path = find_chrome_executable()
    if not chrome_path:
        print("❌ Chrome not found! Please install Google Chrome.")
        return False

    print(f"✅ Chrome found at: {chrome_path}")

    # Start Chrome driver with retry logic
    driver = _start_chrome_driver(chrome_options)
    if driver is None:
        return False

    try:
        # ===== FIRST PASS: Generate TOC with estimated page numbers =====
        print("\n🔄 FIRST PASS: Generating TOC with estimated page numbers...")

        # Open the HTML file. as_uri() yields a well-formed file URI on all
        # platforms and escapes characters that are special in URLs.
        html_url = html_file.absolute().as_uri()
        print(f"🌐 Opening HTML: {html_url}")
        driver.get(html_url)

        # Wait for page to load
        time.sleep(3)

        # Wait for TOC generation with estimated numbers
        if not check_driver_health(driver):
            print("❌ Driver unhealthy after page load")
            return False

        if not wait_for_toc_completion(driver):
            print("❌ First pass TOC generation failed")
            return False

        print("✅ First pass TOC completed with estimated page numbers")

        # ===== SECOND PASS: Extract HTML with estimated TOC and generate PDF =====
        print("\n🔄 SECOND PASS: Extracting HTML with estimated TOC and generating PDF...")

        # Check driver health before extraction
        if not check_driver_health(driver):
            print("❌ Driver unhealthy before HTML extraction")
            return False

        # Extract HTML with estimated TOC for analysis
        print("📄 Extracting HTML with estimated TOC...")
        extract_complete_html(driver, "energie360-offerte-estimated.html", tmp_dir)

        # Generate PDF from HTML with estimated TOC
        estimated_html_path = tmp_dir / "energie360-offerte-estimated.html"
        estimated_pdf_path = tmp_dir / "energie360-offerte-estimated.pdf"
        if not generate_pdf_from_complete_html(str(estimated_html_path), str(estimated_pdf_path)):
            print("❌ PDF generation for analysis failed")
            return False

        # Analyze PDF to get real page numbers
        print("📊 Analyzing PDF for real page numbers...")
        # Use absolute path for PDF analysis
        pdf_path = estimated_pdf_path.absolute()
        print(f"📄 Looking for PDF at: {pdf_path}")
        page_numbers = analyze_pdf_page_numbers(str(pdf_path))

        if not page_numbers:
            print("❌ PDF analysis failed")
            return False

        # ===== THIRD PASS: Update TOC with real page numbers =====
        print("\n🔄 THIRD PASS: Updating TOC with real page numbers...")

        # First, clean existing numbering and add professional outline numbering
        print("🔧 Cleaning and adding professional outline numbering to headings...")
        if not clean_and_number_headings_professionally(driver):
            print("❌ Failed to add professional outline numbering")
            return False

        # Now create a professional TOC with real page numbers
        print("🔧 Creating professional TOC with real page numbers...")
        if not create_professional_toc_with_real_page_numbers(driver, page_numbers):
            print("❌ Failed to create professional TOC")
            return False

        # Wait for page counter to update footer
        if not wait_for_page_counter_completion(driver):
            print("❌ Page counter update failed")
            return False

        print("✅ Third pass TOC completed with real page numbers")

        # ===== FINAL: Extract complete HTML and generate final PDF =====
        print("\n🔄 FINAL: Generating final PDF with accurate TOC and footer...")

        # Extract complete HTML with accurate TOC and footer
        extract_complete_html(driver, "energie360-offerte-final.html", tmp_dir)

        # Generate final PDF from complete HTML
        final_html_path = tmp_dir / "energie360-offerte-final.html"
        if not generate_pdf_from_complete_html(str(final_html_path), str(final_pdf_path)):
            print("❌ Final PDF generation failed")
            return False

        print("🎉 Final PDF generation successful!")
        print(f"📁 Final PDF saved to: {final_pdf_path}")

        # Handle temporary files based on command line flag
        if keep_temp:
            print(f"📁 Temporary files kept in: {tmp_dir.absolute()}")
            print("💡 Omit the '--keep-temp' flag to automatically clean up temp files")
        else:
            print(f"📁 Temporary files are available in: {tmp_dir.absolute()}")
            print("🧹 Cleaning up temporary files...")
            cleanup_temp_files(tmp_dir)

        return True

    except Exception as e:
        print(f"❌ Error during process: {e}")
        return False
    finally:
        # Always shut the browser down, whatever the outcome
        driver.quit()
        print("✅ Chrome driver closed")
|
||
|
||
if __name__ == "__main__":
    # Translate main()'s boolean outcome into a process exit status:
    # 0 on success, 1 on failure.
    exit_status = 0 if main() else 1
    sys.exit(exit_status)
|