# gateway/modules/routes/routeRealEstateScraping.py

"""
Real Estate scraping routes for the backend API.
Implements endpoints for scraping real estate data from external sources.
"""
import logging
import json
import aiohttp
import asyncio
from typing import Optional, Dict, Any
from fastapi import APIRouter, HTTPException, Depends, Body, Request, Query, status
# Import auth modules
from modules.auth import limiter, getCurrentUser
# Import models
from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelRealEstate import (
Gemeinde,
Kanton,
Dokument,
Kontext,
DokumentTyp,
)
# Import interfaces
from modules.interfaces.interfaceDbRealEstateObjects import getInterface as getRealEstateInterface
from modules.interfaces.interfaceDbComponentObjects import getInterface as getComponentInterface
# Import scraping script
from modules.features.realEstate.scrapeSwissTopo import scrape_switzerland
# Import Swiss Topo MapServer connector
from modules.connectors.connectorSwissTopoMapServer import SwissTopoMapServerConnector
from modules.connectors.connectorOerebWfs import OerebWfsConnector
# Import Tavily connector for BZO document search
from modules.aicore.aicorePluginTavily import AiTavily
# Configure logger
logger = logging.getLogger(__name__)
# Create router for real estate scraping endpoints
router = APIRouter(
prefix="/api/realestate",
tags=["Real Estate Scraping"],
responses={
404: {"description": "Not found"},
400: {"description": "Bad request"},
401: {"description": "Unauthorized"},
403: {"description": "Forbidden"},
500: {"description": "Internal server error"}
}
)
@router.post("/scrape-switzerland", response_model=Dict[str, Any])
@limiter.limit("5/hour") # Limit to 5 requests per hour (scraping is resource-intensive)
async def scrape_switzerland_route(
request: Request,
body: Dict[str, Any] = Body(..., description="Scraping parameters"),
currentUser: User = Depends(getCurrentUser)
) -> Dict[str, Any]:
"""
    Scrape Kanton Zürich systematically using the Swiss Topo connector and save parcel data to the database.
This endpoint divides Kanton Zürich into a grid and queries parcels at each grid point,
then deduplicates and saves unique parcels to the database. For each parcel, it also
queries the ÖREB WFS service to retrieve bauzone information.
**WARNING**: This is a resource-intensive operation that may take a long time
and make many API requests. Use with caution.
Request Body:
{
"grid_size": 500.0, // Grid cell size in meters (default: 500m)
"max_concurrent": 50, // Maximum concurrent API requests (default: 50)
"batch_size": 100 // Number of parcels to process before saving (default: 100)
}
Headers:
- X-CSRF-Token: CSRF token (required for security)
Returns:
{
"success": true,
"stats": {
"total_queries": 1234,
"successful_queries": 1200,
"failed_queries": 34,
"unique_parcels_found": 500,
"parcels_saved": 450,
"parcels_skipped": 50,
"error_count": 5,
"errors": [...]
}
}
Example:
- POST /api/realestate/scrape-switzerland
Body: {"grid_size": 1000.0, "max_concurrent": 5, "batch_size": 50}
"""
try:
# Validate CSRF token
csrf_token = request.headers.get("X-CSRF-Token") or request.headers.get("x-csrf-token")
if not csrf_token:
logger.warning(f"CSRF token missing for POST /api/realestate/scrape-switzerland from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="CSRF token missing. Please include X-CSRF-Token header."
)
# Basic CSRF token format validation
if not isinstance(csrf_token, str) or len(csrf_token) < 16 or len(csrf_token) > 64:
logger.warning(f"Invalid CSRF token format for POST /api/realestate/scrape-switzerland from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
# Validate token is hex string
try:
int(csrf_token, 16)
except ValueError:
logger.warning(f"CSRF token is not a valid hex string for POST /api/realestate/scrape-switzerland from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
# Extract parameters from body with defaults
grid_size = body.get("grid_size", 500.0)
max_concurrent = body.get("max_concurrent", 50)
batch_size = body.get("batch_size", 100)
# Validate parameters
if grid_size <= 0 or grid_size > 10000:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="grid_size must be between 0 and 10000 meters"
)
if max_concurrent <= 0 or max_concurrent > 200:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="max_concurrent must be between 1 and 200"
)
if batch_size <= 0 or batch_size > 1000:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="batch_size must be between 1 and 1000"
)
logger.info(
f"Starting Switzerland scraping for user {currentUser.id} (mandate: {currentUser.mandateId}) "
f"with grid_size={grid_size}, max_concurrent={max_concurrent}, batch_size={batch_size}"
)
# Run scraping operation
result = await scrape_switzerland(
current_user=currentUser,
grid_size=grid_size,
max_concurrent=max_concurrent,
batch_size=batch_size
)
logger.info(
f"Scraping completed for user {currentUser.id}: "
f"{result['stats']['parcels_saved']} parcels saved"
)
return result
except HTTPException:
raise
except ValueError as e:
logger.error(f"Validation error in scrape_switzerland_route: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Validation error: {str(e)}"
)
except Exception as e:
logger.error(f"Error scraping Switzerland: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error scraping Switzerland: {str(e)}"
)
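# Illustrative client sketch (assumption, not part of this module): how a caller might
# invoke the scrape endpoint above. Assumes the API is served locally, the optional
# `httpx` package is installed, and placeholder auth/CSRF header values; adapt to the
# actual deployment.
#
#   import asyncio
#   import httpx
#
#   async def example_scrape_switzerland() -> dict:
#       async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
#           response = await client.post(
#               "/api/realestate/scrape-switzerland",
#               json={"grid_size": 1000.0, "max_concurrent": 5, "batch_size": 50},
#               headers={
#                   "X-CSRF-Token": "<hex token>",
#                   "Authorization": "Bearer <jwt>",
#               },
#               timeout=None,  # the scrape can run for a long time
#           )
#           response.raise_for_status()
#           return response.json()
#
#   # asyncio.run(example_scrape_switzerland())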
@router.get("/gemeinden", response_model=Dict[str, Any])
@limiter.limit("60/minute")
async def get_all_gemeinden(
request: Request,
only_current: bool = Query(True, description="Only return current municipalities (exclude historical)"),
currentUser: User = Depends(getCurrentUser)
) -> Dict[str, Any]:
"""
Fetch all Gemeinden (municipalities) from the Swiss Topo MapServer connector
and save them to the database.
This endpoint:
1. Fetches all Swiss municipalities from the Swiss Federal Office of Topography
2. Saves them to the database (skipping duplicates based on BFS number)
3. Creates Kantone (cantons) as needed
4. Returns statistics about the import operation
Query Parameters:
- only_current: If True, only return current municipalities (default: True).
If False, return all municipalities including historical ones.
Headers:
- X-CSRF-Token: CSRF token (required for security)
Returns:
{
"gemeinden": [
{
"id": "uuid",
"mandateId": "uuid",
"label": "Bern",
"id_kanton": "uuid",
"kontextInformationen": [...],
...
},
...
],
"count": 2162,
"stats": {
"gemeinden_created": 2100,
"gemeinden_skipped": 62,
"kantone_created": 26,
"error_count": 0,
"errors": []
}
}
Example:
- GET /api/realestate/gemeinden
- GET /api/realestate/gemeinden?only_current=false
"""
try:
# Validate CSRF token
csrf_token = request.headers.get("X-CSRF-Token") or request.headers.get("x-csrf-token")
if not csrf_token:
logger.warning(f"CSRF token missing for GET /api/realestate/gemeinden from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="CSRF token missing. Please include X-CSRF-Token header."
)
# Basic CSRF token format validation
if not isinstance(csrf_token, str) or len(csrf_token) < 16 or len(csrf_token) > 64:
logger.warning(f"Invalid CSRF token format for GET /api/realestate/gemeinden from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
# Validate token is hex string
try:
int(csrf_token, 16)
except ValueError:
logger.warning(f"CSRF token is not a valid hex string for GET /api/realestate/gemeinden from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
logger.info(f"Fetching all Gemeinden for user {currentUser.id} (mandate: {currentUser.mandateId}), only_current={only_current}")
# Initialize connectors and fetch all gemeinden
oereb_connector = OerebWfsConnector()
connector = SwissTopoMapServerConnector(oereb_connector=oereb_connector)
gemeinden_data = await connector.get_all_gemeinden(only_current=only_current)
# Get interface for database operations
realEstateInterface = getRealEstateInterface(currentUser)
# Statistics
gemeinden_created = 0
gemeinden_skipped = 0
kantone_created = 0
errors = []
# Cache for Kanton UUIDs
kanton_cache: Dict[str, str] = {}
# Helper function to find Gemeinde by BFS number
def find_gemeinde_by_bfs_nummer(bfs_nummer: str) -> Optional[Gemeinde]:
"""Find existing Gemeinde by BFS number (stored in kontextInformationen)."""
try:
gemeinden = realEstateInterface.getGemeinden(
recordFilter={"mandateId": currentUser.mandateId}
)
for gemeinde in gemeinden:
# Check kontextInformationen for bfs_nummer
for kontext in gemeinde.kontextInformationen:
try:
kontext_data = json.loads(kontext.inhalt) if isinstance(kontext.inhalt, str) else kontext.inhalt
if isinstance(kontext_data, dict):
if str(kontext_data.get("bfs_nummer")) == str(bfs_nummer):
return gemeinde
except (json.JSONDecodeError, AttributeError):
continue
return None
except Exception as e:
logger.error(f"Error finding Gemeinde by BFS number {bfs_nummer}: {e}", exc_info=True)
return None
# Helper function to get or create Kanton
def get_or_create_kanton(kanton_abk: str) -> Optional[str]:
"""Get or create a Kanton by abbreviation."""
nonlocal kantone_created, errors
if not kanton_abk:
return None
# Check cache first
if kanton_abk in kanton_cache:
return kanton_cache[kanton_abk]
# Check if exists
kantone = realEstateInterface.getKantone(
recordFilter={
"mandateId": currentUser.mandateId,
"abk": kanton_abk
}
)
if kantone:
kanton_cache[kanton_abk] = kantone[0].id
return kantone[0].id
# Create new Kanton
try:
                # Map canton abbreviations to their full German names
kanton_names = {
"AG": "Aargau", "AI": "Appenzell Innerrhoden", "AR": "Appenzell Ausserrhoden",
"BE": "Bern", "BL": "Basel-Landschaft", "BS": "Basel-Stadt",
"FR": "Freiburg", "GE": "Genf", "GL": "Glarus", "GR": "Graubünden",
"JU": "Jura", "LU": "Luzern", "NE": "Neuenburg", "NW": "Nidwalden",
"OW": "Obwalden", "SG": "St. Gallen", "SH": "Schaffhausen", "SO": "Solothurn",
"SZ": "Schwyz", "TG": "Thurgau", "TI": "Tessin", "UR": "Uri",
"VD": "Waadt", "VS": "Wallis", "ZG": "Zug", "ZH": "Zürich"
}
kanton_label = kanton_names.get(kanton_abk, kanton_abk)
kanton = Kanton(
mandateId=currentUser.mandateId,
label=kanton_label,
abk=kanton_abk
)
                created_kanton = realEstateInterface.createKanton(kanton)
                if created_kanton and created_kanton.id:
                    kanton_cache[kanton_abk] = created_kanton.id
                    kantone_created += 1
                    logger.info(f"Created new Kanton: {kanton_label} ({kanton_abk})")
                    return created_kanton.id
                # Creation returned no usable record; report it instead of failing silently
                error_msg = f"Failed to create Kanton {kanton_abk}"
                logger.error(error_msg)
                errors.append(error_msg)
                return None
except Exception as e:
error_msg = f"Error creating Kanton {kanton_abk}: {e}"
logger.error(error_msg, exc_info=True)
errors.append(error_msg)
return None
# Process each gemeinde and save to database
saved_gemeinden = []
for gemeinde_data in gemeinden_data:
try:
gemeinde_name = gemeinde_data.get("name")
bfs_nummer = gemeinde_data.get("bfs_nummer")
kanton_abk = gemeinde_data.get("kanton")
if not gemeinde_name or not bfs_nummer:
logger.warning(f"Skipping Gemeinde with missing data: {gemeinde_data}")
gemeinden_skipped += 1
continue
# Check if Gemeinde already exists
existing_gemeinde = find_gemeinde_by_bfs_nummer(str(bfs_nummer))
if existing_gemeinde:
logger.debug(f"Gemeinde {gemeinde_name} (BFS: {bfs_nummer}) already exists, skipping")
gemeinden_skipped += 1
saved_gemeinden.append(existing_gemeinde.model_dump() if hasattr(existing_gemeinde, 'model_dump') else existing_gemeinde)
continue
# Get or create Kanton
kanton_id = get_or_create_kanton(kanton_abk) if kanton_abk else None
# Create new Gemeinde
gemeinde = Gemeinde(
mandateId=currentUser.mandateId,
label=gemeinde_name,
id_kanton=kanton_id,
kontextInformationen=[
Kontext(
thema="BFS Nummer",
inhalt=json.dumps({"bfs_nummer": bfs_nummer}, ensure_ascii=False)
)
]
)
created_gemeinde = realEstateInterface.createGemeinde(gemeinde)
if created_gemeinde and created_gemeinde.id:
gemeinden_created += 1
logger.info(f"Created new Gemeinde: {gemeinde_name} (BFS: {bfs_nummer})")
saved_gemeinden.append(created_gemeinde.model_dump() if hasattr(created_gemeinde, 'model_dump') else created_gemeinde)
else:
error_msg = f"Failed to create Gemeinde {gemeinde_name} (BFS: {bfs_nummer})"
logger.error(error_msg)
errors.append(error_msg)
gemeinden_skipped += 1
except Exception as e:
error_msg = f"Error processing Gemeinde {gemeinde_data.get('name', 'Unknown')}: {str(e)}"
logger.error(error_msg, exc_info=True)
errors.append(error_msg)
gemeinden_skipped += 1
logger.info(
f"Gemeinden import completed: {gemeinden_created} created, "
f"{gemeinden_skipped} skipped, {kantone_created} Kantone created"
)
return {
"gemeinden": saved_gemeinden,
"count": len(saved_gemeinden),
"stats": {
"gemeinden_created": gemeinden_created,
"gemeinden_skipped": gemeinden_skipped,
"kantone_created": kantone_created,
"error_count": len(errors),
"errors": errors[:10] # Return first 10 errors
}
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error fetching all Gemeinden: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error fetching Gemeinden: {str(e)}"
)
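# Illustrative client sketch (assumption, not part of this module): triggering the
# Gemeinden import above with the optional `httpx` package against a locally served
# API, with placeholder auth/CSRF header values. Each created Gemeinde stores its BFS
# number as JSON in kontextInformationen (thema "BFS Nummer", inhalt like
# {"bfs_nummer": ...}), which is how duplicates are detected on re-runs.
#
#   import httpx
#
#   def example_import_gemeinden(only_current: bool = True) -> dict:
#       response = httpx.get(
#           "http://localhost:8000/api/realestate/gemeinden",
#           params={"only_current": str(only_current).lower()},
#           headers={"X-CSRF-Token": "<hex token>", "Authorization": "Bearer <jwt>"},
#           timeout=600.0,  # the import can take several minutes
#       )
#       response.raise_for_status()
#       return response.json()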
def _get_language_from_kanton(kanton_abk: Optional[str]) -> str:
"""
Determine language (German/French/Italian) based on Kanton abbreviation.
Args:
kanton_abk: Kanton abbreviation (e.g., 'ZH', 'VD', 'TI')
Returns:
Language code: 'de' (German), 'fr' (French), or 'it' (Italian)
"""
if not kanton_abk:
return 'de' # Default to German
# French-speaking cantons
french_cantons = {'VD', 'GE', 'NE', 'JU'}
# Italian-speaking canton
italian_cantons = {'TI'}
kanton_upper = kanton_abk.upper()
if kanton_upper in french_cantons:
return 'fr'
elif kanton_upper in italian_cantons:
return 'it'
else:
return 'de' # Default to German
def _get_bzo_search_query(gemeinde_label: str, language: str) -> str:
"""
Generate language-specific BZO search query for a Gemeinde.
Args:
gemeinde_label: Name of the Gemeinde
language: Language code ('de', 'fr', 'it')
Returns:
Search query string
"""
if language == 'fr':
# French: Plan d'aménagement local or Règlement de construction
return f"Plan d'aménagement local {gemeinde_label} OR Règlement de construction {gemeinde_label}"
elif language == 'it':
# Italian: Piano di utilizzazione or Regolamento edilizio
return f"Piano di utilizzazione {gemeinde_label} OR Regolamento edilizio {gemeinde_label}"
else:
        # German: Bau- und Zonenordnung
        return f"Bau- und Zonenordnung {gemeinde_label}"
@router.post("/gemeinden/fetch-bzo-documents", response_model=Dict[str, Any])
@limiter.limit("10/hour") # Resource-intensive operation
async def fetch_bzo_documents(
request: Request,
currentUser: User = Depends(getCurrentUser)
) -> Dict[str, Any]:
"""
    Search for and download Bau- und Zonenordnung (BZO) documents for all Gemeinden.
This endpoint:
1. Fetches all Gemeinden from the database
2. For each Gemeinde, determines language based on Kanton
3. Uses Tavily search to find BZO documents (up to 5 results)
4. Downloads all PDF files found and stores them with content
5. Creates Dokument records for each PDF and links them to Gemeinde's dokumente field
6. Skips Gemeinden that already have BZO documents
Note: If Tavily returns multiple PDF results, all of them will be downloaded
and saved as separate Dokument records.
Headers:
- X-CSRF-Token: CSRF token (required for security)
Returns:
{
"success": true,
"stats": {
"gemeinden_processed": 100,
"documents_created": 85,
"documents_skipped": 15,
"errors": []
},
"results": [
{
"gemeinde_id": "...",
"gemeinde_label": "Zürich",
"status": "created|skipped|error",
"dokument_ids": ["...", "..."], // List of created document IDs (can be multiple)
"error": null
}
]
}
"""
try:
# Validate CSRF token
csrf_token = request.headers.get("X-CSRF-Token") or request.headers.get("x-csrf-token")
if not csrf_token:
logger.warning(f"CSRF token missing for POST /api/realestate/gemeinden/fetch-bzo-documents from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="CSRF token missing. Please include X-CSRF-Token header."
)
# Basic CSRF token format validation
if not isinstance(csrf_token, str) or len(csrf_token) < 16 or len(csrf_token) > 64:
logger.warning(f"Invalid CSRF token format for POST /api/realestate/gemeinden/fetch-bzo-documents from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
# Validate token is hex string
try:
int(csrf_token, 16)
except ValueError:
logger.warning(f"CSRF token is not a valid hex string for POST /api/realestate/gemeinden/fetch-bzo-documents from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
logger.info(f"Starting BZO document fetch for user {currentUser.id} (mandate: {currentUser.mandateId})")
# Get interfaces
realEstateInterface = getRealEstateInterface(currentUser)
componentInterface = getComponentInterface(currentUser)
# Initialize Tavily connector
tavily = AiTavily()
# Get all Gemeinden
gemeinden = realEstateInterface.getGemeinden(
recordFilter={"mandateId": currentUser.mandateId}
)
logger.info(f"Found {len(gemeinden)} Gemeinden to process")
# Statistics
stats = {
"gemeinden_processed": 0,
"documents_created": 0,
"documents_skipped": 0,
"errors": []
}
results = []
# Process each Gemeinde
for gemeinde in gemeinden:
gemeinde_result = {
"gemeinde_id": gemeinde.id,
"gemeinde_label": gemeinde.label,
"status": None,
"dokument_ids": [], # Changed to list to support multiple documents
"error": None
}
try:
stats["gemeinden_processed"] += 1
# Check if Gemeinde already has a BZO document
existing_bzo = False
if gemeinde.dokumente:
for doc in gemeinde.dokumente:
# Check if it's a BZO document by label or dokumentTyp
if (doc.label and ("BZO" in doc.label.upper() or "BAU UND ZONENORDNUNG" in doc.label.upper() or
"PLAN D'AMÉNAGEMENT" in doc.label.upper() or "RÈGLEMENT DE CONSTRUCTION" in doc.label.upper() or
"PIANO DI UTILIZZAZIONE" in doc.label.upper() or "REGOLAMENTO EDILIZIO" in doc.label.upper())) or \
(doc.dokumentTyp and doc.dokumentTyp in [DokumentTyp.GEMEINDE_BZO_AKTUELL, DokumentTyp.GEMEINDE_BZO_REVISION]):
existing_bzo = True
break
if existing_bzo:
logger.debug(f"Gemeinde {gemeinde.label} already has BZO document, skipping")
gemeinde_result["status"] = "skipped"
stats["documents_skipped"] += 1
results.append(gemeinde_result)
continue
# Get Kanton to determine language
kanton_abk = None
if gemeinde.id_kanton:
kanton = realEstateInterface.getKanton(gemeinde.id_kanton)
if kanton:
kanton_abk = kanton.abk
# Determine language
language = _get_language_from_kanton(kanton_abk)
# Generate search query
search_query = _get_bzo_search_query(gemeinde.label, language)
logger.info(f"Searching for BZO document for {gemeinde.label} (language: {language}) with query: {search_query}")
# Search with Tavily using the private _search method
search_results = await tavily._search(
query=search_query,
maxResults=5,
country="switzerland"
)
if not search_results:
logger.warning(f"No search results found for {gemeinde.label}")
gemeinde_result["status"] = "error"
gemeinde_result["error"] = "No search results found"
stats["errors"].append(f"{gemeinde.label}: No search results found")
results.append(gemeinde_result)
continue
                # Collect candidate PDF URLs from the search results
                pdf_urls = []
                for result in search_results:
                    if 'pdf' in result.url.lower():
                        pdf_urls.append(result.url)
# If no PDF URLs found, try to use all results (they might be PDFs even without .pdf extension)
if not pdf_urls:
pdf_urls = [result.url for result in search_results]
logger.info(f"No explicit PDF URLs found for {gemeinde.label}, trying all {len(pdf_urls)} results")
logger.info(f"Found {len(pdf_urls)} potential PDF documents for {gemeinde.label}")
# Helper function to download a single PDF
async def download_pdf(pdf_url: str) -> Optional[bytes]:
"""Download a PDF from a URL with retry logic."""
max_retries = 3
retry_delay = 2
for attempt in range(max_retries):
try:
# Create headers - use minimal headers on retry after 406 error
if attempt > 0:
# Minimal headers for retry
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': '*/*'
}
else:
# Full headers for first attempt
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'application/pdf,application/octet-stream,*/*',
'Accept-Language': 'de-DE,de;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
timeout = aiohttp.ClientTimeout(total=30, connect=10)
async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
async with session.get(pdf_url, allow_redirects=True) as response:
if response.status == 200:
pdf_content = await response.read()
if not pdf_content or len(pdf_content) < 100: # Minimum size check
raise Exception("Downloaded file is too small or empty")
# Verify it's actually a PDF (check PDF magic bytes)
if not pdf_content.startswith(b'%PDF'):
# Check if it's HTML (common error page)
if pdf_content.startswith(b'<') or pdf_content.startswith(b'<!DOCTYPE'):
raise Exception("Server returned HTML instead of PDF")
logger.warning(f"Downloaded file from {pdf_url} doesn't appear to be a PDF, but continuing anyway")
# Success - return content
return pdf_content
elif response.status == 406:
# Not Acceptable - try with minimal headers on next attempt
logger.warning(f"HTTP 406 for {pdf_url}, will retry with minimal headers (attempt {attempt + 1}/{max_retries})")
if attempt < max_retries - 1:
await asyncio.sleep(retry_delay)
continue
else:
raise Exception(f"HTTP {response.status} (Not Acceptable) - server rejected request after {max_retries} attempts")
                                    elif response.status in [301, 302, 303, 307, 308]:
                                        # Unresolved redirect despite allow_redirects=True (e.g. redirect loop);
                                        # the response body is not a PDF, so treat it as a failure
                                        raise Exception(f"HTTP {response.status} redirect could not be followed")
else:
raise Exception(f"HTTP {response.status} when downloading PDF")
except asyncio.TimeoutError:
logger.warning(f"Timeout downloading PDF from {pdf_url} (attempt {attempt + 1}/{max_retries})")
if attempt < max_retries - 1:
await asyncio.sleep(retry_delay)
continue
else:
raise Exception("Connection timeout after retries")
except aiohttp.ClientError as e:
logger.warning(f"Connection error downloading PDF from {pdf_url} (attempt {attempt + 1}/{max_retries}): {str(e)}")
if attempt < max_retries - 1:
await asyncio.sleep(retry_delay)
continue
else:
raise Exception(f"Connection error: {str(e)}")
except Exception as e:
# For other errors, don't retry
raise
return None
# Process all PDF URLs
created_dokumente = []
current_dokumente = list(gemeinde.dokumente) if gemeinde.dokumente else []
# Sanitize Gemeinde name for filename
safe_name = "".join(c for c in gemeinde.label if c.isalnum() or c in (' ', '-', '_')).strip()
safe_name = safe_name.replace(' ', '_')
if not safe_name:
safe_name = "Gemeinde"
# Determine base label based on language
if language == 'fr':
base_doc_label = f"Plan d'aménagement local {gemeinde.label}"
elif language == 'it':
base_doc_label = f"Piano di utilizzazione {gemeinde.label}"
else:
base_doc_label = f"BZO {gemeinde.label}"
# Process each PDF URL
for idx, pdf_url in enumerate(pdf_urls):
try:
logger.info(f"Downloading PDF {idx + 1}/{len(pdf_urls)} from {pdf_url} for {gemeinde.label}")
pdf_content = await download_pdf(pdf_url)
if not pdf_content:
logger.warning(f"Failed to download PDF from {pdf_url} for {gemeinde.label}")
continue
# Additional validation
if len(pdf_content) < 100:
logger.warning(f"Downloaded file from {pdf_url} is too small for {gemeinde.label}")
continue
# Create unique file name (add index if multiple documents)
if len(pdf_urls) > 1:
file_name = f"BZO_{safe_name}_{idx + 1}.pdf"
doc_label = f"{base_doc_label} ({idx + 1})"
else:
file_name = f"BZO_{safe_name}.pdf"
doc_label = base_doc_label
# Store file using ComponentObjects
try:
file_item = componentInterface.createFile(
name=file_name,
mimeType="application/pdf",
content=pdf_content
)
# Store file data
componentInterface.createFileData(file_item.id, pdf_content)
logger.info(f"Stored file {file_name} with ID {file_item.id} for {gemeinde.label}")
except Exception as e:
logger.error(f"Error storing file {file_name} for {gemeinde.label}: {str(e)}", exc_info=True)
stats["errors"].append(f"{gemeinde.label}: File storage failed for {pdf_url} - {str(e)}")
continue
# Create Dokument record
dokument = Dokument(
mandateId=currentUser.mandateId,
label=doc_label,
versionsbezeichnung="Aktuell",
dokumentTyp=DokumentTyp.GEMEINDE_BZO_AKTUELL,
dokumentReferenz=file_item.id, # FileId from ComponentObjects
quelle=pdf_url, # Original URL
mimeType="application/pdf",
kategorienTags=["BZO", "Bauordnung", gemeinde.label]
)
# Create Dokument record in the Dokument table
created_dokument = realEstateInterface.createDokument(dokument)
logger.info(f"Created Dokument record with ID {created_dokument.id} for {gemeinde.label} (from {pdf_url})")
created_dokumente.append(created_dokument)
current_dokumente.append(created_dokument)
gemeinde_result["dokument_ids"].append(created_dokument.id)
except Exception as e:
logger.error(f"Error processing PDF {pdf_url} for {gemeinde.label}: {str(e)}", exc_info=True)
stats["errors"].append(f"{gemeinde.label}: Error processing PDF {pdf_url} - {str(e)}")
continue
# Update Gemeinde with all new dokumente
if created_dokumente:
updated_gemeinde = realEstateInterface.updateGemeinde(
gemeinde.id,
{"dokumente": current_dokumente}
)
if updated_gemeinde:
logger.info(f"Successfully created {len(created_dokumente)} BZO document(s) for {gemeinde.label}")
gemeinde_result["status"] = "created"
stats["documents_created"] += len(created_dokumente)
else:
raise Exception("Failed to update Gemeinde")
else:
# No documents were successfully created
gemeinde_result["status"] = "error"
gemeinde_result["error"] = "No PDFs could be downloaded or processed"
stats["errors"].append(f"{gemeinde.label}: No PDFs could be downloaded or processed")
except Exception as e:
logger.error(f"Error processing Gemeinde {gemeinde.label}: {str(e)}", exc_info=True)
gemeinde_result["status"] = "error"
gemeinde_result["error"] = str(e)
stats["errors"].append(f"{gemeinde.label}: {str(e)}")
results.append(gemeinde_result)
logger.info(
f"BZO document fetch completed: {stats['documents_created']} created, "
f"{stats['documents_skipped']} skipped, {len(stats['errors'])} errors"
)
return {
"success": True,
"stats": stats,
"results": results
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error fetching BZO documents: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error fetching BZO documents: {str(e)}"
)
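# Illustrative client sketch (assumption, not part of this module): triggering the
# BZO document fetch above with the optional `httpx` package and placeholder auth/CSRF
# header values. The endpoint iterates over every Gemeinde and downloads PDFs, so
# allow a generous (or unlimited) timeout.
#
#   import httpx
#
#   def example_fetch_bzo_documents() -> dict:
#       response = httpx.post(
#           "http://localhost:8000/api/realestate/gemeinden/fetch-bzo-documents",
#           headers={"X-CSRF-Token": "<hex token>", "Authorization": "Bearer <jwt>"},
#           timeout=None,
#       )
#       response.raise_for_status()
#       return response.json()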