"""
|
|
Gemeinde and BZO document services for Real Estate feature.
|
|
Provides ensure/import logic used by both routes and extract_bzo_information.
|
|
"""
|
|
|
|
import asyncio
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import re
|
|
import ssl
|
|
from typing import Any, Dict, List, Optional, Set
|
|
from urllib.parse import urljoin, urlparse
|
|
|
|
import aiohttp
|
|
|
|
from .datamodelFeatureRealEstate import Gemeinde, Kanton, Dokument, DokumentTyp, Kontext
|
|
from modules.connectors.connectorSwissTopoMapServer import SwissTopoMapServerConnector
|
|
from modules.aicore.aicorePluginTavily import AiTavily
|
|
|
|
# Module-level logger, named after this module per standard logging convention.
logger = logging.getLogger(__name__)


# Full German display names for the Swiss canton abbreviations; used as the
# Kanton label when a canton record has to be created on the fly.
KANTON_NAMES = {
    "AG": "Aargau", "AI": "Appenzell Innerrhoden", "AR": "Appenzell Ausserrhoden",
    "BE": "Bern", "BL": "Basel-Landschaft", "BS": "Basel-Stadt",
    "FR": "Freiburg", "GE": "Genf", "GL": "Glarus", "GR": "Graubünden",
    "JU": "Jura", "LU": "Luzern", "NE": "Neuenburg", "NW": "Nidwalden",
    "OW": "Obwalden", "SG": "St. Gallen", "SH": "Schaffhausen", "SO": "Solothurn",
    "SZ": "Schwyz", "TG": "Thurgau", "TI": "Tessin", "UR": "Uri",
    "VD": "Waadt", "VS": "Wallis", "ZG": "Zug", "ZH": "Zürich",
}

# Quartier/place names -> politische Gemeinde (Swiss Topo geocoding returns quarter names).
# Keys are in the normalized form produced by _normalize_gemeinde_for_match (lowercase, alnum).
# Prevents wrong matches like "Enge" -> Martherenges instead of Zürich.
QUARTIER_TO_GEMEINDE: Dict[str, str] = {
    "enge": "Zürich",  # Kreis 2 Enge (Zürich)
    "aussersihl": "Zürich",
    "wiedikon": "Zürich",
}

# Known direct BZO PDF URLs for municipalities (by normalized name, lowercase).
# Used when Tavily returns no matching PDFs; avoids SSL/HTML issues with Tavily results.
# Uster: _docn shows HTML "Erlass ausser Kraft" page; _rtr/dokument_xxx serves the actual PDF.
# Note: "zürich"/"zurich"/"zuerich" are listed separately because lookups happen both with
# the raw lowercased label and with the umlaut-transliterated normalized key.
KNOWN_BZO_PDF_URLS: Dict[str, List[str]] = {
    "schlieren": ["https://www.schlieren.ch/_docn/6239470/SKR_10.10_Bauordnung.pdf"],
    "uster": [
        "https://www.uster.ch/_rtr/dokument_3619802",  # Direct document (PDF)
        "https://www.uster.ch/_docn/3619802/Bau-und-Zonenordnung-teilrevidiert-2021.pdf",  # May return HTML first
    ],
    "zürich": ["https://www.stadt-zuerich.ch/content/dam/stzh/portal/Deutsch/AmtlicheSammlung/Erlasse/700/100/700.100%20Bau-%20und%20Zonenordnung%20V2.pdf"],
    "zurich": ["https://www.stadt-zuerich.ch/content/dam/stzh/portal/Deutsch/AmtlicheSammlung/Erlasse/700/100/700.100%20Bau-%20und%20Zonenordnung%20V2.pdf"],
    "zuerich": ["https://www.stadt-zuerich.ch/content/dam/stzh/portal/Deutsch/AmtlicheSammlung/Erlasse/700/100/700.100%20Bau-%20und%20Zonenordnung%20V2.pdf"],
}
|
|
|
|
|
|
def _get_language_from_kanton(kanton_abk: Optional[str]) -> str:
|
|
if not kanton_abk:
|
|
return "de"
|
|
if kanton_abk.upper() in {"VD", "GE", "NE", "JU"}:
|
|
return "fr"
|
|
if kanton_abk.upper() == "TI":
|
|
return "it"
|
|
return "de"
|
|
|
|
|
|
# Swiss news/media domains to exclude from BZO search (return HTML articles, not PDFs).
# Passed to the Tavily search as excludeDomains.
_EXCLUDE_BZO_DOMAINS = [
    "limmattalerzeitung.ch",
    "20min.ch",
    "tagesanzeiger.ch",
    "nzz.ch",
    "blick.ch",
    "watson.ch",
    "srf.ch",
    "swissinfo.ch",
    "zukunft-schlieren.ch",  # project/development site, not official BZO
]

# Keywords that indicate the actual BZO regulation document (at least one required
# in the lowercased URL/title for a search result to be accepted).
_BZO_ORDINANCE_KEYWORDS = (
    "bzo",
    "zonenordnung",
    "bauordnung",
    "bau-und-zonenordnung",
    "bau und zonenordnung",
    "plan d'aménagement",
    "règlement de construction",
    "piano di utilizzazione",
    "regolamento edilizio",
)

# Keywords that indicate articles or project docs (result is rejected if any of
# these appears in the lowercased URL/title).
_BZO_ARTICLE_PROJECT_KEYWORDS = (
    "ld.",  # article ID (e.g. ld.2805321)
    "warum",  # "why" - typical in article headlines
    "ruft ",  # "calls [population to participate]"
    "artikel",  # article
    "news",
    "projektplanung",  # project planning
    "projekt/",  # URL path for project pages
    "/projekt",
    "entwicklungsplan",  # development plan (project doc)
)
|
|
|
|
|
|
def _normalize_gemeinde_for_match(name: str) -> str:
|
|
"""Normalize Gemeinde name for URL/title matching (lowercase, no umlauts)."""
|
|
if not name:
|
|
return ""
|
|
s = name.lower().strip()
|
|
s = s.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue").replace("ß", "ss")
|
|
s = "".join(c for c in s if c.isalnum())
|
|
return s
|
|
|
|
|
|
def _get_bzo_search_query(gemeinde_label: str, language: str) -> str:
|
|
"""Build search query targeting BZO PDF documents (not articles)."""
|
|
if language == "fr":
|
|
return f"Plan d'aménagement local {gemeinde_label} PDF"
|
|
if language == "it":
|
|
return f"Piano di utilizzazione {gemeinde_label} PDF"
|
|
return f"Bau und Zonenordnung {gemeinde_label} PDF"
|
|
|
|
|
|
async def ensure_single_gemeinde(
    interface: Any,
    mandateId: str,
    instanceId: str,
    gemeinde_name: str,
) -> Optional[Any]:
    """
    Ensure the given Gemeinde exists in DB. Fetches ONLY that one Gemeinde from Swiss Topo
    and creates it if not found. No bulk import.

    Steps: (1) map Quartier names to their politische Gemeinde, (2) resolve the
    name via Swiss Topo, (3) look for an existing record by BFS number, and
    (4) create missing Kanton/Gemeinde records through the project interface.

    Returns the Gemeinde object if found/created, None otherwise.
    """
    # Guard: empty or whitespace-only input cannot be resolved.
    if not gemeinde_name or not gemeinde_name.strip():
        return None
    # Resolve Quartier/place names to politische Gemeinde (e.g. Enge -> Zürich)
    lookup_name = gemeinde_name.strip()
    quartier_key = _normalize_gemeinde_for_match(lookup_name)
    if quartier_key and quartier_key in QUARTIER_TO_GEMEINDE:
        lookup_name = QUARTIER_TO_GEMEINDE[quartier_key]
        logger.debug(f"Mapped Quartier '{gemeinde_name}' -> Gemeinde '{lookup_name}'")
    # Resolve the (possibly remapped) name against Swiss Topo; any connector
    # failure is logged and treated as "not found".
    try:
        connector = SwissTopoMapServerConnector()
        gd = await connector.get_gemeinde_by_name(lookup_name)
    except Exception as e:
        logger.error(f"Error fetching Gemeinde '{gemeinde_name}' from Swiss Topo: {e}", exc_info=True)
        return None
    if not gd:
        logger.warning(f"Gemeinde '{gemeinde_name}' not found in Swiss Topo")
        return None

    def find_gemeinde_by_bfs_nummer(bfs_nummer: str) -> Optional[Any]:
        # Scan all Gemeinden of this mandate and compare the BFS number stored
        # as JSON inside each record's kontextInformationen entries.
        try:
            gemeinden = interface.getGemeinden(recordFilter={"mandateId": mandateId})
            for g in gemeinden:
                for k in (g.kontextInformationen or []):
                    try:
                        # inhalt may be a JSON string or an already-parsed object.
                        data = json.loads(k.inhalt) if isinstance(k.inhalt, str) else k.inhalt
                        if isinstance(data, dict) and str(data.get("bfs_nummer")) == str(bfs_nummer):
                            return g
                    except (json.JSONDecodeError, AttributeError):
                        # Malformed context entry: skip it, keep scanning.
                        continue
        except Exception as ex:
            logger.error(f"Error finding Gemeinde by BFS {bfs_nummer}: {ex}", exc_info=True)
        return None

    # Dedup by BFS number: if the Gemeinde is already stored, return it as-is.
    existing = find_gemeinde_by_bfs_nummer(str(gd["bfs_nummer"]))
    if existing:
        logger.info(f"Gemeinde '{gd['name']}' already in DB")
        return existing

    # Resolve (or lazily create) the parent Kanton record; a failure here is
    # non-fatal — the Gemeinde is then created with id_kanton=None.
    kanton_abk = gd.get("kanton")
    kanton_id = None
    if kanton_abk:
        kantone = interface.getKantone(recordFilter={"mandateId": mandateId, "abk": kanton_abk})
        if kantone:
            kanton_id = kantone[0].id
        else:
            try:
                # Fall back to the abbreviation itself when no full name is known.
                kanton_label = KANTON_NAMES.get(kanton_abk, kanton_abk)
                kanton = Kanton(
                    mandateId=mandateId,
                    featureInstanceId=instanceId,
                    label=kanton_label,
                    abk=kanton_abk,
                )
                created_k = interface.createKanton(kanton)
                if created_k and created_k.id:
                    kanton_id = created_k.id
            except Exception as ex:
                logger.error(f"Error creating Kanton {kanton_abk}: {ex}")

    # Create the Gemeinde record; the BFS number is persisted as a JSON context
    # entry so find_gemeinde_by_bfs_nummer can dedup future calls.
    try:
        gemeinde = Gemeinde(
            mandateId=mandateId,
            featureInstanceId=instanceId,
            label=gd["name"],
            id_kanton=kanton_id,
            kontextInformationen=[
                Kontext(thema="BFS Nummer", inhalt=json.dumps({"bfs_nummer": gd["bfs_nummer"]}, ensure_ascii=False))
            ],
        )
        created = interface.createGemeinde(gemeinde)
        if created and created.id:
            logger.info(f"Created single Gemeinde '{gd['name']}' (BFS {gd['bfs_nummer']})")
            return created
    except Exception as ex:
        logger.error(f"Error creating Gemeinde '{gd['name']}': {ex}", exc_info=True)
    # Reached on creation failure or when the created record has no id.
    return None
|
|
|
|
|
|
def _extract_quelle(doc: Any) -> Optional[str]:
|
|
"""Extract quelle (source URL) from a document."""
|
|
return getattr(doc, "quelle", None) or (doc.get("quelle") if isinstance(doc, dict) else None)
|
|
|
|
|
|
async def fetch_bzo_for_gemeinde(
    interface: Any,
    componentInterface: Any,
    gemeinde: Any,
    mandateId: str,
    instanceId: str,
) -> bool:
    """
    Search for and download BZO documents for a single Gemeinde.
    Returns True if at least one document was created (or a complete BZO
    document already exists), False otherwise.
    Deduplication: re-fetches Gemeinde, skips if BZO exists, skips URLs we already have,
    creates at most 1 new document per call to avoid duplicates from multiple Tavily URLs.
    """
    logger.info(f"fetch_bzo_for_gemeinde: starting for {gemeinde.label} (id={gemeinde.id})")
    # Re-fetch Gemeinde to get latest dokumente (avoid race with concurrent requests)
    fresh = interface.getGemeinde(gemeinde.id)
    if not fresh:
        logger.warning(f"fetch_bzo_for_gemeinde: Gemeinde {gemeinde.id} not found after refresh")
        return False
    gemeinde = fresh

    # Scan existing documents: collect known source URLs and detect whether a
    # complete BZO document (type or label match, with a file reference) exists.
    existing_bzo = False
    existing_quellen: Set[str] = set()
    if gemeinde.dokumente:
        for doc in gemeinde.dokumente:
            # Documents may be model objects or plain dicts; support both.
            typ = getattr(doc, "dokumentTyp", None) or (doc.get("dokumentTyp") if isinstance(doc, dict) else None)
            label = getattr(doc, "label", None) or (doc.get("label") if isinstance(doc, dict) else None)
            q = _extract_quelle(doc)
            if q:
                existing_quellen.add(q)
            # Match by document type first; only counts when the full record
            # actually references stored file data (dokumentReferenz set).
            if typ in [DokumentTyp.GEMEINDE_BZO_AKTUELL, DokumentTyp.GEMEINDE_BZO_REVISION]:
                doc_id = doc.id if hasattr(doc, "id") else doc.get("id")
                full = interface.getDokument(doc_id) if doc_id else None
                if full and full.dokumentReferenz:
                    existing_bzo = True
                    break
            # Fallback: match by BZO-like label in any supported language.
            if label and any(x in (label or "").upper() for x in ("BZO", "BAU UND ZONENORDNUNG", "PLAN D'AMÉNAGEMENT", "RÈGLEMENT DE CONSTRUCTION", "PIANO DI", "REGOLAMENTO EDILIZIO")):
                doc_id = doc.id if hasattr(doc, "id") else doc.get("id")
                full = interface.getDokument(doc_id) if doc_id else None
                if full and full.dokumentReferenz:
                    existing_bzo = True
                    break
    if existing_bzo:
        logger.info(f"fetch_bzo_for_gemeinde: {gemeinde.label} already has BZO document(s), skipping")
        return True

    # Determine search language from the Gemeinde's canton (default "de").
    kanton_abk = None
    if gemeinde.id_kanton:
        k = interface.getKanton(gemeinde.id_kanton)
        if k:
            kanton_abk = k.abk
    language = _get_language_from_kanton(kanton_abk)
    search_query = _get_bzo_search_query(gemeinde.label, language)

    logger.info(f"Tavily BZO search for {gemeinde.label}: {search_query}")
    tavily = AiTavily()
    gemeinde_normalized = _normalize_gemeinde_for_match(gemeinde.label or "")

    # NOTE(review): calls the connector's underscore-prefixed _search method —
    # presumably intended for internal use; confirm it is a stable entry point.
    search_results = await tavily._search(
        query=search_query,
        maxResults=10,
        country="switzerland",
        excludeDomains=_EXCLUDE_BZO_DOMAINS,
    )
    if not search_results:
        logger.warning(f"No Tavily search results for BZO of {gemeinde.label}")
        return False
    logger.info(f"Tavily returned {len(search_results)} results for BZO of {gemeinde.label}")

    # Filter: ONLY keep PDF URLs that are the actual BZO ordinance (not articles/project docs)
    def _is_valid_bzo_result(url: str, title: str) -> bool:
        combined = f"{url} {title}".lower()
        combined_norm = _normalize_gemeinde_for_match(combined)
        # 1. Gemeinde name MUST appear in URL or title
        if not gemeinde_normalized or gemeinde_normalized not in combined_norm:
            return False
        # 2. MUST contain BZO ordinance keyword (actual regulation, not just "about" it)
        if not any(kw in combined for kw in _BZO_ORDINANCE_KEYWORDS):
            return False
        # 3. EXCLUDE if it looks like an article or project planning doc
        if any(kw in combined for kw in _BZO_ARTICLE_PROJECT_KEYWORDS):
            return False
        return True

    # Keep only results whose URL looks like a PDF and passes the BZO filter.
    pdf_urls = [
        r.url
        for r in search_results
        if (r.url.lower().endswith(".pdf") or "/pdf" in r.url.lower())
        and _is_valid_bzo_result(r.url, r.title or "")
    ]

    # If Tavily returned nothing useful, try known direct PDF URLs (Uster, Schlieren, etc.)
    gemeinde_key = _normalize_gemeinde_for_match(gemeinde.label or "")
    gemeinde_key_alt = gemeinde.label.strip().lower() if gemeinde.label else ""
    if not pdf_urls and (gemeinde_key in KNOWN_BZO_PDF_URLS or gemeinde_key_alt in KNOWN_BZO_PDF_URLS):
        key = gemeinde_key if gemeinde_key in KNOWN_BZO_PDF_URLS else gemeinde_key_alt
        pdf_urls = list(KNOWN_BZO_PDF_URLS[key])
        logger.info(f"Using known BZO PDF URL for {gemeinde.label} (no Tavily matches)")

    if not pdf_urls:
        logger.warning(
            f"No PDF URLs with matching Gemeinde name for {gemeinde.label} "
            f"(filtered {len(search_results)} results, requiring .pdf and name in URL/title)"
        )
        return False

    # Prepend known direct PDF URL when available (avoids SSL/HTML issues with Tavily results)
    if gemeinde_key and gemeinde_key in KNOWN_BZO_PDF_URLS:
        known_urls = KNOWN_BZO_PDF_URLS[gemeinde_key]
        pdf_urls = list(known_urls) + [u for u in pdf_urls if u not in known_urls]
        logger.info(f"Preferring known BZO PDF URL for {gemeinde.label}")

    # Use ssl.CERT_NONE to avoid CERTIFICATE_VERIFY_FAILED on Windows/corporate environments
    # (same approach as routeRealEstate for external HTTP requests)
    # NOTE(review): this disables certificate verification for all downloads here.
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE
    connector = aiohttp.TCPConnector(ssl=ssl_context)
    # Use Accept: application/pdf first to encourage direct PDF delivery (e.g. uster.ch)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "application/pdf,application/octet-stream,*/*",
    }
    timeout = aiohttp.ClientTimeout(total=30)

    def _extract_document_url_from_html(html_bytes: bytes, base_url: str) -> Optional[str]:
        """Extract document/PDF URL from HTML (e.g. uster.ch 'Weiter' page).

        Tries three href/action patterns in priority order and resolves the
        first hit against base_url; returns None when nothing usable is found.
        """
        try:
            text = html_bytes.decode("utf-8", errors="ignore")
            # Swiss municipal doc systems: _rtr/dokument_xxx, .pdf links, or _docn redirect targets
            for pat in (
                r'href=["\']([^"\']*(?:/_rtr/dokument[_\w]*|dokument_\d+)[^"\']*)["\']',
                r'href=["\']([^"\']+\.pdf(?:\?[^"\']*)?)["\']',
                r'action=["\']([^"\']+\.pdf[^"\']*)["\']',
            ):
                m = re.search(pat, text, re.I)
                if m:
                    raw = m.group(1).strip()
                    # Skip fragment-only and javascript: pseudo-links.
                    if raw and not raw.startswith("#") and not raw.lower().startswith("javascript:"):
                        next_url = urljoin(base_url, raw)
                        parsed = urlparse(next_url)
                        # Only accept fully-qualified absolute URLs.
                        if parsed.netloc and parsed.scheme:
                            return next_url
        except Exception:
            # Best-effort extraction: any parsing failure means "no link found".
            pass
        return None

    async def download_pdf(
        session: aiohttp.ClientSession, url: str, _followed_from_html: bool = False
    ) -> Optional[bytes]:
        """Download a PDF with up to 3 attempts, following one HTML indirection.

        Raises on final failure; _followed_from_html prevents recursing more
        than one level when an HTML page links to the actual document.
        """
        for attempt in range(3):
            try:
                async with session.get(url, allow_redirects=True) as resp:
                    if resp.status == 200:
                        data = await resp.read()
                        # Accept only real PDF payloads (magic bytes, >= 100 bytes).
                        if data and len(data) >= 100 and data.startswith(b"%PDF"):
                            return data
                        # Got HTML instead: try to extract the real document link once.
                        if (data.startswith(b"<") or data.startswith(b"<!DOCTYPE")) and not _followed_from_html:
                            fallback = _extract_document_url_from_html(data, url)
                            if fallback and fallback != url:
                                logger.debug(f"HTML from {url[:60]}..., following link to document")
                                return await download_pdf(session, fallback, _followed_from_html=True)
                        raise Exception("Server returned HTML instead of PDF")
                    elif resp.status == 406 and attempt < 2:
                        # 406 Not Acceptable: back off and retry with same headers.
                        await asyncio.sleep(2)
                        continue
                    else:
                        raise Exception(f"HTTP {resp.status}")
            except Exception:
                # Re-raise on the last attempt; otherwise back off and retry.
                if attempt >= 2:
                    raise
                await asyncio.sleep(2)
        return None

    created_dokumente: List[Any] = []
    current_dokumente = list(gemeinde.dokumente) if gemeinde.dokumente else []
    # Filesystem-safe file name derived from the Gemeinde label.
    safe_name = "".join(c for c in gemeinde.label if c.isalnum() or c in (" ", "-", "_")).strip().replace(" ", "_") or "Gemeinde"
    base_label = f"BZO {gemeinde.label}" if language == "de" else (f"Plan d'aménagement local {gemeinde.label}" if language == "fr" else f"Piano di utilizzazione {gemeinde.label}")

    # Track content hashes to avoid duplicate PDFs from different URLs
    seen_content_hashes: Set[str] = set()

    async with aiohttp.ClientSession(timeout=timeout, headers=headers, connector=connector) as session:
        # Try at most the first 5 candidate URLs. NOTE(review): idx is unused.
        for idx, pdf_url in enumerate(pdf_urls[:5]):
            # Skip URL we already have
            if pdf_url in existing_quellen:
                logger.debug(f"Skipping duplicate URL for {gemeinde.label}: {pdf_url[:60]}...")
                continue
            try:
                pdf_content = await download_pdf(session, pdf_url)
                if not pdf_content or len(pdf_content) < 100:
                    continue
                # Deduplicate by content hash (same PDF from different URLs);
                # only the first 8 KiB are hashed as a cheap fingerprint.
                content_hash = hashlib.sha256(pdf_content[:8192]).hexdigest()
                if content_hash in seen_content_hashes:
                    logger.debug(f"Skipping duplicate content for {gemeinde.label} (hash match)")
                    continue
                seen_content_hashes.add(content_hash)

                # Persist the file, then create the Dokument record pointing at it.
                file_name = f"BZO_{safe_name}.pdf"
                doc_label = base_label
                file_item = componentInterface.createFile(name=file_name, mimeType="application/pdf", content=pdf_content)
                componentInterface.createFileData(file_item.id, pdf_content)
                dokument = Dokument(
                    mandateId=mandateId,
                    featureInstanceId=instanceId,
                    label=doc_label,
                    versionsbezeichnung="Aktuell",
                    dokumentTyp=DokumentTyp.GEMEINDE_BZO_AKTUELL,
                    dokumentReferenz=file_item.id,
                    quelle=pdf_url,
                    mimeType="application/pdf",
                    kategorienTags=["BZO", "Bauordnung", gemeinde.label],
                )
                created_dok = interface.createDokument(dokument)
                created_dokumente.append(created_dok)
                current_dokumente.append(created_dok)
                existing_quellen.add(pdf_url)
                # Create at most 1 BZO document per Gemeinde to prevent duplicates
                logger.info(f"Created BZO document for {gemeinde.label}, stopping (1 doc per Gemeinde)")
                break
            except Exception as ex:
                # Per-URL failures are non-fatal; move on to the next candidate.
                logger.warning(f"Error downloading BZO for {gemeinde.label} from {pdf_url}: {ex}")
                continue

    # Attach the new document(s) to the Gemeinde record.
    if created_dokumente:
        interface.updateGemeinde(gemeinde.id, {"dokumente": current_dokumente})
        logger.info(f"Created {len(created_dokumente)} BZO document(s) for {gemeinde.label}")
        return True
    return False
|