""" Gemeinde and BZO document services for Real Estate feature. Provides ensure/import logic used by both routes and extract_bzo_information. """ import asyncio import hashlib import json import logging import re import ssl from typing import Any, Dict, List, Optional, Set from urllib.parse import urljoin, urlparse import aiohttp from .datamodelFeatureRealEstate import Gemeinde, Kanton, Dokument, DokumentTyp, Kontext from modules.connectors.connectorSwissTopoMapServer import SwissTopoMapServerConnector from modules.aicore.aicorePluginTavily import AiTavily logger = logging.getLogger(__name__) KANTON_NAMES = { "AG": "Aargau", "AI": "Appenzell Innerrhoden", "AR": "Appenzell Ausserrhoden", "BE": "Bern", "BL": "Basel-Landschaft", "BS": "Basel-Stadt", "FR": "Freiburg", "GE": "Genf", "GL": "Glarus", "GR": "Graubünden", "JU": "Jura", "LU": "Luzern", "NE": "Neuenburg", "NW": "Nidwalden", "OW": "Obwalden", "SG": "St. Gallen", "SH": "Schaffhausen", "SO": "Solothurn", "SZ": "Schwyz", "TG": "Thurgau", "TI": "Tessin", "UR": "Uri", "VD": "Waadt", "VS": "Wallis", "ZG": "Zug", "ZH": "Zürich", } # Quartier/place names -> politische Gemeinde (Swiss Topo geocoding returns quarter names) # Prevents wrong matches like "Enge" -> Martherenges instead of Zürich QUARTIER_TO_GEMEINDE: Dict[str, str] = { "enge": "Zürich", # Kreis 2 Enge (Zürich) "aussersihl": "Zürich", "wiedikon": "Zürich", } # Known direct BZO PDF URLs for municipalities (by normalized name, lowercase) # Used when Tavily returns no matching PDFs; avoids SSL/HTML issues with Tavily results # Uster: _docn shows HTML "Erlass ausser Kraft" page; _rtr/dokument_xxx serves the actual PDF KNOWN_BZO_PDF_URLS: Dict[str, List[str]] = { "schlieren": ["https://www.schlieren.ch/_docn/6239470/SKR_10.10_Bauordnung.pdf"], "uster": [ "https://www.uster.ch/_rtr/dokument_3619802", # Direct document (PDF) "https://www.uster.ch/_docn/3619802/Bau-und-Zonenordnung-teilrevidiert-2021.pdf", # May return HTML first ], "zürich": ["https://www.stadt-zuerich.ch/content/dam/stzh/portal/Deutsch/AmtlicheSammlung/Erlasse/700/100/700.100%20Bau-%20und%20Zonenordnung%20V2.pdf"], "zurich": ["https://www.stadt-zuerich.ch/content/dam/stzh/portal/Deutsch/AmtlicheSammlung/Erlasse/700/100/700.100%20Bau-%20und%20Zonenordnung%20V2.pdf"], "zuerich": ["https://www.stadt-zuerich.ch/content/dam/stzh/portal/Deutsch/AmtlicheSammlung/Erlasse/700/100/700.100%20Bau-%20und%20Zonenordnung%20V2.pdf"], } def _get_language_from_kanton(kanton_abk: Optional[str]) -> str: if not kanton_abk: return "de" if kanton_abk.upper() in {"VD", "GE", "NE", "JU"}: return "fr" if kanton_abk.upper() == "TI": return "it" return "de" # Swiss news/media domains to exclude from BZO search (return HTML articles, not PDFs) _EXCLUDE_BZO_DOMAINS = [ "limmattalerzeitung.ch", "20min.ch", "tagesanzeiger.ch", "nzz.ch", "blick.ch", "watson.ch", "srf.ch", "swissinfo.ch", "zukunft-schlieren.ch", # project/development site, not official BZO ] # Keywords that indicate the actual BZO regulation document (at least one required in URL/title) _BZO_ORDINANCE_KEYWORDS = ( "bzo", "zonenordnung", "bauordnung", "bau-und-zonenordnung", "bau und zonenordnung", "plan d'aménagement", "règlement de construction", "piano di utilizzazione", "regolamento edilizio", ) # Keywords that indicate articles or project docs (exclude if present in URL/title) _BZO_ARTICLE_PROJECT_KEYWORDS = ( "ld.", # article ID (e.g. ld.2805321) "warum", # "why" - typical in article headlines "ruft ", # "calls [population to participate]" "artikel", # article "news", "projektplanung", # project planning "projekt/", # URL path for project pages "/projekt", "entwicklungsplan", # development plan (project doc) ) def _normalize_gemeinde_for_match(name: str) -> str: """Normalize Gemeinde name for URL/title matching (lowercase, no umlauts).""" if not name: return "" s = name.lower().strip() s = s.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue").replace("ß", "ss") s = "".join(c for c in s if c.isalnum()) return s def _get_bzo_search_query(gemeinde_label: str, language: str) -> str: """Build search query targeting BZO PDF documents (not articles).""" if language == "fr": return f"Plan d'aménagement local {gemeinde_label} PDF" if language == "it": return f"Piano di utilizzazione {gemeinde_label} PDF" return f"Bau und Zonenordnung {gemeinde_label} PDF" async def ensure_single_gemeinde( interface: Any, mandateId: str, instanceId: str, gemeinde_name: str, ) -> Optional[Any]: """ Ensure the given Gemeinde exists in DB. Fetches ONLY that one Gemeinde from Swiss Topo and creates it if not found. No bulk import. Returns the Gemeinde object if found/created, None otherwise. """ if not gemeinde_name or not gemeinde_name.strip(): return None # Resolve Quartier/place names to politische Gemeinde (e.g. Enge -> Zürich) lookup_name = gemeinde_name.strip() quartier_key = _normalize_gemeinde_for_match(lookup_name) if quartier_key and quartier_key in QUARTIER_TO_GEMEINDE: lookup_name = QUARTIER_TO_GEMEINDE[quartier_key] logger.debug(f"Mapped Quartier '{gemeinde_name}' -> Gemeinde '{lookup_name}'") try: connector = SwissTopoMapServerConnector() gd = await connector.get_gemeinde_by_name(lookup_name) except Exception as e: logger.error(f"Error fetching Gemeinde '{gemeinde_name}' from Swiss Topo: {e}", exc_info=True) return None if not gd: logger.warning(f"Gemeinde '{gemeinde_name}' not found in Swiss Topo") return None def find_gemeinde_by_bfs_nummer(bfs_nummer: str) -> Optional[Any]: try: gemeinden = interface.getGemeinden(recordFilter={"mandateId": mandateId}) for g in gemeinden: for k in (g.kontextInformationen or []): try: data = json.loads(k.inhalt) if isinstance(k.inhalt, str) else k.inhalt if isinstance(data, dict) and str(data.get("bfs_nummer")) == str(bfs_nummer): return g except (json.JSONDecodeError, AttributeError): continue except Exception as ex: logger.error(f"Error finding Gemeinde by BFS {bfs_nummer}: {ex}", exc_info=True) return None existing = find_gemeinde_by_bfs_nummer(str(gd["bfs_nummer"])) if existing: logger.info(f"Gemeinde '{gd['name']}' already in DB") return existing kanton_abk = gd.get("kanton") kanton_id = None if kanton_abk: kantone = interface.getKantone(recordFilter={"mandateId": mandateId, "abk": kanton_abk}) if kantone: kanton_id = kantone[0].id else: try: kanton_label = KANTON_NAMES.get(kanton_abk, kanton_abk) kanton = Kanton( mandateId=mandateId, featureInstanceId=instanceId, label=kanton_label, abk=kanton_abk, ) created_k = interface.createKanton(kanton) if created_k and created_k.id: kanton_id = created_k.id except Exception as ex: logger.error(f"Error creating Kanton {kanton_abk}: {ex}") try: gemeinde = Gemeinde( mandateId=mandateId, featureInstanceId=instanceId, label=gd["name"], id_kanton=kanton_id, kontextInformationen=[ Kontext(thema="BFS Nummer", inhalt=json.dumps({"bfs_nummer": gd["bfs_nummer"]}, ensure_ascii=False)) ], ) created = interface.createGemeinde(gemeinde) if created and created.id: logger.info(f"Created single Gemeinde '{gd['name']}' (BFS {gd['bfs_nummer']})") return created except Exception as ex: logger.error(f"Error creating Gemeinde '{gd['name']}': {ex}", exc_info=True) return None def _extract_quelle(doc: Any) -> Optional[str]: """Extract quelle (source URL) from a document.""" return getattr(doc, "quelle", None) or (doc.get("quelle") if isinstance(doc, dict) else None) async def fetch_bzo_for_gemeinde( interface: Any, componentInterface: Any, gemeinde: Any, mandateId: str, instanceId: str, ) -> bool: """ Search for and download BZO documents for a single Gemeinde. Returns True if at least one document was created. Deduplication: re-fetches Gemeinde, skips if BZO exists, skips URLs we already have, creates at most 1 new document per call to avoid duplicates from multiple Tavily URLs. """ logger.info(f"fetch_bzo_for_gemeinde: starting for {gemeinde.label} (id={gemeinde.id})") # Re-fetch Gemeinde to get latest dokumente (avoid race with concurrent requests) fresh = interface.getGemeinde(gemeinde.id) if not fresh: logger.warning(f"fetch_bzo_for_gemeinde: Gemeinde {gemeinde.id} not found after refresh") return False gemeinde = fresh existing_bzo = False existing_quellen: Set[str] = set() if gemeinde.dokumente: for doc in gemeinde.dokumente: typ = getattr(doc, "dokumentTyp", None) or (doc.get("dokumentTyp") if isinstance(doc, dict) else None) label = getattr(doc, "label", None) or (doc.get("label") if isinstance(doc, dict) else None) q = _extract_quelle(doc) if q: existing_quellen.add(q) if typ in [DokumentTyp.GEMEINDE_BZO_AKTUELL, DokumentTyp.GEMEINDE_BZO_REVISION]: doc_id = doc.id if hasattr(doc, "id") else doc.get("id") full = interface.getDokument(doc_id) if doc_id else None if full and full.dokumentReferenz: existing_bzo = True break if label and any(x in (label or "").upper() for x in ("BZO", "BAU UND ZONENORDNUNG", "PLAN D'AMÉNAGEMENT", "RÈGLEMENT DE CONSTRUCTION", "PIANO DI", "REGOLAMENTO EDILIZIO")): doc_id = doc.id if hasattr(doc, "id") else doc.get("id") full = interface.getDokument(doc_id) if doc_id else None if full and full.dokumentReferenz: existing_bzo = True break if existing_bzo: logger.info(f"fetch_bzo_for_gemeinde: {gemeinde.label} already has BZO document(s), skipping") return True kanton_abk = None if gemeinde.id_kanton: k = interface.getKanton(gemeinde.id_kanton) if k: kanton_abk = k.abk language = _get_language_from_kanton(kanton_abk) search_query = _get_bzo_search_query(gemeinde.label, language) logger.info(f"Tavily BZO search for {gemeinde.label}: {search_query}") tavily = AiTavily() gemeinde_normalized = _normalize_gemeinde_for_match(gemeinde.label or "") search_results = await tavily._search( query=search_query, maxResults=10, country="switzerland", excludeDomains=_EXCLUDE_BZO_DOMAINS, ) if not search_results: logger.warning(f"No Tavily search results for BZO of {gemeinde.label}") return False logger.info(f"Tavily returned {len(search_results)} results for BZO of {gemeinde.label}") # Filter: ONLY keep PDF URLs that are the actual BZO ordinance (not articles/project docs) def _is_valid_bzo_result(url: str, title: str) -> bool: combined = f"{url} {title}".lower() combined_norm = _normalize_gemeinde_for_match(combined) # 1. Gemeinde name MUST appear in URL or title if not gemeinde_normalized or gemeinde_normalized not in combined_norm: return False # 2. MUST contain BZO ordinance keyword (actual regulation, not just "about" it) if not any(kw in combined for kw in _BZO_ORDINANCE_KEYWORDS): return False # 3. EXCLUDE if it looks like an article or project planning doc if any(kw in combined for kw in _BZO_ARTICLE_PROJECT_KEYWORDS): return False return True pdf_urls = [ r.url for r in search_results if (r.url.lower().endswith(".pdf") or "/pdf" in r.url.lower()) and _is_valid_bzo_result(r.url, r.title or "") ] # If Tavily returned nothing useful, try known direct PDF URLs (Uster, Schlieren, etc.) gemeinde_key = _normalize_gemeinde_for_match(gemeinde.label or "") gemeinde_key_alt = gemeinde.label.strip().lower() if gemeinde.label else "" if not pdf_urls and (gemeinde_key in KNOWN_BZO_PDF_URLS or gemeinde_key_alt in KNOWN_BZO_PDF_URLS): key = gemeinde_key if gemeinde_key in KNOWN_BZO_PDF_URLS else gemeinde_key_alt pdf_urls = list(KNOWN_BZO_PDF_URLS[key]) logger.info(f"Using known BZO PDF URL for {gemeinde.label} (no Tavily matches)") if not pdf_urls: logger.warning( f"No PDF URLs with matching Gemeinde name for {gemeinde.label} " f"(filtered {len(search_results)} results, requiring .pdf and name in URL/title)" ) return False # Prepend known direct PDF URL when available (avoids SSL/HTML issues with Tavily results) if gemeinde_key and gemeinde_key in KNOWN_BZO_PDF_URLS: known_urls = KNOWN_BZO_PDF_URLS[gemeinde_key] pdf_urls = list(known_urls) + [u for u in pdf_urls if u not in known_urls] logger.info(f"Preferring known BZO PDF URL for {gemeinde.label}") # Use ssl.CERT_NONE to avoid CERTIFICATE_VERIFY_FAILED on Windows/corporate environments # (same approach as routeRealEstate for external HTTP requests) ssl_context = ssl.create_default_context() ssl_context.check_hostname = False ssl_context.verify_mode = ssl.CERT_NONE connector = aiohttp.TCPConnector(ssl=ssl_context) # Use Accept: application/pdf first to encourage direct PDF delivery (e.g. uster.ch) headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Accept": "application/pdf,application/octet-stream,*/*", } timeout = aiohttp.ClientTimeout(total=30) def _extract_document_url_from_html(html_bytes: bytes, base_url: str) -> Optional[str]: """Extract document/PDF URL from HTML (e.g. uster.ch 'Weiter' page).""" try: text = html_bytes.decode("utf-8", errors="ignore") # Swiss municipal doc systems: _rtr/dokument_xxx, .pdf links, or _docn redirect targets for pat in ( r'href=["\']([^"\']*(?:/_rtr/dokument[_\w]*|dokument_\d+)[^"\']*)["\']', r'href=["\']([^"\']+\.pdf(?:\?[^"\']*)?)["\']', r'action=["\']([^"\']+\.pdf[^"\']*)["\']', ): m = re.search(pat, text, re.I) if m: raw = m.group(1).strip() if raw and not raw.startswith("#") and not raw.lower().startswith("javascript:"): next_url = urljoin(base_url, raw) parsed = urlparse(next_url) if parsed.netloc and parsed.scheme: return next_url except Exception: pass return None async def download_pdf( session: aiohttp.ClientSession, url: str, _followed_from_html: bool = False ) -> Optional[bytes]: for attempt in range(3): try: async with session.get(url, allow_redirects=True) as resp: if resp.status == 200: data = await resp.read() if data and len(data) >= 100 and data.startswith(b"%PDF"): return data if (data.startswith(b"<") or data.startswith(b"= 2: raise await asyncio.sleep(2) return None created_dokumente: List[Any] = [] current_dokumente = list(gemeinde.dokumente) if gemeinde.dokumente else [] safe_name = "".join(c for c in gemeinde.label if c.isalnum() or c in (" ", "-", "_")).strip().replace(" ", "_") or "Gemeinde" base_label = f"BZO {gemeinde.label}" if language == "de" else (f"Plan d'aménagement local {gemeinde.label}" if language == "fr" else f"Piano di utilizzazione {gemeinde.label}") # Track content hashes to avoid duplicate PDFs from different URLs seen_content_hashes: Set[str] = set() async with aiohttp.ClientSession(timeout=timeout, headers=headers, connector=connector) as session: for idx, pdf_url in enumerate(pdf_urls[:5]): # Skip URL we already have if pdf_url in existing_quellen: logger.debug(f"Skipping duplicate URL for {gemeinde.label}: {pdf_url[:60]}...") continue try: pdf_content = await download_pdf(session, pdf_url) if not pdf_content or len(pdf_content) < 100: continue # Deduplicate by content hash (same PDF from different URLs) content_hash = hashlib.sha256(pdf_content[:8192]).hexdigest() if content_hash in seen_content_hashes: logger.debug(f"Skipping duplicate content for {gemeinde.label} (hash match)") continue seen_content_hashes.add(content_hash) file_name = f"BZO_{safe_name}.pdf" doc_label = base_label file_item = componentInterface.createFile(name=file_name, mimeType="application/pdf", content=pdf_content) componentInterface.createFileData(file_item.id, pdf_content) dokument = Dokument( mandateId=mandateId, featureInstanceId=instanceId, label=doc_label, versionsbezeichnung="Aktuell", dokumentTyp=DokumentTyp.GEMEINDE_BZO_AKTUELL, dokumentReferenz=file_item.id, quelle=pdf_url, mimeType="application/pdf", kategorienTags=["BZO", "Bauordnung", gemeinde.label], ) created_dok = interface.createDokument(dokument) created_dokumente.append(created_dok) current_dokumente.append(created_dok) existing_quellen.add(pdf_url) # Create at most 1 BZO document per Gemeinde to prevent duplicates logger.info(f"Created BZO document for {gemeinde.label}, stopping (1 doc per Gemeinde)") break except Exception as ex: logger.warning(f"Error downloading BZO for {gemeinde.label} from {pdf_url}: {ex}") continue if created_dokumente: interface.updateGemeinde(gemeinde.id, {"dokumente": current_dokumente}) logger.info(f"Created {len(created_dokumente)} BZO document(s) for {gemeinde.label}") return True return False