feat: fetch oereb kataster and return Wohnzone

Ida Dittrich 2026-01-26 07:14:56 +01:00
parent 0d1c30d4c6
commit fb6bac7d92
7 changed files with 3343 additions and 0 deletions

View file

@@ -0,0 +1,525 @@
"""
ÖREB WFS Connector
This connector handles interactions with ÖREB (Öffentlich-rechtliche Eigentumsbeschränkungen)
WFS services for zone information retrieval.
ÖREB provides zoning information (Bauzonen) through WFS services.
"""
import logging
from typing import Dict, List, Any, Optional
import aiohttp
import xml.etree.ElementTree as ET
from shapely.geometry import Polygon
logger = logging.getLogger(__name__)
class OerebWfsConnector:
"""
Connector for ÖREB WFS services.
Provides methods for:
- Querying zone information (Bauzonen) by parcel geometry
- Retrieving zoning data from canton-specific WFS services
"""
def __init__(
self,
timeout: int = 10,
max_retries: int = 3,
retry_delay: float = 1.0
):
"""
Initialize ÖREB WFS connector.
Args:
timeout: Request timeout in seconds
max_retries: Maximum number of retry attempts
retry_delay: Initial retry delay in seconds (exponential backoff)
"""
self.timeout = aiohttp.ClientTimeout(total=timeout)
self.max_retries = max_retries
self.retry_delay = retry_delay
self._wfs_cache: Dict[str, List[Dict[str, Any]]] = {} # Cache for WFS queries by bbox
logger.info("ÖREB WFS Connector initialized")
def _get_oereb_wfs_url(self, canton: str) -> Optional[str]:
"""
Get ÖREB WFS service URL for a given canton.
Args:
canton: Canton abbreviation (e.g., "ZH", "BE")
Returns:
WFS service URL or None if canton not supported
"""
oereb_wfs_urls = {
"ZH": "https://maps.zh.ch/wfs/OerebKatasterZHWFS",
}
return oereb_wfs_urls.get(canton.upper())
def _geometry_to_shapely_polygon(self, geometry: Dict[str, Any]) -> Optional[Polygon]:
"""
Convert parcel geometry (ESRI rings or GeoJSON coordinates) to Shapely Polygon.
Args:
geometry: Geometry dictionary (ESRI rings or GeoJSON coordinates)
Returns:
Shapely Polygon or None if invalid
"""
try:
# Handle ESRI geometry format (rings)
if "rings" in geometry:
rings = geometry.get("rings", [])
if not rings or not rings[0]:
return None
# Use the first ring (exterior) for the polygon
exterior_ring = rings[0]
if len(exterior_ring) < 3:
return None
# Ensure polygon is closed
coords = list(exterior_ring)
if coords[0] != coords[-1]:
coords.append(coords[0])
return Polygon(coords)
# Handle GeoJSON format (coordinates)
elif "coordinates" in geometry:
coords = geometry.get("coordinates", [])
if not coords:
return None
# Handle Polygon coordinates: [[[x1,y1], [x2,y2], ...]]
# Flatten to get the exterior ring
def extract_exterior(coord_list, depth=0):
if depth == 0 and isinstance(coord_list, list) and len(coord_list) > 0:
# First level might be array of rings, take first one
if isinstance(coord_list[0], list) and len(coord_list[0]) > 0:
if isinstance(coord_list[0][0], list):
# This is Polygon format: [[[x,y],...]]
return extract_exterior(coord_list[0], depth + 1)
elif isinstance(coord_list[0][0], (int, float)):
                                # coord_list is already a ring: [[x,y], ...]; return the whole ring, not its first point
                                return coord_list
elif depth == 1 and isinstance(coord_list, list) and len(coord_list) > 0:
if isinstance(coord_list[0], (int, float)):
return coord_list
elif isinstance(coord_list[0], list):
return coord_list
return coord_list
exterior_coords = extract_exterior(coords)
if not exterior_coords or len(exterior_coords) < 3:
return None
# Ensure polygon is closed
coords_list = list(exterior_coords)
if coords_list[0] != coords_list[-1]:
coords_list.append(coords_list[0])
return Polygon(coords_list)
except Exception as e:
logger.debug(f"Error converting geometry to Shapely Polygon: {e}")
return None
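    # Illustrative input/output sketch for the conversion above (coordinate values invented):
    #   {"rings": [[[2683000, 1247000], [2683050, 1247000], [2683050, 1247050]]]}
    #   -> Shapely Polygon; the ring is closed automatically by appending the first point.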
def _parse_gml_geometry(self, feature_elem: ET.Element) -> Optional[Polygon]:
"""
Parse GML geometry from WFS feature element and convert to Shapely Polygon.
Args:
feature_elem: XML element containing the feature
Returns:
Shapely Polygon or None if geometry not found or invalid
"""
try:
# Common GML namespaces
namespaces = {
'gml': 'http://www.opengis.net/gml',
'gml3': 'http://www.opengis.net/gml/3.2',
'gml32': 'http://www.opengis.net/gml/3.2'
}
# Try to find polygon geometry
polygon_elem = None
for ns_prefix, ns_url in namespaces.items():
# Try different GML polygon element names
for tag_name in ['Polygon', 'polygon', 'PolygonProperty', 'geometryProperty']:
polygon_elem = feature_elem.find(f'.//{{{ns_url}}}{tag_name}')
if polygon_elem is not None:
break
# Also try without namespace prefix
polygon_elem = feature_elem.find(f'.//{tag_name}')
if polygon_elem is not None:
break
if polygon_elem is not None:
break
if polygon_elem is None:
# Try to find any geometry element
for ns_prefix, ns_url in namespaces.items():
polygon_elem = feature_elem.find(f'.//{{{ns_url}}}*')
if polygon_elem is not None and 'polygon' in polygon_elem.tag.lower():
break
if polygon_elem is None:
return None
# Extract coordinates from GML
# GML Polygon typically has exterior ring with posList or pos elements
coords = []
# Try posList (most common in GML 3.2)
for ns_prefix, ns_url in namespaces.items():
pos_list = polygon_elem.find(f'.//{{{ns_url}}}posList')
if pos_list is not None and pos_list.text:
# posList format: "x1 y1 x2 y2 x3 y3 ..."
coord_strings = pos_list.text.strip().split()
for i in range(0, len(coord_strings) - 1, 2):
if i + 1 < len(coord_strings):
x = float(coord_strings[i])
y = float(coord_strings[i + 1])
coords.append((x, y))
break
# If no posList, try pos elements
if not coords:
for ns_prefix, ns_url in namespaces.items():
pos_elems = polygon_elem.findall(f'.//{{{ns_url}}}pos')
if pos_elems:
for pos in pos_elems:
if pos.text:
parts = pos.text.strip().split()
if len(parts) >= 2:
x = float(parts[0])
y = float(parts[1])
coords.append((x, y))
break
# If still no coords, try coordinates element (GML 2)
if not coords:
for ns_prefix, ns_url in namespaces.items():
coords_elem = polygon_elem.find(f'.//{{{ns_url}}}coordinates')
if coords_elem is not None and coords_elem.text:
# GML 2 coordinates format: "x1,y1 x2,y2 ..." or "x1,y1,z1 x2,y2,z2 ..."
coord_strings = coords_elem.text.strip().split()
for coord_str in coord_strings:
parts = coord_str.split(',')
if len(parts) >= 2:
x = float(parts[0])
y = float(parts[1])
coords.append((x, y))
break
if len(coords) < 3:
return None
# Ensure polygon is closed
if coords[0] != coords[-1]:
coords.append(coords[0])
return Polygon(coords)
except Exception as e:
logger.debug(f"Error parsing GML geometry: {e}")
return None
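    # Illustrative GML 3.2 fragment handled above (coordinates invented):
    #   <gml:Polygon><gml:exterior><gml:LinearRing>
    #     <gml:posList>2683000 1247000 2683050 1247000 2683050 1247050</gml:posList>
    #   </gml:LinearRing></gml:exterior></gml:Polygon>
    #   -> [(2683000.0, 1247000.0), (2683050.0, 1247000.0), (2683050.0, 1247050.0)], then closed.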
def _calculate_bbox_from_geometry(self, geometry: Dict[str, Any]) -> Optional[str]:
"""
Calculate bounding box from geometry for WFS queries.
Args:
geometry: Geometry dictionary (ESRI rings or GeoJSON coordinates)
Returns:
Bounding box string in format "min_x,min_y,max_x,max_y" or None if invalid
"""
try:
# Handle ESRI geometry format (rings)
if "rings" in geometry:
rings = geometry.get("rings", [])
if not rings or not rings[0]:
return None
# Flatten all coordinates from all rings
all_coords = []
for ring in rings:
all_coords.extend(ring)
if not all_coords:
return None
# Calculate bbox
x_coords = [coord[0] for coord in all_coords]
y_coords = [coord[1] for coord in all_coords]
min_x = min(x_coords)
min_y = min(y_coords)
max_x = max(x_coords)
max_y = max(y_coords)
return f"{min_x},{min_y},{max_x},{max_y}"
# Handle GeoJSON format (coordinates)
elif "coordinates" in geometry:
coords = geometry.get("coordinates", [])
if not coords:
return None
# Flatten coordinates based on geometry type
def flatten_coords(coord_list, depth=0):
if depth < 2:
result = []
for item in coord_list:
if isinstance(item, (int, float)):
return coord_list
result.extend(flatten_coords(item, depth + 1))
return result
return coord_list
flat_coords = flatten_coords(coords)
if not flat_coords or len(flat_coords) < 2:
return None
x_coords = [flat_coords[i] for i in range(0, len(flat_coords), 2)]
y_coords = [flat_coords[i+1] for i in range(0, len(flat_coords)-1, 2)]
min_x = min(x_coords)
min_y = min(y_coords)
max_x = max(x_coords)
max_y = max(y_coords)
return f"{min_x},{min_y},{max_x},{max_y}"
except Exception as e:
logger.debug(f"Error calculating bbox from geometry: {e}")
return None
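    # Illustrative result (values invented): for a ring spanning x 2683000..2683050
    # and y 1247000..1247050, this returns "2683000,1247000,2683050,1247050".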
async def _query_wfs_get_feature(
self,
wfs_url: str,
type_name: str,
bbox: str,
srs: str = "EPSG:2056"
) -> List[Dict[str, Any]]:
"""
Query WFS GetFeature to retrieve zone features within a bounding box.
Args:
wfs_url: WFS service URL
type_name: Feature type name (e.g., "nutzungsplanung")
bbox: Bounding box string "min_x,min_y,max_x,max_y"
srs: Spatial reference system (default: EPSG:2056 for LV95)
Returns:
List of feature dictionaries with properties and attributes
"""
        # Use WFS 1.1.0 only; this version is known to work with the supported services
params = {
"service": "WFS",
"version": "1.1.0",
"request": "GetFeature",
"typeName": type_name,
"bbox": bbox,
"srsName": srs
}
logger.debug(f"Querying WFS GetFeature: {wfs_url} with typeName={type_name}, bbox={bbox}")
try:
async with aiohttp.ClientSession(timeout=self.timeout) as session:
async with session.get(wfs_url, params=params) as response:
if response.status != 200:
logger.debug(f"WFS GetFeature returned status {response.status}")
return []
# Parse XML/GML response
xml_content = await response.text()
try:
root = ET.fromstring(xml_content)
features = []
members = root.findall('.//{http://www.opengis.net/gml}featureMember') or \
root.findall('.//featureMember')
for member in members:
attrs = {}
# Find feature element
feature_elem = member
for child in member:
if child.tag and ('nutzung' in child.tag.lower() or 'plan' in child.tag.lower()):
feature_elem = child
break
# Extract attributes
for elem in feature_elem.iter():
if elem.tag and elem.text and elem.text.strip():
tag_lower = elem.tag.lower()
if any(term in tag_lower for term in [
'pos', 'coordinates', 'point', 'polygon', 'linestring',
'geometry', 'boundedby', 'envelope', 'gml'
]):
continue
tag_name = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag
if ':' in tag_name:
tag_name = tag_name.split(':')[-1]
if tag_name not in attrs:
attrs[tag_name] = elem.text.strip()
# Parse geometry from GML
geometry_polygon = self._parse_gml_geometry(feature_elem)
if attrs:
feature_dict = {"properties": attrs, "attributes": attrs}
if geometry_polygon:
feature_dict["geometry"] = geometry_polygon
features.append(feature_dict)
return features
except ET.ParseError as e:
logger.debug(f"Failed to parse WFS XML response: {e}")
return []
except Exception as e:
logger.debug(f"Error parsing WFS XML: {e}")
return []
except Exception as e:
logger.debug(f"WFS GetFeature query failed: {e}")
return []
async def query_zone_layer(
self,
egrid: str,
x: float,
y: float,
canton: Optional[str] = None,
geometry: Optional[Dict[str, Any]] = None
) -> List[Dict[str, Any]]:
"""
Query zone information using ÖREB WFS service.
Returns only zones that contain the parcel based on the parcel geometry.
Args:
egrid: EGRID identifier (not currently used but kept for API compatibility)
x: X coordinate (LV95) - not used but kept for compatibility
y: Y coordinate (LV95) - not used but kept for compatibility
canton: Canton abbreviation (e.g., "ZH", "BE")
geometry: Parcel geometry dictionary (ESRI rings or GeoJSON coordinates)
Returns:
List of zone dictionaries with layerBodId and attributes, or empty list if not found
"""
if not canton or not geometry:
return []
wfs_url = self._get_oereb_wfs_url(canton)
if not wfs_url:
return []
try:
bbox = self._calculate_bbox_from_geometry(geometry)
if not bbox:
return []
# Check cache
cache_key = f"{wfs_url}:{bbox}"
if cache_key in self._wfs_cache:
cached_features = self._wfs_cache[cache_key]
else:
cached_features = await self._query_wfs_get_feature(wfs_url, "nutzungsplanung", bbox)
self._wfs_cache[cache_key] = cached_features
if not cached_features:
return []
# Convert parcel geometry to Shapely Polygon for spatial validation
parcel_polygon = self._geometry_to_shapely_polygon(geometry)
if not parcel_polygon:
logger.debug("Could not convert parcel geometry to Shapely Polygon")
# Fallback to first zone if geometry conversion fails
for feature in cached_features:
attrs = feature.get("properties", feature.get("attributes", {}))
typ_gde_abkuerzung = attrs.get("typ_gde_abkuerzung")
if typ_gde_abkuerzung:
return [{
"layerBodId": "oereb_wfs",
"attributes": {"typ_gde_abkuerzung": typ_gde_abkuerzung}
}]
return []
# Find the zone that actually contains or intersects the parcel
# Since a parcel is always in exactly one zone, we check for containment first,
# then find the zone with the largest intersection area if no perfect containment is found
containing_zone = None
best_intersecting_zone = None
best_intersection_area = 0.0
for feature in cached_features:
attrs = feature.get("properties", feature.get("attributes", {}))
typ_gde_abkuerzung = attrs.get("typ_gde_abkuerzung")
if not typ_gde_abkuerzung:
continue
zone_geometry = feature.get("geometry")
if not zone_geometry:
# If geometry not parsed, skip spatial check for this feature
# But keep it as fallback if no geometry-based match is found
if not best_intersecting_zone:
best_intersecting_zone = feature
continue
try:
# Check if zone contains the parcel (most precise)
if zone_geometry.contains(parcel_polygon):
containing_zone = feature
break # Found perfect match, stop searching
# Check if zone intersects the parcel (for border cases)
if zone_geometry.intersects(parcel_polygon):
# Calculate intersection area to find the best match
intersection = zone_geometry.intersection(parcel_polygon)
if not intersection.is_empty:
intersection_area = intersection.area
# Keep the zone with the largest intersection area
if intersection_area > best_intersection_area:
best_intersection_area = intersection_area
best_intersecting_zone = feature
except Exception as e:
logger.debug(f"Error checking spatial relationship: {e}")
# If spatial check fails, keep as fallback
if not best_intersecting_zone:
best_intersecting_zone = feature
# Return the containing zone if found, otherwise the best intersecting zone
selected_feature = containing_zone or best_intersecting_zone
if selected_feature:
attrs = selected_feature.get("properties", selected_feature.get("attributes", {}))
typ_gde_abkuerzung = attrs.get("typ_gde_abkuerzung")
if typ_gde_abkuerzung:
return [{
"layerBodId": "oereb_wfs",
"attributes": {"typ_gde_abkuerzung": typ_gde_abkuerzung}
}]
return []
        except Exception as e:
            logger.debug(f"Error querying zone layer via ÖREB WFS: {e}")
            return []
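# --- Usage sketch (illustrative only; the EGRID, coordinates and geometry below are
# invented placeholder values, and running this performs a live WFS request) ---
if __name__ == "__main__":
    import asyncio

    async def _demo():
        connector = OerebWfsConnector(timeout=10)
        geometry = {
            "rings": [[
                [2683000.0, 1247000.0], [2683050.0, 1247000.0],
                [2683050.0, 1247050.0], [2683000.0, 1247050.0],
                [2683000.0, 1247000.0],
            ]]
        }
        zones = await connector.query_zone_layer(
            egrid="CH000000000000",  # placeholder EGRID
            x=2683025.0,
            y=1247025.0,
            canton="ZH",
            geometry=geometry,
        )
        # e.g. [{"layerBodId": "oereb_wfs", "attributes": {"typ_gde_abkuerzung": "W2"}}] or []
        print(zones)

    asyncio.run(_demo())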

View file

@@ -0,0 +1,199 @@
"""
Document retriever for BZO extraction pipeline.
Queries Dokument table and retrieves PDF content from ComponentObjects.
"""
import logging
from typing import List, Dict, Any, Optional
from modules.datamodels.datamodelRealEstate import Dokument, DokumentTyp, Gemeinde
from modules.interfaces.interfaceDbRealEstateObjects import RealEstateObjects
from modules.interfaces.interfaceDbComponentObjects import ComponentObjects
logger = logging.getLogger(__name__)
class BZODocumentRetriever:
"""Retrieves BZO documents from database and ComponentObjects."""
def __init__(self, realEstateInterface: RealEstateObjects, componentInterface: ComponentObjects):
"""
Initialize document retriever.
Args:
realEstateInterface: Real Estate database interface
componentInterface: ComponentObjects interface for file retrieval
"""
self.realEstateInterface = realEstateInterface
self.componentInterface = componentInterface
def get_documents_by_ids(self, dokument_ids: List[str]) -> List[Dokument]:
"""
Retrieve specific documents by their IDs.
Args:
dokument_ids: List of dokument IDs to retrieve
Returns:
List of Dokument records
"""
try:
dokumente = []
for dokument_id in dokument_ids:
dokument = self.realEstateInterface.getDokument(dokument_id)
if dokument:
dokumente.append(dokument)
else:
logger.warning(f"Dokument {dokument_id} not found")
logger.info(f"Retrieved {len(dokumente)} documents out of {len(dokument_ids)} requested")
return dokumente
except Exception as e:
logger.error(f"Error retrieving documents by IDs: {str(e)}", exc_info=True)
raise
def get_bzo_documents(self, gemeinde_id: Optional[str] = None) -> List[Dokument]:
"""
Query Dokument table for BZO documents.
Args:
gemeinde_id: Optional filter by specific Gemeinde
Returns:
List of Dokument records with BZO document type
"""
try:
# Query each document type separately (database connector doesn't support array filters)
dokumente_aktuell = self.realEstateInterface.getDokumente(
recordFilter={
"mandateId": self.realEstateInterface.mandateId,
"dokumentTyp": DokumentTyp.GEMEINDE_BZO_AKTUELL
}
)
dokumente_revision = self.realEstateInterface.getDokumente(
recordFilter={
"mandateId": self.realEstateInterface.mandateId,
"dokumentTyp": DokumentTyp.GEMEINDE_BZO_REVISION
}
)
# Combine results and deduplicate by ID
dokumente_dict = {}
for dokument in dokumente_aktuell + dokumente_revision:
dokumente_dict[dokument.id] = dokument
dokumente = list(dokumente_dict.values())
            # If gemeinde_id provided, filter by checking Gemeinde.dokumente relationship
            if gemeinde_id:
                filtered_dokumente = []
                # Fetch the Gemeinden once up front instead of re-querying per document
                gemeinden = self.realEstateInterface.getGemeinden(
                    recordFilter={"mandateId": self.realEstateInterface.mandateId}
                )
                for dokument in dokumente:
                    # Check if this dokument is linked to the specified Gemeinde
for gemeinde in gemeinden:
if gemeinde.id == gemeinde_id:
# Check if dokument.id is in gemeinde.dokumente
if gemeinde.dokumente:
for doc in gemeinde.dokumente:
if isinstance(doc, dict) and doc.get("id") == dokument.id:
filtered_dokumente.append(dokument)
break
elif hasattr(doc, "id") and doc.id == dokument.id:
filtered_dokumente.append(dokument)
break
break
dokumente = filtered_dokumente
logger.info(f"Found {len(dokumente)} BZO documents" + (f" for Gemeinde {gemeinde_id}" if gemeinde_id else ""))
return dokumente
except Exception as e:
logger.error(f"Error querying BZO documents: {str(e)}", exc_info=True)
raise
def retrieve_pdf_content(self, dokument: Dokument) -> Optional[bytes]:
"""
Retrieve PDF bytes from ComponentObjects using dokumentReferenz.
Args:
dokument: Dokument record with dokumentReferenz field
Returns:
PDF bytes or None if retrieval fails
"""
try:
if not dokument.dokumentReferenz:
logger.warning(f"Dokument {dokument.id} has no dokumentReferenz")
return None
# Retrieve PDF bytes
pdf_bytes = self.componentInterface.getFileData(dokument.dokumentReferenz)
if not pdf_bytes:
logger.warning(f"Could not retrieve PDF content for file {dokument.dokumentReferenz}")
return None
logger.debug(f"Retrieved PDF content for dokument {dokument.id} ({len(pdf_bytes)} bytes)")
return pdf_bytes
except Exception as e:
logger.error(f"Error retrieving PDF content for dokument {dokument.id}: {str(e)}", exc_info=True)
return None
def resolve_gemeinde_for_dokument(self, dokument: Dokument) -> Optional[str]:
"""
Resolve gemeinde_id for a Dokument by checking Gemeinde.dokumente relationships.
Args:
dokument: Dokument record
Returns:
gemeinde_id if found, None otherwise
"""
try:
gemeinden = self.realEstateInterface.getGemeinden(
recordFilter={"mandateId": self.realEstateInterface.mandateId}
)
for gemeinde in gemeinden:
if gemeinde.dokumente:
for doc in gemeinde.dokumente:
doc_id = doc.id if hasattr(doc, "id") else doc.get("id") if isinstance(doc, dict) else None
if doc_id == dokument.id:
logger.debug(f"Resolved gemeinde_id {gemeinde.id} for dokument {dokument.id}")
return gemeinde.id
logger.warning(f"Could not resolve gemeinde_id for dokument {dokument.id}")
return None
except Exception as e:
logger.error(f"Error resolving gemeinde for dokument {dokument.id}: {str(e)}", exc_info=True)
return None
def extract_year_from_dokument(self, dokument: Dokument) -> Optional[int]:
"""
Extract year from Dokument label or versionsbezeichnung.
Args:
dokument: Dokument record
Returns:
Year as integer if found, None otherwise
"""
import re
# Try to extract year from label
if dokument.label:
year_match = re.search(r'\b(19|20)\d{2}\b', dokument.label)
if year_match:
return int(year_match.group())
# Try to extract year from versionsbezeichnung
if dokument.versionsbezeichnung:
year_match = re.search(r'\b(19|20)\d{2}\b', dokument.versionsbezeichnung)
if year_match:
return int(year_match.group())
return None
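# --- Usage sketch (illustrative only; assumes concrete interface instances are
# provided by the surrounding application) ---
def _demo_retriever(real_estate_if: RealEstateObjects, component_if: ComponentObjects) -> None:
    retriever = BZODocumentRetriever(real_estate_if, component_if)
    for dokument in retriever.get_bzo_documents():
        pdf_bytes = retriever.retrieve_pdf_content(dokument)
        year = retriever.extract_year_from_dokument(dokument)
        gemeinde_id = retriever.resolve_gemeinde_for_dokument(dokument)
        print(dokument.id, year, gemeinde_id, len(pdf_bytes or b""))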

View file

@@ -0,0 +1,738 @@
"""
LangGraph-based pipeline for extracting structured content from BZO PDFs.
"""
import logging
import re
from typing import TypedDict, List, Dict, Any, Optional
from dataclasses import dataclass
from langgraph.graph import StateGraph, START, END
from modules.features.realEstate.bzoPdfExtractor import BZOPdfExtractor, TextBlock
from modules.features.realEstate.bzoRuleTaxonomy import RULE_TAXONOMY
logger = logging.getLogger(__name__)
# ===== State Definition =====
@dataclass
class ClassifiedBlock:
"""Classified text block."""
block: TextBlock
block_type: str # "article", "heading", "table", "other"
article_label: Optional[str] = None
article_title: Optional[str] = None
@dataclass
class Article:
"""Assembled article."""
article_label: str
article_title: Optional[str]
text: str
page_start: int
page_end: int
section_level_1: Optional[str] = None
section_level_2: Optional[str] = None
section_level_3: Optional[str] = None
zone_raw: Optional[str] = None
@dataclass
class ZoneInfo:
"""Zone information."""
zone_code: str
zone_name: str
zone_category: Optional[str] = None
zone_subcategory: Optional[str] = None
empfindlichkeitsstufe: Optional[str] = None
geschosszahl: Optional[int] = None
gewerbeerleichterung: bool = False
@dataclass
class RuleCandidate:
"""Rule candidate from pattern matching."""
rule_type: str
matched_text: str
article_text: str
page: int
is_table_rule: bool = False
    table_zones: Optional[List[str]] = None
condition_text: Optional[str] = None
@dataclass
class ParsedRule:
"""Parsed rule with structured values."""
rule_type: str
value_numeric: Optional[float]
value_text: str
unit: Optional[str]
condition_text: Optional[str]
is_table_rule: bool
table_zones: List[str]
page: int
text_snippet: str
zone_raw: Optional[str] = None
rule_scope: str = "general"
confidence: float = 0.5
class BZOExtractionState(TypedDict):
"""State for BZO extraction pipeline."""
# Input metadata
dokument_id: Optional[str]
pdf_id: str
# Extracted text blocks (stored as dicts for serialization)
text_blocks: List[Dict[str, Any]]
# Classified blocks (stored as dicts for serialization)
classified_blocks: List[Dict[str, Any]]
# Assembled articles (stored as dicts for serialization)
articles: List[Dict[str, Any]]
# Zone tracking
current_zones: Dict[str, Dict[str, Any]]
zones: List[Dict[str, Any]]
# Rule extraction (stored as dicts for serialization)
rule_candidates: List[Dict[str, Any]]
parsed_rules: List[Dict[str, Any]]
# Processing metadata
errors: List[str]
warnings: List[str]
# ===== Node Implementations =====
def extract_pdf_text(state: BZOExtractionState) -> BZOExtractionState:
"""Extract text blocks from PDF."""
try:
# PDF bytes should be passed in state context
# This is handled in run_extraction function
# State already has text_blocks populated
return state
except Exception as e:
logger.error(f"Error extracting PDF text: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"PDF extraction error: {str(e)}"]
return state
def classify_text_block(state: BZOExtractionState) -> BZOExtractionState:
"""Classify text blocks into articles, headings, tables, etc."""
try:
classified = []
for block_dict in state["text_blocks"]:
text = block_dict["text"].strip()
if not text:
continue
block_type = "other"
article_label = None
article_title = None
# Check for article patterns
article_match = re.search(r'Art\.?\s*(\d+[a-z]?)', text, re.IGNORECASE)
if article_match:
block_type = "article"
article_label = f"Art. {article_match.group(1)}"
# Try to extract title (text after article label, before first period or newline)
title_match = re.search(r'Art\.?\s*\d+[a-z]?\s+(.+?)(?:\.|$|\n)', text, re.IGNORECASE)
if title_match:
article_title = title_match.group(1).strip()
# Check for heading patterns (Roman numerals, letters, numbers)
elif re.match(r'^[A-Z]\.\s+[A-Z]', text) or re.match(r'^[IVX]+\.\s+[A-Z]', text) or re.match(r'^\d+\.\s+[A-Z]', text):
block_type = "heading"
            # Check for table patterns (tabs, or runs of multiple spaces from aligned columns)
            elif '\t' in text or (len(text.split()) > 5 and text.count('  ') > 2):
block_type = "table"
classified.append({
"block": {
"page": block_dict["page"],
"text": block_dict["text"],
"block_id": block_dict["block_id"],
"bbox": block_dict.get("bbox")
},
"block_type": block_type,
"article_label": article_label,
"article_title": article_title
})
# Update state with new classified blocks
existing_blocks = state.get("classified_blocks", [])
state["classified_blocks"] = existing_blocks + classified
return state
except Exception as e:
logger.error(f"Error classifying text blocks: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"Classification error: {str(e)}"]
return state
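# Worked example (illustrative): a block whose text starts with "Art. 13 Wohnzone W2"
# is classified as block_type="article" with article_label="Art. 13" and
# article_title="Wohnzone W2"; a block starting with "B. Bauvorschriften" becomes
# a "heading".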
def assemble_articles(state: BZOExtractionState) -> BZOExtractionState:
"""Assemble classified blocks into articles with hierarchical structure."""
try:
articles = []
current_article = None
current_section_1 = None
current_section_2 = None
current_section_3 = None
for classified_dict in state["classified_blocks"]:
block_dict = classified_dict["block"]
block = TextBlock(
page=block_dict["page"],
text=block_dict["text"],
block_id=block_dict["block_id"],
bbox=block_dict.get("bbox")
)
text = block.text.strip()
block_type = classified_dict["block_type"]
article_label = classified_dict.get("article_label")
article_title = classified_dict.get("article_title")
# Update section levels
if block_type == "heading":
# Level 1: A., B., C.
if re.match(r'^[A-Z]\.\s+', text):
current_section_1 = text.split('.', 1)[0] + '.'
current_section_2 = None
current_section_3 = None
# Level 2: I., II., III.
elif re.match(r'^[IVX]+\.\s+', text):
current_section_2 = text.split('.', 1)[0] + '.'
current_section_3 = None
# Level 3: 1., 2., 3.
elif re.match(r'^\d+\.\s+', text):
current_section_3 = text.split('.', 1)[0] + '.'
# Start new article
if article_label:
# Save previous article if exists
if current_article:
articles.append(current_article)
# Start new article
current_article = {
"article_label": article_label,
"article_title": article_title,
"text": text,
"page_start": block.page,
"page_end": block.page,
"section_level_1": current_section_1,
"section_level_2": current_section_2,
"section_level_3": current_section_3,
"zone_raw": None
}
# Continue current article
elif current_article:
current_article["text"] += "\n" + text
current_article["page_end"] = block.page
# Add last article
if current_article:
articles.append(current_article)
# Update state with new articles
existing_articles = state.get("articles", [])
state["articles"] = existing_articles + articles
return state
except Exception as e:
logger.error(f"Error assembling articles: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"Article assembly error: {str(e)}"]
return state
def detect_zone_changes(state: BZOExtractionState) -> BZOExtractionState:
"""Detect zone declarations and maintain zone scope."""
try:
zones = []
current_zones = state.get("current_zones", {})
for article_dict in state["articles"]:
text = article_dict.get("text", "")
article_label = article_dict.get("article_label", "")
page_start = article_dict.get("page_start", 0)
# Pattern: "Wohnzone W2", "Zone W3", "Gewerbezone G1"
zone_patterns = [
r'(?:Wohnzone|Zone|Gewerbezone|Industriezone|Zentrumszone|Ortsbildschutzzone|Erholungszone)\s+([A-Z0-9/]+)',
r'([A-Z]\d+(?:/\d+)?(?:G)?)', # W2/30, W2/30G, Z3, K3/4
]
for pattern in zone_patterns:
matches = re.finditer(pattern, text, re.IGNORECASE)
for match in matches:
zone_code = match.group(1).upper()
# Parse zone code
gewerbeerleichterung = zone_code.endswith('G')
if gewerbeerleichterung:
zone_code_base = zone_code[:-1]
else:
zone_code_base = zone_code
# Extract geschosszahl from code (e.g., W2 -> 2, W3/50 -> 3)
geschosszahl = None
if '/' in zone_code_base:
parts = zone_code_base.split('/')
geschosszahl_match = re.search(r'(\d+)', parts[0])
if geschosszahl_match:
geschosszahl = int(geschosszahl_match.group(1))
else:
geschosszahl_match = re.search(r'(\d+)', zone_code_base)
if geschosszahl_match:
geschosszahl = int(geschosszahl_match.group(1))
# Determine zone category from context
zone_category = None
if 'Wohnzone' in text or zone_code.startswith('W'):
zone_category = "Wohnzonen"
elif 'Zentrumszone' in text or zone_code.startswith('Z'):
zone_category = "Zentrumszonen"
elif 'Gewerbezone' in text or zone_code.startswith('G'):
zone_category = "Arbeitsplatzzonen"
elif 'Industriezone' in text or zone_code.startswith('I'):
zone_category = "Arbeitsplatzzonen"
zone_info = ZoneInfo(
zone_code=zone_code,
zone_name=f"Zone {zone_code}",
zone_category=zone_category,
geschosszahl=geschosszahl,
gewerbeerleichterung=gewerbeerleichterung
)
current_zones[zone_code] = zone_info
zones.append({
"zone_code": zone_code,
"zone_name": zone_info.zone_name,
"zone_category": zone_category,
"geschosszahl": geschosszahl,
"gewerbeerleichterung": gewerbeerleichterung,
"source_article": article_label,
"page": page_start
})
# Update state with zones
state["current_zones"] = current_zones
existing_zones = state.get("zones", [])
state["zones"] = existing_zones + zones
return state
except Exception as e:
logger.error(f"Error detecting zones: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"Zone detection error: {str(e)}"]
return state
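# Worked example (illustrative): for article text containing "Wohnzone W2/30G",
# the patterns above yield zone_code="W2/30G", gewerbeerleichterung=True
# (trailing "G"), geschosszahl=2 (digits before the "/"), and
# zone_category="Wohnzonen".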
def detect_rule_candidates(state: BZOExtractionState) -> BZOExtractionState:
"""Detect rule candidates using pattern matching."""
try:
candidates = []
for article_dict in state["articles"]:
text = article_dict.get("text", "")
article_label = article_dict.get("article_label", "")
page_start = article_dict.get("page_start", 0)
# Check each rule type in taxonomy
for rule_type, rule_config in RULE_TAXONOMY.items():
patterns = rule_config.get("patterns", [])
for pattern in patterns:
# Create regex pattern (case-insensitive)
regex_pattern = re.compile(pattern, re.IGNORECASE)
matches = regex_pattern.finditer(text)
for match in matches:
# Extract context around match
start = max(0, match.start() - 100)
end = min(len(text), match.end() + 100)
context = text[start:end]
# Check for conditions (geographic, temporal, etc.)
condition_text = None
condition_patterns = [
r'(?:nördlich|südlich|östlich|westlich|oberhalb|unterhalb)\s+[^,\.]+',
r'(?:für|bei|in)\s+[^,\.]+',
]
for cond_pattern in condition_patterns:
cond_match = re.search(cond_pattern, context, re.IGNORECASE)
if cond_match:
condition_text = cond_match.group(0)
break
candidate = {
"rule_type": rule_type,
"matched_text": match.group(0),
"article_text": text,
"page": page_start,
"condition_text": condition_text,
"is_table_rule": False,
"table_zones": []
}
candidates.append(candidate)
# Update state with rule candidates
existing_candidates = state.get("rule_candidates", [])
state["rule_candidates"] = existing_candidates + candidates
return state
except Exception as e:
logger.error(f"Error detecting rule candidates: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"Rule candidate detection error: {str(e)}"]
return state
def parse_rule_values(state: BZOExtractionState) -> BZOExtractionState:
"""Parse rule values using regex (LLM fallback can be added later)."""
try:
parsed_rules = []
for candidate_dict in state["rule_candidates"]:
rule_type = candidate_dict["rule_type"]
rule_config = RULE_TAXONOMY.get(rule_type, {})
units = rule_config.get("units", [])
value_type = rule_config.get("value_type", "numeric")
# Extract value using regex
matched_text = candidate_dict["matched_text"]
article_text = candidate_dict["article_text"]
            match_pos = article_text.find(matched_text)
            text = matched_text + " " + article_text[match_pos:match_pos + 200]
value_numeric = None
value_text = matched_text
unit = None
# Try to extract numeric value
if value_type in ["numeric", "integer"]:
# Pattern: "max. 4", "30 %", "min. 3.5 m"
value_patterns = [
r'(?:max|maximal|min|mindestens|höchstens)\s*\.?\s*(\d+(?:\.\d+)?)',
r'(\d+(?:\.\d+)?)\s*(%|m|meter|metern|prozent)',
r'(\d+(?:\.\d+)?)',
]
for pattern in value_patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
try:
value_numeric = float(match.group(1))
if value_type == "integer":
value_numeric = int(value_numeric)
# Check for unit
unit_match = re.search(r'(\d+(?:\.\d+)?)\s*(%|m|meter|metern|prozent)', text, re.IGNORECASE)
if unit_match:
unit = unit_match.group(2).lower()
if unit in ["meter", "metern"]:
unit = "m"
elif unit == "prozent":
unit = "%"
break
except ValueError:
continue
# Calculate confidence
confidence = 0.5
if value_numeric is not None:
confidence = 0.8
if unit:
confidence = 0.9
# Determine zone and scope
zone_raw = None
rule_scope = "general"
# Check current zones context
if state.get("current_zones"):
# Use first zone as default (can be improved)
zone_raw = list(state["current_zones"].keys())[0] if state["current_zones"] else None
rule_scope = "zone" if zone_raw else "general"
parsed_rule = {
"rule_type": rule_type,
"value_numeric": value_numeric,
"value_text": value_text,
"unit": unit,
"condition_text": candidate_dict.get("condition_text"),
"is_table_rule": candidate_dict.get("is_table_rule", False),
"table_zones": candidate_dict.get("table_zones", []),
"page": candidate_dict["page"],
"text_snippet": value_text,
"zone_raw": zone_raw,
"rule_scope": rule_scope,
"confidence": confidence
}
parsed_rules.append(parsed_rule)
# Update state with parsed rules
existing_rules = state.get("parsed_rules", [])
state["parsed_rules"] = existing_rules + parsed_rules
return state
except Exception as e:
logger.error(f"Error parsing rule values: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"Rule parsing error: {str(e)}"]
return state
def assign_zone_and_scope(state: BZOExtractionState) -> BZOExtractionState:
"""Assign zone and scope to parsed rules."""
try:
# Rules already have zone and scope assigned in parse_rule_values
# This node can refine assignments if needed
return state
except Exception as e:
logger.error(f"Error assigning zone and scope: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"Zone/scope assignment error: {str(e)}"]
return state
def confidence_scoring(state: BZOExtractionState) -> BZOExtractionState:
"""Calculate confidence scores for extracted data."""
try:
# Confidence already calculated in parse_rule_values
# This node can refine scores if needed
return state
except Exception as e:
logger.error(f"Error calculating confidence: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"Confidence scoring error: {str(e)}"]
return state
# ===== Graph Construction =====
def create_bzo_extraction_graph():
"""Create and compile the BZO extraction graph."""
workflow = StateGraph(BZOExtractionState)
# Add nodes
workflow.add_node("extract_pdf_text", extract_pdf_text)
workflow.add_node("classify_text_block", classify_text_block)
workflow.add_node("assemble_articles", assemble_articles)
workflow.add_node("detect_zone_changes", detect_zone_changes)
workflow.add_node("detect_rule_candidates", detect_rule_candidates)
workflow.add_node("parse_rule_values", parse_rule_values)
workflow.add_node("assign_zone_and_scope", assign_zone_and_scope)
workflow.add_node("confidence_scoring", confidence_scoring)
# Define edges
workflow.set_entry_point("extract_pdf_text")
workflow.add_edge("extract_pdf_text", "classify_text_block")
workflow.add_edge("classify_text_block", "assemble_articles")
workflow.add_edge("assemble_articles", "detect_zone_changes")
workflow.add_edge("detect_zone_changes", "detect_rule_candidates")
workflow.add_edge("detect_rule_candidates", "parse_rule_values")
workflow.add_edge("parse_rule_values", "assign_zone_and_scope")
workflow.add_edge("assign_zone_and_scope", "confidence_scoring")
workflow.add_edge("confidence_scoring", END)
return workflow.compile()
def run_extraction(pdf_bytes: bytes, pdf_id: Optional[str] = None, dokument_id: Optional[str] = None) -> Dict[str, Any]:
"""
Run the extraction pipeline on a PDF and return structured, sorted results.
Args:
pdf_bytes: PDF file content as bytes
pdf_id: Optional identifier for the PDF (defaults to generated ID)
dokument_id: Optional dokument ID for reference
Returns:
Dictionary with extracted and sorted content:
{
"articles": [...], # Sorted by page_start, then article_label
"zones": [...], # Sorted by zone_code
"rules": [...], # Sorted by rule_type, then page
"errors": [...],
"warnings": [...]
}
"""
import uuid
if not pdf_id:
pdf_id = f"pdf_{uuid.uuid4().hex[:8]}"
# Initialize state
state: BZOExtractionState = {
"dokument_id": dokument_id,
"pdf_id": pdf_id,
"text_blocks": [],
"classified_blocks": [],
"articles": [],
"current_zones": {},
"zones": [],
"rule_candidates": [],
"parsed_rules": [],
"errors": [],
"warnings": []
}
# Extract PDF text first
pdf_extractor = BZOPdfExtractor()
text_blocks_objects = pdf_extractor.extract_text_blocks(pdf_bytes, state["pdf_id"])
# Convert TextBlock objects to dicts for state
state["text_blocks"] = [
{
"page": tb.page,
"text": tb.text,
"block_id": tb.block_id,
"bbox": tb.bbox
}
for tb in text_blocks_objects
]
# Create and run graph
graph = create_bzo_extraction_graph()
final_state = graph.invoke(state)
# Sort and structure results
articles = sorted(
final_state.get("articles", []),
key=lambda x: (x.get("page_start", 0), x.get("article_label", ""))
)
zones = sorted(
final_state.get("zones", []),
key=lambda x: x.get("zone_code", "")
)
rules = sorted(
final_state.get("parsed_rules", []),
key=lambda x: (x.get("rule_type", ""), x.get("page", 0))
)
return {
"articles": articles,
"zones": zones,
"rules": rules,
"errors": final_state.get("errors", []),
"warnings": final_state.get("warnings", [])
}
def extract_from_documents(
document_retriever,
dokument_ids: List[str]
) -> Dict[str, Any]:
"""
Extract BZO content from one or more documents.
Args:
document_retriever: BZODocumentRetriever instance
dokument_ids: List of dokument IDs to process
Returns:
Dictionary with results per document:
{
"results": [
{
"dokument_id": "...",
"articles": [...],
"zones": [...],
"rules": [...],
"errors": [...],
"warnings": [...]
},
...
],
"summary": {
"total_documents": N,
"successful": M,
"failed": K,
"total_articles": X,
"total_zones": Y,
"total_rules": Z
}
}
"""
results = []
total_articles = 0
total_zones = 0
total_rules = 0
successful = 0
failed = 0
# Retrieve documents
dokumente = document_retriever.get_documents_by_ids(dokument_ids)
for dokument in dokumente:
try:
# Retrieve PDF content
pdf_bytes = document_retriever.retrieve_pdf_content(dokument)
if not pdf_bytes:
logger.warning(f"Could not retrieve PDF for dokument {dokument.id}")
results.append({
"dokument_id": dokument.id,
"articles": [],
"zones": [],
"rules": [],
"errors": [f"Could not retrieve PDF content"],
"warnings": []
})
failed += 1
continue
# Run extraction
extraction_result = run_extraction(
pdf_bytes=pdf_bytes,
pdf_id=dokument.dokumentReferenz or f"dok_{dokument.id}",
dokument_id=dokument.id
)
# Add dokument_id to result
extraction_result["dokument_id"] = dokument.id
results.append(extraction_result)
# Update counters
total_articles += len(extraction_result.get("articles", []))
total_zones += len(extraction_result.get("zones", []))
total_rules += len(extraction_result.get("rules", []))
if extraction_result.get("errors"):
failed += 1
else:
successful += 1
except Exception as e:
logger.error(f"Error processing dokument {dokument.id}: {str(e)}", exc_info=True)
results.append({
"dokument_id": dokument.id,
"articles": [],
"zones": [],
"rules": [],
"errors": [f"Processing error: {str(e)}"],
"warnings": []
})
failed += 1
return {
"results": results,
"summary": {
"total_documents": len(dokument_ids),
"successful": successful,
"failed": failed,
"total_articles": total_articles,
"total_zones": total_zones,
"total_rules": total_rules
}
}
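# --- Usage sketch (illustrative only; the PDF path and dokument IDs are placeholders) ---
if __name__ == "__main__":
    with open("bzo.pdf", "rb") as f:
        result = run_extraction(f.read(), pdf_id="demo")
    print(
        f"{len(result['articles'])} articles, {len(result['zones'])} zones, "
        f"{len(result['rules'])} rules, {len(result['errors'])} errors"
    )
    # With a BZODocumentRetriever at hand, a batch run would look like:
    #   report = extract_from_documents(retriever, ["dokument-id-1", "dokument-id-2"])
    #   print(report["summary"])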

View file

@@ -0,0 +1,117 @@
"""
PDF extraction module for BZO documents.
Extracts page-aware text blocks from PDF files.
"""
import logging
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
import fitz # PyMuPDF
logger = logging.getLogger(__name__)
@dataclass
class TextBlock:
"""Represents a text block from a PDF page."""
page: int
text: str
block_id: str
    bbox: Optional[tuple] = None  # (x0, y0, x1, y1) bounding box
class BZOPdfExtractor:
"""Extracts text blocks from PDF files with page awareness."""
def __init__(self):
"""Initialize the PDF extractor."""
pass
def extract_text_blocks(self, pdf_bytes: bytes, pdf_id: str) -> List[TextBlock]:
"""
Extract page-aware text blocks from PDF.
Args:
pdf_bytes: PDF file content as bytes
pdf_id: Identifier for the PDF (for logging)
Returns:
List of TextBlock objects with page numbers
"""
text_blocks = []
try:
# Open PDF from bytes
pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
# Extract text from each page
for page_num in range(len(pdf_document)):
page = pdf_document[page_num]
# Extract text blocks from page
blocks = page.get_text("blocks")
for block_idx, block in enumerate(blocks):
# block format: (x0, y0, x1, y1, "text", block_no, block_type)
if len(block) >= 5:
bbox = (block[0], block[1], block[2], block[3])
text = block[4].strip()
# Skip empty blocks
if not text:
continue
# Create TextBlock
block_id = f"{pdf_id}_p{page_num + 1}_b{block_idx}"
text_block = TextBlock(
page=page_num + 1, # 1-indexed pages
text=text,
block_id=block_id,
bbox=bbox
)
text_blocks.append(text_block)
# Store page count before closing
page_count = len(pdf_document)
pdf_document.close()
logger.info(f"Extracted {len(text_blocks)} text blocks from PDF {pdf_id} ({page_count} pages)")
except Exception as e:
logger.error(f"Error extracting text from PDF {pdf_id}: {str(e)}", exc_info=True)
raise
return text_blocks
def extract_text_by_page(self, pdf_bytes: bytes, pdf_id: str) -> Dict[int, str]:
"""
Extract full text per page (alternative method).
Args:
pdf_bytes: PDF file content as bytes
pdf_id: Identifier for the PDF
Returns:
Dictionary mapping page number to full page text
"""
page_texts = {}
try:
pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
for page_num in range(len(pdf_document)):
page = pdf_document[page_num]
text = page.get_text()
page_texts[page_num + 1] = text # 1-indexed
# Store page count before closing
page_count = len(pdf_document)
pdf_document.close()
logger.debug(f"Extracted text from {page_count} pages for PDF {pdf_id}")
except Exception as e:
logger.error(f"Error extracting page text from PDF {pdf_id}: {str(e)}", exc_info=True)
raise
return page_texts
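# --- Usage sketch (illustrative only; "document.pdf" is a placeholder path) ---
if __name__ == "__main__":
    extractor = BZOPdfExtractor()
    with open("document.pdf", "rb") as f:
        blocks = extractor.extract_text_blocks(f.read(), pdf_id="demo")
    for block in blocks[:5]:
        print(block.page, block.block_id, block.text[:60])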

View file

@@ -0,0 +1,105 @@
"""
Rule taxonomy for BZO extraction.
Defines fixed rule types and their patterns for deterministic rule detection.
"""
RULE_TAXONOMY = {
"max_building_height": {
"patterns": ["fassadenhöhe", "fassadenhöhen", "gebäudehöhe", "firsthöhe", "traufhöhe", "höchsthöhe", "gesamt höhe", "gesamt höhen"],
"units": ["m", "meter", "metern"],
"value_type": "numeric",
"keywords": ["max", "maximal", "höchstens"]
},
"max_floors": {
"patterns": ["vollgeschosse", "vollgeschoss", "geschosse", "geschosszahl"],
"units": [],
"value_type": "integer",
"keywords": ["max", "maximal", "höchstens"]
},
"max_attachable_attics": {
"patterns": ["anrechenbare dachgeschosse", "anrechenbares attikageschoss", "dachgeschosse", "attikageschoss"],
"units": [],
"value_type": "integer",
"keywords": ["max", "maximal"]
},
"max_attachable_basement": {
"patterns": ["anrechenbares untergeschoss", "untergeschoss"],
"units": [],
"value_type": "integer",
"keywords": ["max", "maximal"]
},
"density": {
"patterns": ["ausnützungsziffer", "az", "ausnützung"],
"units": ["%", "prozent"],
"value_type": "numeric",
"keywords": ["max", "maximal"]
},
"building_mass_index": {
"patterns": ["baumassenziffer", "bmz"],
"units": [],
"value_type": "numeric",
"keywords": ["max", "maximal"]
},
"green_space_index": {
"patterns": ["grünflächenziffer", "gfz"],
"units": ["%", "prozent"],
"value_type": "numeric",
"keywords": ["min", "mindestens"]
},
"boundary_distance": {
"patterns": ["grenzabstand", "grundabstand", "abstand"],
"units": ["m", "meter", "metern"],
"value_type": "numeric",
"keywords": ["min", "mindestens", "max", "maximal"]
},
"boundary_distance_length_surcharge": {
"patterns": ["mehrlängenzuschlag", "längenzuschlag"],
"units": [],
"value_type": "fraction", # e.g., "1/3", "1/5"
"keywords": []
},
"boundary_distance_max": {
"patterns": ["höchstmass", "höchstmass grenzabstand", "höchstmass abstand"],
"units": ["m", "meter"],
"value_type": "numeric",
"keywords": ["max", "maximal"]
},
"building_length": {
"patterns": ["gebäudelänge"],
"units": ["m", "meter"],
"value_type": "numeric",
"keywords": ["min", "mindestens", "max", "maximal"]
},
"building_width": {
"patterns": ["gebäudebreite"],
"units": ["m", "meter"],
"value_type": "numeric",
"keywords": ["min", "mindestens", "max", "maximal"]
},
"residential_area_share": {
"patterns": ["wohnflächenanteil", "wohnanteil"],
"units": ["%", "prozent"],
"value_type": "numeric",
"keywords": ["min", "mindestens", "max", "maximal"]
}
}
def get_rule_taxonomy() -> dict:
"""Get the rule taxonomy dictionary."""
return RULE_TAXONOMY
def get_rule_types() -> list:
"""Get list of all rule types."""
return list(RULE_TAXONOMY.keys())
def get_rule_patterns(rule_type: str) -> list:
"""Get patterns for a specific rule type."""
return RULE_TAXONOMY.get(rule_type, {}).get("patterns", [])
def get_rule_units(rule_type: str) -> list:
"""Get units for a specific rule type."""
return RULE_TAXONOMY.get(rule_type, {}).get("units", [])
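# --- Usage sketch (illustrative only) ---
if __name__ == "__main__":
    for rule_type in get_rule_types():
        print(rule_type, get_rule_patterns(rule_type), get_rule_units(rule_type))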

View file

@@ -0,0 +1,780 @@
"""
Swiss Topo Scraping Script
Scrapes Switzerland systematically using the Swiss Topo connector
and saves parcel data to the database.
This script divides Switzerland into a grid and queries parcels at each grid point,
then deduplicates and saves unique parcels to the database.
"""
import logging
import asyncio
from typing import Dict, Any, List, Set, Optional
from dataclasses import dataclass
import json
from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelRealEstate import (
Parzelle,
GeoPolylinie,
GeoPunkt,
Kontext,
Gemeinde,
Kanton,
)
from modules.interfaces.interfaceDbRealEstateObjects import getInterface as getRealEstateInterface
from modules.connectors.connectorSwissTopoMapServer import SwissTopoMapServerConnector
from modules.connectors.connectorOerebWfs import OerebWfsConnector
logger = logging.getLogger(__name__)
@dataclass
class ScrapingStats:
"""Statistics for scraping operation."""
total_queries: int = 0
successful_queries: int = 0
failed_queries: int = 0
unique_parcels_found: int = 0
parcels_saved: int = 0
parcels_skipped: int = 0
    errors: Optional[List[str]] = None
def __post_init__(self):
if self.errors is None:
self.errors = []
class SwissTopoScraper:
"""
Scraper for Swiss Topo parcel data.
Divides Kanton Zürich into a grid and queries parcels systematically,
then saves unique parcels to the database with ÖREB bauzone information.
"""
# Zürich canton bounds in LV95 coordinates
ZURICH_BOUNDS = {
"min_x": 2680000,
"max_x": 2780000,
"min_y": 1210000,
"max_y": 1280000
}
def __init__(
self,
current_user: User,
grid_size: float = 500.0, # Grid size in meters (500m = reasonable coverage)
max_concurrent: int = 50, # Maximum concurrent API requests
batch_size: int = 100, # Process parcels in batches
):
"""
Initialize scraper.
Args:
current_user: User for database operations
grid_size: Size of grid cells in meters (default: 500m)
max_concurrent: Maximum concurrent API requests (default: 50)
batch_size: Number of parcels to process before saving (default: 100)
"""
self.current_user = current_user
self.grid_size = grid_size
self.max_concurrent = max_concurrent
self.batch_size = batch_size
oereb_connector = OerebWfsConnector()
self.connector = SwissTopoMapServerConnector(oereb_connector=oereb_connector)
self.realEstateInterface = getRealEstateInterface(current_user)
# Track unique parcels by EGRID or label
self.unique_parcels: Dict[str, Dict[str, Any]] = {}
self.stats = ScrapingStats()
# Cache for Gemeinde and Kanton UUIDs to avoid repeated database queries
# Key: bfs_nummer (int or str), Value: UUID (str)
self.gemeinde_cache: Dict[str, str] = {}
# Key: kanton abbreviation (str, e.g., "ZH"), Value: UUID (str)
self.kanton_cache: Dict[str, str] = {}
def _generate_grid_points(self) -> List[tuple]:
"""
Generate grid points covering Kanton Zürich.
Returns:
List of (x, y) coordinate tuples in LV95
"""
bounds = self.ZURICH_BOUNDS
min_x = bounds["min_x"]
max_x = bounds["max_x"]
min_y = bounds["min_y"]
max_y = bounds["max_y"]
grid_points = []
x = min_x
while x <= max_x:
y = min_y
while y <= max_y:
grid_points.append((x, y))
y += self.grid_size
x += self.grid_size
logger.info(f"Generated {len(grid_points)} grid points covering Kanton Zürich")
return grid_points
async def _query_parcel_at_point(
self,
x: float,
y: float,
semaphore: asyncio.Semaphore
) -> Optional[Dict[str, Any]]:
"""
Query parcel at a specific coordinate point.
Args:
x: X coordinate (LV95)
y: Y coordinate (LV95)
semaphore: Semaphore for concurrency control
Returns:
Parcel data dictionary or None if not found
"""
async with semaphore:
try:
self.stats.total_queries += 1
location_str = f"{x},{y}"
parcel_data = await self.connector.search_parcel(location_str, tolerance=5)
if parcel_data:
self.stats.successful_queries += 1
return parcel_data
else:
self.stats.failed_queries += 1
return None
except Exception as e:
self.stats.failed_queries += 1
error_msg = f"Error querying parcel at ({x}, {y}): {str(e)}"
logger.debug(error_msg)
self.stats.errors.append(error_msg)
return None
def _find_gemeinde_by_bfs_nummer(self, bfs_nummer: str) -> Optional[Gemeinde]:
"""
Find existing Gemeinde by BFS number (stored in kontextInformationen).
Args:
bfs_nummer: BFS municipality number
Returns:
Gemeinde instance if found, None otherwise
"""
try:
# Check cache first
if bfs_nummer in self.gemeinde_cache:
gemeinde_id = self.gemeinde_cache[bfs_nummer]
gemeinde = self.realEstateInterface.getGemeinde(gemeinde_id)
if gemeinde:
return gemeinde
# Search all Gemeinden and check kontextInformationen
gemeinden = self.realEstateInterface.getGemeinden(
recordFilter={"mandateId": self.current_user.mandateId}
)
for gemeinde in gemeinden:
# Check kontextInformationen for bfs_nummer
for kontext in gemeinde.kontextInformationen:
try:
kontext_data = json.loads(kontext.inhalt) if isinstance(kontext.inhalt, str) else kontext.inhalt
if isinstance(kontext_data, dict):
if str(kontext_data.get("bfs_nummer")) == str(bfs_nummer):
# Cache the result
self.gemeinde_cache[bfs_nummer] = gemeinde.id
return gemeinde
except (json.JSONDecodeError, AttributeError):
continue
return None
except Exception as e:
logger.error(f"Error finding Gemeinde by BFS number {bfs_nummer}: {e}", exc_info=True)
return None
def _find_kanton_by_abbreviation(self, abk: str) -> Optional[Kanton]:
"""
Find existing Kanton by abbreviation.
Args:
abk: Canton abbreviation (e.g., "BE", "ZH")
Returns:
Kanton instance if found, None otherwise
"""
try:
# Check cache first
if abk in self.kanton_cache:
kanton_id = self.kanton_cache[abk]
kanton = self.realEstateInterface.getKanton(kanton_id)
if kanton:
return kanton
# Search by abbreviation
kantone = self.realEstateInterface.getKantone(
recordFilter={
"mandateId": self.current_user.mandateId,
"abk": abk
}
)
if kantone:
kanton = kantone[0]
# Cache the result
self.kanton_cache[abk] = kanton.id
return kanton
return None
except Exception as e:
logger.error(f"Error finding Kanton by abbreviation {abk}: {e}", exc_info=True)
return None
def _get_or_create_kanton(self, kanton_abk: str) -> Optional[str]:
"""
Get or create a Kanton by abbreviation.
Args:
kanton_abk: Canton abbreviation (e.g., "BE", "ZH")
Returns:
UUID of the Kanton, or None if creation failed
"""
if not kanton_abk:
return None
# Check if exists
existing_kanton = self._find_kanton_by_abbreviation(kanton_abk)
if existing_kanton:
return existing_kanton.id
# Create new Kanton
try:
# Map common abbreviations to full names (fallback to abbreviation if not found)
kanton_names = {
"AG": "Aargau", "AI": "Appenzell Innerrhoden", "AR": "Appenzell Ausserrhoden",
"BE": "Bern", "BL": "Basel-Landschaft", "BS": "Basel-Stadt",
"FR": "Freiburg", "GE": "Genf", "GL": "Glarus", "GR": "Graubünden",
"JU": "Jura", "LU": "Luzern", "NE": "Neuenburg", "NW": "Nidwalden",
"OW": "Obwalden", "SG": "St. Gallen", "SH": "Schaffhausen", "SO": "Solothurn",
"SZ": "Schwyz", "TG": "Thurgau", "TI": "Tessin", "UR": "Uri",
"VD": "Waadt", "VS": "Wallis", "ZG": "Zug", "ZH": "Zürich"
}
kanton_label = kanton_names.get(kanton_abk, kanton_abk)
kanton = Kanton(
mandateId=self.current_user.mandateId,
label=kanton_label,
abk=kanton_abk
)
created_kanton = self.realEstateInterface.createKanton(kanton)
if created_kanton and created_kanton.id:
# Cache the result
self.kanton_cache[kanton_abk] = created_kanton.id
logger.info(f"Created new Kanton: {kanton_label} ({kanton_abk})")
return created_kanton.id
except Exception as e:
logger.error(f"Error creating Kanton {kanton_abk}: {e}", exc_info=True)
return None
def _get_or_create_gemeinde(
self,
gemeinde_name: str,
bfs_nummer: str,
kanton_abk: str
) -> Optional[str]:
"""
Get or create a Gemeinde by BFS number.
Args:
gemeinde_name: Municipality name
bfs_nummer: BFS municipality number
kanton_abk: Canton abbreviation
Returns:
UUID of the Gemeinde, or None if creation failed
"""
if not gemeinde_name or not bfs_nummer:
return None
# Check if exists
existing_gemeinde = self._find_gemeinde_by_bfs_nummer(bfs_nummer)
if existing_gemeinde:
return existing_gemeinde.id
# Get or create Kanton first
kanton_id = self._get_or_create_kanton(kanton_abk)
# Create new Gemeinde
try:
gemeinde = Gemeinde(
mandateId=self.current_user.mandateId,
label=gemeinde_name,
id_kanton=kanton_id,
kontextInformationen=[
Kontext(
thema="BFS Nummer",
inhalt=json.dumps({"bfs_nummer": bfs_nummer}, ensure_ascii=False)
)
]
)
created_gemeinde = self.realEstateInterface.createGemeinde(gemeinde)
if created_gemeinde and created_gemeinde.id:
# Cache the result
self.gemeinde_cache[bfs_nummer] = created_gemeinde.id
logger.info(f"Created new Gemeinde: {gemeinde_name} (BFS: {bfs_nummer})")
return created_gemeinde.id
except Exception as e:
logger.error(f"Error creating Gemeinde {gemeinde_name} (BFS: {bfs_nummer}): {e}", exc_info=True)
return None
def _extract_parcel_identifier(self, parcel_data: Dict[str, Any]) -> Optional[str]:
"""
Extract unique identifier for a parcel.
Args:
parcel_data: Parcel data from Swiss Topo
Returns:
Unique identifier (EGRID or label) or None
"""
attributes = parcel_data.get("attributes", {})
# Prefer EGRID as it's globally unique
egrid = attributes.get("egris_egrid")
if egrid:
return f"egrid:{egrid}"
# Fallback to label + municipality code
label = attributes.get("label") or attributes.get("number")
bfsnr = attributes.get("bfsnr")
if label and bfsnr:
return f"label:{bfsnr}:{label}"
# Last resort: just label
if label:
return f"label:{label}"
return None
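    # Illustrative results (attribute values invented):
    #   {"attributes": {"egris_egrid": "CH000000000000"}} -> "egrid:CH000000000000"
    #   {"attributes": {"label": "1234", "bfsnr": 261}}   -> "label:261:1234"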
def _convert_to_parzelle_model(
self,
parcel_data: Dict[str, Any],
parcel_id: str,
gemeinde_id: Optional[str] = None
) -> Optional[Parzelle]:
"""
Convert Swiss Topo parcel data to Parzelle model.
Args:
parcel_data: Raw parcel data from Swiss Topo
parcel_id: Unique parcel identifier
gemeinde_id: UUID of the Gemeinde (if already resolved)
Returns:
Parzelle model instance or None if conversion fails
"""
try:
attributes = parcel_data.get("attributes", {})
geometry = parcel_data.get("geometry", {})
# Extract attributes
extracted_attrs = self.connector.extract_parcel_attributes(parcel_data)
# Get geocoded address if available
geocoded_address = parcel_data.get('geocoded_address', {})
# Get bauzone - prefer ÖREB bauzone if available (for Zürich), otherwise use extracted_attrs
bauzone = parcel_data.get("oereb_bauzone") or extracted_attrs.get("bauzone")
# Build Parzelle data
parzelle_data = {
"mandateId": self.current_user.mandateId,
"label": extracted_attrs.get("label") or attributes.get("number") or f"Parcel-{parcel_id}",
"parzellenAliasTags": [attributes.get("egris_egrid")] if attributes.get("egris_egrid") else [],
"strasseNr": geocoded_address.get("full_address") or extracted_attrs.get("strasseNr"),
"plz": geocoded_address.get("plz") or extracted_attrs.get("plz"),
"eigentuemerschaft": extracted_attrs.get("eigentuemerschaft"),
"bauzone": bauzone,
"perimeter": extracted_attrs.get("perimeter"),
"baulinie": None,
"kontextGemeinde": gemeinde_id, # Use UUID reference instead of name
"az": None,
"bz": None,
"vollgeschossZahl": None,
"anrechenbarDachgeschoss": None,
"anrechenbarUntergeschoss": None,
"gebaeudehoeheMax": None,
"regelnGrenzabstand": [],
"regelnMehrlaengenzuschlag": [],
"regelnMehrhoehenzuschlag": [],
"parzelleBebaut": None,
"parzelleErschlossen": None,
"parzelleHanglage": None,
"laermschutzzone": None,
"hochwasserschutzzone": None,
"grundwasserschutzzone": None,
"parzellenNachbarschaft": [],
"dokumente": [],
"kontextInformationen": [
Kontext(
thema="Swiss Topo Scraping",
inhalt=json.dumps({
"egrid": attributes.get("egris_egrid"),
"identnd": attributes.get("identnd"),
"canton": attributes.get("ak"),
"municipality_code": attributes.get("bfsnr"),
"scraped_coordinates": parcel_data.get('query_coordinates', {}),
"source": "swiss_topo_scraping",
"bauzone_source": "oereb_wfs" if parcel_data.get("oereb_bauzone") else "swiss_topo"
}, ensure_ascii=False)
)
]
}
# Create Parzelle instance
parzelle_instance = Parzelle(**parzelle_data)
return parzelle_instance
except Exception as e:
logger.error(f"Error converting parcel {parcel_id} to Parzelle model: {e}", exc_info=True)
return None
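
    # The provenance context stored above looks roughly like this (sketch;
    # field values are illustrative):
    #
    #     Kontext(
    #         thema="Swiss Topo Scraping",
    #         inhalt='{"egrid": "CH1234567890", ..., "bauzone_source": "oereb_wfs"}'
    #     )
    #
    # "bauzone_source" records whether the zone came from the ÖREB WFS
    # ("oereb_wfs") or from the Swiss Topo attributes ("swiss_topo").
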
async def _save_parcels_batch(
self,
parcels: List[Dict[str, Any]]
) -> int:
"""
Save a batch of parcels to the database.
This method handles the sorting algorithm:
- For each parcel, extracts gemeinde_information
- Checks if Gemeinde exists (by bfs_nummer), creates if not
- Checks if Kanton exists (by abbreviation), creates if not
- Links Parzelle to Gemeinde via UUID reference
Args:
parcels: List of parcel data dictionaries
Returns:
Number of parcels successfully saved
"""
saved_count = 0
for parcel_data in parcels:
try:
parcel_id = self._extract_parcel_identifier(parcel_data)
if not parcel_id:
logger.warning("Could not extract parcel identifier, skipping")
self.stats.parcels_skipped += 1
continue
# Check if parcel already exists in our unique set
if parcel_id in self.unique_parcels:
logger.debug(f"Parcel {parcel_id} already found, skipping duplicate")
self.stats.parcels_skipped += 1
continue
# Extract gemeinde_information from parcel_data
# The connector returns it as 'gemeinde_info' (not 'gemeinde_information')
gemeinde_info = parcel_data.get('gemeinde_info') or parcel_data.get('gemeinde_information')
gemeinde_id = None
kanton_abk = None
if gemeinde_info:
gemeinde_name = gemeinde_info.get('name')
bfs_nummer = gemeinde_info.get('bfs_nummer')
kanton_abk = gemeinde_info.get('kanton')
# Skip parcels not from Zürich (safety filter)
if kanton_abk and kanton_abk.upper() != "ZH":
logger.debug(f"Skipping parcel {parcel_id} from canton {kanton_abk} (only Zürich parcels are processed)")
self.stats.parcels_skipped += 1
continue
if gemeinde_name and bfs_nummer:
# Get or create Gemeinde (this also handles Kanton creation)
gemeinde_id = self._get_or_create_gemeinde(
gemeinde_name=gemeinde_name,
bfs_nummer=str(bfs_nummer),
kanton_abk=kanton_abk
)
if not gemeinde_id:
logger.warning(
f"Could not get or create Gemeinde for parcel {parcel_id}: "
f"name={gemeinde_name}, bfs_nummer={bfs_nummer}"
)
else:
logger.debug(
f"Missing Gemeinde info for parcel {parcel_id}: "
f"name={gemeinde_name}, bfs_nummer={bfs_nummer}"
)
else:
logger.debug(f"No gemeinde_info found in parcel_data for {parcel_id}")
# Skip parcels without gemeinde_info (likely not from Zürich)
self.stats.parcels_skipped += 1
continue
# Query ÖREB WFS for bauzone (all parcels are from Zürich)
if kanton_abk and kanton_abk.upper() == "ZH":
try:
attributes = parcel_data.get("attributes", {})
geometry = parcel_data.get("geometry", {})
egrid = attributes.get("egris_egrid", "")
# Get coordinates for query (use centroid or first point if available)
x = None
y = None
if geometry:
if "rings" in geometry and geometry["rings"]:
# Use first point of first ring
first_ring = geometry["rings"][0]
if first_ring and len(first_ring) > 0:
x = first_ring[0][0]
y = first_ring[0][1]
elif "coordinates" in geometry:
# Try to extract coordinates from GeoJSON format
coords = geometry.get("coordinates", [])
if coords and len(coords) > 0:
# Handle nested coordinate arrays
def get_first_coord(coord_list, depth=0):
if depth < 3 and isinstance(coord_list, list) and len(coord_list) > 0:
if isinstance(coord_list[0], (int, float)):
return coord_list
return get_first_coord(coord_list[0], depth + 1)
return None
first_coord = get_first_coord(coords)
if first_coord and len(first_coord) >= 2:
x = first_coord[0]
y = first_coord[1]
if geometry and self.connector.oereb_connector:
logger.debug(f"Querying ÖREB WFS for bauzone for parcel {parcel_id}")
zone_results = await self.connector.oereb_connector.query_zone_layer(
egrid=egrid,
x=x or 0.0,
y=y or 0.0,
canton="ZH",
geometry=geometry
)
if zone_results and len(zone_results) > 0:
# Extract typ_gde_abkuerzung from the first result
zone_attrs = zone_results[0].get("attributes", {})
oereb_bauzone = zone_attrs.get("typ_gde_abkuerzung")
if oereb_bauzone:
# Add bauzone to parcel_data so it can be used in _convert_to_parzelle_model
parcel_data["oereb_bauzone"] = oereb_bauzone
logger.debug(f"Found ÖREB bauzone '{oereb_bauzone}' for parcel {parcel_id}")
else:
logger.debug(f"No typ_gde_abkuerzung found in ÖREB response for parcel {parcel_id}")
else:
logger.debug(f"No zone results from ÖREB WFS for parcel {parcel_id}")
else:
logger.debug(f"Cannot query ÖREB WFS for parcel {parcel_id}: missing geometry or connector")
except Exception as e:
logger.warning(f"Error querying ÖREB WFS for bauzone for parcel {parcel_id}: {e}", exc_info=True)
# Continue without ÖREB bauzone - will use default from extracted_attrs
                else:
                    # Reachable when gemeinde_info carries no canton abbreviation;
                    # without a confirmed ZH canton the ÖREB query is skipped
                    logger.warning(f"Parcel {parcel_id} has no confirmed Zürich canton (kanton: {kanton_abk}), skipping ÖREB query")
# Convert to Parzelle model (with Gemeinde UUID reference)
parzelle_instance = self._convert_to_parzelle_model(
parcel_data,
parcel_id,
gemeinde_id=gemeinde_id
)
if not parzelle_instance:
logger.warning(f"Could not convert parcel {parcel_id} to model, skipping")
self.stats.parcels_skipped += 1
continue
# Check if parcel already exists in database (by label)
# Note: We rely on in-memory deduplication by EGRID/label for uniqueness
# Database check is mainly to avoid re-saving parcels from previous runs
existing_parcels = self.realEstateInterface.getParzellen(
recordFilter={
"mandateId": self.current_user.mandateId,
"label": parzelle_instance.label
}
)
if existing_parcels:
logger.debug(f"Parcel {parzelle_instance.label} already exists in database, skipping")
self.unique_parcels[parcel_id] = parcel_data # Mark as seen
self.stats.parcels_skipped += 1
continue
# Save to database
created_parzelle = self.realEstateInterface.createParzelle(parzelle_instance)
if created_parzelle and created_parzelle.id:
self.unique_parcels[parcel_id] = parcel_data
saved_count += 1
self.stats.parcels_saved += 1
logger.debug(
f"Saved parcel {created_parzelle.label} (ID: {created_parzelle.id}) "
f"linked to Gemeinde {gemeinde_id if gemeinde_id else 'None'}"
)
else:
logger.warning(f"Failed to save parcel {parzelle_instance.label}")
self.stats.parcels_skipped += 1
except Exception as e:
error_msg = f"Error saving parcel: {str(e)}"
logger.error(error_msg, exc_info=True)
self.stats.errors.append(error_msg)
self.stats.parcels_skipped += 1
return saved_count
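
    # Deduplication order applied above: in-memory identifier set ->
    # canton filter (ZH only) -> ÖREB bauzone enrichment ->
    # database lookup by label -> createParzelle().
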
async def scrape(
self,
grid_points: Optional[List[tuple]] = None,
progress_callback: Optional[callable] = None
) -> ScrapingStats:
"""
Scrape Kanton Zürich for parcel data.
Args:
grid_points: Optional list of (x, y) coordinates to query.
If None, generates grid automatically for Zürich.
progress_callback: Optional callback function(status_dict) called periodically
Returns:
ScrapingStats object with scraping statistics
"""
logger.info("Starting Swiss Topo scraping operation for Kanton Zürich")
# Generate grid points if not provided
if grid_points is None:
grid_points = self._generate_grid_points()
logger.info(f"Scraping {len(grid_points)} grid points in Kanton Zürich")
# Create semaphore for concurrency control
semaphore = asyncio.Semaphore(self.max_concurrent)
# Query all grid points concurrently
tasks = [
self._query_parcel_at_point(x, y, semaphore)
for x, y in grid_points
]
# Process results in batches
batch = []
processed = 0
for coro in asyncio.as_completed(tasks):
try:
parcel_data = await coro
processed += 1
if parcel_data:
batch.append(parcel_data)
# Save batch when it reaches batch_size
if len(batch) >= self.batch_size:
await self._save_parcels_batch(batch)
batch = []
# Update stats
self.stats.unique_parcels_found = len(self.unique_parcels)
# Call progress callback if provided
if progress_callback:
progress_callback({
"processed": processed,
"total": len(grid_points),
"unique_parcels": self.stats.unique_parcels_found,
"saved": self.stats.parcels_saved,
"skipped": self.stats.parcels_skipped
})
# Log progress periodically
if processed % 100 == 0:
logger.info(
f"Progress: {processed}/{len(grid_points)} queries completed, "
f"{self.stats.unique_parcels_found} unique parcels found, "
f"{self.stats.parcels_saved} saved"
)
except Exception as e:
logger.error(f"Error processing query result: {e}", exc_info=True)
self.stats.errors.append(str(e))
# Save remaining batch
if batch:
await self._save_parcels_batch(batch)
# Final stats update
self.stats.unique_parcels_found = len(self.unique_parcels)
logger.info(
f"Scraping completed: {self.stats.unique_parcels_found} unique parcels found, "
f"{self.stats.parcels_saved} saved, {self.stats.parcels_skipped} skipped"
)
return self.stats
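
# Example progress callback for scrape() (sketch; any callable accepting one
# status dict works, `scraper` stands for an initialized instance):
#
#     def log_progress(status: Dict[str, Any]) -> None:
#         print(f"{status['processed']}/{status['total']} queries, "
#               f"{status['saved']} parcels saved")
#
#     stats = await scraper.scrape(progress_callback=log_progress)
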
async def scrape_switzerland(
current_user: User,
grid_size: float = 500.0,
max_concurrent: int = 50,
batch_size: int = 100,
grid_points: Optional[List[tuple]] = None
) -> Dict[str, Any]:
"""
    Main entry point for scraping Kanton Zürich parcel data.
    Note: Despite its name, this function currently scrapes only Kanton Zürich;
    every parcel found is additionally queried for ÖREB bauzone information.
Args:
current_user: User for database operations
grid_size: Size of grid cells in meters (default: 500m)
max_concurrent: Maximum concurrent API requests (default: 50)
batch_size: Number of parcels to process before saving (default: 100)
grid_points: Optional list of (x, y) coordinates to query (must be within Zürich bounds)
Returns:
Dictionary with scraping statistics and results
"""
scraper = SwissTopoScraper(
current_user=current_user,
grid_size=grid_size,
max_concurrent=max_concurrent,
batch_size=batch_size
)
stats = await scraper.scrape(grid_points=grid_points)
return {
"success": True,
"stats": {
"total_queries": stats.total_queries,
"successful_queries": stats.successful_queries,
"failed_queries": stats.failed_queries,
"unique_parcels_found": stats.unique_parcels_found,
"parcels_saved": stats.parcels_saved,
"parcels_skipped": stats.parcels_skipped,
"error_count": len(stats.errors),
"errors": stats.errors[:10] # Return first 10 errors
}
}
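
# Hedged usage sketch (assumes an authenticated `User` instance named `user`):
#
#     import asyncio
#     result = asyncio.run(scrape_switzerland(
#         current_user=user,
#         grid_size=1000.0,   # coarser grid -> fewer API queries
#         max_concurrent=10,
#         batch_size=50
#     ))
#     print(result["stats"]["parcels_saved"])
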

View file

@@ -0,0 +1,879 @@
"""
Real Estate scraping routes for the backend API.
Implements endpoints for scraping real estate data from external sources.
"""
import logging
import json
import aiohttp
import asyncio
from typing import Optional, Dict, Any
from fastapi import APIRouter, HTTPException, Depends, Body, Request, Query, status
# Import auth modules
from modules.auth import limiter, getCurrentUser
# Import models
from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelRealEstate import (
Gemeinde,
Kanton,
Dokument,
Kontext,
DokumentTyp,
)
# Import interfaces
from modules.interfaces.interfaceDbRealEstateObjects import getInterface as getRealEstateInterface
from modules.interfaces.interfaceDbComponentObjects import getInterface as getComponentInterface
# Import scraping script
from modules.features.realEstate.scrapeSwissTopo import scrape_switzerland
# Import Swiss Topo MapServer connector
from modules.connectors.connectorSwissTopoMapServer import SwissTopoMapServerConnector
from modules.connectors.connectorOerebWfs import OerebWfsConnector
# Import Tavily connector for BZO document search
from modules.aicore.aicorePluginTavily import AiTavily
# Configure logger
logger = logging.getLogger(__name__)
# Create router for real estate scraping endpoints
router = APIRouter(
prefix="/api/realestate",
tags=["Real Estate Scraping"],
responses={
404: {"description": "Not found"},
400: {"description": "Bad request"},
401: {"description": "Unauthorized"},
403: {"description": "Forbidden"},
500: {"description": "Internal server error"}
}
)
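
# The three routes below repeat the same CSRF validation. A possible shared
# helper (sketch only, not wired into the routes):
#
#     def _validate_csrf_token(request: Request, route: str, user_id: str) -> None:
#         token = request.headers.get("X-CSRF-Token") or request.headers.get("x-csrf-token")
#         if not token:
#             logger.warning(f"CSRF token missing for {route} from user {user_id}")
#             raise HTTPException(
#                 status_code=status.HTTP_403_FORBIDDEN,
#                 detail="CSRF token missing. Please include X-CSRF-Token header."
#             )
#         if not isinstance(token, str) or not (16 <= len(token) <= 64):
#             raise HTTPException(
#                 status_code=status.HTTP_403_FORBIDDEN,
#                 detail="Invalid CSRF token format"
#             )
#         try:
#             int(token, 16)
#         except ValueError:
#             raise HTTPException(
#                 status_code=status.HTTP_403_FORBIDDEN,
#                 detail="Invalid CSRF token format"
#             )
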
@router.post("/scrape-switzerland", response_model=Dict[str, Any])
@limiter.limit("5/hour") # Limit to 5 requests per hour (scraping is resource-intensive)
async def scrape_switzerland_route(
request: Request,
body: Dict[str, Any] = Body(..., description="Scraping parameters"),
currentUser: User = Depends(getCurrentUser)
) -> Dict[str, Any]:
"""
Scrape Kanton Zürich systematically using Swiss Topo connector and save parcel data to database.
This endpoint divides Kanton Zürich into a grid and queries parcels at each grid point,
then deduplicates and saves unique parcels to the database. For each parcel, it also
queries the ÖREB WFS service to retrieve bauzone information.
**WARNING**: This is a resource-intensive operation that may take a long time
and make many API requests. Use with caution.
Request Body:
{
"grid_size": 500.0, // Grid cell size in meters (default: 500m)
"max_concurrent": 50, // Maximum concurrent API requests (default: 50)
"batch_size": 100 // Number of parcels to process before saving (default: 100)
}
Headers:
- X-CSRF-Token: CSRF token (required for security)
Returns:
{
"success": true,
"stats": {
"total_queries": 1234,
"successful_queries": 1200,
"failed_queries": 34,
"unique_parcels_found": 500,
"parcels_saved": 450,
"parcels_skipped": 50,
"error_count": 5,
"errors": [...]
}
}
Example:
- POST /api/realestate/scrape-switzerland
Body: {"grid_size": 1000.0, "max_concurrent": 5, "batch_size": 50}
"""
try:
# Validate CSRF token
csrf_token = request.headers.get("X-CSRF-Token") or request.headers.get("x-csrf-token")
if not csrf_token:
logger.warning(f"CSRF token missing for POST /api/realestate/scrape-switzerland from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="CSRF token missing. Please include X-CSRF-Token header."
)
# Basic CSRF token format validation
if not isinstance(csrf_token, str) or len(csrf_token) < 16 or len(csrf_token) > 64:
logger.warning(f"Invalid CSRF token format for POST /api/realestate/scrape-switzerland from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
# Validate token is hex string
try:
int(csrf_token, 16)
except ValueError:
logger.warning(f"CSRF token is not a valid hex string for POST /api/realestate/scrape-switzerland from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
# Extract parameters from body with defaults
grid_size = body.get("grid_size", 500.0)
max_concurrent = body.get("max_concurrent", 50)
batch_size = body.get("batch_size", 100)
# Validate parameters
if grid_size <= 0 or grid_size > 10000:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="grid_size must be between 0 and 10000 meters"
)
if max_concurrent <= 0 or max_concurrent > 200:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="max_concurrent must be between 1 and 200"
)
if batch_size <= 0 or batch_size > 1000:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="batch_size must be between 1 and 1000"
)
logger.info(
f"Starting Switzerland scraping for user {currentUser.id} (mandate: {currentUser.mandateId}) "
f"with grid_size={grid_size}, max_concurrent={max_concurrent}, batch_size={batch_size}"
)
# Run scraping operation
result = await scrape_switzerland(
current_user=currentUser,
grid_size=grid_size,
max_concurrent=max_concurrent,
batch_size=batch_size
)
logger.info(
f"Scraping completed for user {currentUser.id}: "
f"{result['stats']['parcels_saved']} parcels saved"
)
return result
except HTTPException:
raise
except ValueError as e:
logger.error(f"Validation error in scrape_switzerland_route: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Validation error: {str(e)}"
)
except Exception as e:
logger.error(f"Error scraping Switzerland: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error scraping Switzerland: {str(e)}"
)
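
# Example client call (sketch; assumes an authenticated aiohttp ClientSession
# named `session` and a hypothetical `csrf_token` variable):
#
#     async with session.post(
#         "/api/realestate/scrape-switzerland",
#         json={"grid_size": 1000.0, "max_concurrent": 10, "batch_size": 50},
#         headers={"X-CSRF-Token": csrf_token},
#     ) as resp:
#         stats = (await resp.json())["stats"]
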
@router.get("/gemeinden", response_model=Dict[str, Any])
@limiter.limit("60/minute")
async def get_all_gemeinden(
request: Request,
only_current: bool = Query(True, description="Only return current municipalities (exclude historical)"),
currentUser: User = Depends(getCurrentUser)
) -> Dict[str, Any]:
"""
Fetch all Gemeinden (municipalities) from the Swiss Topo MapServer connector
and save them to the database.
This endpoint:
1. Fetches all Swiss municipalities from the Swiss Federal Office of Topography
2. Saves them to the database (skipping duplicates based on BFS number)
3. Creates Kantone (cantons) as needed
4. Returns statistics about the import operation
Query Parameters:
- only_current: If True, only return current municipalities (default: True).
If False, return all municipalities including historical ones.
Headers:
- X-CSRF-Token: CSRF token (required for security)
Returns:
{
"gemeinden": [
{
"id": "uuid",
"mandateId": "uuid",
"label": "Bern",
"id_kanton": "uuid",
"kontextInformationen": [...],
...
},
...
],
"count": 2162,
"stats": {
"gemeinden_created": 2100,
"gemeinden_skipped": 62,
"kantone_created": 26,
"error_count": 0,
"errors": []
}
}
Example:
- GET /api/realestate/gemeinden
- GET /api/realestate/gemeinden?only_current=false
"""
try:
# Validate CSRF token
csrf_token = request.headers.get("X-CSRF-Token") or request.headers.get("x-csrf-token")
if not csrf_token:
logger.warning(f"CSRF token missing for GET /api/realestate/gemeinden from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="CSRF token missing. Please include X-CSRF-Token header."
)
# Basic CSRF token format validation
if not isinstance(csrf_token, str) or len(csrf_token) < 16 or len(csrf_token) > 64:
logger.warning(f"Invalid CSRF token format for GET /api/realestate/gemeinden from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
# Validate token is hex string
try:
int(csrf_token, 16)
except ValueError:
logger.warning(f"CSRF token is not a valid hex string for GET /api/realestate/gemeinden from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
logger.info(f"Fetching all Gemeinden for user {currentUser.id} (mandate: {currentUser.mandateId}), only_current={only_current}")
# Initialize connectors and fetch all gemeinden
oereb_connector = OerebWfsConnector()
connector = SwissTopoMapServerConnector(oereb_connector=oereb_connector)
gemeinden_data = await connector.get_all_gemeinden(only_current=only_current)
# Get interface for database operations
realEstateInterface = getRealEstateInterface(currentUser)
# Statistics
gemeinden_created = 0
gemeinden_skipped = 0
kantone_created = 0
errors = []
# Cache for Kanton UUIDs
kanton_cache: Dict[str, str] = {}
# Helper function to find Gemeinde by BFS number
def find_gemeinde_by_bfs_nummer(bfs_nummer: str) -> Optional[Gemeinde]:
"""Find existing Gemeinde by BFS number (stored in kontextInformationen)."""
try:
gemeinden = realEstateInterface.getGemeinden(
recordFilter={"mandateId": currentUser.mandateId}
)
for gemeinde in gemeinden:
# Check kontextInformationen for bfs_nummer
for kontext in gemeinde.kontextInformationen:
try:
kontext_data = json.loads(kontext.inhalt) if isinstance(kontext.inhalt, str) else kontext.inhalt
if isinstance(kontext_data, dict):
if str(kontext_data.get("bfs_nummer")) == str(bfs_nummer):
return gemeinde
except (json.JSONDecodeError, AttributeError):
continue
return None
except Exception as e:
logger.error(f"Error finding Gemeinde by BFS number {bfs_nummer}: {e}", exc_info=True)
return None
# Helper function to get or create Kanton
def get_or_create_kanton(kanton_abk: str) -> Optional[str]:
"""Get or create a Kanton by abbreviation."""
nonlocal kantone_created, errors
if not kanton_abk:
return None
# Check cache first
if kanton_abk in kanton_cache:
return kanton_cache[kanton_abk]
# Check if exists
kantone = realEstateInterface.getKantone(
recordFilter={
"mandateId": currentUser.mandateId,
"abk": kanton_abk
}
)
if kantone:
kanton_cache[kanton_abk] = kantone[0].id
return kantone[0].id
# Create new Kanton
try:
# Map common abbreviations to full names
kanton_names = {
"AG": "Aargau", "AI": "Appenzell Innerrhoden", "AR": "Appenzell Ausserrhoden",
"BE": "Bern", "BL": "Basel-Landschaft", "BS": "Basel-Stadt",
"FR": "Freiburg", "GE": "Genf", "GL": "Glarus", "GR": "Graubünden",
"JU": "Jura", "LU": "Luzern", "NE": "Neuenburg", "NW": "Nidwalden",
"OW": "Obwalden", "SG": "St. Gallen", "SH": "Schaffhausen", "SO": "Solothurn",
"SZ": "Schwyz", "TG": "Thurgau", "TI": "Tessin", "UR": "Uri",
"VD": "Waadt", "VS": "Wallis", "ZG": "Zug", "ZH": "Zürich"
}
kanton_label = kanton_names.get(kanton_abk, kanton_abk)
kanton = Kanton(
mandateId=currentUser.mandateId,
label=kanton_label,
abk=kanton_abk
)
created_kanton = realEstateInterface.createKanton(kanton)
if created_kanton and created_kanton.id:
kanton_cache[kanton_abk] = created_kanton.id
kantone_created += 1
logger.info(f"Created new Kanton: {kanton_label} ({kanton_abk})")
return created_kanton.id
except Exception as e:
error_msg = f"Error creating Kanton {kanton_abk}: {e}"
logger.error(error_msg, exc_info=True)
errors.append(error_msg)
return None
# Process each gemeinde and save to database
saved_gemeinden = []
for gemeinde_data in gemeinden_data:
try:
gemeinde_name = gemeinde_data.get("name")
bfs_nummer = gemeinde_data.get("bfs_nummer")
kanton_abk = gemeinde_data.get("kanton")
if not gemeinde_name or not bfs_nummer:
logger.warning(f"Skipping Gemeinde with missing data: {gemeinde_data}")
gemeinden_skipped += 1
continue
# Check if Gemeinde already exists
existing_gemeinde = find_gemeinde_by_bfs_nummer(str(bfs_nummer))
if existing_gemeinde:
logger.debug(f"Gemeinde {gemeinde_name} (BFS: {bfs_nummer}) already exists, skipping")
gemeinden_skipped += 1
saved_gemeinden.append(existing_gemeinde.model_dump() if hasattr(existing_gemeinde, 'model_dump') else existing_gemeinde)
continue
# Get or create Kanton
kanton_id = get_or_create_kanton(kanton_abk) if kanton_abk else None
# Create new Gemeinde
gemeinde = Gemeinde(
mandateId=currentUser.mandateId,
label=gemeinde_name,
id_kanton=kanton_id,
kontextInformationen=[
Kontext(
thema="BFS Nummer",
inhalt=json.dumps({"bfs_nummer": bfs_nummer}, ensure_ascii=False)
)
]
)
created_gemeinde = realEstateInterface.createGemeinde(gemeinde)
if created_gemeinde and created_gemeinde.id:
gemeinden_created += 1
logger.info(f"Created new Gemeinde: {gemeinde_name} (BFS: {bfs_nummer})")
saved_gemeinden.append(created_gemeinde.model_dump() if hasattr(created_gemeinde, 'model_dump') else created_gemeinde)
else:
error_msg = f"Failed to create Gemeinde {gemeinde_name} (BFS: {bfs_nummer})"
logger.error(error_msg)
errors.append(error_msg)
gemeinden_skipped += 1
except Exception as e:
error_msg = f"Error processing Gemeinde {gemeinde_data.get('name', 'Unknown')}: {str(e)}"
logger.error(error_msg, exc_info=True)
errors.append(error_msg)
gemeinden_skipped += 1
logger.info(
f"Gemeinden import completed: {gemeinden_created} created, "
f"{gemeinden_skipped} skipped, {kantone_created} Kantone created"
)
return {
"gemeinden": saved_gemeinden,
"count": len(saved_gemeinden),
"stats": {
"gemeinden_created": gemeinden_created,
"gemeinden_skipped": gemeinden_skipped,
"kantone_created": kantone_created,
"error_count": len(errors),
"errors": errors[:10] # Return first 10 errors
}
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error fetching all Gemeinden: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error fetching Gemeinden: {str(e)}"
)
def _get_language_from_kanton(kanton_abk: Optional[str]) -> str:
"""
Determine language (German/French/Italian) based on Kanton abbreviation.
Args:
kanton_abk: Kanton abbreviation (e.g., 'ZH', 'VD', 'TI')
Returns:
Language code: 'de' (German), 'fr' (French), or 'it' (Italian)
"""
if not kanton_abk:
return 'de' # Default to German
# French-speaking cantons
french_cantons = {'VD', 'GE', 'NE', 'JU'}
# Italian-speaking canton
italian_cantons = {'TI'}
kanton_upper = kanton_abk.upper()
if kanton_upper in french_cantons:
return 'fr'
elif kanton_upper in italian_cantons:
return 'it'
else:
return 'de' # Default to German
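
# Examples (behaviour of the mapping above):
#
#     _get_language_from_kanton("VD")  -> "fr"
#     _get_language_from_kanton("ti")  -> "it"   (case-insensitive)
#     _get_language_from_kanton("ZH")  -> "de"
#     _get_language_from_kanton(None)  -> "de"   (default)
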
def _get_bzo_search_query(gemeinde_label: str, language: str) -> str:
"""
Generate language-specific BZO search query for a Gemeinde.
Args:
gemeinde_label: Name of the Gemeinde
language: Language code ('de', 'fr', 'it')
Returns:
Search query string
"""
if language == 'fr':
# French: Plan d'aménagement local or Règlement de construction
return f"Plan d'aménagement local {gemeinde_label} OR Règlement de construction {gemeinde_label}"
elif language == 'it':
# Italian: Piano di utilizzazione or Regolamento edilizio
return f"Piano di utilizzazione {gemeinde_label} OR Regolamento edilizio {gemeinde_label}"
else:
        # German: Bau- und Zonenordnung
        return f"Bau- und Zonenordnung {gemeinde_label}"
@router.post("/gemeinden/fetch-bzo-documents", response_model=Dict[str, Any])
@limiter.limit("10/hour") # Resource-intensive operation
async def fetch_bzo_documents(
request: Request,
currentUser: User = Depends(getCurrentUser)
) -> Dict[str, Any]:
"""
    Search for and download Bau- und Zonenordnung (BZO) documents for all Gemeinden.
This endpoint:
1. Fetches all Gemeinden from the database
2. For each Gemeinde, determines language based on Kanton
3. Uses Tavily search to find BZO documents (up to 5 results)
4. Downloads all PDF files found and stores them with content
5. Creates Dokument records for each PDF and links them to Gemeinde's dokumente field
6. Skips Gemeinden that already have BZO documents
Note: If Tavily returns multiple PDF results, all of them will be downloaded
and saved as separate Dokument records.
Headers:
- X-CSRF-Token: CSRF token (required for security)
Returns:
{
"success": true,
"stats": {
"gemeinden_processed": 100,
"documents_created": 85,
"documents_skipped": 15,
"errors": []
},
"results": [
{
"gemeinde_id": "...",
"gemeinde_label": "Zürich",
"status": "created|skipped|error",
"dokument_ids": ["...", "..."], // List of created document IDs (can be multiple)
"error": null
}
]
}
"""
try:
# Validate CSRF token
csrf_token = request.headers.get("X-CSRF-Token") or request.headers.get("x-csrf-token")
if not csrf_token:
logger.warning(f"CSRF token missing for POST /api/realestate/gemeinden/fetch-bzo-documents from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="CSRF token missing. Please include X-CSRF-Token header."
)
# Basic CSRF token format validation
if not isinstance(csrf_token, str) or len(csrf_token) < 16 or len(csrf_token) > 64:
logger.warning(f"Invalid CSRF token format for POST /api/realestate/gemeinden/fetch-bzo-documents from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
# Validate token is hex string
try:
int(csrf_token, 16)
except ValueError:
logger.warning(f"CSRF token is not a valid hex string for POST /api/realestate/gemeinden/fetch-bzo-documents from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
logger.info(f"Starting BZO document fetch for user {currentUser.id} (mandate: {currentUser.mandateId})")
# Get interfaces
realEstateInterface = getRealEstateInterface(currentUser)
componentInterface = getComponentInterface(currentUser)
# Initialize Tavily connector
tavily = AiTavily()
# Get all Gemeinden
gemeinden = realEstateInterface.getGemeinden(
recordFilter={"mandateId": currentUser.mandateId}
)
logger.info(f"Found {len(gemeinden)} Gemeinden to process")
# Statistics
stats = {
"gemeinden_processed": 0,
"documents_created": 0,
"documents_skipped": 0,
"errors": []
}
results = []
# Process each Gemeinde
for gemeinde in gemeinden:
gemeinde_result = {
"gemeinde_id": gemeinde.id,
"gemeinde_label": gemeinde.label,
"status": None,
"dokument_ids": [], # Changed to list to support multiple documents
"error": None
}
try:
stats["gemeinden_processed"] += 1
# Check if Gemeinde already has a BZO document
existing_bzo = False
if gemeinde.dokumente:
for doc in gemeinde.dokumente:
                        # Check if it's a BZO document by label or dokumentTyp
                        label_upper = doc.label.upper() if doc.label else ""
                        has_bzo_label = any(term in label_upper for term in (
                            "BZO", "ZONENORDNUNG",  # covers hyphenated and unhyphenated spellings
                            "PLAN D'AMÉNAGEMENT", "RÈGLEMENT DE CONSTRUCTION",
                            "PIANO DI UTILIZZAZIONE", "REGOLAMENTO EDILIZIO"
                        ))
                        has_bzo_typ = doc.dokumentTyp in (DokumentTyp.GEMEINDE_BZO_AKTUELL, DokumentTyp.GEMEINDE_BZO_REVISION)
                        if has_bzo_label or has_bzo_typ:
                            existing_bzo = True
                            break
if existing_bzo:
logger.debug(f"Gemeinde {gemeinde.label} already has BZO document, skipping")
gemeinde_result["status"] = "skipped"
stats["documents_skipped"] += 1
results.append(gemeinde_result)
continue
# Get Kanton to determine language
kanton_abk = None
if gemeinde.id_kanton:
kanton = realEstateInterface.getKanton(gemeinde.id_kanton)
if kanton:
kanton_abk = kanton.abk
# Determine language
language = _get_language_from_kanton(kanton_abk)
# Generate search query
search_query = _get_bzo_search_query(gemeinde.label, language)
logger.info(f"Searching for BZO document for {gemeinde.label} (language: {language}) with query: {search_query}")
# Search with Tavily using the private _search method
search_results = await tavily._search(
query=search_query,
maxResults=5,
country="switzerland"
)
if not search_results:
logger.warning(f"No search results found for {gemeinde.label}")
gemeinde_result["status"] = "error"
gemeinde_result["error"] = "No search results found"
stats["errors"].append(f"{gemeinde.label}: No search results found")
results.append(gemeinde_result)
continue
# Find all PDF URLs from search results
pdf_urls = []
for result in search_results:
                    url = result.url.lower()
                    # the substring check already covers URLs ending in '.pdf'
                    if 'pdf' in url:
                        pdf_urls.append(result.url)
# If no PDF URLs found, try to use all results (they might be PDFs even without .pdf extension)
if not pdf_urls:
pdf_urls = [result.url for result in search_results]
logger.info(f"No explicit PDF URLs found for {gemeinde.label}, trying all {len(pdf_urls)} results")
logger.info(f"Found {len(pdf_urls)} potential PDF documents for {gemeinde.label}")
# Helper function to download a single PDF
async def download_pdf(pdf_url: str) -> Optional[bytes]:
"""Download a PDF from a URL with retry logic."""
max_retries = 3
retry_delay = 2
for attempt in range(max_retries):
try:
# Create headers - use minimal headers on retry after 406 error
if attempt > 0:
# Minimal headers for retry
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': '*/*'
}
else:
# Full headers for first attempt
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'application/pdf,application/octet-stream,*/*',
'Accept-Language': 'de-DE,de;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
timeout = aiohttp.ClientTimeout(total=30, connect=10)
async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
async with session.get(pdf_url, allow_redirects=True) as response:
if response.status == 200:
pdf_content = await response.read()
if not pdf_content or len(pdf_content) < 100: # Minimum size check
raise Exception("Downloaded file is too small or empty")
# Verify it's actually a PDF (check PDF magic bytes)
if not pdf_content.startswith(b'%PDF'):
# Check if it's HTML (common error page)
if pdf_content.startswith(b'<') or pdf_content.startswith(b'<!DOCTYPE'):
raise Exception("Server returned HTML instead of PDF")
logger.warning(f"Downloaded file from {pdf_url} doesn't appear to be a PDF, but continuing anyway")
# Success - return content
return pdf_content
elif response.status == 406:
# Not Acceptable - try with minimal headers on next attempt
logger.warning(f"HTTP 406 for {pdf_url}, will retry with minimal headers (attempt {attempt + 1}/{max_retries})")
if attempt < max_retries - 1:
await asyncio.sleep(retry_delay)
continue
else:
raise Exception(f"HTTP {response.status} (Not Acceptable) - server rejected request after {max_retries} attempts")
                                    else:
                                        # A 3xx status here means allow_redirects=True could not
                                        # resolve the redirect chain; the body would be the redirect
                                        # page rather than a PDF, so treat it as an error
                                        raise Exception(f"HTTP {response.status} when downloading PDF")
except asyncio.TimeoutError:
logger.warning(f"Timeout downloading PDF from {pdf_url} (attempt {attempt + 1}/{max_retries})")
if attempt < max_retries - 1:
await asyncio.sleep(retry_delay)
continue
else:
raise Exception("Connection timeout after retries")
except aiohttp.ClientError as e:
logger.warning(f"Connection error downloading PDF from {pdf_url} (attempt {attempt + 1}/{max_retries}): {str(e)}")
if attempt < max_retries - 1:
await asyncio.sleep(retry_delay)
continue
else:
raise Exception(f"Connection error: {str(e)}")
                        except Exception:
                            # Any other error (e.g. invalid content) fails fast without retrying
                            raise
return None
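
                # Effective retry policy of download_pdf (summary): up to 3 attempts,
                # 2 s apart; attempt 1 sends browser-like headers, later attempts use
                # minimal headers (works around HTTP 406); timeouts, client errors and
                # 406 responses are retried, all other failures abort immediately.
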
# Process all PDF URLs
created_dokumente = []
current_dokumente = list(gemeinde.dokumente) if gemeinde.dokumente else []
# Sanitize Gemeinde name for filename
safe_name = "".join(c for c in gemeinde.label if c.isalnum() or c in (' ', '-', '_')).strip()
safe_name = safe_name.replace(' ', '_')
if not safe_name:
safe_name = "Gemeinde"
# Determine base label based on language
if language == 'fr':
base_doc_label = f"Plan d'aménagement local {gemeinde.label}"
elif language == 'it':
base_doc_label = f"Piano di utilizzazione {gemeinde.label}"
else:
base_doc_label = f"BZO {gemeinde.label}"
# Process each PDF URL
for idx, pdf_url in enumerate(pdf_urls):
try:
logger.info(f"Downloading PDF {idx + 1}/{len(pdf_urls)} from {pdf_url} for {gemeinde.label}")
pdf_content = await download_pdf(pdf_url)
if not pdf_content:
logger.warning(f"Failed to download PDF from {pdf_url} for {gemeinde.label}")
continue
# Additional validation
if len(pdf_content) < 100:
logger.warning(f"Downloaded file from {pdf_url} is too small for {gemeinde.label}")
continue
# Create unique file name (add index if multiple documents)
if len(pdf_urls) > 1:
file_name = f"BZO_{safe_name}_{idx + 1}.pdf"
doc_label = f"{base_doc_label} ({idx + 1})"
else:
file_name = f"BZO_{safe_name}.pdf"
doc_label = base_doc_label
# Store file using ComponentObjects
try:
file_item = componentInterface.createFile(
name=file_name,
mimeType="application/pdf",
content=pdf_content
)
# Store file data
componentInterface.createFileData(file_item.id, pdf_content)
logger.info(f"Stored file {file_name} with ID {file_item.id} for {gemeinde.label}")
except Exception as e:
logger.error(f"Error storing file {file_name} for {gemeinde.label}: {str(e)}", exc_info=True)
stats["errors"].append(f"{gemeinde.label}: File storage failed for {pdf_url} - {str(e)}")
continue
# Create Dokument record
dokument = Dokument(
mandateId=currentUser.mandateId,
label=doc_label,
versionsbezeichnung="Aktuell",
dokumentTyp=DokumentTyp.GEMEINDE_BZO_AKTUELL,
dokumentReferenz=file_item.id, # FileId from ComponentObjects
quelle=pdf_url, # Original URL
mimeType="application/pdf",
kategorienTags=["BZO", "Bauordnung", gemeinde.label]
)
# Create Dokument record in the Dokument table
created_dokument = realEstateInterface.createDokument(dokument)
logger.info(f"Created Dokument record with ID {created_dokument.id} for {gemeinde.label} (from {pdf_url})")
created_dokumente.append(created_dokument)
current_dokumente.append(created_dokument)
gemeinde_result["dokument_ids"].append(created_dokument.id)
except Exception as e:
logger.error(f"Error processing PDF {pdf_url} for {gemeinde.label}: {str(e)}", exc_info=True)
stats["errors"].append(f"{gemeinde.label}: Error processing PDF {pdf_url} - {str(e)}")
continue
# Update Gemeinde with all new dokumente
if created_dokumente:
updated_gemeinde = realEstateInterface.updateGemeinde(
gemeinde.id,
{"dokumente": current_dokumente}
)
if updated_gemeinde:
logger.info(f"Successfully created {len(created_dokumente)} BZO document(s) for {gemeinde.label}")
gemeinde_result["status"] = "created"
stats["documents_created"] += len(created_dokumente)
else:
raise Exception("Failed to update Gemeinde")
else:
# No documents were successfully created
gemeinde_result["status"] = "error"
gemeinde_result["error"] = "No PDFs could be downloaded or processed"
stats["errors"].append(f"{gemeinde.label}: No PDFs could be downloaded or processed")
except Exception as e:
logger.error(f"Error processing Gemeinde {gemeinde.label}: {str(e)}", exc_info=True)
gemeinde_result["status"] = "error"
gemeinde_result["error"] = str(e)
stats["errors"].append(f"{gemeinde.label}: {str(e)}")
results.append(gemeinde_result)
logger.info(
f"BZO document fetch completed: {stats['documents_created']} created, "
f"{stats['documents_skipped']} skipped, {len(stats['errors'])} errors"
)
return {
"success": True,
"stats": stats,
"results": results
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error fetching BZO documents: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error fetching BZO documents: {str(e)}"
)