feat: fetch oereb kataster and return Wohnzone

Ida Dittrich 2026-01-26 07:14:56 +01:00
parent 0d1c30d4c6
commit fb6bac7d92
7 changed files with 3343 additions and 0 deletions

View file

@@ -0,0 +1,525 @@
"""
ÖREB WFS Connector
This connector handles interactions with ÖREB (Öffentlich-rechtliche Eigentumsbeschränkungen)
WFS services for zone information retrieval.
ÖREB provides zoning information (Bauzonen) through WFS services.
"""
import logging
from typing import Dict, List, Any, Optional
import aiohttp
import xml.etree.ElementTree as ET
from shapely.geometry import Polygon
logger = logging.getLogger(__name__)
class OerebWfsConnector:
"""
Connector for ÖREB WFS services.
Provides methods for:
- Querying zone information (Bauzonen) by parcel geometry
- Retrieving zoning data from canton-specific WFS services
"""
def __init__(
self,
timeout: int = 10,
max_retries: int = 3,
retry_delay: float = 1.0
):
"""
Initialize ÖREB WFS connector.
Args:
timeout: Request timeout in seconds
max_retries: Maximum number of retry attempts
retry_delay: Initial retry delay in seconds (exponential backoff)
"""
self.timeout = aiohttp.ClientTimeout(total=timeout)
self.max_retries = max_retries
self.retry_delay = retry_delay
self._wfs_cache: Dict[str, List[Dict[str, Any]]] = {} # Cache for WFS queries by bbox
logger.info("ÖREB WFS Connector initialized")
def _get_oereb_wfs_url(self, canton: str) -> Optional[str]:
"""
Get ÖREB WFS service URL for a given canton.
Args:
canton: Canton abbreviation (e.g., "ZH", "BE")
Returns:
WFS service URL or None if canton not supported
"""
oereb_wfs_urls = {
"ZH": "https://maps.zh.ch/wfs/OerebKatasterZHWFS",
}
return oereb_wfs_urls.get(canton.upper())
def _geometry_to_shapely_polygon(self, geometry: Dict[str, Any]) -> Optional[Polygon]:
"""
Convert parcel geometry (ESRI rings or GeoJSON coordinates) to Shapely Polygon.
Args:
geometry: Geometry dictionary (ESRI rings or GeoJSON coordinates)
Returns:
Shapely Polygon or None if invalid
"""
try:
# Handle ESRI geometry format (rings)
if "rings" in geometry:
rings = geometry.get("rings", [])
if not rings or not rings[0]:
return None
# Use the first ring (exterior) for the polygon
exterior_ring = rings[0]
if len(exterior_ring) < 3:
return None
# Ensure polygon is closed
coords = list(exterior_ring)
if coords[0] != coords[-1]:
coords.append(coords[0])
return Polygon(coords)
# Handle GeoJSON format (coordinates)
elif "coordinates" in geometry:
coords = geometry.get("coordinates", [])
if not coords:
return None
# Handle Polygon coordinates: [[[x1,y1], [x2,y2], ...]]
# Flatten to get the exterior ring
def extract_exterior(coord_list, depth=0):
if depth == 0 and isinstance(coord_list, list) and len(coord_list) > 0:
# First level might be array of rings, take first one
if isinstance(coord_list[0], list) and len(coord_list[0]) > 0:
if isinstance(coord_list[0][0], list):
# This is Polygon format: [[[x,y],...]]
return extract_exterior(coord_list[0], depth + 1)
elif isinstance(coord_list[0][0], (int, float)):
                                # coord_list is already a ring: [[x,y], ...]; return the whole ring, not its first point
                                return coord_list
elif depth == 1 and isinstance(coord_list, list) and len(coord_list) > 0:
if isinstance(coord_list[0], (int, float)):
return coord_list
elif isinstance(coord_list[0], list):
return coord_list
return coord_list
exterior_coords = extract_exterior(coords)
if not exterior_coords or len(exterior_coords) < 3:
return None
# Ensure polygon is closed
coords_list = list(exterior_coords)
if coords_list[0] != coords_list[-1]:
coords_list.append(coords_list[0])
return Polygon(coords_list)
except Exception as e:
logger.debug(f"Error converting geometry to Shapely Polygon: {e}")
return None
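    # Illustrative input/output sketch for the conversion above (coordinate values invented):
    #   {"rings": [[[2683000, 1247000], [2683050, 1247000], [2683050, 1247050]]]}
    #   -> Shapely Polygon; the ring is closed automatically by appending the first point.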
def _parse_gml_geometry(self, feature_elem: ET.Element) -> Optional[Polygon]:
"""
Parse GML geometry from WFS feature element and convert to Shapely Polygon.
Args:
feature_elem: XML element containing the feature
Returns:
Shapely Polygon or None if geometry not found or invalid
"""
try:
# Common GML namespaces
namespaces = {
'gml': 'http://www.opengis.net/gml',
'gml3': 'http://www.opengis.net/gml/3.2',
'gml32': 'http://www.opengis.net/gml/3.2'
}
# Try to find polygon geometry
polygon_elem = None
for ns_prefix, ns_url in namespaces.items():
# Try different GML polygon element names
for tag_name in ['Polygon', 'polygon', 'PolygonProperty', 'geometryProperty']:
polygon_elem = feature_elem.find(f'.//{{{ns_url}}}{tag_name}')
if polygon_elem is not None:
break
# Also try without namespace prefix
polygon_elem = feature_elem.find(f'.//{tag_name}')
if polygon_elem is not None:
break
if polygon_elem is not None:
break
if polygon_elem is None:
# Try to find any geometry element
for ns_prefix, ns_url in namespaces.items():
polygon_elem = feature_elem.find(f'.//{{{ns_url}}}*')
if polygon_elem is not None and 'polygon' in polygon_elem.tag.lower():
break
if polygon_elem is None:
return None
# Extract coordinates from GML
# GML Polygon typically has exterior ring with posList or pos elements
coords = []
# Try posList (most common in GML 3.2)
for ns_prefix, ns_url in namespaces.items():
pos_list = polygon_elem.find(f'.//{{{ns_url}}}posList')
if pos_list is not None and pos_list.text:
# posList format: "x1 y1 x2 y2 x3 y3 ..."
coord_strings = pos_list.text.strip().split()
for i in range(0, len(coord_strings) - 1, 2):
if i + 1 < len(coord_strings):
x = float(coord_strings[i])
y = float(coord_strings[i + 1])
coords.append((x, y))
break
# If no posList, try pos elements
if not coords:
for ns_prefix, ns_url in namespaces.items():
pos_elems = polygon_elem.findall(f'.//{{{ns_url}}}pos')
if pos_elems:
for pos in pos_elems:
if pos.text:
parts = pos.text.strip().split()
if len(parts) >= 2:
x = float(parts[0])
y = float(parts[1])
coords.append((x, y))
break
# If still no coords, try coordinates element (GML 2)
if not coords:
for ns_prefix, ns_url in namespaces.items():
coords_elem = polygon_elem.find(f'.//{{{ns_url}}}coordinates')
if coords_elem is not None and coords_elem.text:
# GML 2 coordinates format: "x1,y1 x2,y2 ..." or "x1,y1,z1 x2,y2,z2 ..."
coord_strings = coords_elem.text.strip().split()
for coord_str in coord_strings:
parts = coord_str.split(',')
if len(parts) >= 2:
x = float(parts[0])
y = float(parts[1])
coords.append((x, y))
break
if len(coords) < 3:
return None
# Ensure polygon is closed
if coords[0] != coords[-1]:
coords.append(coords[0])
return Polygon(coords)
except Exception as e:
logger.debug(f"Error parsing GML geometry: {e}")
return None
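    # Illustrative GML 3.2 fragment handled above (coordinates invented):
    #   <gml:Polygon><gml:exterior><gml:LinearRing>
    #     <gml:posList>2683000 1247000 2683050 1247000 2683050 1247050</gml:posList>
    #   </gml:LinearRing></gml:exterior></gml:Polygon>
    #   -> [(2683000.0, 1247000.0), (2683050.0, 1247000.0), (2683050.0, 1247050.0)], then closed.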
def _calculate_bbox_from_geometry(self, geometry: Dict[str, Any]) -> Optional[str]:
"""
Calculate bounding box from geometry for WFS queries.
Args:
geometry: Geometry dictionary (ESRI rings or GeoJSON coordinates)
Returns:
Bounding box string in format "min_x,min_y,max_x,max_y" or None if invalid
"""
try:
# Handle ESRI geometry format (rings)
if "rings" in geometry:
rings = geometry.get("rings", [])
if not rings or not rings[0]:
return None
# Flatten all coordinates from all rings
all_coords = []
for ring in rings:
all_coords.extend(ring)
if not all_coords:
return None
# Calculate bbox
x_coords = [coord[0] for coord in all_coords]
y_coords = [coord[1] for coord in all_coords]
min_x = min(x_coords)
min_y = min(y_coords)
max_x = max(x_coords)
max_y = max(y_coords)
return f"{min_x},{min_y},{max_x},{max_y}"
# Handle GeoJSON format (coordinates)
elif "coordinates" in geometry:
coords = geometry.get("coordinates", [])
if not coords:
return None
# Flatten coordinates based on geometry type
def flatten_coords(coord_list, depth=0):
if depth < 2:
result = []
for item in coord_list:
if isinstance(item, (int, float)):
return coord_list
result.extend(flatten_coords(item, depth + 1))
return result
return coord_list
flat_coords = flatten_coords(coords)
if not flat_coords or len(flat_coords) < 2:
return None
x_coords = [flat_coords[i] for i in range(0, len(flat_coords), 2)]
y_coords = [flat_coords[i+1] for i in range(0, len(flat_coords)-1, 2)]
min_x = min(x_coords)
min_y = min(y_coords)
max_x = max(x_coords)
max_y = max(y_coords)
return f"{min_x},{min_y},{max_x},{max_y}"
except Exception as e:
logger.debug(f"Error calculating bbox from geometry: {e}")
return None
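    # Illustrative result (values invented): for a ring spanning x 2683000..2683050
    # and y 1247000..1247050, this returns "2683000,1247000,2683050,1247050".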
async def _query_wfs_get_feature(
self,
wfs_url: str,
type_name: str,
bbox: str,
srs: str = "EPSG:2056"
) -> List[Dict[str, Any]]:
"""
Query WFS GetFeature to retrieve zone features within a bounding box.
Args:
wfs_url: WFS service URL
type_name: Feature type name (e.g., "nutzungsplanung")
bbox: Bounding box string "min_x,min_y,max_x,max_y"
srs: Spatial reference system (default: EPSG:2056 for LV95)
Returns:
List of feature dictionaries with properties and attributes
"""
        # Use WFS 1.1.0 only; this version is known to work with the supported services
params = {
"service": "WFS",
"version": "1.1.0",
"request": "GetFeature",
"typeName": type_name,
"bbox": bbox,
"srsName": srs
}
logger.debug(f"Querying WFS GetFeature: {wfs_url} with typeName={type_name}, bbox={bbox}")
try:
async with aiohttp.ClientSession(timeout=self.timeout) as session:
async with session.get(wfs_url, params=params) as response:
if response.status != 200:
logger.debug(f"WFS GetFeature returned status {response.status}")
return []
# Parse XML/GML response
xml_content = await response.text()
try:
root = ET.fromstring(xml_content)
features = []
members = root.findall('.//{http://www.opengis.net/gml}featureMember') or \
root.findall('.//featureMember')
for member in members:
attrs = {}
# Find feature element
feature_elem = member
for child in member:
if child.tag and ('nutzung' in child.tag.lower() or 'plan' in child.tag.lower()):
feature_elem = child
break
# Extract attributes
for elem in feature_elem.iter():
if elem.tag and elem.text and elem.text.strip():
tag_lower = elem.tag.lower()
if any(term in tag_lower for term in [
'pos', 'coordinates', 'point', 'polygon', 'linestring',
'geometry', 'boundedby', 'envelope', 'gml'
]):
continue
tag_name = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag
if ':' in tag_name:
tag_name = tag_name.split(':')[-1]
if tag_name not in attrs:
attrs[tag_name] = elem.text.strip()
# Parse geometry from GML
geometry_polygon = self._parse_gml_geometry(feature_elem)
if attrs:
feature_dict = {"properties": attrs, "attributes": attrs}
if geometry_polygon:
feature_dict["geometry"] = geometry_polygon
features.append(feature_dict)
return features
except ET.ParseError as e:
logger.debug(f"Failed to parse WFS XML response: {e}")
return []
except Exception as e:
logger.debug(f"Error parsing WFS XML: {e}")
return []
except Exception as e:
logger.debug(f"WFS GetFeature query failed: {e}")
return []
async def query_zone_layer(
self,
egrid: str,
x: float,
y: float,
canton: Optional[str] = None,
geometry: Optional[Dict[str, Any]] = None
) -> List[Dict[str, Any]]:
"""
Query zone information using ÖREB WFS service.
Returns only zones that contain the parcel based on the parcel geometry.
Args:
egrid: EGRID identifier (not currently used but kept for API compatibility)
x: X coordinate (LV95) - not used but kept for compatibility
y: Y coordinate (LV95) - not used but kept for compatibility
canton: Canton abbreviation (e.g., "ZH", "BE")
geometry: Parcel geometry dictionary (ESRI rings or GeoJSON coordinates)
Returns:
List of zone dictionaries with layerBodId and attributes, or empty list if not found
"""
if not canton or not geometry:
return []
wfs_url = self._get_oereb_wfs_url(canton)
if not wfs_url:
return []
try:
bbox = self._calculate_bbox_from_geometry(geometry)
if not bbox:
return []
# Check cache
cache_key = f"{wfs_url}:{bbox}"
if cache_key in self._wfs_cache:
cached_features = self._wfs_cache[cache_key]
else:
cached_features = await self._query_wfs_get_feature(wfs_url, "nutzungsplanung", bbox)
self._wfs_cache[cache_key] = cached_features
if not cached_features:
return []
# Convert parcel geometry to Shapely Polygon for spatial validation
parcel_polygon = self._geometry_to_shapely_polygon(geometry)
if not parcel_polygon:
logger.debug("Could not convert parcel geometry to Shapely Polygon")
# Fallback to first zone if geometry conversion fails
for feature in cached_features:
attrs = feature.get("properties", feature.get("attributes", {}))
typ_gde_abkuerzung = attrs.get("typ_gde_abkuerzung")
if typ_gde_abkuerzung:
return [{
"layerBodId": "oereb_wfs",
"attributes": {"typ_gde_abkuerzung": typ_gde_abkuerzung}
}]
return []
# Find the zone that actually contains or intersects the parcel
# Since a parcel is always in exactly one zone, we check for containment first,
# then find the zone with the largest intersection area if no perfect containment is found
containing_zone = None
best_intersecting_zone = None
best_intersection_area = 0.0
for feature in cached_features:
attrs = feature.get("properties", feature.get("attributes", {}))
typ_gde_abkuerzung = attrs.get("typ_gde_abkuerzung")
if not typ_gde_abkuerzung:
continue
zone_geometry = feature.get("geometry")
if not zone_geometry:
# If geometry not parsed, skip spatial check for this feature
# But keep it as fallback if no geometry-based match is found
if not best_intersecting_zone:
best_intersecting_zone = feature
continue
try:
# Check if zone contains the parcel (most precise)
if zone_geometry.contains(parcel_polygon):
containing_zone = feature
break # Found perfect match, stop searching
# Check if zone intersects the parcel (for border cases)
if zone_geometry.intersects(parcel_polygon):
# Calculate intersection area to find the best match
intersection = zone_geometry.intersection(parcel_polygon)
if not intersection.is_empty:
intersection_area = intersection.area
# Keep the zone with the largest intersection area
if intersection_area > best_intersection_area:
best_intersection_area = intersection_area
best_intersecting_zone = feature
except Exception as e:
logger.debug(f"Error checking spatial relationship: {e}")
# If spatial check fails, keep as fallback
if not best_intersecting_zone:
best_intersecting_zone = feature
# Return the containing zone if found, otherwise the best intersecting zone
selected_feature = containing_zone or best_intersecting_zone
if selected_feature:
attrs = selected_feature.get("properties", selected_feature.get("attributes", {}))
typ_gde_abkuerzung = attrs.get("typ_gde_abkuerzung")
if typ_gde_abkuerzung:
return [{
"layerBodId": "oereb_wfs",
"attributes": {"typ_gde_abkuerzung": typ_gde_abkuerzung}
}]
return []
        except Exception as e:
            logger.debug(f"Error querying zone layer via ÖREB WFS: {e}")
            return []
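# --- Usage sketch (illustrative only; the EGRID, coordinates and geometry below are
# invented placeholder values, and running this performs a live WFS request) ---
if __name__ == "__main__":
    import asyncio

    async def _demo():
        connector = OerebWfsConnector(timeout=10)
        geometry = {
            "rings": [[
                [2683000.0, 1247000.0], [2683050.0, 1247000.0],
                [2683050.0, 1247050.0], [2683000.0, 1247050.0],
                [2683000.0, 1247000.0],
            ]]
        }
        zones = await connector.query_zone_layer(
            egrid="CH000000000000",  # placeholder EGRID
            x=2683025.0,
            y=1247025.0,
            canton="ZH",
            geometry=geometry,
        )
        # e.g. [{"layerBodId": "oereb_wfs", "attributes": {"typ_gde_abkuerzung": "W2"}}] or []
        print(zones)

    asyncio.run(_demo())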

View file

@@ -0,0 +1,199 @@
"""
Document retriever for BZO extraction pipeline.
Queries Dokument table and retrieves PDF content from ComponentObjects.
"""
import logging
from typing import List, Dict, Any, Optional
from modules.datamodels.datamodelRealEstate import Dokument, DokumentTyp, Gemeinde
from modules.interfaces.interfaceDbRealEstateObjects import RealEstateObjects
from modules.interfaces.interfaceDbComponentObjects import ComponentObjects
logger = logging.getLogger(__name__)
class BZODocumentRetriever:
"""Retrieves BZO documents from database and ComponentObjects."""
def __init__(self, realEstateInterface: RealEstateObjects, componentInterface: ComponentObjects):
"""
Initialize document retriever.
Args:
realEstateInterface: Real Estate database interface
componentInterface: ComponentObjects interface for file retrieval
"""
self.realEstateInterface = realEstateInterface
self.componentInterface = componentInterface
def get_documents_by_ids(self, dokument_ids: List[str]) -> List[Dokument]:
"""
Retrieve specific documents by their IDs.
Args:
dokument_ids: List of dokument IDs to retrieve
Returns:
List of Dokument records
"""
try:
dokumente = []
for dokument_id in dokument_ids:
dokument = self.realEstateInterface.getDokument(dokument_id)
if dokument:
dokumente.append(dokument)
else:
logger.warning(f"Dokument {dokument_id} not found")
logger.info(f"Retrieved {len(dokumente)} documents out of {len(dokument_ids)} requested")
return dokumente
except Exception as e:
logger.error(f"Error retrieving documents by IDs: {str(e)}", exc_info=True)
raise
def get_bzo_documents(self, gemeinde_id: Optional[str] = None) -> List[Dokument]:
"""
Query Dokument table for BZO documents.
Args:
gemeinde_id: Optional filter by specific Gemeinde
Returns:
List of Dokument records with BZO document type
"""
try:
# Query each document type separately (database connector doesn't support array filters)
dokumente_aktuell = self.realEstateInterface.getDokumente(
recordFilter={
"mandateId": self.realEstateInterface.mandateId,
"dokumentTyp": DokumentTyp.GEMEINDE_BZO_AKTUELL
}
)
dokumente_revision = self.realEstateInterface.getDokumente(
recordFilter={
"mandateId": self.realEstateInterface.mandateId,
"dokumentTyp": DokumentTyp.GEMEINDE_BZO_REVISION
}
)
# Combine results and deduplicate by ID
dokumente_dict = {}
for dokument in dokumente_aktuell + dokumente_revision:
dokumente_dict[dokument.id] = dokument
dokumente = list(dokumente_dict.values())
            # If gemeinde_id provided, filter by checking Gemeinde.dokumente relationship
            if gemeinde_id:
                filtered_dokumente = []
                # Fetch the Gemeinden once up front instead of re-querying per document
                gemeinden = self.realEstateInterface.getGemeinden(
                    recordFilter={"mandateId": self.realEstateInterface.mandateId}
                )
                for dokument in dokumente:
                    # Check if this dokument is linked to the specified Gemeinde
for gemeinde in gemeinden:
if gemeinde.id == gemeinde_id:
# Check if dokument.id is in gemeinde.dokumente
if gemeinde.dokumente:
for doc in gemeinde.dokumente:
if isinstance(doc, dict) and doc.get("id") == dokument.id:
filtered_dokumente.append(dokument)
break
elif hasattr(doc, "id") and doc.id == dokument.id:
filtered_dokumente.append(dokument)
break
break
dokumente = filtered_dokumente
logger.info(f"Found {len(dokumente)} BZO documents" + (f" for Gemeinde {gemeinde_id}" if gemeinde_id else ""))
return dokumente
except Exception as e:
logger.error(f"Error querying BZO documents: {str(e)}", exc_info=True)
raise
def retrieve_pdf_content(self, dokument: Dokument) -> Optional[bytes]:
"""
Retrieve PDF bytes from ComponentObjects using dokumentReferenz.
Args:
dokument: Dokument record with dokumentReferenz field
Returns:
PDF bytes or None if retrieval fails
"""
try:
if not dokument.dokumentReferenz:
logger.warning(f"Dokument {dokument.id} has no dokumentReferenz")
return None
# Retrieve PDF bytes
pdf_bytes = self.componentInterface.getFileData(dokument.dokumentReferenz)
if not pdf_bytes:
logger.warning(f"Could not retrieve PDF content for file {dokument.dokumentReferenz}")
return None
logger.debug(f"Retrieved PDF content for dokument {dokument.id} ({len(pdf_bytes)} bytes)")
return pdf_bytes
except Exception as e:
logger.error(f"Error retrieving PDF content for dokument {dokument.id}: {str(e)}", exc_info=True)
return None
def resolve_gemeinde_for_dokument(self, dokument: Dokument) -> Optional[str]:
"""
Resolve gemeinde_id for a Dokument by checking Gemeinde.dokumente relationships.
Args:
dokument: Dokument record
Returns:
gemeinde_id if found, None otherwise
"""
try:
gemeinden = self.realEstateInterface.getGemeinden(
recordFilter={"mandateId": self.realEstateInterface.mandateId}
)
for gemeinde in gemeinden:
if gemeinde.dokumente:
for doc in gemeinde.dokumente:
doc_id = doc.id if hasattr(doc, "id") else doc.get("id") if isinstance(doc, dict) else None
if doc_id == dokument.id:
logger.debug(f"Resolved gemeinde_id {gemeinde.id} for dokument {dokument.id}")
return gemeinde.id
logger.warning(f"Could not resolve gemeinde_id for dokument {dokument.id}")
return None
except Exception as e:
logger.error(f"Error resolving gemeinde for dokument {dokument.id}: {str(e)}", exc_info=True)
return None
def extract_year_from_dokument(self, dokument: Dokument) -> Optional[int]:
"""
Extract year from Dokument label or versionsbezeichnung.
Args:
dokument: Dokument record
Returns:
Year as integer if found, None otherwise
"""
import re
# Try to extract year from label
if dokument.label:
year_match = re.search(r'\b(19|20)\d{2}\b', dokument.label)
if year_match:
return int(year_match.group())
# Try to extract year from versionsbezeichnung
if dokument.versionsbezeichnung:
year_match = re.search(r'\b(19|20)\d{2}\b', dokument.versionsbezeichnung)
if year_match:
return int(year_match.group())
return None
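# --- Usage sketch (illustrative only; assumes concrete interface instances are
# provided by the surrounding application) ---
def _demo_retriever(real_estate_if: RealEstateObjects, component_if: ComponentObjects) -> None:
    retriever = BZODocumentRetriever(real_estate_if, component_if)
    for dokument in retriever.get_bzo_documents():
        pdf_bytes = retriever.retrieve_pdf_content(dokument)
        year = retriever.extract_year_from_dokument(dokument)
        gemeinde_id = retriever.resolve_gemeinde_for_dokument(dokument)
        print(dokument.id, year, gemeinde_id, len(pdf_bytes or b""))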

View file

@@ -0,0 +1,738 @@
"""
LangGraph-based pipeline for extracting structured content from BZO PDFs.
"""
import logging
import re
from typing import TypedDict, List, Dict, Any, Optional
from dataclasses import dataclass
from langgraph.graph import StateGraph, START, END
from modules.features.realEstate.bzoPdfExtractor import BZOPdfExtractor, TextBlock
from modules.features.realEstate.bzoRuleTaxonomy import RULE_TAXONOMY
logger = logging.getLogger(__name__)
# ===== State Definition =====
@dataclass
class ClassifiedBlock:
"""Classified text block."""
block: TextBlock
block_type: str # "article", "heading", "table", "other"
article_label: Optional[str] = None
article_title: Optional[str] = None
@dataclass
class Article:
"""Assembled article."""
article_label: str
article_title: Optional[str]
text: str
page_start: int
page_end: int
section_level_1: Optional[str] = None
section_level_2: Optional[str] = None
section_level_3: Optional[str] = None
zone_raw: Optional[str] = None
@dataclass
class ZoneInfo:
"""Zone information."""
zone_code: str
zone_name: str
zone_category: Optional[str] = None
zone_subcategory: Optional[str] = None
empfindlichkeitsstufe: Optional[str] = None
geschosszahl: Optional[int] = None
gewerbeerleichterung: bool = False
@dataclass
class RuleCandidate:
"""Rule candidate from pattern matching."""
rule_type: str
matched_text: str
article_text: str
page: int
is_table_rule: bool = False
    table_zones: Optional[List[str]] = None
condition_text: Optional[str] = None
@dataclass
class ParsedRule:
"""Parsed rule with structured values."""
rule_type: str
value_numeric: Optional[float]
value_text: str
unit: Optional[str]
condition_text: Optional[str]
is_table_rule: bool
table_zones: List[str]
page: int
text_snippet: str
zone_raw: Optional[str] = None
rule_scope: str = "general"
confidence: float = 0.5
class BZOExtractionState(TypedDict):
"""State for BZO extraction pipeline."""
# Input metadata
dokument_id: Optional[str]
pdf_id: str
# Extracted text blocks (stored as dicts for serialization)
text_blocks: List[Dict[str, Any]]
# Classified blocks (stored as dicts for serialization)
classified_blocks: List[Dict[str, Any]]
# Assembled articles (stored as dicts for serialization)
articles: List[Dict[str, Any]]
# Zone tracking
current_zones: Dict[str, Dict[str, Any]]
zones: List[Dict[str, Any]]
# Rule extraction (stored as dicts for serialization)
rule_candidates: List[Dict[str, Any]]
parsed_rules: List[Dict[str, Any]]
# Processing metadata
errors: List[str]
warnings: List[str]
# ===== Node Implementations =====
def extract_pdf_text(state: BZOExtractionState) -> BZOExtractionState:
"""Extract text blocks from PDF."""
try:
# PDF bytes should be passed in state context
# This is handled in run_extraction function
# State already has text_blocks populated
return state
except Exception as e:
logger.error(f"Error extracting PDF text: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"PDF extraction error: {str(e)}"]
return state
def classify_text_block(state: BZOExtractionState) -> BZOExtractionState:
"""Classify text blocks into articles, headings, tables, etc."""
try:
classified = []
for block_dict in state["text_blocks"]:
text = block_dict["text"].strip()
if not text:
continue
block_type = "other"
article_label = None
article_title = None
# Check for article patterns
article_match = re.search(r'Art\.?\s*(\d+[a-z]?)', text, re.IGNORECASE)
if article_match:
block_type = "article"
article_label = f"Art. {article_match.group(1)}"
# Try to extract title (text after article label, before first period or newline)
title_match = re.search(r'Art\.?\s*\d+[a-z]?\s+(.+?)(?:\.|$|\n)', text, re.IGNORECASE)
if title_match:
article_title = title_match.group(1).strip()
# Check for heading patterns (Roman numerals, letters, numbers)
elif re.match(r'^[A-Z]\.\s+[A-Z]', text) or re.match(r'^[IVX]+\.\s+[A-Z]', text) or re.match(r'^\d+\.\s+[A-Z]', text):
block_type = "heading"
            # Check for table patterns (tabs, or runs of multiple spaces from aligned columns)
            elif '\t' in text or (len(text.split()) > 5 and text.count('  ') > 2):
block_type = "table"
classified.append({
"block": {
"page": block_dict["page"],
"text": block_dict["text"],
"block_id": block_dict["block_id"],
"bbox": block_dict.get("bbox")
},
"block_type": block_type,
"article_label": article_label,
"article_title": article_title
})
# Update state with new classified blocks
existing_blocks = state.get("classified_blocks", [])
state["classified_blocks"] = existing_blocks + classified
return state
except Exception as e:
logger.error(f"Error classifying text blocks: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"Classification error: {str(e)}"]
return state
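# Worked example (illustrative): a block whose text starts with "Art. 13 Wohnzone W2"
# is classified as block_type="article" with article_label="Art. 13" and
# article_title="Wohnzone W2"; a block starting with "B. Bauvorschriften" becomes
# a "heading".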
def assemble_articles(state: BZOExtractionState) -> BZOExtractionState:
"""Assemble classified blocks into articles with hierarchical structure."""
try:
articles = []
current_article = None
current_section_1 = None
current_section_2 = None
current_section_3 = None
for classified_dict in state["classified_blocks"]:
block_dict = classified_dict["block"]
block = TextBlock(
page=block_dict["page"],
text=block_dict["text"],
block_id=block_dict["block_id"],
bbox=block_dict.get("bbox")
)
text = block.text.strip()
block_type = classified_dict["block_type"]
article_label = classified_dict.get("article_label")
article_title = classified_dict.get("article_title")
# Update section levels
if block_type == "heading":
# Level 1: A., B., C.
if re.match(r'^[A-Z]\.\s+', text):
current_section_1 = text.split('.', 1)[0] + '.'
current_section_2 = None
current_section_3 = None
# Level 2: I., II., III.
elif re.match(r'^[IVX]+\.\s+', text):
current_section_2 = text.split('.', 1)[0] + '.'
current_section_3 = None
# Level 3: 1., 2., 3.
elif re.match(r'^\d+\.\s+', text):
current_section_3 = text.split('.', 1)[0] + '.'
# Start new article
if article_label:
# Save previous article if exists
if current_article:
articles.append(current_article)
# Start new article
current_article = {
"article_label": article_label,
"article_title": article_title,
"text": text,
"page_start": block.page,
"page_end": block.page,
"section_level_1": current_section_1,
"section_level_2": current_section_2,
"section_level_3": current_section_3,
"zone_raw": None
}
# Continue current article
elif current_article:
current_article["text"] += "\n" + text
current_article["page_end"] = block.page
# Add last article
if current_article:
articles.append(current_article)
# Update state with new articles
existing_articles = state.get("articles", [])
state["articles"] = existing_articles + articles
return state
except Exception as e:
logger.error(f"Error assembling articles: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"Article assembly error: {str(e)}"]
return state
def detect_zone_changes(state: BZOExtractionState) -> BZOExtractionState:
"""Detect zone declarations and maintain zone scope."""
try:
zones = []
current_zones = state.get("current_zones", {})
for article_dict in state["articles"]:
text = article_dict.get("text", "")
article_label = article_dict.get("article_label", "")
page_start = article_dict.get("page_start", 0)
# Pattern: "Wohnzone W2", "Zone W3", "Gewerbezone G1"
zone_patterns = [
r'(?:Wohnzone|Zone|Gewerbezone|Industriezone|Zentrumszone|Ortsbildschutzzone|Erholungszone)\s+([A-Z0-9/]+)',
r'([A-Z]\d+(?:/\d+)?(?:G)?)', # W2/30, W2/30G, Z3, K3/4
]
for pattern in zone_patterns:
matches = re.finditer(pattern, text, re.IGNORECASE)
for match in matches:
zone_code = match.group(1).upper()
# Parse zone code
gewerbeerleichterung = zone_code.endswith('G')
if gewerbeerleichterung:
zone_code_base = zone_code[:-1]
else:
zone_code_base = zone_code
# Extract geschosszahl from code (e.g., W2 -> 2, W3/50 -> 3)
geschosszahl = None
if '/' in zone_code_base:
parts = zone_code_base.split('/')
geschosszahl_match = re.search(r'(\d+)', parts[0])
if geschosszahl_match:
geschosszahl = int(geschosszahl_match.group(1))
else:
geschosszahl_match = re.search(r'(\d+)', zone_code_base)
if geschosszahl_match:
geschosszahl = int(geschosszahl_match.group(1))
# Determine zone category from context
zone_category = None
if 'Wohnzone' in text or zone_code.startswith('W'):
zone_category = "Wohnzonen"
elif 'Zentrumszone' in text or zone_code.startswith('Z'):
zone_category = "Zentrumszonen"
elif 'Gewerbezone' in text or zone_code.startswith('G'):
zone_category = "Arbeitsplatzzonen"
elif 'Industriezone' in text or zone_code.startswith('I'):
zone_category = "Arbeitsplatzzonen"
zone_info = ZoneInfo(
zone_code=zone_code,
zone_name=f"Zone {zone_code}",
zone_category=zone_category,
geschosszahl=geschosszahl,
gewerbeerleichterung=gewerbeerleichterung
)
current_zones[zone_code] = zone_info
zones.append({
"zone_code": zone_code,
"zone_name": zone_info.zone_name,
"zone_category": zone_category,
"geschosszahl": geschosszahl,
"gewerbeerleichterung": gewerbeerleichterung,
"source_article": article_label,
"page": page_start
})
# Update state with zones
state["current_zones"] = current_zones
existing_zones = state.get("zones", [])
state["zones"] = existing_zones + zones
return state
except Exception as e:
logger.error(f"Error detecting zones: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"Zone detection error: {str(e)}"]
return state
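# Worked example (illustrative): for article text containing "Wohnzone W2/30G",
# the patterns above yield zone_code="W2/30G", gewerbeerleichterung=True
# (trailing "G"), geschosszahl=2 (digits before the "/"), and
# zone_category="Wohnzonen".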
def detect_rule_candidates(state: BZOExtractionState) -> BZOExtractionState:
"""Detect rule candidates using pattern matching."""
try:
candidates = []
for article_dict in state["articles"]:
text = article_dict.get("text", "")
article_label = article_dict.get("article_label", "")
page_start = article_dict.get("page_start", 0)
# Check each rule type in taxonomy
for rule_type, rule_config in RULE_TAXONOMY.items():
patterns = rule_config.get("patterns", [])
for pattern in patterns:
# Create regex pattern (case-insensitive)
regex_pattern = re.compile(pattern, re.IGNORECASE)
matches = regex_pattern.finditer(text)
for match in matches:
# Extract context around match
start = max(0, match.start() - 100)
end = min(len(text), match.end() + 100)
context = text[start:end]
# Check for conditions (geographic, temporal, etc.)
condition_text = None
condition_patterns = [
r'(?:nördlich|südlich|östlich|westlich|oberhalb|unterhalb)\s+[^,\.]+',
r'(?:für|bei|in)\s+[^,\.]+',
]
for cond_pattern in condition_patterns:
cond_match = re.search(cond_pattern, context, re.IGNORECASE)
if cond_match:
condition_text = cond_match.group(0)
break
candidate = {
"rule_type": rule_type,
"matched_text": match.group(0),
"article_text": text,
"page": page_start,
"condition_text": condition_text,
"is_table_rule": False,
"table_zones": []
}
candidates.append(candidate)
# Update state with rule candidates
existing_candidates = state.get("rule_candidates", [])
state["rule_candidates"] = existing_candidates + candidates
return state
except Exception as e:
logger.error(f"Error detecting rule candidates: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"Rule candidate detection error: {str(e)}"]
return state
def parse_rule_values(state: BZOExtractionState) -> BZOExtractionState:
"""Parse rule values using regex (LLM fallback can be added later)."""
try:
parsed_rules = []
for candidate_dict in state["rule_candidates"]:
rule_type = candidate_dict["rule_type"]
rule_config = RULE_TAXONOMY.get(rule_type, {})
units = rule_config.get("units", [])
value_type = rule_config.get("value_type", "numeric")
# Extract value using regex
matched_text = candidate_dict["matched_text"]
article_text = candidate_dict["article_text"]
            match_pos = article_text.find(matched_text)
            text = matched_text + " " + article_text[match_pos:match_pos + 200]
value_numeric = None
value_text = matched_text
unit = None
# Try to extract numeric value
if value_type in ["numeric", "integer"]:
# Pattern: "max. 4", "30 %", "min. 3.5 m"
value_patterns = [
r'(?:max|maximal|min|mindestens|höchstens)\s*\.?\s*(\d+(?:\.\d+)?)',
r'(\d+(?:\.\d+)?)\s*(%|m|meter|metern|prozent)',
r'(\d+(?:\.\d+)?)',
]
for pattern in value_patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
try:
value_numeric = float(match.group(1))
if value_type == "integer":
value_numeric = int(value_numeric)
# Check for unit
unit_match = re.search(r'(\d+(?:\.\d+)?)\s*(%|m|meter|metern|prozent)', text, re.IGNORECASE)
if unit_match:
unit = unit_match.group(2).lower()
if unit in ["meter", "metern"]:
unit = "m"
elif unit == "prozent":
unit = "%"
break
except ValueError:
continue
# Calculate confidence
confidence = 0.5
if value_numeric is not None:
confidence = 0.8
if unit:
confidence = 0.9
# Determine zone and scope
zone_raw = None
rule_scope = "general"
# Check current zones context
if state.get("current_zones"):
# Use first zone as default (can be improved)
zone_raw = list(state["current_zones"].keys())[0] if state["current_zones"] else None
rule_scope = "zone" if zone_raw else "general"
parsed_rule = {
"rule_type": rule_type,
"value_numeric": value_numeric,
"value_text": value_text,
"unit": unit,
"condition_text": candidate_dict.get("condition_text"),
"is_table_rule": candidate_dict.get("is_table_rule", False),
"table_zones": candidate_dict.get("table_zones", []),
"page": candidate_dict["page"],
"text_snippet": value_text,
"zone_raw": zone_raw,
"rule_scope": rule_scope,
"confidence": confidence
}
parsed_rules.append(parsed_rule)
# Update state with parsed rules
existing_rules = state.get("parsed_rules", [])
state["parsed_rules"] = existing_rules + parsed_rules
return state
except Exception as e:
logger.error(f"Error parsing rule values: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"Rule parsing error: {str(e)}"]
return state
def assign_zone_and_scope(state: BZOExtractionState) -> BZOExtractionState:
"""Assign zone and scope to parsed rules."""
try:
# Rules already have zone and scope assigned in parse_rule_values
# This node can refine assignments if needed
return state
except Exception as e:
logger.error(f"Error assigning zone and scope: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"Zone/scope assignment error: {str(e)}"]
return state
def confidence_scoring(state: BZOExtractionState) -> BZOExtractionState:
"""Calculate confidence scores for extracted data."""
try:
# Confidence already calculated in parse_rule_values
# This node can refine scores if needed
return state
except Exception as e:
logger.error(f"Error calculating confidence: {e}", exc_info=True)
state["errors"] = state.get("errors", []) + [f"Confidence scoring error: {str(e)}"]
return state
# ===== Graph Construction =====
def create_bzo_extraction_graph():
"""Create and compile the BZO extraction graph."""
workflow = StateGraph(BZOExtractionState)
# Add nodes
workflow.add_node("extract_pdf_text", extract_pdf_text)
workflow.add_node("classify_text_block", classify_text_block)
workflow.add_node("assemble_articles", assemble_articles)
workflow.add_node("detect_zone_changes", detect_zone_changes)
workflow.add_node("detect_rule_candidates", detect_rule_candidates)
workflow.add_node("parse_rule_values", parse_rule_values)
workflow.add_node("assign_zone_and_scope", assign_zone_and_scope)
workflow.add_node("confidence_scoring", confidence_scoring)
# Define edges
workflow.set_entry_point("extract_pdf_text")
workflow.add_edge("extract_pdf_text", "classify_text_block")
workflow.add_edge("classify_text_block", "assemble_articles")
workflow.add_edge("assemble_articles", "detect_zone_changes")
workflow.add_edge("detect_zone_changes", "detect_rule_candidates")
workflow.add_edge("detect_rule_candidates", "parse_rule_values")
workflow.add_edge("parse_rule_values", "assign_zone_and_scope")
workflow.add_edge("assign_zone_and_scope", "confidence_scoring")
workflow.add_edge("confidence_scoring", END)
return workflow.compile()
def run_extraction(pdf_bytes: bytes, pdf_id: Optional[str] = None, dokument_id: Optional[str] = None) -> Dict[str, Any]:
"""
Run the extraction pipeline on a PDF and return structured, sorted results.
Args:
pdf_bytes: PDF file content as bytes
pdf_id: Optional identifier for the PDF (defaults to generated ID)
dokument_id: Optional dokument ID for reference
Returns:
Dictionary with extracted and sorted content:
{
"articles": [...], # Sorted by page_start, then article_label
"zones": [...], # Sorted by zone_code
"rules": [...], # Sorted by rule_type, then page
"errors": [...],
"warnings": [...]
}
"""
import uuid
if not pdf_id:
pdf_id = f"pdf_{uuid.uuid4().hex[:8]}"
# Initialize state
state: BZOExtractionState = {
"dokument_id": dokument_id,
"pdf_id": pdf_id,
"text_blocks": [],
"classified_blocks": [],
"articles": [],
"current_zones": {},
"zones": [],
"rule_candidates": [],
"parsed_rules": [],
"errors": [],
"warnings": []
}
# Extract PDF text first
pdf_extractor = BZOPdfExtractor()
text_blocks_objects = pdf_extractor.extract_text_blocks(pdf_bytes, state["pdf_id"])
# Convert TextBlock objects to dicts for state
state["text_blocks"] = [
{
"page": tb.page,
"text": tb.text,
"block_id": tb.block_id,
"bbox": tb.bbox
}
for tb in text_blocks_objects
]
# Create and run graph
graph = create_bzo_extraction_graph()
final_state = graph.invoke(state)
# Sort and structure results
articles = sorted(
final_state.get("articles", []),
key=lambda x: (x.get("page_start", 0), x.get("article_label", ""))
)
zones = sorted(
final_state.get("zones", []),
key=lambda x: x.get("zone_code", "")
)
rules = sorted(
final_state.get("parsed_rules", []),
key=lambda x: (x.get("rule_type", ""), x.get("page", 0))
)
return {
"articles": articles,
"zones": zones,
"rules": rules,
"errors": final_state.get("errors", []),
"warnings": final_state.get("warnings", [])
}
def extract_from_documents(
document_retriever,
dokument_ids: List[str]
) -> Dict[str, Any]:
"""
Extract BZO content from one or more documents.
Args:
document_retriever: BZODocumentRetriever instance
dokument_ids: List of dokument IDs to process
Returns:
Dictionary with results per document:
{
"results": [
{
"dokument_id": "...",
"articles": [...],
"zones": [...],
"rules": [...],
"errors": [...],
"warnings": [...]
},
...
],
"summary": {
"total_documents": N,
"successful": M,
"failed": K,
"total_articles": X,
"total_zones": Y,
"total_rules": Z
}
}
"""
results = []
total_articles = 0
total_zones = 0
total_rules = 0
successful = 0
failed = 0
# Retrieve documents
dokumente = document_retriever.get_documents_by_ids(dokument_ids)
for dokument in dokumente:
try:
# Retrieve PDF content
pdf_bytes = document_retriever.retrieve_pdf_content(dokument)
if not pdf_bytes:
logger.warning(f"Could not retrieve PDF for dokument {dokument.id}")
results.append({
"dokument_id": dokument.id,
"articles": [],
"zones": [],
"rules": [],
"errors": [f"Could not retrieve PDF content"],
"warnings": []
})
failed += 1
continue
# Run extraction
extraction_result = run_extraction(
pdf_bytes=pdf_bytes,
pdf_id=dokument.dokumentReferenz or f"dok_{dokument.id}",
dokument_id=dokument.id
)
# Add dokument_id to result
extraction_result["dokument_id"] = dokument.id
results.append(extraction_result)
# Update counters
total_articles += len(extraction_result.get("articles", []))
total_zones += len(extraction_result.get("zones", []))
total_rules += len(extraction_result.get("rules", []))
if extraction_result.get("errors"):
failed += 1
else:
successful += 1
except Exception as e:
logger.error(f"Error processing dokument {dokument.id}: {str(e)}", exc_info=True)
results.append({
"dokument_id": dokument.id,
"articles": [],
"zones": [],
"rules": [],
"errors": [f"Processing error: {str(e)}"],
"warnings": []
})
failed += 1
return {
"results": results,
"summary": {
"total_documents": len(dokument_ids),
"successful": successful,
"failed": failed,
"total_articles": total_articles,
"total_zones": total_zones,
"total_rules": total_rules
}
}
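# --- Usage sketch (illustrative only; the PDF path and dokument IDs are placeholders) ---
if __name__ == "__main__":
    with open("bzo.pdf", "rb") as f:
        result = run_extraction(f.read(), pdf_id="demo")
    print(
        f"{len(result['articles'])} articles, {len(result['zones'])} zones, "
        f"{len(result['rules'])} rules, {len(result['errors'])} errors"
    )
    # With a BZODocumentRetriever at hand, a batch run would look like:
    #   report = extract_from_documents(retriever, ["dokument-id-1", "dokument-id-2"])
    #   print(report["summary"])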

View file

@@ -0,0 +1,117 @@
"""
PDF extraction module for BZO documents.
Extracts page-aware text blocks from PDF files.
"""
import logging
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
import fitz # PyMuPDF
logger = logging.getLogger(__name__)
@dataclass
class TextBlock:
"""Represents a text block from a PDF page."""
page: int
text: str
block_id: str
    bbox: Optional[tuple] = None  # (x0, y0, x1, y1) bounding box
class BZOPdfExtractor:
"""Extracts text blocks from PDF files with page awareness."""
def __init__(self):
"""Initialize the PDF extractor."""
pass
def extract_text_blocks(self, pdf_bytes: bytes, pdf_id: str) -> List[TextBlock]:
"""
Extract page-aware text blocks from PDF.
Args:
pdf_bytes: PDF file content as bytes
pdf_id: Identifier for the PDF (for logging)
Returns:
List of TextBlock objects with page numbers
"""
text_blocks = []
try:
# Open PDF from bytes
pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
# Extract text from each page
for page_num in range(len(pdf_document)):
page = pdf_document[page_num]
# Extract text blocks from page
blocks = page.get_text("blocks")
for block_idx, block in enumerate(blocks):
# block format: (x0, y0, x1, y1, "text", block_no, block_type)
if len(block) >= 5:
bbox = (block[0], block[1], block[2], block[3])
text = block[4].strip()
# Skip empty blocks
if not text:
continue
# Create TextBlock
block_id = f"{pdf_id}_p{page_num + 1}_b{block_idx}"
text_block = TextBlock(
page=page_num + 1, # 1-indexed pages
text=text,
block_id=block_id,
bbox=bbox
)
text_blocks.append(text_block)
# Store page count before closing
page_count = len(pdf_document)
pdf_document.close()
logger.info(f"Extracted {len(text_blocks)} text blocks from PDF {pdf_id} ({page_count} pages)")
except Exception as e:
logger.error(f"Error extracting text from PDF {pdf_id}: {str(e)}", exc_info=True)
raise
return text_blocks
def extract_text_by_page(self, pdf_bytes: bytes, pdf_id: str) -> Dict[int, str]:
"""
Extract full text per page (alternative method).
Args:
pdf_bytes: PDF file content as bytes
pdf_id: Identifier for the PDF
Returns:
Dictionary mapping page number to full page text
"""
page_texts = {}
try:
pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
for page_num in range(len(pdf_document)):
page = pdf_document[page_num]
text = page.get_text()
page_texts[page_num + 1] = text # 1-indexed
# Store page count before closing
page_count = len(pdf_document)
pdf_document.close()
logger.debug(f"Extracted text from {page_count} pages for PDF {pdf_id}")
except Exception as e:
logger.error(f"Error extracting page text from PDF {pdf_id}: {str(e)}", exc_info=True)
raise
return page_texts
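# --- Usage sketch (illustrative only; "document.pdf" is a placeholder path) ---
if __name__ == "__main__":
    extractor = BZOPdfExtractor()
    with open("document.pdf", "rb") as f:
        blocks = extractor.extract_text_blocks(f.read(), pdf_id="demo")
    for block in blocks[:5]:
        print(block.page, block.block_id, block.text[:60])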

View file

@@ -0,0 +1,105 @@
"""
Rule taxonomy for BZO extraction.
Defines fixed rule types and their patterns for deterministic rule detection.
"""
RULE_TAXONOMY = {
"max_building_height": {
"patterns": ["fassadenhöhe", "fassadenhöhen", "gebäudehöhe", "firsthöhe", "traufhöhe", "höchsthöhe", "gesamt höhe", "gesamt höhen"],
"units": ["m", "meter", "metern"],
"value_type": "numeric",
"keywords": ["max", "maximal", "höchstens"]
},
"max_floors": {
"patterns": ["vollgeschosse", "vollgeschoss", "geschosse", "geschosszahl"],
"units": [],
"value_type": "integer",
"keywords": ["max", "maximal", "höchstens"]
},
"max_attachable_attics": {
"patterns": ["anrechenbare dachgeschosse", "anrechenbares attikageschoss", "dachgeschosse", "attikageschoss"],
"units": [],
"value_type": "integer",
"keywords": ["max", "maximal"]
},
"max_attachable_basement": {
"patterns": ["anrechenbares untergeschoss", "untergeschoss"],
"units": [],
"value_type": "integer",
"keywords": ["max", "maximal"]
},
"density": {
"patterns": ["ausnützungsziffer", "az", "ausnützung"],
"units": ["%", "prozent"],
"value_type": "numeric",
"keywords": ["max", "maximal"]
},
"building_mass_index": {
"patterns": ["baumassenziffer", "bmz"],
"units": [],
"value_type": "numeric",
"keywords": ["max", "maximal"]
},
"green_space_index": {
"patterns": ["grünflächenziffer", "gfz"],
"units": ["%", "prozent"],
"value_type": "numeric",
"keywords": ["min", "mindestens"]
},
"boundary_distance": {
"patterns": ["grenzabstand", "grundabstand", "abstand"],
"units": ["m", "meter", "metern"],
"value_type": "numeric",
"keywords": ["min", "mindestens", "max", "maximal"]
},
"boundary_distance_length_surcharge": {
"patterns": ["mehrlängenzuschlag", "längenzuschlag"],
"units": [],
"value_type": "fraction", # e.g., "1/3", "1/5"
"keywords": []
},
"boundary_distance_max": {
"patterns": ["höchstmass", "höchstmass grenzabstand", "höchstmass abstand"],
"units": ["m", "meter"],
"value_type": "numeric",
"keywords": ["max", "maximal"]
},
"building_length": {
"patterns": ["gebäudelänge"],
"units": ["m", "meter"],
"value_type": "numeric",
"keywords": ["min", "mindestens", "max", "maximal"]
},
"building_width": {
"patterns": ["gebäudebreite"],
"units": ["m", "meter"],
"value_type": "numeric",
"keywords": ["min", "mindestens", "max", "maximal"]
},
"residential_area_share": {
"patterns": ["wohnflächenanteil", "wohnanteil"],
"units": ["%", "prozent"],
"value_type": "numeric",
"keywords": ["min", "mindestens", "max", "maximal"]
}
}
def get_rule_taxonomy() -> dict:
"""Get the rule taxonomy dictionary."""
return RULE_TAXONOMY
def get_rule_types() -> list:
"""Get list of all rule types."""
return list(RULE_TAXONOMY.keys())
def get_rule_patterns(rule_type: str) -> list:
"""Get patterns for a specific rule type."""
return RULE_TAXONOMY.get(rule_type, {}).get("patterns", [])
def get_rule_units(rule_type: str) -> list:
"""Get units for a specific rule type."""
return RULE_TAXONOMY.get(rule_type, {}).get("units", [])
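# --- Usage sketch (illustrative only) ---
if __name__ == "__main__":
    for rule_type in get_rule_types():
        print(rule_type, get_rule_patterns(rule_type), get_rule_units(rule_type))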

View file

@@ -0,0 +1,780 @@
"""
Swiss Topo Scraping Script
Scrapes Switzerland systematically using the Swiss Topo connector
and saves parcel data to the database.
This script divides Switzerland into a grid and queries parcels at each grid point,
then deduplicates and saves unique parcels to the database.
"""
import logging
import asyncio
from typing import Dict, Any, List, Set, Optional
from dataclasses import dataclass
import json
from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelRealEstate import (
Parzelle,
GeoPolylinie,
GeoPunkt,
Kontext,
Gemeinde,
Kanton,
)
from modules.interfaces.interfaceDbRealEstateObjects import getInterface as getRealEstateInterface
from modules.connectors.connectorSwissTopoMapServer import SwissTopoMapServerConnector
from modules.connectors.connectorOerebWfs import OerebWfsConnector
logger = logging.getLogger(__name__)
@dataclass
class ScrapingStats:
"""Statistics for scraping operation."""
total_queries: int = 0
successful_queries: int = 0
failed_queries: int = 0
unique_parcels_found: int = 0
parcels_saved: int = 0
parcels_skipped: int = 0
    errors: Optional[List[str]] = None
def __post_init__(self):
if self.errors is None:
self.errors = []
class SwissTopoScraper:
"""
Scraper for Swiss Topo parcel data.
Divides Kanton Zürich into a grid and queries parcels systematically,
then saves unique parcels to the database with ÖREB bauzone information.
"""
# Zürich canton bounds in LV95 coordinates
ZURICH_BOUNDS = {
"min_x": 2680000,
"max_x": 2780000,
"min_y": 1210000,
"max_y": 1280000
}
def __init__(
self,
current_user: User,
grid_size: float = 500.0, # Grid size in meters (500m = reasonable coverage)
max_concurrent: int = 50, # Maximum concurrent API requests
batch_size: int = 100, # Process parcels in batches
):
"""
Initialize scraper.
Args:
current_user: User for database operations
grid_size: Size of grid cells in meters (default: 500m)
max_concurrent: Maximum concurrent API requests (default: 50)
batch_size: Number of parcels to process before saving (default: 100)
"""
self.current_user = current_user
self.grid_size = grid_size
self.max_concurrent = max_concurrent
self.batch_size = batch_size
oereb_connector = OerebWfsConnector()
self.connector = SwissTopoMapServerConnector(oereb_connector=oereb_connector)
self.realEstateInterface = getRealEstateInterface(current_user)
# Track unique parcels by EGRID or label
self.unique_parcels: Dict[str, Dict[str, Any]] = {}
self.stats = ScrapingStats()
# Cache for Gemeinde and Kanton UUIDs to avoid repeated database queries
# Key: bfs_nummer (int or str), Value: UUID (str)
self.gemeinde_cache: Dict[str, str] = {}
# Key: kanton abbreviation (str, e.g., "ZH"), Value: UUID (str)
self.kanton_cache: Dict[str, str] = {}
def _generate_grid_points(self) -> List[tuple]:
"""
Generate grid points covering Kanton Zürich.
Returns:
List of (x, y) coordinate tuples in LV95
"""
bounds = self.ZURICH_BOUNDS
min_x = bounds["min_x"]
max_x = bounds["max_x"]
min_y = bounds["min_y"]
max_y = bounds["max_y"]
grid_points = []
x = min_x
while x <= max_x:
y = min_y
while y <= max_y:
grid_points.append((x, y))
y += self.grid_size
x += self.grid_size
logger.info(f"Generated {len(grid_points)} grid points covering Kanton Zürich")
return grid_points
async def _query_parcel_at_point(
self,
x: float,
y: float,
semaphore: asyncio.Semaphore
) -> Optional[Dict[str, Any]]:
"""
Query parcel at a specific coordinate point.
Args:
x: X coordinate (LV95)
y: Y coordinate (LV95)
semaphore: Semaphore for concurrency control
Returns:
Parcel data dictionary or None if not found
"""
async with semaphore:
try:
self.stats.total_queries += 1
location_str = f"{x},{y}"
parcel_data = await self.connector.search_parcel(location_str, tolerance=5)
if parcel_data:
self.stats.successful_queries += 1
return parcel_data
else:
self.stats.failed_queries += 1
return None
except Exception as e:
self.stats.failed_queries += 1
error_msg = f"Error querying parcel at ({x}, {y}): {str(e)}"
logger.debug(error_msg)
self.stats.errors.append(error_msg)
return None
def _find_gemeinde_by_bfs_nummer(self, bfs_nummer: str) -> Optional[Gemeinde]:
"""
Find existing Gemeinde by BFS number (stored in kontextInformationen).
Args:
bfs_nummer: BFS municipality number
Returns:
Gemeinde instance if found, None otherwise
"""
try:
# Check cache first
if bfs_nummer in self.gemeinde_cache:
gemeinde_id = self.gemeinde_cache[bfs_nummer]
gemeinde = self.realEstateInterface.getGemeinde(gemeinde_id)
if gemeinde:
return gemeinde
# Search all Gemeinden and check kontextInformationen
gemeinden = self.realEstateInterface.getGemeinden(
recordFilter={"mandateId": self.current_user.mandateId}
)
for gemeinde in gemeinden:
# Check kontextInformationen for bfs_nummer
for kontext in gemeinde.kontextInformationen:
try:
kontext_data = json.loads(kontext.inhalt) if isinstance(kontext.inhalt, str) else kontext.inhalt
if isinstance(kontext_data, dict):
if str(kontext_data.get("bfs_nummer")) == str(bfs_nummer):
# Cache the result
self.gemeinde_cache[bfs_nummer] = gemeinde.id
return gemeinde
except (json.JSONDecodeError, AttributeError):
continue
return None
except Exception as e:
logger.error(f"Error finding Gemeinde by BFS number {bfs_nummer}: {e}", exc_info=True)
return None
def _find_kanton_by_abbreviation(self, abk: str) -> Optional[Kanton]:
"""
Find existing Kanton by abbreviation.
Args:
abk: Canton abbreviation (e.g., "BE", "ZH")
Returns:
Kanton instance if found, None otherwise
"""
try:
# Check cache first
if abk in self.kanton_cache:
kanton_id = self.kanton_cache[abk]
kanton = self.realEstateInterface.getKanton(kanton_id)
if kanton:
return kanton
# Search by abbreviation
kantone = self.realEstateInterface.getKantone(
recordFilter={
"mandateId": self.current_user.mandateId,
"abk": abk
}
)
if kantone:
kanton = kantone[0]
# Cache the result
self.kanton_cache[abk] = kanton.id
return kanton
return None
except Exception as e:
logger.error(f"Error finding Kanton by abbreviation {abk}: {e}", exc_info=True)
return None
def _get_or_create_kanton(self, kanton_abk: str) -> Optional[str]:
"""
Get or create a Kanton by abbreviation.
Args:
kanton_abk: Canton abbreviation (e.g., "BE", "ZH")
Returns:
UUID of the Kanton, or None if creation failed
"""
if not kanton_abk:
return None
# Check if exists
existing_kanton = self._find_kanton_by_abbreviation(kanton_abk)
if existing_kanton:
return existing_kanton.id
# Create new Kanton
try:
# Map common abbreviations to full names (fallback to abbreviation if not found)
kanton_names = {
"AG": "Aargau", "AI": "Appenzell Innerrhoden", "AR": "Appenzell Ausserrhoden",
"BE": "Bern", "BL": "Basel-Landschaft", "BS": "Basel-Stadt",
"FR": "Freiburg", "GE": "Genf", "GL": "Glarus", "GR": "Graubünden",
"JU": "Jura", "LU": "Luzern", "NE": "Neuenburg", "NW": "Nidwalden",
"OW": "Obwalden", "SG": "St. Gallen", "SH": "Schaffhausen", "SO": "Solothurn",
"SZ": "Schwyz", "TG": "Thurgau", "TI": "Tessin", "UR": "Uri",
"VD": "Waadt", "VS": "Wallis", "ZG": "Zug", "ZH": "Zürich"
}
kanton_label = kanton_names.get(kanton_abk, kanton_abk)
kanton = Kanton(
mandateId=self.current_user.mandateId,
label=kanton_label,
abk=kanton_abk
)
created_kanton = self.realEstateInterface.createKanton(kanton)
if created_kanton and created_kanton.id:
# Cache the result
self.kanton_cache[kanton_abk] = created_kanton.id
logger.info(f"Created new Kanton: {kanton_label} ({kanton_abk})")
return created_kanton.id
except Exception as e:
logger.error(f"Error creating Kanton {kanton_abk}: {e}", exc_info=True)
return None
def _get_or_create_gemeinde(
self,
gemeinde_name: str,
bfs_nummer: str,
kanton_abk: str
) -> Optional[str]:
"""
Get or create a Gemeinde by BFS number.
Args:
gemeinde_name: Municipality name
bfs_nummer: BFS municipality number
kanton_abk: Canton abbreviation
Returns:
UUID of the Gemeinde, or None if creation failed
"""
if not gemeinde_name or not bfs_nummer:
return None
# Check if exists
existing_gemeinde = self._find_gemeinde_by_bfs_nummer(bfs_nummer)
if existing_gemeinde:
return existing_gemeinde.id
# Get or create Kanton first
kanton_id = self._get_or_create_kanton(kanton_abk)
# Create new Gemeinde
try:
gemeinde = Gemeinde(
mandateId=self.current_user.mandateId,
label=gemeinde_name,
id_kanton=kanton_id,
kontextInformationen=[
Kontext(
thema="BFS Nummer",
inhalt=json.dumps({"bfs_nummer": bfs_nummer}, ensure_ascii=False)
)
]
)
created_gemeinde = self.realEstateInterface.createGemeinde(gemeinde)
if created_gemeinde and created_gemeinde.id:
# Cache the result
self.gemeinde_cache[bfs_nummer] = created_gemeinde.id
logger.info(f"Created new Gemeinde: {gemeinde_name} (BFS: {bfs_nummer})")
return created_gemeinde.id
except Exception as e:
logger.error(f"Error creating Gemeinde {gemeinde_name} (BFS: {bfs_nummer}): {e}", exc_info=True)
return None
def _extract_parcel_identifier(self, parcel_data: Dict[str, Any]) -> Optional[str]:
"""
Extract unique identifier for a parcel.
Args:
parcel_data: Parcel data from Swiss Topo
Returns:
Unique identifier (EGRID or label) or None
"""
attributes = parcel_data.get("attributes", {})
# Prefer EGRID as it's globally unique
egrid = attributes.get("egris_egrid")
if egrid:
return f"egrid:{egrid}"
# Fallback to label + municipality code
label = attributes.get("label") or attributes.get("number")
bfsnr = attributes.get("bfsnr")
if label and bfsnr:
return f"label:{bfsnr}:{label}"
# Last resort: just label
if label:
return f"label:{label}"
return None
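    # Illustrative results (attribute values invented):
    #   {"attributes": {"egris_egrid": "CH000000000000"}} -> "egrid:CH000000000000"
    #   {"attributes": {"label": "1234", "bfsnr": 261}}   -> "label:261:1234"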
def _convert_to_parzelle_model(
self,
parcel_data: Dict[str, Any],
parcel_id: str,
gemeinde_id: Optional[str] = None
) -> Optional[Parzelle]:
"""
Convert Swiss Topo parcel data to Parzelle model.
Args:
parcel_data: Raw parcel data from Swiss Topo
parcel_id: Unique parcel identifier
gemeinde_id: UUID of the Gemeinde (if already resolved)
Returns:
Parzelle model instance or None if conversion fails
"""
try:
attributes = parcel_data.get("attributes", {})
geometry = parcel_data.get("geometry", {})
# Extract attributes
extracted_attrs = self.connector.extract_parcel_attributes(parcel_data)
# Get geocoded address if available
geocoded_address = parcel_data.get('geocoded_address', {})
# Get bauzone - prefer ÖREB bauzone if available (for Zürich), otherwise use extracted_attrs
bauzone = parcel_data.get("oereb_bauzone") or extracted_attrs.get("bauzone")
# Build Parzelle data
parzelle_data = {
"mandateId": self.current_user.mandateId,
"label": extracted_attrs.get("label") or attributes.get("number") or f"Parcel-{parcel_id}",
"parzellenAliasTags": [attributes.get("egris_egrid")] if attributes.get("egris_egrid") else [],
"strasseNr": geocoded_address.get("full_address") or extracted_attrs.get("strasseNr"),
"plz": geocoded_address.get("plz") or extracted_attrs.get("plz"),
"eigentuemerschaft": extracted_attrs.get("eigentuemerschaft"),
"bauzone": bauzone,
"perimeter": extracted_attrs.get("perimeter"),
"baulinie": None,
"kontextGemeinde": gemeinde_id, # Use UUID reference instead of name
"az": None,
"bz": None,
"vollgeschossZahl": None,
"anrechenbarDachgeschoss": None,
"anrechenbarUntergeschoss": None,
"gebaeudehoeheMax": None,
"regelnGrenzabstand": [],
"regelnMehrlaengenzuschlag": [],
"regelnMehrhoehenzuschlag": [],
"parzelleBebaut": None,
"parzelleErschlossen": None,
"parzelleHanglage": None,
"laermschutzzone": None,
"hochwasserschutzzone": None,
"grundwasserschutzzone": None,
"parzellenNachbarschaft": [],
"dokumente": [],
"kontextInformationen": [
Kontext(
thema="Swiss Topo Scraping",
inhalt=json.dumps({
"egrid": attributes.get("egris_egrid"),
"identnd": attributes.get("identnd"),
"canton": attributes.get("ak"),
"municipality_code": attributes.get("bfsnr"),
"scraped_coordinates": parcel_data.get('query_coordinates', {}),
"source": "swiss_topo_scraping",
"bauzone_source": "oereb_wfs" if parcel_data.get("oereb_bauzone") else "swiss_topo"
}, ensure_ascii=False)
)
]
}
# Create Parzelle instance
parzelle_instance = Parzelle(**parzelle_data)
return parzelle_instance
except Exception as e:
logger.error(f"Error converting parcel {parcel_id} to Parzelle model: {e}", exc_info=True)
return None
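
    # The provenance context stored above looks roughly like this (sketch;
    # field values are illustrative):
    #
    #     Kontext(
    #         thema="Swiss Topo Scraping",
    #         inhalt='{"egrid": "CH1234567890", ..., "bauzone_source": "oereb_wfs"}'
    #     )
    #
    # "bauzone_source" records whether the zone came from the ÖREB WFS
    # ("oereb_wfs") or from the Swiss Topo attributes ("swiss_topo").
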
async def _save_parcels_batch(
self,
parcels: List[Dict[str, Any]]
) -> int:
"""
Save a batch of parcels to the database.
This method handles the sorting algorithm:
- For each parcel, extracts gemeinde_information
- Checks if Gemeinde exists (by bfs_nummer), creates if not
- Checks if Kanton exists (by abbreviation), creates if not
- Links Parzelle to Gemeinde via UUID reference
Args:
parcels: List of parcel data dictionaries
Returns:
Number of parcels successfully saved
"""
saved_count = 0
for parcel_data in parcels:
try:
parcel_id = self._extract_parcel_identifier(parcel_data)
if not parcel_id:
logger.warning("Could not extract parcel identifier, skipping")
self.stats.parcels_skipped += 1
continue
# Check if parcel already exists in our unique set
if parcel_id in self.unique_parcels:
logger.debug(f"Parcel {parcel_id} already found, skipping duplicate")
self.stats.parcels_skipped += 1
continue
# Extract gemeinde_information from parcel_data
# The connector returns it as 'gemeinde_info' (not 'gemeinde_information')
gemeinde_info = parcel_data.get('gemeinde_info') or parcel_data.get('gemeinde_information')
gemeinde_id = None
kanton_abk = None
if gemeinde_info:
gemeinde_name = gemeinde_info.get('name')
bfs_nummer = gemeinde_info.get('bfs_nummer')
kanton_abk = gemeinde_info.get('kanton')
# Skip parcels not from Zürich (safety filter)
if kanton_abk and kanton_abk.upper() != "ZH":
logger.debug(f"Skipping parcel {parcel_id} from canton {kanton_abk} (only Zürich parcels are processed)")
self.stats.parcels_skipped += 1
continue
if gemeinde_name and bfs_nummer:
# Get or create Gemeinde (this also handles Kanton creation)
gemeinde_id = self._get_or_create_gemeinde(
gemeinde_name=gemeinde_name,
bfs_nummer=str(bfs_nummer),
kanton_abk=kanton_abk
)
if not gemeinde_id:
logger.warning(
f"Could not get or create Gemeinde for parcel {parcel_id}: "
f"name={gemeinde_name}, bfs_nummer={bfs_nummer}"
)
else:
logger.debug(
f"Missing Gemeinde info for parcel {parcel_id}: "
f"name={gemeinde_name}, bfs_nummer={bfs_nummer}"
)
else:
logger.debug(f"No gemeinde_info found in parcel_data for {parcel_id}")
# Skip parcels without gemeinde_info (likely not from Zürich)
self.stats.parcels_skipped += 1
continue
# Query ÖREB WFS for bauzone (all parcels are from Zürich)
if kanton_abk and kanton_abk.upper() == "ZH":
try:
attributes = parcel_data.get("attributes", {})
geometry = parcel_data.get("geometry", {})
egrid = attributes.get("egris_egrid", "")
# Get coordinates for query (use centroid or first point if available)
x = None
y = None
if geometry:
if "rings" in geometry and geometry["rings"]:
# Use first point of first ring
first_ring = geometry["rings"][0]
if first_ring and len(first_ring) > 0:
x = first_ring[0][0]
y = first_ring[0][1]
elif "coordinates" in geometry:
# Try to extract coordinates from GeoJSON format
coords = geometry.get("coordinates", [])
if coords and len(coords) > 0:
# Handle nested coordinate arrays
def get_first_coord(coord_list, depth=0):
if depth < 3 and isinstance(coord_list, list) and len(coord_list) > 0:
if isinstance(coord_list[0], (int, float)):
return coord_list
return get_first_coord(coord_list[0], depth + 1)
return None
first_coord = get_first_coord(coords)
if first_coord and len(first_coord) >= 2:
x = first_coord[0]
y = first_coord[1]
if geometry and self.connector.oereb_connector:
logger.debug(f"Querying ÖREB WFS for bauzone for parcel {parcel_id}")
zone_results = await self.connector.oereb_connector.query_zone_layer(
egrid=egrid,
x=x or 0.0,
y=y or 0.0,
canton="ZH",
geometry=geometry
)
if zone_results and len(zone_results) > 0:
# Extract typ_gde_abkuerzung from the first result
zone_attrs = zone_results[0].get("attributes", {})
oereb_bauzone = zone_attrs.get("typ_gde_abkuerzung")
if oereb_bauzone:
# Add bauzone to parcel_data so it can be used in _convert_to_parzelle_model
parcel_data["oereb_bauzone"] = oereb_bauzone
logger.debug(f"Found ÖREB bauzone '{oereb_bauzone}' for parcel {parcel_id}")
else:
logger.debug(f"No typ_gde_abkuerzung found in ÖREB response for parcel {parcel_id}")
else:
logger.debug(f"No zone results from ÖREB WFS for parcel {parcel_id}")
else:
logger.debug(f"Cannot query ÖREB WFS for parcel {parcel_id}: missing geometry or connector")
except Exception as e:
logger.warning(f"Error querying ÖREB WFS for bauzone for parcel {parcel_id}: {e}", exc_info=True)
# Continue without ÖREB bauzone - will use default from extracted_attrs
                else:
                    # Reachable when gemeinde_info carries no canton abbreviation;
                    # without a confirmed ZH canton the ÖREB query is skipped
                    logger.warning(f"Parcel {parcel_id} has no confirmed Zürich canton (kanton: {kanton_abk}), skipping ÖREB query")
# Convert to Parzelle model (with Gemeinde UUID reference)
parzelle_instance = self._convert_to_parzelle_model(
parcel_data,
parcel_id,
gemeinde_id=gemeinde_id
)
if not parzelle_instance:
logger.warning(f"Could not convert parcel {parcel_id} to model, skipping")
self.stats.parcels_skipped += 1
continue
# Check if parcel already exists in database (by label)
# Note: We rely on in-memory deduplication by EGRID/label for uniqueness
# Database check is mainly to avoid re-saving parcels from previous runs
existing_parcels = self.realEstateInterface.getParzellen(
recordFilter={
"mandateId": self.current_user.mandateId,
"label": parzelle_instance.label
}
)
if existing_parcels:
logger.debug(f"Parcel {parzelle_instance.label} already exists in database, skipping")
self.unique_parcels[parcel_id] = parcel_data # Mark as seen
self.stats.parcels_skipped += 1
continue
# Save to database
created_parzelle = self.realEstateInterface.createParzelle(parzelle_instance)
if created_parzelle and created_parzelle.id:
self.unique_parcels[parcel_id] = parcel_data
saved_count += 1
self.stats.parcels_saved += 1
logger.debug(
f"Saved parcel {created_parzelle.label} (ID: {created_parzelle.id}) "
f"linked to Gemeinde {gemeinde_id if gemeinde_id else 'None'}"
)
else:
logger.warning(f"Failed to save parcel {parzelle_instance.label}")
self.stats.parcels_skipped += 1
except Exception as e:
error_msg = f"Error saving parcel: {str(e)}"
logger.error(error_msg, exc_info=True)
self.stats.errors.append(error_msg)
self.stats.parcels_skipped += 1
return saved_count
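
    # Deduplication order applied above: in-memory identifier set ->
    # canton filter (ZH only) -> ÖREB bauzone enrichment ->
    # database lookup by label -> createParzelle().
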
async def scrape(
self,
grid_points: Optional[List[tuple]] = None,
progress_callback: Optional[callable] = None
) -> ScrapingStats:
"""
Scrape Kanton Zürich for parcel data.
Args:
grid_points: Optional list of (x, y) coordinates to query.
If None, generates grid automatically for Zürich.
progress_callback: Optional callback function(status_dict) called periodically
Returns:
ScrapingStats object with scraping statistics
"""
logger.info("Starting Swiss Topo scraping operation for Kanton Zürich")
# Generate grid points if not provided
if grid_points is None:
grid_points = self._generate_grid_points()
logger.info(f"Scraping {len(grid_points)} grid points in Kanton Zürich")
# Create semaphore for concurrency control
semaphore = asyncio.Semaphore(self.max_concurrent)
# Query all grid points concurrently
tasks = [
self._query_parcel_at_point(x, y, semaphore)
for x, y in grid_points
]
# Process results in batches
batch = []
processed = 0
for coro in asyncio.as_completed(tasks):
try:
parcel_data = await coro
processed += 1
if parcel_data:
batch.append(parcel_data)
# Save batch when it reaches batch_size
if len(batch) >= self.batch_size:
await self._save_parcels_batch(batch)
batch = []
# Update stats
self.stats.unique_parcels_found = len(self.unique_parcels)
# Call progress callback if provided
if progress_callback:
progress_callback({
"processed": processed,
"total": len(grid_points),
"unique_parcels": self.stats.unique_parcels_found,
"saved": self.stats.parcels_saved,
"skipped": self.stats.parcels_skipped
})
# Log progress periodically
if processed % 100 == 0:
logger.info(
f"Progress: {processed}/{len(grid_points)} queries completed, "
f"{self.stats.unique_parcels_found} unique parcels found, "
f"{self.stats.parcels_saved} saved"
)
except Exception as e:
logger.error(f"Error processing query result: {e}", exc_info=True)
self.stats.errors.append(str(e))
# Save remaining batch
if batch:
await self._save_parcels_batch(batch)
# Final stats update
self.stats.unique_parcels_found = len(self.unique_parcels)
logger.info(
f"Scraping completed: {self.stats.unique_parcels_found} unique parcels found, "
f"{self.stats.parcels_saved} saved, {self.stats.parcels_skipped} skipped"
)
return self.stats
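
# Example progress callback for scrape() (sketch; any callable accepting one
# status dict works, `scraper` stands for an initialized instance):
#
#     def log_progress(status: Dict[str, Any]) -> None:
#         print(f"{status['processed']}/{status['total']} queries, "
#               f"{status['saved']} parcels saved")
#
#     stats = await scraper.scrape(progress_callback=log_progress)
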
async def scrape_switzerland(
current_user: User,
grid_size: float = 500.0,
max_concurrent: int = 50,
batch_size: int = 100,
grid_points: Optional[List[tuple]] = None
) -> Dict[str, Any]:
"""
    Main entry point for scraping Kanton Zürich parcel data.
    Note: Despite its name, this function currently scrapes only Kanton Zürich;
    every parcel found is additionally queried for ÖREB bauzone information.
Args:
current_user: User for database operations
grid_size: Size of grid cells in meters (default: 500m)
max_concurrent: Maximum concurrent API requests (default: 50)
batch_size: Number of parcels to process before saving (default: 100)
grid_points: Optional list of (x, y) coordinates to query (must be within Zürich bounds)
Returns:
Dictionary with scraping statistics and results
"""
scraper = SwissTopoScraper(
current_user=current_user,
grid_size=grid_size,
max_concurrent=max_concurrent,
batch_size=batch_size
)
stats = await scraper.scrape(grid_points=grid_points)
return {
"success": True,
"stats": {
"total_queries": stats.total_queries,
"successful_queries": stats.successful_queries,
"failed_queries": stats.failed_queries,
"unique_parcels_found": stats.unique_parcels_found,
"parcels_saved": stats.parcels_saved,
"parcels_skipped": stats.parcels_skipped,
"error_count": len(stats.errors),
"errors": stats.errors[:10] # Return first 10 errors
}
}
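
# Hedged usage sketch (assumes an authenticated `User` instance named `user`):
#
#     import asyncio
#     result = asyncio.run(scrape_switzerland(
#         current_user=user,
#         grid_size=1000.0,   # coarser grid -> fewer API queries
#         max_concurrent=10,
#         batch_size=50
#     ))
#     print(result["stats"]["parcels_saved"])
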

View file

@@ -0,0 +1,879 @@
"""
Real Estate scraping routes for the backend API.
Implements endpoints for scraping real estate data from external sources.
"""
import logging
import json
import aiohttp
import asyncio
from typing import Optional, Dict, Any
from fastapi import APIRouter, HTTPException, Depends, Body, Request, Query, status
# Import auth modules
from modules.auth import limiter, getCurrentUser
# Import models
from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelRealEstate import (
Gemeinde,
Kanton,
Dokument,
Kontext,
DokumentTyp,
)
# Import interfaces
from modules.interfaces.interfaceDbRealEstateObjects import getInterface as getRealEstateInterface
from modules.interfaces.interfaceDbComponentObjects import getInterface as getComponentInterface
# Import scraping script
from modules.features.realEstate.scrapeSwissTopo import scrape_switzerland
# Import Swiss Topo MapServer connector
from modules.connectors.connectorSwissTopoMapServer import SwissTopoMapServerConnector
from modules.connectors.connectorOerebWfs import OerebWfsConnector
# Import Tavily connector for BZO document search
from modules.aicore.aicorePluginTavily import AiTavily
# Configure logger
logger = logging.getLogger(__name__)
# Create router for real estate scraping endpoints
router = APIRouter(
prefix="/api/realestate",
tags=["Real Estate Scraping"],
responses={
404: {"description": "Not found"},
400: {"description": "Bad request"},
401: {"description": "Unauthorized"},
403: {"description": "Forbidden"},
500: {"description": "Internal server error"}
}
)
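
# The three routes below repeat the same CSRF validation. A possible shared
# helper (sketch only, not wired into the routes):
#
#     def _validate_csrf_token(request: Request, route: str, user_id: str) -> None:
#         token = request.headers.get("X-CSRF-Token") or request.headers.get("x-csrf-token")
#         if not token:
#             logger.warning(f"CSRF token missing for {route} from user {user_id}")
#             raise HTTPException(
#                 status_code=status.HTTP_403_FORBIDDEN,
#                 detail="CSRF token missing. Please include X-CSRF-Token header."
#             )
#         if not isinstance(token, str) or not (16 <= len(token) <= 64):
#             raise HTTPException(
#                 status_code=status.HTTP_403_FORBIDDEN,
#                 detail="Invalid CSRF token format"
#             )
#         try:
#             int(token, 16)
#         except ValueError:
#             raise HTTPException(
#                 status_code=status.HTTP_403_FORBIDDEN,
#                 detail="Invalid CSRF token format"
#             )
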
@router.post("/scrape-switzerland", response_model=Dict[str, Any])
@limiter.limit("5/hour") # Limit to 5 requests per hour (scraping is resource-intensive)
async def scrape_switzerland_route(
request: Request,
body: Dict[str, Any] = Body(..., description="Scraping parameters"),
currentUser: User = Depends(getCurrentUser)
) -> Dict[str, Any]:
"""
Scrape Kanton Zürich systematically using Swiss Topo connector and save parcel data to database.
This endpoint divides Kanton Zürich into a grid and queries parcels at each grid point,
then deduplicates and saves unique parcels to the database. For each parcel, it also
queries the ÖREB WFS service to retrieve bauzone information.
**WARNING**: This is a resource-intensive operation that may take a long time
and make many API requests. Use with caution.
Request Body:
{
"grid_size": 500.0, // Grid cell size in meters (default: 500m)
"max_concurrent": 50, // Maximum concurrent API requests (default: 50)
"batch_size": 100 // Number of parcels to process before saving (default: 100)
}
Headers:
- X-CSRF-Token: CSRF token (required for security)
Returns:
{
"success": true,
"stats": {
"total_queries": 1234,
"successful_queries": 1200,
"failed_queries": 34,
"unique_parcels_found": 500,
"parcels_saved": 450,
"parcels_skipped": 50,
"error_count": 5,
"errors": [...]
}
}
Example:
- POST /api/realestate/scrape-switzerland
Body: {"grid_size": 1000.0, "max_concurrent": 5, "batch_size": 50}
"""
try:
# Validate CSRF token
csrf_token = request.headers.get("X-CSRF-Token") or request.headers.get("x-csrf-token")
if not csrf_token:
logger.warning(f"CSRF token missing for POST /api/realestate/scrape-switzerland from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="CSRF token missing. Please include X-CSRF-Token header."
)
# Basic CSRF token format validation
if not isinstance(csrf_token, str) or len(csrf_token) < 16 or len(csrf_token) > 64:
logger.warning(f"Invalid CSRF token format for POST /api/realestate/scrape-switzerland from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
# Validate token is hex string
try:
int(csrf_token, 16)
except ValueError:
logger.warning(f"CSRF token is not a valid hex string for POST /api/realestate/scrape-switzerland from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
# Extract parameters from body with defaults
grid_size = body.get("grid_size", 500.0)
max_concurrent = body.get("max_concurrent", 50)
batch_size = body.get("batch_size", 100)
# Validate parameters
if grid_size <= 0 or grid_size > 10000:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="grid_size must be between 0 and 10000 meters"
)
if max_concurrent <= 0 or max_concurrent > 200:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="max_concurrent must be between 1 and 200"
)
if batch_size <= 0 or batch_size > 1000:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="batch_size must be between 1 and 1000"
)
logger.info(
f"Starting Switzerland scraping for user {currentUser.id} (mandate: {currentUser.mandateId}) "
f"with grid_size={grid_size}, max_concurrent={max_concurrent}, batch_size={batch_size}"
)
# Run scraping operation
result = await scrape_switzerland(
current_user=currentUser,
grid_size=grid_size,
max_concurrent=max_concurrent,
batch_size=batch_size
)
logger.info(
f"Scraping completed for user {currentUser.id}: "
f"{result['stats']['parcels_saved']} parcels saved"
)
return result
except HTTPException:
raise
except ValueError as e:
logger.error(f"Validation error in scrape_switzerland_route: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Validation error: {str(e)}"
)
except Exception as e:
logger.error(f"Error scraping Switzerland: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error scraping Switzerland: {str(e)}"
)
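
# Example client call (sketch; assumes an authenticated aiohttp ClientSession
# named `session` and a hypothetical `csrf_token` variable):
#
#     async with session.post(
#         "/api/realestate/scrape-switzerland",
#         json={"grid_size": 1000.0, "max_concurrent": 10, "batch_size": 50},
#         headers={"X-CSRF-Token": csrf_token},
#     ) as resp:
#         stats = (await resp.json())["stats"]
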
@router.get("/gemeinden", response_model=Dict[str, Any])
@limiter.limit("60/minute")
async def get_all_gemeinden(
request: Request,
only_current: bool = Query(True, description="Only return current municipalities (exclude historical)"),
currentUser: User = Depends(getCurrentUser)
) -> Dict[str, Any]:
"""
Fetch all Gemeinden (municipalities) from the Swiss Topo MapServer connector
and save them to the database.
This endpoint:
1. Fetches all Swiss municipalities from the Swiss Federal Office of Topography
2. Saves them to the database (skipping duplicates based on BFS number)
3. Creates Kantone (cantons) as needed
4. Returns statistics about the import operation
Query Parameters:
- only_current: If True, only return current municipalities (default: True).
If False, return all municipalities including historical ones.
Headers:
- X-CSRF-Token: CSRF token (required for security)
Returns:
{
"gemeinden": [
{
"id": "uuid",
"mandateId": "uuid",
"label": "Bern",
"id_kanton": "uuid",
"kontextInformationen": [...],
...
},
...
],
"count": 2162,
"stats": {
"gemeinden_created": 2100,
"gemeinden_skipped": 62,
"kantone_created": 26,
"error_count": 0,
"errors": []
}
}
Example:
- GET /api/realestate/gemeinden
- GET /api/realestate/gemeinden?only_current=false
"""
try:
# Validate CSRF token
csrf_token = request.headers.get("X-CSRF-Token") or request.headers.get("x-csrf-token")
if not csrf_token:
logger.warning(f"CSRF token missing for GET /api/realestate/gemeinden from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="CSRF token missing. Please include X-CSRF-Token header."
)
# Basic CSRF token format validation
if not isinstance(csrf_token, str) or len(csrf_token) < 16 or len(csrf_token) > 64:
logger.warning(f"Invalid CSRF token format for GET /api/realestate/gemeinden from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
# Validate token is hex string
try:
int(csrf_token, 16)
except ValueError:
logger.warning(f"CSRF token is not a valid hex string for GET /api/realestate/gemeinden from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
logger.info(f"Fetching all Gemeinden for user {currentUser.id} (mandate: {currentUser.mandateId}), only_current={only_current}")
# Initialize connectors and fetch all gemeinden
oereb_connector = OerebWfsConnector()
connector = SwissTopoMapServerConnector(oereb_connector=oereb_connector)
gemeinden_data = await connector.get_all_gemeinden(only_current=only_current)
# Get interface for database operations
realEstateInterface = getRealEstateInterface(currentUser)
# Statistics
gemeinden_created = 0
gemeinden_skipped = 0
kantone_created = 0
errors = []
# Cache for Kanton UUIDs
kanton_cache: Dict[str, str] = {}
# Helper function to find Gemeinde by BFS number
def find_gemeinde_by_bfs_nummer(bfs_nummer: str) -> Optional[Gemeinde]:
"""Find existing Gemeinde by BFS number (stored in kontextInformationen)."""
try:
gemeinden = realEstateInterface.getGemeinden(
recordFilter={"mandateId": currentUser.mandateId}
)
for gemeinde in gemeinden:
# Check kontextInformationen for bfs_nummer
for kontext in gemeinde.kontextInformationen:
try:
kontext_data = json.loads(kontext.inhalt) if isinstance(kontext.inhalt, str) else kontext.inhalt
if isinstance(kontext_data, dict):
if str(kontext_data.get("bfs_nummer")) == str(bfs_nummer):
return gemeinde
except (json.JSONDecodeError, AttributeError):
continue
return None
except Exception as e:
logger.error(f"Error finding Gemeinde by BFS number {bfs_nummer}: {e}", exc_info=True)
return None
# Helper function to get or create Kanton
def get_or_create_kanton(kanton_abk: str) -> Optional[str]:
"""Get or create a Kanton by abbreviation."""
nonlocal kantone_created, errors
if not kanton_abk:
return None
# Check cache first
if kanton_abk in kanton_cache:
return kanton_cache[kanton_abk]
# Check if exists
kantone = realEstateInterface.getKantone(
recordFilter={
"mandateId": currentUser.mandateId,
"abk": kanton_abk
}
)
if kantone:
kanton_cache[kanton_abk] = kantone[0].id
return kantone[0].id
# Create new Kanton
try:
# Map common abbreviations to full names
kanton_names = {
"AG": "Aargau", "AI": "Appenzell Innerrhoden", "AR": "Appenzell Ausserrhoden",
"BE": "Bern", "BL": "Basel-Landschaft", "BS": "Basel-Stadt",
"FR": "Freiburg", "GE": "Genf", "GL": "Glarus", "GR": "Graubünden",
"JU": "Jura", "LU": "Luzern", "NE": "Neuenburg", "NW": "Nidwalden",
"OW": "Obwalden", "SG": "St. Gallen", "SH": "Schaffhausen", "SO": "Solothurn",
"SZ": "Schwyz", "TG": "Thurgau", "TI": "Tessin", "UR": "Uri",
"VD": "Waadt", "VS": "Wallis", "ZG": "Zug", "ZH": "Zürich"
}
kanton_label = kanton_names.get(kanton_abk, kanton_abk)
kanton = Kanton(
mandateId=currentUser.mandateId,
label=kanton_label,
abk=kanton_abk
)
created_kanton = realEstateInterface.createKanton(kanton)
if created_kanton and created_kanton.id:
kanton_cache[kanton_abk] = created_kanton.id
kantone_created += 1
logger.info(f"Created new Kanton: {kanton_label} ({kanton_abk})")
return created_kanton.id
except Exception as e:
error_msg = f"Error creating Kanton {kanton_abk}: {e}"
logger.error(error_msg, exc_info=True)
errors.append(error_msg)
return None
# Process each gemeinde and save to database
saved_gemeinden = []
for gemeinde_data in gemeinden_data:
try:
gemeinde_name = gemeinde_data.get("name")
bfs_nummer = gemeinde_data.get("bfs_nummer")
kanton_abk = gemeinde_data.get("kanton")
if not gemeinde_name or not bfs_nummer:
logger.warning(f"Skipping Gemeinde with missing data: {gemeinde_data}")
gemeinden_skipped += 1
continue
# Check if Gemeinde already exists
existing_gemeinde = find_gemeinde_by_bfs_nummer(str(bfs_nummer))
if existing_gemeinde:
logger.debug(f"Gemeinde {gemeinde_name} (BFS: {bfs_nummer}) already exists, skipping")
gemeinden_skipped += 1
saved_gemeinden.append(existing_gemeinde.model_dump() if hasattr(existing_gemeinde, 'model_dump') else existing_gemeinde)
continue
# Get or create Kanton
kanton_id = get_or_create_kanton(kanton_abk) if kanton_abk else None
# Create new Gemeinde
gemeinde = Gemeinde(
mandateId=currentUser.mandateId,
label=gemeinde_name,
id_kanton=kanton_id,
kontextInformationen=[
Kontext(
thema="BFS Nummer",
inhalt=json.dumps({"bfs_nummer": bfs_nummer}, ensure_ascii=False)
)
]
)
created_gemeinde = realEstateInterface.createGemeinde(gemeinde)
if created_gemeinde and created_gemeinde.id:
gemeinden_created += 1
logger.info(f"Created new Gemeinde: {gemeinde_name} (BFS: {bfs_nummer})")
saved_gemeinden.append(created_gemeinde.model_dump() if hasattr(created_gemeinde, 'model_dump') else created_gemeinde)
else:
error_msg = f"Failed to create Gemeinde {gemeinde_name} (BFS: {bfs_nummer})"
logger.error(error_msg)
errors.append(error_msg)
gemeinden_skipped += 1
except Exception as e:
error_msg = f"Error processing Gemeinde {gemeinde_data.get('name', 'Unknown')}: {str(e)}"
logger.error(error_msg, exc_info=True)
errors.append(error_msg)
gemeinden_skipped += 1
logger.info(
f"Gemeinden import completed: {gemeinden_created} created, "
f"{gemeinden_skipped} skipped, {kantone_created} Kantone created"
)
return {
"gemeinden": saved_gemeinden,
"count": len(saved_gemeinden),
"stats": {
"gemeinden_created": gemeinden_created,
"gemeinden_skipped": gemeinden_skipped,
"kantone_created": kantone_created,
"error_count": len(errors),
"errors": errors[:10] # Return first 10 errors
}
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error fetching all Gemeinden: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error fetching Gemeinden: {str(e)}"
)
def _get_language_from_kanton(kanton_abk: Optional[str]) -> str:
"""
Determine language (German/French/Italian) based on Kanton abbreviation.
Args:
kanton_abk: Kanton abbreviation (e.g., 'ZH', 'VD', 'TI')
Returns:
Language code: 'de' (German), 'fr' (French), or 'it' (Italian)
"""
if not kanton_abk:
return 'de' # Default to German
# French-speaking cantons
french_cantons = {'VD', 'GE', 'NE', 'JU'}
# Italian-speaking canton
italian_cantons = {'TI'}
kanton_upper = kanton_abk.upper()
if kanton_upper in french_cantons:
return 'fr'
elif kanton_upper in italian_cantons:
return 'it'
else:
return 'de' # Default to German
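
# Examples (behaviour of the mapping above):
#
#     _get_language_from_kanton("VD")  -> "fr"
#     _get_language_from_kanton("ti")  -> "it"   (case-insensitive)
#     _get_language_from_kanton("ZH")  -> "de"
#     _get_language_from_kanton(None)  -> "de"   (default)
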
def _get_bzo_search_query(gemeinde_label: str, language: str) -> str:
"""
Generate language-specific BZO search query for a Gemeinde.
Args:
gemeinde_label: Name of the Gemeinde
language: Language code ('de', 'fr', 'it')
Returns:
Search query string
"""
if language == 'fr':
# French: Plan d'aménagement local or Règlement de construction
return f"Plan d'aménagement local {gemeinde_label} OR Règlement de construction {gemeinde_label}"
elif language == 'it':
# Italian: Piano di utilizzazione or Regolamento edilizio
return f"Piano di utilizzazione {gemeinde_label} OR Regolamento edilizio {gemeinde_label}"
else:
        # German: Bau- und Zonenordnung
        return f"Bau- und Zonenordnung {gemeinde_label}"
@router.post("/gemeinden/fetch-bzo-documents", response_model=Dict[str, Any])
@limiter.limit("10/hour") # Resource-intensive operation
async def fetch_bzo_documents(
request: Request,
currentUser: User = Depends(getCurrentUser)
) -> Dict[str, Any]:
"""
    Search for and download Bau- und Zonenordnung (BZO) documents for all Gemeinden.
This endpoint:
1. Fetches all Gemeinden from the database
2. For each Gemeinde, determines language based on Kanton
3. Uses Tavily search to find BZO documents (up to 5 results)
4. Downloads all PDF files found and stores them with content
5. Creates Dokument records for each PDF and links them to Gemeinde's dokumente field
6. Skips Gemeinden that already have BZO documents
Note: If Tavily returns multiple PDF results, all of them will be downloaded
and saved as separate Dokument records.
Headers:
- X-CSRF-Token: CSRF token (required for security)
Returns:
{
"success": true,
"stats": {
"gemeinden_processed": 100,
"documents_created": 85,
"documents_skipped": 15,
"errors": []
},
"results": [
{
"gemeinde_id": "...",
"gemeinde_label": "Zürich",
"status": "created|skipped|error",
"dokument_ids": ["...", "..."], // List of created document IDs (can be multiple)
"error": null
}
]
}
"""
try:
# Validate CSRF token
csrf_token = request.headers.get("X-CSRF-Token") or request.headers.get("x-csrf-token")
if not csrf_token:
logger.warning(f"CSRF token missing for POST /api/realestate/gemeinden/fetch-bzo-documents from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="CSRF token missing. Please include X-CSRF-Token header."
)
# Basic CSRF token format validation
if not isinstance(csrf_token, str) or len(csrf_token) < 16 or len(csrf_token) > 64:
logger.warning(f"Invalid CSRF token format for POST /api/realestate/gemeinden/fetch-bzo-documents from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
# Validate token is hex string
try:
int(csrf_token, 16)
except ValueError:
logger.warning(f"CSRF token is not a valid hex string for POST /api/realestate/gemeinden/fetch-bzo-documents from user {currentUser.id}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Invalid CSRF token format"
)
logger.info(f"Starting BZO document fetch for user {currentUser.id} (mandate: {currentUser.mandateId})")
# Get interfaces
realEstateInterface = getRealEstateInterface(currentUser)
componentInterface = getComponentInterface(currentUser)
# Initialize Tavily connector
tavily = AiTavily()
# Get all Gemeinden
gemeinden = realEstateInterface.getGemeinden(
recordFilter={"mandateId": currentUser.mandateId}
)
logger.info(f"Found {len(gemeinden)} Gemeinden to process")
# Statistics
stats = {
"gemeinden_processed": 0,
"documents_created": 0,
"documents_skipped": 0,
"errors": []
}
results = []
# Process each Gemeinde
for gemeinde in gemeinden:
gemeinde_result = {
"gemeinde_id": gemeinde.id,
"gemeinde_label": gemeinde.label,
"status": None,
"dokument_ids": [], # Changed to list to support multiple documents
"error": None
}
try:
stats["gemeinden_processed"] += 1
# Check if Gemeinde already has a BZO document
existing_bzo = False
if gemeinde.dokumente:
for doc in gemeinde.dokumente:
                        # Check if it's a BZO document by label or dokumentTyp
                        label_upper = doc.label.upper() if doc.label else ""
                        has_bzo_label = any(term in label_upper for term in (
                            "BZO", "ZONENORDNUNG",  # covers hyphenated and unhyphenated spellings
                            "PLAN D'AMÉNAGEMENT", "RÈGLEMENT DE CONSTRUCTION",
                            "PIANO DI UTILIZZAZIONE", "REGOLAMENTO EDILIZIO"
                        ))
                        has_bzo_typ = doc.dokumentTyp in (DokumentTyp.GEMEINDE_BZO_AKTUELL, DokumentTyp.GEMEINDE_BZO_REVISION)
                        if has_bzo_label or has_bzo_typ:
                            existing_bzo = True
                            break
if existing_bzo:
logger.debug(f"Gemeinde {gemeinde.label} already has BZO document, skipping")
gemeinde_result["status"] = "skipped"
stats["documents_skipped"] += 1
results.append(gemeinde_result)
continue
# Get Kanton to determine language
kanton_abk = None
if gemeinde.id_kanton:
kanton = realEstateInterface.getKanton(gemeinde.id_kanton)
if kanton:
kanton_abk = kanton.abk
# Determine language
language = _get_language_from_kanton(kanton_abk)
# Generate search query
search_query = _get_bzo_search_query(gemeinde.label, language)
logger.info(f"Searching for BZO document for {gemeinde.label} (language: {language}) with query: {search_query}")
# Search with Tavily using the private _search method
search_results = await tavily._search(
query=search_query,
maxResults=5,
country="switzerland"
)
if not search_results:
logger.warning(f"No search results found for {gemeinde.label}")
gemeinde_result["status"] = "error"
gemeinde_result["error"] = "No search results found"
stats["errors"].append(f"{gemeinde.label}: No search results found")
results.append(gemeinde_result)
continue
# Find all PDF URLs from search results
pdf_urls = []
for result in search_results:
                    url = result.url.lower()
                    # the substring check already covers URLs ending in '.pdf'
                    if 'pdf' in url:
                        pdf_urls.append(result.url)
# If no PDF URLs found, try to use all results (they might be PDFs even without .pdf extension)
if not pdf_urls:
pdf_urls = [result.url for result in search_results]
logger.info(f"No explicit PDF URLs found for {gemeinde.label}, trying all {len(pdf_urls)} results")
logger.info(f"Found {len(pdf_urls)} potential PDF documents for {gemeinde.label}")
# Helper function to download a single PDF
async def download_pdf(pdf_url: str) -> Optional[bytes]:
"""Download a PDF from a URL with retry logic."""
max_retries = 3
retry_delay = 2
for attempt in range(max_retries):
try:
# Create headers - use minimal headers on retry after 406 error
if attempt > 0:
# Minimal headers for retry
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': '*/*'
}
else:
# Full headers for first attempt
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'application/pdf,application/octet-stream,*/*',
'Accept-Language': 'de-DE,de;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
timeout = aiohttp.ClientTimeout(total=30, connect=10)
async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
async with session.get(pdf_url, allow_redirects=True) as response:
if response.status == 200:
pdf_content = await response.read()
if not pdf_content or len(pdf_content) < 100: # Minimum size check
raise Exception("Downloaded file is too small or empty")
# Verify it's actually a PDF (check PDF magic bytes)
if not pdf_content.startswith(b'%PDF'):
# Check if it's HTML (common error page)
if pdf_content.startswith(b'<') or pdf_content.startswith(b'<!DOCTYPE'):
raise Exception("Server returned HTML instead of PDF")
logger.warning(f"Downloaded file from {pdf_url} doesn't appear to be a PDF, but continuing anyway")
# Success - return content
return pdf_content
elif response.status == 406:
# Not Acceptable - try with minimal headers on next attempt
logger.warning(f"HTTP 406 for {pdf_url}, will retry with minimal headers (attempt {attempt + 1}/{max_retries})")
if attempt < max_retries - 1:
await asyncio.sleep(retry_delay)
continue
else:
raise Exception(f"HTTP {response.status} (Not Acceptable) - server rejected request after {max_retries} attempts")
                                    else:
                                        # A 3xx status here means allow_redirects=True could not
                                        # resolve the redirect chain; the body would be the redirect
                                        # page rather than a PDF, so treat it as an error
                                        raise Exception(f"HTTP {response.status} when downloading PDF")
except asyncio.TimeoutError:
logger.warning(f"Timeout downloading PDF from {pdf_url} (attempt {attempt + 1}/{max_retries})")
if attempt < max_retries - 1:
await asyncio.sleep(retry_delay)
continue
else:
raise Exception("Connection timeout after retries")
except aiohttp.ClientError as e:
logger.warning(f"Connection error downloading PDF from {pdf_url} (attempt {attempt + 1}/{max_retries}): {str(e)}")
if attempt < max_retries - 1:
await asyncio.sleep(retry_delay)
continue
else:
raise Exception(f"Connection error: {str(e)}")
                        except Exception:
                            # Any other error (e.g. invalid content) fails fast without retrying
                            raise
return None
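
                # Effective retry policy of download_pdf (summary): up to 3 attempts,
                # 2 s apart; attempt 1 sends browser-like headers, later attempts use
                # minimal headers (works around HTTP 406); timeouts, client errors and
                # 406 responses are retried, all other failures abort immediately.
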
# Process all PDF URLs
created_dokumente = []
current_dokumente = list(gemeinde.dokumente) if gemeinde.dokumente else []
# Sanitize Gemeinde name for filename
safe_name = "".join(c for c in gemeinde.label if c.isalnum() or c in (' ', '-', '_')).strip()
safe_name = safe_name.replace(' ', '_')
if not safe_name:
safe_name = "Gemeinde"
# Determine base label based on language
if language == 'fr':
base_doc_label = f"Plan d'aménagement local {gemeinde.label}"
elif language == 'it':
base_doc_label = f"Piano di utilizzazione {gemeinde.label}"
else:
base_doc_label = f"BZO {gemeinde.label}"
# Process each PDF URL
for idx, pdf_url in enumerate(pdf_urls):
try:
logger.info(f"Downloading PDF {idx + 1}/{len(pdf_urls)} from {pdf_url} for {gemeinde.label}")
pdf_content = await download_pdf(pdf_url)
if not pdf_content:
logger.warning(f"Failed to download PDF from {pdf_url} for {gemeinde.label}")
continue
# Additional validation
if len(pdf_content) < 100:
logger.warning(f"Downloaded file from {pdf_url} is too small for {gemeinde.label}")
continue
# Create unique file name (add index if multiple documents)
if len(pdf_urls) > 1:
file_name = f"BZO_{safe_name}_{idx + 1}.pdf"
doc_label = f"{base_doc_label} ({idx + 1})"
else:
file_name = f"BZO_{safe_name}.pdf"
doc_label = base_doc_label
# Store file using ComponentObjects
try:
file_item = componentInterface.createFile(
name=file_name,
mimeType="application/pdf",
content=pdf_content
)
# Store file data
componentInterface.createFileData(file_item.id, pdf_content)
logger.info(f"Stored file {file_name} with ID {file_item.id} for {gemeinde.label}")
except Exception as e:
logger.error(f"Error storing file {file_name} for {gemeinde.label}: {str(e)}", exc_info=True)
stats["errors"].append(f"{gemeinde.label}: File storage failed for {pdf_url} - {str(e)}")
continue
# Create Dokument record
dokument = Dokument(
mandateId=currentUser.mandateId,
label=doc_label,
versionsbezeichnung="Aktuell",
dokumentTyp=DokumentTyp.GEMEINDE_BZO_AKTUELL,
dokumentReferenz=file_item.id, # FileId from ComponentObjects
quelle=pdf_url, # Original URL
mimeType="application/pdf",
kategorienTags=["BZO", "Bauordnung", gemeinde.label]
)
# Create Dokument record in the Dokument table
created_dokument = realEstateInterface.createDokument(dokument)
logger.info(f"Created Dokument record with ID {created_dokument.id} for {gemeinde.label} (from {pdf_url})")
created_dokumente.append(created_dokument)
current_dokumente.append(created_dokument)
gemeinde_result["dokument_ids"].append(created_dokument.id)
except Exception as e:
logger.error(f"Error processing PDF {pdf_url} for {gemeinde.label}: {str(e)}", exc_info=True)
stats["errors"].append(f"{gemeinde.label}: Error processing PDF {pdf_url} - {str(e)}")
continue
# Update Gemeinde with all new dokumente
if created_dokumente:
updated_gemeinde = realEstateInterface.updateGemeinde(
gemeinde.id,
{"dokumente": current_dokumente}
)
if updated_gemeinde:
logger.info(f"Successfully created {len(created_dokumente)} BZO document(s) for {gemeinde.label}")
gemeinde_result["status"] = "created"
stats["documents_created"] += len(created_dokumente)
else:
raise Exception("Failed to update Gemeinde")
else:
# No documents were successfully created
gemeinde_result["status"] = "error"
gemeinde_result["error"] = "No PDFs could be downloaded or processed"
stats["errors"].append(f"{gemeinde.label}: No PDFs could be downloaded or processed")
except Exception as e:
logger.error(f"Error processing Gemeinde {gemeinde.label}: {str(e)}", exc_info=True)
gemeinde_result["status"] = "error"
gemeinde_result["error"] = str(e)
stats["errors"].append(f"{gemeinde.label}: {str(e)}")
results.append(gemeinde_result)
logger.info(
f"BZO document fetch completed: {stats['documents_created']} created, "
f"{stats['documents_skipped']} skipped, {len(stats['errors'])} errors"
)
return {
"success": True,
"stats": stats,
"results": results
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error fetching BZO documents: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error fetching BZO documents: {str(e)}"
)