# platform-core/modules/features/realEstate/bzoExtraction.py

"""
Pipeline for extracting structured content from BZO PDFs.

The extraction runs as a plain sequential pipeline of step functions; each step
takes the shared state dict, mutates/returns it, and the steps are chained
directly (no external workflow-orchestration framework).
"""

import logging
import re
import uuid
from typing import TypedDict, List, Dict, Any, Optional
from dataclasses import dataclass

from modules.features.realEstate.bzoPdfExtractor import BZOPdfExtractor, TextBlock
from modules.features.realEstate.bzoRuleTaxonomy import RULE_TAXONOMY

logger = logging.getLogger(__name__)


# ===== BZO Params Extraction State (LLM step) =====

class BZOParamsExtractionState(TypedDict):
    """State for BZO params extraction via LLM.

    A ``TypedDict`` only describes the expected keys and value types of the
    state dict. Keys above the ``# Output`` marker are inputs to the step; keys
    below it are its results.
    """
    # Extracted document content (exact schema is not visible in this part of the file — TODO confirm)
    extracted_content: Dict[str, Any]
    # Code of the target building zone (presumably e.g. "W2" — verify against caller)
    bauzone: str
    # Plot area in square metres; may be absent
    total_area_m2: Optional[float]
    # Rules / articles / zone-parameter tables pre-selected for the target zone
    relevant_rules: List[Dict[str, Any]]
    relevant_articles: List[Dict[str, Any]]
    zone_parameter_tables: List[Dict[str, Any]]
    # Service object used for the LLM call (typed Any; interface not visible here)
    ai_service: Any
    # Municipality name
    gemeinde: str
    # Output
    bauzone_params_list: List[str]
    fakten: List[Dict[str, str]]
    zusatzinformationen: List[Dict[str, Any]]
    # Error messages collected during the step
    errors: List[str]


# ===== State Definition =====

@dataclass
class ClassifiedBlock:
    """Classified text block.

    Pairs an extracted ``TextBlock`` with the type assigned to it and, for
    article blocks, the parsed label and title. ``classify_and_assemble``
    stores the same four fields as plain dicts in ``classified_blocks``.
    """
    block: TextBlock  # the underlying extracted text block
    block_type: str  # "article", "heading", "table", "other"
    article_label: Optional[str] = None  # e.g. "Art. 12"; None for non-article blocks
    article_title: Optional[str] = None  # text following the label, if one was found


@dataclass
class Article:
    """Assembled article.

    ``classify_and_assemble`` produces plain dicts with exactly these field
    names; the comments below describe how that function fills them.
    """
    article_label: str  # normalized label, e.g. "Art. 12"
    article_title: Optional[str]  # title parsed after the label, if any
    text: str  # stripped text of all blocks of the article, joined with newlines
    page_start: int  # page of the block that opened the article
    page_end: int  # page of the last block appended to the article
    section_level_1: Optional[str] = None  # marker of the enclosing "A."-style heading
    section_level_2: Optional[str] = None  # marker of the enclosing Roman-numeral heading ("II.")
    section_level_3: Optional[str] = None  # marker of the enclosing numbered heading ("3.")
    zone_raw: Optional[str] = None  # always None when built by classify_and_assemble


@dataclass
class ZoneInfo:
    """Zone information.

    Instances are created by ``extract_zones_and_tables`` for every zone code
    matched in an article; the comments describe how that function fills them.
    """
    zone_code: str  # upper-cased code as matched in the text, e.g. "W2/30G"
    zone_name: str  # built as f"Zone {zone_code}"
    zone_category: Optional[str] = None  # "Wohnzonen", "Zentrumszonen", "Arbeitsplatzzonen" or None
    zone_subcategory: Optional[str] = None  # not set by the code visible in this file section
    empfindlichkeitsstufe: Optional[str] = None  # not set by the code visible in this file section
    geschosszahl: Optional[int] = None  # first run of digits in the code (W3/50 -> 3)
    gewerbeerleichterung: bool = False  # True when the code ends with "G"


@dataclass
class RuleCandidate:
    """Rule candidate from pattern matching.

    ``extract_rules`` builds candidates as plain dicts with these field names
    (plus an ``article_label`` key).
    """
    rule_type: str  # key into RULE_TAXONOMY
    matched_text: str  # exact text matched by the taxonomy pattern
    article_text: str  # full text of the article the match came from
    page: int  # start page of the source article
    is_table_rule: bool = False
    # Defaults to None (not an empty list), so the annotation is Optional.
    table_zones: Optional[List[str]] = None
    condition_text: Optional[str] = None  # locational/conditional phrase found near the match


@dataclass
class ParsedRule:
    """Parsed rule with structured values.

    ``extract_rules`` emits plain dicts with these field names (plus an
    ``article_label`` key) into ``state["parsed_rules"]``.
    """
    rule_type: str  # key into RULE_TAXONOMY
    value_numeric: Optional[float]  # parsed number, or None when no number was found
    value_text: str  # the matched text of the rule
    unit: Optional[str]  # normalized unit ("m", "%") or None
    condition_text: Optional[str]  # conditional phrase found near the match, if any
    is_table_rule: bool
    table_zones: List[str]  # zone codes mentioned in the source article
    page: int  # start page of the source article
    text_snippet: str  # extract_rules sets this to the same value as value_text
    zone_raw: Optional[str] = None  # first entry of table_zones, if any
    rule_scope: str = "general"  # "zone" when zone_raw is set, otherwise "general"
    confidence: float = 0.5  # extract_rules uses 0.5 / 0.8 (number) / 0.9 (number + unit)


class BZOExtractionState(TypedDict):
    """State for BZO extraction pipeline.

    One dict of this shape is passed through the step functions
    (``classify_and_assemble``, ``extract_zones_and_tables``,
    ``extract_rules``); each step reads earlier keys and writes its own.
    """
    # Input metadata
    dokument_id: Optional[str]
    pdf_id: str

    # Extracted text blocks (stored as dicts for serialization);
    # classify_and_assemble reads "text", "page", "block_id" and optional "bbox"
    text_blocks: List[Dict[str, Any]]

    # Classified blocks (stored as dicts for serialization); written by classify_and_assemble
    classified_blocks: List[Dict[str, Any]]

    # Assembled articles (stored as dicts for serialization); written by classify_and_assemble
    articles: List[Dict[str, Any]]

    # Zone tracking; written by extract_zones_and_tables
    # NOTE(review): extract_zones_and_tables stores ZoneInfo dataclass instances as the
    # values of current_zones, which does not match the declared Dict[str, Any] — confirm
    # which representation downstream code expects.
    current_zones: Dict[str, Dict[str, Any]]
    zones: List[Dict[str, Any]]

    # Rule extraction (stored as dicts for serialization); extract_rules writes
    # parsed_rules only — rule_candidates is not written by the steps visible here
    rule_candidates: List[Dict[str, Any]]
    parsed_rules: List[Dict[str, Any]]

    # Zone-parameter tables (structured table data mapping zones to parameters)
    zone_parameter_tables: List[Dict[str, Any]]

    # Processing metadata; each step appends to errors when it catches an exception
    errors: List[str]
    warnings: List[str]


# ===== Node Implementations (Simplified 4-node pipeline) =====

def classify_and_assemble(state: BZOExtractionState) -> BZOExtractionState:
    """Classify the raw text blocks, then group them into articles.

    Reads ``state["text_blocks"]`` and writes ``state["classified_blocks"]``
    and ``state["articles"]``. Any exception is logged and appended to
    ``state["errors"]``; the state is returned in both cases.

    Pass 1 tags every non-empty block as "article" (contains an "Art. N"
    label), "heading", "table" or "other". Pass 2 walks the tagged blocks in
    order: a block carrying an article label opens a new article, every other
    block is appended to the currently open article, and heading blocks also
    update the section markers recorded on subsequently opened articles.
    """
    # A heading starts with "A. X", a Roman numeral ("II. X") or a number ("3. X").
    heading_starts = (r'^[A-Z]\.\s+[A-Z]', r'^[IVX]+\.\s+[A-Z]', r'^\d+\.\s+[A-Z]')
    try:
        # ----- Pass 1: classification -----
        tagged = []
        for raw_block in state["text_blocks"]:
            stripped = raw_block["text"].strip()
            if not stripped:
                continue
            kind, label, title = "other", None, None
            label_hit = re.search(r'Art\.?\s*(\d+[a-z]?)', stripped, re.IGNORECASE)
            if label_hit:
                kind = "article"
                label = f"Art. {label_hit.group(1)}"
                title_hit = re.search(r'Art\.?\s*\d+[a-z]?\s+(.+?)(?:\.|$|\n)', stripped, re.IGNORECASE)
                if title_hit:
                    title = title_hit.group(1).strip()
            elif any(re.match(start, stripped) for start in heading_starts):
                kind = "heading"
            elif '\t' in stripped or (len(stripped.split()) > 5 and stripped.count('  ') > 2):
                # Tabs or several double-space gaps are taken as column separators.
                kind = "table"
            tagged.append({
                "block": {
                    "page": raw_block["page"],
                    "text": raw_block["text"],
                    "block_id": raw_block["block_id"],
                    "bbox": raw_block.get("bbox"),
                },
                "block_type": kind,
                "article_label": label,
                "article_title": title,
            })
        state["classified_blocks"] = tagged

        # ----- Pass 2: article assembly -----
        assembled = []
        open_article = None
        section_1 = section_2 = section_3 = None
        for entry in tagged:
            source = entry["block"]
            body = source["text"].strip()
            label = entry.get("article_label")
            title = entry.get("article_title")
            if entry["block_type"] == "heading":
                # The marker is everything up to and including the first dot.
                marker = body.split('.', 1)[0] + '.'
                if re.match(r'^[A-Z]\.\s+', body):
                    section_1, section_2, section_3 = marker, None, None
                elif re.match(r'^[IVX]+\.\s+', body):
                    section_2, section_3 = marker, None
                elif re.match(r'^\d+\.\s+', body):
                    section_3 = marker
            if label:
                if open_article:
                    assembled.append(open_article)
                open_article = {
                    "article_label": label, "article_title": title, "text": body,
                    "page_start": source["page"], "page_end": source["page"],
                    "section_level_1": section_1, "section_level_2": section_2,
                    "section_level_3": section_3, "zone_raw": None
                }
            elif open_article:
                # Continuation block (including headings): extend the open article.
                open_article["text"] += "\n" + body
                open_article["page_end"] = source["page"]
        if open_article:
            assembled.append(open_article)
        state["articles"] = assembled
        return state
    except Exception as e:
        logger.error(f"Error in classify_and_assemble: {e}", exc_info=True)
        state["errors"] = state.get("errors", []) + [f"Classify/assemble error: {str(e)}"]
        return state


def extract_zones_and_tables(state: BZOExtractionState) -> BZOExtractionState:
    """Collect zone declarations from the articles, then extract zone tables.

    Reads ``state["articles"]`` and writes ``state["current_zones"]`` (zone
    code -> ``ZoneInfo``) and ``state["zones"]`` (one dict per match, in match
    order, duplicates included). It then delegates to
    ``_extract_zone_parameter_tables_impl`` for the table part. Any exception
    is logged and appended to ``state["errors"]``.

    NOTE(review): ``current_zones`` receives ``ZoneInfo`` instances although
    the state type declares dict values — confirm what consumers expect.
    """
    # Pattern: "Wohnzone W2", "Zone W3", "Gewerbezone G1"
    zone_patterns = [
        r'(?:Wohnzone|Zone|Gewerbezone|Industriezone|Zentrumszone|Ortsbildschutzzone|Erholungszone)\s+([A-Z0-9/]+)',
        r'([A-Z]\d+(?:/\d+)?(?:G)?)',  # W2/30, W2/30G, Z3, K3/4
    ]
    # (keyword looked up in the article text, code prefix, resulting category);
    # evaluated in this order, first hit wins.
    category_rules = (
        ('Wohnzone', 'W', "Wohnzonen"),
        ('Zentrumszone', 'Z', "Zentrumszonen"),
        ('Gewerbezone', 'G', "Arbeitsplatzzonen"),
        ('Industriezone', 'I', "Arbeitsplatzzonen"),
    )
    try:
        # Part 1: Detect zone declarations
        zone_records = []
        zones_by_code = {}
        for article in state["articles"]:
            article_text = article.get("text", "")
            source_label = article.get("article_label", "")
            source_page = article.get("page_start", 0)

            for zone_pattern in zone_patterns:
                for hit in re.finditer(zone_pattern, article_text, re.IGNORECASE):
                    zone_code = hit.group(1).upper()

                    # A trailing "G" marks Gewerbeerleichterung and is not part of the base code.
                    has_relief = zone_code.endswith('G')
                    base_code = zone_code[:-1] if has_relief else zone_code

                    # Storey count = first digit run of the part before any "/" (W2 -> 2, W3/50 -> 3)
                    digit_source = base_code.split('/')[0] if '/' in base_code else base_code
                    digit_hit = re.search(r'(\d+)', digit_source)
                    storeys = int(digit_hit.group(1)) if digit_hit else None

                    # Category from the article wording or the code's first letter
                    category = next(
                        (
                            name
                            for keyword, prefix, name in category_rules
                            if keyword in article_text or zone_code.startswith(prefix)
                        ),
                        None,
                    )

                    info = ZoneInfo(
                        zone_code=zone_code,
                        zone_name=f"Zone {zone_code}",
                        zone_category=category,
                        geschosszahl=storeys,
                        gewerbeerleichterung=has_relief
                    )

                    zones_by_code[zone_code] = info
                    zone_records.append({
                        "zone_code": zone_code,
                        "zone_name": info.zone_name,
                        "zone_category": category,
                        "geschosszahl": storeys,
                        "gewerbeerleichterung": has_relief,
                        "source_article": source_label,
                        "page": source_page
                    })

        state["current_zones"] = zones_by_code
        state["zones"] = zone_records

        # Part 2: Extract zone-parameter tables
        _extract_zone_parameter_tables_impl(state)
        return state
    except Exception as e:
        logger.error(f"Error in extract_zones_and_tables: {e}", exc_info=True)
        state["errors"] = state.get("errors", []) + [f"Zones/tables error: {str(e)}"]
        return state


def _clean_parameter_label(raw_label: str) -> str:
    """Reduce the label text of a table row to the bare parameter name."""
    # Keep only the first column; cells are separated by tabs or runs of 2+ spaces.
    label = re.split(r'\s{2,}|\t', raw_label.strip(), maxsplit=1)[0]
    label = label.rstrip(':').strip()
    # Drop a trailing "min."/"max." qualifier (same order as before: min, then max).
    label = re.sub(r'\s+min\.?\s*$', '', label, flags=re.I)
    return re.sub(r'\s+max\.?\s*$', '', label, flags=re.I)


def _extract_zone_parameter_tables_impl(state: BZOExtractionState) -> None:
    """Extract zone-parameter tables from classified blocks. Mutates state in place.

    Reads the blocks of type "table" from ``state["classified_blocks"]`` and
    appends one dict per recognised table to ``state["zone_parameter_tables"]``.
    Each table dict has the keys ``page``, ``zones`` (column zone codes),
    ``parameters`` (list of ``{"parameter", "values_by_zone", "article"}``),
    ``source_text`` (first 500 characters) and ``article``.

    A block counts as a zone table when one line holds at least three zone
    codes (header row), or when the block as a whole contains a parameter
    keyword and at least three zone codes. Lines after the header are read as
    parameter rows, subparameter rows or value rows; a value row is accepted
    only when it yields exactly one value per zone column.

    NOTE(review): a line recognised as a parameter or subparameter row is not
    scanned for values, so values printed on the same line as the label are
    not captured — confirm this matches the layout the PDF extractor produces.
    """
    tables = []
    classified_blocks = state.get("classified_blocks", [])
    table_blocks = [b for b in classified_blocks if b.get("block_type") == "table"]
    zone_pattern = r'\b([WLIZK]\d+(?:/\d+)?(?:G\*?)?)\b'
    parameter_keywords = [
        r'Ausnützungsziffer', r'Überbauungsziffer', r'Vollgeschosse', r'Dachgeschosse', r'Attikageschoss', r'Untergeschoss',
        r'Gebäudelänge', r'Grenzabstand', r'Fassadenhöhen', r'Grundabstand', r'Mehrlängen', r'Höchstmass'
    ]
    parameter_row_patterns = [
        # Lettered row "a) Label ...". The whole remainder is captured and cleaned by
        # _clean_parameter_label; the previous lazy group followed only by an optional
        # suffix captured a single character of the label.
        r'^[a-g]\)\s+(.+)$',
        r'^(Ausnützungsziffer|Überbauungsziffer|Vollgeschosse|Dachgeschosse|Attikageschoss|Untergeschoss|Gebäudelänge|Grenzabstand|Fassadenhöhen|Grundabstand|Mehrlängen|Höchstmass|Höchstmaß)',
    ]
    subparameter_patterns = [
        r'^(Grundabstand|Mehrlängen|Höchstmass|Höchstmaß|Fassadenhöhen)\s*(min\.|max\.)?',
        r'^(anrechenbare\s+Dachgeschosse|anrechenbares\s+Attikageschoss|anrechenbares\s+Untergeschoss)',
    ]
    numeric_pattern = r'(\d+(?:\.\d+)?)\s*(%|m|Geschoss|Geschosse|Geschosse\s+max\.?|Geschoss\s+max\.?)?'
    for table_block in table_blocks:
        block_dict = table_block.get("block", {})
        text = block_dict.get("text", "")
        page = block_dict.get("page", 0)
        if not text or len(text.strip()) < 20:
            continue
        lines = text.split('\n')

        # Header row: first line naming at least three zone codes.
        header_row_idx, zone_columns = None, []
        for idx, line in enumerate(lines):
            zone_matches = re.findall(zone_pattern, line, re.IGNORECASE)
            if len(zone_matches) >= 3:
                header_row_idx, zone_columns = idx, zone_matches
                break
        if not zone_columns:
            # Fallback: no single header line, but the block mentions parameters and zones.
            has_parameters = any(re.search(kw, text, re.IGNORECASE) for kw in parameter_keywords)
            has_zones = len(re.findall(zone_pattern, text, re.IGNORECASE)) >= 3
            if has_parameters and has_zones:
                zone_columns = list(dict.fromkeys(re.findall(zone_pattern, text, re.IGNORECASE)))
                header_row_idx = 0
        if not zone_columns:
            continue

        # Article context: first labelled block on the same page.
        article_context = None
        for block in classified_blocks:
            if block.get("block", {}).get("page") == page and block.get("article_label"):
                article_context = block.get("article_label")
                break

        table_data = {"page": page, "zones": zone_columns, "parameters": [], "source_text": text[:500], "article": article_context}
        start_idx = (header_row_idx + 1) if header_row_idx is not None else 0
        current_parameter = current_subparameter = None
        # Two distinct dicts (a chained assignment would alias a single one).
        parameter_values, subparameter_values = {}, {}
        for line_idx in range(start_idx, len(lines)):
            line = lines[line_idx].strip()
            if not line:
                continue

            is_parameter_row, parameter_name = False, None
            for pat in parameter_row_patterns:
                m = re.match(pat, line, re.IGNORECASE)
                if m:
                    is_parameter_row, parameter_name = True, _clean_parameter_label(m.group(1))
                    break

            is_subparameter, subparameter_name = False, None
            if not is_parameter_row:
                for pat in subparameter_patterns:
                    m = re.search(pat, line, re.IGNORECASE)
                    if m:
                        is_subparameter, subparameter_name = True, m.group(1).strip() + (f" {m.group(2).strip()}" if m.lastindex and m.lastindex >= 2 and m.group(2) else "")
                        break

            if is_parameter_row and parameter_name:
                # Close the previous parameter. A still-open subparameter is flushed too;
                # it used to be dropped when the next top-level parameter started.
                if current_subparameter and subparameter_values and current_parameter:
                    table_data["parameters"].append({"parameter": f"{current_parameter} - {current_subparameter}", "values_by_zone": subparameter_values.copy(), "article": article_context})
                if current_parameter and parameter_values:
                    table_data["parameters"].append({"parameter": current_parameter, "values_by_zone": parameter_values.copy(), "article": article_context})
                current_parameter, current_subparameter, parameter_values, subparameter_values = parameter_name, None, {}, {}
                continue

            if is_subparameter and subparameter_name:
                if current_subparameter and subparameter_values and current_parameter:
                    table_data["parameters"].append({"parameter": f"{current_parameter} - {current_subparameter}", "values_by_zone": subparameter_values.copy(), "article": article_context})
                current_subparameter, subparameter_values = subparameter_name, {}
                continue

            if current_parameter or current_subparameter:
                # Values belong to the open subparameter if there is one, else to the parameter.
                target_values = subparameter_values if current_subparameter else parameter_values
                line_parts = re.split(r'\s{2,}|\t', line)
                line_parts = [p.strip() for p in line_parts if p.strip()]
                n = len(zone_columns)
                value_parts = []
                # Column-based: extract trailing numeric/fraction parts that align with zone count
                for p in reversed(line_parts):
                    if re.match(r'^\d+(?:\.\d+)?\s*(%|m)?$', p, re.I) or re.match(r'^\d+/\d+$', p):
                        val = re.sub(r'\s*(%|m)$', '', p, flags=re.I).strip()
                        unit = None
                        um = re.search(r'\s*(%|m)$', p, re.I)
                        if um:
                            unit = 'm' if um.group(1).lower() == 'm' else '%'
                        value_parts.insert(0, (val, unit))
                    else:
                        break
                if len(value_parts) == n:
                    for zi, zone in enumerate(zone_columns):
                        if zone not in target_values:
                            target_values[zone] = []
                        val, unit = value_parts[zi]
                        target_values[zone].append({"value": val, "unit": unit, "raw_text": line[:200], "line_number": line_idx})
                else:
                    # Fallback: regex match by character position
                    all_matches = [(m.start(), m.group(0), m.group(1), m.group(2) if m.lastindex and m.lastindex > 1 else None) for m in re.finditer(numeric_pattern, line, re.I)]
                    all_matches += [(m.start(), m.group(0), m.group(0), None) for m in re.finditer(r'(\d+/\d+)', line, re.I)]
                    all_matches.sort(key=lambda x: x[0])
                    if len(all_matches) == n:
                        for zi, zone in enumerate(zone_columns):
                            if zone not in target_values:
                                target_values[zone] = []
                            _, _, val, unit = all_matches[zi]
                            target_values[zone].append({"value": val, "unit": unit.strip() if unit else None, "raw_text": line[:200], "line_number": line_idx})

        # Flush whatever is still open at the end of the block.
        if current_subparameter and subparameter_values and current_parameter:
            table_data["parameters"].append({"parameter": f"{current_parameter} - {current_subparameter}", "values_by_zone": subparameter_values.copy(), "article": article_context})
        if current_parameter and parameter_values:
            table_data["parameters"].append({"parameter": current_parameter, "values_by_zone": parameter_values.copy(), "article": article_context})
        if table_data["parameters"]:
            tables.append(table_data)
    state["zone_parameter_tables"] = state.get("zone_parameter_tables", []) + tables
    if tables:
        logger.info(f"Extracted {len(tables)} zone-parameter tables")


# Zone code pattern: W5, W2/30, Z3, K3/4, W5G, W 5 (optional space)
_ZONE_CODE_PATTERN = re.compile(r'\b([WZIK]\s*\d+(?:\s*/\s*\d+)?(?:G)?)\b', re.IGNORECASE)


def _zones_in_text(text: str) -> List[str]:
    """Extract zone codes (W5, W2/30, Z3, etc.) from text. Returns unique list, normalized (e.g. W5)."""
    matches = _ZONE_CODE_PATTERN.findall(text)
    seen = set()
    result = []
    for m in matches:
        # Normalize: remove spaces -> W5, W2/30
        n = re.sub(r'\s+', '', m).upper()
        if n and n not in seen:
            seen.add(n)
            result.append(n)
    return result


def extract_rules(state: BZOExtractionState) -> BZOExtractionState:
    """Detect rule candidates and parse values. Associates each rule with zones from its source article.

    Reads ``state["articles"]`` and writes ``state["parsed_rules"]``. Any
    exception is logged and appended to ``state["errors"]``; the state is
    returned in both cases.

    Phase 1 runs every pattern of every ``RULE_TAXONOMY`` entry over each
    article and records one candidate per match, together with a conditional
    phrase found within 100 characters around the match and the zone codes
    mentioned anywhere in the article.

    Phase 2 parses each candidate. For rule types whose ``value_type`` is
    "numeric" or "integer" it extracts a number from the matched text plus the
    200 characters starting at the match, and a unit only when it directly
    follows that number. Confidence is 0.5 without a number, 0.8 with a
    number and 0.9 with number and unit.
    """
    condition_patterns = (
        r'(?:nördlich|südlich|östlich|westlich|oberhalb|unterhalb)\s+[^,\.]+',
        r'(?:für|bei|in)\s+[^,\.]+',
    )
    # Pattern: "max. 4", "30 %", "min. 3.5 m"
    value_patterns = (
        r'(?:max|maximal|min|mindestens|höchstens)\s*\.?\s*(\d+(?:\.\d+)?)',
        r'(\d+(?:\.\d+)?)\s*(%|m|meter|metern|prozent)',
        r'(\d+(?:\.\d+)?)',
    )
    # A unit must directly follow the extracted number and must not be the start of a
    # longer token ("4 mal", "3 Monate", "100 m2").
    unit_after_value = re.compile(r'\s*(%|metern|meter|prozent|m)(?![a-zäöüß0-9²³])', re.IGNORECASE)
    try:
        # ----- Phase 1: candidate detection -----
        candidates = []
        for article_dict in state["articles"]:
            text = article_dict.get("text", "")
            page_start = article_dict.get("page_start", 0)
            # Zones mentioned in THIS article - rules from this article apply to these zones
            article_zones = _zones_in_text(text)
            for rule_type, rule_config in RULE_TAXONOMY.items():
                for pattern in rule_config.get("patterns", []):
                    for match in re.finditer(pattern, text, re.IGNORECASE):
                        start, end = max(0, match.start() - 100), min(len(text), match.end() + 100)
                        context = text[start:end]
                        condition_text = None
                        for cond_pat in condition_patterns:
                            cm = re.search(cond_pat, context, re.IGNORECASE)
                            if cm:
                                condition_text = cm.group(0)
                                break
                        candidates.append({
                            "rule_type": rule_type, "matched_text": match.group(0), "article_text": text,
                            "page": page_start, "article_label": article_dict.get("article_label"),
                            "condition_text": condition_text, "is_table_rule": False,
                            "table_zones": article_zones.copy(),
                            # Offset of THIS occurrence, so that phase 2 parses the right
                            # spot even when the same wording appears more than once.
                            "match_start": match.start(),
                        })

        # ----- Phase 2: value parsing -----
        parsed_rules = []
        for candidate_dict in candidates:
            rule_type = candidate_dict["rule_type"]
            rule_config = RULE_TAXONOMY.get(rule_type, {})
            value_type = rule_config.get("value_type", "numeric")

            matched_text = candidate_dict["matched_text"]
            article_text = candidate_dict["article_text"]
            match_start = candidate_dict["match_start"]
            # Matched text first, then the 200 characters starting at the match itself.
            text = matched_text + " " + article_text[match_start:match_start + 200]

            value_numeric = None
            value_text = matched_text
            unit = None

            # Try to extract numeric value
            if value_type in ["numeric", "integer"]:
                for pattern in value_patterns:
                    match = re.search(pattern, text, re.IGNORECASE)
                    if not match:
                        continue
                    # group(1) is always digits with an optional fraction, so float() cannot fail.
                    value_numeric = float(match.group(1))
                    if value_type == "integer":
                        value_numeric = int(value_numeric)

                    # Unit of this very number (not of some other number in the context)
                    unit_match = unit_after_value.match(text, match.end(1))
                    if unit_match:
                        unit = unit_match.group(1).lower()
                        if unit in ["meter", "metern"]:
                            unit = "m"
                        elif unit == "prozent":
                            unit = "%"
                    break

            # Calculate confidence
            confidence = 0.5
            if value_numeric is not None:
                confidence = 0.8
                if unit:
                    confidence = 0.9

            # Zone association from source article (zones mentioned in that article)
            article_zones = candidate_dict.get("table_zones", [])
            zone_raw = article_zones[0] if article_zones else None
            rule_scope = "zone" if zone_raw else "general"

            parsed_rule = {
                "rule_type": rule_type,
                "value_numeric": value_numeric,
                "value_text": value_text,
                "unit": unit,
                "condition_text": candidate_dict.get("condition_text"),
                "is_table_rule": candidate_dict.get("is_table_rule", False),
                "table_zones": article_zones,
                "page": candidate_dict["page"],
                "article_label": candidate_dict.get("article_label"),
                "text_snippet": value_text,
                "zone_raw": zone_raw,
                "rule_scope": rule_scope,
                "confidence": confidence
            }
            parsed_rules.append(parsed_rule)
        state["parsed_rules"] = parsed_rules
        return state
    except Exception as e:
        logger.error(f"Error in extract_rules: {e}", exc_info=True)
        state["errors"] = state.get("errors", []) + [f"Extract rules error: {str(e)}"]
        return state


# ===== Wohnzone Parameter Extraction =====

# Canonical order for BZO parameters (Fakten).
# All entries are lower-case and several spelling variants of the same parameter are
# listed next to each other (e.g. "ausnützungsziffer", "ausnutzungsziffer", "az").
BZO_PARAM_ORDER = [
    "vollgeschosse", "vollgeschoss",
    "anrechenbares untergeschoss", "untergeschoss",
    "anrechenbares dachgeschoss", "dachgeschoss", "attikageschoss",
    "ausnützungsziffer", "ausnutzungsziffer", "az",
    "überbauungsziffer",
    "gebäudehöhe", "fassadenhöhen",
    "grundabstand", "grenzabstand",
    "gebäudelänge",
    "mehrlängen", "höchstmass",
    "baumassenziffer", "grünflächenziffer", "wohnflächenanteil", "gebäudebreite",
]

# Display name for each rule type; _bzo_get_params_from_rules falls back to a
# title-cased version of the rule type for keys that are missing here.
RULE_TYPE_TO_PARAM: Dict[str, str] = {
    "max_building_height": "Gebäudehöhe max.",
    "max_floors": "Vollgeschosse max.",
    "max_attachable_attics": "anrechenbares Dachgeschoss max.",
    "max_attachable_basement": "anrechenbares Untergeschoss max.",
    "density": "Ausnützungsziffer",
    "building_coverage": "Überbauungsziffer",
    "building_mass_index": "Baumassenziffer (BMZ)",
    "green_space_index": "Grünflächenziffer (GFZ)",
    "boundary_distance": "Grundabstand min.",
    "boundary_distance_length_surcharge": "Mehrlängen-zuschlag (MLZ)",
    "boundary_distance_max": "Höchstmass Grenzabstand max.",
    "building_length": "Gebäudelänge max.",
    "building_width": "Gebäudebreite max.",
    "residential_area_share": "Wohnflächenanteil",
}

# Unit used for a rule type when the parsed rule itself carries no unit
# (empty string = no default unit).
RULE_TYPE_TO_DEFAULT_UNIT: Dict[str, str] = {
    "max_building_height": "m",
    "max_floors": "Stk.",
    "max_attachable_attics": "Stk.",
    "max_attachable_basement": "Stk.",
    "density": "%",
    "building_coverage": "%",
    "building_mass_index": "",
    "green_space_index": "%",
    "boundary_distance": "m",
    "boundary_distance_length_surcharge": "",
    "boundary_distance_max": "m",
    "building_length": "m",
    "building_width": "m",
    "residential_area_share": "%",
}

# Regex fragments for article keywords (written as patterns: note the \s+ in
# multi-word entries). Not referenced by the code visible in this part of the file.
_ARTIKEL_KEYWORDS = [
    r"herabsetzung", r"grenzabstand", r"nutzweise", r"wohnanteil",
    r"besondere\s+gebäude", r"überbauungsziffer", r"sonderregel",
    r"ausnahmen", r"abweichungen", r"erleichterungen",
    r"mischung", r"gewerbe", r"dienstleistung",
    r"kantonale", r"abstandsvorschriften",
    r"vollgeschoss", r"reduziert", r"mindestmass",
    r"störend", r"nicht\s+störend", r"mässig\s+störend",
]

# Artikel that are parameter tables - EXCLUDE from Weiterführende Bestimmungen.
# Matched as lower-case substrings of the article title; exclusion is checked
# before inclusion in _is_zusatzinfo_article.
_ZUSATZ_EXCLUDE_TITLES = ("zonen", "grundmasse", "mehrlängenzuschlag", "mehrlaengenzuschlag")

# Artikel that are substantive provisions - INCLUDE in Weiterführende Bestimmungen.
# Matched as lower-case substrings of the article title.
_ZUSATZ_INCLUDE_TITLES = (
    "herabsetzung", "nutzweise", "besondere", "besonderes",
    "ausnahmen", "abweichungen", "erleichterungen", "sonderregel",
    "wohnanteil", "nutzungsart", "abstandsvorschriften",
    "mischung", "gewerbe", "dienstleistung",
)


def _format_article_text_readable(text: str, article_label: str = "", article_title: str = "") -> str:
    """Format raw PDF-extracted text for readable display."""
    if not text or not text.strip():
        return ""
    # Strip redundant article header at start (e.g. "Art. 16 Nutzweise" when already in summary)
    if article_label or article_title:
        prefix = f"{article_label} {article_title}".strip()
        if prefix:
            pat = re.escape(prefix)
            text = re.sub(rf"^{pat}\s*", "", text.strip(), flags=re.I).lstrip()
    lines = []
    for line in text.split("\n"):
        line = line.strip()
        if not line:
            continue
        lines.append(line)
    if not lines:
        return ""
    # Join hyphenated word breaks (e.g. "Gewerbe-\nund" -> "Gewerbe und")
    merged = []
    i = 0
    while i < len(lines):
        line = lines[i]
        while line.rstrip().endswith("-") and i + 1 < len(lines):
            line = line.rstrip()[:-1] + lines[i + 1].strip()
            i += 1
        if re.match(r"^\d{1,2}\s*$", line) and i + 1 < len(lines):
            next_line = lines[i + 1]
            if not re.match(r"^Art\.\s", next_line) and len(next_line) > 3:
                line = line + " " + next_line.strip()
                i += 1
        elif re.match(r"^\d{1,2}\s*$", line) and i + 1 < len(lines) and re.match(r"^Art\.\s", lines[i + 1]):
            i += 1
            continue
        merged.append(line)
        i += 1
    combined = " ".join(merged)
    # Fix run-together paragraph numbers: "1In" -> "1. In", "2Ist" -> "2. Ist"
    combined = re.sub(r"(\d)([A-ZÄÖÜ])", r"\1. \2", combined)
    # Also fix "a)Something" -> "a) Something" for subparagraphs
    combined = re.sub(r"([a-z]\))([A-ZÄÖÜ])", r"\1 \2", combined)
    # Split into paragraphs: numbered (1. ..., 2. ...) or lettered (a) ..., b) ...)
    parts = re.split(r"(?=\d+\.\s+[A-ZÄÖÜa-zäöü])|(?=[a-z]\)\s+[A-ZÄÖÜa-zäöü])", combined)
    paragraphs = []
    for p in parts:
        p = p.strip()
        if not p or len(p) < 3:
            continue
        paragraphs.append(p)
    return "\n\n".join(paragraphs)


def _is_zusatzinfo_article(title: str) -> bool:
    """Decide whether an article belongs in "Weiterführende Bestimmungen".

    The title is lower-cased and stripped, then checked by substring: any
    entry of ``_ZUSATZ_EXCLUDE_TITLES`` rejects the article (exclusion wins),
    otherwise any entry of ``_ZUSATZ_INCLUDE_TITLES`` accepts it. Titles that
    match neither list are rejected.
    """
    normalized_title = (title or "").lower().strip()
    if any(marker in normalized_title for marker in _ZUSATZ_EXCLUDE_TITLES):
        return False
    return any(marker in normalized_title for marker in _ZUSATZ_INCLUDE_TITLES)


def _bzo_build_source(page: Optional[int], article: Optional[str]) -> str:
    """Build source string: Art. X, S. Y"""
    parts = []
    if article:
        parts.append(str(article))
    if page is not None and page > 0:
        parts.append(f"S. {page}")
    return ", ".join(parts) if parts else ""


def _bzo_zone_matches_table(bauzone: str, zone_col: str) -> bool:
    """Check if table column zone matches target bauzone."""
    b = (bauzone or "").upper().strip()
    z = (zone_col or "").upper().strip()
    if not b or not z:
        return False
    return b in z or (len(z) >= 2 and z in b)


def _bzo_article_mentions_bauzone(article_text: str, bauzone: str) -> bool:
    """Check if article text mentions the bauzone or applies to it."""
    if not bauzone or not article_text:
        return False
    b = bauzone.upper().strip()
    t = article_text.upper()
    if b in t:
        return True
    if len(b) >= 2 and b[0] in "WZIK" and re.search(rf"\b{b[0]}\s*\d+", t):
        base = re.sub(r"\s+", "", b.split("/")[0].rstrip("G"))
        if base in t or re.search(rf"\b{base}\b", t):
            return True
    return False


def _bzo_get_params_from_tables(
    zone_parameter_tables: List[Dict[str, Any]],
    bauzone: str
) -> List[Dict[str, Any]]:
    """Collect the parameter values that zone-parameter tables give for a Bauzone.

    Only tables with at least one column matching the bauzone (see
    ``_bzo_zone_matches_table``) are considered. For each parameter the first
    value entry of every matching zone column is emitted as
    ``{"parameter", "value", "unit", "source", "rule_type"}`` with
    ``rule_type`` set to None. Entries with an identical parameter/value/unit
    combination are emitted once.
    """
    collected = []
    emitted_keys = set()
    for table in zone_parameter_tables:
        columns = table.get("zones", [])
        if not any(_bzo_zone_matches_table(bauzone, str(col)) for col in columns):
            continue
        table_page = table.get("page")
        table_article = table.get("article")
        for param in table.get("parameters", []):
            for zone, zone_values in param.get("values_by_zone", {}).items():
                if not _bzo_zone_matches_table(bauzone, str(zone)):
                    continue
                if not isinstance(zone_values, list) or not zone_values:
                    continue
                # Only the first value recorded for the zone is used.
                first_entry = zone_values[0]
                value = first_entry.get("value", "")
                unit = first_entry.get("unit") or ""
                name = param.get("parameter", "")
                dedupe_key = f"{name}|{value}|{unit}"
                if dedupe_key in emitted_keys:
                    continue
                emitted_keys.add(dedupe_key)
                # The parameter's own article reference wins over the table's.
                source = _bzo_build_source(table_page, param.get("article") or table_article)
                collected.append({
                    "parameter": name,
                    "value": str(value),
                    "unit": str(unit).strip() if unit else "",
                    "source": source or "Tabelle im Dokument",
                    "rule_type": None,
                })
    return collected


def _bzo_filter_rules_by_bauzone(rules: List[Dict[str, Any]], bauzone: str) -> List[Dict[str, Any]]:
    """Keep the rules that reference the given Bauzone code.

    A rule is kept when the upper-cased code occurs as a substring of its
    ``zone_raw``, of any entry in ``table_zones``, or of its ``text_snippet``
    (checked in that order). An empty code is a substring of everything, so
    it keeps every rule.
    """
    needle = (bauzone or "").upper()

    def _references_zone(rule: Dict[str, Any]) -> bool:
        if needle in (rule.get("zone_raw") or "").upper():
            return True
        if any(needle in str(tz).upper() for tz in (rule.get("table_zones") or [])):
            return True
        return needle in (rule.get("text_snippet") or "").upper()

    return [rule for rule in rules if _references_zone(rule)]


def _bzo_get_params_from_rules(rules: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert parsed rules to {parameter, value, unit, source, rule_type} format.

    Args:
        rules: Parsed rule dicts; the keys read here are ``rule_type``,
            ``value_numeric``, ``value_text``, ``unit``, ``page`` and
            ``article_label``.

    Returns:
        One dict per distinct parameter/value/unit combination, in input
        order. Rules without a value, and rules whose value is merely a
        parameter label (e.g. "grenzabstand"), are dropped.
    """
    # Values that are really column/parameter labels, not measurements.
    label_like_values = (
        "gebäudelänge", "gebäudebreite", "mehrlängenzuschlag",
        "mehrlängen", "grenzabstand", "fassadenhöhe",
    )
    result = []
    seen = set()
    for r in rules:
        # ``or ""`` also covers an explicit ``None``, which would otherwise
        # break the ``.replace`` fallback below.
        rule_type = r.get("rule_type") or ""
        param_name = RULE_TYPE_TO_PARAM.get(rule_type) or rule_type.replace("_", " ").title()
        value_numeric = r.get("value_numeric")
        value_text = r.get("value_text", "")
        unit = r.get("unit") or ""

        if value_numeric is not None:
            # ``is_integer`` is False for inf/nan, whereas ``int()`` raises.
            if isinstance(value_numeric, float) and value_numeric.is_integer():
                val_str = str(int(value_numeric))
            else:
                val_str = str(value_numeric)
        else:
            val_str = str(value_text).strip() if value_text else ""

        if not val_str:
            continue
        if val_str.lower() in label_like_values:
            continue

        unit_str = str(unit).strip() if unit else (RULE_TYPE_TO_DEFAULT_UNIT.get(rule_type, ""))
        page = r.get("page")
        article = r.get("article_label")
        source = _bzo_build_source(page, article) or "Artikeltxt"

        key = f"{param_name}|{val_str}|{unit_str}"
        if key in seen:
            continue
        seen.add(key)
        result.append({
            "parameter": param_name,
            "value": val_str,
            "unit": unit_str,
            "source": source,
            "rule_type": rule_type,
        })
    return result


def _bzo_param_to_rule_type(param_name: str) -> Optional[str]:
    """Map a parameter display name to its rule_type.

    Matching is case-insensitive and evaluated in the order of the table
    below; the first hit wins. Long keywords match as substrings. The short
    abbreviations "az" and "uz" must appear as whole words, so that a bare
    "AZ" or "AZ:" is recognised while words that merely contain the letters
    (e.g. "Terrazzo") are not.

    Args:
        param_name: Display name of the parameter; ``None``/empty is allowed.

    Returns:
        The rule_type string, or ``None`` when no keyword matches.
    """
    p = (param_name or "").lower()
    # (substring keywords, whole-word abbreviations, rule_type)
    keyword_table = (
        (("vollgeschoss",), (), "max_floors"),
        (("dachgeschoss", "attika"), (), "max_attachable_attics"),
        (("untergeschoss",), (), "max_attachable_basement"),
        (("ausnützungsziffer", "ausnutzungsziffer"), ("az",), "density"),
        (("überbauungsziffer",), ("uz",), "building_coverage"),
        (("baumassenziffer", "bmz"), (), "building_mass_index"),
        (("grünflächen", "gfz"), (), "green_space_index"),
        (("grenzabstand", "grundabstand"), (), "boundary_distance"),
        (("mehrlängen", "mlz"), (), "boundary_distance_length_surcharge"),
        (("höchstmass",), (), "boundary_distance_max"),
        (("gebäudelänge",), (), "building_length"),
        (("gebäudebreite",), (), "building_width"),
        (("fassadenhöhe", "gebäudehöhe"), (), "max_building_height"),
        (("wohnflächenanteil", "wohnanteil"), (), "residential_area_share"),
    )
    for substrings, abbreviations, rule_type in keyword_table:
        if any(keyword in p for keyword in substrings):
            return rule_type
        # Word boundaries also match at the start/end of the name, which the
        # former space-padded check (" az ") could not.
        if any(re.search(rf"\b{re.escape(abbr)}\b", p) for abbr in abbreviations):
            return rule_type
    return None


def _bzo_merge_rules(
    from_tables: List[Dict[str, Any]],
    from_rules: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Combine table-derived and rule-derived parameters, tables first.

    Entries are keyed by their lower-cased ``parameter`` name and the first
    entry seen for a name wins, so a table value shadows a rule value of the
    same name. Entries without a parameter name are dropped. Table entries
    lacking a ``rule_type`` get one derived from the parameter name. The
    returned dicts are shallow copies of the inputs.
    """
    merged: Dict[str, Dict[str, Any]] = {}

    for entry in from_tables:
        name_key = (entry.get("parameter") or "").lower()
        if not name_key or name_key in merged:
            continue
        table_entry = entry.copy()
        if not table_entry.get("rule_type"):
            table_entry["rule_type"] = _bzo_param_to_rule_type(table_entry.get("parameter", ""))
        merged[name_key] = table_entry

    for entry in from_rules:
        name_key = (entry.get("parameter") or "").lower()
        if not name_key or name_key in merged:
            continue
        merged[name_key] = entry.copy()

    return list(merged.values())


def _bzo_param_sort_key(param_name: str) -> int:
    """Return the display rank of a parameter according to BZO_PARAM_ORDER.

    The rank is the index of the first BZO_PARAM_ORDER keyword contained in
    the lower-cased parameter name; names matching no keyword get 99.
    """
    lowered = (param_name or "").lower()
    return next(
        (rank for rank, keyword in enumerate(BZO_PARAM_ORDER) if keyword in lowered),
        99,
    )


def _bzo_extract_zusatzinformationen(
    articles: List[Dict[str, Any]],
    bauzone: str = "",
    zone_parameter_tables: Optional[List[Dict[str, Any]]] = None,
) -> List[Dict[str, Any]]:
    """Pick the article excerpts that serve as additional info for a Bauzone.

    An article qualifies when it has a label and text, its title or text hits
    one of the ``_ARTIKEL_KEYWORDS`` patterns and, if a bauzone is given, it
    either mentions that zone or is the article of a zone-parameter table
    that covers the zone. Each label/page pair is emitted at most once.

    Returns:
        Dicts with ``article_label``, ``article_title``, ``text`` (cut to
        3500 characters), ``page`` and ``source``, sorted by page and label.
    """
    keyword_patterns = [re.compile(kw, re.IGNORECASE) for kw in _ARTIKEL_KEYWORDS]

    # Articles that own a parameter table covering the requested zone.
    articles_with_zone_table = set()
    if zone_parameter_tables and bauzone:
        for table in zone_parameter_tables:
            if any(_bzo_zone_matches_table(bauzone, str(z)) for z in table.get("zones", [])):
                articles_with_zone_table.add(table.get("article") or "")

    excerpts = []
    handled_keys = set()
    for art in articles:
        label = art.get("article_label") or ""
        title = (art.get("article_title") or "").strip()
        text = (art.get("text") or "").strip()
        page = art.get("page_start") or art.get("page_end") or 0
        if not (label and text):
            continue

        dedup_key = f"{label}|{page}"
        if dedup_key in handled_keys:
            continue

        haystack = f"{title} {text}"
        if not any(pattern.search(haystack) for pattern in keyword_patterns):
            continue
        if (
            bauzone
            and not _bzo_article_mentions_bauzone(haystack, bauzone)
            and label not in articles_with_zone_table
        ):
            continue

        handled_keys.add(dedup_key)
        excerpts.append({
            "article_label": label,
            "article_title": title,
            "text": text[:3500].strip(),
            "page": page,
            "source": _bzo_build_source(page, label) or "BZO-Dokument",
        })

    excerpts.sort(key=lambda item: (item.get("page", 0), item.get("article_label", "")))
    return excerpts


def extract_wohnzone_params(
    extracted_content: Dict[str, Any],
    bauzone: str,
    relevant_rules: Optional[List[Dict[str, Any]]] = None,
    total_area_m2: Optional[float] = None,
) -> Dict[str, Any]:
    """
    Build the fact list and additional information for a Wohnzone.

    Args:
        extracted_content: Extraction output; the keys ``articles``,
            ``zone_parameter_tables`` and ``rules`` are read.
        bauzone: Zone code to evaluate.
        relevant_rules: Pre-filtered rules; when ``None`` the rules in
            ``extracted_content`` are filtered by ``bauzone``.
        total_area_m2: Parcel area; listed as a fact when positive.

    Returns:
        Dict with ``bauzone``, ``fakten`` (ordered item/value/source dicts)
        and ``zusatzinformationen``.
    """
    articles = extracted_content.get("articles", [])
    zone_parameter_tables = extracted_content.get("zone_parameter_tables", [])
    all_rules = extracted_content.get("rules", [])

    if relevant_rules is not None:
        rules_to_use = relevant_rules
    else:
        rules_to_use = _bzo_filter_rules_by_bauzone(all_rules, bauzone)

    table_params = _bzo_get_params_from_tables(zone_parameter_tables, bauzone)
    rule_params = _bzo_get_params_from_rules(rules_to_use)
    merged_params = _bzo_merge_rules(table_params, rule_params)

    fakten = []
    if bauzone:
        fakten.append({"item": "Auswertung für Bauzone", "value": bauzone, "source": ""})
    if total_area_m2 is not None and total_area_m2 > 0:
        # Thousands separator is switched from "," to the apostrophe.
        area_text = f"{total_area_m2:,.0f} m²".replace(",", "'")
        fakten.append({
            "item": "Grundstücksfläche",
            "value": area_text,
            "source": "Parzellendaten",
        })

    ordered_params = sorted(
        merged_params,
        key=lambda entry: _bzo_param_sort_key(entry.get("parameter", "")),
    )
    for entry in ordered_params:
        name = entry.get("parameter", "").strip()
        raw_value = entry.get("value", "")
        unit = (entry.get("unit") or "").strip()
        rule_type = entry.get("rule_type") or _bzo_param_to_rule_type(name)
        # Fall back to the rule type's default unit when none was extracted.
        if rule_type and not unit:
            unit = RULE_TYPE_TO_DEFAULT_UNIT.get(rule_type, "")
        if unit:
            display_value = f"{raw_value} {unit}".strip()
        else:
            display_value = f"{raw_value}".strip()
        if not (name and display_value):
            continue
        fakten.append({
            "item": name,
            "value": display_value,
            "source": entry.get("source") or "BZO-Dokument",
        })

    zusatzinformationen = _bzo_extract_zusatzinformationen(
        articles, bauzone, zone_parameter_tables
    )
    return {
        "bauzone": bauzone,
        "fakten": fakten,
        "zusatzinformationen": zusatzinformationen,
    }


# ===== LLM-based BZO Params Extraction =====

def _build_bauzone_context_for_llm(state: BZOParamsExtractionState) -> str:
    """Build context string for LLM from extracted BZO content.

    The context consists of, in this order: the parcel area (when positive),
    the full text of all ``relevant_articles``, the pre-parsed table values
    whose zone key contains the bauzone, and up to 20 ``relevant_rules``.

    Args:
        state: Extraction state; the keys ``bauzone``,
            ``zone_parameter_tables``, ``relevant_articles``,
            ``relevant_rules`` and ``total_area_m2`` are read.

    Returns:
        The sections joined by newlines.
    """
    bauzone = (state.get("bauzone") or "").upper()
    zone_parameter_tables = state.get("zone_parameter_tables", [])
    relevant_articles = state.get("relevant_articles", [])
    relevant_rules = state.get("relevant_rules", [])
    total_area_m2 = state.get("total_area_m2")
    parts: List[str] = []

    if total_area_m2 is not None and total_area_m2 > 0:
        parts.append(f"Grundstücksfläche der Parzelle: {total_area_m2:,.0f} m²".replace(",", "'"))
        parts.append("")

    # Full article texts - LLM can parse tables like Art. 14 (zones in rows, values in columns)
    parts.append("=== ARTIKEL MIT VOLLEM TEXT (Tabellen genau lesen, richtige Spalte/Zeile für Bauzone wählen) ===")
    for art in relevant_articles:
        label = art.get("article_label", "")
        title = (art.get("article_title") or "").strip()
        text = art.get("text", "")
        page = art.get("page_start") or art.get("page_end", 0)
        parts.append(f"\n{label}: {title}")
        parts.append(f"Seite: {page}")
        parts.append(f"Inhalt:\n{text}")
    parts.append("")

    # Zone-parameter tables (pre-parsed)
    if zone_parameter_tables:
        parts.append("=== VORSTRUKTURIERTE TABELLENWERTE FÜR BAUZONE ===")
        for table in zone_parameter_tables:
            page = table.get("page", 0)
            art = table.get("article", "")
            parts.append(f"\n{art} (S. {page}):")
            for param in table.get("parameters", []):
                pname = param.get("parameter", "")
                for zone, values in (param.get("values_by_zone") or {}).items():
                    # str() keeps non-string zone keys from breaking .upper().
                    if bauzone not in str(zone or "").upper():
                        continue
                    if not (isinstance(values, list) and values):
                        continue
                    v = values[0].get("value", "")
                    u = values[0].get("unit") or ""
                    # rstrip (not strip): drop the trailing blank of an empty
                    # unit but keep the leading indentation under the header.
                    parts.append(f"  {pname} [{zone}]: {v} {u}".rstrip())
    parts.append("")

    # Rules from text
    if relevant_rules:
        parts.append("=== REGELN AUS ARTIKELTEXT ===")
        for r in relevant_rules[:20]:
            rt = r.get("rule_type", "")
            vn = r.get("value_numeric")
            # ``or ""`` keeps a stored None from showing up as "None".
            u = r.get("unit") or ""
            page = r.get("page", 0)
            art = r.get("article_label", "")
            if isinstance(vn, float) and vn.is_integer():
                # is_integer() is False for inf/nan, where int() would raise.
                val = str(int(vn))
            elif vn is not None:
                val = str(vn)
            else:
                val = r.get("value_text") or ""
            value_part = f"{val} {u}".strip()
            parts.append(f"  {rt}: {value_part} ({art}, S. {page})")

    return "\n".join(parts)


def _parse_llm_bullet_list(text: str) -> List[Dict[str, str]]:
    """Parse LLM response into fakten list. Expects lines like '- Param: value (Art. X, S. Y)'.

    Only lines starting with "-" are considered. A trailing parenthesised
    group is taken as the source; everything between the first colon and
    that group is the value.

    Args:
        text: Raw LLM response; ``None``/empty yields an empty list.

    Returns:
        Dicts with ``item``, ``value`` and ``source`` (``""`` when the line
        has no trailing parenthesised source), in input order. Bullets with
        an empty item or an empty value are skipped.
    """
    # "Param: value (source)" or "Param: value"
    bullet_pattern = re.compile(r"^(.+?):\s*(.+?)(?:\s*\(([^)]+)\))?\s*$")

    fakten: List[Dict[str, str]] = []
    for raw_line in (text or "").strip().split("\n"):
        line = raw_line.strip()
        if not line.startswith("-"):
            continue
        line = line.lstrip("- ").strip()
        match = bullet_pattern.match(line)
        if match is None:
            # On a stripped line the pattern only fails when nothing stands
            # before the first colon or nothing follows it, i.e. the bullet
            # has no item or no value; such bullets carry no fact.
            continue
        item = match.group(1).strip()
        value = match.group(2).strip()
        source = (match.group(3) or "").strip()
        if item and value:
            fakten.append({"item": item, "value": value, "source": source})
    return fakten


async def _llm_filter_relevant_provisions(
    ai_service: Any,
    bauzone: str,
    fakten: List[Dict[str, str]],
    provision_articles: List[Dict[str, Any]],
) -> Optional[set]:
    """
    Use LLM to determine which provision articles are relevant for a parcel in this bauzone.

    Labels found in the LLM answer are matched against the labels of
    ``provision_articles`` ignoring case and whitespace, and are returned in
    the whitespace-collapsed spelling of the matching article, so that an
    answer such as "Art.15" or "art. 15" still selects "Art. 15". Labels that
    match no provision article are returned whitespace-collapsed as given.

    Args:
        ai_service: Object exposing an awaitable
            ``callAiPlanning(prompt=..., debugType=...)``.
        bauzone: Zone code the parcel lies in.
        fakten: Already extracted facts (item/value); the "Auswertung"
            header item is left out of the prompt.
        provision_articles: Candidate articles (``article_label``,
            ``article_title``).

    Returns:
        Set of article labels (e.g. {"Art. 15", "Art. 16"}); an empty set
        when there are no candidates; ``None`` (meaning "include all") when
        the call fails or the answer contains no label.
    """
    if not provision_articles:
        return set()

    def _collapse(label: Any) -> str:
        # Same normalisation a caller applies before the membership test.
        return re.sub(r"\s+", " ", str(label or "").strip())

    def _canonical(label: str) -> str:
        # Comparison key: no whitespace, case-insensitive.
        return re.sub(r"\s+", "", label).lower()

    known_labels: Dict[str, str] = {}
    for article in provision_articles:
        collapsed = _collapse(article.get("article_label"))
        if collapsed:
            known_labels.setdefault(_canonical(collapsed), collapsed)

    fakten_str = "\n".join(
        f"- {f.get('item', '')}: {f.get('value', '')}" for f in fakten
        if f.get("item") and "Auswertung" not in (f.get("item") or "")
    )
    articles_str = "\n".join(
        f"- {a.get('article_label', '')}: {a.get('article_title', '')}"
        for a in provision_articles
        if a.get("article_label")
    )
    prompt = f"""Du bist Experte für Schweizer Bau- und Zonenordnungen (BZO).

Eine Parzelle liegt in der Bauzone {bauzone}. Folgende BZO-Parameter gelten für diese Zone:
{fakten_str}

Folgende Bestimmungen (Weiterführende Artikel) könnten zutreffen:
{articles_str}

AUFGABE: Welche dieser Artikel sind für eine Parzelle in Bauzone {bauzone} mit diesen Parametern TATSÄCHLICH RELEVANT?
- Nur Artikel angeben, die auf diese Zone/Parameter Bezug nehmen oder Bedingungen nennen, die hier greifen
- z.B. Art. 15 Herabsetzung: relevant wenn Vollgeschosse und Grenzabstand vorhanden (Reduktion bei weggelassenen Geschossen)
- z.B. Art. 16 Nutzweise: relevant für Wohnzonen mit Wohnanteil
- z.B. Art. 40 Wohnanteil: nur wenn dieser Artikel die Zone {bauzone} erwähnt oder für Wohnzonen gilt
- Artikel die andere Zonen betreffen (z.B. nur Z5, I) und {bauzone} ausschliessen: NICHT aufnehmen

Antwort NUR mit den relevanten Artikelnummern, eine pro Zeile (z.B. "Art. 15", "Art. 16"). Keine anderen Zeichen."""

    try:
        response = await ai_service.callAiPlanning(
            prompt=prompt,
            debugType="bzo_relevant_provisions",
        )
        labels = set()
        # finditer: the model may put several labels on one line, as the
        # example in the prompt itself does.
        for m in re.finditer(r"Art\.\s*\d+[a-z]?", (response or ""), re.I):
            found = _collapse(m.group(0))
            labels.add(known_labels.get(_canonical(found), found))
        return labels if labels else None  # None = include all (fallback on error or empty)
    except Exception as e:
        logger.warning(f"LLM provision filter failed: {e}")
        return None


async def llm_extract_bauzone_params_node(state: BZOParamsExtractionState) -> BZOParamsExtractionState:
    """Ask the LLM for the Bauzone's BZO parameters and attach them to the state.

    Steps: build the context and prompt, parse the bullet-list answer into
    ``fakten``, derive ``bauzone_params_list`` from the parsed facts, add the
    zone/area header facts when missing, then let a second LLM call select
    which provision articles become ``zusatzinformationen``.

    Returns:
        A copy of ``state`` with ``fakten``, ``bauzone_params_list``,
        ``zusatzinformationen`` and ``errors`` set. Without an AI service an
        error is recorded and the fact lists are empty; any exception during
        processing is logged, recorded in ``errors`` and yields empty
        outputs.
    """
    bauzone = state.get("bauzone", "")
    gemeinde = state.get("gemeinde", "")
    ai_service = state.get("ai_service")
    errors = [*state.get("errors", [])]

    if not ai_service:
        errors.append("AI service not provided")
        return {**state, "fakten": [], "bauzone_params_list": [], "errors": errors}

    context = _build_bauzone_context_for_llm(state)

    prompt = f"""Du bist Experte für Schweizer Bau- und Zonenordnungen (BZO). Extrahiere alle relevanten BZO-Parameter für die Bauzone {bauzone} in {gemeinde}.

BZO-INHALT:
{context}

AUFGABE: Erstelle eine geordnete Bullet-Liste ALLER zutreffenden Parameter für Bauzone {bauzone}.
Priorität: Vollgeschosse, anrechenbares Untergeschoss, anrechenbares Dachgeschoss, Ausnützungsziffer, Überbauungsziffer, Gebäudehöhe, Grundabstand/Grenzabstand, Gebäudelänge, Mehrlängenzuschlag, Höchstmass, sowie alle anderen Bestimmungen die für diese Zone gelten.

WICHTIG:
- Bei Tabellen: die richtige Spalte/Zeile für {bauzone} verwenden (z.B. Art. 14 Mehrlängenzuschlag: W5 = 13 m)
- Jede Zeile: "- Parametername: Wert (Art. X, S. Y)"
- Nur tatsächlich im Dokument vorhandene Werte angeben
- Einheit (m, %, Stk.) bei Zahlen mit angeben
- Keine leeren Zeilen oder Kommentare - nur die Liste

Antwort NUR mit der Bullet-Liste, sonst nichts:"""

    try:
        llm_reply = await ai_service.callAiPlanning(
            prompt=prompt,
            debugType="bzo_params_extraction",
        )
        fakten = _parse_llm_bullet_list((llm_reply or "").strip())

        # Bullet strings are rebuilt from the parsed facts, before the
        # header facts below are inserted.
        bauzone_params_list = []
        for fakt in fakten:
            bullet = f"- {fakt['item']}: {fakt['value']}"
            if fakt.get("source"):
                bullet += f" ({fakt['source']})"
            bauzone_params_list.append(bullet)

        # Header facts, only when the LLM did not already provide them.
        if bauzone:
            header_present = any("Auswertung" in (f.get("item") or "") for f in fakten)
            if not header_present:
                fakten.insert(0, {"item": "Auswertung für Bauzone", "value": bauzone, "source": ""})
        total_area_m2 = state.get("total_area_m2")
        if total_area_m2 is not None and total_area_m2 > 0:
            area_present = any("Grundstücksfläche" in (f.get("item") or "") for f in fakten)
            if not area_present:
                fakten.insert(1, {
                    "item": "Grundstücksfläche",
                    "value": f"{total_area_m2:,.0f} m²".replace(",", "'"),
                    "source": "Parzellendaten",
                })

        # Zusatzinformationen: only provisions RELEVANT for this parcel in this bauzone
        all_articles = state.get("extracted_content", {}).get("articles", []) or state.get("relevant_articles", [])
        provision_articles = [
            candidate for candidate in all_articles
            if _is_zusatzinfo_article((candidate.get("article_title") or "").strip())
        ]
        relevant_labels = await _llm_filter_relevant_provisions(
            ai_service=ai_service,
            bauzone=bauzone,
            fakten=fakten,
            provision_articles=provision_articles,
        )

        zusatzinformationen = []
        for art in provision_articles:
            label = art.get("article_label", "")
            title = (art.get("article_title") or "").strip()
            normalized_label = re.sub(r"\s+", " ", (label or "").strip())
            # relevant_labels None means "no filtering"; unlabeled articles
            # are never filtered out.
            if (
                relevant_labels is not None
                and normalized_label
                and normalized_label not in relevant_labels
            ):
                continue
            readable_text = _format_article_text_readable(
                (art.get("text") or "")[:4000],
                article_label=label,
                article_title=title,
            )
            if not readable_text:
                continue
            page = art.get("page_start") or art.get("page_end", 0)
            zusatzinformationen.append({
                "article_label": label,
                "article_title": title,
                "text": readable_text,
                "source": f"{label}, S. {page}" if label else f"S. {page}",
            })

        return {
            **state,
            "fakten": fakten,
            "bauzone_params_list": bauzone_params_list,
            "zusatzinformationen": zusatzinformationen,
            "errors": errors,
        }
    except Exception as e:
        logger.error(f"LLM BZO params extraction failed: {e}", exc_info=True)
        errors.append(str(e))
        return {
            **state,
            "fakten": [],
            "bauzone_params_list": [],
            "zusatzinformationen": [],
            "errors": errors,
        }


def _filter_articles_by_bauzone(articles: List[Dict[str, Any]], bauzone: str) -> List[Dict[str, Any]]:
    """Return the articles whose text or ``zone_raw`` contains the Bauzone code.

    The comparison is a case-insensitive substring test; an empty code
    therefore keeps every article.
    """
    needle = (bauzone or "").upper()
    matching = []
    for article in articles:
        in_text = needle in (article.get("text") or "").upper()
        if in_text or needle in (article.get("zone_raw") or "").upper():
            matching.append(article)
    return matching


def _filter_tables_by_bauzone(tables: List[Dict[str, Any]], bauzone: str) -> List[Dict[str, Any]]:
    """Reduce zone-parameter tables to the columns of one Bauzone.

    A table is kept when at least one of its ``zones`` contains the
    upper-cased code as a substring and at least one parameter has a value
    column for such a zone. Kept tables are rebuilt with only ``page``,
    ``article``, the matching ``zones`` and the trimmed ``parameters``.
    """
    needle = (bauzone or "").upper()

    def _is_match(zone: Any) -> bool:
        return needle in str(zone).upper()

    reduced_tables = []
    for table in tables:
        matching_zones = [z for z in table.get("zones", []) if _is_match(z)]
        if not matching_zones:
            continue

        kept_params = []
        for param in table.get("parameters", []):
            by_zone = param.get("values_by_zone") or {}
            if not any(_is_match(z) for z in by_zone):
                continue
            zone_values = {z: v for z, v in by_zone.items() if _is_match(z)}
            if zone_values:
                kept_params.append({
                    "parameter": param.get("parameter"),
                    "values_by_zone": zone_values,
                })

        if kept_params:
            reduced_tables.append({
                "page": table.get("page"),
                "article": table.get("article"),
                "zones": matching_zones,
                "parameters": kept_params,
            })
    return reduced_tables


async def run_bzo_params_extraction(
    extracted_content: Dict[str, Any],
    bauzone: str,
    ai_service: Any,
    gemeinde: str,
    relevant_rules: Optional[List[Dict[str, Any]]] = None,
    relevant_articles: Optional[List[Dict[str, Any]]] = None,
    total_area_m2: Optional[float] = None,
) -> Dict[str, Any]:
    """
    Run the LLM-based parameter extraction for one Bauzone.

    Rules and articles are taken from the arguments when given; otherwise
    they are filtered out of ``extracted_content`` by ``bauzone``. The
    zone-parameter tables are always filtered from ``extracted_content``.

    Returns:
        Dict with ``bauzone``, ``fakten`` (item/value/source),
        ``bauzone_params_list`` (bullet strings), ``zusatzinformationen``
        and ``errors``.
    """
    if relevant_rules is not None:
        rules = relevant_rules
    else:
        rules = _bzo_filter_rules_by_bauzone(extracted_content.get("rules", []), bauzone)

    if relevant_articles is not None:
        articles = relevant_articles
    else:
        articles = _filter_articles_by_bauzone(extracted_content.get("articles", []), bauzone)

    tables = _filter_tables_by_bauzone(
        extracted_content.get("zone_parameter_tables", []),
        bauzone,
    )

    initial_state: BZOParamsExtractionState = {
        "extracted_content": extracted_content,
        "bauzone": bauzone,
        "total_area_m2": total_area_m2,
        "relevant_rules": rules,
        "relevant_articles": articles,
        "zone_parameter_tables": tables,
        "ai_service": ai_service,
        "gemeinde": gemeinde,
        "bauzone_params_list": [],
        "fakten": [],
        "zusatzinformationen": [],
        "errors": [],
    }

    final_state = await llm_extract_bauzone_params_node(initial_state)

    output: Dict[str, Any] = {"bauzone": bauzone}
    for output_key in ("fakten", "bauzone_params_list", "zusatzinformationen", "errors"):
        output[output_key] = final_state.get(output_key, [])
    return output


# ===== Pipeline Execution =====

def _run_bzo_extraction_pipeline(state: BZOExtractionState) -> BZOExtractionState:
    """Feed the state through the extraction steps, one after another.

    Each step receives the state returned by the previous one; the state
    returned by the last step is the result.
    """
    pipeline_steps = (
        classify_and_assemble,
        extract_zones_and_tables,
        extract_rules,
    )
    for step in pipeline_steps:
        state = step(state)
    return state


def run_extraction(
    pdf_bytes: bytes,
    pdf_id: Optional[str] = None,
    dokument_id: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Run the extraction pipeline on a PDF and return structured, sorted results.

    Args:
        pdf_bytes: PDF file content as bytes
        pdf_id: Optional identifier for the PDF (defaults to generated ID)
        dokument_id: Optional dokument ID for reference

    Returns:
        Dictionary with extracted and sorted content:
        {
            "articles": [...],  # Sorted by page_start, then article_label
            "zones": [...],     # Sorted by zone_code
            "rules": [...],     # Sorted by rule_type, then page
            "zone_parameter_tables": [...],  # In pipeline order (unsorted)
            "errors": [...],
            "warnings": [...]
        }
    """
    if not pdf_id:
        pdf_id = f"pdf_{uuid.uuid4().hex[:8]}"

    # Initialize state
    state: BZOExtractionState = {
        "dokument_id": dokument_id,
        "pdf_id": pdf_id,
        "text_blocks": [],
        "classified_blocks": [],
        "articles": [],
        "current_zones": {},
        "zones": [],
        "rule_candidates": [],
        "parsed_rules": [],
        "zone_parameter_tables": [],
        "errors": [],
        "warnings": []
    }

    # Extract PDF text first
    pdf_extractor = BZOPdfExtractor()
    text_blocks_objects = pdf_extractor.extract_text_blocks(pdf_bytes, state["pdf_id"])
    # Convert TextBlock objects to dicts for state
    state["text_blocks"] = [
        {
            "page": tb.page,
            "text": tb.text,
            "block_id": tb.block_id,
            "bbox": tb.bbox
        }
        for tb in text_blocks_objects
    ]

    # Run the extraction pipeline
    final_state = _run_bzo_extraction_pipeline(state)

    # Sort and structure results. The ``or`` fallbacks keep the sort from
    # raising TypeError when a sort field is present but None.
    articles = sorted(
        final_state.get("articles", []),
        key=lambda x: (x.get("page_start") or 0, x.get("article_label") or "")
    )

    zones = sorted(
        final_state.get("zones", []),
        key=lambda x: x.get("zone_code") or ""
    )

    rules = sorted(
        final_state.get("parsed_rules", []),
        key=lambda x: (x.get("rule_type") or "", x.get("page") or 0)
    )

    zone_parameter_tables = final_state.get("zone_parameter_tables", [])

    return {
        "articles": articles,
        "zones": zones,
        "rules": rules,
        "zone_parameter_tables": zone_parameter_tables,
        "errors": final_state.get("errors", []),
        "warnings": final_state.get("warnings", [])
    }


def extract_from_documents(
    document_retriever,
    dokument_ids: List[str]
) -> Dict[str, Any]:
    """
    Run the BZO extraction for a list of documents and summarise the outcome.

    Args:
        document_retriever: Object providing ``get_documents_by_ids(ids)``
            and ``retrieve_pdf_content(dokument)``
        dokument_ids: List of dokument IDs to process

    Returns:
        Dictionary with results per document:
        {
            "results": [
                {
                    "dokument_id": "...",
                    "articles": [...],
                    "zones": [...],
                    "rules": [...],
                    "errors": [...],
                    "warnings": [...]
                },
                ...
            ],
            "summary": {
                "total_documents": N,   # number of requested IDs
                "successful": M,        # extractions without errors
                "failed": K,            # no PDF, exception, or errors reported
                "total_articles": X,
                "total_zones": Y,
                "total_rules": Z
            }
        }
    """

    def _failure_record(dok_id, message: str) -> Dict[str, Any]:
        # Result entry for a document that yielded no content.
        return {
            "dokument_id": dok_id,
            "articles": [],
            "zones": [],
            "rules": [],
            "errors": [message],
            "warnings": []
        }

    results = []
    totals = {"articles": 0, "zones": 0, "rules": 0}
    successful = 0
    failed = 0

    # Retrieve documents
    dokumente = document_retriever.get_documents_by_ids(dokument_ids)

    for dokument in dokumente:
        try:
            pdf_bytes = document_retriever.retrieve_pdf_content(dokument)
            if not pdf_bytes:
                logger.warning(f"Could not retrieve PDF for dokument {dokument.id}")
                results.append(_failure_record(dokument.id, "Could not retrieve PDF content"))
                failed += 1
                continue

            extraction_result = run_extraction(
                pdf_bytes=pdf_bytes,
                pdf_id=dokument.dokumentReferenz or f"dok_{dokument.id}",
                dokument_id=dokument.id
            )
            extraction_result["dokument_id"] = dokument.id
            results.append(extraction_result)

            # Update counters
            totals["articles"] += len(extraction_result.get("articles", []))
            totals["zones"] += len(extraction_result.get("zones", []))
            totals["rules"] += len(extraction_result.get("rules", []))

            # An extraction that reported errors counts as failed.
            if extraction_result.get("errors"):
                failed += 1
            else:
                successful += 1

        except Exception as e:
            logger.error(f"Error processing dokument {dokument.id}: {e!s}", exc_info=True)
            results.append(_failure_record(dokument.id, f"Processing error: {e!s}"))
            failed += 1

    summary = {
        "total_documents": len(dokument_ids),
        "successful": successful,
        "failed": failed,
        "total_articles": totals["articles"],
        "total_zones": totals["zones"],
        "total_rules": totals["rules"]
    }
    return {"results": results, "summary": summary}