# gateway/modules/features/realEstate/bzoExtractionLangGraph.py

"""
LangGraph-based pipeline for extracting structured content from BZO PDFs.
"""
import logging
import re
from typing import TypedDict, List, Dict, Any, Optional
from dataclasses import dataclass
from langgraph.graph import StateGraph, START, END
from modules.features.realEstate.bzoPdfExtractor import BZOPdfExtractor, TextBlock
from modules.features.realEstate.bzoRuleTaxonomy import RULE_TAXONOMY
logger = logging.getLogger(__name__)
# ===== State Definition =====
@dataclass
class ClassifiedBlock:
    """A raw text block paired with its structural classification."""
    block: TextBlock
    block_type: str  # one of: "article", "heading", "table", "other"
    article_label: Optional[str] = None  # e.g. "Art. 5" when block_type == "article"
    article_title: Optional[str] = None  # title text following the article label, if detected
@dataclass
class Article:
    """An assembled article: contiguous text blocks merged under one label."""
    article_label: str  # e.g. "Art. 5"
    article_title: Optional[str]  # title detected next to the label, if any
    text: str  # concatenated text of all blocks belonging to the article
    page_start: int  # first page the article appears on
    page_end: int  # last page the article appears on
    section_level_1: Optional[str] = None  # e.g. "A." (top-level section heading)
    section_level_2: Optional[str] = None  # e.g. "I." (Roman-numeral section)
    section_level_3: Optional[str] = None  # e.g. "1." (numbered section)
    zone_raw: Optional[str] = None  # raw zone string, if attributed later
@dataclass
class ZoneInfo:
    """Information about a building zone parsed from a zone code."""
    zone_code: str  # e.g. "W2", "W3/50", "Z3"
    zone_name: str  # display name, e.g. "Zone W2"
    zone_category: Optional[str] = None  # e.g. "Wohnzonen", "Zentrumszonen", "Arbeitsplatzzonen"
    zone_subcategory: Optional[str] = None  # finer category, if known
    empfindlichkeitsstufe: Optional[str] = None  # noise sensitivity level, if known
    geschosszahl: Optional[int] = None  # storey count encoded in the code (e.g. W2 -> 2)
    gewerbeerleichterung: bool = False  # True for zone codes with a trailing "G"
@dataclass
class RuleCandidate:
    """Rule candidate produced by pattern matching over an article's text."""
    rule_type: str  # key into RULE_TAXONOMY
    matched_text: str  # the text the taxonomy pattern matched
    article_text: str  # full text of the source article
    page: int  # page of the source article
    is_table_rule: bool = False  # True when the candidate came from a table
    # Fix: was annotated `List[str] = None`, which is a type lie. The default
    # stays None (callers treat missing as "no zones"); only the annotation
    # is corrected to Optional.
    table_zones: Optional[List[str]] = None
    condition_text: Optional[str] = None  # qualifying condition near the match, if any
@dataclass
class ParsedRule:
    """A rule candidate with parsed value, unit, scope and confidence."""
    rule_type: str  # key into RULE_TAXONOMY
    value_numeric: Optional[float]  # parsed numeric value (int for integer types), if any
    value_text: str  # original matched text
    unit: Optional[str]  # normalized unit ("m", "%") or None
    condition_text: Optional[str]  # qualifying condition found near the match, if any
    is_table_rule: bool  # True when the rule came from a table
    table_zones: List[str]  # zones a table rule applies to
    page: int  # page of the source article
    text_snippet: str  # snippet used for the value
    zone_raw: Optional[str] = None  # raw zone code the rule was attributed to
    rule_scope: str = "general"  # "zone" when attributed to a zone, else "general"
    confidence: float = 0.5  # 0.5 base, 0.8 with value, 0.9 with value + unit
class BZOExtractionState(TypedDict):
    """Mutable state threaded through the BZO extraction graph.

    All nested values are plain dicts/lists so the state stays serializable.
    """
    # Input metadata
    dokument_id: Optional[str]
    pdf_id: str
    # Raw text blocks from the PDF, as dicts: page/text/block_id/bbox
    text_blocks: List[Dict[str, Any]]
    # Blocks tagged article/heading/table/other (see classify_text_block)
    classified_blocks: List[Dict[str, Any]]
    # Assembled articles with section context (see assemble_articles)
    articles: List[Dict[str, Any]]
    # Zone tracking: last-seen zone info keyed by zone code, plus all detections
    current_zones: Dict[str, Dict[str, Any]]
    zones: List[Dict[str, Any]]
    # Rule extraction: raw pattern hits and their parsed forms
    rule_candidates: List[Dict[str, Any]]
    parsed_rules: List[Dict[str, Any]]
    # Zone-parameter tables (structured table data mapping zones to parameters)
    zone_parameter_tables: List[Dict[str, Any]]
    # Accumulated processing diagnostics
    errors: List[str]
    warnings: List[str]
# ===== Node Implementations =====
def extract_pdf_text(state: BZOExtractionState) -> BZOExtractionState:
    """Pass-through graph node for PDF text extraction.

    The actual PDF parsing happens in ``run_extraction``, which populates
    ``state["text_blocks"]`` before invoking the graph; this node merely
    forwards the state and records an error should anything go wrong.
    """
    try:
        return state
    except Exception as e:
        logger.error(f"Error extracting PDF text: {e}", exc_info=True)
        state["errors"] = state.get("errors", []) + [f"PDF extraction error: {str(e)}"]
        return state
def classify_text_block(state: BZOExtractionState) -> BZOExtractionState:
    """Classify each raw text block as article, heading, table or other.

    Appends one classification dict per non-empty block onto
    ``state["classified_blocks"]``; article blocks additionally carry a
    label ("Art. N") and, when detectable, a title.
    """
    try:
        results = []
        for raw in state["text_blocks"]:
            stripped = raw["text"].strip()
            if not stripped:
                continue
            kind, label, title = "other", None, None
            art = re.search(r'Art\.?\s*(\d+[a-z]?)', stripped, re.IGNORECASE)
            if art:
                kind = "article"
                label = f"Art. {art.group(1)}"
                # Title = text after the label, up to the first period/newline.
                titled = re.search(r'Art\.?\s*\d+[a-z]?\s+(.+?)(?:\.|$|\n)', stripped, re.IGNORECASE)
                if titled:
                    title = titled.group(1).strip()
            elif (re.match(r'^[A-Z]\.\s+[A-Z]', stripped)
                    or re.match(r'^[IVX]+\.\s+[A-Z]', stripped)
                    or re.match(r'^\d+\.\s+[A-Z]', stripped)):
                kind = "heading"
            elif '\t' in stripped or (len(stripped.split()) > 5 and stripped.count(' ') > 2):
                # NOTE(review): any block with >5 words also has >2 spaces, so
                # the second test is redundant as written — possibly a
                # double-space count ('  ') was intended. Behavior kept as-is.
                kind = "table"
            results.append({
                "block": {
                    "page": raw["page"],
                    "text": raw["text"],
                    "block_id": raw["block_id"],
                    "bbox": raw.get("bbox"),
                },
                "block_type": kind,
                "article_label": label,
                "article_title": title,
            })
        state["classified_blocks"] = state.get("classified_blocks", []) + results
        return state
    except Exception as e:
        logger.error(f"Error classifying text blocks: {e}", exc_info=True)
        state["errors"] = state.get("errors", []) + [f"Classification error: {str(e)}"]
        return state
def assemble_articles(state: BZOExtractionState) -> BZOExtractionState:
    """Assemble classified blocks into articles with hierarchical sections.

    Walks the classified blocks in order, tracking the current section
    headings (levels 1-3) and concatenating block text into the current
    article until the next article label starts a new one.

    Fix: block dicts are read directly instead of being round-tripped
    through throwaway TextBlock instances (only ``page`` and ``text`` were
    ever used from them).
    """
    try:
        articles: List[Dict[str, Any]] = []
        current: Optional[Dict[str, Any]] = None
        section_1 = section_2 = section_3 = None
        for classified in state["classified_blocks"]:
            block = classified["block"]
            page = block["page"]
            text = block["text"].strip()
            label = classified.get("article_label")
            if classified["block_type"] == "heading":
                # Heading prefixes drive the running section context; a higher
                # level resets the levels below it.
                if re.match(r'^[A-Z]\.\s+', text):  # level 1: A., B., C.
                    section_1 = text.split('.', 1)[0] + '.'
                    section_2 = section_3 = None
                elif re.match(r'^[IVX]+\.\s+', text):  # level 2: I., II., III.
                    section_2 = text.split('.', 1)[0] + '.'
                    section_3 = None
                elif re.match(r'^\d+\.\s+', text):  # level 3: 1., 2., 3.
                    section_3 = text.split('.', 1)[0] + '.'
            if label:
                # A labeled block starts a new article; flush the previous one.
                if current:
                    articles.append(current)
                current = {
                    "article_label": label,
                    "article_title": classified.get("article_title"),
                    "text": text,
                    "page_start": page,
                    "page_end": page,
                    "section_level_1": section_1,
                    "section_level_2": section_2,
                    "section_level_3": section_3,
                    "zone_raw": None,
                }
            elif current:
                # Unlabeled block (including headings) continues the article.
                current["text"] += "\n" + text
                current["page_end"] = page
        if current:
            articles.append(current)
        state["articles"] = state.get("articles", []) + articles
        return state
    except Exception as e:
        logger.error(f"Error assembling articles: {e}", exc_info=True)
        state["errors"] = state.get("errors", []) + [f"Article assembly error: {str(e)}"]
        return state
def detect_zone_changes(state: BZOExtractionState) -> BZOExtractionState:
    """Detect zone declarations in articles and maintain the zone scope.

    Every zone-code match produces an entry in ``state["zones"]`` and
    updates ``state["current_zones"]`` (keyed by zone code).

    Fix: ``current_zones`` now stores plain dicts, matching the
    ``Dict[str, Dict[str, Any]]`` declaration in ``BZOExtractionState``
    ("dicts for serialization"); previously ZoneInfo dataclass instances
    were stored there, breaking that contract.
    """
    try:
        zones = []
        current_zones = state.get("current_zones", {})
        for article in state["articles"]:
            text = article.get("text", "")
            article_label = article.get("article_label", "")
            page_start = article.get("page_start", 0)
            # Pattern 1 catches explicit declarations ("Wohnzone W2");
            # pattern 2 catches bare codes (W2/30, W2/30G, Z3, K3/4).
            zone_patterns = [
                r'(?:Wohnzone|Zone|Gewerbezone|Industriezone|Zentrumszone|Ortsbildschutzzone|Erholungszone)\s+([A-Z0-9/]+)',
                r'([A-Z]\d+(?:/\d+)?(?:G)?)',
            ]
            for pattern in zone_patterns:
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    zone_code = match.group(1).upper()
                    # A trailing "G" marks a Gewerbeerleichterung variant.
                    gewerbeerleichterung = zone_code.endswith('G')
                    zone_code_base = zone_code[:-1] if gewerbeerleichterung else zone_code
                    # Storey count encoded in the code (W2 -> 2, W3/50 -> 3):
                    # only the part before a "/" carries the count.
                    geschosszahl = None
                    digits = re.search(r'(\d+)', zone_code_base.split('/')[0])
                    if digits:
                        geschosszahl = int(digits.group(1))
                    # Category from surrounding text or the code's first letter.
                    zone_category = None
                    if 'Wohnzone' in text or zone_code.startswith('W'):
                        zone_category = "Wohnzonen"
                    elif 'Zentrumszone' in text or zone_code.startswith('Z'):
                        zone_category = "Zentrumszonen"
                    elif 'Gewerbezone' in text or zone_code.startswith('G'):
                        zone_category = "Arbeitsplatzzonen"
                    elif 'Industriezone' in text or zone_code.startswith('I'):
                        zone_category = "Arbeitsplatzzonen"
                    zone_record = {
                        "zone_code": zone_code,
                        "zone_name": f"Zone {zone_code}",
                        "zone_category": zone_category,
                        "zone_subcategory": None,
                        "empfindlichkeitsstufe": None,
                        "geschosszahl": geschosszahl,
                        "gewerbeerleichterung": gewerbeerleichterung,
                    }
                    current_zones[zone_code] = zone_record
                    zones.append({
                        "zone_code": zone_code,
                        "zone_name": zone_record["zone_name"],
                        "zone_category": zone_category,
                        "geschosszahl": geschosszahl,
                        "gewerbeerleichterung": gewerbeerleichterung,
                        "source_article": article_label,
                        "page": page_start,
                    })
        state["current_zones"] = current_zones
        state["zones"] = state.get("zones", []) + zones
        return state
    except Exception as e:
        logger.error(f"Error detecting zones: {e}", exc_info=True)
        state["errors"] = state.get("errors", []) + [f"Zone detection error: {str(e)}"]
        return state
def detect_rule_candidates(state: BZOExtractionState) -> BZOExtractionState:
    """Scan article texts against the rule taxonomy and collect candidates.

    For every taxonomy pattern hit, a ±100-char context window is searched
    for geographic/scoping qualifiers; each hit becomes a candidate dict
    appended to ``state["rule_candidates"]``.
    """
    try:
        found = []
        # Qualifiers looked for near each match (geographic, then generic).
        condition_patterns = [
            r'(?:nördlich|südlich|östlich|westlich|oberhalb|unterhalb)\s+[^,\.]+',
            r'(?:für|bei|in)\s+[^,\.]+',
        ]
        for article in state["articles"]:
            text = article.get("text", "")
            page_start = article.get("page_start", 0)
            for rule_type, rule_config in RULE_TAXONOMY.items():
                for pattern in rule_config.get("patterns", []):
                    for match in re.finditer(pattern, text, re.IGNORECASE):
                        window = text[max(0, match.start() - 100):min(len(text), match.end() + 100)]
                        condition_text = None
                        for cond_pattern in condition_patterns:
                            hit = re.search(cond_pattern, window, re.IGNORECASE)
                            if hit:
                                condition_text = hit.group(0)
                                break
                        found.append({
                            "rule_type": rule_type,
                            "matched_text": match.group(0),
                            "article_text": text,
                            "page": page_start,
                            "condition_text": condition_text,
                            "is_table_rule": False,
                            "table_zones": [],
                        })
        state["rule_candidates"] = state.get("rule_candidates", []) + found
        return state
    except Exception as e:
        logger.error(f"Error detecting rule candidates: {e}", exc_info=True)
        state["errors"] = state.get("errors", []) + [f"Rule candidate detection error: {str(e)}"]
        return state
def parse_rule_values(state: BZOExtractionState) -> BZOExtractionState:
    """Parse structured values out of rule candidates using regex heuristics.

    For numeric/integer rule types, scans the matched text plus a 200-char
    article window for a number and optional unit, normalizes the unit
    ("meter"/"metern" -> "m", "prozent" -> "%"), scores confidence
    (0.5 base, 0.8 with a value, 0.9 with a unit) and attributes the rule
    to the first currently-known zone, if any.

    Fix: the offset of the matched text inside the article is computed once
    (previously two ``find()`` calls) and guarded against -1, which would
    have produced a bogus negative slice.
    """
    try:
        parsed_rules = []
        for candidate in state["rule_candidates"]:
            rule_type = candidate["rule_type"]
            rule_config = RULE_TAXONOMY.get(rule_type, {})
            value_type = rule_config.get("value_type", "numeric")
            matched_text = candidate["matched_text"]
            article_text = candidate["article_text"]
            # Matched text plus a 200-char context window from the article.
            idx = article_text.find(matched_text)
            if idx >= 0:
                text = matched_text + " " + article_text[idx:idx + 200]
            else:
                text = matched_text
            value_numeric = None
            value_text = matched_text
            unit = None
            if value_type in ["numeric", "integer"]:
                # Patterns like "max. 4", "30 %", "min. 3.5 m".
                value_patterns = [
                    r'(?:max|maximal|min|mindestens|höchstens)\s*\.?\s*(\d+(?:\.\d+)?)',
                    r'(\d+(?:\.\d+)?)\s*(%|m|meter|metern|prozent)',
                    r'(\d+(?:\.\d+)?)',
                ]
                for pattern in value_patterns:
                    match = re.search(pattern, text, re.IGNORECASE)
                    if match:
                        try:
                            value_numeric = float(match.group(1))
                            if value_type == "integer":
                                value_numeric = int(value_numeric)
                            # Unit is searched independently over the window.
                            unit_match = re.search(r'(\d+(?:\.\d+)?)\s*(%|m|meter|metern|prozent)', text, re.IGNORECASE)
                            if unit_match:
                                unit = unit_match.group(2).lower()
                                if unit in ["meter", "metern"]:
                                    unit = "m"
                                elif unit == "prozent":
                                    unit = "%"
                            break
                        except ValueError:
                            continue
            # Confidence grows with each successfully parsed component.
            confidence = 0.5
            if value_numeric is not None:
                confidence = 0.8
            if unit:
                confidence = 0.9
            # Crude attribution: first known zone (refinement hook exists in
            # assign_zone_and_scope).
            zone_raw = None
            rule_scope = "general"
            current_zones = state.get("current_zones")
            if current_zones:
                zone_raw = next(iter(current_zones))
                rule_scope = "zone"
            parsed_rules.append({
                "rule_type": rule_type,
                "value_numeric": value_numeric,
                "value_text": value_text,
                "unit": unit,
                "condition_text": candidate.get("condition_text"),
                "is_table_rule": candidate.get("is_table_rule", False),
                "table_zones": candidate.get("table_zones", []),
                "page": candidate["page"],
                "text_snippet": value_text,
                "zone_raw": zone_raw,
                "rule_scope": rule_scope,
                "confidence": confidence
            })
        state["parsed_rules"] = state.get("parsed_rules", []) + parsed_rules
        return state
    except Exception as e:
        logger.error(f"Error parsing rule values: {e}", exc_info=True)
        state["errors"] = state.get("errors", []) + [f"Rule parsing error: {str(e)}"]
        return state
def assign_zone_and_scope(state: BZOExtractionState) -> BZOExtractionState:
    """Refinement node for zone/scope assignment.

    Intentionally a no-op for now: zone and scope are already set in
    parse_rule_values. Kept as a dedicated graph node so refinement logic
    has an obvious home later.
    """
    try:
        return state
    except Exception as e:
        logger.error(f"Error assigning zone and scope: {e}", exc_info=True)
        state["errors"] = state.get("errors", []) + [f"Zone/scope assignment error: {str(e)}"]
        return state
def confidence_scoring(state: BZOExtractionState) -> BZOExtractionState:
    """Refinement node for confidence scores.

    Intentionally a no-op for now: confidence is already computed in
    parse_rule_values. Kept as a dedicated graph node so score refinement
    has an obvious home later.
    """
    try:
        return state
    except Exception as e:
        logger.error(f"Error calculating confidence: {e}", exc_info=True)
        state["errors"] = state.get("errors", []) + [f"Confidence scoring error: {str(e)}"]
        return state
def extract_zone_parameter_tables(state: BZOExtractionState) -> BZOExtractionState:
    """
    Extract structured zone-parameter mappings from table blocks.

    Parses tables that map building zones to parameter values (e.g.
    Ausnützungsziffer, Vollgeschosse, Gebäudelänge, Grenzabstand,
    Fassadenhöhen) and appends them to ``state["zone_parameter_tables"]``.

    Fixes vs. previous revision:
    * ``header_row_idx`` was tested with ``not header_row_idx``, which
      wrongly treated a header found at line index 0 as "no header" — now
      tested with ``is None``.
    * The "a) Name" row pattern had a lazy capture with a fully optional
      tail, so it always captured exactly one character as the parameter
      name; the tail is now anchored (max./min./colon/digit/end of line).
    * The inner value-grouping loop no longer shadows ``start_idx``.
    * Removed the redundant function-local ``import re``.
    """
    try:
        tables = []
        # Only blocks previously classified as tables are considered.
        table_blocks = [
            block for block in state.get("classified_blocks", [])
            if block.get("block_type") == "table"
        ]
        logger.info(f"Found {len(table_blocks)} table blocks to process")
        for table_block in table_blocks:
            block_dict = table_block.get("block", {})
            text = block_dict.get("text", "")
            page = block_dict.get("page", 0)
            if not text or len(text.strip()) < 20:  # Skip very short blocks
                continue
            # Zone codes such as W2/30, W3/50, W4/70G*.
            zone_pattern = r'\b([WLIZK]\d+(?:/\d+)?(?:G\*?)?)\b'
            lines = text.split('\n')
            # Locate the header row: the first line with >= 3 zone codes.
            header_row_idx = None
            zone_columns = []
            for idx, line in enumerate(lines):
                zone_matches = re.findall(zone_pattern, line, re.IGNORECASE)
                if len(zone_matches) >= 3:
                    header_row_idx = idx
                    zone_columns = zone_matches
                    logger.debug(f"Found header row at line {idx} with zones: {zone_columns}")
                    break
            # BUGFIX: `is None` — `not header_row_idx` discarded a header
            # legitimately found at line index 0.
            if header_row_idx is None or not zone_columns:
                # Fallback: accept tables whose zones are spread through the
                # text, provided known parameter names also appear.
                parameter_keywords = [
                    r'Ausnützungsziffer',
                    r'Vollgeschosse',
                    r'Dachgeschosse',
                    r'Attikageschoss',
                    r'Untergeschoss',
                    r'Gebäudelänge',
                    r'Grenzabstand',
                    r'Fassadenhöhen',
                    r'Grundabstand',
                    r'Mehrlängen',
                    r'Höchstmass'
                ]
                has_parameters = any(re.search(kw, text, re.IGNORECASE) for kw in parameter_keywords)
                has_zones = len(re.findall(zone_pattern, text, re.IGNORECASE)) >= 3
                if has_parameters and has_zones:
                    all_zones = re.findall(zone_pattern, text, re.IGNORECASE)
                    zone_columns = list(dict.fromkeys(all_zones))  # dedupe, keep order
                    header_row_idx = 0  # Assume header is at start
                    logger.debug(f"Found zones in table text: {zone_columns}")
            if not zone_columns:
                continue
            table_data = {
                "page": page,
                "zones": zone_columns,
                "parameters": [],
                "source_text": text[:500],  # First 500 chars for reference
                "article": None
            }
            # Parameter rows look like "a) Name ..." or start with a known name.
            parameter_row_patterns = [
                # FIX: the lazy capture is anchored on max./min./colon/first
                # number/end of line so the whole name is captured.
                r'^[a-g]\)\s+(.+?)(?:\s+max\.|\s+min\.|:|\s+\d|$)',
                r'^(Ausnützungsziffer|Vollgeschosse|Dachgeschosse|Attikageschoss|Untergeschoss|Gebäudelänge|Grenzabstand|Fassadenhöhen|Grundabstand|Mehrlängen|Höchstmass|Höchstmaß)',
            ]
            start_idx = header_row_idx + 1 if header_row_idx is not None else 0
            current_parameter = None
            current_subparameter = None
            parameter_values = {}
            subparameter_values = {}
            # Attribute the table to the first article label on this page.
            article_context = None
            for block in state.get("classified_blocks", []):
                if block.get("block", {}).get("page") == page:
                    article_label = block.get("article_label")
                    if article_label:
                        article_context = article_label
                        break
            for line_idx in range(start_idx, len(lines)):
                line = lines[line_idx].strip()
                if not line:
                    continue
                # Is this a main parameter row (a), b), c), or a known name)?
                is_parameter_row = False
                parameter_name = None
                for pattern in parameter_row_patterns:
                    match = re.match(pattern, line, re.IGNORECASE)
                    if match:
                        is_parameter_row = True
                        parameter_name = match.group(1).strip()
                        # Strip trailing max./min. qualifiers from the name.
                        parameter_name = re.sub(r'\s+max\.?\s*$', '', parameter_name, flags=re.IGNORECASE)
                        parameter_name = re.sub(r'\s+min\.?\s*$', '', parameter_name, flags=re.IGNORECASE)
                        break
                # Sub-parameters ("Grundabstand min.", "anrechenbare ...").
                is_subparameter = False
                subparameter_name = None
                if not is_parameter_row:
                    subparameter_patterns = [
                        r'^(Grundabstand|Mehrlängen|Höchstmass|Höchstmaß|Fassadenhöhen)\s*(min\.|max\.)?',
                        r'^(anrechenbare\s+Dachgeschosse|anrechenbares\s+Attikageschoss|anrechenbares\s+Untergeschoss)',
                    ]
                    for pattern in subparameter_patterns:
                        match = re.search(pattern, line, re.IGNORECASE)
                        if match:
                            is_subparameter = True
                            subparameter_name = match.group(1).strip()
                            if match.lastindex > 1 and match.group(2):
                                subparameter_name += f" {match.group(2).strip()}"
                            break
                if is_parameter_row and parameter_name:
                    # Flush the previous parameter before starting a new one.
                    if current_parameter and parameter_values:
                        param_entry = {
                            "parameter": current_parameter,
                            "values_by_zone": parameter_values.copy()
                        }
                        if article_context:
                            param_entry["article"] = article_context
                        table_data["parameters"].append(param_entry)
                    current_parameter = parameter_name
                    current_subparameter = None
                    parameter_values = {}
                    subparameter_values = {}
                    continue
                if is_subparameter and subparameter_name:
                    # Flush the previous sub-parameter as a nested entry.
                    if current_subparameter and subparameter_values:
                        if current_parameter:
                            param_entry = {
                                "parameter": f"{current_parameter} - {current_subparameter}",
                                "values_by_zone": subparameter_values.copy()
                            }
                            if article_context:
                                param_entry["article"] = article_context
                            table_data["parameters"].append(param_entry)
                    current_subparameter = subparameter_name
                    subparameter_values = {}
                    continue
                # Value line: extract values for the active (sub-)parameter.
                target_values = subparameter_values if current_subparameter else parameter_values
                if current_parameter or current_subparameter:
                    # Split by column separators (runs of spaces or tabs).
                    line_parts = re.split(r'\s{2,}|\t', line)
                    line_parts = [p.strip() for p in line_parts if p.strip()]
                    # Numeric values with optional units, plus fractions (1/3).
                    numeric_pattern = r'(\d+(?:\.\d+)?)\s*(%|m|Geschoss|Geschosse|Geschosse\s+max\.?|Geschoss\s+max\.?)?'
                    all_matches = list(re.finditer(numeric_pattern, line, re.IGNORECASE))
                    fraction_pattern = r'(\d+/\d+)'
                    fraction_matches = list(re.finditer(fraction_pattern, line, re.IGNORECASE))
                    # Merge both kinds of matches in positional order.
                    all_value_matches = []
                    for m in all_matches:
                        value = m.group(1)
                        unit = m.group(2) if m.lastindex > 1 else None
                        all_value_matches.append((m.start(), m.group(0), value, unit))
                    for m in fraction_matches:
                        all_value_matches.append((m.start(), m.group(0), m.group(0), None))
                    all_value_matches.sort(key=lambda x: x[0])
                    if len(all_value_matches) > 0 and len(zone_columns) > 0:
                        if len(all_value_matches) == len(zone_columns):
                            # Perfect 1:1 mapping of values to zone columns.
                            for zone_idx, zone in enumerate(zone_columns):
                                if zone_idx < len(all_value_matches):
                                    _, full_match, value, unit = all_value_matches[zone_idx]
                                    if zone not in target_values:
                                        target_values[zone] = []
                                    target_values[zone].append({
                                        "value": value,
                                        "unit": unit.strip() if unit else None,
                                        "raw_text": line[:200],
                                        "line_number": line_idx
                                    })
                        elif len(all_value_matches) >= len(zone_columns):
                            # More values than zones: distribute evenly and
                            # keep the first value of each group.
                            values_per_zone = len(all_value_matches) / len(zone_columns)
                            for zone_idx, zone in enumerate(zone_columns):
                                lo = int(zone_idx * values_per_zone)  # renamed; was shadowing start_idx
                                hi = int((zone_idx + 1) * values_per_zone)
                                zone_values = all_value_matches[lo:hi]
                                if zone_values:
                                    if zone not in target_values:
                                        target_values[zone] = []
                                    _, full_match, value, unit = zone_values[0]
                                    target_values[zone].append({
                                        "value": value,
                                        "unit": unit.strip() if unit else None,
                                        "raw_text": line[:200],
                                        "line_number": line_idx
                                    })
                        else:
                            # Fewer values than zones: prefer positional column
                            # alignment when the split parts roughly match.
                            if len(line_parts) >= len(zone_columns) * 0.7:
                                for zone_idx, zone in enumerate(zone_columns):
                                    if zone_idx < len(line_parts):
                                        part = line_parts[zone_idx]
                                        num_match = re.search(r'(\d+(?:\.\d+)?)', part)
                                        if num_match:
                                            value = num_match.group(1)
                                            unit_match = re.search(r'(%|m|Geschoss)', part, re.IGNORECASE)
                                            unit = unit_match.group(0) if unit_match else None
                                            if zone not in target_values:
                                                target_values[zone] = []
                                            target_values[zone].append({
                                                "value": value,
                                                "unit": unit,
                                                "raw_text": part[:100],
                                                "line_number": line_idx
                                            })
                            else:
                                # Fallback: assign values to the leading zones.
                                for idx, (_, full_match, value, unit) in enumerate(all_value_matches):
                                    if idx < len(zone_columns):
                                        zone = zone_columns[idx]
                                        if zone not in target_values:
                                            target_values[zone] = []
                                        target_values[zone].append({
                                            "value": value,
                                            "unit": unit.strip() if unit else None,
                                            "raw_text": line[:200],
                                            "line_number": line_idx
                                        })
            # Flush trailing sub-parameter and parameter accumulators.
            if current_subparameter and subparameter_values:
                if current_parameter:
                    param_entry = {
                        "parameter": f"{current_parameter} - {current_subparameter}",
                        "values_by_zone": subparameter_values.copy()
                    }
                    if article_context:
                        param_entry["article"] = article_context
                    table_data["parameters"].append(param_entry)
            if current_parameter and parameter_values:
                param_entry = {
                    "parameter": current_parameter,
                    "values_by_zone": parameter_values.copy()
                }
                if article_context:
                    param_entry["article"] = article_context
                table_data["parameters"].append(param_entry)
            if table_data["parameters"]:
                tables.append(table_data)
                logger.info(f"Extracted table with {len(table_data['zones'])} zones and {len(table_data['parameters'])} parameters from page {page}")
        existing_tables = state.get("zone_parameter_tables", [])
        state["zone_parameter_tables"] = existing_tables + tables
        logger.info(f"Extracted {len(tables)} zone-parameter tables total")
        return state
    except Exception as e:
        logger.error(f"Error extracting zone-parameter tables: {e}", exc_info=True)
        state["errors"] = state.get("errors", []) + [f"Table extraction error: {str(e)}"]
        return state
# ===== Graph Construction =====
def create_bzo_extraction_graph():
    """Build and compile the BZO extraction StateGraph.

    Nodes run strictly sequentially in the order listed in ``pipeline``;
    the last node connects to END.
    """
    pipeline = [
        ("extract_pdf_text", extract_pdf_text),
        ("classify_text_block", classify_text_block),
        ("assemble_articles", assemble_articles),
        ("detect_zone_changes", detect_zone_changes),
        ("extract_zone_parameter_tables", extract_zone_parameter_tables),
        ("detect_rule_candidates", detect_rule_candidates),
        ("parse_rule_values", parse_rule_values),
        ("assign_zone_and_scope", assign_zone_and_scope),
        ("confidence_scoring", confidence_scoring),
    ]
    workflow = StateGraph(BZOExtractionState)
    for node_name, node_fn in pipeline:
        workflow.add_node(node_name, node_fn)
    workflow.set_entry_point(pipeline[0][0])
    # Chain consecutive nodes into a linear pipeline.
    for (src, _), (dst, _) in zip(pipeline, pipeline[1:]):
        workflow.add_edge(src, dst)
    workflow.add_edge(pipeline[-1][0], END)
    return workflow.compile()
def run_extraction(pdf_bytes: bytes, pdf_id: str = None, dokument_id: str = None) -> Dict[str, Any]:
    """
    Run the extraction pipeline on a PDF and return structured, sorted results.

    Args:
        pdf_bytes: PDF file content as bytes.
        pdf_id: Optional identifier for the PDF (a random one is generated
            when omitted).
        dokument_id: Optional dokument ID carried through for reference.

    Returns:
        Dict with "articles" (sorted by page_start then article_label),
        "zones" (sorted by zone_code), "rules" (sorted by rule_type then
        page), "zone_parameter_tables", "errors" and "warnings".
    """
    import uuid
    pdf_id = pdf_id or f"pdf_{uuid.uuid4().hex[:8]}"
    initial_state: BZOExtractionState = {
        "dokument_id": dokument_id,
        "pdf_id": pdf_id,
        "text_blocks": [],
        "classified_blocks": [],
        "articles": [],
        "current_zones": {},
        "zones": [],
        "rule_candidates": [],
        "parsed_rules": [],
        "zone_parameter_tables": [],
        "errors": [],
        "warnings": [],
    }
    # PDF text extraction happens up-front: the graph's first node is a
    # pass-through that expects text_blocks to be populated already.
    extractor = BZOPdfExtractor()
    initial_state["text_blocks"] = [
        {"page": tb.page, "text": tb.text, "block_id": tb.block_id, "bbox": tb.bbox}
        for tb in extractor.extract_text_blocks(pdf_bytes, initial_state["pdf_id"])
    ]
    final_state = create_bzo_extraction_graph().invoke(initial_state)
    return {
        "articles": sorted(
            final_state.get("articles", []),
            key=lambda a: (a.get("page_start", 0), a.get("article_label", ""))
        ),
        "zones": sorted(
            final_state.get("zones", []),
            key=lambda z: z.get("zone_code", "")
        ),
        "rules": sorted(
            final_state.get("parsed_rules", []),
            key=lambda r: (r.get("rule_type", ""), r.get("page", 0))
        ),
        "zone_parameter_tables": final_state.get("zone_parameter_tables", []),
        "errors": final_state.get("errors", []),
        "warnings": final_state.get("warnings", []),
    }
def extract_from_documents(
    document_retriever,
    dokument_ids: List[str]
) -> Dict[str, Any]:
    """
    Extract BZO content from one or more documents.

    Args:
        document_retriever: BZODocumentRetriever instance.
        dokument_ids: List of dokument IDs to process.

    Returns:
        {"results": [per-document extraction dicts], "summary": {...}}.
        A document counts as "failed" when its PDF cannot be retrieved,
        when processing raises, or when its result carries any errors.
    """
    results = []
    totals = {"articles": 0, "zones": 0, "rules": 0}
    successful = 0
    failed = 0

    def _empty_result(dok_id, error_msg):
        # Uniform shape for documents that could not be processed.
        return {
            "dokument_id": dok_id,
            "articles": [],
            "zones": [],
            "rules": [],
            "errors": [error_msg],
            "warnings": []
        }

    for dokument in document_retriever.get_documents_by_ids(dokument_ids):
        try:
            pdf_bytes = document_retriever.retrieve_pdf_content(dokument)
            if not pdf_bytes:
                logger.warning(f"Could not retrieve PDF for dokument {dokument.id}")
                results.append(_empty_result(dokument.id, f"Could not retrieve PDF content"))
                failed += 1
                continue
            extraction_result = run_extraction(
                pdf_bytes=pdf_bytes,
                pdf_id=dokument.dokumentReferenz or f"dok_{dokument.id}",
                dokument_id=dokument.id
            )
            extraction_result["dokument_id"] = dokument.id
            results.append(extraction_result)
            totals["articles"] += len(extraction_result.get("articles", []))
            totals["zones"] += len(extraction_result.get("zones", []))
            totals["rules"] += len(extraction_result.get("rules", []))
            if extraction_result.get("errors"):
                failed += 1
            else:
                successful += 1
        except Exception as e:
            logger.error(f"Error processing dokument {dokument.id}: {str(e)}", exc_info=True)
            results.append(_empty_result(dokument.id, f"Processing error: {str(e)}"))
            failed += 1
    return {
        "results": results,
        "summary": {
            "total_documents": len(dokument_ids),
            "successful": successful,
            "failed": failed,
            "total_articles": totals["articles"],
            "total_zones": totals["zones"],
            "total_rules": totals["rules"]
        }
    }