""" LangGraph-based pipeline for extracting structured content from BZO PDFs. """ import logging import re from typing import TypedDict, List, Dict, Any, Optional from dataclasses import dataclass from langgraph.graph import StateGraph, START, END from modules.features.realEstate.bzoPdfExtractor import BZOPdfExtractor, TextBlock from modules.features.realEstate.bzoRuleTaxonomy import RULE_TAXONOMY logger = logging.getLogger(__name__) # ===== State Definition ===== @dataclass class ClassifiedBlock: """Classified text block.""" block: TextBlock block_type: str # "article", "heading", "table", "other" article_label: Optional[str] = None article_title: Optional[str] = None @dataclass class Article: """Assembled article.""" article_label: str article_title: Optional[str] text: str page_start: int page_end: int section_level_1: Optional[str] = None section_level_2: Optional[str] = None section_level_3: Optional[str] = None zone_raw: Optional[str] = None @dataclass class ZoneInfo: """Zone information.""" zone_code: str zone_name: str zone_category: Optional[str] = None zone_subcategory: Optional[str] = None empfindlichkeitsstufe: Optional[str] = None geschosszahl: Optional[int] = None gewerbeerleichterung: bool = False @dataclass class RuleCandidate: """Rule candidate from pattern matching.""" rule_type: str matched_text: str article_text: str page: int is_table_rule: bool = False table_zones: List[str] = None condition_text: Optional[str] = None @dataclass class ParsedRule: """Parsed rule with structured values.""" rule_type: str value_numeric: Optional[float] value_text: str unit: Optional[str] condition_text: Optional[str] is_table_rule: bool table_zones: List[str] page: int text_snippet: str zone_raw: Optional[str] = None rule_scope: str = "general" confidence: float = 0.5 class BZOExtractionState(TypedDict): """State for BZO extraction pipeline.""" # Input metadata dokument_id: Optional[str] pdf_id: str # Extracted text blocks (stored as dicts for serialization) text_blocks: List[Dict[str, Any]] # Classified blocks (stored as dicts for serialization) classified_blocks: List[Dict[str, Any]] # Assembled articles (stored as dicts for serialization) articles: List[Dict[str, Any]] # Zone tracking current_zones: Dict[str, Dict[str, Any]] zones: List[Dict[str, Any]] # Rule extraction (stored as dicts for serialization) rule_candidates: List[Dict[str, Any]] parsed_rules: List[Dict[str, Any]] # Processing metadata errors: List[str] warnings: List[str] # ===== Node Implementations ===== def extract_pdf_text(state: BZOExtractionState) -> BZOExtractionState: """Extract text blocks from PDF.""" try: # PDF bytes should be passed in state context # This is handled in run_extraction function # State already has text_blocks populated return state except Exception as e: logger.error(f"Error extracting PDF text: {e}", exc_info=True) state["errors"] = state.get("errors", []) + [f"PDF extraction error: {str(e)}"] return state def classify_text_block(state: BZOExtractionState) -> BZOExtractionState: """Classify text blocks into articles, headings, tables, etc.""" try: classified = [] for block_dict in state["text_blocks"]: text = block_dict["text"].strip() if not text: continue block_type = "other" article_label = None article_title = None # Check for article patterns article_match = re.search(r'Art\.?\s*(\d+[a-z]?)', text, re.IGNORECASE) if article_match: block_type = "article" article_label = f"Art. 
{article_match.group(1)}" # Try to extract title (text after article label, before first period or newline) title_match = re.search(r'Art\.?\s*\d+[a-z]?\s+(.+?)(?:\.|$|\n)', text, re.IGNORECASE) if title_match: article_title = title_match.group(1).strip() # Check for heading patterns (Roman numerals, letters, numbers) elif re.match(r'^[A-Z]\.\s+[A-Z]', text) or re.match(r'^[IVX]+\.\s+[A-Z]', text) or re.match(r'^\d+\.\s+[A-Z]', text): block_type = "heading" # Check for table patterns (multiple tabs or aligned columns) elif '\t' in text or (len(text.split()) > 5 and text.count(' ') > 2): block_type = "table" classified.append({ "block": { "page": block_dict["page"], "text": block_dict["text"], "block_id": block_dict["block_id"], "bbox": block_dict.get("bbox") }, "block_type": block_type, "article_label": article_label, "article_title": article_title }) # Update state with new classified blocks existing_blocks = state.get("classified_blocks", []) state["classified_blocks"] = existing_blocks + classified return state except Exception as e: logger.error(f"Error classifying text blocks: {e}", exc_info=True) state["errors"] = state.get("errors", []) + [f"Classification error: {str(e)}"] return state def assemble_articles(state: BZOExtractionState) -> BZOExtractionState: """Assemble classified blocks into articles with hierarchical structure.""" try: articles = [] current_article = None current_section_1 = None current_section_2 = None current_section_3 = None for classified_dict in state["classified_blocks"]: block_dict = classified_dict["block"] block = TextBlock( page=block_dict["page"], text=block_dict["text"], block_id=block_dict["block_id"], bbox=block_dict.get("bbox") ) text = block.text.strip() block_type = classified_dict["block_type"] article_label = classified_dict.get("article_label") article_title = classified_dict.get("article_title") # Update section levels if block_type == "heading": # Level 1: A., B., C. if re.match(r'^[A-Z]\.\s+', text): current_section_1 = text.split('.', 1)[0] + '.' current_section_2 = None current_section_3 = None # Level 2: I., II., III. elif re.match(r'^[IVX]+\.\s+', text): current_section_2 = text.split('.', 1)[0] + '.' current_section_3 = None # Level 3: 1., 2., 3. elif re.match(r'^\d+\.\s+', text): current_section_3 = text.split('.', 1)[0] + '.' 


def assemble_articles(state: BZOExtractionState) -> BZOExtractionState:
    """Assemble classified blocks into articles with hierarchical structure."""
    try:
        articles = []
        current_article = None
        current_section_1 = None
        current_section_2 = None
        current_section_3 = None

        for classified_dict in state["classified_blocks"]:
            block_dict = classified_dict["block"]
            block = TextBlock(
                page=block_dict["page"],
                text=block_dict["text"],
                block_id=block_dict["block_id"],
                bbox=block_dict.get("bbox")
            )
            text = block.text.strip()
            block_type = classified_dict["block_type"]
            article_label = classified_dict.get("article_label")
            article_title = classified_dict.get("article_title")

            # Update section levels
            if block_type == "heading":
                # Level 1: A., B., C.
                if re.match(r'^[A-Z]\.\s+', text):
                    current_section_1 = text.split('.', 1)[0] + '.'
                    current_section_2 = None
                    current_section_3 = None
                # Level 2: I., II., III.
                elif re.match(r'^[IVX]+\.\s+', text):
                    current_section_2 = text.split('.', 1)[0] + '.'
                    current_section_3 = None
                # Level 3: 1., 2., 3.
                elif re.match(r'^\d+\.\s+', text):
                    current_section_3 = text.split('.', 1)[0] + '.'

            # Start new article
            if article_label:
                # Save previous article if exists
                if current_article:
                    articles.append(current_article)

                # Start new article
                current_article = {
                    "article_label": article_label,
                    "article_title": article_title,
                    "text": text,
                    "page_start": block.page,
                    "page_end": block.page,
                    "section_level_1": current_section_1,
                    "section_level_2": current_section_2,
                    "section_level_3": current_section_3,
                    "zone_raw": None
                }
            # Continue current article
            elif current_article:
                current_article["text"] += "\n" + text
                current_article["page_end"] = block.page

        # Add last article
        if current_article:
            articles.append(current_article)

        # Update state with new articles
        existing_articles = state.get("articles", [])
        state["articles"] = existing_articles + articles

        return state
    except Exception as e:
        logger.error(f"Error assembling articles: {e}", exc_info=True)
        state["errors"] = state.get("errors", []) + [f"Article assembly error: {str(e)}"]
        return state


def detect_zone_changes(state: BZOExtractionState) -> BZOExtractionState:
    """Detect zone declarations and maintain zone scope."""
    try:
        zones = []
        current_zones = state.get("current_zones", {})

        for article_dict in state["articles"]:
            text = article_dict.get("text", "")
            article_label = article_dict.get("article_label", "")
            page_start = article_dict.get("page_start", 0)

            # Pattern: "Wohnzone W2", "Zone W3", "Gewerbezone G1"
            zone_patterns = [
                r'(?:Wohnzone|Zone|Gewerbezone|Industriezone|Zentrumszone|Ortsbildschutzzone|Erholungszone)\s+([A-Z0-9/]+)',
                r'([A-Z]\d+(?:/\d+)?(?:G)?)',  # W2/30, W2/30G, Z3, K3/4
            ]

            for pattern in zone_patterns:
                matches = re.finditer(pattern, text, re.IGNORECASE)
                for match in matches:
                    zone_code = match.group(1).upper()

                    # Parse zone code
                    gewerbeerleichterung = zone_code.endswith('G')
                    if gewerbeerleichterung:
                        zone_code_base = zone_code[:-1]
                    else:
                        zone_code_base = zone_code

                    # Extract geschosszahl from code (e.g., W2 -> 2, W3/50 -> 3)
                    geschosszahl = None
                    if '/' in zone_code_base:
                        parts = zone_code_base.split('/')
                        geschosszahl_match = re.search(r'(\d+)', parts[0])
                        if geschosszahl_match:
                            geschosszahl = int(geschosszahl_match.group(1))
                    else:
                        geschosszahl_match = re.search(r'(\d+)', zone_code_base)
                        if geschosszahl_match:
                            geschosszahl = int(geschosszahl_match.group(1))

                    # Determine zone category from context
                    zone_category = None
                    if 'Wohnzone' in text or zone_code.startswith('W'):
                        zone_category = "Wohnzonen"
                    elif 'Zentrumszone' in text or zone_code.startswith('Z'):
                        zone_category = "Zentrumszonen"
                    elif 'Gewerbezone' in text or zone_code.startswith('G'):
                        zone_category = "Arbeitsplatzzonen"
                    elif 'Industriezone' in text or zone_code.startswith('I'):
                        zone_category = "Arbeitsplatzzonen"

                    zone_info = ZoneInfo(
                        zone_code=zone_code,
                        zone_name=f"Zone {zone_code}",
                        zone_category=zone_category,
                        geschosszahl=geschosszahl,
                        gewerbeerleichterung=gewerbeerleichterung
                    )

                    # Store a plain dict so current_zones stays serializable
                    # (BZOExtractionState declares it as Dict[str, Dict[str, Any]])
                    current_zones[zone_code] = {
                        "zone_code": zone_info.zone_code,
                        "zone_name": zone_info.zone_name,
                        "zone_category": zone_info.zone_category,
                        "geschosszahl": zone_info.geschosszahl,
                        "gewerbeerleichterung": zone_info.gewerbeerleichterung
                    }

                    zones.append({
                        "zone_code": zone_code,
                        "zone_name": zone_info.zone_name,
                        "zone_category": zone_category,
                        "geschosszahl": geschosszahl,
                        "gewerbeerleichterung": gewerbeerleichterung,
                        "source_article": article_label,
                        "page": page_start
                    })

        # Update state with zones
        state["current_zones"] = current_zones
        existing_zones = state.get("zones", [])
        state["zones"] = existing_zones + zones

        return state
    except Exception as e:
        logger.error(f"Error detecting zones: {e}", exc_info=True)
        state["errors"] = state.get("errors", []) + [f"Zone detection error: {str(e)}"]
        return state
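

# Illustrative example (not executed): for an article mentioning "Wohnzone W2/30G",
# the parsing above yields zone_code="W2/30G", gewerbeerleichterung=True (trailing
# "G"), zone_code_base="W2/30", geschosszahl=2 (digits before the "/"), and
# zone_category="Wohnzonen" ("Wohnzone" in the text / leading "W" in the code).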
pattern matching.""" try: candidates = [] for article_dict in state["articles"]: text = article_dict.get("text", "") article_label = article_dict.get("article_label", "") page_start = article_dict.get("page_start", 0) # Check each rule type in taxonomy for rule_type, rule_config in RULE_TAXONOMY.items(): patterns = rule_config.get("patterns", []) for pattern in patterns: # Create regex pattern (case-insensitive) regex_pattern = re.compile(pattern, re.IGNORECASE) matches = regex_pattern.finditer(text) for match in matches: # Extract context around match start = max(0, match.start() - 100) end = min(len(text), match.end() + 100) context = text[start:end] # Check for conditions (geographic, temporal, etc.) condition_text = None condition_patterns = [ r'(?:nördlich|südlich|östlich|westlich|oberhalb|unterhalb)\s+[^,\.]+', r'(?:für|bei|in)\s+[^,\.]+', ] for cond_pattern in condition_patterns: cond_match = re.search(cond_pattern, context, re.IGNORECASE) if cond_match: condition_text = cond_match.group(0) break candidate = { "rule_type": rule_type, "matched_text": match.group(0), "article_text": text, "page": page_start, "condition_text": condition_text, "is_table_rule": False, "table_zones": [] } candidates.append(candidate) # Update state with rule candidates existing_candidates = state.get("rule_candidates", []) state["rule_candidates"] = existing_candidates + candidates return state except Exception as e: logger.error(f"Error detecting rule candidates: {e}", exc_info=True) state["errors"] = state.get("errors", []) + [f"Rule candidate detection error: {str(e)}"] return state def parse_rule_values(state: BZOExtractionState) -> BZOExtractionState: """Parse rule values using regex (LLM fallback can be added later).""" try: parsed_rules = [] for candidate_dict in state["rule_candidates"]: rule_type = candidate_dict["rule_type"] rule_config = RULE_TAXONOMY.get(rule_type, {}) units = rule_config.get("units", []) value_type = rule_config.get("value_type", "numeric") # Extract value using regex matched_text = candidate_dict["matched_text"] article_text = candidate_dict["article_text"] text = matched_text + " " + article_text[article_text.find(matched_text):article_text.find(matched_text) + 200] value_numeric = None value_text = matched_text unit = None # Try to extract numeric value if value_type in ["numeric", "integer"]: # Pattern: "max. 4", "30 %", "min. 
3.5 m" value_patterns = [ r'(?:max|maximal|min|mindestens|höchstens)\s*\.?\s*(\d+(?:\.\d+)?)', r'(\d+(?:\.\d+)?)\s*(%|m|meter|metern|prozent)', r'(\d+(?:\.\d+)?)', ] for pattern in value_patterns: match = re.search(pattern, text, re.IGNORECASE) if match: try: value_numeric = float(match.group(1)) if value_type == "integer": value_numeric = int(value_numeric) # Check for unit unit_match = re.search(r'(\d+(?:\.\d+)?)\s*(%|m|meter|metern|prozent)', text, re.IGNORECASE) if unit_match: unit = unit_match.group(2).lower() if unit in ["meter", "metern"]: unit = "m" elif unit == "prozent": unit = "%" break except ValueError: continue # Calculate confidence confidence = 0.5 if value_numeric is not None: confidence = 0.8 if unit: confidence = 0.9 # Determine zone and scope zone_raw = None rule_scope = "general" # Check current zones context if state.get("current_zones"): # Use first zone as default (can be improved) zone_raw = list(state["current_zones"].keys())[0] if state["current_zones"] else None rule_scope = "zone" if zone_raw else "general" parsed_rule = { "rule_type": rule_type, "value_numeric": value_numeric, "value_text": value_text, "unit": unit, "condition_text": candidate_dict.get("condition_text"), "is_table_rule": candidate_dict.get("is_table_rule", False), "table_zones": candidate_dict.get("table_zones", []), "page": candidate_dict["page"], "text_snippet": value_text, "zone_raw": zone_raw, "rule_scope": rule_scope, "confidence": confidence } parsed_rules.append(parsed_rule) # Update state with parsed rules existing_rules = state.get("parsed_rules", []) state["parsed_rules"] = existing_rules + parsed_rules return state except Exception as e: logger.error(f"Error parsing rule values: {e}", exc_info=True) state["errors"] = state.get("errors", []) + [f"Rule parsing error: {str(e)}"] return state def assign_zone_and_scope(state: BZOExtractionState) -> BZOExtractionState: """Assign zone and scope to parsed rules.""" try: # Rules already have zone and scope assigned in parse_rule_values # This node can refine assignments if needed return state except Exception as e: logger.error(f"Error assigning zone and scope: {e}", exc_info=True) state["errors"] = state.get("errors", []) + [f"Zone/scope assignment error: {str(e)}"] return state def confidence_scoring(state: BZOExtractionState) -> BZOExtractionState: """Calculate confidence scores for extracted data.""" try: # Confidence already calculated in parse_rule_values # This node can refine scores if needed return state except Exception as e: logger.error(f"Error calculating confidence: {e}", exc_info=True) state["errors"] = state.get("errors", []) + [f"Confidence scoring error: {str(e)}"] return state # ===== Graph Construction ===== def create_bzo_extraction_graph(): """Create and compile the BZO extraction graph.""" workflow = StateGraph(BZOExtractionState) # Add nodes workflow.add_node("extract_pdf_text", extract_pdf_text) workflow.add_node("classify_text_block", classify_text_block) workflow.add_node("assemble_articles", assemble_articles) workflow.add_node("detect_zone_changes", detect_zone_changes) workflow.add_node("detect_rule_candidates", detect_rule_candidates) workflow.add_node("parse_rule_values", parse_rule_values) workflow.add_node("assign_zone_and_scope", assign_zone_and_scope) workflow.add_node("confidence_scoring", confidence_scoring) # Define edges workflow.set_entry_point("extract_pdf_text") workflow.add_edge("extract_pdf_text", "classify_text_block") workflow.add_edge("classify_text_block", "assemble_articles") 
workflow.add_edge("assemble_articles", "detect_zone_changes") workflow.add_edge("detect_zone_changes", "detect_rule_candidates") workflow.add_edge("detect_rule_candidates", "parse_rule_values") workflow.add_edge("parse_rule_values", "assign_zone_and_scope") workflow.add_edge("assign_zone_and_scope", "confidence_scoring") workflow.add_edge("confidence_scoring", END) return workflow.compile() def run_extraction(pdf_bytes: bytes, pdf_id: str = None, dokument_id: str = None) -> Dict[str, Any]: """ Run the extraction pipeline on a PDF and return structured, sorted results. Args: pdf_bytes: PDF file content as bytes pdf_id: Optional identifier for the PDF (defaults to generated ID) dokument_id: Optional dokument ID for reference Returns: Dictionary with extracted and sorted content: { "articles": [...], # Sorted by page_start, then article_label "zones": [...], # Sorted by zone_code "rules": [...], # Sorted by rule_type, then page "errors": [...], "warnings": [...] } """ import uuid if not pdf_id: pdf_id = f"pdf_{uuid.uuid4().hex[:8]}" # Initialize state state: BZOExtractionState = { "dokument_id": dokument_id, "pdf_id": pdf_id, "text_blocks": [], "classified_blocks": [], "articles": [], "current_zones": {}, "zones": [], "rule_candidates": [], "parsed_rules": [], "errors": [], "warnings": [] } # Extract PDF text first pdf_extractor = BZOPdfExtractor() text_blocks_objects = pdf_extractor.extract_text_blocks(pdf_bytes, state["pdf_id"]) # Convert TextBlock objects to dicts for state state["text_blocks"] = [ { "page": tb.page, "text": tb.text, "block_id": tb.block_id, "bbox": tb.bbox } for tb in text_blocks_objects ] # Create and run graph graph = create_bzo_extraction_graph() final_state = graph.invoke(state) # Sort and structure results articles = sorted( final_state.get("articles", []), key=lambda x: (x.get("page_start", 0), x.get("article_label", "")) ) zones = sorted( final_state.get("zones", []), key=lambda x: x.get("zone_code", "") ) rules = sorted( final_state.get("parsed_rules", []), key=lambda x: (x.get("rule_type", ""), x.get("page", 0)) ) return { "articles": articles, "zones": zones, "rules": rules, "errors": final_state.get("errors", []), "warnings": final_state.get("warnings", []) } def extract_from_documents( document_retriever, dokument_ids: List[str] ) -> Dict[str, Any]: """ Extract BZO content from one or more documents. Args: document_retriever: BZODocumentRetriever instance dokument_ids: List of dokument IDs to process Returns: Dictionary with results per document: { "results": [ { "dokument_id": "...", "articles": [...], "zones": [...], "rules": [...], "errors": [...], "warnings": [...] }, ... 


def extract_from_documents(
    document_retriever,
    dokument_ids: List[str]
) -> Dict[str, Any]:
    """
    Extract BZO content from one or more documents.

    Args:
        document_retriever: BZODocumentRetriever instance
        dokument_ids: List of dokument IDs to process

    Returns:
        Dictionary with results per document:
        {
            "results": [
                {
                    "dokument_id": "...",
                    "articles": [...],
                    "zones": [...],
                    "rules": [...],
                    "errors": [...],
                    "warnings": [...]
                },
                ...
            ],
            "summary": {
                "total_documents": N,
                "successful": M,
                "failed": K,
                "total_articles": X,
                "total_zones": Y,
                "total_rules": Z
            }
        }
    """
    results = []
    total_articles = 0
    total_zones = 0
    total_rules = 0
    successful = 0
    failed = 0

    # Retrieve documents
    dokumente = document_retriever.get_documents_by_ids(dokument_ids)

    for dokument in dokumente:
        try:
            # Retrieve PDF content
            pdf_bytes = document_retriever.retrieve_pdf_content(dokument)
            if not pdf_bytes:
                logger.warning(f"Could not retrieve PDF for dokument {dokument.id}")
                results.append({
                    "dokument_id": dokument.id,
                    "articles": [],
                    "zones": [],
                    "rules": [],
                    "errors": ["Could not retrieve PDF content"],
                    "warnings": []
                })
                failed += 1
                continue

            # Run extraction
            extraction_result = run_extraction(
                pdf_bytes=pdf_bytes,
                pdf_id=dokument.dokumentReferenz or f"dok_{dokument.id}",
                dokument_id=dokument.id
            )

            # Add dokument_id to result
            extraction_result["dokument_id"] = dokument.id
            results.append(extraction_result)

            # Update counters
            total_articles += len(extraction_result.get("articles", []))
            total_zones += len(extraction_result.get("zones", []))
            total_rules += len(extraction_result.get("rules", []))

            if extraction_result.get("errors"):
                failed += 1
            else:
                successful += 1

        except Exception as e:
            logger.error(f"Error processing dokument {dokument.id}: {str(e)}", exc_info=True)
            results.append({
                "dokument_id": dokument.id,
                "articles": [],
                "zones": [],
                "rules": [],
                "errors": [f"Processing error: {str(e)}"],
                "warnings": []
            })
            failed += 1

    return {
        "results": results,
        "summary": {
            "total_documents": len(dokument_ids),
            "successful": successful,
            "failed": failed,
            "total_articles": total_articles,
            "total_zones": total_zones,
            "total_rules": total_rules
        }
    }
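

# Minimal usage sketch for extract_from_documents (illustrative only; how the
# BZODocumentRetriever is constructed and the dokument IDs are assumptions):
#
#   retriever = BZODocumentRetriever()  # hypothetical construction
#   batch = extract_from_documents(retriever, dokument_ids=["dok-001", "dok-002"])
#   logger.info("Processed %s documents, extracted %s rules",
#               batch["summary"]["total_documents"], batch["summary"]["total_rules"])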