""" Document retriever for BZO extraction pipeline. Queries Dokument table and retrieves PDF content from ComponentObjects. """ import logging from typing import List, Dict, Any, Optional from .datamodelFeatureRealEstate import Dokument, DokumentTyp, Gemeinde from .interfaceFeatureRealEstate import RealEstateObjects from modules.interfaces.interfaceDbManagement import ComponentObjects logger = logging.getLogger(__name__) class BZODocumentRetriever: """Retrieves BZO documents from database and ComponentObjects.""" def __init__(self, realEstateInterface: RealEstateObjects, componentInterface: ComponentObjects): """ Initialize document retriever. Args: realEstateInterface: Real Estate database interface componentInterface: ComponentObjects interface for file retrieval """ self.realEstateInterface = realEstateInterface self.componentInterface = componentInterface def get_documents_by_ids(self, dokument_ids: List[str]) -> List[Dokument]: """ Retrieve specific documents by their IDs. Args: dokument_ids: List of dokument IDs to retrieve Returns: List of Dokument records """ try: dokumente = [] for dokument_id in dokument_ids: dokument = self.realEstateInterface.getDokument(dokument_id) if dokument: dokumente.append(dokument) else: logger.warning(f"Dokument {dokument_id} not found") logger.info(f"Retrieved {len(dokumente)} documents out of {len(dokument_ids)} requested") return dokumente except Exception as e: logger.error(f"Error retrieving documents by IDs: {str(e)}", exc_info=True) raise def get_bzo_documents(self, gemeinde_id: Optional[str] = None) -> List[Dokument]: """ Query Dokument table for BZO documents. Args: gemeinde_id: Optional filter by specific Gemeinde Returns: List of Dokument records with BZO document type """ try: # Query each document type separately (database connector doesn't support array filters) dokumente_aktuell = self.realEstateInterface.getDokumente( recordFilter={ "mandateId": self.realEstateInterface.mandateId, "dokumentTyp": DokumentTyp.GEMEINDE_BZO_AKTUELL } ) dokumente_revision = self.realEstateInterface.getDokumente( recordFilter={ "mandateId": self.realEstateInterface.mandateId, "dokumentTyp": DokumentTyp.GEMEINDE_BZO_REVISION } ) # Combine results and deduplicate by ID dokumente_dict = {} for dokument in dokumente_aktuell + dokumente_revision: dokumente_dict[dokument.id] = dokument dokumente = list(dokumente_dict.values()) # If gemeinde_id provided, filter by checking Gemeinde.dokumente relationship if gemeinde_id: filtered_dokumente = [] for dokument in dokumente: # Check if this dokument is linked to the specified Gemeinde gemeinden = self.realEstateInterface.getGemeinden( recordFilter={"mandateId": self.realEstateInterface.mandateId} ) for gemeinde in gemeinden: if gemeinde.id == gemeinde_id: # Check if dokument.id is in gemeinde.dokumente if gemeinde.dokumente: for doc in gemeinde.dokumente: if isinstance(doc, dict) and doc.get("id") == dokument.id: filtered_dokumente.append(dokument) break elif hasattr(doc, "id") and doc.id == dokument.id: filtered_dokumente.append(dokument) break break dokumente = filtered_dokumente logger.info(f"Found {len(dokumente)} BZO documents" + (f" for Gemeinde {gemeinde_id}" if gemeinde_id else "")) return dokumente except Exception as e: logger.error(f"Error querying BZO documents: {str(e)}", exc_info=True) raise def retrieve_pdf_content(self, dokument: Dokument) -> Optional[bytes]: """ Retrieve PDF bytes from ComponentObjects using dokumentReferenz. Args: dokument: Dokument record with dokumentReferenz field Returns: PDF bytes or None if retrieval fails """ try: if not dokument.dokumentReferenz: logger.warning(f"Dokument {dokument.id} has no dokumentReferenz") return None # Retrieve PDF bytes (unrestricted - BZO documents are public, accessible to all users) pdf_bytes = self.componentInterface.getFileDataForPublicDocument(dokument.dokumentReferenz) if not pdf_bytes: logger.warning(f"Could not retrieve PDF content for file {dokument.dokumentReferenz}") return None logger.debug(f"Retrieved PDF content for dokument {dokument.id} ({len(pdf_bytes)} bytes)") return pdf_bytes except Exception as e: logger.error(f"Error retrieving PDF content for dokument {dokument.id}: {str(e)}", exc_info=True) return None def resolve_gemeinde_for_dokument(self, dokument: Dokument) -> Optional[str]: """ Resolve gemeinde_id for a Dokument by checking Gemeinde.dokumente relationships. Args: dokument: Dokument record Returns: gemeinde_id if found, None otherwise """ try: gemeinden = self.realEstateInterface.getGemeinden( recordFilter={"mandateId": self.realEstateInterface.mandateId} ) for gemeinde in gemeinden: if gemeinde.dokumente: for doc in gemeinde.dokumente: doc_id = doc.id if hasattr(doc, "id") else doc.get("id") if isinstance(doc, dict) else None if doc_id == dokument.id: logger.debug(f"Resolved gemeinde_id {gemeinde.id} for dokument {dokument.id}") return gemeinde.id logger.warning(f"Could not resolve gemeinde_id for dokument {dokument.id}") return None except Exception as e: logger.error(f"Error resolving gemeinde for dokument {dokument.id}: {str(e)}", exc_info=True) return None def extract_year_from_dokument(self, dokument: Dokument) -> Optional[int]: """ Extract year from Dokument label or versionsbezeichnung. Args: dokument: Dokument record Returns: Year as integer if found, None otherwise """ import re # Try to extract year from label if dokument.label: year_match = re.search(r'\b(19|20)\d{2}\b', dokument.label) if year_match: return int(year_match.group()) # Try to extract year from versionsbezeichnung if dokument.versionsbezeichnung: year_match = re.search(r'\b(19|20)\d{2}\b', dokument.versionsbezeichnung) if year_match: return int(year_match.group()) return None