# gateway/modules/features/realEstate/bzoDocumentRetriever.py
#
# 199 lines
# 7.9 KiB
# Python

"""
Document retriever for BZO extraction pipeline.
Queries Dokument table and retrieves PDF content from ComponentObjects.
"""
import logging
from typing import List, Dict, Any, Optional
from .datamodelFeatureRealEstate import Dokument, DokumentTyp, Gemeinde
from .interfaceFeatureRealEstate import RealEstateObjects
from modules.interfaces.interfaceDbManagement import ComponentObjects
logger = logging.getLogger(__name__)
class BZODocumentRetriever:
    """Retrieves BZO documents from database and ComponentObjects.

    Dokument records and their Gemeinde links are read through the real
    estate interface; the PDF payload is fetched through the component
    interface using each record's ``dokumentReferenz``.
    """

    def __init__(self, realEstateInterface: RealEstateObjects, componentInterface: ComponentObjects):
        """
        Initialize document retriever.

        Args:
            realEstateInterface: Real Estate database interface
            componentInterface: ComponentObjects interface for file retrieval
        """
        self.realEstateInterface = realEstateInterface
        self.componentInterface = componentInterface

    @staticmethod
    def _linked_dokument_id(doc: Any) -> Optional[str]:
        """Return the id of one ``Gemeinde.dokumente`` entry, or None.

        Entries are handled in the two shapes the callers below cope with:
        a dict carrying an ``"id"`` key, or an object with an ``id`` attribute.
        """
        if isinstance(doc, dict):
            return doc.get("id")
        return getattr(doc, "id", None)

    def _get_linked_dokument_ids(self, gemeinde_id: str) -> set:
        """Return the ids of all documents linked to the given Gemeinde.

        Gemeinden are fetched once for the current mandate. The first Gemeinde
        whose id matches is used; an unknown Gemeinde or one without linked
        documents yields an empty set. Entries without an id are ignored so a
        missing id can never match a document.
        """
        gemeinden = self.realEstateInterface.getGemeinden(
            recordFilter={"mandateId": self.realEstateInterface.mandateId}
        )
        gemeinde = next((g for g in gemeinden if g.id == gemeinde_id), None)
        if gemeinde is None or not gemeinde.dokumente:
            return set()
        linked_ids = (self._linked_dokument_id(doc) for doc in gemeinde.dokumente)
        return {doc_id for doc_id in linked_ids if doc_id is not None}

    def get_documents_by_ids(self, dokument_ids: List[str]) -> List[Dokument]:
        """
        Retrieve specific documents by their IDs.

        IDs for which the interface returns nothing are logged as warnings
        and skipped, so the result may be shorter than the input.

        Args:
            dokument_ids: List of dokument IDs to retrieve

        Returns:
            List of Dokument records, in the order of the requested IDs

        Raises:
            Exception: any error raised by the interface is logged and re-raised
        """
        try:
            dokumente = []
            for dokument_id in dokument_ids:
                dokument = self.realEstateInterface.getDokument(dokument_id)
                if dokument:
                    dokumente.append(dokument)
                else:
                    logger.warning(f"Dokument {dokument_id} not found")
            logger.info(f"Retrieved {len(dokumente)} documents out of {len(dokument_ids)} requested")
            return dokumente
        except Exception as e:
            logger.error(f"Error retrieving documents by IDs: {str(e)}", exc_info=True)
            raise

    def get_bzo_documents(self, gemeinde_id: Optional[str] = None) -> List[Dokument]:
        """
        Query Dokument table for BZO documents.

        Args:
            gemeinde_id: Optional filter by specific Gemeinde

        Returns:
            List of Dokument records with BZO document type, deduplicated by
            id. With ``gemeinde_id`` set, only documents linked to that
            Gemeinde are returned (empty list if the Gemeinde is not found).

        Raises:
            Exception: any error raised by the interface is logged and re-raised
        """
        try:
            mandate_id = self.realEstateInterface.mandateId
            # Query each document type separately (database connector doesn't support array filters)
            dokumente_aktuell = self.realEstateInterface.getDokumente(
                recordFilter={
                    "mandateId": mandate_id,
                    "dokumentTyp": DokumentTyp.GEMEINDE_BZO_AKTUELL,
                }
            )
            dokumente_revision = self.realEstateInterface.getDokumente(
                recordFilter={
                    "mandateId": mandate_id,
                    "dokumentTyp": DokumentTyp.GEMEINDE_BZO_REVISION,
                }
            )

            # Combine results and deduplicate by ID (first-seen order is kept,
            # a later record with the same id replaces the earlier one)
            dokumente_dict = {}
            for dokument in dokumente_aktuell + dokumente_revision:
                dokumente_dict[dokument.id] = dokument
            dokumente = list(dokumente_dict.values())

            # If gemeinde_id provided, filter via the Gemeinde.dokumente relationship.
            # The Gemeinde lookup does not depend on the individual document, so it
            # is done once instead of once per document.
            if gemeinde_id and dokumente:
                linked_ids = self._get_linked_dokument_ids(gemeinde_id)
                dokumente = [dokument for dokument in dokumente if dokument.id in linked_ids]

            logger.info(f"Found {len(dokumente)} BZO documents" + (f" for Gemeinde {gemeinde_id}" if gemeinde_id else ""))
            return dokumente
        except Exception as e:
            logger.error(f"Error querying BZO documents: {str(e)}", exc_info=True)
            raise

    def retrieve_pdf_content(self, dokument: Dokument) -> Optional[bytes]:
        """
        Retrieve PDF bytes from ComponentObjects using dokumentReferenz.

        This method is best-effort: errors are logged and reported as None
        instead of being raised.

        Args:
            dokument: Dokument record with dokumentReferenz field

        Returns:
            PDF bytes or None if retrieval fails
        """
        try:
            if not dokument.dokumentReferenz:
                logger.warning(f"Dokument {dokument.id} has no dokumentReferenz")
                return None

            # Retrieve PDF bytes (unrestricted - BZO documents are public, accessible to all users)
            pdf_bytes = self.componentInterface.getFileDataForPublicDocument(dokument.dokumentReferenz)
            if not pdf_bytes:
                logger.warning(f"Could not retrieve PDF content for file {dokument.dokumentReferenz}")
                return None

            logger.debug(f"Retrieved PDF content for dokument {dokument.id} ({len(pdf_bytes)} bytes)")
            return pdf_bytes
        except Exception as e:
            logger.error(f"Error retrieving PDF content for dokument {dokument.id}: {str(e)}", exc_info=True)
            return None

    def resolve_gemeinde_for_dokument(self, dokument: Dokument) -> Optional[str]:
        """
        Resolve gemeinde_id for a Dokument by checking Gemeinde.dokumente relationships.

        The first Gemeinde (in interface order) that links the document wins.
        Errors are logged and reported as None instead of being raised.

        Args:
            dokument: Dokument record

        Returns:
            gemeinde_id if found, None otherwise
        """
        try:
            gemeinden = self.realEstateInterface.getGemeinden(
                recordFilter={"mandateId": self.realEstateInterface.mandateId}
            )
            for gemeinde in gemeinden:
                if not gemeinde.dokumente:
                    continue
                for doc in gemeinde.dokumente:
                    doc_id = self._linked_dokument_id(doc)
                    # Entries without an id are skipped rather than compared.
                    if doc_id is not None and doc_id == dokument.id:
                        logger.debug(f"Resolved gemeinde_id {gemeinde.id} for dokument {dokument.id}")
                        return gemeinde.id

            logger.warning(f"Could not resolve gemeinde_id for dokument {dokument.id}")
            return None
        except Exception as e:
            logger.error(f"Error resolving gemeinde for dokument {dokument.id}: {str(e)}", exc_info=True)
            return None

    def extract_year_from_dokument(self, dokument: Dokument) -> Optional[int]:
        """
        Extract year from Dokument label or versionsbezeichnung.

        The label is searched first; versionsbezeichnung is only consulted
        when the label yields nothing. A year is a standalone four-digit
        number starting with 19 or 20.

        Args:
            dokument: Dokument record

        Returns:
            Year as integer if found, None otherwise
        """
        import re  # function-scope import, as in the original implementation

        # NOTE(review): \b treats "_" as a word character, so a label such as
        # "BZO_2019" does not match - confirm whether such labels occur.
        year_pattern = re.compile(r'\b(19|20)\d{2}\b')

        # Fields are read lazily and in priority order.
        for field_name in ("label", "versionsbezeichnung"):
            text = getattr(dokument, field_name)
            if not text:
                continue
            year_match = year_pattern.search(text)
            if year_match:
                return int(year_match.group())
        return None