"""
Document retriever for the BZO extraction pipeline.

Queries the Dokument table and retrieves PDF content from ComponentObjects.
"""

import logging
import re
from typing import List, Dict, Any, Optional

from modules.datamodels.datamodelRealEstate import Dokument, DokumentTyp, Gemeinde
from modules.interfaces.interfaceDbRealEstateObjects import RealEstateObjects
from modules.interfaces.interfaceDbComponentObjects import ComponentObjects

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class BZODocumentRetriever:
    """Retrieves BZO documents from database and ComponentObjects.

    The class wraps two injected interfaces: one for querying Dokument and
    Gemeinde records, and one for fetching the raw file bytes that a
    Dokument's ``dokumentReferenz`` points to.
    """

    # Four-digit year (1900-2099) that is not part of a longer digit run.
    # Digit lookarounds are used instead of ``\b`` because ``\b`` treats "_"
    # as a word character and would miss labels such as "BZO_2019.pdf".
    _YEAR_PATTERN = re.compile(r"(?<!\d)(?:19|20)\d{2}(?!\d)")

    def __init__(self, realEstateInterface: RealEstateObjects, componentInterface: ComponentObjects):
        """
        Initialize document retriever.

        Args:
            realEstateInterface: Real Estate database interface
            componentInterface: ComponentObjects interface for file retrieval
        """
        self.realEstateInterface = realEstateInterface
        self.componentInterface = componentInterface

    @staticmethod
    def _dokument_entry_id(entry: Any) -> Optional[Any]:
        """Return the ID of a ``Gemeinde.dokumente`` entry (dict or object), or None."""
        if isinstance(entry, dict):
            return entry.get("id")
        return getattr(entry, "id", None)

    def _linked_dokument_ids(self, gemeinde_id: str) -> set:
        """Return the set of Dokument IDs linked to the Gemeinde with ``gemeinde_id``."""
        gemeinden = self.realEstateInterface.getGemeinden(
            recordFilter={"mandateId": self.realEstateInterface.mandateId}
        )
        # Only the first Gemeinde with a matching ID is considered.
        gemeinde = next((g for g in gemeinden if g.id == gemeinde_id), None)
        if gemeinde is None or not gemeinde.dokumente:
            return set()

        entry_ids = (self._dokument_entry_id(entry) for entry in gemeinde.dokumente)
        # Entries without an ID can never identify a Dokument, so drop them.
        return {entry_id for entry_id in entry_ids if entry_id is not None}

    def get_documents_by_ids(self, dokument_ids: List[str]) -> List[Dokument]:
        """
        Retrieve specific documents by their IDs.

        IDs for which no record is returned are logged and skipped.

        Args:
            dokument_ids: List of dokument IDs to retrieve

        Returns:
            List of Dokument records, in the order of the requested IDs

        Raises:
            Exception: Any error from the database interface is logged and re-raised.
        """
        try:
            dokumente = []
            for dokument_id in dokument_ids:
                dokument = self.realEstateInterface.getDokument(dokument_id)
                if dokument:
                    dokumente.append(dokument)
                else:
                    logger.warning(f"Dokument {dokument_id} not found")

            logger.info(f"Retrieved {len(dokumente)} documents out of {len(dokument_ids)} requested")
            return dokumente

        except Exception as e:
            logger.error(f"Error retrieving documents by IDs: {str(e)}", exc_info=True)
            raise

    def get_bzo_documents(self, gemeinde_id: Optional[str] = None) -> List[Dokument]:
        """
        Query Dokument table for BZO documents.

        Args:
            gemeinde_id: Optional filter by specific Gemeinde

        Returns:
            List of Dokument records with BZO document type, deduplicated by ID

        Raises:
            Exception: Any error from the database interface is logged and re-raised.
        """
        try:
            # Query each document type separately (database connector doesn't
            # support array filters) and deduplicate by ID while merging.
            dokumente_by_id = {}
            for dokument_typ in (
                DokumentTyp.GEMEINDE_BZO_AKTUELL,
                DokumentTyp.GEMEINDE_BZO_REVISION,
            ):
                treffer = self.realEstateInterface.getDokumente(
                    recordFilter={
                        "mandateId": self.realEstateInterface.mandateId,
                        "dokumentTyp": dokument_typ,
                    }
                )
                for dokument in treffer:
                    dokumente_by_id[dokument.id] = dokument
            dokumente = list(dokumente_by_id.values())

            # If gemeinde_id provided, keep only documents linked through the
            # Gemeinde.dokumente relationship. The Gemeinde lookup happens once,
            # not once per document.
            if gemeinde_id and dokumente:
                linked_ids = self._linked_dokument_ids(gemeinde_id)
                dokumente = [dokument for dokument in dokumente if dokument.id in linked_ids]

            logger.info(f"Found {len(dokumente)} BZO documents" + (f" for Gemeinde {gemeinde_id}" if gemeinde_id else ""))
            return dokumente

        except Exception as e:
            logger.error(f"Error querying BZO documents: {str(e)}", exc_info=True)
            raise

    def retrieve_pdf_content(self, dokument: Dokument) -> Optional[bytes]:
        """
        Retrieve PDF bytes from ComponentObjects using dokumentReferenz.

        This is best-effort: errors are logged and reported as None rather
        than raised.

        Args:
            dokument: Dokument record with dokumentReferenz field

        Returns:
            PDF bytes or None if retrieval fails
        """
        try:
            if not dokument.dokumentReferenz:
                logger.warning(f"Dokument {dokument.id} has no dokumentReferenz")
                return None

            # Retrieve PDF bytes
            pdf_bytes = self.componentInterface.getFileData(dokument.dokumentReferenz)
            if not pdf_bytes:
                logger.warning(f"Could not retrieve PDF content for file {dokument.dokumentReferenz}")
                return None

            logger.debug(f"Retrieved PDF content for dokument {dokument.id} ({len(pdf_bytes)} bytes)")
            return pdf_bytes

        except Exception as e:
            logger.error(f"Error retrieving PDF content for dokument {dokument.id}: {str(e)}", exc_info=True)
            return None

    def resolve_gemeinde_for_dokument(self, dokument: Dokument) -> Optional[str]:
        """
        Resolve gemeinde_id for a Dokument by checking Gemeinde.dokumente relationships.

        The ID of the first Gemeinde that lists the Dokument is returned.
        Errors are logged and reported as None rather than raised.

        Args:
            dokument: Dokument record

        Returns:
            gemeinde_id if found, None otherwise
        """
        try:
            gemeinden = self.realEstateInterface.getGemeinden(
                recordFilter={"mandateId": self.realEstateInterface.mandateId}
            )

            for gemeinde in gemeinden:
                for entry in gemeinde.dokumente or ():
                    entry_id = self._dokument_entry_id(entry)
                    # Entries without an ID never match.
                    if entry_id is not None and entry_id == dokument.id:
                        logger.debug(f"Resolved gemeinde_id {gemeinde.id} for dokument {dokument.id}")
                        return gemeinde.id

            logger.warning(f"Could not resolve gemeinde_id for dokument {dokument.id}")
            return None

        except Exception as e:
            logger.error(f"Error resolving gemeinde for dokument {dokument.id}: {str(e)}", exc_info=True)
            return None

    def extract_year_from_dokument(self, dokument: Dokument) -> Optional[int]:
        """
        Extract year from Dokument label or versionsbezeichnung.

        The label is searched first; versionsbezeichnung is only used when the
        label is empty or contains no year. A year is a four-digit number
        starting with 19 or 20 that is not part of a longer digit sequence.

        Args:
            dokument: Dokument record

        Returns:
            Year as integer if found, None otherwise
        """
        for text in (dokument.label, dokument.versionsbezeichnung):
            if not text:
                continue
            year_match = self._YEAR_PATTERN.search(text)
            if year_match:
                return int(year_match.group())

        return None