# Copyright (c) 2025 Patrick Motsch # All rights reserved. """ Database health utilities — table statistics and orphan detection/cleanup. All functions are intended for SysAdmin use only (access control in the route layer). """ import logging import time import threading from dataclasses import dataclass, asdict from typing import Dict, List, Optional, Set import psycopg2 import psycopg2.extras from modules.shared.configuration import APP_CONFIG from modules.shared.dbRegistry import getRegisteredDatabases from modules.shared.fkRegistry import getFkRelationships, FkRelationship logger = logging.getLogger(__name__) _ORPHAN_CACHE_TTL = 300 # 5 minutes _orphanCacheLock = threading.Lock() _orphanCache: Optional[Dict] = None # {"ts": float, "results": [...]} # --------------------------------------------------------------------------- # Dataclasses # --------------------------------------------------------------------------- @dataclass class TableStats: db: str table: str estimatedRows: int totalSizeBytes: int indexSizeBytes: int lastVacuum: Optional[str] lastAnalyze: Optional[str] @dataclass class OrphanResult: sourceDb: str sourceTable: str sourceColumn: str targetDb: str targetTable: str targetColumn: str orphanCount: int sourceRowCount: int = 0 targetRowCount: int = 0 targetEmpty: bool = False wouldDeleteAll: bool = False @dataclass class OrphanRecord: """A single orphan source-row -- includes the unresolved FK value plus the full row data. Used by the SysAdmin UI download button so the human can verify the orphan list before pressing "clean". """ sourceDb: str sourceTable: str sourceColumn: str targetDb: str targetTable: str targetColumn: str orphanFkValue: str rowId: Optional[str] row: Dict # --------------------------------------------------------------------------- # Safety thresholds for cleanup # --------------------------------------------------------------------------- # If a single cleanup would delete more than this fraction of the source table, # refuse without an explicit force=True. Protects against catastrophic wipes # caused by misconfigured / empty target tables. _MAX_CLEANUP_FRACTION = 0.5 class OrphanCleanupRefused(Exception): """Raised when a cleanup is refused for safety reasons (use force=True to override).""" # --------------------------------------------------------------------------- # Low-level DB helpers (read-only, lightweight connections) # --------------------------------------------------------------------------- def _getConnection(dbName: str): """Open a psycopg2 connection to the given registered database.""" registeredDbs = getRegisteredDatabases() configPrefix = registeredDbs.get(dbName) if configPrefix is None: raise ValueError(f"Database '{dbName}' is not registered.") hostKey = f"{configPrefix}_HOST" if configPrefix != "DB" else "DB_HOST" portKey = f"{configPrefix}_PORT" if configPrefix != "DB" else "DB_PORT" userKey = f"{configPrefix}_USER" if configPrefix != "DB" else "DB_USER" passwordKey = ( f"{configPrefix}_PASSWORD_SECRET" if configPrefix != "DB" else "DB_PASSWORD_SECRET" ) return psycopg2.connect( host=APP_CONFIG.get(hostKey, "localhost"), port=int(APP_CONFIG.get(portKey, 5432)), database=dbName, user=APP_CONFIG.get(userKey), password=APP_CONFIG.get(passwordKey), client_encoding="utf8", cursor_factory=psycopg2.extras.RealDictCursor, ) # --------------------------------------------------------------------------- # Table statistics # --------------------------------------------------------------------------- def _getTableStats(dbFilter: Optional[str] = None) -> List[dict]: """Query pg_stat_user_tables + pg_total_relation_size for every registered DB. Returns a list of TableStats dicts, optionally filtered by database name. """ registeredDbs = getRegisteredDatabases() if dbFilter: registeredDbs = {k: v for k, v in registeredDbs.items() if k == dbFilter} results: List[dict] = [] for dbName in sorted(registeredDbs): try: conn = _getConnection(dbName) try: with conn.cursor() as cur: cur.execute(""" SELECT s.relname AS "table", s.n_live_tup AS "estimatedRows", pg_total_relation_size(quote_ident(s.relname)) AS "totalSizeBytes", pg_indexes_size(quote_ident(s.relname)) AS "indexSizeBytes", s.last_vacuum::text AS "lastVacuum", s.last_analyze::text AS "lastAnalyze" FROM pg_stat_user_tables s WHERE s.schemaname = 'public' AND s.relname NOT LIKE '\\_%%' ORDER BY s.relname """) for row in cur.fetchall(): results.append(asdict(TableStats( db=dbName, table=row["table"], estimatedRows=row["estimatedRows"], totalSizeBytes=row["totalSizeBytes"], indexSizeBytes=row["indexSizeBytes"], lastVacuum=row["lastVacuum"], lastAnalyze=row["lastAnalyze"], ))) finally: conn.close() except Exception as e: logger.error(f"Failed to get table stats for {dbName}: {e}") return results # --------------------------------------------------------------------------- # Orphan scanning # --------------------------------------------------------------------------- def _loadParentIds(conn, tableName: str, columnName: str) -> Set[str]: """Load all distinct values of a column from a table (for cross-DB checks).""" ids: Set[str] = set() with conn.cursor() as cur: cur.execute(f'SELECT DISTINCT "{columnName}" FROM "{tableName}"') for row in cur.fetchall(): val = row[columnName] if val is not None: ids.add(str(val)) return ids def _loadPhysicalColumns(conn, tableName: str) -> Set[str]: """Return the set of physical (scalar) columns present on a table. Used by the orphan scanner to skip FK relationships whose ``sourceColumn`` is annotated on the Pydantic model but does NOT exist as a physical column -- e.g. virtual / computed fields, or fields that the database interface decided to fold into a JSONB blob (List/Dict typed fields). Comparing a JSONB array against a scalar via ``=`` always fails and would otherwise flag every single source row as an orphan (the user-reported "false positives"). """ cols: Set[str] = set() try: with conn.cursor() as cur: cur.execute( """ SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = %s """, (tableName,), ) for row in cur.fetchall(): cols.add(row["column_name"]) except Exception: pass return cols def _countRows(conn, tableName: str) -> int: """Count physical rows in a table. Returns 0 on any error.""" try: with conn.cursor() as cur: cur.execute(f'SELECT COUNT(*) AS cnt FROM "{tableName}"') return int(cur.fetchone()["cnt"]) except Exception: return 0 def _countNonNullSource(conn, tableName: str, columnName: str) -> int: """Count source rows where the FK column is non-null/non-empty.""" try: with conn.cursor() as cur: cur.execute(f""" SELECT COUNT(*) AS cnt FROM "{tableName}" WHERE "{columnName}" IS NOT NULL AND "{columnName}" != '' """) return int(cur.fetchone()["cnt"]) except Exception: return 0 def _countOrphansSameDb( conn, sourceTable: str, sourceColumn: str, targetTable: str, targetColumn: str, ) -> int: """Count orphans when source and target live in the same DB.""" with conn.cursor() as cur: cur.execute(f""" SELECT COUNT(*) AS cnt FROM "{sourceTable}" s WHERE s."{sourceColumn}" IS NOT NULL AND s."{sourceColumn}" != '' AND NOT EXISTS ( SELECT 1 FROM "{targetTable}" t WHERE t."{targetColumn}" = s."{sourceColumn}" ) """) return cur.fetchone()["cnt"] def _countOrphansCrossDb( sourceConn, sourceTable: str, sourceColumn: str, parentIds: Set[str], ) -> int: """Count orphans when parent IDs come from a different DB.""" if not parentIds: with sourceConn.cursor() as cur: cur.execute(f""" SELECT COUNT(*) AS cnt FROM "{sourceTable}" WHERE "{sourceColumn}" IS NOT NULL AND "{sourceColumn}" != '' """) return cur.fetchone()["cnt"] with sourceConn.cursor() as cur: cur.execute(f""" SELECT COUNT(*) AS cnt FROM "{sourceTable}" WHERE "{sourceColumn}" IS NOT NULL AND "{sourceColumn}" != '' AND "{sourceColumn}" NOT IN ( SELECT unnest(%(ids)s::text[]) ) """, {"ids": list(parentIds)}) return cur.fetchone()["cnt"] def _scanOrphans(dbFilter: Optional[str] = None) -> List[dict]: """Scan for orphaned records across all FK relationships. Uses a 5-minute cache to avoid repeated heavy scans. """ global _orphanCache with _orphanCacheLock: if _orphanCache and (time.time() - _orphanCache["ts"]) < _ORPHAN_CACHE_TTL: cached = _orphanCache["results"] if dbFilter: return [r for r in cached if r["sourceDb"] == dbFilter] return list(cached) relationships = getFkRelationships() if dbFilter: relationships = [r for r in relationships if r.sourceDb == dbFilter] connCache: Dict[str, any] = {} tableCache: Dict[str, Set[str]] = {} columnCache: Dict[str, Set[str]] = {} parentIdCache: Dict[str, Set[str]] = {} results: List[dict] = [] def _ensureConn(dbName: str): if dbName not in connCache: connCache[dbName] = _getConnection(dbName) return connCache[dbName] def _existingTables(dbName: str) -> Set[str]: """Cached lookup of physically existing public tables in a DB.""" if dbName not in tableCache: try: conn = _ensureConn(dbName) with conn.cursor() as cur: cur.execute(""" SELECT table_name FROM information_schema.tables WHERE table_schema = 'public' """) tableCache[dbName] = {row["table_name"] for row in cur.fetchall()} except Exception: tableCache[dbName] = set() return tableCache[dbName] def _existingColumns(dbName: str, tableName: str) -> Set[str]: cacheKey = f"{dbName}.{tableName}" if cacheKey not in columnCache: try: columnCache[cacheKey] = _loadPhysicalColumns(_ensureConn(dbName), tableName) except Exception: columnCache[cacheKey] = set() return columnCache[cacheKey] try: for rel in relationships: try: if rel.softFk: logger.debug( "Skipping soft FK %s.%s.%s -> %s.%s.%s", rel.sourceDb, rel.sourceTable, rel.sourceColumn, rel.targetDb, rel.targetTable, rel.targetColumn, ) continue sourceTables = _existingTables(rel.sourceDb) if rel.sourceTable not in sourceTables: continue if rel.sourceDb == rel.targetDb: if rel.targetTable not in sourceTables: continue else: targetTables = _existingTables(rel.targetDb) if rel.targetTable not in targetTables: continue # Skip FK annotations whose source column is not a physical # scalar column (virtual / JSONB-resident / computed field). # See _loadPhysicalColumns docstring for why this matters. sourceColumns = _existingColumns(rel.sourceDb, rel.sourceTable) if rel.sourceColumn not in sourceColumns: logger.debug( "Skipping FK %s.%s.%s -- column not present as physical column", rel.sourceDb, rel.sourceTable, rel.sourceColumn, ) continue targetColumns = _existingColumns(rel.targetDb, rel.targetTable) if rel.targetColumn not in targetColumns: logger.debug( "Skipping FK %s.%s.%s -> %s.%s.%s -- target column not present", rel.sourceDb, rel.sourceTable, rel.sourceColumn, rel.targetDb, rel.targetTable, rel.targetColumn, ) continue sourceConn = _ensureConn(rel.sourceDb) if rel.sourceDb == rel.targetDb: targetRowCount = _countRows(sourceConn, rel.targetTable) count = _countOrphansSameDb( sourceConn, rel.sourceTable, rel.sourceColumn, rel.targetTable, rel.targetColumn, ) else: targetConn = _ensureConn(rel.targetDb) targetRowCount = _countRows(targetConn, rel.targetTable) parentKey = f"{rel.targetDb}.{rel.targetTable}.{rel.targetColumn}" if parentKey not in parentIdCache: parentIdCache[parentKey] = _loadParentIds( targetConn, rel.targetTable, rel.targetColumn, ) count = _countOrphansCrossDb( sourceConn, rel.sourceTable, rel.sourceColumn, parentIdCache[parentKey], ) sourceRowCount = _countNonNullSource( sourceConn, rel.sourceTable, rel.sourceColumn, ) wouldDeleteAll = (count > 0 and count >= sourceRowCount) targetEmpty = (targetRowCount == 0) results.append(asdict(OrphanResult( sourceDb=rel.sourceDb, sourceTable=rel.sourceTable, sourceColumn=rel.sourceColumn, targetDb=rel.targetDb, targetTable=rel.targetTable, targetColumn=rel.targetColumn, orphanCount=count, sourceRowCount=sourceRowCount, targetRowCount=targetRowCount, targetEmpty=targetEmpty, wouldDeleteAll=wouldDeleteAll, ))) except Exception as e: logger.warning( f"Orphan scan failed for {rel.sourceDb}.{rel.sourceTable}.{rel.sourceColumn}: {e}" ) for dbKey in (rel.sourceDb, rel.targetDb): if dbKey in connCache: try: connCache[dbKey].rollback() except Exception: pass finally: for conn in connCache.values(): try: conn.close() except Exception: pass with _orphanCacheLock: _orphanCache = {"ts": time.time(), "results": results} return results # --------------------------------------------------------------------------- # Orphan cleanup # --------------------------------------------------------------------------- def _cleanOrphans(db: str, table: str, column: str, force: bool = False) -> int: """Delete orphaned records for a single FK relationship. Returns count deleted. Safety guards (require force=True to override): - Refuses if the target table is empty (likely misconfiguration / lazy table). - Refuses if the cleanup would delete >= _MAX_CLEANUP_FRACTION of the source rows. These guards prevent catastrophic wipes (e.g. emptying FeatureInstance because the User table happened to be empty in the wrong DB at scan time). """ relationships = getFkRelationships() rel = next( (r for r in relationships if r.sourceDb == db and r.sourceTable == table and r.sourceColumn == column), None, ) if rel is None: raise ValueError(f"No FK relationship found for {db}.{table}.{column}") if rel.softFk: raise OrphanCleanupRefused( f"Refusing cleanup: {rel.sourceDb}.{rel.sourceTable}.{rel.sourceColumn} is " f"declared as a soft FK (sentinel / lineage reference) and is intentionally " f"excluded from orphan deletion." ) conn = _getConnection(rel.sourceDb) targetConn = None try: if rel.sourceDb == rel.targetDb: targetRowCount = _countRows(conn, rel.targetTable) parentIds: Optional[Set[str]] = None else: targetConn = _getConnection(rel.targetDb) targetRowCount = _countRows(targetConn, rel.targetTable) parentIds = _loadParentIds(targetConn, rel.targetTable, rel.targetColumn) sourceRowCount = _countNonNullSource(conn, rel.sourceTable, rel.sourceColumn) if not force: if targetRowCount == 0 and sourceRowCount > 0: raise OrphanCleanupRefused( f"Refusing cleanup: target table '{rel.targetDb}.{rel.targetTable}' " f"is empty but source '{rel.sourceDb}.{rel.sourceTable}' has " f"{sourceRowCount} rows with non-null '{rel.sourceColumn}'. " f"This likely indicates a misconfiguration. Use force=True to override." ) if rel.sourceDb == rel.targetDb: with conn.cursor() as cur: cur.execute(f""" SELECT COUNT(*) AS cnt FROM "{rel.sourceTable}" s WHERE s."{rel.sourceColumn}" IS NOT NULL AND s."{rel.sourceColumn}" != '' AND NOT EXISTS ( SELECT 1 FROM "{rel.targetTable}" t WHERE t."{rel.targetColumn}" = s."{rel.sourceColumn}" ) """) wouldDelete = int(cur.fetchone()["cnt"]) else: if not parentIds: wouldDelete = sourceRowCount else: with conn.cursor() as cur: cur.execute(f""" SELECT COUNT(*) AS cnt FROM "{rel.sourceTable}" WHERE "{rel.sourceColumn}" IS NOT NULL AND "{rel.sourceColumn}" != '' AND "{rel.sourceColumn}" NOT IN ( SELECT unnest(%(ids)s::text[]) ) """, {"ids": list(parentIds)}) wouldDelete = int(cur.fetchone()["cnt"]) if not force and sourceRowCount > 0: fraction = wouldDelete / sourceRowCount if fraction >= _MAX_CLEANUP_FRACTION: raise OrphanCleanupRefused( f"Refusing cleanup: would delete {wouldDelete} of {sourceRowCount} " f"non-null rows ({fraction:.0%}) from '{rel.sourceDb}.{rel.sourceTable}'. " f"Threshold is {_MAX_CLEANUP_FRACTION:.0%}. Use force=True to override." ) if rel.sourceDb == rel.targetDb: with conn.cursor() as cur: cur.execute(f""" DELETE FROM "{rel.sourceTable}" WHERE "{rel.sourceColumn}" IS NOT NULL AND "{rel.sourceColumn}" != '' AND NOT EXISTS ( SELECT 1 FROM "{rel.targetTable}" t WHERE t."{rel.targetColumn}" = "{rel.sourceTable}"."{rel.sourceColumn}" ) """) deleted = cur.rowcount conn.commit() else: if not parentIds: with conn.cursor() as cur: cur.execute(f""" DELETE FROM "{rel.sourceTable}" WHERE "{rel.sourceColumn}" IS NOT NULL AND "{rel.sourceColumn}" != '' """) deleted = cur.rowcount else: with conn.cursor() as cur: cur.execute(f""" DELETE FROM "{rel.sourceTable}" WHERE "{rel.sourceColumn}" IS NOT NULL AND "{rel.sourceColumn}" != '' AND "{rel.sourceColumn}" NOT IN ( SELECT unnest(%(ids)s::text[]) ) """, {"ids": list(parentIds)}) deleted = cur.rowcount conn.commit() except Exception: conn.rollback() raise finally: if targetConn is not None: try: targetConn.close() except Exception: pass conn.close() _invalidateOrphanCache() logger.info( f"Cleaned {deleted} orphans from {db}.{table}.{column} (force={force})" ) return deleted def _isUserIdFk(targetTable: str, targetColumn: str) -> bool: """Match the UserInDB.id reference exactly (case-insensitive on table name). Orphans pointing at deleted users are a distinct category: they accumulate naturally on every audit / billing / membership row when a user is deleted, and the SysAdmin typically wants to handle them separately from "real" FK drift. The orphan UI exposes a checkbox `excludeUserFks` that hides them from the list and skips them in `clean-all`; this helper keeps the rule in one place so frontend + clean-all + scan stay in lock-step. """ return targetTable.lower() == "userindb" and targetColumn == "id" def _cleanAllOrphans(force: bool = False, excludeUserFks: bool = False) -> List[dict]: """Clean all detected orphans. Returns list of {db, table, column, deleted, [error|skipped]}. Safety: each individual cleanup re-validates target row counts at delete-time to avoid cascading wipes (e.g. one delete emptying a target table that the next iteration depends on). Without force=True, dangerous cleanups are skipped. When ``excludeUserFks=True``, FK relationships pointing at ``UserInDB.id`` are skipped entirely — those orphans (deleted-user remnants in audit / billing / membership tables) are typically handled by a dedicated user purge workflow, not by generic FK cleanup. """ orphans = _scanOrphans() results = [] for orphan in orphans: if orphan.get("orphanCount", 0) <= 0: continue if excludeUserFks and _isUserIdFk(orphan.get("targetTable", ""), orphan.get("targetColumn", "")): continue try: deleted = _cleanOrphans( orphan["sourceDb"], orphan["sourceTable"], orphan["sourceColumn"], force=force, ) results.append({ "db": orphan["sourceDb"], "table": orphan["sourceTable"], "column": orphan["sourceColumn"], "deleted": deleted, }) except OrphanCleanupRefused as e: logger.warning( f"Skipping orphan cleanup for {orphan['sourceDb']}.{orphan['sourceTable']}.{orphan['sourceColumn']}: {e}" ) results.append({ "db": orphan["sourceDb"], "table": orphan["sourceTable"], "column": orphan["sourceColumn"], "deleted": 0, "skipped": str(e), }) except Exception as e: logger.error( f"Failed to clean orphans for {orphan['sourceDb']}.{orphan['sourceTable']}.{orphan['sourceColumn']}: {e}" ) results.append({ "db": orphan["sourceDb"], "table": orphan["sourceTable"], "column": orphan["sourceColumn"], "deleted": 0, "error": str(e), }) return results def _invalidateOrphanCache() -> None: global _orphanCache with _orphanCacheLock: _orphanCache = None # --------------------------------------------------------------------------- # Listing orphans (for SysAdmin "download / inspect" workflow) # --------------------------------------------------------------------------- def _listOrphans( db: str, table: str, column: str, limit: int = 1000, ) -> List[dict]: """Return up to ``limit`` actual orphan source-rows for one FK relationship. Each entry is ``{"orphanFkValue": str, "rowId": str|None, "row": dict}`` so the SysAdmin UI can present them as a download (CSV/JSON) for review before the destructive cleanup is triggered. """ relationships = getFkRelationships() rel = next( (r for r in relationships if r.sourceDb == db and r.sourceTable == table and r.sourceColumn == column), None, ) if rel is None: raise ValueError(f"No FK relationship found for {db}.{table}.{column}") if rel.softFk: return [] safeLimit = max(1, min(int(limit), 10000)) sourceConn = _getConnection(rel.sourceDb) targetConn = None try: sourceColumns = _loadPhysicalColumns(sourceConn, rel.sourceTable) if rel.sourceColumn not in sourceColumns: return [] if rel.sourceDb == rel.targetDb: targetColumns = _loadPhysicalColumns(sourceConn, rel.targetTable) if rel.targetColumn not in targetColumns: return [] with sourceConn.cursor() as cur: cur.execute(f""" SELECT s.* FROM "{rel.sourceTable}" s WHERE s."{rel.sourceColumn}" IS NOT NULL AND s."{rel.sourceColumn}" != '' AND NOT EXISTS ( SELECT 1 FROM "{rel.targetTable}" t WHERE t."{rel.targetColumn}" = s."{rel.sourceColumn}" ) LIMIT %s """, (safeLimit,)) rows = cur.fetchall() else: targetConn = _getConnection(rel.targetDb) targetColumns = _loadPhysicalColumns(targetConn, rel.targetTable) if rel.targetColumn not in targetColumns: return [] parentIds = _loadParentIds(targetConn, rel.targetTable, rel.targetColumn) with sourceConn.cursor() as cur: if not parentIds: cur.execute(f""" SELECT * FROM "{rel.sourceTable}" WHERE "{rel.sourceColumn}" IS NOT NULL AND "{rel.sourceColumn}" != '' LIMIT %s """, (safeLimit,)) else: cur.execute(f""" SELECT * FROM "{rel.sourceTable}" WHERE "{rel.sourceColumn}" IS NOT NULL AND "{rel.sourceColumn}" != '' AND "{rel.sourceColumn}" NOT IN ( SELECT unnest(%(ids)s::text[]) ) LIMIT %(lim)s """, {"ids": list(parentIds), "lim": safeLimit}) rows = cur.fetchall() finally: if targetConn is not None: try: targetConn.close() except Exception: pass sourceConn.close() out: List[dict] = [] for row in rows: rowDict = {k: _jsonSafe(v) for k, v in dict(row).items()} out.append(asdict(OrphanRecord( sourceDb=rel.sourceDb, sourceTable=rel.sourceTable, sourceColumn=rel.sourceColumn, targetDb=rel.targetDb, targetTable=rel.targetTable, targetColumn=rel.targetColumn, orphanFkValue=str(rowDict.get(rel.sourceColumn, "")), rowId=str(rowDict.get("id")) if rowDict.get("id") is not None else None, row=rowDict, ))) return out def _jsonSafe(v): """Coerce psycopg2 row values into JSON-serialisable primitives.""" import datetime import decimal import uuid if v is None or isinstance(v, (str, int, float, bool)): return v if isinstance(v, (datetime.datetime, datetime.date, datetime.time)): return v.isoformat() if isinstance(v, decimal.Decimal): return float(v) if isinstance(v, uuid.UUID): return str(v) if isinstance(v, (list, tuple)): return [_jsonSafe(x) for x in v] if isinstance(v, dict): return {str(k): _jsonSafe(val) for k, val in v.items()} if isinstance(v, (bytes, bytearray, memoryview)): try: return bytes(v).decode("utf-8", errors="replace") except Exception: return repr(v) return str(v)