gateway/modules/system/databaseHealth.py
2026-04-21 00:50:36 +02:00

756 lines
28 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Database health utilities — table statistics and orphan detection/cleanup.
All functions are intended for SysAdmin use only (access control in the route layer).
"""
import logging
import time
import threading
from dataclasses import dataclass, asdict
from typing import Dict, List, Optional, Set
import psycopg2
import psycopg2.extras
from modules.shared.configuration import APP_CONFIG
from modules.shared.dbRegistry import _getRegisteredDatabases
from modules.shared.fkRegistry import _getFkRelationships, FkRelationship
logger = logging.getLogger(__name__)

# Orphan-scan results are cached for this many seconds so repeated UI
# refreshes do not re-run the (potentially heavy) cross-database scan.
_ORPHAN_CACHE_TTL = 300  # 5 minutes
# Guards _orphanCache against concurrent readers/writers (scan vs. invalidate).
_orphanCacheLock = threading.Lock()
# Cached scan payload: {"ts": float, "results": [...]} or None when empty/invalidated.
_orphanCache: Optional[Dict] = None
# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------
@dataclass
class TableStats:
    """Per-table statistics for one registered database (serialised via asdict)."""
    db: str                    # registered database name
    table: str                 # table name (public schema)
    estimatedRows: int         # planner estimate (pg_stat_user_tables.n_live_tup), not exact
    totalSizeBytes: int        # table + indexes + TOAST size
    indexSizeBytes: int        # size of the table's indexes only
    lastVacuum: Optional[str]  # last manual VACUUM timestamp as text, or None
    lastAnalyze: Optional[str] # last manual ANALYZE timestamp as text, or None
@dataclass
class OrphanResult:
    """Scan summary for one FK relationship: orphan count plus the safety
    metadata the cleanup UI needs to warn about dangerous deletes."""
    sourceDb: str
    sourceTable: str
    sourceColumn: str             # FK column on the source table
    targetDb: str
    targetTable: str
    targetColumn: str             # referenced column on the target table
    orphanCount: int              # source rows whose FK value has no target match
    sourceRowCount: int = 0       # source rows with a non-null / non-empty FK value
    targetRowCount: int = 0       # physical row count of the target table at scan time
    targetEmpty: bool = False     # True when the target table had no rows at all
    wouldDeleteAll: bool = False  # True when cleanup would remove every non-null source row
@dataclass
class OrphanRecord:
    """A single orphan source-row -- includes the unresolved FK value plus the full row data.

    Used by the SysAdmin UI download button so the human can verify the orphan
    list before pressing "clean".
    """
    sourceDb: str
    sourceTable: str
    sourceColumn: str      # FK column whose value failed to resolve
    targetDb: str
    targetTable: str
    targetColumn: str      # column the FK value was matched against
    orphanFkValue: str     # the dangling FK value itself (stringified)
    rowId: Optional[str]   # the row's "id" column if present, else None
    row: Dict              # full source row, values coerced JSON-safe via _jsonSafe
# ---------------------------------------------------------------------------
# Safety thresholds for cleanup
# ---------------------------------------------------------------------------
# If a single cleanup would delete more than this fraction of the source table's
# non-null FK rows, refuse without an explicit force=True. Protects against
# catastrophic wipes caused by misconfigured / empty target tables.
_MAX_CLEANUP_FRACTION = 0.5
class OrphanCleanupRefused(Exception):
    """Raised when a cleanup is refused for safety reasons (use force=True to override).

    The exception message explains which guard tripped; callers surface it to
    the SysAdmin UI rather than treating it as a hard failure.
    """
# ---------------------------------------------------------------------------
# Low-level DB helpers (read-only, lightweight connections)
# ---------------------------------------------------------------------------
def _getConnection(dbName: str):
    """Open a psycopg2 connection to the given registered database.

    The connection uses RealDictCursor so all queries in this module can
    address result columns by name.

    Args:
        dbName: key of a database registered via the DB registry.

    Raises:
        ValueError: when ``dbName`` is not registered.
    """
    registeredDbs = _getRegisteredDatabases()
    configPrefix = registeredDbs.get(dbName)
    if configPrefix is None:
        raise ValueError(f"Database '{dbName}' is not registered.")
    # NOTE: when configPrefix == "DB" the f-strings below already yield
    # "DB_HOST", "DB_PORT", etc. -- the previous special-case conditionals
    # produced the exact same keys and were dead code.
    return psycopg2.connect(
        host=APP_CONFIG.get(f"{configPrefix}_HOST", "localhost"),
        port=int(APP_CONFIG.get(f"{configPrefix}_PORT", 5432)),
        database=dbName,
        user=APP_CONFIG.get(f"{configPrefix}_USER"),
        password=APP_CONFIG.get(f"{configPrefix}_PASSWORD_SECRET"),
        client_encoding="utf8",
        cursor_factory=psycopg2.extras.RealDictCursor,
    )
# ---------------------------------------------------------------------------
# Table statistics
# ---------------------------------------------------------------------------
def _getTableStats(dbFilter: Optional[str] = None) -> List[dict]:
    """Query pg_stat_user_tables + size functions for every registered DB.

    Returns a list of TableStats dicts, optionally filtered by database name.
    Databases that cannot be reached are logged and skipped (best-effort).

    Fix: the size functions are now fed ``s.relid`` (the table's oid, exposed
    by pg_stat_user_tables) instead of ``quote_ident(s.relname)``. The text
    form is resolved through the current search_path and can pick the wrong
    (or no) table when a same-named table exists in another schema.
    """
    registeredDbs = _getRegisteredDatabases()
    if dbFilter:
        registeredDbs = {k: v for k, v in registeredDbs.items() if k == dbFilter}
    results: List[dict] = []
    for dbName in sorted(registeredDbs):
        try:
            conn = _getConnection(dbName)
            try:
                with conn.cursor() as cur:
                    # Tables starting with "_" are internal and excluded.
                    cur.execute("""
                        SELECT
                            s.relname AS "table",
                            s.n_live_tup AS "estimatedRows",
                            pg_total_relation_size(s.relid) AS "totalSizeBytes",
                            pg_indexes_size(s.relid) AS "indexSizeBytes",
                            s.last_vacuum::text AS "lastVacuum",
                            s.last_analyze::text AS "lastAnalyze"
                        FROM pg_stat_user_tables s
                        WHERE s.schemaname = 'public'
                        AND s.relname NOT LIKE '\\_%%'
                        ORDER BY s.relname
                    """)
                    for row in cur.fetchall():
                        results.append(asdict(TableStats(
                            db=dbName,
                            table=row["table"],
                            estimatedRows=row["estimatedRows"],
                            totalSizeBytes=row["totalSizeBytes"],
                            indexSizeBytes=row["indexSizeBytes"],
                            lastVacuum=row["lastVacuum"],
                            lastAnalyze=row["lastAnalyze"],
                        )))
            finally:
                conn.close()
        except Exception as e:
            logger.error(f"Failed to get table stats for {dbName}: {e}")
    return results
# ---------------------------------------------------------------------------
# Orphan scanning
# ---------------------------------------------------------------------------
def _loadParentIds(conn, tableName: str, columnName: str) -> Set[str]:
    """Collect every distinct non-null value of ``tableName.columnName`` as strings.

    Used to materialise the parent-key set when source and target tables live
    in different databases (no cross-DB join is possible).
    """
    with conn.cursor() as cur:
        cur.execute(f'SELECT DISTINCT "{columnName}" FROM "{tableName}"')
        return {
            str(row[columnName])
            for row in cur.fetchall()
            if row[columnName] is not None
        }
def _loadPhysicalColumns(conn, tableName: str) -> Set[str]:
    """Return the set of physical (scalar) columns present on a table.

    Used by the orphan scanner to skip FK relationships whose ``sourceColumn``
    is annotated on the Pydantic model but does NOT exist as a physical column
    -- e.g. virtual / computed fields, or fields that the database interface
    decided to fold into a JSONB blob (List/Dict typed fields). Comparing a
    JSONB array against a scalar via ``=`` always fails and would otherwise
    flag every single source row as an orphan (the user-reported "false
    positives").

    Best-effort: returns an empty set when the lookup fails for any reason.
    """
    query = """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_schema = 'public' AND table_name = %s
    """
    try:
        with conn.cursor() as cur:
            cur.execute(query, (tableName,))
            return {row["column_name"] for row in cur.fetchall()}
    except Exception:
        # Deliberate swallow: callers treat "unknown" the same as "no columns".
        return set()
def _countRows(conn, tableName: str) -> int:
    """Best-effort COUNT(*) of a table. Returns 0 on any error."""
    try:
        with conn.cursor() as cur:
            cur.execute(f'SELECT COUNT(*) AS cnt FROM "{tableName}"')
            countRow = cur.fetchone()
        return int(countRow["cnt"])
    except Exception:
        return 0
def _countNonNullSource(conn, tableName: str, columnName: str) -> int:
    """Count rows whose FK column holds a usable value (not NULL, not '').

    Returns 0 on any error -- this feeds the safety-ratio checks only, so a
    failed count must never abort the caller.
    """
    query = f"""
        SELECT COUNT(*) AS cnt
        FROM "{tableName}"
        WHERE "{columnName}" IS NOT NULL
          AND "{columnName}" != ''
    """
    try:
        with conn.cursor() as cur:
            cur.execute(query)
            return int(cur.fetchone()["cnt"])
    except Exception:
        return 0
def _countOrphansSameDb(
    conn, sourceTable: str, sourceColumn: str,
    targetTable: str, targetColumn: str,
) -> int:
    """Count orphans when source and target tables share one database.

    A row is an orphan when its FK column is non-null/non-empty and no target
    row carries a matching value (anti-join via NOT EXISTS).
    """
    query = f"""
        SELECT COUNT(*) AS cnt
        FROM "{sourceTable}" s
        WHERE s."{sourceColumn}" IS NOT NULL
          AND s."{sourceColumn}" != ''
          AND NOT EXISTS (
              SELECT 1 FROM "{targetTable}" t
              WHERE t."{targetColumn}" = s."{sourceColumn}"
          )
    """
    with conn.cursor() as cur:
        cur.execute(query)
        return cur.fetchone()["cnt"]
def _countOrphansCrossDb(
    sourceConn, sourceTable: str, sourceColumn: str,
    parentIds: Set[str],
) -> int:
    """Count orphans against a parent-ID set loaded from a different database.

    When ``parentIds`` is empty every non-null FK value is dangling, so the
    check degenerates to a plain non-null/non-empty count.
    """
    if not parentIds:
        emptyParentQuery = f"""
            SELECT COUNT(*) AS cnt
            FROM "{sourceTable}"
            WHERE "{sourceColumn}" IS NOT NULL
              AND "{sourceColumn}" != ''
        """
        with sourceConn.cursor() as cur:
            cur.execute(emptyParentQuery)
            return cur.fetchone()["cnt"]
    # The parent set is shipped as a text[] parameter and unnested server-side.
    antiJoinQuery = f"""
        SELECT COUNT(*) AS cnt
        FROM "{sourceTable}"
        WHERE "{sourceColumn}" IS NOT NULL
          AND "{sourceColumn}" != ''
          AND "{sourceColumn}" NOT IN (
              SELECT unnest(%(ids)s::text[])
          )
    """
    with sourceConn.cursor() as cur:
        cur.execute(antiJoinQuery, {"ids": list(parentIds)})
        return cur.fetchone()["cnt"]
def _scanOrphans(dbFilter: Optional[str] = None) -> List[dict]:
    """Scan every registered FK relationship for orphaned source rows.

    Returns a list of OrphanResult dicts. Results of a full scan are cached
    for _ORPHAN_CACHE_TTL seconds; fresh cached results also serve filtered
    requests.

    Fix: previously a filtered scan (dbFilter set) overwrote the cache with
    its PARTIAL result list, so a later unfiltered call within the TTL
    silently returned only that one database's results. The cache is now
    written only by unfiltered scans. (Also: the connection-cache annotation
    used the builtin ``any`` instead of a real type.)
    """
    global _orphanCache
    with _orphanCacheLock:
        if _orphanCache and (time.time() - _orphanCache["ts"]) < _ORPHAN_CACHE_TTL:
            cached = _orphanCache["results"]
            if dbFilter:
                return [r for r in cached if r["sourceDb"] == dbFilter]
            return list(cached)
    relationships = _getFkRelationships()
    if dbFilter:
        relationships = [r for r in relationships if r.sourceDb == dbFilter]
    connCache: Dict[str, object] = {}        # dbName -> open connection
    tableCache: Dict[str, Set[str]] = {}     # dbName -> existing public tables
    columnCache: Dict[str, Set[str]] = {}    # "db.table" -> physical columns
    parentIdCache: Dict[str, Set[str]] = {}  # "db.table.column" -> distinct parent ids
    results: List[dict] = []

    def _ensureConn(dbName: str):
        # Lazily open (and reuse) one connection per database.
        if dbName not in connCache:
            connCache[dbName] = _getConnection(dbName)
        return connCache[dbName]

    def _existingTables(dbName: str) -> Set[str]:
        """Cached lookup of physically existing public tables in a DB."""
        if dbName not in tableCache:
            try:
                conn = _ensureConn(dbName)
                with conn.cursor() as cur:
                    cur.execute("""
                        SELECT table_name FROM information_schema.tables
                        WHERE table_schema = 'public'
                    """)
                    tableCache[dbName] = {row["table_name"] for row in cur.fetchall()}
            except Exception:
                tableCache[dbName] = set()
        return tableCache[dbName]

    def _existingColumns(dbName: str, tableName: str) -> Set[str]:
        # Cached lookup of a table's physical columns; empty set on failure.
        cacheKey = f"{dbName}.{tableName}"
        if cacheKey not in columnCache:
            try:
                columnCache[cacheKey] = _loadPhysicalColumns(_ensureConn(dbName), tableName)
            except Exception:
                columnCache[cacheKey] = set()
        return columnCache[cacheKey]

    try:
        for rel in relationships:
            try:
                # Skip relationships whose tables do not physically exist.
                sourceTables = _existingTables(rel.sourceDb)
                if rel.sourceTable not in sourceTables:
                    continue
                if rel.sourceDb == rel.targetDb:
                    if rel.targetTable not in sourceTables:
                        continue
                else:
                    targetTables = _existingTables(rel.targetDb)
                    if rel.targetTable not in targetTables:
                        continue
                # Skip FK annotations whose source column is not a physical
                # scalar column (virtual / JSONB-resident / computed field).
                # See _loadPhysicalColumns docstring for why this matters.
                sourceColumns = _existingColumns(rel.sourceDb, rel.sourceTable)
                if rel.sourceColumn not in sourceColumns:
                    logger.debug(
                        "Skipping FK %s.%s.%s -- column not present as physical column",
                        rel.sourceDb, rel.sourceTable, rel.sourceColumn,
                    )
                    continue
                targetColumns = _existingColumns(rel.targetDb, rel.targetTable)
                if rel.targetColumn not in targetColumns:
                    logger.debug(
                        "Skipping FK %s.%s.%s -> %s.%s.%s -- target column not present",
                        rel.sourceDb, rel.sourceTable, rel.sourceColumn,
                        rel.targetDb, rel.targetTable, rel.targetColumn,
                    )
                    continue
                sourceConn = _ensureConn(rel.sourceDb)
                if rel.sourceDb == rel.targetDb:
                    targetRowCount = _countRows(sourceConn, rel.targetTable)
                    count = _countOrphansSameDb(
                        sourceConn, rel.sourceTable, rel.sourceColumn,
                        rel.targetTable, rel.targetColumn,
                    )
                else:
                    # Cross-DB: materialise the parent-ID set once per target column.
                    targetConn = _ensureConn(rel.targetDb)
                    targetRowCount = _countRows(targetConn, rel.targetTable)
                    parentKey = f"{rel.targetDb}.{rel.targetTable}.{rel.targetColumn}"
                    if parentKey not in parentIdCache:
                        parentIdCache[parentKey] = _loadParentIds(
                            targetConn, rel.targetTable, rel.targetColumn,
                        )
                    count = _countOrphansCrossDb(
                        sourceConn, rel.sourceTable, rel.sourceColumn,
                        parentIdCache[parentKey],
                    )
                sourceRowCount = _countNonNullSource(
                    sourceConn, rel.sourceTable, rel.sourceColumn,
                )
                wouldDeleteAll = (count > 0 and count >= sourceRowCount)
                targetEmpty = (targetRowCount == 0)
                results.append(asdict(OrphanResult(
                    sourceDb=rel.sourceDb,
                    sourceTable=rel.sourceTable,
                    sourceColumn=rel.sourceColumn,
                    targetDb=rel.targetDb,
                    targetTable=rel.targetTable,
                    targetColumn=rel.targetColumn,
                    orphanCount=count,
                    sourceRowCount=sourceRowCount,
                    targetRowCount=targetRowCount,
                    targetEmpty=targetEmpty,
                    wouldDeleteAll=wouldDeleteAll,
                )))
            except Exception as e:
                logger.warning(
                    f"Orphan scan failed for {rel.sourceDb}.{rel.sourceTable}.{rel.sourceColumn}: {e}"
                )
                # Roll back so a failed statement does not poison later queries
                # on the same shared connections.
                for dbKey in (rel.sourceDb, rel.targetDb):
                    if dbKey in connCache:
                        try:
                            connCache[dbKey].rollback()
                        except Exception:
                            pass
    finally:
        for conn in connCache.values():
            try:
                conn.close()
            except Exception:
                pass
    # Only a FULL scan may populate the cache -- a filtered scan would
    # poison it with partial results (see docstring).
    if dbFilter is None:
        with _orphanCacheLock:
            _orphanCache = {"ts": time.time(), "results": results}
    return results
# ---------------------------------------------------------------------------
# Orphan cleanup
# ---------------------------------------------------------------------------
def _cleanOrphans(db: str, table: str, column: str, force: bool = False) -> int:
    """Delete orphaned records for a single FK relationship. Returns count deleted.

    Safety guards (require force=True to override):
    - Refuses if the target table is empty (likely misconfiguration / lazy table).
    - Refuses if the cleanup would delete >= _MAX_CLEANUP_FRACTION of the source rows.
    These guards prevent catastrophic wipes (e.g. emptying FeatureInstance because
    the User table happened to be empty in the wrong DB at scan time).

    Raises:
        ValueError: when no FK relationship matches (db, table, column).
        OrphanCleanupRefused: when a safety guard trips and force is False.
    """
    relationships = _getFkRelationships()
    # Resolve which registered FK relationship this cleanup applies to.
    rel = next(
        (r for r in relationships
         if r.sourceDb == db and r.sourceTable == table and r.sourceColumn == column),
        None,
    )
    if rel is None:
        raise ValueError(f"No FK relationship found for {db}.{table}.{column}")
    conn = _getConnection(rel.sourceDb)
    targetConn = None
    try:
        # Gather fresh counts at delete-time (never trust stale scan results).
        if rel.sourceDb == rel.targetDb:
            targetRowCount = _countRows(conn, rel.targetTable)
            parentIds: Optional[Set[str]] = None  # same-DB: NOT EXISTS does the matching
        else:
            targetConn = _getConnection(rel.targetDb)
            targetRowCount = _countRows(targetConn, rel.targetTable)
            parentIds = _loadParentIds(targetConn, rel.targetTable, rel.targetColumn)
        sourceRowCount = _countNonNullSource(conn, rel.sourceTable, rel.sourceColumn)
        # Guard 1: an empty target table almost certainly means misconfiguration,
        # not a genuinely orphaned source table.
        if not force:
            if targetRowCount == 0 and sourceRowCount > 0:
                raise OrphanCleanupRefused(
                    f"Refusing cleanup: target table '{rel.targetDb}.{rel.targetTable}' "
                    f"is empty but source '{rel.sourceDb}.{rel.sourceTable}' has "
                    f"{sourceRowCount} rows with non-null '{rel.sourceColumn}'. "
                    f"This likely indicates a misconfiguration. Use force=True to override."
                )
        # Dry-run: count exactly what the DELETE below would remove.
        if rel.sourceDb == rel.targetDb:
            with conn.cursor() as cur:
                cur.execute(f"""
                    SELECT COUNT(*) AS cnt
                    FROM "{rel.sourceTable}" s
                    WHERE s."{rel.sourceColumn}" IS NOT NULL
                    AND s."{rel.sourceColumn}" != ''
                    AND NOT EXISTS (
                        SELECT 1 FROM "{rel.targetTable}" t
                        WHERE t."{rel.targetColumn}" = s."{rel.sourceColumn}"
                    )
                """)
                wouldDelete = int(cur.fetchone()["cnt"])
        else:
            if not parentIds:
                # No parents at all: every non-null FK row counts as an orphan.
                wouldDelete = sourceRowCount
            else:
                with conn.cursor() as cur:
                    cur.execute(f"""
                        SELECT COUNT(*) AS cnt
                        FROM "{rel.sourceTable}"
                        WHERE "{rel.sourceColumn}" IS NOT NULL
                        AND "{rel.sourceColumn}" != ''
                        AND "{rel.sourceColumn}" NOT IN (
                            SELECT unnest(%(ids)s::text[])
                        )
                    """, {"ids": list(parentIds)})
                    wouldDelete = int(cur.fetchone()["cnt"])
        # Guard 2: refuse deletes that would wipe too large a fraction of the source.
        if not force and sourceRowCount > 0:
            fraction = wouldDelete / sourceRowCount
            if fraction >= _MAX_CLEANUP_FRACTION:
                raise OrphanCleanupRefused(
                    f"Refusing cleanup: would delete {wouldDelete} of {sourceRowCount} "
                    f"non-null rows ({fraction:.0%}) from '{rel.sourceDb}.{rel.sourceTable}'. "
                    f"Threshold is {_MAX_CLEANUP_FRACTION:.0%}. Use force=True to override."
                )
        # The actual DELETE -- mirrors the dry-run queries above.
        if rel.sourceDb == rel.targetDb:
            with conn.cursor() as cur:
                cur.execute(f"""
                    DELETE FROM "{rel.sourceTable}"
                    WHERE "{rel.sourceColumn}" IS NOT NULL
                    AND "{rel.sourceColumn}" != ''
                    AND NOT EXISTS (
                        SELECT 1 FROM "{rel.targetTable}" t
                        WHERE t."{rel.targetColumn}" = "{rel.sourceTable}"."{rel.sourceColumn}"
                    )
                """)
                deleted = cur.rowcount
            conn.commit()
        else:
            if not parentIds:
                with conn.cursor() as cur:
                    cur.execute(f"""
                        DELETE FROM "{rel.sourceTable}"
                        WHERE "{rel.sourceColumn}" IS NOT NULL
                        AND "{rel.sourceColumn}" != ''
                    """)
                    deleted = cur.rowcount
            else:
                with conn.cursor() as cur:
                    cur.execute(f"""
                        DELETE FROM "{rel.sourceTable}"
                        WHERE "{rel.sourceColumn}" IS NOT NULL
                        AND "{rel.sourceColumn}" != ''
                        AND "{rel.sourceColumn}" NOT IN (
                            SELECT unnest(%(ids)s::text[])
                        )
                    """, {"ids": list(parentIds)})
                    deleted = cur.rowcount
            conn.commit()
    except Exception:
        # Any failure (including a guard refusal) rolls back the source transaction.
        conn.rollback()
        raise
    finally:
        if targetConn is not None:
            try:
                targetConn.close()
            except Exception:
                pass
        conn.close()
    # Scan results are now stale -- force the next scan to hit the database.
    _invalidateOrphanCache()
    logger.info(
        f"Cleaned {deleted} orphans from {db}.{table}.{column} (force={force})"
    )
    return deleted
def _cleanAllOrphans(force: bool = False) -> List[dict]:
    """Run the single-relationship cleanup for every flagged FK relationship.

    Returns one entry per attempted cleanup: ``{"db", "table", "column",
    "deleted"}`` plus ``"skipped"`` when a safety guard refused the delete, or
    ``"error"`` when it failed outright.

    Safety: each individual cleanup re-validates target row counts at
    delete-time to avoid cascading wipes (e.g. one delete emptying a target
    table that the next iteration depends on). Without force=True, dangerous
    cleanups are skipped rather than executed.
    """
    report: List[dict] = []
    for candidate in _scanOrphans():
        if candidate.get("orphanCount", 0) <= 0:
            continue
        entry = {
            "db": candidate["sourceDb"],
            "table": candidate["sourceTable"],
            "column": candidate["sourceColumn"],
            "deleted": 0,
        }
        try:
            entry["deleted"] = _cleanOrphans(
                candidate["sourceDb"],
                candidate["sourceTable"],
                candidate["sourceColumn"],
                force=force,
            )
        except OrphanCleanupRefused as e:
            logger.warning(
                f"Skipping orphan cleanup for {candidate['sourceDb']}.{candidate['sourceTable']}.{candidate['sourceColumn']}: {e}"
            )
            entry["skipped"] = str(e)
        except Exception as e:
            logger.error(
                f"Failed to clean orphans for {candidate['sourceDb']}.{candidate['sourceTable']}.{candidate['sourceColumn']}: {e}"
            )
            entry["error"] = str(e)
        report.append(entry)
    return report
def _invalidateOrphanCache() -> None:
    """Drop the cached orphan-scan results so the next scan hits the database."""
    global _orphanCache
    with _orphanCacheLock:
        _orphanCache = None
# ---------------------------------------------------------------------------
# Listing orphans (for SysAdmin "download / inspect" workflow)
# ---------------------------------------------------------------------------
def _listOrphans(
    db: str,
    table: str,
    column: str,
    limit: int = 1000,
) -> List[dict]:
    """Return up to ``limit`` actual orphan source-rows for one FK relationship.

    Each entry is ``{"orphanFkValue": str, "rowId": str|None, "row": dict}`` so
    the SysAdmin UI can present them as a download (CSV/JSON) for review before
    the destructive cleanup is triggered.

    Raises:
        ValueError: when no FK relationship matches (db, table, column).
    """
    relationships = _getFkRelationships()
    # Resolve the registered FK relationship for (db, table, column).
    rel = next(
        (r for r in relationships
         if r.sourceDb == db and r.sourceTable == table and r.sourceColumn == column),
        None,
    )
    if rel is None:
        raise ValueError(f"No FK relationship found for {db}.{table}.{column}")
    # Clamp the caller-supplied limit into 1..10000.
    safeLimit = max(1, min(int(limit), 10000))
    sourceConn = _getConnection(rel.sourceDb)
    targetConn = None
    try:
        # Mirror the scanner's false-positive protection: if either annotated
        # column has no physical counterpart, report no orphans.
        sourceColumns = _loadPhysicalColumns(sourceConn, rel.sourceTable)
        if rel.sourceColumn not in sourceColumns:
            return []
        if rel.sourceDb == rel.targetDb:
            targetColumns = _loadPhysicalColumns(sourceConn, rel.targetTable)
            if rel.targetColumn not in targetColumns:
                return []
            # Same-DB: anti-join via NOT EXISTS.
            with sourceConn.cursor() as cur:
                cur.execute(f"""
                    SELECT s.*
                    FROM "{rel.sourceTable}" s
                    WHERE s."{rel.sourceColumn}" IS NOT NULL
                    AND s."{rel.sourceColumn}" != ''
                    AND NOT EXISTS (
                        SELECT 1 FROM "{rel.targetTable}" t
                        WHERE t."{rel.targetColumn}" = s."{rel.sourceColumn}"
                    )
                    LIMIT %s
                """, (safeLimit,))
                rows = cur.fetchall()
        else:
            # Cross-DB: load the parent-ID set from the target database first.
            targetConn = _getConnection(rel.targetDb)
            targetColumns = _loadPhysicalColumns(targetConn, rel.targetTable)
            if rel.targetColumn not in targetColumns:
                return []
            parentIds = _loadParentIds(targetConn, rel.targetTable, rel.targetColumn)
            with sourceConn.cursor() as cur:
                if not parentIds:
                    # No parents at all: every non-null FK row is an orphan.
                    cur.execute(f"""
                        SELECT *
                        FROM "{rel.sourceTable}"
                        WHERE "{rel.sourceColumn}" IS NOT NULL
                        AND "{rel.sourceColumn}" != ''
                        LIMIT %s
                    """, (safeLimit,))
                else:
                    cur.execute(f"""
                        SELECT *
                        FROM "{rel.sourceTable}"
                        WHERE "{rel.sourceColumn}" IS NOT NULL
                        AND "{rel.sourceColumn}" != ''
                        AND "{rel.sourceColumn}" NOT IN (
                            SELECT unnest(%(ids)s::text[])
                        )
                        LIMIT %(lim)s
                    """, {"ids": list(parentIds), "lim": safeLimit})
                rows = cur.fetchall()
    finally:
        if targetConn is not None:
            try:
                targetConn.close()
            except Exception:
                pass
        sourceConn.close()
    # Serialise the fetched rows into JSON-safe OrphanRecord dicts.
    out: List[dict] = []
    for row in rows:
        rowDict = {k: _jsonSafe(v) for k, v in dict(row).items()}
        out.append(asdict(OrphanRecord(
            sourceDb=rel.sourceDb,
            sourceTable=rel.sourceTable,
            sourceColumn=rel.sourceColumn,
            targetDb=rel.targetDb,
            targetTable=rel.targetTable,
            targetColumn=rel.targetColumn,
            orphanFkValue=str(rowDict.get(rel.sourceColumn, "")),
            # Uses the row's "id" column as a stable reference when present.
            rowId=str(rowDict.get("id")) if rowDict.get("id") is not None else None,
            row=rowDict,
        )))
    return out
def _jsonSafe(v):
"""Coerce psycopg2 row values into JSON-serialisable primitives."""
import datetime
import decimal
import uuid
if v is None or isinstance(v, (str, int, float, bool)):
return v
if isinstance(v, (datetime.datetime, datetime.date, datetime.time)):
return v.isoformat()
if isinstance(v, decimal.Decimal):
return float(v)
if isinstance(v, uuid.UUID):
return str(v)
if isinstance(v, (list, tuple)):
return [_jsonSafe(x) for x in v]
if isinstance(v, dict):
return {str(k): _jsonSafe(val) for k, val in v.items()}
if isinstance(v, (bytes, bytearray, memoryview)):
try:
return bytes(v).decode("utf-8", errors="replace")
except Exception:
return repr(v)
return str(v)