# gateway/modules/features/trustee/accounting/accountingDataSync.py
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Orchestrates importing accounting data from external systems into TrusteeData* tables.
Flow per phase:
1. async fetch from external system (HTTP, awaits cleanly on the event loop)
2. await asyncio.to_thread(...) for the DB write phase, so the heavy
synchronous psycopg2 calls do NOT block the FastAPI event loop
3. inside the worker thread we use bulk delete / bulk insert (single
transaction per phase) instead of N+1 single-row operations
Why this matters: a typical accounting sync is ~10k-100k rows. The old
implementation called ``recordCreate`` row-by-row on the event loop, which
froze every other request (chat, health-check, etc.) for minutes. See
``local/notes/changelog.txt`` for the original incident analysis.
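
Typical usage (sketch only; the job-runner wiring lives elsewhere, and
``job.setProgress`` below is a placeholder for whatever progress callback
the caller provides)::

    sync = AccountingDataSync(trusteeInterface)
    summary = await sync.importData(
        featureInstanceId, mandateId,
        dateFrom="2025-01-01", dateTo="2025-12-31",
        progressCb=job.setProgress,
    )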
"""
import asyncio
import json as _json
import logging
import os
import time
from collections import defaultdict
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Type
from .accountingConnectorBase import BaseAccountingConnector
from .accountingRegistry import _getAccountingRegistry
logger = logging.getLogger(__name__)
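# How often (in rows processed) long-running loops emit a heartbeat log line.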
_HEARTBEAT_EVERY = 500
def _isDebugDumpEnabled() -> bool:
"""Whether to write raw connector payloads to disk for offline inspection.
    Controlled by ``APP_DEBUG_ACCOUNTING_SYNC_ENABLED``. Defaults to False so
    that INT/PROD never spend disk, IO, or RAM on dumping multi-megabyte JSON
    files. Mirrors the existing ``APP_DEBUG_CHAT_WORKFLOW_ENABLED`` pattern.
"""
try:
from modules.shared.configuration import APP_CONFIG
raw = APP_CONFIG.get("APP_DEBUG_ACCOUNTING_SYNC_ENABLED", False)
if isinstance(raw, bool):
return raw
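        # Accept common truthy string spellings so the flag can be set via plain env/config strings.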
return str(raw).strip().lower() in {"true", "1", "yes", "on"}
except Exception:
return False
def _resolveDebugDumpDir() -> Optional[Path]:
"""Resolve the debug dump directory. Returns None if dumping is disabled
or the path could not be created."""
if not _isDebugDumpEnabled():
return None
try:
from modules.shared.configuration import APP_CONFIG
configured = APP_CONFIG.get("APP_DEBUG_ACCOUNTING_SYNC_DIR", None)
if not configured:
return None
path = Path(str(configured))
if not path.is_absolute():
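            # parents[4] walks up accounting -> trustee -> features -> modules -> gateway (the gateway root).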
gatewayDir = Path(__file__).resolve().parents[4]
path = gatewayDir / configured
path.mkdir(parents=True, exist_ok=True)
return path
except Exception as ex:
logger.warning(f"Could not resolve debug dump dir: {ex}")
return None
def _dumpSyncData(tag: str, rows: list) -> None:
"""Write raw connector data to a timestamped JSON file. No-op unless
``APP_DEBUG_ACCOUNTING_SYNC_ENABLED`` is true AND the configured dir
resolves."""
dumpDir = _resolveDebugDumpDir()
if dumpDir is None:
return
try:
ts = time.strftime("%Y%m%d-%H%M%S")
path = dumpDir / f"{ts}_{tag}.json"
serializable = []
for r in rows:
if isinstance(r, dict):
serializable.append(r)
elif hasattr(r, "__dict__"):
serializable.append({k: v for k, v in r.__dict__.items() if not k.startswith("_")})
else:
serializable.append(str(r))
with open(path, "w", encoding="utf-8") as f:
_json.dump({"count": len(serializable), "rows": serializable}, f, ensure_ascii=False, indent=2, default=str)
logger.info(f"Debug sync dump: {path.name} ({len(serializable)} rows)")
except Exception as e:
logger.warning(f"Failed to write debug sync dump for {tag}: {e}")
class AccountingDataSync:
"""Imports accounting data (read-only) from an external system into local TrusteeData* tables."""
def __init__(self, trusteeInterface):
self._if = trusteeInterface
self._registry = _getAccountingRegistry()
async def importData(
self,
featureInstanceId: str,
mandateId: str,
dateFrom: Optional[str] = None,
dateTo: Optional[str] = None,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
) -> Dict[str, Any]:
"""Run a full data import for a feature instance.
Returns a summary dict with counts per entity and any errors. All heavy
DB work is offloaded to a worker thread via ``asyncio.to_thread`` so
the event loop remains responsive for other requests.
        ``progressCb(percent, message)``: if provided, it is called at every
        phase boundary so the UI poll on ``GET /api/jobs/{jobId}`` shows real
        movement instead of jumping from 10% to 100%. Safe to omit.
"""
def _progress(pct: int, msg: str) -> None:
if progressCb is None:
return
try:
progressCb(pct, msg)
except Exception as ex:
logger.warning(f"progressCb failed at {pct}%: {ex}")
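        # Imported locally rather than at module top, like the config import below (likely to avoid import cycles).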
from modules.features.trustee.datamodelFeatureTrustee import (
TrusteeAccountingConfig,
TrusteeDataAccount,
TrusteeDataJournalEntry,
TrusteeDataJournalLine,
TrusteeDataContact,
TrusteeDataAccountBalance,
)
from modules.shared.configuration import decryptValue
summary: Dict[str, Any] = {
"accounts": 0,
"journalEntries": 0,
"journalLines": 0,
"contacts": 0,
"accountBalances": 0,
"errors": [],
"startedAt": time.time(),
}
cfgRecords = self._if.db.getRecordset(
TrusteeAccountingConfig,
recordFilter={"featureInstanceId": featureInstanceId, "isActive": True},
)
if not cfgRecords:
summary["errors"].append("No active accounting configuration found")
return summary
cfgRecord = cfgRecords[0]
connectorType = cfgRecord.get("connectorType", "")
encryptedConfig = cfgRecord.get("encryptedConfig", "")
try:
plainJson = decryptValue(encryptedConfig)
connConfig = _json.loads(plainJson) if plainJson else {}
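            # connConfig holds connector-specific settings; the exact keys depend on connectorType.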
except Exception as e:
summary["errors"].append(f"Failed to decrypt config: {e}")
return summary
connector = self._registry.getConnector(connectorType)
if not connector:
summary["errors"].append(f"Unknown connector type: {connectorType}")
return summary
scope = {"featureInstanceId": featureInstanceId, "mandateId": mandateId}
logger.info(
f"AccountingDataSync starting for {featureInstanceId}, "
f"connector={connectorType}, dateFrom={dateFrom}, dateTo={dateTo}"
)
fetchedAccountNumbers: list = []
# ---- Phase 1: Chart of accounts ----
        # Progress budget: 15-30%. The fetch (15%) is usually a single
        # snappy API call; the persist step (30%) is a bulk insert and
        # finishes in <100 ms even for thousands of rows.
try:
            _progress(15, "Loading chart of accounts...")
charts = await connector.getChartOfAccounts(connConfig)
_dumpSyncData("accounts", charts)
fetchedAccountNumbers = [acc.accountNumber for acc in charts if acc.accountNumber]
            _progress(25, f"Saving {len(charts)} accounts...")
written = await asyncio.to_thread(
self._persistAccounts, charts, scope, featureInstanceId, TrusteeDataAccount
)
summary["accounts"] = written
            _progress(30, f"{written} accounts saved.")
except Exception as e:
logger.error(f"Import accounts failed: {e}", exc_info=True)
summary["errors"].append(f"Accounts: {e}")
# ---- Phase 2: Journal entries + lines ----
        # Progress budget: 35-65%. Usually the longest phase; the external
        # API often paginates per account, so the fetch alone can take 30+ s.
try:
            _progress(35, "Loading journal entries from the accounting system...")
rawEntries = await connector.getJournalEntries(
connConfig,
dateFrom=dateFrom,
dateTo=dateTo,
accountNumbers=fetchedAccountNumbers or None,
)
_dumpSyncData("journalEntries", rawEntries)
            _progress(60, f"Saving {len(rawEntries)} journal entries...")
entriesCount, linesCount = await asyncio.to_thread(
self._persistJournal, rawEntries, scope, featureInstanceId,
TrusteeDataJournalEntry, TrusteeDataJournalLine,
)
summary["journalEntries"] = entriesCount
summary["journalLines"] = linesCount
            _progress(65, f"{entriesCount} entries + {linesCount} journal lines saved.")
except Exception as e:
logger.error(f"Import journal entries failed: {e}", exc_info=True)
summary["errors"].append(f"Journal entries: {e}")
# ---- Phase 3: Contacts (customers + vendors) ----
        # Progress budget: 70-85%. Two quick API calls + one bulk insert.
try:
            _progress(70, "Loading customers...")
customers = await connector.getCustomers(connConfig)
_dumpSyncData("customers", customers)
            _progress(78, "Loading vendors...")
vendors = await connector.getVendors(connConfig)
_dumpSyncData("vendors", vendors)
            _progress(82, f"Saving {len(customers) + len(vendors)} contacts...")
contactCount = await asyncio.to_thread(
self._persistContacts, customers, vendors, scope,
featureInstanceId, TrusteeDataContact,
)
summary["contacts"] = contactCount
            _progress(85, f"{contactCount} contacts saved.")
except Exception as e:
logger.error(f"Import contacts failed: {e}", exc_info=True)
summary["errors"].append(f"Contacts: {e}")
# ---- Phase 4: Compute account balances ----
        # Progress budget: 90-95%. Pure DB aggregation, no external calls.
try:
            _progress(90, "Computing account balances...")
balanceCount = await asyncio.to_thread(
self._persistBalances, featureInstanceId, mandateId,
TrusteeDataJournalEntry, TrusteeDataJournalLine, TrusteeDataAccountBalance,
)
summary["accountBalances"] = balanceCount
            _progress(95, f"{balanceCount} balances computed.")
except Exception as e:
logger.error(f"Compute balances failed: {e}", exc_info=True)
summary["errors"].append(f"Balances: {e}")
cfgId = cfgRecord.get("id")
if cfgId:
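            # Core status and extended metadata are written in two separate calls, each
            # with its own try/except, so a failure in one cannot block the other.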
corePayload = {
"lastSyncAt": time.time(),
"lastSyncStatus": "success" if not summary["errors"] else "partial",
"lastSyncErrorMessage": "; ".join(summary["errors"])[:500] if summary["errors"] else None,
}
try:
self._if.db.recordModify(TrusteeAccountingConfig, cfgId, corePayload)
except Exception as coreErr:
logger.exception(f"AccountingDataSync: failed to write core lastSync* fields for cfg {cfgId}: {coreErr}")
summary["errors"].append(f"Persist lastSync core: {coreErr}")
extPayload = {
"lastSyncDateFrom": dateFrom,
"lastSyncDateTo": dateTo,
"lastSyncCounts": {
"accounts": int(summary.get("accounts", 0)),
"journalEntries": int(summary.get("journalEntries", 0)),
"journalLines": int(summary.get("journalLines", 0)),
"contacts": int(summary.get("contacts", 0)),
"accountBalances": int(summary.get("accountBalances", 0)),
},
}
try:
self._if.db.recordModify(TrusteeAccountingConfig, cfgId, extPayload)
except Exception as extErr:
logger.exception(f"AccountingDataSync: failed to write extended lastSync* fields for cfg {cfgId}: {extErr}")
summary["errors"].append(f"Persist lastSync ext: {extErr}")
summary["finishedAt"] = time.time()
summary["durationSeconds"] = round(summary["finishedAt"] - summary["startedAt"], 1)
logger.info(
f"AccountingDataSync completed for {featureInstanceId}: "
f"{summary['accounts']} accounts, {summary['journalEntries']} entries, "
f"{summary['journalLines']} lines, {summary['contacts']} contacts, "
f"{summary['accountBalances']} balances, {len(summary['errors'])} errors, "
f"{summary['durationSeconds']}s"
)
return summary
# ===== Sync persistence helpers (run inside asyncio.to_thread) =====
def _persistAccounts(self, charts: list, scope: Dict[str, Any],
featureInstanceId: str, modelAccount: Type) -> int:
"""Bulk-replace chart of accounts for one feature instance."""
t0 = time.time()
self._bulkClear(modelAccount, featureInstanceId)
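        # Currency is currently fixed to CHF for every imported account.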
rows = [{
"accountNumber": acc.accountNumber,
"label": acc.label,
"accountType": acc.accountType or "",
"currency": "CHF",
"isActive": True,
**scope,
} for acc in charts]
n = self._bulkCreate(modelAccount, rows)
logger.info(f"Persisted {n} accounts for {featureInstanceId} in {time.time() - t0:.1f}s")
return n
    def _persistJournal(self, rawEntries: list, scope: Dict[str, Any],
                        featureInstanceId: str, modelEntry: Type, modelLine: Type) -> Tuple[int, int]:
"""Bulk-replace journal entries + journal lines for one feature instance.
We pre-build the line rows in memory keyed by the freshly minted entryId
so a single ``execute_values`` call can persist all of them.
"""
import uuid as _uuid
t0 = time.time()
self._bulkClear(modelEntry, featureInstanceId)
self._bulkClear(modelLine, featureInstanceId)
entryRows: List[Dict[str, Any]] = []
lineRows: List[Dict[str, Any]] = []
for raw in rawEntries:
entryId = str(_uuid.uuid4())
entryRows.append({
"id": entryId,
"externalId": raw.get("externalId"),
"bookingDate": raw.get("bookingDate"),
"reference": raw.get("reference"),
"description": raw.get("description", ""),
"currency": raw.get("currency", "CHF"),
"totalAmount": float(raw.get("totalAmount", 0)),
**scope,
})
for line in (raw.get("lines") or []):
lineRows.append({
"journalEntryId": entryId,
"accountNumber": line.get("accountNumber", ""),
"debitAmount": float(line.get("debitAmount", 0)),
"creditAmount": float(line.get("creditAmount", 0)),
"currency": line.get("currency", "CHF"),
"taxCode": line.get("taxCode"),
"costCenter": line.get("costCenter"),
"description": line.get("description", ""),
**scope,
})
if len(entryRows) % _HEARTBEAT_EVERY == 0:
logger.info(
f"Journal build progress: {len(entryRows)}/{len(rawEntries)} entries, "
f"{len(lineRows)} lines so far"
)
entriesCount = self._bulkCreate(modelEntry, entryRows)
linesCount = self._bulkCreate(modelLine, lineRows)
logger.info(
f"Persisted {entriesCount} entries + {linesCount} lines for "
f"{featureInstanceId} in {time.time() - t0:.1f}s"
)
return entriesCount, linesCount
def _persistContacts(self, customers: list, vendors: list, scope: Dict[str, Any],
featureInstanceId: str, modelContact: Type) -> int:
"""Bulk-replace contacts (customers + vendors) for one feature instance."""
t0 = time.time()
self._bulkClear(modelContact, featureInstanceId)
rows = [self._mapContact(c, "customer", scope) for c in customers]
rows += [self._mapContact(v, "vendor", scope) for v in vendors]
n = self._bulkCreate(modelContact, rows)
logger.info(f"Persisted {n} contacts for {featureInstanceId} in {time.time() - t0:.1f}s")
return n
def _persistBalances(self, featureInstanceId: str, mandateId: str,
modelEntry: Type, modelLine: Type, modelBalance: Type) -> int:
"""Re-aggregate journal lines into monthly + annual balances."""
t0 = time.time()
self._bulkClear(modelBalance, featureInstanceId)
entries = self._if.db.getRecordset(
modelEntry, recordFilter={"featureInstanceId": featureInstanceId},
) or []
entryDates: Dict[str, str] = {}
for e in entries:
eid = e.get("id") if isinstance(e, dict) else getattr(e, "id", None)
bdate = e.get("bookingDate") if isinstance(e, dict) else getattr(e, "bookingDate", None)
if eid and bdate:
entryDates[eid] = bdate
lines = self._if.db.getRecordset(
modelLine, recordFilter={"featureInstanceId": featureInstanceId},
) or []
buckets: Dict[tuple, Dict[str, float]] = defaultdict(lambda: {"debit": 0.0, "credit": 0.0})
for ln in lines:
if isinstance(ln, dict):
jeid = ln.get("journalEntryId", "")
accNo = ln.get("accountNumber", "")
debit = float(ln.get("debitAmount", 0))
credit = float(ln.get("creditAmount", 0))
else:
jeid = getattr(ln, "journalEntryId", "")
accNo = getattr(ln, "accountNumber", "")
debit = float(getattr(ln, "debitAmount", 0))
credit = float(getattr(ln, "creditAmount", 0))
bdate = entryDates.get(jeid, "")
if not accNo or not bdate:
continue
parts = bdate.split("-")
if len(parts) < 2:
continue
try:
year = int(parts[0])
month = int(parts[1])
except ValueError:
continue
buckets[(accNo, year, month)]["debit"] += debit
buckets[(accNo, year, month)]["credit"] += credit
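            # Month 0 is the annual roll-up bucket: the same amounts also feed the full-year total.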
buckets[(accNo, year, 0)]["debit"] += debit
buckets[(accNo, year, 0)]["credit"] += credit
scope = {"featureInstanceId": featureInstanceId, "mandateId": mandateId}
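        # Sign convention: closingBalance = debitTotal - creditTotal; openingBalance is never carried over (always 0.0).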
rows = [{
"accountNumber": accNo,
"periodYear": year,
"periodMonth": month,
"openingBalance": 0.0,
"debitTotal": round(totals["debit"], 2),
"creditTotal": round(totals["credit"], 2),
"closingBalance": round(totals["debit"] - totals["credit"], 2),
"currency": "CHF",
**scope,
} for (accNo, year, month), totals in buckets.items()]
n = self._bulkCreate(modelBalance, rows)
logger.info(
f"Persisted {n} balances for {featureInstanceId} in {time.time() - t0:.1f}s"
)
return n
# ===== Low-level bulk helpers =====
def _bulkClear(self, model: Type, featureInstanceId: str) -> int:
"""Delete every row for this feature instance in a single statement."""
try:
return self._if.db.recordDeleteWhere(
model, {"featureInstanceId": featureInstanceId}
)
except AttributeError:
# Backwards-compatible path if the connector hasn't been upgraded
# yet. Logs a warning so we notice in dev/CI.
logger.warning(
"DatabaseConnector.recordDeleteWhere missing — falling back to slow per-row delete for %s",
model.__name__,
)
records = self._if.db.getRecordset(
model, recordFilter={"featureInstanceId": featureInstanceId}
) or []
count = 0
for r in records:
rid = r.get("id") if isinstance(r, dict) else getattr(r, "id", None)
if rid:
try:
self._if.db.recordDelete(model, rid)
count += 1
                    except Exception as ex:
                        logger.warning(f"Per-row delete failed for {model.__name__} id={rid}: {ex}")
return count
def _bulkCreate(self, model: Type, rows: List[Dict[str, Any]]) -> int:
"""Insert all rows in a single transaction. Falls back to per-row
insert only if the connector lacks ``recordCreateBulk`` (legacy)."""
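        # ``recordCreateBulk`` is expected to insert all rows in one statement/transaction
        # and return the inserted row count.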
if not rows:
return 0
try:
return self._if.db.recordCreateBulk(model, rows)
except AttributeError:
logger.warning(
"DatabaseConnector.recordCreateBulk missing — falling back to slow per-row insert for %s",
model.__name__,
)
n = 0
for i, r in enumerate(rows, start=1):
try:
self._if.db.recordCreate(model, r)
n += 1
except Exception as ex:
logger.warning(f"Per-row insert failed at {i}/{len(rows)}: {ex}")
if i % _HEARTBEAT_EVERY == 0:
logger.info(f"Per-row insert progress: {i}/{len(rows)} rows ({model.__name__})")
return n
# ===== Field helpers =====
@staticmethod
def _safeStr(val: Any) -> str:
"""Convert a value to a safe string for DB storage, collapsing nested dicts/lists."""
if val is None:
return ""
if isinstance(val, (dict, list)):
return ""
return str(val)
def _mapContact(self, raw: Dict[str, Any], contactType: str, scope: Dict[str, Any]) -> Dict[str, Any]:
"""Extract contact fields from a raw API dict, handling varying field names across connectors."""
s = self._safeStr
return {
"externalId": s(raw.get("id") or raw.get("Id") or raw.get("customer_nr") or raw.get("vendor_nr") or ""),
"contactType": contactType,
"contactNumber": s(
raw.get("customernumber") or raw.get("customer_nr")
or raw.get("vendornumber") or raw.get("vendor_nr")
or raw.get("nr") or raw.get("ContactNumber")
or raw.get("id") or ""
),
"name": s(raw.get("name") or raw.get("Name") or raw.get("name_1") or ""),
"address": s(raw.get("addr1") or raw.get("address") or raw.get("Address") or ""),
"zip": s(raw.get("zipcode") or raw.get("postcode") or raw.get("Zip") or raw.get("zip") or ""),
"city": s(raw.get("city") or raw.get("City") or ""),
"country": s(raw.get("country") or raw.get("country_id") or raw.get("Country") or ""),
"email": s(raw.get("email") or raw.get("mail") or raw.get("Email") or ""),
"phone": s(raw.get("phone") or raw.get("phone_fixed") or raw.get("Phone") or ""),
"vatNumber": s(raw.get("vat_identifier") or raw.get("vatNumber") or ""),
**scope,
}