# Copyright (c) 2025 Patrick Motsch # All rights reserved. """Orchestrates importing accounting data from external systems into TrusteeData* tables. Flow per phase: 1. async fetch from external system (HTTP, awaits cleanly on the event loop) 2. await asyncio.to_thread(...) for the DB write phase, so the heavy synchronous psycopg2 calls do NOT block the FastAPI event loop 3. inside the worker thread we use bulk delete / bulk insert (single transaction per phase) instead of N+1 single-row operations Why this matters: a typical accounting sync is ~10k-100k rows. The old implementation called ``recordCreate`` row-by-row on the event loop, which froze every other request (chat, health-check, etc.) for minutes. See ``local/notes/changelog.txt`` for the original incident analysis. """ import asyncio import json as _json import logging import os import time from collections import defaultdict from pathlib import Path from typing import Callable, Dict, Any, List, Optional, Type from .accountingConnectorBase import BaseAccountingConnector from .accountingRegistry import _getAccountingRegistry logger = logging.getLogger(__name__) _HEARTBEAT_EVERY = 500 def _isDebugDumpEnabled() -> bool: """Whether to write raw connector payloads to disk for offline inspection. Controlled by ``APP_DEBUG_ACCOUNTING_SYNC_ENABLED``. Default False so that INT/PROD never spend disk/IO/RAM on dumping 7-figure JSON files. Mirrors the existing ``APP_DEBUG_CHAT_WORKFLOW_ENABLED`` pattern. """ try: from modules.shared.configuration import APP_CONFIG raw = APP_CONFIG.get("APP_DEBUG_ACCOUNTING_SYNC_ENABLED", False) if isinstance(raw, bool): return raw return str(raw).strip().lower() in {"true", "1", "yes", "on"} except Exception: return False def _resolveDebugDumpDir() -> Optional[Path]: """Resolve the debug dump directory. Returns None if dumping is disabled or the path could not be created.""" if not _isDebugDumpEnabled(): return None try: from modules.shared.configuration import APP_CONFIG configured = APP_CONFIG.get("APP_DEBUG_ACCOUNTING_SYNC_DIR", None) if not configured: return None path = Path(str(configured)) if not path.is_absolute(): gatewayDir = Path(__file__).resolve().parents[4] path = gatewayDir / configured path.mkdir(parents=True, exist_ok=True) return path except Exception as ex: logger.warning(f"Could not resolve debug dump dir: {ex}") return None def _dumpSyncData(tag: str, rows: list) -> None: """Write raw connector data to a timestamped JSON file. No-op unless ``APP_DEBUG_ACCOUNTING_SYNC_ENABLED`` is true AND the configured dir resolves.""" dumpDir = _resolveDebugDumpDir() if dumpDir is None: return try: ts = time.strftime("%Y%m%d-%H%M%S") path = dumpDir / f"{ts}_{tag}.json" serializable = [] for r in rows: if isinstance(r, dict): serializable.append(r) elif hasattr(r, "__dict__"): serializable.append({k: v for k, v in r.__dict__.items() if not k.startswith("_")}) else: serializable.append(str(r)) with open(path, "w", encoding="utf-8") as f: _json.dump({"count": len(serializable), "rows": serializable}, f, ensure_ascii=False, indent=2, default=str) logger.info(f"Debug sync dump: {path.name} ({len(serializable)} rows)") except Exception as e: logger.warning(f"Failed to write debug sync dump for {tag}: {e}") class AccountingDataSync: """Imports accounting data (read-only) from an external system into local TrusteeData* tables.""" def __init__(self, trusteeInterface): self._if = trusteeInterface self._registry = _getAccountingRegistry() async def importData( self, featureInstanceId: str, mandateId: str, dateFrom: Optional[str] = None, dateTo: Optional[str] = None, progressCb: Optional[Callable[[int, Optional[str]], None]] = None, ) -> Dict[str, Any]: """Run a full data import for a feature instance. Returns a summary dict with counts per entity and any errors. All heavy DB work is offloaded to a worker thread via ``asyncio.to_thread`` so the event loop remains responsive for other requests. ``progressCb(percent, message)`` -- if provided, called at every phase boundary so the UI poll on ``GET /api/jobs/{jobId}`` shows real movement instead of jumping from 10 % to 100 %. Safe to omit. """ def _progress(pct: int, msg: str) -> None: if progressCb is None: return try: progressCb(pct, msg) except Exception as ex: logger.warning(f"progressCb failed at {pct}%: {ex}") from modules.features.trustee.datamodelFeatureTrustee import ( TrusteeAccountingConfig, TrusteeDataAccount, TrusteeDataJournalEntry, TrusteeDataJournalLine, TrusteeDataContact, TrusteeDataAccountBalance, ) from modules.shared.configuration import decryptValue summary: Dict[str, Any] = { "accounts": 0, "journalEntries": 0, "journalLines": 0, "contacts": 0, "accountBalances": 0, "errors": [], "startedAt": time.time(), } cfgRecords = self._if.db.getRecordset( TrusteeAccountingConfig, recordFilter={"featureInstanceId": featureInstanceId, "isActive": True}, ) if not cfgRecords: summary["errors"].append("No active accounting configuration found") return summary cfgRecord = cfgRecords[0] connectorType = cfgRecord.get("connectorType", "") encryptedConfig = cfgRecord.get("encryptedConfig", "") try: plainJson = decryptValue(encryptedConfig) connConfig = _json.loads(plainJson) if plainJson else {} except Exception as e: summary["errors"].append(f"Failed to decrypt config: {e}") return summary connector = self._registry.getConnector(connectorType) if not connector: summary["errors"].append(f"Unknown connector type: {connectorType}") return summary scope = {"featureInstanceId": featureInstanceId, "mandateId": mandateId} logger.info( f"AccountingDataSync starting for {featureInstanceId}, " f"connector={connectorType}, dateFrom={dateFrom}, dateTo={dateTo}" ) fetchedAccountNumbers: list = [] # ---- Phase 1: Chart of accounts ---- # Progress budget: 15-30 %. The fetch (15 %) is usually a single # snappy API call; the persist step (30 %) is bulk-insert and finishes # in <100 ms even for thousands of rows. try: _progress(15, "Lade Kontenplan...") charts = await connector.getChartOfAccounts(connConfig) _dumpSyncData("accounts", charts) fetchedAccountNumbers = [acc.accountNumber for acc in charts if acc.accountNumber] _progress(25, f"Speichere {len(charts)} Konten...") written = await asyncio.to_thread( self._persistAccounts, charts, scope, featureInstanceId, TrusteeDataAccount ) summary["accounts"] = written _progress(30, f"{written} Konten gespeichert.") except Exception as e: logger.error(f"Import accounts failed: {e}", exc_info=True) summary["errors"].append(f"Accounts: {e}") # ---- Phase 2: Journal entries + lines ---- # Progress budget: 35-65 %. Usually the longest phase; the external # API often paginates per account, so the fetch alone can take 30+ s. try: _progress(35, "Lade Journaleintraege vom Buchhaltungssystem...") rawEntries = await connector.getJournalEntries( connConfig, dateFrom=dateFrom, dateTo=dateTo, accountNumbers=fetchedAccountNumbers or None, ) _dumpSyncData("journalEntries", rawEntries) _progress(60, f"Speichere {len(rawEntries)} Buchungssaetze...") entriesCount, linesCount = await asyncio.to_thread( self._persistJournal, rawEntries, scope, featureInstanceId, TrusteeDataJournalEntry, TrusteeDataJournalLine, ) summary["journalEntries"] = entriesCount summary["journalLines"] = linesCount _progress(65, f"{entriesCount} Saetze + {linesCount} Buchungszeilen gespeichert.") except Exception as e: logger.error(f"Import journal entries failed: {e}", exc_info=True) summary["errors"].append(f"Journal entries: {e}") # ---- Phase 3: Contacts (customers + vendors) ---- # Progress budget: 70-85 %. Two quick API calls + one bulk-insert. try: _progress(70, "Lade Kunden...") customers = await connector.getCustomers(connConfig) _dumpSyncData("customers", customers) _progress(78, "Lade Lieferanten...") vendors = await connector.getVendors(connConfig) _dumpSyncData("vendors", vendors) _progress(82, f"Speichere {len(customers) + len(vendors)} Kontakte...") contactCount = await asyncio.to_thread( self._persistContacts, customers, vendors, scope, featureInstanceId, TrusteeDataContact, ) summary["contacts"] = contactCount _progress(85, f"{contactCount} Kontakte gespeichert.") except Exception as e: logger.error(f"Import contacts failed: {e}", exc_info=True) summary["errors"].append(f"Contacts: {e}") # ---- Phase 4: Compute account balances ---- # Progress budget: 90-95 %. Pure DB aggregation, no external calls. try: _progress(90, "Berechne Kontensaldi...") balanceCount = await asyncio.to_thread( self._persistBalances, featureInstanceId, mandateId, TrusteeDataJournalEntry, TrusteeDataJournalLine, TrusteeDataAccountBalance, ) summary["accountBalances"] = balanceCount _progress(95, f"{balanceCount} Saldi berechnet.") except Exception as e: logger.error(f"Compute balances failed: {e}", exc_info=True) summary["errors"].append(f"Balances: {e}") cfgId = cfgRecord.get("id") if cfgId: corePayload = { "lastSyncAt": time.time(), "lastSyncStatus": "success" if not summary["errors"] else "partial", "lastSyncErrorMessage": "; ".join(summary["errors"])[:500] if summary["errors"] else None, } try: self._if.db.recordModify(TrusteeAccountingConfig, cfgId, corePayload) except Exception as coreErr: logger.exception(f"AccountingDataSync: failed to write core lastSync* fields for cfg {cfgId}: {coreErr}") summary["errors"].append(f"Persist lastSync core: {coreErr}") extPayload = { "lastSyncDateFrom": dateFrom, "lastSyncDateTo": dateTo, "lastSyncCounts": { "accounts": int(summary.get("accounts", 0)), "journalEntries": int(summary.get("journalEntries", 0)), "journalLines": int(summary.get("journalLines", 0)), "contacts": int(summary.get("contacts", 0)), "accountBalances": int(summary.get("accountBalances", 0)), }, } try: self._if.db.recordModify(TrusteeAccountingConfig, cfgId, extPayload) except Exception as extErr: logger.exception(f"AccountingDataSync: failed to write extended lastSync* fields for cfg {cfgId}: {extErr}") summary["errors"].append(f"Persist lastSync ext: {extErr}") summary["finishedAt"] = time.time() summary["durationSeconds"] = round(summary["finishedAt"] - summary["startedAt"], 1) logger.info( f"AccountingDataSync completed for {featureInstanceId}: " f"{summary['accounts']} accounts, {summary['journalEntries']} entries, " f"{summary['journalLines']} lines, {summary['contacts']} contacts, " f"{summary['accountBalances']} balances, {len(summary['errors'])} errors, " f"{summary['durationSeconds']}s" ) return summary # ===== Sync persistence helpers (run inside asyncio.to_thread) ===== def _persistAccounts(self, charts: list, scope: Dict[str, Any], featureInstanceId: str, modelAccount: Type) -> int: """Bulk-replace chart of accounts for one feature instance.""" t0 = time.time() self._bulkClear(modelAccount, featureInstanceId) rows = [{ "accountNumber": acc.accountNumber, "label": acc.label, "accountType": acc.accountType or "", "currency": "CHF", "isActive": True, **scope, } for acc in charts] n = self._bulkCreate(modelAccount, rows) logger.info(f"Persisted {n} accounts for {featureInstanceId} in {time.time() - t0:.1f}s") return n def _persistJournal(self, rawEntries: list, scope: Dict[str, Any], featureInstanceId: str, modelEntry: Type, modelLine: Type) -> tuple: """Bulk-replace journal entries + journal lines for one feature instance. We pre-build the line rows in memory keyed by the freshly minted entryId so a single ``execute_values`` call can persist all of them. """ import uuid as _uuid t0 = time.time() self._bulkClear(modelEntry, featureInstanceId) self._bulkClear(modelLine, featureInstanceId) entryRows: List[Dict[str, Any]] = [] lineRows: List[Dict[str, Any]] = [] for raw in rawEntries: entryId = str(_uuid.uuid4()) entryRows.append({ "id": entryId, "externalId": raw.get("externalId"), "bookingDate": raw.get("bookingDate"), "reference": raw.get("reference"), "description": raw.get("description", ""), "currency": raw.get("currency", "CHF"), "totalAmount": float(raw.get("totalAmount", 0)), **scope, }) for line in (raw.get("lines") or []): lineRows.append({ "journalEntryId": entryId, "accountNumber": line.get("accountNumber", ""), "debitAmount": float(line.get("debitAmount", 0)), "creditAmount": float(line.get("creditAmount", 0)), "currency": line.get("currency", "CHF"), "taxCode": line.get("taxCode"), "costCenter": line.get("costCenter"), "description": line.get("description", ""), **scope, }) if len(entryRows) % _HEARTBEAT_EVERY == 0: logger.info( f"Journal build progress: {len(entryRows)}/{len(rawEntries)} entries, " f"{len(lineRows)} lines so far" ) entriesCount = self._bulkCreate(modelEntry, entryRows) linesCount = self._bulkCreate(modelLine, lineRows) logger.info( f"Persisted {entriesCount} entries + {linesCount} lines for " f"{featureInstanceId} in {time.time() - t0:.1f}s" ) return entriesCount, linesCount def _persistContacts(self, customers: list, vendors: list, scope: Dict[str, Any], featureInstanceId: str, modelContact: Type) -> int: """Bulk-replace contacts (customers + vendors) for one feature instance.""" t0 = time.time() self._bulkClear(modelContact, featureInstanceId) rows = [self._mapContact(c, "customer", scope) for c in customers] rows += [self._mapContact(v, "vendor", scope) for v in vendors] n = self._bulkCreate(modelContact, rows) logger.info(f"Persisted {n} contacts for {featureInstanceId} in {time.time() - t0:.1f}s") return n def _persistBalances(self, featureInstanceId: str, mandateId: str, modelEntry: Type, modelLine: Type, modelBalance: Type) -> int: """Re-aggregate journal lines into monthly + annual balances.""" t0 = time.time() self._bulkClear(modelBalance, featureInstanceId) entries = self._if.db.getRecordset( modelEntry, recordFilter={"featureInstanceId": featureInstanceId}, ) or [] entryDates: Dict[str, str] = {} for e in entries: eid = e.get("id") if isinstance(e, dict) else getattr(e, "id", None) bdate = e.get("bookingDate") if isinstance(e, dict) else getattr(e, "bookingDate", None) if eid and bdate: entryDates[eid] = bdate lines = self._if.db.getRecordset( modelLine, recordFilter={"featureInstanceId": featureInstanceId}, ) or [] buckets: Dict[tuple, Dict[str, float]] = defaultdict(lambda: {"debit": 0.0, "credit": 0.0}) for ln in lines: if isinstance(ln, dict): jeid = ln.get("journalEntryId", "") accNo = ln.get("accountNumber", "") debit = float(ln.get("debitAmount", 0)) credit = float(ln.get("creditAmount", 0)) else: jeid = getattr(ln, "journalEntryId", "") accNo = getattr(ln, "accountNumber", "") debit = float(getattr(ln, "debitAmount", 0)) credit = float(getattr(ln, "creditAmount", 0)) bdate = entryDates.get(jeid, "") if not accNo or not bdate: continue parts = bdate.split("-") if len(parts) < 2: continue try: year = int(parts[0]) month = int(parts[1]) except ValueError: continue buckets[(accNo, year, month)]["debit"] += debit buckets[(accNo, year, month)]["credit"] += credit buckets[(accNo, year, 0)]["debit"] += debit buckets[(accNo, year, 0)]["credit"] += credit scope = {"featureInstanceId": featureInstanceId, "mandateId": mandateId} rows = [{ "accountNumber": accNo, "periodYear": year, "periodMonth": month, "openingBalance": 0.0, "debitTotal": round(totals["debit"], 2), "creditTotal": round(totals["credit"], 2), "closingBalance": round(totals["debit"] - totals["credit"], 2), "currency": "CHF", **scope, } for (accNo, year, month), totals in buckets.items()] n = self._bulkCreate(modelBalance, rows) logger.info( f"Persisted {n} balances for {featureInstanceId} in {time.time() - t0:.1f}s" ) return n # ===== Low-level bulk helpers ===== def _bulkClear(self, model: Type, featureInstanceId: str) -> int: """Delete every row for this feature instance in a single statement.""" try: return self._if.db.recordDeleteWhere( model, {"featureInstanceId": featureInstanceId} ) except AttributeError: # Backwards-compatible path if the connector hasn't been upgraded # yet. Logs a warning so we notice in dev/CI. logger.warning( "DatabaseConnector.recordDeleteWhere missing — falling back to slow per-row delete for %s", model.__name__, ) records = self._if.db.getRecordset( model, recordFilter={"featureInstanceId": featureInstanceId} ) or [] count = 0 for r in records: rid = r.get("id") if isinstance(r, dict) else getattr(r, "id", None) if rid: try: self._if.db.recordDelete(model, rid) count += 1 except Exception: pass return count def _bulkCreate(self, model: Type, rows: List[Dict[str, Any]]) -> int: """Insert all rows in a single transaction. Falls back to per-row insert only if the connector lacks ``recordCreateBulk`` (legacy).""" if not rows: return 0 try: return self._if.db.recordCreateBulk(model, rows) except AttributeError: logger.warning( "DatabaseConnector.recordCreateBulk missing — falling back to slow per-row insert for %s", model.__name__, ) n = 0 for i, r in enumerate(rows, start=1): try: self._if.db.recordCreate(model, r) n += 1 except Exception as ex: logger.warning(f"Per-row insert failed at {i}/{len(rows)}: {ex}") if i % _HEARTBEAT_EVERY == 0: logger.info(f"Per-row insert progress: {i}/{len(rows)} rows ({model.__name__})") return n # ===== Field helpers ===== @staticmethod def _safeStr(val: Any) -> str: """Convert a value to a safe string for DB storage, collapsing nested dicts/lists.""" if val is None: return "" if isinstance(val, (dict, list)): return "" return str(val) def _mapContact(self, raw: Dict[str, Any], contactType: str, scope: Dict[str, Any]) -> Dict[str, Any]: """Extract contact fields from a raw API dict, handling varying field names across connectors.""" s = self._safeStr return { "externalId": s(raw.get("id") or raw.get("Id") or raw.get("customer_nr") or raw.get("vendor_nr") or ""), "contactType": contactType, "contactNumber": s( raw.get("customernumber") or raw.get("customer_nr") or raw.get("vendornumber") or raw.get("vendor_nr") or raw.get("nr") or raw.get("ContactNumber") or raw.get("id") or "" ), "name": s(raw.get("name") or raw.get("Name") or raw.get("name_1") or ""), "address": s(raw.get("addr1") or raw.get("address") or raw.get("Address") or ""), "zip": s(raw.get("zipcode") or raw.get("postcode") or raw.get("Zip") or raw.get("zip") or ""), "city": s(raw.get("city") or raw.get("City") or ""), "country": s(raw.get("country") or raw.get("country_id") or raw.get("Country") or ""), "email": s(raw.get("email") or raw.get("mail") or raw.get("Email") or ""), "phone": s(raw.get("phone") or raw.get("phone_fixed") or raw.get("Phone") or ""), "vatNumber": s(raw.get("vat_identifier") or raw.get("vatNumber") or ""), **scope, }