gateway/modules/features/trustee/accounting/accountingDataSync.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Orchestrates importing accounting data from external systems into TrusteeData* tables.

Flow per phase:
  1. async fetch from external system (HTTP, awaits cleanly on the event loop)
  2. await asyncio.to_thread(...) for the DB write phase, so the heavy
     synchronous psycopg2 calls do NOT block the FastAPI event loop
  3. inside the worker thread we use bulk delete / bulk insert (single
     transaction per phase) instead of N+1 single-row operations

Why this matters: a typical accounting sync is ~10k-100k rows. The old
implementation called ``recordCreate`` row-by-row on the event loop, which
froze every other request (chat, health-check, etc.) for minutes. See
``local/notes/changelog.txt`` for the original incident analysis.
"""

import asyncio
import json as _json
import logging
import os
import time
from collections import defaultdict
from pathlib import Path
from typing import Callable, Dict, Any, List, Optional, Type

from .accountingConnectorBase import BaseAccountingConnector
from .accountingRegistry import _getAccountingRegistry

logger = logging.getLogger(__name__)


_HEARTBEAT_EVERY = 500


def _isDebugDumpEnabled() -> bool:
    """Whether to write raw connector payloads to disk for offline inspection.

    Controlled by ``APP_DEBUG_ACCOUNTING_SYNC_ENABLED``. Default False so that
    INT/PROD never spend disk/IO/RAM on dumping 7-figure JSON files. Mirrors
    the existing ``APP_DEBUG_CHAT_WORKFLOW_ENABLED`` pattern.
    """
    try:
        from modules.shared.configuration import APP_CONFIG
        raw = APP_CONFIG.get("APP_DEBUG_ACCOUNTING_SYNC_ENABLED", False)
        if isinstance(raw, bool):
            return raw
        return str(raw).strip().lower() in {"true", "1", "yes", "on"}
    except Exception:
        return False


def _resolveDebugDumpDir() -> Optional[Path]:
    """Resolve the debug dump directory. Returns None if dumping is disabled
    or the path could not be created."""
    if not _isDebugDumpEnabled():
        return None
    try:
        from modules.shared.configuration import APP_CONFIG
        configured = APP_CONFIG.get("APP_DEBUG_ACCOUNTING_SYNC_DIR", None)
        if not configured:
            return None
        path = Path(str(configured))
        if not path.is_absolute():
            gatewayDir = Path(__file__).resolve().parents[4]
            path = gatewayDir / configured
        path.mkdir(parents=True, exist_ok=True)
        return path
    except Exception as ex:
        logger.warning(f"Could not resolve debug dump dir: {ex}")
        return None


def _dumpSyncData(tag: str, rows: list) -> None:
    """Write raw connector data to a timestamped JSON file. No-op unless
    ``APP_DEBUG_ACCOUNTING_SYNC_ENABLED`` is true AND the configured dir
    resolves."""
    dumpDir = _resolveDebugDumpDir()
    if dumpDir is None:
        return
    try:
        ts = time.strftime("%Y%m%d-%H%M%S")
        path = dumpDir / f"{ts}_{tag}.json"
        serializable = []
        for r in rows:
            if isinstance(r, dict):
                serializable.append(r)
            elif hasattr(r, "__dict__"):
                serializable.append({k: v for k, v in r.__dict__.items() if not k.startswith("_")})
            else:
                serializable.append(str(r))
        with open(path, "w", encoding="utf-8") as f:
            _json.dump({"count": len(serializable), "rows": serializable}, f, ensure_ascii=False, indent=2, default=str)
        logger.info(f"Debug sync dump: {path.name} ({len(serializable)} rows)")
    except Exception as e:
        logger.warning(f"Failed to write debug sync dump for {tag}: {e}")


class AccountingDataSync:
    """Imports accounting data (read-only) from an external system into local TrusteeData* tables."""

    def __init__(self, trusteeInterface):
        self._if = trusteeInterface
        self._registry = _getAccountingRegistry()

    async def importData(
        self,
        featureInstanceId: str,
        mandateId: str,
        dateFrom: Optional[str] = None,
        dateTo: Optional[str] = None,
        progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    ) -> Dict[str, Any]:
        """Run a full data import for a feature instance.

        Returns a summary dict with counts per entity and any errors. All heavy
        DB work is offloaded to a worker thread via ``asyncio.to_thread`` so
        the event loop remains responsive for other requests.

        ``progressCb(percent, message)`` -- if provided, called at every phase
        boundary so the UI poll on ``GET /api/jobs/{jobId}`` shows real
        movement instead of jumping from 10 % to 100 %. Safe to omit.
        """
        def _progress(pct: int, msg: str) -> None:
            if progressCb is None:
                return
            try:
                progressCb(pct, msg)
            except Exception as ex:
                logger.warning(f"progressCb failed at {pct}%: {ex}")
        from modules.features.trustee.datamodelFeatureTrustee import (
            TrusteeAccountingConfig,
            TrusteeDataAccount,
            TrusteeDataJournalEntry,
            TrusteeDataJournalLine,
            TrusteeDataContact,
            TrusteeDataAccountBalance,
        )
        from modules.shared.configuration import decryptValue

        summary: Dict[str, Any] = {
            "accounts": 0,
            "journalEntries": 0,
            "journalLines": 0,
            "contacts": 0,
            "accountBalances": 0,
            "errors": [],
            "startedAt": time.time(),
        }

        cfgRecords = self._if.db.getRecordset(
            TrusteeAccountingConfig,
            recordFilter={"featureInstanceId": featureInstanceId, "isActive": True},
        )
        if not cfgRecords:
            summary["errors"].append("No active accounting configuration found")
            return summary

        cfgRecord = cfgRecords[0]
        connectorType = cfgRecord.get("connectorType", "")
        encryptedConfig = cfgRecord.get("encryptedConfig", "")

        try:
            plainJson = decryptValue(encryptedConfig)
            connConfig = _json.loads(plainJson) if plainJson else {}
        except Exception as e:
            summary["errors"].append(f"Failed to decrypt config: {e}")
            return summary

        connector = self._registry.getConnector(connectorType)
        if not connector:
            summary["errors"].append(f"Unknown connector type: {connectorType}")
            return summary

        scope = {"featureInstanceId": featureInstanceId, "mandateId": mandateId}
        logger.info(
            f"AccountingDataSync starting for {featureInstanceId}, "
            f"connector={connectorType}, dateFrom={dateFrom}, dateTo={dateTo}"
        )
        fetchedAccountNumbers: list = []

        # ---- Phase 1: Chart of accounts ----
        # Progress budget: 15-30 %. The fetch (15 %) is usually a single
        # snappy API call; the persist step (30 %) is bulk-insert and finishes
        # in <100 ms even for thousands of rows.
        try:
            _progress(15, "Lade Kontenplan...")
            charts = await connector.getChartOfAccounts(connConfig)
            _dumpSyncData("accounts", charts)
            fetchedAccountNumbers = [acc.accountNumber for acc in charts if acc.accountNumber]
            _progress(25, f"Speichere {len(charts)} Konten...")
            written = await asyncio.to_thread(
                self._persistAccounts, charts, scope, featureInstanceId, TrusteeDataAccount
            )
            summary["accounts"] = written
            _progress(30, f"{written} Konten gespeichert.")
        except Exception as e:
            logger.error(f"Import accounts failed: {e}", exc_info=True)
            summary["errors"].append(f"Accounts: {e}")

        # ---- Phase 2: Journal entries + lines ----
        # Progress budget: 35-65 %. Usually the longest phase; the external
        # API often paginates per account, so the fetch alone can take 30+ s.
        try:
            _progress(35, "Lade Journaleintraege vom Buchhaltungssystem...")
            rawEntries = await connector.getJournalEntries(
                connConfig,
                dateFrom=dateFrom,
                dateTo=dateTo,
                accountNumbers=fetchedAccountNumbers or None,
            )
            _dumpSyncData("journalEntries", rawEntries)
            _progress(60, f"Speichere {len(rawEntries)} Buchungssaetze...")
            entriesCount, linesCount = await asyncio.to_thread(
                self._persistJournal, rawEntries, scope, featureInstanceId,
                TrusteeDataJournalEntry, TrusteeDataJournalLine,
            )
            summary["journalEntries"] = entriesCount
            summary["journalLines"] = linesCount
            _progress(65, f"{entriesCount} Saetze + {linesCount} Buchungszeilen gespeichert.")
        except Exception as e:
            logger.error(f"Import journal entries failed: {e}", exc_info=True)
            summary["errors"].append(f"Journal entries: {e}")

        # ---- Phase 3: Contacts (customers + vendors) ----
        # Progress budget: 70-85 %. Two quick API calls + one bulk-insert.
        try:
            _progress(70, "Lade Kunden...")
            customers = await connector.getCustomers(connConfig)
            _dumpSyncData("customers", customers)
            _progress(78, "Lade Lieferanten...")
            vendors = await connector.getVendors(connConfig)
            _dumpSyncData("vendors", vendors)
            _progress(82, f"Speichere {len(customers) + len(vendors)} Kontakte...")
            contactCount = await asyncio.to_thread(
                self._persistContacts, customers, vendors, scope,
                featureInstanceId, TrusteeDataContact,
            )
            summary["contacts"] = contactCount
            _progress(85, f"{contactCount} Kontakte gespeichert.")
        except Exception as e:
            logger.error(f"Import contacts failed: {e}", exc_info=True)
            summary["errors"].append(f"Contacts: {e}")

        # ---- Phase 4: Compute account balances ----
        # Progress budget: 90-95 %. Pure DB aggregation, no external calls.
        try:
            _progress(90, "Berechne Kontensaldi...")
            balanceCount = await asyncio.to_thread(
                self._persistBalances, featureInstanceId, mandateId,
                TrusteeDataJournalEntry, TrusteeDataJournalLine, TrusteeDataAccountBalance,
            )
            summary["accountBalances"] = balanceCount
            _progress(95, f"{balanceCount} Saldi berechnet.")
        except Exception as e:
            logger.error(f"Compute balances failed: {e}", exc_info=True)
            summary["errors"].append(f"Balances: {e}")

        cfgId = cfgRecord.get("id")
        if cfgId:
            corePayload = {
                "lastSyncAt": time.time(),
                "lastSyncStatus": "success" if not summary["errors"] else "partial",
                "lastSyncErrorMessage": "; ".join(summary["errors"])[:500] if summary["errors"] else None,
            }
            try:
                self._if.db.recordModify(TrusteeAccountingConfig, cfgId, corePayload)
            except Exception as coreErr:
                logger.exception(f"AccountingDataSync: failed to write core lastSync* fields for cfg {cfgId}: {coreErr}")
                summary["errors"].append(f"Persist lastSync core: {coreErr}")
            extPayload = {
                "lastSyncDateFrom": dateFrom,
                "lastSyncDateTo": dateTo,
                "lastSyncCounts": {
                    "accounts": int(summary.get("accounts", 0)),
                    "journalEntries": int(summary.get("journalEntries", 0)),
                    "journalLines": int(summary.get("journalLines", 0)),
                    "contacts": int(summary.get("contacts", 0)),
                    "accountBalances": int(summary.get("accountBalances", 0)),
                },
            }
            try:
                self._if.db.recordModify(TrusteeAccountingConfig, cfgId, extPayload)
            except Exception as extErr:
                logger.exception(f"AccountingDataSync: failed to write extended lastSync* fields for cfg {cfgId}: {extErr}")
                summary["errors"].append(f"Persist lastSync ext: {extErr}")

        summary["finishedAt"] = time.time()
        summary["durationSeconds"] = round(summary["finishedAt"] - summary["startedAt"], 1)
        logger.info(
            f"AccountingDataSync completed for {featureInstanceId}: "
            f"{summary['accounts']} accounts, {summary['journalEntries']} entries, "
            f"{summary['journalLines']} lines, {summary['contacts']} contacts, "
            f"{summary['accountBalances']} balances, {len(summary['errors'])} errors, "
            f"{summary['durationSeconds']}s"
        )
        return summary

    # ===== Sync persistence helpers (run inside asyncio.to_thread) =====

    def _persistAccounts(self, charts: list, scope: Dict[str, Any],
                         featureInstanceId: str, modelAccount: Type) -> int:
        """Bulk-replace chart of accounts for one feature instance."""
        t0 = time.time()
        self._bulkClear(modelAccount, featureInstanceId)
        rows = [{
            "accountNumber": acc.accountNumber,
            "label": acc.label,
            "accountType": acc.accountType or "",
            "currency": "CHF",
            "isActive": True,
            **scope,
        } for acc in charts]
        n = self._bulkCreate(modelAccount, rows)
        logger.info(f"Persisted {n} accounts for {featureInstanceId} in {time.time() - t0:.1f}s")
        return n

    def _persistJournal(self, rawEntries: list, scope: Dict[str, Any],
                        featureInstanceId: str, modelEntry: Type, modelLine: Type) -> tuple:
        """Bulk-replace journal entries + journal lines for one feature instance.

        We pre-build the line rows in memory keyed by the freshly minted entryId
        so a single ``execute_values`` call can persist all of them.
        """
        import uuid as _uuid
        t0 = time.time()
        self._bulkClear(modelEntry, featureInstanceId)
        self._bulkClear(modelLine, featureInstanceId)

        entryRows: List[Dict[str, Any]] = []
        lineRows: List[Dict[str, Any]] = []
        for raw in rawEntries:
            entryId = str(_uuid.uuid4())
            entryRows.append({
                "id": entryId,
                "externalId": raw.get("externalId"),
                "bookingDate": raw.get("bookingDate"),
                "reference": raw.get("reference"),
                "description": raw.get("description", ""),
                "currency": raw.get("currency", "CHF"),
                "totalAmount": float(raw.get("totalAmount", 0)),
                **scope,
            })
            for line in (raw.get("lines") or []):
                lineRows.append({
                    "journalEntryId": entryId,
                    "accountNumber": line.get("accountNumber", ""),
                    "debitAmount": float(line.get("debitAmount", 0)),
                    "creditAmount": float(line.get("creditAmount", 0)),
                    "currency": line.get("currency", "CHF"),
                    "taxCode": line.get("taxCode"),
                    "costCenter": line.get("costCenter"),
                    "description": line.get("description", ""),
                    **scope,
                })
            if len(entryRows) % _HEARTBEAT_EVERY == 0:
                logger.info(
                    f"Journal build progress: {len(entryRows)}/{len(rawEntries)} entries, "
                    f"{len(lineRows)} lines so far"
                )

        entriesCount = self._bulkCreate(modelEntry, entryRows)
        linesCount = self._bulkCreate(modelLine, lineRows)
        logger.info(
            f"Persisted {entriesCount} entries + {linesCount} lines for "
            f"{featureInstanceId} in {time.time() - t0:.1f}s"
        )
        return entriesCount, linesCount

    def _persistContacts(self, customers: list, vendors: list, scope: Dict[str, Any],
                         featureInstanceId: str, modelContact: Type) -> int:
        """Bulk-replace contacts (customers + vendors) for one feature instance."""
        t0 = time.time()
        self._bulkClear(modelContact, featureInstanceId)
        rows = [self._mapContact(c, "customer", scope) for c in customers]
        rows += [self._mapContact(v, "vendor", scope) for v in vendors]
        n = self._bulkCreate(modelContact, rows)
        logger.info(f"Persisted {n} contacts for {featureInstanceId} in {time.time() - t0:.1f}s")
        return n

    def _persistBalances(self, featureInstanceId: str, mandateId: str,
                         modelEntry: Type, modelLine: Type, modelBalance: Type) -> int:
        """Re-aggregate journal lines into monthly + annual balances."""
        t0 = time.time()
        self._bulkClear(modelBalance, featureInstanceId)

        entries = self._if.db.getRecordset(
            modelEntry, recordFilter={"featureInstanceId": featureInstanceId},
        ) or []
        entryDates: Dict[str, str] = {}
        for e in entries:
            eid = e.get("id") if isinstance(e, dict) else getattr(e, "id", None)
            bdate = e.get("bookingDate") if isinstance(e, dict) else getattr(e, "bookingDate", None)
            if eid and bdate:
                entryDates[eid] = bdate

        lines = self._if.db.getRecordset(
            modelLine, recordFilter={"featureInstanceId": featureInstanceId},
        ) or []

        buckets: Dict[tuple, Dict[str, float]] = defaultdict(lambda: {"debit": 0.0, "credit": 0.0})
        for ln in lines:
            if isinstance(ln, dict):
                jeid = ln.get("journalEntryId", "")
                accNo = ln.get("accountNumber", "")
                debit = float(ln.get("debitAmount", 0))
                credit = float(ln.get("creditAmount", 0))
            else:
                jeid = getattr(ln, "journalEntryId", "")
                accNo = getattr(ln, "accountNumber", "")
                debit = float(getattr(ln, "debitAmount", 0))
                credit = float(getattr(ln, "creditAmount", 0))

            bdate = entryDates.get(jeid, "")
            if not accNo or not bdate:
                continue
            parts = bdate.split("-")
            if len(parts) < 2:
                continue
            try:
                year = int(parts[0])
                month = int(parts[1])
            except ValueError:
                continue

            buckets[(accNo, year, month)]["debit"] += debit
            buckets[(accNo, year, month)]["credit"] += credit
            buckets[(accNo, year, 0)]["debit"] += debit
            buckets[(accNo, year, 0)]["credit"] += credit

        scope = {"featureInstanceId": featureInstanceId, "mandateId": mandateId}
        rows = [{
            "accountNumber": accNo,
            "periodYear": year,
            "periodMonth": month,
            "openingBalance": 0.0,
            "debitTotal": round(totals["debit"], 2),
            "creditTotal": round(totals["credit"], 2),
            "closingBalance": round(totals["debit"] - totals["credit"], 2),
            "currency": "CHF",
            **scope,
        } for (accNo, year, month), totals in buckets.items()]
        n = self._bulkCreate(modelBalance, rows)
        logger.info(
            f"Persisted {n} balances for {featureInstanceId} in {time.time() - t0:.1f}s"
        )
        return n

    # ===== Low-level bulk helpers =====

    def _bulkClear(self, model: Type, featureInstanceId: str) -> int:
        """Delete every row for this feature instance in a single statement."""
        try:
            return self._if.db.recordDeleteWhere(
                model, {"featureInstanceId": featureInstanceId}
            )
        except AttributeError:
            # Backwards-compatible path if the connector hasn't been upgraded
            # yet. Logs a warning so we notice in dev/CI.
            logger.warning(
                "DatabaseConnector.recordDeleteWhere missing — falling back to slow per-row delete for %s",
                model.__name__,
            )
            records = self._if.db.getRecordset(
                model, recordFilter={"featureInstanceId": featureInstanceId}
            ) or []
            count = 0
            for r in records:
                rid = r.get("id") if isinstance(r, dict) else getattr(r, "id", None)
                if rid:
                    try:
                        self._if.db.recordDelete(model, rid)
                        count += 1
                    except Exception:
                        pass
            return count

    def _bulkCreate(self, model: Type, rows: List[Dict[str, Any]]) -> int:
        """Insert all rows in a single transaction. Falls back to per-row
        insert only if the connector lacks ``recordCreateBulk`` (legacy)."""
        if not rows:
            return 0
        try:
            return self._if.db.recordCreateBulk(model, rows)
        except AttributeError:
            logger.warning(
                "DatabaseConnector.recordCreateBulk missing — falling back to slow per-row insert for %s",
                model.__name__,
            )
            n = 0
            for i, r in enumerate(rows, start=1):
                try:
                    self._if.db.recordCreate(model, r)
                    n += 1
                except Exception as ex:
                    logger.warning(f"Per-row insert failed at {i}/{len(rows)}: {ex}")
                if i % _HEARTBEAT_EVERY == 0:
                    logger.info(f"Per-row insert progress: {i}/{len(rows)} rows ({model.__name__})")
            return n

    # ===== Field helpers =====

    @staticmethod
    def _safeStr(val: Any) -> str:
        """Convert a value to a safe string for DB storage, collapsing nested dicts/lists."""
        if val is None:
            return ""
        if isinstance(val, (dict, list)):
            return ""
        return str(val)

    def _mapContact(self, raw: Dict[str, Any], contactType: str, scope: Dict[str, Any]) -> Dict[str, Any]:
        """Extract contact fields from a raw API dict, handling varying field names across connectors."""
        s = self._safeStr
        return {
            "externalId": s(raw.get("id") or raw.get("Id") or raw.get("customer_nr") or raw.get("vendor_nr") or ""),
            "contactType": contactType,
            "contactNumber": s(
                raw.get("customernumber") or raw.get("customer_nr")
                or raw.get("vendornumber") or raw.get("vendor_nr")
                or raw.get("nr") or raw.get("ContactNumber")
                or raw.get("id") or ""
            ),
            "name": s(raw.get("name") or raw.get("Name") or raw.get("name_1") or ""),
            "address": s(raw.get("addr1") or raw.get("address") or raw.get("Address") or ""),
            "zip": s(raw.get("zipcode") or raw.get("postcode") or raw.get("Zip") or raw.get("zip") or ""),
            "city": s(raw.get("city") or raw.get("City") or ""),
            "country": s(raw.get("country") or raw.get("country_id") or raw.get("Country") or ""),
            "email": s(raw.get("email") or raw.get("mail") or raw.get("Email") or ""),
            "phone": s(raw.get("phone") or raw.get("phone_fixed") or raw.get("Phone") or ""),
            "vatNumber": s(raw.get("vat_identifier") or raw.get("vatNumber") or ""),
            **scope,
        }