# Trustee Sub-Agent Benchmark -- 19 questions, modeled on Hein 2025.
#
# Every question targets exactly ONE expected hallucination class, so that
# accuracy gains can be attributed to a specific phase (validator / ontology).
#
# Per-question scores (binary unless noted otherwise):
#   patternOk  -- the agent called the right tool(s) with the right filters
#   forbidOk   -- the agent AVOIDED the forbidden tool/op
#                 (e.g. SUM closingBalance)
#   numericOk  -- the final answer contains the expected number(s)
#   accuracyOk -- patternOk AND forbidOk AND numericOk
#
# Numeric comparison uses a relative tolerance (default 0.005 = 0.5 %).
# NOTE(review): questions that set `numericTolerance` presumably override
# that default -- confirm against the scorer.
#
# The `question` strings are the German prompts and are part of the data;
# the English comment above each entry is only a reading aid.

# Bank balance of ZKB account 1020 as of 31.12.2025.
- id: q01
  question: 'Was ist der Banksaldo per 31.12.2025 fuer das ZKB-Konto 1020?'
  intent: BANK_BALANCE_AT_DATE
  expectedTools:
    - queryTable
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: '1020'
    periodYear: 2025
    periodMonth: 0
  forbiddenTools:
    - aggregateTable
  expectedNumbers:
    - 152400.0

# Main cash account (1000) at the end of 2024.
- id: q02
  question: 'Wie hoch ist die Hauptkasse (Konto 1000) per Ende 2024?'
  intent: CASH_BALANCE_AT_DATE
  expectedTools:
    - queryTable
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: '1000'
    periodYear: 2024
    periodMonth: 0
  forbiddenTools:
    - aggregateTable
  expectedNumbers:
    - 5200.0

# Total of all bank accounts (102x) as of 31.12.2025.
- id: q03
  question: 'Summiere alle Bankkonten (102x) per 31.12.2025.'
  intent: BANK_GROUP_TOTAL_AT_DATE
  expectedTools:
    - queryTable
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    periodYear: 2025
    periodMonth: 0
    accountNumberLike: '102%'
  forbiddenTools:
    - aggregateTable
  expectedNumbers:
    - 180500.0
  numericTolerance: 0.01

# Development of the closing balance of ZKB account 1020 over 2023-2025.
- id: q04
  question: 'Wie hat sich der Schlusssaldo des ZKB-Kontos 1020 ueber die Jahre 2023 bis 2025 entwickelt?'
  intent: BALANCE_HISTORY_PER_YEAR
  expectedTools:
    - queryTable
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: '1020'
    periodMonth: 0
  forbiddenTools:
    - aggregateTable
  expectedNumbers:
    - 132500.0
    - 148900.0
    - 152400.0

# Asset account (1xxx) with the highest closing balance in 2025.
- id: q05
  question: 'Welches Konto hatte 2025 den hoechsten Schlusssaldo bei den Aktiven (1xxx)?'
  intent: TOP_ASSET_AT_DATE
  expectedTools:
    - queryTable
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    periodYear: 2025
    periodMonth: 0
    accountNumberLike: '1%'
  forbiddenTools:
    - aggregateTable
  expectedAnswerContains:
    - '1020'
  expectedNumbers:
    - 152400.0

# Which accounts belong to the bank accounts (102x)?
- id: q06
  question: 'Welche Konten gehoeren zu den Bankkonten (102x)?'
  intent: ACCOUNT_LIST_FILTER
  expectedTools:
    - queryTable
  expectedTable: TrusteeDataAccount
  requiredFilters:
    accountNumberLike: '102%'
  forbiddenTools:
    - aggregateTable
  expectedAnswerContains:
    - '1020'
    - '1021'

# Material expense (account 5400) in 2025.
- id: q07
  question: 'Wie hoch war der Materialaufwand (Konto 5400) im Jahr 2025?'
  intent: EXPENSE_AT_YEAR
  expectedTools:
    - queryTable
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: '5400'
    periodYear: 2025
    periodMonth: 0
  forbiddenTools:
    - aggregateTable
  expectedNumbers:
    - 112100.0

# Total number of journal lines in the system.
- id: q08
  question: 'Wie viele Buchungszeilen gibt es insgesamt im System?'
  intent: COUNT_ROWS
  expectedTools:
    - aggregateTable
  expectedTable: TrusteeDataJournalLine
  expectedAggregate: COUNT
  forbiddenTools: []
  expectedNumbers:
    - 20

# Total consulting revenue (account 3000) in 2025.
- id: q09
  question: 'Wie hoch ist der gesamte Beratungsertrag (Konto 3000) im Jahr 2025?'
  intent: REVENUE_AT_YEAR
  expectedTools:
    - queryTable
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: '3000'
    periodYear: 2025
    periodMonth: 0
  forbiddenTools:
    - aggregateTable
  expectedNumbers:
    - 488700.0

# Amount posted to material expense account 5400 in 2025
# (debit sum over journal lines).
- id: q10
  question: 'Wie viel wurde 2025 auf das Materialaufwand-Konto 5400 gebucht (Soll-Summe ueber Buchungszeilen)?'
  intent: JOURNAL_SUM_AT_ACCOUNT
  expectedTools:
    - aggregateTable
  expectedTable: TrusteeDataJournalLine
  expectedAggregate: SUM
  expectedAggregateField: debitAmount
  requiredFilters:
    accountNumber: '5400'
  forbiddenTools: []
  expectedNumbers:
    - 29400.0
  numericTolerance: 0.01

# Postings to account 3000 in Q1 2025 (January to March).
- id: q11
  question: 'Welche Buchungen im 1. Quartal 2025 (Januar bis Maerz) wurden auf Konto 3000 gebucht?'
  intent: JOURNAL_LINES_BY_ACCOUNT
  expectedTools:
    - queryTable
  expectedTable: TrusteeDataJournalLine
  requiredFilters:
    accountNumber: '3000'
  forbiddenTools:
    - aggregateTable
  expectedAnswerContains:
    - '18500'
    - 'ACME'

# Main cash account (1000) at the end of March 2025 and end of June 2025.
- id: q12
  question: 'Wie hoch war die Hauptkasse (Konto 1000) jeweils per Ende Maerz 2025 und per Ende Juni 2025?'
  intent: MULTI_MONTH_SNAPSHOT
  expectedTools:
    - queryTable
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: '1000'
    periodYear: 2025
  forbiddenTools:
    - aggregateTable
  expectedNumbers:
    - 5375.0
    - 5550.0
  numericTolerance: 0.01

# Sum of all expense accounts (5xxx and 6xxx) at the end of 2025.
- id: q13
  question: 'Wie hoch ist die Summe aller Aufwandskonten (5xxx und 6xxx) per Ende 2025?'
  intent: EXPENSE_GROUP_TOTAL
  expectedTools:
    - queryTable
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    periodYear: 2025
    periodMonth: 0
  forbiddenTools:
    - aggregateTable
  expectedNumbers:
    - 154100.0
  numericTolerance: 0.01

# Account with the highest openingBalance for 2025.
- id: q14
  question: 'Welches Konto hat den hoechsten openingBalance fuer 2025?'
  intent: TOP_OPENING_BALANCE
  # Two routes are acceptable here: queryTable + orderBy + limit=1, or
  # aggregateTable(MAX) followed by a queryTable lookup. The only hard
  # requirement is that the final answer names the right account and
  # (optionally) the value.
  expectedTools:
    - queryTable
    - aggregateTable
  expectedTable: TrusteeDataAccountBalance
  forbiddenTools: []
  expectedAnswerContains:
    - '3000'
  expectedNumbers:
    - 462500.0

# List all accounts of type asset.
- id: q15
  question: 'Liste alle Konten vom Typ asset auf.'
  intent: ACCOUNTS_BY_TYPE
  expectedTools:
    - queryTable
  expectedTable: TrusteeDataAccount
  requiredFilters:
    accountType: 'asset'
  forbiddenTools:
    - aggregateTable
  expectedAnswerContains:
    - '1000'
    - '1020'
    - '1021'
    - '1100'

# Closing balance of trade receivables (account 1100) at the end of 2025.
- id: q16
  question: 'Wie hoch ist der Schlusssaldo der Forderungen aus Lieferungen und Leistungen (Konto 1100) per Ende 2025?'
  intent: BALANCE_BY_NAME_LOOKUP
  expectedTools:
    - queryTable
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: '1100'
    periodYear: 2025
    periodMonth: 0
  forbiddenTools:
    - aggregateTable
  expectedNumbers:
    - 66800.0

# Liabilities (account 2000) at the end of 2023, 2024 and 2025.
- id: q17
  question: 'Wie hoch waren die Verbindlichkeiten (Konto 2000) jeweils per Ende 2023, 2024 und 2025?'
  intent: LIABILITY_HISTORY
  expectedTools:
    - queryTable
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: '2000'
    periodMonth: 0
  forbiddenTools:
    - aggregateTable
  expectedNumbers:
    - 41300.0
    - 44100.0
    - 47900.0

# Number of bank accounts (102x) in the chart of accounts.
- id: q18
  question: 'Wie viele Bankkonten gibt es im Kontenplan (102x)?'
  intent: ACCOUNT_COUNT_BY_PREFIX
  expectedTools:
    - queryTable
    - aggregateTable
  expectedTable: TrusteeDataAccount
  requiredFilters:
    accountNumberLike: '102%'
  forbiddenTools: []
  expectedNumbers:
    - 2

# All journal lines with a debit amount greater than 20'000 CHF.
# (Double quotes are kept below because the text contains an apostrophe.)
- id: q19
  question: "Gib mir alle Buchungszeilen mit einem Sollbetrag groesser als 20'000 CHF."
  intent: JOURNAL_LINES_BY_AMOUNT
  expectedTools:
    - queryTable
  expectedTable: TrusteeDataJournalLine
  forbiddenTools:
    - aggregateTable
  expectedAnswerContains:
    - '24300'
    - '32100'
    - '41500'
    - '28700'