226 lines
7.1 KiB
YAML
226 lines
7.1 KiB
YAML
# Trustee Sub-Agent Benchmark -- 19 questions modeled on Hein 2025
|
|
#
|
|
# Each question covers ONE expected hallucination class so we can attribute
|
|
# accuracy gains to specific phases (validator / ontology).
|
|
#
|
|
# Scoring per question (all binary unless noted):
|
|
# patternOk -- did the agent call the right tool(s) with the right filters?
|
|
# forbidOk -- did it AVOID the forbidden tool/op (e.g. SUM closingBalance)?
|
|
# numericOk -- does the final answer contain the expected number(s)?
|
|
# accuracyOk -- patternOk AND forbidOk AND numericOk
|
|
#
|
|
# numericTolerance: per-question relative tolerance for numeric comparison (default 0.005 = 0.5 %).
|
|
|
|
- id: q01
|
|
question: "Was ist der Banksaldo per 31.12.2025 fuer das ZKB-Konto 1020?"
|
|
intent: BANK_BALANCE_AT_DATE
|
|
expectedTools: [queryTable]
|
|
expectedTable: TrusteeDataAccountBalance
|
|
requiredFilters:
|
|
accountNumber: "1020"
|
|
periodYear: 2025
|
|
periodMonth: 0
|
|
forbiddenTools: [aggregateTable]
|
|
expectedNumbers: [152400.0]
|
|
|
|
- id: q02
|
|
question: "Wie hoch ist die Hauptkasse (Konto 1000) per Ende 2024?"
|
|
intent: CASH_BALANCE_AT_DATE
|
|
expectedTools: [queryTable]
|
|
expectedTable: TrusteeDataAccountBalance
|
|
requiredFilters:
|
|
accountNumber: "1000"
|
|
periodYear: 2024
|
|
periodMonth: 0
|
|
forbiddenTools: [aggregateTable]
|
|
expectedNumbers: [5200.0]
|
|
|
|
- id: q03
|
|
question: "Summiere alle Bankkonten (102x) per 31.12.2025."
|
|
intent: BANK_GROUP_TOTAL_AT_DATE
|
|
expectedTools: [queryTable]
|
|
expectedTable: TrusteeDataAccountBalance
|
|
requiredFilters:
|
|
periodYear: 2025
|
|
periodMonth: 0
|
|
accountNumberLike: "102%"
|
|
forbiddenTools: [aggregateTable]
|
|
expectedNumbers: [180500.0]
|
|
numericTolerance: 0.01
|
|
|
|
- id: q04
|
|
question: "Wie hat sich der Schlusssaldo des ZKB-Kontos 1020 ueber die Jahre 2023 bis 2025 entwickelt?"
|
|
intent: BALANCE_HISTORY_PER_YEAR
|
|
expectedTools: [queryTable]
|
|
expectedTable: TrusteeDataAccountBalance
|
|
requiredFilters:
|
|
accountNumber: "1020"
|
|
periodMonth: 0
|
|
forbiddenTools: [aggregateTable]
|
|
expectedNumbers: [132500.0, 148900.0, 152400.0]
|
|
|
|
- id: q05
|
|
question: "Welches Konto hatte 2025 den hoechsten Schlusssaldo bei den Aktiven (1xxx)?"
|
|
intent: TOP_ASSET_AT_DATE
|
|
expectedTools: [queryTable]
|
|
expectedTable: TrusteeDataAccountBalance
|
|
requiredFilters:
|
|
periodYear: 2025
|
|
periodMonth: 0
|
|
accountNumberLike: "1%"
|
|
forbiddenTools: [aggregateTable]
|
|
expectedAnswerContains: ["1020"]
|
|
expectedNumbers: [152400.0]
|
|
|
|
- id: q06
|
|
question: "Welche Konten gehoeren zu den Bankkonten (102x)?"
|
|
intent: ACCOUNT_LIST_FILTER
|
|
expectedTools: [queryTable]
|
|
expectedTable: TrusteeDataAccount
|
|
requiredFilters:
|
|
accountNumberLike: "102%"
|
|
forbiddenTools: [aggregateTable]
|
|
expectedAnswerContains: ["1020", "1021"]
|
|
|
|
- id: q07
|
|
question: "Wie hoch war der Materialaufwand (Konto 5400) im Jahr 2025?"
|
|
intent: EXPENSE_AT_YEAR
|
|
expectedTools: [queryTable]
|
|
expectedTable: TrusteeDataAccountBalance
|
|
requiredFilters:
|
|
accountNumber: "5400"
|
|
periodYear: 2025
|
|
periodMonth: 0
|
|
forbiddenTools: [aggregateTable]
|
|
expectedNumbers: [112100.0]
|
|
|
|
- id: q08
|
|
question: "Wie viele Buchungszeilen gibt es insgesamt im System?"
|
|
intent: COUNT_ROWS
|
|
expectedTools: [aggregateTable]
|
|
expectedTable: TrusteeDataJournalLine
|
|
expectedAggregate: COUNT
|
|
forbiddenTools: []
|
|
expectedNumbers: [20]
|
|
|
|
- id: q09
|
|
question: "Wie hoch ist der gesamte Beratungsertrag (Konto 3000) im Jahr 2025?"
|
|
intent: REVENUE_AT_YEAR
|
|
expectedTools: [queryTable]
|
|
expectedTable: TrusteeDataAccountBalance
|
|
requiredFilters:
|
|
accountNumber: "3000"
|
|
periodYear: 2025
|
|
periodMonth: 0
|
|
forbiddenTools: [aggregateTable]
|
|
expectedNumbers: [488700.0]
|
|
|
|
- id: q10
|
|
question: "Wie viel wurde 2025 auf das Materialaufwand-Konto 5400 gebucht (Soll-Summe ueber Buchungszeilen)?"
|
|
intent: JOURNAL_SUM_AT_ACCOUNT
|
|
expectedTools: [aggregateTable]
|
|
expectedTable: TrusteeDataJournalLine
|
|
expectedAggregate: SUM
|
|
expectedAggregateField: debitAmount
|
|
requiredFilters:
|
|
accountNumber: "5400"
|
|
forbiddenTools: []
|
|
expectedNumbers: [29400.0]
|
|
numericTolerance: 0.01
|
|
|
|
- id: q11
|
|
question: "Welche Buchungen im 1. Quartal 2025 (Januar bis Maerz) wurden auf Konto 3000 gebucht?"
|
|
intent: JOURNAL_LINES_BY_ACCOUNT
|
|
expectedTools: [queryTable]
|
|
expectedTable: TrusteeDataJournalLine
|
|
requiredFilters:
|
|
accountNumber: "3000"
|
|
forbiddenTools: [aggregateTable]
|
|
expectedAnswerContains: ["18500", "ACME"]
|
|
|
|
- id: q12
|
|
question: "Wie hoch war die Hauptkasse (Konto 1000) jeweils per Ende Maerz 2025 und per Ende Juni 2025?"
|
|
intent: MULTI_MONTH_SNAPSHOT
|
|
expectedTools: [queryTable]
|
|
expectedTable: TrusteeDataAccountBalance
|
|
requiredFilters:
|
|
accountNumber: "1000"
|
|
periodYear: 2025
|
|
forbiddenTools: [aggregateTable]
|
|
expectedNumbers: [5375.0, 5550.0]
|
|
numericTolerance: 0.01
|
|
|
|
- id: q13
|
|
question: "Wie hoch ist die Summe aller Aufwandskonten (5xxx und 6xxx) per Ende 2025?"
|
|
intent: EXPENSE_GROUP_TOTAL
|
|
expectedTools: [queryTable]
|
|
expectedTable: TrusteeDataAccountBalance
|
|
requiredFilters:
|
|
periodYear: 2025
|
|
periodMonth: 0
|
|
forbiddenTools: [aggregateTable]
|
|
expectedNumbers: [154100.0]
|
|
numericTolerance: 0.01
|
|
|
|
- id: q14
|
|
question: "Welches Konto hat den hoechsten openingBalance fuer 2025?"
|
|
intent: TOP_OPENING_BALANCE
|
|
# Both routes are legitimate: queryTable+orderBy+limit=1, or
|
|
# aggregateTable(MAX) followed by queryTable lookup. We only insist that
|
|
# the final answer names the right account and (optionally) the value.
|
|
expectedTools: [queryTable, aggregateTable]
|
|
expectedTable: TrusteeDataAccountBalance
|
|
forbiddenTools: []
|
|
expectedAnswerContains: ["3000"]
|
|
expectedNumbers: [462500.0]
|
|
|
|
- id: q15
|
|
question: "Liste alle Konten vom Typ asset auf."
|
|
intent: ACCOUNTS_BY_TYPE
|
|
expectedTools: [queryTable]
|
|
expectedTable: TrusteeDataAccount
|
|
requiredFilters:
|
|
accountType: "asset"
|
|
forbiddenTools: [aggregateTable]
|
|
expectedAnswerContains: ["1000", "1020", "1021", "1100"]
|
|
|
|
- id: q16
|
|
question: "Wie hoch ist der Schlusssaldo der Forderungen aus Lieferungen und Leistungen (Konto 1100) per Ende 2025?"
|
|
intent: BALANCE_BY_NAME_LOOKUP
|
|
expectedTools: [queryTable]
|
|
expectedTable: TrusteeDataAccountBalance
|
|
requiredFilters:
|
|
accountNumber: "1100"
|
|
periodYear: 2025
|
|
periodMonth: 0
|
|
forbiddenTools: [aggregateTable]
|
|
expectedNumbers: [66800.0]
|
|
|
|
- id: q17
|
|
question: "Wie hoch waren die Verbindlichkeiten (Konto 2000) jeweils per Ende 2023, 2024 und 2025?"
|
|
intent: LIABILITY_HISTORY
|
|
expectedTools: [queryTable]
|
|
expectedTable: TrusteeDataAccountBalance
|
|
requiredFilters:
|
|
accountNumber: "2000"
|
|
periodMonth: 0
|
|
forbiddenTools: [aggregateTable]
|
|
expectedNumbers: [41300.0, 44100.0, 47900.0]
|
|
|
|
- id: q18
|
|
question: "Wie viele Bankkonten gibt es im Kontenplan (102x)?"
|
|
intent: ACCOUNT_COUNT_BY_PREFIX
|
|
expectedTools: [queryTable, aggregateTable]
|
|
expectedTable: TrusteeDataAccount
|
|
requiredFilters:
|
|
accountNumberLike: "102%"
|
|
forbiddenTools: []
|
|
expectedNumbers: [2]
|
|
|
|
- id: q19
|
|
question: "Gib mir alle Buchungszeilen mit einem Sollbetrag groesser als 20'000 CHF."
|
|
intent: JOURNAL_LINES_BY_AMOUNT
|
|
expectedTools: [queryTable]
|
|
expectedTable: TrusteeDataJournalLine
|
|
forbiddenTools: [aggregateTable]
|
|
expectedAnswerContains: ["24300", "32100", "41500", "28700"]
|