platform-core/tests/fixtures/trusteeBenchmark/questions.yaml
2026-05-16 22:55:43 +02:00

226 lines
7.1 KiB
YAML

# Trustee Sub-Agent Benchmark -- 19 questions, analogous to Hein 2025
#
# Each question covers ONE expected hallucination class so we can attribute
# accuracy gains to specific phases (validator / ontology).
#
# Scoring per question (all binary unless noted):
# patternOk -- did the agent call the right tool(s) with the right filters?
# forbidOk -- did it AVOID the forbidden tool/op (e.g. SUM closingBalance)?
# numericOk -- does the final answer contain the expected number(s)?
# accuracyOk -- patternOk AND forbidOk AND numericOk
#
# numericTolerance: optional per-question relative tolerance for the numeric
# comparison (default 0.005 = 0.5 %).
# q01 -- "What is the bank balance as of 31.12.2025 for ZKB account 1020?"
# Single-account lookup via queryTable; aggregateTable is forbidden.
# NOTE(review): periodMonth 0 presumably selects the year-end row -- confirm
# against the fixture data.
- id: q01
  question: "Was ist der Banksaldo per 31.12.2025 fuer das ZKB-Konto 1020?"
  intent: BANK_BALANCE_AT_DATE
  expectedTools: [queryTable]
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: "1020"
    periodYear: 2025
    periodMonth: 0
  forbiddenTools: [aggregateTable]
  expectedNumbers: [152400.0]
# q02 -- "How high is the main cash account (account 1000) as of end of 2024?"
# Single-account lookup via queryTable; aggregateTable is forbidden.
- id: q02
  question: "Wie hoch ist die Hauptkasse (Konto 1000) per Ende 2024?"
  intent: CASH_BALANCE_AT_DATE
  expectedTools: [queryTable]
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: "1000"
    periodYear: 2024
    periodMonth: 0
  forbiddenTools: [aggregateTable]
  expectedNumbers: [5200.0]
# q03 -- "Sum all bank accounts (102x) as of 31.12.2025."
# The group is selected by account-number prefix; aggregateTable is forbidden,
# so the rows must be fetched with queryTable. Uses a 1 % numeric tolerance.
- id: q03
  question: "Summiere alle Bankkonten (102x) per 31.12.2025."
  intent: BANK_GROUP_TOTAL_AT_DATE
  expectedTools: [queryTable]
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    periodYear: 2025
    periodMonth: 0
    accountNumberLike: "102%"
  forbiddenTools: [aggregateTable]
  expectedNumbers: [180500.0]
  numericTolerance: 0.01
# q04 -- "How did the closing balance of ZKB account 1020 develop over the
# years 2023 to 2025?"
# Three expected numbers are listed.
# NOTE(review): no periodYear filter is required, presumably because the
# question spans several years -- confirm.
- id: q04
  question: "Wie hat sich der Schlusssaldo des ZKB-Kontos 1020 ueber die Jahre 2023 bis 2025 entwickelt?"
  intent: BALANCE_HISTORY_PER_YEAR
  expectedTools: [queryTable]
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: "1020"
    periodMonth: 0
  forbiddenTools: [aggregateTable]
  expectedNumbers: [132500.0, 148900.0, 152400.0]
# q05 -- "Which asset account (1xxx) had the highest closing balance in 2025?"
# The answer must name the account ("1020") and contain its balance.
- id: q05
  question: "Welches Konto hatte 2025 den hoechsten Schlusssaldo bei den Aktiven (1xxx)?"
  intent: TOP_ASSET_AT_DATE
  expectedTools: [queryTable]
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    periodYear: 2025
    periodMonth: 0
    accountNumberLike: "1%"
  forbiddenTools: [aggregateTable]
  expectedAnswerContains: ["1020"]
  expectedNumbers: [152400.0]
# q06 -- "Which accounts belong to the bank accounts (102x)?"
# Targets the account table (not the balance table); checked via answer
# substrings only, no expected numbers.
- id: q06
  question: "Welche Konten gehoeren zu den Bankkonten (102x)?"
  intent: ACCOUNT_LIST_FILTER
  expectedTools: [queryTable]
  expectedTable: TrusteeDataAccount
  requiredFilters:
    accountNumberLike: "102%"
  forbiddenTools: [aggregateTable]
  expectedAnswerContains: ["1020", "1021"]
# q07 -- "How high was the material expense (account 5400) in 2025?"
# Expected to be read from the balance table; aggregateTable is forbidden.
- id: q07
  question: "Wie hoch war der Materialaufwand (Konto 5400) im Jahr 2025?"
  intent: EXPENSE_AT_YEAR
  expectedTools: [queryTable]
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: "5400"
    periodYear: 2025
    periodMonth: 0
  forbiddenTools: [aggregateTable]
  expectedNumbers: [112100.0]
# q08 -- "How many journal lines are there in total in the system?"
# aggregateTable with COUNT is the expected route; nothing is forbidden and
# no filters are required.
- id: q08
  question: "Wie viele Buchungszeilen gibt es insgesamt im System?"
  intent: COUNT_ROWS
  expectedTools: [aggregateTable]
  expectedTable: TrusteeDataJournalLine
  expectedAggregate: COUNT
  forbiddenTools: []
  expectedNumbers: [20]
# q09 -- "How high is the total consulting revenue (account 3000) in 2025?"
# Expected to be read from the balance table; aggregateTable is forbidden.
- id: q09
  question: "Wie hoch ist der gesamte Beratungsertrag (Konto 3000) im Jahr 2025?"
  intent: REVENUE_AT_YEAR
  expectedTools: [queryTable]
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: "3000"
    periodYear: 2025
    periodMonth: 0
  forbiddenTools: [aggregateTable]
  expectedNumbers: [488700.0]
# q10 -- "How much was booked to material expense account 5400 in 2025
# (debit sum over journal lines)?"
# Here aggregateTable SUM over debitAmount is the expected route.
# NOTE(review): the question restricts to 2025 but requiredFilters carries no
# date/year filter -- confirm whether that is intentional.
- id: q10
  question: "Wie viel wurde 2025 auf das Materialaufwand-Konto 5400 gebucht (Soll-Summe ueber Buchungszeilen)?"
  intent: JOURNAL_SUM_AT_ACCOUNT
  expectedTools: [aggregateTable]
  expectedTable: TrusteeDataJournalLine
  expectedAggregate: SUM
  expectedAggregateField: debitAmount
  requiredFilters:
    accountNumber: "5400"
  forbiddenTools: []
  expectedNumbers: [29400.0]
  numericTolerance: 0.01
# q11 -- "Which bookings in Q1 2025 (January to March) were posted to
# account 3000?"
# Checked via answer substrings only.
# NOTE(review): the Q1 2025 date restriction is not part of requiredFilters;
# only the account number is -- confirm whether that is intentional.
- id: q11
  question: "Welche Buchungen im 1. Quartal 2025 (Januar bis Maerz) wurden auf Konto 3000 gebucht?"
  intent: JOURNAL_LINES_BY_ACCOUNT
  expectedTools: [queryTable]
  expectedTable: TrusteeDataJournalLine
  requiredFilters:
    accountNumber: "3000"
  forbiddenTools: [aggregateTable]
  expectedAnswerContains: ["18500", "ACME"]
# q12 -- "How high was the main cash account (account 1000) at end of March
# 2025 and at end of June 2025?"
# Two expected numbers are listed; uses a 1 % numeric tolerance.
# NOTE(review): no periodMonth filter is required here, presumably because
# two different months are requested -- confirm.
- id: q12
  question: "Wie hoch war die Hauptkasse (Konto 1000) jeweils per Ende Maerz 2025 und per Ende Juni 2025?"
  intent: MULTI_MONTH_SNAPSHOT
  expectedTools: [queryTable]
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: "1000"
    periodYear: 2025
  forbiddenTools: [aggregateTable]
  expectedNumbers: [5375.0, 5550.0]
  numericTolerance: 0.01
# q13 -- "How high is the sum of all expense accounts (5xxx and 6xxx) at end
# of 2025?"
# aggregateTable is forbidden; uses a 1 % numeric tolerance.
# NOTE(review): the 5xxx/6xxx account restriction is not part of
# requiredFilters (only year and month are) -- confirm whether intentional.
- id: q13
  question: "Wie hoch ist die Summe aller Aufwandskonten (5xxx und 6xxx) per Ende 2025?"
  intent: EXPENSE_GROUP_TOTAL
  expectedTools: [queryTable]
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    periodYear: 2025
    periodMonth: 0
  forbiddenTools: [aggregateTable]
  expectedNumbers: [154100.0]
  numericTolerance: 0.01
# q14 -- "Which account has the highest openingBalance for 2025?"
# No filters are required and no tool is forbidden for this question.
- id: q14
  question: "Welches Konto hat den hoechsten openingBalance fuer 2025?"
  intent: TOP_OPENING_BALANCE
  # Both routes are legitimate: queryTable+orderBy+limit=1, or
  # aggregateTable(MAX) followed by queryTable lookup. We only insist that
  # the final answer names the right account and (optionally) the value.
  expectedTools: [queryTable, aggregateTable]
  expectedTable: TrusteeDataAccountBalance
  forbiddenTools: []
  expectedAnswerContains: ["3000"]
  expectedNumbers: [462500.0]
# q15 -- "List all accounts of type asset."
# Targets the account table, filtered by accountType; checked via answer
# substrings only.
- id: q15
  question: "Liste alle Konten vom Typ asset auf."
  intent: ACCOUNTS_BY_TYPE
  expectedTools: [queryTable]
  expectedTable: TrusteeDataAccount
  requiredFilters:
    accountType: "asset"
  forbiddenTools: [aggregateTable]
  expectedAnswerContains: ["1000", "1020", "1021", "1100"]
# q16 -- "How high is the closing balance of trade receivables (account 1100)
# at end of 2025?"
# The question names the account both by description and by number.
- id: q16
  question: "Wie hoch ist der Schlusssaldo der Forderungen aus Lieferungen und Leistungen (Konto 1100) per Ende 2025?"
  intent: BALANCE_BY_NAME_LOOKUP
  expectedTools: [queryTable]
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: "1100"
    periodYear: 2025
    periodMonth: 0
  forbiddenTools: [aggregateTable]
  expectedNumbers: [66800.0]
# q17 -- "How high were the liabilities (account 2000) at end of 2023, 2024
# and 2025 respectively?"
# Three expected numbers are listed.
# NOTE(review): no periodYear filter is required, presumably because the
# question spans several years -- confirm.
- id: q17
  question: "Wie hoch waren die Verbindlichkeiten (Konto 2000) jeweils per Ende 2023, 2024 und 2025?"
  intent: LIABILITY_HISTORY
  expectedTools: [queryTable]
  expectedTable: TrusteeDataAccountBalance
  requiredFilters:
    accountNumber: "2000"
    periodMonth: 0
  forbiddenTools: [aggregateTable]
  expectedNumbers: [41300.0, 44100.0, 47900.0]
# q18 -- "How many bank accounts are there in the chart of accounts (102x)?"
# Both tools are listed as expected and none is forbidden.
# NOTE(review): presumably either tool is acceptable here (list-and-count or
# COUNT aggregate) -- confirm how the scorer interprets a two-element
# expectedTools list.
- id: q18
  question: "Wie viele Bankkonten gibt es im Kontenplan (102x)?"
  intent: ACCOUNT_COUNT_BY_PREFIX
  expectedTools: [queryTable, aggregateTable]
  expectedTable: TrusteeDataAccount
  requiredFilters:
    accountNumberLike: "102%"
  forbiddenTools: []
  expectedNumbers: [2]
# q19 -- "Give me all journal lines with a debit amount greater than
# CHF 20'000."
# Checked via answer substrings only.
# NOTE(review): the amount threshold is not expressed in requiredFilters
# (none are declared) -- confirm whether that is intentional.
- id: q19
  question: "Gib mir alle Buchungszeilen mit einem Sollbetrag groesser als 20'000 CHF."
  intent: JOURNAL_LINES_BY_AMOUNT
  expectedTools: [queryTable]
  expectedTable: TrusteeDataJournalLine
  forbiddenTools: [aggregateTable]
  expectedAnswerContains: ["24300", "32100", "41500", "28700"]