From a6be2b90e054fae264ebde2cc873b4479ddfcdad Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Sun, 1 Mar 2026 21:54:56 +0100
Subject: [PATCH] improve bank extraction retry + auto-matching for bank
documents
Made-with: Cursor
---
.../methodTrustee/actions/extractFromFiles.py | 80 +++++++++
.../methodTrustee/actions/processDocuments.py | 166 +++++++++++++++++-
2 files changed, 245 insertions(+), 1 deletion(-)
diff --git a/modules/workflows/methods/methodTrustee/actions/extractFromFiles.py b/modules/workflows/methods/methodTrustee/actions/extractFromFiles.py
index 70fff273..07e9a046 100644
--- a/modules/workflows/methods/methodTrustee/actions/extractFromFiles.py
+++ b/modules/workflows/methods/methodTrustee/actions/extractFromFiles.py
@@ -215,6 +215,48 @@ def _parseCsvToRecords(csvContent: str) -> List[Dict[str, Any]]:
return records
+def _estimateBankTransactionLineCount(rawText: str) -> int:
+    """Estimate how many transaction rows exist in bank statement OCR text.
+
+    A row is a stripped line of 8+ characters holding a date and, apart from it, an amount.
+    """
+    import re
+    datePattern = re.compile(r"\b(\d{2}[./-]\d{2}[./-]\d{2,4}|\d{4}-\d{2}-\d{2})\b")
+    amountPattern = re.compile(r"[-+]?\d{1,3}(?:[ '\u00A0]\d{3})*(?:[.,]\d{2})\b")
+    candidateCount = 0
+    for line in (rawText or "").splitlines():
+        stripped = line.strip()
+        if len(stripped) < 8 or not datePattern.search(stripped):
+            continue
+        # Blank out dates first: the "01.03" of "01.03.2026" would otherwise pass as an amount.
+        if amountPattern.search(datePattern.sub(" ", stripped)):
+            candidateCount += 1
+    return candidateCount
+
+
+def _buildBankDocumentRetryPrompt(expenseList: str, bankList: str, expectedRows: int) -> str:
+    """Assemble the stricter German retry prompt asking for at least max(2, expectedRows) rows."""
+    return "".join((
+        "Du hast vorher zu wenige Buchungen extrahiert. ",
+        "Extrahiere JETZT ALLE Transaktionszeilen aus dem Bankauszug vollstaendig. ",
+        f"Erwartete Groessenordnung: mindestens {max(2, expectedRows)} Zeilen. ",
+        "WICHTIG: Eine Transaktionszeile = genau ein Record. ",
+        "Niemals Zeilen zusammenfassen, niemals nur die erste oder eine Beispielzeile liefern. ",
+        "Wenn Details fehlen, trotzdem Record erzeugen und fehlende Felder als null setzen. ",
+        "Return JSON: {\"records\": [{...}]}. ",
+        "Jeder Record hat diese Felder:\n",
+        "- documentType: immer \"bank_document\"\n",
+        "- valuta (YYYY-MM-DD), transactionDateTime (unix seconds, optional)\n",
+        "- company (Gegenpartei)\n",
+        "- desc (vollstaendige Details der Zeile inkl. Referenz/Mitteilung)\n",
+        "- bookingAmount, bookingCurrency\n",
+        f"- debitAccountNumber (NUR Kontonummer aus: {expenseList})\n",
+        f"- creditAccountNumber (NUR Kontonummer aus: {bankList})\n",
+        "- bookingReference, payeeIban, payeeName, paymentReference\n",
+        "Kein MwSt bei Bankauszuegen.",
+    ))
+
+
async def _extractWithAi(
self,
chatDocumentId: str,
@@ -293,6 +335,44 @@ async def _extractWithAi(
records = _parseStructuredRecords(raw2)
logger.info("Phase 2 result: documentType=%s, records=%d, raw2_length=%d", documentType, len(records), len(raw2))
+ # Failsafe for bank statements: retry with stricter prompt if extraction is likely incomplete.
+ if documentType == "BANK_DOCUMENT":
+ estimatedRows = _estimateBankTransactionLineCount(rawText)
+ likelyIncomplete = (
+ estimatedRows >= 3
+ and (
+ len(records) <= 1
+ or len(records) < max(2, estimatedRows // 2)
+ )
+ )
+ if likelyIncomplete:
+ retryPrompt = _buildBankDocumentRetryPrompt(expenseList, bankList, estimatedRows)
+ retryFullPrompt = f"{retryPrompt}\n\nDOKUMENT-TEXT:\n{rawText}"
+ retryRequest = AiCallRequest(
+ prompt=retryFullPrompt,
+ context="",
+ options=AiCallOptions(resultFormat="json"),
+ )
+ retryResponse = await self.services.ai.callAi(retryRequest)
+ retryRaw = (retryResponse.content or "").strip() if hasattr(retryResponse, "content") else ""
+ retryRecords = _parseStructuredRecords(retryRaw)
+ if len(retryRecords) > len(records):
+ records = retryRecords
+ logger.info(
+ "Bank statement retry improved extraction: records=%d -> %d (estimatedRows=%d, file=%s)",
+ len(_parseStructuredRecords(raw2)),
+ len(records),
+ estimatedRows,
+ fileName,
+ )
+ else:
+ logger.warning(
+ "Bank statement extraction may be incomplete: records=%d, estimatedRows=%d, file=%s",
+ len(records),
+ estimatedRows,
+ fileName,
+ )
+
if records and (not documentType or documentType == "UNKNOWN"):
documentType = "EXPENSE_RECEIPT"
diff --git a/modules/workflows/methods/methodTrustee/actions/processDocuments.py b/modules/workflows/methods/methodTrustee/actions/processDocuments.py
index 1f233fb5..3f95836d 100644
--- a/modules/workflows/methods/methodTrustee/actions/processDocuments.py
+++ b/modules/workflows/methods/methodTrustee/actions/processDocuments.py
@@ -10,6 +10,7 @@ Output: one ActionDocument with JSON { positionIds, documentIds } for chaining t
import json
import logging
+from datetime import datetime
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
@@ -53,6 +54,121 @@ def _cleanStr(value, default=None) -> Optional[str]:
return s if s else default
+def _normaliseRef(value: Any) -> Optional[str]:
+    """Upper-case a payment reference and keep only A-Z / 0-9; None when nothing is left."""
+    import re
+    cleaned = _cleanStr(value)
+    if cleaned:
+        return re.sub(r"[^A-Z0-9]", "", cleaned.upper()) or None
+    return None
+
+
+def _parseIsoDate(value: Any) -> Optional[datetime]:
+    """Parse the first ten characters of *value* as YYYY-MM-DD; None if empty or invalid."""
+    text = _cleanStr(value)
+    if text:
+        try:
+            return datetime.strptime(text[:10], "%Y-%m-%d")
+        except ValueError:
+            pass  # not a date: fall through to the shared None result
+    return None
+
+
+def _normaliseAmount(value: Any) -> float:
+    """Return the unsigned amount rounded to two decimals; bank lines are often signed."""
+    return round(abs(_parseFloat(value)), ndigits=2)
+
+
+def _normaliseCompany(value: Any) -> Optional[str]:
+    """Upper-case a company name and keep only A-Z / 0-9; None when nothing is left."""
+    import re
+    name = _cleanStr(value)
+    if not name:
+        return None
+    # Spacing, punctuation and case differences vanish, so spellings compare loosely.
+    return re.sub(r"[^A-Z0-9]", "", name.upper()) or None
+
+
+def _findBestBankMatch(
+    bankPosition: Dict[str, Any],
+    candidatePositions: List[Dict[str, Any]],
+    alreadyMatchedIds: set,
+) -> Optional[Dict[str, Any]]:
+    """Pick the invoice/expense position that best explains one bank position.
+
+    Candidates are skipped when they have no id, are listed in
+    ``alreadyMatchedIds``, already carry a ``bankDocumentId`` or are bank
+    documents themselves. The rest are scored on reference (100), amount
+    within 0.05 (40), IBAN (25), valuta within 3 / 14 days (20 / 10) and
+    company-name containment (15).
+
+    Returns the first candidate holding the highest score, provided that score
+    reaches 45 when the bank line has a reference and 65 otherwise; else None.
+    """
+    wantedRef = _normaliseRef(bankPosition.get("paymentReference") or bankPosition.get("bookingReference"))
+    wantedAmount = _normaliseAmount(bankPosition.get("bookingAmount"))
+    wantedIban = _normaliseRef(bankPosition.get("payeeIban"))
+    wantedDate = _parseIsoDate(bankPosition.get("valuta"))
+    wantedCompany = _normaliseCompany(bankPosition.get("company"))
+    # NOTE(review): the bar keys on the bank reference existing, not on it matching - confirm.
+    threshold = 45 if wantedRef else 65
+
+    winner, winnerScore = None, 0
+    for entry in candidatePositions:
+        entryId = entry.get("id")
+        if (
+            not entryId
+            or entryId in alreadyMatchedIds
+            or entry.get("bankDocumentId")
+            or (entry.get("documentType") or "").lower().strip() == "bank_document"
+        ):
+            continue
+
+        entryRef = _normaliseRef(entry.get("paymentReference") or entry.get("bookingReference"))
+        entryAmount = _normaliseAmount(entry.get("bookingAmount"))
+        entryIban = _normaliseRef(entry.get("payeeIban"))
+        entryDate = _parseIsoDate(entry.get("valuta"))
+        entryCompany = _normaliseCompany(entry.get("company"))
+
+        # Without both dates there is no gap to score; None marks that case.
+        dayGap = abs((wantedDate - entryDate).days) if wantedDate and entryDate else None
+        sameCompany = bool(wantedCompany and entryCompany) and (
+            wantedCompany in entryCompany or entryCompany in wantedCompany
+        )
+        points = sum((
+            # Structured payment / invoice reference: strongest signal.
+            100 if wantedRef and entryRef and wantedRef == entryRef else 0,
+            # Amounts are unsigned here; 0.05 absorbs minor rounding differences.
+            40 if abs(entryAmount - wantedAmount) <= 0.05 else 0,
+            # IBAN agreement is a strong supporting signal.
+            25 if wantedIban and entryIban and wantedIban == entryIban else 0,
+            # Close valuta dates raise confidence, very close ones more so.
+            0 if dayGap is None else (20 if dayGap <= 3 else 10 if dayGap <= 14 else 0),
+            # Name containment helps when no structured reference is present.
+            15 if sameCompany else 0,
+        ))
+
+        # Strict ">" keeps the earliest candidate when scores tie.
+        if points >= threshold and points > winnerScore:
+            winner, winnerScore = entry, points
+
+    return winner
+
+
+def _safeRecordFromModel(modelOrDict: Any) -> Dict[str, Any]:
+    """Return *modelOrDict* as a plain dict; falsy or unconvertible input gives {}."""
+    if not modelOrDict:
+        return {}
+    if isinstance(modelOrDict, dict):
+        return modelOrDict
+    if not hasattr(modelOrDict, "model_dump"):
+        try:
+            return dict(modelOrDict)
+        except Exception:
+            return {}
+    return modelOrDict.model_dump()
+
+
def _recordToPosition(record: Dict[str, Any], documentId: Optional[str], featureInstanceId: str, mandateId: str, documentType: Optional[str] = None) -> Dict[str, Any]:
"""Map extraction record to TrusteePosition payload."""
recDocType = _cleanStr(record.get("documentType")) or documentType
@@ -119,6 +235,7 @@ async def processDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
allPositionIds = []
allDocumentIds = []
+ autoMatchedPositionIds = []
for chatDoc in chatDocuments:
rawBytes = self.services.chat.getFileData(chatDoc.fileId)
@@ -150,13 +267,60 @@ async def processDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
allDocumentIds.append(trusteeDoc.id)
docTypeLower = (documentType or "unknown").lower()
+ createdPositions = []
for record in records:
posPayload = _recordToPosition(record, trusteeDoc.id, featureInstanceId, self.services.mandateId, documentType=docTypeLower)
pos = trusteeInterface.createPosition(posPayload)
if pos:
allPositionIds.append(pos.id)
+ createdPositions.append(pos)
- payload = {"positionIds": allPositionIds, "documentIds": allDocumentIds}
+ # Auto-link bank statement lines to existing invoice/expense positions.
+ if docTypeLower == "bank_document" and createdPositions:
+ try:
+ from modules.features.trustee.datamodelFeatureTrustee import TrusteePosition
+
+ candidatesRaw = trusteeInterface.db.getRecordset(
+ TrusteePosition,
+ recordFilter={"featureInstanceId": featureInstanceId},
+ )
+ candidatePositions = [_safeRecordFromModel(c) for c in (candidatesRaw or [])]
+ matchedInThisBankDoc = set()
+
+ for createdPos in createdPositions:
+ bankPosition = _safeRecordFromModel(createdPos)
+ if not bankPosition:
+ continue
+
+ matchCandidate = _findBestBankMatch(
+ bankPosition=bankPosition,
+ candidatePositions=candidatePositions,
+ alreadyMatchedIds=matchedInThisBankDoc,
+ )
+ if not matchCandidate:
+ continue
+
+ matchedId = matchCandidate.get("id")
+ if not matchedId:
+ continue
+
+ updated = trusteeInterface.updatePosition(
+ matchedId,
+ {
+ "bankDocumentId": trusteeDoc.id,
+ },
+ )
+ if updated:
+ matchedInThisBankDoc.add(matchedId)
+ autoMatchedPositionIds.append(matchedId)
+ except Exception:
+ logger.exception("Automatic bank-document matching failed for documentId=%s", trusteeDoc.id)
+
+ payload = {
+ "positionIds": allPositionIds,
+ "documentIds": allDocumentIds,
+ "autoMatchedPositionIds": autoMatchedPositionIds,
+ }
return ActionResult.isSuccess(
documents=[
ActionDocument(