improve bank extraction retry + auto-matching for bank documents
Made-with: Cursor
This commit is contained in:
parent
565ad62c39
commit
a6be2b90e0
2 changed files with 245 additions and 1 deletions
|
|
@ -215,6 +215,48 @@ def _parseCsvToRecords(csvContent: str) -> List[Dict[str, Any]]:
|
||||||
return records
|
return records
|
||||||
|
|
||||||
|
|
||||||
|
def _estimateBankTransactionLineCount(rawText: str) -> int:
|
||||||
|
"""Estimate how many transaction rows exist in bank statement OCR text."""
|
||||||
|
import re
|
||||||
|
|
||||||
|
lines = (rawText or "").splitlines()
|
||||||
|
datePattern = re.compile(r"\b(\d{2}[./-]\d{2}[./-]\d{2,4}|\d{4}-\d{2}-\d{2})\b")
|
||||||
|
amountPattern = re.compile(r"[-+]?\d{1,3}(?:[ '\u00A0]\d{3})*(?:[.,]\d{2})\b")
|
||||||
|
|
||||||
|
candidateCount = 0
|
||||||
|
for line in lines:
|
||||||
|
stripped = (line or "").strip()
|
||||||
|
if len(stripped) < 8:
|
||||||
|
continue
|
||||||
|
if datePattern.search(stripped) and amountPattern.search(stripped):
|
||||||
|
candidateCount += 1
|
||||||
|
|
||||||
|
return candidateCount
|
||||||
|
|
||||||
|
|
||||||
|
def _buildBankDocumentRetryPrompt(expenseList: str, bankList: str, expectedRows: int) -> str:
|
||||||
|
"""Build a stricter retry prompt to force full bank-row extraction."""
|
||||||
|
return (
|
||||||
|
"Du hast vorher zu wenige Buchungen extrahiert. "
|
||||||
|
"Extrahiere JETZT ALLE Transaktionszeilen aus dem Bankauszug vollstaendig. "
|
||||||
|
f"Erwartete Groessenordnung: mindestens {max(2, expectedRows)} Zeilen. "
|
||||||
|
"WICHTIG: Eine Transaktionszeile = genau ein Record. "
|
||||||
|
"Niemals Zeilen zusammenfassen, niemals nur die erste oder eine Beispielzeile liefern. "
|
||||||
|
"Wenn Details fehlen, trotzdem Record erzeugen und fehlende Felder als null setzen. "
|
||||||
|
"Return JSON: {\"records\": [{...}]}. "
|
||||||
|
"Jeder Record hat diese Felder:\n"
|
||||||
|
"- documentType: immer \"bank_document\"\n"
|
||||||
|
"- valuta (YYYY-MM-DD), transactionDateTime (unix seconds, optional)\n"
|
||||||
|
"- company (Gegenpartei)\n"
|
||||||
|
"- desc (vollstaendige Details der Zeile inkl. Referenz/Mitteilung)\n"
|
||||||
|
"- bookingAmount, bookingCurrency\n"
|
||||||
|
f"- debitAccountNumber (NUR Kontonummer aus: {expenseList})\n"
|
||||||
|
f"- creditAccountNumber (NUR Kontonummer aus: {bankList})\n"
|
||||||
|
"- bookingReference, payeeIban, payeeName, paymentReference\n"
|
||||||
|
"Kein MwSt bei Bankauszuegen."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def _extractWithAi(
|
async def _extractWithAi(
|
||||||
self,
|
self,
|
||||||
chatDocumentId: str,
|
chatDocumentId: str,
|
||||||
|
|
@ -293,6 +335,44 @@ async def _extractWithAi(
|
||||||
records = _parseStructuredRecords(raw2)
|
records = _parseStructuredRecords(raw2)
|
||||||
logger.info("Phase 2 result: documentType=%s, records=%d, raw2_length=%d", documentType, len(records), len(raw2))
|
logger.info("Phase 2 result: documentType=%s, records=%d, raw2_length=%d", documentType, len(records), len(raw2))
|
||||||
|
|
||||||
|
# Failsafe for bank statements: retry with stricter prompt if extraction is likely incomplete.
|
||||||
|
if documentType == "BANK_DOCUMENT":
|
||||||
|
estimatedRows = _estimateBankTransactionLineCount(rawText)
|
||||||
|
likelyIncomplete = (
|
||||||
|
estimatedRows >= 3
|
||||||
|
and (
|
||||||
|
len(records) <= 1
|
||||||
|
or len(records) < max(2, estimatedRows // 2)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if likelyIncomplete:
|
||||||
|
retryPrompt = _buildBankDocumentRetryPrompt(expenseList, bankList, estimatedRows)
|
||||||
|
retryFullPrompt = f"{retryPrompt}\n\nDOKUMENT-TEXT:\n{rawText}"
|
||||||
|
retryRequest = AiCallRequest(
|
||||||
|
prompt=retryFullPrompt,
|
||||||
|
context="",
|
||||||
|
options=AiCallOptions(resultFormat="json"),
|
||||||
|
)
|
||||||
|
retryResponse = await self.services.ai.callAi(retryRequest)
|
||||||
|
retryRaw = (retryResponse.content or "").strip() if hasattr(retryResponse, "content") else ""
|
||||||
|
retryRecords = _parseStructuredRecords(retryRaw)
|
||||||
|
if len(retryRecords) > len(records):
|
||||||
|
records = retryRecords
|
||||||
|
logger.info(
|
||||||
|
"Bank statement retry improved extraction: records=%d -> %d (estimatedRows=%d, file=%s)",
|
||||||
|
len(_parseStructuredRecords(raw2)),
|
||||||
|
len(records),
|
||||||
|
estimatedRows,
|
||||||
|
fileName,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
"Bank statement extraction may be incomplete: records=%d, estimatedRows=%d, file=%s",
|
||||||
|
len(records),
|
||||||
|
estimatedRows,
|
||||||
|
fileName,
|
||||||
|
)
|
||||||
|
|
||||||
if records and (not documentType or documentType == "UNKNOWN"):
|
if records and (not documentType or documentType == "UNKNOWN"):
|
||||||
documentType = "EXPENSE_RECEIPT"
|
documentType = "EXPENSE_RECEIPT"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ Output: one ActionDocument with JSON { positionIds, documentIds } for chaining t
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
from typing import Dict, Any, List, Optional
|
from typing import Dict, Any, List, Optional
|
||||||
|
|
||||||
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
|
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
|
||||||
|
|
@ -53,6 +54,121 @@ def _cleanStr(value, default=None) -> Optional[str]:
|
||||||
return s if s else default
|
return s if s else default
|
||||||
|
|
||||||
|
|
||||||
|
def _normaliseRef(value: Any) -> Optional[str]:
    """Reduce a payment reference to upper-case ``A-Z`` letters and digits.

    The value is first passed through ``_cleanStr``; the result is
    upper-cased and every character outside ``A-Z`` / ``0-9`` is removed.

    Returns:
        The compacted reference, or ``None`` when the cleaned value is empty
        or nothing remains after filtering.
    """
    import re

    text = _cleanStr(value)
    if text:
        compact = re.sub(r"[^A-Z0-9]", "", text.upper())
        if compact:
            return compact
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _parseIsoDate(value: Any) -> Optional[datetime]:
    """Parse the leading ``YYYY-MM-DD`` portion of a value for proximity scoring.

    Only the first 10 characters of the cleaned value are parsed, so a
    trailing time component is ignored.

    Returns:
        The parsed ``datetime``, or ``None`` when the cleaned value is empty
        or its first 10 characters are not a valid ``YYYY-MM-DD`` date.
    """
    text = _cleanStr(value)
    if text:
        try:
            return datetime.strptime(text[:10], "%Y-%m-%d")
        except ValueError:
            # Not an ISO date: fall through to the "no date" result.
            pass
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _normaliseAmount(value: Any) -> float:
    """Return the unsigned amount rounded to two decimals.

    The sign is dropped because bank lines are often signed, while the
    positions they are compared with may not be.
    """
    magnitude = abs(_parseFloat(value))
    return round(magnitude, 2)
|
||||||
|
|
||||||
|
|
||||||
|
def _normaliseCompany(value: Any) -> Optional[str]:
    """Normalise a company name for approximate (substring) matching.

    The cleaned value is decomposed (NFKD) and its combining marks are
    dropped, so accented letters are folded to their base letter instead of
    being deleted by the ASCII filter: a name written with u-umlaut and the
    same name written with a plain "u" normalise to the same key. The result
    is then upper-cased and everything except ``A-Z`` / ``0-9`` is removed.

    Args:
        value: Raw company/counterparty name; passed through ``_cleanStr``.

    Returns:
        The normalised key, or ``None`` when the cleaned value is empty or
        nothing remains after filtering.
    """
    raw = _cleanStr(value)
    if not raw:
        return None
    import re
    import unicodedata

    # Fold diacritics first; the ASCII-only filter below would otherwise
    # silently drop every accented letter and break matching against
    # unaccented spellings (typical for OCR output).
    decomposed = unicodedata.normalize("NFKD", raw)
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    cleaned = re.sub(r"[^A-Z0-9]", "", folded.upper())
    return cleaned or None
|
||||||
|
|
||||||
|
|
||||||
|
def _findBestBankMatch(
    bankPosition: Dict[str, Any],
    candidatePositions: List[Dict[str, Any]],
    alreadyMatchedIds: set,
) -> Optional[Dict[str, Any]]:
    """Pick the highest-scoring invoice/expense position for one bank position.

    A candidate is ignored when it has no ``id``, its id is in
    ``alreadyMatchedIds``, it already carries a ``bankDocumentId``, or its
    ``documentType`` is ``bank_document``. Every other candidate collects
    points:

    * 100 - same normalised reference (``paymentReference``, falling back to
      ``bookingReference``)
    * 40 - normalised amounts differ by at most 0.05
    * 25 - same normalised ``payeeIban``
    * 20 / 10 - ``valuta`` dates at most 3 / at most 14 days apart
    * 15 - one normalised company name contains the other

    A candidate qualifies with at least 45 points when the bank position has
    a reference and with at least 65 points otherwise. On equal scores the
    earlier candidate wins.

    Args:
        bankPosition: The bank statement line to match.
        candidatePositions: Positions to search.
        alreadyMatchedIds: Ids that must not be matched again.

    Returns:
        The best qualifying candidate dict, or ``None`` when none qualifies.
    """
    wantedRef = _normaliseRef(bankPosition.get("paymentReference") or bankPosition.get("bookingReference"))
    wantedAmount = _normaliseAmount(bankPosition.get("bookingAmount"))
    wantedIban = _normaliseRef(bankPosition.get("payeeIban"))
    wantedDate = _parseIsoDate(bankPosition.get("valuta"))
    wantedCompany = _normaliseCompany(bankPosition.get("company"))

    # The bar depends only on the bank row, so it is computed once.
    # NOTE(review): the lower bar applies whenever the bank row has any
    # reference, even one that did not match the candidate - confirm intended.
    threshold = 45 if wantedRef else 65

    winner = None
    winnerScore = 0

    for other in candidatePositions:
        otherId = other.get("id")
        if (
            not otherId
            or otherId in alreadyMatchedIds
            or other.get("bankDocumentId")
            or (other.get("documentType") or "").lower().strip() == "bank_document"
        ):
            continue

        otherRef = _normaliseRef(other.get("paymentReference") or other.get("bookingReference"))
        otherAmount = _normaliseAmount(other.get("bookingAmount"))
        otherIban = _normaliseRef(other.get("payeeIban"))
        otherDate = _parseIsoDate(other.get("valuta"))
        otherCompany = _normaliseCompany(other.get("company"))

        points = 0

        # Strongest signal: structured payment/invoice reference.
        if wantedRef and otherRef and wantedRef == otherRef:
            points += 100

        # Amounts, with a small tolerance for rounding differences.
        if abs(otherAmount - wantedAmount) <= 0.05:
            points += 40

        # IBAN as a strong supporting signal.
        if wantedIban and otherIban and wantedIban == otherIban:
            points += 25

        # The closer the two dates, the more confidence.
        if wantedDate and otherDate:
            gapDays = abs((wantedDate - otherDate).days)
            if gapDays <= 3:
                points += 20
            elif gapDays <= 14:
                points += 10

        # Company names help when there is no structured reference.
        if wantedCompany and otherCompany and (
            wantedCompany in otherCompany or otherCompany in wantedCompany
        ):
            points += 15

        if points >= threshold and points > winnerScore:
            winner, winnerScore = other, points

    return winner
|
||||||
|
|
||||||
|
|
||||||
|
def _safeRecordFromModel(modelOrDict: Any) -> Dict[str, Any]:
|
||||||
|
"""Convert Pydantic/dict object to plain dict for matching."""
|
||||||
|
if not modelOrDict:
|
||||||
|
return {}
|
||||||
|
if isinstance(modelOrDict, dict):
|
||||||
|
return modelOrDict
|
||||||
|
if hasattr(modelOrDict, "model_dump"):
|
||||||
|
return modelOrDict.model_dump()
|
||||||
|
try:
|
||||||
|
return dict(modelOrDict)
|
||||||
|
except Exception:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
def _recordToPosition(record: Dict[str, Any], documentId: Optional[str], featureInstanceId: str, mandateId: str, documentType: Optional[str] = None) -> Dict[str, Any]:
|
def _recordToPosition(record: Dict[str, Any], documentId: Optional[str], featureInstanceId: str, mandateId: str, documentType: Optional[str] = None) -> Dict[str, Any]:
|
||||||
"""Map extraction record to TrusteePosition payload."""
|
"""Map extraction record to TrusteePosition payload."""
|
||||||
recDocType = _cleanStr(record.get("documentType")) or documentType
|
recDocType = _cleanStr(record.get("documentType")) or documentType
|
||||||
|
|
@ -119,6 +235,7 @@ async def processDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
|
||||||
|
|
||||||
allPositionIds = []
|
allPositionIds = []
|
||||||
allDocumentIds = []
|
allDocumentIds = []
|
||||||
|
autoMatchedPositionIds = []
|
||||||
|
|
||||||
for chatDoc in chatDocuments:
|
for chatDoc in chatDocuments:
|
||||||
rawBytes = self.services.chat.getFileData(chatDoc.fileId)
|
rawBytes = self.services.chat.getFileData(chatDoc.fileId)
|
||||||
|
|
@ -150,13 +267,60 @@ async def processDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
|
||||||
allDocumentIds.append(trusteeDoc.id)
|
allDocumentIds.append(trusteeDoc.id)
|
||||||
|
|
||||||
docTypeLower = (documentType or "unknown").lower()
|
docTypeLower = (documentType or "unknown").lower()
|
||||||
|
createdPositions = []
|
||||||
for record in records:
|
for record in records:
|
||||||
posPayload = _recordToPosition(record, trusteeDoc.id, featureInstanceId, self.services.mandateId, documentType=docTypeLower)
|
posPayload = _recordToPosition(record, trusteeDoc.id, featureInstanceId, self.services.mandateId, documentType=docTypeLower)
|
||||||
pos = trusteeInterface.createPosition(posPayload)
|
pos = trusteeInterface.createPosition(posPayload)
|
||||||
if pos:
|
if pos:
|
||||||
allPositionIds.append(pos.id)
|
allPositionIds.append(pos.id)
|
||||||
|
createdPositions.append(pos)
|
||||||
|
|
||||||
payload = {"positionIds": allPositionIds, "documentIds": allDocumentIds}
|
# Auto-link bank statement lines to existing invoice/expense positions.
|
||||||
|
if docTypeLower == "bank_document" and createdPositions:
|
||||||
|
try:
|
||||||
|
from modules.features.trustee.datamodelFeatureTrustee import TrusteePosition
|
||||||
|
|
||||||
|
candidatesRaw = trusteeInterface.db.getRecordset(
|
||||||
|
TrusteePosition,
|
||||||
|
recordFilter={"featureInstanceId": featureInstanceId},
|
||||||
|
)
|
||||||
|
candidatePositions = [_safeRecordFromModel(c) for c in (candidatesRaw or [])]
|
||||||
|
matchedInThisBankDoc = set()
|
||||||
|
|
||||||
|
for createdPos in createdPositions:
|
||||||
|
bankPosition = _safeRecordFromModel(createdPos)
|
||||||
|
if not bankPosition:
|
||||||
|
continue
|
||||||
|
|
||||||
|
matchCandidate = _findBestBankMatch(
|
||||||
|
bankPosition=bankPosition,
|
||||||
|
candidatePositions=candidatePositions,
|
||||||
|
alreadyMatchedIds=matchedInThisBankDoc,
|
||||||
|
)
|
||||||
|
if not matchCandidate:
|
||||||
|
continue
|
||||||
|
|
||||||
|
matchedId = matchCandidate.get("id")
|
||||||
|
if not matchedId:
|
||||||
|
continue
|
||||||
|
|
||||||
|
updated = trusteeInterface.updatePosition(
|
||||||
|
matchedId,
|
||||||
|
{
|
||||||
|
"bankDocumentId": trusteeDoc.id,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
if updated:
|
||||||
|
matchedInThisBankDoc.add(matchedId)
|
||||||
|
autoMatchedPositionIds.append(matchedId)
|
||||||
|
except Exception:
|
||||||
|
logger.exception("Automatic bank-document matching failed for documentId=%s", trusteeDoc.id)
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"positionIds": allPositionIds,
|
||||||
|
"documentIds": allDocumentIds,
|
||||||
|
"autoMatchedPositionIds": autoMatchedPositionIds,
|
||||||
|
}
|
||||||
return ActionResult.isSuccess(
|
return ActionResult.isSuccess(
|
||||||
documents=[
|
documents=[
|
||||||
ActionDocument(
|
ActionDocument(
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue