# gateway/modules/workflows/methods/methodTrustee/actions/processDocuments.py
#
# 376 lines
# 15 KiB
# Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Process extracted documents: create TrusteeDocument + TrusteePosition from extraction JSON.
Input: documentList (reference to extractFromFiles result).
Each document is JSON with documentType, extractedData, fileId, fileName.
extractedData is a list of expense/position records.
Output: one ActionDocument with JSON { positionIds, documentIds, autoMatchedPositionIds } for chaining to syncToAccounting.
"""
import json
import logging
from datetime import datetime
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelDocref import DocumentReferenceList
logger = logging.getLogger(__name__)
def _parseFloat(value) -> float:
try:
if value is None or value == "":
return 0.0
return float(value)
except (ValueError, TypeError):
return 0.0
def _extractAccountNumber(value) -> Optional[str]:
"""Extract the leading numeric account number from AI output like '6200 Fahrzeugaufwand' -> '6200'."""
if not value or not isinstance(value, str):
return None
import re
match = re.match(r"(\d+)", value.strip())
return match.group(1) if match else value.strip() or None
def _normaliseTags(value) -> str:
"""Convert tags from various formats to a clean comma-separated string."""
if not value:
return ""
if isinstance(value, list):
return ", ".join(str(t) for t in value if t)
return str(value)
def _cleanStr(value, default=None) -> Optional[str]:
"""Strip and return a non-empty string, else *default*."""
if not value:
return default
s = str(value).strip()
return s if s else default
def _normaliseRef(value: Any) -> Optional[str]:
    """Reduce a payment reference to its upper-case A-Z / 0-9 characters.

    Returns ``None`` when the input is empty or nothing is left after the
    reduction, so references can be compared regardless of spacing,
    punctuation and letter case.
    """
    import re

    text = _cleanStr(value)
    if text:
        compact = re.sub(r"[^A-Z0-9]", "", text.upper())
        if compact:
            return compact
    return None
def _parseIsoDate(value: Any) -> Optional[datetime]:
    """Parse the leading ``YYYY-MM-DD`` part of *value* for proximity scoring.

    Only the first ten characters of the cleaned string are considered.
    Returns ``None`` for empty input or when those characters are not a
    valid date.
    """
    text = _cleanStr(value)
    parsed = None
    if text:
        try:
            parsed = datetime.strptime(text[:10], "%Y-%m-%d")
        except ValueError:
            parsed = None
    return parsed
def _normaliseAmount(value: Any) -> float:
    """Return the unsigned amount rounded to two decimals.

    The sign is discarded because bank lines are often signed while the
    positions they are compared with are not.
    """
    magnitude = abs(_parseFloat(value))
    return round(magnitude, 2)
def _normaliseCompany(value: Any) -> Optional[str]:
    """Reduce a company name to upper-case A-Z / 0-9 for approximate matching.

    Returns ``None`` when the input is empty or no such characters remain.
    """
    import re

    text = _cleanStr(value)
    if text:
        compact = re.sub(r"[^A-Z0-9]", "", text.upper())
        if compact:
            return compact
    return None
def _findBestBankMatch(
    bankPosition: Dict[str, Any],
    candidatePositions: List[Dict[str, Any]],
    alreadyMatchedIds: set,
) -> Optional[Dict[str, Any]]:
    """Find the best invoice/expense position for one bank position.

    Each eligible candidate is scored against the bank line:

    * identical normalised payment/booking reference: +100
    * amounts within 0.05 of each other (absolute values): +40
    * identical normalised payee IBAN: +25
    * ``valuta`` dates at most 3 days apart: +20, at most 14 days: +10
    * one normalised company name contained in the other: +15

    A candidate is ineligible when it has no id, its id is in
    *alreadyMatchedIds*, it already carries a ``bankDocumentId``, its
    ``documentType`` is ``bank_document``, or it is the bank position itself.

    Args:
        bankPosition: Plain-dict bank statement line.
        candidatePositions: Plain-dict positions to search.
        alreadyMatchedIds: Ids that must not be matched again.

    Returns:
        The highest-scoring candidate reaching the minimum score (45 when the
        bank line has a reference, otherwise 65), or ``None``. On equal
        scores the earlier candidate wins.
    """
    bankId = bankPosition.get("id")
    bankRef = _normaliseRef(bankPosition.get("paymentReference") or bankPosition.get("bookingReference"))
    bankAmount = _normaliseAmount(bankPosition.get("bookingAmount"))
    bankIban = _normaliseRef(bankPosition.get("payeeIban"))
    bankDate = _parseIsoDate(bankPosition.get("valuta"))
    bankCompany = _normaliseCompany(bankPosition.get("company"))

    # If no reference exists, require stronger secondary evidence.
    # The threshold depends only on the bank line, so compute it once.
    minScore = 45 if bankRef else 65

    bestScore = 0
    bestCandidate = None
    for candidate in candidatePositions:
        candidateId = candidate.get("id")
        if not candidateId or candidateId in alreadyMatchedIds:
            continue
        # Never link a bank line to itself; the type check below does not
        # catch this when the stored documentType is not "bank_document".
        if candidateId == bankId:
            continue
        if candidate.get("bankDocumentId"):
            continue
        if (candidate.get("documentType") or "").lower().strip() == "bank_document":
            continue

        score = 0
        candidateRef = _normaliseRef(candidate.get("paymentReference") or candidate.get("bookingReference"))
        candidateAmount = _normaliseAmount(candidate.get("bookingAmount"))
        candidateIban = _normaliseRef(candidate.get("payeeIban"))
        candidateDate = _parseIsoDate(candidate.get("valuta"))
        candidateCompany = _normaliseCompany(candidate.get("company"))

        # Strongest signal: structured payment reference / invoice reference match.
        if bankRef and candidateRef and bankRef == candidateRef:
            score += 100

        # Amount must usually match; use tolerance for minor rounding differences.
        if abs(candidateAmount - bankAmount) <= 0.05:
            score += 40

        # IBAN is a strong supporting signal.
        if bankIban and candidateIban and bankIban == candidateIban:
            score += 25

        # Small date difference increases confidence.
        if bankDate and candidateDate:
            dayDiff = abs((bankDate - candidateDate).days)
            if dayDiff <= 3:
                score += 20
            elif dayDiff <= 14:
                score += 10

        # Company/party comparison helps when no structured reference is present.
        if bankCompany and candidateCompany and (
            bankCompany in candidateCompany or candidateCompany in bankCompany
        ):
            score += 15

        # Strict ">" keeps the first candidate on equal scores.
        if score >= minScore and score > bestScore:
            bestScore = score
            bestCandidate = candidate

    return bestCandidate
def _safeRecordFromModel(modelOrDict: Any) -> Dict[str, Any]:
"""Convert Pydantic/dict object to plain dict for matching."""
if not modelOrDict:
return {}
if isinstance(modelOrDict, dict):
return modelOrDict
if hasattr(modelOrDict, "model_dump"):
return modelOrDict.model_dump()
try:
return dict(modelOrDict)
except Exception:
return {}
def _recordToPosition(record: Dict[str, Any], documentId: Optional[str], featureInstanceId: str, mandateId: str, documentType: Optional[str] = None) -> Dict[str, Any]:
    """Map one extraction record to a TrusteePosition payload.

    Args:
        record: Extracted expense/position record.
        documentId: Id of the TrusteeDocument the position belongs to.
        featureInstanceId: Copied into the payload.
        mandateId: Copied into the payload.
        documentType: Fallback type used when the record has no own
            ``documentType``.

    Returns:
        Payload dict. Amounts are parsed to floats, account numbers reduced
        to their leading digits, tags flattened to a string, and optional
        text fields cleaned to ``None`` when empty.
    """
    recDocType = _cleanStr(record.get("documentType")) or documentType
    if recDocType:
        recDocType = recDocType.lower().strip()

    # Use "or" instead of dict.get defaults: extraction JSON frequently
    # carries explicit nulls, for which .get(key, default) returns None.
    bookingCurrency = record.get("bookingCurrency") or "CHF"
    bookingAmount = _parseFloat(record.get("bookingAmount"))

    return {
        "documentId": documentId,
        "documentType": recDocType,
        "valuta": record.get("valuta"),
        "transactionDateTime": record.get("transactionDateTime"),
        "company": record.get("company") or "",
        "desc": record.get("desc") or "",
        "tags": _normaliseTags(record.get("tags")),
        "bookingCurrency": bookingCurrency,
        "bookingAmount": bookingAmount,
        # Original currency/amount fall back to the booking values.
        "originalCurrency": record.get("originalCurrency") or bookingCurrency,
        "originalAmount": _parseFloat(record.get("originalAmount")) or bookingAmount,
        "vatPercentage": _parseFloat(record.get("vatPercentage")),
        "vatAmount": _parseFloat(record.get("vatAmount")),
        "debitAccountNumber": _extractAccountNumber(record.get("debitAccountNumber")),
        "creditAccountNumber": _extractAccountNumber(record.get("creditAccountNumber")),
        "taxCode": record.get("taxCode") or None,
        "costCenter": record.get("costCenter") or None,
        "bookingReference": record.get("bookingReference") or None,
        "payeeIban": _cleanStr(record.get("payeeIban")),
        "payeeName": _cleanStr(record.get("payeeName")),
        "payeeBic": _cleanStr(record.get("payeeBic")),
        "paymentReference": _cleanStr(record.get("paymentReference")),
        "dueDate": _cleanStr(record.get("dueDate")),
        "featureInstanceId": featureInstanceId,
        "mandateId": mandateId,
    }
def _resolveDocumentList(documentListParam, services) -> List[tuple]:
"""Resolve documentList from either Graph-Editor output (list of dicts) or Chat references.
Returns list of (data_dict, fileId, fileName, mimeType) tuples.
"""
results = []
if isinstance(documentListParam, list) and documentListParam:
first = documentListParam[0]
if isinstance(first, dict) and ("documentData" in first or "documentName" in first):
for doc in documentListParam:
rawData = doc.get("documentData")
if not rawData:
continue
try:
data = json.loads(rawData) if isinstance(rawData, str) else rawData
except (json.JSONDecodeError, TypeError):
continue
fileId = (doc.get("validationMetadata") or {}).get("fileId") or doc.get("fileId", "")
fileName = doc.get("documentName") or doc.get("fileName") or "document"
mimeType = doc.get("mimeType") or doc.get("documentMimeType") or "application/json"
results.append((data, fileId, fileName, mimeType))
if results:
return results
chatService = getattr(services, "chat", None)
if not chatService:
return results
try:
docList = DocumentReferenceList.from_string_list(
documentListParam if isinstance(documentListParam, list) else [documentListParam]
)
chatDocuments = chatService.getChatDocumentsFromDocumentList(docList)
for chatDoc in (chatDocuments or []):
rawBytes = chatService.getFileData(chatDoc.fileId)
if not rawBytes:
continue
content = rawBytes.decode("utf-8") if isinstance(rawBytes, bytes) else rawBytes
try:
data = json.loads(content) if isinstance(content, str) else content
except (json.JSONDecodeError, TypeError):
continue
results.append((data, chatDoc.fileId, chatDoc.fileName or "document", chatDoc.mimeType or "application/json"))
except Exception as e:
logger.debug("_resolveDocumentList chat fallback failed: %s", e)
return results
def _autoLinkBankPositions(trusteeInterface, featureInstanceId, bankDocumentId, createdPositions) -> List[str]:
    """Link existing invoice/expense positions to the lines of one bank document.

    Loads all positions of the feature instance, finds the best match for
    each newly created bank line and writes *bankDocumentId* onto the matched
    position. Best-effort: any exception is logged and the ids linked up to
    that point are still returned.
    """
    linkedIds: List[str] = []
    try:
        from modules.features.trustee.datamodelFeatureTrustee import TrusteePosition

        candidatesRaw = trusteeInterface.db.getRecordset(
            TrusteePosition,
            recordFilter={"featureInstanceId": featureInstanceId},
        )
        candidatePositions = [_safeRecordFromModel(c) for c in (candidatesRaw or [])]
        # The candidate snapshot is not refreshed after updates, so track
        # what was linked here to avoid matching one position twice.
        matchedInThisBankDoc = set()
        for createdPos in createdPositions:
            bankPosition = _safeRecordFromModel(createdPos)
            if not bankPosition:
                continue
            matchCandidate = _findBestBankMatch(
                bankPosition=bankPosition,
                candidatePositions=candidatePositions,
                alreadyMatchedIds=matchedInThisBankDoc,
            )
            if not matchCandidate:
                continue
            matchedId = matchCandidate.get("id")
            if not matchedId:
                continue
            updated = trusteeInterface.updatePosition(
                matchedId,
                {
                    "bankDocumentId": bankDocumentId,
                },
            )
            if updated:
                matchedInThisBankDoc.add(matchedId)
                linkedIds.append(matchedId)
    except Exception:
        logger.exception("Automatic bank-document matching failed for documentId=%s", bankDocumentId)
    return linkedIds


async def processDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    Resolve documentList to extraction JSON documents and persist them.

    For every resolved document with at least one record, one TrusteeDocument
    (carrying the documentType) and one TrusteePosition per record are
    created. For documents of type ``bank_document`` the new lines are then
    auto-linked to existing positions.

    Parameters read from *parameters*:
        documentList: required; reference to the extractFromFiles result.
        featureInstanceId: optional; falls back to
            ``self.services.featureInstanceId``.

    Returns:
        On success, an ActionResult with one JSON ActionDocument
        ``{positionIds, documentIds, autoMatchedPositionIds}``. A failure
        result when a required parameter is missing, no documents resolve,
        or an exception occurs.
    """
    documentListParam = parameters.get("documentList")
    featureInstanceId = parameters.get("featureInstanceId") or (getattr(self.services, "featureInstanceId", None))
    if not documentListParam:
        return ActionResult.isFailure(error="documentList is required (reference to extractFromFiles result)")
    if not featureInstanceId:
        return ActionResult.isFailure(error="featureInstanceId is required")

    try:
        extractionDocs = _resolveDocumentList(documentListParam, self.services)
        if not extractionDocs:
            return ActionResult.isFailure(error="No documents found for documentList")

        from modules.features.trustee.interfaceFeatureTrustee import getInterface as getTrusteeInterface
        trusteeInterface = getTrusteeInterface(
            self.services.user,
            mandateId=self.services.mandateId,
            featureInstanceId=featureInstanceId
        )

        allPositionIds = []
        allDocumentIds = []
        autoMatchedPositionIds = []

        for data, fileId, fileName, mimeType in extractionDocs:
            # A payload that is not a JSON object cannot be read below; skip
            # it rather than failing the batch after other documents have
            # already been persisted.
            if not isinstance(data, dict):
                logger.warning("Skipping extraction result for %s: expected a JSON object", fileName)
                continue

            documentType = data.get("documentType")
            extractedData = data.get("extractedData")
            # Values inside the extraction JSON take precedence over the
            # metadata of the resolved document.
            fileId = data.get("fileId") or fileId
            fileName = data.get("fileName") or fileName or "document"

            if isinstance(extractedData, list):
                records = extractedData
            elif extractedData:
                records = [extractedData]
            else:
                records = []
            # Only dict records can be mapped to a position payload.
            records = [record for record in records if isinstance(record, dict)]
            if not records:
                continue

            docPayload = {
                "fileId": fileId,
                "documentName": fileName,
                "documentMimeType": mimeType or "application/octet-stream",
                "sourceType": "workflow",
                "documentType": documentType,
            }
            trusteeDoc = trusteeInterface.createDocument(docPayload)
            if not trusteeDoc:
                logger.warning("Failed to create TrusteeDocument for %s", fileName)
                continue
            allDocumentIds.append(trusteeDoc.id)

            # Normalise the same way the type is compared during matching
            # (lower-cased and stripped).
            docTypeLower = str(documentType or "unknown").strip().lower()

            createdPositions = []
            for record in records:
                posPayload = _recordToPosition(record, trusteeDoc.id, featureInstanceId, self.services.mandateId, documentType=docTypeLower)
                pos = trusteeInterface.createPosition(posPayload)
                if pos:
                    allPositionIds.append(pos.id)
                    createdPositions.append(pos)

            # Auto-link bank statement lines to existing invoice/expense positions.
            if docTypeLower == "bank_document" and createdPositions:
                autoMatchedPositionIds.extend(
                    _autoLinkBankPositions(trusteeInterface, featureInstanceId, trusteeDoc.id, createdPositions)
                )

        payload = {
            "positionIds": allPositionIds,
            "documentIds": allDocumentIds,
            "autoMatchedPositionIds": autoMatchedPositionIds,
        }
        return ActionResult.isSuccess(
            documents=[
                ActionDocument(
                    documentName="process_documents_result.json",
                    documentData=json.dumps(payload),
                    mimeType="application/json",
                )
            ]
        )
    except Exception as e:
        logger.exception("processDocuments failed")
        return ActionResult.isFailure(error=str(e))