376 lines
15 KiB
Python
376 lines
15 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""
|
|
Process extracted documents: create TrusteeDocument + TrusteePosition from extraction JSON.
|
|
Input: documentList (reference to extractFromFiles result).
|
|
Each document is JSON with documentType, extractedData, fileId, fileName.
|
|
extractedData is a list of expense/position records.
|
|
Output: one ActionDocument with JSON { positionIds, documentIds, autoMatchedPositionIds } for chaining to syncToAccounting.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from datetime import datetime
|
|
from typing import Dict, Any, List, Optional
|
|
|
|
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
|
|
from modules.datamodels.datamodelDocref import DocumentReferenceList
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _parseFloat(value) -> float:
    """Convert *value* to a finite float, falling back to 0.0.

    ``None``, empty strings, unparsable input, values too large for a float
    and non-finite results (NaN / infinity) all yield 0.0, so callers can do
    arithmetic and comparisons on the result without further checks.

    String input is stripped, and apostrophe thousands separators
    (e.g. ``"1'234.50"``) are removed before parsing.
    """
    import math

    try:
        if value is None or value == "":
            return 0.0
        if isinstance(value, str):
            # Apostrophes (ASCII or typographic) are used as digit grouping
            # in Swiss-formatted amounts; float() does not accept them.
            value = value.strip().replace("'", "").replace("\u2019", "")
        number = float(value)
    except (ValueError, TypeError, OverflowError):
        return 0.0
    # NaN/inf would poison every later comparison and sum, so treat as missing.
    return number if math.isfinite(number) else 0.0
|
|
|
|
|
|
def _extractAccountNumber(value) -> Optional[str]:
    """Return the leading digit run of an account label.

    Example: ``'6200 Fahrzeugaufwand'`` -> ``'6200'``. When the stripped text
    does not start with digits, the stripped text itself is returned.
    Non-string, empty or whitespace-only input yields ``None``.
    """
    import re

    if not isinstance(value, str) or not value:
        return None

    text = value.strip()
    leadingDigits = re.match(r"(\d+)", text)
    if leadingDigits:
        return leadingDigits.group(1)
    return text or None
|
|
|
|
|
|
def _normaliseTags(value) -> str:
    """Render tags as one comma-separated string.

    Falsy input gives ``""``. A list is joined with ``", "`` after dropping
    falsy entries and stringifying the rest. Anything else is passed
    through ``str()``.
    """
    if not value:
        return ""
    if not isinstance(value, list):
        return str(value)

    keptTags = [str(tag) for tag in value if tag]
    return ", ".join(keptTags)
|
|
|
|
|
|
def _cleanStr(value, default=None) -> Optional[str]:
    """Return ``str(value)`` without surrounding whitespace, or *default*.

    *default* is returned when *value* is falsy or when nothing remains
    after stripping.
    """
    if value:
        stripped = str(value).strip()
        if stripped:
            return stripped
    return default
|
|
|
|
|
|
def _normaliseRef(value: Any) -> Optional[str]:
    """Reduce a payment reference to its upper-case alphanumeric characters.

    The cleaned text is upper-cased and every character outside ``A-Z`` /
    ``0-9`` is removed, so differently formatted copies of one reference
    compare equal. Returns ``None`` when nothing usable remains.
    """
    import re

    text = _cleanStr(value)
    if not text:
        return None

    compact = re.sub(r"[^A-Z0-9]", "", text.upper())
    return compact if compact else None
|
|
|
|
|
|
def _parseIsoDate(value: Any) -> Optional[datetime]:
    """Parse the first ten characters of *value* as a ``YYYY-MM-DD`` date.

    Returns ``None`` when the value is empty or is not in that format.
    """
    text = _cleanStr(value)
    if text:
        try:
            # Only the date part is used; any trailing time component is cut off.
            return datetime.strptime(text[:10], "%Y-%m-%d")
        except ValueError:
            pass
    return None
|
|
|
|
|
|
def _normaliseAmount(value: Any) -> float:
    """Return the absolute amount rounded to two decimals.

    The sign is dropped because bank lines are often signed.
    """
    magnitude = abs(_parseFloat(value))
    return round(magnitude, 2)
|
|
|
|
|
|
def _normaliseCompany(value: Any) -> Optional[str]:
    """Reduce a company name to upper-case alphanumerics for approximate matching.

    Returns ``None`` when the input is empty or contains no ``A-Z`` / ``0-9``
    characters after upper-casing.
    """
    import re

    name = _cleanStr(value)
    if not name:
        return None

    compact = re.sub(r"[^A-Z0-9]", "", name.upper())
    if compact:
        return compact
    return None
|
|
|
|
|
|
def _findBestBankMatch(
    bankPosition: Dict[str, Any],
    candidatePositions: List[Dict[str, Any]],
    alreadyMatchedIds: set,
) -> Optional[Dict[str, Any]]:
    """Find the best invoice/expense position for one bank position.

    Each candidate is scored against the bank line:

    * identical normalised payment/booking reference: +100
    * absolute amounts within 0.05 of each other: +40
    * identical normalised payee IBAN: +25
    * valuta dates at most 3 days apart: +20, at most 14 days apart: +10
    * one normalised company name contained in the other: +15

    A candidate is accepted only if it reaches the minimum score: 45 when the
    bank line carries a reference, 65 otherwise. The highest-scoring accepted
    candidate wins; on ties the earlier candidate is kept.

    Candidates are skipped when they have no id, are listed in
    *alreadyMatchedIds*, already carry a ``bankDocumentId``, are themselves
    of type ``bank_document``, or are the bank position itself.

    Args:
        bankPosition: Plain-dict bank statement position.
        candidatePositions: Plain-dict positions to match against.
        alreadyMatchedIds: Ids of candidates that must not be matched again.

    Returns:
        The best matching candidate dict, or ``None`` if none qualifies.
    """
    bankId = bankPosition.get("id")
    bankRef = _normaliseRef(bankPosition.get("paymentReference") or bankPosition.get("bookingReference"))
    bankAmount = _normaliseAmount(bankPosition.get("bookingAmount"))
    bankIban = _normaliseRef(bankPosition.get("payeeIban"))
    bankDate = _parseIsoDate(bankPosition.get("valuta"))
    bankCompany = _normaliseCompany(bankPosition.get("company"))

    # If no reference exists, require stronger secondary evidence.
    # The threshold depends only on the bank line, so it is computed once.
    minScore = 45 if bankRef else 65

    bestScore = 0
    bestCandidate = None

    for candidate in candidatePositions:
        candidateId = candidate.get("id")
        if not candidateId or candidateId in alreadyMatchedIds:
            continue
        # Never link a bank position to itself: the candidate pool may contain
        # it when its own documentType differs from "bank_document".
        if bankId is not None and candidateId == bankId:
            continue
        if candidate.get("bankDocumentId"):
            continue
        if (candidate.get("documentType") or "").lower().strip() == "bank_document":
            continue

        score = 0
        candidateRef = _normaliseRef(candidate.get("paymentReference") or candidate.get("bookingReference"))
        candidateAmount = _normaliseAmount(candidate.get("bookingAmount"))
        candidateIban = _normaliseRef(candidate.get("payeeIban"))
        candidateDate = _parseIsoDate(candidate.get("valuta"))
        candidateCompany = _normaliseCompany(candidate.get("company"))

        # Strongest signal: structured payment reference / invoice reference match.
        if bankRef and candidateRef and bankRef == candidateRef:
            score += 100

        # Amount must usually match; use tolerance for minor rounding differences.
        if abs(candidateAmount - bankAmount) <= 0.05:
            score += 40

        # IBAN is a strong supporting signal.
        if bankIban and candidateIban and bankIban == candidateIban:
            score += 25

        # Small date difference increases confidence.
        if bankDate and candidateDate:
            dayDiff = abs((bankDate - candidateDate).days)
            if dayDiff <= 3:
                score += 20
            elif dayDiff <= 14:
                score += 10

        # Company/party comparison helps when no structured reference is present.
        if bankCompany and candidateCompany and (
            bankCompany in candidateCompany or candidateCompany in bankCompany
        ):
            score += 15

        if score >= minScore and score > bestScore:
            bestScore = score
            bestCandidate = candidate

    return bestCandidate
|
|
|
|
|
|
def _safeRecordFromModel(modelOrDict: Any) -> Dict[str, Any]:
    """Return a plain dict view of a record for matching.

    Falsy input gives an empty dict. A dict is returned unchanged (same
    object). An object exposing ``model_dump`` is converted through it.
    Anything else is passed to ``dict()``, with an empty dict as fallback
    when that conversion raises.
    """
    if not modelOrDict:
        return {}
    if isinstance(modelOrDict, dict):
        return modelOrDict
    if hasattr(modelOrDict, "model_dump"):
        return modelOrDict.model_dump()

    try:
        converted = dict(modelOrDict)
    except Exception:
        converted = {}
    return converted
|
|
|
|
|
|
def _recordToPosition(record: Dict[str, Any], documentId: Optional[str], featureInstanceId: str, mandateId: str, documentType: Optional[str] = None) -> Dict[str, Any]:
    """Map one extraction record to a TrusteePosition payload.

    Args:
        record: One entry of the extraction result's ``extractedData``.
        documentId: Id of the TrusteeDocument the position belongs to.
        featureInstanceId: Feature instance the position is stored under.
        mandateId: Mandate the position is stored under.
        documentType: Document-level type, used when the record carries no
            ``documentType`` of its own.

    Returns:
        Dict payload for creating a position. Keys that are present in the
        record but null/empty are treated like missing keys, so the defaults
        ("CHF" for currencies, "" for company/desc, 0.0 for amounts) apply
        in both cases.
    """
    recDocType = _cleanStr(record.get("documentType")) or documentType
    if recDocType:
        recDocType = recDocType.lower().strip()

    # `dict.get(key, default)` only applies the default when the key is
    # absent; extraction JSON often contains explicit nulls, so use `or`.
    bookingCurrency = record.get("bookingCurrency") or "CHF"
    bookingAmount = _parseFloat(record.get("bookingAmount"))

    return {
        "documentId": documentId,
        "documentType": recDocType,
        "valuta": record.get("valuta"),
        "transactionDateTime": record.get("transactionDateTime"),
        "company": record.get("company") or "",
        "desc": record.get("desc") or "",
        "tags": _normaliseTags(record.get("tags")),
        "bookingCurrency": bookingCurrency,
        "bookingAmount": bookingAmount,
        # Original currency/amount fall back to the booking values.
        "originalCurrency": record.get("originalCurrency") or bookingCurrency,
        "originalAmount": _parseFloat(record.get("originalAmount")) or bookingAmount,
        "vatPercentage": _parseFloat(record.get("vatPercentage")),
        "vatAmount": _parseFloat(record.get("vatAmount")),
        "debitAccountNumber": _extractAccountNumber(record.get("debitAccountNumber")),
        "creditAccountNumber": _extractAccountNumber(record.get("creditAccountNumber")),
        "taxCode": record.get("taxCode") or None,
        "costCenter": record.get("costCenter") or None,
        "bookingReference": record.get("bookingReference") or None,
        "payeeIban": _cleanStr(record.get("payeeIban")),
        "payeeName": _cleanStr(record.get("payeeName")),
        "payeeBic": _cleanStr(record.get("payeeBic")),
        "paymentReference": _cleanStr(record.get("paymentReference")),
        "dueDate": _cleanStr(record.get("dueDate")),
        "featureInstanceId": featureInstanceId,
        "mandateId": mandateId,
    }
|
|
|
|
|
|
def _resolveDocumentList(documentListParam, services) -> List[tuple]:
    """Resolve documentList from either Graph-Editor output (list of dicts) or Chat references.

    Two input shapes are handled:

    1. A list whose first element is a dict with a ``documentData`` or
       ``documentName`` key: each dict's ``documentData`` is used directly
       (parsed as JSON when it is a string or bytes).
    2. Anything else, or when shape 1 produced no results: the value is
       treated as document reference(s) and resolved through
       ``services.chat``; each file's content is loaded and parsed as JSON.

    Entries that are not dicts, have no data, cannot be decoded/parsed, or
    whose parsed content is not a JSON object are skipped individually so
    that one bad entry does not discard the others.

    Args:
        documentListParam: Graph-Editor document dicts or chat reference(s).
        services: Service container; only its optional ``chat`` attribute is used.

    Returns:
        List of (data_dict, fileId, fileName, mimeType) tuples.
    """
    results = []

    if isinstance(documentListParam, list) and documentListParam:
        first = documentListParam[0]
        if isinstance(first, dict) and ("documentData" in first or "documentName" in first):
            for doc in documentListParam:
                # Only the first element was type-checked above.
                if not isinstance(doc, dict):
                    continue
                rawData = doc.get("documentData")
                if not rawData:
                    continue
                try:
                    data = json.loads(rawData) if isinstance(rawData, (str, bytes, bytearray)) else rawData
                except (ValueError, TypeError):
                    # JSONDecodeError and UnicodeDecodeError are both ValueErrors.
                    continue
                if not isinstance(data, dict):
                    continue
                metadata = doc.get("validationMetadata")
                if not isinstance(metadata, dict):
                    metadata = {}
                fileId = metadata.get("fileId") or doc.get("fileId", "")
                fileName = doc.get("documentName") or doc.get("fileName") or "document"
                mimeType = doc.get("mimeType") or doc.get("documentMimeType") or "application/json"
                results.append((data, fileId, fileName, mimeType))
            if results:
                return results

    chatService = getattr(services, "chat", None)
    if not chatService:
        return results

    try:
        docList = DocumentReferenceList.from_string_list(
            documentListParam if isinstance(documentListParam, list) else [documentListParam]
        )
        chatDocuments = chatService.getChatDocumentsFromDocumentList(docList)
        for chatDoc in (chatDocuments or []):
            rawBytes = chatService.getFileData(chatDoc.fileId)
            if not rawBytes:
                continue
            try:
                # Decoding is inside the per-document try so that one
                # non-UTF-8 file does not abort the remaining documents.
                content = rawBytes.decode("utf-8") if isinstance(rawBytes, bytes) else rawBytes
                data = json.loads(content) if isinstance(content, str) else content
            except (ValueError, TypeError):
                continue
            if not isinstance(data, dict):
                continue
            results.append((data, chatDoc.fileId, chatDoc.fileName or "document", chatDoc.mimeType or "application/json"))
    except Exception as e:
        # Best-effort fallback: return whatever was resolved so far.
        logger.debug("_resolveDocumentList chat fallback failed: %s", e)

    return results
|
|
|
|
|
|
def _autoLinkBankPositions(trusteeInterface, createdPositions, bankDocumentId, featureInstanceId) -> List[str]:
    """Link positions of a new bank document to existing invoice/expense positions.

    Loads all positions of the feature instance as match candidates, finds
    the best candidate for each created bank position and sets the
    candidate's ``bankDocumentId`` to *bankDocumentId*.

    Matching is best-effort: any exception is logged and the ids matched up
    to that point are still returned.

    Returns:
        Ids of the positions that were successfully linked.
    """
    matchedIds: List[str] = []
    try:
        from modules.features.trustee.datamodelFeatureTrustee import TrusteePosition

        candidatesRaw = trusteeInterface.db.getRecordset(
            TrusteePosition,
            recordFilter={"featureInstanceId": featureInstanceId},
        )
        bankPositions = [_safeRecordFromModel(createdPos) for createdPos in createdPositions]

        # The positions just created from this bank document are part of the
        # recordset; exclude them so a bank line can never be linked to a
        # line of its own document (their documentType may differ from
        # "bank_document" when a record carries its own type).
        ownIds = {bankPos.get("id") for bankPos in bankPositions if bankPos.get("id")}
        candidatePositions = [
            candidate
            for candidate in (_safeRecordFromModel(raw) for raw in (candidatesRaw or []))
            if candidate.get("id") not in ownIds
        ]
        matchedInThisBankDoc = set()

        for bankPosition in bankPositions:
            if not bankPosition:
                continue

            matchCandidate = _findBestBankMatch(
                bankPosition=bankPosition,
                candidatePositions=candidatePositions,
                alreadyMatchedIds=matchedInThisBankDoc,
            )
            if not matchCandidate:
                continue

            matchedId = matchCandidate.get("id")
            if not matchedId:
                continue

            updated = trusteeInterface.updatePosition(
                matchedId,
                {
                    "bankDocumentId": bankDocumentId,
                },
            )
            if updated:
                matchedInThisBankDoc.add(matchedId)
                matchedIds.append(matchedId)
    except Exception:
        logger.exception("Automatic bank-document matching failed for documentId=%s", bankDocumentId)
    return matchedIds


async def processDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    Resolve documentList to extraction results, load extraction JSON per document,
    create TrusteeDocument (with documentType) + TrusteePosition(s), return one JSON document
    with positionIds/documentIds/autoMatchedPositionIds.

    Parameters read from *parameters*:
        documentList: Required. Reference to the extractFromFiles result.
        featureInstanceId: Optional; falls back to ``self.services.featureInstanceId``.

    For documents of type ``bank_document`` the created positions are
    additionally matched against existing positions of the feature instance,
    and matched positions get their ``bankDocumentId`` set.

    Returns:
        A failure ActionResult when a required parameter is missing, no
        document could be resolved, or an unexpected exception occurs;
        otherwise a success ActionResult with one
        ``process_documents_result.json`` document.
    """
    documentListParam = parameters.get("documentList")
    featureInstanceId = parameters.get("featureInstanceId") or (getattr(self.services, "featureInstanceId", None))

    if not documentListParam:
        return ActionResult.isFailure(error="documentList is required (reference to extractFromFiles result)")
    if not featureInstanceId:
        return ActionResult.isFailure(error="featureInstanceId is required")

    try:
        extractionDocs = _resolveDocumentList(documentListParam, self.services)
        if not extractionDocs:
            return ActionResult.isFailure(error="No documents found for documentList")

        from modules.features.trustee.interfaceFeatureTrustee import getInterface as getTrusteeInterface

        trusteeInterface = getTrusteeInterface(
            self.services.user,
            mandateId=self.services.mandateId,
            featureInstanceId=featureInstanceId
        )

        allPositionIds = []
        allDocumentIds = []
        autoMatchedPositionIds = []

        for data, fileId, fileName, mimeType in extractionDocs:
            # A non-object JSON payload cannot carry documentType/extractedData.
            if not isinstance(data, dict):
                logger.warning("Skipping extraction result for %s: expected a JSON object", fileName)
                continue

            documentType = data.get("documentType")
            extractedData = data.get("extractedData")
            fileId = data.get("fileId") or fileId
            fileName = data.get("fileName") or fileName or "document"

            rawRecords = extractedData if isinstance(extractedData, list) else [extractedData] if extractedData else []
            # Drop malformed entries up front so that no TrusteeDocument is
            # created and then orphaned by a failure while mapping a record.
            records = [record for record in rawRecords if isinstance(record, dict)]
            if len(records) != len(rawRecords):
                logger.warning(
                    "Ignoring %d non-object record(s) in extraction result for %s",
                    len(rawRecords) - len(records),
                    fileName,
                )
            if not records:
                continue

            docPayload = {
                "fileId": fileId,
                "documentName": fileName,
                "documentMimeType": mimeType or "application/octet-stream",
                "sourceType": "workflow",
                "documentType": documentType,
            }
            trusteeDoc = trusteeInterface.createDocument(docPayload)
            if not trusteeDoc:
                logger.warning("Failed to create TrusteeDocument for %s", fileName)
                continue
            allDocumentIds.append(trusteeDoc.id)

            docTypeLower = str(documentType or "unknown").strip().lower()
            createdPositions = []
            for record in records:
                posPayload = _recordToPosition(record, trusteeDoc.id, featureInstanceId, self.services.mandateId, documentType=docTypeLower)
                pos = trusteeInterface.createPosition(posPayload)
                if pos:
                    allPositionIds.append(pos.id)
                    createdPositions.append(pos)

            # Auto-link bank statement lines to existing invoice/expense positions.
            if docTypeLower == "bank_document" and createdPositions:
                autoMatchedPositionIds.extend(
                    _autoLinkBankPositions(trusteeInterface, createdPositions, trusteeDoc.id, featureInstanceId)
                )

        payload = {
            "positionIds": allPositionIds,
            "documentIds": allDocumentIds,
            "autoMatchedPositionIds": autoMatchedPositionIds,
        }
        return ActionResult.isSuccess(
            documents=[
                ActionDocument(
                    documentName="process_documents_result.json",
                    documentData=json.dumps(payload),
                    mimeType="application/json",
                )
            ]
        )
    except Exception as e:
        logger.exception("processDocuments failed")
        return ActionResult.isFailure(error=str(e))
|