# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Process extracted documents: create TrusteeDocument + TrusteePosition from extraction JSON.

Input: documentList (reference to extractFromFiles result). Each document is JSON with
documentType, extractedData, fileId, fileName. extractedData is a list of expense/position
records (a single record object is also accepted).

Output: one ActionDocument with JSON { positionIds, documentIds, autoMatchedPositionIds }
for chaining to syncToAccounting.
"""

import json
import logging
import math
import re
from datetime import datetime
from typing import Dict, Any, List, Optional

from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelDocref import DocumentReferenceList

logger = logging.getLogger(__name__)

# Document type that marks bank statement lines (compared lower-cased and stripped).
_BANK_DOCUMENT_TYPE = "bank_document"

# Maximum absolute difference (in currency units) for two amounts to count as equal.
_AMOUNT_TOLERANCE = 0.05

# Everything that is not an upper-case ASCII letter or digit (applied after .upper()).
_NON_ALNUM = re.compile(r"[^A-Z0-9]")

# Leading run of digits, e.g. the account number in "6200 Fahrzeugaufwand".
_LEADING_DIGITS = re.compile(r"(\d+)")

# Thousands-grouping characters seen in textual amounts ("1'234.50", "1 234.50").
_GROUPING_CHARS = re.compile(r"['\u2019\s]")


def _parseFloat(value) -> float:
    """Convert *value* to a finite float; return 0.0 for anything unusable.

    Strings may contain apostrophe or whitespace thousands separators. NaN and
    infinities (which ``json.loads`` accepts) are mapped to 0.0 so they never
    reach amount fields.
    """
    try:
        if value is None or value == "":
            return 0.0
        if isinstance(value, str):
            value = _GROUPING_CHARS.sub("", value)
        number = float(value)
    except (ValueError, TypeError, OverflowError):
        return 0.0
    return number if math.isfinite(number) else 0.0


def _extractAccountNumber(value) -> Optional[str]:
    """Extract the leading numeric account number from AI output.

    '6200 Fahrzeugaufwand' -> '6200'. Numeric JSON values (6200) are accepted
    as well. Text without leading digits is returned stripped; empty, boolean
    or otherwise unsupported input yields None.
    """
    if not value or isinstance(value, bool):
        return None
    if isinstance(value, float) and not math.isfinite(value):
        return None
    if not isinstance(value, (str, int, float)):
        return None
    text = str(value).strip()
    match = _LEADING_DIGITS.match(text)
    if match:
        return match.group(1)
    return text or None


def _normaliseTags(value) -> str:
    """Convert tags from various formats to a clean comma-separated string."""
    if not value:
        return ""
    if isinstance(value, (list, tuple)):
        stripped = (str(tag).strip() for tag in value if tag)
        return ", ".join(tag for tag in stripped if tag)
    return str(value).strip()


def _cleanStr(value, default=None) -> Optional[str]:
    """Strip and return a non-empty string, else *default*.

    Any falsy *value* (None, "", 0, empty container) yields *default*.
    """
    if not value:
        return default
    text = str(value).strip()
    return text if text else default


def _normaliseRef(value: Any) -> Optional[str]:
    """Normalise payment references for robust matching.

    Upper-cases the text and drops every character outside A-Z / 0-9, so
    differences in spacing and punctuation do not prevent a match.
    Returns None when nothing remains.
    """
    raw = _cleanStr(value)
    if not raw:
        return None
    return _NON_ALNUM.sub("", raw.upper()) or None


def _parseIsoDate(value: Any) -> Optional[datetime]:
    """Parse a YYYY-MM-DD date (only the first 10 characters are read) for proximity scoring."""
    raw = _cleanStr(value)
    if not raw:
        return None
    try:
        return datetime.strptime(raw[:10], "%Y-%m-%d")
    except ValueError:
        return None


def _normaliseAmount(value: Any) -> float:
    """Use absolute rounded amount, since bank lines are often signed."""
    return round(abs(_parseFloat(value)), 2)


def _normaliseCompany(value: Any) -> Optional[str]:
    """Normalise company names for approximate matching (same folding as references)."""
    raw = _cleanStr(value)
    if not raw:
        return None
    return _NON_ALNUM.sub("", raw.upper()) or None


def _findBestBankMatch(
    bankPosition: Dict[str, Any],
    candidatePositions: List[Dict[str, Any]],
    alreadyMatchedIds: set,
) -> Optional[Dict[str, Any]]:
    """Find best invoice/expense position for one bank position.

    Scoring per candidate:
      +100  identical normalised payment/booking reference
      +40   amounts within the tolerance (both amounts must be present)
      +25   identical payee IBAN
      +20   value dates at most 3 days apart, +10 if at most 14 days apart
      +15   one normalised company name contains the other

    Candidates are skipped when they have no id, are already matched, are the
    bank position itself, are already linked to a bank document, or are bank
    statement lines themselves.

    Args:
        bankPosition: Plain dict of the bank statement position.
        candidatePositions: Plain dicts of positions to match against.
        alreadyMatchedIds: Ids that must not be returned again.

    Returns:
        The highest scoring candidate that reaches the minimum score (the
        first one wins on ties), or None.
    """
    bankId = bankPosition.get("id")
    bankRef = _normaliseRef(bankPosition.get("paymentReference") or bankPosition.get("bookingReference"))
    bankAmount = _normaliseAmount(bankPosition.get("bookingAmount"))
    bankIban = _normaliseRef(bankPosition.get("payeeIban"))
    bankDate = _parseIsoDate(bankPosition.get("valuta"))
    bankCompany = _normaliseCompany(bankPosition.get("company"))

    # If no reference exists, require stronger secondary evidence.
    # NOTE(review): the lower bar also applies when the bank line has a reference
    # that does not equal the candidate's reference - confirm this is intended.
    minScore = 45 if bankRef else 65

    bestScore = 0
    bestCandidate = None

    for candidate in candidatePositions:
        candidateId = candidate.get("id")
        if not candidateId or candidateId in alreadyMatchedIds:
            continue
        # A bank line must never be linked to itself.
        if bankId is not None and candidateId == bankId:
            continue
        if candidate.get("bankDocumentId"):
            continue
        if (candidate.get("documentType") or "").lower().strip() == _BANK_DOCUMENT_TYPE:
            continue

        score = 0
        candidateRef = _normaliseRef(candidate.get("paymentReference") or candidate.get("bookingReference"))
        candidateAmount = _normaliseAmount(candidate.get("bookingAmount"))
        candidateIban = _normaliseRef(candidate.get("payeeIban"))
        candidateDate = _parseIsoDate(candidate.get("valuta"))
        candidateCompany = _normaliseCompany(candidate.get("company"))

        # Strongest signal: structured payment reference / invoice reference match.
        if bankRef and candidateRef and bankRef == candidateRef:
            score += 100

        # Amount must usually match; use tolerance for minor rounding differences.
        # Two missing amounts (both 0.0) are no evidence, and the difference is
        # rounded because e.g. 1.05 - 1.0 is slightly above 0.05 in binary floats.
        if (
            bankAmount > 0
            and candidateAmount > 0
            and round(abs(candidateAmount - bankAmount), 2) <= _AMOUNT_TOLERANCE
        ):
            score += 40

        # IBAN is a strong supporting signal.
        if bankIban and candidateIban and bankIban == candidateIban:
            score += 25

        # Small date difference increases confidence.
        if bankDate and candidateDate:
            dayDiff = abs((bankDate - candidateDate).days)
            if dayDiff <= 3:
                score += 20
            elif dayDiff <= 14:
                score += 10

        # Company/party comparison helps when no structured reference is present.
        if bankCompany and candidateCompany and (
            bankCompany in candidateCompany or candidateCompany in bankCompany
        ):
            score += 15

        if score >= minScore and score > bestScore:
            bestScore = score
            bestCandidate = candidate

    return bestCandidate


def _safeRecordFromModel(modelOrDict: Any) -> Dict[str, Any]:
    """Convert Pydantic/dict object to plain dict for matching.

    Falsy input and objects that cannot be converted yield an empty dict.
    """
    if not modelOrDict:
        return {}
    if isinstance(modelOrDict, dict):
        return modelOrDict
    if hasattr(modelOrDict, "model_dump"):
        return modelOrDict.model_dump()
    try:
        return dict(modelOrDict)
    except Exception:
        return {}


def _recordToPosition(
    record: Dict[str, Any],
    documentId: Optional[str],
    featureInstanceId: str,
    mandateId: str,
    documentType: Optional[str] = None,
) -> Dict[str, Any]:
    """Map extraction record to TrusteePosition payload.

    Args:
        record: One extracted expense/position record.
        documentId: Id of the TrusteeDocument the position belongs to.
        featureInstanceId: Feature instance the position is stored under.
        mandateId: Mandate the position is stored under.
        documentType: Fallback type when the record carries no own documentType.

    Returns:
        Dict with the position fields. Missing or null currencies default to
        "CHF", missing original values fall back to the booking values, and
        missing or null company/desc become "".
    """
    recDocType = _cleanStr(record.get("documentType")) or documentType
    if recDocType:
        recDocType = recDocType.lower().strip()

    # Treat explicit nulls like missing keys so the defaults apply in both cases.
    bookingCurrency = _cleanStr(record.get("bookingCurrency"), "CHF")
    bookingAmount = _parseFloat(record.get("bookingAmount", 0))

    return {
        "documentId": documentId,
        "documentType": recDocType,
        "valuta": record.get("valuta"),
        "transactionDateTime": record.get("transactionDateTime"),
        "company": record.get("company") or "",
        "desc": record.get("desc") or "",
        "tags": _normaliseTags(record.get("tags")),
        "bookingCurrency": bookingCurrency,
        "bookingAmount": bookingAmount,
        "originalCurrency": _cleanStr(record.get("originalCurrency")) or bookingCurrency,
        "originalAmount": _parseFloat(record.get("originalAmount", 0)) or bookingAmount,
        "vatPercentage": _parseFloat(record.get("vatPercentage", 0)),
        "vatAmount": _parseFloat(record.get("vatAmount", 0)),
        "debitAccountNumber": _extractAccountNumber(record.get("debitAccountNumber")),
        "creditAccountNumber": _extractAccountNumber(record.get("creditAccountNumber")),
        "taxCode": record.get("taxCode") or None,
        "costCenter": record.get("costCenter") or None,
        "bookingReference": record.get("bookingReference") or None,
        "payeeIban": _cleanStr(record.get("payeeIban")),
        "payeeName": _cleanStr(record.get("payeeName")),
        "payeeBic": _cleanStr(record.get("payeeBic")),
        "paymentReference": _cleanStr(record.get("paymentReference")),
        "dueDate": _cleanStr(record.get("dueDate")),
        "featureInstanceId": featureInstanceId,
        "mandateId": mandateId,
    }


def _loadExtraction(rawData: Any) -> Optional[Dict[str, Any]]:
    """Decode one stored extraction result; return None unless it is a JSON object."""
    try:
        # utf-8-sig also accepts plain UTF-8 and tolerates a leading BOM.
        content = rawData.decode("utf-8-sig") if isinstance(rawData, (bytes, bytearray)) else rawData
        data = json.loads(content) if isinstance(content, str) else content
    except ValueError:
        # Covers both UnicodeDecodeError and json.JSONDecodeError.
        return None
    return data if isinstance(data, dict) else None


def _autoMatchBankPositions(
    trusteeInterface: Any,
    bankDocumentId: Any,
    createdPositions: List[Any],
    featureInstanceId: str,
) -> List[Any]:
    """Link existing positions to the bank lines just created; return the linked position ids.

    Best effort: any error is logged and the ids linked so far are returned.
    """
    matchedIds: List[Any] = []
    try:
        from modules.features.trustee.datamodelFeatureTrustee import TrusteePosition

        candidatesRaw = trusteeInterface.db.getRecordset(
            TrusteePosition,
            recordFilter={"featureInstanceId": featureInstanceId},
        )
        # The bank lines created a moment ago are part of the recordset; they
        # must not be offered as match candidates.
        createdIds = {getattr(pos, "id", None) for pos in createdPositions}
        candidatePositions = [
            candidate
            for candidate in (_safeRecordFromModel(raw) for raw in (candidatesRaw or []))
            if candidate.get("id") not in createdIds
        ]

        matchedInThisBankDoc = set()
        for createdPos in createdPositions:
            bankPosition = _safeRecordFromModel(createdPos)
            if not bankPosition:
                continue
            matchCandidate = _findBestBankMatch(
                bankPosition=bankPosition,
                candidatePositions=candidatePositions,
                alreadyMatchedIds=matchedInThisBankDoc,
            )
            if not matchCandidate:
                continue
            matchedId = matchCandidate.get("id")
            if not matchedId:
                continue
            updated = trusteeInterface.updatePosition(
                matchedId,
                {
                    "bankDocumentId": bankDocumentId,
                },
            )
            if updated:
                matchedInThisBankDoc.add(matchedId)
                matchedIds.append(matchedId)
    except Exception:
        logger.exception("Automatic bank-document matching failed for documentId=%s", bankDocumentId)
    return matchedIds


async def processDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    Resolve documentList to ChatDocuments, load extraction JSON per document,
    create TrusteeDocument (with documentType) + TrusteePosition(s), return one JSON document
    with positionIds/documentIds/autoMatchedPositionIds.

    Args:
        parameters: Must contain "documentList" (a reference or a list of
            references). "featureInstanceId" is optional and falls back to
            ``self.services.featureInstanceId``.

    Returns:
        A success result carrying "process_documents_result.json", or a
        failure result when a required parameter is missing, no documents
        resolve, or an unexpected error occurs. Documents that cannot be
        loaded or parsed, or that contain no usable records, are skipped.
    """
    documentListParam = parameters.get("documentList")
    featureInstanceId = parameters.get("featureInstanceId") or getattr(self.services, "featureInstanceId", None)
    if not documentListParam:
        return ActionResult.isFailure(error="documentList is required (reference to extractFromFiles result)")
    if not featureInstanceId:
        return ActionResult.isFailure(error="featureInstanceId is required")

    try:
        docList = DocumentReferenceList.from_string_list(
            documentListParam if isinstance(documentListParam, list) else [documentListParam]
        )
        chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(docList)
        if not chatDocuments:
            return ActionResult.isFailure(error="No documents found for documentList")

        from modules.features.trustee.interfaceFeatureTrustee import getInterface as getTrusteeInterface

        trusteeInterface = getTrusteeInterface(
            self.services.user,
            mandateId=self.services.mandateId,
            featureInstanceId=featureInstanceId,
        )

        allPositionIds = []
        allDocumentIds = []
        autoMatchedPositionIds = []

        for chatDoc in chatDocuments:
            rawBytes = self.services.chat.getFileData(chatDoc.fileId)
            if not rawBytes:
                logger.warning("Could not load file %s, skipping", chatDoc.fileId)
                continue

            # One broken file must not abort the run after earlier documents
            # have already been persisted.
            data = _loadExtraction(rawBytes)
            if data is None:
                logger.warning("File %s is not a valid extraction JSON object, skipping", chatDoc.fileId)
                continue

            documentType = data.get("documentType")
            extractedData = data.get("extractedData")
            fileId = data.get("fileId") or chatDoc.fileId
            fileName = data.get("fileName") or chatDoc.fileName or "document"

            if isinstance(extractedData, list):
                rawRecords = extractedData
            elif extractedData:
                rawRecords = [extractedData]
            else:
                rawRecords = []
            records = [record for record in rawRecords if isinstance(record, dict)]
            if len(records) != len(rawRecords):
                logger.warning(
                    "Ignoring %d non-object record(s) in %s", len(rawRecords) - len(records), fileName
                )
            if not records:
                continue

            docPayload = {
                "fileId": fileId,
                "documentName": fileName,
                "documentMimeType": chatDoc.mimeType or "application/octet-stream",
                "sourceType": "workflow",
                "documentType": documentType,
            }
            trusteeDoc = trusteeInterface.createDocument(docPayload)
            if not trusteeDoc:
                logger.warning("Failed to create TrusteeDocument for %s", fileName)
                continue
            allDocumentIds.append(trusteeDoc.id)

            # Stripped so that e.g. "bank_document " still triggers the auto-matching below.
            docTypeLower = _cleanStr(documentType, "unknown").lower()
            createdPositions = []
            for record in records:
                posPayload = _recordToPosition(
                    record,
                    trusteeDoc.id,
                    featureInstanceId,
                    self.services.mandateId,
                    documentType=docTypeLower,
                )
                pos = trusteeInterface.createPosition(posPayload)
                if pos:
                    allPositionIds.append(pos.id)
                    createdPositions.append(pos)

            # Auto-link bank statement lines to existing invoice/expense positions.
            if docTypeLower == _BANK_DOCUMENT_TYPE and createdPositions:
                autoMatchedPositionIds.extend(
                    _autoMatchBankPositions(
                        trusteeInterface,
                        trusteeDoc.id,
                        createdPositions,
                        featureInstanceId,
                    )
                )

        payload = {
            "positionIds": allPositionIds,
            "documentIds": allDocumentIds,
            "autoMatchedPositionIds": autoMatchedPositionIds,
        }
        return ActionResult.isSuccess(
            documents=[
                ActionDocument(
                    documentName="process_documents_result.json",
                    documentData=json.dumps(payload),
                    mimeType="application/json",
                )
            ]
        )
    except Exception as e:
        logger.exception("processDocuments failed")
        return ActionResult.isFailure(error=str(e))