gateway/modules/workflows/methods/methodTrustee/actions/processDocuments.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Process extracted documents: create TrusteeDocument + TrusteePosition from extraction JSON.
Input: documentList (reference to extractFromFiles result).
Each document is JSON with documentType, extractedData, fileId, fileName.
extractedData is a list of expense/position records.
Output: one ActionDocument with JSON { positionIds, documentIds, autoMatchedPositionIds } for chaining to syncToAccounting.
"""

import json
import logging
from datetime import datetime
from typing import Dict, Any, List, Optional

from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelDocref import DocumentReferenceList

logger = logging.getLogger(__name__)


def _parseFloat(value) -> float:
    try:
        if value is None or value == "":
            return 0.0
        return float(value)
    except (ValueError, TypeError):
        return 0.0


def _extractAccountNumber(value) -> Optional[str]:
    """Extract the leading numeric account number from AI output like '6200 Fahrzeugaufwand' -> '6200'."""
    if not value or not isinstance(value, str):
        return None
    import re
    match = re.match(r"(\d+)", value.strip())
    return match.group(1) if match else value.strip() or None


def _normaliseTags(value) -> str:
    """Convert tags from various formats to a clean comma-separated string."""
    if not value:
        return ""
    if isinstance(value, list):
        return ", ".join(str(t) for t in value if t)
    return str(value)


def _cleanStr(value, default=None) -> Optional[str]:
    """Strip and return a non-empty string, else *default*."""
    if not value:
        return default
    s = str(value).strip()
    return s if s else default


def _normaliseRef(value: Any) -> Optional[str]:
    """Reduce a payment reference to its upper-case A-Z / 0-9 characters.

    Returns ``None`` when the input is empty or nothing is left after
    removing every other character.
    """
    import re

    text = _cleanStr(value)
    if text:
        compact = re.sub(r"[^A-Z0-9]", "", text.upper())
        if compact:
            return compact
    return None


def _parseIsoDate(value: Any) -> Optional[datetime]:
    """Read the first ten characters of *value* as a YYYY-MM-DD date.

    Returns ``None`` for empty input or when those characters are not a
    valid date in that format.
    """
    text = _cleanStr(value)
    if not text:
        return None
    datePart = text[:10]
    try:
        parsed = datetime.strptime(datePart, "%Y-%m-%d")
    except ValueError:
        parsed = None
    return parsed


def _normaliseAmount(value: Any) -> float:
    """Return the unsigned amount rounded to two decimals (sign is dropped)."""
    magnitude = abs(_parseFloat(value))
    return round(magnitude, 2)


def _normaliseCompany(value: Any) -> Optional[str]:
    """Reduce a company name to its upper-case A-Z / 0-9 characters.

    Returns ``None`` when the input is empty or no such characters remain.
    """
    import re

    text = _cleanStr(value)
    compact = re.sub(r"[^A-Z0-9]", "", text.upper()) if text else ""
    return compact or None


def _findBestBankMatch(
    bankPosition: Dict[str, Any],
    candidatePositions: List[Dict[str, Any]],
    alreadyMatchedIds: set,
) -> Optional[Dict[str, Any]]:
    """Find the best invoice/expense position for one bank position.

    Each eligible candidate is scored against the bank position:

    * +100  identical normalised payment/booking reference
    * +40   amounts present on both sides and within 0.05 of each other
    * +25   identical normalised payee IBAN
    * +20 / +10  valuta dates at most 3 / 14 days apart
    * +15   one normalised company name contains the other

    A candidate is skipped when it has no id, its id is in
    *alreadyMatchedIds*, it is the bank position itself, it already carries a
    ``bankDocumentId``, or its ``documentType`` is ``bank_document``.

    Args:
        bankPosition: Plain-dict bank statement line.
        candidatePositions: Plain-dict positions to match against.
        alreadyMatchedIds: Ids that were already linked and must not be reused.

    Returns:
        The highest-scoring candidate reaching the minimum score (45 when the
        bank position has a reference, 65 otherwise), or ``None``. On equal
        scores the earlier candidate wins.
    """
    bankId = bankPosition.get("id")
    bankRef = _normaliseRef(bankPosition.get("paymentReference") or bankPosition.get("bookingReference"))
    bankAmount = _normaliseAmount(bankPosition.get("bookingAmount"))
    bankIban = _normaliseRef(bankPosition.get("payeeIban"))
    bankDate = _parseIsoDate(bankPosition.get("valuta"))
    bankCompany = _normaliseCompany(bankPosition.get("company"))

    # If no reference exists, require stronger secondary evidence.
    # The threshold only depends on the bank position, so compute it once.
    minScore = 45 if bankRef else 65

    bestScore = 0
    bestCandidate = None

    for candidate in candidatePositions:
        candidateId = candidate.get("id")
        if not candidateId or candidateId in alreadyMatchedIds:
            continue
        # Never match a bank line against itself (it would score perfectly).
        if bankId is not None and candidateId == bankId:
            continue
        if candidate.get("bankDocumentId"):
            continue
        if (candidate.get("documentType") or "").lower().strip() == "bank_document":
            continue

        score = 0
        candidateRef = _normaliseRef(candidate.get("paymentReference") or candidate.get("bookingReference"))
        candidateAmount = _normaliseAmount(candidate.get("bookingAmount"))
        candidateIban = _normaliseRef(candidate.get("payeeIban"))
        candidateDate = _parseIsoDate(candidate.get("valuta"))
        candidateCompany = _normaliseCompany(candidate.get("company"))

        # Strongest signal: structured payment reference / invoice reference match.
        if bankRef and candidateRef and bankRef == candidateRef:
            score += 100

        # Amount must usually match; use tolerance for minor rounding differences.
        # Missing/unparsable amounts normalise to 0.0 and must not count as
        # evidence, so both sides need a real (non-zero) amount.
        if bankAmount > 0 and candidateAmount > 0 and abs(candidateAmount - bankAmount) <= 0.05:
            score += 40

        # IBAN is a strong supporting signal.
        if bankIban and candidateIban and bankIban == candidateIban:
            score += 25

        # Small date difference increases confidence.
        if bankDate and candidateDate:
            dayDiff = abs((bankDate - candidateDate).days)
            if dayDiff <= 3:
                score += 20
            elif dayDiff <= 14:
                score += 10

        # Company/party comparison helps when no structured reference is present.
        if bankCompany and candidateCompany and (
            bankCompany in candidateCompany or candidateCompany in bankCompany
        ):
            score += 15

        if score >= minScore and score > bestScore:
            bestScore = score
            bestCandidate = candidate

    return bestCandidate


def _safeRecordFromModel(modelOrDict: Any) -> Dict[str, Any]:
    """Convert Pydantic/dict object to plain dict for matching."""
    if not modelOrDict:
        return {}
    if isinstance(modelOrDict, dict):
        return modelOrDict
    if hasattr(modelOrDict, "model_dump"):
        return modelOrDict.model_dump()
    try:
        return dict(modelOrDict)
    except Exception:
        return {}


def _recordToPosition(record: Dict[str, Any], documentId: Optional[str], featureInstanceId: str, mandateId: str, documentType: Optional[str] = None) -> Dict[str, Any]:
    """Map one extraction record to a TrusteePosition payload.

    Args:
        record: Extraction record (dict) for a single expense/position.
        documentId: Id of the TrusteeDocument the position belongs to.
        featureInstanceId: Copied into the payload unchanged.
        mandateId: Copied into the payload unchanged.
        documentType: Fallback document type used when the record itself does
            not carry a ``documentType``.

    Returns:
        Dict with the position fields. Amounts are parsed to floats, account
        numbers reduced to their leading digits, tags joined to a string and
        the payee/payment fields stripped (empty -> ``None``).
    """
    recDocType = _cleanStr(record.get("documentType")) or documentType
    if recDocType:
        recDocType = recDocType.lower().strip()

    # ``dict.get(key, default)`` only applies the default when the key is
    # missing. Extraction output may contain explicit nulls / empty strings,
    # so defaults are applied with ``or`` to cover those cases as well.
    bookingCurrency = record.get("bookingCurrency") or "CHF"
    bookingAmount = _parseFloat(record.get("bookingAmount"))

    return {
        "documentId": documentId,
        "documentType": recDocType,
        "valuta": record.get("valuta"),
        "transactionDateTime": record.get("transactionDateTime"),
        "company": record.get("company") or "",
        "desc": record.get("desc") or "",
        "tags": _normaliseTags(record.get("tags")),
        "bookingCurrency": bookingCurrency,
        "bookingAmount": bookingAmount,
        # Original currency/amount fall back to the booking values.
        "originalCurrency": record.get("originalCurrency") or bookingCurrency,
        "originalAmount": _parseFloat(record.get("originalAmount")) or bookingAmount,
        "vatPercentage": _parseFloat(record.get("vatPercentage")),
        "vatAmount": _parseFloat(record.get("vatAmount")),
        "debitAccountNumber": _extractAccountNumber(record.get("debitAccountNumber")),
        "creditAccountNumber": _extractAccountNumber(record.get("creditAccountNumber")),
        "taxCode": record.get("taxCode") or None,
        "costCenter": record.get("costCenter") or None,
        "bookingReference": record.get("bookingReference") or None,
        "payeeIban": _cleanStr(record.get("payeeIban")),
        "payeeName": _cleanStr(record.get("payeeName")),
        "payeeBic": _cleanStr(record.get("payeeBic")),
        "paymentReference": _cleanStr(record.get("paymentReference")),
        "dueDate": _cleanStr(record.get("dueDate")),
        "featureInstanceId": featureInstanceId,
        "mandateId": mandateId,
    }


def _resolveDocumentList(documentListParam, services) -> List[tuple]:
    """Resolve documentList from either Graph-Editor output (list of dicts) or Chat references.

    Returns list of (data_dict, fileId, fileName, mimeType) tuples.
    """
    results = []

    if isinstance(documentListParam, list) and documentListParam:
        first = documentListParam[0]
        if isinstance(first, dict) and ("documentData" in first or "documentName" in first):
            for doc in documentListParam:
                rawData = doc.get("documentData")
                if not rawData:
                    continue
                try:
                    data = json.loads(rawData) if isinstance(rawData, str) else rawData
                except (json.JSONDecodeError, TypeError):
                    continue
                fileId = (doc.get("validationMetadata") or {}).get("fileId") or doc.get("fileId", "")
                fileName = doc.get("documentName") or doc.get("fileName") or "document"
                mimeType = doc.get("mimeType") or doc.get("documentMimeType") or "application/json"
                results.append((data, fileId, fileName, mimeType))
            if results:
                return results

    chatService = getattr(services, "chat", None)
    if not chatService:
        return results

    try:
        docList = DocumentReferenceList.from_string_list(
            documentListParam if isinstance(documentListParam, list) else [documentListParam]
        )
        chatDocuments = chatService.getChatDocumentsFromDocumentList(docList)
        for chatDoc in (chatDocuments or []):
            rawBytes = chatService.getFileData(chatDoc.fileId)
            if not rawBytes:
                continue
            content = rawBytes.decode("utf-8") if isinstance(rawBytes, bytes) else rawBytes
            try:
                data = json.loads(content) if isinstance(content, str) else content
            except (json.JSONDecodeError, TypeError):
                continue
            results.append((data, chatDoc.fileId, chatDoc.fileName or "document", chatDoc.mimeType or "application/json"))
    except Exception as e:
        logger.debug("_resolveDocumentList chat fallback failed: %s", e)

    return results


def _autoMatchBankPositions(
    trusteeInterface: Any,
    bankDocumentId: Any,
    createdPositions: List[Any],
    featureInstanceId: Any,
) -> List[Any]:
    """Link existing positions to a freshly imported bank document.

    For every created bank line the best matching position of the feature
    instance is looked up and its ``bankDocumentId`` is set to
    *bankDocumentId*. Returns the ids of the positions that were updated.
    """
    from modules.features.trustee.datamodelFeatureTrustee import TrusteePosition

    candidatesRaw = trusteeInterface.db.getRecordset(
        TrusteePosition,
        recordFilter={"featureInstanceId": featureInstanceId},
    )

    # The recordset is read after the bank lines were created, so it may
    # contain them. Lines of this bank document must never be match targets.
    ownIds = {pos.id for pos in createdPositions}
    candidatePositions = []
    for raw in candidatesRaw or []:
        candidate = _safeRecordFromModel(raw)
        if candidate.get("id") in ownIds:
            continue
        candidatePositions.append(candidate)

    matchedIds: List[Any] = []
    matchedInThisBankDoc = set()

    for createdPos in createdPositions:
        bankPosition = _safeRecordFromModel(createdPos)
        if not bankPosition:
            continue

        matchCandidate = _findBestBankMatch(
            bankPosition=bankPosition,
            candidatePositions=candidatePositions,
            alreadyMatchedIds=matchedInThisBankDoc,
        )
        if not matchCandidate:
            continue

        matchedId = matchCandidate.get("id")
        if not matchedId:
            continue

        updated = trusteeInterface.updatePosition(
            matchedId,
            {
                "bankDocumentId": bankDocumentId,
            },
        )
        if updated:
            # Each position may be linked by only one line of this document.
            matchedInThisBankDoc.add(matchedId)
            matchedIds.append(matchedId)

    return matchedIds


async def processDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    Resolve documentList to extraction JSON documents and persist them.

    For every resolved document one TrusteeDocument (with documentType) and
    one TrusteePosition per extracted record is created. For documents of
    type ``bank_document`` the new lines are additionally auto-matched
    against existing positions of the feature instance.

    Parameters (keys of *parameters*):
        documentList: Reference to / output of the extractFromFiles result (required).
        featureInstanceId: Optional; falls back to ``self.services.featureInstanceId``.

    Returns:
        Success with one JSON ActionDocument
        ``{positionIds, documentIds, autoMatchedPositionIds}``, or a failure
        result when required input is missing, no document could be resolved,
        or an unexpected error occurs.
    """
    documentListParam = parameters.get("documentList")
    featureInstanceId = parameters.get("featureInstanceId") or (getattr(self.services, "featureInstanceId", None))

    if not documentListParam:
        return ActionResult.isFailure(error="documentList is required (reference to extractFromFiles result)")
    if not featureInstanceId:
        return ActionResult.isFailure(error="featureInstanceId is required")

    try:
        extractionDocs = _resolveDocumentList(documentListParam, self.services)
        if not extractionDocs:
            return ActionResult.isFailure(error="No documents found for documentList")

        from modules.features.trustee.interfaceFeatureTrustee import getInterface as getTrusteeInterface

        trusteeInterface = getTrusteeInterface(
            self.services.user,
            mandateId=self.services.mandateId,
            featureInstanceId=featureInstanceId
        )

        allPositionIds = []
        allDocumentIds = []
        autoMatchedPositionIds = []

        for data, fileId, fileName, mimeType in extractionDocs:
            # A malformed extraction result must not abort the whole batch.
            if not isinstance(data, dict):
                logger.warning("Skipping extraction result for %s: not a JSON object", fileName)
                continue

            documentType = data.get("documentType")
            extractedData = data.get("extractedData")
            fileId = data.get("fileId") or fileId
            fileName = data.get("fileName") or fileName or "document"

            # extractedData may be a list of records or a single record.
            if isinstance(extractedData, list):
                rawRecords = extractedData
            elif extractedData:
                rawRecords = [extractedData]
            else:
                rawRecords = []
            # Only dict records can be mapped to positions.
            records = [record for record in rawRecords if isinstance(record, dict)]
            if len(records) != len(rawRecords):
                logger.warning(
                    "Ignoring %d non-object record(s) in %s",
                    len(rawRecords) - len(records),
                    fileName,
                )
            if not records:
                continue

            docPayload = {
                "fileId": fileId,
                "documentName": fileName,
                "documentMimeType": mimeType or "application/octet-stream",
                "sourceType": "workflow",
                "documentType": documentType,
            }
            trusteeDoc = trusteeInterface.createDocument(docPayload)
            if not trusteeDoc:
                logger.warning("Failed to create TrusteeDocument for %s", fileName)
                continue
            allDocumentIds.append(trusteeDoc.id)

            # Normalised the same way as the per-record document type.
            docTypeLower = str(documentType or "unknown").lower().strip()
            createdPositions = []
            for record in records:
                posPayload = _recordToPosition(record, trusteeDoc.id, featureInstanceId, self.services.mandateId, documentType=docTypeLower)
                pos = trusteeInterface.createPosition(posPayload)
                if pos:
                    allPositionIds.append(pos.id)
                    createdPositions.append(pos)

            # Auto-link bank statement lines to existing invoice/expense positions.
            if docTypeLower == "bank_document" and createdPositions:
                try:
                    autoMatchedPositionIds.extend(
                        _autoMatchBankPositions(
                            trusteeInterface,
                            trusteeDoc.id,
                            createdPositions,
                            featureInstanceId,
                        )
                    )
                except Exception:
                    # Matching is best-effort; the import itself already succeeded.
                    logger.exception("Automatic bank-document matching failed for documentId=%s", trusteeDoc.id)

        payload = {
            "positionIds": allPositionIds,
            "documentIds": allDocumentIds,
            "autoMatchedPositionIds": autoMatchedPositionIds,
        }
        return ActionResult.isSuccess(
            documents=[
                ActionDocument(
                    documentName="process_documents_result.json",
                    documentData=json.dumps(payload),
                    mimeType="application/json",
                )
            ]
        )
    except Exception as e:
        logger.exception("processDocuments failed")
        return ActionResult.isFailure(error=str(e))