gateway/modules/workflows/methods/methodTrustee/actions/processDocuments.py
2026-02-22 22:34:07 +01:00

152 lines
6.4 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Process extracted documents: create TrusteeDocument + TrusteePosition from extraction JSON.
Input: documentList (reference to extractFromFiles result).
Each document is JSON with documentType, extractedData, fileId, fileName.
extractedData is a list of expense/position records.
Output: one ActionDocument with JSON { positionIds, documentIds } for chaining to syncToAccounting.
"""
import json
import logging
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelDocref import DocumentReferenceList
logger = logging.getLogger(__name__)
def _parseFloat(value) -> float:
try:
if value is None or value == "":
return 0.0
return float(value)
except (ValueError, TypeError):
return 0.0
def _extractAccountNumber(value) -> Optional[str]:
"""Extract the leading numeric account number from AI output like '6200 Fahrzeugaufwand' -> '6200'."""
if not value or not isinstance(value, str):
return None
import re
match = re.match(r"(\d+)", value.strip())
return match.group(1) if match else value.strip() or None
def _normaliseTags(value) -> str:
"""Convert tags from various formats to a clean comma-separated string."""
if not value:
return ""
if isinstance(value, list):
return ", ".join(str(t) for t in value if t)
return str(value)
def _recordToPosition(record: Dict[str, Any], documentId: Optional[str], featureInstanceId: str, mandateId: str) -> Dict[str, Any]:
"""Map extraction record to TrusteePosition payload."""
return {
"documentId": documentId,
"valuta": record.get("valuta"),
"transactionDateTime": record.get("transactionDateTime"),
"company": record.get("company", ""),
"desc": record.get("desc", ""),
"tags": _normaliseTags(record.get("tags")),
"bookingCurrency": record.get("bookingCurrency", "CHF"),
"bookingAmount": _parseFloat(record.get("bookingAmount", 0)),
"originalCurrency": record.get("originalCurrency") or record.get("bookingCurrency", "CHF"),
"originalAmount": _parseFloat(record.get("originalAmount", 0)) or _parseFloat(record.get("bookingAmount", 0)),
"vatPercentage": _parseFloat(record.get("vatPercentage", 0)),
"vatAmount": _parseFloat(record.get("vatAmount", 0)),
"debitAccountNumber": _extractAccountNumber(record.get("debitAccountNumber")),
"creditAccountNumber": _extractAccountNumber(record.get("creditAccountNumber")),
"taxCode": record.get("taxCode") or None,
"costCenter": record.get("costCenter") or None,
"bookingReference": record.get("bookingReference") or None,
"featureInstanceId": featureInstanceId,
"mandateId": mandateId,
}
async def processDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
"""
Resolve documentList to ChatDocuments, load extraction JSON per document,
create TrusteeDocument (with documentType) + TrusteePosition(s), return one JSON document with positionIds/documentIds.
"""
documentListParam = parameters.get("documentList")
featureInstanceId = parameters.get("featureInstanceId") or (getattr(self.services, "featureInstanceId", None))
if not documentListParam:
return ActionResult.isFailure(error="documentList is required (reference to extractFromFiles result)")
if not featureInstanceId:
return ActionResult.isFailure(error="featureInstanceId is required")
try:
docList = DocumentReferenceList.from_string_list(
documentListParam if isinstance(documentListParam, list) else [documentListParam]
)
chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(docList)
if not chatDocuments:
return ActionResult.isFailure(error="No documents found for documentList")
from modules.features.trustee.interfaceFeatureTrustee import getInterface as getTrusteeInterface
trusteeInterface = getTrusteeInterface(
self.services.user,
mandateId=self.services.mandateId,
featureInstanceId=featureInstanceId
)
allPositionIds = []
allDocumentIds = []
for chatDoc in chatDocuments:
rawBytes = self.services.chat.getFileData(chatDoc.fileId)
if not rawBytes:
logger.warning(f"Could not load file {chatDoc.fileId}, skipping")
continue
content = rawBytes.decode("utf-8") if isinstance(rawBytes, bytes) else rawBytes
data = json.loads(content) if isinstance(content, str) else content
documentType = data.get("documentType")
extractedData = data.get("extractedData")
fileId = data.get("fileId") or chatDoc.fileId
fileName = data.get("fileName") or chatDoc.fileName or "document"
records = extractedData if isinstance(extractedData, list) else [extractedData] if extractedData else []
if not records:
continue
docPayload = {
"fileId": fileId,
"documentName": fileName,
"documentMimeType": chatDoc.mimeType or "application/octet-stream",
"sourceType": "workflow",
"documentType": documentType,
}
trusteeDoc = trusteeInterface.createDocument(docPayload)
if not trusteeDoc:
logger.warning(f"Failed to create TrusteeDocument for {fileName}")
continue
allDocumentIds.append(trusteeDoc.id)
for record in records:
posPayload = _recordToPosition(record, trusteeDoc.id, featureInstanceId, self.services.mandateId)
pos = trusteeInterface.createPosition(posPayload)
if pos:
allPositionIds.append(pos.id)
payload = {"positionIds": allPositionIds, "documentIds": allDocumentIds}
return ActionResult.isSuccess(
documents=[
ActionDocument(
documentName="process_documents_result.json",
documentData=json.dumps(payload),
mimeType="application/json",
)
]
)
except Exception as e:
logger.exception("processDocuments failed")
return ActionResult.isFailure(error=str(e))