From f5143611b0e05afb5d18ddf081d7eece5a680e36 Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Tue, 24 Feb 2026 22:49:08 +0100
Subject: [PATCH] Trustee extraction: split AI call into OCR, classify, structure steps; coerce ChatWorkflow numeric fields
---
modules/interfaces/interfaceDbChat.py | 27 ++++--
.../methodTrustee/actions/extractFromFiles.py | 92 +++++++++----------
2 files changed, 65 insertions(+), 54 deletions(-)
diff --git a/modules/interfaces/interfaceDbChat.py b/modules/interfaces/interfaceDbChat.py
index 56743b50..cd688ca7 100644
--- a/modules/interfaces/interfaceDbChat.py
+++ b/modules/interfaces/interfaceDbChat.py
@@ -669,17 +669,30 @@ class ChatObjects:
stats = self.getStats(workflowId)
# Validate workflow data against ChatWorkflow model
+ # Explicit type coercion: DB may store numeric fields as TEXT on some platforms
+ def _toInt(v, default=0):
+ try:
+ return int(v) if v is not None else default
+ except (ValueError, TypeError):
+ return default
+
+ def _toFloat(v, default=None):
+ try:
+ return float(v) if v is not None else (default if default is not None else getUtcTimestamp())
+ except (ValueError, TypeError):
+ return default if default is not None else getUtcTimestamp()
+
return ChatWorkflow(
id=workflow["id"],
status=workflow.get("status", "running"),
name=workflow.get("name"),
- currentRound=workflow.get("currentRound", 0) or 0,
- currentTask=workflow.get("currentTask", 0) or 0,
- currentAction=workflow.get("currentAction", 0) or 0,
- totalTasks=workflow.get("totalTasks", 0) or 0,
- totalActions=workflow.get("totalActions", 0) or 0,
- lastActivity=workflow.get("lastActivity", getUtcTimestamp()),
- startedAt=workflow.get("startedAt", getUtcTimestamp()),
+ currentRound=_toInt(workflow.get("currentRound")),
+ currentTask=_toInt(workflow.get("currentTask")),
+ currentAction=_toInt(workflow.get("currentAction")),
+ totalTasks=_toInt(workflow.get("totalTasks")),
+ totalActions=_toInt(workflow.get("totalActions")),
+ lastActivity=_toFloat(workflow.get("lastActivity")),
+ startedAt=_toFloat(workflow.get("startedAt")),
logs=logs,
messages=messages,
stats=stats
diff --git a/modules/workflows/methods/methodTrustee/actions/extractFromFiles.py b/modules/workflows/methods/methodTrustee/actions/extractFromFiles.py
index fe3473f9..88f2c544 100644
--- a/modules/workflows/methods/methodTrustee/actions/extractFromFiles.py
+++ b/modules/workflows/methods/methodTrustee/actions/extractFromFiles.py
@@ -24,11 +24,13 @@ logger = logging.getLogger(__name__)
ALLOWED_EXTENSIONS = (".pdf", ".jpg", ".jpeg")
MAX_FILES = 50
-# Phase 1: Extract all text + classify document type (one step)
-_CLASSIFICATION_PROMPT = (
- "Extract ALL text from this document verbatim. Then identify the document type.\n"
- 'Return JSON: {"documentType": "EXPENSE_RECEIPT"|"BANK_DOCUMENT"|"INVOICE"|"CONTRACT"|"UNKNOWN", '
- '"rawText": ""}\n'
+# Phase 1a: Pure OCR / text extraction (no JSON, plain text only)
+_OCR_PROMPT = "Extract ALL readable text from this document. Return ONLY the plain text, nothing else."
+
+# Phase 1b: Classification (text-only, lightweight)
+_CLASSIFY_PROMPT = (
+ "Classify this document text into one of these types. "
+ "Return ONLY the type name, nothing else.\n"
"EXPENSE_RECEIPT: Quittungen, Tankbelege, Kassenzettel\n"
"BANK_DOCUMENT: Bankauszuege, Kontoauszuege mit Transaktionslisten\n"
"INVOICE: Rechnungen mit Rechnungsnummer und Faelligkeitsdatum\n"
@@ -67,20 +69,14 @@ _PROMPT_FALLBACK = (
)
-def _parseClassificationResult(raw: str) -> Tuple[str, str]:
- """Parse phase 1 AI response: {documentType, rawText}. Returns (documentType, rawText)."""
- from modules.shared.jsonUtils import stripCodeFences, extractFirstBalancedJson
-
- documentType = "UNKNOWN"
- rawText = ""
- cleaned = extractFirstBalancedJson(stripCodeFences((raw or "").strip()))
- try:
- data = json.loads(cleaned)
- documentType = (data.get("documentType") or "UNKNOWN").strip().upper().replace(" ", "_")
- rawText = (data.get("rawText") or data.get("raw_text") or "").strip()
- except Exception as e:
- logger.debug("Parse classification result: %s", e)
- return (documentType, rawText)
+def _parseDocumentType(raw: str) -> str:
+    """Parse classification response (plain type name). Returns normalised document type."""
+    # Tuple, not a set: set order varies per process (string hash randomisation), making ties random.
+    knownTypes = ("EXPENSE_RECEIPT", "BANK_DOCUMENT", "INVOICE", "CONTRACT")
+    cleaned = (raw or "").strip().upper().replace(" ", "_").replace('"', "").replace("'", "")
+    matched = [t for t in knownTypes if t in cleaned]
+    # Earliest mention wins; "UNKNOWN" stays the fallback so it never shadows a concrete type.
+    return min(matched, key=cleaned.index) if matched else "UNKNOWN"
def _buildStructuringPrompt(documentType: str, expenseList: str, bankList: str) -> str:
@@ -170,7 +166,7 @@ async def _extractWithAi(
bankList: str,
featureInstanceId: str,
) -> Dict[str, Any]:
- """Run 2-phase AI extraction: (1) classify + full text, (2) structure by type. Returns { documentType, extractedData, fileId, fileName }."""
+ """3-step extraction: (1a) OCR/text via Vision AI, (1b) classify text, (2) structure by type. Returns { documentType, extractedData, fileId, fileName }."""
await self.services.ai.ensureAiObjectsInitialized()
from modules.datamodels.datamodelDocref import DocumentReferenceList, DocumentItemReference
@@ -178,42 +174,44 @@ async def _extractWithAi(
references=[DocumentItemReference(documentId=chatDocumentId, fileName=fileName)]
)
+ # --- Step 1a: Pure text extraction (Vision AI for images, text extraction for text PDFs) ---
try:
- self.services.utils.writeDebugFile(_CLASSIFICATION_PROMPT, "trustee_classification_prompt")
+ self.services.utils.writeDebugFile(_OCR_PROMPT, "trustee_ocr_prompt")
except Exception:
pass
- options = AiCallOptions(resultFormat="json", operationType=OperationTypeEnum.DATA_EXTRACT)
- try:
- phase1Response = await self.services.ai.callAiContent(
- prompt=_CLASSIFICATION_PROMPT,
- options=options,
- documentList=docList,
- contentParts=None,
- outputFormat="json",
- generationIntent="extract",
- )
- except Exception:
- options = AiCallOptions(resultFormat="csv", operationType=OperationTypeEnum.DATA_EXTRACT)
- phase1Response = await self.services.ai.callAiContent(
- prompt=_CLASSIFICATION_PROMPT,
- options=options,
- documentList=docList,
- contentParts=None,
- outputFormat="csv",
- generationIntent="extract",
- )
+ ocrOptions = AiCallOptions(resultFormat="text", operationType=OperationTypeEnum.DATA_EXTRACT)
+ ocrResponse = await self.services.ai.callAiContent(
+ prompt=_OCR_PROMPT,
+ options=ocrOptions,
+ documentList=docList,
+ contentParts=None,
+ outputFormat="txt",
+ generationIntent="extract",
+ )
- if not phase1Response or not phase1Response.documents:
+ if not ocrResponse or not ocrResponse.documents:
return {"documentType": "UNKNOWN", "extractedData": [], "fileId": fileId, "fileName": fileName}
- raw1 = phase1Response.documents[0].documentData
- if isinstance(raw1, bytes):
- raw1 = raw1.decode("utf-8")
- documentType, rawText = _parseClassificationResult(raw1 or "")
+ rawText = ocrResponse.documents[0].documentData
+ if isinstance(rawText, bytes):
+ rawText = rawText.decode("utf-8")
+ rawText = (rawText or "").strip()
+
+ try:
+ self.services.utils.writeDebugFile(rawText[:5000] if rawText else "(empty)", "trustee_ocr_result")
+ except Exception:
+ pass
if not rawText:
- return {"documentType": documentType or "UNKNOWN", "extractedData": [], "fileId": fileId, "fileName": fileName}
+ return {"documentType": "UNKNOWN", "extractedData": [], "fileId": fileId, "fileName": fileName}
+
+ # --- Step 1b: Classify the extracted text (lightweight text-only call, no Vision AI) ---
+ classifyPrompt = f"{_CLASSIFY_PROMPT}\n\nTEXT:\n{rawText[:3000]}"
+ classifyRequest = AiCallRequest(prompt=classifyPrompt, context="", options=AiCallOptions(resultFormat="text"))
+ classifyResponse = await self.services.ai.callAi(classifyRequest)
+ documentType = _parseDocumentType(classifyResponse.content if hasattr(classifyResponse, "content") else "")
+ logger.info("Document classified: type=%s, rawText_length=%d, file=%s", documentType, len(rawText), fileName)
structuringPrompt = _buildStructuringPrompt(documentType, expenseList, bankList)
try: