Fixes sync and AI tree

This commit is contained in:
patrick-motsch 2026-02-22 22:34:07 +01:00
parent 6b11d66766
commit f35a90e428
7 changed files with 351 additions and 160 deletions

View file

@ -5,6 +5,7 @@
Encapsulates: config loading -> connector resolution -> duplicate check -> push -> sync record.
"""
import json
import logging
import time
from typing import List, Dict, Any, Optional
@ -109,19 +110,26 @@ class AccountingBridge:
lines=lines,
)
async def pushPositionToAccounting(self, featureInstanceId: str, positionId: str) -> SyncResult:
async def pushPositionToAccounting(
self,
featureInstanceId: str,
positionId: str,
_resolvedConnector=None,
_resolvedPlainConfig=None,
_resolvedConfigRecord=None,
) -> SyncResult:
"""Push a single position to the configured accounting system.
1. Load config and connector
2. Load position data
3. Check for existing successful sync (duplicate guard)
4. Build AccountingBooking
5. Push via connector
6. Create TrusteeAccountingSync record
Optional _resolved* params allow pushBatchToAccounting to pass a pre-resolved
connector/config so we don't decrypt per position (avoids rate-limit).
"""
from modules.features.trustee.datamodelFeatureTrustee import TrusteePosition, TrusteeAccountingSync
connector, plainConfig, configRecord = await self._resolveConnectorAndConfig(featureInstanceId)
connector = _resolvedConnector
plainConfig = _resolvedPlainConfig
configRecord = _resolvedConfigRecord
if not connector or not plainConfig:
connector, plainConfig, configRecord = await self._resolveConnectorAndConfig(featureInstanceId)
if not connector or not plainConfig:
return SyncResult(success=False, errorMessage="No active accounting configuration found")
@ -269,19 +277,62 @@ class AccountingBridge:
return result
async def pushBatchToAccounting(self, featureInstanceId: str, positionIds: List[str]) -> List[SyncResult]:
    """Push multiple positions sequentially.

    Resolves the connector/config ONCE and hands the resolved objects to
    pushPositionToAccounting via the _resolved* parameters, so the encrypted
    config is not decrypted per position (avoids the decrypt rate-limit).

    Returns one SyncResult per requested position id, in order.
    """
    connector, plainConfig, configRecord = await self._resolveConnectorAndConfig(featureInstanceId)
    if not connector or not plainConfig:
        # No active configuration: fail every requested position uniformly.
        return [SyncResult(success=False, errorMessage="No active accounting configuration found") for _ in positionIds]
    results = []
    for positionId in positionIds:
        # Pass the pre-resolved connector/config so each push skips re-resolution.
        result = await self.pushPositionToAccounting(
            featureInstanceId, positionId,
            _resolvedConnector=connector, _resolvedPlainConfig=plainConfig, _resolvedConfigRecord=configRecord,
        )
        results.append(result)
    return results
async def getChartOfAccounts(self, featureInstanceId: str, accountType: Optional[str] = None) -> List[AccountingChart]:
"""Load the chart of accounts from the configured external system. Optional filter by accountType."""
connector, plainConfig, _ = await self._resolveConnectorAndConfig(featureInstanceId)
if not connector or not plainConfig:
async def refreshChartOfAccounts(self, featureInstanceId: str) -> List[AccountingChart]:
    """Fetch the full chart of accounts from the external system and cache it locally.

    The chart is serialised to JSON and stored on the TrusteeAccountingConfig
    record (cachedChartOfAccounts / chartCachedAt) so subsequent lookups can
    avoid a live connector call.

    Returns the fetched charts, or [] when no connector/config is available.
    """
    from modules.features.trustee.datamodelFeatureTrustee import TrusteeAccountingConfig
    connector, plainConfig, configRecord = await self._resolveConnectorAndConfig(featureInstanceId)
    if not connector or not plainConfig or not configRecord:
        logger.warning("refreshChartOfAccounts: no connector/config — nothing to cache")
        return []
    # Fetch the FULL chart (no accountType filter) — the cache must hold everything
    # so getChartOfAccounts can filter client-side later.
    charts = await connector.getChartOfAccounts(plainConfig)
    serialised = json.dumps(
        [{"accountNumber": c.accountNumber, "label": c.label, "accountType": c.accountType or ""} for c in charts],
        ensure_ascii=False,
    )
    self._trusteeInterface.db.recordModify(TrusteeAccountingConfig, configRecord["id"], {
        "cachedChartOfAccounts": serialised,
        "chartCachedAt": time.time(),
    })
    logger.info(f"Cached {len(charts)} chart-of-accounts entries for instance {featureInstanceId}")
    return charts
def _readCachedCharts(self, configRecord: Dict[str, Any]) -> List[AccountingChart]:
    """Deserialise the cached chart-of-accounts JSON stored on a config record.

    Returns an empty list when no cache exists or when deserialisation fails
    for any reason (best-effort read).
    """
    raw = configRecord.get("cachedChartOfAccounts")
    if not raw:
        return []
    try:
        entries = json.loads(raw) if isinstance(raw, str) else raw
        charts = []
        for entry in entries:
            charts.append(
                AccountingChart(
                    accountNumber=entry["accountNumber"],
                    label=entry["label"],
                    accountType=entry.get("accountType", ""),
                )
            )
        return charts
    except Exception as exc:
        logger.debug("Could not deserialise cached chart: %s", exc)
        return []
async def getChartOfAccounts(self, featureInstanceId: str, accountType: Optional[str] = None) -> List[AccountingChart]:
    """Return the chart of accounts, preferring the local cache.

    Falls back to a live fetch (which also refreshes the cache) when no cached
    chart is available. Optionally filters by accountType.
    """
    configRecord = await self.getActiveConfig(featureInstanceId)
    if not configRecord:
        return []
    charts = self._readCachedCharts(configRecord)
    if not charts:
        logger.info(f"No cached chart — fetching live for instance {featureInstanceId}")
        charts = await self.refreshChartOfAccounts(featureInstanceId)
    else:
        logger.debug(f"Using cached chart of accounts ({len(charts)} entries) for instance {featureInstanceId}")
    if accountType:
        return [c for c in charts if c.accountType == accountType]
    return charts

View file

@ -87,26 +87,47 @@ class AccountingConnectorRma(BaseAccountingConnector):
except Exception as e:
return SyncResult(success=False, errorMessage=str(e))
def _rmaLinkToAccountType(self, link: str) -> str:
"""Map RMA chart 'link' (e.g. AP_amount, AR_amount, AR_paid:AP_paid) to our accountType."""
if not link:
return ""
linkUpper = link.upper()
if "AP_AMOUNT" in linkUpper:
return "expense"
if "AR_AMOUNT" in linkUpper:
return "revenue"
if "AR_PAID" in linkUpper or "AP_PAID" in linkUpper:
return "asset"
if "AR_TAX" in linkUpper or "AP_TAX" in linkUpper:
return "liability"
if linkUpper in ("AR", "AP"):
return "asset"
return link
async def getChartOfAccounts(self, config: Dict[str, Any], accountType: Optional[str] = None) -> List[AccountingChart]:
"""RMA API 'type' filter expects RMA values (AP_amount, AR_amount, etc.), not 'expense'. Fetch full chart and filter client-side."""
try:
params = {}
if accountType:
params["type"] = accountType
async with aiohttp.ClientSession() as session:
url = self._buildUrl(config, "charts")
async with session.get(url, headers=self._buildHeaders(config), params=params, timeout=aiohttp.ClientTimeout(total=30)) as resp:
async with session.get(url, headers=self._buildHeaders(config), timeout=aiohttp.ClientTimeout(total=30)) as resp:
if resp.status != 200:
logger.error(f"RMA charts failed: HTTP {resp.status}")
body = await resp.text()
logger.error(f"RMA charts failed: HTTP {resp.status} - {body[:200]}")
return []
data = await resp.json()
charts = []
items = data if isinstance(data, list) else data.get("chart", data.get("row", []))
if not isinstance(items, list):
items = []
for item in items:
if isinstance(item, dict):
accNo = str(item.get("accno", item.get("account_number", "")))
label = str(item.get("description", item.get("label", "")))
chartType = item.get("charttype") or item.get("category") or item.get("link") or ""
rmaLink = item.get("link") or ""
chartType = item.get("charttype") or item.get("category") or ""
if not chartType and rmaLink:
chartType = self._rmaLinkToAccountType(rmaLink)
if not chartType and accNo:
firstDigit = accNo[0] if accNo else ""
chartType = {

View file

@ -684,6 +684,8 @@ class TrusteeAccountingConfig(BaseModel):
lastSyncAt: Optional[float] = Field(default=None, description="Timestamp of last sync attempt")
lastSyncStatus: Optional[str] = Field(default=None, description="Last sync result: success, error, partial")
lastSyncErrorMessage: Optional[str] = Field(default=None, description="Error message when lastSyncStatus is error")
cachedChartOfAccounts: Optional[str] = Field(default=None, description="JSON-serialised chart of accounts cache (list of {accountNumber, label, accountType})")
chartCachedAt: Optional[float] = Field(default=None, description="Timestamp when cachedChartOfAccounts was last refreshed")
mandateId: Optional[str] = Field(default=None)
@ -699,6 +701,8 @@ registerModelLabels(
"lastSyncAt": {"en": "Last Sync", "fr": "Dernière sync.", "de": "Letzte Synchronisation"},
"lastSyncStatus": {"en": "Status", "fr": "Statut", "de": "Status"},
"lastSyncErrorMessage": {"en": "Error", "fr": "Erreur", "de": "Fehlermeldung"},
"cachedChartOfAccounts": {"en": "Cached Chart", "de": "Cached Kontoplan", "fr": "Plan comptable en cache"},
"chartCachedAt": {"en": "Chart Cached At", "de": "Kontoplan-Cache-Zeitpunkt", "fr": "Horodatage cache plan comptable"},
"mandateId": {"en": "Mandate", "fr": "Mandat", "de": "Mandat"},
},
)

View file

@ -1236,7 +1236,7 @@ class SaveAccountingConfigBody(BaseModel):
@router.post("/{instanceId}/accounting/config", status_code=201)
@limiter.limit("5/minute")
def save_accounting_config(
async def save_accounting_config(
request: Request,
instanceId: str = Path(..., description="Feature Instance ID"),
body: SaveAccountingConfigBody = Body(...),
@ -1288,6 +1288,7 @@ def save_accounting_config(
merged[k] = v
updatePayload["encryptedConfig"] = encryptValue(json.dumps(merged), keyName="accountingConfig")
interface.db.recordModify(TrusteeAccountingConfig, configId, updatePayload)
await _refreshChartSilently(interface, instanceId)
return {"message": "Accounting config updated", "id": configId}
if not plainConfig:
@ -1307,6 +1308,7 @@ def save_accounting_config(
"mandateId": mandateId,
}
interface.db.recordCreate(TrusteeAccountingConfig, configRecord)
await _refreshChartSilently(interface, instanceId)
return {"message": "Accounting config created", "id": configRecord["id"]}
@ -1317,12 +1319,14 @@ async def test_accounting_connection(
instanceId: str = Path(..., description="Feature Instance ID"),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Test the connection to the configured accounting system."""
"""Test the connection to the configured accounting system. On success, refreshes the local chart-of-accounts cache."""
mandateId = _validateInstanceAccess(instanceId, context)
interface = getInterface(context.user, mandateId=mandateId, featureInstanceId=instanceId)
from .accounting.accountingBridge import AccountingBridge
bridge = AccountingBridge(interface)
result = await bridge.testConnection(instanceId)
if result.success:
await _refreshChartSilently(interface, instanceId)
return result.model_dump()
@ -1360,6 +1364,33 @@ async def get_chart_of_accounts(
return [c.model_dump() for c in charts]
async def _refreshChartSilently(interface, instanceId: str) -> None:
    """Refresh the chart-of-accounts cache on a best-effort basis.

    Any failure — including the import itself — is logged as a warning and
    swallowed, so callers can invoke this after config changes without guards.
    """
    try:
        from .accounting.accountingBridge import AccountingBridge
        refreshed = await AccountingBridge(interface).refreshChartOfAccounts(instanceId)
        logger.info(f"Chart cache refreshed: {len(refreshed)} entries for instance {instanceId}")
    except Exception as exc:
        logger.warning(f"Chart cache refresh failed (non-critical): {exc}")
@router.post("/{instanceId}/accounting/refresh-chart")
@limiter.limit("5/minute")
async def refresh_chart_of_accounts(
    request: Request,
    instanceId: str = Path(..., description="Feature Instance ID"),
    context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
    """Explicitly refresh the locally cached chart of accounts from the external system."""
    # Access check resolves the mandate the instance belongs to.
    mandateId = _validateInstanceAccess(instanceId, context)
    interface = getInterface(context.user, mandateId=mandateId, featureInstanceId=instanceId)
    from .accounting.accountingBridge import AccountingBridge
    charts = await AccountingBridge(interface).refreshChartOfAccounts(instanceId)
    return {"message": f"Chart of accounts refreshed: {len(charts)} entries", "count": len(charts)}
@router.post("/{instanceId}/accounting/sync")
@limiter.limit("5/minute")
async def sync_positions_to_accounting(

View file

@ -6,7 +6,7 @@ import re
import time
import base64
from typing import Dict, Any, List, Optional, Tuple
from modules.datamodels.datamodelChat import PromptPlaceholder, ChatDocument
from modules.datamodels.datamodelChat import PromptPlaceholder, ChatDocument, WorkflowModeEnum
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum, ProcessingModeEnum
from modules.datamodels.datamodelExtraction import ContentPart, DocumentIntent
@ -1283,8 +1283,12 @@ Respond with ONLY a JSON object in this exact format:
parentOperationId: Optional[str]
) -> AiResponse:
"""
Handle DATA_EXTRACT: Extract content from documents (no AI), then process with AI.
This is the original flow: extract all documents first, then process contentParts with AI.
Handle DATA_EXTRACT: Extract content from documents, then process with AI.
- AUTOMATION mode: No intent analysis. The passed prompt is used as extractionPrompt
for every document and for the final AI call (exact prompt preserved).
- DYNAMIC mode: Intent analysis (clarifyDocumentIntents) runs first; extraction and
processing use the intents and AI-derived extractionPrompt.
"""
import time
@ -1332,14 +1336,28 @@ Respond with ONLY a JSON object in this exact format:
documents = filteredDocuments # Use filtered list
# Step 2: Clarify document intents (if not provided) - REQUIRED for all documents
# Step 2: Document intents AUTOMATION uses exact prompt; DYNAMIC uses intent analysis
if not documentIntents and documents:
documentIntents = await self.clarifyDocumentIntents(
documents,
prompt,
{"outputFormat": outputFormat},
extractOperationId
)
workflowMode = getattr(self.services.workflow, "workflowMode", None) if self.services.workflow else None
if workflowMode == WorkflowModeEnum.WORKFLOW_AUTOMATION:
# Automation: no intent AI call — use the given prompt as extractionPrompt for every document
documentIntents = [
DocumentIntent(
documentId=doc.id,
intents=["extract"],
extractionPrompt=prompt,
reasoning="Automation mode: use exact prompt from action",
)
for doc in documents
]
logger.debug("DATA_EXTRACT in AUTOMATION mode: using exact prompt for all documents (no intent analysis)")
else:
documentIntents = await self.clarifyDocumentIntents(
documents,
prompt,
{"outputFormat": outputFormat},
extractOperationId
)
# Step 3: Extract and prepare content (NO AI - pure extraction) - REQUIRED for all documents
if documents:
@ -1359,51 +1377,33 @@ Respond with ONLY a JSON object in this exact format:
contentParts = preparedContentParts
# Step 4: Process extracted contentParts with AI (simple text processing, no structure generation)
# Step 4: Process contentParts with AI via ExtractionService
# Always use processContentPartsWithAi — it handles text vs image parts correctly:
# - Text parts → text models (with chunking if needed)
# - Image parts → Vision AI (proper image_url content blocks)
# No manual contentText concatenation or token estimation needed.
if not contentParts:
raise ValueError("No content extracted from documents")
# Use simple AI call to process extracted content
# Prepare content for AI processing
contentText = "\n\n".join([
f"[Document: {part.metadata.get('documentName', 'Unknown')}]\n{part.data}"
for part in contentParts
if part.data
])
# Filter out empty content parts (e.g. PDF container with 0 bytes) that would
# produce garbage AI responses and pollute the merged result.
nonEmptyParts = [p for p in contentParts if p.data and len(p.data.strip()) > 0]
if not nonEmptyParts:
raise ValueError("No non-empty content parts to process")
# Check content size and use chunking if needed
# Conservative estimate: 2 bytes per token, 80% of model limit for safety
contentSizeBytes = len(contentText.encode('utf-8'))
promptSizeBytes = len(prompt.encode('utf-8'))
totalSizeBytes = contentSizeBytes + promptSizeBytes
estimatedTokens = totalSizeBytes / 2 # Conservative: 2 bytes per token
# Get max model context (use Claude's 200k as reference, 80% = 160k tokens)
maxSafeTokens = 160000
if estimatedTokens > maxSafeTokens:
# Content too large - use chunking via ExtractionService
logger.warning(f"Content too large for single AI call: ~{estimatedTokens:.0f} tokens (limit: {maxSafeTokens}). Using chunked processing.")
# Use ExtractionService for chunked processing
extractionService = self.services.extraction
aiResponse = await extractionService.processContentPartsWithPrompt(
contentParts=contentParts,
prompt=prompt,
aiObjects=self.aiObjects,
options=options,
operationId=extractOperationId,
parentOperationId=parentOperationId
)
else:
# Content fits - use single AI call
aiRequest = AiCallRequest(
prompt=f"{prompt}\n\nExtracted Content:\n{contentText}",
context="",
options=options
)
aiResponse = await self.callAi(aiRequest)
self.services.utils.writeDebugFile(prompt, "data_extract_prompt")
extractionService = self.services.extraction
aiRequest = AiCallRequest(
prompt=prompt,
context="",
options=options,
contentParts=nonEmptyParts,
)
aiResponse = await extractionService.processContentPartsWithAi(
aiRequest, self.aiObjects
)
_respText = aiResponse.content if isinstance(aiResponse.content, str) else (aiResponse.content.decode("utf-8", errors="replace") if aiResponse.content else "")
self.services.utils.writeDebugFile(_respText, "data_extract_response")
# Create response document
resultDocument = DocumentData(

View file

@ -13,30 +13,93 @@ import uuid
import csv
import io
from datetime import datetime, timezone
from typing import Dict, Any, List, Optional
from typing import Dict, Any, List, Optional, Tuple
from modules.datamodels.datamodelChat import ActionResult, ActionDocument, ChatDocument
from modules.datamodels.datamodelDocref import DocumentReferenceList, DocumentItemReference
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
from modules.datamodels.datamodelAi import AiCallOptions, AiCallRequest, OperationTypeEnum
logger = logging.getLogger(__name__)
ALLOWED_EXTENSIONS = (".pdf", ".jpg", ".jpeg")
MAX_FILES = 50
_DEFAULT_PROMPT_FALLBACK = (
'Extract document type (one of: INVOICE, EXPENSE_RECEIPT, BANK_DOCUMENT, CONTRACT, UNKNOWN) '
'and expense/position records. Return JSON: {"documentType": "...", "records": [{...}]}. '
'Each record must have: valuta (YYYY-MM-DD), transactionDateTime (unix seconds), company, desc (full extracted text), '
'tags (from: customer, meeting, license, subscription, fuel, food, material), '
'bookingCurrency, bookingAmount, originalCurrency, originalAmount, vatPercentage, vatAmount, '
'debitAccountNumber (Soll-Konto nach Schweizer KMU-Kontenrahmen, z.B. 6200 Fahrzeugaufwand, 6000 Materialaufwand), '
'creditAccountNumber (Haben-Konto, z.B. 1020 Bank), taxCode, costCenter, bookingReference.'
# Phase 1: Extract all text + classify document type (one step)
# Phase-1 prompt: one AI call returns both the verbatim text and the type label.
_CLASSIFICATION_PROMPT = (
"Extract ALL text from this document verbatim. Then identify the document type.\n"
'Return JSON: {"documentType": "EXPENSE_RECEIPT"|"BANK_DOCUMENT"|"INVOICE"|"CONTRACT"|"UNKNOWN", '
'"rawText": "<complete extracted text>"}\n'
"EXPENSE_RECEIPT: Quittungen, Tankbelege, Kassenzettel\n"
"BANK_DOCUMENT: Bankauszuege, Kontoauszuege mit Transaktionslisten\n"
"INVOICE: Rechnungen mit Rechnungsnummer und Faelligkeitsdatum\n"
"CONTRACT: Vertraege\n"
"UNKNOWN: Falls unklar"
)
# Phase 2: Type-specific structuring prompts (placeholders: {expenseList}, {bankList})
# NOTE: literal JSON braces are doubled ({{...}}) because these templates go through str.format.
# One booking per expense line item; debit/credit restricted to the injected account lists.
_PROMPT_EXPENSE_RECEIPT = (
"Extrahiere aus dem folgenden Dokument eine Buchung pro Ausgabeposition. "
"Return JSON: {{\"records\": [{{...}}]}}. Jeder Record: valuta (YYYY-MM-DD), transactionDateTime (unix seconds), company, desc, "
"bookingCurrency, bookingAmount, originalCurrency, originalAmount, vatPercentage, vatAmount, "
"debitAccountNumber (NUR die Kontonummer, z.B. \"6200\", aus: {expenseList}), "
"creditAccountNumber (NUR die Kontonummer, z.B. \"1020\", aus: {bankList}), tags, taxCode, costCenter, bookingReference."
)
# One booking per transaction row; bank statements carry no VAT fields.
_PROMPT_BANK_DOCUMENT = (
"Extrahiere aus dem folgenden Bankauszug eine Buchung pro Transaktionszeile. "
"Return JSON: {{\"records\": [{{...}}]}}. Jeder Record: valuta, company (Gegenpartei), desc (Zahlungsreferenz), "
"bookingAmount, bookingCurrency, "
"debitAccountNumber (NUR die Kontonummer aus: {expenseList}), creditAccountNumber (NUR die Kontonummer aus: {bankList}), bookingReference. "
"Kein MwSt bei Bankauszuegen. transactionDateTime optional."
)
# Exactly one booking per invoice; bookingReference carries the invoice number.
_PROMPT_INVOICE = (
"Extrahiere aus der folgenden Rechnung genau eine Buchung. "
"Return JSON: {{\"records\": [{{...}}]}}. Record: valuta (Rechnungsdatum), company (Kreditor), desc (Rechnungsdetails), "
"bookingAmount, bookingCurrency, vatPercentage, vatAmount, "
"debitAccountNumber (NUR die Kontonummer aus: {expenseList}), creditAccountNumber (NUR die Kontonummer aus: {bankList}), "
"bookingReference (Rechnungsnummer), transactionDateTime, taxCode, costCenter."
)
# Generic fallback (CONTRACT / UNKNOWN documents); mirrors the expense-receipt schema.
_PROMPT_FALLBACK = (
"Extrahiere aus dem folgenden Dokument Buchungsdaten. "
"Return JSON: {{\"records\": [{{...}}]}}. Jeder Record: valuta (YYYY-MM-DD), transactionDateTime (unix seconds), company, desc, "
"bookingCurrency, bookingAmount, originalCurrency, originalAmount, vatPercentage, vatAmount, "
"debitAccountNumber (NUR die Kontonummer, z.B. \"6200\", aus: {expenseList}), "
"creditAccountNumber (NUR die Kontonummer, z.B. \"1020\", aus: {bankList}), tags, taxCode, costCenter, bookingReference."
)
async def _buildDefaultPromptWithAccounts(self, featureInstanceId: str) -> str:
"""Build extraction prompt with real expense accounts from the connected accounting system."""
def _parseClassificationResult(raw: str) -> Tuple[str, str]:
    """Parse the phase-1 AI response ({documentType, rawText}) into (documentType, rawText).

    Falls back to ("UNKNOWN", "") when the response is empty or not valid JSON.
    """
    from modules.shared.jsonUtils import stripCodeFences, extractFirstBalancedJson
    docType = "UNKNOWN"
    text = ""
    candidate = extractFirstBalancedJson(stripCodeFences((raw or "").strip()))
    try:
        parsed = json.loads(candidate)
        # Normalise the label: upper-case with underscores (e.g. "expense receipt" -> EXPENSE_RECEIPT).
        docType = (parsed.get("documentType") or "UNKNOWN").strip().upper().replace(" ", "_")
        text = (parsed.get("rawText") or parsed.get("raw_text") or "").strip()
    except Exception as exc:
        logger.debug("Parse classification result: %s", exc)
    return (docType, text)
def _buildStructuringPrompt(documentType: str, expenseList: str, bankList: str) -> str:
    """Build the phase-2 structuring prompt for the given document type.

    Empty account lists fall back to sensible Swiss SME defaults; unknown
    document types get the generic fallback template.
    """
    expenses = expenseList or "6200 Fahrzeugaufwand, 6000 Materialaufwand"
    banks = bankList or "1020 Bank"
    normalised = (documentType or "UNKNOWN").upper().replace(" ", "_")
    templates = {
        "EXPENSE_RECEIPT": _PROMPT_EXPENSE_RECEIPT,
        "BANK_DOCUMENT": _PROMPT_BANK_DOCUMENT,
        "INVOICE": _PROMPT_INVOICE,
    }
    chosen = templates.get(normalised, _PROMPT_FALLBACK)
    return chosen.format(expenseList=expenses, bankList=banks)
async def _getAccountLists(self, featureInstanceId: str) -> Tuple[str, str]:
"""Load expense and bank account lists from the connected accounting system for use in prompts.
Returns (expenseList, bankList). Empty strings if not configured or on error."""
try:
from modules.features.trustee.interfaceFeatureTrustee import getInterface as getTrusteeInterface
from modules.features.trustee.accounting.accountingBridge import AccountingBridge
@ -50,25 +113,30 @@ async def _buildDefaultPromptWithAccounts(self, featureInstanceId: str) -> str:
assetAccounts = await bridge.getChartOfAccounts(featureInstanceId, accountType="asset")
except Exception as e:
logger.debug("Could not load chart of accounts for prompt: %s", e)
return ""
return ("", "")
if not expenseAccounts:
return ""
return ("", "")
expenseList = ", ".join(f"{a.accountNumber} {a.label}" for a in expenseAccounts[:50])
bankAccounts = [a for a in assetAccounts if a.accountNumber.startswith("10")]
bankList = ", ".join(f"{a.accountNumber} {a.label}" for a in bankAccounts[:10]) if bankAccounts else "1020 Bank"
return (expenseList, bankList)
return (
'Extract document type (one of: INVOICE, EXPENSE_RECEIPT, BANK_DOCUMENT, CONTRACT, UNKNOWN) '
'and expense/position records. Return JSON: {"documentType": "...", "records": [{...}]}. '
'Each record must have: valuta (YYYY-MM-DD), transactionDateTime (unix seconds), company, desc (full extracted text), '
'tags (from: customer, meeting, license, subscription, fuel, food, material), '
'bookingCurrency, bookingAmount, originalCurrency, originalAmount, vatPercentage, vatAmount, '
f'debitAccountNumber (Soll-Konto, verwende eines der folgenden Aufwandkonten: {expenseList}), '
f'creditAccountNumber (Haben-Konto, verwende eines der folgenden Konten: {bankList}), '
'taxCode, costCenter, bookingReference.'
)
def _parseStructuredRecords(raw: str) -> List[Dict[str, Any]]:
    """Parse the phase-2 AI response into a list of record dicts.

    Tries JSON ({"records": [...]} or {"extractedData": [...]}) first; on any
    JSON failure falls back to CSV parsing. Returns [] when nothing parses.
    """
    from modules.shared.jsonUtils import stripCodeFences, extractFirstBalancedJson
    candidate = extractFirstBalancedJson(stripCodeFences((raw or "").strip()))
    parsed: List[Dict[str, Any]] = []
    try:
        payload = json.loads(candidate)
        parsed = payload.get("records") or payload.get("extractedData") or []
    except Exception:
        # Not JSON (or JSON without the expected shape) — try CSV instead.
        if candidate:
            parsed = _parseCsvToRecords(candidate)
    if isinstance(parsed, list):
        return parsed
    return []
def _parseCsvToRecords(csvContent: str) -> List[Dict[str, Any]]:
@ -85,26 +153,40 @@ def _parseCsvToRecords(csvContent: str) -> List[Dict[str, Any]]:
content = "\n".join(lines)
reader = csv.DictReader(io.StringIO(content))
for row in reader:
cleaned = {k.strip(): (v.strip() if isinstance(v, str) else v) for k, v in row.items()}
cleaned = {(k.strip() if k else k): (v.strip() if isinstance(v, str) else v) for k, v in row.items() if k}
records.append(cleaned)
except Exception as e:
logger.warning(f"Parse CSV: {e}")
return records
async def _extractWithAi(self, chatDocumentId: str, fileId: str, fileName: str, mimeType: str, prompt: str, featureInstanceId: str) -> Dict[str, Any]:
"""Run AI extraction on one file; return { documentType, extractedData (records), fileId, fileName }."""
async def _extractWithAi(
self,
chatDocumentId: str,
fileId: str,
fileName: str,
mimeType: str,
expenseList: str,
bankList: str,
featureInstanceId: str,
) -> Dict[str, Any]:
"""Run 2-phase AI extraction: (1) classify + full text, (2) structure by type. Returns { documentType, extractedData, fileId, fileName }."""
await self.services.ai.ensureAiObjectsInitialized()
from modules.datamodels.datamodelDocref import DocumentReferenceList, DocumentItemReference
docList = DocumentReferenceList(
references=[DocumentItemReference(documentId=chatDocumentId, fileName=fileName)]
)
# Prefer JSON for documentType + records in one response; fallback to CSV
options = AiCallOptions(resultFormat="json", operationType=OperationTypeEnum.DATA_GENERATE)
try:
aiResponse = await self.services.ai.callAiContent(
prompt=prompt or _DEFAULT_PROMPT_FALLBACK,
self.services.utils.writeDebugFile(_CLASSIFICATION_PROMPT, "trustee_classification_prompt")
except Exception:
pass
options = AiCallOptions(resultFormat="json", operationType=OperationTypeEnum.DATA_EXTRACT)
try:
phase1Response = await self.services.ai.callAiContent(
prompt=_CLASSIFICATION_PROMPT,
options=options,
documentList=docList,
contentParts=None,
@ -112,9 +194,9 @@ async def _extractWithAi(self, chatDocumentId: str, fileId: str, fileName: str,
generationIntent="extract",
)
except Exception:
options = AiCallOptions(resultFormat="csv", operationType=OperationTypeEnum.DATA_GENERATE)
aiResponse = await self.services.ai.callAiContent(
prompt=prompt or _DEFAULT_PROMPT_FALLBACK,
options = AiCallOptions(resultFormat="csv", operationType=OperationTypeEnum.DATA_EXTRACT)
phase1Response = await self.services.ai.callAiContent(
prompt=_CLASSIFICATION_PROMPT,
options=options,
documentList=docList,
contentParts=None,
@ -122,63 +204,50 @@ async def _extractWithAi(self, chatDocumentId: str, fileId: str, fileName: str,
generationIntent="extract",
)
if not aiResponse or not aiResponse.documents:
if not phase1Response or not phase1Response.documents:
return {"documentType": "UNKNOWN", "extractedData": [], "fileId": fileId, "fileName": fileName}
doc = aiResponse.documents[0]
raw = doc.documentData
if isinstance(raw, bytes):
raw = raw.decode("utf-8")
raw1 = phase1Response.documents[0].documentData
if isinstance(raw1, bytes):
raw1 = raw1.decode("utf-8")
documentType, rawText = _parseClassificationResult(raw1 or "")
documentType = "UNKNOWN"
records = []
if not rawText:
return {"documentType": documentType or "UNKNOWN", "extractedData": [], "fileId": fileId, "fileName": fileName}
# Try JSON first
structuringPrompt = _buildStructuringPrompt(documentType, expenseList, bankList)
try:
if raw.strip().startswith("{"):
data = json.loads(raw)
# Direct format: {"documentType": "...", "records": [...]}
if "records" in data or "extractedData" in data:
documentType = (data.get("documentType") or "UNKNOWN").upper().replace(" ", "_")
records = data.get("records") or data.get("extractedData") or []
# Wrapped in document structure: {"documents": [{"sections": [{"elements": [{"content": {"code": "..."}}]}]}]}
elif "documents" in data:
for doc in data.get("documents", []):
for section in doc.get("sections", []):
for elem in section.get("elements", []):
code = (elem.get("content") or {}).get("code")
if code and isinstance(code, str):
try:
inner = json.loads(code)
if isinstance(inner, dict) and ("records" in inner or "documentType" in inner):
documentType = (inner.get("documentType") or "UNKNOWN").upper().replace(" ", "_")
records = inner.get("records") or inner.get("extractedData") or []
break
except Exception:
pass
if records:
break
if records:
break
elif "documentType" in data:
documentType = (data.get("documentType") or "UNKNOWN").upper().replace(" ", "_")
self.services.utils.writeDebugFile(structuringPrompt, "trustee_structuring_prompt")
except Exception:
pass
# Fallback: CSV
if not records and raw:
records = _parseCsvToRecords(raw)
if records and not documentType or documentType == "UNKNOWN":
documentType = "EXPENSE_RECEIPT"
fullPrompt = f"{structuringPrompt}\n\nDOKUMENT-TEXT:\n{rawText}"
phase2Request = AiCallRequest(
prompt=fullPrompt,
context="",
options=AiCallOptions(resultFormat="json"),
)
phase2Response = await self.services.ai.callAi(phase2Request)
raw2 = (phase2Response.content or "").strip() if hasattr(phase2Response, "content") else ""
try:
self.services.utils.writeDebugFile(raw2 or "(empty)", "trustee_structuring_response")
except Exception:
pass
records = _parseStructuredRecords(raw2)
logger.info("Phase 2 result: documentType=%s, records=%d, raw2_length=%d", documentType, len(records), len(raw2))
return {"documentType": documentType, "extractedData": records, "fileId": fileId, "fileName": fileName} # fileId from caller for result
if records and (not documentType or documentType == "UNKNOWN"):
documentType = "EXPENSE_RECEIPT"
return {"documentType": documentType or "UNKNOWN", "extractedData": records, "fileId": fileId, "fileName": fileName}
async def _extractOne(
self,
f: Dict[str, Any],
fileIdToChatDocId: Dict[str, str],
prompt: str,
expenseList: str,
bankList: str,
featureInstanceId: str,
) -> ActionDocument:
"""Run extraction for one file; returns success or error ActionDocument (never raises)."""
@ -197,7 +266,7 @@ async def _extractOne(
)
try:
out = await _extractWithAi(
self, chatDocId, f["fileId"], f["fileName"], f["mimeType"], prompt, featureInstanceId
self, chatDocId, f["fileId"], f["fileName"], f["mimeType"], expenseList, bankList, featureInstanceId
)
return ActionDocument(
documentName=f.get("fileName", "extract") + ".json",
@ -229,7 +298,6 @@ async def extractFromFiles(self, parameters: Dict[str, Any]) -> ActionResult:
connectionReference = parameters.get("connectionReference")
sharepointFolder = parameters.get("sharepointFolder")
featureInstanceId = parameters.get("featureInstanceId") or getattr(self.services, "featureInstanceId", None)
prompt = parameters.get("prompt") or ""
if not featureInstanceId:
return ActionResult.isFailure(error="featureInstanceId is required")
@ -329,13 +397,11 @@ async def extractFromFiles(self, parameters: Dict[str, Any]) -> ActionResult:
if i < len(createdMessage.documents):
fileIdToChatDocId[f["fileId"]] = createdMessage.documents[i].id
# Load expense accounts from accounting system for AI prompt (if configured)
if not prompt:
prompt = await _buildDefaultPromptWithAccounts(self, featureInstanceId)
expenseList, bankList = await _getAccountLists(self, featureInstanceId)
# Parallel extraction (all files at once)
# Parallel extraction (all files at once, 2-phase: classify + structure)
tasks = [
_extractOne(self, f, fileIdToChatDocId, prompt, featureInstanceId)
_extractOne(self, f, fileIdToChatDocId, expenseList, bankList, featureInstanceId)
for f in filesToProcess
]
resultDocuments = list(await asyncio.gather(*tasks))

View file

@ -27,6 +27,24 @@ def _parseFloat(value) -> float:
return 0.0
def _extractAccountNumber(value) -> Optional[str]:
"""Extract the leading numeric account number from AI output like '6200 Fahrzeugaufwand' -> '6200'."""
if not value or not isinstance(value, str):
return None
import re
match = re.match(r"(\d+)", value.strip())
return match.group(1) if match else value.strip() or None
def _normaliseTags(value) -> str:
"""Convert tags from various formats to a clean comma-separated string."""
if not value:
return ""
if isinstance(value, list):
return ", ".join(str(t) for t in value if t)
return str(value)
def _recordToPosition(record: Dict[str, Any], documentId: Optional[str], featureInstanceId: str, mandateId: str) -> Dict[str, Any]:
"""Map extraction record to TrusteePosition payload."""
return {
@ -35,15 +53,15 @@ def _recordToPosition(record: Dict[str, Any], documentId: Optional[str], feature
"transactionDateTime": record.get("transactionDateTime"),
"company": record.get("company", ""),
"desc": record.get("desc", ""),
"tags": record.get("tags", ""),
"tags": _normaliseTags(record.get("tags")),
"bookingCurrency": record.get("bookingCurrency", "CHF"),
"bookingAmount": _parseFloat(record.get("bookingAmount", 0)),
"originalCurrency": record.get("originalCurrency") or record.get("bookingCurrency", "CHF"),
"originalAmount": _parseFloat(record.get("originalAmount", 0)) or _parseFloat(record.get("bookingAmount", 0)),
"vatPercentage": _parseFloat(record.get("vatPercentage", 0)),
"vatAmount": _parseFloat(record.get("vatAmount", 0)),
"debitAccountNumber": record.get("debitAccountNumber") or None,
"creditAccountNumber": record.get("creditAccountNumber") or None,
"debitAccountNumber": _extractAccountNumber(record.get("debitAccountNumber")),
"creditAccountNumber": _extractAccountNumber(record.get("creditAccountNumber")),
"taxCode": record.get("taxCode") or None,
"costCenter": record.get("costCenter") or None,
"bookingReference": record.get("bookingReference") or None,