fixed ai call end to end with saas multimandate

This commit is contained in:
ValueOn AG 2026-01-26 23:26:30 +01:00
parent 5c4813b10a
commit 97cbda0ef2
11 changed files with 395 additions and 232 deletions

View file

@ -72,10 +72,16 @@ class ModelSelector:
promptSize = len(prompt.encode("utf-8")) promptSize = len(prompt.encode("utf-8"))
contextSize = len(context.encode("utf-8")) contextSize = len(context.encode("utf-8"))
totalSize = promptSize + contextSize totalSize = promptSize + contextSize
# Convert bytes to approximate tokens (1 token ≈ 4 bytes) # Convert bytes to approximate tokens
promptTokens = promptSize / 4 # Conservative estimate: 1 token ≈ 2 bytes (for safety margin)
contextTokens = contextSize / 4 # Note: Actual tokenization varies by content type and model
totalTokens = totalSize / 4 # - English text: ~4 bytes/token
# - Structured data/JSON: ~2-3 bytes/token
# - Base64/encoded data: ~1.5-2 bytes/token
bytesPerToken = 2 # Conservative estimate for mixed content
promptTokens = promptSize / bytesPerToken
contextTokens = contextSize / bytesPerToken
totalTokens = totalSize / bytesPerToken
logger.debug(f"Request sizes - Prompt: {promptTokens:.0f} tokens ({promptSize} bytes), Context: {contextTokens:.0f} tokens ({contextSize} bytes), Total: {totalTokens:.0f} tokens ({totalSize} bytes)") logger.debug(f"Request sizes - Prompt: {promptTokens:.0f} tokens ({promptSize} bytes), Context: {contextTokens:.0f} tokens ({contextSize} bytes), Total: {totalTokens:.0f} tokens ({totalSize} bytes)")

View file

@ -220,11 +220,12 @@ class ChatMessage(BaseModel):
) )
role: str = Field(description="Role of the message sender") role: str = Field(description="Role of the message sender")
status: str = Field(description="Status of the message (first, step, last)") status: str = Field(description="Status of the message (first, step, last)")
sequenceNr: int = Field( sequenceNr: Optional[int] = Field(
default=0,
description="Sequence number of the message (set automatically)" description="Sequence number of the message (set automatically)"
) )
publishedAt: float = Field( publishedAt: Optional[float] = Field(
default_factory=getUtcTimestamp, default=None,
description="When the message was published (UTC timestamp in seconds)", description="When the message was published (UTC timestamp in seconds)",
) )
success: Optional[bool] = Field( success: Optional[bool] = Field(

View file

@ -399,7 +399,7 @@ AUTOMATION_TEMPLATES: Dict[str, Any] = {
"connectionName": "", "connectionName": "",
"sharepointFolder": "", "sharepointFolder": "",
"featureInstanceId": "", "featureInstanceId": "",
"extractionPrompt": "Du bist ein Spezialist für die Extraktion von Spesendaten aus PDF-Dokumenten.\n\nAUFGABE:\nExtrahiere alle Speseneinträge aus dem bereitgestellten PDF-Dokument und gib sie im CSV-Format zurück.\n\nWICHTIGE REGELN:\n1. Pro MwSt-Prozentsatz einen separaten Datensatz erstellen\n2. Alle Datensätze zusammen müssen den Gesamtbetrag des Dokuments ergeben\n3. Der gesamte extrahierte Text des Dokuments muss im Feld \"desc\" erfasst werden\n4. Feld \"company\" enthält den Lieferanten/Verkäufer der Buchung\n5. Tags müssen aus dieser Liste gewählt werden: customer, meeting, license, subscription, fuel, food, material\n - Mehrere zutreffende Tags mit Komma trennen\n\nCSV-SPALTEN (in dieser Reihenfolge):\nvaluta,transactionDateTime,company,desc,tags,bookingCurrency,bookingAmount,originalCurrency,originalAmount,vatPercentage,vatAmount\n\nDATENFORMAT:\n- valuta: YYYY-MM-DD (Valutadatum)\n- transactionDateTime: Unix-Timestamp in Sekunden (Transaktionszeitpunkt)\n- company: Lieferant/Verkäufer Name\n- desc: Vollständiger extrahierter Text des Dokuments\n- tags: Komma-getrennte Tags aus der erlaubten Liste\n- bookingCurrency: Währungscode (CHF, EUR, USD, GBP)\n- bookingAmount: Buchungsbetrag als Dezimalzahl\n- originalCurrency: Original-Währungscode\n- originalAmount: Original-Betrag als Dezimalzahl\n- vatPercentage: MwSt-Prozentsatz (z.B. 8.1 für 8.1%)\n- vatAmount: MwSt-Betrag als Dezimalzahl\n\nBEISPIEL OUTPUT:\nvaluta,transactionDateTime,company,desc,tags,bookingCurrency,bookingAmount,originalCurrency,originalAmount,vatPercentage,vatAmount\n2026-01-15,1736953200,Migros AG,\"Einkauf Migros Zürich...\",food,CHF,45.50,CHF,45.50,2.6,1.15\n2026-01-15,1736953200,Migros AG,\"Einkauf Migros Zürich...\",material,CHF,12.30,CHF,12.30,8.1,0.92\n\nHINWEISE:\n- Wenn nur ein MwSt-Satz vorhanden ist, einen Datensatz erstellen\n- Wenn mehrere MwSt-Sätze vorhanden sind (z.B. Lebensmittel 2.6% und Non-Food 8.1%), separate Datensätze erstellen\n- Bei fehlenden Informationen: leeres Feld oder Standardwert\n- Keine Anführungszeichen um numerische Werte" "extractionPrompt": "Du bist ein Spezialist für die Extraktion von Belegdaten aus PDF-Dokumenten.\n\nAUFGABE:\nExtrahiere die Daten aus dem bereitgestellten Zahlungsbeleg und erstelle EINE EINZIGE CSV-Tabelle mit allen Datensätzen.\n\nOUTPUT-STRUKTUR:\nErstelle genau EINE Tabelle mit den folgenden Spalten. Alle extrahierten Datensätze kommen in diese eine Tabelle als Zeilen.\n\nWICHTIGE REGELN:\n1. Pro MwSt-Prozentsatz einen separaten Datensatz (= Zeile) erstellen\n2. Alle Datensätze zusammen müssen den Gesamtbetrag des Dokuments ergeben\n3. Der gesamte extrahierte Text des Dokuments muss im Feld \"desc\" erfasst werden\n4. Feld \"company\" enthält den Lieferanten/Verkäufer der Buchung\n5. Tags müssen aus dieser Liste gewählt werden: customer, meeting, license, subscription, fuel, food, material\n - Mehrere zutreffende Tags mit Komma trennen\n\nCSV-SPALTEN (in dieser Reihenfolge):\nvaluta,transactionDateTime,company,desc,tags,bookingCurrency,bookingAmount,originalCurrency,originalAmount,vatPercentage,vatAmount\n\nDATENFORMAT:\n- valuta: YYYY-MM-DD (Valutadatum)\n- transactionDateTime: Unix-Timestamp in Sekunden (Transaktionszeitpunkt)\n- company: Lieferant/Verkäufer Name\n- desc: Vollständiger extrahierter Text des Dokuments\n- tags: Komma-getrennte Tags aus der erlaubten Liste\n- bookingCurrency: Währungscode (CHF, EUR, USD, GBP)\n- bookingAmount: Buchungsbetrag als Dezimalzahl\n- originalCurrency: Original-Währungscode\n- originalAmount: Original-Betrag als Dezimalzahl\n- vatPercentage: MwSt-Prozentsatz (z.B. 8.1 für 8.1%)\n- vatAmount: MwSt-Betrag als Dezimalzahl\n\nHINWEISE:\n- Wenn nur ein MwSt-Satz vorhanden ist, einen Datensatz erstellen\n- Wenn mehrere MwSt-Sätze vorhanden sind (z.B. Lebensmittel 2.6% und Non-Food 8.1%), separate Datensätze erstellen\n- Bei fehlenden Informationen: leeres Feld oder Standardwert"
} }
} }
] ]

View file

@ -279,7 +279,10 @@ registerModelLabels(
class TrusteeDocument(BaseModel): class TrusteeDocument(BaseModel):
"""Contains document references and receipts for bookings. """Contains document references for bookings.
Documents reference files in the central Files table via fileId.
This allows file content to be stored once and referenced by multiple features.
Note: organisationId and contractId removed as per architecture decision: Note: organisationId and contractId removed as per architecture decision:
- The feature instance IS the organisation - The feature instance IS the organisation
@ -294,11 +297,11 @@ class TrusteeDocument(BaseModel):
"frontend_required": False "frontend_required": False
} }
) )
documentData: Optional[bytes] = Field( fileId: Optional[str] = Field(
default=None, default=None,
description="The file content (binary)", description="Reference to central Files table (Files.id)",
json_schema_extra={ json_schema_extra={
"frontend_type": "file", "frontend_type": "file_reference",
"frontend_readonly": False, "frontend_readonly": False,
"frontend_required": False "frontend_required": False
} }
@ -321,6 +324,24 @@ class TrusteeDocument(BaseModel):
"frontend_options": "/api/trustee/mime-types/options" "frontend_options": "/api/trustee/mime-types/options"
} }
) )
sourceType: Optional[str] = Field(
default=None,
description="Source type (e.g., 'sharepoint', 'upload', 'email')",
json_schema_extra={
"frontend_type": "text",
"frontend_readonly": True,
"frontend_required": False
}
)
sourceLocation: Optional[str] = Field(
default=None,
description="Original source location (e.g., SharePoint path)",
json_schema_extra={
"frontend_type": "text",
"frontend_readonly": True,
"frontend_required": False
}
)
mandateId: Optional[str] = Field( mandateId: Optional[str] = Field(
default=None, default=None,
description="Mandate ID (auto-set from context)", description="Mandate ID (auto-set from context)",
@ -349,9 +370,11 @@ registerModelLabels(
{"en": "Document", "fr": "Document", "de": "Dokument"}, {"en": "Document", "fr": "Document", "de": "Dokument"},
{ {
"id": {"en": "ID", "fr": "ID", "de": "ID"}, "id": {"en": "ID", "fr": "ID", "de": "ID"},
"documentData": {"en": "Document Data", "fr": "Données du document", "de": "Dokumentdaten"}, "fileId": {"en": "File Reference", "fr": "Référence du fichier", "de": "Datei-Referenz"},
"documentName": {"en": "Document Name", "fr": "Nom du document", "de": "Dokumentname"}, "documentName": {"en": "Document Name", "fr": "Nom du document", "de": "Dokumentname"},
"documentMimeType": {"en": "MIME Type", "fr": "Type MIME", "de": "MIME-Typ"}, "documentMimeType": {"en": "MIME Type", "fr": "Type MIME", "de": "MIME-Typ"},
"sourceType": {"en": "Source Type", "fr": "Type de source", "de": "Quelltyp"},
"sourceLocation": {"en": "Source Location", "fr": "Emplacement source", "de": "Quellort"},
"mandateId": {"en": "Mandate", "fr": "Mandat", "de": "Mandat"}, "mandateId": {"en": "Mandate", "fr": "Mandat", "de": "Mandat"},
"featureInstanceId": {"en": "Feature Instance", "fr": "Instance de fonctionnalité", "de": "Feature-Instanz"}, "featureInstanceId": {"en": "Feature Instance", "fr": "Instance de fonctionnalité", "de": "Feature-Instanz"},
}, },

View file

@ -177,8 +177,7 @@ class TrusteeObjects:
AccessRuleContext.DATA, AccessRuleContext.DATA,
tableName, tableName,
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
if not permissions.view: if not permissions.view:
@ -205,8 +204,7 @@ class TrusteeObjects:
AccessRuleContext.DATA, AccessRuleContext.DATA,
tableName, tableName,
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
if not permissions.view: if not permissions.view:
@ -270,8 +268,7 @@ class TrusteeObjects:
recordFilter=None, recordFilter=None,
orderBy="id", orderBy="id",
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
logger.debug(f"getAllOrganisations: getRecordsetWithRBAC returned {len(records)} records") logger.debug(f"getAllOrganisations: getRecordsetWithRBAC returned {len(records)} records")
@ -364,8 +361,7 @@ class TrusteeObjects:
recordFilter=None, recordFilter=None,
orderBy="id", orderBy="id",
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
# Users with ALL access level (from system RBAC) see all roles # Users with ALL access level (from system RBAC) see all roles
@ -475,8 +471,7 @@ class TrusteeObjects:
recordFilter=None, recordFilter=None,
orderBy="id", orderBy="id",
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
# Users with ALL access level (from system RBAC) see all records # Users with ALL access level (from system RBAC) see all records
@ -535,8 +530,7 @@ class TrusteeObjects:
recordFilter={"organisationId": organisationId}, recordFilter={"organisationId": organisationId},
orderBy="id", orderBy="id",
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
return [TrusteeAccess(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records] return [TrusteeAccess(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records]
@ -553,8 +547,7 @@ class TrusteeObjects:
recordFilter={"userId": userId}, recordFilter={"userId": userId},
orderBy="id", orderBy="id",
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
# Users with ALL access level (from system RBAC) see all records # Users with ALL access level (from system RBAC) see all records
@ -671,8 +664,7 @@ class TrusteeObjects:
recordFilter=None, recordFilter=None,
orderBy="id", orderBy="id",
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
totalItems = len(records) totalItems = len(records)
@ -705,8 +697,7 @@ class TrusteeObjects:
recordFilter={"organisationId": organisationId}, recordFilter={"organisationId": organisationId},
orderBy="label", orderBy="label",
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
return [TrusteeContract(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records] return [TrusteeContract(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records]
@ -780,8 +771,8 @@ class TrusteeObjects:
createdRecord = self.db.recordCreate(TrusteeDocument, data) createdRecord = self.db.recordCreate(TrusteeDocument, data)
if createdRecord and createdRecord.get("id"): if createdRecord and createdRecord.get("id"):
# Remove binary data and metadata from Pydantic model # Remove metadata from Pydantic model
cleanedRecord = {k: v for k, v in createdRecord.items() if not k.startswith("_") and k != "documentData"} cleanedRecord = {k: v for k, v in createdRecord.items() if not k.startswith("_")}
return TrusteeDocument(**cleanedRecord) return TrusteeDocument(**cleanedRecord)
return None return None
@ -795,12 +786,25 @@ class TrusteeObjects:
return TrusteeDocument(**cleanedRecord) return TrusteeDocument(**cleanedRecord)
def getDocumentData(self, documentId: str) -> Optional[bytes]: def getDocumentData(self, documentId: str) -> Optional[bytes]:
"""Get document binary data.""" """Get document binary data via fileId reference to central Files table."""
records = self.db.getRecordset(TrusteeDocument, recordFilter={"id": documentId}) records = self.db.getRecordset(TrusteeDocument, recordFilter={"id": documentId})
record = records[0] if records else None record = records[0] if records else None
if record: if not record:
return record.get("documentData") return None
return None
# New model: fileId references central Files table
fileId = record.get("fileId")
if fileId:
from modules.interfaces.interfaceDbManagement import getInterface as getDbInterface
dbInterface = getDbInterface(self.currentUser, mandateId=self.mandateId, featureInstanceId=self.featureInstanceId)
fileData = dbInterface.getFileData(fileId)
if fileData:
return fileData
logger.warning(f"File data not found for fileId {fileId}")
return None
# Legacy fallback: documentData was stored directly (for migration)
return record.get("documentData")
def getAllDocuments(self, params: Optional[PaginationParams] = None) -> PaginatedResult: def getAllDocuments(self, params: Optional[PaginationParams] = None) -> PaginatedResult:
"""Get all documents with RBAC filtering + feature-level access filtering (metadata only).""" """Get all documents with RBAC filtering + feature-level access filtering (metadata only)."""
@ -812,8 +816,7 @@ class TrusteeObjects:
recordFilter=None, recordFilter=None,
orderBy="documentName", orderBy="documentName",
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
# Convert dicts to Pydantic objects (remove binary data and internal fields) # Convert dicts to Pydantic objects (remove binary data and internal fields)
@ -852,8 +855,7 @@ class TrusteeObjects:
recordFilter={"contractId": contractId}, recordFilter={"contractId": contractId},
orderBy="documentName", orderBy="documentName",
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
result = [] result = []
@ -960,8 +962,7 @@ class TrusteeObjects:
recordFilter=None, recordFilter=None,
orderBy="valuta", orderBy="valuta",
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
# Convert dicts to Pydantic objects (remove internal fields) # Convert dicts to Pydantic objects (remove internal fields)
@ -1000,8 +1001,7 @@ class TrusteeObjects:
recordFilter={"contractId": contractId}, recordFilter={"contractId": contractId},
orderBy="valuta", orderBy="valuta",
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
return [TrusteePosition(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records] return [TrusteePosition(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records]
@ -1015,8 +1015,7 @@ class TrusteeObjects:
recordFilter={"organisationId": organisationId}, recordFilter={"organisationId": organisationId},
orderBy="valuta", orderBy="valuta",
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
return [TrusteePosition(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records] return [TrusteePosition(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records]
@ -1173,8 +1172,7 @@ class TrusteeObjects:
recordFilter={"positionId": positionId}, recordFilter={"positionId": positionId},
orderBy="id", orderBy="id",
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
return [TrusteePositionDocument(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in links] return [TrusteePositionDocument(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in links]
@ -1188,8 +1186,7 @@ class TrusteeObjects:
recordFilter={"documentId": documentId}, recordFilter={"documentId": documentId},
orderBy="id", orderBy="id",
mandateId=self.mandateId, mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId, featureInstanceId=self.featureInstanceId
featureCode=self.FEATURE_CODE
) )
return [TrusteePositionDocument(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in links] return [TrusteePositionDocument(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in links]

View file

@ -53,7 +53,8 @@ async def start_workflow(
""" """
try: try:
# Start or continue workflow using playground controller # Start or continue workflow using playground controller
workflow = await chatStart(context.user, userInput, workflowMode, workflowId) mandateId = str(context.mandateId) if context.mandateId else None
workflow = await chatStart(context.user, userInput, workflowMode, workflowId, mandateId=mandateId)
return workflow return workflow
@ -75,7 +76,8 @@ async def stop_workflow(
"""Stops a running workflow.""" """Stops a running workflow."""
try: try:
# Stop workflow using playground controller # Stop workflow using playground controller
workflow = await chatStop(context.user, workflowId) mandateId = str(context.mandateId) if context.mandateId else None
workflow = await chatStop(context.user, workflowId, mandateId=mandateId)
return workflow return workflow

View file

@ -785,14 +785,39 @@ Respond with ONLY a JSON object in this exact format:
if part.data if part.data
]) ])
# Call AI with extracted content # Check content size and use chunking if needed
aiRequest = AiCallRequest( # Conservative estimate: 2 bytes per token, 80% of model limit for safety
prompt=f"{prompt}\n\nExtracted Content:\n{contentText}", contentSizeBytes = len(contentText.encode('utf-8'))
context="", promptSizeBytes = len(prompt.encode('utf-8'))
options=options totalSizeBytes = contentSizeBytes + promptSizeBytes
) estimatedTokens = totalSizeBytes / 2 # Conservative: 2 bytes per token
aiResponse = await self.callAi(aiRequest) # Get max model context (use Claude's 200k as reference, 80% = 160k tokens)
maxSafeTokens = 160000
if estimatedTokens > maxSafeTokens:
# Content too large - use chunking via ExtractionService
logger.warning(f"Content too large for single AI call: ~{estimatedTokens:.0f} tokens (limit: {maxSafeTokens}). Using chunked processing.")
# Use ExtractionService for chunked processing
extractionService = self.services.extraction
aiResponse = await extractionService.processContentPartsWithPrompt(
contentParts=contentParts,
prompt=prompt,
aiObjects=self.aiObjects,
options=options,
operationId=extractOperationId,
parentOperationId=parentOperationId
)
else:
# Content fits - use single AI call
aiRequest = AiCallRequest(
prompt=f"{prompt}\n\nExtracted Content:\n{contentText}",
context="",
options=options
)
aiResponse = await self.callAi(aiRequest)
# Create response document # Create response document
resultDocument = DocumentData( resultDocument = DocumentData(

View file

@ -567,7 +567,8 @@ class StructureFiller:
userPrompt: str, userPrompt: str,
all_sections_list: List[Dict[str, Any]], all_sections_list: List[Dict[str, Any]],
language: str, language: str,
calculateOverallProgress: callable outputFormat: str = "txt",
calculateOverallProgress: callable = None
) -> List[Dict[str, Any]]: ) -> List[Dict[str, Any]]:
""" """
Process a single section and return its elements. Process a single section and return its elements.
@ -761,7 +762,8 @@ class StructureFiller:
allSections=all_sections_list, allSections=all_sections_list,
sectionIndex=sectionIndex, sectionIndex=sectionIndex,
isAggregation=isAggregation, isAggregation=isAggregation,
language=language language=language,
outputFormat=outputFormat
) )
sectionOperationId = f"{fillOperationId}_section_{sectionId}" sectionOperationId = f"{fillOperationId}_section_{sectionId}"
@ -949,7 +951,8 @@ class StructureFiller:
allSections=all_sections_list, allSections=all_sections_list,
sectionIndex=sectionIndex, sectionIndex=sectionIndex,
isAggregation=False, isAggregation=False,
language=language language=language,
outputFormat=outputFormat
) )
sectionOperationId = f"{fillOperationId}_section_{sectionId}" sectionOperationId = f"{fillOperationId}_section_{sectionId}"
@ -1214,7 +1217,8 @@ class StructureFiller:
allSections=all_sections_list, allSections=all_sections_list,
sectionIndex=sectionIndex, sectionIndex=sectionIndex,
isAggregation=False, isAggregation=False,
language=language language=language,
outputFormat=outputFormat
) )
sectionOperationId = f"{fillOperationId}_section_{sectionId}" sectionOperationId = f"{fillOperationId}_section_{sectionId}"
@ -1540,6 +1544,7 @@ class StructureFiller:
for doc in chapterStructure.get("documents", []): for doc in chapterStructure.get("documents", []):
docId = doc.get("id", "unknown") docId = doc.get("id", "unknown")
docLanguage = self._getDocumentLanguage(chapterStructure, docId) docLanguage = self._getDocumentLanguage(chapterStructure, docId)
docFormat = doc.get("outputFormat", "txt") # Get output format for this document
for chapter in doc.get("chapters", []): for chapter in doc.get("chapters", []):
chapterId = chapter.get("id", "unknown") chapterId = chapter.get("id", "unknown")
@ -1555,7 +1560,8 @@ class StructureFiller:
"sectionIndex": sectionIndex, "sectionIndex": sectionIndex,
"chapterSectionCount": chapterSectionCount, "chapterSectionCount": chapterSectionCount,
"section": section, "section": section,
"docLanguage": docLanguage "docLanguage": docLanguage,
"docFormat": docFormat # Include output format
}) })
logger.info(f"Starting FULLY PARALLEL section generation: {totalSections} sections across {totalChapters} chapters") logger.info(f"Starting FULLY PARALLEL section generation: {totalSections} sections across {totalChapters} chapters")
@ -1577,6 +1583,7 @@ class StructureFiller:
userPrompt=userPrompt, userPrompt=userPrompt,
all_sections_list=all_sections_list, all_sections_list=all_sections_list,
language=taskInfo["docLanguage"], language=taskInfo["docLanguage"],
outputFormat=taskInfo.get("docFormat", "txt"), # Pass output format
calculateOverallProgress=lambda *args: completedSections[0] / totalSections if totalSections > 0 else 1.0 calculateOverallProgress=lambda *args: completedSections[0] / totalSections if totalSections > 0 else 1.0
) )
@ -1826,23 +1833,11 @@ If AVAILABLE CONTENT PARTS are listed above, then EVERY section that generates c
- If chapter's generationHint references documents/images/data AND section generates content for that chapter → section MUST assign relevant ContentParts - If chapter's generationHint references documents/images/data AND section generates content for that chapter → section MUST assign relevant ContentParts
- Empty contentPartIds [] are only allowed if section generates content WITHOUT referencing any available ContentParts AND WITHOUT relating to chapter's generationHint - Empty contentPartIds [] are only allowed if section generates content WITHOUT referencing any available ContentParts AND WITHOUT relating to chapter's generationHint
## CONTENT TYPES ## ACCEPTED CONTENT TYPES FOR THIS FORMAT
Available content types for sections: table, bullet_list, heading, paragraph, code_block, image The document output format ({outputFormat}) accepts only the following content types:
{', '.join(acceptedSectionTypes)}
## ACCEPTED SECTION TYPES FOR THIS FORMAT **CRITICAL**: Only create sections with content types from this list. Other types will fail.
The document output format ({outputFormat}) accepts only the following section types:
{', '.join(acceptedSectionTypes) if acceptedSectionTypes else 'All section types'}
**IMPORTANT**: Only create sections with content types from the accepted list above. Do not create sections with types that are not accepted by this format.
## FORMAT-APPROPRIATE SECTION STRUCTURE
When determining which sections to create for this chapter, consider the document's output format ({outputFormat}) and ensure sections are structured appropriately for that format:
- Different formats have different capabilities and constraints
- Structure sections to match what the format can effectively represent
- Consider what content types work best for each format
- Ensure the section structure aligns with the format's strengths and limitations
- Select content types that are well-suited for the target format
- **CRITICAL**: Only use section types from the ACCEPTED SECTION TYPES list above
useAiCall RULE (simple): useAiCall RULE (simple):
- useAiCall: true Content needs AI processing (extract, transform, generate, filter, summarize) - useAiCall: true Content needs AI processing (extract, transform, generate, filter, summarize)
@ -1853,7 +1848,7 @@ RETURN JSON:
"sections": [ "sections": [
{{ {{
"id": "section_1", "id": "section_1",
"content_type": "paragraph", "content_type": "{acceptedSectionTypes[0]}",
"contentPartIds": ["extracted_part_id"], "contentPartIds": ["extracted_part_id"],
"generationHint": "Description of what to extract or generate", "generationHint": "Description of what to extract or generate",
"useAiCall": true, "useAiCall": true,
@ -1897,7 +1892,8 @@ Return only valid JSON. Do not include any explanatory text outside the JSON.
allSections: Optional[List[Dict[str, Any]]] = None, allSections: Optional[List[Dict[str, Any]]] = None,
sectionIndex: Optional[int] = None, sectionIndex: Optional[int] = None,
isAggregation: bool = False, isAggregation: bool = False,
language: str = "en" language: str = "en",
outputFormat: str = "txt"
) -> tuple[str, str]: ) -> tuple[str, str]:
"""Baue Prompt für Section-Generierung mit vollständigem Kontext.""" """Baue Prompt für Section-Generierung mit vollständigem Kontext."""
# Filtere None-Werte # Filtere None-Werte
@ -2005,14 +2001,29 @@ Return only valid JSON. Do not include any explanatory text outside the JSON.
for next in nextSections: for next in nextSections:
contextText += f"- {next['id']} ({next['content_type']}): {next['generation_hint']}\n" contextText += f"- {next['id']} ({next['content_type']}): {next['generation_hint']}\n"
contentStructureExample = self._getContentStructureExample(contentType) # Get accepted section types for the output format
acceptedTypesAggr = self._getAcceptedSectionTypesForFormat(outputFormat)
# CRITICAL: If the section's content_type is not supported by the output format,
# use the first accepted type instead. E.g., CSV only supports 'table', so
# even if section says 'code_block', we must output as 'table'.
effectiveContentType = contentType
if contentType not in acceptedTypesAggr and acceptedTypesAggr:
effectiveContentType = acceptedTypesAggr[0]
logger.debug(f"Section {sectionId}: Content type '{contentType}' not supported by format '{outputFormat}', using '{effectiveContentType}' instead")
contentStructureExample = self._getContentStructureExample(effectiveContentType)
# Build format note for the prompt - purely dynamic from renderer
# Always show what types are accepted for this format
formatNoteAggr = f"\n- Target Output Format: {outputFormat.upper()} (accepted content types: {', '.join(acceptedTypesAggr)})"
# Create template structure explicitly (not extracted from prompt) # Create template structure explicitly (not extracted from prompt)
# This ensures exact identity between initial and continuation prompts # This ensures exact identity between initial and continuation prompts
templateStructure = f"""{{ templateStructure = f"""{{
"elements": [ "elements": [
{{ {{
"type": "{contentType}", "type": "{effectiveContentType}",
"content": {contentStructureExample} "content": {contentStructureExample}
}} }}
] ]
@ -2022,14 +2033,14 @@ Return only valid JSON. Do not include any explanatory text outside the JSON.
prompt = f"""# TASK: Generate Section Content (Aggregation) prompt = f"""# TASK: Generate Section Content (Aggregation)
Return only valid JSON. No explanatory text, no comments, no markdown formatting outside JSON. Return only valid JSON. No explanatory text, no comments, no markdown formatting outside JSON.
If ContentParts have no data, return: {{"elements": [{{"type": "{contentType}", "content": {{"headers": [], "rows": []}}}}]}} If ContentParts have no data, return: {{"elements": [{{"type": "{effectiveContentType}", "content": {{"headers": [], "rows": []}}}}]}}
LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}. LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}.
## SECTION METADATA ## SECTION METADATA
- Section ID: {sectionId} - Section ID: {sectionId}
- Content Type: {contentType} - Content Type: {effectiveContentType}
- Generation Hint: {generationHint} - Generation Hint: {generationHint}{formatNoteAggr}
## CONTENT EFFICIENCY PRINCIPLES ## CONTENT EFFICIENCY PRINCIPLES
- Generate COMPACT content: Focus on essential facts only - Generate COMPACT content: Focus on essential facts only
@ -2044,7 +2055,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
3. If the context contains no data, return empty structures (empty rows array for tables). 3. If the context contains no data, return empty structures (empty rows array for tables).
4. Aggregate all data into one element (e.g., one table). 4. Aggregate all data into one element (e.g., one table).
5. For table: Extract all rows from the context. Return {{"headers": [...], "rows": []}} only if no data exists. 5. For table: Extract all rows from the context. Return {{"headers": [...], "rows": []}} only if no data exists.
6. Format based on content_type ({contentType}). 6. Format based on content_type ({effectiveContentType}).
7. No HTML/styling: Plain text only, no markup. 7. No HTML/styling: Plain text only, no markup.
8. CONTINUE UNTIL COMPLETE: Extract ALL data from the provided context. Do NOT stop early because you think the response might be too long. Do NOT truncate or abbreviate. Do not impose artificial limits on yourself. 8. CONTINUE UNTIL COMPLETE: Extract ALL data from the provided context. Do NOT stop early because you think the response might be too long. Do NOT truncate or abbreviate. Do not impose artificial limits on yourself.
@ -2055,7 +2066,7 @@ Return a JSON object with this structure:
{{ {{
"elements": [ "elements": [
{{ {{
"type": "{contentType}", "type": "{effectiveContentType}",
"content": {contentStructureExample} "content": {contentStructureExample}
}} }}
] ]
@ -2087,8 +2098,8 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
## SECTION METADATA ## SECTION METADATA
- Section ID: {sectionId} - Section ID: {sectionId}
- Content Type: {contentType} - Content Type: {effectiveContentType}
- Generation Hint: {generationHint} - Generation Hint: {generationHint}{formatNoteAggr}
## CONTENT EFFICIENCY PRINCIPLES ## CONTENT EFFICIENCY PRINCIPLES
- Generate COMPACT content: Focus on essential facts only - Generate COMPACT content: Focus on essential facts only
@ -2103,7 +2114,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
## INSTRUCTIONS ## INSTRUCTIONS
1. Extract data only from provided ContentParts. Never invent or generate data. 1. Extract data only from provided ContentParts. Never invent or generate data.
2. If ContentParts contain no data, return empty structures (empty rows array for tables). 2. If ContentParts contain no data, return empty structures (empty rows array for tables).
3. Format based on content_type ({contentType}). 3. Format based on content_type ({effectiveContentType}).
4. Return only valid JSON with "elements" array. 4. Return only valid JSON with "elements" array.
5. No HTML/styling: Plain text only, no markup. 5. No HTML/styling: Plain text only, no markup.
6. CONTINUE UNTIL COMPLETE: Extract ALL data from the provided context. Do NOT stop early because you think the response might be too long. Do NOT truncate or abbreviate. Do not impose artificial limits on yourself. 6. CONTINUE UNTIL COMPLETE: Extract ALL data from the provided context. Do NOT stop early because you think the response might be too long. Do NOT truncate or abbreviate. Do not impose artificial limits on yourself.
@ -2114,7 +2125,7 @@ Return a JSON object with this structure:
{{ {{
"elements": [ "elements": [
{{ {{
"type": "{contentType}", "type": "{effectiveContentType}",
"content": {contentStructureExample} "content": {contentStructureExample}
}} }}
] ]
@ -2142,8 +2153,8 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
## SECTION METADATA ## SECTION METADATA
- Section ID: {sectionId} - Section ID: {sectionId}
- Content Type: {contentType} - Content Type: {effectiveContentType}
- Generation Hint: {generationHint} - Generation Hint: {generationHint}{formatNoteAggr}
## CONTENT EFFICIENCY PRINCIPLES ## CONTENT EFFICIENCY PRINCIPLES
- Generate COMPACT content: Focus on essential facts only - Generate COMPACT content: Focus on essential facts only
@ -2154,7 +2165,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
## INSTRUCTIONS ## INSTRUCTIONS
1. Generate content based on the Generation Hint above. 1. Generate content based on the Generation Hint above.
2. Create appropriate content that matches the content_type ({contentType}). 2. Create appropriate content that matches the content_type ({effectiveContentType}).
3. The content should be relevant to the USER REQUEST and fit the context of surrounding sections. 3. The content should be relevant to the USER REQUEST and fit the context of surrounding sections.
4. Return only valid JSON with "elements" array. 4. Return only valid JSON with "elements" array.
5. No HTML/styling: Plain text only, no markup. 5. No HTML/styling: Plain text only, no markup.
@ -2166,7 +2177,7 @@ Return a JSON object with this structure:
{{ {{
"elements": [ "elements": [
{{ {{
"type": "{contentType}", "type": "{effectiveContentType}",
"content": {contentStructureExample} "content": {contentStructureExample}
}} }}
] ]
@ -2557,28 +2568,26 @@ CRITICAL:
Returns: Returns:
List of accepted section content types (e.g., ["table", "code_block"]) List of accepted section content types (e.g., ["table", "code_block"])
Raises:
ValueError: If renderer not found or doesn't provide accepted types
""" """
try: from modules.services.serviceGeneration.renderers.registry import getRenderer
from modules.services.serviceGeneration.renderers.registry import getRenderer
# Get renderer for this format # Get renderer for this format - NO FALLBACK
renderer = getRenderer(outputFormat, self.services) renderer = getRenderer(outputFormat, self.services)
if renderer and hasattr(renderer, 'getAcceptedSectionTypes'): if not renderer:
# Query renderer for accepted types raise ValueError(f"No renderer found for output format '{outputFormat}'. Check renderer registry.")
acceptedTypes = renderer.getAcceptedSectionTypes(outputFormat)
if acceptedTypes:
logger.debug(f"Renderer for format '{outputFormat}' accepts section types: {acceptedTypes}")
return acceptedTypes
# Fallback: if no renderer or method not found, return all types if not hasattr(renderer, 'getAcceptedSectionTypes'):
from modules.datamodels.datamodelJson import supportedSectionTypes raise ValueError(f"Renderer for '{outputFormat}' does not implement getAcceptedSectionTypes(). Add this method to the renderer.")
logger.debug(f"No renderer found for format '{outputFormat}' or method not available, using all section types")
return list(supportedSectionTypes)
except Exception as e: acceptedTypes = renderer.getAcceptedSectionTypes(outputFormat)
logger.warning(f"Error querying renderer for accepted section types for format '{outputFormat}': {str(e)}")
# Fallback: return all types if not acceptedTypes:
from modules.datamodels.datamodelJson import supportedSectionTypes raise ValueError(f"Renderer for '{outputFormat}' returned empty accepted types. Fix getAcceptedSectionTypes() in the renderer.")
return list(supportedSectionTypes)
logger.debug(f"Renderer for '{outputFormat}' accepts: {acceptedTypes}")
return acceptedTypes

View file

@ -24,7 +24,7 @@ from .subAutomationUtils import parseScheduleToCron, planToPrompt, replacePlaceh
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
async def chatStart(currentUser: User, userInput: UserInputRequest, workflowMode: WorkflowModeEnum, workflowId: Optional[str] = None) -> ChatWorkflow: async def chatStart(currentUser: User, userInput: UserInputRequest, workflowMode: WorkflowModeEnum, workflowId: Optional[str] = None, mandateId: Optional[str] = None) -> ChatWorkflow:
""" """
Starts a new chat or continues an existing one, then launches processing asynchronously. Starts a new chat or continues an existing one, then launches processing asynchronously.
@ -33,12 +33,13 @@ async def chatStart(currentUser: User, userInput: UserInputRequest, workflowMode
userInput: User input request userInput: User input request
workflowId: Optional workflow ID to continue existing workflow workflowId: Optional workflow ID to continue existing workflow
workflowMode: "Dynamic" for iterative dynamic-style processing, "Automation" for automated workflow execution workflowMode: "Dynamic" for iterative dynamic-style processing, "Automation" for automated workflow execution
mandateId: Mandate ID from request context (required for proper data isolation)
Example usage for Dynamic mode: Example usage for Dynamic mode:
workflow = await chatStart(currentUser, userInput, workflowMode=WorkflowModeEnum.WORKFLOW_DYNAMIC) workflow = await chatStart(currentUser, userInput, workflowMode=WorkflowModeEnum.WORKFLOW_DYNAMIC, mandateId=mandateId)
""" """
try: try:
services = getServices(currentUser, None) services = getServices(currentUser, mandateId=mandateId)
workflowManager = WorkflowManager(services) workflowManager = WorkflowManager(services)
workflow = await workflowManager.workflowStart(userInput, workflowMode, workflowId) workflow = await workflowManager.workflowStart(userInput, workflowMode, workflowId)
return workflow return workflow
@ -46,10 +47,10 @@ async def chatStart(currentUser: User, userInput: UserInputRequest, workflowMode
logger.error(f"Error starting chat: {str(e)}") logger.error(f"Error starting chat: {str(e)}")
raise raise
async def chatStop(currentUser: User, workflowId: str) -> ChatWorkflow: async def chatStop(currentUser: User, workflowId: str, mandateId: Optional[str] = None) -> ChatWorkflow:
"""Stops a running chat.""" """Stops a running chat."""
try: try:
services = getServices(currentUser, None) services = getServices(currentUser, mandateId=mandateId)
workflowManager = WorkflowManager(services) workflowManager = WorkflowManager(services)
return await workflowManager.workflowStop(workflowId) return await workflowManager.workflowStop(workflowId)
except Exception as e: except Exception as e:

View file

@ -29,6 +29,7 @@ logger = logging.getLogger(__name__)
# Configuration # Configuration
MAX_FILES_PER_EXECUTION = 50 MAX_FILES_PER_EXECUTION = 50
MAX_CONCURRENT_AI_TASKS = 10 # Limit concurrent AI calls to avoid rate limits
ALLOWED_TAGS = ["customer", "meeting", "license", "subscription", "fuel", "food", "material"] ALLOWED_TAGS = ["customer", "meeting", "license", "subscription", "fuel", "food", "material"]
RATE_LIMIT_WAIT_SECONDS = 60 RATE_LIMIT_WAIT_SECONDS = 60
@ -92,6 +93,11 @@ async def getExpensesFromPdf(self, parameters: Dict[str, Any]) -> ActionResult:
self.services.chat.progressLogFinish(operationId, False) self.services.chat.progressLogFinish(operationId, False)
return ActionResult.isFailure(error="No valid Microsoft connection found") return ActionResult.isFailure(error="No valid Microsoft connection found")
# Set access token for SharePoint service
if not self.services.sharepoint.setAccessTokenFromConnection(connection):
self.services.chat.progressLogFinish(operationId, False)
return ActionResult.isFailure(error="Failed to set SharePoint access token")
# Find site and folder info # Find site and folder info
self.services.chat.progressLogUpdate(operationId, 0.1, "Resolving SharePoint site") self.services.chat.progressLogUpdate(operationId, 0.1, "Resolving SharePoint site")
siteInfo, folderPath = await _resolveSiteAndFolder(self, sharepointFolder) siteInfo, folderPath = await _resolveSiteAndFolder(self, sharepointFolder)
@ -137,90 +143,104 @@ async def getExpensesFromPdf(self, parameters: Dict[str, Any]) -> ActionResult:
featureInstanceId=featureInstanceId featureInstanceId=featureInstanceId
) )
# Process each PDF # Process PDFs in parallel with semaphore to limit concurrent AI calls
for idx, pdfFile in enumerate(pdfFiles): semaphore = asyncio.Semaphore(MAX_CONCURRENT_AI_TASKS)
currentProgress = 0.2 + (idx * progressPerFile) completedCount = [0] # Use list for mutable reference in closure
async def processSinglePdf(idx: int, pdfFile: Dict[str, Any]) -> Dict[str, Any]:
"""Process a single PDF document. Returns result dict."""
fileName = pdfFile.get("name", f"file_{idx}") fileName = pdfFile.get("name", f"file_{idx}")
fileId = pdfFile.get("id") fileId = pdfFile.get("id")
self.services.chat.progressLogUpdate( async with semaphore:
operationId, # Update progress (thread-safe via asyncio)
currentProgress, completedCount[0] += 1
f"Processing {idx + 1}/{totalFiles}: {fileName}" currentProgress = 0.2 + (completedCount[0] * progressPerFile)
) self.services.chat.progressLogUpdate(
operationId,
min(currentProgress, 0.9),
f"Processing {completedCount[0]}/{totalFiles}: {fileName}"
)
try: try:
# Download PDF content # Download PDF content
fileContent = await self.services.sharepoint.downloadFile(siteId, fileId) fileContent = await self.services.sharepoint.downloadFile(siteId, fileId)
if not fileContent: if not fileContent:
await _moveToErrorFolder(self, siteId, folderPath, fileName) await _moveToErrorFolder(self, siteId, folderPath, fileName)
errorDocuments.append({ return {"type": "error", "file": fileName, "error": "Failed to download", "movedTo": "error/"}
# AI call to extract expense data (this is the bottleneck - parallelized)
aiResult = await _extractExpensesWithAi(self.services, fileContent, fileName, prompt, featureInstanceId)
if not aiResult.get("success"):
await _moveToErrorFolder(self, siteId, folderPath, fileName)
return {"type": "error", "file": fileName, "error": aiResult.get("error", "AI extraction failed"), "movedTo": "error/"}
records = aiResult.get("records", [])
fileId = aiResult.get("fileId")
# Check for empty records
if not records:
logger.warning(f"Document {fileName}: No records extracted, moving to error folder")
await _moveToErrorFolder(self, siteId, folderPath, fileName)
return {"type": "skipped", "file": fileName, "reason": "No expense records extracted", "movedTo": "error/"}
# Validate and enrich records
validatedRecords = _validateAndEnrichRecords(records, fileName)
# Save to TrusteePosition and create Document + Position-Document links
savedCount = _saveToTrusteePosition(
trusteeInterface,
validatedRecords,
featureInstanceId,
self.services.mandateId,
fileId=fileId,
fileName=fileName,
sourceLocation=sharepointFolder
)
# Move document to "processed" subfolder
timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
newFileName = f"{timestamp}_{fileName}"
moveSuccess = await _moveToProcessedFolder(self, siteId, folderPath, fileName, newFileName)
return {
"type": "processed",
"file": fileName, "file": fileName,
"error": "Failed to download", "newLocation": f"processed/{newFileName}" if moveSuccess else "move_failed",
"movedTo": "error/" "recordsExtracted": len(validatedRecords),
}) "recordsSaved": savedCount
continue }
# AI call to extract expense data except Exception as e:
aiResult = await _extractExpensesWithAi(self.services, fileContent, fileName, prompt, featureInstanceId) errorMsg = str(e)
logger.error(f"Error processing {fileName}: {errorMsg}")
# Handle rate limit
if "429" in errorMsg or "throttl" in errorMsg.lower():
logger.warning(f"Rate limit hit, waiting {RATE_LIMIT_WAIT_SECONDS} seconds")
await asyncio.sleep(RATE_LIMIT_WAIT_SECONDS)
if not aiResult.get("success"):
await _moveToErrorFolder(self, siteId, folderPath, fileName) await _moveToErrorFolder(self, siteId, folderPath, fileName)
errorDocuments.append({ return {"type": "error", "file": fileName, "error": errorMsg, "movedTo": "error/"}
"file": fileName,
"error": aiResult.get("error", "AI extraction failed"),
"movedTo": "error/"
})
continue
records = aiResult.get("records", []) # Execute all PDF processing tasks in parallel (limited by semaphore)
logger.info(f"Starting parallel processing of {totalFiles} PDFs (max {MAX_CONCURRENT_AI_TASKS} concurrent)")
tasks = [processSinglePdf(idx, pdfFile) for idx, pdfFile in enumerate(pdfFiles)]
results = await asyncio.gather(*tasks, return_exceptions=True)
# Check for empty records # Collect results
if not records: for result in results:
logger.warning(f"Document {fileName}: No records extracted, moving to error folder") if isinstance(result, Exception):
await _moveToErrorFolder(self, siteId, folderPath, fileName) errorDocuments.append({"file": "unknown", "error": str(result), "movedTo": "error/"})
skippedDocuments.append({ elif result.get("type") == "processed":
"file": fileName, processedDocuments.append(result)
"reason": "No expense records extracted", totalPositions += result.get("recordsSaved", 0)
"movedTo": "error/" elif result.get("type") == "skipped":
}) skippedDocuments.append(result)
continue elif result.get("type") == "error":
errorDocuments.append(result)
# Validate and enrich records
validatedRecords = _validateAndEnrichRecords(records, fileName)
# Save to TrusteePosition
savedCount = _saveToTrusteePosition(trusteeInterface, validatedRecords, featureInstanceId, self.services.mandateId)
totalPositions += savedCount
# Move document to "processed" subfolder
timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
newFileName = f"{timestamp}_{fileName}"
moveSuccess = await _moveToProcessedFolder(self, siteId, folderPath, fileName, newFileName)
processedDocuments.append({
"file": fileName,
"newLocation": f"processed/{newFileName}" if moveSuccess else "move_failed",
"recordsExtracted": len(validatedRecords),
"recordsSaved": savedCount
})
except Exception as e:
errorMsg = str(e)
logger.error(f"Error processing {fileName}: {errorMsg}")
# Handle rate limit
if "429" in errorMsg or "throttl" in errorMsg.lower():
logger.warning(f"Rate limit hit, waiting {RATE_LIMIT_WAIT_SECONDS} seconds")
await asyncio.sleep(RATE_LIMIT_WAIT_SECONDS)
await _moveToErrorFolder(self, siteId, folderPath, fileName)
errorDocuments.append({
"file": fileName,
"error": errorMsg,
"movedTo": "error/"
})
# Create result summary # Create result summary
self.services.chat.progressLogUpdate(operationId, 0.95, "Creating result summary") self.services.chat.progressLogUpdate(operationId, 0.95, "Creating result summary")
@ -423,9 +443,10 @@ async def _extractExpensesWithAi(services, fileContent: bytes, fileName: str, pr
# Step 5: Call AI with documentList - let AI service handle everything # Step 5: Call AI with documentList - let AI service handle everything
# (extraction, intent analysis, chunking, image processing) # (extraction, intent analysis, chunking, image processing)
# Use DATA_GENERATE (same path as ai.process) which handles chunking correctly
options = AiCallOptions( options = AiCallOptions(
resultFormat="csv", resultFormat="csv",
operationType=OperationTypeEnum.DATA_EXTRACT operationType=OperationTypeEnum.DATA_GENERATE
) )
aiResponse = await services.ai.callAiContent( aiResponse = await services.ai.callAiContent(
@ -433,17 +454,31 @@ async def _extractExpensesWithAi(services, fileContent: bytes, fileName: str, pr
options=options, options=options,
documentList=documentList, documentList=documentList,
contentParts=None, # Let AI service extract from documents contentParts=None, # Let AI service extract from documents
outputFormat="csv" outputFormat="csv",
generationIntent="extract" # Signal this is extraction, not document generation
) )
if not aiResponse or not aiResponse.content: if not aiResponse:
return {"success": False, "error": "AI returned empty response"} return {"success": False, "error": "AI returned empty response"}
# Parse CSV response # Get CSV from rendered documents (not from content - that's the internal structure)
csvContent = aiResponse.content if not aiResponse.documents or len(aiResponse.documents) == 0:
return {"success": False, "error": "AI returned no documents"}
# Get the CSV content from the first document
csvDocument = aiResponse.documents[0]
csvContent = csvDocument.documentData
# documentData is bytes, decode to string
if isinstance(csvContent, bytes):
csvContent = csvContent.decode('utf-8')
logger.info(f"Retrieved CSV content ({len(csvContent)} chars) from rendered document: {csvDocument.documentName}")
records = _parseCsvToRecords(csvContent) records = _parseCsvToRecords(csvContent)
return {"success": True, "records": records} # Return fileId so it can be used to create TrusteeDocument reference
return {"success": True, "records": records, "fileId": fileItem.id}
except Exception as e: except Exception as e:
logger.error(f"AI extraction error for {fileName}: {str(e)}") logger.error(f"AI extraction error for {fileName}: {str(e)}")
@ -454,8 +489,9 @@ def _parseCsvToRecords(csvContent: str) -> List[Dict[str, Any]]:
"""Parse CSV content to list of expense records.""" """Parse CSV content to list of expense records."""
records = [] records = []
try: try:
# Clean up CSV content - remove markdown code blocks if present
content = csvContent.strip() content = csvContent.strip()
# Clean up CSV content - remove markdown code blocks if present
if content.startswith("```"): if content.startswith("```"):
lines = content.split('\n') lines = content.split('\n')
# Remove first and last line if they're code block markers # Remove first and last line if they're code block markers
@ -471,6 +507,8 @@ def _parseCsvToRecords(csvContent: str) -> List[Dict[str, Any]]:
cleanedRow = {k.strip(): v.strip() if isinstance(v, str) else v for k, v in row.items()} cleanedRow = {k.strip(): v.strip() if isinstance(v, str) else v for k, v in row.items()}
records.append(cleanedRow) records.append(cleanedRow)
logger.info(f"Parsed {len(records)} records from CSV content")
except Exception as e: except Exception as e:
logger.error(f"Error parsing CSV: {str(e)}") logger.error(f"Error parsing CSV: {str(e)}")
@ -548,10 +586,54 @@ def _parseFloat(value) -> float:
return 0.0 return 0.0
def _saveToTrusteePosition(trusteeInterface, records: List[Dict[str, Any]], featureInstanceId: str, mandateId: str) -> int: def _saveToTrusteePosition(
"""Save validated records to TrusteePosition table.""" trusteeInterface,
savedCount = 0 records: List[Dict[str, Any]],
featureInstanceId: str,
mandateId: str,
fileId: Optional[str] = None,
fileName: Optional[str] = None,
sourceLocation: Optional[str] = None
) -> int:
"""
Save validated records to TrusteePosition table.
Also creates TrusteeDocument (referencing the source file) and links positions to it.
Args:
trusteeInterface: Trustee interface instance
records: List of expense records to save
featureInstanceId: Feature instance ID
mandateId: Mandate ID
fileId: Optional file ID from central Files table (source PDF)
fileName: Optional file name
sourceLocation: Optional source location (e.g., SharePoint path)
Returns:
Number of positions saved
"""
savedCount = 0
savedPositionIds = []
# Step 1: Create TrusteeDocument referencing the source file
documentId = None
if fileId and fileName:
try:
document = trusteeInterface.createDocument({
"fileId": fileId,
"documentName": fileName,
"documentMimeType": "application/pdf",
"sourceType": "sharepoint",
"sourceLocation": sourceLocation
})
if document:
documentId = document.id
logger.info(f"Created TrusteeDocument {documentId} referencing file {fileId}")
else:
logger.warning(f"Failed to create TrusteeDocument for file {fileId}")
except Exception as e:
logger.error(f"Error creating TrusteeDocument: {str(e)}")
# Step 2: Save positions
for record in records: for record in records:
try: try:
position = { position = {
@ -573,11 +655,27 @@ def _saveToTrusteePosition(trusteeInterface, records: List[Dict[str, Any]], feat
result = trusteeInterface.createPosition(position) result = trusteeInterface.createPosition(position)
if result: if result:
savedCount += 1 savedCount += 1
savedPositionIds.append(result.id)
logger.debug(f"Saved position: {position.get('company')} - {position.get('bookingAmount')}") logger.debug(f"Saved position: {position.get('company')} - {position.get('bookingAmount')}")
except Exception as e: except Exception as e:
logger.error(f"Failed to save position: {str(e)}") logger.error(f"Failed to save position: {str(e)}")
# Step 3: Create Position-Document links
if documentId and savedPositionIds:
for positionId in savedPositionIds:
try:
link = trusteeInterface.createPositionDocument({
"documentId": documentId,
"positionId": positionId
})
if link:
logger.debug(f"Created position-document link: {positionId} -> {documentId}")
else:
logger.warning(f"Failed to create position-document link: {positionId} -> {documentId}")
except Exception as e:
logger.error(f"Error creating position-document link: {str(e)}")
return savedCount return savedCount
@ -718,27 +816,16 @@ async def _deleteFile(self, siteId: str, folderPath: str, fileName: str) -> bool
if not fileId: if not fileId:
return False return False
# Delete by ID # Delete by ID using apiClient
deleteEndpoint = f"sites/{siteId}/drive/items/{fileId}" deleteEndpoint = f"sites/{siteId}/drive/items/{fileId}"
result = await self.apiClient.makeGraphApiCall(deleteEndpoint, method="DELETE")
# Make DELETE request if "error" in result:
if self.services.sharepoint.accessToken is None: logger.warning(f"Delete failed: {result['error']}")
logger.error("Access token not set for delete")
return False return False
import aiohttp logger.debug(f"Deleted file: {filePath}")
headers = {"Authorization": f"Bearer {self.services.sharepoint.accessToken}"} return True
url = f"https://graph.microsoft.com/v1.0/{deleteEndpoint}"
async with aiohttp.ClientSession() as session:
async with session.delete(url, headers=headers) as response:
if response.status in [200, 204]:
logger.debug(f"Deleted file: {filePath}")
return True
else:
errorText = await response.text()
logger.warning(f"Delete failed: {response.status} - {errorText}")
return False
except Exception as e: except Exception as e:
logger.error(f"Failed to delete file: {str(e)}") logger.error(f"Failed to delete file: {str(e)}")

View file

@ -93,6 +93,18 @@ class ApiClientHelper:
logger.error(f"Graph API call failed: {response.status} - {errorText}") logger.error(f"Graph API call failed: {response.status} - {errorText}")
return {"error": f"API call failed: {response.status} - {errorText}"} return {"error": f"API call failed: {response.status} - {errorText}"}
elif method == "DELETE":
logger.debug(f"Starting DELETE request to {url}")
async with session.delete(url, headers=headers) as response:
logger.info(f"Graph API response: {response.status}")
if response.status in [200, 204]:
logger.debug(f"Graph API DELETE success")
return {"success": True}
else:
errorText = await response.text()
logger.error(f"Graph API call failed: {response.status} - {errorText}")
return {"error": f"API call failed: {response.status} - {errorText}"}
except asyncio.TimeoutError: except asyncio.TimeoutError:
logger.error(f"Graph API call timed out after 30 seconds: {endpoint}") logger.error(f"Graph API call timed out after 30 seconds: {endpoint}")
return {"error": f"API call timed out after 30 seconds: {endpoint}"} return {"error": f"API call timed out after 30 seconds: {endpoint}"}