Fix AI call end-to-end with SaaS multi-mandate

This commit is contained in:
ValueOn AG 2026-01-26 23:26:30 +01:00
parent 5c4813b10a
commit 97cbda0ef2
11 changed files with 395 additions and 232 deletions

View file

@ -72,10 +72,16 @@ class ModelSelector:
promptSize = len(prompt.encode("utf-8"))
contextSize = len(context.encode("utf-8"))
totalSize = promptSize + contextSize
# Convert bytes to approximate tokens (1 token ≈ 4 bytes)
promptTokens = promptSize / 4
contextTokens = contextSize / 4
totalTokens = totalSize / 4
# Convert bytes to approximate tokens
# Conservative estimate: 1 token ≈ 2 bytes (for safety margin)
# Note: Actual tokenization varies by content type and model
# - English text: ~4 bytes/token
# - Structured data/JSON: ~2-3 bytes/token
# - Base64/encoded data: ~1.5-2 bytes/token
bytesPerToken = 2 # Conservative estimate for mixed content
promptTokens = promptSize / bytesPerToken
contextTokens = contextSize / bytesPerToken
totalTokens = totalSize / bytesPerToken
logger.debug(f"Request sizes - Prompt: {promptTokens:.0f} tokens ({promptSize} bytes), Context: {contextTokens:.0f} tokens ({contextSize} bytes), Total: {totalTokens:.0f} tokens ({totalSize} bytes)")

View file

@ -220,11 +220,12 @@ class ChatMessage(BaseModel):
)
role: str = Field(description="Role of the message sender")
status: str = Field(description="Status of the message (first, step, last)")
sequenceNr: int = Field(
sequenceNr: Optional[int] = Field(
default=0,
description="Sequence number of the message (set automatically)"
)
publishedAt: float = Field(
default_factory=getUtcTimestamp,
publishedAt: Optional[float] = Field(
default=None,
description="When the message was published (UTC timestamp in seconds)",
)
success: Optional[bool] = Field(
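A construction sketch under the relaxed model (other required fields elided; values illustrative):
msg = ChatMessage(role="assistant", status="step")
# sequenceNr now defaults to 0 and publishedAt to None; both are assigned
# server-side when the message is published, instead of at construction time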

View file

@ -399,7 +399,7 @@ AUTOMATION_TEMPLATES: Dict[str, Any] = {
"connectionName": "",
"sharepointFolder": "",
"featureInstanceId": "",
"extractionPrompt": "Du bist ein Spezialist für die Extraktion von Spesendaten aus PDF-Dokumenten.\n\nAUFGABE:\nExtrahiere alle Speseneinträge aus dem bereitgestellten PDF-Dokument und gib sie im CSV-Format zurück.\n\nWICHTIGE REGELN:\n1. Pro MwSt-Prozentsatz einen separaten Datensatz erstellen\n2. Alle Datensätze zusammen müssen den Gesamtbetrag des Dokuments ergeben\n3. Der gesamte extrahierte Text des Dokuments muss im Feld \"desc\" erfasst werden\n4. Feld \"company\" enthält den Lieferanten/Verkäufer der Buchung\n5. Tags müssen aus dieser Liste gewählt werden: customer, meeting, license, subscription, fuel, food, material\n - Mehrere zutreffende Tags mit Komma trennen\n\nCSV-SPALTEN (in dieser Reihenfolge):\nvaluta,transactionDateTime,company,desc,tags,bookingCurrency,bookingAmount,originalCurrency,originalAmount,vatPercentage,vatAmount\n\nDATENFORMAT:\n- valuta: YYYY-MM-DD (Valutadatum)\n- transactionDateTime: Unix-Timestamp in Sekunden (Transaktionszeitpunkt)\n- company: Lieferant/Verkäufer Name\n- desc: Vollständiger extrahierter Text des Dokuments\n- tags: Komma-getrennte Tags aus der erlaubten Liste\n- bookingCurrency: Währungscode (CHF, EUR, USD, GBP)\n- bookingAmount: Buchungsbetrag als Dezimalzahl\n- originalCurrency: Original-Währungscode\n- originalAmount: Original-Betrag als Dezimalzahl\n- vatPercentage: MwSt-Prozentsatz (z.B. 8.1 für 8.1%)\n- vatAmount: MwSt-Betrag als Dezimalzahl\n\nBEISPIEL OUTPUT:\nvaluta,transactionDateTime,company,desc,tags,bookingCurrency,bookingAmount,originalCurrency,originalAmount,vatPercentage,vatAmount\n2026-01-15,1736953200,Migros AG,\"Einkauf Migros Zürich...\",food,CHF,45.50,CHF,45.50,2.6,1.15\n2026-01-15,1736953200,Migros AG,\"Einkauf Migros Zürich...\",material,CHF,12.30,CHF,12.30,8.1,0.92\n\nHINWEISE:\n- Wenn nur ein MwSt-Satz vorhanden ist, einen Datensatz erstellen\n- Wenn mehrere MwSt-Sätze vorhanden sind (z.B. Lebensmittel 2.6% und Non-Food 8.1%), separate Datensätze erstellen\n- Bei fehlenden Informationen: leeres Feld oder Standardwert\n- Keine Anführungszeichen um numerische Werte"
"extractionPrompt": "Du bist ein Spezialist für die Extraktion von Belegdaten aus PDF-Dokumenten.\n\nAUFGABE:\nExtrahiere die Daten aus dem bereitgestellten Zahlungsbeleg und erstelle EINE EINZIGE CSV-Tabelle mit allen Datensätzen.\n\nOUTPUT-STRUKTUR:\nErstelle genau EINE Tabelle mit den folgenden Spalten. Alle extrahierten Datensätze kommen in diese eine Tabelle als Zeilen.\n\nWICHTIGE REGELN:\n1. Pro MwSt-Prozentsatz einen separaten Datensatz (= Zeile) erstellen\n2. Alle Datensätze zusammen müssen den Gesamtbetrag des Dokuments ergeben\n3. Der gesamte extrahierte Text des Dokuments muss im Feld \"desc\" erfasst werden\n4. Feld \"company\" enthält den Lieferanten/Verkäufer der Buchung\n5. Tags müssen aus dieser Liste gewählt werden: customer, meeting, license, subscription, fuel, food, material\n - Mehrere zutreffende Tags mit Komma trennen\n\nCSV-SPALTEN (in dieser Reihenfolge):\nvaluta,transactionDateTime,company,desc,tags,bookingCurrency,bookingAmount,originalCurrency,originalAmount,vatPercentage,vatAmount\n\nDATENFORMAT:\n- valuta: YYYY-MM-DD (Valutadatum)\n- transactionDateTime: Unix-Timestamp in Sekunden (Transaktionszeitpunkt)\n- company: Lieferant/Verkäufer Name\n- desc: Vollständiger extrahierter Text des Dokuments\n- tags: Komma-getrennte Tags aus der erlaubten Liste\n- bookingCurrency: Währungscode (CHF, EUR, USD, GBP)\n- bookingAmount: Buchungsbetrag als Dezimalzahl\n- originalCurrency: Original-Währungscode\n- originalAmount: Original-Betrag als Dezimalzahl\n- vatPercentage: MwSt-Prozentsatz (z.B. 8.1 für 8.1%)\n- vatAmount: MwSt-Betrag als Dezimalzahl\n\nHINWEISE:\n- Wenn nur ein MwSt-Satz vorhanden ist, einen Datensatz erstellen\n- Wenn mehrere MwSt-Sätze vorhanden sind (z.B. Lebensmittel 2.6% und Non-Food 8.1%), separate Datensätze erstellen\n- Bei fehlenden Informationen: leeres Feld oder Standardwert"
}
}
]
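For orientation, the column spec implies rows like these (taken from the example the previous prompt carried; values illustrative):
valuta,transactionDateTime,company,desc,tags,bookingCurrency,bookingAmount,originalCurrency,originalAmount,vatPercentage,vatAmount
2026-01-15,1736953200,Migros AG,"Einkauf Migros Zürich...",food,CHF,45.50,CHF,45.50,2.6,1.15
2026-01-15,1736953200,Migros AG,"Einkauf Migros Zürich...",material,CHF,12.30,CHF,12.30,8.1,0.92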

View file

@ -279,7 +279,10 @@ registerModelLabels(
class TrusteeDocument(BaseModel):
"""Contains document references and receipts for bookings.
"""Contains document references for bookings.
Documents reference files in the central Files table via fileId.
This allows file content to be stored once and referenced by multiple features.
Note: organisationId and contractId removed as per architecture decision:
- The feature instance IS the organisation
@ -294,11 +297,11 @@ class TrusteeDocument(BaseModel):
"frontend_required": False
}
)
documentData: Optional[bytes] = Field(
fileId: Optional[str] = Field(
default=None,
description="The file content (binary)",
description="Reference to central Files table (Files.id)",
json_schema_extra={
"frontend_type": "file",
"frontend_type": "file_reference",
"frontend_readonly": False,
"frontend_required": False
}
@ -321,6 +324,24 @@ class TrusteeDocument(BaseModel):
"frontend_options": "/api/trustee/mime-types/options"
}
)
sourceType: Optional[str] = Field(
default=None,
description="Source type (e.g., 'sharepoint', 'upload', 'email')",
json_schema_extra={
"frontend_type": "text",
"frontend_readonly": True,
"frontend_required": False
}
)
sourceLocation: Optional[str] = Field(
default=None,
description="Original source location (e.g., SharePoint path)",
json_schema_extra={
"frontend_type": "text",
"frontend_readonly": True,
"frontend_required": False
}
)
mandateId: Optional[str] = Field(
default=None,
description="Mandate ID (auto-set from context)",
@ -349,9 +370,11 @@ registerModelLabels(
{"en": "Document", "fr": "Document", "de": "Dokument"},
{
"id": {"en": "ID", "fr": "ID", "de": "ID"},
"documentData": {"en": "Document Data", "fr": "Données du document", "de": "Dokumentdaten"},
"fileId": {"en": "File Reference", "fr": "Référence du fichier", "de": "Datei-Referenz"},
"documentName": {"en": "Document Name", "fr": "Nom du document", "de": "Dokumentname"},
"documentMimeType": {"en": "MIME Type", "fr": "Type MIME", "de": "MIME-Typ"},
"sourceType": {"en": "Source Type", "fr": "Type de source", "de": "Quelltyp"},
"sourceLocation": {"en": "Source Location", "fr": "Emplacement source", "de": "Quellort"},
"mandateId": {"en": "Mandate", "fr": "Mandat", "de": "Mandat"},
"featureInstanceId": {"en": "Feature Instance", "fr": "Instance de fonctionnalité", "de": "Feature-Instanz"},
},
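A minimal sketch of the new reference model (mirrors the createDocument call later in this commit; values illustrative):
document = trusteeInterface.createDocument({
    "fileId": filesRowId,  # Files.id - the binary is stored once in the central Files table
    "documentName": "receipt.pdf",
    "documentMimeType": "application/pdf",
    "sourceType": "sharepoint",
    "sourceLocation": "/Shared Documents/expenses",
})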

View file

@ -177,8 +177,7 @@ class TrusteeObjects:
AccessRuleContext.DATA,
tableName,
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
if not permissions.view:
@ -205,8 +204,7 @@ class TrusteeObjects:
AccessRuleContext.DATA,
tableName,
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
if not permissions.view:
@ -270,8 +268,7 @@ class TrusteeObjects:
recordFilter=None,
orderBy="id",
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
logger.debug(f"getAllOrganisations: getRecordsetWithRBAC returned {len(records)} records")
@ -364,8 +361,7 @@ class TrusteeObjects:
recordFilter=None,
orderBy="id",
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
# Users with ALL access level (from system RBAC) see all roles
@ -475,8 +471,7 @@ class TrusteeObjects:
recordFilter=None,
orderBy="id",
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
# Users with ALL access level (from system RBAC) see all records
@ -535,8 +530,7 @@ class TrusteeObjects:
recordFilter={"organisationId": organisationId},
orderBy="id",
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
return [TrusteeAccess(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records]
@ -553,8 +547,7 @@ class TrusteeObjects:
recordFilter={"userId": userId},
orderBy="id",
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
# Users with ALL access level (from system RBAC) see all records
@ -671,8 +664,7 @@ class TrusteeObjects:
recordFilter=None,
orderBy="id",
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
totalItems = len(records)
@ -705,8 +697,7 @@ class TrusteeObjects:
recordFilter={"organisationId": organisationId},
orderBy="label",
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
return [TrusteeContract(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records]
@ -780,8 +771,8 @@ class TrusteeObjects:
createdRecord = self.db.recordCreate(TrusteeDocument, data)
if createdRecord and createdRecord.get("id"):
# Remove binary data and metadata from Pydantic model
cleanedRecord = {k: v for k, v in createdRecord.items() if not k.startswith("_") and k != "documentData"}
# Remove metadata from Pydantic model
cleanedRecord = {k: v for k, v in createdRecord.items() if not k.startswith("_")}
return TrusteeDocument(**cleanedRecord)
return None
@ -795,12 +786,25 @@ class TrusteeObjects:
return TrusteeDocument(**cleanedRecord)
def getDocumentData(self, documentId: str) -> Optional[bytes]:
"""Get document binary data."""
"""Get document binary data via fileId reference to central Files table."""
records = self.db.getRecordset(TrusteeDocument, recordFilter={"id": documentId})
record = records[0] if records else None
if record:
return record.get("documentData")
return None
if not record:
return None
# New model: fileId references central Files table
fileId = record.get("fileId")
if fileId:
from modules.interfaces.interfaceDbManagement import getInterface as getDbInterface
dbInterface = getDbInterface(self.currentUser, mandateId=self.mandateId, featureInstanceId=self.featureInstanceId)
fileData = dbInterface.getFileData(fileId)
if fileData:
return fileData
logger.warning(f"File data not found for fileId {fileId}")
return None
# Legacy fallback: documentData was stored directly (for migration)
return record.get("documentData")
def getAllDocuments(self, params: Optional[PaginationParams] = None) -> PaginatedResult:
"""Get all documents with RBAC filtering + feature-level access filtering (metadata only)."""
@ -812,8 +816,7 @@ class TrusteeObjects:
recordFilter=None,
orderBy="documentName",
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
# Convert dicts to Pydantic objects (remove binary data and internal fields)
@ -852,8 +855,7 @@ class TrusteeObjects:
recordFilter={"contractId": contractId},
orderBy="documentName",
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
result = []
@ -960,8 +962,7 @@ class TrusteeObjects:
recordFilter=None,
orderBy="valuta",
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
# Convert dicts to Pydantic objects (remove internal fields)
@ -1000,8 +1001,7 @@ class TrusteeObjects:
recordFilter={"contractId": contractId},
orderBy="valuta",
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
return [TrusteePosition(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records]
@ -1015,8 +1015,7 @@ class TrusteeObjects:
recordFilter={"organisationId": organisationId},
orderBy="valuta",
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
return [TrusteePosition(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records]
@ -1173,8 +1172,7 @@ class TrusteeObjects:
recordFilter={"positionId": positionId},
orderBy="id",
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
return [TrusteePositionDocument(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in links]
@ -1188,8 +1186,7 @@ class TrusteeObjects:
recordFilter={"documentId": documentId},
orderBy="id",
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
featureCode=self.FEATURE_CODE
featureInstanceId=self.featureInstanceId
)
return [TrusteePositionDocument(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in links]
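Every read in this file now goes through the same narrowed call (sketch; the model argument varies per call site and is an assumption here):
records = self.db.getRecordsetWithRBAC(
    TrusteePosition,
    recordFilter=None,
    orderBy="id",
    mandateId=self.mandateId,
    featureInstanceId=self.featureInstanceId,  # featureCode is no longer passed
)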

View file

@ -53,7 +53,8 @@ async def start_workflow(
"""
try:
# Start or continue workflow using playground controller
workflow = await chatStart(context.user, userInput, workflowMode, workflowId)
mandateId = str(context.mandateId) if context.mandateId else None
workflow = await chatStart(context.user, userInput, workflowMode, workflowId, mandateId=mandateId)
return workflow
@ -75,7 +76,8 @@ async def stop_workflow(
"""Stops a running workflow."""
try:
# Stop workflow using playground controller
workflow = await chatStop(context.user, workflowId)
mandateId = str(context.mandateId) if context.mandateId else None
workflow = await chatStop(context.user, workflowId, mandateId=mandateId)
return workflow

View file

@ -785,14 +785,39 @@ Respond with ONLY a JSON object in this exact format:
if part.data
])
# Call AI with extracted content
aiRequest = AiCallRequest(
prompt=f"{prompt}\n\nExtracted Content:\n{contentText}",
context="",
options=options
)
# Check content size and use chunking if needed
# Conservative estimate: 2 bytes per token, 80% of model limit for safety
contentSizeBytes = len(contentText.encode('utf-8'))
promptSizeBytes = len(prompt.encode('utf-8'))
totalSizeBytes = contentSizeBytes + promptSizeBytes
estimatedTokens = totalSizeBytes / 2 # Conservative: 2 bytes per token
aiResponse = await self.callAi(aiRequest)
# Get max model context (use Claude's 200k as reference, 80% = 160k tokens)
maxSafeTokens = 160000
if estimatedTokens > maxSafeTokens:
# Content too large - use chunking via ExtractionService
logger.warning(f"Content too large for single AI call: ~{estimatedTokens:.0f} tokens (limit: {maxSafeTokens}). Using chunked processing.")
# Use ExtractionService for chunked processing
extractionService = self.services.extraction
aiResponse = await extractionService.processContentPartsWithPrompt(
contentParts=contentParts,
prompt=prompt,
aiObjects=self.aiObjects,
options=options,
operationId=extractOperationId,
parentOperationId=parentOperationId
)
else:
# Content fits - use single AI call
aiRequest = AiCallRequest(
prompt=f"{prompt}\n\nExtracted Content:\n{contentText}",
context="",
options=options
)
aiResponse = await self.callAi(aiRequest)
# Create response document
resultDocument = DocumentData(
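The threshold arithmetic, spelled out (variable name illustrative):
# 200,000-token reference context x 0.8 safety factor = 160,000 tokens;
# at the conservative 2 bytes/token estimate, chunking therefore triggers
# once prompt + content exceed 160,000 * 2 = 320,000 bytes (~312 KiB)
maxSafeBytes = maxSafeTokens * 2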

View file

@ -567,7 +567,8 @@ class StructureFiller:
userPrompt: str,
all_sections_list: List[Dict[str, Any]],
language: str,
calculateOverallProgress: callable
outputFormat: str = "txt",
calculateOverallProgress: callable = None
) -> List[Dict[str, Any]]:
"""
Process a single section and return its elements.
@ -761,7 +762,8 @@ class StructureFiller:
allSections=all_sections_list,
sectionIndex=sectionIndex,
isAggregation=isAggregation,
language=language
language=language,
outputFormat=outputFormat
)
sectionOperationId = f"{fillOperationId}_section_{sectionId}"
@ -949,7 +951,8 @@ class StructureFiller:
allSections=all_sections_list,
sectionIndex=sectionIndex,
isAggregation=False,
language=language
language=language,
outputFormat=outputFormat
)
sectionOperationId = f"{fillOperationId}_section_{sectionId}"
@ -1214,7 +1217,8 @@ class StructureFiller:
allSections=all_sections_list,
sectionIndex=sectionIndex,
isAggregation=False,
language=language
language=language,
outputFormat=outputFormat
)
sectionOperationId = f"{fillOperationId}_section_{sectionId}"
@ -1540,6 +1544,7 @@ class StructureFiller:
for doc in chapterStructure.get("documents", []):
docId = doc.get("id", "unknown")
docLanguage = self._getDocumentLanguage(chapterStructure, docId)
docFormat = doc.get("outputFormat", "txt") # Get output format for this document
for chapter in doc.get("chapters", []):
chapterId = chapter.get("id", "unknown")
@ -1555,7 +1560,8 @@ class StructureFiller:
"sectionIndex": sectionIndex,
"chapterSectionCount": chapterSectionCount,
"section": section,
"docLanguage": docLanguage
"docLanguage": docLanguage,
"docFormat": docFormat # Include output format
})
logger.info(f"Starting FULLY PARALLEL section generation: {totalSections} sections across {totalChapters} chapters")
@ -1577,6 +1583,7 @@ class StructureFiller:
userPrompt=userPrompt,
all_sections_list=all_sections_list,
language=taskInfo["docLanguage"],
outputFormat=taskInfo.get("docFormat", "txt"), # Pass output format
calculateOverallProgress=lambda *args: completedSections[0] / totalSections if totalSections > 0 else 1.0
)
@ -1826,23 +1833,11 @@ If AVAILABLE CONTENT PARTS are listed above, then EVERY section that generates c
- If chapter's generationHint references documents/images/data AND section generates content for that chapter → section MUST assign relevant ContentParts
- Empty contentPartIds [] are only allowed if section generates content WITHOUT referencing any available ContentParts AND WITHOUT relating to chapter's generationHint
## CONTENT TYPES
Available content types for sections: table, bullet_list, heading, paragraph, code_block, image
## ACCEPTED CONTENT TYPES FOR THIS FORMAT
The document output format ({outputFormat}) accepts only the following content types:
{', '.join(acceptedSectionTypes)}
## ACCEPTED SECTION TYPES FOR THIS FORMAT
The document output format ({outputFormat}) accepts only the following section types:
{', '.join(acceptedSectionTypes) if acceptedSectionTypes else 'All section types'}
**IMPORTANT**: Only create sections with content types from the accepted list above. Do not create sections with types that are not accepted by this format.
## FORMAT-APPROPRIATE SECTION STRUCTURE
When determining which sections to create for this chapter, consider the document's output format ({outputFormat}) and ensure sections are structured appropriately for that format:
- Different formats have different capabilities and constraints
- Structure sections to match what the format can effectively represent
- Consider what content types work best for each format
- Ensure the section structure aligns with the format's strengths and limitations
- Select content types that are well-suited for the target format
- **CRITICAL**: Only use section types from the ACCEPTED SECTION TYPES list above
**CRITICAL**: Only create sections with content types from this list. Other types will fail.
useAiCall RULE (simple):
- useAiCall: true → Content needs AI processing (extract, transform, generate, filter, summarize)
@ -1853,7 +1848,7 @@ RETURN JSON:
"sections": [
{{
"id": "section_1",
"content_type": "paragraph",
"content_type": "{acceptedSectionTypes[0]}",
"contentPartIds": ["extracted_part_id"],
"generationHint": "Description of what to extract or generate",
"useAiCall": true,
@ -1897,7 +1892,8 @@ Return only valid JSON. Do not include any explanatory text outside the JSON.
allSections: Optional[List[Dict[str, Any]]] = None,
sectionIndex: Optional[int] = None,
isAggregation: bool = False,
language: str = "en"
language: str = "en",
outputFormat: str = "txt"
) -> tuple[str, str]:
"""Baue Prompt für Section-Generierung mit vollständigem Kontext."""
# Filtere None-Werte
@ -2005,14 +2001,29 @@ Return only valid JSON. Do not include any explanatory text outside the JSON.
for next in nextSections:
contextText += f"- {next['id']} ({next['content_type']}): {next['generation_hint']}\n"
contentStructureExample = self._getContentStructureExample(contentType)
# Get accepted section types for the output format
acceptedTypesAggr = self._getAcceptedSectionTypesForFormat(outputFormat)
# CRITICAL: If the section's content_type is not supported by the output format,
# use the first accepted type instead. E.g., CSV only supports 'table', so
# even if section says 'code_block', we must output as 'table'.
effectiveContentType = contentType
if contentType not in acceptedTypesAggr and acceptedTypesAggr:
effectiveContentType = acceptedTypesAggr[0]
logger.debug(f"Section {sectionId}: Content type '{contentType}' not supported by format '{outputFormat}', using '{effectiveContentType}' instead")
contentStructureExample = self._getContentStructureExample(effectiveContentType)
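# Example: the CSV renderer accepts only ["table"], so a section typed
# "code_block" is emitted as "table" here and rendering cannot fail.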
# Build format note for the prompt - purely dynamic from renderer
# Always show what types are accepted for this format
formatNoteAggr = f"\n- Target Output Format: {outputFormat.upper()} (accepted content types: {', '.join(acceptedTypesAggr)})"
# Create template structure explicitly (not extracted from prompt)
# This ensures exact identity between initial and continuation prompts
templateStructure = f"""{{
"elements": [
{{
"type": "{contentType}",
"type": "{effectiveContentType}",
"content": {contentStructureExample}
}}
]
@ -2022,14 +2033,14 @@ Return only valid JSON. Do not include any explanatory text outside the JSON.
prompt = f"""# TASK: Generate Section Content (Aggregation)
Return only valid JSON. No explanatory text, no comments, no markdown formatting outside JSON.
If ContentParts have no data, return: {{"elements": [{{"type": "{contentType}", "content": {{"headers": [], "rows": []}}}}]}}
If ContentParts have no data, return: {{"elements": [{{"type": "{effectiveContentType}", "content": {{"headers": [], "rows": []}}}}]}}
LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}.
## SECTION METADATA
- Section ID: {sectionId}
- Content Type: {contentType}
- Generation Hint: {generationHint}
- Content Type: {effectiveContentType}
- Generation Hint: {generationHint}{formatNoteAggr}
## CONTENT EFFICIENCY PRINCIPLES
- Generate COMPACT content: Focus on essential facts only
@ -2044,7 +2055,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
3. If the context contains no data, return empty structures (empty rows array for tables).
4. Aggregate all data into one element (e.g., one table).
5. For table: Extract all rows from the context. Return {{"headers": [...], "rows": []}} only if no data exists.
6. Format based on content_type ({contentType}).
6. Format based on content_type ({effectiveContentType}).
7. No HTML/styling: Plain text only, no markup.
8. CONTINUE UNTIL COMPLETE: Extract ALL data from the provided context. Do NOT stop early because you think the response might be too long. Do NOT truncate or abbreviate. Do not impose artificial limits on yourself.
@ -2055,7 +2066,7 @@ Return a JSON object with this structure:
{{
"elements": [
{{
"type": "{contentType}",
"type": "{effectiveContentType}",
"content": {contentStructureExample}
}}
]
@ -2087,8 +2098,8 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
## SECTION METADATA
- Section ID: {sectionId}
- Content Type: {contentType}
- Generation Hint: {generationHint}
- Content Type: {effectiveContentType}
- Generation Hint: {generationHint}{formatNoteAggr}
## CONTENT EFFICIENCY PRINCIPLES
- Generate COMPACT content: Focus on essential facts only
@ -2103,7 +2114,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
## INSTRUCTIONS
1. Extract data only from provided ContentParts. Never invent or generate data.
2. If ContentParts contain no data, return empty structures (empty rows array for tables).
3. Format based on content_type ({contentType}).
3. Format based on content_type ({effectiveContentType}).
4. Return only valid JSON with "elements" array.
5. No HTML/styling: Plain text only, no markup.
6. CONTINUE UNTIL COMPLETE: Extract ALL data from the provided context. Do NOT stop early because you think the response might be too long. Do NOT truncate or abbreviate. Do not impose artificial limits on yourself.
@ -2114,7 +2125,7 @@ Return a JSON object with this structure:
{{
"elements": [
{{
"type": "{contentType}",
"type": "{effectiveContentType}",
"content": {contentStructureExample}
}}
]
@ -2142,8 +2153,8 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
## SECTION METADATA
- Section ID: {sectionId}
- Content Type: {contentType}
- Generation Hint: {generationHint}
- Content Type: {effectiveContentType}
- Generation Hint: {generationHint}{formatNoteAggr}
## CONTENT EFFICIENCY PRINCIPLES
- Generate COMPACT content: Focus on essential facts only
@ -2154,7 +2165,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
## INSTRUCTIONS
1. Generate content based on the Generation Hint above.
2. Create appropriate content that matches the content_type ({contentType}).
2. Create appropriate content that matches the content_type ({effectiveContentType}).
3. The content should be relevant to the USER REQUEST and fit the context of surrounding sections.
4. Return only valid JSON with "elements" array.
5. No HTML/styling: Plain text only, no markup.
@ -2166,7 +2177,7 @@ Return a JSON object with this structure:
{{
"elements": [
{{
"type": "{contentType}",
"type": "{effectiveContentType}",
"content": {contentStructureExample}
}}
]
@ -2557,28 +2568,26 @@ CRITICAL:
Returns:
List of accepted section content types (e.g., ["table", "code_block"])
Raises:
ValueError: If renderer not found or doesn't provide accepted types
"""
try:
from modules.services.serviceGeneration.renderers.registry import getRenderer
# Get renderer for this format
renderer = getRenderer(outputFormat, self.services)
if renderer and hasattr(renderer, 'getAcceptedSectionTypes'):
# Query renderer for accepted types
acceptedTypes = renderer.getAcceptedSectionTypes(outputFormat)
if acceptedTypes:
logger.debug(f"Renderer for format '{outputFormat}' accepts section types: {acceptedTypes}")
return acceptedTypes
# Fallback: if no renderer or method not found, return all types
from modules.datamodels.datamodelJson import supportedSectionTypes
logger.debug(f"No renderer found for format '{outputFormat}' or method not available, using all section types")
return list(supportedSectionTypes)
except Exception as e:
logger.warning(f"Error querying renderer for accepted section types for format '{outputFormat}': {str(e)}")
# Fallback: return all types
from modules.datamodels.datamodelJson import supportedSectionTypes
return list(supportedSectionTypes)
from modules.services.serviceGeneration.renderers.registry import getRenderer
# Get renderer for this format - NO FALLBACK
renderer = getRenderer(outputFormat, self.services)
if not renderer:
raise ValueError(f"No renderer found for output format '{outputFormat}'. Check renderer registry.")
if not hasattr(renderer, 'getAcceptedSectionTypes'):
raise ValueError(f"Renderer for '{outputFormat}' does not implement getAcceptedSectionTypes(). Add this method to the renderer.")
acceptedTypes = renderer.getAcceptedSectionTypes(outputFormat)
if not acceptedTypes:
raise ValueError(f"Renderer for '{outputFormat}' returned empty accepted types. Fix getAcceptedSectionTypes() in the renderer.")
logger.debug(f"Renderer for '{outputFormat}' accepts: {acceptedTypes}")
return acceptedTypes
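Usage sketch of the new fail-fast contract (return value illustrative):
acceptedTypes = self._getAcceptedSectionTypesForFormat("csv")  # e.g. ["table"]
# An unregistered format, a renderer without getAcceptedSectionTypes(), or an
# empty result now raises ValueError instead of silently widening to all types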

View file

@ -24,7 +24,7 @@ from .subAutomationUtils import parseScheduleToCron, planToPrompt, replacePlaceh
logger = logging.getLogger(__name__)
async def chatStart(currentUser: User, userInput: UserInputRequest, workflowMode: WorkflowModeEnum, workflowId: Optional[str] = None) -> ChatWorkflow:
async def chatStart(currentUser: User, userInput: UserInputRequest, workflowMode: WorkflowModeEnum, workflowId: Optional[str] = None, mandateId: Optional[str] = None) -> ChatWorkflow:
"""
Starts a new chat or continues an existing one, then launches processing asynchronously.
@ -33,12 +33,13 @@ async def chatStart(currentUser: User, userInput: UserInputRequest, workflowMode
userInput: User input request
workflowId: Optional workflow ID to continue existing workflow
workflowMode: "Dynamic" for iterative dynamic-style processing, "Automation" for automated workflow execution
mandateId: Mandate ID from request context (required for proper data isolation)
Example usage for Dynamic mode:
workflow = await chatStart(currentUser, userInput, workflowMode=WorkflowModeEnum.WORKFLOW_DYNAMIC)
workflow = await chatStart(currentUser, userInput, workflowMode=WorkflowModeEnum.WORKFLOW_DYNAMIC, mandateId=mandateId)
"""
try:
services = getServices(currentUser, None)
services = getServices(currentUser, mandateId=mandateId)
workflowManager = WorkflowManager(services)
workflow = await workflowManager.workflowStart(userInput, workflowMode, workflowId)
return workflow
@ -46,10 +47,10 @@ async def chatStart(currentUser: User, userInput: UserInputRequest, workflowMode
logger.error(f"Error starting chat: {str(e)}")
raise
async def chatStop(currentUser: User, workflowId: str) -> ChatWorkflow:
async def chatStop(currentUser: User, workflowId: str, mandateId: Optional[str] = None) -> ChatWorkflow:
"""Stops a running chat."""
try:
services = getServices(currentUser, None)
services = getServices(currentUser, mandateId=mandateId)
workflowManager = WorkflowManager(services)
return await workflowManager.workflowStop(workflowId)
except Exception as e:

View file

@ -29,6 +29,7 @@ logger = logging.getLogger(__name__)
# Configuration
MAX_FILES_PER_EXECUTION = 50
MAX_CONCURRENT_AI_TASKS = 10 # Limit concurrent AI calls to avoid rate limits
ALLOWED_TAGS = ["customer", "meeting", "license", "subscription", "fuel", "food", "material"]
RATE_LIMIT_WAIT_SECONDS = 60
@ -92,6 +93,11 @@ async def getExpensesFromPdf(self, parameters: Dict[str, Any]) -> ActionResult:
self.services.chat.progressLogFinish(operationId, False)
return ActionResult.isFailure(error="No valid Microsoft connection found")
# Set access token for SharePoint service
if not self.services.sharepoint.setAccessTokenFromConnection(connection):
self.services.chat.progressLogFinish(operationId, False)
return ActionResult.isFailure(error="Failed to set SharePoint access token")
# Find site and folder info
self.services.chat.progressLogUpdate(operationId, 0.1, "Resolving SharePoint site")
siteInfo, folderPath = await _resolveSiteAndFolder(self, sharepointFolder)
@ -137,90 +143,104 @@ async def getExpensesFromPdf(self, parameters: Dict[str, Any]) -> ActionResult:
featureInstanceId=featureInstanceId
)
# Process each PDF
for idx, pdfFile in enumerate(pdfFiles):
currentProgress = 0.2 + (idx * progressPerFile)
# Process PDFs in parallel with semaphore to limit concurrent AI calls
semaphore = asyncio.Semaphore(MAX_CONCURRENT_AI_TASKS)
completedCount = [0] # Use list for mutable reference in closure
async def processSinglePdf(idx: int, pdfFile: Dict[str, Any]) -> Dict[str, Any]:
"""Process a single PDF document. Returns result dict."""
fileName = pdfFile.get("name", f"file_{idx}")
fileId = pdfFile.get("id")
self.services.chat.progressLogUpdate(
operationId,
currentProgress,
f"Processing {idx + 1}/{totalFiles}: {fileName}"
)
try:
# Download PDF content
fileContent = await self.services.sharepoint.downloadFile(siteId, fileId)
if not fileContent:
await _moveToErrorFolder(self, siteId, folderPath, fileName)
errorDocuments.append({
async with semaphore:
# Update progress counter (runs on the single-threaded event loop, so no lock is needed)
completedCount[0] += 1
currentProgress = 0.2 + (completedCount[0] * progressPerFile)
self.services.chat.progressLogUpdate(
operationId,
min(currentProgress, 0.9),
f"Processing {completedCount[0]}/{totalFiles}: {fileName}"
)
try:
# Download PDF content
fileContent = await self.services.sharepoint.downloadFile(siteId, fileId)
if not fileContent:
await _moveToErrorFolder(self, siteId, folderPath, fileName)
return {"type": "error", "file": fileName, "error": "Failed to download", "movedTo": "error/"}
# AI call to extract expense data (this is the bottleneck - parallelized)
aiResult = await _extractExpensesWithAi(self.services, fileContent, fileName, prompt, featureInstanceId)
if not aiResult.get("success"):
await _moveToErrorFolder(self, siteId, folderPath, fileName)
return {"type": "error", "file": fileName, "error": aiResult.get("error", "AI extraction failed"), "movedTo": "error/"}
records = aiResult.get("records", [])
fileId = aiResult.get("fileId")
# Check for empty records
if not records:
logger.warning(f"Document {fileName}: No records extracted, moving to error folder")
await _moveToErrorFolder(self, siteId, folderPath, fileName)
return {"type": "skipped", "file": fileName, "reason": "No expense records extracted", "movedTo": "error/"}
# Validate and enrich records
validatedRecords = _validateAndEnrichRecords(records, fileName)
# Save to TrusteePosition and create Document + Position-Document links
savedCount = _saveToTrusteePosition(
trusteeInterface,
validatedRecords,
featureInstanceId,
self.services.mandateId,
fileId=fileId,
fileName=fileName,
sourceLocation=sharepointFolder
)
# Move document to "processed" subfolder
timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
newFileName = f"{timestamp}_{fileName}"
moveSuccess = await _moveToProcessedFolder(self, siteId, folderPath, fileName, newFileName)
return {
"type": "processed",
"file": fileName,
"error": "Failed to download",
"movedTo": "error/"
})
continue
# AI call to extract expense data
aiResult = await _extractExpensesWithAi(self.services, fileContent, fileName, prompt, featureInstanceId)
if not aiResult.get("success"):
"newLocation": f"processed/{newFileName}" if moveSuccess else "move_failed",
"recordsExtracted": len(validatedRecords),
"recordsSaved": savedCount
}
except Exception as e:
errorMsg = str(e)
logger.error(f"Error processing {fileName}: {errorMsg}")
# Handle rate limit
if "429" in errorMsg or "throttl" in errorMsg.lower():
logger.warning(f"Rate limit hit, waiting {RATE_LIMIT_WAIT_SECONDS} seconds")
await asyncio.sleep(RATE_LIMIT_WAIT_SECONDS)
await _moveToErrorFolder(self, siteId, folderPath, fileName)
errorDocuments.append({
"file": fileName,
"error": aiResult.get("error", "AI extraction failed"),
"movedTo": "error/"
})
continue
records = aiResult.get("records", [])
# Check for empty records
if not records:
logger.warning(f"Document {fileName}: No records extracted, moving to error folder")
await _moveToErrorFolder(self, siteId, folderPath, fileName)
skippedDocuments.append({
"file": fileName,
"reason": "No expense records extracted",
"movedTo": "error/"
})
continue
# Validate and enrich records
validatedRecords = _validateAndEnrichRecords(records, fileName)
# Save to TrusteePosition
savedCount = _saveToTrusteePosition(trusteeInterface, validatedRecords, featureInstanceId, self.services.mandateId)
totalPositions += savedCount
# Move document to "processed" subfolder
timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
newFileName = f"{timestamp}_{fileName}"
moveSuccess = await _moveToProcessedFolder(self, siteId, folderPath, fileName, newFileName)
processedDocuments.append({
"file": fileName,
"newLocation": f"processed/{newFileName}" if moveSuccess else "move_failed",
"recordsExtracted": len(validatedRecords),
"recordsSaved": savedCount
})
except Exception as e:
errorMsg = str(e)
logger.error(f"Error processing {fileName}: {errorMsg}")
# Handle rate limit
if "429" in errorMsg or "throttl" in errorMsg.lower():
logger.warning(f"Rate limit hit, waiting {RATE_LIMIT_WAIT_SECONDS} seconds")
await asyncio.sleep(RATE_LIMIT_WAIT_SECONDS)
await _moveToErrorFolder(self, siteId, folderPath, fileName)
errorDocuments.append({
"file": fileName,
"error": errorMsg,
"movedTo": "error/"
})
return {"type": "error", "file": fileName, "error": errorMsg, "movedTo": "error/"}
# Execute all PDF processing tasks in parallel (limited by semaphore)
logger.info(f"Starting parallel processing of {totalFiles} PDFs (max {MAX_CONCURRENT_AI_TASKS} concurrent)")
tasks = [processSinglePdf(idx, pdfFile) for idx, pdfFile in enumerate(pdfFiles)]
results = await asyncio.gather(*tasks, return_exceptions=True)
# Collect results
for result in results:
if isinstance(result, Exception):
errorDocuments.append({"file": "unknown", "error": str(result), "movedTo": "error/"})
elif result.get("type") == "processed":
processedDocuments.append(result)
totalPositions += result.get("recordsSaved", 0)
elif result.get("type") == "skipped":
skippedDocuments.append(result)
elif result.get("type") == "error":
errorDocuments.append(result)
# Create result summary
self.services.chat.progressLogUpdate(operationId, 0.95, "Creating result summary")
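The concurrency pattern in isolation, as a self-contained sketch (helper name hypothetical):
import asyncio

async def boundedGather(items, worker, limit=MAX_CONCURRENT_AI_TASKS):
    semaphore = asyncio.Semaphore(limit)
    async def run(item):
        async with semaphore:  # at most `limit` workers run concurrently
            return await worker(item)
    # return_exceptions=True lets one failed PDF surface as an Exception result
    # instead of cancelling the whole batch
    return await asyncio.gather(*(run(i) for i in items), return_exceptions=True)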
@ -423,9 +443,10 @@ async def _extractExpensesWithAi(services, fileContent: bytes, fileName: str, pr
# Step 5: Call AI with documentList - let AI service handle everything
# (extraction, intent analysis, chunking, image processing)
# Use DATA_GENERATE (same path as ai.process) which handles chunking correctly
options = AiCallOptions(
resultFormat="csv",
operationType=OperationTypeEnum.DATA_EXTRACT
operationType=OperationTypeEnum.DATA_GENERATE
)
aiResponse = await services.ai.callAiContent(
@ -433,17 +454,31 @@ async def _extractExpensesWithAi(services, fileContent: bytes, fileName: str, pr
options=options,
documentList=documentList,
contentParts=None, # Let AI service extract from documents
outputFormat="csv"
outputFormat="csv",
generationIntent="extract" # Signal this is extraction, not document generation
)
if not aiResponse or not aiResponse.content:
if not aiResponse:
return {"success": False, "error": "AI returned empty response"}
# Parse CSV response
csvContent = aiResponse.content
# Get CSV from rendered documents (not from content - that's the internal structure)
if not aiResponse.documents or len(aiResponse.documents) == 0:
return {"success": False, "error": "AI returned no documents"}
# Get the CSV content from the first document
csvDocument = aiResponse.documents[0]
csvContent = csvDocument.documentData
# documentData is bytes, decode to string
if isinstance(csvContent, bytes):
csvContent = csvContent.decode('utf-8')
logger.info(f"Retrieved CSV content ({len(csvContent)} chars) from rendered document: {csvDocument.documentName}")
records = _parseCsvToRecords(csvContent)
return {"success": True, "records": records}
# Return fileId so it can be used to create TrusteeDocument reference
return {"success": True, "records": records, "fileId": fileItem.id}
except Exception as e:
logger.error(f"AI extraction error for {fileName}: {str(e)}")
@ -454,8 +489,9 @@ def _parseCsvToRecords(csvContent: str) -> List[Dict[str, Any]]:
"""Parse CSV content to list of expense records."""
records = []
try:
# Clean up CSV content - remove markdown code blocks if present
content = csvContent.strip()
# Clean up CSV content - remove markdown code blocks if present
if content.startswith("```"):
lines = content.split('\n')
# Remove first and last line if they're code block markers
@ -470,6 +506,8 @@ def _parseCsvToRecords(csvContent: str) -> List[Dict[str, Any]]:
# Clean up keys (remove whitespace)
cleanedRow = {k.strip(): v.strip() if isinstance(v, str) else v for k, v in row.items()}
records.append(cleanedRow)
logger.info(f"Parsed {len(records)} records from CSV content")
except Exception as e:
logger.error(f"Error parsing CSV: {str(e)}")
@ -548,10 +586,54 @@ def _parseFloat(value) -> float:
return 0.0
def _saveToTrusteePosition(trusteeInterface, records: List[Dict[str, Any]], featureInstanceId: str, mandateId: str) -> int:
"""Save validated records to TrusteePosition table."""
savedCount = 0
def _saveToTrusteePosition(
trusteeInterface,
records: List[Dict[str, Any]],
featureInstanceId: str,
mandateId: str,
fileId: Optional[str] = None,
fileName: Optional[str] = None,
sourceLocation: Optional[str] = None
) -> int:
"""
Save validated records to TrusteePosition table.
Also creates TrusteeDocument (referencing the source file) and links positions to it.
Args:
trusteeInterface: Trustee interface instance
records: List of expense records to save
featureInstanceId: Feature instance ID
mandateId: Mandate ID
fileId: Optional file ID from central Files table (source PDF)
fileName: Optional file name
sourceLocation: Optional source location (e.g., SharePoint path)
Returns:
Number of positions saved
"""
savedCount = 0
savedPositionIds = []
# Step 1: Create TrusteeDocument referencing the source file
documentId = None
if fileId and fileName:
try:
document = trusteeInterface.createDocument({
"fileId": fileId,
"documentName": fileName,
"documentMimeType": "application/pdf",
"sourceType": "sharepoint",
"sourceLocation": sourceLocation
})
if document:
documentId = document.id
logger.info(f"Created TrusteeDocument {documentId} referencing file {fileId}")
else:
logger.warning(f"Failed to create TrusteeDocument for file {fileId}")
except Exception as e:
logger.error(f"Error creating TrusteeDocument: {str(e)}")
# Step 2: Save positions
for record in records:
try:
position = {
@ -573,11 +655,27 @@ def _saveToTrusteePosition(trusteeInterface, records: List[Dict[str, Any]], feat
result = trusteeInterface.createPosition(position)
if result:
savedCount += 1
savedPositionIds.append(result.id)
logger.debug(f"Saved position: {position.get('company')} - {position.get('bookingAmount')}")
except Exception as e:
logger.error(f"Failed to save position: {str(e)}")
# Step 3: Create Position-Document links
if documentId and savedPositionIds:
for positionId in savedPositionIds:
try:
link = trusteeInterface.createPositionDocument({
"documentId": documentId,
"positionId": positionId
})
if link:
logger.debug(f"Created position-document link: {positionId} -> {documentId}")
else:
logger.warning(f"Failed to create position-document link: {positionId} -> {documentId}")
except Exception as e:
logger.error(f"Error creating position-document link: {str(e)}")
return savedCount
@ -718,27 +816,16 @@ async def _deleteFile(self, siteId: str, folderPath: str, fileName: str) -> bool
if not fileId:
return False
# Delete by ID
# Delete by ID using apiClient
deleteEndpoint = f"sites/{siteId}/drive/items/{fileId}"
result = await self.apiClient.makeGraphApiCall(deleteEndpoint, method="DELETE")
# Make DELETE request
if self.services.sharepoint.accessToken is None:
logger.error("Access token not set for delete")
if "error" in result:
logger.warning(f"Delete failed: {result['error']}")
return False
import aiohttp
headers = {"Authorization": f"Bearer {self.services.sharepoint.accessToken}"}
url = f"https://graph.microsoft.com/v1.0/{deleteEndpoint}"
async with aiohttp.ClientSession() as session:
async with session.delete(url, headers=headers) as response:
if response.status in [200, 204]:
logger.debug(f"Deleted file: {filePath}")
return True
else:
errorText = await response.text()
logger.warning(f"Delete failed: {response.status} - {errorText}")
return False
logger.debug(f"Deleted file: {filePath}")
return True
except Exception as e:
logger.error(f"Failed to delete file: {str(e)}")

View file

@ -92,6 +92,18 @@ class ApiClientHelper:
errorText = await response.text()
logger.error(f"Graph API call failed: {response.status} - {errorText}")
return {"error": f"API call failed: {response.status} - {errorText}"}
elif method == "DELETE":
logger.debug(f"Starting DELETE request to {url}")
async with session.delete(url, headers=headers) as response:
logger.info(f"Graph API response: {response.status}")
if response.status in [200, 204]:
logger.debug(f"Graph API DELETE success")
return {"success": True}
else:
errorText = await response.text()
logger.error(f"Graph API call failed: {response.status} - {errorText}")
return {"error": f"API call failed: {response.status} - {errorText}"}
except asyncio.TimeoutError:
logger.error(f"Graph API call timed out after 30 seconds: {endpoint}")