fixed ai call end to end with saas multimandate
This commit is contained in:
parent
5c4813b10a
commit
97cbda0ef2
11 changed files with 395 additions and 232 deletions
|
|
@ -72,10 +72,16 @@ class ModelSelector:
|
|||
promptSize = len(prompt.encode("utf-8"))
|
||||
contextSize = len(context.encode("utf-8"))
|
||||
totalSize = promptSize + contextSize
|
||||
# Convert bytes to approximate tokens (1 token ≈ 4 bytes)
|
||||
promptTokens = promptSize / 4
|
||||
contextTokens = contextSize / 4
|
||||
totalTokens = totalSize / 4
|
||||
# Convert bytes to approximate tokens
|
||||
# Conservative estimate: 1 token ≈ 2 bytes (for safety margin)
|
||||
# Note: Actual tokenization varies by content type and model
|
||||
# - English text: ~4 bytes/token
|
||||
# - Structured data/JSON: ~2-3 bytes/token
|
||||
# - Base64/encoded data: ~1.5-2 bytes/token
|
||||
bytesPerToken = 2 # Conservative estimate for mixed content
|
||||
promptTokens = promptSize / bytesPerToken
|
||||
contextTokens = contextSize / bytesPerToken
|
||||
totalTokens = totalSize / bytesPerToken
|
||||
|
||||
logger.debug(f"Request sizes - Prompt: {promptTokens:.0f} tokens ({promptSize} bytes), Context: {contextTokens:.0f} tokens ({contextSize} bytes), Total: {totalTokens:.0f} tokens ({totalSize} bytes)")
|
||||
|
||||
|
|
|
|||
|
|
@ -220,11 +220,12 @@ class ChatMessage(BaseModel):
|
|||
)
|
||||
role: str = Field(description="Role of the message sender")
|
||||
status: str = Field(description="Status of the message (first, step, last)")
|
||||
sequenceNr: int = Field(
|
||||
sequenceNr: Optional[int] = Field(
|
||||
default=0,
|
||||
description="Sequence number of the message (set automatically)"
|
||||
)
|
||||
publishedAt: float = Field(
|
||||
default_factory=getUtcTimestamp,
|
||||
publishedAt: Optional[float] = Field(
|
||||
default=None,
|
||||
description="When the message was published (UTC timestamp in seconds)",
|
||||
)
|
||||
success: Optional[bool] = Field(
|
||||
|
|
|
|||
|
|
@ -399,7 +399,7 @@ AUTOMATION_TEMPLATES: Dict[str, Any] = {
|
|||
"connectionName": "",
|
||||
"sharepointFolder": "",
|
||||
"featureInstanceId": "",
|
||||
"extractionPrompt": "Du bist ein Spezialist für die Extraktion von Spesendaten aus PDF-Dokumenten.\n\nAUFGABE:\nExtrahiere alle Speseneinträge aus dem bereitgestellten PDF-Dokument und gib sie im CSV-Format zurück.\n\nWICHTIGE REGELN:\n1. Pro MwSt-Prozentsatz einen separaten Datensatz erstellen\n2. Alle Datensätze zusammen müssen den Gesamtbetrag des Dokuments ergeben\n3. Der gesamte extrahierte Text des Dokuments muss im Feld \"desc\" erfasst werden\n4. Feld \"company\" enthält den Lieferanten/Verkäufer der Buchung\n5. Tags müssen aus dieser Liste gewählt werden: customer, meeting, license, subscription, fuel, food, material\n - Mehrere zutreffende Tags mit Komma trennen\n\nCSV-SPALTEN (in dieser Reihenfolge):\nvaluta,transactionDateTime,company,desc,tags,bookingCurrency,bookingAmount,originalCurrency,originalAmount,vatPercentage,vatAmount\n\nDATENFORMAT:\n- valuta: YYYY-MM-DD (Valutadatum)\n- transactionDateTime: Unix-Timestamp in Sekunden (Transaktionszeitpunkt)\n- company: Lieferant/Verkäufer Name\n- desc: Vollständiger extrahierter Text des Dokuments\n- tags: Komma-getrennte Tags aus der erlaubten Liste\n- bookingCurrency: Währungscode (CHF, EUR, USD, GBP)\n- bookingAmount: Buchungsbetrag als Dezimalzahl\n- originalCurrency: Original-Währungscode\n- originalAmount: Original-Betrag als Dezimalzahl\n- vatPercentage: MwSt-Prozentsatz (z.B. 8.1 für 8.1%)\n- vatAmount: MwSt-Betrag als Dezimalzahl\n\nBEISPIEL OUTPUT:\nvaluta,transactionDateTime,company,desc,tags,bookingCurrency,bookingAmount,originalCurrency,originalAmount,vatPercentage,vatAmount\n2026-01-15,1736953200,Migros AG,\"Einkauf Migros Zürich...\",food,CHF,45.50,CHF,45.50,2.6,1.15\n2026-01-15,1736953200,Migros AG,\"Einkauf Migros Zürich...\",material,CHF,12.30,CHF,12.30,8.1,0.92\n\nHINWEISE:\n- Wenn nur ein MwSt-Satz vorhanden ist, einen Datensatz erstellen\n- Wenn mehrere MwSt-Sätze vorhanden sind (z.B. Lebensmittel 2.6% und Non-Food 8.1%), separate Datensätze erstellen\n- Bei fehlenden Informationen: leeres Feld oder Standardwert\n- Keine Anführungszeichen um numerische Werte"
|
||||
"extractionPrompt": "Du bist ein Spezialist für die Extraktion von Belegdaten aus PDF-Dokumenten.\n\nAUFGABE:\nExtrahiere die Daten aus dem bereitgestellten Zahlungsbeleg und erstelle EINE EINZIGE CSV-Tabelle mit allen Datensätzen.\n\nOUTPUT-STRUKTUR:\nErstelle genau EINE Tabelle mit den folgenden Spalten. Alle extrahierten Datensätze kommen in diese eine Tabelle als Zeilen.\n\nWICHTIGE REGELN:\n1. Pro MwSt-Prozentsatz einen separaten Datensatz (= Zeile) erstellen\n2. Alle Datensätze zusammen müssen den Gesamtbetrag des Dokuments ergeben\n3. Der gesamte extrahierte Text des Dokuments muss im Feld \"desc\" erfasst werden\n4. Feld \"company\" enthält den Lieferanten/Verkäufer der Buchung\n5. Tags müssen aus dieser Liste gewählt werden: customer, meeting, license, subscription, fuel, food, material\n - Mehrere zutreffende Tags mit Komma trennen\n\nCSV-SPALTEN (in dieser Reihenfolge):\nvaluta,transactionDateTime,company,desc,tags,bookingCurrency,bookingAmount,originalCurrency,originalAmount,vatPercentage,vatAmount\n\nDATENFORMAT:\n- valuta: YYYY-MM-DD (Valutadatum)\n- transactionDateTime: Unix-Timestamp in Sekunden (Transaktionszeitpunkt)\n- company: Lieferant/Verkäufer Name\n- desc: Vollständiger extrahierter Text des Dokuments\n- tags: Komma-getrennte Tags aus der erlaubten Liste\n- bookingCurrency: Währungscode (CHF, EUR, USD, GBP)\n- bookingAmount: Buchungsbetrag als Dezimalzahl\n- originalCurrency: Original-Währungscode\n- originalAmount: Original-Betrag als Dezimalzahl\n- vatPercentage: MwSt-Prozentsatz (z.B. 8.1 für 8.1%)\n- vatAmount: MwSt-Betrag als Dezimalzahl\n\nHINWEISE:\n- Wenn nur ein MwSt-Satz vorhanden ist, einen Datensatz erstellen\n- Wenn mehrere MwSt-Sätze vorhanden sind (z.B. Lebensmittel 2.6% und Non-Food 8.1%), separate Datensätze erstellen\n- Bei fehlenden Informationen: leeres Feld oder Standardwert"
|
||||
}
|
||||
}
|
||||
]
|
||||
|
|
|
|||
|
|
@ -279,7 +279,10 @@ registerModelLabels(
|
|||
|
||||
|
||||
class TrusteeDocument(BaseModel):
|
||||
"""Contains document references and receipts for bookings.
|
||||
"""Contains document references for bookings.
|
||||
|
||||
Documents reference files in the central Files table via fileId.
|
||||
This allows file content to be stored once and referenced by multiple features.
|
||||
|
||||
Note: organisationId and contractId removed as per architecture decision:
|
||||
- The feature instance IS the organisation
|
||||
|
|
@ -294,11 +297,11 @@ class TrusteeDocument(BaseModel):
|
|||
"frontend_required": False
|
||||
}
|
||||
)
|
||||
documentData: Optional[bytes] = Field(
|
||||
fileId: Optional[str] = Field(
|
||||
default=None,
|
||||
description="The file content (binary)",
|
||||
description="Reference to central Files table (Files.id)",
|
||||
json_schema_extra={
|
||||
"frontend_type": "file",
|
||||
"frontend_type": "file_reference",
|
||||
"frontend_readonly": False,
|
||||
"frontend_required": False
|
||||
}
|
||||
|
|
@ -321,6 +324,24 @@ class TrusteeDocument(BaseModel):
|
|||
"frontend_options": "/api/trustee/mime-types/options"
|
||||
}
|
||||
)
|
||||
sourceType: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Source type (e.g., 'sharepoint', 'upload', 'email')",
|
||||
json_schema_extra={
|
||||
"frontend_type": "text",
|
||||
"frontend_readonly": True,
|
||||
"frontend_required": False
|
||||
}
|
||||
)
|
||||
sourceLocation: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Original source location (e.g., SharePoint path)",
|
||||
json_schema_extra={
|
||||
"frontend_type": "text",
|
||||
"frontend_readonly": True,
|
||||
"frontend_required": False
|
||||
}
|
||||
)
|
||||
mandateId: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Mandate ID (auto-set from context)",
|
||||
|
|
@ -349,9 +370,11 @@ registerModelLabels(
|
|||
{"en": "Document", "fr": "Document", "de": "Dokument"},
|
||||
{
|
||||
"id": {"en": "ID", "fr": "ID", "de": "ID"},
|
||||
"documentData": {"en": "Document Data", "fr": "Données du document", "de": "Dokumentdaten"},
|
||||
"fileId": {"en": "File Reference", "fr": "Référence du fichier", "de": "Datei-Referenz"},
|
||||
"documentName": {"en": "Document Name", "fr": "Nom du document", "de": "Dokumentname"},
|
||||
"documentMimeType": {"en": "MIME Type", "fr": "Type MIME", "de": "MIME-Typ"},
|
||||
"sourceType": {"en": "Source Type", "fr": "Type de source", "de": "Quelltyp"},
|
||||
"sourceLocation": {"en": "Source Location", "fr": "Emplacement source", "de": "Quellort"},
|
||||
"mandateId": {"en": "Mandate", "fr": "Mandat", "de": "Mandat"},
|
||||
"featureInstanceId": {"en": "Feature Instance", "fr": "Instance de fonctionnalité", "de": "Feature-Instanz"},
|
||||
},
|
||||
|
|
|
|||
|
|
@ -177,8 +177,7 @@ class TrusteeObjects:
|
|||
AccessRuleContext.DATA,
|
||||
tableName,
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
|
||||
if not permissions.view:
|
||||
|
|
@ -205,8 +204,7 @@ class TrusteeObjects:
|
|||
AccessRuleContext.DATA,
|
||||
tableName,
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
|
||||
if not permissions.view:
|
||||
|
|
@ -270,8 +268,7 @@ class TrusteeObjects:
|
|||
recordFilter=None,
|
||||
orderBy="id",
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
logger.debug(f"getAllOrganisations: getRecordsetWithRBAC returned {len(records)} records")
|
||||
|
||||
|
|
@ -364,8 +361,7 @@ class TrusteeObjects:
|
|||
recordFilter=None,
|
||||
orderBy="id",
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
|
||||
# Users with ALL access level (from system RBAC) see all roles
|
||||
|
|
@ -475,8 +471,7 @@ class TrusteeObjects:
|
|||
recordFilter=None,
|
||||
orderBy="id",
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
|
||||
# Users with ALL access level (from system RBAC) see all records
|
||||
|
|
@ -535,8 +530,7 @@ class TrusteeObjects:
|
|||
recordFilter={"organisationId": organisationId},
|
||||
orderBy="id",
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
return [TrusteeAccess(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records]
|
||||
|
||||
|
|
@ -553,8 +547,7 @@ class TrusteeObjects:
|
|||
recordFilter={"userId": userId},
|
||||
orderBy="id",
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
|
||||
# Users with ALL access level (from system RBAC) see all records
|
||||
|
|
@ -671,8 +664,7 @@ class TrusteeObjects:
|
|||
recordFilter=None,
|
||||
orderBy="id",
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
|
||||
totalItems = len(records)
|
||||
|
|
@ -705,8 +697,7 @@ class TrusteeObjects:
|
|||
recordFilter={"organisationId": organisationId},
|
||||
orderBy="label",
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
return [TrusteeContract(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records]
|
||||
|
||||
|
|
@ -780,8 +771,8 @@ class TrusteeObjects:
|
|||
|
||||
createdRecord = self.db.recordCreate(TrusteeDocument, data)
|
||||
if createdRecord and createdRecord.get("id"):
|
||||
# Remove binary data and metadata from Pydantic model
|
||||
cleanedRecord = {k: v for k, v in createdRecord.items() if not k.startswith("_") and k != "documentData"}
|
||||
# Remove metadata from Pydantic model
|
||||
cleanedRecord = {k: v for k, v in createdRecord.items() if not k.startswith("_")}
|
||||
return TrusteeDocument(**cleanedRecord)
|
||||
return None
|
||||
|
||||
|
|
@ -795,13 +786,26 @@ class TrusteeObjects:
|
|||
return TrusteeDocument(**cleanedRecord)
|
||||
|
||||
def getDocumentData(self, documentId: str) -> Optional[bytes]:
|
||||
"""Get document binary data."""
|
||||
"""Get document binary data via fileId reference to central Files table."""
|
||||
records = self.db.getRecordset(TrusteeDocument, recordFilter={"id": documentId})
|
||||
record = records[0] if records else None
|
||||
if record:
|
||||
return record.get("documentData")
|
||||
if not record:
|
||||
return None
|
||||
|
||||
# New model: fileId references central Files table
|
||||
fileId = record.get("fileId")
|
||||
if fileId:
|
||||
from modules.interfaces.interfaceDbManagement import getInterface as getDbInterface
|
||||
dbInterface = getDbInterface(self.currentUser, mandateId=self.mandateId, featureInstanceId=self.featureInstanceId)
|
||||
fileData = dbInterface.getFileData(fileId)
|
||||
if fileData:
|
||||
return fileData
|
||||
logger.warning(f"File data not found for fileId {fileId}")
|
||||
return None
|
||||
|
||||
# Legacy fallback: documentData was stored directly (for migration)
|
||||
return record.get("documentData")
|
||||
|
||||
def getAllDocuments(self, params: Optional[PaginationParams] = None) -> PaginatedResult:
|
||||
"""Get all documents with RBAC filtering + feature-level access filtering (metadata only)."""
|
||||
# Step 1: System RBAC filtering
|
||||
|
|
@ -812,8 +816,7 @@ class TrusteeObjects:
|
|||
recordFilter=None,
|
||||
orderBy="documentName",
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
|
||||
# Convert dicts to Pydantic objects (remove binary data and internal fields)
|
||||
|
|
@ -852,8 +855,7 @@ class TrusteeObjects:
|
|||
recordFilter={"contractId": contractId},
|
||||
orderBy="documentName",
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
|
||||
result = []
|
||||
|
|
@ -960,8 +962,7 @@ class TrusteeObjects:
|
|||
recordFilter=None,
|
||||
orderBy="valuta",
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
|
||||
# Convert dicts to Pydantic objects (remove internal fields)
|
||||
|
|
@ -1000,8 +1001,7 @@ class TrusteeObjects:
|
|||
recordFilter={"contractId": contractId},
|
||||
orderBy="valuta",
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
return [TrusteePosition(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records]
|
||||
|
||||
|
|
@ -1015,8 +1015,7 @@ class TrusteeObjects:
|
|||
recordFilter={"organisationId": organisationId},
|
||||
orderBy="valuta",
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
return [TrusteePosition(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in records]
|
||||
|
||||
|
|
@ -1173,8 +1172,7 @@ class TrusteeObjects:
|
|||
recordFilter={"positionId": positionId},
|
||||
orderBy="id",
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
return [TrusteePositionDocument(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in links]
|
||||
|
||||
|
|
@ -1188,8 +1186,7 @@ class TrusteeObjects:
|
|||
recordFilter={"documentId": documentId},
|
||||
orderBy="id",
|
||||
mandateId=self.mandateId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.FEATURE_CODE
|
||||
featureInstanceId=self.featureInstanceId
|
||||
)
|
||||
return [TrusteePositionDocument(**{k: v for k, v in r.items() if not k.startswith("_")}) for r in links]
|
||||
|
||||
|
|
|
|||
|
|
@ -53,7 +53,8 @@ async def start_workflow(
|
|||
"""
|
||||
try:
|
||||
# Start or continue workflow using playground controller
|
||||
workflow = await chatStart(context.user, userInput, workflowMode, workflowId)
|
||||
mandateId = str(context.mandateId) if context.mandateId else None
|
||||
workflow = await chatStart(context.user, userInput, workflowMode, workflowId, mandateId=mandateId)
|
||||
|
||||
return workflow
|
||||
|
||||
|
|
@ -75,7 +76,8 @@ async def stop_workflow(
|
|||
"""Stops a running workflow."""
|
||||
try:
|
||||
# Stop workflow using playground controller
|
||||
workflow = await chatStop(context.user, workflowId)
|
||||
mandateId = str(context.mandateId) if context.mandateId else None
|
||||
workflow = await chatStop(context.user, workflowId, mandateId=mandateId)
|
||||
|
||||
return workflow
|
||||
|
||||
|
|
|
|||
|
|
@ -785,7 +785,32 @@ Respond with ONLY a JSON object in this exact format:
|
|||
if part.data
|
||||
])
|
||||
|
||||
# Call AI with extracted content
|
||||
# Check content size and use chunking if needed
|
||||
# Conservative estimate: 2 bytes per token, 80% of model limit for safety
|
||||
contentSizeBytes = len(contentText.encode('utf-8'))
|
||||
promptSizeBytes = len(prompt.encode('utf-8'))
|
||||
totalSizeBytes = contentSizeBytes + promptSizeBytes
|
||||
estimatedTokens = totalSizeBytes / 2 # Conservative: 2 bytes per token
|
||||
|
||||
# Get max model context (use Claude's 200k as reference, 80% = 160k tokens)
|
||||
maxSafeTokens = 160000
|
||||
|
||||
if estimatedTokens > maxSafeTokens:
|
||||
# Content too large - use chunking via ExtractionService
|
||||
logger.warning(f"Content too large for single AI call: ~{estimatedTokens:.0f} tokens (limit: {maxSafeTokens}). Using chunked processing.")
|
||||
|
||||
# Use ExtractionService for chunked processing
|
||||
extractionService = self.services.extraction
|
||||
aiResponse = await extractionService.processContentPartsWithPrompt(
|
||||
contentParts=contentParts,
|
||||
prompt=prompt,
|
||||
aiObjects=self.aiObjects,
|
||||
options=options,
|
||||
operationId=extractOperationId,
|
||||
parentOperationId=parentOperationId
|
||||
)
|
||||
else:
|
||||
# Content fits - use single AI call
|
||||
aiRequest = AiCallRequest(
|
||||
prompt=f"{prompt}\n\nExtracted Content:\n{contentText}",
|
||||
context="",
|
||||
|
|
|
|||
|
|
@ -567,7 +567,8 @@ class StructureFiller:
|
|||
userPrompt: str,
|
||||
all_sections_list: List[Dict[str, Any]],
|
||||
language: str,
|
||||
calculateOverallProgress: callable
|
||||
outputFormat: str = "txt",
|
||||
calculateOverallProgress: callable = None
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process a single section and return its elements.
|
||||
|
|
@ -761,7 +762,8 @@ class StructureFiller:
|
|||
allSections=all_sections_list,
|
||||
sectionIndex=sectionIndex,
|
||||
isAggregation=isAggregation,
|
||||
language=language
|
||||
language=language,
|
||||
outputFormat=outputFormat
|
||||
)
|
||||
|
||||
sectionOperationId = f"{fillOperationId}_section_{sectionId}"
|
||||
|
|
@ -949,7 +951,8 @@ class StructureFiller:
|
|||
allSections=all_sections_list,
|
||||
sectionIndex=sectionIndex,
|
||||
isAggregation=False,
|
||||
language=language
|
||||
language=language,
|
||||
outputFormat=outputFormat
|
||||
)
|
||||
|
||||
sectionOperationId = f"{fillOperationId}_section_{sectionId}"
|
||||
|
|
@ -1214,7 +1217,8 @@ class StructureFiller:
|
|||
allSections=all_sections_list,
|
||||
sectionIndex=sectionIndex,
|
||||
isAggregation=False,
|
||||
language=language
|
||||
language=language,
|
||||
outputFormat=outputFormat
|
||||
)
|
||||
|
||||
sectionOperationId = f"{fillOperationId}_section_{sectionId}"
|
||||
|
|
@ -1540,6 +1544,7 @@ class StructureFiller:
|
|||
for doc in chapterStructure.get("documents", []):
|
||||
docId = doc.get("id", "unknown")
|
||||
docLanguage = self._getDocumentLanguage(chapterStructure, docId)
|
||||
docFormat = doc.get("outputFormat", "txt") # Get output format for this document
|
||||
|
||||
for chapter in doc.get("chapters", []):
|
||||
chapterId = chapter.get("id", "unknown")
|
||||
|
|
@ -1555,7 +1560,8 @@ class StructureFiller:
|
|||
"sectionIndex": sectionIndex,
|
||||
"chapterSectionCount": chapterSectionCount,
|
||||
"section": section,
|
||||
"docLanguage": docLanguage
|
||||
"docLanguage": docLanguage,
|
||||
"docFormat": docFormat # Include output format
|
||||
})
|
||||
|
||||
logger.info(f"Starting FULLY PARALLEL section generation: {totalSections} sections across {totalChapters} chapters")
|
||||
|
|
@ -1577,6 +1583,7 @@ class StructureFiller:
|
|||
userPrompt=userPrompt,
|
||||
all_sections_list=all_sections_list,
|
||||
language=taskInfo["docLanguage"],
|
||||
outputFormat=taskInfo.get("docFormat", "txt"), # Pass output format
|
||||
calculateOverallProgress=lambda *args: completedSections[0] / totalSections if totalSections > 0 else 1.0
|
||||
)
|
||||
|
||||
|
|
@ -1826,23 +1833,11 @@ If AVAILABLE CONTENT PARTS are listed above, then EVERY section that generates c
|
|||
- If chapter's generationHint references documents/images/data AND section generates content for that chapter → section MUST assign relevant ContentParts
|
||||
- Empty contentPartIds [] are only allowed if section generates content WITHOUT referencing any available ContentParts AND WITHOUT relating to chapter's generationHint
|
||||
|
||||
## CONTENT TYPES
|
||||
Available content types for sections: table, bullet_list, heading, paragraph, code_block, image
|
||||
## ACCEPTED CONTENT TYPES FOR THIS FORMAT
|
||||
The document output format ({outputFormat}) accepts only the following content types:
|
||||
{', '.join(acceptedSectionTypes)}
|
||||
|
||||
## ACCEPTED SECTION TYPES FOR THIS FORMAT
|
||||
The document output format ({outputFormat}) accepts only the following section types:
|
||||
{', '.join(acceptedSectionTypes) if acceptedSectionTypes else 'All section types'}
|
||||
|
||||
**IMPORTANT**: Only create sections with content types from the accepted list above. Do not create sections with types that are not accepted by this format.
|
||||
|
||||
## FORMAT-APPROPRIATE SECTION STRUCTURE
|
||||
When determining which sections to create for this chapter, consider the document's output format ({outputFormat}) and ensure sections are structured appropriately for that format:
|
||||
- Different formats have different capabilities and constraints
|
||||
- Structure sections to match what the format can effectively represent
|
||||
- Consider what content types work best for each format
|
||||
- Ensure the section structure aligns with the format's strengths and limitations
|
||||
- Select content types that are well-suited for the target format
|
||||
- **CRITICAL**: Only use section types from the ACCEPTED SECTION TYPES list above
|
||||
**CRITICAL**: Only create sections with content types from this list. Other types will fail.
|
||||
|
||||
useAiCall RULE (simple):
|
||||
- useAiCall: true → Content needs AI processing (extract, transform, generate, filter, summarize)
|
||||
|
|
@ -1853,7 +1848,7 @@ RETURN JSON:
|
|||
"sections": [
|
||||
{{
|
||||
"id": "section_1",
|
||||
"content_type": "paragraph",
|
||||
"content_type": "{acceptedSectionTypes[0]}",
|
||||
"contentPartIds": ["extracted_part_id"],
|
||||
"generationHint": "Description of what to extract or generate",
|
||||
"useAiCall": true,
|
||||
|
|
@ -1897,7 +1892,8 @@ Return only valid JSON. Do not include any explanatory text outside the JSON.
|
|||
allSections: Optional[List[Dict[str, Any]]] = None,
|
||||
sectionIndex: Optional[int] = None,
|
||||
isAggregation: bool = False,
|
||||
language: str = "en"
|
||||
language: str = "en",
|
||||
outputFormat: str = "txt"
|
||||
) -> tuple[str, str]:
|
||||
"""Baue Prompt für Section-Generierung mit vollständigem Kontext."""
|
||||
# Filtere None-Werte
|
||||
|
|
@ -2005,14 +2001,29 @@ Return only valid JSON. Do not include any explanatory text outside the JSON.
|
|||
for next in nextSections:
|
||||
contextText += f"- {next['id']} ({next['content_type']}): {next['generation_hint']}\n"
|
||||
|
||||
contentStructureExample = self._getContentStructureExample(contentType)
|
||||
# Get accepted section types for the output format
|
||||
acceptedTypesAggr = self._getAcceptedSectionTypesForFormat(outputFormat)
|
||||
|
||||
# CRITICAL: If the section's content_type is not supported by the output format,
|
||||
# use the first accepted type instead. E.g., CSV only supports 'table', so
|
||||
# even if section says 'code_block', we must output as 'table'.
|
||||
effectiveContentType = contentType
|
||||
if contentType not in acceptedTypesAggr and acceptedTypesAggr:
|
||||
effectiveContentType = acceptedTypesAggr[0]
|
||||
logger.debug(f"Section {sectionId}: Content type '{contentType}' not supported by format '{outputFormat}', using '{effectiveContentType}' instead")
|
||||
|
||||
contentStructureExample = self._getContentStructureExample(effectiveContentType)
|
||||
|
||||
# Build format note for the prompt - purely dynamic from renderer
|
||||
# Always show what types are accepted for this format
|
||||
formatNoteAggr = f"\n- Target Output Format: {outputFormat.upper()} (accepted content types: {', '.join(acceptedTypesAggr)})"
|
||||
|
||||
# Create template structure explicitly (not extracted from prompt)
|
||||
# This ensures exact identity between initial and continuation prompts
|
||||
templateStructure = f"""{{
|
||||
"elements": [
|
||||
{{
|
||||
"type": "{contentType}",
|
||||
"type": "{effectiveContentType}",
|
||||
"content": {contentStructureExample}
|
||||
}}
|
||||
]
|
||||
|
|
@ -2022,14 +2033,14 @@ Return only valid JSON. Do not include any explanatory text outside the JSON.
|
|||
prompt = f"""# TASK: Generate Section Content (Aggregation)
|
||||
|
||||
Return only valid JSON. No explanatory text, no comments, no markdown formatting outside JSON.
|
||||
If ContentParts have no data, return: {{"elements": [{{"type": "{contentType}", "content": {{"headers": [], "rows": []}}}}]}}
|
||||
If ContentParts have no data, return: {{"elements": [{{"type": "{effectiveContentType}", "content": {{"headers": [], "rows": []}}}}]}}
|
||||
|
||||
LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}.
|
||||
|
||||
## SECTION METADATA
|
||||
- Section ID: {sectionId}
|
||||
- Content Type: {contentType}
|
||||
- Generation Hint: {generationHint}
|
||||
- Content Type: {effectiveContentType}
|
||||
- Generation Hint: {generationHint}{formatNoteAggr}
|
||||
|
||||
## CONTENT EFFICIENCY PRINCIPLES
|
||||
- Generate COMPACT content: Focus on essential facts only
|
||||
|
|
@ -2044,7 +2055,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
|
|||
3. If the context contains no data, return empty structures (empty rows array for tables).
|
||||
4. Aggregate all data into one element (e.g., one table).
|
||||
5. For table: Extract all rows from the context. Return {{"headers": [...], "rows": []}} only if no data exists.
|
||||
6. Format based on content_type ({contentType}).
|
||||
6. Format based on content_type ({effectiveContentType}).
|
||||
7. No HTML/styling: Plain text only, no markup.
|
||||
8. CONTINUE UNTIL COMPLETE: Extract ALL data from the provided context. Do NOT stop early because you think the response might be too long. Do NOT truncate or abbreviate. Do not impose artificial limits on yourself.
|
||||
|
||||
|
|
@ -2055,7 +2066,7 @@ Return a JSON object with this structure:
|
|||
{{
|
||||
"elements": [
|
||||
{{
|
||||
"type": "{contentType}",
|
||||
"type": "{effectiveContentType}",
|
||||
"content": {contentStructureExample}
|
||||
}}
|
||||
]
|
||||
|
|
@ -2087,8 +2098,8 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
|
|||
|
||||
## SECTION METADATA
|
||||
- Section ID: {sectionId}
|
||||
- Content Type: {contentType}
|
||||
- Generation Hint: {generationHint}
|
||||
- Content Type: {effectiveContentType}
|
||||
- Generation Hint: {generationHint}{formatNoteAggr}
|
||||
|
||||
## CONTENT EFFICIENCY PRINCIPLES
|
||||
- Generate COMPACT content: Focus on essential facts only
|
||||
|
|
@ -2103,7 +2114,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
|
|||
## INSTRUCTIONS
|
||||
1. Extract data only from provided ContentParts. Never invent or generate data.
|
||||
2. If ContentParts contain no data, return empty structures (empty rows array for tables).
|
||||
3. Format based on content_type ({contentType}).
|
||||
3. Format based on content_type ({effectiveContentType}).
|
||||
4. Return only valid JSON with "elements" array.
|
||||
5. No HTML/styling: Plain text only, no markup.
|
||||
6. CONTINUE UNTIL COMPLETE: Extract ALL data from the provided context. Do NOT stop early because you think the response might be too long. Do NOT truncate or abbreviate. Do not impose artificial limits on yourself.
|
||||
|
|
@ -2114,7 +2125,7 @@ Return a JSON object with this structure:
|
|||
{{
|
||||
"elements": [
|
||||
{{
|
||||
"type": "{contentType}",
|
||||
"type": "{effectiveContentType}",
|
||||
"content": {contentStructureExample}
|
||||
}}
|
||||
]
|
||||
|
|
@ -2142,8 +2153,8 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
|
|||
|
||||
## SECTION METADATA
|
||||
- Section ID: {sectionId}
|
||||
- Content Type: {contentType}
|
||||
- Generation Hint: {generationHint}
|
||||
- Content Type: {effectiveContentType}
|
||||
- Generation Hint: {generationHint}{formatNoteAggr}
|
||||
|
||||
## CONTENT EFFICIENCY PRINCIPLES
|
||||
- Generate COMPACT content: Focus on essential facts only
|
||||
|
|
@ -2154,7 +2165,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
|
|||
|
||||
## INSTRUCTIONS
|
||||
1. Generate content based on the Generation Hint above.
|
||||
2. Create appropriate content that matches the content_type ({contentType}).
|
||||
2. Create appropriate content that matches the content_type ({effectiveContentType}).
|
||||
3. The content should be relevant to the USER REQUEST and fit the context of surrounding sections.
|
||||
4. Return only valid JSON with "elements" array.
|
||||
5. No HTML/styling: Plain text only, no markup.
|
||||
|
|
@ -2166,7 +2177,7 @@ Return a JSON object with this structure:
|
|||
{{
|
||||
"elements": [
|
||||
{{
|
||||
"type": "{contentType}",
|
||||
"type": "{effectiveContentType}",
|
||||
"content": {contentStructureExample}
|
||||
}}
|
||||
]
|
||||
|
|
@ -2557,28 +2568,26 @@ CRITICAL:
|
|||
|
||||
Returns:
|
||||
List of accepted section content types (e.g., ["table", "code_block"])
|
||||
|
||||
Raises:
|
||||
ValueError: If renderer not found or doesn't provide accepted types
|
||||
"""
|
||||
try:
|
||||
from modules.services.serviceGeneration.renderers.registry import getRenderer
|
||||
|
||||
# Get renderer for this format
|
||||
# Get renderer for this format - NO FALLBACK
|
||||
renderer = getRenderer(outputFormat, self.services)
|
||||
|
||||
if renderer and hasattr(renderer, 'getAcceptedSectionTypes'):
|
||||
# Query renderer for accepted types
|
||||
if not renderer:
|
||||
raise ValueError(f"No renderer found for output format '{outputFormat}'. Check renderer registry.")
|
||||
|
||||
if not hasattr(renderer, 'getAcceptedSectionTypes'):
|
||||
raise ValueError(f"Renderer for '{outputFormat}' does not implement getAcceptedSectionTypes(). Add this method to the renderer.")
|
||||
|
||||
acceptedTypes = renderer.getAcceptedSectionTypes(outputFormat)
|
||||
if acceptedTypes:
|
||||
logger.debug(f"Renderer for format '{outputFormat}' accepts section types: {acceptedTypes}")
|
||||
|
||||
if not acceptedTypes:
|
||||
raise ValueError(f"Renderer for '{outputFormat}' returned empty accepted types. Fix getAcceptedSectionTypes() in the renderer.")
|
||||
|
||||
logger.debug(f"Renderer for '{outputFormat}' accepts: {acceptedTypes}")
|
||||
return acceptedTypes
|
||||
|
||||
# Fallback: if no renderer or method not found, return all types
|
||||
from modules.datamodels.datamodelJson import supportedSectionTypes
|
||||
logger.debug(f"No renderer found for format '{outputFormat}' or method not available, using all section types")
|
||||
return list(supportedSectionTypes)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error querying renderer for accepted section types for format '{outputFormat}': {str(e)}")
|
||||
# Fallback: return all types
|
||||
from modules.datamodels.datamodelJson import supportedSectionTypes
|
||||
return list(supportedSectionTypes)
|
||||
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ from .subAutomationUtils import parseScheduleToCron, planToPrompt, replacePlaceh
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def chatStart(currentUser: User, userInput: UserInputRequest, workflowMode: WorkflowModeEnum, workflowId: Optional[str] = None) -> ChatWorkflow:
|
||||
async def chatStart(currentUser: User, userInput: UserInputRequest, workflowMode: WorkflowModeEnum, workflowId: Optional[str] = None, mandateId: Optional[str] = None) -> ChatWorkflow:
|
||||
"""
|
||||
Starts a new chat or continues an existing one, then launches processing asynchronously.
|
||||
|
||||
|
|
@ -33,12 +33,13 @@ async def chatStart(currentUser: User, userInput: UserInputRequest, workflowMode
|
|||
userInput: User input request
|
||||
workflowId: Optional workflow ID to continue existing workflow
|
||||
workflowMode: "Dynamic" for iterative dynamic-style processing, "Automation" for automated workflow execution
|
||||
mandateId: Mandate ID from request context (required for proper data isolation)
|
||||
|
||||
Example usage for Dynamic mode:
|
||||
workflow = await chatStart(currentUser, userInput, workflowMode=WorkflowModeEnum.WORKFLOW_DYNAMIC)
|
||||
workflow = await chatStart(currentUser, userInput, workflowMode=WorkflowModeEnum.WORKFLOW_DYNAMIC, mandateId=mandateId)
|
||||
"""
|
||||
try:
|
||||
services = getServices(currentUser, None)
|
||||
services = getServices(currentUser, mandateId=mandateId)
|
||||
workflowManager = WorkflowManager(services)
|
||||
workflow = await workflowManager.workflowStart(userInput, workflowMode, workflowId)
|
||||
return workflow
|
||||
|
|
@ -46,10 +47,10 @@ async def chatStart(currentUser: User, userInput: UserInputRequest, workflowMode
|
|||
logger.error(f"Error starting chat: {str(e)}")
|
||||
raise
|
||||
|
||||
async def chatStop(currentUser: User, workflowId: str) -> ChatWorkflow:
|
||||
async def chatStop(currentUser: User, workflowId: str, mandateId: Optional[str] = None) -> ChatWorkflow:
|
||||
"""Stops a running chat."""
|
||||
try:
|
||||
services = getServices(currentUser, None)
|
||||
services = getServices(currentUser, mandateId=mandateId)
|
||||
workflowManager = WorkflowManager(services)
|
||||
return await workflowManager.workflowStop(workflowId)
|
||||
except Exception as e:
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
# Configuration
|
||||
MAX_FILES_PER_EXECUTION = 50
|
||||
MAX_CONCURRENT_AI_TASKS = 10 # Limit concurrent AI calls to avoid rate limits
|
||||
ALLOWED_TAGS = ["customer", "meeting", "license", "subscription", "fuel", "food", "material"]
|
||||
RATE_LIMIT_WAIT_SECONDS = 60
|
||||
|
||||
|
|
@ -92,6 +93,11 @@ async def getExpensesFromPdf(self, parameters: Dict[str, Any]) -> ActionResult:
|
|||
self.services.chat.progressLogFinish(operationId, False)
|
||||
return ActionResult.isFailure(error="No valid Microsoft connection found")
|
||||
|
||||
# Set access token for SharePoint service
|
||||
if not self.services.sharepoint.setAccessTokenFromConnection(connection):
|
||||
self.services.chat.progressLogFinish(operationId, False)
|
||||
return ActionResult.isFailure(error="Failed to set SharePoint access token")
|
||||
|
||||
# Find site and folder info
|
||||
self.services.chat.progressLogUpdate(operationId, 0.1, "Resolving SharePoint site")
|
||||
siteInfo, folderPath = await _resolveSiteAndFolder(self, sharepointFolder)
|
||||
|
|
@ -137,16 +143,23 @@ async def getExpensesFromPdf(self, parameters: Dict[str, Any]) -> ActionResult:
|
|||
featureInstanceId=featureInstanceId
|
||||
)
|
||||
|
||||
# Process each PDF
|
||||
for idx, pdfFile in enumerate(pdfFiles):
|
||||
currentProgress = 0.2 + (idx * progressPerFile)
|
||||
# Process PDFs in parallel with semaphore to limit concurrent AI calls
|
||||
semaphore = asyncio.Semaphore(MAX_CONCURRENT_AI_TASKS)
|
||||
completedCount = [0] # Use list for mutable reference in closure
|
||||
|
||||
async def processSinglePdf(idx: int, pdfFile: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Process a single PDF document. Returns result dict."""
|
||||
fileName = pdfFile.get("name", f"file_{idx}")
|
||||
fileId = pdfFile.get("id")
|
||||
|
||||
async with semaphore:
|
||||
# Update progress (thread-safe via asyncio)
|
||||
completedCount[0] += 1
|
||||
currentProgress = 0.2 + (completedCount[0] * progressPerFile)
|
||||
self.services.chat.progressLogUpdate(
|
||||
operationId,
|
||||
currentProgress,
|
||||
f"Processing {idx + 1}/{totalFiles}: {fileName}"
|
||||
min(currentProgress, 0.9),
|
||||
f"Processing {completedCount[0]}/{totalFiles}: {fileName}"
|
||||
)
|
||||
|
||||
try:
|
||||
|
|
@ -154,44 +167,37 @@ async def getExpensesFromPdf(self, parameters: Dict[str, Any]) -> ActionResult:
|
|||
fileContent = await self.services.sharepoint.downloadFile(siteId, fileId)
|
||||
if not fileContent:
|
||||
await _moveToErrorFolder(self, siteId, folderPath, fileName)
|
||||
errorDocuments.append({
|
||||
"file": fileName,
|
||||
"error": "Failed to download",
|
||||
"movedTo": "error/"
|
||||
})
|
||||
continue
|
||||
return {"type": "error", "file": fileName, "error": "Failed to download", "movedTo": "error/"}
|
||||
|
||||
# AI call to extract expense data
|
||||
# AI call to extract expense data (this is the bottleneck - parallelized)
|
||||
aiResult = await _extractExpensesWithAi(self.services, fileContent, fileName, prompt, featureInstanceId)
|
||||
|
||||
if not aiResult.get("success"):
|
||||
await _moveToErrorFolder(self, siteId, folderPath, fileName)
|
||||
errorDocuments.append({
|
||||
"file": fileName,
|
||||
"error": aiResult.get("error", "AI extraction failed"),
|
||||
"movedTo": "error/"
|
||||
})
|
||||
continue
|
||||
return {"type": "error", "file": fileName, "error": aiResult.get("error", "AI extraction failed"), "movedTo": "error/"}
|
||||
|
||||
records = aiResult.get("records", [])
|
||||
fileId = aiResult.get("fileId")
|
||||
|
||||
# Check for empty records
|
||||
if not records:
|
||||
logger.warning(f"Document {fileName}: No records extracted, moving to error folder")
|
||||
await _moveToErrorFolder(self, siteId, folderPath, fileName)
|
||||
skippedDocuments.append({
|
||||
"file": fileName,
|
||||
"reason": "No expense records extracted",
|
||||
"movedTo": "error/"
|
||||
})
|
||||
continue
|
||||
return {"type": "skipped", "file": fileName, "reason": "No expense records extracted", "movedTo": "error/"}
|
||||
|
||||
# Validate and enrich records
|
||||
validatedRecords = _validateAndEnrichRecords(records, fileName)
|
||||
|
||||
# Save to TrusteePosition
|
||||
savedCount = _saveToTrusteePosition(trusteeInterface, validatedRecords, featureInstanceId, self.services.mandateId)
|
||||
totalPositions += savedCount
|
||||
# Save to TrusteePosition and create Document + Position-Document links
|
||||
savedCount = _saveToTrusteePosition(
|
||||
trusteeInterface,
|
||||
validatedRecords,
|
||||
featureInstanceId,
|
||||
self.services.mandateId,
|
||||
fileId=fileId,
|
||||
fileName=fileName,
|
||||
sourceLocation=sharepointFolder
|
||||
)
|
||||
|
||||
# Move document to "processed" subfolder
|
||||
timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
||||
|
|
@ -199,12 +205,13 @@ async def getExpensesFromPdf(self, parameters: Dict[str, Any]) -> ActionResult:
|
|||
|
||||
moveSuccess = await _moveToProcessedFolder(self, siteId, folderPath, fileName, newFileName)
|
||||
|
||||
processedDocuments.append({
|
||||
return {
|
||||
"type": "processed",
|
||||
"file": fileName,
|
||||
"newLocation": f"processed/{newFileName}" if moveSuccess else "move_failed",
|
||||
"recordsExtracted": len(validatedRecords),
|
||||
"recordsSaved": savedCount
|
||||
})
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
errorMsg = str(e)
|
||||
|
|
@ -216,11 +223,24 @@ async def getExpensesFromPdf(self, parameters: Dict[str, Any]) -> ActionResult:
|
|||
await asyncio.sleep(RATE_LIMIT_WAIT_SECONDS)
|
||||
|
||||
await _moveToErrorFolder(self, siteId, folderPath, fileName)
|
||||
errorDocuments.append({
|
||||
"file": fileName,
|
||||
"error": errorMsg,
|
||||
"movedTo": "error/"
|
||||
})
|
||||
return {"type": "error", "file": fileName, "error": errorMsg, "movedTo": "error/"}
|
||||
|
||||
# Execute all PDF processing tasks in parallel (limited by semaphore)
|
||||
logger.info(f"Starting parallel processing of {totalFiles} PDFs (max {MAX_CONCURRENT_AI_TASKS} concurrent)")
|
||||
tasks = [processSinglePdf(idx, pdfFile) for idx, pdfFile in enumerate(pdfFiles)]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
# Collect results
|
||||
for result in results:
|
||||
if isinstance(result, Exception):
|
||||
errorDocuments.append({"file": "unknown", "error": str(result), "movedTo": "error/"})
|
||||
elif result.get("type") == "processed":
|
||||
processedDocuments.append(result)
|
||||
totalPositions += result.get("recordsSaved", 0)
|
||||
elif result.get("type") == "skipped":
|
||||
skippedDocuments.append(result)
|
||||
elif result.get("type") == "error":
|
||||
errorDocuments.append(result)
|
||||
|
||||
# Create result summary
|
||||
self.services.chat.progressLogUpdate(operationId, 0.95, "Creating result summary")
|
||||
|
|
@ -423,9 +443,10 @@ async def _extractExpensesWithAi(services, fileContent: bytes, fileName: str, pr
|
|||
|
||||
# Step 5: Call AI with documentList - let AI service handle everything
|
||||
# (extraction, intent analysis, chunking, image processing)
|
||||
# Use DATA_GENERATE (same path as ai.process) which handles chunking correctly
|
||||
options = AiCallOptions(
|
||||
resultFormat="csv",
|
||||
operationType=OperationTypeEnum.DATA_EXTRACT
|
||||
operationType=OperationTypeEnum.DATA_GENERATE
|
||||
)
|
||||
|
||||
aiResponse = await services.ai.callAiContent(
|
||||
|
|
@ -433,17 +454,31 @@ async def _extractExpensesWithAi(services, fileContent: bytes, fileName: str, pr
|
|||
options=options,
|
||||
documentList=documentList,
|
||||
contentParts=None, # Let AI service extract from documents
|
||||
outputFormat="csv"
|
||||
outputFormat="csv",
|
||||
generationIntent="extract" # Signal this is extraction, not document generation
|
||||
)
|
||||
|
||||
if not aiResponse or not aiResponse.content:
|
||||
if not aiResponse:
|
||||
return {"success": False, "error": "AI returned empty response"}
|
||||
|
||||
# Parse CSV response
|
||||
csvContent = aiResponse.content
|
||||
# Get CSV from rendered documents (not from content - that's the internal structure)
|
||||
if not aiResponse.documents or len(aiResponse.documents) == 0:
|
||||
return {"success": False, "error": "AI returned no documents"}
|
||||
|
||||
# Get the CSV content from the first document
|
||||
csvDocument = aiResponse.documents[0]
|
||||
csvContent = csvDocument.documentData
|
||||
|
||||
# documentData is bytes, decode to string
|
||||
if isinstance(csvContent, bytes):
|
||||
csvContent = csvContent.decode('utf-8')
|
||||
|
||||
logger.info(f"Retrieved CSV content ({len(csvContent)} chars) from rendered document: {csvDocument.documentName}")
|
||||
|
||||
records = _parseCsvToRecords(csvContent)
|
||||
|
||||
return {"success": True, "records": records}
|
||||
# Return fileId so it can be used to create TrusteeDocument reference
|
||||
return {"success": True, "records": records, "fileId": fileItem.id}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"AI extraction error for {fileName}: {str(e)}")
|
||||
|
|
@ -454,8 +489,9 @@ def _parseCsvToRecords(csvContent: str) -> List[Dict[str, Any]]:
|
|||
"""Parse CSV content to list of expense records."""
|
||||
records = []
|
||||
try:
|
||||
# Clean up CSV content - remove markdown code blocks if present
|
||||
content = csvContent.strip()
|
||||
|
||||
# Clean up CSV content - remove markdown code blocks if present
|
||||
if content.startswith("```"):
|
||||
lines = content.split('\n')
|
||||
# Remove first and last line if they're code block markers
|
||||
|
|
@ -471,6 +507,8 @@ def _parseCsvToRecords(csvContent: str) -> List[Dict[str, Any]]:
|
|||
cleanedRow = {k.strip(): v.strip() if isinstance(v, str) else v for k, v in row.items()}
|
||||
records.append(cleanedRow)
|
||||
|
||||
logger.info(f"Parsed {len(records)} records from CSV content")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing CSV: {str(e)}")
|
||||
|
||||
|
|
@ -548,10 +586,54 @@ def _parseFloat(value) -> float:
|
|||
return 0.0
|
||||
|
||||
|
||||
def _saveToTrusteePosition(trusteeInterface, records: List[Dict[str, Any]], featureInstanceId: str, mandateId: str) -> int:
|
||||
"""Save validated records to TrusteePosition table."""
|
||||
savedCount = 0
|
||||
def _saveToTrusteePosition(
|
||||
trusteeInterface,
|
||||
records: List[Dict[str, Any]],
|
||||
featureInstanceId: str,
|
||||
mandateId: str,
|
||||
fileId: Optional[str] = None,
|
||||
fileName: Optional[str] = None,
|
||||
sourceLocation: Optional[str] = None
|
||||
) -> int:
|
||||
"""
|
||||
Save validated records to TrusteePosition table.
|
||||
Also creates TrusteeDocument (referencing the source file) and links positions to it.
|
||||
|
||||
Args:
|
||||
trusteeInterface: Trustee interface instance
|
||||
records: List of expense records to save
|
||||
featureInstanceId: Feature instance ID
|
||||
mandateId: Mandate ID
|
||||
fileId: Optional file ID from central Files table (source PDF)
|
||||
fileName: Optional file name
|
||||
sourceLocation: Optional source location (e.g., SharePoint path)
|
||||
|
||||
Returns:
|
||||
Number of positions saved
|
||||
"""
|
||||
savedCount = 0
|
||||
savedPositionIds = []
|
||||
|
||||
# Step 1: Create TrusteeDocument referencing the source file
|
||||
documentId = None
|
||||
if fileId and fileName:
|
||||
try:
|
||||
document = trusteeInterface.createDocument({
|
||||
"fileId": fileId,
|
||||
"documentName": fileName,
|
||||
"documentMimeType": "application/pdf",
|
||||
"sourceType": "sharepoint",
|
||||
"sourceLocation": sourceLocation
|
||||
})
|
||||
if document:
|
||||
documentId = document.id
|
||||
logger.info(f"Created TrusteeDocument {documentId} referencing file {fileId}")
|
||||
else:
|
||||
logger.warning(f"Failed to create TrusteeDocument for file {fileId}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating TrusteeDocument: {str(e)}")
|
||||
|
||||
# Step 2: Save positions
|
||||
for record in records:
|
||||
try:
|
||||
position = {
|
||||
|
|
@ -573,11 +655,27 @@ def _saveToTrusteePosition(trusteeInterface, records: List[Dict[str, Any]], feat
|
|||
result = trusteeInterface.createPosition(position)
|
||||
if result:
|
||||
savedCount += 1
|
||||
savedPositionIds.append(result.id)
|
||||
logger.debug(f"Saved position: {position.get('company')} - {position.get('bookingAmount')}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to save position: {str(e)}")
|
||||
|
||||
# Step 3: Create Position-Document links
|
||||
if documentId and savedPositionIds:
|
||||
for positionId in savedPositionIds:
|
||||
try:
|
||||
link = trusteeInterface.createPositionDocument({
|
||||
"documentId": documentId,
|
||||
"positionId": positionId
|
||||
})
|
||||
if link:
|
||||
logger.debug(f"Created position-document link: {positionId} -> {documentId}")
|
||||
else:
|
||||
logger.warning(f"Failed to create position-document link: {positionId} -> {documentId}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating position-document link: {str(e)}")
|
||||
|
||||
return savedCount
|
||||
|
||||
|
||||
|
|
@ -718,27 +816,16 @@ async def _deleteFile(self, siteId: str, folderPath: str, fileName: str) -> bool
|
|||
if not fileId:
|
||||
return False
|
||||
|
||||
# Delete by ID
|
||||
# Delete by ID using apiClient
|
||||
deleteEndpoint = f"sites/{siteId}/drive/items/{fileId}"
|
||||
result = await self.apiClient.makeGraphApiCall(deleteEndpoint, method="DELETE")
|
||||
|
||||
# Make DELETE request
|
||||
if self.services.sharepoint.accessToken is None:
|
||||
logger.error("Access token not set for delete")
|
||||
if "error" in result:
|
||||
logger.warning(f"Delete failed: {result['error']}")
|
||||
return False
|
||||
|
||||
import aiohttp
|
||||
headers = {"Authorization": f"Bearer {self.services.sharepoint.accessToken}"}
|
||||
url = f"https://graph.microsoft.com/v1.0/{deleteEndpoint}"
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.delete(url, headers=headers) as response:
|
||||
if response.status in [200, 204]:
|
||||
logger.debug(f"Deleted file: {filePath}")
|
||||
return True
|
||||
else:
|
||||
errorText = await response.text()
|
||||
logger.warning(f"Delete failed: {response.status} - {errorText}")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete file: {str(e)}")
|
||||
|
|
|
|||
|
|
@ -93,6 +93,18 @@ class ApiClientHelper:
|
|||
logger.error(f"Graph API call failed: {response.status} - {errorText}")
|
||||
return {"error": f"API call failed: {response.status} - {errorText}"}
|
||||
|
||||
elif method == "DELETE":
|
||||
logger.debug(f"Starting DELETE request to {url}")
|
||||
async with session.delete(url, headers=headers) as response:
|
||||
logger.info(f"Graph API response: {response.status}")
|
||||
if response.status in [200, 204]:
|
||||
logger.debug(f"Graph API DELETE success")
|
||||
return {"success": True}
|
||||
else:
|
||||
errorText = await response.text()
|
||||
logger.error(f"Graph API call failed: {response.status} - {errorText}")
|
||||
return {"error": f"API call failed: {response.status} - {errorText}"}
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
logger.error(f"Graph API call timed out after 30 seconds: {endpoint}")
|
||||
return {"error": f"API call timed out after 30 seconds: {endpoint}"}
|
||||
|
|
|
|||
Loading…
Reference in a new issue