From 23bb1ff5d30e668d0e416ef90acb75b70a39c955 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Thu, 25 Dec 2025 00:24:53 +0100
Subject: [PATCH] Fix JSON identification for pre-extracted content
---
modules/services/serviceAi/mainServiceAi.py | 28 +++++++++++--------
.../mainServiceGeneration.py | 28 +++++++++++++++++--
2 files changed, 42 insertions(+), 14 deletions(-)
diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index 30e7cc88..331a3289 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -1202,21 +1202,24 @@ If no trackable items can be identified, return: {{"kpis": []}}
return None
# Check for ContentExtracted format
+ # Only format 1 (ActionDocument format with validationMetadata) is supported
documentData = None
- if "parts" in jsonData and isinstance(jsonData.get("parts"), list):
- # Direct ContentExtracted format: {"id": "...", "parts": [...], ...}
- documentData = jsonData
- else:
- validationMetadata = jsonData.get("validationMetadata", {})
- actionType = validationMetadata.get("actionType")
- if actionType == "context.extractContent":
- # Format: {"validationMetadata": {"actionType": "context.extractContent"}, "documentData": {...}}
- documentData = jsonData.get("documentData")
+
+ validationMetadata = jsonData.get("validationMetadata", {})
+ actionType = validationMetadata.get("actionType")
+ if actionType == "context.extractContent":
+ # Format: {"validationMetadata": {"actionType": "context.extractContent"}, "documentData": {...}}
+ documentData = jsonData.get("documentData")
+ logger.debug(f"Found ContentExtracted via validationMetadata for {document.fileName}")
if documentData:
from modules.datamodels.datamodelExtraction import ContentExtracted
try:
+ # Ensure that "id" is present
+ if "id" not in documentData:
+ documentData["id"] = document.id
+
contentExtracted = ContentExtracted(**documentData)
if contentExtracted.parts:
@@ -1235,8 +1238,8 @@ If no trackable items can be identified, return: {{"kpis": []}}
if not originalMimeType and part.metadata.get("documentMimeType"):
originalMimeType = part.metadata.get("documentMimeType")
- # Falls nicht gefunden, verwende documentName aus ContentExtracted
- if not originalFileName and hasattr(contentExtracted, 'id'):
+ # If not found, try to extract it from documentName
+ if not originalFileName:
# Versuche aus documentName zu extrahieren (z.B. "B2025-02c_28_extracted_...json" -> "B2025-02c_28.pdf")
if document.fileName and "_extracted_" in document.fileName:
originalFileName = document.fileName.split("_extracted_")[0] + ".pdf"
@@ -1252,7 +1255,8 @@ If no trackable items can be identified, return: {{"kpis": []}}
"parts": contentExtracted.parts
}
except Exception as parseError:
- logger.debug(f"Could not parse ContentExtracted format: {str(parseError)}")
+ logger.warning(f"Could not parse ContentExtracted format from {document.fileName}: {str(parseError)}")
+ logger.debug(f"JSON keys: {list(jsonData.keys())}, has parts: {'parts' in jsonData}")
return None
return None
diff --git a/modules/services/serviceGeneration/mainServiceGeneration.py b/modules/services/serviceGeneration/mainServiceGeneration.py
index cababbeb..e08eaa81 100644
--- a/modules/services/serviceGeneration/mainServiceGeneration.py
+++ b/modules/services/serviceGeneration/mainServiceGeneration.py
@@ -58,11 +58,35 @@ class GenerationService:
# Detect MIME without relying on a service center
mime_type = detectMimeTypeFromContent(content, doc.documentName)
+ # IMPORTANT: For ActionDocuments with validationMetadata (e.g. context.extractContent)
+ # we must serialize the entire ActionDocument, not just documentData
+ document_data = doc.documentData
+ if hasattr(doc, 'validationMetadata') and doc.validationMetadata:
+ # If validationMetadata is present, serialize the entire ActionDocument format
+ if mime_type == "application/json":
+ # Build the ActionDocument format with validationMetadata and documentData
+ if hasattr(document_data, 'model_dump'):
+ # Pydantic v2
+ document_data_dict = document_data.model_dump()
+ elif hasattr(document_data, 'dict'):
+ # Pydantic v1
+ document_data_dict = document_data.dict()
+ elif isinstance(document_data, dict):
+ document_data_dict = document_data
+ else:
+ document_data_dict = {"data": str(document_data)}
+
+ # Build the ActionDocument format
+ document_data = {
+ "validationMetadata": doc.validationMetadata,
+ "documentData": document_data_dict
+ }
+
return {
'fileName': doc.documentName,
- 'fileSize': len(str(doc.documentData)),
+ 'fileSize': len(str(document_data)),
'mimeType': mime_type,
- 'content': doc.documentData,
+ 'content': document_data,
'document': doc
}
except Exception as e: