Fixed JSON identification for pre-extracted content

This commit is contained in:
ValueOn AG 2025-12-25 00:24:53 +01:00
parent e1b3cd36f0
commit 23bb1ff5d3
2 changed files with 42 additions and 14 deletions

View file

@ -1202,21 +1202,24 @@ If no trackable items can be identified, return: {{"kpis": []}}
return None return None
# Check for ContentExtracted format # Check for ContentExtracted format
# Nur Format 1 (ActionDocument-Format mit validationMetadata) wird unterstützt
documentData = None documentData = None
if "parts" in jsonData and isinstance(jsonData.get("parts"), list):
# Direct ContentExtracted format: {"id": "...", "parts": [...], ...} validationMetadata = jsonData.get("validationMetadata", {})
documentData = jsonData actionType = validationMetadata.get("actionType")
else: if actionType == "context.extractContent":
validationMetadata = jsonData.get("validationMetadata", {}) # Format: {"validationMetadata": {"actionType": "context.extractContent"}, "documentData": {...}}
actionType = validationMetadata.get("actionType") documentData = jsonData.get("documentData")
if actionType == "context.extractContent": logger.debug(f"Found ContentExtracted via validationMetadata for {document.fileName}")
# Format: {"validationMetadata": {"actionType": "context.extractContent"}, "documentData": {...}}
documentData = jsonData.get("documentData")
if documentData: if documentData:
from modules.datamodels.datamodelExtraction import ContentExtracted from modules.datamodels.datamodelExtraction import ContentExtracted
try: try:
# Stelle sicher, dass "id" vorhanden ist
if "id" not in documentData:
documentData["id"] = document.id
contentExtracted = ContentExtracted(**documentData) contentExtracted = ContentExtracted(**documentData)
if contentExtracted.parts: if contentExtracted.parts:
@ -1235,8 +1238,8 @@ If no trackable items can be identified, return: {{"kpis": []}}
if not originalMimeType and part.metadata.get("documentMimeType"): if not originalMimeType and part.metadata.get("documentMimeType"):
originalMimeType = part.metadata.get("documentMimeType") originalMimeType = part.metadata.get("documentMimeType")
# Falls nicht gefunden, verwende documentName aus ContentExtracted # Falls nicht gefunden, versuche aus documentName zu extrahieren
if not originalFileName and hasattr(contentExtracted, 'id'): if not originalFileName:
# Versuche aus documentName zu extrahieren (z.B. "B2025-02c_28_extracted_...json" -> "B2025-02c_28.pdf") # Versuche aus documentName zu extrahieren (z.B. "B2025-02c_28_extracted_...json" -> "B2025-02c_28.pdf")
if document.fileName and "_extracted_" in document.fileName: if document.fileName and "_extracted_" in document.fileName:
originalFileName = document.fileName.split("_extracted_")[0] + ".pdf" originalFileName = document.fileName.split("_extracted_")[0] + ".pdf"
@ -1252,7 +1255,8 @@ If no trackable items can be identified, return: {{"kpis": []}}
"parts": contentExtracted.parts "parts": contentExtracted.parts
} }
except Exception as parseError: except Exception as parseError:
logger.debug(f"Could not parse ContentExtracted format: {str(parseError)}") logger.warning(f"Could not parse ContentExtracted format from {document.fileName}: {str(parseError)}")
logger.debug(f"JSON keys: {list(jsonData.keys())}, has parts: {'parts' in jsonData}")
return None return None
return None return None

View file

@ -58,11 +58,35 @@ class GenerationService:
# Detect MIME without relying on a service center # Detect MIME without relying on a service center
mime_type = detectMimeTypeFromContent(content, doc.documentName) mime_type = detectMimeTypeFromContent(content, doc.documentName)
# WICHTIG: Für ActionDocuments mit validationMetadata (z.B. context.extractContent)
# müssen wir das gesamte ActionDocument serialisieren, nicht nur documentData
document_data = doc.documentData
if hasattr(doc, 'validationMetadata') and doc.validationMetadata:
# Wenn validationMetadata vorhanden ist, serialisiere das gesamte ActionDocument-Format
if mime_type == "application/json":
# Erstelle ActionDocument-Format mit validationMetadata und documentData
if hasattr(document_data, 'model_dump'):
# Pydantic v2
document_data_dict = document_data.model_dump()
elif hasattr(document_data, 'dict'):
# Pydantic v1
document_data_dict = document_data.dict()
elif isinstance(document_data, dict):
document_data_dict = document_data
else:
document_data_dict = {"data": str(document_data)}
# Erstelle ActionDocument-Format
document_data = {
"validationMetadata": doc.validationMetadata,
"documentData": document_data_dict
}
return { return {
'fileName': doc.documentName, 'fileName': doc.documentName,
'fileSize': len(str(doc.documentData)), 'fileSize': len(str(document_data)),
'mimeType': mime_type, 'mimeType': mime_type,
'content': doc.documentData, 'content': document_data,
'document': doc 'document': doc
} }
except Exception as e: except Exception as e: