Fixed JSON identification for pre-extracted content
This commit is contained in:
parent
e1b3cd36f0
commit
23bb1ff5d3
2 changed files with 42 additions and 14 deletions
|
|
@@ -1202,21 +1202,24 @@ If no trackable items can be identified, return: {{"kpis": []}}
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Check for ContentExtracted format
|
# Check for ContentExtracted format
|
||||||
|
# Nur Format 1 (ActionDocument-Format mit validationMetadata) wird unterstützt
|
||||||
documentData = None
|
documentData = None
|
||||||
if "parts" in jsonData and isinstance(jsonData.get("parts"), list):
|
|
||||||
# Direct ContentExtracted format: {"id": "...", "parts": [...], ...}
|
validationMetadata = jsonData.get("validationMetadata", {})
|
||||||
documentData = jsonData
|
actionType = validationMetadata.get("actionType")
|
||||||
else:
|
if actionType == "context.extractContent":
|
||||||
validationMetadata = jsonData.get("validationMetadata", {})
|
# Format: {"validationMetadata": {"actionType": "context.extractContent"}, "documentData": {...}}
|
||||||
actionType = validationMetadata.get("actionType")
|
documentData = jsonData.get("documentData")
|
||||||
if actionType == "context.extractContent":
|
logger.debug(f"Found ContentExtracted via validationMetadata for {document.fileName}")
|
||||||
# Format: {"validationMetadata": {"actionType": "context.extractContent"}, "documentData": {...}}
|
|
||||||
documentData = jsonData.get("documentData")
|
|
||||||
|
|
||||||
if documentData:
|
if documentData:
|
||||||
from modules.datamodels.datamodelExtraction import ContentExtracted
|
from modules.datamodels.datamodelExtraction import ContentExtracted
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Stelle sicher, dass "id" vorhanden ist
|
||||||
|
if "id" not in documentData:
|
||||||
|
documentData["id"] = document.id
|
||||||
|
|
||||||
contentExtracted = ContentExtracted(**documentData)
|
contentExtracted = ContentExtracted(**documentData)
|
||||||
|
|
||||||
if contentExtracted.parts:
|
if contentExtracted.parts:
|
||||||
|
|
@@ -1235,8 +1238,8 @@ If no trackable items can be identified, return: {{"kpis": []}}
|
||||||
if not originalMimeType and part.metadata.get("documentMimeType"):
|
if not originalMimeType and part.metadata.get("documentMimeType"):
|
||||||
originalMimeType = part.metadata.get("documentMimeType")
|
originalMimeType = part.metadata.get("documentMimeType")
|
||||||
|
|
||||||
# Falls nicht gefunden, verwende documentName aus ContentExtracted
|
# Falls nicht gefunden, versuche aus documentName zu extrahieren
|
||||||
if not originalFileName and hasattr(contentExtracted, 'id'):
|
if not originalFileName:
|
||||||
# Versuche aus documentName zu extrahieren (z.B. "B2025-02c_28_extracted_...json" -> "B2025-02c_28.pdf")
|
# Versuche aus documentName zu extrahieren (z.B. "B2025-02c_28_extracted_...json" -> "B2025-02c_28.pdf")
|
||||||
if document.fileName and "_extracted_" in document.fileName:
|
if document.fileName and "_extracted_" in document.fileName:
|
||||||
originalFileName = document.fileName.split("_extracted_")[0] + ".pdf"
|
originalFileName = document.fileName.split("_extracted_")[0] + ".pdf"
|
||||||
|
|
@@ -1252,7 +1255,8 @@ If no trackable items can be identified, return: {{"kpis": []}}
|
||||||
"parts": contentExtracted.parts
|
"parts": contentExtracted.parts
|
||||||
}
|
}
|
||||||
except Exception as parseError:
|
except Exception as parseError:
|
||||||
logger.debug(f"Could not parse ContentExtracted format: {str(parseError)}")
|
logger.warning(f"Could not parse ContentExtracted format from {document.fileName}: {str(parseError)}")
|
||||||
|
logger.debug(f"JSON keys: {list(jsonData.keys())}, has parts: {'parts' in jsonData}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
|
||||||
|
|
@@ -58,11 +58,35 @@ class GenerationService:
|
||||||
# Detect MIME without relying on a service center
|
# Detect MIME without relying on a service center
|
||||||
mime_type = detectMimeTypeFromContent(content, doc.documentName)
|
mime_type = detectMimeTypeFromContent(content, doc.documentName)
|
||||||
|
|
||||||
|
# WICHTIG: Für ActionDocuments mit validationMetadata (z.B. context.extractContent)
|
||||||
|
# müssen wir das gesamte ActionDocument serialisieren, nicht nur documentData
|
||||||
|
document_data = doc.documentData
|
||||||
|
if hasattr(doc, 'validationMetadata') and doc.validationMetadata:
|
||||||
|
# Wenn validationMetadata vorhanden ist, serialisiere das gesamte ActionDocument-Format
|
||||||
|
if mime_type == "application/json":
|
||||||
|
# Erstelle ActionDocument-Format mit validationMetadata und documentData
|
||||||
|
if hasattr(document_data, 'model_dump'):
|
||||||
|
# Pydantic v2
|
||||||
|
document_data_dict = document_data.model_dump()
|
||||||
|
elif hasattr(document_data, 'dict'):
|
||||||
|
# Pydantic v1
|
||||||
|
document_data_dict = document_data.dict()
|
||||||
|
elif isinstance(document_data, dict):
|
||||||
|
document_data_dict = document_data
|
||||||
|
else:
|
||||||
|
document_data_dict = {"data": str(document_data)}
|
||||||
|
|
||||||
|
# Erstelle ActionDocument-Format
|
||||||
|
document_data = {
|
||||||
|
"validationMetadata": doc.validationMetadata,
|
||||||
|
"documentData": document_data_dict
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'fileName': doc.documentName,
|
'fileName': doc.documentName,
|
||||||
'fileSize': len(str(doc.documentData)),
|
'fileSize': len(str(document_data)),
|
||||||
'mimeType': mime_type,
|
'mimeType': mime_type,
|
||||||
'content': doc.documentData,
|
'content': document_data,
|
||||||
'document': doc
|
'document': doc
|
||||||
}
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue