gateway/modules/services/serviceAi/subContentExtraction.py
2026-01-06 22:32:52 +01:00

706 lines
39 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Content Extraction Module
Handles content extraction and preparation, including:
- Extracting content from documents based on intents
- Processing pre-extracted documents
- Vision AI for image text extraction
- AI processing of text content
"""
import json
import logging
import base64
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelExtraction import ContentPart, DocumentIntent
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
logger = logging.getLogger(__name__)
class ContentExtractor:
    """Extracts document content and prepares it as ContentParts."""

    def __init__(self, services, aiService, intentAnalyzer):
        """
        Store the collaborators this extractor works with.

        Args:
            services: Service center (chat log, extraction, utils, file access)
            aiService: AI service used for the AI calls of this class
            intentAnalyzer: Intent analyzer used to resolve pre-extracted documents
        """
        self.intentAnalyzer = intentAnalyzer
        self.aiService = aiService
        self.services = services
async def extractAndPrepareContent(
self,
documents: List[ChatDocument],
documentIntents: List[DocumentIntent],
parentOperationId: str,
getIntentForDocument: callable
) -> List[ContentPart]:
"""
Phase 5B: Extract content according to the document intents and prepare ContentParts with metadata.
Returns a list of ContentParts in the matching format.
IMPORTANT: A single document can yield several ContentParts when it carries several intents.
Example: an image with intents=["extract", "render"] yields:
- ContentPart(contentFormat="object", ...) for rendering
- ContentPart(contentFormat="extracted", ...) for text analysis
Three paths are handled per document:
1. Pre-extracted documents (resolved via the intent analyzer): their parts are reused.
2. JSON documents containing a top-level "documents" or "sections" key: passed on as a reference part.
3. Everything else: normal extraction driven by the document's intents.
Args:
documents: List of documents to process
documentIntents: List of DocumentIntent objects
parentOperationId: Parent operation ID for the ChatLog hierarchy
getIntentForDocument: Callable to get intent for document ID (called as getIntentForDocument(documentId, documentIntents))
Returns:
List of ContentParts with complete metadata. Parts without a documentId or with a
contentFormat other than "extracted", "object" or "reference" are dropped by the
final validation step.
Raises:
Any exception raised during processing is re-raised after the progress log
has been finished with a failure status.
"""
# Derive the operation ID of this extraction step from the parent operation
extractionOperationId = f"{parentOperationId}_content_extraction"
# Start the ChatLog progress entry with a reference to the parent operation
self.services.chat.progressLogStart(
extractionOperationId,
"Content Extraction",
"Extraction",
f"Extracting from {len(documents)} documents",
parentOperationId=parentOperationId
)
try:
allContentParts = []
for document in documents:
checkWorkflowStopped(self.services)
# Check if document is already a ContentExtracted document (pre-extracted JSON)
logger.debug(f"Checking document {document.id} ({document.fileName}, mimeType={document.mimeType}) for pre-extracted content")
preExtracted = self.intentAnalyzer.resolvePreExtractedDocument(document)
if preExtracted:
logger.info(f"✅ Found pre-extracted document: {document.fileName} -> Original: {preExtracted['originalDocument']['fileName']}")
logger.info(f" Pre-extracted document ID: {document.id}, Original document ID: {preExtracted['originalDocument']['id']}")
logger.info(f" ContentParts count: {len(preExtracted['contentExtracted'].parts) if preExtracted['contentExtracted'].parts else 0}")
# Use the already extracted ContentParts directly
contentExtracted = preExtracted["contentExtracted"]
# IMPORTANT: The intent must be looked up for the JSON document, not for the original
# (intent analysis already maps back to the JSON document ID)
intent = getIntentForDocument(document.id, documentIntents)
logger.info(f" Intent lookup for document {document.id}: found={intent is not None}")
if intent:
logger.info(f" Intent: {intent.intents}, extractionPrompt: {intent.extractionPrompt[:100] if intent.extractionPrompt else None}...")
else:
logger.warning(f" ⚠️ No intent found for pre-extracted document {document.id}! Available intent documentIds: {[i.documentId for i in documentIntents]}")
if contentExtracted.parts:
# CRITICAL: Process pre-extracted parts - analyze structure parts for nested content
processedParts = []
for part in contentExtracted.parts:
# Skip empty parts (containers without data)
if not part.data or (isinstance(part.data, str) and len(part.data.strip()) == 0):
if part.typeGroup == "container":
continue # Skip empty containers
# CRITICAL: Check if structure part contains nested parts (e.g., JSON with documentData.parts)
if part.typeGroup == "structure" and part.mimeType == "application/json" and part.data:
nestedParts = self._extractNestedPartsFromStructure(part, document, preExtracted, intent)
if nestedParts:
# Replace structure part with extracted nested parts
processedParts.extend(nestedParts)
logger.info(f"✅ Extracted {len(nestedParts)} nested parts from structure part {part.id}")
continue # Skip original structure part
# Keep original part if no nested parts found
processedParts.append(part)
# Use processed parts (with nested parts extracted)
for part in processedParts:
if not part.metadata:
part.metadata = {}
# Ensure metadata is complete
if "documentId" not in part.metadata:
part.metadata["documentId"] = document.id
# IMPORTANT: Determine the intents for this part (falls back to ["extract"] when no intent was found)
partIntent = intent.intents if intent else ["extract"]
# Debug logging for intent processing
logger.debug(f"Processing part {part.id}: typeGroup={part.typeGroup}, intents={partIntent}, hasData={bool(part.data)}, dataLength={len(str(part.data)) if part.data else 0}")
# IMPORTANT: A part can have several intents - create one ContentPart per intent
# Generic intent processing for ALL content types
hasReferenceIntent = "reference" in partIntent
hasRenderIntent = "render" in partIntent
hasExtractIntent = "extract" in partIntent
hasPartData = bool(part.data) and (not isinstance(part.data, str) or len(part.data.strip()) > 0)
logger.debug(f"Part {part.id}: reference={hasReferenceIntent}, render={hasRenderIntent}, extract={hasExtractIntent}, hasData={hasPartData}")
# Track whether the original part has already been added to the result
originalPartAdded = False
# 1. Reference intent: create a reference ContentPart
if hasReferenceIntent:
referencePart = ContentPart(
id=f"ref_{document.id}_{part.id}",
label=f"Reference: {part.label or 'Content'}",
typeGroup="reference",
mimeType=part.mimeType or "application/octet-stream",
data="", # Empty - reference only
metadata={
"contentFormat": "reference",
"documentId": document.id,
"documentReference": f"docItem:{document.id}:{preExtracted['originalDocument']['fileName']}",
"intent": "reference",
"usageHint": f"Reference: {preExtracted['originalDocument']['fileName']}",
"originalFileName": preExtracted["originalDocument"]["fileName"]
}
)
allContentParts.append(referencePart)
logger.debug(f"✅ Created reference ContentPart for {part.id}")
# 2. Render intent: create an object ContentPart (for binary/image rendering)
if hasRenderIntent and hasPartData:
# Check whether it is binary/image content (i.e. can be rendered)
isRenderable = (
part.typeGroup == "image" or
part.typeGroup == "binary" or
(part.mimeType and (
part.mimeType.startswith("image/") or
part.mimeType.startswith("video/") or
part.mimeType.startswith("audio/") or
self._isBinary(part.mimeType)
))
)
if isRenderable:
# NOTE(review): relatedExtractedPartId below uses an "extracted_..." id, but the extract
# branch further down appends the part under its own unchanged id - confirm consumers can resolve this link.
objectPart = ContentPart(
id=f"obj_{document.id}_{part.id}",
label=f"Object: {part.label or 'Content'}",
typeGroup=part.typeGroup,
mimeType=part.mimeType or "application/octet-stream",
data=part.data, # Base64/binary data is already present
metadata={
"contentFormat": "object",
"documentId": document.id,
"intent": "render",
"usageHint": f"Render as visual element: {preExtracted['originalDocument']['fileName']}",
"originalFileName": preExtracted["originalDocument"]["fileName"],
"relatedExtractedPartId": f"extracted_{document.id}_{part.id}" if hasExtractIntent else None
}
)
allContentParts.append(objectPart)
logger.debug(f"✅ Created object ContentPart for {part.id} (render intent)")
else:
logger.warning(f"⚠️ Part {part.id} has render intent but is not renderable (typeGroup={part.typeGroup}, mimeType={part.mimeType})")
elif hasRenderIntent and not hasPartData:
logger.warning(f"⚠️ Part {part.id} has render intent but no data, skipping render part")
# 3. Extract intent: create an extracted ContentPart (NO AI processing here - happens during section generation)
if hasExtractIntent:
# For images: Keep as image part with extract intent - Vision AI extraction happens during section generation
if part.typeGroup == "image" and hasPartData:
logger.info(f"📷 Image {part.id} with extract intent - will be processed with Vision AI during section generation")
# Keep image part as-is, mark with extract intent
part.metadata.update({
"contentFormat": "extracted", # Marked for extraction, but not yet extracted
"intent": "extract",
"originalFileName": preExtracted["originalDocument"]["fileName"],
"relatedObjectPartId": f"obj_{document.id}_{part.id}" if hasRenderIntent else None,
"extractionPrompt": intent.extractionPrompt if intent and intent.extractionPrompt else "Extract all text content from this image.",
"needsVisionExtraction": True # Flag to indicate Vision AI extraction needed
})
allContentParts.append(part)
originalPartAdded = True
else:
# For text/table content: Use directly as extracted (no AI processing here)
# AI processing with extractionPrompt happens during section generation
if not originalPartAdded:
part.metadata.update({
"contentFormat": "extracted",
"intent": "extract",
"fromExtractContent": True,
"skipExtraction": True, # Already extracted (raw extraction)
"originalFileName": preExtracted["originalDocument"]["fileName"],
"relatedObjectPartId": f"obj_{document.id}_{part.id}" if hasRenderIntent else None,
"extractionPrompt": intent.extractionPrompt if intent and intent.extractionPrompt else None
})
# Make sure contentFormat is set
if "contentFormat" not in part.metadata:
part.metadata["contentFormat"] = "extracted"
allContentParts.append(part)
originalPartAdded = True
logger.debug(f"✅ Using pre-extracted ContentPart {part.id} as extracted (no AI processing needed)")
# 4. Fallback: no recognized intent present and the part has not been added yet
# (should normally not happen because the default is "extract")
if not hasReferenceIntent and not hasRenderIntent and not hasExtractIntent and not originalPartAdded:
logger.warning(f"⚠️ Part {part.id} has no recognized intents, adding as extracted by default")
part.metadata.update({
"contentFormat": "extracted",
"intent": "extract",
"fromExtractContent": True,
"skipExtraction": True,
"originalFileName": preExtracted["originalDocument"]["fileName"]
})
allContentParts.append(part)
originalPartAdded = True
logger.info(f"✅ Using {len([p for p in contentExtracted.parts if p.data and len(str(p.data)) > 0])} pre-extracted ContentParts from ContentExtracted document {document.fileName}")
logger.info(f" Original document: {preExtracted['originalDocument']['fileName']}")
continue # Skip normal extraction for this document
# Check if it's standardized JSON format (has "documents" or "sections")
if document.mimeType == "application/json":
try:
docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
if docBytes:
docData = docBytes.decode('utf-8')
jsonData = json.loads(docData)
if isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData):
logger.info(f"Document is already in standardized JSON format, using as reference")
# Create reference ContentPart for structured JSON
contentPart = ContentPart(
id=f"ref_{document.id}",
label=f"Reference: {document.fileName}",
typeGroup="structure",
mimeType="application/json",
data=docData,
metadata={
"contentFormat": "reference",
"documentId": document.id,
"documentReference": f"docItem:{document.id}:{document.fileName}",
"skipExtraction": True,
"intent": "reference"
}
)
allContentParts.append(contentPart)
logger.info(f"✅ Using JSON document directly without extraction")
continue # Skip normal extraction for this document
except Exception as e:
logger.warning(f"Could not parse JSON document {document.fileName}, will extract normally: {str(e)}")
# Continue with normal extraction
# Normal extraction path
intent = getIntentForDocument(document.id, documentIntents)
if not intent:
# Try to find intent by similar UUID (fix for AI UUID hallucination)
correctedIntent = self._findIntentBySimilarId(document.id, documentIntents)
if correctedIntent:
logger.warning(f"Found intent for document {document.id} using UUID correction (original: {correctedIntent.documentId})")
# Create new intent with correct document ID
intent = DocumentIntent(
documentId=document.id,
intents=correctedIntent.intents,
extractionPrompt=correctedIntent.extractionPrompt,
reasoning=f"Intent matched by UUID similarity (original: {correctedIntent.documentId})"
)
else:
# Default: "extract" for all documents without an intent
logger.warning(f"No intent found for document {document.id}, using default 'extract'")
intent = DocumentIntent(
documentId=document.id,
intents=["extract"],
extractionPrompt="Extract all content from the document",
reasoning="Default intent: no specific intent found"
)
# IMPORTANT: Check all intents - one document can yield several ContentParts
if "reference" in intent.intents:
# Create the reference ContentPart
contentPart = ContentPart(
id=f"ref_{document.id}",
label=f"Reference: {document.fileName}",
typeGroup="reference",
mimeType=document.mimeType,
data="",
metadata={
"contentFormat": "reference",
"documentId": document.id,
"documentReference": f"docItem:{document.id}:{document.fileName}",
"intent": "reference",
"usageHint": f"Reference document: {document.fileName}"
}
)
allContentParts.append(contentPart)
# IMPORTANT: "render" and "extract" can both be present!
# In that case BOTH ContentParts are produced
if "render" in intent.intents:
# For images/binary: load as object
if document.mimeType.startswith("image/") or self._isBinary(document.mimeType):
try:
# Load the binary data (getFileData is not async - no await needed)
binaryData = self.services.interfaceDbComponent.getFileData(document.fileId)
if not binaryData:
logger.warning(f"No binary data found for document {document.id}")
# NOTE(review): this continue targets the document loop, so the "extract" branch
# below is skipped for this document as well - confirm this is intended.
continue
base64Data = base64.b64encode(binaryData).decode('utf-8')
contentPart = ContentPart(
id=f"obj_{document.id}",
label=f"Object: {document.fileName}",
typeGroup="image" if document.mimeType.startswith("image/") else "binary",
mimeType=document.mimeType,
data=base64Data,
metadata={
"contentFormat": "object",
"documentId": document.id,
"intent": "render",
"usageHint": f"Render as visual element: {document.fileName}",
"originalFileName": document.fileName,
# Link to the extracted part (if any)
# NOTE(review): extracted parts are renamed to f"ext_{document.id}_{part.id}" further down,
# so this id has no part suffix - confirm consumers can resolve this link.
"relatedExtractedPartId": f"ext_{document.id}" if "extract" in intent.intents else None
}
)
allContentParts.append(contentPart)
except Exception as e:
logger.error(f"Failed to load binary data for document {document.id}: {str(e)}")
if "extract" in intent.intents:
# Extract content with the extraction service
extractionPrompt = intent.extractionPrompt or "Extract all content from the document"
# Debug log (harmonized)
self.services.utils.writeDebugFile(
extractionPrompt,
f"content_extraction_prompt_{document.id}"
)
# Run the extraction
from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy
extractionOptions = ExtractionOptions(
prompt=extractionPrompt,
mergeStrategy=MergeStrategy()
)
# extractContent is not async - no await needed
checkWorkflowStopped(self.services)
extractedResults = self.services.extraction.extractContent(
[document],
extractionOptions,
operationId=extractionOperationId,
parentOperationId=extractionOperationId
)
# Convert the extraction results into ContentParts with metadata
for extracted in extractedResults:
for part in extracted.parts:
# Mark as extracted format
part.metadata.update({
"contentFormat": "extracted",
"documentId": document.id,
"extractionPrompt": extractionPrompt,
"intent": "extract",
"usageHint": f"Use extracted content from {document.fileName}",
# Link to the object part (if any)
"relatedObjectPartId": f"obj_{document.id}" if "render" in intent.intents else None
})
# For images: Mark that Vision AI extraction is needed during section generation
if part.typeGroup == "image":
part.metadata["needsVisionExtraction"] = True
logger.info(f"📷 Image part {part.id} marked for Vision AI extraction during section generation")
# Make sure the ID is unique (in case an object part exists)
if "render" in intent.intents:
part.id = f"ext_{document.id}_{part.id}"
allContentParts.append(part)
# Debug log (harmonized)
self.services.utils.writeDebugFile(
json.dumps([part.dict() for part in allContentParts], indent=2, default=str),
"content_extraction_result"
)
# State 2 Validation: Validate and auto-fix ContentParts
validatedParts = []
for part in allContentParts:
# Validation 2.1: Skip ContentParts without documentId
if not part.metadata.get("documentId"):
logger.warning(f"Skipping ContentPart {part.id} - missing documentId in metadata")
continue
# Validation 2.2: Skip ContentParts with invalid contentFormat
contentFormat = part.metadata.get("contentFormat")
if contentFormat not in ["extracted", "object", "reference"]:
logger.warning(
f"Skipping ContentPart {part.id} - invalid contentFormat: {contentFormat}"
)
continue
validatedParts.append(part)
# Finish the ChatLog progress entry
self.services.chat.progressLogFinish(extractionOperationId, True)
return validatedParts
except Exception as e:
self.services.chat.progressLogFinish(extractionOperationId, False)
logger.error(f"Error in extractAndPrepareContent: {str(e)}")
raise
async def extractTextFromImage(self, imagePart: ContentPart, extractionPrompt: str) -> Optional[str]:
    """
    Extract text from an image part using Vision AI.

    Args:
        imagePart: ContentPart with typeGroup="image"
        extractionPrompt: Prompt for the text extraction; a generic prompt is
            used when it is empty

    Returns:
        The extracted text with surrounding whitespace stripped. When the AI
        call fails or returns no content, a string of the form "[ERROR: ...]"
        is returned (not None) so the failure stays visible for debugging.

    Raises:
        Whatever checkWorkflowStopped raises; the stop check is deliberately
        not covered by the error handling of this method.
    """
    # Run the stop check outside the try block: inside it, the broad
    # `except Exception` below would swallow the signal and turn it into an
    # "[ERROR: ...]" result.
    # NOTE(review): assumes checkWorkflowStopped signals a stop by raising - confirm.
    checkWorkflowStopped(self.services)
    try:
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum

        # Final extraction prompt
        finalPrompt = extractionPrompt or "Extract all text content from this image. Return only the extracted text, no additional formatting."

        # Debug log (harmonized)
        self.services.utils.writeDebugFile(
            finalPrompt,
            f"content_extraction_prompt_image_{imagePart.id}"
        )

        # Build the AI call request carrying the image part
        request = AiCallRequest(
            prompt=finalPrompt,
            context="",
            options=AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE),
            contentParts=[imagePart]
        )

        # Use the AI service for Vision AI processing
        response = await self.aiService.callAi(request)

        if response and response.content:
            # Debug log for the response (harmonized)
            self.services.utils.writeDebugFile(
                response.content,
                f"content_extraction_response_image_{imagePart.id}"
            )
            return response.content.strip()

        # No content returned - hand back an error message for debugging
        errorMsg = f"Vision AI extraction failed: No content returned for image {imagePart.id}"
        logger.warning(errorMsg)
        return f"[ERROR: {errorMsg}]"
    except Exception as e:
        errorMsg = f"Vision AI extraction failed for image {imagePart.id}: {str(e)}"
        logger.error(errorMsg)
        import traceback
        logger.debug(f"Traceback: {traceback.format_exc()}")
        # Return an error message instead of None for debugging
        return f"[ERROR: {errorMsg}]"
async def processTextContentWithAi(self, textPart: ContentPart, extractionPrompt: str) -> Optional[str]:
    """
    Process text content with AI according to the extraction prompt.

    IMPORTANT: Pre-extracted ContentParts from context.extractContent contain RAW
    extracted text (e.g. from a PDF text layer). When the "extract" intent is
    present, this text has to be processed with AI (transformation, structuring,
    etc.) based on extractionPrompt.

    Args:
        textPart: ContentPart with typeGroup="text" (or another text-based type)
        extractionPrompt: Prompt for the AI processing of the text; a generic
            prompt is used when it is empty

    Returns:
        The AI-processed text with surrounding whitespace stripped. When the AI
        call fails or returns no content, a string of the form "[ERROR: ...]"
        is returned (not None) so the failure stays visible for debugging.

    Raises:
        Whatever checkWorkflowStopped raises; the stop check is deliberately
        not covered by the error handling of this method.
    """
    # Run the stop check outside the try block: inside it, the broad
    # `except Exception` below would swallow the signal and turn it into an
    # "[ERROR: ...]" result.
    # NOTE(review): assumes checkWorkflowStopped signals a stop by raising - confirm.
    checkWorkflowStopped(self.services)
    try:
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum

        # Final extraction prompt
        finalPrompt = extractionPrompt or "Process and extract the key information from the following text content."

        # Debug log (harmonized) - log the prompt together with a text preview
        textPreview = textPart.data[:500] + "..." if textPart.data and len(textPart.data) > 500 else (textPart.data or "")
        promptWithContext = f"{finalPrompt}\n\n--- Text Content (preview) ---\n{textPreview}"
        self.services.utils.writeDebugFile(
            promptWithContext,
            f"content_extraction_prompt_text_{textPart.id}"
        )

        # Build a plain-text ContentPart for the AI call, using the existing
        # text as input; metadata is copied so the source part is not shared
        textContentPart = ContentPart(
            id=textPart.id,
            label=textPart.label,
            typeGroup="text",
            mimeType="text/plain",
            data=textPart.data if textPart.data else "",
            metadata=textPart.metadata.copy() if textPart.metadata else {}
        )

        # Build the AI call request carrying the text part
        request = AiCallRequest(
            prompt=finalPrompt,
            context="",
            options=AiCallOptions(operationType=OperationTypeEnum.DATA_EXTRACT),
            contentParts=[textContentPart]
        )

        # Use the AI service for text processing
        response = await self.aiService.callAi(request)

        if response and response.content:
            # Debug log for the response (harmonized)
            self.services.utils.writeDebugFile(
                response.content,
                f"content_extraction_response_text_{textPart.id}"
            )
            return response.content.strip()

        # No content returned - hand back an error message for debugging
        errorMsg = f"AI text processing failed: No content returned for text part {textPart.id}"
        logger.warning(errorMsg)
        return f"[ERROR: {errorMsg}]"
    except Exception as e:
        errorMsg = f"AI text processing failed for text part {textPart.id}: {str(e)}"
        logger.error(errorMsg)
        import traceback
        logger.debug(f"Traceback: {traceback.format_exc()}")
        # Return an error message instead of None for debugging
        return f"[ERROR: {errorMsg}]"
def _isBinary(self, mimeType: str) -> bool:
"""Prüfe ob MIME-Type binary ist."""
binaryTypes = [
"application/octet-stream",
"application/pdf",
"application/zip",
"application/x-zip-compressed"
]
return mimeType in binaryTypes or mimeType.startswith("image/") or mimeType.startswith("video/") or mimeType.startswith("audio/")
def _extractNestedPartsFromStructure(
    self,
    structurePart: ContentPart,
    document: ChatDocument,
    preExtracted: Dict[str, Any],
    intent: Optional[Any]
) -> List[ContentPart]:
    """
    Extract nested parts from a structure ContentPart (e.g., JSON with documentData.parts).

    The data of the structure part is parsed as JSON. If it is an object with a
    "documentData" object holding a non-empty "parts" list, every dict entry of
    that list becomes its own ContentPart. Fields that are missing or explicitly
    null fall back to defaults (typeGroup "text", mimeType "text/plain", the
    label of the structure part, empty data, empty metadata).

    Args:
        structurePart: ContentPart with typeGroup="structure" containing nested parts
        document: The document this part belongs to
        preExtracted: Pre-extracted document metadata; its
            ["originalDocument"]["fileName"] is copied into each nested part
        intent: Document intent for nested parts (not used by this method)

    Returns:
        List of extracted ContentParts, empty if no nested parts found or the
        data is not valid JSON. Errors are logged, not raised.
    """
    nestedParts = []

    def fieldOrDefault(source, key, default):
        # Treat an explicit JSON null like a missing key
        value = source.get(key)
        return default if value is None else value

    try:
        # Parse JSON structure
        jsonData = json.loads(structurePart.data)

        # Check for standard ContentExtracted format: documentData.parts
        documentData = jsonData.get("documentData") if isinstance(jsonData, dict) else None
        parts = documentData.get("parts", []) if isinstance(documentData, dict) else None
        if not isinstance(parts, list):
            parts = []

        for nestedPartData in parts:
            if not isinstance(nestedPartData, dict):
                continue

            nestedPartId = nestedPartData.get("id") or f"nested_{len(nestedParts)}"
            nestedTypeGroup = fieldOrDefault(nestedPartData, "typeGroup", "text")
            nestedMimeType = fieldOrDefault(nestedPartData, "mimeType", "text/plain")
            nestedLabel = fieldOrDefault(nestedPartData, "label", structurePart.label)
            nestedData = fieldOrDefault(nestedPartData, "data", "")
            # Metadata must be a dict to be merged below; a null or malformed
            # value would otherwise abort the extraction of all remaining parts
            nestedMetadata = nestedPartData.get("metadata")
            if not isinstance(nestedMetadata, dict):
                nestedMetadata = {}

            # Create ContentPart for nested part; the keys set here override
            # same-named keys of the nested metadata
            nestedPart = ContentPart(
                id=f"{structurePart.id}_{nestedPartId}",
                parentId=structurePart.id,
                label=nestedLabel,
                typeGroup=nestedTypeGroup,
                mimeType=nestedMimeType,
                data=nestedData,
                metadata={
                    **nestedMetadata,
                    "documentId": document.id,
                    "fromNestedStructure": True,
                    "parentStructurePartId": structurePart.id,
                    "originalFileName": preExtracted["originalDocument"]["fileName"]
                }
            )
            nestedParts.append(nestedPart)
            logger.debug(f"✅ Extracted nested part: {nestedPart.id} (typeGroup={nestedTypeGroup}, mimeType={nestedMimeType})")

        # If no nested parts found, return empty list (original part will be kept)
        if not nestedParts:
            logger.debug(f"No nested parts found in structure part {structurePart.id}")
    except json.JSONDecodeError as e:
        logger.warning(f"Could not parse structure part {structurePart.id} as JSON: {str(e)}")
    except Exception as e:
        logger.error(f"Error extracting nested parts from structure part {structurePart.id}: {str(e)}")
    return nestedParts
def _findIntentBySimilarId(self, documentId: str, documentIntents: List[DocumentIntent]) -> Optional[DocumentIntent]:
"""
Versucht ein Intent zu finden, dessen UUID ähnlich zur angegebenen Dokument-ID ist.
Dies hilft bei AI UUID-Halluzinationen (z.B. 4451 -> 4551).
Args:
documentId: Die Dokument-ID für die ein Intent gesucht wird
documentIntents: Liste aller verfügbaren DocumentIntents
Returns:
DocumentIntent mit ähnlicher UUID falls gefunden, sonst None
"""
if not documentId or len(documentId) != 36: # UUID Format: 8-4-4-4-12
return None
# Prüfe ob es eine UUID ist (Format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx)
if documentId.count('-') != 4:
return None
for intent in documentIntents:
intentId = intent.documentId
if len(intentId) != 36:
continue
# Zähle unterschiedliche Zeichen
differences = sum(c1 != c2 for c1, c2 in zip(documentId, intentId))
# Wenn nur 1-2 Zeichen unterschiedlich sind, ist es wahrscheinlich ein Typo
if differences <= 2:
# Prüfe ob die Struktur ähnlich ist (gleiche Positionen der Bindestriche)
if documentId.count('-') == intentId.count('-'):
return intent
return None