# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Content Extraction Module

Handles content extraction and preparation, including:
- Extracting content from documents based on intents
- Processing pre-extracted documents
- Vision AI for image text extraction
- AI processing of text content
"""
import json
import logging
import base64
from typing import Dict, Any, Callable, List, Optional

from modules.datamodels.datamodelChatbot import ChatDocument
from modules.datamodels.datamodelExtraction import ContentPart, DocumentIntent
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped

logger = logging.getLogger(__name__)


class ContentExtractor:
    """Handles content extraction and preparation."""

    def __init__(self, services, aiService, intentAnalyzer):
        """Initialize ContentExtractor with service center, AI service, and intent analyzer access."""
        self.services = services
        self.aiService = aiService
        self.intentAnalyzer = intentAnalyzer

    async def extractAndPrepareContent(
        self,
        documents: List[ChatDocument],
        documentIntents: List[DocumentIntent],
        parentOperationId: str,
        getIntentForDocument: Callable[..., Optional[DocumentIntent]]
    ) -> List[ContentPart]:
        """
        Phase 5B: Extract content based on intents and prepare ContentParts with metadata.

        IMPORTANT: A single document can produce several ContentParts when it has
        several intents. Example: an image with intents=["extract", "render"] produces:
        - ContentPart(contentFormat="object", ...) for rendering
        - ContentPart(contentFormat="extracted", ...) for text analysis

        Per document, three paths are tried in order:
        1. Pre-extracted document (resolved by the intent analyzer): its parts are
           reused directly; no AI processing happens here.
        2. JSON document containing a top-level "documents" or "sections" key:
           passed through as a single reference part.
        3. Normal path: reference / object / extracted parts are created according
           to the document intent (default intent is "extract").

        Args:
            documents: Documents to process.
            documentIntents: DocumentIntent objects for the documents.
            parentOperationId: Parent operation ID for the ChatLog hierarchy.
            getIntentForDocument: Callable invoked as
                getIntentForDocument(documentId, documentIntents); returns the
                matching intent or a falsy value.

        Returns:
            ContentParts whose metadata contains a documentId and a contentFormat of
            "extracted", "object" or "reference". Parts failing this validation are
            dropped with a warning.

        Raises:
            Exception: Any error raised during processing is logged, the progress
                log is closed as failed, and the error is re-raised.
        """
        # Operation ID for the extraction step
        extractionOperationId = f"{parentOperationId}_content_extraction"

        # Start the ChatLog entry with a reference to the parent operation
        self.services.chat.progressLogStart(
            extractionOperationId,
            "Content Extraction",
            "Extraction",
            f"Extracting from {len(documents)} documents",
            parentOperationId=parentOperationId
        )

        try:
            allContentParts = []

            for document in documents:
                checkWorkflowStopped(self.services)

                # Check if document is already a ContentExtracted document (pre-extracted JSON)
                logger.debug(f"Checking document {document.id} ({document.fileName}, mimeType={document.mimeType}) for pre-extracted content")
                preExtracted = self.intentAnalyzer.resolvePreExtractedDocument(document)

                if preExtracted:
                    originalFileName = preExtracted["originalDocument"]["fileName"]
                    logger.info(f"✅ Found pre-extracted document: {document.fileName} -> Original: {originalFileName}")
                    logger.info(f" Pre-extracted document ID: {document.id}, Original document ID: {preExtracted['originalDocument']['id']}")
                    logger.info(f" ContentParts count: {len(preExtracted['contentExtracted'].parts) if preExtracted['contentExtracted'].parts else 0}")

                    # Use the already extracted ContentParts directly
                    contentExtracted = preExtracted["contentExtracted"]

                    # IMPORTANT: the intent must be looked up for the JSON document, not the original
                    # (intent analysis already maps back to the JSON document ID)
                    intent = getIntentForDocument(document.id, documentIntents)
                    logger.info(f" Intent lookup for document {document.id}: found={intent is not None}")
                    if intent:
                        logger.info(f" Intent: {intent.intents}, extractionPrompt: {intent.extractionPrompt[:100] if intent.extractionPrompt else None}...")
                    else:
                        logger.warning(f" ⚠️ No intent found for pre-extracted document {document.id}! Available intent documentIds: {[i.documentId for i in documentIntents]}")

                    if contentExtracted.parts:
                        # CRITICAL: Process pre-extracted parts - analyze structure parts for nested content
                        processedParts = []
                        for part in contentExtracted.parts:
                            # Skip empty parts (containers without data)
                            if not part.data or (isinstance(part.data, str) and len(part.data.strip()) == 0):
                                if part.typeGroup == "container":
                                    continue  # Skip empty containers

                            # CRITICAL: Check if structure part contains nested parts (e.g., JSON with documentData.parts)
                            if part.typeGroup == "structure" and part.mimeType == "application/json" and part.data:
                                nestedParts = self._extractNestedPartsFromStructure(part, document, preExtracted, intent)
                                if nestedParts:
                                    # Replace structure part with extracted nested parts
                                    processedParts.extend(nestedParts)
                                    logger.info(f"✅ Extracted {len(nestedParts)} nested parts from structure part {part.id}")
                                    continue  # Skip original structure part

                            # Keep original part if no nested parts found
                            processedParts.append(part)

                        # Use processed parts (with nested parts extracted)
                        for part in processedParts:
                            if not part.metadata:
                                part.metadata = {}

                            # Ensure metadata is complete
                            if "documentId" not in part.metadata:
                                part.metadata["documentId"] = document.id

                            # IMPORTANT: determine the intents for this part (default: extract)
                            partIntent = intent.intents if intent else ["extract"]

                            # Debug logging for intent processing
                            logger.debug(f"Processing part {part.id}: typeGroup={part.typeGroup}, intents={partIntent}, hasData={bool(part.data)}, dataLength={len(str(part.data)) if part.data else 0}")

                            # IMPORTANT: a part can have several intents - one ContentPart is created per intent.
                            # Generic intent handling for ALL content types.
                            hasReferenceIntent = "reference" in partIntent
                            hasRenderIntent = "render" in partIntent
                            hasExtractIntent = "extract" in partIntent
                            hasPartData = bool(part.data) and (not isinstance(part.data, str) or len(part.data.strip()) > 0)

                            logger.debug(f"Part {part.id}: reference={hasReferenceIntent}, render={hasRenderIntent}, extract={hasExtractIntent}, hasData={hasPartData}")

                            # SAFETY: For images with any intent, always ensure render is included
                            # This ensures the image object part is always available for later rendering
                            isImage = part.typeGroup == "image" or (part.mimeType and part.mimeType.startswith("image/"))
                            if isImage and hasPartData and not hasRenderIntent:
                                logger.info(f"🖼️ Auto-adding render intent for image {part.id} (original intents: {partIntent})")
                                hasRenderIntent = True

                            # Tracks whether the original part object was already appended
                            originalPartAdded = False

                            # 1. Reference intent: create a reference ContentPart
                            if hasReferenceIntent:
                                referencePart = ContentPart(
                                    id=f"ref_{document.id}_{part.id}",
                                    label=f"Reference: {part.label or 'Content'}",
                                    typeGroup="reference",
                                    mimeType=part.mimeType or "application/octet-stream",
                                    data="",  # Empty - reference only
                                    metadata={
                                        "contentFormat": "reference",
                                        "documentId": document.id,
                                        "documentReference": f"docItem:{document.id}:{originalFileName}",
                                        "intent": "reference",
                                        "usageHint": f"Reference: {originalFileName}",
                                        "originalFileName": originalFileName
                                    }
                                )
                                allContentParts.append(referencePart)
                                logger.debug(f"✅ Created reference ContentPart for {part.id}")

                            # 2. Render intent: create an object ContentPart (for binary/image rendering)
                            if hasRenderIntent and hasPartData:
                                # Check whether the part is a binary/image (i.e. can be rendered)
                                isRenderable = (
                                    part.typeGroup == "image" or
                                    part.typeGroup == "binary" or
                                    (part.mimeType and (
                                        part.mimeType.startswith("image/") or
                                        part.mimeType.startswith("video/") or
                                        part.mimeType.startswith("audio/") or
                                        self._isBinary(part.mimeType)
                                    ))
                                )

                                if isRenderable:
                                    objectPart = ContentPart(
                                        id=f"obj_{document.id}_{part.id}",
                                        label=f"Object: {part.label or 'Content'}",
                                        typeGroup=part.typeGroup,
                                        mimeType=part.mimeType or "application/octet-stream",
                                        data=part.data,  # Base64/binary data is already present
                                        metadata={
                                            "contentFormat": "object",
                                            "documentId": document.id,
                                            "intent": "render",
                                            "usageHint": f"Render as visual element: {originalFileName}",
                                            "originalFileName": originalFileName,
                                            "relatedExtractedPartId": f"extracted_{document.id}_{part.id}" if hasExtractIntent else None
                                        }
                                    )
                                    allContentParts.append(objectPart)
                                    logger.debug(f"✅ Created object ContentPart for {part.id} (render intent)")
                                else:
                                    logger.warning(f"⚠️ Part {part.id} has render intent but is not renderable (typeGroup={part.typeGroup}, mimeType={part.mimeType})")
                            elif hasRenderIntent and not hasPartData:
                                logger.warning(f"⚠️ Part {part.id} has render intent but no data, skipping render part")

                            # 3. Extract intent: create extracted ContentPart
                            # (NO AI processing here - happens during section generation)
                            if hasExtractIntent:
                                # For images: Keep as image part with extract intent - Vision AI extraction happens during section generation
                                if part.typeGroup == "image" and hasPartData:
                                    logger.info(f"📷 Image {part.id} with extract intent - will be processed with Vision AI during section generation")
                                    # Keep image part as-is, mark with extract intent
                                    part.metadata.update({
                                        "contentFormat": "extracted",  # Marked for extraction, but not yet extracted
                                        "intent": "extract",
                                        "originalFileName": originalFileName,
                                        "relatedObjectPartId": f"obj_{document.id}_{part.id}" if hasRenderIntent else None,
                                        "extractionPrompt": intent.extractionPrompt if intent and intent.extractionPrompt else "Extract all text content from this image.",
                                        "needsVisionExtraction": True  # Flag to indicate Vision AI extraction needed
                                    })
                                    allContentParts.append(part)
                                    originalPartAdded = True
                                else:
                                    # For text/table content: Use directly as extracted (no AI processing here)
                                    # AI processing with extractionPrompt happens during section generation
                                    if not originalPartAdded:
                                        part.metadata.update({
                                            "contentFormat": "extracted",
                                            "intent": "extract",
                                            "fromExtractContent": True,
                                            "skipExtraction": True,  # Already extracted (raw extraction)
                                            "originalFileName": originalFileName,
                                            "relatedObjectPartId": f"obj_{document.id}_{part.id}" if hasRenderIntent else None,
                                            "extractionPrompt": intent.extractionPrompt if intent and intent.extractionPrompt else None
                                        })
                                        # Make sure contentFormat is set
                                        if "contentFormat" not in part.metadata:
                                            part.metadata["contentFormat"] = "extracted"
                                        allContentParts.append(part)
                                        originalPartAdded = True
                                        logger.debug(f"✅ Using pre-extracted ContentPart {part.id} as extracted (no AI processing needed)")

                            # 4. Fallback: no recognized intent and the part was not added yet
                            # (should normally not happen because the default is "extract")
                            if not hasReferenceIntent and not hasRenderIntent and not hasExtractIntent and not originalPartAdded:
                                logger.warning(f"⚠️ Part {part.id} has no recognized intents, adding as extracted by default")
                                part.metadata.update({
                                    "contentFormat": "extracted",
                                    "intent": "extract",
                                    "fromExtractContent": True,
                                    "skipExtraction": True,
                                    "originalFileName": originalFileName
                                })
                                allContentParts.append(part)
                                originalPartAdded = True

                        logger.info(f"✅ Using {len([p for p in contentExtracted.parts if p.data and len(str(p.data)) > 0])} pre-extracted ContentParts from ContentExtracted document {document.fileName}")
                        logger.info(f" Original document: {originalFileName}")
                        continue  # Skip normal extraction for this document

                # Check if it's standardized JSON format (has "documents" or "sections")
                if document.mimeType == "application/json":
                    try:
                        docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
                        if docBytes:
                            docData = docBytes.decode('utf-8')
                            jsonData = json.loads(docData)

                            if isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData):
                                logger.info("Document is already in standardized JSON format, using as reference")
                                # Create reference ContentPart for structured JSON
                                contentPart = ContentPart(
                                    id=f"ref_{document.id}",
                                    label=f"Reference: {document.fileName}",
                                    typeGroup="structure",
                                    mimeType="application/json",
                                    data=docData,
                                    metadata={
                                        "contentFormat": "reference",
                                        "documentId": document.id,
                                        "documentReference": f"docItem:{document.id}:{document.fileName}",
                                        "skipExtraction": True,
                                        "intent": "reference"
                                    }
                                )
                                allContentParts.append(contentPart)
                                logger.info("✅ Using JSON document directly without extraction")
                                continue  # Skip normal extraction for this document
                    except Exception as e:
                        logger.warning(f"Could not parse JSON document {document.fileName}, will extract normally: {str(e)}")
                        # Continue with normal extraction

                # Normal extraction path
                intent = getIntentForDocument(document.id, documentIntents)

                if not intent:
                    # Try to find intent by similar UUID (fix for AI UUID hallucination)
                    correctedIntent = self._findIntentBySimilarId(document.id, documentIntents)
                    if correctedIntent:
                        logger.warning(f"Found intent for document {document.id} using UUID correction (original: {correctedIntent.documentId})")
                        # Create new intent with correct document ID
                        intent = DocumentIntent(
                            documentId=document.id,
                            intents=correctedIntent.intents,
                            extractionPrompt=correctedIntent.extractionPrompt,
                            reasoning=f"Intent matched by UUID similarity (original: {correctedIntent.documentId})"
                        )
                    else:
                        # Default: "extract" for every document without an intent
                        logger.warning(f"No intent found for document {document.id}, using default 'extract'")
                        intent = DocumentIntent(
                            documentId=document.id,
                            intents=["extract"],
                            extractionPrompt="Extract all content from the document",
                            reasoning="Default intent: no specific intent found"
                        )

                # IMPORTANT: check all intents - a document can produce several ContentParts
                if "reference" in intent.intents:
                    # Create the reference ContentPart
                    contentPart = ContentPart(
                        id=f"ref_{document.id}",
                        label=f"Reference: {document.fileName}",
                        typeGroup="reference",
                        mimeType=document.mimeType,
                        data="",
                        metadata={
                            "contentFormat": "reference",
                            "documentId": document.id,
                            "documentReference": f"docItem:{document.id}:{document.fileName}",
                            "intent": "reference",
                            "usageHint": f"Reference document: {document.fileName}"
                        }
                    )
                    allContentParts.append(contentPart)

                # IMPORTANT: "render" and "extract" can both be present!
                # In that case BOTH ContentParts are created.

                # SAFETY: For images with any intent, always create object part for later rendering
                # bool(...) keeps a missing mimeType (None) from leaking into later checks.
                isImageDocument = bool(document.mimeType) and document.mimeType.startswith("image/")
                shouldAutoRender = isImageDocument and "render" not in intent.intents and ("extract" in intent.intents or "reference" in intent.intents)

                if shouldAutoRender:
                    logger.info(f"🖼️ Auto-adding render for image document {document.id} (original intents: {intent.intents})")

                if "render" in intent.intents or shouldAutoRender:
                    # For images/binary: load the raw bytes as an object part.
                    # _isBinary tolerates a missing mimeType, so a render intent on a
                    # document without mimeType no longer raises AttributeError.
                    if isImageDocument or self._isBinary(document.mimeType):
                        try:
                            # Load binary data (getFileData is not async - no await needed)
                            binaryData = self.services.interfaceDbComponent.getFileData(document.fileId)
                            if not binaryData:
                                logger.warning(f"No binary data found for document {document.id}")
                                # NOTE(review): this skips the rest of the document, including a
                                # possible "extract" intent - kept as in the original; confirm intended.
                                continue
                            base64Data = base64.b64encode(binaryData).decode('utf-8')

                            contentPart = ContentPart(
                                id=f"obj_{document.id}",
                                label=f"Object: {document.fileName}",
                                typeGroup="image" if isImageDocument else "binary",
                                mimeType=document.mimeType,
                                data=base64Data,
                                metadata={
                                    "contentFormat": "object",
                                    "documentId": document.id,
                                    "intent": "render",
                                    "usageHint": f"Render as visual element: {document.fileName}",
                                    "originalFileName": document.fileName,
                                    # Link to the extracted part (if present)
                                    "relatedExtractedPartId": f"ext_{document.id}" if "extract" in intent.intents else None
                                }
                            )
                            allContentParts.append(contentPart)
                        except Exception as e:
                            logger.error(f"Failed to load binary data for document {document.id}: {str(e)}")

                if "extract" in intent.intents:
                    # Extract content with the extraction service
                    extractionPrompt = intent.extractionPrompt or "Extract all content from the document"

                    # Debug log (harmonized)
                    self.services.utils.writeDebugFile(
                        extractionPrompt,
                        f"content_extraction_prompt_{document.id}"
                    )

                    # Run the extraction
                    from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy

                    extractionOptions = ExtractionOptions(
                        prompt=extractionPrompt,
                        mergeStrategy=MergeStrategy()
                    )

                    # extractContent is not async - no await needed
                    checkWorkflowStopped(self.services)
                    extractedResults = self.services.extraction.extractContent(
                        [document],
                        extractionOptions,
                        operationId=extractionOperationId,
                        parentOperationId=extractionOperationId
                    )

                    # Convert extracted results to ContentParts with metadata
                    # Check if object part exists (either explicit render or auto-render for images)
                    hasObjectPart = "render" in intent.intents or shouldAutoRender

                    # `or []` tolerates a service that returns None / results without parts.
                    for extracted in extractedResults or []:
                        for part in extracted.parts or []:
                            # Same guard as the pre-extracted path: metadata may be unset.
                            if not part.metadata:
                                part.metadata = {}

                            # Mark as extracted format
                            part.metadata.update({
                                "contentFormat": "extracted",
                                "documentId": document.id,
                                "extractionPrompt": extractionPrompt,
                                "intent": "extract",
                                "usageHint": f"Use extracted content from {document.fileName}",
                                # Link to the object part (if present - including auto-render for images)
                                "relatedObjectPartId": f"obj_{document.id}" if hasObjectPart else None
                            })

                            # For images: Mark that Vision AI extraction is needed during section generation
                            if part.typeGroup == "image":
                                part.metadata["needsVisionExtraction"] = True
                                logger.info(f"📷 Image part {part.id} marked for Vision AI extraction during section generation")

                            # Make sure the ID is unique (in case an object part exists)
                            if hasObjectPart:
                                part.id = f"ext_{document.id}_{part.id}"

                            allContentParts.append(part)

            # Debug log (harmonized)
            self.services.utils.writeDebugFile(
                json.dumps([part.dict() for part in allContentParts], indent=2, default=str),
                "content_extraction_result"
            )

            # State 2 Validation: Validate and auto-fix ContentParts
            validatedParts = []
            for part in allContentParts:
                partMetadata = part.metadata or {}

                # Validation 2.1: Skip ContentParts without documentId
                if not partMetadata.get("documentId"):
                    logger.warning(f"Skipping ContentPart {part.id} - missing documentId in metadata")
                    continue

                # Validation 2.2: Skip ContentParts with invalid contentFormat
                contentFormat = partMetadata.get("contentFormat")
                if contentFormat not in ["extracted", "object", "reference"]:
                    logger.warning(
                        f"Skipping ContentPart {part.id} - invalid contentFormat: {contentFormat}"
                    )
                    continue

                validatedParts.append(part)

            # Close the ChatLog entry
            self.services.chat.progressLogFinish(extractionOperationId, True)

            return validatedParts

        except Exception as e:
            self.services.chat.progressLogFinish(extractionOperationId, False)
            logger.error(f"Error in extractAndPrepareContent: {str(e)}")
            raise

    async def extractTextFromImage(self, imagePart: ContentPart, extractionPrompt: str) -> Optional[str]:
        """
        Extract text from an image part with Vision AI.

        Args:
            imagePart: ContentPart with typeGroup="image".
            extractionPrompt: Prompt for the text extraction; a generic default
                prompt is used when it is empty.

        Returns:
            The stripped response content on success. On failure (exception or
            empty response) a string of the form "[ERROR: <message>]" is returned
            instead of None so the failure stays visible for debugging.
        """
        try:
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum

            # Final extraction prompt
            finalPrompt = extractionPrompt or "Extract all text content from this image. Return only the extracted text, no additional formatting."

            # Debug log (harmonized)
            self.services.utils.writeDebugFile(
                finalPrompt,
                f"content_extraction_prompt_image_{imagePart.id}"
            )

            # Build the AI call request carrying the image part
            request = AiCallRequest(
                prompt=finalPrompt,
                context="",
                options=AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE),
                contentParts=[imagePart]
            )

            # Use the AI service for Vision AI processing
            checkWorkflowStopped(self.services)
            response = await self.aiService.callAi(request)

            if response and response.content:
                # Debug log for the response (harmonized)
                self.services.utils.writeDebugFile(
                    response.content,
                    f"content_extraction_response_image_{imagePart.id}"
                )
                return response.content.strip()

            # No content returned - return an error message for debugging
            errorMsg = f"Vision AI extraction failed: No content returned for image {imagePart.id}"
            logger.warning(errorMsg)
            return f"[ERROR: {errorMsg}]"
        except Exception as e:
            errorMsg = f"Vision AI extraction failed for image {imagePart.id}: {str(e)}"
            logger.error(errorMsg)
            import traceback
            logger.debug(f"Traceback: {traceback.format_exc()}")
            # Return the error message instead of None for debugging
            return f"[ERROR: {errorMsg}]"

    async def processTextContentWithAi(self, textPart: ContentPart, extractionPrompt: str) -> Optional[str]:
        """
        Process text content with AI based on extractionPrompt.

        IMPORTANT: Pre-extracted ContentParts from context.extractContent contain RAW
        extracted text (e.g. from a PDF text layer). When the "extract" intent is
        present, this text has to be processed with AI (transformation, structuring,
        etc.) based on extractionPrompt.

        Args:
            textPart: ContentPart with typeGroup="text" (or another text-based type).
            extractionPrompt: Prompt for the AI processing of the text; a generic
                default prompt is used when it is empty.

        Returns:
            The stripped response content on success. On failure (exception or
            empty response) a string of the form "[ERROR: <message>]" is returned
            instead of None so the failure stays visible for debugging.
        """
        try:
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum

            # Final extraction prompt
            finalPrompt = extractionPrompt or "Process and extract the key information from the following text content."

            # Debug log (harmonized) - log prompt with text preview (first 500 characters)
            textPreview = textPart.data[:500] + "..." if textPart.data and len(textPart.data) > 500 else (textPart.data or "")
            promptWithContext = f"{finalPrompt}\n\n--- Text Content (preview) ---\n{textPreview}"
            self.services.utils.writeDebugFile(
                promptWithContext,
                f"content_extraction_prompt_text_{textPart.id}"
            )

            # Build a plain-text ContentPart for the AI call, using the existing text
            # as input; the metadata is copied so the caller's part is not mutated.
            textContentPart = ContentPart(
                id=textPart.id,
                label=textPart.label,
                typeGroup="text",
                mimeType="text/plain",
                data=textPart.data if textPart.data else "",
                metadata=textPart.metadata.copy() if textPart.metadata else {}
            )

            # Build the AI call request carrying the text part
            request = AiCallRequest(
                prompt=finalPrompt,
                context="",
                options=AiCallOptions(operationType=OperationTypeEnum.DATA_EXTRACT),
                contentParts=[textContentPart]
            )

            # Use the AI service for text processing
            checkWorkflowStopped(self.services)
            response = await self.aiService.callAi(request)

            if response and response.content:
                # Debug log for the response (harmonized)
                self.services.utils.writeDebugFile(
                    response.content,
                    f"content_extraction_response_text_{textPart.id}"
                )
                return response.content.strip()

            # No content returned - return an error message for debugging
            errorMsg = f"AI text processing failed: No content returned for text part {textPart.id}"
            logger.warning(errorMsg)
            return f"[ERROR: {errorMsg}]"
        except Exception as e:
            errorMsg = f"AI text processing failed for text part {textPart.id}: {str(e)}"
            logger.error(errorMsg)
            import traceback
            logger.debug(f"Traceback: {traceback.format_exc()}")
            # Return the error message instead of None for debugging
            return f"[ERROR: {errorMsg}]"

    def _isBinary(self, mimeType: Optional[str]) -> bool:
        """
        Check whether a MIME type is treated as binary.

        Args:
            mimeType: MIME type to classify; None or an empty string is allowed.

        Returns:
            True for the listed application/* types and for any image/, video/
            or audio/ type; False otherwise, including for a missing MIME type.
        """
        if not mimeType:
            # A missing MIME type cannot be classified as binary.
            return False
        binaryTypes = (
            "application/octet-stream",
            "application/pdf",
            "application/zip",
            "application/x-zip-compressed"
        )
        return mimeType in binaryTypes or mimeType.startswith(("image/", "video/", "audio/"))

    def _extractNestedPartsFromStructure(
        self,
        structurePart: ContentPart,
        document: ChatDocument,
        preExtracted: Dict[str, Any],
        intent: Optional[Any]
    ) -> List[ContentPart]:
        """
        Extract nested parts from a structure ContentPart (e.g., JSON with documentData.parts).

        This is a generic function that analyzes pre-processed ContentParts and extracts
        any nested parts that are embedded in structure data (typically JSON).
        Works with standard ContentExtracted format: documentData.parts array.
        Each nested part is extracted as a separate ContentPart with proper metadata.

        Args:
            structurePart: ContentPart with typeGroup="structure" containing nested parts
            document: The document this part belongs to
            preExtracted: Pre-extracted document metadata; its
                ["originalDocument"]["fileName"] is copied into each nested part
            intent: Document intent for nested parts (currently not read here)

        Returns:
            List of extracted ContentParts, empty if no nested parts found. Errors
            are logged and never raised; parts extracted before an error are kept.
        """
        nestedParts = []

        try:
            # Parse JSON structure
            jsonData = json.loads(structurePart.data)

            # Check for standard ContentExtracted format: documentData.parts
            if isinstance(jsonData, dict):
                documentData = jsonData.get("documentData")
                if isinstance(documentData, dict):
                    parts = documentData.get("parts", [])
                    if isinstance(parts, list) and len(parts) > 0:
                        # Extract each nested part
                        for nestedPartData in parts:
                            if not isinstance(nestedPartData, dict):
                                continue

                            nestedPartId = nestedPartData.get("id") or f"nested_{len(nestedParts)}"
                            nestedTypeGroup = nestedPartData.get("typeGroup", "text")
                            nestedMimeType = nestedPartData.get("mimeType", "text/plain")
                            nestedLabel = nestedPartData.get("label", structurePart.label)
                            nestedData = nestedPartData.get("data", "")
                            # JSON may carry "metadata": null (or a non-object); unpacking
                            # that with ** would raise and abort the remaining parts.
                            nestedMetadata = nestedPartData.get("metadata")
                            if not isinstance(nestedMetadata, dict):
                                nestedMetadata = {}

                            # Create ContentPart for nested part
                            nestedPart = ContentPart(
                                id=f"{structurePart.id}_{nestedPartId}",
                                parentId=structurePart.id,
                                label=nestedLabel,
                                typeGroup=nestedTypeGroup,
                                mimeType=nestedMimeType,
                                data=nestedData,
                                metadata={
                                    **nestedMetadata,
                                    "documentId": document.id,
                                    "fromNestedStructure": True,
                                    "parentStructurePartId": structurePart.id,
                                    "originalFileName": preExtracted["originalDocument"]["fileName"]
                                }
                            )
                            nestedParts.append(nestedPart)
                            logger.debug(f"✅ Extracted nested part: {nestedPart.id} (typeGroup={nestedTypeGroup}, mimeType={nestedMimeType})")

            # If no nested parts found, return empty list (original part will be kept)
            if not nestedParts:
                logger.debug(f"No nested parts found in structure part {structurePart.id}")

        except json.JSONDecodeError as e:
            logger.warning(f"Could not parse structure part {structurePart.id} as JSON: {str(e)}")
        except Exception as e:
            logger.error(f"Error extracting nested parts from structure part {structurePart.id}: {str(e)}")

        return nestedParts

    def _findIntentBySimilarId(self, documentId: str, documentIntents: List[DocumentIntent]) -> Optional[DocumentIntent]:
        """
        Try to find an intent whose UUID is similar to the given document ID.

        This helps with AI UUID hallucinations (e.g. 4451 -> 4551).

        Args:
            documentId: The document ID an intent is searched for.
            documentIntents: All available DocumentIntents.

        Returns:
            The DocumentIntent whose 36-character ID differs from documentId in
            the fewest positions (at most 2) and has its hyphens at the same
            positions; the first such intent on a tie. None if documentId is not
            UUID-shaped or no candidate qualifies.
        """
        # UUID format: 8-4-4-4-12 -> 36 characters with exactly 4 hyphens
        if not isinstance(documentId, str) or len(documentId) != 36:
            return None
        if documentId.count('-') != 4:
            return None

        maxDifferences = 2
        hyphenPositions = [index for index, char in enumerate(documentId) if char == '-']

        bestIntent = None
        bestDifferences = maxDifferences + 1
        for intent in documentIntents or []:
            intentId = intent.documentId
            # Skip intents without a UUID-shaped string ID (also guards against None)
            if not isinstance(intentId, str) or len(intentId) != 36:
                continue

            # The structure must match: hyphens at the same positions
            if [index for index, char in enumerate(intentId) if char == '-'] != hyphenPositions:
                continue

            # Count differing characters
            differences = sum(c1 != c2 for c1, c2 in zip(documentId, intentId))

            # Only 1-2 differing characters is most likely a typo; keep the closest
            # candidate instead of the first one encountered.
            if differences < bestDifferences:
                bestIntent = intent
                bestDifferences = differences

        return bestIntent