gateway/modules/services/serviceAi/subContentExtraction.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Content Extraction Module

Handles content extraction and preparation, including:
- Extracting content from documents based on intents
- Processing pre-extracted documents
- Vision AI for image text extraction
- AI processing of text content
"""
import json
import logging
import base64
from typing import Dict, Any, List, Optional

from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelExtraction import ContentPart, DocumentIntent
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped

logger = logging.getLogger(__name__)


class ContentExtractor:
    """Handles content extraction and preparation."""

    def __init__(self, services, aiService, intentAnalyzer):
        """Initialize ContentExtractor with service center, AI service, and intent analyzer access."""
        self.services = services
        self.aiService = aiService
        self.intentAnalyzer = intentAnalyzer

    async def extractAndPrepareContent(
        self,
        documents: List[ChatDocument],
        documentIntents: List[DocumentIntent],
        parentOperationId: str,
        getIntentForDocument: callable
    ) -> List[ContentPart]:
        """
        Phase 5B: extract content according to the document intents and return ContentParts with metadata.

        One document can yield several ContentParts when it carries several intents.
        Example: an image with intents=["extract", "render"] yields
        - ContentPart(contentFormat="object", ...) for rendering
        - ContentPart(contentFormat="extracted", ...) for text analysis

        Per document the first matching route wins:
        1. pre-extracted document (resolved by the intent analyzer) that has parts
        2. JSON document already in standardized format ("documents" or "sections" key)
        3. normal extraction through the extraction service

        Args:
            documents: Documents to process.
            documentIntents: DocumentIntent objects searched per document.
            parentOperationId: Parent operation ID for the ChatLog hierarchy.
            getIntentForDocument: Callable invoked as getIntentForDocument(documentId, documentIntents).

        Returns:
            The ContentParts that carry a documentId and whose contentFormat is
            "extracted", "object" or "reference"; all others are dropped with a warning.

        Raises:
            Exception: any error is logged, the progress log is finished as failed,
                and the original exception is re-raised.
        """
        extractionOperationId = f"{parentOperationId}_content_extraction"

        # Open the ChatLog entry below the parent operation
        self.services.chat.progressLogStart(
            extractionOperationId,
            "Content Extraction",
            "Extraction",
            f"Extracting from {len(documents)} documents",
            parentOperationId=parentOperationId
        )

        try:
            allContentParts: List[ContentPart] = []

            for document in documents:
                checkWorkflowStopped(self.services)
                logger.debug(f"Checking document {document.id} ({document.fileName}, mimeType={document.mimeType}) for pre-extracted content")
                preExtracted = self.intentAnalyzer.resolvePreExtractedDocument(document)

                # Route 1: pre-extracted document; falls through when it holds no parts
                if preExtracted and self._collectPreExtractedParts(
                    document, preExtracted, documentIntents, getIntentForDocument, allContentParts
                ):
                    continue

                # Route 2: standardized JSON is passed on as a reference
                if self._collectStandardizedJsonReference(document, allContentParts):
                    continue

                # Route 3: normal extraction
                self._collectNormalExtractionParts(
                    document, documentIntents, getIntentForDocument, extractionOperationId, allContentParts
                )

            self.services.utils.writeDebugFile(
                json.dumps([part.dict() for part in allContentParts], indent=2, default=str),
                "content_extraction_result"
            )

            validatedParts = self._filterValidContentParts(allContentParts)

            self.services.chat.progressLogFinish(extractionOperationId, True)
            return validatedParts

        except Exception as e:
            self.services.chat.progressLogFinish(extractionOperationId, False)
            logger.error(f"Error in extractAndPrepareContent: {str(e)}")
            raise

    def _collectPreExtractedParts(
        self,
        document: ChatDocument,
        preExtracted: Dict[str, Any],
        documentIntents: List[DocumentIntent],
        getIntentForDocument: callable,
        allContentParts: List[ContentPart]
    ) -> bool:
        """Append the ContentParts of a pre-extracted document; return False when it has no parts."""
        originalDocument = preExtracted["originalDocument"]
        originalFileName = originalDocument["fileName"]
        logger.info(f"✅ Found pre-extracted document: {document.fileName} -> Original: {originalFileName}")
        logger.info(f"   Pre-extracted document ID: {document.id}, Original document ID: {originalDocument['id']}")

        contentExtracted = preExtracted["contentExtracted"]
        logger.info(f"   ContentParts count: {len(contentExtracted.parts) if contentExtracted.parts else 0}")

        # The intent is looked up with the ID of the JSON document, not the original's
        intent = getIntentForDocument(document.id, documentIntents)
        logger.info(f"   Intent lookup for document {document.id}: found={intent is not None}")
        if intent:
            logger.info(f"   Intent: {intent.intents}, extractionPrompt: {intent.extractionPrompt[:100] if intent.extractionPrompt else None}...")
        else:
            logger.warning(f"   ⚠️ No intent found for pre-extracted document {document.id}! Available intent documentIds: {[i.documentId for i in documentIntents]}")

        if not contentExtracted.parts:
            return False

        processedParts = []
        for part in contentExtracted.parts:
            # Empty containers carry nothing; other empty parts are kept
            isEmpty = not part.data or (isinstance(part.data, str) and len(part.data.strip()) == 0)
            if isEmpty and part.typeGroup == "container":
                continue

            # A JSON structure part may embed nested parts; if so they replace it
            if part.typeGroup == "structure" and part.mimeType == "application/json" and part.data:
                nestedParts = self._extractNestedPartsFromStructure(part, document, preExtracted, intent)
                if nestedParts:
                    processedParts.extend(nestedParts)
                    logger.info(f"✅ Extracted {len(nestedParts)} nested parts from structure part {part.id}")
                    continue

            processedParts.append(part)

        for part in processedParts:
            self._collectPartsForIntents(part, document, intent, originalFileName, allContentParts)

        logger.info(f"✅ Using {len([p for p in contentExtracted.parts if p.data and len(str(p.data)) > 0])} pre-extracted ContentParts from ContentExtracted document {document.fileName}")
        logger.info(f"   Original document: {originalFileName}")
        return True

    def _collectPartsForIntents(
        self,
        part: ContentPart,
        document: ChatDocument,
        intent: Optional[Any],
        originalFileName: str,
        allContentParts: List[ContentPart]
    ) -> None:
        """Append one ContentPart per intent (reference / render / extract) for a pre-extracted part."""
        if not part.metadata:
            part.metadata = {}
        if "documentId" not in part.metadata:
            part.metadata["documentId"] = document.id

        # Without an intent the part is treated as "extract"
        partIntent = intent.intents if intent else ["extract"]
        logger.debug(f"Processing part {part.id}: typeGroup={part.typeGroup}, intents={partIntent}, hasData={bool(part.data)}, dataLength={len(str(part.data)) if part.data else 0}")

        hasReferenceIntent = "reference" in partIntent
        hasRenderIntent = "render" in partIntent
        hasExtractIntent = "extract" in partIntent
        hasPartData = bool(part.data) and (not isinstance(part.data, str) or len(part.data.strip()) > 0)
        logger.debug(f"Part {part.id}: reference={hasReferenceIntent}, render={hasRenderIntent}, extract={hasExtractIntent}, hasData={hasPartData}")

        # 1. Reference intent: data-less part that only points at the document
        if hasReferenceIntent:
            allContentParts.append(ContentPart(
                id=f"ref_{document.id}_{part.id}",
                label=f"Reference: {part.label or 'Content'}",
                typeGroup="reference",
                mimeType=part.mimeType or "application/octet-stream",
                data="",
                metadata={
                    "contentFormat": "reference",
                    "documentId": document.id,
                    "documentReference": f"docItem:{document.id}:{originalFileName}",
                    "intent": "reference",
                    "usageHint": f"Reference: {originalFileName}",
                    "originalFileName": originalFileName
                }
            ))
            logger.debug(f"✅ Created reference ContentPart for {part.id}")

        # 2. Render intent: object part, only for image/binary content that has data
        if hasRenderIntent and hasPartData:
            # _isBinary also accepts the image/, video/ and audio/ prefixes
            isRenderable = part.typeGroup in ("image", "binary") or (
                part.mimeType and self._isBinary(part.mimeType)
            )
            if isRenderable:
                allContentParts.append(ContentPart(
                    id=f"obj_{document.id}_{part.id}",
                    label=f"Object: {part.label or 'Content'}",
                    typeGroup=part.typeGroup,
                    mimeType=part.mimeType or "application/octet-stream",
                    data=part.data,
                    metadata={
                        "contentFormat": "object",
                        "documentId": document.id,
                        "intent": "render",
                        "usageHint": f"Render as visual element: {originalFileName}",
                        "originalFileName": originalFileName,
                        "relatedExtractedPartId": f"extracted_{document.id}_{part.id}" if hasExtractIntent else None
                    }
                ))
                logger.debug(f"✅ Created object ContentPart for {part.id} (render intent)")
            else:
                logger.warning(f"⚠️ Part {part.id} has render intent but is not renderable (typeGroup={part.typeGroup}, mimeType={part.mimeType})")
        elif hasRenderIntent:
            logger.warning(f"⚠️ Part {part.id} has render intent but no data, skipping render part")

        # 3. Extract intent: the original part is re-used; no AI call happens here
        if hasExtractIntent:
            relatedObjectPartId = f"obj_{document.id}_{part.id}" if hasRenderIntent else None
            promptFromIntent = intent.extractionPrompt if intent and intent.extractionPrompt else None

            if part.typeGroup == "image" and hasPartData:
                # Images stay images; the flag tells later stages to run Vision AI
                logger.info(f"📷 Image {part.id} with extract intent - will be processed with Vision AI during section generation")
                part.metadata.update({
                    "contentFormat": "extracted",
                    "intent": "extract",
                    "originalFileName": originalFileName,
                    "relatedObjectPartId": relatedObjectPartId,
                    "extractionPrompt": promptFromIntent or "Extract all text content from this image.",
                    "needsVisionExtraction": True
                })
                allContentParts.append(part)
            else:
                # Text/table content is already raw-extracted and used as it is
                part.metadata.update({
                    "contentFormat": "extracted",
                    "intent": "extract",
                    "fromExtractContent": True,
                    "skipExtraction": True,
                    "originalFileName": originalFileName,
                    "relatedObjectPartId": relatedObjectPartId,
                    "extractionPrompt": promptFromIntent
                })
                allContentParts.append(part)
                logger.debug(f"✅ Using pre-extracted ContentPart {part.id} as extracted (no AI processing needed)")

        # 4. Fallback: none of the known intents present, so treat the part as extracted
        if not (hasReferenceIntent or hasRenderIntent or hasExtractIntent):
            logger.warning(f"⚠️ Part {part.id} has no recognized intents, adding as extracted by default")
            part.metadata.update({
                "contentFormat": "extracted",
                "intent": "extract",
                "fromExtractContent": True,
                "skipExtraction": True,
                "originalFileName": originalFileName
            })
            allContentParts.append(part)

    def _collectStandardizedJsonReference(
        self,
        document: ChatDocument,
        allContentParts: List[ContentPart]
    ) -> bool:
        """Append a reference part for a JSON document in standardized format; return True when handled."""
        if document.mimeType != "application/json":
            return False

        try:
            docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
            if docBytes:
                docData = docBytes.decode('utf-8')
                jsonData = json.loads(docData)

                if isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData):
                    logger.info("Document is already in standardized JSON format, using as reference")
                    allContentParts.append(ContentPart(
                        id=f"ref_{document.id}",
                        label=f"Reference: {document.fileName}",
                        typeGroup="structure",
                        mimeType="application/json",
                        data=docData,
                        metadata={
                            "contentFormat": "reference",
                            "documentId": document.id,
                            "documentReference": f"docItem:{document.id}:{document.fileName}",
                            "skipExtraction": True,
                            "intent": "reference"
                        }
                    ))
                    logger.info("✅ Using JSON document directly without extraction")
                    return True
        except Exception as e:
            # Best effort: an unreadable or unparsable JSON file goes through normal extraction
            logger.warning(f"Could not parse JSON document {document.fileName}, will extract normally: {str(e)}")

        return False

    def _collectNormalExtractionParts(
        self,
        document: ChatDocument,
        documentIntents: List[DocumentIntent],
        getIntentForDocument: callable,
        extractionOperationId: str,
        allContentParts: List[ContentPart]
    ) -> None:
        """Append reference, object and extracted ContentParts for a document on the normal extraction route."""
        intent = getIntentForDocument(document.id, documentIntents)

        if not intent:
            # Try a similar UUID first (the AI sometimes returns a slightly wrong ID)
            correctedIntent = self._findIntentBySimilarId(document.id, documentIntents)
            if correctedIntent:
                logger.warning(f"Found intent for document {document.id} using UUID correction (original: {correctedIntent.documentId})")
                intent = DocumentIntent(
                    documentId=document.id,
                    intents=correctedIntent.intents,
                    extractionPrompt=correctedIntent.extractionPrompt,
                    reasoning=f"Intent matched by UUID similarity (original: {correctedIntent.documentId})"
                )
            else:
                logger.warning(f"No intent found for document {document.id}, using default 'extract'")
                intent = DocumentIntent(
                    documentId=document.id,
                    intents=["extract"],
                    extractionPrompt="Extract all content from the document",
                    reasoning="Default intent: no specific intent found"
                )

        hasRenderIntent = "render" in intent.intents
        hasExtractIntent = "extract" in intent.intents

        if "reference" in intent.intents:
            allContentParts.append(ContentPart(
                id=f"ref_{document.id}",
                label=f"Reference: {document.fileName}",
                typeGroup="reference",
                mimeType=document.mimeType,
                data="",
                metadata={
                    "contentFormat": "reference",
                    "documentId": document.id,
                    "documentReference": f"docItem:{document.id}:{document.fileName}",
                    "intent": "reference",
                    "usageHint": f"Reference document: {document.fileName}"
                }
            ))

        # "render" and "extract" may both be present; then both parts are produced
        if hasRenderIntent:
            # A missing MIME type is treated as not renderable instead of raising
            mimeType = document.mimeType or ""
            isImage = mimeType.startswith("image/")
            if isImage or self._isBinary(mimeType):
                try:
                    # getFileData is synchronous
                    binaryData = self.services.interfaceDbComponent.getFileData(document.fileId)
                    if not binaryData:
                        logger.warning(f"No binary data found for document {document.id}")
                        # NOTE(review): leaving here also skips the "extract" intent, while a
                        # load exception below falls through to it - confirm which is intended
                        return
                    base64Data = base64.b64encode(binaryData).decode('utf-8')

                    allContentParts.append(ContentPart(
                        id=f"obj_{document.id}",
                        label=f"Object: {document.fileName}",
                        typeGroup="image" if isImage else "binary",
                        mimeType=document.mimeType,
                        data=base64Data,
                        metadata={
                            "contentFormat": "object",
                            "documentId": document.id,
                            "intent": "render",
                            "usageHint": f"Render as visual element: {document.fileName}",
                            "originalFileName": document.fileName,
                            # Link to the extracted part, if one will be produced
                            "relatedExtractedPartId": f"ext_{document.id}" if hasExtractIntent else None
                        }
                    ))
                except Exception as e:
                    logger.error(f"Failed to load binary data for document {document.id}: {str(e)}")

        if not hasExtractIntent:
            return

        extractionPrompt = intent.extractionPrompt or "Extract all content from the document"
        self.services.utils.writeDebugFile(
            extractionPrompt,
            f"content_extraction_prompt_{document.id}"
        )

        from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy

        extractionOptions = ExtractionOptions(
            prompt=extractionPrompt,
            mergeStrategy=MergeStrategy()
        )

        # extractContent is synchronous
        checkWorkflowStopped(self.services)
        extractedResults = self.services.extraction.extractContent(
            [document],
            extractionOptions,
            operationId=extractionOperationId,
            parentOperationId=extractionOperationId
        )

        for extracted in extractedResults:
            for part in extracted.parts:
                # Same guard as on the pre-extracted route: metadata may be unset
                if not part.metadata:
                    part.metadata = {}

                part.metadata.update({
                    "contentFormat": "extracted",
                    "documentId": document.id,
                    "extractionPrompt": extractionPrompt,
                    "intent": "extract",
                    "usageHint": f"Use extracted content from {document.fileName}",
                    # Link to the object part, if one was requested
                    "relatedObjectPartId": f"obj_{document.id}" if hasRenderIntent else None
                })

                if part.typeGroup == "image":
                    part.metadata["needsVisionExtraction"] = True
                    logger.info(f"📷 Image part {part.id} marked for Vision AI extraction during section generation")

                # Prefix the ID so it stays distinct from the object part's ID
                if hasRenderIntent:
                    part.id = f"ext_{document.id}_{part.id}"
                allContentParts.append(part)

    def _filterValidContentParts(self, allContentParts: List[ContentPart]) -> List[ContentPart]:
        """Return the parts that have a documentId and a known contentFormat; warn about the rest."""
        validatedParts = []
        for part in allContentParts:
            if not part.metadata.get("documentId"):
                logger.warning(f"Skipping ContentPart {part.id} - missing documentId in metadata")
                continue

            contentFormat = part.metadata.get("contentFormat")
            if contentFormat not in ["extracted", "object", "reference"]:
                logger.warning(
                    f"Skipping ContentPart {part.id} - invalid contentFormat: {contentFormat}"
                )
                continue

            validatedParts.append(part)
        return validatedParts

    async def extractTextFromImage(self, imagePart: ContentPart, extractionPrompt: str) -> Optional[str]:
        """
        Run Vision AI on an image part and return the text it produces.

        Args:
            imagePart: ContentPart passed to the AI call as its only content part.
            extractionPrompt: Prompt for the extraction; a generic default is used when it is empty.

        Returns:
            The stripped response content. Failures neither raise nor yield None:
            a string of the form "[ERROR: ...]" is returned so the cause stays visible.
        """
        try:
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum

            if extractionPrompt:
                finalPrompt = extractionPrompt
            else:
                finalPrompt = "Extract all text content from this image. Return only the extracted text, no additional formatting."

            # Debug dump of the prompt that is actually sent
            self.services.utils.writeDebugFile(
                finalPrompt,
                f"content_extraction_prompt_image_{imagePart.id}"
            )

            callOptions = AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE)
            request = AiCallRequest(
                prompt=finalPrompt,
                context="",
                options=callOptions,
                contentParts=[imagePart]
            )

            # NOTE(review): this check sits inside the try, so any Exception it raises is
            # caught below and turned into the error string - confirm that is intended
            checkWorkflowStopped(self.services)
            response = await self.aiService.callAi(request)

            if not (response and response.content):
                # Nothing came back: report it as an error string for debugging
                errorMsg = f"Vision AI extraction failed: No content returned for image {imagePart.id}"
                logger.warning(errorMsg)
                return f"[ERROR: {errorMsg}]"

            # Debug dump of the raw response
            self.services.utils.writeDebugFile(
                response.content,
                f"content_extraction_response_image_{imagePart.id}"
            )
            return response.content.strip()
        except Exception as exc:
            errorMsg = f"Vision AI extraction failed for image {imagePart.id}: {str(exc)}"
            logger.error(errorMsg)
            import traceback
            logger.debug(f"Traceback: {traceback.format_exc()}")
            # An error string is returned instead of None, for debugging
            return f"[ERROR: {errorMsg}]"

    async def processTextContentWithAi(self, textPart: ContentPart, extractionPrompt: str) -> Optional[str]:
        """
        Send the text of a ContentPart through the AI service, guided by extractionPrompt.

        The part's data is wrapped in a new text/plain ContentPart (same id, label and a
        copy of the metadata) and submitted with the DATA_EXTRACT operation type.

        Args:
            textPart: ContentPart whose data holds the text to process.
            extractionPrompt: Prompt for the AI processing; a generic default is used when it is empty.

        Returns:
            The stripped response content. Failures neither raise nor yield None:
            a string of the form "[ERROR: ...]" is returned so the cause stays visible.
        """
        try:
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum

            if extractionPrompt:
                finalPrompt = extractionPrompt
            else:
                finalPrompt = "Process and extract the key information from the following text content."

            # Debug dump: the prompt plus at most the first 500 characters of the text
            rawText = textPart.data
            if rawText and len(rawText) > 500:
                textPreview = rawText[:500] + "..."
            else:
                textPreview = rawText or ""
            self.services.utils.writeDebugFile(
                f"{finalPrompt}\n\n--- Text Content (preview) ---\n{textPreview}",
                f"content_extraction_prompt_text_{textPart.id}"
            )

            # Wrap the existing text in a plain-text part used as AI input
            metadataCopy = textPart.metadata.copy() if textPart.metadata else {}
            textContentPart = ContentPart(
                id=textPart.id,
                label=textPart.label,
                typeGroup="text",
                mimeType="text/plain",
                data=rawText or "",
                metadata=metadataCopy
            )

            callOptions = AiCallOptions(operationType=OperationTypeEnum.DATA_EXTRACT)
            request = AiCallRequest(
                prompt=finalPrompt,
                context="",
                options=callOptions,
                contentParts=[textContentPart]
            )

            # NOTE(review): this check sits inside the try, so any Exception it raises is
            # caught below and turned into the error string - confirm that is intended
            checkWorkflowStopped(self.services)
            response = await self.aiService.callAi(request)

            if not (response and response.content):
                # Nothing came back: report it as an error string for debugging
                errorMsg = f"AI text processing failed: No content returned for text part {textPart.id}"
                logger.warning(errorMsg)
                return f"[ERROR: {errorMsg}]"

            # Debug dump of the raw response
            self.services.utils.writeDebugFile(
                response.content,
                f"content_extraction_response_text_{textPart.id}"
            )
            return response.content.strip()
        except Exception as exc:
            errorMsg = f"AI text processing failed for text part {textPart.id}: {str(exc)}"
            logger.error(errorMsg)
            import traceback
            logger.debug(f"Traceback: {traceback.format_exc()}")
            # An error string is returned instead of None, for debugging
            return f"[ERROR: {errorMsg}]"

    def _isBinary(self, mimeType: str) -> bool:
        """Prüfe ob MIME-Type binary ist."""
        binaryTypes = [
            "application/octet-stream",
            "application/pdf",
            "application/zip",
            "application/x-zip-compressed"
        ]
        return mimeType in binaryTypes or mimeType.startswith("image/") or mimeType.startswith("video/") or mimeType.startswith("audio/")

    def _extractNestedPartsFromStructure(
        self,
        structurePart: ContentPart,
        document: ChatDocument,
        preExtracted: Dict[str, Any],
        intent: Optional[Any]
    ) -> List[ContentPart]:
        """
        Extract nested parts from a structure ContentPart (e.g., JSON with documentData.parts).

        The part's data is parsed as JSON. If the result is a dict holding a
        "documentData" dict with a "parts" list, every dict entry of that list
        is turned into its own ContentPart. Missing fields fall back to:
        id -> "nested_<index of extracted part>", typeGroup -> "text",
        mimeType -> "text/plain", label -> the structure part's label,
        data -> "", metadata -> {}.

        Each created part gets the id "<structurePart.id>_<nested id>", has
        parentId set to the structure part's id, and carries the nested
        metadata extended with documentId, fromNestedStructure,
        parentStructurePartId and originalFileName.

        Args:
            structurePart: ContentPart whose data holds the JSON structure
            document: The document this part belongs to (its id is stored in the metadata)
            preExtracted: Pre-extracted document metadata; must provide
                ["originalDocument"]["fileName"] when nested parts exist
            intent: Document intent for nested parts (accepted for interface
                compatibility; not read by this method)

        Returns:
            List of extracted ContentParts, empty if no nested parts were found
            or if parsing/extraction failed (failures are logged, not raised).
        """
        nestedParts: List[ContentPart] = []

        try:
            # Parse JSON structure
            jsonData = json.loads(structurePart.data)

            # Walk down to documentData.parts; any shape mismatch yields "no parts"
            documentData = jsonData.get("documentData") if isinstance(jsonData, dict) else None
            rawParts = documentData.get("parts", []) if isinstance(documentData, dict) else []
            if not isinstance(rawParts, list):
                rawParts = []

            # Only dict entries describe a part; everything else is skipped
            partDicts = [entry for entry in rawParts if isinstance(entry, dict)]

            if partDicts:
                # Loop-invariant, so resolve once. Only looked up when there is
                # at least one part to build, so a missing key is reported
                # exactly in the situations where the value is needed.
                originalFileName = preExtracted["originalDocument"]["fileName"]

            for nestedPartData in partDicts:
                nestedPartId = nestedPartData.get("id") or f"nested_{len(nestedParts)}"
                nestedTypeGroup = nestedPartData.get("typeGroup", "text")
                nestedMimeType = nestedPartData.get("mimeType", "text/plain")
                nestedLabel = nestedPartData.get("label", structurePart.label)
                nestedData = nestedPartData.get("data", "")

                # "metadata" may be present but null or otherwise not a mapping;
                # unpacking it with ** would raise and abort all remaining parts.
                nestedMetadata = nestedPartData.get("metadata")
                if not isinstance(nestedMetadata, dict):
                    nestedMetadata = {}

                # Create ContentPart for nested part; the fixed keys are listed
                # last so they win over same-named keys in the nested metadata.
                nestedPart = ContentPart(
                    id=f"{structurePart.id}_{nestedPartId}",
                    parentId=structurePart.id,
                    label=nestedLabel,
                    typeGroup=nestedTypeGroup,
                    mimeType=nestedMimeType,
                    data=nestedData,
                    metadata={
                        **nestedMetadata,
                        "documentId": document.id,
                        "fromNestedStructure": True,
                        "parentStructurePartId": structurePart.id,
                        "originalFileName": originalFileName
                    }
                )

                nestedParts.append(nestedPart)
                logger.debug(f"✅ Extracted nested part: {nestedPart.id} (typeGroup={nestedTypeGroup}, mimeType={nestedMimeType})")

            # If no nested parts found, return empty list (original part will be kept)
            if not nestedParts:
                logger.debug(f"No nested parts found in structure part {structurePart.id}")

        except json.JSONDecodeError as e:
            logger.warning(f"Could not parse structure part {structurePart.id} as JSON: {str(e)}")
        except Exception as e:
            # Best-effort extraction: report the problem and return what was collected so far
            logger.error(f"Error extracting nested parts from structure part {structurePart.id}: {str(e)}")

        return nestedParts

    def _findIntentBySimilarId(self, documentId: str, documentIntents: List[DocumentIntent]) -> Optional[DocumentIntent]:
        """
        Versucht ein Intent zu finden, dessen UUID ähnlich zur angegebenen Dokument-ID ist.
        Dies hilft bei AI UUID-Halluzinationen (z.B. 4451 -> 4551).

        Args:
            documentId: Die Dokument-ID für die ein Intent gesucht wird
            documentIntents: Liste aller verfügbaren DocumentIntents

        Returns:
            DocumentIntent mit ähnlicher UUID falls gefunden, sonst None
        """
        if not documentId or len(documentId) != 36:  # UUID Format: 8-4-4-4-12
            return None

        # Prüfe ob es eine UUID ist (Format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx)
        if documentId.count('-') != 4:
            return None

        for intent in documentIntents:
            intentId = intent.documentId
            if len(intentId) != 36:
                continue

            # Zähle unterschiedliche Zeichen
            differences = sum(c1 != c2 for c1, c2 in zip(documentId, intentId))

            # Wenn nur 1-2 Zeichen unterschiedlich sind, ist es wahrscheinlich ein Typo
            if differences <= 2:
                # Prüfe ob die Struktur ähnlich ist (gleiche Positionen der Bindestriche)
                if documentId.count('-') == intentId.count('-'):
                    return intent

        return None