# gateway/modules/services/serviceAi/subStructureFilling.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Structure Filling Module

Handles filling document structure with content, including:
- Filling sections with content parts
- Building section generation prompts
- Aggregation logic
"""
import json
import logging
import copy
from typing import Dict, Any, List, Optional

from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum, ProcessingModeEnum

logger = logging.getLogger(__name__)


class StructureFiller:
    """Handles filling document structure with content."""

    def __init__(self, services, aiService):
        """Initialize StructureFiller with service center and AI service access."""
        self.services = services
        self.aiService = aiService

    async def fillStructure(
        self,
        structure: Dict[str, Any],
        contentParts: List[ContentPart],
        userPrompt: str,
        parentOperationId: str
    ) -> Dict[str, Any]:
        """
        Phase 5D: Füllt Struktur mit tatsächlichem Content.
        Für jede Section:
        - Wenn contentPartIds spezifiziert: Verwende ContentParts im spezifizierten Format
        - Wenn generation_hint spezifiziert: Generiere AI-Content

        **Implementierungsdetails:**
        - Sections werden **parallel generiert**, wenn möglich (Performance-Optimierung)
        - Fehlerhafte Sections werden mit Fehlermeldung gerendert (kein Abbruch des gesamten Prozesses)

        Args:
            structure: Struktur-Dict mit documents und sections
            contentParts: Alle vorbereiteten ContentParts
            userPrompt: User-Anfrage
            parentOperationId: Parent Operation-ID für ChatLog-Hierarchie

        Returns:
            Gefüllte Struktur mit elements in jeder Section
        """
        # Erstelle Operation-ID für Struktur-Abfüllen
        fillOperationId = f"{parentOperationId}_structure_filling"

        # Starte ChatLog mit Parent-Referenz
        self.services.chat.progressLogStart(
            fillOperationId,
            "Structure Filling",
            "Filling",
            f"Filling {len(structure.get('documents', [{}])[0].get('sections', []))} sections",
            parentOperationId=parentOperationId
        )

        try:
            filledStructure = copy.deepcopy(structure)

            # Sammle alle Sections für sequenzielle Verarbeitung (parallel kann später optimiert werden)
            sections_to_process = []
            all_sections_list = []  # Für Kontext-Informationen
            for doc in filledStructure.get("documents", []):
                doc_sections = doc.get("sections", [])
                all_sections_list.extend(doc_sections)
                for section in doc_sections:
                    sections_to_process.append((doc, section))

            # Sequenzielle Section-Generierung (parallel kann später hinzugefügt werden)
            for sectionIndex, (doc, section) in enumerate(sections_to_process):
                sectionId = section.get("id")
                contentPartIds = section.get("contentPartIds", [])
                contentFormats = section.get("contentFormats", {})
                generationHint = section.get("generation_hint")
                contentType = section.get("content_type", "paragraph")

                elements = []

                # Prüfe ob Aggregation nötig ist
                needsAggregation = self._needsAggregation(
                    contentType=contentType,
                    contentPartCount=len(contentPartIds)
                )

                if needsAggregation and generationHint:
                    # Aggregation: Alle Parts zusammen verarbeiten
                    sectionParts = [
                        self._findContentPartById(pid, contentParts)
                        for pid in contentPartIds
                    ]
                    sectionParts = [p for p in sectionParts if p is not None]

                    if sectionParts:
                        # Filtere nur extracted Parts für Aggregation (reference/object werden separat behandelt)
                        extractedParts = [
                            p for p in sectionParts
                            if contentFormats.get(p.id, p.metadata.get("contentFormat")) == "extracted"
                        ]
                        nonExtractedParts = [
                            p for p in sectionParts
                            if contentFormats.get(p.id, p.metadata.get("contentFormat")) != "extracted"
                        ]

                        # Verarbeite non-extracted Parts separat (reference, object)
                        for part in nonExtractedParts:
                            contentFormat = contentFormats.get(part.id, part.metadata.get("contentFormat"))

                            if contentFormat == "reference":
                                elements.append({
                                    "type": "reference",
                                    "documentReference": part.metadata.get("documentReference"),
                                    "label": part.metadata.get("usageHint", part.label)
                                })
                            elif contentFormat == "object":
                                elements.append({
                                    "type": part.typeGroup,
                                    "base64Data": part.data,
                                    "mimeType": part.mimeType,
                                    "altText": part.metadata.get("usageHint", part.label)
                                })

                        # Aggregiere extracted Parts mit AI
                        if extractedParts:
                            generationPrompt = self._buildSectionGenerationPrompt(
                                section=section,
                                contentParts=extractedParts,  # ALLE PARTS für Aggregation!
                                userPrompt=userPrompt,
                                generationHint=generationHint,
                                allSections=all_sections_list,
                                sectionIndex=sectionIndex,
                                isAggregation=True
                            )

                            # Erstelle Operation-ID für Section-Generierung
                            sectionOperationId = f"{fillOperationId}_section_{sectionId}"

                            # Starte ChatLog mit Parent-Referenz
                            self.services.chat.progressLogStart(
                                sectionOperationId,
                                "Section Generation (Aggregation)",
                                "Section",
                                f"Generating section {sectionId} with {len(extractedParts)} parts",
                                parentOperationId=fillOperationId
                            )

                            try:
                                # Debug: Log Prompt
                                self.services.utils.writeDebugFile(
                                    generationPrompt,
                                    f"section_content_{sectionId}_prompt"
                                )

                                # Verwende callAi für ContentParts-Unterstützung (nicht callAiPlanning!)
                                request = AiCallRequest(
                                    prompt=generationPrompt,
                                    contentParts=extractedParts,  # ALLE PARTS!
                                    options=AiCallOptions(
                                        operationType=OperationTypeEnum.DATA_ANALYSE,
                                        priority=PriorityEnum.BALANCED,
                                        processingMode=ProcessingModeEnum.DETAILED
                                    )
                                )
                                aiResponse = await self.aiService.callAi(request)

                                # Debug: Log Response
                                self.services.utils.writeDebugFile(
                                    aiResponse.content,
                                    f"section_content_{sectionId}_response"
                                )

                                # Parse und füge zu elements hinzu
                                generatedElements = json.loads(
                                    self.services.utils.jsonExtractString(aiResponse.content)
                                )
                                if isinstance(generatedElements, list):
                                    elements.extend(generatedElements)
                                elif isinstance(generatedElements, dict) and "elements" in generatedElements:
                                    elements.extend(generatedElements["elements"])

                                # ChatLog abschließen
                                self.services.chat.progressLogFinish(sectionOperationId, True)

                            except Exception as e:
                                # Fehlerhafte Section mit Fehlermeldung rendern (kein Abbruch!)
                                self.services.chat.progressLogFinish(sectionOperationId, False)
                                elements.append({
                                    "type": "error",
                                    "message": f"Error generating section {sectionId}: {str(e)}",
                                    "sectionId": sectionId
                                })
                                logger.error(f"Error generating section {sectionId}: {str(e)}")
                                # NICHT raise - Section wird mit Fehlermeldung gerendert

                else:
                    # Einzelverarbeitung: Jeder Part einzeln
                    for partId in contentPartIds:
                        part = self._findContentPartById(partId, contentParts)
                        if not part:
                            continue

                        contentFormat = contentFormats.get(partId, part.metadata.get("contentFormat"))

                        if contentFormat == "reference":
                            # Füge Dokument-Referenz hinzu
                            elements.append({
                                "type": "reference",
                                "documentReference": part.metadata.get("documentReference"),
                                "label": part.metadata.get("usageHint", part.label)
                            })

                        elif contentFormat == "object":
                            # Füge base64 Object hinzu
                            elements.append({
                                "type": part.typeGroup,  # "image", "binary", etc.
                                "base64Data": part.data,
                                "mimeType": part.mimeType,
                                "altText": part.metadata.get("usageHint", part.label)
                            })

                        elif contentFormat == "extracted":
                            if generationHint:
                                # AI-Call mit einzelnen ContentPart
                                generationPrompt = self._buildSectionGenerationPrompt(
                                    section=section,
                                    contentParts=[part],  # EIN PART
                                    userPrompt=userPrompt,
                                    generationHint=generationHint,
                                    allSections=all_sections_list,
                                    sectionIndex=sectionIndex,
                                    isAggregation=False
                                )

                                # Erstelle Operation-ID für Section-Generierung
                                sectionOperationId = f"{fillOperationId}_section_{sectionId}"

                                # Starte ChatLog mit Parent-Referenz
                                self.services.chat.progressLogStart(
                                    sectionOperationId,
                                    "Section Generation",
                                    "Section",
                                    f"Generating section {sectionId}",
                                    parentOperationId=fillOperationId
                                )

                                try:
                                    # Debug: Log Prompt
                                    self.services.utils.writeDebugFile(
                                        generationPrompt,
                                        f"section_content_{sectionId}_prompt"
                                    )

                                    # Verwende callAi für ContentParts-Unterstützung
                                    request = AiCallRequest(
                                        prompt=generationPrompt,
                                        contentParts=[part],
                                        options=AiCallOptions(
                                            operationType=OperationTypeEnum.DATA_ANALYSE,
                                            priority=PriorityEnum.BALANCED,
                                            processingMode=ProcessingModeEnum.DETAILED
                                        )
                                    )
                                    aiResponse = await self.aiService.callAi(request)

                                    # Debug: Log Response
                                    self.services.utils.writeDebugFile(
                                        aiResponse.content,
                                        f"section_content_{sectionId}_response"
                                    )

                                    # Parse und füge zu elements hinzu
                                    generatedElements = json.loads(
                                        self.services.utils.jsonExtractString(aiResponse.content)
                                    )
                                    if isinstance(generatedElements, list):
                                        elements.extend(generatedElements)
                                    elif isinstance(generatedElements, dict) and "elements" in generatedElements:
                                        elements.extend(generatedElements["elements"])

                                    # ChatLog abschließen
                                    self.services.chat.progressLogFinish(sectionOperationId, True)

                                except Exception as e:
                                    # Fehlerhafte Section mit Fehlermeldung rendern (kein Abbruch!)
                                    self.services.chat.progressLogFinish(sectionOperationId, False)
                                    elements.append({
                                        "type": "error",
                                        "message": f"Error generating section {sectionId}: {str(e)}",
                                        "sectionId": sectionId
                                    })
                                    logger.error(f"Error generating section {sectionId}: {str(e)}")
                                    # NICHT raise - Section wird mit Fehlermeldung gerendert
                            else:
                                # Füge extrahierten Text direkt hinzu (kein AI-Call)
                                elements.append({
                                    "type": "extracted_text",
                                    "content": part.data,
                                    "source": part.metadata.get("documentId"),
                                    "extractionPrompt": part.metadata.get("extractionPrompt")
                                })

                section["elements"] = elements

            # ChatLog abschließen
            self.services.chat.progressLogFinish(fillOperationId, True)

            return filledStructure

        except Exception as e:
            self.services.chat.progressLogFinish(fillOperationId, False)
            logger.error(f"Error in fillStructure: {str(e)}")
            raise

    def _buildSectionGenerationPrompt(
        self,
        section: Dict[str, Any],
        contentParts: List[Optional[ContentPart]],
        userPrompt: str,
        generationHint: str,
        allSections: Optional[List[Dict[str, Any]]] = None,
        sectionIndex: Optional[int] = None,
        isAggregation: bool = False
    ) -> str:
        """Baue Prompt für Section-Generierung mit vollständigem Kontext."""
        # Filtere None-Werte
        validParts = [p for p in contentParts if p is not None]

        # Section-Metadaten
        sectionId = section.get("id", "unknown")
        contentType = section.get("content_type", "paragraph")

        # Baue ContentParts-Beschreibung
        contentPartsText = ""
        if isAggregation:
            # Aggregation: Zeige nur Metadaten, nicht Previews
            contentPartsText += f"\n## CONTENT PARTS (Aggregation)\n"
            contentPartsText += f"- Anzahl: {len(validParts)} ContentParts\n"
            contentPartsText += f"- Alle ContentParts werden als Parameter übergeben (nicht im Prompt!)\n"
            contentPartsText += f"- Jeder Part kann sehr groß sein → Chunking automatisch\n"
            contentPartsText += f"- WICHTIG: Aggregiere ALLE Parts zu einem Element (z.B. eine Tabelle)\n\n"
            contentPartsText += f"ContentPart IDs:\n"
            for part in validParts:
                contentFormat = part.metadata.get("contentFormat", "unknown")
                contentPartsText += f"  - {part.id} (Format: {contentFormat}, Type: {part.typeGroup}"
                if part.metadata.get("originalFileName"):
                    contentPartsText += f", Source: {part.metadata.get('originalFileName')}"
                contentPartsText += ")\n"
        else:
            # Einzelverarbeitung: Zeige Previews
            for part in validParts:
                contentFormat = part.metadata.get("contentFormat", "unknown")
                contentPartsText += f"\n- ContentPart {part.id}:\n"
                contentPartsText += f"  Format: {contentFormat}\n"
                contentPartsText += f"  Type: {part.typeGroup}\n"
                if part.metadata.get("originalFileName"):
                    contentPartsText += f"  Source file: {part.metadata.get('originalFileName')}\n"

                if contentFormat == "extracted":
                    # Zeige Preview von extrahiertem Text (länger für besseren Kontext)
                    previewLength = 1000
                    if part.data:
                        preview = part.data[:previewLength] + "..." if len(part.data) > previewLength else part.data
                        contentPartsText += f"  Content preview:\n```\n{preview}\n```\n"
                    else:
                        contentPartsText += f"  Content: (empty)\n"
                elif contentFormat == "reference":
                    contentPartsText += f"  Reference: {part.metadata.get('documentReference')}\n"
                    if part.metadata.get("usageHint"):
                        contentPartsText += f"  Usage hint: {part.metadata.get('usageHint')}\n"
                elif contentFormat == "object":
                    dataLength = len(part.data) if part.data else 0
                    contentPartsText += f"  Object type: {part.typeGroup}\n"
                    contentPartsText += f"  MIME type: {part.mimeType}\n"
                    contentPartsText += f"  Data size: {dataLength} chars (base64 encoded)\n"
                    if part.metadata.get("usageHint"):
                        contentPartsText += f"  Usage hint: {part.metadata.get('usageHint')}\n"

        # Baue Section-Kontext (vorherige und nachfolgende Sections)
        contextText = ""
        if allSections and sectionIndex is not None:
            prevSections = []
            nextSections = []

            if sectionIndex > 0:
                for i in range(max(0, sectionIndex - 2), sectionIndex):
                    prevSection = allSections[i]
                    prevSections.append({
                        "id": prevSection.get("id"),
                        "content_type": prevSection.get("content_type"),
                        "generation_hint": prevSection.get("generation_hint", "")[:100]
                    })

            if sectionIndex < len(allSections) - 1:
                for i in range(sectionIndex + 1, min(len(allSections), sectionIndex + 3)):
                    nextSection = allSections[i]
                    nextSections.append({
                        "id": nextSection.get("id"),
                        "content_type": nextSection.get("content_type"),
                        "generation_hint": nextSection.get("generation_hint", "")[:100]
                    })

            if prevSections or nextSections:
                contextText = "\n## DOCUMENT CONTEXT\n"
                if prevSections:
                    contextText += "\nPrevious sections:\n"
                    for prev in prevSections:
                        contextText += f"- {prev['id']} ({prev['content_type']}): {prev['generation_hint']}\n"
                if nextSections:
                    contextText += "\nFollowing sections:\n"
                    for next in nextSections:
                        contextText += f"- {next['id']} ({next['content_type']}): {next['generation_hint']}\n"

        if isAggregation:
            prompt = f"""# TASK: Generate Section Content (Aggregation)

## SECTION METADATA
- Section ID: {sectionId}
- Content Type: {contentType}
- Generation Hint: {generationHint}
{contextText}

## USER REQUEST (for context)
```
{userPrompt}
```

## AVAILABLE CONTENT FOR THIS SECTION
{contentPartsText if contentPartsText else "(No content parts specified for this section)"}

## INSTRUCTIONS
1. Generate content for section "{sectionId}" based on the generation hint above
2. **AGGREGATION**: Combine ALL provided ContentParts into ONE element (e.g., one table with all data)
3. For table content_type: Create a single table with headers and rows from all ContentParts
4. For bullet_list content_type: Create a single list with items from all ContentParts
5. Format appropriately based on content_type ({contentType})
6. Ensure the generated content fits logically between previous and following sections
7. Return ONLY a JSON object with an "elements" array
8. Each element should match the content_type: {contentType}

## OUTPUT FORMAT
Return a JSON object with this structure:
```json
{{
  "elements": [
    {{
      "type": "{contentType}",
      "headers": [...],  // if table
      "rows": [...],     // if table
      "items": [...],    // if bullet_list
      "content": "..."   // if paragraph
    }}
  ]
}}
```

CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
"""
        else:
            prompt = f"""# TASK: Generate Section Content

## SECTION METADATA
- Section ID: {sectionId}
- Content Type: {contentType}
- Generation Hint: {generationHint}
{contextText}

## USER REQUEST (for context)
```
{userPrompt}
```

## AVAILABLE CONTENT FOR THIS SECTION
{contentPartsText if contentPartsText else "(No content parts specified for this section)"}

## INSTRUCTIONS
1. Generate content for section "{sectionId}" based on the generation hint above
2. Use the available content parts to populate this section
3. For images: Use data URI format (data:image/[type];base64,[data]) when embedding base64 image data
4. For extracted text: Format appropriately based on content_type ({contentType})
5. Ensure the generated content fits logically between previous and following sections
6. Return ONLY a JSON object with an "elements" array
7. Each element should match the content_type: {contentType}

## OUTPUT FORMAT
Return a JSON object with this structure:
```json
{{
  "elements": [
    {{
      "type": "{contentType}",
      "content": "..."
    }}
  ]
}}
```

CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
"""
        return prompt

    def _findContentPartById(self, partId: str, contentParts: List[ContentPart]) -> Optional[ContentPart]:
        """Finde ContentPart nach ID."""
        for part in contentParts:
            if part.id == partId:
                return part
        return None

    def _needsAggregation(
        self,
        contentType: str,
        contentPartCount: int
    ) -> bool:
        """
        Bestimmt ob mehrere ContentParts aggregiert werden müssen.

        Aggregation nötig wenn:
        - content_type erfordert Aggregation (table, bullet_list)
        - UND mehrere ContentParts vorhanden sind (> 1)

        Args:
            contentType: Section content_type
            contentPartCount: Anzahl der ContentParts in dieser Section

        Returns:
            True wenn Aggregation nötig, False sonst
        """
        aggregationTypes = ["table", "bullet_list"]

        if contentType in aggregationTypes and contentPartCount > 1:
            return True

        # Optional: Auch für paragraph wenn mehrere Parts vorhanden
        # (z.B. Vergleich mehrerer Dokumente)
        # Standard: Keine Aggregation für paragraph
        return False