# gateway/modules/services/serviceAi/subStructureGeneration.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Structure Generation Module

Handles document structure generation, including:
- Generating document structure with sections
- Building structure prompts
"""
import json
import logging
from typing import Dict, Any, List

from modules.datamodels.datamodelExtraction import ContentPart

logger = logging.getLogger(__name__)


class StructureGenerator:
    """Generate the section-level structure of a document via the AI service.

    The generator builds a prompt that lists the usable content parts, asks
    the AI service for a JSON structure (documents with their sections) and
    returns the parsed result.
    """

    def __init__(self, services, aiService):
        """Initialize StructureGenerator with service center and AI service access.

        Args:
            services: Service center. This class uses ``services.chat`` for
                progress logging and ``services.utils`` for JSON extraction.
            aiService: AI service providing the ``callAiPlanning`` coroutine.
        """
        self.services = services
        self.aiService = aiService

    async def generateStructure(
        self,
        userPrompt: str,
        contentParts: List[ContentPart],
        outputFormat: str,
        parentOperationId: str
    ) -> Dict[str, Any]:
        """
        Phase 5C: generate the document structure with sections.

        Each section specifies:
        - which content belongs into the section
        - which ContentParts are to be used
        - the format in which each ContentPart is used

        Args:
            userPrompt: The user's request.
            contentParts: All prepared ContentParts including their metadata.
            outputFormat: Target format (html, docx, pdf, etc.).
            parentOperationId: Parent operation id for the chat log hierarchy.

        Returns:
            The parsed JSON structure returned by the AI service (expected to
            contain ``documents`` with ``sections``).

        Raises:
            Exception: Any error from prompt building, the AI call or JSON
                parsing is logged, the progress log is closed as failed and
                the error is re-raised unchanged.
        """
        # Operation id of this step, derived from the parent operation.
        structureOperationId = f"{parentOperationId}_structure_generation"

        # Open the progress log entry with a reference to the parent.
        self.services.chat.progressLogStart(
            structureOperationId,
            "Structure Generation",
            "Structure",
            f"Generating structure for {outputFormat}",
            parentOperationId=parentOperationId
        )

        try:
            structurePrompt = self._buildStructurePrompt(
                userPrompt=userPrompt,
                contentParts=contentParts,
                outputFormat=outputFormat
            )

            # callAiPlanning is used because only a plain JSON response is
            # needed; per the original author's note it already writes the
            # debug logs for this call.
            aiResponse = await self.aiService.callAiPlanning(
                prompt=structurePrompt,
                debugType="document_generation_structure"
            )

            # Extract the JSON payload from the raw response and parse it.
            structure = json.loads(self.services.utils.jsonExtractString(aiResponse))

            self.services.chat.progressLogFinish(structureOperationId, True)

            return structure

        except Exception as e:
            # Close the progress log as failed, then let the caller handle it.
            self.services.chat.progressLogFinish(structureOperationId, False)
            logger.error("Error in generateStructure: %s", e)
            raise

    def _selectValidParts(self, contentParts: List[ContentPart]) -> List[ContentPart]:
        """Return the content parts that should be listed in the prompt.

        Reference parts are always kept (their data is intentionally empty).
        All other parts are dropped when they carry no data or only a
        whitespace-only string.
        """
        validParts = []
        filteredParts = []
        debugEnabled = logger.isEnabledFor(logging.DEBUG)

        for part in contentParts:
            contentFormat = (part.metadata or {}).get("contentFormat", "unknown")

            # IMPORTANT: reference parts have empty data on purpose.
            if contentFormat == "reference":
                validParts.append(part)
                logger.debug("Including reference ContentPart %s (intentionally empty data)", part.id)
                continue

            if not part.data:
                if part.typeGroup == "container":
                    reason = "container without data"
                else:
                    reason = f"no data (format: {contentFormat})"
                filteredParts.append((part.id, reason))
                continue

            # A whitespace-only string is truthy but carries no content.
            if isinstance(part.data, str) and not part.data.strip():
                filteredParts.append((part.id, f"whitespace-only data (format: {contentFormat})"))
                continue

            validParts.append(part)
            if debugEnabled:
                # Guarded: str() of a large non-string payload is not free.
                logger.debug(
                    "Including ContentPart %s: format=%s, type=%s, dataLength=%s",
                    part.id, contentFormat, part.typeGroup, len(str(part.data))
                )

        if filteredParts:
            logger.debug("Filtered out %s empty ContentParts: %s", len(filteredParts), filteredParts)

        return validParts

    @staticmethod
    def _describePartData(part: ContentPart, contentFormat: str, metadata: Dict[str, Any]) -> str:
        """Return the short data preview shown for one part in the prompt."""
        if contentFormat == "extracted":
            if part.typeGroup == "image":
                dataLength = len(part.data) if part.data else 0
                mimeType = part.mimeType or "image"
                return f"Image data ({mimeType}, {dataLength} chars) - base64 encoded image content"
            if part.typeGroup == "container":
                return "Container structure (no text content)"
            if not part.data:
                return "(empty)"
            # Extracted text: show at most the first 200 characters.
            if len(part.data) > 200:
                return part.data[:200] + "..."
            return part.data

        if contentFormat == "object":
            dataLength = len(part.data) if part.data else 0
            mimeType = part.mimeType or "binary"
            if part.typeGroup == "image":
                return f"Base64 encoded image ({mimeType}, {dataLength} chars)"
            return f"Base64 encoded binary ({mimeType}, {dataLength} chars)"

        if contentFormat == "reference":
            return metadata.get("documentReference", "reference")

        # Unknown formats get an empty preview.
        return ""

    def _formatIndexEntry(self, position: int, part: ContentPart) -> str:
        """Return the multi-line index entry for one part (1-based position)."""
        metadata = part.metadata or {}
        contentFormat = metadata.get("contentFormat", "unknown")
        dataPreview = self._describePartData(part, contentFormat, metadata)
        entryLines = [
            f"\n{position}. ContentPart ID: {part.id}\n",
            f"   Format: {contentFormat}\n",
            f"   Type: {part.typeGroup}\n",
            f"   MIME Type: {part.mimeType or 'N/A'}\n",
            f"   Source: {metadata.get('documentId', 'unknown')}\n",
            f"   Original file name: {metadata.get('originalFileName', 'N/A')}\n",
            f"   Usage hint: {metadata.get('usageHint', 'N/A')}\n",
            f"   Data preview: {dataPreview}\n",
        ]
        return "".join(entryLines)

    def _buildStructurePrompt(
        self,
        userPrompt: str,
        contentParts: List[ContentPart],
        outputFormat: str
    ) -> str:
        """Build the prompt for structure generation.

        Args:
            userPrompt: The user's request, embedded verbatim.
            contentParts: Candidate parts; empty and whitespace-only parts
                are left out, reference parts are always listed. ``None`` is
                treated like an empty list.
            outputFormat: Target format; also used as the file extension in
                the example JSON.

        Returns:
            The complete prompt text including the content part index and
            the expected JSON layout.
        """
        allParts = list(contentParts or [])
        validParts = self._selectValidParts(allParts)

        logger.info(f"Building structure prompt with {len(validParts)} valid ContentParts (from {len(allParts)} total)")

        # Only valid parts are numbered and listed.
        contentPartsIndex = "".join(
            self._formatIndexEntry(position, part)
            for position, part in enumerate(validParts, 1)
        )

        if not contentPartsIndex:
            contentPartsIndex = "\n(No content parts available)"

        prompt = f"""USER REQUEST:
{userPrompt}

AVAILABLE CONTENT PARTS:
{contentPartsIndex}

TASK: Generiere Dokument-Struktur mit Sections.
Für jede Section, spezifiziere:
- section id
- content_type (heading, paragraph, image, table, etc.)
- contentPartIds: [Liste von ContentPart-IDs zu verwenden]
- contentFormats: {{"partId": "reference|object|extracted"}} - Wie jeder ContentPart zu verwenden ist
- generation_hint: Was AI für diese Section generieren soll
- elements: [] (leer, wird in nächster Phase gefüllt)

OUTPUT FORMAT: {outputFormat}

RETURN JSON:
{{
  "metadata": {{
    "title": "Document Title",
    "language": "de"
  }},
  "documents": [{{
    "id": "doc_1",
    "title": "Document Title",
    "filename": "document.{outputFormat}",
    "sections": [
      {{
        "id": "section_1",
        "content_type": "heading",
        "generation_hint": "Main title",
        "contentPartIds": [],
        "contentFormats": {{}},
        "elements": []
      }},
      {{
        "id": "section_2",
        "content_type": "paragraph",
        "generation_hint": "Introduction paragraph",
        "contentPartIds": ["part_ext_1"],
        "contentFormats": {{
          "part_ext_1": "extracted"
        }},
        "elements": []
      }}
    ]
  }}]
}}

Return ONLY valid JSON following the structure above.
"""
        return prompt