# gateway/modules/services/serviceAi/subStructureGeneration.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Structure Generation Module

Handles document structure generation, including:
- Generating the document structure (documents and their chapters)
- Building structure prompts
"""
import json
import logging
from typing import Dict, Any, List, Optional

from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, PriorityEnum, ProcessingModeEnum
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped

logger = logging.getLogger(__name__)


class StructureGenerator:
    """Handles document structure generation."""

    def __init__(self, services, aiService):
        """Initialize StructureGenerator with service center and AI service access."""
        self.services = services
        self.aiService = aiService

    def _getUserLanguage(self) -> str:
        """Get user language for document generation"""
        try:
            if self.services:
                # Prefer detected language if available (from user intention analysis)
                if hasattr(self.services, 'currentUserLanguage') and self.services.currentUserLanguage:
                    return self.services.currentUserLanguage
                # Fallback to user's preferred language
                elif hasattr(self.services, 'user') and self.services.user and hasattr(self.services.user, 'language'):
                    return self.services.user.language
        except Exception:
            pass
        return 'en'  # Default fallback

    async def generateStructure(
        self,
        userPrompt: str,
        contentParts: List[ContentPart],
        outputFormat: Optional[str] = None,
        parentOperationId: str = None
    ) -> Dict[str, Any]:
        """
        Phase 5C: Generiert Chapter-Struktur (Table of Contents).
        Definiert für jedes Chapter:
        - Level, Title
        - contentParts (unified object with instruction and/or caption per part)
        - generationHint

        Generate document structure with per-document format determination.
        Multiple documents can be produced with different formats (e.g., one PDF, one HTML).
        AI determines formats per-document from user prompt. The outputFormat parameter is
        only a validation fallback - used if AI doesn't return format per document.

        Args:
            userPrompt: User-Anfrage
            contentParts: Alle vorbereiteten ContentParts mit Metadaten
            outputFormat: Optional global format fallback. If omitted, formats are determined
                         from user prompt by AI. Used as validation fallback if AI doesn't
                         return format per document. Defaults to "txt" if not provided.
            parentOperationId: Parent Operation-ID für ChatLog-Hierarchie

        Returns:
            Struktur-Dict mit documents und chapters (nicht sections!)
        """
        # If outputFormat not provided, use "txt" as fallback for validation
        # AI will determine formats per document from user prompt
        if not outputFormat:
            outputFormat = "txt"
            logger.debug("outputFormat not provided - using 'txt' as validation fallback, formats determined from prompt")
        # Erstelle Operation-ID für Struktur-Generierung
        structureOperationId = f"{parentOperationId}_structure_generation"

        # Starte ChatLog mit Parent-Referenz
        formatDisplay = outputFormat if outputFormat else "auto-determined"
        self.services.chat.progressLogStart(
            structureOperationId,
            "Chapter Structure Generation",
            "Structure",
            f"Generating chapter structure (format: {formatDisplay})",
            parentOperationId=parentOperationId
        )

        try:
            # Baue Chapter-Struktur-Prompt mit Content-Index
            structurePrompt = self._buildChapterStructurePrompt(
                userPrompt=userPrompt,
                contentParts=contentParts,
                outputFormat=outputFormat
            )

            # AI-Call für Chapter-Struktur-Generierung mit Looping-Unterstützung
            # Use _callAiWithLooping instead of callAiPlanning to support continuation if response is cut
            options = AiCallOptions(
                operationType=OperationTypeEnum.DATA_GENERATE,
                priority=PriorityEnum.QUALITY,
                processingMode=ProcessingModeEnum.DETAILED,
                compressPrompt=False,
                compressContext=False,
                resultFormat="json"
            )

            # Create prompt builder for continuation support
            async def buildChapterStructurePromptWithContinuation(
                continuationContext: Optional[Dict[str, Any]] = None,
                **kwargs
            ) -> str:
                """Build chapter structure prompt with optional continuation context."""
                basePrompt = self._buildChapterStructurePrompt(
                    userPrompt=userPrompt,
                    contentParts=contentParts,
                    outputFormat=outputFormat
                )

                if continuationContext:
                    # Add continuation instructions
                    deliveredSummary = continuationContext.get("delivered_summary", "")
                    elementBeforeCutoff = continuationContext.get("element_before_cutoff", "")
                    cutOffElement = continuationContext.get("cut_off_element", "")

                    continuationText = f"{deliveredSummary}\n\n"
                    continuationText += "⚠️ CONTINUATION: Response was cut off. Generate ONLY the remaining content that comes AFTER the reference elements below.\n\n"

                    if elementBeforeCutoff:
                        continuationText += "# REFERENCE: Last complete element (already delivered - DO NOT repeat):\n"
                        continuationText += f"{elementBeforeCutoff}\n\n"

                    if cutOffElement:
                        continuationText += "# REFERENCE: Incomplete element (cut off here - DO NOT repeat):\n"
                        continuationText += f"{cutOffElement}\n\n"

                    continuationText += "⚠️ CRITICAL: The elements above are REFERENCE ONLY. They are already delivered.\n"
                    continuationText += "Generate ONLY what comes AFTER these elements. DO NOT regenerate the entire JSON structure.\n"
                    continuationText += "Start directly with the next chapter that should follow.\n\n"

                    return f"""{basePrompt}

{continuationText}

Continue generating the remaining chapters now.
"""
                else:
                    return basePrompt

            # Call AI with looping support
            # NOTE: Do NOT pass contentParts here - we only need metadata for structure generation
            # The contentParts metadata is already included in the prompt (contentPartsIndex)
            # Actual content extraction happens later during section generation
            checkWorkflowStopped(self.services)
            aiResponseJson = await self.aiService.callAiWithLooping(
                prompt=structurePrompt,
                options=options,
                debugPrefix="chapter_structure_generation",
                promptBuilder=buildChapterStructurePromptWithContinuation,
                promptArgs={
                    "userPrompt": userPrompt,
                    "outputFormat": outputFormat,
                    "services": self.services
                },
                useCaseId="chapter_structure",  # REQUIRED: Explicit use case ID
                operationId=structureOperationId,
                userPrompt=userPrompt,
                contentParts=None  # Do not pass ContentParts - only metadata needed, not content extraction
            )

            # Parse the complete JSON response (looping system already handles completion)
            extractedJson = self.services.utils.jsonExtractString(aiResponseJson)
            parsedJson, parseError, cleanedJson = self.services.utils.jsonTryParse(extractedJson)

            if parseError is not None:
                # Even with looping, try repair as fallback
                logger.warning(f"JSON parsing failed after looping: {str(parseError)}. Attempting repair...")
                from modules.shared import jsonUtils
                repairedJson = jsonUtils.repairBrokenJson(extractedJson)
                if repairedJson:
                    parsedJson, parseError, _ = self.services.utils.jsonTryParse(json.dumps(repairedJson))
                    if parseError is None:
                        logger.info("Successfully repaired and parsed JSON structure after looping")
                        structure = parsedJson
                    else:
                        logger.error(f"Failed to parse repaired JSON: {str(parseError)}")
                        raise ValueError(f"Failed to parse JSON structure after repair: {str(parseError)}")
                else:
                    logger.error(f"Failed to repair JSON. Parse error: {str(parseError)}")
                    logger.error(f"Cleaned JSON preview (first 500 chars): {cleanedJson[:500]}")
                    raise ValueError(f"Failed to parse JSON structure: {str(parseError)}")
            else:
                structure = parsedJson

            # State 3 Validation: Validate and auto-fix structure
            # Validation 3.1: Structure missing 'documents' field
            if "documents" not in structure:
                raise ValueError("Structure missing 'documents' field - cannot auto-fix")

            documents = structure["documents"]

            # Validation 3.2: Structure has no documents
            if not isinstance(documents, list) or len(documents) == 0:
                raise ValueError("Structure has no documents - cannot generate without documents")

            # Import renderer registry for format validation (existing infrastructure)
            from modules.services.serviceGeneration.renderers.registry import getRenderer

            # Validate and fix each document
            for doc in documents:
                # Validation 3.3 & 3.4: Document outputFormat
                # outputFormat parameter is optional - if omitted, formats determined from prompt by AI
                # Use as fallback only if AI doesn't return format per document
                # Multiple documents can have different formats (e.g., one PDF, one HTML)
                globalFormatFallback = outputFormat or "txt"  # Fallback for validation

                if "outputFormat" not in doc or not doc["outputFormat"]:
                    # AI didn't return format or returned empty - use global fallback
                    doc["outputFormat"] = globalFormatFallback
                    logger.warning(f"Document {doc.get('id')} missing outputFormat - using fallback: {doc['outputFormat']}")
                else:
                    # AI returned format - validate using existing renderer registry
                    formatName = str(doc["outputFormat"]).lower().strip()
                    renderer = getRenderer(formatName)  # Uses existing infrastructure

                    if not renderer:
                        # Format doesn't match any renderer - use txt (simple approach)
                        logger.warning(f"Document {doc.get('id')} has format without renderer: {formatName}, using 'txt'")
                        doc["outputFormat"] = "txt"
                    else:
                        # Valid format with renderer - normalize and keep AI result
                        doc["outputFormat"] = formatName
                        logger.debug(f"Document {doc.get('id')} using AI-determined format: {formatName}")

                # Validation 3.5 & 3.6: Document language
                # Use validated currentUserLanguage (always valid, validated during user intention analysis)
                # Access via _getUserLanguage() which uses self.services.currentUserLanguage
                userPromptLanguage = self._getUserLanguage()  # Uses validated currentUserLanguage infrastructure

                if "language" not in doc or not isinstance(doc["language"], str) or len(doc["language"]) != 2:
                    # AI didn't return language or invalid format - use validated currentUserLanguage
                    doc["language"] = userPromptLanguage
                    if "language" not in doc:
                        logger.warning(f"Document {doc.get('id')} missing language - using currentUserLanguage: {userPromptLanguage}")
                    else:
                        logger.warning(f"Document {doc.get('id')} has invalid language format from AI: {doc['language']}, using currentUserLanguage")
                else:
                    # AI returned valid language format - normalize
                    doc["language"] = doc["language"].lower().strip()[:2]
                    logger.debug(f"Document {doc.get('id')} using AI-determined language: {doc['language']}")

                # Validation 3.7: Document missing 'chapters' field
                if "chapters" not in doc:
                    raise ValueError(f"Document {doc.get('id')} missing 'chapters' field - cannot auto-fix")

                # Validation 3.8: Chapter missing 'contentParts' field
                for chapter in doc["chapters"]:
                    if "contentParts" not in chapter:
                        raise ValueError(f"Chapter {chapter.get('id')} missing 'contentParts' field - cannot auto-fix")

            # ChatLog abschließen
            self.services.chat.progressLogFinish(structureOperationId, True)

            return structure

        except Exception as e:
            self.services.chat.progressLogFinish(structureOperationId, False)
            logger.error(f"Error in generateStructure: {str(e)}")
            raise

    def _buildChapterStructurePrompt(
        self,
        userPrompt: str,
        contentParts: List[ContentPart],
        outputFormat: str
    ) -> str:
        """Baue Prompt für Chapter-Struktur-Generierung."""
        # Baue ContentParts-Index - filtere leere Parts heraus
        contentPartsIndex = ""
        validParts = []
        filteredParts = []

        for part in contentParts:
            contentFormat = part.metadata.get("contentFormat", "unknown")

            # WICHTIG: Reference Parts haben absichtlich leere Daten - immer einschließen
            if contentFormat == "reference":
                validParts.append(part)
                logger.debug(f"Including reference ContentPart {part.id} (intentionally empty data)")
                continue

            # Überspringe leere Parts (keine Daten oder nur Container ohne Inhalt)
            # ABER: Reference Parts wurden bereits oben behandelt
            if not part.data or (isinstance(part.data, str) and len(part.data.strip()) == 0):
                # Überspringe Container-Parts ohne Daten
                if part.typeGroup == "container" and not part.data:
                    filteredParts.append((part.id, "container without data"))
                    continue
                # Überspringe andere leere Parts (aber nicht Reference, die wurden bereits behandelt)
                if not part.data:
                    filteredParts.append((part.id, f"no data (format: {contentFormat})"))
                    continue

            validParts.append(part)
            logger.debug(f"Including ContentPart {part.id}: format={contentFormat}, type={part.typeGroup}, dataLength={len(str(part.data)) if part.data else 0}")

        if filteredParts:
            logger.debug(f"Filtered out {len(filteredParts)} empty ContentParts: {filteredParts}")

        logger.info(f"Building structure prompt with {len(validParts)} valid ContentParts (from {len(contentParts)} total)")

        # Baue Index nur für gültige Parts
        for i, part in enumerate(validParts, 1):
            contentFormat = part.metadata.get("contentFormat", "unknown")
            originalFileName = part.metadata.get('originalFileName', 'N/A')

            contentPartsIndex += f"\n{i}. ContentPart ID: {part.id}\n"
            contentPartsIndex += f"   Format: {contentFormat}\n"
            contentPartsIndex += f"   Type: {part.typeGroup}\n"
            contentPartsIndex += f"   MIME Type: {part.mimeType or 'N/A'}\n"
            contentPartsIndex += f"   Source: {part.metadata.get('documentId', 'unknown')}\n"
            contentPartsIndex += f"   Original file name: {originalFileName}\n"
            contentPartsIndex += f"   Usage hint: {part.metadata.get('usageHint', 'N/A')}\n"

        if not contentPartsIndex:
            contentPartsIndex = "\n(No content parts available)"

        # Get language from services (user intention analysis)
        language = self._getUserLanguage()
        logger.debug(f"Using language from services (user intention analysis) for structure generation: {language}")

        prompt = f"""# TASK: Generate Chapter Structure

This is a PLANNING task. Return EXACTLY ONE complete JSON object. Do not generate multiple JSON objects, alternatives, or variations. Do not use separators like "---" between JSON objects.

## USER REQUEST (for context)
```
{userPrompt}
```

## AVAILABLE CONTENT PARTS
{contentPartsIndex}

## CONTENT ASSIGNMENT RULE
If the user request mentions documents/images/data, then EVERY chapter that generates content related to those references MUST assign the relevant ContentParts explicitly.

Assignment logic:
- If chapter DISPLAYS a document/image → assign "object" format ContentPart with "caption"
- If chapter generates text content ABOUT a document/image/data → assign ContentPart with "instruction":
  - Prefer "extracted" format if available (contains analyzed/extracted content)
  - If only "object" format is available, use "object" format with "instruction" (to write ABOUT the image/document)
- If chapter's generationHint or purpose relates to a document/image/data mentioned in user request → it MUST have ContentParts assigned
- Multiple chapters might assign the same ContentPart (e.g., one chapter displays image, another writes about it)
- Use ContentPart IDs exactly as listed in AVAILABLE CONTENT PARTS above
- Empty contentParts are only allowed if chapter generates content WITHOUT referencing any documents/images/data from the user request

CRITICAL RULE: If the user request mentions BOTH:
  a) Documents/images/data (listed in AVAILABLE CONTENT PARTS above), AND
  b) Generic content types (article text, main content, body text, etc.)
Then chapters that generate those generic content types MUST assign the relevant ContentParts, because the content should relate to or be based on the provided documents/images/data.

## CHAPTER STRUCTURE REQUIREMENTS
- Generate chapters based on USER REQUEST - analyze what structure the user wants
- Each chapter needs: id, level (1, 2, 3, etc.), title
- contentParts: {{"partId": {{"instruction": "..."}} or {{"caption": "..."}} or both}} - Assign ContentParts as required by CONTENT ASSIGNMENT RULE above
- generationHint: Description of what content to generate for this chapter
- The number of chapters depends on the user request - create only what is requested

## DOCUMENT OUTPUT FORMAT
For each document, determine the output format by analyzing the USER REQUEST:
- Look for explicit format mentions
- Infer from document purpose
- Infer from content type
- If format cannot be determined from the prompt, use: "{outputFormat}"
- Include "outputFormat" field in each document in the JSON structure
- Multiple documents can have different formats

## DOCUMENT LANGUAGE
For each document, determine the language by analyzing the USER REQUEST:
- Look for explicit language mentions
- Map language names to ISO 639-1 codes
- If language cannot be determined from the prompt, use: "{language}"
- Include "language" field in each document in the JSON structure
- Multiple documents can have different languages

## JSON STRUCTURE REQUIREMENTS
- metadata: {{"title": "...", "language": "..."}}
- documents: Array of document objects, each with:
  - id: Unique document identifier (e.g., "doc_1")
  - title: Document title
  - filename: Output filename with extension (e.g., "document.docx")
  - outputFormat: Format code (e.g., "docx", "pdf", "html", "xlsx", "pptx", "txt")
  - language: ISO 639-1 language code (e.g., "de", "en", "fr", "it")
  - chapters: Array of chapter objects, each with:
    - id: Unique chapter identifier (e.g., "chapter_1")
    - level: Heading level (1, 2, 3, etc.)
    - title: Chapter title
    - contentParts: Object mapping ContentPart IDs to usage instructions {{"partId": {{"instruction": "..."}} or {{"caption": "..."}}}}
    - generationHint: Description of what content to generate
    - sections: Empty array []

EXAMPLE STRUCTURE (for reference only - adapt to user request):
{{
  "metadata": {{
    "title": "Document Title",
    "language": "{language}"
  }},
  "documents": [{{
    "id": "doc_1",
    "title": "Document Title",
    "filename": "document.{outputFormat}",
    "outputFormat": "{outputFormat}",
    "language": "{language}",
    "chapters": [
      {{
        "id": "chapter_1",
        "level": 1,
        "title": "Chapter Title",
        "contentParts": {{
          "extracted_part_id": {{
            "instruction": "Use extracted content..."
          }}
        }},
        "generationHint": "Description of chapter content",
        "sections": []
      }}
    ]
  }}]
}}

CRITICAL INSTRUCTIONS:
- Generate chapters based on USER REQUEST, NOT based on the example above
- The example shows the JSON structure format, NOT the required chapters
- Create only the chapters that match the user's request
- Adapt chapter titles and structure to match the user's specific request
- Determine outputFormat and language for each document by analyzing the USER REQUEST above
- The example shows placeholders "{outputFormat}" and "{language}" - YOU MUST REPLACE THESE with actual values determined from the USER REQUEST

MANDATORY CONTENT ASSIGNMENT CHECK:
For each chapter, verify:
1. Does the user request mention documents/images/data? (e.g., "photo", "image", "document", "data", "based on", "about")
2. Does this chapter's generationHint, title, or purpose relate to those documents/images/data mentioned in step 1?
   - Examples: "article about the photo", "text describing the image", "analysis of the document", "content based on the data"
   - Even if chapter doesn't explicitly say "about the image", if user request mentions both the image AND this chapter's content type → relate them
3. If YES to both → chapter MUST have contentParts assigned (cannot be empty {{}})
4. If ContentPart is "object" format and chapter needs to write ABOUT it → assign with "instruction" field, not just "caption"

OUTPUT FORMAT: Start with {{ and end with }}. Do NOT use markdown code fences (```json). Do NOT add explanatory text before or after the JSON. Return ONLY the JSON object itself.
"""
        return prompt