# gateway/modules/serviceCenter/services/serviceAi/subDocumentIntents.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Document Intent Analysis Module

Handles analysis of document intents, including:
- Clarifying which documents need extraction vs reference
- Resolving pre-extracted documents
- Building intent analysis prompts
"""
import json
import logging
from typing import Dict, Any, List, Optional

from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelExtraction import DocumentIntent
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped

logger = logging.getLogger(__name__)


class DocumentIntentAnalyzer:
    """Handles document intent analysis and resolution."""

    def __init__(self, services, aiService):
        """Initialize DocumentIntentAnalyzer with service center and AI service access."""
        self.services = services
        self.aiService = aiService

    async def clarifyDocumentIntents(
        self,
        documents: List[ChatDocument],
        userPrompt: str,
        actionParameters: Dict[str, Any],
        parentOperationId: str
    ) -> List[DocumentIntent]:
        """
        Phase 5A: Analysiert, welche Dokumente Extraktion vs Referenz benötigen.
        Gibt DocumentIntent für jedes Dokument zurück.

        Args:
            documents: Liste der zu verarbeitenden Dokumente
            userPrompt: User-Anfrage
            actionParameters: Action-spezifische Parameter (z.B. resultType, outputFormat)
            parentOperationId: Parent Operation-ID für ChatLog-Hierarchie

        Returns:
            Liste von DocumentIntent-Objekten
        """
        # Erstelle Operation-ID für Intent-Analyse
        intentOperationId = f"{parentOperationId}_intent_analysis"

        # Starte ChatLog mit Parent-Referenz
        self.services.chat.progressLogStart(
            intentOperationId,
            "Document Intent Analysis",
            "Intent Analysis",
            f"Analyzing {len(documents)} documents",
            parentOperationId=parentOperationId
        )

        try:
            # Mappe pre-extracted JSONs zu ursprünglichen Dokument-IDs für Intent-Analyse
            documentMapping = {}  # Maps original doc ID -> JSON doc ID
            resolvedDocuments = []

            for doc in documents:
                preExtracted = self.resolvePreExtractedDocument(doc)
                if preExtracted:
                    originalDocId = preExtracted["originalDocument"]["id"]
                    documentMapping[originalDocId] = doc.id
                    # Erstelle temporäres ChatDocument für ursprüngliches Dokument
                    originalDoc = ChatDocument(
                        id=originalDocId,
                        fileName=preExtracted["originalDocument"]["fileName"],
                        mimeType=preExtracted["originalDocument"]["mimeType"],
                        fileSize=preExtracted["originalDocument"].get("fileSize", doc.fileSize),
                        fileId=doc.fileId,  # Behalte fileId vom JSON
                        messageId=doc.messageId if hasattr(doc, 'messageId') else None  # Behalte messageId falls vorhanden
                    )
                    resolvedDocuments.append(originalDoc)
                else:
                    resolvedDocuments.append(doc)

            # Baue Intent-Analyse-Prompt mit ursprünglichen Dokumenten
            intentPrompt = self._buildIntentAnalysisPrompt(userPrompt, resolvedDocuments, actionParameters)

            # AI-Call (verwende callAiPlanning für einfache JSON-Responses)
            # Debug-Logs werden bereits von callAiPlanning geschrieben
            checkWorkflowStopped(self.services)
            aiResponse = await self.aiService.callAiPlanning(
                prompt=intentPrompt,
                debugType="document_intent_analysis"
            )

            # Parse Result und mappe zurück zu JSON-Dokument-IDs falls nötig
            intentsData = json.loads(self.services.utils.jsonExtractString(aiResponse))
            documentIntents = []
            for intent in intentsData.get("intents", []):
                docId = intent.get("documentId")
                # Wenn Intent für ursprüngliches Dokument, mappe zurück zu JSON-Dokument-ID
                if docId in documentMapping:
                    intent["documentId"] = documentMapping[docId]
                documentIntents.append(DocumentIntent(**intent))

            # Debug-Log (harmonisiert)
            self.services.utils.writeDebugFile(
                json.dumps([intent.dict() for intent in documentIntents], indent=2),
                "document_intent_analysis_result"
            )

            # State 1 Validation: Validate and auto-fix document intents
            documentIds = {d.id for d in documents}
            validatedIntents = []

            for intent in documentIntents:
                # Validation 1.2: Skip intents for unknown documents
                if intent.documentId not in documentIds:
                    # Try to find similar UUID (fix AI hallucination/typo)
                    correctedDocId = self._findSimilarDocumentId(intent.documentId, documentIds)
                    if correctedDocId:
                        logger.warning(f"Corrected UUID typo in AI response: {intent.documentId} -> {correctedDocId}")
                        intent.documentId = correctedDocId
                    else:
                        logger.warning(f"Skipping intent for unknown document: {intent.documentId}")
                        continue
                validatedIntents.append(intent)

            # Validation 1.1: Documents without intents are OK (not needed)
            # Intents for non-existing documents are already filtered above
            documentIntents = validatedIntents

            # ChatLog abschließen
            self.services.chat.progressLogFinish(intentOperationId, True)

            return documentIntents

        except Exception as e:
            self.services.chat.progressLogFinish(intentOperationId, False)
            logger.error(f"Error in clarifyDocumentIntents: {str(e)}")
            raise

    def resolvePreExtractedDocument(self, document: ChatDocument) -> Optional[Dict[str, Any]]:
        """
        Prüft ob ein JSON-Dokument bereits extrahierte ContentParts enthält.
        Gibt Dict zurück mit:
        - originalDocument: ChatDocument-Info des ursprünglichen Dokuments
        - contentExtracted: ContentExtracted-Objekt mit Parts
        - parts: Liste der ContentParts

        Returns None wenn kein pre-extracted Format erkannt wird.
        """
        if document.mimeType != "application/json":
            logger.debug(f"Document {document.id} is not JSON (mimeType={document.mimeType}), skipping pre-extracted check")
            return None

        try:
            docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
            if not docBytes:
                return None

            docData = docBytes.decode('utf-8')
            jsonData = json.loads(docData)

            if not isinstance(jsonData, dict):
                return None

            # Check for ContentExtracted format
            # Nur Format 1 (ActionDocument-Format mit validationMetadata) wird unterstützt
            documentData = None

            validationMetadata = jsonData.get("validationMetadata", {})
            actionType = validationMetadata.get("actionType")
            logger.debug(f"JSON document {document.id}: validationMetadata.actionType={actionType}, keys={list(jsonData.keys())}")

            if actionType == "context.extractContent":
                # Format: {"validationMetadata": {"actionType": "context.extractContent"}, "documentData": {...}}
                documentData = jsonData.get("documentData")
                logger.debug(f"Found ContentExtracted via validationMetadata for {document.fileName}, documentData keys: {list(documentData.keys()) if documentData else None}")
            else:
                logger.debug(f"JSON document {document.id} does not have actionType='context.extractContent' (got: {actionType})")

            if documentData:

                try:
                    # Stelle sicher, dass "id" vorhanden ist
                    if "id" not in documentData:
                        documentData["id"] = document.id

                    contentExtracted = ContentExtracted(**documentData)

                    if contentExtracted.parts:
                        # Extrahiere ursprüngliche Dokument-Info aus den Parts
                        originalDocId = None
                        originalFileName = None
                        originalMimeType = None

                        for part in contentExtracted.parts:
                            if part.metadata:
                                # Versuche ursprüngliche Dokument-Info zu finden
                                if not originalDocId and part.metadata.get("documentId"):
                                    originalDocId = part.metadata.get("documentId")
                                if not originalFileName and part.metadata.get("originalFileName"):
                                    originalFileName = part.metadata.get("originalFileName")
                                if not originalMimeType and part.metadata.get("documentMimeType"):
                                    originalMimeType = part.metadata.get("documentMimeType")

                        # Falls nicht gefunden, versuche aus documentName zu extrahieren
                        if not originalFileName:
                            # Versuche aus documentName zu extrahieren (z.B. "B2025-02c_28_extracted_...json" -> "B2025-02c_28.pdf")
                            if document.fileName and "_extracted_" in document.fileName:
                                originalFileName = document.fileName.split("_extracted_")[0] + ".pdf"

                        return {
                            "originalDocument": {
                                "id": originalDocId or document.id,
                                "fileName": originalFileName or document.fileName,
                                "mimeType": originalMimeType or "application/pdf",
                                "fileSize": document.fileSize
                            },
                            "contentExtracted": contentExtracted,
                            "parts": contentExtracted.parts
                        }
                except Exception as parseError:
                    logger.warning(f"Could not parse ContentExtracted format from {document.fileName}: {str(parseError)}")
                    logger.debug(f"JSON keys: {list(jsonData.keys())}, has parts: {'parts' in jsonData}")
                    import traceback
                    logger.debug(f"Parse error traceback: {traceback.format_exc()}")
                    return None
            else:
                logger.debug(f"JSON document {document.id} has no documentData (actionType={actionType})")

            return None
        except Exception as e:
            logger.debug(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
            return None

    def _buildIntentAnalysisPrompt(
        self,
        userPrompt: str,
        documents: List[ChatDocument],
        actionParameters: Dict[str, Any]
    ) -> str:
        """Baue Prompt für Intent-Analyse."""
        # Baue Dokument-Liste - zeige ursprüngliche Dokumente für pre-extracted JSONs
        docListText = ""
        for i, doc in enumerate(documents, 1):
            # Prüfe ob es ein pre-extracted JSON ist
            preExtracted = self.resolvePreExtractedDocument(doc)

            if preExtracted:
                # Zeige ursprüngliches Dokument statt JSON
                originalDoc = preExtracted["originalDocument"]
                partsInfo = f" (contains {len(preExtracted['parts'])} pre-extracted parts: {', '.join([p.typeGroup for p in preExtracted['parts'] if p.data and len(str(p.data)) > 0])})"
                docListText += f"\n{i}. Document ID: {originalDoc['id']}\n"
                docListText += f"   File Name: {originalDoc['fileName']}{partsInfo}\n"
                docListText += f"   MIME Type: {originalDoc['mimeType']}\n"
                docListText += f"   File Size: {originalDoc.get('fileSize', doc.fileSize)} bytes\n"
            else:
                # Normales Dokument
                docListText += f"\n{i}. Document ID: {doc.id}\n"
                docListText += f"   File Name: {doc.fileName}\n"
                docListText += f"   MIME Type: {doc.mimeType}\n"
                docListText += f"   File Size: {doc.fileSize} bytes\n"

        outputFormat = actionParameters.get("outputFormat", "txt")

        # FENCE user input to prevent prompt injection
        fencedUserPrompt = f"""```user_request
{userPrompt}
```"""

        prompt = f"""USER REQUEST:
{fencedUserPrompt}

DOCUMENTS TO ANALYZE:
{docListText}

TASK: For each document, determine its intents (can be multiple):
- "extract": Content extraction needed (text, structure, OCR, etc.)
- "render": Image/binary should be rendered as-is (visual element)
- "reference": Document reference/attachment (no extraction, just reference)

TASK: For each document, determine:
1. Intents (can be multiple): "extract", "render", "reference"
Note: Output format and language are NOT determined here - they will be
      determined during structure generation (Phase 3) in the chapter structure JSON

OUTPUT FORMAT: {outputFormat} (global fallback - for reference only)

RETURN JSON:
{{
  "intents": [
    {{
      "documentId": "doc_1",
      "intents": ["extract"],
      "extractionPrompt": "Extract all text content, preserving structure",
      "reasoning": "User needs text content for document generation"
    }},
    {{
      "documentId": "doc_2",
      "intents": ["extract", "render"],
      "extractionPrompt": "Extract text content from image using vision AI",
      "reasoning": "Image contains text that needs extraction, but also should be rendered visually"
    }},
    {{
      "documentId": "doc_3",
      "intents": ["reference"],
      "extractionPrompt": null,
      "reasoning": "Document is only used as reference, no extraction needed"
    }}
  ]
}}

CRITICAL RULES:
1. For images (mimeType starts with "image/"):
   - If user wants to "include" or "show" images → add "render"
   - If user wants to "analyze", "read text", or "extract text" from images → add "extract"
   - Can have BOTH "extract" and "render" if image needs both text extraction and visual rendering

2. For text documents:
   - If user mentions "template" or "structure" → "reference" or "extract" based on context
   - If user mentions "reference" or "context" → "reference"
   - Default → "extract"

3. Consider output format:
   - For formats like PDF, DOCX, PPTX: images usually need "render"
   - For formats like CSV, JSON: usually "extract" only
   - For HTML: can have both "extract" and "render"

Return ONLY valid JSON following the structure above.
"""
        return prompt

    def _findSimilarDocumentId(self, incorrectId: str, validIds: set) -> Optional[str]:
        """
        Versucht eine ähnliche Dokument-ID zu finden, falls die AI die UUID geändert hat.
        Prüft auf UUID-Typo (z.B. 4451 -> 4551).

        Args:
            incorrectId: Die falsche UUID aus der AI-Response
            validIds: Set von gültigen Dokument-IDs

        Returns:
            Korrigierte UUID falls gefunden, sonst None
        """
        if not incorrectId or len(incorrectId) != 36:  # UUID Format: 8-4-4-4-12
            return None

        # Prüfe ob es eine UUID ist (Format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx)
        if incorrectId.count('-') != 4:
            return None

        # Versuche Levenshtein-ähnliche Suche: Prüfe ob nur 1-2 Zeichen unterschiedlich sind
        for validId in validIds:
            if len(validId) != 36:
                continue

            # Zähle unterschiedliche Zeichen
            differences = sum(c1 != c2 for c1, c2 in zip(incorrectId, validId))

            # Wenn nur 1-2 Zeichen unterschiedlich sind, ist es wahrscheinlich ein Typo
            if differences <= 2:
                # Prüfe ob die Struktur ähnlich ist (gleiche Positionen der Bindestriche)
                if incorrectId.count('-') == validId.count('-'):
                    return validId

        return None