# gateway/modules/services/serviceAi/subDocumentIntents.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Document Intent Analysis Module

Handles analysis of document intents, including:
- Clarifying which documents need extraction vs reference
- Resolving pre-extracted documents
- Building intent analysis prompts
"""
import json
import logging
from typing import Dict, Any, List, Optional

from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelExtraction import DocumentIntent

logger = logging.getLogger(__name__)


class DocumentIntentAnalyzer:
    """Decide per document whether it must be extracted, rendered, or only referenced.

    The analyzer builds a prompt describing all input documents, sends it to the
    AI service and converts the JSON answer into ``DocumentIntent`` objects.

    JSON documents that carry the result of an earlier ``context.extractContent``
    action ("pre-extracted" documents) are presented to the AI as the original
    document they were extracted from; intents returned for that original
    document are mapped back to the ID of the JSON document.
    """

    def __init__(self, services, aiService):
        """Store the service center and the AI service used for the analysis call."""
        self.services = services
        self.aiService = aiService

    async def clarifyDocumentIntents(
        self,
        documents: List[ChatDocument],
        userPrompt: str,
        actionParameters: Dict[str, Any],
        parentOperationId: str
    ) -> List[DocumentIntent]:
        """
        Phase 5A: analyze which documents need extraction vs. reference.

        Args:
            documents: Documents to analyze.
            userPrompt: The user's request.
            actionParameters: Action-specific parameters (e.g. resultType, outputFormat).
            parentOperationId: Parent operation ID used for the ChatLog hierarchy.

        Returns:
            One DocumentIntent per entry of the "intents" array in the AI response.

        Raises:
            Exception: Any error raised while resolving documents, calling the AI
                or parsing its response is logged, the progress log is finished
                as failed, and the error is re-raised unchanged.
        """
        intentOperationId = f"{parentOperationId}_intent_analysis"

        # Open a progress log entry nested below the parent operation.
        self.services.chat.progressLogStart(
            intentOperationId,
            "Document Intent Analysis",
            "Intent Analysis",
            f"Analyzing {len(documents)} documents",
            parentOperationId=parentOperationId
        )

        try:
            # original document ID -> ID of the JSON document that holds its extraction
            documentMapping: Dict[Any, Any] = {}
            resolvedDocuments: List[ChatDocument] = []
            # Parallel to resolvedDocuments; handed to the prompt builder so it does
            # not have to load and parse every file a second time.
            preExtractedInfos: List[Optional[Dict[str, Any]]] = []

            for doc in documents:
                preExtracted = self.resolvePreExtractedDocument(doc)
                preExtractedInfos.append(preExtracted)
                if not preExtracted:
                    resolvedDocuments.append(doc)
                    continue

                originalInfo = preExtracted["originalDocument"]
                originalDocId = originalInfo["id"]
                documentMapping[originalDocId] = doc.id
                # Temporary stand-in describing the original document. fileId (and
                # messageId, if the document has one) are kept from the JSON document.
                resolvedDocuments.append(ChatDocument(
                    id=originalDocId,
                    fileName=originalInfo["fileName"],
                    mimeType=originalInfo["mimeType"],
                    fileSize=originalInfo.get("fileSize", doc.fileSize),
                    fileId=doc.fileId,
                    messageId=getattr(doc, "messageId", None)
                ))

            intentPrompt = self._buildIntentAnalysisPrompt(
                userPrompt,
                resolvedDocuments,
                actionParameters,
                preExtractedInfos
            )

            # callAiPlanning is used for simple JSON responses.
            aiResponse = await self.aiService.callAiPlanning(
                prompt=intentPrompt,
                debugType="document_intent_analysis"
            )

            intentsData = json.loads(self.services.utils.jsonExtractString(aiResponse))
            documentIntents = []
            # "or []" also covers an explicit `"intents": null` in the response.
            for intent in intentsData.get("intents") or []:
                docId = intent.get("documentId")
                # Intents for an original document belong to its JSON document.
                if docId in documentMapping:
                    intent["documentId"] = documentMapping[docId]
                documentIntents.append(DocumentIntent(**intent))

            self.services.utils.writeDebugFile(
                json.dumps([intent.dict() for intent in documentIntents], indent=2),
                "document_intent_analysis_result"
            )

            self.services.chat.progressLogFinish(intentOperationId, True)

            return documentIntents

        except Exception as e:
            self.services.chat.progressLogFinish(intentOperationId, False)
            logger.error(f"Error in clarifyDocumentIntents: {str(e)}")
            raise

    def resolvePreExtractedDocument(self, document: ChatDocument) -> Optional[Dict[str, Any]]:
        """
        Check whether a JSON document already contains extracted ContentParts.

        Only the ActionDocument format is supported, i.e.
        ``{"validationMetadata": {"actionType": "context.extractContent"}, "documentData": {...}}``.

        Args:
            document: The document to inspect.

        Returns:
            None if the document is not a recognized pre-extracted JSON (including
            any load or parse failure, which is logged and swallowed on purpose),
            otherwise a dict with:
            - originalDocument: id / fileName / mimeType / fileSize of the original document
            - contentExtracted: the parsed ContentExtracted object
            - parts: the list of ContentParts of that object
        """
        if document.mimeType != "application/json":
            logger.debug(f"Document {document.id} is not JSON (mimeType={document.mimeType}), skipping pre-extracted check")
            return None

        try:
            docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
            if not docBytes:
                return None

            jsonData = json.loads(docBytes.decode('utf-8'))
            if not isinstance(jsonData, dict):
                return None

            # A missing, null or non-object validationMetadata simply means "no actionType".
            validationMetadata = jsonData.get("validationMetadata")
            actionType = validationMetadata.get("actionType") if isinstance(validationMetadata, dict) else None
            logger.debug(f"JSON document {document.id}: validationMetadata.actionType={actionType}, keys={list(jsonData.keys())}")

            if actionType != "context.extractContent":
                logger.debug(f"JSON document {document.id} does not have actionType='context.extractContent' (got: {actionType})")
                return None

            documentData = jsonData.get("documentData")
            if not documentData or not isinstance(documentData, dict):
                logger.debug(f"JSON document {document.id} has no documentData (actionType={actionType})")
                return None
            logger.debug(f"Found ContentExtracted via validationMetadata for {document.fileName}, documentData keys: {list(documentData.keys())}")

            from modules.datamodels.datamodelExtraction import ContentExtracted

            try:
                # Fall back to the JSON document's ID when documentData has no "id".
                documentData.setdefault("id", document.id)
                contentExtracted = ContentExtracted(**documentData)

                if not contentExtracted.parts:
                    return None

                return {
                    "originalDocument": self._describeOriginalDocument(document, contentExtracted.parts),
                    "contentExtracted": contentExtracted,
                    "parts": contentExtracted.parts
                }
            except Exception as parseError:
                logger.warning(f"Could not parse ContentExtracted format from {document.fileName}: {str(parseError)}")
                logger.debug(f"JSON keys: {list(jsonData.keys())}, has parts: {'parts' in jsonData}")
                import traceback
                logger.debug(f"Parse error traceback: {traceback.format_exc()}")
                return None
        except Exception as e:
            # Best effort: an unreadable document is treated as "not pre-extracted".
            logger.debug(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
            return None

    @staticmethod
    def _describeOriginalDocument(document: ChatDocument, parts: List[Any]) -> Dict[str, Any]:
        """Recover id, file name and MIME type of the original document from part metadata.

        The first non-empty "documentId", "originalFileName" and "documentMimeType"
        found in any part's metadata win. Missing values fall back to the JSON
        document's own id / fileName and to "application/pdf" respectively.
        """
        originalDocId = None
        originalFileName = None
        originalMimeType = None

        for part in parts:
            metadata = part.metadata
            if not metadata:
                continue
            originalDocId = originalDocId or metadata.get("documentId")
            originalFileName = originalFileName or metadata.get("originalFileName")
            originalMimeType = originalMimeType or metadata.get("documentMimeType")
            if originalDocId and originalFileName and originalMimeType:
                break

        # Heuristic fallback: derive the name from the JSON file name,
        # e.g. "B2025-02c_28_extracted_...json" -> "B2025-02c_28.pdf".
        if not originalFileName and document.fileName and "_extracted_" in document.fileName:
            originalFileName = document.fileName.split("_extracted_")[0] + ".pdf"

        return {
            "id": originalDocId or document.id,
            "fileName": originalFileName or document.fileName,
            "mimeType": originalMimeType or "application/pdf",
            "fileSize": document.fileSize
        }

    def _buildIntentAnalysisPrompt(
        self,
        userPrompt: str,
        documents: List[ChatDocument],
        actionParameters: Dict[str, Any],
        preExtractedInfos: Optional[List[Optional[Dict[str, Any]]]] = None
    ) -> str:
        """
        Build the prompt for the intent analysis AI call.

        Args:
            userPrompt: The user's request, embedded verbatim.
            documents: Documents to list in the prompt.
            actionParameters: Action parameters; only "outputFormat" is read
                (default "txt"). None is treated like an empty dict.
            preExtractedInfos: Optional list parallel to ``documents`` holding, per
                document, the result of resolvePreExtractedDocument (or None).
                When omitted, every document is resolved here.

        Returns:
            The complete prompt text.

        Raises:
            ValueError: If preExtractedInfos is given but its length differs from
                the number of documents.
        """
        if preExtractedInfos is None:
            preExtractedInfos = [self.resolvePreExtractedDocument(doc) for doc in documents]
        elif len(preExtractedInfos) != len(documents):
            raise ValueError("preExtractedInfos must contain exactly one entry per document")

        docEntries = []
        for i, (doc, preExtracted) in enumerate(zip(documents, preExtractedInfos), 1):
            if preExtracted:
                # Show the original document instead of the JSON wrapper, and list
                # the type groups of all parts that actually carry data.
                originalDoc = preExtracted["originalDocument"]
                parts = preExtracted["parts"]
                partTypes = ', '.join([p.typeGroup for p in parts if p.data and len(str(p.data)) > 0])
                partsInfo = f" (contains {len(parts)} pre-extracted parts: {partTypes})"
                docEntries.append(
                    f"\n{i}. Document ID: {originalDoc['id']}\n"
                    f"   File Name: {originalDoc['fileName']}{partsInfo}\n"
                    f"   MIME Type: {originalDoc['mimeType']}\n"
                    f"   File Size: {originalDoc.get('fileSize', doc.fileSize)} bytes\n"
                )
            else:
                docEntries.append(
                    f"\n{i}. Document ID: {doc.id}\n"
                    f"   File Name: {doc.fileName}\n"
                    f"   MIME Type: {doc.mimeType}\n"
                    f"   File Size: {doc.fileSize} bytes\n"
                )
        docListText = "".join(docEntries)

        outputFormat = (actionParameters or {}).get("outputFormat", "txt")

        prompt = f"""USER REQUEST:
{userPrompt}

DOCUMENTS TO ANALYZE:
{docListText}

TASK: For each document, determine its intents (can be multiple):
- "extract": Content extraction needed (text, structure, OCR, etc.)
- "render": Image/binary should be rendered as-is (visual element)
- "reference": Document reference/attachment (no extraction, just reference)

OUTPUT FORMAT: {outputFormat}

RETURN JSON:
{{
  "intents": [
    {{
      "documentId": "doc_1",
      "intents": ["extract"],  # Array - can contain multiple!
      "extractionPrompt": "Extract all text content, preserving structure",
      "reasoning": "User needs text content for document generation"
    }},
    {{
      "documentId": "doc_2",
      "intents": ["extract", "render"],  # Both! Image needs text extraction AND visual rendering
      "extractionPrompt": "Extract text content from image using vision AI",
      "reasoning": "Image contains text that needs extraction, but also should be rendered visually"
    }},
    {{
      "documentId": "doc_3",
      "intents": ["reference"],
      "extractionPrompt": null,
      "reasoning": "Document is only used as reference, no extraction needed"
    }}
  ]
}}

CRITICAL RULES:
1. For images (mimeType starts with "image/"):
   - If user wants to "include" or "show" images → add "render"
   - If user wants to "analyze", "read text", or "extract text" from images → add "extract"
   - Can have BOTH "extract" and "render" if image needs both text extraction and visual rendering

2. For text documents:
   - If user mentions "template" or "structure" → "reference" or "extract" based on context
   - If user mentions "reference" or "context" → "reference"
   - Default → "extract"

3. Consider output format:
   - For formats like PDF, DOCX, PPTX: images usually need "render"
   - For formats like CSV, JSON: usually "extract" only
   - For HTML: can have both "extract" and "render"

Return ONLY valid JSON following the structure above.
"""
        return prompt