# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

"""
Document Intent Analysis Module

Handles analysis of document intents, including:
- Clarifying which documents need extraction vs reference
- Resolving pre-extracted documents
- Building intent analysis prompts
"""

import json
import logging
from typing import Dict, Any, List, Optional

from modules.aichat.datamodelFeatureAiChat import ChatDocument
from modules.datamodels.datamodelExtraction import DocumentIntent
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped

logger = logging.getLogger(__name__)


class DocumentIntentAnalyzer:
    """Handles document intent analysis and resolution."""

    def __init__(self, services, aiService):
        """Initialize DocumentIntentAnalyzer with service center and AI service access."""
        self.services = services
        self.aiService = aiService

    async def clarifyDocumentIntents(
        self,
        documents: List[ChatDocument],
        userPrompt: str,
        actionParameters: Dict[str, Any],
        parentOperationId: str
    ) -> List[DocumentIntent]:
        """
        Phase 5A: Analyze which documents need extraction vs reference.

        Returns a DocumentIntent for each document.

        Args:
            documents: List of documents to process
            userPrompt: User request
            actionParameters: Action-specific parameters (e.g. resultType, outputFormat)
            parentOperationId: Parent operation ID for the ChatLog hierarchy

        Returns:
            List of DocumentIntent objects

        Raises:
            Exception: Re-raises any error after closing the ChatLog with failure status.
        """
        # Dedicated operation ID for the intent analysis step
        intentOperationId = f"{parentOperationId}_intent_analysis"

        # Start ChatLog with parent reference
        self.services.chat.progressLogStart(
            intentOperationId,
            "Document Intent Analysis",
            "Intent Analysis",
            f"Analyzing {len(documents)} documents",
            parentOperationId=parentOperationId
        )

        try:
            # Map pre-extracted JSONs back to their original document IDs for intent analysis
            documentMapping = {}  # Maps original doc ID -> JSON doc ID
            resolvedDocuments = []
            for doc in documents:
                preExtracted = self.resolvePreExtractedDocument(doc)
                if preExtracted:
                    originalDocId = preExtracted["originalDocument"]["id"]
                    documentMapping[originalDocId] = doc.id
                    # Build a temporary ChatDocument describing the original document
                    originalDoc = ChatDocument(
                        id=originalDocId,
                        fileName=preExtracted["originalDocument"]["fileName"],
                        mimeType=preExtracted["originalDocument"]["mimeType"],
                        fileSize=preExtracted["originalDocument"].get("fileSize", doc.fileSize),
                        fileId=doc.fileId,  # Keep fileId from the JSON document
                        messageId=getattr(doc, 'messageId', None)  # Keep messageId if present
                    )
                    resolvedDocuments.append(originalDoc)
                else:
                    resolvedDocuments.append(doc)

            # Build the intent analysis prompt using the resolved (original) documents
            intentPrompt = self._buildIntentAnalysisPrompt(userPrompt, resolvedDocuments, actionParameters)

            # AI call (callAiPlanning is used for simple JSON responses);
            # debug logs are already written by callAiPlanning
            checkWorkflowStopped(self.services)
            aiResponse = await self.aiService.callAiPlanning(
                prompt=intentPrompt,
                debugType="document_intent_analysis"
            )

            # Parse the result and map intents back to JSON document IDs where needed
            intentsData = json.loads(self.services.utils.jsonExtractString(aiResponse))
            documentIntents = []
            for intent in intentsData.get("intents", []):
                docId = intent.get("documentId")
                # If the intent refers to the original document, map back to the JSON document ID
                if docId in documentMapping:
                    intent["documentId"] = documentMapping[docId]
                documentIntents.append(DocumentIntent(**intent))

            # Debug log (harmonized)
            self.services.utils.writeDebugFile(
                json.dumps([intent.dict() for intent in documentIntents], indent=2),
                "document_intent_analysis_result"
            )

            # State 1 Validation: Validate and auto-fix document intents
            documentIds = {d.id for d in documents}
            validatedIntents = []
            for intent in documentIntents:
                # Validation 1.2: Skip intents for unknown documents
                if intent.documentId not in documentIds:
                    # Try to find a similar UUID (fix AI hallucination/typo)
                    correctedDocId = self._findSimilarDocumentId(intent.documentId, documentIds)
                    if correctedDocId:
                        logger.warning(f"Corrected UUID typo in AI response: {intent.documentId} -> {correctedDocId}")
                        intent.documentId = correctedDocId
                    else:
                        logger.warning(f"Skipping intent for unknown document: {intent.documentId}")
                        continue
                validatedIntents.append(intent)

            # Validation 1.1: Documents without intents are OK (not needed)
            # Intents for non-existing documents are already filtered above
            documentIntents = validatedIntents

            # Finish ChatLog
            self.services.chat.progressLogFinish(intentOperationId, True)
            return documentIntents
        except Exception as e:
            self.services.chat.progressLogFinish(intentOperationId, False)
            logger.error(f"Error in clarifyDocumentIntents: {str(e)}")
            raise

    def resolvePreExtractedDocument(self, document: ChatDocument) -> Optional[Dict[str, Any]]:
        """
        Check whether a JSON document already contains extracted ContentParts.

        Returns a dict with:
        - originalDocument: ChatDocument info of the original document
        - contentExtracted: ContentExtracted object with parts
        - parts: list of ContentParts

        Returns None when no pre-extracted format is detected.
        """
        if document.mimeType != "application/json":
            logger.debug(f"Document {document.id} is not JSON (mimeType={document.mimeType}), skipping pre-extracted check")
            return None
        try:
            docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
            if not docBytes:
                return None
            docData = docBytes.decode('utf-8')
            jsonData = json.loads(docData)
            if not isinstance(jsonData, dict):
                return None

            # Check for ContentExtracted format.
            # Only format 1 (ActionDocument format with validationMetadata) is supported.
            documentData = None
            validationMetadata = jsonData.get("validationMetadata", {})
            actionType = validationMetadata.get("actionType")
            logger.debug(f"JSON document {document.id}: validationMetadata.actionType={actionType}, keys={list(jsonData.keys())}")
            if actionType == "context.extractContent":
                # Format: {"validationMetadata": {"actionType": "context.extractContent"}, "documentData": {...}}
                documentData = jsonData.get("documentData")
                logger.debug(f"Found ContentExtracted via validationMetadata for {document.fileName}, documentData keys: {list(documentData.keys()) if documentData else None}")
            else:
                logger.debug(f"JSON document {document.id} does not have actionType='context.extractContent' (got: {actionType})")

            if documentData:
                from modules.datamodels.datamodelExtraction import ContentExtracted
                try:
                    # Make sure "id" is present before model construction
                    if "id" not in documentData:
                        documentData["id"] = document.id
                    contentExtracted = ContentExtracted(**documentData)
                    if contentExtracted.parts:
                        # Extract the original document info from the parts' metadata
                        originalDocId = None
                        originalFileName = None
                        originalMimeType = None
                        for part in contentExtracted.parts:
                            if part.metadata:
                                # Try to find the original document info (first hit wins)
                                if not originalDocId and part.metadata.get("documentId"):
                                    originalDocId = part.metadata.get("documentId")
                                if not originalFileName and part.metadata.get("originalFileName"):
                                    originalFileName = part.metadata.get("originalFileName")
                                if not originalMimeType and part.metadata.get("documentMimeType"):
                                    originalMimeType = part.metadata.get("documentMimeType")
                        # Fallback: derive the original file name from the JSON file name
                        if not originalFileName:
                            # e.g. "B2025-02c_28_extracted_...json" -> "B2025-02c_28.pdf"
                            if document.fileName and "_extracted_" in document.fileName:
                                originalFileName = document.fileName.split("_extracted_")[0] + ".pdf"
                        return {
                            "originalDocument": {
                                "id": originalDocId or document.id,
                                "fileName": originalFileName or document.fileName,
                                "mimeType": originalMimeType or "application/pdf",
                                "fileSize": document.fileSize
                            },
                            "contentExtracted": contentExtracted,
                            "parts": contentExtracted.parts
                        }
                except Exception as parseError:
                    logger.warning(f"Could not parse ContentExtracted format from {document.fileName}: {str(parseError)}")
                    logger.debug(f"JSON keys: {list(jsonData.keys())}, has parts: {'parts' in jsonData}")
                    import traceback
                    logger.debug(f"Parse error traceback: {traceback.format_exc()}")
                return None
            else:
                logger.debug(f"JSON document {document.id} has no documentData (actionType={actionType})")
                return None
        except Exception as e:
            logger.debug(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
            return None

    def _buildIntentAnalysisPrompt(
        self,
        userPrompt: str,
        documents: List[ChatDocument],
        actionParameters: Dict[str, Any]
    ) -> str:
        """Build the prompt for intent analysis."""
        # Build document list - show original documents for pre-extracted JSONs
        docListText = ""
        for i, doc in enumerate(documents, 1):
            # Check whether this is a pre-extracted JSON
            preExtracted = self.resolvePreExtractedDocument(doc)
            if preExtracted:
                # Show the original document instead of the JSON
                originalDoc = preExtracted["originalDocument"]
                partsInfo = f" (contains {len(preExtracted['parts'])} pre-extracted parts: {', '.join([p.typeGroup for p in preExtracted['parts'] if p.data and len(str(p.data)) > 0])})"
                docListText += f"\n{i}. Document ID: {originalDoc['id']}\n"
                docListText += f" File Name: {originalDoc['fileName']}{partsInfo}\n"
                docListText += f" MIME Type: {originalDoc['mimeType']}\n"
                docListText += f" File Size: {originalDoc.get('fileSize', doc.fileSize)} bytes\n"
            else:
                # Regular document
                docListText += f"\n{i}. Document ID: {doc.id}\n"
                docListText += f" File Name: {doc.fileName}\n"
                docListText += f" MIME Type: {doc.mimeType}\n"
                docListText += f" File Size: {doc.fileSize} bytes\n"

        outputFormat = actionParameters.get("outputFormat", "txt")

        # FENCE user input to prevent prompt injection
        fencedUserPrompt = f"""```user_request
{userPrompt}
```"""

        prompt = f"""USER REQUEST:
{fencedUserPrompt}

DOCUMENTS TO ANALYZE:
{docListText}

TASK: For each document, determine its intents (can be multiple):
- "extract": Content extraction needed (text, structure, OCR, etc.)
- "render": Image/binary should be rendered as-is (visual element)
- "reference": Document reference/attachment (no extraction, just reference)

TASK: For each document, determine:
1. Intents (can be multiple): "extract", "render", "reference"

Note: Output format and language are NOT determined here - they will be determined during structure generation (Phase 3) in the chapter structure JSON

OUTPUT FORMAT: {outputFormat} (global fallback - for reference only)

RETURN JSON:
{{
  "intents": [
    {{
      "documentId": "doc_1",
      "intents": ["extract"],
      "extractionPrompt": "Extract all text content, preserving structure",
      "reasoning": "User needs text content for document generation"
    }},
    {{
      "documentId": "doc_2",
      "intents": ["extract", "render"],
      "extractionPrompt": "Extract text content from image using vision AI",
      "reasoning": "Image contains text that needs extraction, but also should be rendered visually"
    }},
    {{
      "documentId": "doc_3",
      "intents": ["reference"],
      "extractionPrompt": null,
      "reasoning": "Document is only used as reference, no extraction needed"
    }}
  ]
}}

CRITICAL RULES:
1. For images (mimeType starts with "image/"):
   - If user wants to "include" or "show" images → add "render"
   - If user wants to "analyze", "read text", or "extract text" from images → add "extract"
   - Can have BOTH "extract" and "render" if image needs both text extraction and visual rendering
2. For text documents:
   - If user mentions "template" or "structure" → "reference" or "extract" based on context
   - If user mentions "reference" or "context" → "reference"
   - Default → "extract"
3. Consider output format:
   - For formats like PDF, DOCX, PPTX: images usually need "render"
   - For formats like CSV, JSON: usually "extract" only
   - For HTML: can have both "extract" and "render"

Return ONLY valid JSON following the structure above.
"""
        return prompt

    def _findSimilarDocumentId(self, incorrectId: str, validIds: set) -> Optional[str]:
        """
        Try to find a similar document ID in case the AI altered the UUID.

        Detects UUID typos (e.g. 4451 -> 4551) by counting positional
        character differences against every valid UUID.

        Args:
            incorrectId: The wrong UUID from the AI response
            validIds: Set of valid document IDs

        Returns:
            Corrected UUID if found, otherwise None
        """
        if not incorrectId or len(incorrectId) != 36:  # UUID format: 8-4-4-4-12
            return None
        # Make sure it looks like a UUID (format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx)
        if incorrectId.count('-') != 4:
            return None

        # Levenshtein-like search: accept a candidate only when at most 1-2
        # characters differ. Iterate in sorted order and keep the closest
        # candidate so the result is deterministic (set iteration order is not).
        bestMatch = None
        bestDifferences = 3  # threshold: only 1-2 differing chars count as a typo
        for validId in validIds if isinstance(validIds, list) else sorted(validIds):
            if len(validId) != 36:
                continue
            # Count differing characters position by position
            differences = sum(c1 != c2 for c1, c2 in zip(incorrectId, validId))
            if differences < bestDifferences:
                # Require the hyphens to be at the SAME positions in both UUIDs
                # (a plain hyphen-count comparison cannot guarantee this)
                if all((c1 == '-') == (c2 == '-') for c1, c2 in zip(incorrectId, validId)):
                    bestMatch = validId
                    bestDifferences = differences
        return bestMatch