370 lines
17 KiB
Python
370 lines
17 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""
|
|
Document Intent Analysis Module
|
|
|
|
Handles analysis of document intents, including:
|
|
- Clarifying which documents need extraction vs reference
|
|
- Resolving pre-extracted documents
|
|
- Building intent analysis prompts
|
|
"""
|
|
import json
|
|
import logging
|
|
from typing import Dict, Any, List, Optional
|
|
|
|
from modules.aichat.datamodelFeatureAiChat import ChatDocument
|
|
from modules.datamodels.datamodelExtraction import DocumentIntent
|
|
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class DocumentIntentAnalyzer:
    """Handles document intent analysis and resolution.

    Given a set of chat documents and a user request, this class asks the AI
    which documents need content extraction, visual rendering, or are mere
    references, and resolves pre-extracted JSON artifacts back to their
    original source documents.
    """

    def __init__(self, services, aiService):
        """Initialize DocumentIntentAnalyzer with service center and AI service access."""
        # Service center: used for chat progress logging (services.chat),
        # JSON/debug utilities (services.utils) and file access
        # (services.interfaceDbComponent).
        self.services = services
        # AI service providing callAiPlanning() for simple JSON-returning calls.
        self.aiService = aiService
|
|
async def clarifyDocumentIntents(
|
|
self,
|
|
documents: List[ChatDocument],
|
|
userPrompt: str,
|
|
actionParameters: Dict[str, Any],
|
|
parentOperationId: str
|
|
) -> List[DocumentIntent]:
|
|
"""
|
|
Phase 5A: Analysiert, welche Dokumente Extraktion vs Referenz benötigen.
|
|
Gibt DocumentIntent für jedes Dokument zurück.
|
|
|
|
Args:
|
|
documents: Liste der zu verarbeitenden Dokumente
|
|
userPrompt: User-Anfrage
|
|
actionParameters: Action-spezifische Parameter (z.B. resultType, outputFormat)
|
|
parentOperationId: Parent Operation-ID für ChatLog-Hierarchie
|
|
|
|
Returns:
|
|
Liste von DocumentIntent-Objekten
|
|
"""
|
|
# Erstelle Operation-ID für Intent-Analyse
|
|
intentOperationId = f"{parentOperationId}_intent_analysis"
|
|
|
|
# Starte ChatLog mit Parent-Referenz
|
|
self.services.chat.progressLogStart(
|
|
intentOperationId,
|
|
"Document Intent Analysis",
|
|
"Intent Analysis",
|
|
f"Analyzing {len(documents)} documents",
|
|
parentOperationId=parentOperationId
|
|
)
|
|
|
|
try:
|
|
# Mappe pre-extracted JSONs zu ursprünglichen Dokument-IDs für Intent-Analyse
|
|
documentMapping = {} # Maps original doc ID -> JSON doc ID
|
|
resolvedDocuments = []
|
|
|
|
for doc in documents:
|
|
preExtracted = self.resolvePreExtractedDocument(doc)
|
|
if preExtracted:
|
|
originalDocId = preExtracted["originalDocument"]["id"]
|
|
documentMapping[originalDocId] = doc.id
|
|
# Erstelle temporäres ChatDocument für ursprüngliches Dokument
|
|
originalDoc = ChatDocument(
|
|
id=originalDocId,
|
|
fileName=preExtracted["originalDocument"]["fileName"],
|
|
mimeType=preExtracted["originalDocument"]["mimeType"],
|
|
fileSize=preExtracted["originalDocument"].get("fileSize", doc.fileSize),
|
|
fileId=doc.fileId, # Behalte fileId vom JSON
|
|
messageId=doc.messageId if hasattr(doc, 'messageId') else None # Behalte messageId falls vorhanden
|
|
)
|
|
resolvedDocuments.append(originalDoc)
|
|
else:
|
|
resolvedDocuments.append(doc)
|
|
|
|
# Baue Intent-Analyse-Prompt mit ursprünglichen Dokumenten
|
|
intentPrompt = self._buildIntentAnalysisPrompt(userPrompt, resolvedDocuments, actionParameters)
|
|
|
|
# AI-Call (verwende callAiPlanning für einfache JSON-Responses)
|
|
# Debug-Logs werden bereits von callAiPlanning geschrieben
|
|
checkWorkflowStopped(self.services)
|
|
aiResponse = await self.aiService.callAiPlanning(
|
|
prompt=intentPrompt,
|
|
debugType="document_intent_analysis"
|
|
)
|
|
|
|
# Parse Result und mappe zurück zu JSON-Dokument-IDs falls nötig
|
|
intentsData = json.loads(self.services.utils.jsonExtractString(aiResponse))
|
|
documentIntents = []
|
|
for intent in intentsData.get("intents", []):
|
|
docId = intent.get("documentId")
|
|
# Wenn Intent für ursprüngliches Dokument, mappe zurück zu JSON-Dokument-ID
|
|
if docId in documentMapping:
|
|
intent["documentId"] = documentMapping[docId]
|
|
documentIntents.append(DocumentIntent(**intent))
|
|
|
|
# Debug-Log (harmonisiert)
|
|
self.services.utils.writeDebugFile(
|
|
json.dumps([intent.dict() for intent in documentIntents], indent=2),
|
|
"document_intent_analysis_result"
|
|
)
|
|
|
|
# State 1 Validation: Validate and auto-fix document intents
|
|
documentIds = {d.id for d in documents}
|
|
validatedIntents = []
|
|
|
|
for intent in documentIntents:
|
|
# Validation 1.2: Skip intents for unknown documents
|
|
if intent.documentId not in documentIds:
|
|
# Try to find similar UUID (fix AI hallucination/typo)
|
|
correctedDocId = self._findSimilarDocumentId(intent.documentId, documentIds)
|
|
if correctedDocId:
|
|
logger.warning(f"Corrected UUID typo in AI response: {intent.documentId} -> {correctedDocId}")
|
|
intent.documentId = correctedDocId
|
|
else:
|
|
logger.warning(f"Skipping intent for unknown document: {intent.documentId}")
|
|
continue
|
|
validatedIntents.append(intent)
|
|
|
|
# Validation 1.1: Documents without intents are OK (not needed)
|
|
# Intents for non-existing documents are already filtered above
|
|
documentIntents = validatedIntents
|
|
|
|
# ChatLog abschließen
|
|
self.services.chat.progressLogFinish(intentOperationId, True)
|
|
|
|
return documentIntents
|
|
|
|
except Exception as e:
|
|
self.services.chat.progressLogFinish(intentOperationId, False)
|
|
logger.error(f"Error in clarifyDocumentIntents: {str(e)}")
|
|
raise
|
|
|
|
def resolvePreExtractedDocument(self, document: ChatDocument) -> Optional[Dict[str, Any]]:
|
|
"""
|
|
Prüft ob ein JSON-Dokument bereits extrahierte ContentParts enthält.
|
|
Gibt Dict zurück mit:
|
|
- originalDocument: ChatDocument-Info des ursprünglichen Dokuments
|
|
- contentExtracted: ContentExtracted-Objekt mit Parts
|
|
- parts: Liste der ContentParts
|
|
|
|
Returns None wenn kein pre-extracted Format erkannt wird.
|
|
"""
|
|
if document.mimeType != "application/json":
|
|
logger.debug(f"Document {document.id} is not JSON (mimeType={document.mimeType}), skipping pre-extracted check")
|
|
return None
|
|
|
|
try:
|
|
docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
|
|
if not docBytes:
|
|
return None
|
|
|
|
docData = docBytes.decode('utf-8')
|
|
jsonData = json.loads(docData)
|
|
|
|
if not isinstance(jsonData, dict):
|
|
return None
|
|
|
|
# Check for ContentExtracted format
|
|
# Nur Format 1 (ActionDocument-Format mit validationMetadata) wird unterstützt
|
|
documentData = None
|
|
|
|
validationMetadata = jsonData.get("validationMetadata", {})
|
|
actionType = validationMetadata.get("actionType")
|
|
logger.debug(f"JSON document {document.id}: validationMetadata.actionType={actionType}, keys={list(jsonData.keys())}")
|
|
|
|
if actionType == "context.extractContent":
|
|
# Format: {"validationMetadata": {"actionType": "context.extractContent"}, "documentData": {...}}
|
|
documentData = jsonData.get("documentData")
|
|
logger.debug(f"Found ContentExtracted via validationMetadata for {document.fileName}, documentData keys: {list(documentData.keys()) if documentData else None}")
|
|
else:
|
|
logger.debug(f"JSON document {document.id} does not have actionType='context.extractContent' (got: {actionType})")
|
|
|
|
if documentData:
|
|
from modules.datamodels.datamodelExtraction import ContentExtracted
|
|
|
|
try:
|
|
# Stelle sicher, dass "id" vorhanden ist
|
|
if "id" not in documentData:
|
|
documentData["id"] = document.id
|
|
|
|
contentExtracted = ContentExtracted(**documentData)
|
|
|
|
if contentExtracted.parts:
|
|
# Extrahiere ursprüngliche Dokument-Info aus den Parts
|
|
originalDocId = None
|
|
originalFileName = None
|
|
originalMimeType = None
|
|
|
|
for part in contentExtracted.parts:
|
|
if part.metadata:
|
|
# Versuche ursprüngliche Dokument-Info zu finden
|
|
if not originalDocId and part.metadata.get("documentId"):
|
|
originalDocId = part.metadata.get("documentId")
|
|
if not originalFileName and part.metadata.get("originalFileName"):
|
|
originalFileName = part.metadata.get("originalFileName")
|
|
if not originalMimeType and part.metadata.get("documentMimeType"):
|
|
originalMimeType = part.metadata.get("documentMimeType")
|
|
|
|
# Falls nicht gefunden, versuche aus documentName zu extrahieren
|
|
if not originalFileName:
|
|
# Versuche aus documentName zu extrahieren (z.B. "B2025-02c_28_extracted_...json" -> "B2025-02c_28.pdf")
|
|
if document.fileName and "_extracted_" in document.fileName:
|
|
originalFileName = document.fileName.split("_extracted_")[0] + ".pdf"
|
|
|
|
return {
|
|
"originalDocument": {
|
|
"id": originalDocId or document.id,
|
|
"fileName": originalFileName or document.fileName,
|
|
"mimeType": originalMimeType or "application/pdf",
|
|
"fileSize": document.fileSize
|
|
},
|
|
"contentExtracted": contentExtracted,
|
|
"parts": contentExtracted.parts
|
|
}
|
|
except Exception as parseError:
|
|
logger.warning(f"Could not parse ContentExtracted format from {document.fileName}: {str(parseError)}")
|
|
logger.debug(f"JSON keys: {list(jsonData.keys())}, has parts: {'parts' in jsonData}")
|
|
import traceback
|
|
logger.debug(f"Parse error traceback: {traceback.format_exc()}")
|
|
return None
|
|
else:
|
|
logger.debug(f"JSON document {document.id} has no documentData (actionType={actionType})")
|
|
|
|
return None
|
|
except Exception as e:
|
|
logger.debug(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
|
|
return None
|
|
|
|
    def _buildIntentAnalysisPrompt(
        self,
        userPrompt: str,
        documents: List[ChatDocument],
        actionParameters: Dict[str, Any]
    ) -> str:
        """Build the prompt for intent analysis.

        Lists each document (showing the original document for pre-extracted
        JSON artifacts), fences the user request against prompt injection and
        appends the task description plus the expected JSON response schema.
        """
        # Build document list - show original documents for pre-extracted JSONs.
        # NOTE(review): resolvePreExtractedDocument is called again here for
        # every document (it re-reads and re-parses the file), even though the
        # caller may already have resolved them - confirm whether caching is worthwhile.
        docListText = ""
        for i, doc in enumerate(documents, 1):
            # Check whether this is a pre-extracted JSON
            preExtracted = self.resolvePreExtractedDocument(doc)

            if preExtracted:
                # Show the original document instead of the JSON artifact
                originalDoc = preExtracted["originalDocument"]
                partsInfo = f" (contains {len(preExtracted['parts'])} pre-extracted parts: {', '.join([p.typeGroup for p in preExtracted['parts'] if p.data and len(str(p.data)) > 0])})"
                docListText += f"\n{i}. Document ID: {originalDoc['id']}\n"
                docListText += f" File Name: {originalDoc['fileName']}{partsInfo}\n"
                docListText += f" MIME Type: {originalDoc['mimeType']}\n"
                docListText += f" File Size: {originalDoc.get('fileSize', doc.fileSize)} bytes\n"
            else:
                # Regular document
                docListText += f"\n{i}. Document ID: {doc.id}\n"
                docListText += f" File Name: {doc.fileName}\n"
                docListText += f" MIME Type: {doc.mimeType}\n"
                docListText += f" File Size: {doc.fileSize} bytes\n"

        # Global fallback output format; the real format is decided later (Phase 3)
        outputFormat = actionParameters.get("outputFormat", "txt")

        # FENCE user input to prevent prompt injection
        fencedUserPrompt = f"""```user_request
{userPrompt}
```"""

        # NOTE(review): the prompt below contains two overlapping "TASK:"
        # sections; they look like a leftover from an edit and could probably
        # be merged - confirm before changing the prompt text.
        prompt = f"""USER REQUEST:
{fencedUserPrompt}

DOCUMENTS TO ANALYZE:
{docListText}

TASK: For each document, determine its intents (can be multiple):
- "extract": Content extraction needed (text, structure, OCR, etc.)
- "render": Image/binary should be rendered as-is (visual element)
- "reference": Document reference/attachment (no extraction, just reference)

TASK: For each document, determine:
1. Intents (can be multiple): "extract", "render", "reference"
Note: Output format and language are NOT determined here - they will be
determined during structure generation (Phase 3) in the chapter structure JSON

OUTPUT FORMAT: {outputFormat} (global fallback - for reference only)

RETURN JSON:
{{
"intents": [
{{
"documentId": "doc_1",
"intents": ["extract"],
"extractionPrompt": "Extract all text content, preserving structure",
"reasoning": "User needs text content for document generation"
}},
{{
"documentId": "doc_2",
"intents": ["extract", "render"],
"extractionPrompt": "Extract text content from image using vision AI",
"reasoning": "Image contains text that needs extraction, but also should be rendered visually"
}},
{{
"documentId": "doc_3",
"intents": ["reference"],
"extractionPrompt": null,
"reasoning": "Document is only used as reference, no extraction needed"
}}
]
}}

CRITICAL RULES:
1. For images (mimeType starts with "image/"):
- If user wants to "include" or "show" images → add "render"
- If user wants to "analyze", "read text", or "extract text" from images → add "extract"
- Can have BOTH "extract" and "render" if image needs both text extraction and visual rendering

2. For text documents:
- If user mentions "template" or "structure" → "reference" or "extract" based on context
- If user mentions "reference" or "context" → "reference"
- Default → "extract"

3. Consider output format:
- For formats like PDF, DOCX, PPTX: images usually need "render"
- For formats like CSV, JSON: usually "extract" only
- For HTML: can have both "extract" and "render"

Return ONLY valid JSON following the structure above.
"""
        return prompt
|
|
def _findSimilarDocumentId(self, incorrectId: str, validIds: set) -> Optional[str]:
|
|
"""
|
|
Versucht eine ähnliche Dokument-ID zu finden, falls die AI die UUID geändert hat.
|
|
Prüft auf UUID-Typo (z.B. 4451 -> 4551).
|
|
|
|
Args:
|
|
incorrectId: Die falsche UUID aus der AI-Response
|
|
validIds: Set von gültigen Dokument-IDs
|
|
|
|
Returns:
|
|
Korrigierte UUID falls gefunden, sonst None
|
|
"""
|
|
if not incorrectId or len(incorrectId) != 36: # UUID Format: 8-4-4-4-12
|
|
return None
|
|
|
|
# Prüfe ob es eine UUID ist (Format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx)
|
|
if incorrectId.count('-') != 4:
|
|
return None
|
|
|
|
# Versuche Levenshtein-ähnliche Suche: Prüfe ob nur 1-2 Zeichen unterschiedlich sind
|
|
for validId in validIds:
|
|
if len(validId) != 36:
|
|
continue
|
|
|
|
# Zähle unterschiedliche Zeichen
|
|
differences = sum(c1 != c2 for c1, c2 in zip(incorrectId, validId))
|
|
|
|
# Wenn nur 1-2 Zeichen unterschiedlich sind, ist es wahrscheinlich ein Typo
|
|
if differences <= 2:
|
|
# Prüfe ob die Struktur ähnlich ist (gleiche Positionen der Bindestriche)
|
|
if incorrectId.count('-') == validId.count('-'):
|
|
return validId
|
|
|
|
return None
|
|
|