gateway/modules/serviceCenter/services/serviceAi/subDocumentIntents.py
2026-03-06 14:03:18 +01:00

369 lines
17 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Document Intent Analysis Module
Handles analysis of document intents, including:
- Clarifying which documents need extraction vs reference
- Resolving pre-extracted documents
- Building intent analysis prompts
"""
import json
import logging
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelExtraction import DocumentIntent
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
logger = logging.getLogger(__name__)
class DocumentIntentAnalyzer:
"""Handles document intent analysis and resolution."""
def __init__(self, services, aiService):
"""Initialize DocumentIntentAnalyzer with service center and AI service access."""
self.services = services
self.aiService = aiService
async def clarifyDocumentIntents(
self,
documents: List[ChatDocument],
userPrompt: str,
actionParameters: Dict[str, Any],
parentOperationId: str
) -> List[DocumentIntent]:
"""
Phase 5A: Analysiert, welche Dokumente Extraktion vs Referenz benötigen.
Gibt DocumentIntent für jedes Dokument zurück.
Args:
documents: Liste der zu verarbeitenden Dokumente
userPrompt: User-Anfrage
actionParameters: Action-spezifische Parameter (z.B. resultType, outputFormat)
parentOperationId: Parent Operation-ID für ChatLog-Hierarchie
Returns:
Liste von DocumentIntent-Objekten
"""
# Erstelle Operation-ID für Intent-Analyse
intentOperationId = f"{parentOperationId}_intent_analysis"
# Starte ChatLog mit Parent-Referenz
self.services.chat.progressLogStart(
intentOperationId,
"Document Intent Analysis",
"Intent Analysis",
f"Analyzing {len(documents)} documents",
parentOperationId=parentOperationId
)
try:
# Mappe pre-extracted JSONs zu ursprünglichen Dokument-IDs für Intent-Analyse
documentMapping = {} # Maps original doc ID -> JSON doc ID
resolvedDocuments = []
for doc in documents:
preExtracted = self.resolvePreExtractedDocument(doc)
if preExtracted:
originalDocId = preExtracted["originalDocument"]["id"]
documentMapping[originalDocId] = doc.id
# Erstelle temporäres ChatDocument für ursprüngliches Dokument
originalDoc = ChatDocument(
id=originalDocId,
fileName=preExtracted["originalDocument"]["fileName"],
mimeType=preExtracted["originalDocument"]["mimeType"],
fileSize=preExtracted["originalDocument"].get("fileSize", doc.fileSize),
fileId=doc.fileId, # Behalte fileId vom JSON
messageId=doc.messageId if hasattr(doc, 'messageId') else None # Behalte messageId falls vorhanden
)
resolvedDocuments.append(originalDoc)
else:
resolvedDocuments.append(doc)
# Baue Intent-Analyse-Prompt mit ursprünglichen Dokumenten
intentPrompt = self._buildIntentAnalysisPrompt(userPrompt, resolvedDocuments, actionParameters)
# AI-Call (verwende callAiPlanning für einfache JSON-Responses)
# Debug-Logs werden bereits von callAiPlanning geschrieben
checkWorkflowStopped(self.services)
aiResponse = await self.aiService.callAiPlanning(
prompt=intentPrompt,
debugType="document_intent_analysis"
)
# Parse Result und mappe zurück zu JSON-Dokument-IDs falls nötig
intentsData = json.loads(self.services.utils.jsonExtractString(aiResponse))
documentIntents = []
for intent in intentsData.get("intents", []):
docId = intent.get("documentId")
# Wenn Intent für ursprüngliches Dokument, mappe zurück zu JSON-Dokument-ID
if docId in documentMapping:
intent["documentId"] = documentMapping[docId]
documentIntents.append(DocumentIntent(**intent))
# Debug-Log (harmonisiert)
self.services.utils.writeDebugFile(
json.dumps([intent.dict() for intent in documentIntents], indent=2),
"document_intent_analysis_result"
)
# State 1 Validation: Validate and auto-fix document intents
documentIds = {d.id for d in documents}
validatedIntents = []
for intent in documentIntents:
# Validation 1.2: Skip intents for unknown documents
if intent.documentId not in documentIds:
# Try to find similar UUID (fix AI hallucination/typo)
correctedDocId = self._findSimilarDocumentId(intent.documentId, documentIds)
if correctedDocId:
logger.warning(f"Corrected UUID typo in AI response: {intent.documentId} -> {correctedDocId}")
intent.documentId = correctedDocId
else:
logger.warning(f"Skipping intent for unknown document: {intent.documentId}")
continue
validatedIntents.append(intent)
# Validation 1.1: Documents without intents are OK (not needed)
# Intents for non-existing documents are already filtered above
documentIntents = validatedIntents
# ChatLog abschließen
self.services.chat.progressLogFinish(intentOperationId, True)
return documentIntents
except Exception as e:
self.services.chat.progressLogFinish(intentOperationId, False)
logger.error(f"Error in clarifyDocumentIntents: {str(e)}")
raise
def resolvePreExtractedDocument(self, document: ChatDocument) -> Optional[Dict[str, Any]]:
"""
Prüft ob ein JSON-Dokument bereits extrahierte ContentParts enthält.
Gibt Dict zurück mit:
- originalDocument: ChatDocument-Info des ursprünglichen Dokuments
- contentExtracted: ContentExtracted-Objekt mit Parts
- parts: Liste der ContentParts
Returns None wenn kein pre-extracted Format erkannt wird.
"""
if document.mimeType != "application/json":
logger.debug(f"Document {document.id} is not JSON (mimeType={document.mimeType}), skipping pre-extracted check")
return None
try:
docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
if not docBytes:
return None
docData = docBytes.decode('utf-8')
jsonData = json.loads(docData)
if not isinstance(jsonData, dict):
return None
# Check for ContentExtracted format
# Nur Format 1 (ActionDocument-Format mit validationMetadata) wird unterstützt
documentData = None
validationMetadata = jsonData.get("validationMetadata", {})
actionType = validationMetadata.get("actionType")
logger.debug(f"JSON document {document.id}: validationMetadata.actionType={actionType}, keys={list(jsonData.keys())}")
if actionType == "context.extractContent":
# Format: {"validationMetadata": {"actionType": "context.extractContent"}, "documentData": {...}}
documentData = jsonData.get("documentData")
logger.debug(f"Found ContentExtracted via validationMetadata for {document.fileName}, documentData keys: {list(documentData.keys()) if documentData else None}")
else:
logger.debug(f"JSON document {document.id} does not have actionType='context.extractContent' (got: {actionType})")
if documentData:
try:
# Stelle sicher, dass "id" vorhanden ist
if "id" not in documentData:
documentData["id"] = document.id
contentExtracted = ContentExtracted(**documentData)
if contentExtracted.parts:
# Extrahiere ursprüngliche Dokument-Info aus den Parts
originalDocId = None
originalFileName = None
originalMimeType = None
for part in contentExtracted.parts:
if part.metadata:
# Versuche ursprüngliche Dokument-Info zu finden
if not originalDocId and part.metadata.get("documentId"):
originalDocId = part.metadata.get("documentId")
if not originalFileName and part.metadata.get("originalFileName"):
originalFileName = part.metadata.get("originalFileName")
if not originalMimeType and part.metadata.get("documentMimeType"):
originalMimeType = part.metadata.get("documentMimeType")
# Falls nicht gefunden, versuche aus documentName zu extrahieren
if not originalFileName:
# Versuche aus documentName zu extrahieren (z.B. "B2025-02c_28_extracted_...json" -> "B2025-02c_28.pdf")
if document.fileName and "_extracted_" in document.fileName:
originalFileName = document.fileName.split("_extracted_")[0] + ".pdf"
return {
"originalDocument": {
"id": originalDocId or document.id,
"fileName": originalFileName or document.fileName,
"mimeType": originalMimeType or "application/pdf",
"fileSize": document.fileSize
},
"contentExtracted": contentExtracted,
"parts": contentExtracted.parts
}
except Exception as parseError:
logger.warning(f"Could not parse ContentExtracted format from {document.fileName}: {str(parseError)}")
logger.debug(f"JSON keys: {list(jsonData.keys())}, has parts: {'parts' in jsonData}")
import traceback
logger.debug(f"Parse error traceback: {traceback.format_exc()}")
return None
else:
logger.debug(f"JSON document {document.id} has no documentData (actionType={actionType})")
return None
except Exception as e:
logger.debug(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
return None
def _buildIntentAnalysisPrompt(
self,
userPrompt: str,
documents: List[ChatDocument],
actionParameters: Dict[str, Any]
) -> str:
"""Baue Prompt für Intent-Analyse."""
# Baue Dokument-Liste - zeige ursprüngliche Dokumente für pre-extracted JSONs
docListText = ""
for i, doc in enumerate(documents, 1):
# Prüfe ob es ein pre-extracted JSON ist
preExtracted = self.resolvePreExtractedDocument(doc)
if preExtracted:
# Zeige ursprüngliches Dokument statt JSON
originalDoc = preExtracted["originalDocument"]
partsInfo = f" (contains {len(preExtracted['parts'])} pre-extracted parts: {', '.join([p.typeGroup for p in preExtracted['parts'] if p.data and len(str(p.data)) > 0])})"
docListText += f"\n{i}. Document ID: {originalDoc['id']}\n"
docListText += f" File Name: {originalDoc['fileName']}{partsInfo}\n"
docListText += f" MIME Type: {originalDoc['mimeType']}\n"
docListText += f" File Size: {originalDoc.get('fileSize', doc.fileSize)} bytes\n"
else:
# Normales Dokument
docListText += f"\n{i}. Document ID: {doc.id}\n"
docListText += f" File Name: {doc.fileName}\n"
docListText += f" MIME Type: {doc.mimeType}\n"
docListText += f" File Size: {doc.fileSize} bytes\n"
outputFormat = actionParameters.get("outputFormat", "txt")
# FENCE user input to prevent prompt injection
fencedUserPrompt = f"""```user_request
{userPrompt}
```"""
prompt = f"""USER REQUEST:
{fencedUserPrompt}
DOCUMENTS TO ANALYZE:
{docListText}
TASK: For each document, determine its intents (can be multiple):
- "extract": Content extraction needed (text, structure, OCR, etc.)
- "render": Image/binary should be rendered as-is (visual element)
- "reference": Document reference/attachment (no extraction, just reference)
TASK: For each document, determine:
1. Intents (can be multiple): "extract", "render", "reference"
Note: Output format and language are NOT determined here - they will be
determined during structure generation (Phase 3) in the chapter structure JSON
OUTPUT FORMAT: {outputFormat} (global fallback - for reference only)
RETURN JSON:
{{
"intents": [
{{
"documentId": "doc_1",
"intents": ["extract"],
"extractionPrompt": "Extract all text content, preserving structure",
"reasoning": "User needs text content for document generation"
}},
{{
"documentId": "doc_2",
"intents": ["extract", "render"],
"extractionPrompt": "Extract text content from image using vision AI",
"reasoning": "Image contains text that needs extraction, but also should be rendered visually"
}},
{{
"documentId": "doc_3",
"intents": ["reference"],
"extractionPrompt": null,
"reasoning": "Document is only used as reference, no extraction needed"
}}
]
}}
CRITICAL RULES:
1. For images (mimeType starts with "image/"):
- If user wants to "include" or "show" images → add "render"
- If user wants to "analyze", "read text", or "extract text" from images → add "extract"
- Can have BOTH "extract" and "render" if image needs both text extraction and visual rendering
2. For text documents:
- If user mentions "template" or "structure""reference" or "extract" based on context
- If user mentions "reference" or "context""reference"
- Default → "extract"
3. Consider output format:
- For formats like PDF, DOCX, PPTX: images usually need "render"
- For formats like CSV, JSON: usually "extract" only
- For HTML: can have both "extract" and "render"
Return ONLY valid JSON following the structure above.
"""
return prompt
def _findSimilarDocumentId(self, incorrectId: str, validIds: set) -> Optional[str]:
"""
Versucht eine ähnliche Dokument-ID zu finden, falls die AI die UUID geändert hat.
Prüft auf UUID-Typo (z.B. 4451 -> 4551).
Args:
incorrectId: Die falsche UUID aus der AI-Response
validIds: Set von gültigen Dokument-IDs
Returns:
Korrigierte UUID falls gefunden, sonst None
"""
if not incorrectId or len(incorrectId) != 36: # UUID Format: 8-4-4-4-12
return None
# Prüfe ob es eine UUID ist (Format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx)
if incorrectId.count('-') != 4:
return None
# Versuche Levenshtein-ähnliche Suche: Prüfe ob nur 1-2 Zeichen unterschiedlich sind
for validId in validIds:
if len(validId) != 36:
continue
# Zähle unterschiedliche Zeichen
differences = sum(c1 != c2 for c1, c2 in zip(incorrectId, validId))
# Wenn nur 1-2 Zeichen unterschiedlich sind, ist es wahrscheinlich ein Typo
if differences <= 2:
# Prüfe ob die Struktur ähnlich ist (gleiche Positionen der Bindestriche)
if incorrectId.count('-') == validId.count('-'):
return validId
return None