serviceCenter = DI-Container (Resolver, Registry, Context) fuer Service-Instanziierung serviceHub = Consumer-facing Aggregation (DB-Interfaces, Runtime-State, lazy Service-Resolution via serviceCenter) - modules/serviceHub/ erstellt: ServiceHub, PublicService, getInterface() - 22 Consumer-Dateien migriert (routes, features, tests): imports von modules.services auf serviceHub bzw. serviceCenter umgestellt - resolver.py: legacy fallback auf altes services/ entfernt - modules/services/ komplett geloescht (83 Dateien inkl. dead code mainAiChat.py) - pre-extraction: progress callback durch chunk-pipeline propagiert, operationType DATA_EXTRACT->DATA_ANALYSE fuer guenstigeres Modell
720 lines
40 KiB
Python
720 lines
40 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""
|
|
Content Extraction Module
|
|
|
|
Handles content extraction and preparation, including:
|
|
- Extracting content from documents based on intents
|
|
- Processing pre-extracted documents
|
|
- Vision AI for image text extraction
|
|
- AI processing of text content
|
|
"""
|
|
import json
|
|
import logging
|
|
import base64
|
|
from typing import Dict, Any, List, Optional
|
|
|
|
from modules.datamodels.datamodelChat import ChatDocument
|
|
from modules.datamodels.datamodelExtraction import ContentPart, DocumentIntent, ExtractionOptions, MergeStrategy
|
|
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ContentExtractor:
|
|
"""Handles content extraction and preparation."""
|
|
|
|
def __init__(self, services, aiService, intentAnalyzer):
    """
    Store the collaborators used by the extraction methods.

    Args:
        services: Service container; the methods of this class use its
            ``chat``, ``utils``, ``extraction`` and ``interfaceDbComponent`` members.
        aiService: AI service whose ``callAi`` coroutine is awaited for AI calls.
        intentAnalyzer: Analyzer whose ``resolvePreExtractedDocument`` is used to
            detect pre-extracted documents.
    """
    self.services, self.aiService, self.intentAnalyzer = (
        services,
        aiService,
        intentAnalyzer,
    )
|
|
|
|
async def extractAndPrepareContent(
    self,
    documents: List[ChatDocument],
    documentIntents: List[DocumentIntent],
    parentOperationId: str,
    getIntentForDocument: callable
) -> List[ContentPart]:
    """
    Phase 5B: extract content based on intents and prepare ContentParts with metadata.

    One document can produce several ContentParts when it carries several intents.
    Example: an image with intents=["extract", "render"] produces
    - ContentPart(contentFormat="object", ...) for rendering
    - ContentPart(contentFormat="extracted", ...) for text analysis

    Three paths are handled per document:
    1. Pre-extracted documents (resolved by the intent analyzer): their parts are
       reused directly, no extraction service call is made.
    2. JSON documents that already contain "documents" or "sections": passed through
       as a single reference part.
    3. Everything else: reference / object / extracted parts are built from the
       document intent, using the extraction service for "extract".

    Args:
        documents: Documents to process.
        documentIntents: DocumentIntent objects for these documents.
        parentOperationId: Parent operation id for the chat log hierarchy.
        getIntentForDocument: Callable ``(documentId, documentIntents)`` returning the
            intent for a document id (or a falsy value if none exists).

    Returns:
        ContentParts whose metadata has a ``documentId`` and a ``contentFormat`` of
        "extracted", "object" or "reference"; parts failing this check are dropped.

    Raises:
        Any exception raised while processing is logged, the progress log entry is
        finished as failed, and the exception is re-raised.
    """
    # Operation id for this extraction phase
    extractionOperationId = f"{parentOperationId}_content_extraction"

    # Start the chat log entry with a reference to the parent operation
    self.services.chat.progressLogStart(
        extractionOperationId,
        "Content Extraction",
        "Extraction",
        f"Extracting from {len(documents)} documents",
        parentOperationId=parentOperationId
    )

    try:
        allContentParts = []

        for document in documents:
            checkWorkflowStopped(self.services)
            # Check if document is already a ContentExtracted document (pre-extracted JSON)
            logger.debug(f"Checking document {document.id} ({document.fileName}, mimeType={document.mimeType}) for pre-extracted content")
            preExtracted = self.intentAnalyzer.resolvePreExtractedDocument(document)

            if preExtracted:
                originalFileName = preExtracted["originalDocument"]["fileName"]
                logger.info(f"✅ Found pre-extracted document: {document.fileName} -> Original: {originalFileName}")
                logger.info(f" Pre-extracted document ID: {document.id}, Original document ID: {preExtracted['originalDocument']['id']}")
                logger.info(f" ContentParts count: {len(preExtracted['contentExtracted'].parts) if preExtracted['contentExtracted'].parts else 0}")

                # Reuse the already extracted ContentParts directly
                contentExtracted = preExtracted["contentExtracted"]

                # IMPORTANT: the intent must be looked up for the JSON document, not the original
                # (intent analysis already maps back to the JSON document id)
                intent = getIntentForDocument(document.id, documentIntents)
                logger.info(f" Intent lookup for document {document.id}: found={intent is not None}")
                if intent:
                    logger.info(f" Intent: {intent.intents}, extractionPrompt: {intent.extractionPrompt[:100] if intent.extractionPrompt else None}...")
                else:
                    logger.warning(f" ⚠️ No intent found for pre-extracted document {document.id}! Available intent documentIds: {[i.documentId for i in documentIntents]}")

                if contentExtracted.parts:
                    # CRITICAL: Process pre-extracted parts - analyze structure parts for nested content
                    processedParts = []
                    for part in contentExtracted.parts:
                        # Skip empty containers (containers without data)
                        isEmptyPart = not part.data or (isinstance(part.data, str) and len(part.data.strip()) == 0)
                        if isEmptyPart and part.typeGroup == "container":
                            continue

                        # CRITICAL: Check if structure part contains nested parts (e.g., JSON with documentData.parts)
                        if part.typeGroup == "structure" and part.mimeType == "application/json" and part.data:
                            nestedParts = self._extractNestedPartsFromStructure(part, document, preExtracted, intent)
                            if nestedParts:
                                # Replace structure part with extracted nested parts
                                processedParts.extend(nestedParts)
                                logger.info(f"✅ Extracted {len(nestedParts)} nested parts from structure part {part.id}")
                                continue  # Skip original structure part

                        # Keep original part if no nested parts found
                        processedParts.append(part)

                    # Use processed parts (with nested parts extracted)
                    for part in processedParts:
                        if not part.metadata:
                            part.metadata = {}

                        # Ensure metadata is complete
                        if "documentId" not in part.metadata:
                            part.metadata["documentId"] = document.id

                        # Intents that apply to this part (default: extract)
                        partIntent = intent.intents if intent else ["extract"]

                        logger.debug(f"Processing part {part.id}: typeGroup={part.typeGroup}, intents={partIntent}, hasData={bool(part.data)}, dataLength={len(str(part.data)) if part.data else 0}")

                        # A part can carry several intents - one ContentPart is created per intent.
                        # Generic intent handling for ALL content types.
                        hasReferenceIntent = "reference" in partIntent
                        hasRenderIntent = "render" in partIntent
                        hasExtractIntent = "extract" in partIntent
                        hasPartData = bool(part.data) and (not isinstance(part.data, str) or len(part.data.strip()) > 0)

                        logger.debug(f"Part {part.id}: reference={hasReferenceIntent}, render={hasRenderIntent}, extract={hasExtractIntent}, hasData={hasPartData}")

                        # SAFETY: For images with any intent, always ensure render is included
                        # This ensures the image object part is always available for later rendering
                        isImage = part.typeGroup == "image" or (part.mimeType and part.mimeType.startswith("image/"))
                        if isImage and hasPartData and not hasRenderIntent:
                            logger.info(f"🖼️ Auto-adding render intent for image {part.id} (original intents: {partIntent})")
                            hasRenderIntent = True

                        # 1. Reference intent: create a reference ContentPart
                        if hasReferenceIntent:
                            referencePart = ContentPart(
                                id=f"ref_{document.id}_{part.id}",
                                label=f"Reference: {part.label or 'Content'}",
                                typeGroup="reference",
                                mimeType=part.mimeType or "application/octet-stream",
                                data="",  # empty - reference only
                                metadata={
                                    "contentFormat": "reference",
                                    "documentId": document.id,
                                    "documentReference": f"docItem:{document.id}:{originalFileName}",
                                    "intent": "reference",
                                    "usageHint": f"Reference: {originalFileName}",
                                    "originalFileName": originalFileName
                                }
                            )
                            allContentParts.append(referencePart)
                            logger.debug(f"✅ Created reference ContentPart for {part.id}")

                        # 2. Render intent: create an object ContentPart (binary/image rendering)
                        if hasRenderIntent and hasPartData:
                            # Only binary/image-like content can be rendered
                            isRenderable = (
                                part.typeGroup == "image" or
                                part.typeGroup == "binary" or
                                (part.mimeType and (
                                    part.mimeType.startswith("image/") or
                                    part.mimeType.startswith("video/") or
                                    part.mimeType.startswith("audio/") or
                                    self._isBinary(part.mimeType)
                                ))
                            )

                            if isRenderable:
                                # NOTE(review): relatedExtractedPartId uses an "extracted_" prefixed id,
                                # but the extracted part below keeps its own part.id - confirm which
                                # id consumers look up.
                                objectPart = ContentPart(
                                    id=f"obj_{document.id}_{part.id}",
                                    label=f"Object: {part.label or 'Content'}",
                                    typeGroup=part.typeGroup,
                                    mimeType=part.mimeType or "application/octet-stream",
                                    data=part.data,  # base64/binary data is already present
                                    metadata={
                                        "contentFormat": "object",
                                        "documentId": document.id,
                                        "intent": "render",
                                        "usageHint": f"Render as visual element: {originalFileName}",
                                        "originalFileName": originalFileName,
                                        "relatedExtractedPartId": f"extracted_{document.id}_{part.id}" if hasExtractIntent else None
                                    }
                                )
                                allContentParts.append(objectPart)
                                logger.debug(f"✅ Created object ContentPart for {part.id} (render intent)")
                            else:
                                logger.warning(f"⚠️ Part {part.id} has render intent but is not renderable (typeGroup={part.typeGroup}, mimeType={part.mimeType})")
                        elif hasRenderIntent and not hasPartData:
                            logger.warning(f"⚠️ Part {part.id} has render intent but no data, skipping render part")

                        # 3. Extract intent: create extracted ContentPart (NO AI processing here - happens during section generation)
                        if hasExtractIntent:
                            if part.typeGroup == "image" and hasPartData:
                                # For images: keep as image part with extract intent - Vision AI extraction happens during section generation
                                logger.info(f"📷 Image {part.id} with extract intent - will be processed with Vision AI during section generation")
                                part.metadata.update({
                                    "contentFormat": "extracted",  # Marked for extraction, but not yet extracted
                                    "intent": "extract",
                                    "originalFileName": originalFileName,
                                    "relatedObjectPartId": f"obj_{document.id}_{part.id}" if hasRenderIntent else None,
                                    "extractionPrompt": intent.extractionPrompt if intent and intent.extractionPrompt else "Extract all text content from this image.",
                                    "needsVisionExtraction": True  # Flag to indicate Vision AI extraction needed
                                })
                                allContentParts.append(part)
                            else:
                                # For text/table content: use directly as extracted (no AI processing here);
                                # AI processing with extractionPrompt happens during section generation
                                part.metadata.update({
                                    "contentFormat": "extracted",
                                    "intent": "extract",
                                    "fromExtractContent": True,
                                    "skipExtraction": True,  # Already extracted (raw extraction)
                                    "originalFileName": originalFileName,
                                    "relatedObjectPartId": f"obj_{document.id}_{part.id}" if hasRenderIntent else None,
                                    "extractionPrompt": intent.extractionPrompt if intent and intent.extractionPrompt else None
                                })
                                allContentParts.append(part)
                                logger.debug(f"✅ Using pre-extracted ContentPart {part.id} as extracted (no AI processing needed)")

                        # 4. Fallback: none of the known intents is present
                        # (should normally not happen because the default is "extract")
                        if not hasReferenceIntent and not hasRenderIntent and not hasExtractIntent:
                            logger.warning(f"⚠️ Part {part.id} has no recognized intents, adding as extracted by default")
                            part.metadata.update({
                                "contentFormat": "extracted",
                                "intent": "extract",
                                "fromExtractContent": True,
                                "skipExtraction": True,
                                "originalFileName": originalFileName
                            })
                            allContentParts.append(part)

                    logger.info(f"✅ Using {len([p for p in contentExtracted.parts if p.data and len(str(p.data)) > 0])} pre-extracted ContentParts from ContentExtracted document {document.fileName}")
                    logger.info(f" Original document: {originalFileName}")
                    continue  # Skip normal extraction for this document

            # Check if it's standardized JSON format (has "documents" or "sections")
            if document.mimeType == "application/json":
                docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
                if docBytes:
                    # Reset per document so a failed parse can never reuse values
                    # left over from a previous loop iteration.
                    docData = None
                    jsonData = None
                    try:
                        docData = docBytes.decode('utf-8')
                        jsonData = json.loads(docData)
                    except (json.JSONDecodeError, UnicodeDecodeError) as e:
                        logger.warning(f"Could not parse JSON document {document.fileName}: {str(e)}")
                        jsonData = None

                    if isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData):
                        logger.info("Document is already in standardized JSON format, using as reference")
                        contentPart = ContentPart(
                            id=f"ref_{document.id}",
                            label=f"Reference: {document.fileName}",
                            typeGroup="structure",
                            mimeType="application/json",
                            data=docData,
                            metadata={
                                "contentFormat": "reference",
                                "documentId": document.id,
                                "documentReference": f"docItem:{document.id}:{document.fileName}",
                                "skipExtraction": True,
                                "intent": "reference"
                            }
                        )
                        allContentParts.append(contentPart)
                        logger.info("✅ Using JSON document directly without extraction")
                        continue

            # Normal extraction path
            intent = getIntentForDocument(document.id, documentIntents)

            if not intent:
                # Try to find intent by similar UUID (fix for AI UUID hallucination)
                correctedIntent = self._findIntentBySimilarId(document.id, documentIntents)
                if correctedIntent:
                    logger.warning(f"Found intent for document {document.id} using UUID correction (original: {correctedIntent.documentId})")
                    # Create new intent with correct document ID
                    intent = DocumentIntent(
                        documentId=document.id,
                        intents=correctedIntent.intents,
                        extractionPrompt=correctedIntent.extractionPrompt,
                        reasoning=f"Intent matched by UUID similarity (original: {correctedIntent.documentId})"
                    )
                else:
                    # Default: "extract" for every document without an intent
                    logger.warning(f"No intent found for document {document.id}, using default 'extract'")
                    intent = DocumentIntent(
                        documentId=document.id,
                        intents=["extract"],
                        extractionPrompt="Extract all content from the document",
                        reasoning="Default intent: no specific intent found"
                    )

            # All intents are checked - one document can produce several ContentParts
            if "reference" in intent.intents:
                contentPart = ContentPart(
                    id=f"ref_{document.id}",
                    label=f"Reference: {document.fileName}",
                    typeGroup="reference",
                    mimeType=document.mimeType,
                    data="",
                    metadata={
                        "contentFormat": "reference",
                        "documentId": document.id,
                        "documentReference": f"docItem:{document.id}:{document.fileName}",
                        "intent": "reference",
                        "usageHint": f"Reference document: {document.fileName}"
                    }
                )
                allContentParts.append(contentPart)

            # "render" and "extract" may both be present - in that case BOTH ContentParts are produced.

            # A missing mimeType is treated as "" for the prefix checks below so that a
            # document without mimeType cannot abort the whole extraction run.
            mimeType = document.mimeType or ""

            # SAFETY: For images with any intent, always create object part for later rendering
            isImageDocument = mimeType.startswith("image/")
            shouldAutoRender = isImageDocument and "render" not in intent.intents and ("extract" in intent.intents or "reference" in intent.intents)
            if shouldAutoRender:
                logger.info(f"🖼️ Auto-adding render for image document {document.id} (original intents: {intent.intents})")

            if "render" in intent.intents or shouldAutoRender:
                # Images/binary documents are carried as an object part
                if isImageDocument or self._isBinary(mimeType):
                    try:
                        # Load binary data (getFileData is synchronous - no await needed)
                        binaryData = self.services.interfaceDbComponent.getFileData(document.fileId)
                        if not binaryData:
                            logger.warning(f"No binary data found for document {document.id}")
                            # NOTE(review): this also skips a pending "extract" intent for
                            # this document - confirm that is intended.
                            continue
                        base64Data = base64.b64encode(binaryData).decode('utf-8')

                        # NOTE(review): relatedExtractedPartId points at "ext_<documentId>", while
                        # extracted parts below get "ext_<documentId>_<partId>" - confirm which id
                        # consumers look up.
                        contentPart = ContentPart(
                            id=f"obj_{document.id}",
                            label=f"Object: {document.fileName}",
                            typeGroup="image" if isImageDocument else "binary",
                            mimeType=document.mimeType,
                            data=base64Data,
                            metadata={
                                "contentFormat": "object",
                                "documentId": document.id,
                                "intent": "render",
                                "usageHint": f"Render as visual element: {document.fileName}",
                                "originalFileName": document.fileName,
                                # Link to the extracted part (if any)
                                "relatedExtractedPartId": f"ext_{document.id}" if "extract" in intent.intents else None
                            }
                        )
                        allContentParts.append(contentPart)
                    except Exception as e:
                        logger.error(f"Failed to load binary data for document {document.id}: {str(e)}")

            if "extract" in intent.intents:
                # Extract content with the extraction service
                extractionPrompt = intent.extractionPrompt or "Extract all content from the document"

                # Debug log (harmonized)
                self.services.utils.writeDebugFile(
                    extractionPrompt,
                    f"content_extraction_prompt_{document.id}"
                )

                extractionOptions = ExtractionOptions(
                    prompt=extractionPrompt,
                    mergeStrategy=MergeStrategy()
                )

                # extractContent is synchronous - no await needed
                checkWorkflowStopped(self.services)
                extractedResults = self.services.extraction.extractContent(
                    [document],
                    extractionOptions,
                    operationId=extractionOperationId,
                    parentOperationId=extractionOperationId
                )

                # Convert extraction results to ContentParts with metadata.
                # Check if object part exists (either explicit render or auto-render for images)
                hasObjectPart = "render" in intent.intents or shouldAutoRender

                for extracted in extractedResults or []:
                    for part in extracted.parts or []:
                        # Same guard as in the pre-extracted path: metadata may be unset
                        if not part.metadata:
                            part.metadata = {}

                        # Mark as extracted format
                        part.metadata.update({
                            "contentFormat": "extracted",
                            "documentId": document.id,
                            "extractionPrompt": extractionPrompt,
                            "intent": "extract",
                            "usageHint": f"Use extracted content from {document.fileName}",
                            # Link to the object part (if any - including auto-render for images)
                            "relatedObjectPartId": f"obj_{document.id}" if hasObjectPart else None
                        })

                        # For images: Mark that Vision AI extraction is needed during section generation
                        if part.typeGroup == "image":
                            part.metadata["needsVisionExtraction"] = True
                            logger.info(f"📷 Image part {part.id} marked for Vision AI extraction during section generation")

                        # Keep the id unique when an object part exists for the same document
                        if hasObjectPart:
                            part.id = f"ext_{document.id}_{part.id}"
                        allContentParts.append(part)

        # Debug log (harmonized)
        self.services.utils.writeDebugFile(
            json.dumps([part.dict() for part in allContentParts], indent=2, default=str),
            "content_extraction_result"
        )

        # State 2 Validation: Validate and auto-fix ContentParts
        validatedParts = []
        for part in allContentParts:
            # Validation 2.1: Skip ContentParts without documentId
            if not part.metadata.get("documentId"):
                logger.warning(f"Skipping ContentPart {part.id} - missing documentId in metadata")
                continue

            # Validation 2.2: Skip ContentParts with invalid contentFormat
            contentFormat = part.metadata.get("contentFormat")
            if contentFormat not in ["extracted", "object", "reference"]:
                logger.warning(
                    f"Skipping ContentPart {part.id} - invalid contentFormat: {contentFormat}"
                )
                continue

            validatedParts.append(part)

        # Finish the chat log entry
        self.services.chat.progressLogFinish(extractionOperationId, True)

        return validatedParts

    except Exception as e:
        self.services.chat.progressLogFinish(extractionOperationId, False)
        logger.error(f"Error in extractAndPrepareContent: {str(e)}")
        raise
|
|
|
|
async def extractTextFromImage(self, imagePart: ContentPart, extractionPrompt: str) -> Optional[str]:
    """
    Extract text from an image part using Vision AI.

    Args:
        imagePart: ContentPart with typeGroup="image"; it is passed unchanged to the AI call.
        extractionPrompt: Prompt for the text extraction; a generic default prompt is
            used when it is empty.

    Returns:
        The stripped response content on success. On failure this method does NOT
        return None: it returns a marker string of the form "[ERROR: <message>]",
        both when the AI returns no content and when an Exception is raised
        (exceptions derived from Exception are caught and logged here).
    """
    try:
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum

        # Final extraction prompt
        finalPrompt = extractionPrompt or "Extract all text content from this image. Return only the extracted text, no additional formatting."

        # Debug log (harmonized)
        self.services.utils.writeDebugFile(
            finalPrompt,
            f"content_extraction_prompt_image_{imagePart.id}"
        )

        # Build the AI call request carrying the image part
        request = AiCallRequest(
            prompt=finalPrompt,
            context="",
            options=AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE),
            contentParts=[imagePart]
        )

        # NOTE(review): if checkWorkflowStopped signals a stop by raising an Exception
        # subclass, it is caught below and turned into an "[ERROR: ...]" string - confirm
        # that is intended.
        checkWorkflowStopped(self.services)
        response = await self.aiService.callAi(request)

        content = response.content if response else None
        if content:
            # Debug log for the response (harmonized)
            self.services.utils.writeDebugFile(
                content,
                f"content_extraction_response_image_{imagePart.id}"
            )
            return content.strip()

        # No content returned - hand back an error marker for debugging
        errorMsg = f"Vision AI extraction failed: No content returned for image {imagePart.id}"
        logger.warning(errorMsg)
        return f"[ERROR: {errorMsg}]"
    except Exception as e:
        errorMsg = f"Vision AI extraction failed for image {imagePart.id}: {str(e)}"
        logger.error(errorMsg)
        import traceback
        logger.debug(f"Traceback: {traceback.format_exc()}")
        # Return an error marker instead of None to ease debugging
        return f"[ERROR: {errorMsg}]"
|
|
|
|
async def processTextContentWithAi(self, textPart: ContentPart, extractionPrompt: str) -> Optional[str]:
    """
    Process text content with AI based on the extraction prompt.

    Pre-extracted ContentParts contain RAW extracted text (e.g. from a PDF text
    layer). When an "extract" intent is present, that text is processed by the AI
    (transformation, structuring, ...) according to extractionPrompt.

    Args:
        textPart: ContentPart holding the text; its data, id, label and a copy of its
            metadata are sent to the AI as a "text"/"text/plain" part.
        extractionPrompt: Prompt for the AI processing; a generic default prompt is
            used when it is empty.

    Returns:
        The stripped response content on success. On failure this method does NOT
        return None: it returns a marker string of the form "[ERROR: <message>]",
        both when the AI returns no content and when an Exception is raised
        (exceptions derived from Exception are caught and logged here).
    """
    try:
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum

        # Final extraction prompt
        finalPrompt = extractionPrompt or "Process and extract the key information from the following text content."

        # Debug log (harmonized) - prompt plus a preview of at most 500 characters.
        # Non-string data is stringified for the preview only, so that building the
        # debug output cannot fail the actual AI call.
        rawData = textPart.data or ""
        previewSource = rawData if isinstance(rawData, str) else str(rawData)
        textPreview = previewSource[:500] + "..." if len(previewSource) > 500 else previewSource
        promptWithContext = f"{finalPrompt}\n\n--- Text Content (preview) ---\n{textPreview}"
        self.services.utils.writeDebugFile(
            promptWithContext,
            f"content_extraction_prompt_text_{textPart.id}"
        )

        # Build a text ContentPart for the AI call, using the existing text as input
        textContentPart = ContentPart(
            id=textPart.id,
            label=textPart.label,
            typeGroup="text",
            mimeType="text/plain",
            data=textPart.data if textPart.data else "",
            metadata=textPart.metadata.copy() if textPart.metadata else {}
        )

        # Build the AI call request carrying the text part
        request = AiCallRequest(
            prompt=finalPrompt,
            context="",
            options=AiCallOptions(operationType=OperationTypeEnum.DATA_EXTRACT),
            contentParts=[textContentPart]
        )

        # NOTE(review): if checkWorkflowStopped signals a stop by raising an Exception
        # subclass, it is caught below and turned into an "[ERROR: ...]" string - confirm
        # that is intended.
        checkWorkflowStopped(self.services)
        response = await self.aiService.callAi(request)

        content = response.content if response else None
        if content:
            # Debug log for the response (harmonized)
            self.services.utils.writeDebugFile(
                content,
                f"content_extraction_response_text_{textPart.id}"
            )
            return content.strip()

        # No content returned - hand back an error marker for debugging
        errorMsg = f"AI text processing failed: No content returned for text part {textPart.id}"
        logger.warning(errorMsg)
        return f"[ERROR: {errorMsg}]"
    except Exception as e:
        errorMsg = f"AI text processing failed for text part {textPart.id}: {str(e)}"
        logger.error(errorMsg)
        import traceback
        logger.debug(f"Traceback: {traceback.format_exc()}")
        # Return an error marker instead of None to ease debugging
        return f"[ERROR: {errorMsg}]"
|
|
|
|
def _isBinary(self, mimeType: str) -> bool:
    """
    Check whether a MIME type denotes binary content.

    A type counts as binary when it is one of a fixed set of application types
    (octet-stream, pdf, zip, x-zip-compressed) or belongs to the image/, video/
    or audio/ families. The comparison ignores letter case, surrounding
    whitespace and any parameters after ';'.

    Args:
        mimeType: MIME type string; None, an empty string or a non-string value
            is treated as "not binary".

    Returns:
        True if the MIME type is considered binary, otherwise False.
    """
    if not mimeType or not isinstance(mimeType, str):
        return False

    # Drop parameters such as "; charset=binary" and normalize case
    normalized = mimeType.split(";", 1)[0].strip().lower()

    binaryTypes = (
        "application/octet-stream",
        "application/pdf",
        "application/zip",
        "application/x-zip-compressed",
    )
    return normalized in binaryTypes or normalized.startswith(("image/", "video/", "audio/"))
|
|
|
|
def _extractNestedPartsFromStructure(
    self,
    structurePart: ContentPart,
    document: ChatDocument,
    preExtracted: Dict[str, Any],
    intent: Optional[Any]
) -> List[ContentPart]:
    """
    Extract nested parts from a structure ContentPart (e.g., JSON with documentData.parts).

    The part's data is parsed as JSON. If it is an object with a ``documentData``
    object that holds a non-empty ``parts`` list, every dict entry of that list is
    turned into its own ContentPart. Entries that are not dicts are ignored.
    Missing or null fields fall back to defaults: typeGroup "text", mimeType
    "text/plain", the structure part's label, empty data and empty metadata.

    Args:
        structurePart: ContentPart whose data holds the JSON structure.
        document: The document this part belongs to; its id is written to the
            metadata of every nested part.
        preExtracted: Pre-extracted document info; ``["originalDocument"]["fileName"]``
            is written to the metadata of every nested part.
        intent: Document intent (currently not used by this method).

    Returns:
        List of extracted ContentParts; empty if no nested parts were found or the
        data could not be processed (errors are logged, not raised).
    """
    nestedParts = []

    try:
        # Parse JSON structure
        jsonData = json.loads(structurePart.data)

        # Standard ContentExtracted format: documentData.parts
        documentData = jsonData.get("documentData") if isinstance(jsonData, dict) else None
        parts = documentData.get("parts", []) if isinstance(documentData, dict) else None

        if isinstance(parts, list):
            for nestedPartData in parts:
                if not isinstance(nestedPartData, dict):
                    continue

                # "or"/None fallbacks also cover keys that are present but null in the JSON
                nestedPartId = nestedPartData.get("id") or f"nested_{len(nestedParts)}"
                nestedTypeGroup = nestedPartData.get("typeGroup") or "text"
                nestedMimeType = nestedPartData.get("mimeType") or "text/plain"
                nestedLabel = nestedPartData.get("label")
                if nestedLabel is None:
                    nestedLabel = structurePart.label
                nestedData = nestedPartData.get("data")
                if nestedData is None:
                    nestedData = ""
                nestedMetadata = nestedPartData.get("metadata")
                if not isinstance(nestedMetadata, dict):
                    nestedMetadata = {}

                # Create ContentPart for nested part
                nestedPart = ContentPart(
                    id=f"{structurePart.id}_{nestedPartId}",
                    parentId=structurePart.id,
                    label=nestedLabel,
                    typeGroup=nestedTypeGroup,
                    mimeType=nestedMimeType,
                    data=nestedData,
                    metadata={
                        **nestedMetadata,
                        "documentId": document.id,
                        "fromNestedStructure": True,
                        "parentStructurePartId": structurePart.id,
                        "originalFileName": preExtracted["originalDocument"]["fileName"]
                    }
                )

                nestedParts.append(nestedPart)
                logger.debug(f"✅ Extracted nested part: {nestedPart.id} (typeGroup={nestedTypeGroup}, mimeType={nestedMimeType})")

        # If no nested parts found, return empty list (original part will be kept)
        if not nestedParts:
            logger.debug(f"No nested parts found in structure part {structurePart.id}")

    except json.JSONDecodeError as e:
        logger.warning(f"Could not parse structure part {structurePart.id} as JSON: {str(e)}")
    except Exception as e:
        logger.error(f"Error extracting nested parts from structure part {structurePart.id}: {str(e)}")

    return nestedParts
|
|
|
|
def _findIntentBySimilarId(self, documentId: str, documentIntents: "List[DocumentIntent]") -> "Optional[DocumentIntent]":
    """
    Find the intent whose document id is most similar to the given document id.

    This compensates for AI UUID hallucinations (e.g. 4451 -> 4551). Only ids in
    UUID text form are considered: 36 characters with 4 hyphens. A candidate
    matches when its hyphens sit at the same positions as in documentId and at
    most 2 characters differ. Among several matches the one with the fewest
    differing characters wins; ties go to the earliest entry in documentIntents.

    Args:
        documentId: Document id for which an intent is searched.
        documentIntents: All available DocumentIntents; entries whose documentId
            is not a 36-character string are skipped.

    Returns:
        The closest matching DocumentIntent, or None if documentId is not in UUID
        form or no candidate is within the tolerance.
    """
    uuidLength = 36  # UUID format: 8-4-4-4-12
    maxDifferences = 2

    if not documentId or not isinstance(documentId, str) or len(documentId) != uuidLength:
        return None
    if documentId.count('-') != 4:
        return None

    hyphenPositions = [index for index, char in enumerate(documentId) if char == '-']

    bestIntent = None
    bestDifferences = maxDifferences + 1
    for intent in documentIntents or []:
        intentId = getattr(intent, "documentId", None)
        if not isinstance(intentId, str) or len(intentId) != uuidLength:
            continue

        # The structure must match: hyphens at identical positions
        if [index for index, char in enumerate(intentId) if char == '-'] != hyphenPositions:
            continue

        # Count differing characters
        differences = sum(c1 != c2 for c1, c2 in zip(documentId, intentId))

        # Strictly-less keeps the earliest candidate on ties
        if differences < bestDifferences:
            bestIntent = intent
            bestDifferences = differences
            if differences == 0:
                break

    return bestIntent
|
|
|