# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
|
|
"""
Content Extraction Module

Handles content extraction and preparation, including:
- Extracting content from documents based on intents
- Processing pre-extracted documents
- Vision AI for image text extraction
- AI processing of text content
"""
|
|
import json
|
|
import logging
|
|
import base64
|
|
from typing import Dict, Any, List, Optional
|
|
|
|
from modules.datamodels.datamodelChat import ChatDocument
|
|
from modules.datamodels.datamodelExtraction import ContentPart, DocumentIntent
|
|
|
|
# Module-level logger, named after this module (__name__).
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ContentExtractor:
    """Turn chat documents into ``ContentPart`` lists according to their intents.

    Collaborators are injected; this class only calls:
    - ``services.chat.progressLogStart`` / ``services.chat.progressLogFinish``
    - ``services.utils.writeDebugFile``
    - ``services.interfaceDbComponent.getFileData``
    - ``services.extraction.extractContent``
    - ``aiService.callAi``
    - ``intentAnalyzer.resolvePreExtractedDocument``
    """

    # Fallback prompts used when the caller / intent supplies none.
    _DEFAULT_IMAGE_PROMPT = "Extract all text content from this image. Return only the extracted text, no additional formatting."
    _DEFAULT_TEXT_PROMPT = "Process and extract the key information from the following text content."
    _DEFAULT_DOCUMENT_PROMPT = "Extract all content from the document"

    # Prefix every failure string returned by the AI helpers starts with.
    _ERROR_PREFIX = "[ERROR:"

    # MIME types (exact match) and MIME prefixes that _isBinary treats as binary.
    _BINARY_MIME_TYPES = frozenset({
        "application/octet-stream",
        "application/pdf",
        "application/zip",
        "application/x-zip-compressed",
    })
    _BINARY_MIME_PREFIXES = ("image/", "video/", "audio/")

    def __init__(self, services, aiService, intentAnalyzer):
        """Initialize ContentExtractor with service center, AI service, and intent analyzer access."""
        self.services = services
        self.aiService = aiService
        self.intentAnalyzer = intentAnalyzer

    async def extractAndPrepareContent(
        self,
        documents: List[ChatDocument],
        documentIntents: List[DocumentIntent],
        parentOperationId: str,
        getIntentForDocument: callable
    ) -> List[ContentPart]:
        """
        Phase 5B: extract content based on intents and prepare ContentParts with metadata.

        A single document can yield several ContentParts when it carries several
        intents. Example: an image with intents=["extract", "render"] yields
        - ContentPart(contentFormat="object", ...) for rendering
        - ContentPart(contentFormat="extracted", ...) for text analysis

        Per document, the first matching route is taken:
        1. pre-extracted document (resolved by the intent analyzer),
        2. JSON document that is a dict with a "documents" or "sections" key
           (passed through as a reference part),
        3. normal extraction driven by the document's intents
           (default intent "extract" when none is found).

        Args:
            documents: Documents to process.
            documentIntents: DocumentIntent objects for these documents.
            parentOperationId: Parent operation ID for the ChatLog hierarchy.
            getIntentForDocument: Callable invoked as
                ``getIntentForDocument(documentId, documentIntents)``; a falsy
                result means "no intent found".

        Returns:
            List of ContentParts with their metadata filled in.

        Raises:
            Exception: Any error not handled by a per-part handler is logged,
                the progress log is finished as failed, and the error is re-raised.
        """
        extractionOperationId = f"{parentOperationId}_content_extraction"

        # Open the ChatLog entry, linked to the parent operation.
        self.services.chat.progressLogStart(
            extractionOperationId,
            "Content Extraction",
            "Extraction",
            f"Extracting from {len(documents)} documents",
            parentOperationId=parentOperationId
        )

        try:
            allContentParts: List[ContentPart] = []

            for document in documents:
                logger.debug(f"Checking document {document.id} ({document.fileName}, mimeType={document.mimeType}) for pre-extracted content")
                preExtracted = self.intentAnalyzer.resolvePreExtractedDocument(document)

                # Route 1: document is already a ContentExtracted JSON.
                if preExtracted:
                    allContentParts.extend(
                        await self._preparePreExtractedDocument(
                            document, preExtracted, documentIntents, getIntentForDocument
                        )
                    )
                    continue

                # Route 2: standardized JSON is used as-is.
                structuredJsonPart = self._buildStructuredJsonReference(document)
                if structuredJsonPart is not None:
                    allContentParts.append(structuredJsonPart)
                    logger.info("✅ Using JSON document directly without extraction")
                    continue

                # Route 3: normal extraction.
                intent = getIntentForDocument(document.id, documentIntents)
                if not intent:
                    # Default: "extract" for every document without an intent.
                    logger.warning(f"No intent found for document {document.id}, using default 'extract'")
                    intent = DocumentIntent(
                        documentId=document.id,
                        intents=["extract"],
                        extractionPrompt=self._DEFAULT_DOCUMENT_PROMPT,
                        reasoning="Default intent: no specific intent found"
                    )
                allContentParts.extend(
                    self._extractDocumentByIntent(document, intent, extractionOperationId)
                )

            # Debug dump of the final result.
            self.services.utils.writeDebugFile(
                json.dumps([part.dict() for part in allContentParts], indent=2, default=str),
                "content_extraction_result"
            )

            self.services.chat.progressLogFinish(extractionOperationId, True)
            return allContentParts

        except Exception as e:
            # Log first so the original error is recorded even if closing the
            # progress log fails as well.
            logger.error(f"Error in extractAndPrepareContent: {str(e)}")
            self.services.chat.progressLogFinish(extractionOperationId, False)
            raise

    async def _preparePreExtractedDocument(
        self,
        document: ChatDocument,
        preExtracted: Dict[str, Any],
        documentIntents: List[DocumentIntent],
        getIntentForDocument: callable
    ) -> List[ContentPart]:
        """Build ContentParts for a document whose content was extracted earlier.

        ``preExtracted`` is read via ``["originalDocument"]["fileName"]``,
        ``["originalDocument"]["id"]`` and ``["contentExtracted"].parts``.
        """
        originalDocument = preExtracted["originalDocument"]
        originalFileName = originalDocument["fileName"]
        contentExtracted = preExtracted["contentExtracted"]
        sourceParts = contentExtracted.parts or []

        logger.info(f"✅ Found pre-extracted document: {document.fileName} -> Original: {originalFileName}")
        logger.info(f" Pre-extracted document ID: {document.id}, Original document ID: {originalDocument['id']}")
        logger.info(f" ContentParts count: {len(sourceParts)}")

        # The intent is looked up with the ID of the JSON document itself,
        # not with the ID of the original document.
        intent = getIntentForDocument(document.id, documentIntents)
        logger.info(f" Intent lookup for document {document.id}: found={intent is not None}")
        if intent:
            logger.info(f" Intent: {intent.intents}, extractionPrompt: {intent.extractionPrompt[:100] if intent.extractionPrompt else None}...")
        else:
            logger.warning(f" ⚠️ No intent found for pre-extracted document {document.id}! Available intent documentIds: {[i.documentId for i in documentIntents]}")

        preparedParts: List[ContentPart] = []
        for part in sourceParts:
            preparedParts.extend(
                await self._preparePreExtractedPart(document, part, intent, originalFileName)
            )

        usableCount = len([p for p in sourceParts if p.data and len(str(p.data)) > 0])
        logger.info(f"✅ Using {usableCount} pre-extracted ContentParts from ContentExtracted document {document.fileName}")
        logger.info(f" Original document: {originalFileName}")
        return preparedParts

    async def _preparePreExtractedPart(
        self,
        document: ChatDocument,
        part: ContentPart,
        intent: Optional[DocumentIntent],
        originalFileName: str
    ) -> List[ContentPart]:
        """Create one ContentPart per applicable intent for a pre-extracted part.

        Without an intent the part is treated as ["extract"]. Returns an empty
        list for empty container parts.
        """
        hasPartData = bool(part.data) and (not isinstance(part.data, str) or len(part.data.strip()) > 0)

        # Skip empty containers (containers that carry no data).
        if not hasPartData and part.typeGroup == "container":
            return []

        # Make sure the metadata dict exists and names its document.
        if not part.metadata:
            part.metadata = {}
        if "documentId" not in part.metadata:
            part.metadata["documentId"] = document.id

        partIntents = intent.intents if intent else ["extract"]
        logger.debug(f"Processing part {part.id}: typeGroup={part.typeGroup}, intents={partIntents}, hasData={bool(part.data)}, dataLength={len(str(part.data)) if part.data else 0}")

        hasReferenceIntent = "reference" in partIntents
        hasRenderIntent = "render" in partIntents
        hasExtractIntent = "extract" in partIntents
        logger.debug(f"Part {part.id}: reference={hasReferenceIntent}, render={hasRenderIntent}, extract={hasExtractIntent}, hasData={hasPartData}")

        results: List[ContentPart] = []

        # 1. Reference intent: data-less part that only points at the document.
        if hasReferenceIntent:
            results.append(ContentPart(
                id=f"ref_{document.id}_{part.id}",
                label=f"Reference: {part.label or 'Content'}",
                typeGroup="reference",
                mimeType=part.mimeType or "application/octet-stream",
                data="",  # empty on purpose: reference only
                metadata={
                    "contentFormat": "reference",
                    "documentId": document.id,
                    "documentReference": f"docItem:{document.id}:{originalFileName}",
                    "intent": "reference",
                    "usageHint": f"Reference: {originalFileName}",
                    "originalFileName": originalFileName
                }
            ))
            logger.debug(f"✅ Created reference ContentPart for {part.id}")

        # 2. Render intent: object part for image/binary rendering.
        if hasRenderIntent and hasPartData:
            isRenderable = part.typeGroup in ("image", "binary") or self._isBinary(part.mimeType)
            if isRenderable:
                results.append(ContentPart(
                    id=f"obj_{document.id}_{part.id}",
                    label=f"Object: {part.label or 'Content'}",
                    typeGroup=part.typeGroup,
                    mimeType=part.mimeType or "application/octet-stream",
                    data=part.data,  # the part's data is passed through unchanged
                    metadata={
                        "contentFormat": "object",
                        "documentId": document.id,
                        "intent": "render",
                        "usageHint": f"Render as visual element: {originalFileName}",
                        "originalFileName": originalFileName,
                        # NOTE(review): only image / AI-processed extractions
                        # actually get this ID; a part reused as-is keeps its own ID.
                        "relatedExtractedPartId": f"extracted_{document.id}_{part.id}" if hasExtractIntent else None
                    }
                ))
                logger.debug(f"✅ Created object ContentPart for {part.id} (render intent)")
            else:
                logger.warning(f"⚠️ Part {part.id} has render intent but is not renderable (typeGroup={part.typeGroup}, mimeType={part.mimeType})")
        elif hasRenderIntent:
            logger.warning(f"⚠️ Part {part.id} has render intent but no data, skipping render part")

        # 3. Extract intent: extracted part, possibly with extra AI processing.
        if hasExtractIntent:
            if part.typeGroup == "image" and hasPartData:
                results.extend(
                    await self._extractPreExtractedImage(document, part, intent, originalFileName, hasRenderIntent)
                )
            else:
                results.append(
                    await self._extractPreExtractedContent(document, part, intent, originalFileName, hasRenderIntent)
                )
        elif not hasReferenceIntent and not hasRenderIntent:
            # 4. No recognized intent at all: keep the part as extracted content.
            logger.warning(f"⚠️ Part {part.id} has no recognized intents, adding as extracted by default")
            results.append(self._markAsAlreadyExtracted(part, originalFileName))

        return results

    async def _extractPreExtractedImage(
        self,
        document: ChatDocument,
        part: ContentPart,
        intent: Optional[DocumentIntent],
        originalFileName: str,
        hasRenderIntent: bool
    ) -> List[ContentPart]:
        """Run Vision AI on an image part and wrap the outcome in a text part.

        Returns a one-element list (extracted text or an error-marked part) or
        an empty list when building the part raised; there is no fallback to
        the original part because an image is not text.
        """
        logger.info(f"🔄 Processing image {part.id} with Vision AI (extract intent)")
        try:
            extractionPrompt = intent.extractionPrompt if intent and intent.extractionPrompt else self._DEFAULT_IMAGE_PROMPT
            extractedText = await self.extractTextFromImage(part, extractionPrompt)
            if not extractedText:
                # Defensive: treat an empty answer like any other failure.
                extractedText = f"[ERROR: Vision AI extraction failed: Unexpected empty response for image {part.id}]"

            isError = extractedText.startswith(self._ERROR_PREFIX)
            textPart = ContentPart(
                id=f"extracted_{document.id}_{part.id}",
                label=f"Extracted text from {part.label or 'Image'}" if not isError else f"Error extracting from {part.label or 'Image'}",
                typeGroup="text",
                mimeType="text/plain",
                data=extractedText,
                metadata={
                    "contentFormat": "extracted",
                    "documentId": document.id,
                    "intent": "extract",
                    "originalFileName": originalFileName,
                    "relatedObjectPartId": f"obj_{document.id}_{part.id}" if hasRenderIntent else None,
                    "extractionPrompt": extractionPrompt,
                    "extractionMethod": "vision",
                    "isError": isError
                }
            )
            if isError:
                logger.error(f"❌ Vision AI extraction failed for image {part.id}: {extractedText}")
            else:
                logger.info(f"✅ Extracted text from image {part.id} using Vision AI: {len(extractedText)} chars")
            return [textPart]
        except Exception as e:
            logger.error(f"❌ Failed to extract text from image {part.id}: {str(e)}")
            logger.debug("Traceback:", exc_info=True)
            if not hasRenderIntent:
                logger.debug(f"Image {part.id} has only extract intent, Vision AI failed - no extracted text available")
            return []

    async def _extractPreExtractedContent(
        self,
        document: ChatDocument,
        part: ContentPart,
        intent: Optional[DocumentIntent],
        originalFileName: str,
        hasRenderIntent: bool
    ) -> ContentPart:
        """Handle the extract intent for a non-image pre-extracted part.

        Text-like parts are processed with AI when the intent carries an
        extractionPrompt. Otherwise, and when that processing raises, the
        original part is reused and marked as already extracted.
        """
        relatedObjectPartId = f"obj_{document.id}_{part.id}" if hasRenderIntent else None

        # Text-like: text/table type group, or any non-blank string payload.
        isTextContent = (
            part.typeGroup == "text" or
            part.typeGroup == "table" or
            (isinstance(part.data, str) and len(part.data.strip()) > 0)
        )

        if isTextContent and intent and intent.extractionPrompt:
            logger.info(f"🔄 Processing text content {part.id} with AI (extract intent with prompt)")
            try:
                extractionPrompt = intent.extractionPrompt
                processedText = await self.processTextContentWithAi(part, extractionPrompt)
                if not processedText:
                    # Defensive: treat an empty answer like any other failure.
                    processedText = f"[ERROR: AI text processing failed: Unexpected empty response for part {part.id}]"

                isError = processedText.startswith(self._ERROR_PREFIX)
                processedPart = ContentPart(
                    id=f"extracted_{document.id}_{part.id}",
                    label=f"AI-processed: {part.label or 'Content'}" if not isError else f"Error processing {part.label or 'Content'}",
                    typeGroup="text",
                    mimeType="text/plain",
                    data=processedText,
                    metadata={
                        "contentFormat": "extracted",
                        "documentId": document.id,
                        "intent": "extract",
                        "originalFileName": originalFileName,
                        "relatedObjectPartId": relatedObjectPartId,
                        "extractionPrompt": extractionPrompt,
                        "extractionMethod": "ai",
                        "sourcePartId": part.id,
                        "fromExtractContent": True,
                        "isError": isError
                    }
                )
                if isError:
                    logger.error(f"❌ AI text processing failed for part {part.id}: {processedText}")
                else:
                    logger.info(f"✅ Processed text content {part.id} with AI: {len(processedText)} chars")
                return processedPart
            except Exception as e:
                logger.error(f"❌ Failed to process text content {part.id} with AI: {str(e)}")
                logger.debug("Traceback:", exc_info=True)
                # Fallback: hand the original part through.
                return self._markAsAlreadyExtracted(
                    part, originalFileName, {"relatedObjectPartId": relatedObjectPartId}
                )

        # No extractionPrompt or no text content: the part is used directly.
        self._markAsAlreadyExtracted(
            part, originalFileName, {"relatedObjectPartId": relatedObjectPartId}
        )
        logger.debug(f"✅ Using pre-extracted ContentPart {part.id} as extracted (no AI processing needed)")
        return part

    def _markAsAlreadyExtracted(
        self,
        part: ContentPart,
        originalFileName: str,
        extraMetadata: Optional[Dict[str, Any]] = None
    ) -> ContentPart:
        """Tag ``part`` in place as extracted content that needs no further extraction.

        ``extraMetadata`` entries are merged in last. Returns the same part.
        """
        metadata = {
            "contentFormat": "extracted",
            "intent": "extract",
            "fromExtractContent": True,
            "skipExtraction": True,
            "originalFileName": originalFileName
        }
        if extraMetadata:
            metadata.update(extraMetadata)
        part.metadata.update(metadata)
        return part

    def _buildStructuredJsonReference(self, document: ChatDocument) -> Optional[ContentPart]:
        """Return a reference part for standardized JSON documents, else None.

        "Standardized" means: mimeType is application/json and the decoded
        payload is a dict with a "documents" or "sections" key. Any error while
        loading or parsing is logged as a warning and yields None, so the
        caller continues with normal extraction.
        """
        if document.mimeType != "application/json":
            return None

        try:
            docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
            if not docBytes:
                return None

            docData = docBytes.decode('utf-8')
            jsonData = json.loads(docData)
            if not (isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData)):
                return None

            logger.info("Document is already in standardized JSON format, using as reference")
            return ContentPart(
                id=f"ref_{document.id}",
                label=f"Reference: {document.fileName}",
                typeGroup="structure",
                mimeType="application/json",
                data=docData,
                metadata={
                    "contentFormat": "reference",
                    "documentId": document.id,
                    "documentReference": f"docItem:{document.id}:{document.fileName}",
                    "skipExtraction": True,
                    "intent": "reference"
                }
            )
        except Exception as e:
            logger.warning(f"Could not parse JSON document {document.fileName}, will extract normally: {str(e)}")
            return None

    def _extractDocumentByIntent(
        self,
        document: ChatDocument,
        intent: DocumentIntent,
        extractionOperationId: str
    ) -> List[ContentPart]:
        """Build the reference / object / extracted parts for a regular document.

        "render" and "extract" may both be present; in that case both kinds of
        parts are produced and linked through metadata.
        """
        parts: List[ContentPart] = []
        wantsRender = "render" in intent.intents
        wantsExtract = "extract" in intent.intents
        # A missing MIME type is treated as "not binary" instead of crashing.
        mimeType = document.mimeType or ""

        if "reference" in intent.intents:
            parts.append(ContentPart(
                id=f"ref_{document.id}",
                label=f"Reference: {document.fileName}",
                typeGroup="reference",
                mimeType=document.mimeType,
                data="",
                metadata={
                    "contentFormat": "reference",
                    "documentId": document.id,
                    "documentReference": f"docItem:{document.id}:{document.fileName}",
                    "intent": "reference",
                    "usageHint": f"Reference document: {document.fileName}"
                }
            ))

        # Images / binaries with render intent become a base64 object part.
        if wantsRender and self._isBinary(mimeType):
            try:
                # getFileData is called synchronously (no await).
                binaryData = self.services.interfaceDbComponent.getFileData(document.fileId)
                if not binaryData:
                    logger.warning(f"No binary data found for document {document.id}")
                    # NOTE(review): as in the original flow, a document without
                    # file data is abandoned here, which also skips "extract".
                    return parts

                parts.append(ContentPart(
                    id=f"obj_{document.id}",
                    label=f"Object: {document.fileName}",
                    typeGroup="image" if mimeType.startswith("image/") else "binary",
                    mimeType=document.mimeType,
                    data=base64.b64encode(binaryData).decode('utf-8'),
                    metadata={
                        "contentFormat": "object",
                        "documentId": document.id,
                        "intent": "render",
                        "usageHint": f"Render as visual element: {document.fileName}",
                        "originalFileName": document.fileName,
                        # NOTE(review): extracted parts below are renamed to
                        # "ext_<documentId>_<partId>", so this value is only a
                        # prefix of their IDs - confirm how consumers resolve it.
                        "relatedExtractedPartId": f"ext_{document.id}" if wantsExtract else None
                    }
                ))
            except Exception as e:
                logger.error(f"Failed to load binary data for document {document.id}: {str(e)}")

        if wantsExtract:
            extractionPrompt = intent.extractionPrompt or self._DEFAULT_DOCUMENT_PROMPT

            self.services.utils.writeDebugFile(
                extractionPrompt,
                f"content_extraction_prompt_{document.id}"
            )

            from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy

            extractionOptions = ExtractionOptions(
                prompt=extractionPrompt,
                mergeStrategy=MergeStrategy()
            )

            # extractContent is called synchronously (no await).
            extractedResults = self.services.extraction.extractContent(
                [document],
                extractionOptions,
                operationId=extractionOperationId,
                parentOperationId=extractionOperationId
            )

            for extracted in extractedResults:
                for part in (extracted.parts or []):
                    # Same guard as for pre-extracted parts: metadata may be unset.
                    if part.metadata is None:
                        part.metadata = {}
                    part.metadata.update({
                        "contentFormat": "extracted",
                        "documentId": document.id,
                        "extractionPrompt": extractionPrompt,
                        "intent": "extract",
                        "usageHint": f"Use extracted content from {document.fileName}",
                        "relatedObjectPartId": f"obj_{document.id}" if wantsRender else None
                    })
                    # Prefix the ID when a render intent is present as well.
                    if wantsRender:
                        part.id = f"ext_{document.id}_{part.id}"
                    parts.append(part)

        return parts

    async def extractTextFromImage(self, imagePart: ContentPart, extractionPrompt: str) -> Optional[str]:
        """
        Extract text from an image part with Vision AI.

        Args:
            imagePart: ContentPart with typeGroup="image".
            extractionPrompt: Prompt for the text extraction; a default prompt
                is used when it is empty.

        Returns:
            The stripped AI response. Failures (exception, missing or blank
            response) are not raised; they are returned as a string of the
            form "[ERROR: <message>]". None is not returned by this
            implementation.
        """
        try:
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum

            finalPrompt = extractionPrompt or self._DEFAULT_IMAGE_PROMPT

            self.services.utils.writeDebugFile(
                finalPrompt,
                f"content_extraction_prompt_image_{imagePart.id}"
            )

            request = AiCallRequest(
                prompt=finalPrompt,
                context="",
                options=AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE),
                contentParts=[imagePart]
            )
            response = await self.aiService.callAi(request)

            content = response.content if response else None
            if content:
                self.services.utils.writeDebugFile(
                    content,
                    f"content_extraction_response_image_{imagePart.id}"
                )
                strippedContent = content.strip()
                if strippedContent:
                    return strippedContent

            # Nothing usable came back: report it as an error string.
            errorMsg = f"Vision AI extraction failed: No content returned for image {imagePart.id}"
            logger.warning(errorMsg)
            return f"[ERROR: {errorMsg}]"
        except Exception as e:
            errorMsg = f"Vision AI extraction failed for image {imagePart.id}: {str(e)}"
            logger.error(errorMsg)
            logger.debug("Traceback:", exc_info=True)
            return f"[ERROR: {errorMsg}]"

    async def processTextContentWithAi(self, textPart: ContentPart, extractionPrompt: str) -> Optional[str]:
        """
        Process text content with AI based on extractionPrompt.

        Pre-extracted ContentParts contain raw extracted text; with an
        "extract" intent that text is sent to the AI together with the prompt.
        The input part is not modified: a text/plain copy of it is sent.

        Args:
            textPart: ContentPart whose data holds the text to process.
            extractionPrompt: Prompt for the AI processing; a default prompt is
                used when it is empty.

        Returns:
            The stripped AI response. Failures (exception, missing or blank
            response) are not raised; they are returned as a string of the
            form "[ERROR: <message>]". None is not returned by this
            implementation.
        """
        try:
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum

            finalPrompt = extractionPrompt or self._DEFAULT_TEXT_PROMPT

            # The debug file gets the prompt plus at most 500 characters of text.
            textPreview = textPart.data[:500] + "..." if textPart.data and len(textPart.data) > 500 else (textPart.data or "")
            promptWithContext = f"{finalPrompt}\n\n--- Text Content (preview) ---\n{textPreview}"
            self.services.utils.writeDebugFile(
                promptWithContext,
                f"content_extraction_prompt_text_{textPart.id}"
            )

            # Plain-text copy of the input part that is handed to the AI.
            textContentPart = ContentPart(
                id=textPart.id,
                label=textPart.label,
                typeGroup="text",
                mimeType="text/plain",
                data=textPart.data if textPart.data else "",
                metadata=textPart.metadata.copy() if textPart.metadata else {}
            )

            request = AiCallRequest(
                prompt=finalPrompt,
                context="",
                options=AiCallOptions(operationType=OperationTypeEnum.DATA_EXTRACT),
                contentParts=[textContentPart]
            )
            response = await self.aiService.callAi(request)

            content = response.content if response else None
            if content:
                self.services.utils.writeDebugFile(
                    content,
                    f"content_extraction_response_text_{textPart.id}"
                )
                strippedContent = content.strip()
                if strippedContent:
                    return strippedContent

            # Nothing usable came back: report it as an error string.
            errorMsg = f"AI text processing failed: No content returned for text part {textPart.id}"
            logger.warning(errorMsg)
            return f"[ERROR: {errorMsg}]"
        except Exception as e:
            errorMsg = f"AI text processing failed for text part {textPart.id}: {str(e)}"
            logger.error(errorMsg)
            logger.debug("Traceback:", exc_info=True)
            return f"[ERROR: {errorMsg}]"

    def _isBinary(self, mimeType: str) -> bool:
        """Return True when the MIME type counts as binary.

        Binary means one of the listed application/* types or any image/,
        video/ or audio/ type. An empty or missing MIME type is not binary.
        """
        if not mimeType:
            return False
        return mimeType in self._BINARY_MIME_TYPES or mimeType.startswith(self._BINARY_MIME_PREFIXES)
|
|
|