# gateway/modules/services/serviceAi/subContentExtraction.py
# (file listing metadata: 540 lines, 31 KiB, Python)

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Content Extraction Module
Handles content extraction and preparation, including:
- Extracting content from documents based on intents
- Processing pre-extracted documents
- Vision AI for image text extraction
- AI processing of text content
"""
import json
import logging
import base64
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelExtraction import ContentPart, DocumentIntent
logger = logging.getLogger(__name__)
class ContentExtractor:
    """Handles content extraction and preparation."""

    def __init__(self, services, aiService, intentAnalyzer):
        """Initialize ContentExtractor with service center, AI service, and intent analyzer access."""
        # Service center: chat progress logs, db component, utils, extraction service.
        self.services = services
        # AI service used for Vision/text AI calls (callAi).
        self.aiService = aiService
        # Intent analyzer used to resolve pre-extracted documents.
        self.intentAnalyzer = intentAnalyzer
    async def extractAndPrepareContent(
        self,
        documents: List[ChatDocument],
        documentIntents: List[DocumentIntent],
        parentOperationId: str,
        getIntentForDocument: callable
    ) -> List[ContentPart]:
        """
        Phase 5B: Extract content based on intents and prepare ContentParts with metadata.

        Returns a list of ContentParts in the matching format.

        IMPORTANT: one document can produce several ContentParts when several intents
        are present. Example: an image with intents=["extract", "render"] produces:
        - ContentPart(contentFormat="object", ...) for rendering
        - ContentPart(contentFormat="extracted", ...) for text analysis

        Args:
            documents: List of documents to process
            documentIntents: List of DocumentIntent objects
            parentOperationId: Parent operation ID for the ChatLog hierarchy
            getIntentForDocument: Callable to get intent for document ID

        Returns:
            List of ContentParts with complete metadata

        Raises:
            Exception: re-raised after finishing the progress log with failure.
        """
        # Create an operation ID for the extraction step
        extractionOperationId = f"{parentOperationId}_content_extraction"
        # Start a ChatLog entry with parent reference
        self.services.chat.progressLogStart(
            extractionOperationId,
            "Content Extraction",
            "Extraction",
            f"Extracting from {len(documents)} documents",
            parentOperationId=parentOperationId
        )
        try:
            allContentParts = []
            for document in documents:
                # Check if document is already a ContentExtracted document (pre-extracted JSON)
                logger.debug(f"Checking document {document.id} ({document.fileName}, mimeType={document.mimeType}) for pre-extracted content")
                preExtracted = self.intentAnalyzer.resolvePreExtractedDocument(document)
                if preExtracted:
                    logger.info(f"✅ Found pre-extracted document: {document.fileName} -> Original: {preExtracted['originalDocument']['fileName']}")
                    logger.info(f" Pre-extracted document ID: {document.id}, Original document ID: {preExtracted['originalDocument']['id']}")
                    logger.info(f" ContentParts count: {len(preExtracted['contentExtracted'].parts) if preExtracted['contentExtracted'].parts else 0}")
                    # Use the already-extracted ContentParts directly
                    contentExtracted = preExtracted["contentExtracted"]
                    # IMPORTANT: the intent must be looked up for the JSON document, not the original
                    # (intent analysis already maps back to the JSON document ID)
                    intent = getIntentForDocument(document.id, documentIntents)
                    logger.info(f" Intent lookup for document {document.id}: found={intent is not None}")
                    if intent:
                        logger.info(f" Intent: {intent.intents}, extractionPrompt: {intent.extractionPrompt[:100] if intent.extractionPrompt else None}...")
                    else:
                        logger.warning(f" ⚠️ No intent found for pre-extracted document {document.id}! Available intent documentIds: {[i.documentId for i in documentIntents]}")
                    if contentExtracted.parts:
                        for part in contentExtracted.parts:
                            # Skip empty parts (containers without data); empty non-container
                            # parts fall through and are filtered later via hasPartData
                            if not part.data or (isinstance(part.data, str) and len(part.data.strip()) == 0):
                                if part.typeGroup == "container":
                                    continue  # skip empty containers
                            if not part.metadata:
                                part.metadata = {}
                            # Ensure metadata is complete
                            if "documentId" not in part.metadata:
                                part.metadata["documentId"] = document.id
                            # IMPORTANT: check the intent for this part
                            partIntent = intent.intents if intent else ["extract"]
                            # Debug logging for intent processing
                            logger.debug(f"Processing part {part.id}: typeGroup={part.typeGroup}, intents={partIntent}, hasData={bool(part.data)}, dataLength={len(str(part.data)) if part.data else 0}")
                            # IMPORTANT: a part can carry several intents - create one ContentPart per intent
                            # Generic intent handling for ALL content types
                            hasReferenceIntent = "reference" in partIntent
                            hasRenderIntent = "render" in partIntent
                            hasExtractIntent = "extract" in partIntent
                            hasPartData = bool(part.data) and (not isinstance(part.data, str) or len(part.data.strip()) > 0)
                            logger.debug(f"Part {part.id}: reference={hasReferenceIntent}, render={hasRenderIntent}, extract={hasExtractIntent}, hasData={hasPartData}")
                            # Track whether the original part was already appended
                            originalPartAdded = False
                            # 1. Reference intent: create a reference ContentPart
                            if hasReferenceIntent:
                                referencePart = ContentPart(
                                    id=f"ref_{document.id}_{part.id}",
                                    label=f"Reference: {part.label or 'Content'}",
                                    typeGroup="reference",
                                    mimeType=part.mimeType or "application/octet-stream",
                                    data="",  # empty - reference only
                                    metadata={
                                        "contentFormat": "reference",
                                        "documentId": document.id,
                                        "documentReference": f"docItem:{document.id}:{preExtracted['originalDocument']['fileName']}",
                                        "intent": "reference",
                                        "usageHint": f"Reference: {preExtracted['originalDocument']['fileName']}",
                                        "originalFileName": preExtracted["originalDocument"]["fileName"]
                                    }
                                )
                                allContentParts.append(referencePart)
                                logger.debug(f"✅ Created reference ContentPart for {part.id}")
                            # 2. Render intent: create an object ContentPart (for binary/image rendering)
                            if hasRenderIntent and hasPartData:
                                # Check whether it is a binary/image (can be rendered)
                                isRenderable = (
                                    part.typeGroup == "image" or
                                    part.typeGroup == "binary" or
                                    (part.mimeType and (
                                        part.mimeType.startswith("image/") or
                                        part.mimeType.startswith("video/") or
                                        part.mimeType.startswith("audio/") or
                                        self._isBinary(part.mimeType)
                                    ))
                                )
                                if isRenderable:
                                    objectPart = ContentPart(
                                        id=f"obj_{document.id}_{part.id}",
                                        label=f"Object: {part.label or 'Content'}",
                                        typeGroup=part.typeGroup,
                                        mimeType=part.mimeType or "application/octet-stream",
                                        data=part.data,  # base64/binary data is already present
                                        metadata={
                                            "contentFormat": "object",
                                            "documentId": document.id,
                                            "intent": "render",
                                            "usageHint": f"Render as visual element: {preExtracted['originalDocument']['fileName']}",
                                            "originalFileName": preExtracted["originalDocument"]["fileName"],
                                            "relatedExtractedPartId": f"extracted_{document.id}_{part.id}" if hasExtractIntent else None
                                        }
                                    )
                                    allContentParts.append(objectPart)
                                    logger.debug(f"✅ Created object ContentPart for {part.id} (render intent)")
                                else:
                                    logger.warning(f"⚠️ Part {part.id} has render intent but is not renderable (typeGroup={part.typeGroup}, mimeType={part.mimeType})")
                            elif hasRenderIntent and not hasPartData:
                                logger.warning(f"⚠️ Part {part.id} has render intent but no data, skipping render part")
                            # 3. Extract intent: create extracted ContentPart (NO AI processing here - happens during section generation)
                            if hasExtractIntent:
                                # For images: keep as image part with extract intent - Vision AI extraction happens during section generation
                                if part.typeGroup == "image" and hasPartData:
                                    logger.info(f"📷 Image {part.id} with extract intent - will be processed with Vision AI during section generation")
                                    # Keep image part as-is, mark with extract intent
                                    part.metadata.update({
                                        "contentFormat": "extracted",  # marked for extraction, but not yet extracted
                                        "intent": "extract",
                                        "originalFileName": preExtracted["originalDocument"]["fileName"],
                                        "relatedObjectPartId": f"obj_{document.id}_{part.id}" if hasRenderIntent else None,
                                        "extractionPrompt": intent.extractionPrompt if intent and intent.extractionPrompt else "Extract all text content from this image.",
                                        "needsVisionExtraction": True  # flag indicating Vision AI extraction is needed
                                    })
                                    allContentParts.append(part)
                                    originalPartAdded = True
                                else:
                                    # For text/table content: use directly as extracted (no AI processing here)
                                    # AI processing with extractionPrompt happens during section generation
                                    if not originalPartAdded:
                                        part.metadata.update({
                                            "contentFormat": "extracted",
                                            "intent": "extract",
                                            "fromExtractContent": True,
                                            "skipExtraction": True,  # already extracted (raw extraction)
                                            "originalFileName": preExtracted["originalDocument"]["fileName"],
                                            "relatedObjectPartId": f"obj_{document.id}_{part.id}" if hasRenderIntent else None,
                                            "extractionPrompt": intent.extractionPrompt if intent and intent.extractionPrompt else None
                                        })
                                        # Make sure contentFormat is set
                                        if "contentFormat" not in part.metadata:
                                            part.metadata["contentFormat"] = "extracted"
                                        allContentParts.append(part)
                                        originalPartAdded = True
                                        logger.debug(f"✅ Using pre-extracted ContentPart {part.id} as extracted (no AI processing needed)")
                            # 4. Fallback: no intent present or the part was not yet appended
                            # (should normally not happen, since the default is "extract")
                            if not hasReferenceIntent and not hasRenderIntent and not hasExtractIntent and not originalPartAdded:
                                logger.warning(f"⚠️ Part {part.id} has no recognized intents, adding as extracted by default")
                                part.metadata.update({
                                    "contentFormat": "extracted",
                                    "intent": "extract",
                                    "fromExtractContent": True,
                                    "skipExtraction": True,
                                    "originalFileName": preExtracted["originalDocument"]["fileName"]
                                })
                                allContentParts.append(part)
                                originalPartAdded = True
                    logger.info(f"✅ Using {len([p for p in contentExtracted.parts if p.data and len(str(p.data)) > 0])} pre-extracted ContentParts from ContentExtracted document {document.fileName}")
                    logger.info(f" Original document: {preExtracted['originalDocument']['fileName']}")
                    continue  # skip normal extraction for this document
                # Check if it's standardized JSON format (has "documents" or "sections")
                if document.mimeType == "application/json":
                    try:
                        docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
                        if docBytes:
                            docData = docBytes.decode('utf-8')
                            jsonData = json.loads(docData)
                            if isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData):
                                logger.info(f"Document is already in standardized JSON format, using as reference")
                                # Create reference ContentPart for structured JSON
                                contentPart = ContentPart(
                                    id=f"ref_{document.id}",
                                    label=f"Reference: {document.fileName}",
                                    typeGroup="structure",
                                    mimeType="application/json",
                                    data=docData,
                                    metadata={
                                        "contentFormat": "reference",
                                        "documentId": document.id,
                                        "documentReference": f"docItem:{document.id}:{document.fileName}",
                                        "skipExtraction": True,
                                        "intent": "reference"
                                    }
                                )
                                allContentParts.append(contentPart)
                                logger.info(f"✅ Using JSON document directly without extraction")
                                continue  # skip normal extraction for this document
                    except Exception as e:
                        logger.warning(f"Could not parse JSON document {document.fileName}, will extract normally: {str(e)}")
                        # Continue with normal extraction
                # Normal extraction path
                intent = getIntentForDocument(document.id, documentIntents)
                if not intent:
                    # Default: extract for all documents without an intent
                    logger.warning(f"No intent found for document {document.id}, using default 'extract'")
                    intent = DocumentIntent(
                        documentId=document.id,
                        intents=["extract"],
                        extractionPrompt="Extract all content from the document",
                        reasoning="Default intent: no specific intent found"
                    )
                # IMPORTANT: check all intents - one document may produce several ContentParts
                if "reference" in intent.intents:
                    # Create a reference ContentPart
                    contentPart = ContentPart(
                        id=f"ref_{document.id}",
                        label=f"Reference: {document.fileName}",
                        typeGroup="reference",
                        mimeType=document.mimeType,
                        data="",
                        metadata={
                            "contentFormat": "reference",
                            "documentId": document.id,
                            "documentReference": f"docItem:{document.id}:{document.fileName}",
                            "intent": "reference",
                            "usageHint": f"Reference document: {document.fileName}"
                        }
                    )
                    allContentParts.append(contentPart)
                # IMPORTANT: "render" and "extract" may both be present!
                # In that case we create BOTH ContentParts
                if "render" in intent.intents:
                    # For images/binary: extract as object
                    if document.mimeType.startswith("image/") or self._isBinary(document.mimeType):
                        try:
                            # Load binary data (getFileData is not async - no await needed)
                            binaryData = self.services.interfaceDbComponent.getFileData(document.fileId)
                            if not binaryData:
                                logger.warning(f"No binary data found for document {document.id}")
                                continue
                            base64Data = base64.b64encode(binaryData).decode('utf-8')
                            contentPart = ContentPart(
                                id=f"obj_{document.id}",
                                label=f"Object: {document.fileName}",
                                typeGroup="image" if document.mimeType.startswith("image/") else "binary",
                                mimeType=document.mimeType,
                                data=base64Data,
                                metadata={
                                    "contentFormat": "object",
                                    "documentId": document.id,
                                    "intent": "render",
                                    "usageHint": f"Render as visual element: {document.fileName}",
                                    "originalFileName": document.fileName,
                                    # Link to the extracted part (if any)
                                    "relatedExtractedPartId": f"ext_{document.id}" if "extract" in intent.intents else None
                                }
                            )
                            allContentParts.append(contentPart)
                        except Exception as e:
                            logger.error(f"Failed to load binary data for document {document.id}: {str(e)}")
                if "extract" in intent.intents:
                    # Extract content via the extraction service
                    extractionPrompt = intent.extractionPrompt or "Extract all content from the document"
                    # Debug log (harmonized)
                    self.services.utils.writeDebugFile(
                        extractionPrompt,
                        f"content_extraction_prompt_{document.id}"
                    )
                    # Run the extraction
                    from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy
                    extractionOptions = ExtractionOptions(
                        prompt=extractionPrompt,
                        mergeStrategy=MergeStrategy()
                    )
                    # extractContent is not async - no await needed
                    extractedResults = self.services.extraction.extractContent(
                        [document],
                        extractionOptions,
                        operationId=extractionOperationId,
                        parentOperationId=extractionOperationId
                    )
                    # Convert extracted results into ContentParts with metadata
                    for extracted in extractedResults:
                        for part in extracted.parts:
                            # Mark as extracted format
                            part.metadata.update({
                                "contentFormat": "extracted",
                                "documentId": document.id,
                                "extractionPrompt": extractionPrompt,
                                "intent": "extract",
                                "usageHint": f"Use extracted content from {document.fileName}",
                                # Link to the object part (if any)
                                "relatedObjectPartId": f"obj_{document.id}" if "render" in intent.intents else None
                            })
                            # For images: mark that Vision AI extraction is needed during section generation
                            if part.typeGroup == "image":
                                part.metadata["needsVisionExtraction"] = True
                                logger.info(f"📷 Image part {part.id} marked for Vision AI extraction during section generation")
                            # Make sure the ID is unique (in case an object part exists)
                            if "render" in intent.intents:
                                part.id = f"ext_{document.id}_{part.id}"
                            allContentParts.append(part)
            # Debug log (harmonized)
            self.services.utils.writeDebugFile(
                json.dumps([part.dict() for part in allContentParts], indent=2, default=str),
                "content_extraction_result"
            )
            # Finish the ChatLog entry
            self.services.chat.progressLogFinish(extractionOperationId, True)
            return allContentParts
        except Exception as e:
            self.services.chat.progressLogFinish(extractionOperationId, False)
            logger.error(f"Error in extractAndPrepareContent: {str(e)}")
            raise
async def extractTextFromImage(self, imagePart: ContentPart, extractionPrompt: str) -> Optional[str]:
"""
Extrahiere Text aus einem Image-Part mit Vision AI.
Args:
imagePart: ContentPart mit typeGroup="image"
extractionPrompt: Prompt für die Text-Extraktion
Returns:
Extrahierter Text oder None bei Fehler
"""
try:
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
# Final extraction prompt
finalPrompt = extractionPrompt or "Extract all text content from this image. Return only the extracted text, no additional formatting."
# Debug-Log (harmonisiert)
self.services.utils.writeDebugFile(
finalPrompt,
f"content_extraction_prompt_image_{imagePart.id}"
)
# Erstelle AI-Call-Request mit Image-Part
request = AiCallRequest(
prompt=finalPrompt,
context="",
options=AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE),
contentParts=[imagePart]
)
# Verwende AI-Service für Vision AI-Verarbeitung
response = await self.aiService.callAi(request)
# Debug-Log für Response (harmonisiert)
if response and response.content:
self.services.utils.writeDebugFile(
response.content,
f"content_extraction_response_image_{imagePart.id}"
)
if response and response.content:
return response.content.strip()
# Kein Content zurückgegeben - return error message für Debugging
errorMsg = f"Vision AI extraction failed: No content returned for image {imagePart.id}"
logger.warning(errorMsg)
return f"[ERROR: {errorMsg}]"
except Exception as e:
errorMsg = f"Vision AI extraction failed for image {imagePart.id}: {str(e)}"
logger.error(errorMsg)
import traceback
logger.debug(f"Traceback: {traceback.format_exc()}")
# Return error message statt None für Debugging
return f"[ERROR: {errorMsg}]"
async def processTextContentWithAi(self, textPart: ContentPart, extractionPrompt: str) -> Optional[str]:
"""
Verarbeite Text-Content mit AI basierend auf extractionPrompt.
WICHTIG: Pre-extracted ContentParts von context.extractContent enthalten RAW extrahierten Text
(z.B. aus PDF-Text-Layer). Wenn "extract" Intent vorhanden ist, muss dieser Text mit AI
verarbeitet werden (Transformation, Strukturierung, etc.) basierend auf extractionPrompt.
Args:
textPart: ContentPart mit typeGroup="text" (oder anderer Text-basierter Typ)
extractionPrompt: Prompt für die AI-Verarbeitung des Textes
Returns:
AI-verarbeiteter Text oder None bei Fehler
"""
try:
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
# Final extraction prompt
finalPrompt = extractionPrompt or "Process and extract the key information from the following text content."
# Debug-Log (harmonisiert) - log prompt with text preview
textPreview = textPart.data[:500] + "..." if textPart.data and len(textPart.data) > 500 else (textPart.data or "")
promptWithContext = f"{finalPrompt}\n\n--- Text Content (preview) ---\n{textPreview}"
self.services.utils.writeDebugFile(
promptWithContext,
f"content_extraction_prompt_text_{textPart.id}"
)
# Erstelle Text-ContentPart für AI-Verarbeitung
# Verwende den vorhandenen Text als Input
textContentPart = ContentPart(
id=textPart.id,
label=textPart.label,
typeGroup="text",
mimeType="text/plain",
data=textPart.data if textPart.data else "",
metadata=textPart.metadata.copy() if textPart.metadata else {}
)
# Erstelle AI-Call-Request mit Text-Part
request = AiCallRequest(
prompt=finalPrompt,
context="",
options=AiCallOptions(operationType=OperationTypeEnum.DATA_EXTRACT),
contentParts=[textContentPart]
)
# Verwende AI-Service für Text-Verarbeitung
response = await self.aiService.callAi(request)
# Debug-Log für Response (harmonisiert)
if response and response.content:
self.services.utils.writeDebugFile(
response.content,
f"content_extraction_response_text_{textPart.id}"
)
if response and response.content:
return response.content.strip()
# Kein Content zurückgegeben - return error message für Debugging
errorMsg = f"AI text processing failed: No content returned for text part {textPart.id}"
logger.warning(errorMsg)
return f"[ERROR: {errorMsg}]"
except Exception as e:
errorMsg = f"AI text processing failed for text part {textPart.id}: {str(e)}"
logger.error(errorMsg)
import traceback
logger.debug(f"Traceback: {traceback.format_exc()}")
# Return error message statt None für Debugging
return f"[ERROR: {errorMsg}]"
def _isBinary(self, mimeType: str) -> bool:
"""Prüfe ob MIME-Type binary ist."""
binaryTypes = [
"application/octet-stream",
"application/pdf",
"application/zip",
"application/x-zip-compressed"
]
return mimeType in binaryTypes or mimeType.startswith("image/") or mimeType.startswith("video/") or mimeType.startswith("audio/")