# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Content Extraction Module

Handles content extraction and preparation, including:
- Extracting content from documents based on intents
- Processing pre-extracted documents
- Vision AI for image text extraction
- AI processing of text content
"""
import base64
import json
import logging
import traceback
from typing import Any, Callable, Dict, List, Optional

from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelExtraction import (
    ContentPart,
    DocumentIntent,
    ExtractionOptions,
    MergeStrategy,
)

logger = logging.getLogger(__name__)


class ContentExtractor:
    """Handles content extraction and preparation."""

    # MIME types that are treated as binary in addition to the media prefixes below.
    _BINARY_MIME_TYPES = frozenset({
        "application/octet-stream",
        "application/pdf",
        "application/zip",
        "application/x-zip-compressed",
    })
    # Any MIME type starting with one of these prefixes is treated as binary.
    _BINARY_MIME_PREFIXES = ("image/", "video/", "audio/")

    def __init__(self, services, aiService, intentAnalyzer):
        """Initialize ContentExtractor with service center, AI service, and intent analyzer access."""
        self.services = services
        self.aiService = aiService
        self.intentAnalyzer = intentAnalyzer

    async def extractAndPrepareContent(
        self,
        documents: List[ChatDocument],
        documentIntents: List[DocumentIntent],
        parentOperationId: str,
        getIntentForDocument: Callable[..., Optional[DocumentIntent]]
    ) -> List[ContentPart]:
        """
        Phase 5B: Extract content based on intents and prepare ContentParts with metadata.

        A single document can produce several ContentParts when it carries several
        intents. Example: an image with intents=["extract", "render"] produces
        - a ContentPart with contentFormat="object" for rendering, and
        - a ContentPart with contentFormat="extracted" for text analysis.

        Each document is handled by the first matching path:
        1. pre-extracted document (resolved by the intent analyzer),
        2. JSON document that already has a "documents" or "sections" key,
        3. normal intent-driven extraction.

        Args:
            documents: Documents to process.
            documentIntents: DocumentIntent objects for the documents.
            parentOperationId: Parent operation ID for the chat-log hierarchy.
            getIntentForDocument: Callable invoked as
                getIntentForDocument(documentId, documentIntents); expected to
                return the matching intent or a falsy value.

        Returns:
            List of ContentParts with their metadata filled in.

        Raises:
            Exception: Any error raised while processing is logged, the progress
                log is finished as failed, and the error is re-raised.
        """
        extractionOperationId = f"{parentOperationId}_content_extraction"

        # Start the chat log entry with a reference to the parent operation.
        self.services.chat.progressLogStart(
            extractionOperationId,
            "Content Extraction",
            "Extraction",
            f"Extracting from {len(documents)} documents",
            parentOperationId=parentOperationId
        )

        try:
            allContentParts: List[ContentPart] = []

            for document in documents:
                # Path 1: the document is already a ContentExtracted document (pre-extracted JSON).
                logger.debug(f"Checking document {document.id} ({document.fileName}, mimeType={document.mimeType}) for pre-extracted content")
                preExtracted = self.intentAnalyzer.resolvePreExtractedDocument(document)
                if preExtracted:
                    allContentParts.extend(
                        self._collectPreExtractedParts(
                            document, preExtracted, documentIntents, getIntentForDocument
                        )
                    )
                    continue  # Skip normal extraction for this document

                # Path 2: standardized JSON format (has "documents" or "sections").
                jsonReferencePart = self._tryStructuredJsonReference(document)
                if jsonReferencePart is not None:
                    allContentParts.append(jsonReferencePart)
                    continue  # Skip normal extraction for this document

                # Path 3: normal extraction path.
                intent = getIntentForDocument(document.id, documentIntents)
                if not intent:
                    # Default: "extract" for every document without an intent.
                    logger.warning(f"No intent found for document {document.id}, using default 'extract'")
                    intent = DocumentIntent(
                        documentId=document.id,
                        intents=["extract"],
                        extractionPrompt="Extract all content from the document",
                        reasoning="Default intent: no specific intent found"
                    )

                allContentParts.extend(
                    self._extractByIntents(document, intent, extractionOperationId)
                )

            # Debug log (harmonized)
            self.services.utils.writeDebugFile(
                json.dumps([part.dict() for part in allContentParts], indent=2, default=str),
                "content_extraction_result"
            )

            # Close the chat log entry.
            self.services.chat.progressLogFinish(extractionOperationId, True)

            return allContentParts

        except Exception as e:
            self.services.chat.progressLogFinish(extractionOperationId, False)
            logger.error(f"Error in extractAndPrepareContent: {str(e)}")
            raise

    def _collectPreExtractedParts(
        self,
        document: ChatDocument,
        preExtracted: Dict[str, Any],
        documentIntents: List[DocumentIntent],
        getIntentForDocument: Callable[..., Optional[DocumentIntent]]
    ) -> List[ContentPart]:
        """Build ContentParts from an already extracted document, one per part and intent.

        No AI processing happens here; parts are only tagged with metadata so
        that later stages can decide what to do with them.

        Args:
            document: The pre-extracted (JSON) document.
            preExtracted: Mapping with the keys "originalDocument" (itself a
                mapping with "fileName" and "id") and "contentExtracted"
                (object exposing ``parts``).
            documentIntents: All known document intents.
            getIntentForDocument: Intent lookup callable.

        Returns:
            The ContentParts produced for this document (possibly empty).
        """
        collected: List[ContentPart] = []
        originalFileName = preExtracted["originalDocument"]["fileName"]
        contentExtracted = preExtracted["contentExtracted"]

        logger.info(f"✅ Found pre-extracted document: {document.fileName} -> Original: {originalFileName}")
        logger.info(f" Pre-extracted document ID: {document.id}, Original document ID: {preExtracted['originalDocument']['id']}")
        logger.info(f" ContentParts count: {len(contentExtracted.parts) if contentExtracted.parts else 0}")

        # The intent must be looked up for the JSON document, not for the
        # original (per the original author's note, intent analysis already
        # maps back to the JSON document ID).
        intent = getIntentForDocument(document.id, documentIntents)
        logger.info(f" Intent lookup for document {document.id}: found={intent is not None}")
        if intent:
            logger.info(f" Intent: {intent.intents}, extractionPrompt: {intent.extractionPrompt[:100] if intent.extractionPrompt else None}...")
        else:
            logger.warning(f" ⚠️ No intent found for pre-extracted document {document.id}! Available intent documentIds: {[i.documentId for i in documentIntents]}")

        # The intents are the same for every part of the document.
        partIntent = intent.intents if intent else ["extract"]
        hasReferenceIntent = "reference" in partIntent
        hasRenderIntent = "render" in partIntent
        hasExtractIntent = "extract" in partIntent
        promptFromIntent = intent.extractionPrompt if intent and intent.extractionPrompt else None

        for part in (contentExtracted.parts or []):
            # A part has data when it is truthy and, for strings, not whitespace-only.
            hasPartData = bool(part.data) and (not isinstance(part.data, str) or len(part.data.strip()) > 0)

            # Skip empty containers; other empty parts are still processed.
            if not hasPartData and part.typeGroup == "container":
                continue

            if not part.metadata:
                part.metadata = {}

            # Ensure metadata is complete
            if "documentId" not in part.metadata:
                part.metadata["documentId"] = document.id

            logger.debug(f"Processing part {part.id}: typeGroup={part.typeGroup}, intents={partIntent}, hasData={bool(part.data)}, dataLength={len(str(part.data)) if part.data else 0}")
            logger.debug(f"Part {part.id}: reference={hasReferenceIntent}, render={hasRenderIntent}, extract={hasExtractIntent}, hasData={hasPartData}")

            # 1. Reference intent: create a data-less reference ContentPart.
            if hasReferenceIntent:
                referencePart = ContentPart(
                    id=f"ref_{document.id}_{part.id}",
                    label=f"Reference: {part.label or 'Content'}",
                    typeGroup="reference",
                    mimeType=part.mimeType or "application/octet-stream",
                    data="",  # Empty - reference only
                    metadata={
                        "contentFormat": "reference",
                        "documentId": document.id,
                        "documentReference": f"docItem:{document.id}:{originalFileName}",
                        "intent": "reference",
                        "usageHint": f"Reference: {originalFileName}",
                        "originalFileName": originalFileName
                    }
                )
                collected.append(referencePart)
                logger.debug(f"✅ Created reference ContentPart for {part.id}")

            # 2. Render intent: create an object ContentPart (binary/image rendering).
            if hasRenderIntent and hasPartData:
                # _isBinary already covers the image/video/audio prefixes and is None-safe.
                isRenderable = (
                    part.typeGroup in ("image", "binary")
                    or self._isBinary(part.mimeType)
                )
                if isRenderable:
                    objectPart = ContentPart(
                        id=f"obj_{document.id}_{part.id}",
                        label=f"Object: {part.label or 'Content'}",
                        typeGroup=part.typeGroup,
                        mimeType=part.mimeType or "application/octet-stream",
                        data=part.data,  # Base64/binary data is already present
                        metadata={
                            "contentFormat": "object",
                            "documentId": document.id,
                            "intent": "render",
                            "usageHint": f"Render as visual element: {originalFileName}",
                            "originalFileName": originalFileName,
                            # NOTE(review): the extracted part below keeps its own
                            # part.id, so this "extracted_..." value does not match
                            # any id emitted here - confirm what consumers expect.
                            "relatedExtractedPartId": f"extracted_{document.id}_{part.id}" if hasExtractIntent else None
                        }
                    )
                    collected.append(objectPart)
                    logger.debug(f"✅ Created object ContentPart for {part.id} (render intent)")
                else:
                    logger.warning(f"⚠️ Part {part.id} has render intent but is not renderable (typeGroup={part.typeGroup}, mimeType={part.mimeType})")
            elif hasRenderIntent and not hasPartData:
                logger.warning(f"⚠️ Part {part.id} has render intent but no data, skipping render part")

            # 3. Extract intent: reuse the original part, tagged as extracted.
            #    NO AI processing here - that happens during section generation.
            if hasExtractIntent:
                relatedObjectPartId = f"obj_{document.id}_{part.id}" if hasRenderIntent else None
                if part.typeGroup == "image" and hasPartData:
                    # Images stay image parts; Vision AI runs during section generation.
                    logger.info(f"📷 Image {part.id} with extract intent - will be processed with Vision AI during section generation")
                    part.metadata.update({
                        "contentFormat": "extracted",  # Marked for extraction, but not yet extracted
                        "intent": "extract",
                        "originalFileName": originalFileName,
                        "relatedObjectPartId": relatedObjectPartId,
                        "extractionPrompt": promptFromIntent or "Extract all text content from this image.",
                        "needsVisionExtraction": True  # Flag to indicate Vision AI extraction needed
                    })
                    collected.append(part)
                else:
                    # Text/table content: use directly as extracted; AI processing
                    # with extractionPrompt happens during section generation.
                    part.metadata.update({
                        "contentFormat": "extracted",
                        "intent": "extract",
                        "fromExtractContent": True,
                        "skipExtraction": True,  # Already extracted (raw extraction)
                        "originalFileName": originalFileName,
                        "relatedObjectPartId": relatedObjectPartId,
                        "extractionPrompt": promptFromIntent
                    })
                    collected.append(part)
                    logger.debug(f"✅ Using pre-extracted ContentPart {part.id} as extracted (no AI processing needed)")

            # 4. Fallback: none of the known intents is present (for example an
            #    empty or unrecognized intent list) - add the part as extracted.
            if not (hasReferenceIntent or hasRenderIntent or hasExtractIntent):
                logger.warning(f"⚠️ Part {part.id} has no recognized intents, adding as extracted by default")
                part.metadata.update({
                    "contentFormat": "extracted",
                    "intent": "extract",
                    "fromExtractContent": True,
                    "skipExtraction": True,
                    "originalFileName": originalFileName
                })
                collected.append(part)

        usablePartCount = sum(
            1 for p in (contentExtracted.parts or [])
            if p.data and len(str(p.data)) > 0
        )
        logger.info(f"✅ Using {usablePartCount} pre-extracted ContentParts from ContentExtracted document {document.fileName}")
        logger.info(f" Original document: {originalFileName}")
        return collected

    def _tryStructuredJsonReference(self, document: ChatDocument) -> Optional[ContentPart]:
        """Return a reference ContentPart if the document is standardized JSON, else None.

        A document qualifies when its MIME type is "application/json" and its
        decoded content is a JSON object containing a "documents" or "sections"
        key. Any error while loading or parsing is logged as a warning and
        results in None, so the caller falls back to normal extraction.
        """
        if document.mimeType != "application/json":
            return None

        try:
            docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
            if not docBytes:
                return None

            docData = docBytes.decode('utf-8')
            jsonData = json.loads(docData)
            isStandardized = isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData)
            if not isStandardized:
                return None

            logger.info("Document is already in standardized JSON format, using as reference")
            # The full JSON text is carried as data of the reference part.
            contentPart = ContentPart(
                id=f"ref_{document.id}",
                label=f"Reference: {document.fileName}",
                typeGroup="structure",
                mimeType="application/json",
                data=docData,
                metadata={
                    "contentFormat": "reference",
                    "documentId": document.id,
                    "documentReference": f"docItem:{document.id}:{document.fileName}",
                    "skipExtraction": True,
                    "intent": "reference"
                }
            )
            logger.info("✅ Using JSON document directly without extraction")
            return contentPart
        except Exception as e:
            # Best effort: fall back to normal extraction.
            logger.warning(f"Could not parse JSON document {document.fileName}, will extract normally: {str(e)}")
            return None

    def _extractByIntents(
        self,
        document: ChatDocument,
        intent: DocumentIntent,
        extractionOperationId: str
    ) -> List[ContentPart]:
        """Produce ContentParts for one document on the normal extraction path.

        Handles the "reference", "render" and "extract" intents independently;
        "render" and "extract" may both be present, in which case both kinds of
        ContentParts are produced.

        Args:
            document: Document to process.
            intent: Intent (never None) that applies to the document.
            extractionOperationId: Operation ID passed on to the extraction service.

        Returns:
            The ContentParts produced for this document (possibly empty).
        """
        produced: List[ContentPart] = []
        wantsRender = "render" in intent.intents
        wantsExtract = "extract" in intent.intents

        if "reference" in intent.intents:
            produced.append(ContentPart(
                id=f"ref_{document.id}",
                label=f"Reference: {document.fileName}",
                typeGroup="reference",
                mimeType=document.mimeType,
                data="",
                metadata={
                    "contentFormat": "reference",
                    "documentId": document.id,
                    "documentReference": f"docItem:{document.id}:{document.fileName}",
                    "intent": "reference",
                    "usageHint": f"Reference document: {document.fileName}"
                }
            ))

        # For images/binary: provide the raw content as a base64 object part.
        # _isBinary also matches "image/*" and tolerates a missing MIME type.
        if wantsRender and self._isBinary(document.mimeType):
            try:
                # getFileData is not async - no await needed.
                binaryData = self.services.interfaceDbComponent.getFileData(document.fileId)
                if not binaryData:
                    logger.warning(f"No binary data found for document {document.id}")
                    # NOTE(review): this keeps the original behavior of abandoning
                    # the document here, which also skips an "extract" intent,
                    # whereas the exception path below still extracts - confirm
                    # whether that asymmetry is intended.
                    return produced

                base64Data = base64.b64encode(binaryData).decode('utf-8')
                produced.append(ContentPart(
                    id=f"obj_{document.id}",
                    label=f"Object: {document.fileName}",
                    typeGroup="image" if document.mimeType.startswith("image/") else "binary",
                    mimeType=document.mimeType,
                    data=base64Data,
                    metadata={
                        "contentFormat": "object",
                        "documentId": document.id,
                        "intent": "render",
                        "usageHint": f"Render as visual element: {document.fileName}",
                        "originalFileName": document.fileName,
                        # Link to the extracted part (if any).
                        # NOTE(review): extracted parts are renamed to
                        # "ext_<documentId>_<partId>" below, so this value is only
                        # a prefix of their ids - confirm what consumers expect.
                        "relatedExtractedPartId": f"ext_{document.id}" if wantsExtract else None
                    }
                ))
            except Exception as e:
                logger.error(f"Failed to load binary data for document {document.id}: {str(e)}")

        if wantsExtract:
            extractionPrompt = intent.extractionPrompt or "Extract all content from the document"

            # Debug log (harmonized)
            self.services.utils.writeDebugFile(
                extractionPrompt,
                f"content_extraction_prompt_{document.id}"
            )

            extractionOptions = ExtractionOptions(
                prompt=extractionPrompt,
                mergeStrategy=MergeStrategy()
            )

            # extractContent is not async - no await needed.
            extractedResults = self.services.extraction.extractContent(
                [document],
                extractionOptions,
                operationId=extractionOperationId,
                parentOperationId=extractionOperationId
            )

            # Convert the extraction results to ContentParts with metadata.
            for extracted in extractedResults:
                for part in (extracted.parts or []):
                    # Same guard as on the pre-extracted path.
                    if not part.metadata:
                        part.metadata = {}

                    part.metadata.update({
                        "contentFormat": "extracted",
                        "documentId": document.id,
                        "extractionPrompt": extractionPrompt,
                        "intent": "extract",
                        "usageHint": f"Use extracted content from {document.fileName}",
                        # Link to the object part (if any).
                        "relatedObjectPartId": f"obj_{document.id}" if wantsRender else None
                    })

                    # Images: Vision AI extraction is deferred to section generation.
                    if part.typeGroup == "image":
                        part.metadata["needsVisionExtraction"] = True
                        logger.info(f"📷 Image part {part.id} marked for Vision AI extraction during section generation")

                    # Keep ids unique when an object part may exist as well.
                    if wantsRender:
                        part.id = f"ext_{document.id}_{part.id}"

                    produced.append(part)

        return produced

    async def extractTextFromImage(self, imagePart: ContentPart, extractionPrompt: str) -> Optional[str]:
        """
        Extract text from an image part using Vision AI.

        Args:
            imagePart: ContentPart with typeGroup="image".
            extractionPrompt: Prompt for the text extraction; a generic default
                prompt is used when it is empty.

        Returns:
            The stripped response text. On failure (no content returned or an
            exception) a string of the form "[ERROR: <message>]" is returned
            instead, so that the failure is visible for debugging; this method
            does not return None and does not raise.
        """
        try:
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum

            # Final extraction prompt
            finalPrompt = extractionPrompt or "Extract all text content from this image. Return only the extracted text, no additional formatting."

            # Debug log (harmonized)
            self.services.utils.writeDebugFile(
                finalPrompt,
                f"content_extraction_prompt_image_{imagePart.id}"
            )

            # Build the AI call request carrying the image part.
            request = AiCallRequest(
                prompt=finalPrompt,
                context="",
                options=AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE),
                contentParts=[imagePart]
            )

            response = await self.aiService.callAi(request)

            if response and response.content:
                # Debug log for the response (harmonized)
                self.services.utils.writeDebugFile(
                    response.content,
                    f"content_extraction_response_image_{imagePart.id}"
                )
                return response.content.strip()

            # No content returned - hand back an error marker for debugging.
            errorMsg = f"Vision AI extraction failed: No content returned for image {imagePart.id}"
            logger.warning(errorMsg)
            return f"[ERROR: {errorMsg}]"

        except Exception as e:
            errorMsg = f"Vision AI extraction failed for image {imagePart.id}: {str(e)}"
            logger.error(errorMsg)
            logger.debug(f"Traceback: {traceback.format_exc()}")
            # Return an error marker instead of None for debugging.
            return f"[ERROR: {errorMsg}]"

    async def processTextContentWithAi(self, textPart: ContentPart, extractionPrompt: str) -> Optional[str]:
        """
        Process text content with AI based on the extraction prompt.

        Per the original author's note, pre-extracted ContentParts contain RAW
        extracted text (e.g. from a PDF text layer); when the "extract" intent
        is present that text has to be processed with AI (transformation,
        structuring, ...) based on extractionPrompt.

        Args:
            textPart: ContentPart with text-based content; it is sent to the AI
                as a copy with typeGroup="text" and mimeType="text/plain".
            extractionPrompt: Prompt for the AI processing; a generic default
                prompt is used when it is empty.

        Returns:
            The stripped response text. On failure (no content returned or an
            exception) a string of the form "[ERROR: <message>]" is returned
            instead, so that the failure is visible for debugging; this method
            does not return None and does not raise.
        """
        try:
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum

            # Final extraction prompt
            finalPrompt = extractionPrompt or "Process and extract the key information from the following text content."

            # Debug log (harmonized) - prompt plus a preview of at most 500 characters.
            textPreview = textPart.data[:500] + "..." if textPart.data and len(textPart.data) > 500 else (textPart.data or "")
            promptWithContext = f"{finalPrompt}\n\n--- Text Content (preview) ---\n{textPreview}"
            self.services.utils.writeDebugFile(
                promptWithContext,
                f"content_extraction_prompt_text_{textPart.id}"
            )

            # Copy of the part, normalized to plain text, used as AI input so
            # that the caller's part is not modified.
            textContentPart = ContentPart(
                id=textPart.id,
                label=textPart.label,
                typeGroup="text",
                mimeType="text/plain",
                data=textPart.data if textPart.data else "",
                metadata=textPart.metadata.copy() if textPart.metadata else {}
            )

            request = AiCallRequest(
                prompt=finalPrompt,
                context="",
                options=AiCallOptions(operationType=OperationTypeEnum.DATA_EXTRACT),
                contentParts=[textContentPart]
            )

            response = await self.aiService.callAi(request)

            if response and response.content:
                # Debug log for the response (harmonized)
                self.services.utils.writeDebugFile(
                    response.content,
                    f"content_extraction_response_text_{textPart.id}"
                )
                return response.content.strip()

            # No content returned - hand back an error marker for debugging.
            errorMsg = f"AI text processing failed: No content returned for text part {textPart.id}"
            logger.warning(errorMsg)
            return f"[ERROR: {errorMsg}]"

        except Exception as e:
            errorMsg = f"AI text processing failed for text part {textPart.id}: {str(e)}"
            logger.error(errorMsg)
            logger.debug(f"Traceback: {traceback.format_exc()}")
            # Return an error marker instead of None for debugging.
            return f"[ERROR: {errorMsg}]"

    def _isBinary(self, mimeType: Optional[str]) -> bool:
        """Check whether a MIME type is treated as binary.

        Args:
            mimeType: MIME type string; None or "" is accepted.

        Returns:
            True for the listed application types and for any "image/",
            "video/" or "audio/" type; False otherwise, including for a missing
            MIME type.
        """
        if not mimeType:
            return False
        return mimeType in self._BINARY_MIME_TYPES or mimeType.startswith(self._BINARY_MIME_PREFIXES)