""" Refactored helper function for intelligent data extraction (continued). """ import logging import json from typing import List, Dict, Any, Optional, Tuple import asyncio from datetime import datetime logger = logging.getLogger(__name__) async def data_extraction( prompt: str, files: List[Dict[str, Any]], messages: List[Dict[str, Any]], ai_service, lucydom_interface = None, workflow_id: str = None, add_log_func = None ) -> Dict[str, Any]: """ Führt einen AI Call durch, um zu bestimmen, welche Inhalte aus welchen Dateiobjekten extrahiert werden sollen, und führt dann die notwendigen Extraktionen durch. Args: prompt: Spezifizierung, welche Daten extrahiert werden sollen files: Liste aller verfügbaren Dateien mit Metadaten messages: Liste aller Nachrichten im Workflow ai_service: Service für KI-Anfragen lucydom_interface: Interface für Datenbankzugriffe (optional) workflow_id: Optionale ID des Workflows für Logging add_log_func: Optionale Funktion für das Hinzufügen von Logs Returns: Strukturiertes Text-Objekt mit extrahierten Daten und Kontext-Informationen """ try: # 1. AI Call zur Bestimmung der notwendigen Extraktionen extraction_plan = await _create_extraction_plan(prompt, files, messages, ai_service, workflow_id, add_log_func) # 2. Extraktionen durchführen extracted_data = await _execute_extractions( extraction_plan, files, messages, lucydom_interface, ai_service, workflow_id, add_log_func ) # 3. Extrahierte Daten strukturieren structured_result = _structure_extracted_data(extracted_data, files, prompt) return structured_result except Exception as e: logger.error(f"Fehler bei der Datenextraktion: {str(e)}", exc_info=True) # Fehler-Log hinzufügen if add_log_func and workflow_id: add_log_func(workflow_id, f"Fehler bei der Datenextraktion: {str(e)}", "error") # Fehler-Ergebnis zurückgeben return { "error": str(e), "status": "error", "files_processed": len(files), "message": f"Die Datenextraktion konnte nicht durchgeführt werden: {str(e)}" } async def _create_extraction_plan( prompt: str, files: List[Dict[str, Any]], messages: List[Dict[str, Any]], ai_service, workflow_id: str = None, add_log_func = None ) -> List[Dict[str, Any]]: """ Erstellt einen Extraktionsplan mit AI-Unterstützung. Args: prompt: Spezifizierung, welche Daten extrahiert werden sollen files: Liste aller verfügbaren Dateien mit Metadaten messages: Liste aller Nachrichten im Workflow ai_service: Service für KI-Anfragen workflow_id: Optionale ID des Workflows für Logging add_log_func: Optionale Funktion für das Hinzufügen von Logs Returns: Extraktionsplan (Liste von Extraktionsanweisungen pro Datei) """ # Erstelle Kontext-Informationen für den AI Call file_infos = [] for file in files: # Basis-Metadaten file_info = { "id": file.get("id", ""), "name": file.get("name", ""), "type": file.get("type", ""), "content_type": file.get("content_type", ""), "size": file.get("size", "") } # Extraktionsstatus prüfen (falls vorhanden) doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages) if doc_contents: # Prüfen, ob mindestens ein Content mit is_extracted=True existiert already_extracted = any( content.get("is_extracted", False) for content in doc_contents ) file_info["already_extracted"] = already_extracted # Eine kurze Vorschau des Inhalts hinzufügen (falls verfügbar) for content in doc_contents: if content.get("type") == "text" and content.get("text"): preview_text = content.get("text", "")[:200] + "..." 
if len(content.get("text", "")) > 200 else content.get("text", "") file_info["content_preview"] = preview_text break else: file_info["already_extracted"] = False file_infos.append(file_info) # AI-Prompt erstellen extraction_prompt = f""" Du bist ein Datenextraktionsexperte, der mithilfe von KI-Analyse entscheidet, welche Dateien und Inhalte für eine bestimmte Aufgabe extrahiert werden müssen. AUFGABE: {prompt} VERFÜGBARE DATEIEN: {json.dumps(file_infos, indent=2)} Für jede Datei, die für die Aufgabe relevant ist, erstelle eine Extraktionsanweisung mit den folgenden Informationen: 1. file_id: Die ID der zu extrahierenden Datei 2. extract_needed: Boolean, ob eine Extraktion erforderlich ist (True, wenn die Datei noch nicht extrahiert wurde und für die Aufgabe benötigt wird) 3. extraction_prompt: Ein spezifischer Prompt für die Extraktion der Datei (besonders wichtig für Bilder und nicht-textbasierte Dateien) 4. importance: Priorität/Wichtigkeit für die Aufgabe (1-5, wobei 5 am wichtigsten ist) Format: [ {{ "file_id": 1234, "extract_needed": true, "extraction_prompt": "Extrahiere die Tabellendaten mit Fokus auf die Umsatzzahlen", "importance": 5 }}, ... ] Gib nur das JSON-Array zurück, ohne weitere Erklärungen. """ # Log hinzufügen if add_log_func and workflow_id: add_log_func(workflow_id, "Extraktionsplan wird erstellt...", "info") try: # AI-Call durchführen extraction_plan_response = await ai_service.call_api([{"role": "user", "content": extraction_prompt}]) # JSON aus der Antwort extrahieren import re json_match = re.search(r'\[.*\]', extraction_plan_response, re.DOTALL) if json_match: extraction_plan = json.loads(json_match.group(0)) # Log hinzufügen if add_log_func and workflow_id: add_log_func( workflow_id, f"Extraktionsplan erstellt für {len(extraction_plan)} Dateien", "info" ) return extraction_plan else: # Fallback bei Parsing-Problemen if add_log_func and workflow_id: add_log_func( workflow_id, "Parsing-Fehler beim Extraktionsplan, erstelle Standard-Plan", "warning" ) # Standard-Plan: Alle nicht extrahierten Dateien extrahieren default_plan = [] for file in files: doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages) already_extracted = any( content.get("is_extracted", False) for content in doc_contents ) if doc_contents else False default_plan.append({ "file_id": file.get("id", 0), "extract_needed": not already_extracted, "extraction_prompt": f"Extrahiere alle relevanten Informationen aus {file.get('name', '')}", "importance": 3 }) return default_plan except Exception as e: logger.error(f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}", exc_info=True) if add_log_func and workflow_id: add_log_func( workflow_id, f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}", "error" ) # Leerer Plan bei Fehlern return [] async def _execute_extractions( extraction_plan: List[Dict[str, Any]], files: List[Dict[str, Any]], messages: List[Dict[str, Any]], lucydom_interface, ai_service, workflow_id: str = None, add_log_func = None, logging_utils = None ) -> List[Dict[str, Any]]: """ Execute the planned extractions. 
async def _execute_extractions(
    extraction_plan: List[Dict[str, Any]],
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    lucydom_interface,
    ai_service,
    workflow_id: Optional[str] = None,
    add_log_func=None,
    logging_utils=None,
) -> List[Dict[str, Any]]:
    """
    Execute the planned extractions.

    Args:
        extraction_plan: List of extraction instructions
        files: List of all available files
        messages: List of all messages in the workflow
        lucydom_interface: Interface for database access
        ai_service: Service for AI requests
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding log entries
        logging_utils: Optional logging utility

    Returns:
        List with extracted data per file
    """
    extracted_data = []

    # Sort by importance, most important first
    sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)

    for extraction_item in sorted_plan:
        file_id = extraction_item.get("file_id")
        extract_needed = extraction_item.get("extract_needed", False)
        extraction_prompt = extraction_item.get("extraction_prompt", "")

        # Find the file metadata
        file_metadata = next((f for f in files if f.get("id") == file_id), None)
        if not file_metadata:
            logger.warning(f"File with ID {file_id} not found")
            continue

        file_name = file_metadata.get("name", "")
        file_type = file_metadata.get("type", "")
        content_type = file_metadata.get("content_type", "")

        # Add a log entry
        if logging_utils:
            logging_utils.info(f"Processing file: {file_name} (Extraction needed: {extract_needed})", "extraction")
        elif add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                f"Processing file: {file_name} (Extraction needed: {extract_needed})",
                "info",
            )

        # Only perform extraction if needed
        if extract_needed:
            # Get the file content via the LucyDOM interface
            if lucydom_interface:
                try:
                    file_content = await lucydom_interface.read_file_content(file_id)
                    if not file_content:
                        if logging_utils:
                            logging_utils.warning(f"File {file_name} not found", "extraction")
                        elif add_log_func and workflow_id:
                            add_log_func(workflow_id, f"File {file_name} not found", "warning")
                        continue

                    # Perform extraction based on the file type
                    if file_type == "image" or file_name.lower().endswith(
                        (".jpg", ".jpeg", ".png", ".gif", ".webp")
                    ):
                        # Image analysis with the AI service
                        if ai_service and hasattr(ai_service, "analyze_image"):
                            try:
                                image_analysis = await ai_service.analyze_image(
                                    image_data=file_content,
                                    prompt=extraction_prompt,
                                    mime_type=content_type,
                                )
                                extracted_data.append({
                                    "file_id": file_id,
                                    "name": file_name,
                                    "type": file_type,
                                    "content": image_analysis,
                                    "is_extracted": True,
                                    "extraction_method": "image_analysis",
                                })
                                if logging_utils:
                                    logging_utils.info(f"Image {file_name} successfully analyzed", "extraction")
                                elif add_log_func and workflow_id:
                                    add_log_func(workflow_id, f"Image {file_name} successfully analyzed", "info")
                            except Exception as e:
                                logger.error(f"Error analyzing image {file_name}: {str(e)}")
                                if logging_utils:
                                    logging_utils.error(f"Error analyzing image {file_name}: {str(e)}", "extraction")
                                elif add_log_func and workflow_id:
                                    add_log_func(workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error")
                        else:
                            # Fallback if no image analysis is available
                            extracted_data.append({
                                "file_id": file_id,
                                "name": file_name,
                                "type": file_type,
                                "content": f"Image: {file_name} (Analysis not available)",
                                "is_extracted": False,
                                "extraction_method": "none",
                            })
                    else:
                        # Text-based extraction for all other file types
                        try:
                            # Import here to avoid circular imports
                            from modules.agentservice_utils import extract_text_from_file_content

                            content, is_extracted = extract_text_from_file_content(
                                file_content, file_name, content_type
                            )
                            extracted_data.append({
                                "file_id": file_id,
                                "name": file_name,
                                "type": file_type,
                                "content": content,
                                "is_extracted": is_extracted,
                                "extraction_method": "text_extraction",
                            })
                            if logging_utils:
                                logging_utils.info(f"File {file_name} extracted (Status: {is_extracted})", "extraction")
                            elif add_log_func and workflow_id:
                                add_log_func(
                                    workflow_id,
                                    f"File {file_name} extracted (Status: {is_extracted})",
                                    "info",
                                )
                        except Exception as e:
                            logger.error(f"Error extracting text from {file_name}: {str(e)}")
                            if logging_utils:
                                logging_utils.error(f"Error extracting text from {file_name}: {str(e)}", "extraction")
                            elif add_log_func and workflow_id:
                                add_log_func(workflow_id, f"Error extracting text from {file_name}: {str(e)}", "error")

                except Exception as e:
                    logger.error(f"Error reading file {file_name}: {str(e)}")
                    if logging_utils:
                        logging_utils.error(f"Error reading file {file_name}: {str(e)}", "extraction")
                    elif add_log_func and workflow_id:
                        add_log_func(workflow_id, f"Error reading file {file_name}: {str(e)}", "error")
            else:
                logger.warning(f"No LucyDOM interface available for file {file_name}")
                if logging_utils:
                    logging_utils.warning(f"No LucyDOM interface available for file {file_name}", "extraction")
                elif add_log_func and workflow_id:
                    add_log_func(workflow_id, f"No LucyDOM interface available for file {file_name}", "warning")
        else:
            # No extraction needed, use the existing content
            doc_contents = _extract_document_contents_from_messages(file_id, messages)
            if doc_contents:
                # Use the first text content
                for content in doc_contents:
                    if content.get("type") == "text":
                        extracted_data.append({
                            "file_id": file_id,
                            "name": file_name,
                            "type": file_type,
                            "content": content.get("text", ""),
                            "is_extracted": content.get("is_extracted", False),
                            "extraction_method": "existing_content",
                        })
                        break
            else:
                # No existing content found
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": f"No content available for {file_name}",
                    "is_extracted": False,
                    "extraction_method": "none",
                })

    return extracted_data
def _structure_extracted_data(
    extracted_data: List[Dict[str, Any]],
    files: List[Dict[str, Any]],
    prompt: str,
) -> Dict[str, Any]:
    """
    Structure the extracted data into a formatted result.

    Args:
        extracted_data: List of extracted data per file
        files: List of all available files
        prompt: Original extraction prompt

    Returns:
        Structured result object
    """
    # Create the base structure
    result = {
        "prompt": prompt,
        "files_processed": len(extracted_data),
        "total_files": len(files),
        "extraction_timestamp": datetime.now().isoformat(),
        "status": "success",
        "extracted_content": [],
    }

    # Add the extracted content
    for data_item in extracted_data:
        # Enrich with file metadata
        file_id = data_item.get("file_id", 0)
        file_metadata = next((f for f in files if f.get("id") == file_id), {})

        content_item = {
            "file_id": file_id,
            "name": data_item.get("name", file_metadata.get("name", "")),
            "type": data_item.get("type", file_metadata.get("type", "")),
            "content_type": file_metadata.get("content_type", ""),
            "size": file_metadata.get("size", ""),
            "is_extracted": data_item.get("is_extracted", False),
            "extraction_method": data_item.get("extraction_method", ""),
            "content": data_item.get("content", ""),
        }
        result["extracted_content"].append(content_item)

    return result


def _extract_document_contents_from_messages(file_id: int, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Extract document contents for a specific file from workflow messages.

    Args:
        file_id: ID of the file
        messages: List of all messages in the workflow

    Returns:
        List of document contents for the specified file
    """
    contents = []

    for message in messages:
        # Search the documents attached to the message
        for document in message.get("documents", []):
            source = document.get("source", {})

            # Check whether the file ID matches
            if source.get("id") == file_id:
                # Add the contents of the file
                doc_contents = document.get("contents", [])
                if doc_contents:
                    contents.extend(doc_contents)

    return contents
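# A minimal sketch of the message shape that _extract_document_contents_from_messages
# expects; the field values below are hypothetical examples, not real workflow data:
#
# message = {
#     "documents": [
#         {
#             "source": {"type": "file", "id": 1234},
#             "contents": [
#                 {"type": "text", "text": "Extracted text...", "is_extracted": True}
#             ],
#         }
#     ],
# }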
def _log(add_log_func, workflow_id, message, log_type, agent_id=None, agent_name=None):
    """Helper function for logging with different log functions."""
    # Log via the module logger
    if log_type == "error":
        logger.error(message)
    elif log_type == "warning":
        logger.warning(message)
    else:
        logger.info(message)

    # Log via the provided log function (if available)
    if add_log_func and workflow_id:
        add_log_func(workflow_id, message, log_type, agent_id, agent_name)
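
# A minimal usage sketch, assuming a stand-in AI service. `MockAIService` and the
# sample file/message data below are hypothetical illustrations, not part of the
# real service interfaces.
if __name__ == "__main__":

    class MockAIService:
        """Hypothetical stand-in that returns a fixed extraction plan."""

        async def call_api(self, messages_payload):
            # Pretend the model decided no new extraction is needed for file 1.
            return '[{"file_id": 1, "extract_needed": false, "extraction_prompt": "", "importance": 3}]'

    sample_files = [{
        "id": 1,
        "name": "report.pdf",
        "type": "document",
        "content_type": "application/pdf",
        "size": 1024,
    }]
    sample_messages = [{
        "documents": [{
            "source": {"type": "file", "id": 1},
            "contents": [{"type": "text", "text": "Revenue grew 12%.", "is_extracted": True}],
        }],
    }]

    result = asyncio.run(data_extraction(
        prompt="Summarize the revenue figures",
        files=sample_files,
        messages=sample_messages,
        ai_service=MockAIService(),
    ))
    print(json.dumps(result, indent=2))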