""" Refactored helper function for intelligent data extraction (continued). """ import logging import json from typing import List, Dict, Any, Optional, Tuple import asyncio from datetime import datetime logger = logging.getLogger(__name__) async def data_extraction( prompt: str, files: List[Dict[str, Any]], messages: List[Dict[str, Any]], ai_service, lucydom_interface = None, workflow_id: str = None, add_log_func = None, document_handler = None # Add this parameter ) -> Dict[str, Any]: """ Performs AI-driven data extraction with support for the document handler. Args: prompt: Specification of what data to extract files: List of all available files with metadata messages: List of all messages in the workflow ai_service: Service for AI requests lucydom_interface: Interface for database access (optional) workflow_id: Optional workflow ID for logging add_log_func: Optional function for adding logs document_handler: Optional document handler for structured document operations Returns: Structured text object with extracted data and context information """ try: # Create extraction plan using AI extraction_plan = await _create_extraction_plan(prompt, files, messages, ai_service, workflow_id, add_log_func) # Execute extractions, preferring document handler if available if document_handler: extracted_data = await _execute_extractions_with_handler( extraction_plan, files, messages, document_handler, ai_service, workflow_id, add_log_func ) else: # Fall back to original implementation extracted_data = await _execute_extractions( extraction_plan, files, messages, lucydom_interface, ai_service, workflow_id, add_log_func ) # Structure extracted data structured_result = _structure_extracted_data(extracted_data, files, prompt) return structured_result except Exception as e: logger.error(f"Error in data extraction: {str(e)}", exc_info=True) # Add error log if add_log_func and workflow_id: add_log_func(workflow_id, f"Data extraction error: {str(e)}", "error") # Return error result return { "error": str(e), "status": "error", "files_processed": len(files), "message": f"Data extraction failed: {str(e)}" } async def _execute_extractions_with_handler( extraction_plan: List[Dict[str, Any]], files: List[Dict[str, Any]], messages: List[Dict[str, Any]], document_handler, ai_service, workflow_id: str = None, add_log_func = None ) -> List[Dict[str, Any]]: """ Execute extractions using the document handler. Args: extraction_plan: List of extraction instructions files: List of all available files messages: List of all messages document_handler: Document handler for structured operations ai_service: Service for AI requests workflow_id: Optional workflow ID for logging add_log_func: Optional function for adding logs Returns: List with extracted data per file """ extracted_data = [] # Sort by importance (highest first) sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True) for extraction_item in sorted_plan: file_id = extraction_item.get("file_id") extract_needed = extraction_item.get("extract_needed", False) extraction_prompt = extraction_item.get("extraction_prompt", "") # Find file metadata file_metadata = next((f for f in files if f.get("id") == file_id), None) if not file_metadata: logger.warning(f"File with ID {file_id} not found") continue file_name = file_metadata.get("name", "") file_type = file_metadata.get("type", "") content_type = file_metadata.get("content_type", "") # Log if add_log_func and workflow_id: add_log_func( workflow_id, f"Processing file: {file_name} (Extraction needed: {extract_needed})", "info" ) # Only perform extraction if needed if extract_needed: # Find document in existing messages if available existing_content = _find_document_in_messages(file_id, messages) # Check if we should use document handler for contextual extraction if existing_content: # If document exists but needs contextual extraction document_id = existing_content.get("document_id") message_id = existing_content.get("message_id") if document_id and message_id: # Find the message containing the document for message in messages: if message.get("id") == message_id: # Extract content with context try: # Find document reference doc_reference = None for doc in message.get("documents", []): if doc.get("id") == document_id: doc_reference = doc break if doc_reference: # Use document handler to perform contextual extraction extracted_text = await document_handler.extract_document_content( document_id, file_id, extraction_prompt ) extracted_data.append({ "file_id": file_id, "name": file_name, "type": file_type, "content": extracted_text, "is_extracted": True, "extraction_method": "contextual_extraction" }) if add_log_func and workflow_id: add_log_func( workflow_id, f"Contextual extraction for {file_name}: {extraction_prompt}", "info" ) continue except Exception as e: logger.error(f"Error in contextual extraction for {file_name}: {str(e)}") # If we reach here, we need to perform a new extraction try: file_content = await document_handler.add_file_to_message( {}, # Empty message to extract just the document file_id, extraction_prompt ) # Get the extracted content from the document if "documents" in file_content and file_content["documents"]: doc = file_content["documents"][0] content_text = "" is_extracted = False for content in doc.get("contents", []): if content.get("type") == "text": content_text = content.get("text", "") is_extracted = content.get("is_extracted", False) break extracted_data.append({ "file_id": file_id, "name": file_name, "type": file_type, "content": content_text, "is_extracted": is_extracted, "extraction_method": "document_handler" }) if add_log_func and workflow_id: add_log_func( workflow_id, f"Extracted {file_name} using document handler", "info" ) else: # Extraction failed extracted_data.append({ "file_id": file_id, "name": file_name, "type": file_type, "content": f"Failed to extract content from {file_name}", "is_extracted": False, "extraction_method": "failed" }) except Exception as e: logger.error(f"Error extracting {file_name}: {str(e)}") extracted_data.append({ "file_id": file_id, "name": file_name, "type": file_type, "content": f"Error extracting: {str(e)}", "is_extracted": False, "extraction_method": "error" }) else: # No extraction needed, use existing content existing_content = _find_document_in_messages(file_id, messages) if existing_content: extracted_data.append({ "file_id": file_id, "name": file_name, "type": file_type, "content": existing_content.get("content", ""), "is_extracted": existing_content.get("is_extracted", False), "extraction_method": "existing_content" }) else: # No existing content found extracted_data.append({ "file_id": file_id, "name": file_name, "type": file_type, "content": f"No content available for {file_name}", "is_extracted": False, "extraction_method": "none" }) return extracted_data def _find_document_in_messages(file_id: int, messages: List[Dict[str, Any]]) -> Dict[str, Any]: """ Find a document by file ID in workflow messages. Args: file_id: ID of the file to find messages: List of messages to search Returns: Dictionary with document information or empty dict if not found """ for message in messages: for doc_index, document in enumerate(message.get("documents", [])): source = document.get("source", {}) # Check if file ID matches if source.get("id") == str(file_id) or source.get("id") == file_id: # Found the document content_text = "" is_extracted = False # Look for text content for content in document.get("contents", []): if content.get("type") == "text": content_text = content.get("text", "") is_extracted = content.get("is_extracted", False) break return { "document_id": document.get("id"), "message_id": message.get("id"), "content": content_text, "is_extracted": is_extracted } return {} async def _create_extraction_plan( prompt: str, files: List[Dict[str, Any]], messages: List[Dict[str, Any]], ai_service, workflow_id: str = None, add_log_func = None ) -> List[Dict[str, Any]]: """ Erstellt einen Extraktionsplan mit AI-Unterstützung. Args: prompt: Spezifizierung, welche Daten extrahiert werden sollen files: Liste aller verfügbaren Dateien mit Metadaten messages: Liste aller Nachrichten im Workflow ai_service: Service für KI-Anfragen workflow_id: Optionale ID des Workflows für Logging add_log_func: Optionale Funktion für das Hinzufügen von Logs Returns: Extraktionsplan (Liste von Extraktionsanweisungen pro Datei) """ # Erstelle Kontext-Informationen für den AI Call file_infos = [] for file in files: # Basis-Metadaten file_info = { "id": file.get("id", ""), "name": file.get("name", ""), "type": file.get("type", ""), "content_type": file.get("content_type", ""), "size": file.get("size", "") } # Extraktionsstatus prüfen (falls vorhanden) doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages) if doc_contents: # Prüfen, ob mindestens ein Content mit is_extracted=True existiert already_extracted = any( content.get("is_extracted", False) for content in doc_contents ) file_info["already_extracted"] = already_extracted # Eine kurze Vorschau des Inhalts hinzufügen (falls verfügbar) for content in doc_contents: if content.get("type") == "text" and content.get("text"): preview_text = content.get("text", "")[:200] + "..." if len(content.get("text", "")) > 200 else content.get("text", "") file_info["content_preview"] = preview_text break else: file_info["already_extracted"] = False file_infos.append(file_info) # AI-Prompt erstellen extraction_prompt = f""" Du bist ein Datenextraktionsexperte, der mithilfe von KI-Analyse entscheidet, welche Dateien und Inhalte für eine bestimmte Aufgabe extrahiert werden müssen. AUFGABE: {prompt} VERFÜGBARE DATEIEN: {json.dumps(file_infos, indent=2)} Für jede Datei, die für die Aufgabe relevant ist, erstelle eine Extraktionsanweisung mit den folgenden Informationen: 1. file_id: Die ID der zu extrahierenden Datei 2. extract_needed: Boolean, ob eine Extraktion erforderlich ist (True, wenn die Datei noch nicht extrahiert wurde und für die Aufgabe benötigt wird) 3. extraction_prompt: Ein spezifischer Prompt für die Extraktion der Datei (besonders wichtig für Bilder und nicht-textbasierte Dateien) 4. importance: Priorität/Wichtigkeit für die Aufgabe (1-5, wobei 5 am wichtigsten ist) Format: [ {{ "file_id": 1234, "extract_needed": true, "extraction_prompt": "Extrahiere die Tabellendaten mit Fokus auf die Umsatzzahlen", "importance": 5 }}, ... ] Gib nur das JSON-Array zurück, ohne weitere Erklärungen. """ # Log hinzufügen if add_log_func and workflow_id: add_log_func(workflow_id, "Extraktionsplan wird erstellt...", "info") try: # AI-Call durchführen extraction_plan_response = await ai_service.call_api([{"role": "user", "content": extraction_prompt}]) # JSON aus der Antwort extrahieren import re json_match = re.search(r'\[.*\]', extraction_plan_response, re.DOTALL) if json_match: extraction_plan = json.loads(json_match.group(0)) # Log hinzufügen if add_log_func and workflow_id: add_log_func( workflow_id, f"Extraktionsplan erstellt für {len(extraction_plan)} Dateien", "info" ) return extraction_plan else: # Fallback bei Parsing-Problemen if add_log_func and workflow_id: add_log_func( workflow_id, "Parsing-Fehler beim Extraktionsplan, erstelle Standard-Plan", "warning" ) # Standard-Plan: Alle nicht extrahierten Dateien extrahieren default_plan = [] for file in files: doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages) already_extracted = any( content.get("is_extracted", False) for content in doc_contents ) if doc_contents else False default_plan.append({ "file_id": file.get("id", 0), "extract_needed": not already_extracted, "extraction_prompt": f"Extrahiere alle relevanten Informationen aus {file.get('name', '')}", "importance": 3 }) return default_plan except Exception as e: logger.error(f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}", exc_info=True) if add_log_func and workflow_id: add_log_func( workflow_id, f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}", "error" ) # Leerer Plan bei Fehlern return [] async def _execute_extractions( extraction_plan: List[Dict[str, Any]], files: List[Dict[str, Any]], messages: List[Dict[str, Any]], lucydom_interface, ai_service, workflow_id: str = None, add_log_func = None, logging_utils = None ) -> List[Dict[str, Any]]: """ Execute the planned extractions. Args: extraction_plan: List of extraction instructions files: List of all available files lucydom_interface: Interface for database access ai_service: Service for AI requests workflow_id: Optional workflow ID for logging add_log_func: Optional function for adding logs logging_utils: Optional logging utility Returns: List with extracted data per file """ extracted_data = [] # Sort by importance sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True) for extraction_item in sorted_plan: file_id = extraction_item.get("file_id") extract_needed = extraction_item.get("extract_needed", False) extraction_prompt = extraction_item.get("extraction_prompt", "") # Find file metadata file_metadata = next((f for f in files if f.get("id") == file_id), None) if not file_metadata: logger.warning(f"File with ID {file_id} not found") continue file_name = file_metadata.get("name", "") file_type = file_metadata.get("type", "") content_type = file_metadata.get("content_type", "") # Add log if logging_utils: logging_utils.info(f"Processing file: {file_name} (Extraction needed: {extract_needed})", "extraction") elif add_log_func and workflow_id: add_log_func( workflow_id, f"Processing file: {file_name} (Extraction needed: {extract_needed})", "info" ) # Only perform extraction if needed if extract_needed: # Get file content via LucyDOM interface if lucydom_interface: try: file_content = await lucydom_interface.read_file_content(file_id) if not file_content: if logging_utils: logging_utils.warning(f"File {file_name} not found", "extraction") elif add_log_func and workflow_id: add_log_func(workflow_id, f"File {file_name} not found", "warning") continue # Perform extraction based on file type if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')): # Image analysis with AI service if ai_service and hasattr(ai_service, "analyze_image"): try: image_analysis = await ai_service.analyze_image( image_data=file_content, prompt=extraction_prompt, mime_type=content_type ) extracted_data.append({ "file_id": file_id, "name": file_name, "type": file_type, "content": image_analysis, "is_extracted": True, "extraction_method": "image_analysis" }) if logging_utils: logging_utils.info(f"Image {file_name} successfully analyzed", "extraction") elif add_log_func and workflow_id: add_log_func(workflow_id, f"Image {file_name} successfully analyzed", "info") except Exception as e: logger.error(f"Error analyzing image {file_name}: {str(e)}") if logging_utils: logging_utils.error(f"Error analyzing image {file_name}: {str(e)}", "extraction") elif add_log_func and workflow_id: add_log_func(workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error") else: # Fallback if no image analysis available extracted_data.append({ "file_id": file_id, "name": file_name, "type": file_type, "content": f"Image: {file_name} (Analysis not available)", "is_extracted": False, "extraction_method": "none" }) else: # Text-based extraction for all other file types try: # Import directly here to avoid circular imports from modules.agentservice_utils import extract_text_from_file_content content, is_extracted = extract_text_from_file_content( file_content, file_name, content_type ) extracted_data.append({ "file_id": file_id, "name": file_name, "type": file_type, "content": content, "is_extracted": is_extracted, "extraction_method": "text_extraction" }) if logging_utils: logging_utils.info(f"File {file_name} extracted (Status: {is_extracted})", "extraction") elif add_log_func and workflow_id: add_log_func( workflow_id, f"File {file_name} extracted (Status: {is_extracted})", "info" ) except Exception as e: logger.error(f"Error extracting text from {file_name}: {str(e)}") if logging_utils: logging_utils.error(f"Error extracting text from {file_name}: {str(e)}", "extraction") elif add_log_func and workflow_id: add_log_func(workflow_id, f"Error extracting text from {file_name}: {str(e)}", "error") except Exception as e: logger.error(f"Error reading file {file_name}: {str(e)}") if logging_utils: logging_utils.error(f"Error reading file {file_name}: {str(e)}", "extraction") elif add_log_func and workflow_id: add_log_func(workflow_id, f"Error reading file {file_name}: {str(e)}", "error") else: logger.warning(f"No LucyDOM interface available for file {file_name}") if logging_utils: logging_utils.warning(f"No LucyDOM interface available for file {file_name}", "extraction") elif add_log_func and workflow_id: add_log_func(workflow_id, f"No LucyDOM interface available for file {file_name}", "warning") else: # No extraction needed, use existing content doc_contents = _extract_document_contents_from_messages(file_id, messages) if doc_contents: # Use first text content for content in doc_contents: if content.get("type") == "text": extracted_data.append({ "file_id": file_id, "name": file_name, "type": file_type, "content": content.get("text", ""), "is_extracted": content.get("is_extracted", False), "extraction_method": "existing_content" }) break else: # No existing content found extracted_data.append({ "file_id": file_id, "name": file_name, "type": file_type, "content": f"No content available for {file_name}", "is_extracted": False, "extraction_method": "none" }) return extracted_data def _structure_extracted_data( extracted_data: List[Dict[str, Any]], files: List[Dict[str, Any]], prompt: str ) -> Dict[str, Any]: """ Structure the extracted data into a formatted result. Args: extracted_data: List of extracted data per file files: List of all available files prompt: Original extraction prompt Returns: Structured result object """ # Create base structure result = { "prompt": prompt, "files_processed": len(extracted_data), "total_files": len(files), "extraction_timestamp": datetime.now().isoformat(), "status": "success", "extracted_content": [] } # Add extracted content for data_item in extracted_data: # Enrich with file metadata file_id = data_item.get("file_id", 0) file_metadata = next((f for f in files if f.get("id") == file_id), {}) content_item = { "file_id": file_id, "name": data_item.get("name", file_metadata.get("name", "")), "type": data_item.get("type", file_metadata.get("type", "")), "content_type": file_metadata.get("content_type", ""), "size": file_metadata.get("size", ""), "is_extracted": data_item.get("is_extracted", False), "extraction_method": data_item.get("extraction_method", ""), "content": data_item.get("content", "") } result["extracted_content"].append(content_item) return result def _extract_document_contents_from_messages(file_id: int, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Extract document contents for a specific file from workflow messages. Enhanced to handle the new document structure. Args: file_id: ID of the file messages: List of all messages in the workflow Returns: List of document contents for the specified file """ contents = [] for message in messages: # Search documents in the message for document in message.get("documents", []): source = document.get("source", {}) # Check if file ID matches (handle both string and int comparison) if (source.get("id") == file_id or (isinstance(source.get("id"), str) and source.get("id") == str(file_id)) or (isinstance(file_id, str) and source.get("id") == file_id)): # Add contents of the file doc_contents = document.get("contents", []) if doc_contents: # Ensure each content has document reference for content in doc_contents: content_copy = content.copy() content_copy["document_id"] = document.get("id") content_copy["message_id"] = message.get("id") contents.append(content_copy) return contents def _log(add_log_func, workflow_id, message, log_type, agent_id=None, agent_name=None): """Helper function for logging with different log functions""" # Log via logger instance if log_type == "error": logger.error(message) elif log_type == "warning": logger.warning(message) else: logger.info(message) # Log via provided log function (if available) if add_log_func and workflow_id: add_log_func(workflow_id, message, log_type, agent_id, agent_name)