"""
Refactored helper function for intelligent data extraction (continued).
"""

import logging
import json
import re
from typing import List, Dict, Any, Optional, Tuple
import asyncio
from datetime import datetime
import uuid

logger = logging.getLogger(__name__)


async def data_extraction(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    lucydom_interface = None,
    workflow_id: str = None,
    add_log_func = None,
    document_handler = None
) -> Dict[str, Any]:
    """
    Performs AI-driven data extraction with improved document and image handling.

    The function never raises: any failure is logged and reported through the
    returned dictionary (``status == "error"``).

    Args:
        prompt: Specification of what data to extract
        files: List of all available files with metadata
        messages: List of all messages in the workflow
        ai_service: Service for AI requests (must provide an awaitable ``call_api``)
        lucydom_interface: Interface for database access (optional)
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs
        document_handler: Optional document handler for structured document operations

    Returns:
        Structured result with the extracted content per file. When an AI service
        is available, ``contextual_summary`` and (for image content)
        ``image_analysis_summary`` are added. On failure a dictionary with
        ``error``, ``status``, ``files_processed`` and ``message`` is returned.
    """
    try:
        _log(add_log_func, workflow_id, f"Starting data extraction with {len(files)} files", "info")

        # Let the AI decide which files need extraction and with which prompt
        _log(add_log_func, workflow_id, "Creating extraction plan", "info")
        extraction_plan = await _create_extraction_plan(
            prompt, files, messages, ai_service, workflow_id, add_log_func
        )

        if extraction_plan:
            extract_needed_count = sum(1 for item in extraction_plan if item.get("extract_needed", False))
            _log(
                add_log_func, workflow_id,
                f"Extraction plan created: {len(extraction_plan)} files, {extract_needed_count} need extraction",
                "info"
            )

        # Execute extractions, preferring the document handler if available
        if document_handler:
            _log(add_log_func, workflow_id, "Using document handler for extraction", "info")
            extracted_data = await _execute_extractions_with_handler(
                extraction_plan, files, messages, document_handler, ai_service, workflow_id, add_log_func
            )
        else:
            _log(add_log_func, workflow_id, "Using fallback extraction method", "info")
            extracted_data = await _execute_extractions(
                extraction_plan, files, messages, lucydom_interface, ai_service, workflow_id, add_log_func
            )

        _log(add_log_func, workflow_id, f"Structuring extracted data from {len(extracted_data)} files", "info")
        structured_result = _structure_extracted_data(extracted_data, files, prompt)

        # Enhance with contextual summaries using AI (best effort)
        if ai_service and structured_result["extracted_content"]:
            _log(add_log_func, workflow_id, "Creating contextual summaries for extracted content", "info")
            try:
                summary_header = f"""
                Create concise, contextual summaries of the following extracted content according to this requirement:

                REQUIREMENT: {prompt}

                EXTRACTED CONTENT:
                """
                sections = []
                for item in structured_result["extracted_content"]:
                    file_name = item.get("name", "Unnamed file")
                    content = _content_as_text(item)
                    # Only a preview is sent so the summary prompt stays small
                    content_preview = content[:500] + "..." if len(content) > 500 else content
                    sections.append(f"\n--- {file_name} ---\n{content_preview}\n")
                summary_prompt = summary_header + "".join(sections)

                summaries = await ai_service.call_api([{"role": "user", "content": summary_prompt}])
                structured_result["contextual_summary"] = summaries
                _log(add_log_func, workflow_id, "Added contextual summaries to extracted data", "info")
            except Exception as e:
                _log(add_log_func, workflow_id, f"Error creating contextual summaries: {str(e)}", "warning")

        # Handle image-specific content separately
        image_content = [
            item for item in structured_result["extracted_content"]
            if "Image Analysis" in _content_as_text(item) or item.get("type") == "image"
        ]

        if image_content:
            _log(add_log_func, workflow_id, f"Processing {len(image_content)} image-related content items", "info")

            if ai_service:
                try:
                    image_header = f"""
                    Summarize the key visual information from these image analyses according to this requirement:

                    REQUIREMENT: {prompt}

                    IMAGE ANALYSES:
                    """
                    image_sections = [
                        f"\n--- {item.get('name', 'Unnamed image')} ---\n{item.get('content', '')}\n"
                        for item in image_content
                    ]
                    image_summary_prompt = image_header + "".join(image_sections)

                    image_summaries = await ai_service.call_api(
                        [{"role": "user", "content": image_summary_prompt}]
                    )
                    structured_result["image_analysis_summary"] = image_summaries
                    _log(add_log_func, workflow_id, "Added image analysis summary to extracted data", "info")
                except Exception as e:
                    _log(add_log_func, workflow_id, f"Error creating image analysis summary: {str(e)}", "warning")

        return structured_result

    except Exception as e:
        logger.error(f"Error in data extraction: {str(e)}", exc_info=True)

        if add_log_func and workflow_id:
            add_log_func(workflow_id, f"Data extraction error: {str(e)}", "error")

        return {
            "error": str(e),
            "status": "error",
            # `files` itself may be the cause of the failure (e.g. None), so the
            # error path must not call len() on it unguarded.
            "files_processed": len(files or []),
            "message": f"Data extraction failed: {str(e)}"
        }


def _content_as_text(item: Dict[str, Any]) -> str:
    """Return the ``content`` of an extraction item as a string ('' when missing or None)."""
    content = item.get("content", "")
    if content is None:
        return ""
    return content if isinstance(content, str) else str(content)


def _ids_match(left: Any, right: Any) -> bool:
    """Compare two IDs, treating an int and its string form (7 vs "7") as equal."""
    if left == right:
        return True
    if left is None or right is None:
        return False
    return str(left) == str(right)


def _find_file_metadata(files: List[Dict[str, Any]], file_id: Any) -> Optional[Dict[str, Any]]:
    """Return the metadata of the file with the given ID, preferring an exact match."""
    exact = next((f for f in files if f.get("id") == file_id), None)
    if exact is not None:
        return exact
    # The plan is produced by an AI as JSON, so the ID type may differ from the metadata
    return next((f for f in files if _ids_match(f.get("id"), file_id)), None)


def _first_text_content(contents: List[Dict[str, Any]]) -> Tuple[str, bool, Dict[str, Any]]:
    """Return (text, is_extracted, content dict) of the first ``type == "text"`` entry."""
    for content in contents:
        if content.get("type") == "text":
            return content.get("text", ""), content.get("is_extracted", False), content
    return "", False, {}


def _handler_result_entries(
    result_message: Dict[str, Any],
    file_id: Any,
    file_name: str,
    file_type: str,
    extraction_prompt: str,
    main_method: str
) -> List[Dict[str, Any]]:
    """
    Convert a document handler result message into extraction entries.

    Args:
        result_message: Message returned by ``document_handler.add_file_to_message``
        file_id: ID of the file that was extracted
        file_name: Name of the file that was extracted
        file_type: Type of the file that was extracted
        extraction_prompt: Prompt that was used for the extraction
        main_method: Value for ``extraction_method`` of the main document entry

    Returns:
        Empty list if the message contains no documents; otherwise the entry for
        the main (first) document followed by one entry per embedded document
        (source type "extracted") that has non-empty text.
    """
    documents = result_message.get("documents")
    if not documents:
        return []

    # First document is the main file
    main_text, main_is_extracted, _ = _first_text_content(documents[0].get("contents", []))
    entries = [{
        "file_id": file_id,
        "name": file_name,
        "type": file_type,
        "content": main_text,
        "is_extracted": main_is_extracted,
        "extraction_method": main_method,
        "extraction_context": extraction_prompt
    }]

    # Additional documents (e.g., images extracted from the main file)
    for additional_doc in documents[1:]:
        source = additional_doc.get("source", {})
        if source.get("type") != "extracted":
            continue

        add_text, add_is_extracted, add_content = _first_text_content(additional_doc.get("contents", []))
        if not add_text:
            continue

        entries.append({
            "file_id": source.get("id", f"extracted_{uuid.uuid4()}"),
            "name": source.get("name", f"Extracted from {file_name}"),
            "type": source.get("content_type", "image"),
            "content": add_text,
            "is_extracted": add_is_extracted,
            "extraction_method": "document_handler_extracted_component",
            "extraction_context": add_content.get("extraction_context", extraction_prompt),
            "parent_file_id": file_id
        })

    return entries


async def _execute_extractions_with_handler(
    extraction_plan: List[Dict[str, Any]],
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    document_handler,
    ai_service,
    workflow_id: str = None,
    add_log_func = None
) -> List[Dict[str, Any]]:
    """
    Execute extractions using the document handler with enhanced image processing.

    Files whose content already exists in the messages are only re-extracted
    when the plan asks for a prompt that differs from the stored extraction
    context; if re-extraction fails or yields nothing, the existing content is used.

    Args:
        extraction_plan: List of extraction instructions
        files: List of all available files
        messages: List of all messages
        document_handler: Document handler for structured operations
        ai_service: Service for AI requests (not used by this function)
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs

    Returns:
        List with extracted data per file
    """
    extracted_data = []

    # Sort by importance (highest first)
    sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)

    for extraction_item in sorted_plan:
        plan_file_id = extraction_item.get("file_id")
        extract_needed = extraction_item.get("extract_needed", False)
        extraction_prompt = extraction_item.get("extraction_prompt", "")

        file_metadata = _find_file_metadata(files, plan_file_id)
        if not file_metadata:
            logger.warning(f"File with ID {plan_file_id} not found")
            continue

        # Continue with the ID exactly as stored in the file metadata
        file_id = file_metadata.get("id", plan_file_id)
        file_name = file_metadata.get("name", "")
        file_type = file_metadata.get("type", "")

        _log(add_log_func, workflow_id, f"Processing file: {file_name} (Extraction needed: {extract_needed})", "info")

        existing_content = _find_document_in_messages(file_id, messages)

        if not extract_needed:
            # No extraction needed, use existing content
            if existing_content:
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": existing_content.get("content", ""),
                    "is_extracted": existing_content.get("is_extracted", False),
                    "extraction_method": "existing_content",
                    "extraction_context": existing_content.get("extraction_context", "")
                })
                _log(add_log_func, workflow_id, f"Using existing content for {file_name}", "info")
            else:
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": f"No content available for {file_name}",
                    "is_extracted": False,
                    "extraction_method": "none"
                })
                _log(add_log_func, workflow_id, f"No content available for {file_name}", "warning")
            continue

        if existing_content and existing_content.get("content"):
            # Content already exists, check if we need more specialized extraction
            current_context = existing_content.get("extraction_context", "")

            if extraction_prompt and extraction_prompt != current_context:
                _log(
                    add_log_func, workflow_id,
                    f"Re-extracting {file_name} with new prompt: {extraction_prompt}", "info"
                )
                try:
                    # Extract into an empty message so only this file's documents come back
                    result_message = await document_handler.add_file_to_message(
                        {}, file_id, extraction_prompt
                    )
                    entries = _handler_result_entries(
                        result_message, file_id, file_name, file_type,
                        extraction_prompt, "document_handler_reextract"
                    )
                    if entries:
                        extracted_data.append(entries[0])
                        for embedded in entries[1:]:
                            extracted_data.append(embedded)
                            _log(add_log_func, workflow_id, f"Extracted embedded content from {file_name}", "info")
                        _log(add_log_func, workflow_id, f"Re-extracted {file_name} with new context", "info")
                        continue
                except Exception as e:
                    logger.error(f"Error re-extracting {file_name}: {str(e)}")
                    _log(add_log_func, workflow_id, f"Error re-extracting {file_name}: {str(e)}", "warning")

            # Use existing content (also the fallback when re-extraction did not succeed)
            extracted_data.append({
                "file_id": file_id,
                "name": file_name,
                "type": file_type,
                "content": existing_content.get("content", ""),
                "is_extracted": existing_content.get("is_extracted", False),
                "extraction_method": "existing_content",
                "extraction_context": current_context
            })
            _log(add_log_func, workflow_id, f"Using existing content for {file_name}", "info")
            continue

        # Need to extract content with document handler
        try:
            result_message = await document_handler.add_file_to_message(
                {}, file_id, extraction_prompt
            )
            entries = _handler_result_entries(
                result_message, file_id, file_name, file_type, extraction_prompt, "document_handler"
            )
            if entries:
                extracted_data.append(entries[0])
                _log(add_log_func, workflow_id, f"Extracted {file_name} using document handler", "info")
                for embedded in entries[1:]:
                    extracted_data.append(embedded)
                    _log(add_log_func, workflow_id, f"Extracted embedded content from {file_name}", "info")
            else:
                # Extraction failed
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": f"Failed to extract content from {file_name}",
                    "is_extracted": False,
                    "extraction_method": "failed"
                })
                _log(add_log_func, workflow_id, f"Failed to extract content from {file_name}", "warning")
        except Exception as e:
            logger.error(f"Error extracting {file_name}: {str(e)}")
            _log(add_log_func, workflow_id, f"Error extracting {file_name}: {str(e)}", "warning")
            extracted_data.append({
                "file_id": file_id,
                "name": file_name,
                "type": file_type,
                "content": f"Error extracting: {str(e)}",
                "is_extracted": False,
                "extraction_method": "error"
            })

    return extracted_data


def _find_document_in_messages(file_id: int, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Find a document by file ID in workflow messages.

    Args:
        file_id: ID of the file to find
        messages: List of messages to search

    Returns:
        Dictionary with ``document_id``, ``message_id``, ``content``,
        ``is_extracted`` and ``extraction_context`` of the first matching
        document, or empty dict if not found
    """
    for message in messages:
        for document in message.get("documents", []):
            source = document.get("source", {})
            if not _ids_match(source.get("id"), file_id):
                continue

            content_text, is_extracted, text_content = _first_text_content(document.get("contents", []))
            return {
                "document_id": document.get("id"),
                "message_id": message.get("id"),
                "content": content_text,
                "is_extracted": is_extracted,
                # Callers compare this with the planned prompt to decide on re-extraction.
                # NOTE(review): assumes the context is stored on the text content entry - confirm.
                "extraction_context": text_content.get("extraction_context", "")
            }

    return {}


def _parse_extraction_plan(response: str) -> Optional[List[Dict[str, Any]]]:
    """Extract the JSON plan array from an AI response; return None if it cannot be parsed."""
    json_match = re.search(r'\[.*\]', response, re.DOTALL)
    if not json_match:
        return None

    try:
        parsed = json.loads(json_match.group(0))
    except json.JSONDecodeError:
        return None

    if not isinstance(parsed, list):
        return None

    # Later stages call .get() on every instruction, so only dicts are usable
    plan = [item for item in parsed if isinstance(item, dict)]
    if parsed and not plan:
        return None
    return plan


def _default_extraction_plan(
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Build the default plan: extract every file that has no extracted content yet."""
    default_plan = []
    for file in files:
        doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)
        already_extracted = any(
            content.get("is_extracted", False) for content in doc_contents
        ) if doc_contents else False

        default_plan.append({
            "file_id": file.get("id", 0),
            "extract_needed": not already_extracted,
            "extraction_prompt": f"Extrahiere alle relevanten Informationen aus {file.get('name', '')}",
            "importance": 3
        })
    return default_plan


async def _create_extraction_plan(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    workflow_id: str = None,
    add_log_func = None
) -> List[Dict[str, Any]]:
    """
    Creates an extraction plan with AI support.

    Args:
        prompt: Specification of what data to extract
        files: List of all available files with metadata
        messages: List of all messages in the workflow
        ai_service: Service for AI requests
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs

    Returns:
        Extraction plan (list of extraction instructions per file). If the AI
        answer contains no parseable JSON array, a default plan covering all
        files is returned; if the AI call itself fails, an empty plan is returned.
    """
    # Build the context information for the AI call
    file_infos = []
    for file in files:
        # Basic metadata
        file_info = {
            "id": file.get("id", ""),
            "name": file.get("name", ""),
            "type": file.get("type", ""),
            "content_type": file.get("content_type", ""),
            "size": file.get("size", "")
        }

        # Check the extraction status (if the file already appears in the messages)
        doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)
        if doc_contents:
            file_info["already_extracted"] = any(
                content.get("is_extracted", False) for content in doc_contents
            )

            # Add a short preview of the content (if available)
            for content in doc_contents:
                if content.get("type") == "text" and content.get("text"):
                    text = content.get("text", "")
                    file_info["content_preview"] = text[:200] + "..." if len(text) > 200 else text
                    break
        else:
            file_info["already_extracted"] = False

        file_infos.append(file_info)

    # Create the AI prompt
    extraction_prompt = f"""
    Du bist ein Datenextraktionsexperte, der mithilfe von KI-Analyse entscheidet, welche Dateien und Inhalte für eine bestimmte Aufgabe extrahiert werden müssen.

    AUFGABE: {prompt}

    VERFÜGBARE DATEIEN:
    {json.dumps(file_infos, indent=2)}

    Für jede Datei, die für die Aufgabe relevant ist, erstelle eine Extraktionsanweisung mit den folgenden Informationen:

    1. file_id: Die ID der zu extrahierenden Datei
    2. extract_needed: Boolean, ob eine Extraktion erforderlich ist (True, wenn die Datei noch nicht extrahiert wurde und für die Aufgabe benötigt wird)
    3. extraction_prompt: Ein spezifischer Prompt für die Extraktion der Datei (besonders wichtig für Bilder und nicht-textbasierte Dateien)
    4. importance: Priorität/Wichtigkeit für die Aufgabe (1-5, wobei 5 am wichtigsten ist)

    Format:
    [
        {{
            "file_id": 1234,
            "extract_needed": true,
            "extraction_prompt": "Extrahiere die Tabellendaten mit Fokus auf die Umsatzzahlen",
            "importance": 5
        }},
        ...
    ]

    Gib nur das JSON-Array zurück, ohne weitere Erklärungen.
    """

    if add_log_func and workflow_id:
        add_log_func(workflow_id, "Extraktionsplan wird erstellt...", "info")

    try:
        extraction_plan_response = await ai_service.call_api([{"role": "user", "content": extraction_prompt}])

        extraction_plan = _parse_extraction_plan(extraction_plan_response)
        if extraction_plan is not None:
            if add_log_func and workflow_id:
                add_log_func(
                    workflow_id,
                    f"Extraktionsplan erstellt für {len(extraction_plan)} Dateien",
                    "info"
                )
            return extraction_plan

        # Fallback for parsing problems (no JSON array found or array is not valid JSON)
        if add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                "Parsing-Fehler beim Extraktionsplan, erstelle Standard-Plan",
                "warning"
            )
        return _default_extraction_plan(files, messages)

    except Exception as e:
        logger.error(f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}", exc_info=True)

        if add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}",
                "error"
            )

        # Empty plan on errors
        return []


async def _execute_extractions(
    extraction_plan: List[Dict[str, Any]],
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    lucydom_interface,
    ai_service,
    workflow_id: str = None,
    add_log_func = None,
    logging_utils = None
) -> List[Dict[str, Any]]:
    """
    Execute the planned extractions.

    Files that cannot be read or extracted are logged and skipped; they do not
    appear in the returned list.

    Args:
        extraction_plan: List of extraction instructions
        files: List of all available files
        messages: List of all messages in the workflow
        lucydom_interface: Interface for database access
        ai_service: Service for AI requests
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs
        logging_utils: Optional logging utility (takes precedence over add_log_func)

    Returns:
        List with extracted data per file
    """
    def emit(level: str, message: str) -> None:
        # level is one of "info" / "warning" / "error" and is used both as the
        # logging_utils method name and as the add_log_func log type
        if logging_utils:
            getattr(logging_utils, level)(message, "extraction")
        elif add_log_func and workflow_id:
            add_log_func(workflow_id, message, level)

    extracted_data = []

    # Sort by importance (highest first)
    sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)

    for extraction_item in sorted_plan:
        plan_file_id = extraction_item.get("file_id")
        extract_needed = extraction_item.get("extract_needed", False)
        extraction_prompt = extraction_item.get("extraction_prompt", "")

        file_metadata = _find_file_metadata(files, plan_file_id)
        if not file_metadata:
            logger.warning(f"File with ID {plan_file_id} not found")
            continue

        # Continue with the ID exactly as stored in the file metadata
        file_id = file_metadata.get("id", plan_file_id)
        file_name = file_metadata.get("name", "")
        file_type = file_metadata.get("type", "")
        content_type = file_metadata.get("content_type", "")

        emit("info", f"Processing file: {file_name} (Extraction needed: {extract_needed})")

        if not extract_needed:
            # No extraction needed, use existing content
            doc_contents = _extract_document_contents_from_messages(file_id, messages)
            if doc_contents:
                # Use first text content
                for content in doc_contents:
                    if content.get("type") == "text":
                        extracted_data.append({
                            "file_id": file_id,
                            "name": file_name,
                            "type": file_type,
                            "content": content.get("text", ""),
                            "is_extracted": content.get("is_extracted", False),
                            "extraction_method": "existing_content"
                        })
                        break
            else:
                # No existing content found
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": f"No content available for {file_name}",
                    "is_extracted": False,
                    "extraction_method": "none"
                })
            continue

        if not lucydom_interface:
            logger.warning(f"No LucyDOM interface available for file {file_name}")
            emit("warning", f"No LucyDOM interface available for file {file_name}")
            continue

        try:
            file_content = await lucydom_interface.read_file_content(file_id)
            if not file_content:
                emit("warning", f"File {file_name} not found")
                continue

            is_image = file_type == "image" or file_name.lower().endswith(
                ('.jpg', '.jpeg', '.png', '.gif', '.webp')
            )

            if is_image:
                if ai_service and hasattr(ai_service, "analyze_image"):
                    try:
                        image_analysis = await ai_service.analyze_image(
                            image_data=file_content,
                            prompt=extraction_prompt,
                            mime_type=content_type
                        )
                        extracted_data.append({
                            "file_id": file_id,
                            "name": file_name,
                            "type": file_type,
                            "content": image_analysis,
                            "is_extracted": True,
                            "extraction_method": "image_analysis"
                        })
                        emit("info", f"Image {file_name} successfully analyzed")
                    except Exception as e:
                        logger.error(f"Error analyzing image {file_name}: {str(e)}")
                        emit("error", f"Error analyzing image {file_name}: {str(e)}")
                else:
                    # Fallback if no image analysis available
                    extracted_data.append({
                        "file_id": file_id,
                        "name": file_name,
                        "type": file_type,
                        "content": f"Image: {file_name} (Analysis not available)",
                        "is_extracted": False,
                        "extraction_method": "none"
                    })
            else:
                # Text-based extraction for all other file types
                try:
                    # Import directly here to avoid circular imports
                    from modules.agentservice_utils import extract_text_from_file_content

                    content, is_extracted = extract_text_from_file_content(
                        file_content, file_name, content_type
                    )
                    extracted_data.append({
                        "file_id": file_id,
                        "name": file_name,
                        "type": file_type,
                        "content": content,
                        "is_extracted": is_extracted,
                        "extraction_method": "text_extraction"
                    })
                    emit("info", f"File {file_name} extracted (Status: {is_extracted})")
                except Exception as e:
                    logger.error(f"Error extracting text from {file_name}: {str(e)}")
                    emit("error", f"Error extracting text from {file_name}: {str(e)}")
        except Exception as e:
            logger.error(f"Error reading file {file_name}: {str(e)}")
            emit("error", f"Error reading file {file_name}: {str(e)}")

    return extracted_data


def _structure_extracted_data(
    extracted_data: List[Dict[str, Any]],
    files: List[Dict[str, Any]],
    prompt: str
) -> Dict[str, Any]:
    """
    Structure the extracted data into a formatted result.

    Args:
        extracted_data: List of extracted data per file
        files: List of all available files
        prompt: Original extraction prompt

    Returns:
        Structured result object with ``status == "success"`` and one
        ``extracted_content`` item per entry of ``extracted_data``
    """
    result = {
        "prompt": prompt,
        "files_processed": len(extracted_data),
        "total_files": len(files),
        "extraction_timestamp": datetime.now().isoformat(),
        "status": "success",
        "extracted_content": []
    }

    for data_item in extracted_data:
        # Enrich with file metadata; embedded components have no metadata of their own
        file_id = data_item.get("file_id", 0)
        file_metadata = _find_file_metadata(files, file_id) or {}

        result["extracted_content"].append({
            "file_id": file_id,
            "name": data_item.get("name", file_metadata.get("name", "")),
            "type": data_item.get("type", file_metadata.get("type", "")),
            "content_type": file_metadata.get("content_type", ""),
            "size": file_metadata.get("size", ""),
            "is_extracted": data_item.get("is_extracted", False),
            "extraction_method": data_item.get("extraction_method", ""),
            "content": data_item.get("content", "")
        })

    return result


def _extract_document_contents_from_messages(file_id: int, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Extract document contents for a specific file from workflow messages.

    Args:
        file_id: ID of the file (int and string forms of the same ID both match)
        messages: List of all messages in the workflow

    Returns:
        List of copies of the document contents for the specified file, each
        extended with ``document_id`` and ``message_id``
    """
    contents = []

    for message in messages:
        for document in message.get("documents", []):
            source = document.get("source", {})
            if not _ids_match(source.get("id"), file_id):
                continue

            for content in document.get("contents", []):
                # Copy so the reference keys do not leak into the original message
                content_copy = content.copy()
                content_copy["document_id"] = document.get("id")
                content_copy["message_id"] = message.get("id")
                contents.append(content_copy)

    return contents


def _log(add_log_func, workflow_id, message, log_type, agent_id=None, agent_name=None):
    """
    Helper function for logging with different log functions.

    Always logs via the module logger; additionally forwards to ``add_log_func``
    when both it and ``workflow_id`` are provided.

    Args:
        add_log_func: Optional function for adding workflow logs
        workflow_id: Optional workflow ID
        message: Log message
        log_type: "error", "warning", or anything else for info level
        agent_id: Optional agent ID passed through to ``add_log_func``
        agent_name: Optional agent name passed through to ``add_log_func``
    """
    if log_type == "error":
        logger.error(message)
    elif log_type == "warning":
        logger.warning(message)
    else:
        logger.info(message)

    if add_log_func and workflow_id:
        add_log_func(workflow_id, message, log_type, agent_id, agent_name)