gateway/gwserver/modules/agentservice_dataextraction.py
2025-04-14 20:05:33 +02:00

742 lines
No EOL
30 KiB
Python

"""
Refactored helper function for intelligent data extraction (continued).
"""
import logging
import json
from typing import List, Dict, Any, Optional, Tuple
import asyncio
from datetime import datetime
logger = logging.getLogger(__name__)
async def data_extraction(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    lucydom_interface = None,
    workflow_id: Optional[str] = None,
    add_log_func = None,
    document_handler = None  # Preferred extraction backend when provided
) -> Dict[str, Any]:
    """
    Perform AI-driven data extraction with support for the document handler.

    Flow: an extraction plan is first created via the AI service, the plan is
    then executed either through the document handler (if one is supplied) or
    through the legacy LucyDOM-based path, and the per-file results are folded
    into a single structured result object.

    Args:
        prompt: Specification of what data to extract
        files: List of all available files with metadata
        messages: List of all messages in the workflow
        ai_service: Service for AI requests
        lucydom_interface: Interface for database access (optional; only used
            when no document handler is given)
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs
        document_handler: Optional document handler for structured document
            operations; takes precedence over lucydom_interface

    Returns:
        Structured result object with extracted data and context information,
        or an error payload (``"status": "error"``) when anything fails —
        callers always receive a dict, never an exception.
    """
    try:
        # Create extraction plan using AI
        extraction_plan = await _create_extraction_plan(prompt, files, messages, ai_service, workflow_id, add_log_func)
        # Execute extractions, preferring document handler if available
        if document_handler:
            extracted_data = await _execute_extractions_with_handler(
                extraction_plan,
                files,
                messages,
                document_handler,
                ai_service,
                workflow_id,
                add_log_func
            )
        else:
            # Fall back to original implementation
            extracted_data = await _execute_extractions(
                extraction_plan,
                files,
                messages,
                lucydom_interface,
                ai_service,
                workflow_id,
                add_log_func
            )
        # Structure extracted data
        structured_result = _structure_extracted_data(extracted_data, files, prompt)
        return structured_result
    except Exception as e:
        logger.error(f"Error in data extraction: {str(e)}", exc_info=True)
        # Add error log
        if add_log_func and workflow_id:
            add_log_func(workflow_id, f"Data extraction error: {str(e)}", "error")
        # Return error result (dict shape mirrors the success payload keys)
        return {
            "error": str(e),
            "status": "error",
            "files_processed": len(files),
            "message": f"Data extraction failed: {str(e)}"
        }
async def _execute_extractions_with_handler(
extraction_plan: List[Dict[str, Any]],
files: List[Dict[str, Any]],
messages: List[Dict[str, Any]],
document_handler,
ai_service,
workflow_id: str = None,
add_log_func = None
) -> List[Dict[str, Any]]:
"""
Execute extractions using the document handler.
Args:
extraction_plan: List of extraction instructions
files: List of all available files
messages: List of all messages
document_handler: Document handler for structured operations
ai_service: Service for AI requests
workflow_id: Optional workflow ID for logging
add_log_func: Optional function for adding logs
Returns:
List with extracted data per file
"""
extracted_data = []
# Sort by importance (highest first)
sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)
for extraction_item in sorted_plan:
file_id = extraction_item.get("file_id")
extract_needed = extraction_item.get("extract_needed", False)
extraction_prompt = extraction_item.get("extraction_prompt", "")
# Find file metadata
file_metadata = next((f for f in files if f.get("id") == file_id), None)
if not file_metadata:
logger.warning(f"File with ID {file_id} not found")
continue
file_name = file_metadata.get("name", "")
file_type = file_metadata.get("type", "")
content_type = file_metadata.get("content_type", "")
# Log
if add_log_func and workflow_id:
add_log_func(
workflow_id,
f"Processing file: {file_name} (Extraction needed: {extract_needed})",
"info"
)
# Only perform extraction if needed
if extract_needed:
# Find document in existing messages if available
existing_content = _find_document_in_messages(file_id, messages)
# Check if we should use document handler for contextual extraction
if existing_content:
# If document exists but needs contextual extraction
document_id = existing_content.get("document_id")
message_id = existing_content.get("message_id")
if document_id and message_id:
# Find the message containing the document
for message in messages:
if message.get("id") == message_id:
# Extract content with context
try:
# Find document reference
doc_reference = None
for doc in message.get("documents", []):
if doc.get("id") == document_id:
doc_reference = doc
break
if doc_reference:
# Use document handler to perform contextual extraction
extracted_text = await document_handler.extract_document_content(
document_id,
file_id,
extraction_prompt
)
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": extracted_text,
"is_extracted": True,
"extraction_method": "contextual_extraction"
})
if add_log_func and workflow_id:
add_log_func(
workflow_id,
f"Contextual extraction for {file_name}: {extraction_prompt}",
"info"
)
continue
except Exception as e:
logger.error(f"Error in contextual extraction for {file_name}: {str(e)}")
# If we reach here, we need to perform a new extraction
try:
file_content = await document_handler.add_file_to_message(
{}, # Empty message to extract just the document
file_id,
extraction_prompt
)
# Get the extracted content from the document
if "documents" in file_content and file_content["documents"]:
doc = file_content["documents"][0]
content_text = ""
is_extracted = False
for content in doc.get("contents", []):
if content.get("type") == "text":
content_text = content.get("text", "")
is_extracted = content.get("is_extracted", False)
break
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": content_text,
"is_extracted": is_extracted,
"extraction_method": "document_handler"
})
if add_log_func and workflow_id:
add_log_func(
workflow_id,
f"Extracted {file_name} using document handler",
"info"
)
else:
# Extraction failed
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"Failed to extract content from {file_name}",
"is_extracted": False,
"extraction_method": "failed"
})
except Exception as e:
logger.error(f"Error extracting {file_name}: {str(e)}")
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"Error extracting: {str(e)}",
"is_extracted": False,
"extraction_method": "error"
})
else:
# No extraction needed, use existing content
existing_content = _find_document_in_messages(file_id, messages)
if existing_content:
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": existing_content.get("content", ""),
"is_extracted": existing_content.get("is_extracted", False),
"extraction_method": "existing_content"
})
else:
# No existing content found
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"No content available for {file_name}",
"is_extracted": False,
"extraction_method": "none"
})
return extracted_data
def _find_document_in_messages(file_id: int, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Find a document by file ID in workflow messages.
Args:
file_id: ID of the file to find
messages: List of messages to search
Returns:
Dictionary with document information or empty dict if not found
"""
for message in messages:
for doc_index, document in enumerate(message.get("documents", [])):
source = document.get("source", {})
# Check if file ID matches
if source.get("id") == str(file_id) or source.get("id") == file_id:
# Found the document
content_text = ""
is_extracted = False
# Look for text content
for content in document.get("contents", []):
if content.get("type") == "text":
content_text = content.get("text", "")
is_extracted = content.get("is_extracted", False)
break
return {
"document_id": document.get("id"),
"message_id": message.get("id"),
"content": content_text,
"is_extracted": is_extracted
}
return {}
async def _create_extraction_plan(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    workflow_id: Optional[str] = None,
    add_log_func = None
) -> List[Dict[str, Any]]:
    """
    Create an extraction plan with AI assistance.

    Args:
        prompt: Specification of which data should be extracted
        files: List of all available files with metadata
        messages: List of all messages in the workflow
        ai_service: Service for AI requests
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs

    Returns:
        Extraction plan (list of extraction instructions per file). On
        parsing problems a default plan (extract every not-yet-extracted
        file) is returned; on unexpected errors an empty list.
    """
    # Build context information about each file for the AI call
    file_infos = []
    for file in files:
        # Base metadata
        file_info = {
            "id": file.get("id", ""),
            "name": file.get("name", ""),
            "type": file.get("type", ""),
            "content_type": file.get("content_type", ""),
            "size": file.get("size", "")
        }
        # Check extraction status (if document contents already exist)
        doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)
        if doc_contents:
            # At least one content with is_extracted=True counts as extracted
            file_info["already_extracted"] = any(
                content.get("is_extracted", False) for content in doc_contents
            )
            # Add a short preview of the first text content (if available)
            for content in doc_contents:
                if content.get("type") == "text" and content.get("text"):
                    text = content.get("text", "")
                    file_info["content_preview"] = text[:200] + "..." if len(text) > 200 else text
                    break
        else:
            file_info["already_extracted"] = False
        file_infos.append(file_info)
    # Build the AI prompt (German on purpose — matches the deployed model setup)
    extraction_prompt = f"""
Du bist ein Datenextraktionsexperte, der mithilfe von KI-Analyse entscheidet, welche Dateien
und Inhalte für eine bestimmte Aufgabe extrahiert werden müssen.
AUFGABE:
{prompt}
VERFÜGBARE DATEIEN:
{json.dumps(file_infos, indent=2)}
Für jede Datei, die für die Aufgabe relevant ist, erstelle eine Extraktionsanweisung mit den folgenden Informationen:
1. file_id: Die ID der zu extrahierenden Datei
2. extract_needed: Boolean, ob eine Extraktion erforderlich ist (True, wenn die Datei noch nicht extrahiert wurde und für die Aufgabe benötigt wird)
3. extraction_prompt: Ein spezifischer Prompt für die Extraktion der Datei (besonders wichtig für Bilder und nicht-textbasierte Dateien)
4. importance: Priorität/Wichtigkeit für die Aufgabe (1-5, wobei 5 am wichtigsten ist)
Format:
[
{{
"file_id": 1234,
"extract_needed": true,
"extraction_prompt": "Extrahiere die Tabellendaten mit Fokus auf die Umsatzzahlen",
"importance": 5
}},
...
]
Gib nur das JSON-Array zurück, ohne weitere Erklärungen.
"""
    # Add log
    if add_log_func and workflow_id:
        add_log_func(workflow_id, "Extraktionsplan wird erstellt...", "info")
    try:
        # Perform the AI call
        extraction_plan_response = await ai_service.call_api([{"role": "user", "content": extraction_prompt}])
        # Extract the JSON array from the response
        import re
        extraction_plan = None
        json_match = re.search(r'\[.*\]', extraction_plan_response, re.DOTALL)
        if json_match:
            try:
                extraction_plan = json.loads(json_match.group(0))
            except json.JSONDecodeError:
                # BUGFIX: a malformed JSON answer used to escape to the outer
                # handler and return [] — fall through to the default plan,
                # which is the documented behavior for parsing problems.
                extraction_plan = None
        if extraction_plan is not None:
            # Add log
            if add_log_func and workflow_id:
                add_log_func(
                    workflow_id,
                    f"Extraktionsplan erstellt für {len(extraction_plan)} Dateien",
                    "info"
                )
            return extraction_plan
        # Fallback on parsing problems
        if add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                "Parsing-Fehler beim Extraktionsplan, erstelle Standard-Plan",
                "warning"
            )
        # Default plan: extract every file that was not extracted yet
        default_plan = []
        for file in files:
            doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)
            already_extracted = any(
                content.get("is_extracted", False) for content in doc_contents
            ) if doc_contents else False
            default_plan.append({
                "file_id": file.get("id", 0),
                "extract_needed": not already_extracted,
                "extraction_prompt": f"Extrahiere alle relevanten Informationen aus {file.get('name', '')}",
                "importance": 3
            })
        return default_plan
    except Exception as e:
        logger.error(f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}", exc_info=True)
        if add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}",
                "error"
            )
        # Empty plan on unexpected errors
        return []
async def _execute_extractions(
extraction_plan: List[Dict[str, Any]],
files: List[Dict[str, Any]],
messages: List[Dict[str, Any]],
lucydom_interface,
ai_service,
workflow_id: str = None,
add_log_func = None,
logging_utils = None
) -> List[Dict[str, Any]]:
"""
Execute the planned extractions.
Args:
extraction_plan: List of extraction instructions
files: List of all available files
lucydom_interface: Interface for database access
ai_service: Service for AI requests
workflow_id: Optional workflow ID for logging
add_log_func: Optional function for adding logs
logging_utils: Optional logging utility
Returns:
List with extracted data per file
"""
extracted_data = []
# Sort by importance
sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)
for extraction_item in sorted_plan:
file_id = extraction_item.get("file_id")
extract_needed = extraction_item.get("extract_needed", False)
extraction_prompt = extraction_item.get("extraction_prompt", "")
# Find file metadata
file_metadata = next((f for f in files if f.get("id") == file_id), None)
if not file_metadata:
logger.warning(f"File with ID {file_id} not found")
continue
file_name = file_metadata.get("name", "")
file_type = file_metadata.get("type", "")
content_type = file_metadata.get("content_type", "")
# Add log
if logging_utils:
logging_utils.info(f"Processing file: {file_name} (Extraction needed: {extract_needed})", "extraction")
elif add_log_func and workflow_id:
add_log_func(
workflow_id,
f"Processing file: {file_name} (Extraction needed: {extract_needed})",
"info"
)
# Only perform extraction if needed
if extract_needed:
# Get file content via LucyDOM interface
if lucydom_interface:
try:
file_content = await lucydom_interface.read_file_content(file_id)
if not file_content:
if logging_utils:
logging_utils.warning(f"File {file_name} not found", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"File {file_name} not found", "warning")
continue
# Perform extraction based on file type
if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
# Image analysis with AI service
if ai_service and hasattr(ai_service, "analyze_image"):
try:
image_analysis = await ai_service.analyze_image(
image_data=file_content,
prompt=extraction_prompt,
mime_type=content_type
)
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": image_analysis,
"is_extracted": True,
"extraction_method": "image_analysis"
})
if logging_utils:
logging_utils.info(f"Image {file_name} successfully analyzed", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Image {file_name} successfully analyzed", "info")
except Exception as e:
logger.error(f"Error analyzing image {file_name}: {str(e)}")
if logging_utils:
logging_utils.error(f"Error analyzing image {file_name}: {str(e)}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error")
else:
# Fallback if no image analysis available
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"Image: {file_name} (Analysis not available)",
"is_extracted": False,
"extraction_method": "none"
})
else:
# Text-based extraction for all other file types
try:
# Import directly here to avoid circular imports
from modules.agentservice_utils import extract_text_from_file_content
content, is_extracted = extract_text_from_file_content(
file_content, file_name, content_type
)
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": content,
"is_extracted": is_extracted,
"extraction_method": "text_extraction"
})
if logging_utils:
logging_utils.info(f"File {file_name} extracted (Status: {is_extracted})", "extraction")
elif add_log_func and workflow_id:
add_log_func(
workflow_id,
f"File {file_name} extracted (Status: {is_extracted})",
"info"
)
except Exception as e:
logger.error(f"Error extracting text from {file_name}: {str(e)}")
if logging_utils:
logging_utils.error(f"Error extracting text from {file_name}: {str(e)}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Error extracting text from {file_name}: {str(e)}", "error")
except Exception as e:
logger.error(f"Error reading file {file_name}: {str(e)}")
if logging_utils:
logging_utils.error(f"Error reading file {file_name}: {str(e)}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Error reading file {file_name}: {str(e)}", "error")
else:
logger.warning(f"No LucyDOM interface available for file {file_name}")
if logging_utils:
logging_utils.warning(f"No LucyDOM interface available for file {file_name}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"No LucyDOM interface available for file {file_name}", "warning")
else:
# No extraction needed, use existing content
doc_contents = _extract_document_contents_from_messages(file_id, messages)
if doc_contents:
# Use first text content
for content in doc_contents:
if content.get("type") == "text":
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": content.get("text", ""),
"is_extracted": content.get("is_extracted", False),
"extraction_method": "existing_content"
})
break
else:
# No existing content found
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"No content available for {file_name}",
"is_extracted": False,
"extraction_method": "none"
})
return extracted_data
def _structure_extracted_data(
extracted_data: List[Dict[str, Any]],
files: List[Dict[str, Any]],
prompt: str
) -> Dict[str, Any]:
"""
Structure the extracted data into a formatted result.
Args:
extracted_data: List of extracted data per file
files: List of all available files
prompt: Original extraction prompt
Returns:
Structured result object
"""
# Create base structure
result = {
"prompt": prompt,
"files_processed": len(extracted_data),
"total_files": len(files),
"extraction_timestamp": datetime.now().isoformat(),
"status": "success",
"extracted_content": []
}
# Add extracted content
for data_item in extracted_data:
# Enrich with file metadata
file_id = data_item.get("file_id", 0)
file_metadata = next((f for f in files if f.get("id") == file_id), {})
content_item = {
"file_id": file_id,
"name": data_item.get("name", file_metadata.get("name", "")),
"type": data_item.get("type", file_metadata.get("type", "")),
"content_type": file_metadata.get("content_type", ""),
"size": file_metadata.get("size", ""),
"is_extracted": data_item.get("is_extracted", False),
"extraction_method": data_item.get("extraction_method", ""),
"content": data_item.get("content", "")
}
result["extracted_content"].append(content_item)
return result
def _extract_document_contents_from_messages(file_id: int, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Extract document contents for a specific file from workflow messages.
Enhanced to handle the new document structure.
Args:
file_id: ID of the file
messages: List of all messages in the workflow
Returns:
List of document contents for the specified file
"""
contents = []
for message in messages:
# Search documents in the message
for document in message.get("documents", []):
source = document.get("source", {})
# Check if file ID matches (handle both string and int comparison)
if (source.get("id") == file_id or
(isinstance(source.get("id"), str) and source.get("id") == str(file_id)) or
(isinstance(file_id, str) and source.get("id") == int(file_id))):
# Add contents of the file
doc_contents = document.get("contents", [])
if doc_contents:
# Ensure each content has document reference
for content in doc_contents:
content_copy = content.copy()
content_copy["document_id"] = document.get("id")
content_copy["message_id"] = message.get("id")
contents.append(content_copy)
return contents
def _log(add_log_func, workflow_id, message, log_type, agent_id=None, agent_name=None):
    """Route a message to the module logger and, when possible, to the
    caller-supplied log function.

    Unknown log types fall back to info level. The callback is only invoked
    when both add_log_func and workflow_id are provided.
    """
    # Pick the matching logger method; anything unrecognized logs as info.
    emit = {"error": logger.error, "warning": logger.warning}.get(log_type, logger.info)
    emit(message)
    # Forward to the provided log function (if available)
    if add_log_func and workflow_id:
        add_log_func(workflow_id, message, log_type, agent_id, agent_name)