gateway/gwserver/modules/agentservice_dataextraction.py
2025-04-11 23:39:10 +02:00

494 lines
No EOL
20 KiB
Python

"""
Refactored helper function for intelligent data extraction (continued).
"""
import logging
import json
from typing import List, Dict, Any, Optional, Tuple
import asyncio
from datetime import datetime
logger = logging.getLogger(__name__)
async def data_extraction(
prompt: str,
files: List[Dict[str, Any]],
messages: List[Dict[str, Any]],
ai_service,
lucydom_interface = None,
workflow_id: str = None,
add_log_func = None
) -> Dict[str, Any]:
"""
Führt einen AI Call durch, um zu bestimmen, welche Inhalte aus welchen Dateiobjekten extrahiert werden sollen,
und führt dann die notwendigen Extraktionen durch.
Args:
prompt: Spezifizierung, welche Daten extrahiert werden sollen
files: Liste aller verfügbaren Dateien mit Metadaten
messages: Liste aller Nachrichten im Workflow
ai_service: Service für KI-Anfragen
lucydom_interface: Interface für Datenbankzugriffe (optional)
workflow_id: Optionale ID des Workflows für Logging
add_log_func: Optionale Funktion für das Hinzufügen von Logs
Returns:
Strukturiertes Text-Objekt mit extrahierten Daten und Kontext-Informationen
"""
try:
# 1. AI Call zur Bestimmung der notwendigen Extraktionen
extraction_plan = await _create_extraction_plan(prompt, files, messages, ai_service, workflow_id, add_log_func)
# 2. Extraktionen durchführen
extracted_data = await _execute_extractions(
extraction_plan,
files,
messages,
lucydom_interface,
ai_service,
workflow_id,
add_log_func
)
# 3. Extrahierte Daten strukturieren
structured_result = _structure_extracted_data(extracted_data, files, prompt)
return structured_result
except Exception as e:
logger.error(f"Fehler bei der Datenextraktion: {str(e)}", exc_info=True)
# Fehler-Log hinzufügen
if add_log_func and workflow_id:
add_log_func(workflow_id, f"Fehler bei der Datenextraktion: {str(e)}", "error")
# Fehler-Ergebnis zurückgeben
return {
"error": str(e),
"status": "error",
"files_processed": len(files),
"message": f"Die Datenextraktion konnte nicht durchgeführt werden: {str(e)}"
}
async def _create_extraction_plan(
prompt: str,
files: List[Dict[str, Any]],
messages: List[Dict[str, Any]],
ai_service,
workflow_id: str = None,
add_log_func = None
) -> List[Dict[str, Any]]:
"""
Erstellt einen Extraktionsplan mit AI-Unterstützung.
Args:
prompt: Spezifizierung, welche Daten extrahiert werden sollen
files: Liste aller verfügbaren Dateien mit Metadaten
messages: Liste aller Nachrichten im Workflow
ai_service: Service für KI-Anfragen
workflow_id: Optionale ID des Workflows für Logging
add_log_func: Optionale Funktion für das Hinzufügen von Logs
Returns:
Extraktionsplan (Liste von Extraktionsanweisungen pro Datei)
"""
# Erstelle Kontext-Informationen für den AI Call
file_infos = []
for file in files:
# Basis-Metadaten
file_info = {
"id": file.get("id", ""),
"name": file.get("name", ""),
"type": file.get("type", ""),
"content_type": file.get("content_type", ""),
"size": file.get("size", "")
}
# Extraktionsstatus prüfen (falls vorhanden)
doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)
if doc_contents:
# Prüfen, ob mindestens ein Content mit is_extracted=True existiert
already_extracted = any(
content.get("is_extracted", False) for content in doc_contents
)
file_info["already_extracted"] = already_extracted
# Eine kurze Vorschau des Inhalts hinzufügen (falls verfügbar)
for content in doc_contents:
if content.get("type") == "text" and content.get("text"):
preview_text = content.get("text", "")[:200] + "..." if len(content.get("text", "")) > 200 else content.get("text", "")
file_info["content_preview"] = preview_text
break
else:
file_info["already_extracted"] = False
file_infos.append(file_info)
# AI-Prompt erstellen
extraction_prompt = f"""
Du bist ein Datenextraktionsexperte, der mithilfe von KI-Analyse entscheidet, welche Dateien
und Inhalte für eine bestimmte Aufgabe extrahiert werden müssen.
AUFGABE:
{prompt}
VERFÜGBARE DATEIEN:
{json.dumps(file_infos, indent=2)}
Für jede Datei, die für die Aufgabe relevant ist, erstelle eine Extraktionsanweisung mit den folgenden Informationen:
1. file_id: Die ID der zu extrahierenden Datei
2. extract_needed: Boolean, ob eine Extraktion erforderlich ist (True, wenn die Datei noch nicht extrahiert wurde und für die Aufgabe benötigt wird)
3. extraction_prompt: Ein spezifischer Prompt für die Extraktion der Datei (besonders wichtig für Bilder und nicht-textbasierte Dateien)
4. importance: Priorität/Wichtigkeit für die Aufgabe (1-5, wobei 5 am wichtigsten ist)
Format:
[
{{
"file_id": 1234,
"extract_needed": true,
"extraction_prompt": "Extrahiere die Tabellendaten mit Fokus auf die Umsatzzahlen",
"importance": 5
}},
...
]
Gib nur das JSON-Array zurück, ohne weitere Erklärungen.
"""
# Log hinzufügen
if add_log_func and workflow_id:
add_log_func(workflow_id, "Extraktionsplan wird erstellt...", "info")
try:
# AI-Call durchführen
extraction_plan_response = await ai_service.call_api([{"role": "user", "content": extraction_prompt}])
# JSON aus der Antwort extrahieren
import re
json_match = re.search(r'\[.*\]', extraction_plan_response, re.DOTALL)
if json_match:
extraction_plan = json.loads(json_match.group(0))
# Log hinzufügen
if add_log_func and workflow_id:
add_log_func(
workflow_id,
f"Extraktionsplan erstellt für {len(extraction_plan)} Dateien",
"info"
)
return extraction_plan
else:
# Fallback bei Parsing-Problemen
if add_log_func and workflow_id:
add_log_func(
workflow_id,
"Parsing-Fehler beim Extraktionsplan, erstelle Standard-Plan",
"warning"
)
# Standard-Plan: Alle nicht extrahierten Dateien extrahieren
default_plan = []
for file in files:
doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)
already_extracted = any(
content.get("is_extracted", False) for content in doc_contents
) if doc_contents else False
default_plan.append({
"file_id": file.get("id", 0),
"extract_needed": not already_extracted,
"extraction_prompt": f"Extrahiere alle relevanten Informationen aus {file.get('name', '')}",
"importance": 3
})
return default_plan
except Exception as e:
logger.error(f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}", exc_info=True)
if add_log_func and workflow_id:
add_log_func(
workflow_id,
f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}",
"error"
)
# Leerer Plan bei Fehlern
return []
async def _execute_extractions(
extraction_plan: List[Dict[str, Any]],
files: List[Dict[str, Any]],
messages: List[Dict[str, Any]],
lucydom_interface,
ai_service,
workflow_id: str = None,
add_log_func = None,
logging_utils = None
) -> List[Dict[str, Any]]:
"""
Execute the planned extractions.
Args:
extraction_plan: List of extraction instructions
files: List of all available files
lucydom_interface: Interface for database access
ai_service: Service for AI requests
workflow_id: Optional workflow ID for logging
add_log_func: Optional function for adding logs
logging_utils: Optional logging utility
Returns:
List with extracted data per file
"""
extracted_data = []
# Sort by importance
sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)
for extraction_item in sorted_plan:
file_id = extraction_item.get("file_id")
extract_needed = extraction_item.get("extract_needed", False)
extraction_prompt = extraction_item.get("extraction_prompt", "")
# Find file metadata
file_metadata = next((f for f in files if f.get("id") == file_id), None)
if not file_metadata:
logger.warning(f"File with ID {file_id} not found")
continue
file_name = file_metadata.get("name", "")
file_type = file_metadata.get("type", "")
content_type = file_metadata.get("content_type", "")
# Add log
if logging_utils:
logging_utils.info(f"Processing file: {file_name} (Extraction needed: {extract_needed})", "extraction")
elif add_log_func and workflow_id:
add_log_func(
workflow_id,
f"Processing file: {file_name} (Extraction needed: {extract_needed})",
"info"
)
# Only perform extraction if needed
if extract_needed:
# Get file content via LucyDOM interface
if lucydom_interface:
try:
file_content = await lucydom_interface.read_file_content(file_id)
if not file_content:
if logging_utils:
logging_utils.warning(f"File {file_name} not found", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"File {file_name} not found", "warning")
continue
# Perform extraction based on file type
if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
# Image analysis with AI service
if ai_service and hasattr(ai_service, "analyze_image"):
try:
image_analysis = await ai_service.analyze_image(
image_data=file_content,
prompt=extraction_prompt,
mime_type=content_type
)
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": image_analysis,
"is_extracted": True,
"extraction_method": "image_analysis"
})
if logging_utils:
logging_utils.info(f"Image {file_name} successfully analyzed", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Image {file_name} successfully analyzed", "info")
except Exception as e:
logger.error(f"Error analyzing image {file_name}: {str(e)}")
if logging_utils:
logging_utils.error(f"Error analyzing image {file_name}: {str(e)}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error")
else:
# Fallback if no image analysis available
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"Image: {file_name} (Analysis not available)",
"is_extracted": False,
"extraction_method": "none"
})
else:
# Text-based extraction for all other file types
try:
# Import directly here to avoid circular imports
from modules.agentservice_utils import extract_text_from_file_content
content, is_extracted = extract_text_from_file_content(
file_content, file_name, content_type
)
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": content,
"is_extracted": is_extracted,
"extraction_method": "text_extraction"
})
if logging_utils:
logging_utils.info(f"File {file_name} extracted (Status: {is_extracted})", "extraction")
elif add_log_func and workflow_id:
add_log_func(
workflow_id,
f"File {file_name} extracted (Status: {is_extracted})",
"info"
)
except Exception as e:
logger.error(f"Error extracting text from {file_name}: {str(e)}")
if logging_utils:
logging_utils.error(f"Error extracting text from {file_name}: {str(e)}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Error extracting text from {file_name}: {str(e)}", "error")
except Exception as e:
logger.error(f"Error reading file {file_name}: {str(e)}")
if logging_utils:
logging_utils.error(f"Error reading file {file_name}: {str(e)}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Error reading file {file_name}: {str(e)}", "error")
else:
logger.warning(f"No LucyDOM interface available for file {file_name}")
if logging_utils:
logging_utils.warning(f"No LucyDOM interface available for file {file_name}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"No LucyDOM interface available for file {file_name}", "warning")
else:
# No extraction needed, use existing content
doc_contents = _extract_document_contents_from_messages(file_id, messages)
if doc_contents:
# Use first text content
for content in doc_contents:
if content.get("type") == "text":
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": content.get("text", ""),
"is_extracted": content.get("is_extracted", False),
"extraction_method": "existing_content"
})
break
else:
# No existing content found
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"No content available for {file_name}",
"is_extracted": False,
"extraction_method": "none"
})
return extracted_data
def _structure_extracted_data(
extracted_data: List[Dict[str, Any]],
files: List[Dict[str, Any]],
prompt: str
) -> Dict[str, Any]:
"""
Structure the extracted data into a formatted result.
Args:
extracted_data: List of extracted data per file
files: List of all available files
prompt: Original extraction prompt
Returns:
Structured result object
"""
# Create base structure
result = {
"prompt": prompt,
"files_processed": len(extracted_data),
"total_files": len(files),
"extraction_timestamp": datetime.now().isoformat(),
"status": "success",
"extracted_content": []
}
# Add extracted content
for data_item in extracted_data:
# Enrich with file metadata
file_id = data_item.get("file_id", 0)
file_metadata = next((f for f in files if f.get("id") == file_id), {})
content_item = {
"file_id": file_id,
"name": data_item.get("name", file_metadata.get("name", "")),
"type": data_item.get("type", file_metadata.get("type", "")),
"content_type": file_metadata.get("content_type", ""),
"size": file_metadata.get("size", ""),
"is_extracted": data_item.get("is_extracted", False),
"extraction_method": data_item.get("extraction_method", ""),
"content": data_item.get("content", "")
}
result["extracted_content"].append(content_item)
return result
def _extract_document_contents_from_messages(file_id: int, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Extract document contents for a specific file from workflow messages.
Args:
file_id: ID of the file
messages: List of all messages in the workflow
Returns:
List of document contents for the specified file
"""
contents = []
for message in messages:
# Search documents in the message
for document in message.get("documents", []):
source = document.get("source", {})
# Check if file ID matches
if source.get("id") == file_id or (source.get("type") == "file" and source.get("id") == file_id):
# Add contents of the file
doc_contents = document.get("contents", [])
if doc_contents:
contents.extend(doc_contents)
return contents
def _log(add_log_func, workflow_id, message, log_type, agent_id=None, agent_name=None):
"""Helper function for logging with different log functions"""
# Log via logger instance
if log_type == "error":
logger.error(message)
elif log_type == "warning":
logger.warning(message)
else:
logger.info(message)
# Log via provided log function (if available)
if add_log_func and workflow_id:
add_log_func(workflow_id, message, log_type, agent_id, agent_name)