494 lines
No EOL
20 KiB
Python
494 lines
No EOL
20 KiB
Python
"""
|
|
Refactored helper function for intelligent data extraction (continued).
|
|
"""
|
|
|
|
import asyncio
import json
import logging
import re
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
async def data_extraction(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    lucydom_interface=None,
    workflow_id: Optional[str] = None,
    add_log_func: Optional[Callable] = None
) -> Dict[str, Any]:
    """
    Run an AI call to decide which contents should be extracted from which
    file objects, then perform the required extractions.

    Args:
        prompt: Specification of which data should be extracted.
        files: List of all available files with their metadata.
        messages: List of all messages in the workflow.
        ai_service: Service used for AI requests.
        lucydom_interface: Interface for database/file access (optional).
        workflow_id: Optional workflow ID used for logging.
        add_log_func: Optional callback for appending log entries.

    Returns:
        Structured dict with the extracted data and context information;
        on failure, a dict with "status": "error" and an error message
        (this function never raises — all exceptions are converted into
        the error result).
    """
    try:
        # 1. AI call to determine which extractions are necessary.
        extraction_plan = await _create_extraction_plan(
            prompt, files, messages, ai_service, workflow_id, add_log_func
        )

        # 2. Perform the planned extractions.
        extracted_data = await _execute_extractions(
            extraction_plan,
            files,
            messages,
            lucydom_interface,
            ai_service,
            workflow_id,
            add_log_func
        )

        # 3. Structure the extracted data into the result object.
        return _structure_extracted_data(extracted_data, files, prompt)

    except Exception as e:
        logger.error(f"Fehler bei der Datenextraktion: {str(e)}", exc_info=True)

        # Forward the error to the external log as well, when possible.
        if add_log_func and workflow_id:
            add_log_func(workflow_id, f"Fehler bei der Datenextraktion: {str(e)}", "error")

        # Return an error result instead of raising.
        return {
            "error": str(e),
            "status": "error",
            "files_processed": len(files),
            "message": f"Die Datenextraktion konnte nicht durchgeführt werden: {str(e)}"
        }
|
|
|
|
def _has_extracted_content(doc_contents: List[Dict[str, Any]]) -> bool:
    """Return True if at least one content entry is flagged `is_extracted`."""
    if not doc_contents:
        return False
    return any(content.get("is_extracted", False) for content in doc_contents)


def _first_text_preview(doc_contents: List[Dict[str, Any]], max_chars: int = 200) -> Optional[str]:
    """Return a preview (truncated to *max_chars* + "...") of the first non-empty
    text content in *doc_contents*, or None if there is none."""
    for content in doc_contents:
        if content.get("type") == "text" and content.get("text"):
            text = content.get("text", "")
            return text[:max_chars] + "..." if len(text) > max_chars else text
    return None


async def _create_extraction_plan(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    workflow_id: Optional[str] = None,
    add_log_func: Optional[Callable] = None
) -> List[Dict[str, Any]]:
    """
    Create an extraction plan with AI support.

    Args:
        prompt: Specification of which data should be extracted.
        files: List of all available files with their metadata.
        messages: List of all messages in the workflow.
        ai_service: Service used for AI requests.
        workflow_id: Optional workflow ID used for logging.
        add_log_func: Optional callback for appending log entries.

    Returns:
        Extraction plan: a list of extraction instructions, one per file.
        Falls back to a default "extract everything not yet extracted" plan
        when the AI response cannot be parsed, and to an empty list on error.
    """
    # Build context information (metadata + extraction status) for the AI call.
    file_infos = []
    for file in files:
        # Base metadata.
        file_info = {
            "id": file.get("id", ""),
            "name": file.get("name", ""),
            "type": file.get("type", ""),
            "content_type": file.get("content_type", ""),
            "size": file.get("size", "")
        }

        # Check extraction status from previously stored message contents.
        doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)

        # Shared helper replaces the status check that was previously
        # duplicated here and in the fallback plan below.
        file_info["already_extracted"] = _has_extracted_content(doc_contents)

        # Add a short content preview, when text content is available.
        preview = _first_text_preview(doc_contents)
        if preview is not None:
            file_info["content_preview"] = preview

        file_infos.append(file_info)

    # Build the AI prompt (German, sent verbatim to the model).
    extraction_prompt = f"""
Du bist ein Datenextraktionsexperte, der mithilfe von KI-Analyse entscheidet, welche Dateien
und Inhalte für eine bestimmte Aufgabe extrahiert werden müssen.

AUFGABE:
{prompt}

VERFÜGBARE DATEIEN:
{json.dumps(file_infos, indent=2)}

Für jede Datei, die für die Aufgabe relevant ist, erstelle eine Extraktionsanweisung mit den folgenden Informationen:
1. file_id: Die ID der zu extrahierenden Datei
2. extract_needed: Boolean, ob eine Extraktion erforderlich ist (True, wenn die Datei noch nicht extrahiert wurde und für die Aufgabe benötigt wird)
3. extraction_prompt: Ein spezifischer Prompt für die Extraktion der Datei (besonders wichtig für Bilder und nicht-textbasierte Dateien)
4. importance: Priorität/Wichtigkeit für die Aufgabe (1-5, wobei 5 am wichtigsten ist)

Format:
[
{{
"file_id": 1234,
"extract_needed": true,
"extraction_prompt": "Extrahiere die Tabellendaten mit Fokus auf die Umsatzzahlen",
"importance": 5
}},
...
]

Gib nur das JSON-Array zurück, ohne weitere Erklärungen.
"""

    if add_log_func and workflow_id:
        add_log_func(workflow_id, "Extraktionsplan wird erstellt...", "info")

    try:
        # Perform the AI call.
        extraction_plan_response = await ai_service.call_api(
            [{"role": "user", "content": extraction_prompt}]
        )

        # Extract the JSON array from the response text (`re` was previously
        # imported inline here; it is now a module-level import).
        json_match = re.search(r'\[.*\]', extraction_plan_response, re.DOTALL)

        if json_match:
            extraction_plan = json.loads(json_match.group(0))

            if add_log_func and workflow_id:
                add_log_func(
                    workflow_id,
                    f"Extraktionsplan erstellt für {len(extraction_plan)} Dateien",
                    "info"
                )

            return extraction_plan
        else:
            # Fallback on parsing problems.
            if add_log_func and workflow_id:
                add_log_func(
                    workflow_id,
                    "Parsing-Fehler beim Extraktionsplan, erstelle Standard-Plan",
                    "warning"
                )

            # Default plan: extract every file that has not been extracted yet.
            default_plan = []
            for file in files:
                doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)

                default_plan.append({
                    # "" as the missing-id default, consistent with file_infos
                    # above (this branch previously used 0).
                    "file_id": file.get("id", ""),
                    "extract_needed": not _has_extracted_content(doc_contents),
                    "extraction_prompt": f"Extrahiere alle relevanten Informationen aus {file.get('name', '')}",
                    "importance": 3
                })

            return default_plan

    except Exception as e:
        logger.error(f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}", exc_info=True)

        if add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}",
                "error"
            )

        # Empty plan on errors.
        return []
|
|
|
|
async def _execute_extractions(
    extraction_plan: List[Dict[str, Any]],
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    lucydom_interface,
    ai_service,
    workflow_id: Optional[str] = None,
    add_log_func = None,
    logging_utils = None
) -> List[Dict[str, Any]]:
    """
    Execute the planned extractions.

    Processes the plan in descending `importance` order. For each item it
    either reads and extracts fresh content via `lucydom_interface` (image
    analysis or text extraction) or reuses content already stored in the
    workflow messages. All per-file failures are logged and skipped; this
    function itself does not raise for individual files.

    Args:
        extraction_plan: List of extraction instructions (see _create_extraction_plan).
        files: List of all available files with metadata.
        messages: List of all messages in the workflow (source of already-extracted content).
        lucydom_interface: Interface for database/file access; without it, no fresh extraction happens.
        ai_service: Service for AI requests (used for image analysis when it has `analyze_image`).
        workflow_id: Optional workflow ID for logging.
        add_log_func: Optional function for adding logs (fallback when `logging_utils` is absent).
        logging_utils: Optional logging utility; takes precedence over `add_log_func`.

    Returns:
        List with extracted data per file. Items carry file_id/name/type,
        the content, an `is_extracted` flag and the `extraction_method` used.
        NOTE(review): a file whose stored doc_contents exist but contain no
        "text"-typed entry produces no item at all — confirm that callers
        tolerate result lists shorter than the plan.
    """
    extracted_data = []

    # Sort by importance (highest first) so the most relevant files are
    # processed first.
    sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)

    for extraction_item in sorted_plan:
        file_id = extraction_item.get("file_id")
        extract_needed = extraction_item.get("extract_needed", False)
        extraction_prompt = extraction_item.get("extraction_prompt", "")

        # Find file metadata matching the plan entry.
        # NOTE(review): this is an exact `==` match — plan ids coming back
        # from the AI as ints will not match string file ids; confirm id types.
        file_metadata = next((f for f in files if f.get("id") == file_id), None)

        if not file_metadata:
            logger.warning(f"File with ID {file_id} not found")
            continue

        file_name = file_metadata.get("name", "")
        file_type = file_metadata.get("type", "")
        content_type = file_metadata.get("content_type", "")

        # Log progress; `logging_utils` wins over the plain callback.
        if logging_utils:
            logging_utils.info(f"Processing file: {file_name} (Extraction needed: {extract_needed})", "extraction")
        elif add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                f"Processing file: {file_name} (Extraction needed: {extract_needed})",
                "info"
            )

        # Only perform extraction if needed.
        if extract_needed:
            # Get file content via the LucyDOM interface.
            if lucydom_interface:
                try:
                    file_content = await lucydom_interface.read_file_content(file_id)

                    if not file_content:
                        if logging_utils:
                            logging_utils.warning(f"File {file_name} not found", "extraction")
                        elif add_log_func and workflow_id:
                            add_log_func(workflow_id, f"File {file_name} not found", "warning")
                        continue

                    # Perform extraction based on file type.
                    if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
                        # Image analysis with the AI service (only when it
                        # exposes `analyze_image`).
                        if ai_service and hasattr(ai_service, "analyze_image"):
                            try:
                                image_analysis = await ai_service.analyze_image(
                                    image_data=file_content,
                                    prompt=extraction_prompt,
                                    mime_type=content_type
                                )

                                extracted_data.append({
                                    "file_id": file_id,
                                    "name": file_name,
                                    "type": file_type,
                                    "content": image_analysis,
                                    "is_extracted": True,
                                    "extraction_method": "image_analysis"
                                })

                                if logging_utils:
                                    logging_utils.info(f"Image {file_name} successfully analyzed", "extraction")
                                elif add_log_func and workflow_id:
                                    add_log_func(workflow_id, f"Image {file_name} successfully analyzed", "info")
                            except Exception as e:
                                logger.error(f"Error analyzing image {file_name}: {str(e)}")
                                if logging_utils:
                                    logging_utils.error(f"Error analyzing image {file_name}: {str(e)}", "extraction")
                                elif add_log_func and workflow_id:
                                    add_log_func(workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error")
                        else:
                            # Fallback if no image analysis is available:
                            # record the file without content.
                            extracted_data.append({
                                "file_id": file_id,
                                "name": file_name,
                                "type": file_type,
                                "content": f"Image: {file_name} (Analysis not available)",
                                "is_extracted": False,
                                "extraction_method": "none"
                            })
                    else:
                        # Text-based extraction for all other file types.
                        try:
                            # Import directly here to avoid circular imports.
                            from modules.agentservice_utils import extract_text_from_file_content

                            content, is_extracted = extract_text_from_file_content(
                                file_content, file_name, content_type
                            )

                            extracted_data.append({
                                "file_id": file_id,
                                "name": file_name,
                                "type": file_type,
                                "content": content,
                                "is_extracted": is_extracted,
                                "extraction_method": "text_extraction"
                            })

                            if logging_utils:
                                logging_utils.info(f"File {file_name} extracted (Status: {is_extracted})", "extraction")
                            elif add_log_func and workflow_id:
                                add_log_func(
                                    workflow_id,
                                    f"File {file_name} extracted (Status: {is_extracted})",
                                    "info"
                                )
                        except Exception as e:
                            logger.error(f"Error extracting text from {file_name}: {str(e)}")
                            if logging_utils:
                                logging_utils.error(f"Error extracting text from {file_name}: {str(e)}", "extraction")
                            elif add_log_func and workflow_id:
                                add_log_func(workflow_id, f"Error extracting text from {file_name}: {str(e)}", "error")
                except Exception as e:
                    logger.error(f"Error reading file {file_name}: {str(e)}")
                    if logging_utils:
                        logging_utils.error(f"Error reading file {file_name}: {str(e)}", "extraction")
                    elif add_log_func and workflow_id:
                        add_log_func(workflow_id, f"Error reading file {file_name}: {str(e)}", "error")
            else:
                logger.warning(f"No LucyDOM interface available for file {file_name}")
                if logging_utils:
                    logging_utils.warning(f"No LucyDOM interface available for file {file_name}", "extraction")
                elif add_log_func and workflow_id:
                    add_log_func(workflow_id, f"No LucyDOM interface available for file {file_name}", "warning")
        else:
            # No extraction needed: reuse content already stored in messages.
            doc_contents = _extract_document_contents_from_messages(file_id, messages)

            if doc_contents:
                # Use the first text content only.
                for content in doc_contents:
                    if content.get("type") == "text":
                        extracted_data.append({
                            "file_id": file_id,
                            "name": file_name,
                            "type": file_type,
                            "content": content.get("text", ""),
                            "is_extracted": content.get("is_extracted", False),
                            "extraction_method": "existing_content"
                        })
                        break
            else:
                # No existing content found for this file.
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": f"No content available for {file_name}",
                    "is_extracted": False,
                    "extraction_method": "none"
                })

    return extracted_data
|
|
|
|
def _structure_extracted_data(
|
|
extracted_data: List[Dict[str, Any]],
|
|
files: List[Dict[str, Any]],
|
|
prompt: str
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Structure the extracted data into a formatted result.
|
|
|
|
Args:
|
|
extracted_data: List of extracted data per file
|
|
files: List of all available files
|
|
prompt: Original extraction prompt
|
|
|
|
Returns:
|
|
Structured result object
|
|
"""
|
|
# Create base structure
|
|
result = {
|
|
"prompt": prompt,
|
|
"files_processed": len(extracted_data),
|
|
"total_files": len(files),
|
|
"extraction_timestamp": datetime.now().isoformat(),
|
|
"status": "success",
|
|
"extracted_content": []
|
|
}
|
|
|
|
# Add extracted content
|
|
for data_item in extracted_data:
|
|
# Enrich with file metadata
|
|
file_id = data_item.get("file_id", 0)
|
|
file_metadata = next((f for f in files if f.get("id") == file_id), {})
|
|
|
|
content_item = {
|
|
"file_id": file_id,
|
|
"name": data_item.get("name", file_metadata.get("name", "")),
|
|
"type": data_item.get("type", file_metadata.get("type", "")),
|
|
"content_type": file_metadata.get("content_type", ""),
|
|
"size": file_metadata.get("size", ""),
|
|
"is_extracted": data_item.get("is_extracted", False),
|
|
"extraction_method": data_item.get("extraction_method", ""),
|
|
"content": data_item.get("content", "")
|
|
}
|
|
|
|
result["extracted_content"].append(content_item)
|
|
|
|
return result
|
|
|
|
def _extract_document_contents_from_messages(file_id: int, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Extract document contents for a specific file from workflow messages.
|
|
|
|
Args:
|
|
file_id: ID of the file
|
|
messages: List of all messages in the workflow
|
|
|
|
Returns:
|
|
List of document contents for the specified file
|
|
"""
|
|
contents = []
|
|
|
|
for message in messages:
|
|
# Search documents in the message
|
|
for document in message.get("documents", []):
|
|
source = document.get("source", {})
|
|
|
|
# Check if file ID matches
|
|
if source.get("id") == file_id or (source.get("type") == "file" and source.get("id") == file_id):
|
|
# Add contents of the file
|
|
doc_contents = document.get("contents", [])
|
|
|
|
if doc_contents:
|
|
contents.extend(doc_contents)
|
|
|
|
return contents
|
|
|
|
def _log(add_log_func, workflow_id, message, log_type, agent_id=None, agent_name=None):
    """Emit *message* through the module logger and, when both a callback and a
    workflow id are available, mirror it through the external log function."""
    # Pick the logger method matching the log type; anything unknown is info.
    emit = {"error": logger.error, "warning": logger.warning}.get(log_type, logger.info)
    emit(message)

    # Mirror the message via the caller-supplied log function, if usable.
    if add_log_func and workflow_id:
        add_log_func(workflow_id, message, log_type, agent_id, agent_name)