refactored chat engine

ValueOn AG 2025-04-20 22:22:22 +02:00
parent 9e2b6f1344
commit 9247de4346
42 changed files with 2271 additions and 14660 deletions

app.py

@@ -14,7 +14,7 @@ from logging.handlers import RotatingFileHandler
from datetime import timedelta
import pathlib
from modules.utility import APP_CONFIG
from modules.configuration import APP_CONFIG
from modules.gateway_interface import get_gateway_interface


@@ -2,7 +2,7 @@ import logging
import httpx
from typing import Dict, Any, List, Optional, Union
from fastapi import HTTPException
from modules.utility import APP_CONFIG
from modules.configuration import APP_CONFIG
# Configure the logger
logger = logging.getLogger(__name__)
@@ -199,7 +199,7 @@ class ChatService:
# Distinguish between a file path and binary data
if isinstance(image_data, str):
# It's a file path - import the file handler only when needed
from gateway.gwserver.modules import agentservice_filemanager as file_handler
from modules import agentservice_filemanager as file_handler
base64_data, auto_mime_type = file_handler.encode_file_to_base64(image_data)
mime_type = mime_type or auto_mime_type
else:

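The hunk above lazily imports agentservice_filemanager for its encode_file_to_base64 helper. That helper's body is not part of this diff; the sketch below is an assumption about what it plausibly does, not the repository's implementation:

import base64
import mimetypes

def encode_file_to_base64(file_path: str):
    # Guess the MIME type from the file extension (may be None)
    mime_type, _ = mimetypes.guess_type(file_path)
    # Read the file and encode its bytes as a base64 string
    with open(file_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("ascii")
    return encoded, mime_type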

@@ -2,7 +2,7 @@ import logging
import httpx
from typing import Dict, Any, List, Optional, Union
from fastapi import HTTPException
from modules.utility import APP_CONFIG
from modules.configuration import APP_CONFIG
# Configure the logger
logger = logging.getLogger(__name__)
@@ -108,7 +108,7 @@ class ChatService:
# Distinguish between a file path and binary data
if isinstance(image_data, str):
# It's a file path - import the file handler only when needed
from gateway.gwserver.modules import agentservice_filemanager as file_handler
from modules import agentservice_filemanager as file_handler
base64_data, auto_mime_type = file_handler.encode_file_to_base64(image_data)
mime_type = mime_type or auto_mime_type
else:


@@ -375,7 +375,7 @@ class DatabaseConnector:
# If the table is empty and a system ID is to be registered
if not data:
self.register_initial_id(table, record_data["id"])
logger.info(f"Initial ID {record_data['id']} registered for table {table}")
logger.info(f"Initial ID {record_data['id']} was registered for table {table}")
# Add the new record
data.append(record_data)
@@ -462,6 +462,7 @@ class DatabaseConnector:
# Record not found
raise ValueError(f"Record with ID {record_id} not found in table {table}")
# System table functions
def register_initial_id(self, table: str, initial_id: int) -> bool:


@@ -1,675 +0,0 @@
import os
import logging
from typing import List, Dict, Any, Optional, Union
from datetime import datetime
import mysql.connector
from mysql.connector import Error

logger = logging.getLogger(__name__)

class DatabaseConnector:
    """
    A connector for MySQL-based data storage.
    Provides generic database operations.
    """
    def __init__(self, db_host: str, db_database: str, db_user: str, db_password: str, mandate_id: int = None, user_id: int = None):
        """
        Initializes the MySQL database connector.
        Args:
            db_host: MySQL server host
            db_database: Name of the database
            db_user: Username for authentication
            db_password: Password for authentication
            mandate_id: Context parameter for the mandate (tenant)
            user_id: Context parameter for the user
        """
        # Store the input parameters
        self.db_host = db_host
        self.db_database = db_database
        self.db_user = db_user
        self.db_password = db_password
        # Ensure the context parameters are set
        if mandate_id is None or user_id is None:
            raise ValueError("mandate_id and user_id must be set")
        # Establish the database connection
        self.connection = self._create_connection()
        # Initialize the system table
        self._system_table_name = "_system"
        self._initialize_system_table()
        # Store mandate_id and user_id temporarily
        self._mandate_id = mandate_id
        self._user_id = user_id
        # If mandate_id or user_id is 0, try to fall back to the registered initial IDs
        if mandate_id == 0:
            initial_mandate_id = self.get_initial_id("mandates")
            if initial_mandate_id is not None:
                self._mandate_id = initial_mandate_id
                logger.info(f"Using initial mandate_id {initial_mandate_id} instead of 0")
        if user_id == 0:
            initial_user_id = self.get_initial_id("users")
            if initial_user_id is not None:
                self._user_id = initial_user_id
                logger.info(f"Using initial user_id {initial_user_id} instead of 0")
        # Expose the effective IDs as attributes
        self.mandate_id = self._mandate_id
        self.user_id = self._user_id
        logger.info(f"DatabaseConnector initialized for database: {db_database}")
        logger.info(f"Context: mandate_id={self.mandate_id}, user_id={self.user_id}")
    def _create_connection(self):
        """Creates a connection to the MySQL database."""
        try:
            connection = mysql.connector.connect(
                host=self.db_host,
                database=self.db_database,
                user=self.db_user,
                password=self.db_password
            )
            if connection.is_connected():
                logger.info(f"Connected to MySQL server version {connection.get_server_info()}")
            return connection
        except Error as e:
            logger.error(f"Error connecting to MySQL: {e}")
            raise
    def _initialize_system_table(self):
        """Initializes the system table if it does not exist yet."""
        cursor = None
        try:
            cursor = self.connection.cursor()
            # Check whether the system table exists
            cursor.execute("""
                SELECT COUNT(*)
                FROM information_schema.tables
                WHERE table_schema = %s
                AND table_name = %s
            """, (self.db_database, self._system_table_name))
            if cursor.fetchone()[0] == 0:
                # Create the system table (the name is an internal constant, not user input)
                cursor.execute(f"""
                    CREATE TABLE {self._system_table_name} (
                        table_name VARCHAR(255) PRIMARY KEY,
                        initial_id INT NOT NULL
                    )
                """)
                self.connection.commit()
                logger.info(f"System table '{self._system_table_name}' created")
        except Error as e:
            logger.error(f"Error initializing the system table: {e}")
            if self.connection.is_connected():
                self.connection.rollback()
            raise
        finally:
            if cursor:
                cursor.close()
    def _execute_query(self, query: str, params: tuple = None):
        """Executes a SQL query and returns the open cursor.
        The caller is responsible for closing the returned cursor."""
        cursor = self.connection.cursor(dictionary=True)
        try:
            cursor.execute(query, params)
            return cursor
        except Error as e:
            logger.error(f"Error executing the query: {e}")
            cursor.close()
            raise
    def _execute_select(self, query: str, params: tuple = None) -> List[Dict[str, Any]]:
        """Executes a SELECT query and returns the results."""
        cursor = None
        try:
            cursor = self.connection.cursor(dictionary=True)
            cursor.execute(query, params)
            result = cursor.fetchall()
            return result
        except Error as e:
            logger.error(f"Error executing the SELECT query: {e}")
            raise
        finally:
            if cursor:
                cursor.close()
    def _execute_insert(self, query: str, params: tuple = None) -> int:
        """Executes an INSERT query and returns the ID of the inserted record."""
        cursor = None
        try:
            cursor = self.connection.cursor()
            cursor.execute(query, params)
            self.connection.commit()
            return cursor.lastrowid
        except Error as e:
            logger.error(f"Error executing the INSERT query: {e}")
            self.connection.rollback()
            raise
        finally:
            if cursor:
                cursor.close()
    def _execute_update(self, query: str, params: tuple = None) -> int:
        """Executes an UPDATE query and returns the number of affected rows."""
        cursor = None
        try:
            cursor = self.connection.cursor()
            cursor.execute(query, params)
            self.connection.commit()
            return cursor.rowcount
        except Error as e:
            logger.error(f"Error executing the UPDATE query: {e}")
            self.connection.rollback()
            raise
        finally:
            if cursor:
                cursor.close()
    def _execute_delete(self, query: str, params: tuple = None) -> int:
        """Executes a DELETE query and returns the number of deleted rows."""
        cursor = None
        try:
            cursor = self.connection.cursor()
            cursor.execute(query, params)
            self.connection.commit()
            return cursor.rowcount
        except Error as e:
            logger.error(f"Error executing the DELETE query: {e}")
            self.connection.rollback()
            raise
        finally:
            if cursor:
                cursor.close()
    def _apply_record_filter(self, record_filter: Dict[str, Any] = None) -> tuple:
        """Builds a WHERE clause and parameter tuple from the record filter."""
        if not record_filter:
            # No filter: return a tautology with an empty parameter tuple
            return "WHERE 1=1", ()
        conditions = []
        params = []
        for field, value in record_filter.items():
            conditions.append(f"{field} = %s")
            params.append(value)
        where_clause = "WHERE " + " AND ".join(conditions)
        return where_clause, tuple(params)
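    # Illustrative example (not part of the original file): with
    # record_filter = {"status": "active", "user_id": 7}, _apply_record_filter
    # returns ("WHERE status = %s AND user_id = %s", ("active", 7)),
    # i.e. a clause to interpolate and a params tuple for cursor.execute().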
    def _get_context_filter(self) -> tuple:
        """Builds the WHERE clause for the mandate context (only mandate_id is filtered)."""
        return "WHERE mandate_id = %s", (self.mandate_id,)
    # Public API
    def get_tables(self, filter_criteria: Dict[str, Any] = None) -> List[str]:
        """
        Returns a list of all available tables.
        Args:
            filter_criteria: Optional filter criteria (not implemented)
        Returns:
            List of table names
        """
        # The LIKE filter excludes internal tables (names starting with "_")
        query = """
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema = %s
            AND table_name NOT LIKE '\\_%'
        """
        try:
            result = self._execute_select(query, (self.db_database,))
            return [row["table_name"] for row in result]
        except Exception as e:
            logger.error(f"Error retrieving the tables: {e}")
            return []
    def get_fields(self, table: str, filter_criteria: Dict[str, Any] = None) -> List[str]:
        """
        Returns a list of all fields of a table.
        Args:
            table: Name of the table
            filter_criteria: Optional filter criteria (not implemented)
        Returns:
            List of field names
        """
        query = """
            SELECT column_name
            FROM information_schema.columns
            WHERE table_schema = %s AND table_name = %s
        """
        try:
            result = self._execute_select(query, (self.db_database, table))
            return [row["column_name"] for row in result]
        except Exception as e:
            logger.error(f"Error retrieving the fields for table {table}: {e}")
            return []
    def get_schema(self, table: str, language: str = None, filter_criteria: Dict[str, Any] = None) -> Dict[str, Dict[str, Any]]:
        """
        Returns a schema object for a table with data types and labels.
        Args:
            table: Name of the table
            language: Language for the labels (optional)
            filter_criteria: Optional filter criteria (not implemented)
        Returns:
            Schema object with fields, data types, and labels
        """
        query = """
            SELECT
                column_name,
                data_type,
                column_comment
            FROM
                information_schema.columns
            WHERE
                table_schema = %s AND table_name = %s
        """
        schema = {}
        try:
            result = self._execute_select(query, (self.db_database, table))
            for row in result:
                field = row["column_name"]
                data_type = row["data_type"]
                comment = row["column_comment"]
                # Build the label (the field name is the default)
                label = field
                # If a column comment exists, use it as the label
                if comment:
                    label = comment
                schema[field] = {
                    "type": data_type,
                    "label": label
                }
            return schema
        except Exception as e:
            logger.error(f"Error retrieving the schema for table {table}: {e}")
            return {}
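    # Illustrative example (assumed shape, not from the original file):
    # get_schema("users") might return
    #     {"id": {"type": "int", "label": "id"},
    #      "name": {"type": "varchar", "label": "Full name"}}
    # where "Full name" comes from the column comment on users.name.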
    def get_recordset(self, table: str, field_filter: List[str] = None, record_filter: Dict[str, Any] = None) -> List[Dict[str, Any]]:
        """
        Returns a list of records from a table, filtered by the given criteria.
        Args:
            table: Name of the table
            field_filter: Field filter (which fields to return)
            record_filter: Record filter (which records to return)
        Returns:
            List of the filtered records
        """
        # Determine the fields for the query
        fields = "*"
        if field_filter and isinstance(field_filter, list):
            fields = ", ".join(field_filter)
        # The base condition is the mandate context
        base_where, base_params = self._get_context_filter()
        # Apply additional filter conditions, if any
        additional_where = ""
        additional_params = ()
        if record_filter:
            additional_where, additional_params = self._apply_record_filter(record_filter)
            # Strip the leading "WHERE " and join with "AND" instead
            additional_where = " AND " + additional_where[6:]
        # Combine the conditions and parameters
        where_clause = base_where + additional_where
        params = base_params + additional_params
        # Build the full query
        query = f"""
            SELECT {fields} FROM {table} {where_clause}
        """
        try:
            return self._execute_select(query, params)
        except Exception as e:
            logger.error(f"Error retrieving records from table {table}: {e}")
            return []
    def record_create(self, table: str, record_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Creates a new record in the table.
        Args:
            table: Name of the table
            record_data: Data for the new record
        Returns:
            The created record
        """
        # Add mandate_id and user_id if missing or 0
        if "mandate_id" not in record_data or record_data["mandate_id"] == 0:
            record_data["mandate_id"] = self.mandate_id
        if "user_id" not in record_data or record_data["user_id"] == 0:
            record_data["user_id"] = self.user_id
        # Build the query
        fields = ", ".join(record_data.keys())
        placeholders = ", ".join(["%s"] * len(record_data))
        values = tuple(record_data.values())
        query = f"""
            INSERT INTO {table} ({fields})
            VALUES ({placeholders})
        """
        try:
            # First check whether the table is empty
            check_query = f"""
                SELECT COUNT(*) as count FROM {table}
            """
            count_result = self._execute_select(check_query)
            is_empty = count_result[0]["count"] == 0
            # Execute the query and get the ID of the new record
            new_id = self._execute_insert(query, values)
            # If the table was empty before, register the new ID as the initial ID
            if is_empty and new_id:
                self.register_initial_id(table, new_id)
                logger.info(f"Initial ID {new_id} registered for table {table}")
            # Add the ID to the record if one was returned
            if new_id:
                record_data["id"] = new_id
            return record_data
        except Exception as e:
            logger.error(f"Error creating the record in table {table}: {e}")
            raise ValueError(f"Error creating the record in table {table}")
    def record_delete(self, table: str, record_id: Union[str, int]) -> bool:
        """
        Deletes a record from the table.
        Args:
            table: Name of the table
            record_id: ID of the record to delete
        Returns:
            True on success, False on failure
        """
        # Check whether this is the initial ID
        initial_id = self.get_initial_id(table)
        if initial_id is not None and initial_id == record_id:
            logger.warning(f"Prevented an attempt to delete the initial record with ID {record_id} from table {table}")
            return False
        # First check whether the record belongs to the current mandate
        check_query = f"""
            SELECT mandate_id FROM {table} WHERE id = %s
        """
        try:
            result = self._execute_select(check_query, (record_id,))
            if not result:
                # Record not found
                return False
            if result[0]["mandate_id"] != self.mandate_id:
                raise ValueError("Not your mandate")
            # Delete the record
            delete_query = f"""
                DELETE FROM {table} WHERE id = %s AND mandate_id = %s
            """
            rows_affected = self._execute_delete(delete_query, (record_id, self.mandate_id))
            return rows_affected > 0
        except Exception as e:
            logger.error(f"Error deleting the record from table {table}: {e}")
            return False
    def record_modify(self, table: str, record_id: Union[str, int], record_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Modifies a record in the table.
        Args:
            table: Name of the table
            record_id: ID of the record to modify
            record_data: New data for the record
        Returns:
            The updated record
        """
        # Check whether this is the initial ID and the ID itself is being changed
        initial_id = self.get_initial_id(table)
        if initial_id is not None and initial_id == record_id and "id" in record_data and record_data["id"] != record_id:
            raise ValueError(f"The ID of the initial record in table {table} cannot be changed")
        # First check whether the record belongs to the current mandate
        check_query = f"""
            SELECT mandate_id FROM {table} WHERE id = %s
        """
        try:
            result = self._execute_select(check_query, (record_id,))
            if not result:
                # Record not found
                raise ValueError(f"Record with ID {record_id} not found in table {table}")
            if result[0]["mandate_id"] != self.mandate_id:
                raise ValueError("Not your mandate")
            # Build the SET clause and parameters for the update
            set_clauses = []
            values = []
            for key, value in record_data.items():
                set_clauses.append(f"{key} = %s")
                values.append(value)
            set_clause = ", ".join(set_clauses)
            values.append(record_id)  # For the WHERE condition
            values.append(self.mandate_id)  # For the mandate_id condition
            # Update the record
            update_query = f"""
                UPDATE {table}
                SET {set_clause}
                WHERE id = %s AND mandate_id = %s
            """
            rows_affected = self._execute_update(update_query, tuple(values))
            if rows_affected > 0:
                # Load the updated record
                get_query = f"""
                    SELECT * FROM {table} WHERE id = %s
                """
                updated_record = self._execute_select(get_query, (record_id,))
                if updated_record:
                    return updated_record[0]
                else:
                    raise ValueError(f"Error retrieving the updated record from table {table}")
            else:
                raise ValueError(f"Error updating the record in table {table}")
        except Exception as e:
            logger.error(f"Error updating the record in table {table}: {e}")
            raise
    # System table functions
    def register_initial_id(self, table: str, initial_id: int) -> bool:
        """
        Registers the initial ID for a table.
        Args:
            table: Name of the table
            initial_id: The initial ID
        Returns:
            True on success, False on failure
        """
        try:
            # First check whether an initial ID is already registered for this table
            check_query = f"""
                SELECT COUNT(*) as count
                FROM {self._system_table_name}
                WHERE table_name = %s
            """
            result = self._execute_select(check_query, (table,))
            if result and result[0]["count"] > 0:
                # Already registered
                return True
            # Register the initial ID
            insert_query = f"""
                INSERT INTO {self._system_table_name} (table_name, initial_id)
                VALUES (%s, %s)
            """
            self._execute_insert(insert_query, (table, initial_id))
            logger.info(f"Initial ID {initial_id} registered for table {table}")
            return True
        except Exception as e:
            logger.error(f"Error registering the initial ID for table {table}: {e}")
            return False
    def get_initial_id(self, table: str) -> Optional[int]:
        """
        Returns the initial ID for a table.
        Args:
            table: Name of the table
        Returns:
            The initial ID, or None if not available
        """
        try:
            query = f"""
                SELECT initial_id
                FROM {self._system_table_name}
                WHERE table_name = %s
            """
            result = self._execute_select(query, (table,))
            if result and len(result) > 0:
                logger.info(f"Found initial ID for table {table}: {result[0]['initial_id']}")
                return result[0]["initial_id"]
            # If no initial ID was found, try to use the first record
            if table and not table.startswith("_"):
                try:
                    query = f"""
                        SELECT id
                        FROM {table}
                        ORDER BY id
                        LIMIT 1
                    """
                    first_record = self._execute_select(query)
                    if first_record and len(first_record) > 0 and "id" in first_record[0]:
                        first_id = first_record[0]["id"]
                        # Register this ID as the initial ID
                        self.register_initial_id(table, first_id)
                        logger.info(f"Automatically detected initial ID {first_id} for table {table}")
                        return first_id
                except Exception as inner_e:
                    logger.warning(f"Could not find a first record in table {table}: {inner_e}")
            logger.debug(f"No initial ID found for table {table}")
            return None
        except Exception as e:
            logger.error(f"Error retrieving the initial ID for table {table}: {e}")
            return None
    def has_initial_id(self, table: str) -> bool:
        """
        Checks whether an initial ID is registered for a table.
        Args:
            table: Name of the table
        Returns:
            True if an initial ID is registered, otherwise False
        """
        try:
            query = f"""
                SELECT COUNT(*) as count
                FROM {self._system_table_name}
                WHERE table_name = %s
            """
            result = self._execute_select(query, (table,))
            if result and len(result) > 0:
                return result[0]["count"] > 0
            return False
        except Exception as e:
            logger.error(f"Error checking the initial ID for table {table}: {e}")
            return False
    def get_all_initial_ids(self) -> Dict[str, int]:
        """
        Returns all registered initial IDs.
        Returns:
            Dictionary with table names as keys and initial IDs as values
        """
        try:
            query = f"""
                SELECT table_name, initial_id
                FROM {self._system_table_name}
            """
            result = self._execute_select(query)
            initial_ids = {}
            for row in result:
                initial_ids[row["table_name"]] = row["initial_id"]
            return initial_ids
        except Exception as e:
            logger.error(f"Error retrieving all initial IDs: {e}")
            return {}
    def close(self):
        """Closes the database connection."""
        if hasattr(self, 'connection') and self.connection.is_connected():
            self.connection.close()
            logger.info("Database connection closed")
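For reference, a minimal usage sketch of the connector removed above, assuming a reachable MySQL instance and an existing users table; host, credentials, and table names are placeholders, not values from the repository:

connector = DatabaseConnector(
    db_host="localhost",
    db_database="appdb",
    db_user="app",
    db_password="secret",
    mandate_id=1,
    user_id=1,
)
try:
    # Records are always scoped to the connector's mandate context
    rows = connector.get_recordset("users", field_filter=["id", "name"])
    created = connector.record_create("users", {"name": "Example User"})
    connector.record_modify("users", created["id"], {"name": "Renamed User"})
finally:
    connector.close()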

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,399 +0,0 @@
"""
Creative Agent for knowledge-based answers and creative content generation.
Handles open questions, documentation tasks, and special 'poweron' requests.
Based on the refactored Core-Module.
"""
import logging
from typing import List, Dict, Any, Optional
import json
from modules.agentservice_base import BaseAgent
from modules.agentservice_utils import MessageUtils, LoggingUtils
from modules.agentservice_protocol import AgentCommunicationProtocol
logger = logging.getLogger(__name__)
class CreativeAgent(BaseAgent):
"""Agent for knowledge-based answers and creative content generation"""
def __init__(self):
"""Initialize the Creative Agent"""
super().__init__()
self.id = "creative"
self.name = "Creative Knowledge Assistant"
self.type = "knowledge"
self.description = "Provides knowledge-based answers, creates content, handles document processing, and responds to PowerOn requests"
# Extended capabilities to explicitly cover document processing
self.capabilities = ("knowledge_sharing,content_creation,document_generation,"
"creative_writing,poweron,document_processing,"
"information_extraction,data_transformation,"
"document_analysis,text_processing,table_creation,"
"visual_information_processing,content_structuring")
# Update result format to include tables
self.result_format = "Text,Document,Table"
# Add enhanced document capabilities
self.supports_documents = True
self.document_capabilities = ["read", "create", "analyze", "extract", "transform"]
self.required_context = ["workflow_id"]
self.document_handler = None
# Initialize AI service
self.ai_service = None
# Initialize protocol
self.protocol = AgentCommunicationProtocol()
# Initialize utilities
self.message_utils = MessageUtils()
def get_agent_info(self) -> Dict[str, Any]:
"""Get agent information for agent registry"""
info = super().get_agent_info()
info.update({
"metadata": {
"specialties": [
"creative_writing",
"documentation",
"knowledge",
"poweron",
"document_processing",
"information_extraction",
"content_transformation",
"table_generation",
"document_analysis"
]
}
})
return info
def set_document_handler(self, document_handler):
"""Set the document handler for file operations"""
self.document_handler = document_handler
async def old_process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
"""
Process a message and generate a creative or knowledge-based response.
Enhanced with improved document handling.
Args:
message: The message to process
context: Additional context
Returns:
The generated response
"""
# Extract workflow_id from context or message
workflow_id = context.get("workflow_id") if context and isinstance(context, dict) else None
if not workflow_id and isinstance(message, dict):
workflow_id = message.get("workflow_id", "unknown")
# Create response structure early for fallback
response = {
"role": "assistant",
"content": "",
"agent_id": self.id,
"agent_type": self.type,
"agent_name": self.name,
"result_format": self.result_format,
"workflow_id": workflow_id
}
# Safely create logging utils
log_func = None
logging_utils = None
try:
from modules.agentservice_utils import LoggingUtils
log_func = context.get("log_func") if context and isinstance(context, dict) else None
logging_utils = LoggingUtils(workflow_id, log_func)
except Exception as e:
# If we can't even create logging utils, use basic logging
logger.error(f"Error creating logging utils: {str(e)}")
# Log function that works with or without logging_utils
def safe_log(message, level="info"):
try:
if logging_utils:
if level == "info":
logging_utils.info(message, "agents")
elif level == "warning":
logging_utils.warning(message, "agents")
elif level == "error":
logging_utils.error(message, "agents")
else:
if level == "info":
logger.info(message)
elif level == "warning":
logger.warning(message)
elif level == "error":
logger.error(message)
except Exception as log_err:
logger.error(f"Error in logging: {str(log_err)}")
try:
safe_log("Starting to process request", "info")
# Get the prompt from the message with safety check
prompt = ""
if isinstance(message, dict):
prompt = message.get("content", "")
safe_log(f"Processing request: {prompt[:50]}...", "info")
# Power-On handling with safety check
if prompt and "poweron" in prompt.lower():
safe_log("Detected PowerOn keyword, generating specialized response", "info")
poweron_prompt = f"""
In the language of the user's prompt, thank them warmly for considering you part of PowerOn. Tell them how pleased you are to be part of the PowerOn family, working to support humans for a better life.
Then generate a short answer (1-2 sentences) to this question: {prompt}
"""
try:
poweron_response = await self.ai_service.call_api([
{"role": "system", "content": "You are a helpful assistant that is part of the PowerOn family."},
{"role": "user", "content": poweron_prompt}
])
response["content"] = poweron_response
safe_log("PowerOn response generated", "info")
return response
except Exception as api_err:
safe_log(f"Error calling API for PowerOn: {str(api_err)}", "error")
response["content"] = "I encountered an error while generating a PowerOn response. Please try again."
return response
# Create system prompt
system_prompt = "You are a helpful, creative assistant specializing in knowledge sharing, content creation, and document processing."
# Add conversation summarization capabilities
system_prompt += """
When asked to summarize information, always consider:
1. All provided document content
2. The entire conversation history in the current workflow
3. Any structured data that has been shared
For summarization tasks specifically, make sure to analyze the complete context including previous messages in the conversation, not just the files or the current request.
"""
if workflow_id and workflow_id != "unknown":
system_prompt += """
You are currently operating within a workflow where multiple messages may have been exchanged.
When generating summaries or overviews, you must incorporate the content from previous messages
in this workflow as they contain valuable context and information.
"""
# Safely check for documents
has_documents = False
document_count = 0
try:
if isinstance(message, dict) and "documents" in message:
documents = message.get("documents")
if documents is not None:
document_count = len(documents)
has_documents = document_count > 0
safe_log(f"Message contains {document_count} documents", "info")
except Exception as doc_err:
safe_log(f"Error checking documents: {str(doc_err)}", "warning")
# Initialize document variables
document_content = ""
document_texts = []
document_names = []
# Process documents with extreme caution
if has_documents:
safe_log("Processing attached documents", "info")
# Try document handler first
try:
if self.document_handler:
try:
document_content = self.document_handler.merge_document_contents(message)
if document_content:
safe_log("Successfully extracted document content with handler", "info")
else:
safe_log("Document handler returned empty content", "warning")
except Exception as handler_err:
safe_log(f"Error using document handler: {str(handler_err)}", "warning")
except Exception as err:
safe_log(f"General error with document handler: {str(err)}", "warning")
# Fallback: manual extraction (very cautious)
try:
documents = message.get("documents", []) or []
for i, doc in enumerate(documents):
if doc is None:
safe_log(f"Document at index {i} is None", "warning")
continue
try:
# Process source
source = None
if isinstance(doc, dict):
source = doc.get("source")
# Get name
doc_name = "Document"
if isinstance(source, dict):
doc_name = source.get("name", f"Document {i+1}")
document_names.append(doc_name)
safe_log(f"Processing document: {doc_name}", "info")
# Get contents
contents = []
if isinstance(doc, dict):
contents = doc.get("contents", []) or []
doc_text = ""
for content_item in contents:
if content_item is None:
continue
if isinstance(content_item, dict) and content_item.get("type") == "text":
text = content_item.get("text", "")
if text:
doc_text = text
document_texts.append(doc_text)
safe_log(f"Found text content in {doc_name}", "info")
break
# Handle empty content
if not doc_text:
safe_log(f"No text content found in {doc_name}", "warning")
placeholder = f"[This appears to be a document named '{doc_name}', but I couldn't extract its content]"
document_texts.append(placeholder)
except Exception as doc_err:
safe_log(f"Error processing individual document: {str(doc_err)}", "warning")
except Exception as docs_err:
safe_log(f"Error in document processing loop: {str(docs_err)}", "warning")
# Combine prompt with documents safely
full_prompt = prompt
try:
if document_content:
full_prompt = f"{prompt}\n\n### Reference Documents:\n{document_content}"
safe_log("Using document handler content", "info")
elif document_texts and document_names:
# Use only corresponding pairs of names and texts
docs_content = ""
min_length = min(len(document_names), len(document_texts))
for i in range(min_length):
name = document_names[i]
text = document_texts[i]
docs_content += f"\n\n### Document: {name}\n{text}"
if docs_content:
full_prompt = f"{prompt}\n\n{docs_content}"
safe_log("Using manually extracted content", "info")
else:
safe_log("No document content could be added", "warning")
else:
safe_log("No document content available to add to prompt", "info")
except Exception as combine_err:
safe_log(f"Error combining prompt with documents: {str(combine_err)}", "warning")
# Call AI API
try:
safe_log("Calling AI service", "info")
content = await self.ai_service.call_api([
{"role": "system", "content": system_prompt},
{"role": "user", "content": full_prompt}
])
response["content"] = content
safe_log("Response successfully generated", "info")
except Exception as api_err:
safe_log(f"Error calling AI API: {str(api_err)}", "error")
response["content"] = f"I encountered an error while processing your request. Please try again or rephrase your question."
return response
except Exception as e:
# Ultra-safe error handling
error_msg = f"Error generating response: {str(e)}"
try:
if logging_utils:
logging_utils.error(error_msg, "error")
else:
logger.error(error_msg)
except Exception:
logger.error(f"Critical error in error handling: {error_msg}")
response["content"] = f"I encountered an error while processing your request: {str(e)}"
return response
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
"""
Direct message processing function that focuses on properly handling the user's request.
"""
# Extract workflow_id and setup response
workflow_id = "unknown"
if context and isinstance(context, dict) and "workflow_id" in context:
workflow_id = context["workflow_id"]
elif message and isinstance(message, dict) and "workflow_id" in message:
workflow_id = message["workflow_id"]
response = {
"role": "assistant",
"content": "",
"agent_id": self.id,
"agent_type": self.type,
"agent_name": self.name,
"result_format": "Text",
"workflow_id": workflow_id
}
try:
# Extract the user's message directly
user_message = ""
if isinstance(message, dict) and "content" in message:
user_message = message["content"]
# Ensure we have something to process
if not user_message:
response["content"] = "Please provide a message for me to respond to."
return response
# Simple system prompt that focuses on direct response to the user's request
system_prompt = """You are a helpful, creative assistant.
Respond directly to the user's request without referencing any workflow or system context.
Focus only on providing a direct, helpful response to the specific question or request."""
# Process with AI
content = await self.ai_service.call_api([
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_message}
])
response["content"] = content
return response
except Exception as e:
logger.error(f"Error in process_message: {str(e)}")
response["content"] = f"I encountered an error while processing your request: {str(e)}"
return response
# Singleton instance
_creative_agent = None
def get_creative_agent():
"""Returns a singleton instance of the Creative Agent"""
global _creative_agent
if _creative_agent is None:
_creative_agent = CreativeAgent()
return _creative_agent
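The agent only assumes that the injected ai_service exposes an async call_api(messages) method. A minimal stub along those lines for exercising process_message locally; the stub class and its behavior are assumptions, not part of the project:

import asyncio

class EchoAIService:
    """Hypothetical stand-in for the real AI service."""
    async def call_api(self, messages):
        # Echo the last message's content instead of calling a model
        return f"(stub) {messages[-1]['content']}"

async def demo():
    agent = get_creative_agent()
    agent.ai_service = EchoAIService()
    reply = await agent.process_message({"content": "Hello"}, {"workflow_id": "wf-1"})
    print(reply["content"])  # -> (stub) Hello

asyncio.run(demo())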


@@ -1,574 +0,0 @@
"""
Documentation agent for creating documentation, reports, and structured content.
Uses an adaptive process to create high-quality documentation based on the complexity of the task.
Adapted for the refactored Core module and the AgentCommunicationProtocol.
"""
import logging
import json
import re
import traceback
from typing import List, Dict, Any, Optional, Tuple, Union
from datetime import datetime
import uuid
from modules.agentservice_base import BaseAgent
from modules.agentservice_utils import WorkflowUtils, MessageUtils, LoggingUtils
from modules.agentservice_protocol import AgentMessage, AgentCommunicationProtocol
from modules.agentservice_filemanager import FileManager # Import the file manager
logger = logging.getLogger(__name__)
class DocumentationAgent(BaseAgent):
"""Agent for creating documentation and structured content"""
def __init__(self):
"""Initialize the documentation agent"""
super().__init__()
self.id = "documentation_agent"
self.name = "Documentation Specialist"
self.type = "documentation"
self.description = "Creates documentation and structured content"
self.capabilities = "report_generation,documentation,content_structuring,technical_writing,knowledge_organization"
self.result_format = "FormattedDocument"
# Initialize AI service
self.ai_service = None
# Initialize document handler
self.document_handler = None
# Document capabilities
self.supports_documents = True
self.document_capabilities = ["read", "reference", "create"]
self.required_context = ["document_purpose", "target_audience"]
# Initialize protocol
self.protocol = AgentCommunicationProtocol()
# Initialize utilities
self.message_utils = MessageUtils()
# Track the latest generated document
self.last_document = {}
def get_agent_info(self) -> Dict[str, Any]:
"""Get agent information for agent registry"""
info = super().get_agent_info()
info.update({
"metadata": {
"document_types": ["manual", "report", "process", "presentation", "document"],
"formats": ["markdown", "text"]
}
})
return info
def set_document_handler(self, document_handler):
"""Set the document handler for file operations"""
self.document_handler = document_handler
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
"""
Process a message and create documentation.
Args:
message: Input message
context: Optional context
Returns:
Response with documentation
"""
# Extract workflow_id from context or message
workflow_id = (context or {}).get("workflow_id") or message.get("workflow_id", "unknown")
# Get or create logging_utils
log_func = context.get("log_func") if context else None
logging_utils = LoggingUtils(workflow_id, log_func)
# Create response structure
response = {
"role": "assistant",
"content": "",
"agent_id": self.id,
"agent_type": self.type,
"agent_name": self.name,
"result_format": self.result_format,
"workflow_id": workflow_id,
"documents": []
}
try:
# Initial status update
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Starting document creation",
sender_id=self.id,
status="in_progress",
progress=0.0,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Extract task from message
task = message.get("content", "")
# Detect document type - 10% progress
document_type = self._detect_document_type(task)
logging_utils.info(f"Creating {document_type} documentation", "execution")
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Identified document type: {document_type}",
sender_id=self.id,
status="in_progress",
progress=0.1,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Process any attached documents - 30% progress
document_context = ""
if message.get("documents"):
logging_utils.info("Processing reference documents", "execution")
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Processing reference documents",
sender_id=self.id,
status="in_progress",
progress=0.2,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
document_context = await self._process_documents(message)
# Update progress
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Reference documents processed",
sender_id=self.id,
status="in_progress",
progress=0.3,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Enhanced prompt with document context
enhanced_prompt = f"{task}\n\n{document_context}"
# Assess complexity - 40% progress
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Assessing document complexity",
sender_id=self.id,
status="in_progress",
progress=0.4,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
is_complex = await self._assess_complexity(enhanced_prompt)
complexity_type = "complex" if is_complex else "simple"
logging_utils.info(f"Document complexity assessment: {complexity_type}", "execution")
# Generate title - 50% progress
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Generating document title",
sender_id=self.id,
status="in_progress",
progress=0.5,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
title = await self._generate_title(enhanced_prompt, document_type)
logging_utils.info(f"Document title: {title}", "execution")
# Update progress
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Generating {document_type}: {title}",
sender_id=self.id,
status="in_progress",
progress=0.6,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Generate content based on complexity - 70% progress
if is_complex:
# For complex documents, use the AI service with enhanced prompt
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Creating complex {document_type} document: {title}",
sender_id=self.id,
status="in_progress",
progress=0.7,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
content = await self._generate_complex_document(enhanced_prompt, document_type, title)
logging_utils.info("Complex document generated", "execution")
else:
# For simple documents, use direct generation
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Creating simple {document_type} document: {title}",
sender_id=self.id,
status="in_progress",
progress=0.7,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
content = await self._generate_simple_document(enhanced_prompt, document_type, title)
logging_utils.info("Simple document generated", "execution")
# Finalize document - 90% progress
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Finalizing document",
sender_id=self.id,
status="in_progress",
progress=0.9,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Create a document artifact if document handler is available
if self.document_handler:
doc_id = f"doc_{uuid.uuid4()}"
document = {
"id": doc_id,
"source": {
"type": "generated",
"id": doc_id,
"name": title,
"content_type": "text/markdown",
"size": len(content)
},
"contents": [
{
"type": "text",
"text": content,
"is_extracted": True
}
]
}
# Add document to response
response["documents"].append(document)
# Store the latest document
self.last_document = document
# Update response content to reference the document
response["content"] = f"I've created a document titled '{title}' that contains the requested information. The document is attached to this message."
# If protocol message is required, send it
if context and context.get("require_protocol_message"):
result_message = self.send_document_result(
document_title=title,
document_content=content,
sender_id=self.id,
receiver_id=context.get("receiver_id", "workflow"),
context_id=workflow_id
)
# Just log the message creation
logging_utils.info(f"Created protocol result message: {result_message.id}", "execution")
else:
# If no document handler, just put content in response
response["content"] = content
# Final progress update
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Document creation completed",
sender_id=self.id,
status="completed",
progress=1.0,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
return response
except Exception as e:
error_msg = f"Error in documentation agent: {str(e)}"
logging_utils.error(error_msg, "error")
# Create error response using protocol
error_message = self.protocol.create_error_message(
error_description=error_msg,
sender_id=self.id,
error_type="documentation",
error_details={"traceback": traceback.format_exc()},
context_id=workflow_id
)
# Log error status
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Error creating documentation: {str(e)}",
sender_id=self.id,
status="error",
progress=1.0,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "error", self.id, self.name)
# Set error in response
response["content"] = f"## Error creating documentation\n\n{error_msg}\n\n```\n{traceback.format_exc()}\n```"
response["status"] = "error"
return response
# Helper method to process document content with enhanced logging
async def _process_documents(self, message: Dict[str, Any]) -> str:
"""Process documents in the message with detailed logging"""
if not message.get("documents"):
return ""
document_context = ""
if self.document_handler:
# Use document handler to merge contents
document_context = self.document_handler.merge_document_contents(message)
else:
# Manual processing
for document in message.get("documents", []):
source = document.get("source", {})
doc_name = source.get("name", "unnamed")
document_context += f"\n\n--- {doc_name} ---\n"
for content in document.get("contents", []):
if content.get("type") == "text":
document_context += content.get("text", "")
# Log summary of processed documents
doc_count = len(message.get("documents", []))
context_size = len(document_context)
logger.info(f"Processed {doc_count} documents, extracted {context_size} characters of context")
return document_context
async def _assess_complexity(self, task: str) -> bool:
"""
Assess task complexity to determine document structure.
Args:
task: The task description
Returns:
True if complex document needed, False otherwise
"""
if not self.ai_service:
# Default to complex if no AI service
return True
prompt = f"""
Analyze this task and determine if it requires a complex or simple document structure:
{task}
Respond with only "COMPLEX" or "SIMPLE".
"""
try:
response = await self.ai_service.call_api([
{"role": "system", "content": "You determine document complexity requirements."},
{"role": "user", "content": prompt}
])
return "COMPLEX" in response.upper()
except Exception:
# Default to complex on error
return True
async def _generate_title(self, task: str, document_type: str) -> str:
"""
Generate a title for the document.
Args:
task: The task description
document_type: Type of document
Returns:
Generated title
"""
if not self.ai_service:
# Default title if no AI service
return f"{document_type.capitalize()} Document"
prompt = f"""
Create a concise, professional title for this {document_type}:
{task}
Respond with ONLY the title, nothing else.
"""
try:
title = await self.ai_service.call_api([
{"role": "system", "content": "You create document titles."},
{"role": "user", "content": prompt}
])
# Clean up the title
return title.strip('"\'#*- \n\t')
except Exception:
# Default title on error
return f"{document_type.capitalize()} Document"
async def _generate_complex_document(self, task: str, document_type: str, title: str) -> str:
"""
Generate a complex document with structure.
Args:
task: The task description
document_type: Type of document
title: Document title
Returns:
Generated document content
"""
if not self.ai_service:
return f"# {title}\n\nUnable to generate complex document: AI service not available."
prompt = f"""
Create a comprehensive, well-structured {document_type} titled "{title}" based on:
{task}
The document should include:
1. A clear introduction with purpose and scope
2. Logically organized sections with headings
3. Detailed content with examples and evidence
4. A conclusion with key takeaways
5. Appropriate formatting using Markdown
Format the document in Markdown with proper headings, lists, and emphasis.
"""
try:
content = await self.ai_service.call_api([
{"role": "system", "content": "You create comprehensive, well-structured documentation."},
{"role": "user", "content": prompt}
])
# Ensure title is at the top
if not content.strip().startswith("# "):
content = f"# {title}\n\n{content}"
return content
except Exception as e:
return f"# {title}\n\nError generating document: {str(e)}"
async def _generate_simple_document(self, task: str, document_type: str, title: str) -> str:
"""
Generate a simple document without complex structure.
Args:
task: The task description
document_type: Type of document
title: Document title
Returns:
Generated document content
"""
if not self.ai_service:
return f"# {title}\n\nUnable to generate document: AI service not available."
prompt = f"""
Create a concise, focused {document_type} titled "{title}" based on:
{task}
The document should be clear, precise, and to the point without complex chapter structure.
Format using Markdown with appropriate headings and formatting.
"""
try:
content = await self.ai_service.call_api([
{"role": "system", "content": "You create concise, focused documentation."},
{"role": "user", "content": prompt}
])
# Ensure title is at the top
if not content.strip().startswith("# "):
content = f"# {title}\n\n{content}"
return content
except Exception as e:
return f"# {title}\n\nError generating document: {str(e)}"
def _detect_document_type(self, message: str) -> str:
"""
Detect document type from the message.
Args:
message: User message
Returns:
Detected document type
"""
message = message.lower()
if any(term in message for term in ["manual", "guide", "instruction", "tutorial"]):
return "manual"
elif any(term in message for term in ["report", "analysis", "assessment", "review"]):
return "report"
elif any(term in message for term in ["process", "workflow", "procedure", "steps"]):
return "process"
elif any(term in message for term in ["presentation", "slides", "deck"]):
return "presentation"
else:
return "document"
def send_document_result(self, document_title: str, document_content: str,
sender_id: str, receiver_id: str, context_id: str = None) -> AgentMessage:
"""Send a document result using the protocol"""
metadata = {
"document_type": self._detect_document_type(document_content),
"title": document_title,
"created_at": datetime.now().isoformat()
}
return self.protocol.create_result_message(
result_content=document_content,
sender_id=sender_id,
receiver_id=receiver_id,
task_id=f"doc_{uuid.uuid4()}",
output_data=metadata,
result_format=self.result_format,
context_id=context_id
)
def send_error_message(self, error_description: str, sender_id: str, receiver_id: str = None,
context_id: str = None) -> AgentMessage:
"""Send an error message using the protocol"""
return self.protocol.create_error_message(
error_description=error_description,
sender_id=sender_id,
receiver_id=receiver_id,
error_type="documentation_error",
error_details={"timestamp": datetime.now().isoformat()},
context_id=context_id
)
# Singleton instance
_documentation_agent = None
def get_documentation_agent():
"""Returns a singleton instance of the documentation agent"""
global _documentation_agent
if _documentation_agent is None:
_documentation_agent = DocumentationAgent()
return _documentation_agent
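The agent attaches generated artifacts to response["documents"] in the structure built in process_message (a source block plus a contents list). A small consumer sketch that assumes only that structure:

def extract_document_text(response):
    # Return the text of the first generated document, if any
    for document in response.get("documents", []):
        for content in document.get("contents", []):
            if content.get("type") == "text":
                return content.get("text", "")
    return None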

File diff suppressed because it is too large


@@ -1,233 +0,0 @@
"""
Enhanced base agent class for the Agentservice.
Provides improved communication and document handling capabilities.
"""
import logging
import json
from typing import Dict, Any, List, Optional, Tuple, Union
import asyncio
from datetime import datetime
import uuid
logger = logging.getLogger(__name__)
class AgentBase:
"""
Enhanced base agent class with improved communication capabilities.
All specialized agents should inherit from this class.
"""
def __init__(self):
"""Initialize the enhanced agent."""
# Identity defaults (assumed values); specialized agents override these,
# and later methods such as process_message rely on them being set
self.id = "base"
self.name = "base"
self.type = "base"
self.description = "Generic base agent"
self.capabilities = "Basic agent operations"
self.result_format = "Text"
# Document handling (disabled by default; specialized agents enable it)
self.supports_documents = False
self.document_handler = None
# System dependencies
self.ai_service = None
def set_dependencies(self, ai_service=None, document_handler=None, lucydom_interface=None):
self.ai_service = ai_service
self.document_handler = document_handler
def get_config(self) -> Dict[str, Any]:
"""
Get detailed information about the agent.
Returns:
Dictionary with agent information
"""
return {
"name": self.name,
"capabilities": self.capabilities,
"result_format": self.result_format,
}
def get_capabilities(self) -> List[str]:
"""
Get a list of agent capabilities.
Returns:
List of capability strings
"""
# Split capabilities into a list
if isinstance(self.capabilities, str):
return [cap.strip() for cap in self.capabilities.split(",")]
return []
def get_supported_formats(self) -> List[str]:
"""
Get supported output formats.
Returns:
List of supported format strings
"""
if isinstance(self.result_format, str):
return [fmt.strip() for fmt in self.result_format.split(",")]
return ["Text"]
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
"""
Process a message and generate a response.
Args:
message: Input message
context: Optional context information
Returns:
Response message
"""
# Basic implementation - should be overridden by specialized agents
if not self.ai_service:
logger.warning(f"Agent {self.id} has no AI service configured")
return {
"role": "assistant",
"content": f"I'm {self.name}, but I'm not properly configured. Please set up the AI service.",
"agent_id": self.id,
"agent_type": self.type,
"result_format": "Text"
}
# Process documents if available and set up document handler
document_context = ""
if self.supports_documents and self.document_handler and message.get("documents"):
document_context = await self._process_documents(message)
# Create enhanced prompt
prompt = self._create_enhanced_prompt(message, document_context, context)
# Generate response
try:
response_content = await self.ai_service.call_api([
{"role": "system", "content": self._get_system_prompt()},
{"role": "user", "content": prompt}
])
# Process the response to extract any special instructions or status
content, status = self._process_response(response_content)
return {
"role": "assistant",
"content": content,
"agent_id": self.id,
"agent_type": self.type,
"agent_name": self.name,
"result_format": self.result_format,
"status": status,
"workflow_id": message.get("workflow_id"),
"documents": message.get("documents", []) # Pass through documents
}
except Exception as e:
logger.error(f"Error in agent {self.id}: {str(e)}")
return {
"role": "assistant",
"content": f"I encountered an error: {str(e)}",
"agent_id": self.id,
"agent_type": self.type,
"result_format": "Text",
"status": "error"
}
async def _process_documents(self, message: Dict[str, Any]) -> str:
"""
Process documents in the message.
Args:
message: Input message with documents
Returns:
Document context as text
"""
# Simply extract text from documents
if not self.document_handler:
return ""
return self.document_handler.merge_document_contents(message)
def _create_enhanced_prompt(self, message: Dict[str, Any], document_context: str, context: Dict[str, Any] = None) -> str:
"""
Create an enhanced prompt with context.
Args:
message: Input message
document_context: Document context
context: Optional additional context
Returns:
Enhanced prompt
"""
prompt = message.get("content", "")
# Add document context if available
if document_context:
prompt += f"\n\n=== DOCUMENT CONTEXT ===\n{document_context}"
# Add any additional context
if context:
# Add expected format if specified
if "expected_format" in context:
prompt += f"\n\nPlease format your response as: {context['expected_format']}"
# Add dependency outputs if available
if "dependency_outputs" in context:
prompt += "\n\n=== OUTPUTS FROM PREVIOUS ACTIVITIES ===\n"
for key, value in context["dependency_outputs"].items():
if isinstance(value, dict) and "content" in value:
prompt += f"\n--- {key} ---\n{value['content']}\n"
else:
prompt += f"\n--- {key} ---\n{str(value)}\n"
return prompt
def _get_system_prompt(self) -> str:
"""
Get the system prompt for the agent.
Returns:
System prompt string
"""
return f"""
You are {self.name}, a specialized {self.type} agent.
{self.description}
Your capabilities include: {self.capabilities}
You should format your responses according to: {self.result_format}
Respond clearly and helpfully to the user's request.
When appropriate, include a status indicator at the end of your message:
[STATUS: COMPLETE] - When you've fully addressed the request
[STATUS: PARTIAL] - When you've partially addressed the request
[STATUS: QUESTION] - When you need more information
"""
def _process_response(self, response: str) -> Tuple[str, str]:
"""
Process the response to extract status and clean content.
Args:
response: Raw response from the AI
Returns:
Tuple of (cleaned content, status)
"""
# Default status
status = "complete"
# Check for status tags
import re
status_match = re.search(r'\[STATUS:\s*(COMPLETE|PARTIAL|QUESTION)\]', response, re.IGNORECASE)
if status_match:
status_value = status_match.group(1).lower()
# Remove the status tag
content = re.sub(r'\[STATUS:\s*(COMPLETE|PARTIAL|QUESTION)\]', '', response, flags=re.IGNORECASE).strip()
return content, status_value
return response, status
# Factory functions
def get_enhanced_base_agent() -> AgentBase:
"""Get an instance of the enhanced base agent."""
return AgentBase()
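The [STATUS: ...] convention requested in _get_system_prompt is parsed back out by _process_response; a quick round trip of that convention:

agent = get_enhanced_base_agent()
content, status = agent._process_response("All done. [STATUS: COMPLETE]")
# content == "All done.", status == "complete"
content, status = agent._process_response("No tag here.")
# status defaults to "complete" when no tag is present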


@@ -1,921 +0,0 @@
"""
Refactored helper function for intelligent data extraction (continued).
"""
import logging
import json
from typing import List, Dict, Any, Optional, Tuple
import asyncio
from datetime import datetime
import uuid
logger = logging.getLogger(__name__)
async def data_extraction(
prompt: str,
files: List[Dict[str, Any]],
messages: List[Dict[str, Any]],
ai_service,
lucydom_interface = None,
workflow_id: str = None,
add_log_func = None,
document_handler = None # Add document handler parameter
) -> Dict[str, Any]:
"""
Performs AI-driven data extraction with improved document and image handling.
Args:
prompt: Specification of what data to extract
files: List of all available files with metadata
messages: List of all messages in the workflow
ai_service: Service for AI requests
lucydom_interface: Interface for database access (optional)
workflow_id: Optional workflow ID for logging
add_log_func: Optional function for adding logs
document_handler: Optional document handler for structured document operations
Returns:
Structured text object with extracted data and context information
"""
try:
# Log extraction start
_log(add_log_func, workflow_id, f"Starting data extraction with {len(files)} files", "info")
# Create enhanced extraction plan using AI
_log(add_log_func, workflow_id, "Creating extraction plan", "info")
extraction_plan = await _create_extraction_plan(prompt, files, messages, ai_service, workflow_id, add_log_func)
# If we have extraction plan, log summary
if extraction_plan:
extract_needed_count = sum(1 for item in extraction_plan if item.get("extract_needed", False))
_log(add_log_func, workflow_id,
f"Extraction plan created: {len(extraction_plan)} files, {extract_needed_count} need extraction", "info")
# Execute extractions, preferring document handler if available
if document_handler:
_log(add_log_func, workflow_id, "Using document handler for extraction", "info")
extracted_data = await _execute_extractions_with_handler(
extraction_plan,
files,
messages,
document_handler,
ai_service,
workflow_id,
add_log_func
)
else:
# Fall back to original implementation
_log(add_log_func, workflow_id, "Using fallback extraction method", "info")
extracted_data = await _execute_extractions(
extraction_plan,
files,
messages,
lucydom_interface,
ai_service,
workflow_id,
add_log_func
)
# Structure extracted data
_log(add_log_func, workflow_id, f"Structuring extracted data from {len(extracted_data)} files", "info")
structured_result = _structure_extracted_data(extracted_data, files, prompt)
# Enhance with contextual summaries using AI
if ai_service and structured_result["extracted_content"]:
_log(add_log_func, workflow_id, "Creating contextual summaries for extracted content", "info")
try:
# Create a prompt for contextual summary
summary_prompt = f"""
Create concise, contextual summaries of the following extracted content according to this requirement:
REQUIREMENT: {prompt}
EXTRACTED CONTENT:
"""
for item in structured_result["extracted_content"]:
file_name = item.get("name", "Unnamed file")
content_preview = item.get("content", "")[:500] + "..." if len(item.get("content", "")) > 500 else item.get("content", "")
summary_prompt += f"\n--- {file_name} ---\n{content_preview}\n"
# Call AI for contextual summaries
summaries = await ai_service.call_api([{"role": "user", "content": summary_prompt}])
structured_result["contextual_summary"] = summaries
_log(add_log_func, workflow_id, "Added contextual summaries to extracted data", "info")
except Exception as e:
_log(add_log_func, workflow_id, f"Error creating contextual summaries: {str(e)}", "warning")
# Handle image-specific content separately
image_content = [item for item in structured_result["extracted_content"]
if "Image Analysis" in item.get("content", "") or item.get("type") == "image"]
if image_content:
_log(add_log_func, workflow_id, f"Processing {len(image_content)} image-related content items", "info")
# Add image analysis summary if we have AI service
if ai_service:
try:
# Create a prompt for image analysis summary
image_summary_prompt = f"""
Summarize the key visual information from these image analyses according to this requirement:
REQUIREMENT: {prompt}
IMAGE ANALYSES:
"""
for item in image_content:
file_name = item.get("name", "Unnamed image")
content = item.get("content", "")
image_summary_prompt += f"\n--- {file_name} ---\n{content}\n"
# Call AI for image analysis summary
image_summaries = await ai_service.call_api([{"role": "user", "content": image_summary_prompt}])
structured_result["image_analysis_summary"] = image_summaries
_log(add_log_func, workflow_id, "Added image analysis summary to extracted data", "info")
except Exception as e:
_log(add_log_func, workflow_id, f"Error creating image analysis summary: {str(e)}", "warning")
return structured_result
except Exception as e:
logger.error(f"Error in data extraction: {str(e)}", exc_info=True)
# Add error log
if add_log_func and workflow_id:
add_log_func(workflow_id, f"Data extraction error: {str(e)}", "error")
# Return error result
return {
"error": str(e),
"status": "error",
"files_processed": len(files),
"message": f"Data extraction failed: {str(e)}"
}
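# Illustrative usage sketch (not part of the original module); `ai_service`,
# `document_handler`, and the workflow structure are assumed to be provided
# by the surrounding service:
#
#   result = await data_extraction(
#       prompt="Extract all invoice totals",
#       files=[{"id": 1234, "name": "invoice.pdf", "type": "document",
#               "content_type": "application/pdf", "size": 20480}],
#       messages=workflow["messages"],
#       ai_service=ai_service,
#       document_handler=document_handler,
#       workflow_id="wf_42",
#   )
#   for item in result.get("extracted_content", []):
#       print(item["name"], item["extraction_method"])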
async def _execute_extractions_with_handler(
extraction_plan: List[Dict[str, Any]],
files: List[Dict[str, Any]],
messages: List[Dict[str, Any]],
document_handler,
ai_service,
workflow_id: str = None,
add_log_func = None
) -> List[Dict[str, Any]]:
"""
Execute extractions using the document handler with enhanced image processing.
Args:
extraction_plan: List of extraction instructions
files: List of all available files
messages: List of all messages
document_handler: Document handler for structured operations
ai_service: Service for AI requests
workflow_id: Optional workflow ID for logging
add_log_func: Optional function for adding logs
Returns:
List with extracted data per file
"""
extracted_data = []
# Sort by importance (highest first)
sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)
for extraction_item in sorted_plan:
file_id = extraction_item.get("file_id")
extract_needed = extraction_item.get("extract_needed", False)
extraction_prompt = extraction_item.get("extraction_prompt", "")
# Find file metadata
file_metadata = next((f for f in files if f.get("id") == file_id), None)
if not file_metadata:
logger.warning(f"File with ID {file_id} not found")
continue
file_name = file_metadata.get("name", "")
file_type = file_metadata.get("type", "")
content_type = file_metadata.get("content_type", "")
# Log extraction start
_log(add_log_func, workflow_id,
f"Processing file: {file_name} (Extraction needed: {extract_needed})", "info")
# Only perform extraction if needed
if extract_needed:
# Check if file already exists in messages with content
existing_content = _find_document_in_messages(file_id, messages)
if existing_content and existing_content.get("content"):
# Content already exists, check if we need more specialized extraction
current_context = existing_content.get("extraction_context", "")
# Check if new extraction prompt is different or more specific
if extraction_prompt and extraction_prompt != current_context:
_log(add_log_func, workflow_id,
f"Re-extracting {file_name} with new prompt: {extraction_prompt}", "info")
# Create an empty message to extract into
empty_message = {}
# Use document handler to extract with new context
try:
result_message = await document_handler.add_file_to_message(
empty_message,
file_id,
extraction_prompt
)
# Get the document content from result
if "documents" in result_message and result_message["documents"]:
doc = result_message["documents"][0]
# Get text content
content_text = ""
is_extracted = False
for content in doc.get("contents", []):
if content.get("type") == "text":
content_text = content.get("text", "")
is_extracted = content.get("is_extracted", False)
break
# Create extraction result
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": content_text,
"is_extracted": is_extracted,
"extraction_method": "document_handler_reextract",
"extraction_context": extraction_prompt
})
# Check for additional documents (e.g., extracted images)
for additional_doc in result_message.get("documents", [])[1:]:
source = additional_doc.get("source", {})
# Skip if not an extracted document
if source.get("type") != "extracted":
continue
# Get content
add_content_text = ""
add_is_extracted = False
for content in additional_doc.get("contents", []):
if content.get("type") == "text":
add_content_text = content.get("text", "")
add_is_extracted = content.get("is_extracted", False)
break
# Add as separate extraction result
if add_content_text:
extracted_data.append({
"file_id": source.get("id", f"extracted_{uuid.uuid4()}"),
"name": source.get("name", f"Extracted from {file_name}"),
"type": source.get("content_type", "image"),
"content": add_content_text,
"is_extracted": add_is_extracted,
"extraction_method": "document_handler_extracted_component",
"extraction_context": content.get("extraction_context", extraction_prompt),
"parent_file_id": file_id
})
_log(add_log_func, workflow_id,
f"Extracted embedded content from {file_name}", "info")
_log(add_log_func, workflow_id,
f"Re-extracted {file_name} with new context", "info")
continue
except Exception as e:
logger.error(f"Error re-extracting {file_name}: {str(e)}")
_log(add_log_func, workflow_id,
f"Error re-extracting {file_name}: {str(e)}", "warning")
# Use existing content
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": existing_content.get("content", ""),
"is_extracted": existing_content.get("is_extracted", False),
"extraction_method": "existing_content",
"extraction_context": current_context
})
_log(add_log_func, workflow_id,
f"Using existing content for {file_name}", "info")
continue
# Need to extract content with document handler
try:
# Create an empty message to extract into
empty_message = {}
# Use document handler to add file and extract content
result_message = await document_handler.add_file_to_message(
empty_message,
file_id,
extraction_prompt
)
# Get the document content from result
if "documents" in result_message and result_message["documents"]:
# Process main document
doc = result_message["documents"][0] # First document is the main file
# Get text content
content_text = ""
is_extracted = False
for content in doc.get("contents", []):
if content.get("type") == "text":
content_text = content.get("text", "")
is_extracted = content.get("is_extracted", False)
break
# Create extraction result for main document
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": content_text,
"is_extracted": is_extracted,
"extraction_method": "document_handler",
"extraction_context": extraction_prompt
})
_log(add_log_func, workflow_id,
f"Extracted {file_name} using document handler", "info")
# Process additional documents (e.g., extracted images)
for additional_doc in result_message.get("documents", [])[1:]:
source = additional_doc.get("source", {})
# Skip if not an extracted document
if source.get("type") != "extracted":
continue
# Get content
add_content_text = ""
add_is_extracted = False
for content in additional_doc.get("contents", []):
if content.get("type") == "text":
add_content_text = content.get("text", "")
add_is_extracted = content.get("is_extracted", False)
break
# Add as separate extraction result
if add_content_text:
extracted_data.append({
"file_id": source.get("id", f"extracted_{uuid.uuid4()}"),
"name": source.get("name", f"Extracted from {file_name}"),
"type": source.get("content_type", "image"),
"content": add_content_text,
"is_extracted": add_is_extracted,
"extraction_method": "document_handler_extracted_component",
"extraction_context": content.get("extraction_context", extraction_prompt),
"parent_file_id": file_id
})
_log(add_log_func, workflow_id,
f"Extracted embedded content from {file_name}", "info")
else:
# Extraction failed
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"Failed to extract content from {file_name}",
"is_extracted": False,
"extraction_method": "failed"
})
_log(add_log_func, workflow_id,
f"Failed to extract content from {file_name}", "warning")
except Exception as e:
logger.error(f"Error extracting {file_name}: {str(e)}")
_log(add_log_func, workflow_id,
f"Error extracting {file_name}: {str(e)}", "warning")
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"Error extracting: {str(e)}",
"is_extracted": False,
"extraction_method": "error"
})
else:
# No extraction needed, use existing content
existing_content = _find_document_in_messages(file_id, messages)
if existing_content:
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": existing_content.get("content", ""),
"is_extracted": existing_content.get("is_extracted", False),
"extraction_method": "existing_content",
"extraction_context": existing_content.get("extraction_context", "")
})
_log(add_log_func, workflow_id,
f"Using existing content for {file_name}", "info")
else:
# No existing content found
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"No content available for {file_name}",
"is_extracted": False,
"extraction_method": "none"
})
_log(add_log_func, workflow_id,
f"No content available for {file_name}", "warning")
return extracted_data
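# Illustrative shape of the extraction plan this function consumes (values
# are made up; the structure mirrors what _create_extraction_plan returns):
#
#   [
#       {"file_id": 1234, "extract_needed": True,
#        "extraction_prompt": "Extract the revenue table", "importance": 5},
#       {"file_id": 5678, "extract_needed": False,
#        "extraction_prompt": "", "importance": 2},
#   ]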
def _find_document_in_messages(file_id: int, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Find a document by file ID in workflow messages.
Args:
file_id: ID of the file to find
messages: List of messages to search
Returns:
Dictionary with document information or empty dict if not found
"""
for message in messages:
for document in message.get("documents", []):
source = document.get("source", {})
# Check if file ID matches
if source.get("id") == str(file_id) or source.get("id") == file_id:
# Found the document
content_text = ""
is_extracted = False
# Look for text content
for content in document.get("contents", []):
if content.get("type") == "text":
content_text = content.get("text", "")
is_extracted = content.get("is_extracted", False)
break
return {
"document_id": document.get("id"),
"message_id": message.get("id"),
"content": content_text,
"is_extracted": is_extracted
}
return {}
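# Illustrative shape of the message structure this helper searches (field
# values are made up):
#
#   message = {
#       "id": "msg_1",
#       "documents": [{
#           "id": "doc_abc",
#           "source": {"type": "file", "id": "1234", "name": "report.pdf"},
#           "contents": [{"type": "text", "text": "Quarterly report ...",
#                         "is_extracted": True}],
#       }],
#   }
#   _find_document_in_messages(1234, [message])
#   # -> {"document_id": "doc_abc", "message_id": "msg_1",
#   #     "content": "Quarterly report ...", "is_extracted": True}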
async def _create_extraction_plan(
prompt: str,
files: List[Dict[str, Any]],
messages: List[Dict[str, Any]],
ai_service,
workflow_id: str = None,
add_log_func = None
) -> List[Dict[str, Any]]:
"""
Erstellt einen Extraktionsplan mit AI-Unterstützung.
Args:
prompt: Spezifizierung, welche Daten extrahiert werden sollen
files: Liste aller verfügbaren Dateien mit Metadaten
messages: Liste aller Nachrichten im Workflow
ai_service: Service für KI-Anfragen
workflow_id: Optionale ID des Workflows für Logging
add_log_func: Optionale Funktion für das Hinzufügen von Logs
Returns:
Extraktionsplan (Liste von Extraktionsanweisungen pro Datei)
"""
# Build context information for the AI call
file_infos = []
for file in files:
# Base metadata
file_info = {
"id": file.get("id", ""),
"name": file.get("name", ""),
"type": file.get("type", ""),
"content_type": file.get("content_type", ""),
"size": file.get("size", "")
}
# Check extraction status (if available)
doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)
if doc_contents:
# Check whether at least one content item with is_extracted=True exists
already_extracted = any(
content.get("is_extracted", False) for content in doc_contents
)
file_info["already_extracted"] = already_extracted
# Add a short preview of the content (if available)
for content in doc_contents:
if content.get("type") == "text" and content.get("text"):
preview_text = content.get("text", "")[:200] + "..." if len(content.get("text", "")) > 200 else content.get("text", "")
file_info["content_preview"] = preview_text
break
else:
file_info["already_extracted"] = False
file_infos.append(file_info)
# Build the AI prompt
extraction_prompt = f"""
You are a data extraction expert who uses AI analysis to decide which files
and contents need to be extracted for a given task.
TASK:
{prompt}
AVAILABLE FILES:
{json.dumps(file_infos, indent=2)}
For every file that is relevant to the task, create an extraction instruction with the following information:
1. file_id: The ID of the file to extract
2. extract_needed: Boolean indicating whether extraction is required (True if the file has not yet been extracted and is needed for the task)
3. extraction_prompt: A specific prompt for extracting the file (especially important for images and non-text-based files)
4. importance: Priority/importance for the task (1-5, where 5 is the most important)
Format:
[
{{
"file_id": 1234,
"extract_needed": true,
"extraction_prompt": "Extract the table data with a focus on the revenue figures",
"importance": 5
}},
...
]
Return only the JSON array, without any further explanation.
"""
# Add log
if add_log_func and workflow_id:
add_log_func(workflow_id, "Creating extraction plan...", "info")
try:
# Perform the AI call
extraction_plan_response = await ai_service.call_api([{"role": "user", "content": extraction_prompt}])
# Extract the JSON from the response
import re
json_match = re.search(r'\[.*\]', extraction_plan_response, re.DOTALL)
if json_match:
extraction_plan = json.loads(json_match.group(0))
# Add log
if add_log_func and workflow_id:
add_log_func(
workflow_id,
f"Extraction plan created for {len(extraction_plan)} files",
"info"
)
return extraction_plan
else:
# Fallback for parsing problems
if add_log_func and workflow_id:
add_log_func(
workflow_id,
"Parsing error in the extraction plan, creating a default plan",
"warning"
)
# Default plan: extract all files that have not been extracted yet
default_plan = []
for file in files:
doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)
already_extracted = any(
content.get("is_extracted", False) for content in doc_contents
) if doc_contents else False
default_plan.append({
"file_id": file.get("id", 0),
"extract_needed": not already_extracted,
"extraction_prompt": f"Extrahiere alle relevanten Informationen aus {file.get('name', '')}",
"importance": 3
})
return default_plan
except Exception as e:
logger.error(f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}", exc_info=True)
if add_log_func and workflow_id:
add_log_func(
workflow_id,
f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}",
"error"
)
# Leerer Plan bei Fehlern
return []
async def _execute_extractions(
extraction_plan: List[Dict[str, Any]],
files: List[Dict[str, Any]],
messages: List[Dict[str, Any]],
lucydom_interface,
ai_service,
workflow_id: str = None,
add_log_func = None,
logging_utils = None
) -> List[Dict[str, Any]]:
"""
Execute the planned extractions.
Args:
extraction_plan: List of extraction instructions
files: List of all available files
messages: List of all messages in the workflow
lucydom_interface: Interface for database access
ai_service: Service for AI requests
workflow_id: Optional workflow ID for logging
add_log_func: Optional function for adding logs
logging_utils: Optional logging utility
Returns:
List with extracted data per file
"""
extracted_data = []
# Sort by importance
sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)
for extraction_item in sorted_plan:
file_id = extraction_item.get("file_id")
extract_needed = extraction_item.get("extract_needed", False)
extraction_prompt = extraction_item.get("extraction_prompt", "")
# Find file metadata
file_metadata = next((f for f in files if f.get("id") == file_id), None)
if not file_metadata:
logger.warning(f"File with ID {file_id} not found")
continue
file_name = file_metadata.get("name", "")
file_type = file_metadata.get("type", "")
content_type = file_metadata.get("content_type", "")
# Add log
if logging_utils:
logging_utils.info(f"Processing file: {file_name} (Extraction needed: {extract_needed})", "extraction")
elif add_log_func and workflow_id:
add_log_func(
workflow_id,
f"Processing file: {file_name} (Extraction needed: {extract_needed})",
"info"
)
# Only perform extraction if needed
if extract_needed:
# Get file content via LucyDOM interface
if lucydom_interface:
try:
file_content = await lucydom_interface.read_file_content(file_id)
if not file_content:
if logging_utils:
logging_utils.warning(f"File {file_name} not found", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"File {file_name} not found", "warning")
continue
# Perform extraction based on file type
if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
# Image analysis with AI service
if ai_service and hasattr(ai_service, "analyze_image"):
try:
image_analysis = await ai_service.analyze_image(
image_data=file_content,
prompt=extraction_prompt,
mime_type=content_type
)
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": image_analysis,
"is_extracted": True,
"extraction_method": "image_analysis"
})
if logging_utils:
logging_utils.info(f"Image {file_name} successfully analyzed", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Image {file_name} successfully analyzed", "info")
except Exception as e:
logger.error(f"Error analyzing image {file_name}: {str(e)}")
if logging_utils:
logging_utils.error(f"Error analyzing image {file_name}: {str(e)}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error")
else:
# Fallback if no image analysis available
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"Image: {file_name} (Analysis not available)",
"is_extracted": False,
"extraction_method": "none"
})
else:
# Text-based extraction for all other file types
try:
# Import directly here to avoid circular imports
from modules.agentservice_utils import extract_text_from_file_content
content, is_extracted = extract_text_from_file_content(
file_content, file_name, content_type
)
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": content,
"is_extracted": is_extracted,
"extraction_method": "text_extraction"
})
if logging_utils:
logging_utils.info(f"File {file_name} extracted (Status: {is_extracted})", "extraction")
elif add_log_func and workflow_id:
add_log_func(
workflow_id,
f"File {file_name} extracted (Status: {is_extracted})",
"info"
)
except Exception as e:
logger.error(f"Error extracting text from {file_name}: {str(e)}")
if logging_utils:
logging_utils.error(f"Error extracting text from {file_name}: {str(e)}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Error extracting text from {file_name}: {str(e)}", "error")
except Exception as e:
logger.error(f"Error reading file {file_name}: {str(e)}")
if logging_utils:
logging_utils.error(f"Error reading file {file_name}: {str(e)}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Error reading file {file_name}: {str(e)}", "error")
else:
logger.warning(f"No LucyDOM interface available for file {file_name}")
if logging_utils:
logging_utils.warning(f"No LucyDOM interface available for file {file_name}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"No LucyDOM interface available for file {file_name}", "warning")
else:
# No extraction needed, use existing content
doc_contents = _extract_document_contents_from_messages(file_id, messages)
if doc_contents:
# Use first text content
for content in doc_contents:
if content.get("type") == "text":
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": content.get("text", ""),
"is_extracted": content.get("is_extracted", False),
"extraction_method": "existing_content"
})
break
else:
# No existing content found
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"No content available for {file_name}",
"is_extracted": False,
"extraction_method": "none"
})
return extracted_data
def _structure_extracted_data(
extracted_data: List[Dict[str, Any]],
files: List[Dict[str, Any]],
prompt: str
) -> Dict[str, Any]:
"""
Structure the extracted data into a formatted result.
Args:
extracted_data: List of extracted data per file
files: List of all available files
prompt: Original extraction prompt
Returns:
Structured result object
"""
# Create base structure
result = {
"prompt": prompt,
"files_processed": len(extracted_data),
"total_files": len(files),
"extraction_timestamp": datetime.now().isoformat(),
"status": "success",
"extracted_content": []
}
# Add extracted content
for data_item in extracted_data:
# Enrich with file metadata
file_id = data_item.get("file_id", 0)
file_metadata = next((f for f in files if f.get("id") == file_id), {})
content_item = {
"file_id": file_id,
"name": data_item.get("name", file_metadata.get("name", "")),
"type": data_item.get("type", file_metadata.get("type", "")),
"content_type": file_metadata.get("content_type", ""),
"size": file_metadata.get("size", ""),
"is_extracted": data_item.get("is_extracted", False),
"extraction_method": data_item.get("extraction_method", ""),
"content": data_item.get("content", "")
}
result["extracted_content"].append(content_item)
return result
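# Illustrative example of the structured result (values are made up):
#
#   {
#       "prompt": "Extract all invoice totals",
#       "files_processed": 2,
#       "total_files": 3,
#       "extraction_timestamp": "2025-04-20T22:22:22",
#       "status": "success",
#       "extracted_content": [
#           {"file_id": 1234, "name": "invoice.pdf", "type": "document",
#            "content_type": "application/pdf", "size": 20480,
#            "is_extracted": True, "extraction_method": "document_handler",
#            "content": "..."},
#       ],
#   }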
def _extract_document_contents_from_messages(file_id: int, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Extract document contents for a specific file from workflow messages.
Enhanced to handle the new document structure.
Args:
file_id: ID of the file
messages: List of all messages in the workflow
Returns:
List of document contents for the specified file
"""
contents = []
for message in messages:
# Search documents in the message
for document in message.get("documents", []):
source = document.get("source", {})
# Check if the file ID matches (handles both string and int IDs)
source_id = source.get("id")
if source_id is not None and str(source_id) == str(file_id):
# Add contents of the file
doc_contents = document.get("contents", [])
if doc_contents:
# Ensure each content has document reference
for content in doc_contents:
content_copy = content.copy()
content_copy["document_id"] = document.get("id")
content_copy["message_id"] = message.get("id")
contents.append(content_copy)
return contents
def _log(add_log_func, workflow_id, message, log_type, agent_id=None, agent_name=None):
"""Helper function for logging with different log functions"""
# Log via logger instance
if log_type == "error":
logger.error(message)
elif log_type == "warning":
logger.warning(message)
else:
logger.info(message)
# Log via provided log function (if available)
if add_log_func and workflow_id:
add_log_func(workflow_id, message, log_type, agent_id, agent_name)

View file

@@ -1,890 +0,0 @@
"""
Enhanced document handling module for the Agentservice (continued).
"""
import os
import logging
import uuid
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple, Union
logger = logging.getLogger(__name__)
class DocumentHandler:
"""
Centralized document handler for consistent document management across the system.
"""
def __init__(self, workflow_id: str = None, lucydom_interface = None, ai_service = None):
"""Initialize the document handler."""
self.workflow_id = workflow_id
self.lucydom_interface = lucydom_interface
self.ai_service = ai_service
# Import necessary utilities
from modules.agentservice_filemanager import get_file_manager
self.file_manager = get_file_manager()
def set_workflow_id(self, workflow_id: str):
"""Set or update the workflow ID."""
self.workflow_id = workflow_id
def set_lucydom_interface(self, lucydom_interface):
"""Set or update the LucyDOM interface."""
self.lucydom_interface = lucydom_interface
def set_ai_service(self, ai_service):
"""Set or update the AI service."""
self.ai_service = ai_service
async def add_file_to_message(self, message: Dict[str, Any], file_id: int, extraction_prompt: str = None) -> Dict[str, Any]:
"""
Add a file to a message with contextual extraction.
Args:
message: The message to add the file to
file_id: ID of the file to add
extraction_prompt: Optional prompt for contextual extraction (e.g., for images)
Returns:
Updated message with the file added
"""
if not self.lucydom_interface:
logger.error("LucyDOM interface not available")
return message
try:
# Get file metadata
file = self.lucydom_interface.get_file(file_id)
if not file:
logger.warning(f"File with ID {file_id} not found")
return message
# Get necessary file information
file_name = file.get("name", "unnamed_file")
file_type = file.get("type", "unknown")
content_type = file.get("content_type")
# Initialize documents array if needed
if "documents" not in message:
message["documents"] = []
# Check if file is already in the message
file_already_added = any(
doc.get("source", {}).get("id") == str(file_id)
for doc in message.get("documents", [])
)
if file_already_added:
logger.info(f"File {file_name} already exists in message, skipping")
return message
# Create a unique document ID
doc_id = f"doc_{uuid.uuid4()}"
# Create document structure
document = {
"id": doc_id,
"source": {
"type": "file",
"id": str(file_id),
"name": file_name,
"content_type": content_type,
"size": file.get("size"),
"upload_date": file.get("upload_date", datetime.now().isoformat())
},
"contents": []
}
# Only read content if we have extraction prompt or specific types
if (extraction_prompt or
file_type in ["document", "text"] or
(content_type and content_type.startswith("text/"))):
# Read file content
file_content = await self.lucydom_interface.read_file_content(file_id)
if file_content:
# Process based on file type
if file_type == "image" or (content_type and content_type.startswith("image/")):
# Image analysis if prompt provided
if self.ai_service and hasattr(self.ai_service, "analyze_image"):
try:
# Use provided prompt or default one
image_prompt = extraction_prompt or "Describe this image in detail"
logger.info(f"Analyzing image {file_name} with prompt: {image_prompt}")
image_analysis = await self.ai_service.analyze_image(
image_data=file_content,
prompt=image_prompt,
mime_type=content_type
)
# Add the analysis as text content
document["contents"].append({
"type": "text",
"text": f"Image Analysis:\n{image_analysis}",
"is_extracted": True,
"extraction_context": extraction_prompt
})
logger.info(f"Added image analysis for {file_name} to message")
except Exception as e:
logger.error(f"Error analyzing image {file_name}: {str(e)}")
document["contents"].append({
"type": "text",
"text": f"Image file: {file_name} (Analysis failed: {str(e)})",
"is_extracted": False
})
else:
# Just add placeholder if no analysis available
document["contents"].append({
"type": "text",
"text": f"Image file: {file_name} (no analysis requested)",
"is_extracted": False
})
# Enhanced PDF processing - extract text and images
elif file_name.lower().endswith('.pdf'):
logger.info(f"Processing PDF file: {file_name}")
# Extract text content first
from modules.agentservice_utils import extract_text_from_file_content
text_content, is_extracted = extract_text_from_file_content(
file_content, file_name, content_type
)
# Add text content
document["contents"].append({
"type": "text",
"text": text_content,
"is_extracted": is_extracted,
"extraction_context": extraction_prompt
})
logger.info(f"Extracted text content from PDF {file_name}")
# Extract and analyze images from PDF if we have AI service
if self.ai_service and hasattr(self.ai_service, "analyze_image"):
try:
# Import necessary modules
import fitz # PyMuPDF
from io import BytesIO
# Add detailed logging
logger.info(f"Starting PDF image extraction for {file_name}")
# Check if extraction prompt is available or use default
image_prompt = extraction_prompt or "Describe this image from the PDF document"
# Open PDF from memory stream with detailed error checking
try:
pdf_document = fitz.open(stream=file_content, filetype="pdf")
logger.info(f"Successfully opened PDF with {len(pdf_document)} pages")
except Exception as pdf_open_error:
logger.error(f"Failed to open PDF: {str(pdf_open_error)}")
raise
# Initialize images list and image count
images_analysis = []
image_count = 0
# Process each page
for page_num, page in enumerate(pdf_document, 1):
# Get list of images on the page
image_list = page.get_images(full=True)
if image_list:
logger.info(f"Found {len(image_list)} images on page {page_num}")
# Process each image
for img_index, img in enumerate(image_list):
try:
xref = img[0] # Get image reference
# Extract image data
base_image = pdf_document.extract_image(xref)
image_bytes = base_image["image"]
image_ext = base_image["ext"]
# Analyze image
image_analysis = await self.ai_service.analyze_image(
image_data=image_bytes,
prompt=f"{image_prompt} (Page {page_num}, Image {img_index+1})",
mime_type=f"image/{image_ext}"
)
# Add to analysis list
images_analysis.append({
"page": page_num,
"index": img_index + 1,
"analysis": image_analysis
})
image_count += 1
logger.info(f"Analyzed image {img_index+1} on page {page_num}")
# Create a separate document for each extracted image if needed
if True:  # Always create a separate document per extracted image; replace True with a flag to make this configurable
img_doc_id = f"img_doc_{uuid.uuid4()}"
image_filename = f"page{page_num}_image{img_index+1}.{image_ext}"
image_document = {
"id": img_doc_id,
"source": {
"type": "extracted",
"parent_id": str(file_id),
"id": img_doc_id,
"name": image_filename,
"content_type": f"image/{image_ext}",
"size": len(image_bytes)
},
"contents": [{
"type": "text",
"text": f"Image Analysis (PDF Page {page_num}, Image {img_index+1}):\n{image_analysis}",
"is_extracted": True,
"extraction_context": image_prompt
}]
}
# Add image document to message
message["documents"].append(image_document)
logger.info(f"Added extracted image document {image_filename} to message")
except Exception as img_err:
logger.warning(f"Error processing image {img_index} on page {page_num}: {str(img_err)}")
# Close the PDF
pdf_document.close()
# Add combined image analysis to the main document
if images_analysis:
combined_analysis = "\n\n## Embedded Images Analysis\n\n"
for img in images_analysis:
combined_analysis += f"### Page {img['page']}, Image {img['index']}\n{img['analysis']}\n\n"
document["contents"].append({
"type": "text",
"text": combined_analysis,
"is_extracted": True,
"extraction_context": f"Analysis of {image_count} images embedded in the PDF"
})
logger.info(f"Added combined analysis of {image_count} PDF images to document")
except ImportError:
logger.warning("PyMuPDF (fitz) is not installed, skipping PDF image extraction")
document["contents"].append({
"type": "text",
"text": "\n\nNote: PDF may contain images that were not extracted due to missing libraries.",
"is_extracted": False
})
except Exception as e:
logger.error(f"Error extracting images from PDF {file_name}: {str(e)}")
document["contents"].append({
"type": "text",
"text": f"\n\nError extracting images from PDF: {str(e)}",
"is_extracted": False
})
# Word document processing with image extraction
elif file_name.lower().endswith(('.docx', '.doc')):
logger.info(f"Processing Word document: {file_name}")
# Extract text content first
from modules.agentservice_utils import extract_text_from_file_content
text_content, is_extracted = extract_text_from_file_content(
file_content, file_name, content_type
)
# Add text content
document["contents"].append({
"type": "text",
"text": text_content,
"is_extracted": is_extracted,
"extraction_context": extraction_prompt
})
logger.info(f"Extracted text content from Word document {file_name}")
# Attempt to extract and analyze images from Word document
if self.ai_service and hasattr(self.ai_service, "analyze_image"):
try:
# For .docx documents
if file_name.lower().endswith('.docx'):
import zipfile
from io import BytesIO
# Check if extraction prompt is available or use default
image_prompt = extraction_prompt or "Describe this image from the Word document"
# Create a zipfile object from the .docx content
docx_zip = zipfile.ZipFile(BytesIO(file_content))
# Images in .docx are stored in the "word/media" directory
image_files = [f for f in docx_zip.namelist() if f.startswith('word/media/')]
if image_files:
logger.info(f"Found {len(image_files)} images in Word document {file_name}")
# Process each image
images_analysis = []
for i, img_path in enumerate(image_files):
try:
# Extract image data
image_bytes = docx_zip.read(img_path)
# Determine image type from filename
image_ext = img_path.split('.')[-1] if '.' in img_path else 'png'
# Analyze image
image_analysis = await self.ai_service.analyze_image(
image_data=image_bytes,
prompt=f"{image_prompt} (Image {i+1})",
mime_type=f"image/{image_ext}"
)
# Add to analysis list
images_analysis.append({
"index": i + 1,
"path": img_path,
"analysis": image_analysis
})
logger.info(f"Analyzed image {i+1} ({img_path}) from Word document")
# Create a separate document for each extracted image if needed
img_doc_id = f"img_doc_{uuid.uuid4()}"
image_filename = f"word_image{i+1}.{image_ext}"
image_document = {
"id": img_doc_id,
"source": {
"type": "extracted",
"parent_id": str(file_id),
"id": img_doc_id,
"name": image_filename,
"content_type": f"image/{image_ext}",
"size": len(image_bytes)
},
"contents": [{
"type": "text",
"text": f"Image Analysis (Word Document Image {i+1}):\n{image_analysis}",
"is_extracted": True,
"extraction_context": image_prompt
}]
}
# Add image document to message
message["documents"].append(image_document)
logger.info(f"Added extracted image document {image_filename} to message")
except Exception as img_err:
logger.warning(f"Error processing image {img_path}: {str(img_err)}")
# Add combined image analysis to the main document
if images_analysis:
combined_analysis = "\n\n## Embedded Images Analysis\n\n"
for img in images_analysis:
combined_analysis += f"### Image {img['index']}\n{img['analysis']}\n\n"
document["contents"].append({
"type": "text",
"text": combined_analysis,
"is_extracted": True,
"extraction_context": f"Analysis of {len(images_analysis)} images embedded in the Word document"
})
logger.info(f"Added combined analysis of {len(images_analysis)} Word document images")
# Close the zip file
docx_zip.close()
# Note: For .doc (older format) we would need additional libraries
# This could be implemented with libraries like antiword or pywin32
elif file_name.lower().endswith('.doc'):
logger.warning("Image extraction from .doc files is not supported yet")
document["contents"].append({
"type": "text",
"text": "\n\nNote: This is an older .doc format document. Images may be present but could not be extracted.",
"is_extracted": False
})
except Exception as e:
logger.error(f"Error extracting images from Word document {file_name}: {str(e)}")
document["contents"].append({
"type": "text",
"text": f"\n\nError extracting images from Word document: {str(e)}",
"is_extracted": False
})
# Excel file processing with enhanced capabilities
elif file_name.lower().endswith(('.xlsx', '.xls')):
logger.info(f"Processing Excel document: {file_name}")
# Extract text representation of spreadsheet data
from modules.agentservice_utils import extract_text_from_file_content
text_content, is_extracted = extract_text_from_file_content(
file_content, file_name, content_type
)
# Add text content
document["contents"].append({
"type": "text",
"text": text_content,
"is_extracted": is_extracted,
"extraction_context": extraction_prompt
})
logger.info(f"Extracted data from Excel document {file_name}")
# Try to extract charts and images if available
if self.ai_service and hasattr(self.ai_service, "analyze_image"):
try:
# For .xlsx files (newer format)
if file_name.lower().endswith('.xlsx'):
import zipfile
from io import BytesIO
# Create a zipfile object from the Excel content
xlsx_zip = zipfile.ZipFile(BytesIO(file_content))
# Charts and images can be in various directories
media_paths = [
'xl/media/',
'xl/drawings/',
'xl/charts/'
]
# Collect all potential media files
media_files = []
for path in media_paths:
media_files.extend([f for f in xlsx_zip.namelist() if f.startswith(path)])
if media_files:
logger.info(f"Found {len(media_files)} media files in Excel document {file_name}")
# Process image files (skip XML and other non-image files)
image_extensions = ['png', 'jpeg', 'jpg', 'gif', 'bmp', 'tiff', 'emf', 'wmf']
image_files = [f for f in media_files if f.split('.')[-1].lower() in image_extensions]
if image_files:
logger.info(f"Found {len(image_files)} images/charts in Excel document {file_name}")
image_prompt = extraction_prompt or "Describe this chart/image from the Excel document"
images_analysis = []
for i, img_path in enumerate(image_files):
try:
# Extract image data
image_bytes = xlsx_zip.read(img_path)
# Determine image type from filename
image_ext = img_path.split('.')[-1] if '.' in img_path else 'png'
# Analyze image
image_analysis = await self.ai_service.analyze_image(
image_data=image_bytes,
prompt=f"{image_prompt} (Describe what this chart or image shows, including any data trends or patterns visible)",
mime_type=f"image/{image_ext}"
)
# Add to analysis list
images_analysis.append({
"index": i + 1,
"path": img_path,
"analysis": image_analysis
})
logger.info(f"Analyzed image/chart {i+1} from Excel document")
# Create a separate document for each extracted image
img_doc_id = f"img_doc_{uuid.uuid4()}"
image_filename = f"excel_image{i+1}.{image_ext}"
image_document = {
"id": img_doc_id,
"source": {
"type": "extracted",
"parent_id": str(file_id),
"id": img_doc_id,
"name": image_filename,
"content_type": f"image/{image_ext}",
"size": len(image_bytes)
},
"contents": [{
"type": "text",
"text": f"Chart/Image Analysis (Excel Document Item {i+1}):\n{image_analysis}",
"is_extracted": True,
"extraction_context": image_prompt
}]
}
# Add image document to message
message["documents"].append(image_document)
except Exception as img_err:
logger.warning(f"Error processing image {img_path}: {str(img_err)}")
# Add combined image analysis to the main document
if images_analysis:
combined_analysis = "\n\n## Embedded Charts and Images Analysis\n\n"
for img in images_analysis:
combined_analysis += f"### Chart/Image {img['index']}\n{img['analysis']}\n\n"
document["contents"].append({
"type": "text",
"text": combined_analysis,
"is_extracted": True,
"extraction_context": f"Analysis of {len(images_analysis)} charts/images from the Excel document"
})
# Close the zip file
xlsx_zip.close()
except Exception as e:
logger.error(f"Error extracting charts/images from Excel document {file_name}: {str(e)}")
else:
# For other file types, extract text
from modules.agentservice_utils import extract_text_from_file_content
content, is_extracted = extract_text_from_file_content(
file_content, file_name, content_type
)
document["contents"].append({
"type": "text",
"text": content,
"is_extracted": is_extracted,
"extraction_context": extraction_prompt
})
logger.info(f"Added text content for {file_name} to message (extracted: {is_extracted})")
else:
# No content available
document["contents"].append({
"type": "text",
"text": f"File content not available for {file_name}",
"is_extracted": False
})
else:
# Just add reference without content
document["contents"].append({
"type": "text",
"text": f"File: {file_name} (content not loaded)",
"is_extracted": False
})
# Add document to message
message["documents"].append(document)
logger.info(f"File {file_name} successfully added to message")
return message
except Exception as e:
logger.error(f"Error adding file {file_id} to message: {str(e)}")
return message
async def extract_document_content(self, doc_id: str, message: Dict[str, Any], extraction_prompt: str) -> Dict[str, Any]:
"""
Extract or update document content with contextual extraction.
Args:
doc_id: ID of the document to extract
message: Message containing the document
extraction_prompt: Contextual prompt for extraction
Returns:
Updated message with extracted content
"""
if not message or "documents" not in message:
return message
updated_message = message.copy()
# Find the document
for i, document in enumerate(updated_message.get("documents", [])):
if document.get("id") == doc_id:
# Get file ID from source
source = document.get("source", {})
file_id = source.get("id")
# Only numeric IDs refer to stored files (generated/extracted documents use non-numeric IDs)
if file_id and str(file_id).isdigit() and self.lucydom_interface:
# Get file metadata
file = self.lucydom_interface.get_file(int(file_id))
if not file:
continue
# Get file content
file_content = await self.lucydom_interface.read_file_content(int(file_id))
if not file_content:
continue
# Process based on file type
file_name = file.get("name", "unnamed_file")
file_type = file.get("type", "unknown")
content_type = file.get("content_type")
# Update content based on file type
if file_type == "image" or (content_type and content_type.startswith("image/")):
if self.ai_service and hasattr(self.ai_service, "analyze_image"):
try:
image_analysis = await self.ai_service.analyze_image(
image_data=file_content,
prompt=extraction_prompt,
mime_type=content_type
)
# Create or update content
new_content = {
"type": "text",
"text": f"Image Analysis:\n{image_analysis}",
"is_extracted": True,
"extraction_context": extraction_prompt
}
# Update or add content
contents = document.get("contents", [])
contents_updated = False
for j, content in enumerate(contents):
if content.get("type") == "text":
updated_message["documents"][i]["contents"][j] = new_content
contents_updated = True
break
if not contents_updated:
if not updated_message["documents"][i].get("contents"):
updated_message["documents"][i]["contents"] = []
updated_message["documents"][i]["contents"].append(new_content)
logger.info(f"Updated image analysis for {file_name} with new context: {extraction_prompt}")
except Exception as e:
logger.error(f"Error updating image analysis for {file_name}: {str(e)}")
else:
# For other file types, extract text with new context
from modules.agentservice_utils import extract_text_from_file_content
content, is_extracted = extract_text_from_file_content(
file_content, file_name, content_type
)
new_content = {
"type": "text",
"text": content,
"is_extracted": is_extracted,
"extraction_context": extraction_prompt
}
# Update or add content
contents = document.get("contents", [])
contents_updated = False
for j, content_item in enumerate(contents):
if content_item.get("type") == "text":
updated_message["documents"][i]["contents"][j] = new_content
contents_updated = True
break
if not contents_updated:
if not updated_message["documents"][i].get("contents"):
updated_message["documents"][i]["contents"] = []
updated_message["documents"][i]["contents"].append(new_content)
logger.info(f"Updated text extraction for {file_name} with new context: {extraction_prompt}")
# Found and processed the document, stop searching
break
return updated_message
async def extract_files_from_workflow(self, workflow: Dict[str, Any], extraction_prompt: str, file_filter: str = None) -> Dict[str, Any]:
"""
Extract all relevant files from a workflow with context-aware extraction.
Args:
workflow: The workflow object
extraction_prompt: Contextual prompt for extraction
file_filter: Optional filter for file types (e.g., "csv", "image")
Returns:
Dictionary with extracted content
"""
# Import for data extraction
from modules.agentservice_dataextraction import data_extraction
# Get all files from the workflow
files = []
# Process all messages
for message in workflow.get("messages", []):
# Extract documents from the message
for doc in message.get("documents", []):
source = doc.get("source", {})
# Only include file documents
if source.get("type") == "file":
file_info = {
"id": source.get("id", ""),
"name": source.get("name", ""),
"type": source.get("type", ""),
"content_type": source.get("content_type", ""),
"size": source.get("size", 0)
}
# Apply filter if provided
if file_filter:
file_name = file_info.get("name", "").lower()
content_type = file_info.get("content_type", "").lower()
if (file_filter.lower() in file_name or
file_filter.lower() in content_type):
# Check if file is already in the list
if not any(f.get("id") == file_info["id"] for f in files):
files.append(file_info)
else:
# No filter, include all files
if not any(f.get("id") == file_info["id"] for f in files):
files.append(file_info)
# If no files found, return empty result
if not files:
return {
"prompt": extraction_prompt,
"files_processed": 0,
"extracted_content": []
}
# Get all messages from the workflow
workflow_messages = workflow.get("messages", [])
# Extract data using the dataextraction module
extracted_data = await data_extraction(
prompt=extraction_prompt,
files=files,
messages=workflow_messages,
ai_service=self.ai_service,
lucydom_interface=self.lucydom_interface,
workflow_id=self.workflow_id,
add_log_func=None # We don't have access to add_log_func here
)
return extracted_data
def get_file_content_from_message(self, message: Dict[str, Any], file_id: int = None, doc_id: str = None) -> str:
"""
Get file content from a message.
Args:
message: The message containing the document
file_id: Optional file ID to search for
doc_id: Optional document ID to search for
Returns:
Text content of the file if available
"""
if not message or "documents" not in message:
return ""
# Search for the document
for document in message.get("documents", []):
# Match by document ID or file ID
source = document.get("source", {})
source_file_id = source.get("id")
if ((doc_id and document.get("id") == doc_id) or
(file_id and source_file_id and str(file_id) == str(source_file_id))):
# Get text content from document
for content in document.get("contents", []):
if content.get("type") == "text":
return content.get("text", "")
return ""
def create_text_document(self, message: Dict[str, Any], content: str, title: str = "Generated Text") -> Dict[str, Any]:
"""
Create a new text document in a message.
Args:
message: The message to add the document to
content: Text content
title: Document title
Returns:
Updated message with the new document
"""
# Initialize documents array if needed
updated_message = message.copy()
if "documents" not in updated_message:
updated_message["documents"] = []
# Create document ID
doc_id = f"doc_{uuid.uuid4()}"
# Create document structure
document = {
"id": doc_id,
"source": {
"type": "generated",
"id": doc_id,
"name": title,
"content_type": "text/plain",
"size": len(content)
},
"contents": [
{
"type": "text",
"text": content,
"is_extracted": True
}
]
}
# Add document to message
updated_message["documents"].append(document)
logger.info(f"Created text document '{title}' in message")
return updated_message
def merge_document_contents(self, message: Dict[str, Any]) -> str:
"""
Merge all document contents from a message into a single text.
Args:
message: The message containing documents
Returns:
Combined text content from all documents
"""
if not message or "documents" not in message:
return ""
combined_text = ""
for document in message.get("documents", []):
source = document.get("source", {})
doc_name = source.get("name", "Unnamed Document")
# Extract text content
doc_text = ""
for content in document.get("contents", []):
if content.get("type") == "text":
doc_text = content.get("text", "")
break
if doc_text:
combined_text += f"\n\n--- {doc_name} ---\n\n{doc_text}"
return combined_text.strip()
# Factory function
def get_document_handler(workflow_id: str = None, lucydom_interface = None, ai_service = None) -> DocumentHandler:
"""Get a document handler instance."""
return DocumentHandler(workflow_id, lucydom_interface, ai_service)
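# Illustrative usage sketch (not part of the original module);
# `lucydom_interface` and `ai_service` are assumed dependencies injected
# by the caller:
#
#   handler = get_document_handler("wf_42", lucydom_interface, ai_service)
#   message = {"id": "msg_1"}
#   message = await handler.add_file_to_message(
#       message,
#       file_id=1234,
#       extraction_prompt="Summarize the tables in this document",
#   )
#   text = handler.get_file_content_from_message(message, file_id=1234)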

File diff suppressed because it is too large

View file

@@ -1,338 +0,0 @@
"""
Agent Communication Protocol module for the Agentservice.
Defines a standardized format for agents to exchange information.
"""
import json
import uuid
from typing import Dict, Any, List, Optional
from datetime import datetime
class AgentMessage:
"""
Standard message format for inter-agent communication.
Includes content, metadata, and document references.
"""
def __init__(
self,
content: str,
sender_id: str,
receiver_id: Optional[str] = None,
message_type: str = "text",
metadata: Optional[Dict[str, Any]] = None,
documents: Optional[List[Dict[str, Any]]] = None,
context_id: Optional[str] = None
):
"""
Initialize an agent message.
Args:
content: The main message content
sender_id: ID of the sending agent
receiver_id: Optional ID of the receiving agent
message_type: Type of message (text, task, result, etc.)
metadata: Optional metadata dictionary
documents: Optional list of document references
context_id: Optional conversation context ID
"""
self.id = f"msg_{uuid.uuid4()}"
self.timestamp = datetime.now().isoformat()
self.content = content
self.sender_id = sender_id
self.receiver_id = receiver_id
self.message_type = message_type
self.metadata = metadata or {}
self.documents = documents or []
self.context_id = context_id
def to_dict(self) -> Dict[str, Any]:
"""Convert the message to a dictionary."""
return {
"id": self.id,
"timestamp": self.timestamp,
"content": self.content,
"sender_id": self.sender_id,
"receiver_id": self.receiver_id,
"message_type": self.message_type,
"metadata": self.metadata,
"documents": self.documents,
"context_id": self.context_id
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'AgentMessage':
"""Create a message from a dictionary."""
message = cls(
content=data.get("content", ""),
sender_id=data.get("sender_id", "unknown"),
receiver_id=data.get("receiver_id"),
message_type=data.get("message_type", "text"),
metadata=data.get("metadata", {}),
documents=data.get("documents", []),
context_id=data.get("context_id")
)
message.id = data.get("id", message.id)
message.timestamp = data.get("timestamp", message.timestamp)
return message
def to_json(self) -> str:
"""Convert the message to a JSON string."""
return json.dumps(self.to_dict())
@classmethod
def from_json(cls, json_str: str) -> 'AgentMessage':
"""Create a message from a JSON string."""
return cls.from_dict(json.loads(json_str))
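# Illustrative serialization round trip (not part of the original module):
#
#   msg = AgentMessage(content="Analysis finished", sender_id="agent_a",
#                      receiver_id="agent_b", message_type="result")
#   restored = AgentMessage.from_json(msg.to_json())
#   assert restored.id == msg.id and restored.content == msg.content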
class AgentCommunicationProtocol:
"""
Defines the protocol for agents to communicate with each other.
Provides standardized message creation and handling.
"""
@staticmethod
def create_text_message(
content: str,
sender_id: str,
receiver_id: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
documents: Optional[List[Dict[str, Any]]] = None,
context_id: Optional[str] = None
) -> AgentMessage:
"""Create a simple text message."""
return AgentMessage(
content=content,
sender_id=sender_id,
receiver_id=receiver_id,
message_type="text",
metadata=metadata,
documents=documents,
context_id=context_id
)
@staticmethod
def create_task_message(
task_description: str,
sender_id: str,
receiver_id: str,
input_data: Optional[Dict[str, Any]] = None,
documents: Optional[List[Dict[str, Any]]] = None,
context_id: Optional[str] = None
) -> AgentMessage:
"""Create a task assignment message."""
metadata = {
"task_type": "general",
"input_data": input_data or {},
"priority": "normal",
"task_id": f"task_{uuid.uuid4()}"
}
return AgentMessage(
content=task_description,
sender_id=sender_id,
receiver_id=receiver_id,
message_type="task",
metadata=metadata,
documents=documents,
context_id=context_id
)
@staticmethod
def create_result_message(
result_content: str,
sender_id: str,
receiver_id: str,
task_id: str,
output_data: Optional[Dict[str, Any]] = None,
result_format: str = "text",
documents: Optional[List[Dict[str, Any]]] = None,
context_id: Optional[str] = None
) -> AgentMessage:
"""Create a task result message."""
metadata = {
"task_id": task_id,
"result_format": result_format,
"status": "completed",
"output_data": output_data or {}
}
return AgentMessage(
content=result_content,
sender_id=sender_id,
receiver_id=receiver_id,
message_type="result",
metadata=metadata,
documents=documents,
context_id=context_id
)
@staticmethod
def create_error_message(
error_description: str,
sender_id: str,
receiver_id: Optional[str] = None,
error_type: str = "general",
error_details: Optional[Dict[str, Any]] = None,
context_id: Optional[str] = None
) -> AgentMessage:
"""Create an error message."""
metadata = {
"error_type": error_type,
"error_details": error_details or {},
"severity": "error"
}
return AgentMessage(
content=error_description,
sender_id=sender_id,
receiver_id=receiver_id,
message_type="error",
metadata=metadata,
context_id=context_id
)
@staticmethod
def create_document_request_message(
document_description: str,
sender_id: str,
receiver_id: str,
filters: Optional[Dict[str, Any]] = None,
context_id: Optional[str] = None
) -> AgentMessage:
"""Create a document request message."""
metadata = {
"request_type": "document",
"filters": filters or {},
"request_id": f"req_{uuid.uuid4()}"
}
return AgentMessage(
content=document_description,
sender_id=sender_id,
receiver_id=receiver_id,
message_type="request",
metadata=metadata,
context_id=context_id
)
@staticmethod
def create_status_update_message(
status_description: str,
sender_id: str,
receiver_id: Optional[str] = None,
status: str = "in_progress",
progress: float = 0.0,
context_id: Optional[str] = None
) -> AgentMessage:
"""Create a status update message."""
metadata = {
"status": status,
"progress": progress,
"update_type": "status"
}
return AgentMessage(
content=status_description,
sender_id=sender_id,
receiver_id=receiver_id,
message_type="status",
metadata=metadata,
context_id=context_id
)
@staticmethod
def convert_system_message_to_agent_message(system_message: Dict[str, Any], sender_id: str) -> AgentMessage:
"""
Convert a system message to an agent message.
Args:
system_message: Message object from the workflow
sender_id: ID of the sending agent
Returns:
AgentMessage instance
"""
# Extract basic information
content = system_message.get("content", "")
message_id = system_message.get("id", f"msg_{uuid.uuid4()}")
timestamp = system_message.get("started_at", datetime.now().isoformat())
# Create metadata
metadata = {
"agent_type": system_message.get("agent_type"),
"agent_name": system_message.get("agent_name"),
"workflow_id": system_message.get("workflow_id"),
"sequence_no": system_message.get("sequence_no"),
"result_format": system_message.get("result_format"),
"original_message_id": message_id
}
# Create agent message
agent_message = AgentMessage(
content=content,
sender_id=sender_id,
message_type="system",
metadata=metadata,
documents=system_message.get("documents", []),
context_id=system_message.get("workflow_id")
)
# Set original ID and timestamp
agent_message.id = message_id
agent_message.timestamp = timestamp
return agent_message
@staticmethod
def convert_agent_message_to_system_message(agent_message: AgentMessage) -> Dict[str, Any]:
"""
Convert an agent message to a system message.
Args:
agent_message: The agent message to convert
Returns:
System message dictionary
"""
message_data = agent_message.to_dict()
metadata = message_data.get("metadata", {})
# Create system message structure
system_message = {
"id": message_data.get("id", f"msg_{uuid.uuid4()}"),
"workflow_id": message_data.get("context_id"),
"started_at": message_data.get("timestamp", datetime.now().isoformat()),
"finished_at": datetime.now().isoformat(),
"sequence_no": metadata.get("sequence_no", 0),
"status": "completed",
"role": "assistant",
"data_stats": {
"processing_time": 0.0,
"token_count": 0,
"bytes_sent": 0,
"bytes_received": 0
},
"agent_type": metadata.get("agent_type"),
"agent_id": message_data.get("sender_id"),
"agent_name": metadata.get("agent_name"),
"result_format": metadata.get("result_format", "text"),
"content": message_data.get("content", ""),
"documents": message_data.get("documents", [])
}
# If this is a result message, add more metadata
if message_data.get("message_type") == "result":
system_message["output_data"] = metadata.get("output_data", {})
system_message["task_id"] = metadata.get("task_id")
return system_message
# Factory function
def get_agent_protocol():
"""Get the agent communication protocol."""
return AgentCommunicationProtocol
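# --- Usage sketch (illustrative, not part of the original module) ---
# Shows the factory methods above round-tripping a task result into a
# system message. The module path and the agent IDs are assumptions, and
# the final print assumes AgentMessage.to_dict() serializes message_type.
from modules.agentservice_protocol import get_agent_protocol  # path is an assumption

protocol = get_agent_protocol()
result_msg = protocol.create_result_message(
    result_content="Analysis finished",
    sender_id="agent_analyst",
    receiver_id="orchestrator",
    task_id="task_42",
    output_data={"rows": 128},
)
system_msg = protocol.convert_agent_message_to_system_message(result_msg)
print(system_msg.get("task_id"))  # -> task_42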


@ -1,290 +0,0 @@
"""
Updated registry for all available agents in the system.
Provides centralized agent registration and access with improved error handling.
"""
import os
import logging
import importlib
from typing import Dict, Any, List, Optional
# Import direct base agent module
from modules.agentservice_base import BaseAgent
logger = logging.getLogger(__name__)
class AgentRegistry:
"""Registry for all available agents in the system"""
_instance = None
@classmethod
def get_instance(cls):
"""Get a singleton instance of the Agent Registry"""
if cls._instance is None:
cls._instance = cls()
return cls._instance
def __init__(self):
"""Initialize the Agent Registry"""
if AgentRegistry._instance is not None:
raise RuntimeError("Singleton instance already exists - use get_instance()")
self.agents = {}
self.ai_service = None
self.document_handler = None
self.lucydom_interface = None
self._load_agents()
def _load_agents(self):
"""Load all available agents"""
# List of all agent modules to load
logger.info("Automatically loading agent modules...")
agent_modules = []
for filename in os.listdir(os.path.dirname(__file__)):
if filename.startswith("agentservice_agent_") and filename.endswith(".py"):
agent_modules.append(filename[:-3]) # Remove .py extension
if not agent_modules:
logger.warning("No agent modules found")
return
logger.info(f"Found {len(agent_modules)} agent modules")
for module_name in agent_modules:
try:
# Import the module
module = importlib.import_module(f"modules.{module_name}")
# Look for the agent class or a get_*_agent function
agent_type = module_name.split('_')[-1]
class_name = f"{agent_type.capitalize()}Agent"
getter_name = f"get_{agent_type}_agent"
agent = None
# Try to get the agent via the get_*_agent function
if hasattr(module, getter_name):
getter_func = getattr(module, getter_name)
agent = getter_func()
logger.info(f"Agent '{agent.name}' (Type: {agent.type}) loaded via {getter_name}()")
# Alternatively, try to instantiate the agent directly
elif hasattr(module, class_name):
agent_class = getattr(module, class_name)
agent = agent_class()
logger.info(f"Agent '{agent.name}' (Type: {agent.type}) directly instantiated")
if agent:
# Register the agent
self.register_agent(agent)
else:
logger.warning(f"No agent class or getter function found in module {module_name}")
except ImportError as e:
logger.error(f"Module {module_name} could not be imported: {e}")
except Exception as e:
logger.error(f"Error loading agent from module {module_name}: {e}")
def set_dependencies(self, ai_service=None, document_handler=None, lucydom_interface=None):
"""
Set system dependencies for all agents.
Args:
ai_service: AI service for text generation
document_handler: Document handler for document operations
lucydom_interface: LucyDOM interface for database access
"""
# Keep any dependency that was actually provided; previously only the
# AI service was stored and the other two arguments were silently dropped.
if ai_service is not None:
self.ai_service = ai_service
if document_handler is not None:
self.document_handler = document_handler
if lucydom_interface is not None:
self.lucydom_interface = lucydom_interface
# Update all registered agents
self.update_agent_dependencies()
def update_agent_dependencies(self):
"""Update dependencies for all registered agents"""
for agent_id, agent in self.agents.items():
if hasattr(agent, 'set_dependencies'):
agent.set_dependencies(
ai_service=self.ai_service,
document_handler=self.document_handler,
lucydom_interface=self.lucydom_interface
)
def register_agent(self, agent: 'BaseAgent'):
"""
Register an agent in the registry.
Args:
agent: The agent to register
"""
agent_type = agent.type
agent_id = getattr(agent, 'id', agent_type)
# Initialize enhanced agents with dependencies
if hasattr(agent, 'set_dependencies'):
agent.set_dependencies(
ai_service=self.ai_service,
document_handler=self.document_handler,
lucydom_interface=self.lucydom_interface
)
self.agents[agent_type] = agent
# Also register by ID if it's different from type
if agent_id != agent_type:
self.agents[agent_id] = agent
logger.debug(f"Agent '{agent.name}' (Type: {agent_type}, ID: {agent_id}) registered")
def get_agent(self, agent_identifier: str) -> Optional[BaseAgent]:
"""
Get an agent instance by ID or type.
Args:
agent_identifier: ID or type of the desired agent
Returns:
Agent instance or None if not found
"""
# Try to find directly by type
if agent_identifier in self.agents:
return self.agents[agent_identifier]
# If not found, try different name variants
variants = [
agent_identifier,
agent_identifier.replace('_agent', ''),
f"{agent_identifier}_agent"
]
for variant in variants:
if variant in self.agents:
return self.agents[variant]
logger.warning(f"Agent with identifier '{agent_identifier}' not found")
return None
def get_all_agents(self) -> Dict[str, BaseAgent]:
"""Get all registered agents."""
return self.agents
def get_agent_infos(self) -> List[Dict[str, Any]]:
"""Get information about all registered agents."""
agent_infos = []
# Only once per agent instance (since we register both by type and ID)
seen_agents = set()
for agent in self.agents.values():
if agent not in seen_agents:
agent_infos.append(agent.get_agent_info())
seen_agents.add(agent)
return agent_infos
def get_agent_by_format(self, required_format: str) -> Optional[BaseAgent]:
"""
Find an agent that can produce the required output format.
Args:
required_format: The required output format
Returns:
Agent that can produce the required format, or None if not found
"""
# Create mapping of result format -> agent for faster lookup
format_to_agent = {}
seen_agents = set()
for agent in self.agents.values():
if agent not in seen_agents:
# Get the agent's result format
agent_format = getattr(agent, 'result_format', None)
if agent_format:
format_to_agent[agent_format.lower()] = agent
seen_agents.add(agent)
# Try to find an exact match
if required_format.lower() in format_to_agent:
return format_to_agent[required_format.lower()]
# If no exact match, try to find a partial match
for fmt, agent in format_to_agent.items():
if required_format.lower() in fmt or fmt in required_format.lower():
return agent
# No match found
return None
def initialize_agents_for_workflow(self) -> Dict[str, Dict[str, Any]]:
"""Initialize agents for a workflow."""
initialized_agents = {}
seen_agents = set()
for agent in self.agents.values():
if agent not in seen_agents:
agent_info = agent.get_agent_info()
agent_id = agent_info["id"]
initialized_agents[agent_id] = agent_info
seen_agents.add(agent)
return initialized_agents
def get_agent_capabilities(self) -> Dict[str, List[str]]:
"""
Get a mapping of capabilities to agents.
Useful for finding the right agent for a specific task.
Returns:
Dict mapping capability keywords to agent IDs
"""
capabilities_map = {}
seen_agents = set()
for agent in self.agents.values():
if agent not in seen_agents:
# Get agent info
agent_id = getattr(agent, 'id', agent.type)
# Extract capabilities - check for get_capabilities method first
if hasattr(agent, 'get_capabilities') and callable(getattr(agent, 'get_capabilities')):
capabilities = agent.get_capabilities()
else:
# Fall back to string parsing
capabilities_str = getattr(agent, 'capabilities', "")
capabilities = [kw.strip().lower() for kw in capabilities_str.split(',') if kw.strip()]
# Add each capability to the mapping
for capability in capabilities:
if capability not in capabilities_map:
capabilities_map[capability] = []
if agent_id not in capabilities_map[capability]:
capabilities_map[capability].append(agent_id)
seen_agents.add(agent)
return capabilities_map
def get_agent_by_capability(self, capability: str) -> Optional['BaseAgent']:
"""
Find an agent with a specific capability.
Args:
capability: The required capability
Returns:
Agent with the required capability, or None if not found
"""
# Create mapping of capabilities for faster lookup
capability_map = self.get_agent_capabilities()
# Look for the capability (case-insensitive)
capability = capability.lower()
matching_agents = []
# Direct match
if capability in capability_map:
matching_agents = capability_map[capability]
else:
# Partial matches
for cap, agents in capability_map.items():
if capability in cap or cap in capability:
matching_agents.extend(agents)
# Return the first matching agent
if matching_agents:
agent_id = matching_agents[0]
return self.get_agent(agent_id)
return None
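# --- Usage sketch (illustrative, not part of the original module) ---
# Registers a hand-rolled agent and resolves it by capability. Treating
# BaseAgent as subclassable with a no-argument constructor, and the module
# paths, are assumptions; _load_agents() may log warnings in a bare environment.
from modules.agentservice_base import BaseAgent
from modules.agentservice_registry import AgentRegistry

class EchoAgent(BaseAgent):
    def __init__(self):
        super().__init__()
        self.name = "Echo"
        self.type = "echo"
        self.capabilities = "echo,text_processing"

registry = AgentRegistry.get_instance()
registry.register_agent(EchoAgent())
agent = registry.get_agent_by_capability("text_processing")
print(agent.name if agent else "not found")  # -> Echo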


@ -1,760 +0,0 @@
"""
Centralized utility functions for the Agentservice (continued).
"""
import os
import logging
import json
import uuid
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple, Union, Callable
from io import BytesIO
logger = logging.getLogger(__name__)
class WorkflowUtils:
"""
Utility class for workflow operations.
Centralizes common workflow-related functions.
"""
def __init__(self, workflow_id: str = None):
"""Initialize with optional workflow ID"""
self.workflow_id = workflow_id
def set_workflow_id(self, workflow_id: str):
"""Set or update the workflow ID"""
self.workflow_id = workflow_id
def get_documents(self, workflow: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Get all documents from a workflow across all messages.
Args:
workflow: The workflow object
Returns:
List of document objects
"""
documents = []
# Process all messages
for message in workflow.get("messages", []):
# Extract documents from the message
for doc in message.get("documents", []):
# Add to list if not already present
if not any(d.get("id") == doc.get("id") for d in documents):
documents.append(doc)
return documents
def get_files(self, workflow: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Get all file references from a workflow.
Args:
workflow: The workflow object
Returns:
List of file metadata objects
"""
files = []
# Process all messages
for message in workflow.get("messages", []):
# Extract documents from the message
for doc in message.get("documents", []):
source = doc.get("source", {})
# Only include file documents
if source.get("type") == "file":
file_info = {
"id": source.get("id", ""),
"name": source.get("name", ""),
"type": source.get("content_type", ""),
"content_type": source.get("content_type", ""),
"size": source.get("size", 0)
}
# Check if file is already in the list
if not any(f.get("id") == file_info["id"] for f in files):
files.append(file_info)
return files
def extract_by_prompt(self, workflow: Dict[str, Any], prompt: str, ai_service) -> Dict[str, Any]:
"""
Extract data from workflow documents based on an AI prompt.
Args:
workflow: The workflow object
prompt: The extraction prompt
ai_service: The AI service to use for extraction
Returns:
Extracted data
"""
# This method builds and returns a coroutine instead of running it;
# the caller must await it, or drive it with asyncio.run() from sync code.
async def _extract():
# Create extraction prompt
files = self.get_files(workflow)
file_descriptions = "\n".join([f"- {f.get('name', 'unnamed')} ({f.get('type', 'unknown')})" for f in files])
extraction_prompt = f"""
Extract relevant information from the following files based on this request:
REQUEST: {prompt}
FILES:
{file_descriptions}
Focus on the most relevant content and provide a structured output.
"""
# Call AI
response = await ai_service.call_api([{"role": "user", "content": extraction_prompt}])
return {
"prompt": prompt,
"extracted_content": response,
"files_processed": len(files)
}
# Return the coroutine
return _extract()
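# --- Usage sketch (illustrative, not part of the original module) ---
# extract_by_prompt() returns a coroutine, so synchronous callers drive it
# with asyncio.run(). The stub AI service below is an assumption standing
# in for the real one.
import asyncio

class StubAI:
    async def call_api(self, messages):
        return "stubbed extraction result"

utils = WorkflowUtils(workflow_id="wf_demo")
empty_workflow = {"messages": []}  # no files attached
result = asyncio.run(utils.extract_by_prompt(empty_workflow, "Summarize", StubAI()))
print(result["files_processed"])  # -> 0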
def merge_workflows(self, workflows: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Merge multiple workflows into a single unified workflow.
Useful for workflow templates or combining partial workflows.
Args:
workflows: List of workflow objects to merge
Returns:
Merged workflow
"""
if not workflows:
return {}
# Start with the first workflow
result = workflows[0].copy()
# Initialize lists if not present
if "messages" not in result:
result["messages"] = []
if "logs" not in result:
result["logs"] = []
# Merge additional workflows
for workflow in workflows[1:]:
# Append messages
for message in workflow.get("messages", []):
# Check for duplicates
if not any(m.get("id") == message.get("id") for m in result["messages"]):
result["messages"].append(message)
# Append logs
for log in workflow.get("logs", []):
# Check for duplicates
if not any(l.get("id") == log.get("id") for l in result["logs"]):
result["logs"].append(log)
# Update status if needed
if workflow.get("status") == "failed":
result["status"] = "failed"
# Update last_activity if newer
if (workflow.get("last_activity") and
(not result.get("last_activity") or
workflow["last_activity"] > result["last_activity"])):
result["last_activity"] = workflow["last_activity"]
return result
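# --- Usage sketch (illustrative, not part of the original module) ---
# Demonstrates the de-duplication and status rules of merge_workflows();
# all IDs and timestamps are made up. Note that the shallow copy means the
# first workflow's message list is extended in place.
wf_a = {"id": "wf_1", "status": "running", "last_activity": "2025-01-01",
        "messages": [{"id": "m1"}], "logs": []}
wf_b = {"id": "wf_1", "status": "failed", "last_activity": "2025-01-02",
        "messages": [{"id": "m1"}, {"id": "m2"}], "logs": []}
merged = WorkflowUtils().merge_workflows([wf_a, wf_b])
print(len(merged["messages"]), merged["status"], merged["last_activity"])
# -> 2 failed 2025-01-02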
def get_message(self, workflow: Dict[str, Any], message_id: str) -> Optional[Dict[str, Any]]:
"""
Find a message by ID in the workflow.
Args:
workflow: The workflow object
message_id: The message ID to find
Returns:
Message object or None if not found
"""
for message in workflow.get("messages", []):
if message.get("id") == message_id:
return message
return None
def to_str(self, workflow: Dict[str, Any]) -> str:
"""
Convert workflow to a formatted string representation.
Args:
workflow: The workflow object
Returns:
String representation of the workflow
"""
# Create a summary string
result = f"Workflow: {workflow.get('id')}\n"
result += f"Status: {workflow.get('status', 'unknown')}\n"
result += f"Started: {workflow.get('started_at', 'unknown')}\n"
result += f"Last Activity: {workflow.get('last_activity', 'unknown')}\n"
# Add message count
message_count = len(workflow.get("messages", []))
result += f"Messages: {message_count}\n"
# Add log count
log_count = len(workflow.get("logs", []))
result += f"Logs: {log_count}\n"
return result
class MessageUtils:
"""
Utility class for message operations.
Centralizes common message-related functions.
"""
def create_message(self, workflow_id: str, role: str = "system") -> Dict[str, Any]:
"""
Create a new message object.
Args:
workflow_id: ID of the workflow
role: Role of the message ('system', 'user', 'assistant')
Returns:
New message object
"""
message_id = f"msg_{uuid.uuid4()}"
current_time = datetime.now().isoformat()
# Create message object
message = {
"id": message_id,
"workflow_id": workflow_id,
"parent_message_id": None,
"started_at": current_time,
"finished_at": None,
"sequence_no": 0,
"status": "pending",
"role": role,
"data_stats": {
"processing_time": 0.0,
"token_count": 0,
"bytes_sent": 0,
"bytes_received": 0
},
"documents": [],
"content": None,
"agent_type": None
}
return message
def finalize_message(self, message: Dict[str, Any]) -> Dict[str, Any]:
"""
Finalize a message by setting completion timestamp.
Args:
message: The message object
Returns:
Updated message object
"""
message["finished_at"] = datetime.now().isoformat()
message["status"] = "completed"
return message
def get_documents(self, message: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Get all documents from a message.
Args:
message: The message object
Returns:
List of document objects
"""
return message.get("documents", [])
def get_files(self, message: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Get all file references from a message.
Args:
message: The message object
Returns:
List of file metadata objects
"""
files = []
# Extract documents from the message
for doc in message.get("documents", []):
source = doc.get("source", {})
# Only include file documents
if source.get("type") == "file":
file_info = {
"id": source.get("id", ""),
"name": source.get("name", ""),
"type": source.get("content_type", ""),
"content_type": source.get("content_type", ""),
"size": source.get("size", 0)
}
files.append(file_info)
return files
def extract_text_content(self, message: Dict[str, Any]) -> str:
"""
Extract text content from a message including document content.
Args:
message: The message object
Returns:
String with all text content from the message
"""
content = message.get("content", "")
# Add document content
for doc in message.get("documents", []):
# Check for document contents
for doc_content in doc.get("contents", []):
if doc_content.get("type") == "text":
content += "\n\n" + doc_content.get("text", "")
return content
def to_str(self, message: Dict[str, Any]) -> str:
"""
Convert message to a formatted string representation.
Args:
message: The message object
Returns:
String representation of the message
"""
# Create a summary string
result = f"Message: {message.get('id')}\n"
result += f"Role: {message.get('role', 'unknown')}\n"
# Add agent info if available
if message.get("agent_type"):
result += f"Agent: {message.get('agent_name', message.get('agent_type', 'unknown'))}\n"
# Add content summary
content = message.get("content", "")
if content:
content_preview = content[:100] + "..." if len(content) > 100 else content
result += f"Content: {content_preview}\n"
# Add document count
doc_count = len(message.get("documents", []))
result += f"Documents: {doc_count}\n"
return result
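# --- Usage sketch (illustrative, not part of the original module) ---
# The message lifecycle with MessageUtils: create, fill, finalize.
mu = MessageUtils()
msg = mu.create_message("wf_demo", role="assistant")
msg["content"] = "Hello from the sketch"
msg = mu.finalize_message(msg)
print(msg["status"])   # -> completed
print(mu.to_str(msg))  # summary incl. role, content preview, document count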
class FileUtils:
"""
Utility class for file operations.
Centralizes common file-related functions.
"""
def is_text_extractable(self, file_name: str, content_type: str = None) -> bool:
"""
Check if text can be extracted from a file.
Args:
file_name: Name of the file
content_type: MIME type (optional)
Returns:
True if text can be extracted, False otherwise
"""
# Delegate to the module-level is_text_extractable() defined below so the
# extension/MIME heuristics are maintained in exactly one place. (Inside
# the method body, the bare name resolves to the module-level function.)
return is_text_extractable(file_name, content_type)
def get_mime_type(self, file_name: str) -> str:
"""
Get MIME type based on file name.
Args:
file_name: Name of the file
Returns:
MIME type string
"""
import mimetypes
# Initialize mimetypes
mimetypes.init()
# Get MIME type
mime_type, _ = mimetypes.guess_type(file_name)
if not mime_type:
# Default mappings for common extensions
extension_map = {
'txt': 'text/plain',
'md': 'text/markdown',
'json': 'application/json',
'csv': 'text/csv',
'html': 'text/html',
'htm': 'text/html',
'pdf': 'application/pdf',
'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'jpg': 'image/jpeg',
'jpeg': 'image/jpeg',
'png': 'image/png',
'gif': 'image/gif',
'svg': 'image/svg+xml',
'webp': 'image/webp',
'mp4': 'video/mp4',
'mp3': 'audio/mpeg'
}
# Get extension
ext = os.path.splitext(file_name)[1].lower().lstrip('.')
# Return mapped MIME type or default
mime_type = extension_map.get(ext, 'application/octet-stream')
return mime_type
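# --- Usage sketch (illustrative, not part of the original module) ---
# MIME lookup with the extension fallback above.
fu = FileUtils()
print(fu.get_mime_type("report.pdf"))  # -> application/pdf
print(fu.get_mime_type("notes.xyz"))   # -> application/octet-stream (fallback)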
class LoggingUtils:
"""
Enhanced logging utilities for better workflow tracking.
Provides structured and categorized logging for workflows.
"""
def __init__(self, workflow_id: str = None, log_func: Callable = None):
"""
Initialize logging utilities.
Args:
workflow_id: ID of the workflow for context
log_func: Function to call for adding workflow logs
"""
self.workflow_id = workflow_id
self.log_func = log_func
self.logger = logging.getLogger(__name__)
# Define log categories
self.categories = {
"workflow": "Workflow Management",
"planning": "Activity Planning",
"execution": "Activity Execution",
"agents": "Agent Selection & Execution",
"files": "File Processing",
"summary": "Results Summary",
"error": "Error Handling",
"code": "Code Execution",
}
def set_workflow_id(self, workflow_id: str):
"""Update the workflow ID"""
self.workflow_id = workflow_id
def set_log_func(self, log_func: Callable):
"""Update the log function"""
self.log_func = log_func
def info(self, message: str, category: str = "workflow", details: str = None):
"""
Log an informational message.
Args:
message: The log message
category: Log category
details: Optional detailed information
"""
category_name = self.categories.get(category, category)
log_message = f"[{category_name}] {message}"
# Log to standard logger
self.logger.info(log_message)
# Log to workflow if function available
if self.log_func and self.workflow_id:
self.log_func(self.workflow_id, message, "info", category, category_name)
def warning(self, message: str, category: str = "workflow", details: str = None):
"""
Log a warning message.
Args:
message: The log message
category: Log category
details: Optional detailed information
"""
category_name = self.categories.get(category, category)
log_message = f"[{category_name}] {message}"
# Log to standard logger
self.logger.warning(log_message)
# Log to workflow if function available
if self.log_func and self.workflow_id:
self.log_func(self.workflow_id, message, "warning", category, category_name)
def error(self, message: str, category: str = "error", details: str = None):
"""
Log an error message.
Args:
message: The log message
category: Log category
details: Optional detailed information
"""
category_name = self.categories.get(category, category)
log_message = f"[{category_name}] {message}"
# Log to standard logger
self.logger.error(log_message)
# Log to workflow if function available
if self.log_func and self.workflow_id:
self.log_func(self.workflow_id, message, "error", category, category_name)
def debug(self, message: str, category: str = "workflow", details: str = None):
"""
Log a debug message.
Args:
message: The log message
category: Log category
details: Optional detailed information
"""
category_name = self.categories.get(category, category)
log_message = f"[{category_name}] {message}"
# Log to standard logger
self.logger.debug(log_message)
def get_category_name(self, category: str) -> str:
"""
Get human-readable category name.
Args:
category: Category code
Returns:
Human-readable category name
"""
return self.categories.get(category, category)
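# --- Usage sketch (illustrative, not part of the original module) ---
# Wires LoggingUtils to a workflow log sink; the sink's signature mirrors
# how self.log_func is invoked above.
def log_sink(workflow_id, message, level, category, category_name):
    print(f"{workflow_id} [{category_name}] {level}: {message}")

wlog = LoggingUtils(workflow_id="wf_demo", log_func=log_sink)
wlog.info("Planning started", category="planning")
# -> wf_demo [Activity Planning] info: Planning started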
def extract_text_from_file_content(file_content: bytes, file_name: str, content_type: str = None) -> Tuple[str, bool]:
"""
Extract text from various file formats based on binary content.
Args:
file_content: Binary content of the file
file_name: Name of the file for format detection
content_type: Optional MIME type of the file
Returns:
Tuple with (extracted text, is_extracted flag)
"""
# Check if file is likely text-extractable
if not is_text_extractable(file_name, content_type):
return f"[File: {file_name} - Text extraction not supported]", False
try:
# Simple text files (CSV is routed to the dedicated pandas branch below)
if file_name.endswith(('.txt', '.md', '.json', '.xml', '.html', '.htm', '.css', '.js', '.py', '.log', '.ini', '.cfg', '.conf')) or (content_type and content_type != 'text/csv' and (content_type.startswith('text/') or content_type in ['application/json', 'application/xml'])):
try:
return file_content.decode('utf-8'), True
except UnicodeDecodeError:
try:
return file_content.decode('latin1'), True
except Exception:
return file_content.decode('cp1252', errors='replace'), True
# Excel files
elif file_name.endswith(('.xlsx', '.xls')):
try:
import pandas as pd
# Create temporary in-memory file
file_obj = BytesIO(file_content)
df = pd.read_excel(file_obj)
result = f"Excel file with {len(df)} rows and {len(df.columns)} columns.\n"
result += f"Columns: {', '.join(df.columns.tolist())}\n\n"
result += df.to_string(index=False)
return result, True
except ImportError:
return f"[Excel file: {file_name} - pandas not installed]", False
except Exception as e:
return f"[Error extracting Excel content: {str(e)}]", False
# CSV files
elif file_name.endswith('.csv'):
try:
import pandas as pd
try:
# Create temporary in-memory file
file_obj = BytesIO(file_content)
df = pd.read_csv(file_obj, encoding='utf-8')
except UnicodeDecodeError:
file_obj = BytesIO(file_content)
try:
df = pd.read_csv(file_obj, encoding='latin1')
except Exception:
file_obj = BytesIO(file_content)
df = pd.read_csv(file_obj, encoding='cp1252')
result = f"CSV file with {len(df)} rows and {len(df.columns)} columns.\n"
result += f"Columns: {', '.join(df.columns.tolist())}\n\n"
result += df.to_string(index=False)
return result, True
except ImportError:
return f"[CSV file: {file_name} - pandas not installed]", False
except Exception as e:
return f"[Error extracting CSV content: {str(e)}]", False
# PDF files
elif file_name.endswith('.pdf'):
try:
try:
from PyPDF2 import PdfReader
reader = PdfReader(BytesIO(file_content))
text = ""
for page in reader.pages:
text += page.extract_text() + "\n\n"
return text, True
except ImportError:
try:
import fitz # PyMuPDF
doc = fitz.open(stream=file_content, filetype="pdf")
text = ""
for page in doc:
text += page.get_text() + "\n\n"
return text, True
except ImportError:
return f"[PDF: {file_name} - No PDF library installed]", False
except Exception as e:
return f"[Error reading PDF file {file_name}: {str(e)}]", False
# Default case - try basic text extraction
else:
try:
return file_content.decode('utf-8', errors='replace'), True
except Exception as e:
logger.error(f"Error extracting text from {file_name}: {str(e)}")
return f"[Text extraction error: {str(e)}]", False
except Exception as e:
logger.error(f"Error extracting text from {file_name}: {str(e)}")
return f"[Text extraction error: {str(e)}]", False
def is_text_extractable(file_name: str, content_type: str = None) -> bool:
"""Check if text can be extracted from a file."""
# Text files
if file_name.endswith(('.txt', '.md', '.json', '.xml', '.html', '.htm', '.css', '.js', '.py', '.csv')):
return True
# Excel files
if file_name.endswith(('.xlsx', '.xls')):
try:
import pandas
return True
except ImportError:
return False
# PDF files
if file_name.endswith('.pdf'):
try:
# Check if PyPDF2 or PyMuPDF is available
try:
import PyPDF2
return True
except ImportError:
try:
import fitz # PyMuPDF
return True
except ImportError:
return False
except Exception:
return False
# Images and other non-text files
if file_name.endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg',
'.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv',
'.mp3', '.wav', '.ogg', '.flac', '.aac')):
return False
# Check content type if file extension doesn't give a clear answer
if content_type:
if content_type.startswith(('text/', 'application/json', 'application/xml')):
return True
elif content_type == 'application/pdf':
return True
elif content_type.startswith(('image/', 'video/', 'audio/')):
return False
# Default to allowing extraction attempt
return True

File diff suppressed because it is too large


@ -1,689 +0,0 @@
"""
Refactored WorkflowManager class for the Agentservice (continued).
"""
import os
import logging
import asyncio
import uuid
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple, Union
logger = logging.getLogger(__name__)
class WorkflowManager:
def __init__(self, mandate_id: int = None, user_id: int = None, ai_service = None, lucydom_interface = None):
"""Initialize the WorkflowManager."""
self.mandate_id = mandate_id
self.user_id = user_id
self.ai_service = ai_service
self.lucydom_interface = lucydom_interface
# Cache for workflows
self.workflows = {}
# Directory for results
self.results_dir = os.path.join("results", "workflows")
os.makedirs(self.results_dir, exist_ok=True)
# Initialize document handler
from modules.agentservice_document_handler import get_document_handler
self.document_handler = get_document_handler(
lucydom_interface=lucydom_interface,
ai_service=ai_service
)
# Initialize agent registry with dependencies
from modules.agentservice_registry import AgentRegistry
registry = AgentRegistry.get_instance()
registry.set_dependencies(
ai_service=ai_service,
document_handler=self.document_handler,
lucydom_interface=lucydom_interface
)
async def list_workflows(self, mandate_id: int = None, user_id: int = None) -> List[Dict[str, Any]]:
"""
List all available workflows.
Args:
mandate_id: Optional mandate ID for filtering
user_id: Optional user ID for filtering
Returns:
List of workflow summaries
"""
workflows = []
# Load from database if available
if self.lucydom_interface:
try:
# Get all workflows for the user
if user_id is not None:
user_workflows = self.lucydom_interface.get_workflows_by_user(user_id)
else:
user_workflows = self.lucydom_interface.get_all_workflows()
# Filter by mandate if specified
if mandate_id is not None:
user_workflows = [wf for wf in user_workflows if wf.get("mandate_id") == mandate_id]
# Create workflow summaries
for workflow in user_workflows:
summary = {
"id": workflow.get("id"),
"name": workflow.get("name", f"Workflow {workflow.get('id')}"),
"status": workflow.get("status"),
"started_at": workflow.get("started_at"),
"last_activity": workflow.get("last_activity"),
"completed_at": workflow.get("completed_at")
}
# Add message count if available
messages = self.lucydom_interface.get_workflow_messages(workflow.get("id"))
if messages:
summary["message_count"] = len(messages)
workflows.append(summary)
logger.info(f"Loaded {len(workflows)} workflows from database")
# Sort by last activity (newest first)
return sorted(workflows, key=lambda w: w.get("last_activity", ""), reverse=True)
except Exception as e:
logger.error(f"Error retrieving workflows from database: {str(e)}")
# Load from files if no database or error occurred
try:
for filename in os.listdir(self.results_dir):
if filename.startswith("workflow_") and filename.endswith(".json"):
workflow_path = os.path.join(self.results_dir, filename)
try:
import json
with open(workflow_path, 'r', encoding='utf-8') as f:
workflow = json.load(f)
# Check if mandate and user ID match filters
if mandate_id is not None and workflow.get("mandate_id") != mandate_id:
continue
if user_id is not None and workflow.get("user_id") != user_id:
continue
# Create workflow summary
summary = {
"id": workflow.get("id"),
"name": workflow.get("name", f"Workflow {workflow.get('id')}"),
"status": workflow.get("status"),
"started_at": workflow.get("started_at"),
"last_activity": workflow.get("last_activity"),
"message_count": len(workflow.get("messages", []))
}
workflows.append(summary)
except Exception as e:
logger.error(f"Error loading workflow file {filename}: {str(e)}")
logger.info(f"Loaded {len(workflows)} workflows from files")
# Sort by last activity (newest first)
return sorted(workflows, key=lambda w: w.get("last_activity", ""), reverse=True)
except Exception as e:
logger.error(f"Error listing workflows: {str(e)}")
return []
async def execute_workflow(self, message: Dict[str, Any], files: List[Dict[str, Any]] = None, workflow_id: str = None, is_user_input: bool = False) -> Dict[str, Any]:
"""
Execute a workflow with the given message and files.
Args:
message: Input message (prompt)
files: Optional list of file metadata
workflow_id: Optional ID for continuing an existing workflow
is_user_input: Flag indicating if this is user input to an existing workflow
Returns:
Workflow execution result
"""
# Use provided workflow_id or generate a new one for a new workflow
if not workflow_id:
workflow_id = f"wf_{uuid.uuid4()}"
# Initialize a new workflow
workflow = self._initialize_workflow(workflow_id)
else:
# Load existing workflow for continuation
workflow = await self.load_workflow(workflow_id)
if not workflow:
# Fallback: initialize a new workflow with the provided ID
workflow = self._initialize_workflow(workflow_id)
# Capture start time
start_time = datetime.now()
try:
# Create WorkflowExecution with document handler
from modules.agentservice_workflow_execution import WorkflowExecution
execution = WorkflowExecution(
workflow_manager=self,
workflow_id=workflow_id,
mandate_id=self.mandate_id,
user_id=self.user_id,
ai_service=self.ai_service,
lucydom_interface=self.lucydom_interface
)
# Set the document handler's workflow ID
self.document_handler.set_workflow_id(workflow_id)
# Execute the workflow
result = await execution.execute(message, workflow, files, is_user_input)
# Calculate duration
duration = (datetime.now() - start_time).total_seconds()
# Update workflow stats
if "data_stats" not in workflow:
workflow["data_stats"] = {
"total_processing_time": 0.0,
"total_token_count": 0,
"total_bytes_sent": 0,
"total_bytes_received": 0
}
workflow["data_stats"]["total_processing_time"] = duration
workflow["completed_at"] = datetime.now().isoformat()
# Save final state
self._save_workflow(workflow)
return result
except Exception as e:
logger.error(f"Error executing workflow: {str(e)}", exc_info=True)
# Update workflow status
workflow["status"] = "failed"
workflow["last_activity"] = datetime.now().isoformat()
self._add_log(workflow, f"Workflow execution failed: {str(e)}", "error")
# Save failed state
self._save_workflow(workflow)
return {
"workflow_id": workflow_id,
"status": "failed",
"error": str(e)
}
def _save_workflow(self, workflow: Dict[str, Any]) -> bool:
"""
Save workflow state to database and/or file.
Enhanced to handle structured documents.
Args:
workflow: The workflow object to save
Returns:
True if saved successfully, False otherwise
"""
try:
workflow_id = workflow.get("id")
# Update in-memory cache
self.workflows[workflow_id] = workflow
# Update in database if available
if self.lucydom_interface:
# NEW: Enhanced document handling for database persistence
# Create a copy of the workflow for database storage
db_workflow = workflow.copy()
# Save to database
try:
self.lucydom_interface.save_workflow_state(db_workflow)
logger.info(f"Workflow {workflow_id} saved to database")
except Exception as db_error:
logger.error(f"Error saving workflow to database: {str(db_error)}")
# Continue to file saving even if database fails
# Save to file (always do this as backup)
import json
workflow_path = os.path.join(self.results_dir, f"workflow_{workflow_id}.json")
with open(workflow_path, 'w', encoding='utf-8') as f:
json.dump(workflow, f, indent=2, ensure_ascii=False)
logger.info(f"Workflow {workflow_id} saved to file: {workflow_path}")
return True
except Exception as e:
logger.error(f"Error saving workflow state: {str(e)}")
return False
async def load_workflow(self, workflow_id: str) -> Optional[Dict[str, Any]]:
"""
Load a workflow by ID.
Enhanced to ensure document handler is properly configured.
Args:
workflow_id: ID of the workflow to load
Returns:
The workflow object or None if not found
"""
# Check memory cache first
if workflow_id in self.workflows:
workflow = self.workflows[workflow_id]
# NEW: Configure document handler for this workflow
self.document_handler.set_workflow_id(workflow_id)
return workflow
# Try to load from database
if self.lucydom_interface:
try:
workflow = self.lucydom_interface.load_workflow_state(workflow_id)
if workflow:
# Cache in memory
self.workflows[workflow_id] = workflow
# NEW: Configure document handler for this workflow
self.document_handler.set_workflow_id(workflow_id)
logger.info(f"Workflow {workflow_id} loaded from database")
return workflow
except Exception as e:
logger.error(f"Error loading workflow from database: {str(e)}")
# Try to load from file
workflow_path = os.path.join(self.results_dir, f"workflow_{workflow_id}.json")
if os.path.exists(workflow_path):
try:
import json
with open(workflow_path, 'r', encoding='utf-8') as f:
workflow = json.load(f)
# Cache in memory
self.workflows[workflow_id] = workflow
# NEW: Configure document handler for this workflow
self.document_handler.set_workflow_id(workflow_id)
logger.info(f"Workflow {workflow_id} loaded from file: {workflow_path}")
return workflow
except Exception as e:
logger.error(f"Error loading workflow from file: {str(e)}")
logger.warning(f"Workflow {workflow_id} not found")
return None
async def delete_workflow(self, workflow_id: str) -> bool:
"""
Delete a workflow.
Args:
workflow_id: ID of the workflow
Returns:
True on success, False if workflow not found
"""
# Remove from memory
if workflow_id in self.workflows:
del self.workflows[workflow_id]
# Delete from database
if self.lucydom_interface:
try:
db_success = self.lucydom_interface.delete_workflow(workflow_id)
logger.info(f"Workflow {workflow_id} deleted from database: {db_success}")
except Exception as e:
logger.error(f"Error deleting workflow {workflow_id} from database: {str(e)}")
# Delete file
workflow_path = os.path.join(self.results_dir, f"workflow_{workflow_id}.json")
try:
if os.path.exists(workflow_path):
os.remove(workflow_path)
logger.info(f"Workflow {workflow_id} deleted from file: {workflow_path}")
return True
else:
logger.warning(f"Workflow {workflow_id} not found: {workflow_path}")
return False
except Exception as e:
logger.error(f"Error deleting workflow file {workflow_id}: {str(e)}")
return False
def _initialize_workflow(self, workflow_id: str) -> Dict[str, Any]:
"""
Initialize a new workflow.
Args:
workflow_id: ID of the workflow
Returns:
The initialized workflow object
"""
current_time = datetime.now().isoformat()
# Create complete workflow object according to the data model
workflow = {
"id": workflow_id,
"name": f"Workflow {workflow_id}",
"mandate_id": self.mandate_id,
"user_id": self.user_id,
"status": "running",
"started_at": current_time,
"last_activity": current_time,
"current_round": 1,
# Complete statistics structure according to DataStats model
"data_stats": {
"total_processing_time": 0.0,
"total_token_count": 0,
"total_bytes_sent": 0,
"total_bytes_received": 0
},
# Empty arrays for messages and logs
"messages": [],
"logs": []
}
# Log entry for workflow start
self._add_log(workflow, "Workflow started", "info", "workflow", "Workflow Management")
# Save workflow to database
if self.lucydom_interface:
try:
# Direct save of the complete workflow object
self.lucydom_interface.save_workflow_state(workflow)
logger.info(f"Workflow {workflow_id} created in database")
except Exception as e:
logger.error(f"Error creating workflow {workflow_id} in database: {str(e)}")
# Cache workflow in memory
self.workflows[workflow_id] = workflow
return workflow
async def stop_workflow(self, workflow_id: str) -> bool:
"""
Stop a running workflow.
Args:
workflow_id: ID of the workflow to stop
Returns:
True on success, False if workflow not found or already stopped
"""
try:
workflow = self.workflows.get(workflow_id)
if not workflow:
# Try to load the workflow
workflow = await self.load_workflow(workflow_id)
if not workflow:
return False
# If workflow is not running or completed, abort
if workflow.get("status") not in ["running", "completed"]:
return False
# Set status to stopped
workflow["status"] = "stopped"
workflow["last_activity"] = datetime.now().isoformat()
self._add_log(workflow, "Workflow was manually stopped", "info", "workflow", "Workflow Management")
# Save workflow
self._save_workflow(workflow)
return True
except Exception as e:
logger.error(f"Error stopping workflow {workflow_id}: {str(e)}")
return False
def _add_log(self, workflow: Dict[str, Any], message: str, log_type: str, agent_id: Optional[str] = None, agent_name: Optional[str] = None) -> None:
"""Add a log entry to the workflow."""
# First, check if workflow is a string (ID) instead of dictionary
if isinstance(workflow, str):
# Try to load the workflow by ID
workflow_id = workflow
workflow = self.workflows.get(workflow_id)
if not workflow:
# Just log to the logger and return
logger.info(f"Log (couldn't add to workflow {workflow_id}): {log_type} - {message}")
return
# Check if workflow is a dictionary
if not isinstance(workflow, dict):
logger.error(f"Invalid workflow type: {type(workflow)}. Expected dictionary.")
# Just log to the logger and return
logger.info(f"Log (couldn't add to workflow): {log_type} - {message}")
return
# Create log entry
log_entry = {
"id": f"log_{uuid.uuid4()}",
"message": message,
"type": log_type,
"timestamp": datetime.now().isoformat(),
"agent_id": agent_id,
"agent_name": agent_name
}
# Add log entry to workflow
if "logs" not in workflow:
workflow["logs"] = []
workflow["logs"].append(log_entry)
# Update last activity
workflow["last_activity"] = log_entry["timestamp"]
# Save log entry to database if available
if self.lucydom_interface:
try:
# Add workflow ID to log entry
log_data = log_entry.copy()
log_data["workflow_id"] = workflow["id"]
self.lucydom_interface.create_workflow_log(log_data)
logger.debug(f"Log entry for workflow {workflow['id']} saved to database")
except Exception as e:
logger.error(f"Error saving log entry for workflow {workflow['id']} to database: {str(e)}")
# Also log to standard logger with the category prefix
category_prefix = f"[{agent_name or agent_id or 'Workflow'}]" if agent_name or agent_id else ""
log_message = f"{category_prefix} {message}"
if log_type == "error":
logger.error(log_message)
elif log_type == "warning":
logger.warning(log_message)
else:
logger.info(log_message)
def get_workflow_status(self, workflow_id: str) -> Optional[Dict[str, Any]]:
"""
Get the status of a workflow.
Args:
workflow_id: ID of the workflow
Returns:
Dictionary with status information or None if workflow not found
"""
# Get from memory
workflow = self.workflows.get(workflow_id)
# If not in memory, load from database or file
if not workflow:
# Load from database if available
if self.lucydom_interface:
try:
workflow_data = self.lucydom_interface.get_workflow(workflow_id)
if workflow_data:
workflow = workflow_data
except Exception as e:
logger.error(f"Error loading workflow status from database: {str(e)}")
# If not in database, load from file
if not workflow:
try:
import json
workflow_path = os.path.join(self.results_dir, f"workflow_{workflow_id}.json")
if os.path.exists(workflow_path):
with open(workflow_path, 'r', encoding='utf-8') as f:
workflow = json.load(f)
except Exception as e:
logger.error(f"Error loading workflow status from file: {str(e)}")
return None
if not workflow:
return None
# Extract status information
status_info = {
"id": workflow.get("id"),
"name": workflow.get("name", f"Workflow {workflow_id}"),
"status": workflow.get("status"),
"progress": 1.0 if workflow.get("status") in ["completed", "failed", "stopped"] else 0.5,
"started_at": workflow.get("started_at"),
"last_activity": workflow.get("last_activity"),
"workflow_complete": workflow.get("status") == "completed",
"current_round": workflow.get("current_round", 1),
"data_stats": workflow.get("data_stats", {
"total_processing_time": 0.0,
"total_token_count": 0,
"total_bytes_sent": 0,
"total_bytes_received": 0
})
}
return status_info
def get_workflow_logs(self, workflow_id: str) -> Optional[List[Dict[str, Any]]]:
"""
Get logs for a workflow.
Args:
workflow_id: ID of the workflow
Returns:
List of logs or None if workflow not found
"""
# Get from memory
workflow = self.workflows.get(workflow_id)
# If not in memory, load from database
if not workflow and self.lucydom_interface:
try:
logs = self.lucydom_interface.get_workflow_logs(workflow_id)
return logs
except Exception as e:
logger.error(f"Error loading workflow logs from database: {str(e)}")
# If not in database or no interface available, load from file
if not workflow:
try:
import json
workflow_path = os.path.join(self.results_dir, f"workflow_{workflow_id}.json")
if os.path.exists(workflow_path):
with open(workflow_path, 'r', encoding='utf-8') as f:
workflow = json.load(f)
except Exception as e:
logger.error(f"Error loading workflow logs from file: {str(e)}")
return None
return workflow.get("logs", []) if workflow else None
def get_workflow_messages(self, workflow_id: str) -> Optional[List[Dict[str, Any]]]:
"""
Get messages for a workflow.
Args:
workflow_id: ID of the workflow
Returns:
List of messages or None if workflow not found
"""
# Get from memory
workflow = self.workflows.get(workflow_id)
# If not in memory, load from database
if not workflow and self.lucydom_interface:
try:
messages = self.lucydom_interface.get_workflow_messages(workflow_id)
return messages
except Exception as e:
logger.error(f"Error loading workflow messages from database: {str(e)}")
# If not in database or no interface available, load from file
if not workflow:
try:
import json
workflow_path = os.path.join(self.results_dir, f"workflow_{workflow_id}.json")
if os.path.exists(workflow_path):
with open(workflow_path, 'r', encoding='utf-8') as f:
workflow = json.load(f)
except Exception as e:
logger.error(f"Error loading workflow messages from file: {str(e)}")
return None
return workflow.get("messages", []) if workflow else None
# Factory function for WorkflowManager
def get_workflow_manager(mandate_id: int = None, user_id: int = None, ai_service = None, lucydom_interface = None):
"""
Get a WorkflowManager instance for the specified context.
Reuses existing instances and updates dependencies.
Args:
mandate_id: Mandate ID
user_id: User ID
ai_service: AI service
lucydom_interface: LucyDOM interface
Returns:
WorkflowManager instance
"""
from modules.lucydom_interface import get_lucydom_interface
context_key = f"{mandate_id}_{user_id}"
# Get LucyDOM interface if not provided
if not lucydom_interface:
lucydom_interface = get_lucydom_interface(mandate_id, user_id)
if context_key not in _workflow_managers:
_workflow_managers[context_key] = WorkflowManager(
mandate_id,
user_id,
ai_service,
lucydom_interface
)
# Update services if provided
if ai_service is not None:
_workflow_managers[context_key].ai_service = ai_service
# NEW: Update document handler's AI service
if hasattr(_workflow_managers[context_key], 'document_handler'):
_workflow_managers[context_key].document_handler.set_ai_service(ai_service)
# NEW: Update agent registry dependencies
from modules.agentservice_registry import AgentRegistry
registry = AgentRegistry.get_instance()
registry.set_dependencies(ai_service=ai_service)
return _workflow_managers[context_key]
# Per-context cache of WorkflowManager instances (used by get_workflow_manager above)
_workflow_managers = {}
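# --- Usage sketch (illustrative, not part of the original module) ---
# Managers are cached per (mandate_id, user_id) context; this assumes the
# LucyDOM interface and agent modules are importable in the environment.
m1 = get_workflow_manager(mandate_id=1, user_id=7)
m2 = get_workflow_manager(mandate_id=1, user_id=7)
print(m1 is m2)  # -> True, same cached instance for the same context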


@ -6,7 +6,7 @@ from jose import JWTError, jwt
import logging
from modules.gateway_interface import get_gateway_interface
from modules.utility import APP_CONFIG
from gateway.modules.configuration import APP_CONFIG
# Get Config Data
SECRET_KEY = APP_CONFIG.get("APP_JWT_SECRET_SECRET")

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -27,7 +27,6 @@ class AgentAnalyst(AgentBase):
super().__init__()
self.name = "Data Analyst"
self.capabilities = "data_analysis,pattern_recognition,statistics,visualization,data_interpretation"
self.result_format = "AnalysisReport"
# Visualization settings
self.plt_style = 'seaborn-v0_8-whitegrid'
@ -38,13 +37,6 @@ class AgentAnalyst(AgentBase):
def get_agent_info(self) -> Dict[str, Any]:
"""Gibt Agent-Informationen für die Registry zurück"""
info = super().get_config()
info.update({
"metadata": {
"supported_formats": ["csv", "xlsx", "json", "text"],
"analysis_types": ["statistical", "trend", "comparative", "predictive", "clustering", "general"],
"visualization_types": ["bar", "line", "scatter", "histogram", "box", "heatmap", "pie"]
}
})
return info
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
@ -66,7 +58,6 @@ class AgentAnalyst(AgentBase):
"role": "assistant",
"content": "",
"agent_name": self.name,
"result_format": self.result_format,
"workflow_id": workflow_id,
"documents": []
}


@ -27,7 +27,6 @@ class AgentCoder(AgentBase):
super().__init__()
self.name = "Python Code Agent"
self.capabilities = "code_development,data_processing,file_processing,automation"
self.result_format = "python_code"
# Executor settings
self.executor_timeout = 60  # seconds
@ -42,13 +41,6 @@ class AgentCoder(AgentBase):
def get_agent_info(self) -> Dict[str, Any]:
"""Gibt Agent-Informationen für die Registry zurück"""
info = super().get_config()
info.update({
"metadata": {
"timeout": self.executor_timeout,
"memory_limit": self.executor_memory_limit,
"max_correction_attempts": self.max_correction_attempts
}
})
return info
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
@ -70,7 +62,6 @@ class AgentCoder(AgentBase):
"role": "assistant",
"content": "",
"agent_name": self.name,
"result_format": self.result_format,
"workflow_id": workflow_id,
"documents": []
}


@ -23,26 +23,9 @@ class AgentCreative(AgentBase):
"document_analysis,text_processing,table_creation,"
"content_structuring")
self.result_format = "Text,Document,Table"
def get_agent_info(self) -> Dict[str, Any]:
"""Gibt Agent-Informationen für die Registry zurück"""
info = super().get_config()
info.update({
"metadata": {
"specialties": [
"creative_writing",
"documentation",
"knowledge",
"poweron",
"document_processing",
"information_extraction",
"content_transformation",
"table_generation",
"document_analysis"
]
}
})
return info
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
@ -64,7 +47,6 @@ class AgentCreative(AgentBase):
"role": "assistant",
"content": "",
"agent_name": self.name,
"result_format": self.result_format,
"workflow_id": workflow_id,
"documents": []
}


@ -21,17 +21,10 @@ class AgentDocumentation(AgentBase):
super().__init__()
self.name = "Documentation Specialist"
self.capabilities = "report_generation,documentation,content_structuring,technical_writing,knowledge_organization"
self.result_format = "FormattedDocument"
def get_agent_info(self) -> Dict[str, Any]:
"""Gibt Agent-Informationen für die Registry zurück"""
info = super().get_config()
info.update({
"metadata": {
"document_types": ["manual", "report", "process", "presentation", "document"],
"formats": ["markdown", "text"]
}
})
return info
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
@ -53,7 +46,6 @@ class AgentDocumentation(AgentBase):
"role": "assistant",
"content": "",
"agent_name": self.name,
"result_format": self.result_format,
"workflow_id": workflow_id,
"documents": []
}
@ -79,7 +71,7 @@ class AgentDocumentation(AgentBase):
is_complex = self._assess_complexity(enhanced_prompt)
# Generate title
title = self._generate_title(enhanced_prompt, document_type)
title = await self._generate_title(enhanced_prompt, document_type)
# Generate content based on complexity
if is_complex:


@ -6,14 +6,13 @@ Adapted for the new chat.py architecture and chat_registry.py.
import json
import logging
import time
import traceback
from typing import Dict, Any, List, Optional
from urllib.parse import quote_plus, unquote
from bs4 import BeautifulSoup
import requests
from modules.chat_registry import AgentBase
from modules.utility import APP_CONFIG
from modules.configuration import APP_CONFIG
logger = logging.getLogger(__name__)
@ -24,8 +23,7 @@ class AgentWebcrawler(AgentBase):
"""Initialisiert den Webcrawler-Agent"""
super().__init__()
self.name = "Webscraper"
self.capabilities = "web_search,information_retrieval,data_collection,source_verification,content_integration"
self.result_format = "SearchResults"
self.capabilities = "web_search,website_information_retrieval"
# Web crawling configuration
self.max_url = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_URLS"))
@ -36,13 +34,6 @@ class AgentWebcrawler(AgentBase):
def get_agent_info(self) -> Dict[str, Any]:
"""Gibt Agent-Informationen für die Registry zurück"""
info = super().get_config()
info.update({
"metadata": {
"max_url": self.max_url,
"max_result": self.max_result,
"timeout": self.timeout
}
})
return info
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
@ -64,7 +55,6 @@ class AgentWebcrawler(AgentBase):
"role": "assistant",
"content": "",
"agent_name": self.name,
"result_format": self.result_format,
"workflow_id": workflow_id
}


@ -0,0 +1,777 @@
"""
Modul zur Extraktion von Inhalten aus verschiedenen Dateiformaten.
Bietet spezialisierte Funktionen für die Verarbeitung von Text, PDF, Office-Dokumenten, Bildern usw.
"""
import logging
import os
import io
from typing import Dict, Any, List, Optional, Union, Tuple
import base64
# Logger konfigurieren
logger = logging.getLogger(__name__)
# Optional imports - only loaded when needed
pdf_extractor_loaded = False
office_extractor_loaded = False
image_processor_loaded = False
def get_document_contents(file_metadata: Dict[str, Any], file_content: bytes) -> List[Dict[str, Any]]:
"""
Main function for extracting contents from a file based on its MIME type.
Delegates to specialized extraction functions.
Args:
file_metadata: Metadata of the file (name, MIME type, etc.)
file_content: Binary data of the file
Returns:
List of document content objects with metadata and is_text flag
"""
try:
mime_type = file_metadata.get("mime_type", "application/octet-stream")
file_name = file_metadata.get("name", "unknown")
logger.info(f"Extracting contents from file '{file_name}' (MIME type: {mime_type})")
# Extract contents based on MIME type
contents = []
# Text-based formats
if mime_type.startswith("text/") or mime_type in [
"application/json",
"application/xml",
"application/javascript",
"application/x-python"
]:
contents.extend(extract_text_content(file_name, file_content, mime_type))
# CSV format
elif mime_type == "text/csv":
contents.extend(extract_csv_content(file_name, file_content))
# Images
elif mime_type.startswith("image/"):
contents.extend(extract_image_content(file_name, file_content, mime_type))
# PDF documents
elif mime_type == "application/pdf":
contents.extend(extract_pdf_content(file_name, file_content))
# Word documents
elif mime_type in [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/msword"
]:
contents.extend(extract_word_content(file_name, file_content, mime_type))
# Excel documents
elif mime_type in [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.ms-excel"
]:
contents.extend(extract_excel_content(file_name, file_content, mime_type))
# PowerPoint documents
elif mime_type in [
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.ms-powerpoint"
]:
contents.extend(extract_powerpoint_content(file_name, file_content, mime_type))
# Binary data as fallback for unknown formats
else:
contents.extend(extract_binary_content(file_name, file_content, mime_type))
# Fallback if no contents could be extracted
if not contents:
logger.warning(f"No contents extracted from file '{file_name}', using binary fallback")
contents.append({
"sequence_nr": 1,
"name": '1_undefined',
"ext": os.path.splitext(file_name)[1][1:] if os.path.splitext(file_name)[1] else "bin",
"content_type": mime_type,
"data": file_content,
"metadata": {
"is_text": False
}
})
for content in contents:
if isinstance(content.get("data"), bytes):
content["data"] = base64.b64encode(content["data"]).decode('utf-8')
# Mark in the metadata that this is base64-encoded
if "metadata" not in content:
content["metadata"] = {}
content["metadata"]["base64_encoded"] = True
logger.info(f"Successfully extracted {len(contents)} contents from file '{file_name}'")
return contents
except Exception as e:
logger.error(f"Error during content extraction: {str(e)}")
# Fallback on error - return the original data
return [{
"sequence_nr": 1,
"name": file_metadata.get("name", "unknown"),
"ext": os.path.splitext(file_metadata.get("name", ""))[1][1:] if os.path.splitext(file_metadata.get("name", ""))[1] else "bin",
"content_type": file_metadata.get("mime_type", "application/octet-stream"),
"data": file_content,
"metadata": {
"is_text": False
}
}]
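# --- Usage sketch (illustrative, not part of the original module) ---
# Extracting contents from a small in-memory text file; the metadata keys
# mirror what the function reads above.
meta = {"name": "notes.txt", "mime_type": "text/plain"}
contents = get_document_contents(meta, b"hello world")
print(contents[0]["metadata"]["is_text"], contents[0]["data"])
# -> True hello world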
def _load_pdf_extractor():
    """Loads the PDF extraction libraries on demand"""
    global pdf_extractor_loaded
    if not pdf_extractor_loaded:
        try:
            global PyPDF2, fitz
            import PyPDF2
            import fitz  # PyMuPDF, for more extensive PDF processing
            pdf_extractor_loaded = True
            logger.info("PDF extraction libraries loaded successfully")
        except ImportError as e:
            logger.warning(f"PDF extraction libraries could not be loaded: {e}")
def _load_office_extractor():
    """Loads the Office document extraction libraries on demand"""
    global office_extractor_loaded
    if not office_extractor_loaded:
        try:
            global docx, openpyxl
            import docx  # python-docx for Word documents
            import openpyxl  # for Excel files
            office_extractor_loaded = True
            logger.info("Office extraction libraries loaded successfully")
        except ImportError as e:
            logger.warning(f"Office extraction libraries could not be loaded: {e}")
def _load_image_processor():
    """Loads the image processing libraries on demand"""
    global image_processor_loaded
    if not image_processor_loaded:
        try:
            global Image  # only Image is bound below; the 'PIL' name itself is never used
            from PIL import Image
            image_processor_loaded = True
            logger.info("Image processing libraries loaded successfully")
        except ImportError as e:
            logger.warning(f"Image processing libraries could not be loaded: {e}")
def extract_text_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Extracts text from text files.
    Args:
        file_name: name of the file
        file_content: binary data of the file
        mime_type: MIME type of the file
    Returns:
        List of text content objects with metadata.is_text = True
    """
    try:
        # Keep the original file extension
        file_extension = os.path.splitext(file_name)[1][1:] if os.path.splitext(file_name)[1] else "txt"
        # Extract the text content
        text_content = file_content.decode('utf-8')
        return [{
            "sequence_nr": 1,
            "name": "1_text",  # Simplified naming
            "ext": file_extension,
            "content_type": "text",
            "data": text_content,
            "metadata": {
                "is_text": True
            }
        }]
    except UnicodeDecodeError:
        logger.warning(f"Could not decode text from file '{file_name}' as UTF-8, trying other encodings")
        try:
            # Try alternative encodings
            for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
                try:
                    text_content = file_content.decode(encoding)
                    logger.info(f"Text successfully decoded with encoding {encoding}")
                    return [{
                        "sequence_nr": 1,
                        "name": "1_text",  # Simplified naming
                        "ext": file_extension,
                        "content_type": "text",
                        "data": text_content,
                        "metadata": {
                            "is_text": True,
                            "encoding": encoding
                        }
                    }]
                except UnicodeDecodeError:
                    continue
            # Fall back to binary data if no encoding works
            logger.warning("Could not decode text, using binary data")
            return [{
                "sequence_nr": 1,
                "name": "1_binary",  # Simplified naming
                "ext": file_extension,
                "content_type": mime_type,
                "data": file_content,
                "metadata": {
                    "is_text": False
                }
            }]
        except Exception as e:
            logger.error(f"Error during alternative text decoding: {str(e)}")
            # Return binary data as fallback
            return [{
                "sequence_nr": 1,
                "name": "1_binary",  # Simplified naming
                "ext": file_extension,
                "content_type": mime_type,
                "data": file_content,
                "metadata": {
                    "is_text": False
                }
            }]
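The fixed candidate list (latin-1, cp1252, iso-8859-1) could also be replaced by detection: chardet is already pinned in this project's requirements. A hedged sketch of that alternative (not part of the module above; detect_and_decode is an illustrative name):

import chardet

def detect_and_decode(raw: bytes) -> str:
    """Guess the encoding with chardet; latin-1 as a fallback never fails."""
    guess = chardet.detect(raw)
    encoding = guess.get("encoding") or "latin-1"
    return raw.decode(encoding, errors="replace")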
def extract_csv_content(file_name: str, file_content: bytes) -> List[Dict[str, Any]]:
    """
    Extracts content from CSV files.
    Args:
        file_name: name of the file
        file_content: binary data of the file
    Returns:
        List of CSV content objects with metadata.is_text = True
    """
    try:
        # Extract the text content
        csv_content = file_content.decode('utf-8')
        return [{
            "sequence_nr": 1,
            "name": "1_csv",  # Simplified naming
            "ext": "csv",
            "content_type": "csv",
            "data": csv_content,
            "metadata": {
                "is_text": True,
                "format": "csv"
            }
        }]
    except UnicodeDecodeError:
        logger.warning(f"Could not decode CSV from file '{file_name}' as UTF-8, trying other encodings")
        try:
            # Try alternative encodings for CSV
            for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
                try:
                    csv_content = file_content.decode(encoding)
                    logger.info(f"CSV successfully decoded with encoding {encoding}")
                    return [{
                        "sequence_nr": 1,
                        "name": "1_csv",  # Simplified naming
                        "ext": "csv",
                        "content_type": "csv",
                        "data": csv_content,
                        "metadata": {
                            "is_text": True,
                            "encoding": encoding,
                            "format": "csv"
                        }
                    }]
                except UnicodeDecodeError:
                    continue
            # Fall back to binary data
            return [{
                "sequence_nr": 1,
                "name": "1_binary",  # Simplified naming
                "ext": "csv",
                "content_type": "text/csv",
                "data": file_content,
                "metadata": {
                    "is_text": False
                }
            }]
        except Exception as e:
            logger.error(f"Error during alternative CSV decoding: {str(e)}")
            return [{
                "sequence_nr": 1,
                "name": "1_binary",  # Simplified naming
                "ext": "csv",
                "content_type": "text/csv",
                "data": file_content,
                "metadata": {
                    "is_text": False
                }
            }]
def extract_image_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Extracts content from image files and, where possible, produces a metadata description.
    Args:
        file_name: name of the file
        file_content: binary data of the file
        mime_type: MIME type of the file
    Returns:
        List of image content objects with metadata.is_text = False
    """
    # Derive the file extension from the MIME type or the file name
    file_extension = mime_type.split('/')[-1]
    if file_extension == "jpeg":
        file_extension = "jpg"
    # If possible, analyze the image and extract metadata
    image_metadata = {
        "is_text": False,
        "format": "image"
    }
    image_description = None
    try:
        _load_image_processor()
        if image_processor_loaded and file_content and len(file_content) > 0:
            with io.BytesIO(file_content) as img_stream:
                try:
                    img = Image.open(img_stream)
                    # Check that the image actually loads
                    img.verify()
                    # Reopen it to keep working safely (verify() invalidates the image object)
                    img_stream.seek(0)
                    img = Image.open(img_stream)
                    image_metadata.update({
                        "format": img.format,
                        "mode": img.mode,
                        "width": img.width,
                        "height": img.height
                    })
                    # Extract EXIF data, if present
                    if hasattr(img, '_getexif') and callable(img._getexif):
                        exif = img._getexif()
                        if exif:
                            exif_data = {}
                            for tag_id, value in exif.items():
                                exif_data[f"tag_{tag_id}"] = str(value)
                            image_metadata["exif"] = exif_data
                    # Build the image description
                    image_description = f"Image ({img.width}x{img.height}, {img.format}, {img.mode})"
                except Exception as inner_e:
                    logger.warning(f"Error while processing the image: {str(inner_e)}")
                    image_metadata["error"] = str(inner_e)
                    image_description = f"Image (unable to process: {str(inner_e)})"
    except Exception as e:
        logger.warning(f"Could not extract image metadata: {str(e)}")
        image_metadata["error"] = str(e)
    # Return the image content
    contents = [{
        "sequence_nr": 1,
        "name": "1_image",  # Simplified naming
        "ext": file_extension,
        "content_type": "image",
        "data": file_content,
        "metadata": image_metadata
    }]
    # If an image description exists, add it as an additional text content
    if image_description:
        contents.append({
            "sequence_nr": 2,
            "name": "2_text_image_info",  # Simplified naming with label
            "ext": "txt",
            "content_type": "text",
            "data": image_description,
            "metadata": {
                "is_text": True,
                "image_description": True
            }
        })
    return contents
def extract_pdf_content(file_name: str, file_content: bytes) -> List[Dict[str, Any]]:
    """
    Extracts text and images from PDF files.
    Args:
        file_name: name of the file
        file_content: binary data of the file
    Returns:
        List of PDF content objects (text and images) with a metadata.is_text flag
    """
    contents = []
    extracted_content_found = False
    try:
        # Load the PDF extraction libraries
        _load_pdf_extractor()
        if not pdf_extractor_loaded:
            logger.warning("PDF extraction not possible: libraries not available")
            # Add the original file as binary content
            contents.append({
                "sequence_nr": 1,
                "name": "1_pdf",  # Simplified naming
                "ext": "pdf",
                "content_type": "application/pdf",
                "data": file_content,
                "metadata": {
                    "is_text": False,
                    "format": "pdf"
                }
            })
            return contents
        # Extract text with PyPDF2
        extracted_text = ""
        pdf_metadata = {}
        with io.BytesIO(file_content) as pdf_stream:
            pdf_reader = PyPDF2.PdfReader(pdf_stream)
            # Extract the metadata
            pdf_info = pdf_reader.metadata or {}
            for key, value in pdf_info.items():
                if key.startswith('/'):
                    pdf_metadata[key[1:]] = value
                else:
                    pdf_metadata[key] = value
            # Extract text from all pages
            for page_num in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_num]
                page_text = page.extract_text()
                if page_text:
                    extracted_text += f"--- Page {page_num + 1} ---\n{page_text}\n\n"
            # If text was found, add it as its own content
            if extracted_text.strip():
                extracted_content_found = True
                contents.append({
                    "sequence_nr": len(contents) + 1,
                    "name": f"{len(contents) + 1}_text",  # Simplified naming
                    "ext": "txt",
                    "content_type": "text",
                    "data": extracted_text,
                    "metadata": {
                        "is_text": True,
                        "source": "pdf",
                        "pages": len(pdf_reader.pages),
                        "pdf_metadata": pdf_metadata
                    }
                })
        # Extract images with PyMuPDF (fitz)
        try:
            with io.BytesIO(file_content) as pdf_stream:
                doc = fitz.open(stream=pdf_stream, filetype="pdf")
                image_count = 0
                for page_num in range(len(doc)):
                    page = doc[page_num]
                    image_list = page.get_images(full=True)
                    for img_index, img_info in enumerate(image_list):
                        try:
                            image_count += 1
                            xref = img_info[0]
                            base_image = doc.extract_image(xref)
                            image_bytes = base_image["image"]
                            image_ext = base_image["ext"]
                            # Add the image as content
                            extracted_content_found = True
                            contents.append({
                                "sequence_nr": len(contents) + 1,
                                "name": f"{len(contents) + 1}_image_page{page_num+1}_{img_index+1}",  # Simplified naming with label
                                "ext": image_ext,
                                "content_type": f"image/{image_ext}",
                                "data": image_bytes,
                                "metadata": {
                                    "is_text": False,
                                    "source": "pdf",
                                    "page": page_num + 1,
                                    "index": img_index
                                }
                            })
                        except Exception as img_e:
                            logger.warning(f"Error extracting image {img_index} on page {page_num + 1}: {str(img_e)}")
                # Close the document
                doc.close()
        except Exception as img_extract_e:
            logger.warning(f"Error during image extraction from the PDF: {str(img_extract_e)}")
    except Exception as e:
        logger.error(f"Error during PDF extraction: {str(e)}")
    # If nothing was extracted, add the original PDF
    if not extracted_content_found:
        contents.append({
            "sequence_nr": 1,
            "name": "1_pdf",  # Simplified naming
            "ext": "pdf",
            "content_type": "application/pdf",
            "data": file_content,
            "metadata": {
                "is_text": False,
                "format": "pdf"
            }
        })
    return contents
def extract_word_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Extracts text and tables from Word documents.
    Args:
        file_name: name of the file
        file_content: binary data of the file
        mime_type: MIME type of the file
    Returns:
        List of Word content objects (text, or the original binary as fallback) with a metadata.is_text flag
    """
    contents = []
    extracted_content_found = False
    # Determine the file extension
    file_extension = "docx" if mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" else "doc"
    try:
        # Load the Office extraction libraries
        _load_office_extractor()
        if not office_extractor_loaded:
            logger.warning("Word extraction not possible: libraries not available")
            # Add the original file as binary content
            contents.append({
                "sequence_nr": 1,
                "name": "1_word",  # Simplified naming
                "ext": file_extension,
                "content_type": mime_type,
                "data": file_content,
                "metadata": {
                    "is_text": False,
                    "format": "word"
                }
            })
            return contents
        # Only DOCX (the newer format) is supported
        if mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            with io.BytesIO(file_content) as docx_stream:
                doc = docx.Document(docx_stream)
                # Extract the text
                full_text = []
                for para in doc.paragraphs:
                    full_text.append(para.text)
                # Extract the tables
                for table in doc.tables:
                    for row in table.rows:
                        row_text = []
                        for cell in row.cells:
                            row_text.append(cell.text)
                        full_text.append(" | ".join(row_text))
                extracted_text = "\n\n".join(full_text)
                # Add the extracted text as content
                if extracted_text.strip():
                    extracted_content_found = True
                    contents.append({
                        "sequence_nr": 1,
                        "name": "1_text",  # Simplified naming
                        "ext": "txt",
                        "content_type": "text",
                        "data": extracted_text,
                        "metadata": {
                            "is_text": True,
                            "source": "docx",
                            "paragraph_count": len(doc.paragraphs),
                            "table_count": len(doc.tables)
                        }
                    })
        else:
            logger.warning("Extraction from the legacy Word format (DOC) is not supported")
    except Exception as e:
        logger.error(f"Error during Word extraction: {str(e)}")
    # If nothing was extracted, add the original document
    if not extracted_content_found:
        contents.append({
            "sequence_nr": 1,
            "name": "1_word",  # Simplified naming
            "ext": file_extension,
            "content_type": mime_type,
            "data": file_content,
            "metadata": {
                "is_text": False,
                "format": "word"
            }
        })
    return contents
def extract_excel_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Extracts tabular data from Excel files.
    Args:
        file_name: name of the file
        file_content: binary data of the file
        mime_type: MIME type of the file
    Returns:
        List of Excel content objects with a metadata.is_text flag
    """
    contents = []
    extracted_content_found = False
    # Determine the file extension
    file_extension = "xlsx" if mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" else "xls"
    try:
        # Load the Office extraction libraries
        _load_office_extractor()
        if not office_extractor_loaded:
            logger.warning("Excel extraction not possible: libraries not available")
            # Add the original file as binary content
            contents.append({
                "sequence_nr": 1,
                "name": "1_excel",  # Simplified naming
                "ext": file_extension,
                "content_type": mime_type,
                "data": file_content,
                "metadata": {
                    "is_text": False,
                    "format": "excel"
                }
            })
            return contents
        # Only XLSX (the newer format) is supported
        if mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
            with io.BytesIO(file_content) as xlsx_stream:
                workbook = openpyxl.load_workbook(xlsx_stream, data_only=True)
                # Extract each worksheet as a separate CSV content
                for sheet_index, sheet_name in enumerate(workbook.sheetnames):
                    sheet = workbook[sheet_name]
                    # Format the data as CSV
                    csv_rows = []
                    for row in sheet.iter_rows():
                        csv_row = []
                        for cell in row:
                            value = cell.value
                            if value is None:
                                csv_row.append("")
                            else:
                                csv_row.append(str(value).replace('"', '""'))
                        csv_rows.append(','.join(f'"{cell}"' for cell in csv_row))
                    csv_content = "\n".join(csv_rows)
                    # Add as CSV content
                    if csv_content.strip():
                        extracted_content_found = True
                        sheet_safe_name = sheet_name.replace(" ", "_").replace("/", "_").replace("\\", "_")
                        contents.append({
                            "sequence_nr": len(contents) + 1,
                            "name": f"{len(contents) + 1}_csv_{sheet_safe_name}",  # Simplified naming with sheet label
                            "ext": "csv",
                            "content_type": "csv",
                            "data": csv_content,
                            "metadata": {
                                "is_text": True,
                                "source": "xlsx",
                                "sheet": sheet_name,
                                "format": "csv"
                            }
                        })
        else:
            logger.warning("Extraction from the legacy Excel format (XLS) is not supported")
    except Exception as e:
        logger.error(f"Error during Excel extraction: {str(e)}")
    # If nothing was extracted, add the original document
    if not extracted_content_found:
        contents.append({
            "sequence_nr": 1,
            "name": "1_excel",  # Simplified naming
            "ext": file_extension,
            "content_type": mime_type,
            "data": file_content,
            "metadata": {
                "is_text": False,
                "format": "excel"
            }
        })
    return contents
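The manual quoting above doubles embedded quotes and wraps every cell in quotes; the standard-library csv module produces the same shape and would be a possible simplification (a sketch under that assumption, not part of this commit; sheet_to_csv is an illustrative name):

import csv
import io

def sheet_to_csv(sheet) -> str:
    """Render an openpyxl worksheet as CSV text with every cell quoted."""
    buffer = io.StringIO()
    writer = csv.writer(buffer, quoting=csv.QUOTE_ALL, lineterminator="\n")
    for row in sheet.iter_rows():
        writer.writerow(["" if cell.value is None else str(cell.value) for cell in row])
    return buffer.getvalue()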
def extract_powerpoint_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Extracts contents from PowerPoint presentations.
    Args:
        file_name: name of the file
        file_content: binary data of the file
        mime_type: MIME type of the file
    Returns:
        List of PowerPoint content objects with metadata.is_text = False
    """
    # For PowerPoint we currently return only the original binary file;
    # full extraction would require more specialized libraries
    file_extension = "pptx" if mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation" else "ppt"
    return [{
        "sequence_nr": 1,
        "name": "1_powerpoint",  # Simplified naming
        "ext": file_extension,
        "content_type": mime_type,
        "data": file_content,
        "metadata": {
            "is_text": False,
            "format": "powerpoint"
        }
    }]
def extract_binary_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Fallback for binary files where no specific extraction is possible.
    Args:
        file_name: name of the file
        file_content: binary data of the file
        mime_type: MIME type of the file
    Returns:
        List with one binary content object with metadata.is_text = False
    """
    file_extension = os.path.splitext(file_name)[1][1:] if os.path.splitext(file_name)[1] else "bin"
    return [{
        "sequence_nr": 1,
        "name": "1_binary",  # Simplified naming
        "ext": file_extension,
        "content_type": mime_type,
        "data": file_content,
        "metadata": {
            "is_text": False,
            "format": "binary"
        }
    }]

View file

@ -140,9 +140,8 @@ class AgentRegistry:
agent_infos.append({
"name": agent.name,
"capabilities": getattr(agent, 'capabilities', ""),
"result_format": getattr(agent, 'result_format', "Text")
})
logger.error(f"Agent mit Kennung '{agent.name}' hat keine vollständigen Daten")
logger.error(f"Agent '{agent.name}' does not show profile.")
seen_agents.add(agent)
return agent_infos
@ -158,7 +157,6 @@ class AgentBase:
"""Initialisiere den Basis-Agenten."""
self.name = "Basis-Agent"
self.capabilities = "Grundlegende Agentenfunktionen"
self.result_format = "Text"
self.ai_service = None
def set_dependencies(self, ai_service=None):
@ -168,7 +166,6 @@ class AgentBase:
return {
"name": self.name,
"capabilities": self.capabilities,
"result_format": self.result_format
}
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
@ -179,7 +176,6 @@ class AgentBase:
"role": "assistant",
"content": f"Ich bin {self.name}, aber ich bin nicht richtig konfiguriert. Bitte den AI-Service einrichten.",
"agent_name": self.name,
"result_format": "Text"
}
        # Build a simple prompt
@ -196,7 +192,6 @@ class AgentBase:
"role": "assistant",
"content": response_content,
"agent_name": self.name,
"result_format": self.result_format
}
except Exception as e:
logger.error(f"Fehler in Agent {self.id}: {str(e)}")
@ -204,7 +199,6 @@ class AgentBase:
"role": "assistant",
"content": f"Ich habe einen Fehler festgestellt: {str(e)}",
"agent_name": self.name,
"result_format": "Text"
}

View file

@ -6,11 +6,9 @@ config.ini files and environment variables stored in .env files, using a flat st
"""
import os
import configparser
import logging
from typing import Any, Dict, Optional
from pathlib import Path
import time
# Set up logging
logger = logging.getLogger(__name__)

View file

@ -5,7 +5,7 @@ import importlib
from passlib.context import CryptContext
from connectors.connector_db_json import DatabaseConnector
from modules.utility import APP_CONFIG
from modules.configuration import APP_CONFIG
logger = logging.getLogger(__name__)

View file

@ -1,6 +1,5 @@
from pydantic import BaseModel, Field
from typing import List, Dict, Any, Optional
from datetime import datetime
class Label(BaseModel):

View file

@ -7,7 +7,7 @@ import importlib
import hashlib
from connectors.connector_db_json import DatabaseConnector
from modules.utility import APP_CONFIG
from modules.configuration import APP_CONFIG
logger = logging.getLogger(__name__)
@ -208,15 +208,18 @@ class LucyDOMInterface:
# File Utilities
    def calculate_file_hash(self, file_content: bytes) -> str:
        """Computes a SHA-256 hash of the file content"""
        return hashlib.sha256(file_content).hexdigest()
    def check_for_duplicate_file(self, file_hash: str) -> Optional[Dict[str, Any]]:
        """Checks whether a file with the same hash already exists"""
        files = self.db.get_recordset("files", record_filter={"file_hash": file_hash})
        if files:
            return files[0]
        return None
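Deduplication works because identical bytes always hash to the same SHA-256 digest, so a second upload of the same content can reuse the existing entry. For illustration (values are arbitrary):

import hashlib
digest_a = hashlib.sha256(b"example file bytes").hexdigest()
digest_b = hashlib.sha256(b"example file bytes").hexdigest()
assert digest_a == digest_b  # same content, same hash, one stored copy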
    def get_mime_type(self, filename: str) -> str:
        """Determines the MIME type based on the file extension"""
        import os
        ext = os.path.splitext(filename)[1].lower()[1:]
extension_to_mime = {
@ -246,42 +249,53 @@ class LucyDOMInterface:
return extension_to_mime.get(ext.lower(), "application/octet-stream")
    # File methods
    # File methods - metadata-based operations
    def get_all_files(self) -> List[Dict[str, Any]]:
        """Returns all files of the current mandate"""
        """
        Returns all files of the current mandate, without binary data.
        Returns:
            List of FileItem objects without binary data
        """
files = self.db.get_recordset("files")
# Remove binary data from the response to reduce payload size
for file in files:
if "data" in file:
del file["data"]
return files
def get_file(self, file_id: int) -> Optional[Dict[str, Any]]:
"""Gibt eine Datei anhand ihrer ID zurück, ohne Binärdaten"""
"""
Gibt eine Datei anhand ihrer ID zurück, ohne Binärdaten.
Args:
file_id: ID der gesuchten Datei
Returns:
FileItem ohne Binärdaten oder None, wenn nicht gefunden
"""
files = self.db.get_recordset("files", record_filter={"id": file_id})
if files:
file = files[0]
# Remove binary data from the response to reduce payload size
if "data" in file:
del file["data"]
return file
return files[0]
return None
def create_file(self,
name: str,
mime_type: str,
size: int = None,
data: bytes = None,
file_hash: str = None) -> Dict[str, Any]:
"""Erstellt einen neuen Dateieintrag in der Datenbank mit Inhalt"""
def create_file(self, name: str, mime_type: str, size: int = None, file_hash: str = None) -> Dict[str, Any]:
"""
Erstellt einen neuen Dateieintrag in der Datenbank ohne Inhalt.
Der eigentliche Dateiinhalt wird separat in der FileData-Tabelle gespeichert.
Args:
name: Name der Datei
mime_type: MIME-Typ der Datei
size: Größe der Datei in Bytes
file_hash: Hash-Wert der Datei für Deduplizierung
Returns:
Das erstellte FileItem-Objekt
"""
file_data = {
"mandate_id": self.mandate_id,
"user_id": self.user_id,
"name": name,
"mime_type": mime_type,
"size": size,
"data": data, # Jetzt wird der Dateiinhalt direkt in der Datenbank gespeichert
"file_hash": file_hash,
"creation_date": self._get_current_timestamp()
}
@ -289,14 +303,14 @@ class LucyDOMInterface:
def update_file(self, file_id: int, update_data: Dict[str, Any]) -> Dict[str, Any]:
"""
Aktualisiert eine vorhandene Datei
Aktualisiert die Metadaten einer vorhandenen Datei ohne die Binärdaten zu beeinflussen.
Args:
file_id: ID der zu aktualisierenden Datei
update_data: Dictionary mit zu aktualisierenden Feldern
Returns:
Das aktualisierte Datei-Objekt ohne Binärdaten
Das aktualisierte FileItem-Objekt
"""
        # Check whether the file exists
file = self.get_file(file_id)
@ -304,17 +318,11 @@ class LucyDOMInterface:
raise FileNotFoundError(f"Datei mit ID {file_id} nicht gefunden")
        # Update the file
        updated_file = self.db.record_modify("files", file_id, update_data)
        # Remove the binary data from the response
if "data" in updated_file:
del updated_file["data"]
return updated_file
return self.db.record_modify("files", file_id, update_data)
def delete_file(self, file_id: int) -> bool:
"""
        Deletes a file from the database.
        Deletes a file from the database (metadata and content).
Args:
file_id: ID der Datei
@ -339,11 +347,20 @@ class LucyDOMInterface:
other_references = [f for f in self.db.get_recordset("files", record_filter={"file_hash": file_hash})
if f.get("id") != file_id]
            # If other files reference this content, only delete the database entry
            # If other files reference this content, only delete the database entry for the FileItem
            if other_references:
                logger.info(f"Other references to this file content found, deleting only the DB entry: {file_id}")
                logger.info(f"Other references to this file content found, deleting only the FileItem: {file_id}")
            else:
                # Also delete the file content in the FileData table
                try:
                    file_data_entries = self.db.get_recordset("file_data", record_filter={"id": file_id})
                    if file_data_entries:
                        self.db.record_delete("file_data", file_id)
                        logger.info(f"FileData for file {file_id} deleted")
                except Exception as e:
                    logger.warning(f"Error deleting the FileData for file {file_id}: {str(e)}")
            # Delete the database entry
            # Delete the FileItem entry
return self.db.record_delete("files", file_id)
except FileNotFoundError as e:
@ -356,9 +373,165 @@ class LucyDOMInterface:
logger.error(f"Fehler beim Löschen der Datei {file_id}: {str(e)}")
raise FileDeletionError(f"Fehler beim Löschen der Datei: {str(e)}")
    # FileData methods - binary-data operations
    def create_file_data(self, file_id: int, data: bytes) -> bool:
        """
        Stores the binary data of a file in the database as a Base64 string.
        Args:
            file_id: ID of the associated file
            data: binary data
        Returns:
            True on success, False on error
        """
try:
import base64
# Convert binary data to base64 string
if isinstance(data, bytes):
encoded_data = base64.b64encode(data).decode('utf-8')
logger.debug(f"Converted {len(data)} bytes to base64 string of length {len(encoded_data)}")
else:
logger.warning(f"Data is not bytes, but {type(data)}. Attempting to handle...")
# Try to convert to bytes if it's not already
if isinstance(data, str):
# Check if it might already be base64 encoded
                    try:
                        # See if it's valid base64 (validate=True rejects characters outside the base64 alphabet)
                        base64.b64decode(data, validate=True)
                        # If no error, assume it's already encoded
                        encoded_data = data
                        logger.info("Data appears to be already base64 encoded, using as is")
                    except Exception:
# Not base64, so encode the string as bytes then to base64
encoded_data = base64.b64encode(data.encode('utf-8')).decode('utf-8')
logger.info(f"Converted string to base64")
else:
# For other types, convert to string first
encoded_data = base64.b64encode(str(data).encode('utf-8')).decode('utf-8')
logger.warning(f"Converted non-standard type to base64")
# Create the file_data record with encoded data
file_data = {
"id": file_id,
"data": encoded_data
}
self.db.record_create("file_data", file_data)
logger.info(f"Successfully stored encoded data for file {file_id}")
return True
except Exception as e:
logger.error(f"Fehler beim Speichern der Binärdaten für Datei {file_id}: {str(e)}")
return False
    def get_file_data(self, file_id: int) -> Optional[bytes]:
        """
        Returns the binary data of a file.
        Converts the Base64 string from the database back to bytes.
        Args:
            file_id: ID of the file
        Returns:
            Binary data, or None if not found
        """
import base64
file_data_entries = self.db.get_recordset("file_data", record_filter={"id": file_id})
if file_data_entries and "data" in file_data_entries[0]:
encoded_data = file_data_entries[0]["data"]
try:
# Check if it's a string (most likely base64)
if isinstance(encoded_data, str):
try:
# Try to decode base64
binary_data = base64.b64decode(encoded_data)
logger.debug(f"Successfully decoded base64 string to {len(binary_data)} bytes")
return binary_data
except Exception as e:
logger.error(f"Failed to decode base64 data: {str(e)}")
# If it's not valid base64, return as bytes
return encoded_data.encode('utf-8')
# If it's already bytes (shouldn't happen with model change)
elif isinstance(encoded_data, bytes):
logger.warning(f"Data was already bytes, no conversion needed")
return encoded_data
else:
logger.error(f"Unexpected data type in database: {type(encoded_data)}")
return None
except Exception as e:
logger.error(f"Error processing file data: {str(e)}")
return None
else:
logger.warning(f"No data found for file ID {file_id}")
return None
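create_file_data and get_file_data form a Base64 round trip; the invariant they maintain is just standard-library encoding and decoding (the payload below is illustrative):

import base64
payload = b"\x00\x01binary payload"
stored = base64.b64encode(payload).decode("utf-8")  # what create_file_data writes
restored = base64.b64decode(stored)                 # what get_file_data returns
assert restored == payload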
    def update_file_data(self, file_id: int, data: Union[bytes, str]) -> bool:
        """
        Updates the binary data of a file in the database.
        Converts bytes to a Base64 string for storage.
        Args:
            file_id: ID of the file
            data: new binary data, or already-encoded data
        Returns:
            True on success, False on error
        """
try:
import base64
# Convert data to base64 string if it's bytes
if isinstance(data, bytes):
encoded_data = base64.b64encode(data).decode('utf-8')
logger.debug(f"Converted {len(data)} bytes to base64 string")
elif isinstance(data, str):
# Check if it might already be base64 encoded
                try:
                    # See if it's valid base64 (validate=True rejects characters outside the base64 alphabet)
                    base64.b64decode(data, validate=True)
                    # If no error, assume it's already encoded
                    encoded_data = data
                    logger.debug("Data appears to be already base64 encoded, using as is")
                except Exception:
                    # Not base64, so encode the string as bytes then to base64
                    encoded_data = base64.b64encode(data.encode('utf-8')).decode('utf-8')
                    logger.debug("Converted string to base64")
else:
# For other types, convert to string first
encoded_data = base64.b64encode(str(data).encode('utf-8')).decode('utf-8')
logger.warning(f"Converted non-standard type to base64")
# Check if a record already exists
file_data_entries = self.db.get_recordset("file_data", record_filter={"id": file_id})
if file_data_entries:
# Update the existing record
self.db.record_modify("file_data", file_id, {"data": encoded_data})
logger.info(f"Updated existing file data for file ID {file_id}")
else:
# Create a new record
file_data = {
"id": file_id,
"data": encoded_data
}
self.db.record_create("file_data", file_data)
logger.info(f"Created new file data for file ID {file_id}")
return True
except Exception as e:
logger.error(f"Fehler beim Aktualisieren der Binärdaten für Datei {file_id}: {str(e)}")
return False
    def save_uploaded_file(self, file_content: bytes, file_name: str) -> Dict[str, Any]:
        """
        Stores an uploaded file directly in the database.
        Stores an uploaded file in the database.
        Metadata is stored in the 'files' table,
        binary data in the 'file_data' table as a Base64 string.
        Args:
            file_content: binary data of the file
@ -385,11 +558,6 @@ class LucyDOMInterface:
if existing_file:
# Simply return the existing file metadata
logger.info(f"Duplikat gefunden für {file_name}: {existing_file['id']}")
# Entferne die Binärdaten aus der Antwort
if "data" in existing_file:
existing_file_copy = existing_file.copy()
del existing_file_copy["data"]
return existing_file_copy
return existing_file
            # Determine the MIME type
@ -398,28 +566,25 @@ class LucyDOMInterface:
            # Determine the file size
            file_size = len(file_content)
            # Save to the database
            # 1. Save the metadata in the 'files' table
logger.info(f"Saving file metadata to database for file: {file_name}")
db_file = self.create_file(
name=file_name,
mime_type=mime_type,
size=file_size,
data=file_content, # Dateiinhalt direkt in der Datenbank speichern
file_hash=file_hash
)
            # 2. Save the binary data as a Base64 string in the 'file_data' table
logger.info(f"Saving file content to database for file: {file_name}")
self.create_file_data(db_file["id"], file_content)
# Debug: Verify database record was created
if not db_file:
logger.warning(f"Database record for file {file_name} was not created properly")
else:
logger.info(f"Database record created for file {file_name}")
            # Remove the binary data from the response
if "data" in db_file:
db_file_copy = db_file.copy()
del db_file_copy["data"]
db_file = db_file_copy
logger.info(f"File upload process completed for: {file_name}")
return db_file
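A hedged end-to-end sketch of the new two-table upload path (the factory and method names are taken from this file and its imports; the IDs and bytes are illustrative):

from modules.lucydom_interface import get_lucydom_interface

interface = get_lucydom_interface(1, 1)  # mandate_id, user_id - illustrative values
meta = interface.save_uploaded_file(b"col_a,col_b\n1,2\n", "sample.csv")
content = interface.get_file_data(meta["id"])  # bytes restored from the Base64 column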
@ -429,7 +594,7 @@ class LucyDOMInterface:
def download_file(self, file_id: int) -> Optional[Dict[str, Any]]:
"""
        Returns a file for download.
        Returns a file for download, including its binary data.
Args:
file_id: ID der Datei
@ -438,20 +603,24 @@ class LucyDOMInterface:
Dictionary mit Dateidaten und -metadaten oder None, wenn nicht gefunden
"""
try:
            # Fetch the complete file including binary data from the database
files = self.db.get_recordset("files", record_filter={"id": file_id})
            # 1. Fetch the metadata from the 'files' table
file = self.get_file(file_id)
if not files or "data" not in files[0]:
raise FileNotFoundError(f"Datei mit ID {file_id} nicht gefunden oder hat keine Daten")
if not file:
raise FileNotFoundError(f"Datei mit ID {file_id} nicht gefunden")
file = files[0]
            # 2. Fetch the binary data from the 'file_data' table
file_content = self.get_file_data(file_id)
if file_content is None:
raise FileNotFoundError(f"Binärdaten für Datei mit ID {file_id} nicht gefunden")
return {
"id": file_id,
"name": file.get("name", f"file_{file_id}"),
"content_type": file.get("mime_type", "application/octet-stream"),
"size": file.get("size", len(file.get("data", b""))),
"content": file.get("data")
"size": file.get("size", len(file_content)),
"content": file_content
}
except FileNotFoundError as e:
# Re-raise FileNotFoundError as is

View file

@ -1,139 +0,0 @@
from pydantic import BaseModel, Field
from typing import List, Dict, Any, Optional
from datetime import datetime
class Label(BaseModel):
"""Label für ein Attribut oder eine Klasse mit Unterstützung für mehrere Sprachen"""
default: str
translations: Dict[str, str] = {}
def get_label(self, language: str = None):
"""Gibt das Label in der angegebenen Sprache zurück, oder den Standardwert wenn nicht verfügbar"""
if language and language in self.translations:
return self.translations[language]
return self.default
class Prompt(BaseModel):
"""Datenmodell für einen Prompt"""
id: int = Field(description="Eindeutige ID des Prompts")
mandate_id: int = Field(description="ID des zugehörigen Mandanten")
user_id: int = Field(description="ID des Erstellers")
content: str = Field(description="Inhalt des Prompts")
name: str = Field(description="Anzeigename des Prompts")
label: Label = Field(
default=Label(default="Prompt", translations={"en": "Prompt", "fr": "Invite"}),
description="Label für die Klasse"
)
# Labels für Attribute
field_labels: Dict[str, Label] = {
"id": Label(default="ID", translations={}),
"mandate_id": Label(default="Mandanten-ID", translations={"en": "Mandate ID", "fr": "ID de mandat"}),
"user_id": Label(default="Benutzer-ID", translations={"en": "User ID", "fr": "ID d'utilisateur"}),
"content": Label(default="Inhalt", translations={"en": "Content", "fr": "Contenu"}),
"name": Label(default="Name", translations={"en": "Label", "fr": "Nom"}),
}
class FileItem(BaseModel):
"""Datenmodell für ein Datenobjekt"""
id: int = Field(description="Eindeutige ID des Datenobjekts")
mandate_id: int = Field(description="ID des zugehörigen Mandanten")
user_id: int = Field(description="ID des Erstellers")
name: str = Field(description="Name des Datenobjekts")
mime_type: str = Field(description="Typ des Datenobjekts MIME-Typ")
size: Optional[str] = Field(None, description="Größe des Datenobjekts")
file_hash: str = Field(description="Hash code")
data: bytes = Field(description="Inhalt der Datei")
creation_date: Optional[str] = Field(None, description="Datum des Hochladens")
label: Label = Field(
default=Label(default="Datenobjekt", translations={"en": "Data Object", "fr": "Objet de données"}),
description="Label für die Klasse"
)
# Labels für Attribute
field_labels: Dict[str, Label] = {
"id": Label(default="ID", translations={}),
"mandate_id": Label(default="Mandanten-ID", translations={"en": "Mandate ID", "fr": "ID de mandat"}),
"user_id": Label(default="Benutzer-ID", translations={"en": "User ID", "fr": "ID d'utilisateur"}),
"name": Label(default="Name", translations={"en": "Name", "fr": "Nom"}),
"mime_type": Label(default="Typ", translations={"en": "Type", "fr": "Type"}),
"size": Label(default="Größe", translations={"en": "Size", "fr": "Taille"}),
"file_hash": Label(default="File-Hash", translations={"en": "Hash", "fr": "Hash"}),
"data": Label(default="Daten", translations={"en": "Data", "fr": "Contenu"}),
"creation_date": Label(default="Upload-Datum", translations={"en": "Upload date", "fr": "Date de téléchargement"})
}
# Workflow-Modellklassen
class DocumentContent(BaseModel):
"""Inhalt eines Dokuments im Workflow"""
sequence_nr: Optional[int] = Field(1,description="Sequenz-Nummer des Inhaltes im Quelldokument")
name: str = Field(description="Optionale Bezeichnung")
ext: str = Field(description="Content extension for export: txt, csv, json, jpg, png")
content_type: str = Field(description="MIME-Typ")
data: bytes = Field(description="Inhalt der Datei")
class Document(BaseModel):
"""Dokument im Workflow """
id: str = Field(description="Eindeutige ID des Dokuments")
file_id: int = Field(description="Quelldatei")
contents: List[DocumentContent] = Field(description="Dokumentinhalte")
class DataStats(BaseModel):
"""Statistiken für Performance und Datennutzung"""
processing_time: Optional[float] = Field(None, description="Verarbeitungszeit in Sekunden")
token_count: Optional[int] = Field(None, description="Token-Anzahl (für KI-Modelle)")
bytes_sent: Optional[int] = Field(None, description="Gesendete Bytes")
bytes_received: Optional[int] = Field(None, description="Empfangene Bytes")
class Message(BaseModel):
"""Nachrichtenobjekt im Workflow"""
id: str = Field(description="Eindeutige ID der Nachricht")
workflow_id: str = Field(description="Referenz zum übergeordneten Workflow")
parent_message_id: Optional[str] = Field(None, description="Referenz zur beantworteten Nachricht")
started_at: str = Field(description="Zeitstempel für Nachrichtenerstellung")
finished_at: Optional[str] = Field(None, description="Zeitstempel für Nachrichtenabschluss")
sequence_no: int = Field(description="Sequenznummer für Sortierung")
status: str = Field(description="Status der Nachricht ('processing', 'completed')")
role: str = Field(description="Rolle des Absenders ('system', 'user', 'assistant')")
data_stats: Optional[DataStats] = Field(None, description="Statistiken")
documents: Optional[List[Document]] = Field(None, description="Dokumente in dieser Nachricht")
content: Optional[str] = Field(None, description="Textinhalt der Nachricht")
agent_name: Optional[str] = Field(None, description="Name des verwendeten Agenten")
class Workflow(BaseModel):
"""Workflow-Objekt für Multi-Agent-System"""
id: str = Field(description="Eindeutige ID des Workflows")
name: Optional[str] = Field(None, description="Name des Workflows")
mandate_id: int = Field(description="ID des Mandanten")
user_id: int = Field(description="ID des Benutzers")
status: str = Field(description="Status des Workflows ('running', 'failed', 'stopped')")
started_at: str = Field(description="Startzeitpunkt")
last_activity: str = Field(description="Zeitpunkt der letzten Aktivität")
last_message_id: str = Field(description="The last registered message")
data_stats: Optional[Dict[str, Any]] = Field(None, description="Gesamt-Statistiken")
messages: List[Message] = Field(default=[], description="Nachrichtenverlauf")
logs: List[Dict[str, Any]] = Field(default=[], description="Protokolleinträge")
# Anfragemodelle für die API
class WorkflowCreateRequest(BaseModel):
"""Anfrage zur Erstellung eines neuen Workflows"""
name: Optional[str] = Field(None, description="Name des Workflows")
prompt: str = Field(description="Zu verwendender Prompt")
files: List[int] = Field(default=[], description="Liste von FileItem ID")
class UserInputRequest(BaseModel):
"""Anfrage für Benutzereingabe an einen laufenden Workflow"""
prompt: str = Field(description="Nachricht des Benutzers")
files: List[int] = Field(default=[], description="Liste zusätzlicher FileItem ID")

View file

@ -1,6 +1,5 @@
from pydantic import BaseModel, Field
from typing import List, Dict, Any, Optional
from datetime import datetime
class Label(BaseModel):
@ -39,7 +38,7 @@ class Prompt(BaseModel):
class FileItem(BaseModel):
"""Datenmodell für ein Datenobjekt"""
"""Datenmodell für ein File"""
id: int = Field(description="Eindeutige ID des Datenobjekts")
mandate_id: int = Field(description="ID des zugehörigen Mandanten")
user_id: int = Field(description="ID des Erstellers")
@ -47,7 +46,6 @@ class FileItem(BaseModel):
mime_type: str = Field(description="Typ des Datenobjekts MIME-Typ")
size: Optional[int] = Field(None, description="Größe des Datenobjekts in Bytes")
file_hash: str = Field(description="Hash code für Deduplizierung")
data: bytes = Field(description="Binärer Inhalt der Datei")
creation_date: Optional[str] = Field(None, description="Datum des Hochladens")
workflow_id: Optional[str] = Field(None, description="ID des zugehörigen Workflows, falls vorhanden")
@ -65,25 +63,32 @@ class FileItem(BaseModel):
"mime_type": Label(default="Typ", translations={"en": "Type", "fr": "Type"}),
"size": Label(default="Größe", translations={"en": "Size", "fr": "Taille"}),
"file_hash": Label(default="File-Hash", translations={"en": "Hash", "fr": "Hash"}),
"data": Label(default="Daten", translations={"en": "Data", "fr": "Contenu"}),
"creation_date": Label(default="Upload-Datum", translations={"en": "Upload date", "fr": "Date de téléchargement"}),
"workflow_id": Label(default="Workflow-ID", translations={"en": "Workflow ID", "fr": "ID du workflow"})
}
class FileData(BaseModel):
    """Data model for the file content"""
id: int = Field(description="Eindeutige ID des Datenobjekts")
data: str = Field(description="Binärer Inhalt der Datei als Base64-String")
# Workflow model classes
class DocumentContent(BaseModel):
    """Content of a document in the workflow"""
sequence_nr: Optional[int] = Field(1,description="Sequenz-Nummer des Inhaltes im Quelldokument")
name: str = Field(description="Optionale Bezeichnung")
sequence_nr: int = Field(1, description="Sequenz-Nummer des Inhaltes im Quelldokument")
name: str = Field(description="Bezeichnung")
ext: str = Field(description="Content extension for export: txt, csv, json, jpg, png")
content_type: str = Field(description="MIME-Typ")
data: bytes = Field(description="Inhalt der Datei")
data: str = Field(description="Binärer Inhalt der Daten als Base64-String")
summary: str = Field(description="Zusammenfassung des Datei-Inhaltes")
metadata: Dict[str, Any] = Field(default_factory=dict, description="Metadaten zum Inhalt, wie z.B. is_text Flag, Format-Informationen, Encoding usw.")
class Document(BaseModel):
    """Document in the workflow - directly references a file in the database"""
id: str = Field(description="Eindeutige ID des Dokuments")
name: str = Field(description="Name des Datenobjekts")
ext: str = Field(description="Extension des Datenobjekts")
file_id: int = Field(description="ID der referenzierten Datei in der Datenbank")
contents: List[DocumentContent] = Field(description="Dokumentinhalte")
@ -137,4 +142,4 @@ class WorkflowCreateRequest(BaseModel):
class UserInputRequest(BaseModel):
"""Anfrage für Benutzereingabe an einen laufenden Workflow"""
prompt: str = Field(description="Nachricht des Benutzers")
files: List[int] = Field(default=[], description="Liste zusätzlicher FileItem ID")
listFileId: List[int] = Field(default=[], description="Liste zusätzlicher FileItem ID")
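With data now a Base64 string, constructing a DocumentContent makes the encoding explicit (a sketch; field names as defined above, sample values illustrative):

import base64
content = DocumentContent(
    sequence_nr=1,
    name="1_text",
    ext="txt",
    content_type="text",
    data=base64.b64encode("sample text".encode("utf-8")).decode("utf-8"),
    summary="Short sample text",
    metadata={"is_text": True, "base64_encoded": True},
)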

View file

@ -1,6 +1,45 @@
....................... TASKS
run agent, then save output files to db
. file save -> file-id list, ALWAYS WRITE NEW FILES!
. chat_message_to_workflow(role, agent, chatmsg, workflow): with answer and file-id list
----------------------- OPEN
PRIO1:
Split big files into content parts
PRIO2:
implement cleanup routines for files in lucydom_interface (File_Management_CLEANUP_INTERVAL): temp files older than the interval, plus all orphaned files
frontend: no labels definition
Integrate NDA text as a modal form - data governance agreement at login with checkbox
sharepoint connector with document search, content search, content extraction
add connector to myoutlook
frontend to react
----------------------- DONE
replace all explicit prompts.
can you compile where chat.py still has explicit texts to the user inside the messages? - imagine a Japanese developer working on it, they would not understand them. references to the code elements are sufficient.
clean up the agents registry, including the agents
clean up file upload & drag-drop so that the file is simply written to the db, with the file inside the file object
@ -14,35 +53,9 @@ Workflow:
----------------------- OPEN
PRIO1:
Split big files into content-parts
Cleanup routine for files older than xxx days in upload dir
Integrate NDA Text as modal form - Data governance agreement by login with checkbox
frontend to react
PRIO2:
implement cleanup routines for files in lucydom_interface (File_Management_CLEANUP_INTERVAL): temp older than interval, all orphaned
frontend: no labels definition
sharepoint connector with document search, content search, content extraction
add connector to myoutlook
----------------------- DONE
can you please rework your code suggestion into a class "ChatManager" in the module "chat.py" and deliver that class to me. here is additional info and documents.
for the implementation of the functions, please use the attached modules as a basis, but write all code from scratch, because today's code is far too long and carries too many details at every level. the implementation of the functions should likewise be high-level, with all detail work moved out into foundational functions.

View file

@ -8,30 +8,37 @@ pydantic==1.10.13 # Older version without the Rust dependency
## Authentication & Security
python-jose==3.3.0
passlib==1.7.4
argon2-cffi>=21.3.0 # For password hashing in gateway_interface.py
## Database
mysql-connector-python==8.1.0
## PDF & Document Processing
reportlab==4.0.4
fitz
PyMuPDF>=1.23.7 # Instead of the imprecise 'fitz'
PyPDF2==3.0.1
python-docx>=0.8.11 # For Word documents
openpyxl>=3.1.2 # For Excel files
## Data Processing & Analysis
numpy==1.26.3 # Version compatible with pandas and matplotlib
pandas==2.2.3 # Keep the current version
FuzzyTM>=0.4.0
numpy==1.26.3 # Version compatible with pandas and matplotlib
pandas==2.2.3 # Keep the current version
## Data Visualization
matplotlib==3.8.0 # Keep the current version
matplotlib==3.8.0 # Keep the current version
seaborn==0.13.0
plotly==5.18.0
## Web Scraping & HTTP
beautifulsoup4==4.12.2
requests==2.31.0
chardet>=5.0.0 # For character-set detection of web content
## Image Processing
Pillow>=10.0.0 # For image processing (imported as PIL)
## Utilities
python-dateutil==2.8.2
python-dotenv==1.0.0
## Dependencies for trio (used by httpx)
sortedcontainers>=2.4.0 # Required by trio

View file

@ -7,7 +7,7 @@ from dataclasses import dataclass
import io
from modules.auth import get_current_active_user, get_user_context
from modules.utility import APP_CONFIG
from modules.configuration import APP_CONFIG
# Import interfaces
from modules.lucydom_interface import get_lucydom_interface, FileError, FileNotFoundError, FileStorageError, FilePermissionError, FileDeletionError
@ -75,7 +75,7 @@ async def get_files(current_user: Dict[str, Any] = Depends(get_current_active_us
try:
context = await get_context(current_user)
        # Fetch all files generically
        # Fetch all files generically - metadata only, no binary data
files = context.interface_data.get_all_files()
return files
except Exception as e:
@ -106,7 +106,7 @@ async def upload_file(
if len(file_content) > max_size:
raise HTTPException(
status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
detail=f"Datei zu groß. Maximale Größe: {APP_CONFIG.get("File_Management_MAX_UPLOAD_SIZE_MB")}MB"
detail=f"Datei zu groß. Maximale Größe: {APP_CONFIG.get('File_Management_MAX_UPLOAD_SIZE_MB')}MB"
)
        # Store the file in the database via the LucyDOM interface
@ -141,12 +141,14 @@ async def get_file(
current_user: Dict[str, Any] = Depends(get_current_active_user)
):
"""
Gibt eine Datei anhand ihrer ID direkt aus der Datenbank zurück.
Gibt eine Datei anhand ihrer ID zum Download zurück.
Ruft sowohl Metadaten als auch Binärdaten ab.
"""
try:
context = await get_context(current_user)
# Datei über das LucyDOM-Interface aus der Datenbank abrufen
# Verwendet die download_file-Methode, die nun Metadaten und Binärdaten kombiniert
file_data = context.interface_data.download_file(file_id)
        # Return the file
@ -192,11 +194,13 @@ async def delete_file(
):
"""
Löscht eine Datei anhand ihrer ID aus der Datenbank.
Entfernt sowohl die Metadaten als auch die Binärdaten.
"""
try:
context = await get_context(current_user)
# Datei über das LucyDOM-Interface löschen
# Die Methode kümmert sich nun um das Löschen beider Tabellen (files und file_data)
context.interface_data.delete_file(file_id)
        # Return a successful deletion without content (204 No Content)
@ -237,7 +241,7 @@ async def get_file_stats(
try:
context = await get_context(current_user)
        # Fetch all files
        # Fetch all files - metadata only
all_files = context.interface_data.get_all_files()
        # Compute statistics

View file

@ -1,227 +0,0 @@
#!/usr/bin/env python3
"""
Testskript zum Erstellen eines Workflows mit Prompt und Datei über den Gateway.
"""
import requests
import json
import os
import time
import sys
from datetime import datetime
# Konfiguration
API_BASE_URL = "http://localhost:8000" # Anpassen an deine Gateway-URL
API_TOKEN = "your_api_token_here" # Dein API-Token
# Headers für Authentifizierung
HEADERS = {
"Authorization": f"Bearer {API_TOKEN}",
"Content-Type": "application/json"
}
def log_message(message):
"""Gibt eine formatierte Nachricht mit Zeitstempel aus"""
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"[{timestamp}] {message}")
def upload_file(file_path):
"""Lädt eine Datei hoch und gibt die Datei-ID zurück"""
log_message(f"Lade Datei hoch: {file_path}")
if not os.path.exists(file_path):
log_message(f"FEHLER: Datei nicht gefunden: {file_path}")
return None
# Multipart-Formular für Datei-Upload vorbereiten
file_name = os.path.basename(file_path)
files = {
'file': (file_name, open(file_path, 'rb'), 'application/octet-stream')
}
# Datei hochladen
upload_url = f"{API_BASE_URL}/api/files/upload"
response = requests.post(
upload_url,
headers={"Authorization": f"Bearer {API_TOKEN}"}, # Nur Authorization-Header
files=files
)
if response.status_code != 200:
log_message(f"FEHLER: Datei-Upload fehlgeschlagen. Status: {response.status_code}")
log_message(f"Response: {response.text}")
return None
# Datei-ID extrahieren
file_data = response.json()
file_id = file_data.get("id")
log_message(f"Datei erfolgreich hochgeladen. ID: {file_id}")
return file_id
def create_workflow(prompt, file_id=None):
"""Erstellt einen neuen Workflow mit Prompt und optionaler Datei"""
log_message("Erstelle neuen Workflow...")
# Nachricht für den Workflow vorbereiten
user_input = {
"message": prompt
}
# Wenn eine Datei-ID vorhanden ist, füge sie hinzu
if file_id:
user_input["additional_files"] = [file_id]
# Workflow erstellen
workflow_url = f"{API_BASE_URL}/api/workflows/user-input"
response = requests.post(
workflow_url,
headers=HEADERS,
json=user_input
)
if response.status_code >= 400:
log_message(f"FEHLER: Workflow-Erstellung fehlgeschlagen. Status: {response.status_code}")
log_message(f"Response: {response.text}")
return None
# Workflow-ID extrahieren
workflow_data = response.json()
workflow_id = workflow_data.get("workflow_id")
log_message(f"Workflow erfolgreich erstellt. ID: {workflow_id}")
return workflow_id
def poll_workflow_status(workflow_id, max_attempts=20, delay=2):
"""Fragt den Status eines Workflows ab und wartet bis zur Fertigstellung"""
log_message(f"Prüfe Status des Workflows {workflow_id}...")
for attempt in range(1, max_attempts + 1):
status_url = f"{API_BASE_URL}/api/workflows/{workflow_id}/status"
response = requests.get(
status_url,
headers=HEADERS
)
if response.status_code != 200:
log_message(f"FEHLER: Status-Abfrage fehlgeschlagen. Status: {response.status_code}")
continue
status_data = response.json()
current_status = status_data.get("status")
log_message(f"Workflow-Status: {current_status} (Versuch {attempt}/{max_attempts})")
if current_status in ["completed", "stopped", "failed"]:
log_message(f"Workflow ist abgeschlossen mit Status: {current_status}")
return status_data
time.sleep(delay)
log_message(f"Maximale Anzahl von Versuchen erreicht. Letzter Status: {current_status}")
return None
def get_workflow_messages(workflow_id):
"""Ruft alle Nachrichten eines Workflows ab"""
log_message(f"Hole Nachrichten für Workflow {workflow_id}...")
messages_url = f"{API_BASE_URL}/api/workflows/{workflow_id}/messages"
response = requests.get(
messages_url,
headers=HEADERS
)
if response.status_code != 200:
log_message(f"FEHLER: Abrufen der Nachrichten fehlgeschlagen. Status: {response.status_code}")
return []
messages = response.json()
log_message(f"{len(messages)} Nachrichten gefunden.")
return messages
def print_workflow_results(workflow_id):
"""Gibt die Ergebnisse eines Workflows aus"""
log_message("=== WORKFLOW-ERGEBNISSE ===")
# Status abrufen
status_url = f"{API_BASE_URL}/api/workflows/{workflow_id}/status"
status_response = requests.get(status_url, headers=HEADERS)
if status_response.status_code == 200:
status_data = status_response.json()
log_message(f"Workflow-Name: {status_data.get('name')}")
log_message(f"Status: {status_data.get('status')}")
log_message(f"Gestartet: {status_data.get('started_at')}")
log_message(f"Letzte Aktivität: {status_data.get('last_activity')}")
# Nachrichten abrufen und ausgeben
messages = get_workflow_messages(workflow_id)
log_message(f"Anzahl der Nachrichten: {len(messages)}")
for i, msg in enumerate(messages, 1):
log_message(f"--- Nachricht {i} ---")
log_message(f"Rolle: {msg.get('role')}")
# Inhalt gekürzt ausgeben (maximal ersten 200 Zeichen)
content = msg.get('content', '')
if content:
if len(content) > 200:
log_message(f"Inhalt: {content[:200]}... [gekürzt]")
else:
log_message(f"Inhalt: {content}")
# Anzahl der Dokumente ausgeben
docs = msg.get('documents', [])
if docs:
log_message(f"Dokumente: {len(docs)}")
for j, doc in enumerate(docs, 1):
source = doc.get('source', {})
doc_name = source.get('name', f"Dokument {j}")
log_message(f" - {doc_name}")
def main():
"""Hauptfunktion zum Testen des Workflows"""
# Beispiel-Datei zum Hochladen (Pfad anpassen)
file_path = "example.csv" # Hier den Pfad zu deiner Testdatei angeben
# Prompt für den Workflow
test_prompt = """Bitte analysiere die angehängte Datei und erstelle eine Zusammenfassung der wichtigsten Informationen.
Wenn es sich um eine CSV-Datei handelt, identifiziere die Spalten und gib mir einen Überblick über die enthaltenen Daten.
Erstelle außerdem eine Visualisierung, wenn du Zahlenwerte in der Datei findest."""
try:
# Datei hochladen
file_id = upload_file(file_path)
if not file_id:
log_message("Test abgebrochen: Datei konnte nicht hochgeladen werden.")
return False
# Workflow erstellen
workflow_id = create_workflow(test_prompt, file_id)
if not workflow_id:
log_message("Test abgebrochen: Workflow konnte nicht erstellt werden.")
return False
# Auf Abschluss des Workflows warten
workflow_status = poll_workflow_status(workflow_id)
if not workflow_status:
log_message("Test unvollständig: Timeout beim Warten auf Workflow-Abschluss.")
# Weiter machen, um zumindest Teilergebnisse zu sehen
# Ergebnisse ausgeben
print_workflow_results(workflow_id)
log_message("Test abgeschlossen.")
return True
except Exception as e:
log_message(f"FEHLER: Unerwartete Ausnahme: {str(e)}")
import traceback
log_message(traceback.format_exc())
return False
if __name__ == "__main__":
log_message("=== WORKFLOW-TEST GESTARTET ===")
success = main()
log_message(f"=== WORKFLOW-TEST BEENDET (Erfolgreich: {success}) ===")
sys.exit(0 if success else 1)

View file

@ -1,182 +0,0 @@
#!/usr/bin/env python3
"""
Backend-Testskript für die Workflow-Funktionalität mit Prompt und Datei.
Dieses Skript testet die Backend-Komponenten direkt, ohne über die API zu gehen.
"""
import os
import sys
import asyncio
import uuid
from datetime import datetime
import logging
import json
# Pfad zum Projekt-Root hinzufügen, damit Module gefunden werden
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
# Logging konfigurieren
logging.basicConfig(
level=logging.INFO,
format='[%(asctime)s] %(levelname)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("backend_test")
# Imports aus dem Backend
from modules.lucydom_interface import get_lucydom_interface
from modules.chat import get_chat_manager
# Testparameter
TEST_MANDATE_ID = 1
TEST_USER_ID = 1
TEST_FILE_PATH = "d:/temp/prompt_a1.txt" # Pfad zur Testdatei anpassen
TEST_FILE_PATH1 = "d:/temp/LF-Nutshell.png" # Pfad zur Testdatei anpassen
TEST_PROMPT = """Bitte analysiere die angehängte Datei und erstelle eine Zusammenfassung der wichtigsten Informationen.
Erstelle außerdem eine Visualisierung, wenn du Zahlenwerte in der Datei findest."""
async def upload_test_file():
"""Lädt eine Testdatei ins Backend hoch und gibt die Datei-ID zurück"""
logger.info(f"Lade Testdatei hoch: {TEST_FILE_PATH}")
# LucyDOM-Interface initialisieren
lucy_interface = get_lucydom_interface(TEST_MANDATE_ID, TEST_USER_ID)
try:
# Prüfen, ob die Datei existiert
if not os.path.exists(TEST_FILE_PATH):
logger.error(f"Testdatei nicht gefunden: {TEST_FILE_PATH}")
return None
# Datei lesen
with open(TEST_FILE_PATH, 'rb') as f:
file_content = f.read()
# Dateinamen extrahieren
file_name = os.path.basename(TEST_FILE_PATH)
# Datei hochladen
file_meta = lucy_interface.save_uploaded_file(file_content, file_name)
file_id = file_meta.get('id')
logger.info(f"Datei erfolgreich hochgeladen. ID: {file_id}")
return file_meta
except Exception as e:
logger.error(f"Fehler beim Hochladen der Datei: {str(e)}")
return None
async def create_test_workflow(file_meta):
    """Creates a test workflow with the given prompt and file"""
    logger.info("Creating test workflow...")

    # Initialize the chat manager
    chat_manager = get_chat_manager(TEST_MANDATE_ID, TEST_USER_ID)

    # Prepare the message object
    message = {
        "role": "user",
        "content": TEST_PROMPT,
        "documents": [file_meta] if file_meta else []
    }
    try:
        # Create the workflow (a new workflow ID is generated automatically)
        workflow = await chat_manager.workflow_integrate_userinput(message)
        if not workflow:
            logger.error("Workflow could not be created")
            return None
        workflow_id = workflow.get("id")
        logger.info(f"Workflow created successfully. ID: {workflow_id}")
        return workflow
    except Exception as e:
        logger.error(f"Error during workflow creation: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())
        return None

def print_workflow_details(workflow):
    """Prints details about the workflow"""
    if not workflow:
        logger.warning("No workflow available to display")
        return

    logger.info("=== WORKFLOW DETAILS ===")
    logger.info(f"ID: {workflow.get('id')}")
    logger.info(f"Name: {workflow.get('name')}")
    logger.info(f"Status: {workflow.get('status')}")
    logger.info(f"Mandate ID: {workflow.get('mandate_id')}")
    logger.info(f"User ID: {workflow.get('user_id')}")
    logger.info(f"Started: {workflow.get('started_at')}")
    logger.info(f"Last activity: {workflow.get('last_activity')}")

    # Print the messages
    messages = workflow.get("messages", [])
    logger.info(f"Number of messages: {len(messages)}")
    for i, msg in enumerate(messages, 1):
        logger.info(f"--- Message {i} ---")
        logger.info(f"ID: {msg.get('id')}")
        logger.info(f"Role: {msg.get('role')}")
        logger.info(f"Sequence: {msg.get('sequence_no')}")
        logger.info(f"Agent: {msg.get('agent_name')}")

        # Print a shortened preview of the content
        content = msg.get('content', '')
        if content:
            preview = content[:200] + ('...' if len(content) > 200 else '')
            logger.info(f"Content: {preview}")

        # List the attached documents
        documents = msg.get('documents', [])
        if documents:
            logger.info(f"Documents: {len(documents)}")
            for j, doc in enumerate(documents, 1):
                source = doc.get('source', {})
                doc_name = source.get('name', f"Document {j}")
                logger.info(f"  - {doc_name}")

    # Print the logs
    logs = workflow.get("logs", [])
    logger.info(f"Number of logs: {len(logs)}")
    if len(logs) > 0:
        logger.info("Last 3 logs:")
        for log in logs[-3:]:
            logger.info(f"  - [{log.get('timestamp')}] {log.get('message')}")

async def main():
    """Main function for the backend test"""
    logger.info("=== BACKEND WORKFLOW TEST STARTED ===")
    try:
        # Step 1: upload the test file
        file_meta = await upload_test_file()
        if not file_meta:
            logger.error("Test aborted: file could not be uploaded")
            return False

        # Step 2: create the workflow
        workflow = await create_test_workflow(file_meta)
        if not workflow:
            logger.error("Test aborted: workflow could not be created")
            return False

        # Step 3: print the workflow details
        print_workflow_details(workflow)
        logger.info("=== BACKEND WORKFLOW TEST FINISHED SUCCESSFULLY ===")
        return True
    except Exception as e:
        logger.error(f"Unexpected error in test: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())
        return False

if __name__ == "__main__":
    # Run the event loop
    success = asyncio.run(main())
    sys.exit(0 if success else 1)

229
test_workflow1.py Normal file
View file

@ -0,0 +1,229 @@
"""
Test-Skript für den ChatManager-Workflow mit simulierten Datei-Uploads.
Demonstriert den vollständigen Workflow von Datei-Upload bis Chat-Ausführung.
"""
import asyncio
import base64
import logging
import os
import sys
from typing import Dict, Any, List, Tuple
from datetime import datetime
# Logging konfigurieren
logging.basicConfig(
level=logging.DEBUG,
format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
handlers=[logging.StreamHandler()]
)
logger = logging.getLogger("test_workflow")
# Pfad zum Projektverzeichnis hinzufügen
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Module importieren
from modules.lucydom_interface import get_lucydom_interface
from modules.chat import get_chat_manager
async def create_test_files(mandate_id: int, user_id: int) -> Tuple[int, int]:
    """
    Creates a text file and an image for testing and uploads them to the database.

    Args:
        mandate_id: ID of the mandate
        user_id: ID of the user

    Returns:
        Tuple of (text_file_id, image_file_id)
    """
    logger.info("Creating test files...")
    lucy_interface = get_lucydom_interface(mandate_id, user_id)

    # Create the text file
    text_content = """
    This is a test text file for the ChatManager workflow.
    It contains some information for testing the document processing.
    The ChatManager should be able to process this file
    and extract relevant information from it.
    This file serves as an example of text-based documents that can be
    used in a chat workflow.
    """
    text_file_bytes = text_content.encode('utf-8')
    text_file = lucy_interface.save_uploaded_file(text_file_bytes, "test_document.txt")
    text_file_id = text_file["id"]
    logger.info(f"Text file created with ID: {text_file_id}")

    # Create a simple test image using PIL
    try:
        from PIL import Image
        import io

        # Create a 100x100 red image
        img = Image.new('RGB', (100, 100), color='red')

        # Save it to a BytesIO buffer
        img_bytes = io.BytesIO()
        img.save(img_bytes, format='PNG')
        img_bytes = img_bytes.getvalue()

        # Upload it to the database
        image_file = lucy_interface.save_uploaded_file(img_bytes, "test_image.png")
        image_file_id = image_file["id"]
        logger.info(f"Image file created with ID: {image_file_id}")
    except ImportError:
        # Fallback if PIL is not available: use a pre-built minimal 1x1 PNG
        # (the same hex-encoded image data as in test_workflow2.py; the original
        # byte literal here contained only the PNG header)
        png_data = bytes.fromhex(
            "89504e470d0a1a0a0000000d49484452000000010000000108060000001f15c4"
            "89000000017352474200aece1ce90000000467414d410000b18f0bfc61050000"
            "000970485973000016250000162501495224f00000001974455874536f667477"
            "617265007777772e696e6b73636170652e6f72679bee3c1a0000000c49444154"
            "08d763f8ffff3f0005fe02fec1cd59830000000049454e44ae426082"
        )
        with open("./test_img_orig.png", 'wb') as f:
            f.write(png_data)
        image_file = lucy_interface.save_uploaded_file(png_data, "test_image.png")
        image_file_id = image_file["id"]
        logger.info(f"Image file created with ID: {image_file_id}")

    return text_file_id, image_file_id

async def run_chat_workflow(mandate_id: int, user_id: int, file_ids: List[int]) -> Dict[str, Any]:
    """
    Runs a chat workflow with the given file IDs.

    Args:
        mandate_id: ID of the mandate
        user_id: ID of the user
        file_ids: List of file IDs

    Returns:
        The workflow result
    """
    logger.info(f"Starting chat workflow with files: {file_ids}")

    # Initialize the ChatManager
    chat_manager = get_chat_manager(mandate_id, user_id)

    # Build the user request
    user_input = {
        "message": "Please analyze the uploaded files and explain their content to me.",
        "additional_fileids": file_ids
    }

    # Run the chat workflow
    workflow_result = await chat_manager.chat_run(user_input)
    logger.info(f"Workflow completed with ID: {workflow_result['id']}")
    return workflow_result

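# Note: the workflow dict returned by chat_run() is accessed below via the keys
# 'id', 'status', 'messages' and 'logs'; this shape is inferred from usage in
# this script rather than from a documented ChatManager contract.
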
def analyze_workflow_result(workflow: Dict[str, Any]) -> None:
    """
    Analyzes the workflow result and prints information about it.

    Args:
        workflow: The workflow result
    """
    logger.info("Analyzing workflow result:")
    logger.info(f"Workflow ID: {workflow['id']}")
    logger.info(f"Status: {workflow['status']}")
    logger.info(f"Number of messages: {len(workflow.get('messages', []))}")

    for i, message in enumerate(workflow.get('messages', [])):
        logger.info(f"Message {i+1}:")
        logger.info(f"  Role: {message.get('role', 'unknown')}")

        # Show only the first 100 characters of the content
        content = message.get('content', '')
        content_preview = content[:100] + '...' if len(content) > 100 else content
        logger.info(f"  Content: {content_preview}")

        # Show the documents attached to the message
        documents = message.get('documents', [])
        logger.info(f"  Documents: {len(documents)}")
        for j, doc in enumerate(documents):
            doc_id = doc.get('id', 'no ID')
            file_id = doc.get('file_id', 'no file_id')
            logger.info(f"    Document {j+1}: ID={doc_id}, File-ID={file_id}")

            # Information about the document contents
            contents = doc.get('contents', [])
            for k, content in enumerate(contents):
                content_name = content.get('name', 'no name')
                content_type = content.get('content_type', 'unknown')
                logger.info(f"      Content {k+1}: {content_name} ({content_type})")

    # Show the log entries, limited to the first 10
    # (the slice must be applied to the list, not to the enumerate object)
    logger.info(f"Logs: {len(workflow.get('logs', []))}")
    for i, log in enumerate(workflow.get('logs', [])[:10]):
        log_type = log.get('type', 'info')
        log_message = log.get('message', '')
        log_message_preview = log_message[:100] + '...' if len(log_message) > 100 else log_message
        logger.info(f"  Log {i+1} [{log_type}]: {log_message_preview}")

async def cleanup_test_files(mandate_id: int, user_id: int, file_ids: List[int]) -> None:
    """
    Cleans up the created test files.

    Args:
        mandate_id: ID of the mandate
        user_id: ID of the user
        file_ids: List of file IDs to delete
    """
    logger.info("Starting cleanup of test files...")
    lucy_interface = get_lucydom_interface(mandate_id, user_id)
    for file_id in file_ids:
        try:
            success = lucy_interface.delete_file(file_id)
            if success:
                logger.info(f"File with ID {file_id} deleted successfully")
            else:
                logger.warning(f"Error while deleting the file with ID {file_id}")
        except Exception as e:
            logger.error(f"Error while deleting the file with ID {file_id}: {str(e)}")
    logger.info("Cleanup finished")

async def main():
    """
    Main function that drives the entire test process.
    """
    # Test parameters
    MANDATE_ID = 1  # Test mandate ID
    USER_ID = 1     # Test user ID
    CLEANUP = True  # Clean up after the test

    try:
        logger.info("=== Test workflow for ChatManager started ===")

        # Step 1: create the test files
        text_file_id, image_file_id = await create_test_files(MANDATE_ID, USER_ID)
        file_ids = [text_file_id, image_file_id]

        # Step 2: run the chat workflow
        workflow_result = await run_chat_workflow(MANDATE_ID, USER_ID, file_ids)

        # Step 3: analyze the result
        analyze_workflow_result(workflow_result)

        # Step 4: optionally clean up
        if CLEANUP:
            await cleanup_test_files(MANDATE_ID, USER_ID, file_ids)

        logger.info("=== Test workflow completed successfully ===")
    except Exception as e:
        logger.error(f"Error in test workflow: {str(e)}", exc_info=True)
        logger.info("=== Test workflow finished with an error ===")

if __name__ == "__main__":
    # Run the async main function
    asyncio.run(main())
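
# A minimal usage sketch: run this script directly from the project root
# (assuming the backend modules imported above are resolvable from there):
#   python test_workflow1.py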

373
test_workflow2.py Normal file
View file

@ -0,0 +1,373 @@
"""
Erweitertes Test-Skript für den ChatManager-Workflow mit simulierten Datei-Uploads.
Bietet zusätzliche Konfigurationsmöglichkeiten und detailliertere Tests.
"""
import asyncio
import logging
import os
import sys
import argparse
import json
from typing import Dict, Any, List, Tuple, Optional
from datetime import datetime
# Logging konfigurieren
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[logging.StreamHandler()]
)
logger = logging.getLogger("test_workflow")
# Pfad zum Projektverzeichnis hinzufügen
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Module importieren
from modules.lucydom_interface import get_lucydom_interface
from modules.chat import get_chat_manager
class TestConfig:
"""Konfigurationsklasse für Testparameter"""
def __init__(self):
self.mandate_id = 1
self.user_id = 1
self.cleanup = True
self.save_results = True
self.results_dir = "test_results"
self.test_message = "Analysiere bitte die hochgeladenen Dateien und erkläre mir deren Inhalt."
self.text_file_content = """
Dies ist eine Test-Textdatei für den ChatManager-Workflow.
Sie enthält einige Informationen zum Testen der Dokumentverarbeitung.
Der ChatManager sollte in der Lage sein, diese Datei zu verarbeiten
und daraus relevante Informationen zu extrahieren.
Diese Datei dient als Beispiel für Text-basierte Dokumente, die in einem
Chat-Workflow verwendet werden können.
"""
def parse_args() -> TestConfig:
    """Parses the command-line arguments"""
    parser = argparse.ArgumentParser(description="Test for the ChatManager workflow")
    parser.add_argument("--mandate-id", type=int, default=1, help="ID of the mandate")
    parser.add_argument("--user-id", type=int, default=1, help="ID of the user")
    parser.add_argument("--no-cleanup", action="store_true", help="Do not delete the test files")
    parser.add_argument("--no-save", action="store_true", help="Do not save the results")
    parser.add_argument("--results-dir", type=str, default="test_results", help="Directory for the results")
    parser.add_argument("--message", type=str, help="User message for the test")
    args = parser.parse_args()

    config = TestConfig()
    config.mandate_id = args.mandate_id
    config.user_id = args.user_id
    config.cleanup = not args.no_cleanup
    config.save_results = not args.no_save
    config.results_dir = args.results_dir
    if args.message:
        config.test_message = args.message
    return config

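# Example invocations (a sketch based on the argparse flags defined above):
#   python test_workflow2.py --mandate-id 2 --user-id 5
#   python test_workflow2.py --no-cleanup --results-dir ./my_results
#   python test_workflow2.py --message "Summarize the uploaded files."
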
async def create_test_files(config: TestConfig) -> Tuple[int, int]:
    """
    Creates a text file and an image for testing and uploads them to the database.

    Args:
        config: Test configuration

    Returns:
        Tuple of (text_file_id, image_file_id)
    """
    logger.info("Creating test files...")
    lucy_interface = get_lucydom_interface(config.mandate_id, config.user_id)

    # Create the text file
    text_content = config.text_file_content
    text_file_bytes = text_content.encode('utf-8')
    text_file = lucy_interface.save_uploaded_file(text_file_bytes, "test_document.txt")
    text_file_id = text_file["id"]
    logger.info(f"Text file created with ID: {text_file_id}")

    # Create the image file (a minimal 1x1 PNG, hex-encoded)
    png_data = bytes.fromhex(
        "89504e470d0a1a0a0000000d49484452000000010000000108060000001f15c4"
        "89000000017352474200aece1ce90000000467414d410000b18f0bfc61050000"
        "000970485973000016250000162501495224f00000001974455874536f667477"
        "617265007777772e696e6b73636170652e6f72679bee3c1a0000000c49444154"
        "08d763f8ffff3f0005fe02fec1cd59830000000049454e44ae426082"
    )
    image_file = lucy_interface.save_uploaded_file(png_data, "test_image.png")
    image_file_id = image_file["id"]
    logger.info(f"Image file created with ID: {image_file_id}")
    return text_file_id, image_file_id

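# For reference: the IHDR chunk in the hex blob above declares width=1, height=1,
# bit depth 8 and color type 6 (RGBA), i.e. a single-pixel truecolor-with-alpha PNG.
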
async def verify_uploaded_files(mandate_id: int, user_id: int, file_ids: List[int]) -> bool:
    """
    Checks whether the uploaded files were stored correctly in the database.

    Args:
        mandate_id: ID of the mandate
        user_id: ID of the user
        file_ids: List of file IDs

    Returns:
        True if all files are available
    """
    logger.info("Verifying uploaded files...")
    lucy_interface = get_lucydom_interface(mandate_id, user_id)
    all_files_available = True
    for file_id in file_ids:
        file = lucy_interface.get_file(file_id)
        if file:
            file_data = lucy_interface.get_file_data(file_id)
            if file_data:
                logger.info(f"File {file_id} ({file.get('name', 'unknown')}, {file.get('mime_type', 'unknown')}) is available")
                logger.info(f"  Size: {len(file_data)} bytes")
            else:
                logger.error(f"File {file_id} has no binary data")
                all_files_available = False
        else:
            logger.error(f"File with ID {file_id} not found in the database")
            all_files_available = False
    return all_files_available

async def run_chat_workflow(config: TestConfig, file_ids: List[int]) -> Dict[str, Any]:
    """
    Runs a chat workflow with the given file IDs.

    Args:
        config: Test configuration
        file_ids: List of file IDs

    Returns:
        The workflow result
    """
    logger.info(f"Starting chat workflow with files: {file_ids}")

    # Initialize the ChatManager
    chat_manager = get_chat_manager(config.mandate_id, config.user_id)

    # Build the user request
    user_input = {
        "message": config.test_message,
        "additional_fileids": file_ids
    }

    # Record the start time
    start_time = datetime.now()

    # Run the chat workflow
    workflow_result = await chat_manager.chat_run(user_input)

    # Compute the end time and duration
    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()
    logger.info(f"Workflow completed with ID: {workflow_result['id']}")
    logger.info(f"Duration: {duration:.2f} seconds")
    return workflow_result

def analyze_workflow_result(workflow: Dict[str, Any]) -> Dict[str, Any]:
    """
    Analyzes the workflow result and returns statistics.

    Args:
        workflow: The workflow result

    Returns:
        Dictionary with the analysis results
    """
    logger.info("Analyzing workflow result:")

    # Basic information
    analysis = {
        "workflow_id": workflow.get("id"),
        "status": workflow.get("status"),
        "message_count": len(workflow.get("messages", [])),
        "log_count": len(workflow.get("logs", [])),
        "document_count": 0,
        "roles": {},
        "document_types": {},
        "response_sizes": []
    }

    # Analyze the messages
    for message in workflow.get("messages", []):
        # Count the roles
        role = message.get("role", "unknown")
        if role not in analysis["roles"]:
            analysis["roles"][role] = 0
        analysis["roles"][role] += 1

        # Content size of assistant responses
        if role == "assistant":
            content = message.get("content", "")
            analysis["response_sizes"].append(len(content))

        # Count and analyze the documents
        documents = message.get("documents", [])
        analysis["document_count"] += len(documents)
        for doc in documents:
            contents = doc.get("contents", [])
            for content in contents:
                content_type = content.get("content_type", "unknown")
                if content_type not in analysis["document_types"]:
                    analysis["document_types"][content_type] = 0
                analysis["document_types"][content_type] += 1

    # Log the summary
    logger.info(f"Workflow ID: {analysis['workflow_id']}")
    logger.info(f"Status: {analysis['status']}")
    logger.info(f"Number of messages: {analysis['message_count']}")
    logger.info(f"Number of documents: {analysis['document_count']}")
    logger.info(f"Role distribution: {analysis['roles']}")
    logger.info(f"Document types: {analysis['document_types']}")
    if analysis["response_sizes"]:
        avg_size = sum(analysis["response_sizes"]) / len(analysis["response_sizes"])
        logger.info(f"Average response size: {avg_size:.2f} characters")

    # Detailed message information, limited to the first 5 messages
    for i, message in enumerate(workflow.get("messages", [])[:5]):
        logger.info(f"Message {i+1}:")
        logger.info(f"  Role: {message.get('role', 'unknown')}")

        # Show only the first 100 characters of the content
        content = message.get("content", "")
        content_preview = content[:100] + "..." if len(content) > 100 else content
        logger.info(f"  Content: {content_preview}")

        # Show the documents attached to the message
        documents = message.get("documents", [])
        if documents:
            logger.info(f"  Documents: {len(documents)}")
            for j, doc in enumerate(documents):
                file_id = doc.get("file_id", "no file_id")
                logger.info(f"    Document {j+1}: File-ID={file_id}")

    return analysis

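# Illustrative shape of the returned analysis dict (all values made up):
#   {"workflow_id": 17, "status": "completed", "message_count": 2, "log_count": 5,
#    "document_count": 2, "roles": {"user": 1, "assistant": 1},
#    "document_types": {"text/plain": 1, "image/png": 1}, "response_sizes": [842]}
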
def save_test_results(config: TestConfig, workflow: Dict[str, Any], analysis: Dict[str, Any]) -> None:
    """
    Saves the test results to files.

    Args:
        config: Test configuration
        workflow: The complete workflow result
        analysis: The analysis results
    """
    if not config.save_results:
        return

    # Create the results directory if it does not exist
    os.makedirs(config.results_dir, exist_ok=True)

    # Timestamp for unique file names
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save the analysis
    analysis_file = os.path.join(config.results_dir, f"analysis_{timestamp}.json")
    with open(analysis_file, "w", encoding="utf-8") as f:
        json.dump(analysis, f, indent=2, ensure_ascii=False)
    logger.info(f"Analysis saved to: {analysis_file}")

    # Save the complete workflow (without large binary data).
    # Deep-copy it so that stripping the binary data below does not mutate
    # the caller's nested document structures.
    import copy
    workflow_copy = copy.deepcopy(workflow)

    # Remove binary data from the export to reduce the file size
    for message in workflow_copy.get("messages", []):
        if "documents" in message:
            for doc in message.get("documents", []):
                if "contents" in doc:
                    for content in doc.get("contents", []):
                        if "data" in content and isinstance(content["data"], bytes) and len(content["data"]) > 1000:
                            content["data"] = f"[{len(content['data'])} bytes]"

    workflow_file = os.path.join(config.results_dir, f"workflow_{timestamp}.json")
    with open(workflow_file, "w", encoding="utf-8") as f:
        # Convert bytes to strings for JSON serialization; anything else that
        # is not JSON-serializable is stringified via str()
        json.dump(workflow_copy, f, indent=2, ensure_ascii=False, default=lambda o:
                  o.decode("utf-8") if isinstance(o, bytes) else str(o))
    logger.info(f"Workflow saved to: {workflow_file}")

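# Illustrative example of a sanitized content entry in the exported JSON
# (field names taken from the loop above, values made up):
#   {"name": "test_image.png", "content_type": "image/png", "data": "[4096 bytes]"}
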
async def cleanup_test_files(config: TestConfig, file_ids: List[int]) -> None:
    """
    Cleans up the created test files.

    Args:
        config: Test configuration
        file_ids: List of file IDs to delete
    """
    if not config.cleanup:
        logger.info("Cleanup skipped (--no-cleanup)")
        return

    logger.info("Starting cleanup of test files...")
    lucy_interface = get_lucydom_interface(config.mandate_id, config.user_id)
    for file_id in file_ids:
        try:
            success = lucy_interface.delete_file(file_id)
            if success:
                logger.info(f"File with ID {file_id} deleted successfully")
            else:
                logger.warning(f"Error while deleting the file with ID {file_id}")
        except Exception as e:
            logger.error(f"Error while deleting the file with ID {file_id}: {str(e)}")
    logger.info("Cleanup finished")

async def main():
    """
    Main function that drives the entire test process.
    """
    # Load the configuration
    config = parse_args()
    try:
        logger.info("=== Test workflow for ChatManager started ===")
        logger.info(f"Mandate ID: {config.mandate_id}, User ID: {config.user_id}")

        # Step 1: create the test files
        text_file_id, image_file_id = await create_test_files(config)
        file_ids = [text_file_id, image_file_id]

        # Step 2: verify the uploaded files
        files_ok = await verify_uploaded_files(config.mandate_id, config.user_id, file_ids)
        if not files_ok:
            logger.error("Error with the uploaded files, aborting the test")
            return

        # Step 3: run the chat workflow
        workflow_result = await run_chat_workflow(config, file_ids)

        # Step 4: analyze the result
        analysis = analyze_workflow_result(workflow_result)

        # Step 5: save the results
        save_test_results(config, workflow_result, analysis)

        # Step 6: clean up
        await cleanup_test_files(config, file_ids)

        logger.info("=== Test workflow completed successfully ===")
    except Exception as e:
        logger.error(f"Error in test workflow: {str(e)}", exc_info=True)
        logger.info("=== Test workflow finished with an error ===")

if __name__ == "__main__":
    # Run the async main function
    asyncio.run(main())