"""
Central file management module for the Agentservice.
"""
|
|
|
|
import os
|
|
import logging
|
|
import base64
|
|
import json
|
|
import uuid
|
|
from datetime import datetime
|
|
from typing import List, Dict, Any, Optional, Tuple, Union, BinaryIO
|
|
from io import BytesIO
|
|
|
|
# Import utilities from agentservice_utils
|
|
from modules.agentservice_utils import extract_text_from_file_content, is_text_extractable
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Helper function for adding logs
|
|
def _log(add_log_func, workflow_id, message, level="info"):
|
|
"""Helper function for adding logs with standardized formatting."""
|
|
if add_log_func and workflow_id:
|
|
add_log_func(workflow_id, message, level)
|
|
|
|
# Also log to standard logger
|
|
if level == "info":
|
|
logger.info(message)
|
|
elif level == "warning":
|
|
logger.warning(message)
|
|
elif level == "error":
|
|
logger.error(message)
|
|
|
|
class FileExtractionError(Exception):
    """Raised when extracting content (text or images) from a file fails."""
|
|
|
|
class FileManager:
|
|
"""Central file management for the Agentservice."""
|
|
|
|
_instance = None
|
|
|
|
@classmethod
|
|
def get_instance(cls):
|
|
"""Get the singleton instance of FileManager."""
|
|
if cls._instance is None:
|
|
cls._instance = cls()
|
|
return cls._instance
|
|
|
|
def __init__(self):
|
|
"""Initialize the FileManager."""
|
|
# Ensure singleton pattern
|
|
if FileManager._instance is not None:
|
|
raise RuntimeError("Singleton instance already exists - use get_instance()")
|
|
|
|
# Import utilities
|
|
# Instead of storing file_utils, we'll use the imported functions directly
|
|
|
|
async def read_file_contents(self,
|
|
file_contexts: List[Dict[str, Any]],
|
|
lucydom_interface,
|
|
workflow_id: str = None,
|
|
add_log_func = None,
|
|
ai_service = None # AI service parameter for image analysis
|
|
) -> Dict[str, Dict[str, Any]]:
|
|
"""
|
|
Liest den Inhalt aller Dateien und führt bei Bildern und Dokumenten Analysen durch.
|
|
Verwendet LucyDOM-Interface statt direkter Dateizugriffe.
|
|
Gibt jetzt ein Dictionary mit Dateiinhalten und Extraktionsstatus zurück.
|
|
|
|
Args:
|
|
file_contexts: Liste der Dateikontexte mit Metadaten
|
|
lucydom_interface: LucyDOM-Interface für Dateizugriffe
|
|
workflow_id: Optionale ID des Workflows für Logging
|
|
add_log_func: Optionale Funktion für das Hinzufügen von Logs
|
|
ai_service: Optionaler AI-Service für die Bildanalyse
|
|
|
|
Returns:
|
|
Dictionary mit Dateiinhalten und Metadaten (file_id -> {content, is_extracted, ...})
|
|
"""
|
|
file_contents = {}
|
|
|
|
# Add debug logging
|
|
logger.info(f"Reading contents of {len(file_contexts)} files for workflow {workflow_id}")
|
|
|
|
for file in file_contexts:
|
|
file_id = file["id"]
|
|
file_name = file["name"]
|
|
file_type = file.get("type", "unknown")
|
|
content_type = file.get("content_type")
|
|
|
|
print("DEGUB5:",file_name,file_type)
|
|
|
|
try:
|
|
# Dateiinhalt über LucyDOM-Interface abrufen
|
|
file_data = await lucydom_interface.read_file_content(file_id)
|
|
|
|
if not file_data:
|
|
_log(add_log_func, workflow_id, f"Datei {file_name} nicht gefunden", "warning")
|
|
file_contents[file_id] = {
|
|
"content": f"File content not available (File not found)",
|
|
"is_extracted": False,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content_type": content_type
|
|
}
|
|
continue
|
|
|
|
logger.info(f"Successfully read file: {file_name} (ID: {file_id}, Type: {file_type})")
|
|
|
|
# Bildverarbeitung - immer KI-Analyse verwenden, wenn verfügbar
|
|
if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
|
|
if ai_service and hasattr(ai_service, "analyze_image"):
|
|
try:
|
|
image_analysis = await ai_service.analyze_image(
|
|
image_data=file_data,
|
|
prompt="Describe this image in detail",
|
|
mime_type=content_type
|
|
)
|
|
|
|
logger.debug(f"Image analysis successfully generated for {file_name}")
|
|
|
|
file_contents[file_id] = {
|
|
"content": f"Image Analysis:\n{image_analysis}",
|
|
"is_extracted": False, # Bildanalyse gilt nicht als Text-Extraktion
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content_type": content_type
|
|
}
|
|
_log(add_log_func, workflow_id, f"Image {file_name} analyzed successfully", "info")
|
|
except Exception as e:
|
|
logger.error(f"Error analyzing image {file_name}: {str(e)}")
|
|
_log(add_log_func, workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error")
|
|
file_contents[file_id] = {
|
|
"content": f"Image file: {file_name} (Analysis failed: {str(e)})",
|
|
"is_extracted": False,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content_type": content_type
|
|
}
|
|
else:
|
|
file_contents[file_id] = {
|
|
"content": f"Image file: {file_name} (AI analysis not available)",
|
|
"is_extracted": False,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content_type": content_type
|
|
}
|
|
|
|
# Dokument- und Textdateien
|
|
elif (file_type == "document" or not file_type or file_name.lower().endswith(('.csv', '.txt', '.json', '.xml')) or (content_type and content_type.startswith('text/'))):
|
|
# Verwende die zentrale Textextraktionsfunktion mit Dateiinhalt
|
|
content, is_extracted = extract_text_from_file_content(
|
|
file_data, file_name, content_type
|
|
)
|
|
file_contents[file_id] = {
|
|
"content": content,
|
|
"is_extracted": is_extracted,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content_type": content_type
|
|
}
|
|
_log(add_log_func, workflow_id,
|
|
f"File {file_name} read successfully (extracted: {is_extracted})", "info")
|
|
|
|
# Andere Dateitypen - nur Metadaten speichern
|
|
else:
|
|
file_contents[file_id] = {
|
|
"content": f"File: {file_name} (Type: {file_type}, content not available)",
|
|
"is_extracted": False,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content_type": content_type
|
|
}
|
|
_log(add_log_func, workflow_id, f"Unsupported file type: {file_type} for {file_name}", "warning")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error reading file {file_name}: {str(e)}")
|
|
_log(add_log_func, workflow_id, f"Error reading file {file_name}: {str(e)}", "error")
|
|
file_contents[file_id] = {
|
|
"content": f"File content not available (Error: {str(e)})",
|
|
"is_extracted": False,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content_type": content_type
|
|
}
|
|
|
|
return file_contents
|
|
|
|
@staticmethod
|
|
def add_file_to_message(message: Dict[str, Any], file_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""
|
|
Fügt eine Datei zu einer Nachricht hinzu mit Kennzeichnung, ob Text extrahiert wurde.
|
|
|
|
Args:
|
|
message: Die zu erweiternde Nachricht
|
|
file_data: Dateimetadaten und Inhalt
|
|
|
|
Returns:
|
|
Die aktualisierte Nachricht mit der Datei
|
|
"""
|
|
# Detailliertes Logging für Debugging
|
|
logger.info(f"Adding file to message: {file_data.get('name', 'unnamed_file')} (ID: {file_data.get('id', 'unknown')})")
|
|
|
|
# Initialize documents array if needed
|
|
if "documents" not in message:
|
|
message["documents"] = []
|
|
logger.debug("Initialized empty documents array in message")
|
|
|
|
# Create a unique ID for the document if not provided
|
|
doc_id = file_data.get("id", f"file_{uuid.uuid4()}")
|
|
|
|
# Extract file size if available
|
|
file_size = file_data.get("size")
|
|
if isinstance(file_size, str) and file_size.isdigit():
|
|
file_size = int(file_size)
|
|
elif file_size is None and file_data.get("content"):
|
|
# Estimate size from content if not provided
|
|
file_size = len(file_data.get("content", ""))
|
|
|
|
# Bestimmen, ob der Inhalt bereits extrahiert wurde
|
|
content = file_data.get("content", "No content available")
|
|
file_name = file_data.get("name", "unnamed_file")
|
|
content_type = file_data.get("content_type")
|
|
|
|
# Prüfen, ob der Inhalt als extrahiert markiert werden sollte
|
|
is_extracted = file_data.get("is_extracted", False)
|
|
if not is_extracted and isinstance(content, str) and content.strip() and file_name:
|
|
# Wenn nicht explizit markiert, aber Inhalt vorhanden ist, prüfen wir den Dateityp
|
|
is_extracted = is_text_extractable(file_name, content_type)
|
|
|
|
# Create standard document structure that matches the data model
|
|
document = {
|
|
"id": doc_id,
|
|
"source": {
|
|
"type": "file",
|
|
"id": file_data.get("id", doc_id),
|
|
"name": file_name,
|
|
"content_type": content_type,
|
|
"size": file_size,
|
|
"upload_date": file_data.get("upload_date", datetime.now().isoformat())
|
|
},
|
|
"contents": [
|
|
{
|
|
"type": "text",
|
|
"text": content,
|
|
"is_extracted": is_extracted # Flag für den Extraktionsstatus hinzufügen
|
|
}
|
|
]
|
|
}
|
|
|
|
# Log document structure for debugging
|
|
logger.debug(f"Created document structure: id={doc_id}, name={file_name}, is_extracted={is_extracted}")
|
|
|
|
# Check if file is already in the message to avoid duplicates
|
|
file_already_added = any(
|
|
doc.get("source", {}).get("id") == file_data.get("id")
|
|
for doc in message.get("documents", [])
|
|
)
|
|
|
|
if not file_already_added:
|
|
message["documents"].append(document)
|
|
logger.info(f"File {file_name} successfully added to message (total: {len(message.get('documents', []))} files)")
|
|
else:
|
|
logger.info(f"File {file_name} already exists in message, skipping")
|
|
|
|
return message
|
|
|
|
    async def analyze_file(self, file_id: int, prompt: str, lucydom_interface, ai_service) -> Dict[str, Any]:
        """
        Analyze a file using the appropriate method based on file type.

        Dispatch order: images go to the AI vision endpoint; PDFs get text
        extraction plus best-effort embedded-image analysis; spreadsheets/CSV
        and all remaining types get text extraction followed by an AI text
        analysis.

        Args:
            file_id: ID of the file to analyze
            prompt: Analysis prompt
            lucydom_interface: Interface for database access
            ai_service: Service for AI requests

        Returns:
            Analysis result dict with file_id, file_name, analysis_type, result
            (and has_images for PDFs)

        Raises:
            ValueError: When a dependency is missing, the file or its content
                cannot be found, or text extraction fails.
        """
        if not lucydom_interface:
            raise ValueError("LucyDOM interface not available")

        if not ai_service:
            raise ValueError("AI service not available")

        try:
            # Get file metadata
            file = lucydom_interface.get_file(file_id)
            if not file:
                raise ValueError(f"File with ID {file_id} not found")

            # Get file content
            file_content = await lucydom_interface.read_file_content(file_id)
            if not file_content:
                raise ValueError(f"Content for file {file_id} not found")

            # Extract metadata
            file_name = file.get("name", "unnamed")
            content_type = file.get("content_type")
            file_type = file.get("type")

            # Process based on file type
            if file_type == "image" or (content_type and content_type.startswith("image/")):
                # Image analysis
                if hasattr(ai_service, "analyze_image"):
                    analysis = await ai_service.analyze_image(
                        image_data=file_content,
                        prompt=prompt,
                        mime_type=content_type
                    )

                    return {
                        "file_id": file_id,
                        "file_name": file_name,
                        "analysis_type": "image",
                        "result": analysis
                    }
                else:
                    raise ValueError("AI service does not support image analysis")

            elif file_name.endswith(".pdf"):
                # PDF analysis - first extract text, then analyze
                try:
                    # Extract text
                    text_content, is_extracted = extract_text_from_file_content(
                        file_content, file_name, content_type
                    )

                    if not is_extracted:
                        raise ValueError(f"Failed to extract text from PDF {file_name}")

                    # Analyze text with AI
                    # NOTE(review): the "# Limit ..." text below is INSIDE the
                    # f-string literal, so it is sent to the model as part of
                    # the prompt — confirm whether that is intentional.
                    pdf_analysis_prompt = f"""
                    Analyze the following PDF content based on this request:

                    REQUEST: {prompt}

                    PDF CONTENT:
                    {text_content[:10000]} # Limit to first 10K chars to avoid token limits
                    """

                    analysis = await ai_service.call_api([{"role": "user", "content": pdf_analysis_prompt}])

                    # Also check for images in the PDF
                    has_images = False
                    image_analysis = None

                    try:
                        # Extract and analyze images
                        image_results = await self.extract_and_analyze_pdf_images(
                            file_content,
                            f"Analyze images with respect to: {prompt}",
                            ai_service
                        )

                        if image_results and len(image_results) > 0:
                            has_images = True
                            image_analysis = "\n\nPDF IMAGES ANALYSIS:\n"
                            for img in image_results:
                                image_analysis += f"- Image on page {img.get('page')}: {img.get('response')}\n"
                    except Exception as img_err:
                        # Image analysis is best-effort; the text analysis still counts
                        logger.warning(f"Could not analyze images in PDF {file_name}: {str(img_err)}")

                    # Combine text and image analysis if available
                    if has_images and image_analysis:
                        analysis += image_analysis

                    return {
                        "file_id": file_id,
                        "file_name": file_name,
                        "analysis_type": "pdf",
                        "result": analysis,
                        "has_images": has_images
                    }

                except Exception as pdf_err:
                    logger.error(f"Error analyzing PDF {file_name}: {str(pdf_err)}")
                    raise

            elif file_name.endswith(('.xlsx', '.xls', '.csv')):
                # Tabular data analysis
                try:
                    # Extract text content
                    text_content, is_extracted = extract_text_from_file_content(
                        file_content, file_name, content_type
                    )

                    if not is_extracted:
                        raise ValueError(f"Failed to extract data from {file_name}")

                    # Analyze with AI
                    # NOTE(review): as above, the "# Limit ..." text is part of the prompt.
                    data_analysis_prompt = f"""
                    Analyze the following tabular data based on this request:

                    REQUEST: {prompt}

                    DATA CONTENT:
                    {text_content[:10000]} # Limit to first 10K chars

                    Provide a structured analysis including:
                    1. Data overview
                    2. Key insights
                    3. Patterns and trends
                    4. Answers to the specific request
                    """

                    analysis = await ai_service.call_api([{"role": "user", "content": data_analysis_prompt}])

                    return {
                        "file_id": file_id,
                        "file_name": file_name,
                        "analysis_type": "tabular_data",
                        "result": analysis
                    }

                except Exception as data_err:
                    logger.error(f"Error analyzing tabular data {file_name}: {str(data_err)}")
                    raise

            else:
                # Default to text analysis for all other file types
                try:
                    # Extract text content
                    text_content, is_extracted = extract_text_from_file_content(
                        file_content, file_name, content_type
                    )

                    if not is_extracted:
                        raise ValueError(f"Failed to extract text from {file_name}")

                    # Analyze with AI
                    text_analysis_prompt = f"""
                    Analyze the following document content based on this request:

                    REQUEST: {prompt}

                    DOCUMENT CONTENT:
                    {text_content[:10000]} # Limit to first 10K chars
                    """

                    analysis = await ai_service.call_api([{"role": "user", "content": text_analysis_prompt}])

                    return {
                        "file_id": file_id,
                        "file_name": file_name,
                        "analysis_type": "text",
                        "result": analysis
                    }

                except Exception as text_err:
                    logger.error(f"Error analyzing text content {file_name}: {str(text_err)}")
                    raise

        except Exception as e:
            logger.error(f"Error analyzing file {file_id}: {str(e)}")
            raise
|
|
|
|
    async def extract_and_analyze_pdf_images(self,
                                             pdf_content: bytes,
                                             prompt: str,
                                             ai_service
                                             ) -> List[Dict[str, Any]]:
        """
        Extract embedded images from a PDF and run an AI analysis on each one.

        Works on the raw PDF bytes (no file paths). Requires PyMuPDF (fitz);
        per-image failures are recorded in the results instead of aborting the
        whole run.

        Args:
            pdf_content: Raw bytes of the PDF file
            prompt: Prompt used for the image analysis
            ai_service: AI service performing the image analysis

        Returns:
            List of analysis results, one entry per successfully extracted image

        Raises:
            FileExtractionError: When PyMuPDF is missing or the PDF cannot be
                processed.
        """
        image_responses = []
        temp_files = []  # temporary files registered here for cleanup in finally

        try:
            # Open the PDF with PyMuPDF (imported lazily: the module must load
            # even when this optional dependency is absent)
            import fitz  # PyMuPDF
            # BytesIO is already imported at the top level
            import tempfile

            # Open the PDF from the in-memory bytes
            doc = fitz.open(stream=pdf_content, filetype="pdf")
            logger.info(f"PDF geöffnet mit {len(doc)} Seiten")

            for page_num, page in enumerate(doc, 1):
                # Find all images referenced on this page
                image_list = page.get_images(full=True)

                if image_list:
                    logger.info(f"Seite {page_num}: {len(image_list)} Bilder gefunden")

                    for img_index, img in enumerate(image_list):
                        try:
                            # Image cross-reference number inside the PDF
                            xref = img[0]

                            # Extract the image bytes and metadata
                            base_image = doc.extract_image(xref)
                            image_bytes = base_image["image"]  # actual image data
                            image_ext = base_image["ext"]  # file extension (jpg, png, etc.)

                            # Write to a temporary file (only needed for the
                            # PIL size fallback further below)
                            fd, temp_img_path = tempfile.mkstemp(suffix=f".{image_ext}")
                            temp_files.append(temp_img_path)  # register for cleanup

                            with os.fdopen(fd, 'wb') as img_file:
                                img_file.write(image_bytes)

                            logger.debug(f"Bild temporär gespeichert: {temp_img_path}")

                            # Run the AI analysis on the raw image bytes
                            try:
                                analysis_result = await ai_service.analyze_image(
                                    image_data=image_bytes,  # pass the image data directly
                                    prompt=prompt,
                                    mime_type=f"image/{image_ext}"
                                )
                                logger.debug(f"Bildanalyse für Bild {img_index} auf Seite {page_num} abgeschlossen")
                            except Exception as analyze_error:
                                logger.error(f"Fehler bei der Bildanalyse: {str(analyze_error)}")
                                analysis_result = f"[Fehler bei der Bildanalyse: {str(analyze_error)}]"

                            # Determine the image dimensions for the result record
                            try:
                                # Prefer the size reported by PyMuPDF
                                if 'width' in base_image and 'height' in base_image:
                                    image_size = f"{base_image['width']}x{base_image['height']}"
                                else:
                                    # Fallback: open the temporary file with PIL
                                    from PIL import Image
                                    with Image.open(temp_img_path) as img:
                                        width, height = img.size
                                        image_size = f"{width}x{height}"
                            except Exception as e:
                                logger.warning(f"Konnte Bildgröße nicht ermitteln: {str(e)}")
                                image_size = "unbekannt"

                            # Record the result for this image
                            image_responses.append({
                                "page": page_num,
                                "image_index": img_index,
                                "format": image_ext,
                                "image_size": image_size,
                                "response": analysis_result
                            })

                        except Exception as e:
                            # Skip this image but keep processing the rest
                            logger.warning(f"Fehler bei der Extraktion von Bild {img_index} auf Seite {page_num}: {str(e)}")
                            continue

            logger.info(f"Extrahiert und analysiert: {len(image_responses)} Bilder aus PDF")

        except ImportError:
            logger.error("PyMuPDF (fitz) ist nicht installiert. Installiere es mit 'pip install pymupdf'")
            raise FileExtractionError("PyMuPDF (fitz) ist nicht installiert")
        except Exception as e:
            logger.error(f"Fehler beim Extrahieren von PDF-Bildern: {str(e)}")
            raise FileExtractionError(f"Fehler beim Extrahieren von PDF-Bildern: {str(e)}")
        finally:
            # Always remove the temporary files, even on failure
            for temp_file in temp_files:
                try:
                    if os.path.exists(temp_file):
                        os.remove(temp_file)
                except Exception as e:
                    logger.warning(f"Konnte temporäre Datei nicht entfernen: {temp_file} - {str(e)}")

        return image_responses
|
|
|
|
async def analyze_multiple_files(
|
|
self,
|
|
file_ids: List[int],
|
|
prompt: str,
|
|
lucydom_interface,
|
|
ai_service
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Analyze multiple files and synthesize a combined result.
|
|
|
|
Args:
|
|
file_ids: List of file IDs to analyze
|
|
prompt: Analysis prompt
|
|
lucydom_interface: Interface for database access
|
|
ai_service: Service for AI requests
|
|
|
|
Returns:
|
|
Combined analysis result
|
|
"""
|
|
results = []
|
|
|
|
# Analyze each file
|
|
for file_id in file_ids:
|
|
try:
|
|
analysis = await self.analyze_file(file_id, prompt, lucydom_interface, ai_service)
|
|
results.append(analysis)
|
|
except Exception as e:
|
|
logger.error(f"Error analyzing file {file_id}: {str(e)}")
|
|
results.append({
|
|
"file_id": file_id,
|
|
"error": str(e),
|
|
"analysis_type": "error"
|
|
})
|
|
|
|
# Now synthesize a combined analysis
|
|
if results:
|
|
try:
|
|
# Prepare prompt for synthesis
|
|
synthesis_prompt = f"""
|
|
Synthesize a combined analysis based on these individual file analyses:
|
|
|
|
ORIGINAL REQUEST: {prompt}
|
|
|
|
INDIVIDUAL ANALYSES:
|
|
"""
|
|
|
|
for i, result in enumerate(results, 1):
|
|
file_name = result.get("file_name", f"File {i}")
|
|
analysis_type = result.get("analysis_type", "unknown")
|
|
analysis_result = result.get("result", "No analysis available")
|
|
|
|
synthesis_prompt += f"""
|
|
## {file_name} ({analysis_type})
|
|
{analysis_result}
|
|
|
|
---
|
|
"""
|
|
|
|
synthesis_prompt += """
|
|
Please provide a comprehensive synthesis that:
|
|
1. Combines insights from all files
|
|
2. Addresses the original request
|
|
3. Highlights connections between different files
|
|
4. Provides a unified conclusion
|
|
"""
|
|
|
|
# Call AI for synthesis
|
|
synthesis = await ai_service.call_api([{"role": "user", "content": synthesis_prompt}])
|
|
|
|
return {
|
|
"synthesis": synthesis,
|
|
"individual_results": results,
|
|
"files_analyzed": len(results)
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error synthesizing combined analysis: {str(e)}")
|
|
return {
|
|
"error": str(e),
|
|
"individual_results": results,
|
|
"files_analyzed": len(results)
|
|
}
|
|
else:
|
|
return {
|
|
"synthesis": "No files were successfully analyzed.",
|
|
"individual_results": [],
|
|
"files_analyzed": 0
|
|
}
|
|
|
|
def determine_file_type(self, file_name: str, content_type: str = None) -> str:
|
|
"""
|
|
Determine the file type based on name and content type.
|
|
|
|
Args:
|
|
file_name: Name of the file
|
|
content_type: MIME type (optional)
|
|
|
|
Returns:
|
|
File type string ('document', 'image', etc.)
|
|
"""
|
|
# Check content type first
|
|
if content_type:
|
|
if content_type.startswith('image/'):
|
|
return "image"
|
|
elif content_type in ['application/pdf']:
|
|
return "document"
|
|
elif content_type in ['application/vnd.ms-excel',
|
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
'text/csv']:
|
|
return "spreadsheet"
|
|
|
|
# Check file extension
|
|
lower_name = file_name.lower()
|
|
|
|
# Images
|
|
if lower_name.endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg')):
|
|
return "image"
|
|
|
|
# Documents
|
|
if lower_name.endswith(('.pdf', '.doc', '.docx', '.txt', '.md', '.rtf')):
|
|
return "document"
|
|
|
|
# Spreadsheets
|
|
if lower_name.endswith(('.xlsx', '.xls', '.csv')):
|
|
return "spreadsheet"
|
|
|
|
# Presentations
|
|
if lower_name.endswith(('.pptx', '.ppt')):
|
|
return "presentation"
|
|
|
|
# Data files
|
|
if lower_name.endswith(('.json', '.xml', '.yaml', '.yml')):
|
|
return "data"
|
|
|
|
# Default to document
|
|
return "document"
|
|
|
|
    def get_mime_type(self, file_name: str) -> str:
        """Resolve the MIME type for *file_name* by delegating to LucyDOMInterface.

        NOTE(review): this instantiates a throwaway LucyDOMInterface(0, 0) on
        every call just to reach its get_mime_type helper — consider the stdlib
        ``mimetypes.guess_type`` or a module-level helper instead.
        """
        # Import from lucydom_interface (imported lazily here — presumably to
        # avoid a module-level import cycle; TODO confirm)
        from lucydom_interface import LucyDOMInterface
        temp_interface = LucyDOMInterface(0, 0)  # Default values
        return temp_interface.get_mime_type(file_name)
|
|
|
|
def prepare_file_contexts(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Bereitet die Dateikontexte basierend auf Metadaten vor.
|
|
Akzeptiert keine Pfade mehr, sondern nur Metadaten aus der Datenbank.
|
|
|
|
Args:
|
|
files: Liste von Dateien mit Metadaten (Dict mit id, name, type, content_type)
|
|
|
|
Returns:
|
|
Liste von Dateikontexten für die Verarbeitung
|
|
"""
|
|
file_contexts = []
|
|
|
|
logger.info(f"Preparing file contexts for {len(files)} files")
|
|
|
|
for file in files:
|
|
file_id = file.get("id")
|
|
file_name = file.get("name")
|
|
file_type = file.get("type")
|
|
|
|
# Create a comprehensive context with all available metadata
|
|
context = {
|
|
"id": file_id,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"size": file.get("size", "Unbekannt"),
|
|
"content_type": file.get("content_type"),
|
|
"path": file.get("path"),
|
|
"upload_date": file.get("upload_date"),
|
|
"hash": file.get("hash"),
|
|
"mandate_id": file.get("mandate_id"),
|
|
"user_id": file.get("user_id")
|
|
}
|
|
|
|
# Log for debugging
|
|
logger.info(f"Created file context: {file_name} (ID: {file_id}, Type: {file_type})")
|
|
|
|
file_contexts.append(context)
|
|
|
|
return file_contexts
|
|
|
|
    # Factory method
    @staticmethod
    def get_instance():
        """Get the singleton instance of FileManager.

        NOTE(review): this staticmethod is the SECOND definition of
        ``get_instance`` in the class body; when the class body executes, the
        later binding replaces the @classmethod of the same name defined
        earlier. The two implementations behave identically, but one of them
        should be removed.
        """
        if FileManager._instance is None:
            FileManager._instance = FileManager()
        return FileManager._instance
|
|
|
|
|
|
# Create a singleton instance for module-level access
# (instantiated eagerly at import time, so every importer shares it)
file_manager = FileManager.get_instance()


def get_file_manager():
    """Get the singleton instance of FileManager.

    Accessor for the module-level singleton created at import time.
    """
    return file_manager
|
|
|
|
|
|
class WorkflowFileManager:
|
|
"""
|
|
Specialized file manager for workflow operations.
|
|
Handles workflow-specific file operations and document management.
|
|
"""
|
|
|
|
def __init__(self, workflow_id: str = None, lucydom_interface = None):
|
|
"""
|
|
Initialize the workflow file manager.
|
|
|
|
Args:
|
|
workflow_id: Optional workflow ID for context
|
|
lucydom_interface: LucyDOM interface for database operations
|
|
"""
|
|
self.workflow_id = workflow_id
|
|
self.lucydom_interface = lucydom_interface
|
|
self.file_manager = get_file_manager()
|
|
|
|
def set_workflow_id(self, workflow_id: str):
|
|
"""Set or update the workflow ID."""
|
|
self.workflow_id = workflow_id
|
|
|
|
def set_lucydom_interface(self, lucydom_interface):
|
|
"""Set or update the LucyDOM interface."""
|
|
self.lucydom_interface = lucydom_interface
|
|
|
|
async def add_files_to_message(self,
|
|
message: Dict[str, Any],
|
|
file_ids: List[int],
|
|
add_log_func = None) -> Dict[str, Any]:
|
|
"""
|
|
Add multiple files to a message.
|
|
|
|
Args:
|
|
message: The message to add files to
|
|
file_ids: List of file IDs to add
|
|
add_log_func: Optional logging function
|
|
|
|
Returns:
|
|
Updated message
|
|
"""
|
|
if not self.lucydom_interface:
|
|
_log(add_log_func, self.workflow_id, "LucyDOM interface not available", "error")
|
|
return message
|
|
|
|
updated_message = message.copy()
|
|
|
|
# Get file metadata
|
|
files = []
|
|
for file_id in file_ids:
|
|
file = self.lucydom_interface.get_file(file_id)
|
|
if file:
|
|
files.append(file)
|
|
else:
|
|
_log(add_log_func, self.workflow_id, f"File not found: {file_id}", "warning")
|
|
|
|
# Prepare file contexts
|
|
file_contexts = self.file_manager.prepare_file_contexts(files)
|
|
|
|
# Read file contents
|
|
file_contents = await self.file_manager.read_file_contents(
|
|
file_contexts,
|
|
self.lucydom_interface,
|
|
self.workflow_id,
|
|
add_log_func
|
|
)
|
|
|
|
# Add files to message
|
|
for file_id, content_data in file_contents.items():
|
|
# Add file to message
|
|
updated_message = FileManager.add_file_to_message(updated_message, content_data)
|
|
|
|
return updated_message
|
|
|
|
def get_files_from_message(self, message: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Extract file references from a message.
|
|
|
|
Args:
|
|
message: The message to extract files from
|
|
|
|
Returns:
|
|
List of file metadata
|
|
"""
|
|
files = []
|
|
|
|
# Process documents
|
|
for doc in message.get("documents", []):
|
|
source = doc.get("source", {})
|
|
|
|
# Only include file documents
|
|
if source.get("type") == "file":
|
|
file_info = {
|
|
"id": source.get("id", ""),
|
|
"name": source.get("name", ""),
|
|
"type": source.get("content_type", ""),
|
|
"content_type": source.get("content_type", ""),
|
|
"size": source.get("size", 0)
|
|
}
|
|
|
|
files.append(file_info)
|
|
|
|
return files
|
|
|
|
def get_document_text_content(self, message: Dict[str, Any]) -> str:
|
|
"""
|
|
Extract text content from all documents in a message.
|
|
|
|
Args:
|
|
message: The message to extract content from
|
|
|
|
Returns:
|
|
Combined text content
|
|
"""
|
|
content = ""
|
|
|
|
# Process all documents
|
|
for doc in message.get("documents", []):
|
|
for doc_content in doc.get("contents", []):
|
|
if doc_content.get("type") == "text":
|
|
content += "\n\n" + doc_content.get("text", "")
|
|
|
|
return content
|
|
|
|
async def extract_document_info(self,
|
|
workflow: Dict[str, Any],
|
|
message_id: str = None) -> Dict[str, Any]:
|
|
"""
|
|
Extract document information from a workflow or specific message.
|
|
|
|
Args:
|
|
workflow: The workflow object
|
|
message_id: Optional message ID to focus on a specific message
|
|
|
|
Returns:
|
|
Document information
|
|
"""
|
|
result = {
|
|
"documents": [],
|
|
"file_count": 0,
|
|
"extracted_text": ""
|
|
}
|
|
|
|
if message_id:
|
|
# Process only the specified message
|
|
for message in workflow.get("messages", []):
|
|
if message.get("id") == message_id:
|
|
files = self.get_files_from_message(message)
|
|
result["documents"].extend(files)
|
|
result["file_count"] = len(files)
|
|
result["extracted_text"] = self.get_document_text_content(message)
|
|
break
|
|
else:
|
|
# Process all messages
|
|
for message in workflow.get("messages", []):
|
|
files = self.get_files_from_message(message)
|
|
result["documents"].extend(files)
|
|
result["extracted_text"] += self.get_document_text_content(message)
|
|
|
|
# De-duplicate files
|
|
unique_files = {}
|
|
for file in result["documents"]:
|
|
file_id = file.get("id")
|
|
if file_id and file_id not in unique_files:
|
|
unique_files[file_id] = file
|
|
|
|
result["documents"] = list(unique_files.values())
|
|
result["file_count"] = len(result["documents"])
|
|
|
|
return result
|
|
|
|
async def analyze_workflow_documents(self,
|
|
workflow: Dict[str, Any],
|
|
prompt: str,
|
|
ai_service,
|
|
message_id: str = None) -> Dict[str, Any]:
|
|
"""
|
|
Analyze documents in a workflow.
|
|
|
|
Args:
|
|
workflow: The workflow object
|
|
prompt: Analysis prompt
|
|
ai_service: Service for AI analysis
|
|
message_id: Optional message ID to focus on specific message
|
|
|
|
Returns:
|
|
Analysis result
|
|
"""
|
|
if not self.lucydom_interface:
|
|
raise ValueError("LucyDOM interface not available")
|
|
|
|
if not ai_service:
|
|
raise ValueError("AI service not available")
|
|
|
|
# Extract document info
|
|
doc_info = await self.extract_document_info(workflow, message_id)
|
|
|
|
if doc_info["file_count"] == 0:
|
|
return {
|
|
"result": "No documents found for analysis",
|
|
"files_analyzed": 0
|
|
}
|
|
|
|
# Get file IDs
|
|
file_ids = [doc.get("id") for doc in doc_info["documents"] if doc.get("id")]
|
|
|
|
# Analyze files
|
|
analysis = await self.file_manager.analyze_multiple_files(
|
|
file_ids,
|
|
prompt,
|
|
self.lucydom_interface,
|
|
ai_service
|
|
)
|
|
|
|
return analysis
|
|
|
|
|
|
# Export the workflow file manager factory function
def get_workflow_file_manager(workflow_id: str = None, lucydom_interface = None):
    """Create a fresh WorkflowFileManager bound to the given workflow and interface."""
    return WorkflowFileManager(workflow_id=workflow_id, lucydom_interface=lucydom_interface)