1206 lines
No EOL
49 KiB
Python
1206 lines
No EOL
49 KiB
Python
"""
|
|
Central file management module for the Agentservice.
|
|
"""
|
|
|
|
import os
|
|
import logging
|
|
import base64
|
|
import json
|
|
import uuid
|
|
from datetime import datetime
|
|
from typing import List, Dict, Any, Optional, Tuple, Union, BinaryIO
|
|
from io import BytesIO
|
|
|
|
# Import utilities from agentservice_utils
|
|
from modules.agentservice_utils import extract_text_from_file_content, is_text_extractable
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Helper function for adding logs
|
|
def _log(add_log_func, workflow_id, message, level="info"):
|
|
"""Helper function for adding logs with standardized formatting."""
|
|
if add_log_func and workflow_id:
|
|
add_log_func(workflow_id, message, level)
|
|
|
|
# Also log to standard logger
|
|
if level == "info":
|
|
logger.info(message)
|
|
elif level == "warning":
|
|
logger.warning(message)
|
|
elif level == "error":
|
|
logger.error(message)
|
|
|
|
class FileExtractionError(Exception):
    """Exception for file extraction errors.

    Raised when text or data cannot be extracted from an uploaded file.
    """
    pass
|
|
|
|
|
|
|
|
class FileManager:
|
|
"""Central file management for the Agentservice."""
|
|
|
|
_instance = None
|
|
|
|
@classmethod
|
|
def get_instance(cls):
|
|
"""Get the singleton instance of FileManager."""
|
|
if cls._instance is None:
|
|
cls._instance = cls()
|
|
return cls._instance
|
|
|
|
def __init__(self):
|
|
"""Initialize the FileManager."""
|
|
# Ensure singleton pattern
|
|
if FileManager._instance is not None:
|
|
raise RuntimeError("Singleton instance already exists - use get_instance()")
|
|
|
|
# Import utilities
|
|
# Instead of storing file_utils, we'll use the imported functions directly
|
|
|
|
    async def read_file_contents(self,
                                 file_contexts: List[Dict[str, Any]],
                                 lucydom_interface,
                                 workflow_id: str = None,
                                 add_log_func = None,
                                 ai_service = None,
                                 extraction_context: str = None
                                 ) -> Dict[str, Dict[str, Any]]:
        """
        Read file contents with optional contextual extraction.

        Images are sent to the AI service for analysis (using
        ``extraction_context`` as the prompt when provided); documents and
        text-like files go through ``extract_text_from_file_content``; all
        other types only get a metadata placeholder. Per-file errors are
        caught and recorded in the result rather than raised.

        Args:
            file_contexts: List of file contexts with metadata
                (each must contain "id" and "name"; "type" and
                "content_type" are optional)
            lucydom_interface: LucyDOM interface for file access
            workflow_id: Optional workflow ID for logging
            add_log_func: Optional function for adding logs
            ai_service: AI service for image analysis
            extraction_context: Optional context prompt for extraction

        Returns:
            Dictionary keyed by file id, each value a dict with "content",
            "is_extracted", "name", "type", "content_type" (and
            "extraction_context" for analyzed images)
        """
        file_contents = {}
        # Debug logging of the batch size before processing starts.
        logger.info(f"Reading contents of {len(file_contexts)} files for workflow {workflow_id}")

        for file in file_contexts:
            file_id = file["id"]
            file_name = file["name"]
            file_type = file.get("type", "unknown")
            content_type = file.get("content_type")

            try:
                # Fetch the raw file content via the LucyDOM interface.
                file_data = await lucydom_interface.read_file_content(file_id)

                if not file_data:
                    # No data returned: record a placeholder entry and continue.
                    _log(add_log_func, workflow_id, f"Datei {file_name} nicht gefunden", "warning")
                    file_contents[file_id] = {
                        "content": f"File content not available (File not found)",
                        "is_extracted": False,
                        "name": file_name,
                        "type": file_type,
                        "content_type": content_type
                    }
                    continue

                logger.info(f"Successfully read file: {file_name} (ID: {file_id}, Type: {file_type})")

                # Image files: delegate to AI image analysis with the
                # extraction context as prompt (if given).
                if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
                    if ai_service and hasattr(ai_service, "analyze_image"):
                        try:
                            # Use the extraction context if provided, otherwise
                            # a generic description prompt.
                            prompt = extraction_context or "Describe this image in detail"

                            image_analysis = await ai_service.analyze_image(
                                image_data=file_data,
                                prompt=prompt,  # contextual prompt
                                mime_type=content_type
                            )

                            file_contents[file_id] = {
                                "content": f"Image Analysis:\n{image_analysis}",
                                "is_extracted": True,  # mark as extracted
                                "name": file_name,
                                "type": file_type,
                                "content_type": content_type,
                                "extraction_context": prompt  # prompt actually used
                            }
                            _log(add_log_func, workflow_id, f"Image {file_name} analyzed successfully", "info")
                        except Exception as e:
                            # Analysis failure is recorded in the entry, not raised.
                            logger.error(f"Error analyzing image {file_name}: {str(e)}")
                            _log(add_log_func, workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error")
                            file_contents[file_id] = {
                                "content": f"Image file: {file_name} (Analysis failed: {str(e)})",
                                "is_extracted": False,
                                "name": file_name,
                                "type": file_type,
                                "content_type": content_type
                            }
                    else:
                        # No AI service (or no analyze_image support): placeholder only.
                        file_contents[file_id] = {
                            "content": f"Image file: {file_name} (AI analysis not available)",
                            "is_extracted": False,
                            "name": file_name,
                            "type": file_type,
                            "content_type": content_type
                        }

                # Document and text files.
                elif (file_type == "document" or not file_type or file_name.lower().endswith(('.csv', '.txt', '.json', '.xml')) or (content_type and content_type.startswith('text/'))):
                    # Use the central text-extraction helper on the raw content.
                    content, is_extracted = extract_text_from_file_content(
                        file_data, file_name, content_type
                    )
                    file_contents[file_id] = {
                        "content": content,
                        "is_extracted": is_extracted,
                        "name": file_name,
                        "type": file_type,
                        "content_type": content_type
                    }
                    _log(add_log_func, workflow_id,
                         f"File {file_name} read successfully (extracted: {is_extracted})", "info")

                # Other file types: store metadata only.
                else:
                    file_contents[file_id] = {
                        "content": f"File: {file_name} (Type: {file_type}, content not available)",
                        "is_extracted": False,
                        "name": file_name,
                        "type": file_type,
                        "content_type": content_type
                    }
                    _log(add_log_func, workflow_id, f"Unsupported file type: {file_type} for {file_name}", "warning")

            except Exception as e:
                # Catch-all per file so one failure does not abort the batch.
                logger.error(f"Error reading file {file_name}: {str(e)}")
                _log(add_log_func, workflow_id, f"Error reading file {file_name}: {str(e)}", "error")
                file_contents[file_id] = {
                    "content": f"File content not available (Error: {str(e)})",
                    "is_extracted": False,
                    "name": file_name,
                    "type": file_type,
                    "content_type": content_type
                }

        return file_contents
|
|
|
|
@staticmethod
|
|
def add_file_to_message(message: Dict[str, Any], file_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""
|
|
Add a file to a message with consistent document structure.
|
|
|
|
Args:
|
|
message: The message to add the file to
|
|
file_data: File metadata and content
|
|
|
|
Returns:
|
|
Updated message with the file added
|
|
"""
|
|
logger.info(f"Adding file to message: {file_data.get('name', 'unnamed_file')} (ID: {file_data.get('id', 'unknown')})")
|
|
|
|
# Initialize documents array if needed
|
|
if "documents" not in message:
|
|
message["documents"] = []
|
|
|
|
# Create a unique ID for the document if not provided
|
|
doc_id = file_data.get("id", f"file_{uuid.uuid4()}")
|
|
|
|
# Extract metadata
|
|
file_size = file_data.get("size")
|
|
if isinstance(file_size, str) and file_size.isdigit():
|
|
file_size = int(file_size)
|
|
elif file_size is None and file_data.get("content"):
|
|
file_size = len(file_data.get("content", ""))
|
|
|
|
# Determine if content is already extracted
|
|
content = file_data.get("content", "No content available")
|
|
file_name = file_data.get("name", "unnamed_file")
|
|
content_type = file_data.get("content_type")
|
|
is_extracted = file_data.get("is_extracted", False)
|
|
|
|
# Create standard document structure that follows the data model
|
|
document = {
|
|
"id": f"doc_{uuid.uuid4()}", # Unique document ID separate from file ID
|
|
"source": {
|
|
"type": "file",
|
|
"id": doc_id,
|
|
"name": file_name,
|
|
"content_type": content_type,
|
|
"size": file_size,
|
|
"upload_date": file_data.get("upload_date", datetime.now().isoformat())
|
|
},
|
|
"contents": [
|
|
{
|
|
"type": "text",
|
|
"text": content,
|
|
"is_extracted": is_extracted,
|
|
"extraction_context": file_data.get("extraction_context", None)
|
|
}
|
|
]
|
|
}
|
|
|
|
# Check if file is already in the message
|
|
file_already_added = any(
|
|
doc.get("source", {}).get("id") == doc_id
|
|
for doc in message.get("documents", [])
|
|
)
|
|
|
|
if not file_already_added:
|
|
message["documents"].append(document)
|
|
logger.info(f"File {file_name} added to message (total: {len(message.get('documents', []))} files)")
|
|
else:
|
|
logger.info(f"File {file_name} already exists in message, skipping")
|
|
|
|
return message
|
|
|
|
|
|
async def analyze_file(self, file_id: int, prompt: str, lucydom_interface, ai_service) -> Dict[str, Any]:
|
|
"""
|
|
Analyze a file using the appropriate method based on file type.
|
|
|
|
Args:
|
|
file_id: ID of the file to analyze
|
|
prompt: Analysis prompt
|
|
lucydom_interface: Interface for database access
|
|
ai_service: Service for AI requests
|
|
|
|
Returns:
|
|
Analysis result
|
|
"""
|
|
if not lucydom_interface:
|
|
raise ValueError("LucyDOM interface not available")
|
|
|
|
if not ai_service:
|
|
raise ValueError("AI service not available")
|
|
|
|
try:
|
|
# Get file metadata
|
|
file = lucydom_interface.get_file(file_id)
|
|
if not file:
|
|
raise ValueError(f"File with ID {file_id} not found")
|
|
|
|
# Get file content
|
|
file_content = await lucydom_interface.read_file_content(file_id)
|
|
if not file_content:
|
|
raise ValueError(f"Content for file {file_id} not found")
|
|
|
|
# Extract metadata
|
|
file_name = file.get("name", "unnamed")
|
|
content_type = file.get("content_type")
|
|
file_type = file.get("type")
|
|
|
|
# Process based on file type
|
|
if file_type == "image" or (content_type and content_type.startswith("image/")):
|
|
# Image analysis
|
|
if hasattr(ai_service, "analyze_image"):
|
|
analysis = await ai_service.analyze_image(
|
|
image_data=file_content,
|
|
prompt=prompt,
|
|
mime_type=content_type
|
|
)
|
|
|
|
return {
|
|
"file_id": file_id,
|
|
"file_name": file_name,
|
|
"analysis_type": "image",
|
|
"result": analysis
|
|
}
|
|
else:
|
|
raise ValueError("AI service does not support image analysis")
|
|
|
|
elif file_name.endswith(".pdf"):
|
|
# PDF analysis - first extract text, then analyze
|
|
try:
|
|
# Extract text
|
|
text_content, is_extracted = extract_text_from_file_content(
|
|
file_content, file_name, content_type
|
|
)
|
|
|
|
if not is_extracted:
|
|
raise ValueError(f"Failed to extract text from PDF {file_name}")
|
|
|
|
# Analyze text with AI
|
|
pdf_analysis_prompt = f"""
|
|
Analyze the following PDF content based on this request:
|
|
|
|
REQUEST: {prompt}
|
|
|
|
PDF CONTENT:
|
|
{text_content} # In a future release to split into tokensets, if too big file
|
|
"""
|
|
|
|
analysis = await ai_service.call_api([{"role": "user", "content": pdf_analysis_prompt}])
|
|
|
|
# Also check for images in the PDF
|
|
has_images = False
|
|
image_analysis = None
|
|
|
|
try:
|
|
# Extract and analyze images
|
|
image_results = await self.extract_and_analyze_pdf_images(
|
|
file_content,
|
|
f"Analyze images with respect to: {prompt}",
|
|
ai_service
|
|
)
|
|
|
|
if image_results and len(image_results) > 0:
|
|
has_images = True
|
|
image_analysis = "\n\nPDF IMAGES ANALYSIS:\n"
|
|
for img in image_results:
|
|
image_analysis += f"- Image on page {img.get('page')}: {img.get('response')}\n"
|
|
except Exception as img_err:
|
|
logger.warning(f"Could not analyze images in PDF {file_name}: {str(img_err)}")
|
|
|
|
# Combine text and image analysis if available
|
|
if has_images and image_analysis:
|
|
analysis += image_analysis
|
|
|
|
return {
|
|
"file_id": file_id,
|
|
"file_name": file_name,
|
|
"analysis_type": "pdf",
|
|
"result": analysis,
|
|
"has_images": has_images
|
|
}
|
|
|
|
except Exception as pdf_err:
|
|
logger.error(f"Error analyzing PDF {file_name}: {str(pdf_err)}")
|
|
raise
|
|
|
|
elif file_name.endswith(('.xlsx', '.xls', '.csv')):
|
|
# Tabular data analysis
|
|
try:
|
|
# Extract text content
|
|
text_content, is_extracted = extract_text_from_file_content(
|
|
file_content, file_name, content_type
|
|
)
|
|
|
|
if not is_extracted:
|
|
raise ValueError(f"Failed to extract data from {file_name}")
|
|
|
|
# Analyze with AI
|
|
data_analysis_prompt = f"""
|
|
Analyze the following tabular data based on this request:
|
|
|
|
REQUEST: {prompt}
|
|
|
|
DATA CONTENT:
|
|
{text_content} # In a future release to split into tokensets to limit storage
|
|
|
|
Provide a structured analysis including:
|
|
1. Data overview
|
|
2. Key insights
|
|
3. Patterns and trends
|
|
4. Answers to the specific request
|
|
"""
|
|
|
|
analysis = await ai_service.call_api([{"role": "user", "content": data_analysis_prompt}])
|
|
|
|
return {
|
|
"file_id": file_id,
|
|
"file_name": file_name,
|
|
"analysis_type": "tabular_data",
|
|
"result": analysis
|
|
}
|
|
|
|
except Exception as data_err:
|
|
logger.error(f"Error analyzing tabular data {file_name}: {str(data_err)}")
|
|
raise
|
|
|
|
else:
|
|
# Default to text analysis for all other file types
|
|
try:
|
|
# Extract text content
|
|
text_content, is_extracted = extract_text_from_file_content(
|
|
file_content, file_name, content_type
|
|
)
|
|
|
|
if not is_extracted:
|
|
raise ValueError(f"Failed to extract text from {file_name}")
|
|
|
|
# Analyze with AI
|
|
text_analysis_prompt = f"""
|
|
Analyze the following document content based on this request:
|
|
|
|
REQUEST: {prompt}
|
|
|
|
DOCUMENT CONTENT:
|
|
{text_content} # In a future release to split into tokensets
|
|
"""
|
|
|
|
analysis = await ai_service.call_api([{"role": "user", "content": text_analysis_prompt}])
|
|
|
|
return {
|
|
"file_id": file_id,
|
|
"file_name": file_name,
|
|
"analysis_type": "text",
|
|
"result": analysis
|
|
}
|
|
|
|
except Exception as text_err:
|
|
logger.error(f"Error analyzing text content {file_name}: {str(text_err)}")
|
|
raise
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error analyzing file {file_id}: {str(e)}")
|
|
raise
|
|
|
|
|
|
    async def extract_and_analyze_pdf_images(self,
                                             pdf_content: bytes,
                                             prompt: str,
                                             ai_service
                                             ) -> List[Dict[str, Any]]:
        """
        Extract images from a PDF file and analyze them.
        Works with binary data instead of file paths.

        Tries four extraction strategies per page, in order:
        1. ``page.get_images`` (embedded image XObjects),
        2. ``page.get_drawings`` entries that look like images,
        3. image blocks from ``page.get_text("dict")``,
        4. rendering the whole page as a fallback when the page produced
           nothing so far.
        Every extracted image is sent to ``ai_service.analyze_image``; all
        failures are logged and swallowed so one bad image never aborts the
        extraction. Results are deduplicated by (page, pixel size).

        Args:
            pdf_content: Binary data of the PDF file
            prompt: Prompt for image analysis
            ai_service: AI service for image analysis

        Returns:
            List with analysis results for each image (dicts with "page",
            "image_index", "format", "image_size", "method", "response");
            empty list when PyMuPDF is missing or the PDF cannot be opened.
        """
        image_responses = []
        # NOTE(review): temp_files is never populated anywhere in this
        # method — the finally-block cleanup below is currently dead code.
        temp_files = []  # List of temporary files for cleanup

        try:
            # Import required libraries (PyMuPDF is an optional dependency).
            try:
                import fitz  # PyMuPDF
                from io import BytesIO
                import tempfile

                logger.info(f"Starting PDF image extraction with PyMuPDF")
            except ImportError:
                logger.error("PyMuPDF (fitz) is not installed. Install it with 'pip install pymupdf'")
                return []

            # Open the PDF entirely in memory.
            try:
                doc = fitz.open(stream=pdf_content, filetype="pdf")
                page_count = len(doc)
                logger.info(f"PDF opened with {page_count} pages")
            except Exception as pdf_err:
                logger.error(f"Error opening PDF: {str(pdf_err)}")
                return []

            # Process each page with multiple extraction methods.
            for page_num, page in enumerate(doc, 1):
                logger.info(f"Processing page {page_num}/{page_count}")

                # Method 1: standard extraction using get_images.
                try:
                    image_list = page.get_images(full=True)
                    if image_list:
                        logger.info(f"Method 1: Found {len(image_list)} images on page {page_num}")

                        for img_index, img in enumerate(image_list):
                            try:
                                xref = img[0]  # image xref number

                                # Extract the raw image data for this xref.
                                base_image = doc.extract_image(xref)
                                image_bytes = base_image["image"]
                                image_ext = base_image["ext"]

                                # Skip empty / implausibly small image payloads.
                                if not image_bytes or len(image_bytes) < 100:
                                    logger.warning(f"Empty or very small image data for image {img_index+1} on page {page_num}")
                                    continue

                                # Analyze the image via the AI service.
                                analysis_result = await ai_service.analyze_image(
                                    image_data=image_bytes,
                                    prompt=prompt,
                                    mime_type=f"image/{image_ext}"
                                )

                                # Record pixel size for later deduplication.
                                image_size = f"{base_image.get('width', 0)}x{base_image.get('height', 0)}"

                                image_responses.append({
                                    "page": page_num,
                                    "image_index": img_index,
                                    "format": image_ext,
                                    "image_size": image_size,
                                    "method": "get_images",
                                    "response": analysis_result
                                })

                                logger.info(f"Successfully analyzed image {img_index+1} on page {page_num} using method 1")
                            except Exception as e:
                                logger.warning(f"Error processing image {img_index} on page {page_num} (Method 1): {str(e)}")
                    else:
                        logger.info(f"Method 1: No images found on page {page_num} using get_images")
                except Exception as m1_err:
                    logger.warning(f"Error in Method 1 for page {page_num}: {str(m1_err)}")

                # Method 2: extract embedded images via page.get_drawings().
                try:
                    drawings = page.get_drawings()
                    drawing_images = 0

                    for drawing_index, drawing in enumerate(drawings):
                        try:
                            # Heuristic: treat a drawing as an image when its
                            # repr mentions "image".
                            if "image" in str(drawing).lower():
                                drawing_images += 1
                                rect = drawing["rect"]  # bounding rectangle

                                # Render the rectangle at 2x zoom to PNG bytes.
                                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=rect)
                                img_bytes = pix.tobytes("png")

                                # Analyze the rendered region.
                                analysis_result = await ai_service.analyze_image(
                                    image_data=img_bytes,
                                    prompt=f"{prompt} (Page {page_num}, Drawing {drawing_index+1})",
                                    mime_type="image/png"
                                )

                                image_responses.append({
                                    "page": page_num,
                                    "image_index": drawing_index,
                                    "format": "png",
                                    "image_size": f"{pix.width}x{pix.height}",
                                    "method": "get_drawings",
                                    "response": analysis_result
                                })

                                logger.info(f"Successfully analyzed drawing image {drawing_index+1} on page {page_num} using method 2")
                        except Exception as drawing_err:
                            logger.warning(f"Error processing drawing {drawing_index} on page {page_num}: {str(drawing_err)}")

                    if drawing_images > 0:
                        logger.info(f"Method 2: Processed {drawing_images} images from drawings on page {page_num}")
                    else:
                        logger.info(f"Method 2: No images found in drawings on page {page_num}")
                except Exception as m2_err:
                    logger.warning(f"Error in Method 2 for page {page_num}: {str(m2_err)}")

                # Method 3: extract via text-layout block detection.
                try:
                    blocks = page.get_text("dict")["blocks"]
                    img_blocks = [b for b in blocks if b.get("type") == 1]  # type 1 = image

                    if img_blocks:
                        logger.info(f"Method 3: Found {len(img_blocks)} image blocks on page {page_num}")

                        for block_index, block in enumerate(img_blocks):
                            try:
                                # Render the block's bounding box at 2x zoom.
                                rect = block["bbox"]
                                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=rect)
                                img_bytes = pix.tobytes("png")

                                # Analyze the rendered block.
                                analysis_result = await ai_service.analyze_image(
                                    image_data=img_bytes,
                                    prompt=f"{prompt} (Page {page_num}, Block {block_index+1})",
                                    mime_type="image/png"
                                )

                                image_responses.append({
                                    "page": page_num,
                                    "image_index": block_index,
                                    "format": "png",
                                    "image_size": f"{pix.width}x{pix.height}",
                                    "method": "block_extraction",
                                    "response": analysis_result
                                })

                                logger.info(f"Successfully analyzed image block {block_index+1} on page {page_num} using method 3")
                            except Exception as block_err:
                                logger.warning(f"Error processing block {block_index} on page {page_num}: {str(block_err)}")
                    else:
                        logger.info(f"Method 3: No image blocks found on page {page_num}")
                except Exception as m3_err:
                    logger.warning(f"Error in Method 3 for page {page_num}: {str(m3_err)}")

                # Method 4: last resort — render the entire page as one image
                # when methods 1-3 produced nothing for this page.
                if not image_responses or not any(resp.get("page") == page_num for resp in image_responses):
                    try:
                        logger.info(f"Method 4: Rendering entire page {page_num} as image")

                        # Render the whole page at 2x zoom.
                        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                        img_bytes = pix.tobytes("png")

                        # Analyze the page rendering.
                        analysis_result = await ai_service.analyze_image(
                            image_data=img_bytes,
                            prompt=f"{prompt} (Full page {page_num})",
                            mime_type="image/png"
                        )

                        image_responses.append({
                            "page": page_num,
                            "image_index": 0,
                            "format": "png",
                            "image_size": f"{pix.width}x{pix.height}",
                            "method": "full_page_render",
                            "response": analysis_result
                        })

                        logger.info(f"Successfully analyzed full page {page_num} as image using method 4")
                    except Exception as m4_err:
                        logger.warning(f"Error in Method 4 for page {page_num}: {str(m4_err)}")

            # Close the document
            doc.close()

            # Deduplicate results (different methods might extract the same
            # image). NOTE(review): the key is page + pixel size only, so two
            # *different* images of identical size on the same page would be
            # collapsed into one — confirm this is acceptable.
            deduplicated_responses = []
            seen_areas = set()

            for response in image_responses:
                # Unique identifier for the image area.
                area_key = f"{response['page']}_{response['image_size']}"

                if area_key not in seen_areas:
                    seen_areas.add(area_key)
                    deduplicated_responses.append(response)

            logger.info(f"PDF image extraction complete: Found {len(image_responses)} images, deduplicated to {len(deduplicated_responses)}")
            return deduplicated_responses

        except ImportError as imp_err:
            logger.error(f"Required library not available for PDF image extraction: {str(imp_err)}")
            return []
        except Exception as e:
            logger.error(f"Error extracting images from PDF: {str(e)}")
            return []
        finally:
            # Clean up temporary files (currently none are ever created).
            for temp_file in temp_files:
                try:
                    if os.path.exists(temp_file):
                        os.remove(temp_file)
                except Exception as e:
                    logger.warning(f"Could not remove temporary file: {temp_file} - {str(e)}")
|
|
|
|
|
|
    async def analyze_multiple_files(
        self,
        file_ids: List[int],
        prompt: str,
        lucydom_interface,
        ai_service
    ) -> Dict[str, Any]:
        """
        Analyze multiple files and synthesize a combined result.

        Each file is analyzed individually via ``analyze_file``; per-file
        failures are recorded as error entries instead of aborting the batch.
        A final AI call then synthesizes the individual analyses into one
        combined answer.

        Args:
            file_ids: List of file IDs to analyze
            prompt: Analysis prompt
            lucydom_interface: Interface for database access
            ai_service: Service for AI requests

        Returns:
            Dict with "synthesis" (or "error"), "individual_results" and
            "files_analyzed"
        """
        results = []

        # Analyze each file; errors become error entries in the results list.
        for file_id in file_ids:
            try:
                analysis = await self.analyze_file(file_id, prompt, lucydom_interface, ai_service)
                results.append(analysis)
            except Exception as e:
                logger.error(f"Error analyzing file {file_id}: {str(e)}")
                results.append({
                    "file_id": file_id,
                    "error": str(e),
                    "analysis_type": "error"
                })

        # Now synthesize a combined analysis across all files.
        if results:
            try:
                # Build the synthesis prompt from the individual results
                # (error entries are included with their placeholder text).
                synthesis_prompt = f"""
                Synthesize a combined analysis based on these individual file analyses:

                ORIGINAL REQUEST: {prompt}

                INDIVIDUAL ANALYSES:
                """

                for i, result in enumerate(results, 1):
                    file_name = result.get("file_name", f"File {i}")
                    analysis_type = result.get("analysis_type", "unknown")
                    analysis_result = result.get("result", "No analysis available")

                    synthesis_prompt += f"""
                    ## {file_name} ({analysis_type})
                    {analysis_result}

                    ---
                    """

                synthesis_prompt += """
                Please provide a comprehensive synthesis that:
                1. Combines insights from all files
                2. Addresses the original request
                3. Highlights connections between different files
                4. Provides a unified conclusion
                """

                # Call AI for synthesis
                synthesis = await ai_service.call_api([{"role": "user", "content": synthesis_prompt}])

                return {
                    "synthesis": synthesis,
                    "individual_results": results,
                    "files_analyzed": len(results)
                }

            except Exception as e:
                # Synthesis failed: still return the individual results.
                logger.error(f"Error synthesizing combined analysis: {str(e)}")
                return {
                    "error": str(e),
                    "individual_results": results,
                    "files_analyzed": len(results)
                }
        else:
            # No files were given / nothing was analyzed.
            return {
                "synthesis": "No files were successfully analyzed.",
                "individual_results": [],
                "files_analyzed": 0
            }
|
|
|
|
def determine_file_type(self, file_name: str, content_type: str = None) -> str:
|
|
"""
|
|
Determine the file type based on name and content type.
|
|
|
|
Args:
|
|
file_name: Name of the file
|
|
content_type: MIME type (optional)
|
|
|
|
Returns:
|
|
File type string ('document', 'image', etc.)
|
|
"""
|
|
# Check content type first
|
|
if content_type:
|
|
if content_type.startswith('image/'):
|
|
return "image"
|
|
elif content_type in ['application/pdf']:
|
|
return "document"
|
|
elif content_type in ['application/vnd.ms-excel',
|
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
'text/csv']:
|
|
return "spreadsheet"
|
|
|
|
# Check file extension
|
|
lower_name = file_name.lower()
|
|
|
|
# Images
|
|
if lower_name.endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg')):
|
|
return "image"
|
|
|
|
# Documents
|
|
if lower_name.endswith(('.pdf', '.doc', '.docx', '.txt', '.md', '.rtf')):
|
|
return "document"
|
|
|
|
# Spreadsheets
|
|
if lower_name.endswith(('.xlsx', '.xls', '.csv')):
|
|
return "spreadsheet"
|
|
|
|
# Presentations
|
|
if lower_name.endswith(('.pptx', '.ppt')):
|
|
return "presentation"
|
|
|
|
# Data files
|
|
if lower_name.endswith(('.json', '.xml', '.yaml', '.yml')):
|
|
return "data"
|
|
|
|
# Default to document
|
|
return "document"
|
|
|
|
def get_mime_type(self, file_name: str) -> str:
|
|
"""Get MIME type based on file name."""
|
|
# Import from lucydom_interface
|
|
from lucydom_interface import LucyDOMInterface
|
|
temp_interface = LucyDOMInterface(0, 0) # Default values
|
|
return temp_interface.get_mime_type(file_name)
|
|
|
|
def prepare_file_contexts(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Bereitet die Dateikontexte basierend auf Metadaten vor.
|
|
Akzeptiert keine Pfade mehr, sondern nur Metadaten aus der Datenbank.
|
|
|
|
Args:
|
|
files: Liste von Dateien mit Metadaten (Dict mit id, name, type, content_type)
|
|
|
|
Returns:
|
|
Liste von Dateikontexten für die Verarbeitung
|
|
"""
|
|
file_contexts = []
|
|
|
|
logger.info(f"Preparing file contexts for {len(files)} files")
|
|
|
|
for file in files:
|
|
file_id = file.get("id")
|
|
file_name = file.get("name")
|
|
file_type = file.get("type")
|
|
|
|
# Create a comprehensive context with all available metadata
|
|
context = {
|
|
"id": file_id,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"size": file.get("size", "Unbekannt"),
|
|
"content_type": file.get("content_type"),
|
|
"path": file.get("path"),
|
|
"upload_date": file.get("upload_date"),
|
|
"hash": file.get("hash"),
|
|
"mandate_id": file.get("mandate_id"),
|
|
"user_id": file.get("user_id")
|
|
}
|
|
|
|
# Log for debugging
|
|
logger.info(f"Created file context: {file_name} (ID: {file_id}, Type: {file_type})")
|
|
|
|
file_contexts.append(context)
|
|
|
|
return file_contexts
|
|
|
|
def create_document_reference(self, message: Dict[str, Any], file_id: int, reference_type: str = "reference") -> Dict[str, Any]:
|
|
"""
|
|
Create a document reference without loading content.
|
|
|
|
Args:
|
|
message: The message to add the reference to
|
|
file_id: ID of the file to reference
|
|
reference_type: Type of reference (reference, citation, etc.)
|
|
|
|
Returns:
|
|
Updated message with the document reference
|
|
"""
|
|
if not self.lucydom_interface:
|
|
logger.warning("LucyDOM interface not available for document reference")
|
|
return message
|
|
|
|
# Get file metadata
|
|
file = self.lucydom_interface.get_file(file_id)
|
|
if not file:
|
|
logger.warning(f"File with ID {file_id} not found for reference")
|
|
return message
|
|
|
|
# Create document structure with just the reference
|
|
document = {
|
|
"id": f"ref_{uuid.uuid4()}",
|
|
"source": {
|
|
"type": "file",
|
|
"id": str(file_id),
|
|
"name": file.get("name", "referenced_file"),
|
|
"content_type": file.get("content_type"),
|
|
"size": file.get("size"),
|
|
"reference_type": reference_type
|
|
},
|
|
"contents": [] # Empty contents - will be loaded on demand
|
|
}
|
|
|
|
# Add to message
|
|
updated_message = message.copy()
|
|
if "documents" not in updated_message:
|
|
updated_message["documents"] = []
|
|
|
|
updated_message["documents"].append(document)
|
|
logger.info(f"Added document reference for file {file.get('name')} (ID: {file_id})")
|
|
|
|
return updated_message
|
|
|
|
def should_extract_document(self, document: Dict[str, Any], context_prompt: str = None) -> bool:
|
|
"""
|
|
Determine if a document needs content extraction.
|
|
|
|
Args:
|
|
document: The document object
|
|
context_prompt: Current context prompt
|
|
|
|
Returns:
|
|
True if extraction is needed, False otherwise
|
|
"""
|
|
# If document has no contents, extraction is needed
|
|
if not document.get("contents"):
|
|
return True
|
|
|
|
# If document has contents but extraction status is False, extraction may be needed
|
|
for content in document.get("contents", []):
|
|
if content.get("type") == "text":
|
|
# If already extracted, check if context has changed
|
|
if content.get("is_extracted", False):
|
|
# If context prompt is different from what was used previously,
|
|
# we may need to re-extract with the new context
|
|
prev_context = content.get("extraction_context")
|
|
if context_prompt and prev_context != context_prompt:
|
|
return True
|
|
return False
|
|
return True
|
|
|
|
# Default to needing extraction
|
|
return True
|
|
|
|
|
|
|
|
    # Factory method
    # NOTE(review): this @staticmethod duplicates the @classmethod
    # get_instance defined near the top of the class. Being defined later in
    # the class body, THIS one is what FileManager.get_instance resolves to.
    # Behavior is identical, but one of the two definitions should be removed.
    @staticmethod
    def get_instance():
        """Get the singleton instance of FileManager."""
        if FileManager._instance is None:
            FileManager._instance = FileManager()
        return FileManager._instance
|
|
|
|
|
|
|
|
# Create a singleton instance for module-level access (created at import time).
file_manager = FileManager.get_instance()


def get_file_manager():
    """Get the singleton instance of FileManager."""
    # Returns the module-level instance created at import time above.
    return file_manager
|
|
|
|
|
|
|
|
|
|
class WorkflowFileManager:
|
|
"""
|
|
Specialized file manager for workflow operations.
|
|
Handles workflow-specific file operations and document management.
|
|
"""
|
|
|
|
def __init__(self, workflow_id: str = None, lucydom_interface = None):
|
|
"""
|
|
Initialize the workflow file manager.
|
|
|
|
Args:
|
|
workflow_id: Optional workflow ID for context
|
|
lucydom_interface: LucyDOM interface for database operations
|
|
"""
|
|
self.workflow_id = workflow_id
|
|
self.lucydom_interface = lucydom_interface
|
|
self.file_manager = get_file_manager()
|
|
self.document_handler = None
|
|
|
|
    def set_workflow_id(self, workflow_id: str):
        """Set or update the workflow ID.

        Plain attribute update; no validation is performed.
        """
        self.workflow_id = workflow_id
|
|
|
|
    def set_lucydom_interface(self, lucydom_interface):
        """Set or update the LucyDOM interface.

        Plain attribute update; no validation is performed.
        """
        self.lucydom_interface = lucydom_interface
|
|
|
|
    async def add_files_to_message(self,
                                   message: Dict[str, Any],
                                   file_ids: List[int],
                                   add_log_func = None) -> Dict[str, Any]:
        """
        Add multiple files to a message.

        Delegates to ``self.document_handler`` when one is configured;
        otherwise loads metadata and contents through the LucyDOM interface
        and attaches each file via ``FileManager.add_file_to_message``.
        Missing files are logged and skipped.

        Args:
            message: The message to add files to
            file_ids: List of file IDs to add
            add_log_func: Optional logging function

        Returns:
            Updated message (the original message when no LucyDOM interface
            is available)
        """

        # If a document handler is available, it owns this operation.
        if self.document_handler:
            return await self.document_handler.add_files_to_message(
                message,
                file_ids,
                extraction_prompt=None  # default: no contextual extraction
            )

        # Without a database interface there is nothing we can load.
        if not self.lucydom_interface:
            _log(add_log_func, self.workflow_id, "LucyDOM interface not available", "error")
            return message

        updated_message = message.copy()

        # Resolve file metadata; unknown ids are logged and skipped.
        files = []
        for file_id in file_ids:
            file = self.lucydom_interface.get_file(file_id)
            if file:
                files.append(file)
            else:
                _log(add_log_func, self.workflow_id, f"File not found: {file_id}", "warning")

        # Prepare file contexts from the metadata records.
        file_contexts = self.file_manager.prepare_file_contexts(files)

        # Read the actual file contents.
        file_contents = await self.file_manager.read_file_contents(
            file_contexts,
            self.lucydom_interface,
            self.workflow_id,
            add_log_func
        )

        # Attach each loaded file to the message.
        for file_id, content_data in file_contents.items():
            updated_message = FileManager.add_file_to_message(updated_message, content_data)

        return updated_message
|
|
|
|
def get_files_from_message(self, message: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Collect metadata for every file-backed document attached to a message.

    Args:
        message: Message dict whose "documents" entries are scanned.

    Returns:
        A list of metadata dicts (id, name, type, content_type, size)
        for each document whose source type is "file".
    """
    sources = (doc.get("source", {}) for doc in message.get("documents", []))
    return [
        {
            "id": src.get("id", ""),
            "name": src.get("name", ""),
            "type": src.get("content_type", ""),
            "content_type": src.get("content_type", ""),
            "size": src.get("size", 0)
        }
        for src in sources
        if src.get("type") == "file"
    ]
def get_document_text_content(self, message: Dict[str, Any]) -> str:
    """
    Concatenate the text parts of every document in a message.

    Args:
        message: Message dict whose "documents" entries are scanned.

    Returns:
        All text contents joined together, each prefixed with a blank
        line; empty string when no text parts exist.
    """
    pieces = []
    for document in message.get("documents", []):
        for part in document.get("contents", []):
            if part.get("type") == "text":
                pieces.append("\n\n" + part.get("text", ""))
    return "".join(pieces)
async def extract_document_info(self,
                                workflow: Dict[str, Any],
                                message_id: str = None) -> Dict[str, Any]:
    """
    Gather file metadata and extracted text from a workflow's messages.

    Args:
        workflow: Workflow dict containing a "messages" list.
        message_id: When given, only the message with this ID is
            scanned; otherwise every message contributes.

    Returns:
        Dict with de-duplicated "documents", their "file_count", and
        the combined "extracted_text".
    """
    info = {
        "documents": [],
        "file_count": 0,
        "extracted_text": ""
    }
    messages = workflow.get("messages", [])

    if message_id:
        # Restrict processing to the single requested message.
        for msg in messages:
            if msg.get("id") != message_id:
                continue
            attached = self.get_files_from_message(msg)
            info["documents"].extend(attached)
            info["file_count"] = len(attached)
            info["extracted_text"] = self.get_document_text_content(msg)
            break
    else:
        # Aggregate over every message in the workflow.
        for msg in messages:
            info["documents"].extend(self.get_files_from_message(msg))
            info["extracted_text"] += self.get_document_text_content(msg)

    # Keep only the first occurrence of each file ID.
    seen = {}
    for entry in info["documents"]:
        fid = entry.get("id")
        if fid and fid not in seen:
            seen[fid] = entry

    info["documents"] = list(seen.values())
    info["file_count"] = len(info["documents"])

    return info
async def analyze_workflow_documents(self,
                                     workflow: Dict[str, Any],
                                     prompt: str,
                                     ai_service,
                                     message_id: str = None) -> Dict[str, Any]:
    """
    Run an AI analysis over the documents referenced by a workflow.

    Args:
        workflow: Workflow dict containing messages with document refs.
        prompt: Analysis prompt passed to the AI service.
        ai_service: Service used to perform the analysis.
        message_id: Optional message ID restricting analysis to one
            message.

    Returns:
        The analysis result, or a placeholder dict when no documents
        are attached.

    Raises:
        ValueError: If the LucyDOM interface or the AI service is
            missing.
    """
    # Both backends are mandatory; fail fast with a clear message.
    if not self.lucydom_interface:
        raise ValueError("LucyDOM interface not available")
    if not ai_service:
        raise ValueError("AI service not available")

    doc_info = await self.extract_document_info(workflow, message_id)

    # Nothing attached -> short-circuit with an informative placeholder.
    if doc_info["file_count"] == 0:
        return {
            "result": "No documents found for analysis",
            "files_analyzed": 0
        }

    ids = [entry.get("id") for entry in doc_info["documents"] if entry.get("id")]

    return await self.file_manager.analyze_multiple_files(
        ids,
        prompt,
        self.lucydom_interface,
        ai_service
    )
# Export the workflow file manager factory function
|
|
def get_workflow_file_manager(workflow_id: str = None, lucydom_interface = None):
    """
    Factory for WorkflowFileManager instances.

    Args:
        workflow_id: Optional workflow ID for the new manager.
        lucydom_interface: Optional LucyDOM interface for the new manager.

    Returns:
        A freshly constructed WorkflowFileManager.
    """
    manager = WorkflowFileManager(workflow_id, lucydom_interface)
    return manager