# gateway/modules/agentservice_filemanager.py
# (export metadata: 2025-04-16 21:42:26 +02:00, 1206 lines, 49 KiB, Python, no EOL)
"""
Central file management module for the Agentservice.
"""
import os
import logging
import base64
import json
import uuid
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple, Union, BinaryIO
from io import BytesIO
# Import utilities from agentservice_utils
from modules.agentservice_utils import extract_text_from_file_content, is_text_extractable
logger = logging.getLogger(__name__)
# Helper function for adding logs
def _log(add_log_func, workflow_id, message, level="info"):
"""Helper function for adding logs with standardized formatting."""
if add_log_func and workflow_id:
add_log_func(workflow_id, message, level)
# Also log to standard logger
if level == "info":
logger.info(message)
elif level == "warning":
logger.warning(message)
elif level == "error":
logger.error(message)
class FileExtractionError(Exception):
    """Raised when text or data cannot be extracted from a file."""
class FileManager:
    """Central file management for the Agentservice."""

    # Shared singleton slot, lazily populated by get_instance().
    _instance = None

    @classmethod
    def get_instance(cls):
        """Return the process-wide FileManager, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance
def __init__(self):
    """Initialize the FileManager.

    Enforces the singleton pattern: construction after the singleton exists
    is rejected so all callers go through get_instance().
    """
    if FileManager._instance is not None:
        raise RuntimeError("Singleton instance already exists - use get_instance()")
    # Fix: create_document_reference() reads self.lucydom_interface, but it
    # was never initialized here, raising AttributeError at runtime. Default
    # it to None so the guard there can fail gracefully until an interface
    # is injected.
    self.lucydom_interface = None
async def read_file_contents(self,
                             file_contexts: List[Dict[str, Any]],
                             lucydom_interface,
                             workflow_id: str = None,
                             add_log_func = None,
                             ai_service = None,
                             extraction_context: str = None
                             ) -> Dict[str, Dict[str, Any]]:
    """
    Read file contents with optional contextual extraction.

    Args:
        file_contexts: List of file contexts with metadata
        lucydom_interface: LucyDOM interface for file access
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs
        ai_service: AI service for image analysis
        extraction_context: Optional context prompt for extraction

    Returns:
        Dictionary keyed by file id with content and metadata per file.
    """
    results: Dict[str, Dict[str, Any]] = {}
    logger.info(f"Reading contents of {len(file_contexts)} files for workflow {workflow_id}")

    for ctx in file_contexts:
        fid = ctx["id"]
        name = ctx["name"]
        ftype = ctx.get("type", "unknown")
        ctype = ctx.get("content_type")

        def make_entry(text, extracted, **extra):
            # Common record shape shared by every outcome below.
            record = {
                "content": text,
                "is_extracted": extracted,
                "name": name,
                "type": ftype,
                "content_type": ctype,
            }
            record.update(extra)
            return record

        try:
            # Fetch the raw bytes through the LucyDOM interface.
            raw = await lucydom_interface.read_file_content(fid)
            if not raw:
                _log(add_log_func, workflow_id, f"Datei {name} nicht gefunden", "warning")
                results[fid] = make_entry("File content not available (File not found)", False)
                continue

            logger.info(f"Successfully read file: {name} (ID: {fid}, Type: {ftype})")

            is_image = ftype == "image" or name.lower().endswith(
                ('.jpg', '.jpeg', '.png', '.gif', '.webp'))
            is_textual = (ftype == "document" or not ftype
                          or name.lower().endswith(('.csv', '.txt', '.json', '.xml'))
                          or (ctype and ctype.startswith('text/')))

            if is_image:
                if ai_service and hasattr(ai_service, "analyze_image"):
                    try:
                        # Prefer the caller-supplied extraction context.
                        prompt = extraction_context or "Describe this image in detail"
                        analysis = await ai_service.analyze_image(
                            image_data=raw,
                            prompt=prompt,
                            mime_type=ctype
                        )
                        results[fid] = make_entry(
                            f"Image Analysis:\n{analysis}", True,
                            extraction_context=prompt)
                        _log(add_log_func, workflow_id,
                             f"Image {name} analyzed successfully", "info")
                    except Exception as e:
                        logger.error(f"Error analyzing image {name}: {str(e)}")
                        _log(add_log_func, workflow_id,
                             f"Error analyzing image {name}: {str(e)}", "error")
                        results[fid] = make_entry(
                            f"Image file: {name} (Analysis failed: {str(e)})", False)
                else:
                    results[fid] = make_entry(
                        f"Image file: {name} (AI analysis not available)", False)
            elif is_textual:
                # Central text-extraction helper works directly on raw bytes.
                text, extracted = extract_text_from_file_content(raw, name, ctype)
                results[fid] = make_entry(text, extracted)
                _log(add_log_func, workflow_id,
                     f"File {name} read successfully (extracted: {extracted})", "info")
            else:
                # Unsupported types: keep metadata only.
                results[fid] = make_entry(
                    f"File: {name} (Type: {ftype}, content not available)", False)
                _log(add_log_func, workflow_id,
                     f"Unsupported file type: {ftype} for {name}", "warning")
        except Exception as e:
            logger.error(f"Error reading file {name}: {str(e)}")
            _log(add_log_func, workflow_id, f"Error reading file {name}: {str(e)}", "error")
            results[fid] = make_entry(f"File content not available (Error: {str(e)})", False)
    return results
@staticmethod
def add_file_to_message(message: Dict[str, Any], file_data: Dict[str, Any]) -> Dict[str, Any]:
"""
Add a file to a message with consistent document structure.
Args:
message: The message to add the file to
file_data: File metadata and content
Returns:
Updated message with the file added
"""
logger.info(f"Adding file to message: {file_data.get('name', 'unnamed_file')} (ID: {file_data.get('id', 'unknown')})")
# Initialize documents array if needed
if "documents" not in message:
message["documents"] = []
# Create a unique ID for the document if not provided
doc_id = file_data.get("id", f"file_{uuid.uuid4()}")
# Extract metadata
file_size = file_data.get("size")
if isinstance(file_size, str) and file_size.isdigit():
file_size = int(file_size)
elif file_size is None and file_data.get("content"):
file_size = len(file_data.get("content", ""))
# Determine if content is already extracted
content = file_data.get("content", "No content available")
file_name = file_data.get("name", "unnamed_file")
content_type = file_data.get("content_type")
is_extracted = file_data.get("is_extracted", False)
# Create standard document structure that follows the data model
document = {
"id": f"doc_{uuid.uuid4()}", # Unique document ID separate from file ID
"source": {
"type": "file",
"id": doc_id,
"name": file_name,
"content_type": content_type,
"size": file_size,
"upload_date": file_data.get("upload_date", datetime.now().isoformat())
},
"contents": [
{
"type": "text",
"text": content,
"is_extracted": is_extracted,
"extraction_context": file_data.get("extraction_context", None)
}
]
}
# Check if file is already in the message
file_already_added = any(
doc.get("source", {}).get("id") == doc_id
for doc in message.get("documents", [])
)
if not file_already_added:
message["documents"].append(document)
logger.info(f"File {file_name} added to message (total: {len(message.get('documents', []))} files)")
else:
logger.info(f"File {file_name} already exists in message, skipping")
return message
async def analyze_file(self, file_id: int, prompt: str, lucydom_interface, ai_service) -> Dict[str, Any]:
    """
    Analyze a single file, picking the method that fits its type.

    Args:
        file_id: ID of the file to analyze
        prompt: Analysis prompt
        lucydom_interface: Interface for database access
        ai_service: Service for AI requests

    Returns:
        Dict with file id/name, analysis type and the analysis result.

    Raises:
        ValueError: When a service is missing, file/content cannot be found,
            or text extraction fails.
    """
    if not lucydom_interface:
        raise ValueError("LucyDOM interface not available")
    if not ai_service:
        raise ValueError("AI service not available")
    try:
        # Resolve metadata and raw bytes first.
        meta = lucydom_interface.get_file(file_id)
        if not meta:
            raise ValueError(f"File with ID {file_id} not found")
        raw = await lucydom_interface.read_file_content(file_id)
        if not raw:
            raise ValueError(f"Content for file {file_id} not found")

        name = meta.get("name", "unnamed")
        mime = meta.get("content_type")
        kind = meta.get("type")

        # --- Images: straight to the vision endpoint ---
        if kind == "image" or (mime and mime.startswith("image/")):
            if not hasattr(ai_service, "analyze_image"):
                raise ValueError("AI service does not support image analysis")
            outcome = await ai_service.analyze_image(
                image_data=raw, prompt=prompt, mime_type=mime)
            return {"file_id": file_id, "file_name": name,
                    "analysis_type": "image", "result": outcome}

        # --- PDFs: extract text, analyze, then add image analysis ---
        if name.endswith(".pdf"):
            try:
                pdf_text, ok = extract_text_from_file_content(raw, name, mime)
                if not ok:
                    raise ValueError(f"Failed to extract text from PDF {name}")
                pdf_prompt = f"""
Analyze the following PDF content based on this request:
REQUEST: {prompt}
PDF CONTENT:
{pdf_text} # In a future release to split into tokensets, if too big file
"""
                outcome = await ai_service.call_api(
                    [{"role": "user", "content": pdf_prompt}])
                # Additionally look for embedded images; failures here are
                # non-fatal (text analysis is still returned).
                has_images = False
                image_summary = None
                try:
                    image_hits = await self.extract_and_analyze_pdf_images(
                        raw,
                        f"Analyze images with respect to: {prompt}",
                        ai_service
                    )
                    if image_hits and len(image_hits) > 0:
                        has_images = True
                        image_summary = "\n\nPDF IMAGES ANALYSIS:\n"
                        for hit in image_hits:
                            image_summary += f"- Image on page {hit.get('page')}: {hit.get('response')}\n"
                except Exception as img_err:
                    logger.warning(f"Could not analyze images in PDF {name}: {str(img_err)}")
                if has_images and image_summary:
                    outcome += image_summary
                return {"file_id": file_id, "file_name": name,
                        "analysis_type": "pdf", "result": outcome,
                        "has_images": has_images}
            except Exception as pdf_err:
                logger.error(f"Error analyzing PDF {name}: {str(pdf_err)}")
                raise

        # --- Spreadsheets / CSV: tabular analysis ---
        if name.endswith(('.xlsx', '.xls', '.csv')):
            try:
                table_text, ok = extract_text_from_file_content(raw, name, mime)
                if not ok:
                    raise ValueError(f"Failed to extract data from {name}")
                table_prompt = f"""
Analyze the following tabular data based on this request:
REQUEST: {prompt}
DATA CONTENT:
{table_text} # In a future release to split into tokensets to limit storage
Provide a structured analysis including:
1. Data overview
2. Key insights
3. Patterns and trends
4. Answers to the specific request
"""
                outcome = await ai_service.call_api(
                    [{"role": "user", "content": table_prompt}])
                return {"file_id": file_id, "file_name": name,
                        "analysis_type": "tabular_data", "result": outcome}
            except Exception as data_err:
                logger.error(f"Error analyzing tabular data {name}: {str(data_err)}")
                raise

        # --- Everything else: generic text analysis ---
        try:
            body_text, ok = extract_text_from_file_content(raw, name, mime)
            if not ok:
                raise ValueError(f"Failed to extract text from {name}")
            text_prompt = f"""
Analyze the following document content based on this request:
REQUEST: {prompt}
DOCUMENT CONTENT:
{body_text} # In a future release to split into tokensets
"""
            outcome = await ai_service.call_api(
                [{"role": "user", "content": text_prompt}])
            return {"file_id": file_id, "file_name": name,
                    "analysis_type": "text", "result": outcome}
        except Exception as text_err:
            logger.error(f"Error analyzing text content {name}: {str(text_err)}")
            raise
    except Exception as e:
        logger.error(f"Error analyzing file {file_id}: {str(e)}")
        raise
async def extract_and_analyze_pdf_images(self,
                                         pdf_content: bytes,
                                         prompt: str,
                                         ai_service
                                         ) -> List[Dict[str, Any]]:
    """
    Extract images from an in-memory PDF and analyze each with the AI service.

    Four strategies are tried per page: embedded images (get_images),
    image-bearing drawings, image layout blocks, and - when a page yielded
    nothing - a full-page render. Results are de-duplicated afterwards.

    Args:
        pdf_content: Binary data of the PDF file
        prompt: Prompt for image analysis
        ai_service: AI service for image analysis

    Returns:
        List of per-image analysis results (empty on any unrecoverable error).
    """
    findings = []
    scratch_files = []  # temporary files to delete in the finally block
    try:
        # PyMuPDF is optional; bail out gracefully when it is missing.
        try:
            import fitz  # PyMuPDF
            from io import BytesIO
            import tempfile
            logger.info(f"Starting PDF image extraction with PyMuPDF")
        except ImportError:
            logger.error("PyMuPDF (fitz) is not installed. Install it with 'pip install pymupdf'")
            return []

        # Open the PDF entirely in memory.
        try:
            doc = fitz.open(stream=pdf_content, filetype="pdf")
            page_count = len(doc)
            logger.info(f"PDF opened with {page_count} pages")
        except Exception as pdf_err:
            logger.error(f"Error opening PDF: {str(pdf_err)}")
            return []

        for page_num, page in enumerate(doc, 1):
            logger.info(f"Processing page {page_num}/{page_count}")

            # --- Method 1: embedded raster images via get_images() ---
            try:
                embedded = page.get_images(full=True)
                if embedded:
                    logger.info(f"Method 1: Found {len(embedded)} images on page {page_num}")
                    for img_index, img in enumerate(embedded):
                        try:
                            xref = img[0]  # image xref reference
                            base_image = doc.extract_image(xref)
                            payload = base_image["image"]
                            fmt = base_image["ext"]
                            # Skip obviously broken / tiny payloads.
                            if not payload or len(payload) < 100:
                                logger.warning(f"Empty or very small image data for image {img_index+1} on page {page_num}")
                                continue
                            verdict = await ai_service.analyze_image(
                                image_data=payload,
                                prompt=prompt,
                                mime_type=f"image/{fmt}"
                            )
                            dims = f"{base_image.get('width', 0)}x{base_image.get('height', 0)}"
                            findings.append({
                                "page": page_num,
                                "image_index": img_index,
                                "format": fmt,
                                "image_size": dims,
                                "method": "get_images",
                                "response": verdict
                            })
                            logger.info(f"Successfully analyzed image {img_index+1} on page {page_num} using method 1")
                        except Exception as e:
                            logger.warning(f"Error processing image {img_index} on page {page_num} (Method 1): {str(e)}")
                else:
                    logger.info(f"Method 1: No images found on page {page_num} using get_images")
            except Exception as m1_err:
                logger.warning(f"Error in Method 1 for page {page_num}: {str(m1_err)}")

            # --- Method 2: images hidden inside vector drawings ---
            try:
                drawings = page.get_drawings()
                drawing_hits = 0
                for drawing_index, drawing in enumerate(drawings):
                    try:
                        # Heuristic: a drawing counts as image-bearing when
                        # "image" appears anywhere in its repr.
                        if "image" in str(drawing).lower():
                            drawing_hits += 1
                            clip = drawing["rect"]
                            # Render the clipped area at 2x resolution.
                            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=clip)
                            rendered = pix.tobytes("png")
                            verdict = await ai_service.analyze_image(
                                image_data=rendered,
                                prompt=f"{prompt} (Page {page_num}, Drawing {drawing_index+1})",
                                mime_type="image/png"
                            )
                            findings.append({
                                "page": page_num,
                                "image_index": drawing_index,
                                "format": "png",
                                "image_size": f"{pix.width}x{pix.height}",
                                "method": "get_drawings",
                                "response": verdict
                            })
                            logger.info(f"Successfully analyzed drawing image {drawing_index+1} on page {page_num} using method 2")
                    except Exception as drawing_err:
                        logger.warning(f"Error processing drawing {drawing_index} on page {page_num}: {str(drawing_err)}")
                if drawing_hits > 0:
                    logger.info(f"Method 2: Processed {drawing_hits} images from drawings on page {page_num}")
                else:
                    logger.info(f"Method 2: No images found in drawings on page {page_num}")
            except Exception as m2_err:
                logger.warning(f"Error in Method 2 for page {page_num}: {str(m2_err)}")

            # --- Method 3: image blocks from the text layout dictionary ---
            try:
                layout_blocks = page.get_text("dict")["blocks"]
                image_blocks = [b for b in layout_blocks if b.get("type") == 1]  # type 1 = image
                if image_blocks:
                    logger.info(f"Method 3: Found {len(image_blocks)} image blocks on page {page_num}")
                    for block_index, block in enumerate(image_blocks):
                        try:
                            clip = block["bbox"]
                            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=clip)
                            rendered = pix.tobytes("png")
                            verdict = await ai_service.analyze_image(
                                image_data=rendered,
                                prompt=f"{prompt} (Page {page_num}, Block {block_index+1})",
                                mime_type="image/png"
                            )
                            findings.append({
                                "page": page_num,
                                "image_index": block_index,
                                "format": "png",
                                "image_size": f"{pix.width}x{pix.height}",
                                "method": "block_extraction",
                                "response": verdict
                            })
                            logger.info(f"Successfully analyzed image block {block_index+1} on page {page_num} using method 3")
                        except Exception as block_err:
                            logger.warning(f"Error processing block {block_index} on page {page_num}: {str(block_err)}")
                else:
                    logger.info(f"Method 3: No image blocks found on page {page_num}")
            except Exception as m3_err:
                logger.warning(f"Error in Method 3 for page {page_num}: {str(m3_err)}")

            # --- Method 4: last resort, render the whole page as an image ---
            if not findings or not any(hit.get("page") == page_num for hit in findings):
                try:
                    logger.info(f"Method 4: Rendering entire page {page_num} as image")
                    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                    rendered = pix.tobytes("png")
                    verdict = await ai_service.analyze_image(
                        image_data=rendered,
                        prompt=f"{prompt} (Full page {page_num})",
                        mime_type="image/png"
                    )
                    findings.append({
                        "page": page_num,
                        "image_index": 0,
                        "format": "png",
                        "image_size": f"{pix.width}x{pix.height}",
                        "method": "full_page_render",
                        "response": verdict
                    })
                    logger.info(f"Successfully analyzed full page {page_num} as image using method 4")
                except Exception as m4_err:
                    logger.warning(f"Error in Method 4 for page {page_num}: {str(m4_err)}")

        doc.close()

        # Different methods can capture the same area; keep only the first
        # hit per (page, size) combination.
        unique_hits = []
        seen = set()
        for hit in findings:
            key = f"{hit['page']}_{hit['image_size']}"
            if key not in seen:
                seen.add(key)
                unique_hits.append(hit)
        logger.info(f"PDF image extraction complete: Found {len(findings)} images, deduplicated to {len(unique_hits)}")
        return unique_hits
    except ImportError as imp_err:
        logger.error(f"Required library not available for PDF image extraction: {str(imp_err)}")
        return []
    except Exception as e:
        logger.error(f"Error extracting images from PDF: {str(e)}")
        return []
    finally:
        # Remove any temp files created along the way (list is currently
        # never populated but kept for parity with the original cleanup).
        for scratch in scratch_files:
            try:
                if os.path.exists(scratch):
                    os.remove(scratch)
            except Exception as e:
                logger.warning(f"Could not remove temporary file: {scratch} - {str(e)}")
async def analyze_multiple_files(
    self,
    file_ids: List[int],
    prompt: str,
    lucydom_interface,
    ai_service
) -> Dict[str, Any]:
    """
    Analyze multiple files and synthesize a combined result.

    Args:
        file_ids: List of file IDs to analyze
        prompt: Analysis prompt
        lucydom_interface: Interface for database access
        ai_service: Service for AI requests

    Returns:
        Dict with "synthesis" (or "error"), the per-file results, and the
        number of files processed.
    """
    per_file = []
    # Per-file failures are recorded as error entries instead of aborting.
    for fid in file_ids:
        try:
            per_file.append(await self.analyze_file(fid, prompt, lucydom_interface, ai_service))
        except Exception as e:
            logger.error(f"Error analyzing file {fid}: {str(e)}")
            per_file.append({
                "file_id": fid,
                "error": str(e),
                "analysis_type": "error"
            })

    if not per_file:
        return {
            "synthesis": "No files were successfully analyzed.",
            "individual_results": [],
            "files_analyzed": 0
        }

    try:
        # Assemble the synthesis prompt from header, per-file sections and
        # the closing instructions.
        parts = [f"""
Synthesize a combined analysis based on these individual file analyses:
ORIGINAL REQUEST: {prompt}
INDIVIDUAL ANALYSES:
"""]
        for idx, item in enumerate(per_file, 1):
            fname = item.get("file_name", f"File {idx}")
            atype = item.get("analysis_type", "unknown")
            aresult = item.get("result", "No analysis available")
            parts.append(f"""
## {fname} ({atype})
{aresult}
---
""")
        parts.append("""
Please provide a comprehensive synthesis that:
1. Combines insights from all files
2. Addresses the original request
3. Highlights connections between different files
4. Provides a unified conclusion
""")
        synthesis = await ai_service.call_api([{"role": "user", "content": "".join(parts)}])
        return {
            "synthesis": synthesis,
            "individual_results": per_file,
            "files_analyzed": len(per_file)
        }
    except Exception as e:
        logger.error(f"Error synthesizing combined analysis: {str(e)}")
        return {
            "error": str(e),
            "individual_results": per_file,
            "files_analyzed": len(per_file)
        }
def determine_file_type(self, file_name: str, content_type: str = None) -> str:
    """
    Determine the file type from MIME type and/or file extension.

    Args:
        file_name: Name of the file
        content_type: MIME type (optional)

    Returns:
        One of 'image', 'document', 'spreadsheet', 'presentation' or 'data';
        'document' when nothing matches.
    """
    # The MIME type takes precedence over the extension.
    if content_type:
        if content_type.startswith('image/'):
            return "image"
        if content_type == 'application/pdf':
            return "document"
        if content_type in ('application/vnd.ms-excel',
                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                            'text/csv'):
            return "spreadsheet"
    # Extension-based classification; first match wins.
    extension_map = (
        (('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg'), "image"),
        (('.pdf', '.doc', '.docx', '.txt', '.md', '.rtf'), "document"),
        (('.xlsx', '.xls', '.csv'), "spreadsheet"),
        (('.pptx', '.ppt'), "presentation"),
        (('.json', '.xml', '.yaml', '.yml'), "data"),
    )
    lowered = file_name.lower()
    for suffixes, kind in extension_map:
        if lowered.endswith(suffixes):
            return kind
    # Anything unrecognized is treated as a generic document.
    return "document"
def get_mime_type(self, file_name: str) -> str:
    """Return the MIME type for *file_name*.

    Delegates to LucyDOMInterface so the mapping stays consistent with the
    rest of the gateway. Fix: the original raised ImportError outright when
    the lucydom_interface module was unavailable; now the stdlib mimetypes
    module serves as a fallback.
    """
    try:
        from lucydom_interface import LucyDOMInterface
    except ImportError:
        # Fallback keeps the method usable without the gateway package.
        import mimetypes
        return mimetypes.guess_type(file_name)[0] or "application/octet-stream"
    temp_interface = LucyDOMInterface(0, 0)  # Default values
    return temp_interface.get_mime_type(file_name)
def prepare_file_contexts(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Build processing contexts from database file metadata.

    Paths are no longer accepted; only metadata records from the database
    (dicts with id, name, type, content_type, ...) are supported.

    Args:
        files: File metadata records.

    Returns:
        One context dict per input file carrying all known metadata.
    """
    logger.info(f"Preparing file contexts for {len(files)} files")
    contexts = []
    # Keys copied through unchanged; "size" additionally gets the German
    # placeholder "Unbekannt" ("unknown") when missing.
    passthrough = ("content_type", "path", "upload_date", "hash", "mandate_id", "user_id")
    for record in files:
        ctx = {
            "id": record.get("id"),
            "name": record.get("name"),
            "type": record.get("type"),
            "size": record.get("size", "Unbekannt"),
        }
        for key in passthrough:
            ctx[key] = record.get(key)
        # Log for debugging
        logger.info(f"Created file context: {ctx['name']} (ID: {ctx['id']}, Type: {ctx['type']})")
        contexts.append(ctx)
    return contexts
def create_document_reference(self, message: Dict[str, Any], file_id: int, reference_type: str = "reference") -> Dict[str, Any]:
    """
    Create a document reference without loading content.

    Args:
        message: The message to add the reference to
        file_id: ID of the file to reference
        reference_type: Type of reference (reference, citation, etc.)

    Returns:
        A copy of the message with the reference appended, or the original
        message unchanged when the interface or file is unavailable.
    """
    # Fix: self.lucydom_interface may never have been assigned (FileManager's
    # __init__ does not set it), so read it defensively instead of risking an
    # AttributeError here.
    interface = getattr(self, "lucydom_interface", None)
    if not interface:
        logger.warning("LucyDOM interface not available for document reference")
        return message
    # Get file metadata
    file = interface.get_file(file_id)
    if not file:
        logger.warning(f"File with ID {file_id} not found for reference")
        return message
    # Create document structure with just the reference
    document = {
        "id": f"ref_{uuid.uuid4()}",
        "source": {
            "type": "file",
            "id": str(file_id),
            "name": file.get("name", "referenced_file"),
            "content_type": file.get("content_type"),
            "size": file.get("size"),
            "reference_type": reference_type
        },
        "contents": []  # Empty contents - will be loaded on demand
    }
    updated_message = message.copy()
    # Fix: the shallow copy shared the "documents" list with the caller's
    # message, so appending mutated both. Copy the list before appending.
    updated_message["documents"] = list(message.get("documents", []))
    updated_message["documents"].append(document)
    logger.info(f"Added document reference for file {file.get('name')} (ID: {file_id})")
    return updated_message
def should_extract_document(self, document: Dict[str, Any], context_prompt: str = None) -> bool:
    """
    Decide whether a document's content (still) needs extraction.

    Args:
        document: The document object
        context_prompt: Current context prompt

    Returns:
        True when extraction (or re-extraction under a changed context) is
        needed, False when the existing extraction can be reused.
    """
    contents = document.get("contents")
    if not contents:
        # Nothing stored yet -> must extract.
        return True
    # Only the first text part determines the outcome (as before).
    for part in contents:
        if part.get("type") != "text":
            continue
        if not part.get("is_extracted", False):
            # Text is present but was never extracted.
            return True
        # Re-extract only when the caller's context differs from the one
        # that was used previously.
        previous = part.get("extraction_context")
        return bool(context_prompt and previous != context_prompt)
    # No text part found at all -> extraction needed.
    return True
# Factory method
# NOTE(review): this redefines the classmethod of the same name declared at
# the top of the class (the staticmethod wins); kept for compatibility.
@staticmethod
def get_instance():
    """Get the singleton instance of FileManager."""
    instance = FileManager._instance
    if instance is None:
        instance = FileManager()
        FileManager._instance = instance
    return instance
# Create a singleton instance for module-level access
file_manager = FileManager.get_instance()


def get_file_manager():
    """Return the shared module-level FileManager instance."""
    return file_manager
class WorkflowFileManager:
    """
    Specialized file manager for workflow operations.
    Handles workflow-specific file operations and document management.
    """

    def __init__(self, workflow_id: str = None, lucydom_interface = None):
        """
        Initialize the workflow file manager.

        Args:
            workflow_id: Optional workflow ID for context
            lucydom_interface: LucyDOM interface for database operations
        """
        self.workflow_id = workflow_id
        self.lucydom_interface = lucydom_interface
        # Shared singleton used for all generic file operations.
        self.file_manager = get_file_manager()
        # Optional richer handler; when set it is preferred for message
        # attachment operations.
        self.document_handler = None

    def set_workflow_id(self, workflow_id: str):
        """Set or update the workflow ID."""
        self.workflow_id = workflow_id

    def set_lucydom_interface(self, lucydom_interface):
        """Set or update the LucyDOM interface."""
        self.lucydom_interface = lucydom_interface
async def add_files_to_message(self,
                               message: Dict[str, Any],
                               file_ids: List[int],
                               add_log_func = None) -> Dict[str, Any]:
    """
    Add multiple files to a message.

    Args:
        message: The message to add files to
        file_ids: List of file IDs to add
        add_log_func: Optional logging function

    Returns:
        Updated message (a shallow copy of the input).
    """
    # Delegate entirely when a dedicated document handler is configured.
    if self.document_handler:
        return await self.document_handler.add_files_to_message(
            message,
            file_ids,
            extraction_prompt=None  # Default to no extraction
        )
    if not self.lucydom_interface:
        _log(add_log_func, self.workflow_id, "LucyDOM interface not available", "error")
        return message

    updated = message.copy()
    # Resolve metadata for every requested id; missing ids are only logged.
    known_files = []
    for fid in file_ids:
        meta = self.lucydom_interface.get_file(fid)
        if meta:
            known_files.append(meta)
        else:
            _log(add_log_func, self.workflow_id, f"File not found: {fid}", "warning")

    # Build contexts, read contents, then attach each file to the message.
    contexts = self.file_manager.prepare_file_contexts(known_files)
    contents = await self.file_manager.read_file_contents(
        contexts,
        self.lucydom_interface,
        self.workflow_id,
        add_log_func
    )
    for _fid, data in contents.items():
        updated = FileManager.add_file_to_message(updated, data)
    return updated
def get_files_from_message(self, message: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Collect file references from a message's documents.

    Args:
        message: The message to extract files from

    Returns:
        List of file metadata dicts, one per file-sourced document.
    """
    sources = (doc.get("source", {}) for doc in message.get("documents", []))
    return [
        {
            "id": src.get("id", ""),
            "name": src.get("name", ""),
            # NOTE: "type" mirrors the MIME type here (matching the original
            # structure), not the logical file type.
            "type": src.get("content_type", ""),
            "content_type": src.get("content_type", ""),
            "size": src.get("size", 0),
        }
        for src in sources
        if src.get("type") == "file"
    ]
def get_document_text_content(self, message: Dict[str, Any]) -> str:
    """
    Concatenate the text of all document contents in a message.

    Args:
        message: The message to extract content from

    Returns:
        All text parts joined together, each prefixed with a blank line
        (empty string when there are none).
    """
    pieces = []
    for doc in message.get("documents", []):
        for part in doc.get("contents", []):
            if part.get("type") == "text":
                # Keep the original "\n\n" separator before every part.
                pieces.append("\n\n" + part.get("text", ""))
    return "".join(pieces)
async def extract_document_info(self,
                                workflow: Dict[str, Any],
                                message_id: str = None) -> Dict[str, Any]:
    """
    Collect document information from a workflow or a single message.

    Args:
        workflow: The workflow object
        message_id: Optional message ID to restrict the scan to one message

    Returns:
        Dict with "documents" (de-duplicated across messages when scanning
        the whole workflow), "file_count" and "extracted_text".
    """
    info = {
        "documents": [],
        "file_count": 0,
        "extracted_text": ""
    }
    messages = workflow.get("messages", [])

    if message_id:
        # Single-message mode: stop at the first matching id. No
        # de-duplication happens here (matches the original behavior).
        for msg in messages:
            if msg.get("id") != message_id:
                continue
            found = self.get_files_from_message(msg)
            info["documents"].extend(found)
            info["file_count"] = len(found)
            info["extracted_text"] = self.get_document_text_content(msg)
            break
        return info

    # Whole-workflow mode: aggregate over every message.
    for msg in messages:
        info["documents"].extend(self.get_files_from_message(msg))
        info["extracted_text"] += self.get_document_text_content(msg)

    # De-duplicate by file id; first occurrence wins, falsy ids are dropped.
    unique = {}
    for entry in info["documents"]:
        key = entry.get("id")
        if key and key not in unique:
            unique[key] = entry
    info["documents"] = list(unique.values())
    info["file_count"] = len(info["documents"])
    return info
async def analyze_workflow_documents(self,
                                     workflow: Dict[str, Any],
                                     prompt: str,
                                     ai_service,
                                     message_id: str = None) -> Dict[str, Any]:
    """
    Analyze the documents referenced in a workflow.

    Args:
        workflow: The workflow object
        prompt: Analysis prompt
        ai_service: Service for AI analysis
        message_id: Optional message ID to focus on a specific message

    Returns:
        Combined analysis result from analyze_multiple_files, or a
        placeholder result when no documents are present.

    Raises:
        ValueError: When the LucyDOM interface or the AI service is missing.
    """
    if not self.lucydom_interface:
        raise ValueError("LucyDOM interface not available")
    if not ai_service:
        raise ValueError("AI service not available")

    # Gather document metadata (optionally scoped to one message).
    overview = await self.extract_document_info(workflow, message_id)
    if overview["file_count"] == 0:
        return {
            "result": "No documents found for analysis",
            "files_analyzed": 0
        }

    # Only documents that actually carry an id can be analyzed.
    ids = [entry.get("id") for entry in overview["documents"] if entry.get("id")]
    return await self.file_manager.analyze_multiple_files(
        ids,
        prompt,
        self.lucydom_interface,
        ai_service
    )
# Export the workflow file manager factory function
def get_workflow_file_manager(workflow_id: str = None, lucydom_interface = None):
    """Create a new WorkflowFileManager bound to the given workflow and interface."""
    return WorkflowFileManager(workflow_id, lucydom_interface)