"""
|
|
Enhanced document handling module for the Agentservice (continued).
|
|
"""
|
|
|
|
import os
|
|
import logging
|
|
import uuid
|
|
from datetime import datetime
|
|
from typing import List, Dict, Any, Optional, Tuple, Union
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class DocumentHandler:
|
|
"""
|
|
Centralized document handler for consistent document management across the system.
|
|
"""
|
|
|
|
def __init__(self, workflow_id: Optional[str] = None, lucydom_interface = None, ai_service = None):
    """Initialize the document handler.

    Args:
        workflow_id: Identifier of the workflow this handler operates on.
        lucydom_interface: Collaborator used for file access; this class
            calls its ``get_file`` and ``read_file_content`` methods.
        ai_service: Optional AI collaborator; ``analyze_image`` is used
            for contextual image extraction when available.
    """
    self.workflow_id = workflow_id
    self.lucydom_interface = lucydom_interface
    self.ai_service = ai_service

    # Import necessary utilities.
    # Function-scope import — NOTE(review): presumably to avoid a circular
    # module dependency at import time; confirm against
    # modules.agentservice_filemanager.
    from modules.agentservice_filemanager import get_file_manager
    self.file_manager = get_file_manager()
|
|
|
|
def set_workflow_id(self, workflow_id: str):
|
|
"""Set or update the workflow ID."""
|
|
self.workflow_id = workflow_id
|
|
|
|
def set_lucydom_interface(self, lucydom_interface):
|
|
"""Set or update the LucyDOM interface."""
|
|
self.lucydom_interface = lucydom_interface
|
|
|
|
def set_ai_service(self, ai_service):
|
|
"""Set or update the AI service."""
|
|
self.ai_service = ai_service
|
|
|
|
async def add_file_to_message(self, message: Dict[str, Any], file_id: int, extraction_prompt: Optional[str] = None) -> Dict[str, Any]:
    """
    Add a file to a message with optional contextual extraction.

    The file's metadata is looked up via the LucyDOM interface and a new
    document entry is appended to ``message["documents"]``. File bytes
    are only read (and, for images, analyzed) when an extraction prompt
    is given or the file is text-like; otherwise only a placeholder
    reference is stored.

    NOTE: the *message* dict is modified in place and also returned.

    Args:
        message: The message to add the file to
        file_id: ID of the file to add
        extraction_prompt: Optional prompt for contextual extraction (e.g., for images)

    Returns:
        Updated message with the file added; the message is returned
        unchanged when the interface is missing, the file is not found,
        the file is already attached, or any error occurs.
    """
    if not self.lucydom_interface:
        logger.error("LucyDOM interface not available")
        return message

    try:
        # Get file metadata
        file = self.lucydom_interface.get_file(file_id)
        if not file:
            logger.warning(f"File with ID {file_id} not found")
            return message

        # Get necessary file information
        file_name = file.get("name", "unnamed_file")
        file_type = file.get("type", "unknown")
        content_type = file.get("content_type")

        # Initialize documents array if needed (mutates the caller's dict)
        if "documents" not in message:
            message["documents"] = []

        # Check if file is already in the message (dedup by stringified file ID)
        file_already_added = any(
            doc.get("source", {}).get("id") == str(file_id)
            for doc in message.get("documents", [])
        )

        if file_already_added:
            logger.info(f"File {file_name} already exists in message, skipping")
            return message

        # Create a unique document ID
        doc_id = f"doc_{uuid.uuid4()}"

        # Create document structure; source.id is always stored as a string
        document = {
            "id": doc_id,
            "source": {
                "type": "file",
                "id": str(file_id),
                "name": file_name,
                "content_type": content_type,
                "size": file.get("size"),
                "upload_date": file.get("upload_date", datetime.now().isoformat())
            },
            "contents": []
        }

        # Only read content if we have an extraction prompt or the file is
        # text-like; anything else gets a reference without loading bytes.
        if (extraction_prompt or
                file_type in ["document", "text"] or
                (content_type and content_type.startswith("text/"))):

            # Read file content
            file_content = await self.lucydom_interface.read_file_content(file_id)

            if file_content:
                # Process based on file type
                if file_type == "image" or (content_type and content_type.startswith("image/")):
                    # Image analysis only when a prompt was provided and the
                    # AI service supports analyze_image
                    if extraction_prompt and self.ai_service and hasattr(self.ai_service, "analyze_image"):
                        try:
                            image_analysis = await self.ai_service.analyze_image(
                                image_data=file_content,
                                prompt=extraction_prompt or "Describe this image in detail",
                                mime_type=content_type
                            )

                            # Add the analysis as text content
                            document["contents"].append({
                                "type": "text",
                                "text": f"Image Analysis:\n{image_analysis}",
                                "is_extracted": True,
                                "extraction_context": extraction_prompt
                            })

                            logger.info(f"Added image analysis for {file_name} to message")
                        except Exception as e:
                            # Analysis failure is non-fatal: keep a placeholder
                            # carrying the error text
                            logger.error(f"Error analyzing image {file_name}: {str(e)}")
                            document["contents"].append({
                                "type": "text",
                                "text": f"Image file: {file_name} (Analysis failed: {str(e)})",
                                "is_extracted": False
                            })
                    else:
                        # Just add placeholder if no analysis available
                        document["contents"].append({
                            "type": "text",
                            "text": f"Image file: {file_name} (no analysis requested)",
                            "is_extracted": False
                        })
                else:
                    # For other file types, extract text.
                    # Function-scope import — NOTE(review): presumably avoids a
                    # circular import; confirm against modules.agentservice_utils.
                    from modules.agentservice_utils import extract_text_from_file_content

                    content, is_extracted = extract_text_from_file_content(
                        file_content, file_name, content_type
                    )

                    document["contents"].append({
                        "type": "text",
                        "text": content,
                        "is_extracted": is_extracted,
                        "extraction_context": extraction_prompt
                    })

                    logger.info(f"Added text content for {file_name} to message (extracted: {is_extracted})")
            else:
                # No content available
                document["contents"].append({
                    "type": "text",
                    "text": f"File content not available for {file_name}",
                    "is_extracted": False
                })
        else:
            # Just add reference without content
            document["contents"].append({
                "type": "text",
                "text": f"File: {file_name} (content not loaded)",
                "is_extracted": False
            })

        # Add document to message
        message["documents"].append(document)

        logger.info(f"File {file_name} successfully added to message")
        return message

    except Exception as e:
        # Broad catch: attaching a file must never break message handling,
        # so failures are logged and the message is returned unchanged.
        logger.error(f"Error adding file {file_id} to message: {str(e)}")
        return message
|
|
|
|
async def add_files_to_message(self, message: Dict[str, Any], file_ids: List[int], extraction_prompt: str = None) -> Dict[str, Any]:
    """
    Attach several files to a message, one after another.

    Each file is delegated to ``add_file_to_message``, which skips files
    that are already present in the message.

    Args:
        message: The message to add files to
        file_ids: List of file IDs to add
        extraction_prompt: Optional contextual prompt forwarded to the
            per-file extraction

    Returns:
        The (shallow-copied) message after every file has been processed
    """
    result = message.copy()

    for current_id in file_ids:
        result = await self.add_file_to_message(result, current_id, extraction_prompt)

    return result
|
|
|
|
async def extract_document_content(self, doc_id: str, message: Dict[str, Any], extraction_prompt: str) -> Dict[str, Any]:
    """
    Extract or update document content with contextual extraction.

    Finds the document with *doc_id* in the message, re-reads the
    underlying file via the LucyDOM interface, and replaces the first
    ``type == "text"`` content entry (or appends one if none exists)
    with content re-extracted under the new prompt.

    NOTE(review): ``message.copy()`` is a shallow copy — the nested
    ``documents``/``contents`` structures are shared with the input, so
    the caller's message is effectively updated in place as well.

    Args:
        doc_id: ID of the document to extract
        message: Message containing the document
        extraction_prompt: Contextual prompt for extraction

    Returns:
        Updated message with extracted content (unchanged when the
        document, its file, or the file content cannot be found)
    """
    if not message or "documents" not in message:
        return message

    updated_message = message.copy()

    # Find the document
    for i, document in enumerate(updated_message.get("documents", [])):
        if document.get("id") == doc_id:
            # Get file ID from source
            source = document.get("source", {})
            file_id = source.get("id")

            if file_id and self.lucydom_interface:
                # Get file metadata (source IDs are stored as strings)
                file = self.lucydom_interface.get_file(int(file_id))
                if not file:
                    continue

                # Get file content
                file_content = await self.lucydom_interface.read_file_content(int(file_id))
                if not file_content:
                    continue

                # Process based on file type
                file_name = file.get("name", "unnamed_file")
                file_type = file.get("type", "unknown")
                content_type = file.get("content_type")

                # Update content based on file type
                if file_type == "image" or (content_type and content_type.startswith("image/")):
                    if self.ai_service and hasattr(self.ai_service, "analyze_image"):
                        try:
                            image_analysis = await self.ai_service.analyze_image(
                                image_data=file_content,
                                prompt=extraction_prompt,
                                mime_type=content_type
                            )

                            # Create or update content
                            new_content = {
                                "type": "text",
                                "text": f"Image Analysis:\n{image_analysis}",
                                "is_extracted": True,
                                "extraction_context": extraction_prompt
                            }

                            # Replace the first text entry, or append when none exists
                            contents = document.get("contents", [])
                            contents_updated = False

                            for j, content in enumerate(contents):
                                if content.get("type") == "text":
                                    updated_message["documents"][i]["contents"][j] = new_content
                                    contents_updated = True
                                    break

                            if not contents_updated:
                                if not updated_message["documents"][i].get("contents"):
                                    updated_message["documents"][i]["contents"] = []
                                updated_message["documents"][i]["contents"].append(new_content)

                            logger.info(f"Updated image analysis for {file_name} with new context: {extraction_prompt}")
                        except Exception as e:
                            # Analysis failure leaves the existing content untouched
                            logger.error(f"Error updating image analysis for {file_name}: {str(e)}")
                else:
                    # For other file types, extract text with new context.
                    # Function-scope import — NOTE(review): presumably avoids a
                    # circular import; confirm against modules.agentservice_utils.
                    from modules.agentservice_utils import extract_text_from_file_content

                    content, is_extracted = extract_text_from_file_content(
                        file_content, file_name, content_type
                    )

                    new_content = {
                        "type": "text",
                        "text": content,
                        "is_extracted": is_extracted,
                        "extraction_context": extraction_prompt
                    }

                    # Replace the first text entry, or append when none exists
                    contents = document.get("contents", [])
                    contents_updated = False

                    for j, content_item in enumerate(contents):
                        if content_item.get("type") == "text":
                            updated_message["documents"][i]["contents"][j] = new_content
                            contents_updated = True
                            break

                    if not contents_updated:
                        if not updated_message["documents"][i].get("contents"):
                            updated_message["documents"][i]["contents"] = []
                        updated_message["documents"][i]["contents"].append(new_content)

                    logger.info(f"Updated text extraction for {file_name} with new context: {extraction_prompt}")

            # Found and processed the document, stop searching
            break

    return updated_message
|
|
|
|
async def extract_files_from_workflow(self, workflow: Dict[str, Any], extraction_prompt: str, file_filter: Optional[str] = None) -> Dict[str, Any]:
    """
    Extract all relevant files from a workflow with context-aware extraction.

    Scans every message of the workflow for attached file documents,
    deduplicates them by source ID, optionally filters them, and hands
    the resulting file list plus all workflow messages to the
    ``data_extraction`` module.

    Args:
        workflow: The workflow object
        extraction_prompt: Contextual prompt for extraction
        file_filter: Optional filter for file types (e.g., "csv", "image");
            matched as a case-insensitive substring of the file name or
            content type — non-matching files are dropped

    Returns:
        Dictionary with extracted content (shape defined by
        ``data_extraction``); when no files are found, a stub with
        ``files_processed`` = 0 and an empty ``extracted_content`` list
    """
    # Import for data extraction.
    # Function-scope import — NOTE(review): presumably avoids a circular
    # import; confirm against modules.agentservice_dataextraction.
    from modules.agentservice_dataextraction import data_extraction

    # Get all files from the workflow
    files = []

    # Process all messages
    for message in workflow.get("messages", []):
        # Extract documents from the message
        for doc in message.get("documents", []):
            source = doc.get("source", {})

            # Only include file documents
            if source.get("type") == "file":
                file_info = {
                    "id": source.get("id", ""),
                    "name": source.get("name", ""),
                    "type": source.get("type", ""),
                    "content_type": source.get("content_type", ""),
                    "size": source.get("size", 0)
                }

                # Apply filter if provided
                if file_filter:
                    file_name = file_info.get("name", "").lower()
                    content_type = file_info.get("content_type", "").lower()

                    if (file_filter.lower() in file_name or
                            file_filter.lower() in content_type):
                        # Check if file is already in the list (dedup by ID)
                        if not any(f.get("id") == file_info["id"] for f in files):
                            files.append(file_info)
                else:
                    # No filter, include all files (dedup by ID)
                    if not any(f.get("id") == file_info["id"] for f in files):
                        files.append(file_info)

    # If no files found, return empty result
    if not files:
        return {
            "prompt": extraction_prompt,
            "files_processed": 0,
            "extracted_content": []
        }

    # Get all messages from the workflow
    workflow_messages = workflow.get("messages", [])

    # Extract data using the dataextraction module
    extracted_data = await data_extraction(
        prompt=extraction_prompt,
        files=files,
        messages=workflow_messages,
        ai_service=self.ai_service,
        lucydom_interface=self.lucydom_interface,
        workflow_id=self.workflow_id,
        add_log_func=None  # We don't have access to add_log_func here
    )

    return extracted_data
|
|
|
|
def get_file_content_from_message(self, message: Dict[str, Any], file_id: Optional[int] = None, doc_id: Optional[str] = None) -> str:
    """
    Get file content from a message.

    A document matches when its ``id`` equals *doc_id*, or when its
    ``source.id`` equals *file_id* (compared as strings). If a matching
    document has no text content, the search continues with the next
    document.

    Args:
        message: The message containing the document
        file_id: Optional file ID to search for (0 is a valid ID)
        doc_id: Optional document ID to search for

    Returns:
        Text content of the first matching document's first
        ``type == "text"`` entry, or "" when nothing matches
    """
    if not message or "documents" not in message:
        return ""

    # Search for the document
    for document in message.get("documents", []):
        source = document.get("source", {})
        source_file_id = source.get("id")

        # Bugfix: use explicit None checks instead of truthiness so a
        # valid file_id of 0 (stored as "0") can still match.
        file_id_matches = (
            file_id is not None
            and source_file_id is not None
            and str(file_id) == str(source_file_id)
        )

        if (doc_id and document.get("id") == doc_id) or file_id_matches:
            # Get text content from document
            for content in document.get("contents", []):
                if content.get("type") == "text":
                    return content.get("text", "")

    return ""
|
|
|
|
def create_text_document(self, message: Dict[str, Any], content: str, title: str = "Generated Text") -> Dict[str, Any]:
    """
    Append a synthetic ("generated") text document to a message.

    Args:
        message: The message to add the document to
        content: Text content
        title: Document title

    Returns:
        A shallow copy of the message whose documents list contains the
        newly generated document
    """
    result = message.copy()
    # Make sure there is a documents list to append to
    result.setdefault("documents", [])

    # One ID serves as both the document ID and its generated-source ID
    generated_id = f"doc_{uuid.uuid4()}"

    result["documents"].append({
        "id": generated_id,
        "source": {
            "type": "generated",
            "id": generated_id,
            "name": title,
            "content_type": "text/plain",
            "size": len(content)
        },
        "contents": [
            {
                "type": "text",
                "text": content,
                "is_extracted": True
            }
        ]
    })

    logger.info(f"Created text document '{title}' in message")
    return result
|
|
|
|
def merge_document_contents(self, message: Dict[str, Any]) -> str:
    """
    Merge all document contents from a message into a single text.

    Each document contributes its first ``type == "text"`` content entry,
    preceded by a ``--- <name> ---`` header; documents without any text
    content are skipped.

    Args:
        message: The message containing documents

    Returns:
        Combined text content from all documents ("" when there is
        nothing to merge)
    """
    if not message or "documents" not in message:
        return ""

    sections = []

    for document in message.get("documents", []):
        doc_name = document.get("source", {}).get("name", "Unnamed Document")

        # Pick out the first text entry, if any
        doc_text = ""
        for entry in document.get("contents", []):
            if entry.get("type") == "text":
                doc_text = entry.get("text", "")
                break

        if doc_text:
            sections.append(f"\n\n--- {doc_name} ---\n\n{doc_text}")

    return "".join(sections).strip()
|
|
|
|
# Factory function
def get_document_handler(workflow_id: str = None, lucydom_interface = None, ai_service = None) -> DocumentHandler:
    """Construct and return a DocumentHandler wired with the given collaborators."""
    handler = DocumentHandler(workflow_id, lucydom_interface, ai_service)
    return handler