# gateway/gwserver/modules/agentservice_document_handler.py
# 2025-04-14 20:05:33 +02:00
#
# 498 lines
# No EOL
# 21 KiB
# Python

"""
Enhanced document handling module for the Agentservice (continued).
"""
import os
import logging
import uuid
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple, Union
logger = logging.getLogger(__name__)


class DocumentHandler:
    """
    Centralized document handler for consistent document management across the system.

    Messages are plain dicts. Documents are kept in ``message["documents"]``; every
    document built by this class is a dict with an ``"id"``, a ``"source"`` dict
    (``type``, ``id``, ``name``, ``content_type``, ``size``, ...) and a
    ``"contents"`` list of entries such as
    ``{"type": "text", "text": ..., "is_extracted": ...}``.
    """

    def __init__(self, workflow_id: Optional[str] = None, lucydom_interface=None, ai_service=None):
        """
        Initialize the document handler.

        Args:
            workflow_id: ID of the workflow this handler belongs to (can be set later)
            lucydom_interface: Object providing ``get_file`` and the awaitable
                ``read_file_content``; without it no file can be loaded
            ai_service: Optional AI service; image analysis is only attempted
                when it offers an ``analyze_image`` coroutine
        """
        self.workflow_id = workflow_id
        self.lucydom_interface = lucydom_interface
        self.ai_service = ai_service
        # Imported at construction time instead of at module import time.
        # NOTE(review): presumably this avoids a circular import - confirm.
        from modules.agentservice_filemanager import get_file_manager
        self.file_manager = get_file_manager()

    def set_workflow_id(self, workflow_id: str):
        """Set or update the workflow ID."""
        self.workflow_id = workflow_id

    def set_lucydom_interface(self, lucydom_interface):
        """Set or update the LucyDOM interface."""
        self.lucydom_interface = lucydom_interface

    def set_ai_service(self, ai_service):
        """Set or update the AI service."""
        self.ai_service = ai_service

    # ------------------------------------------------------------------
    # Private helpers shared by the public methods below
    # ------------------------------------------------------------------

    @staticmethod
    def _is_image(file_type: Any, content_type: Optional[str]) -> bool:
        """Tell whether the file type or MIME type marks the file as an image."""
        return file_type == "image" or bool(content_type and content_type.startswith("image/"))

    @staticmethod
    def _placeholder_entry(text: str) -> Dict[str, Any]:
        """Build a non-extracted text entry that only describes the file."""
        return {"type": "text", "text": text, "is_extracted": False}

    @staticmethod
    def _extract_text_entry(file_content: Any, file_name: str, content_type: Optional[str],
                            extraction_prompt: Optional[str]) -> Dict[str, Any]:
        """Build a text entry from raw file content via the shared text extractor."""
        from modules.agentservice_utils import extract_text_from_file_content
        text, is_extracted = extract_text_from_file_content(file_content, file_name, content_type)
        return {
            "type": "text",
            "text": text,
            "is_extracted": is_extracted,
            "extraction_context": extraction_prompt,
        }

    @staticmethod
    def _with_text_content_replaced(contents: Optional[List[Dict[str, Any]]],
                                    new_content: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Return a copy of ``contents`` whose first text entry is ``new_content`` (appended if none)."""
        updated = list(contents or [])
        for position, entry in enumerate(updated):
            if entry.get("type") == "text":
                updated[position] = new_content
                return updated
        updated.append(new_content)
        return updated

    def _can_analyze_images(self) -> bool:
        """Tell whether an AI service with an ``analyze_image`` method is configured."""
        return bool(self.ai_service) and hasattr(self.ai_service, "analyze_image")

    async def _analyze_image_entry(self, file_content: Any, content_type: Optional[str],
                                   extraction_prompt: str) -> Dict[str, Any]:
        """Run the AI image analysis and wrap its result as an extracted text entry."""
        image_analysis = await self.ai_service.analyze_image(
            image_data=file_content,
            prompt=extraction_prompt,
            mime_type=content_type,
        )
        return {
            "type": "text",
            "text": f"Image Analysis:\n{image_analysis}",
            "is_extracted": True,
            "extraction_context": extraction_prompt,
        }

    async def _build_file_content_entry(self, file_id: int, file_name: str, file_type: Any,
                                        content_type: Optional[str],
                                        extraction_prompt: Optional[str]) -> Dict[str, Any]:
        """Return the single content entry describing a newly attached file."""
        # Content is only read when a prompt was given or the file is textual.
        wants_content = bool(
            extraction_prompt
            or file_type in ("document", "text")
            or (content_type and content_type.startswith("text/"))
        )
        if not wants_content:
            return self._placeholder_entry(f"File: {file_name} (content not loaded)")

        file_content = await self.lucydom_interface.read_file_content(file_id)
        if not file_content:
            return self._placeholder_entry(f"File content not available for {file_name}")

        if not self._is_image(file_type, content_type):
            entry = self._extract_text_entry(file_content, file_name, content_type, extraction_prompt)
            logger.info("Added text content for %s to message (extracted: %s)",
                        file_name, entry["is_extracted"])
            return entry

        if not (extraction_prompt and self._can_analyze_images()):
            return self._placeholder_entry(f"Image file: {file_name} (no analysis requested)")

        try:
            entry = await self._analyze_image_entry(file_content, content_type, extraction_prompt)
        except Exception as exc:
            # Best effort: a failed analysis must not prevent attaching the file.
            logger.exception("Error analyzing image %s: %s", file_name, exc)
            return self._placeholder_entry(f"Image file: {file_name} (Analysis failed: {str(exc)})")
        logger.info("Added image analysis for %s to message", file_name)
        return entry

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def add_file_to_message(self, message: Dict[str, Any], file_id: int,
                                  extraction_prompt: Optional[str] = None) -> Dict[str, Any]:
        """
        Add a file to a message with optional contextual extraction.

        The message is modified in place and also returned. A file that is
        already referenced by one of the message's documents is not added twice.
        Any error is logged and the message is returned as it is at that point.

        Args:
            message: The message to add the file to
            file_id: ID of the file to add
            extraction_prompt: Optional prompt for contextual extraction (e.g., for images)
        Returns:
            Updated message with the file added
        """
        if not self.lucydom_interface:
            logger.error("LucyDOM interface not available")
            return message
        try:
            file_meta = self.lucydom_interface.get_file(file_id)
            if not file_meta:
                logger.warning("File with ID %s not found", file_id)
                return message

            file_name = file_meta.get("name", "unnamed_file")
            file_type = file_meta.get("type", "unknown")
            content_type = file_meta.get("content_type")

            # Create the documents array when it is missing (or explicitly None).
            if message.get("documents") is None:
                message["documents"] = []

            # Source ids are stored as strings, so compare against the string form.
            source_id = str(file_id)
            if any(doc.get("source", {}).get("id") == source_id for doc in message["documents"]):
                logger.info("File %s already exists in message, skipping", file_name)
                return message

            document = {
                "id": f"doc_{uuid.uuid4()}",
                "source": {
                    "type": "file",
                    "id": source_id,
                    "name": file_name,
                    "content_type": content_type,
                    "size": file_meta.get("size"),
                    "upload_date": file_meta.get("upload_date", datetime.now().isoformat()),
                },
                "contents": [],
            }
            document["contents"].append(
                await self._build_file_content_entry(
                    file_id, file_name, file_type, content_type, extraction_prompt
                )
            )

            message["documents"].append(document)
            logger.info("File %s successfully added to message", file_name)
            return message
        except Exception as exc:
            # Boundary handler: attaching a file is best effort for the caller.
            logger.exception("Error adding file %s to message: %s", file_id, exc)
            return message

    async def add_files_to_message(self, message: Dict[str, Any], file_ids: List[int],
                                   extraction_prompt: Optional[str] = None) -> Dict[str, Any]:
        """
        Add multiple files to a message.

        Works on a copy: neither the passed message nor its ``documents`` list
        is modified.

        Args:
            message: The message to add files to
            file_ids: List of file IDs to add
            extraction_prompt: Optional prompt for contextual extraction
        Returns:
            Updated message with files added
        """
        updated_message = message.copy()
        # dict.copy() is shallow; without copying the list as well, appended
        # documents would also show up in the caller's message.
        if updated_message.get("documents") is not None:
            updated_message["documents"] = list(updated_message["documents"])
        for file_id in file_ids:
            updated_message = await self.add_file_to_message(updated_message, file_id, extraction_prompt)
        return updated_message

    async def extract_document_content(self, doc_id: str, message: Dict[str, Any],
                                       extraction_prompt: str) -> Dict[str, Any]:
        """
        Extract or update document content with contextual extraction.

        The first text entry of the matching document is replaced by the new
        extraction (or the new entry is appended when there is none). The passed
        message and its documents are left untouched; the update is made on copies.
        Documents whose source id is not a numeric file id (e.g. generated text
        documents) are returned unchanged.

        Args:
            doc_id: ID of the document to extract
            message: Message containing the document
            extraction_prompt: Contextual prompt for extraction
        Returns:
            Updated message with extracted content
        """
        if not message or "documents" not in message:
            return message

        updated_message = message.copy()
        documents = list(message.get("documents") or [])
        updated_message["documents"] = documents

        for index, document in enumerate(documents):
            if document.get("id") != doc_id:
                continue

            raw_file_id = document.get("source", {}).get("id")
            if not raw_file_id or not self.lucydom_interface:
                break
            try:
                file_id = int(raw_file_id)
            except (TypeError, ValueError):
                # Generated documents use their "doc_..." id as source id; there
                # is no stored file to read again.
                logger.warning("Document %s is not backed by a stored file, skipping extraction", doc_id)
                break

            file_meta = self.lucydom_interface.get_file(file_id)
            if not file_meta:
                continue
            file_content = await self.lucydom_interface.read_file_content(file_id)
            if not file_content:
                continue

            file_name = file_meta.get("name", "unnamed_file")
            file_type = file_meta.get("type", "unknown")
            content_type = file_meta.get("content_type")

            new_content = None
            if self._is_image(file_type, content_type):
                label = "image analysis"
                if self._can_analyze_images():
                    try:
                        new_content = await self._analyze_image_entry(
                            file_content, content_type, extraction_prompt
                        )
                    except Exception as exc:
                        # Keep the previous content when the analysis fails.
                        logger.exception("Error updating image analysis for %s: %s", file_name, exc)
            else:
                label = "text extraction"
                new_content = self._extract_text_entry(
                    file_content, file_name, content_type, extraction_prompt
                )

            if new_content is not None:
                # Replace the document by an updated copy so the caller's
                # original document dict stays as it was.
                refreshed = dict(document)
                refreshed["contents"] = self._with_text_content_replaced(
                    document.get("contents"), new_content
                )
                documents[index] = refreshed
                logger.info("Updated %s for %s with new context: %s",
                            label, file_name, extraction_prompt)

            # Found and processed the document, stop searching
            break
        return updated_message

    async def extract_files_from_workflow(self, workflow: Dict[str, Any], extraction_prompt: str,
                                          file_filter: Optional[str] = None) -> Dict[str, Any]:
        """
        Extract all relevant files from a workflow with context-aware extraction.

        Collects every distinct file document referenced by the workflow's
        messages (first occurrence wins) and hands them to the data extraction
        module.

        Args:
            workflow: The workflow object
            extraction_prompt: Contextual prompt for extraction
            file_filter: Optional case-insensitive substring that must occur in
                the file name or content type (e.g., "csv", "image")
        Returns:
            Dictionary with extracted content
        """
        workflow_messages = workflow.get("messages", [])
        needle = file_filter.lower() if file_filter else None

        files = []
        seen_ids = set()
        for message in workflow_messages:
            for doc in message.get("documents") or []:
                source = doc.get("source", {})
                # Only include file documents
                if source.get("type") != "file":
                    continue
                file_id = source.get("id", "")
                if file_id in seen_ids:
                    continue
                name = source.get("name", "")
                content_type = source.get("content_type", "")
                # Name / content type may be stored as None, hence the "or ''".
                if (needle
                        and needle not in (name or "").lower()
                        and needle not in (content_type or "").lower()):
                    continue
                seen_ids.add(file_id)
                files.append({
                    "id": file_id,
                    "name": name,
                    "type": source.get("type", ""),
                    "content_type": content_type,
                    "size": source.get("size", 0),
                })

        # If no files found, return empty result
        if not files:
            return {
                "prompt": extraction_prompt,
                "files_processed": 0,
                "extracted_content": [],
            }

        # Imported only once there is actually something to extract.
        from modules.agentservice_dataextraction import data_extraction
        return await data_extraction(
            prompt=extraction_prompt,
            files=files,
            messages=workflow_messages,
            ai_service=self.ai_service,
            lucydom_interface=self.lucydom_interface,
            workflow_id=self.workflow_id,
            add_log_func=None,  # We don't have access to add_log_func here
        )

    def get_file_content_from_message(self, message: Dict[str, Any], file_id: Optional[int] = None,
                                      doc_id: Optional[str] = None) -> str:
        """
        Get file content from a message.

        Returns the first text entry of the first document that matches the
        document ID or the file ID and has a text entry.

        Args:
            message: The message containing the document
            file_id: Optional file ID to search for
            doc_id: Optional document ID to search for
        Returns:
            Text content of the file if available, otherwise an empty string
        """
        if not message or "documents" not in message:
            return ""
        for document in message.get("documents") or []:
            source_file_id = document.get("source", {}).get("id")
            matches_doc = bool(doc_id) and document.get("id") == doc_id
            # "is not None" instead of truthiness so that file ID 0 can match.
            matches_file = (
                file_id is not None
                and source_file_id is not None
                and str(file_id) == str(source_file_id)
            )
            if not (matches_doc or matches_file):
                continue
            for content in document.get("contents", []):
                if content.get("type") == "text":
                    return content.get("text", "")
        return ""

    def create_text_document(self, message: Dict[str, Any], content: str,
                             title: str = "Generated Text") -> Dict[str, Any]:
        """
        Create a new text document in a message.

        Works on a copy: neither the passed message nor its ``documents`` list
        is modified.

        Args:
            message: The message to add the document to
            content: Text content
            title: Document title
        Returns:
            Updated message with the new document
        """
        updated_message = message.copy()
        # Copy the list too; a shallow dict copy alone would share it with the caller.
        updated_message["documents"] = list(message.get("documents") or [])

        doc_id = f"doc_{uuid.uuid4()}"
        updated_message["documents"].append({
            "id": doc_id,
            "source": {
                "type": "generated",
                "id": doc_id,
                "name": title,
                "content_type": "text/plain",
                "size": len(content),
            },
            "contents": [
                {
                    "type": "text",
                    "text": content,
                    "is_extracted": True,
                }
            ],
        })
        logger.info("Created text document '%s' in message", title)
        return updated_message

    def merge_document_contents(self, message: Dict[str, Any]) -> str:
        """
        Merge all document contents from a message into a single text.

        Only the first text entry of each document is used; documents without
        text are skipped. Each section is introduced by ``--- <name> ---``.

        Args:
            message: The message containing documents
        Returns:
            Combined text content from all documents
        """
        if not message or "documents" not in message:
            return ""
        sections = []
        for document in message.get("documents") or []:
            doc_name = document.get("source", {}).get("name", "Unnamed Document")
            first_text = next(
                (entry.get("text", "") for entry in document.get("contents", [])
                 if entry.get("type") == "text"),
                "",
            )
            if first_text:
                sections.append(f"--- {doc_name} ---\n\n{first_text}")
        return "\n\n".join(sections).strip()
# Factory function
def get_document_handler(workflow_id: str = None, lucydom_interface = None, ai_service = None) -> DocumentHandler:
    """Build a new DocumentHandler from the given workflow ID and services and return it."""
    handler = DocumentHandler(
        workflow_id=workflow_id,
        lucydom_interface=lucydom_interface,
        ai_service=ai_service,
    )
    return handler