# gateway/gwserver/modules/agentservice_document_handler.py
# (snapshot 2025-04-16 10:49:27 +02:00 — 890 lines, 48 KiB, no EOL in original)
"""
Enhanced document handling module for the Agentservice (continued).
"""
import os
import logging
import uuid
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple, Union
logger = logging.getLogger(__name__)
class DocumentHandler:
"""
Centralized document handler for consistent document management across the system.
"""
def __init__(self, workflow_id: str = None, lucydom_interface = None, ai_service = None):
    """
    Create a document handler.

    Args:
        workflow_id: Optional workflow identifier this handler operates on.
        lucydom_interface: Optional LucyDOM interface used for file access.
        ai_service: Optional AI service used for image/content analysis.
    """
    # Deferred import avoids a module-level import cycle with the file manager.
    from modules.agentservice_filemanager import get_file_manager

    self.workflow_id = workflow_id
    self.lucydom_interface = lucydom_interface
    self.ai_service = ai_service
    self.file_manager = get_file_manager()
def set_workflow_id(self, workflow_id: str):
    """Assign (or replace) the workflow ID this handler operates on."""
    self.workflow_id = workflow_id
def set_lucydom_interface(self, lucydom_interface):
    """Assign (or replace) the LucyDOM interface used for file access."""
    self.lucydom_interface = lucydom_interface
def set_ai_service(self, ai_service):
    """Assign (or replace) the AI service used for content analysis."""
    self.ai_service = ai_service
async def add_file_to_message(self, message: Dict[str, Any], file_id: int, extraction_prompt: str = None) -> Dict[str, Any]:
    """
    Add a file to a message with contextual extraction.

    Args:
        message: The message to add the file to
        file_id: ID of the file to add
        extraction_prompt: Optional prompt for contextual extraction (e.g., for images)

    Returns:
        Updated message with the file added

    NOTE(review): `message` is mutated in place (documents appended) and also
    returned; callers must not assume the input stays untouched.
    """
    if not self.lucydom_interface:
        logger.error("LucyDOM interface not available")
        return message
    try:
        # Get file metadata
        file = self.lucydom_interface.get_file(file_id)
        if not file:
            logger.warning(f"File with ID {file_id} not found")
            return message
        # Get necessary file information
        file_name = file.get("name", "unnamed_file")
        file_type = file.get("type", "unknown")
        content_type = file.get("content_type")
        # Initialize documents array if needed
        if "documents" not in message:
            message["documents"] = []
        # Check if file is already in the message (dedupe on stringified file id)
        file_already_added = any(
            doc.get("source", {}).get("id") == str(file_id)
            for doc in message.get("documents", [])
        )
        if file_already_added:
            logger.info(f"File {file_name} already exists in message, skipping")
            return message
        # Create a unique document ID
        doc_id = f"doc_{uuid.uuid4()}"
        # Create document structure
        document = {
            "id": doc_id,
            "source": {
                "type": "file",
                "id": str(file_id),
                "name": file_name,
                "content_type": content_type,
                "size": file.get("size"),
                "upload_date": file.get("upload_date", datetime.now().isoformat())
            },
            "contents": []
        }
        # Only read content if we have extraction prompt or specific types
        if (extraction_prompt or
                file_type in ["document", "text"] or
                (content_type and content_type.startswith("text/"))):
            # Read file content
            file_content = await self.lucydom_interface.read_file_content(file_id)
            if file_content:
                # Process based on file type
                if file_type == "image" or (content_type and content_type.startswith("image/")):
                    # Image analysis if prompt provided
                    if self.ai_service and hasattr(self.ai_service, "analyze_image"):
                        try:
                            # Use provided prompt or default one
                            image_prompt = extraction_prompt or "Describe this image in detail"
                            logger.info(f"Analyzing image {file_name} with prompt: {image_prompt}")
                            image_analysis = await self.ai_service.analyze_image(
                                image_data=file_content,
                                prompt=image_prompt,
                                mime_type=content_type
                            )
                            # Add the analysis as text content
                            document["contents"].append({
                                "type": "text",
                                "text": f"Image Analysis:\n{image_analysis}",
                                "is_extracted": True,
                                "extraction_context": extraction_prompt
                            })
                            logger.info(f"Added image analysis for {file_name} to message")
                        except Exception as e:
                            # Analysis failure degrades to a placeholder entry rather than aborting
                            logger.error(f"Error analyzing image {file_name}: {str(e)}")
                            document["contents"].append({
                                "type": "text",
                                "text": f"Image file: {file_name} (Analysis failed: {str(e)})",
                                "is_extracted": False
                            })
                    else:
                        # Just add placeholder if no analysis available
                        document["contents"].append({
                            "type": "text",
                            "text": f"Image file: {file_name} (no analysis requested)",
                            "is_extracted": False
                        })
                # Enhanced PDF processing - extract text and images
                elif file_name.lower().endswith('.pdf'):
                    logger.info(f"Processing PDF file: {file_name}")
                    # Extract text content first
                    from modules.agentservice_utils import extract_text_from_file_content
                    text_content, is_extracted = extract_text_from_file_content(
                        file_content, file_name, content_type
                    )
                    # Add text content
                    document["contents"].append({
                        "type": "text",
                        "text": text_content,
                        "is_extracted": is_extracted,
                        "extraction_context": extraction_prompt
                    })
                    logger.info(f"Extracted text content from PDF {file_name}")
                    # Extract and analyze images from PDF if we have AI service
                    if self.ai_service and hasattr(self.ai_service, "analyze_image"):
                        try:
                            # Import necessary modules (optional dependency — handled below)
                            import fitz  # PyMuPDF
                            from io import BytesIO
                            # Add detailed logging
                            logger.info(f"Starting PDF image extraction for {file_name}")
                            # Check if extraction prompt is available or use default
                            image_prompt = extraction_prompt or "Describe this image from the PDF document"
                            # Open PDF from memory stream with detailed error checking
                            try:
                                pdf_document = fitz.open(stream=file_content, filetype="pdf")
                                logger.info(f"Successfully opened PDF with {len(pdf_document)} pages")
                            except Exception as pdf_open_error:
                                # Re-raised so the outer handler records the failure note
                                logger.error(f"Failed to open PDF: {str(pdf_open_error)}")
                                raise
                            # Initialize images list and image count
                            images_analysis = []
                            image_count = 0
                            # Process each page (1-based page numbers for human-readable output)
                            for page_num, page in enumerate(pdf_document, 1):
                                # Get list of images on the page
                                image_list = page.get_images(full=True)
                                if image_list:
                                    logger.info(f"Found {len(image_list)} images on page {page_num}")
                                # Process each image
                                for img_index, img in enumerate(image_list):
                                    try:
                                        xref = img[0]  # Get image reference
                                        # Extract image data
                                        base_image = pdf_document.extract_image(xref)
                                        image_bytes = base_image["image"]
                                        image_ext = base_image["ext"]
                                        # Analyze image
                                        image_analysis = await self.ai_service.analyze_image(
                                            image_data=image_bytes,
                                            prompt=f"{image_prompt} (Page {page_num}, Image {img_index+1})",
                                            mime_type=f"image/{image_ext}"
                                        )
                                        # Add to analysis list
                                        images_analysis.append({
                                            "page": page_num,
                                            "index": img_index + 1,
                                            "analysis": image_analysis
                                        })
                                        image_count += 1
                                        logger.info(f"Analyzed image {img_index+1} on page {page_num}")
                                        # Create a separate document for each extracted image if needed
                                        # NOTE(review): always-on guard; replace with a real flag if
                                        # per-image documents ever become optional.
                                        if True:  # Set to condition if you want to control this
                                            img_doc_id = f"img_doc_{uuid.uuid4()}"
                                            image_filename = f"page{page_num}_image{img_index+1}.{image_ext}"
                                            image_document = {
                                                "id": img_doc_id,
                                                "source": {
                                                    "type": "extracted",
                                                    "parent_id": str(file_id),
                                                    "id": img_doc_id,
                                                    "name": image_filename,
                                                    "content_type": f"image/{image_ext}",
                                                    "size": len(image_bytes)
                                                },
                                                "contents": [{
                                                    "type": "text",
                                                    "text": f"Image Analysis (PDF Page {page_num}, Image {img_index+1}):\n{image_analysis}",
                                                    "is_extracted": True,
                                                    "extraction_context": image_prompt
                                                }]
                                            }
                                            # Add image document to message
                                            message["documents"].append(image_document)
                                            logger.info(f"Added extracted image document {image_filename} to message")
                                    except Exception as img_err:
                                        # Per-image failures are logged and skipped; remaining images still process
                                        logger.warning(f"Error processing image {img_index} on page {page_num}: {str(img_err)}")
                            # Close the PDF
                            pdf_document.close()
                            # Add combined image analysis to the main document
                            if images_analysis:
                                combined_analysis = "\n\n## Embedded Images Analysis\n\n"
                                for img in images_analysis:
                                    combined_analysis += f"### Page {img['page']}, Image {img['index']}\n{img['analysis']}\n\n"
                                document["contents"].append({
                                    "type": "text",
                                    "text": combined_analysis,
                                    "is_extracted": True,
                                    "extraction_context": f"Analysis of {image_count} images embedded in the PDF"
                                })
                                logger.info(f"Added combined analysis of {image_count} PDF images to document")
                        except ImportError:
                            # PyMuPDF missing: text extraction above still succeeded
                            logger.warning("PyMuPDF (fitz) is not installed, skipping PDF image extraction")
                            document["contents"].append({
                                "type": "text",
                                "text": "\n\nNote: PDF may contain images that were not extracted due to missing libraries.",
                                "is_extracted": False
                            })
                        except Exception as e:
                            logger.error(f"Error extracting images from PDF {file_name}: {str(e)}")
                            document["contents"].append({
                                "type": "text",
                                "text": f"\n\nError extracting images from PDF: {str(e)}",
                                "is_extracted": False
                            })
                # Word document processing with image extraction
                elif file_name.lower().endswith(('.docx', '.doc')):
                    logger.info(f"Processing Word document: {file_name}")
                    # Extract text content first
                    from modules.agentservice_utils import extract_text_from_file_content
                    text_content, is_extracted = extract_text_from_file_content(
                        file_content, file_name, content_type
                    )
                    # Add text content
                    document["contents"].append({
                        "type": "text",
                        "text": text_content,
                        "is_extracted": is_extracted,
                        "extraction_context": extraction_prompt
                    })
                    logger.info(f"Extracted text content from Word document {file_name}")
                    # Attempt to extract and analyze images from Word document
                    if self.ai_service and hasattr(self.ai_service, "analyze_image"):
                        try:
                            # For .docx documents (a .docx is a ZIP container)
                            if file_name.lower().endswith('.docx'):
                                import zipfile
                                from io import BytesIO
                                # Check if extraction prompt is available or use default
                                image_prompt = extraction_prompt or "Describe this image from the Word document"
                                # Create a zipfile object from the .docx content
                                docx_zip = zipfile.ZipFile(BytesIO(file_content))
                                # Images in .docx are stored in the "word/media" directory
                                image_files = [f for f in docx_zip.namelist() if f.startswith('word/media/')]
                                if image_files:
                                    logger.info(f"Found {len(image_files)} images in Word document {file_name}")
                                    # Process each image
                                    images_analysis = []
                                    for i, img_path in enumerate(image_files):
                                        try:
                                            # Extract image data
                                            image_bytes = docx_zip.read(img_path)
                                            # Determine image type from filename
                                            image_ext = img_path.split('.')[-1] if '.' in img_path else 'png'
                                            # Analyze image
                                            image_analysis = await self.ai_service.analyze_image(
                                                image_data=image_bytes,
                                                prompt=f"{image_prompt} (Image {i+1})",
                                                mime_type=f"image/{image_ext}"
                                            )
                                            # Add to analysis list
                                            images_analysis.append({
                                                "index": i + 1,
                                                "path": img_path,
                                                "analysis": image_analysis
                                            })
                                            logger.info(f"Analyzed image {i+1} ({img_path}) from Word document")
                                            # Create a separate document for each extracted image if needed
                                            img_doc_id = f"img_doc_{uuid.uuid4()}"
                                            image_filename = f"word_image{i+1}.{image_ext}"
                                            image_document = {
                                                "id": img_doc_id,
                                                "source": {
                                                    "type": "extracted",
                                                    "parent_id": str(file_id),
                                                    "id": img_doc_id,
                                                    "name": image_filename,
                                                    "content_type": f"image/{image_ext}",
                                                    "size": len(image_bytes)
                                                },
                                                "contents": [{
                                                    "type": "text",
                                                    "text": f"Image Analysis (Word Document Image {i+1}):\n{image_analysis}",
                                                    "is_extracted": True,
                                                    "extraction_context": image_prompt
                                                }]
                                            }
                                            # Add image document to message
                                            message["documents"].append(image_document)
                                            logger.info(f"Added extracted image document {image_filename} to message")
                                        except Exception as img_err:
                                            # Per-image failures are logged and skipped
                                            logger.warning(f"Error processing image {img_path}: {str(img_err)}")
                                    # Add combined image analysis to the main document
                                    if images_analysis:
                                        combined_analysis = "\n\n## Embedded Images Analysis\n\n"
                                        for img in images_analysis:
                                            combined_analysis += f"### Image {img['index']}\n{img['analysis']}\n\n"
                                        document["contents"].append({
                                            "type": "text",
                                            "text": combined_analysis,
                                            "is_extracted": True,
                                            "extraction_context": f"Analysis of {len(images_analysis)} images embedded in the Word document"
                                        })
                                        logger.info(f"Added combined analysis of {len(images_analysis)} Word document images")
                                # Close the zip file
                                docx_zip.close()
                            # Note: For .doc (older format) we would need additional libraries
                            # This could be implemented with libraries like antiword or pywin32
                            elif file_name.lower().endswith('.doc'):
                                logger.warning("Image extraction from .doc files is not supported yet")
                                document["contents"].append({
                                    "type": "text",
                                    "text": "\n\nNote: This is an older .doc format document. Images may be present but could not be extracted.",
                                    "is_extracted": False
                                })
                        except Exception as e:
                            logger.error(f"Error extracting images from Word document {file_name}: {str(e)}")
                            document["contents"].append({
                                "type": "text",
                                "text": f"\n\nError extracting images from Word document: {str(e)}",
                                "is_extracted": False
                            })
                # Excel file processing with enhanced capabilities
                elif file_name.lower().endswith(('.xlsx', '.xls')):
                    logger.info(f"Processing Excel document: {file_name}")
                    # Extract text representation of spreadsheet data
                    from modules.agentservice_utils import extract_text_from_file_content
                    text_content, is_extracted = extract_text_from_file_content(
                        file_content, file_name, content_type
                    )
                    # Add text content
                    document["contents"].append({
                        "type": "text",
                        "text": text_content,
                        "is_extracted": is_extracted,
                        "extraction_context": extraction_prompt
                    })
                    logger.info(f"Extracted data from Excel document {file_name}")
                    # Try to extract charts and images if available
                    if self.ai_service and hasattr(self.ai_service, "analyze_image"):
                        try:
                            # For .xlsx files (newer format, ZIP container)
                            if file_name.lower().endswith('.xlsx'):
                                import zipfile
                                from io import BytesIO
                                # Create a zipfile object from the Excel content
                                xlsx_zip = zipfile.ZipFile(BytesIO(file_content))
                                # Charts and images can be in various directories
                                media_paths = [
                                    'xl/media/',
                                    'xl/drawings/',
                                    'xl/charts/'
                                ]
                                # Collect all potential media files
                                media_files = []
                                for path in media_paths:
                                    media_files.extend([f for f in xlsx_zip.namelist() if f.startswith(path)])
                                if media_files:
                                    logger.info(f"Found {len(media_files)} media files in Excel document {file_name}")
                                    # Process image files (skip XML and other non-image files)
                                    image_extensions = ['png', 'jpeg', 'jpg', 'gif', 'bmp', 'tiff', 'emf', 'wmf']
                                    image_files = [f for f in media_files if f.split('.')[-1].lower() in image_extensions]
                                    if image_files:
                                        logger.info(f"Found {len(image_files)} images/charts in Excel document {file_name}")
                                        image_prompt = extraction_prompt or "Describe this chart/image from the Excel document"
                                        images_analysis = []
                                        for i, img_path in enumerate(image_files):
                                            try:
                                                # Extract image data
                                                image_bytes = xlsx_zip.read(img_path)
                                                # Determine image type from filename
                                                image_ext = img_path.split('.')[-1] if '.' in img_path else 'png'
                                                # Analyze image
                                                image_analysis = await self.ai_service.analyze_image(
                                                    image_data=image_bytes,
                                                    prompt=f"{image_prompt} (Describe what this chart or image shows, including any data trends or patterns visible)",
                                                    mime_type=f"image/{image_ext}"
                                                )
                                                # Add to analysis list
                                                images_analysis.append({
                                                    "index": i + 1,
                                                    "path": img_path,
                                                    "analysis": image_analysis
                                                })
                                                logger.info(f"Analyzed image/chart {i+1} from Excel document")
                                                # Create a separate document for each extracted image
                                                img_doc_id = f"img_doc_{uuid.uuid4()}"
                                                image_filename = f"excel_image{i+1}.{image_ext}"
                                                image_document = {
                                                    "id": img_doc_id,
                                                    "source": {
                                                        "type": "extracted",
                                                        "parent_id": str(file_id),
                                                        "id": img_doc_id,
                                                        "name": image_filename,
                                                        "content_type": f"image/{image_ext}",
                                                        "size": len(image_bytes)
                                                    },
                                                    "contents": [{
                                                        "type": "text",
                                                        "text": f"Chart/Image Analysis (Excel Document Item {i+1}):\n{image_analysis}",
                                                        "is_extracted": True,
                                                        "extraction_context": image_prompt
                                                    }]
                                                }
                                                # Add image document to message
                                                message["documents"].append(image_document)
                                            except Exception as img_err:
                                                # Per-image failures are logged and skipped
                                                logger.warning(f"Error processing image {img_path}: {str(img_err)}")
                                        # Add combined image analysis to the main document
                                        if images_analysis:
                                            combined_analysis = "\n\n## Embedded Charts and Images Analysis\n\n"
                                            for img in images_analysis:
                                                combined_analysis += f"### Chart/Image {img['index']}\n{img['analysis']}\n\n"
                                            document["contents"].append({
                                                "type": "text",
                                                "text": combined_analysis,
                                                "is_extracted": True,
                                                "extraction_context": f"Analysis of {len(images_analysis)} charts/images from the Excel document"
                                            })
                                # Close the zip file
                                xlsx_zip.close()
                        except Exception as e:
                            # NOTE(review): unlike PDF/Word, no failure note is appended here — log only
                            logger.error(f"Error extracting charts/images from Excel document {file_name}: {str(e)}")
                else:
                    # For other file types, extract text
                    from modules.agentservice_utils import extract_text_from_file_content
                    content, is_extracted = extract_text_from_file_content(
                        file_content, file_name, content_type
                    )
                    document["contents"].append({
                        "type": "text",
                        "text": content,
                        "is_extracted": is_extracted,
                        "extraction_context": extraction_prompt
                    })
                    logger.info(f"Added text content for {file_name} to message (extracted: {is_extracted})")
            else:
                # No content available
                document["contents"].append({
                    "type": "text",
                    "text": f"File content not available for {file_name}",
                    "is_extracted": False
                })
        else:
            # Just add reference without content
            document["contents"].append({
                "type": "text",
                "text": f"File: {file_name} (content not loaded)",
                "is_extracted": False
            })
        # Add document to message
        message["documents"].append(document)
        logger.info(f"File {file_name} successfully added to message")
        return message
    except Exception as e:
        # Best-effort contract: never raise to the caller; return the message unchanged
        logger.error(f"Error adding file {file_id} to message: {str(e)}")
        return message
async def extract_document_content(self, doc_id: str, message: Dict[str, Any], extraction_prompt: str) -> Dict[str, Any]:
    """
    Extract or update document content with contextual extraction.

    Args:
        doc_id: ID of the document to extract
        message: Message containing the document
        extraction_prompt: Contextual prompt for extraction

    Returns:
        Updated message with extracted content
    """
    if not message or "documents" not in message:
        return message
    # NOTE(review): shallow copy — the nested documents/contents lists are
    # shared with the original `message`, so the index writes below also
    # mutate the caller's object.
    updated_message = message.copy()
    # Find the document
    for i, document in enumerate(updated_message.get("documents", [])):
        if document.get("id") == doc_id:
            # Get file ID from source (stored as a string; converted below)
            source = document.get("source", {})
            file_id = source.get("id")
            if file_id and self.lucydom_interface:
                # Get file metadata
                file = self.lucydom_interface.get_file(int(file_id))
                if not file:
                    continue
                # Get file content
                file_content = await self.lucydom_interface.read_file_content(int(file_id))
                if not file_content:
                    continue
                # Process based on file type
                file_name = file.get("name", "unnamed_file")
                file_type = file.get("type", "unknown")
                content_type = file.get("content_type")
                # Update content based on file type
                if file_type == "image" or (content_type and content_type.startswith("image/")):
                    if self.ai_service and hasattr(self.ai_service, "analyze_image"):
                        try:
                            image_analysis = await self.ai_service.analyze_image(
                                image_data=file_content,
                                prompt=extraction_prompt,
                                mime_type=content_type
                            )
                            # Create or update content
                            new_content = {
                                "type": "text",
                                "text": f"Image Analysis:\n{image_analysis}",
                                "is_extracted": True,
                                "extraction_context": extraction_prompt
                            }
                            # Update the first existing text entry, or append if none
                            contents = document.get("contents", [])
                            contents_updated = False
                            for j, content in enumerate(contents):
                                if content.get("type") == "text":
                                    updated_message["documents"][i]["contents"][j] = new_content
                                    contents_updated = True
                                    break
                            if not contents_updated:
                                if not updated_message["documents"][i].get("contents"):
                                    updated_message["documents"][i]["contents"] = []
                                updated_message["documents"][i]["contents"].append(new_content)
                            logger.info(f"Updated image analysis for {file_name} with new context: {extraction_prompt}")
                        except Exception as e:
                            # Analysis failure leaves existing content untouched
                            logger.error(f"Error updating image analysis for {file_name}: {str(e)}")
                else:
                    # For other file types, extract text with new context
                    from modules.agentservice_utils import extract_text_from_file_content
                    content, is_extracted = extract_text_from_file_content(
                        file_content, file_name, content_type
                    )
                    new_content = {
                        "type": "text",
                        "text": content,
                        "is_extracted": is_extracted,
                        "extraction_context": extraction_prompt
                    }
                    # Update the first existing text entry, or append if none
                    contents = document.get("contents", [])
                    contents_updated = False
                    for j, content_item in enumerate(contents):
                        if content_item.get("type") == "text":
                            updated_message["documents"][i]["contents"][j] = new_content
                            contents_updated = True
                            break
                    if not contents_updated:
                        if not updated_message["documents"][i].get("contents"):
                            updated_message["documents"][i]["contents"] = []
                        updated_message["documents"][i]["contents"].append(new_content)
                    logger.info(f"Updated text extraction for {file_name} with new context: {extraction_prompt}")
            # Found and processed the document, stop searching
            break
    return updated_message
async def extract_files_from_workflow(self, workflow: Dict[str, Any], extraction_prompt: str, file_filter: str = None) -> Dict[str, Any]:
    """
    Run context-aware extraction over every file referenced in a workflow.

    Args:
        workflow: The workflow object whose messages are scanned for file documents.
        extraction_prompt: Contextual prompt passed to the extraction pipeline.
        file_filter: Optional substring matched (case-insensitively) against the
            file name or content type (e.g. "csv", "image").

    Returns:
        Dictionary with extracted content; an empty, well-formed result when
        the workflow references no matching files.
    """
    # Import for data extraction (project module, imported lazily)
    from modules.agentservice_dataextraction import data_extraction

    needle = file_filter.lower() if file_filter else None
    # Collect unique file documents across all messages, keyed by file id,
    # preserving first-seen order.
    unique_files: Dict[str, Dict[str, Any]] = {}
    for msg in workflow.get("messages", []):
        for doc in msg.get("documents", []):
            src = doc.get("source", {})
            # Only include file-backed documents
            if src.get("type") != "file":
                continue
            info = {
                "id": src.get("id", ""),
                "name": src.get("name", ""),
                "type": src.get("type", ""),
                "content_type": src.get("content_type", ""),
                "size": src.get("size", 0)
            }
            # Apply the optional filter against name and content type
            if needle is not None:
                name_l = info.get("name", "").lower()
                ctype_l = info.get("content_type", "").lower()
                if needle not in name_l and needle not in ctype_l:
                    continue
            unique_files.setdefault(info["id"], info)

    files = list(unique_files.values())
    # If no files found, return empty result
    if not files:
        return {
            "prompt": extraction_prompt,
            "files_processed": 0,
            "extracted_content": []
        }

    # Delegate the heavy lifting to the shared data-extraction pipeline
    return await data_extraction(
        prompt=extraction_prompt,
        files=files,
        messages=workflow.get("messages", []),
        ai_service=self.ai_service,
        lucydom_interface=self.lucydom_interface,
        workflow_id=self.workflow_id,
        add_log_func=None  # We don't have access to add_log_func here
    )
def get_file_content_from_message(self, message: Dict[str, Any], file_id: int = None, doc_id: str = None) -> str:
    """
    Look up the text content of a document attached to a message.

    Args:
        message: The message whose documents are searched.
        file_id: Optional file ID matched against the document source.
        doc_id: Optional document ID matched against the document itself.

    Returns:
        The first text content found for a matching document, or "" when
        nothing matches.
    """
    if not message or "documents" not in message:
        return ""
    for doc in message.get("documents", []):
        # A document matches on its own ID or on its source's file ID.
        src_id = doc.get("source", {}).get("id")
        matches_doc = bool(doc_id) and doc.get("id") == doc_id
        matches_file = bool(file_id) and bool(src_id) and str(file_id) == str(src_id)
        if not (matches_doc or matches_file):
            continue
        for part in doc.get("contents", []):
            if part.get("type") == "text":
                return part.get("text", "")
    return ""
def create_text_document(self, message: Dict[str, Any], content: str, title: str = "Generated Text") -> Dict[str, Any]:
    """
    Append a freshly generated text document to a message.

    Args:
        message: The message that receives the new document.
        content: Plain-text body of the document.
        title: Human-readable document title.

    Returns:
        A shallow copy of the message with the new document appended.
    """
    updated_message = message.copy()
    # Make sure the documents list exists before appending.
    updated_message.setdefault("documents", [])
    generated_id = f"doc_{uuid.uuid4()}"
    updated_message["documents"].append({
        "id": generated_id,
        "source": {
            "type": "generated",
            "id": generated_id,
            "name": title,
            "content_type": "text/plain",
            "size": len(content)
        },
        "contents": [
            {
                "type": "text",
                "text": content,
                "is_extracted": True
            }
        ]
    })
    logger.info(f"Created text document '{title}' in message")
    return updated_message
def merge_document_contents(self, message: Dict[str, Any]) -> str:
    """
    Concatenate the first text content of every document in a message.

    Args:
        message: The message whose documents are merged.

    Returns:
        A single string with one "--- <name> ---" section per document that
        has text content, or "" when the message carries no documents.
    """
    if not message or "documents" not in message:
        return ""
    sections = []
    for doc in message.get("documents", []):
        name = doc.get("source", {}).get("name", "Unnamed Document")
        # Only the first text entry of each document is merged.
        text = next(
            (part.get("text", "") for part in doc.get("contents", []) if part.get("type") == "text"),
            ""
        )
        if text:
            sections.append(f"\n\n--- {name} ---\n\n{text}")
    return "".join(sections).strip()
# Factory function
def get_document_handler(workflow_id: str = None, lucydom_interface = None, ai_service = None) -> DocumentHandler:
    """
    Factory for DocumentHandler instances.

    Args:
        workflow_id: Optional workflow identifier for the handler.
        lucydom_interface: Optional LucyDOM interface for file access.
        ai_service: Optional AI service for image/content analysis.

    Returns:
        A freshly constructed DocumentHandler.
    """
    return DocumentHandler(
        workflow_id=workflow_id,
        lucydom_interface=lucydom_interface,
        ai_service=ai_service,
    )