gateway/modules/methods/methodDocument.py
2025-07-18 14:20:11 +02:00

368 lines
17 KiB
Python

"""
Document processing method module.
Handles document operations using the document service.
"""
import html
import logging
import uuid
from datetime import datetime, UTC
from typing import Dict, Any, List, Optional

from modules.chat.methodBase import MethodBase, ActionResult, action
# Module-level logger, named after this module, used by every method below.
logger = logging.getLogger(__name__)
class MethodDocument(MethodBase):
    """Document method implementation for document operations.

    Provides two actions: ``extract`` (AI-driven content extraction, one
    output document per input document) and ``generateReport`` (a single HTML
    report built from several documents). File access and AI calls go through
    ``self.service`` and results are built with ``self._createResult``; neither
    is defined in this class (presumably both come from ``MethodBase`` —
    confirm there).
    """

    def __init__(self, serviceCenter: Any):
        """Initialize the document method.

        Parameters:
        serviceCenter (Any): Forwarded unchanged to ``MethodBase.__init__``.
        """
        super().__init__(serviceCenter)
        self.name = "document"
        self.description = "Handle document operations like extraction and analysis"

    # ------------------------------------------------------------------
    # Shared private helpers
    # ------------------------------------------------------------------

    def _failure(self, message: str) -> ActionResult:
        """Build the failed result shape used by every action: empty data plus an error text."""
        return self._createResult(success=False, data={}, error=message)

    @staticmethod
    def _resolveOutputFormat(expectedDocumentFormats: Any) -> tuple[str, str]:
        """Return ``(extension, mimeType)`` taken from the first expected format.

        Falls back to ``(".txt", "text/plain")`` when no format is given or a
        key is missing/empty. An extension supplied without a leading dot
        (``"csv"``) is normalized to ``".csv"`` so the prompt selection and the
        generated filename behave the same as for the dotted form.
        """
        if not expectedDocumentFormats:
            logger.info("No expected format specified, using default .txt format")
            return ".txt", "text/plain"

        # Only the first entry is honoured; further entries are ignored.
        firstFormat = expectedDocumentFormats[0]
        extension = firstFormat.get("extension") or ".txt"
        mimeType = firstFormat.get("mimeType") or "text/plain"
        if not extension.startswith("."):
            extension = f".{extension}"

        logger.info("Using expected format: %s (%s)", extension, mimeType)
        logger.info("Expected document formats: %s", expectedDocumentFormats)
        return extension, mimeType

    @staticmethod
    def _buildExtractionPrompt(aiPrompt: str, outputExtension: str) -> str:
        """Append an output-format instruction to ``aiPrompt``.

        ``.txt`` leaves the prompt untouched. ``.csv``, ``.json`` and ``.xml``
        get a dedicated instruction; any other extension gets a generic one
        that names the upper-cased extension.
        """
        kind = outputExtension.lower()
        if kind == ".txt":
            return aiPrompt

        label = {".csv": "CSV", ".json": "JSON", ".xml": "XML"}.get(kind)
        if label is None:
            label = outputExtension.upper()
            tail = f"Output only the {label} content. Do not include any markdown markers."
        elif label == "CSV":
            tail = (
                "Output only the CSV content with proper headers and data rows. "
                "Do not include ```csv or ``` markers."
            )
        else:
            tail = (
                f"Output only the {label} content. "
                f"Do not include ```{label.lower()} or ``` markers."
            )

        return (
            f"{aiPrompt}\n\nCRITICAL: Deliver the result as pure {label} data "
            "without any markdown formatting, code blocks, or additional text. "
            f"{tail}"
        )

    @staticmethod
    def _contentToText(extractedContent: Any) -> str:
        """Flatten one extraction result into plain text.

        Objects with a non-empty ``contents`` sequence contribute the ``data``
        of each item that has one, each followed by a newline. Plain strings
        are returned as-is; anything else falls back to ``str()``.
        """
        items = getattr(extractedContent, "contents", None)
        if items:
            return "".join(
                item.data + "\n" for item in items if getattr(item, "data", None)
            )
        if isinstance(extractedContent, str):
            return extractedContent
        return str(extractedContent)

    @staticmethod
    def _documentText(doc: Any) -> Any:
        """Return ``doc.content`` if truthy, else ``doc.data`` if truthy, else ``""``."""
        return getattr(doc, "content", None) or getattr(doc, "data", None) or ""

    @staticmethod
    def _htmlHeader(title: str) -> List[str]:
        """Return the opening lines shared by every report: head, ``<h1>`` and generation time.

        The title is HTML-escaped because it comes from the action parameters.
        """
        safeTitle = html.escape(str(title))
        generated = datetime.now(UTC).strftime('%Y-%m-%d %H:%M:%S UTC')
        return [
            f"<html><head><meta charset='utf-8'><title>{safeTitle}</title></head><body>",
            f"<h1>{safeTitle}</h1>",
            f"<p><b>Generated:</b> {generated}</p>",
        ]

    @staticmethod
    def _stripCodeFence(text: str) -> str:
        """Trim whitespace and remove one surrounding Markdown code fence, if present."""
        stripped = text.strip()
        if not stripped.startswith("```"):
            return stripped
        firstBreak = stripped.find("\n")
        if firstBreak == -1:
            # A lone fence line with no body: nothing sensible to unwrap.
            return stripped
        body = stripped[firstBreak + 1:].rstrip()
        if body.endswith("```"):
            body = body[:-3]
        return body.strip()

    # ------------------------------------------------------------------
    # Actions
    # ------------------------------------------------------------------

    @action
    async def extract(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        Extract specific content from document with ai prompt and return it in the specified format
        Parameters:
        documentList (str): Reference to the document list to extract content from
        aiPrompt (str): AI prompt for content extraction
        includeMetadata (bool, optional): Whether to include metadata (default: True)
        expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
        """
        # NOTE(review): the docstring above is kept in its original wording in
        # case the @action decorator reads it at runtime — confirm before
        # reformatting it.
        #
        # Returns a result whose data is {"documents": [...]}, one entry per
        # input document that had file data. Any exception is logged and
        # turned into a failed result instead of propagating.
        try:
            documentList = parameters.get("documentList")
            aiPrompt = parameters.get("aiPrompt")
            includeMetadata = parameters.get("includeMetadata", True)
            expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])

            if not documentList:
                return self._failure("Document list reference is required")
            if not aiPrompt:
                return self._failure("AI prompt is required")

            chatDocuments = self.service.getChatDocumentsFromDocumentList(documentList)
            if not chatDocuments:
                return self._failure("No documents found for the provided reference")

            outputExtension, outputMimeType = self._resolveOutputFormat(expectedDocumentFormats)
            enhancedPrompt = self._buildExtractionPrompt(aiPrompt, outputExtension)

            # Keep document, extraction and file info together. Documents
            # without file data are skipped, so pairing by list position
            # afterwards would attach content to the wrong filename.
            extractions = []
            for chatDocument in chatDocuments:
                fileId = chatDocument.fileId
                fileData = self.service.getFileData(fileId)
                if not fileData:
                    logger.warning(f"File not found or empty for fileId: {fileId}")
                    continue

                # Tolerate a missing info record; the defaults below then apply.
                fileInfo = self.service.getFileInfo(fileId) or {}
                extracted = await self.service.extractContentFromFileData(
                    prompt=enhancedPrompt,
                    fileData=fileData,
                    filename=fileInfo.get('name', 'document'),
                    mimeType=fileInfo.get('mimeType', 'application/octet-stream'),
                    base64Encoded=False,
                    documentId=chatDocument.id
                )
                extractions.append((chatDocument, extracted, fileInfo))

            if not extractions:
                return self._failure("No content could be extracted from any documents")

            outputDocuments = []
            usedNames = set()
            for chatDocument, extracted, fileInfo in extractions:
                textContent = self._contentToText(extracted)
                now = datetime.now(UTC)

                originalFilename = getattr(chatDocument, "filename", None) or "document"
                # Drop the last extension; keep the full name if that would leave nothing.
                baseName = originalFilename.rsplit('.', 1)[0] or originalFilename
                stem = f"{baseName}_extracted_{now.strftime('%Y%m%d_%H%M%S')}"

                # Two inputs with the same base name processed within the same
                # second would otherwise get identical output names.
                outputFilename = f"{stem}{outputExtension}"
                serial = 1
                while outputFilename in usedNames:
                    serial += 1
                    outputFilename = f"{stem}_{serial}{outputExtension}"
                usedNames.add(outputFilename)

                resultData = {
                    "documentCount": 1,
                    "content": textContent,
                    "originalFilename": originalFilename,
                    "fileInfos": [fileInfo] if includeMetadata else None,
                    "timestamp": now.isoformat()
                }

                logger.info(f"Created output document: {outputFilename} with {len(textContent)} characters")
                # Document text may be sensitive, so the preview is debug-only.
                logger.debug(f"Content preview: {textContent[:200]}...")

                outputDocuments.append({
                    "documentName": outputFilename,
                    "documentData": resultData,
                    "mimeType": outputMimeType
                })

            return self._createResult(
                success=True,
                data={
                    "documents": outputDocuments
                }
            )
        except Exception as e:
            # Action boundary: log with traceback and report the failure.
            logger.exception("Error extracting content: %s", e)
            return self._failure(str(e))

    @action
    async def generateReport(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        Generate a comprehensive, professional HTML report from multiple documents, consolidating and summarizing all findings using AI.
        Parameters:
        documentList (str): Reference to the document list to create the report from
        title (str, optional): Title for the report (default: "Summary Report")
        includeMetadata (bool, optional): Whether to include metadata (default: True)
        """
        # NOTE(review): docstring wording kept as-is in case the @action
        # decorator reads it at runtime — confirm before reformatting.
        #
        # Returns a result whose data is {"documents": [<one text/html entry>]}.
        # Any exception is logged and turned into a failed result.
        try:
            documentList = parameters.get("documentList")
            # `or` also covers an explicit None/empty title, which would
            # otherwise break the string building in the report generators.
            title = parameters.get("title") or "Summary Report"
            includeMetadata = parameters.get("includeMetadata", True)

            if not documentList:
                return self._failure("Document list reference is required")

            chatDocuments = self.service.getChatDocumentsFromDocumentList(documentList)
            if not chatDocuments:
                return self._failure("No documents found for the provided reference")

            htmlContent = self._generateHtmlReport(chatDocuments, title, includeMetadata)

            now = datetime.now(UTC)
            outputFilename = f"report_{now.strftime('%Y%m%d_%H%M%S')}.html"
            resultData = {
                "documentCount": len(chatDocuments),
                "content": htmlContent,
                "title": title,
                "timestamp": now.isoformat()
            }

            logger.info(f"Generated HTML report: {outputFilename} with {len(htmlContent)} characters")
            return self._createResult(
                success=True,
                data={
                    "documents": [{
                        "documentName": outputFilename,
                        "documentData": resultData,
                        "mimeType": "text/html"
                    }]
                }
            )
        except Exception as e:
            logger.exception("Error generating report: %s", e)
            return self._failure(str(e))

    # ------------------------------------------------------------------
    # Report generation
    # ------------------------------------------------------------------

    def _generateHtmlReport(self, chatDocuments: List[Any], title: str, includeMetadata: bool) -> str:
        """
        Generate a comprehensive HTML report using AI from all input documents.

        Documents without ``content``/``data`` are ignored. If none remain, a
        minimal "no content" page is returned. If the AI call yields nothing
        usable or anything raises, the fallback report is returned instead;
        ``includeMetadata`` is only used by that fallback.
        """
        try:
            validDocuments = []
            sections = []
            for doc in chatDocuments:
                text = self._documentText(doc).strip()
                if text:
                    validDocuments.append(doc)
                    sections.append(f"Document: {doc.filename}\n{text}\n")

            if not validDocuments:
                page = self._htmlHeader(title)
                page.append("<p><em>No content available in the provided documents.</em></p>")
                page.append("</body></html>")
                return '\n'.join(page)

            combinedContent = "\n\n".join(sections)
            # Built line by line so the method's indentation never leaks into
            # the prompt text. The empty first/last entries keep the leading
            # and trailing newline of the prompt.
            aiPrompt = "\n".join([
                "",
                "Create a comprehensive, well-structured HTML report based on the following documents and content.",
                f"Report Title: {title}",
                "Requirements:",
                "1. Create a professional, well-formatted HTML report",
                "2. Include an executive summary at the beginning",
                "3. Organize information logically with clear sections",
                "4. Highlight key findings and insights",
                "5. Include relevant data, statistics, and conclusions",
                "6. Use proper HTML formatting with headers, lists, and styling",
                "7. Make it readable and professional",
                "Document Content:",
                f"{combinedContent}",
                "Generate a complete HTML report that integrates all the information into a cohesive, professional document.",
                "",
            ])

            logger.info(f"Generating AI report for {len(validDocuments)} documents")
            # NOTE(review): this call is not awaited although `extract` awaits
            # its service call. If callAiTextBasic is a coroutine function the
            # result is never a str and the fallback below is always taken —
            # confirm against the service implementation.
            # NOTE(review): the document text is sent twice (inside the prompt
            # and as the second argument) — confirm that is intended.
            aiReport = self.service.callAiTextBasic(aiPrompt, combinedContent)

            cleaned = self._stripCodeFence(aiReport) if isinstance(aiReport, str) else ""
            if not cleaned:
                logger.warning("AI report generation failed, using fallback HTML")
                return self._generateFallbackHtmlReport(validDocuments, title, includeMetadata)

            # A full page may start with a doctype or an upper-case tag;
            # wrapping such a reply again would nest <html> inside <html>.
            if cleaned.lower().startswith(("<!doctype", "<html")):
                return cleaned

            page = self._htmlHeader(title)
            page.append(f"<p><b>Total Documents Analyzed:</b> {len(validDocuments)}</p>")
            page.append("<hr>")
            # The AI reply is expected to be HTML markup, so it is not escaped.
            page.append(cleaned)
            page.append("</body></html>")
            return '\n'.join(page)
        except Exception as e:
            logger.exception("Error generating AI report: %s", e)
            return self._generateFallbackHtmlReport(chatDocuments, title, includeMetadata)

    def _generateFallbackHtmlReport(self, chatDocuments: List[Any], title: str, includeMetadata: bool) -> str:
        """
        Generate a basic HTML report as fallback when AI generation fails.

        Lists every document with its filename, optionally its id / file id /
        filename / creation time, and its raw content. All interpolated values
        are HTML-escaped so document text cannot inject markup.
        """
        page = self._htmlHeader(title)
        page.append(f"<p><b>Total Documents:</b> {len(chatDocuments)}</p>")

        for index, doc in enumerate(chatDocuments, 1):
            safeFilename = html.escape(str(doc.filename))
            page.append(f"<h2>Document {index}: {safeFilename}</h2>")

            if includeMetadata:
                page.append("<ul>")
                page.append(f"<li><b>ID:</b> {html.escape(str(doc.id))}</li>")
                page.append(f"<li><b>File ID:</b> {html.escape(str(doc.fileId))}</li>")
                page.append(f"<li><b>Filename:</b> {safeFilename}</li>")
                if hasattr(doc, 'createdAt'):
                    page.append(f"<li><b>Created:</b> {html.escape(str(doc.createdAt))}</li>")
                page.append("</ul>")

            content = self._documentText(doc)
            if content:
                page.append(
                    "<div style='white-space:pre-wrap; border:1px solid #ccc; padding:0.5em; "
                    f"margin-bottom:1em; background-color:#f9f9f9;'>{html.escape(str(content))}</div>"
                )
            else:
                page.append("<p><em>No content available</em></p>")

        page.append("</body></html>")
        return '\n'.join(page)