"""
|
|
Document processing method module.
|
|
Handles document operations using the document service.
|
|
"""
|
|
|
|
import html
import logging
import uuid
from datetime import datetime, UTC
from typing import Dict, Any, List, Optional

from modules.chat.methodBase import MethodBase, ActionResult, action
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class MethodDocument(MethodBase):
    """Document method implementation for document operations"""

    def __init__(self, serviceCenter: Any):
        """Initialize the document method with its registry name and description."""
        super().__init__(serviceCenter)
        # Identifier under which this method is registered and a short
        # human-readable summary of what it offers.
        self.name = "document"
        self.description = "Handle document operations like extraction and analysis"

@action
|
|
async def extract(self, parameters: Dict[str, Any]) -> ActionResult:
|
|
"""
|
|
Extract specific content from document with ai prompt and return it in the specified format
|
|
|
|
Parameters:
|
|
documentList (str): Reference to the document list to extract content from
|
|
aiPrompt (str): AI prompt for content extraction
|
|
includeMetadata (bool, optional): Whether to include metadata (default: True)
|
|
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
|
|
"""
|
|
try:
|
|
documentList = parameters.get("documentList")
|
|
aiPrompt = parameters.get("aiPrompt")
|
|
includeMetadata = parameters.get("includeMetadata", True)
|
|
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
|
|
|
if not documentList:
|
|
return self._createResult(
|
|
success=False,
|
|
data={},
|
|
error="Document list reference is required"
|
|
)
|
|
|
|
if not aiPrompt:
|
|
return self._createResult(
|
|
success=False,
|
|
data={},
|
|
error="AI prompt is required"
|
|
)
|
|
|
|
chatDocuments = self.service.getChatDocumentsFromDocumentList(documentList)
|
|
if not chatDocuments:
|
|
return self._createResult(
|
|
success=False,
|
|
data={},
|
|
error="No documents found for the provided reference"
|
|
)
|
|
|
|
# Determine output format based on expected formats
|
|
output_extension = ".txt" # Default
|
|
output_mime_type = "text/plain" # Default
|
|
|
|
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
|
# Use the first expected format
|
|
expected_format = expectedDocumentFormats[0]
|
|
output_extension = expected_format.get("extension", ".txt")
|
|
output_mime_type = expected_format.get("mimeType", "text/plain")
|
|
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
|
logger.info(f"Expected document formats: {expectedDocumentFormats}")
|
|
else:
|
|
logger.info("No expected format specified, using default .txt format")
|
|
|
|
# Enhance AI prompt to specify output format
|
|
enhanced_prompt = aiPrompt
|
|
if output_extension == ".csv":
|
|
enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure CSV data without any markdown formatting, code blocks, or additional text. Output only the CSV content with proper headers and data rows. Do not include ```csv or ``` markers."
|
|
elif output_extension == ".json":
|
|
enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure JSON data without any markdown formatting, code blocks, or additional text. Output only the JSON content. Do not include ```json or ``` markers."
|
|
elif output_extension == ".xml":
|
|
enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure XML data without any markdown formatting, code blocks, or additional text. Output only the XML content. Do not include ```xml or ``` markers."
|
|
elif output_extension != ".txt":
|
|
enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure {output_extension.upper()} data without any markdown formatting, code blocks, or additional text. Output only the {output_extension.upper()} content. Do not include any markdown markers."
|
|
|
|
# Extract content from all documents
|
|
all_extracted_content = []
|
|
file_infos = []
|
|
|
|
for chatDocument in chatDocuments:
|
|
fileId = chatDocument.fileId
|
|
file_data = self.service.getFileData(fileId)
|
|
file_info = self.service.getFileInfo(fileId)
|
|
|
|
if not file_data:
|
|
logger.warning(f"File not found or empty for fileId: {fileId}")
|
|
continue
|
|
|
|
extracted_content = await self.service.extractContentFromFileData(
|
|
prompt=enhanced_prompt, # Use enhanced prompt instead of original
|
|
fileData=file_data,
|
|
filename=file_info.get('name', 'document'),
|
|
mimeType=file_info.get('mimeType', 'application/octet-stream'),
|
|
base64Encoded=False,
|
|
documentId=chatDocument.id
|
|
)
|
|
|
|
all_extracted_content.append(extracted_content)
|
|
if includeMetadata:
|
|
file_infos.append(file_info)
|
|
|
|
if not all_extracted_content:
|
|
return self._createResult(
|
|
success=False,
|
|
data={},
|
|
error="No content could be extracted from any documents"
|
|
)
|
|
|
|
# Extract text content from ExtractedContent objects
|
|
text_contents = []
|
|
for content_obj in all_extracted_content:
|
|
if hasattr(content_obj, 'contents') and content_obj.contents:
|
|
# Extract text from ContentItem objects
|
|
for content_item in content_obj.contents:
|
|
if hasattr(content_item, 'data') and content_item.data:
|
|
text_contents.append(content_item.data)
|
|
elif isinstance(content_obj, str):
|
|
text_contents.append(content_obj)
|
|
else:
|
|
# Fallback: convert to string representation
|
|
text_contents.append(str(content_obj))
|
|
|
|
# Process each document individually and create separate output files
|
|
output_documents = []
|
|
|
|
for i, (chatDocument, extracted_content) in enumerate(zip(chatDocuments, all_extracted_content)):
|
|
# Extract text content from this document
|
|
text_content = ""
|
|
if hasattr(extracted_content, 'contents') and extracted_content.contents:
|
|
# Extract text from ContentItem objects
|
|
for content_item in extracted_content.contents:
|
|
if hasattr(content_item, 'data') and content_item.data:
|
|
text_content += content_item.data + "\n"
|
|
elif isinstance(extracted_content, str):
|
|
text_content = extracted_content
|
|
else:
|
|
# Fallback: convert to string representation
|
|
text_content = str(extracted_content)
|
|
|
|
# Create output filename based on original filename
|
|
original_filename = chatDocument.filename
|
|
base_name = original_filename.rsplit('.', 1)[0] if '.' in original_filename else original_filename
|
|
output_filename = f"{base_name}_extracted_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}"
|
|
|
|
# Create result data for this document
|
|
result_data = {
|
|
"documentCount": 1,
|
|
"content": text_content,
|
|
"originalFilename": original_filename,
|
|
"fileInfos": [file_infos[i]] if includeMetadata and i < len(file_infos) else None,
|
|
"timestamp": datetime.now(UTC).isoformat()
|
|
}
|
|
|
|
logger.info(f"Created output document: {output_filename} with {len(text_content)} characters")
|
|
logger.info(f"Content preview: {text_content[:200]}...")
|
|
|
|
output_documents.append({
|
|
"documentName": output_filename,
|
|
"documentData": result_data,
|
|
"mimeType": output_mime_type
|
|
})
|
|
|
|
return self._createResult(
|
|
success=True,
|
|
data={
|
|
"documents": output_documents
|
|
}
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"Error extracting content: {str(e)}")
|
|
return self._createResult(
|
|
success=False,
|
|
data={},
|
|
error=str(e)
|
|
)
|
|
|
|
@action
|
|
async def generateReport(self, parameters: Dict[str, Any]) -> ActionResult:
|
|
"""
|
|
Generate a comprehensive, professional HTML report from multiple documents, consolidating and summarizing all findings using AI.
|
|
|
|
Parameters:
|
|
documentList (str): Reference to the document list to create the report from
|
|
title (str, optional): Title for the report (default: "Summary Report")
|
|
includeMetadata (bool, optional): Whether to include metadata (default: True)
|
|
"""
|
|
try:
|
|
documentList = parameters.get("documentList")
|
|
title = parameters.get("title", "Summary Report")
|
|
includeMetadata = parameters.get("includeMetadata", True)
|
|
|
|
if not documentList:
|
|
return self._createResult(
|
|
success=False,
|
|
data={},
|
|
error="Document list reference is required"
|
|
)
|
|
|
|
chatDocuments = self.service.getChatDocumentsFromDocumentList(documentList)
|
|
if not chatDocuments:
|
|
return self._createResult(
|
|
success=False,
|
|
data={},
|
|
error="No documents found for the provided reference"
|
|
)
|
|
|
|
# Generate HTML report
|
|
html_content = self._generateHtmlReport(chatDocuments, title, includeMetadata)
|
|
|
|
# Create output filename
|
|
timestamp = datetime.now(UTC).strftime('%Y%m%d_%H%M%S')
|
|
output_filename = f"report_{timestamp}.html"
|
|
|
|
result_data = {
|
|
"documentCount": len(chatDocuments),
|
|
"content": html_content,
|
|
"title": title,
|
|
"timestamp": datetime.now(UTC).isoformat()
|
|
}
|
|
|
|
logger.info(f"Generated HTML report: {output_filename} with {len(html_content)} characters")
|
|
|
|
return self._createResult(
|
|
success=True,
|
|
data={
|
|
"documents": [{
|
|
"documentName": output_filename,
|
|
"documentData": result_data,
|
|
"mimeType": "text/html"
|
|
}]
|
|
}
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"Error generating report: {str(e)}")
|
|
return self._createResult(
|
|
success=False,
|
|
data={},
|
|
error=str(e)
|
|
)
|
|
|
|
def _generateHtmlReport(self, chatDocuments: List[Any], title: str, includeMetadata: bool) -> str:
|
|
"""
|
|
Generate a comprehensive HTML report using AI from all input documents.
|
|
"""
|
|
try:
|
|
# Filter out empty documents and collect content
|
|
validDocuments = []
|
|
allContent = []
|
|
|
|
for doc in chatDocuments:
|
|
content = ""
|
|
if hasattr(doc, 'content') and doc.content:
|
|
content = doc.content.strip()
|
|
elif hasattr(doc, 'data') and doc.data:
|
|
content = doc.data.strip()
|
|
|
|
# Skip empty documents
|
|
if content:
|
|
validDocuments.append(doc)
|
|
allContent.append(f"Document: {doc.filename}\n{content}\n")
|
|
|
|
if not validDocuments:
|
|
# If no valid documents, create a simple report
|
|
html = ["<html><head><meta charset='utf-8'><title>" + title + "</title></head><body>"]
|
|
html.append(f"<h1>{title}</h1>")
|
|
html.append(f"<p><b>Generated:</b> {datetime.now(UTC).strftime('%Y-%m-%d %H:%M:%S UTC')}</p>")
|
|
html.append("<p><em>No content available in the provided documents.</em></p>")
|
|
html.append("</body></html>")
|
|
return '\n'.join(html)
|
|
|
|
# Create AI prompt for comprehensive report generation
|
|
combinedContent = "\n\n".join(allContent)
|
|
aiPrompt = f"""
|
|
Create a comprehensive, well-structured HTML report based on the following documents and content.
|
|
|
|
Report Title: {title}
|
|
|
|
Requirements:
|
|
1. Create a professional, well-formatted HTML report
|
|
2. Include an executive summary at the beginning
|
|
3. Organize information logically with clear sections
|
|
4. Highlight key findings and insights
|
|
5. Include relevant data, statistics, and conclusions
|
|
6. Use proper HTML formatting with headers, lists, and styling
|
|
7. Make it readable and professional
|
|
|
|
Document Content:
|
|
{combinedContent}
|
|
|
|
Generate a complete HTML report that integrates all the information into a cohesive, professional document.
|
|
"""
|
|
|
|
# Call AI to generate the report
|
|
logger.info(f"Generating AI report for {len(validDocuments)} documents")
|
|
aiReport = self.service.callAiTextBasic(aiPrompt, combinedContent)
|
|
|
|
# If AI call fails, fall back to basic HTML
|
|
if not aiReport or aiReport.strip() == "":
|
|
logger.warning("AI report generation failed, using fallback HTML")
|
|
return self._generateFallbackHtmlReport(validDocuments, title, includeMetadata)
|
|
|
|
# Clean up the AI response and ensure it's valid HTML
|
|
if not aiReport.strip().startswith('<html'):
|
|
# Wrap the AI content in proper HTML structure
|
|
html = ["<html><head><meta charset='utf-8'><title>" + title + "</title></head><body>"]
|
|
html.append(f"<h1>{title}</h1>")
|
|
html.append(f"<p><b>Generated:</b> {datetime.now(UTC).strftime('%Y-%m-%d %H:%M:%S UTC')}</p>")
|
|
html.append(f"<p><b>Total Documents Analyzed:</b> {len(validDocuments)}</p>")
|
|
html.append("<hr>")
|
|
html.append(aiReport)
|
|
html.append("</body></html>")
|
|
return '\n'.join(html)
|
|
else:
|
|
# AI returned complete HTML, use it directly
|
|
return aiReport
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error generating AI report: {str(e)}")
|
|
# Fall back to basic HTML report
|
|
return self._generateFallbackHtmlReport(chatDocuments, title, includeMetadata)
|
|
|
|
def _generateFallbackHtmlReport(self, chatDocuments: List[Any], title: str, includeMetadata: bool) -> str:
|
|
"""
|
|
Generate a basic HTML report as fallback when AI generation fails.
|
|
"""
|
|
html = ["<html><head><meta charset='utf-8'><title>" + title + "</title></head><body>"]
|
|
html.append(f"<h1>{title}</h1>")
|
|
html.append(f"<p><b>Generated:</b> {datetime.now(UTC).strftime('%Y-%m-%d %H:%M:%S UTC')}</p>")
|
|
html.append(f"<p><b>Total Documents:</b> {len(chatDocuments)}</p>")
|
|
|
|
for i, doc in enumerate(chatDocuments, 1):
|
|
html.append(f"<h2>Document {i}: {doc.filename}</h2>")
|
|
|
|
if includeMetadata:
|
|
html.append("<ul>")
|
|
html.append(f"<li><b>ID:</b> {doc.id}</li>")
|
|
html.append(f"<li><b>File ID:</b> {doc.fileId}</li>")
|
|
html.append(f"<li><b>Filename:</b> {doc.filename}</li>")
|
|
if hasattr(doc, 'createdAt'):
|
|
html.append(f"<li><b>Created:</b> {doc.createdAt}</li>")
|
|
html.append("</ul>")
|
|
|
|
# Add document content if available
|
|
content = ""
|
|
if hasattr(doc, 'content') and doc.content:
|
|
content = doc.content
|
|
elif hasattr(doc, 'data') and doc.data:
|
|
content = doc.data
|
|
|
|
if content:
|
|
html.append(f"<div style='white-space:pre-wrap; border:1px solid #ccc; padding:0.5em; margin-bottom:1em; background-color:#f9f9f9;'>{content}</div>")
|
|
else:
|
|
html.append("<p><em>No content available</em></p>")
|
|
|
|
html.append("</body></html>")
|
|
return '\n'.join(html)
|