"""
Document processing method module.

Handles document operations using the document service.
"""
# Standard library imports
import logging
import uuid
from datetime import datetime, UTC
from typing import Dict, Any, List, Optional

# Project-local imports
from modules.workflow.managerDocument import DocumentManager
from modules.workflow.methodBase import MethodBase, ActionResult, action

# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
class MethodDocument(MethodBase):
    """Document method implementation for document operations.

    Exposes three AI-backed actions over a referenced document list:

    - ``extract``: pull raw content out of each document with an AI prompt.
    - ``analyze``: run an entity/topic/sentiment analysis over the combined content.
    - ``summarize``: produce a length-bounded summary of the combined content.

    All three actions share identical parameter validation and a per-file
    extraction pipeline, factored into the private helpers
    ``_validateCommonParams`` and ``_extractFromDocuments``.
    """

    # Maximum number of characters of combined document content embedded in an
    # AI prompt, to keep requests within model context limits.
    _MAX_PROMPT_CONTENT = 8000

    # Separator placed between the extracted contents of individual documents.
    _DOCUMENT_SEPARATOR = "\n\n--- DOCUMENT SEPARATOR ---\n\n"

    def __init__(self, serviceContainer: Any):
        """Initialize the document method.

        Args:
            serviceContainer: Dependency container providing file access and
                AI call interfaces (duck-typed; see the helper methods for the
                attributes used).
        """
        super().__init__(serviceContainer)
        self.name = "document"
        self.description = "Handle document operations like extraction and analysis"
        self.documentManager = DocumentManager(serviceContainer)

    def _validateCommonParams(self, documentList: Optional[str], aiPrompt: Optional[str]) -> Optional[ActionResult]:
        """Validate the parameters shared by all document actions.

        Returns:
            An error ``ActionResult`` if ``documentList`` or ``aiPrompt`` is
            missing/empty, or ``None`` when both are present.
        """
        if not documentList:
            return self._createResult(
                success=False,
                data={},
                error="Document list reference is required"
            )
        if not aiPrompt:
            return self._createResult(
                success=False,
                data={},
                error="AI prompt is required"
            )
        return None

    async def _extractFromDocuments(self, chatDocuments: List[Any], aiPrompt: str) -> tuple[List[str], List[Dict[str, Any]]]:
        """Extract content from every document in ``chatDocuments``.

        Files that are missing or empty are skipped with a warning rather than
        aborting the whole batch.

        Args:
            chatDocuments: Chat document records exposing a ``fileId`` attribute.
            aiPrompt: Prompt passed to the AI extraction service per file.

        Returns:
            ``(contents, fileInfos)``: parallel lists of extracted text and the
            corresponding file-info dicts, for the documents that yielded data.
        """
        contents: List[str] = []
        fileInfos: List[Dict[str, Any]] = []
        for chatDocument in chatDocuments:
            fileId = chatDocument.fileId
            fileData = self.serviceContainer.getFileData(fileId)
            fileInfo = self.serviceContainer.getFileInfo(fileId)

            if not fileData:
                # Best-effort batch: skip unreadable files instead of failing.
                logger.warning(f"File not found or empty for fileId: {fileId}")
                continue

            extracted = await self.serviceContainer.extractContentFromFileData(
                prompt=aiPrompt,
                fileData=fileData,
                filename=fileInfo.get('name', 'document'),
                mimeType=fileInfo.get('mimeType', 'application/octet-stream'),
                base64Encoded=False
            )
            contents.append(extracted)
            fileInfos.append(fileInfo)
        return contents, fileInfos

    @action
    async def extract(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        Extract content from document

        Parameters:
            documentList (str): Reference to the document list to extract content from
            aiPrompt (str): AI prompt for content extraction
            format (str, optional): Output format (default: "text")
            includeMetadata (bool, optional): Whether to include metadata (default: True)
        """
        try:
            documentList = parameters.get("documentList")
            aiPrompt = parameters.get("aiPrompt")
            # Named outputFormat to avoid shadowing the builtin `format`;
            # the parameter key seen by callers is still "format".
            outputFormat = parameters.get("format", "text")
            includeMetadata = parameters.get("includeMetadata", True)

            paramError = self._validateCommonParams(documentList, aiPrompt)
            if paramError is not None:
                return paramError

            chatDocuments = self.serviceContainer.getChatDocumentsFromDocumentReference(documentList)
            if not chatDocuments:
                return self._createResult(
                    success=False,
                    data={},
                    error="No documents found for the provided reference"
                )

            contents, fileInfos = await self._extractFromDocuments(chatDocuments, aiPrompt)
            if not contents:
                return self._createResult(
                    success=False,
                    data={},
                    error="No content could be extracted from any documents"
                )

            combinedContent = self._DOCUMENT_SEPARATOR.join(contents)

            resultData = {
                "documentCount": len(chatDocuments),
                "format": outputFormat,
                "content": combinedContent,
                # Empty list collapses to None when metadata is disabled,
                # matching the historical result shape.
                "fileInfos": fileInfos if includeMetadata else None,
                "timestamp": datetime.now(UTC).isoformat()
            }

            return self._createResult(
                success=True,
                data={
                    "documentName": f"extracted_content_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.txt",
                    "documentData": resultData
                }
            )
        except Exception as e:
            logger.error(f"Error extracting content: {str(e)}")
            return self._createResult(
                success=False,
                data={},
                error=str(e)
            )

    @action
    async def analyze(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        Analyze document content

        Parameters:
            documentList (str): Reference to the document list to analyze
            aiPrompt (str): AI prompt for content analysis
            analysis (List[str], optional): Types of analysis to perform (default: ["entities", "topics", "sentiment"])
        """
        try:
            documentList = parameters.get("documentList")
            aiPrompt = parameters.get("aiPrompt")
            analysis = parameters.get("analysis", ["entities", "topics", "sentiment"])

            paramError = self._validateCommonParams(documentList, aiPrompt)
            if paramError is not None:
                return paramError

            chatDocuments = self.serviceContainer.getChatDocumentsFromDocumentReference(documentList)
            if not chatDocuments:
                return self._createResult(
                    success=False,
                    data={},
                    error="No documents found for the provided reference"
                )

            contents, _ = await self._extractFromDocuments(chatDocuments, aiPrompt)
            if not contents:
                return self._createResult(
                    success=False,
                    data={},
                    error="No content could be extracted from any documents"
                )

            combinedContent = self._DOCUMENT_SEPARATOR.join(contents)

            # Content is truncated to keep the prompt within model limits.
            # (The cap used to be annotated inline, which leaked the literal
            # text "# Limit content length" into the prompt sent to the AI.)
            analysisPrompt = f"""
            Analyze this document content for the following aspects:
            {', '.join(analysis)}

            Document content:
            {combinedContent[:self._MAX_PROMPT_CONTENT]}

            Please provide a detailed analysis including:
            1. Key entities (people, organizations, locations, dates)
            2. Main topics and themes
            3. Sentiment analysis (positive, negative, neutral)
            4. Key insights and patterns
            5. Important relationships between entities
            6. Document structure and organization
            """

            analysisResult = await self.serviceContainer.interfaceAiCalls.callAiTextAdvanced(analysisPrompt)

            resultData = {
                "documentCount": len(chatDocuments),
                "analysis": analysis,
                "results": analysisResult,
                "content": combinedContent,
                "timestamp": datetime.now(UTC).isoformat()
            }

            return self._createResult(
                success=True,
                data={
                    "documentName": f"document_analysis_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
                    "documentData": resultData
                }
            )
        except Exception as e:
            logger.error(f"Error analyzing content: {str(e)}")
            return self._createResult(
                success=False,
                data={},
                error=str(e)
            )

    @action
    async def summarize(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        Summarize document content

        Parameters:
            documentList (str): Reference to the document list to summarize
            aiPrompt (str): AI prompt for content extraction
            maxLength (int, optional): Maximum length of summary in words (default: 200)
            format (str, optional): Output format (default: "text")
        """
        try:
            documentList = parameters.get("documentList")
            aiPrompt = parameters.get("aiPrompt")
            maxLength = parameters.get("maxLength", 200)
            # Named outputFormat to avoid shadowing the builtin `format`;
            # the parameter key seen by callers is still "format".
            outputFormat = parameters.get("format", "text")

            paramError = self._validateCommonParams(documentList, aiPrompt)
            if paramError is not None:
                return paramError

            chatDocuments = self.serviceContainer.getChatDocumentsFromDocumentReference(documentList)
            if not chatDocuments:
                return self._createResult(
                    success=False,
                    data={},
                    error="No documents found for the provided reference"
                )

            contents, _ = await self._extractFromDocuments(chatDocuments, aiPrompt)
            if not contents:
                return self._createResult(
                    success=False,
                    data={},
                    error="No content could be extracted from any documents"
                )

            combinedContent = self._DOCUMENT_SEPARATOR.join(contents)

            # Content is truncated to keep the prompt within model limits
            # (comment kept out of the f-string so it is not sent to the AI).
            summaryPrompt = f"""
            Create a comprehensive summary of this document content.

            Document content:
            {combinedContent[:self._MAX_PROMPT_CONTENT]}

            Requirements:
            - Maximum length: {maxLength} words
            - Format: {outputFormat}
            - Include key points and main ideas
            - Maintain accuracy and completeness
            - Use clear, professional language
            - Highlight important insights and conclusions
            """

            summary = await self.serviceContainer.interfaceAiCalls.callAiTextAdvanced(summaryPrompt)

            resultData = {
                "documentCount": len(chatDocuments),
                "maxLength": maxLength,
                "format": outputFormat,
                "summary": summary,
                "wordCount": len(summary.split()),
                "originalContent": combinedContent,
                "timestamp": datetime.now(UTC).isoformat()
            }

            return self._createResult(
                success=True,
                data={
                    "documentName": f"document_summary_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.txt",
                    "documentData": resultData
                }
            )
        except Exception as e:
            logger.error(f"Error summarizing content: {str(e)}")
            return self._createResult(
                success=False,
                data={},
                error=str(e)
            )