215 lines
No EOL
7.7 KiB
Python
215 lines
No EOL
7.7 KiB
Python
"""
Document processing method module.

Handles document operations using the document service.
"""
|
|
|
|
import logging
|
|
from typing import Dict, Any, List, Optional
|
|
from datetime import datetime
|
|
|
|
from modules.interfaces.serviceChatModel import (
|
|
ChatDocument,
|
|
TaskDocument,
|
|
ExtractedContent,
|
|
ContentItem
|
|
)
|
|
from modules.workflow.managerDocument import DocumentManager
|
|
from modules.methods.methodBase import MethodBase
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
class MethodDocument(MethodBase):
    """Document processing method implementation.

    Dispatches the actions "extract", "analyze" and "summarize" to private
    handlers. Every handler reports its outcome as a dictionary with a
    boolean "success" key; on failure the dictionary carries an "error"
    message instead of the handler raising.
    """

    def __init__(self, serviceContainer):
        """Initialize the document method.

        Args:
            serviceContainer: Forwarded to MethodBase and used to build the
                DocumentManager stored on ``self.documentManager``.
        """
        super().__init__(serviceContainer)
        self.documentManager = DocumentManager(serviceContainer)

    async def process(self, action: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
        """Process a document operation.

        Args:
            action: The action to perform: "extract", "analyze" or
                "summarize".
            parameters: Action parameters, passed unchanged to the handler.

        Returns:
            Dictionary containing the operation result.

        Raises:
            ValueError: If action is not supported. Any other exception that
                escapes a handler is logged and re-raised as well.
        """
        # Looked up through ``self`` so subclass overrides are honoured.
        handlers = {
            "extract": self._extractContent,
            "analyze": self._analyzeDocument,
            "summarize": self._summarizeDocument,
        }
        try:
            # The isinstance guard keeps unhashable or non-string actions on
            # the "unsupported" path instead of failing inside the lookup.
            handler = handlers.get(action) if isinstance(action, str) else None
            if handler is None:
                raise ValueError(f"Unsupported action: {action}")
            return await handler(parameters)
        except Exception as e:
            logger.error("Error processing document action %s: %s", action, e)
            raise

    @staticmethod
    def _errorResult(error: Exception) -> Dict[str, Any]:
        """Build the failure dictionary returned by every handler."""
        return {
            "success": False,
            "error": str(error)
        }

    async def _extractContent(self, parameters: Dict[str, Any]) -> Dict[str, Any]:
        """Extract content from a document.

        Args:
            parameters: Dictionary containing:
                - documentId: ID of the document to process (required)
                - documentType: 'ChatDocument' (default) or 'TaskDocument'.
                  Any other value is handled as 'TaskDocument' and a warning
                  is logged.

        Returns:
            On success: {"success": True, "content": ..., "metadata": ...}.
            On any failure: {"success": False, "error": <message>}.
        """
        try:
            # A missing parameters object is reported as a missing documentId
            # rather than as an attribute error on None.
            parameters = parameters or {}
            documentId = parameters.get("documentId")
            documentType = parameters.get("documentType", "ChatDocument")

            if not documentId:
                raise ValueError("documentId is required")

            # Get document from database
            if documentType == "ChatDocument":
                document = await self._getChatDocument(documentId)
                if not document:
                    raise ValueError(f"ChatDocument {documentId} not found")
                extracted = await self.documentManager.extractFromChatDocument(document)
            else:
                if documentType != "TaskDocument":
                    # Fallback kept for backward compatibility, but no longer silent.
                    logger.warning(
                        "Unknown documentType %r; treating it as TaskDocument",
                        documentType,
                    )
                document = await self._getTaskDocument(documentId)
                if not document:
                    raise ValueError(f"TaskDocument {documentId} not found")
                extracted = await self.documentManager.extractFromTaskDocument(document)

            return {
                "success": True,
                "content": extracted.dict(),
                "metadata": await self.documentManager.getDocumentMetadata(document)
            }

        except Exception as e:
            logger.error("Error extracting content: %s", e)
            return self._errorResult(e)

    async def _analyzeDocument(self, parameters: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze document content.

        Args:
            parameters: Dictionary containing:
                - documentId: ID of the document to analyze
                - documentType: Type of document
                - analysisType: Type of analysis to perform; only "basic"
                  (the default) is supported

        Returns:
            On success: {"success": True, "analysis": {...}} where the
            analysis holds "totalItems", "totalSize" and "itemTypes" (a
            count per item label). On failure: {"success": False,
            "error": <message>}; a failed extraction result is returned
            unchanged.
        """
        try:
            parameters = parameters or {}
            analysisType = parameters.get("analysisType", "basic")

            # Reject unsupported types before the database lookup and
            # extraction are performed.
            if analysisType != "basic":
                raise ValueError(f"Unsupported analysis type: {analysisType}")

            # Extract content first
            contentResult = await self._extractContent(parameters)
            if not contentResult["success"]:
                return contentResult

            content = ExtractedContent(**contentResult["content"])

            # Basic analysis: count items, calculate statistics.
            # An item without metadata.size raises here and is reported
            # through the failure result below.
            stats = {
                "totalItems": len(content.contents),
                "totalSize": sum(item.metadata.size for item in content.contents),
                "itemTypes": {}
            }

            itemTypes = stats["itemTypes"]
            for item in content.contents:
                itemTypes[item.label] = itemTypes.get(item.label, 0) + 1

            return {
                "success": True,
                "analysis": stats
            }

        except Exception as e:
            logger.error("Error analyzing document: %s", e)
            return self._errorResult(e)

    async def _summarizeDocument(self, parameters: Dict[str, Any]) -> Dict[str, Any]:
        """Generate document summary.

        Args:
            parameters: Dictionary containing:
                - documentId: ID of the document to summarize
                - documentType: Type of document
                - summaryType: Type of summary to generate; only "basic"
                  (the default) is supported

        Returns:
            On success: {"success": True, "summary": <text>} where the text
            is the data of every item labelled "main", joined by newlines.
            On failure: {"success": False, "error": <message>}; a failed
            extraction result is returned unchanged.
        """
        try:
            parameters = parameters or {}
            summaryType = parameters.get("summaryType", "basic")

            # Reject unsupported types before the database lookup and
            # extraction are performed.
            if summaryType != "basic":
                raise ValueError(f"Unsupported summary type: {summaryType}")

            # Extract content first
            contentResult = await self._extractContent(parameters)
            if not contentResult["success"]:
                return contentResult

            content = ExtractedContent(**contentResult["content"])

            # Basic summary: concatenate all text content.
            # NOTE(review): assumes item.data is a str for "main" items;
            # anything else makes join() raise and yields a failure result.
            summary = "\n".join(
                item.data for item in content.contents
                if item.label == "main"
            )

            return {
                "success": True,
                "summary": summary
            }

        except Exception as e:
            logger.error("Error summarizing document: %s", e)
            return self._errorResult(e)

    def _loadDocument(self, table: str, typeName: str, modelClass: type, documentId: str) -> Optional[Any]:
        """Load one record from ``table`` and wrap it in ``modelClass``.

        Shared implementation behind _getChatDocument and _getTaskDocument.

        Args:
            table: Name of the table passed to ``self.service.db.getRecord``.
            typeName: Document type name used in the log message.
            modelClass: Class instantiated with the record's fields.
            documentId: ID of the record to load.

        Returns:
            The model instance, or None when the record is empty or missing,
            or when the lookup or construction raised.
        """
        try:
            # NOTE(review): ``self.service`` is not set in this class;
            # presumably MethodBase provides it - confirm.
            documentData = self.service.db.getRecord(table, documentId)
            if documentData:
                return modelClass(**documentData)
            return None
        except Exception as e:
            # The error is swallowed, so keep the traceback in the log;
            # callers only see None and report "not found".
            logger.exception("Error getting %s %s: %s", typeName, documentId, e)
            return None

    async def _getChatDocument(self, documentId: str) -> Optional[ChatDocument]:
        """Get ChatDocument from database, or None if unavailable."""
        return self._loadDocument("chatDocuments", "ChatDocument", ChatDocument, documentId)

    async def _getTaskDocument(self, documentId: str) -> Optional[TaskDocument]:
        """Get TaskDocument from database, or None if unavailable."""
        return self._loadDocument("taskDocuments", "TaskDocument", TaskDocument, documentId)