202 lines
No EOL
7.4 KiB
Python
202 lines
No EOL
7.4 KiB
Python
"""
|
|
Document processing method module.
|
|
Handles document operations using the document service.
|
|
"""
|
|
|
|
import logging
|
|
from typing import Dict, Any, List, Optional
|
|
from datetime import datetime
|
|
|
|
from modules.interfaces.interfaceChatModel import (
|
|
ChatDocument,
|
|
TaskDocument,
|
|
ExtractedContent,
|
|
ContentItem
|
|
)
|
|
from modules.workflow.managerDocument import DocumentManager
|
|
from modules.methods.methodBase import MethodBase, MethodResult, action
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
class MethodDocument(MethodBase):
    """Document processing method implementation.

    Exposes three actions -- ``extract``, ``analyze`` and ``summarize`` --
    that fetch a document via ``self.service.interfaceComponent`` and process
    it with a ``DocumentManager``. Each action catches its own exceptions and
    reports them as an unsuccessful result whose data holds an ``"error"``
    message, so callers get a ``MethodResult`` rather than an exception.
    """

    def __init__(self, serviceContainer):
        """Initialize the document method.

        Args:
            serviceContainer: Service container passed to ``MethodBase`` and
                to the ``DocumentManager`` created here.
        """
        super().__init__(serviceContainer)
        self.documentManager = DocumentManager(serviceContainer)

    @action
    async def extract(self, parameters: Dict[str, Any], authData: Optional[Dict[str, Any]] = None) -> MethodResult:
        """Extract content from a document.

        Args:
            parameters:
                documentId: ID of the document to extract from (required)
                documentType: Type of document: "text" (default), "table"
                    or "image"
                extractionType: Type of extraction to perform (default
                    "full"); passed through to the document manager
            authData: Optional authentication data; not read by this method.

        Returns:
            A successful result with ``documentId``, ``type`` and ``content``,
            or an unsuccessful result with an ``error`` message.
        """
        # Maps each supported documentType to the DocumentManager method
        # that performs the extraction.
        extractorNames = {
            "text": "extractTextContent",
            "table": "extractTableContent",
            "image": "extractImageContent",
        }

        try:
            # Report a missing ID explicitly; a bare KeyError would only
            # surface as the unhelpful message "'documentId'".
            if "documentId" not in parameters:
                return self._createResult(
                    success=False,
                    data={"error": "Missing required parameter: documentId"}
                )

            documentId = parameters["documentId"]
            documentType = parameters.get("documentType", "text")
            extractionType = parameters.get("extractionType", "full")

            # Get document from service
            document = await self.service.interfaceComponent.getDocument(documentId)
            if not document:
                return self._createResult(
                    success=False,
                    data={"error": f"Document not found: {documentId}"}
                )

            # Extract content based on type
            extractorName = extractorNames.get(documentType)
            if extractorName is None:
                return self._createResult(
                    success=False,
                    data={"error": f"Unsupported document type: {documentType}"}
                )

            extractor = getattr(self.documentManager, extractorName)
            content = await extractor(document, extractionType)

            return self._createResult(
                success=True,
                data={
                    "documentId": documentId,
                    "type": documentType,
                    "content": content
                }
            )

        except Exception as e:
            # logger.exception keeps the traceback alongside the message.
            logger.exception("Error extracting content: %s", e)
            return self._createResult(
                success=False,
                data={"error": str(e)}
            )

    @action
    async def analyze(self, parameters: Dict[str, Any], authData: Optional[Dict[str, Any]] = None) -> MethodResult:
        """Analyze document content.

        Args:
            parameters:
                documentId: ID of the document to analyze
                documentType: Type of document
                analysisType: Type of analysis to perform; only "basic"
                    (the default) is supported
            authData: Optional authentication data, forwarded to ``extract``.

        Returns:
            A successful result with ``documentId`` and ``analysis`` (item
            count, summed item size and a per-label item count), the failed
            extraction result unchanged, or an unsuccessful result with an
            ``error`` message.
        """
        try:
            # Extract content first
            extraction, content = await self._loadExtractedContent(parameters, authData)
            if content is None:
                return extraction

            # Perform analysis based on type
            analysisType = parameters.get("analysisType", "basic")
            if analysisType != "basic":
                return self._createResult(
                    success=False,
                    data={"error": f"Unsupported analysis type: {analysisType}"}
                )

            # Basic analysis: count items, calculate statistics
            itemTypes: Dict[Any, int] = {}
            for item in content.contents:
                itemTypes[item.label] = itemTypes.get(item.label, 0) + 1

            stats = {
                "totalItems": len(content.contents),
                "totalSize": sum(item.metadata.size for item in content.contents),
                "itemTypes": itemTypes
            }

            return self._createResult(
                success=True,
                data={
                    "documentId": parameters["documentId"],
                    "analysis": stats
                }
            )

        except Exception as e:
            logger.exception("Error analyzing document: %s", e)
            return self._createResult(
                success=False,
                data={"error": str(e)}
            )

    @action
    async def summarize(self, parameters: Dict[str, Any], authData: Optional[Dict[str, Any]] = None) -> MethodResult:
        """Summarize document content.

        Args:
            parameters:
                documentId: ID of the document to summarize
                documentType: Type of document
                summaryType: Type of summary to generate; only "basic"
                    (the default) is supported
            authData: Optional authentication data, forwarded to ``extract``.

        Returns:
            A successful result with ``documentId`` and ``summary``, the
            failed extraction result unchanged, or an unsuccessful result
            with an ``error`` message.
        """
        try:
            # Extract content first
            extraction, content = await self._loadExtractedContent(parameters, authData)
            if content is None:
                return extraction

            # Generate summary based on type
            summaryType = parameters.get("summaryType", "basic")
            if summaryType != "basic":
                return self._createResult(
                    success=False,
                    data={"error": f"Unsupported summary type: {summaryType}"}
                )

            # Basic summary: concatenate all non-empty item content
            summary = "\n".join(item.content for item in content.contents if item.content)

            return self._createResult(
                success=True,
                data={
                    "documentId": parameters["documentId"],
                    "summary": summary
                }
            )

        except Exception as e:
            logger.exception("Error summarizing document: %s", e)
            return self._createResult(
                success=False,
                data={"error": str(e)}
            )

    async def _loadExtractedContent(self, parameters: Dict[str, Any], authData: Optional[Dict[str, Any]] = None) -> tuple:
        """Run ``extract`` and wrap its content as ``ExtractedContent``.

        Shared by ``analyze`` and ``summarize``. Exceptions are not caught
        here, so the calling action logs them with its own message.

        Returns:
            ``(result, content)``. ``content`` is ``None`` when extraction
            failed; ``result`` is then the failed result to hand back.
        """
        extraction = await self.extract(parameters, authData=authData)
        if not extraction.success:
            return extraction, None

        rawContent = extraction.data["content"]
        # NOTE(review): assumes the manager may return either a mapping or an
        # ExtractedContent instance -- confirm against DocumentManager.
        if isinstance(rawContent, ExtractedContent):
            return extraction, rawContent
        return extraction, ExtractedContent(**rawContent)

    def _fetchRecord(self, table: str, modelClass: Any, documentId: str) -> Any:
        """Load one record from ``table`` and build ``modelClass`` from it.

        Returns ``None`` when no record is found or when the lookup or model
        construction raises; the error is logged in the latter case.
        """
        try:
            record = self.service.db.getRecord(table, documentId)
            if record:
                return modelClass(**record)
            return None
        except Exception as e:
            logger.exception("Error getting %s %s: %s", modelClass.__name__, documentId, e)
            return None

    async def _getChatDocument(self, documentId: str) -> Optional[ChatDocument]:
        """Get ChatDocument from database, or None if missing or on error."""
        return self._fetchRecord("chatDocuments", ChatDocument, documentId)

    async def _getTaskDocument(self, documentId: str) -> Optional[TaskDocument]:
        """Get TaskDocument from database, or None if missing or on error."""
        return self._fetchRecord("taskDocuments", TaskDocument, documentId)