285 lines
9.9 KiB
Python
285 lines
9.9 KiB
Python
"""
Document processing method module.

Handles document operations using the document service.
"""
|
|
|
|
import logging
|
|
from typing import Dict, Any, List, Optional
|
|
|
|
from modules.workflow.managerDocument import DocumentManager
|
|
from modules.methods.methodBase import MethodBase, ActionResult, action
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
class DocumentService:
    """Service for document content extraction, analysis, and summarization.

    Every public coroutine returns a plain dict.  Failures are reported by
    returning a dict that contains an ``"error"`` key instead of raising, so
    callers have to check for that key.
    """

    # Maximum number of characters of extracted content embedded in a prompt.
    ANALYSIS_CONTENT_LIMIT = 5000
    SUMMARY_CONTENT_LIMIT = 8000

    # Aspects analysed when the caller does not request specific ones.
    DEFAULT_ANALYSIS = ("entities", "topics", "sentiment")

    def __init__(self, serviceContainer: Any):
        """Keep a reference to the container used for file access and AI calls."""
        self.serviceContainer = serviceContainer

    @staticmethod
    def _clipContent(content: Any, limit: int) -> str:
        """Return ``content`` as text, cut to at most ``limit`` characters.

        ``None`` becomes an empty string and any other non-string value is
        converted with ``str()`` so that slicing always operates on text.
        NOTE(review): the return type of ``extractContentFromFileData`` is not
        visible from this module; confirm that it is normally a string.
        """
        if content is None:
            return ""
        text = content if isinstance(content, str) else str(content)
        return text[:limit]

    @classmethod
    def _normalizeAspects(cls, analysis: Any) -> List[str]:
        """Turn the ``analysis`` argument into a list of aspect names.

        ``None`` selects the default aspects; a single string is treated as one
        aspect (joining it directly would split it into characters).
        """
        if analysis is None:
            return list(cls.DEFAULT_ANALYSIS)
        if isinstance(analysis, str):
            return [analysis]
        return [str(item) for item in analysis]

    async def extractContent(self, fileId: str, format: str = "text", includeMetadata: bool = True) -> Dict[str, Any]:
        """Extract content from document using prompt-based extraction.

        Args:
            fileId: Identifier handed to the container's ``getFileData`` and
                ``getFileInfo``.
            format: Requested output format; only embedded in the prompt and
                echoed back in the result.
            includeMetadata: When true, the file info is returned under
                ``"fileInfo"``; otherwise that entry is ``None``.

        Returns:
            ``{"fileId", "format", "content", "fileInfo"}`` on success, or
            ``{"error", "fileId"}`` when the file has no data or any step
            raises.
        """
        try:
            file_data = self.serviceContainer.getFileData(fileId)
            # A missing info record must not abort extraction: every field read
            # from it below already has a fallback value.
            file_info = self.serviceContainer.getFileInfo(fileId) or {}

            if not file_data:
                return {
                    "error": "File not found or empty",
                    "fileId": fileId,
                }

            # Built line by line so the prompt text does not depend on the
            # indentation of this source file.
            extraction_prompt = "\n".join([
                "",
                "Extract and structure the content from this document.",
                "",
                "File information:",
                f"- Name: {file_info.get('name', 'Unknown')}",
                f"- Type: {file_info.get('mimeType', 'Unknown')}",
                f"- Size: {len(file_data)} bytes",
                "",
                "Please extract:",
                "1. Main content and key information",
                "2. Structured data if present (tables, lists, etc.)",
                "3. Important facts and figures",
                "4. Key insights and takeaways",
                "",
                f"Format the output as: {format}",
                f"Include metadata: {includeMetadata}",
                "",
            ])

            extracted_content = await self.serviceContainer.extractContentFromFileData(
                prompt=extraction_prompt,
                fileData=file_data,
                filename=file_info.get('name', 'document'),
                mimeType=file_info.get('mimeType', 'application/octet-stream'),
                base64Encoded=False,
            )

            return {
                "fileId": fileId,
                "format": format,
                "content": extracted_content,
                "fileInfo": file_info if includeMetadata else None,
            }

        except Exception as e:
            logger.error("Error extracting content: %s", e)
            return {
                "error": str(e),
                "fileId": fileId,
            }

    async def analyzeContent(self, fileId: str, analysis: Optional[List[str]] = None) -> Dict[str, Any]:
        """Analyze document content for entities, topics, and sentiment.

        Args:
            fileId: Identifier of the file to analyse.
            analysis: Aspect names to analyse.  ``None`` selects
                ``DEFAULT_ANALYSIS``; a single string is accepted as one aspect.

        Returns:
            ``{"fileId", "analysis", "results", "content"}`` on success, where
            ``"content"`` is the full extraction result.  If extraction failed,
            its error dict is returned unchanged; if analysis itself raises,
            ``{"error", "fileId", "analysis"}`` is returned.
        """
        aspects = analysis
        try:
            aspects = self._normalizeAspects(analysis)

            content_result = await self.extractContent(fileId, "text", True)
            if "error" in content_result:
                return content_result

            # Only a bounded prefix of the content is sent to the AI service.
            excerpt = self._clipContent(
                content_result.get("content", ""), self.ANALYSIS_CONTENT_LIMIT
            )

            analysis_prompt = "\n".join([
                "",
                "Analyze this document content for the following aspects:",
                ", ".join(aspects),
                "",
                "Document content:",
                excerpt,
                "",
                "Please provide a detailed analysis including:",
                "1. Key entities (people, organizations, locations, dates)",
                "2. Main topics and themes",
                "3. Sentiment analysis (positive, negative, neutral)",
                "4. Key insights and patterns",
                "5. Important relationships between entities",
                "6. Document structure and organization",
                "",
            ])

            analysis_result = await self.serviceContainer.interfaceAiCalls.callAiTextAdvanced(analysis_prompt)

            return {
                "fileId": fileId,
                "analysis": aspects,
                "results": analysis_result,
                "content": content_result,
            }

        except Exception as e:
            logger.error("Error analyzing content: %s", e)
            return {
                "error": str(e),
                "fileId": fileId,
                "analysis": aspects,
            }

    async def summarizeContent(self, fileId: str, maxLength: int = 200, format: str = "text") -> Dict[str, Any]:
        """Summarize document content.

        Args:
            fileId: Identifier of the file to summarise.
            maxLength: Word limit stated in the prompt; it is a request to the
                AI service and is not enforced on the returned summary.
            format: Requested summary format, embedded in the prompt.

        Returns:
            ``{"fileId", "maxLength", "format", "summary", "wordCount",
            "originalContent"}`` on success.  If extraction failed, its error
            dict is returned unchanged; if summarisation raises,
            ``{"error", "fileId", "maxLength"}`` is returned.
        """
        try:
            content_result = await self.extractContent(fileId, "text", False)
            if "error" in content_result:
                return content_result

            # Only a bounded prefix of the content is sent to the AI service.
            excerpt = self._clipContent(
                content_result.get("content", ""), self.SUMMARY_CONTENT_LIMIT
            )

            summary_prompt = "\n".join([
                "",
                "Create a comprehensive summary of this document content.",
                "",
                "Document content:",
                excerpt,
                "",
                "Requirements:",
                f"- Maximum length: {maxLength} words",
                f"- Format: {format}",
                "- Include key points and main ideas",
                "- Maintain accuracy and completeness",
                "- Use clear, professional language",
                "- Highlight important insights and conclusions",
                "",
            ])

            summary = await self.serviceContainer.interfaceAiCalls.callAiTextAdvanced(summary_prompt)

            return {
                "fileId": fileId,
                "maxLength": maxLength,
                "format": format,
                "summary": summary,
                # Whitespace-separated word count of the returned summary.
                "wordCount": len(summary.split()),
                "originalContent": content_result,
            }

        except Exception as e:
            logger.error("Error summarizing content: %s", e)
            return {
                "error": str(e),
                "fileId": fileId,
                "maxLength": maxLength,
            }
|
|
|
|
class MethodDocument(MethodBase):
    """Document method implementation for document operations.

    Provides the ``extract``, ``analyze`` and ``summarize`` actions.  Each one
    checks that ``fileId`` is present, delegates to ``DocumentService`` and
    wraps the outcome with ``self._createResult``.
    """

    def __init__(self, serviceContainer: Any):
        """Initialize the document method and the services it delegates to."""
        super().__init__(serviceContainer)
        self.name = "document"
        self.description = "Handle document operations like extraction and analysis"
        self.documentService = DocumentService(serviceContainer)
        self.documentManager = DocumentManager(serviceContainer)

    def _missingFileIdResult(self) -> ActionResult:
        """Build the failure result used when no ``fileId`` was supplied."""
        return self._createResult(
            success=False,
            data={},
            error="File ID is required",
        )

    def _resultFromService(self, payload: Any) -> ActionResult:
        """Wrap a ``DocumentService`` return value in an action result.

        The service reports failures by returning a dict that contains an
        ``"error"`` key instead of raising.  Such a payload must not be
        reported as a success; it is still passed through as ``data`` so
        callers that inspect ``data["error"]`` keep working.
        """
        if isinstance(payload, dict) and "error" in payload:
            return self._createResult(
                success=False,
                data=payload,
                error=str(payload["error"]),
            )
        return self._createResult(
            success=True,
            data=payload,
        )

    def _failureResult(self, logMessage: str, exc: Exception) -> ActionResult:
        """Log an unexpected exception and turn it into a failure result."""
        logger.error("%s: %s", logMessage, exc)
        return self._createResult(
            success=False,
            data={},
            error=str(exc),
        )

    @action
    async def extract(self, parameters: Dict[str, Any]) -> ActionResult:
        """Extract content from document.

        Reads ``fileId`` (required), ``format`` (default ``"text"``) and
        ``includeMetadata`` (default ``True``) from ``parameters``.
        """
        try:
            fileId = parameters.get("fileId")
            format = parameters.get("format", "text")
            includeMetadata = parameters.get("includeMetadata", True)

            if not fileId:
                return self._missingFileIdResult()

            content = await self.documentService.extractContent(
                fileId=fileId,
                format=format,
                includeMetadata=includeMetadata,
            )
            return self._resultFromService(content)

        except Exception as e:
            return self._failureResult("Error extracting content", e)

    @action
    async def analyze(self, parameters: Dict[str, Any]) -> ActionResult:
        """Analyze document content.

        Reads ``fileId`` (required) and ``analysis`` (default
        ``["entities", "topics", "sentiment"]``) from ``parameters``.
        """
        try:
            fileId = parameters.get("fileId")
            analysis = parameters.get("analysis", ["entities", "topics", "sentiment"])

            if not fileId:
                return self._missingFileIdResult()

            results = await self.documentService.analyzeContent(
                fileId=fileId,
                analysis=analysis,
            )
            return self._resultFromService(results)

        except Exception as e:
            return self._failureResult("Error analyzing content", e)

    @action
    async def summarize(self, parameters: Dict[str, Any]) -> ActionResult:
        """Summarize document content.

        Reads ``fileId`` (required), ``maxLength`` (default ``200``) and
        ``format`` (default ``"text"``) from ``parameters``.
        """
        try:
            fileId = parameters.get("fileId")
            maxLength = parameters.get("maxLength", 200)
            format = parameters.get("format", "text")

            if not fileId:
                return self._missingFileIdResult()

            summary = await self.documentService.summarizeContent(
                fileId=fileId,
                maxLength=maxLength,
                format=format,
            )
            return self._resultFromService(summary)

        except Exception as e:
            return self._failureResult("Error summarizing content", e)
|