gateway/modules/methods/methodDocument.py
2025-06-21 03:06:00 +02:00

285 lines
9.9 KiB
Python

"""
Document processing method module.
Handles document operations using the document service.
"""
import logging
from typing import Dict, Any, List, Optional
from modules.workflow.managerDocument import DocumentManager
from modules.workflow.methodBase import MethodBase, ActionResult, action
logger = logging.getLogger(__name__)
class DocumentService:
    """Service for document content extraction, analysis, and summarization.

    Every public coroutine returns a plain dict. Failures are reported
    in-band: the dict then carries an "error" key (plus the requested
    "fileId") instead of an exception being raised to the caller.
    """

    # Maximum number of characters of extracted content that is embedded
    # into the analysis / summarization prompts.
    ANALYSIS_CONTENT_LIMIT = 5000
    SUMMARY_CONTENT_LIMIT = 8000

    def __init__(self, serviceContainer: Any):
        """Store the service container used for file access and AI calls."""
        self.serviceContainer = serviceContainer

    @staticmethod
    def _asText(value: Any) -> str:
        """Return *value* as a string; None becomes the empty string."""
        if value is None:
            return ""
        return value if isinstance(value, str) else str(value)

    async def extractContent(self, fileId: str, format: str = "text", includeMetadata: bool = True) -> Dict[str, Any]:
        """Extract content from a document using prompt-based extraction.

        Args:
            fileId: Identifier passed to the container's getFileData/getFileInfo.
            format: Requested output format; only interpolated into the prompt.
            includeMetadata: When true, the file info is returned under "fileInfo".

        Returns:
            Dict with "fileId", "format", "content" and "fileInfo"
            (None when includeMetadata is false), or an error dict with
            "error" and "fileId".
        """
        try:
            file_data = self.serviceContainer.getFileData(fileId)
            file_info = self.serviceContainer.getFileInfo(fileId)
            if not file_data:
                return {
                    "error": "File not found or empty",
                    "fileId": fileId
                }

            # Missing file info must not abort extraction of an existing file;
            # fall back to the same defaults used for missing keys.
            info = file_info or {}
            display_name = info.get('name', 'Unknown')
            display_type = info.get('mimeType', 'Unknown')

            extraction_prompt = (
                "\n"
                "Extract and structure the content from this document.\n"
                "File information:\n"
                f"- Name: {display_name}\n"
                f"- Type: {display_type}\n"
                f"- Size: {len(file_data)} bytes\n"
                "Please extract:\n"
                "1. Main content and key information\n"
                "2. Structured data if present (tables, lists, etc.)\n"
                "3. Important facts and figures\n"
                "4. Key insights and takeaways\n"
                f"Format the output as: {format}\n"
                f"Include metadata: {includeMetadata}\n"
            )

            # The raw file data is handed over directly (not base64 encoded).
            extracted_content = await self.serviceContainer.extractContentFromFileData(
                prompt=extraction_prompt,
                fileData=file_data,
                filename=info.get('name', 'document'),
                mimeType=info.get('mimeType', 'application/octet-stream'),
                base64Encoded=False
            )
            return {
                "fileId": fileId,
                "format": format,
                "content": extracted_content,
                "fileInfo": file_info if includeMetadata else None
            }
        except Exception as e:
            # Boundary handler: report the failure in-band, keep the traceback in the log.
            logger.exception("Error extracting content: %s", e)
            return {
                "error": str(e),
                "fileId": fileId
            }

    async def analyzeContent(self, fileId: str, analysis: Optional[List[str]] = None) -> Dict[str, Any]:
        """Analyze document content for entities, topics, and sentiment.

        Args:
            fileId: Identifier of the file to analyze.
            analysis: Aspects to analyze; defaults to
                ["entities", "topics", "sentiment"] when None.

        Returns:
            Dict with "fileId", "analysis", "results" (the AI reply) and
            "content" (the full extraction result). If extraction fails, its
            error dict is returned unchanged; other failures yield a dict
            with "error", "fileId" and "analysis".
        """
        if analysis is None:
            analysis = ["entities", "topics", "sentiment"]
        try:
            content_result = await self.extractContent(fileId, "text", True)
            if "error" in content_result:
                return content_result

            content = self._asText(content_result.get("content"))
            # Truncate here, in code: the limit note must not leak into the prompt text.
            excerpt = content[:self.ANALYSIS_CONTENT_LIMIT]
            aspects = ', '.join(map(str, analysis))

            analysis_prompt = (
                "\n"
                "Analyze this document content for the following aspects:\n"
                f"{aspects}\n"
                "Document content:\n"
                f"{excerpt}\n"
                "Please provide a detailed analysis including:\n"
                "1. Key entities (people, organizations, locations, dates)\n"
                "2. Main topics and themes\n"
                "3. Sentiment analysis (positive, negative, neutral)\n"
                "4. Key insights and patterns\n"
                "5. Important relationships between entities\n"
                "6. Document structure and organization\n"
            )

            analysis_result = await self.serviceContainer.interfaceAiCalls.callAiTextAdvanced(analysis_prompt)
            return {
                "fileId": fileId,
                "analysis": analysis,
                "results": analysis_result,
                "content": content_result
            }
        except Exception as e:
            logger.exception("Error analyzing content: %s", e)
            return {
                "error": str(e),
                "fileId": fileId,
                "analysis": analysis
            }

    async def summarizeContent(self, fileId: str, maxLength: int = 200, format: str = "text") -> Dict[str, Any]:
        """Summarize document content.

        Args:
            fileId: Identifier of the file to summarize.
            maxLength: Maximum summary length in words (stated in the prompt;
                not enforced on the reply).
            format: Requested summary format; only interpolated into the prompt.

        Returns:
            Dict with "fileId", "maxLength", "format", "summary", "wordCount"
            (0 when the AI reply is not a string) and "originalContent" (the
            extraction result). If extraction fails, its error dict is
            returned unchanged; other failures yield a dict with "error",
            "fileId" and "maxLength".
        """
        try:
            content_result = await self.extractContent(fileId, "text", False)
            if "error" in content_result:
                return content_result

            content = self._asText(content_result.get("content"))
            # Truncate here, in code: the limit note must not leak into the prompt text.
            excerpt = content[:self.SUMMARY_CONTENT_LIMIT]

            summary_prompt = (
                "\n"
                "Create a comprehensive summary of this document content.\n"
                "Document content:\n"
                f"{excerpt}\n"
                "Requirements:\n"
                f"- Maximum length: {maxLength} words\n"
                f"- Format: {format}\n"
                "- Include key points and main ideas\n"
                "- Maintain accuracy and completeness\n"
                "- Use clear, professional language\n"
                "- Highlight important insights and conclusions\n"
            )

            summary = await self.serviceContainer.interfaceAiCalls.callAiTextAdvanced(summary_prompt)
            # An empty or non-string reply must not turn a completed call into an error.
            word_count = len(summary.split()) if isinstance(summary, str) else 0
            return {
                "fileId": fileId,
                "maxLength": maxLength,
                "format": format,
                "summary": summary,
                "wordCount": word_count,
                "originalContent": content_result
            }
        except Exception as e:
            logger.exception("Error summarizing content: %s", e)
            return {
                "error": str(e),
                "fileId": fileId,
                "maxLength": maxLength
            }
class MethodDocument(MethodBase):
    """Document method implementation for document operations.

    Exposes the DocumentService operations (extract, analyze, summarize) as
    actions. Each action takes a parameters dict and returns the result of
    self._createResult (presumably provided by MethodBase).
    """

    def __init__(self, serviceContainer: Any):
        """Initialize the document method and its service/manager helpers."""
        super().__init__(serviceContainer)
        self.name = "document"
        self.description = "Handle document operations like extraction and analysis"
        self.documentService = DocumentService(serviceContainer)
        self.documentManager = DocumentManager(serviceContainer)

    def _missingFileIdResult(self) -> ActionResult:
        """Build the failure result for a request without a file ID."""
        return self._createResult(
            success=False,
            data={},
            error="File ID is required"
        )

    def _documentResult(self, payload: Dict[str, Any]) -> ActionResult:
        """Wrap a DocumentService result dict into an action result.

        The service reports failures in-band through an "error" key rather
        than raising, so that key has to be inspected here; otherwise a
        failed operation would be reported as a success.
        """
        if isinstance(payload, dict) and "error" in payload:
            return self._createResult(
                success=False,
                data=payload,  # keep the context (e.g. fileId) returned by the service
                error=str(payload["error"])
            )
        return self._createResult(
            success=True,
            data=payload
        )

    @action
    async def extract(self, parameters: Dict[str, Any]) -> ActionResult:
        """Extract content from a document.

        Reads "fileId" (required), "format" (default "text") and
        "includeMetadata" (default True) from *parameters*.
        """
        try:
            fileId = parameters.get("fileId")
            outputFormat = parameters.get("format", "text")
            includeMetadata = parameters.get("includeMetadata", True)
            if not fileId:
                return self._missingFileIdResult()
            content = await self.documentService.extractContent(
                fileId=fileId,
                format=outputFormat,
                includeMetadata=includeMetadata
            )
            return self._documentResult(content)
        except Exception as e:
            logger.error(f"Error extracting content: {str(e)}")
            return self._createResult(
                success=False,
                data={},
                error=str(e)
            )

    @action
    async def analyze(self, parameters: Dict[str, Any]) -> ActionResult:
        """Analyze document content.

        Reads "fileId" (required) and "analysis" (default
        ["entities", "topics", "sentiment"]) from *parameters*.
        """
        try:
            fileId = parameters.get("fileId")
            analysis = parameters.get("analysis", ["entities", "topics", "sentiment"])
            if not fileId:
                return self._missingFileIdResult()
            results = await self.documentService.analyzeContent(
                fileId=fileId,
                analysis=analysis
            )
            return self._documentResult(results)
        except Exception as e:
            logger.error(f"Error analyzing content: {str(e)}")
            return self._createResult(
                success=False,
                data={},
                error=str(e)
            )

    @action
    async def summarize(self, parameters: Dict[str, Any]) -> ActionResult:
        """Summarize document content.

        Reads "fileId" (required), "maxLength" (default 200) and "format"
        (default "text") from *parameters*.
        """
        try:
            fileId = parameters.get("fileId")
            maxLength = parameters.get("maxLength", 200)
            outputFormat = parameters.get("format", "text")
            if not fileId:
                return self._missingFileIdResult()
            summary = await self.documentService.summarizeContent(
                fileId=fileId,
                maxLength=maxLength,
                format=outputFormat
            )
            return self._documentResult(summary)
        except Exception as e:
            logger.error(f"Error summarizing content: {str(e)}")
            return self._createResult(
                success=False,
                data={},
                error=str(e)
            )