""" Document processing method module. Handles document operations using the document service. """ import logging from typing import Dict, Any, List, Optional from modules.workflow.managerDocument import DocumentManager from modules.workflow.methodBase import MethodBase, ActionResult, action logger = logging.getLogger(__name__) class DocumentService: """Service for document content extraction, analysis, and summarization""" def __init__(self, serviceContainer: Any): self.serviceContainer = serviceContainer async def extractContent(self, fileId: str, format: str = "text", includeMetadata: bool = True) -> Dict[str, Any]: """Extract content from document using prompt-based extraction""" try: # Get file data file_data = self.serviceContainer.getFileData(fileId) file_info = self.serviceContainer.getFileInfo(fileId) if not file_data: return { "error": "File not found or empty", "fileId": fileId } # Create extraction prompt based on format extraction_prompt = f""" Extract and structure the content from this document. File information: - Name: {file_info.get('name', 'Unknown')} - Type: {file_info.get('mimeType', 'Unknown')} - Size: {len(file_data)} bytes Please extract: 1. Main content and key information 2. Structured data if present (tables, lists, etc.) 3. Important facts and figures 4. Key insights and takeaways Format the output as: {format} Include metadata: {includeMetadata} """ # Use the new direct file data extraction method extracted_content = await self.serviceContainer.extractContentFromFileData( prompt=extraction_prompt, fileData=file_data, filename=file_info.get('name', 'document'), mimeType=file_info.get('mimeType', 'application/octet-stream'), base64Encoded=False ) result = { "fileId": fileId, "format": format, "content": extracted_content, "fileInfo": file_info if includeMetadata else None } return result except Exception as e: logger.error(f"Error extracting content: {str(e)}") return { "error": str(e), "fileId": fileId } async def analyzeContent(self, fileId: str, analysis: list = None) -> Dict[str, Any]: """Analyze document content for entities, topics, and sentiment""" if analysis is None: analysis = ["entities", "topics", "sentiment"] try: # First extract content content_result = await self.extractContent(fileId, "text", True) if "error" in content_result: return content_result content = content_result.get("content", "") # Create analysis prompt analysis_prompt = f""" Analyze this document content for the following aspects: {', '.join(analysis)} Document content: {content[:5000]} # Limit content length Please provide a detailed analysis including: 1. Key entities (people, organizations, locations, dates) 2. Main topics and themes 3. Sentiment analysis (positive, negative, neutral) 4. Key insights and patterns 5. Important relationships between entities 6. Document structure and organization """ # Use AI service for analysis analysis_result = await self.serviceContainer.interfaceAiCalls.callAiTextAdvanced(analysis_prompt) return { "fileId": fileId, "analysis": analysis, "results": analysis_result, "content": content_result } except Exception as e: logger.error(f"Error analyzing content: {str(e)}") return { "error": str(e), "fileId": fileId, "analysis": analysis } async def summarizeContent(self, fileId: str, maxLength: int = 200, format: str = "text") -> Dict[str, Any]: """Summarize document content""" try: # First extract content content_result = await self.extractContent(fileId, "text", False) if "error" in content_result: return content_result content = content_result.get("content", "") # Create summarization prompt summary_prompt = f""" Create a comprehensive summary of this document content. Document content: {content[:8000]} # Limit content length Requirements: - Maximum length: {maxLength} words - Format: {format} - Include key points and main ideas - Maintain accuracy and completeness - Use clear, professional language - Highlight important insights and conclusions """ # Use AI service for summarization summary = await self.serviceContainer.interfaceAiCalls.callAiTextAdvanced(summary_prompt) return { "fileId": fileId, "maxLength": maxLength, "format": format, "summary": summary, "wordCount": len(summary.split()), "originalContent": content_result } except Exception as e: logger.error(f"Error summarizing content: {str(e)}") return { "error": str(e), "fileId": fileId, "maxLength": maxLength } class MethodDocument(MethodBase): """Document method implementation for document operations""" def __init__(self, serviceContainer: Any): """Initialize the document method""" super().__init__(serviceContainer) self.name = "document" self.description = "Handle document operations like extraction and analysis" self.documentService = DocumentService(serviceContainer) self.documentManager = DocumentManager(serviceContainer) @action async def extract(self, parameters: Dict[str, Any]) -> ActionResult: """Extract content from document""" try: fileId = parameters.get("fileId") format = parameters.get("format", "text") includeMetadata = parameters.get("includeMetadata", True) if not fileId: return self._createResult( success=False, data={}, error="File ID is required" ) # Extract content content = await self.documentService.extractContent( fileId=fileId, format=format, includeMetadata=includeMetadata ) return self._createResult( success=True, data=content ) except Exception as e: logger.error(f"Error extracting content: {str(e)}") return self._createResult( success=False, data={}, error=str(e) ) @action async def analyze(self, parameters: Dict[str, Any]) -> ActionResult: """Analyze document content""" try: fileId = parameters.get("fileId") analysis = parameters.get("analysis", ["entities", "topics", "sentiment"]) if not fileId: return self._createResult( success=False, data={}, error="File ID is required" ) # Analyze content results = await self.documentService.analyzeContent( fileId=fileId, analysis=analysis ) return self._createResult( success=True, data=results ) except Exception as e: logger.error(f"Error analyzing content: {str(e)}") return self._createResult( success=False, data={}, error=str(e) ) @action async def summarize(self, parameters: Dict[str, Any]) -> ActionResult: """Summarize document content""" try: fileId = parameters.get("fileId") maxLength = parameters.get("maxLength", 200) format = parameters.get("format", "text") if not fileId: return self._createResult( success=False, data={}, error="File ID is required" ) # Summarize content summary = await self.documentService.summarizeContent( fileId=fileId, maxLength=maxLength, format=format ) return self._createResult( success=True, data=summary ) except Exception as e: logger.error(f"Error summarizing content: {str(e)}") return self._createResult( success=False, data={}, error=str(e) )