135 lines
5.2 KiB
Python
135 lines
5.2 KiB
Python
"""
Document processing method module.

Handles document operations using the document service.
"""
|
|
|
|
import logging
|
|
from typing import Dict, Any, List, Optional
|
|
import uuid
|
|
from datetime import datetime, UTC
|
|
|
|
from modules.workflow.methodBase import MethodBase, ActionResult, action
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class MethodDocument(MethodBase):
    """Document method implementation for document operations.

    Registers itself under the name ``document`` and exposes the ``extract``
    action, which runs an AI prompt over every file of a document list and
    returns the combined text as a single result document.
    """

    def __init__(self, serviceCenter: Any):
        """Initialize the document method.

        Parameters:
            serviceCenter (Any): Forwarded unchanged to ``MethodBase.__init__``.
        """
        super().__init__(serviceCenter)
        self.name = "document"
        self.description = "Handle document operations like extraction and analysis"

    @staticmethod
    def _collectTextContents(extracted_objects: List[Any]) -> List[Any]:
        """Flatten extraction results into a list of text fragments.

        Objects with a non-empty ``contents`` attribute contribute the
        non-empty ``data`` of each item; plain strings are taken as-is;
        anything else is converted with ``str()``.
        """
        text_contents = []
        for content_obj in extracted_objects:
            if hasattr(content_obj, 'contents') and content_obj.contents:
                text_contents.extend(
                    content_item.data
                    for content_item in content_obj.contents
                    if hasattr(content_item, 'data') and content_item.data
                )
            elif isinstance(content_obj, str):
                text_contents.append(content_obj)
            else:
                # Fallback: convert to string representation
                text_contents.append(str(content_obj))
        return text_contents

    @action
    async def extract(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        Extract specific content from document with ai prompt and return it as a json file

        Parameters:
            documentList (str): Reference to the document list to extract content from
            aiPrompt (str): AI prompt for content extraction
            includeMetadata (bool, optional): Whether to include metadata (default: True)

        Returns:
            ActionResult: On success, ``data["documents"]`` holds one entry whose
            ``documentData`` contains ``documentCount``, ``content`` (all
            extracted text joined by a separator line), ``fileInfos`` (``None``
            when metadata is disabled) and ``timestamp``. On any failure the
            result has ``success=False``, empty ``data`` and an ``error`` message.
        """
        try:
            documentList = parameters.get("documentList")
            aiPrompt = parameters.get("aiPrompt")
            includeMetadata = parameters.get("includeMetadata", True)

            if not documentList:
                return self._createResult(
                    success=False,
                    data={},
                    error="Document list reference is required"
                )

            if not aiPrompt:
                return self._createResult(
                    success=False,
                    data={},
                    error="AI prompt is required"
                )

            chatDocuments = self.service.getChatDocumentsFromDocumentList(documentList)
            if not chatDocuments:
                return self._createResult(
                    success=False,
                    data={},
                    error="No documents found for the provided reference"
                )

            # Extract content from all documents
            all_extracted_content = []
            file_infos = []

            for chatDocument in chatDocuments:
                fileId = chatDocument.fileId
                file_data = self.service.getFileData(fileId)

                if not file_data:
                    logger.warning("File not found or empty for fileId: %s", fileId)
                    continue

                # Looked up only for files that are actually processed. A missing
                # info record falls back to an empty dict so the defaults below
                # apply instead of an AttributeError aborting the whole batch.
                file_info = self.service.getFileInfo(fileId) or {}

                extracted_content = await self.service.extractContentFromFileData(
                    prompt=aiPrompt,
                    fileData=file_data,
                    filename=file_info.get('name', 'document'),
                    mimeType=file_info.get('mimeType', 'application/octet-stream'),
                    base64Encoded=False,
                    documentId=chatDocument.id
                )

                all_extracted_content.append(extracted_content)
                if includeMetadata:
                    file_infos.append(file_info)

            if not all_extracted_content:
                return self._createResult(
                    success=False,
                    data={},
                    error="No content could be extracted from any documents"
                )

            # Combine all extracted text content
            text_contents = self._collectTextContents(all_extracted_content)
            combined_content = "\n\n--- DOCUMENT SEPARATOR ---\n\n".join(text_contents)

            # Single clock reading so the file name and the timestamp field agree.
            now = datetime.now(UTC)

            result_data = {
                # Counts every referenced document, including skipped ones.
                "documentCount": len(chatDocuments),
                "content": combined_content,
                "fileInfos": file_infos if includeMetadata else None,
                "timestamp": now.isoformat()
            }

            return self._createResult(
                success=True,
                data={
                    "documents": [
                        {
                            "documentName": f"extracted_content_{now.strftime('%Y%m%d_%H%M%S')}.txt",
                            "documentData": result_data
                        }
                    ]
                }
            )
        except Exception as e:
            # Boundary handler: report the failure as a result instead of raising,
            # but keep the traceback in the log.
            logger.exception("Error extracting content: %s", e)
            return self._createResult(
                success=False,
                data={},
                error=str(e)
            )
|