650 lines
28 KiB
Python
650 lines
28 KiB
Python
from typing import Dict, Any, List, Optional, Union, Tuple, TypedDict, Callable, Awaitable
|
|
import logging
|
|
import json
|
|
import os
|
|
import io
|
|
import base64
|
|
from datetime import datetime, UTC
|
|
from pathlib import Path
|
|
|
|
from modules.interfaces.serviceChatModel import (
|
|
ChatDocument,
|
|
TaskDocument,
|
|
ExtractedContent,
|
|
ContentItem,
|
|
ContentMetadata
|
|
)
|
|
from modules.interfaces.serviceManagementClass import ServiceManagement, getInterface
|
|
from modules.interfaces.serviceAppModel import User
|
|
from modules.neutralizer.neutralizer import DataAnonymizer
|
|
from modules.shared.configuration import APP_CONFIG
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Optional imports - only loaded when needed
|
|
pdfExtractorLoaded = False
|
|
officeExtractorLoaded = False
|
|
imageProcessorLoaded = False
|
|
|
|
class FileProcessingError(Exception):
    """Raised when a document cannot be processed or its content extracted."""
|
|
|
|
class DocumentProcessor:
|
|
"""Processor for handling document operations and content extraction."""
|
|
|
|
def __init__(self, currentUser: Optional[User] = None):
|
|
"""Initialize the document processor."""
|
|
self.serviceManagement = getInterface(currentUser)
|
|
self._neutralizer = DataAnonymizer() if APP_CONFIG.get("ENABLE_CONTENT_NEUTRALIZATION", False) else None
|
|
self.supportedTypes: Dict[str, Callable[[Union[ChatDocument, TaskDocument]], Awaitable[List[ContentItem]]]] = {
|
|
'text/plain': self._processText,
|
|
'text/csv': self._processCsv,
|
|
'application/json': self._processJson,
|
|
'application/xml': self._processXml,
|
|
'text/html': self._processHtml,
|
|
'image/svg+xml': self._processSvg,
|
|
'image/jpeg': self._processImage,
|
|
'image/png': self._processImage,
|
|
'image/gif': self._processImage,
|
|
'application/pdf': self._processPdf,
|
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': self._processDocx,
|
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': self._processXlsx
|
|
}
|
|
|
|
def initialize(self) -> None:
|
|
"""Initialize the document processor."""
|
|
pass
|
|
|
|
def _loadPdfExtractor(self):
|
|
"""Loads PDF extraction libraries when needed"""
|
|
global pdfExtractorLoaded
|
|
if not pdfExtractorLoaded:
|
|
try:
|
|
global PyPDF2, fitz
|
|
import PyPDF2
|
|
import fitz # PyMuPDF for more extensive PDF processing
|
|
pdfExtractorLoaded = True
|
|
logger.info("PDF extraction libraries successfully loaded")
|
|
except ImportError as e:
|
|
logger.warning(f"PDF extraction libraries could not be loaded: {e}")
|
|
|
|
def _loadOfficeExtractor(self):
|
|
"""Loads Office document extraction libraries when needed"""
|
|
global officeExtractorLoaded
|
|
if not officeExtractorLoaded:
|
|
try:
|
|
global docx, openpyxl
|
|
import docx # python-docx for Word documents
|
|
import openpyxl # for Excel files
|
|
officeExtractorLoaded = True
|
|
logger.info("Office extraction libraries successfully loaded")
|
|
except ImportError as e:
|
|
logger.warning(f"Office extraction libraries could not be loaded: {e}")
|
|
|
|
def _loadImageProcessor(self):
|
|
"""Loads image processing libraries when needed"""
|
|
global imageProcessorLoaded
|
|
if not imageProcessorLoaded:
|
|
try:
|
|
global PIL, Image
|
|
from PIL import Image
|
|
imageProcessorLoaded = True
|
|
logger.info("Image processing libraries successfully loaded")
|
|
except ImportError as e:
|
|
logger.warning(f"Image processing libraries could not be loaded: {e}")
|
|
|
|
async def processDocument(self, document: TaskDocument, prompt: str) -> ExtractedContent:
|
|
"""
|
|
Process a document and extract its contents with AI processing.
|
|
|
|
Args:
|
|
document: The document to process
|
|
prompt: Prompt for AI content extraction
|
|
|
|
Returns:
|
|
ExtractedContent containing the processed content
|
|
|
|
Raises:
|
|
FileProcessingError: If document processing fails
|
|
"""
|
|
try:
|
|
# Get content type
|
|
contentType = document.mimeType
|
|
if contentType == "application/octet-stream":
|
|
# Try to detect actual file type
|
|
contentType = self._detectContentType(document)
|
|
|
|
if contentType not in self.supportedTypes:
|
|
# Fallback to binary processing
|
|
contentItems = await self._processBinary(document)
|
|
else:
|
|
# Process document based on type
|
|
processor = self.supportedTypes[contentType]
|
|
contentItems = await processor(document)
|
|
|
|
# Process with AI if prompt provided
|
|
if prompt and contentItems:
|
|
try:
|
|
# Process each content item with AI
|
|
processedItems = []
|
|
for item in contentItems:
|
|
# Neutralize content if neutralizer is enabled
|
|
contentToProcess = item.data
|
|
if self._neutralizer and contentToProcess:
|
|
contentToProcess = self._neutralizer.neutralize(contentToProcess)
|
|
|
|
# Create AI prompt for this content
|
|
aiPrompt = f"""
|
|
Extract relevant information from this content based on the following prompt:
|
|
|
|
PROMPT: {prompt}
|
|
|
|
CONTENT:
|
|
{contentToProcess}
|
|
|
|
Return ONLY the extracted information in a clear, concise format.
|
|
"""
|
|
|
|
# Get AI response
|
|
response = await self.serviceManagement.callAi([
|
|
{"role": "system", "content": "You are an expert at extracting relevant information from documents."},
|
|
{"role": "user", "content": aiPrompt}
|
|
])
|
|
|
|
# Update content with AI processed data
|
|
processedItems.append(ContentItem(
|
|
label=item.label,
|
|
data=response.strip(),
|
|
metadata=item.metadata
|
|
))
|
|
|
|
contentItems = processedItems
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing content with AI: {str(e)}")
|
|
|
|
return ExtractedContent(
|
|
objectId=document.id,
|
|
objectType="TaskDocument",
|
|
contents=contentItems
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing document: {str(e)}")
|
|
raise FileProcessingError(f"Failed to process document: {str(e)}")
|
|
|
|
def _detectContentType(self, document: Union[ChatDocument, TaskDocument]) -> str:
|
|
"""Detect content type from file content"""
|
|
try:
|
|
# Check file extension first
|
|
ext = os.path.splitext(document.filename)[1].lower()
|
|
if ext:
|
|
# Map common extensions to MIME types
|
|
extToMime = {
|
|
'.txt': 'text/plain',
|
|
'.md': 'text/markdown',
|
|
'.csv': 'text/csv',
|
|
'.json': 'application/json',
|
|
'.xml': 'application/xml',
|
|
'.js': 'application/javascript',
|
|
'.py': 'application/x-python',
|
|
'.svg': 'image/svg+xml',
|
|
'.jpg': 'image/jpeg',
|
|
'.png': 'image/png',
|
|
'.gif': 'image/gif',
|
|
'.pdf': 'application/pdf',
|
|
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
'.doc': 'application/msword',
|
|
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
'.xls': 'application/vnd.ms-excel',
|
|
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
|
'.ppt': 'application/vnd.ms-powerpoint'
|
|
}
|
|
if ext in extToMime:
|
|
return extToMime[ext]
|
|
|
|
# Try to detect if it's text content
|
|
if isinstance(document, TaskDocument):
|
|
try:
|
|
content = base64.b64decode(document.data)
|
|
content.decode('utf-8')
|
|
return 'text/plain'
|
|
except UnicodeDecodeError:
|
|
pass
|
|
|
|
return 'application/octet-stream'
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error detecting content type: {str(e)}")
|
|
return 'application/octet-stream'
|
|
|
|
async def _processText(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
|
|
"""Process text document"""
|
|
try:
|
|
if isinstance(document, TaskDocument):
|
|
content = base64.b64decode(document.data).decode('utf-8')
|
|
else:
|
|
content = self.serviceManagement.getFileData(document.fileId)
|
|
if content is None:
|
|
raise FileProcessingError(f"Could not get file data for {document.fileId}")
|
|
content = content.decode('utf-8')
|
|
|
|
return [ContentItem(
|
|
label="main",
|
|
data=content,
|
|
metadata=ContentMetadata(
|
|
size=len(content.encode('utf-8')),
|
|
pages=1
|
|
)
|
|
)]
|
|
except Exception as e:
|
|
logger.error(f"Error processing text document: {str(e)}")
|
|
raise FileProcessingError(f"Failed to process text document: {str(e)}")
|
|
|
|
async def _processCsv(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
|
|
"""Process CSV document"""
|
|
try:
|
|
if isinstance(document, TaskDocument):
|
|
content = base64.b64decode(document.data).decode('utf-8')
|
|
else:
|
|
content = self.serviceManagement.getFileData(document.fileId)
|
|
if content is None:
|
|
raise FileProcessingError(f"Could not get file data for {document.fileId}")
|
|
content = content.decode('utf-8')
|
|
|
|
return [ContentItem(
|
|
label="main",
|
|
data=content,
|
|
metadata=ContentMetadata(
|
|
size=len(content.encode('utf-8')),
|
|
pages=1
|
|
)
|
|
)]
|
|
except Exception as e:
|
|
logger.error(f"Error processing CSV document: {str(e)}")
|
|
raise FileProcessingError(f"Failed to process CSV document: {str(e)}")
|
|
|
|
async def _processJson(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
|
|
"""Process JSON document"""
|
|
try:
|
|
if isinstance(document, TaskDocument):
|
|
content = base64.b64decode(document.data).decode('utf-8')
|
|
else:
|
|
content = self.serviceManagement.getFileData(document.fileId)
|
|
if content is None:
|
|
raise FileProcessingError(f"Could not get file data for {document.fileId}")
|
|
content = content.decode('utf-8')
|
|
|
|
# Parse JSON to validate
|
|
jsonData = json.loads(content)
|
|
|
|
return [ContentItem(
|
|
label="main",
|
|
data=content,
|
|
metadata=ContentMetadata(
|
|
size=len(content.encode('utf-8')),
|
|
pages=1
|
|
)
|
|
)]
|
|
except Exception as e:
|
|
logger.error(f"Error processing JSON document: {str(e)}")
|
|
raise FileProcessingError(f"Failed to process JSON document: {str(e)}")
|
|
|
|
async def _processXml(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
|
|
"""Process XML document"""
|
|
try:
|
|
if isinstance(document, TaskDocument):
|
|
content = base64.b64decode(document.data).decode('utf-8')
|
|
else:
|
|
content = self.serviceManagement.getFileData(document.fileId)
|
|
if content is None:
|
|
raise FileProcessingError(f"Could not get file data for {document.fileId}")
|
|
content = content.decode('utf-8')
|
|
|
|
return [ContentItem(
|
|
label="main",
|
|
data=content,
|
|
metadata=ContentMetadata(
|
|
size=len(content.encode('utf-8')),
|
|
pages=1
|
|
)
|
|
)]
|
|
except Exception as e:
|
|
logger.error(f"Error processing XML document: {str(e)}")
|
|
raise FileProcessingError(f"Failed to process XML document: {str(e)}")
|
|
|
|
async def _processHtml(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
|
|
"""Process HTML document"""
|
|
try:
|
|
if isinstance(document, TaskDocument):
|
|
content = base64.b64decode(document.data).decode('utf-8')
|
|
else:
|
|
content = self.serviceManagement.getFileData(document.fileId)
|
|
if content is None:
|
|
raise FileProcessingError(f"Could not get file data for {document.fileId}")
|
|
content = content.decode('utf-8')
|
|
|
|
return [ContentItem(
|
|
label="main",
|
|
data=content,
|
|
metadata=ContentMetadata(
|
|
size=len(content.encode('utf-8')),
|
|
pages=1
|
|
)
|
|
)]
|
|
except Exception as e:
|
|
logger.error(f"Error processing HTML document: {str(e)}")
|
|
raise FileProcessingError(f"Failed to process HTML document: {str(e)}")
|
|
|
|
async def _processSvg(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
|
|
"""Process SVG document"""
|
|
try:
|
|
if isinstance(document, TaskDocument):
|
|
content = base64.b64decode(document.data).decode('utf-8')
|
|
else:
|
|
content = self.serviceManagement.getFileData(document.fileId)
|
|
if content is None:
|
|
raise FileProcessingError(f"Could not get file data for {document.fileId}")
|
|
content = content.decode('utf-8')
|
|
|
|
# Check if it's actually SVG
|
|
isSvg = "<svg" in content.lower()
|
|
|
|
return [ContentItem(
|
|
label="main",
|
|
data=content if isSvg else None,
|
|
metadata=ContentMetadata(
|
|
size=len(content.encode('utf-8')),
|
|
error=None if isSvg else "Invalid SVG content"
|
|
)
|
|
)]
|
|
except Exception as e:
|
|
logger.error(f"Error processing SVG document: {str(e)}")
|
|
raise FileProcessingError(f"Failed to process SVG document: {str(e)}")
|
|
|
|
async def _processImage(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
|
|
"""Process image document"""
|
|
try:
|
|
self._loadImageProcessor()
|
|
if not imageProcessorLoaded:
|
|
raise FileProcessingError("Image processing libraries not available")
|
|
|
|
if isinstance(document, TaskDocument):
|
|
fileData = base64.b64decode(document.data)
|
|
else:
|
|
fileData = self.serviceManagement.getFileData(document.fileId)
|
|
if fileData is None:
|
|
raise FileProcessingError(f"Could not get file data for {document.fileId}")
|
|
|
|
with io.BytesIO(fileData) as imgStream:
|
|
img = Image.open(imgStream)
|
|
metadata = ContentMetadata(
|
|
size=len(fileData),
|
|
width=img.width,
|
|
height=img.height,
|
|
colorMode=img.mode
|
|
)
|
|
|
|
# Convert image to base64 for storage
|
|
imgStream.seek(0)
|
|
imgData = base64.b64encode(imgStream.read()).decode('utf-8')
|
|
|
|
return [ContentItem(
|
|
label="image",
|
|
data=imgData,
|
|
metadata=metadata
|
|
)]
|
|
except Exception as e:
|
|
logger.error(f"Error processing image document: {str(e)}")
|
|
raise FileProcessingError(f"Failed to process image document: {str(e)}")
|
|
|
|
async def _processPdf(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
|
|
"""Process PDF document"""
|
|
try:
|
|
self._loadPdfExtractor()
|
|
if not pdfExtractorLoaded:
|
|
raise FileProcessingError("PDF extraction libraries not available")
|
|
|
|
if isinstance(document, TaskDocument):
|
|
fileData = base64.b64decode(document.data)
|
|
else:
|
|
fileData = self.serviceManagement.getFileData(document.fileId)
|
|
if fileData is None:
|
|
raise FileProcessingError(f"Could not get file data for {document.fileId}")
|
|
|
|
contentItems = []
|
|
|
|
with io.BytesIO(fileData) as pdfStream:
|
|
# Extract text with PyPDF2
|
|
pdfReader = PyPDF2.PdfReader(pdfStream)
|
|
metadata = ContentMetadata(
|
|
size=len(fileData),
|
|
pages=len(pdfReader.pages)
|
|
)
|
|
|
|
# Extract text from all pages
|
|
for pageNum in range(len(pdfReader.pages)):
|
|
page = pdfReader.pages[pageNum]
|
|
pageText = page.extract_text()
|
|
if pageText:
|
|
contentItems.append(ContentItem(
|
|
label=f"page_{pageNum + 1}",
|
|
data=pageText,
|
|
metadata=ContentMetadata(
|
|
size=len(pageText.encode('utf-8')),
|
|
pages=1
|
|
)
|
|
))
|
|
|
|
# Extract images with PyMuPDF
|
|
pdfStream.seek(0)
|
|
doc = fitz.open(stream=pdfStream, filetype="pdf")
|
|
for pageNum in range(len(doc)):
|
|
page = doc[pageNum]
|
|
for imgIndex, imgInfo in enumerate(page.get_images(full=True)):
|
|
try:
|
|
xref = imgInfo[0]
|
|
baseImage = doc.extract_image(xref)
|
|
if baseImage:
|
|
imageBytes = baseImage.get("image", b"")
|
|
imageExt = baseImage.get("ext", "png")
|
|
|
|
if imageBytes:
|
|
contentItems.append(ContentItem(
|
|
label=f"image_{pageNum + 1}_{imgIndex}",
|
|
data=base64.b64encode(imageBytes).decode('utf-8'),
|
|
metadata=ContentMetadata(
|
|
size=len(imageBytes),
|
|
pages=1
|
|
)
|
|
))
|
|
except Exception as imgE:
|
|
logger.warning(f"Error extracting image {imgIndex} on page {pageNum + 1}: {str(imgE)}")
|
|
|
|
doc.close()
|
|
|
|
return contentItems
|
|
except Exception as e:
|
|
logger.error(f"Error processing PDF document: {str(e)}")
|
|
raise FileProcessingError(f"Failed to process PDF document: {str(e)}")
|
|
|
|
async def _processDocx(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
|
|
"""Process Word document"""
|
|
try:
|
|
self._loadOfficeExtractor()
|
|
if not officeExtractorLoaded:
|
|
raise FileProcessingError("Office extraction libraries not available")
|
|
|
|
if isinstance(document, TaskDocument):
|
|
fileData = base64.b64decode(document.data)
|
|
else:
|
|
fileData = self.serviceManagement.getFileData(document.fileId)
|
|
if fileData is None:
|
|
raise FileProcessingError(f"Could not get file data for {document.fileId}")
|
|
|
|
with io.BytesIO(fileData) as docxStream:
|
|
doc = docx.Document(docxStream)
|
|
|
|
# Extract text
|
|
fullText = []
|
|
for para in doc.paragraphs:
|
|
fullText.append(para.text)
|
|
|
|
# Extract tables
|
|
for table in doc.tables:
|
|
for row in table.rows:
|
|
rowText = []
|
|
for cell in row.cells:
|
|
rowText.append(cell.text)
|
|
fullText.append(" | ".join(rowText))
|
|
|
|
content = "\n".join(fullText)
|
|
|
|
return [ContentItem(
|
|
label="main",
|
|
data=content,
|
|
metadata=ContentMetadata(
|
|
size=len(content.encode('utf-8')),
|
|
pages=len(doc.paragraphs)
|
|
)
|
|
)]
|
|
except Exception as e:
|
|
logger.error(f"Error processing Word document: {str(e)}")
|
|
raise FileProcessingError(f"Failed to process Word document: {str(e)}")
|
|
|
|
async def _processXlsx(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
|
|
"""Process Excel document"""
|
|
try:
|
|
self._loadOfficeExtractor()
|
|
if not officeExtractorLoaded:
|
|
raise FileProcessingError("Office extraction libraries not available")
|
|
|
|
if isinstance(document, TaskDocument):
|
|
fileData = base64.b64decode(document.data)
|
|
else:
|
|
fileData = self.serviceManagement.getFileData(document.fileId)
|
|
if fileData is None:
|
|
raise FileProcessingError(f"Could not get file data for {document.fileId}")
|
|
|
|
contentItems = []
|
|
|
|
with io.BytesIO(fileData) as xlsxStream:
|
|
workbook = openpyxl.load_workbook(xlsxStream, data_only=True)
|
|
|
|
for sheetName in workbook.sheetnames:
|
|
sheet = workbook[sheetName]
|
|
csvRows = []
|
|
for row in sheet.iter_rows():
|
|
csvRow = []
|
|
for cell in row:
|
|
value = cell.value
|
|
if value is None:
|
|
csvRow.append("")
|
|
else:
|
|
csvRow.append(str(value).replace('"', '""'))
|
|
csvRows.append(','.join(f'"{cell}"' for cell in csvRow))
|
|
|
|
content = "\n".join(csvRows)
|
|
contentItems.append(ContentItem(
|
|
label=sheetName,
|
|
data=content,
|
|
metadata=ContentMetadata(
|
|
size=len(content.encode('utf-8')),
|
|
pages=1
|
|
)
|
|
))
|
|
|
|
return contentItems
|
|
except Exception as e:
|
|
logger.error(f"Error processing Excel document: {str(e)}")
|
|
raise FileProcessingError(f"Failed to process Excel document: {str(e)}")
|
|
|
|
async def _processBinary(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
|
|
"""Process binary document"""
|
|
try:
|
|
if isinstance(document, TaskDocument):
|
|
fileData = base64.b64decode(document.data)
|
|
else:
|
|
fileData = self.serviceManagement.getFileData(document.fileId)
|
|
if fileData is None:
|
|
raise FileProcessingError(f"Could not get file data for {document.fileId}")
|
|
|
|
return [ContentItem(
|
|
label="binary",
|
|
data=base64.b64encode(fileData).decode('utf-8'),
|
|
metadata=ContentMetadata(
|
|
size=len(fileData),
|
|
error="Unsupported file type"
|
|
)
|
|
)]
|
|
except Exception as e:
|
|
logger.error(f"Error processing binary document: {str(e)}")
|
|
raise FileProcessingError(f"Failed to process binary document: {str(e)}")
|
|
|
|
async def _extractText(self, content: bytes, mimeType: str) -> str:
|
|
"""Extract text content from various text formats"""
|
|
try:
|
|
textContent = content.decode('utf-8')
|
|
return textContent
|
|
except UnicodeDecodeError:
|
|
logger.warning(f"Failed to decode text content as UTF-8 for {mimeType}")
|
|
return ""
|
|
|
|
    async def _extractImage(self, content: bytes, mimeType: str) -> str:
        """Extract text from image content using OCR.

        Returns the recognized text, or "" when OCR fails.

        NOTE(review): ``pytesseract`` is never imported anywhere in this file,
        and ``Image`` is only bound at module scope after
        ``_loadImageProcessor()`` has run — neither is guaranteed here. As
        written, calling this method raises a NameError that the broad
        ``except`` below swallows, so it always returns "". Fixing it requires
        importing pytesseract (a new dependency) — confirm intent with the
        owner.
        """
        try:
            imageContent = Image.open(io.BytesIO(content))
            text = pytesseract.image_to_string(imageContent)
            return text
        except Exception as e:
            # Broad catch also hides the missing-import NameError noted above.
            logger.error(f"Error extracting text from image: {str(e)}")
            return ""
|
|
|
|
async def _extractVideo(self, content: bytes, mimeType: str) -> str:
|
|
"""Extract text from video content"""
|
|
try:
|
|
videoContent = io.BytesIO(content)
|
|
# TODO: Implement video text extraction
|
|
return "Video content extraction not implemented"
|
|
except Exception as e:
|
|
logger.error(f"Error extracting text from video: {str(e)}")
|
|
return ""
|
|
|
|
async def _extractAudio(self, content: bytes, mimeType: str) -> str:
|
|
"""Extract text from audio content using speech recognition"""
|
|
try:
|
|
audioContent = io.BytesIO(content)
|
|
# TODO: Implement audio text extraction
|
|
return "Audio content extraction not implemented"
|
|
except Exception as e:
|
|
logger.error(f"Error extracting text from audio: {str(e)}")
|
|
return ""
|
|
|
|
async def _extractJson(self, content: bytes, mimeType: str) -> str:
|
|
"""Extract text from JSON content"""
|
|
try:
|
|
jsonContent = json.loads(content.decode('utf-8'))
|
|
return json.dumps(jsonContent, indent=2)
|
|
except Exception as e:
|
|
logger.error(f"Error extracting text from JSON: {str(e)}")
|
|
return ""
|
|
|
|
async def _extractXml(self, content: bytes, mimeType: str) -> str:
|
|
"""Extract text from XML content"""
|
|
try:
|
|
xmlContent = content.decode('utf-8')
|
|
return xmlContent
|
|
except Exception as e:
|
|
logger.error(f"Error extracting text from XML: {str(e)}")
|
|
return ""
|
|
|
|
async def _extractCsv(self, content: bytes, mimeType: str) -> str:
|
|
"""Extract text from CSV content"""
|
|
try:
|
|
csvContent = content.decode('utf-8')
|
|
return csvContent
|
|
except Exception as e:
|
|
logger.error(f"Error extracting text from CSV: {str(e)}")
|
|
return ""
|
|
|