from typing import Dict, Any, List, Optional, Union, Tuple, TypedDict, Callable, Awaitable
import logging
import json
import os
import io
import base64
from datetime import datetime, UTC
from pathlib import Path
import xml.etree.ElementTree as ET

from bs4 import BeautifulSoup

from modules.interfaces.serviceChatModel import (
    ChatDocument, TaskDocument, ExtractedContent, ContentItem, ContentMetadata
)
from modules.interfaces.serviceManagementClass import ServiceManagement, getInterface
from modules.interfaces.serviceAppModel import User
from modules.neutralizer.neutralizer import DataAnonymizer
from modules.shared.configuration import APP_CONFIG

logger = logging.getLogger(__name__)

# Optional extractor libraries - imported lazily on first use so the module
# loads even when they are not installed.
pdfExtractorLoaded = False
officeExtractorLoaded = False
imageProcessorLoaded = False


class FileProcessingError(Exception):
    """Custom exception for file processing errors."""
    pass


class DocumentProcessor:
    """Processor for handling document operations and content extraction.

    Dispatches documents by MIME type to a dedicated extractor, optionally
    post-processes the extracted content with an AI prompt (chunking large
    payloads first), and optionally neutralizes content before it is sent
    to the AI backend.
    """

    def __init__(self, currentUser: Optional[User] = None):
        """Initialize the document processor.

        Args:
            currentUser: Optional user context used to resolve the service
                management interface.
        """
        self.serviceManagement = getInterface(currentUser)
        # Content neutralization is opt-in via configuration.
        self._neutralizer = DataAnonymizer() if APP_CONFIG.get("ENABLE_CONTENT_NEUTRALIZATION", False) else None
        # Dispatch table: MIME type -> async processor coroutine.
        self.supportedTypes: Dict[str, Callable[[Union[ChatDocument, TaskDocument]], Awaitable[List[ContentItem]]]] = {
            'text/plain': self._processText,
            'text/csv': self._processCsv,
            'application/json': self._processJson,
            'application/xml': self._processXml,
            'text/html': self._processHtml,
            'image/svg+xml': self._processSvg,
            'image/jpeg': self._processImage,
            'image/png': self._processImage,
            'image/gif': self._processImage,
            'application/pdf': self._processPdf,
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': self._processDocx,
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': self._processXlsx
        }
        # Maximum chunk size (bytes) per logical content category, used when
        # splitting content for AI processing.
        self.chunkSizes = {
            "text": 40000,             # General text content
            "plain": 40000,            # Plain text
            "csv": 40000,              # CSV data
            "json": 40000,             # JSON data
            "xml": 40000,              # XML data
            "html": 40000,             # HTML content
            "image": 1024 * 1024,      # 1MB for images
            "video": 5 * 1024 * 1024,  # 5MB for video chunks
            "binary": 1024 * 1024,     # 1MB for binary data
            "pdf": 40000,              # PDF text content
            "docx": 40000,             # Word document text
            "xlsx": 40000,             # Excel data
            "svg": 40000               # SVG content
        }

    def initialize(self) -> None:
        """Initialize the document processor (currently a no-op placeholder)."""
        pass

    def _loadPdfExtractor(self) -> None:
        """Load PDF extraction libraries (PyPDF2, PyMuPDF) on first use."""
        global pdfExtractorLoaded
        if not pdfExtractorLoaded:
            try:
                global PyPDF2, fitz
                import PyPDF2
                import fitz  # PyMuPDF for more extensive PDF processing
                pdfExtractorLoaded = True
                logger.info("PDF extraction libraries successfully loaded")
            except ImportError as e:
                logger.warning(f"PDF extraction libraries could not be loaded: {e}")

    def _loadOfficeExtractor(self) -> None:
        """Load Office document extraction libraries (python-docx, openpyxl) on first use."""
        global officeExtractorLoaded
        if not officeExtractorLoaded:
            try:
                global docx, openpyxl
                import docx  # python-docx for Word documents
                import openpyxl  # for Excel files
                officeExtractorLoaded = True
                logger.info("Office extraction libraries successfully loaded")
            except ImportError as e:
                logger.warning(f"Office extraction libraries could not be loaded: {e}")

    def _loadImageProcessor(self) -> None:
        """Load image processing libraries (Pillow) on first use."""
        global imageProcessorLoaded
        if not imageProcessorLoaded:
            try:
                global PIL, Image
                from PIL import Image
                imageProcessorLoaded = True
                logger.info("Image processing libraries successfully loaded")
            except ImportError as e:
                logger.warning(f"Image processing libraries could not be loaded: {e}")

    # ------------------------------------------------------------------
    # Shared document-access helpers (TaskDocument carries inline base64
    # data; ChatDocument is fetched through the service management store).
    # ------------------------------------------------------------------

    def _readBytes(self, document: Union[ChatDocument, TaskDocument]) -> bytes:
        """Return the raw bytes of a document payload.

        Raises:
            FileProcessingError: If the file data cannot be retrieved.
        """
        if isinstance(document, TaskDocument):
            return base64.b64decode(document.data)
        fileData = self.serviceManagement.getFileData(document.fileId)
        if fileData is None:
            raise FileProcessingError(f"Could not get file data for {document.fileId}")
        return fileData

    def _readText(self, document: Union[ChatDocument, TaskDocument]) -> str:
        """Return the document payload decoded as UTF-8 text."""
        return self._readBytes(document).decode('utf-8')

    def _textItem(self, content: str, mimeType: str, label: str = "main") -> ContentItem:
        """Build a single-page, non-encoded text ContentItem for *content*."""
        return ContentItem(
            label=label,
            data=content,
            metadata=ContentMetadata(
                size=len(content.encode('utf-8')),
                pages=1,
                mimeType=mimeType,
                base64Encoded=False
            )
        )

    async def processDocument(self, document: TaskDocument, prompt: str) -> ExtractedContent:
        """
        Process a document and extract its contents with AI processing.

        Args:
            document: The document to process
            prompt: Prompt for AI content extraction

        Returns:
            ExtractedContent containing the processed content

        Raises:
            FileProcessingError: If document processing fails
        """
        try:
            contentType = document.mimeType
            if contentType == "application/octet-stream":
                # Generic type: try to detect the actual file type.
                contentType = self._detectContentType(document)

            if contentType not in self.supportedTypes:
                # Fallback to binary processing for unsupported types.
                contentItems = await self._processBinary(document)
            else:
                processor = self.supportedTypes[contentType]
                contentItems = await processor(document)

            # Optional AI post-processing; failures fall back to raw content.
            if prompt and contentItems:
                try:
                    contentItems = await self._aiDataExtraction(contentItems, prompt)
                except Exception as e:
                    logger.error(f"Error processing content with AI: {str(e)}")

            return ExtractedContent(
                objectId=document.id,
                objectType="TaskDocument",
                contents=contentItems
            )
        except Exception as e:
            logger.error(f"Error processing document: {str(e)}")
            raise FileProcessingError(f"Failed to process document: {str(e)}")

    def _detectContentType(self, document: Union[ChatDocument, TaskDocument]) -> str:
        """Detect a document's content type from its file extension or content.

        Falls back to 'text/plain' for UTF-8-decodable inline payloads and
        'application/octet-stream' otherwise.
        """
        try:
            # Check file extension first.
            ext = os.path.splitext(document.filename)[1].lower()
            if ext:
                # Map common extensions to MIME types.
                extToMime = {
                    '.txt': 'text/plain',
                    '.md': 'text/markdown',
                    '.csv': 'text/csv',
                    '.json': 'application/json',
                    '.xml': 'application/xml',
                    '.js': 'application/javascript',
                    '.py': 'application/x-python',
                    '.svg': 'image/svg+xml',
                    '.jpg': 'image/jpeg',
                    '.png': 'image/png',
                    '.gif': 'image/gif',
                    '.pdf': 'application/pdf',
                    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                    '.doc': 'application/msword',
                    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                    '.xls': 'application/vnd.ms-excel',
                    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                    '.ppt': 'application/vnd.ms-powerpoint'
                }
                if ext in extToMime:
                    return extToMime[ext]

            # No known extension: probe whether inline data is valid UTF-8 text.
            if isinstance(document, TaskDocument):
                try:
                    content = base64.b64decode(document.data)
                    content.decode('utf-8')
                    return 'text/plain'
                except UnicodeDecodeError:
                    pass

            return 'application/octet-stream'
        except Exception as e:
            logger.error(f"Error detecting content type: {str(e)}")
            return 'application/octet-stream'

    async def _processText(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
        """Process a plain-text document."""
        try:
            content = self._readText(document)
            return [self._textItem(content, "text/plain")]
        except Exception as e:
            logger.error(f"Error processing text document: {str(e)}")
            raise FileProcessingError(f"Failed to process text document: {str(e)}")

    async def _processCsv(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
        """Process a CSV document."""
        try:
            content = self._readText(document)
            return [self._textItem(content, "text/csv")]
        except Exception as e:
            logger.error(f"Error processing CSV document: {str(e)}")
            raise FileProcessingError(f"Failed to process CSV document: {str(e)}")

    async def _processJson(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
        """Process a JSON document. Raises if the payload is not valid JSON."""
        try:
            content = self._readText(document)
            # Parse only to validate; the original text is returned unchanged.
            json.loads(content)
            return [self._textItem(content, "application/json")]
        except Exception as e:
            logger.error(f"Error processing JSON document: {str(e)}")
            raise FileProcessingError(f"Failed to process JSON document: {str(e)}")

    async def _processXml(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
        """Process an XML document."""
        try:
            content = self._readText(document)
            return [self._textItem(content, "application/xml")]
        except Exception as e:
            logger.error(f"Error processing XML document: {str(e)}")
            raise FileProcessingError(f"Failed to process XML document: {str(e)}")

    async def _processHtml(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
        """Process an HTML document."""
        try:
            content = self._readText(document)
            return [self._textItem(content, "text/html")]
        except Exception as e:
            logger.error(f"Error processing HTML document: {str(e)}")
            raise FileProcessingError(f"Failed to process HTML document: {str(e)}")

    async def _processSvg(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
        """Process an SVG document.

        NOTE(review): the original source for this method was truncated
        (likely by markup stripping); the SVG-detection branch below is a
        reconstruction — confirm against version history.
        """
        try:
            content = self._readText(document)
            # Check if it's actually SVG; fall back to plain text otherwise.
            isSvg = "<svg" in content.lower()
            return [self._textItem(content, "image/svg+xml" if isSvg else "text/plain")]
        except Exception as e:
            logger.error(f"Error processing SVG document: {str(e)}")
            raise FileProcessingError(f"Failed to process SVG document: {str(e)}")

    async def _processImage(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
        """Process an image document, capturing dimensions and base64 data."""
        try:
            self._loadImageProcessor()
            if not imageProcessorLoaded:
                raise FileProcessingError("Image processing libraries not available")

            fileData = self._readBytes(document)
            with io.BytesIO(fileData) as imgStream:
                img = Image.open(imgStream)
                metadata = ContentMetadata(
                    size=len(fileData),
                    width=img.width,
                    height=img.height,
                    colorMode=img.mode,
                    mimeType=document.mimeType,
                    base64Encoded=True
                )
                # Re-encode the raw bytes as base64 for storage.
                imgStream.seek(0)
                imgData = base64.b64encode(imgStream.read()).decode('utf-8')
            return [ContentItem(label="image", data=imgData, metadata=metadata)]
        except Exception as e:
            logger.error(f"Error processing image document: {str(e)}")
            raise FileProcessingError(f"Failed to process image document: {str(e)}")

    async def _processPdf(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
        """Process a PDF document: per-page text (PyPDF2) plus embedded images (PyMuPDF)."""
        try:
            self._loadPdfExtractor()
            if not pdfExtractorLoaded:
                raise FileProcessingError("PDF extraction libraries not available")

            fileData = self._readBytes(document)
            contentItems: List[ContentItem] = []
            with io.BytesIO(fileData) as pdfStream:
                # Extract text with PyPDF2, one ContentItem per non-empty page.
                pdfReader = PyPDF2.PdfReader(pdfStream)
                for pageNum in range(len(pdfReader.pages)):
                    pageText = pdfReader.pages[pageNum].extract_text()
                    if pageText:
                        contentItems.append(
                            self._textItem(pageText, "text/plain", label=f"page_{pageNum + 1}")
                        )

                # Extract embedded images with PyMuPDF.
                pdfStream.seek(0)
                doc = fitz.open(stream=pdfStream, filetype="pdf")
                for pageNum in range(len(doc)):
                    page = doc[pageNum]
                    for imgIndex, imgInfo in enumerate(page.get_images(full=True)):
                        try:
                            xref = imgInfo[0]
                            baseImage = doc.extract_image(xref)
                            if baseImage:
                                imageBytes = baseImage.get("image", b"")
                                imageExt = baseImage.get("ext", "png")
                                if imageBytes:
                                    contentItems.append(ContentItem(
                                        label=f"image_{pageNum + 1}_{imgIndex}",
                                        data=base64.b64encode(imageBytes).decode('utf-8'),
                                        metadata=ContentMetadata(
                                            size=len(imageBytes),
                                            pages=1,
                                            mimeType=f"image/{imageExt}",
                                            base64Encoded=True
                                        )
                                    ))
                        except Exception as imgE:
                            # A single bad image must not abort the whole PDF.
                            logger.warning(f"Error extracting image {imgIndex} on page {pageNum + 1}: {str(imgE)}")
                doc.close()
            return contentItems
        except Exception as e:
            logger.error(f"Error processing PDF document: {str(e)}")
            raise FileProcessingError(f"Failed to process PDF document: {str(e)}")

    async def _processDocx(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
        """Process a Word document: paragraphs plus pipe-joined table rows."""
        try:
            self._loadOfficeExtractor()
            if not officeExtractorLoaded:
                raise FileProcessingError("Office extraction libraries not available")

            fileData = self._readBytes(document)
            with io.BytesIO(fileData) as docxStream:
                doc = docx.Document(docxStream)
                fullText = [para.text for para in doc.paragraphs]
                # Flatten tables: one line per row, cells joined with " | ".
                for table in doc.tables:
                    for row in table.rows:
                        fullText.append(" | ".join(cell.text for cell in row.cells))
                content = "\n".join(fullText)
                return [ContentItem(
                    label="main",
                    data=content,
                    metadata=ContentMetadata(
                        size=len(content.encode('utf-8')),
                        # NOTE(review): paragraph count stands in for pages here.
                        pages=len(doc.paragraphs),
                        mimeType="text/plain",
                        base64Encoded=False
                    )
                )]
        except Exception as e:
            logger.error(f"Error processing Word document: {str(e)}")
            raise FileProcessingError(f"Failed to process Word document: {str(e)}")

    async def _processXlsx(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
        """Process an Excel document: one CSV-formatted ContentItem per sheet."""
        try:
            self._loadOfficeExtractor()
            if not officeExtractorLoaded:
                raise FileProcessingError("Office extraction libraries not available")

            fileData = self._readBytes(document)
            contentItems: List[ContentItem] = []
            with io.BytesIO(fileData) as xlsxStream:
                # data_only=True returns computed values instead of formulas.
                workbook = openpyxl.load_workbook(xlsxStream, data_only=True)
                for sheetName in workbook.sheetnames:
                    sheet = workbook[sheetName]
                    csvRows = []
                    for row in sheet.iter_rows():
                        csvRow = []
                        for cell in row:
                            value = cell.value
                            # CSV-escape embedded double quotes.
                            csvRow.append("" if value is None else str(value).replace('"', '""'))
                        csvRows.append(','.join(f'"{cell}"' for cell in csvRow))
                    content = "\n".join(csvRows)
                    contentItems.append(self._textItem(content, "text/csv", label=sheetName))
            return contentItems
        except Exception as e:
            logger.error(f"Error processing Excel document: {str(e)}")
            raise FileProcessingError(f"Failed to process Excel document: {str(e)}")

    async def _processBinary(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]:
        """Process an unsupported document as opaque base64 binary data."""
        try:
            fileData = self._readBytes(document)
            return [ContentItem(
                label="binary",
                data=base64.b64encode(fileData).decode('utf-8'),
                metadata=ContentMetadata(
                    size=len(fileData),
                    mimeType=document.mimeType,
                    base64Encoded=True,
                    error="Unsupported file type"
                )
            )]
        except Exception as e:
            logger.error(f"Error processing binary document: {str(e)}")
            raise FileProcessingError(f"Failed to process binary document: {str(e)}")

    async def _aiDataExtraction(self, contentItems: List[ContentItem], prompt: str) -> List[ContentItem]:
        """
        Process content items with AI, handling chunking based on content type.

        Args:
            contentItems: List of content items to process
            prompt: Prompt for AI content extraction

        Returns:
            List of processed content items (original item kept on failure)
        """
        processedItems: List[ContentItem] = []
        for item in contentItems:
            try:
                mimeType = item.metadata.mimeType if hasattr(item.metadata, 'mimeType') else "text/plain"

                # Chunk content based on type. BUGFIX: the chunkers are
                # synchronous; the original code awaited them, which raises
                # TypeError on the returned list.
                if mimeType.startswith('text/'):
                    chunks = self._chunkText(item.data, mimeType)
                elif mimeType.startswith('image/'):
                    chunks = self._chunkImage(item.data)
                elif mimeType.startswith('video/'):
                    chunks = self._chunkVideo(item.data)
                else:
                    # Binary data - no chunking.
                    chunks = [item.data]

                chunkResults = []
                for chunk in chunks:
                    contentToProcess = chunk
                    # Neutralize content if the neutralizer is enabled.
                    if self._neutralizer and contentToProcess:
                        contentToProcess = self._neutralizer.neutralize(contentToProcess)

                    aiPrompt = f"""
Extract relevant information from this content based on the following prompt:

PROMPT: {prompt}

CONTENT:
{contentToProcess}

Return ONLY the extracted information in a clear, concise format.
"""
                    response = await self.serviceManagement.callAi([
                        {"role": "system", "content": "You are an expert at extracting relevant information from documents."},
                        {"role": "user", "content": aiPrompt}
                    ])
                    chunkResults.append(response.strip())

                combinedResult = "\n".join(chunkResults)
                processedItems.append(self._textItem(combinedResult, "text/plain", label=item.label))
            except Exception as e:
                logger.error(f"Error processing content chunk: {str(e)}")
                # Keep the original content if AI processing fails.
                processedItems.append(item)
        return processedItems

    def _chunkText(self, content: str, mimeType: str) -> List[str]:
        """Dispatch text chunking by MIME type; defaults to plain-text chunking."""
        if mimeType == "text/plain":
            return self._chunkPlainText(content)
        elif mimeType == "text/csv":
            return self._chunkCsv(content)
        elif mimeType == "application/json":
            return self._chunkJson(content)
        elif mimeType == "application/xml":
            return self._chunkXml(content)
        elif mimeType == "text/html":
            return self._chunkHtml(content)
        else:
            return self._chunkPlainText(content)

    def _chunkLines(self, content: str, limit: int) -> List[str]:
        """Split *content* into newline-joined chunks of at most *limit* bytes.

        A single line longer than *limit* becomes its own chunk (lines are
        never split mid-line).
        """
        chunks: List[str] = []
        currentChunk: List[str] = []
        currentSize = 0
        for line in content.split('\n'):
            lineSize = len(line.encode('utf-8'))
            if currentSize + lineSize > limit:
                if currentChunk:
                    chunks.append('\n'.join(currentChunk))
                currentChunk = [line]
                currentSize = lineSize
            else:
                currentChunk.append(line)
                currentSize += lineSize
        if currentChunk:
            chunks.append('\n'.join(currentChunk))
        return chunks

    def _chunkPlainText(self, content: str) -> List[str]:
        """Chunk plain text content line-wise under the 'plain' size limit."""
        return self._chunkLines(content, self.chunkSizes["plain"])

    def _chunkCsv(self, content: str) -> List[str]:
        """Chunk CSV content row-wise under the 'csv' size limit."""
        return self._chunkLines(content, self.chunkSizes["csv"])

    def _chunkJson(self, content: str) -> List[str]:
        """Chunk JSON content by top-level values; fall back to the raw string
        if the content is not valid JSON."""
        try:
            data = json.loads(content)
            chunks: List[str] = []
            currentChunk: List[Any] = []
            currentSize = 0

            def processValue(value, path=""):
                nonlocal currentChunk, currentSize
                valueStr = json.dumps({path: value}) if path else json.dumps(value)
                valueSize = len(valueStr.encode('utf-8'))
                if currentSize + valueSize > self.chunkSizes["json"]:
                    if currentChunk:
                        chunks.append(json.dumps(currentChunk))
                    currentChunk = [value]
                    currentSize = valueSize
                else:
                    currentChunk.append(value)
                    currentSize += valueSize

            if isinstance(data, list):
                for i, item in enumerate(data):
                    processValue(item, str(i))
            elif isinstance(data, dict):
                for key, value in data.items():
                    processValue(value, key)
            else:
                processValue(data)

            if currentChunk:
                chunks.append(json.dumps(currentChunk))
            return chunks
        except json.JSONDecodeError:
            return [content]

    def _chunkXml(self, content: str) -> List[str]:
        """Chunk XML content by top-level child elements; fall back to the raw
        string on parse errors."""
        try:
            root = ET.fromstring(content)
            chunks: List[str] = []
            currentChunk: List[str] = []
            currentSize = 0

            def processElement(element, path=""):
                nonlocal currentChunk, currentSize
                elementStr = ET.tostring(element, encoding='unicode')
                elementSize = len(elementStr.encode('utf-8'))
                if currentSize + elementSize > self.chunkSizes["xml"]:
                    if currentChunk:
                        chunks.append(''.join(currentChunk))
                    currentChunk = [elementStr]
                    currentSize = elementSize
                else:
                    currentChunk.append(elementStr)
                    currentSize += elementSize

            for child in root:
                processElement(child)
            if currentChunk:
                chunks.append(''.join(currentChunk))
            return chunks
        except ET.ParseError:
            return [content]

    def _chunkHtml(self, content: str) -> List[str]:
        """Chunk HTML content by block-level elements; fall back to the raw
        string on any parsing failure."""
        try:
            soup = BeautifulSoup(content, 'html.parser')
            chunks: List[str] = []
            currentChunk: List[str] = []
            currentSize = 0

            def processElement(element):
                nonlocal currentChunk, currentSize
                elementStr = str(element)
                elementSize = len(elementStr.encode('utf-8'))
                if currentSize + elementSize > self.chunkSizes["html"]:
                    if currentChunk:
                        chunks.append(''.join(currentChunk))
                    currentChunk = [elementStr]
                    currentSize = elementSize
                else:
                    currentChunk.append(elementStr)
                    currentSize += elementSize

            for element in soup.find_all(['p', 'div', 'section', 'article']):
                processElement(element)
            if currentChunk:
                chunks.append(''.join(currentChunk))
            return chunks
        except Exception:
            return [content]

    def _chunkBase64(self, content: str, limit: int) -> List[str]:
        """Decode base64 *content*, split the bytes into *limit*-sized pieces,
        and re-encode each piece; fall back to the raw string on failure."""
        try:
            rawData = base64.b64decode(content)
            return [
                base64.b64encode(rawData[i:i + limit]).decode('utf-8')
                for i in range(0, len(rawData), limit)
            ]
        except Exception:
            return [content]

    def _chunkImage(self, content: str) -> List[str]:
        """Chunk base64 image content under the 'image' size limit."""
        return self._chunkBase64(content, self.chunkSizes["image"])

    def _chunkVideo(self, content: str) -> List[str]:
        """Chunk base64 video content under the 'video' size limit."""
        return self._chunkBase64(content, self.chunkSizes["video"])

    def _chunkBinary(self, content: str) -> List[str]:
        """Chunk base64 binary content under the 'binary' size limit."""
        return self._chunkBase64(content, self.chunkSizes["binary"])

    async def _extractText(self, content: bytes, mimeType: str) -> str:
        """Extract text content from various text formats; empty string on
        decode failure."""
        try:
            return content.decode('utf-8')
        except UnicodeDecodeError:
            logger.warning(f"Failed to decode text content as UTF-8 for {mimeType}")
            return ""

    async def _extractImage(self, content: bytes, mimeType: str) -> str:
        """Extract text from image content using OCR; empty string on failure."""
        try:
            # BUGFIX: pytesseract was referenced but never imported anywhere,
            # guaranteeing a NameError. Import lazily like the other optional
            # extractors; a missing package degrades to the "" fallback below.
            import pytesseract
            self._loadImageProcessor()
            imageContent = Image.open(io.BytesIO(content))
            return pytesseract.image_to_string(imageContent)
        except Exception as e:
            logger.error(f"Error extracting text from image: {str(e)}")
            return ""

    async def _extractVideo(self, content: bytes, mimeType: str) -> str:
        """Extract text from video content (not yet implemented)."""
        try:
            videoContent = io.BytesIO(content)
            # TODO: Implement video text extraction
            return "Video content extraction not implemented"
        except Exception as e:
            logger.error(f"Error extracting text from video: {str(e)}")
            return ""

    async def _extractAudio(self, content: bytes, mimeType: str) -> str:
        """Extract text from audio content using speech recognition (not yet implemented)."""
        try:
            audioContent = io.BytesIO(content)
            # TODO: Implement audio text extraction
            return "Audio content extraction not implemented"
        except Exception as e:
            logger.error(f"Error extracting text from audio: {str(e)}")
            return ""

    async def _extractJson(self, content: bytes, mimeType: str) -> str:
        """Extract pretty-printed text from JSON content; empty string on failure."""
        try:
            jsonContent = json.loads(content.decode('utf-8'))
            return json.dumps(jsonContent, indent=2)
        except Exception as e:
            logger.error(f"Error extracting text from JSON: {str(e)}")
            return ""

    async def _extractXml(self, content: bytes, mimeType: str) -> str:
        """Extract text from XML content; empty string on failure."""
        try:
            return content.decode('utf-8')
        except Exception as e:
            logger.error(f"Error extracting text from XML: {str(e)}")
            return ""

    async def _extractCsv(self, content: bytes, mimeType: str) -> str:
        """Extract text from CSV content; empty string on failure."""
        try:
            return content.decode('utf-8')
        except Exception as e:
            logger.error(f"Error extracting text from CSV: {str(e)}")
            return ""