# gateway/modules/chat_content_extraction.py

"""
Module for extracting content from various file formats.
Provides specialized functions for processing text, PDF, Office documents, images, etc.
"""

import logging
import os
import io
from typing import Dict, Any, List, Optional, Union, Tuple
import base64

# Module-wide logger, named after this module
logger = logging.getLogger(__name__)

# Lazy-import state: every flag starts out False and is set to True by the
# matching _load_* helper once its optional libraries have been imported
pdf_extractor_loaded = office_extractor_loaded = image_processor_loaded = False

def get_document_contents(file_metadata: Dict[str, Any], file_content: bytes) -> List[Dict[str, Any]]:
    """
    Main function for extracting content from a file based on its MIME type.
    Delegates to specialized extraction functions.

    Every returned item whose "data" is raw bytes is base64-encoded and flagged
    with metadata["base64_encoded"] = True. This also holds for the single
    fallback item that is returned when extraction raises, so callers see the
    same payload format on the success and on the error path.

    Args:
        file_metadata: File metadata; the keys "mime_type" and "name" are read
        file_content: Binary data of the file

    Returns:
        List of Document-Content objects with metadata and is_text flag
    """
    try:
        mime_type = file_metadata.get("mime_type", "application/octet-stream")
        file_name = file_metadata.get("name", "unknown")

        logger.info(f"Extracting content from file '{file_name}' (MIME type: {mime_type})")

        # Extract content based on MIME type
        contents = []

        # CSV Format - must be tested before the generic "text/" prefix check
        # below, which would otherwise claim "text/csv" as plain text
        if mime_type == "text/csv":
            contents.extend(extract_csv_content(file_name, file_content))

        # Text-based formats
        elif mime_type.startswith("text/") or mime_type in [
            "application/json",
            "application/xml",
            "application/javascript",
            "application/x-python"
        ]:
            contents.extend(extract_text_content(file_name, file_content, mime_type))

        # Images
        elif mime_type.startswith("image/"):
            contents.extend(extract_image_content(file_name, file_content, mime_type))

        # PDF Documents
        elif mime_type == "application/pdf":
            contents.extend(extract_pdf_content(file_name, file_content))

        # Word Documents
        elif mime_type in [
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "application/msword"
        ]:
            contents.extend(extract_word_content(file_name, file_content, mime_type))

        # Excel Documents
        elif mime_type in [
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "application/vnd.ms-excel"
        ]:
            contents.extend(extract_excel_content(file_name, file_content, mime_type))

        # PowerPoint Documents
        elif mime_type in [
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.ms-powerpoint"
        ]:
            contents.extend(extract_powerpoint_content(file_name, file_content, mime_type))

        # Binary data as fallback for unknown formats
        else:
            contents.extend(extract_binary_content(file_name, file_content, mime_type))

        # Fallback when no content could be extracted
        if not contents:
            logger.warning(f"No content extracted from file '{file_name}', using binary fallback")
            suffix = os.path.splitext(file_name)[1]
            contents.append({
                "sequence_nr": 1,
                "name": '1_undefined',
                "ext": suffix[1:] if suffix else "bin",
                "content_type": mime_type,
                "data": file_content,
                "metadata": {
                    "is_text": False
                }
            })

        # Add generic attributes for all documents
        _encode_binary_payloads(contents)

        logger.info(f"Successfully extracted {len(contents)} content items from file '{file_name}'")
        return contents

    except Exception as e:
        logger.error(f"Error during content extraction: {str(e)}")
        # Fallback on error - return original data. The metadata itself may be
        # what caused the failure, so it is read defensively here.
        metadata = file_metadata if hasattr(file_metadata, "get") else {}
        suffix = os.path.splitext(str(metadata.get("name") or ""))[1]
        fallback = {
            "sequence_nr": 1,
            "name": metadata.get("name", "unknown"),
            "ext": suffix[1:] if suffix else "bin",
            "content_type": metadata.get("mime_type", "application/octet-stream"),
            "data": file_content,
            "metadata": {
                "is_text": False
            }
        }
        # Same payload encoding as on the success path
        return _encode_binary_payloads([fallback])


def _encode_binary_payloads(contents: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Base64-encodes every bytes payload in place, flags the item, and returns the list."""
    for content in contents:
        if isinstance(content.get("data"), bytes):
            content["data"] = base64.b64encode(content["data"]).decode('utf-8')
            # Add base64 flag
            content.setdefault("metadata", {})["base64_encoded"] = True
    return contents


def _load_pdf_extractor():
    """
    Imports PyPDF2 and fitz (PyMuPDF) on first use and publishes them as
    module globals. Does nothing once pdf_extractor_loaded is True; a failed
    import is logged as a warning and leaves the flag False.
    """
    global pdf_extractor_loaded, PyPDF2, fitz

    if pdf_extractor_loaded:
        return

    try:
        import PyPDF2
        import fitz  # PyMuPDF for more extensive PDF processing
    except ImportError as e:
        logger.warning(f"PDF extraction libraries could not be loaded: {e}")
    else:
        pdf_extractor_loaded = True
        logger.info("PDF extraction libraries successfully loaded")
def _load_office_extractor():
    """
    Imports docx (python-docx) and openpyxl on first use and publishes them as
    module globals. Does nothing once office_extractor_loaded is True; a failed
    import is logged as a warning and leaves the flag False.
    """
    global office_extractor_loaded, docx, openpyxl

    if office_extractor_loaded:
        return

    try:
        import docx  # python-docx for Word documents
        import openpyxl  # for Excel files
    except ImportError as e:
        logger.warning(f"Office extraction libraries could not be loaded: {e}")
    else:
        office_extractor_loaded = True
        logger.info("Office extraction libraries successfully loaded")

def _load_image_processor():
    """
    Imports PIL's Image module on first use and publishes it as the module
    global `Image`. Does nothing once image_processor_loaded is True; a failed
    import is logged as a warning and leaves the flag False.
    """
    global image_processor_loaded, Image

    if image_processor_loaded:
        return

    try:
        from PIL import Image
    except ImportError as e:
        logger.warning(f"Image processing libraries could not be loaded: {e}")
    else:
        image_processor_loaded = True
        logger.info("Image processing libraries successfully loaded")

def extract_text_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Extracts text from text files.

    The bytes are decoded as UTF-8 first; a leading byte-order mark is dropped
    so it does not end up in the text. If that fails, cp1252 and then latin-1
    are tried. cp1252 comes first because latin-1 accepts every byte value and
    would otherwise always win, turning cp1252 punctuation (euro sign,
    typographic quotes, ...) into C1 control characters.

    Args:
        file_name: Name of the file
        file_content: Binary data of the file
        mime_type: MIME type of the file

    Returns:
        List with one content object. metadata.is_text is True for decoded
        text (metadata.encoding is added when a non-UTF-8 encoding was used)
        and False if the raw bytes are returned instead.
    """
    # Keep original file extension
    suffix = os.path.splitext(file_name)[1]
    file_extension = suffix[1:] if suffix else "txt"

    def _text_item(text, encoding=None):
        # The encoding is only recorded when a fallback encoding was needed
        metadata = {"is_text": True}
        if encoding is not None:
            metadata["encoding"] = encoding
        return {
            "sequence_nr": 1,
            "name": "1_text",  # Simplified naming
            "ext": file_extension,
            "content_type": "text",
            "data": text,
            "metadata": metadata
        }

    try:
        # "utf-8-sig" is UTF-8 that additionally skips a leading BOM
        return [_text_item(file_content.decode('utf-8-sig'))]
    except UnicodeDecodeError:
        logger.warning(f"Could not decode text from file '{file_name}' as UTF-8, trying alternative encodings")

    # Try alternative encodings, strictest first
    for encoding in ('cp1252', 'latin-1'):
        try:
            text_content = file_content.decode(encoding)
        except UnicodeDecodeError:
            continue
        logger.info(f"Text successfully decoded with encoding {encoding}")
        return [_text_item(text_content, encoding)]

    # Safety net: not reachable while latin-1 is in the list above, kept so
    # that changing the list can never make this function return nothing
    logger.warning("Could not decode text, using binary data")
    return [{
        "sequence_nr": 1,
        "name": "1_binary",  # Simplified naming
        "ext": file_extension,
        "content_type": mime_type,
        "data": file_content,
        "metadata": {
            "is_text": False
        }
    }]

def extract_csv_content(file_name: str, file_content: bytes) -> List[Dict[str, Any]]:
    """
    Extracts content from CSV files.

    The bytes are decoded as UTF-8 first; a leading byte-order mark is dropped
    so it does not become part of the first header field. If that fails,
    cp1252 and then latin-1 are tried. cp1252 comes first because latin-1
    accepts every byte value and would otherwise always win, turning cp1252
    punctuation into C1 control characters.

    Args:
        file_name: Name of the file
        file_content: Binary data of the file

    Returns:
        List with one content object. metadata.is_text is True for decoded CSV
        text (metadata.encoding is added when a non-UTF-8 encoding was used)
        and False if the raw bytes are returned instead.
    """
    def _csv_item(text, encoding=None):
        # The encoding is only recorded when a fallback encoding was needed
        metadata = {"is_text": True}
        if encoding is not None:
            metadata["encoding"] = encoding
        metadata["format"] = "csv"
        return {
            "sequence_nr": 1,
            "name": "1_csv",  # Simplified naming
            "ext": "csv",
            "content_type": "csv",
            "data": text,
            "metadata": metadata
        }

    try:
        # "utf-8-sig" is UTF-8 that additionally skips a leading BOM
        return [_csv_item(file_content.decode('utf-8-sig'))]
    except UnicodeDecodeError:
        logger.warning(f"Could not decode CSV from file '{file_name}' as UTF-8, trying alternative encodings")

    # Try alternative encodings for CSV, strictest first
    for encoding in ('cp1252', 'latin-1'):
        try:
            csv_content = file_content.decode(encoding)
        except UnicodeDecodeError:
            continue
        logger.info(f"CSV successfully decoded with encoding {encoding}")
        return [_csv_item(csv_content, encoding)]

    # Safety net: not reachable while latin-1 is in the list above, kept so
    # that changing the list can never make this function return nothing
    return [{
        "sequence_nr": 1,
        "name": "1_binary",  # Simplified naming
        "ext": "csv",
        "content_type": "text/csv",
        "data": file_content,
        "metadata": {
            "is_text": False
        }
    }]

def extract_image_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Wraps an image file as a content item and, when the image library is
    available, adds its basic properties plus a short textual description.

    Args:
        file_name: Name of the file
        file_content: Binary data of the file
        mime_type: MIME type of the file

    Returns:
        List whose first item is the image itself (metadata.is_text = False);
        a second text item with the image description follows when one was
        produced
    """
    # The MIME subtype serves as file extension ("jpeg" is shortened to "jpg")
    subtype = mime_type.split('/')[-1]
    file_extension = "jpg" if subtype == "jpeg" else subtype

    image_metadata = {"is_text": False, "format": "image"}
    image_description = None

    try:
        _load_image_processor()
        if image_processor_loaded:
            if file_content and len(file_content) > 0:
                with io.BytesIO(file_content) as img_stream:
                    try:
                        # First pass only checks that the data is a readable image
                        Image.open(img_stream).verify()
                        # The verified object is not used afterwards; rewind and reopen
                        img_stream.seek(0)
                        picture = Image.open(img_stream)

                        fmt, mode = picture.format, picture.mode
                        width, height = picture.width, picture.height
                        image_metadata["format"] = fmt
                        image_metadata["mode"] = mode
                        image_metadata["width"] = width
                        image_metadata["height"] = height

                        # EXIF tags are stored as strings under "tag_<id>" keys
                        if hasattr(picture, '_getexif') and callable(picture._getexif):
                            exif = picture._getexif()
                            if exif:
                                image_metadata["exif"] = {
                                    f"tag_{tag_id}": str(value)
                                    for tag_id, value in exif.items()
                                }

                        image_description = f"Image ({picture.width}x{picture.height}, {picture.format}, {picture.mode})"
                    except Exception as inner_e:
                        problem = str(inner_e)
                        logger.warning(f"Error processing image: {problem}")
                        image_metadata["error"] = problem
                        image_description = f"Image (unable to process: {problem})"
    except Exception as e:
        logger.warning(f"Could not extract image metadata: {str(e)}")
        image_metadata["error"] = str(e)

    # The image itself always comes first
    image_item = {
        "sequence_nr": 1,
        "name": "1_image",  # Simplified naming
        "ext": file_extension,
        "content_type": "image",
        "data": file_content,
        "metadata": image_metadata
    }
    if not image_description:
        return [image_item]

    # Description travels along as an extra text item
    description_item = {
        "sequence_nr": 2,
        "name": "2_text_image_info",  # Simplified naming with label
        "ext": "txt",
        "content_type": "text",
        "data": image_description,
        "metadata": {
            "is_text": True,
            "image_description": True
        }
    }
    return [image_item, description_item]

def extract_pdf_content(file_name: str, file_content: bytes) -> List[Dict[str, Any]]:
    """
    Extracts text and images from PDF files.

    Text (via PyPDF2) and embedded images (via PyMuPDF/fitz) are extracted
    independently: a failure in one step is logged and does not prevent the
    other step from running. If nothing could be extracted - libraries
    missing, unreadable file, or a PDF without text and images - the original
    PDF is returned as a single binary item.

    Args:
        file_name: Name of the file
        file_content: Binary data of the file

    Returns:
        List of PDF-Content objects (text and images) with metadata.is_text flag
    """
    contents: List[Dict[str, Any]] = []

    try:
        # Load PDF extraction libraries
        _load_pdf_extractor()
        if not pdf_extractor_loaded:
            logger.warning("PDF extraction not possible: Libraries not available")
        else:
            # Extract text with PyPDF2
            try:
                text_item = _extract_pdf_text(file_content)
                if text_item is not None:
                    contents.append(text_item)
            except Exception as text_e:
                logger.error(f"Error in PDF extraction: {str(text_e)}")

            # Extract images with PyMuPDF (fitz); images found before a
            # failure stay in `contents`
            try:
                _append_pdf_images(file_content, contents)
            except Exception as img_extract_e:
                logger.warning(f"Error extracting images from PDF: {str(img_extract_e)}")

    except Exception as e:
        logger.error(f"Error in PDF extraction: {str(e)}")

    # If no content was extracted, add the original PDF
    if not contents:
        contents.append({
            "sequence_nr": 1,
            "name": "1_pdf",  # Simplified naming
            "ext": "pdf",
            "content_type": "application/pdf",
            "data": file_content,
            "metadata": {
                "is_text": False,
                "format": "pdf"
            }
        })

    return contents


def _extract_pdf_text(file_content: bytes) -> Optional[Dict[str, Any]]:
    """Builds the text content item for a PDF with PyPDF2; returns None if no page yields text."""
    pdf_metadata = {}
    page_texts = []

    with io.BytesIO(file_content) as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)

        # Extract metadata; a leading "/" is removed from the keys
        for key, value in (pdf_reader.metadata or {}).items():
            pdf_metadata[key[1:] if key.startswith('/') else key] = value

        # Read the page count while the underlying stream is still open
        page_count = len(pdf_reader.pages)

        # Extract text from all pages
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            page_text = page.extract_text()
            if page_text:
                page_texts.append(f"--- Page {page_num} ---\n{page_text}\n\n")

    extracted_text = "".join(page_texts)
    if not extracted_text.strip():
        return None

    return {
        "sequence_nr": 1,
        "name": "1_text",  # Simplified naming
        "ext": "txt",
        "content_type": "text",
        "data": extracted_text,
        "metadata": {
            "is_text": True,
            "source": "pdf",
            "pages": page_count,
            "pdf_metadata": pdf_metadata
        }
    }


def _append_pdf_images(file_content: bytes, contents: List[Dict[str, Any]]) -> None:
    """Appends one content item per embedded PDF image (extracted with fitz) to `contents`."""
    with io.BytesIO(file_content) as pdf_stream:
        doc = fitz.open(stream=pdf_stream, filetype="pdf")
        try:
            for page_num in range(len(doc)):
                page = doc[page_num]

                for img_index, img_info in enumerate(page.get_images(full=True)):
                    try:
                        xref = img_info[0]
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        image_ext = base_image["ext"]

                        # Add image as content
                        sequence_nr = len(contents) + 1
                        contents.append({
                            "sequence_nr": sequence_nr,
                            "name": f"{sequence_nr}_image_page{page_num+1}_{img_index+1}",  # Simplified naming with label
                            "ext": image_ext,
                            "content_type": f"image/{image_ext}",
                            "data": image_bytes,
                            "metadata": {
                                "is_text": False,
                                "source": "pdf",
                                "page": page_num + 1,
                                "index": img_index
                            }
                        })
                    except Exception as img_e:
                        # A single broken image must not stop the remaining ones
                        logger.warning(f"Error extracting image {img_index} on page {page_num + 1}: {str(img_e)}")
        finally:
            # Close document, also when iterating the pages failed
            doc.close()

def extract_word_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Pulls the paragraph and table text out of a Word document.

    Only the DOCX format is parsed. For legacy DOC files, when the Office
    libraries are unavailable, when parsing fails, or when the document holds
    no text, the original file is handed back as one binary item.

    Args:
        file_name: Name of the file
        file_content: Binary data of the file
        mime_type: MIME type of the file

    Returns:
        List with a single content object: either the extracted text
        (metadata.is_text = True) or the original document (metadata.is_text = False)
    """
    is_docx = mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    file_extension = "docx" if is_docx else "doc"

    def _original_document():
        # The untouched file, used whenever no text could be produced
        return [{
            "sequence_nr": 1,
            "name": "1_word",  # Simplified naming
            "ext": file_extension,
            "content_type": mime_type,
            "data": file_content,
            "metadata": {
                "is_text": False,
                "format": "word"
            }
        }]

    try:
        _load_office_extractor()
        if not office_extractor_loaded:
            logger.warning("Word extraction not possible: Libraries not available")
            return _original_document()

        if not is_docx:
            logger.warning("Extraction from old Word format (DOC) not supported")
            return _original_document()

        with io.BytesIO(file_content) as docx_stream:
            document = docx.Document(docx_stream)

            # Paragraphs first, then every table row as "cell | cell | ..."
            text_blocks = [paragraph.text for paragraph in document.paragraphs]
            for table in document.tables:
                for row in table.rows:
                    text_blocks.append(" | ".join([cell.text for cell in row.cells]))

            extracted_text = "\n\n".join(text_blocks)
            if extracted_text.strip():
                return [{
                    "sequence_nr": 1,
                    "name": "1_text",  # Simplified naming
                    "ext": "txt",
                    "content_type": "text",
                    "data": extracted_text,
                    "metadata": {
                        "is_text": True,
                        "source": "docx",
                        "paragraph_count": len(document.paragraphs),
                        "table_count": len(document.tables)
                    }
                }]

    except Exception as e:
        logger.error(f"Error in Word extraction: {str(e)}")

    # Nothing usable was extracted
    return _original_document()

def extract_excel_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Converts the worksheets of an Excel workbook into CSV text items.

    Only the XLSX format is parsed; every non-empty sheet becomes one CSV item
    in which all fields are double-quoted. For legacy XLS files, when the
    Office libraries are unavailable, or when no sheet produced any content,
    the original file is handed back as one binary item.

    Args:
        file_name: Name of the file
        file_content: Binary data of the file
        mime_type: MIME type of the file

    Returns:
        List of Excel-Content objects with metadata.is_text flag
    """
    is_xlsx = mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    file_extension = "xlsx" if is_xlsx else "xls"
    contents = []

    # Characters that are replaced by "_" in the sheet part of an item name
    unsafe_name_chars = str.maketrans({" ": "_", "/": "_", "\\": "_"})

    def _quoted_field(value):
        # Empty cells become "", everything else is stringified with quotes doubled
        text = "" if value is None else str(value).replace('"', '""')
        return f'"{text}"'

    try:
        _load_office_extractor()
        if not office_extractor_loaded:
            logger.warning("Excel extraction not possible: Libraries not available")
        elif not is_xlsx:
            logger.warning("Extraction from old Excel format (XLS) not supported")
        else:
            with io.BytesIO(file_content) as xlsx_stream:
                workbook = openpyxl.load_workbook(xlsx_stream, data_only=True)

                # One CSV item per worksheet that has content
                for sheet_name in workbook.sheetnames:
                    sheet = workbook[sheet_name]
                    csv_lines = [
                        ','.join([_quoted_field(cell.value) for cell in row])
                        for row in sheet.iter_rows()
                    ]
                    csv_content = "\n".join(csv_lines)
                    if not csv_content.strip():
                        continue

                    position = len(contents) + 1
                    sheet_safe_name = sheet_name.translate(unsafe_name_chars)
                    contents.append({
                        "sequence_nr": position,
                        "name": f"{position}_csv_{sheet_safe_name}",  # Simplified naming with sheet label
                        "ext": "csv",
                        "content_type": "csv",
                        "data": csv_content,
                        "metadata": {
                            "is_text": True,
                            "source": "xlsx",
                            "sheet": sheet_name,
                            "format": "csv"
                        }
                    })

    except Exception as e:
        logger.error(f"Error in Excel extraction: {str(e)}")

    # Without any extracted sheet the original document is returned
    if not contents:
        contents.append({
            "sequence_nr": 1,
            "name": "1_excel",  # Simplified naming
            "ext": file_extension,
            "content_type": mime_type,
            "data": file_content,
            "metadata": {
                "is_text": False,
                "format": "excel"
            }
        })

    return contents

def extract_powerpoint_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Wraps a PowerPoint presentation as a single binary content item.

    No slide content is extracted; the file is passed through unchanged.

    Args:
        file_name: Name of the file
        file_content: Binary data of the file
        mime_type: MIME type of the file

    Returns:
        List with one PowerPoint-Content object with metadata.is_text = False
    """
    # The PPTX MIME type maps to "pptx", every other type is treated as legacy "ppt"
    if mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
        file_extension = "pptx"
    else:
        file_extension = "ppt"

    presentation = {
        "sequence_nr": 1,
        "name": "1_powerpoint",  # Simplified naming
        "ext": file_extension,
        "content_type": mime_type,
        "data": file_content,
        "metadata": {"is_text": False, "format": "powerpoint"},
    }
    return [presentation]

def extract_binary_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Wraps a file of unknown type as a single binary content item.

    Args:
        file_name: Name of the file
        file_content: Binary data of the file
        mime_type: MIME type of the file

    Returns:
        List with one binary Content object with metadata.is_text = False
    """
    # Extension without the leading dot; "bin" when the name has no suffix at all
    _, suffix = os.path.splitext(file_name)
    if suffix:
        file_extension = suffix[1:]
    else:
        file_extension = "bin"

    binary_item = {
        "sequence_nr": 1,
        "name": "1_binary",  # Simplified naming
        "ext": file_extension,
        "content_type": mime_type,
        "data": file_content,
        "metadata": {"is_text": False, "format": "binary"},
    }
    return [binary_item]