from typing import Dict, Any, List, Optional, Union, Tuple, TypedDict, Callable
import logging
import json
import os
import io
import base64
from datetime import datetime, UTC
from pathlib import Path
import mimetypes
import hashlib
import shutil
import re
import uuid

from modules.interfaces.serviceChatModel import (
    DocumentContext,
    DocumentExtraction,
    DocumentMetadata,
    DocumentContent,
    ProcessedDocument,
    ImageData
)

logger = logging.getLogger(__name__)

# Optional imports - only loaded when needed
pdfExtractorLoaded = False
officeExtractorLoaded = False
imageProcessorLoaded = False


class FileProcessingError(Exception):
    """Custom exception for file processing errors."""
    pass


class DocumentProcessor:
    """Processes documents with context awareness"""

    def __init__(self):
        self.supported_types = {
            "text/plain": self._process_text,
            "text/csv": self._process_csv,
            "application/json": self._process_json,
            "text/html": self._process_html,
            "image/svg+xml": self._process_svg,
            "application/pdf": self._process_pdf,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": self._process_docx,
            "application/msword": self._process_docx,
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": self._process_xlsx,
            "application/vnd.ms-excel": self._process_xlsx,
            "application/vnd.openxmlformats-officedocument.presentationml.presentation": self._process_pptx,
            "application/vnd.ms-powerpoint": self._process_pptx
        }

        # Add image types
        for img_type in ["image/jpeg", "image/png", "image/gif", "image/bmp", "image/tiff"]:
            self.supported_types[img_type] = self._process_image

    def _load_pdf_extractor(self):
        """Loads PDF extraction libraries when needed"""
        global pdfExtractorLoaded
        if not pdfExtractorLoaded:
            try:
                global PyPDF2, fitz
                import PyPDF2
                import fitz  # PyMuPDF for more extensive PDF processing
                pdfExtractorLoaded = True
                logger.info("PDF extraction libraries successfully loaded")
            except ImportError as e:
                logger.warning(f"PDF extraction libraries could not be loaded: {e}")

    def _load_office_extractor(self):
        """Loads Office document extraction libraries when needed"""
        global officeExtractorLoaded
        if not officeExtractorLoaded:
            try:
                global docx, openpyxl
                import docx  # python-docx for Word documents
                import openpyxl  # for Excel files
                officeExtractorLoaded = True
                logger.info("Office extraction libraries successfully loaded")
            except ImportError as e:
                logger.warning(f"Office extraction libraries could not be loaded: {e}")

    def _load_image_processor(self):
        """Loads image processing libraries when needed"""
        global imageProcessorLoaded
        if not imageProcessorLoaded:
            try:
                global Image
                from PIL import Image  # Pillow
                imageProcessorLoaded = True
                logger.info("Image processing libraries successfully loaded")
            except ImportError as e:
                logger.warning(f"Image processing libraries could not be loaded: {e}")
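
    # The three loaders above pull in optional third-party dependencies on
    # first use; when an import fails, the matching processors fall back to
    # metadata-only output. The import names typically correspond to these
    # distributions (assumed packaging, verify against your own environment):
    #   PyPDF2    -> pip install PyPDF2
    #   fitz      -> pip install PyMuPDF
    #   docx      -> pip install python-docx
    #   openpyxl  -> pip install openpyxl
    #   PIL.Image -> pip install Pillow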

    def process_with_context(self, doc: Dict[str, Any], context: DocumentContext) -> ProcessedDocument:
        """Process document with context"""
        try:
            # Get content type
            content_type = doc.get("contentType", "text/plain")
            if content_type == "application/octet-stream":
                # Try to detect actual file type
                content_type = self._detect_content_type(doc)

            if content_type not in self.supported_types:
                # Fallback to binary processing
                return self._process_binary(doc, context)

            # Process document
            processor = self.supported_types[content_type]
            extracted = processor(doc, context)

            # Track extraction
            self._track_extraction(doc, extracted, context)

            # Create ProcessedDocument
            return ProcessedDocument(
                id=doc.get("id", str(uuid.uuid4())),
                name=doc.get("name", "Unknown"),
                contentType=content_type,
                content=extracted,
                context=context
            )

        except Exception as e:
            logger.error(f"Error processing document: {str(e)}")
            # Fallback to binary processing
            return self._process_binary(doc, context)
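
    # Expected input shape, inferred from the .get() calls above (a sketch,
    # not an authoritative schema):
    #   doc = {
    #       "id": "<optional id>",
    #       "name": "report.pdf",
    #       "contentType": "application/pdf",
    #       "content": b"...",  # str, bytes, or parsed JSON depending on type
    #   }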

    def _detect_content_type(self, doc: Dict[str, Any]) -> str:
        """Detect content type from the file name or content"""
        try:
            # Check file extension first
            file_name = doc.get("name", "")
            ext = os.path.splitext(file_name)[1].lower()
            if ext:
                # Map common extensions to MIME types
                ext_to_mime = {
                    '.txt': 'text/plain',
                    '.md': 'text/markdown',
                    '.csv': 'text/csv',
                    '.json': 'application/json',
                    '.xml': 'application/xml',
                    '.js': 'application/javascript',
                    '.py': 'application/x-python',
                    '.svg': 'image/svg+xml',
                    '.jpg': 'image/jpeg',
                    '.jpeg': 'image/jpeg',
                    '.png': 'image/png',
                    '.gif': 'image/gif',
                    '.pdf': 'application/pdf',
                    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                    '.doc': 'application/msword',
                    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                    '.xls': 'application/vnd.ms-excel',
                    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                    '.ppt': 'application/vnd.ms-powerpoint'
                }
                if ext in ext_to_mime:
                    return ext_to_mime[ext]

            # Try to detect whether the payload is text
            content = doc.get("content", "")
            if isinstance(content, str):
                return 'text/plain'
            if isinstance(content, bytes):
                try:
                    content.decode('utf-8')
                    return 'text/plain'
                except UnicodeDecodeError:
                    pass

            return 'application/octet-stream'

        except Exception as e:
            logger.error(f"Error detecting content type: {str(e)}")
            return 'application/octet-stream'
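
    # Illustrative behaviour: a doc named "report.pdf" uploaded as
    # "application/octet-stream" is remapped to "application/pdf" via the
    # extension table; unnamed, undecodable bytes stay "application/octet-stream".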

    def _process_text(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process text document"""
        content = doc.get("content", "")
        if isinstance(content, bytes):
            try:
                content = content.decode('utf-8')
            except UnicodeDecodeError:
                # Try alternative encodings
                for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
                    try:
                        content = content.decode(encoding)
                        break
                    except UnicodeDecodeError:
                        continue

        sections = self._extract_sections(content)
        return DocumentContent(
            text=content,
            metadata=DocumentMetadata(
                type="text",
                format="text",
                size=len(content.encode('utf-8')),
                sections=sections
            )
        )

    def _process_csv(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process CSV document"""
        content = doc.get("content", "")
        if isinstance(content, bytes):
            content = content.decode('utf-8')

        return DocumentContent(
            text=content,
            metadata=DocumentMetadata(
                type="csv",
                format="csv",
                size=len(content.encode('utf-8')),
                sections=[f"Row {i+1}" for i in range(len(content.splitlines()))]
            )
        )

    def _process_json(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process JSON document"""
        content = doc.get("content", {})
        if isinstance(content, str):
            content = json.loads(content)
        elif isinstance(content, bytes):
            content = json.loads(content.decode('utf-8'))

        structure = self._analyze_structure(content)
        return DocumentContent(
            data=content,
            metadata=DocumentMetadata(
                type="json",
                format="json",
                size=len(json.dumps(content).encode('utf-8')),
                sections=list(content.keys()) if isinstance(content, dict) else []
            )
        )

    def _process_html(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process HTML document"""
        content = doc.get("content", "")
        if isinstance(content, bytes):
            content = content.decode('utf-8')

        return DocumentContent(
            text=content,
            metadata=DocumentMetadata(
                type="html",
                format="html",
                size=len(content.encode('utf-8')),
                sections=[
                    self._extract_title(content) or "Untitled",
                    *self._extract_links(content),
                    *self._extract_images(content)
                ]
            )
        )

    def _process_svg(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process SVG document"""
        content = doc.get("content", "")
        if isinstance(content, bytes):
            content = content.decode('utf-8')

        # Check if it's actually SVG
        is_svg = "<svg" in content.lower()

        return DocumentContent(
            text=content if is_svg else None,
            metadata=DocumentMetadata(
                type="svg",
                format="svg",
                size=len(content.encode('utf-8')),
                error=None if is_svg else "Invalid SVG content"
            )
        )

    def _process_image(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process image document"""
        content = doc.get("content", b"")
        if not isinstance(content, bytes):
            try:
                content = content.encode('utf-8')
            except Exception as e:
                logger.error(f"Error encoding image content: {str(e)}")
                return DocumentContent(
                    metadata=DocumentMetadata(
                        type="image",
                        format="unknown",
                        size=0,
                        error=f"Invalid image content: {str(e)}"
                    )
                )

        metadata = DocumentMetadata(
            type="image",
            format=doc.get("contentType", "").split("/")[-1],
            size=len(content)
        )

        try:
            self._load_image_processor()
            if imageProcessorLoaded:
                with io.BytesIO(content) as img_stream:
                    img = Image.open(img_stream)
                    img.verify()
                    # verify() leaves the image unusable, so reopen it
                    img_stream.seek(0)
                    img = Image.open(img_stream)
                    metadata.pages = 1
                    if hasattr(img, '_getexif') and callable(img._getexif):
                        exif = img._getexif()
                        if exif:
                            metadata.sections = [f"EXIF_{tag_id}" for tag_id in exif.keys()]
        except Exception as e:
            logger.warning(f"Error processing image: {str(e)}")
            metadata.error = str(e)

        try:
            image_data = ImageData(
                data=base64.b64encode(content).decode('utf-8'),
                format=metadata.format,
                page=None,
                index=None
            )
        except Exception as e:
            logger.error(f"Error creating image data: {str(e)}")
            return DocumentContent(
                metadata=DocumentMetadata(
                    type="image",
                    format=metadata.format,
                    size=len(content),
                    error=f"Error creating image data: {str(e)}"
                )
            )

        return DocumentContent(
            images=[image_data],
            metadata=metadata
        )

    def _process_pdf(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process PDF document"""
        content = doc.get("content", b"")
        if not isinstance(content, bytes):
            content = content.encode('utf-8')

        metadata = DocumentMetadata(
            type="pdf",
            format="pdf",
            size=len(content)
        )

        text_content = ""
        images: List[ImageData] = []

        try:
            self._load_pdf_extractor()
            if pdfExtractorLoaded:
                with io.BytesIO(content) as pdf_stream:
                    # Extract text with PyPDF2
                    pdf_reader = PyPDF2.PdfReader(pdf_stream)
                    metadata.pages = len(pdf_reader.pages)

                    # Extract text from all pages
                    for page_num in range(len(pdf_reader.pages)):
                        page = pdf_reader.pages[page_num]
                        page_text = page.extract_text()
                        if page_text:
                            text_content += f"--- Page {page_num + 1} ---\n{page_text}\n\n"

                    # Extract images with PyMuPDF (avoid shadowing the `doc` argument)
                    pdf_stream.seek(0)
                    pdf_doc = fitz.open(stream=pdf_stream, filetype="pdf")
                    for page_num in range(len(pdf_doc)):
                        page = pdf_doc[page_num]
                        for img_index, img_info in enumerate(page.get_images(full=True)):
                            try:
                                xref = img_info[0]
                                base_image = pdf_doc.extract_image(xref)
                                if base_image:
                                    image_bytes = base_image.get("image", b"")
                                    image_ext = base_image.get("ext", "png")

                                    if image_bytes:
                                        image_data = ImageData(
                                            data=base64.b64encode(image_bytes).decode('utf-8'),
                                            format=image_ext,
                                            page=page_num + 1,
                                            index=img_index
                                        )
                                        images.append(image_data)
                            except Exception as img_e:
                                logger.warning(f"Error extracting image {img_index} on page {page_num + 1}: {str(img_e)}")

                    pdf_doc.close()

        except Exception as e:
            logger.error(f"Error processing PDF: {str(e)}")
            metadata.error = str(e)

        return DocumentContent(
            text=text_content,
            images=images,
            metadata=metadata
        )

    def _process_docx(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process Word document"""
        content = doc.get("content", b"")
        if not isinstance(content, bytes):
            content = content.encode('utf-8')

        metadata = DocumentMetadata(
            type="docx",
            format="docx",
            size=len(content)
        )

        text_content = ""

        try:
            self._load_office_extractor()
            if officeExtractorLoaded:
                with io.BytesIO(content) as docx_stream:
                    # Avoid shadowing the `doc` argument
                    word_doc = docx.Document(docx_stream)

                    # Extract text
                    full_text = []
                    for para in word_doc.paragraphs:
                        full_text.append(para.text)

                    # Extract tables
                    for table in word_doc.tables:
                        for row in table.rows:
                            row_text = []
                            for cell in row.cells:
                                row_text.append(cell.text)
                            full_text.append(" | ".join(row_text))

                    text_content = "\n\n".join(full_text)
                    # python-docx exposes no page count; use the paragraph
                    # count as an approximation
                    metadata.pages = len(word_doc.paragraphs)
                    metadata.sections = [f"Paragraph {i+1}" for i in range(len(word_doc.paragraphs))]

        except Exception as e:
            logger.error(f"Error processing Word document: {str(e)}")
            metadata.error = str(e)

        return DocumentContent(
            text=text_content,
            metadata=metadata
        )

    def _process_xlsx(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process Excel document"""
        content = doc.get("content", b"")
        if not isinstance(content, bytes):
            content = content.encode('utf-8')

        metadata = DocumentMetadata(
            type="xlsx",
            format="xlsx",
            size=len(content)
        )

        sheets_data = []

        try:
            self._load_office_extractor()
            if officeExtractorLoaded:
                with io.BytesIO(content) as xlsx_stream:
                    workbook = openpyxl.load_workbook(xlsx_stream, data_only=True)
                    metadata.pages = len(workbook.sheetnames)

                    for sheet_name in workbook.sheetnames:
                        sheet = workbook[sheet_name]
                        csv_rows = []
                        for row in sheet.iter_rows():
                            csv_row = []
                            for cell in row:
                                value = cell.value
                                if value is None:
                                    csv_row.append("")
                                else:
                                    csv_row.append(str(value).replace('"', '""'))
                            csv_rows.append(','.join(f'"{cell}"' for cell in csv_row))

                        sheets_data.append({
                            "name": sheet_name,
                            "data": "\n".join(csv_rows)
                        })

                    metadata.sections = workbook.sheetnames

        except Exception as e:
            logger.error(f"Error processing Excel document: {str(e)}")
            metadata.error = str(e)

        return DocumentContent(
            data={"sheets": sheets_data},
            metadata=metadata
        )
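
    # Illustrative shape of the resulting payload (one CSV-style blob per sheet):
    #   {"sheets": [{"name": "Sheet1", "data": '"Name","Qty"\n"Bolt","4"'}]}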

    def _process_pptx(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process PowerPoint document"""
        return DocumentContent(
            metadata=DocumentMetadata(
                type="pptx",
                format="pptx",
                size=len(doc.get("content", b"")),
                error="PowerPoint processing not implemented"
            )
        )

    def _process_binary(self, doc: Dict[str, Any], context: DocumentContext) -> ProcessedDocument:
        """Process binary document"""
        content = doc.get("content", b"")
        if isinstance(content, str):
            content = content.encode('utf-8')
        elif not isinstance(content, bytes):
            # Last-resort fallback for structured content
            content = json.dumps(content, default=str).encode('utf-8')

        return ProcessedDocument(
            id=doc.get("id", str(uuid.uuid4())),
            name=doc.get("name", "Unknown"),
            contentType="application/octet-stream",
            content=DocumentContent(
                data={"binary": base64.b64encode(content).decode('utf-8')},
                metadata=DocumentMetadata(
                    type="binary",
                    format="binary",
                    size=len(content)
                )
            ),
            context=context
        )

    def _extract_sections(self, content: str) -> List[str]:
        """Extract sections from text content"""
        sections = []
        current_section = []

        for line in content.split("\n"):
            if line.startswith("#"):
                if current_section:
                    sections.append("\n".join(current_section))
                    current_section = []
            current_section.append(line)

        if current_section:
            sections.append("\n".join(current_section))

        return sections
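
    # Illustrative example: "# Intro\nhello\n# Usage\nworld" yields two
    # sections, "# Intro\nhello" and "# Usage\nworld". Any text before the
    # first heading becomes its own leading section.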

    def _analyze_structure(self, content: Any) -> Dict[str, Any]:
        """Analyze structure of JSON content"""
        if isinstance(content, dict):
            return {
                "type": "object",
                "properties": {
                    k: self._analyze_structure(v)
                    for k, v in content.items()
                }
            }
        elif isinstance(content, list):
            return {
                "type": "array",
                "items": self._analyze_structure(content[0]) if content else {}
            }
        else:
            return {
                "type": type(content).__name__
            }
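
    # Illustrative example:
    #   _analyze_structure({"a": 1, "tags": ["x"]})
    #   -> {"type": "object",
    #       "properties": {"a": {"type": "int"},
    #                      "tags": {"type": "array", "items": {"type": "str"}}}}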

    def _extract_title(self, html: str) -> Optional[str]:
        """Extract title from HTML"""
        match = re.search(r"<title>(.*?)</title>", html, re.IGNORECASE)
        return match.group(1) if match else None

    def _extract_links(self, html: str) -> List[str]:
        """Extract links from HTML"""
        return re.findall(r'href=[\'"]?([^\'" >]+)', html)

    def _extract_images(self, html: str) -> List[str]:
        """Extract images from HTML"""
        return re.findall(r'src=[\'"]?([^\'" >]+)', html)
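
    # Illustrative examples:
    #   _extract_title("<title>Report</title>")                -> "Report"
    #   _extract_links('<a href="https://example.com">go</a>') -> ["https://example.com"]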

    def _track_extraction(self, doc: Dict[str, Any], extracted: DocumentContent, context: DocumentContext) -> None:
        """Track document extraction"""
        extraction = DocumentExtraction(
            timestamp=datetime.now(UTC).isoformat(),
            type=doc.get("contentType", "unknown"),
            sections=extracted.metadata.sections or [],
            metadata=extracted.metadata.dict()
        )
        context.extractionHistory.append(extraction)

    def get_supported_types(self) -> List[str]:
        """Get list of supported content types"""
        return list(self.supported_types.keys())

    def add_processor(self, content_type: str, processor: Callable[..., DocumentContent]) -> None:
        """Add new document processor"""
        self.supported_types[content_type] = processor

    def remove_processor(self, content_type: str) -> None:
        """Remove document processor"""
        self.supported_types.pop(content_type, None)
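

# Minimal usage sketch (illustrative only). It assumes DocumentContext can be
# constructed with an empty extractionHistory list; the real constructor lives
# in modules.interfaces.serviceChatModel and may require other fields.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    processor = DocumentProcessor()
    print("Supported types:", processor.get_supported_types())

    # Hypothetical context object; adjust to the actual model definition.
    context = DocumentContext(extractionHistory=[])

    sample_doc = {
        "id": "demo-1",
        "name": "notes.txt",
        "contentType": "text/plain",
        "content": b"# Heading\nSome body text\n",
    }
    result = processor.process_with_context(sample_doc, context)
    print(result.contentType, result.content.metadata.size)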