# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Folder extractor -- treats a local folder reference as a container.

Not registered in the MIME-based ExtractorRegistry (folders have no MIME type).
Instead, called directly by agent tools (browseContainer) when handling
folder references. Applies the same safety limits as ContainerExtractor.
"""

from typing import Any, Dict, List
import base64
import logging
import mimetypes
from pathlib import Path

from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelContent import ContainerLimitError, ContentContextRef
from ..subRegistry import Extractor

logger = logging.getLogger(__name__)

# Safety limits for a single folder walk.
MAX_TOTAL_EXTRACTED_SIZE = 500 * 1024 * 1024  # cumulative on-disk size, in bytes
MAX_FILE_COUNT = 10000  # number of regular files visited
MAX_DEPTH = 5  # deepest allowed nesting level; the root folder is depth 0


class FolderExtractor(Extractor):
    """Extracts contents from a local folder path.

    Unlike other extractors, this does not receive fileBytes.
    Instead it receives a folder path via context["folderPath"] and walks the directory.
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Always return False: folders are never matched by name, MIME type or content."""
        return False

    def getSupportedExtensions(self) -> list[str]:
        """Return an empty list: no file extension maps to this extractor."""
        return []

    def getSupportedMimeTypes(self) -> list[str]:
        """Return an empty list: no MIME type maps to this extractor."""
        return []

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Extract folder contents.

        Args:
            fileBytes: Unused; present only to satisfy the Extractor interface.
            context: Must contain ``folderPath`` (str), the path of the folder
                to walk.

        Returns:
            A list whose first element is the root container part, followed by
            one part per sub-folder and the parts produced for each file.
            Returns an empty list when ``folderPath`` is missing/empty or does
            not point to a directory. When a safety limit is hit, the parts
            collected so far are returned with a trailing ``limit_exceeded``
            text part describing the limit.
        """
        folderPath = context.get("folderPath", "")
        if not folderPath:
            return []

        folder = Path(folderPath)
        if not folder.is_dir():
            logger.error(f"FolderExtractor: not a directory: {folderPath}")
            return []

        rootId = makeId()
        parts: List[ContentPart] = [
            ContentPart(
                id=rootId,
                parentId=None,
                label=folder.name or "folder",
                typeGroup="container",
                mimeType="inode/directory",
                data="",
                metadata={"folderPath": str(folder), "containerType": "folder"},
            )
        ]

        # Running totals shared by every recursion level of the walk.
        state = {"totalSize": 0, "fileCount": 0}

        try:
            _walkFolder(folder, rootId, "", 0, state, parts)
        except ContainerLimitError as e:
            # Keep what was collected and tell the consumer why the walk stopped.
            logger.warning(f"Folder extraction limit reached: {e}")
            parts.append(ContentPart(
                id=makeId(),
                parentId=rootId,
                label="limit_exceeded",
                typeGroup="text",
                mimeType="text/plain",
                data=str(e),
                metadata={"warning": "Folder extraction limit exceeded"},
            ))

        return parts


def _walkFolder(
    folder: Path,
    parentId: str,
    containerPath: str,
    depth: int,
    state: Dict[str, int],
    parts: List[ContentPart],
) -> None:
    """Recursively append the contents of ``folder`` to ``parts``.

    Sub-folders are listed before files, each group ordered case-insensitively
    by name. Symlinks are skipped. Every regular file is handed to the
    extractor resolved for its guessed MIME type; when no extractor applies or
    the extractor fails, the file is added as a base64-encoded binary part.

    Args:
        folder: Directory to list.
        parentId: Id of the container part representing ``folder``.
        containerPath: Slash-separated path of ``folder`` relative to the walk
            root ("" for the root itself).
        depth: Nesting level of ``folder``; the root is 0.
        state: Mutable running totals with keys ``totalSize`` and ``fileCount``.
        parts: Output list, extended in place.

    Raises:
        ContainerLimitError: If MAX_DEPTH, MAX_TOTAL_EXTRACTED_SIZE or
            MAX_FILE_COUNT is exceeded.
    """
    if depth > MAX_DEPTH:
        raise ContainerLimitError(f"Max folder depth {MAX_DEPTH} exceeded")

    try:
        entries = sorted(folder.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower()))
    except PermissionError:
        logger.warning(f"Permission denied: {folder}")
        return
    except OSError as e:
        # E.g. the directory vanished or an I/O error occurred while listing;
        # skip this folder instead of aborting the whole extraction.
        logger.warning(f"Cannot read folder {folder}: {e}")
        return

    # Created lazily on the first file and reused for the rest of this folder.
    registry = None

    for entry in entries:
        if entry.is_symlink():
            logger.debug(f"Skipping symlink: {entry}")
            continue

        entryPath = f"{containerPath}/{entry.name}" if containerPath else entry.name

        if entry.is_dir():
            folderId = makeId()
            parts.append(ContentPart(
                id=folderId,
                parentId=parentId,
                label=entry.name,
                typeGroup="container",
                mimeType="inode/directory",
                data="",
                metadata={"containerPath": entryPath, "containerType": "folder"},
            ))
            _walkFolder(entry, folderId, entryPath, depth + 1, state, parts)
            continue

        if not entry.is_file():
            # Neither directory nor regular file (socket, device, ...): ignore.
            continue

        try:
            fileSize = entry.stat().st_size
        except OSError:
            continue

        # Enforce the limits from stat() data before any file content is read.
        state["totalSize"] += fileSize
        state["fileCount"] += 1
        if state["totalSize"] > MAX_TOTAL_EXTRACTED_SIZE:
            raise ContainerLimitError(f"Total extracted size exceeds {MAX_TOTAL_EXTRACTED_SIZE // (1024 * 1024)} MB")
        if state["fileCount"] > MAX_FILE_COUNT:
            raise ContainerLimitError(f"File count exceeds {MAX_FILE_COUNT}")

        guessedMime, _ = mimetypes.guess_type(entry.name)
        detectedMime = guessedMime or "application/octet-stream"

        if registry is None:
            # Imported here rather than at module level, as in the original code.
            from ..subRegistry import ExtractorRegistry
            registry = ExtractorRegistry()
        extractor = registry.resolve(detectedMime, entry.name)

        # Bytes read for the type-specific extractor, kept for the fallback.
        fileData = None

        if extractor and not isinstance(extractor, FolderExtractor):
            try:
                fileData = entry.read_bytes()
                childParts = extractor.extract(fileData, {"fileName": entry.name, "mimeType": detectedMime})
                # NOTE(review): every returned part is re-parented to this folder
                # and tagged with this file's path, which would flatten any
                # hierarchy the extractor produced -- confirm this is intended.
                for part in childParts:
                    part.parentId = parentId
                    if not part.metadata:
                        part.metadata = {}
                    part.metadata["containerPath"] = entryPath
                parts.extend(childParts)
                continue
            except Exception as e:
                # Best effort: fall through to the raw binary representation.
                logger.warning(f"Type-extractor failed for {entry.name}: {e}")

        try:
            if fileData is None:
                fileData = entry.read_bytes()
            encodedData = base64.b64encode(fileData).decode("utf-8")
        except Exception as e:
            # Still emit a part for the file so it remains visible, just without data.
            logger.warning(f"Could not read {entry.name} for binary fallback: {e}")
            encodedData = ""

        parts.append(ContentPart(
            id=makeId(),
            parentId=parentId,
            label=entry.name,
            typeGroup="binary",
            mimeType=detectedMime,
            data=encodedData,
            metadata={
                "size": fileSize,
                "containerPath": entryPath,
                "contextRef": ContentContextRef(
                    containerPath=entryPath,
                    location="file",
                ).model_dump(),
            },
        ))