184 lines
6.1 KiB
Python
184 lines
6.1 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""Folder extractor -- treats a local folder reference as a container.
|
|
|
|
Not registered in the MIME-based ExtractorRegistry (folders have no MIME type).
|
|
Instead, called directly by agent tools (browseContainer) when handling folder references.
|
|
|
|
Applies the same safety limits as ContainerExtractor.
|
|
"""
|
|
|
|
from typing import Any, Dict, List
|
|
import logging
|
|
import mimetypes
|
|
from pathlib import Path
|
|
|
|
from ..subUtils import makeId
|
|
from modules.datamodels.datamodelExtraction import ContentPart
|
|
from modules.datamodels.datamodelContent import ContainerLimitError, ContentContextRef
|
|
from ..subRegistry import Extractor
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
MAX_TOTAL_EXTRACTED_SIZE = 500 * 1024 * 1024
|
|
MAX_FILE_COUNT = 10000
|
|
MAX_DEPTH = 5
|
|
|
|
|
|
class FolderExtractor(Extractor):
    """Extractor that presents a directory on the local filesystem as a container.

    This extractor ignores the ``fileBytes`` argument. The directory to read
    is taken from ``context["folderPath"]`` and its content is collected by
    walking the directory tree.
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return False unconditionally -- no name, MIME type or header matches."""
        return False

    def getSupportedExtensions(self) -> list[str]:
        """Return an empty list -- no file extension is claimed."""
        return []

    def getSupportedMimeTypes(self) -> list[str]:
        """Return an empty list -- no MIME type is claimed."""
        return []

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Build the list of content parts for the folder named in ``context``.

        Args:
            fileBytes: Not used by this extractor.
            context: Must provide ``folderPath`` (str), the path of the folder
                to walk.

        Returns:
            An empty list when ``folderPath`` is missing/empty or is not a
            directory. Otherwise a list that starts with the root "container"
            part, followed by whatever the folder walk appended. If a
            ContainerLimitError stops the walk, the parts gathered so far are
            kept and a trailing "limit_exceeded" text part carries the error
            message.
        """
        requestedPath = context.get("folderPath", "")
        root = Path(requestedPath) if requestedPath else None
        if root is None:
            return []
        if not root.is_dir():
            logger.error(f"FolderExtractor: not a directory: {requestedPath}")
            return []

        # The root part is the parent of everything found directly in the folder.
        rootId = makeId()
        rootPart = ContentPart(
            id=rootId,
            parentId=None,
            label=root.name or "folder",
            typeGroup="container",
            mimeType="inode/directory",
            data="",
            metadata={"folderPath": str(root), "containerType": "folder"},
        )
        collected: List[ContentPart] = [rootPart]

        # Running totals shared by the whole recursive walk.
        counters: Dict[str, int] = dict(totalSize=0, fileCount=0)
        try:
            _walkFolder(root, rootId, "", 0, counters, collected)
        except ContainerLimitError as limitError:
            logger.warning(f"Folder extraction limit reached: {limitError}")
            limitNotice = ContentPart(
                id=makeId(),
                parentId=rootId,
                label="limit_exceeded",
                typeGroup="text",
                mimeType="text/plain",
                data=str(limitError),
                metadata={"warning": "Folder extraction limit exceeded"},
            )
            collected.append(limitNotice)

        return collected
|
|
|
|
|
|
def _walkFolder(
    folder: Path,
    parentId: str,
    containerPath: str,
    depth: int,
    state: Dict[str, int],
    parts: List[ContentPart],
) -> None:
    """Recursively append a ContentPart for every entry found below ``folder``.

    Entries are visited sub-folders first, then files, each group ordered
    case-insensitively by name. Symlinks are skipped. A sub-folder becomes a
    "container" part and is walked recursively. A regular file is passed to
    the extractor resolved by ExtractorRegistry; if no usable extractor is
    found, or it raises, the file is appended as a base64-encoded "binary"
    part instead.

    Args:
        folder: Directory to list.
        parentId: Id of the part that represents ``folder``; assigned as
            parentId to everything found directly inside it.
        containerPath: "/"-joined path of ``folder`` relative to the walk
            root ("" for the root itself).
        depth: Nesting level of ``folder``.
        state: Running totals shared across the recursion (keys
            "totalSize" and "fileCount"); updated in place.
        parts: Output list; new parts are appended in place.

    Raises:
        ContainerLimitError: If ``depth`` exceeds MAX_DEPTH, the summed file
            size exceeds MAX_TOTAL_EXTRACTED_SIZE, or the number of files
            exceeds MAX_FILE_COUNT.
    """
    import base64

    if depth > MAX_DEPTH:
        raise ContainerLimitError(f"Max folder depth {MAX_DEPTH} exceeded")

    try:
        entries = sorted(folder.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower()))
    except PermissionError:
        logger.warning(f"Permission denied: {folder}")
        return
    except OSError as e:
        # E.g. the folder disappeared while walking; skip it like an unreadable one
        # instead of aborting the whole extraction.
        logger.warning(f"Cannot list folder {folder}: {e}")
        return

    # Created on the first file and reused for the rest of this folder.
    # The import stays local, as in the original code.
    registry = None

    for entry in entries:
        if entry.is_symlink():
            logger.debug(f"Skipping symlink: {entry}")
            continue

        entryPath = f"{containerPath}/{entry.name}" if containerPath else entry.name

        if entry.is_dir():
            folderId = makeId()
            parts.append(ContentPart(
                id=folderId,
                parentId=parentId,
                label=entry.name,
                typeGroup="container",
                mimeType="inode/directory",
                data="",
                metadata={"containerPath": entryPath, "containerType": "folder"},
            ))
            _walkFolder(entry, folderId, entryPath, depth + 1, state, parts)

        elif entry.is_file():
            try:
                fileSize = entry.stat().st_size
            except OSError:
                continue

            # Limits are checked against the stat() size before any content is read.
            state["totalSize"] += fileSize
            state["fileCount"] += 1

            if state["totalSize"] > MAX_TOTAL_EXTRACTED_SIZE:
                raise ContainerLimitError(f"Total extracted size exceeds {MAX_TOTAL_EXTRACTED_SIZE // (1024 * 1024)} MB")
            if state["fileCount"] > MAX_FILE_COUNT:
                raise ContainerLimitError(f"File count exceeds {MAX_FILE_COUNT}")

            guessedMime, _ = mimetypes.guess_type(entry.name)
            detectedMime = guessedMime or "application/octet-stream"

            if registry is None:
                from ..subRegistry import ExtractorRegistry
                registry = ExtractorRegistry()
            extractor = registry.resolve(detectedMime, entry.name)

            # Read the file exactly once; both the type-extractor and the
            # base64 fallback below work on the same bytes.
            try:
                fileData = entry.read_bytes()
            except (OSError, MemoryError) as e:
                logger.warning(f"Could not read {entry}: {e}")
                fileData = None

            if fileData is not None and extractor and not isinstance(extractor, FolderExtractor):
                try:
                    childParts = extractor.extract(fileData, {"fileName": entry.name, "mimeType": detectedMime})
                    for part in childParts:
                        # Every part produced for the file is attached directly to the folder part.
                        part.parentId = parentId
                        if not part.metadata:
                            part.metadata = {}
                        part.metadata["containerPath"] = entryPath
                    parts.extend(childParts)
                    continue
                except Exception as e:
                    # Best effort: fall through to the raw binary part below.
                    logger.warning(f"Type-extractor failed for {entry.name}: {e}")

            # Fallback: raw content as base64; empty when the file could not be read.
            encodedData = base64.b64encode(fileData).decode("utf-8") if fileData else ""

            parts.append(ContentPart(
                id=makeId(),
                parentId=parentId,
                label=entry.name,
                typeGroup="binary",
                mimeType=detectedMime,
                data=encodedData,
                metadata={
                    "size": fileSize,
                    "containerPath": entryPath,
                    "contextRef": ContentContextRef(
                        containerPath=entryPath,
                        location="file",
                    ).model_dump(),
                },
            ))
|