gateway/modules/serviceCenter/services/serviceExtraction/extractors/extractorFolder.py
2026-03-15 23:38:21 +01:00

184 lines
6.1 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Folder extractor -- treats a local folder reference as a container.
Not registered in the MIME-based ExtractorRegistry (folders have no MIME type).
Instead, called directly by agent tools (browseContainer) when handling folder references.
Applies the same safety limits as ContainerExtractor.
"""
from typing import Any, Dict, List
import logging
import mimetypes
from pathlib import Path
from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelContent import ContainerLimitError, ContentContextRef
from ..subRegistry import Extractor
logger = logging.getLogger(__name__)
# Safety limits for a single folder walk. _walkFolder raises ContainerLimitError
# as soon as any one of them is exceeded.
# Cap on the summed stat() sizes of all files visited, in bytes (500 MiB).
MAX_TOTAL_EXTRACTED_SIZE = 500 * 1024 * 1024
# Cap on the number of regular files visited.
MAX_FILE_COUNT = 10000
# Deepest folder level that may be walked; the root folder is depth 0, so
# entering a folder at depth MAX_DEPTH + 1 raises.
MAX_DEPTH = 5
class FolderExtractor(Extractor):
    """Container-style extractor for a directory on the local filesystem.

    No file bytes are involved: the directory to read is named by
    context["folderPath"] and its tree is traversed by _walkFolder.
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Always report "no match", whatever name, MIME type or header is given."""
        return False

    def getSupportedExtensions(self) -> list[str]:
        """Return an empty list: this extractor claims no file extension."""
        return []

    def getSupportedMimeTypes(self) -> list[str]:
        """Return an empty list: this extractor claims no MIME type."""
        return []

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Build the list of content parts for a folder tree.

        Args:
            fileBytes: Not read by this extractor.
            context: Must carry "folderPath", the path of the folder to walk.

        Returns:
            A list starting with the root container part, followed by whatever
            _walkFolder appended. The list is empty when "folderPath" is
            missing/empty or does not name a directory (the latter is logged
            as an error). When a ContainerLimitError stops the walk, the parts
            gathered so far are returned with a trailing "limit_exceeded" text
            part carrying the error message.
        """
        folderPath = context.get("folderPath", "")
        folder = Path(folderPath) if folderPath else None
        if folder is None:
            return []
        if not folder.is_dir():
            logger.error(f"FolderExtractor: not a directory: {folderPath}")
            return []

        # The root container part comes first; every top-level entry hangs off its id.
        rootId = makeId()
        rootMetadata = {"folderPath": str(folder), "containerType": "folder"}
        rootPart = ContentPart(
            id=rootId,
            parentId=None,
            label=folder.name or "folder",
            typeGroup="container",
            mimeType="inode/directory",
            data="",
            metadata=rootMetadata,
        )
        collected: List[ContentPart] = [rootPart]

        # Running totals shared across the whole recursive walk.
        counters = {"totalSize": 0, "fileCount": 0}
        try:
            _walkFolder(folder, rootId, "", 0, counters, collected)
        except ContainerLimitError as e:
            logger.warning(f"Folder extraction limit reached: {e}")
            # Keep what was gathered and tell the consumer the listing is partial.
            limitNotice = ContentPart(
                id=makeId(),
                parentId=rootId,
                label="limit_exceeded",
                typeGroup="text",
                mimeType="text/plain",
                data=str(e),
                metadata={"warning": "Folder extraction limit exceeded"},
            )
            collected.append(limitNotice)
        return collected
def _walkFolder(
    folder: Path,
    parentId: str,
    containerPath: str,
    depth: int,
    state: Dict[str, int],
    parts: List[ContentPart],
) -> None:
    """Recursively append content parts for everything below ``folder``.

    Entries are visited directories-first, then case-insensitively by name.
    Symlinks are skipped. Each sub-directory yields a container part and is
    walked recursively. Each regular file is handed to the extractor resolved
    from its guessed MIME type; if none applies or it fails, the file is added
    as a base64-encoded "binary" part instead.

    Args:
        folder: Directory to list.
        parentId: Id of the part that represents ``folder``; new parts are
            attached to it.
        containerPath: "/"-joined path of ``folder`` relative to the walk
            root ("" for the root itself).
        depth: Nesting level of ``folder``; the root is 0.
        state: Running "totalSize" and "fileCount" counters, shared by the
            whole walk and updated in place.
        parts: Output list, extended in place.

    Raises:
        ContainerLimitError: When MAX_DEPTH, MAX_TOTAL_EXTRACTED_SIZE or
            MAX_FILE_COUNT is exceeded. Parts appended before the limit was
            hit stay in ``parts``.
    """
    import base64

    if depth > MAX_DEPTH:
        raise ContainerLimitError(f"Max folder depth {MAX_DEPTH} exceeded")
    try:
        entries = sorted(folder.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower()))
    except PermissionError:
        logger.warning(f"Permission denied: {folder}")
        return
    except OSError as e:
        # E.g. the folder vanished mid-walk: skip it rather than abort everything.
        logger.warning(f"Cannot list folder {folder}: {e}")
        return

    # Built on first use and shared by all files of this directory, instead of
    # constructing a new registry for every single file.
    registry = None

    for entry in entries:
        if entry.is_symlink():
            logger.debug(f"Skipping symlink: {entry}")
            continue
        entryPath = f"{containerPath}/{entry.name}" if containerPath else entry.name

        if entry.is_dir():
            folderId = makeId()
            parts.append(ContentPart(
                id=folderId,
                parentId=parentId,
                label=entry.name,
                typeGroup="container",
                mimeType="inode/directory",
                data="",
                metadata={"containerPath": entryPath, "containerType": "folder"},
            ))
            _walkFolder(entry, folderId, entryPath, depth + 1, state, parts)
            continue

        if not entry.is_file():
            # Neither a directory nor a regular file: nothing to extract.
            continue

        try:
            fileSize = entry.stat().st_size
        except OSError:
            continue

        # Limits are enforced on stat() sizes before any content is read.
        state["totalSize"] += fileSize
        state["fileCount"] += 1
        if state["totalSize"] > MAX_TOTAL_EXTRACTED_SIZE:
            raise ContainerLimitError(f"Total extracted size exceeds {MAX_TOTAL_EXTRACTED_SIZE // (1024 * 1024)} MB")
        if state["fileCount"] > MAX_FILE_COUNT:
            raise ContainerLimitError(f"File count exceeds {MAX_FILE_COUNT}")

        guessedMime, _ = mimetypes.guess_type(entry.name)
        detectedMime = guessedMime or "application/octet-stream"

        if registry is None:
            from ..subRegistry import ExtractorRegistry
            registry = ExtractorRegistry()
        extractor = registry.resolve(detectedMime, entry.name)

        fileData = None
        if extractor and not isinstance(extractor, FolderExtractor):
            try:
                fileData = entry.read_bytes()
                childParts = extractor.extract(fileData, {"fileName": entry.name, "mimeType": detectedMime})
                # NOTE(review): every child is re-parented to this folder, which
                # flattens any hierarchy the extractor produced -- confirm intended.
                for part in childParts:
                    part.parentId = parentId
                    if not part.metadata:
                        part.metadata = {}
                    part.metadata["containerPath"] = entryPath
                parts.extend(childParts)
                continue
            except Exception as e:
                # Best effort: fall through to the raw binary representation.
                logger.warning(f"Type-extractor failed for {entry.name}: {e}")

        try:
            if fileData is None:
                # Only hit the disk if the extractor path did not already read the file.
                fileData = entry.read_bytes()
            encodedData = base64.b64encode(fileData).decode("utf-8")
        except Exception as e:
            # Still emit the part (with empty data) so the file shows up in the listing.
            logger.warning(f"Could not read {entry.name}: {e}")
            encodedData = ""

        parts.append(ContentPart(
            id=makeId(),
            parentId=parentId,
            label=entry.name,
            typeGroup="binary",
            mimeType=detectedMime,
            data=encodedData,
            metadata={
                "size": fileSize,
                "containerPath": entryPath,
                "contextRef": ContentContextRef(
                    containerPath=entryPath,
                    location="file",
                ).model_dump(),
            },
        ))