Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding limits on file uploads #475

Merged
merged 5 commits into from
Oct 28, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions pydatalab/pydatalab/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,15 @@ class ServerConfig(BaseSettings):
None, description="A dictionary containing metadata to serve at `/info`."
)

MAX_CONTENT_LENGTH: int = Field(
100 * 1000 * 1000,
description=r"""Direct mapping to the equivalent Flask setting. In practice, limits the file size that can be uploaded.
Defaults to 100 MB (100 × 10⁶ bytes) to avoid filling the tmp directory of a server.

Warning: this value will overwrite any other values passed to `FLASK_MAX_CONTENT_LENGTH` but is included here to clarify
its importance when deploying a datalab instance.""",
)

@root_validator
def validate_cache_ages(cls, values):
if values.get("REMOTE_CACHE_MIN_AGE") > values.get("REMOTE_CACHE_MAX_AGE"):
Expand Down
93 changes: 69 additions & 24 deletions pydatalab/pydatalab/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,35 @@

from bson.objectid import ObjectId
from pymongo import ReturnDocument
from werkzeug.datastructures import FileStorage
from werkzeug.utils import secure_filename

from pydatalab.config import CONFIG, RemoteFilesystem
from pydatalab.logger import LOGGER, logged_route
from pydatalab.models import File
from pydatalab.models.utils import PyObjectId
from pydatalab.mongo import flask_mongo
from pydatalab.mongo import _get_active_mongo_client, flask_mongo
from pydatalab.permissions import get_default_permissions

FILE_DIRECTORY = CONFIG.FILE_DIRECTORY
DIRECTORIES_DICT = {fs.name: fs for fs in CONFIG.REMOTE_FILESYSTEMS}
LIVE_FILE_CUTOFF = datetime.timedelta(days=31)


def get_space_available_bytes() -> int:
    """For the configured file location, return the number of available bytes,
    as ascertained from the filesystem fundamental block size and the block
    count available to unprivileged users (via the Unix-specific ``statvfs``
    system call).

    Returns:
        The number of bytes available for new files in ``CONFIG.FILE_DIRECTORY``.

    Raises:
        RuntimeError: If the configured file directory does not exist, i.e.,
            was never initialised by the server.

    """
    try:
        stats = os.statvfs(CONFIG.FILE_DIRECTORY)
    except FileNotFoundError as exc:
        # Chain the original error so the missing-path cause is preserved.
        raise RuntimeError(f"{CONFIG.FILE_DIRECTORY=} was not safely initialised.") from exc

    # POSIX defines the block counts returned by statvfs (f_blocks, f_bfree,
    # f_bavail) in units of the *fundamental* block size f_frsize, not f_bsize;
    # on filesystems where the two differ, multiplying by f_bsize mis-reports
    # the available space. f_bavail (rather than f_bfree) is used so that the
    # reservation for privileged users is excluded.
    return stats.f_frsize * stats.f_bavail


def _escape_spaces_scp_path(remote_path: str) -> str:
r"""Takes a remote path prefixed by 'ssh://' and encloses
the filename in quotes and escapes spaces to allow for
Expand Down Expand Up @@ -57,7 +72,7 @@ def _sync_file_with_remote(remote_path: str, src: str) -> None:
remote_path = _escape_spaces_scp_path(remote_path)
scp_command = f"scp {re.sub('^ssh://', '', remote_path)} {src}"

os.makedirs(pathlib.Path(src).parent, exist_ok=True)
pathlib.Path(src).parent.mkdir(parents=False, exist_ok=True)

LOGGER.debug("Syncing file with '%s'", scp_command)
proc = subprocess.Popen(
Expand Down Expand Up @@ -291,37 +306,55 @@ def update_uploaded_file(file, file_id, last_modified=None, size_bytes=None):

@logged_route
def save_uploaded_file(
file,
item_ids=None,
block_ids=None,
last_modified=None,
file: FileStorage,
item_ids: list[str] | None = None,
block_ids: list[str] | None = None,
last_modified: datetime.datetime | str | None = None,
size_bytes: int | None = None,
creator_ids: list[PyObjectId | str] | None = None,
):
"""file is a file object from a flask request.
last_modified should be an isodate format. if last_modified is None, the current time will be inserted
) -> dict:
"""Attempt to save a copy of the file object from the request in the file store, and
add its metadata to the database.

Parameters:
file: The flask file object in the request.
item_ids: The item IDs to attempt to attach the file to.
block_ids: The block IDs to attempt to attach the file to.
last_modified: An isoformat datetime for to track as the last time the filed was modified
(otherwise use the current datetime).
size_bytes: A hint for the file size in bytes, will be used to verify ahead of time whether
the file can be saved.
creator_ids: A list of IDs for users who will be registered as the creator of this file,
i.e., retaining write access.

Returns:
A dictionary containing the saved metadata for the file.

"""

from pydatalab.permissions import get_default_permissions

sample_collection = flask_mongo.db.items
file_collection = flask_mongo.db.files

# validate item_ids
if not item_ids:
item_ids = []
if not block_ids:
block_ids = []

for item_id in item_ids:
if not sample_collection.find_one(
if not flask_mongo.db.items.find_one(
{"item_id": item_id, **get_default_permissions(user_only=True)}
):
raise ValueError(f"item_id is invalid: {item_id}")

if file.filename is None:
raise RuntimeError("Filename is missing.")

filename = secure_filename(file.filename)
extension = os.path.splitext(filename)[1]

if isinstance(last_modified, datetime.datetime):
last_modified = last_modified.isoformat()

if not last_modified:
last_modified = datetime.datetime.now().isoformat()

Expand All @@ -347,18 +380,30 @@ def save_uploaded_file(
creator_ids=creator_ids if creator_ids else [],
)

result = file_collection.insert_one(new_file_document.dict())
if not result.acknowledged:
raise IOError(f"db operation failed when trying to insert new file. Result: {result}")
# In one transaction, check if we can save the file, insert it into the database
# and save it, then release the lock
client = _get_active_mongo_client()
with client.start_session(causal_consistency=True) as session:
space = get_space_available_bytes()
if size_bytes is not None and space < size_bytes:
raise RuntimeError(
f"Cannot store file: insufficient space available on disk (required: {size_bytes // 1024 ** 3} GB). Please contact your datalab administrator."
)
file_collection = client.get_database().files
result = file_collection.insert_one(new_file_document.dict(), session=session)
if not result.acknowledged:
raise RuntimeError(
f"db operation failed when trying to insert new file. Result: {result}"
)

inserted_id = result.inserted_id
inserted_id = result.inserted_id

new_directory = os.path.join(FILE_DIRECTORY, str(inserted_id))
file_location = os.path.join(new_directory, filename)
os.makedirs(new_directory)
file.save(file_location)
new_directory = os.path.join(FILE_DIRECTORY, str(inserted_id))
file_location = os.path.join(new_directory, filename)
pathlib.Path(new_directory).mkdir(exist_ok=False)
file.save(file_location)

updated_file_entry = file_collection.find_one_and_update(
updated_file_entry = flask_mongo.db.files.find_one_and_update(
{"_id": inserted_id, **get_default_permissions(user_only=False)},
{
"$set": {
Expand All @@ -373,7 +418,7 @@ def save_uploaded_file(

# update any referenced item_ids
for item_id in item_ids:
sample_update_result = sample_collection.update_one(
sample_update_result = flask_mongo.db.items.update_one(
{"item_id": item_id, **get_default_permissions(user_only=True)},
{"$push": {"file_ObjectIds": inserted_id}},
)
Expand Down Expand Up @@ -455,7 +500,7 @@ def add_file_from_remote_directory(file_entry, item_id, block_ids=None):

new_directory = os.path.join(FILE_DIRECTORY, str(inserted_id))
new_file_location = os.path.join(new_directory, filename)
os.makedirs(new_directory)
pathlib.Path(new_directory).mkdir(exist_ok=True)
_sync_file_with_remote(full_remote_path, new_file_location)

updated_file_entry = file_collection.find_one_and_update(
Expand Down
4 changes: 4 additions & 0 deletions pydatalab/pydatalab/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import logging
import pathlib
from typing import Any, Dict

from dotenv import dotenv_values
Expand Down Expand Up @@ -66,6 +67,9 @@ def create_app(config_override: Dict[str, Any] | None = None) -> Flask:

pydatalab.mongo.create_default_indices()

if CONFIG.FILE_DIRECTORY is not None:
pathlib.Path(CONFIG.FILE_DIRECTORY).mkdir(parents=False, exist_ok=True)

compress.init_app(app)

@app.route("/logout")
Expand Down
8 changes: 7 additions & 1 deletion webapp/src/file_upload.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,13 @@ import { API_URL } from "@/resources.js";

export default function setupUppy(item_id, trigger_selector, reactive_file_list) {
console.log("setupUppy called with: " + trigger_selector);
var uppy = new Uppy();
var uppy = new Uppy({
restrictions: {
// Somewhat arbitrary restrictions that prevent numbers that would break the server in one go -- the API should also refuse files when 'full'
maxTotalFileSize: 102400000000, // Set this UI restriction arbitrarily high at 100 GB for now --- this is the point at which I would be unsure if the upload could even complete
maxNumberOfFiles: 10000, // Similarly, a max of 10000 files in one upload as a single "File" entry feels reasonable, once we move to uploading folders etc.
},
});
let headers = construct_headers();
uppy
.use(Dashboard, {
Expand Down
Loading