Skip to content

Commit

Permalink
Flesh out snapshot creation
Browse files Browse the repository at this point in the history
  • Loading branch information
ml-evs committed Nov 14, 2023
1 parent 7bf24c3 commit 5ff2ebe
Show file tree
Hide file tree
Showing 6 changed files with 1,034 additions and 1,039 deletions.
1 change: 1 addition & 0 deletions pydatalab/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ navani = {git = "git+https://github.com/the-grey-group/[email protected]"}
python-dateutil = "*"
pybaselines = "*"
rosettasciio = "*"
fabric = "*"

[dev-packages]
pytest = "*"
Expand Down
1,890 changes: 877 additions & 1,013 deletions pydatalab/Pipfile.lock

Large diffs are not rendered by default.

10 changes: 0 additions & 10 deletions pydatalab/backup.py

This file was deleted.

119 changes: 119 additions & 0 deletions pydatalab/pydatalab/backups.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import subprocess
import tarfile
import tempfile
from pathlib import Path

from pydatalab.config import CONFIG
from pydatalab.logger import LOGGER


def make_snapshot(output_path: Path, encrypt: bool = False) -> None:
    """Make a compressed snapshot of the entire datalab deployment that
    can be restored from with sufficient granularity, e.g., including
    config files.

    Creates a tar file with the following structure:
        - `./files/` - contains all files in `CONFIG.FILE_DIRECTORY`
        - `./mongodb/` - contains a dump of the mongodb database
        - `./config/` - contains a dump of the server config

    Arguments:
        output_path: Path to the .tar or .tar.gz output file, will raise
            a `FileExistsError` error if the file already exists.
        encrypt: Whether to encrypt the snapshot on write.

    Raises:
        FileExistsError: If `output_path` already exists.
        NotImplementedError: If `encrypt` is requested.
        RuntimeError: If `output_path` does not end in `.tar` or `.tar.gz`.
        FileNotFoundError: If the `mongodump` executable cannot be found.
        subprocess.CalledProcessError: If `mongodump` exits with an error.

    """
    if output_path.exists():
        raise FileExistsError(f"Not overwriting existing file at {output_path}")

    if encrypt:
        raise NotImplementedError("Snapshot encryption not yet implemented")

    # `Path.suffixes` is a list and `Path.suffix` is a string: compare like
    # with like, and only look at the trailing suffixes so that dotted stems
    # (e.g. `backup.2023.tar.gz`) are still recognised.
    if output_path.suffixes[-2:] == [".tar", ".gz"]:
        mode = "w:gz"
    elif output_path.suffix == ".tar":
        mode = "w"
    else:
        raise RuntimeError(
            f"Output path should either be a .tar or .tar.gz file, not {output_path} with {output_path.suffix}"
        )

    LOGGER.info("Creating snapshot of entire datalab instance.")
    LOGGER.debug("Creating snapshot of %s", CONFIG.FILE_DIRECTORY)

    try:
        with tarfile.open(output_path, mode=mode) as tar:
            # Add contents of `CONFIG.FILE_DIRECTORY` to the tar file
            file_root = Path(CONFIG.FILE_DIRECTORY)
            for file in file_root.iterdir():
                tar.add(file, arcname=Path("files") / file.relative_to(file_root))

            LOGGER.debug("Snapshot of %s created.", CONFIG.FILE_DIRECTORY)

            # Take a database dump and add it to the tar file
            LOGGER.debug("Taking dump of database %s", CONFIG.MONGO_URI)

            # Check that mongodump is available
            subprocess.check_output(["mongodump", "--version"])

            with tempfile.TemporaryDirectory() as temp_dir:
                command = ["mongodump", CONFIG.MONGO_URI, "--out", str(Path(temp_dir).resolve())]
                subprocess.check_output(command)

                # Every top-level entry of the dump is stored under `mongodb`,
                # which is the layout `restore_snapshot` reads back.
                for file in Path(temp_dir).iterdir():
                    tar.add(file, arcname=Path("mongodb"))

            LOGGER.debug("Dump of database %s created.", CONFIG.MONGO_URI)

            LOGGER.debug("Dumping server config.")
            with tempfile.TemporaryDirectory() as temp_dir:
                tmp_config = Path(temp_dir) / "config.json"
                # Write and close the file *before* archiving it, so that the
                # archived copy is guaranteed to be fully flushed to disk.
                tmp_config.write_text(
                    CONFIG.json(indent=2, skip_defaults=True), encoding="utf-8"
                )
                tar.add(
                    tmp_config,
                    arcname=Path("config") / "config.json",
                )
            LOGGER.debug("Config dump created.")
    except BaseException:
        # Do not leave a truncated archive behind: it could be mistaken for a
        # valid snapshot and would block a retry with `FileExistsError`.
        output_path.unlink(missing_ok=True)
        raise

    LOGGER.info("Snapshot saved at %s", output_path)


def restore_snapshot(snapshot_path: Path, decrypt: bool = False):
    """Restore a snapshot created with `make_snapshot` to the current
    datalab instance, using the current configuration.

    This will overwrite the contents of any existing MongoDB of the same
    name.

    Arguments:
        snapshot_path: Path to the .tar or .tar.gz snapshot file.
        decrypt: Whether the snapshot needs to be decrypted on read
            (not yet implemented).

    Raises:
        NotImplementedError: If `decrypt` is requested.
        FileNotFoundError: If `snapshot_path` does not exist.
        RuntimeError: If `snapshot_path` does not end in `.tar` or `.tar.gz`.
        subprocess.CalledProcessError: If `mongorestore` exits with an error.

    """
    LOGGER.info("Attempting to restore snapshot from %s", snapshot_path)
    if decrypt:
        raise NotImplementedError("Snapshot decryption not yet implemented")
    if not snapshot_path.exists():
        raise FileNotFoundError(f"Snapshot file not found at {snapshot_path}")

    # `Path.suffixes` is a list and `Path.suffix` is a string: compare like
    # with like, and only look at the trailing suffixes so that dotted stems
    # (e.g. `backup.2023.tar.gz`) are still recognised.
    if snapshot_path.suffixes[-2:] == [".tar", ".gz"]:
        mode = "r:gz"
    elif snapshot_path.suffix == ".tar":
        mode = "r"
    else:
        raise RuntimeError(
            f"Snapshot path should either be a .tar or .tar.gz file, not {snapshot_path} with {snapshot_path.suffix}"
        )

    with tarfile.open(snapshot_path, mode=mode) as tar:
        members = tar.getmembers()

        LOGGER.debug("Restoring files from %s", snapshot_path)
        files = [m for m in members if m.name.startswith("files/")]
        # NOTE(review): member names keep their `files/` prefix, so they are
        # extracted to `CONFIG.FILE_DIRECTORY/files/...` -- confirm this is
        # the intended restore location.
        # NOTE(review): `extractall` trusts member paths; only restore
        # snapshots from a trusted source.
        tar.extractall(path=CONFIG.FILE_DIRECTORY, members=files)
        LOGGER.debug("Files restored from %s", snapshot_path)

        LOGGER.debug("Restoring database from %s", snapshot_path)
        with tempfile.TemporaryDirectory() as temp_dir:
            database = [m for m in members if m.name.startswith("mongodb/")]
            tar.extractall(path=temp_dir, members=database)
            # `--drop` removes each existing collection before restoring it.
            command = ["mongorestore", CONFIG.MONGO_URI, "--drop", str(Path(temp_dir) / "mongodb")]
            subprocess.check_output(command)
        LOGGER.debug("Database restored from %s", snapshot_path)

    LOGGER.info("Snapshot restored from %s", snapshot_path)
18 changes: 9 additions & 9 deletions pydatalab/pydatalab/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
import os
from pathlib import Path
from typing import Annotated, Any, Dict, List, Optional, Type, Union
from typing import Any, Dict, List, Optional, Type, Union

from pydantic import AnyUrl, BaseModel, BaseSettings, Field, root_validator, validator

Expand Down Expand Up @@ -55,12 +55,10 @@ class Config:
extra = "allow"


CronTabStr = Annotated(
str, Field(regex=r"^(?:\*|\d+(?:-\d+)?)(?:\/\d+)?(?:,\d+(?:-\d+)?(?:\/\d+)?)*$")
)
class BackupStrategy(BaseModel):
"""This model describes the config of a particular backup strategy."""


class BackupConfig(BaseModel):
label: str = Field(description="A human-readable label for the backup strategy.")
hostname: str = Field(description="The hostname of the server on which to store the backup.")
location: Path = Field(
description="The location under which to store the backups on the host. Each backup will be date-stamped and stored in a subdirectory of this location."
Expand All @@ -69,8 +67,10 @@ class BackupConfig(BaseModel):
None,
description="How many copies of this backup type to keep. For example, if the backup runs daily, this number indicates how many previous days worth of backup to keep. If the backup size ever decreases between days, the largest backup will always be kept.",
)
frequency: CronTabStr = Field(
description="The frequency of the backup, described in the crontab syntax."
frequency: Optional[str] = Field(
None,
description="The frequency of the backup, described in the crontab syntax.",
pattern=r"^(?:\*|\d+(?:-\d+)?)(?:\/\d+)?(?:,\d+(?:-\d+)?(?:\/\d+)?)*$",
)
notification_email_address: str | None = Field(
description="An email address to send backup notifications to."
Expand Down Expand Up @@ -157,7 +157,7 @@ class ServerConfig(BaseSettings):
its importance when deploying a datalab instance.""",
)

BACKUPS: Optional[list[BackupConfig]] = Field(
BACKUP_STRATEGIES: Optional[list[BackupStrategy]] = Field(
None, description="A perscription of the desired backup configuration."
)

Expand Down
35 changes: 28 additions & 7 deletions pydatalab/tasks.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
import json
import os
import pathlib
import re
import sys
from datetime import datetime
from typing import TYPE_CHECKING, Tuple
from typing import Optional, Tuple

from invoke import Collection, task

from pydatalab.config import BackupStrategy
from pydatalab.logger import setup_log
from pydatalab.models.utils import UserRole

if TYPE_CHECKING:
from pydatalab.config import BackupConfig

ns = Collection()
dev = Collection("dev")
admin = Collection("admin")
Expand Down Expand Up @@ -419,10 +418,20 @@ def generate_random_startingmaterial_id():


@task
def create_backup(_, config: BackupConfig):
from fabric import Connection, Transfer
def create_backup(_, config: Optional[BackupStrategy] = None):
from fabric import Connection
from fabric.transfer import Transfer

if config is None:
print("No config provided, performing dry-run.")
config = BackupStrategy(
hostname="dillbox.ml-evs.science",
location="/tmp",
label="test",
retention=None,
)

from pydatalab.backup import make_snapshot
from pydatalab.backups import make_snapshot

print(f"Initialising backup procedure for {config.label=}")
connection = Connection(config.hostname)
Expand Down Expand Up @@ -494,6 +503,18 @@ def create_backup(_, config: BackupConfig):

admin.add_task(create_backup)


@task
def restore_backup(_, snapshot_path: os.PathLike):
    """Restore the datalab instance from the snapshot archive at `snapshot_path`."""
    from pathlib import Path

    from pydatalab.backups import restore_snapshot

    # Normalise whatever path-like value the CLI provides into a `Path`,
    # which is what `restore_snapshot` operates on.
    snapshot = Path(snapshot_path)
    restore_snapshot(snapshot)


admin.add_task(restore_backup)

ns.add_collection(dev)
ns.add_collection(admin)
ns.add_collection(migration)

0 comments on commit 5ff2ebe

Please sign in to comment.