Native support for automatic backups #467

Merged: 8 commits, Jan 19, 2024
6 changes: 5 additions & 1 deletion .github/workflows/ci.yml
@@ -56,7 +56,7 @@ jobs:
fail-fast: false
max-parallel: 2
matrix:
python-version: ["3.9", "3.10"]
python-version: ["3.10", "3.11"]

steps:
- uses: actions/checkout@v3
@@ -66,6 +66,10 @@ jobs:
with:
python-version: ${{ matrix.python-version }}

- name: Install MongoDB tools binaries
run: |
wget https://fastdl.mongodb.org/tools/db/mongodb-database-tools-ubuntu2204-x86_64-100.9.0.deb && sudo apt install ./mongodb-database-tools-*-100.9.0.deb

- name: Install latest compatible versions of immediate dependencies
working-directory: ./pydatalab
run: |
1 change: 0 additions & 1 deletion .gitignore
@@ -1,7 +1,6 @@
# Datalab database dirs - these can be removed once we have a better place for them
files/
uploads/
server/
ssh_config
logs/

9 changes: 7 additions & 2 deletions .pre-commit-config.yaml
@@ -24,7 +24,7 @@ repos:
- id: mixed-line-ending

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: "v0.1.4"
rev: "v0.1.13"
hooks:
- id: ruff
args: [--fix]
@@ -36,8 +36,13 @@ repos:
- id: prettier
types_or: [javascript, jsx, vue, html, yaml]

- repo: https://github.com/asottile/pyupgrade
rev: v3.15.0
hooks:
- id: pyupgrade

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.6.1
rev: v1.8.0
hooks:
- id: mypy
additional_dependencies: ["types-all", "pydantic~=1.10"]
3 changes: 3 additions & 0 deletions docker-compose.yml
@@ -64,6 +64,9 @@ services:
networks:
- nginx
- backend
environment:
- PYDATALAB_TESTING=true
- PYDATALAB_MONGO_URI=mongodb://database:27017/datalabvue

database:
build:
1 change: 1 addition & 0 deletions pydatalab/Pipfile
@@ -34,6 +34,7 @@ pybaselines = "*"
rosettasciio = "*"
pyjwt = "*"
invoke = "*"
paramiko = "*"

[dev-packages]
pytest = "*"
2 changes: 1 addition & 1 deletion pydatalab/Pipfile.lock

Some generated files are not rendered by default.

28 changes: 28 additions & 0 deletions pydatalab/docs/config.md
@@ -72,6 +72,7 @@ Currently, there are two mechanisms for accessing remote files:
1. You can mount the filesystem locally and provide the path in your datalab config file. For example, for Cambridge Chemistry users, you will have to (connect to the ChemNet VPN and) mount the Grey Group backup servers on your local machine, then define these folders in your config.
2. Access over SSH: alternatively, you can set up passwordless `ssh` access to a machine (e.g., using `citadel` as a proxy jump), and paths on that remote machine can be configured as separate filesystems. The filesystem metadata will be synced periodically, and any files attached in `datalab` will be downloaded and stored locally on the `pydatalab` server (with the local copy being re-downloaded if it is more than 1 hour old when accessed).


## General Server administration

Currently most administration tasks must be handled directly inside the Python API container.
@@ -86,6 +87,33 @@ It relies on the Excel export feature of ChemInventory and is achieved with `inv
If a future export is made and reimported, the old entries will be kept and updated, rather than overwritten.
*datalab* currently has no functionality for chemical inventory management itself; if you wish to support importing from another inventory system, please [raise an issue](https://github.com/the-grey-group/datalab/issues/new).

### Backups

*datalab* provides a way to configure and create snapshot backups of the database and file store.
The option [`BACKUP_STRATEGIES`][pydatalab.config.ServerConfig.BACKUP_STRATEGIES] allows you to list strategies for scheduled backups, with their frequency, storage location (local or remote) and retention.
These backups are only performed when scheduled externally (e.g., via `cron` on the hosting server), or when triggered manually using the `invoke admin.create-backup` task.
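For illustration, here is a minimal sketch of what a purely local strategy could look like, built with the `BackupStrategy` model that the new `pydatalab.backups` module imports. The exact set of required fields, and how `BACKUP_STRATEGIES` maps strategy names to these objects, are assumptions here rather than something this PR pins down:

```python
from pathlib import Path

from pydatalab.config import BackupStrategy

# Hypothetical "daily-snapshots" strategy: keep snapshots on the local
# filesystem and retain only the 7 most recent ones. The field names follow
# those read by pydatalab/backups.py (hostname, location, retention); any
# other fields the model may require are omitted here for brevity.
daily_snapshots = BackupStrategy(
    hostname=None,                    # None => snapshots stay on the server's local filesystem
    location=Path("/backups/daily"),  # directory where snapshots are written
    retention=7,                      # number of snapshots to keep
)
```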

The simplest way to create a backup is to run `invoke admin.create-backup --output-path /tmp/backup.tar.gz`, which will create a compressed backup.
This should be run from the server or container running the API, and will use the server config to connect to the database and file store.
This approach will not follow any retention strategy.

Alternatively, you can create a backup given the strategy name defined in the server config, using the same task:

```shell
invoke admin.create-backup --strategy-name daily-snapshots
```

This will apply the retention strategy and any copying to remote resources as configured.

When scheduling backups externally, it is recommended that you do not run `cron` inside the server's Docker container.
Instead, you could schedule a job that calls, for example:

```shell
docker compose exec api pipenv run invoke admin.create-backup --strategy-name daily-snapshots
```

In the future, this may be integrated directly into the *datalab* server using a Python-based scheduler.

## Config API Reference

::: pydatalab.config.ServerConfig
4 changes: 2 additions & 2 deletions pydatalab/pydatalab/apps/nmr/utils.py
@@ -53,7 +53,7 @@ def read_bruker_1d(
p_dic, p_data = ng.fileio.bruker.read_pdata(str(processed_data_dir)) # processing data

try:
with open(os.path.join(processed_data_dir, "title"), "r") as f:
with open(os.path.join(processed_data_dir, "title")) as f:
topspin_title = f.read()
except FileNotFoundError:
topspin_title = None
@@ -101,7 +101,7 @@ def read_topspin_txt(filename, sample_mass_mg=None, nscans=None):
LEFTRIGHT_REGEX = r"# LEFT = (-?\d+\.\d+) ppm. RIGHT = (-?\d+\.\d+) ppm\."
SIZE_REGEX = r"SIZE = (\d+)"

with open(filename, "r") as f:
with open(filename) as f:
header = "".join(itertools.islice(f, MAX_HEADER_LINES)) # read the first 10 lines
# print(header)

2 changes: 1 addition & 1 deletion pydatalab/pydatalab/apps/raman/blocks.py
@@ -33,7 +33,7 @@ def load(self, location: str | Path) -> tuple[pd.DataFrame, dict, list[str]]:
if ext == ".txt":
try:
header = []
with open(location, "r", encoding="cp1252") as f:
with open(location, encoding="cp1252") as f:
for line in f:
if line.startswith("#"):
header.append(line)
2 changes: 1 addition & 1 deletion pydatalab/pydatalab/apps/tga/parsers.py
@@ -27,7 +27,7 @@ def parse_mt_mass_spec_ascii(path: Path) -> Dict[str, Union[pd.DataFrame, Dict]]
if not path.exists():
raise RuntimeError(f"Provided path does not exist: {path!r}")

with open(path, "r") as f:
with open(path) as f:
# Read start of file until all header keys have been found
max_header_lines = 8
reads = 0
4 changes: 2 additions & 2 deletions pydatalab/pydatalab/apps/xrd/utils.py
@@ -24,7 +24,7 @@ def parse_xrdml(filename: str) -> pd.DataFrame:
filename: The file to parse.

"""
with open(filename, "r") as f:
with open(filename) as f:
s = f.read()

start, end = getStartEnd(s) # extract first and last angle
@@ -70,7 +70,7 @@ def convertSinglePattern(
)
return outfn

with open(filename, "r") as f:
with open(filename) as f:
s = f.read()

print(f"Processing file {filename}")
206 changes: 206 additions & 0 deletions pydatalab/pydatalab/backups.py
@@ -0,0 +1,206 @@
import datetime
import subprocess
import tarfile
import tempfile
from pathlib import Path
from typing import Any

from pydatalab.config import CONFIG, BackupStrategy
from pydatalab.logger import LOGGER


def take_snapshot(snapshot_path: Path, encrypt: bool = False) -> None:
"""Make a compressed snapshot of the entire datalab deployment that
can be restored from with sufficient granularity, e.g., including
config files.

Creates a tar file with the following structure:
- `./files/` - contains all files in `CONFIG.FILE_DIRECTORY`
- `./mongodb/` - contains a dump of the mongodb database
- `./config/` - contains a dump of the server config

Arguments:
snapshot_path: Desired path to the .tar or .tar.gz output file, will raise
a `FileExistsError` error if the file already exists.
encrypt: Whether to encrypt the snapshot on write.

"""
if snapshot_path.exists():
raise FileExistsError(f"Not overwriting existing file at {snapshot_path}")

if encrypt:
raise NotImplementedError("Snapshot encryption not yet implemented")

if "".join(snapshot_path.suffixes) == ".tar.gz":
mode = "w:gz"
elif "".join(snapshot_path.suffixes) == ".tar":
mode = "w"
else:
raise RuntimeError(
f"Output path should either be a .tar or .tar.gz file, not {snapshot_path} with {snapshot_path.suffix}"
)

LOGGER.info("Creating snapshot of entire datalab instance.")
LOGGER.debug("Creating snapshot of %s", CONFIG.FILE_DIRECTORY)
# Add contents of `CONFIG.FILE_DIRECTORY` to the tar file
with tarfile.open(snapshot_path, mode=mode) as tar:
for file in Path(CONFIG.FILE_DIRECTORY).iterdir():
tar.add(file, arcname=Path("files") / file.relative_to(CONFIG.FILE_DIRECTORY))

LOGGER.debug("Snapshot of %s created.", CONFIG.FILE_DIRECTORY)

# Take a database dump and add it to the tar file
LOGGER.debug("Taking dump of database %s", CONFIG.MONGO_URI)

# Check that mongodump is available
subprocess.check_output(["mongodump", "--version"])

with tempfile.TemporaryDirectory() as temp_dir:
command = ["mongodump", CONFIG.MONGO_URI, "--out", str(Path(temp_dir).resolve())]
subprocess.check_output(command)

for file in Path(temp_dir).iterdir():
tar.add(file, arcname=Path("mongodb"))

LOGGER.debug("Dump of database %s created.", CONFIG.MONGO_URI)

LOGGER.debug("Dumping server config.")
with tempfile.TemporaryDirectory() as temp_dir:
with open(tmp_config := Path(temp_dir) / "config.json", "w") as f:
data = CONFIG.json(indent=2, skip_defaults=True)
f.write(data)

tar.add(
tmp_config,
arcname=Path("config") / "config.json",
)
LOGGER.debug("Config dump created.")
LOGGER.info("Snapshot saved at %s", snapshot_path)


def restore_snapshot(snapshot_path: Path, decrypt: bool = False):
"""Restore a snapshot created with `make_snapshot` to the current
datalab instance, using the current configuration.

This will overwrite the contents of any existing MongoDB of the same
name.

Arguments:
snapshot_path: Path to the .tar or .tar.gz snapshot file.

"""
LOGGER.info("Attempting to restore snapshot from %s", snapshot_path)
if decrypt:
raise NotImplementedError("Snapshot decryption not yet implemented")
if not snapshot_path.exists():
raise FileNotFoundError(f"Snapshot file not found at {snapshot_path}")

if "".join(snapshot_path.suffixes) == ".tar.gz":
mode = "r:gz"
elif "".join(snapshot_path.suffixes) == ".tar":
mode = "r"
else:
raise RuntimeError(
f"Snapshot path should either be a .tar or .tar.gz file, not {snapshot_path} with {snapshot_path.suffix}"
)

with tarfile.open(snapshot_path, mode=mode) as tar:
LOGGER.debug("Restoring files from %s", snapshot_path)
files = [m for m in tar.getmembers() if m.name.startswith("files/")]
tar.extractall(path=CONFIG.FILE_DIRECTORY, members=files)
LOGGER.debug("Files restored from %s", snapshot_path)

LOGGER.debug("Restoring database from %s", snapshot_path)
with tempfile.TemporaryDirectory() as temp_dir:
database = [m for m in tar.getmembers() if m.name.startswith("mongodb/")]
tar.extractall(path=temp_dir, members=database)
command = ["mongorestore", CONFIG.MONGO_URI, "--drop", str(Path(temp_dir) / "mongodb")]
subprocess.check_output(command)
LOGGER.debug("Database restored from %s", snapshot_path)

LOGGER.info("Snapshot restored from %s", snapshot_path)


def create_backup(strategy: BackupStrategy) -> bool:
"""Create a backup given the provided strategy, dealing
with any offsite file transfer and desired retention limits.

Arguments:
strategy: The `BackupStrategy` config.

Returns:
bool: Whether the backup was successful.

"""

snapshot_name = (
f"datalab-snapshot-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.tar.gz"
)

if strategy.hostname is None:
snapshot_path = strategy.location / snapshot_name

if not strategy.location.is_dir():
strategy.location.mkdir(parents=True, exist_ok=True)

take_snapshot(snapshot_path)

existing_snapshots = [
str(s) for s in strategy.location.iterdir() if s.name.startswith("datalab-snapshot-")
]

retention = strategy.retention or 100
if len(existing_snapshots) > retention:
LOGGER.info(
"Cleaning up old snapshots: found %s, retention set to %s",
len(existing_snapshots),
strategy.retention,
)
# Sort into reverse order then remove from the end of the list
sorted_snapshots = sorted(existing_snapshots, reverse=True)
for _ in range(len(sorted_snapshots) - retention):
snapshot_to_delete = sorted_snapshots.pop()
LOGGER.debug("Cleaning up snapshot %s", snapshot_to_delete)
(strategy.location / snapshot_to_delete).unlink()
else:
from paramiko.client import SSHClient
from paramiko.config import SSHConfig

ssh_config_path = Path.home() / ".ssh" / "config"
ssh_cfg: dict[str, Any] = {}
if ssh_config_path.exists():
_ssh_cfg = SSHConfig.from_path(str(ssh_config_path.resolve()))
ssh_cfg = dict(_ssh_cfg.lookup(strategy.hostname))

client = SSHClient()
client.load_system_host_keys()
client.connect(strategy.hostname, username=ssh_cfg.get("user", None))

sftp = client.open_sftp()
try:
sftp.chdir(path=str(strategy.location))
except OSError:
sftp.mkdir(path=str(strategy.location))
sftp.chdir(path=str(strategy.location))

with tempfile.TemporaryDirectory() as tmp_dir:
snapshot_path = Path(tmp_dir) / snapshot_name

take_snapshot(snapshot_path)

# Delete the oldest snapshots
if strategy.retention is not None:
existing_snapshots = [
s
for s in sftp.listdir(str(strategy.location))
if s.startswith("datalab-snapshot-")
]
if len(existing_snapshots) > strategy.retention:
# Sort into reverse order then remove from the end of the list
sorted_snapshots = sorted(existing_snapshots, reverse=True)
for _ in range(len(sorted_snapshots) - strategy.retention):
sftp.remove(str(strategy.location / sorted_snapshots.pop()))

sftp.put(snapshot_path, str(strategy.location / snapshot_name), confirm=True)

return True
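
For reference (not part of this PR's diff), a minimal sketch of how the new module could be exercised from a Python shell inside the API container; the paths and the `BackupStrategy` fields shown are placeholders, and any further required fields of the model are assumed away here:

```python
from pathlib import Path

from pydatalab.backups import create_backup, restore_snapshot, take_snapshot
from pydatalab.config import BackupStrategy

# One-off compressed snapshot; raises FileExistsError if the file already exists.
take_snapshot(Path("/tmp/datalab-backup.tar.gz"))

# Backup following a local strategy that retains the 7 newest snapshots.
create_backup(BackupStrategy(hostname=None, location=Path("/backups/daily"), retention=7))

# Restore a snapshot into the currently configured database and file store.
restore_snapshot(Path("/tmp/datalab-backup.tar.gz"))
```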