From 8439f37ce14fb2b73084457b3bab5c68f4230756 Mon Sep 17 00:00:00 2001 From: Matthew Evans <7916000+ml-evs@users.noreply.github.com> Date: Wed, 1 Nov 2023 19:09:42 +0000 Subject: [PATCH] Revamp installation docs with additional configuration/administration/deployment info (#490) * Revamp installation docs with additional deployment information * Improve docstrings for config and admin tasks * Greatly expand configuration/administration and installation docs * doc edits * Remove section on tasks.py and tweak text --------- Co-authored-by: jdbocarsly --- INSTALL.md | 74 +++++++++++++++++++---------------- README.md | 12 +++--- mkdocs.yml | 7 ++-- pydatalab/docs/.pages | 2 +- pydatalab/docs/config.md | 66 ++++++++++++++++++++++++++++++- pydatalab/pydatalab/config.py | 15 ++++++- pydatalab/tasks.py | 13 +++++- webapp/package.json | 2 +- 8 files changed, 140 insertions(+), 51 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index d8d041586..41e840017 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,29 +1,37 @@ # Installation +*datalab* is intended to be deployed on a persistent server accessible on the web that can act as a data management platform +for a group of researchers. +The instructions below outline how to make a development installation on your local machine. +We strongly recommend using the [Docker setup instructions](#deployment-with-docker) if you are deploying for use in production. + This repository consists of two components: -- a Flask-based Python web server (`pydatalab`) that communicates with the database backend, +- a Flask-based Python web server (`pydatalab`) that communicates with the database backend - a JavaScript+Vue web application for the user interface. -To run an instance, you will need to install the environments for each component. +`pydatalab` can in principle be used without the web app frontend through its JSON API. + +## Local (development) installation + +To run *datalab*, you will need to install the environments for each component. 
Firstly, from the desired folder, clone this repository from GitHub to your local machine with `git clone https://github.com/the-grey-group/datalab`. -Alternatively, if you do not wish to contribute to the code, you can simply download the current state as a .zip file from [GitHub](https://github.com/the-grey-group/datalab/archive/refs/heads/main.zip). -Should you wish to just run/deploy the apps themselves, the easiest method is to use Docker ([instructions below](#deployment-with-docker)). +### `pydatalab` server installation -## `pydatalab` server installation +The instructions in this section will leave you with a running JSON API on your host machine. +This can hypothetically be used entirely independently from the web front-end through the JSON API. 1. Install `pipenv` on your machine. - Detailed instructions for installing `pipenv`, `pip` and Python itself can be found on the [`pipenv` website](https://pipenv.pypa.io/en/latest/install/#installing-pipenv). - - We recommend you install `pipenv` from PyPI (with `pip install pipenv` or `pip install --user pipenv`) for the Python distribution of your choice (in a virtual environment or otherwise). This is distinct from the virtual environment that `pipenv` itself will create for the `pydatalab` package. - + - We recommend you install `pipenv` from PyPI (with `pip install pipenv` or `pip install --user pipenv`) for the Python distribution of your choice (in a virtual environment or otherwise). `pipenv` will be used to create its own virtual environment for installation of the `pydatalab` package. 1. Set up MongoDB. 1. Install the free MongoDB community edition (full instructions on the [MongoDB website](https://docs.mongodb.com/manual/installation/)). * For Mac users, MongoDB is available via [HomeBrew](https://github.com/mongodb/homebrew-brew). 
- You can alternatively run the MongoDB via Docker using the config in this package with `docker-compose up mongo` (see further instructions [below](#deployment-with-docker). - * If you wish to view the database directly, MongoDB has several GUIs, e.g. [MongoDB Compass](https://www.mongodb.com/products/compass) or [RoboMongo](https://robomongo.org/). - - For persistence, you will need to set up MongoDB to run as a service on your computer (or run manually each time you use the site). + * If you wish to view the database directly, MongoDB has several GUIs, e.g. [MongoDB Compass](https://www.mongodb.com/products/compass) or [Studio 3T](https://robomongo.org/). + - For persistence, you will need to set up MongoDB to run as a service on your computer (or run manually each time you run the `pydatalab` server). 1. In MongoDB, create a database called "datalabvue" ([further instructions on the MongoDB website](https://www.mongodb.com/basics/create-database)). - You can do this with the `mongo` shell (`echo "use datalabvue" | mongo`) or with Compass. 1. Install the `pydatalab` package. @@ -31,7 +39,7 @@ Should you wish to just run/deploy the apps themselves, the easiest method is to - This will create a `pipenv` environment for `pydatalab` and all of its dependencies that is registered within *this folder* only. 1. Run the server from the `pydatalab` folder with `pipenv run python pydatalab/main.py`. -The server should now be accessible at http://localhost:5001. If the server is running, navigating to this URL will display "Hello, This is a server". +The server should now be accessible at [http://localhost:5001](http://localhost:5001). If the server is running, navigating to this URL will display a simple dashboard with a textual list of available endpoints. 
Should you wish to contribute to/modify the Python code, you may wish to perform these extra steps: @@ -40,35 +48,34 @@ Should you wish to contribute to/modify the Python code, you may wish to perform - The hooks that run on each commit can be found in the top-level `.pre-commit-config.yml` file. 1. The tests on the Python code can be run by executing `py.test` from the `pydatalab/` folder. -Additional notes: +#### Additional notes - If the Flask server is running when the source code is changed, it will generally hot-reload without needing to manually restart the server. -- You may have to configure the `MONGO_URI` config in `main.py` depending on your MongoDB setup. In the future, this will be accessible via a config file. - -## Web app - -1. If you do not already have it, install node.js and the Node Package Manager (`npm`). It is recommended not to install node using the official installer, since it is difficult to manage or uninstall, and permissions issues may arise. Intead, it is recommended to install and manage versions using the [node version manager (nvm)][https://github.com/nvm-sh/nvm#installing-and-updating]: +- You may have to set `MONGO_URI` in your config file or environment variables (`PYDATALAB_MONGO_URI`) depending on your MongoDB setup; further details can be found in the [Server Configuration](config.md) instructions. - ```nvm install --lts``` +### Web app - This will install the current recommended version of node and nvm. +1. If you do not already have it, install `node.js` and the Node Package Manager (`npm`). +It is recommended not to install node using the official installer, since it is difficult to manage or uninstall, and permissions issues may arise. +Instead, it is recommended to install and manage versions using the [node version manager (nvm)](https://github.com/nvm-sh/nvm#installing-and-updating): `nvm install --lts`. +This will install the current recommended version of node and nvm. -2.
Once installed, use it to install the `yarn` package manager: - ```npm install --global yarn``` - - From this point on, the `npm` command is not needed - all package and script management for the webapp is handled using `yarn`. +2. Once installed, use it to install the `yarn` package manager: `npm install --global yarn` +From this point on, the `npm` command is not needed - all package and script management for the webapp is handled using `yarn`. 3. Navigate to the `webapp/` directory in your local copy of this repository and run `yarn install` (requires ~400 MB of disk space). 4. Run the webapp from a development server with `yarn serve`. -Similar to the Flask development server, these steps will provide a development environment that serves the web app at `localhost:8080` (by default) and automatically reloads it as changes are made to the source code. +#### Additional notes + +Similar to the Flask development server, these steps will provide a development environment that serves the web app at [http://localhost:8081](http://localhost:8081) (by default) and automatically reloads it as changes are made to the source code. + Various other development scripts are available through `yarn`: - `yarn lint`: Lint the javascript code using `eslint`, identifying issues and automatically fixing many. This linting process also runs automatically every time the development server reloads. -- `yarn test:unit`: run the unit/componenet tests using `jest`. These test individual functions or components. +- `yarn test:unit`: run the unit/component tests using `jest`. These test individual functions or components. - `yarn test:e2e`: run end-to-end tests using `cypress`. This will build and serve the app, and launch an instance of Chrome where the tests can be interactively viewed. The tests can also be run without the gui using ```yarn test:e2e --headless```. Note: currently, the tests make requests to the server running on `localhost:5001`. 
- `yarn build`: Compile an optimized, minimized, version of the app for production. - ## Deployment with Docker [Docker](https://docs.docker.com/) uses virtualization to allow you to build "images" of your software that are transferrable and deployable as "containers" across multiple systems. @@ -85,14 +92,13 @@ The development target mounts the repository in the running container and provid - Individual containers can be launched with `docker compose up ` for the services `mongo`, `app`, `app_dev`, `api` and `api_dev`. - `docker compose stop` will stop all running containers. -## Note on remote filesystems - -This package allows you to attach files from remote filesystems to samples and other entries. -These filesystems can be configured in the config file with the `REMOTE_FILESYSTEMS` option. -In practice, these options should be set in a centralised deployment. - +## Permanent deployment instructions -Currently, there are two mechanisms for accessing remote files: +There are several steps involved from taking the Docker containers above and provisioning a persistent *datalab* server and instance available through the internet. +Many of these involve tuning the server configuration for your group following the [additional documentation](config.md) on configuration, but many additional choices also depend on how you plan to host the containers in the long-term. +Some things to consider: -1. You can mount the filesystem locally and provide the path in your datalab config file. For example, for Cambridge Chemistry users, you will have to (connect to the ChemNet VPN and) mount the Grey Group backup servers on your local machine, then define these folders in your config. -2. Access over `ssh`: alternatively, you can set up passwordless `ssh` access to a machine (e.g., using `citadel` as a proxy jump), and paths on that remote machine can be configured as separate filesystems. 
The filesystem metadata will be synced periodically, and any attached files will be downloaded and stored locally (with the file being kept younger than 1 hour old on each access). +- Typically you will host the app and API containers on the same server behind a reverse proxy such as [Nginx](https://nginx.org). +- Typically you will need to run the app and API on two different subdomains. +These can be provided perhaps by an IT department, or by configuring DNS settings on your own domain to point to the server. +You will need to configure the app so that it points at the relevant hosted API (see [app `.env` description](config.md#app)). diff --git a/README.md b/README.md index 6be1d13fa..6d0210ccf 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ src="https://img.shields.io/readthedocs/the-datalab?logo=readthedocs&color=bluev > 📢 If you are interested in joining the *datalab* mailing list and helping decide its future, please fill out [the survey](https://forms.gle/etq4pcsQJ3omLBmj6). -> ℹ️ We have created a public deployment of *datalab* for potential users to test. Please register via the magic-link email sign in at [public.datalab.odbx.science](https://public.datalab.odbx.science). Any data stored here will be private but we provide no assurances for availability or data backups, so please do not use this for production work. +> ℹ️ We have created a public deployment of *datalab* for potential users to test. Please register via the magic-link email sign in at [public.datalab.odbx.science](https://public.datalab.odbx.science). Any data stored here will not be visible to others except the admins of the deployment, who will only use it for debugging purposes. We provide no assurances for availability or data backups on this deployment, so please do not use this for production work.
@@ -34,7 +34,7 @@ Importantly, *datalab* stores a network of interconnected research objects in th - a Vue 3 web application for a GUI that can be used to record information on samples alongside raw data files and analysis documents. -The system was developed for and is currently deployed for the +The system was originally developed in and is currently deployed for the [Grey Group](https://www.ch.cam.ac.uk/group/grey/) in the Department of Chemistry at the University of Cambridge. @@ -58,16 +58,16 @@ in the Department of Chemistry at the University of Cambridge. ### UI -- A simple, intuitive UI for recording sample metadata and relationships with - other samples (batches, offshoots), alongside synthesis parameters and raw data. -- Basic analysis and plotting of live and archived data attached to a sample, e.g., +- A simple, intuitive UI for recording sample-based metadata and relationships with + other samples (batches, derivatives, _etc._), alongside synthesis parameters and raw data. +- Basic analysis and plotting of live and archived data attached to a sample, _e.g._, characterisation via XRD or NMR, electrochemical cycling data and images (see "Data blocks" section for a complete list). - Interactive network visualisation of the connections between samples and inventory. ## Development status *datalab* remains under active development, and the API, data models and UI may change significantly between versions without prior notice. -Where possible, and without breaking changes will be listed in the release notes for every pre-v1 release. +Where possible, breaking changes will be listed in the release notes for every pre-v1 release. 
## Installation diff --git a/mkdocs.yml b/mkdocs.yml index 5569fb6d5..a628720f8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -54,7 +54,7 @@ plugins: show_root_toc_entry: true show_root_full_path: true show_object_full_path: false - show_category_heading: true + show_category_heading: false show_if_no_docstring: true show_signature_annotations: true show_source: true @@ -65,14 +65,13 @@ plugins: members: true inherited_members: true docstring_style: google - enable_inventory: false + enable_inventory: true filters: - "!^_[^_]" - "!__json_encoder__$" - "!__all__$" - "!__config__$" - - "!ValidatorResults$" - - "!Config" + - "!^Config$" - awesome-pages - autorefs diff --git a/pydatalab/docs/.pages b/pydatalab/docs/.pages index dab24a68c..88ee6df7c 100644 --- a/pydatalab/docs/.pages +++ b/pydatalab/docs/.pages @@ -1,8 +1,8 @@ nav: - index.md - INSTALL.md + - config.md - design - schemas - blocks - rest_api.md - - config.md diff --git a/pydatalab/docs/config.md b/pydatalab/docs/config.md index 7484317bc..61b51445c 100644 --- a/pydatalab/docs/config.md +++ b/pydatalab/docs/config.md @@ -1,2 +1,66 @@ -title: Server configuration +# Server configuration and administration + +*datalab* has 3 main configuration sources. + +1. The Python `ServerConfig` (described below) that allows for *datalab*-specific configuration, such as database connection info, filestore locations and remote filesystem configuration. + - This can be provided via a JSON or YAML config file at the location provided by the `PYDATALAB_CONFIG_FILE` environment variable, or as environment variables themselves, prefixed with `PYDATALAB_`. + - The available configuration variables and their default values are listed below. +2. Additional server configuration provided as environment variables, such as secrets like Flask's `SECRET_KEY`, API keys for external services (e.g., Sendgrid) and OAuth client credentials (for logging in via GitHub or ORCID). 
+ - These can be provided as environment variables or in a `.env` file in the directory from which `pydatalab` is launched. +3. Web app configuration, such as the URL of the relevant *datalab* API and branding (logo URLs, external homepage links). + - These are typically provided as a `.env` file in the directory from which the webapp is built/served. + +## Configuring user registration/authentication + +*datalab* has three supported user registration/authentication +mechanisms: + +1. OAuth2 via GitHub accounts that are public members of appropriate GitHub +organizations +2. OAuth2 via ORCID +3. via magic links sent to email addresses + +Each is configured differently. + +For GitHub, you must register a [GitHub OAuth +application](https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/creating-an-oauth-app) for your instance, providing the client ID and secret in the `.env` for the API. +Then, you can configure `GITHUB_ORG_ALLOW_LIST` with a list of string IDs of GitHub organizations that users must be public members of to register an account. +If this value is set to `None`, then no accounts will be able to register, and if it is set to an empty list, then no restrictions will apply. + +For ORCID integration, each *datalab* instance must currently register for the ORCID developer program and request new credentials. +As such, this may be tricky to support for new instances. +We are looking for ways around this in the future. + +To support sign-in via email magic-links, you must currently provide +additional configuration for the [SendGrid](https://sendgrid.com/) web API, i.e., your default email sender (`MAIL_DEFAULT_SENDER`) and SendGrid API key (`MAIL_PASSWORD`), as environment variables for the API container. +There are currently no restrictions on which email addresses can sign up. +This approach will soon also support using any configured SMTP server.
+ +## Configuring remote filesystems + +This package allows you to attach files from remote filesystems to samples and other entries. +These filesystems can be configured in the config file with the `REMOTE_FILESYSTEMS` option. +In practice, these options should be set in a centralised deployment. + +Currently, there are two mechanisms for accessing remote files: + +1. You can mount the filesystem locally and provide the path in your datalab config file. For example, for Cambridge Chemistry users, you will have to (connect to the ChemNet VPN and) mount the Grey Group backup servers on your local machine, then define these folders in your config. +2. Access over `ssh`: alternatively, you can set up passwordless `ssh` access to a machine (e.g., using `citadel` as a proxy jump), and paths on that remote machine can be configured as separate filesystems. The filesystem metadata will be synced periodically, and any files attached in `datalab` will be downloaded and stored locally on the `pydatalab` server (with the file being kept younger than 1 hour old on each access). + +## Server administration + +Currently most administration tasks must be handled directly inside the Python API container. +Several helper routines are available as `invoke` tasks in `tasks.py` in the `pydatalab` root folder. +You can list all available tasks by running `invoke --list` in the root `pydatalab` folder after installing the package with the `[dev]` extras. +In the future, many admin tasks (e.g., updating user info, allowing/blocking user accounts, defining subgroups) will be accessible in the web UI. + +### Importing chemical inventories + +One such `invoke` task implements the ingestion of a [ChemInventory](https://cheminventory.net) chemical inventory into *datalab*. +It relies on the Excel export feature of ChemInventory and is achieved with `invoke admin.import-cheminventory FILENAME`.
+If a future export is made and reimported, the old entries will be kept and updated, rather than overwritten. +*datalab* currently has no functionality for chemical inventory management itself; if you wish to support importing from another inventory system, please [raise an issue](https://github.com/the-grey-group/datalab/issues/new). + +# Config API Reference + ::: pydatalab.config diff --git a/pydatalab/pydatalab/config.py b/pydatalab/pydatalab/config.py index e3fd3f3e4..ca64dab7d 100644 --- a/pydatalab/pydatalab/config.py +++ b/pydatalab/pydatalab/config.py @@ -9,6 +9,8 @@ from pydatalab.models import Person from pydatalab.models.utils import RandomAlphabeticalRefcodeFactory, RefCodeFactory +__all__ = ("CONFIG", "ServerConfig", "DeploymentMetadata", "RemoteFilesystem") + def config_file_settings(settings: BaseSettings) -> Dict[str, Any]: """Returns a dictionary of server settings loaded from the default or specified @@ -35,6 +37,8 @@ def config_file_settings(settings: BaseSettings) -> Dict[str, Any]: class DeploymentMetadata(BaseModel): + """A model for specifying metadata about a datalab deployment.""" + maintainer: Optional[Person] issue_tracker: Optional[AnyUrl] homepage: Optional[AnyUrl] @@ -52,6 +56,10 @@ class Config: class RemoteFilesystem(BaseModel): + """Configuration for specifying a single remote filesystem + accessible from the server. + """ + name: str hostname: Optional[str] path: Path @@ -103,7 +111,7 @@ class ServerConfig(BaseSettings): BEHIND_REVERSE_PROXY: bool = Field( False, - description="Whether the Flask app is being deployed behind a reverse proxy. If `True`, the reverse proxy middleware described in the Flask docs (https://flask.palletsprojects.com/en/2.2.x/deploying/proxy_fix/) will be attached to the app.", + description="Whether the Flask app is being deployed behind a reverse proxy. 
If `True`, the reverse proxy middleware described in the [Flask docs](https://flask.palletsprojects.com/en/2.2.x/deploying/proxy_fix/) will be attached to the app.", ) GITHUB_ORG_ALLOW_LIST: Optional[List[str]] = Field( @@ -152,4 +160,7 @@ def update(self, mapping): setattr(self, key.upper(), mapping[key]) -CONFIG = ServerConfig() +CONFIG: ServerConfig = ServerConfig() +"""The global server configuration object. +This is a singleton instance of the `ServerConfig` model. +""" diff --git a/pydatalab/tasks.py b/pydatalab/tasks.py index 2862c193b..b7a6bea6c 100644 --- a/pydatalab/tasks.py +++ b/pydatalab/tasks.py @@ -132,6 +132,7 @@ def manually_register_user( orcid: str | None = None, github_user_id: int | None = None, ): + """Registers a user account with the given identities.""" from pydatalab.models.people import Identity, Person identities = [] @@ -203,6 +204,7 @@ def repair_files(_, resync: bool = True): @task def add_missing_refcodes(_): + """Generates refcodes for any items that are missing them.""" from pydatalab.models.utils import generate_unique_refcode from pydatalab.mongo import get_database @@ -233,8 +235,8 @@ def check_item_validity(_, base_url: str | None = None, starting_materials: bool """This task looks up all sample and cell items and checks that they can be successfully accessed through the API. - Requires the environment variable DATALAB_API_KEY to be set. - Will also additionally pass JSON-formatted values from the DATALAB_HEADERS environment variable. + Requires the environment variable `DATALAB_API_KEY` to be set. + Will additionally pass JSON-formatted values from the `DATALAB_HEADERS` environment variable. Parameters: base_url: The API URL. @@ -345,6 +347,13 @@ def check_remotes(_, base_url: str | None = None, invalidate_cache: bool = False @task def import_cheminventory(_, filename: str): + """For a given ChemInventory Excel export, ingest the .xlsx file at `filename` + into the datalab items collection with type `starting_materials`.
+ + Parameters: + filename: The filename of the ChemInventory exported spreadsheet to import. + + """ import random import pandas as pd diff --git a/webapp/package.json b/webapp/package.json index 8fabfc8d4..d5b361571 100644 --- a/webapp/package.json +++ b/webapp/package.json @@ -1,6 +1,6 @@ { "name": "datalab-vue", - "version": "0.1.0", + "version": "0.2.5", "private": true, "scripts": { "serve": "vue-cli-service serve",