From 7a3e6ac5e6ca9276cb1c29b11cd0e394eb09402e Mon Sep 17 00:00:00 2001 From: DaMandal0rian <3614052+DaMandal0rian@users.noreply.github.com> Date: Thu, 26 Sep 2024 12:22:47 +0200 Subject: [PATCH] scripts to manage node launch (#342) * scripts to manage node launch * fix stderr false errors from docker * update readme * add timekeeper node logic * update readme for POT entropy * more debug logging and improvements * use sed and ssh instead of sftp for .env modifying --- scripts/launch-nodes/README.md | 117 ++++++++ scripts/launch-nodes/install_dependencies.sh | 42 +++ scripts/launch-nodes/manage_subspace.py | 266 +++++++++++++++++++ scripts/launch-nodes/nodes.toml | 24 ++ 4 files changed, 449 insertions(+) create mode 100644 scripts/launch-nodes/README.md create mode 100755 scripts/launch-nodes/install_dependencies.sh create mode 100755 scripts/launch-nodes/manage_subspace.py create mode 100644 scripts/launch-nodes/nodes.toml diff --git a/scripts/launch-nodes/README.md b/scripts/launch-nodes/README.md new file mode 100644 index 00000000..ef2df726 --- /dev/null +++ b/scripts/launch-nodes/README.md @@ -0,0 +1,117 @@ + +# Subspace Node Manager + +This script manages the deployment of Subspace nodes (RPC, Farmer, and Bootstrap nodes) on multiple servers using SSH. It updates the `.env` file with the specified release version and coordinates the startup sequence to ensure that RPC and Farmer nodes are started first. The Bootstrap node is updated last with the correct `GENESIS_HASH` and then started. + +## Features + +- SSH into multiple servers defined in a TOML configuration file. +- Modify `.env` files in the Subspace directory with a new release version and update `GENESIS_HASH`. +- Restart Subspace nodes using `docker-compose down -v` and `docker-compose up -d`. +- Retrieve the `protocol_version` hash from the RPC node logs and use it to update the Bootstrap node. +- Ensure proper start order (RPC and Farmer nodes first, Bootstrap node last). 
+ +## Prerequisites + +- **Python 3.x** installed on your local machine. +- The following Python libraries (installed via the provided `install_dependencies.sh` script): + - `paramiko` for SSH connections. + - `tomli` for reading the configuration file. + - `colorlog` for colored log output. +- SSH access to the remote servers where the Subspace nodes are running. +- Docker and Docker Compose installed on the remote servers. + +## Installation + +### Step 1: Install Dependencies + +1. Clone the repository or download the Python script and associated files. +2. Use the provided `install_dependencies.sh` script to install the required Python packages in a virtual environment. + +```bash +chmod +x install_dependencies.sh +./install_dependencies.sh +``` + +This will create a virtual environment (`subspace_env`) and install the required packages: `paramiko`, `tomli`, and `colorlog`. + +### Step 2: Activate the Virtual Environment + +Activate the virtual environment where the dependencies are installed: + +```bash +source subspace_env/bin/activate +``` + +### Step 3: Prepare Configuration + +Create a TOML configuration file (`nodes.toml`) with details for your Bootstrap, RPC, Farmer, and Timekeeper nodes. The file should look like this: + +```toml +# TOML file containing server details + +[bootstrap_node] +host = "bootstrap.example.com" +user = "username" +ssh_key = "/path/to/private/key" + +[[farmer_rpc_nodes]] +host = "rpc.example.com" +user = "username" +ssh_key = "/path/to/private/key" +type = "rpc" + +[[farmer_rpc_nodes]] +host = "farmer.example.com" +user = "username" +ssh_key = "/path/to/private/key" +type = "farmer" + +[timekeeper] +host = "timekeeper.example.com" +user = "username" +ssh_key = "/path/to/private/key" +type = "timekeeper" +``` + +- **`bootstrap_node`:** This section defines the Bootstrap node. +- **`farmer_rpc_nodes`:** This array of tables contains the RPC and Farmer nodes. The `type` field specifies whether the node is an RPC node or a Farmer node. +- **`timekeeper`:** This section defines the Timekeeper node, whose `.env` file receives the `POT_EXTERNAL_ENTROPY` value. 
+ +### Step 4: Running the Script + +Once the configuration file is ready, make the Python script executable and run it with the following command: + +```bash +chmod +x manage_subspace.py +python manage_subspace.py --config nodes.toml --release_version gemini-3h-2024-sep-17 --subspace_dir /home/ubuntu/subspace/subspace --pot_external_entropy random_value +``` + +- `--config`: Path to the TOML configuration file. +- `--release_version`: The release version to be used to update the `DOCKER_TAG` in the `.env` files. +- `--subspace_dir`: Path to the Subspace directory (default: `/home/ubuntu/subspace`). +- `--pot_external_entropy`: The external entropy value for Proof of Time, written to `POT_EXTERNAL_ENTROPY` in the Timekeeper node's `.env` file. +- `--log_level`: Logging level (`DEBUG`, `INFO`, `WARNING`, or `ERROR`; default: `INFO`). + +### Step 5: Deactivate the Virtual Environment + +Once the script has run, deactivate the virtual environment: + +```bash +deactivate +``` + +## Logging and Error Handling + +The script logs important actions and any errors that occur. The following log levels are used: + +- **INFO**: General information about the script's progress (e.g., starting/stopping nodes, modifying files). +- **WARNING**: Warnings about non-critical issues (e.g., retries during protocol version extraction). +- **ERROR**: Errors that prevent successful execution (e.g., failed SSH connections, issues with running commands). + +## Retry Mechanism + +The script includes a retry mechanism when extracting the `protocol_version` from the RPC node logs. It greps the log up to 5 times (the default number of attempts), waiting 30 seconds (the default delay) between attempts. + +## License + +This project is licensed under the MIT License. + +## Troubleshooting + +- Ensure you have SSH access to all nodes and that your private key is properly configured. +- Ensure Docker and Docker Compose are installed and configured on the target servers. +- Check your `.env` file permissions to make sure the script can read and write to it. 
diff --git a/scripts/launch-nodes/install_dependencies.sh b/scripts/launch-nodes/install_dependencies.sh new file mode 100755 index 00000000..91787dc8 --- /dev/null +++ b/scripts/launch-nodes/install_dependencies.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# Function to check if Python is installed +check_python() { + if ! command -v python3 &> /dev/null + then + echo "Python3 could not be found. Please install Python 3.x before proceeding." + exit 1 + fi +} + +# Function to create a virtual environment and install dependencies +install_dependencies() { + # Check if virtualenv is installed, if not install it + if ! python3 -m venv --help &> /dev/null; then + echo "virtualenv not found, installing..." + pip3 install virtualenv + fi + + # Create virtual environment + echo "Creating a virtual environment..." + python3 -m venv subspace_env + + # Activate the virtual environment + source subspace_env/bin/activate + + # Install required Python packages + echo "Installing required dependencies with pip..." + pip install paramiko tomli colorlog + + # Deactivate virtual environment after installing + deactivate + + echo "Dependencies installed in 'subspace_env' virtual environment." 
"""Coordinate the launch of Subspace nodes over SSH.

The script reads node connection details from a TOML file, then:

1. runs ``sudo docker compose down -v`` on every farmer/RPC node;
2. updates ``DOCKER_TAG`` in each farmer/RPC node's ``.env`` and starts it,
   extracting the protocol version hash from the RPC node's container logs;
3. writes ``POT_EXTERNAL_ENTROPY`` to the timekeeper node (when configured
   and a value was supplied) and starts it;
4. restarts the bootstrap node with ``GENESIS_HASH`` set to the extracted hash.
"""

import argparse
import logging
import os
import re
import shlex
from contextlib import contextmanager
from time import sleep

import colorlog
import paramiko
import tomli

# Configure logging with colorlog
handler = colorlog.StreamHandler()
handler.setFormatter(colorlog.ColoredFormatter(
    '%(log_color)s%(asctime)s - %(levelname)s - %(message)s',
    log_colors={
        'DEBUG': 'cyan',
        'INFO': 'green',
        'WARNING': 'yellow',
        'ERROR': 'red',
        'CRITICAL': 'bold_red',
    }
))
logger = colorlog.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Docker Compose reports progress on stderr; these markers identify such
# status lines when the remote side does not report an exit status.
_DOCKER_STATUS_KEYWORDS = ("Stopping", "Stopped", "Creating", "Started", "Removing", "Removed")

# Matches e.g. "protocol_version=/subspace/2/<hex hash>" in the node logs.
_PROTOCOL_VERSION_RE = re.compile(r'protocol_version=/subspace/2/([a-f0-9]+)')


def ssh_connect(host, user, key_file):
    """Establish an SSH connection to a server.

    Args:
        host: Hostname or IP address of the server.
        user: Username to log in with.
        key_file: Path to the private key file used for authentication.

    Returns:
        A connected ``paramiko.SSHClient``. The caller must close it.

    Raises:
        Exception: Whatever ``SSHClient.connect`` raised, after logging it.
    """
    client = paramiko.SSHClient()
    # NOTE(security): AutoAddPolicy accepts unknown host keys without
    # verification, so a man-in-the-middle would go unnoticed. Review whether
    # known_hosts checking is feasible for the target fleet.
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        client.connect(hostname=host, username=user, key_filename=key_file)
    except Exception as e:
        # Release the client's resources instead of leaking them on failure.
        client.close()
        logger.error(f"Failed to connect to {host}: {e}")
        raise
    logger.info(f"Connected to {host}")
    return client


def run_command(client, command):
    """Run a command over SSH and return its output.

    The command's exit status decides whether stderr text is logged as an
    error. Docker writes ordinary progress messages to stderr, so stderr from
    a command that exited with status 0 is logged at INFO level.

    Args:
        client: Connected SSH client exposing ``exec_command``.
        command: Shell command line to execute remotely.

    Returns:
        A ``(stdout_text, stderr_text)`` tuple of decoded strings.

    Raises:
        Exception: If the command could not be executed or its streams read.
    """
    try:
        _stdin, stdout, stderr = client.exec_command(command)
        output = stdout.read().decode('utf-8', errors='replace')
        error = stderr.read().decode('utf-8', errors='replace')
        # -1 means the server did not report an exit status.
        exit_status = stdout.channel.recv_exit_status()
    except Exception as e:
        logger.error(f"Failed to run command: {command}: {e}")
        raise

    if exit_status < 0:
        # No exit status available: fall back to recognising Docker status text.
        succeeded = any(keyword in error for keyword in _DOCKER_STATUS_KEYWORDS)
    else:
        succeeded = exit_status == 0

    if error:
        if succeeded:
            logger.info(f"Command output: {error.strip()}")
        else:
            logger.error(f"Error running command: {error.strip()}")
    elif exit_status > 0:
        # e.g. grep without a match; callers inspect the returned output.
        logger.debug(f"Command exited with status {exit_status}: {command}")
    return output, error


def docker_compose_down(client, subspace_dir):
    """Run sudo docker compose down -v in the subspace directory.

    Args:
        client: Connected SSH client.
        subspace_dir: Remote directory containing the compose project.

    Raises:
        Exception: If the command could not be executed.
    """
    try:
        command = f'cd {shlex.quote(subspace_dir)} && sudo docker compose down -v'
        logger.info(f"Running sudo docker compose down -v in {subspace_dir}")
        run_command(client, command)
    except Exception as e:
        logger.error(f"Failed to run sudo docker compose down -v: {e}")
        raise


def _sed_escape(value):
    """Escape *value* for the replacement part of a ``s/.../.../`` sed expression."""
    return re.sub(r'[\\/&]', r'\\\g<0>', str(value))


def _set_env_var_command(env_path, key, value):
    """Build a shell command that sets ``key=value`` in the file at *env_path*.

    An existing ``key=`` line is replaced in place; otherwise the assignment
    is appended. The command fails with a message on stderr when the file
    does not exist.
    """
    path = shlex.quote(env_path)
    missing_msg = shlex.quote(f"{env_path}: no such file")
    key_pattern = shlex.quote(f"^{key}=")
    sed_expr = shlex.quote(f"s/^{key}=.*/{key}={_sed_escape(value)}/")
    new_line = shlex.quote(f"{key}={value}")
    return (
        f"if [ ! -f {path} ]; then echo {missing_msg} >&2; exit 1; fi; "
        f"if grep -q {key_pattern} {path}; "
        f"then sed -i {sed_expr} {path}; "
        f"else printf '%s\\n' {new_line} >> {path}; fi"
    )


def modify_env_file(client, subspace_dir, release_version, genesis_hash=None, pot_external_entropy=None):
    """Modify the .env file to update the Docker tag, Genesis Hash, and POT_EXTERNAL_ENTROPY.

    Each value is written with a remote ``sed`` replacement, or appended when
    the key is not yet present. Values and paths are shell-quoted and
    sed-escaped, so characters such as ``/``, ``&`` or spaces cannot corrupt
    the command.

    Args:
        client: Connected SSH client exposing ``exec_command``.
        subspace_dir: Remote directory containing the ``.env`` file.
        release_version: Value written to ``DOCKER_TAG``.
        genesis_hash: Value written to ``GENESIS_HASH``; skipped when falsy.
        pot_external_entropy: Value written to ``POT_EXTERNAL_ENTROPY``;
            skipped when falsy.

    Raises:
        Exception: If any command exits non-zero or writes to stderr.
    """
    env_path = f"{subspace_dir}/.env"
    assignments = [("DOCKER_TAG", release_version)]
    if genesis_hash:
        assignments.append(("GENESIS_HASH", genesis_hash))
    if pot_external_entropy:
        assignments.append(("POT_EXTERNAL_ENTROPY", pot_external_entropy))

    try:
        for key, value in assignments:
            command = _set_env_var_command(env_path, key, value)
            logger.debug(f"Executing command: {command}")
            _stdin, stdout, stderr = client.exec_command(command)
            stdout.read()  # drain stdout so the remote command can finish
            stderr_text = stderr.read().decode('utf-8', errors='replace')
            exit_status = stdout.channel.recv_exit_status()

            if exit_status > 0 or stderr_text:
                logger.error(f"Error modifying .env file with command: {command}, error: {stderr_text}")
                raise Exception(f"Error modifying .env file: {stderr_text}")
            logger.info(f"Successfully executed command: {command}")
    except Exception as e:
        logger.error(f"Failed to modify .env file: {e}")
        raise


def grep_protocol_version(client, retries=5, interval=30, container='subspace-archival-node-1'):
    """Grep the container logs to find the protocol version and extract the hash.

    Args:
        client: Connected SSH client.
        retries: Total number of attempts.
        interval: Seconds to wait between attempts.
        container: Name of the Docker container whose logs are searched.

    Returns:
        The hexadecimal hash following ``protocol_version=/subspace/2/``, or
        ``None`` when it was not found in any attempt.
    """
    logs_command = f'sudo docker logs --tail 100 {shlex.quote(container)} | grep "protocol_version="'

    for attempt in range(retries):
        try:
            output, _error = run_command(client, logs_command)
            match = _PROTOCOL_VERSION_RE.search(output)
            if match:
                logger.info(f"Protocol version hash found: {match.group(1)}")
                return match.group(1)
            logger.warning(f"Protocol version hash not found. Attempt {attempt + 1} of {retries}")
        except Exception as e:
            # Best effort: a failed attempt is retried like a missing match.
            logger.error(f"Error grepping protocol version: {e}")

        if attempt < retries - 1:
            logger.info(f"Retrying in {interval} seconds...")
            sleep(interval)

    logger.error("Failed to retrieve protocol version hash after retries.")
    return None


def docker_compose_up(client, subspace_dir):
    """Run sudo docker compose up -d in the subspace directory.

    Args:
        client: Connected SSH client.
        subspace_dir: Remote directory containing the compose project.

    Raises:
        Exception: If the command could not be executed.
    """
    try:
        command = f'cd {shlex.quote(subspace_dir)} && sudo docker compose up -d'
        logger.info(f"Running sudo docker compose up -d in {subspace_dir}")
        run_command(client, command)
    except Exception as e:
        logger.error(f"Failed to run sudo docker compose up -d: {e}")
        raise


@contextmanager
def _node_session(node, close_message=None):
    """Yield an SSH client for *node* and always close it afterwards.

    *node* is a mapping with ``host``, ``user`` and ``ssh_key`` entries.
    *close_message*, when given, is logged at DEBUG level after closing.
    """
    client = ssh_connect(node['host'], node['user'], node['ssh_key'])
    try:
        yield client
    finally:
        client.close()
        if close_message:
            logger.debug(close_message)


def main():
    """Parse the command line and run the node launch sequence.

    Failures on individual nodes are logged and do not abort the sequence.

    Returns:
        ``0`` when the bootstrap node was started with the new Genesis Hash,
        ``1`` otherwise.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Manage Subspace nodes via SSH")
    parser.add_argument('--config', required=True, help='Path to the TOML config file')
    parser.add_argument('--release_version', required=True, help='Release version to update in the .env file')
    parser.add_argument('--subspace_dir', default='/home/ubuntu/subspace', help='Path to the Subspace directory (default: /home/ubuntu/subspace)')
    parser.add_argument('--pot_external_entropy', help='POT_EXTERNAL_ENTROPY value for the timekeeper node')
    parser.add_argument('--log_level', default='INFO', help='Set the logging level (DEBUG, INFO, WARNING, ERROR)')
    args = parser.parse_args()

    # The module logger carries its own explicit level, so it must be updated
    # directly; changing only the root logger would leave it at INFO.
    log_level = args.log_level.upper()
    try:
        logger.setLevel(log_level)
    except ValueError:
        parser.error(f"invalid --log_level: {args.log_level}")
    logging.getLogger().setLevel(log_level)

    logger.debug(f"Received POT_EXTERNAL_ENTROPY: {args.pot_external_entropy}")

    # Read configuration from the TOML file using tomli
    with open(args.config, 'rb') as f:
        config = tomli.load(f)

    bootstrap_node = config['bootstrap_node']
    farmer_rpc_nodes = config['farmer_rpc_nodes']
    # The timekeeper section is optional; its handling below is skipped when absent.
    timekeeper_node = config.get('timekeeper')

    release_version = args.release_version
    subspace_dir = args.subspace_dir

    # Step 1: sudo docker compose down -v on all farmer and RPC nodes
    for node in farmer_rpc_nodes:
        closed = f"Closed connection for node {node['host']}"
        try:
            logger.info(f"Connecting to {node['host']} for sudo docker compose down -v...")
            with _node_session(node, closed) as client:
                docker_compose_down(client, subspace_dir)
        except Exception as e:
            logger.error(f"Error during sudo docker compose down -v on {node['host']}: {e}")

    # Step 2: Update .env and start sudo docker compose for RPC and Farmer nodes
    protocol_version_hash = None
    for node in farmer_rpc_nodes:
        closed = f"Closed connection for node {node['host']}"
        try:
            logger.info(f"Connecting to {node['host']}...")
            with _node_session(node, closed) as client:
                modify_env_file(client, subspace_dir, release_version)
                docker_compose_up(client, subspace_dir)

                # If this is the RPC node, grep the logs for protocol version hash
                if node.get('type') == 'rpc':
                    logger.info("Waiting for the RPC node to start...")
                    sleep(30)  # Adjust sleep time as necessary

                    logger.info(f"Grep protocol version from logs on {node['host']}...")
                    found_hash = grep_protocol_version(client)
                    if found_hash:
                        protocol_version_hash = found_hash
                    else:
                        # Keep any hash found on an earlier RPC node.
                        logger.error(f"Failed to retrieve protocol version hash on {node['host']}")
        except Exception as e:
            logger.error(f"Error during update and start on {node['host']}: {e}")

    if timekeeper_node:
        if args.pot_external_entropy:
            closed = f"Closed connection to timekeeper node {timekeeper_node['host']}"
            try:
                logger.info(f"Connecting to the timekeeper node {timekeeper_node['host']}...")
                with _node_session(timekeeper_node, closed) as client:
                    # Modify the .env file with the POT_EXTERNAL_ENTROPY value
                    logger.debug(f"Modifying .env file with POT_EXTERNAL_ENTROPY={args.pot_external_entropy}")
                    modify_env_file(client, subspace_dir, release_version, pot_external_entropy=args.pot_external_entropy)

                    # Start the timekeeper node
                    docker_compose_up(client, subspace_dir)

                logger.info("Timekeeper node started with the updated POT_EXTERNAL_ENTROPY value.")
            except Exception as e:
                logger.error(f"Error during timekeeper node update: {e}")
        else:
            logger.warning("POT_EXTERNAL_ENTROPY not provided for the timekeeper node, skipping update.")

    # Step 3: SSH into the bootstrap node and update GENESIS_HASH, then start it
    bootstrap_started = False
    if protocol_version_hash:
        try:
            logger.info(f"Connecting to the bootstrap node {bootstrap_node['host']} for sudo docker compose down -v...")
            with _node_session(bootstrap_node) as client:
                # Run sudo docker compose down -v for the bootstrap node
                docker_compose_down(client, subspace_dir)

                # Modify .env with the new GENESIS_HASH
                modify_env_file(client, subspace_dir, release_version, genesis_hash=protocol_version_hash)

                # Start the bootstrap node
                docker_compose_up(client, subspace_dir)

            bootstrap_started = True
            logger.info("Bootstrap node started with the updated Genesis Hash.")
        except Exception as e:
            logger.error(f"Error during bootstrap node update: {e}")
    else:
        logger.error("Protocol version hash not found, skipping bootstrap node start.")

    return 0 if bootstrap_started else 1


if __name__ == '__main__':
    # Propagate failure to the shell so automation can detect an incomplete launch.
    raise SystemExit(main())