diff --git a/.github/workflows/pr-node-backup.yml b/.github/workflows/pr-node-backup.yml
new file mode 100644
index 0000000..76a9157
--- /dev/null
+++ b/.github/workflows/pr-node-backup.yml
@@ -0,0 +1,18 @@
+name: check PR (node_backup)
+
+on:
+ pull_request:
+ paths:
+ - roles/node_backup/**
+ - .github/**
+
+jobs:
+ run-molecule-tests:
+ strategy:
+ fail-fast: false
+ matrix:
+ molecule-driver: [lxd, docker]
+ uses: ./.github/workflows/reusable-molecule.yml
+ with:
+      role-name: node_backup
+ molecule-driver: ${{ matrix.molecule-driver }}
\ No newline at end of file
diff --git a/galaxy.yml b/galaxy.yml
index 183fe0c..6bcca8d 100644
--- a/galaxy.yml
+++ b/galaxy.yml
@@ -8,7 +8,7 @@ namespace: paritytech
name: chain
# The version of the collection. Must be compatible with semantic versioning
-version: 1.5.1
+version: 1.6.0
# The path to the Markdown (.md) readme file. This path is relative to the root of the collection
readme: README.md
diff --git a/roles/node_backup/.yamllint b/roles/node_backup/.yamllint
new file mode 100644
index 0000000..8827676
--- /dev/null
+++ b/roles/node_backup/.yamllint
@@ -0,0 +1,33 @@
+---
+# Based on ansible-lint config
+extends: default
+
+rules:
+ braces:
+ max-spaces-inside: 1
+ level: error
+ brackets:
+ max-spaces-inside: 1
+ level: error
+ colons:
+ max-spaces-after: -1
+ level: error
+ commas:
+ max-spaces-after: -1
+ level: error
+ comments: disable
+ comments-indentation: disable
+ document-start: disable
+ empty-lines:
+ max: 3
+ level: error
+ hyphens:
+ level: error
+ indentation: disable
+ key-duplicates: enable
+ line-length: disable
+ new-line-at-end-of-file: disable
+ new-lines:
+ type: unix
+ trailing-spaces: disable
+ truthy: disable
diff --git a/roles/node_backup/README.md b/roles/node_backup/README.md
new file mode 100644
index 0000000..7b3855b
--- /dev/null
+++ b/roles/node_backup/README.md
@@ -0,0 +1,8 @@
+node_backup
+===========
+This role templates out the backup script and the backup Prometheus exporter, and creates the relevant systemd units.
+The nodes deployed on the same instance are ordinary Substrate nodes that simply sync the chain;
+the backup is taken from their local database, so they don't have to do any work other than synchronization.
+A node is stopped while its chain is being backed up, because otherwise the database would keep changing
+during the backup and corrupt it.
+
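+A minimal playbook sketch (the host group, paths, port and bucket name below are illustrative; see `defaults/main.yml` for the full list of variables):
+
+```yaml
+- hosts: backup_hosts
+  become: true
+  roles:
+    - role: paritytech.chain.node_backup
+      vars:
+        node_backup_user: "polkadot"
+        node_backup_targets:
+          - service_name: polkadot-rocksdb-prune
+            local_path: /opt/polkadot-rocksdb-prune/chains/polkadot/db
+            rpc_port: 9934
+            type: "gcp-rclone"
+            bucket_name: "backup"
+```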
diff --git a/roles/node_backup/defaults/main.yml b/roles/node_backup/defaults/main.yml
new file mode 100644
index 0000000..d04cd27
--- /dev/null
+++ b/roles/node_backup/defaults/main.yml
@@ -0,0 +1,40 @@
+---
+
+
+# R2 configuration
+node_backup_r2_access_key_id: ""
+node_backup_r2_secret_access_key: ""
+node_backup_r2_api_url: ""
+
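+# Used as the rclone transfer/upload concurrency when copying backups to the bucket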
+node_backup_max_concurrent_requests: 50
+
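+# systemd OnCalendar expressions; one timer entry is rendered per list item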
+node_backup_schedule:
+ - "*-*-* 01:00:00"
+
+node_backup_user: "polkadot"
+
+node_backup_base_path: "/opt/node_backup"
+node_backup_tmp_path: "/tmp"
+
+# It wipes the local cache of the node-backup exporter.
+# It's useful if you rename or remove some backups in the 'node_backup_targets' variable
+node_backup_wipe_cache_enable: false
+
+# List of the nodes deployed to the host.
+# service_name is used to extract information about the DB type and should follow the pattern:
+#   <node_chain>-[paritydb|rocksdb]-[prune|archive]
+# where `node_chain` is the value of the `node_chain` variable from the `node` role.
+node_backup_targets: []
+# - service_name: polkadot-rocksdb-prune
+# local_path: /opt/polkadot-rocksdb-prune/chains/polkadot/db
+# rpc_port: 9934
+# # old way of backups. It takes more time to restore and backup
+# # it's true by default
+# tar: false
+# # type of backup. can be 'gcp-native', 'gcp-rclone' or 'r2-rclone'
+# type: 'gcp-rclone'
+# # name of the bucket
+# bucket_name: "backup"
+# # the public domain name of the bucket
+# # it's empty by default
+# bucket_domain: "backup.polkadot.io"
\ No newline at end of file
diff --git a/roles/node_backup/files/exporter.py b/roles/node_backup/files/exporter.py
new file mode 100644
index 0000000..338263f
--- /dev/null
+++ b/roles/node_backup/files/exporter.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+import pickle
+import json
+import logging
+import threading
+import traceback
+import io
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from prometheus_client import start_http_server, Gauge
+
+
+LOGGING_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+
+
+cache_filename = os.path.dirname(__file__) + '/exporter.cache'
+
+backup_labels = ['id', 'storage', 'bucket_name', 'service_name', 'version']
+backup_metrics = {
+ "timestamp": Gauge('node_backup_timestamp',
+ 'Time of the last backup (unix timestamp)',
+ backup_labels),
+ "size": Gauge('node_backup_size',
+ 'Size of the last backup (byte)',
+ backup_labels),
+ "last_block": Gauge('node_backup_last_block',
+ 'Last block in the last backup (byte)',
+ backup_labels),
+ "last_backup": Gauge('node_backup_last_backup',
+ 'Last backup',
+ backup_labels + ['backup_name', 'tar_backup_path', 'backup_path']),
+ "total_size": Gauge('node_backup_total_size',
+ 'Size of all backups (byte)',
+ ['storage', 'bucket_name'])
+}
+
+
+def update_cache(key, value):
+ if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 0:
+ with open(cache_filename, 'rb') as f:
+ data = pickle.load(f)
+ else:
+ data = {}
+ data[key] = value
+ with open(cache_filename, 'wb') as f:
+ pickle.dump(data, f)
+
+
+def fetch_cache():
+ if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 0:
+ with open(cache_filename, 'rb') as f:
+ data = pickle.load(f)
+ logging.info(f"Fetched from cache: {data}")
+ return data
+ else:
+ return {}
+
+
+def clean_metrics(id, backup_name, version):
+ """
+ Purge records with old versions
+ """
+
+ def check_record(key_value) -> bool:
+ return (
+ id in key_value['labels'] and
+ key_value['name'] != 'node_backup_total_size' and
+ (
+ (key_value['name'] == 'node_backup_last_backup' and backup_name not in key_value['labels']) or
+ version not in key_value['labels']
+ )
+ )
+
+ for metric in backup_metrics.items():
+ current_metrics=[{'name': i.name, 'labels': list(i.labels.values()), 'value': i.value} for i in metric[1].collect()[0].samples]
+ old_metrics = list(filter(check_record, current_metrics))
+ for old_metric in old_metrics:
+ logging.info(f"clean {old_metric['name']} metric with label set: {str(old_metric['labels'])}")
+ metric[1].remove(*old_metric['labels'])
+
+
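+# The backup script reports a finished backup by POSTing a JSON document to this
+# exporter on port 60101. The expected keys are: storage, bucketName, serviceName,
+# version, backupName, timeStamp, size, lastBlock, totalSize and bucketDomain
+# (bucketDomain may be an empty string for GCP buckets without a public domain).
+# An illustrative request (values are examples only):
+#   curl -H 'Content-Type: application/json' \
+#     -d '{"storage": "gcp", "bucketName": "backup", "serviceName": "polkadot-rocksdb-prune",
+#          "version": "0.9.42", "backupName": "20230101-010000", "timeStamp": "1672531200",
+#          "size": "1000", "lastBlock": "100", "totalSize": "2000", "bucketDomain": ""}' \
+#     http://127.0.0.1:60101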
+def set_metrics(data):
+ id = f"{data['storage']}-{data['bucketName']}-{data['serviceName']}"
+ common_labels={'id': id,
+ 'storage': data['storage'],
+ 'bucket_name': data['bucketName'],
+ 'service_name': data['serviceName'],
+ 'version': data['version']}
+ if data['bucketDomain'] != '':
+ backup_path=f"https://{data['bucketDomain']}/{data['serviceName']}/{data['backupName']}"
+ tar_backup_path=f"https://{data['bucketDomain']}/tar/{data['serviceName']}/{data['backupName']}.tar"
+ elif data['bucketDomain'] == '' and data['storage'] == 'gcp':
+ backup_path=f"gs://{data['bucketName']}/{data['serviceName']}/{data['backupName']}"
+ tar_backup_path=f"https://storage.googleapis.com/{data['bucketName']}/tar/{data['serviceName']}/{data['backupName']}.tar"
+ else:
+ raise Exception("'bucketDomain' has to be defined")
+ clean_metrics(id, data['backupName'], data['version'])
+ backup_metrics['timestamp'].labels(**common_labels).set(int(data['timeStamp']))
+ backup_metrics['size'].labels(**common_labels).set(int(data['size']))
+ backup_metrics['last_block'].labels(**common_labels).set(int(data['lastBlock']))
+ backup_metrics['last_backup'].labels(**common_labels,
+ backup_name=data['backupName'],
+ backup_path=backup_path,
+ tar_backup_path=tar_backup_path).set(1)
+ backup_metrics['total_size'].labels(storage=data['storage'],
+ bucket_name=data['bucketName']).set(int(data['totalSize']))
+ update_cache((data['storage'], data['bucketName'], data['serviceName']), data)
+ logging.info(f"request was processed successfully. data: {data}")
+
+
+class HttpProcessor(BaseHTTPRequestHandler):
+ """
+ HTTP Server
+ """
+ BaseHTTPRequestHandler.server_version = 'Python API'
+
+ def log_message(self, format, *args):
+ message = f"{self.address_string()} {format % args}"
+ logging.info(message)
+
+ def _set_headers(self):
+ self.send_response(200)
+ self.send_header('Content-type', 'application/json; charset=utf-8')
+ self.end_headers()
+
+
+ def do_POST(self):
+ if self.headers.get('Content-Type') != 'application/json':
+ self.send_error(400, "Only application/json supported")
+ self.end_headers()
+ return
+ data = ""
+ try:
+ # read the message and convert it into a python dictionary
+ length = int(self.headers['content-length'])
+ data = self.rfile.read(length)
+
+ set_metrics(json.loads(data))
+
+            self._set_headers()
+ self.wfile.write(json.dumps({"status": "OK"}).encode("utf8"))
+ except json.decoder.JSONDecodeError as e:
+ tb_output = io.StringIO()
+ traceback.print_tb(e.__traceback__, file=tb_output)
+ logging.error(f"JSON decoding error. error: '{e}', JSON: '{data}'")
+ logging.error(f"JSON decoding error. traceback:\n{tb_output.getvalue()}")
+ tb_output.close()
+ self.send_error(400, 'JSONDecodeError')
+ return
+ except Exception as e:
+ tb_output = io.StringIO()
+ traceback.print_tb(e.__traceback__, file=tb_output)
+ logging.error(f"request processing error. error: '{e}'")
+ logging.error(f"request processing error. traceback:\n{tb_output.getvalue()}")
+ tb_output.close()
+ self.send_error(500)
+ return
+
+
+def start_servers():
+ """
+ Start HTTP Servers
+ """
+ # Start up the server to expose the metrics.
+ start_http_server(9109) # Metrics server
+ server_address = ('127.0.0.1', 60101) # Data reception server
+ server = HTTPServer(server_address, HttpProcessor)
+ server.serve_forever()
+
+
+if __name__ == '__main__':
+
+ # set up console log handler
+ console = logging.StreamHandler()
+ console.setLevel(logging.INFO)
+ formatter = logging.Formatter(LOGGING_FORMAT)
+ console.setFormatter(formatter)
+ # set up basic logging config
+ logging.basicConfig(format=LOGGING_FORMAT, level=logging.INFO, handlers=[console])
+
+
+ for backup in fetch_cache().items():
+ try:
+ set_metrics(backup[1])
+ except KeyError as e:
+ logging.error(f"cache fetching error. error: {e}, key: {backup[0]}, value: {backup[1]}")
+ except Exception as e:
+ tb_output = io.StringIO()
+ traceback.print_tb(e.__traceback__, file=tb_output)
+ logging.error(f"cache fetching error. error: '{e}'")
+ logging.error(f"cache fetching error. traceback:\n{tb_output.getvalue()}")
+ tb_output.close()
+ sys.exit(1)
+
+ thread = threading.Thread(target=start_servers, args=())
+ thread.daemon = True
+ thread.start()
+ thread.join()
diff --git a/roles/node_backup/handlers/main.yml b/roles/node_backup/handlers/main.yml
new file mode 100644
index 0000000..129b520
--- /dev/null
+++ b/roles/node_backup/handlers/main.yml
@@ -0,0 +1,15 @@
+---
+
+- name: restart node-backup exporter
+ ansible.builtin.systemd:
+ name: "node-backup-exporter"
+ state: restarted
+ enabled: true
+ daemon_reload: true
+
+- name: restart node-backup timer
+ ansible.builtin.systemd:
+ name: "node-backup.timer"
+ state: restarted
+ enabled: true
+ daemon_reload: true
\ No newline at end of file
diff --git a/roles/node_backup/molecule/default/README.md b/roles/node_backup/molecule/default/README.md
new file mode 100644
index 0000000..cde3444
--- /dev/null
+++ b/roles/node_backup/molecule/default/README.md
@@ -0,0 +1,25 @@
+### Collection
+
+Molecule should install the collection automatically. If it did not happen, run:
+```commandline
+mkdir molecule/default/collections
+ansible-galaxy collection install -f -r molecule/default/collections.yml -p ./molecule/default/collections
+```
+
+### Molecule
+#### Docker
+Test role with docker driver
+```shell
+molecule create
+molecule converge
+molecule destroy
+```
+
+#### LXD
+Test role with LXD driver
+```shell
+DRIVER=lxd molecule create
+DRIVER=lxd molecule converge
+DRIVER=lxd molecule destroy
+```
+
diff --git a/roles/node_backup/molecule/default/collections.yml b/roles/node_backup/molecule/default/collections.yml
new file mode 100644
index 0000000..88bc7f3
--- /dev/null
+++ b/roles/node_backup/molecule/default/collections.yml
@@ -0,0 +1,4 @@
+collections:
+ - name: https://github.com/paritytech/ansible-galaxy.git
+ type: git
+ version: main
\ No newline at end of file
diff --git a/roles/node_backup/molecule/default/converge.yml b/roles/node_backup/molecule/default/converge.yml
new file mode 100644
index 0000000..e860493
--- /dev/null
+++ b/roles/node_backup/molecule/default/converge.yml
@@ -0,0 +1,7 @@
+---
+- name: Converge
+ hosts: all
+ tasks:
+ - name: "Include node backup"
+ ansible.builtin.include_role:
+ name: "node_backup"
diff --git a/roles/node_backup/molecule/default/group_vars/all.yml b/roles/node_backup/molecule/default/group_vars/all.yml
new file mode 100644
index 0000000..578dd8b
--- /dev/null
+++ b/roles/node_backup/molecule/default/group_vars/all.yml
@@ -0,0 +1,35 @@
+## Molecule
+ansible_user: root
+
+## prepare.yml
+#node_legacy_rpc_flags: false
+node_binary: "https://github.com/paritytech/polkadot/releases/download/v0.9.42/polkadot"
+node_chain: "rococo-local"
+node_data_root_path: "/opt/{{ node_app_name }}"
+node_chain_backup_restoring_type: "none"
+node_pruning: 256
+# node_binary_deployment: false
+
+# node_backup
+_gcp_bucket: test-blockstore-backups
+node_backup_user: "parity"
+node_backup_r2_access_key_id: "abc"
+node_backup_r2_secret_access_key: "cba"
+node_backup_r2_api_url: "https://a.b"
+node_backup_targets:
+ - service_name: rococo-alice-rocksdb-prune
+ local_path: /opt/rococo-alice-rocksdb-prune/chains/rococo_local_testnet/db
+ rpc_port: 9933
+ bucket_name: "{{ _gcp_bucket }}"
+ type: "gcp-native"
+ - service_name: rococo-bob-paritydb-prune
+ local_path: /opt/rococo-bob-paritydb-prune/chains/rococo_local_testnet/paritydb
+ rpc_port: 9934
+ bucket_name: "{{ _gcp_bucket }}"
+ type: "gcp-rclone"
+ - service_name: rococo-bob-paritydb-prune
+ local_path: /opt/rococo-bob-paritydb-prune/chains/rococo_local_testnet/paritydb
+ rpc_port: 9934
+ bucket_name: "{{ _gcp_bucket }}"
+ type: "r2-rclone"
+ bucket_domain: "c.d"
\ No newline at end of file
diff --git a/roles/node_backup/molecule/default/molecule.yml b/roles/node_backup/molecule/default/molecule.yml
new file mode 100644
index 0000000..4e44ecf
--- /dev/null
+++ b/roles/node_backup/molecule/default/molecule.yml
@@ -0,0 +1,31 @@
+---
+dependency:
+ name: galaxy
+driver:
+ name: ${DRIVER:-docker}
+platforms:
+ - name: molecule-instance-node-backup
+ # LXD
+ source:
+ alias: debian/bullseye/amd64
+ # DOCKER
+ image: "paritytech/debian11:latest"
+ command: ${MOLECULE_DOCKER_COMMAND:-""}
+ privileged: true
+ pre_build_image: true
+
+provisioner:
+ name: ansible
+ options:
+ diff: True
+ config_options:
+ defaults:
+ callbacks_enabled: timer
+verifier:
+ name: ansible
+ options:
+ diff: True
+lint: |
+ set -e
+ yamllint .
+ ansible-lint
diff --git a/roles/node_backup/molecule/default/prepare.yml b/roles/node_backup/molecule/default/prepare.yml
new file mode 100644
index 0000000..15e45c7
--- /dev/null
+++ b/roles/node_backup/molecule/default/prepare.yml
@@ -0,0 +1,43 @@
+- name: Prepare
+ hosts: all
+ gather_facts: false
+ pre_tasks:
+ - name: Install Python3
+ ansible.builtin.raw: apt -y update && apt install -y python3
+ changed_when: false
+ - name: Prepare | create user parity
+ ansible.builtin.user:
+ name: parity
+ tasks:
+ - name: "rococo-alice local"
+ ansible.builtin.include_role:
+ name: parity.chain.node
+ vars:
+ node_rpc_port: 9933
+ node_app_name: "rococo-alice-rocksdb-prune"
+ node_custom_options:
+ - "--alice"
+ - name: "rococo-bob local"
+ ansible.builtin.include_role:
+ name: parity.chain.node
+ vars:
+ node_rpc_port: 9934
+ node_paritydb_enable: true
+ node_app_name: "rococo-bob-paritydb-prune"
+ node_custom_options:
+ - "--bob"
+ - name: Pretend we are in gcp | Install cron, gnupg
+ ansible.builtin.package:
+ name:
+ - cron
+ - gnupg
+ state: present
+ update_cache: true
+ - name: Pretend we are in gcp | Add an Apt signing key
+ ansible.builtin.apt_key:
+ url: https://packages.cloud.google.com/apt/doc/apt-key.gpg
+ state: present
+ - name: Pretend we are in gcp | Add apt repository into sources list
+ ansible.builtin.apt_repository:
+ repo: deb https://packages.cloud.google.com/apt cloud-sdk main
+ state: present
diff --git a/roles/node_backup/molecule/default/verify.yml b/roles/node_backup/molecule/default/verify.yml
new file mode 100644
index 0000000..8161fd5
--- /dev/null
+++ b/roles/node_backup/molecule/default/verify.yml
@@ -0,0 +1,47 @@
+---
+- name: Verify
+ hosts: all
+ gather_facts: false
+ tasks:
+ - name: wait until ~10 blocks created
+ ansible.builtin.uri:
+ url: "http://127.0.0.1:9933"
+ method: "POST"
+ body_format: "json"
+ body:
+ id: 1
+ jsonrpc: "2.0"
+ method: "chain_getHeader"
+ params: []
+ return_content: true
+ register: _node_backup_register_header
+ until: _node_backup_register_header.json.result.number | int(base=16) > 10
+ retries: 10
+ delay: 10
+
+ - name: Print current block
+ ansible.builtin.debug:
+ var: _node_backup_register_header.json.result.number | int(base=16)
+# # todo add tests
+#
+## a) upload to gcp
+# A GCP storage emulator is not available yet (https://github.com/googleapis/google-cloud-python/issues/10300);
+# there are third-party emulators, but gsutil support is broken (https://github.com/oittaa/gcp-storage-emulator/issues/186).
+# When an emulator becomes available:
+# 1. run and configure emulator
+# 2. run script:
+# - name: run backup script
+# ansible.builtin.command: /home/parity/bin/node_backup.sh
+# 3.
+# - name: "rococo-bob local"
+# ansible.builtin.include_role:
+# name: parity.chain.node
+# vars:
+# node_rpc_port: 9935
+# node_paritydb_enable: true
+# node_app_name: "rococo-local-rpc"
+#
+## b) Test backup-exporter:
+# We can push fake data to backup-exporter (like run bash script).
+# Then we can check the Prometheus endpoint to check and match the results.
+# This will allow checking the code of the exporter.
diff --git a/roles/node_backup/tasks/exporter.yml b/roles/node_backup/tasks/exporter.yml
new file mode 100644
index 0000000..8e1e659
--- /dev/null
+++ b/roles/node_backup/tasks/exporter.yml
@@ -0,0 +1,37 @@
+---
+
+- name: node-backup | exporter | remove the cache file
+ ansible.builtin.file:
+ path: "{{ _node_backup_exporter_cache_file }}"
+ state: absent
+ notify: restart node-backup exporter
+ when: node_backup_wipe_cache_enable | bool
+
+- name: node-backup | exporter | copy exporter file
+ ansible.builtin.copy:
+ src: "exporter.py"
+ dest: "{{ _node_backup_exporter_file }}"
+ mode: 0755
+ owner: "{{ node_backup_user }}"
+ group: "{{ node_backup_user }}"
+ notify: restart node-backup exporter
+
+- name: node-backup | exporter | copy exporter systemd unit file
+ ansible.builtin.template:
+ src: "node-backup-exporter.service.j2"
+ dest: "/etc/systemd/system/node-backup-exporter.service"
+ owner: "root"
+ group: "root"
+ mode: "0644"
+ notify: restart node-backup exporter
+
+ # to avoid 2 restarts during the first deploy
+- name: node-backup | exporter | flush handlers
+ ansible.builtin.meta: flush_handlers
+
+- name: node-backup | exporter | start exporter service
+ ansible.builtin.systemd:
+ name: "node-backup-exporter"
+ state: started
+ enabled: true
+ daemon_reload: true
diff --git a/roles/node_backup/tasks/job.yml b/roles/node_backup/tasks/job.yml
new file mode 100644
index 0000000..f0379e4
--- /dev/null
+++ b/roles/node_backup/tasks/job.yml
@@ -0,0 +1,51 @@
+---
+
+- name: node-backup | job | set _node_backup_targets variable 1
+ ansible.builtin.set_fact:
+ _node_backup_targets: []
+
+- name: node-backup | job | set _node_backup_targets variable 2
+ ansible.builtin.set_fact:
+ _node_backup_targets: "{{ _node_backup_targets +
+ [ item | combine({'id': _node_backup_id}, recursive=True) ] }}"
+ vars:
+ _node_backup_id: "{{ (_node_backup_storages[item.type] + '-' + item.bucket_name + '-' + item.service_name) | regex_replace('[^0-9a-zA-Z]+', '-') }}"
+ loop: "{{ node_backup_targets }}"
+
+- name: node-backup | job | copy single backup scripts
+ ansible.builtin.template:
+ src: "single-backup.sh.j2"
+ dest: "{{ _node_backup_scripts_path }}/{{ item.id }}.sh"
+ mode: 0755
+ owner: "root"
+ group: "root"
+ loop: "{{ _node_backup_targets }}"
+ tags: ['node-backup-test']
+
+- name: node-backup | job | copy common backup script
+ ansible.builtin.template:
+ src: "common-backup.sh.j2"
+ dest: "{{ _node_backup_scripts_path }}/common.sh"
+ mode: 0755
+ owner: "root"
+ group: "root"
+ tags: ['node-backup-test']
+
+- name: node-backup | job | copy backup systemd unit files
+ ansible.builtin.template:
+ src: "{{ item }}.j2"
+ dest: "/etc/systemd/system/{{ item }}"
+ owner: "root"
+ group: "root"
+ mode: "0644"
+ loop:
+ - "node-backup.service"
+ - "node-backup.timer"
+ notify: restart node-backup timer
+
+- name: node-backup | job | enable timer
+ ansible.builtin.systemd:
+ name: "node-backup.timer"
+ state: started
+ enabled: true
+ daemon_reload: true
diff --git a/roles/node_backup/tasks/main.yml b/roles/node_backup/tasks/main.yml
new file mode 100644
index 0000000..38a617d
--- /dev/null
+++ b/roles/node_backup/tasks/main.yml
@@ -0,0 +1,49 @@
+---
+
+- name: node-backup | tests
+ ansible.builtin.include_tasks:
+ file: tests.yml
+ apply:
+ tags: ['node-backup', 'node-backup-tests']
+ tags: ['node-backup', 'node-backup-tests']
+
+- name: node-backup | create directories
+ ansible.builtin.file:
+ path: "{{ item.path }}"
+ state: directory
+ mode: "0755"
+ owner: "{{ item.user }}"
+ group: "{{ item.user }}"
+ loop:
+ - path: "{{ node_backup_base_path }}"
+ user: root
+ - path: "{{ _node_backup_scripts_path }}"
+ user: root
+ - path: "{{ _node_backup_exporter_path }}"
+ user: "{{ node_backup_user }}"
+ - path: "{{ _node_backup_log_path }}"
+ user: root
+ - path: "{{ _node_backup_venv_path }}"
+ user: root
+ tags: [ 'node-backup' ]
+
+- name: node-backup | requirements
+ ansible.builtin.include_tasks:
+ file: requirements.yml
+ apply:
+ tags: [ 'node-backup', 'node-backup-requirements' ]
+ tags: [ 'node-backup', 'node-backup-requirements' ]
+
+- name: node-backup | job
+ ansible.builtin.include_tasks:
+ file: job.yml
+ apply:
+ tags: [ 'node-backup', 'node-backup-job' ]
+ tags: [ 'node-backup', 'node-backup-job' ]
+
+- name: node-backup | exporter
+ ansible.builtin.include_tasks:
+ file: exporter.yml
+ apply:
+ tags: [ 'node-backup', 'node-backup-exporter' ]
+ tags: [ 'node-backup', 'node-backup-exporter' ]
diff --git a/roles/node_backup/tasks/requirements.yml b/roles/node_backup/tasks/requirements.yml
new file mode 100644
index 0000000..9207623
--- /dev/null
+++ b/roles/node_backup/tasks/requirements.yml
@@ -0,0 +1,48 @@
+---
+
+- name: node-backup | requirements | install packages
+ ansible.builtin.package:
+ name: "{{ packages }}"
+ state: present
+ update_cache: true
+ vars:
+ packages:
+ - "curl"
+ - "jq"
+ - "expect"
+ - "moreutils"
+ - "python3-venv"
+ - "python3-setuptools"
+
+
+- name: node-backup | requirements | install Python modules
+ ansible.builtin.pip:
+ name:
+ - "prometheus-client==0.17.0"
+ virtualenv: "{{ _node_backup_venv_path }}"
+ virtualenv_command: "python3 -m venv"
+ notify: restart node-backup exporter
+
+- name: node-backup | requirements | configure rclone
+ block:
+
+ - name: node-backup | requirements | install rclone
+ ansible.builtin.apt:
+ deb: "{{ _node_backup_rclone_deb }}"
+
+  - name: node-backup | requirements | create rclone config directory
+ ansible.builtin.file:
+ path: "/root/.config/rclone"
+ state: directory
+ mode: 0700
+ owner: "root"
+ group: "root"
+
+ - name: node-backup | requirements | copy R2 config
+ ansible.builtin.template:
+ src: "rclone/rclone.conf.j2"
+ dest: "/root/.config/rclone/rclone.conf"
+ owner: "root"
+ group: "root"
+ mode: 0600
+ when: node_backup_targets | json_query('[].type') | intersect(_node_backup_rclone_types) | length > 0
diff --git a/roles/node_backup/tasks/tests.yml b/roles/node_backup/tasks/tests.yml
new file mode 100644
index 0000000..167d119
--- /dev/null
+++ b/roles/node_backup/tasks/tests.yml
@@ -0,0 +1,31 @@
+---
+
+- name: node-backup | test | check R2 configuration
+ ansible.builtin.fail:
+ msg: "If the R2 backups are used, 'node_backup_r2_access_key_id', 'node_backup_r2_secret_access_key' and 'node_backup_r2_api_url' variables have to be specified"
+ when: node_backup_targets | json_query('[].type') | intersect(_node_backup_r2_types) | length > 0 and
+ ( node_backup_r2_access_key_id == '' or
+ node_backup_r2_secret_access_key == '' or
+ node_backup_r2_api_url == ''
+ )
+
+- name: node-backup | test | check variables
+ ansible.builtin.fail:
+ msg: "'service_name', 'rpc_port', 'type' and 'bucket_name' fields have to be specified for each item in 'node_backup_targets'"
+  when: item.service_name | default('') == '' or
+        item.rpc_port | default('') == '' or
+        item.type | default('') == '' or
+        item.bucket_name | default('') == ''
+ loop: "{{ node_backup_targets }}"
+
+- name: node-backup | test | check R2 backups
+ ansible.builtin.fail:
+ msg: "the 'bucket_domain' field has to be specified for R2 backups"
+  when: item.type in _node_backup_r2_types and item.bucket_domain | default('') == ''
+ loop: "{{ node_backup_targets }}"
+
+- name: node-backup | test | check backup types
+ ansible.builtin.fail:
+ msg: "{{ item.type }} is not a valid backup type"
+ when: item.type not in (_node_backup_gcp_types + _node_backup_r2_types)
+ loop: "{{ node_backup_targets }}"
diff --git a/roles/node_backup/templates/common-backup.sh.j2 b/roles/node_backup/templates/common-backup.sh.j2
new file mode 100644
index 0000000..c0eafb3
--- /dev/null
+++ b/roles/node_backup/templates/common-backup.sh.j2
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+{% for target in _node_backup_targets %}
+now=$(date +"%Y%m%d-%H%M%S")
+unbuffer bash {{ _node_backup_scripts_path }}/{{ target.id }}.sh "${now}" 2>&1 | tee "{{ _node_backup_log_path }}/{{ target.service_name }}-${now}.txt"
+{% endfor %}
\ No newline at end of file
diff --git a/roles/node_backup/templates/node-backup-exporter.service.j2 b/roles/node_backup/templates/node-backup-exporter.service.j2
new file mode 100644
index 0000000..81df6d3
--- /dev/null
+++ b/roles/node_backup/templates/node-backup-exporter.service.j2
@@ -0,0 +1,12 @@
+[Unit]
+Description=Node backup exporter systemd service
+
+[Service]
+Environment=PYTHONUNBUFFERED=True
+ExecStart={{ _node_backup_venv_path }}/bin/python3 {{ _node_backup_exporter_file }}
+Restart=always
+User={{ node_backup_user }}
+Group={{ node_backup_user }}
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/node_backup/templates/node-backup.service.j2 b/roles/node_backup/templates/node-backup.service.j2
new file mode 100644
index 0000000..8c06940
--- /dev/null
+++ b/roles/node_backup/templates/node-backup.service.j2
@@ -0,0 +1,6 @@
+[Unit]
+Description=Node backup systemd service
+
+[Service]
+Type=oneshot
+ExecStart={{ _node_backup_scripts_path }}/common.sh
diff --git a/roles/node_backup/templates/node-backup.timer.j2 b/roles/node_backup/templates/node-backup.timer.j2
new file mode 100644
index 0000000..c1b5167
--- /dev/null
+++ b/roles/node_backup/templates/node-backup.timer.j2
@@ -0,0 +1,11 @@
+[Unit]
+Description=Node backup systemd timer
+
+[Timer]
+{% for time in node_backup_schedule %}
+OnCalendar={{ time }}
+{% endfor %}
+Persistent=true
+
+[Install]
+WantedBy=timers.target
diff --git a/roles/node_backup/templates/rclone/rclone.conf.j2 b/roles/node_backup/templates/rclone/rclone.conf.j2
new file mode 100644
index 0000000..9d704f9
--- /dev/null
+++ b/roles/node_backup/templates/rclone/rclone.conf.j2
@@ -0,0 +1,19 @@
+{% if node_backup_targets | json_query('[].type') | intersect(_node_backup_r2_types) | length > 0 %}
+[R2backups]
+type = s3
+provider = Cloudflare
+access_key_id = {{ node_backup_r2_access_key_id }}
+secret_access_key = {{ node_backup_r2_secret_access_key }}
+endpoint = {{ node_backup_r2_api_url }}
+acl = private
+upload_cutoff = 1024M
+upload_concurrency = {{ node_backup_max_concurrent_requests }}
+chunk_size = 256M
+{% endif %}
+
+{% if node_backup_targets | json_query('[].type') | intersect(_node_backup_gcp_types) | length > 0 %}
+[GCPbackups]
+type = google cloud storage
+bucket_policy_only = true
+{% endif %}
+
diff --git a/roles/node_backup/templates/single-backup.sh.j2 b/roles/node_backup/templates/single-backup.sh.j2
new file mode 100644
index 0000000..f5a78a7
--- /dev/null
+++ b/roles/node_backup/templates/single-backup.sh.j2
@@ -0,0 +1,218 @@
+#!/usr/bin/env bash
+
+# Do not remove this: any failed command could leave an inconsistent backup.
+set -eu -o pipefail
+
+echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Backup $0 Started!\n---\n"
+
+tmp_meta_file="{{ node_backup_tmp_path }}/{{ item.service_name }}.meta.txt"
+tmp_latest_version_file="{{ node_backup_tmp_path }}/{{ item.service_name }}_latest_version.meta.txt"
+
+set -x
+systemctl start {{ item.service_name }}
+set +x
+
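+# Wait until the node reports '"isSyncing": false' via the system_health RPC call
+# (up to 20 attempts, one per minute); otherwise the backup is skipped.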
+counter=1
+curl_result=""
+
+until echo ${curl_result} | grep 'false'
+do
+ if [ $counter -gt 20 ];then
+ echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) the health check is failed for '{{ item.service_name }}' service. The backup will be skipped!\n---\n"
+ false
+ fi
+ echo -e "Run health-check ${counter}..."
+ set -x
+ curl_result=$(curl --retry 3 --retry-delay 60 --retry-connrefused -s -X POST -H "Content-Type: application/json" \
+ -d '{"id":1, "jsonrpc":"2.0", "method": "system_health", "params":[]}' \
+ http://127.0.0.1:{{ item.rpc_port }} | jq '.["result"]["isSyncing"]')
+ set +x
+ if [ $counter -gt 1 ];then
+ sleep 60
+ fi
+ let "counter+=1"
+done
+
+set -x
+last_block=$(curl --retry 3 --retry-connrefused --retry-delay 60 -X POST -H "Content-Type: application/json" \
+ -d '{"id":1, "jsonrpc":"2.0", "method": "system_syncState", "params":[]}' \
+ http://127.0.0.1:{{ item.rpc_port }} \
+ | jq '.["result"]["currentBlock"]')
+
+version=$(curl --retry 3 --retry-connrefused --retry-delay 60 -X POST -H "Content-Type: application/json" \
+ -d '{"id":1, "jsonrpc":"2.0", "method": "system_version", "params":[]}' \
+ http://127.0.0.1:{{ item.rpc_port }} \
+ | jq '.["result"]')
+set +x
+version=${version%\"}
+version=${version#\"}
+time_stamp=$(date +"%s")
+
+SECONDS=0
+
+# The database would be modified during the backup and potentially corrupt it, so we
+# need to stop the unit and start it again after the backup.
+set -x
+systemctl stop {{ item.service_name }}
+set +x
+
+# Get the list of local files
+local_files=/tmp/local-files-{{ item.service_name }}-${1}
+remote_files=/tmp/remote-files-{{ item.service_name }}-${1}
+find {{ item.local_path }} -mindepth 1 -type f | sed "s|{{ item.local_path }}||g" | sed 's/^\/*//' | sort > ${local_files}
+
+{% if item.type == 'gcp-native' %}
+echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Start the '{{ item.id }}' backup\n---\n"
+set -x
+gcloud storage \
+ cp -r {{ item.local_path }} gs://{{ item.bucket_name }}/{{ item.service_name }}/${1}
+
+# Get the list of files in the bucket
+gcloud storage ls -r gs://{{ item.bucket_name }}/{{ item.service_name }}/${1} | grep -vF '/:' \
+ | sed "s|gs://{{ item.bucket_name }}/{{ item.service_name }}/${1}/||g" | grep . | sort > ${remote_files}
+set +x
+
+# Check if remote version matches the local one
+if ! diff ${remote_files} ${local_files} -q; then
+ echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) The contents of the remote bucket does not match the local copy for the '{{ item.id }}' backup. Cleaning the remote backup...\n---\n"
+ set -x
+ gcloud storage rm -r gs://{{ item.bucket_name }}/{{ item.service_name }}/${1}
+ set +x
+ echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Show diff and exit!\n---\n"
+ set -x
+ diff ${remote_files} ${local_files}
+ rm -f ${remote_files} ${local_files}
+ set +x
+ exit 1
+fi
+
+set -x
+gcloud storage \
+ cp ${remote_files} gs://{{ item.bucket_name }}/{{ item.service_name }}/${1}/files.txt
+rm -f ${remote_files}
+size=$(gsutil \
+ du -s gs://{{ item.bucket_name }}/{{ item.service_name }}/${1} | awk '{ print $1 }' )
+
+echo -e "size: ${size}\nlastBlock: ${last_block}\nversion: ${version}" > ${tmp_meta_file}
+gcloud storage \
+ cp ${tmp_meta_file} gs://{{ item.bucket_name }}/{{ item.service_name }}/${1}.meta.txt
+rm -f ${tmp_meta_file}
+
+echo "${1}" > ${tmp_latest_version_file}
+gcloud storage \
+ cp ${tmp_latest_version_file} gs://{{ item.bucket_name }}/{{ item.service_name }}/latest_version.meta.txt
+rm -f ${tmp_latest_version_file}
+set +x
+echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Completed the '{{ item.id }}' backup in ${SECONDS} seconds\n---\n"
+
+{% if item.tar | default(true) %}
+SECONDS=0
+echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Start the '{{ item.id }}' TAR backup\n---\n"
+set -x
+tar -cf - {{ item.local_path }} | gcloud storage \
+ cp - gs://{{ item.bucket_name }}/tar/{{ item.service_name }}/${1}.tar
+echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Completed the '{{ item.id }}' TAR backup in ${SECONDS} seconds\n---\n"
+set +x
+{% endif %}
+
+set -x
+total_size=$(gsutil \
+ du -s gs://{{ item.bucket_name }} | awk '{ print $1 }' )
+set +x
+{% endif %}
+
+
+{% if item.type in _node_backup_rclone_types %}
+echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Start the '{{ item.id }}' backup\n---\n"
+
+{% if item.type == 'gcp-rclone' %}
+remote="GCPbackups"
+{% elif item.type == 'r2-rclone' %}
+remote="R2backups"
+{% else %}
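+{# unreachable in practice: force a template rendering error if the type is not a known rclone type #}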
+{{ "backup type must be defined."/0 }}
+{% endif %}
+
+set -x
+LATEST_BACKUP=$(rclone cat ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/latest_version.meta.txt)
+if [ -n "$LATEST_BACKUP" ]; then
+ rclone copy -v --transfers={{ node_backup_max_concurrent_requests }} \
+ --contimeout=10m --retries 10 --retries-sleep 60 --error-on-no-transfer --fast-list --checksum \
+ ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${LATEST_BACKUP} \
+ ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${1}
+ echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Completed copying of the latest backup for the '{{ item.id }}' backup in ${SECONDS} seconds\n---\n"
+ SECONDS=0
+fi
+rclone sync -v --transfers={{ node_backup_max_concurrent_requests }} \
+ --contimeout=10m --retries 10 --retries-sleep 60 --error-on-no-transfer \
+ --update --fast-list --delete-during --disable-http2 --no-gzip-encoding \
+ {{ item.local_path }} ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${1}
+
+# Get the list of files in the bucket
+rclone lsf -R --fast-list --files-only \
+ ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${1} | sort > ${remote_files}
+set +x
+
+# Check if remote version matches the local one
+if ! diff ${remote_files} ${local_files} -q; then
+ echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) The contents of the remote bucket does not match the local copy for the '{{ item.id }}' backup. Cleaning the remote backup...\n---\n"
+ set -x
+ rclone purge -v --contimeout=10m --retries 10 --retries-sleep 60 --fast-list \
+ ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${1}
+ set +x
+ echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Show diff and exit!\n---\n"
+ set -x
+ diff ${remote_files} ${local_files}
+ rm -f ${remote_files} ${local_files}
+ set +x
+ exit 1
+fi
+
+set -x
+rclone copyto -v \
+ ${remote_files} ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${1}/files.txt
+rm -f ${remote_files}
+
+size=$(rclone size --json ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${1} | jq '.bytes')
+
+echo -e "size: ${size}\nlastBlock: ${last_block}\nversion: ${version}" > ${tmp_meta_file}
+rclone copyto -v \
+ ${tmp_meta_file} ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${1}.meta.txt
+rm -f ${tmp_meta_file}
+
+echo "${1}" > ${tmp_latest_version_file}
+rclone copyto -v \
+ ${tmp_latest_version_file} ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/latest_version.meta.txt
+rm -f ${tmp_latest_version_file}
+set +x
+echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Completed the '{{ item.id }}' backup in ${SECONDS} seconds\n---\n"
+
+{% if item.tar | default(true) %}
+SECONDS=0
+echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Start the '{{ item.id }}' TAR backup\n---\n"
+set -x
+tar -cf - {{ item.local_path }} | rclone rcat -v --contimeout=10m --retries 10 --retries-sleep 60 --error-on-no-transfer \
+ --transfers=1 --disable-http2 \
+ ${remote}:{{ item.bucket_name }}/tar/{{ item.service_name }}/${1}.tar
+set +x
+echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Completed the '{{ item.id }}' TAR backup in ${SECONDS} seconds\n---\n"
+{% endif %}
+
+set -x
+total_size=$(rclone size --json ${remote}:{{ item.bucket_name }} | jq '.bytes')
+set +x
+{% endif %}
+
+echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Notify the backup exporter about the latest successful backup\n---\n"
+set -x
+curl --retry 3 --retry-connrefused --retry-delay 60 -X POST -H "Content-Type: application/json" -d \
+ '{"serviceName":"{{ item.service_name }}", "backupName": "'$1'", "timeStamp": "'$time_stamp'",
+ "size": "'$size'", "totalSize": "'$total_size'", "lastBlock": "'$last_block'", "version": "'$version'",
+ "storage": "{{ _node_backup_storages[item.type] }}", "bucketName": "{{ item.bucket_name }}", "bucketDomain": "{{ item.bucket_domain | default("") }}"}' \
+ http://127.0.0.1:60101
+
+rm -f ${local_files}
+systemctl start {{ item.service_name }}
+set +x
+
+echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Backup $0 Finished!\n---\n"
diff --git a/roles/node_backup/vars/main.yml b/roles/node_backup/vars/main.yml
new file mode 100644
index 0000000..6c7a881
--- /dev/null
+++ b/roles/node_backup/vars/main.yml
@@ -0,0 +1,17 @@
+---
+
+_node_backup_scripts_path: "{{ node_backup_base_path }}/scripts"
+_node_backup_log_path: "{{ node_backup_base_path }}/logs"
+_node_backup_venv_path: "{{ node_backup_base_path }}/venv"
+_node_backup_exporter_path: "{{ node_backup_base_path }}/exporter"
+_node_backup_exporter_file: "{{ _node_backup_exporter_path }}/exporter.py"
+_node_backup_exporter_cache_file: "{{ _node_backup_exporter_path }}/exporter.cache"
+_node_backup_rclone_deb: "https://downloads.rclone.org/v1.63.1/rclone-v1.63.1-linux-amd64.deb"
+
+_node_backup_r2_types: ["r2-rclone"]
+_node_backup_gcp_types: ["gcp-native", "gcp-rclone"]
+_node_backup_rclone_types: ["gcp-rclone", "r2-rclone"]
+_node_backup_storages:
+ r2-rclone: r2
+ gcp-rclone: gcp
+ gcp-native: gcp
\ No newline at end of file
diff --git a/roles/state_exporter/defaults/main.yml b/roles/state_exporter/defaults/main.yml
new file mode 100644
index 0000000..ba4a080
--- /dev/null
+++ b/roles/state_exporter/defaults/main.yml
@@ -0,0 +1,6 @@
+---
+
+state_exporter_name: "state-exporter"
+state_exporter_user: "parity"
+state_exporter_file: "/home/{{ state_exporter_user }}/bin/{{ state_exporter_name }}.py"
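+# If true, the exporter is started with the 'debug' argument and logs at DEBUG level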
+state_exporter_debug: false
diff --git a/roles/state_exporter/files/exporter.py b/roles/state_exporter/files/exporter.py
new file mode 100644
index 0000000..22bc130
--- /dev/null
+++ b/roles/state_exporter/files/exporter.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import schedule
+import time
+import sys
+import os
+import logging
+import traceback
+from prometheus_client import start_http_server, Gauge
+import psutil
+
+LOGGING_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+
+node_chain_folders = {
+ 'polkadot': 'polkadot',
+ 'kusama': 'ksmcc3',
+ 'westend': 'westend2',
+ 'rococo': 'rococo_v1_12'
+}
+
+process_metrics = {
+ 'polkadot_state_process_cmdline': Gauge(
+ 'polkadot_state_process_cmdline',
+ 'cmdline of a node process',
+ ['name', 'pid', 'cmd_line']),
+    'polkadot_state_process_threads': Gauge(
+        'polkadot_state_process_threads',
+        'number of threads of a node process',
+        ['name', 'pid']),
+    'polkadot_state_process_memory': Gauge(
+        'polkadot_state_process_memory',
+        'memory used by a node process (RSS, bytes)',
+        ['name', 'pid']),
+    'polkadot_state_process_cpu_percent': Gauge(
+        'polkadot_state_process_cpu_percent',
+        'CPU usage of a node process (percent)',
+        ['name', 'pid'])
+}
+
+node_metrics = {
+ 'polkadot_state_node_session_key': Gauge(
+ 'polkadot_state_node_session_key',
+ 'session key of a node',
+ ['name', 'pid', 'session_key'])
+}
+
+PORT = 9110
+
+
+def update_metrics():
+ processes = {}
+
+ for proc in psutil.process_iter():
+ try:
+ process_cmdline = proc.cmdline()
+ if not (len(process_cmdline) > 1 and '--name' in process_cmdline and '--chain' in process_cmdline):
+ continue
+ process_chain = process_cmdline[::-1][process_cmdline[::-1].index('--chain') - 1]
+ process_name = process_cmdline[::-1][process_cmdline[::-1].index('--name') - 1]
+ process_pid = proc.pid
+ process_base_path = process_cmdline[::-1][process_cmdline[::-1].index('--base-path') - 1]\
+ if '--base-path' in process_cmdline else None
+ # It will delete the previous process if
+ # it's the parent of the current process (it can be docker, bash, etc.)
+ if process_name in processes and processes[process_name]['pid'] < process_pid:
+ del processes[process_name]
+ processes[process_name] = {'pid': process_pid,
+ 'chain': process_chain,
+ 'cmd_line': ' '.join(process_cmdline[1:]),
+ 'threads': proc.num_threads(),
+ 'memory': proc.memory_info().rss,
+ 'cpu_percent': proc.cpu_percent(),
+ 'base_path': process_base_path
+ }
+ except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+ pass
+ except Exception as e:
+ logger.error(e)
+ logger.error(traceback.print_tb(e.__traceback__))
+ return
+ logger.debug('processes were found: ' + str(processes))
+
+ try:
+ # wipe metrics
+ for metric in {**process_metrics, **node_metrics}.items():
+ for sample in metric[1].collect()[0].samples:
+ metric[1].remove(*list(sample.labels.values()))
+
+ for proc in processes:
+ process_metrics['polkadot_state_process_cmdline'].labels(
+ name=proc,
+ pid=processes[proc]['pid'],
+ cmd_line=processes[proc]['cmd_line']).set(1)
+ process_metrics['polkadot_state_process_threads'].labels(
+ name=proc,
+ pid=processes[proc]['pid']).set(processes[proc]['threads'])
+ process_metrics['polkadot_state_process_memory'].labels(
+ name=proc,
+ pid=processes[proc]['pid']).set(processes[proc]['memory'])
+ process_metrics['polkadot_state_process_cpu_percent'].labels(
+ name=proc,
+ pid=processes[proc]['pid']).set(processes[proc]['cpu_percent'])
+ if processes[proc]['base_path']:
+ keystore_path = os.path.join(
+ processes[proc]['base_path'],
+ 'chains',
+ node_chain_folders[processes[proc]['chain']],
+ 'keystore')
+ node_session_key = parse_session_key(keystore_path)
+ if node_session_key:
+ node_metrics['polkadot_state_node_session_key'].labels(
+ name=proc,
+ pid=processes[proc]['pid'],
+ session_key=node_session_key).set(1)
+ except Exception as e:
+ logger.error(e)
+ logger.error(traceback.print_tb(e.__traceback__))
+ return
+
+
+def parse_session_key(dir):
+ # variants of key prefixes in the right order
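+    # Each prefix is the hex encoding of a Substrate key type id, e.g.
+    # 6772616e = 'gran' (GRANDPA), 62616265 = 'babe' (BABE), 696d6f6e = 'imon' (ImOnline),
+    # 70617261 = 'para', 6173676e = 'asgn', 61756469 = 'audi' (authority discovery).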
+ key_formats = (
+ ['6772616e', '62616265', '696d6f6e', '70617261', '61756469'],
+ ['6772616e', '62616265', '696d6f6e', '70617261', '6173676e', '61756469'])
+ possible_prefixes = list(set([j for i in key_formats for j in i]))
+
+ if os.path.isdir(dir):
+ os.chdir(dir)
+ files = os.listdir('.')
+ files = [i for i in files if len(i) == 72 and i[0:8] in possible_prefixes]
+ if not files:
+ return None
+        # find the creation time of the newest key
+ time_of_last_key = sorted(list(set([int(os.path.getmtime(i)) for i in files])))[-1]
+        # parse the newest public keys and their prefixes from the file names.
+        # creation times can drift by 1 second in theory
+ keys = {i[0:8]: i[8:] for i in files if int(os.path.getmtime(i)) in [time_of_last_key - 1, time_of_last_key, time_of_last_key + 1]}
+ logger.debug('keys were found: ' + str(keys) + ' in the keystore path: ' + dir)
+ for key_format in key_formats:
+ if set(keys.keys()) == set(key_format):
+ # build the session key
+ session_key = '0x' + ''.join([keys[i] for i in key_format])
+ logger.debug('the session key was parsed: ' + session_key + ' in the keystore path: ' + dir)
+ return(session_key)
+    logger.error('Failed to parse the session key')
+ return None
+
+
+if __name__ == '__main__':
+ global logger
+ logger = logging.getLogger('state_exporter')
+
+ # console handler
+ ch = logging.StreamHandler()
+ if len(sys.argv) > 1 and sys.argv[1] == 'debug':
+ logger.setLevel(logging.DEBUG)
+ else:
+ logger.setLevel(logging.INFO)
+ formatter = logging.Formatter(LOGGING_FORMAT)
+ ch.setFormatter(formatter)
+ logger.addHandler(ch)
+
+ # Start up the server to expose the metrics
+ start_http_server(PORT) # Metrics server
+ schedule.every(10).seconds.do(update_metrics)
+ while True:
+ schedule.run_pending()
+ time.sleep(1)
diff --git a/roles/state_exporter/handlers/main.yml b/roles/state_exporter/handlers/main.yml
new file mode 100644
index 0000000..199da3e
--- /dev/null
+++ b/roles/state_exporter/handlers/main.yml
@@ -0,0 +1,8 @@
+---
+
+- name: restart state-exporter
+ ansible.builtin.systemd:
+ name: "{{ state_exporter_name }}"
+ state: restarted
+ enabled: true
+ daemon_reload: true
diff --git a/roles/state_exporter/tasks/main.yml b/roles/state_exporter/tasks/main.yml
new file mode 100644
index 0000000..968a8a1
--- /dev/null
+++ b/roles/state_exporter/tasks/main.yml
@@ -0,0 +1,53 @@
+---
+
+- block:
+
+ - name: Exporter | Install apt packages
+ ansible.builtin.package:
+ name: "{{ packages }}"
+ state: present
+ update_cache: true
+ vars:
+ packages:
+ - "python3-prometheus-client"
+ - "python3-schedule"
+ - "python3-psutil"
+
+ - name: Exporter | Create directory
+ ansible.builtin.file:
+ path: "{{ state_exporter_file | dirname }}"
+ state: directory
+ mode: 0755
+ owner: "{{ state_exporter_user }}"
+ group: "{{ state_exporter_user }}"
+
+ - name: Exporter | Copy exporter
+ ansible.builtin.copy:
+ src: "exporter.py"
+ dest: "{{ state_exporter_file }}"
+ mode: 0755
+ owner: "{{ state_exporter_user }}"
+ group: "{{ state_exporter_user }}"
+ notify: restart state-exporter
+
+ - name: Exporter | Copy exporter systemd unit file
+ ansible.builtin.template:
+ src: ".service.j2"
+ dest: "/etc/systemd/system/{{ state_exporter_name }}.service"
+ owner: "root"
+ group: "root"
+ mode: "0600"
+ notify: restart state-exporter
+
+ # to avoid 2 restarts during the first deploy
+ - name: Exporter | Flush handlers
+ ansible.builtin.meta: flush_handlers
+
+ - name: Exporter | Start exporter service
+ ansible.builtin.systemd:
+ name: "{{ state_exporter_name }}"
+ state: started
+ enabled: true
+ daemon_reload: true
+
+ tags: ['state-exporter']
diff --git a/roles/state_exporter/templates/.service.j2 b/roles/state_exporter/templates/.service.j2
new file mode 100644
index 0000000..0e0fefc
--- /dev/null
+++ b/roles/state_exporter/templates/.service.j2
@@ -0,0 +1,13 @@
+[Unit]
+Description=State exporter systemd service
+
+[Service]
+Environment=PYTHONUNBUFFERED=True
+ExecStart={{ state_exporter_file }}{% if state_exporter_debug %} debug{% endif %}
+
+Restart=always
+User={{ state_exporter_user }}
+Group={{ state_exporter_user }}
+
+[Install]
+WantedBy=multi-user.target