From f165f1d22952fef76d8dccac9c6d37f07ad8e6f3 Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Sun, 5 Nov 2023 12:42:41 -0700 Subject: [PATCH] Init --- .github/workflows/deploy.yml | 98 ++++++++++++++++++++++ .gitignore | 2 + README.md | 18 ++++ _quarto.yml | 41 +++++++++ changes/2023-11-incident-response/index.md | 34 ++++++++ changes/index.md | 4 + incidents/2023-10-28-disk-failure/index.md | 19 +++++ incidents/index.md | 4 + index.md | 95 +++++++++++++++++++++ nodes/compute.md | 13 +++ nodes/index.md | 5 ++ nodes/storage.md | 44 ++++++++++ services/end-user.md | 6 ++ services/index.md | 5 ++ services/infrastructure.md | 8 ++ styles.css | 1 + 16 files changed, 397 insertions(+) create mode 100644 .github/workflows/deploy.yml create mode 100644 .gitignore create mode 100644 README.md create mode 100644 _quarto.yml create mode 100644 changes/2023-11-incident-response/index.md create mode 100644 changes/index.md create mode 100644 incidents/2023-10-28-disk-failure/index.md create mode 100644 incidents/index.md create mode 100644 index.md create mode 100644 nodes/compute.md create mode 100644 nodes/index.md create mode 100644 nodes/storage.md create mode 100644 services/end-user.md create mode 100644 services/index.md create mode 100644 services/infrastructure.md create mode 100644 styles.css diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..baccd98 --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,98 @@ +# Build, and deploy to either GitHub Pages (production), or Netlify (PR previews) +name: "Build and deploy" + +on: + # "Production" deployments run on branch + push: + branches: ["main"] + + # Preview deployments run on PRs + pull_request: + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + + +# Sets permissions of the GITHUB_TOKEN +permissions: + # For GitHub Pages: + contents: "read" + pages: "write" + id-token: "write" + # For PR preview comments: + 
pull-requests: "write" + + +# Allow one concurrent deployment +concurrency: + group: "pages" + cancel-in-progress: true + + +jobs: + # Build job + build: + runs-on: "ubuntu-latest" + steps: + - name: "Checkout" + uses: "actions/checkout@v3" + + - uses: "quarto-dev/quarto-actions/setup@v2" + + - name: "Render Quarto website" + run: "quarto render ." + + - name: "Upload site artifact" + uses: "actions/upload-pages-artifact@v1" + with: + path: "./_site" + + + # Deploy preview to Netlify IFF this action triggered by PR + # Based on: https://github.com/quarto-dev/quarto-web/blob/main/.github/workflows/preview.yml + deploy_preview: + if: "github.event_name == 'pull_request'" + runs-on: "ubuntu-latest" + needs: "build" + steps: + - name: "Download site artifact" + uses: "actions/download-artifact@v3" + with: + # The name of artifacts created by `actions/upload-pages-artifact` is always "github-pages" + name: "github-pages" + path: "./_site" + + - name: "Untar site artifact" + run: "tar --directory ./_site -xvf ./_site/artifact.tar " + + - name: "Deploy preview to Netlify" + uses: "nwtgck/actions-netlify@v2" + env: + NETLIFY_SITE_ID: "${{ secrets.NETLIFY_SITE_ID }}" + NETLIFY_AUTH_TOKEN: "${{ secrets.NETLIFY_AUTH_TOKEN }}" + with: + publish-dir: "./_site" + production-deploy: false + github-token: "${{ secrets.GITHUB_TOKEN }}" + deploy-message: "Deploy from ${{ github.event.repository.full_name}} GHA: PR ${{ github.event.pull_request.number }} - ${{ github.event.pull_request.title }}" + alias: "${{ github.event.repository.name }}-pr-${{ github.event.pull_request.number }}-preview" + # these all default to 'true' + enable-pull-request-comment: true + enable-commit-comment: false + enable-commit-status: true + overwrites-pull-request-comment: false + timeout-minutes: 1 + + + # Deploy to GH Pages IFF this action triggered by push + deploy: + if: "github.event_name == 'push'" + runs-on: "ubuntu-latest" + needs: "build" + environment: + name: "github-pages" + url: "${{ 
steps.deployment.outputs.page_url }}" + steps: + - name: "Deploy to GitHub Pages" + id: "deployment" + uses: "actions/deploy-pages@v1" diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..47c274c --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/.quarto/ +/_site/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..30eb4f1 --- /dev/null +++ b/README.md @@ -0,0 +1,18 @@ +# Infra + +Describes (and eventually builds) my home service infrastructure. + +> [!NOTE] +> +> This is a _work in progress_. TODO: +> +> * Much of my service-specific config is in private repositories and needs +> to be migrated. I'm not sure yet how I'll organize that config going forward; perhaps +> it will all be in this repo? Maybe sub-repos? +> +> * I'm optimizing for speed in writing the first pass. I need to fill in details like +> model numbers, complete specs, etc. later. +> +> * What information should be published and what should be withheld? +> +> * Create doc structure so storage pools can each have their own page. diff --git a/_quarto.yml b/_quarto.yml new file mode 100644 index 0000000..0e41925 --- /dev/null +++ b/_quarto.yml @@ -0,0 +1,41 @@ +project: + type: "website" + + +website: + title: "Matt Fisher Infrastructure" + site-url: "https://mfisher87.github.io/infra" + # site-path: "/infra" # Needed? + + repo-url: "https://github.com/mfisher87/infra" + repo-actions: + - "edit" + - "source" + - "issue" + + page-footer: + right: "This page is built with [Quarto](https://quarto.org/)." 
+ left: "© 2023" + + sidebar: + background: "#eee" + style: "docked" + search: true + contents: + - text: "Nodes" + href: "nodes/index.md" + contents: "nodes/*.md" + - text: "Services" + href: "services/index.md" + contents: "services/*.md" + - text: "Incidents" + href: "incidents/index.md" + - text: "Changes" + href: "changes/index.md" + + +format: + html: + theme: "cosmo" + css: "styles.css" + toc: true diff --git a/changes/2023-11-incident-response/index.md b/changes/2023-11-incident-response/index.md new file mode 100644 index 0000000..9d8c732 --- /dev/null +++ b/changes/2023-11-incident-response/index.md @@ -0,0 +1,34 @@ +--- +title: "2023-11: Disk failure incident response" +description: | + New storage was added and a new pool was configured in response to this incident. +--- + +## Storage changes + +As a result of this incident, the failed 1TB HDD was replaced with a 1TB HDD from storage +pool 1. A new 4TB HDD was purchased to replace this drive in pool 1, increasing its +capacity from 5TB (4TB + 1TB data + 4TB parity) to 8TB (4TB + 4TB data + 4TB parity). + +In addition, storage pool 2 was created from in-stock 8TB disks to provide migration +space during recovery. + + +## Deployment changes + +The VM which was mistakenly assigned to house its disk on this inappropriate drive will +need to be recreated from scratch to replace all the services that were running on it. + + +## Other changes + +### Labels! + +Finding the physical failed drive was far more of a challenge than I wanted it to be. I +printed labels for each drive bay on the [storage node](/nodes/storage.md) including: + +* Internal SATA port the bay is connected to +* The drive model currently in the bay +* The capacity of the drive currently in the bay +* The functional purpose of the drive currently in the bay (e.g. `Pool1 Parity1`, `Pool2 + Data1`, etc.) 
diff --git a/changes/index.md b/changes/index.md new file mode 100644 index 0000000..a710527 --- /dev/null +++ b/changes/index.md @@ -0,0 +1,4 @@ +--- +title: "Changes" +listing: default +--- diff --git a/incidents/2023-10-28-disk-failure/index.md b/incidents/2023-10-28-disk-failure/index.md new file mode 100644 index 0000000..a2f769d --- /dev/null +++ b/incidents/2023-10-28-disk-failure/index.md @@ -0,0 +1,19 @@ +--- +title: "2023-10-28: Disk failure" +description: | + A 1TB HDD in the [storage node](/nodes/storage.md) failed. This drive was used for VM + backups, and (mistakenly) for a services VM disk. This resulted in minor data loss. +--- + +This disk was unfortunately unprotected, because I figured it was just VM backups and if +it died, I could replace it and continue. This was perhaps a risky choice, but I also +made a big mistake by accidentally storing a live VM disk on this drive. + +The data lost was unpushed service configuration changes, and potentially some +relatively unimportant secrets. + + +## Infrastructure changes + +See the related [change document](/changes/2023-11-incident-response/index.md) for more +details about changes as a result of this incident. diff --git a/incidents/index.md b/incidents/index.md new file mode 100644 index 0000000..63474e4 --- /dev/null +++ b/incidents/index.md @@ -0,0 +1,4 @@ +--- +title: "Incidents" +listing: default +--- diff --git a/index.md b/index.md new file mode 100644 index 0000000..7beec3c --- /dev/null +++ b/index.md @@ -0,0 +1,95 @@ +--- +title: "My infrastructure" +--- + +## Current state + +I have two physical nodes: + +1. [Storage](nodes/storage.md): An older build from new parts in a Supermicro chassis with 12 + hotswap drive bays. +2. [Compute](nodes/compute.md): A newer build from used eBay parts in a Supermicro + chassis with lots of airflow. + + +### Virtualization/containerization + +Each of these nodes is running _Proxmox VE_. 
Software is deployed on VMs, enabling +deployment of virtual clustered systems. + +Software is deployed on containers to the extent possible. Sometimes Docker Swarm, +sometimes Docker Compose, sometimes Kubernetes. + +:::{.callout-note} +That may not even be a full listing of my deployment types... + +TODO: Standardize my deployments! +::: + + +### Storage + +My aim is to use simple solutions that minimize [cognitive +load](https://mfisher87.github.io/cognitive_load.html) and maximize flexibility. + +:::{.callout-note} +I made a lot of these decisions a long time ago and don't have my full rationale +anymore. When I find / remember it, update here. +::: + + +#### Redundancy: _mdadm_ + +[_mdadm_](https://en.wikipedia.org/wiki/Mdadm) is a utility for managing software RAID. +I'm using this for operating system drives (mostly SSDs?) to enable servers to survive +drive failures. + + +#### Redundancy: _SnapRAID_ + +[_SnapRAID_](https://www.snapraid.it/) is a non-realtime software RAID solution. I'm +using this for shared storage drives. Parity is calculated and validated (bit-rot +protection) on a schedule. + +I'm also using _UnionFS_ to expose data drives as a unified pool. + +##### Rationale + +* Data drives can be accessed in isolation of the array (even if the array can't be + fully recovered). +* Data files are hashed to protect from bit rot. +* Arrays are flexible to change size, increase parity, etc. + * Disks with data already on them can be added to the array. +* Only the disk being accessed will spin up. +* Between the scheduled parity recalculations, it's possible to "un-delete" files! + + +##### Considerations + +* Parity drives must be among the largest in the pool. For now I have two pools until I + get more drives and can re-organize them in to one pool. + + +### Network + +1GbE + + +## Desired state + +### Backups as a service + +How to make automated (file-level) backups easier? Time to try out some new tools, e.g. +Borg, Restic? 
+ + +### Energy efficiency + +* How to get nodes to sleep when not in use? + +* What services can run on more purpose-built hardware to save energy from e.g. software + video encoding? + +* How can I integrate SBCs (Raspberry Pi, oDroid N2+, H3+) to save energy? + +* How best to monitor energy usage at the node/service level? diff --git a/nodes/compute.md b/nodes/compute.md new file mode 100644 index 0000000..1f184fa --- /dev/null +++ b/nodes/compute.md @@ -0,0 +1,13 @@ +--- +title: "Compute node" +--- + +The newest node in my infrastructure, this node provides various infrastructure and +end-user services. + +Built in 2020 (?) from used parts bought on eBay and parts I had in stock. + + +## Storage + +OS and VM Disk storage is on dual SSDs in an _mdadm_ mirrored array. diff --git a/nodes/index.md b/nodes/index.md new file mode 100644 index 0000000..294c111 --- /dev/null +++ b/nodes/index.md @@ -0,0 +1,5 @@ +--- +title: "Nodes" +listing: + type: table +--- diff --git a/nodes/storage.md b/nodes/storage.md new file mode 100644 index 0000000..a230f11 --- /dev/null +++ b/nodes/storage.md @@ -0,0 +1,44 @@ +--- +title: "Storage node" +--- + +The first node I set up long ago to act as a NAS, this node provides storage services, +but also runs some end-user services. + +Originally built maybe a decade ago from new parts and used parts I had in stock. +Upgraded over time and eventually migrated in to a Supermicro 12-bay hotswap chassis. + + +## Storage + +A total of 14 ports (6 motherboard SATA + 8 SAS expander PCI card). 12 are exposed as +hot-swap drive bays. + +Hot-swap drives are largely exposed as _UnionFS_ drive pools, with parity provided by +SnapRAID. + + +### Pool 1 + +3x 4TB HDDs. 2 data disks totaling 8TB of storage. 1 parity disk (4TB). + + +### Pool 2 + +3x 8TB HDDs. 2 data disks totaling 16TB of storage. 1 parity disk (8TB). + + +### Considerations + +When I add more disks, consider combining the pools. 
The current setup is a compromise +between maximizing available storage and tolerance for drive failures. + +Pros: + +* Can tolerate 1 drive failure, 2 if I'm lucky and they happen on separate pools. +* 24TB of storage available. + +Cons: + +* If two drives on the same pool fail, I'm in trouble. Combining the pools and having two + parity drives would allow two arbitrary drives to fail. diff --git a/services/end-user.md b/services/end-user.md new file mode 100644 index 0000000..66f76a4 --- /dev/null +++ b/services/end-user.md @@ -0,0 +1,6 @@ +--- +title: "End-user services" +--- + +Services which are accessed directly by end-users, e.g. _HomeAssistant_, wiki, +dashboards, etc. diff --git a/services/index.md b/services/index.md new file mode 100644 index 0000000..294c111 --- /dev/null +++ b/services/index.md @@ -0,0 +1,5 @@ +--- +title: "Services" +listing: + type: table +--- diff --git a/services/infrastructure.md b/services/infrastructure.md new file mode 100644 index 0000000..9bfe67b --- /dev/null +++ b/services/infrastructure.md @@ -0,0 +1,8 @@ +--- +title: "Infrastructure services" +--- + +Services which provide resources needed to run [end-user +services](/services/end-user.md). + +E.g. _Proxmox VE_, _Kubernetes_, _Docker_, _Traefik_, _MinIO_, etc. diff --git a/styles.css b/styles.css new file mode 100644 index 0000000..2ddf50c --- /dev/null +++ b/styles.css @@ -0,0 +1 @@ +/* css styles */