From cf67177a673b1ff50be99ff08c8938376bf6e6c2 Mon Sep 17 00:00:00 2001 From: Shane da Silva Date: Thu, 17 Oct 2024 00:18:19 -0700 Subject: [PATCH] Add configuration for EC2 deploys (#380) --- .github/workflows/perform-action.yml | 65 +++++++++++++ Earthfile | 49 ++++++++++ Makefile | 69 ++++++++++++++ deploy-docker-compose.yml | 134 +++++++++++++++++++++++++++ deploy.yml | 92 ++++++++++++++++++ ec2-first-boot.sh | 10 ++ stack-dev.sh | 28 ++++++ 7 files changed, 447 insertions(+) create mode 100644 .github/workflows/perform-action.yml create mode 100644 Earthfile create mode 100644 Makefile create mode 100644 deploy-docker-compose.yml create mode 100644 deploy.yml create mode 100644 ec2-first-boot.sh create mode 100755 stack-dev.sh diff --git a/.github/workflows/perform-action.yml b/.github/workflows/perform-action.yml new file mode 100644 index 00000000..68606ce0 --- /dev/null +++ b/.github/workflows/perform-action.yml @@ -0,0 +1,65 @@ +name: Perform Action +run-name: ${{ inputs.action }} by @${{ github.actor }} + +on: + workflow_call: + secrets: + TERRAFORM_AWS_ACCESS_KEY_ID: + required: false # Not needed for image builds + TERRAFORM_AWS_SECRET_ACCESS_KEY: + required: false # Not needed for image builds + STACK_DEPLOY_SSH_PRIVATE_KEY: + required: true + DOCKERHUB_USER: + required: true + DOCKERHUB_PASSWORD: + required: true + inputs: + action: + description: Action to perform + required: true + type: string + + workflow_dispatch: + inputs: + action: + description: Action to perform + required: true + type: choice + options: + - build + - plan + - deploy + +jobs: + action: + runs-on: [earthly-satellite#backend-runner] + name: ${{ inputs.action }} + steps: + - name: Clone repo + uses: actions/checkout@v4 + + - name: Set release ID + id: set-release + run: | + release=$(date -u +"%Y-%m-%dT%H-%M-%S-%3NZ") + echo "release = $release" + echo "release=$release" >> "$GITHUB_OUTPUT" + + - name: Set commit hash + id: set-commit-hash + run: | + commit_hash=$(git rev-parse 
HEAD) + echo "commit_hash = $commit_hash" + echo "commit_hash=$commit_hash" >> "$GITHUB_OUTPUT" + + - name: Execute ${{ inputs.action }} + uses: ./.github/actions/command + with: + aws-access-key-id: ${{ secrets.TERRAFORM_AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.TERRAFORM_AWS_SECRET_ACCESS_KEY }} + ssh-private-key: ${{ secrets.STACK_DEPLOY_SSH_PRIVATE_KEY }} + dockerhub-username: ${{ secrets.DOCKERHUB_USER }} + dockerhub-password: ${{ secrets.DOCKERHUB_PASSWORD }} + command: make ${{ inputs.action }} ref=${{ steps.set-commit-hash.outputs.commit_hash }} release=${{ steps.set-release.outputs.release }} + healthcheck: curl --fail https://fnames.farcaster.xyz/_health diff --git a/Earthfile b/Earthfile new file mode 100644 index 00000000..8df590a5 --- /dev/null +++ b/Earthfile @@ -0,0 +1,49 @@ +VERSION --run-with-aws 0.8 +PROJECT farcasterxyz/fname-registry +FROM alpine:3.20.3 +WORKDIR /workspace + +ARG --global --required fname_registry_commit_ref +ARG --global \ + docker_registry=526236635984.dkr.ecr.us-east-1.amazonaws.com/farcasterxyz/fname-registry +ARG --global FNAME_REGISTRY_DOCKER_IMAGE=$docker_registry:$fname_registry_commit_ref + +fname-registry-repo: + COPY . . + SAVE ARTIFACT /workspace /workspace + +fname-registry-prod: + FROM DOCKERFILE -f +fname-registry-repo/workspace/Dockerfile +fname-registry-repo/workspace/* + SAVE IMAGE --push $FNAME_REGISTRY_DOCKER_IMAGE + +stack-repo: + ARG stack_repo_git_url=git@github.com:warpcast/stack # Must use SSH since private + ARG stack_repo_commit_ref=main + GIT CLONE --branch $stack_repo_commit_ref $stack_repo_git_url /repo + SAVE ARTIFACT /repo /repo + +workspace: + FROM DOCKERFILE -f +stack-repo/repo/Dockerfile +stack-repo/repo/* + CACHE --sharing locked --persist /usr/src/app/workspace/cdktf.out + COPY +fname-registry-repo/workspace /usr/src/app/workspace + ARG CI + ENV CI=$CI + ENV FNAME_REGISTRY_DOCKER_IMAGE=$FNAME_REGISTRY_DOCKER_IMAGE + # OTEL is enabled by default by Earthly but causes crashes. 
Disable since we don't use it. + # See https://github.com/earthly/earthly/issues/4260 + ENV OTEL_METRICS_EXPORTER=none + ENV OTEL_TRACES_EXPORTER=none + +interactive-cmd: + FROM +workspace + ARG args + RUN --interactive --no-cache --ssh \ + --secret AWS_ACCESS_KEY_ID --secret AWS_SECRET_ACCESS_KEY \ + bun cli.js --workdir ./workspace $args + +cmd: + FROM +workspace + ARG args + RUN --no-cache --ssh \ + --secret AWS_ACCESS_KEY_ID --secret AWS_SECRET_ACCESS_KEY \ + bun cli.js --workdir ./workspace $args diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..e730b54a --- /dev/null +++ b/Makefile @@ -0,0 +1,69 @@ +# Lists `make <target>` shortcuts which are easier to type than the full command. + +ifeq ($(CI), true) + EARTHLY_INTERACTIVE_ARGS := --ci + CMD_TYPE := cmd + CI_ARGS := --CI=true +else + EARTHLY_INTERACTIVE_ARGS := --interactive + CMD_TYPE := interactive-cmd + CI_ARGS := +endif + +ifeq ($(release),) + release= + RELEASE_ARGS := --release=$(shell node -e "console.log(new Date().toISOString().replace(/\:/g,'-').replace(/\./g,'-'))") +endif + +REF_ARGS := --fname_registry_commit_ref=$(shell git rev-parse HEAD) + +ifdef pod + POD_ARGS := pod:$(pod) +else + POD_ARGS := +endif + +CMD_PREFIX := earthly --env-file-path /dev/null --max-remote-cache + +# Builds (but doesn't publish) the current Docker image specified by fname_registry_commit_ref in the Earthfile and outputs it locally. +.PHONY: build +build: + $(CMD_PREFIX) $(EARTHLY_INTERACTIVE_ARGS) +fname-registry-prod $(REF_ARGS) $(CI_ARGS) + +# Builds and publishes the current Docker image specified by fname_registry_commit_ref in the Earthfile +.PHONY: publish +publish: + $(CMD_PREFIX) --push +fname-registry-prod $(REF_ARGS) + +# Shows what infrastructure changes will be applied on the next deploy, if any. 
+.PHONY: plan +plan: + $(CMD_PREFIX) --ci --no-output +cmd $(RELEASE_ARGS) $(REF_ARGS) --CI=true --args="plan $(POD_ARGS)" + +# Applies any infrastructure changes without carrying out the rest of the deploy +# process (note: this could still result in a "deploy" if updating ASGs). +.PHONY: apply +apply: publish + $(CMD_PREFIX) $(EARTHLY_INTERACTIVE_ARGS) --no-output +$(CMD_TYPE) $(RELEASE_ARGS) $(REF_ARGS) $(CI_ARGS) --args="deploy --apply-only --yes $(POD_ARGS)" + +# Applies any infrastructure changes (if any) and carries out a deploy. +.PHONY: deploy +deploy: publish + $(CMD_PREFIX) $(EARTHLY_INTERACTIVE_ARGS) --no-output +$(CMD_TYPE) $(RELEASE_ARGS) $(REF_ARGS) $(CI_ARGS) --args="deploy --yes $(POD_ARGS)" + +# DANGEROUS. Deletes infrastructure. +.PHONY: destroy +destroy: + $(CMD_PREFIX) $(EARTHLY_INTERACTIVE_ARGS) --no-output +$(CMD_TYPE) $(RELEASE_ARGS) $(REF_ARGS) $(CI_ARGS) --args="destroy --yes $(POD_ARGS)" + +# Validates configuration files, reporting any issues. +.PHONY: lint +lint: + $(CMD_PREFIX) $(EARTHLY_INTERACTIVE_ARGS) --no-output +$(CMD_TYPE) $(RELEASE_ARGS) $(REF_ARGS) $(CI_ARGS) --args="lint" + +# Open an SSH console to the specified pod. +# +# e.g. `make ssh pod=mypod` +.PHONY: ssh +ssh: + $(CMD_PREFIX) --no-output +$(CMD_TYPE) $(RELEASE_ARGS) $(REF_ARGS) --args="console $(POD_ARGS)" diff --git a/deploy-docker-compose.yml b/deploy-docker-compose.yml new file mode 100644 index 00000000..0caecb9f --- /dev/null +++ b/deploy-docker-compose.yml @@ -0,0 +1,134 @@ +# Docker Compose configuration shared by all pods for the backend. 
+# +# If you need to tweak something for a specific pod, reference it as an +# environment variable here and set that environment variable explicitly for +# each pod in deploy.yml + +x-shared-labels: &shared-labels + com.datadoghq.tags.pod: ${POD_NAME} + com.datadoghq.tags.service: fname-registry-${POD_NAME} + +# Blank values imply values are inherited from the environment +x-shared-env-vars: &shared-env-vars + TINI_VERBOSITY: 3 + ENVIRONMENT: prod + FC_NETWORK_ID: "1" + WARPCAST_ADDRESS: "0xABba722926c8302c73e57A25AD8F63753904546f" + CCIP_ADDRESS: "0x145b9934B42F214C101De04b6115285959BDD4F5" + DD_API_KEY: + MNEMONIC: + OP_ALCHEMY_SECRET: + MAINNET_ALCHEMY_SECRET: + ETHERSCAN_API_SECRET: + INFURA_PROJECT_ID: + INFURA_PROJECT_SECRET: + POSTGRES_URL: + POSTGRES_URL_READ: + # Datadog SDK configuration. Agent configuration is further below + # (DD_API_KEY is already declared above; YAML mapping keys must be unique) + DD_TRACE_AGENT_URL: unix:///var/run/datadog/apm.socket + DD_RUNTIME_METRICS_ENABLED: "true" + DD_PROFILING_ENABLED: "true" + DD_LOGS_INJECTION: "true" + DD_ENV: prod + DD_VERSION: ${RELEASE} + DD_TRACE_DISABLED_PLUGINS: child_process, connect, dns, net + +services: + app: + depends_on: + - datadog # Make sure agent is running so all stats+logs are collected + image: ${FNAME_REGISTRY_DOCKER_IMAGE:-dummy-value-for-linting} + init: true + command: ["node", "index.js"] + environment: + <<: *shared-env-vars + TASK_TYPE: ${TASK_TYPE} + DD_SERVICE: fname-registry-${POD_NAME} + POSTGRES_CONNECTION_POOL_MAX_SIZE: "20" + labels: + <<: *shared-labels + network_mode: host + restart: always + volumes: + - /var/run/datadog/:/var/run/datadog/ # Unix socket + ulimits: + nofile: + soft: 65535 + hard: 65535 + healthcheck: + test: ["CMD-SHELL", "${HEALTHCHECK_CMD}"] + start_period: 15s # Only affects whether Docker determines the container is healthy, does not stop container if unhealthy + interval: 10s + timeout: 10s + retries: 3 + + datadog: + image: public.ecr.aws/datadog/agent:7.57.2 + cgroup: host + pid: host + network_mode: host 
+ restart: always + environment: + DD_LOG_LEVEL: "WARN" # For agent's logs only + DD_ENV: prod + DD_SERVICE: fname-registry-${POD_NAME} + DD_VERSION: ${RELEASE} + DD_API_KEY: ${DD_API_KEY} + DD_HOSTNAME: ${INSTANCE_ID}.merkle.zone + DD_APM_ENABLED: "true" + DD_APM_RECEIVER_SOCKET: /var/run/datadog/apm.socket + DD_APM_NON_LOCAL_TRAFFIC: "true" + DD_LOGS_ENABLED: "true" + DD_LOGS_CONFIG_CONTAINER_COLLECT_ALL: "true" + DD_PROCESS_CONFIG_PROCESS_COLLECTION_ENABLED: "true" + DD_PROCESS_AGENT_ENABLED: "true" + DD_SYSTEM_PROBE_NETWORK_ENABLED: "true" + DD_DOGSTATSD_TAG_CARDINALITY: "low" + DD_DOGSTATSD_NON_LOCAL_TRAFFIC: "true" + DD_DOGSTATSD_ORIGIN_DETECTION: "true" + DD_DOGSTATSD_SOCKET: /var/run/datadog/dsd.socket + DD_CONTAINER_LABELS_AS_TAGS: '{ + "com.docker.compose.service": "container", + "com.datadoghq.tags.pod": "pod" + }' + DD_HEALTH_PORT: 5555 + cap_add: + - SYS_ADMIN + - SYS_RESOURCE + - SYS_PTRACE + - NET_ADMIN + - NET_BROADCAST + - NET_RAW + - IPC_LOCK + - CHOWN + security_opt: + - apparmor:unconfined + labels: + <<: *shared-labels + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /proc/:/host/proc/:ro + - /opt/datadog-agent/run:/opt/datadog-agent/run:rw + - /sys/fs/cgroup/:/host/sys/fs/cgroup:ro + - /etc/passwd:/etc/passwd:ro + - /var/run/datadog/:/var/run/datadog/ + - /sys/kernel/debug:/sys/kernel/debug + - /etc/group:/etc/group:ro + - /usr/lib/os-release:/usr/lib/os-release:ro + - /sys/kernel/security:/host/sys/kernel/security:ro + + # Watches container health status and restarts containers if failed + # (Docker doesn't do this out of the box, it only restarts if process exits) + autoheal: + environment: + AUTOHEAL_CONTAINER_LABEL: all + AUTOHEAL_START_PERIOD: 120 # Give service time to start up (2 mins) + image: willfarrell/autoheal:1.1.0 + network_mode: none + restart: always + stop_signal: SIGKILL # autoheal doesn't handle signals correctly, so just kill it + volumes: + - 
/etc/localtime:/etc/localtime:ro + - /var/run/docker.sock:/var/run/docker.sock diff --git a/deploy.yml b/deploy.yml new file mode 100644 index 00000000..88461ff3 --- /dev/null +++ b/deploy.yml @@ -0,0 +1,92 @@ +# Name of the project. +# DON'T change this -- it will rename + recreate all resources (downtime) +project: fname-registry +region: us-east-1 + +network: + id: vpc-0f6f9a87c6da89cc3 # VPC to deploy the service within + subnets: + public: + - subnet-0c692bcb8e0b04af0 # public-1b + - subnet-07ac4939a1d7db9c1 # public-1d + - subnet-0305c0d827e803272 # public-1f + private: + - subnet-09d8bb56d08618935 # private-1b + - subnet-0fdd713886476d21c # private-1d + - subnet-0d4f99e3a3569ec11 # private-1f + +x-shared-pod-options: &shared-pod-options + image: ami-0430baeab16c9655e # 64-bit ARM Amazon Linux 2023 with Docker Compose already installed + sshUser: ec2-user + bastionUser: ec2-user + bastionHost: 3.234.215.16 + compose: deploy-docker-compose.yml + initScript: ec2-first-boot.sh + deploy: + replaceWith: new-instances + shutdownTimeout: 10 + instanceRefreshTimeout: 600 # 10 minutes + +pods: + api: + <<: *shared-pod-options + instanceType: c7g.large # 2 vCPU, 4GiB, ARM, up to 12.5 Gbit + environment: + FNAME_REGISTRY_DOCKER_IMAGE: # Provided by deployment process after image is built + TASK_TYPE: server + HEALTHCHECK_CMD: "curl --fail --connect-timeout 5 http://127.0.0.1:3000/_health" + NODE_OPTIONS: "--enable-source-maps" + loadBalancers: + api: + type: application + public: true + # Only for ALBs. Ensure this is smaller than the application server's keep-alive timeout. 
+ # See: https://adamcrowder.net/posts/node-express-api-and-aws-alb-502/ + idleTimeout: 25 + endpoints: + api: + loadBalancer: + name: api + protocol: HTTPS + port: 443 + cert: warpcast.com + public: false # Only load balancer is public + target: + port: 3000 + protocol: HTTP + deregistration: + delay: 30 # Must be larger than our client timeout + healthCheck: + path: "/_health" + healthyThreshold: 2 + unhealthyThreshold: 2 + timeout: 2 + interval: 5 + autoscaling: + healthCheckGracePeriod: 60 # Sometimes EC2 instances take a while to start + minHealthyPercentage: 100 + maxHealthyPercentage: 200 + minHealthyInstances: 1 + onDemandBaseCapacity: 2 + onDemandPercentageAboveBaseCapacity: 50 + +# Defines the _names_ of secrets (not values!) and which pods have access to them +# Values for each secret must be set manually in AWS Secrets Manager. +secrets: + DD_API_KEY: + FNAME_REGISTRY_MNEMONIC: + as: MNEMONIC + FNAME_REGISTRY_OP_ALCHEMY_SECRET: + as: OP_ALCHEMY_SECRET + FNAME_REGISTRY_MAINNET_ALCHEMY_SECRET: + as: MAINNET_ALCHEMY_SECRET + FNAME_REGISTRY_ETHERSCAN_API_SECRET: + as: ETHERSCAN_API_SECRET + FNAME_REGISTRY_INFURA_PROJECT_ID: + as: INFURA_PROJECT_ID + FNAME_REGISTRY_INFURA_PROJECT_SECRET: + as: INFURA_PROJECT_SECRET + FNAME_REGISTRY_POSTGRES_URL: + as: POSTGRES_URL + FNAME_REGISTRY_POSTGRES_URL_READ: + as: POSTGRES_URL_READ diff --git a/ec2-first-boot.sh b/ec2-first-boot.sh new file mode 100644 index 00000000..8103d777 --- /dev/null +++ b/ec2-first-boot.sh @@ -0,0 +1,10 @@ +# Runs via cloud-init as root user on first boot. + +set -ex -o pipefail + +# Persist across reboots (cat, not echo: echo ignores the here-document on stdin) +cat >> /etc/sysctl.conf << EOF +# Increase max number of allowed inbound connections +net.core.somaxconn = 1048576 +EOF +sysctl -p /etc/sysctl.conf # Apply above settings diff --git a/stack-dev.sh b/stack-dev.sh new file mode 100755 index 00000000..4214fdc6 --- /dev/null +++ b/stack-dev.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Useful for testing local changes to `stack`. 
+# +# DON'T use unless you understand what you are doing. +# +# Requires: +# +# - Stack repo checked out in a sibling directory to this repo +# - SSH key that is authorized to access the hubs +# - ssh-agent with that key loaded and SSH_AUTH_SOCK set in your shell +# - AWS credentials allowing read-only access to our AWS account +# stored in ~/.aws/credentials + +set -euo pipefail + +docker build -t stack -f ../stack/Dockerfile ../stack/. + +aws_access_key_id=$(cat ~/.aws/credentials | sed -n 's/.*aws_access_key_id\s*=\s*\([a-zA-Z0-9+-_]*\).*/\1/p') +aws_secret_access_key=$(cat ~/.aws/credentials | sed -n 's/.*aws_secret_access_key\s*=\s*\([a-zA-Z0-9+-_]*\).*/\1/p') + +docker run --rm -it \ + -v "$(pwd):/usr/src/app/workspace" \ + -e AWS_ACCESS_KEY_ID="$aws_access_key_id" \ + -e AWS_SECRET_ACCESS_KEY="$aws_secret_access_key" \ + -v "$SSH_AUTH_SOCK:/ssh-agent" \ + -e SSH_AUTH_SOCK=/ssh-agent \ + stack "$@"