From d6a85d5ca51f061024406ef3b092d5159bc8878a Mon Sep 17 00:00:00 2001
From: frrist
Date: Mon, 29 Jan 2024 18:16:19 -0800
Subject: [PATCH] feat: implement centralized otel collector via terraform

- closes #expanso-planning/issues/429
---
Review notes (text in this section is commentary and is not part of the commit):
- start.sh installs the linux_amd64 collector build instead of linux_386; the
  4000 MiB memory_limiter budget in otel-collector-config.yaml sits at the edge
  of what a 32-bit process can address.
- grafana_prometheus_password is declared `sensitive = true` in both
  variables.tf files so it is redacted from plan/apply output.
- otel.service is written with mode 0644; the unit file holds no secrets and
  systemd warns about world-inaccessible unit files. The collector config,
  which embeds the credentials, stays 0600.
- Whitespace was reconstructed; blob ids on the `index` lines are carried over
  from the original patch and are stale for the three files edited above.

 .../terraform/cloud-init/cloud-init.yml       | 18 +++++
 ops/metrics/terraform/gcp/backend.tf          |  6 ++
 ops/metrics/terraform/gcp/main.tf             | 27 ++++++++
 .../compute_instances/otelcollector/main.tf   | 67 +++++++++++++++++++
 .../otelcollector/outputs.tf                  |  8 +++
 .../otelcollector/variables.tf                | 46 +++++++++++++
 .../terraform/gcp/modules/network/main.tf     | 52 ++++++++++++++
 .../terraform/gcp/modules/network/outputs.tf  | 11 +++
 .../gcp/modules/network/variables.tf          | 53 +++++++++++++++
 ops/metrics/terraform/gcp/outputs.tf          |  3 +
 ops/metrics/terraform/gcp/variables.tf        | 36 ++++++++++
 ops/metrics/terraform/gcp/vars.tfvars         |  7 ++
 .../instance_files/otel-collector-config.yaml | 45 +++++++++++++
 .../terraform/instance_files/otel.service     | 14 ++++
 ops/metrics/terraform/instance_files/start.sh | 33 +++++++++
 ops/tf/main.tf                                |  2 +
 .../gcp/compute_instances/compute/main.tf     |  1 +
 .../compute_instances/compute/variables.tf    |  6 ++
 .../gcp/compute_instances/requester/main.tf   |  2 +-
 .../compute_instances/requester/variables.tf  |  5 ++
 .../instance_files/otel-collector.yaml        |  9 ++-
 ops/tf/variables.tf                           |  5 ++
 ops/tf/vars.tfvars                            |  2 +
 23 files changed, 455 insertions(+), 3 deletions(-)
 create mode 100644 ops/metrics/terraform/cloud-init/cloud-init.yml
 create mode 100644 ops/metrics/terraform/gcp/backend.tf
 create mode 100644 ops/metrics/terraform/gcp/main.tf
 create mode 100644 ops/metrics/terraform/gcp/modules/compute_instances/otelcollector/main.tf
 create mode 100644 ops/metrics/terraform/gcp/modules/compute_instances/otelcollector/outputs.tf
 create mode 100644 ops/metrics/terraform/gcp/modules/compute_instances/otelcollector/variables.tf
 create mode 100644 ops/metrics/terraform/gcp/modules/network/main.tf
 create mode 100644 ops/metrics/terraform/gcp/modules/network/outputs.tf
 create mode 100644 ops/metrics/terraform/gcp/modules/network/variables.tf
 create mode 100644 ops/metrics/terraform/gcp/outputs.tf
 create mode 100644 ops/metrics/terraform/gcp/variables.tf
 create mode 100644 ops/metrics/terraform/gcp/vars.tfvars
 create mode 100644 ops/metrics/terraform/instance_files/otel-collector-config.yaml
 create mode 100644 ops/metrics/terraform/instance_files/otel.service
 create mode 100644 ops/metrics/terraform/instance_files/start.sh

diff --git a/ops/metrics/terraform/cloud-init/cloud-init.yml b/ops/metrics/terraform/cloud-init/cloud-init.yml
new file mode 100644
index 0000000000..4f0dc56a43
--- /dev/null
+++ b/ops/metrics/terraform/cloud-init/cloud-init.yml
@@ -0,0 +1,18 @@
+#cloud-config
+
+write_files:
+  # otel config file (embeds the grafana credentials, so root-only)
+  - path: /etc/otel-collector.yaml
+    encoding: b64
+    owner: root:root
+    permissions: "0600"
+    content: |
+      ${otel_config_file}
+
+  # otel service file (world-readable: a unit file carries no secrets)
+  - path: /etc/systemd/system/otel.service
+    encoding: b64
+    owner: root:root
+    permissions: "0644"
+    content: |
+      ${otel_service_file}
diff --git a/ops/metrics/terraform/gcp/backend.tf b/ops/metrics/terraform/gcp/backend.tf
new file mode 100644
index 0000000000..def62ceda5
--- /dev/null
+++ b/ops/metrics/terraform/gcp/backend.tf
@@ -0,0 +1,6 @@
+terraform {
+  backend "gcs" {
+    bucket = "bacalhau-otel-collector-infra-state"
+    prefix = "terraform"
+  }
+}
diff --git a/ops/metrics/terraform/gcp/main.tf b/ops/metrics/terraform/gcp/main.tf
new file mode 100644
index 0000000000..2f39e7b491
--- /dev/null
+++ b/ops/metrics/terraform/gcp/main.tf
@@ -0,0 +1,27 @@
+provider "google" {
+  project = var.gcp_project_id
+  region  = var.gcp_region
+  zone    = var.gcp_zone
+}
+
+module "gcp_network" {
+  source      = "./modules/network"
+  region      = var.gcp_region
+  subnet_cidr = "10.0.0.0/16"
+}
+
+module "otel_collector_instance" {
+  source = "./modules/compute_instances/otelcollector"
+
+  cloud_init_content = ""
+
+  zone       = var.gcp_zone
+  network    = module.gcp_network.vpc_network_name
+  subnetwork = module.gcp_network.subnetwork_name
+
+  boot_image                   = var.gcp_boot_image
+  otel_collector_instance_type = var.otel_collector_machine_type
+
+  grafana_prometheus_username = var.grafana_prometheus_username
+  grafana_prometheus_password = var.grafana_prometheus_password
+}
\ No newline at end of file
diff --git a/ops/metrics/terraform/gcp/modules/compute_instances/otelcollector/main.tf b/ops/metrics/terraform/gcp/modules/compute_instances/otelcollector/main.tf
new file mode 100644
index 0000000000..11fa6557c0
--- /dev/null
+++ b/ops/metrics/terraform/gcp/modules/compute_instances/otelcollector/main.tf
@@ -0,0 +1,67 @@
+resource "google_compute_instance" "otel_collector" {
+  name         = "bacalhau-otel-collector"
+  machine_type = var.otel_collector_instance_type
+  zone         = var.zone
+
+  metadata = {
+    startup-script = local.otel_start_script
+    user-data      = data.cloudinit_config.otel_collector_cloud_init.rendered
+  }
+
+  boot_disk {
+    initialize_params {
+      image = var.boot_image
+      size  = var.boot_size
+    }
+  }
+
+  network_interface {
+    network    = var.network
+    subnetwork = var.subnetwork
+    access_config {
+      // TODO here is where we may wish to assign a static IP so this instance can be fronted with DNS
+      // Ephemeral public IP will be assigned
+    }
+  }
+}
+
+locals {
+  //
+  // templating the bacalhau start script
+  //
+  otel_start_script = templatefile("${path.module}/../../../../instance_files/start.sh", {
+    // Add more arguments as needed
+  })
+
+  //
+  // templating otel config file
+  //
+  otel_config_content = templatefile("${path.module}/../../../../instance_files/otel-collector-config.yaml", {
+    grafana_prometheus_username = var.grafana_prometheus_username
+    grafana_prometheus_password = var.grafana_prometheus_password
+  })
+
+  //
+  // templating otel service file
+  //
+  otel_service_content = templatefile("${path.module}/../../../../instance_files/otel.service", {
+    // add more arguments as needed
+  })
+
+}
+
+data "cloudinit_config" "otel_collector_cloud_init" {
+  gzip          = false
+  base64_encode = false
+
+  // provide parameters to cloud-init like files and arguments to scripts in the above part.
+  part {
+    filename     = "cloud-config.yaml"
+    content_type = "text/cloud-config"
+
+    content = templatefile("${path.module}/../../../../cloud-init/cloud-init.yml", {
+      otel_config_file : base64encode(local.otel_config_content)
+      otel_service_file : base64encode(local.otel_service_content),
+    })
+  }
+}
diff --git a/ops/metrics/terraform/gcp/modules/compute_instances/otelcollector/outputs.tf b/ops/metrics/terraform/gcp/modules/compute_instances/otelcollector/outputs.tf
new file mode 100644
index 0000000000..4175f7d4e1
--- /dev/null
+++ b/ops/metrics/terraform/gcp/modules/compute_instances/otelcollector/outputs.tf
@@ -0,0 +1,8 @@
+output "otel_collector_public_ips" {
+  value = google_compute_instance.otel_collector.*.network_interface.0.access_config.0.nat_ip
+}
+
+output "otel_collector_private_ips" {
+  value = google_compute_instance.otel_collector.*.network_interface.0.network_ip
+}
+
diff --git a/ops/metrics/terraform/gcp/modules/compute_instances/otelcollector/variables.tf b/ops/metrics/terraform/gcp/modules/compute_instances/otelcollector/variables.tf
new file mode 100644
index 0000000000..0cfd880044
--- /dev/null
+++ b/ops/metrics/terraform/gcp/modules/compute_instances/otelcollector/variables.tf
@@ -0,0 +1,46 @@
+variable "otel_collector_instance_type" {
+  description = "The instance type for the otel collector"
+  type        = string
+}
+
+variable "zone" {
+  description = "The zone in which to provision instances"
+  type        = string
+}
+
+variable "boot_size" {
+  description = "The size of the boot disk"
+  type        = number
+  default     = 50
+}
+
+variable "boot_image" {
+  description = "The boot image for the instances"
+  type        = string
+}
+
+variable "cloud_init_content" {
+  description = "Content of the cloud-init script"
+  type        = string
+}
+
+variable "network" {
+  description = "The VPC network to attach to the instances"
+  type        = string
+}
+
+variable "subnetwork" {
+  description = "The subnetwork to attach to the instances"
+  type        = string
+}
+
+variable "grafana_prometheus_username" {
+  description = "username for hosted grafana prometheus"
+  type        = string
+}
+
+variable "grafana_prometheus_password" {
+  description = "password for hosted grafana prometheus"
+  type        = string
+  sensitive   = true
+}
\ No newline at end of file
diff --git a/ops/metrics/terraform/gcp/modules/network/main.tf b/ops/metrics/terraform/gcp/modules/network/main.tf
new file mode 100644
index 0000000000..34378daf94
--- /dev/null
+++ b/ops/metrics/terraform/gcp/modules/network/main.tf
@@ -0,0 +1,52 @@
+resource "google_compute_network" "vpc_network" {
+  name                    = "${var.region}-bacalhau-otel-vpc-network"
+  auto_create_subnetworks = var.auto_subnets
+}
+
+resource "google_compute_subnetwork" "subnetwork" {
+  name          = "${var.region}-bacalhau-otel-subnetwork"
+  ip_cidr_range = var.subnet_cidr
+  region        = var.region
+  network       = google_compute_network.vpc_network.name
+}
+
+resource "google_compute_address" "otel_collector_ip" {
+  name   = "otel-collector-ip"
+  region = var.region
+}
+
+resource "google_compute_firewall" "google_firewall_egress" {
+  name    = "bacalhau-otel-firewall-egress"
+  network = google_compute_network.vpc_network.name
+
+  direction = "EGRESS"
+
+  allow {
+    protocol = "icmp"
+  }
+
+  allow {
+    protocol = "tcp"
+    ports    = var.egress_tcp_ports
+  }
+
+  source_ranges = var.egress_source_ranges
+}
+
+resource "google_compute_firewall" "bacalhau_protocol_firewall_ingress" {
+  name    = "bacalhau-otel-firewall-ingress"
+  network = google_compute_network.vpc_network.name
+
+  direction = "INGRESS"
+
+  allow {
+    protocol = "icmp"
+  }
+
+  allow {
+    protocol = "tcp"
+    ports    = var.ingress_tcp_ports
+  }
+
+  source_ranges = var.ingress_source_ranges
+}
diff --git a/ops/metrics/terraform/gcp/modules/network/outputs.tf b/ops/metrics/terraform/gcp/modules/network/outputs.tf
new file mode 100644
index 0000000000..91cb7fb2cf
--- /dev/null
+++ b/ops/metrics/terraform/gcp/modules/network/outputs.tf
@@ -0,0 +1,11 @@
+output "vpc_network_name" {
+  value = google_compute_network.vpc_network.name
+}
+
+output "subnetwork_name" {
+  value = google_compute_subnetwork.subnetwork.name
+}
+
+output "requester_ip" {
+  value = google_compute_address.otel_collector_ip.address
+}
diff --git a/ops/metrics/terraform/gcp/modules/network/variables.tf b/ops/metrics/terraform/gcp/modules/network/variables.tf
new file mode 100644
index 0000000000..82042d3160
--- /dev/null
+++ b/ops/metrics/terraform/gcp/modules/network/variables.tf
@@ -0,0 +1,53 @@
+variable "region" {
+  description = "The region to host the network in"
+  type        = string
+}
+
+variable "subnet_cidr" {
+  description = "The CIDR block for the subnet"
+  type        = string
+}
+
+variable "auto_subnets" {
+  description = "When true GCP will automatically create subnetworks"
+  type        = bool
+  default     = true
+}
+
+//
+// Egress
+//
+variable "egress_tcp_ports" {
+  description = "List of TCP ports for egress rules"
+  type        = list(string)
+  default = [
+    // Grafana
+    "443"
+  ]
+}
+
+variable "egress_source_ranges" {
+  description = "Source ranges for egress rules"
+  type        = list(string)
+  default     = ["0.0.0.0/0"]
+}
+
+//
+// Ingress
+//
+variable "ingress_tcp_ports" {
+  description = "List of TCP ports for ingress rules"
+  type        = list(string)
+  default = [
+    // SSH
+    "22",
+    // OpenTelemetry collector
+    "4318"
+  ]
+}
+
+variable "ingress_source_ranges" {
+  description = "Source ranges for ingress rules"
+  type        = list(string)
+  default     = ["0.0.0.0/0"]
+}
diff --git a/ops/metrics/terraform/gcp/outputs.tf b/ops/metrics/terraform/gcp/outputs.tf
new file mode 100644
index 0000000000..ff9b306a48
--- /dev/null
+++ b/ops/metrics/terraform/gcp/outputs.tf
@@ -0,0 +1,3 @@
+output "otel_collector_public_ip" {
+  value = module.otel_collector_instance.otel_collector_public_ips
+}
diff --git a/ops/metrics/terraform/gcp/variables.tf b/ops/metrics/terraform/gcp/variables.tf
new file mode 100644
index 0000000000..c997be0f9d
--- /dev/null
+++ b/ops/metrics/terraform/gcp/variables.tf
@@ -0,0 +1,36 @@
+variable "gcp_project_id" {
+  description = "GCP Project ID"
+  type        = string
+}
+
+variable "gcp_region" {
+  description = "GCP Region"
+  type        = string
+}
+
+variable "gcp_zone" {
+  description = "GCP Zone"
+  type        = string
+}
+
+variable "gcp_boot_image" {
+  description = "Boot image for GCP instances"
+  type        = string
+  default     = "projects/ubuntu-os-cloud/global/images/family/ubuntu-2304-amd64"
+}
+
+variable "otel_collector_machine_type" {
+  description = "Machine type for collector instances"
+  type        = string
+}
+
+variable "grafana_prometheus_username" {
+  description = "username for hosted grafana prometheus"
+  type        = string
+}
+
+variable "grafana_prometheus_password" {
+  description = "password for hosted grafana prometheus"
+  type        = string
+  sensitive   = true
+}
diff --git a/ops/metrics/terraform/gcp/vars.tfvars b/ops/metrics/terraform/gcp/vars.tfvars
new file mode 100644
index 0000000000..8bba94ee9c
--- /dev/null
+++ b/ops/metrics/terraform/gcp/vars.tfvars
@@ -0,0 +1,7 @@
+gcp_project_id              = "forrest-dev-407420"
+gcp_region                  = "us-west1"
+gcp_zone                    = "us-west1-b"
+gcp_boot_image              = "projects/forrest-dev-407420/global/images/bacalhau-ubuntu-2004-lts-test"
+otel_collector_machine_type = "e2-standard-4"
+grafana_prometheus_username = ""
+grafana_prometheus_password = ""
diff --git a/ops/metrics/terraform/instance_files/otel-collector-config.yaml b/ops/metrics/terraform/instance_files/otel-collector-config.yaml
new file mode 100644
index 0000000000..7fd63ff9ac
--- /dev/null
+++ b/ops/metrics/terraform/instance_files/otel-collector-config.yaml
@@ -0,0 +1,45 @@
+extensions:
+  basicauth/prometheus:
+    client_auth:
+      username: ${grafana_prometheus_username}
+      password: ${grafana_prometheus_password}
+
+receivers:
+  otlp:
+    protocols:
+      http:
+        endpoint: "0.0.0.0:4318"
+
+exporters:
+  logging:
+    loglevel: debug
+  prometheusremotewrite:
+    endpoint: https://prometheus-us-central1.grafana.net/api/prom/push
+    auth:
+      authenticator: basicauth/prometheus
+    resource_to_telemetry_conversion:
+      enabled: true
+
+processors:
+  batch:
+  memory_limiter:
+    check_interval: 5s
+    limit_mib: 4000
+    spike_limit_mib: 500
+  resource:
+    attributes:
+      - key: service.collector
+        value: bacalhau-otel-collector
+        action: insert
+  attributes/metrics:
+    actions:
+      - pattern: net\.sock.+
+        action: delete
+
+service:
+  extensions: [basicauth/prometheus]
+  pipelines:
+    metrics:
+      receivers: [otlp]
+      processors: [memory_limiter, resource, attributes/metrics, batch]
+      exporters: [prometheusremotewrite, logging]
\ No newline at end of file
diff --git a/ops/metrics/terraform/instance_files/otel.service b/ops/metrics/terraform/instance_files/otel.service
new file mode 100644
index 0000000000..2679ca616c
--- /dev/null
+++ b/ops/metrics/terraform/instance_files/otel.service
@@ -0,0 +1,14 @@
+[Unit]
+Description=otel collector
+Documentation=https://opentelemetry.io/docs/collector/
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+User=root
+Group=root
+Type=simple
+ExecStart=/usr/local/bin/otelcol --config=/etc/otel-collector.yaml
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ops/metrics/terraform/instance_files/start.sh b/ops/metrics/terraform/instance_files/start.sh
new file mode 100644
index 0000000000..adffec5c01
--- /dev/null
+++ b/ops/metrics/terraform/instance_files/start.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# fetch the 64-bit x86 (amd64) collector build and install it as /usr/local/bin/otelcol
+function install-otel-collector() {
+  wget "https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download/v0.92.0/otelcol-contrib_0.92.0_linux_amd64.tar.gz"
+  tar xvf "otelcol-contrib_0.92.0_linux_amd64.tar.gz"
+  sudo mv otelcol-contrib /usr/local/bin/otelcol
+}
+
+# reload service files and enable services
+function setup-services() {
+  echo "Loading systemctl services..."
+  sudo systemctl daemon-reload
+  echo "Enabling systemctl services..."
+  sudo systemctl enable otel.service
+}
+
+# start services
+function start-services() {
+  echo "Starting systemctl services..."
+  sudo systemctl restart otel.service
+}
+
+# setup and start everything
+function start() {
+  echo "Starting..."
+  # TODO move this into the VMI, maybe?
+  install-otel-collector
+  setup-services
+  start-services
+}
+
+start &> /var/log/startup-script.log
diff --git a/ops/tf/main.tf b/ops/tf/main.tf
index 6bda588c02..b27cb9baaa 100644
--- a/ops/tf/main.tf
+++ b/ops/tf/main.tf
@@ -25,6 +25,7 @@ module "requester_instance" {
   aws_secret_access_key = var.aws_secret_access_key
   bacalhau_accept_networked_jobs = var.bacalhau_accept_networked_jobs
   bacalhau_repo_disk_size = var.bacalhau_repo_disk_size
+  bacalhau_otel_collector_endpoint = var.bacalhau_otel_collector_endpoint
 }
 
 module "compute_instance" {
@@ -51,4 +52,5 @@ module "compute_instance" {
   bacalhau_accept_networked_jobs = var.bacalhau_accept_networked_jobs
   bacalhau_repo_disk_size = var.bacalhau_repo_disk_size
   bacalhau_local_disk_size = var.bacalhau_local_disk_size
+  bacalhau_otel_collector_endpoint = var.bacalhau_otel_collector_endpoint
 }
\ No newline at end of file
diff --git a/ops/tf/modules/gcp/compute_instances/compute/main.tf b/ops/tf/modules/gcp/compute_instances/compute/main.tf
index 9940e9678c..14370dfe55 100644
--- a/ops/tf/modules/gcp/compute_instances/compute/main.tf
+++ b/ops/tf/modules/gcp/compute_instances/compute/main.tf
@@ -108,6 +108,7 @@ locals {
   // templating otel config file
   //
   otel_config_content = templatefile("${path.module}/../../../instance_files/otel-collector.yaml", {
+    bacalhau_otel_collector_endpoint = var.bacalhau_otel_collector_endpoint
     // add more arguments as needed
   })
 
diff --git a/ops/tf/modules/gcp/compute_instances/compute/variables.tf b/ops/tf/modules/gcp/compute_instances/compute/variables.tf
index 6b4f1777b7..857b369eab 100644
--- a/ops/tf/modules/gcp/compute_instances/compute/variables.tf
+++ b/ops/tf/modules/gcp/compute_instances/compute/variables.tf
@@ -69,3 +69,9 @@ variable "bacalhau_local_disk_size" {
   description = "The size of the disk in GB bacalhau will to store local data"
   type = number
 }
+
+
+variable "bacalhau_otel_collector_endpoint" {
+  description = "The opentelemetry collector endpoint to send metrics to"
+  type = string
+}
\ No newline at end of file
diff --git a/ops/tf/modules/gcp/compute_instances/requester/main.tf b/ops/tf/modules/gcp/compute_instances/requester/main.tf
index 48238fd66a..e592ec02ba 100644
--- a/ops/tf/modules/gcp/compute_instances/requester/main.tf
+++ b/ops/tf/modules/gcp/compute_instances/requester/main.tf
@@ -87,7 +87,7 @@ locals {
   // templating otel config file
   //
   otel_config_content = templatefile("${path.module}/../../../instance_files/otel-collector.yaml", {
-    // add more arguments as needed
+    bacalhau_otel_collector_endpoint = var.bacalhau_otel_collector_endpoint
   })
 
   //
diff --git a/ops/tf/modules/gcp/compute_instances/requester/variables.tf b/ops/tf/modules/gcp/compute_instances/requester/variables.tf
index 954d9d5ad4..f34bebe8e9 100644
--- a/ops/tf/modules/gcp/compute_instances/requester/variables.tf
+++ b/ops/tf/modules/gcp/compute_instances/requester/variables.tf
@@ -59,3 +59,8 @@ variable "bacalhau_repo_disk_size" {
   description = "The size of the disk in GB bacalhau will to store its repo"
   type = number
 }
+
+variable "bacalhau_otel_collector_endpoint" {
+  description = "The opentelemetry collector endpoint to send metrics to"
+  type = string
+}
diff --git a/ops/tf/modules/instance_files/otel-collector.yaml b/ops/tf/modules/instance_files/otel-collector.yaml
index 462cfa3568..a829504f6f 100644
--- a/ops/tf/modules/instance_files/otel-collector.yaml
+++ b/ops/tf/modules/instance_files/otel-collector.yaml
@@ -48,14 +48,19 @@ processors:
 exporters:
   logging:
     loglevel: debug
+  otlphttp:
+    endpoint: ${bacalhau_otel_collector_endpoint}
+    tls:
+      insecure: true
+      insecure_skip_verify: true
 
 service:
   pipelines:
     traces:
       receivers: [otlp]
       processors: [memory_limiter, resourcedetection/gcp, resource, attributes/metrics, batch]
-      exporters: [logging]
+      exporters: [logging, otlphttp]
     metrics:
       receivers: [otlp, hostmetrics]
       processors: [memory_limiter, resourcedetection/gcp, resource, attributes/metrics, batch]
-      exporters: [logging]
\ No newline at end of file
+      exporters: [logging, otlphttp]
\ No newline at end of file
diff --git a/ops/tf/variables.tf b/ops/tf/variables.tf
index 5e2dc02405..d904c2c40e 100644
--- a/ops/tf/variables.tf
+++ b/ops/tf/variables.tf
@@ -60,3 +60,8 @@ variable "bacalhau_local_disk_size" {
   type = number
   default = 50
 }
+
+variable "bacalhau_otel_collector_endpoint" {
+  description = "The opentelemetry collector endpoint to send metrics to"
+  type = string
+}
diff --git a/ops/tf/vars.tfvars b/ops/tf/vars.tfvars
index a8e0c63347..19802e158c 100644
--- a/ops/tf/vars.tfvars
+++ b/ops/tf/vars.tfvars
@@ -17,3 +17,5 @@ aws_access_key_id = ""
 aws_secret_access_key = ""
 
 bacalhau_accept_networked_jobs = true
+
+bacalhau_otel_collector_endpoint = ""
\ No newline at end of file