# .github/workflows/workload_checks.yml

# Workload Checks Suite
#
# Runs Vector Workload Checks.
#
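# Runs on:
#  - a schedule: midnight UTC, Tuesday through Saturday
#  - manual dispatch (`workflow_dispatch`)
#  - invocation from another workflow (`workflow_call`)
#  - on demand by a PR comment matching either of:
#    - '/ci-run-regression'
#    - '/ci-run-all'
#    (the comment issuer must be a member of the Vector GH team)
#    NOTE(review): no comment trigger is declared in this file; presumably a
#    separate workflow handles the comment and calls this one - confirm.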
#
# This workflow runs our collection of workload checks against the latest Vector
# nightly image, so the image under test depends on when the workflow is invoked.
#
# The goal is to establish a baseline of check results for a variety of cases
# and visualize trends for important Vector use cases.

# Name of the workflow as displayed by GitHub Actions.
name: "Workload Checks Suite"

# Workflow-level environment: available to every step of every job below.
env:
  SINGLE_MACHINE_PERFORMANCE_API: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_API }}

on:
  # Nightly run at 00:00 UTC, Tuesday through Saturday (cron days 2-6).
  schedule:
    - cron: "0 0 * * 2-6"
  # Manual run, started by hand.
  workflow_dispatch:
  # Lets other workflows invoke this one as a reusable workflow.
  workflow_call:

jobs:
  # Resolves the commit under test and the fixed experiment parameters, and
  # publishes both as job outputs for the jobs that `needs` this one.
  compute-metadata:
    name: Compute metadata
    runs-on: ubuntu-latest
    outputs:
      # Experiment parameters, written by the `experimental-meta` step.
      replicas: ${{ steps.experimental-meta.outputs.REPLICAS }}
      warmup-seconds: ${{ steps.experimental-meta.outputs.WARMUP_SECONDS }}
      total-samples: ${{ steps.experimental-meta.outputs.TOTAL_SAMPLES }}
      smp-version: ${{ steps.experimental-meta.outputs.SMP_CRATE_VERSION }}
      lading-version: ${{ steps.experimental-meta.outputs.LADING_VERSION }}

      # HEAD commit of the checked-out workspace, written by `git-metadata`.
      target-sha: ${{ steps.git-metadata.outputs.TARGET_SHA }}

    steps:
      - uses: actions/checkout@v3

      - name: Get git metadata
        id: git-metadata
        run: |
          # Plain assignment inside the condition (rather than
          # `export VAR=$(...)`) so a failing `git rev-parse` is detected
          # instead of being hidden behind the exit status of `export`.
          # The SHA is validated BEFORE it is published, so an empty value
          # never reaches the step outputs.
          if ! TARGET_SHA="$(git rev-parse HEAD)" || [ -z "${TARGET_SHA}" ] ; then
            echo "TARGET_SHA not found, exiting."
            exit 1
          fi

          echo "target sha is: ${TARGET_SHA}"

          echo "TARGET_SHA=${TARGET_SHA}" >> "${GITHUB_OUTPUT}"

      - name: Setup experimental metadata
        id: experimental-meta
        run: |
          # Fixed experiment parameters; edit here to change what is submitted.
          WARMUP_SECONDS="45"
          REPLICAS="10"
          TOTAL_SAMPLES="600"
          SMP_CRATE_VERSION="0.11.0"
          LADING_VERSION="0.19.1"

          # Log the values so they are visible in the run output.
          echo "warmup seconds: ${WARMUP_SECONDS}"
          echo "replicas: ${REPLICAS}"
          echo "total samples: ${TOTAL_SAMPLES}"
          echo "smp crate version: ${SMP_CRATE_VERSION}"
          echo "lading version: ${LADING_VERSION}"

          # Publish every parameter as a step output in a single append.
          {
            echo "WARMUP_SECONDS=${WARMUP_SECONDS}"
            echo "REPLICAS=${REPLICAS}"
            echo "TOTAL_SAMPLES=${TOTAL_SAMPLES}"
            echo "SMP_CRATE_VERSION=${SMP_CRATE_VERSION}"
            echo "LADING_VERSION=${LADING_VERSION}"
          } >> "${GITHUB_OUTPUT}"

  # Submits the workload-checks experiment with the `smp` CLI, waits for it to
  # finish, and mirrors the outcome onto the target commit as the commit status
  # 'Workload Checks Suite / submission'.
  submit-job:
    name: Submit workload checks job
    runs-on: ubuntu-latest
    needs:
      - compute-metadata
    steps:
      # Mark the commit status as pending before any real work starts.
      - name: Check status, in-progress
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
            -f state='pending' \
            -f description='Experiments submitted to the Workload Checks cluster.' \
            -f context='Workload Checks Suite / submission' \
            -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

      - uses: actions/checkout@v3

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4.0.1
        with:
          aws-access-key-id: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_SECRET_ACCESS_KEY }}
          aws-region: us-west-2

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      # Fetch the `smp` CLI at the version chosen by the compute-metadata job.
      - name: Download SMP binary
        run: |
          aws s3 cp s3://smp-cli-releases/v${{ needs.compute-metadata.outputs.smp-version }}/x86_64-unknown-linux-gnu/smp ${{ runner.temp }}/bin/smp

      # Submit the experiment. `smp` is given the path
      # `${{ runner.temp }}/submission-metadata`, which the later steps reuse
      # to upload, await, and cancel the job.
      - name: Submit job
        env:
          RUST_LOG: info
        run: |
          CURRENT_DATE=$(date --utc '+%Y_%m_%d')
          # Replaces the step-level RUST_LOG for the submission command only.
          RUST_LOG_DEBUG="debug,aws_config::profile::credentials=error"

          chmod +x ${{ runner.temp }}/bin/smp
          RUST_BACKTRACE=1 RUST_LOG="${RUST_LOG_DEBUG}" ${{ runner.temp }}/bin/smp \
            --team-id ${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }} \
            job submit-workload \
            --lading-version ${{ needs.compute-metadata.outputs.lading-version }} \
            --total-samples ${{ needs.compute-metadata.outputs.total-samples }} \
            --warmup-seconds ${{ needs.compute-metadata.outputs.warmup-seconds }} \
            --replicas ${{ needs.compute-metadata.outputs.replicas }} \
            --target-image timberio/vector:nightly-debian \
            --target-sha ${{ needs.compute-metadata.outputs.target-sha }} \
            --target-config-dir ${{ github.workspace }}/workload-checks \
            --target-name vector \
            --target-command "/usr/bin/vector" \
            --target-environment-variables "DD_HOSTNAME=smp-workload-checks,DD_DD_URL=http://127.0.0.1:9092,DD_API_KEY=00000001" \
            --tags smp_status=nightly,client_team="vector",tag_date="${CURRENT_DATE}" \
            --submission-metadata ${{ runner.temp }}/submission-metadata

      # Keep the submission metadata as a run artifact.
      - uses: actions/upload-artifact@v3
        with:
          name: vector-submission-metadata
          path: ${{ runner.temp }}/submission-metadata

      # Wait for the submitted job. The step-level timeout (120 min) is larger
      # than the `--wait-timeout-minutes` value (90) passed to `smp`.
      - name: Await job
        timeout-minutes: 120
        env:
          RUST_LOG: info
        run: |
          chmod +x ${{ runner.temp }}/bin/smp

          ${{ runner.temp }}/bin/smp --team-id ${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }} \
            job status \
            --wait \
            --wait-delay-seconds 60 \
            --wait-timeout-minutes 90 \
            --submission-metadata ${{ runner.temp }}/submission-metadata

      # Only when the workflow run is cancelled: cancel the submitted job too.
      - name: Handle cancellation if necessary
        if: ${{ cancelled() }}
        env:
          RUST_LOG: info
        run: |
          chmod +x ${{ runner.temp }}/bin/smp
          ${{ runner.temp }}/bin/smp --team-id ${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }} job cancel \
            --submission-metadata ${{ runner.temp }}/submission-metadata

      # Only when the workflow run is cancelled: report it on the commit.
      - name: Check status, cancelled
        if: ${{ cancelled() }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
            -f state='failure' \
            -f description='Experiments submitted to the Workload Checks cluster cancelled.' \
            -f context='Workload Checks Suite / submission' \
            -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

      # Runs only when every previous step succeeded. `success()` is already
      # the implicit default; it is spelled out to mirror the cancelled and
      # failure steps.
      - name: Check status, success
        if: ${{ success() }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
            -f state='success' \
            -f description='Experiments submitted to the Workload Checks cluster successfully.' \
            -f context='Workload Checks Suite / submission' \
            -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

      # Runs when a previous step failed. The status must be 'failure' so the
      # commit is not shown as passing when the submission did not succeed.
      - name: Check status, failure
        if: ${{ failure() }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
            -f state='failure' \
            -f description='Experiments submitted to the Workload Checks Suite failed.' \
            -f context='Workload Checks Suite / submission' \
            -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}