From e9434e9eeaafd0f3d092e02bb7ae4a5af1be8c28 Mon Sep 17 00:00:00 2001 From: Kamesh Akella Date: Fri, 30 Jun 2023 07:36:13 -0400 Subject: [PATCH] updates for optimizing failure scenario (#402) * updates for optimizing failure scenario * Update benchmark/src/main/content/bin/kc-chaos.sh Co-authored-by: Alexander Schwartz --------- Co-authored-by: Alexander Schwartz --- .../workflows/keycloak-chaos-benchmark.yml | 41 +++++++++---------- benchmark/src/main/content/bin/kc-chaos.sh | 32 +++++++++++---- benchmark/src/main/content/bin/kcb.sh | 28 ++++++++++++- .../modules/ROOT/pages/kcb-modes.adoc | 2 + .../modules/ROOT/pages/util/kc-chaos.adoc | 7 +++- 5 files changed, 78 insertions(+), 32 deletions(-) diff --git a/.github/workflows/keycloak-chaos-benchmark.yml b/.github/workflows/keycloak-chaos-benchmark.yml index 12b190df..202a6dd7 100644 --- a/.github/workflows/keycloak-chaos-benchmark.yml +++ b/.github/workflows/keycloak-chaos-benchmark.yml @@ -28,6 +28,14 @@ on: description: 'Initial users per second' type: string default: '1' + benchmarkRunDuration: + description: 'Benchmark run duration' + type: string + default: '180' + chaosTimeout: + description: 'Benchmark run duration' + type: string + default: '120' skipCreateDeployment: description: 'Skip creating Keycloak deployment' type: boolean @@ -36,10 +44,6 @@ on: description: 'Skip creating dataset' type: boolean default: false - skipDeleteProject: - description: 'Skip deleting project' - type: boolean - default: false concurrency: cluster_${{ inputs.clusterName || format('gh-{0}', github.repository_owner) }} @@ -98,7 +102,7 @@ jobs: disableStickySessions: true - name: Create Keycloak dataset with "${{ inputs.numberOfEntitiesInRealm }}" clients - if: ${{ !inputs.skipCreateDataset }} && inputs.scenarioName == 'authentication.ClientSecret' + if: "!inputs.skipCreateDataset && inputs.scenarioName == 'authentication.ClientSecret'" uses: ./.github/actions/keycloak-create-dataset with: project: ${{ env.PROJECT }} @@ 
-106,11 +110,12 @@ jobs: maxWaitEntityCreation: ${{ inputs.maxWaitEntityCreation }} - name: Create Keycloak dataset with "${{ inputs.numberOfEntitiesInRealm }}" users - if: ${{ !inputs.skipCreateDataset }} && inputs.scenarioName == 'authentication.AuthorizationCode' + if: "!inputs.skipCreateDataset && inputs.scenarioName == 'authentication.AuthorizationCode'" uses: ./.github/actions/keycloak-create-dataset with: project: ${{ env.PROJECT }} users: ${{ inputs.numberOfEntitiesInRealm }} + maxWaitEntityCreation: ${{ inputs.maxWaitEntityCreation }} - name: Get URLs uses: ./.github/actions/get-keycloak-url @@ -123,11 +128,10 @@ jobs: bin/kcb.sh --scenario=keycloak.scenario."${{ inputs.scenarioName }}" \ --server-url=${{ env.KEYCLOAK_URL }} \ --users-per-sec=${{ inputs.initialUsersPerSecond }} \ - --measurement=180 \ + --measurement=${{ inputs.benchmarkRunDuration }} \ --realm-name=realm-0 \ - --clients-per-realm=${{ inputs.numberOfEntitiesUsedInTest || inputs.numberOfEntitiesInRealm }} & - timeout 150 bin/kc-chaos.sh & - wait + --chaos=${{ inputs.chaosTimeout }} \ + --clients-per-realm=${{ inputs.numberOfEntitiesUsedInTest || inputs.numberOfEntitiesInRealm }} working-directory: keycloak-benchmark - name: Run "authentication.AuthorizationCode" failure scenario @@ -136,22 +140,15 @@ jobs: bin/kcb.sh --scenario=keycloak.scenario."${{ inputs.scenarioName }}" \ --server-url=${{ env.KEYCLOAK_URL }} \ --users-per-sec=${{ inputs.initialUsersPerSecond }} \ - --measurement=180 \ + --measurement=${{ inputs.benchmarkRunDuration }} \ --realm-name=realm-0 \ - --users-per-realm=${{ inputs.numberOfEntitiesUsedInTest || inputs.numberOfEntitiesInRealm }} & - timeout 150 bin/kc-chaos.sh & - wait + --chaos=${{ inputs.chaosTimeout }} \ + --users-per-realm=${{ inputs.numberOfEntitiesUsedInTest || inputs.numberOfEntitiesInRealm }} working-directory: keycloak-benchmark - - name: Archive Gatling reports + - name: Archive failure benchmark reports and logs uses: actions/upload-artifact@v3 with: 
name: gatling-results path: keycloak-benchmark/results - retention-days: 5 - - - name: Delete Keycloak deployment - if: ${{ !inputs.skipDeleteProject }} - uses: ./.github/actions/keycloak-delete-deployment - with: - project: ${{ env.PROJECT }} + retention-days: 14 diff --git a/benchmark/src/main/content/bin/kc-chaos.sh b/benchmark/src/main/content/bin/kc-chaos.sh index a4db2f03..de801c10 100755 --- a/benchmark/src/main/content/bin/kc-chaos.sh +++ b/benchmark/src/main/content/bin/kc-chaos.sh @@ -3,15 +3,18 @@ set -e : ${INITIAL_DELAY_SECS:=30} -: ${CHAOS_DELAY_SECS:=60} +: ${CHAOS_DELAY_SECS:=10} : ${PROJECT:="runner-keycloak"} -echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Entering Chaos mode, with an initial delay of $INITIAL_DELAY_SECS seconds" -sleep $INITIAL_DELAY_SECS -echo -e "INFO:$(date '+%F-%T-%Z') Running Chaos scenario - Delete random Keycloak pod" -while true; do +LOGS_DIR=$1 +echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Entering Chaos mode, with an initial delay of $INITIAL_DELAY_SECS seconds\033[0m" +sleep $INITIAL_DELAY_SECS +echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Running Chaos scenario - Delete random Keycloak pod\033[0m" +ATTEMPT=0 +while true; do + ATTEMPT=$[ATTEMPT + 1] RANDOM_KC_POD=$(kubectl \ -n "${PROJECT}" \ -o 'jsonpath={.items[*].metadata.name}' \ @@ -19,9 +22,22 @@ while true; do tr " " "\n" | \ shuf | \ head -n 1) - echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Killing Pod '${RANDOM_KC_POD}' and waiting for ${CHAOS_DELAY_SECS} seconds" + + kubectl get pods -n "${PROJECT}" -l app=keycloak -o wide + kubectl logs -f -n "${PROJECT}" "${RANDOM_KC_POD}" > "$LOGS_DIR/${ATTEMPT}-${RANDOM_KC_POD}.log" 2>&1 & + kubectl describe -n "${PROJECT}" pod "${RANDOM_KC_POD}" > "$LOGS_DIR/${ATTEMPT}-${RANDOM_KC_POD}-complete-resource.log" 2>&1 + kubectl top -n "${PROJECT}" pod -l app=keycloak --sum=true > "$LOGS_DIR/${ATTEMPT}-top.log" 2>&1 + echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Killing Pod '${RANDOM_KC_POD}' and waiting for ${CHAOS_DELAY_SECS} 
seconds\033[0m" kubectl delete pod -n "${PROJECT}" "${RANDOM_KC_POD}" --grace-period=1 + + START=$(date +%s) + + kubectl wait --for=condition=Available --timeout=600s deployments.apps/keycloak-operator -n "${PROJECT}" || true + kubectl wait --for=condition=Ready --timeout=600s keycloaks.k8s.keycloak.org/keycloak -n "${PROJECT}" || true + + END=$(date +%s) + DIFF=$(( END - START )) + + echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Keycloak pod ${RANDOM_KC_POD} took ${DIFF} seconds to recover\033[0m" sleep "${CHAOS_DELAY_SECS}" - echo -e "\033[0m" done - diff --git a/benchmark/src/main/content/bin/kcb.sh b/benchmark/src/main/content/bin/kcb.sh index 630797ff..91cd0cde 100755 --- a/benchmark/src/main/content/bin/kcb.sh +++ b/benchmark/src/main/content/bin/kcb.sh @@ -34,6 +34,8 @@ JAVA_OPTS="${JAVA_OPTS} -Xmx1G -XX:+HeapDumpOnOutOfMemoryError" DEBUG_MODE="${DEBUG:-false}" DEBUG_PORT="${DEBUG_PORT:-8787}" +CHAOS_MODE="${CHAOS_MODE:-false}" + CONFIG_ARGS=() SERVER_OPTS=() @@ -79,6 +81,10 @@ do MODE=incremental INCREMENT=${1#*=} ;; + --chaos=*) + CHAOS_MODE=true + CHAOS_TIMEOUT=${1#*=} + ;; --) shift break @@ -162,6 +168,14 @@ EOF return ${EXIT_RESULT} } +if [ "$CHAOS_MODE" = "true" ]; then + echo "INFO: Running benchmark with chaos mode, logs output will be available in: $LOGS_DIR" + LOGS_DIR="$DIRNAME/../results/logs/" + + mkdir -p "$LOGS_DIR" + timeout "${CHAOS_TIMEOUT}" bash bin/kc-chaos.sh "${LOGS_DIR}" 2>&1 | tee "${LOGS_DIR}/kc-chaos.log" & +fi + if [ "$MODE" = "incremental" ]; then echo "INFO: Running benchmark in incremental mode." 
MAX_ATTEMPTS=100 @@ -190,7 +204,7 @@ if [ "$MODE" = "incremental" ]; then exit 1 fi - ((ATTEMPT++)) + ATTEMPT=$[ATTEMPT + 1] run_benchmark_with_workload "$WORKLOAD_UNIT" "$CURRENT_WORKLOAD" "$MEASUREMENT" "$RESULT_ROOT_DIR/$WORKLOAD_UNIT-$CURRENT_WORKLOAD" @@ -237,3 +251,15 @@ else fi exit fi + +if [ "$CHAOS_MODE" = "true" ]; then + : ${PROJECT:="runner-keycloak"} + echo "INFO: Collecting logs at the end of the Chaos benchmark run" + PODS=$(kubectl -n "${PROJECT}" -o 'jsonpath={.items[*].metadata.name}' get pods -l app=keycloak | tr " " "\n") + for POD in $PODS; do + kubectl logs -n "${PROJECT}" "${POD}" > "$LOGS_DIR/End-of-run-${POD}.log" 2>&1 + kubectl describe -n "${PROJECT}" pod "${POD}" > "$LOGS_DIR/End-of-run-${POD}-complete-resource.log" 2>&1 + done + kubectl top -n "${PROJECT}" pod -l app=keycloak --sum=true > "$LOGS_DIR/End-of-run-top.log" 2>&1 + kubectl get pods -n "${PROJECT}" -l app=keycloak -o wide +fi diff --git a/doc/benchmark/modules/ROOT/pages/kcb-modes.adoc b/doc/benchmark/modules/ROOT/pages/kcb-modes.adoc index 6b874e61..b1fbaae3 100644 --- a/doc/benchmark/modules/ROOT/pages/kcb-modes.adoc +++ b/doc/benchmark/modules/ROOT/pages/kcb-modes.adoc @@ -6,4 +6,6 @@ In the `--incremental` mode, the code sets up a benchmark to go through a warm-u In the `--single-run` mode, which is the default mode when the `--incremental` flag is not set on the CLI command, the script directly runs the benchmark with the provided workload and exits without any further processing. +In the `--chaos` mode, the code sets up a single-run mode benchmark in parallel with the failure-simulating `kc-chaos.sh` script, which kills a random pod after a certain time interval. The value passed to the flag (for example `--chaos=120`) is stored as `CHAOS_TIMEOUT` and stops the simulated failures after that many seconds. + Overall the `kcb.sh` script executes the benchmark in either incremental or single-run mode, handles errors and provides informative output messages during the process. 
diff --git a/doc/kubernetes/modules/ROOT/pages/util/kc-chaos.adoc b/doc/kubernetes/modules/ROOT/pages/util/kc-chaos.adoc index 040de041..7eaa3765 100644 --- a/doc/kubernetes/modules/ROOT/pages/util/kc-chaos.adoc +++ b/doc/kubernetes/modules/ROOT/pages/util/kc-chaos.adoc @@ -25,7 +25,7 @@ Once there is enough load going against the Keycloak application hosted on an ex [source,bash] ---- -./kc-chaos.sh +./kc-chaos.sh ---- Set the environment variables below to configure on how and where this script gets executed. @@ -35,3 +35,8 @@ Set the environment variables below to configure on how and where this script ge `CHAOS_DELAY_SECS`:: Time in seconds the script waits between simulating failures. `PROJECT`:: Namespace of the Keycloak pods. + + +=== Collecting the results + +The chaos script also collects information about the Keycloak failures, Keycloak pod utilization, and Keycloak pod restarts, as well as the Keycloak logs captured before killing the Keycloak pod and at the end of the run, and stores them under the `results/logs` directory.