updates for optimizing failure scenario (#402)

* updates for optimizing failure scenario * Update benchmark/src/main/content/bin/kc-chaos.sh Co-authored-by: Alexander Schwartz <[email protected]> --------- Co-authored-by: Alexander Schwartz <[email protected]>
keycloak · Jun 30, 2023 · e9434e9 · e9434e9
1 parent c2a38d9
commit e9434e9
Show file tree

Hide file tree

Showing 5 changed files with 78 additions and 32 deletions.
diff --git a/.github/workflows/keycloak-chaos-benchmark.yml b/.github/workflows/keycloak-chaos-benchmark.yml
@@ -28,6 +28,14 @@ on:
         description: 'Initial users per second'
         type: string
         default: '1'
+      benchmarkRunDuration:
+        description: 'Benchmark run duration'
+        type: string
+        default: '180'
+      chaosTimeout:
+        description: 'Timeout in seconds after which the chaos script (simulated failures) is stopped'
+        type: string
+        default: '120'
       skipCreateDeployment:
         description: 'Skip creating Keycloak deployment'
         type: boolean
@@ -36,10 +44,6 @@ on:
         description: 'Skip creating dataset'
         type: boolean
         default: false
-      skipDeleteProject:
-        description: 'Skip deleting project'
-        type: boolean
-        default: false
 
 concurrency: cluster_${{ inputs.clusterName || format('gh-{0}', github.repository_owner) }}
 
@@ -98,19 +102,20 @@ jobs:
           disableStickySessions: true
 
       - name: Create Keycloak dataset with "${{ inputs.numberOfEntitiesInRealm }}" clients
-        if: ${{ !inputs.skipCreateDataset }} && inputs.scenarioName == 'authentication.ClientSecret'
+        if: "!inputs.skipCreateDataset && inputs.scenarioName == 'authentication.ClientSecret'"
         uses: ./.github/actions/keycloak-create-dataset
         with:
           project: ${{ env.PROJECT }}
           clients: ${{ inputs.numberOfEntitiesInRealm }}
           maxWaitEntityCreation: ${{ inputs.maxWaitEntityCreation }}
 
       - name: Create Keycloak dataset with "${{ inputs.numberOfEntitiesInRealm }}" users
-        if: ${{ !inputs.skipCreateDataset }} && inputs.scenarioName == 'authentication.AuthorizationCode'
+        if: "!inputs.skipCreateDataset && inputs.scenarioName == 'authentication.AuthorizationCode'"
         uses: ./.github/actions/keycloak-create-dataset
         with:
           project: ${{ env.PROJECT }}
           users: ${{ inputs.numberOfEntitiesInRealm }}
+          maxWaitEntityCreation: ${{ inputs.maxWaitEntityCreation }}
 
       - name: Get URLs
         uses: ./.github/actions/get-keycloak-url
@@ -123,11 +128,10 @@ jobs:
           bin/kcb.sh --scenario=keycloak.scenario."${{ inputs.scenarioName }}" \
             --server-url=${{ env.KEYCLOAK_URL }} \
             --users-per-sec=${{ inputs.initialUsersPerSecond }}  \
-            --measurement=180 \
+            --measurement=${{ inputs.benchmarkRunDuration }} \
             --realm-name=realm-0 \
-            --clients-per-realm=${{ inputs.numberOfEntitiesUsedInTest || inputs.numberOfEntitiesInRealm }} &
-          timeout 150 bin/kc-chaos.sh &
-          wait
+            --chaos=${{ inputs.chaosTimeout }} \
+            --clients-per-realm=${{ inputs.numberOfEntitiesUsedInTest || inputs.numberOfEntitiesInRealm }}
         working-directory: keycloak-benchmark
 
       - name: Run "authentication.AuthorizationCode" failure scenario
@@ -136,22 +140,15 @@ jobs:
           bin/kcb.sh --scenario=keycloak.scenario."${{ inputs.scenarioName }}" \
             --server-url=${{ env.KEYCLOAK_URL }} \
             --users-per-sec=${{ inputs.initialUsersPerSecond }}  \
-            --measurement=180 \
+            --measurement=${{ inputs.benchmarkRunDuration }} \
             --realm-name=realm-0 \
-            --users-per-realm=${{ inputs.numberOfEntitiesUsedInTest || inputs.numberOfEntitiesInRealm }} &
-          timeout 150 bin/kc-chaos.sh &
-          wait
+            --chaos=${{ inputs.chaosTimeout }} \
+            --users-per-realm=${{ inputs.numberOfEntitiesUsedInTest || inputs.numberOfEntitiesInRealm }}
         working-directory: keycloak-benchmark
 
-      - name: Archive Gatling reports
+      - name: Archive failure benchmark reports and logs
         uses: actions/upload-artifact@v3
         with:
           name: gatling-results
           path: keycloak-benchmark/results
-          retention-days: 5
-
-      - name: Delete Keycloak deployment
-        if: ${{ !inputs.skipDeleteProject }}
-        uses: ./.github/actions/keycloak-delete-deployment
-        with:
-          project: ${{ env.PROJECT }}
+          retention-days: 14
diff --git a/benchmark/src/main/content/bin/kc-chaos.sh b/benchmark/src/main/content/bin/kc-chaos.sh
@@ -3,25 +3,41 @@
 set -e
 
 : ${INITIAL_DELAY_SECS:=30}
-: ${CHAOS_DELAY_SECS:=60}
+: ${CHAOS_DELAY_SECS:=10}
 : ${PROJECT:="runner-keycloak"}
 
-echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Entering Chaos mode, with an initial delay of $INITIAL_DELAY_SECS seconds"
-sleep $INITIAL_DELAY_SECS
-echo -e "INFO:$(date '+%F-%T-%Z') Running Chaos scenario - Delete random Keycloak pod"
-while true; do
+# Directory that receives the per-attempt log files (first argument, required).
+# Fail right away with a usage hint instead of writing to "/<n>-<pod>.log" later on.
+LOGS_DIR="${1:?Usage: kc-chaos.sh <RESULT_DIR_PATH>}"
 
+echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Entering Chaos mode, with an initial delay of $INITIAL_DELAY_SECS seconds\033[0m"
+sleep "${INITIAL_DELAY_SECS}"
+echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Running Chaos scenario - Delete random Keycloak pod\033[0m"
 
+# Each iteration: pick a random Keycloak pod, store its state and logs under
+# LOGS_DIR, delete it, and report how long the waits for recovery took.
+ATTEMPT=0
+while true; do
+  # $(( )) replaces the deprecated $[ ] arithmetic form; unlike (( ATTEMPT++ ))
+  # an assignment never returns non-zero, so it is safe under 'set -e'.
+  ATTEMPT=$(( ATTEMPT + 1 ))
   RANDOM_KC_POD=$(kubectl \
     -n "${PROJECT}" \
     -o 'jsonpath={.items[*].metadata.name}' \
     get pods -l app=keycloak | \
       tr " " "\n" | \
       shuf | \
       head -n 1)
-  echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Killing Pod '${RANDOM_KC_POD}' and waiting for ${CHAOS_DELAY_SECS} seconds"
+
+  # The pipeline above yields an empty name when no pod is listed (or kubectl
+  # failed, as pipefail is not set); retry later instead of calling kubectl
+  # with an empty pod name.
+  if [ -z "${RANDOM_KC_POD}" ]; then
+    echo -e "\033[0;31mWARN:$(date '+%F-%T-%Z') No Keycloak pod found in namespace '${PROJECT}', retrying in ${CHAOS_DELAY_SECS} seconds\033[0m"
+    sleep "${CHAOS_DELAY_SECS}"
+    continue
+  fi
+
+  kubectl get pods -n "${PROJECT}" -l app=keycloak -o wide
+  # Stream the pod's log into a file in the background while the pod gets killed.
+  kubectl logs -f -n "${PROJECT}" "${RANDOM_KC_POD}" > "$LOGS_DIR/${ATTEMPT}-${RANDOM_KC_POD}.log" 2>&1 &
+  kubectl describe -n "${PROJECT}" pod "${RANDOM_KC_POD}" > "$LOGS_DIR/${ATTEMPT}-${RANDOM_KC_POD}-complete-resource.log" 2>&1
+  kubectl top -n "${PROJECT}" pod -l app=keycloak --sum=true > "$LOGS_DIR/${ATTEMPT}-top.log" 2>&1
+  echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Killing Pod '${RANDOM_KC_POD}' and waiting for ${CHAOS_DELAY_SECS} seconds\033[0m"
   kubectl delete pod -n "${PROJECT}" "${RANDOM_KC_POD}" --grace-period=1
+
+  START=$(date +%s)
+
+  # '|| true': a wait that times out must not abort the script, which runs with 'set -e'.
+  kubectl wait --for=condition=Available --timeout=600s deployments.apps/keycloak-operator -n "${PROJECT}" || true
+  kubectl wait --for=condition=Ready --timeout=600s keycloaks.k8s.keycloak.org/keycloak -n "${PROJECT}" || true
+
+  END=$(date +%s)
+  DIFF=$(( END - START ))
+
+  echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Keycloak pod ${RANDOM_KC_POD} took ${DIFF} seconds to recover\033[0m"
   sleep "${CHAOS_DELAY_SECS}"
-  echo -e "\033[0m"
 done
-
diff --git a/benchmark/src/main/content/bin/kcb.sh b/benchmark/src/main/content/bin/kcb.sh
@@ -34,6 +34,8 @@ JAVA_OPTS="${JAVA_OPTS} -Xmx1G -XX:+HeapDumpOnOutOfMemoryError"
 DEBUG_MODE="${DEBUG:-false}"
 DEBUG_PORT="${DEBUG_PORT:-8787}"
 
+CHAOS_MODE="${CHAOS_MODE:-false}"
+
 CONFIG_ARGS=()
 SERVER_OPTS=()
 
@@ -79,6 +81,10 @@ do
           MODE=incremental
           INCREMENT=${1#*=}
           ;;
+      --chaos=*)
+          CHAOS_MODE=true
+          CHAOS_TIMEOUT=${1#*=}
+          ;;
       --)
           shift
           break
@@ -162,6 +168,14 @@ EOF
   return ${EXIT_RESULT}
 }
 
+# Chaos mode: start kc-chaos.sh in the background, limited by CHAOS_TIMEOUT,
+# so that it runs in parallel with the benchmark started below.
+if [ "$CHAOS_MODE" = "true" ]; then
+    # Assign LOGS_DIR first so that the message below prints the actual path.
+    LOGS_DIR="$DIRNAME/../results/logs/"
+    echo "INFO: Running benchmark with chaos mode, logs output will be available in: $LOGS_DIR"
+
+    mkdir -p "$LOGS_DIR"
+    # The chaos script's output is shown on the console and also written to kc-chaos.log.
+    timeout "${CHAOS_TIMEOUT}" bash bin/kc-chaos.sh "${LOGS_DIR}" 2>&1 | tee "${LOGS_DIR}/kc-chaos.log" &
+fi
+
 if [ "$MODE" = "incremental" ]; then
   echo "INFO: Running benchmark in incremental mode."
   MAX_ATTEMPTS=100
@@ -190,7 +204,7 @@ if [ "$MODE" = "incremental" ]; then
       exit 1
     fi
 
-    ((ATTEMPT++))
+    ATTEMPT=$[ATTEMPT + 1]
 
     run_benchmark_with_workload "$WORKLOAD_UNIT" "$CURRENT_WORKLOAD" "$MEASUREMENT" "$RESULT_ROOT_DIR/$WORKLOAD_UNIT-$CURRENT_WORKLOAD"
 
@@ -237,3 +251,15 @@ else
   fi
   exit
 fi
+
+# Chaos mode: store the logs, description and resource usage of all Keycloak
+# pods under LOGS_DIR once the benchmark is done.
+# NOTE(review): the 'else' branch directly above ends with 'exit'; confirm
+# that this block is actually reached for a single-run (chaos) benchmark.
+if [ "$CHAOS_MODE" = "true" ]; then
+    # Quoted so that the default assignment is not subject to word splitting or globbing.
+    : "${PROJECT:=runner-keycloak}"
+    echo "INFO: Collecting logs at the end of the Chaos benchmark run"
+    PODS=$(kubectl -n "${PROJECT}" -o 'jsonpath={.items[*].metadata.name}' get pods -l app=keycloak | tr " " "\n")
+    # Intentionally unquoted: PODS holds one pod name per line.
+    for POD in $PODS; do
+      kubectl logs -n "${PROJECT}" "${POD}" > "$LOGS_DIR/End-of-run-${POD}.log" 2>&1
+      kubectl describe -n "${PROJECT}" pod "${POD}" > "$LOGS_DIR/End-of-run-${POD}-complete-resource.log" 2>&1
+    done
+    kubectl top -n "${PROJECT}" pod -l app=keycloak --sum=true > "$LOGS_DIR/End-of-run-top.log" 2>&1
+    kubectl get pods -n "${PROJECT}" -l app=keycloak -o wide
+fi
diff --git a/doc/benchmark/modules/ROOT/pages/kcb-modes.adoc b/doc/benchmark/modules/ROOT/pages/kcb-modes.adoc
@@ -6,4 +6,6 @@ In the `--incremental` mode, the code sets up a benchmark to go through a warm-u
 
 In the `--single-run` mode, which is the default mode when the `--incremental` flag is not set on the CLI command, the script directly runs the benchmark with the provided workload and exits without any further processing.
 
+In the `--chaos` mode, the script runs a single-run mode benchmark in parallel with the failure-simulating `kc-chaos.sh` script, which repeatedly kills a random Keycloak pod and pauses between the failures. The value passed as `--chaos=<CHAOS_TIMEOUT>` stops the simulated failures after the given timeout.
+
 Overall the `kcb.sh` script executes the benchmark in either incremental or single-run mode, handles errors and provides informative output messages during the process.
diff --git a/doc/kubernetes/modules/ROOT/pages/util/kc-chaos.adoc b/doc/kubernetes/modules/ROOT/pages/util/kc-chaos.adoc
@@ -25,7 +25,7 @@ Once there is enough load going against the Keycloak application hosted on an ex
 
 [source,bash]
 ----
-./kc-chaos.sh
+./kc-chaos.sh <RESULT_DIR_PATH>
 ----
 
 Set the environment variables below to configure on how and where this script gets executed.
@@ -35,3 +35,8 @@ Set the environment variables below to configure on how and where this script ge
 `CHAOS_DELAY_SECS`:: Time in seconds the script waits between simulating failures.
 
 `PROJECT`:: Namespace of the Keycloak pods.
+
+
+=== Collecting the results
+
+The chaos script also collects information about the Keycloak failures, the Keycloak pod utilization, the Keycloak pod restarts, and the Keycloak logs (both before killing a Keycloak pod and at the end of the run), and stores it under the `results/logs` directory.