Skip to content

Commit

Permalink
[testing-on-gke] fixes and improvements (#2581)
Browse files Browse the repository at this point in the history
Description
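Add missing configs to the create-cluster command (a bug in the current bash script).
Remove PII from the bash script (privacy concern): the hard-coded default project ID, project number and cluster name are dropped, so these must now be supplied by the user.
Expose force_update_gcsfuse_code (to allow users to use a specific code branch for testing even on their local clone of the gcsfuse repo).
Handle pod-status=Unknown (this value of pod-status was encountered recently and has to be handled the same way as an error).
Fix the start/end log messages in run-automated (minor improvement): the variables were inside single quotes and so were printed literally instead of being expanded.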

* fix log in run-automated

* Handle pod-status=Unknown

* expose force_update_gcsfuse_code

* remove pii from bash script

* Add missing configs in create-cluster command

Added the following flags to the `gcloud container clusters create` command:
1. --network-performance-configs=total-egress-bandwidth-tier=TIER_1
2. --workload-metadata=GKE_METADATA
  • Loading branch information
gargnitingoogle authored Oct 11, 2024
1 parent 766dec9 commit ea81f26
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 21 deletions.
4 changes: 2 additions & 2 deletions perfmetrics/scripts/testing_on_gke/examples/run-automated.sh
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,13 @@ rm -rfv log fio/output.csv dlio/output.csv

# Run the script.
start_time=$(date +%Y-%m-%dT%H:%M:%SZ)
echo 'Run started at ${start_time}'
echo "Run started at ${start_time}"
touch log
(./run-gke-tests.sh --debug |& tee -a log) || true
# Use the following if you want to run it in a tmux session instead.
# tmux new-session -d -s ${instance_id} 'bash -c "(./run-gke-tests.sh --debug |& tee -a log); sleep 604800 "'
end_time=$(date +%Y-%m-%dT%H:%M:%SZ)
echo 'Run ended at ${end_time}'
echo "Run ended at ${end_time}"

# Some post-run steps to be taken for output collection.
if test -n "${workload_config}"; then
Expand Down
38 changes: 19 additions & 19 deletions perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,8 @@ function exitWithError() { echoerror $@ ; exitWithFailure ; }

# Default values, to be used for parameters in case user does not specify them.
# GCP related
readonly DEFAULT_PROJECT_ID="gcs-fuse-test"
readonly DEFAULT_PROJECT_NUMBER=927584127901
readonly DEFAULT_ZONE="us-west1-b"
# GKE cluster related
readonly DEFAULT_CLUSTER_NAME="${USER}-testing-us-west1-1"
readonly DEFAULT_NODE_POOL=default-pool
readonly DEFAULT_MACHINE_TYPE="n2-standard-96"
readonly DEFAULT_NUM_NODES=8
Expand Down Expand Up @@ -74,11 +71,11 @@ function printHelp() {
echo "ENV_OPTIONS (all are optional): "
echo ""
# GCP related
echo "project_id=<project-id default=\"${DEFAULT_PROJECT_ID}\">"
echo "project_number=<number default=\"${DEFAULT_PROJECT_NUMBER}\">"
echo "project_id=<project-id>"
echo "project_number=<number>"
echo "zone=<region-zone default=\"${DEFAULT_ZONE}\">"
# GKE cluster related
echo "cluster_name=<cluster-name default=\"${DEFAULT_CLUSTER_NAME}\">"
echo "cluster_name=<cluster-name>"
echo "node_pool=<pool-name default=\"${DEFAULT_NODE_POOL}\">"
echo "machine_type=<machine-type default=\"${DEFAULT_MACHINE_TYPE}\">"
echo "num_nodes=<number from 1-8, default=\"${DEFAULT_NUM_NODES}\">"
Expand All @@ -95,6 +92,7 @@ function printHelp() {
echo "instance_id=<string, not containing spaces, representing unique id for particular test-run e.g. \"${DEFAULT_INSTANCE_ID}\""
echo "workload_config=<path/to/workload/configuration/file e.g. /a/b/c.json >"
echo "output_dir=</absolute/path/to/output/dir, output files will be written at output_dir/fio/output.csv and output_dir/dlio/output.csv>"
echo "force_update_gcsfuse_code=<true|false, to force-update the gcsfuse-code to given branch if gcsfuse_src_dir has been set. Default=\"${DEFAULT_FORCE_UPDATE_GCSFUSE_CODE}\">"
echo ""
echo ""
echo ""
Expand All @@ -112,20 +110,17 @@ fi

# Set environment variables.
# GCP related
if test -n "${project_id}"; then
if test -z "${project_number}"; then
exitWithError "project_id was set, but not project_number. Either both should be specified, or neither."
fi
elif test -n "${project_number}"; then
exitWithError "project_number was set, but not project_id. Either both should be specified, or neither."
else
export project_id=${DEFAULT_PROJECT_ID}
export project_number=${DEFAULT_PROJECT_NUMBER}
echo "Neither project_id, nor project_number were set, so defaulting to project_id=${DEFAULT_PROJECT_ID}, project_number=${DEFAULT_PROJECT_NUMBER}"
if test -z "${project_id}"; then
exitWithError "project_id was not set"
fi
if test -z "${project_number}"; then
exitWithError "project_number was not set"
fi
test -n "${zone}" || export zone=${DEFAULT_ZONE}
# GKE cluster related
test -n "${cluster_name}" || export cluster_name=${DEFAULT_CLUSTER_NAME}
# cluster_name has no default value, so the caller must supply it; abort early
# if it is missing.
if test -z "${cluster_name}"; then
  # Name the variable literally: in this branch ${cluster_name} is empty, so
  # expanding it would produce the uninformative message " was not set.".
  exitWithError "cluster_name was not set."
fi
test -n "${node_pool}" || export node_pool=${DEFAULT_NODE_POOL}
test -n "${machine_type}" || export machine_type=${DEFAULT_MACHINE_TYPE}
test -n "${num_nodes}" || export num_nodes=${DEFAULT_NUM_NODES}
Expand Down Expand Up @@ -226,6 +221,7 @@ function printRunParameters() {
echo "instance_id=\"${instance_id}\""
echo "workload_config=\"${workload_config}\""
echo "output_dir=\"${output_dir}\""
echo "force_update_gcsfuse_code=\"${force_update_gcsfuse_code}\""
echo ""
echo ""
echo ""
Expand Down Expand Up @@ -433,7 +429,7 @@ function ensureGkeCluster() {
fi
gcloud container clusters update ${cluster_name} --project=${project_id} --location=${zone} --workload-pool=${project_id}.svc.id.goog
else
gcloud container clusters create ${cluster_name} --project=${project_id} --zone "${zone}" --workload-pool=${project_id}.svc.id.goog --machine-type "${machine_type}" --image-type "COS_CONTAINERD" --num-nodes ${num_nodes} --ephemeral-storage-local-ssd count=${num_ssd}
gcloud container clusters create ${cluster_name} --project=${project_id} --zone "${zone}" --workload-pool=${project_id}.svc.id.goog --machine-type "${machine_type}" --image-type "COS_CONTAINERD" --num-nodes ${num_nodes} --ephemeral-storage-local-ssd count=${num_ssd} --network-performance-configs=total-egress-bandwidth-tier=TIER_1 --workload-metadata=GKE_METADATA
fi
}
Expand Down Expand Up @@ -618,11 +614,15 @@ function waitTillAllPodsComplete() {
if [ ${num_completed_pods} -gt 0 ]; then
printf ${num_completed_pods}" pod(s) have completed.\n"
fi
num_noncompleted_pods=$(echo "${podslist}" | tail -n +2 | grep -i -v 'completed\|succeeded\|fail\|error' | wc -l)
num_noncompleted_pods=$(echo "${podslist}" | tail -n +2 | grep -i -v 'completed\|succeeded\|fail\|error\|unknown' | wc -l)
num_failed_pods=$(echo "${podslist}" | tail -n +2 | grep -i 'failed' | wc -l)
if [ ${num_failed_pods} -gt 0 ]; then
printf ${num_failed_pods}" pod(s) have failed.\n\n"
fi
num_unknown_pods=$(echo "${podslist}" | tail -n +2 | grep -i 'unknown' | wc -l)
if [ ${num_unknown_pods} -gt 0 ]; then
printf ${num_unknown_pods}" pod(s) have status 'Unknown'.\n\n"
fi
if [ ${num_noncompleted_pods} -eq 0 ]; then
printf "\nAll pods have completed.\n\n"
break
Expand Down

0 comments on commit ea81f26

Please sign in to comment.