From 57f88a573810abfb09d97ed56ea2eab489f438cc Mon Sep 17 00:00:00 2001
From: UDIT GAURAV <35391335+uditgaurav@users.noreply.github.com>
Date: Sun, 14 Jun 2020 19:11:47 +0530
Subject: [PATCH] chore(feat): Add kubelet service kill experiment in generic experiment list (#1542)

* chore(feat): Add kubelet service kill experiment in generic experiment list

Signed-off-by: Udit Gaurav
---
 .../kubelet-service-kill.j2                   |  42 +++++
 .../kubelet_service_kill.yml                  | 145 ++++++++++++++++
 .../generic/kubelet_service_kill/README.md    |  14 ++
 .../generic/kubelet_service_kill/chaosutil.j2 |   3 +
 .../kubelet_service_kill_ansible_logic.yml    | 156 ++++++++++++++++++
 ...let_service_kill_ansible_prerequisites.yml |   4 +
 .../kubelet_service_kill_k8s_job.yml          |  62 +++++++
 7 files changed, 426 insertions(+)
 create mode 100644 chaoslib/litmus/kubelet_service_kill/kubelet-service-kill.j2
 create mode 100644 chaoslib/litmus/kubelet_service_kill/kubelet_service_kill.yml
 create mode 100644 experiments/generic/kubelet_service_kill/README.md
 create mode 100644 experiments/generic/kubelet_service_kill/chaosutil.j2
 create mode 100644 experiments/generic/kubelet_service_kill/kubelet_service_kill_ansible_logic.yml
 create mode 100644 experiments/generic/kubelet_service_kill/kubelet_service_kill_ansible_prerequisites.yml
 create mode 100644 experiments/generic/kubelet_service_kill/kubelet_service_kill_k8s_job.yml

diff --git a/chaoslib/litmus/kubelet_service_kill/kubelet-service-kill.j2 b/chaoslib/litmus/kubelet_service_kill/kubelet-service-kill.j2
new file mode 100644
index 00000000000..4cd83526266
--- /dev/null
+++ b/chaoslib/litmus/kubelet_service_kill/kubelet-service-kill.j2
@@ -0,0 +1,42 @@
+---  # Helper pod template: stops kubelet on the node it is pinned to, then starts it again
+apiVersion: v1
+kind: Pod
+metadata:
+  name: service-kill-{{ run_id }}
+  labels:
+    app: service-kill
+    name: service-kill-{{ run_id }}  # the chaos lib selects the helper pod by this label
+{% if chaos_uid is defined and chaos_uid != '' %}
+    chaosUID: "{{ chaos_uid }}"
+{% endif %}
+spec:
+  nodeName: "{{ node_name }}"  # pins the pod to the chosen node; quoted so the value is always a string
+  restartPolicy: Never
+  containers:
+  - name: service-kill
+    image: ubuntu:16.04
+    command: ["/bin/bash"]
+    args: ["-c", "sleep 10 && systemctl stop kubelet && sleep {{ c_duration }} && systemctl start kubelet"]
+    resources:
+      requests:
+        cpu: 10m
+        memory: 5M
+      limits:
+        cpu: 100m
+        memory: 20M
+    volumeMounts:
+      - name: bus
+        mountPath: /var/run  # presumably lets systemctl reach the host's systemd sockets - verify
+      - name: root
+        mountPath: /node  # NOTE(review): host root is mounted, but the command above never references /node
+    securityContext:
+      privileged: true
+    tty: true
+  volumes:
+    - name: bus
+      hostPath:
+        path: /var/run
+    - name: root
+      hostPath:
+        path: /
+        type: ""
diff --git a/chaoslib/litmus/kubelet_service_kill/kubelet_service_kill.yml b/chaoslib/litmus/kubelet_service_kill/kubelet_service_kill.yml
new file mode 100644
index 00000000000..56fd08377f8
--- /dev/null
+++ b/chaoslib/litmus/kubelet_service_kill/kubelet_service_kill.yml
@@ -0,0 +1,145 @@
+---
+# Chaos lib: stops kubelet on the node hosting the target app pod via a privileged helper pod, then waits for the node to be Ready again.
+- block:
+
+    - block:
+
+        - name: "[Prepare]: Select the application pod name"
+          shell: >
+            kubectl get pod -l {{ a_label }} -n {{ a_ns }}
+            -o=custom-columns=:metadata.name --no-headers
+            | shuf | head -1
+          args:
+            executable: /bin/bash
+          register: app_pod_name
+
+        - name: "[Prepare]: Recording the application pod name"
+          set_fact:
+            app_pod: "{{ app_pod_name.stdout }}"
+
+      when: "app_pod is not defined or app_pod == ''"  # pick a random pod only when none was supplied
+
+    - name: "[Prepare]: Identify the application node name"
+      shell: >
+        kubectl get pod {{ app_pod }} -n {{ a_ns }}
+        --no-headers -o custom-columns=:spec.nodeName
+      args:
+        executable: /bin/bash
+      register: app_node
+
+    - name: "[Prepare]: Record the application node name"
+      set_fact:
+        app_node: "{{ app_node.stdout }}"  # replaces the registered result with the plain node name
+
+    - block:
+        - name: "[Prepare:] Generate a run id if not passed from the engine/experiment"
+          shell: echo $(mktemp) | cut -d '.' -f 2 | cut -c -6
+          register: rand_string
+
+        - set_fact:
+            run_id: "{{ rand_string.stdout | lower }}"
+      when: "run_id is not defined or run_id == ''"
+
+    - name: "[Event]: Generating an Event for ChaosInjection"
+      include_tasks: /utils/common/generate-kubernetes-chaos-events.yml
+      vars:
+        stage: "ChaosInject"
+        exp_pod_name: "{{ chaos_pod_name }}"
+        engine_ns: "{{ c_ns }}"
+        message: "Injecting {{ c_experiment }} chaos on {{ app_node }} node"
+      when: "c_engine is defined and c_engine != ''"
+
+    - name: "[Prepare]: Patch the run_id to kubelet service kill helper pod template"
+      template:
+        src: /chaoslib/litmus/kubelet_service_kill/kubelet-service-kill.j2
+        dest: /tmp/kubelet-service-kill.yml
+      vars:
+        node_name: "{{ app_node }}"  # task-local: node_name exists only while this template is rendered
+
+    # Reset pod_running_status to an empty string; the failure check below compares it against 'Running'.
+    - set_fact:
+        pod_running_status: ""
+
+    # Helper pod creation is attempted up to 3 times, in case it is not immediately schedulable due to transient node conditions.
+    # If the kubelet-service-kill pod is not schedulable across these 3 tries, the experiment is failed with a message indicating improper cluster state.
+    - name: "[Prepare]: Including the util to create the chaos pod"
+      include_tasks: /utils/common/create_chaos_pod.yml
+      vars:
+        pod_ns: "{{ c_ns }}"
+        c_path: "/tmp/kubelet-service-kill.yml"
+        pod_label: "name=service-kill-{{ run_id }}"
+      with_sequence: start=1 end=3
+
+    # Fail the run if the helper pod is not Running after the three attempts (message names the node via app_node).
+    - fail:
+        msg: "kubelet_service_kill lib failed, Unable to create as kubelet_service_kill pod couldn't be scheduled on the {{ app_node }} node"
+      when: "pod_running_status is not defined or pod_running_status != 'Running'"
+
+    - name: "[Status]: Waiting for node to get in NotReady state"
+      shell: |
+        kubectl get nodes {{ app_node }} --no-headers | awk '{print$2}'
+      args:
+        executable: /bin/bash
+      register: node_state
+      until: node_state.stdout == 'NotReady'
+      delay: 2
+      retries: 90  # polls every 2s, at most 90 times
+
+    - name: "[Wait]: Wait for the chaos duration of {{ c_duration }}s"
+      wait_for:
+        timeout: "{{ c_duration }}"
+
+    - name: "[Status]: Checking the node status after chaos"
+      shell: |
+        kubectl get nodes {{ app_node }} --no-headers | awk '{print$2}'
+      args:
+        executable: /bin/bash
+      register: node_state
+      until: node_state.stdout == 'Ready'
+      delay: 2
+      retries: 90
+
+    - name: "[CleanUP]: Tear down service kill infra"
+      shell: >
+        kubectl delete -f /tmp/kubelet-service-kill.yml -n {{ c_ns }}
+      args:
+        executable: /bin/bash
+      register: result
+
+    - name: "[Status]: Confirm that the svc chaos helper pod is teminated successfully"
+      shell: >
+        kubectl get pod -l name=service-kill-{{ run_id }} --no-headers -o custom-columns=:status.phase -n {{ c_ns }} | sort | uniq
+      args:
+        executable: /bin/bash
+      register: result_status
+      until: result_status.stdout == ''  # empty output means no helper pod is left in c_ns
+      delay: 2
+      retries: 90
+
+  rescue:
+
+    - block:
+
+        - name: "[CleanUP]: Tear down service kill infra"
+          shell: >
+            kubectl delete -f /tmp/kubelet-service-kill.yml -n {{ c_ns }}
+          args:
+            executable: /bin/bash
+          register: result
+          when: "chaos_pod_result.rc == 0"  # presumably registered by create_chaos_pod.yml - verify
+
+        - name: "[Status]: Confirm that the svc chaos helper pod is teminated successfully"
+          shell: >
+            kubectl get pod -l name=service-kill-{{ run_id }} --no-headers -o custom-columns=:status.phase -n {{ c_ns }} | sort | uniq
+          args:
+            executable: /bin/bash
+          register: result_status
+          until: result_status.stdout == ''
+          delay: 2
+          retries: 90
+
+      when: "(pod_running_status is defined and pod_running_status == 'Running') and chaos_pod_result is defined"
+
+    - fail:
+        msg: "kubelet_service_kill lib failed"
+      when: true  # always re-raise so the calling playbook sees the failure after cleanup
diff --git a/experiments/generic/kubelet_service_kill/README.md b/experiments/generic/kubelet_service_kill/README.md
new file mode 100644
index 00000000000..3cd0c2042e9
--- /dev/null
+++ b/experiments/generic/kubelet_service_kill/README.md
@@ -0,0 +1,14 @@
+## Experiment Metadata
+
+
+
+
+
+
+
+
+
+
+
+
+
Name Description Documentation Link
Kubelet Service Kill This experiment gracefully stops the kubelet service on a node for a certain chaos duration. The experiment aims to verify the resiliency of applications whose replicas may be evicted or become unreachable on account of nodes turning unschedulable (NotReady) due to the kubelet service kill. Added soon
diff --git a/experiments/generic/kubelet_service_kill/chaosutil.j2 b/experiments/generic/kubelet_service_kill/chaosutil.j2
new file mode 100644
index 00000000000..53be4231fb1
--- /dev/null
+++ b/experiments/generic/kubelet_service_kill/chaosutil.j2
@@ -0,0 +1,3 @@
+{% if c_lib is defined and c_lib == 'litmus' %}
+c_util: "/chaoslib/litmus/kubelet_service_kill/kubelet_service_kill.yml"  # task file the experiment playbook includes to inject chaos
+{% endif %}
diff --git a/experiments/generic/kubelet_service_kill/kubelet_service_kill_ansible_logic.yml b/experiments/generic/kubelet_service_kill/kubelet_service_kill_ansible_logic.yml
new file mode 100644
index 00000000000..31fb972dcb4
--- /dev/null
+++ b/experiments/generic/kubelet_service_kill/kubelet_service_kill_ansible_logic.yml
@@ -0,0 +1,156 @@
+---  # Experiment playbook: pre-chaos checks -> kubelet service kill lib -> post-chaos checks -> result update
+- hosts: localhost
+  connection: local
+
+  vars:
+    c_experiment: "kubelet-service-kill"
+    c_duration: "{{ lookup('env','TOTAL_CHAOS_DURATION') }}"
+    ramp_time: "{{ lookup('env','RAMP_TIME') }}"  # NOTE(review): only displayed below; no ramp wait is performed in this playbook
+    a_ns: "{{ lookup('env','APP_NAMESPACE') }}"
+    a_label: "{{ lookup('env','APP_LABEL') }}"
+    a_kind: "{{ lookup('env','APP_KIND') }}"
+    lib_image: "{{ lookup('env','LIB_IMAGE') }}"
+    auxiliary_appinfo: "{{ lookup('env','AUXILIARY_APPINFO') }}"  # comma-separated "namespace:label" entries, split below
+    chaos_uid: "{{ lookup('env','CHAOS_UID') }}"
+    c_engine: "{{ lookup('env','CHAOSENGINE') }}"
+    chaos_pod_name: "{{ lookup('env','POD_NAME') }}"
+    c_ns: "{{ lookup('env','CHAOS_NAMESPACE') }}"
+    c_lib: "{{ lookup('env','LIB') }}"
+
+  tasks:
+
+    - block:
+
+        ## DETERMINE THE CHAOSLIB TASKFILES TO BE USED (renders chaosutil.j2 to /tmp/chaosutil.yml)
+        - include_tasks: kubelet_service_kill_ansible_prerequisites.yml
+
+        - name: "[PreReq]: Including the chaos util for the {{ c_experiment }} experiment"
+          include_vars:
+            file: /tmp/chaosutil.yml
+
+        ## GENERATE EXPERIMENT RESULT NAME
+        - name: "[PreReq]: Constructing the chaos result name"
+          set_fact:
+            c_result: "{{ c_engine }}-{{ c_experiment }}"
+          when: "c_engine is defined and c_engine != ''"
+
+        ## RECORD START-OF-EXPERIMENT IN LITMUSCHAOS RESULT CR
+        - name: "[PreReq]: Updating the chaos result of {{ c_experiment }} experiment (SOT)"
+          include_tasks: /utils/runtime/update_chaos_result_resource.yml
+          vars:
+            status: 'SOT'
+            namespace: "{{ c_ns }}"
+
+        ## DISPLAY APP INFORMATION
+        - name: "[Info]: Display the application information passed via the test job"
+          debug:
+            msg:
+              - "The application info is as follows:"
+              - "Namespace : {{ a_ns }}"
+              - "Label : {{ a_label }}"
+              - "Ramp Time : {{ ramp_time }}"
+
+        ## PRE-CHAOS APPLICATION STATUS CHECK
+        - name: "[Status]: Verify that the AUT (Application Under Test) is running (pre-chaos)"
+          include_tasks: "/utils/common/status_app_pod.yml"
+          vars:
+            app_ns: "{{ a_ns }}"
+            app_label: "{{ a_label }}"
+            delay: 2
+            retries: 90
+
+        # Pre-chaos health check of the auxiliary applications; skipped when AUXILIARY_APPINFO is empty
+        - block:
+
+            - name: Record auxiliary appinfo
+              set_fact:
+                auxiliary_appinfo_list: "{{ auxiliary_appinfo.split(',') }}"
+
+            - name: "[Status]: Verify that the Auxiliary Applications are running (pre-chaos)"
+              include_tasks: /utils/common/status_app_pod.yml
+              vars:
+                app_ns: "{{ item.split(':')[0] }}"
+                app_label: "{{ item.split(':')[1] }}"
+                delay: 2
+                retries: 90
+              with_items:
+                - "{{ auxiliary_appinfo_list }}"
+
+          when: auxiliary_appinfo is defined and auxiliary_appinfo != ''
+
+        ## RECORD EVENT FOR PRE-CHAOS CHECK
+        - name: "[Event]: Generating an Event for PreChaosCheck"
+          include_tasks: /utils/common/generate-kubernetes-chaos-events.yml
+          vars:
+            stage: "PreChaosCheck"
+            exp_pod_name: "{{ chaos_pod_name }}"
+            engine_ns: "{{ c_ns }}"
+            message: "AUT is Running successfully"
+          when: "c_engine is defined and c_engine != ''"
+
+        ## READY TO START SERVICE CHAOS
+        - name: "[Prepare]: Including the kubelet service kill lib"
+          include_tasks: "{{ c_util }}"  # c_util comes from /tmp/chaosutil.yml loaded above
+
+        ## POST-CHAOS APPLICATION STATUS CHECK
+
+        - name: "[Status]: Verify that the AUT (Application Under Test) is running (post-chaos)"
+          include_tasks: "/utils/common/status_app_pod.yml"
+          vars:
+            app_ns: "{{ a_ns }}"
+            app_label: "{{ a_label }}"
+            delay: 2
+            retries: 90
+
+        # Post-chaos health check of the auxiliary applications; reuses auxiliary_appinfo_list recorded pre-chaos
+        - block:
+
+            - name: "[Status]: Verify that the Auxiliary Applications are running (post-chaos)"
+              include_tasks: /utils/common/status_app_pod.yml
+              vars:
+                app_ns: "{{ item.split(':')[0] }}"
+                app_label: "{{ item.split(':')[1] }}"
+                delay: 2
+                retries: 90
+              with_items:
+                - "{{ auxiliary_appinfo_list }}"
+
+          when: auxiliary_appinfo is defined and auxiliary_appinfo != ''
+
+        ## RECORD EVENT FOR POST-CHAOS CHECK
+        - name: "[Event]: Generating an Event for PostChaosCheck"
+          include_tasks: /utils/common/generate-kubernetes-chaos-events.yml
+          vars:
+            stage: "PostChaosCheck"
+            exp_pod_name: "{{ chaos_pod_name }}"
+            engine_ns: "{{ c_ns }}"
+            message: "AUT is Running successfully"
+          when: "c_engine is defined and c_engine != ''"
+
+        - set_fact:
+            flag: "Pass"  # reached only when every task above succeeded
+
+        - name: "[Result]: Getting the final result of {{ c_experiment }} experiment"
+          debug:
+            msg: "{{ c_experiment }} experiment has been {{ flag }}ed"
+
+      rescue:
+
+        - set_fact:
+            flag: "Fail"  # any failure inside the block lands here
+
+        - name: "[Result]: Getting the final result of {{ c_experiment }} experiment"
+          debug:
+            msg: "{{ c_experiment }} experiment has been {{ flag }}ed"
+
+      always:
+
+        ## Getting failure step from experiment-pod
+        - include_tasks: /utils/runtime/getting_failure_step.yml
+
+        ## RECORD END-OF-TEST IN LITMUSCHAOS RESULT CR
+        - name: "[The End]: Updating the chaos result of {{ c_experiment }} experiment (EOT)"
+          include_tasks: /utils/runtime/update_chaos_result_resource.yml
+          vars:
+            status: 'EOT'
+            namespace: "{{ c_ns }}"
diff --git a/experiments/generic/kubelet_service_kill/kubelet_service_kill_ansible_prerequisites.yml b/experiments/generic/kubelet_service_kill/kubelet_service_kill_ansible_prerequisites.yml
new file mode 100644
index 00000000000..e539fee2fb8
--- /dev/null
+++ b/experiments/generic/kubelet_service_kill/kubelet_service_kill_ansible_prerequisites.yml
@@ -0,0 +1,4 @@
+- name: "[PreReq] Identify the chaos util for {{ c_experiment }} experiment"
+  template:
+    src: chaosutil.j2
+    dest: /tmp/chaosutil.yml  # read back by the experiment playbook via include_vars
diff --git a/experiments/generic/kubelet_service_kill/kubelet_service_kill_k8s_job.yml b/experiments/generic/kubelet_service_kill/kubelet_service_kill_k8s_job.yml
new file mode 100644
index 00000000000..77672e2baf9
--- /dev/null
+++ b/experiments/generic/kubelet_service_kill/kubelet_service_kill_k8s_job.yml
@@ -0,0 +1,62 @@
+---  # Job that runs the kubelet-service-kill experiment playbook in the ansible-runner image
+apiVersion: batch/v1
+kind: Job
+metadata:
+  generateName: kubelet-service-kill-
+spec:
+  template:
+    metadata:
+      labels:
+        experiment: kubelet-service-kill
+    spec:
+      serviceAccountName: %CHAOS_SERVICE_ACCOUNT%
+      restartPolicy: Never
+      containers:
+      - name: ansibletest
+        image: litmuschaos/ansible-runner:ci
+        imagePullPolicy: Always
+        env:
+          - name: ANSIBLE_STDOUT_CALLBACK
+            value: 'default'
+
+          ## ENTER THE NAMESPACE WHERE THE APPLICATION IS RUNNING
+          - name: APP_NAMESPACE
+            value: ''
+
+          ## ENTER THE LABEL OF THE APPLICATION
+          - name: APP_LABEL
+            value: ''
+
+          ## ENTER THE KIND OF APPLICATION
+          - name: APP_KIND
+            value: ''
+
+          ## TOTAL CHAOS DURATION OF EXPERIMENT
+          - name: TOTAL_CHAOS_DURATION
+            value: '90'
+
+          ## PERIOD TO WAIT BEFORE INJECTION OF CHAOS IN SEC
+          - name: RAMP_TIME
+            value: ''
+
+          ## PROVIDE AUXILIARY APPLICATION DETAILS - NAMESPACE AND LABELS OF THE APPLICATIONS
+          ## Sample input is "ns1:app=percona,ns2:name=nginx"
+          - name: AUXILIARY_APPINFO
+            value: ''
+
+          ## PROVIDE THE LIB
+          ## ONLY LITMUS SUPPORTED
+          - name: LIB
+            value: 'litmus'
+
+          ## PROVIDE THE NAMESPACE IN WHICH THE CHAOS HELPER POD IS CREATED
+          - name: CHAOS_NAMESPACE
+            value: 'litmus'
+
+          - name: CHAOS_SERVICE_ACCOUNT
+            valueFrom:
+              fieldRef:
+                fieldPath: spec.serviceAccountName
+
+        command: ["/bin/bash"]  # the trailing "exit 0" below makes the container exit 0 whatever ansible-playbook returns
+        args: ["-c", "ansible-playbook ./experiments/generic/kubelet_service_kill/kubelet_service_kill_ansible_logic.yml -i /etc/ansible/hosts -vv; exit 0"]