Skip to content

Commit

Permalink
Add possibility to change node drain duration (#2102)
Browse files Browse the repository at this point in the history
Add the possibility to change the duration of the node drain
Litmus chaos test. This is needed for CNFs with
longer startup/shutdown times.
Additionally, fix litmus waiter code and timeout module.
Slight refactor of LitmusManager module.

Refs: #2098

Signed-off-by: Martin Matyas <[email protected]>
  • Loading branch information
martin-mat authored Jul 10, 2024
1 parent 2a133f8 commit 7f8c910
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 43 deletions.
1 change: 1 addition & 0 deletions USAGE.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ CNF_TESTSUITE_RESOURCE_CREATION_TIMEOUT=120
CNF_TESTSUITE_NODE_READINESS_TIMEOUT=240
CNF_TESTSUITE_POD_READINESS_TIMEOUT=180
CNF_TESTSUITE_LITMUS_CHAOS_TEST_TIMEOUT=1800
CNF_TESTSUITE_NODE_DRAIN_TOTAL_CHAOS_DURATION=90
```

#### Running The Linter in Developer Mode
Expand Down
65 changes: 26 additions & 39 deletions src/tasks/litmus_setup.cr
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ module LitmusManager
DOWNLOADED_LITMUS_FILE = "litmus-operator-downloaded.yaml"
MODIFIED_LITMUS_FILE = "litmus-operator-modified.yaml"
LITMUS_NAMESPACE = "litmus"
LITMUS_K8S_DOMAIN = "litmuschaos.io"



Expand Down Expand Up @@ -85,60 +86,46 @@ module LitmusManager
appNodeName_response.to_s
end

# Runs `kubectl get` against a Litmus CRD (chaosengine/chaosresult/...) in the
# given namespace and captures its output in the requested format
# (e.g. a jsonpath expression or "json").
# Returns a tuple of {exit status, captured stdout}; callers must check the
# exit status before trusting the response string.
private def self.get_status_info(chaos_resource, test_name, output_format, namespace) : {Int32, String}
  cmd = "kubectl get #{chaos_resource}.#{LITMUS_K8S_DOMAIN} #{test_name} -n #{namespace} -o '#{output_format}'"
  Log.info { "Getting litmus status info: #{cmd}" }
  stdout = IO::Memory.new
  stderr = IO::Memory.new
  exit_code = Process.run(cmd, shell: true, output: stdout, error: stderr).exit_status
  response = stdout.to_s
  Log.info { "status_code: #{exit_code}, response: #{response}" }
  {exit_code, response}
end

# Repeatedly polls get_status_info until the supplied block accepts the
# response, or until `timeout` seconds elapse (delegated to
# repeat_with_timeout, which logs and returns false on expiry).
# The block is only consulted when kubectl exited successfully.
private def self.get_status_info_until(chaos_resource, test_name, output_format, timeout, namespace, &block)
  repeat_with_timeout(timeout: timeout, errormsg: "Litmus response timed-out") do
    code, response = get_status_info(chaos_resource, test_name, output_format, namespace)
    code == 0 && yield(response)
  end
end

## wait_for_test will wait for the completion of litmus test
def self.wait_for_test(test_name, chaos_experiment_name, args, namespace : String = "default")
chaos_result_name = "#{test_name}-#{chaos_experiment_name}"

experimentStatus_cmd = "kubectl get chaosengine.litmuschaos.io #{test_name} -n #{namespace} -o jsonpath='{.status.engineStatus}'"
Log.for("wait_for_test").info { "Checking experiment status #{experimentStatus_cmd}" } if check_verbose(args)

## Wait for completion of chaosengine which indicates the completion of chaos
repeat_with_timeout(timeout: LITMUS_CHAOS_TEST_TIMEOUT, errormsg: "Litmus test has timed-out") do
status_code = Process.run("#{experimentStatus_cmd}",
shell: true,
output: experimentStatus_response = IO::Memory.new,
error: stderr = IO::Memory.new).exit_status
Log.for("wait_for_test").info { "#{chaos_experiment_name} status_code: #{status_code}" } if check_verbose(args)
experimentStatus = experimentStatus_response.to_s
Log.for("wait_for_test").info {"#{chaos_experiment_name} experiment status: " + experimentStatus}
if (experimentStatus != "Waiting for Job Creation" && experimentStatus != "Running" && experimentStatus != "Completed")
true
else
status_code == 0 && experimentStatus == "Completed"
end
Log.info { "wait_for_test: #{chaos_result_name}" }

get_status_info_until("chaosengine", test_name, "jsonpath={.status.engineStatus}", LITMUS_CHAOS_TEST_TIMEOUT, namespace) do |engineStatus|
["completed", "stopped"].includes?(engineStatus)
end

verdict_cmd = "kubectl get chaosresults.litmuschaos.io #{chaos_result_name} -n #{namespace} -o jsonpath='{.status.experimentStatus.verdict}'"
Log.for("wait_for_test").info { "Checking experiment verdict #{verdict_cmd}" } if check_verbose(args)
## Check the chaosresult verdict
repeat_with_timeout(timeout: GENERIC_OPERATION_TIMEOUT, errormsg: "Litmus verdict aquiring has timed-out") do
status_code = Process.run("#{verdict_cmd}",
shell: true,
output: verdict_response = IO::Memory.new,
error: stderr = IO::Memory.new).exit_status
Log.for("wait_for_test").info { "status_code: #{status_code}" } if check_verbose(args)
Log.for("wait_for_test").info { "verdict: #{verdict_response.to_s}" } if check_verbose(args)
verdict = verdict_response.to_s
status_code == 0 && verdict != "Awaited"
get_status_info_until("chaosresults", chaos_result_name, "jsonpath={.status.experimentStatus.verdict}", GENERIC_OPERATION_TIMEOUT, namespace) do |verdict|
verdict != "Awaited"
end
end

## check_chaos_verdict will check the verdict of chaosexperiment
def self.check_chaos_verdict(chaos_result_name, chaos_experiment_name, args, namespace : String = "default") : Bool
verdict_cmd = "kubectl get chaosresults.litmuschaos.io #{chaos_result_name} -n #{namespace} -o jsonpath='{.status.experimentStatus.verdict}'"
Log.for("LitmusManager.check_chaos_verdict").debug { "Checking experiment verdict with command: #{verdict_cmd}" }
status_code = Process.run("#{verdict_cmd}", shell: true, output: verdict_response = IO::Memory.new, error: stderr = IO::Memory.new).exit_status
Log.for("LitmusManager.check_chaos_verdict").debug { "status_code: #{status_code}; verdict: #{verdict_response.to_s}" }
verdict = verdict_response.to_s
_, verdict = get_status_info("chaosresult", chaos_result_name, "jsonpath={.status.experimentStatus.verdict}", namespace)

emoji_test_failed= "🗡️💀♻️"
if verdict == "Pass"
return true
else
Log.for("LitmusManager.check_chaos_verdict#details").debug do
verdict_details_cmd = "kubectl get chaosresults.litmuschaos.io #{chaos_result_name} -n #{namespace} -o json"
status_code = Process.run("#{verdict_details_cmd}", shell: true, output: verdict_details_response = IO::Memory.new, error: stderr = IO::Memory.new).exit_status
"#{verdict_details_response.to_s}"
status_code, verdict_details_response = get_status_info("chaosresult", chaos_result_name, "json", namespace)
"#{verdict_details_response}"
end

Log.for("LitmusManager.check_chaos_verdict").info {"#{chaos_experiment_name} chaos test failed: #{chaos_result_name}, verdict: #{verdict}"}
Expand Down
4 changes: 3 additions & 1 deletion src/tasks/utils/chaos_templates.cr
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Duration (seconds) of the node-drain Litmus chaos experiment.
# Overridable via CNF_TESTSUITE_NODE_DRAIN_TOTAL_CHAOS_DURATION; defaults to 90.
NODE_DRAIN_TOTAL_CHAOS_DURATION = ENV.fetch("CNF_TESTSUITE_NODE_DRAIN_TOTAL_CHAOS_DURATION", "90").to_i

class ChaosTemplates
class PodIoStress
def initialize(
Expand Down Expand Up @@ -113,7 +115,7 @@ class ChaosTemplates
@deployment_label : String,
@deployment_label_value : String,
@app_nodename : String,
@total_chaos_duration : String = "90"
@total_chaos_duration : String = "#{NODE_DRAIN_TOTAL_CHAOS_DURATION}"
)
end
ECR.def_to_s("src/templates/chaos_templates/node_drain.yml.ecr")
Expand Down
6 changes: 3 additions & 3 deletions src/tasks/utils/timeouts.cr
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ LITMUS_CHAOS_TEST_TIMEOUT = ENV.has_key?("CNF_TESTSUITE_LITMUS_CHAOS_TEST_TIMEOU

def repeat_with_timeout(timeout, errormsg, reset_on_nil=false, delay=2, &block)
start_time = Time.utc
while (Time.utc - start_time).seconds < timeout
while (Time.utc - start_time).to_i < timeout
result = yield
if result.nil?
if reset_on_nil
Expand All @@ -21,8 +21,8 @@ def repeat_with_timeout(timeout, errormsg, reset_on_nil=false, delay=2, &block)
return true
end
sleep delay
Log.for("verbose").info { "Time left: #{timeout - (Time.utc - start_time).seconds} seconds" }
Log.for("verbose").info { "Time left: #{timeout - (Time.utc - start_time).to_i} seconds" }
end
Log.error { errormsg }
false
end
end

0 comments on commit 7f8c910

Please sign in to comment.