Skip to content

Commit

Permalink
Merge pull request #1842 from cncf/test-init-logs
Browse files Browse the repository at this point in the history
[maintain/1841] Add log lines and organize log messages in tests
  • Loading branch information
agentpoyo authored Oct 26, 2023
2 parents a785acc + 921de6a commit 8758bce
Show file tree
Hide file tree
Showing 14 changed files with 730 additions and 483 deletions.
8 changes: 6 additions & 2 deletions src/tasks/platform/hardware_and_scheduling.cr
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ namespace "platform" do
desc "Does the Platform use a runtime that is oci compliant"
task "oci_compliant" do |_, args|
task_response = CNFManager::Task.task_runner(args, check_cnf_installed=false) do |args|
task_start_time = Time.utc
testsuite_task = "oci_compliant"
Log.for(testsuite_task).info { "Starting test" }

resp = KubectlClient::Get.container_runtimes
all_oci_runtimes = true
resp.each do |x|
Expand All @@ -25,10 +29,10 @@ namespace "platform" do
LOGGING.info "all_oci_runtimes: #{all_oci_runtimes}"
if all_oci_runtimes
emoji_chaos_oci_compliant="📶☠️"
upsert_passed_task("oci_compliant","✔️ PASSED: Your platform is using the following runtimes: [#{KubectlClient::Get.container_runtimes.join(",")}] which are OCI compliant runtimes #{emoji_chaos_oci_compliant}", Time.utc)
upsert_passed_task(testsuite_task,"✔️ PASSED: Your platform is using the following runtimes: [#{KubectlClient::Get.container_runtimes.join(",")}] which are OCI compliant runtimes #{emoji_chaos_oci_compliant}", task_start_time)
else
emoji_chaos_oci_compliant="📶☠️"
upsert_failed_task("oci_compliant", "✖️ FAILED: Platform has at least one node that uses a non OCI compliant runtime #{emoji_chaos_oci_compliant}", Time.utc)
upsert_failed_task(testsuite_task, "✖️ FAILED: Platform has at least one node that uses a non OCI compliant runtime #{emoji_chaos_oci_compliant}", task_start_time)
end
end
end
Expand Down
32 changes: 24 additions & 8 deletions src/tasks/platform/observability.cr
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ namespace "platform" do

desc "Does the Platform have Kube State Metrics installed"
task "kube_state_metrics", ["install_cluster_tools"] do |_, args|
task_start_time = Time.utc
testsuite_task = "kube_state_metrics"
Log.for(testsuite_task).info { "Starting test" }

unless check_poc(args)
Log.info { "skipping kube_state_metrics: not in poc mode" }
puts "SKIPPED: Kube State Metrics".colorize(:yellow)
Expand All @@ -32,15 +36,19 @@ namespace "platform" do

if found
emoji_kube_state_metrics="📶☠️"
upsert_passed_task("kube_state_metrics","✔️ PASSED: Your platform is using the release for kube state metrics #{emoji_kube_state_metrics}", Time.utc)
upsert_passed_task(testsuite_task,"✔️ PASSED: Your platform is using the release for kube state metrics #{emoji_kube_state_metrics}", task_start_time)
else
emoji_kube_state_metrics="📶☠️"
upsert_failed_task("kube_state_metrics", "✖️ FAILED: Your platform does not have kube state metrics installed #{emoji_kube_state_metrics}", Time.utc)
upsert_failed_task(testsuite_task, "✖️ FAILED: Your platform does not have kube state metrics installed #{emoji_kube_state_metrics}", task_start_time)
end
end

desc "Does the Platform have a Node Exporter installed"
task "node_exporter", ["install_cluster_tools"] do |_, args|
task_start_time = Time.utc
testsuite_task = "node_exporter"
Log.for(testsuite_task).info { "Starting test" }

unless check_poc(args)
Log.info { "skipping node_exporter: not in poc mode" }
puts "SKIPPED: Node Exporter".colorize(:yellow)
Expand All @@ -57,16 +65,20 @@ namespace "platform" do
Log.info { "Found Process: #{found}" }
if found
emoji_node_exporter="📶☠️"
upsert_passed_task("node_exporter","✔️ PASSED: Your platform is using the node exporter #{emoji_node_exporter}", Time.utc)
upsert_passed_task(testsuite_task,"✔️ PASSED: Your platform is using the node exporter #{emoji_node_exporter}", task_start_time)
else
emoji_node_exporter="📶☠️"
upsert_failed_task("node_exporter", "✖️ FAILED: Your platform does not have the node exporter installed #{emoji_node_exporter}", Time.utc)
upsert_failed_task(testsuite_task, "✖️ FAILED: Your platform does not have the node exporter installed #{emoji_node_exporter}", task_start_time)
end
end


desc "Does the Platform have the prometheus adapter installed"
task "prometheus_adapter", ["install_cluster_tools"] do |_, args|
task_start_time = Time.utc
testsuite_task = "prometheus_adapter"
Log.for(testsuite_task).info { "Starting test" }

unless check_poc(args)
Log.info { "skipping prometheus_adapter: not in poc mode" }
puts "SKIPPED: Prometheus Adapter".colorize(:yellow)
Expand All @@ -83,15 +95,19 @@ namespace "platform" do

if found
emoji_prometheus_adapter="📶☠️"
upsert_passed_task("prometheus_adapter","✔️ PASSED: Your platform is using the prometheus adapter #{emoji_prometheus_adapter}", Time.utc)
upsert_passed_task(testsuite_task,"✔️ PASSED: Your platform is using the prometheus adapter #{emoji_prometheus_adapter}", task_start_time)
else
emoji_prometheus_adapter="📶☠️"
upsert_failed_task("prometheus_adapter", "✖️ FAILED: Your platform does not have the prometheus adapter installed #{emoji_prometheus_adapter}", Time.utc)
upsert_failed_task(testsuite_task, "✖️ FAILED: Your platform does not have the prometheus adapter installed #{emoji_prometheus_adapter}", task_start_time)
end
end

desc "Does the Platform have the K8s Metrics Server installed"
task "metrics_server", ["install_cluster_tools"] do |_, args|
task_start_time = Time.utc
testsuite_task = "metrics_server"
Log.for(testsuite_task).info { "Starting test" }

unless check_poc(args)
Log.info { "skipping metrics_server: not in poc mode" }
puts "SKIPPED: Metrics Server".colorize(:yellow)
Expand All @@ -108,10 +124,10 @@ namespace "platform" do
found = KernelIntrospection::K8s.find_first_process(CloudNativeIntrospection::METRICS_SERVER)
if found
emoji_metrics_server="📶☠️"
upsert_passed_task("metrics_server","✔️ PASSED: Your platform is using the metrics server #{emoji_metrics_server}", Time.utc)
upsert_passed_task(testsuite_task, "✔️ PASSED: Your platform is using the metrics server #{emoji_metrics_server}", task_start_time)
else
emoji_metrics_server="📶☠️"
upsert_failed_task("metrics_server", "✖️ FAILED: Your platform does not have the metrics server installed #{emoji_metrics_server}", Time.utc)
upsert_failed_task(testsuite_task, "✖️ FAILED: Your platform does not have the metrics server installed #{emoji_metrics_server}", task_start_time)
end
end
end
Expand Down
21 changes: 14 additions & 7 deletions src/tasks/platform/platform.cr
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,13 @@ end

desc "Does the platform pass the K8s conformance tests?"
task "k8s_conformance" do |_, args|
VERBOSE_LOGGING.info "k8s_conformance" if check_verbose(args)
task_start_time = Time.utc
testsuite_task = "k8s_conformance"
Log.for(testsuite_task).info { "Starting test" }

begin
current_dir = FileUtils.pwd
VERBOSE_LOGGING.debug current_dir if check_verbose(args)
Log.for(testsuite_task).debug { "current dir: #{current_dir}" }
sonobuoy = "#{tools_path}/sonobuoy/sonobuoy"

# Clean up old results
Expand All @@ -34,7 +37,7 @@ task "k8s_conformance" do |_, args|
output: delete_stdout = IO::Memory.new,
error: delete_stderr = IO::Memory.new
)
Log.for("verbose").info { delete_stdout } if check_verbose(args)
Log.for(testsuite_task).debug { "sonobuoy delete output: #{delete_stdout}" }

# Run the tests
testrun_stdout = IO::Memory.new
Expand Down Expand Up @@ -70,10 +73,10 @@ task "k8s_conformance" do |_, args|

failed_count = ((results.match(/Failed: (.*)/)).try &.[1])
if failed_count.to_s.to_i > 0
upsert_failed_task("k8s_conformance", "✖️ FAILED: K8s conformance test has #{failed_count} failure(s)!", Time.utc)
upsert_failed_task(testsuite_task, "✖️ FAILED: K8s conformance test has #{failed_count} failure(s)!", task_start_time)

else
upsert_passed_task("k8s_conformance", "✔️ PASSED: K8s conformance test has no failures", Time.utc)
upsert_passed_task(testsuite_task, "✔️ PASSED: K8s conformance test has no failures", task_start_time)
end
rescue ex
Log.error { ex.message }
Expand All @@ -88,6 +91,10 @@ end
desc "Is Cluster Api available and managing a cluster?"
task "clusterapi_enabled" do |_, args|
CNFManager::Task.task_runner(args, check_cnf_installed=false) do
task_start_time = Time.utc
testsuite_task = "clusterapi_enabled"
Log.for(testsuite_task).info { "Starting test" }

unless check_poc(args)
Log.info { "skipping clusterapi_enabled: not in poc mode" }
puts "SKIPPED: ClusterAPI Enabled".colorize(:yellow)
Expand Down Expand Up @@ -134,9 +141,9 @@ task "clusterapi_enabled" do |_, args|
emoji_control=""

if clusterapi_namespaces_json["items"]? && clusterapi_namespaces_json["items"].as_a.size > 0 && clusterapi_control_planes_json["items"]? && clusterapi_control_planes_json["items"].as_a.size > 0
resp = upsert_passed_task("clusterapi_enabled", "✔️ Cluster API is enabled #{emoji_control}", Time.utc)
resp = upsert_passed_task(testsuite_task, "✔️ Cluster API is enabled #{emoji_control}", task_start_time)
else
resp = upsert_failed_task("clusterapi_enabled", "✖️ Cluster API NOT enabled #{emoji_control}", Time.utc)
resp = upsert_failed_task(testsuite_task, "✖️ Cluster API NOT enabled #{emoji_control}", task_start_time)
end

resp
Expand Down
12 changes: 8 additions & 4 deletions src/tasks/platform/resilience.cr
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ namespace "platform" do

desc "Does the Platform recover the node and reschedule pods when a worker node fails"
task "worker_reboot_recovery" do |_, args|
task_start_time = Time.utc
testsuite_task = "worker_reboot_recovery"
Log.for(testsuite_task).info { "Starting test" }

unless check_destructive(args)
Log.info { "skipping node_failure: not in destructive mode" }
puts "SKIPPED: Node Failure".colorize(:yellow)
Expand Down Expand Up @@ -43,7 +47,7 @@ namespace "platform" do
pod_ready = KubectlClient::Get.pod_status("reboot", "--field-selector spec.nodeName=#{worker_node}").split(",")[2]
pod_ready_timeout = pod_ready_timeout - 1
if pod_ready_timeout == 0
upsert_failed_task("worker_reboot_recovery", "✖️ FAILED: Failed to install reboot daemon", Time.utc)
upsert_failed_task(testsuite_task, "✖️ FAILED: Failed to install reboot daemon", task_start_time)
exit 1
end
sleep 1
Expand All @@ -67,7 +71,7 @@ namespace "platform" do
Log.info { "Node Ready Status: #{node_ready}" }
node_failure_timeout = node_failure_timeout - 1
if node_failure_timeout == 0
upsert_failed_task("worker_reboot_recovery", "✖️ FAILED: Node failed to go offline", Time.utc)
upsert_failed_task(testsuite_task, "✖️ FAILED: Node failed to go offline", task_start_time)
exit 1
end
sleep 1
Expand All @@ -85,14 +89,14 @@ namespace "platform" do
Log.info { "Node Ready Status: #{node_ready}" }
node_online_timeout = node_online_timeout - 1
if node_online_timeout == 0
upsert_failed_task("worker_reboot_recovery", "✖️ FAILED: Node failed to come back online", Time.utc)
upsert_failed_task(testsuite_task, "✖️ FAILED: Node failed to come back online", task_start_time)
exit 1
end
sleep 1
end

emoji_worker_reboot_recovery=""
resp = upsert_passed_task("worker_reboot_recovery","✔️ PASSED: Node came back online #{emoji_worker_reboot_recovery}", Time.utc)
resp = upsert_passed_task(testsuite_task,"✔️ PASSED: Node came back online #{emoji_worker_reboot_recovery}", task_start_time)


ensure
Expand Down
36 changes: 24 additions & 12 deletions src/tasks/platform/security.cr
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,19 @@ namespace "platform" do
desc "Is the platform control plane hardened"
task "control_plane_hardening", ["kubescape_scan"] do |_, args|
task_response = CNFManager::Task.task_runner(args, check_cnf_installed=false) do |args|
VERBOSE_LOGGING.info "control_plane_hardening" if check_verbose(args)
task_start_time = Time.utc
testsuite_task = "control_plane_hardening"
Log.for(testsuite_task).info { "Starting test" }

results_json = Kubescape.parse
test_json = Kubescape.test_by_test_name(results_json, "Control plane hardening")
test_report = Kubescape.parse_test_report(test_json)

emoji_security="🔓🔑"
if test_report.failed_resources.size == 0
upsert_passed_task("control_plane_hardening", "✔️ PASSED: Control plane hardened #{emoji_security}", Time.utc)
upsert_passed_task(testsuite_task, "✔️ PASSED: Control plane hardened #{emoji_security}", task_start_time)
else
resp = upsert_failed_task("control_plane_hardening", "✖️ FAILED: Control plane not hardened #{emoji_security}", Time.utc)
resp = upsert_failed_task(testsuite_task, "✖️ FAILED: Control plane not hardened #{emoji_security}", task_start_time)
test_report.failed_resources.map {|r| stdout_failure(r.alert_message) }
stdout_failure("Remediation: #{test_report.remediation}")
resp
Expand All @@ -34,16 +37,19 @@ namespace "platform" do
task "cluster_admin", ["kubescape_scan"] do |_, args|
next if args.named["offline"]?
CNFManager::Task.task_runner(args, check_cnf_installed=false) do |args, config|
VERBOSE_LOGGING.info "cluster_admin" if check_verbose(args)
task_start_time = Time.utc
testsuite_task = "cluster_admin"
Log.for(testsuite_task).info { "Starting test" }

results_json = Kubescape.parse
test_json = Kubescape.test_by_test_name(results_json, "Cluster-admin binding")
test_report = Kubescape.parse_test_report(test_json)

emoji_security="🔓🔑"
if test_report.failed_resources.size == 0
upsert_passed_task("cluster_admin", "✔️ PASSED: No users with cluster admin role found #{emoji_security}", Time.utc)
upsert_passed_task(testsuite_task, "✔️ PASSED: No users with cluster admin role found #{emoji_security}", task_start_time)
else
resp = upsert_failed_task("cluster_admin", "✖️ FAILED: Users with cluster admin role found #{emoji_security}", Time.utc)
resp = upsert_failed_task(testsuite_task, "✖️ FAILED: Users with cluster admin role found #{emoji_security}", task_start_time)
test_report.failed_resources.map {|r| stdout_failure(r.alert_message) }
stdout_failure("Remediation: #{test_report.remediation}")
resp
Expand All @@ -56,16 +62,19 @@ namespace "platform" do
next if args.named["offline"]?

CNFManager::Task.task_runner(args, check_cnf_installed=false) do |args, config|
Log.for("verbose").info { "exposed_dashboard" } if check_verbose(args)
task_start_time = Time.utc
testsuite_task = "exposed_dashboard"
Log.for(testsuite_task).info { "Starting test" }

results_json = Kubescape.parse
test_json = Kubescape.test_by_test_name(results_json, "Exposed dashboard")
test_report = Kubescape.parse_test_report(test_json)

emoji_security = "🔓🔑"
if test_report.failed_resources.size == 0
upsert_passed_task("exposed_dashboard", "✔️ PASSED: No exposed dashboard found in the cluster #{emoji_security}", Time.utc)
upsert_passed_task(testsuite_task, "✔️ PASSED: No exposed dashboard found in the cluster #{emoji_security}", task_start_time)
else
resp = upsert_failed_task("exposed_dashboard", "✖️ FAILED: Found exposed dashboard in the cluster #{emoji_security}", Time.utc)
resp = upsert_failed_task(testsuite_task, "✖️ FAILED: Found exposed dashboard in the cluster #{emoji_security}", task_start_time)
test_report.failed_resources.map {|r| stdout_failure(r.alert_message) }
stdout_failure("Remediation: #{test_report.remediation}")
resp
Expand All @@ -76,17 +85,20 @@ namespace "platform" do
desc "Check if the CNF is running containers with name tiller in their image name?"
task "helm_tiller" do |_, args|
emoji_security="🔓🔑"
Log.for("verbose").info { "platform:helm_tiller" }
task_start_time = Time.utc
testsuite_task = "helm_tiller"
Log.for(testsuite_task).info { "Starting test" }

Kyverno.install

CNFManager::Task.task_runner(args, check_cnf_installed=false) do |args, config|
policy_path = Kyverno.best_practice_policy("disallow_helm_tiller/disallow_helm_tiller.yaml")
failures = Kyverno::PolicyAudit.run(policy_path, EXCLUDE_NAMESPACES)

if failures.size == 0
resp = upsert_passed_task("helm_tiller", "✔️ PASSED: No Helm Tiller containers are running #{emoji_security}", Time.utc)
resp = upsert_passed_task(testsuite_task, "✔️ PASSED: No Helm Tiller containers are running #{emoji_security}", task_start_time)
else
resp = upsert_failed_task("helm_tiller", "✖️ FAILED: Containers with the Helm Tiller image are running #{emoji_security}", Time.utc)
resp = upsert_failed_task(testsuite_task, "✖️ FAILED: Containers with the Helm Tiller image are running #{emoji_security}", task_start_time)
failures.each do |failure|
failure.resources.each do |resource|
puts "#{resource.kind} #{resource.name} in #{resource.namespace} namespace failed. #{failure.message}".colorize(:red)
Expand Down
2 changes: 2 additions & 0 deletions src/tasks/utils/points.cr
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,8 @@ module CNFManager
end_time = Time.utc
task_runtime = (end_time - start_time).milliseconds

Log.for("#{task}").info { "task_runtime=#{task_runtime}; start_time=#{start_time}; end_time:#{end_time}" }

# The task result info has to be appended to an array of YAML::Any
# So encode it into YAML and parse it back again to assign it.
#
Expand Down
Loading

0 comments on commit 8758bce

Please sign in to comment.