From d7f80991b9cc425c1f1b1c56693092c64d1e0d86 Mon Sep 17 00:00:00 2001
From: Sergio Cazzolato
Date: Fri, 27 Sep 2024 14:26:35 -0300
Subject: [PATCH] tests: adding extra scenarios for fault injection (#14489)

* tests: adding missing scenarios for fault injection

This change includes some scenarios that are missing to check fault
injection.

In this case are included fault/reboot for:
- install snapd --dangerous
- remodel
- install kernel component
- update boot config

* fix shellcheck errors

* adding few extra workers to make sure the full run can be completed

* remove channel from test which is not used

* update the expected values to Done in all the scenarios

* update the result status in core20-fault-inject-on-refresh test

* Fix tests with new status check

* fix shellcheck error

* fix failing tests

* fix shellcheck

* revert change on refresh scenario
---
 spread.yaml                                   |   6 +-
 .../core20-fault-inject-on-install/task.yaml  |  79 ++++++++++++
 .../core20-fault-inject-on-refresh/task.yaml  |  22 +---
 .../component.yaml                            |   5 +
 .../task.yaml                                 |  91 ++++++++++++++
 .../core20-fault-inject-on-remodel/task.yaml  | 118 ++++++++++++++++++
 .../cloud.conf                                |   7 ++
 .../defaults.yaml                             |   6 +
 .../task.yaml                                 | 100 +++++++++++++++
 9 files changed, 411 insertions(+), 23 deletions(-)
 create mode 100644 tests/nested/core/core20-fault-inject-on-install/task.yaml
 create mode 100644 tests/nested/manual/core20-fault-inject-on-install-component/component.yaml
 create mode 100644 tests/nested/manual/core20-fault-inject-on-install-component/task.yaml
 create mode 100644 tests/nested/manual/core20-fault-inject-on-remodel/task.yaml
 create mode 100644 tests/nested/manual/core20-fault-inject-on-update-config/cloud.conf
 create mode 100644 tests/nested/manual/core20-fault-inject-on-update-config/defaults.yaml
 create mode 100644 tests/nested/manual/core20-fault-inject-on-update-config/task.yaml

diff --git a/spread.yaml b/spread.yaml
index 174d872f49f..53f02a6162c 100644
--- a/spread.yaml
+++ b/spread.yaml
@@ -301,15 +301,15 @@ backends:
             - ubuntu-20.04-64:
                   image: ubuntu-2004-64-virt-enabled
                   storage: 20G
-                  workers: 10
+                  workers: 12
             - ubuntu-22.04-64:
                   image: ubuntu-2204-64-virt-enabled
                   storage: 25G
-                  workers: 12
+                  workers: 14
             - ubuntu-24.04-64:
                   image: ubuntu-2404-64-virt-enabled
                   storage: 25G
-                  workers: 12
+                  workers: 14
 
     google-nested-arm:
         type: google
diff --git a/tests/nested/core/core20-fault-inject-on-install/task.yaml b/tests/nested/core/core20-fault-inject-on-install/task.yaml
new file mode 100644
index 00000000000..2cb8d30ff98
--- /dev/null
+++ b/tests/nested/core/core20-fault-inject-on-install/task.yaml
@@ -0,0 +1,79 @@
+summary: Ensure that snapd snap can be installed when a panic/reboot occurs during the process
+
+details: |
+    The test checks that if a fault is injected during the snapd snap install,
+    then the change continues and is completed with a predictable outcome.
+
+systems: [ubuntu-2*]
+
+environment:
+    TAG/snapd_panic_auto_connect: after-auto-connect
+    FAULT/snapd_panic_auto_connect: panic
+    TAG/snapd_reboot_auto_connect: after-auto-connect
+    FAULT/snapd_reboot_auto_connect: reboot
+    STATUS: Done
+
+prepare: |
+    # automatically cleaned up in restore
+    echo "Inject a $FAULT on $TAG"
+    cat <<EOF > fault-inject.conf
+    [Service]
+    Environment=SNAPPY_TESTING=1
+    Environment=SNAPD_FAULT_INJECT=$TAG:$FAULT
+    EOF
+
+    echo "Wait for the system to be seeded first"
+    remote.exec "sudo snap wait system seed.loaded"
+
+    remote.push fault-inject.conf
+    remote.exec "sudo mkdir -p /etc/systemd/system/snapd.service.d"
+    remote.exec "sudo cp -v fault-inject.conf /etc/systemd/system/snapd.service.d/"
+    remote.exec "sudo systemctl daemon-reload"
+    remote.exec "sudo systemctl restart snapd.service"
+
+    cp "$(ls /tmp/work-dir/snapd_snap/snapd_*.snap)" snapd.snap
+
+execute: |
+    SNAP=snapd
+    REBOOT=false
+
+    if [ "$FAULT" = reboot ]; then
+        REBOOT=true
+    fi
+
+    # Get the initial snap revision
+    INITIAL_REV="$(remote.exec snap list | grep -E "^$SNAP .*" | awk ' {print $3} ')"
+
+    # Install and reboot before the process is completed
+    boot_id="$(tests.nested boot-id)"
+    remote.push "$PWD/${SNAP}.snap"
+    change_id="$(remote.exec "sudo snap install --dangerous --no-wait ${SNAP}.snap")"
+
+    if [ "$REBOOT" = true ]; then
+        remote.wait-for reboot "$boot_id"
+        boot_id="$(tests.nested boot-id)"
+    fi
+
+    echo "And snap refresh is completed"
+    remote.exec "snap watch $change_id" || true
+    # shellcheck disable=SC2016
+    retry --wait 1 -n 60 --env "CHANGE_ID=$change_id" sh -c 'remote.exec snap changes | MATCH "${CHANGE_ID} .* (Done|Error) .*"'
+
+    # Check if the fault was injected
+    remote.exec "test -e /var/lib/snapd/faults/$TAG:$FAULT"
+
+    if [ "$FAULT" = panic ]; then
+        echo "Ensure the panic was injected and triggered stamp file is present"
+        remote.exec "sudo journalctl -u snapd | grep -A 3 panic" > output
+        MATCH "$TAG":$FAULT < output
+        MATCH "osutil.injectFault" < output
+    fi
+
+    FINAL_STATUS="$(remote.exec snap changes | grep -E "^$change_id .*" | awk ' {print $2} ')"
+    CURRENT_REV="$(remote.exec snap list | grep -E "^$SNAP .*" | awk ' {print $3} ')"
+
+    echo "Ensure the change has no steps with Error"
+    [ "$FINAL_STATUS" = Done ]
+    remote.exec "snap change $change_id" | NOMATCH "^Error .*"
+    echo "Ensure the initial revision is not the current one"
+    test "$INITIAL_REV" != "$CURRENT_REV"
diff --git a/tests/nested/core/core20-fault-inject-on-refresh/task.yaml b/tests/nested/core/core20-fault-inject-on-refresh/task.yaml
index a7c1c48b685..5e48686ed16 100644
--- a/tests/nested/core/core20-fault-inject-on-refresh/task.yaml
+++ b/tests/nested/core/core20-fault-inject-on-refresh/task.yaml
@@ -45,25 +45,6 @@ environment:
     TAG/gadget_reboot_refresh_gadget_assets: refresh-gadget-assets
     FAULT/gadget_reboot_refresh_gadget_assets: reboot
 
-    # TODO: Add the following scenarios in a different test
-    # These should go in a remodel test
-    #TAG/gadget_remodel_boot_assets: remodel-boot-assets
-    #TAG/kernel_remodel_boot_assets: remodel-boot-assets
-
-    # This needs a change in the gadget yaml like in cmdline-option test
-    # TAG/gadget_update_command_line_gadget: update-command-line-gadget
-    # TAG/gadget_update_config_bootloader: update-config-bootloader
-
-    # For this scenario we need to refresh to a kernel with components
-    # See this test nested/manual/kernel-modules-components
-    # TAG/kernel_panic_prepare_kernel_components: prepare-kernel-components
-    # FAULT/kernel_panic_prepare_kernel_components: panic
-
-    # This scenario need to install a new snapd with --dangerous instead of
-    # refresh from the store
-    # TAG/snapd_panic_auto_connect: after-auto-connect
-    # FAULT/snapd_panic_auto_connect: panic
-
 prepare: |
     # automatically cleaned up in restore
     echo "Inject a $FAULT on $TAG"
@@ -143,7 +124,7 @@ execute: |
     echo "And snap refresh is completed"
     remote.exec "snap watch $change_id" || true
     # shellcheck disable=SC2016
-    retry --wait 1 -n 60 --env "CHANGE_ID=$change_id" --env "SNAP=$SNAP" sh -c 'remote.exec snap changes | MATCH "${CHANGE_ID} .* (Done|Error) .*"'
+    retry --wait 1 -n 60 --env "CHANGE_ID=$change_id" sh -c 'remote.exec snap changes | MATCH "${CHANGE_ID} .* (Done|Error) .*"'
 
     # Check if the fault was injected
     remote.exec "test -e /var/lib/snapd/faults/$TAG:$FAULT"
@@ -158,6 +139,7 @@ execute: |
     FINAL_STATUS="$(remote.exec snap changes | grep -E "^$change_id .*" | awk ' {print $2} ')"
     CURRENT_REV="$(remote.exec snap list | grep -E "^$SNAP .*" | awk ' {print $3} ')"
 
+    # Check the final status is the expected one
     if [ "$FINAL_STATUS" = Error ]; then
         echo "Ensure the change has steps with Error"
         remote.exec "snap change $change_id" | MATCH "^Error .*"
diff --git a/tests/nested/manual/core20-fault-inject-on-install-component/component.yaml b/tests/nested/manual/core20-fault-inject-on-install-component/component.yaml
new file mode 100644
index 00000000000..6f7de4a6b6b
--- /dev/null
+++ b/tests/nested/manual/core20-fault-inject-on-install-component/component.yaml
@@ -0,0 +1,5 @@
+component: pc-kernel+wifi-comp
+type: kernel-modules
+version: 1.0
+summary: wifi simulator
+description: wifi simulator for testing purposes
diff --git a/tests/nested/manual/core20-fault-inject-on-install-component/task.yaml b/tests/nested/manual/core20-fault-inject-on-install-component/task.yaml
new file mode 100644
index 00000000000..5036ac9a9d9
--- /dev/null
+++ b/tests/nested/manual/core20-fault-inject-on-install-component/task.yaml
@@ -0,0 +1,91 @@
+summary: Ensure that a kernel component can be installed when a panic occurs during the process
+
+details: |
+    The test checks that if a fault is injected during the installation of a kernel component,
+    then the change continues and is completed with a predictable outcome.
+
+systems: [-ubuntu-1*, -ubuntu-20*, -ubuntu-22*]
+
+environment:
+    TAG/kernel_panic_prepare_kernel_components: prepare-kernel-components
+    FAULT/kernel_panic_prepare_kernel_components: panic
+
+    NESTED_BUILD_SNAPD_FROM_CURRENT: true
+    NESTED_REPACK_KERNEL_SNAP: false
+    NESTED_ENABLE_OVMF: true
+
+prepare: |
+    # Modify kernel and create a component
+    VERSION="$(tests.nested show version)"
+    snap download --channel="$VERSION"/beta pc-kernel
+    unsquashfs -d kernel pc-kernel_*.snap
+    kern_ver=$(find kernel/modules/* -maxdepth 0 -printf "%f\n")
+    comp_ko_dir=wifi-comp/modules/"$kern_ver"/wireless/
+    mkdir -p "$comp_ko_dir"
+    mkdir -p wifi-comp/meta/
+    cp component.yaml wifi-comp/meta/
+    hwsim_path=$(find kernel -name mac80211_hwsim.ko\*)
+    cp "$hwsim_path" "$comp_ko_dir"
+    snap pack wifi-comp
+
+    # Create kernel without the kernel module
+    rm "$hwsim_path"
+    # depmod wants a lib subdir, fake it and remove after invocation
+    mkdir kernel/lib
+    ln -s ../modules kernel/lib/modules
+    depmod -b kernel/ "$kern_ver"
+    rm -rf kernel/lib
+    rm pc-kernel_*.snap
+    # append component meta-information
+    printf 'components:\n  wifi-comp:\n    type: kernel-modules\n' >> kernel/meta/snap.yaml
+    snap pack kernel
+
+    cp pc-kernel_*.snap "$(tests.nested get extra-snaps-path)"
+    tests.nested build-image core
+    tests.nested create-vm core
+
+    echo "Inject a $FAULT on $TAG"
+    cat <<EOF > fault-inject.conf
+    [Service]
+    Environment=SNAPPY_TESTING=1
+    Environment=SNAPD_FAULT_INJECT=$TAG:$FAULT
+    EOF
+
+    echo "Wait for the system to be seeded first"
+    remote.exec "sudo snap wait system seed.loaded"
+
+    remote.push fault-inject.conf
+    remote.exec "sudo mkdir -p /etc/systemd/system/snapd.service.d"
+    remote.exec "sudo cp -v fault-inject.conf /etc/systemd/system/snapd.service.d/"
+    remote.exec "sudo systemctl daemon-reload"
+    remote.exec "sudo systemctl restart snapd.service"
+
+execute: |
+    # install the component
+    comp_file=pc-kernel+wifi-comp_1.0.comp
+    remote.push "$comp_file"
+    change_id="$(remote.exec sudo snap install --dangerous --no-wait "$comp_file")"
+
+    echo "And snap kernel component is installed"
+    remote.exec "snap watch $change_id" || true
+    # shellcheck disable=SC2016
+    retry --wait 1 -n 60 --env "CHANGE_ID=$change_id" sh -c 'remote.exec snap changes | MATCH "${CHANGE_ID} .* (Done|Error) .*"'
+
+    # Check if the fault was injected
+    remote.exec "test -e /var/lib/snapd/faults/$TAG:$FAULT"
+
+    if [ "$FAULT" = panic ]; then
+        echo "Ensure the panic was injected and triggered stamp file is present"
+        remote.exec "sudo journalctl -u snapd | grep -A 3 panic" > output
+        MATCH "$TAG":$FAULT < output
+        MATCH "osutil.injectFault" < output
+    fi
+
+    FINAL_STATUS="$(remote.exec snap changes | grep -E "^$change_id .*" | awk ' {print $2} ')"
+    KERNEL_VERSION=$(remote.exec uname -r)
+
+    echo "Ensure the change has no steps with Error"
+    [ "$FINAL_STATUS" = Done ]
+    remote.exec "snap change $change_id" | NOMATCH "^Error .*"
+    echo "Ensure the component is installed"
+    remote.exec "test -e /var/lib/snapd/kernel/pc-kernel/x1/lib/modules/$KERNEL_VERSION/updates/wifi-comp"
diff --git a/tests/nested/manual/core20-fault-inject-on-remodel/task.yaml b/tests/nested/manual/core20-fault-inject-on-remodel/task.yaml
new file mode 100644
index 00000000000..246b35eea02
--- /dev/null
+++ b/tests/nested/manual/core20-fault-inject-on-remodel/task.yaml
@@ -0,0 +1,118 @@
+summary: Ensure that a remodel can be completed when a panic occurs during the process
+
+details: |
+    The test checks that if a fault is injected during a remodel,
+    then the change continues and is completed with a predictable outcome.
+
+systems: [ubuntu-2*]
+
+environment:
+    NESTED_CUSTOM_MODEL: $TESTSLIB/assertions/valid-for-testing-pc-{VERSION}.model
+    NESTED_ENABLE_TPM: true
+    NESTED_ENABLE_SECURE_BOOT: true
+    NESTED_BUILD_SNAPD_FROM_CURRENT: true
+
+    TAG/gadget_panic_remodel_boot_assets: remodel-boot-assets
+    FAULT/gadget_panic_remodel_boot_assets: panic
+    STATUS/gadget_panic_remodel_boot_assets: Done
+    TAG/gadget_reboot_remodel_boot_assets: remodel-boot-assets
+    FAULT/gadget_reboot_remodel_boot_assets: reboot
+    STATUS/gadget_reboot_remodel_boot_assets: Error
+    TAG/kernel_panic_remodel_boot_assets: remodel-boot-assets
+    FAULT/kernel_panic_remodel_boot_assets: panic
+    STATUS/kernel_panic_remodel_boot_assets: Done
+    TAG/kernel_reboot_remodel_boot_assets: remodel-boot-assets
+    FAULT/kernel_reboot_remodel_boot_assets: reboot
+    STATUS/kernel_reboot_remodel_boot_assets: Error
+
+prepare: |
+    tests.nested build-image core
+    tests.nested create-vm core
+
+    remote.wait-for device-initialized
+
+    # automatically cleaned up in restore
+    echo "Inject a $FAULT on $TAG"
+    cat <<EOF > fault-inject.conf
+    [Service]
+    Environment=SNAPPY_TESTING=1
+    Environment=SNAPD_FAULT_INJECT=$TAG:$FAULT
+    EOF
+
+    echo "Wait for the system to be seeded first"
+    remote.exec "sudo snap wait system seed.loaded"
+
+    remote.push fault-inject.conf
+    remote.exec "sudo mkdir -p /etc/systemd/system/snapd.service.d"
+    remote.exec "sudo cp -v fault-inject.conf /etc/systemd/system/snapd.service.d/"
+    remote.exec "sudo systemctl daemon-reload"
+    remote.exec "sudo systemctl restart snapd.service"
+
+execute: |
+    VERSION="$(tests.nested show version)"
+    SNAP=
+    REBOOT=false
+    SECOND_REBOOT=false
+
+    if [ "$FAULT" = reboot ]; then
+        REBOOT=true
+    fi
+
+    case "$SPREAD_VARIANT" in
+        gadget_*)
+            SNAP=pc
+            ;;
+        kernel_*)
+            SNAP=pc-kernel
+            if [ "$FAULT" = reboot ]; then
+                SECOND_REBOOT=true
+            else
+                REBOOT=true
+            fi
+            ;;
+        *)
+            echo "scenario no supported: $SPREAD_VARIANT"
+            exit 1
+            ;;
+    esac
+
+    # Remodel
+    boot_id="$(tests.nested boot-id)"
+
+    remote.push "$TESTSLIB/assertions/valid-for-testing-pc-revno-2-$VERSION.model"
+    change_id="$(remote.exec sudo snap remodel --no-wait "valid-for-testing-pc-revno-2-$VERSION.model")"
+
+    if [ "$REBOOT" = true ]; then
+        remote.wait-for reboot "$boot_id"
+        boot_id="$(tests.nested boot-id)"
+    fi
+
+    if [ "$SECOND_REBOOT" = true ]; then
+        remote.wait-for reboot "$boot_id"
+    fi
+
+    echo "And snap remodel is completed"
+    remote.exec "snap watch $change_id" || true
+    # shellcheck disable=SC2016
+    retry --wait 1 -n 300 --env "CHANGE_ID=$change_id" --env "SNAP=$SNAP" sh -c 'remote.exec snap changes | MATCH "${CHANGE_ID} .* (Done|Error) .*"'
+
+    # Check if the fault was injected
+    remote.exec "test -e /var/lib/snapd/faults/$TAG:$FAULT"
+
+    if [ "$FAULT" = panic ]; then
+        echo "Ensure the panic was injected and triggered stamp file is present"
+        remote.exec "sudo journalctl -u snapd | grep -A 3 panic" > output
+        MATCH "$TAG":$FAULT < output
+        MATCH "osutil.injectFault" < output
+    fi
+
+    FINAL_STATUS="$(remote.exec snap changes | grep -E "^$change_id .*" | awk ' {print $2} ')"
+
+    [ "$FINAL_STATUS" = "$STATUS" ]
+    if [ "$FINAL_STATUS" = Error ]; then
+        echo "Ensure the remodel was not done"
+        not remote.exec "snap list hello-world"
+    else
+        echo "Ensure the remodel was done"
+        remote.exec "snap list hello-world"
+    fi
diff --git a/tests/nested/manual/core20-fault-inject-on-update-config/cloud.conf b/tests/nested/manual/core20-fault-inject-on-update-config/cloud.conf
new file mode 100644
index 00000000000..cf6b58982b8
--- /dev/null
+++ b/tests/nested/manual/core20-fault-inject-on-update-config/cloud.conf
@@ -0,0 +1,7 @@
+#cloud-config
+datasource_list: [None]
+users:
+  - name: user1
+    sudo: "ALL=(ALL) NOPASSWD:ALL"
+    lock_passwd: false
+    plain_text_passwd: "ubuntu"
diff --git a/tests/nested/manual/core20-fault-inject-on-update-config/defaults.yaml b/tests/nested/manual/core20-fault-inject-on-update-config/defaults.yaml
new file mode 100644
index 00000000000..6417309d656
--- /dev/null
+++ b/tests/nested/manual/core20-fault-inject-on-update-config/defaults.yaml
@@ -0,0 +1,6 @@
+defaults:
+  system:
+    refresh:
+      hold: "@HOLD-TIME@"
+    journal:
+      persistent: true
diff --git a/tests/nested/manual/core20-fault-inject-on-update-config/task.yaml b/tests/nested/manual/core20-fault-inject-on-update-config/task.yaml
new file mode 100644
index 00000000000..9198df30e20
--- /dev/null
+++ b/tests/nested/manual/core20-fault-inject-on-update-config/task.yaml
@@ -0,0 +1,100 @@
+summary: Ensure that the kernel command line can be updated when a panic occurs during the process
+
+details: |
+    The test checks that if a fault is injected during the update of the kernel command
+    line, then the change continues and is completed with a predictable outcome.
+
+systems: [ubuntu-2*]
+
+environment:
+    TAG/gadget_panic_command_line: update-command-line-gadget
+    FAULT/gadget_panic_command_line: panic
+    #TAG/gadget_panic_config_bootloader: update-config-bootloader
+    #FAULT/gadget_panic_config_bootloader: panic
+
+    NESTED_BUILD_SNAPD_FROM_CURRENT: true
+    NESTED_REPACK_GADGET_SNAP: false
+    NESTED_ENABLE_OVMF: true
+
+prepare: |
+    # Get the snakeoil key and cert for signing gadget assets (shim)
+    KEY_NAME=$(tests.nested download snakeoil-key)
+    SNAKEOIL_KEY="$PWD/$KEY_NAME.key"
+    SNAKEOIL_CERT="$PWD/$KEY_NAME.pem"
+
+    # Get the nested system version
+    VERSION="$(tests.nested show version)"
+
+    snap download --basename=pc --channel="$VERSION/edge" pc
+    unsquashfs -d pc-gadget pc.snap
+
+    # delay all refreshes for a week from now, as otherwise refreshes for our
+    # snaps (which are asserted by the testrootorg authority-id) may happen, which
+    # will break things because the signing keys won't match, etc. and
+    # specifically snap-bootstrap in the kernel snap from the store won't trust
+    # the seed keys to unlock the encrypted data partition in the initramfs
+    sed defaults.yaml -e "s/@HOLD-TIME@/$(date --date="next week" +%Y-%m-%dT%H:%M:%S%:z)/" >> \
+        pc-gadget/meta/gadget.yaml
+
+    # install the cloud.conf
+    cp cloud.conf pc-gadget/cloud.conf
+
+    # Sign boot assets
+    tests.nested secboot-sign gadget pc-gadget "$SNAKEOIL_KEY" "$SNAKEOIL_CERT"
+
+    # Add a list of allowed kernel arguments
+    allow_ls="\nkernel-cmdline:\n  allow:\n"
+    for arg in 'extra.val=1' 'extra.flag' 'foo=*'; do
+        allow_ls="${allow_ls}    - ${arg}\n"
+    done
+    printf "%b" "$allow_ls" >> pc-gadget/meta/gadget.yaml
+
+    snap pack pc-gadget/ "$(tests.nested get extra-snaps-path)"
+
+    tests.nested build-image core
+    tests.nested create-vm core
+
+    echo "Inject a $FAULT on $TAG"
+    cat <<EOF > fault-inject.conf
+    [Service]
+    Environment=SNAPPY_TESTING=1
+    Environment=SNAPD_FAULT_INJECT=$TAG:$FAULT
+    EOF
+
+    echo "Wait for the system to be seeded first"
+    remote.exec "sudo snap wait system seed.loaded"
+
+    remote.push fault-inject.conf
+    remote.exec "sudo mkdir -p /etc/systemd/system/snapd.service.d"
+    remote.exec "sudo cp -v fault-inject.conf /etc/systemd/system/snapd.service.d/"
+    remote.exec "sudo systemctl daemon-reload"
+    remote.exec "sudo systemctl restart snapd.service"
+
+execute: |
+    # Update the cmdline
+    remote.exec "sudo snap set system system.kernel.dangerous-cmdline-append='extradang.val=1 extradang.flag'"
+    # For this update, there are 3 changes generated
+    #3 Done today at 13:23 UTC today at 13:26 UTC Change configuration of "core" snap
+    #4 Done today at 13:23 UTC today at 13:25 UTC Update kernel command line due to change in system configuration
+    #5 Done today at 13:25 UTC today at 13:26 UTC Update kernel command line due to change in system configuration
+
+    # shellcheck disable=SC2016
+    retry --wait 1 -n 120 --env "CHANGE_ID=3" sh -c 'remote.exec snap changes | MATCH "${CHANGE_ID} .* (Done|Error) .*"'
+    # shellcheck disable=SC2016
+    retry --wait 1 -n 30 --env "CHANGE_ID=4" sh -c 'remote.exec snap changes | MATCH "${CHANGE_ID} .* (Done|Error) .*"'
+    # shellcheck disable=SC2016
+    retry --wait 1 -n 30 --env "CHANGE_ID=5" sh -c 'remote.exec snap changes | MATCH "${CHANGE_ID} .* (Done|Error) .*"'
+
+    # Check if the fault was injected
+    remote.exec "test -e /var/lib/snapd/faults/$TAG:$FAULT"
+
+    for change_id in 3 4 5; do
+        FINAL_STATUS="$(remote.exec snap changes | grep -E "^$change_id .*" | awk ' {print $2} ')"
+
+        echo "Ensure the change has no steps with Error"
+        [ "$FINAL_STATUS" = Done ]
+        remote.exec "snap change $change_id" | NOMATCH "^Error .*"
+    done
+
+    echo "Ensure the command line has been updated"
+    remote.exec "sudo cat /var/lib/snapd/modeenv" | MATCH "extradang.val=1"