From f8cc3364c9a16d42a666968dff90dbfbf7219260 Mon Sep 17 00:00:00 2001 From: Yury Kulazhenkov Date: Wed, 24 Jul 2024 14:26:16 +0300 Subject: [PATCH 1/9] Remove logic that installs rdma-core package The logic removed to align the code for all platforms. Before this change the state was following: * Ubuntu - the logic is completely broken and newer executed * RHCOS - validation only * RHEL - package installation (partially broken in RHEL 8 and RHEL 9) The commit changes logic for all OS to do a validation only. Signed-off-by: Yury Kulazhenkov --- cmd/sriov-network-config-daemon/service.go | 4 +- .../service_test.go | 6 +- doc/quickstart.md | 1 + pkg/daemon/daemon.go | 2 +- pkg/daemon/daemon_test.go | 2 +- pkg/helper/mock/mock_helper.go | 177 +--------- pkg/host/internal/common.go | 19 -- pkg/host/internal/kernel/kernel.go | 306 +----------------- pkg/host/mock/mock_host.go | 177 +--------- pkg/host/types/interfaces.go | 26 +- 10 files changed, 54 insertions(+), 666 deletions(-) delete mode 100644 pkg/host/internal/common.go diff --git a/cmd/sriov-network-config-daemon/service.go b/cmd/sriov-network-config-daemon/service.go index f86151cb7..afa5a5582 100644 --- a/cmd/sriov-network-config-daemon/service.go +++ b/cmd/sriov-network-config-daemon/service.go @@ -146,9 +146,9 @@ func phasePre(setupLog logr.Logger, conf *systemd.SriovConfig, hostHelpers helpe return fmt.Errorf("failed to remove sriov result file: %v", err) } - _, err := hostHelpers.TryEnableRdma() + _, err := hostHelpers.CheckRDMAEnabled() if err != nil { - setupLog.Error(err, "warning, failed to enable RDMA") + setupLog.Error(err, "warning, failed to check RDMA state") } hostHelpers.TryEnableTun() hostHelpers.TryEnableVhostNet() diff --git a/cmd/sriov-network-config-daemon/service_test.go b/cmd/sriov-network-config-daemon/service_test.go index 629949060..d7b9134f8 100644 --- a/cmd/sriov-network-config-daemon/service_test.go +++ b/cmd/sriov-network-config-daemon/service_test.go @@ -158,7 +158,7 @@ var _ = Describe("Service", func() { "/etc/sriov-operator/sriov-interface-result.yaml": []byte("something"), }, }) - hostHelpers.EXPECT().TryEnableRdma().Return(true, nil) + hostHelpers.EXPECT().CheckRDMAEnabled().Return(true, nil) hostHelpers.EXPECT().TryEnableTun().Return() hostHelpers.EXPECT().TryEnableVhostNet().Return() hostHelpers.EXPECT().DiscoverSriovDevices(hostHelpers).Return([]sriovnetworkv1.InterfaceExt{{ @@ -183,7 +183,7 @@ var _ = Describe("Service", func() { "/etc/sriov-operator/sriov-interface-result.yaml": []byte("something"), }, }) - hostHelpers.EXPECT().TryEnableRdma().Return(true, nil) + hostHelpers.EXPECT().CheckRDMAEnabled().Return(true, nil) hostHelpers.EXPECT().TryEnableTun().Return() hostHelpers.EXPECT().TryEnableVhostNet().Return() @@ -211,7 +211,7 @@ var _ = Describe("Service", func() { "/etc/sriov-operator/sriov-interface-result.yaml": []byte("something"), }, }) - hostHelpers.EXPECT().TryEnableRdma().Return(true, nil) + hostHelpers.EXPECT().CheckRDMAEnabled().Return(true, nil) hostHelpers.EXPECT().TryEnableTun().Return() hostHelpers.EXPECT().TryEnableVhostNet().Return() hostHelpers.EXPECT().DiscoverSriovDevices(hostHelpers).Return([]sriovnetworkv1.InterfaceExt{{ diff --git a/doc/quickstart.md b/doc/quickstart.md index f3f352bb5..18d686741 100644 --- a/doc/quickstart.md +++ b/doc/quickstart.md @@ -5,6 +5,7 @@ 1. A supported SRIOV hardware on the cluster nodes. Supported models can be found [here](https://github.com/k8snetworkplumbingwg/sriov-network-operator/blob/master/doc/supported-hardware.md). 2. Kubernetes or Openshift cluster running on bare metal nodes. 3. Multus-cni is deployed as default CNI plugin, and there is a default CNI plugin (flannel, openshift-sdn etc.) available for Multus-cni. +4. On RedHat Enterprise Linux and Ubuntu operating systems, the `rdma-core` package must be installed to support RDMA resource provisioning. On RedHat CoreOS the package installation is not required. ## Installation diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 0fd1ec2cb..4af9958f7 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -129,7 +129,7 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error { if !vars.UsingSystemdMode { log.Log.V(0).Info("Run(): daemon running in daemon mode") - dn.HostHelpers.TryEnableRdma() + dn.HostHelpers.CheckRDMAEnabled() dn.HostHelpers.TryEnableTun() dn.HostHelpers.TryEnableVhostNet() err := systemd.CleanSriovFilesFromHost(vars.ClusterType == consts.ClusterTypeOpenshift) diff --git a/pkg/daemon/daemon_test.go b/pkg/daemon/daemon_test.go index a1e29dcb2..ce932a1f1 100644 --- a/pkg/daemon/daemon_test.go +++ b/pkg/daemon/daemon_test.go @@ -145,7 +145,7 @@ var _ = Describe("Config Daemon", func() { platformHelper.EXPECT().IsHypershift().Return(false).AnyTimes() vendorHelper := mock_helper.NewMockHostHelpersInterface(mockCtrl) - vendorHelper.EXPECT().TryEnableRdma().Return(true, nil).AnyTimes() + vendorHelper.EXPECT().CheckRDMAEnabled().Return(true, nil).AnyTimes() vendorHelper.EXPECT().TryEnableVhostNet().AnyTimes() vendorHelper.EXPECT().TryEnableTun().AnyTimes() vendorHelper.EXPECT().PrepareNMUdevRule([]string{"0x1014", "0x154c"}).Return(nil).AnyTimes() diff --git a/pkg/helper/mock/mock_helper.go b/pkg/helper/mock/mock_helper.go index 23b32a1d4..baf48f2e3 100644 --- a/pkg/helper/mock/mock_helper.go +++ b/pkg/helper/mock/mock_helper.go @@ -122,6 +122,21 @@ func (mr *MockHostHelpersInterfaceMockRecorder) BindDriverByBusAndDevice(bus, de return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "BindDriverByBusAndDevice", reflect.TypeOf((*MockHostHelpersInterface)(nil).BindDriverByBusAndDevice), bus, device, driver) } +// CheckRDMAEnabled mocks base method. +func (m *MockHostHelpersInterface) CheckRDMAEnabled() (bool, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CheckRDMAEnabled") + ret0, _ := ret[0].(bool) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// CheckRDMAEnabled indicates an expected call of CheckRDMAEnabled. +func (mr *MockHostHelpersInterfaceMockRecorder) CheckRDMAEnabled() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CheckRDMAEnabled", reflect.TypeOf((*MockHostHelpersInterface)(nil).CheckRDMAEnabled)) +} + // Chroot mocks base method. func (m *MockHostHelpersInterface) Chroot(arg0 string) (func() error, error) { m.ctrl.T.Helper() @@ -279,36 +294,6 @@ func (mr *MockHostHelpersInterfaceMockRecorder) EnableHwTcOffload(ifaceName inte return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnableHwTcOffload", reflect.TypeOf((*MockHostHelpersInterface)(nil).EnableHwTcOffload), ifaceName) } -// EnableRDMA mocks base method. -func (m *MockHostHelpersInterface) EnableRDMA(conditionFilePath, serviceName, packageManager string) (bool, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "EnableRDMA", conditionFilePath, serviceName, packageManager) - ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// EnableRDMA indicates an expected call of EnableRDMA. -func (mr *MockHostHelpersInterfaceMockRecorder) EnableRDMA(conditionFilePath, serviceName, packageManager interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnableRDMA", reflect.TypeOf((*MockHostHelpersInterface)(nil).EnableRDMA), conditionFilePath, serviceName, packageManager) -} - -// EnableRDMAOnRHELMachine mocks base method. -func (m *MockHostHelpersInterface) EnableRDMAOnRHELMachine() (bool, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "EnableRDMAOnRHELMachine") - ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// EnableRDMAOnRHELMachine indicates an expected call of EnableRDMAOnRHELMachine. -func (mr *MockHostHelpersInterfaceMockRecorder) EnableRDMAOnRHELMachine() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnableRDMAOnRHELMachine", reflect.TypeOf((*MockHostHelpersInterface)(nil).EnableRDMAOnRHELMachine)) -} - // EnableService mocks base method. func (m *MockHostHelpersInterface) EnableService(service *types.Service) error { m.ctrl.T.Helper() @@ -527,21 +512,6 @@ func (mr *MockHostHelpersInterfaceMockRecorder) GetNicSriovMode(pciAddr interfac return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetNicSriovMode", reflect.TypeOf((*MockHostHelpersInterface)(nil).GetNicSriovMode), pciAddr) } -// GetOSPrettyName mocks base method. -func (m *MockHostHelpersInterface) GetOSPrettyName() (string, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetOSPrettyName") - ret0, _ := ret[0].(string) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// GetOSPrettyName indicates an expected call of GetOSPrettyName. -func (mr *MockHostHelpersInterfaceMockRecorder) GetOSPrettyName() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetOSPrettyName", reflect.TypeOf((*MockHostHelpersInterface)(nil).GetOSPrettyName)) -} - // GetPciAddressFromInterfaceName mocks base method. func (m *MockHostHelpersInterface) GetPciAddressFromInterfaceName(interfaceName string) (string, error) { m.ctrl.T.Helper() @@ -602,35 +572,6 @@ func (mr *MockHostHelpersInterfaceMockRecorder) HasDriver(pciAddr interface{}) * return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "HasDriver", reflect.TypeOf((*MockHostHelpersInterface)(nil).HasDriver), pciAddr) } -// InstallRDMA mocks base method. -func (m *MockHostHelpersInterface) InstallRDMA(packageManager string) error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "InstallRDMA", packageManager) - ret0, _ := ret[0].(error) - return ret0 -} - -// InstallRDMA indicates an expected call of InstallRDMA. -func (mr *MockHostHelpersInterfaceMockRecorder) InstallRDMA(packageManager interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallRDMA", reflect.TypeOf((*MockHostHelpersInterface)(nil).InstallRDMA), packageManager) -} - -// IsCoreOS mocks base method. -func (m *MockHostHelpersInterface) IsCoreOS() (bool, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "IsCoreOS") - ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// IsCoreOS indicates an expected call of IsCoreOS. -func (mr *MockHostHelpersInterfaceMockRecorder) IsCoreOS() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsCoreOS", reflect.TypeOf((*MockHostHelpersInterface)(nil).IsCoreOS)) -} - // IsKernelArgsSet mocks base method. func (m *MockHostHelpersInterface) IsKernelArgsSet(cmdLine, karg string) bool { m.ctrl.T.Helper() @@ -674,21 +615,6 @@ func (mr *MockHostHelpersInterfaceMockRecorder) IsKernelModuleLoaded(name interf return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsKernelModuleLoaded", reflect.TypeOf((*MockHostHelpersInterface)(nil).IsKernelModuleLoaded), name) } -// IsRHELSystem mocks base method. -func (m *MockHostHelpersInterface) IsRHELSystem() (bool, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "IsRHELSystem") - ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// IsRHELSystem indicates an expected call of IsRHELSystem. -func (mr *MockHostHelpersInterfaceMockRecorder) IsRHELSystem() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsRHELSystem", reflect.TypeOf((*MockHostHelpersInterface)(nil).IsRHELSystem)) -} - // IsServiceEnabled mocks base method. func (m *MockHostHelpersInterface) IsServiceEnabled(servicePath string) (bool, error) { m.ctrl.T.Helper() @@ -733,21 +659,6 @@ func (mr *MockHostHelpersInterfaceMockRecorder) IsSwitchdev(name interface{}) *g return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsSwitchdev", reflect.TypeOf((*MockHostHelpersInterface)(nil).IsSwitchdev), name) } -// IsUbuntuSystem mocks base method. -func (m *MockHostHelpersInterface) IsUbuntuSystem() (bool, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "IsUbuntuSystem") - ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// IsUbuntuSystem indicates an expected call of IsUbuntuSystem. -func (mr *MockHostHelpersInterfaceMockRecorder) IsUbuntuSystem() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsUbuntuSystem", reflect.TypeOf((*MockHostHelpersInterface)(nil).IsUbuntuSystem)) -} - // LoadKernelModule mocks base method. func (m *MockHostHelpersInterface) LoadKernelModule(name string, args ...string) error { m.ctrl.T.Helper() @@ -855,21 +766,6 @@ func (mr *MockHostHelpersInterfaceMockRecorder) PrepareVFRepUdevRule() *gomock.C return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PrepareVFRepUdevRule", reflect.TypeOf((*MockHostHelpersInterface)(nil).PrepareVFRepUdevRule)) } -// RdmaIsLoaded mocks base method. -func (m *MockHostHelpersInterface) RdmaIsLoaded() (bool, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "RdmaIsLoaded") - ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// RdmaIsLoaded indicates an expected call of RdmaIsLoaded. -func (mr *MockHostHelpersInterfaceMockRecorder) RdmaIsLoaded() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RdmaIsLoaded", reflect.TypeOf((*MockHostHelpersInterface)(nil).RdmaIsLoaded)) -} - // ReadService mocks base method. func (m *MockHostHelpersInterface) ReadService(servicePath string) (*types.Service, error) { m.ctrl.T.Helper() @@ -929,20 +825,6 @@ func (mr *MockHostHelpersInterfaceMockRecorder) RebindVfToDefaultDriver(pciAddr return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RebindVfToDefaultDriver", reflect.TypeOf((*MockHostHelpersInterface)(nil).RebindVfToDefaultDriver), pciAddr) } -// ReloadDriver mocks base method. -func (m *MockHostHelpersInterface) ReloadDriver(driver string) error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "ReloadDriver", driver) - ret0, _ := ret[0].(error) - return ret0 -} - -// ReloadDriver indicates an expected call of ReloadDriver. -func (mr *MockHostHelpersInterfaceMockRecorder) ReloadDriver(driver interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReloadDriver", reflect.TypeOf((*MockHostHelpersInterface)(nil).ReloadDriver), driver) -} - // RemoveDisableNMUdevRule mocks base method. func (m *MockHostHelpersInterface) RemoveDisableNMUdevRule(pfPciAddress string) error { m.ctrl.T.Helper() @@ -1118,35 +1000,6 @@ func (mr *MockHostHelpersInterfaceMockRecorder) SetVfAdminMac(vfAddr, pfLink, vf return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetVfAdminMac", reflect.TypeOf((*MockHostHelpersInterface)(nil).SetVfAdminMac), vfAddr, pfLink, vfLink) } -// TriggerUdevEvent mocks base method. -func (m *MockHostHelpersInterface) TriggerUdevEvent() error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "TriggerUdevEvent") - ret0, _ := ret[0].(error) - return ret0 -} - -// TriggerUdevEvent indicates an expected call of TriggerUdevEvent. -func (mr *MockHostHelpersInterfaceMockRecorder) TriggerUdevEvent() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TriggerUdevEvent", reflect.TypeOf((*MockHostHelpersInterface)(nil).TriggerUdevEvent)) -} - -// TryEnableRdma mocks base method. -func (m *MockHostHelpersInterface) TryEnableRdma() (bool, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "TryEnableRdma") - ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// TryEnableRdma indicates an expected call of TryEnableRdma. -func (mr *MockHostHelpersInterfaceMockRecorder) TryEnableRdma() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TryEnableRdma", reflect.TypeOf((*MockHostHelpersInterface)(nil).TryEnableRdma)) -} - // TryEnableTun mocks base method. func (m *MockHostHelpersInterface) TryEnableTun() { m.ctrl.T.Helper() diff --git a/pkg/host/internal/common.go b/pkg/host/internal/common.go deleted file mode 100644 index b32d2cd93..000000000 --- a/pkg/host/internal/common.go +++ /dev/null @@ -1,19 +0,0 @@ -package internal - -import ( - "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" -) - -const ( - HostPathFromDaemon = consts.Host - RedhatReleaseFile = "/etc/redhat-release" - RhelRDMAConditionFile = "/usr/libexec/rdma-init-kernel" - RhelRDMAServiceName = "rdma" - RhelPackageManager = "yum" - - UbuntuRDMAConditionFile = "/usr/sbin/rdma-ndd" - UbuntuRDMAServiceName = "rdma-ndd" - UbuntuPackageManager = "apt-get" - - GenericOSReleaseFile = "/etc/os-release" -) diff --git a/pkg/host/internal/kernel/kernel.go b/pkg/host/internal/kernel/kernel.go index 958f2590a..95a5c44a5 100644 --- a/pkg/host/internal/kernel/kernel.go +++ b/pkg/host/internal/kernel/kernel.go @@ -11,7 +11,6 @@ import ( sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" - "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/host/internal" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/host/types" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars" @@ -259,320 +258,43 @@ func (k *kernel) GetDriverByBusAndDevice(bus, device string) (string, error) { return getDriverByBusAndDevice(bus, device) } -func (k *kernel) TryEnableRdma() (bool, error) { - log.Log.V(2).Info("tryEnableRdma()") +// CheckRDMAEnabled returns true if RDMA modules are loaded on host +func (k *kernel) CheckRDMAEnabled() (bool, error) { + log.Log.V(2).Info("CheckRDMAEnabled()") chrootDefinition := utils.GetChrootExtension() - // check if the driver is already loaded in to the system - _, stderr, mlx4Err := k.utilsHelper.RunCommand("/bin/sh", "-c", fmt.Sprintf("grep --quiet 'mlx4_en' <(%s lsmod)", chrootDefinition)) - if mlx4Err != nil && len(stderr) != 0 { - log.Log.Error(mlx4Err, "tryEnableRdma(): failed to check for kernel module 'mlx4_en'", "stderr", stderr) - return false, fmt.Errorf(stderr) - } - - _, stderr, mlx5Err := k.utilsHelper.RunCommand("/bin/sh", "-c", fmt.Sprintf("grep --quiet 'mlx5_core' <(%s lsmod)", chrootDefinition)) + _, stderr, mlx5Err := k.utilsHelper.RunCommand("/bin/sh", "-c", fmt.Sprintf("%s lsmod | grep --quiet 'mlx5_core'", chrootDefinition)) if mlx5Err != nil && len(stderr) != 0 { - log.Log.Error(mlx5Err, "tryEnableRdma(): failed to check for kernel module 'mlx5_core'", "stderr", stderr) + log.Log.Error(mlx5Err, "CheckRDMAEnabled(): failed to check for kernel module 'mlx5_core'", "stderr", stderr) return false, fmt.Errorf(stderr) } - if mlx4Err != nil && mlx5Err != nil { - log.Log.Error(nil, "tryEnableRdma(): no RDMA capable devices") + if mlx5Err != nil { + log.Log.Error(nil, "CheckRDMAEnabled(): no RDMA capable devices") return false, nil } - - isRhelSystem, err := k.IsRHELSystem() - if err != nil { - log.Log.Error(err, "tryEnableRdma(): failed to check if the machine is base on RHEL") - return false, err - } - - // RHEL check - if isRhelSystem { - return k.EnableRDMAOnRHELMachine() - } - - isUbuntuSystem, err := k.IsUbuntuSystem() - if err != nil { - log.Log.Error(err, "tryEnableRdma(): failed to check if the machine is base on Ubuntu") - return false, err - } - - if isUbuntuSystem { - return k.EnableRDMAOnUbuntuMachine() - } - - osName, err := k.GetOSPrettyName() - if err != nil { - log.Log.Error(err, "tryEnableRdma(): failed to check OS name") - return false, err - } - - log.Log.Error(nil, "tryEnableRdma(): Unsupported OS", "name", osName) - return false, fmt.Errorf("unable to load RDMA unsupported OS: %s", osName) -} - -func (k *kernel) EnableRDMAOnRHELMachine() (bool, error) { - log.Log.Info("EnableRDMAOnRHELMachine()") - isCoreOsSystem, err := k.IsCoreOS() - if err != nil { - log.Log.Error(err, "EnableRDMAOnRHELMachine(): failed to check if the machine runs CoreOS") - return false, err - } - - // CoreOS check - if isCoreOsSystem { - isRDMALoaded, err := k.RdmaIsLoaded() - if err != nil { - log.Log.Error(err, "EnableRDMAOnRHELMachine(): failed to check if RDMA kernel modules are loaded") - return false, err - } - - return isRDMALoaded, nil - } - - // RHEL - log.Log.Info("EnableRDMAOnRHELMachine(): enabling RDMA on RHEL machine") - isRDMAEnable, err := k.EnableRDMA(internal.RhelRDMAConditionFile, internal.RhelRDMAServiceName, internal.RhelPackageManager) - if err != nil { - log.Log.Error(err, "EnableRDMAOnRHELMachine(): failed to enable RDMA on RHEL machine") - return false, err - } - - // check if we need to install rdma-core package - if isRDMAEnable { - isRDMALoaded, err := k.RdmaIsLoaded() - if err != nil { - log.Log.Error(err, "EnableRDMAOnRHELMachine(): failed to check if RDMA kernel modules are loaded") - return false, err - } - - // if ib kernel module is not loaded trigger a loading - if isRDMALoaded { - err = k.TriggerUdevEvent() - if err != nil { - log.Log.Error(err, "EnableRDMAOnRHELMachine() failed to trigger udev event") - return false, err - } - } - } - - return true, nil -} - -func (k *kernel) EnableRDMAOnUbuntuMachine() (bool, error) { - log.Log.Info("EnableRDMAOnUbuntuMachine(): enabling RDMA on RHEL machine") - isRDMAEnable, err := k.EnableRDMA(internal.UbuntuRDMAConditionFile, internal.UbuntuRDMAServiceName, internal.UbuntuPackageManager) - if err != nil { - log.Log.Error(err, "EnableRDMAOnUbuntuMachine(): failed to enable RDMA on Ubuntu machine") - return false, err - } - - // check if we need to install rdma-core package - if isRDMAEnable { - isRDMALoaded, err := k.RdmaIsLoaded() - if err != nil { - log.Log.Error(err, "EnableRDMAOnUbuntuMachine(): failed to check if RDMA kernel modules are loaded") - return false, err - } - - // if ib kernel module is not loaded trigger a loading - if isRDMALoaded { - err = k.TriggerUdevEvent() - if err != nil { - log.Log.Error(err, "EnableRDMAOnUbuntuMachine() failed to trigger udev event") - return false, err - } - } - } - - return true, nil -} - -func (k *kernel) IsRHELSystem() (bool, error) { - log.Log.Info("IsRHELSystem(): checking for RHEL machine") - path := internal.RedhatReleaseFile - if !vars.UsingSystemdMode { - path = filepath.Join(internal.HostPathFromDaemon, path) - } - if _, err := os.Stat(path); err != nil { - if os.IsNotExist(err) { - log.Log.V(2).Info("IsRHELSystem() not a RHEL machine") - return false, nil - } - - log.Log.Error(err, "IsRHELSystem() failed to check for os release file", "path", path) - return false, err - } - - return true, nil -} - -func (k *kernel) IsCoreOS() (bool, error) { - log.Log.Info("IsCoreOS(): checking for CoreOS machine") - path := internal.RedhatReleaseFile - if !vars.UsingSystemdMode { - path = filepath.Join(internal.HostPathFromDaemon, path) - } - - data, err := os.ReadFile(path) - if err != nil { - log.Log.Error(err, "IsCoreOS(): failed to read RHEL release file on path", "path", path) - return false, err - } - - if strings.Contains(string(data), "CoreOS") { - return true, nil - } - - return false, nil -} - -func (k *kernel) IsUbuntuSystem() (bool, error) { - log.Log.Info("IsUbuntuSystem(): checking for Ubuntu machine") - path := internal.GenericOSReleaseFile - if !vars.UsingSystemdMode { - path = filepath.Join(internal.HostPathFromDaemon, path) - } - - if _, err := os.Stat(path); err != nil { - if os.IsNotExist(err) { - log.Log.Error(nil, "IsUbuntuSystem() os-release on path doesn't exist", "path", path) - return false, err - } - - log.Log.Error(err, "IsUbuntuSystem() failed to check for os release file", "path", path) - return false, err - } - - stdout, stderr, err := k.utilsHelper.RunCommand("/bin/sh", "-c", fmt.Sprintf("grep -i --quiet 'ubuntu' %s", path)) - if err != nil && len(stderr) != 0 { - log.Log.Error(err, "IsUbuntuSystem(): failed to check for ubuntu operating system name in os-releasae file", "stderr", stderr) - return false, fmt.Errorf(stderr) - } - - if len(stdout) > 0 { - return true, nil - } - - return false, nil + return k.rdmaModulesAreLoaded() } -func (k *kernel) RdmaIsLoaded() (bool, error) { - log.Log.V(2).Info("RdmaIsLoaded()") +func (k *kernel) rdmaModulesAreLoaded() (bool, error) { + log.Log.V(2).Info("rdmaModulesAreLoaded()") chrootDefinition := utils.GetChrootExtension() // check if the driver is already loaded in to the system - _, stderr, err := k.utilsHelper.RunCommand("/bin/sh", "-c", fmt.Sprintf("grep --quiet '\\(^ib\\|^rdma\\)' <(%s lsmod)", chrootDefinition)) + _, stderr, err := k.utilsHelper.RunCommand("/bin/sh", "-c", fmt.Sprintf("%s lsmod | grep --quiet '\\(^ib\\|^rdma\\)'", chrootDefinition)) if err != nil && len(stderr) != 0 { - log.Log.Error(err, "RdmaIsLoaded(): fail to check if ib and rdma kernel modules are loaded", "stderr", stderr) + log.Log.Error(err, "rdmaModulesAreLoaded(): fail to check if ib and rdma kernel modules are loaded", "stderr", stderr) return false, fmt.Errorf(stderr) } if err != nil { + log.Log.Error(nil, "rdmaModulesAreLoaded(): RDMA modules are not loaded, you may need to install rdma-core package") return false, nil } - + log.Log.V(2).Info("rdmaModulesAreLoaded(): RDMA modules are loaded") return true, nil } -func (k *kernel) EnableRDMA(conditionFilePath, serviceName, packageManager string) (bool, error) { - path := conditionFilePath - if !vars.UsingSystemdMode { - path = filepath.Join(internal.HostPathFromDaemon, path) - } - log.Log.Info("EnableRDMA(): checking for service file", "path", path) - - if _, err := os.Stat(path); err != nil { - if os.IsNotExist(err) { - log.Log.V(2).Info("EnableRDMA(): RDMA server doesn't exist") - err = k.InstallRDMA(packageManager) - if err != nil { - log.Log.Error(err, "EnableRDMA() failed to install RDMA package") - return false, err - } - - err = k.TriggerUdevEvent() - if err != nil { - log.Log.Error(err, "EnableRDMA() failed to trigger udev event") - return false, err - } - - return false, nil - } - - log.Log.Error(err, "EnableRDMA() failed to check for os release file", "path", path) - return false, err - } - - log.Log.Info("EnableRDMA(): service installed", "name", serviceName) - return true, nil -} - -func (k *kernel) InstallRDMA(packageManager string) error { - log.Log.Info("InstallRDMA(): installing RDMA") - chrootDefinition := utils.GetChrootExtension() - - stdout, stderr, err := k.utilsHelper.RunCommand("/bin/sh", "-c", fmt.Sprintf("%s %s install -y rdma-core", chrootDefinition, packageManager)) - if err != nil && len(stderr) != 0 { - log.Log.Error(err, "InstallRDMA(): failed to install RDMA package", "stdout", stdout, "stderr", stderr) - return err - } - - return nil -} - -func (k *kernel) TriggerUdevEvent() error { - log.Log.Info("TriggerUdevEvent(): installing RDMA") - - err := k.ReloadDriver("mlx4_en") - if err != nil { - return err - } - - err = k.ReloadDriver("mlx5_core") - if err != nil { - return err - } - - return nil -} - -func (k *kernel) ReloadDriver(driverName string) error { - log.Log.Info("ReloadDriver(): reload driver", "name", driverName) - chrootDefinition := utils.GetChrootExtension() - - _, stderr, err := k.utilsHelper.RunCommand("/bin/sh", "-c", fmt.Sprintf("%s modprobe -r %s && %s modprobe %s", chrootDefinition, driverName, chrootDefinition, driverName)) - if err != nil && len(stderr) != 0 { - log.Log.Error(err, "ReloadDriver(): failed to reload kernel module", - "name", driverName, "stderr", stderr) - return err - } - - return nil -} - -func (k *kernel) GetOSPrettyName() (string, error) { - path := internal.GenericOSReleaseFile - if !vars.UsingSystemdMode { - path = filepath.Join(internal.HostPathFromDaemon, path) - } - - log.Log.Info("GetOSPrettyName(): getting os name from os-release file") - - stdout, stderr, err := k.utilsHelper.RunCommand("/bin/sh", "-c", fmt.Sprintf("cat %s | grep PRETTY_NAME | cut -c 13-", path)) - if err != nil && len(stderr) != 0 { - log.Log.Error(err, "GetOSPrettyName(): failed to check for operating system name in os-release file", "stderr", stderr) - return "", fmt.Errorf(stderr) - } - - if len(stdout) > 0 { - return stdout, nil - } - - return "", fmt.Errorf("failed to find pretty operating system name") -} - // IsKernelLockdownMode returns true when kernel lockdown mode is enabled // TODO: change this to return error func (k *kernel) IsKernelLockdownMode() bool { diff --git a/pkg/host/mock/mock_host.go b/pkg/host/mock/mock_host.go index 72155f557..36c96b69a 100644 --- a/pkg/host/mock/mock_host.go +++ b/pkg/host/mock/mock_host.go @@ -121,6 +121,21 @@ func (mr *MockHostManagerInterfaceMockRecorder) BindDriverByBusAndDevice(bus, de return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "BindDriverByBusAndDevice", reflect.TypeOf((*MockHostManagerInterface)(nil).BindDriverByBusAndDevice), bus, device, driver) } +// CheckRDMAEnabled mocks base method. +func (m *MockHostManagerInterface) CheckRDMAEnabled() (bool, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CheckRDMAEnabled") + ret0, _ := ret[0].(bool) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// CheckRDMAEnabled indicates an expected call of CheckRDMAEnabled. +func (mr *MockHostManagerInterfaceMockRecorder) CheckRDMAEnabled() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CheckRDMAEnabled", reflect.TypeOf((*MockHostManagerInterface)(nil).CheckRDMAEnabled)) +} + // CompareServices mocks base method. func (m *MockHostManagerInterface) CompareServices(serviceA, serviceB *types.Service) (bool, error) { m.ctrl.T.Helper() @@ -249,36 +264,6 @@ func (mr *MockHostManagerInterfaceMockRecorder) EnableHwTcOffload(ifaceName inte return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnableHwTcOffload", reflect.TypeOf((*MockHostManagerInterface)(nil).EnableHwTcOffload), ifaceName) } -// EnableRDMA mocks base method. -func (m *MockHostManagerInterface) EnableRDMA(conditionFilePath, serviceName, packageManager string) (bool, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "EnableRDMA", conditionFilePath, serviceName, packageManager) - ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// EnableRDMA indicates an expected call of EnableRDMA. -func (mr *MockHostManagerInterfaceMockRecorder) EnableRDMA(conditionFilePath, serviceName, packageManager interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnableRDMA", reflect.TypeOf((*MockHostManagerInterface)(nil).EnableRDMA), conditionFilePath, serviceName, packageManager) -} - -// EnableRDMAOnRHELMachine mocks base method. -func (m *MockHostManagerInterface) EnableRDMAOnRHELMachine() (bool, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "EnableRDMAOnRHELMachine") - ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// EnableRDMAOnRHELMachine indicates an expected call of EnableRDMAOnRHELMachine. -func (mr *MockHostManagerInterfaceMockRecorder) EnableRDMAOnRHELMachine() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnableRDMAOnRHELMachine", reflect.TypeOf((*MockHostManagerInterface)(nil).EnableRDMAOnRHELMachine)) -} - // EnableService mocks base method. func (m *MockHostManagerInterface) EnableService(service *types.Service) error { m.ctrl.T.Helper() @@ -451,21 +436,6 @@ func (mr *MockHostManagerInterfaceMockRecorder) GetNicSriovMode(pciAddr interfac return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetNicSriovMode", reflect.TypeOf((*MockHostManagerInterface)(nil).GetNicSriovMode), pciAddr) } -// GetOSPrettyName mocks base method. -func (m *MockHostManagerInterface) GetOSPrettyName() (string, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetOSPrettyName") - ret0, _ := ret[0].(string) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// GetOSPrettyName indicates an expected call of GetOSPrettyName. -func (mr *MockHostManagerInterfaceMockRecorder) GetOSPrettyName() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetOSPrettyName", reflect.TypeOf((*MockHostManagerInterface)(nil).GetOSPrettyName)) -} - // GetPciAddressFromInterfaceName mocks base method. func (m *MockHostManagerInterface) GetPciAddressFromInterfaceName(interfaceName string) (string, error) { m.ctrl.T.Helper() @@ -526,35 +496,6 @@ func (mr *MockHostManagerInterfaceMockRecorder) HasDriver(pciAddr interface{}) * return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "HasDriver", reflect.TypeOf((*MockHostManagerInterface)(nil).HasDriver), pciAddr) } -// InstallRDMA mocks base method. -func (m *MockHostManagerInterface) InstallRDMA(packageManager string) error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "InstallRDMA", packageManager) - ret0, _ := ret[0].(error) - return ret0 -} - -// InstallRDMA indicates an expected call of InstallRDMA. -func (mr *MockHostManagerInterfaceMockRecorder) InstallRDMA(packageManager interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InstallRDMA", reflect.TypeOf((*MockHostManagerInterface)(nil).InstallRDMA), packageManager) -} - -// IsCoreOS mocks base method. -func (m *MockHostManagerInterface) IsCoreOS() (bool, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "IsCoreOS") - ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// IsCoreOS indicates an expected call of IsCoreOS. -func (mr *MockHostManagerInterfaceMockRecorder) IsCoreOS() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsCoreOS", reflect.TypeOf((*MockHostManagerInterface)(nil).IsCoreOS)) -} - // IsKernelArgsSet mocks base method. func (m *MockHostManagerInterface) IsKernelArgsSet(cmdLine, karg string) bool { m.ctrl.T.Helper() @@ -598,21 +539,6 @@ func (mr *MockHostManagerInterfaceMockRecorder) IsKernelModuleLoaded(name interf return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsKernelModuleLoaded", reflect.TypeOf((*MockHostManagerInterface)(nil).IsKernelModuleLoaded), name) } -// IsRHELSystem mocks base method. -func (m *MockHostManagerInterface) IsRHELSystem() (bool, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "IsRHELSystem") - ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// IsRHELSystem indicates an expected call of IsRHELSystem. -func (mr *MockHostManagerInterfaceMockRecorder) IsRHELSystem() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsRHELSystem", reflect.TypeOf((*MockHostManagerInterface)(nil).IsRHELSystem)) -} - // IsServiceEnabled mocks base method. func (m *MockHostManagerInterface) IsServiceEnabled(servicePath string) (bool, error) { m.ctrl.T.Helper() @@ -657,21 +583,6 @@ func (mr *MockHostManagerInterfaceMockRecorder) IsSwitchdev(name interface{}) *g return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsSwitchdev", reflect.TypeOf((*MockHostManagerInterface)(nil).IsSwitchdev), name) } -// IsUbuntuSystem mocks base method. -func (m *MockHostManagerInterface) IsUbuntuSystem() (bool, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "IsUbuntuSystem") - ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// IsUbuntuSystem indicates an expected call of IsUbuntuSystem. -func (mr *MockHostManagerInterfaceMockRecorder) IsUbuntuSystem() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsUbuntuSystem", reflect.TypeOf((*MockHostManagerInterface)(nil).IsUbuntuSystem)) -} - // LoadKernelModule mocks base method. func (m *MockHostManagerInterface) LoadKernelModule(name string, args ...string) error { m.ctrl.T.Helper() @@ -733,21 +644,6 @@ func (mr *MockHostManagerInterfaceMockRecorder) PrepareVFRepUdevRule() *gomock.C return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PrepareVFRepUdevRule", reflect.TypeOf((*MockHostManagerInterface)(nil).PrepareVFRepUdevRule)) } -// RdmaIsLoaded mocks base method. -func (m *MockHostManagerInterface) RdmaIsLoaded() (bool, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "RdmaIsLoaded") - ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// RdmaIsLoaded indicates an expected call of RdmaIsLoaded. -func (mr *MockHostManagerInterfaceMockRecorder) RdmaIsLoaded() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RdmaIsLoaded", reflect.TypeOf((*MockHostManagerInterface)(nil).RdmaIsLoaded)) -} - // ReadService mocks base method. func (m *MockHostManagerInterface) ReadService(servicePath string) (*types.Service, error) { m.ctrl.T.Helper() @@ -807,20 +703,6 @@ func (mr *MockHostManagerInterfaceMockRecorder) RebindVfToDefaultDriver(pciAddr return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RebindVfToDefaultDriver", reflect.TypeOf((*MockHostManagerInterface)(nil).RebindVfToDefaultDriver), pciAddr) } -// ReloadDriver mocks base method. -func (m *MockHostManagerInterface) ReloadDriver(driver string) error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "ReloadDriver", driver) - ret0, _ := ret[0].(error) - return ret0 -} - -// ReloadDriver indicates an expected call of ReloadDriver. -func (mr *MockHostManagerInterfaceMockRecorder) ReloadDriver(driver interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReloadDriver", reflect.TypeOf((*MockHostManagerInterface)(nil).ReloadDriver), driver) -} - // RemoveDisableNMUdevRule mocks base method. func (m *MockHostManagerInterface) RemoveDisableNMUdevRule(pfPciAddress string) error { m.ctrl.T.Helper() @@ -947,35 +829,6 @@ func (mr *MockHostManagerInterfaceMockRecorder) SetVfAdminMac(vfAddr, pfLink, vf return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetVfAdminMac", reflect.TypeOf((*MockHostManagerInterface)(nil).SetVfAdminMac), vfAddr, pfLink, vfLink) } -// TriggerUdevEvent mocks base method. -func (m *MockHostManagerInterface) TriggerUdevEvent() error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "TriggerUdevEvent") - ret0, _ := ret[0].(error) - return ret0 -} - -// TriggerUdevEvent indicates an expected call of TriggerUdevEvent. -func (mr *MockHostManagerInterfaceMockRecorder) TriggerUdevEvent() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TriggerUdevEvent", reflect.TypeOf((*MockHostManagerInterface)(nil).TriggerUdevEvent)) -} - -// TryEnableRdma mocks base method. -func (m *MockHostManagerInterface) TryEnableRdma() (bool, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "TryEnableRdma") - ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// TryEnableRdma indicates an expected call of TryEnableRdma. -func (mr *MockHostManagerInterfaceMockRecorder) TryEnableRdma() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TryEnableRdma", reflect.TypeOf((*MockHostManagerInterface)(nil).TryEnableRdma)) -} - // TryEnableTun mocks base method. func (m *MockHostManagerInterface) TryEnableTun() { m.ctrl.T.Helper() diff --git a/pkg/host/types/interfaces.go b/pkg/host/types/interfaces.go index ea2944938..5918dca34 100644 --- a/pkg/host/types/interfaces.go +++ b/pkg/host/types/interfaces.go @@ -12,12 +12,8 @@ type KernelInterface interface { TryEnableTun() // TryEnableVhostNet load the vhost-net kernel module TryEnableVhostNet() - // TryEnableRdma tries to enable RDMA on the machine base on the operating system - // if the package doesn't exist it will also will try to install it - // supported operating systems are RHEL RHCOS and ubuntu - TryEnableRdma() (bool, error) - // TriggerUdevEvent triggers a udev event - TriggerUdevEvent() error + // CheckRDMAEnabled returns true if RDMA modules are loaded on host + CheckRDMAEnabled() (bool, error) // GetCurrentKernelArgs reads the /proc/cmdline to check the current kernel arguments GetCurrentKernelArgs() (string, error) // IsKernelArgsSet check is the requested kernel arguments are set @@ -52,26 +48,8 @@ type KernelInterface interface { LoadKernelModule(name string, args ...string) error // IsKernelModuleLoaded returns try if the requested kernel module is loaded IsKernelModuleLoaded(name string) (bool, error) - // ReloadDriver reloads a requested driver - ReloadDriver(driver string) error // IsKernelLockdownMode returns true if the kernel is in lockdown mode IsKernelLockdownMode() bool - // IsRHELSystem returns try if the system is a RHEL base - IsRHELSystem() (bool, error) - // IsUbuntuSystem returns try if the system is an ubuntu base - IsUbuntuSystem() (bool, error) - // IsCoreOS returns true if the system is a CoreOS or RHCOS base - IsCoreOS() (bool, error) - // RdmaIsLoaded returns try if RDMA kernel modules are loaded - RdmaIsLoaded() (bool, error) - // EnableRDMA enable RDMA on the system - EnableRDMA(conditionFilePath, serviceName, packageManager string) (bool, error) - // InstallRDMA install RDMA packages on the system - InstallRDMA(packageManager string) error - // EnableRDMAOnRHELMachine enable RDMA on a RHEL base system - EnableRDMAOnRHELMachine() (bool, error) - // GetOSPrettyName returns OS name - GetOSPrettyName() (string, error) } type NetworkInterface interface { From 0f1a72e648caf273e685cc22fbbda6de87927b35 Mon Sep 17 00:00:00 2001 From: Tobias Giese Date: Thu, 11 Jul 2024 13:39:25 +0200 Subject: [PATCH 2/9] feat: implement MlxResetFW to reset the FW on VF changes Signed-off-by: Tobias Giese --- Dockerfile.sriov-network-config-daemon | 4 +++- pkg/helper/mock/mock_helper.go | 14 ++++++++++++++ pkg/plugins/mellanox/mellanox_plugin.go | 11 ++++++++++- pkg/vendors/mellanox/mellanox.go | 18 ++++++++++++++++++ pkg/vendors/mellanox/mock/mock_mellanox.go | 14 ++++++++++++++ 5 files changed, 59 insertions(+), 2 deletions(-) diff --git a/Dockerfile.sriov-network-config-daemon b/Dockerfile.sriov-network-config-daemon index 219d77c6d..35533448f 100644 --- a/Dockerfile.sriov-network-config-daemon +++ b/Dockerfile.sriov-network-config-daemon @@ -5,7 +5,9 @@ RUN make _build-sriov-network-config-daemon BIN_PATH=build/_output/cmd FROM quay.io/centos/centos:stream9 ARG MSTFLINT=mstflint -RUN ARCH_DEP_PKGS=$(if [ "$(uname -m)" != "s390x" ]; then echo -n ${MSTFLINT} ; fi) && yum -y install hwdata $ARCH_DEP_PKGS && yum clean all +# We have to ensure that pciutils is installed. This package is needed for mstfwreset to succeed. +# xref pkg/vendors/mellanox/mellanox.go#L150 +RUN ARCH_DEP_PKGS=$(if [ "$(uname -m)" != "s390x" ]; then echo -n ${MSTFLINT} ; fi) && yum -y install hwdata pciutils $ARCH_DEP_PKGS && yum clean all LABEL io.k8s.display-name="sriov-network-config-daemon" \ io.k8s.description="This is a daemon that manage and config sriov network devices in Kubernetes cluster" COPY --from=builder /go/src/github.com/k8snetworkplumbingwg/sriov-network-operator/build/_output/cmd/sriov-network-config-daemon /usr/bin/ diff --git a/pkg/helper/mock/mock_helper.go b/pkg/helper/mock/mock_helper.go index 23b32a1d4..03e0da573 100644 --- a/pkg/helper/mock/mock_helper.go +++ b/pkg/helper/mock/mock_helper.go @@ -811,6 +811,20 @@ func (mr *MockHostHelpersInterfaceMockRecorder) MlxConfigFW(attributesToChange i return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MlxConfigFW", reflect.TypeOf((*MockHostHelpersInterface)(nil).MlxConfigFW), attributesToChange) } +// MlxResetFW mocks base method. +func (m *MockHostHelpersInterface) MlxResetFW(pciAddresses []string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "MlxResetFW", pciAddresses) + ret0, _ := ret[0].(error) + return ret0 +} + +// MlxResetFW indicates an expected call of MlxResetFW. +func (mr *MockHostHelpersInterfaceMockRecorder) MlxResetFW(pciAddresses interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MlxResetFW", reflect.TypeOf((*MockHostHelpersInterface)(nil).MlxResetFW), pciAddresses) +} + // MstConfigReadData mocks base method. func (m *MockHostHelpersInterface) MstConfigReadData(arg0 string) (string, string, error) { m.ctrl.T.Helper() diff --git a/pkg/plugins/mellanox/mellanox_plugin.go b/pkg/plugins/mellanox/mellanox_plugin.go index e5e9a9913..827a226ac 100644 --- a/pkg/plugins/mellanox/mellanox_plugin.go +++ b/pkg/plugins/mellanox/mellanox_plugin.go @@ -20,6 +20,7 @@ type MellanoxPlugin struct { helpers helper.HostHelpersInterface } +var pciAddressesToReset []string var attributesToChange map[string]mlx.MlxNic var mellanoxNicsStatus map[string]map[string]sriovnetworkv1.InterfaceExt var mellanoxNicsSpec map[string]sriovnetworkv1.Interface @@ -52,6 +53,7 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS needDrain = false needReboot = false err = nil + pciAddressesToReset = []string{} attributesToChange = map[string]mlx.MlxNic{} mellanoxNicsStatus = map[string]map[string]sriovnetworkv1.InterfaceExt{} mellanoxNicsSpec = map[string]sriovnetworkv1.Interface{} @@ -132,6 +134,10 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS if needReboot || changeWithoutReboot { attributesToChange[ifaceSpec.PciAddress] = *attrs } + + if needReboot { + pciAddressesToReset = append(pciAddressesToReset, ifaceSpec.PciAddress) + } } // Set total VFs to 0 for mellanox interfaces with no spec @@ -202,7 +208,10 @@ func (p *MellanoxPlugin) Apply() error { return nil } log.Log.Info("mellanox plugin Apply()") - return p.helpers.MlxConfigFW(attributesToChange) + if err := p.helpers.MlxConfigFW(attributesToChange); err != nil { + return err + } + return p.helpers.MlxResetFW(pciAddressesToReset) } // nicHasExternallyManagedPFs returns true if one of the ports(interface) of the NIC is marked as externally managed diff --git a/pkg/vendors/mellanox/mellanox.go b/pkg/vendors/mellanox/mellanox.go index c6631ed28..82410c7f8 100644 --- a/pkg/vendors/mellanox/mellanox.go +++ b/pkg/vendors/mellanox/mellanox.go @@ -6,6 +6,7 @@ import ( "strconv" "strings" + kerrors "k8s.io/apimachinery/pkg/util/errors" "sigs.k8s.io/controller-runtime/pkg/log" sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" @@ -60,6 +61,7 @@ type MellanoxInterface interface { GetMlxNicFwData(pciAddress string) (current, next *MlxNic, err error) MlxConfigFW(attributesToChange map[string]MlxNic) error + MlxResetFW(pciAddresses []string) error } type mellanoxHelper struct { @@ -141,6 +143,22 @@ func (m *mellanoxHelper) GetMellanoxBlueFieldMode(PciAddress string) (BlueFieldM return -1, fmt.Errorf("MellanoxBlueFieldMode(): unknown device status for %s", PciAddress) } +func (m *mellanoxHelper) MlxResetFW(pciAddresses []string) error { + log.Log.Info("mellanox-plugin resetFW()") + var errs []error + for _, pciAddress := range pciAddresses { + cmdArgs := []string{"-d", pciAddress, "--skip_driver", "-l", "3", "-y", "reset"} + log.Log.Info("mellanox-plugin: resetFW()", "cmd-args", cmdArgs) + // We have to ensure that pciutils is installed into the container image Dockerfile.sriov-network-config-daemon + _, stderr, err := m.utils.RunCommand("mstfwreset", cmdArgs...) + if err != nil { + log.Log.Error(err, "mellanox-plugin resetFW(): failed", "stderr", stderr) + errs = append(errs, err) + } + } + return kerrors.NewAggregate(errs) +} + func (m *mellanoxHelper) MlxConfigFW(attributesToChange map[string]MlxNic) error { log.Log.Info("mellanox-plugin configFW()") for pciAddr, fwArgs := range attributesToChange { diff --git a/pkg/vendors/mellanox/mock/mock_mellanox.go b/pkg/vendors/mellanox/mock/mock_mellanox.go index 78c1d4903..1a0ca3120 100644 --- a/pkg/vendors/mellanox/mock/mock_mellanox.go +++ b/pkg/vendors/mellanox/mock/mock_mellanox.go @@ -79,6 +79,20 @@ func (mr *MockMellanoxInterfaceMockRecorder) MlxConfigFW(attributesToChange inte return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MlxConfigFW", reflect.TypeOf((*MockMellanoxInterface)(nil).MlxConfigFW), attributesToChange) } +// MlxResetFW mocks base method. +func (m *MockMellanoxInterface) MlxResetFW(pciAddresses []string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "MlxResetFW", pciAddresses) + ret0, _ := ret[0].(error) + return ret0 +} + +// MlxResetFW indicates an expected call of MlxResetFW. +func (mr *MockMellanoxInterfaceMockRecorder) MlxResetFW(pciAddresses interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MlxResetFW", reflect.TypeOf((*MockMellanoxInterface)(nil).MlxResetFW), pciAddresses) +} + // MstConfigReadData mocks base method. func (m *MockMellanoxInterface) MstConfigReadData(arg0 string) (string, string, error) { m.ctrl.T.Helper() From 08b524e388fd42bd4fe12dc8580e3f2fd22f628d Mon Sep 17 00:00:00 2001 From: Tobias Giese Date: Tue, 23 Jul 2024 21:47:59 +0200 Subject: [PATCH 3/9] chore: add feature flag for mellanox fw reset To not enable the new feature by default we want to add a feature flag first. Signed-off-by: Tobias Giese --- cmd/sriov-network-config-daemon/start.go | 15 +++++++++++++++ pkg/consts/constants.go | 3 +++ pkg/daemon/daemon.go | 14 ++++++++++++++ pkg/daemon/daemon_test.go | 9 ++++++--- pkg/plugins/mellanox/mellanox_plugin.go | 6 +++++- pkg/vars/vars.go | 3 +++ 6 files changed, 46 insertions(+), 4 deletions(-) diff --git a/cmd/sriov-network-config-daemon/start.go b/cmd/sriov-network-config-daemon/start.go index efe35bcbf..f003e5435 100644 --- a/cmd/sriov-network-config-daemon/start.go +++ b/cmd/sriov-network-config-daemon/start.go @@ -26,6 +26,7 @@ import ( "github.com/spf13/cobra" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" @@ -41,6 +42,7 @@ import ( snclientset "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/daemon" + "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper" snolog "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/log" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms" @@ -276,6 +278,18 @@ func runStartCmd(cmd *cobra.Command, args []string) error { } go nodeWriter.Run(stopCh, refreshCh, syncCh) + // Init feature gates once to prevent race conditions. + defaultConfig := &sriovnetworkv1.SriovOperatorConfig{} + err = kClient.Get(context.Background(), types.NamespacedName{Namespace: vars.Namespace, Name: consts.DefaultConfigName}, defaultConfig) + if err != nil { + log.Log.Error(err, "Failed to get default SriovOperatorConfig object") + return err + } + featureGates := featuregate.New() + featureGates.Init(defaultConfig.Spec.FeatureGates) + vars.MlxPluginFwReset = featureGates.IsEnabled(consts.MellanoxFirmwareResetFeatureGate) + log.Log.Info("Enabled featureGates", "featureGates", featureGates.String()) + setupLog.V(0).Info("Starting SriovNetworkConfigDaemon") err = daemon.New( kClient, @@ -288,6 +302,7 @@ func runStartCmd(cmd *cobra.Command, args []string) error { syncCh, refreshCh, eventRecorder, + featureGates, startOpts.disabledPlugins, ).Run(stopCh, exitCh) if err != nil { diff --git a/pkg/consts/constants.go b/pkg/consts/constants.go index cbfe9ad98..f3c076111 100644 --- a/pkg/consts/constants.go +++ b/pkg/consts/constants.go @@ -139,6 +139,9 @@ const ( // ManageSoftwareBridgesFeatureGate: enables management of software bridges by the operator ManageSoftwareBridgesFeatureGate = "manageSoftwareBridges" + // MellanoxFirmwareResetFeatureGate: enables the firmware reset via mstfwreset before a reboot + MellanoxFirmwareResetFeatureGate = "mellanoxFirmwareReset" + // The path to the file on the host filesystem that contains the IB GUID distribution for IB VFs InfinibandGUIDConfigFilePath = SriovConfBasePath + "/infiniband/guids" ) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 0fd1ec2cb..5e327dc87 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -5,6 +5,7 @@ import ( "fmt" "math/rand" "os/exec" + "reflect" "sync" "time" @@ -23,6 +24,7 @@ import ( snclientset "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned" sninformer "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/informers/externalversions" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" + "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper" snolog "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/log" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms" @@ -82,6 +84,8 @@ type Daemon struct { workqueue workqueue.RateLimitingInterface eventRecorder *EventRecorder + + featureGate featuregate.FeatureGate } func New( @@ -95,6 +99,7 @@ func New( syncCh <-chan struct{}, refreshCh chan<- Message, er *EventRecorder, + featureGates featuregate.FeatureGate, disabledPlugins []string, ) *Daemon { return &Daemon{ @@ -113,6 +118,7 @@ func New( &workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(updateDelay), 1)}, workqueue.NewItemExponentialFailureRateLimiter(1*time.Second, maxUpdateBackoff)), "SriovNetworkNodeState"), eventRecorder: er, + featureGate: featureGates, disabledPlugins: disabledPlugins, } } @@ -286,6 +292,7 @@ func (dn *Daemon) operatorConfigAddHandler(obj interface{}) { } func (dn *Daemon) operatorConfigChangeHandler(old, new interface{}) { + oldCfg := old.(*sriovnetworkv1.SriovOperatorConfig) newCfg := new.(*sriovnetworkv1.SriovOperatorConfig) if newCfg.Namespace != vars.Namespace || newCfg.Name != consts.DefaultConfigName { log.Log.V(2).Info("unsupported SriovOperatorConfig", "namespace", newCfg.Namespace, "name", newCfg.Name) @@ -299,6 +306,13 @@ func (dn *Daemon) operatorConfigChangeHandler(old, new interface{}) { dn.disableDrain = newDisableDrain log.Log.Info("Set Disable Drain", "value", dn.disableDrain) } + + if !reflect.DeepEqual(oldCfg.Spec.FeatureGates, newCfg.Spec.FeatureGates) { + dn.featureGate.Init(newCfg.Spec.FeatureGates) + log.Log.Info("Updated featureGates", "featureGates", dn.featureGate.String()) + } + + vars.MlxPluginFwReset = dn.featureGate.IsEnabled(consts.MellanoxFirmwareResetFeatureGate) } func (dn *Daemon) nodeStateSyncHandler() error { diff --git a/pkg/daemon/daemon_test.go b/pkg/daemon/daemon_test.go index a1e29dcb2..d89e91991 100644 --- a/pkg/daemon/daemon_test.go +++ b/pkg/daemon/daemon_test.go @@ -18,18 +18,18 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log/zap" sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" - mock_platforms "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms/mock" - "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem" - snclient "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned" snclientset "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned/fake" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" + "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate" mock_helper "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper/mock" + mock_platforms "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms/mock" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms/openshift" plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins/fake" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins/generic" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars" + "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem" ) func TestConfigDaemon(t *testing.T) { @@ -151,6 +151,8 @@ var _ = Describe("Config Daemon", func() { vendorHelper.EXPECT().PrepareNMUdevRule([]string{"0x1014", "0x154c"}).Return(nil).AnyTimes() vendorHelper.EXPECT().PrepareVFRepUdevRule().Return(nil).AnyTimes() + featureGates := featuregate.New() + sut = New( kClient, snclient, @@ -162,6 +164,7 @@ var _ = Describe("Config Daemon", func() { syncCh, refreshCh, er, + featureGates, nil, ) diff --git a/pkg/plugins/mellanox/mellanox_plugin.go b/pkg/plugins/mellanox/mellanox_plugin.go index 827a226ac..10b0152bb 100644 --- a/pkg/plugins/mellanox/mellanox_plugin.go +++ b/pkg/plugins/mellanox/mellanox_plugin.go @@ -9,6 +9,7 @@ import ( "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper" plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins" + "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars" mlx "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vendors/mellanox" ) @@ -211,7 +212,10 @@ func (p *MellanoxPlugin) Apply() error { if err := p.helpers.MlxConfigFW(attributesToChange); err != nil { return err } - return p.helpers.MlxResetFW(pciAddressesToReset) + if vars.MlxPluginFwReset { + return p.helpers.MlxResetFW(pciAddressesToReset) + } + return nil } // nicHasExternallyManagedPFs returns true if one of the ports(interface) of the NIC is marked as externally managed diff --git a/pkg/vars/vars.go b/pkg/vars/vars.go index 9321261dc..fc7108ed8 100644 --- a/pkg/vars/vars.go +++ b/pkg/vars/vars.go @@ -54,6 +54,9 @@ var ( // ManageSoftwareBridges global variable which reflects state of manageSoftwareBridges feature ManageSoftwareBridges = false + // MlxPluginFwReset global variable enables mstfwreset before rebooting a node on VF changes + MlxPluginFwReset = false + // FilesystemRoot used by test to mock interactions with filesystem FilesystemRoot = "" From 7c592c52f350d6fdcb615fe45b86aa4ec5e0365b Mon Sep 17 00:00:00 2001 From: kramaranya Date: Tue, 30 Jul 2024 11:35:05 +0100 Subject: [PATCH 4/9] manifests: set required-scc for openshift workloads --- bindata/manifests/daemon/daemonset.yaml | 1 + bindata/manifests/operator-webhook/server.yaml | 2 ++ bindata/manifests/webhook/server.yaml | 2 ++ config/manager/manager.yaml | 2 ++ deploy/operator.yaml | 2 ++ deployment/sriov-network-operator-chart/templates/operator.yaml | 2 ++ 6 files changed, 11 insertions(+) diff --git a/bindata/manifests/daemon/daemonset.yaml b/bindata/manifests/daemon/daemonset.yaml index e3e5526e0..c9b151520 100644 --- a/bindata/manifests/daemon/daemonset.yaml +++ b/bindata/manifests/daemon/daemonset.yaml @@ -23,6 +23,7 @@ spec: openshift.io/component: network annotations: kubectl.kubernetes.io/default-container: sriov-network-config-daemon + openshift.io/required-scc: privileged spec: hostNetwork: true hostPID: true diff --git a/bindata/manifests/operator-webhook/server.yaml b/bindata/manifests/operator-webhook/server.yaml index 188a7f182..29e369680 100644 --- a/bindata/manifests/operator-webhook/server.yaml +++ b/bindata/manifests/operator-webhook/server.yaml @@ -22,6 +22,8 @@ spec: metadata: labels: app: operator-webhook + annotations: + openshift.io/required-scc: restricted-v2 spec: securityContext: runAsNonRoot: true diff --git a/bindata/manifests/webhook/server.yaml b/bindata/manifests/webhook/server.yaml index 659f71b7e..95e020671 100644 --- a/bindata/manifests/webhook/server.yaml +++ b/bindata/manifests/webhook/server.yaml @@ -25,6 +25,8 @@ spec: component: network type: infra openshift.io/component: network + annotations: + openshift.io/required-scc: restricted-v2 spec: securityContext: runAsNonRoot: true diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index 586fa3f17..9247ac6d1 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -21,6 +21,8 @@ spec: metadata: labels: control-plane: controller-manager + annotations: + openshift.io/required-scc: restricted-v2 spec: securityContext: runAsNonRoot: true diff --git a/deploy/operator.yaml b/deploy/operator.yaml index 4236db22e..4f1412969 100644 --- a/deploy/operator.yaml +++ b/deploy/operator.yaml @@ -15,6 +15,8 @@ spec: metadata: labels: name: sriov-network-operator + annotations: + openshift.io/required-scc: restricted-v2 spec: affinity: nodeAffinity: diff --git a/deployment/sriov-network-operator-chart/templates/operator.yaml b/deployment/sriov-network-operator-chart/templates/operator.yaml index cd4e08192..29aead96d 100644 --- a/deployment/sriov-network-operator-chart/templates/operator.yaml +++ b/deployment/sriov-network-operator-chart/templates/operator.yaml @@ -15,6 +15,8 @@ spec: maxUnavailable: 33% template: metadata: + annotations: + openshift.io/required-scc: restricted-v2 labels: name: sriov-network-operator spec: From edf9cab76294c6023b409b477a2fb851d2e1f2f3 Mon Sep 17 00:00:00 2001 From: Tobias Giese Date: Mon, 12 Aug 2024 11:44:02 +0200 Subject: [PATCH 5/9] docs: add feature gates to our readme Signed-off-by: Tobias Giese --- README.md | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 914dd42f0..9759b0574 100644 --- a/README.md +++ b/README.md @@ -327,19 +327,37 @@ spec: node-role.kubernetes.io/worker: "" ``` -### Resource Injector Policy +## Feature Gates -By default, the Resource injector webhook has a failed policy of ignored, this was implemented to not block pod creation -in case the webhook is not available. +Feature gates are used to enable or disable specific features in the operator. -with a feature introduced in Kubernetes 1.28(Beta) called [MatchConditions](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-matchconditions) -we can move the webhook failed policy to be Fail. In this case the operator configured the Mutating webhook for the resource -injector only on pods with the secondary network annotation of `k8s.v1.cni.cncf.io/networks`. -It's possible to enable the feature with a FeatureGate via the SriovOperatorConfig object +> **NOTE**: As features mature and graduate to stable status, default settings may change, and feature gates might be removed in future releases. Keep this in mind when configuring feature gates and ensure your environment is compatible with any updates. -> **NOTE**: the feature is disabled by default +### Available Feature Gates -**Example**: +1. **Parallel NIC Configuration** (`parallelNicConfig`) + - **Description:** Allows the configuration of NICs in parallel, which can potentially reduce the time required for network setup. + - **Default:** Disabled + +2. **Resource Injector Match Condition** (`resourceInjectorMatchCondition`) + - **Description:** Switches the resource injector's webhook failure policy from "Ignore" to "Fail" by utilizing the `MatchConditions` feature introduced in Kubernetes 1.28. This ensures the webhook only targets pods with the `k8s.v1.cni.cncf.io/networks` annotation, improving reliability without affecting other pods. + - **Default:** Disabled + +3. **Metrics Exporter** (`metricsExporter`) + - **Description:** Enables the metrics exporter on the same node where the config-daemon is running. This helps in collecting and exporting metrics related to SR-IOV network devices. + - **Default:** Disabled + +4. **Manage Software Bridges** (`manageSoftwareBridges`) + - **Description:** Allows the operator to manage software bridges. This feature gate is useful for environments where bridge management is required. + - **Default:** Disabled + +5. **Mellanox Firmware Reset** (`mellanoxFirmwareReset`) + - **Description:** Enables the firmware reset via `mstfwreset` before a system reboot. This feature is specific to Mellanox network devices and is used to ensure that the firmware is properly reset during system maintenance. + - **Default:** Disabled + +### Enabling Feature Gates + +To enable a feature gate, add it to your configuration file or command line with the desired state. For example, to enable the `resourceInjectorMatchCondition` feature gate, you would specify: ```yaml apiVersion: sriovnetwork.openshift.io/v1 @@ -348,7 +366,6 @@ metadata: name: default namespace: sriov-network-operator spec: - ... featureGates: resourceInjectorMatchCondition: true ... From 09d5f7537abf816be4182aa91e98b2f1a7478983 Mon Sep 17 00:00:00 2001 From: Sebastian Sch Date: Mon, 12 Aug 2024 10:50:29 +0300 Subject: [PATCH 6/9] Fix reconcile vf functional test we should not lower the MTU on a primary SDN nic. with this change we select a PF that is not the default route of the node or not the nic connected to the ovs bridge for openshift Signed-off-by: Sebastian Sch --- test/conformance/tests/test_sriov_operator.go | 336 ++++++++++++------ 1 file changed, 230 insertions(+), 106 deletions(-) diff --git a/test/conformance/tests/test_sriov_operator.go b/test/conformance/tests/test_sriov_operator.go index 63a5df259..c1db065b2 100644 --- a/test/conformance/tests/test_sriov_operator.go +++ b/test/conformance/tests/test_sriov_operator.go @@ -1003,112 +1003,6 @@ var _ = Describe("[sriov] operator", func() { By("Checking that second pod is able to use released VF") waitForPodRunning(runningPodB) }) - - It("should reconcile managed VF if status is changed", func() { - originalMtu := sriovDevice.Mtu - lowerMtu := originalMtu - 500 - - By("manually decreasing the MTU") - _, errOutput, err := runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("echo %d > /sys/bus/pci/devices/%s/net/%s/mtu", lowerMtu, sriovDevice.PciAddress, sriovDevice.Name)) - Expect(err).ToNot(HaveOccurred()) - Expect(errOutput).To(Equal("")) - - By("waiting for the mtu to be updated in the status") - Eventually(func() sriovv1.InterfaceExts { - nodeState, err := clients.SriovNetworkNodeStates(operatorNamespace).Get(context.Background(), node, metav1.GetOptions{}) - Expect(err).ToNot(HaveOccurred()) - return nodeState.Status.Interfaces - }, 3*time.Minute, 1*time.Second).Should(ContainElement(MatchFields( - IgnoreExtras, - Fields{ - "Name": Equal(sriovDevice.Name), - "Mtu": Equal(lowerMtu), - "PciAddress": Equal(sriovDevice.PciAddress), - "NumVfs": Equal(sriovDevice.NumVfs), - }))) - - By("waiting for the mtu to be restored") - Eventually(func() sriovv1.InterfaceExts { - nodeState, err := clients.SriovNetworkNodeStates(operatorNamespace).Get(context.Background(), node, metav1.GetOptions{}) - Expect(err).ToNot(HaveOccurred()) - return nodeState.Status.Interfaces - }, 3*time.Minute, 1*time.Second).Should(ContainElement(MatchFields( - IgnoreExtras, - Fields{ - "Name": Equal(sriovDevice.Name), - "Mtu": Equal(originalMtu), - "PciAddress": Equal(sriovDevice.PciAddress), - "NumVfs": Equal(sriovDevice.NumVfs), - }))) - - higherMtu := originalMtu + 500 - - By("manually increasing the MTU") - _, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("echo %d > /sys/bus/pci/devices/%s/net/%s/mtu", higherMtu, sriovDevice.PciAddress, sriovDevice.Name)) - Expect(err).ToNot(HaveOccurred()) - Expect(errOutput).To(Equal("")) - - By("waiting for the mtu to be updated in the status") - Eventually(func() sriovv1.InterfaceExts { - nodeState, err := clients.SriovNetworkNodeStates(operatorNamespace).Get(context.Background(), node, metav1.GetOptions{}) - Expect(err).ToNot(HaveOccurred()) - return nodeState.Status.Interfaces - }, 3*time.Minute, 1*time.Second).Should(ContainElement(MatchFields( - IgnoreExtras, - Fields{ - "Name": Equal(sriovDevice.Name), - "Mtu": Equal(higherMtu), - "PciAddress": Equal(sriovDevice.PciAddress), - "NumVfs": Equal(sriovDevice.NumVfs), - }))) - - By("expecting the mtu to consistently stay at the new higher level") - Consistently(func() sriovv1.InterfaceExts { - nodeState, err := clients.SriovNetworkNodeStates(operatorNamespace).Get(context.Background(), node, metav1.GetOptions{}) - Expect(err).ToNot(HaveOccurred()) - return nodeState.Status.Interfaces - }, 3*time.Minute, 15*time.Second).Should(ContainElement(MatchFields( - IgnoreExtras, - Fields{ - "Name": Equal(sriovDevice.Name), - "Mtu": Equal(higherMtu), - "PciAddress": Equal(sriovDevice.PciAddress), - "NumVfs": Equal(sriovDevice.NumVfs), - }))) - - By("manually returning the MTU to the original level") - _, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("echo %d > /sys/bus/pci/devices/%s/net/%s/mtu", originalMtu, sriovDevice.PciAddress, sriovDevice.Name)) - Expect(err).ToNot(HaveOccurred()) - Expect(errOutput).To(Equal("")) - - By("waiting for the mtu to be updated in the status") - Eventually(func() sriovv1.InterfaceExts { - nodeState, err := clients.SriovNetworkNodeStates(operatorNamespace).Get(context.Background(), node, metav1.GetOptions{}) - Expect(err).ToNot(HaveOccurred()) - return nodeState.Status.Interfaces - }, 3*time.Minute, 1*time.Second).Should(ContainElement(MatchFields( - IgnoreExtras, - Fields{ - "Name": Equal(sriovDevice.Name), - "Mtu": Equal(originalMtu), - "PciAddress": Equal(sriovDevice.PciAddress), - "NumVfs": Equal(sriovDevice.NumVfs), - }))) - - By("expecting the mtu to consistently stay at the original level") - Consistently(func() sriovv1.InterfaceExts { - nodeState, err := clients.SriovNetworkNodeStates(operatorNamespace).Get(context.Background(), node, metav1.GetOptions{}) - Expect(err).ToNot(HaveOccurred()) - return nodeState.Status.Interfaces - }, 3*time.Minute, 15*time.Second).Should(ContainElement(MatchFields( - IgnoreExtras, - Fields{ - "Name": Equal(sriovDevice.Name), - "Mtu": Equal(originalMtu), - "PciAddress": Equal(sriovDevice.PciAddress), - "NumVfs": Equal(sriovDevice.NumVfs), - }))) - }) }) Context("CNI Logging level", func() { @@ -1604,6 +1498,219 @@ var _ = Describe("[sriov] operator", func() { Expect(output).To(ContainSubstring("0")) }) }) + + Context("Daemon reconcile vf", func() { + var node string + resourceName := "mturesource" + var numVfs int + var intf *sriovv1.InterfaceExt + var err error + + BeforeEach(func() { + if discovery.Enabled() { + node, resourceName, numVfs, intf, err = discovery.DiscoveredResources(clients, + sriovInfos, operatorNamespace, defaultFilterPolicy, + func(node string, sriovDeviceList []*sriovv1.InterfaceExt) (*sriovv1.InterfaceExt, bool) { + if len(sriovDeviceList) == 0 { + return nil, false + } + nodeState, err := clients.SriovNetworkNodeStates(operatorNamespace).Get(context.Background(), node, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + + for _, ifc := range nodeState.Spec.Interfaces { + if ifc.NumVfs > 0 { + for _, device := range sriovDeviceList { + if device.Name == ifc.Name { + return device, true + } + } + } + } + return nil, false + }, + ) + Expect(err).ToNot(HaveOccurred()) + if node == "" || resourceName == "" || numVfs < 5 || intf == nil { + Skip("Insufficient resources to run test in discovery mode") + } + } else { + node = sriovInfos.Nodes[0] + sriovDeviceList, err := sriovInfos.FindSriovDevices(node) + Expect(err).ToNot(HaveOccurred()) + unusedSriovDevices, err := findUnusedSriovDevices(node, sriovDeviceList) + if err != nil { + Skip(err.Error()) + } + intf = unusedSriovDevices[0] + By("Using device " + intf.Name + " on node " + node) + + mtuPolicy := &sriovv1.SriovNetworkNodePolicy{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-mtupolicy", + Namespace: operatorNamespace, + }, + + Spec: sriovv1.SriovNetworkNodePolicySpec{ + NodeSelector: map[string]string{ + "kubernetes.io/hostname": node, + }, + Mtu: 1500, + NumVfs: 5, + ResourceName: resourceName, + Priority: 99, + NicSelector: sriovv1.SriovNetworkNicSelector{ + PfNames: []string{intf.Name}, + }, + DeviceType: "netdevice", + }, + } + + err = clients.Create(context.Background(), mtuPolicy) + Expect(err).ToNot(HaveOccurred()) + + WaitForSRIOVStable() + By("waiting for the resources to be available") + Eventually(func() int64 { + testedNode, err := clients.CoreV1Interface.Nodes().Get(context.Background(), node, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + resNum := testedNode.Status.Allocatable[corev1.ResourceName("openshift.io/"+resourceName)] + allocatable, _ := resNum.AsInt64() + return allocatable + }, 10*time.Minute, time.Second).Should(Equal(int64(5))) + } + + sriovNetwork := &sriovv1.SriovNetwork{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-mtuvolnetwork", + Namespace: operatorNamespace, + }, + Spec: sriovv1.SriovNetworkSpec{ + ResourceName: resourceName, + IPAM: `{"type":"host-local","subnet":"10.10.10.0/24","rangeStart":"10.10.10.171","rangeEnd":"10.10.10.181","routes":[{"dst":"0.0.0.0/0"}],"gateway":"10.10.10.1"}`, + NetworkNamespace: namespaces.Test, + }} + + // We need this to be able to run the connectivity checks on Mellanox cards + if intf.DeviceID == "1015" { + sriovNetwork.Spec.SpoofChk = off + } + + err = clients.Create(context.Background(), sriovNetwork) + + Expect(err).ToNot(HaveOccurred()) + waitForNetAttachDef("test-mtuvolnetwork", namespaces.Test) + + // update the interface + intf = getInterfaceFromNodeStateByPciAddress(node, intf.PciAddress) + }) + + It("should reconcile managed VF if status is changed", func() { + originalMtu := intf.Mtu + lowerMtu := originalMtu - 500 + + By("manually decreasing the MTU") + _, errOutput, err := runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("echo %d > /sys/bus/pci/devices/%s/net/%s/mtu", lowerMtu, intf.PciAddress, intf.Name)) + Expect(err).ToNot(HaveOccurred()) + Expect(errOutput).To(Equal("")) + + By("waiting for the mtu to be updated in the status") + Eventually(func() sriovv1.InterfaceExts { + nodeState, err := clients.SriovNetworkNodeStates(operatorNamespace).Get(context.Background(), node, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + return nodeState.Status.Interfaces + }, 3*time.Minute, 1*time.Second).Should(ContainElement(MatchFields( + IgnoreExtras, + Fields{ + "Name": Equal(intf.Name), + "Mtu": Equal(lowerMtu), + "PciAddress": Equal(intf.PciAddress), + "NumVfs": Equal(intf.NumVfs), + }))) + + By("waiting for the mtu to be restored") + Eventually(func() sriovv1.InterfaceExts { + nodeState, err := clients.SriovNetworkNodeStates(operatorNamespace).Get(context.Background(), node, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + return nodeState.Status.Interfaces + }, 3*time.Minute, 1*time.Second).Should(ContainElement(MatchFields( + IgnoreExtras, + Fields{ + "Name": Equal(intf.Name), + "Mtu": Equal(originalMtu), + "PciAddress": Equal(intf.PciAddress), + "NumVfs": Equal(intf.NumVfs), + }))) + + higherMtu := originalMtu + 500 + + By("manually increasing the MTU") + _, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("echo %d > /sys/bus/pci/devices/%s/net/%s/mtu", higherMtu, intf.PciAddress, intf.Name)) + Expect(err).ToNot(HaveOccurred()) + Expect(errOutput).To(Equal("")) + + By("waiting for the mtu to be updated in the status") + Eventually(func() sriovv1.InterfaceExts { + nodeState, err := clients.SriovNetworkNodeStates(operatorNamespace).Get(context.Background(), node, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + return nodeState.Status.Interfaces + }, 3*time.Minute, 1*time.Second).Should(ContainElement(MatchFields( + IgnoreExtras, + Fields{ + "Name": Equal(intf.Name), + "Mtu": Equal(higherMtu), + "PciAddress": Equal(intf.PciAddress), + "NumVfs": Equal(intf.NumVfs), + }))) + + By("expecting the mtu to consistently stay at the new higher level") + Consistently(func() sriovv1.InterfaceExts { + nodeState, err := clients.SriovNetworkNodeStates(operatorNamespace).Get(context.Background(), node, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + return nodeState.Status.Interfaces + }, 3*time.Minute, 15*time.Second).Should(ContainElement(MatchFields( + IgnoreExtras, + Fields{ + "Name": Equal(intf.Name), + "Mtu": Equal(higherMtu), + "PciAddress": Equal(intf.PciAddress), + "NumVfs": Equal(intf.NumVfs), + }))) + + By("manually returning the MTU to the original level") + _, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("echo %d > /sys/bus/pci/devices/%s/net/%s/mtu", originalMtu, intf.PciAddress, intf.Name)) + Expect(err).ToNot(HaveOccurred()) + Expect(errOutput).To(Equal("")) + + By("waiting for the mtu to be updated in the status") + Eventually(func() sriovv1.InterfaceExts { + nodeState, err := clients.SriovNetworkNodeStates(operatorNamespace).Get(context.Background(), node, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + return nodeState.Status.Interfaces + }, 3*time.Minute, 1*time.Second).Should(ContainElement(MatchFields( + IgnoreExtras, + Fields{ + "Name": Equal(intf.Name), + "Mtu": Equal(originalMtu), + "PciAddress": Equal(intf.PciAddress), + "NumVfs": Equal(intf.NumVfs), + }))) + + By("expecting the mtu to consistently stay at the original level") + Consistently(func() sriovv1.InterfaceExts { + nodeState, err := clients.SriovNetworkNodeStates(operatorNamespace).Get(context.Background(), node, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + return nodeState.Status.Interfaces + }, 3*time.Minute, 15*time.Second).Should(ContainElement(MatchFields( + IgnoreExtras, + Fields{ + "Name": Equal(intf.Name), + "Mtu": Equal(originalMtu), + "PciAddress": Equal(intf.PciAddress), + "NumVfs": Equal(intf.NumVfs), + }))) + }) + }) + }) }) @@ -2256,3 +2363,20 @@ func assertNodeStateHasVFMatching(nodeName string, fields Fields) { ), ) } + +func getInterfaceFromNodeStateByPciAddress(node, pciAddress string) *sriovv1.InterfaceExt { + intf := &sriovv1.InterfaceExt{} + nodeState := &sriovv1.SriovNetworkNodeState{} + err := clients.Get(context.Background(), runtimeclient.ObjectKey{Name: node, Namespace: operatorNamespace}, nodeState) + Expect(err).ToNot(HaveOccurred()) + found := false + for _, state := range nodeState.Status.Interfaces { + if state.PciAddress == pciAddress { + intf = state.DeepCopy() + found = true + break + } + } + Expect(found).To(BeTrue()) + return intf +} From 77c9410f3d8e113b4c463bb459222f911eb22749 Mon Sep 17 00:00:00 2001 From: Sebastian Sch Date: Mon, 29 Jul 2024 17:23:19 +0300 Subject: [PATCH 7/9] Fix virtual cluster redeploy script create a common helm deploy script to use Signed-off-by: Sebastian Sch --- hack/deploy-operator-helm.sh | 31 ++++++++++++++++ hack/run-e2e-conformance-virtual-cluster.sh | 39 +++++---------------- hack/virtual-cluster-redeploy.sh | 17 ++++++--- 3 files changed, 52 insertions(+), 35 deletions(-) create mode 100755 hack/deploy-operator-helm.sh diff --git a/hack/deploy-operator-helm.sh b/hack/deploy-operator-helm.sh new file mode 100755 index 000000000..73b5b4733 --- /dev/null +++ b/hack/deploy-operator-helm.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +set -xeo pipefail + +here="$(dirname "$(readlink --canonicalize "${BASH_SOURCE[0]}")")" +root="$(readlink --canonicalize "$here/..")" + +export ADMISSION_CONTROLLERS_ENABLED=true +export ADMISSION_CONTROLLERS_CERTIFICATES_CERT_MANAGER_ENABLED=true +export NAMESPACE="sriov-network-operator" +export OPERATOR_NAMESPACE="sriov-network-operator" + +source hack/env.sh + +HELM_MODE=${HELM_MODE:-install} + +HELM_VALUES_OPTS="\ + --set images.operator=${SRIOV_NETWORK_OPERATOR_IMAGE} \ + --set images.sriovConfigDaemon=${SRIOV_NETWORK_CONFIG_DAEMON_IMAGE} \ + --set images.sriovCni=${SRIOV_CNI_IMAGE} \ + --set images.sriovDevicePlugin=${SRIOV_DEVICE_PLUGIN_IMAGE} \ + --set images.resourcesInjector=${NETWORK_RESOURCES_INJECTOR_IMAGE} \ + --set images.webhook=${SRIOV_NETWORK_WEBHOOK_IMAGE} \ + --set operator.admissionControllers.enabled=${ADMISSION_CONTROLLERS_ENABLED} \ + --set operator.admissionControllers.certificates.certManager.enabled=${ADMISSION_CONTROLLERS_CERTIFICATES_CERT_MANAGER_ENABLED} \ + --set sriovOperatorConfig.deploy=true" + +PATH=$PATH:${root}/bin +make helm +helm ${HELM_MODE} -n ${NAMESPACE} --create-namespace \ + $HELM_VALUES_OPTS \ + --wait sriov-network-operator ./deployment/sriov-network-operator-chart diff --git a/hack/run-e2e-conformance-virtual-cluster.sh b/hack/run-e2e-conformance-virtual-cluster.sh index 8dbf51593..1a75a280d 100755 --- a/hack/run-e2e-conformance-virtual-cluster.sh +++ b/hack/run-e2e-conformance-virtual-cluster.sh @@ -15,6 +15,13 @@ root="$(readlink --canonicalize "$here/..")" NUM_OF_WORKERS=${NUM_OF_WORKERS:-2} total_number_of_nodes=$((1 + NUM_OF_WORKERS)) +## Global configuration +export NAMESPACE="sriov-network-operator" +export OPERATOR_NAMESPACE="sriov-network-operator" +export SKIP_VAR_SET="" +export OPERATOR_EXEC=kubectl +export CLUSTER_HAS_EMULATED_PF=TRUE + if [ "$NUM_OF_WORKERS" -lt 2 ]; then echo "Min number of workers is 2" exit 1 @@ -364,36 +371,8 @@ do ATTEMPTS=$((ATTEMPTS+1)) done - -source hack/env.sh - -export ADMISSION_CONTROLLERS_ENABLED=true -export ADMISSION_CONTROLLERS_CERTIFICATES_CERT_MANAGER_ENABLED=true -export SKIP_VAR_SET="" -export NAMESPACE="sriov-network-operator" -export OPERATOR_NAMESPACE="sriov-network-operator" -export CNI_BIN_PATH=/opt/cni/bin -export OPERATOR_EXEC=kubectl -export CLUSTER_HAS_EMULATED_PF=TRUE - - -HELM_VALUES_OPTS="\ - --set images.operator=${SRIOV_NETWORK_OPERATOR_IMAGE} \ - --set images.sriovConfigDaemon=${SRIOV_NETWORK_CONFIG_DAEMON_IMAGE} \ - --set images.sriovCni=${SRIOV_CNI_IMAGE} \ - --set images.sriovDevicePlugin=${SRIOV_DEVICE_PLUGIN_IMAGE} \ - --set images.resourcesInjector=${NETWORK_RESOURCES_INJECTOR_IMAGE} \ - --set images.webhook=${SRIOV_NETWORK_WEBHOOK_IMAGE} \ - --set operator.admissionControllers.enabled=${ADMISSION_CONTROLLERS_ENABLED} \ - --set operator.admissionControllers.certificates.certManager.enabled=${ADMISSION_CONTROLLERS_CERTIFICATES_CERT_MANAGER_ENABLED} \ - --set sriovOperatorConfig.deploy=true" - -PATH=$PATH:${root}/bin -make helm -helm install -n ${NAMESPACE} --create-namespace \ - $HELM_VALUES_OPTS \ - --wait sriov-network-operator ./deployment/sriov-network-operator-chart - +# Deploy the sriov operator via helm +hack/deploy-operator-helm.sh echo "## create certificates for webhook" cat < registry-login.conf internal_registry="image-registry.openshift-image-registry.svc:5000" - pass=$( jq .\"$internal_registry\".password registry-login.conf ) + pass=$( jq .\"image-registry.openshift-image-registry.svc:5000\".auth registry-login.conf ) + pass=`echo ${pass:1:-1} | base64 -d` + + # dockercfg password is in the form `:password`. We need to trim the `:` prefix + pass=${pass#":"} registry="default-route-openshift-image-registry.apps.${cluster_name}.${domain_name}" - podman login -u serviceaccount -p ${pass:1:-1} $registry --tls-verify=false + podman login -u serviceaccount -p ${pass} $registry --tls-verify=false export SRIOV_NETWORK_OPERATOR_IMAGE="$registry/$NAMESPACE/sriov-network-operator:latest" export SRIOV_NETWORK_CONFIG_DAEMON_IMAGE="$registry/$NAMESPACE/sriov-network-config-daemon:latest" @@ -44,6 +48,7 @@ else fi export ADMISSION_CONTROLLERS_ENABLED=true +export OPERATOR_LEADER_ELECTION_ENABLE=true export SKIP_VAR_SET="" export OPERATOR_NAMESPACE=$NAMESPACE export OPERATOR_EXEC=kubectl @@ -67,9 +72,11 @@ if [ $CLUSTER_TYPE == "openshift" ]; then export SRIOV_NETWORK_OPERATOR_IMAGE="image-registry.openshift-image-registry.svc:5000/$NAMESPACE/sriov-network-operator:latest" export SRIOV_NETWORK_CONFIG_DAEMON_IMAGE="image-registry.openshift-image-registry.svc:5000/$NAMESPACE/sriov-network-config-daemon:latest" export SRIOV_NETWORK_WEBHOOK_IMAGE="image-registry.openshift-image-registry.svc:5000/$NAMESPACE/sriov-network-operator-webhook:latest" + echo "## deploying SRIOV Network Operator" + hack/deploy-setup.sh $NAMESPACE +else + export HELM_MODE=upgrade + hack/deploy-operator-helm.sh fi -echo "## deploying SRIOV Network Operator" -hack/deploy-setup.sh $NAMESPACE - kubectl -n ${NAMESPACE} delete po --all From 5a950b6c4cd8aaf27817db5be4767e956671b82f Mon Sep 17 00:00:00 2001 From: Andrea Panattoni Date: Wed, 22 May 2024 16:09:15 +0200 Subject: [PATCH 8/9] e2e: Verify metrics-exporter expose netdevice metrics Exposed metrics can be verified by scraping the prometheus endpoint on the `sriov-network-metrics-exporter` pod. Add a test that spawns an SR-IOV consuming pod and verifies its receiving counter increase when the interface is pinged from outside. Signed-off-by: Andrea Panattoni --- go.mod | 4 +- .../tests/test_exporter_metrics.go | 187 ++++++++++++++++++ test/util/namespaces/namespaces.go | 25 +++ 3 files changed, 214 insertions(+), 2 deletions(-) create mode 100644 test/conformance/tests/test_exporter_metrics.go diff --git a/go.mod b/go.mod index fe9005b1b..0353c7ec1 100644 --- a/go.mod +++ b/go.mod @@ -30,6 +30,8 @@ require ( github.com/pkg/errors v0.9.1 github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.68.0 github.com/prometheus-operator/prometheus-operator/pkg/client v0.68.0 + github.com/prometheus/client_model v0.5.0 + github.com/prometheus/common v0.45.0 github.com/safchain/ethtool v0.3.0 github.com/spf13/cobra v1.7.0 github.com/stretchr/testify v1.8.4 @@ -125,8 +127,6 @@ require ( github.com/peterbourgon/diskv v2.0.1+incompatible // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/prometheus/client_golang v1.17.0 // indirect - github.com/prometheus/client_model v0.5.0 // indirect - github.com/prometheus/common v0.45.0 // indirect github.com/prometheus/procfs v0.12.0 // indirect github.com/robfig/cron v1.2.0 // indirect github.com/rogpeppe/go-internal v1.10.0 // indirect diff --git a/test/conformance/tests/test_exporter_metrics.go b/test/conformance/tests/test_exporter_metrics.go new file mode 100644 index 000000000..e81f63067 --- /dev/null +++ b/test/conformance/tests/test_exporter_metrics.go @@ -0,0 +1,187 @@ +package tests + +import ( + "context" + "fmt" + "strings" + + "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/cluster" + "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/discovery" + "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/namespaces" + "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/network" + "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/pod" + + dto "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("[sriov] Metrics Exporter", Ordered, func() { + + BeforeAll(func() { + if cluster.VirtualCluster() { + Skip("IGB driver does not support VF statistics") + } + + err := namespaces.Create(namespaces.Test, clients) + Expect(err).ToNot(HaveOccurred()) + + err = namespaces.Clean(operatorNamespace, namespaces.Test, clients, discovery.Enabled()) + Expect(err).ToNot(HaveOccurred()) + + featureFlagInitialValue := isFeatureFlagEnabled("metricsExporter") + DeferCleanup(func() { + By("Restoring initial feature flag value") + setFeatureFlag("metricsExporter", featureFlagInitialValue) + }) + + By("Enabling `metricsExporter` feature flag") + setFeatureFlag("metricsExporter", true) + + By("Adding monitoring label to " + operatorNamespace) + err = namespaces.AddLabel(clients, context.Background(), operatorNamespace, "openshift.io/cluster-monitoring", "true") + Expect(err).ToNot(HaveOccurred()) + + WaitForSRIOVStable() + }) + + It("collects metrics regarding receiving traffic via VF", func() { + sriovInfos, err := cluster.DiscoverSriov(clients, operatorNamespace) + Expect(err).ToNot(HaveOccurred()) + + node, nic, err := sriovInfos.FindOneSriovNodeAndDevice() + Expect(err).ToNot(HaveOccurred()) + By("Using device " + nic.Name + " on node " + node) + + _, err = network.CreateSriovPolicy(clients, "test-me-policy-", operatorNamespace, nic.Name, node, 2, "metricsResource", "netdevice") + Expect(err).ToNot(HaveOccurred()) + + err = network.CreateSriovNetwork(clients, nic, "test-me-network", namespaces.Test, operatorNamespace, "metricsResource", ipamIpv4) + Expect(err).ToNot(HaveOccurred()) + waitForNetAttachDef("test-me-network", namespaces.Test) + + pod := createTestPod(node, []string{"test-me-network"}) + + ips, err := network.GetSriovNicIPs(pod, "net1") + Expect(err).ToNot(HaveOccurred()) + Expect(ips).NotTo(BeNil(), "No sriov network interface found.") + Expect(len(ips)).Should(Equal(1)) + + initialMetrics := getMetricsForNode(node) + initialRxBytes := getCounterForPod(initialMetrics, pod, "sriov_vf_rx_bytes") + initialRxPackets := getCounterForPod(initialMetrics, pod, "sriov_vf_rx_packets") + + for _, ip := range ips { + pingPod(ip, node, "test-me-network") + } + + finalMetrics := getMetricsForNode(node) + finalRxBytes := getCounterForPod(finalMetrics, pod, "sriov_vf_rx_bytes") + finalRxPackets := getCounterForPod(finalMetrics, pod, "sriov_vf_rx_packets") + + Expect(finalRxBytes).Should(BeNumerically(">", initialRxBytes)) + Expect(finalRxPackets).Should(BeNumerically(">", initialRxPackets)) + }) + +}) + +func getMetricsForNode(nodeName string) map[string]*dto.MetricFamily { + metricsExporterPods, err := clients.Pods(operatorNamespace).List(context.Background(), metav1.ListOptions{ + LabelSelector: "app=sriov-network-metrics-exporter", + FieldSelector: "spec.nodeName=" + nodeName, + }) + ExpectWithOffset(1, err).ToNot(HaveOccurred()) + ExpectWithOffset(1, metricsExporterPods.Items).ToNot(HaveLen(0), "At least one operator pod expected") + + metricsExporterPod := metricsExporterPods.Items[0] + + command := []string{"curl", "http://127.0.0.1:9110/metrics"} + stdout, stderr, err := pod.ExecCommand(clients, &metricsExporterPod, command...) + Expect(err).ToNot(HaveOccurred(), + "pod: [%s/%s] command: [%v]\nstdout: %s\nstderr: %s", metricsExporterPod.Namespace, metricsExporterPod.Name, command, stdout, stderr) + + // Clean the scraped output from carriage returns + stdout = strings.ReplaceAll(stdout, "\r", "") + + var parser expfmt.TextParser + mf, err := parser.TextToMetricFamilies(strings.NewReader(stdout)) + Expect(err).ToNot(HaveOccurred()) + + return mf +} + +func getCounterForPod(mf map[string]*dto.MetricFamily, p *corev1.Pod, metricName string) float64 { + pciAddress := findPciAddressForPod(mf, p) + return findCounterForPciAddr(mf, pciAddress, metricName) +} + +func findPciAddressForPod(mf map[string]*dto.MetricFamily, p *corev1.Pod) string { + kubePodDeviceMetric := findKubePodDeviceMetric(mf, p) + for _, labelPair := range kubePodDeviceMetric.Label { + if labelPair.GetName() == "pciAddr" { + return *labelPair.Value + } + } + + Fail(fmt.Sprintf("Can't find PCI Address for pod [%s/%s] in metrics %+v", p.Name, p.Namespace, mf)) + return "" +} + +func findKubePodDeviceMetric(mf map[string]*dto.MetricFamily, pod *corev1.Pod) *dto.Metric { + metricFamily, ok := mf["sriov_kubepoddevice"] + Expect(ok).To(BeTrue(), "sriov_kubepoddevice metric not found: %+v", mf) + + kubePodDeviceMetric := findMetricForPod(metricFamily.Metric, pod) + Expect(kubePodDeviceMetric).ToNot(BeNil(), "sriov_kubepoddevice metric for pod [%s/%s] not found: %+v", pod.Name, pod.Namespace, mf) + + return kubePodDeviceMetric +} + +func findCounterForPciAddr(mf map[string]*dto.MetricFamily, pciAddress string, metricName string) float64 { + metricFamily, ok := mf[metricName] + Expect(ok).To(BeTrue(), "metric %s not found: %+v", metricName, mf) + + metric := findMetricFor(metricFamily.Metric, map[string]string{ + "pciAddr": pciAddress, + }) + Expect(metric).ToNot(BeNil(), "metric %s for pciAddr %s not found: %+v", metricName, pciAddress, mf) + + return *metric.GetCounter().Value +} + +func findMetricForPod(metrics []*dto.Metric, pod *corev1.Pod) *dto.Metric { + return findMetricFor(metrics, map[string]string{ + "pod": pod.Name, + "namespace": pod.Namespace, + }) +} + +func findMetricFor(metrics []*dto.Metric, labelsToMatch map[string]string) *dto.Metric { + for _, metric := range metrics { + if areLabelsMatching(metric.Label, labelsToMatch) { + return metric + } + } + + return nil +} + +func areLabelsMatching(labels []*dto.LabelPair, labelsToMatch map[string]string) bool { + for _, labelPair := range labels { + valueToMatch, ok := labelsToMatch[labelPair.GetName()] + if !ok { + continue + } + + if *labelPair.Value != valueToMatch { + return false + } + } + + return true +} diff --git a/test/util/namespaces/namespaces.go b/test/util/namespaces/namespaces.go index 9e865b7d1..5ed106398 100644 --- a/test/util/namespaces/namespaces.go +++ b/test/util/namespaces/namespaces.go @@ -10,6 +10,7 @@ import ( k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" "k8s.io/utils/pointer" runtimeclient "sigs.k8s.io/controller-runtime/pkg/client" @@ -165,3 +166,27 @@ func Clean(operatorNamespace, namespace string, cs *testclient.ClientSet, discov } return nil } + +func AddLabel(cs corev1client.NamespacesGetter, ctx context.Context, namespaceName, key, value string) error { + ns, err := cs.Namespaces().Get(context.Background(), namespaceName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get namespace [%s]: %v", namespaceName, err) + } + + if ns.Labels == nil { + ns.Labels = make(map[string]string) + } + + if ns.Labels[key] == value { + return nil + } + + ns.Labels[key] = value + + _, err = cs.Namespaces().Update(ctx, ns, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to update namespace [%s] with label [%s: %s]: %v", namespaceName, key, value, err) + } + + return nil +} From 1183e4f40949ef7ad05007e4e6abe424a1c9ed1d Mon Sep 17 00:00:00 2001 From: Andrea Panattoni Date: Wed, 21 Aug 2024 11:24:21 +0200 Subject: [PATCH 9/9] bug(shutdown): Fix panic when webhooks are disabled When webooks are disabled, shutdown procedure produces the following panic error: ``` 2024-08-13T12:45:04.971685297Z INFO shutdown utils/shutdown.go:22 Done clearing finalizers on exit 2024-08-13T12:45:04.971713179Z INFO shutdown utils/shutdown.go:23 Seting webhook failure policies to Ignore on exit 2024-08-13T12:45:04.978386488Z ERROR shutdown utils/shutdown.go:64 Error getting webhook {"error": "validatingwebhookconfigurations.admissionregistration.k8s.io \"sriov-operator-webhook-config\" not found"} panic: runtime error: index out of range [0] with length 0 goroutine 1 [running]: github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils.updateValidatingWebhook(0x37d7788?) /go/src/github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils/shutdown.go:75 +0x198 github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils.updateWebhooks() /go/src/github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils/shutdown.go:64 +0xa5 github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils.Shutdown() /go/src/github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils/shutdown.go:23 +0x14 main.main() /go/src/github.com/k8snetworkplumbingwg/sriov-network-operator/main.go:296 +0x1e6a ``` Fix the panic error and add an end2end test case to cover it. Signed-off-by: Andrea Panattoni --- pkg/utils/shutdown.go | 1 + test/conformance/tests/test_sriov_operator.go | 22 ++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pkg/utils/shutdown.go b/pkg/utils/shutdown.go index 208e2073c..f8f9618d4 100644 --- a/pkg/utils/shutdown.go +++ b/pkg/utils/shutdown.go @@ -71,6 +71,7 @@ func updateValidatingWebhook(c *kubernetes.Clientset) { webhook, err := validatingWebhookClient.Get(context.TODO(), consts.OperatorWebHookName, metav1.GetOptions{}) if err != nil { shutdownLog.Error(err, "Error getting webhook") + return } webhook.Webhooks[0].FailurePolicy = &failurePolicyIgnore _, err = validatingWebhookClient.Update(context.TODO(), webhook, metav1.UpdateOptions{}) diff --git a/test/conformance/tests/test_sriov_operator.go b/test/conformance/tests/test_sriov_operator.go index c1db065b2..b665c99f0 100644 --- a/test/conformance/tests/test_sriov_operator.go +++ b/test/conformance/tests/test_sriov_operator.go @@ -241,7 +241,10 @@ var _ = Describe("[sriov] operator", func() { }) }) - It("should gracefully restart quickly", func() { + DescribeTable("should gracefully restart quickly", func(webookEnabled bool) { + DeferCleanup(setSriovOperatorSpecFlag(operatorNetworkInjectorFlag, webookEnabled)) + DeferCleanup(setSriovOperatorSpecFlag(operatorWebhookFlag, webookEnabled)) + // This test case ensure leader election process runs smoothly when the operator's pod is restarted. oldLease, err := clients.CoordinationV1Interface.Leases(operatorNamespace).Get(context.Background(), consts.LeaderElectionID, metav1.GetOptions{}) if k8serrors.IsNotFound(err) { @@ -268,7 +271,10 @@ var _ = Describe("[sriov] operator", func() { g.Expect(newLease.Spec.HolderIdentity).ToNot(Equal(oldLease.Spec.HolderIdentity)) }, 30*time.Second, 5*time.Second).Should(Succeed()) - }) + }, + Entry("webhooks enabled", true), + Entry("webhooks disabled", true), + ) Context("SriovNetworkMetricsExporter", func() { BeforeEach(func() { @@ -2136,19 +2142,24 @@ func defaultFilterPolicy(policy sriovv1.SriovNetworkNodePolicy) bool { return policy.Spec.DeviceType == "netdevice" } -func setSriovOperatorSpecFlag(flagName string, flagValue bool) { +func setSriovOperatorSpecFlag(flagName string, flagValue bool) func() { cfg := sriovv1.SriovOperatorConfig{} err := clients.Get(context.TODO(), runtimeclient.ObjectKey{ Name: "default", Namespace: operatorNamespace, }, &cfg) + ret := func() {} + Expect(err).ToNot(HaveOccurred()) if flagName == operatorNetworkInjectorFlag && cfg.Spec.EnableInjector != flagValue { cfg.Spec.EnableInjector = flagValue err = clients.Update(context.TODO(), &cfg) Expect(err).ToNot(HaveOccurred()) Expect(cfg.Spec.EnableInjector).To(Equal(flagValue)) + ret = func() { + setSriovOperatorSpecFlag(flagName, !flagValue) + } } if flagName == operatorWebhookFlag && cfg.Spec.EnableOperatorWebhook != flagValue { @@ -2156,6 +2167,9 @@ func setSriovOperatorSpecFlag(flagName string, flagValue bool) { clients.Update(context.TODO(), &cfg) Expect(err).ToNot(HaveOccurred()) Expect(cfg.Spec.EnableOperatorWebhook).To(Equal(flagValue)) + ret = func() { + setSriovOperatorSpecFlag(flagName, !flagValue) + } } if flagValue { @@ -2171,6 +2185,8 @@ func setSriovOperatorSpecFlag(flagName string, flagValue bool) { } }, 1*time.Minute, 10*time.Second).WithOffset(1).Should(Succeed()) } + + return ret } func setOperatorConfigLogLevel(level int) {