Skip to content

Commit

Permalink
Merge pull request #530 from mlguerrero12/removelaststate
Browse files Browse the repository at this point in the history
Verify status changes on managed interfaces
  • Loading branch information
SchSeba committed Apr 24, 2024
2 parents c5d4773 + 0d1a11a commit 6d367c6
Show file tree
Hide file tree
Showing 11 changed files with 445 additions and 126 deletions.
167 changes: 98 additions & 69 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -313,77 +313,60 @@ func (dn *Daemon) nodeStateSyncHandler() error {
latest := dn.desiredNodeState.GetGeneration()
log.Log.V(0).Info("nodeStateSyncHandler(): new generation", "generation", latest)

if dn.currentNodeState.GetGeneration() == latest && !dn.isDrainCompleted() {
if vars.UsingSystemdMode {
serviceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovServicePath)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config service exist on host")
return err
}
postNetworkServiceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovPostNetworkServicePath)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config-post-network service exist on host")
return err
}

// if the service doesn't exist we should continue to let the k8s plugin to create the service files
// this is only for k8s base environments, for openshift the sriov-operator creates a machine config to will apply
// the system service and reboot the node the config-daemon doesn't need to do anything.
if !(serviceEnabled && postNetworkServiceEnabled) {
sriovResult = &systemd.SriovResult{SyncStatus: consts.SyncStatusFailed,
LastSyncError: fmt.Sprintf("some sriov systemd services are not available on node: "+
"sriov-config available:%t, sriov-config-post-network available:%t", serviceEnabled, postNetworkServiceEnabled)}
} else {
sriovResult, err = systemd.ReadSriovResult()
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to load sriov result file from host")
return err
}
}
if sriovResult.LastSyncError != "" || sriovResult.SyncStatus == consts.SyncStatusFailed {
log.Log.Info("nodeStateSyncHandler(): sync failed systemd service error", "last-sync-error", sriovResult.LastSyncError)

// add the error but don't requeue
dn.refreshCh <- Message{
syncStatus: consts.SyncStatusFailed,
lastSyncError: sriovResult.LastSyncError,
}
<-dn.syncCh
return nil
}
}
log.Log.V(0).Info("nodeStateSyncHandler(): Interface not changed")
if dn.desiredNodeState.Status.LastSyncError != "" ||
dn.desiredNodeState.Status.SyncStatus != consts.SyncStatusSucceeded {
dn.refreshCh <- Message{
syncStatus: consts.SyncStatusSucceeded,
lastSyncError: "",
}
// wait for writer to refresh the status
<-dn.syncCh
// load plugins if it has not loaded
if len(dn.loadedPlugins) == 0 {
dn.loadedPlugins, err = loadPlugins(dn.desiredNodeState, dn.HostHelpers, dn.disabledPlugins)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to enable vendor plugins")
return err
}

return nil
}

if dn.desiredNodeState.GetGeneration() == 1 && len(dn.desiredNodeState.Spec.Interfaces) == 0 {
err = dn.HostHelpers.ClearPCIAddressFolder()
if vars.UsingSystemdMode && dn.currentNodeState.GetGeneration() == latest && !dn.isDrainCompleted() {
serviceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovServicePath)
if err != nil {
log.Log.Error(err, "failed to clear the PCI address configuration")
log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config service exist on host")
return err
}
postNetworkServiceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovPostNetworkServicePath)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config-post-network service exist on host")
return err
}

log.Log.V(0).Info(
"nodeStateSyncHandler(): interface policy spec not yet set by controller for sriovNetworkNodeState",
"name", dn.desiredNodeState.Name)
if dn.desiredNodeState.Status.SyncStatus != "Succeeded" {
// if the service doesn't exist we should continue to let the k8s plugin to create the service files
// this is only for k8s base environments, for openshift the sriov-operator creates a machine config to will apply
// the system service and reboot the node the config-daemon doesn't need to do anything.
if !(serviceEnabled && postNetworkServiceEnabled) {
sriovResult = &systemd.SriovResult{SyncStatus: consts.SyncStatusFailed,
LastSyncError: fmt.Sprintf("some sriov systemd services are not available on node: "+
"sriov-config available:%t, sriov-config-post-network available:%t", serviceEnabled, postNetworkServiceEnabled)}
} else {
sriovResult, err = systemd.ReadSriovResult()
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to load sriov result file from host")
return err
}
}
if sriovResult.LastSyncError != "" || sriovResult.SyncStatus == consts.SyncStatusFailed {
log.Log.Info("nodeStateSyncHandler(): sync failed systemd service error", "last-sync-error", sriovResult.LastSyncError)

// add the error but don't requeue
dn.refreshCh <- Message{
syncStatus: "Succeeded",
lastSyncError: "",
syncStatus: consts.SyncStatusFailed,
lastSyncError: sriovResult.LastSyncError,
}
// wait for writer to refresh status
<-dn.syncCh
return nil
}
}

skip, err := dn.shouldSkipReconciliation(dn.desiredNodeState)
if err != nil {
return err
}
// Reconcile only when there are changes in the spec or status of managed VFs.
if skip {
return nil
}

Expand All @@ -404,15 +387,6 @@ func (dn *Daemon) nodeStateSyncHandler() error {
}
dn.desiredNodeState.Status = updatedState.Status

// load plugins if it has not loaded
if len(dn.loadedPlugins) == 0 {
dn.loadedPlugins, err = loadPlugins(dn.desiredNodeState, dn.HostHelpers, dn.disabledPlugins)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to enable vendor plugins")
return err
}
}

reqReboot := false
reqDrain := false

Expand Down Expand Up @@ -564,6 +538,61 @@ func (dn *Daemon) nodeStateSyncHandler() error {
return nil
}

func (dn *Daemon) shouldSkipReconciliation(latestState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
var err error

// Skip when SriovNetworkNodeState object has just been created.
if latestState.GetGeneration() == 1 && len(latestState.Spec.Interfaces) == 0 {
err = dn.HostHelpers.ClearPCIAddressFolder()
if err != nil {
log.Log.Error(err, "failed to clear the PCI address configuration")
return false, err
}

log.Log.V(0).Info(
"shouldSkipReconciliation(): interface policy spec not yet set by controller for sriovNetworkNodeState",
"name", latestState.Name)
if latestState.Status.SyncStatus != consts.SyncStatusSucceeded {
dn.refreshCh <- Message{
syncStatus: consts.SyncStatusSucceeded,
lastSyncError: "",
}
// wait for writer to refresh status
<-dn.syncCh
}
return true, nil
}

// Verify changes in the spec of the SriovNetworkNodeState CR.
if dn.currentNodeState.GetGeneration() == latestState.GetGeneration() && !dn.isDrainCompleted() {
for _, p := range dn.loadedPlugins {
// Verify changes in the status of the SriovNetworkNodeState CR.
changed, err := p.CheckStatusChanges(latestState)
if err != nil {
return false, err
}
if changed {
return false, nil
}
}

log.Log.V(0).Info("shouldSkipReconciliation(): Interface not changed")
if latestState.Status.LastSyncError != "" ||
latestState.Status.SyncStatus != consts.SyncStatusSucceeded {
dn.refreshCh <- Message{
syncStatus: consts.SyncStatusSucceeded,
lastSyncError: "",
}
// wait for writer to refresh the status
<-dn.syncCh
}

return true, nil
}

return false, nil
}

// handleDrain: adds the right annotation to the node and nodeState object
// returns true if we need to finish the reconcile loop and wait for a new object
func (dn *Daemon) handleDrain(reqReboot bool) (bool, error) {
Expand Down
4 changes: 4 additions & 0 deletions pkg/plugins/fake/fake_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ func (f *FakePlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState
return false, false, nil
}

func (f *FakePlugin) CheckStatusChanges(new *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
return false, nil
}

func (f *FakePlugin) Apply() error {
return nil
}
128 changes: 84 additions & 44 deletions pkg/plugins/generic/generic_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"bytes"
"errors"
"os/exec"
"reflect"
"strconv"
"strings"
"syscall"
Expand Down Expand Up @@ -52,7 +51,6 @@ type GenericPlugin struct {
PluginName string
SpecVersion string
DesireState *sriovnetworkv1.SriovNetworkNodeState
LastState *sriovnetworkv1.SriovNetworkNodeState
DriverStateMap DriverStateMapType
DesiredKernelArgs map[string]bool
helpers helper.HostHelpersInterface
Expand Down Expand Up @@ -141,6 +139,37 @@ func (p *GenericPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeSt
return
}

// CheckStatusChanges verify whether SriovNetworkNodeState CR status present changes on configured VFs.
func (p *GenericPlugin) CheckStatusChanges(current *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
log.Log.Info("generic-plugin CheckStatusChanges()")

for _, iface := range current.Spec.Interfaces {
found := false
for _, ifaceStatus := range current.Status.Interfaces {
// TODO: remove the check for ExternallyManaged - https://github.com/k8snetworkplumbingwg/sriov-network-operator/issues/632
if iface.PciAddress == ifaceStatus.PciAddress && !iface.ExternallyManaged {
found = true
if sriovnetworkv1.NeedToUpdateSriov(&iface, &ifaceStatus) {
log.Log.Info("CheckStatusChanges(): status changed for interface", "address", iface.PciAddress)
return true, nil
}
break
}
}
if !found {
log.Log.Info("CheckStatusChanges(): no status found for interface", "address", iface.PciAddress)
}
}

missingKernelArgs, err := p.getMissingKernelArgs()
if err != nil {
log.Log.Error(err, "generic-plugin CheckStatusChanges(): failed to verify missing kernel arguments")
return false, err
}

return len(missingKernelArgs) != 0, nil
}

func (p *GenericPlugin) syncDriverState() error {
for _, driverState := range p.DriverStateMap {
if !driverState.DriverLoaded && driverState.NeedDriverFunc(p.DesireState, driverState) {
Expand All @@ -159,14 +188,6 @@ func (p *GenericPlugin) syncDriverState() error {
func (p *GenericPlugin) Apply() error {
log.Log.Info("generic plugin Apply()", "desiredState", p.DesireState.Spec)

if p.LastState != nil {
log.Log.Info("generic plugin Apply()", "lastState", p.LastState.Spec)
if reflect.DeepEqual(p.LastState.Spec.Interfaces, p.DesireState.Spec.Interfaces) {
log.Log.Info("generic plugin Apply(): desired and latest state are the same, nothing to apply")
return nil
}
}

if err := p.syncDriverState(); err != nil {
return err
}
Expand All @@ -188,8 +209,7 @@ func (p *GenericPlugin) Apply() error {
}
return err
}
p.LastState = &sriovnetworkv1.SriovNetworkNodeState{}
*p.LastState = *p.DesireState

return nil
}

Expand Down Expand Up @@ -252,38 +272,49 @@ func (p *GenericPlugin) addToDesiredKernelArgs(karg string) {
}
}

// syncDesiredKernelArgs Should be called to set all the kernel arguments. Returns bool if node update is needed.
func (p *GenericPlugin) syncDesiredKernelArgs() (bool, error) {
needReboot := false
// getMissingKernelArgs gets Kernel arguments that have not been set.
func (p *GenericPlugin) getMissingKernelArgs() ([]string, error) {
missingArgs := make([]string, 0, len(p.DesiredKernelArgs))
if len(p.DesiredKernelArgs) == 0 {
return false, nil
return nil, nil
}

kargs, err := p.helpers.GetCurrentKernelArgs()
if err != nil {
return false, err
return nil, err
}
for desiredKarg, attempted := range p.DesiredKernelArgs {
set := p.helpers.IsKernelArgsSet(kargs, desiredKarg)
if !set {
if attempted {
log.Log.V(2).Info("generic plugin syncDesiredKernelArgs(): previously attempted to set kernel arg",
"karg", desiredKarg)
}
// There is a case when we try to set the kernel argument here, the daemon could decide to not reboot because
// the daemon encountered a potentially one-time error. However we always want to make sure that the kernel
// argument is set once the daemon goes through node state sync again.
update, err := setKernelArg(desiredKarg)
if err != nil {
log.Log.Error(err, "generic plugin syncDesiredKernelArgs(): fail to set kernel arg", "karg", desiredKarg)
return false, err
}
if update {
needReboot = true
log.Log.V(2).Info("generic plugin syncDesiredKernelArgs(): need reboot for setting kernel arg", "karg", desiredKarg)
}
p.DesiredKernelArgs[desiredKarg] = true

for desiredKarg := range p.DesiredKernelArgs {
if !p.helpers.IsKernelArgsSet(kargs, desiredKarg) {
missingArgs = append(missingArgs, desiredKarg)
}
}
return missingArgs, nil
}

// syncDesiredKernelArgs should be called to set all the kernel arguments. Returns bool if node update is needed.
func (p *GenericPlugin) syncDesiredKernelArgs(kargs []string) (bool, error) {
needReboot := false

for _, karg := range kargs {
if p.DesiredKernelArgs[karg] {
log.Log.V(2).Info("generic-plugin syncDesiredKernelArgs(): previously attempted to set kernel arg",
"karg", karg)
}
// There is a case when we try to set the kernel argument here, the daemon could decide to not reboot because
// the daemon encountered a potentially one-time error. However we always want to make sure that the kernel
// argument is set once the daemon goes through node state sync again.
update, err := setKernelArg(karg)
if err != nil {
log.Log.Error(err, "generic-plugin syncDesiredKernelArgs(): fail to set kernel arg", "karg", karg)
return false, err
}
if update {
needReboot = true
log.Log.V(2).Info("generic-plugin syncDesiredKernelArgs(): need reboot for setting kernel arg", "karg", karg)
}
p.DesiredKernelArgs[karg] = true
}
return needReboot, nil
}

Expand Down Expand Up @@ -351,19 +382,28 @@ func (p *GenericPlugin) addVfioDesiredKernelArg(state *sriovnetworkv1.SriovNetwo
}
}

func (p *GenericPlugin) needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState) (needReboot bool, err error) {
needReboot = false
func (p *GenericPlugin) needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
needReboot := false

p.addVfioDesiredKernelArg(state)

updateNode, err := p.syncDesiredKernelArgs()
missingKernelArgs, err := p.getMissingKernelArgs()
if err != nil {
log.Log.Error(err, "generic plugin needRebootNode(): failed to set the desired kernel arguments")
log.Log.Error(err, "generic-plugin needRebootNode(): failed to verify missing kernel arguments")
return false, err
}
if updateNode {
log.Log.V(2).Info("generic plugin needRebootNode(): need reboot for updating kernel arguments")
needReboot = true

if len(missingKernelArgs) != 0 {
needReboot, err = p.syncDesiredKernelArgs(missingKernelArgs)
if err != nil {
log.Log.Error(err, "generic-plugin needRebootNode(): failed to set the desired kernel arguments")
return false, err
}
if needReboot {
log.Log.V(2).Info("generic-plugin needRebootNode(): need reboot for updating kernel arguments")
}
}

return needReboot, nil
}

Expand Down
Loading

0 comments on commit 6d367c6

Please sign in to comment.