Skip to content

Commit

Permalink
fix(operator/inventory): reset node inventory on watcher reconnect (#228)
Browse files: browse the repository at this point in the history

Signed-off-by: Artur Troian <[email protected]>
  • Loading branch information
troian authored Apr 10, 2024
1 parent 015638e commit 49d5caa
Showing 1 changed file with 13 additions and 18 deletions.
31 changes: 13 additions & 18 deletions operator/inventory/node-discovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,6 @@ func (dp *nodeDiscovery) monitor() error {
gpusIDs := make(RegistryGPUVendors)
currLabels := make(map[string]string)
currPods := make(map[string]corev1.Pod)
currPodsInitCount := 0

select {
case <-dp.ctx.Done():
Expand Down Expand Up @@ -429,18 +428,13 @@ func (dp *nodeDiscovery) monitor() error {
pods, terr := kc.CoreV1().Pods(corev1.NamespaceAll).List(dp.ctx, metav1.ListOptions{
FieldSelector: fields.OneTermEqualSelector("spec.nodeName", dp.name).String(),
})

if terr != nil {
return terr
}

for name := range currPods {
pod := currPods[name]

subPodAllocatedResources(&node, &pod)
nodeResetAllocated(&node)

delete(currPods, name)
}
currPods = make(map[string]corev1.Pod)

for name := range pods.Items {
pod := pods.Items[name].DeepCopy()
Expand All @@ -449,8 +443,6 @@ func (dp *nodeDiscovery) monitor() error {
currPods[pod.Name] = *pod
}

currPodsInitCount = len(currPods)

return nil
}

Expand Down Expand Up @@ -535,9 +527,9 @@ func (dp *nodeDiscovery) monitor() error {
if _, exists := currPods[obj.Name]; !exists {
currPods[obj.Name] = *obj.DeepCopy()
addPodAllocatedResources(&node, obj)
} else {
currPodsInitCount--
}

signalState()
case watch.Deleted:
pod, exists := currPods[obj.Name]
if !exists {
Expand All @@ -547,14 +539,8 @@ func (dp *nodeDiscovery) monitor() error {

subPodAllocatedResources(&node, &pod)

if currPodsInitCount > 0 {
currPodsInitCount--
}

delete(currPods, obj.Name)
}

if currPodsInitCount == 0 {
signalState()
}
case <-statech:
Expand Down Expand Up @@ -660,6 +646,15 @@ func (dp *nodeDiscovery) initNodeInfo(gpusIds RegistryGPUVendors) (v1.Node, erro
return res, nil
}

// nodeResetAllocated zeroes every "allocated" counter tracked on node: CPU,
// GPU, memory, ephemeral storage, and attached/mounted volumes.
//
// Each field is assigned its own newly created zero quantity, so no two
// counters share a pointer. GPU is reset with a plain quantity; every other
// counter is reset with a milli-scaled quantity. All use the DecimalSI format.
func nodeResetAllocated(node *v1.Node) {
	// zeroMilli returns a fresh zero-valued milli quantity on every call.
	zeroMilli := func() *resource.Quantity {
		return resource.NewMilliQuantity(0, resource.DecimalSI)
	}

	// Compute-type resources.
	node.Resources.CPU.Quantity.Allocated = zeroMilli()
	node.Resources.GPU.Quantity.Allocated = resource.NewQuantity(0, resource.DecimalSI)
	node.Resources.Memory.Quantity.Allocated = zeroMilli()

	// Storage-type resources.
	node.Resources.EphemeralStorage.Allocated = zeroMilli()
	node.Resources.VolumesAttached.Allocated = zeroMilli()
	node.Resources.VolumesMounted.Allocated = zeroMilli()
}

func addPodAllocatedResources(node *v1.Node, pod *corev1.Pod) {
for _, container := range pod.Spec.Containers {
for name, quantity := range container.Resources.Requests {
Expand Down

0 comments on commit 49d5caa

Please sign in to comment.