Skip to content

Commit

Permalink
fix(hatchery/vsphere): check boottime (#7110)
Browse files Browse the repository at this point in the history
  • Loading branch information
yesnault authored Sep 16, 2024
1 parent 91e1167 commit 27d89eb
Show file tree
Hide file tree
Showing 8 changed files with 225 additions and 129 deletions.
107 changes: 72 additions & 35 deletions engine/hatchery/vsphere/hatchery.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@ import (
"github.com/gorilla/mux"
"github.com/rockbears/log"
"github.com/vmware/govmomi/vim25/mo"
"github.com/vmware/govmomi/vim25/types"

"github.com/ovh/cds/engine/api"
"github.com/ovh/cds/engine/service"
"github.com/ovh/cds/sdk"
"github.com/ovh/cds/sdk/cdsclient"
"github.com/ovh/cds/sdk/hatchery"
cdslog "github.com/ovh/cds/sdk/log"
"github.com/ovh/cds/sdk/namesgenerator"
"github.com/ovh/cds/sdk/telemetry"
)
Expand Down Expand Up @@ -195,7 +195,7 @@ func (h *HatcheryVSphere) CanSpawn(ctx context.Context, model sdk.WorkerStarterW
continue
}
if annot.JobID == jobID {
log.Info(ctx, "can't span worker for job %d because there is a registering worker %q for the same job", jobID, vm.Name)
log.Info(ctx, "can't span worker for job %s because there is a registering worker %q for the same job", jobID, vm.Name)
return false
}
}
Expand Down Expand Up @@ -318,8 +318,8 @@ func (h *HatcheryVSphere) killDisabledWorkers(ctx context.Context) {
for _, w := range workerPoolDisabled {
for _, s := range srvs {
if s.Name == w.Name {
log.Info(ctx, " killDisabledWorkers %v", s.Name)
_ = h.deleteServer(ctx, s)
log.Info(ctx, " killDisabledWorkers markToDelete %v", s.Name)
h.markToDelete(ctx, s.Name)
break
}
}
Expand All @@ -345,6 +345,8 @@ func (h *HatcheryVSphere) killAwolServers(ctx context.Context) {
srvs := h.getVirtualMachines(ctx)

for _, s := range srvs {
ctx = context.WithValue(ctx, cdslog.AuthWorkerName, s.Name)

var annot = getVirtualMachineCDSAnnotation(ctx, s)
if annot == nil {
continue
Expand All @@ -353,52 +355,87 @@ func (h *HatcheryVSphere) killAwolServers(ctx context.Context) {
continue
}

if sdk.IsInArray(s.Name, h.cacheProvisioning.restarting) {
// if VM is marked to be deleted by the spawn goroutine (could be provision- or real worker), then delete it now.
if h.isMarkedToDelete(s) {
log.Info(ctx, "deleting machine %q as it's already marked to be deleted", s.Name)
if err := h.deleteServer(ctx, s); err != nil {
ctx = sdk.ContextWithStacktrace(ctx, err)
log.Error(ctx, "killAwolServers> cannot delete server (markedToDelete) %s", s.Name)
}
continue
}

// skipping vm starting with provision-
if strings.HasPrefix(s.Name, "provision-") {
continue
}

var isMarkToDelete = h.isMarkedToDelete(s)
var isPoweredOff = s.Summary.Runtime.PowerState != types.VirtualMachinePowerStatePoweredOn
if annot.Model {
continue
}

var bootTime = annot.Created
if s.Runtime.BootTime != nil {
bootTime = *s.Runtime.BootTime
// reload virtual machine to have fresh data from vsphere
vm, err := h.vSphereClient.LoadVirtualMachine(ctx, s.Name)
if err != nil {
ctx = sdk.ContextWithStacktrace(ctx, err)
log.Error(ctx, "unable to load vm: %v", err)
return
}
if !isPoweredOff && !isMarkToDelete {
// If the worker is not registered on CDS API the TTL is WorkerRegistrationTTL (default 10 minutes)
var expire = bootTime.Add(time.Duration(h.Config.WorkerRegistrationTTL) * time.Minute)
// Else it's WorkerTTL (default 120 minutes)
for _, w := range allWorkers {
if w.Name() == s.Name {
expire = bootTime.Add(time.Duration(h.Config.WorkerTTL) * time.Minute)
break
}

// getting events for this vm, we have to check if we have a types.VmStartingEvent
eventVmStartingEvent, err := h.vSphereClient.LoadVirtualMachineEvents(ctx, vm, "VmStartingEvent")
if err != nil {
ctx = sdk.ContextWithStacktrace(ctx, err)
log.Error(ctx, "unable to load VmStartingEvent events: %v", err)
return
}

// if we don't have a types.VmStartingEvent, we skip this vm
if len(eventVmStartingEvent) == 0 {
log.Debug(ctx, "killAwolServers> no VmStartingEvent found - we keep this vm")
continue
}

var vmStartedTime = eventVmStartingEvent[0].GetEvent().CreatedTime

// If the worker is not registered on CDS API the TTL is WorkerRegistrationTTL (default 10 minutes)
// The registration duration is the time between the createTime of the VM and the start time of the worker, from the CDS API point of view
var expire = vmStartedTime.Add(time.Duration(h.Config.WorkerRegistrationTTL) * time.Minute)

// Else it's WorkerTTL (default 120 minutes)
for _, w := range allWorkers {
// if the worker is known by the CDS API, the worker TTL is used:
// if the CDS session is set to 24h, and the worker TTL to 2h,
// then the worker will be removed even if it is working on a job
// it should be the same duration as the CDS session.
if w.Name() == s.Name {
expire = vmStartedTime.Add(time.Duration(h.Config.WorkerTTL) * time.Minute)
break
}
}

log.Debug(ctx, "checking if %v is outdated. Created on :%v. Expires on %v", s.Name, bootTime, expire)
// If the VM is older than the WorkerTTL config, let's mark it for deletion
log.Debug(ctx, "checking if %v is outdated. vmStartedTime: %v. expires:%v", s.Name, vmStartedTime, expire)
// If the VM is older than the WorkerTTL config, let's mark it for deletion
if time.Now().After(expire) {
log.Info(ctx, "deleting machine %q - expired. vmStartedTime:%s expire:%s", s.Name, vmStartedTime, expire)

if time.Now().After(expire) {
vm, err := h.vSphereClient.LoadVirtualMachine(ctx, s.Name)
// debug with event if necessary
if log.Factory().GetLevel() == log.LevelDebug {
events, err := h.vSphereClient.LoadVirtualMachineEvents(ctx, vm, "")
if err != nil {
ctx = sdk.ContextWithStacktrace(ctx, err)
log.Error(ctx, "unable to load vm %s: %v", s.Name, err)
continue
log.Error(ctx, "event machine %q - can't load LoadVirtualMachineEvents", s.Name, err)
}
for _, e := range events {
log.Debug(ctx, "event machine %q - event: %T details:%+v", s.Name, e, e)
}
log.Info(ctx, "virtual machine %q as been created on %q, it has to be deleted - expire %q", s.Name, bootTime, expire)
h.markToDelete(ctx, vm)
}
}

// If the VM is marked for deletion, or is OFF and is not a model or a register-only VM, let's delete it
// We also exclude unused provisioned VMs from deletion
isNotUsedProvisionned := annot.Provisioning && annot.WorkerName == s.Name
if isMarkToDelete || (isPoweredOff && (!annot.Model || annot.RegisterOnly) && !isNotUsedProvisionned) {
log.Info(ctx, "deleting machine %q as been created on annot.Created:%q runtime.BootTime:%q, it has to be deleted - powerState:%s isMarkToDelete:%t isPoweredOff:%t annot.Model:%t annot.RegisterOnly:%t", s.Name, annot.Created, s.Runtime.BootTime, s.Summary.Runtime.PowerState, isMarkToDelete, isPoweredOff, annot.Model, annot.RegisterOnly)
if err := h.deleteServer(ctx, s); err != nil {
ctx = sdk.ContextWithStacktrace(ctx, err)
log.Error(ctx, "killAwolServers> cannot delete server %s", s.Name)
log.Error(ctx, "killAwolServers> cannot delete server (expire) %s", s.Name)
}
} else {
log.Debug(ctx, "We keep %v as not expired", s.Name)
}
}
}
Expand Down
99 changes: 63 additions & 36 deletions engine/hatchery/vsphere/hatchery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,29 +222,9 @@ func TestHatcheryVSphere_killDisabledWorkers(t *testing.T) {
},
).AnyTimes()

var vm = object.VirtualMachine{
Common: object.Common{},
}

c.EXPECT().LoadVirtualMachine(gomock.Any(), "worker1").DoAndReturn(
func(ctx context.Context, name string) (*object.VirtualMachine, error) {
return &vm, nil
},
)

c.EXPECT().ShutdownVirtualMachine(gomock.Any(), &vm).DoAndReturn(
func(ctx context.Context, vm *object.VirtualMachine) error {
return nil
},
)

c.EXPECT().DestroyVirtualMachine(gomock.Any(), &vm).DoAndReturn(
func(ctx context.Context, vm *object.VirtualMachine) error {
return nil
},
)

h.killDisabledWorkers(context.Background())

assert.Equal(t, 1, len(h.cacheToDelete.list))
}

func TestHatcheryVSphere_killAwolServers(t *testing.T) {
Expand All @@ -265,6 +245,8 @@ func TestHatcheryVSphere_killAwolServers(t *testing.T) {
},
},
}
h.Config.WorkerTTL = 5
h.Config.WorkerRegistrationTTL = 5

cdsclient.EXPECT().WorkerList(gomock.Any()).DoAndReturn(
func(ctx context.Context) ([]sdk.Worker, error) {
Expand All @@ -279,6 +261,45 @@ func TestHatcheryVSphere_killAwolServers(t *testing.T) {
},
)

c.EXPECT().GetVirtualMachinePowerState(gomock.Any(), gomock.Any()).DoAndReturn(
func(ctx context.Context, vm *object.VirtualMachine) (types.VirtualMachinePowerState, error) {
switch vm.Name() {
case "worker0":
return types.VirtualMachinePowerStatePoweredOn, nil
default:
return types.VirtualMachinePowerStatePoweredOff, nil
}
},
).AnyTimes()

c.EXPECT().LoadVirtualMachineEvents(gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn(
func(ctx context.Context, vm *object.VirtualMachine, eventTypes ...string) ([]types.BaseEvent, error) {
t.Logf("LoadVirtualMachineEvents for %s", vm.Name())
switch vm.Name() {
case "worker0":
return []types.BaseEvent{
&types.VmStartingEvent{
VmEvent: types.VmEvent{
Event: types.Event{
CreatedTime: time.Now(),
},
},
},
}, nil
default:
return []types.BaseEvent{
&types.VmStartingEvent{
VmEvent: types.VmEvent{
Event: types.Event{
CreatedTime: time.Now().Add(-10 * time.Minute),
},
},
},
}, nil
}
},
).Times(5)

c.EXPECT().ListVirtualMachines(gomock.Any()).DoAndReturn(
func(ctx context.Context) ([]mo.VirtualMachine, error) {
return []mo.VirtualMachine{
Expand Down Expand Up @@ -339,31 +360,37 @@ func TestHatcheryVSphere_killAwolServers(t *testing.T) {
},
}, nil
},
).Times(2)
).Times(1)

var vm0 = object.VirtualMachine{
Common: object.Common{
InventoryPath: "worker0",
},
}
var vm1 = object.VirtualMachine{Common: object.Common{}}
var vm3 = object.VirtualMachine{Common: object.Common{}}
var vm1 = object.VirtualMachine{Common: object.Common{InventoryPath: "worker1"}}
var vm3 = object.VirtualMachine{Common: object.Common{InventoryPath: "worker3"}}

c.EXPECT().LoadVirtualMachine(gomock.Any(), gomock.Any()).DoAndReturn(
func(ctx context.Context, vmname string) (*object.VirtualMachine, error) {
t.Logf("calling LoadVirtualMachine: %s", vmname)
switch vmname {
case "worker0":
return &vm0, nil
case "worker1":
return &vm1, nil
case "worker3":
return &vm3, nil
}
return nil, fmt.Errorf("not expected: %s", vmname)
},
).Times(5)

c.EXPECT().LoadVirtualMachine(gomock.Any(), "worker0").DoAndReturn(
func(ctx context.Context, name string) (*object.VirtualMachine, error) { return &vm0, nil },
)
c.EXPECT().LoadVirtualMachine(gomock.Any(), "worker1").DoAndReturn(
func(ctx context.Context, name string) (*object.VirtualMachine, error) { return &vm1, nil },
)
c.EXPECT().ShutdownVirtualMachine(gomock.Any(), &vm1).DoAndReturn(
func(ctx context.Context, vm *object.VirtualMachine) error { return nil },
)
c.EXPECT().DestroyVirtualMachine(gomock.Any(), &vm1).DoAndReturn(
func(ctx context.Context, vm *object.VirtualMachine) error { return nil },
)
c.EXPECT().LoadVirtualMachine(gomock.Any(), "worker3").DoAndReturn(
func(ctx context.Context, name string) (*object.VirtualMachine, error) { return &vm3, nil },
)
c.EXPECT().ShutdownVirtualMachine(gomock.Any(), &vm3).DoAndReturn(
func(ctx context.Context, vm *object.VirtualMachine) error { return nil },
)
Expand Down Expand Up @@ -649,8 +676,8 @@ func TestHatcheryVSphere_provisioning_start_one(t *testing.T) {

var workerVM object.VirtualMachine

c.EXPECT().NewVirtualMachine(gomock.Any(), gomock.Any(), &workerRef).DoAndReturn(
func(ctx context.Context, cloneSpec *types.VirtualMachineCloneSpec, ref *types.ManagedObjectReference) (*object.VirtualMachine, error) {
c.EXPECT().NewVirtualMachine(gomock.Any(), gomock.Any(), &workerRef, gomock.Any()).DoAndReturn(
func(ctx context.Context, cloneSpec *types.VirtualMachineCloneSpec, ref *types.ManagedObjectReference, vmName string) (*object.VirtualMachine, error) {
assert.False(t, cloneSpec.Template)
assert.True(t, cloneSpec.PowerOn)
var givenAnnotation annotation
Expand Down
28 changes: 24 additions & 4 deletions engine/hatchery/vsphere/mock_vsphere/client_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 27d89eb

Please sign in to comment.