Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add concurrent VM boot checkup #38

Merged
merged 3 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ Cluster admin should create the following cluster-reader permissions for dedicat
|spec.timeout|How much time before the checkup will try to close itself|False|Default is 10m|
|spec.param.storageClass|Optional storage class to be used instead of the default one|False||
|spec.param.vmiTimeout|Optional timeout for VMI operations|False|Default is 3m|
|spec.param.numOfVMs|Optional number of concurrent VMs to boot|False|Default is 10|

### Example

Expand Down Expand Up @@ -63,6 +64,8 @@ kubectl get configmap storage-checkup-config -n <target-namespace> -o yaml
|status.failureReason|Failure reason in case of a failure||
|status.startTimestamp|Checkup start timestamp|RFC 3339|
|status.completionTimestamp|Checkup completion timestamp|RFC 3339|
|status.result.cnvVersion|OpenShift Virtualization version||
|status.result.ocpVersion|OpenShift Container Platform cluster version||
|status.result.defaultStorageClass|Indicates whether there is a default storage class||
|status.result.pvcBound|PVC of 10Mi created and bound by the provisioner||
|status.result.storageProfilesWithEmptyClaimPropertySets|StorageProfiles with empty claimPropertySets (unknown provisioners)||
Expand All @@ -78,3 +81,4 @@ kubectl get configmap storage-checkup-config -n <target-namespace> -o yaml
|status.result.vmVolumeClone|VM volume clone type used (efficient or host-assisted) and fallback reason||
|status.result.vmLiveMigration|VM live-migration||
|status.result.vmHotplugVolume|VM volume hotplug and unplug||
|status.result.concurrentVMBoot|Concurrent VM boot from a golden image||
127 changes: 98 additions & 29 deletions pkg/internal/checkup/checkup.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"fmt"
"log"
"strings"
"sync"
"time"

corev1 "k8s.io/api/core/v1"
Expand All @@ -36,7 +37,6 @@ import (
"k8s.io/apimachinery/pkg/util/rand"
"k8s.io/apimachinery/pkg/util/wait"

vmispec "github.com/kiagnose/kubevirt-storage-checkup/pkg/internal/checkup/vmi"
"github.com/kiagnose/kubevirt-storage-checkup/pkg/internal/config"
"github.com/kiagnose/kubevirt-storage-checkup/pkg/internal/status"

Expand Down Expand Up @@ -92,6 +92,8 @@ const (
ErrVMsWithUnsetEfsStorageClass = "there are VMs using an EFS storageclass where the gid and uid are not set in the storageclass"
ErrGoldenImagesNotUpToDate = "there are golden images whose DataImportCron is not up to date or DataSource is not ready"
ErrGoldenImageNoDataSource = "dataSource has no PVC or Snapshot source"
ErrBootFailedOnSomeVMs = "some of the VMs failed to complete boot on time"
MessageBootCompletedOnAllVMs = "Boot completed on all VMs on time"
MessageSkipNoGoldenImage = "Skip check - no golden image PVC or Snapshot"
MessageSkipNoVMI = "Skip check - no VMI"
MessageSkipSingleNode = "Skip check - single node"
Expand Down Expand Up @@ -181,7 +183,7 @@ func (c *Checkup) Run(ctx context.Context) error {
return err
}

if err := c.checkVMICreation(ctx, &errStr); err != nil {
if err := c.checkVMIBoot(ctx, &errStr); err != nil {
return err
}
if err := c.checkVMILiveMigration(ctx, &errStr); err != nil {
Expand All @@ -191,6 +193,10 @@ func (c *Checkup) Run(ctx context.Context) error {
return err
}

if err := c.checkConcurrentVMIBoot(ctx, &errStr); err != nil {
return err
}

if errStr != "" {
return errors.New(errStr)
}
Expand Down Expand Up @@ -774,37 +780,26 @@ func (c *Checkup) Results() status.Results {
return c.results
}

func (c *Checkup) checkVMICreation(ctx context.Context, errStr *string) error {
const randomStringLen = 5

log.Print("checkVMICreation")
func (c *Checkup) checkVMIBoot(ctx context.Context, errStr *string) error {
log.Print("checkVMIBoot")
if c.goldenImagePvc == nil && c.goldenImageSnap == nil {
log.Print(MessageSkipNoGoldenImage)
c.results.VMBootFromGoldenImage = MessageSkipNoGoldenImage
return nil
}

vmName := fmt.Sprintf("%s-%s", VMIUnderTestNamePrefix, rand.String(randomStringLen))
c.vmUnderTest = newVMUnderTest(vmName, c.goldenImagePvc, c.goldenImageSnap, c.checkupConfig)
vmName := uniqueVMName()
c.vmUnderTest = newVMUnderTest(vmName, c.goldenImagePvc, c.goldenImageSnap, c.checkupConfig, false)
log.Printf("Creating VM %q", vmName)
if _, err := c.client.CreateVirtualMachine(ctx, c.namespace, c.vmUnderTest); err != nil {
return fmt.Errorf("failed to create VM: %w", err)
}

if err := c.waitForVMIStatus(ctx, "successfully booted", &c.results.VMBootFromGoldenImage, errStr,
func(vmi *kvcorev1.VirtualMachineInstance) (done bool, err error) {
for i := range vmi.Status.Conditions {
condition := vmi.Status.Conditions[i]
if condition.Type == kvcorev1.VirtualMachineInstanceReady && condition.Status == corev1.ConditionTrue {
return true, nil
}
}
return false, nil
}); err != nil {
if err := c.waitForVMIBoot(ctx, vmName, &c.results.VMBootFromGoldenImage, errStr); err != nil {
return err
}

pvc, err := c.client.GetPersistentVolumeClaim(ctx, c.namespace, vmispec.OSDataVolumName)
pvc, err := c.client.GetPersistentVolumeClaim(ctx, c.namespace, getVMDvName(vmName))
if err != nil {
return err
}
Expand Down Expand Up @@ -842,7 +837,8 @@ func (c *Checkup) checkVMILiveMigration(ctx context.Context, errStr *string) err
return nil
}

vmi, err := c.client.GetVirtualMachineInstance(ctx, c.namespace, c.vmUnderTest.Name)
vmName := c.vmUnderTest.Name
vmi, err := c.client.GetVirtualMachineInstance(ctx, c.namespace, vmName)
if err != nil {
return err
}
Expand All @@ -864,15 +860,15 @@ func (c *Checkup) checkVMILiveMigration(ctx context.Context, errStr *string) err
Name: "vmim",
},
Spec: kvcorev1.VirtualMachineInstanceMigrationSpec{
VMIName: c.vmUnderTest.Name,
VMIName: vmName,
},
}

if _, err := c.client.CreateVirtualMachineInstanceMigration(ctx, c.namespace, vmim); err != nil {
return fmt.Errorf("failed to create VMI LiveMigration: %w", err)
}

if err := c.waitForVMIStatus(ctx, "migration completed", &c.results.VMLiveMigration, errStr,
if err := c.waitForVMIStatus(ctx, vmName, "migration completed", &c.results.VMLiveMigration, errStr,
func(vmi *kvcorev1.VirtualMachineInstance) (done bool, err error) {
if ms := vmi.Status.MigrationState; ms != nil {
if ms.Completed {
Expand Down Expand Up @@ -933,11 +929,12 @@ func (c *Checkup) checkVMIHotplugVolume(ctx context.Context, errStr *string) err
},
}

if err := c.client.AddVirtualMachineInstanceVolume(ctx, c.namespace, c.vmUnderTest.Name, addVolumeOpts); err != nil {
vmName := c.vmUnderTest.Name
if err := c.client.AddVirtualMachineInstanceVolume(ctx, c.namespace, vmName, addVolumeOpts); err != nil {
return err
}

if err := c.waitForVMIStatus(ctx, "hotplug volume ready", &c.results.VMHotplugVolume, errStr,
if err := c.waitForVMIStatus(ctx, vmName, "hotplug volume ready", &c.results.VMHotplugVolume, errStr,
func(vmi *kvcorev1.VirtualMachineInstance) (done bool, err error) {
for i := range vmi.Status.VolumeStatus {
vs := vmi.Status.VolumeStatus[i]
Expand All @@ -955,11 +952,11 @@ func (c *Checkup) checkVMIHotplugVolume(ctx context.Context, errStr *string) err
Name: hotplugVolumeName,
}

if err := c.client.RemoveVirtualMachineInstanceVolume(ctx, c.namespace, c.vmUnderTest.Name, removeVolumeOpts); err != nil {
if err := c.client.RemoveVirtualMachineInstanceVolume(ctx, c.namespace, vmName, removeVolumeOpts); err != nil {
return err
}

if err := c.waitForVMIStatus(ctx, "hotplug volume removed", &c.results.VMHotplugVolume, errStr,
if err := c.waitForVMIStatus(ctx, vmName, "hotplug volume removed", &c.results.VMHotplugVolume, errStr,
func(vmi *kvcorev1.VirtualMachineInstance) (done bool, err error) {
for i := range vmi.Status.VolumeStatus {
vs := vmi.Status.VolumeStatus[i]
Expand All @@ -975,11 +972,83 @@ func (c *Checkup) checkVMIHotplugVolume(ctx context.Context, errStr *string) err
return nil
}

type checkVMIStatusFn func(*kvcorev1.VirtualMachineInstance) (done bool, err error)
func (c *Checkup) checkConcurrentVMIBoot(ctx context.Context, errStr *string) error {
numOfVMs := c.checkupConfig.NumOfVMs
log.Printf("checkConcurrentVMIBoot numOfVMs:%d", numOfVMs)

func (c *Checkup) waitForVMIStatus(ctx context.Context, checkMsg string, result, errStr *string, checkVMIStatus checkVMIStatusFn) error {
vmName := c.vmUnderTest.Name
if c.goldenImagePvc == nil && c.goldenImageSnap == nil {
log.Print(MessageSkipNoGoldenImage)
c.results.ConcurrentVMBoot = MessageSkipNoGoldenImage
return nil
}

var wg sync.WaitGroup
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you wanted to simplify, you could get rid of sync.WaitGroup and just have 2 loops

  • loop over creation
  • loop over waiting for ready

definitely not a must, just something to consider

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I personally prefer the goroutine-way. Looping over creation is less "concurrent", as it's serialized and you will see the last VM created long after the first one when you create for example 100 VMs. Tried that in the first shot but didn't really like this behavior. Looping over the wait for ready is ok (although it's cheaper from the goroutine if we already have it), but since it's serial we won't have the nice indication in the log whenever each VM is ready.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right the creation http request will block. nvm

isBootOk := true

for i := 0; i < numOfVMs; i++ {
wg.Add(1)
go func() {
defer wg.Done()

vmName := uniqueVMName()
log.Printf("Creating VM %q", vmName)
vm := newVMUnderTest(vmName, c.goldenImagePvc, c.goldenImageSnap, c.checkupConfig, true)
if _, err := c.client.CreateVirtualMachine(ctx, c.namespace, vm); err != nil {
log.Printf("failed to create VM %q: %s", vmName, err)
isBootOk = false
return
}

defer func() {
if err := c.client.DeleteVirtualMachine(ctx, c.namespace, vmName); err != nil {
log.Printf("failed to delete VM %q: %s", vmName, err)
}
}()

var result, errs string
if err := c.waitForVMIBoot(ctx, vmName, &result, &errs); err != nil || errs != "" {
log.Printf("failed waiting for VM boot %q", vmName)
isBootOk = false
}
}()
}

wg.Wait()
if !isBootOk {
log.Print(ErrBootFailedOnSomeVMs)
c.results.ConcurrentVMBoot = ErrBootFailedOnSomeVMs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe also keep track of the failed VMs and report how many of them errored

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not useful imho, as we are not interested in partial success, and you have already have each VMI boot status in the log anyway.

appendSep(errStr, ErrBootFailedOnSomeVMs)
return nil
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see all the returns from this function are nil, should you return some error when boot fails to abort the checkup? Or is this expected?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we concurrently boot the VMs via goroutines, and we don't really have a tight timing expectations here (we have a timeout for the whole checkup), I think the best way is to wait for the goroutines to complete (they have early defer for the case of failure). We would like to continue to following checks (in the future), so I see no reason to return a error.

}

log.Print(MessageBootCompletedOnAllVMs)
c.results.ConcurrentVMBoot = MessageBootCompletedOnAllVMs

return nil
}

func (c *Checkup) waitForVMIBoot(ctx context.Context, vmName string, result, errStr *string) error {
return c.waitForVMIStatus(ctx, vmName, "successfully booted", result, errStr,
func(vmi *kvcorev1.VirtualMachineInstance) (done bool, err error) {
for i := range vmi.Status.Conditions {
condition := vmi.Status.Conditions[i]
if condition.Type == kvcorev1.VirtualMachineInstanceReady && condition.Status == corev1.ConditionTrue {
return true, nil
}
}
return false, nil
})
}

func uniqueVMName() string {
const randomStringLen = 5
return fmt.Sprintf("%s-%s", VMIUnderTestNamePrefix, rand.String(randomStringLen))
}

type checkVMIStatusFn func(*kvcorev1.VirtualMachineInstance) (done bool, err error)

func (c *Checkup) waitForVMIStatus(ctx context.Context, vmName, checkMsg string, result, errStr *string,
checkVMIStatus checkVMIStatusFn) error {
conditionFn := func(ctx context.Context) (bool, error) {
vmi, err := c.client.GetVirtualMachineInstance(ctx, c.namespace, vmName)
if err != nil {
Expand Down
Loading
Loading