Skip to content

Commit

Permalink
test: automate scale test execution
Browse files Browse the repository at this point in the history
Signed-off-by: Alex Castilio dos Santos <[email protected]>
  • Loading branch information
alexcastilio committed Jan 20, 2025
1 parent b3cd0ec commit b4284fc
Show file tree
Hide file tree
Showing 11 changed files with 299 additions and 52 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/daily-scale-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Invokes the reusable scale-test workflow (./.github/workflows/scale-test.yaml)
# with a fixed set of inputs.
name: Daily Scale Test

# NOTE(review): despite the workflow name, the only active trigger is a push to
# the branch listed below; the daily cron schedule is still commented out.
on:
push:
branches:
- alexcastilio/scale-test-workflow
# schedule:
# - cron: "0 0 * * *"

permissions:
contents: read
id-token: write

jobs:
call-scale-test:
uses: ./.github/workflows/scale-test.yaml
with:
num_deployments: 300
num_replicas: 100
# TODO: Fix values
num_netpol: 300
# num_nodes is left unset here, so the called workflow's own default applies.
# num_nodes: 100
cleanup: false
# Pass all of this workflow's secrets through to the called workflow.
secrets: inherit
26 changes: 12 additions & 14 deletions .github/workflows/scale-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ on:
description: "Image Namespace (if not set, default namespace will be used)"
type: string
image_tag:
description: "Image Tag (if not set, default for this commit will be used)"
description: "Image Tag (if not set, latest commit from 'main' will be used)"
type: string
num_deployments:
description: "Number of Traffic Deployments"
Expand All @@ -36,25 +36,21 @@ on:

workflow_call:
inputs:
resource_group:
description: "Azure Resource Group"
required: true
type: string
cluster_name:
description: "AKS Cluster Name"
required: true
type: string
num_deployments:
description: "Number of Traffic Deployments"
default: 1000
default: 100
type: number
num_replicas:
description: "Number of Traffic Replicas per Deployment"
default: 40
default: 10
type: number
num_netpol:
description: "Number of Network Policies"
default: 1000
default: 100
type: number
num_nodes:
description: "Number of nodes per pool"
default: 100
type: number
cleanup:
description: "Clean up environment after test"
Expand Down Expand Up @@ -100,8 +96,10 @@ jobs:
IMAGE_NAMESPACE: ${{ github.repository }}
TAG: ${{ inputs.image_tag }}
AZURE_APP_INSIGHTS_KEY: ${{ secrets.AZURE_APP_INSIGHTS_KEY }}
NODES_PER_POOL: ${{ inputs.num_nodes }}
CREATE_INFRA: ${{ github.event_name != 'workflow_dispatch' }}
shell: bash
run: |
set -euo pipefail
[[ $TAG == "" ]] && TAG=$(make version)
go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=false -delete-infra=false
[[ $TAG == "" ]] && TAG=$(curl -s https://api.github.com/repos/microsoft/retina/commits | jq -r '.[0].sha' | cut -c1-7)
go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=$(echo $CREATE_INFRA) -delete-infra=$(echo $CREATE_INFRA)
56 changes: 54 additions & 2 deletions test/e2e/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ package common

import (
"flag"
"os"
"os/user"
"strconv"
"testing"
"time"

"github.com/microsoft/retina/test/e2e/framework/params"
"github.com/stretchr/testify/require"
)

Expand All @@ -30,10 +30,62 @@ var (
Architectures = []string{"amd64", "arm64"}
CreateInfra = flag.Bool("create-infra", true, "create a Resource group, vNET and AKS cluster for testing")
DeleteInfra = flag.Bool("delete-infra", true, "delete a Resource group, vNET and AKS cluster for testing")
ScaleTestInfra = ScaleTestInfraHandler{
location: params.Location,
subscriptionID: params.SubscriptionID,
resourceGroup: params.ResourceGroup,
clusterName: params.ClusterName,
nodesPerPool: params.NodesPerPool,
}
)

// ScaleTestInfraHandler holds the raw, string-typed settings that describe the
// infrastructure used by the scale test. Every field may be empty; the getter
// methods on this type decide how an empty value is handled.
type ScaleTestInfraHandler struct {
	location, subscriptionID, resourceGroup, clusterName, nodesPerPool string
}

// GetSubscriptionID returns the configured Azure subscription ID.
//
// There is no default for this value: if it is empty the test is failed via
// require, with a message naming the environment variable that the package's
// ScaleTestInfra instance reads it from.
func (s ScaleTestInfraHandler) GetSubscriptionID(t *testing.T) string {
	require.NotEmpty(t, s.subscriptionID, "AZURE_SUBSCRIPTION_ID must be set")
	return s.subscriptionID
}

func (s ScaleTestInfraHandler) GetLocation(t *testing.T) string {

Check failure on line 55 in test/e2e/common/common.go

View workflow job for this annotation

GitHub Actions / Lint (linux, amd64)

unused-parameter: parameter 't' seems to be unused, consider removing or renaming it as _ (revive)

Check failure on line 55 in test/e2e/common/common.go

View workflow job for this annotation

GitHub Actions / Lint (linux, arm64)

unused-parameter: parameter 't' seems to be unused, consider removing or renaming it as _ (revive)

Check failure on line 55 in test/e2e/common/common.go

View workflow job for this annotation

GitHub Actions / Lint (windows, amd64)

unused-parameter: parameter 't' seems to be unused, consider removing or renaming it as _ (revive)

Check failure on line 55 in test/e2e/common/common.go

View workflow job for this annotation

GitHub Actions / Lint (windows, arm64)

unused-parameter: parameter 't' seems to be unused, consider removing or renaming it as _ (revive)
if s.location == "" {
return "eastus2"
}
return s.location
}

// GetResourceGroup returns the configured resource group name. When no
// resource group is configured, the cluster name (as returned by
// GetClusterName) doubles as the resource group name.
func (s ScaleTestInfraHandler) GetResourceGroup(t *testing.T) string {
	if s.resourceGroup == "" {
		// No explicit resource group: reuse the cluster name.
		return s.GetClusterName(t)
	}
	return s.resourceGroup
}

func (s ScaleTestInfraHandler) GetNodesPerPool(t *testing.T) int32 {
if s.nodesPerPool == "" {
// Default to 100 nodes per pool
return 100
}
nodesPerPool, err := strconv.Atoi(s.nodesPerPool)
require.NoError(t, err, "NODES_PER_POOL must be an integer")
return int32(nodesPerPool)

Check failure on line 77 in test/e2e/common/common.go

View workflow job for this annotation

GitHub Actions / Lint (linux, amd64)

G109: Potential Integer overflow made by strconv.Atoi result conversion to int16/32 (gosec)

Check failure on line 77 in test/e2e/common/common.go

View workflow job for this annotation

GitHub Actions / Lint (linux, arm64)

G109: Potential Integer overflow made by strconv.Atoi result conversion to int16/32 (gosec)

Check failure on line 77 in test/e2e/common/common.go

View workflow job for this annotation

GitHub Actions / Lint (windows, amd64)

G109: Potential Integer overflow made by strconv.Atoi result conversion to int16/32 (gosec)

Check failure on line 77 in test/e2e/common/common.go

View workflow job for this annotation

GitHub Actions / Lint (windows, arm64)

G109: Potential Integer overflow made by strconv.Atoi result conversion to int16/32 (gosec)

Check failure

Code scanning / CodeQL

Incorrect conversion between integer types High test

Incorrect conversion of an integer with architecture-dependent bit size from
strconv.Atoi
to a lower bit size type int32 without an upper bound check.
}

func (s ScaleTestInfraHandler) GetClusterName(t *testing.T) string {

Check failure on line 80 in test/e2e/common/common.go

View workflow job for this annotation

GitHub Actions / Lint (linux, amd64)

unused-parameter: parameter 't' seems to be unused, consider removing or renaming it as _ (revive)

Check failure on line 80 in test/e2e/common/common.go

View workflow job for this annotation

GitHub Actions / Lint (linux, arm64)

unused-parameter: parameter 't' seems to be unused, consider removing or renaming it as _ (revive)

Check failure on line 80 in test/e2e/common/common.go

View workflow job for this annotation

GitHub Actions / Lint (windows, amd64)

unused-parameter: parameter 't' seems to be unused, consider removing or renaming it as _ (revive)

Check failure on line 80 in test/e2e/common/common.go

View workflow job for this annotation

GitHub Actions / Lint (windows, arm64)

unused-parameter: parameter 't' seems to be unused, consider removing or renaming it as _ (revive)
if s.clusterName != "" {
return s.clusterName
}
return "retina-scale-test"
}

func ClusterNameForE2ETest(t *testing.T) string {
clusterName := os.Getenv("CLUSTER_NAME")
clusterName := params.ClusterName
if clusterName == "" {
curuser, err := user.Current()
require.NoError(t, err)
Expand Down
10 changes: 5 additions & 5 deletions test/e2e/framework/azure/create-cluster-with-npm.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ const (
clusterTimeout = 15 * time.Minute
clusterCreateTicker = 30 * time.Second
pollFrequency = 5 * time.Second
AgentARMSKU = "Standard_D4pls_v5"
AuxilaryNodeCount = 1
AgentARMSKU = "Standard_D4pls_v6"
)

type CreateNPMCluster struct {
Expand All @@ -35,6 +34,7 @@ type CreateNPMCluster struct {
PodCidr string
DNSServiceIP string
ServiceCidr string
NodesPerPool int32
}

func (c *CreateNPMCluster) Prevalidate() error {
Expand All @@ -47,15 +47,15 @@ func (c *CreateNPMCluster) Stop() error {

func (c *CreateNPMCluster) Run() error {
// Start with default cluster template
npmCluster := GetStarterClusterTemplate(c.Location)
npmCluster := GetStarterClusterTemplate(c.Location, c.NodesPerPool)

npmCluster.Properties.NetworkProfile.NetworkPolicy = to.Ptr(armcontainerservice.NetworkPolicyAzure)

//nolint:appendCombine // separate for verbosity
npmCluster.Properties.AgentPoolProfiles = append(npmCluster.Properties.AgentPoolProfiles, &armcontainerservice.ManagedClusterAgentPoolProfile{ //nolint:all
Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets),
// AvailabilityZones: []*string{to.Ptr("1")},
Count: to.Ptr[int32](AuxilaryNodeCount),
Count: to.Ptr[int32](c.NodesPerPool),
EnableNodePublicIP: to.Ptr(false),
Mode: to.Ptr(armcontainerservice.AgentPoolModeUser),
OSType: to.Ptr(armcontainerservice.OSTypeWindows),
Expand Down Expand Up @@ -86,7 +86,7 @@ func (c *CreateNPMCluster) Run() error {
npmCluster.Properties.AgentPoolProfiles = append(npmCluster.Properties.AgentPoolProfiles, &armcontainerservice.ManagedClusterAgentPoolProfile{ //nolint:all
Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets),
// AvailabilityZones: []*string{to.Ptr("1")},
Count: to.Ptr[int32](AuxilaryNodeCount),
Count: to.Ptr[int32](c.NodesPerPool),
EnableNodePublicIP: to.Ptr(false),
Mode: to.Ptr(armcontainerservice.AgentPoolModeUser),
OSType: to.Ptr(armcontainerservice.OSTypeLinux),
Expand Down
6 changes: 3 additions & 3 deletions test/e2e/framework/azure/create-cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ func (c *CreateCluster) Run() error {
return fmt.Errorf("failed to create client: %w", err)
}

poller, err := clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location), nil)
poller, err := clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location, MaxNumberOfNodes), nil)
if err != nil {
return fmt.Errorf("failed to finish the create cluster request: %w", err)
}
Expand All @@ -49,7 +49,7 @@ func (c *CreateCluster) Run() error {
return nil
}

func GetStarterClusterTemplate(location string) armcontainerservice.ManagedCluster {
func GetStarterClusterTemplate(location string, numOfNodes int32) armcontainerservice.ManagedCluster {
id := armcontainerservice.ResourceIdentityTypeSystemAssigned
return armcontainerservice.ManagedCluster{
Location: to.Ptr(location),
Expand All @@ -70,7 +70,7 @@ func GetStarterClusterTemplate(location string) armcontainerservice.ManagedClust
{
Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets),
// AvailabilityZones: []*string{to.Ptr("1")},
Count: to.Ptr[int32](MaxNumberOfNodes),
Count: to.Ptr[int32](numOfNodes),
EnableNodePublicIP: to.Ptr(false),
Mode: to.Ptr(armcontainerservice.AgentPoolModeSystem),
OSType: to.Ptr(armcontainerservice.OSTypeLinux),
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/framework/azure/enable-ama.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ az aks update --enable-azure-monitor-metrics \
return fmt.Errorf("failed to write cluster JSON to file for AMA: %w", err)
}

poller, err := aksClientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location), nil)
poller, err := aksClientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location, MaxNumberOfNodes), nil)
if err != nil {
return fmt.Errorf("failed to finish the update cluster request for AMA: %w", err)
}
Expand Down
76 changes: 76 additions & 0 deletions test/e2e/framework/kubernetes/label-nodes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package kubernetes

import (
	"context"
	"encoding/json"
	"fmt"
	"log"
	"strings"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// patchStringValue is one JSON Patch operation whose value is a string.
// LabelNodes.Run marshals a slice of these and sends it as the body of a
// types.JSONPatchType patch.
type patchStringValue struct {
// Op is the patch operation name (Run uses "add").
Op string `json:"op"`
// Path is the location in the target document the operation applies to.
Path string `json:"path"`
// Value is the string written at Path.
Value string `json:"value"`
}

// LabelNodes is a step that applies a set of labels to every node returned by
// listing the cluster's nodes.
type LabelNodes struct {
// KubeConfigFilePath is the kubeconfig file used to build the client.
KubeConfigFilePath string
// Labels maps label keys to the values to set on each node.
Labels map[string]string
}

// Prevalidate performs no checks for this step and always returns nil.
func (l *LabelNodes) Prevalidate() error {
return nil
}

func (l *LabelNodes) Run() error {
config, err := clientcmd.BuildConfigFromFlags("", l.KubeConfigFilePath)
if err != nil {
return fmt.Errorf("error building kubeconfig: %w", err)
}

clientset, err := kubernetes.NewForConfig(config)
if err != nil {
return fmt.Errorf("error creating Kubernetes client: %w", err)
}

ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second)
defer cancel()

nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
return fmt.Errorf("failed to get nodes: %w", err)
}

patch := []patchStringValue{}
for k, v := range l.Labels {
patch = append(patch, patchStringValue{
Op: "add",
Path: "/metadata/labels/" + k,
Value: v,
})
}
b, err := json.Marshal(patch)
if err != nil {
return fmt.Errorf("failed to marshal patch: %w", err)
}

for _, node := range nodes.Items {

Check failure on line 63 in test/e2e/framework/kubernetes/label-nodes.go

View workflow job for this annotation

GitHub Actions / Lint (linux, amd64)

rangeValCopy: each iteration copies 728 bytes (consider pointers or indexing) (gocritic)

Check failure on line 63 in test/e2e/framework/kubernetes/label-nodes.go

View workflow job for this annotation

GitHub Actions / Lint (linux, arm64)

rangeValCopy: each iteration copies 728 bytes (consider pointers or indexing) (gocritic)

Check failure on line 63 in test/e2e/framework/kubernetes/label-nodes.go

View workflow job for this annotation

GitHub Actions / Lint (windows, amd64)

rangeValCopy: each iteration copies 728 bytes (consider pointers or indexing) (gocritic)

Check failure on line 63 in test/e2e/framework/kubernetes/label-nodes.go

View workflow job for this annotation

GitHub Actions / Lint (windows, arm64)

rangeValCopy: each iteration copies 728 bytes (consider pointers or indexing) (gocritic)
log.Println("Labeling node", node.Name)
_, err = clientset.CoreV1().Nodes().Patch(ctx, node.Name, types.JSONPatchType, b, metav1.PatchOptions{})
if err != nil {
return fmt.Errorf("failed to patch pod: %w", err)
}
}

return nil
}

// Stop does nothing for this step and always returns nil.
func (l *LabelNodes) Stop() error {
return nil
}
17 changes: 17 additions & 0 deletions test/e2e/framework/params/params.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package params

import (
"os"
)

// Test parameters, each read from the named environment variable when the
// package is initialized. They are kept as raw strings (empty when the
// variable is unset); parsing and defaulting are left to the consumers.
var (
// Location is read from LOCATION.
Location = os.Getenv("LOCATION")
// SubscriptionID is read from AZURE_SUBSCRIPTION_ID.
SubscriptionID = os.Getenv("AZURE_SUBSCRIPTION_ID")
// ResourceGroup is read from AZURE_RESOURCE_GROUP.
ResourceGroup = os.Getenv("AZURE_RESOURCE_GROUP")
// ClusterName is read from CLUSTER_NAME.
ClusterName = os.Getenv("CLUSTER_NAME")
// NodesPerPool is read from NODES_PER_POOL.
NodesPerPool = os.Getenv("NODES_PER_POOL")
// NumDeployments is read from NUM_DEPLOYMENTS.
NumDeployments = os.Getenv("NUM_DEPLOYMENTS")
// NumReplicas is read from NUM_REPLICAS.
NumReplicas = os.Getenv("NUM_REPLICAS")
// NumNetworkPolicies is read from NUM_NET_POL.
NumNetworkPolicies = os.Getenv("NUM_NET_POL")
// CleanUp is read from CLEANUP.
CleanUp = os.Getenv("CLEANUP")
)
1 change: 1 addition & 0 deletions test/e2e/jobs/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ func CreateTestInfra(subID, rg, clusterName, location, kubeConfigFilePath string
PodCidr: "10.128.0.0/9",
DNSServiceIP: "192.168.0.10",
ServiceCidr: "192.168.0.0/28",
NodesPerPool: 1,
}, nil)

job.AddStep(&azure.GetAKSKubeConfig{
Expand Down
Loading

0 comments on commit b4284fc

Please sign in to comment.