
Commit d998c63

test: automate scale test execution
Signed-off-by: Alex Castilio dos Santos <alexsantos@microsoft.com>
1 parent b3cd0ec commit d998c63

File tree: 11 files changed, +314 -53 lines
New workflow file: Daily Scale Test (+24 lines)

name: Daily Scale Test

on:
  push:
    branches:
      - alexcastilio/scale-test-workflow
  schedule:
    - cron: "0 0 * * *"

permissions:
  contents: read
  id-token: write

jobs:
  call-scale-test:
    uses: ./.github/workflows/scale-test.yaml
    with:
      num_deployments: 300
      num_replicas: 100
      # TODO: Fix values
      num_netpol: 300
      # num_nodes: 100
      cleanup: false
    secrets: inherit
.github/workflows/scale-test.yaml (+12 -14)

@@ -15,7 +15,7 @@ on:
       description: "Image Namespace (if not set, default namespace will be used)"
       type: string
     image_tag:
-      description: "Image Tag (if not set, default for this commit will be used)"
+      description: "Image Tag (if not set, latest commit from 'main' will be used)"
       type: string
     num_deployments:
       description: "Number of Traffic Deployments"
@@ -36,25 +36,21 @@ on:
 
   workflow_call:
     inputs:
-      resource_group:
-        description: "Azure Resource Group"
-        required: true
-        type: string
-      cluster_name:
-        description: "AKS Cluster Name"
-        required: true
-        type: string
       num_deployments:
         description: "Number of Traffic Deployments"
-        default: 1000
+        default: 100
         type: number
       num_replicas:
         description: "Number of Traffic Replicas per Deployment"
-        default: 40
+        default: 10
         type: number
       num_netpol:
         description: "Number of Network Policies"
-        default: 1000
+        default: 100
+        type: number
+      num_nodes:
+        description: "Number of nodes per pool"
+        default: 100
         type: number
       cleanup:
         description: "Clean up environment after test"
@@ -100,8 +96,10 @@ jobs:
           IMAGE_NAMESPACE: ${{ github.repository }}
           TAG: ${{ inputs.image_tag }}
           AZURE_APP_INSIGHTS_KEY: ${{ secrets.AZURE_APP_INSIGHTS_KEY }}
+          NODES_PER_POOL: ${{ inputs.num_nodes }}
+          CREATE_INFRA: ${{ github.event_name != 'workflow_dispatch' }}
         shell: bash
         run: |
           set -euo pipefail
-          [[ $TAG == "" ]] && TAG=$(make version)
-          go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=false -delete-infra=false
+          [[ $TAG == "" ]] && TAG=$(curl -s https://api.github.com/repos/microsoft/retina/commits | jq -r '.[0].sha' | cut -c1-7)
+          go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=$(echo $CREATE_INFRA) -delete-infra=$(echo $CREATE_INFRA)
test/e2e/common/common.go (+54 -2)

@@ -6,12 +6,12 @@ package common
 
 import (
 	"flag"
-	"os"
 	"os/user"
 	"strconv"
 	"testing"
 	"time"
 
+	"github.com/microsoft/retina/test/e2e/framework/params"
 	"github.com/stretchr/testify/require"
 )
 
@@ -30,10 +30,62 @@
 	Architectures = []string{"amd64", "arm64"}
 	CreateInfra   = flag.Bool("create-infra", true, "create a Resource group, vNET and AKS cluster for testing")
 	DeleteInfra   = flag.Bool("delete-infra", true, "delete a Resource group, vNET and AKS cluster for testing")
+	ScaleTestInfra = ScaleTestInfraHandler{
+		location:       params.Location,
+		subscriptionID: params.SubscriptionID,
+		resourceGroup:  params.ResourceGroup,
+		clusterName:    params.ClusterName,
+		nodesPerPool:   params.NodesPerPool,
+	}
 )
 
+type ScaleTestInfraHandler struct {
+	location       string
+	subscriptionID string
+	resourceGroup  string
+	clusterName    string
+	nodesPerPool   string
+}
+
+func (s ScaleTestInfraHandler) GetSubscriptionID(t *testing.T) string {
+	require.NotEmpty(t, s.subscriptionID)
+	return s.subscriptionID
+}
+
+func (s ScaleTestInfraHandler) GetLocation(t *testing.T) string {
+	if s.location == "" {
+		return "eastus2"
+	}
+	return s.location
+}
+
+func (s ScaleTestInfraHandler) GetResourceGroup(t *testing.T) string {
+	if s.resourceGroup != "" {
+		return s.resourceGroup
+	}
+	// Use the cluster name as the resource group name by default.
+	return s.GetClusterName(t)
+}
+
+func (s ScaleTestInfraHandler) GetNodesPerPool(t *testing.T) int32 {
+	if s.nodesPerPool == "" {
+		// Default to 100 nodes per pool.
+		return 100
+	}
+	nodesPerPool, err := strconv.Atoi(s.nodesPerPool)
+	require.NoError(t, err, "NODES_PER_POOL must be an integer")
+	return int32(nodesPerPool)
+}
+
+func (s ScaleTestInfraHandler) GetClusterName(t *testing.T) string {
+	if s.clusterName != "" {
+		return s.clusterName
+	}
+	return "retina-scale-test"
+}
+
 func ClusterNameForE2ETest(t *testing.T) string {
-	clusterName := os.Getenv("CLUSTER_NAME")
+	clusterName := params.ClusterName
 	if clusterName == "" {
 		curuser, err := user.Current()
 		require.NoError(t, err)
test/e2e/framework/azure/create-cluster-with-npm.go (+20 -6)

@@ -18,11 +18,11 @@ var (
 )
 
 const (
+	largeClusterTimeout = 30 * time.Minute
 	clusterTimeout      = 15 * time.Minute
 	clusterCreateTicker = 30 * time.Second
 	pollFrequency       = 5 * time.Second
-	AgentARMSKU         = "Standard_D4pls_v5"
-	AuxilaryNodeCount   = 1
+	AgentARMSKU         = "Standard_D4pls_v6"
 )
 
 type CreateNPMCluster struct {
@@ -35,6 +35,7 @@ type CreateNPMCluster struct {
 	PodCidr      string
 	DNSServiceIP string
 	ServiceCidr  string
+	NodesPerPool int32
 }
 
 func (c *CreateNPMCluster) Prevalidate() error {
@@ -47,15 +48,19 @@ func (c *CreateNPMCluster) Stop() error {
 
 func (c *CreateNPMCluster) Run() error {
 	// Start with default cluster template
-	npmCluster := GetStarterClusterTemplate(c.Location)
+	npmCluster := GetStarterClusterTemplate(c.Location, c.NodesPerPool)
 
 	npmCluster.Properties.NetworkProfile.NetworkPolicy = to.Ptr(armcontainerservice.NetworkPolicyAzure)
+	npmCluster.Properties.NetworkProfile.PodCidr = to.Ptr(c.PodCidr)
+
+	podSubnetId := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/%s", c.SubscriptionID, c.ResourceGroupName, c.VnetName, c.SubnetName)
+	npmCluster.Properties.AgentPoolProfiles[0].PodSubnetID = to.Ptr(podSubnetId)
 
 	//nolint:appendCombine // separate for verbosity
 	npmCluster.Properties.AgentPoolProfiles = append(npmCluster.Properties.AgentPoolProfiles, &armcontainerservice.ManagedClusterAgentPoolProfile{ //nolint:all
 		Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets),
 		// AvailabilityZones: []*string{to.Ptr("1")},
-		Count:              to.Ptr[int32](AuxilaryNodeCount),
+		Count:              to.Ptr[int32](c.NodesPerPool),
 		EnableNodePublicIP: to.Ptr(false),
 		Mode:               to.Ptr(armcontainerservice.AgentPoolModeUser),
 		OSType:             to.Ptr(armcontainerservice.OSTypeWindows),
@@ -64,6 +69,7 @@ func (c *CreateNPMCluster) Run() error {
 		VMSize:  to.Ptr(AgentSKU),
 		Name:    to.Ptr("ws22"),
 		MaxPods: to.Ptr(int32(MaxPodsPerNode)),
+		PodSubnetID: to.Ptr(podSubnetId),
 	})
 
 	/* todo: add azlinux node pool
@@ -86,14 +92,15 @@ func (c *CreateNPMCluster) Run() error {
 	npmCluster.Properties.AgentPoolProfiles = append(npmCluster.Properties.AgentPoolProfiles, &armcontainerservice.ManagedClusterAgentPoolProfile{ //nolint:all
 		Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets),
 		// AvailabilityZones: []*string{to.Ptr("1")},
-		Count:              to.Ptr[int32](AuxilaryNodeCount),
+		Count:              to.Ptr[int32](c.NodesPerPool),
 		EnableNodePublicIP: to.Ptr(false),
 		Mode:               to.Ptr(armcontainerservice.AgentPoolModeUser),
 		OSType:             to.Ptr(armcontainerservice.OSTypeLinux),
 		ScaleDownMode:      to.Ptr(armcontainerservice.ScaleDownModeDelete),
 		VMSize:             to.Ptr(AgentARMSKU),
 		Name:               to.Ptr("arm64"),
 		MaxPods:            to.Ptr(int32(MaxPodsPerNode)),
+		PodSubnetID:        to.Ptr(podSubnetId),
 	})
 
 	npmCluster.Properties.AutoUpgradeProfile = &armcontainerservice.ManagedClusterAutoUpgradeProfile{
@@ -105,7 +112,14 @@ func (c *CreateNPMCluster) Run() error {
 	if err != nil {
 		return fmt.Errorf("failed to obtain a credential: %w", err)
 	}
-	ctx, cancel := context.WithTimeout(context.Background(), clusterTimeout)
+
+	var timeout time.Duration
+	if c.NodesPerPool > 20 {
+		timeout = largeClusterTimeout
+	} else {
+		timeout = clusterTimeout
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	defer cancel()
 
 	clientFactory, err := armcontainerservice.NewClientFactory(c.SubscriptionID, cred, nil)
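
With NodesPerPool on the step, both the Windows and the ARM64 Linux pools are sized from the same input, and anything above 20 nodes per pool automatically gets the 30-minute timeout. A wiring sketch, as a fragment from a hypothetical job definition (job, t, subID, rg and location are assumed in scope; VnetName/SubnetName values are placeholders, and the field list is abbreviated to what this diff shows):

job.AddStep(&azure.CreateNPMCluster{
	SubscriptionID:    subID,
	ResourceGroupName: rg,
	Location:          location,
	VnetName:          "vnet",   // placeholder
	SubnetName:        "podnet", // placeholder
	PodCidr:           "10.128.0.0/9",
	DNSServiceIP:      "192.168.0.10",
	ServiceCidr:       "192.168.0.0/28",
	NodesPerPool:      common.ScaleTestInfra.GetNodesPerPool(t), // NODES_PER_POOL, default 100
}, nil)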

test/e2e/framework/azure/create-cluster.go (+3 -3)

@@ -37,7 +37,7 @@ func (c *CreateCluster) Run() error {
 		return fmt.Errorf("failed to create client: %w", err)
 	}
 
-	poller, err := clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location), nil)
+	poller, err := clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location, MaxNumberOfNodes), nil)
 	if err != nil {
 		return fmt.Errorf("failed to finish the create cluster request: %w", err)
 	}
@@ -49,7 +49,7 @@ func (c *CreateCluster) Run() error {
 	return nil
 }
 
-func GetStarterClusterTemplate(location string) armcontainerservice.ManagedCluster {
+func GetStarterClusterTemplate(location string, numOfNodes int32) armcontainerservice.ManagedCluster {
 	id := armcontainerservice.ResourceIdentityTypeSystemAssigned
 	return armcontainerservice.ManagedCluster{
 		Location: to.Ptr(location),
@@ -70,7 +70,7 @@ func GetStarterClusterTemplate(location string) armcontainerservice.ManagedClust
 			{
 				Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets),
 				// AvailabilityZones: []*string{to.Ptr("1")},
-				Count:              to.Ptr[int32](MaxNumberOfNodes),
+				Count:              to.Ptr[int32](numOfNodes),
 				EnableNodePublicIP: to.Ptr(false),
 				Mode:               to.Ptr(armcontainerservice.AgentPoolModeSystem),
 				OSType:             to.Ptr(armcontainerservice.OSTypeLinux),
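
Threading the node count through GetStarterClusterTemplate keeps existing callers behaving as before (they pass the package default) while letting scale runs pick the system pool size explicitly. For illustration only, as a fragment from an external caller of the azure framework package (the small count is arbitrary):

// Same template as before this change:
tmpl := azure.GetStarterClusterTemplate(location, azure.MaxNumberOfNodes)
// A smaller system pool for a quick run:
smallTmpl := azure.GetStarterClusterTemplate(location, 3)
_, _ = tmpl, smallTmpl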

test/e2e/framework/azure/enable-ama.go (+1 -1)

@@ -95,7 +95,7 @@ az aks update --enable-azure-monitor-metrics \
 		return fmt.Errorf("failed to write cluster JSON to file for AMA: %w", err)
 	}
 
-	poller, err := aksClientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location), nil)
+	poller, err := aksClientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location, MaxNumberOfNodes), nil)
 	if err != nil {
 		return fmt.Errorf("failed to finish the update cluster request for AMA: %w", err)
 	}
New file: node-labeling step in the kubernetes framework package (+76 lines)

package kubernetes

import (
	"context"
	"encoding/json"
	"fmt"
	"log"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

type patchStringValue struct {
	Op    string `json:"op"`
	Path  string `json:"path"`
	Value string `json:"value"`
}

type LabelNodes struct {
	KubeConfigFilePath string
	Labels             map[string]string
}

func (l *LabelNodes) Prevalidate() error {
	return nil
}

func (l *LabelNodes) Run() error {
	config, err := clientcmd.BuildConfigFromFlags("", l.KubeConfigFilePath)
	if err != nil {
		return fmt.Errorf("error building kubeconfig: %w", err)
	}

	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		return fmt.Errorf("error creating Kubernetes client: %w", err)
	}

	ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second)
	defer cancel()

	nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		return fmt.Errorf("failed to get nodes: %w", err)
	}

	patch := []patchStringValue{}
	for k, v := range l.Labels {
		patch = append(patch, patchStringValue{
			Op:    "add",
			Path:  "/metadata/labels/" + k,
			Value: v,
		})
	}
	b, err := json.Marshal(patch)
	if err != nil {
		return fmt.Errorf("failed to marshal patch: %w", err)
	}

	for _, node := range nodes.Items {
		log.Println("Labeling node", node.Name)
		_, err = clientset.CoreV1().Nodes().Patch(ctx, node.Name, types.JSONPatchType, b, metav1.PatchOptions{})
		if err != nil {
			return fmt.Errorf("failed to patch node: %w", err)
		}
	}

	return nil
}

func (l *LabelNodes) Stop() error {
	return nil
}
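
The step applies the configured labels to every node via a JSON patch. One design note: because each label becomes a patch path of "/metadata/labels/<key>", a key containing "/" (for example "kubernetes.io/role") would need JSON Pointer escaping ("~1"); plain keys are unaffected. A usage sketch, as a fragment (the label key/value and the surrounding job and kubeConfigFilePath variables are illustrative):

job.AddStep(&kubernetes.LabelNodes{
	KubeConfigFilePath: kubeConfigFilePath,
	Labels:             map[string]string{"scale-test": "true"}, // illustrative label
}, nil)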

test/e2e/framework/params/params.go (new file, +17 lines)

package params

import (
	"os"
)

var (
	Location           = os.Getenv("LOCATION")
	SubscriptionID     = os.Getenv("AZURE_SUBSCRIPTION_ID")
	ResourceGroup      = os.Getenv("AZURE_RESOURCE_GROUP")
	ClusterName        = os.Getenv("CLUSTER_NAME")
	NodesPerPool       = os.Getenv("NODES_PER_POOL")
	NumDeployments     = os.Getenv("NUM_DEPLOYMENTS")
	NumReplicas        = os.Getenv("NUM_REPLICAS")
	NumNetworkPolicies = os.Getenv("NUM_NET_POL")
	CleanUp            = os.Getenv("CLEANUP")
)
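
These parameters are plain strings read from the environment, so numeric ones still need parsing at the point of use. A minimal sketch of a hypothetical helper applying the same defaults the reusable workflow declares (100 deployments, 10 replicas, 100 network policies):

package scale // illustrative package name

import (
	"strconv"

	"github.com/microsoft/retina/test/e2e/framework/params"
)

// intParamOrDefault is a hypothetical helper that turns a string-valued
// environment parameter into an int, falling back to def when unset or invalid.
func intParamOrDefault(raw string, def int) int {
	if raw == "" {
		return def
	}
	v, err := strconv.Atoi(raw)
	if err != nil {
		return def
	}
	return v
}

func scaleOptions() (deployments, replicas, netpols int) {
	deployments = intParamOrDefault(params.NumDeployments, 100)
	replicas = intParamOrDefault(params.NumReplicas, 10)
	netpols = intParamOrDefault(params.NumNetworkPolicies, 100)
	return deployments, replicas, netpols
}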

test/e2e/jobs/jobs.go (+1)

@@ -43,6 +43,7 @@ func CreateTestInfra(subID, rg, clusterName, location, kubeConfigFilePath string
 		PodCidr:      "10.128.0.0/9",
 		DNSServiceIP: "192.168.0.10",
 		ServiceCidr:  "192.168.0.0/28",
+		NodesPerPool: 1,
 	}, nil)
 
 	job.AddStep(&azure.GetAKSKubeConfig{
