Skip to content
This repository was archived by the owner on Jan 11, 2023. It is now read-only.

Commit fe3f715

Browse files
authored
Network validation checks during provision (#2196)
* Add DNS + HTTPS checks, capture DNS packets * ARM doesn’t like ‘{‘ * standardizing retrycmd_if_failure usage patterns * Adding DNS pre-check for aptdocker.azureedge.net * tracking time for each retried provision event * standardizing to 3 masters api model for e2e tests * retain e2e resources for debugging * getting metrics logs from all cluster hosts * improved master/agent host retrieval * lint * lint * Adding “agent” substring to e2e api model pools * invalid agent pool name * revert agent forwarding ssh config * restore cleanup * add agent dns validation * 5 seconds between etcddisk mount retries
1 parent 7923b96 commit fe3f715

9 files changed

+135
-38
lines changed

examples/e2e-tests/kubernetes/release/default/definition.json

+3-3
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
}
99
},
1010
"masterProfile": {
11-
"count": 1,
11+
"count": 3,
1212
"dnsPrefix": "",
1313
"vmSize": "Standard_D2_v2",
1414
"OSDiskSizeGB": 200,
@@ -18,7 +18,7 @@
1818
},
1919
"agentPoolProfiles": [
2020
{
21-
"name": "md",
21+
"name": "agentmd",
2222
"count": 3,
2323
"vmSize": "Standard_D2_v2",
2424
"OSDiskSizeGB": 200,
@@ -28,7 +28,7 @@
2828
"vnetSubnetId": "/subscriptions/SUB_ID/resourceGroups/RG_NAME/providers/Microsoft.Network/virtualNetworks/VNET_NAME/subnets/SUBNET_NAME"
2929
},
3030
{
31-
"name": "sa",
31+
"name": "agentsa",
3232
"count": 3,
3333
"vmSize": "Standard_D2_v2",
3434
"OSDiskSizeGB": 200,

parts/k8s/kubernetesagentcustomdata.yml

+4-2
Original file line numberDiff line numberDiff line change
@@ -165,8 +165,10 @@ coreos:
165165
ExecStart=/opt/azure/containers/provision-setup.sh
166166
{{else}}
167167
runcmd:
168-
- retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done ; }
169168
- echo `date`,`hostname`, startruncmd>>/opt/m
169+
- retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done; echo Executed \"$@\" $i times; }
170+
- retrycmd_if_failure nc -zw1 $(grep nameserver /etc/resolv.conf | cut -d \ -f 2) 53
171+
- retrycmd_if_failure nc -zw1 azure.com 443
170172
- apt-mark hold walinuxagent{{GetKubernetesAgentPreprovisionYaml .}}
171173
- echo `date`,`hostname`, preaptupdate>>/opt/m
172174
- retrycmd_if_failure apt-get update
@@ -178,7 +180,7 @@ runcmd:
178180
- systemctl start rpcbind
179181
- systemctl start rpc-statd
180182
- echo `date`,`hostname`, predockerinstall>>/opt/m
181-
- retrycmd_if_failure curl --retry 5 --retry-delay 10 --retry-max-time 30 --max-time 60 -fsSL https://aptdocker.azureedge.net/gpg | apt-key add -
183+
- curl --retry 5 --retry-delay 10 --retry-max-time 30 --max-time 60 -fsSL https://aptdocker.azureedge.net/gpg | apt-key add -
182184
- echo "deb {{WrapAsVariable "dockerEngineDownloadRepo"}} ubuntu-xenial main" | sudo tee /etc/apt/sources.list.d/docker.list
183185
- "echo \"Package: docker-engine\nPin: version {{WrapAsVariable "dockerEngineVersion"}}\nPin-Priority: 550\n\" > /etc/apt/preferences.d/docker.pref"
184186
- retrycmd_if_failure apt-get update

parts/k8s/kubernetesmastercustomdata.yml

+6-3
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ MASTER_ARTIFACTS_CONFIG_PLACEHOLDER
292292
content: |
293293
#!/bin/bash
294294
set -x
295-
retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done ; }
295+
retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done; echo Executed \"$@\" $i times; }
296296
ETCD_VER=v{{WrapAsVariable "etcdVersion"}}
297297
DOWNLOAD_URL={{WrapAsVariable "etcdDownloadURLBase"}}
298298
mkdir -p /tmp/etcd-download
@@ -314,7 +314,7 @@ MASTER_ARTIFACTS_CONFIG_PLACEHOLDER
314314
owner: "root"
315315
content: |
316316
#!/bin/bash
317-
retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done ; }
317+
retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done; echo Executed \"$@\" $i times; }
318318
/bin/echo DAEMON_ARGS=--name "{{WrapAsVerbatim "variables('masterVMNames')[copyIndex(variables('masterOffset'))]"}}" --initial-advertise-peer-urls "{{WrapAsVerbatim "variables('masterEtcdPeerURLs')[copyIndex(variables('masterOffset'))]"}}" --listen-peer-urls "{{WrapAsVerbatim "variables('masterEtcdPeerURLs')[copyIndex(variables('masterOffset'))]"}}" --advertise-client-urls "{{WrapAsVerbatim "variables('masterEtcdClientURLs')[copyIndex(variables('masterOffset'))]"}}" --listen-client-urls "{{WrapAsVerbatim "concat(variables('masterEtcdClientURLs')[copyIndex(variables('masterOffset'))], ',http://127.0.0.1:', variables('masterEtcdClientPort'))"}}" --initial-cluster-token "k8s-etcd-cluster" --initial-cluster "{{WrapAsVerbatim "variables('masterEtcdClusterStates')[div(variables('masterCount'), 2)]"}} --data-dir "/var/lib/etcddisk"" --initial-cluster-state "new" | tee -a /etc/default/etcd
319319
sudo /bin/chown -R etcd:etcd /var/lib/etcd/default
320320
/opt/azure/containers/mountetcd.sh
@@ -347,7 +347,9 @@ coreos:
347347
ExecStart=/opt/azure/containers/provision-setup.sh
348348
{{else}}
349349
runcmd:
350-
- retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done ; }
350+
- retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done; echo Executed \"$@\" $i times; }
351+
- retrycmd_if_failure nc -zw1 $(grep nameserver /etc/resolv.conf | cut -d \ -f 2) 53
352+
- retrycmd_if_failure nc -zw1 azure.com 443
351353
- /opt/azure/containers/setup-etcd.sh > /opt/azure/containers/setup-etcd.log 2>&1
352354
- apt-mark hold walinuxagent {{GetKubernetesMasterPreprovisionYaml}}
353355
- /bin/echo DAEMON_ARGS=--name "{{WrapAsVerbatim "variables('masterVMNames')[copyIndex(variables('masterOffset'))]"}}" --peer-client-cert-auth --peer-trusted-ca-file={{WrapAsVariable "etcdCaFilepath"}} --peer-cert-file={{WrapAsVerbatim "variables('etcdPeerCertFilepath')[copyIndex(variables('masterOffset'))]"}} --peer-key-file={{WrapAsVerbatim "variables('etcdPeerKeyFilepath')[copyIndex(variables('masterOffset'))]"}} --initial-advertise-peer-urls "{{WrapAsVerbatim "variables('masterEtcdPeerURLs')[copyIndex(variables('masterOffset'))]"}}" --listen-peer-urls "{{WrapAsVerbatim "variables('masterEtcdPeerURLs')[copyIndex(variables('masterOffset'))]"}}" --client-cert-auth --trusted-ca-file={{WrapAsVariable "etcdCaFilepath"}} --cert-file={{WrapAsVariable "etcdServerCertFilepath"}} --key-file={{WrapAsVariable "etcdServerKeyFilepath"}} --advertise-client-urls "{{WrapAsVerbatim "variables('masterEtcdClientURLs')[copyIndex(variables('masterOffset'))]"}}" --listen-client-urls "{{WrapAsVerbatim "concat(variables('masterEtcdClientURLs')[copyIndex(variables('masterOffset'))], ',https://127.0.0.1:', variables('masterEtcdClientPort'))"}}" --initial-cluster-token "k8s-etcd-cluster" --initial-cluster "{{WrapAsVerbatim "variables('masterEtcdClusterStates')[div(variables('masterCount'), 2)]"}} --data-dir "/var/lib/etcddisk"" --initial-cluster-state "new" | tee -a /etc/default/etcd
@@ -363,6 +365,7 @@ runcmd:
363365
- retrycmd_if_failure curl --cacert /etc/kubernetes/certs/ca.crt --cert /etc/kubernetes/certs/etcdclient.crt --key /etc/kubernetes/certs/etcdclient.key --retry 5 --retry-delay 10 --retry-max-time 30 --max-time 60 "{{WrapAsVerbatim "variables('masterEtcdClientURLs')[copyIndex(variables('masterOffset'))]"}}"/v2/machines
364366
- retrycmd_if_failure apt-get update
365367
- retrycmd_if_failure apt-get install -y apt-transport-https ca-certificates
368+
- retrycmd_if_failure nc -zw1 aptdocker.azureedge.net 443
366369
- curl --retry 5 --retry-delay 10 --retry-max-time 30 --max-time 60 -fsSL https://aptdocker.azureedge.net/gpg | apt-key add -
367370
- echo "deb {{WrapAsVariable "dockerEngineDownloadRepo"}} ubuntu-xenial main" | sudo tee /etc/apt/sources.list.d/docker.list
368371
- "echo \"Package: docker-engine\nPin: version {{WrapAsVariable "dockerEngineVersion"}}\nPin-Priority: 550\n\" > /etc/apt/preferences.d/docker.pref"

parts/k8s/kubernetesmastercustomscript.sh

+20-8
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,13 @@
2323
# KUBECONFIG_KEY ETCD_SERVER_CERTIFICATE ETCD_SERVER_PRIVATE_KEY ETCD_CLIENT_CERTIFICATE ETCD_CLIENT_PRIVATE_KEY
2424
# ETCD_PEER_CERTIFICATES ETCD_PEER_PRIVATE_KEYS ADMINUSER MASTER_INDEX
2525

26+
# Record DNS traffic seen while the node is being provisioned.
#
# Starts tcpdump in the background on eth0, filtered to UDP port 53, and
# writes the raw packets to /var/log/azure/dnsdump.pcap. tcpdump's own
# stdout/stderr are discarded. The -G 600 / -W 1 pair asks tcpdump to rotate
# the dump every 600 seconds and keep a single file, which should bound the
# capture to roughly ten minutes (see tcpdump(8) to confirm for the installed
# version). The function returns immediately; it does not wait for tcpdump.
packetCaptureProvision() {
    tcpdump -G 600 -W 1 \
        -n -vv \
        -w /var/log/azure/dnsdump.pcap \
        -Z root \
        -i eth0 \
        udp port 53 > /dev/null 2>&1 &
}
30+
31+
packetCaptureProvision
32+
2633
# Find distro name via ID value in releases files and upcase
2734
OS=$(cat /etc/*-release | grep ^ID= | tr -d 'ID="' | awk '{print toupper($0)}')
2835
UBUNTU_OS_NAME="UBUNTU"
@@ -49,7 +56,7 @@ ensureRunCommandCompleted()
4956
echo "waiting for runcmd to finish"
5057
for i in {1..900}; do
5158
if [ -e /opt/azure/containers/runcmd.complete ]; then
52-
echo "runcmd finished"
59+
echo "runcmd finished, took $i seconds"
5360
break
5461
fi
5562
sleep 1
@@ -185,6 +192,7 @@ function ensureKubectl() {
185192
if [ -e $KUBECTL ]
186193
then
187194
kubectlfound=0
195+
echo "kubectl installed successfully, took $i seconds"
188196
break
189197
fi
190198
sleep 1
@@ -203,6 +211,7 @@ function downloadUrl () {
203211
# Wrapper around curl to download blobs more reliably.
204212
# Workaround the --retry issues with a for loop and set a max timeout.
205213
for i in 1 2 3 4 5; do curl --max-time 60 -fsSL ${1}; [ $? -eq 0 ] && break || sleep 10; done
214+
echo Executed curl for \"${1}\" $i times
206215
}
207216

208217
function setMaxPods () {
@@ -466,6 +475,7 @@ function systemctlEnableAndCheck() {
466475
systemctl is-enabled $1
467476
enabled=$?
468477
else
478+
echo "$1 took $i seconds to be enabled by systemctl"
469479
break
470480
fi
471481
sleep 1
@@ -489,7 +499,7 @@ function ensureDocker() {
489499
echo "status $?"
490500
/bin/systemctl restart docker
491501
else
492-
echo "docker started"
502+
echo "docker started, took $i seconds"
493503
dockerStarted=0
494504
break
495505
fi
@@ -543,15 +553,15 @@ function ensureApiserver() {
543553
$KUBECTL cluster-info
544554
if [ "$?" = "0" ]
545555
then
546-
echo "kubernetes started"
556+
echo "kubernetes started, took $i seconds"
547557
kubernetesStarted=0
548558
break
549559
fi
550560
else
551561
/usr/bin/docker ps | grep apiserver
552562
if [ "$?" = "0" ]
553563
then
554-
echo "kubernetes started"
564+
echo "kubernetes started, took $i seconds"
555565
kubernetesStarted=0
556566
break
557567
fi
@@ -570,10 +580,10 @@ function ensureEtcd() {
570580
curl --cacert /etc/kubernetes/certs/ca.crt --cert /etc/kubernetes/certs/etcdclient.crt --key /etc/kubernetes/certs/etcdclient.key --max-time 60 https://127.0.0.1:2379/v2/machines;
571581
if [ $? -eq 0 ]
572582
then
573-
echo "Etcd setup successfully"
583+
echo "Etcd setup successfully, took $i seconds"
574584
break
575585
fi
576-
sleep 5
586+
sleep 1
577587
done
578588
}
579589

@@ -585,14 +595,16 @@ function ensureEtcdDataDir() {
585595
return
586596
else
587597
echo "/var/lib/etcddisk was not found at /dev/sdc1. Trying to mount all devices."
598+
# Seconds to wait between mount retries. There must be no spaces around '=':
# "s = 5" would run a command named "s" and leave $s unset, breaking both the
# "sleep $s" below and the elapsed-time arithmetic.
s=5
588599
for i in {1..60}; do
589600
sudo mount -a && mount | grep /dev/sdc1 | grep /var/lib/etcddisk;
590601
if [ "$?" = "0" ]
591602
then
592-
echo "/var/lib/etcddisk mounted at: /dev/sdc1"
603+
(( t = ${i} * ${s} ))
604+
echo "/var/lib/etcddisk mounted at: /dev/sdc1, took $t seconds"
593605
return
594606
fi
595-
sleep 5
607+
sleep $s
596608
done
597609
fi
598610

parts/k8s/kubernetesmastergenerateproxycertscript.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ openssl genrsa -out $PROXY_CLIENT_KEY 2048
2020
openssl req -new -key $PROXY_CLIENT_KEY -out $PROXY_CLIENT_CSR -subj '/CN=aggregator/O=system:masters'
2121
openssl x509 -req -days 730 -in $PROXY_CLIENT_CSR -CA $PROXY_CRT -CAkey $PROXY_CA_KEY -set_serial 02 -out $PROXY_CLIENT_CRT
2222

23-
retrycmd_if_failure() { for i in 1 2 3 4 5 6 7 8 9 10; do $@; [ $? -eq 0 ] && break || sleep 30; done ; }
23+
# Run a command until it succeeds, trying at most 10 times with a 30 second
# pause after each failed attempt, then report how many attempts were made.
#
# Arguments: the command to run followed by its arguments.
# Output:    one line of the form: Executed "<command>" <attempts> times
# Returns:   the exit status of the final echo, i.e. the function itself does
#            not report whether the command ever succeeded (unchanged from the
#            previous implementation).
retrycmd_if_failure() {
    for i in $(seq 1 10); do
        # "$@" (quoted) passes every argument through exactly as received;
        # unquoted $@ would re-split arguments that contain whitespace and
        # expand any glob characters in them.
        "$@" && break
        sleep 30
    done
    echo "Executed \"$*\" $i times"
}
2424

2525
write_certs_to_disk() {
2626
etcdctl get $ETCD_REQUESTHEADER_CLIENT_CA > $K8S_PROXY_CA_CRT_FILEPATH

test/e2e/azure/cli.go

+29
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@ type ResourceGroup struct {
2828
Location string
2929
}
3030

31+
// VM describes a single Azure virtual machine. Only the machine name is
// captured; it is read from the "name" key when decoding JSON.
type VM struct {
	// Name is the virtual machine's name.
	Name string `json:"name"`
}
35+
3136
// Deployment represents a deployment of an acs cluster
3237
type Deployment struct {
3338
Name string // Name of the deployment
@@ -217,3 +222,27 @@ func (a *Account) UpdateRouteTables(subnet, vnet string) error {
217222
}
218223
return nil
219224
}
225+
226+
// GetHosts will get a list of vms in the resource group
227+
func (a *Account) GetHosts(name string) ([]VM, error) {
228+
var resourceGroup string
229+
if name != "" {
230+
resourceGroup = name
231+
} else {
232+
resourceGroup = a.ResourceGroup.Name
233+
}
234+
cmd := exec.Command("az", "vm", "list", "-g", resourceGroup)
235+
util.PrintCommand(cmd)
236+
out, err := cmd.CombinedOutput()
237+
if err != nil {
238+
log.Printf("Error while trying to get vm list:%s\n", out)
239+
return nil, err
240+
}
241+
v := []VM{{}}
242+
err = json.Unmarshal(out, &v)
243+
if err != nil {
244+
log.Printf("Error unmarshalling account json:%s\n", err)
245+
log.Printf("JSON:%s\n", out)
246+
}
247+
return v, nil
248+
}

test/e2e/remote/ssh.go

+21
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"net"
99
"os"
1010
"os/exec"
11+
"path/filepath"
1112
"time"
1213

1314
"github.com/Azure/acs-engine/test/e2e/kubernetes/util"
@@ -117,6 +118,26 @@ func (c *Connection) Read(path string) ([]byte, error) {
117118
return out, nil
118119
}
119120

121+
// CopyRemote uses this ssh connection to scp remote files
122+
func (c *Connection) CopyRemote(hostname, path string) error {
123+
cmd := exec.Command("ssh-add", c.PrivateKeyPath)
124+
out, err := cmd.CombinedOutput()
125+
if err != nil {
126+
log.Printf("Error output:%s\n", out)
127+
return err
128+
}
129+
remoteCommand := fmt.Sprintf("scp -o StrictHostKeyChecking=no %s:%s /tmp/%s-%s", hostname, path, hostname, filepath.Base(path))
130+
connectString := fmt.Sprintf("%s@%s", c.User, c.Host)
131+
cmd = exec.Command("ssh", "-A", "-i", c.PrivateKeyPath, "-o", "ConnectTimeout=30", "-o", "StrictHostKeyChecking=no", connectString, "-p", c.Port, remoteCommand)
132+
util.PrintCommand(cmd)
133+
out, err = cmd.CombinedOutput()
134+
if err != nil {
135+
log.Printf("Error output:%s\n", out)
136+
return err
137+
}
138+
return nil
139+
}
140+
120141
// ExecuteWithRetries will keep retrying a command until it does not return an error or the duration is exceeded
121142
func (c *Connection) ExecuteWithRetries(cmd string, sleep, duration time.Duration) ([]byte, error) {
122143
outCh := make(chan []byte, 1)

test/e2e/runner.go

+3-15
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ package main
22

33
import (
44
"fmt"
5-
"io/ioutil"
65
"log"
76
"os"
87
"os/signal"
@@ -132,20 +131,9 @@ func teardown() {
132131
if err != nil {
133132
log.Printf("cliProvisioner.FetchProvisioningMetrics error: %s\n", err)
134133
}
135-
for _, fp := range []string{"/var/log/azure/cluster-provision.log", "/var/log/cloud-init.log",
136-
"/var/log/cloud-init-output.log", "/var/log/syslog", "/var/log/azure/custom-script/handler.log",
137-
"/opt/m", "/opt/azure/containers/kubelet.sh", "/opt/azure/containers/mountetcd.sh",
138-
"/opt/azure/containers/provision.sh", "/opt/azure/containers/setup-etcd.sh",
139-
"/opt/azure/provision-ps.log"} {
140-
data, err := cliProvisioner.FetchProvisioningMetrics(fp)
141-
if err != nil {
142-
log.Printf("cliProvisioner.FetchProvisioningMetrics error: %s\n", err)
143-
}
144-
target := filepath.Join(logsPath, filepath.Base(fp))
145-
err = ioutil.WriteFile(target, data, 0777)
146-
if err != nil {
147-
log.Printf("ioutil.WriteFile error: %s\n", err)
148-
}
134+
err = cliProvisioner.FetchProvisioningMetrics(logsPath, cfg, acct)
135+
if err != nil {
136+
log.Printf("cliProvisioner.FetchProvisioningMetrics error: %s\n", err)
149137
}
150138
}
151139
if cfg.CleanUpOnExit {

test/e2e/runner/cli_provisioner.go

+48-6
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"os"
99
"os/exec"
1010
"path/filepath"
11+
"strings"
1112
"time"
1213

1314
"github.com/Azure/acs-engine/test/e2e/azure"
@@ -199,16 +200,57 @@ func (cli *CLIProvisioner) waitForNodes() error {
199200
return nil
200201
}
201202

202-
// FetchProvisioningMetrics gets a file from the master
203-
func (cli *CLIProvisioner) FetchProvisioningMetrics(path string) ([]byte, error) {
203+
// FetchProvisioningMetrics gets provisioning files from all hosts in a cluster
204+
func (cli *CLIProvisioner) FetchProvisioningMetrics(path string, cfg *config.Config, acct *azure.Account) error {
205+
var masters, agents []string
206+
hosts, err := acct.GetHosts("")
207+
if err != nil {
208+
return err
209+
}
210+
for _, host := range hosts {
211+
if strings.Contains(host.Name, "master") {
212+
masters = append(masters, host.Name)
213+
} else if strings.Contains(host.Name, "agent") {
214+
agents = append(agents, host.Name)
215+
}
216+
}
217+
agentFiles := []string{"/var/log/azure/cluster-provision.log", "/var/log/cloud-init.log",
218+
"/var/log/cloud-init-output.log", "/var/log/syslog", "/var/log/azure/custom-script/handler.log",
219+
"/opt/m", "/opt/azure/containers/kubelet.sh", "/opt/azure/containers/provision.sh",
220+
"/opt/azure/provision-ps.log", "/var/log/azure/dnsdump.pcap"}
221+
masterFiles := agentFiles
222+
masterFiles = append(masterFiles, "/opt/azure/containers/mountetcd.sh", "/opt/azure/containers/setup-etcd.sh")
204223
hostname := fmt.Sprintf("%s.%s.cloudapp.azure.com", cli.Config.Name, cli.Config.Location)
205224
conn, err := remote.NewConnection(hostname, "22", cli.Engine.ClusterDefinition.Properties.LinuxProfile.AdminUsername, cli.Config.GetSSHKeyPath())
206225
if err != nil {
207-
return nil, err
226+
return err
227+
}
228+
for _, master := range masters {
229+
for _, fp := range masterFiles {
230+
err := conn.CopyRemote(master, fp)
231+
if err != nil {
232+
return fmt.Errorf("Error reading file from path (%s):%s", path, err)
233+
}
234+
}
235+
}
236+
237+
for _, agent := range agents {
238+
for _, fp := range agentFiles {
239+
err := conn.CopyRemote(agent, fp)
240+
if err != nil {
241+
return fmt.Errorf("Error reading file from path (%s):%s", path, err)
242+
}
243+
}
208244
}
209-
data, err := conn.Read(path)
245+
connectString := fmt.Sprintf("%s@%s:/tmp/k8s-*", conn.User, hostname)
246+
logsPath := filepath.Join(cfg.CurrentWorkingDir, "_logs", hostname)
247+
cmd := exec.Command("scp", "-i", conn.PrivateKeyPath, "-o", "ConnectTimeout=30", "-o", "StrictHostKeyChecking=no", connectString, logsPath)
248+
util.PrintCommand(cmd)
249+
out, err := cmd.CombinedOutput()
210250
if err != nil {
211-
return nil, fmt.Errorf("Error reading file from path (%s):%s", path, err)
251+
log.Printf("Error output:%s\n", out)
252+
return err
212253
}
213-
return data, nil
254+
255+
return nil
214256
}

0 commit comments

Comments
 (0)