Merge pull request #645 from NVIDIA/master

20.08.1 Release, pull a bunch of bugfixes for HA K8S, CentOS, etc. into 20.08
NVIDIA · Aug 25, 2020 · 762494b · 762494b
2 parents 6e716ab + 478f529
commit 762494b
Show file tree

Hide file tree

Showing 23 changed files with 186 additions and 109 deletions.
diff --git a/.jenkins-scripts/get-k8s-debug.sh b/.jenkins-scripts/get-k8s-debug.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+set -x
+source .jenkins-scripts/jenkins-common.sh
+
+# Ensure working directory is root
+cd "${ROOT_DIR}"
+
+export KF_DIR=${ROOT_DIR}/config/kubeflow
+export KFCTL=${ROOT_DIR}/config/kfctl
+
+# Get some basic info about all nodes
+kubectl describe nodes
+kubectl get nodes
+
+# Get some basic info about all running pods
+kubectl get pods -A
+kubectl get daemonsets -A
+
+# Get helm status (requires helm install)
+helm list
diff --git a/.jenkins-scripts/test-dashboard.sh b/.jenkins-scripts/test-dashboard.sh
@@ -13,7 +13,7 @@ source ./scripts/k8s_deploy_dashboard_user.sh
 timeout=120
 time=0
 while [ ${time} -lt ${timeout} ]; do
-  curl -ks --raw -L "${dashboard_url}" && \
+  curl -ks --raw -kL "${dashboard_url}" | grep "Kubernetes Dashboard" && \
     echo "Dashboard URLs are all responding" && exit 0
   let time=$time+15
   sleep 15

diff --git a/.jenkins-scripts/test-kubeflow-pipeline.py b/.jenkins-scripts/test-kubeflow-pipeline.py
@@ -10,28 +10,28 @@
 def test_kubeflow_op():
     op = kfp.dsl.ContainerOp(
       name='kubeflow-test-op',
-      image='nvcr.io/nvidia/rapidsai/rapidsai:cuda10.1-runtime-centos7',
-      command=["/bin/bash", "-cx"],
+      image='busybox',
+      command=["/bin/sh", "-cx"],
       arguments=["echo 'Container started!'"],
       file_outputs={}
       )                 
 kfp.compiler.Compiler().compile(test_kubeflow_op, 'kubeflow-test.yml')
 
 # Connect to Kubeflow and create job, this simply rungs RAPIDS and prints out a message                 
 while True:
+    time.sleep(30) # Occasionally Kubeflow fails to respond even when all deployments are up. I don't know why, sometimes it is a 403, sometimes a 500, and sometimes it works. So we will just wait and retry until the test/script times out.
     try:
         print("Submitting Kubeflow pipeline")
         run_result = kfp.Client(host=None).create_run_from_pipeline_package('kubeflow-test.yml', arguments={})
         break # This means it worked!
     except kfp_server_api.rest.ApiException as e:
         print("Hit an error, waiting and trying again: {}".format(e))
-        time.sleep(30) # Occassionally Kubeflow fails to respond even when all deployments are up. I don't know why, sometimes it is a 403, sometimes a 500, and sometimes it works. So we will just wait and re-try until the test/script times out.
 
-for i in range(70): # The test .sh times out after 600 seconds. So we run a little longer than that. This accounts mostly for NGC download time.
+for i in range(70): # The test eventually times out. So we run a little longer than that. This accounts mostly for NGC download time.
     print("Polling for pipeline status: {} - {}".format(run_result, i))
-    status = kfp.Client(host=None).get_run(run_result.run_id).run.status
-    if status == "Succeeded":
+    run = kfp.Client(host=None).get_run(run_result.run_id).run
+    if run.status == "Succeeded":
         print("SUCCESS: Kubeflow launched a container successfully")
         break
-    print("Got {}, waiting some more...".format(status))
+    print("Got {}, waiting some more... {}".format(run.status, run))
     time.sleep(10) # Wait 10 seconds and poll
diff --git a/.jenkins-scripts/test-kubeflow-pipeline.sh b/.jenkins-scripts/test-kubeflow-pipeline.sh
@@ -5,16 +5,17 @@ source .jenkins-scripts/jenkins-common.sh
 # Ensure working directory is root
 cd "${ROOT_DIR}"
 
+export KUBEFLOW_DEPLOYMENTS="profiles-deployment centraldashboard ml-pipeline minio mysql metadata-db" # TODO: We will only poll for these, because other services currently fail to come up in Jenkins due to low disk space
+
 # Install the optional kfp package
 sudo pip3 install kfp
 
-# Wait for the kubeflow pipeline service to be ready, and then wait another 30 seconds for other random Kubeflow initialization
-# Don't wait for katib or a few other things that take longer to initialize
-export KUBEFLOW_DEPLOYMENTS="profiles-deployment centraldashboard ml-pipeline minio mysql metadata-db"
+# Wait for the kubeflow pipeline service to be ready
 ./scripts/k8s_deploy_kubeflow.sh -w
 
 kubectl get pods -n kubeflow # Do this for debug purposes
 
 # Run the Kubeflow pipeline test, this will build a pipeline that launches an NGC container
 # For some reason the initial pipeline creation hangs sometime (and doesn't timeout or error out or provide any logging) so we run this twice until success or timeout
-timeout 600 python3 .jenkins-scripts/test-kubeflow-pipeline.py || timeout 600 python3 .jenkins-scripts/test-kubeflow-pipeline.py
+python3 .jenkins-scripts/test-kubeflow-pipeline.py
+kubectl get pods -n kubeflow # Do this for debug purposes
diff --git a/.jenkins-scripts/test-kubeflow.sh b/.jenkins-scripts/test-kubeflow.sh
@@ -1,12 +1,37 @@
 #!/bin/bash
 set -x
 source .jenkins-scripts/jenkins-common.sh
+cp /var/lib/jenkins/kustomize ${ROOT_DIR}/config # kustomize is saved off on the Jenkins server because the kustomize servers often rate-limit causing failed downloads
 
 # Ensure working directory is root
 cd "${ROOT_DIR}"
 
 export KF_DIR=${ROOT_DIR}/config/kubeflow
 export KFCTL=${ROOT_DIR}/config/kfctl
+export KUBEFLOW_DEPLOYMENTS="profiles-deployment centraldashboard ml-pipeline minio mysql metadata-db" # TODO: We will only poll for these, because other services currently fail to come up in Jenkins due to low disk space
+
+# Deploy Kubeflow with Dex
+source ./scripts/k8s_deploy_kubeflow.sh -x
+
+# The deployment script exports the http endpoints, verify it returns a 200
+# It typically takes ~5 minutes for all pods and services to start, so we poll
+timeout=600
+time=0
+while [ ${time} -lt ${timeout} ]; do
+  curl -s --raw -L "${kf_url}" && \
+    echo "Kubeflow is homepage is up " && break
+  let time=$time+15
+  sleep 15
+done
+curl -s --raw -L "${kf_url}" || exit 1 # If Kubeflow didn't come up in 600 seconds, fail
+
+# Wait for it to come up and view pods
+./scripts/k8s_deploy_kubeflow.sh -w
+kubectl get pods -n kubeflow
+
+# Delete Kubeflow and view namespaces
+./scripts/k8s_deploy_kubeflow.sh -d
+kubectl get ns
 
 # Deploy Kubflow
 source ./scripts/k8s_deploy_kubeflow.sh
@@ -17,11 +42,8 @@ timeout=600
 time=0
 while [ ${time} -lt ${timeout} ]; do
   curl -s --raw -L "${kf_url}" && \
-    echo "Kubeflow is homepage is up " && exit 0
+    echo "Kubeflow is homepage is up " && exit 0 # Rather than poll here, we wait for the later kubeflow-pipeline test to poll and proceed to save testing time; kubeflow will continue coming up as monitoring and k8s dashboard tests run
   let time=$time+15
   sleep 15
 done
-
-# Kubeflow deployment failure
-echo "Kubeflow did not come up in time"
-exit 1
+curl -s --raw -L "${kf_url}" || exit 1 # If Kubeflow didn't come up in 600 seconds, fail
diff --git a/.jenkins-scripts/test-monitoring.sh b/.jenkins-scripts/test-monitoring.sh
@@ -16,11 +16,14 @@ while [ ${time} -lt ${timeout} ]; do
   curl -s --raw -L "${prometheus_url}" && \
     curl -s --raw -L "${grafana_url}" && \
     curl -s --raw -L "${alertmanager_url}"  && \
-    echo "Monitoring URLs are all responding" && exit 0
+    echo "Monitoring URLs are all responding" && break
   let time=$time+15
   sleep 15
 done
 
+# Delete Monitoring; only report success if the URLs responded before the polling loop timed out
+monitoring_up=$(( time < timeout )); source ./scripts/k8s_deploy_monitoring.sh -d && [ "${monitoring_up}" -eq 1 ] && exit 0
+
 # Monitoring deployment failure
 echo "Monitoring did not come up in time"
 exit 1
diff --git a/config.example/group_vars/k8s-cluster.yml b/config.example/group_vars/k8s-cluster.yml
@@ -26,11 +26,12 @@ deepops_gpu_operator_enabled: false
 # Addons deployed in kube-system namespaces are handled.
 #podsecuritypolicy_enabled: false
 
-# kubespray v2.12.2 deploys dashboard 1.10.1 which is no longer supported in k8s 1.16
-# https://github.com/kubernetes/dashboard/issues/4401#issuecomment-540476478
+# Pin the kubespray dashboard and metrics-scraper images https://github.com/kubernetes/dashboard/releases/tag/v2.0.3
 dashboard_enabled: true
-dashboard_image_tag: "v2.0.0-rc5"
+dashboard_image_tag: "v2.0.3"
 dashboard_image_repo: "kubernetesui/dashboard"
+dashboard_metrics_scraper_tag: "v1.0.4"
+dashboard_metrics_scraper_repo: "kubernetesui/metrics-scraper"
 
 # kubespray v2.13.1 deploys helm v3.1.2
 helm_version: "v3.1.2"

diff --git a/config.example/group_vars/slurm-cluster.yml b/config.example/group_vars/slurm-cluster.yml
@@ -131,4 +131,4 @@ allow_user_set_gpu_clocks: no
 ################################################################################
 slurm_install_enroot: true
 slurm_install_pyxis: true
-slurm_pyxis_version: 0.8.0
+slurm_pyxis_version: 0.8.1
diff --git a/jenkins/Jenkinsfile b/jenkins/Jenkinsfile
@@ -36,6 +36,11 @@ pipeline {
             bash -x ./.jenkins-scripts/test-cluster-up.sh
           '''
 
+          echo "Get K8S Cluster Status"
+          sh '''
+            bash -x ./.jenkins-scripts/get-k8s-debug.sh
+          '''
+
           echo "Verify we can run a GPU job"
           sh '''
             timeout 500 bash -x ./.jenkins-scripts/run-gpu-job.sh

diff --git a/jenkins/Jenkinsfile-multi-nightly b/jenkins/Jenkinsfile-multi-nightly
@@ -36,6 +36,11 @@ pipeline {
             bash -x ./.jenkins-scripts/test-cluster-up.sh
           '''
 
+          echo "Get K8S Cluster Status"
+          sh '''
+            bash -x ./.jenkins-scripts/get-k8s-debug.sh
+          '''
+
           echo "Verify we can run a GPU job"
           sh '''
             timeout 500 bash -x ./.jenkins-scripts/run-gpu-job.sh
@@ -53,7 +58,7 @@ pipeline {
 
           echo "Test Kubeflow installation"
           sh '''
-             timeout 1500 bash -x ./.jenkins-scripts/test-kubeflow.sh
+             timeout 3000 bash -x ./.jenkins-scripts/test-kubeflow.sh
           '''
 
           echo "Test Monitoring installation"
@@ -124,6 +129,11 @@ pipeline {
             bash -x ./.jenkins-scripts/test-cluster-up.sh
           '''
 
+          echo "Get K8S Cluster Status"
+          sh '''
+            bash -x ./.jenkins-scripts/get-k8s-debug.sh
+          '''
+
           echo "Verify we can run a GPU job"
           sh '''
             timeout 500 bash -x ./.jenkins-scripts/run-gpu-job.sh
@@ -141,7 +151,7 @@ pipeline {
 
           echo "Test Kubeflow installation"
           sh '''
-             timeout 1500 bash -x ./.jenkins-scripts/test-kubeflow.sh
+             timeout 3000 bash -x ./.jenkins-scripts/test-kubeflow.sh
           '''
 
           echo "Test Monitoring installation"

diff --git a/jenkins/Jenkinsfile-nightly b/jenkins/Jenkinsfile-nightly
@@ -36,6 +36,11 @@ pipeline {
             bash -x ./.jenkins-scripts/test-cluster-up.sh
           '''
 
+          echo "Get K8S Cluster Status"
+          sh '''
+            bash -x ./.jenkins-scripts/get-k8s-debug.sh
+          '''
+
           echo "Verify we can run a GPU job"
           sh '''
             timeout 500 bash -x ./.jenkins-scripts/run-gpu-job.sh
@@ -53,7 +58,7 @@ pipeline {
 
           echo "Test Kubeflow installation"
           sh '''
-             timeout 1500 bash -x ./.jenkins-scripts/test-kubeflow.sh
+             timeout 3000 bash -x ./.jenkins-scripts/test-kubeflow.sh
           '''
 
           echo "Test Monitoring installation"
@@ -124,6 +129,11 @@ pipeline {
             bash -x ./.jenkins-scripts/test-cluster-up.sh
           '''
 
+          echo "Get K8S Cluster Status"
+          sh '''
+            bash -x ./.jenkins-scripts/get-k8s-debug.sh
+          '''
+
           echo "Verify we can run a GPU job"
           sh '''
             timeout 500 bash -x ./.jenkins-scripts/run-gpu-job.sh
@@ -141,7 +151,7 @@ pipeline {
 
           echo "Test Kubeflow installation"
           sh '''
-             timeout 1500 bash -x ./.jenkins-scripts/test-kubeflow.sh
+             timeout 3000 bash -x ./.jenkins-scripts/test-kubeflow.sh
           '''
 
           echo "Test Monitoring installation"

diff --git a/playbooks/k8s-cluster.yml b/playbooks/k8s-cluster.yml
@@ -200,4 +200,15 @@
   tags:
     - local
 
+# Remove taint from kube-master nodes.
+# This keeps backwards compatibility and allows a few services (monitoring/etc.) to run properly.
+- hosts: kube-master
+  gather_facts: false
+  vars:
+    ansible_become: no
+  tasks:
+    - name: kubeadm | Remove taint for master with node role
+      command: "{{ artifacts_dir }}/kubectl --kubeconfig {{ artifacts_dir }}/admin.conf taint node {{ inventory_hostname }} node-role.kubernetes.io/master:NoSchedule-"
+      delegate_to: localhost
+      failed_when: false # Taint will not be present if kube-master also under kube-node
 
diff --git a/playbooks/nvidia-gpu-operator.yml b/playbooks/nvidia-gpu-operator.yml
@@ -14,7 +14,7 @@
     - nvidia-gpu-operator
 
 # GPU operator
-- hosts: kube-master
+- hosts: kube-master[0]
   become: yes
   tasks:
     - name: Install helm chart for GPU operator

diff --git a/playbooks/nvidia-k8s-gpu-device-plugin.yml b/playbooks/nvidia-k8s-gpu-device-plugin.yml
@@ -1,5 +1,5 @@
 ---
-- hosts: kube-master
+- hosts: kube-master[0]
   become: true
   tasks:
     - name: install k8s GPU plugin

diff --git a/playbooks/slurm.yml b/playbooks/slurm.yml
@@ -12,7 +12,7 @@
       authorized_key:
         user: root
         state: present
-        key: "{{ lookup('file', lookup('env','HOME') + '/.ssh/id_rsa.pub') }}"
+        key: "{{ lookup('file', ansible_ssh_private_key_file | default(lookup('env','HOME') + '/.ssh/id_rsa') + '.pub') }}"
 
 # Build slurm first on all nodes
 - hosts: slurm-cluster

diff --git a/roles/lmod/defaults/main.yml b/roles/lmod/defaults/main.yml
@@ -0,0 +1,9 @@
+---
+lmod_rhel_epel_repo_baseurl: "https://download.fedoraproject.org/pub/epel/$releasever/$basearch/"
+lmod_rhel_epel_repo_gpgkey: "https://epel.mirror.constant.com//RPM-GPG-KEY-EPEL-{{ ansible_distribution_major_version }}"
+
+# include some reasonable defaults for module paths
+sm_prefix: "/sw"
+sm_module_root: "{{ sm_prefix }}/modules"
+sm_module_path: "{{ sm_module_root }}/all"
+sm_software_path: "{{ sm_prefix }}/software"