Skip to content

Commit

Permalink
Merge pull request #645 from NVIDIA/master
Browse files Browse the repository at this point in the history
20.08.1 Release, pull a bunch of bugfixes for HA K8S, CentOS, etc. into 20.08
  • Loading branch information
michael-balint authored Aug 25, 2020
2 parents 6e716ab + 478f529 commit 762494b
Show file tree
Hide file tree
Showing 23 changed files with 186 additions and 109 deletions.
20 changes: 20 additions & 0 deletions .jenkins-scripts/get-k8s-debug.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
# Debug helper: prints node, pod, daemonset and helm release information so it
# shows up in the job log. Only query commands (describe/get/list) are run here.

# Trace every command so the log shows which command produced which output.
set -x
# Shared Jenkins helpers; presumably this is where ROOT_DIR gets defined — TODO confirm.
source .jenkins-scripts/jenkins-common.sh

# Ensure working directory is root
cd "${ROOT_DIR}"

# NOTE(review): KF_DIR and KFCTL are exported but never referenced again in this
# script — confirm whether anything invoked below actually reads them.
export KF_DIR=${ROOT_DIR}/config/kubeflow
export KFCTL=${ROOT_DIR}/config/kfctl

# Get some basic info about all nodes
kubectl describe nodes
kubectl get nodes

# Get some basic info about all running pods
# (-A lists resources across all namespaces)
kubectl get pods -A
kubectl get daemonsets -A

# Get helm status (requires helm install)
helm list
2 changes: 1 addition & 1 deletion .jenkins-scripts/test-dashboard.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ source ./scripts/k8s_deploy_dashboard_user.sh
timeout=120
time=0
while [ ${time} -lt ${timeout} ]; do
curl -ks --raw -L "${dashboard_url}" && \
curl -ks --raw -kL "${dashboard_url}" | grep "Kubernetes Dashboard" && \
echo "Dashboard URLs are all responding" && exit 0
let time=$time+15
sleep 15
Expand Down
14 changes: 7 additions & 7 deletions .jenkins-scripts/test-kubeflow-pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,28 +10,28 @@
def test_kubeflow_op():
op = kfp.dsl.ContainerOp(
name='kubeflow-test-op',
image='nvcr.io/nvidia/rapidsai/rapidsai:cuda10.1-runtime-centos7',
command=["/bin/bash", "-cx"],
image='busybox',
command=["/bin/sh", "-cx"],
arguments=["echo 'Container started!'"],
file_outputs={}
)
kfp.compiler.Compiler().compile(test_kubeflow_op, 'kubeflow-test.yml')

# Connect to Kubeflow and create job, this simply runs RAPIDS and prints out a message
while True:
time.sleep(30) # Occassionally Kubeflow fails to respond even when all deployments are up. I don't know why, sometimes it is a 403, sometimes a 500, and sometimes it works. So we will just wait and re-try until the test/script times out.
try:
print("Submitting Kubeflow pipeline")
run_result = kfp.Client(host=None).create_run_from_pipeline_package('kubeflow-test.yml', arguments={})
break # This means it worked!
except kfp_server_api.rest.ApiException as e:
print("Hit an error, waiting and trying again: {}".format(e))
time.sleep(30) # Occassionally Kubeflow fails to respond even when all deployments are up. I don't know why, sometimes it is a 403, sometimes a 500, and sometimes it works. So we will just wait and re-try until the test/script times out.

for i in range(70): # The test .sh times out after 600 seconds. So we run a little longer than that. This accounts mostly for NGC download time.
for i in range(70): # The test eventually times out. So we run a little longer than that. This accounts mostly for NGC download time.
print("Polling for pipeline status: {} - {}".format(run_result, i))
status = kfp.Client(host=None).get_run(run_result.run_id).run.status
if status == "Succeeded":
run = kfp.Client(host=None).get_run(run_result.run_id).run
if run.status == "Succeeded":
print("SUCCESS: Kubeflow launched a container successfully")
break
print("Got {}, waiting some more...".format(status))
print("Got {}, waiting some more... {}".format(run.status, run))
time.sleep(10) # Wait 10 seconds and poll
9 changes: 5 additions & 4 deletions .jenkins-scripts/test-kubeflow-pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,17 @@ source .jenkins-scripts/jenkins-common.sh
# Ensure working directory is root
cd "${ROOT_DIR}"

export KUBEFLOW_DEPLOYMENTS="profiles-deployment centraldashboard ml-pipeline minio mysql metadata-db" # TODO: We will only poll for these, because other services currently fail to come up in Jenkins due to low disk space

# Install the optional kfp package
sudo pip3 install kfp

# Wait for the kubeflow pipeline service to be ready, and then wait another 30 seconds for other random Kubeflow initialization
# Don't wait for katib or a few other things that take longer to initialize
export KUBEFLOW_DEPLOYMENTS="profiles-deployment centraldashboard ml-pipeline minio mysql metadata-db"
# Wait for the kubeflow pipeline service to be ready
./scripts/k8s_deploy_kubeflow.sh -w

kubectl get pods -n kubeflow # Do this for debug purposes

# Run the Kubeflow pipeline test, this will build a pipeline that launches an NGC container
# For some reason the initial pipeline creation hangs sometime (and doesn't timeout or error out or provide any logging) so we run this twice until success or timeout
timeout 600 python3 .jenkins-scripts/test-kubeflow-pipeline.py || timeout 600 python3 .jenkins-scripts/test-kubeflow-pipeline.py
python3 .jenkins-scripts/test-kubeflow-pipeline.py
kubectl get pods -n kubeflow # Do this for debug purposes
32 changes: 27 additions & 5 deletions .jenkins-scripts/test-kubeflow.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,37 @@
#!/bin/bash
# CI test for the Kubeflow deployment script. In order, this part of the script:
#   1. deploys Kubeflow with the -x flag (Dex) and polls the exported URL,
#   2. waits for the deployment (-w) and lists the pods,
#   3. deletes the deployment (-d) and lists the namespaces,
#   4. deploys Kubeflow again without flags.
set -x
source .jenkins-scripts/jenkins-common.sh
# NOTE(review): ${ROOT_DIR} is unquoted in the cp below; a value containing
# whitespace would be word-split — confirm ROOT_DIR can never contain spaces.
cp /var/lib/jenkins/kustomize ${ROOT_DIR}/config # kustomize is saved off on the Jenkins server because the kustomize servers often rate-limit causing failed downloads

# Ensure working directory is root
cd "${ROOT_DIR}"

export KF_DIR=${ROOT_DIR}/config/kubeflow
export KFCTL=${ROOT_DIR}/config/kfctl
export KUBEFLOW_DEPLOYMENTS="profiles-deployment centraldashboard ml-pipeline minio mysql metadata-db" # TODO: We will only poll for these, because other services currently fail to come up in Jenkins due to low disk space

# Deploy Kubeflow with Dex
# Sourced (not executed) so that variables it sets, such as kf_url used below,
# remain visible in this shell.
source ./scripts/k8s_deploy_kubeflow.sh -x

# The deployment script exports the http endpoints, verify it returns a 200
# It typically takes ~5 minutes for all pods and services to start, so we poll
# NOTE(review): curl is called without -f/--fail, so it exits 0 for any HTTP
# response it receives, not only a 200 — confirm that is acceptable here.
# The counter advances by 15 per iteration to match the 15 second sleep; time
# spent inside curl itself is not counted.
timeout=600
time=0
while [ ${time} -lt ${timeout} ]; do
curl -s --raw -L "${kf_url}" && \
echo "Kubeflow is homepage is up " && break
let time=$time+15
sleep 15
done
# One final request after the loop decides pass/fail for this phase.
curl -s --raw -L "${kf_url}" || exit 1 # If Kubeflow didn't come up in 600 seconds, fail

# Wait for it to come up and view pods
./scripts/k8s_deploy_kubeflow.sh -w
kubectl get pods -n kubeflow

# Delete Kubeflow and view namespaces
./scripts/k8s_deploy_kubeflow.sh -d
kubectl get ns

# Deploy Kubeflow
source ./scripts/k8s_deploy_kubeflow.sh
Expand All @@ -17,11 +42,8 @@ timeout=600
time=0
while [ ${time} -lt ${timeout} ]; do
curl -s --raw -L "${kf_url}" && \
echo "Kubeflow is homepage is up " && exit 0
echo "Kubeflow is homepage is up " && exit 0 # Rather than poll here, we wait for the later kubeflow-pipeline test to poll and proceed to save testing time; kubeflow will continue coming up as monitoring and k8s dashboard tests run
let time=$time+15
sleep 15
done

# Kubeflow deployment failure
echo "Kubeflow did not come up in time"
exit 1
curl -s --raw -L "${kf_url}" || exit 1 # If Kubeflow didn't come up in 600 seconds, fail
5 changes: 4 additions & 1 deletion .jenkins-scripts/test-monitoring.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,14 @@ while [ ${time} -lt ${timeout} ]; do
curl -s --raw -L "${prometheus_url}" && \
curl -s --raw -L "${grafana_url}" && \
curl -s --raw -L "${alertmanager_url}" && \
echo "Monitoring URLs are all responding" && exit 0
echo "Monitoring URLs are all responding" && break
let time=$time+15
sleep 15
done

# Delete Monitoring
source ./scripts/k8s_deploy_monitoring.sh -d && exit 0

# Monitoring deployment failure
echo "Monitoring did not come up in time"
exit 1
7 changes: 4 additions & 3 deletions config.example/group_vars/k8s-cluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@ deepops_gpu_operator_enabled: false
# Addons deployed in kube-system namespaces are handled.
#podsecuritypolicy_enabled: false

# kubespray v2.12.2 deploys dashboard 1.10.1 which is no longer supported in k8s 1.16
# https://github.com/kubernetes/dashboard/issues/4401#issuecomment-540476478
# Pin the version of kubespray dashboard https://github.com/kubernetes/dashboard/releases/tag/v2.0.3
dashboard_enabled: true
dashboard_image_tag: "v2.0.0-rc5"
dashboard_image_tag: "v2.0.3"
dashboard_image_repo: "kubernetesui/dashboard"
# Pin the metrics-scraper image that accompanies the dashboard.
# The tag key follows the same <component>_tag / <component>_repo naming as
# dashboard_image_tag / dashboard_image_repo; the previous spelling
# ("dashboard_metrics_scrape_tagr") did not match that pattern, so the pin was
# presumably never picked up — verify against the kubespray variable name.
dashboard_metrics_scraper_tag: "v1.0.4"
dashboard_metrics_scraper_repo: "kubernetesui/metrics-scraper"

# kubespray v2.13.1 deploys helm v3.1.2
helm_version: "v3.1.2"
Expand Down
2 changes: 1 addition & 1 deletion config.example/group_vars/slurm-cluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -131,4 +131,4 @@ allow_user_set_gpu_clocks: no
################################################################################
slurm_install_enroot: true
slurm_install_pyxis: true
slurm_pyxis_version: 0.8.0
slurm_pyxis_version: 0.8.1
5 changes: 5 additions & 0 deletions jenkins/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ pipeline {
bash -x ./.jenkins-scripts/test-cluster-up.sh
'''

echo "Get K8S Cluster Status"
sh '''
bash -x ./.jenkins-scripts/get-k8s-debug.sh
'''

echo "Verify we can run a GPU job"
sh '''
timeout 500 bash -x ./.jenkins-scripts/run-gpu-job.sh
Expand Down
14 changes: 12 additions & 2 deletions jenkins/Jenkinsfile-multi-nightly
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ pipeline {
bash -x ./.jenkins-scripts/test-cluster-up.sh
'''

echo "Get K8S Cluster Status"
sh '''
bash -x ./.jenkins-scripts/get-k8s-debug.sh
'''

echo "Verify we can run a GPU job"
sh '''
timeout 500 bash -x ./.jenkins-scripts/run-gpu-job.sh
Expand All @@ -53,7 +58,7 @@ pipeline {

echo "Test Kubeflow installation"
sh '''
timeout 1500 bash -x ./.jenkins-scripts/test-kubeflow.sh
timeout 3000 bash -x ./.jenkins-scripts/test-kubeflow.sh
'''

echo "Test Monitoring installation"
Expand Down Expand Up @@ -124,6 +129,11 @@ pipeline {
bash -x ./.jenkins-scripts/test-cluster-up.sh
'''

echo "Get K8S Cluster Status"
sh '''
bash -x ./.jenkins-scripts/get-k8s-debug.sh
'''

echo "Verify we can run a GPU job"
sh '''
timeout 500 bash -x ./.jenkins-scripts/run-gpu-job.sh
Expand All @@ -141,7 +151,7 @@ pipeline {

echo "Test Kubeflow installation"
sh '''
timeout 1500 bash -x ./.jenkins-scripts/test-kubeflow.sh
timeout 3000 bash -x ./.jenkins-scripts/test-kubeflow.sh
'''

echo "Test Monitoring installation"
Expand Down
14 changes: 12 additions & 2 deletions jenkins/Jenkinsfile-nightly
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ pipeline {
bash -x ./.jenkins-scripts/test-cluster-up.sh
'''

echo "Get K8S Cluster Status"
sh '''
bash -x ./.jenkins-scripts/get-k8s-debug.sh
'''

echo "Verify we can run a GPU job"
sh '''
timeout 500 bash -x ./.jenkins-scripts/run-gpu-job.sh
Expand All @@ -53,7 +58,7 @@ pipeline {

echo "Test Kubeflow installation"
sh '''
timeout 1500 bash -x ./.jenkins-scripts/test-kubeflow.sh
timeout 3000 bash -x ./.jenkins-scripts/test-kubeflow.sh
'''

echo "Test Monitoring installation"
Expand Down Expand Up @@ -124,6 +129,11 @@ pipeline {
bash -x ./.jenkins-scripts/test-cluster-up.sh
'''

echo "Get K8S Cluster Status"
sh '''
bash -x ./.jenkins-scripts/get-k8s-debug.sh
'''

echo "Verify we can run a GPU job"
sh '''
timeout 500 bash -x ./.jenkins-scripts/run-gpu-job.sh
Expand All @@ -141,7 +151,7 @@ pipeline {

echo "Test Kubeflow installation"
sh '''
timeout 1500 bash -x ./.jenkins-scripts/test-kubeflow.sh
timeout 3000 bash -x ./.jenkins-scripts/test-kubeflow.sh
'''

echo "Test Monitoring installation"
Expand Down
11 changes: 11 additions & 0 deletions playbooks/k8s-cluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -200,4 +200,15 @@
tags:
- local

# Remove taint from kube-master nodes.
# This keeps backwards compatibility and allows a few services (monitoring/etc.) to run properly.
# The play targets every host in kube-master, but the command itself is delegated
# to localhost and uses the kubectl binary and admin.conf found under artifacts_dir.
- hosts: kube-master
gather_facts: false
vars:
# Privilege escalation is switched off for this play.
ansible_become: no
tasks:
- name: kubeadm | Remove taint for master with node role
# The trailing "-" on the taint spec asks kubectl to remove that taint instead of adding it.
command: "{{ artifacts_dir }}/kubectl --kubeconfig {{ artifacts_dir }}/admin.conf taint node {{ inventory_hostname }} node-role.kubernetes.io/master:NoSchedule-"
delegate_to: localhost
# Never report this task as failed, whatever the command returns.
failed_when: false # Taint will not be present if kube-master also under kube-node

2 changes: 1 addition & 1 deletion playbooks/nvidia-gpu-operator.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
- nvidia-gpu-operator

# GPU operator
- hosts: kube-master
- hosts: kube-master[0]
become: yes
tasks:
- name: Install helm chart for GPU operator
Expand Down
2 changes: 1 addition & 1 deletion playbooks/nvidia-k8s-gpu-device-plugin.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
- hosts: kube-master
- hosts: kube-master[0]
become: true
tasks:
- name: install k8s GPU plugin
Expand Down
2 changes: 1 addition & 1 deletion playbooks/slurm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
authorized_key:
user: root
state: present
key: "{{ lookup('file', lookup('env','HOME') + '/.ssh/id_rsa.pub') }}"
key: "{{ lookup('file', ansible_ssh_private_key_file | default(lookup('env','HOME') + '/.ssh/id_rsa') + '.pub') }}"

# Build slurm first on all nodes
- hosts: slurm-cluster
Expand Down
9 changes: 9 additions & 0 deletions roles/lmod/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
# Default variables for the lmod role.

# EPEL repository location and signing key.
# $releasever and $basearch are left literal in the base URL; presumably the
# package manager expands them — TODO confirm.
# NOTE(review): the gpgkey URL contains a double slash after the host name and
# points at a different host than the baseurl — confirm both are intended.
lmod_rhel_epel_repo_baseurl: "https://download.fedoraproject.org/pub/epel/$releasever/$basearch/"
lmod_rhel_epel_repo_gpgkey: "https://epel.mirror.constant.com//RPM-GPG-KEY-EPEL-{{ ansible_distribution_major_version }}"

# include some reasonable defaults for module paths
# Every path below is derived from sm_prefix, so overriding sm_prefix alone
# moves the module root, the "all" module path and the software path together.
sm_prefix: "/sw"
sm_module_root: "{{ sm_prefix }}/modules"
sm_module_path: "{{ sm_module_root }}/all"
sm_software_path: "{{ sm_prefix }}/software"
Loading

0 comments on commit 762494b

Please sign in to comment.