Build on Schedule #141

Workflow file for this run

.github/workflows/build_on_schedule.yml at e9032fb

	name: Build on Schedule

	# Triggers: a nightly scheduled build plus an on-demand manual run.
	on:
	  schedule:
	    # Runs every day at 00:00 UTC. The day-of-week field is "*", so this
	    # is a daily schedule; a Sunday-only schedule would be "0 0 * * 0".
	    - cron: "0 0 * * *"
	  # Allows the workflow to be started manually.
	  workflow_dispatch:

	jobs:
	  # Single job: build Colossal-AI with its extensions and run the unit tests
	  # on a self-hosted GPU runner, then report any failure to Lark.
	  build:
	    name: Build and Test Colossal-AI
	    # Skip the job anywhere except the hpcaitech/ColossalAI repository.
	    if: github.repository == 'hpcaitech/ColossalAI'
	    runs-on: [self-hosted, gpu]
	    container:
	      image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
	      # Passes all GPUs into the container and bind-mounts /data/scratch,
	      # which the unit-test step reads via LLAMA_PATH and MOE_TENSOR_PATH.
	      options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
	    timeout-minutes: 90
	    steps:
	      # Ensure all GPUs have enough memory: `avai` becomes false as soon as
	      # one GPU reports memory.used above 2000 (nvidia-smi value without
	      # units). Every build/test step below is gated on this output.
	      - name: Check GPU Availability
	        id: check-avai
	        run: |
	          avai=true
	          ngpu=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
	          endIndex=$(($ngpu-1))
	          for i in $(seq 0 $endIndex);
	          do
	            gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
	            [ "$gpu_used" -gt "2000" ] && avai=false
	          done

	          echo "GPU is available: $avai"
	          echo "avai=$avai" >> "$GITHUB_OUTPUT"

	      # Check out the TensorNVMe dependency into ./TensorNVMe.
	      # NOTE(review): actions/checkout@v2 targets a deprecated Node.js
	      # runtime; consider upgrading once the container is confirmed to
	      # support a newer major version.
	      - uses: actions/checkout@v2
	        if: steps.check-avai.outputs.avai == 'true'
	        with:
	          repository: hpcaitech/TensorNVMe
	          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
	          path: TensorNVMe

	      - name: Install tensornvme
	        if: steps.check-avai.outputs.avai == 'true'
	        run: |
	          cd TensorNVMe
	          conda install cmake
	          pip install -r requirements.txt
	          DISABLE_URING=1 pip install -v .

	      # Check out this repository into the workspace root.
	      - uses: actions/checkout@v2
	        if: steps.check-avai.outputs.avai == 'true'
	        with:
	          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}

	      # Restores previously cached build output (when the cache directory is
	      # non-empty), installs in editable mode with BUILD_EXT=1, then copies
	      # the fresh build directory back into the cache for the next run.
	      - name: Install Colossal-AI
	        if: steps.check-avai.outputs.avai == 'true'
	        run: |
	          [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
	          BUILD_EXT=1 pip install -v -e .
	          cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
	          pip install --no-cache-dir -r requirements/requirements-test.txt

	      # Runs everything under tests/ except tests marked `largedist`.
	      - name: Unit Testing
	        if: steps.check-avai.outputs.avai == 'true'
	        run: |
	          PYTHONPATH=$PWD pytest \
	            -m "not largedist" \
	            --durations=0 \
	            tests/
	        env:
	          LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
	          LLAMA_PATH: /data/scratch/llama-tiny
	          MOE_TENSOR_PATH: /data/scratch/moe_tensors

	      # Runs only when an earlier step failed; sends a link to this run to
	      # the Lark webhook. Variables are quoted so the message and the URL
	      # reach the script as single, unmodified arguments.
	      - name: Notify Lark
	        id: message-preparation
	        if: ${{ failure() }}
	        run: |
	          url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
	          msg="Scheduled Build and Test failed, please visit $url for details"
	          echo "$msg"
	          python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u "$WEBHOOK_URL"
	        env:
	          SERVER_URL: ${{ github.server_url }}
	          REPO: ${{ github.repository }}
	          RUN_ID: ${{ github.run_id }}
	          WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Build on Schedule #141

Workflow file

Build on Schedule #141

Jobs

Run details

Workflow file for this run