Merge branch 'karpathy:master' into master
dagelf authored Jul 1, 2024
2 parents c2ffa48 + 61625d1 commit 7c78d9a
Showing 57 changed files with 3,489 additions and 668 deletions.
32 changes: 21 additions & 11 deletions .github/workflows/ci_gpu.yml
@@ -31,19 +31,19 @@ jobs:
run: python train_gpt2.py

- name: Compile training and testing program
run: make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu

- name: Train model (With OpenMP)
run: OMP_NUM_THREADS=8 ./train_gpt2cu

- name: Train model (FP32) with gpt2_124M.bin
run: |
PRECISION=FP32 make train_gpt2cu
./train_gpt2cu -b 4 -t 64 -l 1e-4 -v 200 -s 200 -a 1 -x 10 -e gpt2_124M.bin
- name: Build FP32 precision
run: PRECISION=FP32 make test_gpt2cu profile_gpt2cu

- name: Run default
run: ./test_gpt2cu

@@ -52,7 +52,7 @@ jobs:

- name: Run recompute LN
run: ./test_gpt2cu -r 2

- name: Build BF16 precision
run: PRECISION=BF16 make train_gpt2cu test_gpt2cu profile_gpt2cu

@@ -67,18 +67,18 @@ jobs:

- name: Run recompute LN
run: ./test_gpt2cu -r 2

- name: Train model fp32 (With OpenMP)
run: OMP_NUM_THREADS=8 ./train_gpt2fp32cu

- name: Execute testing program (With OpenMP)
run: OMP_NUM_THREADS=8 ./test_gpt2cu

- name: Execute testing program fp32 (With OpenMP)
run: OMP_NUM_THREADS=8 ./test_gpt2fp32cu

- name: Compile training and testing program without OpenMP
run: NO_OMP=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu

- name: Train model (No OpenMP)
run: NO_OMP=1 ./train_gpt2cu
@@ -88,14 +88,14 @@ jobs:

- name: Execute testing program (No OpenMP)
run: ./test_gpt2cu -b 32

- name: Execute testing program fp32 (No OpenMP)
run: ./test_gpt2fp32cu

- name: Install cuDNN-frontend
run:
git clone https://github.com/NVIDIA/cudnn-frontend.git

- name: Build with cuDNN
run: USE_CUDNN=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu

@@ -110,3 +110,13 @@ jobs:

- name: Execute testing program fp32 with cuDNN
run: ./test_gpt2fp32cu

unit-tests-gpu:
runs-on: ubicloud-gpu-standard-1-latest

steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Test Device<->File IO
run: cd dev/test && nvcc -o device_file_io device_file_io.cu && ./device_file_io
100 changes: 100 additions & 0 deletions .github/workflows/ci_tests.yml
@@ -0,0 +1,100 @@
name: Unit, Static and other Tests

on:
create:
workflow_dispatch:
push:
branches:
- master
pull_request:
branches:
- master

jobs:
dataloader_test:
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@v4

- name: test the dataloader without / with sanitize address
run: |
cd dev/test
make PRECISION=BF16 test_dataloader
./test_dataloader
make clean
make PRECISION=BF16 TEST_CFLAGS="-fsanitize=address -fno-omit-frame-pointer" test_dataloader
./test_dataloader
ptx_and_sass_files:
runs-on: ubuntu-latest
container:
image: nvidia/cuda:12.4.1-devel-ubuntu22.04

steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Install OpenMP and OpenMPI
run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev

- name: Generate ptx/sass files and upload them to persistent storage
run: |
mkdir -p dev/cuda/ptx_sass_logs
make train_gpt2cu
cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx
cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass
cd dev/cuda
make -j all_ptx
make -j all_sass
cp *.ptx ptx_sass_logs/
cp *.sass ptx_sass_logs/
ls ptx_sass_logs/
- name: Generate ptx/sass files for A100 and upload them to persistent storage
run: |
mkdir -p dev/cuda/ptx_sass_logs_A100
make train_gpt2cu GPU_COMPUTE_CAPABILITY=80
cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx
cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass
cd dev/cuda
make -j GPU_COMPUTE_CAPABILITY=80 all_ptx
make -j GPU_COMPUTE_CAPABILITY=80 all_sass
cp *.ptx ptx_sass_logs_A100/
cp *.sass ptx_sass_logs_A100/
ls ptx_sass_logs_A100/
- name: Generate ptx/sass files for H100 and upload them to persistent storage
run: |
mkdir -p dev/cuda/ptx_sass_logs_H100
make train_gpt2cu GPU_COMPUTE_CAPABILITY=90
cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx
cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass
cd dev/cuda
make -j GPU_COMPUTE_CAPABILITY=90 all_ptx
make -j GPU_COMPUTE_CAPABILITY=90 all_sass
cp *.ptx ptx_sass_logs_H100/
cp *.sass ptx_sass_logs_H100/
ls ptx_sass_logs_H100/
- name: Upload ptx/sass files
uses: actions/upload-artifact@v4
with:
name: ptx_sass_files
path: dev/cuda/ptx_sass_logs/
retention-days: 30 # days to retain

- name: Upload ptx/sass files for A100
uses: actions/upload-artifact@v4
with:
name: ptx_sass_files_A100
path: dev/cuda/ptx_sass_logs_A100/
retention-days: 30 # days to retain

- name: Upload ptx/sass files for H100
uses: actions/upload-artifact@v4
with:
name: ptx_sass_files_H100
path: dev/cuda/ptx_sass_logs_H100/
retention-days: 30 # days to retain
36 changes: 25 additions & 11 deletions Makefile
@@ -188,27 +188,41 @@ else
endif
endif

# Check if OpenMPI and NCCL are available, include them if so, for multi-GPU training
# Check if NCCL is available, include if so, for multi-GPU training
ifeq ($(NO_MULTI_GPU), 1)
$(info → Multi-GPU (OpenMPI + NCCL) is manually disabled)
$(info → Multi-GPU (NCCL) is manually disabled)
else
ifneq ($(OS), Windows_NT)
# Detect if running on macOS or Linux
ifeq ($(SHELL_UNAME), Darwin)
$(info ✗ Multi-GPU on CUDA on Darwin is not supported, skipping OpenMPI + NCCL support)
else ifeq ($(shell [ -d /usr/lib/x86_64-linux-gnu/openmpi/lib/ ] && [ -d /usr/lib/x86_64-linux-gnu/openmpi/include/ ] && echo "exists"), exists)
$(info ✓ OpenMPI found, OK to train with multiple GPUs)
NVCC_INCLUDES += -I/usr/lib/x86_64-linux-gnu/openmpi/include
NVCC_LDFLAGS += -L/usr/lib/x86_64-linux-gnu/openmpi/lib/
NVCC_LDLIBS += -lmpi -lnccl
$(info ✗ Multi-GPU on CUDA on Darwin is not supported, skipping NCCL support)
else ifeq ($(shell dpkg -l | grep -q nccl && echo "exists"), exists)
$(info ✓ NCCL found, OK to train with multiple GPUs)
NVCC_FLAGS += -DMULTI_GPU
NVCC_LDLIBS += -lnccl
else
$(info ✗ OpenMPI is not found, disabling multi-GPU support)
$(info ---> On Linux you can try installing OpenMPI with `sudo apt install openmpi-bin openmpi-doc libopenmpi-dev`)
$(info ✗ NCCL is not found, disabling multi-GPU support)
$(info ---> On Linux you can try installing NCCL with `sudo apt install libnccl2 libnccl-dev`)
endif
endif
endif
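
# Example (illustrative): skip the NCCL check entirely and build single-GPU only
#   NO_MULTI_GPU=1 make train_gpt2cu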

# Attempt to find and include OpenMPI on the system
OPENMPI_DIR ?= /usr/lib/x86_64-linux-gnu/openmpi
OPENMPI_LIB_PATH = $(OPENMPI_DIR)/lib/
OPENMPI_INCLUDE_PATH = $(OPENMPI_DIR)/include/
ifeq ($(NO_USE_MPI), 1)
$(info → MPI is manually disabled)
else ifeq ($(shell [ -d $(OPENMPI_LIB_PATH) ] && [ -d $(OPENMPI_INCLUDE_PATH) ] && echo "exists"), exists)
$(info ✓ MPI enabled)
NVCC_INCLUDES += -I$(OPENMPI_INCLUDE_PATH)
NVCC_LDFLAGS += -L$(OPENMPI_LIB_PATH)
NVCC_LDLIBS += -lmpi
NVCC_FLAGS += -DUSE_MPI
else
$(info ✗ MPI not found)
endif
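
# Example (illustrative): point the build at a non-default OpenMPI install, or disable MPI
#   make train_gpt2cu OPENMPI_DIR=/opt/openmpi
#   NO_USE_MPI=1 make train_gpt2cu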

# Precision settings, default to bf16 but ability to override
PRECISION ?= BF16
VALID_PRECISIONS := FP32 FP16 BF16
@@ -266,5 +280,5 @@ profile_gpt2cu: profile_gpt2.cu $(NVCC_CUDNN)
$(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)

clean:
$(REMOVE_FILES) $(TARGETS)
$(REMOVE_BUILD_OBJECT_FILES)
48 changes: 39 additions & 9 deletions README.md
@@ -13,28 +13,34 @@ debugging tip: when you run the `make` command to build the binary, modify it by
If you won't be training on multiple nodes, aren't interested in mixed precision, and are interested in learning CUDA, the fp32 (legacy) files might be of interest to you. These are files that were "checkpointed" early in the history of llm.c and frozen in time. They are simpler, more portable, and possibly easier to understand. Run the 1 GPU, fp32 code like this:

```bash
pip install -r requirements.txt
python dev/data/tinyshakespeare.py
python train_gpt2.py
chmod u+x ./dev/download_starter_pack.sh
./dev/download_starter_pack.sh
make train_gpt2fp32cu
./train_gpt2fp32cu
```

The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C/CUDA and train for one epoch on tinyshakespeare with AdamW (using batch size 4, context length 1024, a total of 74 steps), evaluate validation loss, and sample some text.
The download_starter_pack.sh script is a quick & easy way to get started and it downloads a bunch of .bin files that help get you off the ground. These contain: 1) the GPT-2 124M model saved in fp32 and in bfloat16, 2) a "debug state" used in unit testing (a small batch of data, and target activations and gradients), 3) the GPT-2 tokenizer, and 4) the tokenized [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset. Alternatively, instead of running the .sh script, you can re-create these artifacts manually as follows:

```bash
pip install -r requirements.txt
python dev/data/tinyshakespeare.py
python train_gpt2.py
```

## quick start (CPU)

The "I am so GPU poor that I don't even have one GPU" section. You can still enjoy seeing llm.c train! But you won't go too far. Just like the fp32 version above, the CPU version is an even earlier checkpoint in the history of llm.c, back when it was just a simple reference implementation in C. For example, instead of training from scratch, you can finetune a GPT-2 small (124M) to output Shakespeare-like text, as an example:

```bash
pip install -r requirements.txt
python dev/data/tinyshakespeare.py
python train_gpt2.py
chmod u+x ./dev/download_starter_pack.sh
./dev/download_starter_pack.sh
make train_gpt2
OMP_NUM_THREADS=8 ./train_gpt2
```

The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C and train for 40 steps on tinyshakespeare with AdamW (using batch size 4, context length only 64), evaluate validation loss, and sample some text. Honestly, unless you have a beefy CPU (and can crank up the number of OMP threads in the launch command), you're not going to get that far on CPU training LLMs, but it might be a good demo/reference. The output looks like this on my MacBook Pro (Apple Silicon M3 Max):
If you'd prefer to avoid running the starter pack script, then, as mentioned in the previous section, you can reproduce the exact same .bin files and artifacts by running `python dev/data/tinyshakespeare.py` and then `python train_gpt2.py`.

The above lines (1) download an already tokenized [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset along with the GPT-2 (124M) weights, and (2) init from them in C and train for 40 steps on tinyshakespeare with AdamW (using batch size 4, context length only 64), evaluate validation loss, and sample some text. Honestly, unless you have a beefy CPU (and can crank up the number of OMP threads in the launch command), you're not going to get that far on CPU training LLMs, but it might be a good demo/reference. The output looks like this on my MacBook Pro (Apple Silicon M3 Max):

```
[GPT-2]
@@ -128,19 +134,40 @@ sudo apt-get -y install libcudnn9-dev-cuda-12

On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just a set of header files. Simply clone the repo to your disk. The Makefile currently looks for it in either your home directory or the current directory. If you have put it elsewhere, add `CUDNN_FRONTEND_PATH=/path/to/your/cudnn-frontend/include` to the `make` command-line.
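
For example, a minimal sketch (the clone location below is illustrative; adjust it to wherever you keep the repo):

```bash
# clone the header-only cuDNN frontend somewhere on disk
git clone https://github.com/NVIDIA/cudnn-frontend.git ~/cudnn-frontend
# point the Makefile at it and build with cuDNN enabled
USE_CUDNN=1 CUDNN_FRONTEND_PATH=~/cudnn-frontend/include make train_gpt2cu
```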

**multi-GPU training**. As of April 26, 2024 there is now also support for multi-GPU training using MPI and NCCL. Make sure you install MPI, e.g. on Linux:
## multi-GPU training

Make sure you install MPI and NCCL, e.g. on Linux:

```bash
sudo apt install openmpi-bin openmpi-doc libopenmpi-dev
```

For NCCL, follow the instructions from the [official website](https://developer.nvidia.com/nccl/nccl-download) (e.g. the network installer).
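
On Ubuntu, the apt packages that the Makefile hints at are another option (an illustrative sketch; the official installer above is the canonical route):

```bash
sudo apt install libnccl2 libnccl-dev
```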

and then:

```bash
make train_gpt2cu
mpirun -np <number of GPUs> ./train_gpt2cu
```

or simply run one of our scripts under `./scripts/`.

## multi-node training

Make sure you've installed `NCCL` following the instructions from the [multi-GPU](#multi-gpu-training) section.

We currently support 3 ways of running multi-node training:
1) Use OpenMPI to exchange the NCCL ID and initialize NCCL. See e.g. the `./scripts/multi_node/run_gpt2_124M_mpi.sh` script for details.
2) Use a shared file system to init NCCL. See the `./scripts/multi_node/run_gpt2_124M_fs.sbatch` script for details.
3) Use TCP sockets to init NCCL. See the `./scripts/multi_node/run_gpt2_124M_tcp.sbatch` script for details.

Note:
* If you're running in a slurm environment and your slurm doesn't support PMIx (which we assume will be a common situation given that `slurm-wlm` dropped PMIx support), you will have to use the FS (2) or TCP (3) approach. To test whether your slurm supports PMIx, run `srun --mpi=list` and see whether you get `pmix` in the output.
* If you don't have slurm set up, you can kick off a multi-node run using `mpirun` - MPI (1).

None of these 3 methods is superior; we just offer you options so that you can run in your specific environment.
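
As a rough sketch of option (1), assuming passwordless SSH between the nodes and a hypothetical `hostfile` listing them (the node and GPU counts below are purely illustrative):

```bash
make train_gpt2cu
# 2 nodes x 8 GPUs = 16 ranks; hostfile and counts are illustrative
mpirun -np 16 --hostfile hostfile ./train_gpt2cu
```

See `./scripts/multi_node/run_gpt2_124M_mpi.sh` for a complete example.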

## experiments / sweeps

Just as an example process to sweep learning rates on a machine with 4 GPUs on TinyStories. Run a shell script `sweep.sh` (after you of course `chmod u+x sweep.sh`):
@@ -198,6 +225,9 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p
- Mojo
- [llm.🔥](https://github.com/dorjeduck/llm.mojo) by @[dorjeduck](https://github.com/dorjeduck): a Mojo port of this project

- OpenCL
- [llm.c](https://github.com/krrishnarraj/llm.c) by @[krrishnarraj](https://github.com/krrishnarraj): an OpenCL port of this project

- Rust
- [llm.rs](https://github.com/yijunyu/llm.rs) by @[Yijun Yu](https://github.com/yijunyu): a Rust rewrite with the aim to have the same performance
- [llm.rs](https://github.com/ToJen/llm.rs) by @[ToJen](https://github.com/ToJen): a Rust port of this project
26 changes: 24 additions & 2 deletions dev/cuda/Makefile
@@ -8,8 +8,20 @@ ifeq ($(NVCC),)
$(error nvcc not found.)
endif

ifneq ($(CI),true) # if not in CI, then use the GPU query
ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY=
GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) # assume if NVCC is present, then this likely is too
GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY))
endif
endif

# Compiler flags
CFLAGS = -O3 --use_fast_math
ifeq ($(GPU_COMPUTE_CAPABILITY),) # set to defaults if: make GPU_COMPUTE_CAPABILITY=
CFLAGS = -O3 --use_fast_math
else
CFLAGS = -O3 --use_fast_math --generate-code arch=compute_$(GPU_COMPUTE_CAPABILITY),code=[compute_$(GPU_COMPUTE_CAPABILITY),sm_$(GPU_COMPUTE_CAPABILITY)]
endif
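
# Example (illustrative): override the detected capability, e.g. to target A100 (sm_80)
#   make GPU_COMPUTE_CAPABILITY=80 attention_forward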

NVCCFLAGS = -lcublas -lcublasLt -std=c++17
MPI_PATHS = -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/

@@ -20,6 +32,8 @@ MPI_PATHS = -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-
# Build all targets
TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward fused_residual_forward global_norm
all: $(TARGETS)
all_ptx: $(TARGETS:%=%.ptx)
all_sass: $(TARGETS:%=%.sass)

# Individual targets: forward pass
attention_forward: attention_forward.cu
@@ -54,6 +68,14 @@ global_norm: global_norm.cu
nccl_all_reduce: nccl_all_reduce.cu
$(NVCC) -lmpi -lnccl $(NVCCFLAGS) $(MPI_PATHS) nccl_all_reduce.cu -o nccl_all_reduce

# Generate PTX using cuobjdump
%.ptx: %
cuobjdump --dump-ptx $< > $@

# Generate SASS using cuobjdump
%.sass: %
cuobjdump --dump-sass $< > $@
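
# Example (illustrative): dump PTX/SASS for a single kernel, or for all targets
#   make attention_forward.ptx attention_forward.sass
#   make -j all_ptx
#   make -j all_sass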

# Run all targets
run_all: all
@for target in $(TARGETS); do \
@@ -65,4 +87,4 @@ run_all: all

# Clean up
clean:
rm -f $(TARGETS)
rm -f $(TARGETS) *.ptx *.sass