Merge remote-tracking branch 'upstream/main' into zsl/vllm-spmd
zhangshulai committed Jan 27, 2025
2 parents bc689b6 + 12b0b59 commit 6f55342
Showing 73 changed files with 1,837 additions and 399 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/dataset.yml
@@ -16,6 +16,8 @@ on:
- "**/*.py"
- .github/workflows/dataset.yml



jobs:
ray:
runs-on: [self-hosted, gpu]
2 changes: 2 additions & 0 deletions .github/workflows/e2e_digit_completion.yml
@@ -17,6 +17,8 @@ on:
- .github/workflows/e2e_digit_completion.yml
- "tests/e2e/*.sh"



jobs:
e2e_digit_completion:
runs-on: [self-hosted, l20-0]
2 changes: 2 additions & 0 deletions .github/workflows/e2e_gsm8k.yml
@@ -17,6 +17,8 @@ on:
- .github/workflows/e2e_gsm8k.yml
- "tests/e2e/*.sh"



jobs:
e2e_gsm8k:
runs-on: [self-hosted, l20-1]
48 changes: 48 additions & 0 deletions .github/workflows/e2e_lora.yml
@@ -0,0 +1,48 @@
name: e2e_lora

on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
paths:
- "**/*.py"
- .github/workflows/e2e_lora.yml
pull_request:
branches:
- main
paths:
- "**/*.py"
- .github/workflows/e2e_lora.yml
- "tests/e2e/*.sh"



jobs:
e2e_lora:
runs-on: [self-hosted, l20-1]
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1"
HF_HUB_ENABLE_HF_TRANSFER: 1
container:
image: verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install hf_transfer peft
pip3 install -e .[test]
- name: Prepare gsm8k dataset
run: |
ray stop --force
python3 examples/data_preprocess/gsm8k.py
- name: Running gsm8k e2e training tests with LoRA
run: |
ray stop --force
bash tests/sft/run_sft_qwen05_peft.sh 8 $HOME/ckpts/
56 changes: 56 additions & 0 deletions .github/workflows/e2e_sft.yml
@@ -0,0 +1,56 @@
name: e2e_sft

on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
paths:
- "**/*.py"
- .github/workflows/e2e_sft.yml
pull_request:
branches:
- main
paths:
- "**/*.py"
- .github/workflows/e2e_sft.yml
- "tests/e2e/*.sh"



jobs:
e2e_sft:
runs-on: [self-hosted, l20-1]
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1"
HF_HUB_ENABLE_HF_TRANSFER: 1
container:
image: verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install hf_transfer
pip3 install -e .[test]
- name: Prepare gsm8k dataset
run: |
ray stop --force
python3 examples/data_preprocess/gsm8k.py
- name: Running gsm8k e2e training tests on 8 L20 GPUs with rmpad using function rm
run: |
ray stop --force
bash tests/sft/run_sft.sh
- name: Running gsm8k e2e training tests on 8 L20 GPUs with sequence parallelism
run: |
ray stop --force
bash examples/sft/gsm8k/run_qwen_05_sp2.sh 8 $HOME/ckpts/
- name: Check loss difference between sequence parallel vs. default implementation
run: |
ray stop --force
bash tests/sft/run_sft_sp_loss_match.sh
2 changes: 2 additions & 0 deletions .github/workflows/model.yml
@@ -16,6 +16,8 @@ on:
- "**/*.py"
- .github/workflows/model.yml



jobs:
model_rmpad:
runs-on: [self-hosted, l20-1]
2 changes: 2 additions & 0 deletions .github/workflows/ray_test.yml
@@ -16,6 +16,8 @@ on:
- "**/*.py"
- .github/workflows/ray_test.yml



jobs:
ray:
runs-on: [self-hosted, l20-0]
16 changes: 11 additions & 5 deletions README.md
@@ -39,12 +39,15 @@ veRL is fast with:
- **vLLM** and **TGI** for rollout generation, **SGLang** support coming soon.
- huggingface models support
- Supervised fine-tuning
- Reward model training
- Reinforcement learning from human feedback with PPO
- flash-attention integration, sequence packing, and long context support
- Reinforcement learning from human feedback with [PPO](https://github.com/volcengine/verl/tree/main/examples/ppo_trainer) and [GRPO](https://github.com/volcengine/verl/tree/main/examples/grpo_trainer)
- Supports model-based rewards and function-based rewards (verifiable rewards)
- flash-attention integration, sequence packing, and long context support via DeepSpeed Ulysses
- scales up to 70B models and hundreds of GPUs
- experiment tracking with wandb and mlflow

## Upcoming Features
- Reward model training
- DPO training

## Getting Started

@@ -54,7 +57,7 @@ Checkout this [Jupyter Notebook](https://github.com/volcengine/verl/tree/main/ex
- [Installation](https://verl.readthedocs.io/en/latest/start/install.html)
- [Quickstart](https://verl.readthedocs.io/en/latest/start/quickstart.html)

**Running an PPO example step-by-step:**
**Running a PPO example step-by-step:**
- Data and Reward Preparation
- [Prepare Data (Parquet) for Post-Training](https://verl.readthedocs.io/en/latest/preparation/prepare_data.html)
- [Implement Reward Function for Dataset](https://verl.readthedocs.io/en/latest/preparation/reward_function.html)
@@ -77,6 +80,8 @@ Checkout this [Jupyter Notebook](https://github.com/volcengine/verl/tree/main/ex
- [Add models with the FSDP backend](https://verl.readthedocs.io/en/latest/advance/fsdp_extension.html)
- [Add models with the Megatron-LM backend](https://verl.readthedocs.io/en/latest/advance/megatron_extension.html)

## Performance Tuning Guide
Performance is essential for on-policy RL algorithms. We provide a detailed performance tuning guide to help users tune their workloads. See [here](https://verl.readthedocs.io/en/latest/perf/perf_tuning.html) for more details.

## Citation and acknowledgement

@@ -95,9 +100,10 @@ If you find the project helpful, please cite:

verl is inspired by the design of Nemo-Aligner, Deepspeed-chat and OpenRLHF. The project is adopted and supported by Anyscale, Bytedance, LMSys.org, Shanghai AI Lab, Tsinghua University, UC Berkeley, UCLA, UIUC, and University of Hong Kong.

## Publications Using veRL
## Awesome work using veRL
- [Enhancing Multi-Step Reasoning Abilities of Language Models through Direct Q-Function Optimization](https://arxiv.org/abs/2410.09302)
- [Flaming-hot Initiation with Regular Execution Sampling for Large Language Models](https://arxiv.org/abs/2410.21236)
- [Process Reinforcement Through Implicit Rewards](https://github.com/PRIME-RL/PRIME/)
- [TinyZero](https://github.com/Jiayi-Pan/TinyZero): a reproduction of DeepSeek R1 Zero in countdown and multiplication tasks

We are HIRING! Send us an [email](mailto:[email protected]) if you are interested in internship/FTE opportunities in MLSys/LLM reasoning/multimodal alignment.
156 changes: 93 additions & 63 deletions docs/examples/config.rst
@@ -59,60 +59,79 @@ Actor/Rollout/Reference Policy
.. code:: yaml
actor_rollout_ref:
hybrid_engine: True
model:
path: ~/models/deepseek-llm-7b-chat
external_lib: null
override_config: {}
enable_gradient_checkpointing: False
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 256
ppo_micro_batch_size: 64
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
ppo_epochs: 1
shuffle: True
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
total_training_steps: -1 # must be override by program
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
grad_offload: False
optimizer_offload: False
ref:
fsdp_config:
param_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: 128
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.5
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor # or dummy_hf or dummy_megatron
tensor_model_parallel_size: 2
max_num_batched_tokens: 8192
max_num_seqs: 1024
log_prob_micro_batch_size: 128
# for vllm and hf rollout
do_sample: True
hybrid_engine: True
model:
path: ~/models/deepseek-llm-7b-chat
external_lib: null
override_config: { }
enable_gradient_checkpointing: False
use_remove_padding: False
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 256
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 8
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
use_kl_loss: False # True for GRPO
kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
total_training_steps: -1 # must be overridden by the program
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
grad_offload: False
optimizer_offload: False
fsdp_size: -1
ref:
fsdp_config:
param_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 16
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
prompt_length: ${data.max_prompt_length} # not used in the open-source version
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.5
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 2
max_num_batched_tokens: 8192
max_num_seqs: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 16
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
# for hf rollout
do_sample: True
# number of responses (i.e. num sample times)
n: 1 # > 1 for grpo
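The keys above can also be overridden from the command line in Hydra style, as done in the verl example scripts. A minimal launch sketch, assuming a single 8-GPU node and a GSM8K dataset prepared under ``$HOME/data/gsm8k`` (the paths and values are illustrative, not prescriptive):

.. code:: bash

   python3 -m verl.trainer.main_ppo \
       data.train_files=$HOME/data/gsm8k/train.parquet \
       data.val_files=$HOME/data/gsm8k/test.parquet \
       actor_rollout_ref.model.path=~/models/deepseek-llm-7b-chat \
       actor_rollout_ref.actor.ppo_mini_batch_size=256 \
       actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
       actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
       actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
       trainer.n_gpus_per_node=8 \
       trainer.nnodes=1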
**Common config for actor, rollout and reference model**

@@ -136,11 +155,15 @@ Actor/Rollout/Reference Policy

- ``actor_rollout_ref.actor.ppo_mini_batch_size``: One sample is split
into multiple sub-batches with batch_size=ppo_mini_batch_size for PPO
updates
updates. The ppo_mini_batch_size is a global size across all workers/GPUs

- ``actor_rollout_ref.actor.ppo_micro_batch_size``: [Will be deprecated, use ppo_micro_batch_size_per_gpu]
  Similar to gradient accumulation, the micro batch size for one forward pass,
  trading speed for GPU memory. The value represents the global size across all GPUs.

- ``actor_rollout_ref.actor.ppo_micro_batch_size``: Similar to gradient
accumulation, the micro_batch_size for one forward pass, trading speed
for GPU memory
- ``actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu``: Similar to gradient
  accumulation, the micro batch size per GPU for one forward pass, trading speed
  for GPU memory. The value represents the local size per GPU (see the sketch below).

- ``actor_rollout_ref.actor.grad_clip``: Gradient clipping for actor
updates
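To make the global vs. per-GPU distinction concrete, here is a minimal sketch of the implied gradient-accumulation arithmetic (the shell variables and the 8-GPU setup are illustrative, not verl config keys):

.. code:: bash

   # Illustrative only: how a global PPO mini-batch maps onto per-GPU micro-batches.
   WORLD_SIZE=8                     # total GPUs across all workers
   PPO_MINI_BATCH_SIZE=256          # global batch consumed per PPO update
   PPO_MICRO_BATCH_SIZE_PER_GPU=8   # per-GPU batch for a single forward/backward pass
   SAMPLES_PER_GPU=$((PPO_MINI_BATCH_SIZE / WORLD_SIZE))                 # 32
   GRAD_ACCUM_STEPS=$((SAMPLES_PER_GPU / PPO_MICRO_BATCH_SIZE_PER_GPU))  # 4
   echo "each GPU processes ${SAMPLES_PER_GPU} samples per update in ${GRAD_ACCUM_STEPS} micro-batches"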
@@ -176,8 +199,12 @@ Actor/Rollout/Reference Policy
- ``actor_rollout_ref.ref``: FSDP config same as actor. **For models
larger than 7B, it's recommended to turn on offload for ref by
default**
- ``actor_rollout_ref.ref.log_prob_micro_batch_size``: The batch size
for one forward pass in the computation of ``ref_log_prob``.

- ``actor_rollout_ref.ref.log_prob_micro_batch_size``: [Will be deprecated, use log_prob_micro_batch_size_per_gpu]
  The batch size for one forward pass in the computation of ``ref_log_prob``. The value represents the global size.

- ``actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu``: The batch size per GPU
  for one forward pass in the computation of ``ref_log_prob``. The value represents the local size per GPU.

**Rollout Model**

@@ -201,8 +228,11 @@ Actor/Rollout/Reference Policy
- ``tensor_model_parallel_size``: TP size for rollout. Only effective
for vllm.

- ``log_prob_micro_batch_size``: Micro_batch_size (The batch size for
one forward pass) for recalculating log_prob.
- ``log_prob_micro_batch_size``: [Will be deprecated, use log_prob_micro_batch_size_per_gpu]
  The batch size for one forward pass when recalculating ``log_prob``. The value represents the global size.

- ``log_prob_micro_batch_size_per_gpu``: Micro batch size per GPU (the batch size for
  one forward pass) for recalculating ``log_prob``. The value represents the local size per GPU.

- ``do_sample``: Whether to sample. If set to False, the rollout model
will perform greedy sampling. We disable ``do_sample`` during
@@ -260,7 +290,7 @@ Reward Model
fsdp_config:
min_num_params: 0
param_offload: False
micro_batch_size: 64
micro_batch_size_per_gpu: 16
max_length: null
- ``reward_model.enable``: Whether to enable reward model. If False, we