[CI] Adjust how the tests are run in Slurm (#600)
* Adjust how the tests are run in CI

* Fix schedule/auto_parallelize and adapt its test to A100

* Remove pybind11-stubgen from the CI environment because it conflicts with pyproject unless --no-build-isolation is used
roastduck authored Mar 14, 2024 · 1 parent: 2e285e9 · commit: 87bfe21
Showing 6 changed files with 40 additions and 13 deletions.
34 changes: 28 additions & 6 deletions .github/workflows/main.yml
@@ -37,9 +37,7 @@ jobs:
           source /opt/spack/share/spack/setup-env.sh
           spack load [email protected]%[email protected] [email protected]/ehz25ml [email protected]/uopt2y4 [email protected] java@11 [email protected]
           source ci-script/prepare-python-environment.sh
-          # Set OMP_PROC_BIND to make OpenMP happy for 30.schedule/test_auto_fission_fuse.py::test_tune_fission
-          # Setting OMP_NUM_THREADS=256 seems to work around the conflict of PyTorch
-          OMP_NUM_THREADS=256 OMP_PROC_BIND=true srun --exclusive -N 1 -p ja --gres=gpu:v100:1 pytest --color=yes test
+          srun -N 1 -c 64 -p octave --gres=gpu:a100:1 pytest --color=yes -m "not performance_sensitive" test
   build-and-test-gcc-minimal-run_in_tree:
     runs-on: self-hosted
     if: github.event.pull_request.draft == false
@@ -65,8 +63,7 @@ jobs:
           source /opt/spack/share/spack/setup-env.sh
           spack load [email protected]%[email protected] java@11 [email protected]
           source ci-script/prepare-python-environment.sh
-          # Set OMP_PROC_BIND to make OpenMP happy for 30.schedule/test_auto_fission_fuse.py::test_tune_fission
-          OMP_PROC_BIND=true PYTHONPATH=build:python:$PYTHONPATH srun --exclusive -N 1 -p ja pytest --color=yes test
+          PYTHONPATH=build:python:$PYTHONPATH srun -N 1 -c 64 -p ja pytest --color=yes -m "not performance_sensitive" test
   build-and-test-clang-run-in-tree:
     runs-on: self-hosted
     if: github.event.pull_request.draft == false
@@ -92,5 +89,30 @@ jobs:
           source /opt/spack/share/spack/setup-env.sh
           spack load [email protected]%[email protected] java@11 llvm@16%gcc@12
           source ci-script/prepare-python-environment.sh
-          # Set OMP_PROC_BIND to make OpenMP happy for 30.schedule/test_auto_fission_fuse.py::test_tune_fission
-          OMP_PROC_BIND=true PYTHONPATH=build:python:$PYTHONPATH srun --exclusive -N 1 -p ja pytest --color=yes test
+          PYTHONPATH=build:python:$PYTHONPATH srun -N 1 -c 64 -p ja pytest --color=yes -m "not performance_sensitive" test
+  build-and-test-gcc-cuda-mkl-exclusively:
+    runs-on: self-hosted
+    if: github.event.pull_request.draft == false
+    steps:
+      - uses: roastduck/checkout@main
+        with:
+          ssh-key: ${{ secrets.CI }}
+          submodules: true
+          fetch-depth: 0
+      - name: Build ffi module in Release
+        run: |
+          git submodule foreach --recursive git clean -ffdx
+          git submodule foreach --recursive git reset --hard
+          source /opt/spack/share/spack/setup-env.sh
+          spack load [email protected]%[email protected] [email protected]/ehz25ml [email protected]/uopt2y4 [email protected] java@11 [email protected]
+          source ci-script/prepare-python-environment.sh
+          # -C requires a new enough pip
+          pip3 install --upgrade pip
+          pip3 install . -C--local=with-cuda.toml -C--local=ci-script/with-spack-mkl.toml
+      - name: Run PyTest
+        run: |
+          source /opt/spack/share/spack/setup-env.sh
+          spack load [email protected]%[email protected] [email protected]/ehz25ml [email protected]/uopt2y4 [email protected] java@11 [email protected]
+          source ci-script/prepare-python-environment.sh
+          # Set OMP_PROC_BIND to make OpenMP happy for 30.schedule/test_auto_fission_fuse.py::test_tune_fission
+          OMP_PROC_BIND=true srun --exclusive=user -N 1 -c 256 -p ja --gres=gpu:v100:1 pytest --color=yes -m "performance_sensitive" test
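A note on this workflow change (my reading of the diff, not part of the commit message): the ordinary jobs now run on shared Slurm allocations (`-c 64`, no `--exclusive`) and skip tests marked `performance_sensitive`, while the new `build-and-test-gcc-cuda-mkl-exclusively` job runs only those tests under `srun --exclusive=user`, which in Slurm shares the node with at most the same user's jobs. The `-C` options passed to `pip3 install` are pip's shorthand for `--config-settings` (hence the preceding pip upgrade), and the `--local=...toml` values appear to point the py-build-cmake backend at extra config files enabling CUDA and Spack-provided MKL.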
3 changes: 3 additions & 0 deletions pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+markers =
+    performance_sensitive: These tests should be run on an exclusively dedicated node
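For context, the new marker plugs into pytest's `-m` selection: tests decorated with it can be selected or deselected by a marker expression, and registering it here avoids pytest's unknown-marker warning (and an error under --strict-markers). A minimal sketch with hypothetical test names, not from this repository:

    import time

    import pytest


    @pytest.mark.performance_sensitive
    def test_autotune_latency():
        # Timing-sensitive: only meaningful on a quiet, dedicated node.
        start = time.perf_counter()
        sum(range(1_000_000))
        assert time.perf_counter() - start < 1.0


    def test_correctness():
        # Ordinary functional test: safe to run on a shared node.
        assert sum(range(4)) == 6

Running `pytest -m "not performance_sensitive"` collects only the second test, while `pytest -m "performance_sensitive"` collects only the first — the same split the workflow above relies on.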
1 change: 0 additions & 1 deletion requirements.txt
@@ -25,7 +25,6 @@ numpy==1.24.3
 packaging==23.1
 pluggy==1.0.0
 py_build_cmake==0.1.8
-pybind11-stubgen==0.13.0
 Pygments==2.15.1
 pymdown-extensions==10.0
 pytest==7.3.1
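This pin is the conflict called out in the commit message: pip builds the project in an isolated environment resolved from pyproject.toml, so a pybind11-stubgen version pinned here can disagree with the one the build resolves unless `pip install --no-build-isolation` is used. (My paraphrase; the exact conflict is not shown in this diff.)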
2 changes: 1 addition & 1 deletion src/schedule/auto_parallelize.cc
@@ -570,7 +570,7 @@ void Schedule::autoParallelize(const Ref<Target> &target) {
     });

     // III b. Reduction
-    if (!needParRed) {
+    if (localParaAll.size() == localParaNoRed.size() || !needParRed) {
         commitTransaction();
     } else {
         abortTransaction();
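As I read the new condition, the transaction is now also committed when the reduction-aware pass parallelized nothing beyond what the reduction-free pass already covered (`localParaAll.size() == localParaNoRed.size()`), rather than aborting and discarding those schedules; previously it committed only when parallelizing reductions was not requested (`!needParRed`).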
3 changes: 3 additions & 0 deletions test/31.auto_schedule/test_auto_fission_fuse.py
@@ -90,6 +90,7 @@ def test_stmt_in_between_2():
     assert logs == ["swap(L2, S1)", "fuse(L1, L2, true)"]


+@pytest.mark.performance_sensitive
 def test_tune_fuse():
     # We may fuse these loops. But fusing them will make it impossible to parallelize.
     # After tuning, we will end up in not fusing them
@@ -133,6 +134,7 @@ def test_tune_fuse():
     assert "fuse" not in log


+@pytest.mark.performance_sensitive
 def test_tune_fission():
     # The reverse schedule of `test_tune_fuse`

@@ -178,6 +180,7 @@ def test_tune_fission():
     assert "fission" in ", ".join(logs)


+@pytest.mark.performance_sensitive
 @pytest.mark.skipif(not ft.with_cuda(), reason="requires CUDA")
 def test_tune_with_cond():
     # Fuse loops that can parallelize. Don't fuse loops that can't
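These three tests are the ones gated behind the new marker; each runs the autotuner, which measures real execution time to choose a schedule, so results on a busy shared node would be noisy — hence the exclusively dedicated node described in pytest.ini.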
10 changes: 5 additions & 5 deletions test/31.auto_schedule/test_auto_parallelize.py
@@ -51,10 +51,10 @@ def test_3_levels():

 @pytest.mark.skipif(not ft.with_cuda(), reason="requires CUDA")
 def test_gpu_basic_static_small():
-    with ft.VarDef([("x", (10, 10, 2), "int32", "input", "cpu"),
-                    ("y", (10, 10, 2), "int32", "output", "cpu")]) as (x, y):
-        with ft.For("i", 0, 10, label="Li") as i:
-            with ft.For("j", 0, 10, label="Lj") as j:
+    with ft.VarDef([("x", (20, 20, 2), "int32", "input", "cpu"),
+                    ("y", (20, 20, 2), "int32", "output", "cpu")]) as (x, y):
+        with ft.For("i", 0, 20, label="Li") as i:
+            with ft.For("j", 0, 20, label="Lj") as j:
                 y[i, j, 0] = x[i, j, 0] + 1

     device = ft.GPU()
@@ -68,7 +68,7 @@ def test_gpu_basic_static_small():
     logs = list(map(str, s.logs()))
     print(logs)
     assert fnmatch_list(logs, [
-        f'split(Lj, -1, {num_sm // 10}, 0)', 'merge(Li, $split.0{Lj})',
+        f'split(Lj, -1, {num_sm // 20}, 0)', 'merge(Li, $split.0{Lj})',
         'parallelize($merge{Li, $split.0{Lj}}, blockIdx.x, *)',
         'parallelize($split.1{Lj}, threadIdx.y, *)'
     ])
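The doubled loop sizes track the commit message's "adapt its test to A100": the expected split factor becomes `num_sm // 20` instead of `num_sm // 10`, so the schedule the test asserts still divides sensibly on the new runner's SM count. (An A100 has 108 SMs versus 80 on a V100 — those counts are my addition, not stated in the diff.)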