[AMD] Remove stream pipeliner v1 (#4845)

We flipped stream pipeliner v2 on as the default quite some time ago, and all known issues have been fixed, so now remove the old v1 pipeliner. Note that this changes how `num_stages` is handled: previously we enabled pipelining if `num_stages` was `0`, which really is not good behavior. We have now switched to follow the common practice where `0`/`1` won't trigger pipelining anymore; `2` or more is needed to trigger it. Given that downstream users might be using `0` in their codebases, right now we `assert` to give developers a clear indication of the switch in behavior instead of silently dropping the perf. The `assert` is expected to be dropped sometime down the line. --------- Co-authored-by: Lei Zhang <[email protected]>
triton-lang · Oct 18, 2024 · 76ed94d · 76ed94d
1 parent d4e5a78
commit 76ed94d
Show file tree

Hide file tree

Showing 8 changed files with 8 additions and 982 deletions.
diff --git a/bin/RegisterTritonDialects.h b/bin/RegisterTritonDialects.h
@@ -60,7 +60,6 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerTritonAMDGPUAccelerateMatmul();
   mlir::registerTritonAMDGPUOptimizeEpilogue();
   mlir::registerTritonAMDGPUReorderInstructions();
-  mlir::registerTritonAMDGPUStreamPipeline();
   mlir::registerTritonAMDGPUStreamPipelineV2();
   mlir::registerTritonAMDGPUCanonicalizePointers();
 

diff --git a/test/TritonGPU/amd/amd-loop-pipeline-v1.mlir b/test/TritonGPU/amd/amd-loop-pipeline-v1.mlir
diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py
@@ -29,7 +29,7 @@ def min_dot_size(target: GPUTarget):
 class HIPOptions:
     num_warps: int = 4
     waves_per_eu: int = 1
-    num_stages: int = 0
+    num_stages: int = 2
     num_ctas: int = 1
     extern_libs: dict = None
     cluster_dims: tuple = (1, 1, 1)
@@ -215,23 +215,19 @@ def make_ttgir(mod, metadata, options):
         passes.ttgpuir.add_remove_layout_conversions(pm)
         amd.passes.ttgpuir.add_optimize_epilogue(pm)
         passes.ttgpuir.add_optimize_dot_operands(pm, True)
-        use_new_pipeliner = os.getenv("TRITON_HIP_USE_NEW_STREAM_PIPELINE", "1") == "1"
         if amd.has_matrix_core_feature(options.arch):
-            if use_new_pipeliner:
-                # In the old pipeliner we only support num_stages = 0/1, which means something
-                # different than the NVIDIA side. In the new pipeliner we unify the num_stages
-                # interpretation. Default to use 2 stages if not explicitly set.
-                num_stages = options.num_stages if options.num_stages != 0 else 2
-                amd.passes.ttgpuir.add_stream_pipelinev2(pm, num_stages)
-            else:
-                if options.num_stages == 0:
-                    amd.passes.ttgpuir.add_stream_pipeline(pm)
+            assert options.num_stages != 0, ("Triton AMD backend pipeliner has been updated. "
+                                             "We used to trigger software pipelining with "
+                                             "num_stages == 0. Now it will not happen anymore; "
+                                             "please update to use num_stages == 2 for "
+                                             "equivalent behavior in the past.")
+            amd.passes.ttgpuir.add_stream_pipelinev2(pm, options.num_stages)
             passes.common.add_canonicalizer(pm)
         amd.passes.ttgpuir.insert_instruction_sched_hints(pm)
         passes.ttgpuir.add_optimize_dot_operands(pm, True)
         passes.ttgpuir.add_remove_layout_conversions(pm)
         passes.ttgpuir.add_reduce_data_duplication(pm)
-        if use_new_pipeliner or options.num_stages != 0:
+        if amd.has_matrix_core_feature(options.arch):
             amd.passes.ttgpuir.add_reorder_instructions(pm)
         amd.passes.ttgpuir.add_canonicalize_pointers(pm)
         passes.common.add_canonicalizer(pm)

diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.h b/third_party/amd/include/TritonAMDGPUTransforms/Passes.h
@@ -6,8 +6,6 @@
 
 namespace mlir {
 
-std::unique_ptr<Pass> createTritonAMDGPUStreamPipelinePass();
-
 std::unique_ptr<Pass> createTritonAMDGPUStreamPipelineV2Pass(int numStages = 2);
 
 std::unique_ptr<Pass>

diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
@@ -3,19 +3,6 @@
 
 include "mlir/Pass/PassBase.td"
 
-def TritonAMDGPUStreamPipeline : Pass<"tritonamdgpu-stream-pipeline", "mlir::ModuleOp"> {
-  let summary = "pipeline";
-
-  let description = [{
-    Pipeline global loads through registers to shared memory while computing on previous
-    tile
-  }];
-
-  let constructor = "mlir::createTritonAMDGPUStreamPipelinePass()";
-
-  let dependentDialects = [];
-}
-
 def TritonAMDGPUStreamPipelineV2 : Pass<"tritonamdgpu-stream-pipeline-v2", "mlir::ModuleOp"> {
   let summary = "pipeline";
 

diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt b/third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt
@@ -3,7 +3,6 @@ add_triton_library(TritonAMDGPUTransforms
   CanonicalizePointers.cpp
   OptimizeEpilogue.cpp
   ReorderInstructions.cpp
-  StreamPipeline.cpp
   StreamPipelineV2.cpp
   MfmaGroup.cpp