-
Notifications
You must be signed in to change notification settings - Fork 11.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Mark grid size loads with range metadata #113019
base: users/arsenm/amdgpu-attributor-propagate-amdgpu-max-num-workgroups
Are you sure you want to change the base?
AMDGPU: Mark grid size loads with range metadata #113019
Conversation
Warning This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesOnly handles the v5 case. Full diff: https://github.com/llvm/llvm-project/pull/113019.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 1bb5e794da7dd6..5fc0c36359b6f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
@@ -83,6 +84,20 @@ Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
} // end anonymous namespace
+static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
+ uint32_t MaxNumGroups) {
+ if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
+ return;
+
+ if (!Load->getType()->isIntegerTy(32))
+ return;
+
+ // TODO: If there is existing range metadata, preserve it if it is stricter.
+ MDBuilder MDB(Load->getContext());
+ MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
+ Load->setMetadata(LLVMContext::MD_range, Range);
+}
+
static bool processUse(CallInst *CI, bool IsV5OrAbove) {
Function *F = CI->getParent()->getParent();
@@ -92,7 +107,11 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
const bool HasUniformWorkGroupSize =
F->getFnAttribute("uniform-work-group-size").getValueAsBool();
- if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
+ SmallVector<unsigned> MaxNumWorkgroups =
+ AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", 3);
+
+ if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
+ none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
return false;
Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
@@ -133,16 +152,22 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
if (IsV5OrAbove) { // Base is ImplicitArgPtr.
switch (Offset) {
case HIDDEN_BLOCK_COUNT_X:
- if (LoadSize == 4)
+ if (LoadSize == 4) {
BlockCounts[0] = Load;
+ annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
+ }
break;
case HIDDEN_BLOCK_COUNT_Y:
- if (LoadSize == 4)
+ if (LoadSize == 4) {
BlockCounts[1] = Load;
+ annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
+ }
break;
case HIDDEN_BLOCK_COUNT_Z:
- if (LoadSize == 4)
+ if (LoadSize == 4) {
BlockCounts[2] = Load;
+ annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
+ }
break;
case HIDDEN_GROUP_SIZE_X:
if (LoadSize == 2)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 961a9220b48d6b..5a899b755419ae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -369,6 +369,7 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct
TM.getSubtarget<R600Subtarget>(F));
}
+// FIXME: This has no reason to be in subtarget
SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
new file mode 100644
index 00000000000000..9064292129928f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-lower-kernel-attributes %s | FileCheck %s
+
+define i32 @use_grid_size_x_max_num_workgroups() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0:![0-9]+]]
+; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+ ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0]]
+; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4, !range !0
+ ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_y_max_num_workgroups() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_y_max_num_workgroups(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GEP_GRID_SIZE_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 4
+; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Y]], align 4, !range [[RNG1:![0-9]+]]
+; CHECK-NEXT: ret i32 [[GRID_SIZE_Y]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.size.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 4
+ %grid.size.y = load i32, ptr addrspace(4) %gep.grid.size.y, align 4
+ ret i32 %grid.size.y
+}
+
+define i32 @use_grid_size_z_max_num_workgroups() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_z_max_num_workgroups(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GEP_GRID_SIZE_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 8
+; CHECK-NEXT: [[GRID_SIZE_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Z]], align 4, !range [[RNG2:![0-9]+]]
+; CHECK-NEXT: ret i32 [[GRID_SIZE_Z]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.size.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 8
+ %grid.size.z = load i32, ptr addrspace(4) %gep.grid.size.z, align 4
+ ret i32 %grid.size.z
+}
+
+define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type() #0 {
+; CHECK-LABEL: define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load <2 x i16>, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
+; CHECK-NEXT: ret <2 x i16> [[GRID_SIZE_X]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %grid.size.x = load <2 x i16>, ptr addrspace(4) %implicitarg.ptr, align 4
+ ret <2 x i16> %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_max_minus_1() #1 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max_minus_1(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG3:![0-9]+]]
+; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+ ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_max() #2 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
+; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+ ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_zero() #3 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_zero(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
+; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+ ret i32 %grid.size.x
+}
+
+declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3
+
+attributes #0 = { "amdgpu-max-num-workgroups"="36,42,89" }
+attributes #1 = { "amdgpu-max-num-workgroups"="4294967294,42,89" }
+attributes #2 = { "amdgpu-max-num-workgroups"="4294967295,42,89" }
+attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" }
+
+!0 = !{i32 0, i32 -1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="4294967294,42,89" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="4294967295,42,89" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[RNG0]] = !{i32 1, i32 37}
+; CHECK: [[RNG1]] = !{i32 1, i32 43}
+; CHECK: [[RNG2]] = !{i32 1, i32 90}
+; CHECK: [[RNG3]] = !{i32 1, i32 -1}
+;.
|
Only handles the v5 case.
2005b26
to
58a330c
Compare
297173f
to
5336b21
Compare
Only handles the v5 case.