From 408841def8d2d7ea2fd30ea0fc60ca16389d0c31 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 18 Oct 2024 23:05:51 +0400 Subject: [PATCH 1/4] AMDGPU: Propagate amdgpu-max-num-workgroups attribute I'm not sure what the interpretation of 0 is supposed to be, AMDGPUUsage doesn't say. --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 154 +++++++++++- ...ttr-amdgpu-max-num-workgroups-propagate.ll | 228 ++++++++++++++++++ 2 files changed, 380 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 687a7339da379d..58b90ddc36605c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -179,6 +179,11 @@ class AMDGPUInformationCache : public InformationCache { return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()}; } + SmallVector getMaxNumWorkGroups(const Function &F) { + const GCNSubtarget &ST = TM.getSubtarget(F); + return ST.getMaxNumWorkGroups(F); + } + /// Get code object version. unsigned getCodeObjectVersion() const { return CodeObjectVersion; @@ -821,6 +826,150 @@ AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP, "AAAMDFlatWorkGroupSize is only valid for function position"); } +struct TupleDecIntegerRangeState : public AbstractState { + DecIntegerState X, Y, Z; + + bool isValidState() const override { + return X.isValidState() && Y.isValidState() && Z.isValidState(); + } + + bool isAtFixpoint() const override { + return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint(); + } + + ChangeStatus indicateOptimisticFixpoint() override { + return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() | + Z.indicateOptimisticFixpoint(); + } + + ChangeStatus indicatePessimisticFixpoint() override { + return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() | + Z.indicatePessimisticFixpoint(); + } + + TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) { + X ^= Other.X; + Y ^= Other.Y; + Z ^= Other.Z; + return *this; + } + + bool operator==(const TupleDecIntegerRangeState &Other) const { + return X == Other.X && Y == Other.Y && Z == Other.Z; + } + + TupleDecIntegerRangeState &getAssumed() { return *this; } + const TupleDecIntegerRangeState &getAssumed() const { return *this; } +}; + +using AAAMDMaxNumWorkgroupsState = + StateWrapper; + +/// Propagate amdgpu-max-num-workgroups attribute. +struct AAAMDMaxNumWorkgroups + : public StateWrapper { + using Base = StateWrapper; + + AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {} + + void initialize(Attributor &A) override { + Function *F = getAssociatedFunction(); + auto &InfoCache = static_cast(A.getInfoCache()); + + SmallVector MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F); + + // FIXME: What is the interpretation of 0? + for (unsigned &Entry : MaxNumWorkgroups) { + if (Entry == 0) + Entry = std::numeric_limits::max(); + } + + X.takeKnownMinimum(MaxNumWorkgroups[0]); + Y.takeKnownMinimum(MaxNumWorkgroups[1]); + Z.takeKnownMinimum(MaxNumWorkgroups[2]); + + if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) + indicatePessimisticFixpoint(); + } + + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus Change = ChangeStatus::UNCHANGED; + + auto CheckCallSite = [&](AbstractCallSite CS) { + Function *Caller = CS.getInstruction()->getFunction(); + LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName() + << "->" << getAssociatedFunction()->getName() << '\n'); + + const auto *CallerInfo = A.getAAFor( + *this, IRPosition::function(*Caller), DepClassTy::REQUIRED); + if (!CallerInfo) + return false; + + Change |= + clampStateAndIndicateChange(this->getState(), CallerInfo->getState()); + return true; + }; + + bool AllCallSitesKnown = true; + if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown)) + return indicatePessimisticFixpoint(); + + return Change; + } + + /// Create an abstract attribute view for the position \p IRP. + static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP, + Attributor &A); + + ChangeStatus manifest(Attributor &A) override { + Function *F = getAssociatedFunction(); + // TODO: Skip adding if worst case? + LLVMContext &Ctx = F->getContext(); + SmallString<32> Buffer; + raw_svector_ostream OS(Buffer); + OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed(); + + // TODO: Should annotate loads of the group size for this to do anything + // useful. + return A.manifestAttrs( + getIRPosition(), + {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())}, + /* ForceReplace= */ true); + } + + const std::string getName() const override { return "AAAMDMaxNumWorkgroups"; } + + const std::string getAsStr(Attributor *) const override { + std::string Buffer = "AAAMDMaxNumWorkgroupsState["; + raw_string_ostream OS(Buffer); + OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed() + << ']'; + return OS.str(); + } + + const char *getIdAddr() const override { return &ID; } + + /// This function should return true if the type of the \p AA is + /// AAAMDMaxNumWorkgroups + static bool classof(const AbstractAttribute *AA) { + return (AA->getIdAddr() == &ID); + } + + void trackStatistics() const override {} + + /// Unique ID (due to the unique address) + static const char ID; +}; + +const char AAAMDMaxNumWorkgroups::ID = 0; + +AAAMDMaxNumWorkgroups & +AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) { + if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION) + return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A); + llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position"); +} + /// Propagate amdgpu-waves-per-eu attribute. struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) @@ -1043,8 +1192,8 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, DenseSet Allowed( {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID, &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, - &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID, - &AAPointerInfo::ID, &AAPotentialConstantValues::ID, + &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, + &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID, &AAInstanceInfo::ID}); @@ -1068,6 +1217,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, for (auto *F : Functions) { A.getOrCreateAAFor(IRPosition::function(*F)); A.getOrCreateAAFor(IRPosition::function(*F)); + A.getOrCreateAAFor(IRPosition::function(*F)); A.getOrCreateAAFor(IRPosition::function(*F)); CallingConv::ID CC = F->getCallingConv(); if (!AMDGPU::isEntryFunctionCC(CC)) { diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll new file mode 100644 index 00000000000000..bd1aa6e6ed4701 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll @@ -0,0 +1,228 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s + +; External call to avoid inferring argument attributes. This makes the +; final attribute groups easier to read +declare void @dummy() + +define void @extern_callee() { +; CHECK-LABEL: define void @extern_callee( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define internal void @callee_1_2_3() { +; CHECK-LABEL: define internal void @callee_1_2_3( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define amdgpu_kernel void @kernel_1_2_3() #0 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_1_2_3( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @callee_1_2_3() +; CHECK-NEXT: call void @extern_callee() +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @callee_1_2_3() + call void @extern_callee() + call void @dummy() + ret void +} + +attributes #0 = {"amdgpu-max-num-workgroups"="1,2,3"} + +; -> 100,10,99 +define internal void @callee_merge_100_8_32__16_10_99() { +; CHECK-LABEL: define internal void @callee_merge_100_8_32__16_10_99( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define amdgpu_kernel void @kernel_100_8_32() #1 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_100_8_32( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] { +; CHECK-NEXT: call void @callee_merge_100_8_32__16_10_99() +; CHECK-NEXT: ret void +; + call void @callee_merge_100_8_32__16_10_99() + ret void +} + +attributes #1 = {"amdgpu-max-num-workgroups"="100,8,32"} + +define amdgpu_kernel void @kernel_16_10_99() #2 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_16_10_99( +; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +; CHECK-NEXT: call void @callee_merge_100_8_32__16_10_99() +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @callee_merge_100_8_32__16_10_99() + call void @dummy() + ret void +} + +attributes #2 = {"amdgpu-max-num-workgroups"="16,10,99"} + +define internal void @merge_to_worst_case() { +; CHECK-LABEL: define internal void @merge_to_worst_case( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define internal void @callee_x_worst_case() { +; CHECK-LABEL: define internal void @callee_x_worst_case( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define amdgpu_kernel void @kernel_x_maximum() #3 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_x_maximum( +; CHECK-SAME: ) #[[ATTR5:[0-9]+]] { +; CHECK-NEXT: call void @merge_to_worst_case() +; CHECK-NEXT: call void @callee_x_worst_case() +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @merge_to_worst_case() + call void @callee_x_worst_case() + call void @dummy() + ret void +} + +attributes #3 = {"amdgpu-max-num-workgroups"="4294967295,1,1"} + +define amdgpu_kernel void @kernel_y_maximum() #4 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_y_maximum( +; CHECK-SAME: ) #[[ATTR6:[0-9]+]] { +; CHECK-NEXT: call void @merge_to_worst_case() +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @merge_to_worst_case() + call void @dummy() + ret void +} + +attributes #4 = {"amdgpu-max-num-workgroups"="1,4294967295,1"} + +define amdgpu_kernel void @kernel_z_maximum() #5 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_z_maximum( +; CHECK-SAME: ) #[[ATTR7:[0-9]+]] { +; CHECK-NEXT: call void @merge_to_worst_case() +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @merge_to_worst_case() + call void @dummy() + ret void +} + +attributes #5 = {"amdgpu-max-num-workgroups"="1,1,4294967295"} + +; Make sure the attribute isn't lost from the callee. +define internal void @annotated_callee_from_unannotated_kernel() #6 { +; CHECK-LABEL: define internal void @annotated_callee_from_unannotated_kernel( +; CHECK-SAME: ) #[[ATTR8:[0-9]+]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +attributes #6 = {"amdgpu-max-num-workgroups"="42,99,123"} + +define amdgpu_kernel void @unannotated_kernel_calls_annotated_callee() { +; CHECK-LABEL: define amdgpu_kernel void @unannotated_kernel_calls_annotated_callee( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @annotated_callee_from_unannotated_kernel() +; CHECK-NEXT: ret void +; + call void @annotated_callee_from_unannotated_kernel() + ret void +} + + +define internal void @annotated_callee_merge_caller() #7 { +; CHECK-LABEL: define internal void @annotated_callee_merge_caller( +; CHECK-SAME: ) #[[ATTR9:[0-9]+]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +attributes #7 = {"amdgpu-max-num-workgroups"="512,256,1024"} + +define amdgpu_kernel void @call_annotated_callee_merge_caller() #8 { +; CHECK-LABEL: define amdgpu_kernel void @call_annotated_callee_merge_caller( +; CHECK-SAME: ) #[[ATTR10:[0-9]+]] { +; CHECK-NEXT: call void @annotated_callee_merge_caller() +; CHECK-NEXT: ret void +; + call void @annotated_callee_merge_caller() + ret void +} + +attributes #8 = {"amdgpu-max-num-workgroups"="256,128,2048"} + +define internal void @called_by_explicit_worst_case() { +; CHECK-LABEL: define internal void @called_by_explicit_worst_case( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define amdgpu_kernel void @kernel_explicit_worst_case() #9 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_explicit_worst_case( +; CHECK-SAME: ) #[[ATTR11:[0-9]+]] { +; CHECK-NEXT: call void @called_by_explicit_worst_case() +; CHECK-NEXT: ret void +; + call void @called_by_explicit_worst_case() + ret void +} + +attributes #9 = {"amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295"} + +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="1,2,3" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="100,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="100,8,32" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-max-num-workgroups"="16,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-max-num-workgroups"="4294967295,1,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-max-num-workgroups"="1,4294967295,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-max-num-workgroups"="1,1,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-max-num-workgroups"="42,99,123" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR9]] = { "amdgpu-max-num-workgroups"="256,128,1024" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR10]] = { "amdgpu-max-num-workgroups"="256,128,2048" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR11]] = { "amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +;. From 2d48557d3987770bf02f0172a6f9b849eaa99102 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 21 Oct 2024 16:59:28 -0700 Subject: [PATCH 2/4] Comment parameter name --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 58b90ddc36605c..b10100023f058a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -911,7 +911,9 @@ struct AAAMDMaxNumWorkgroups }; bool AllCallSitesKnown = true; - if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown)) + if (!A.checkForAllCallSites(CheckCallSite, *this, + /*RequireAllCallSites=*/true, + AllCallSitesKnown)) return indicatePessimisticFixpoint(); return Change; From 1bb618462735275f4cf46a0d937ffef1abc51175 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 21 Oct 2024 17:03:32 -0700 Subject: [PATCH 3/4] Add a shader entry test --- .../AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll index bd1aa6e6ed4701..366432e0fc6cb1 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll @@ -62,6 +62,16 @@ define amdgpu_kernel void @kernel_100_8_32() #1 { ret void } +define amdgpu_cs void @amdgpu_cs_100_8_32() #1 { +; CHECK-LABEL: define amdgpu_cs void @amdgpu_cs_100_8_32( +; CHECK-SAME: ) #[[ATTR3]] { +; CHECK-NEXT: call void @callee_merge_100_8_32__16_10_99() +; CHECK-NEXT: ret void +; + call void @callee_merge_100_8_32__16_10_99() + ret void +} + attributes #1 = {"amdgpu-max-num-workgroups"="100,8,32"} define amdgpu_kernel void @kernel_16_10_99() #2 { From 58a330c0e9f0aba24b4c6473c251b70e7ca1e9cd Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 21 Oct 2024 17:08:05 -0700 Subject: [PATCH 4/4] Check isValidState --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index b10100023f058a..151f33ada18862 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -902,7 +902,7 @@ struct AAAMDMaxNumWorkgroups const auto *CallerInfo = A.getAAFor( *this, IRPosition::function(*Caller), DepClassTy::REQUIRED); - if (!CallerInfo) + if (!CallerInfo || !CallerInfo->isValidState()) return false; Change |=