Skip to content

Commit

Permalink
cpu: aarch64: make softmax ops use stateless ACL interface fix comments
Browse files Browse the repository at this point in the history
Signed-off-by: Ye Tao <[email protected]>
Change-Id: I6cdad6325d764c1efdbb96cb438c531b67accc63
  • Loading branch information
taoye9 committed Oct 22, 2024
1 parent 04ce01f commit 8ca34a1
Showing 1 changed file with 9 additions and 13 deletions.
22 changes: 9 additions & 13 deletions src/cpu/aarch64/acl_softmax.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,24 +77,20 @@ status_t acl_softmax_fwd_t::pd_t::init(engine_t *engine) {
// associated with calling the external library and the negative
// coefficient on total_size as ACL being faster at processing
// each element
auto calculate_performance_diff = [](dnnl::impl::dim_t outer_size,
dnnl::impl::dim_t axis_size,
const int threads,
double sec_coff) {
double acl_ref_performance_diff = 1 + 0.005 * outer_size
+ sec_coff * axis_size
* std::ceil(double(outer_size) / threads);

if (threads > 1 || outer_size > 1) {
// Empirical estimate used to choose between ACL and the reference
// implementation: the call sites in this function return
// status::unimplemented whenever the value returned here is positive.
//
// axis_coeff: weight applied to the
//             axis_size_ * ceil(outer_size_ / threads) term; the call
//             sites pass a negative value.
// Returns the estimate as a double.
//
// NOTE(review): `[=]` copies the local `threads`; outer_size_ and axis_size_
// are presumably data members reached through an implicitly captured `this`
// -- confirm against the class definition.
auto calculate_performance_diff = [=](double axis_coeff) {
// Base estimate: constant 1, plus a per-outer-element cost, plus the
// axis term scaled by the number of outer elements per thread (rounded up).
double acl_ref_performance_diff = 1 + 0.005 * outer_size_
+ axis_coeff * axis_size_
* std::ceil(double(outer_size_) / threads);

// The extra constant is applied unless both threads and outer_size_ are
// at most 1.
if (threads > 1 || outer_size_ > 1) {
acl_ref_performance_diff
+= 17; // Adds constant overhead for using threads within ACL
}
return acl_ref_performance_diff;
};

if (inner_size_ == 1) {
double acl_ref_performance_diff = calculate_performance_diff(
outer_size_, axis_size_, threads, -0.0027);
double acl_ref_performance_diff = calculate_performance_diff(-0.0027);
if (acl_ref_performance_diff > 0) return status::unimplemented;

// If the inner size is 1, we can get rid of the dimension.
Expand All @@ -111,8 +107,8 @@ status_t acl_softmax_fwd_t::pd_t::init(engine_t *engine) {
// A rough empirical heuristic, see comment above
// The only difference here is that ACL does a reorder, and so
// is considerably better
double acl_ref_performance_diff = calculate_performance_diff(
outer_size_, axis_size_, threads, -0.01);
double acl_ref_performance_diff
= calculate_performance_diff(-0.01 * inner_size_);
if (acl_ref_performance_diff > 0) return status::unimplemented;

// Irrespective of the input dimensions, we construct a tensor
Expand Down

0 comments on commit 8ca34a1

Please sign in to comment.