From 79b019b102c5d68843d52473f7d26a80597d84d2 Mon Sep 17 00:00:00 2001
From: Dmitrii Zarukin
Date: Thu, 23 Jun 2022 11:43:07 -0700
Subject: [PATCH] api: depthwise post-op with any kernel, stride and padding values

---
 doc/programming_model/attributes_post_ops.md  |  10 +-
 include/oneapi/dnnl/dnnl.h                    |  64 ++++
 include/oneapi/dnnl/dnnl.hpp                  | 136 ++++++---
 .../src/benchdnn_generator.py                 |   3 +-
 scripts/verbose_converter/src/dnnl_parser.py  |  12 +-
 src/common/primitive_attr.cpp                 |  67 ++--
 src/common/primitive_attr.hpp                 |  19 +-
 src/common/primitive_hashing.cpp              |   4 +
 src/common/serialization.cpp                  |   2 +
 src/common/verbose.cpp                        |   3 +-
 src/cpu/dw_convolution_utils.hpp              |  21 +-
 .../x64/jit_avx512_common_1x1_convolution.cpp |   6 +-
 src/cpu/x64/jit_sse41_1x1_convolution.cpp     |   5 +-
 tests/benchdnn/conv/conv_dw_fusion.cpp        |  25 +-
 tests/benchdnn/dnn_types.cpp                  |  50 +--
 tests/benchdnn/dnn_types.hpp                  |   9 +-
 tests/benchdnn/doc/knobs_attr.md              |   7 +-
 .../inputs/conv/harness_conv_fused_depthwise  | 288 ++++++++++++++++++
 tests/benchdnn/utils/parser.cpp               |   2 +-
 tests/gtests/test_iface_attr.cpp              |  20 ++
 20 files changed, 629 insertions(+), 124 deletions(-)

diff --git a/doc/programming_model/attributes_post_ops.md b/doc/programming_model/attributes_post_ops.md
index 94f24b264ff..78cc63638df 100644
--- a/doc/programming_model/attributes_post_ops.md
+++ b/doc/programming_model/attributes_post_ops.md
@@ -164,12 +164,14 @@ convolution.
 
 The @ref dnnl::primitive::kind of this post-op is
 #dnnl::primitive::kind::convolution.
 
-There are two variants of this post-op: `dw_k3s1p1` and `dw_k3s2p1` for stride-1
-and stride-2 respectively.
+Three variants of depthwise post-op are supported:
+* `dw_k3s1p1` for the case of stride 1, kernel size 3, and left padding of 1.
+* `dw_k3s2p1` for the case of stride 2, kernel size 3, and left padding of 1.
+* `dw` for the general case.
 
 API:
-- C: @ref dnnl_post_ops_append_dw_k3s1p1 , @ref dnnl_post_ops_append_dw_k3s2p1
-- C++: @ref dnnl::post_ops::append_dw_k3s1p1 , @ref dnnl::post_ops::append_dw_k3s2p1
+- C: @ref dnnl_post_ops_append_dw , @ref dnnl_post_ops_append_dw_k3s1p1 , @ref dnnl_post_ops_append_dw_k3s2p1
+- C++: @ref dnnl::post_ops::append_dw , @ref dnnl::post_ops::append_dw_k3s1p1 , @ref dnnl::post_ops::append_dw_k3s2p1
 
 For better readability, below we assume a 2D convolution and use the following
 notations:
diff --git a/include/oneapi/dnnl/dnnl.h b/include/oneapi/dnnl/dnnl.h
index 2673c624535..f71084445aa 100644
--- a/include/oneapi/dnnl/dnnl.h
+++ b/include/oneapi/dnnl/dnnl.h
@@ -792,6 +792,70 @@ dnnl_status_t DNNL_API dnnl_post_ops_get_params_eltwise(
         const_dnnl_post_ops_t post_ops, int index, float *scale,
         dnnl_alg_kind_t *alg_kind, float *alpha, float *beta);
 
+/// Appends a depthwise post-op convolution.
+///
+/// This post-op can only be fused with a 2D 1x1 convolution (convolution with
+/// weights spatial dimensions equal to 1 i.e., kh=kw=1).
+///
+/// The kind of this post-op is #dnnl_convolution.
+///
+/// The number of outputs for the primitive with fusion is one. The output
+/// spatial size can be derived as below:
+///
+///     output_height = ceil(output_height_1x1_convolution, stride)
+///     output_width = ceil(output_width_1x1_convolution, stride)
+///
+/// See @ref dev_guide_attributes_post_ops_depthwise and
+/// @ref dev_guide_attributes_post_ops_depthwise_fusion for more info.
+///
+/// @param post_ops Post-ops.
+/// @param weights_data_type Weights data type of depthwise post-op
+/// @param bias_data_type Bias data type of depthwise post-op
+/// @param dst_data_type Output data type of depthwise post-op
+/// @param kernel_size Size of kernel of depthwise post-op
+/// @param stride_size Size of stride of depthwise post-op
+/// @param padding_l_size Size of left and top paddings of depthwise post-op
+/// @param count Length of the array of scaling factors @p scales.
+/// @param mask Scaling factors correspondence mask that defines the
+///     correspondence between the output tensor dimensions and the @p
+///     scales array. The set i-th bit indicates that a dedicated output scaling
+///     factor is used for each index along that dimension. The mask value of 0
+///     implies a common scaling factor for the whole output tensor.
+/// @param scales Pointer to a constant array of float scaling factors.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise
+dnnl_status_t DNNL_API dnnl_post_ops_append_dw(dnnl_post_ops_t post_ops,
+        dnnl_data_type_t weights_data_type, dnnl_data_type_t bias_data_type,
+        dnnl_data_type_t dst_data_type, dnnl_dim_t kernel_size,
+        dnnl_dim_t stride_size, dnnl_dim_t padding_l_size, dnnl_dim_t count,
+        int mask, const float *scales);
+
+/// Returns the parameters of a depthwise post-op.
+///
+/// @param post_ops Post-ops.
+/// @param index Index of the depthwise post-op.
+/// @param weights_data_type Weights data type of depthwise post-op
+/// @param bias_data_type Bias data type of depthwise post-op
+/// @param dst_data_type Output data type of depthwise post-op
+/// @param kernel_size Size of kernel of depthwise post-op
+/// @param stride_size Size of stride of depthwise post-op
+/// @param padding_l_size Size of left and top paddings of depthwise post-op
+/// @param count Output length of the array of scaling factors @p scales.
+/// @param mask Output scaling factors correspondence mask that defines the
+///     correspondence between the output tensor dimensions and the @p
+///     scales array. The set i-th bit indicates that a dedicated output scaling
+///     factor is used for each index along that dimension. The mask value of 0
+///     implies a common scaling factor for the whole output tensor.
+/// @param scales Output pointer to a constant array of float scaling factors.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise
+dnnl_status_t DNNL_API dnnl_post_ops_get_params_dw(
+        const_dnnl_post_ops_t post_ops, int index,
+        dnnl_data_type_t *weights_data_type, dnnl_data_type_t *bias_data_type,
+        dnnl_data_type_t *dst_data_type, dnnl_dim_t *kernel_size,
+        dnnl_dim_t *stride_size, dnnl_dim_t *padding_l_size, dnnl_dim_t *count,
+        int *mask, const float **scales);
+
 /// Appends a depthwise post-op convolution with stride 1.
 ///
 /// This post-op can only be fused with a 2D 1x1 convolution (convolution with
diff --git a/include/oneapi/dnnl/dnnl.hpp b/include/oneapi/dnnl/dnnl.hpp
index 0a6fa01b2d8..2fecd007dc9 100644
--- a/include/oneapi/dnnl/dnnl.hpp
+++ b/include/oneapi/dnnl/dnnl.hpp
@@ -3081,7 +3081,7 @@ struct post_ops : public handle {
         aalgorithm = static_cast<algorithm>(c_alg);
     }
 
-    /// Appends a depthwise post-op convolution with stride 1.
+    /// Appends a depthwise post-op convolution.
     ///
     /// This post-op can only be fused with a 2D 1x1 convolution (convolution
     /// with weights spatial dimension equal to 1 i.e., kh=kw=1).
@@ -3089,11 +3089,10 @@ struct post_ops : public handle { /// The kind of this post-op is #dnnl_convolution. /// /// The number of outputs for primitive remain same as before. The output - /// size remain same as the original primitive due to stride=1. - /// - /// The Post-op can be defined as: + /// spatial size can be derived as below: /// - /// dst[:] <- scales * (conv_dw(conv_1x1)) + /// output_height = ceil(output_height_1x1_convolution, stride) + /// output_width = ceil(output_width_1x1_convolution, stride) /// /// See @ref dev_guide_attributes_post_ops_depthwise and /// @ref dev_guide_attributes_post_ops_depthwise_fusion for more info. @@ -3101,6 +3100,9 @@ struct post_ops : public handle { /// @param weights_data_type Weights data type of depthwise post-op /// @param bias_data_type Bias data type of depthwise post-op /// @param dst_data_type Output data type of depthwise post-op + /// @param kernel_size Size of kernel of depthwise post-op + /// @param stride_size Size of stride of depthwise post-op + /// @param padding_l_size Size of left and top paddings of depthwise post-op /// @param mask Output scaling factors correspondence mask that defines the /// correspondence between the output tensor dimensions and the /// @p scales array. The set i-th bit indicates that a dedicated output @@ -3109,14 +3111,17 @@ struct post_ops : public handle { /// tensor. /// @param scales Output pointer to a constant array of float scaling /// factors. - void append_dw_k3s1p1(memory::data_type weights_data_type, + void append_dw(memory::data_type weights_data_type, memory::data_type bias_data_type, memory::data_type dst_data_type, - int mask, const std::vector &scales) { + memory::dim kernel_size, memory::dim stride_size, + memory::dim padding_l_size, int mask, + const std::vector &scales) { - error::wrap_c_api(dnnl_post_ops_append_dw_k3s1p1(get(), + error::wrap_c_api(dnnl_post_ops_append_dw(get(), memory::convert_to_c(weights_data_type), memory::convert_to_c(bias_data_type), memory::convert_to_c(dst_data_type), + kernel_size, stride_size, padding_l_size, scales.size(), mask, scales.data()), "could not append depthwise post-op"); } @@ -3135,24 +3140,34 @@ struct post_ops : public handle { /// tensor. /// @param scales Output pointer to a constant array of float scaling /// factors. 
-    void get_params_dw_k3s1p1(int index, memory::data_type &weights_data_type,
+    void get_params_dw(int index, memory::data_type &weights_data_type,
             memory::data_type &bias_data_type, memory::data_type &dst_data_type,
-            int &mask, std::vector<float> &scales) const {
+            memory::dim &kernel_size, memory::dim &stride_size,
+            memory::dim &padding_l_size, int &mask,
+            std::vector<float> &scales) const {
 
         dnnl_data_type_t c_weights_data_type;
         dnnl_data_type_t c_bias_data_type;
         dnnl_data_type_t c_dst_data_type;
+        dnnl_dim_t c_kernel_size;
+        dnnl_dim_t c_stride_size;
+        dnnl_dim_t c_padding_l_size;
         dnnl_dim_t count;
         int c_mask;
         const float *c_scales;
-        error::wrap_c_api(dnnl_post_ops_get_params_dw_k3s1p1(get(), index,
-                                  &c_weights_data_type, &c_bias_data_type,
-                                  &c_dst_data_type, &count, &c_mask, &c_scales),
+        error::wrap_c_api(
+                dnnl_post_ops_get_params_dw(get(), index, &c_weights_data_type,
+                        &c_bias_data_type, &c_dst_data_type, &c_kernel_size,
+                        &c_stride_size, &c_padding_l_size, &count, &c_mask,
+                        &c_scales),
                 "could not get parameters of depthwise post-op");
 
         weights_data_type = static_cast<memory::data_type>(c_weights_data_type);
         bias_data_type = static_cast<memory::data_type>(c_bias_data_type);
         dst_data_type = static_cast<memory::data_type>(c_dst_data_type);
+        kernel_size = c_kernel_size;
+        stride_size = c_stride_size;
+        padding_l_size = c_padding_l_size;
         scales.resize(count);
 
         mask = c_mask;
@@ -3161,6 +3176,67 @@ struct post_ops : public handle {
         return;
     }
 
+    /// Appends a depthwise post-op convolution with stride 1.
+    ///
+    /// This post-op can only be fused with a 2D 1x1 convolution (convolution
+    /// with weights spatial dimension equal to 1 i.e., kh=kw=1).
+    ///
+    /// The kind of this post-op is #dnnl_convolution.
+    ///
+    /// The number of outputs for the primitive remains the same as before. The
+    /// output size remains the same as the original primitive due to stride=1.
+    ///
+    /// The post-op can be defined as:
+    ///
+    ///      dst[:] <- scales * (conv_dw(conv_1x1))
+    ///
+    /// See @ref dev_guide_attributes_post_ops_depthwise and
+    /// @ref dev_guide_attributes_post_ops_depthwise_fusion for more info.
+    ///
+    /// @param weights_data_type Weights data type of depthwise post-op
+    /// @param bias_data_type Bias data type of depthwise post-op
+    /// @param dst_data_type Output data type of depthwise post-op
+    /// @param mask Output scaling factors correspondence mask that defines the
+    ///     correspondence between the output tensor dimensions and the
+    ///     @p scales array. The set i-th bit indicates that a dedicated output
+    ///     scaling factor is used for each index along that dimension. The mask
+    ///     value of 0 implies a common scaling factor for the whole output
+    ///     tensor.
+    /// @param scales Output pointer to a constant array of float scaling
+    ///     factors.
+    void append_dw_k3s1p1(memory::data_type weights_data_type,
+            memory::data_type bias_data_type, memory::data_type dst_data_type,
+            int mask, const std::vector<float> &scales) {
+
+        append_dw(weights_data_type, bias_data_type, dst_data_type, 3, 1, 1,
+                mask, scales);
+    }
+
+    /// Returns the parameters of a depthwise post-op with stride 1.
+    ///
+    /// @param index Index of the depthwise post-op.
+    /// @param weights_data_type Weights data type of depthwise post-op
+    /// @param bias_data_type Bias data type of depthwise post-op
+    /// @param dst_data_type Output data type of depthwise post-op
+    /// @param mask Output scaling factors correspondence mask that defines the
+    ///     correspondence between the output tensor dimensions and the
+    ///     @p scales array. The set i-th bit indicates that a dedicated output
+    ///     scaling factor is used for each index along that dimension. The mask
+    ///     value of 0 implies a common scaling factor for the whole output
+    ///     tensor.
+    /// @param scales Output pointer to a constant array of float scaling
+    ///     factors.
+    void get_params_dw_k3s1p1(int index, memory::data_type &weights_data_type,
+            memory::data_type &bias_data_type, memory::data_type &dst_data_type,
+            int &mask, std::vector<float> &scales) const {
+
+        memory::dim kernel_size;
+        memory::dim stride_size;
+        memory::dim padding_l_size;
+        get_params_dw(index, weights_data_type, bias_data_type, dst_data_type,
+                kernel_size, stride_size, padding_l_size, mask, scales);
+    }
+
     /// Appends a depthwise post-op convolution with stride 2.
     ///
     /// This post-op can only be fused with a 2D 1x1 convolution (convolution
@@ -3197,13 +3273,8 @@
     void append_dw_k3s2p1(memory::data_type weights_data_type,
             memory::data_type bias_data_type, memory::data_type dst_data_type,
             int mask, const std::vector<float> &scales) {
-
-        error::wrap_c_api(dnnl_post_ops_append_dw_k3s2p1(get(),
-                memory::convert_to_c(weights_data_type),
-                memory::convert_to_c(bias_data_type),
-                memory::convert_to_c(dst_data_type),
-                scales.size(), mask, scales.data()),
-                "could not append depthwise post-op");
+        append_dw(weights_data_type, bias_data_type, dst_data_type, 3, 2, 1,
+                mask, scales);
     }
 
     /// Returns the parameters of an depthwise post-op with stride 2.
@@ -3224,26 +3295,11 @@
             memory::data_type &bias_data_type, memory::data_type &dst_data_type,
             int &mask, std::vector<float> &scales) const {
 
-        dnnl_data_type_t c_weights_data_type;
-        dnnl_data_type_t c_bias_data_type;
-        dnnl_data_type_t c_dst_data_type;
-        dnnl_dim_t count;
-        int c_mask;
-        const float *c_scales;
-        error::wrap_c_api(dnnl_post_ops_get_params_dw_k3s2p1(get(), index,
-                                  &c_weights_data_type, &c_bias_data_type,
-                                  &c_dst_data_type, &count, &c_mask, &c_scales),
-                "could not get parameters of depthwise post-op");
-
-        weights_data_type = static_cast<memory::data_type>(c_weights_data_type);
-        bias_data_type = static_cast<memory::data_type>(c_bias_data_type);
-        dst_data_type = static_cast<memory::data_type>(c_dst_data_type);
-        scales.resize(count);
-
-        mask = c_mask;
-        for (dnnl_dim_t c = 0; c < count; ++c)
-            scales[c] = c_scales[c];
-        return;
+        memory::dim kernel_size;
+        memory::dim stride_size;
+        memory::dim padding_l_size;
+        get_params_dw(index, weights_data_type, bias_data_type, dst_data_type,
+                kernel_size, stride_size, padding_l_size, mask, scales);
     }
 
     /// Appends a binary post-op.
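For reference, here is a minimal sketch of how the generalized C++ API above is intended to be used from user code. The data types, geometry, and the empty `scales` vector are illustrative assumptions, not part of this patch:

```cpp
#include "oneapi/dnnl/dnnl.hpp"

#include <vector>

int main() {
    dnnl::post_ops ops;
    std::vector<float> scales; // empty: no output scaling for the post-op

    // Fuse a 5x5 depthwise convolution with stride 2 and left/top padding 1
    // after a 1x1 convolution; previously only k3s1p1/k3s2p1 were expressible.
    ops.append_dw(dnnl::memory::data_type::s8, // weights
            dnnl::memory::data_type::f32, // bias
            dnnl::memory::data_type::u8, // destination
            /*kernel_size=*/5, /*stride_size=*/2, /*padding_l_size=*/1,
            /*mask=*/0, scales);

    // The parameters can be queried back through the generalized getter.
    dnnl::memory::data_type wei_dt, bias_dt, dst_dt;
    dnnl::memory::dim kernel, stride, padding;
    int mask;
    ops.get_params_dw(
            0, wei_dt, bias_dt, dst_dt, kernel, stride, padding, mask, scales);

    dnnl::primitive_attr attr;
    attr.set_post_ops(ops); // pass via attributes of the fused 1x1 convolution
    return 0;
}
```

The `append_dw_k3s1p1` and `append_dw_k3s2p1` shortcuts above now reduce to `append_dw(..., 3, 1, 1, ...)` and `append_dw(..., 3, 2, 1, ...)` respectively.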
diff --git a/scripts/verbose_converter/src/benchdnn_generator.py b/scripts/verbose_converter/src/benchdnn_generator.py
index c3551f0f9d0..1c5b6255bd2 100644
--- a/scripts/verbose_converter/src/benchdnn_generator.py
+++ b/scripts/verbose_converter/src/benchdnn_generator.py
@@ -449,7 +449,8 @@ def convert_binary_post_op(post_op):
 
     def convert_dw_post_op(post_op):
         policy = convert_scale_policy(post_op['scales']['mask'])
-        po = post_op['alg'] + ':' + post_op['dst_dt'] + ':' + policy
+        po = post_op['alg'] + ':' + post_op['ksp'] + ':' + post_op[
+            'dst_dt'] + ':' + policy
         if post_op['scales']['value'] != None:
             po += ':' + post_op['scales']['value']
         return po
diff --git a/scripts/verbose_converter/src/dnnl_parser.py b/scripts/verbose_converter/src/dnnl_parser.py
index 226bb1a2183..5d294d37f27 100644
--- a/scripts/verbose_converter/src/dnnl_parser.py
+++ b/scripts/verbose_converter/src/dnnl_parser.py
@@ -131,6 +131,7 @@ def convert_binary_post_op(value):
     def convert_dw_post_op(value):
         p_op = {
             'alg': '',
+            'ksp': '',
             'dst_dt': 'f32',
             'wei_dt': 'f32',
             'scales': {
@@ -141,13 +142,14 @@ def convert_dw_post_op(value):
         params = value.split(':')
         len_params = len(params)
         p_op['alg'] = params[0]
-        if len_params > 1:
-            p_op['dst_dt'] = params[1]
+        p_op['ksp'] = params[1]
         if len_params > 2:
-            p_op['wei_dt'] = 's8'
-            p_op['scales']['mask'] = params[2]
+            p_op['dst_dt'] = params[2]
         if len_params > 3:
-            p_op['scales']['value'] = params[3]
+            p_op['wei_dt'] = 's8'
+            p_op['scales']['mask'] = params[3]
+        if len_params > 4:
+            p_op['scales']['value'] = params[4]
         return p_op
 
     def convert_eltwise_post_op(value):
diff --git a/src/common/primitive_attr.cpp b/src/common/primitive_attr.cpp
index b43dd5af397..d5c3b47a30e 100644
--- a/src/common/primitive_attr.cpp
+++ b/src/common/primitive_attr.cpp
@@ -221,18 +221,28 @@ dnnl::impl::status_t post_ops_t::entry_t::set_depthwise_scales(
     return dnnl::impl::status::success;
 }
 
-status_t post_ops_t::append_dw_k3s1p1(data_type_t wei_dt, data_type_t bias_dt,
-        data_type_t dst_dt, dim_t count, int mask, const float *scales) {
+status_t post_ops_t::append_dw(data_type_t wei_dt, data_type_t bias_dt,
+        data_type_t dst_dt, dim_t kernel_size, dim_t stride_size,
+        dim_t padding_l_size, dim_t count, int mask, const float *scales) {
     if (len() == post_ops_limit) return out_of_memory;
     bool ok = wei_dt != data_type::undef && dst_dt != data_type::undef
             && IMPLICATION(count > 0, scales) && mask >= 0;
     if (!ok) return invalid_arguments;
 
+    ok = ok && kernel_size > 0 && stride_size > 0;
+    if (!ok) return invalid_arguments;
+
+    // Avoid cases when the kernel lies entirely in the padding area
+    ok = ok && (padding_l_size + 1) <= kernel_size;
+    if (!ok) return invalid_arguments;
+
     entry_.emplace_back();
     auto &e = entry_.back();
     e.kind = primitive_kind::convolution;
     auto &d = e.depthwise_conv;
-    d.stride = 1;
+    d.kernel = kernel_size;
+    d.stride = stride_size;
+    d.padding = padding_l_size;
     d.wei_dt = wei_dt;
     d.bias_dt = bias_dt;
     d.dst_dt = dst_dt;
@@ -243,17 +253,6 @@ status_t post_ops_t::append_dw_k3s1p1(data_type_t wei_dt, data_type_t bias_dt,
     return e.set_depthwise_scales(scales);
 }
 
-status_t post_ops_t::append_dw_k3s2p1(data_type_t wei_dt, data_type_t bias_dt,
-        data_type_t dst_dt, dim_t count, int mask, const float *scales) {
-
-    auto status
-            = append_dw_k3s1p1(wei_dt, bias_dt, dst_dt, count, mask, scales);
-    if (status != success) return status;
-    entry_.back().depthwise_conv.stride = 2;
-
-    return success;
-}
-
 status_t post_ops_t::append_binary(
         alg_kind_t alg, const memory_desc_t *user_src1_desc) {
     if (len() ==
post_ops_limit) return out_of_memory; @@ -610,13 +609,45 @@ status_t dnnl_post_ops_get_params_eltwise(const post_ops_t *post_ops, int index, return success; } +status_t dnnl_post_ops_append_dw(post_ops_t *post_ops, data_type_t wei_dt, + data_type_t bias_dt, data_type_t dst_dt, dim_t kernel_size, + dim_t stride_size, dim_t padding_l_size, dim_t count, int mask, + const float *scales) { + if (post_ops == nullptr) return invalid_arguments; + + return post_ops->append_dw(wei_dt, bias_dt, dst_dt, kernel_size, + stride_size, padding_l_size, count, mask, scales); +} + +status_t dnnl_post_ops_get_params_dw(const post_ops_t *post_ops, int index, + data_type_t *wei_dt, data_type_t *bias_dt, data_type_t *dst_dt, + dim_t *kernel, dim_t *stride, dim_t *padding, dim_t *count, int *mask, + const float **scales) { + + if (!simple_get_params_check(post_ops, index, primitive_kind::convolution)) + return invalid_arguments; + + const auto &d = post_ops->entry_[index].depthwise_conv; + if (wei_dt) *wei_dt = d.wei_dt; + if (bias_dt) *bias_dt = d.bias_dt; + if (dst_dt) *dst_dt = d.dst_dt; + if (kernel) *kernel = d.kernel; + if (stride) *stride = d.stride; + if (padding) *padding = d.padding; + if (count) *count = d.count; + if (mask) *mask = d.mask; + if (scales) *scales = d.scales; + + return success; +} + status_t dnnl_post_ops_append_dw_k3s1p1(post_ops_t *post_ops, data_type_t wei_dt, data_type_t bias_dt, data_type_t dst_dt, dim_t count, int mask, const float *scales) { if (post_ops == nullptr) return invalid_arguments; - return post_ops->append_dw_k3s1p1( - wei_dt, bias_dt, dst_dt, count, mask, scales); + return post_ops->append_dw( + wei_dt, bias_dt, dst_dt, 3, 1, 1, count, mask, scales); } status_t dnnl_post_ops_get_params_dw_k3s1p1(const post_ops_t *post_ops, @@ -643,8 +674,8 @@ status_t dnnl_post_ops_append_dw_k3s2p1(post_ops_t *post_ops, dim_t count, int mask, const float *scales) { if (post_ops == nullptr) return invalid_arguments; - return post_ops->append_dw_k3s2p1( - wei_dt, bias_dt, dst_dt, count, mask, scales); + return post_ops->append_dw( + wei_dt, bias_dt, dst_dt, 3, 2, 1, count, mask, scales); } status_t dnnl_post_ops_get_params_dw_k3s2p1(const post_ops_t *post_ops, diff --git a/src/common/primitive_attr.hpp b/src/common/primitive_attr.hpp index 86b4b268ceb..cefab214f4b 100644 --- a/src/common/primitive_attr.hpp +++ b/src/common/primitive_attr.hpp @@ -355,7 +355,9 @@ struct dnnl_post_ops : public dnnl::impl::c_compatible { }; struct depthwise_conv_t { - int stride; + dnnl::impl::dim_t kernel; + dnnl::impl::dim_t stride; + dnnl::impl::dim_t padding; dnnl::impl::data_type_t wei_dt; dnnl::impl::data_type_t bias_dt; dnnl::impl::data_type_t dst_dt; @@ -449,7 +451,11 @@ struct dnnl_post_ops : public dnnl::impl::c_compatible { break; case primitive_kind::convolution: // Depthwise Only - ret = depthwise_conv.stride == rhs.depthwise_conv.stride + ret = depthwise_conv.kernel == rhs.depthwise_conv.kernel + && depthwise_conv.stride + == rhs.depthwise_conv.stride + && depthwise_conv.padding + == rhs.depthwise_conv.padding && depthwise_conv.wei_dt == rhs.depthwise_conv.wei_dt && depthwise_conv.bias_dt @@ -515,12 +521,11 @@ struct dnnl_post_ops : public dnnl::impl::c_compatible { dnnl::impl::data_type_t dt = dnnl_data_type_undef); dnnl::impl::status_t append_eltwise( float scale, dnnl::impl::alg_kind_t alg, float alpha, float beta); - dnnl::impl::status_t append_dw_k3s1p1(dnnl::impl::data_type_t wei_dt, + dnnl::impl::status_t append_dw(dnnl::impl::data_type_t wei_dt, dnnl::impl::data_type_t bias_dt, 
dnnl::impl::data_type_t dst_dt, - dnnl::impl::dim_t count, int mask, const float *scales); - dnnl::impl::status_t append_dw_k3s2p1(dnnl::impl::data_type_t wei_dt, - dnnl::impl::data_type_t bias_dt, dnnl::impl::data_type_t dst_dt, - dnnl::impl::dim_t count, int mask, const float *scales); + dnnl::impl::dim_t kernel_size, dnnl::impl::dim_t stride_size, + dnnl::impl::dim_t padding_l_size, dnnl::impl::dim_t count, int mask, + const float *scales); dnnl::impl::status_t append_binary(dnnl::impl::alg_kind_t alg, const dnnl::impl::memory_desc_t *user_src1_desc); dnnl::impl::status_t append_prelu(int mask); diff --git a/src/common/primitive_hashing.cpp b/src/common/primitive_hashing.cpp index 874b9078165..c34b1adddc7 100644 --- a/src/common/primitive_hashing.cpp +++ b/src/common/primitive_hashing.cpp @@ -255,8 +255,12 @@ size_t get_attr_hash(const primitive_attr_t &attr) { seed = hash_combine(seed, static_cast(entry.sum.dt)); break; case primitive_kind::convolution: + seed = hash_combine( + seed, static_cast(entry.depthwise_conv.kernel)); seed = hash_combine( seed, static_cast(entry.depthwise_conv.stride)); + seed = hash_combine(seed, + static_cast(entry.depthwise_conv.padding)); seed = hash_combine( seed, static_cast(entry.depthwise_conv.wei_dt)); seed = hash_combine(seed, diff --git a/src/common/serialization.cpp b/src/common/serialization.cpp index 3ebc836062f..0a82255685e 100644 --- a/src/common/serialization.cpp +++ b/src/common/serialization.cpp @@ -181,7 +181,9 @@ void serialize_attr( sstream.write(&entry.sum.dt); break; case primitive_kind::convolution: + sstream.write(&entry.depthwise_conv.kernel); sstream.write(&entry.depthwise_conv.stride); + sstream.write(&entry.depthwise_conv.padding); sstream.write(&entry.depthwise_conv.wei_dt); sstream.write(&entry.depthwise_conv.bias_dt); sstream.write(&entry.depthwise_conv.dst_dt); diff --git a/src/common/verbose.cpp b/src/common/verbose.cpp index 3beb44b73b5..5cc041f6fb8 100644 --- a/src/common/verbose.cpp +++ b/src/common/verbose.cpp @@ -435,7 +435,8 @@ std::ostream &operator<<(std::ostream &ss, const primitive_attr_t *attr) { case primitive_kind::convolution: { using namespace data_type; const auto &c = e.depthwise_conv; - ss << delim << "dw_k3s" << c.stride << "p1"; + ss << delim << "dw:k" << c.kernel << "s" << c.stride << "p" + << c.padding; if (c.wei_dt == s8 || c.dst_dt != f32) ss << ":" << c.dst_dt; if (c.count > 0 && c.wei_dt == s8) { diff --git a/src/cpu/dw_convolution_utils.hpp b/src/cpu/dw_convolution_utils.hpp index 23d581eee59..bfa2cd2423d 100644 --- a/src/cpu/dw_convolution_utils.hpp +++ b/src/cpu/dw_convolution_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2021 Intel Corporation +* Copyright 2020-2022 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -63,17 +63,26 @@ inline status_t get_depthwise_conv_desc(convolution_desc_t &cd_dw,
     const auto g = src_dw_d.dims()[1];
     const auto ih = src_dw_d.dims()[ndims - 2];
     const auto iw = src_dw_d.dims()[ndims - 1];
+    const auto kernel = dw_po.kernel;
     const auto stride = dw_po.stride;
+    const auto padding = dw_po.padding;
 
-    const dims_t weights_tz = {g, 1, 1, 3, 3};
+    const dims_t weights_tz = {g, 1, 1, kernel, kernel};
 
-    const dims_t dst_tz
-            = {n, oc, utils::div_up(ih, stride), utils::div_up(iw, stride)};
+    // Not following the standard convolution formula for output shapes since
+    // the right/bottom padding might be greater than the left/top one.
+    const dim_t oh = utils::div_up(ih, stride);
+    const dim_t ow = utils::div_up(iw, stride);
+    const dims_t dst_tz = {n, oc, oh, ow};
 
     const dims_t bias_tz = {oc};
-    const dims_t pad_tz = {1, 1};
+    const dims_t pad_tz = {padding, padding};
     const dims_t stride_tz = {stride, stride};
 
+    const dim_t pad_h_r = (oh - 1) * stride - ih + kernel - padding;
+    const dim_t pad_w_r = (ow - 1) * stride - iw + kernel - padding;
+    const dims_t pad_r_tz = {pad_h_r, pad_w_r};
+
     memory_desc_t src_md, weights_md, bias_md, dst_md;
 
     const auto src_dw_tag = src_dw_d.matches_one_of_tag(
@@ -97,7 +106,7 @@ inline status_t get_depthwise_conv_desc(convolution_desc_t &cd_dw,
     CHECK(conv_desc_init(&cd_dw, prop_kind::forward_inference,
             alg_kind::convolution_auto, &src_md, &weights_md,
             with_bias ? &bias_md : nullptr, &dst_md, stride_tz, nullptr, pad_tz,
-            pad_tz));
+            pad_r_tz));
 
     return status::success;
 }
diff --git a/src/cpu/x64/jit_avx512_common_1x1_convolution.cpp b/src/cpu/x64/jit_avx512_common_1x1_convolution.cpp
index 291e0c8587d..57f50b514e7 100644
--- a/src/cpu/x64/jit_avx512_common_1x1_convolution.cpp
+++ b/src/cpu/x64/jit_avx512_common_1x1_convolution.cpp
@@ -134,7 +134,6 @@ void jit_avx512_common_1x1_convolution_fwd_t addrs;
     // End
@@ -191,8 +190,9 @@ void jit_avx512_common_1x1_convolution_fwd_tdw_conv_pd_->jcp_.kh) * row_offset
+                : &dst[dst_off];
 
         p.bias_data = bias
                 ? &bias[oc_off_idx * (is_dst_layout_nxc ? 1 : jcp.oc_block)]
                 : nullptr;
diff --git a/src/cpu/x64/jit_sse41_1x1_convolution.cpp b/src/cpu/x64/jit_sse41_1x1_convolution.cpp
index cfa93ed7a91..49d55dfb22a 100644
--- a/src/cpu/x64/jit_sse41_1x1_convolution.cpp
+++ b/src/cpu/x64/jit_sse41_1x1_convolution.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2017-2021 Intel Corporation
+* Copyright 2017-2022 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -103,7 +103,6 @@ void jit_sse41_1x1_convolution_fwd_t::execute_forward_thr(const int ithr,
     data_t *pbuf {nullptr};
     size_t row_offset {};
     const int nb_buffer = jcp.nb_load_blocking;
-    const int jcp_dw_kh = 3;
     std::vector<data_t *> addrs;
 
     auto step = [](int default_step, int remaining, int tail_step) {
@@ -147,7 +146,7 @@ void jit_sse41_1x1_convolution_fwd_t::execute_forward_thr(const int ithr,
         const int oc_off_idx = (is_dst_layout_nxc ? jcp.oc_block : 1) * _ocb;
 
         par_conv.output_data = jcp.with_dw_conv
-                ? pbuf + (oh % jcp_dw_kh) * row_offset
+                ? pbuf + (oh % pd()->dw_conv_pd_->jcp_.kh) * row_offset
                 : &dst[data_blk_off(dst_d, n, oc_off_idx, oh, ow)];
 
         par_conv.bias_data = &bias[_ocb * jcp.oc_block];
diff --git a/tests/benchdnn/conv/conv_dw_fusion.cpp b/tests/benchdnn/conv/conv_dw_fusion.cpp
index a61851f79ab..2e96b4ec527 100644
--- a/tests/benchdnn/conv/conv_dw_fusion.cpp
+++ b/tests/benchdnn/conv/conv_dw_fusion.cpp
@@ -201,7 +201,9 @@ std::unique_ptr<prb_t> get_fused_conv_prb(const prb_t *prb) {
               << fused_conv_po.dst_dt;
     auto p_dw_cfg = conv::str2cfg(dw_cfg_ss.str().c_str());
 
-    auto stride = fused_conv_po.stride;
+    const auto kernel = fused_conv_po.kernel;
+    const auto stride = fused_conv_po.stride;
+    const auto padding = fused_conv_po.padding;
 
     bool is_3d = prb->ndims >= 5;
     bool is_2d = prb->ndims >= 4;
@@ -213,18 +215,21 @@ std::unique_ptr<prb_t> get_fused_conv_prb(const prb_t *prb) {
     cd.ih = is_2d ? prb->oh : 1;
     cd.iw = prb->ow;
     cd.oc = prb->oc;
-    cd.od = is_3d ? div_up(cd.id, stride) : 1;
-    cd.oh = is_2d ? div_up(cd.ih, stride) : 1;
-    cd.ow = div_up(cd.iw, stride);
-    cd.kd = is_3d ? 3 : 1;
-    cd.kh = is_2d ? 3 : 1;
-    cd.kw = 3;
+    cd.kd = is_3d ? kernel : 1;
+    cd.kh = is_2d ? kernel : 1;
+    cd.kw = kernel;
     cd.sd = is_3d ? stride : 1;
     cd.sh = is_2d ? stride : 1;
     cd.sw = stride;
-    cd.pd = is_3d;
-    cd.ph = is_2d;
-    cd.pw = 1;
+    cd.pd = is_3d ? padding : 0;
+    cd.ph = is_2d ? padding : 0;
+    cd.pw = padding;
+    // Not following the standard convolution formula for output shapes since
+    // the right/bottom padding might be greater than the left/top one.
+    cd.od = is_3d ? div_up(cd.id, stride) : 1;
+    cd.oh = is_2d ? div_up(cd.ih, stride) : 1;
+    cd.ow = div_up(cd.iw, stride);
+
     cd.has_groups = true;
     cd.ndims = prb->ndims;
     cd.init_pad_r(false); // is_deconv = false for conv descriptor
diff --git a/tests/benchdnn/dnn_types.cpp b/tests/benchdnn/dnn_types.cpp
index 34231f358e6..896da5f4be3 100644
--- a/tests/benchdnn/dnn_types.cpp
+++ b/tests/benchdnn/dnn_types.cpp
@@ -298,6 +298,7 @@ static po_table_entry_t kind_table[] = {
         // sum
         {pk_t::SUM, {"sum"}, dnnl_alg_kind_undef},
         // depthwise convolution
+        {pk_t::DW, {"dw"}, dnnl_convolution_auto},
         {pk_t::DW_K3S1P1, {"dw_k3s1p1"}, dnnl_convolution_auto},
         {pk_t::DW_K3S2P1, {"dw_k3s2p1"}, dnnl_convolution_auto},
         // eltwise
@@ -438,21 +439,6 @@ int attr_t::post_ops_t::from_str(const std::string &s) {
     *this = post_ops_t();
     if (s.empty()) return OK;
 
-    // TODO: remove me after a while
-    if (s.front() == '\'' || s.back() == '\'') {
-        BENCHDNN_PRINT(0, "%s\n",
-                "ERROR: `--attr-post-ops` no longer requires opening and "
-                "closing `'` (and `\"` for CLI) quotes. Please discard them to "
-                "proceed with successful parsing.");
-        return FAIL;
-    } else if (s.find_first_of(";", 0) != std::string::npos) {
-        BENCHDNN_PRINT(0, "%s\n",
-                "ERROR: `--attr-post-ops` no longer accepts `;` as post-ops "
-                "delimiter. Please use `+` as a delimiter between several "
-                "post-ops, i.e. `--attr-post-ops=sum+relu`.");
-        return FAIL;
-    }
-
     size_t start_pos = 0;
     while (start_pos != std::string::npos) {
         auto subs = parser::get_substr(s, start_pos, '+');
@@ -481,6 +467,27 @@ int attr_t::post_ops_t::from_str(const std::string &s) {
             // sum dt, if specified, should be defined
             if (e.sum.dt == dnnl_data_type_undef) return FAIL;
         } else if (e.is_convolution_kind()) {
+            if (kind == DW) {
+                // `DW` takes input of the form `dw:kXsYpZ`, while the rest
+                // use the fixed `dw_k3sXp1` forms.
+ const auto str_dw_params + = parser::get_substr(subs, subs_pos, ':'); + size_t pos = 0, idx = 0; + + pos += idx; + if (str_dw_params[pos] != 'k') return FAIL; + e.convolution.kernel = std::stoi(&str_dw_params[++pos], &idx); + + pos += idx; + if (str_dw_params[pos] != 's') return FAIL; + e.convolution.stride = std::stoi(&str_dw_params[++pos], &idx); + + pos += idx; + if (str_dw_params[pos] != 'p') return FAIL; + e.convolution.padding = std::stoi(&str_dw_params[++pos]); + + if (subs_pos == std::string::npos) continue; + } + e.convolution.dst_dt = str2dt(parser::get_substr(subs, subs_pos, ':').c_str()); if (e.convolution.dst_dt == dnnl_data_type_undef) return FAIL; @@ -548,7 +555,7 @@ bool attr_t::post_ops_t::entry_t::is_sum_kind() const { return kind == SUM; } bool attr_t::post_ops_t::entry_t::is_convolution_kind() const { - return kind == DW_K3S1P1 || kind == DW_K3S2P1; + return kind == DW || kind == DW_K3S1P1 || kind == DW_K3S2P1; } bool attr_t::post_ops_t::entry_t::is_eltwise_kind() const { return kind > ELTWISE_START && kind < ELTWISE_END; @@ -709,6 +716,10 @@ std::ostream &operator<<(std::ostream &s, const attr_t::post_ops_t &post_ops) { s << ":" << e.sum.zero_point; if (e.sum.dt != dnnl_data_type_undef) s << ":" << e.sum.dt; } else if (e.is_convolution_kind()) { + if (e.kind == pk_t::DW) { + s << ":k" << e.convolution.kernel << "s" << e.convolution.stride + << "p" << e.convolution.padding; + } const auto &co = e.convolution.oscale; if (e.convolution.dst_dt != dnnl_f32 || !co.is_def()) s << ":" << e.convolution.dst_dt; @@ -963,11 +974,10 @@ dnnl_primitive_attr_t create_dnnl_attr( const auto count = scales ? os_args.get_count(policy) : 0; const auto mask = os_args.get_mask(policy); - const auto dnnl_post_ops_append_dw = e.convolution.stride == 1 - ? dnnl_post_ops_append_dw_k3s1p1 - : dnnl_post_ops_append_dw_k3s2p1; DNN_SAFE_V(dnnl_post_ops_append_dw(ops, wei_dt, bia_dt, - e.convolution.dst_dt, count, mask, scales)); + e.convolution.dst_dt, e.convolution.kernel, + e.convolution.stride, e.convolution.padding, count, + mask, scales)); } else if (e.is_eltwise_kind()) { DNN_SAFE_V(dnnl_post_ops_append_eltwise(ops, e.eltwise.scale, e.eltwise.alg, e.eltwise.alpha, e.eltwise.beta)); diff --git a/tests/benchdnn/dnn_types.hpp b/tests/benchdnn/dnn_types.hpp index 811e33c6dc1..5ec7b463433 100644 --- a/tests/benchdnn/dnn_types.hpp +++ b/tests/benchdnn/dnn_types.hpp @@ -200,6 +200,7 @@ struct attr_t { // sum SUM, // depthwise convolution + DW, DW_K3S1P1, DW_K3S2P1, // eltwise @@ -264,8 +265,12 @@ struct attr_t { } else if (is_eltwise_kind()) { eltwise.alg = kind2dnnl_kind(kind); } else if (is_convolution_kind()) { - convolution.stride = kind == DW_K3S1P1 ? 1 : 2; convolution.oscale = scale_t(); + if (kind != DW) { + convolution.kernel = 3; + convolution.stride = kind == DW_K3S1P1 ? 1 : 2; + convolution.padding = 1; + } } else if (is_binary_kind()) { binary.alg = kind2dnnl_kind(kind); } @@ -284,7 +289,9 @@ struct attr_t { float scale = 1.f; } eltwise; struct { + int kernel = 0; int stride = 0; + int padding = 0; dnnl_data_type_t dst_dt = dnnl_f32; scale_t oscale; } convolution; diff --git a/tests/benchdnn/doc/knobs_attr.md b/tests/benchdnn/doc/knobs_attr.md index 8a99b9b3e95..45526ae940e 100644 --- a/tests/benchdnn/doc/knobs_attr.md +++ b/tests/benchdnn/doc/knobs_attr.md @@ -8,8 +8,7 @@ --attr-zero-points=ARG:POLICY:ZEROPOINT[*][+...] 
--attr-post-ops=SUM[:SCALE[:ZERO_POINT[:DATA_TYPE]]] ELTWISE[:ALPHA[:BETA[:SCALE]]] - DW_K3S1P1[:DST_DT[:OUTPUTSCALE]] - DW_K3S2P1[:DST_DT[:OUTPUTSCALE]] + DW:KkSsPp[:DST_DT[:OUTPUTSCALE]] BINARY:DT[:POLICY[:TAG]] ``` @@ -118,8 +117,8 @@ specified. `SCALE` has same notation and semantics as for `SUM` kind, but requires both `ALPHA` and `BETA` to be specified. `SCALE` is applicable only when output tensor has integer data type. -`DW_K3S1P1` and `DW_K3S2P1` post operation kinds append depthwise convolution -with kernel size of 3, strides of 1 and 2 correspondently and paddings of 1. +`DW:KkSsPp` post operation kind appends depthwise convolution with kernel size +of `k`, stride size of `s`, and left padding size of `p`. These kinds are applicable only for convolution operation with kernel size of 1 as of now. They support optional argument `DST_DT`, which defines destination tensor data type. Refer to [data types](knobs_dt.md) for details. Optional diff --git a/tests/benchdnn/inputs/conv/harness_conv_fused_depthwise b/tests/benchdnn/inputs/conv/harness_conv_fused_depthwise index 3263c136af1..a008db3e571 100644 --- a/tests/benchdnn/inputs/conv/harness_conv_fused_depthwise +++ b/tests/benchdnn/inputs/conv/harness_conv_fused_depthwise @@ -60,3 +60,291 @@ --cfg=u8s8u8 --attr-post-ops=relu:0.5+dw_k3s2p1:s32:per_oc:2.5+relu,dw_k3s2p1:f32:common:2 --batch=shapes_fused_large_src + + +# f32 dw with extended kernels, strides and padding. +--reset +--skip-impl= +--cfg=f32 +--mb=1,2,16 + +# effD1 +--attr-post-ops=dw:k3s1p1 +ic32oc16_ih320oh320kh1sh1dh0ph0_n"effD1_1.1" +ic24oc144_ih160oh160kh1sh1dh0ph0_n"effD1_1.2" +ic80oc480_ih40oh40kh1sh1dh0ph0_n"effD1_1.3" +ic192oc1152_ih20oh20kh1sh1dh0ph0_n"effD1_1.4" +ic320oc1920_ih20oh20kh1sh1dh0ph0_n"effD1_1.5" +ic88oc88_ih10oh10kh1sh1dh0ph0_n"effD1_1.6" +ic88oc88_ih20oh20kh1sh1dh0ph0_n"effD1_1.7" +ic112oc88_ih40oh40kh1sh1dh0ph0_n"effD1_1.8" +ic40oc88_ih80oh80kh1sh1dh0ph0_n"effD1_1.9" +ic88oc88_ih5oh5kh1sh1dh0ph0_n"effD1_1.10" + +--attr-post-ops=dw:k3s2p0 +ic16oc96_ih320oh320kh1sh1dh0ph0_n"effD1_2.1" +ic40oc240_ih80oh80kh1sh1dh0ph0_n"effD1_2.2" + +--attr-post-ops=dw:k5s2p1 +ic24oc144_ih160oh160kh1sh1dh0ph0_n"effD1_3.1" +ic112oc672_ih40oh40kh1sh1dh0ph0_n"effD1_3.2" + +--attr-post-ops=dw:k5s1p2 +ic40oc240_ih80oh80kh1sh1dh0ph0_n"effD1_4.1" +ic80oc480_ih40oh40kh1sh1dh0ph0_n"effD1_4.2" +ic112oc672_ih40oh40kh1sh1dh0ph0_n"effD1_4.3" +ic192oc1152_ih20oh20kh1sh1dh0ph0_n"effD1_4.4" + +# effD4 +--attr-post-ops=dw:k3s1p1 +ic48oc24_ih512oh512kh1sh1dh0ph0_n"effD4_1.1" +ic32oc192_ih256oh256kh1sh1dh0ph0_n"effD4_1.2" +ic112oc672_ih64oh64kh1sh1dh0ph0_n"effD4_1.3" +ic272oc1632_ih32oh32kh1sh1dh0ph0_n"effD4_1.4" +ic448oc2688_ih32oh32kh1sh1dh0ph0_n"effD4_1.5" +ic448oc224_ih8oh8kh1sh1dh0ph0_n"effD4_1.6" +ic448oc224_ih16oh16kh1sh1dh0ph0_n"effD4_1.7" +ic448oc224_ih32oh32kh1sh1dh0ph0_n"effD4_1.8" +ic448oc224_ih64oh64kh1sh1dh0ph0_n"effD4_1.9" +ic448oc224_ih128oh128kh1sh1dh0ph0_n"effD4_1.10" + +--attr-post-ops=dw:k3s2p0 +ic24oc144_ih512oh512kh1sh1dh0ph0_n"effD4_2.1" +ic56oc336_ih128oh128kh1sh1dh0ph0_n"effD4_2.2" + +--attr-post-ops=dw:k5s2p1 +ic32oc192_ih256oh256kh1sh1dh0ph0_n"effD4_3.1" +ic160oc960_ih64oh64kh1sh1dh0ph0_n"effD4_3.2" + +--attr-post-ops=dw:k5s1p2 +ic56oc336_ih128oh128kh1sh1dh0ph0_n"effD4_4.1" +ic112oc672_ih64oh64kh1sh1dh0ph0_n"effD4_4.2" +ic160oc960_ih64oh64kh1sh1dh0ph0_n"effD4_4.3" +ic272oc1632_ih32oh32kh1sh1dh0ph0_n"effD4_4.4" + +# faster_rcnn_nas_lowproposals_coco +--attr-post-ops=dw:k3s1p1 +ic42oc42_ih300oh300kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_1.1" 
+ic168oc168_ih150oh150kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_1.2" +ic84oc84_ih150oh150kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_1.3" +ic336oc336_ih75oh75kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_1.4" +ic672oc672_ih9oh9kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_1.5" + +--attr-post-ops=dw:k5s1p2 +ic42oc42_ih300oh300kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_2.1" +ic84oc84_ih150oh150kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_2.2" +ic168oc168_ih150oh150kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_2.3" +ic336oc336_ih75oh75kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_2.4" +ic672oc672_ih9oh9kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_2.5" + +--attr-post-ops=dw:k7s1p3 +ic96oc42_ih300oh300kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_3.1" +ic84oc84_ih150oh150kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_3.2" +ic336oc336_ih75oh75kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_3.3" +ic672oc672_ih9oh9kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_3.4" + +# deeplab +--attr-post-ops=dw:k3s1p0 +ic64oc384_ih129oh129kh1sh1dh0ph0_n"deeplab_1.1" +ic96oc576_ih129oh129kh1sh1dh0ph0_n"deeplab_1.2" +ic160oc960_ih129oh129kh1sh1dh0ph0_n"deeplab_1.3" + +--attr-post-ops=dw:k3s1p1 +ic24oc144_ih257oh257kh1sh1dh0ph0_n"deeplab_2.1" +ic32oc192_ih129oh129kh1sh1dh0ph0_n"deeplab_2.2" + +--attr-post-ops=dw:k3s2p1 +ic16oc96_ih513oh513kh1sh1dh0ph0_n"deeplab_3.1" +ic24oc144_ih257oh257kh1sh1dh0ph0_n"deeplab_3.2" + +# deeplab_v3 +--attr-post-ops=dw:k3s1p0 +ic64oc384_ih65oh65kh1sh1dh0ph0_n"deeplab_v3_1.1" +ic96oc576_ih65oh65kh1sh1dh0ph0_n"deeplab_v3_1.2" +ic160oc960_ih65oh65kh1sh1dh0ph0_n"deeplab_v3_1.3" + +--attr-post-ops=dw:k3s1p1 +ic24oc144_ih129oh129kh1sh1dh0ph0_n"deeplab_v3_2.1" +ic32oc192_ih65oh65kh1sh1dh0ph0_n"deeplab_v3_2.2" + +--attr-post-ops=dw:k3s2p1 +ic16oc96_ih257oh257kh1sh1dh0ph0_n"deeplab_v3_3.1" +ic24oc144_ih129oh129kh1sh1dh0ph0_n"deeplab_v3_3.2" + +# rmnet_ssd +--attr-post-ops=dw:k3s1p1 +ic32oc8_ih200oh200kh1sh1dh0ph0_n"rmnet_ssd_1.1" +ic64oc16_ih100oh100kh1sh1dh0ph0_n"rmnet_ssd_1.2" +ic128oc32_ih50oh50kh1sh1dh0ph0_n"rmnet_ssd_1.3" +ic256oc64_ih25oh25kh1sh1dh0ph0_n"rmnet_ssd_1.4" +ic128oc128_ih50oh50kh1sh1dh0ph0_n"rmnet_ssd_1.5" +ic256oc256_ih25oh25kh1sh1dh0ph0_n"rmnet_ssd_1.6" + +--attr-post-ops=dw:k3s2p0 +ic32oc16_ih200oh200kh1sh1dh0ph0_n"rmnet_ssd_2.1" +ic64oc32_ih100oh100kh1sh1dh0ph0_n"rmnet_ssd_2.2" +ic128oc64_ih50oh50kh1sh1dh0ph0_n"rmnet_ssd_2.3" + +# nasnet_a_large_331 +--attr-post-ops=dw:k3s1p1 +ic42oc42_ih83oh83kh1sh1dh0ph0_n"nasnet_a_large_331_1.1" +ic168oc168_ih42oh42kh1sh1dh0ph0_n"nasnet_a_large_331_1.2" +ic336oc336_ih21oh21kh1sh1dh0ph0_n"nasnet_a_large_331_1.3" +ic672oc672_ih11oh11kh1sh1dh0ph0_n"nasnet_a_large_331_1.4" + +--attr-post-ops=dw:k5s1p1 +ic42oc42_ih83oh83kh1sh1dh0ph0_n"nasnet_a_large_331_2.1" +ic84oc84_ih42oh42kh1sh1dh0ph0_n"nasnet_a_large_331_2.2" +ic168oc168_ih42oh42kh1sh1dh0ph0_n"nasnet_a_large_331_2.3" +ic336oc336_ih21oh21kh1sh1dh0ph0_n"nasnet_a_large_331_2.4" +ic672oc672_ih11oh11kh1sh1dh0ph0_n"nasnet_a_large_331_2.5" + +--attr-post-ops=dw:k7s1p3 +ic96oc42_ih83oh83kh1sh1dh0ph0_n"nasnet_a_large_331_3.1" +ic84oc84_ih42oh42kh1sh1dh0ph0_n"nasnet_a_large_331_3.2" +ic336oc336_ih21oh21kh1sh1dh0ph0_n"nasnet_a_large_331_3.3" +ic672oc672_ih11oh11kh1sh1dh0ph0_n"nasnet_a_large_331_3.4" + + +# bf16 dw with extended kernels, strides and padding. 
+--reset +--skip-impl= +--cfg=bf16bf16bf16 +--mb=1,2,16 + +# effD1 +--attr-post-ops=dw:k3s1p1:bf16 +ic32oc16_ih320oh320kh1sh1dh0ph0_n"effD1_1.1" +ic24oc144_ih160oh160kh1sh1dh0ph0_n"effD1_1.2" +ic80oc480_ih40oh40kh1sh1dh0ph0_n"effD1_1.3" +ic192oc1152_ih20oh20kh1sh1dh0ph0_n"effD1_1.4" +ic320oc1920_ih20oh20kh1sh1dh0ph0_n"effD1_1.5" +ic88oc88_ih10oh10kh1sh1dh0ph0_n"effD1_1.6" +ic88oc88_ih20oh20kh1sh1dh0ph0_n"effD1_1.7" +ic112oc88_ih40oh40kh1sh1dh0ph0_n"effD1_1.8" +ic40oc88_ih80oh80kh1sh1dh0ph0_n"effD1_1.9" +ic88oc88_ih5oh5kh1sh1dh0ph0_n"effD1_1.10" + +--attr-post-ops=dw:k3s2p0:bf16 +ic16oc96_ih320oh320kh1sh1dh0ph0_n"effD1_2.1" +ic40oc240_ih80oh80kh1sh1dh0ph0_n"effD1_2.2" + +--attr-post-ops=dw:k5s2p1:bf16 +ic24oc144_ih160oh160kh1sh1dh0ph0_n"effD1_3.1" +ic112oc672_ih40oh40kh1sh1dh0ph0_n"effD1_3.2" + +--attr-post-ops=dw:k5s1p2:bf16 +ic40oc240_ih80oh80kh1sh1dh0ph0_n"effD1_4.1" +ic80oc480_ih40oh40kh1sh1dh0ph0_n"effD1_4.2" +ic112oc672_ih40oh40kh1sh1dh0ph0_n"effD1_4.3" +ic192oc1152_ih20oh20kh1sh1dh0ph0_n"effD1_4.4" + +# effD4 +--attr-post-ops=dw:k3s1p1:bf16 +ic48oc24_ih512oh512kh1sh1dh0ph0_n"effD4_1.1" +ic32oc192_ih256oh256kh1sh1dh0ph0_n"effD4_1.2" +ic112oc672_ih64oh64kh1sh1dh0ph0_n"effD4_1.3" +ic272oc1632_ih32oh32kh1sh1dh0ph0_n"effD4_1.4" +ic448oc2688_ih32oh32kh1sh1dh0ph0_n"effD4_1.5" +ic448oc224_ih8oh8kh1sh1dh0ph0_n"effD4_1.6" +ic448oc224_ih16oh16kh1sh1dh0ph0_n"effD4_1.7" +ic448oc224_ih32oh32kh1sh1dh0ph0_n"effD4_1.8" +ic448oc224_ih64oh64kh1sh1dh0ph0_n"effD4_1.9" +ic448oc224_ih128oh128kh1sh1dh0ph0_n"effD4_1.10" + +--attr-post-ops=dw:k3s2p0:bf16 +ic24oc144_ih512oh512kh1sh1dh0ph0_n"effD4_2.1" +ic56oc336_ih128oh128kh1sh1dh0ph0_n"effD4_2.2" + +--attr-post-ops=dw:k5s2p1:bf16 +ic32oc192_ih256oh256kh1sh1dh0ph0_n"effD4_3.1" +ic160oc960_ih64oh64kh1sh1dh0ph0_n"effD4_3.2" + +--attr-post-ops=dw:k5s1p2:bf16 +ic56oc336_ih128oh128kh1sh1dh0ph0_n"effD4_4.1" +ic112oc672_ih64oh64kh1sh1dh0ph0_n"effD4_4.2" +ic160oc960_ih64oh64kh1sh1dh0ph0_n"effD4_4.3" +ic272oc1632_ih32oh32kh1sh1dh0ph0_n"effD4_4.4" + +# faster_rcnn_nas_lowproposals_coco +--attr-post-ops=dw:k3s1p1:bf16 +ic42oc42_ih300oh300kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_1.1" +ic168oc168_ih150oh150kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_1.2" +ic84oc84_ih150oh150kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_1.3" +ic336oc336_ih75oh75kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_1.4" +ic672oc672_ih9oh9kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_1.5" + +--attr-post-ops=dw:k5s1p2:bf16 +ic42oc42_ih300oh300kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_2.1" +ic84oc84_ih150oh150kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_2.2" +ic168oc168_ih150oh150kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_2.3" +ic336oc336_ih75oh75kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_2.4" +ic672oc672_ih9oh9kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_2.5" + +--attr-post-ops=dw:k7s1p3:bf16 +ic96oc42_ih300oh300kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_3.1" +ic84oc84_ih150oh150kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_3.2" +ic336oc336_ih75oh75kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_3.3" +ic672oc672_ih9oh9kh1sh1dh0ph0_n"faster_rcnn_nas_lowproposals_coco_3.4" + +# deeplab +--attr-post-ops=dw:k3s1p0:bf16 +ic64oc384_ih129oh129kh1sh1dh0ph0_n"deeplab_1.1" +ic96oc576_ih129oh129kh1sh1dh0ph0_n"deeplab_1.2" +ic160oc960_ih129oh129kh1sh1dh0ph0_n"deeplab_1.3" + +--attr-post-ops=dw:k3s1p1:bf16 +ic24oc144_ih257oh257kh1sh1dh0ph0_n"deeplab_2.1" +ic32oc192_ih129oh129kh1sh1dh0ph0_n"deeplab_2.2" + +--attr-post-ops=dw:k3s2p1:bf16 
+ic16oc96_ih513oh513kh1sh1dh0ph0_n"deeplab_3.1" +ic24oc144_ih257oh257kh1sh1dh0ph0_n"deeplab_3.2" + +# deeplab_v3 +--attr-post-ops=dw:k3s1p0:bf16 +ic64oc384_ih65oh65kh1sh1dh0ph0_n"deeplab_v3_1.1" +ic96oc576_ih65oh65kh1sh1dh0ph0_n"deeplab_v3_1.2" +ic160oc960_ih65oh65kh1sh1dh0ph0_n"deeplab_v3_1.3" + +--attr-post-ops=dw:k3s1p1:bf16 +ic24oc144_ih129oh129kh1sh1dh0ph0_n"deeplab_v3_2.1" +ic32oc192_ih65oh65kh1sh1dh0ph0_n"deeplab_v3_2.2" + +--attr-post-ops=dw:k3s2p1:bf16 +ic16oc96_ih257oh257kh1sh1dh0ph0_n"deeplab_v3_3.1" +ic24oc144_ih129oh129kh1sh1dh0ph0_n"deeplab_v3_3.2" + +# rmnet_ssd +--attr-post-ops=dw:k3s1p1:bf16 +ic32oc8_ih200oh200kh1sh1dh0ph0_n"rmnet_ssd_1.1" +ic64oc16_ih100oh100kh1sh1dh0ph0_n"rmnet_ssd_1.2" +ic128oc32_ih50oh50kh1sh1dh0ph0_n"rmnet_ssd_1.3" +ic256oc64_ih25oh25kh1sh1dh0ph0_n"rmnet_ssd_1.4" +ic128oc128_ih50oh50kh1sh1dh0ph0_n"rmnet_ssd_1.5" +ic256oc256_ih25oh25kh1sh1dh0ph0_n"rmnet_ssd_1.6" + +--attr-post-ops=dw:k3s2p0:bf16 +ic32oc16_ih200oh200kh1sh1dh0ph0_n"rmnet_ssd_2.1" +ic64oc32_ih100oh100kh1sh1dh0ph0_n"rmnet_ssd_2.2" +ic128oc64_ih50oh50kh1sh1dh0ph0_n"rmnet_ssd_2.3" + +# nasnet_a_large_331 +--attr-post-ops=dw:k3s1p1:bf16 +ic42oc42_ih83oh83kh1sh1dh0ph0_n"nasnet_a_large_331_1.1" +ic168oc168_ih42oh42kh1sh1dh0ph0_n"nasnet_a_large_331_1.2" +ic336oc336_ih21oh21kh1sh1dh0ph0_n"nasnet_a_large_331_1.3" +ic672oc672_ih11oh11kh1sh1dh0ph0_n"nasnet_a_large_331_1.4" + +--attr-post-ops=dw:k5s1p1:bf16 +ic42oc42_ih83oh83kh1sh1dh0ph0_n"nasnet_a_large_331_2.1" +ic84oc84_ih42oh42kh1sh1dh0ph0_n"nasnet_a_large_331_2.2" +ic168oc168_ih42oh42kh1sh1dh0ph0_n"nasnet_a_large_331_2.3" +ic336oc336_ih21oh21kh1sh1dh0ph0_n"nasnet_a_large_331_2.4" +ic672oc672_ih11oh11kh1sh1dh0ph0_n"nasnet_a_large_331_2.5" + +--attr-post-ops=dw:k7s1p3:bf16 +ic96oc42_ih83oh83kh1sh1dh0ph0_n"nasnet_a_large_331_3.1" +ic84oc84_ih42oh42kh1sh1dh0ph0_n"nasnet_a_large_331_3.2" +ic336oc336_ih21oh21kh1sh1dh0ph0_n"nasnet_a_large_331_3.3" +ic672oc672_ih11oh11kh1sh1dh0ph0_n"nasnet_a_large_331_3.4" diff --git a/tests/benchdnn/utils/parser.cpp b/tests/benchdnn/utils/parser.cpp index c17a20f788f..f46158bd17d 100644 --- a/tests/benchdnn/utils/parser.cpp +++ b/tests/benchdnn/utils/parser.cpp @@ -170,7 +170,7 @@ bool parse_attr_post_ops(std::vector &po, const char *str, = "POST-OPS\n Specifies post-ops attribute. 
`POST-OPS` syntax " "is one of those:\n * SUM[:SCALE[:ZERO_POINT[:DATA_TYPE]]]\n " " * ELTWISE[:ALPHA[:BETA[:SCALE]]]\n * " - "DW_K3S1P1[:DST_DT[:OUTPUTSCALE]]\n * " + "DW:KkSsPp[:DST_DT[:OUTPUTSCALE]]\n * " "BINARY:DT[:POLICY[:TAG]]\n More details at " "https://github.com/oneapi-src/oneDNN/blob/master/tests/benchdnn/" "doc/knobs_attr.md\n"; diff --git a/tests/gtests/test_iface_attr.cpp b/tests/gtests/test_iface_attr.cpp index a301600f9a6..42bbbddd777 100644 --- a/tests/gtests/test_iface_attr.cpp +++ b/tests/gtests/test_iface_attr.cpp @@ -462,6 +462,26 @@ HANDLE_EXCEPTIONS_FOR_TEST_F(attr_test_t, DepthwiseFusionPostop) { ASSERT_EQ(dst_dt, memory::data_type::f32); ASSERT_EQ(scales_mask, 0); ASSERT_EQ(scales_in, scales_out); + + scales_in = {}; + ops.append_dw(memory::data_type::s8, memory::data_type::f32, + memory::data_type::u8, 5, 2, 1, 0, scales_in); + attr.set_post_ops(ops); + + ASSERT_EQ(attr.get_post_ops().kind(3), primitive::kind::convolution); + + memory::dim kernel, stride, padding; + attr.get_post_ops().get_params_dw(3, wei_dt, bias_dt, dst_dt, kernel, + stride, padding, scales_mask, scales_out); + + ASSERT_EQ(wei_dt, memory::data_type::s8); + ASSERT_EQ(bias_dt, memory::data_type::f32); + ASSERT_EQ(dst_dt, memory::data_type::u8); + ASSERT_EQ(kernel, 5); + ASSERT_EQ(stride, 2); + ASSERT_EQ(padding, 1); + ASSERT_EQ(scales_mask, 0); + ASSERT_EQ(scales_in, scales_out); } HANDLE_EXCEPTIONS_FOR_TEST_F(attr_test_t, DepthwiseFusion) {
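For completeness, a minimal sketch of the same round-trip through the new C functions added to `dnnl.h`; the data types and geometry below are illustrative assumptions that mirror the gtest above, not requirements of the API:

```cpp
#include "oneapi/dnnl/dnnl.h"

int main() {
    dnnl_post_ops_t ops;
    if (dnnl_post_ops_create(&ops) != dnnl_success) return 1;

    // kernel = 5, stride = 2, left padding = 1; no output scales
    // (count = 0, mask = 0, scales = nullptr).
    if (dnnl_post_ops_append_dw(ops, dnnl_s8, dnnl_f32, dnnl_u8,
                /*kernel_size=*/5, /*stride_size=*/2, /*padding_l_size=*/1,
                /*count=*/0, /*mask=*/0, /*scales=*/nullptr)
            != dnnl_success)
        return 1;

    // Query the parameters back through the generalized getter.
    dnnl_data_type_t wei_dt, bias_dt, dst_dt;
    dnnl_dim_t kernel, stride, padding, count;
    int mask;
    const float *scales;
    if (dnnl_post_ops_get_params_dw(ops, 0, &wei_dt, &bias_dt, &dst_dt,
                &kernel, &stride, &padding, &count, &mask, &scales)
            != dnnl_success)
        return 1;

    dnnl_post_ops_destroy(ops);
    return 0;
}
```

Under the updated benchdnn syntax, the matching knob for this configuration would be `--attr-post-ops=dw:k5s2p1:u8`.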