Improve split operator by oneDNN reorder primitive (#20757)

bgawrych · web-flow · commit 50a8ee85efd6 · 2022-01-13T11:38:55.000+01:00
* Add oneDNN support for array_split operator

* benchmark.py

* refactor

* update

* review fixes

* fix sanity

* fix

* review

* Apply review comments
diff --git a/src/operator/nn/dnnl/dnnl_base-inl.h b/src/operator/nn/dnnl/dnnl_base-inl.h
@@ -197,6 +197,7 @@ bool SupportDNNLTranspose(const NDArray& data);
 bool SupportDNNLBatchDot(const std::vector<NDArray>& inputs, const NDArray& output);
 bool SupportDNNLLayerNorm(const LayerNormParam& param, const std::vector<NDArray>& inputs);
 bool SupportDNNLReshape(const NDArray& input, const NDArray& output);
+bool SupportDNNLSplit(const NDArray& input);
 bool SupportDNNLStack(const std::vector<NDArray>& inputs);
 }  // namespace op
 
diff --git a/src/operator/nn/dnnl/dnnl_ops-inl.h b/src/operator/nn/dnnl/dnnl_ops-inl.h
@@ -132,6 +132,12 @@ void DNNLSoftmaxOutputForward(const nnvm::NodeAttrs& attrs,
                               const std::vector<OpReqType>& req,
                               const std::vector<NDArray>& out_data);
 
+void DNNLSplitForward(const nnvm::NodeAttrs& attrs,
+                      const OpContext& ctx,
+                      const std::vector<NDArray>& inputs,
+                      const std::vector<OpReqType>& req,
+                      const std::vector<NDArray>& outputs);
+
 /* For sum */
 void DNNLSumForward(const nnvm::NodeAttrs& attrs,
                     const OpContext& ctx,
diff --git a/src/operator/nn/dnnl/dnnl_split-inl.h b/src/operator/nn/dnnl/dnnl_split-inl.h
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_split-inl.h
+ */
+
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_SPLIT_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_SPLIT_INL_H_
+
+#if MXNET_USE_ONEDNN == 1
+#include <vector>
+
+#include "./dnnl_base-inl.h"
+#include "./dnnl_ops-inl.h"
+
+namespace mxnet {
+namespace op {
+
+using split_fwd_t    = dnnl::reorder;
+using split_fwd_pd_t = dnnl::reorder::primitive_desc;
+
+class DNNLSplitFwd {  // oneDNN split forward: one reorder primitive per non-empty output
+ public:
+  struct Tensors {  // non-owning bundle of the operator's input and output arrays
+    Tensors(const NDArray& input, const std::vector<NDArray>& outputs);
+
+    const NDArray& input;                 // held by reference; referent must outlive this
+    const std::vector<NDArray>& outputs;  // held by reference; referent must outlive this
+  };
+  /*! \brief Return the thread-local cached object for this key, constructing it on a miss. */
+  static DNNLSplitFwd& GetCached(const SplitParam& param,
+                                 const Tensors& tensors,
+                                 const TShape& split_pts,
+                                 const int split_axis);
+  /*! \brief Build a reorder primitive for every output whose shape has a non-zero size. */
+  DNNLSplitFwd(const Tensors& tensors, const TShape& split_pts, const int split_axis);
+  /*! \brief Register one reorder per non-empty output on the DNNL stream, then submit it. */
+  void Execute(const Tensors& tensors,
+               const TShape& split_pts,
+               const int split_axis,
+               const std::vector<OpReqType>& req) const;
+
+ private:
+  std::vector<split_fwd_t> split_fwds;    // reorder primitives, in order of non-empty outputs
+  std::vector<split_fwd_pd_t> split_pds;  // primitive descriptors, parallel to split_fwds
+  dnnl::memory::dims strides;             // row-major element strides from the input shape
+};
+
+}  // namespace op
+}  // namespace mxnet
+#endif
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_SPLIT_INL_H_
diff --git a/src/operator/nn/dnnl/dnnl_split.cc b/src/operator/nn/dnnl/dnnl_split.cc
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_split.cc
+ */
+
+#if MXNET_USE_ONEDNN == 1
+
+#include "../../tensor/matrix_op-inl.h"
+#include "./dnnl_split-inl.h"
+
+namespace mxnet {
+namespace op {
+
+bool SupportDNNLSplit(const NDArray& input) {  // dtype whitelist for the oneDNN split path
+  const int dtype = input.dtype();
+  return dtype == mshadow::kFloat32 || dtype == mshadow::kBfloat16 || dtype == mshadow::kInt32 ||
+         dtype == mshadow::kInt8 || dtype == mshadow::kUint8;
+}
+
+void DNNLSplitForward(const nnvm::NodeAttrs& attrs,
+                      const OpContext& ctx,
+                      const std::vector<NDArray>& inputs,
+                      const std::vector<OpReqType>& req,
+                      const std::vector<NDArray>& outputs) {
+  // oneDNN forward entry point: resolve the axis and split points, then run the cached object.
+  const SplitParam& split_param = dmlc::get<SplitParam>(attrs.parsed);
+  const DNNLSplitFwd::Tensors io(inputs[0], outputs);
+  const auto& in_shape = io.input.shape();
+  // A negative axis is an offset from the number of input dimensions.
+  const int axis = split_param.axis < 0 ? split_param.axis + in_shape.ndim() : split_param.axis;
+  // sections > 0: points come from GetSplitIndices; otherwise use the explicit indices.
+  const mxnet::TShape points = (split_param.sections > 0) ?
+                                   GetSplitIndices(in_shape, axis, split_param.sections) :
+                                   split_param.indices;
+  DNNLSplitFwd::GetCached(split_param, io, points, axis).Execute(io, points, axis, req);
+}
+
+DNNLSplitFwd::Tensors::Tensors(const NDArray& input, const std::vector<NDArray>& outputs)
+    : input(input), outputs(outputs) {}  // binds the reference members; nothing is copied
+
+using DNNLSplitSignature = ParamOpSign<SplitParam>;
+
+DNNLSplitFwd& DNNLSplitFwd::GetCached(const SplitParam& param,
+                                      const Tensors& tensors,
+                                      const TShape& split_pts,
+                                      const int split_axis) {
+  using CacheMap = std::unordered_map<DNNLSplitSignature, DNNLSplitFwd, OpHash>;
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local CacheMap cache;  // one cache instance per thread
+#else
+  static MX_THREAD_LOCAL CacheMap cache;
+#endif
+  DNNLSplitSignature signature(param);  // key: params + input + outputs + split points + axis
+  signature.AddSign(tensors.input);
+  signature.AddSign(tensors.outputs);
+  signature.AddSign(split_pts);
+  signature.AddSign(split_axis);
+  auto cached = cache.find(signature);
+  if (cached == cache.end()) {  // miss: construct the object and hand it to AddToCache
+    DNNLSplitFwd fresh(tensors, split_pts, split_axis);
+    cached = AddToCache(&cache, signature, fresh);
+  }
+  return cached->second;
+}
+
+DNNLSplitFwd::DNNLSplitFwd(const Tensors& tensors, const TShape& split_pts, const int split_axis) {
+  const auto cpu_engine = CpuEngine::Get()->get_engine();
+  const auto input      = tensors.input.Reorder2Default();
+  const auto& ishape    = input.shape();
+  const auto dtype      = get_dnnl_type(input.dtype());
+  const auto format_tag = static_cast<dnnl::memory::format_tag>(GetDefaultFormat(ishape.ndim()));
+  // Row-major element strides of the full input; every source slice keeps these strides.
+  strides = dnnl::memory::dims(ishape.ndim(), 1);
+  // last dim stride = 1, start loop from the penultimate
+  for (int i = ishape.ndim() - 2; i >= 0; --i) {
+    strides[i] = strides[i + 1] * ishape[i + 1];
+  }
+  // One reorder per non-empty output; empty outputs get no primitive (Execute skips them too).
+  const int num_outputs = static_cast<int>(tensors.outputs.size());
+  for (int i = 0; i < num_outputs; ++i) {
+    if (tensors.outputs[i].shape().Size() == 0) {
+      continue;
+    }
+    dnnl::memory::dims dnnl_dims(ishape.begin(), ishape.end());
+    // A slice ends at the next split point; the last slice ends at the extent of the split axis.
+    const bool is_last    = i + 1 >= split_pts.ndim();
+    dnnl_dims[split_axis] = (is_last ? ishape[split_axis] : split_pts[i + 1]) - split_pts[i];
+    // Source: slice dims with the full-input strides. Destination: default format for this rank.
+    auto in_mem_desc  = dnnl::memory::desc(dnnl_dims, dtype, strides);
+    auto out_mem_desc = dnnl::memory::desc(dnnl_dims, dtype, format_tag);
+
+    const auto split_pd = split_fwd_pd_t(cpu_engine, in_mem_desc, cpu_engine, out_mem_desc);
+    split_pds.emplace_back(split_pd);
+    split_fwds.emplace_back(split_fwd_t(split_pd));
+  }
+}
+
+void DNNLSplitFwd::Execute(const Tensors& tensors,
+                           const TShape& split_pts,
+                           const int split_axis,
+                           const std::vector<OpReqType>& req) const {
+  const auto& cpu_engine = CpuEngine::Get()->get_engine();
+  // Source slices use the strided descriptors built in the constructor from the input shape.
+  const auto& input_tensor = tensors.input.Reorder2Default();
+  std::byte* input_ptr     = reinterpret_cast<std::byte*>(input_tensor.data().dptr_);
+  // 64-bit byte offsets: a 32-bit int wraps for slices starting >= 2 GiB into the input.
+  const int64_t axis_offset = strides[split_axis] * GetTypeSize(input_tensor.dtype());
+  int out_idx = 0, primitive_idx = 0;
+  for (const auto& out : tensors.outputs) {
+    if (out.shape().Size() == 0) {  // empty outputs have no primitive; only out_idx advances
+      out_idx++;
+      continue;
+    }
+    const int64_t offset = split_pts[out_idx] * axis_offset;
+    auto in_mem = dnnl::memory(split_pds[primitive_idx].src_desc(), cpu_engine, input_ptr + offset);
+
+    auto out_mem = CreateDNNLMem(out, split_pds[primitive_idx].dst_desc(), req[out_idx]);
+    DNNLStream::Get()->RegisterPrimArgs(split_fwds[primitive_idx],
+                                        {{DNNL_ARG_SRC, in_mem}, {DNNL_ARG_DST, *out_mem.second}});
+
+    CommitOutput(out, out_mem);
+    ++out_idx;
+    ++primitive_idx;
+  }
+  DNNLStream::Get()->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif
diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
@@ -3062,6 +3062,11 @@ struct SplitParam : public dmlc::Parameter<SplitParam> {
     (*dict)["squeeze_axis"] = squeeze_axis_s.str();
     (*dict)["sections"]     = sections_s.str();
   }
+
+  bool operator==(const SplitParam& other) const {  // field-wise equality over all four fields
+    return axis == other.axis && sections == other.sections &&
+           squeeze_axis == other.squeeze_axis && indices == other.indices;
+  }
 };  // struct SplitParam
 
 inline mxnet::TShape GetSplitIndices(const mxnet::TShape& ishape, int axis, int sections) {
@@ -3451,6 +3456,17 @@ struct hash<mxnet::op::ExpandDimParam> {
   }
 };
 
+template <>
+struct hash<mxnet::op::SplitParam> {  // hashes the same four fields that operator== compares
+  size_t operator()(const mxnet::op::SplitParam& val) const {  // const: usable via const hasher
+    size_t ret = 0;
+    ret        = dmlc::HashCombine(ret, val.indices);
+    ret        = dmlc::HashCombine(ret, val.axis);
+    ret        = dmlc::HashCombine(ret, val.squeeze_axis);
+    ret        = dmlc::HashCombine(ret, val.sections);
+    return ret;
+  }
+};
 }  // namespace std
 
 #endif  // MXNET_OPERATOR_TENSOR_MATRIX_OP_INL_H_
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
@@ -30,6 +30,7 @@
 #include "../nn/dnnl/dnnl_reshape-inl.h"
 #include "../nn/dnnl/dnnl_slice-inl.h"
 #include "../nn/dnnl/dnnl_transpose-inl.h"
+#include "../nn/dnnl/dnnl_split-inl.h"
 #endif
 
 namespace mxnet {
@@ -1177,6 +1178,32 @@ Example::
     .add_argument("data", "NDArray-or-Symbol", "Input ndarray")
     .add_arguments(DepthToSpaceParam::__FIELDS__());
 
+#if MXNET_USE_ONEDNN == 1
+static void SplitForwardEx(const nnvm::NodeAttrs& attrs,
+                           const OpContext& op_ctx,
+                           const std::vector<NDArray>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<NDArray>& outputs) {
+  CHECK(!inputs.empty());
+  if (!SupportDNNLSplit(inputs[0])) {  // unsupported input: use the generic CPU kernel
+    FallBackCompute(SplitOpForward<cpu>, attrs, op_ctx, inputs, req, outputs);
+    return;
+  }
+  DNNL_OPCHECK_INIT(/*is backward*/ false, outputs.size(), inputs, outputs);
+  DNNLRun(DNNLSplitForward, attrs, op_ctx, inputs, req, outputs);
+  DNNL_OPCHECK_RUN(SplitOpForward<cpu>, attrs, op_ctx, inputs, req, outputs);
+}
+
+inline static bool SplitInferStorageType(const nnvm::NodeAttrs& attrs,
+                                         const int dev_mask,
+                                         DispatchMode* dispatch_mode,
+                                         std::vector<int>* in_attrs,
+                                         std::vector<int>* out_attrs) {
+  constexpr bool kSupportOneDNN = true;  // passed as the "support onednn" argument
+  return DNNLStorageType(attrs, dev_mask, kSupportOneDNN, dispatch_mode, in_attrs, out_attrs);
+}
+#endif  // MXNET_USE_ONEDNN == 1
+
 NNVM_REGISTER_OP(_split_v2)
     .add_alias("_npi_split")
     .add_alias("_npi_array_split")
@@ -1246,6 +1273,11 @@ Example::
                                 [](const NodeAttrs& n) {
                                   return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
                                 })
+#if MXNET_USE_ONEDNN == 1
+    .set_attr<FComputeEx>("FComputeEx<cpu>", SplitForwardEx)
+    .set_attr<bool>("TIsDNNL", true)
+    .set_attr<FInferStorageType>("FInferStorageType", SplitInferStorageType)
+#endif
     .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
     .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_split_v2_backward"})
     .add_argument("data", "NDArray-or-Symbol", "The input")