This repository was archived by the owner on Nov 17, 2023. It is now read-only.

Commit ad1ff3a

xziya authored and TaoLv committed
[v1.6.x] Cherry-pick MKL-DNN Rnn operator enhancements to v1.6.x (#17225)
* [MKLDNN] mkldnn RNN operator enhancement (#17075)
  * mkldnn rnn operator enhancement
    * `add` operation support
    * Rename AddTo
    * Add MXNET_USE_MKLDNN_RNN env
    * Add Env var for switching to naive RNN impl and naive add/copy impl
  * Re-run CI, op:test_reduce failed on Unix-CPU
  * Rerun CI, Python2 CPU on Unix-CPU timeout
* MKL-DNN RNN backward path enhancement (#17183)
  * Flush memory before RNN backward primitive
  * Add gluon rnn unit test for gradients check
  * Cache reorder
  * Re-write rnn supporting check
  * Update OpSignature.AddSign to avoid potential hash collision for rnn-packed memory
  * Get the data type from mkldnn memory descriptor when setting grad handle
1 parent 0015fc3 commit ad1ff3a

9 files changed: 466 additions & 219 deletions


docs/static_site/src/pages/api/faq/env_var.md

Lines changed: 8 additions & 4 deletions
@@ -283,11 +283,11 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.
   If no such algorithm exists given other constraints, MXNet will error out. This variable affects the choice
   of CUDNN convolution algorithms. Please see [CUDNN developer guide](https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html) for more details.
 
-* MXNET_CPU_PARALLEL_COPY_SIZE
+* MXNET_CPU_PARALLEL_SIZE
   - Values: Int ```(default=200000)```
-  - The minimum size to call parallel copy by OpenMP in CPU2CPU mode.
-  - When the array size is bigger than or equal to this threshold, NDArray::Copy(from, to) is implemented by OpenMP with the Recommended OMP Thread Count.
-  - When the array size is less than this threshold, NDArray::Copy(from, to) is implemented by memcpy in single thread.
+  - The minimum size to call parallel operations by OpenMP for CPU context.
+  - When the array size is bigger than or equal to this threshold, the operation implemented by OpenMP is executed with the Recommended OMP Thread Count.
+  - When the array size is less than this threshold, the operation is implemented naively in single thread.
 
 * MXNET_OPTIMIZER_AGGREGATION_SIZE
   - Values: Int ```(default=4)```

@@ -343,6 +343,10 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.
   - Values: 0(false) or 1(true) ```(default=1)```
   - If this variable is set, MXNet will simplify the computation graph, eliminating duplicated operations on the same inputs.
 
+* MXNET_USE_MKLDNN_RNN
+  - Values: 0(false) or 1(true) ```(default=1)```
+  - This variable controls whether to use the MKL-DNN backend in fused RNN operator for CPU context. There are two fusion implementations of RNN operator in MXNet. The MKL-DNN implementation has a better performance than the naive one, but the latter is more stable in the backward operation currently.
+
 Settings for Minimum Memory Usage
 ---------------------------------
   - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1```
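As an illustration of the behaviour documented above, the following standalone sketch mimics the gate that MXNET_USE_MKLDNN_RNN controls. std::getenv stands in for dmlc::GetEnv, and the Input struct and main() driver are invented for demonstration; the real check is SupportMKLDNNRnn in the mkldnn_base-inl.h diff further below.

#include <cstdio>
#include <cstdlib>

struct Input { int dtype; int ndim; };   // invented stand-in for NDArray dtype()/shape().ndim()
constexpr int kFloat32 = 0;              // value of mshadow::kFloat32

bool use_mkldnn_rnn(const Input& in) {
  const char* v = std::getenv("MXNET_USE_MKLDNN_RNN");
  const int enabled = v ? std::atoi(v) : 1;  // documented default is 1
  return enabled != 0 && in.dtype == kFloat32 && in.ndim == 3;
}

int main() {
  Input x{kFloat32, 3};                      // float32, 3-D input: eligible for the MKL-DNN path
  std::printf("MKL-DNN fused RNN: %s\n", use_mkldnn_rnn(x) ? "enabled" : "naive fallback");
  return 0;
}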

src/common/utils.h

Lines changed: 19 additions & 1 deletion
@@ -760,7 +760,7 @@ inline void EmplaceBackZeros(const NDArrayStorageType stype, const mxnet::TShape
  */
 template<typename DType>
 inline void ParallelCopy(DType* dst, const DType* src, index_t size) {
-  static index_t copy_block_size = dmlc::GetEnv("MXNET_CPU_PARALLEL_COPY_SIZE", 200000);
+  static index_t copy_block_size = dmlc::GetEnv("MXNET_CPU_PARALLEL_SIZE", 200000);
   if (size >= copy_block_size) {
     #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
     for (index_t i = 0; i < size; ++i) {

@@ -771,6 +771,24 @@ inline void ParallelCopy(DType* dst, const DType* src, index_t size) {
   }
 }
 
+/*!
+ * \breif parallelize add by OpenMP
+ */
+template<typename DType>
+inline void ParallelAdd(DType* dst, const DType* src, index_t size) {
+  static index_t add_block_size = dmlc::GetEnv("MXNET_CPU_PARALLEL_SIZE", 200000);
+  if (size >= add_block_size) {
+    #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
+    for (index_t i = 0; i < size; ++i) {
+      dst[i] += src[i];
+    }
+  } else {
+    for (index_t i = 0; i < size; ++i) {
+      dst[i] += src[i];
+    }
+  }
+}
+
 /*!
  * \brief If numpy compatibility is turned off (default), the shapes passed in
  * by users follow the legacy shape definition:
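For context, here is a standalone sketch of the threshold pattern that ParallelCopy and ParallelAdd now share. The MXNET_CPU_PARALLEL_SIZE name and the 200000-element default come from the diff above; the rest (std::getenv in place of dmlc::GetEnv, the default OpenMP thread count, the main() driver) is illustrative only. Compile with -fopenmp to enable the parallel branch.

#include <cstdlib>
#include <vector>

// Threshold lookup: defaults to 200000 elements, as documented for MXNET_CPU_PARALLEL_SIZE.
static long parallel_threshold() {
  const char* v = std::getenv("MXNET_CPU_PARALLEL_SIZE");
  return v ? std::atol(v) : 200000;
}

// Same shape as ParallelAdd above: OpenMP loop for large arrays, plain loop otherwise.
void parallel_add(float* dst, const float* src, long size) {
  static const long threshold = parallel_threshold();
  if (size >= threshold) {
    #pragma omp parallel for
    for (long i = 0; i < size; ++i) dst[i] += src[i];
  } else {
    for (long i = 0; i < size; ++i) dst[i] += src[i];  // naive single-threaded path
  }
}

int main() {
  std::vector<float> dst(1 << 20, 1.0f), src(1 << 20, 2.0f);
  parallel_add(dst.data(), src.data(), static_cast<long>(dst.size()));
  return dst[0] == 3.0f ? 0 : 1;  // every element is now 1.0f + 2.0f
}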

src/operator/nn/mkldnn/mkldnn_base-inl.h

Lines changed: 6 additions & 3 deletions
@@ -132,9 +132,12 @@ static inline bool SupportMKLDNN(int dtype, const mxnet::TShape &shape) {
   return dtype == mshadow::kFloat32 && (ndim == 1 || ndim == 2 || ndim == 4);
 }
 
-static inline bool SupportMKLDNNRNN(const NDArray &input) {
-  int ndim = input.shape().ndim();
-  return (input.dtype() == mshadow::kFloat32) && (ndim == 3);
+static inline bool SupportMKLDNNRnn(const NDArray &input) {
+  if (input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 3
+      && dmlc::GetEnv("MXNET_USE_MKLDNN_RNN", 1)) {
+    return true;
+  }
+  return false;
 }
 
 static inline bool SupportMKLDNNQuantize(int dtype) {

src/operator/nn/mkldnn/mkldnn_rnn-inl.h

Lines changed: 21 additions & 17 deletions
@@ -120,33 +120,32 @@ class RnnPrimitive {
   template<typename rnn_fwd, typename... Args>
   static RnnPrimitive Create(Args&&... args) {
     RnnPrimitive rnn_fwd_prim;
-    rnn_fwd_prim.pd_.reset(
-        new typename rnn_fwd::desc(std::forward<Args>(args)...),
-        [](typename rnn_fwd::desc* pd) {
-          delete reinterpret_cast<typename rnn_fwd::desc*>(pd);
+    auto fwd_desc = typename rnn_fwd::desc(std::forward<Args>(args)...);
+    rnn_fwd_prim.fwd_pd_.reset(
+        new typename rnn_fwd::primitive_desc(fwd_desc, CpuEngine::Get()->get_engine()),
+        [](typename rnn_fwd::primitive_desc* pd) {
+          delete reinterpret_cast<typename rnn_fwd::primitive_desc*>(pd);
         });
-    const typename rnn_fwd::desc& fwd_desc =
-        *(reinterpret_cast<typename rnn_fwd::desc*>(rnn_fwd_prim.pd_.get()));
-    typename rnn_fwd::primitive_desc fwd_pd(fwd_desc, CpuEngine::Get()->get_engine());
-    rnn_fwd_prim.weights_layer_desc_ = fwd_pd.weights_layer_desc();
-    rnn_fwd_prim.weights_iter_desc_ = fwd_pd.weights_iter_desc();
-    rnn_fwd_prim.workspace_desc_ = fwd_pd.workspace_desc();
+    auto fwd_pd = reinterpret_cast<typename rnn_fwd::primitive_desc*>(rnn_fwd_prim.fwd_pd_.get());
+    rnn_fwd_prim.weights_layer_desc_ = fwd_pd->weights_layer_desc();
+    rnn_fwd_prim.weights_iter_desc_ = fwd_pd->weights_iter_desc();
+    rnn_fwd_prim.workspace_desc_ = fwd_pd->workspace_desc();
 
-    rnn_fwd_prim.primitive_ = std::shared_ptr<mkldnn::primitive>(new rnn_fwd(fwd_pd));
+    rnn_fwd_prim.primitive_ = std::shared_ptr<mkldnn::primitive>(new rnn_fwd(*fwd_pd));
 
     return rnn_fwd_prim;
   }
 
   RnnPrimitive() {
-    this->pd_ = nullptr;
+    this->fwd_pd_ = nullptr;
     this->primitive_ = nullptr;
     this->weights_layer_desc_ = mkldnn::memory::desc();
     this->weights_iter_desc_ = mkldnn::memory::desc();
     this->workspace_desc_ = mkldnn::memory::desc();
   }
 
   RnnPrimitive(const RnnPrimitive& rnn_fwd_prim) {
-    this->pd_ = rnn_fwd_prim.pd_;
+    this->fwd_pd_ = rnn_fwd_prim.fwd_pd_;
     this->primitive_ = rnn_fwd_prim.primitive_;
     this->weights_layer_desc_ = rnn_fwd_prim.weights_layer_desc_;
     this->weights_iter_desc_ = rnn_fwd_prim.weights_iter_desc_;

@@ -155,7 +154,7 @@ class RnnPrimitive {
 
   RnnPrimitive& operator=(const RnnPrimitive& rnn_fwd_prim) {
     if (this != &rnn_fwd_prim) {
-      this->pd_ = rnn_fwd_prim.pd_;
+      this->fwd_pd_ = rnn_fwd_prim.fwd_pd_;
       this->primitive_ = rnn_fwd_prim.primitive_;
       this->weights_layer_desc_ = rnn_fwd_prim.weights_layer_desc_;
       this->weights_iter_desc_ = rnn_fwd_prim.weights_iter_desc_;

@@ -165,7 +164,7 @@ class RnnPrimitive {
     return *this;
   }
 
-  const void* GetPrimDesc() const { return pd_.get(); }
+  const void* GetPrimDesc() const { return fwd_pd_.get(); }
   const mkldnn::primitive& GetPrim() const { return *primitive_; }
 
   const mkldnn::memory::desc& GetLayerDesc() const {

@@ -181,7 +180,7 @@ class RnnPrimitive {
   }
 
  private:
-  std::shared_ptr<void> pd_;
+  std::shared_ptr<void> fwd_pd_;
   std::shared_ptr<mkldnn::primitive> primitive_;
   mkldnn::memory::desc weights_layer_desc_;
   mkldnn::memory::desc weights_iter_desc_;
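One detail worth noting in the RnnPrimitive refactor above: the typed primitive_desc is kept behind a type-erased std::shared_ptr<void> (fwd_pd_), and destruction stays well defined because the deleter lambda captures the concrete type. Below is a minimal standalone sketch of that pattern, using an invented stand-in type rather than the MKL-DNN classes.

#include <cstdio>
#include <memory>

struct FakePrimitiveDesc {                       // invented stand-in for rnn_fwd::primitive_desc
  int cell_kind;
  ~FakePrimitiveDesc() { std::printf("typed destructor ran\n"); }
};

int main() {
  // Store a typed object behind shared_ptr<void>; the deleter keeps the
  // concrete type, so destruction does not go through a void*.
  std::shared_ptr<void> fwd_pd(new FakePrimitiveDesc{42},
                               [](FakePrimitiveDesc* p) { delete p; });

  // Callers that know the type cast it back, much as RnnPrimitive::Create
  // does with reinterpret_cast on fwd_pd_.get().
  auto* typed = static_cast<FakePrimitiveDesc*>(fwd_pd.get());
  std::printf("cell_kind = %d\n", typed->cell_kind);
  return 0;  // the typed deleter runs here
}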
@@ -370,7 +369,10 @@ class MKLDNNRnnBackward {
   void SetDataGradsMem(void* diff_src, void* diff_state, void* diff_statecell,
                        void* diff_out, void* diff_state_out, void* diff_statecell_out,
                        const int dtype = mshadow::kFloat32);
-  void CommitWeightsDiff(void* diff_weights, void* diff_bias, const int dtype = mshadow::kFloat32);
+  void SetNativeWeightsGrads() const;
+  void CommitWeightsGrads(void* diff_weights, void* diff_bias,
+                          const OpReqType req,
+                          const int dtype = mshadow::kFloat32);
 
   const mkldnn::primitive& GetBwd() const { return *bwd_.primitive_; }
   const mkldnn_args_map_t& GetArgsMap() const { return net_args_; }

@@ -385,6 +387,8 @@
 
   mkldnn_shared_mem_t diff_weights_layer_;
   mkldnn_shared_mem_t diff_weights_iter_;
+  mkldnn_shared_mem_t diff_weights_layer_r_;
+  mkldnn_shared_mem_t diff_weights_iter_r_;
   mkldnn_shared_mem_t diff_bias_;
 
   mkldnn_args_map_t net_args_;
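The new CommitWeightsGrads signature threads an OpReqType through to the gradient commit, which lines up with the commit message's "`add` operation support". Its body is not part of this diff; the sketch below only illustrates how such a request flag typically selects between overwriting and accumulating (the enum mirrors mxnet::OpReqType's ordering, everything else is invented).

#include <cstddef>
#include <cstdio>
#include <vector>

enum OpReq { kNullOp, kWriteTo, kWriteInplace, kAddTo };  // mirrors mxnet::OpReqType's ordering

// Invented body: commit a weight-gradient buffer according to the request flag.
void commit_grads(float* dst, const float* src, std::size_t n, OpReq req) {
  switch (req) {
    case kWriteTo:
    case kWriteInplace:
      for (std::size_t i = 0; i < n; ++i) dst[i] = src[i];    // overwrite (ParallelCopy's role)
      break;
    case kAddTo:
      for (std::size_t i = 0; i < n; ++i) dst[i] += src[i];   // accumulate (ParallelAdd's role)
      break;
    case kNullOp:
      break;                                                   // nothing requested
  }
}

int main() {
  std::vector<float> grad(4, 1.0f), update(4, 0.5f);
  commit_grads(grad.data(), update.data(), grad.size(), kAddTo);
  std::printf("grad[0] = %.1f\n", grad[0]);  // prints 1.5
  return 0;
}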
