This repository was archived by the owner on Nov 17, 2023. It is now read-only.

Commit 36ed5e0

Authored by mk-61, DickJC123, and vcherepanov-nv

Port convolutions to cuDNN v8 API (#20635)

* Add failsafe flag to StorageManager Alloc()
* Clear sticky cudaErrorMemoryAllocation errors
* Make the Conv and Deconv cuDNN implementations use the v8 API. This copies changes previously implemented in the container; Dick Carter <dcarter@nvidia.com> made a number of improvements and fixes (memory use during auto-tuning, proper time calculation and time-limit cutoff in the auto-tuning sampler, etc.).
* Downgrade some C++17 code to C++14 to accommodate CUDA 10
* Relax the required cuDNN version to 8.0.2
* Use a newer cuDNN version in CI
* Don't verify the cmake.org certificate
* Disable the mobilenet inference test
* Re-format with the new clang-format config
* Fix cpplint after clang-format
* Disable fprop eng:5 to fix a test failure on M60
* Release conv autotune workspaces via DirectFree()
* Address review comments
* Appease clang-format
* Fix the default heuristics-mode logic and document the env var
* Add documentation for MXNET_CUDNN_ALGO_VERBOSE_LEVEL
* More review comments

Co-authored-by: Dick Carter <dcarter@nvidia.com>
Co-authored-by: Vladimir Cherepanov <vcherepanov@nvidia.com>
1 parent: 16fed6e

25 files changed: 1,925 additions and 1,909 deletions

ci/docker/Dockerfile.build.centos7 (1 addition, 1 deletion)

```diff
@@ -88,7 +88,7 @@ SHELL [ "/usr/bin/scl", "enable", "devtoolset-7", "rh-python38", "rh-maven35" ]

 # Install minimum required cmake version
 RUN cd /usr/local/src && \
-    wget -nv https://cmake.org/files/v3.13/cmake-3.13.5-Linux-x86_64.sh && \
+    wget -nv --no-check-certificate https://cmake.org/files/v3.13/cmake-3.13.5-Linux-x86_64.sh && \
     sh cmake-3.13.5-Linux-x86_64.sh --prefix=/usr/local --skip-license && \
     rm cmake-3.13.5-Linux-x86_64.sh
```

ci/docker/Dockerfile.build.ubuntu (1 addition, 0 deletions)

```diff
@@ -161,6 +161,7 @@ ARG BASE_IMAGE
 RUN export SHORT_CUDA_VERSION=${CUDA_VERSION%.*} && \
     export OS_RELEASE="$(cat /etc/os-release)" && \
     apt-get update && \
+    apt-get install -y --allow-change-held-packages libcudnn8 libcudnn8-dev && \
     if [[ ${OS_RELEASE} == *"Bionic"* ]]; then \
         if [ ${SHORT_CUDA_VERSION} = 11.0 ]; then \
             TRT_VERSION="7.2.0-1+cuda11.0"; \
```

docs/static_site/src/pages/api/faq/env_var.md (47 additions, 0 deletions)

````diff
@@ -295,16 +295,62 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.
   - Value of 1 chooses the best algo in a limited workspace
   - Value of 2 chooses the fastest algo whose memory requirements may be larger than the default workspace threshold

+* MXNET_CUDNN_HEUR_MODE
+  - Values: 0 or 1 (available since cuDNN 8.1) ```(default=1 for cuDNN 8.1 and later, otherwise 0)```
+  - Chooses the cuDNN heuristics mode.
+  - If set to '0', uses a fast decision-tree-based method.
+  - If set to '1', uses a neural-network-based method, which generalizes better for unknown or uncommon models.
+
+* MXNET_CUDNN_ALGO_VERBOSE_LEVEL
+  - Values: 0, 1, or 2 ```(default=0)```
+  - The level of printed output describing the "convolution engine" configurations.
+  - Value of 0 produces no output.
+  - Value of 1 outputs, for the chosen config, the engine number ("algo"), additional parameters ("knobs"), and numerical notes.
+  - Value of 2 outputs the same info as with a '1' setting, but for all configs considered.
+    The output can be used to develop engine-config filtering strategies to modify model behaviors.
+    Numerical accuracy may be improved by filtering out configs shown with 'rp', 'w' or 'fft' (i.e. reduced precision, winograd, or fft).
+    The configs are output with their list index, as suggested by cuDNN, and the chosen config is flagged with a '*'.
+    If autotuning is enabled (MXNET_CUDNN_AUTOTUNE_DEFAULT != 0), the measured kernel times will be reported.

 * MXNET_CUDA_ALLOW_TENSOR_CORE
   - 0(false) or 1(true) ```(default=1)```
   - If set to '0', disallows Tensor Core use in CUDA ops.
   - If set to '1', allows Tensor Core use in CUDA ops.
   - This variable can only be set once in a session.
+  - Also controls filtering of cuDNN engines with CUDNN_NUMERICAL_NOTE_TENSOR_CORE.

 * MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION
   - 0(false) or 1(true) ```(default=0)```
   - If set to '0', disallows implicit type conversions to Float16 to use Tensor Cores.
   - If set to '1', allows CUDA ops like RNN and Convolution to use Tensor Cores even with Float32 input data by using implicit type casting to Float16. Only has an effect if `MXNET_CUDA_ALLOW_TENSOR_CORE` is `1`.
+  - Also controls filtering of cuDNN engines with CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS (such engines are disallowed if set to 0).
+
+* MXNET_CUDNN_ALLOW_REDUCED_PRECISION_REDUCTION
+  - 0(false) or 1(true) ```(default=1)```
+  - If set to '0', disallows cuDNN engines with CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION.
+  - If set to '1', allows cuDNN engines with CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION.
+
+* MXNET_CUDNN_ALLOW_FFT
+  - 0(false) or 1(true) ```(default=1)```
+  - If set to '0', disallows cuDNN engines with CUDNN_NUMERICAL_NOTE_FFT.
+  - If set to '1', allows cuDNN engines with CUDNN_NUMERICAL_NOTE_FFT.
+
+* MXNET_CUDNN_ALLOW_WINOGRAD
+  - 0(false) or 1(true) ```(default=1)```
+  - If set to '0', disallows cuDNN engines with CUDNN_NUMERICAL_NOTE_WINOGRAD.
+  - If set to '1', allows cuDNN engines with CUDNN_NUMERICAL_NOTE_WINOGRAD.
+
+* MXNET_CUDNN_DISABLED_CONV_FWD_ENGINES
+  - Comma-separated list of cuDNN convolution forward engine numbers to disable.
+  - Normally should be left alone, unless you know what you're doing.
+
+* MXNET_CUDNN_DISABLED_CONV_DGRAD_ENGINES
+  - Comma-separated list of cuDNN convolution dgrad engine numbers to disable.
+  - Normally should be left alone, unless you know what you're doing.
+
+* MXNET_CUDNN_DISABLED_CONV_WGRAD_ENGINES
+  - Comma-separated list of cuDNN convolution wgrad engine numbers to disable.
+  - Normally should be left alone, unless you know what you're doing.

 * MXNET_CUDA_LIB_CHECKING
   - 0(false) or 1(true) ```(default=1)```
@@ -342,6 +388,7 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.
   - If set to true, MXNet will only use deterministic algorithms in forward and backward computation.
     If no such algorithm exists given other constraints, MXNet will error out. This variable affects the choice
     of CUDNN convolution algorithms. Please see [CUDNN developer guide](https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html) for more details.
+  - Also controls filtering of cuDNN engines with CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC (such engines are disallowed if set to 1).

 * MXNET_CPU_PARALLEL_SIZE
   - Values: Int ```(default=200000)```
````
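The MXNET_CUDNN_DISABLED_CONV_*_ENGINES variables above are plain comma-separated lists of engine numbers. A minimal sketch of how such a variable could be parsed into a lookup set (the helper name `ParseDisabledEngines` is illustrative, not MXNet's actual implementation):

```cpp
#include <cstdlib>
#include <sstream>
#include <string>
#include <unordered_set>

// Parse a comma-separated list of engine numbers, e.g. "5,14,30",
// from an environment variable into a set of engine IDs to skip.
std::unordered_set<int> ParseDisabledEngines(const char* env_name) {
  std::unordered_set<int> disabled;
  const char* val = std::getenv(env_name);
  if (val == nullptr) return disabled;  // variable unset: nothing disabled
  std::stringstream ss(val);
  std::string tok;
  while (std::getline(ss, tok, ',')) {
    if (!tok.empty()) disabled.insert(std::stoi(tok));
  }
  return disabled;
}
```

A heuristics result list would then be filtered by checking `disabled.count(engine_id)` before accepting each candidate engine config.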

include/mxnet/storage.h (4 additions, 3 deletions)

```diff
@@ -86,20 +86,21 @@ class Storage {
    * \brief Allocate a new contiguous memory for a given size.
    * \param size Total size of memory in bytes.
    * \param ctx Context information about the device and ID.
+   * \param failsafe Return a handle with a null dptr if out of memory, rather than exit.
    * \return Handle struct.
    */
-  Handle Alloc(size_t size, Context ctx) {
+  Handle Alloc(size_t size, Context ctx, bool failsafe = false) {
     Handle hd;
     hd.size = size;
     hd.ctx = ctx;
-    this->Alloc(&hd);
+    this->Alloc(&hd, failsafe);
     return hd;
   }
   /*!
    * \brief Allocate a new contiguous memory for a given size.
    * \param handle handle initialized with size and ctx
    */
-  virtual void Alloc(Handle* handle) = 0;
+  virtual void Alloc(Handle* handle, bool failsafe = false) = 0;
   /*!
    * \brief Increase ref counter on shared memory.
    * \param handle handle to shared memory.
```
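The new `failsafe` flag turns an out-of-memory condition from a fatal error into a handle with a null `dptr`, so a caller (such as the auto-tuning workspace code) can probe for memory and fall back gracefully. A minimal self-contained sketch of the pattern, where `Handle`, `Alloc`, and `kCapacity` are simplified stand-ins rather than MXNet's real classes:

```cpp
#include <cstddef>
#include <new>

// Simplified stand-in for mxnet::Storage::Handle.
struct Handle {
  void*  dptr = nullptr;
  size_t size = 0;
};

// Pretend device capacity: requests above this fail to allocate.
constexpr size_t kCapacity = 1 << 20;

// With failsafe=true, an allocation failure yields a null dptr instead of
// a hard error, letting the caller retry with a smaller workspace.
Handle Alloc(size_t size, bool failsafe = false) {
  Handle hd;
  hd.size = size;
  if (size <= kCapacity) {
    hd.dptr = ::operator new(size);
  } else if (!failsafe) {
    throw std::bad_alloc();  // non-failsafe path: propagate the OOM
  }
  return hd;
}
```

Usage then becomes "try big, check `dptr`, shrink on failure", which matches how an auto-tuner can size its scratch workspace without crashing the process.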
