init-22
diff --git a/‎algoperf/jax_utils.py‎
Lines changed: 130 additions & 0 deletions b/‎algoperf/jax_utils.py‎
Lines changed: 130 additions & 0 deletions
diff --git a/‎algoperf/pytorch_utils.py‎
Lines changed: 41 additions & 0 deletions b/‎algoperf/pytorch_utils.py‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎algoperf/spec.py‎
Lines changed: 4 additions & 2 deletions b/‎algoperf/spec.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎algoperf/workloads/cifar/cifar_jax/workload.py‎
Lines changed: 1 addition & 7 deletions b/‎algoperf/workloads/cifar/cifar_jax/workload.py‎
Lines changed: 1 addition & 7 deletions
diff --git a/‎algoperf/workloads/criteo1tb/criteo1tb_jax/models.py‎
Lines changed: 15 additions & 10 deletions b/‎algoperf/workloads/criteo1tb/criteo1tb_jax/models.py‎
Lines changed: 15 additions & 10 deletions
diff --git a/‎algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py‎
Lines changed: 10 additions & 9 deletions b/‎algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py‎
Lines changed: 10 additions & 9 deletions
@@ -0,0 +1,130 @@
+from collections.abc import Sequence
+
+import flax.linen as nn
+from flax.linen.module import compact
+from flax.linen.module import merge_param
+from flax.linen.module import Module
+from flax.typing import PRNGKey
+import jax
+from jax import lax
+from jax import random
+import jax.numpy as jnp
+
+
+# Custom Layers
+class Dropout(Module):
+  # pylint: disable=line-too-long
+  """Create a dropout layer whose rate can be overridden on every call.
+
+    Forked from
+    https://flax-linen.readthedocs.io/en/latest/_modules/flax/linen/stochastic.html#Dropout.
+    The reference dropout implementation is modified to support changes
+    to the dropout rate during training by:
+    1) adding a rate argument to the __call__ method.
+    2) removing the if-else condition to check for edge cases, which
+    will trigger a recompile for jitted code.
+
+    .. note::
+      When using :meth:`Module.apply() <flax.linen.Module.apply>`, make sure
+      to include an RNG seed named ``'dropout'``. Dropout isn't necessary for
+      variable initialization.
+
+    Example usage::
+
+      >>> import flax.linen as nn
+      >>> import jax, jax.numpy as jnp
+
+      >>> class MLP(nn.Module):
+      ...   @nn.compact
+      ...   def __call__(self, x, train):
+      ...     x = nn.Dense(4)(x)
+      ...     x = nn.Dropout(0.5, deterministic=not train)(x)
+      ...     return x
+
+      >>> model = MLP()
+      >>> x = jnp.ones((1, 3))
+      >>> variables = model.init(jax.random.key(0), x, train=False) # don't use dropout
+      >>> model.apply(variables, x, train=False) # don't use dropout
+      Array([[-0.17875527,  1.6255447 , -1.2431065 , -0.02554005]], dtype=float32)
+      >>> model.apply(variables, x, train=True, rngs={'dropout': jax.random.key(1)}) # use dropout
+      Array([[-0.35751054,  3.2510893 ,  0.        ,  0.        ]], dtype=float32)
+
+    Attributes:
+      rate: the dropout probability.  (_not_ the keep rate!) Used only when
+        no ``rate`` is passed to ``__call__``.
+      broadcast_dims: dimensions that will share the same dropout mask
+      deterministic: if false the inputs are scaled by ``1 / (1 - rate)``
+        and masked, whereas if true, no mask is applied and the inputs are
+        returned as is.
+      rng_collection: the rng collection name to use when requesting an rng
+        key.
+      legacy: if true, keep the reference implementation's Python-level
+        shortcuts for ``rate == 0.0`` (inputs returned unchanged) and
+        ``rate == 1.0`` (zeros returned).
+    """
+
+  rate: float | None = None
+  broadcast_dims: Sequence[int] = ()
+  deterministic: bool | None = None
+  rng_collection: str = "dropout"
+  legacy: bool = False
+
+  @compact
+  def __call__(
+      self,
+      inputs,
+      deterministic: bool | None = None,
+      rate: float | None = None,
+      rng: PRNGKey | None = None,
+  ):
+    """Applies a random dropout mask to the input.
+
+        Args:
+          inputs: the inputs that should be randomly masked.
+          deterministic: if false the inputs are scaled by ``1 / (1 - rate)``
+            and masked, whereas if true, no mask is applied and the inputs are
+            returned as is.
+          rate: the dropout probability.  (_not_ the keep rate!) Overrides
+            the ``rate`` attribute when given.
+          rng: an optional PRNGKey used as the random key, if not specified,
+            one will be generated using ``make_rng`` with the
+            ``rng_collection`` name.
+
+        Returns:
+          The masked inputs reweighted to preserve mean.
+
+        Raises:
+          ValueError: if masking is requested but no dropout rate was given,
+            neither as the module attribute nor as the call argument.
+        """
+    deterministic = merge_param("deterministic",
+                                self.deterministic,
+                                deterministic)
+
+    # Deterministic mode must return the inputs untouched. Checking it first
+    # keeps the legacy ``rate == 1.0`` shortcut from zeroing evaluation-time
+    # activations.
+    if deterministic:
+      return inputs
+
+    # Override self.rate if rate is passed to __call__
+    if rate is None:
+      rate = self.rate
+    if rate is None:
+      raise ValueError(
+          "Dropout needs a rate: set the `rate` attribute or pass `rate` "
+          "to __call__.")
+
+    if self.legacy:
+      if rate == 0.0:
+        return inputs
+
+      # Prevent gradient NaNs in 1.0 edge-case.
+      if rate == 1.0:
+        return jnp.zeros_like(inputs)
+
+    keep_prob = 1.0 - rate
+    if rng is None:
+      rng = self.make_rng(self.rng_collection)
+    # Axes listed in broadcast_dims get size 1 so one mask value is shared
+    # along them once the mask is broadcast back to the input shape.
+    broadcast_shape = list(inputs.shape)
+    for dim in self.broadcast_dims:
+      broadcast_shape[dim] = 1
+    mask = random.bernoulli(rng, p=keep_prob, shape=broadcast_shape)
+    mask = jnp.broadcast_to(mask, inputs.shape)
+    return lax.select(mask, inputs / keep_prob, jnp.zeros_like(inputs))
+
+
+# Utilities for debugging
+def print_jax_model_summary(model, fake_inputs):
+  """Print the ``nn.tabulate`` summary table of a jax module.
+
+  Args:
+    model: the module to summarize.
+    fake_inputs: inputs forwarded to the tabulate function together with
+      ``train=False``.
+  """
+  console_options = {
+      "force_terminal": False, "force_jupyter": False, "width": 240
+  }
+  summarize = nn.tabulate(
+      model, jax.random.PRNGKey(0), console_kwargs=console_options)
+  summary = summarize(fake_inputs, train=False)
+  print(summary)
@@ -5,7 +5,10 @@
 import jax
 import tensorflow as tf
 import torch
+from torch import nn
+from torch import Tensor
 import torch.distributed as dist
+import torch.nn.functional as F
 
 from algoperf import spec
 from algoperf.profiler import Profiler
@@ -77,3 +80,41 @@ def update_batch_norm_fn(module: spec.ParameterContainer,
         module.momentum = 0.0
     elif hasattr(module, 'momentum_backup'):
       module.momentum = module.momentum_backup
+
+
+class CustomDropout(nn.Module):
+  """Dropout module that takes its probability at call time.
+
+  Wraps torch.nn.functional.dropout; masking follows the module's
+  train/eval mode.
+  """
+
+  def __init__(self):
+    super().__init__()
+    # Marker read by containers that forward the dropout probability.
+    self._supports_custom_dropout = True
+
+  def forward(self, x: Tensor, p: float) -> Tensor:
+    """Apply dropout with probability ``p`` to ``x``."""
+    is_training = self.training
+    return F.dropout(x, p=p, training=is_training)
+
+
+class CustomDropout2d(nn.Module):
+  """Dropout2d module that takes its probability at call time.
+
+  Wraps torch.nn.functional.dropout2d; masking follows the module's
+  train/eval mode.
+  """
+
+  def __init__(self):
+    super().__init__()
+    # Marker read by containers that forward the dropout probability.
+    self._supports_custom_dropout = True
+
+  def forward(self, x: Tensor, p: float) -> Tensor:
+    """Apply dropout2d with probability ``p`` to ``x``."""
+    is_training = self.training
+    return F.dropout2d(x, p=p, training=is_training)
+
+
+class SequentialWithDropout(nn.Sequential):
+  """nn.Sequential variant that threads a dropout probability through.
+
+  Children flagged with ``_supports_custom_dropout`` are called with the
+  probability as a second argument; all others receive only the tensor.
+  """
+
+  def __init__(self, *args, **kwargs):
+    super().__init__(*args, **kwargs)
+    # Flag this container too, so it can be nested inside another one.
+    self._supports_custom_dropout = True
+
+  def forward(self, x: Tensor, p: float) -> Tensor:
+    """Run the children in order, passing ``p`` to those that accept it."""
+    out = x
+    for layer in self:
+      takes_rate = getattr(layer, '_supports_custom_dropout', False)
+      out = layer(out, p) if takes_rate else layer(out)
+    return out
@@ -247,7 +247,8 @@ def init_model_fn(self,
   #         ModelAuxiliaryState,
   #         ForwardPassMode,
   #         RandomState,
-  #         bool],
+  #         bool,
+  #         float],
   #     Tensor]
   @abc.abstractmethod
   def model_fn(self,
@@ -256,7 +257,8 @@ def model_fn(self,
                model_state: ModelAuxiliaryState,
                mode: ForwardPassMode,
                rng: RandomState,
-               update_batch_norm: bool) -> Tuple[Tensor, ModelAuxiliaryState]:
+               update_batch_norm: bool,
+               dropout_rate: float) -> Tuple[Tensor, ModelAuxiliaryState]:
     """Return logits_batch"""
     # Possible side effect of updating BN.
 
 
@@ -79,14 +79,8 @@ def sync_batch_stats(
     new_model_state['batch_stats'] = avg_fn(model_state['batch_stats'])
     return new_model_state
 
-  def init_model_fn(
-      self,
-      rng: spec.RandomState,
-      dropout_rate: Optional[float] = None,
-      aux_dropout_rate: Optional[float] = None) -> spec.ModelInitState:
+  def init_model_fn(self, rng: spec.RandomState) -> spec.ModelInitState:
     """Dropout is unused."""
-    del dropout_rate
-    del aux_dropout_rate
     model_cls = getattr(models, 'ResNet18')
     model = model_cls(num_classes=self._num_classes, dtype=jnp.float32)
     self._model = model
 
@@ -1,11 +1,14 @@
 """A JAX implementation of DLRM-Small."""
-
 from typing import Sequence
 
 import flax.linen as nn
 from jax import nn as jnn
 import jax.numpy as jnp
 
+from algoperf.jax_utils import Dropout
+
+DROPOUT_RATE = 0.0
+
 
 class DLRMResNet(nn.Module):
   """Define a DLRMResNet model.
@@ -23,12 +26,13 @@ class DLRMResNet(nn.Module):
   mlp_bottom_dims: Sequence[int] = (256, 256, 256)
   mlp_top_dims: Sequence[int] = (256, 256, 256, 256, 1)
   embed_dim: int = 128
-  dropout_rate: float = 0.0
+  dropout_rate: float = DROPOUT_RATE
   use_layer_norm: bool = False  # Unused.
   embedding_init_multiplier: float = None  # Unused
 
   @nn.compact
-  def __call__(self, x, train):
+  def __call__(self, x, train, dropout_rate=DROPOUT_RATE):
+
     bot_mlp_input, cat_features = jnp.split(x, [self.num_dense_features], 1)
     cat_features = jnp.asarray(cat_features, dtype=jnp.int32)
 
@@ -88,8 +92,8 @@ def scaled_init(key, shape, dtype=jnp.float_):
               stddev=jnp.sqrt(1.0 / mlp_top_dims[layer_idx])))(
                   top_mlp_input)
       x = nn.relu(x)
-      if self.dropout_rate and layer_idx == num_layers_top - 2:
-        x = nn.Dropout(rate=self.dropout_rate, deterministic=not train)(x)
+      if dropout_rate and layer_idx == num_layers_top - 2:
+        x = Dropout(dropout_rate, deterministic=not train)(x, rate=dropout_rate)
       top_mlp_input += x
     # In the DLRM model the last layer width is always 1. We can hardcode that
     # below.
@@ -151,7 +155,8 @@ class DlrmSmall(nn.Module):
   embedding_init_multiplier: float = None
 
   @nn.compact
-  def __call__(self, x, train):
+  def __call__(self, x, train, dropout_rate=DROPOUT_RATE):
+
     bot_mlp_input, cat_features = jnp.split(x, [self.num_dense_features], 1)
     cat_features = jnp.asarray(cat_features, dtype=jnp.int32)
 
@@ -210,10 +215,10 @@ def scaled_init(key, shape, dtype=jnp.float_):
         top_mlp_input = nn.relu(top_mlp_input)
         if self.use_layer_norm:
           top_mlp_input = nn.LayerNorm()(top_mlp_input)
-      if (self.dropout_rate is not None and self.dropout_rate > 0.0 and
+      if (dropout_rate is not None and dropout_rate > 0.0 and
           layer_idx == num_layers_top - 2):
-        top_mlp_input = nn.Dropout(
-            rate=self.dropout_rate, deterministic=not train)(
-                top_mlp_input)
+        top_mlp_input = Dropout(
+            dropout_rate, deterministic=not train)(
+                top_mlp_input, rate=dropout_rate)
     logits = top_mlp_input
     return logits
@@ -72,36 +72,34 @@ def loss_fn(
   def init_model_fn(
       self,
       rng: spec.RandomState,
-      dropout_rate: Optional[float] = None,
-      aux_dropout_rate: Optional[float] = None,
       tabulate: Optional[bool] = False,
   ) -> spec.ModelInitState:
     """Only dropout is used."""
-    del aux_dropout_rate
     if self.use_resnet:
       model_class = models.DLRMResNet
     else:
       model_class = models.DlrmSmall
+
     self._model = model_class(
         vocab_size=self.vocab_size,
         num_dense_features=self.num_dense_features,
         mlp_bottom_dims=self.mlp_bottom_dims,
         mlp_top_dims=self.mlp_top_dims,
         embed_dim=self.embed_dim,
-        dropout_rate=dropout_rate,
         use_layer_norm=self.use_layer_norm,
         embedding_init_multiplier=self.embedding_init_multiplier)
 
-    params_rng, dropout_rng = jax.random.split(rng)
+    params_rng, _ = jax.random.split(rng)
     init_fake_batch_size = 2
     num_categorical_features = 26
     num_dense_features = 13
     input_size = num_dense_features + num_categorical_features
     input_shape = (init_fake_batch_size, input_size)
     init_fn = functools.partial(self._model.init, train=False)
-    initial_variables = jax.jit(init_fn)(
-        {'params': params_rng, 'dropout': dropout_rng},
-        jnp.ones(input_shape, jnp.float32))
+    initial_variables = jax.jit(init_fn)({
+        'params': params_rng,
+    },
+                                         jnp.ones(input_shape, jnp.float32))
     initial_params = initial_variables['params']
     self._param_shapes = param_utils.jax_param_shapes(initial_params)
     self._param_types = param_utils.jax_param_types(self._param_shapes)
@@ -117,14 +115,17 @@ def model_fn(
       model_state: spec.ModelAuxiliaryState,
       mode: spec.ForwardPassMode,
       rng: spec.RandomState,
-      update_batch_norm: bool) -> Tuple[spec.Tensor, spec.ModelAuxiliaryState]:
+      update_batch_norm: bool,
+      dropout_rate: float = models.DROPOUT_RATE
+  ) -> Tuple[spec.Tensor, spec.ModelAuxiliaryState]:
     del model_state
     del update_batch_norm
     inputs = augmented_and_preprocessed_input_batch['inputs']
     train = mode == spec.ForwardPassMode.TRAIN
     apply_kwargs = {'train': train}
     if train:
       apply_kwargs['rngs'] = {'dropout': rng}
+      apply_kwargs['dropout_rate'] = dropout_rate
     logits_batch = self._model.apply({'params': params}, inputs, **apply_kwargs)
     return logits_batch, None