
Commit e205aa1

ninatu and martinarroyo committed
Wan training: Resolve training mode bug with dropout and layer_forward
- Conditionally apply dropout only when rate > 0.
- Use standard list initialization.
- Add rngs parameter to layer_forward (essential for gradient checkpointing with dropout > 0).

Co-authored-by: martinarroyo <martinarroyo@google.com>
1 parent fb25b23 commit e205aa1

3 files changed

Lines changed: 15 additions & 7 deletions
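
The guard introduced in attention_flax.py and transformer_wan.py below is one pattern: call self.drop_out only when its rate is greater than zero, so a zero-rate module never needs a dropout PRNG stream in training mode. The following is a minimal sketch of that pattern in Flax NNX; FeedForward is a hypothetical stand-in, not the actual maxdiffusion module.

import jax.numpy as jnp
from flax import nnx


class FeedForward(nnx.Module):
  """Hypothetical module illustrating the rate > 0 guard from this commit."""

  def __init__(self, dim: int, dropout: float, rngs: nnx.Rngs):
    self.proj = nnx.Linear(dim, dim, rngs=rngs)
    self.drop_out = nnx.Dropout(rate=dropout)

  def __call__(self, hidden_states, deterministic: bool = True, rngs: nnx.Rngs = None):
    hidden_states = self.proj(hidden_states)
    # Skip dropout entirely when the rate is zero; only an active dropout
    # needs a PRNG stream when deterministic=False.
    if self.drop_out.rate > 0:
      hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
    return hidden_states


layer = FeedForward(dim=8, dropout=0.1, rngs=nnx.Rngs(0))
x = jnp.ones((2, 8))
y_train = layer(x, deterministic=False, rngs=nnx.Rngs(dropout=1))  # dropout applied
y_eval = layer(x, deterministic=True)                               # dropout skipped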


src/maxdiffusion/models/attention_flax.py

Lines changed: 4 additions & 1 deletion
@@ -1239,7 +1239,10 @@ def __call__(
 
     with jax.named_scope("proj_attn"):
       hidden_states = self.proj_attn(attn_output)
-      hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
+      if self.drop_out.rate > 0:
+        hidden_states = self.drop_out(
+            hidden_states, deterministic=deterministic, rngs=rngs
+        )
     return hidden_states

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 4 additions & 1 deletion
@@ -262,7 +262,10 @@ def conditional_named_scope(self, name: str):
   def __call__(self, hidden_states: jax.Array, deterministic: bool = True, rngs: nnx.Rngs = None) -> jax.Array:
     hidden_states = self.act_fn(hidden_states)  # Output is (4, 75600, 13824)
     hidden_states = checkpoint_name(hidden_states, "ffn_activation")
-    hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
+    if self.drop_out.rate > 0:
+      hidden_states = self.drop_out(
+          hidden_states, deterministic=deterministic, rngs=rngs
+      )
     with jax.named_scope("proj_out"):
       return self.proj_out(hidden_states)  # output is (4, 75600, 5120)

src/maxdiffusion/models/wan/transformers/transformer_wan_vace.py

Lines changed: 7 additions & 5 deletions
@@ -487,10 +487,10 @@ def __call__(
       raise NotImplementedError("scan_layers is not supported yet")
     else:
       # Prepare VACE hints
-      control_hidden_states_list = nnx.List([])
+      control_hidden_states_list = []
       for i, vace_block in enumerate(self.vace_blocks):
 
-        def layer_forward(hidden_states, control_hidden_states):
+        def layer_forward(hidden_states, control_hidden_states, rngs):
           return vace_block(
               hidden_states=hidden_states,
               encoder_hidden_states=encoder_hidden_states,
@@ -507,14 +507,16 @@ def layer_forward(hidden_states, control_hidden_states):
             self.names_which_can_be_offloaded,
             prevent_cse=not self.scan_layers,
         )
-        conditioning_states, control_hidden_states = rematted_layer_forward(hidden_states, control_hidden_states)
+        conditioning_states, control_hidden_states = rematted_layer_forward(
+            hidden_states, control_hidden_states, rngs
+        )
         control_hidden_states_list.append((conditioning_states, control_hidden_states_scale[i]))
 
       control_hidden_states_list = control_hidden_states_list[::-1]
 
       for i, block in enumerate(self.blocks):
 
-        def layer_forward_vace(hidden_states):
+        def layer_forward_vace(hidden_states, rngs):
           return block(
               hidden_states,
               encoder_hidden_states,
@@ -530,7 +532,7 @@ def layer_forward_vace(hidden_states):
             self.names_which_can_be_offloaded,
             prevent_cse=not self.scan_layers,
         )
-        hidden_states = rematted_layer_forward(hidden_states)
+        hidden_states = rematted_layer_forward(hidden_states, rngs)
         if i in self.config.vace_layers:
           control_hint, scale = control_hidden_states_list.pop()
           hidden_states = hidden_states + control_hint * scale
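
The transformer_wan_vace.py hunks thread rngs through layer_forward and layer_forward_vace as an explicit argument rather than a closure, which the commit message notes is essential for gradient checkpointing with dropout > 0. Below is a minimal sketch of that threading, assuming flax.nnx.remat in place of maxdiffusion's own remat helper and offload policy; Block and layer_forward are hypothetical stand-ins for the Wan/VACE blocks.

import jax.numpy as jnp
from flax import nnx


class Block(nnx.Module):
  """Hypothetical block using the same dropout guard as this commit."""

  def __init__(self, dim: int, dropout: float, rngs: nnx.Rngs):
    self.linear = nnx.Linear(dim, dim, rngs=rngs)
    self.drop_out = nnx.Dropout(rate=dropout)

  def __call__(self, hidden_states, deterministic: bool = True, rngs: nnx.Rngs = None):
    hidden_states = self.linear(hidden_states)
    if self.drop_out.rate > 0:
      hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
    return hidden_states


def layer_forward(block, hidden_states, rngs):
  # rngs arrives as an argument of the checkpointed function, so dropout
  # inside the rematted region has a PRNG stream to draw from.
  return block(hidden_states, deterministic=False, rngs=rngs)


block = Block(dim=8, dropout=0.1, rngs=nnx.Rngs(0))
rematted_layer_forward = nnx.remat(layer_forward)
out = rematted_layer_forward(block, jnp.ones((2, 8)), nnx.Rngs(dropout=1))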
