
Commit f30daac

ninatu and martinarroyo committed
Wan training: fix timestep sampling by switching to continuous sampling, introduce disable_training_weights, and add max_grad_norm and max_abs_grad logging.
- Switched timestep sampling from discrete to continuous.
- Added max_grad_norm and max_abs_grad calculation and logging.
- Introduced `config.disable_training_weights` to optionally disable mid-point loss weighting.

Co-authored-by: martinarroyo <martinarroyo@google.com>
1 parent efbc91d commit f30daac
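For orientation before the per-file diffs: the headline change swaps the trainer's discrete timestep draw for the scheduler's new continuous, time-shifted draw. A minimal standalone sketch of the two behaviours in plain JAX; shift and num_train_timesteps are stand-ins for the scheduler config values that appear in the diffs below, not part of the commit:

import jax

num_train_timesteps = 1000  # stand-in for scheduler.config.num_train_timesteps
shift = 3.0                 # stand-in for scheduler.config.shift
rng = jax.random.PRNGKey(0)
bsz = 4

# Old behaviour: integer timesteps drawn uniformly from {0, ..., num_train_timesteps - 1}.
discrete_timesteps = jax.random.randint(rng, (bsz,), 0, num_train_timesteps)

# New behaviour: continuous t in [0, 1], warped by the shift mapping
# t -> shift * t / (1 + (shift - 1) * t), then scaled to [0, num_train_timesteps].
t = jax.random.uniform(rng, (bsz,))
t_shifted = (t * shift) / (1 + (shift - 1) * t)
continuous_timesteps = t_shifted * num_train_timesteps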

4 files changed

Lines changed: 91 additions & 24 deletions


src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 1 addition & 0 deletions
@@ -282,6 +282,7 @@ output_dir: 'sdxl-model-finetuned'
 per_device_batch_size: 1.0
 # If global_batch_size % jax.device_count is not 0, use FSDP sharding.
 global_batch_size: 0
+disable_training_weights: False # if True, disables the use of mid-point loss weighting
 
 # For creating tfrecords from dataset
 tfrecords_dir: ''
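The new flag gates the bell-shaped mid-point weighting produced by _calculate_training_weights in the scheduler diff further down. A small illustrative sketch of that weighting and of how the flag switches it off in the loss (the function names and the 5-D latent layout here are assumptions for illustration, not part of the commit):

import jax.numpy as jnp

def midpoint_weights(timesteps, num_train_timesteps):
  # Gaussian bump centred at num_train_timesteps / 2, shifted so its minimum is 0
  # and rescaled so the weights sum to num_train_timesteps.
  y = jnp.exp(-2 * ((timesteps - num_train_timesteps / 2) / num_train_timesteps) ** 2)
  y = y - jnp.min(y)
  return y * (num_train_timesteps / jnp.sum(y))

def weighted_mse(model_pred, training_target, training_weight, disable_training_weights):
  loss = (training_target - model_pred) ** 2
  if not disable_training_weights:
    # Broadcast the per-sample weight over the (C, T, H, W) latent axes,
    # mirroring the wan_trainer.py change below.
    loss = loss * jnp.expand_dims(training_weight, axis=(1, 2, 3, 4))
  return jnp.mean(loss)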

src/maxdiffusion/configs/base_wan_1_3b.yml

Lines changed: 1 addition & 0 deletions
@@ -238,6 +238,7 @@ output_dir: 'sdxl-model-finetuned'
 per_device_batch_size: 1.0
 # If global_batch_size % jax.device_count is not 0, use FSDP sharding.
 global_batch_size: 0
+disable_training_weights: False # if True, disables the use of mid-point loss weighting
 
 # For creating tfrecords from dataset
 tfrecords_dir: ''

src/maxdiffusion/schedulers/scheduling_flow_match_flax.py

Lines changed: 53 additions & 5 deletions
@@ -150,11 +150,9 @@ def set_timesteps(
 
     linear_timesteps_weights = None
     if training:
-      x = timesteps
-      y = jnp.exp(-2 * ((x - num_inference_steps / 2) / num_inference_steps) ** 2)
-      y_shifted = y - jnp.min(y)
-      bsmntw_weighing = y_shifted * (num_inference_steps / jnp.sum(y_shifted))
-      linear_timesteps_weights = bsmntw_weighing
+      linear_timesteps_weights = self._calculate_training_weights(
+          timesteps, num_inference_steps
+      )
 
     return state.replace(
         sigmas=sigmas,
@@ -164,6 +162,56 @@ def set_timesteps(
         num_inference_steps=num_inference_steps,
     )
 
+  def _calculate_training_weights(
+      self, timesteps: jnp.ndarray, num_inference_steps: int
+  ) -> jnp.ndarray:
+    """Calculates the training weight for a given timestep."""
+    x = timesteps
+    y = jnp.exp(-2 * ((x - num_inference_steps / 2) / num_inference_steps) ** 2)
+    y_shifted = y - jnp.min(y)
+    bsmntw_weighing = y_shifted * (num_inference_steps / jnp.sum(y_shifted))
+    linear_timesteps_weights = bsmntw_weighing
+    return linear_timesteps_weights
+
+  def sample_timesteps(self, timestep_rng, batch_size):
+    # 1. Sample continuous timesteps t in [0, 1]
+    t = jax.random.uniform(timestep_rng, (batch_size,))
+
+    # 2. Apply the "Shift" weighting (Time shifting)
+    t_shifted = (t * self.config.shift) / (1 + (self.config.shift - 1) * t)
+
+    # 3. Scale t to [0, self.config.num_train_timesteps]
+    timesteps = t_shifted.squeeze() * self.config.num_train_timesteps
+
+    return timesteps
+
+  def apply_flow_match(
+      self,
+      noise: jnp.ndarray,
+      batch_images: jnp.ndarray,
+      timesteps: jnp.ndarray,
+  ) -> Tuple[jnp.ndarray, jnp.ndarray]:
+    """Apply flow match to the batch of images.
+
+    Replaces: scheduler.add_noise + scheduler.training_target +
+    scheduler.training_weight
+    """
+
+    t = timesteps.astype(jnp.float32) / self.config.num_train_timesteps
+    broadcast_shape = (-1,) + (1,) * (batch_images.ndim - 1)
+    t = t.reshape(broadcast_shape)
+
+    sigma = (1 - t) * self.config.sigma_min + t * self.config.sigma_max
+
+    noisy_latents = (1 - sigma) * batch_images + sigma * noise
+    target = noise - batch_images
+
+    training_weights = self._calculate_training_weights(
+        timesteps, self.config.num_train_timesteps
+    )
+
+    return noisy_latents, target, training_weights
+
   def _find_timestep_id(self, state: FlowMatchSchedulerState, timestep: jnp.ndarray) -> jnp.ndarray:
     """Finds the index of the closest timestep in the scheduler's `timesteps` array."""
     timestep = jnp.asarray(timestep, dtype=state.timesteps.dtype)
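To summarise what the new methods compute end to end, here is a rough standalone sketch of the noising path in plain jax.numpy; sigma_min = 0.0 and sigma_max = 1.0 are assumed stand-ins for the scheduler config values (in which case sigma is just the normalised timestep), and the function name is illustrative:

import jax.numpy as jnp

def flow_match_sketch(latents, noise, timesteps, num_train_timesteps=1000,
                      sigma_min=0.0, sigma_max=1.0):
  # Normalise timesteps to [0, 1] and broadcast over the non-batch axes.
  t = timesteps.astype(jnp.float32) / num_train_timesteps
  t = t.reshape((-1,) + (1,) * (latents.ndim - 1))
  sigma = (1 - t) * sigma_min + t * sigma_max
  # Straight-line interpolation between data and noise; the regression target
  # is the constant velocity (noise - latents) along that path.
  noisy_latents = (1 - sigma) * latents + sigma * noise
  target = noise - latents
  return noisy_latents, target

The per-sample training weight returned alongside these values comes from the same _calculate_training_weights bump, now evaluated at the sampled continuous timesteps against num_train_timesteps rather than at the inference-time grid.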

src/maxdiffusion/trainers/wan_trainer.py

Lines changed: 36 additions & 19 deletions
@@ -24,6 +24,7 @@
 import tensorflow as tf
 import jax.numpy as jnp
 import jax
+import jaxopt
 from jax.sharding import PartitionSpec as P
 from flax import nnx
 from maxdiffusion.schedulers import FlaxFlowMatchScheduler
@@ -453,38 +454,53 @@ def loss_fn(params):
     model = nnx.merge(state.graphdef, params, state.rest_of_state)
     latents = data["latents"].astype(config.weights_dtype)
     encoder_hidden_states = data["encoder_hidden_states"].astype(config.weights_dtype)
+
     bsz = latents.shape[0]
-    timesteps = jax.random.randint(
-        timestep_rng,
-        (bsz,),
-        0,
-        scheduler.config.num_train_timesteps,
+    timesteps = scheduler.sample_timesteps(timestep_rng, bsz)
+    noise = jax.random.normal(
+        key=new_rng, shape=latents.shape, dtype=latents.dtype
+    )
+    noisy_latents, training_target, training_weight = (
+        scheduler.apply_flow_match(noise, latents, timesteps)
     )
-    noise = jax.random.normal(key=new_rng, shape=latents.shape, dtype=latents.dtype)
-    noisy_latents = scheduler.add_noise(scheduler_state, latents, noise, timesteps)
-
     with jax.named_scope("forward_pass"):
       model_pred = model(
           hidden_states=noisy_latents,
          timestep=timesteps,
          encoder_hidden_states=encoder_hidden_states,
          deterministic=False,
-          rngs=nnx.Rngs(dropout_rng),
+          rngs=nnx.Rngs(dropout=dropout_rng),
      )
 
     with jax.named_scope("loss"):
-      training_target = scheduler.training_target(latents, noise, timesteps)
-      training_weight = jnp.expand_dims(scheduler.training_weight(scheduler_state, timesteps), axis=(1, 2, 3, 4))
      loss = (training_target - model_pred) ** 2
-      loss = loss * training_weight
+      if not config.disable_training_weights:
+        training_weight = jnp.expand_dims(training_weight, axis=(1, 2, 3, 4))
+        loss = loss * training_weight
      loss = jnp.mean(loss)
 
     return loss
 
   grad_fn = nnx.value_and_grad(loss_fn)
   loss, grads = grad_fn(state.params)
+  max_grad_norm = jaxopt.tree_util.tree_l2_norm(grads)
+
+  max_abs_grad = jax.tree_util.tree_reduce(
+      lambda max_val, arr: jnp.maximum(max_val, jnp.max(jnp.abs(arr))),
+      grads,
+      initializer=-1.0,
+  )
+
+  metrics = {
+      "scalar": {
+          "learning/loss": loss,
+          "learning/max_grad_norm": max_grad_norm,
+          "learning/max_abs_grad": max_abs_grad,
+      },
+      "scalars": {},
+  }
+
   new_state = state.apply_gradients(grads=grads)
-  metrics = {"scalar": {"learning/loss": loss}, "scalars": {}}
   return new_state, scheduler_state, metrics, new_rng
 
 
@@ -495,14 +511,14 @@ def eval_step(state, data, rng, scheduler_state, scheduler, config):
 
   # The loss function logic is identical to training. We are evaluating the model's
  # ability to perform its core training objective (e.g., denoising).
-  @jax.jit
   def loss_fn(params, latents, encoder_hidden_states, timesteps, rng):
     # Reconstruct the model from its definition and parameters
     model = nnx.merge(state.graphdef, params, state.rest_of_state)
 
     noise = jax.random.normal(key=rng, shape=latents.shape, dtype=latents.dtype)
-    noisy_latents = scheduler.add_noise(scheduler_state, latents, noise, timesteps)
-
+    noisy_latents, training_target, training_weight = (
+        scheduler.apply_flow_match(noise, latents, timesteps)
+    )
     # Get the model's prediction
     model_pred = model(
         hidden_states=noisy_latents,
@@ -512,10 +528,11 @@ def loss_fn(params, latents, encoder_hidden_states, timesteps, rng):
     )
 
     # Calculate the loss against the target
-    training_target = scheduler.training_target(latents, noise, timesteps)
-    training_weight = jnp.expand_dims(scheduler.training_weight(scheduler_state, timesteps), axis=(1, 2, 3, 4))
     loss = (training_target - model_pred) ** 2
-    loss = loss * training_weight
+    if not config.disable_training_weights:
+      training_weight = jnp.expand_dims(training_weight, axis=(1, 2, 3, 4))
+      loss = loss * training_weight
+
     # Calculate the mean loss per sample across all non-batch dimensions.
     loss = loss.reshape(loss.shape[0], -1).mean(axis=1)
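Regarding the two newly logged quantities: a dependency-free sketch of what they measure over the gradient pytree (this assumes jaxopt.tree_util.tree_l2_norm returns the global L2 norm over all leaves; the helper name below is illustrative):

import jax
import jax.numpy as jnp

def grad_stats(grads):
  leaves = jax.tree_util.tree_leaves(grads)
  # Global L2 norm across every gradient entry (what "learning/max_grad_norm" logs).
  grad_norm = jnp.sqrt(sum(jnp.sum(jnp.square(g)) for g in leaves))
  # Largest absolute gradient entry anywhere in the tree ("learning/max_abs_grad").
  max_abs = jax.tree_util.tree_reduce(
      lambda m, g: jnp.maximum(m, jnp.max(jnp.abs(g))), grads, initializer=-1.0
  )
  return grad_norm, max_abs

Note that, under that reading, "max_grad_norm" is the norm of the whole gradient tree rather than a per-tensor maximum, and the -1.0 initializer simply seeds the running maximum for the absolute-value reduction.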
