Commit 1ca1b1b

revert shardings
1 parent fb12602 commit 1ca1b1b

4 files changed: 22 additions & 16 deletions


src/maxdiffusion/models/attention_flax.py

Lines changed: 7 additions & 7 deletions
```diff
@@ -734,7 +734,7 @@ def __init__(
     # None axes corresponds to the stacked weights across all blocks
     # because of the use of nnx.vmap and nnx.scan.
     # Dims are [num_blocks, embed, heads]
-    kernel_axes = ("embed", None, "heads")
+    kernel_axes = (None, "embed", "heads")
     qkv_init_kernel = nnx.with_partitioning(nnx.initializers.lecun_normal(), kernel_axes)
 
     self.query = nnx.Linear(
@@ -748,8 +748,8 @@ def __init__(
         bias_init=nnx.with_partitioning(
             nnx.initializers.zeros,
             (
+                None,
                 "embed",
-                "heads",
             ),
         ),
     )
@@ -765,8 +765,8 @@ def __init__(
         bias_init=nnx.with_partitioning(
             nnx.initializers.zeros,
             (
+                None,
                 "embed",
-                "heads",
             ),
         ),
     )
@@ -782,8 +782,8 @@ def __init__(
         bias_init=nnx.with_partitioning(
             nnx.initializers.zeros,
             (
+                None,
                 "embed",
-                "heads"
             ),
         ),
     )
@@ -792,15 +792,15 @@ def __init__(
         rngs=rngs,
         in_features=self.inner_dim,
         out_features=self.inner_dim,
-        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "heads", None)),
+        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), (None, "heads", "embed")),
         dtype=dtype,
         param_dtype=weights_dtype,
         precision=precision,
         bias_init=nnx.with_partitioning(
             nnx.initializers.zeros,
             (
-                "embed",
-                None
+                None,
+                "heads"
             ),
         ),
     )
```
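For reference, the `None` axis these hunks move to the front is the stacked block dimension introduced by `nnx.vmap`/`nnx.scan`: the per-block kernels are `[num_blocks, embed, heads]`, so the block axis carries no mesh name, and the revert restores specs whose leading entry matches that layout. A minimal standalone sketch (not this repo's code; the feature sizes are illustrative, and the `"embed"`/`"heads"` mesh axis names are taken from the diff) of how `nnx.with_partitioning` attaches these logical axes:

```python
# Sketch: nnx.with_partitioning wraps an initializer so the parameter it
# creates carries logical sharding axes as metadata. For an un-stacked layer
# the kernel is [in_features, out_features]; stacking blocks with nnx.vmap or
# nnx.scan prepends a block dimension, hence the leading None in the diff.
from flax import nnx

layer = nnx.Linear(
    in_features=128,  # illustrative sizes
    out_features=128,
    kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "heads")),
    bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("heads",)),
    rngs=nnx.Rngs(0),
)

# The metadata is later turned into PartitionSpecs when sharding the state:
print(nnx.get_partition_spec(nnx.state(layer)))
```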

src/maxdiffusion/models/gradient_checkpoint.py

Lines changed: 10 additions & 3 deletions
```diff
@@ -39,6 +39,7 @@ class GradientCheckpointType(Enum):
   NONE = auto()
   FULL = auto()
   MATMUL_WITHOUT_BATCH = auto()
+  OFFLOAD_MATMUL_WITHOUT_BATCH = auto()
   ATTN = auto()
 
   @classmethod
@@ -65,10 +66,16 @@ def to_jax_policy(self):
         return SKIP_GRADIENT_CHECKPOINT_KEY
       case GradientCheckpointType.FULL:
         return None
-      case GradientCheckpointType.ATTN:
-        return cp.save_and_offload_only_these_names(
-            names_which_can_be_saved=[], names_which_can_be_offloaded=[], offload_src="device", offload_dst="pinned_host"
+      case GradientCheckpointType.OFFLOAD_MATMUL_WITHOUT_BATCH:
+        return cp.offload_dot_with_no_batch_dims(
+            offload_src="device", offload_dst="pinned_host"
         )
+      case GradientCheckpointType.ATTN:
+        offload_policy = cp.save_and_offload_only_these_names(
+            names_which_can_be_saved=[], names_which_can_be_offloaded=["attn_output"], offload_src="device", offload_dst="pinned_host"
+        )
+        policy = jax.checkpoint_policies.checkpoint_dots_with_no_batch_dims
+        return cp.save_from_both_policies(offload_policy, policy)
       case GradientCheckpointType.MATMUL_WITHOUT_BATCH:
         return jax.checkpoint_policies.checkpoint_dots_with_no_batch_dims
 
```
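The reworked `ATTN` case combines two policies: activations tagged `"attn_output"` are offloaded to pinned host memory, dot products with no batch dimensions are saved on device, and everything else is recomputed in the backward pass. A toy sketch of applying such a combined policy through `jax.checkpoint` (the function `attn_like` and its shapes are made up, and host offload assumes a backend that supports `pinned_host`):

```python
# Toy sketch of the combined remat policy built in the ATTN case above.
from functools import partial

import jax
import jax.numpy as jnp
from jax.ad_checkpoint import checkpoint_name

offload_policy = jax.checkpoint_policies.save_and_offload_only_these_names(
    names_which_can_be_saved=[],
    names_which_can_be_offloaded=["attn_output"],
    offload_src="device",
    offload_dst="pinned_host",
)
policy = jax.checkpoint_policies.save_from_both_policies(
    offload_policy,
    jax.checkpoint_policies.checkpoint_dots_with_no_batch_dims,
)

@partial(jax.checkpoint, policy=policy)
def attn_like(x, w):
    # Tensors tagged with checkpoint_name are matched by the offload policy
    # and saved to host instead of being recomputed.
    out = checkpoint_name(jnp.dot(x, w), "attn_output")
    return jnp.tanh(out)

x, w = jnp.ones((8, 16)), jnp.ones((16, 16))
grads = jax.grad(lambda x, w: attn_like(x, w).sum())(x, w)
```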

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 4 additions & 5 deletions
```diff
@@ -175,11 +175,12 @@ def __init__(
         kernel_init=nnx.with_partitioning(
             nnx.initializers.xavier_uniform(),
             (
-                "embed",
                 None,
                 "mlp",
+                "embed",
             ),
         ),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, (None, "embed")),
     )
 
   def __call__(self, x: jax.Array) -> jax.Array:
@@ -217,7 +218,6 @@ def __init__(
       raise NotImplementedError(f"{activation_fn} is not implemented.")
 
     self.drop_out = nnx.Dropout(dropout)
-
     self.proj_out = nnx.Linear(
         rngs=rngs,
         in_features=inner_dim,
@@ -229,9 +229,9 @@ def __init__(
         kernel_init=nnx.with_partitioning(
             nnx.initializers.xavier_uniform(),
             (
+                None,
                 "embed",
                 "mlp",
-                None,
             ),
         ),
     )
@@ -319,8 +319,7 @@ def __init__(
 
     key = rngs.params()
     self.adaln_scale_shift_table = nnx.Param(
-        jax.random.normal(key, (1, 6, dim)) / dim**0.5,
-        sharding=("embed",))
+        jax.random.normal(key, (1, 6, dim)) / dim**0.5,)
 
   def __call__(self, hidden_states: jax.Array, encoder_hidden_states: jax.Array, temb: jax.Array, rotary_emb: jax.Array, deterministic: bool = True, rngs: nnx.Rngs = None,):
     shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
```
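On the last hunk: the removed `sharding=("embed",)` was a single-axis spec attached to a 3-D `(1, 6, dim)` value, and the revert leaves the table without sharding metadata. A short illustrative sketch (not this repo's code; `dim` and the rank-matched spec are assumptions) of attaching or omitting such metadata on `nnx.Param`:

```python
# Sketch: nnx.Param accepts metadata keyword arguments; "sharding" is the key
# read by nnx.get_partition_spec. Omitting it, as this commit does, leaves
# the parameter to default (typically replicated) placement.
import jax
from flax import nnx

dim = 64  # illustrative
table = jax.random.normal(jax.random.PRNGKey(0), (1, 6, dim)) / dim**0.5

with_meta = nnx.Param(table, sharding=(None, None, "embed"))  # rank-matched spec
without_meta = nnx.Param(table)  # as after this commit: no sharding metadata
```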

src/maxdiffusion/trainers/wan_trainer.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -149,7 +149,7 @@ def start_training(self):
 
     pipeline = self.load_checkpoint()
     # Generate a sample before training to compare against generated sample after training.
-    #pretrained_video_path = generate_sample(self.config, pipeline, filename_prefix="pre-training-")
+    pretrained_video_path = generate_sample(self.config, pipeline, filename_prefix="pre-training-")
 
     # save some memory.
     del pipeline.vae
```
