
Commit 7289218

activation offloading
1 parent 1ca1b1b commit 7289218

3 files changed

Lines changed: 23 additions & 10 deletions


src/maxdiffusion/models/attention_flax.py

Lines changed: 1 addition & 1 deletion
@@ -800,7 +800,7 @@ def __init__(
           nnx.initializers.zeros,
           (
               None,
-              "heads"
+              "heads",
           ),
       ),
   )

src/maxdiffusion/models/gradient_checkpoint.py

Lines changed: 16 additions & 5 deletions
@@ -71,15 +71,25 @@ def to_jax_policy(self):
             offload_src="device", offload_dst="pinned_host"
         )
       case GradientCheckpointType.ATTN:
-        offload_policy = cp.save_and_offload_only_these_names(
-            names_which_can_be_saved=[], names_which_can_be_offloaded=["attn_output"], offload_src="device", offload_dst="pinned_host"
+        policy = cp.save_and_offload_only_these_names(
+            names_which_can_be_saved=[],
+            names_which_can_be_offloaded=[
+                #"attn_output",
+                #"query_proj",
+                #"key_proj",
+                #"value_proj",
+                #"xq_out",
+                #"xk_out",
+                "ffn_activation"
+            ],
+            offload_src="device",
+            offload_dst="pinned_host"
         )
-        policy = jax.checkpoint_policies.checkpoint_dots_with_no_batch_dims
-        return cp.save_from_both_policies(offload_policy, policy)
+        return policy
       case GradientCheckpointType.MATMUL_WITHOUT_BATCH:
         return jax.checkpoint_policies.checkpoint_dots_with_no_batch_dims

-  def apply(self, module: nnx.Module) -> nnx.Module:
+  def apply(self, module: nnx.Module, static_argnums=()) -> nnx.Module:
     """
     Applies a gradient checkpoint policy to a module
     if no policy is needed, it will return the module as is
@@ -97,4 +107,5 @@ def apply(self, module: nnx.Module) -> nnx.Module:
         module,
         prevent_cse=False,
         policy=policy,
+        static_argnums=static_argnums
     )
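
In plain terms, the ATTN checkpoint type now saves nothing on device and offloads only the tensors tagged "ffn_activation" to pinned host memory, where it previously offloaded "attn_output" and combined that offload policy with checkpoint_dots_with_no_batch_dims via save_from_both_policies. Below is a minimal, self-contained sketch of the same mechanism; it assumes a JAX version that provides jax.checkpoint_policies.save_and_offload_only_these_names, and the toy names (toy_ffn, loss, w_in, w_out) are illustrative, not part of maxdiffusion.

# Sketch: offload only tensors named "ffn_activation" to pinned host memory and
# rematerialize everything else on the backward pass.
import functools

import jax
import jax.numpy as jnp
from jax.ad_checkpoint import checkpoint_name

offload_policy = jax.checkpoint_policies.save_and_offload_only_these_names(
    names_which_can_be_saved=[],
    names_which_can_be_offloaded=["ffn_activation"],
    offload_src="device",
    offload_dst="pinned_host",
)

@functools.partial(jax.checkpoint, policy=offload_policy, prevent_cse=False)
def toy_ffn(x, w_in, w_out):
  h = jax.nn.gelu(x @ w_in)                  # large intermediate activation
  h = checkpoint_name(h, "ffn_activation")   # tag must match the policy's name list
  return h @ w_out

def loss(params, x):
  w_in, w_out = params
  return jnp.sum(toy_ffn(x, w_in, w_out) ** 2)

x = jnp.ones((8, 512))
params = (jnp.ones((512, 2048)) * 0.02, jnp.ones((2048, 512)) * 0.02)
# Host offloading takes effect under jit on backends that support the
# pinned_host memory space (e.g. TPU).
grads = jax.jit(jax.grad(loss))(params, x)

The accompanying apply() change simply threads static_argnums through to the underlying checkpoint call, so callers can mark non-array arguments as static.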

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 6 additions & 4 deletions
@@ -19,6 +19,7 @@
 import jax
 import jax.numpy as jnp
 from jax.sharding import PartitionSpec
+from jax.ad_checkpoint import checkpoint_name
 from flax import nnx
 import numpy as np
 from .... import common_types
@@ -42,7 +43,7 @@ def get_frequencies(max_seq_len: int, theta: int, attention_head_dim: int):
   t_dim = attention_head_dim - h_dim - w_dim
   freqs = []
   for dim in [t_dim, h_dim, w_dim]:
-    freq = get_1d_rotary_pos_embed(dim, max_seq_len, theta, freqs_dtype=jnp.float64, use_real=False)
+    freq = get_1d_rotary_pos_embed(dim, max_seq_len, theta, freqs_dtype=jnp.float32, use_real=False)
     freqs.append(freq)
   freqs = jnp.concatenate(freqs, axis=1)
   t_size = attention_head_dim // 2 - 2 * (attention_head_dim // 6)
@@ -180,7 +181,7 @@ def __init__(
             "embed",
           ),
         ),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, (None, "embed")),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, (None, "embed",)),
     )

   def __call__(self, x: jax.Array) -> jax.Array:
@@ -237,9 +238,10 @@ def __init__(
     )

   def __call__(self, hidden_states: jax.Array, deterministic: bool = True, rngs: nnx.Rngs = None) -> jax.Array:
-    hidden_states = self.act_fn(hidden_states)
+    hidden_states = self.act_fn(hidden_states)  # Output is (4, 75600, 13824)
+    hidden_states = checkpoint_name(hidden_states, "ffn_activation")
     hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
-    return self.proj_out(hidden_states)
+    return self.proj_out(hidden_states)  # output is (4, 75600, 5120)


 class WanTransformerBlock(nnx.Module):
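
The shape comments in the diff show why the FFN activation is the offload target: the activation output (4, 75600, 13824) is far larger than the block's (4, 75600, 5120) projection output, so tagging it with checkpoint_name lets the offload policy above move exactly that tensor off the device. A stripped-down sketch of the same tagging pattern in an nnx module follows; ToyFeedForward and its sizes are made up, and only the "ffn_activation" name comes from the commit.

import jax
import jax.numpy as jnp
from flax import nnx
from jax.ad_checkpoint import checkpoint_name

class ToyFeedForward(nnx.Module):
  def __init__(self, dim: int, inner_dim: int, *, rngs: nnx.Rngs):
    self.proj_in = nnx.Linear(dim, inner_dim, rngs=rngs)
    self.drop_out = nnx.Dropout(rate=0.0, rngs=rngs)
    self.proj_out = nnx.Linear(inner_dim, dim, rngs=rngs)

  def __call__(self, hidden_states: jax.Array, deterministic: bool = True) -> jax.Array:
    hidden_states = jax.nn.gelu(self.proj_in(hidden_states))  # the big (batch, seq, inner_dim) tensor
    # Tag the activation: outside a checkpointed region this is an identity,
    # inside one the name is matched against the offload policy.
    hidden_states = checkpoint_name(hidden_states, "ffn_activation")
    hidden_states = self.drop_out(hidden_states, deterministic=deterministic)
    return self.proj_out(hidden_states)

model = ToyFeedForward(dim=64, inner_dim=256, rngs=nnx.Rngs(0))
y = model(jnp.ones((2, 16, 64)))  # -> (2, 16, 64)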
