from . import quantizations
from .modeling_flax_utils import get_activation

+ LOG2E = math.log2(math.e)

Array = common_types.Array
Mesh = common_types.Mesh
@@ -591,9 +592,7 @@ def wrap_ulysses_attention(query, key, value):
    heads_per_tile = getattr(flash_block_sizes, "heads_per_tile", heads_per_tile)

    if use_base2_exp:
-     query_scaled = query * 1.44269504
-   else:
-     query_scaled = query
+     query = query * LOG2E

    query, kv_size, query_seq_len = _pad_data_for_flash(query, heads, bq)
    key, _, key_seq_len = _pad_data_for_flash(key, heads, bkv)
@@ -612,7 +611,7 @@ def wrap_ulysses_attention(query, key, value):
    )

    vmapped_splash = jax.vmap(splash_kernel, in_axes=(0, 0, 0))
-   attention_output = vmapped_splash(query_scaled, key, value)
+   attention_output = vmapped_splash(query, key, value)
    attention_output = jnp.swapaxes(attention_output, 2, 3)
    attention_output = attention_output[:, :, :query_seq_len, :kv_size].astype(query.dtype)
  else:
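
For context on the change above: multiplying the query by LOG2E = log2(e) lets an attention kernel evaluate the softmax with base-2 exponentials, since exp(x) == 2**(x * log2(e)) and exp2 is generally cheaper than exp on accelerators; because (q * c) @ k^T == c * (q @ k^T), the conversion factor can be folded into the query once before the kernel runs. The following is a minimal standalone sketch of that identity, not the splash kernel touched by this diff; the helper names are made up for the example.

# Illustrative sketch only: verifies the base-2 softmax identity behind the
# LOG2E pre-scaling. Function names are hypothetical, not from the codebase.
import math

import jax
import jax.numpy as jnp

LOG2E = math.log2(math.e)


def softmax_base_e(logits):
  # Reference softmax using the natural exponential.
  return jax.nn.softmax(logits, axis=-1)


def softmax_base_2(logits):
  # Same result computed with exp2 after scaling the logits by log2(e).
  scaled = logits * LOG2E
  scaled = scaled - jnp.max(scaled, axis=-1, keepdims=True)  # numerical stability
  weights = jnp.exp2(scaled)
  return weights / jnp.sum(weights, axis=-1, keepdims=True)


q = jax.random.normal(jax.random.PRNGKey(0), (2, 4, 16))
k = jax.random.normal(jax.random.PRNGKey(1), (2, 4, 16))

# Scaling the query before the matmul is equivalent to scaling the logits.
logits_from_scaled_q = jnp.einsum("bqd,bkd->bqk", q * LOG2E, k)
logits_scaled_after = LOG2E * jnp.einsum("bqd,bkd->bqk", q, k)
assert jnp.allclose(logits_from_scaled_q, logits_scaled_after, atol=1e-4)

# The base-2 softmax over unscaled logits matches the base-e softmax.
logits = jnp.einsum("bqd,bkd->bqk", q, k)
assert jnp.allclose(softmax_base_e(logits), softmax_base_2(logits), atol=1e-6)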