Skip to content

Commit c8e0a32

Browse files
Merge pull request mlcommons#902 from mlcommons/lm_workload_tf32
LM workload tf32
2 parents 7d9436b + 5d4cee9 commit c8e0a32

32 files changed

Lines changed: 3472 additions & 35 deletions

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,4 @@ scoring/plots/
2525
!scoring/test_data/experiment_dir/study_0/mnist_jax/trial_0/eval_measurements.csv
2626
!scoring/test_data/experiment_dir/study_0/mnist_jax/trial_1/eval_measurements.csv
2727

28-
algoperf/_version.py
28+
algoperf/_version.py

algoperf/checkpoint_utils.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,16 @@
55
"""
66

77
import os
8-
from typing import Sequence, Tuple
8+
from typing import Optional, Sequence, Tuple
99

1010
import numpy as np
11+
import orbax.checkpoint as ocp
1112
import torch
1213
from absl import logging
1314
from flax import jax_utils
1415
from flax.training import checkpoints as flax_checkpoints
1516
from flax.training.checkpoints import latest_checkpoint
17+
from orbax.checkpoint.type_handlers import NumpyHandler
1618
from tensorflow.io import gfile # pytype: disable=import-error
1719

1820
from algoperf import spec
@@ -30,6 +32,51 @@
3032
]
3133

3234

35+
class BoolHandler(NumpyHandler):
  """Orbax TypeHandler for np.bool_ scalars, built on top of NumpyHandler.

  Scalars are round-tripped through 0-dimensional numpy arrays so that the
  parent handler's array serialization machinery can be reused unchanged.
  """

  def typestr(self) -> str:
    """Unique string identifier for this handler."""
    return 'np.bool_'

  async def serialize(
    self,
    values: Sequence[np.bool_],
    infos: Sequence,
    args: Optional[Sequence[ocp.SaveArgs]] = None,
  ):
    """Serialize np.bool_ scalars via the parent's ndarray path."""
    # Promote each scalar to a 0-d boolean ndarray before delegating.
    as_arrays = [np.asarray(value, dtype=np.bool_) for value in values]
    return await super().serialize(as_arrays, infos, args)

  async def deserialize(
    self,
    infos: Sequence,
    args: Optional[Sequence[ocp.RestoreArgs]] = None,
  ) -> Sequence[np.bool_]:
    """Restore np.bool_ scalars from the parent's 0-d ndarray output."""
    raw = await super().deserialize(infos, args)
    # Collapse each 0-d array back to a scalar np.bool_ via .item().
    return [np.bool_(array.item()) for array in raw]


# Make orbax use this handler for bare np.bool_ leaves in a checkpoint
# tree, replacing any previously registered handler for that type.
ocp.type_handlers.register_type_handler(np.bool_, BoolHandler(), override=True)
3380
def maybe_restore_checkpoint(
3481
framework: str,
3582
optimizer_state: spec.OptimizerState,

algoperf/param_utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ def pytorch_param_types(
4444
param_types[name] = spec.ParameterType.ATTENTION_BIAS
4545
elif 'in_proj' in name:
4646
param_types[name] = spec.ParameterType.ATTENTION_QKV
47+
elif 'qkv' in name:
48+
param_types[name] = spec.ParameterType.ATTENTION_QKV
4749
elif 'kv_proj' in name:
4850
param_types[name] = spec.ParameterType.ATTENTION_KV
4951
elif 'k_proj' in name or 'key' in name:

algoperf/pytorch_utils.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,17 @@
2121

2222
def pytorch_setup() -> Tuple[bool, int, torch.device, int]:
  """Query the distributed/GPU environment for PyTorch workloads.

  Side effect: enables TF32-capable matmuls via
  torch.set_float32_matmul_precision('high').

  Returns:
    Tuple of (use_pytorch_ddp, rank, device, n_gpus) where use_pytorch_ddp
    is True iff LOCAL_RANK is set in the environment, rank is the local
    rank (0 outside DDP), device is the matching cuda/cpu torch.device,
    and n_gpus is torch.cuda.device_count().
  """
  torch.set_float32_matmul_precision('high')

  ddp_active = 'LOCAL_RANK' in os.environ
  local_rank = int(os.environ['LOCAL_RANK']) if ddp_active else 0
  if torch.cuda.is_available():
    device = torch.device(f'cuda:{local_rank}')
  else:
    device = torch.device('cpu')
  return ddp_active, local_rank, device, torch.cuda.device_count()
2930

3031

31-
def pytorch_init(use_pytorch_ddp: bool, rank: int, profiler: Profiler) -> None:
32+
def pytorch_init(
33+
use_pytorch_ddp: bool, rank: int, profiler: Profiler, limit_tf_threads=True
34+
) -> None:
3235
# Make sure no GPU memory is preallocated to Jax.
3336
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'
3437
# Only use CPU for Jax to avoid memory issues.
@@ -40,7 +43,7 @@ def pytorch_init(use_pytorch_ddp: bool, rank: int, profiler: Profiler) -> None:
4043

4144
if use_pytorch_ddp:
4245
# Avoid tf input pipeline creating too many threads.
43-
if rank != 0:
46+
if rank != 0 and limit_tf_threads:
4447
tf.config.threading.set_intra_op_parallelism_threads(1)
4548
tf.config.threading.set_inter_op_parallelism_threads(1)
4649

algoperf/random_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,13 @@ def _signed_to_unsigned(seed: SeedType) -> SeedType:
3535

3636
def _fold_in(seed: SeedType, data: Any) -> List[Union[SeedType, Any]]:
  """Derive a fresh seed from `seed` and pair it with `data`.

  NOTE(review): drawing with dtype=np.uint32 requires MIN_INT32 >= 0;
  if the module-level MIN_INT32 is negative, randint raises ValueError —
  confirm the bounds defined alongside this function.
  """
  generator = np.random.RandomState(seed=_signed_to_unsigned(seed))
  fresh_seed = generator.randint(MIN_INT32, MAX_INT32, dtype=np.uint32)
  return [fresh_seed, data]
4040

4141

4242
def _split(seed: SeedType, num: int = 2) -> SeedType:
  """Split `seed` into `num` new (seed, seed) pairs as a [num, 2] array.

  NOTE(review): dtype=np.uint32 requires a non-negative MIN_INT32 bound;
  verify the module-level constant, otherwise randint raises ValueError.
  """
  generator = np.random.RandomState(seed=_signed_to_unsigned(seed))
  return generator.randint(MIN_INT32, MAX_INT32, dtype=np.uint32, size=[num, 2])
4545

4646

4747
def _PRNGKey(seed: SeedType) -> SeedType: # pylint: disable=invalid-name

algoperf/workloads/finewebedu_lm/__init__.py

Whitespace-only changes.

algoperf/workloads/finewebedu_lm/finewebedu_lm_jax/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)