Skip to content

Commit 7d9436b

Browse files
Merge pull request mlcommons#892 from mlcommons/a100
Migrate workloads to A100 hardware weightclass
2 parents 7d8f609 + f7ce628 commit 7d9436b

27 files changed

Lines changed: 383 additions & 79 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ The MLCommons™ **AlgoPerf: Training Algorithms benchmark** is designed to find
3131
When training neural nets, practitioners face many critical yet often opaque decisions: What optimizer to choose? How should its learning rate be tuned? What learning rate schedule should be used? These choices can make or break training, yet the community has lacked a clear, standardized way to identify the state of the art.
3232
Unlike benchmarks focused on hardware or model architecture, AlgoPerf isolates the **training algorithm** itself, which includes the optimizer, regularization, data selection, and hyperparameters like the learning rate schedule. By standardizing the benchmark process, AlgoPerf offers a meaningful apples-to-apples comparison of training algorithms and follows the following **key principles**:
3333

34-
- 🎯 **Fixed Target, Model & Hardware:** Submitted training algorithms must train a set of [**fixed models**](/docs/DOCUMENTATION.md#workloads) to a pre-defined validation performance target as fast as possible. All submissions use the same model architecture and are run on the same [**standardized hardware**](/docs/DOCUMENTATION.md#benchmarking-hardware) (8x NVIDIA V100 GPUs). This isolates the training algorithm's performance and allows a fair apples-to-apples comparison.
34+
- 🎯 **Fixed Target, Model & Hardware:** Submitted training algorithms must train a set of [**fixed models**](/docs/DOCUMENTATION.md#workloads) to a pre-defined validation performance target as fast as possible. All submissions use the same model architecture and are run on the same [**standardized hardware**](/docs/DOCUMENTATION.md#benchmarking-hardware) (4x A100 (40GB) GPUs). This isolates the training algorithm's performance and allows a fair apples-to-apples comparison.
3535
- ⏱️ **Time-To-Result:** Submissions are evaluated based on the total wall-clock time required to reach the target, rewarding practical and efficient algorithms.
3636
- 🧠 **Diverse Workloads:** The benchmark includes [**8 diverse deep learning workloads**](/docs/DOCUMENTATION.md#workloads) across domains like image classification, speech recognition, and machine translation. A submission's score is computed by aggregating its performance, using [**performance profiles**](/docs/DOCUMENTATION.md#benchmark-score-using-performance-profiles), across all workloads to ensure general-purpose algorithms.
3737
- 📦 **Fully-Specified Algorithms:** Submissions must be complete procedures and thus hyperparameter tuning is treated as part of the algorithm. Submissions can either provide a search space for automated tuning ([**External tuning ruleset**](/docs/DOCUMENTATION.md#external-tuning-ruleset)) or be hyperparameter-free ([**Self-tuning ruleset**](/docs/DOCUMENTATION.md#self-tuning-ruleset)) with any tuning done automatically and "on the clock". This measures an algorithm's _total_ practical cost and provides practitioners with a complete method, eliminating the guesswork of how to apply it.

algoperf/pytorch_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121

2222
def pytorch_setup() -> Tuple[bool, int, torch.device, int]:
23+
torch.set_float32_matmul_precision('high')
2324
use_pytorch_ddp = 'LOCAL_RANK' in os.environ
2425
rank = int(os.environ['LOCAL_RANK']) if use_pytorch_ddp else 0
2526
device = torch.device(f'cuda:{rank}' if torch.cuda.is_available() else 'cpu')

algoperf/workloads/cifar/cifar_pytorch/workload.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,12 +110,12 @@ def _build_dataset(
110110
batch_size=ds_iter_batch_size,
111111
shuffle=not USE_PYTORCH_DDP and is_train,
112112
sampler=sampler,
113-
num_workers=4 if is_train else self.eval_num_workers,
113+
num_workers=2 * N_GPUS if is_train else self.eval_num_workers,
114114
pin_memory=True,
115115
drop_last=is_train,
116116
)
117-
dataloader = data_utils.PrefetchedWrapper(dataloader, DEVICE)
118117
dataloader = data_utils.cycle(dataloader, custom_sampler=USE_PYTORCH_DDP)
118+
dataloader = data_utils.dataloader_iterator_wrapper(dataloader, DEVICE)
119119
return dataloader
120120

121121
def init_model_fn(self, rng: spec.RandomState) -> spec.ModelInitState:

algoperf/workloads/criteo1tb/workload.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,11 @@ def train_stddev(self):
9595

9696
@property
9797
def max_allowed_runtime_sec(self) -> int:
98-
return 7_703 # ~2.1 hours.
98+
return 8_915 # ~2.4 hours.
9999

100100
@property
101101
def eval_period_time_sec(self) -> int:
102-
return 2 * 60 # 2 mins.
102+
return 356 # approx 25 evals
103103

104104
def _build_input_queue(
105105
self,

algoperf/workloads/fastmri/workload.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,11 @@ def accelerations(self):
9595

9696
@property
9797
def max_allowed_runtime_sec(self) -> int:
98-
return 4_430 # ~1.2 hours
98+
return 2_745 # ~0.7 hours
9999

100100
@property
101101
def eval_period_time_sec(self) -> int:
102-
return 80
102+
return 110 # approx 25 evals
103103

104104
@property
105105
def step_hint(self) -> int:

algoperf/workloads/imagenet_resnet/imagenet_pytorch/workload.py

Lines changed: 109 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,25 @@
33
import contextlib
44
import functools
55
import itertools
6+
import json
67
import math
78
import os
89
import random
9-
from typing import Dict, Iterator, Optional, Tuple
10+
import time
11+
from pathlib import Path
12+
from typing import Any, Callable, Dict, Iterator, Optional, Tuple, Union
1013

1114
import numpy as np
1215
import torch
1316
import torch.distributed as dist
1417
import torch.nn.functional as F
1518
from torch.nn.parallel import DistributedDataParallel as DDP
1619
from torchvision import transforms
17-
from torchvision.datasets.folder import ImageFolder
20+
from torchvision.datasets.folder import (
21+
IMG_EXTENSIONS,
22+
ImageFolder,
23+
default_loader,
24+
)
1825

1926
import algoperf.random_utils as prng
2027
from algoperf import data_utils, param_utils, pytorch_utils, spec
@@ -28,6 +35,100 @@
2835
USE_PYTORCH_DDP, RANK, DEVICE, N_GPUS = pytorch_utils.pytorch_setup()
2936

3037

38+
class CachedImageFolder(ImageFolder):
39+
"""ImageFolder that caches the file listing to avoid repeated filesystem scans."""
40+
41+
def __init__(
42+
self,
43+
root: Union[str, Path],
44+
cache_file: Optional[Union[str, Path]] = None,
45+
transform: Optional[Callable] = None,
46+
target_transform: Optional[Callable] = None,
47+
loader: Callable[[str], Any] = default_loader,
48+
is_valid_file: Optional[Callable[[str], bool]] = None,
49+
allow_empty: bool = False,
50+
rebuild_cache: bool = False,
51+
cache_build_timeout_minutes: int = 30,
52+
):
53+
self.root = os.path.abspath(root)
54+
self.transform = transform
55+
self.target_transform = target_transform
56+
self.loader = loader
57+
self.extensions = IMG_EXTENSIONS if is_valid_file is None else None
58+
59+
# Default cache location: .cache_index.json in the root directory
60+
if cache_file is None:
61+
cache_file = os.path.join(self.root, '.cache_index.json')
62+
self.cache_file = cache_file
63+
64+
is_distributed = dist.is_available() and dist.is_initialized()
65+
rank = dist.get_rank() if is_distributed else 0
66+
67+
cache_exists = os.path.exists(self.cache_file)
68+
needs_rebuild = rebuild_cache or not cache_exists
69+
70+
if needs_rebuild:
71+
# We only want one process to build the cache
72+
# and others to wait for it to finish.
73+
if rank == 0:
74+
self._build_and_save_cache(is_valid_file, allow_empty)
75+
if is_distributed:
76+
self._wait_for_cache(timeout_minutes=cache_build_timeout_minutes)
77+
dist.barrier()
78+
79+
self._load_from_cache()
80+
81+
self.targets = [s[1] for s in self.samples]
82+
self.imgs = self.samples
83+
84+
def _wait_for_cache(self, timeout_minutes: int):
85+
"""Poll for cache file to exist."""
86+
timeout_seconds = timeout_minutes * 60
87+
poll_interval = 5
88+
elapsed = 0
89+
90+
while not os.path.exists(self.cache_file):
91+
if elapsed >= timeout_seconds:
92+
raise TimeoutError(
93+
f'Timed out waiting for cache file after {timeout_minutes} minutes: {self.cache_file}'
94+
)
95+
time.sleep(poll_interval)
96+
elapsed += poll_interval
97+
98+
def _load_from_cache(self):
99+
"""Load classes and samples from cache file."""
100+
with open(os.path.abspath(self.cache_file), 'r') as f:
101+
cache = json.load(f)
102+
self.classes = cache['classes']
103+
self.class_to_idx = cache['class_to_idx']
104+
# Convert relative paths back to absolute
105+
self.samples = [
106+
(os.path.join(self.root, rel_path), idx)
107+
for rel_path, idx in cache['samples']
108+
]
109+
110+
def _build_and_save_cache(self, is_valid_file, allow_empty):
111+
"""Scan filesystem, build index, and save to cache."""
112+
self.classes, self.class_to_idx = self.find_classes(self.root)
113+
self.samples = self.make_dataset(
114+
self.root,
115+
class_to_idx=self.class_to_idx,
116+
extensions=self.extensions,
117+
is_valid_file=is_valid_file,
118+
allow_empty=allow_empty,
119+
)
120+
121+
cache = {
122+
'classes': self.classes,
123+
'class_to_idx': self.class_to_idx,
124+
'samples': [
125+
(os.path.relpath(path, self.root), idx) for path, idx in self.samples
126+
],
127+
}
128+
with open(os.path.abspath(self.cache_file), 'w') as f:
129+
json.dump(cache, f)
130+
131+
31132
def imagenet_v2_to_torch(
32133
batch: Dict[str, spec.Tensor],
33134
) -> Dict[str, spec.Tensor]:
@@ -119,8 +220,10 @@ def _build_dataset(
119220
)
120221

121222
folder = 'train' if 'train' in split else 'val'
122-
dataset = ImageFolder(
123-
os.path.join(data_dir, folder), transform=transform_config
223+
dataset = CachedImageFolder(
224+
os.path.join(data_dir, folder),
225+
transform=transform_config,
226+
cache_file='.imagenet_{}_cache_index.json'.format(split),
124227
)
125228

126229
if split == 'eval_train':
@@ -145,16 +248,16 @@ def _build_dataset(
145248
sampler = data_utils.DistributedEvalSampler(
146249
dataset, num_replicas=N_GPUS, rank=RANK, shuffle=False
147250
)
148-
149251
dataloader = torch.utils.data.DataLoader(
150252
dataset,
151253
batch_size=ds_iter_batch_size,
152254
shuffle=not USE_PYTORCH_DDP and is_train,
153255
sampler=sampler,
154-
num_workers=4 if is_train else self.eval_num_workers,
256+
num_workers=5 * N_GPUS if is_train else self.eval_num_workers,
155257
pin_memory=True,
156258
drop_last=is_train,
157259
persistent_workers=is_train,
260+
prefetch_factor=N_GPUS,
158261
)
159262
dataloader = data_utils.PrefetchedWrapper(dataloader, DEVICE)
160263
dataloader = data_utils.cycle(
@@ -163,7 +266,6 @@ def _build_dataset(
163266
use_mixup=use_mixup,
164267
mixup_alpha=0.2,
165268
)
166-
167269
return dataloader
168270

169271
def init_model_fn(self, rng: spec.RandomState) -> spec.ModelInitState:

algoperf/workloads/imagenet_resnet/workload.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,11 +103,11 @@ def resize_size(self) -> int:
103103

104104
@property
105105
def max_allowed_runtime_sec(self) -> int:
106-
return 66_159 # ~18.4 hours
106+
return 49_918 # ~13.8 hours
107107

108108
@property
109109
def eval_period_time_sec(self) -> int:
110-
return 510 # 8.5 minutes.
110+
return 1_996 # approx 25 evals
111111

112112
def _build_dataset(
113113
self,

algoperf/workloads/imagenet_vit/imagenet_pytorch/models.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
and https://github.com/lucidrains/vit-pytorch.
66
"""
77

8-
import math
98
from typing import Any, Optional, Tuple, Union
109

1110
import torch
@@ -126,13 +125,14 @@ def forward(self, x: spec.Tensor, dropout_rate: float) -> spec.Tensor:
126125
value_layer = self.transpose_for_scores(self.value(x))
127126
query_layer = self.transpose_for_scores(mixed_query_layer)
128127

129-
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
130-
attention_scores = attention_scores / math.sqrt(self.head_dim)
131-
132-
attention_probs = F.softmax(attention_scores, dim=-1)
133-
attention_probs = F.dropout(attention_probs, dropout_rate, self.training)
128+
# Use built-in scaled_dot_product_attention (Flash Attention when available)
129+
context_layer = F.scaled_dot_product_attention(
130+
query_layer,
131+
key_layer,
132+
value_layer,
133+
dropout_p=dropout_rate if self.training else 0.0,
134+
)
134135

135-
context_layer = torch.matmul(attention_probs, value_layer)
136136
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
137137
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_dim,)
138138
context_layer = context_layer.view(new_context_layer_shape)

algoperf/workloads/imagenet_vit/workload.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,11 @@ def eval_batch_size(self) -> int:
8888

8989
@property
9090
def max_allowed_runtime_sec(self) -> int:
91-
return 69_768 # ~19.4 hours
91+
return 64_292 # ~17.8 hours
9292

9393
@property
9494
def eval_period_time_sec(self) -> int:
95-
return 7 * 60 # 7 mins.
95+
return 2_571 # approx 25 evals
9696

9797
def _build_dataset(
9898
self,

algoperf/workloads/librispeech_conformer/workload.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,11 +80,11 @@ def train_stddev(self):
8080

8181
@property
8282
def max_allowed_runtime_sec(self) -> int:
83-
return 58_015 # ~16.1 hours
83+
return 43_680 # ~12.1 hours
8484

8585
@property
8686
def eval_period_time_sec(self) -> int:
87-
return 24 * 60
87+
return 1747 # approx 25 evals
8888

8989
@property
9090
def step_hint(self) -> int:

0 commit comments

Comments
 (0)