diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c6da822..0ac926f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -194,8 +194,9 @@ add_executable(gpt2 example/gpt2/main.cc example/common/tiny_shakespeare_dataset.cc example/common/utils.cc - example/gpt2/checkpoint_loader.cc + example/common/checkpoint_loader.cc example/common/tokenizer.cc + example/gpt2/checkpoint_loader.cc ) link_infini_train_exe(gpt2) @@ -203,8 +204,9 @@ add_executable(llama3 example/llama3/main.cc example/common/tiny_shakespeare_dataset.cc example/common/utils.cc - example/llama3/checkpoint_loader.cc + example/common/checkpoint_loader.cc example/common/tokenizer.cc + example/llama3/checkpoint_loader.cc ) link_infini_train_exe(llama3) diff --git a/example/common/checkpoint_loader.cc b/example/common/checkpoint_loader.cc new file mode 100644 index 00000000..29b04564 --- /dev/null +++ b/example/common/checkpoint_loader.cc @@ -0,0 +1,119 @@ +#include "example/common/checkpoint_loader.h" + +#include +#include +#include +#include +#include +#include + +#include "glog/logging.h" + +#include "infini_train/include/nn/modules/transformer/transformer_config.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/tensor.h" + +using namespace infini_train; +namespace nn = infini_train::nn; + +// TODO(jym): ckpt is a new checkpoint format; bin is the legacy format. Keeping both as an interim solution; plan to +// consolidate into one later. +ResumeFromCheckpointResult ResumeFromCheckpoint(const ResumeFromCheckpointArgs &args) { + ResumeFromCheckpointResult result; + if (args.resume_root.empty()) { + LOG(INFO) << "No checkpoint specified for resume. Starting training from scratch."; + return result; + } + + int ddp_world_size = nn::parallel::global::GetDataParallelSize(); + int tp_world_size = nn::parallel::global::GetTensorParallelSize(); + int sp_world_size = nn::parallel::global::GetSequenceParallelEnabled() ? 
tp_world_size : 1; + int pp_world_size = nn::parallel::global::GetPipelineParallelSize(); + + std::filesystem::path resume_dir = args.resume_root; + if (args.rank.IsParallel()) { + const auto rank_dir = resume_dir / std::format("rank_{:06d}", args.rank.GlobalRank()); + if (std::filesystem::exists(rank_dir)) { + resume_dir = rank_dir; + } + } + + Checkpoint::Load(resume_dir, *args.model, args.optimizer.get(), args.state); + + result.global_step = static_cast(args.state.global_step); + + CHECK_EQ(args.state.n_layer, args.model_config.n_layer) + << "n_layer mismatch: ckpt=" << args.state.n_layer << ", config=" << args.model_config.n_layer; + CHECK_EQ(args.state.n_head, args.model_config.n_head) + << "n_head mismatch: ckpt=" << args.state.n_head << ", config=" << args.model_config.n_head; + CHECK_EQ(args.state.n_kv_head, args.model_config.n_kv_head) + << "n_kv_head mismatch: ckpt=" << args.state.n_kv_head << ", config=" << args.model_config.n_kv_head; + CHECK_EQ(args.state.n_embd, args.model_config.n_embd) + << "n_embd mismatch: ckpt=" << args.state.n_embd << ", config=" << args.model_config.n_embd; + CHECK_EQ(args.state.vocab_size, args.model_config.vocab_size) + << "vocab_size mismatch: ckpt=" << args.state.vocab_size << ", config=" << args.model_config.vocab_size; + + CHECK_EQ(args.state.ddp_size, ddp_world_size) << "DDP size mismatch: checkpoint has DDP=" << args.state.ddp_size + << ", but current run has DDP=" << ddp_world_size; + CHECK_EQ(args.state.tp_size, tp_world_size) + << "TP size mismatch: checkpoint has TP=" << args.state.tp_size << ", but current run has TP=" << tp_world_size; + CHECK_EQ(args.state.sp_size, sp_world_size) + << "SP size mismatch: checkpoint has SP=" << args.state.sp_size << ", but current run has SP=" << sp_world_size; + CHECK_EQ(args.state.pp_size, pp_world_size) + << "PP size mismatch: checkpoint has PP=" << args.state.pp_size << ", but current run has PP=" << pp_world_size; + + result.consumed_batches = 
static_cast(std::max(args.state.consumed_batches, 0)); + if (args.rank.IsMainRank()) { + LOG(INFO) << std::format("Resume training from step {}, last_lr {:.3e}, consumed_batches {}", + args.state.global_step, args.state.last_lr, args.state.consumed_batches); + } + + return result; +} + +// Builds a TrainerState from args and hands it, with the model and optimizer, to Checkpoint::Save. +// On the main rank it then logs the elapsed time and, when args.prune_step_checkpoints is set, removes the +// lexicographically smallest checkpoint_step_* directories under args.checkpoint_root_dir until at most +// args.max_checkpoint_keep remain. +void SaveCheckpoint(const SaveCheckpointArgs &args) { + const auto ckpt_start = std::chrono::high_resolution_clock::now(); + + TrainerState state; + state.global_step = args.global_step; + state.consumed_batches = static_cast(args.consumed_batches); + state.last_lr = args.last_lr; + state.n_layer = args.n_layer; + state.n_head = args.n_head; + state.n_kv_head = args.n_kv_head; + state.n_embd = args.n_embd; + state.vocab_size = args.vocab_size; + state.ddp_size = args.ddp_size; + state.tp_size = args.tp_size; + state.sp_size = args.sp_size; + state.pp_size = args.pp_size; + + // Forward the caller's no_save_optim choice; Checkpoint::Save defaults it to false, so omitting it here + // silently ignores the flag. + Checkpoint::Save(args.save_dir, args.model, &args.optimizer, state, args.no_save_optim); + + const auto ckpt_end = std::chrono::high_resolution_clock::now(); + const double ckpt_ms = std::chrono::duration(ckpt_end - ckpt_start).count(); + + if (!args.rank.IsMainRank()) { + return; + } + + LOG(INFO) << std::format("Checkpoint saved at: {} ({:.2f} ms)", args.save_dir.string(), ckpt_ms); + + if (!args.prune_step_checkpoints) { + return; + } + + std::vector ckpts; + if (std::filesystem::exists(args.checkpoint_root_dir)) { + for (const auto &entry : std::filesystem::directory_iterator(args.checkpoint_root_dir)) { + if (entry.is_directory() && entry.path().filename().string().starts_with("checkpoint_step_")) { + ckpts.push_back(entry.path()); + } + } + std::sort(ckpts.begin(), ckpts.end()); + while (ckpts.size() > args.max_checkpoint_keep) { + std::filesystem::remove_all(ckpts.front()); + ckpts.erase(ckpts.begin()); + } + } +} diff --git a/example/common/checkpoint_loader.h b/example/common/checkpoint_loader.h new file mode 100644 index 00000000..c8b4e939 --- /dev/null +++ b/example/common/checkpoint_loader.h @@ -0,0 +1,60 @@ +#pragma
once + +#include +#include +#include + +#include "infini_train/include/checkpoint.h" +#include "infini_train/include/dataloader.h" +#include "infini_train/include/nn/modules/module.h" +#include "infini_train/include/nn/parallel/rank.h" +#include "infini_train/include/optimizer.h" + +using namespace infini_train; +namespace nn = infini_train::nn; + +namespace infini_train::nn { +class TransformerConfig; +} + +struct ResumeFromCheckpointArgs { + std::filesystem::path resume_root; + const nn::parallel::Rank &rank; + std::shared_ptr model; + std::shared_ptr optimizer; + DistributedDataLoader &train_loader; + const nn::TransformerConfig &model_config; + TrainerState &state; +}; + +struct ResumeFromCheckpointResult { + int global_step = 0; + size_t consumed_batches = 0; +}; + +struct SaveCheckpointArgs { + std::filesystem::path save_dir; + int64_t global_step = 0; + size_t consumed_batches = 0; + double last_lr = 0.0; + int64_t n_layer = 0; + int64_t n_head = 0; + int64_t n_kv_head = 0; + int64_t n_embd = 0; + int64_t vocab_size = 0; + int ddp_size = 1; + int tp_size = 1; + int sp_size = 1; + int pp_size = 1; + bool no_save_optim = false; + bool prune_step_checkpoints = false; + std::filesystem::path checkpoint_root_dir; + size_t max_checkpoint_keep = 0; + const nn::parallel::Rank &rank; + const nn::Module &model; + const Optimizer &optimizer; +}; + +ResumeFromCheckpointResult ResumeFromCheckpoint(const ResumeFromCheckpointArgs &args); + +void SaveCheckpoint(const SaveCheckpointArgs &args); diff --git a/example/gpt2/checkpoint_loader.cc b/example/gpt2/checkpoint_loader.cc index 4a7789e9..d5353531 100644 --- a/example/gpt2/checkpoint_loader.cc +++ b/example/gpt2/checkpoint_loader.cc @@ -5,15 +5,12 @@ #include #include #include -#include #include #include #include #include "glog/logging.h" -#include "example/common/utils.h" -#include "example/gpt2/config.h" #include "infini_train/include/nn/modules/normalization.h" #include "infini_train/include/nn/modules/sparse.h" 
#include "infini_train/include/nn/modules/transformer/causal_self_attention.h" @@ -24,29 +21,24 @@ #include "infini_train/include/nn/parallel/tensor_parallel.h" #include "infini_train/include/tensor.h" +#include "example/common/utils.h" +#include "example/gpt2/config.h" + using namespace infini_train; namespace nn = infini_train::nn; namespace { -constexpr int kRandomSeed = 42; +constexpr int32_t kGPT2Magic = 20240326; +constexpr int32_t kGPT2FP32Version = 3; +constexpr int32_t kGPT2BF16Version = 5; -// TODO(dcj): make this rng generator compatible with torch later -static std::mt19937 gen{kRandomSeed}; -} // namespace - -namespace { -constexpr int32_t kHeaderMagic = 20240326; -constexpr int32_t kHeaderFP32Version = 3; -constexpr int32_t kHeaderBF16Version = 5; - -std::tuple DetermineAndCheckVersion(const std::vector &header, - size_t offset) { +std::tuple DetermineAndCheckVersion(const std::vector &header, size_t offset) { const auto version = BytesToType(header, offset); switch (version) { - case kHeaderBF16Version: - return {version, infini_train::DataType::kBFLOAT16}; - case kHeaderFP32Version: - return {version, infini_train::DataType::kFLOAT32}; + case kGPT2BF16Version: + return {version, DataType::kBFLOAT16}; + case kGPT2FP32Version: + return {version, DataType::kFLOAT32}; default: LOG(FATAL) << "Unsupported version: " << version << " at " << __FILE__ << ":" << __LINE__; return {}; // Unreachable, but keeps compiler happy @@ -54,9 +46,9 @@ std::tuple DetermineAndCheckVersion(const std:: } } // namespace -namespace gpt2 { - -std::shared_ptr LoadFromLLMC(const std::string &filepath) { +// TODO(jym): ckpt is a new checkpoint format; bin is the legacy format. Keeping both as an interim solution; plan to +// consolidate into one later. 
+std::shared_ptr gpt2::LoadFromLLMC(const std::string &filepath) { if (!std::filesystem::exists(filepath)) { LOG(FATAL) << "File not found: " << filepath; } @@ -65,9 +57,9 @@ std::shared_ptr LoadFromLLMC(const std::string &filepath) const auto header = ReadSeveralBytesFromIfstream(256 * sizeof(int32_t), &ifs); const auto magic = BytesToType(header, 0); - CHECK_EQ(magic, kHeaderMagic); + CHECK_EQ(magic, kGPT2Magic); auto [version, dtype] = DetermineAndCheckVersion(header, 4); - CHECK_EQ(version, kHeaderFP32Version); + CHECK_EQ(version, kGPT2FP32Version); auto tp_size = nn::parallel::global::GetTensorParallelSize(); @@ -428,4 +420,3 @@ std::shared_ptr LoadFromLLMC(const std::string &filepath) return local_gpt2; } -} // namespace gpt2 diff --git a/example/gpt2/checkpoint_loader.h b/example/gpt2/checkpoint_loader.h index e80c356e..6df14a7e 100644 --- a/example/gpt2/checkpoint_loader.h +++ b/example/gpt2/checkpoint_loader.h @@ -1,11 +1,12 @@ #pragma once +#include #include #include namespace infini_train::nn { class TransformerModel; -} // namespace infini_train::nn +} namespace gpt2 { std::shared_ptr LoadFromLLMC(const std::string &filepath); diff --git a/example/gpt2/config.h b/example/gpt2/config.h index 078f9fd5..c95d6517 100644 --- a/example/gpt2/config.h +++ b/example/gpt2/config.h @@ -38,5 +38,4 @@ inline void SanitizeGPT2Config(const nn::TransformerConfig &c) { CHECK(c.activation_type == nn::MLPType::kGELU) << "GPT-2 requires GELU activation"; CHECK(c.norm_type == nn::NormType::kLayerNorm) << "GPT-2 requires LayerNorm"; } - } // namespace gpt2 diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc index c12b5a28..627c0f09 100644 --- a/example/gpt2/main.cc +++ b/example/gpt2/main.cc @@ -1,6 +1,9 @@ +#include #include #include +#include #include +#include #include #include #include @@ -10,6 +13,7 @@ #include "glog/logging.h" #include "infini_train/include/autocast.h" +#include "infini_train/include/checkpoint.h" #include 
"infini_train/include/core/runtime/device_guard.h" #include "infini_train/include/dataloader.h" #include "infini_train/include/device.h" @@ -34,11 +38,13 @@ #include "infini_train/include/utils/precision_check_config.h" #include "infini_train/include/utils/precision_checker.h" +#include "example/common/checkpoint_loader.h" #include "example/common/tiny_shakespeare_dataset.h" #include "example/common/tokenizer.h" #include "example/gpt2/checkpoint_loader.h" #include "example/gpt2/config.h" +// TODO(jym): Reorganize CLI flags into categories for better readability and maintainability. // I/O DEFINE_string(input_bin, "", "input .bin to train on"); DEFINE_string(input_val_bin, "", "input .bin to eval validation loss on"); @@ -77,6 +83,11 @@ DEFINE_uint32(virtual_pipeline_parallel, 1, "Number of chunks in PP stage."); // precision DEFINE_string(dtype, "float32", "precision used in training (float32/bfloat16)"); +DEFINE_uint32(save_interval, 0, "save checkpoint every N steps; 0 disables saving"); +DEFINE_string(load, "", "checkpoint directory to resume from"); +DEFINE_string(save, "./checkpoints", "root directory used to store checkpoints"); +DEFINE_uint32(max_checkpoint_keep, 3, "max number of checkpoint steps to keep"); +// Negative flag: true means optimizer state is NOT written. +DEFINE_bool(no_save_optim, false, "do not persist optimizer state in checkpoints"); // precision check DEFINE_string( precision_check, "", @@ -193,6 +204,8 @@ void Train(const nn::parallel::Rank &rank) { gpt2::SanitizeGPT2Config(model_config); model = std::make_shared(model_config); } + auto llmc_model = std::dynamic_pointer_cast(model); + CHECK(llmc_model != nullptr) << "Failed to cast model to GPT2 for LLMC checkpoint I/O."; model->To(device); @@ -292,8 +305,8 @@ void Train(const nn::parallel::Rank &rank) { // TODO(dcj): support more complex optimizer later // auto optimizer = optimizers::SGD(model->Parameters(), FLAGS_learning_rate); - auto optimizer_creator = optimizers::SGD::Create(FLAGS_learning_rate); std::shared_ptr optimizer = nullptr; +
auto optimizer_creator = optimizers::SGD::Create(FLAGS_learning_rate); if (FLAGS_use_distributed_optimizer) { auto model_chunks = (pp_world_size > 1) @@ -306,6 +319,7 @@ void Train(const nn::parallel::Rank &rank) { } auto train_iter = train_loader.begin(); + std::shared_ptr loss_fn = (tp_world_size > 1) ? std::static_pointer_cast( std::make_shared(model_config.original_vocab_size)) @@ -315,9 +329,55 @@ void Train(const nn::parallel::Rank &rank) { auto impl = core::GetDeviceGuardImpl(device.type()); - LOG(INFO) << "start training"; + int start_step = 0; + TrainerState state; + const auto resume_result = ResumeFromCheckpoint({.resume_root = FLAGS_load, + .rank = rank, + .model = model, + .optimizer = optimizer, + .train_loader = train_loader, + .model_config = model_config, + .state = state}); + start_step = resume_result.global_step; + size_t consumed_batches = resume_result.consumed_batches; + + // TODO(jym): Replace with Sampler abstraction when available. + // Skip dataloader to resume from the correct batch position. + if (consumed_batches > 0) { + size_t start = train_iter.BatchIndex(); + // Each rank processes every ddp_world_size-th batch starting from its own rank. + // num_skips calculates how many ++ iterations to reach the saved batch position. 
+ size_t num_skips = (consumed_batches - start) / ddp_world_size; + for (size_t i = 0; i < num_skips; ++i) { ++train_iter; } + } - for (int step = 0; step < FLAGS_num_iteration + 1; ++step) { + auto save_checkpoint + = [&](const std::filesystem::path &save_dir, int64_t global_step, bool prune_step_checkpoints) { + SaveCheckpoint({ + .save_dir = save_dir, + .global_step = global_step, + .consumed_batches = consumed_batches, + .last_lr = FLAGS_learning_rate, + .n_layer = model_config.n_layer, + .n_head = model_config.n_head, + .n_kv_head = model_config.n_kv_head, + .n_embd = model_config.n_embd, + .vocab_size = model_config.vocab_size, + .ddp_size = ddp_world_size, + .tp_size = tp_world_size, + .sp_size = sp_world_size, + .pp_size = pp_world_size, + .no_save_optim = FLAGS_no_save_optim, + .prune_step_checkpoints = prune_step_checkpoints, + .checkpoint_root_dir = FLAGS_save, + .max_checkpoint_keep = FLAGS_max_checkpoint_keep, + .rank = rank, + .model = *model, + .optimizer = *optimizer, + }); + }; + + for (int step = start_step; step < FLAGS_num_iteration + 1; ++step) { // Reset precision check counters at start of each iteration for file overwrite utils::PrecisionChecker::ResetCounters(); @@ -367,6 +427,7 @@ void Train(const nn::parallel::Rank &rank) { // if we are trying to overfit a single batch, we reset the loader here by commenting out the line below // TODO(dcj): support dataloader.reset() later ++train_iter; + consumed_batches = train_iter.BatchIndex(); x = std::make_shared(x->To(device)); y = std::make_shared(y->To(device)); @@ -397,6 +458,7 @@ void Train(const nn::parallel::Rank &rank) { // if we are trying to overfit a single batch, we reset the loader here by commenting out the line below // TODO(dcj): support dataloader.reset() later ++train_iter; + consumed_batches = train_iter.BatchIndex(); x = std::make_shared(x->To(device)); y = std::make_shared(y->To(device)); @@ -431,6 +493,15 @@ void Train(const nn::parallel::Rank &rank) { } } } + + if 
(FLAGS_save_interval > 0 && (step + 1) % FLAGS_save_interval == 0) { + std::filesystem::path step_dir + = std::filesystem::path(FLAGS_save) / std::format("checkpoint_step_{:06d}", step + 1); + if (rank.IsParallel()) { + step_dir /= std::format("rank_{:06d}", rank.GlobalRank()); + } + save_checkpoint(step_dir, step + 1, true); + } } // Save LoRA weights if enabled and path specified @@ -439,6 +510,12 @@ void Train(const nn::parallel::Rank &rank) { nn::lora::SaveLoRAWeights(model, FLAGS_lora_save_path); } + std::filesystem::path final_dir = std::filesystem::path(FLAGS_save) / "checkpoint_final"; + if (rank.IsParallel()) { + final_dir /= std::format("rank_{:06d}", rank.GlobalRank()); + } + save_checkpoint(final_dir, FLAGS_num_iteration, false); + #ifdef PROFILE_MODE Profiler::Instance().Report("gpt2.report", Profiler::SortBy::DeviceTimePercentage); Profiler::Instance().PrintRecords("gpt2.records.log"); diff --git a/example/llama3/checkpoint_loader.cc b/example/llama3/checkpoint_loader.cc index f29bc540..98eacb2f 100644 --- a/example/llama3/checkpoint_loader.cc +++ b/example/llama3/checkpoint_loader.cc @@ -5,15 +5,12 @@ #include #include #include -#include #include #include #include #include "glog/logging.h" -#include "example/common/utils.h" -#include "example/llama3/config.h" #include "infini_train/include/nn/modules/normalization.h" #include "infini_train/include/nn/modules/transformer/causal_self_attention.h" #include "infini_train/include/nn/modules/transformer/mlp.h" @@ -22,24 +19,20 @@ #include "infini_train/include/nn/parallel/tensor_parallel.h" #include "infini_train/include/tensor.h" +#include "example/common/utils.h" +#include "example/llama3/config.h" + using namespace infini_train; namespace nn = infini_train::nn; -namespace { -constexpr int kRandomSeed = 42; - -// TODO(zbl): make this rng generator compatible with torch later -static std::mt19937 gen{kRandomSeed}; -} // namespace - namespace { constexpr int32_t kLLaMA3Magic = 20240803; constexpr int32_t 
kLLaMA3FP32Version = 3; } // namespace -namespace llama3 { - -std::shared_ptr LoadFromLLMC(const std::string &filepath) { +// TODO(jym): ckpt is a new checkpoint format; bin is the legacy format. Keeping both as an interim solution; plan to +// consolidate into one later. +std::shared_ptr llama3::LoadFromLLMC(const std::string &filepath) { if (!std::filesystem::exists(filepath)) { LOG(FATAL) << "File not found: " << filepath; } @@ -345,4 +338,3 @@ std::shared_ptr LoadFromLLMC(const std::string &filepath) return llama3; } -} // namespace llama3 diff --git a/example/llama3/checkpoint_loader.h b/example/llama3/checkpoint_loader.h index d4aea3d0..caf81933 100644 --- a/example/llama3/checkpoint_loader.h +++ b/example/llama3/checkpoint_loader.h @@ -1,11 +1,12 @@ #pragma once +#include #include #include namespace infini_train::nn { class TransformerModel; -} // namespace infini_train::nn +} namespace llama3 { std::shared_ptr LoadFromLLMC(const std::string &filepath); diff --git a/example/llama3/main.cc b/example/llama3/main.cc index 117551d5..ebea24ee 100644 --- a/example/llama3/main.cc +++ b/example/llama3/main.cc @@ -1,5 +1,8 @@ +#include #include +#include #include +#include #include #include #include @@ -8,6 +11,7 @@ #include "glog/logging.h" #include "infini_train/include/autocast.h" +#include "infini_train/include/checkpoint.h" #include "infini_train/include/core/runtime/device_guard.h" #include "infini_train/include/dataloader.h" #include "infini_train/include/device.h" @@ -33,11 +37,13 @@ #include "infini_train/include/profiler.h" #endif +#include "example/common/checkpoint_loader.h" #include "example/common/tiny_shakespeare_dataset.h" #include "example/common/tokenizer.h" #include "example/llama3/checkpoint_loader.h" #include "example/llama3/config.h" +// TODO(jym): Reorganize CLI flags into categories for better readability and maintainability. 
// I/O DEFINE_string(input_bin, "", "input .bin to train on"); DEFINE_string(input_val_bin, "", "input .bin to eval validation loss on"); @@ -75,6 +81,12 @@ DEFINE_uint32(pipeline_parallel, 1, "Pipeline Parallel world size, specified the DEFINE_uint32(virtual_pipeline_parallel, 1, "Number of chunks in PP stage."); // precision DEFINE_string(dtype, "float32", "precision used in training (float32/bfloat16)"); +DEFINE_uint32(save_interval, 0, "save checkpoint every N steps; 0 disables saving"); +DEFINE_string(load, "", "checkpoint directory to resume from"); +DEFINE_string(save, "./checkpoints", "root directory used to store checkpoints"); +DEFINE_uint32(max_checkpoint_keep, 3, "max number of checkpoint steps to keep"); +// Negative flag: true means optimizer state is NOT written. Defaults to false, matching the gpt2 example. +DEFINE_bool(no_save_optim, false, "do not persist optimizer state in checkpoints"); + // precision check DEFINE_string( precision_check, "", @@ -177,6 +189,8 @@ void Train(const nn::parallel::Rank &rank) { llama3::SanitizeLLaMA3Config(model_config); model = std::make_shared(model_config); } + auto llmc_model = std::dynamic_pointer_cast(model); + CHECK(llmc_model != nullptr) << "Failed to cast model to LLaMA3 for LLMC checkpoint I/O."; model->To(device); @@ -293,7 +307,58 @@ void Train(const nn::parallel::Rank &rank) { auto impl = core::GetDeviceGuardImpl(device.type()); - for (int step = 0; step < FLAGS_num_iteration + 1; ++step) { + int start_step = 0; + TrainerState state; + const auto resume_result = ResumeFromCheckpoint({ + .resume_root = FLAGS_load, + .rank = rank, + .model = model, + .optimizer = optimizer, + .train_loader = train_loader, + .model_config = model_config, + .state = state, + }); + + start_step = resume_result.global_step; + size_t consumed_batches = resume_result.consumed_batches; + + // TODO(jym): Replace with Sampler abstraction when available. + // Skip dataloader to resume from the correct batch position.
+ if (consumed_batches > 0) { + size_t start = train_iter.BatchIndex(); + // Each rank processes every ddp_world_size-th batch starting from its own rank. + // num_skips calculates how many ++ iterations to reach the saved batch position. + size_t num_skips = (consumed_batches - start) / ddp_world_size; + for (size_t i = 0; i < num_skips; ++i) { ++train_iter; } + } + + auto save_checkpoint + = [&](const std::filesystem::path &save_dir, int64_t global_step, bool prune_step_checkpoints) { + SaveCheckpoint({ + .save_dir = save_dir, + .global_step = global_step, + .consumed_batches = consumed_batches, + .last_lr = FLAGS_learning_rate, + .n_layer = model_config.n_layer, + .n_head = model_config.n_head, + .n_kv_head = model_config.n_kv_head, + .n_embd = model_config.n_embd, + .vocab_size = model_config.vocab_size, + .ddp_size = ddp_world_size, + .tp_size = tp_world_size, + .sp_size = sp_world_size, + .pp_size = pp_world_size, + .no_save_optim = FLAGS_no_save_optim, + .prune_step_checkpoints = prune_step_checkpoints, + .checkpoint_root_dir = FLAGS_save, + .max_checkpoint_keep = FLAGS_max_checkpoint_keep, + .rank = rank, + .model = *model, + .optimizer = *optimizer, + }); + }; + + for (int step = start_step; step < FLAGS_num_iteration + 1; ++step) { // Reset precision check counters at start of each iteration for file overwrite utils::PrecisionChecker::ResetCounters(); @@ -343,6 +408,7 @@ void Train(const nn::parallel::Rank &rank) { // if we are trying to overfit a single batch, we reset the loader here by commenting out the line below // TODO(dcj): support dataloader.reset() later ++train_iter; + consumed_batches = train_iter.BatchIndex(); x = std::make_shared(x->To(device)); y = std::make_shared(y->To(device)); @@ -372,6 +438,7 @@ void Train(const nn::parallel::Rank &rank) { // if we are trying to overfit a single batch, we reset the loader here by commenting out the line below // TODO(dcj): support dataloader.reset() later ++train_iter; + consumed_batches = 
train_iter.BatchIndex(); x = std::make_shared(x->To(device)); y = std::make_shared(y->To(device)); @@ -406,6 +473,15 @@ void Train(const nn::parallel::Rank &rank) { } } } + + if (FLAGS_save_interval > 0 && (step + 1) % FLAGS_save_interval == 0) { + std::filesystem::path step_dir + = std::filesystem::path(FLAGS_save) / std::format("checkpoint_step_{:06d}", step + 1); + if (rank.IsParallel()) { + step_dir /= std::format("rank_{:06d}", rank.GlobalRank()); + } + save_checkpoint(step_dir, step + 1, true); + } } // Save LoRA weights if enabled and path specified @@ -414,6 +490,12 @@ void Train(const nn::parallel::Rank &rank) { nn::lora::SaveLoRAWeights(model, FLAGS_lora_save_path); } + std::filesystem::path final_dir = std::filesystem::path(FLAGS_save) / "checkpoint_final"; + if (rank.IsParallel()) { + final_dir /= std::format("rank_{:06d}", rank.GlobalRank()); + } + save_checkpoint(final_dir, FLAGS_num_iteration, false); + #ifdef PROFILE_MODE Profiler::Instance().Report("llama3.report", Profiler::SortBy::DeviceTimePercentage); Profiler::Instance().PrintRecords("llama3.records.log"); diff --git a/infini_train/include/checkpoint.h b/infini_train/include/checkpoint.h new file mode 100644 index 00000000..0abe566d --- /dev/null +++ b/infini_train/include/checkpoint.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace infini_train { +class Optimizer; +class Tensor; +namespace nn { +class Module; +} + +struct TrainerState { + int64_t global_step = 0; + int64_t consumed_batches = 0; + // FIXME(jym): learning_rate should be restored from scheduler state, move `last_lr` from TrainerState to + // SchedulerState later + double last_lr = 0.0; + int64_t n_layer = 0; + int64_t n_head = 0; + int64_t n_kv_head = 0; + int64_t n_embd = 0; + int64_t vocab_size = 0; + int ddp_size = 1; + int tp_size = 1; + int sp_size = 1; + int pp_size = 1; +}; + +class Checkpoint { +public: + static void Save(const std::filesystem::path 
&checkpoint_dir, const nn::Module &model, const Optimizer *optimizer, + const TrainerState &state, bool no_save_optim = false); + + static void Load(const std::filesystem::path &checkpoint_dir, nn::Module &model, Optimizer *optimizer, + TrainerState &state, bool load_optimizer_state = true); + +private: + static void SaveStateDictBinary(const std::filesystem::path &path, + const std::unordered_map> &state_dict); + + static std::unordered_map> + LoadStateDictBinary(const std::filesystem::path &path); + + static void SaveTrainerState(const std::filesystem::path &path, const TrainerState &state); + static TrainerState LoadTrainerState(const std::filesystem::path &path); +}; + +} // namespace infini_train diff --git a/infini_train/include/dataloader.h b/infini_train/include/dataloader.h index ad7fbcda..38fa02a4 100644 --- a/infini_train/include/dataloader.h +++ b/infini_train/include/dataloader.h @@ -24,6 +24,8 @@ class DataLoaderIterator { friend bool operator!=(const DataLoaderIterator &lhs, const DataLoaderIterator &rhs); friend bool operator==(const DataLoaderIterator &lhs, const DataLoaderIterator &rhs); + size_t BatchIndex() const; + private: const Dataset *dataset_ = nullptr; // not owned size_t batch_size_ = 0; diff --git a/infini_train/include/nn/modules/module.h b/infini_train/include/nn/modules/module.h index f366661b..36762d52 100644 --- a/infini_train/include/nn/modules/module.h +++ b/infini_train/include/nn/modules/module.h @@ -47,8 +47,8 @@ class Module : public std::enable_shared_from_this { const std::string &type() const; - // TODO: Change return type to filterable iterator (like PyTorch's named_parameters with prefix matching) virtual std::vector> Parameters() const; + std::vector>> NamedParameters(bool remove_duplicate = true) const; bool has_parameter(const std::string &name) const; std::shared_ptr *mutable_parameter(const std::string &name); const std::shared_ptr ¶meter(const std::string &name) const; @@ -61,6 +61,8 @@ class Module : public 
std::enable_shared_from_this { std::unordered_map> StateDict() const; + void LoadStateDict(const std::unordered_map> &state_dict); + // operator() calls hooks and Forward std::vector> operator()(const std::vector> &input_tensors); diff --git a/infini_train/include/nn/parallel/ddp/distributed_optimizer.h b/infini_train/include/nn/parallel/ddp/distributed_optimizer.h index bc31442e..9366dd3c 100644 --- a/infini_train/include/nn/parallel/ddp/distributed_optimizer.h +++ b/infini_train/include/nn/parallel/ddp/distributed_optimizer.h @@ -28,6 +28,10 @@ class DistributedOptimizer final : public infini_train::Optimizer { void ZeroGrad(bool set_to_none = true) override; + std::unordered_map> StateDict() const override; + + void LoadStateDict(const std::unordered_map> &state_dict) override; + void StartGradSync(); void FinishGradSync(); @@ -48,6 +52,8 @@ class DistributedOptimizer final : public infini_train::Optimizer { // shard params std::vector> shard_params_; + std::vector shard_param_names_; + std::vector>> named_shard_params_; // Base optimizer (SGD, Adam and etc.) 
std::shared_ptr base_optimizer_; diff --git a/infini_train/include/optimizer.h b/infini_train/include/optimizer.h index fb0ae2d5..8559b261 100644 --- a/infini_train/include/optimizer.h +++ b/infini_train/include/optimizer.h @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include namespace infini_train { @@ -10,33 +12,38 @@ class Tensor; } namespace infini_train { class Optimizer; - using OptimizerCreator = std::function(const std::vector> ¶ms)>; +using OptimizerCreatorNamed = std::function( + const std::vector>> &named_params)>; class Optimizer { public: explicit Optimizer(const std::vector> ¶ms); + Optimizer(const std::vector>> &named_params); virtual void ZeroGrad(bool set_to_none = true); virtual void Step() = 0; + virtual std::unordered_map> StateDict() const { return {}; }; + + virtual void LoadStateDict(const std::unordered_map> &state_dict) {} + protected: std::vector> params_; + std::vector param_names_; }; namespace optimizers { class SGD : public Optimizer { public: SGD(const std::vector> ¶ms, float learning_rate); + SGD(const std::vector>> &named_params, float learning_rate); void Step() override; - static OptimizerCreator Create(float learning_rate) { - return [learning_rate](const std::vector> ¶ms) { - return std::make_shared(params, learning_rate); - }; - } + static OptimizerCreator Create(float learning_rate); + static OptimizerCreatorNamed CreateNamed(float learning_rate); private: const float learning_rate_ = 0.0; @@ -46,15 +53,18 @@ class Adam : public Optimizer { public: Adam(const std::vector> ¶ms, float learning_rate = 1e-3, float beta1 = 0.9, float beta2 = 0.999, float eps = 1e-8); + Adam(const std::vector>> &named_params, float learning_rate = 1e-3, + float beta1 = 0.9, float beta2 = 0.999, float eps = 1e-8); void Step() override; + std::unordered_map> StateDict() const override; + + void LoadStateDict(const std::unordered_map> &state_dict) override; static OptimizerCreator Create(float learning_rate = 1e-3, float beta1 = 0.9, 
float beta2 = 0.999, - float eps = 1e-8) { - return [=](const std::vector> ¶ms) { - return std::make_shared(params, learning_rate, beta1, beta2, eps); - }; - } + float eps = 1e-8); + static OptimizerCreatorNamed CreateNamed(float learning_rate = 1e-3, float beta1 = 0.9, float beta2 = 0.999, + float eps = 1e-8); private: int64_t t_; diff --git a/infini_train/include/utils/string_utils.h b/infini_train/include/utils/string_utils.h new file mode 100644 index 00000000..af736b30 --- /dev/null +++ b/infini_train/include/utils/string_utils.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +#include "glog/logging.h" + +namespace infini_train::utils { +std::string DimsToString(const std::vector &dims); +} diff --git a/infini_train/src/checkpoint.cc b/infini_train/src/checkpoint.cc new file mode 100644 index 00000000..7ee4447d --- /dev/null +++ b/infini_train/src/checkpoint.cc @@ -0,0 +1,248 @@ +#include "infini_train/include/checkpoint.h" + +#include +#include +#include +#include +#include +#include + +#include "glog/logging.h" + +#include "infini_train/include/nn/modules/module.h" +#include "infini_train/include/optimizer.h" +#include "infini_train/include/tensor.h" + +namespace infini_train { +namespace { +constexpr uint32_t kCkptMagic = 0x54504B43; // CKPT +constexpr uint32_t kCkptVersion = 1; + +uint32_t PeekMagic(const std::filesystem::path &path) { + std::ifstream ifs(path, std::ios::binary); + CHECK(ifs.is_open()) << "Failed to open checkpoint file: " << path; + uint32_t magic = 0; + ifs.read(reinterpret_cast(&magic), sizeof(magic)); + return magic; +} + +void WriteString(std::ofstream *ofs, const std::string &value) { + uint32_t len = static_cast(value.size()); + ofs->write(reinterpret_cast(&len), sizeof(len)); + ofs->write(value.data(), len); +} + +std::string ReadString(std::ifstream *ifs) { + uint32_t len = 0; + ifs->read(reinterpret_cast(&len), sizeof(len)); + std::string s(len, '\0'); + ifs->read(s.data(), len); + return s; +} + +std::string 
ExtractStringField(const std::string &content, const std::string &key, const std::string &fallback) {
+    // Returns the first double-quoted value that follows `"key"` and a ':' in `content`, or `fallback` when the
+    // key, the colon, or a quoted value cannot be found. This is a plain substring scan, not a JSON parser.
+    const auto token = std::string("\"") + key + "\"";
+    const auto key_pos = content.find(token);
+    if (key_pos == std::string::npos) {
+        return fallback;
+    }
+    const auto colon_pos = content.find(':', key_pos);
+    // Without this guard `colon_pos + 1` wraps around to 0, the quote search restarts at the beginning of
+    // `content`, and an unrelated quoted string is returned. ExtractNumberField performs the same check.
+    if (colon_pos == std::string::npos) {
+        return fallback;
+    }
+    const auto first_quote = content.find('"', colon_pos + 1);
+    const auto second_quote = content.find('"', first_quote + 1);
+    if (first_quote == std::string::npos || second_quote == std::string::npos) {
+        return fallback;
+    }
+    return content.substr(first_quote + 1, second_quote - first_quote - 1);
+}
+
+// Returns the value that follows `"key"` and a ':' in `content`, parsed with operator>> into T.
+// The value text runs from the first character after the colon that is not a space or newline up to the next
+// ',', '\n' or '}'. `fallback` is returned when the key or colon is missing or the text does not parse as T.
+template T ExtractNumberField(const std::string &content, const std::string &key, T fallback) {
+    const auto token = std::string("\"") + key + "\"";
+    const auto key_pos = content.find(token);
+    if (key_pos == std::string::npos) {
+        return fallback;
+    }
+    const auto colon_pos = content.find(':', key_pos);
+    if (colon_pos == std::string::npos) {
+        return fallback;
+    }
+    size_t value_start = colon_pos + 1;
+    while (value_start < content.size() && (content[value_start] == ' ' || content[value_start] == '\n')) {
+        ++value_start;
+    }
+    size_t value_end = value_start;
+    while (value_end < content.size() && content[value_end] != ',' && content[value_end] != '\n'
+           && content[value_end] != '}') {
+        ++value_end;
+    }
+    std::stringstream ss(content.substr(value_start, value_end - value_start));
+    T value = fallback;
+    ss >> value;
+    if (ss.fail()) {
+        return fallback;
+    }
+    return value;
+}
+} // namespace
+
+// Writes a checkpoint into `checkpoint_dir` (created if missing): the model state dict goes to "model.ckpt",
+// the optimizer state dict to "optimizer.ckpt" unless `no_save_optim` is set or that state dict is empty, and
+// the trainer state to "trainer_state.json". When optimizer state is to be saved, `optimizer` must be non-null.
+void Checkpoint::Save(const std::filesystem::path &checkpoint_dir, const nn::Module &model, const Optimizer *optimizer,
+                      const TrainerState &state, bool no_save_optim) {
+    std::filesystem::create_directories(checkpoint_dir);
+    LOG(INFO) << "[CKPT] Save begin: dir=" << checkpoint_dir << ", global_step=" << state.global_step;
+
+    const auto model_path = checkpoint_dir / ("model.ckpt");
+
+    SaveStateDictBinary(model_path, model.StateDict());
+
+    if (!no_save_optim) 
{
+        CHECK(optimizer != nullptr) << "Optimizer pointer is null, cannot save optimizer state.";
+        auto opt_state = optimizer->StateDict();
+        // Optimizers that report no state produce an empty dict; no optimizer.ckpt is written for them.
+        if (!opt_state.empty()) {
+            const auto opt_path = checkpoint_dir / "optimizer.ckpt";
+            SaveStateDictBinary(opt_path, opt_state);
+        }
+    }
+
+    SaveTrainerState(checkpoint_dir / "trainer_state.json", state);
+    // Successful completion is informational, matching the "Save begin" line above.
+    LOG(INFO) << "[CKPT] Save done: dir=" << checkpoint_dir;
+}
+
+// Restores a checkpoint from `checkpoint_dir`: loads "model.ckpt" into `model`, loads "optimizer.ckpt" into
+// `optimizer` when one is supplied, `load_optimizer_state` is true and the file exists, and finally overwrites
+// `state` with the contents of "trainer_state.json". Skipped optimizer state is logged and is not fatal.
+void Checkpoint::Load(const std::filesystem::path &checkpoint_dir, nn::Module &model, Optimizer *optimizer,
+                      TrainerState &state, bool load_optimizer_state) {
+    const auto model_path = checkpoint_dir / "model.ckpt";
+    LOG(INFO) << "[CKPT] Loading model: " << model_path;
+
+    model.LoadStateDict(LoadStateDictBinary(model_path));
+
+    if (optimizer == nullptr) {
+        // Worth a warning: the caller continues without restored optimizer state.
+        LOG(WARNING) << "[CKPT] No optimizer instance, skip optimizer state loading.";
+    } else if (load_optimizer_state) {
+        const auto opt_path = checkpoint_dir / "optimizer.ckpt";
+        if (std::filesystem::exists(opt_path)) {
+            LOG(INFO) << "[CKPT] Loading optimizer: " << opt_path;
+            optimizer->LoadStateDict(LoadStateDictBinary(opt_path));
+        } else {
+            // Optimizer state was requested but is absent from the checkpoint directory.
+            LOG(WARNING) << "[CKPT] Optimizer state not found, skip: " << opt_path;
+        }
+    } else {
+        // Explicitly requested by the caller, so this is informational only.
+        LOG(INFO) << "[CKPT] load_optimizer_state=false, skip optimizer state loading.";
+    }
+
+    state = LoadTrainerState(checkpoint_dir / "trainer_state.json");
+    LOG(INFO) << "[CKPT] Load done: global_step=" << state.global_step
+              << ", consumed_batches =" << state.consumed_batches << ", last_lr=" << state.last_lr
+              << ", topology(ddp,tp,sp,pp)=(" << state.ddp_size << "," << state.tp_size << "," << state.sp_size << ","
+              << state.pp_size << ")";
+}
+
+// Serializes `state_dict` to `path`. Layout: magic, version and entry count (uint32 each), then per entry the
+// name (uint32 length + bytes), dtype (int8), ndim (uint32), each dim, payload size (uint64) and the raw tensor
+// bytes. All fields are written as their in-memory representation.
+void Checkpoint::SaveStateDictBinary(const std::filesystem::path &path,
+                                     const std::unordered_map> &state_dict) {
+    std::ofstream ofs(path, std::ios::binary);
+    CHECK(ofs.is_open()) << "Failed to open checkpoint file: " << path;
+
+    uint32_t magic = kCkptMagic;
+    uint32_t version = kCkptVersion;
+    uint32_t count = 
static_cast(state_dict.size());
+    ofs.write(reinterpret_cast(&magic), sizeof(magic));
+    ofs.write(reinterpret_cast(&version), sizeof(version));
+    ofs.write(reinterpret_cast(&count), sizeof(count));
+
+    for (const auto &[name, tensor] : state_dict) {
+        WriteString(&ofs, name);
+
+        const int8_t dtype = static_cast(tensor->Dtype());
+        ofs.write(reinterpret_cast(&dtype), sizeof(dtype));
+
+        const auto &dims = tensor->Dims();
+        uint32_t ndim = static_cast(dims.size());
+        ofs.write(reinterpret_cast(&ndim), sizeof(ndim));
+        for (const auto dim : dims) { ofs.write(reinterpret_cast(&dim), sizeof(dim)); }
+
+        // The payload is taken from a copy moved to the default-constructed Device().
+        Tensor cpu_tensor = tensor->To(Device());
+        uint64_t bytes = static_cast(cpu_tensor.SizeInBytes());
+        ofs.write(reinterpret_cast(&bytes), sizeof(bytes));
+        ofs.write(reinterpret_cast(cpu_tensor.DataPtr()), static_cast(bytes));
+    }
+
+    // Flush and verify so that a failed write (for example a full disk) aborts here instead of silently
+    // leaving a truncated checkpoint behind.
+    ofs.flush();
+    CHECK(ofs.good()) << "Failed to write checkpoint file: " << path;
+}
+
+// Reads a state dict previously written by SaveStateDictBinary. Aborts via CHECK when the file cannot be
+// opened or read, the magic or version does not match, or an entry's recorded payload size disagrees with the
+// size of the tensor built from its dims and dtype.
+std::unordered_map>
+Checkpoint::LoadStateDictBinary(const std::filesystem::path &path) {
+    std::ifstream ifs(path, std::ios::binary);
+    CHECK(ifs.is_open()) << "Failed to open checkpoint file: " << path;
+
+    uint32_t magic = 0;
+    uint32_t version = 0;
+    uint32_t count = 0;
+    ifs.read(reinterpret_cast(&magic), sizeof(magic));
+    ifs.read(reinterpret_cast(&version), sizeof(version));
+    ifs.read(reinterpret_cast(&count), sizeof(count));
+    // A short or unreadable header would otherwise surface as a misleading magic/version mismatch.
+    CHECK(ifs.good()) << "Failed to read checkpoint header: " << path;
+
+    CHECK_EQ(magic, kCkptMagic) << "Invalid checkpoint magic: " << path;
+    CHECK_EQ(version, kCkptVersion) << "Unsupported checkpoint version: " << path;
+
+    std::unordered_map> state;
+    for (uint32_t i = 0; i < count; ++i) {
+        const std::string name = ReadString(&ifs);
+
+        int8_t dtype_raw = 0;
+        ifs.read(reinterpret_cast(&dtype_raw), sizeof(dtype_raw));
+        DataType dtype = static_cast(dtype_raw);
+
+        uint32_t ndim = 0;
+        ifs.read(reinterpret_cast(&ndim), sizeof(ndim));
+        std::vector dims(ndim);
+        for (uint32_t d = 0; d < ndim; ++d) { ifs.read(reinterpret_cast(&dims[d]), sizeof(dims[d])); }
+
+        uint64_t bytes = 0;
+        ifs.read(reinterpret_cast(&bytes), sizeof(bytes));
+        // Stop on a truncated entry before building a tensor from partially read metadata.
+        CHECK(ifs.good()) << "Truncated checkpoint entry header for key '" << name << "' in: " << path;
+
+        auto tensor = std::make_shared(dims, 
dtype, Device());
+        CHECK_EQ(bytes, tensor->SizeInBytes()) << "Tensor bytes mismatch for key: " << name;
+        ifs.read(reinterpret_cast(tensor->DataPtr()), static_cast(bytes));
+        // A short read would otherwise leave the tensor only partially filled without any diagnostic.
+        CHECK(ifs.good()) << "Truncated tensor data for key: " << name;
+        state.emplace(name, tensor);
+    }
+
+    return state;
+}
+
+// Writes `state` to `path` as a flat JSON object with one numeric member per line. Aborts via CHECK when the
+// file cannot be opened or written.
+// NOTE(review): the "consumed_batches " key carries a trailing space; LoadTrainerState looks up the same
+// spelling, so it is kept here for compatibility.
+void Checkpoint::SaveTrainerState(const std::filesystem::path &path, const TrainerState &state) {
+    std::ofstream ofs(path);
+    CHECK(ofs.is_open()) << "Failed to open trainer state file: " << path;
+    // 17 significant digits are enough for a double to survive the text round trip exactly; the stream default
+    // of 6 would round last_lr. Integer fields are unaffected by the precision setting.
+    ofs.precision(17);
+    ofs << "{\n";
+    ofs << " \"n_layer\": " << state.n_layer << ",\n";
+    ofs << " \"n_head\": " << state.n_head << ",\n";
+    ofs << " \"n_kv_head\": " << state.n_kv_head << ",\n";
+    ofs << " \"n_embd\": " << state.n_embd << ",\n";
+    // The separating comma was missing here, which made the emitted file invalid JSON.
+    ofs << " \"vocab_size\": " << state.vocab_size << ",\n";
+    ofs << " \"global_step\": " << state.global_step << ",\n";
+    ofs << " \"consumed_batches \": " << state.consumed_batches << ",\n";
+    ofs << " \"last_lr\": " << state.last_lr << ",\n";
+    ofs << " \"ddp_size\": " << state.ddp_size << ",\n";
+    ofs << " \"tp_size\": " << state.tp_size << ",\n";
+    ofs << " \"sp_size\": " << state.sp_size << ",\n";
+    ofs << " \"pp_size\": " << state.pp_size << "\n";
+    ofs << "}\n";
+    ofs.flush();
+    CHECK(ofs.good()) << "Failed to write trainer state file: " << path;
+}
+
+// TODO(jym): Add TrainerState JSON version compatibility, referencing PyTorch's checkpoint
+// versioning. 
+TrainerState Checkpoint::LoadTrainerState(const std::filesystem::path &path) { + std::ifstream ifs(path); + CHECK(ifs.is_open()) << "Failed to open trainer state file: " << path; + const std::string content((std::istreambuf_iterator(ifs)), std::istreambuf_iterator()); + + TrainerState state; + state.n_layer = ExtractNumberField(content, "n_layer", 0); + state.n_head = ExtractNumberField(content, "n_head", 0); + state.n_kv_head = ExtractNumberField(content, "n_kv_head", 0); + state.n_embd = ExtractNumberField(content, "n_embd", 0); + state.vocab_size = ExtractNumberField(content, "vocab_size", 0); + state.global_step = ExtractNumberField(content, "global_step", 0); + state.consumed_batches = ExtractNumberField(content, "consumed_batches ", 0); + state.last_lr = ExtractNumberField(content, "last_lr", 0.0); + state.ddp_size = ExtractNumberField(content, "ddp_size", 1); + state.tp_size = ExtractNumberField(content, "tp_size", 1); + state.sp_size = ExtractNumberField(content, "sp_size", 1); + state.pp_size = ExtractNumberField(content, "pp_size", 1); + return state; +} +} // namespace infini_train diff --git a/infini_train/src/dataloader.cc b/infini_train/src/dataloader.cc index 322df553..b7cc94f2 100644 --- a/infini_train/src/dataloader.cc +++ b/infini_train/src/dataloader.cc @@ -78,6 +78,8 @@ bool operator==(const DataLoaderIterator &lhs, const DataLoaderIterator &rhs) { return lhs.batch_idx_ == rhs.batch_idx_; } +size_t DataLoaderIterator::BatchIndex() const { return batch_idx_; } + DataLoader::DataLoader(const std::shared_ptr &dataset, size_t batch_size) : dataset_(dataset), batch_size_(batch_size), max_batch_idx_((dataset_->Size() + batch_size_ - 1) / batch_size_) {} diff --git a/infini_train/src/nn/modules/module.cc b/infini_train/src/nn/modules/module.cc index 6d48dcab..486edc89 100644 --- a/infini_train/src/nn/modules/module.cc +++ b/infini_train/src/nn/modules/module.cc @@ -11,9 +11,9 @@ #include "infini_train/include/autograd/function.h" #include 
"infini_train/include/common/hook.h" #include "infini_train/include/device.h" -#include "infini_train/include/nn/parallel/global.h" #include "infini_train/include/tensor.h" #include "infini_train/include/utils/global_module_hook_registry.h" +#include "infini_train/include/utils/string_utils.h" #ifndef UNLIKELY #define UNLIKELY(x) __builtin_expect(!!(x), 0) @@ -28,24 +28,40 @@ Module::Module(const std::string &type) : type_(type), device_(Device()) {} const std::string &Module::type() const { return type_; } std::vector> Module::Parameters() const { + auto namedParameters = NamedParameters(); std::vector> params; - std::unordered_set visited; - - auto AddIfUnvisited = [&](const std::shared_ptr ¶m) { - if (visited.insert(param.get()).second) { - params.push_back(param); - } - }; - - // Add parameters of this module - for (const auto &[_, param] : parameters_) { AddIfUnvisited(param); } + params.reserve(namedParameters.size()); + for (auto &[name, param] : namedParameters) { params.push_back(param); } + return params; +} - // Recursively add parameters of submodules - for (const auto &[_, module] : modules_) { - for (const auto ¶m : module->Parameters()) { AddIfUnvisited(param); } - } +std::vector>> Module::NamedParameters(bool remove_duplicate) const { + std::vector>> result; + std::unordered_set visited; - return params; + std::function collect + = [&](const std::string &prefix, const Module *mod) { + for (const auto &[name, param] : mod->parameters_) { + auto full_name = prefix.empty() ? name : prefix + "." + name; + + if (!remove_duplicate) { + result.emplace_back(full_name, param); + continue; + } + if (visited.insert(param.get()).second) { + result.emplace_back(full_name, param); + } + } + + for (const auto &[name, child] : mod->modules_) { + auto child_prefix = prefix.empty() ? name : prefix + "." 
+ name; + collect(child_prefix, child.get()); + } + }; + + collect("", this); + + return result; } bool Module::has_parameter(const std::string &name) const { return parameters_.find(name) != parameters_.end(); } @@ -147,6 +163,52 @@ std::unordered_map> Module::StateDict() con return state; } +void Module::LoadStateDict(const std::unordered_map> &state_dict) { + // Current behavior: missing keys / shape / dtype mismatches are FATAL errors; + // unexpected keys in state_dict are WARNING-only and silently ignored. + + // Stage 1: Validate all keys, shapes, and dtypes without copying + std::vector error_msgs; + std::unordered_set visited_keys; + auto expected = StateDict(); + + for (const auto &[name, dst] : expected) { + visited_keys.insert(name); + if (!state_dict.contains(name)) { + error_msgs.push_back(std::format("Missing key: {}", name)); + continue; + } + const auto &src = state_dict.at(name); + if (dst->Dims() != src->Dims()) { + error_msgs.push_back(std::format("Shape mismatch for '{}': expected={}, got={}", name, + infini_train::utils::DimsToString(dst->Dims()), + infini_train::utils::DimsToString(src->Dims()))); + } + if (dst->Dtype() != src->Dtype()) { + error_msgs.push_back(std::format("Dtype mismatch for '{}': expected={}, got={}", name, + kDataTypeToDesc.at(dst->Dtype()), kDataTypeToDesc.at(src->Dtype()))); + } + } + + for (const auto &[name, src] : state_dict) { + if (!visited_keys.contains(name)) { + LOG(WARNING) << std::format("Unexpected key in state_dict: {}", name); + } + } + + if (!error_msgs.empty()) { + std::string msg = "LoadStateDict failed:"; + for (const auto &err : error_msgs) { msg += "\n " + err; } + LOG(FATAL) << msg; + } + + // Stage 2: All checks passed, now copy data + for (const auto &[name, dst] : expected) { + const auto &src = state_dict.at(name); + dst->CopyFrom(*src); + } +} + std::vector> Module::Forward(const std::vector> &input_tensors) { LOG(FATAL) << "Forward function not implemented for this module"; return {}; diff --git 
a/infini_train/src/nn/parallel/ddp/distributed_optimizer.cc b/infini_train/src/nn/parallel/ddp/distributed_optimizer.cc index 55e5800b..b259204b 100644 --- a/infini_train/src/nn/parallel/ddp/distributed_optimizer.cc +++ b/infini_train/src/nn/parallel/ddp/distributed_optimizer.cc @@ -11,23 +11,17 @@ DistributedOptimizer::DistributedOptimizer(OptimizerCreator creator, const std::vector> &model_chunks, size_t ddp_world_size, size_t ddp_rank) : Optimizer(full_params), ddp_world_size_(ddp_world_size), ddp_rank_(ddp_rank) { - CHECK(ddp_world_size_ > 1) << "DistributedOptimizer: ddp_world_size must be greater than 1."; - for (size_t i = 0; i < model_chunks.size(); ++i) { auto ddp_chunk = std::dynamic_pointer_cast(model_chunks[i]); CHECK(ddp_chunk) << "DistributedOptimizer: model_chunks[" << i << "] is not a DDP model."; - param_grad_buffers_.insert(param_grad_buffers_.end(), ddp_chunk->param_grad_buffers().begin(), ddp_chunk->param_grad_buffers().end()); bucket_groups_.insert(bucket_groups_.end(), ddp_chunk->bucket_groups().begin(), ddp_chunk->bucket_groups().end()); } - BuildShardParamsAndBindGrads(); - - // Build base optimizer - base_optimizer_ = creator(shard_params_); + base_optimizer_ = creator(full_params); CHECK(base_optimizer_) << "DistributedOptimizer: failed to create base optimizer."; } @@ -36,13 +30,10 @@ void DistributedOptimizer::BuildShardParamsAndBindGrads() { for (const auto &group : bucket_groups_) { for (const auto &bucket : group->buckets()) { - auto bucket_param = bucket->param_data(); auto bucket_grad = bucket->grad_data(); - CHECK(bucket_param) << "DistributedOptimizer requires param buffer."; CHECK(bucket_grad) << "DistributedOptimizer requires grad buffer."; - CHECK_EQ(bucket_param->NumElements() % ddp_world_size_, 0); const size_t bucket_shard_numel = bucket_param->NumElements() / ddp_world_size_; const size_t bucket_shard_start = ddp_rank_ * bucket_shard_numel; @@ -53,7 +44,6 @@ void DistributedOptimizer::BuildShardParamsAndBindGrads() { size_t 
param_start_in_bucket = 0, param_end_in_bucket = 0; auto found = bucket->GetTensorLocInBucket(param, param_start_in_bucket, param_end_in_bucket); CHECK(found) << "DistributedOptimizer: param not found in bucket mapping."; - const size_t local_start = std::max(param_start_in_bucket, bucket_shard_start); const size_t local_end = std::min(param_end_in_bucket, bucket_shard_end); if (local_end <= local_start) { @@ -63,24 +53,18 @@ void DistributedOptimizer::BuildShardParamsAndBindGrads() { const size_t piece_numel = local_end - local_start; CHECK_GT(piece_numel, 0); - const size_t param_piece_offset_bytes = local_start * kDataTypeToSize.at(bucket_param->Dtype()); const size_t grad_piece_offset_bytes = local_start * kDataTypeToSize.at(bucket_grad->Dtype()); auto param_piece = std::make_shared(*bucket_param, param_piece_offset_bytes, std::vector{static_cast(piece_numel)}); - auto grad_piece = std::make_shared(*bucket_grad, grad_piece_offset_bytes, std::vector{static_cast(piece_numel)}); - param_piece->set_grad(grad_piece); shard_params_.push_back(param_piece); } } } - - CHECK(!shard_params_.empty()) << "DistributedOptimizer: this DP rank owns no param pieces. 
" - << "Check bucket padding/divisibility and param bucketing order."; } void DistributedOptimizer::StartGradSync() { @@ -128,4 +112,13 @@ void DistributedOptimizer::Step() { FinishParamSync(/*skip_next_bucket_dispatch=*/true); } +std::unordered_map> DistributedOptimizer::StateDict() const { + CHECK(base_optimizer_) << "DistributedOptimizer: base optimizer is null."; + return base_optimizer_->StateDict(); +} + +void DistributedOptimizer::LoadStateDict(const std::unordered_map> &state_dict) { + CHECK(base_optimizer_) << "DistributedOptimizer: base optimizer is null."; + base_optimizer_->LoadStateDict(state_dict); +} } // namespace infini_train::nn::parallel diff --git a/infini_train/src/optimizer.cc b/infini_train/src/optimizer.cc index d5589b01..95d62d74 100644 --- a/infini_train/src/optimizer.cc +++ b/infini_train/src/optimizer.cc @@ -1,5 +1,6 @@ #include "infini_train/include/optimizer.h" +#include #include #include "infini_train/include/core/runtime/device_guard.h" @@ -10,6 +11,15 @@ namespace infini_train { Optimizer::Optimizer(const std::vector> ¶ms) : params_(params) {} +Optimizer::Optimizer(const std::vector>> &named_params) { + params_.reserve(named_params.size()); + param_names_.reserve(named_params.size()); + for (const auto &[name, param] : named_params) { + params_.push_back(param); + param_names_.push_back(name); + } +} + void Optimizer::ZeroGrad(bool set_to_none) { for (auto param : params_) { param->ZeroGrad(set_to_none); } } @@ -19,6 +29,9 @@ namespace optimizers { SGD::SGD(const std::vector> ¶ms, float learning_rate) : Optimizer(params), learning_rate_(learning_rate) {} +SGD::SGD(const std::vector>> &named_params, float learning_rate) + : Optimizer(named_params), learning_rate_(learning_rate) {} + void SGD::Step() { for (auto param : params_) { if (!param->grad()) { @@ -32,9 +45,20 @@ void SGD::Step() { } } +OptimizerCreator SGD::Create(float learning_rate) { + return [learning_rate](const std::vector> ¶ms) { + return std::make_shared(params, 
learning_rate); + }; +} + +OptimizerCreatorNamed SGD::CreateNamed(float learning_rate) { + return [learning_rate](const std::vector>> &named_params) { + return std::make_shared(named_params, learning_rate); + }; +} + Adam::Adam(const std::vector> ¶ms, float learning_rate, float beta1, float beta2, float eps) : Optimizer(params), t_(0), learning_rate_(learning_rate), beta1_(beta1), beta2_(beta2), eps_(eps) { - for (const auto ¶m : params_) { m_.emplace_back(std::make_shared(param->Dims(), param->Dtype(), param->GetDevice())); v_.emplace_back(std::make_shared(param->Dims(), param->Dtype(), param->GetDevice())); @@ -43,6 +67,17 @@ Adam::Adam(const std::vector> ¶ms, float learning_ra } } +Adam::Adam(const std::vector>> &named_params, float learning_rate, + float beta1, float beta2, float eps) + : Optimizer(named_params), t_(0), learning_rate_(learning_rate), beta1_(beta1), beta2_(beta2), eps_(eps) { + for (const auto &[name, param] : named_params) { + m_.emplace_back(std::make_shared(param->Dims(), param->Dtype(), param->GetDevice())); + v_.emplace_back(std::make_shared(param->Dims(), param->Dtype(), param->GetDevice())); + m_.back()->Fill(0.0); + v_.back()->Fill(0.0); + } +} + void Adam::Step() { ++t_; @@ -62,5 +97,45 @@ void Adam::Step() { kernel.Call(grad, param, m, v, learning_rate_, beta1_, beta2_, eps_, t_); } } + +OptimizerCreator Adam::Create(float learning_rate, float beta1, float beta2, float eps) { + return [=](const std::vector> ¶ms) { + return std::make_shared(params, learning_rate, beta1, beta2, eps); + }; +} + +OptimizerCreatorNamed Adam::CreateNamed(float learning_rate, float beta1, float beta2, float eps) { + return [=](const std::vector>> &named_params) { + return std::make_shared(named_params, learning_rate, beta1, beta2, eps); + }; +} + +std::unordered_map> Adam::StateDict() const { + std::unordered_map> state; + for (size_t i = 0; i < m_.size(); ++i) { + state.emplace(std::format("adam.m.{}", i), m_[i]); + state.emplace(std::format("adam.v.{}", i), 
v_[i]); + } + + auto t_tensor = std::make_shared(std::vector{}, DataType::kINT64, Device()); + *static_cast(t_tensor->DataPtr()) = t_; + state.emplace("adam.t", t_tensor); + return state; +} + +void Adam::LoadStateDict(const std::unordered_map> &state_dict) { + for (size_t i = 0; i < m_.size(); ++i) { + const auto m_key = std::format("adam.m.{}", i); + const auto v_key = std::format("adam.v.{}", i); + CHECK(state_dict.contains(m_key)) << "Missing optimizer state: " << m_key; + CHECK(state_dict.contains(v_key)) << "Missing optimizer state: " << v_key; + m_[i]->CopyFrom(state_dict.at(m_key)); + v_[i]->CopyFrom(state_dict.at(v_key)); + } + + CHECK(state_dict.contains("adam.t")) << "Missing optimizer state: adam.t"; + const Tensor t_cpu = state_dict.at("adam.t")->To(Device()); + t_ = *static_cast(t_cpu.DataPtr()); +} } // namespace optimizers } // namespace infini_train diff --git a/infini_train/src/utils/string_utils.cc b/infini_train/src/utils/string_utils.cc new file mode 100644 index 00000000..e045f081 --- /dev/null +++ b/infini_train/src/utils/string_utils.cc @@ -0,0 +1,16 @@ +#include "infini_train/include/utils/string_utils.h" + +namespace infini_train::utils { +std::string DimsToString(const std::vector &dims) { + std::ostringstream oss; + oss << "["; + for (size_t i = 0; i < dims.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << dims[i]; + } + oss << "]"; + return oss.str(); +} +} // namespace infini_train::utils diff --git a/scripts/run_models_and_profile.bash b/scripts/run_models_and_profile.bash index e3c67293..1f7d4400 100755 --- a/scripts/run_models_and_profile.bash +++ b/scripts/run_models_and_profile.bash @@ -72,6 +72,10 @@ PROFILE_LOG_DIR="$(read_var PROFILE_LOG_DIR)"; : "${PROFILE_LOG_DIR:=./profile_ COMPARE_LOG_DIR="$(read_var COMPARE_LOG_DIR)"; : "${COMPARE_LOG_DIR:=}" RUN_CTEST="$(read_var RUN_CTEST)"; : "${RUN_CTEST:=true}" CTEST_CMD="$(read_var CTEST_CMD)"; : "${CTEST_CMD:=ctest --output-on-failure -LE cuda -j$(nproc) && ctest 
--output-on-failure -L cuda -j1}" +CKPT_CLEAN_DIRS=( + "/data1/ckpt" + "./checkpoints" +) mkdir -p "$BUILD_DIR" "$LOG_DIR" "$PROFILE_LOG_DIR" @@ -114,6 +118,17 @@ clean_build_dir() { rm -rf "${BUILD_DIR:?}/"* } +# Clean checkpoint directories (called once at start of script) +clean_checkpoints() { + echo -e "\033[1;31m[CLEAN] Removing checkpoint directories from previous run\033[0m" + for dir in "${CKPT_CLEAN_DIRS[@]}"; do + if [[ -d "$dir" ]]; then + echo -e "\033[1;31m[CLEAN] Removing: ${dir}\033[0m" + rm -rf "${dir:?}" + fi + done +} + # Run a command and log output run_and_log() { local cmd="$1" @@ -208,14 +223,32 @@ move_profile_logs() { done } -# Build "--key value" arg string from test_groups[gi].tests[ti].args (shell-escaped) +# Build "--key value" arg string from tests[i].args. +# For checkpoint-related args, automatically isolate by model and run mode +# (resume/no_resume) to avoid cross-test overwrites in one-click runs. args_string_for_test() { local group_idx="$1" local test_idx="$2" - jq -r --argjson g "$group_idx" --argjson t "$test_idx" ' - .test_groups[$g].tests[$t].args - | to_entries[] - | "--\(.key)=\(.value|tostring)" + local model_name="$3" + local test_id="$4" + + jq -r --argjson g "$group_idx" --argjson t "$test_idx" --arg model "$model_name" --arg test_id "$test_id" ' + def namespaced_path($p; $model; $mode): + if ($p | test("/checkpoint_step_[0-9]+($|/)")) then + ($p | capture("^(?.*)/(?checkpoint_step_[0-9]+(?:/.*)?)$")) as $m + | ($m.prefix + "/" + $model + "/" + $mode + "/" + $m.step) + else + ($p + "/" + $model + "/" + $mode) + end; + + .test_groups[$g].tests[$t].args as $args + | (if ($args | has("load")) then "resume" else "no_resume" end) as $run_mode + | (if (($args.load // "") | test("no_resume")) then "no_resume" else "resume" end) as $resume_src_mode + | $args + | (if has("save") then .save = namespaced_path(.save; $model; $run_mode) else . 
end) + | (if has("load") then .load = namespaced_path(.load; $model; $resume_src_mode) else . end) + | to_entries[] + | "--\(.key) \(.value|tostring)" ' "$CONFIG_FILE" | paste -sd' ' - } @@ -269,16 +302,20 @@ for ((id=0; id +#include + +#include "gtest/gtest.h" + +#include "infini_train/include/checkpoint.h" +#include "infini_train/include/nn/modules/linear.h" +#include "infini_train/include/nn/modules/module.h" +#include "infini_train/include/optimizer.h" +#include "infini_train/include/tensor.h" + +#include "tests/common/test_utils.h" + +using namespace infini_train; +namespace nn = infini_train::nn; + +class CheckpointSerializationTest : public test::InfiniTrainTest {}; + +TEST_P(CheckpointSerializationTest, SaveAndLoadModelFP32) { + auto dir = std::filesystem::temp_directory_path() / "test_ckpt_fp32"; + std::filesystem::remove_all(dir); + + auto model1 = std::make_shared(3, 2, true, GetDevice()); + auto p1 = std::make_shared(std::vector{2, 3}, DataType::kFLOAT32, GetDevice()); + p1->Fill(0.42f); + *model1->mutable_parameter("weight") = p1; + auto p2 = std::make_shared(std::vector{4}, DataType::kFLOAT32, GetDevice()); + p2->Fill(-1.5f); + *model1->mutable_parameter("bias") = p2; + + auto opt1 = std::make_shared(model1->Parameters(), 0.01); + TrainerState saved{.global_step = 42, .consumed_batches = 100}; + Checkpoint::Save(dir, *model1, opt1.get(), saved); + + auto model2 = std::make_shared(3, 2, true, GetDevice()); + auto q1 = std::make_shared(std::vector{2, 3}, DataType::kFLOAT32, GetDevice()); + q1->Fill(0.0f); + *model2->mutable_parameter("weight") = q1; + auto q2 = std::make_shared(std::vector{4}, DataType::kFLOAT32, GetDevice()); + q2->Fill(0.0f); + *model2->mutable_parameter("bias") = q2; + auto opt2 = std::make_shared(model2->Parameters(), 0.01); + + TrainerState loaded; + Checkpoint::Load(dir, *model2, opt2.get(), loaded); + + EXPECT_EQ(loaded.global_step, 42); + EXPECT_EQ(loaded.consumed_batches, 100); + + auto w1_cpu = 
model2->parameter("weight")->To(Device()); + const float *data = static_cast(w1_cpu.DataPtr()); + for (int i = 0; i < 6; ++i) { EXPECT_NEAR(data[i], 0.42f, 1e-6); } + + std::filesystem::remove_all(dir); +} + +TEST_P(CheckpointSerializationTest, InferFormat) { + auto dir = std::filesystem::temp_directory_path() / "test_ckpt_fmt"; + std::filesystem::remove_all(dir); + + auto model = std::make_shared(1, 2, true, GetDevice()); + auto p = std::make_shared(std::vector{2}, DataType::kFLOAT32, GetDevice()); + p->Fill(1.0f); + *model->mutable_parameter("weight") = p; + auto opt = std::make_shared(model->Parameters(), 0.01); + TrainerState state; + Checkpoint::Save(dir, *model, opt.get(), state); + + auto model2 = std::make_shared(1, 2, true, GetDevice()); + auto p2 = std::make_shared(std::vector{2}, DataType::kFLOAT32, GetDevice()); + p2->Fill(0.0f); + *model2->mutable_parameter("weight") = p2; + TrainerState loaded; + Checkpoint::Load(dir, *model2, nullptr, loaded); + + EXPECT_NEAR(static_cast(model2->parameter("weight")->To(Device()).DataPtr())[0], 1.0f, 1e-6); + + std::filesystem::remove_all(dir); +} + +INFINI_TRAIN_REGISTER_TEST(CheckpointSerializationTest); diff --git a/tests/checkpoint/test_optimizer_state.cc b/tests/checkpoint/test_optimizer_state.cc new file mode 100644 index 00000000..b2029b78 --- /dev/null +++ b/tests/checkpoint/test_optimizer_state.cc @@ -0,0 +1,73 @@ +#include + +#include "gtest/gtest.h" + +#include "infini_train/include/optimizer.h" +#include "infini_train/include/tensor.h" + +#include "tests/common/test_utils.h" + +using namespace infini_train; + +class OptimizerStateTest : public test::InfiniTrainTest {}; + +// ---------- Adam StateDict ---------- +TEST_P(OptimizerStateTest, AdamStateDictKeys) { + auto param = std::make_shared(std::vector{3, 4}, DataType::kFLOAT32, GetDevice()); + param->set_requires_grad(true); + param->Fill(1.0f); + + auto adam = std::make_shared(std::vector>{{param}}, 0.001); + + adam->ZeroGrad(); + adam->Step(); // t=1 + 
adam->Step(); // t=2 + + auto state = adam->StateDict(); + EXPECT_GT(state.size(), 0); + EXPECT_TRUE(state.count("adam.m.0")); + EXPECT_TRUE(state.count("adam.v.0")); + EXPECT_TRUE(state.count("adam.t")); + + auto t_cpu = state["adam.t"]->To(Device()); + int64_t t_val = *static_cast(t_cpu.DataPtr()); + EXPECT_EQ(t_val, 2); +} + +// ---------- Adam LoadStateDict roundtrip ---------- +TEST_P(OptimizerStateTest, AdamStateDictRoundTrip) { + auto param1 = std::make_shared(std::vector{2, 2}, DataType::kFLOAT32, GetDevice()); + param1->set_requires_grad(true); + param1->Fill(1.0f); + + auto adam1 = std::make_shared(std::vector>{{param1}}, 0.001); + adam1->ZeroGrad(); + adam1->Step(); + adam1->Step(); + adam1->Step(); + + auto saved = adam1->StateDict(); + + auto param2 = std::make_shared(std::vector{2, 2}, DataType::kFLOAT32, GetDevice()); + param2->set_requires_grad(true); + param2->Fill(1.0f); + + auto adam2 = std::make_shared(std::vector>{{param2}}, 0.001); + adam2->LoadStateDict(saved); + + adam2->ZeroGrad(); + adam2->Step(); + auto restored = adam2->StateDict(); + auto t_cpu = restored["adam.t"]->To(Device()); + EXPECT_EQ(*static_cast(t_cpu.DataPtr()), 4); // 3 + 1 +} + +// ---------- SGD ---------- +TEST_P(OptimizerStateTest, SGDStateDictEmpty) { + auto param = std::make_shared(std::vector{2, 2}, DataType::kFLOAT32, GetDevice()); + param->set_requires_grad(true); + auto sgd = std::make_shared(std::vector>{{param}}, 0.01); + EXPECT_TRUE(sgd->StateDict().empty()); +} + +INFINI_TRAIN_REGISTER_TEST(OptimizerStateTest); diff --git a/tests/checkpoint/test_trainer_state.cc b/tests/checkpoint/test_trainer_state.cc new file mode 100644 index 00000000..35277564 --- /dev/null +++ b/tests/checkpoint/test_trainer_state.cc @@ -0,0 +1,110 @@ +#include +#include +#include + +#include "gtest/gtest.h" + +#include "infini_train/include/checkpoint.h" +#include "infini_train/include/nn/modules/linear.h" +#include "infini_train/include/nn/modules/module.h" +#include 
"infini_train/include/optimizer.h" +#include "infini_train/include/tensor.h" + +#include "tests/common/test_utils.h" + +using namespace infini_train; +namespace nn = infini_train::nn; + +class TrainerStateTest : public test::InfiniTrainTest {}; + +TEST_P(TrainerStateTest, DefaultValues) { + TrainerState state; + EXPECT_EQ(state.global_step, 0); + EXPECT_EQ(state.consumed_batches, 0); + EXPECT_EQ(state.n_layer, 0); + EXPECT_EQ(state.n_head, 0); + EXPECT_EQ(state.n_kv_head, 0); + EXPECT_EQ(state.n_embd, 0); + EXPECT_EQ(state.vocab_size, 0); + EXPECT_EQ(state.ddp_size, 1); + EXPECT_EQ(state.tp_size, 1); + EXPECT_EQ(state.sp_size, 1); + EXPECT_EQ(state.pp_size, 1); + EXPECT_EQ(state.last_lr, 0.0); +} + +TEST_P(TrainerStateTest, TrainerStateFileCreated) { + auto dir = std::filesystem::temp_directory_path() / "test_trainer_json"; + std::filesystem::remove_all(dir); + + TrainerState saved{.global_step = 30, .consumed_batches = 1200, .last_lr = 0.001}; + + auto model = std::make_shared(1, 2, true, GetDevice()); + auto p = std::make_shared(std::vector{2}, DataType::kFLOAT32, GetDevice()); + p->Fill(1.0f); + *model->mutable_parameter("weight") = p; + auto opt = std::make_shared(model->Parameters(), 0.01); + + Checkpoint::Save(dir, *model, opt.get(), saved); + + EXPECT_TRUE(std::filesystem::exists(dir / "trainer_state.json")); + + std::ifstream ifs(dir / "trainer_state.json"); + std::string content((std::istreambuf_iterator(ifs)), std::istreambuf_iterator()); + EXPECT_NE(content.find("\"global_step\""), std::string::npos); + EXPECT_NE(content.find("\"consumed_batches \""), std::string::npos); + + std::filesystem::remove_all(dir); +} + +TEST_P(TrainerStateTest, RoundTrip) { + auto dir = std::filesystem::temp_directory_path() / "test_trainer_rt"; + std::filesystem::remove_all(dir); + + TrainerState saved{ + .global_step = 99, + .consumed_batches = 5000, + .last_lr = 3e-4, + .n_layer = 24, + .n_head = 16, + .n_kv_head = 8, + .n_embd = 1024, + .vocab_size = 128256, + .ddp_size = 
2, + .tp_size = 1, + .sp_size = 1, + .pp_size = 2, + }; + + auto model1 = std::make_shared(1, 3, true, GetDevice()); + auto p1 = std::make_shared(std::vector{3}, DataType::kFLOAT32, GetDevice()); + p1->Fill(0.5f); + *model1->mutable_parameter("weight") = p1; + auto opt1 = std::make_shared(model1->Parameters(), 0.01); + + Checkpoint::Save(dir, *model1, opt1.get(), saved); + + auto model2 = std::make_shared(1, 3, true, GetDevice()); + auto p2 = std::make_shared(std::vector{3}, DataType::kFLOAT32, GetDevice()); + p2->Fill(0.0f); + *model2->mutable_parameter("weight") = p2; + auto opt2 = std::make_shared(model2->Parameters(), 0.01); + + TrainerState loaded; + Checkpoint::Load(dir, *model2, opt2.get(), loaded); + + EXPECT_EQ(loaded.global_step, 99); + EXPECT_EQ(loaded.consumed_batches, 5000); + EXPECT_NEAR(loaded.last_lr, 3e-4, 1e-10); + EXPECT_EQ(loaded.n_layer, 24); + EXPECT_EQ(loaded.n_head, 16); + EXPECT_EQ(loaded.n_kv_head, 8); + EXPECT_EQ(loaded.n_embd, 1024); + EXPECT_EQ(loaded.vocab_size, 128256); + EXPECT_EQ(loaded.ddp_size, 2); + EXPECT_EQ(loaded.pp_size, 2); + + std::filesystem::remove_all(dir); +} + +INFINI_TRAIN_REGISTER_TEST(TrainerStateTest); diff --git a/tests/optimizer/test_optimizer_creation.cc b/tests/optimizer/test_optimizer_creation.cc index fbaa61e8..33bb1e28 100644 --- a/tests/optimizer/test_optimizer_creation.cc +++ b/tests/optimizer/test_optimizer_creation.cc @@ -2,7 +2,6 @@ #include "gtest/gtest.h" -#include "infini_train/include/nn/parallel/global.h" #include "infini_train/include/optimizer.h" #include "infini_train/include/tensor.h" @@ -31,7 +30,7 @@ TEST_P(OptimizerCreationTest, SGDMultiParams) { for (int i = 0; i < 3; ++i) { auto param = std::make_shared(std::vector{2, 3}, DataType::kFLOAT32, GetDevice()); param->set_requires_grad(true); - params.push_back(param); + params.emplace_back(param); } auto optimizer = std::make_shared(params, 0.01); EXPECT_NE(optimizer, nullptr); @@ -42,10 +41,50 @@ TEST_P(OptimizerCreationTest, AdamMultiParams) 
{ for (int i = 0; i < 3; ++i) { auto param = std::make_shared(std::vector{2, 3}, DataType::kFLOAT32, GetDevice()); param->set_requires_grad(true); - params.push_back(param); + params.emplace_back(param); } + auto optimizer = std::make_shared(params, 0.001); EXPECT_NE(optimizer, nullptr); } +TEST_P(OptimizerCreationTest, SGDCreationNamed) { + auto param = std::make_shared(std::vector{2, 3}, DataType::kFLOAT32, GetDevice()); + param->set_requires_grad(true); + auto optimizer = std::make_shared( + std::vector>>{{"weight", param}}, 0.01); + EXPECT_NE(optimizer, nullptr); +} + +TEST_P(OptimizerCreationTest, AdamCreationNamed) { + auto param = std::make_shared(std::vector{2, 3}, DataType::kFLOAT32, GetDevice()); + param->set_requires_grad(true); + auto optimizer = std::make_shared( + std::vector>>{{"weight", param}}, 0.001); + EXPECT_NE(optimizer, nullptr); +} + +TEST_P(OptimizerCreationTest, SGDMultiNamedParams) { + std::vector>> named_params; + for (int i = 0; i < 3; ++i) { + auto param = std::make_shared(std::vector{2, 3}, DataType::kFLOAT32, GetDevice()); + param->set_requires_grad(true); + named_params.emplace_back("p" + std::to_string(i), param); + } + auto optimizer = std::make_shared(named_params, 0.01); + EXPECT_NE(optimizer, nullptr); +} + +TEST_P(OptimizerCreationTest, AdamMultiNamedParams) { + std::vector>> named_params; + for (int i = 0; i < 3; ++i) { + auto param = std::make_shared(std::vector{2, 3}, DataType::kFLOAT32, GetDevice()); + param->set_requires_grad(true); + named_params.emplace_back("p" + std::to_string(i), param); + } + + auto optimizer = std::make_shared(named_params, 0.001); + EXPECT_NE(optimizer, nullptr); +} + INFINI_TRAIN_REGISTER_TEST(OptimizerCreationTest); diff --git a/tests/optimizer/test_optimizer_step.cc b/tests/optimizer/test_optimizer_step.cc index 66ef1be7..50b1059f 100644 --- a/tests/optimizer/test_optimizer_step.cc +++ b/tests/optimizer/test_optimizer_step.cc @@ -49,7 +49,7 @@ TEST_P(OptimizerStepTest, SGDMultiParams) { for (int 
i = 0; i < 3; ++i) { auto param = std::make_shared(std::vector{2, 3}, DataType::kFLOAT32, GetDevice()); param->set_requires_grad(true); - params.push_back(param); + params.emplace_back(param); } auto optimizer = std::make_shared(params, 0.01); EXPECT_NE(optimizer, nullptr);