From 8a74bd62ab19551292fac5eaec1902d68a8fa6f4 Mon Sep 17 00:00:00 2001
From: cx <cx2016013@163.com>
Date: Thu, 4 Jun 2026 02:24:00 +0000
Subject: [PATCH 1/2] feat: add InfiniOps optional kernel provider

Add USE_INFINIOPS CMake option wiring the third_party/InfiniOps
subproject as an optional kernel backend (requires USE_CUDA). Provide
the InfiniOps adapter, CUDA handle factory, and Add/Gemm kernel
implementations registered via REGISTER_KERNEL.

Split the CUDA Add/Gemm registrations into dedicated registry sources
so they can be excluded when InfiniOps supplies those kernels instead.
Enable USE_INFINIOPS in the test build config.
---
 CMakeLists.txt                                |  36 ++++++
 .../core/kernel_provider/infiniops/adapter.h  |  42 +++++++
 .../core/kernel_provider/infiniops/adapter.cc | 116 ++++++++++++++++++
 .../kernel_provider/infiniops/cuda/handle.cc  |  24 ++++
 .../kernel_provider/infiniops/elementwise.cc  |  71 +++++++++++
 .../core/kernel_provider/infiniops/gemm.cc    |  74 +++++++++++
 infini_train/src/kernels/cuda/common/gemm.cu  |   2 -
 .../src/kernels/cuda/common/gemm_registry.cu  |   4 +
 infini_train/src/kernels/cuda/elementwise.cu  |   1 -
 .../kernels/cuda/elementwise_add_registry.cu  |  12 ++
 scripts/test_config.json                      |   4 +-
 11 files changed, 381 insertions(+), 5 deletions(-)
 create mode 100644 infini_train/include/core/kernel_provider/infiniops/adapter.h
 create mode 100644 infini_train/src/core/kernel_provider/infiniops/adapter.cc
 create mode 100644 infini_train/src/core/kernel_provider/infiniops/cuda/handle.cc
 create mode 100644 infini_train/src/core/kernel_provider/infiniops/elementwise.cc
 create mode 100644 infini_train/src/core/kernel_provider/infiniops/gemm.cc
 create mode 100644 infini_train/src/kernels/cuda/common/gemm_registry.cu
 create mode 100644 infini_train/src/kernels/cuda/elementwise_add_registry.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4c6da822..3e0cf826 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,6 +4,7 @@ option(USE_CUDA "Support NVIDIA CUDA" OFF)
 option(PROFILE_MODE "ENABLE PROFILE MODE" OFF)
 option(USE_OMP "Use OpenMP as backend for Eigen" ON)
 option(USE_NCCL "Build project for distributed running" ON)
+option(USE_INFINIOPS "Use InfiniOps as an optional kernel provider" OFF)
 option(BUILD_TEST "Build InfiniTrain tests" OFF)
 
 project(infini_train VERSION 0.5.0 LANGUAGES CXX)
@@ -51,6 +52,31 @@ include_directories(${PROJECT_SOURCE_DIR}/third_party/eigen)
 
 include_directories(${PROJECT_SOURCE_DIR})
 
+if(USE_INFINIOPS)
+  if(NOT USE_CUDA)
+    message(FATAL_ERROR "USE_INFINIOPS=ON currently requires USE_CUDA=ON")
+  endif()
+
+  set(INFINIOPS_SOURCE_DIR "${PROJECT_SOURCE_DIR}/third_party/InfiniOps")
+  if(NOT EXISTS "${INFINIOPS_SOURCE_DIR}/CMakeLists.txt")
+    message(FATAL_ERROR
+      "USE_INFINIOPS=ON requires InfiniOps under third_party/InfiniOps. "
+      "Run: git submodule update --init third_party/InfiniOps")
+  endif()
+  # InfiniOps backend switches (presumably read by its CMakeLists); FORCE overrides any cached value.
+  set(WITH_CPU OFF CACHE BOOL "Enable InfiniOps CPU backend" FORCE)
+  set(WITH_NVIDIA ON CACHE BOOL "Enable InfiniOps NVIDIA backend" FORCE)
+  add_subdirectory("${INFINIOPS_SOURCE_DIR}" "${PROJECT_BINARY_DIR}/third_party/InfiniOps" EXCLUDE_FROM_ALL)
+  if(NOT TARGET infiniops)
+    message(FATAL_ERROR "InfiniOps third-party project did not define target `infiniops`")
+  endif()
+  if(NOT TARGET InfiniOps::infiniops)
+    add_library(InfiniOps::infiniops ALIAS infiniops)
+  endif()
+  # Defined only after add_subdirectory() so the macro is not inherited by the InfiniOps directory scope.
+  add_compile_definitions(USE_INFINIOPS=1)
+endif()
+
 if(PROFILE_MODE)
   add_compile_definitions(PROFILE_MODE=1)
 endif()
@@ -62,6 +88,9 @@ endif()
 # Framework core sources (*.cc), excluding cpu kernels (they are built separately)
 file(GLOB_RECURSE SRC ${PROJECT_SOURCE_DIR}/infini_train/src/*.cc)
 list(FILTER SRC EXCLUDE REGEX ".*kernels/cpu/.*")
+if(NOT USE_INFINIOPS)  # InfiniOps provider sources are compiled only when the option is ON
+  list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/kernel_provider/infiniops/.*[.]cc$")
+endif()
 if(NOT USE_CUDA)
   list(FILTER SRC EXCLUDE REGEX ".*runtime/cuda/.*")
   list(FILTER SRC EXCLUDE REGEX ".*ccl/cuda/.*")
@@ -100,6 +129,10 @@ if(USE_CUDA)
 
   # Only compile CUDA kernels / cuda sources here (your original used src/*.cu)
   file(GLOB_RECURSE CUDA_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/*.cu)
+  if(USE_INFINIOPS)  # InfiniOps supplies Add/Gemm, so the native CUDA registry sources are left out
+    list(FILTER CUDA_KERNELS EXCLUDE REGEX ".*infini_train/src/kernels/cuda/common/gemm_registry[.]cu$")
+    list(FILTER CUDA_KERNELS EXCLUDE REGEX ".*infini_train/src/kernels/cuda/elementwise_add_registry[.]cu$")
+  endif()
 
   add_library(infini_train_cuda_kernels STATIC ${CUDA_KERNELS})
   set_target_properties(infini_train_cuda_kernels PROPERTIES CUDA_ARCHITECTURES "75;80;90")
@@ -126,6 +159,9 @@ endif()
 # ------------------------------------------------------------------------------
 
 add_library(infini_train STATIC ${SRC})
+if(USE_INFINIOPS)  # PUBLIC: the InfiniOps dependency also propagates to targets linking infini_train
+  target_link_libraries(infini_train PUBLIC InfiniOps::infiniops)
+endif()
 target_link_libraries(infini_train
   PUBLIC
     glog
diff --git a/infini_train/include/core/kernel_provider/infiniops/adapter.h b/infini_train/include/core/kernel_provider/infiniops/adapter.h
new file mode 100644
index 00000000..a5417ccc
--- /dev/null
+++ b/infini_train/include/core/kernel_provider/infiniops/adapter.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include <handle.h>
+
+#include "data_type.h"
+#include "tensor.h"
+
+#include "infini_train/include/datatype.h"
+#include "infini_train/include/device.h"
+
+namespace infini_train {
+class Tensor;
+} // namespace infini_train
+
+namespace infini_train::core {
+class Stream;
+} // namespace infini_train::core
+
+namespace infini_train::kernel_provider::infiniops {
+// Maps a framework DataType to its InfiniOps equivalent; an unmapped dtype is a fatal error.
+infini::ops::DataType ToOpsDataType(DataType dtype);
+// Converts a framework Device (type and index) to an InfiniOps device; an unmapped type is a fatal error.
+infini::ops::Device ToOpsDevice(const Device &device);
+// Signature of a per-device-type factory that builds an InfiniOps handle from a device and its stream.
+using HandleFactory = infini::ops::Handle (*)(const Device &device, core::Stream *stream);
+// Registers the factory for one device type; a null factory or a repeated registration is a fatal error.
+void RegisterHandleFactory(Device::DeviceType type, HandleFactory factory);
+// Builds a handle through the factory registered for the device's type, using the device guard's stream.
+infini::ops::Handle GetHandle(const Device &device);
+// Views a framework tensor as an InfiniOps tensor with contiguous strides computed from its dims.
+infini::ops::Tensor ToOpsTensor(const std::shared_ptr<Tensor> &tensor);
+// Views a raw buffer as an InfiniOps tensor with contiguous strides computed from `dims`.
+infini::ops::Tensor ToOpsTensor(void *data, const std::vector<int64_t> &dims, DataType dtype, const Device &device);
+// Views a raw buffer as an InfiniOps tensor with explicit strides (one entry per dimension).
+infini::ops::Tensor ToOpsTensor(void *data, const std::vector<int64_t> &dims, DataType dtype, const Device &device,
+                                const std::vector<int64_t> &strides);
+
+} // namespace infini_train::kernel_provider::infiniops
diff --git a/infini_train/src/core/kernel_provider/infiniops/adapter.cc b/infini_train/src/core/kernel_provider/infiniops/adapter.cc
new file mode 100644
index 00000000..d1b07bf4
--- /dev/null
+++ b/infini_train/src/core/kernel_provider/infiniops/adapter.cc
@@ -0,0 +1,116 @@
+#include "infini_train/include/core/kernel_provider/infiniops/adapter.h"
+
+#include <map>
+#include <unordered_map>
+
+#include "glog/logging.h"
+
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/tensor.h"
+
+namespace infini_train::kernel_provider::infiniops {
+
+namespace {
+// Framework dtypes that have an InfiniOps counterpart; consulted by ToOpsDataType().
+inline const std::unordered_map<DataType, infini::ops::DataType> kOpsDataTypeMap = {
+    {DataType::kFLOAT16, infini::ops::DataType::kFloat16}, {DataType::kBFLOAT16, infini::ops::DataType::kBFloat16},
+    {DataType::kFLOAT32, infini::ops::DataType::kFloat32}, {DataType::kFLOAT64, infini::ops::DataType::kFloat64},
+    {DataType::kINT8, infini::ops::DataType::kInt8},       {DataType::kINT16, infini::ops::DataType::kInt16},
+    {DataType::kINT32, infini::ops::DataType::kInt32},     {DataType::kINT64, infini::ops::DataType::kInt64},
+    {DataType::kUINT8, infini::ops::DataType::kUInt8},     {DataType::kUINT16, infini::ops::DataType::kUInt16},
+    {DataType::kUINT32, infini::ops::DataType::kUInt32},   {DataType::kUINT64, infini::ops::DataType::kUInt64},
+};
+// Framework device types that have an InfiniOps counterpart; consulted by ToOpsDevice().
+inline const std::unordered_map<Device::DeviceType, infini::ops::Device::Type> kOpsDeviceTypeMap = {
+    {Device::DeviceType::kCUDA, infini::ops::Device::Type::kNvidia},
+    {Device::DeviceType::kCPU, infini::ops::Device::Type::kCpu},
+};
+// Registry of handle factories keyed by device type, held in a function-local static.
+std::map<Device::DeviceType, HandleFactory> &HandleFactories() {
+    static std::map<Device::DeviceType, HandleFactory> factories;
+    return factories;
+}
+
+} // namespace
+// Adds `factory` as the handle factory for `type`; a null factory or a duplicate registration aborts.
+void RegisterHandleFactory(Device::DeviceType type, HandleFactory factory) {
+    CHECK(factory != nullptr);
+    auto &factories = HandleFactories();
+    CHECK(!factories.contains(type)) << "InfiniOps handle factory already registered for device type "
+                                     << static_cast<int>(type);
+    factories.emplace(type, factory);
+}
+// Builds an InfiniOps handle for `device` through its registered factory; aborts if none is registered.
+infini::ops::Handle GetHandle(const Device &device) {
+    auto &factories = HandleFactories();
+    auto it = factories.find(device.type());
+    CHECK(it != factories.end()) << "InfiniOps handle factory is not registered for device type "
+                                 << static_cast<int>(device.type());
+    // The factory is given the stream that the device guard implementation reports for this device.
+    auto *stream = core::GetDeviceGuardImpl(device.type())->GetStream(device);
+    return it->second(device, stream);
+}
+
+infini::ops::DataType ToOpsDataType(DataType dtype) {
+    // Successful lookups return immediately; an unmapped dtype falls through to the fatal log.
+    if (const auto entry = kOpsDataTypeMap.find(dtype); entry != kOpsDataTypeMap.end()) {
+        return entry->second;
+    }
+    LOG(FATAL) << "Unsupported DataType for InfiniOps: " << static_cast<int>(dtype);
+    __builtin_unreachable();
+}
+
+infini::ops::Device ToOpsDevice(const Device &device) {
+    // Only the device type needs translating; the device index is passed through unchanged.
+    if (const auto entry = kOpsDeviceTypeMap.find(device.type()); entry != kOpsDeviceTypeMap.end()) {
+        return {entry->second, device.index()};
+    }
+    LOG(FATAL) << "Unsupported DeviceType for InfiniOps: " << static_cast<int>(device.type());
+    __builtin_unreachable();
+}
+
+namespace {
+infini::ops::Tensor::Strides ComputeContiguousStrides(const std::vector<int64_t> &dims) {
+    // Innermost stride is 1; each outer stride is the next-inner stride times that inner extent.
+    infini::ops::Tensor::Strides result(dims.size());
+    if (!dims.empty()) {
+        result.back() = 1;
+        for (size_t axis = dims.size() - 1; axis > 0; --axis) {
+            result[axis - 1] = result[axis] * static_cast<infini::ops::Tensor::Stride>(dims[axis]);
+        }
+    }
+    return result;
+}
+// Copies `dims` into an InfiniOps shape, casting each extent to infini::ops::Tensor::Size.
+infini::ops::Tensor::Shape ToShape(const std::vector<int64_t> &dims) {
+    infini::ops::Tensor::Shape shape(dims.size());
+    for (size_t i = 0; i < dims.size(); ++i) { shape[i] = static_cast<infini::ops::Tensor::Size>(dims[i]); }
+    return shape;
+}
+
+infini::ops::Tensor::Strides ToStrides(const std::vector<int64_t> &strides) {
+    // Element-wise cast of the caller-provided strides into the InfiniOps stride type.
+    const size_t count = strides.size();
+    infini::ops::Tensor::Strides out(count);
+    for (size_t i = 0; i < count; ++i) { out[i] = static_cast<infini::ops::Tensor::Stride>(strides[i]); }
+    return out;
+}
+} // namespace
+
+// Views a framework tensor as a contiguous InfiniOps tensor; a null `tensor` is a fatal error.
+infini::ops::Tensor ToOpsTensor(const std::shared_ptr<Tensor> &tensor) {
+    CHECK(tensor != nullptr) << "InfiniOps ToOpsTensor: tensor must not be null";
+    return ToOpsTensor(tensor->DataPtr(), tensor->Dims(), tensor->Dtype(), tensor->GetDevice());
+}
+// Views a raw buffer as an InfiniOps tensor with contiguous strides derived from `dims`.
+infini::ops::Tensor ToOpsTensor(void *data, const std::vector<int64_t> &dims, DataType dtype, const Device &device) {
+    return {data, ToShape(dims), ToOpsDataType(dtype), ToOpsDevice(device), ComputeContiguousStrides(dims)};
+}
+// Views a raw buffer as an InfiniOps tensor with caller-supplied strides; needs one stride per dimension.
+infini::ops::Tensor ToOpsTensor(void *data, const std::vector<int64_t> &dims, DataType dtype, const Device &device,
+                                const std::vector<int64_t> &strides) {
+    CHECK_EQ(dims.size(), strides.size());
+    return {data, ToShape(dims), ToOpsDataType(dtype), ToOpsDevice(device), ToStrides(strides)};
+}
+
+} // namespace infini_train::kernel_provider::infiniops
diff --git a/infini_train/src/core/kernel_provider/infiniops/cuda/handle.cc b/infini_train/src/core/kernel_provider/infiniops/cuda/handle.cc
new file mode 100644
index 00000000..0be445a9
--- /dev/null
+++ b/infini_train/src/core/kernel_provider/infiniops/cuda/handle.cc
@@ -0,0 +1,24 @@
+#include "glog/logging.h"
+
+#include "infini_train/include/core/kernel_provider/infiniops/adapter.h"
+#include "infini_train/src/core/runtime/cuda/cuda_runtime_common.h"
+
+namespace infini_train::kernel_provider::infiniops {
+namespace {
+// Handle factory for CUDA devices; the Device argument is unused.
+infini::ops::Handle MakeCudaHandle(const Device &, core::Stream *stream) {
+    auto *cuda_stream = dynamic_cast<core::cuda::CudaStream *>(stream);
+    CHECK_NOTNULL(cuda_stream); // fatal when `stream` is null or not a core::cuda::CudaStream
+    // Pass the underlying CUDA stream to InfiniOps as an opaque pointer stored on the handle.
+    infini::ops::Handle handle;
+    handle.set_stream(static_cast<void *>(cuda_stream->cuda_stream()));
+    return handle;
+}
+// Registers MakeCudaHandle for CUDA devices when this translation unit's globals are initialized.
+const bool kCudaHandleFactoryRegistered = []() {
+    RegisterHandleFactory(Device::DeviceType::kCUDA, MakeCudaHandle);
+    return true;
+}();
+
+} // namespace
+} // namespace infini_train::kernel_provider::infiniops
diff --git a/infini_train/src/core/kernel_provider/infiniops/elementwise.cc b/infini_train/src/core/kernel_provider/infiniops/elementwise.cc
new file mode 100644
index 00000000..ed700bad
--- /dev/null
+++ b/infini_train/src/core/kernel_provider/infiniops/elementwise.cc
@@ -0,0 +1,71 @@
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "infini_train/include/core/kernel_provider/infiniops/adapter.h"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include <infini/ops.h>
+
+namespace infini_train::kernel_provider::infiniops {
+namespace {
+// Strides that let an input of shape `dims` be read as if it had shape `out_dims` (trailing-aligned).
+std::vector<int64_t> ComputeBroadcastStrides(const std::vector<int64_t> &dims, const std::vector<int64_t> &out_dims) {
+    CHECK_LE(dims.size(), out_dims.size());
+    const size_t rank = dims.size();
+    std::vector<int64_t> contiguous(rank);
+    if (rank > 0) {
+        contiguous[rank - 1] = 1;
+        for (size_t axis = rank - 1; axis > 0; --axis) { contiguous[axis - 1] = contiguous[axis] * dims[axis]; }
+    }
+    // Right-align the input under the output shape; leading padded axes and size-1 axes keep stride 0.
+    const size_t pad = out_dims.size() - rank;
+    std::vector<int64_t> broadcast(out_dims.size(), 0);
+    for (size_t axis = 0; axis < rank; ++axis) {
+        const int64_t dim = dims[axis];
+        const int64_t out_dim = out_dims[pad + axis];
+        CHECK(dim == out_dim || dim == 1) << "InfiniOps Add broadcast shape mismatch";
+        broadcast[pad + axis] = (dim != 1) ? contiguous[axis] : 0;
+    }
+    return broadcast;
+}
+// Views `tensor`'s buffer at shape `out_dims`, with strides from ComputeBroadcastStrides().
+infini::ops::Tensor ToBroadcastOpsTensor(const std::shared_ptr<Tensor> &tensor, const std::vector<int64_t> &out_dims,
+                                         DataType dtype) {
+    return ToOpsTensor(tensor->DataPtr(), out_dims, dtype, tensor->GetDevice(),
+                       ComputeBroadcastStrides(tensor->Dims(), out_dims));
+}
+
+} // namespace
+// Element-wise add through InfiniOps: `b` is broadcast onto `a`'s shape and the result uses the promoted dtype.
+std::shared_ptr<Tensor> AddForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b) {
+    // An empty `b` is rejected first so the divisibility check below cannot divide by zero.
+    CHECK_GT(b->NumElements(), 0);
+    CHECK_GE(a->NumElements(), b->NumElements());
+    CHECK_EQ(a->NumElements() % b->NumElements(), 0);
+    auto a_dtype = a->Dtype();
+    auto b_dtype = b->Dtype();
+    DataType promoted_type = PromoteDataTypes(a_dtype, b_dtype);
+    // Convert only the operand whose dtype differs from the promoted type.
+    auto a_promoted = a_dtype == promoted_type ? a : std::make_shared<Tensor>(a->To(promoted_type));
+    auto b_promoted = b_dtype == promoted_type ? b : std::make_shared<Tensor>(b->To(promoted_type));
+    // The result takes `a`'s shape and device; both operands are viewed at that shape below.
+    auto output = std::make_shared<Tensor>(a->Dims(), promoted_type, a->GetDevice());
+
+    core::DeviceGuard guard(a->GetDevice());
+    auto handle = GetHandle(a->GetDevice());
+    auto a_ops = ToBroadcastOpsTensor(a_promoted, output->Dims(), promoted_type);
+    auto b_ops = ToBroadcastOpsTensor(b_promoted, output->Dims(), promoted_type);
+    auto c_ops = ToOpsTensor(output);
+    infini::ops::Add::Call(handle, {}, a_ops, b_ops, c_ops);
+    return output;
+}
+
+} // namespace infini_train::kernel_provider::infiniops
+// Registers the InfiniOps implementation as the CUDA AddForward kernel.
+REGISTER_KERNEL(infini_train::Device::DeviceType::kCUDA, AddForward,
+                infini_train::kernel_provider::infiniops::AddForward)
diff --git a/infini_train/src/core/kernel_provider/infiniops/gemm.cc b/infini_train/src/core/kernel_provider/infiniops/gemm.cc
new file mode 100644
index 00000000..43124d92
--- /dev/null
+++ b/infini_train/src/core/kernel_provider/infiniops/gemm.cc
@@ -0,0 +1,74 @@
+#include <optional>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "infini_train/include/core/kernel_provider/infiniops/adapter.h"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/src/kernels/common/gemm.h"
+
+#include <infini/ops.h>
+
+namespace infini_train::kernel_provider::infiniops {
+namespace {
+// Encodes the transpose flag as the integer passed to infini::ops::Gemm: 0 = no transpose, 1 = transpose.
+int ToInfiniOpsTrans(kernels::GemmTranspose op) {
+    if (op == kernels::GemmTranspose::kNoTranspose) {
+        return 0;
+    }
+    if (op == kernels::GemmTranspose::kTranspose) {
+        return 1;
+    }
+    LOG(FATAL) << "InfiniOps Gemm: unsupported transpose flag " << static_cast<int>(op);
+    return 0; // unreachable
+}
+// Shape of a possibly batched matrix: {batch_count, rows, cols} when batch_count > 1, otherwise {rows, cols}.
+std::vector<int64_t> MatrixShape(int batch_count, int rows, int cols) {
+    if (batch_count <= 1) {
+        return {rows, cols};
+    }
+    return {batch_count, rows, cols};
+}
+// Strides paired with MatrixShape(): {batch_stride, ld, 1} when batch_count > 1, otherwise {ld, 1}.
+std::vector<int64_t> RowMajorStrides(int batch_count, int ld, long long batch_stride) {
+    if (batch_count <= 1) {
+        return {ld, 1};
+    }
+    return {batch_stride, ld, 1};
+}
+// Views a column-major rows x cols matrix (leading dimension `ld`) as a row-major cols x rows tensor.
+infini::ops::Tensor MakeRowMajorTransposeView(const void *data, int batch_count, int column_major_rows,
+                                              int column_major_cols, int ld, long long batch_stride, DataType dtype,
+                                              const Device &device) {
+    return ToOpsTensor(const_cast<void *>(data), MatrixShape(batch_count, column_major_cols, column_major_rows), dtype,
+                       device, RowMajorStrides(batch_count, ld, batch_stride));
+}
+
+} // namespace
+// Runs the GEMM described by `p` through InfiniOps; requires p.batch_count >= 1.
+void Gemm(Device device, kernels::GemmParams p) {
+    CHECK_GE(p.batch_count, 1);
+
+    const bool trans_a = p.trans_a == kernels::GemmTranspose::kTranspose;
+    const bool trans_b = p.trans_b == kernels::GemmTranspose::kTranspose;
+    // Row/column counts handed to the view helper for A and B, chosen from m/n/k by the transpose flags.
+    const int a_rows = trans_a ? p.k : p.m;
+    const int a_cols = trans_a ? p.m : p.k;
+    const int b_rows = trans_b ? p.n : p.k;
+    const int b_cols = trans_b ? p.k : p.n;
+    // Each operand is handed over as a row-major view with rows and columns exchanged.
+    core::DeviceGuard guard(device);
+    auto handle = GetHandle(device);
+    auto a = MakeRowMajorTransposeView(p.A, p.batch_count, a_rows, a_cols, p.lda, p.stride_a, p.input_dtype, device);
+    auto b = MakeRowMajorTransposeView(p.B, p.batch_count, b_rows, b_cols, p.ldb, p.stride_b, p.input_dtype, device);
+    auto c = MakeRowMajorTransposeView(p.C, p.batch_count, p.m, p.n, p.ldc, p.stride_c, p.output_dtype, device);
+    // NOTE(review): B/A and their transpose flags are passed swapped, presumably so the call computes C^T.
+    infini::ops::Gemm::Call(handle, {}, b, a, std::optional<float>{p.alpha}, std::optional<float>{p.beta},
+                            std::optional<int>{ToInfiniOpsTrans(p.trans_b)},
+                            std::optional<int>{ToInfiniOpsTrans(p.trans_a)}, c);
+}
+
+} // namespace infini_train::kernel_provider::infiniops
+// Registers the InfiniOps implementation as the CUDA Gemm kernel.
+REGISTER_KERNEL(infini_train::Device::DeviceType::kCUDA, Gemm, infini_train::kernel_provider::infiniops::Gemm)
diff --git a/infini_train/src/kernels/cuda/common/gemm.cu b/infini_train/src/kernels/cuda/common/gemm.cu
index 80f84970..8e3bfd56 100644
--- a/infini_train/src/kernels/cuda/common/gemm.cu
+++ b/infini_train/src/kernels/cuda/common/gemm.cu
@@ -83,5 +83,3 @@ void SgemvCuda(const Device &device, const SgemvParams &p) {
 }
 
 } // namespace infini_train::kernels::cuda
-
-REGISTER_KERNEL(infini_train::Device::DeviceType::kCUDA, Gemm, infini_train::kernels::cuda::Gemm)
diff --git a/infini_train/src/kernels/cuda/common/gemm_registry.cu b/infini_train/src/kernels/cuda/common/gemm_registry.cu
new file mode 100644
index 00000000..a67e0554
--- /dev/null
+++ b/infini_train/src/kernels/cuda/common/gemm_registry.cu
@@ -0,0 +1,4 @@
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/src/kernels/cuda/common/gemm.cuh"
+
+REGISTER_KERNEL(infini_train::Device::DeviceType::kCUDA, Gemm, infini_train::kernels::cuda::Gemm)
diff --git a/infini_train/src/kernels/cuda/elementwise.cu b/infini_train/src/kernels/cuda/elementwise.cu
index fe63e0b2..dc8b965a 100644
--- a/infini_train/src/kernels/cuda/elementwise.cu
+++ b/infini_train/src/kernels/cuda/elementwise.cu
@@ -1246,7 +1246,6 @@ REGISTER_CUDA_ELEMENTWISE_KERNEL(GeForward)
 REGISTER_CUDA_ELEMENTWISE_KERNEL(GeScalarForward)
 REGISTER_CUDA_ELEMENTWISE_KERNEL(OrForward)
 REGISTER_CUDA_ELEMENTWISE_KERNEL(AndForward)
-REGISTER_CUDA_ELEMENTWISE_KERNEL(AddForward)
 REGISTER_CUDA_ELEMENTWISE_KERNEL(AddBackward)
 REGISTER_CUDA_ELEMENTWISE_KERNEL(AddScalarForward)
 REGISTER_CUDA_ELEMENTWISE_KERNEL(AddScalarBackward)
diff --git a/infini_train/src/kernels/cuda/elementwise_add_registry.cu b/infini_train/src/kernels/cuda/elementwise_add_registry.cu
new file mode 100644
index 00000000..7967e4aa
--- /dev/null
+++ b/infini_train/src/kernels/cuda/elementwise_add_registry.cu
@@ -0,0 +1,12 @@
+#include <memory>
+
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+namespace infini_train::kernels::cuda {
+// Forward declaration only; the definition presumably stays in elementwise.cu (TODO confirm).
+std::shared_ptr<Tensor> AddForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b);
+
+} // namespace infini_train::kernels::cuda
+
+REGISTER_KERNEL(infini_train::Device::DeviceType::kCUDA, AddForward, infini_train::kernels::cuda::AddForward)
diff --git a/scripts/test_config.json b/scripts/test_config.json
index 2f061528..36d2839b 100644
--- a/scripts/test_config.json
+++ b/scripts/test_config.json
@@ -15,12 +15,12 @@
         {
             "id": "build_1",
             "profile": false,
-            "cmd": "cmake -DBUILD_TEST=ON -DUSE_CUDA=ON -DUSE_NCCL=ON .. && make -j"
+            "cmd": "cmake -DBUILD_TEST=ON -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_INFINIOPS=ON .. && make -j"
         },
         {
             "id": "build_2",
             "profile": true,
-            "cmd": "cmake -DUSE_CUDA=ON -DUSE_NCCL=ON -DPROFILE_MODE=ON .. && make -j"
+            "cmd": "cmake -DUSE_CUDA=ON -DUSE_NCCL=ON -DPROFILE_MODE=ON -DUSE_INFINIOPS=ON .. && make -j"
         }
     ],
     "test_groups": [

From 8c9f75dace76a1ba8876a83a57dd0e670aed1f1e Mon Sep 17 00:00:00 2001
From: cx <cx2016013@163.com>
Date: Thu, 4 Jun 2026 09:47:08 +0000
Subject: [PATCH 2/2] chore: add InfiniOps submodule

---
 .gitmodules           | 3 +++
 third_party/InfiniOps | 1 +
 2 files changed, 4 insertions(+)
 create mode 160000 third_party/InfiniOps

diff --git a/.gitmodules b/.gitmodules
index d33d0a64..7b02a741 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,6 @@
 [submodule "third_party/googletest"]
 	path = third_party/googletest
 	url = git@github.com:google/googletest.git
+[submodule "third_party/InfiniOps"]
+	path = third_party/InfiniOps
+	url = git@github.com:InfiniTensor/InfiniOps.git
diff --git a/third_party/InfiniOps b/third_party/InfiniOps
new file mode 160000
index 00000000..e870e3e7
--- /dev/null
+++ b/third_party/InfiniOps
@@ -0,0 +1 @@
+Subproject commit e870e3e74b196226ec80cd8ff9f843f838a70f39