InfiniTensor · chen2021673 · Jun 4, 2026 · Jun 4, 2026
diff --git a/.gitmodules b/.gitmodules
@@ -10,3 +10,6 @@
 [submodule "third_party/googletest"]
 	path = third_party/googletest
 	url = git@github.com:google/googletest.git
+[submodule "third_party/InfiniOps"]
+	path = third_party/InfiniOps
+	url = git@github.com:InfiniTensor/InfiniOps.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -4,6 +4,7 @@ option(USE_CUDA "Support NVIDIA CUDA" OFF)
 option(PROFILE_MODE "ENABLE PROFILE MODE" OFF)
 option(USE_OMP "Use OpenMP as backend for Eigen" ON)
 option(USE_NCCL "Build project for distributed running" ON)
+option(USE_INFINIOPS "Use InfiniOps as an optional kernel provider" OFF)
 option(BUILD_TEST "Build InfiniTrain tests" OFF)
 
 project(infini_train VERSION 0.5.0 LANGUAGES CXX)
@@ -51,6 +52,31 @@ include_directories(${PROJECT_SOURCE_DIR}/third_party/eigen)
 
 include_directories(${PROJECT_SOURCE_DIR})
 
+if(USE_INFINIOPS)
+  # Only the NVIDIA backend of InfiniOps is enabled below, so a CUDA build is required.
+  if(NOT USE_CUDA)
+    message(FATAL_ERROR "USE_INFINIOPS=ON currently requires USE_CUDA=ON")
+  endif()
+
+  # Directory-scoped definition: applies to targets of this directory and to subdirectories added
+  # after this point (which includes the InfiniOps subdirectory added below).
+  add_compile_definitions(USE_INFINIOPS=1)
+
+  # InfiniOps is built from the third_party/InfiniOps checkout; fail early with a hint when it is absent.
+  set(INFINIOPS_SOURCE_DIR "${PROJECT_SOURCE_DIR}/third_party/InfiniOps")
+  if(NOT EXISTS "${INFINIOPS_SOURCE_DIR}/CMakeLists.txt")
+    message(FATAL_ERROR
+      "USE_INFINIOPS=ON requires InfiniOps under third_party/InfiniOps. "
+      "Run: git submodule update --init third_party/InfiniOps")
+  endif()
+
+  # Pin the backend switches (CPU off, NVIDIA on). FORCE overwrites any value already in the cache.
+  set(WITH_CPU OFF CACHE BOOL "Enable InfiniOps CPU backend" FORCE)
+  set(WITH_NVIDIA ON CACHE BOOL "Enable InfiniOps NVIDIA backend" FORCE)
+  # Both paths are quoted so each always expands to exactly one argument.
+  # EXCLUDE_FROM_ALL keeps InfiniOps targets out of the default build unless something depends on them.
+  add_subdirectory("${INFINIOPS_SOURCE_DIR}" "${CMAKE_BINARY_DIR}/third_party/InfiniOps" EXCLUDE_FROM_ALL)
+  if(NOT TARGET infiniops)
+    message(FATAL_ERROR "InfiniOps third-party project did not define target `infiniops`")
+  endif()
+  # Provide the namespaced name used for linking, unless the subproject already defined it.
+  if(NOT TARGET InfiniOps::infiniops)
+    add_library(InfiniOps::infiniops ALIAS infiniops)
+  endif()
+endif()
+
 if(PROFILE_MODE)
   add_compile_definitions(PROFILE_MODE=1)
 endif()
@@ -62,6 +88,9 @@ endif()
 # Framework core sources (*.cc), excluding cpu kernels (they are built separately)
 file(GLOB_RECURSE SRC ${PROJECT_SOURCE_DIR}/infini_train/src/*.cc)
 list(FILTER SRC EXCLUDE REGEX ".*kernels/cpu/.*")
+if(NOT USE_INFINIOPS)
+  # InfiniOps-backed sources are compiled only when USE_INFINIOPS is ON.
+  # `[.]` is used for the literal dot: inside a quoted CMake argument `\.` is reduced to a bare `.`
+  # before the regex engine sees it, which would match any character.
+  list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/kernel_provider/infiniops/.*[.]cc$")
+endif()
 if(NOT USE_CUDA)
   list(FILTER SRC EXCLUDE REGEX ".*runtime/cuda/.*")
   list(FILTER SRC EXCLUDE REGEX ".*ccl/cuda/.*")
@@ -100,6 +129,10 @@ if(USE_CUDA)
 
   # Only compile CUDA kernels / cuda sources here (your original used src/*.cu)
   file(GLOB_RECURSE CUDA_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/*.cu)
+  if(USE_INFINIOPS)
+    # With InfiniOps enabled, leave the built-in GEMM and elementwise-add registry sources out of the
+    # CUDA kernel library (one alternation covers both files).
+    list(FILTER CUDA_KERNELS EXCLUDE REGEX
+      ".*infini_train/src/kernels/cuda/(common/gemm_registry|elementwise_add_registry)[.]cu$")
+  endif()
 
   add_library(infini_train_cuda_kernels STATIC ${CUDA_KERNELS})
   set_target_properties(infini_train_cuda_kernels PROPERTIES CUDA_ARCHITECTURES "75;80;90")
@@ -126,6 +159,9 @@ endif()
 # ------------------------------------------------------------------------------
 
 add_library(infini_train STATIC ${SRC})
+if(USE_INFINIOPS)
+  target_link_libraries(infini_train PUBLIC InfiniOps::infiniops)  # PUBLIC: also propagated to targets linking infini_train
+endif()
 target_link_libraries(infini_train
   PUBLIC
     glog

diff --git a/infini_train/include/core/kernel_provider/infiniops/adapter.h b/infini_train/include/core/kernel_provider/infiniops/adapter.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include <handle.h>
+
+#include "data_type.h"
+#include "tensor.h"
+
+#include "infini_train/include/datatype.h"
+#include "infini_train/include/device.h"
+
+namespace infini_train {
+class Tensor;
+} // namespace infini_train
+
+namespace infini_train::core {
+class Stream;
+} // namespace infini_train::core
+
+namespace infini_train::kernel_provider::infiniops {
+// Maps an InfiniTrain DataType to its InfiniOps counterpart; unsupported values abort via LOG(FATAL).
+infini::ops::DataType ToOpsDataType(DataType dtype);
+// Maps a Device to an InfiniOps device (backend type + index); unsupported device types abort via LOG(FATAL).
+infini::ops::Device ToOpsDevice(const Device &device);
+// Callback that builds an InfiniOps handle for a device, given the stream looked up for that device.
+using HandleFactory = infini::ops::Handle (*)(const Device &device, core::Stream *stream);
+// Registers the factory for a device type; CHECK-fails on a null factory or a duplicate registration.
+void RegisterHandleFactory(Device::DeviceType type, HandleFactory factory);
+// Builds a handle through the factory registered for device.type(); CHECK-fails if none is registered.
+infini::ops::Handle GetHandle(const Device &device);
+// Describes a tensor's data pointer as an InfiniOps tensor with row-major contiguous strides from its dims.
+infini::ops::Tensor ToOpsTensor(const std::shared_ptr<Tensor> &tensor);
+// Describes a raw buffer as an InfiniOps tensor with row-major contiguous strides derived from dims.
+infini::ops::Tensor ToOpsTensor(void *data, const std::vector<int64_t> &dims, DataType dtype, const Device &device);
+// Describes a raw buffer with caller-supplied strides; CHECK-fails if dims and strides differ in length.
+infini::ops::Tensor ToOpsTensor(void *data, const std::vector<int64_t> &dims, DataType dtype, const Device &device,
+                                const std::vector<int64_t> &strides);
+
+} // namespace infini_train::kernel_provider::infiniops
diff --git a/infini_train/src/core/kernel_provider/infiniops/adapter.cc b/infini_train/src/core/kernel_provider/infiniops/adapter.cc
@@ -0,0 +1,116 @@
+#include "infini_train/include/core/kernel_provider/infiniops/adapter.h"
+
+#include <map>
+#include <unordered_map>
+
+#include "glog/logging.h"
+
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/tensor.h"
+
+namespace infini_train::kernel_provider::infiniops {
+
+namespace {
+// InfiniTrain DataType -> InfiniOps DataType; types missing from this table are rejected by ToOpsDataType.
+inline const std::unordered_map<DataType, infini::ops::DataType> kOpsDataTypeMap = {
+    {DataType::kFLOAT16, infini::ops::DataType::kFloat16}, {DataType::kBFLOAT16, infini::ops::DataType::kBFloat16},
+    {DataType::kFLOAT32, infini::ops::DataType::kFloat32}, {DataType::kFLOAT64, infini::ops::DataType::kFloat64},
+    {DataType::kINT8, infini::ops::DataType::kInt8},       {DataType::kINT16, infini::ops::DataType::kInt16},
+    {DataType::kINT32, infini::ops::DataType::kInt32},     {DataType::kINT64, infini::ops::DataType::kInt64},
+    {DataType::kUINT8, infini::ops::DataType::kUInt8},     {DataType::kUINT16, infini::ops::DataType::kUInt16},
+    {DataType::kUINT32, infini::ops::DataType::kUInt32},   {DataType::kUINT64, infini::ops::DataType::kUInt64},
+};
+// InfiniTrain device type -> InfiniOps device type; types missing from this table are rejected by ToOpsDevice.
+inline const std::unordered_map<Device::DeviceType, infini::ops::Device::Type> kOpsDeviceTypeMap = {
+    {Device::DeviceType::kCUDA, infini::ops::Device::Type::kNvidia},
+    {Device::DeviceType::kCPU, infini::ops::Device::Type::kCpu},
+};
+// Registry of per-device-type handle factories, held in a function-local static created on first call.
+std::map<Device::DeviceType, HandleFactory> &HandleFactories() {
+    static std::map<Device::DeviceType, HandleFactory> factories;
+    return factories;
+}
+
+} // namespace
+
+// Registers `factory` as the InfiniOps handle builder for device type `type`.
+// Aborts (CHECK) on a null factory or when `type` already has a factory.
+void RegisterHandleFactory(Device::DeviceType type, HandleFactory factory) {
+    CHECK(factory != nullptr);
+
+    auto &factories = HandleFactories();
+    CHECK(!factories.contains(type)) << "InfiniOps handle factory already registered for device type "
+                                     << static_cast<int>(type);
+    // The key is known to be absent here, so operator[] creates the entry that is then assigned.
+    factories[type] = factory;
+}
+
+// Creates an InfiniOps handle for `device` through the factory registered for its device type.
+// Aborts (CHECK) when no factory has been registered for that type.
+infini::ops::Handle GetHandle(const Device &device) {
+    const auto type = device.type();
+
+    auto &factories = HandleFactories();
+    auto it = factories.find(type);
+    CHECK(it != factories.end()) << "InfiniOps handle factory is not registered for device type "
+                                 << static_cast<int>(type);
+    const HandleFactory make_handle = it->second;
+
+    // The factory receives the stream that the device guard implementation reports for `device`.
+    auto *stream = core::GetDeviceGuardImpl(type)->GetStream(device);
+    return make_handle(device, stream);
+}
+
+// Looks up the InfiniOps data type matching `dtype`; a type without a table entry is fatal.
+infini::ops::DataType ToOpsDataType(DataType dtype) {
+    if (const auto entry = kOpsDataTypeMap.find(dtype); entry != kOpsDataTypeMap.end()) {
+        return entry->second;
+    }
+    LOG(FATAL) << "Unsupported DataType for InfiniOps: " << static_cast<int>(dtype);
+    __builtin_unreachable();
+}
+
+// Builds the InfiniOps device (mapped backend type plus `device.index()`); an unmapped device type is fatal.
+infini::ops::Device ToOpsDevice(const Device &device) {
+    if (const auto entry = kOpsDeviceTypeMap.find(device.type()); entry != kOpsDeviceTypeMap.end()) {
+        return {entry->second, device.index()};
+    }
+    LOG(FATAL) << "Unsupported DeviceType for InfiniOps: " << static_cast<int>(device.type());
+    __builtin_unreachable();
+}
+
+namespace {
+// Row-major strides for `dims`: the last stride is 1 and every earlier stride is the product of all
+// later extents. An empty `dims` yields an empty stride list.
+infini::ops::Tensor::Strides ComputeContiguousStrides(const std::vector<int64_t> &dims) {
+    using Stride = infini::ops::Tensor::Stride;
+
+    infini::ops::Tensor::Strides result(dims.size());
+    if (dims.empty()) {
+        return result;
+    }
+
+    // Fill from the innermost dimension outwards.
+    result.back() = 1;
+    for (size_t axis = dims.size() - 1; axis > 0; --axis) {
+        result[axis - 1] = result[axis] * static_cast<Stride>(dims[axis]);
+    }
+    return result;
+}
+
+// Converts `dims` element by element into an InfiniOps shape (each extent cast to Tensor::Size).
+infini::ops::Tensor::Shape ToShape(const std::vector<int64_t> &dims) {
+    infini::ops::Tensor::Shape shape(dims.size());
+    size_t axis = 0;
+    for (const int64_t extent : dims) {
+        shape[axis] = static_cast<infini::ops::Tensor::Size>(extent);
+        ++axis;
+    }
+    return shape;
+}
+
+// Converts caller-provided strides element by element into InfiniOps strides (each cast to Tensor::Stride).
+infini::ops::Tensor::Strides ToStrides(const std::vector<int64_t> &strides) {
+    infini::ops::Tensor::Strides ops_strides(strides.size());
+    size_t axis = 0;
+    for (const int64_t step : strides) {
+        ops_strides[axis] = static_cast<infini::ops::Tensor::Stride>(step);
+        ++axis;
+    }
+    return ops_strides;
+}
+} // namespace
+
+// Describes `tensor` as an InfiniOps tensor: same data pointer, shape from Dims(), dtype and device mapped
+// through ToOpsDataType / ToOpsDevice.
+//
+// Strides are always computed as row-major contiguous from Dims(); no other layout is represented here.
+// Fatal (CHECK) when `tensor` is null; fatal (LOG(FATAL)) when its dtype or device type is unsupported.
+infini::ops::Tensor ToOpsTensor(const std::shared_ptr<Tensor> &tensor) {
+    // Fail with a readable message instead of dereferencing a null pointer below.
+    CHECK(tensor != nullptr) << "ToOpsTensor requires a non-null tensor";
+    const auto &dims = tensor->Dims();
+    return {tensor->DataPtr(), ToShape(dims), ToOpsDataType(tensor->Dtype()), ToOpsDevice(tensor->GetDevice()),
+            ComputeContiguousStrides(dims)};
+}
+
+// Describes a raw buffer as an InfiniOps tensor when no strides are supplied: the strides are derived
+// from `dims` as row-major contiguous.
+infini::ops::Tensor ToOpsTensor(void *data, const std::vector<int64_t> &dims, DataType dtype, const Device &device) {
+    return {
+        data,
+        ToShape(dims),
+        ToOpsDataType(dtype),
+        ToOpsDevice(device),
+        ComputeContiguousStrides(dims),
+    };
+}
+
+// Describes a raw buffer as an InfiniOps tensor using the caller's `strides`, which are forwarded
+// unchanged apart from the cast to Tensor::Stride. Fatal (CHECK) unless there is one stride per dimension.
+infini::ops::Tensor ToOpsTensor(void *data, const std::vector<int64_t> &dims, DataType dtype, const Device &device,
+                                const std::vector<int64_t> &strides) {
+    CHECK_EQ(dims.size(), strides.size());
+    return {
+        data,
+        ToShape(dims),
+        ToOpsDataType(dtype),
+        ToOpsDevice(device),
+        ToStrides(strides),
+    };
+}
+
+} // namespace infini_train::kernel_provider::infiniops
diff --git a/infini_train/src/core/kernel_provider/infiniops/cuda/handle.cc b/infini_train/src/core/kernel_provider/infiniops/cuda/handle.cc
@@ -0,0 +1,24 @@
+#include "glog/logging.h"
+
+#include "infini_train/include/core/kernel_provider/infiniops/adapter.h"
+#include "infini_train/src/core/runtime/cuda/cuda_runtime_common.h"
+
+namespace infini_train::kernel_provider::infiniops {
+namespace {
+// CUDA handle factory: `stream` must be a core::cuda::CudaStream (fatal otherwise); the device argument is unused.
+infini::ops::Handle MakeCudaHandle(const Device &, core::Stream *stream) {
+    auto *cuda_stream = dynamic_cast<core::cuda::CudaStream *>(stream);
+    CHECK_NOTNULL(cuda_stream);
+    // Hand the value returned by cuda_stream() to the handle as an opaque void pointer.
+    infini::ops::Handle handle;
+    handle.set_stream(static_cast<void *>(cuda_stream->cuda_stream()));
+    return handle;
+}
+// Evaluated during static initialization: registers MakeCudaHandle as the kCUDA handle factory.
+const bool kCudaHandleFactoryRegistered = []() {
+    RegisterHandleFactory(Device::DeviceType::kCUDA, MakeCudaHandle);
+    return true;
+}();
+
+} // namespace
+} // namespace infini_train::kernel_provider::infiniops
diff --git a/infini_train/src/core/kernel_provider/infiniops/elementwise.cc b/infini_train/src/core/kernel_provider/infiniops/elementwise.cc
@@ -0,0 +1,71 @@
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "infini_train/include/core/kernel_provider/infiniops/adapter.h"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include <infini/ops.h>
+
+namespace infini_train::kernel_provider::infiniops {
+namespace {
+
+// Computes strides that let a tensor of shape `dims` be addressed as if it had shape `out_dims`.
+//
+// `dims` is right-aligned against `out_dims`. Leading output dimensions that `dims` lacks, and every
+// dimension of extent 1, get stride 0; all other dimensions keep their row-major stride within `dims`.
+// Fatal (CHECK) if `dims` has more dimensions than `out_dims`, or if an extent is neither 1 nor equal
+// to the corresponding output extent.
+std::vector<int64_t> ComputeBroadcastStrides(const std::vector<int64_t> &dims, const std::vector<int64_t> &out_dims) {
+    CHECK_LE(dims.size(), out_dims.size());
+
+    const size_t rank = dims.size();
+    const size_t pad = out_dims.size() - rank;
+
+    // Row-major strides of the un-broadcast shape, filled from the innermost dimension outwards.
+    std::vector<int64_t> strides(rank);
+    for (size_t i = rank; i-- > 0;) { strides[i] = (i + 1 == rank) ? 1 : strides[i + 1] * dims[i + 1]; }
+
+    // Everything starts at 0, so padded leading dimensions and extent-1 dimensions need no further work.
+    std::vector<int64_t> out_strides(out_dims.size(), 0);
+    for (size_t i = 0; i < rank; ++i) {
+        const int64_t dim = dims[i];
+        const int64_t out_dim = out_dims[pad + i];
+        CHECK(dim == out_dim || dim == 1) << "InfiniOps Add broadcast shape mismatch";
+        if (dim != 1) {
+            out_strides[pad + i] = strides[i];
+        }
+    }
+    return out_strides;
+}
+
+// Describes `tensor`'s buffer as an InfiniOps tensor of shape `out_dims` and type `dtype`, using the
+// broadcast strides computed from the tensor's own dims. The caller supplies `dtype`; it is not read
+// from `tensor`.
+infini::ops::Tensor ToBroadcastOpsTensor(const std::shared_ptr<Tensor> &tensor, const std::vector<int64_t> &out_dims,
+                                         DataType dtype) {
+    return ToOpsTensor(tensor->DataPtr(), out_dims, dtype, tensor->GetDevice(),
+                       ComputeBroadcastStrides(tensor->Dims(), out_dims));
+}
+
+} // namespace
+
+// Element-wise addition of `a` and `b` carried out by infini::ops::Add.
+//
+// Inputs whose dtype differs from PromoteDataTypes(a, b) are converted first. The result has the dims
+// and device of `a` and the promoted dtype; both inputs are described to InfiniOps with broadcast
+// strides against that output shape.
+//
+// Fatal (CHECK) when `b` is empty, has more elements than `a`, does not evenly divide `a`'s element
+// count, or has a shape that ComputeBroadcastStrides cannot broadcast to `a`'s shape.
+std::shared_ptr<Tensor> AddForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b) {
+    // Reject an empty `b` explicitly: the divisibility check below would otherwise compute `x % 0`.
+    CHECK_GT(b->NumElements(), 0) << "InfiniOps Add requires a non-empty second operand";
+    CHECK_GE(a->NumElements(), b->NumElements());
+    CHECK_EQ(a->NumElements() % b->NumElements(), 0);
+
+    auto a_dtype = a->Dtype();
+    auto b_dtype = b->Dtype();
+    DataType promoted_type = PromoteDataTypes(a_dtype, b_dtype);
+
+    // Convert only the operands that are not already of the promoted type.
+    auto a_promoted = a_dtype == promoted_type ? a : std::make_shared<Tensor>(a->To(promoted_type));
+    auto b_promoted = b_dtype == promoted_type ? b : std::make_shared<Tensor>(b->To(promoted_type));
+
+    auto output = std::make_shared<Tensor>(a->Dims(), promoted_type, a->GetDevice());
+
+    // The guard lives until the function returns (presumably selecting a's device - see device_guard.h).
+    core::DeviceGuard guard(a->GetDevice());
+    auto handle = GetHandle(a->GetDevice());
+    auto a_ops = ToBroadcastOpsTensor(a_promoted, output->Dims(), promoted_type);
+    auto b_ops = ToBroadcastOpsTensor(b_promoted, output->Dims(), promoted_type);
+    auto c_ops = ToOpsTensor(output);
+
+    infini::ops::Add::Call(handle, {}, a_ops, b_ops, c_ops);
+    return output;
+}
+
+} // namespace infini_train::kernel_provider::infiniops
+// Registers the InfiniOps-backed AddForward under kCUDA (REGISTER_KERNEL presumably comes from dispatcher.h).
+REGISTER_KERNEL(infini_train::Device::DeviceType::kCUDA, AddForward,
+                infini_train::kernel_provider::infiniops::AddForward)