Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,6 @@
[submodule "third_party/googletest"]
path = third_party/googletest
url = git@github.com:google/googletest.git
[submodule "third_party/InfiniOps"]
path = third_party/InfiniOps
url = git@github.com:InfiniTensor/InfiniOps.git
36 changes: 36 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ option(USE_CUDA "Support NVIDIA CUDA" OFF)
# User-facing build switches (boolean cache options).
option(PROFILE_MODE "ENABLE PROFILE MODE" OFF)
option(USE_OMP "Use OpenMP as backend for Eigen" ON)
option(USE_NCCL "Build project for distributed running" ON)
# InfiniOps is opt-in; enabling it also requires USE_CUDA=ON (enforced further down in this file).
option(USE_INFINIOPS "Use InfiniOps as an optional kernel provider" OFF)
option(BUILD_TEST "Build InfiniTrain tests" OFF)

project(infini_train VERSION 0.5.0 LANGUAGES CXX)
Expand Down Expand Up @@ -51,6 +52,31 @@ include_directories(${PROJECT_SOURCE_DIR}/third_party/eigen)

include_directories(${PROJECT_SOURCE_DIR})

# Optional InfiniOps kernel provider, built in-tree from the third_party/InfiniOps submodule.
if(USE_INFINIOPS)
  # Only the NVIDIA backend is enabled below, so a CUDA build is mandatory.
  if(NOT USE_CUDA)
    message(FATAL_ERROR "USE_INFINIOPS=ON currently requires USE_CUDA=ON")
  endif()

  # Directory-scoped definition: it applies to the targets of this directory and to
  # subdirectories added afterwards (which includes the InfiniOps subproject itself).
  add_compile_definitions(USE_INFINIOPS=1)

  # Fail early with an actionable hint when the submodule has not been checked out.
  set(INFINIOPS_SOURCE_DIR "${PROJECT_SOURCE_DIR}/third_party/InfiniOps")
  if(NOT EXISTS "${INFINIOPS_SOURCE_DIR}/CMakeLists.txt")
    message(FATAL_ERROR
      "USE_INFINIOPS=ON requires InfiniOps under third_party/InfiniOps. "
      "Run: git submodule update --init third_party/InfiniOps")
  endif()

  # Backend selection for the subproject.
  # NOTE(review): FORCE overwrites any user-provided WITH_CPU / WITH_NVIDIA cache value on
  # every configure; confirm that is intended before relaxing it.
  set(WITH_CPU OFF CACHE BOOL "Enable InfiniOps CPU backend" FORCE)
  set(WITH_NVIDIA ON CACHE BOOL "Enable InfiniOps NVIDIA backend" FORCE)

  # Paths are quoted so they are always passed as a single argument, and the binary dir is
  # rooted at PROJECT_BINARY_DIR (matching the PROJECT_SOURCE_DIR usage in this file) so the
  # subproject does not land in the parent build tree when infini_train itself is embedded.
  add_subdirectory(
    "${INFINIOPS_SOURCE_DIR}"
    "${PROJECT_BINARY_DIR}/third_party/InfiniOps"
    EXCLUDE_FROM_ALL)

  if(NOT TARGET infiniops)
    message(FATAL_ERROR "InfiniOps third-party project did not define target `infiniops`")
  endif()
  # Provide a namespaced alias unless the subproject already exports one.
  if(NOT TARGET InfiniOps::infiniops)
    add_library(InfiniOps::infiniops ALIAS infiniops)
  endif()
endif()

# Expose PROFILE_MODE=1 to the compiler when profiling was requested.
if(PROFILE_MODE)
  add_compile_definitions(
    PROFILE_MODE=1
  )
endif()
Expand All @@ -62,6 +88,9 @@ endif()
# Framework core sources (*.cc), excluding cpu kernels (they are built separately)
file(GLOB_RECURSE SRC ${PROJECT_SOURCE_DIR}/infini_train/src/*.cc)
list(FILTER SRC EXCLUDE REGEX ".*kernels/cpu/.*")
# Sources under core/kernel_provider/infiniops are only compiled when USE_INFINIOPS is ON.
if(NOT USE_INFINIOPS)
  # `[.]` matches a literal dot. Writing `\.` inside a quoted CMake argument is an identity
  # escape that collapses to a bare `.`, which the regex engine then treats as "any character".
  list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/kernel_provider/infiniops/.*[.]cc$")
endif()
if(NOT USE_CUDA)
list(FILTER SRC EXCLUDE REGEX ".*runtime/cuda/.*")
list(FILTER SRC EXCLUDE REGEX ".*ccl/cuda/.*")
Expand Down Expand Up @@ -100,6 +129,10 @@ if(USE_CUDA)

# Only compile CUDA kernels / cuda sources here (your original used src/*.cu)
file(GLOB_RECURSE CUDA_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/*.cu)
# With InfiniOps enabled, leave the two registry translation units listed below out of the
# CUDA kernel library.
if(USE_INFINIOPS)
  list(FILTER CUDA_KERNELS EXCLUDE REGEX
    ".*infini_train/src/kernels/cuda/(common/gemm_registry|elementwise_add_registry)[.]cu$")
endif()

add_library(infini_train_cuda_kernels STATIC ${CUDA_KERNELS})
set_target_properties(infini_train_cuda_kernels PROPERTIES CUDA_ARCHITECTURES "75;80;90")
Expand All @@ -126,6 +159,9 @@ endif()
# ------------------------------------------------------------------------------

# Core framework library built from the filtered source list.
add_library(infini_train STATIC
  ${SRC}
)
# InfiniOps is a PUBLIC dependency, so consumers of infini_train inherit its usage requirements.
if(USE_INFINIOPS)
  target_link_libraries(infini_train
    PUBLIC
      InfiniOps::infiniops
  )
endif()
target_link_libraries(infini_train
PUBLIC
glog
Expand Down
42 changes: 42 additions & 0 deletions infini_train/include/core/kernel_provider/infiniops/adapter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#pragma once

#include <cstdint>
#include <memory>
#include <vector>

#include <handle.h>

#include "data_type.h"
#include "tensor.h"

#include "infini_train/include/datatype.h"
#include "infini_train/include/device.h"

namespace infini_train {
class Tensor;
} // namespace infini_train

namespace infini_train::core {
class Stream;
} // namespace infini_train::core

// Adapter layer translating InfiniTrain types (DataType, Device, Tensor, Stream) into their
// InfiniOps counterparts.
namespace infini_train::kernel_provider::infiniops {

// Maps an InfiniTrain DataType onto the matching infini::ops::DataType.
// Aborts (LOG(FATAL)) for dtypes that have no mapping.
infini::ops::DataType ToOpsDataType(DataType dtype);

// Maps an InfiniTrain Device (type + index) onto an infini::ops::Device.
// Aborts (LOG(FATAL)) for device types that have no mapping.
infini::ops::Device ToOpsDevice(const Device &device);

// Callback that builds an InfiniOps handle for `device` bound to `stream`.
using HandleFactory = infini::ops::Handle (*)(const Device &device, core::Stream *stream);

// Registers the handle factory for a device type. `factory` must be non-null and each device
// type may be registered only once; violations fail a CHECK.
void RegisterHandleFactory(Device::DeviceType type, HandleFactory factory);

// Builds a handle for `device` through the registered factory, using the stream returned by
// the device guard implementation for that device. Fails a CHECK when no factory is registered.
infini::ops::Handle GetHandle(const Device &device);

// Wraps an InfiniTrain tensor as an InfiniOps tensor view with contiguous (row-major) strides.
infini::ops::Tensor ToOpsTensor(const std::shared_ptr<Tensor> &tensor);

// Wraps a raw buffer as an InfiniOps tensor view with contiguous (row-major) strides derived from `dims`.
infini::ops::Tensor ToOpsTensor(void *data, const std::vector<int64_t> &dims, DataType dtype, const Device &device);

// Wraps a raw buffer as an InfiniOps tensor view with caller-provided strides.
// `strides` must have exactly one entry per dimension (checked).
infini::ops::Tensor ToOpsTensor(void *data, const std::vector<int64_t> &dims, DataType dtype, const Device &device,
const std::vector<int64_t> &strides);

} // namespace infini_train::kernel_provider::infiniops
116 changes: 116 additions & 0 deletions infini_train/src/core/kernel_provider/infiniops/adapter.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#include "infini_train/include/core/kernel_provider/infiniops/adapter.h"

#include <map>
#include <unordered_map>

#include "glog/logging.h"

#include "infini_train/include/core/runtime/device_guard.h"
#include "infini_train/include/tensor.h"

namespace infini_train::kernel_provider::infiniops {

namespace {

// Lookup table: InfiniTrain dtype -> InfiniOps dtype. Dtypes absent from this table are
// rejected by ToOpsDataType.
inline const std::unordered_map<DataType, infini::ops::DataType> kOpsDataTypeMap = {
    // Floating point
    {DataType::kFLOAT16, infini::ops::DataType::kFloat16},
    {DataType::kBFLOAT16, infini::ops::DataType::kBFloat16},
    {DataType::kFLOAT32, infini::ops::DataType::kFloat32},
    {DataType::kFLOAT64, infini::ops::DataType::kFloat64},
    // Signed integers
    {DataType::kINT8, infini::ops::DataType::kInt8},
    {DataType::kINT16, infini::ops::DataType::kInt16},
    {DataType::kINT32, infini::ops::DataType::kInt32},
    {DataType::kINT64, infini::ops::DataType::kInt64},
    // Unsigned integers
    {DataType::kUINT8, infini::ops::DataType::kUInt8},
    {DataType::kUINT16, infini::ops::DataType::kUInt16},
    {DataType::kUINT32, infini::ops::DataType::kUInt32},
    {DataType::kUINT64, infini::ops::DataType::kUInt64},
};

// Lookup table: InfiniTrain device type -> InfiniOps device type. Device types absent from
// this table are rejected by ToOpsDevice.
inline const std::unordered_map<Device::DeviceType, infini::ops::Device::Type> kOpsDeviceTypeMap = {
    {
        Device::DeviceType::kCUDA,
        infini::ops::Device::Type::kNvidia,
    },
    {
        Device::DeviceType::kCPU,
        infini::ops::Device::Type::kCpu,
    },
};

std::map<Device::DeviceType, HandleFactory> &HandleFactories() {
static std::map<Device::DeviceType, HandleFactory> factories;
return factories;
}

} // namespace

// Adds `factory` to the registry under `type`.
// Fails a CHECK when `factory` is null or when `type` already has a factory.
void RegisterHandleFactory(Device::DeviceType type, HandleFactory factory) {
    CHECK(factory != nullptr);

    auto &factories = HandleFactories();
    // Duplicate registration is treated as a programming error rather than silently overwritten.
    CHECK(!factories.contains(type)) << "InfiniOps handle factory already registered for device type "
                                     << static_cast<int>(type);

    factories.insert({type, factory});
}

// Creates an InfiniOps handle for `device`.
// The factory registered for the device type is invoked with the stream reported by the
// device guard implementation; a missing factory fails a CHECK.
infini::ops::Handle GetHandle(const Device &device) {
    auto &factories = HandleFactories();
    const auto device_type = device.type();

    auto it = factories.find(device_type);
    CHECK(it != factories.end()) << "InfiniOps handle factory is not registered for device type "
                                 << static_cast<int>(device_type);

    HandleFactory make_handle = it->second;
    auto *stream = core::GetDeviceGuardImpl(device_type)->GetStream(device);
    return make_handle(device, stream);
}

// Translates an InfiniTrain dtype to its InfiniOps equivalent via kOpsDataTypeMap.
// Logs a fatal error for dtypes missing from the table.
infini::ops::DataType ToOpsDataType(DataType dtype) {
    const auto entry = kOpsDataTypeMap.find(dtype);
    if (entry != kOpsDataTypeMap.end()) {
        return entry->second;
    }

    LOG(FATAL) << "Unsupported DataType for InfiniOps: " << static_cast<int>(dtype);
    // Marks this path as non-returning for the compiler, so no return value is needed here.
    __builtin_unreachable();
}

// Translates an InfiniTrain device to an InfiniOps device, keeping the device index.
// Logs a fatal error for device types missing from kOpsDeviceTypeMap.
infini::ops::Device ToOpsDevice(const Device &device) {
    const auto entry = kOpsDeviceTypeMap.find(device.type());
    if (entry != kOpsDeviceTypeMap.end()) {
        return {entry->second, device.index()};
    }

    LOG(FATAL) << "Unsupported DeviceType for InfiniOps: " << static_cast<int>(device.type());
    // Marks this path as non-returning for the compiler, so no return value is needed here.
    __builtin_unreachable();
}

namespace {
// Row-major (contiguous) strides, in elements, for a tensor of shape `dims`:
// the last axis has stride 1 and each earlier axis has the product of all later extents.
// A rank-0 shape yields an empty stride list.
infini::ops::Tensor::Strides ComputeContiguousStrides(const std::vector<int64_t> &dims) {
    const size_t rank = dims.size();
    infini::ops::Tensor::Strides strides(rank);
    if (rank == 0) {
        return strides;
    }

    strides.back() = 1;
    // Walk from the innermost axis outwards, deriving each stride from its right neighbour.
    for (size_t axis = rank - 1; axis > 0; --axis) {
        strides[axis - 1] = strides[axis] * static_cast<infini::ops::Tensor::Stride>(dims[axis]);
    }
    return strides;
}

// Converts InfiniTrain dims (int64_t) into an InfiniOps shape, casting each extent to Tensor::Size.
infini::ops::Tensor::Shape ToShape(const std::vector<int64_t> &dims) {
    infini::ops::Tensor::Shape shape(dims.size());
    size_t axis = 0;
    for (const int64_t extent : dims) {
        shape[axis] = static_cast<infini::ops::Tensor::Size>(extent);
        ++axis;
    }
    return shape;
}

// Converts int64_t strides into InfiniOps strides, casting each entry to Tensor::Stride.
infini::ops::Tensor::Strides ToStrides(const std::vector<int64_t> &strides) {
    infini::ops::Tensor::Strides ops_strides(strides.size());
    size_t axis = 0;
    for (const int64_t step : strides) {
        ops_strides[axis] = static_cast<infini::ops::Tensor::Stride>(step);
        ++axis;
    }
    return ops_strides;
}
} // namespace

// Wraps an InfiniTrain tensor as an InfiniOps tensor view over the same data pointer.
// Shape, dtype and device are taken from `tensor`; strides are always computed as contiguous
// row-major strides from the dims.
// NOTE(review): this ignores any non-contiguous layout the source tensor might have — confirm
// callers only pass contiguous tensors.
// Fails a CHECK when `tensor` is null instead of dereferencing a null pointer.
infini::ops::Tensor ToOpsTensor(const std::shared_ptr<Tensor> &tensor) {
    CHECK(tensor != nullptr);
    const auto &dims = tensor->Dims();
    return {tensor->DataPtr(), ToShape(dims), ToOpsDataType(tensor->Dtype()), ToOpsDevice(tensor->GetDevice()),
            ComputeContiguousStrides(dims)};
}

// Wraps a raw buffer as an InfiniOps tensor view, using contiguous row-major strides derived
// from `dims`.
infini::ops::Tensor ToOpsTensor(void *data, const std::vector<int64_t> &dims, DataType dtype, const Device &device) {
    return {
        data,
        ToShape(dims),
        ToOpsDataType(dtype),
        ToOpsDevice(device),
        ComputeContiguousStrides(dims),
    };
}

// Wraps a raw buffer as an InfiniOps tensor view with explicit, caller-provided strides.
// `strides` must contain exactly one entry per dimension in `dims`.
infini::ops::Tensor ToOpsTensor(void *data, const std::vector<int64_t> &dims, DataType dtype, const Device &device,
                                const std::vector<int64_t> &strides) {
    CHECK_EQ(dims.size(), strides.size());
    return {
        data,
        ToShape(dims),
        ToOpsDataType(dtype),
        ToOpsDevice(device),
        ToStrides(strides),
    };
}

} // namespace infini_train::kernel_provider::infiniops
24 changes: 24 additions & 0 deletions infini_train/src/core/kernel_provider/infiniops/cuda/handle.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#include "glog/logging.h"

#include "infini_train/include/core/kernel_provider/infiniops/adapter.h"
#include "infini_train/src/core/runtime/cuda/cuda_runtime_common.h"

namespace infini_train::kernel_provider::infiniops {
namespace {

// HandleFactory for CUDA devices: builds a handle whose stream is the native stream wrapped
// by `stream`. The device argument is unused. `stream` must be a core::cuda::CudaStream.
infini::ops::Handle MakeCudaHandle(const Device &, core::Stream *stream) {
    // dynamic_cast yields nullptr for any other Stream implementation; reject that loudly.
    auto *cuda_stream = dynamic_cast<core::cuda::CudaStream *>(stream);
    CHECK_NOTNULL(cuda_stream);

    void *raw_stream = static_cast<void *>(cuda_stream->cuda_stream());

    infini::ops::Handle handle;
    handle.set_stream(raw_stream);
    return handle;
}

// Registers the CUDA handle factory as a side effect of initializing this constant; the
// stored value itself is not otherwise meaningful.
const bool kCudaHandleFactoryRegistered = []() -> bool {
    constexpr auto kDeviceType = Device::DeviceType::kCUDA;
    RegisterHandleFactory(kDeviceType, MakeCudaHandle);
    return true;
}();

} // namespace
} // namespace infini_train::kernel_provider::infiniops
71 changes: 71 additions & 0 deletions infini_train/src/core/kernel_provider/infiniops/elementwise.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#include <memory>
#include <numeric>
#include <vector>

#include "glog/logging.h"

#include "infini_train/include/core/kernel_provider/infiniops/adapter.h"
#include "infini_train/include/core/runtime/device_guard.h"
#include "infini_train/include/dispatcher.h"
#include "infini_train/include/tensor.h"

#include <infini/ops.h>

namespace infini_train::kernel_provider::infiniops {
namespace {

// Strides (in elements) that let a tensor of shape `dims` be read as if it had shape `out_dims`.
//
// `dims` is right-aligned against `out_dims`. Leading axes that only exist in `out_dims`, and
// axes where `dims` is 1, get stride 0 so the same element is reused; all other axes keep the
// row-major stride of the original tensor. Each axis of `dims` must equal the matching output
// extent or be 1, and `dims` may not have more axes than `out_dims` (both checked).
std::vector<int64_t> ComputeBroadcastStrides(const std::vector<int64_t> &dims, const std::vector<int64_t> &out_dims) {
    CHECK_LE(dims.size(), out_dims.size());

    const size_t rank = dims.size();

    // Row-major strides of the tensor before broadcasting.
    std::vector<int64_t> dense(rank);
    if (rank > 0) {
        dense[rank - 1] = 1;
        for (size_t axis = rank - 1; axis > 0; --axis) { dense[axis - 1] = dense[axis] * dims[axis]; }
    }

    // Number of leading output axes with no counterpart in `dims`; they stay at stride 0.
    const size_t lead = out_dims.size() - rank;
    std::vector<int64_t> out_strides(out_dims.size(), 0);
    for (size_t axis = 0; axis < rank; ++axis) {
        const int64_t dim = dims[axis];
        const int64_t out_dim = out_dims[lead + axis];
        CHECK(dim == out_dim || dim == 1) << "InfiniOps Add broadcast shape mismatch";
        if (dim != 1) {
            out_strides[lead + axis] = dense[axis];
        }
    }
    return out_strides;
}

// Builds an InfiniOps view of `tensor` that presents it with shape `out_dims` and dtype
// `dtype`, using broadcast strides over the tensor's existing data pointer (no data is copied here).
infini::ops::Tensor ToBroadcastOpsTensor(const std::shared_ptr<Tensor> &tensor, const std::vector<int64_t> &out_dims,
                                         DataType dtype) {
    return ToOpsTensor(tensor->DataPtr(), out_dims, dtype, tensor->GetDevice(),
                       ComputeBroadcastStrides(tensor->Dims(), out_dims));
}

} // namespace

std::shared_ptr<Tensor> AddForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b) {
CHECK_GE(a->NumElements(), b->NumElements());
CHECK_EQ(a->NumElements() % b->NumElements(), 0);

auto a_dtype = a->Dtype();
auto b_dtype = b->Dtype();
DataType promoted_type = PromoteDataTypes(a_dtype, b_dtype);

auto a_promoted = a_dtype == promoted_type ? a : std::make_shared<Tensor>(a->To(promoted_type));
auto b_promoted = b_dtype == promoted_type ? b : std::make_shared<Tensor>(b->To(promoted_type));

auto output = std::make_shared<Tensor>(a->Dims(), promoted_type, a->GetDevice());

core::DeviceGuard guard(a->GetDevice());
auto handle = GetHandle(a->GetDevice());
auto a_ops = ToBroadcastOpsTensor(a_promoted, output->Dims(), promoted_type);
auto b_ops = ToBroadcastOpsTensor(b_promoted, output->Dims(), promoted_type);
auto c_ops = ToOpsTensor(output);

infini::ops::Add::Call(handle, {}, a_ops, b_ops, c_ops);
return output;
}

} // namespace infini_train::kernel_provider::infiniops

// Registers the InfiniOps-backed AddForward under the kernel name `AddForward` for CUDA devices
// (REGISTER_KERNEL is presumably provided by dispatcher.h — confirm).
REGISTER_KERNEL(infini_train::Device::DeviceType::kCUDA, AddForward,
infini_train::kernel_provider::infiniops::AddForward)
Loading
Loading