diff --git a/.gitmodules b/.gitmodules
index bca919479..1c51c96f9 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -5,3 +5,7 @@
path = third_party/nlohmann_json
url = https://github.com/nlohmann/json.git
branch = master
+[submodule "submodules/InfiniOps"]
+ path = submodules/InfiniOps
+ url = https://github.com/InfiniTensor/InfiniOps.git
+ branch = master
diff --git a/src/infiniop/ops/gemm/operator.cc b/src/infiniop/ops/gemm/operator.cc
index 81d9cb066..b4efdf24d 100644
--- a/src/infiniop/ops/gemm/operator.cc
+++ b/src/infiniop/ops/gemm/operator.cc
@@ -1,142 +1,151 @@
-#include "../../operator.h"
#include "../../handle.h"
+#include "../../operator.h"
+#include "../../tensor.h"
#include "infiniop/ops/gemm.h"
-#ifdef ENABLE_CPU_API
-#include "cpu/gemm_cpu.h"
-#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
-#include "nvidia/gemm_nvidia.cuh"
-#endif
-#ifdef ENABLE_CAMBRICON_API
-#include "bang/gemm_bang.h"
-#endif
-#ifdef ENABLE_ASCEND_API
-#include "ascend/gemm_ascend.h"
-#endif
-#ifdef ENABLE_METAX_API
-#include "metax/gemm_metax.h"
-#endif
-#ifdef ENABLE_MOORE_API
-#include "moore/gemm_moore.h"
-#endif
-#ifdef ENABLE_KUNLUN_API
-#include "kunlun/gemm_kunlun.h"
-#endif
-#ifdef ENABLE_QY_API
-#include "qy/gemm_qy.cuh"
-#endif
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace {
+// Translates an infiniDtype_t into the matching infini::ops::DataType; std::nullopt means no mapping is listed.
+std::optional toInfiniOpsDtype(infiniDtype_t dtype) {
+ switch (dtype) {
+ case INFINI_DTYPE_I8:
+ return infini::ops::DataType::kInt8;
+ case INFINI_DTYPE_I16:
+ return infini::ops::DataType::kInt16;
+ case INFINI_DTYPE_I32:
+ return infini::ops::DataType::kInt32;
+ case INFINI_DTYPE_I64:
+ return infini::ops::DataType::kInt64;
+ case INFINI_DTYPE_U8:
+ return infini::ops::DataType::kUInt8;
+ case INFINI_DTYPE_U16:
+ return infini::ops::DataType::kUInt16;
+ case INFINI_DTYPE_U32:
+ return infini::ops::DataType::kUInt32;
+ case INFINI_DTYPE_U64:
+ return infini::ops::DataType::kUInt64;
+ case INFINI_DTYPE_F16:
+ return infini::ops::DataType::kFloat16;
+ case INFINI_DTYPE_BF16:
+ return infini::ops::DataType::kBFloat16;
+ case INFINI_DTYPE_F32:
+ return infini::ops::DataType::kFloat32;
+ case INFINI_DTYPE_F64:
+ return infini::ops::DataType::kFloat64;
+ default:
+ return std::nullopt; // any dtype not listed above; the caller turns this into an error status
+ }
+}
+// Translates an infiniDevice_t into the matching infini::ops::Device::Type; std::nullopt means no mapping is listed.
+std::optional toInfiniOpsDevice(infiniDevice_t device) {
+ switch (device) {
+ case INFINI_DEVICE_CPU:
+ return infini::ops::Device::Type::kCpu;
+ case INFINI_DEVICE_NVIDIA:
+ return infini::ops::Device::Type::kNvidia;
+ case INFINI_DEVICE_CAMBRICON:
+ return infini::ops::Device::Type::kCambricon;
+ case INFINI_DEVICE_ASCEND:
+ return infini::ops::Device::Type::kAscend;
+ case INFINI_DEVICE_METAX:
+ return infini::ops::Device::Type::kMetax;
+ case INFINI_DEVICE_MOORE:
+ return infini::ops::Device::Type::kMoore;
+ case INFINI_DEVICE_ILUVATAR:
+ return infini::ops::Device::Type::kIluvatar;
+ case INFINI_DEVICE_KUNLUN:
+ return infini::ops::Device::Type::kKunlun;
+ case INFINI_DEVICE_HYGON:
+ return infini::ops::Device::Type::kHygon;
+ case INFINI_DEVICE_QY:
+ return infini::ops::Device::Type::kQy;
+ default: // NOTE(review): INFINI_DEVICE_ALI was handled by the dispatch this patch removes but has no case here - confirm intended
+ return std::nullopt;
+ }
+}
+// Per-operand metadata held by value: shape, strides and the InfiniOps dtype.
+struct TensorMeta {
+ std::vector shape;
+ std::vector strides;
+ infini::ops::DataType dtype;
+};
+// Builds a TensorMeta from desc->shape(), desc->strides() and an already-converted dtype; desc is dereferenced unchecked.
+TensorMeta makeMeta(infiniopTensorDescriptor_t desc, infini::ops::DataType dtype) {
+ return TensorMeta{desc->shape(), desc->strides(), dtype};
+}
+// Gemm descriptor backed by InfiniOps: stores metadata for operands c, a and b plus the device identity of the handle.
+struct InfiniOpsGemmDescriptor : InfiniopDescriptor {
+ TensorMeta c;
+ TensorMeta a;
+ TensorMeta b;
+ // Captures metadata for c/a/b from their descriptors and copies device/device_id out of the handle.
+ InfiniOpsGemmDescriptor(const InfiniopHandle *handle,
+ infiniopTensorDescriptor_t c_desc,
+ infiniopTensorDescriptor_t a_desc,
+ infiniopTensorDescriptor_t b_desc,
+ infini::ops::DataType c_dtype,
+ infini::ops::DataType a_dtype,
+ infini::ops::DataType b_dtype)
+ : c(makeMeta(c_desc, c_dtype)), a(makeMeta(a_desc, a_dtype)), b(makeMeta(b_desc, b_dtype)) {
+ device_type = handle->device;
+ device_id = handle->device_id;
+ }
+ // Wraps a raw data pointer in an infini::ops::Tensor described by meta; constness of data is cast away.
+ infini::ops::Tensor tensor(const TensorMeta &meta, const void *data) const {
+ auto dev = toInfiniOpsDevice(device_type);
+ if (!dev.has_value()) { // no device mapping: the tensor is built without device and without strides
+ return infini::ops::Tensor(const_cast(data), meta.shape, meta.dtype); // NOTE(review): infiniopCreateGemmDescriptor rejects unmapped devices, so this looks unreachable from there - confirm
+ }
+ return infini::ops::Tensor(
+ const_cast(data),
+ meta.shape,
+ meta.dtype,
+ infini::ops::Device(*dev, device_id),
+ meta.strides);
+ }
+};
+
+} // namespace
+// Creates a Gemm descriptor: checks the handle's device and the three operand dtypes, then allocates an InfiniOpsGemmDescriptor.
__INFINI_C infiniStatus_t infiniopCreateGemmDescriptor(
infiniopHandle_t handle,
infiniopGemmDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
-
-#define CREATE(CASE, NAMESPACE) \
- case CASE: \
- return op::gemm::NAMESPACE::Descriptor::create( \
- handle, \
- reinterpret_cast(desc_ptr), \
- c_desc, \
- a_desc, \
- b_desc)
-
- switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
- CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
- CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
- CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_ALI_API
- CREATE(INFINI_DEVICE_ALI, nvidia);
-#endif
-#ifdef ENABLE_QY_API
- CREATE(INFINI_DEVICE_QY, qy);
-#endif
-#ifdef ENABLE_HYGON_API
- CREATE(INFINI_DEVICE_HYGON, nvidia);
-#endif
-#ifdef ENABLE_CAMBRICON_API
- CREATE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_ASCEND_API
- CREATE(INFINI_DEVICE_ASCEND, ascend);
-#endif
-#ifdef ENABLE_METAX_API
- CREATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_MOORE_API
- CREATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-#ifdef ENABLE_KUNLUN_API
- CREATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-
- default:
+ if (!toInfiniOpsDevice(handle->device).has_value()) { // device must have an InfiniOps mapping; this check is not gated on any ENABLE_*_API macro
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
-#undef CREATE
+ auto c_dtype = toInfiniOpsDtype(c_desc->dtype()); // NOTE(review): handle and the three descriptors are dereferenced without null checks
+ auto a_dtype = toInfiniOpsDtype(a_desc->dtype());
+ auto b_dtype = toInfiniOpsDtype(b_desc->dtype());
+ if (!c_dtype.has_value() || !a_dtype.has_value() || !b_dtype.has_value()) { // every operand dtype needs an InfiniOps equivalent
+ return INFINI_STATUS_BAD_TENSOR_DTYPE;
+ }
+
+ *desc_ptr = new InfiniOpsGemmDescriptor(handle, c_desc, a_desc, b_desc, *c_dtype, *a_dtype, *b_dtype); // heap-allocated; infiniopDestroyGemmDescriptor deletes it
+ return INFINI_STATUS_SUCCESS;
}
-__INFINI_C infiniStatus_t
-infiniopGetGemmWorkspaceSize(
- infiniopGemmDescriptor_t desc,
+__INFINI_C infiniStatus_t infiniopGetGemmWorkspaceSize(
+ infiniopGemmDescriptor_t, // descriptor is unnamed and unused: the reported size does not depend on it
size_t *size) {
-
-#define GET(CASE, NAMESPACE) \
- case CASE: \
- *size = reinterpret_cast(desc)->workspaceSize(); \
- return INFINI_STATUS_SUCCESS
-
- switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
- GET(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
- GET(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
- GET(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_ALI_API
- GET(INFINI_DEVICE_ALI, nvidia);
-#endif
-#ifdef ENABLE_QY_API
- GET(INFINI_DEVICE_QY, qy);
-#endif
-#ifdef ENABLE_HYGON_API
- GET(INFINI_DEVICE_HYGON, nvidia);
-#endif
-#ifdef ENABLE_CAMBRICON_API
- GET(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_ASCEND_API
- GET(INFINI_DEVICE_ASCEND, ascend);
-#endif
-#ifdef ENABLE_METAX_API
- GET(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_MOORE_API
- GET(INFINI_DEVICE_MOORE, moore);
-#endif
-#ifdef ENABLE_KUNLUN_API
- GET(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-
- default:
- return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
- }
-
-#undef GET
+ *size = 0; // always reports 0. NOTE(review): infiniopGemm still forwards the caller's workspace to InfiniOps - confirm no implementation needs scratch space
+ return INFINI_STATUS_SUCCESS;
}
__INFINI_C infiniStatus_t infiniopGemm(
@@ -148,105 +157,30 @@ __INFINI_C infiniStatus_t infiniopGemm(
float alpha,
float beta,
void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE) \
- case CASE: \
- return reinterpret_cast(desc) \
- ->calculate(workspace, workspace_size, \
- c, beta, \
- a, b, alpha, \
- stream)
-
- switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
- CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
- CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
- CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_ALI_API
- CALCULATE(INFINI_DEVICE_ALI, nvidia);
-#endif
-#ifdef ENABLE_QY_API
- CALCULATE(INFINI_DEVICE_QY, qy);
-#endif
-#ifdef ENABLE_HYGON_API
- CALCULATE(INFINI_DEVICE_HYGON, nvidia);
-#endif
-#ifdef ENABLE_CAMBRICON_API
- CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_ASCEND_API
- CALCULATE(INFINI_DEVICE_ASCEND, ascend);
-#endif
-#ifdef ENABLE_METAX_API
- CALCULATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_MOORE_API
- CALCULATE(INFINI_DEVICE_MOORE, moore);
-#endif
-#ifdef ENABLE_KUNLUN_API
- CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-
- default:
- return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
- }
-
-#undef CALCULATE
+ auto gemm_desc = reinterpret_cast(desc); // descriptors are allocated as InfiniOpsGemmDescriptor by infiniopCreateGemmDescriptor
+ // Per-call InfiniOps handle carrying the caller's stream and workspace.
+ infini::ops::Handle handle;
+ handle.set_stream(stream);
+ handle.set_workspace(workspace);
+ handle.set_workspace_size_in_bytes(workspace_size);
+
+ infini::ops::Config config;
+ config.set_implementation_index(2); // NOTE(review): hard-coded implementation index; what index 2 selects is not visible here - confirm
+ // Arguments are passed as: a, b, alpha, beta, two empty optionals, then c.
+ infini::ops::Operator::Call(
+ handle,
+ config,
+ gemm_desc->tensor(gemm_desc->a, a),
+ gemm_desc->tensor(gemm_desc->b, b),
+ std::optional(alpha),
+ std::optional(beta),
+ std::optional{},
+ std::optional{},
+ gemm_desc->tensor(gemm_desc->c, c));
+ return INFINI_STATUS_SUCCESS; // NOTE(review): nothing from Call is inspected, so success is reported unconditionally - confirm how failures surface
}
-__INFINI_C infiniStatus_t
-infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE) \
- case CASE: \
- delete reinterpret_cast(desc); \
- return INFINI_STATUS_SUCCESS;
-
- switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
- DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
- DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
- DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_ALI_API
- DELETE(INFINI_DEVICE_ALI, nvidia);
-#endif
-#ifdef ENABLE_QY_API
- DELETE(INFINI_DEVICE_QY, qy);
-#endif
-#ifdef ENABLE_HYGON_API
- DELETE(INFINI_DEVICE_HYGON, nvidia);
-#endif
-#ifdef ENABLE_CAMBRICON_API
- DELETE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_ASCEND_API
- DELETE(INFINI_DEVICE_ASCEND, ascend);
-#endif
-#ifdef ENABLE_METAX_API
- DELETE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_MOORE_API
- DELETE(INFINI_DEVICE_MOORE, moore);
-#endif
-#ifdef ENABLE_KUNLUN_API
- DELETE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-
- default:
- return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
- }
-
-#undef DELETE
+__INFINI_C infiniStatus_t infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) {
+ delete reinterpret_cast(desc); // one cast for every device instead of a per-device switch; desc->device_type is no longer read
+ return INFINI_STATUS_SUCCESS;
}
diff --git a/submodules/InfiniOps b/submodules/InfiniOps
new file mode 160000
index 000000000..9444f9c3d
--- /dev/null
+++ b/submodules/InfiniOps
@@ -0,0 +1 @@
+Subproject commit 9444f9c3d2b98084fc150252531239137ad2519b
diff --git a/xmake.lua b/xmake.lua
index ccae79cd2..6922763d0 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -55,8 +55,15 @@ option_end()
if has_config("nv-gpu") then
add_defines("ENABLE_NVIDIA_API")
includes("xmake/nvidia.lua")
+
+    -- Expose CUDA toolkit headers when a toolkit root is given via CUDA_HOME, CUDA_PATH or CUDA_ROOT (checked in that order).
+    local cuda_home = os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH") or os.getenv("CUDA_ROOT")
+    if cuda_home then
+        add_includedirs(cuda_home .. "/include", cuda_home .. "/targets/x86_64-linux/include")
+    end
end
+
option("cudnn")
set_default(true)
set_showmenu(true)
@@ -254,6 +261,20 @@ if has_config("graph") then
end
+-- InfiniOps: operator library checked out under submodules/InfiniOps.
+-- The "infiniops" option names its build directory; when the option is unset
+-- or empty, the "build" directory inside the submodule is used instead.
+option("infiniops")
+    set_description("Path to InfiniOps build directory")
+    set_showmenu(true)
+    set_default(nil)
+option_end()
+
+local INFINIOPS_ROOT = path.join(os.projectdir(), "submodules", "InfiniOps")
+local infiniops_cfg = get_config("infiniops")
+local INFINIOPS_BUILD_ROOT = (infiniops_cfg and infiniops_cfg ~= "") and infiniops_cfg or path.join(INFINIOPS_ROOT, "build")
+local INFINIOPS_LIBDIR = path.join(INFINIOPS_BUILD_ROOT, "src")
+
-- InfiniCCL
option("ccl")
set_default(false)
@@ -393,6 +414,10 @@ target("infiniop")
add_deps("infiniop-hygon")
end
set_languages("cxx17")
+    add_includedirs(path.join(INFINIOPS_ROOT, "src"), path.join(INFINIOPS_ROOT, "include"), {public = false}) -- InfiniOps headers, private to this target
+    add_linkdirs(INFINIOPS_LIBDIR) -- link-time search directory for the InfiniOps library
+    add_rpathdirs(INFINIOPS_LIBDIR) -- run-time search directory embedded in the built binary
+    add_links("infiniops") -- link by library name instead of a hard-coded "libinfiniops.so" file path
add_files("src/infiniop/devices/handle.cc")
add_files("src/infiniop/ops/*/operator.cc", "src/infiniop/ops/*/*/operator.cc")
add_files("src/infiniop/*.cc")
diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua
index f0d273d77..3ffd46951 100644
--- a/xmake/nvidia.lua
+++ b/xmake/nvidia.lua
@@ -3,6 +3,12 @@ if CUDNN_ROOT ~= nil then
add_includedirs(CUDNN_ROOT .. "/include")
end
+-- CUDA toolkit headers: root taken from CUDA_HOME, then CUDA_PATH, then CUDA_ROOT; nothing is added when none is set.
+local CUDA_ROOT = os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH") or os.getenv("CUDA_ROOT")
+if CUDA_ROOT then
+    add_includedirs(CUDA_ROOT .. "/include", CUDA_ROOT .. "/targets/x86_64-linux/include")
+end
+
local CUTLASS_ROOT = os.getenv("CUTLASS_ROOT") or os.getenv("CUTLASS_HOME") or os.getenv("CUTLASS_PATH")
local FLASH_ATTN_ROOT = get_config("flash-attn")