diff --git a/.gitmodules b/.gitmodules index bca919479..1c51c96f9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,3 +5,7 @@ path = third_party/nlohmann_json url = https://github.com/nlohmann/json.git branch = master +[submodule "submodules/InfiniOps"] + path = submodules/InfiniOps + url = https://github.com/InfiniTensor/InfiniOps.git + branch = master diff --git a/src/infiniop/ops/gemm/operator.cc b/src/infiniop/ops/gemm/operator.cc index 81d9cb066..b4efdf24d 100644 --- a/src/infiniop/ops/gemm/operator.cc +++ b/src/infiniop/ops/gemm/operator.cc @@ -1,142 +1,151 @@ -#include "../../operator.h" #include "../../handle.h" +#include "../../operator.h" +#include "../../tensor.h" #include "infiniop/ops/gemm.h" -#ifdef ENABLE_CPU_API -#include "cpu/gemm_cpu.h" -#endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API) -#include "nvidia/gemm_nvidia.cuh" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/gemm_bang.h" -#endif -#ifdef ENABLE_ASCEND_API -#include "ascend/gemm_ascend.h" -#endif -#ifdef ENABLE_METAX_API -#include "metax/gemm_metax.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/gemm_moore.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/gemm_kunlun.h" -#endif -#ifdef ENABLE_QY_API -#include "qy/gemm_qy.cuh" -#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace { + +std::optional toInfiniOpsDtype(infiniDtype_t dtype) { + switch (dtype) { + case INFINI_DTYPE_I8: + return infini::ops::DataType::kInt8; + case INFINI_DTYPE_I16: + return infini::ops::DataType::kInt16; + case INFINI_DTYPE_I32: + return infini::ops::DataType::kInt32; + case INFINI_DTYPE_I64: + return infini::ops::DataType::kInt64; + case INFINI_DTYPE_U8: + return infini::ops::DataType::kUInt8; + case INFINI_DTYPE_U16: + return infini::ops::DataType::kUInt16; + case INFINI_DTYPE_U32: + return infini::ops::DataType::kUInt32; + case INFINI_DTYPE_U64: + return infini::ops::DataType::kUInt64; + case INFINI_DTYPE_F16: + return infini::ops::DataType::kFloat16; + case INFINI_DTYPE_BF16: + return infini::ops::DataType::kBFloat16; + case INFINI_DTYPE_F32: + return infini::ops::DataType::kFloat32; + case INFINI_DTYPE_F64: + return infini::ops::DataType::kFloat64; + default: + return std::nullopt; + } +} + +std::optional toInfiniOpsDevice(infiniDevice_t device) { + switch (device) { + case INFINI_DEVICE_CPU: + return infini::ops::Device::Type::kCpu; + case INFINI_DEVICE_NVIDIA: + return infini::ops::Device::Type::kNvidia; + case INFINI_DEVICE_CAMBRICON: + return infini::ops::Device::Type::kCambricon; + case INFINI_DEVICE_ASCEND: + return infini::ops::Device::Type::kAscend; + case INFINI_DEVICE_METAX: + return infini::ops::Device::Type::kMetax; + case INFINI_DEVICE_MOORE: + return infini::ops::Device::Type::kMoore; + case INFINI_DEVICE_ILUVATAR: + return infini::ops::Device::Type::kIluvatar; + case INFINI_DEVICE_KUNLUN: + return infini::ops::Device::Type::kKunlun; + case INFINI_DEVICE_HYGON: + return infini::ops::Device::Type::kHygon; + case INFINI_DEVICE_QY: + return infini::ops::Device::Type::kQy; + default: + return std::nullopt; + } +} + +struct TensorMeta { + std::vector shape; + std::vector strides; + infini::ops::DataType dtype; +}; + +TensorMeta makeMeta(infiniopTensorDescriptor_t desc, infini::ops::DataType dtype) { + return TensorMeta{desc->shape(), desc->strides(), dtype}; +} + +struct InfiniOpsGemmDescriptor : InfiniopDescriptor { + TensorMeta c; + TensorMeta a; + TensorMeta b; + + InfiniOpsGemmDescriptor(const InfiniopHandle *handle, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infini::ops::DataType c_dtype, + infini::ops::DataType a_dtype, + infini::ops::DataType b_dtype) + : c(makeMeta(c_desc, c_dtype)), a(makeMeta(a_desc, a_dtype)), b(makeMeta(b_desc, b_dtype)) { + device_type = handle->device; + device_id = handle->device_id; + } + + infini::ops::Tensor tensor(const TensorMeta &meta, const void *data) const { + auto dev = toInfiniOpsDevice(device_type); + if (!dev.has_value()) { + return infini::ops::Tensor(const_cast(data), meta.shape, meta.dtype); + } + return infini::ops::Tensor( + const_cast(data), + meta.shape, + meta.dtype, + infini::ops::Device(*dev, device_id), + meta.strides); + } +}; + +} // namespace + __INFINI_C infiniStatus_t infiniopCreateGemmDescriptor( infiniopHandle_t handle, infiniopGemmDescriptor_t *desc_ptr, infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::gemm::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - a_desc, \ - b_desc) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_ALI_API - CREATE(INFINI_DEVICE_ALI, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, qy); -#endif -#ifdef ENABLE_HYGON_API - CREATE(INFINI_DEVICE_HYGON, nvidia); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_ASCEND_API - CREATE(INFINI_DEVICE_ASCEND, ascend); -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif - - default: + if (!toInfiniOpsDevice(handle->device).has_value()) { return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -#undef CREATE + auto c_dtype = toInfiniOpsDtype(c_desc->dtype()); + auto a_dtype = toInfiniOpsDtype(a_desc->dtype()); + auto b_dtype = toInfiniOpsDtype(b_desc->dtype()); + if (!c_dtype.has_value() || !a_dtype.has_value() || !b_dtype.has_value()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + *desc_ptr = new InfiniOpsGemmDescriptor(handle, c_desc, a_desc, b_desc, *c_dtype, *a_dtype, *b_dtype); + return INFINI_STATUS_SUCCESS; } -__INFINI_C infiniStatus_t -infiniopGetGemmWorkspaceSize( - infiniopGemmDescriptor_t desc, +__INFINI_C infiniStatus_t infiniopGetGemmWorkspaceSize( + infiniopGemmDescriptor_t, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_ALI_API - GET(INFINI_DEVICE_ALI, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, qy); -#endif -#ifdef ENABLE_HYGON_API - GET(INFINI_DEVICE_HYGON, nvidia); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_ASCEND_API - GET(INFINI_DEVICE_ASCEND, ascend); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef GET + *size = 0; + return INFINI_STATUS_SUCCESS; } __INFINI_C infiniStatus_t infiniopGemm( @@ -148,105 +157,30 @@ __INFINI_C infiniStatus_t infiniopGemm( float alpha, float beta, void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, \ - c, beta, \ - a, b, alpha, \ - stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_ALI_API - CALCULATE(INFINI_DEVICE_ALI, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, qy); -#endif -#ifdef ENABLE_HYGON_API - CALCULATE(INFINI_DEVICE_HYGON, nvidia); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_ASCEND_API - CALCULATE(INFINI_DEVICE_ASCEND, ascend); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE + auto gemm_desc = reinterpret_cast(desc); + + infini::ops::Handle handle; + handle.set_stream(stream); + handle.set_workspace(workspace); + handle.set_workspace_size_in_bytes(workspace_size); + + infini::ops::Config config; + config.set_implementation_index(2); + + infini::ops::Operator::Call( + handle, + config, + gemm_desc->tensor(gemm_desc->a, a), + gemm_desc->tensor(gemm_desc->b, b), + std::optional(alpha), + std::optional(beta), + std::optional{}, + std::optional{}, + gemm_desc->tensor(gemm_desc->c, c)); + return INFINI_STATUS_SUCCESS; } -__INFINI_C infiniStatus_t -infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_ALI_API - DELETE(INFINI_DEVICE_ALI, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, qy); -#endif -#ifdef ENABLE_HYGON_API - DELETE(INFINI_DEVICE_HYGON, nvidia); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_ASCEND_API - DELETE(INFINI_DEVICE_ASCEND, ascend); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE +__INFINI_C infiniStatus_t infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) { + delete reinterpret_cast(desc); + return INFINI_STATUS_SUCCESS; } diff --git a/submodules/InfiniOps b/submodules/InfiniOps new file mode 160000 index 000000000..9444f9c3d --- /dev/null +++ b/submodules/InfiniOps @@ -0,0 +1 @@ +Subproject commit 9444f9c3d2b98084fc150252531239137ad2519b diff --git a/xmake.lua b/xmake.lua index ccae79cd2..6922763d0 100644 --- a/xmake.lua +++ b/xmake.lua @@ -55,8 +55,15 @@ option_end() if has_config("nv-gpu") then add_defines("ENABLE_NVIDIA_API") includes("xmake/nvidia.lua") + + local CUDA_ROOT = os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH") or os.getenv("CUDA_ROOT") + if CUDA_ROOT ~= nil then + add_includedirs(CUDA_ROOT .. "/include") + add_includedirs(CUDA_ROOT .. "/targets/x86_64-linux/include") + end end + option("cudnn") set_default(true) set_showmenu(true) @@ -254,6 +261,20 @@ if has_config("graph") then end +-- InfiniOps +option("infiniops") + set_default(nil) + set_showmenu(true) + set_description("Path to InfiniOps build directory") +option_end() + +local INFINIOPS_ROOT = path.join(os.projectdir(), "submodules", "InfiniOps") +local INFINIOPS_BUILD_ROOT = get_config("infiniops") +if not INFINIOPS_BUILD_ROOT or INFINIOPS_BUILD_ROOT == "" then + INFINIOPS_BUILD_ROOT = path.join(INFINIOPS_ROOT, "build") +end +local INFINIOPS_LIBDIR = path.join(INFINIOPS_BUILD_ROOT, "src") + -- InfiniCCL option("ccl") set_default(false) @@ -393,6 +414,10 @@ target("infiniop") add_deps("infiniop-hygon") end set_languages("cxx17") + add_includedirs(path.join(INFINIOPS_ROOT, "src"), {public = false}) + add_includedirs(path.join(INFINIOPS_ROOT, "include"), {public = false}) + add_rpathdirs(INFINIOPS_LIBDIR) + add_links(path.join(INFINIOPS_LIBDIR, "libinfiniops.so")) add_files("src/infiniop/devices/handle.cc") add_files("src/infiniop/ops/*/operator.cc", "src/infiniop/ops/*/*/operator.cc") add_files("src/infiniop/*.cc") diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua index f0d273d77..3ffd46951 100644 --- a/xmake/nvidia.lua +++ b/xmake/nvidia.lua @@ -3,6 +3,12 @@ if CUDNN_ROOT ~= nil then add_includedirs(CUDNN_ROOT .. "/include") end +local CUDA_ROOT = os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH") or os.getenv("CUDA_ROOT") +if CUDA_ROOT ~= nil then + add_includedirs(CUDA_ROOT .. "/include") + add_includedirs(CUDA_ROOT .. "/targets/x86_64-linux/include") +end + local CUTLASS_ROOT = os.getenv("CUTLASS_ROOT") or os.getenv("CUTLASS_HOME") or os.getenv("CUTLASS_PATH") local FLASH_ATTN_ROOT = get_config("flash-attn")