diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..2296f7d --- /dev/null +++ b/.clang-format @@ -0,0 +1,3 @@ +--- +BasedOnStyle: Google +... diff --git a/.gitignore b/.gitignore index d4fb281..37e0bb5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Generated files +build/ + # Prerequisites *.d diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..e92b202 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,197 @@ +cmake_minimum_required(VERSION 3.18) +project(InfiniRT LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Options for backends. +option(WITH_CPU "Enable CPU backend" OFF) +option(WITH_NVIDIA "Enable CUDA backend" OFF) +option(WITH_ILUVATAR "Enable Iluvatar GPU backend" OFF) +option(WITH_METAX "Enable MetaX backend" OFF) +option(WITH_CAMBRICON "Enable Cambricon backend" OFF) +option(WITH_MOORE "Enable Moore backend" OFF) +option(WITH_ASCEND "Enable Ascend backend" OFF) + +option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF) + +if(AUTO_DETECT_DEVICES) + message(STATUS "Auto-detecting available devices...") + + set(WITH_CPU ON) + + file(GLOB NVIDIA_DEV_FILES "/dev/nvidia*") + + if(NVIDIA_DEV_FILES) + set(WITH_NVIDIA ON) + message(STATUS "Auto-detected NVIDIA environment.") + endif() + + file(GLOB ILUVATAR_DEV_FILES "/dev/iluvatar*") + + if(ILUVATAR_DEV_FILES) + set(WITH_ILUVATAR ON) + message(STATUS "Auto-detected Iluvatar environment.") + endif() + + if(DEFINED ENV{MACA_PATH}) + set(WITH_METAX ON) + message(STATUS "Auto-detected MetaX environment from MACA_PATH") + else() + execute_process( + COMMAND sh -c "grep -h 9999 /sys/bus/pci/devices/*/vendor 2>/dev/null" + OUTPUT_VARIABLE _pci_vendor_output + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + + string(FIND "${_pci_vendor_output}" "9999" _found_pos) + + if(_found_pos GREATER -1) + set(WITH_METAX ON) + message(STATUS "Detected MetaX GPU from PCI vendor ID 0x9999") + else() + set(WITH_METAX OFF) + message(STATUS "No MetaX GPU detected") + endif() + endif() + + if(DEFINED ENV{NEUWARE_HOME}) + set(WITH_CAMBRICON ON) + message(STATUS "Auto-detected Cambricon environment.") + endif() + + if(DEFINED ENV{MUSA_ROOT} OR DEFINED ENV{MUSA_HOME} OR DEFINED ENV{MUSA_PATH}) + set(WITH_MOORE ON) + set(WITH_MOORE ON CACHE BOOL "Enable Moore backend" FORCE) + message(STATUS "Auto-detected Moore environment.") + else() + set(WITH_MOORE OFF) + set(WITH_MOORE OFF CACHE BOOL "Enable Moore backend" FORCE) + endif() + + if(DEFINED ENV{ASCEND_HOME_PATH} OR EXISTS "/dev/davinci0") + set(WITH_ASCEND ON) + message(STATUS "Auto-detected Ascend environment.") + endif() +endif() + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) + +# Only one CUDA-like GPU backend can be enabled at a time. +set(_gpu_backend_count 0) +foreach(_gpu_backend WITH_NVIDIA WITH_ILUVATAR WITH_METAX WITH_MOORE WITH_ASCEND) + if(${_gpu_backend}) + math(EXPR _gpu_backend_count "${_gpu_backend_count} + 1") + endif() +endforeach() + +if(_gpu_backend_count GREATER 1) + message(FATAL_ERROR "`WITH_NVIDIA`, `WITH_ILUVATAR`, `WITH_METAX`, `WITH_MOORE`, and `WITH_ASCEND` are mutually exclusive. Build one GPU backend at a time.") +endif() + +if(WITH_NVIDIA) + add_compile_definitions(WITH_NVIDIA=1) + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) +endif() + +# Iluvatar: CUDA-compatible device, uses `clang++` with `-x ivcore` (not `nvcc`). +if(WITH_ILUVATAR) + add_compile_definitions(WITH_ILUVATAR=1) + set(ILUVATAR_ARCH "ivcore20" CACHE STRING "Iluvatar GPU architecture") + find_program(CLANGXX NAMES clang++) + if(CLANGXX) + set(CMAKE_CUDA_COMPILER "${CLANGXX}" CACHE STRING "Iluvatar CUDA compiler (clang++)") + else() + set(CMAKE_CUDA_COMPILER "clang++" CACHE STRING "Iluvatar CUDA compiler (clang++)") + endif() + set(CMAKE_CUDA_FLAGS "-x ivcore -std=c++17 --cuda-gpu-arch=${ILUVATAR_ARCH} -fPIC -Wno-error=unused-variable -Wno-error=unused-private-field -Wno-unused-variable" CACHE STRING "Iluvatar CUDA flags") + set(CMAKE_CUDA_SEPARABLE_COMPILATION OFF CACHE BOOL "Disable RDC for Iluvatar") + message(STATUS "Iluvatar: CUDA compiler ${CMAKE_CUDA_COMPILER}, arch ${ILUVATAR_ARCH}") + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) +endif() + +if(WITH_METAX) + add_compile_definitions(WITH_METAX=1) + + # Normally can be found at: `/opt/maca/`. + set(MACA_PATH $ENV{MACA_PATH}) + set(CMAKE_C_COMPILER ${CMAKE_CURRENT_SOURCE_DIR}/scripts/mxcc_wrapper.sh) + set(CMAKE_CXX_COMPILER ${CMAKE_CURRENT_SOURCE_DIR}/scripts/mxcc_wrapper.sh) + + include_directories("${MACA_PATH}/include") + link_directories("${MACA_PATH}/lib") + + # Libraries: mcruntime / mcdnn / mcblas. + find_library(MACA_RUNTIME_LIB NAMES mcruntime HINTS "${MACA_PATH}/lib" REQUIRED) +endif() + +if(WITH_MOORE) + add_compile_definitions(WITH_MOORE=1) + + set(MUSA_ROOT "") + foreach(_musa_env MUSA_ROOT MUSA_HOME MUSA_PATH) + if(NOT MUSA_ROOT AND DEFINED ENV{${_musa_env}} AND NOT "$ENV{${_musa_env}}" STREQUAL "") + set(MUSA_ROOT "$ENV{${_musa_env}}") + endif() + endforeach() + + if(NOT MUSA_ROOT AND EXISTS "/usr/local/musa") + set(MUSA_ROOT "/usr/local/musa") + endif() + + if(NOT MUSA_ROOT) + message(FATAL_ERROR "`WITH_MOORE` is `ON` but `MUSA_ROOT`/`MUSA_HOME`/`MUSA_PATH` is not set and `/usr/local/musa` was not found.") + endif() + + if(NOT EXISTS "${MUSA_ROOT}/bin/mcc") + message(FATAL_ERROR "Could not find `mcc` under `${MUSA_ROOT}/bin`.") + endif() + + message(STATUS "Using Moore from `${MUSA_ROOT}`.") + + set(CMAKE_C_COMPILER ${CMAKE_CURRENT_SOURCE_DIR}/scripts/mcc_wrapper.sh) + set(CMAKE_CXX_COMPILER ${CMAKE_CURRENT_SOURCE_DIR}/scripts/mcc_wrapper.sh) + + include_directories("${MUSA_ROOT}/include") + link_directories("${MUSA_ROOT}/lib") + + find_library(MUSA_LIB NAMES musa HINTS "${MUSA_ROOT}/lib" REQUIRED) + find_library(MUSART_LIB NAMES musart HINTS "${MUSA_ROOT}/lib" REQUIRED) +endif() + +if(WITH_CAMBRICON) + add_compile_definitions(WITH_CAMBRICON=1) + set(NEUWARE_HOME $ENV{NEUWARE_HOME}) + + include_directories("${NEUWARE_HOME}/include") + link_directories("${NEUWARE_HOME}/lib") + link_directories("${NEUWARE_HOME}/lib64") + + # Libraries: `cnrt`. + find_library(CAMBRICON_RUNTIME_LIB NAMES cnrt HINTS "${NEUWARE_HOME}/lib64" REQUIRED) +endif() + +if(WITH_ASCEND) + add_compile_definitions(WITH_ASCEND=1) + if(NOT DEFINED ASCEND_HOME) + if(DEFINED ENV{ASCEND_HOME_PATH} AND NOT "$ENV{ASCEND_HOME_PATH}" STREQUAL "") + set(ASCEND_HOME "$ENV{ASCEND_HOME_PATH}" CACHE PATH "Ascend toolkit root") + else() + set(ASCEND_HOME "/usr/local/Ascend/ascend-toolkit/latest" CACHE PATH "Ascend toolkit root") + endif() + endif() + if(NOT EXISTS "${ASCEND_HOME}") + message(FATAL_ERROR "`WITH_ASCEND` is ON but `${ASCEND_HOME}` was not found. Set ASCEND_HOME_PATH.") + endif() + message(STATUS "Using Ascend from `${ASCEND_HOME}`.") +endif() + +# If all other platforms are not enabled, CPU is enabled by default. +if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_METAX AND NOT WITH_MOORE AND NOT WITH_CAMBRICON AND NOT WITH_ASCEND) + set(WITH_CPU ON CACHE BOOL "Enable CPU backend" FORCE) + add_compile_definitions(WITH_CPU=1) +endif() + +add_subdirectory(src) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..6ca3924 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,109 @@ +add_library(infinirt SHARED) + +include(GNUInstallDirs) + +file(GLOB BASE_SRCS CONFIGURE_DEPENDS "*.cc") +target_sources(infinirt PRIVATE ${BASE_SRCS}) + +if(WITH_CPU) + target_compile_definitions(infinirt PUBLIC WITH_CPU=1) + + find_package(OpenMP REQUIRED) + target_link_libraries(infinirt PRIVATE OpenMP::OpenMP_CXX) +endif() + +if(WITH_NVIDIA) + enable_language(CUDA) + + target_compile_definitions(infinirt PUBLIC WITH_NVIDIA=1) + + find_package(CUDAToolkit REQUIRED) + target_link_libraries(infinirt PUBLIC CUDA::cudart) + + set_target_properties(infinirt PROPERTIES + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) +endif() + +if(WITH_ILUVATAR) + enable_language(CUDA) + + target_compile_definitions(infinirt PUBLIC WITH_ILUVATAR=1) + + find_package(CUDAToolkit REQUIRED) + target_link_libraries(infinirt PUBLIC CUDA::cudart) + + set_target_properties(infinirt PROPERTIES + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) +endif() + +if(WITH_METAX) + target_compile_definitions(infinirt PRIVATE WITH_METAX=1) + + target_include_directories(infinirt PUBLIC "${MACA_PATH}/include") + target_link_libraries(infinirt PUBLIC ${MACA_RUNTIME_LIB}) +endif() + +if(WITH_MOORE) + target_compile_definitions(infinirt PRIVATE WITH_MOORE=1) + + target_include_directories(infinirt PUBLIC "${MUSA_ROOT}/include") + target_link_libraries(infinirt PUBLIC ${MUSA_LIB} ${MUSART_LIB}) +endif() + +if(WITH_CAMBRICON) + target_compile_definitions(infinirt PRIVATE WITH_CAMBRICON=1) + + target_include_directories(infinirt PUBLIC "${NEUWARE_HOME}/include") + target_link_libraries(infinirt PUBLIC ${CAMBRICON_RUNTIME_LIB}) +endif() + +if(WITH_ASCEND) + # ASCEND_HOME is set by the top-level CMakeLists.txt. + target_compile_definitions(infinirt PUBLIC WITH_ASCEND=1) + + # Resolve the driver lib dir two levels above the toolkit root. + get_filename_component(ASCEND_ROOT "${ASCEND_HOME}/../.." ABSOLUTE) + + # Prefer the real driver HAL; fall back to the toolkit stub for build-only + # environments (e.g., Docker CI images without hardware drivers installed). + # CANN <= 8.0: stub at runtime/lib64/stub/; CANN >= 8.5: devlib/-linux/devlib/. + set(ASCEND_HAL_REAL "${ASCEND_ROOT}/driver/lib64/driver/libascend_hal.so") + set(ASCEND_HAL_STUB "${ASCEND_HOME}/runtime/lib64/stub/libascend_hal.so") + set(ASCEND_HAL_DEVLIB "${ASCEND_HOME}/${CMAKE_SYSTEM_PROCESSOR}-linux/devlib/libascend_hal.so") + if(EXISTS "${ASCEND_HAL_REAL}") + set(ASCEND_HAL_LIB "${ASCEND_HAL_REAL}") + elseif(EXISTS "${ASCEND_HAL_STUB}") + set(ASCEND_HAL_LIB "${ASCEND_HAL_STUB}") + message(STATUS "ascend_hal: driver not found, using stub for linking") + elseif(EXISTS "${ASCEND_HAL_DEVLIB}") + set(ASCEND_HAL_LIB "${ASCEND_HAL_DEVLIB}") + message(STATUS "ascend_hal: driver not found, using devlib for linking") + else() + message(FATAL_ERROR "libascend_hal.so not found (tried ${ASCEND_HAL_REAL}, ${ASCEND_HAL_STUB}, and ${ASCEND_HAL_DEVLIB})") + endif() + + target_include_directories(infinirt PUBLIC + "${ASCEND_HOME}/include" + "${ASCEND_HOME}/include/aclnn" + "${ASCEND_HOME}/include/aclnnop") + target_link_libraries(infinirt PUBLIC + "${ASCEND_HOME}/lib64/libascendcl.so" + "${ASCEND_HAL_LIB}") +endif() + +target_include_directories(infinirt PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + +install(TARGETS infinirt + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) + +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/ + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/infinirt + FILES_MATCHING PATTERN "*.h" +) diff --git a/src/ascend/data_type_.h b/src/ascend/data_type_.h new file mode 100644 index 0000000..d83a393 --- /dev/null +++ b/src/ascend/data_type_.h @@ -0,0 +1,61 @@ +#ifndef INFINI_RT_ASCEND_DATA_TYPE__H_ +#define INFINI_RT_ASCEND_DATA_TYPE__H_ + +#include + +#include "acl/acl.h" +#include "ascend/device_.h" +#include "data_type.h" + +namespace infini::rt::ascend { + +inline aclDataType ToAclDtype(DataType dt) { + switch (dt) { + case DataType::kInt8: + return ACL_INT8; + case DataType::kInt16: + return ACL_INT16; + case DataType::kInt32: + return ACL_INT32; + case DataType::kInt64: + return ACL_INT64; + case DataType::kUInt8: + return ACL_UINT8; + case DataType::kUInt16: + return ACL_UINT16; + case DataType::kUInt32: + return ACL_UINT32; + case DataType::kUInt64: + return ACL_UINT64; + case DataType::kFloat16: + return ACL_FLOAT16; + case DataType::kBFloat16: + return ACL_BF16; + case DataType::kFloat32: + return ACL_FLOAT; + default: + assert(false && "Unsupported dtype for Ascend backend."); + return ACL_DT_UNDEFINED; + } +} + +// Returns true for integer (signed or unsigned) `DataType` values. +inline bool IsIntegerDtype(DataType dt) { + switch (dt) { + case DataType::kInt8: + case DataType::kInt16: + case DataType::kInt32: + case DataType::kInt64: + case DataType::kUInt8: + case DataType::kUInt16: + case DataType::kUInt32: + case DataType::kUInt64: + return true; + default: + return false; + } +} + +} // namespace infini::rt::ascend + +#endif diff --git a/src/ascend/device_.h b/src/ascend/device_.h new file mode 100644 index 0000000..ffdec5a --- /dev/null +++ b/src/ascend/device_.h @@ -0,0 +1,13 @@ +#ifndef INFINI_RT_ASCEND_DEVICE__H_ +#define INFINI_RT_ASCEND_DEVICE__H_ + +#include "device.h" + +namespace infini::rt { + +template <> +struct DeviceEnabled : std::true_type {}; + +} // namespace infini::rt + +#endif diff --git a/src/ascend/runtime_.h b/src/ascend/runtime_.h new file mode 100644 index 0000000..32595f6 --- /dev/null +++ b/src/ascend/runtime_.h @@ -0,0 +1,44 @@ +#ifndef INFINI_RT_ASCEND_RUNTIME__H_ +#define INFINI_RT_ASCEND_RUNTIME__H_ + +// clang-format off +#include "acl/acl.h" +// clang-format on + +#include "ascend/device_.h" +#include "runtime.h" + +namespace infini::rt { + +template <> +struct Runtime + : DeviceRuntime> { + using Stream = aclrtStream; + + static constexpr Device::Type kDeviceType = Device::Type::kAscend; + + static constexpr auto Malloc = [](void** ptr, size_t size) { + return aclrtMalloc(ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); + }; + + static constexpr auto Free = aclrtFree; + + static constexpr auto Memcpy = [](void* dst, const void* src, size_t count, + aclrtMemcpyKind kind) { + return aclrtMemcpy(dst, count, src, count, kind); + }; + + static constexpr auto MemcpyHostToDevice = ACL_MEMCPY_HOST_TO_DEVICE; + + static constexpr auto MemcpyDeviceToHost = ACL_MEMCPY_DEVICE_TO_HOST; + + static constexpr auto Memset = [](void* ptr, int value, size_t count) { + return aclrtMemset(ptr, count, value, count); + }; +}; + +static_assert(Runtime::Validate()); + +} // namespace infini::rt + +#endif diff --git a/src/cambricon/data_type_.h b/src/cambricon/data_type_.h new file mode 100644 index 0000000..f1b8574 --- /dev/null +++ b/src/cambricon/data_type_.h @@ -0,0 +1,23 @@ +#ifndef INFINI_RT_CAMBRICON_DATA_TYPE__H_ +#define INFINI_RT_CAMBRICON_DATA_TYPE__H_ + +#include "bang_bf16.h" +#include "bang_fp16.h" +#include "cambricon/device_.h" +#include "data_type.h" + +namespace infini::rt { + +template <> +struct TypeMap { + using type = __half; +}; + +template <> +struct TypeMap { + using type = __bang_bfloat16; +}; + +} // namespace infini::rt + +#endif diff --git a/src/cambricon/device_.h b/src/cambricon/device_.h new file mode 100644 index 0000000..30bda29 --- /dev/null +++ b/src/cambricon/device_.h @@ -0,0 +1,13 @@ +#ifndef INFINI_RT_CAMBRICON_DEVICE__H_ +#define INFINI_RT_CAMBRICON_DEVICE__H_ + +#include "device.h" + +namespace infini::rt { + +template <> +struct DeviceEnabled : std::true_type {}; + +} // namespace infini::rt + +#endif diff --git a/src/cambricon/runtime_.h b/src/cambricon/runtime_.h new file mode 100644 index 0000000..b6ff200 --- /dev/null +++ b/src/cambricon/runtime_.h @@ -0,0 +1,35 @@ +#ifndef INFINI_RT_CAMBRICON_RUNTIME__H_ +#define INFINI_RT_CAMBRICON_RUNTIME__H_ + +#include + +#include "cambricon/device_.h" +#include "runtime.h" + +namespace infini::rt { + +template <> +struct Runtime + : DeviceRuntime> { + using Stream = cnrtQueue_t; + + static constexpr Device::Type kDeviceType = Device::Type::kCambricon; + + static constexpr auto Malloc = cnrtMalloc; + + static constexpr auto Free = cnrtFree; + + static constexpr auto Memcpy = cnrtMemcpy; + + static constexpr auto MemcpyHostToDevice = cnrtMemcpyHostToDev; + + static constexpr auto MemcpyDeviceToHost = cnrtMemcpyDevToHost; + + static constexpr auto Memset = cnrtMemset; +}; + +static_assert(Runtime::Validate()); + +} // namespace infini::rt + +#endif diff --git a/src/common/constexpr_map.h b/src/common/constexpr_map.h new file mode 100644 index 0000000..0a01e2a --- /dev/null +++ b/src/common/constexpr_map.h @@ -0,0 +1,32 @@ +#ifndef INFINI_RT_COMMON_CONSTEXPR_MAP_H_ +#define INFINI_RT_COMMON_CONSTEXPR_MAP_H_ + +#include +#include +#include +#include + +namespace infini::rt { + +template +struct ConstexprMap { + constexpr ConstexprMap(std::array, size> data) + : data_(data) {} + + constexpr Value at(Key key) const { + for (const auto& pr : data_) { + if (pr.first == key) return pr.second; + } + // TODO(lzm): change to logging. + assert("the key is not found in the `ConstexprMap`"); + // Unreachable, provided to satisfy the compiler's requirement. + std::abort(); + } + + private: + std::array, size> data_; +}; + +} // namespace infini::rt + +#endif diff --git a/src/common/traits.h b/src/common/traits.h new file mode 100644 index 0000000..6459bda --- /dev/null +++ b/src/common/traits.h @@ -0,0 +1,170 @@ +#ifndef INFINI_RT_COMMON_TRAITS_H_ +#define INFINI_RT_COMMON_TRAITS_H_ + +#include +#include + +namespace infini::rt { + +// --------------------- List and TypePack --------------------- +// A generic container for a sequence of compile-time values. +template +struct List {}; + +// `ListGet(List{})` extracts the `i`th value from a `List` +// tag. +template +constexpr auto ListGetImpl(List) { + if constexpr (index == 0) + return head; + else + return ListGetImpl(List{}); +} + +template +constexpr auto ListGet(List list) { + return ListGetImpl(list); +} + +template +struct TypePack {}; + +// ----------------------------------------------------------------------------- +// Tags +// ----------------------------------------------------------------------------- +// Tags are passed as regular function arguments to user functors instead of +// template parameters. This lets users write plain C++17 `[](auto tag)` lambdas +// rather than C++20 template lambdas (`[]()`). + +// `TypeTag`: carries a C++ type. Recover with `typename +// decltype(tag)::type`. +template +struct TypeTag { + using type = T; +}; + +// `ValueTag`: carries a compile-time value. Recover with +// `decltype(tag)::value`. +template +struct ValueTag { + using value_type = decltype(v); + static constexpr auto value = v; +}; + +// ----------------------------------------------------------------------------- +// List Queries +// ----------------------------------------------------------------------------- + +// Check at compile-time if a value exists within a construct (e.g., `List<>`). +// Example: `static_assert(ContainsValue)`; +template +struct Contains; + +template +struct Contains, value> + : std::disjunction...> {}; + +template +inline constexpr bool ContainsValue = Contains::value; + +// Check at compile-time if a type `T` is present in a variadic list of types +// `Ts`. +// Example: `static_assert(IsTypeInList)`; +template +inline constexpr bool IsTypeInList = (std::is_same_v || ...); + +// Trait to detect whether `T` is a `List<...>` specialization. +template +struct IsListType : std::false_type {}; + +template +struct IsListType> : std::true_type {}; + +// ----------------------------------------------------------------------------- +// List Operations +// ----------------------------------------------------------------------------- + +// Concatenates two List types into a single `List`. +// Example: `ConcatType, List<3, 4>>` is `List<1, 2, 3, 4>`. +template +struct Concat; + +template +struct Concat, List> { + using type = List; +}; + +template +using ConcatType = typename Concat::type; + +template +struct Flatten; + +template +struct Flatten> { + using type = List; +}; + +template +struct Flatten { + using type = typename Flatten, Rest...>::type; +}; + +// ----------------------------------------------------------------------------- +// Invocability Detection (SFINAE) +// ----------------------------------------------------------------------------- + +// Checks if a `Functor` can be called with a `ValueTag` and `Args...`. +template +struct IsInvocable : std::false_type {}; + +template +struct IsInvocable()( + ValueTag{}, std::declval()...))>, + Args...> : std::true_type {}; + +template +inline constexpr bool IsInvocableValue = + IsInvocable::value; + +// ----------------------------------------------------------------------------- +// Filtering Logic +// ----------------------------------------------------------------------------- + +// Recursive template to filter values based on `Functor` support at +// compile-time. +template +struct Filter; + +// Base case: All values processed. +template +struct Filter, List> { + using type = List; +}; + +// Recursive step: Test the `head` value and accumulate if supported. +template +struct Filter, List, head, tail...> { + using type = typename std::conditional_t< + IsInvocableValue && + !ContainsValue, head>, + Filter, List, tail...>, + Filter, List, tail...>>::type; +}; + +// Interface to filter a `List` type directly. +template +struct FilterList; + +template +struct FilterList, List> { + using type = + typename Filter, List<>, items...>::type; +}; + +} // namespace infini::rt + +#endif diff --git a/src/cpu/data_type_.h b/src/cpu/data_type_.h new file mode 100644 index 0000000..dd6c080 --- /dev/null +++ b/src/cpu/data_type_.h @@ -0,0 +1,21 @@ +#ifndef INFINI_RT_CPU_DATA_TYPE__H_ +#define INFINI_RT_CPU_DATA_TYPE__H_ + +#include "cpu/device_.h" +#include "data_type.h" + +namespace infini::rt { + +template <> +struct TypeMap { + using type = Float16; +}; + +template <> +struct TypeMap { + using type = BFloat16; +}; + +} // namespace infini::rt + +#endif diff --git a/src/cpu/device_.h b/src/cpu/device_.h new file mode 100644 index 0000000..78d4899 --- /dev/null +++ b/src/cpu/device_.h @@ -0,0 +1,13 @@ +#ifndef INFINI_RT_CPU_DEVICE__H_ +#define INFINI_RT_CPU_DEVICE__H_ + +#include "device.h" + +namespace infini::rt { + +template <> +struct DeviceEnabled : std::true_type {}; + +} // namespace infini::rt + +#endif diff --git a/src/cpu/runtime_.h b/src/cpu/runtime_.h new file mode 100644 index 0000000..29219b8 --- /dev/null +++ b/src/cpu/runtime_.h @@ -0,0 +1,34 @@ +#ifndef INFINI_RT_CPU_RUNTIME__H_ +#define INFINI_RT_CPU_RUNTIME__H_ + +#include +#include + +#include "runtime.h" + +namespace infini::rt { + +template <> +struct Runtime : RuntimeBase> { + static constexpr Device::Type kDeviceType = Device::Type::kCpu; + + static void Malloc(void** ptr, std::size_t size) { *ptr = std::malloc(size); } + + static void Free(void* ptr) { std::free(ptr); } + + static void Memcpy(void* dst, const void* src, std::size_t size, int) { + std::memcpy(dst, src, size); + } + + static constexpr auto Memset = std::memset; + + static constexpr int MemcpyHostToDevice = 0; + + static constexpr int MemcpyDeviceToHost = 1; +}; + +static_assert(Runtime::Validate()); + +} // namespace infini::rt + +#endif diff --git a/src/cuda/runtime.h b/src/cuda/runtime.h new file mode 100644 index 0000000..2b6cb32 --- /dev/null +++ b/src/cuda/runtime.h @@ -0,0 +1,29 @@ +#ifndef INFINI_RT_CUDA_RUNTIME_H_ +#define INFINI_RT_CUDA_RUNTIME_H_ + +#include + +#include "../runtime.h" + +namespace infini::rt { + +/// ## CUDA-like runtime interface enforcement via CRTP. +/// +/// `CudaRuntime` extends `DeviceRuntime` for backends that mirror +/// `cuda_runtime.h`-style memory copy APIs. +template +struct CudaRuntime : DeviceRuntime { + static constexpr bool Validate() { + DeviceRuntime::Validate(); + static_assert( + std::is_invocable_v, + "`Runtime::Memcpy` must be callable with " + "`(void*, const void*, size_t, MemcpyHostToDevice)`."); + return true; + } +}; + +} // namespace infini::rt + +#endif diff --git a/src/data_type.h b/src/data_type.h new file mode 100644 index 0000000..c71d540 --- /dev/null +++ b/src/data_type.h @@ -0,0 +1,211 @@ +#ifndef INFINI_RT_DATA_TYPE_H_ +#define INFINI_RT_DATA_TYPE_H_ + +#include +#include +#include + +#include "common/constexpr_map.h" +#include "common/traits.h" +#include "device.h" + +namespace infini::rt { + +enum class DataType : std::int8_t { + kInt8, + kInt16, + kInt32, + kInt64, + kUInt8, + kUInt16, + kUInt32, + kUInt64, + kFloat16, + kBFloat16, + kFloat32, + kFloat64 +}; + +constexpr ConstexprMap kDataTypeToSize{{{ + {DataType::kInt8, 1}, + {DataType::kInt16, 2}, + {DataType::kInt32, 4}, + {DataType::kInt64, 8}, + {DataType::kUInt8, 1}, + {DataType::kUInt16, 2}, + {DataType::kUInt32, 4}, + {DataType::kUInt64, 8}, + {DataType::kFloat16, 2}, + {DataType::kBFloat16, 2}, + {DataType::kFloat32, 4}, + {DataType::kFloat64, 8}, +}}}; + +constexpr ConstexprMap kDataTypeToDesc{{{ + {DataType::kInt8, "int8"}, + {DataType::kInt16, "int16"}, + {DataType::kInt32, "int32"}, + {DataType::kInt64, "int64"}, + {DataType::kUInt8, "uint8"}, + {DataType::kUInt16, "uint16"}, + {DataType::kUInt32, "uint32"}, + {DataType::kUInt64, "uint64"}, + {DataType::kFloat16, "float16"}, + {DataType::kBFloat16, "bfloat16"}, + {DataType::kFloat32, "float32"}, + {DataType::kFloat64, "float64"}, +}}}; + +constexpr ConstexprMap kStringToDataType{{{ + {"int8", DataType::kInt8}, + {"int16", DataType::kInt16}, + {"int32", DataType::kInt32}, + {"int64", DataType::kInt64}, + {"uint8", DataType::kUInt8}, + {"uint16", DataType::kUInt16}, + {"uint32", DataType::kUInt32}, + {"uint64", DataType::kUInt64}, + {"float16", DataType::kFloat16}, + {"bfloat16", DataType::kBFloat16}, + {"float32", DataType::kFloat32}, + {"float64", DataType::kFloat64}, +}}}; + +struct Float16 { + std::uint16_t bits; + + static inline Float16 FromFloat(float val) { + std::uint32_t f32; + std::memcpy(&f32, &val, sizeof(f32)); + std::uint16_t sign = (f32 >> 16) & 0x8000; + std::int32_t exponent = ((f32 >> 23) & 0xFF) - 127; + std::uint32_t mantissa = f32 & 0x7FFFFF; + + if (exponent >= 16) { + // NaN + if (exponent == 128 && mantissa != 0) { + return {static_cast(sign | 0x7E00)}; + } + // Inf + return {static_cast(sign | 0x7C00)}; + } else if (exponent >= -14) { + return {static_cast(sign | ((exponent + 15) << 10) | + (mantissa >> 13))}; + } else if (exponent >= -24) { + mantissa |= 0x800000; + mantissa >>= (-14 - exponent); + return {static_cast(sign | (mantissa >> 13))}; + } + // Too small for subnormal: return signed zero. + return {sign}; + } + + inline float ToFloat() const { + std::uint32_t sign = (bits & 0x8000) << 16; + std::int32_t exponent = (bits >> 10) & 0x1F; + std::uint32_t mantissa = bits & 0x3FF; + std::uint32_t f32_bits; + + if (exponent == 31) { + f32_bits = sign | 0x7F800000 | (mantissa << 13); + } else if (exponent == 0) { + if (mantissa == 0) { + f32_bits = sign; + } else { + exponent = -14; + while ((mantissa & 0x400) == 0) { + mantissa <<= 1; + exponent--; + } + mantissa &= 0x3FF; + f32_bits = sign | ((exponent + 127) << 23) | (mantissa << 13); + } + } else { + f32_bits = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13); + } + + float result; + std::memcpy(&result, &f32_bits, sizeof(result)); + return result; + } +}; + +struct BFloat16 { + std::uint16_t bits; + + static inline BFloat16 FromFloat(float val) { + std::uint32_t bits32; + std::memcpy(&bits32, &val, sizeof(bits32)); + + const std::uint32_t rounding_bias = 0x00007FFF + ((bits32 >> 16) & 1); + std::uint16_t bf16_bits = + static_cast((bits32 + rounding_bias) >> 16); + return {bf16_bits}; + } + + inline float ToFloat() const { + std::uint32_t bits32 = static_cast(bits) << 16; + float result; + std::memcpy(&result, &bits32, sizeof(result)); + return result; + } +}; + +template +struct TypeMap; + +template +using TypeMapType = typename TypeMap::type; + +#define DEFINE_DATA_TYPE_MAPPING(ENUM_VALUE, CPP_TYPE) \ + template \ + struct TypeMap { \ + using type = CPP_TYPE; \ + }; + +DEFINE_DATA_TYPE_MAPPING(kUInt8, std::uint8_t) +DEFINE_DATA_TYPE_MAPPING(kInt8, std::int8_t) +DEFINE_DATA_TYPE_MAPPING(kUInt16, std::uint16_t) +DEFINE_DATA_TYPE_MAPPING(kInt16, std::int16_t) +DEFINE_DATA_TYPE_MAPPING(kUInt32, std::uint32_t) +DEFINE_DATA_TYPE_MAPPING(kInt32, std::int32_t) +DEFINE_DATA_TYPE_MAPPING(kUInt64, std::uint64_t) +DEFINE_DATA_TYPE_MAPPING(kInt64, std::int64_t) +DEFINE_DATA_TYPE_MAPPING(kFloat32, float) +DEFINE_DATA_TYPE_MAPPING(kFloat64, double) +#undef DEFINE_DATA_TYPE_MAPPING + +// Checks whether a C++ type is the bfloat16 or float16 type for the given +// device. Full specializations for each device's float16/bfloat16 types are +// provided in the corresponding platform-specific device type headers. +template +inline constexpr bool IsBFloat16 = + std::is_same_v>; + +template +inline constexpr bool IsFP16 = + std::is_same_v>; + +// Defines the common categories of data types using List. +using FloatTypes = List; +using ReducedFloatTypes = List; +using IntTypes = + List; +using UIntTypes = List; + +using BitTypes8 = List; +using BitTypes16 = List; +using BitTypes32 = + List; +using BitTypes64 = + List; + +using AllFloatTypes = ConcatType; +using AllIntTypes = ConcatType; +using AllTypes = ConcatType; + +} // namespace infini::rt + +#endif diff --git a/src/device.h b/src/device.h new file mode 100644 index 0000000..7da2255 --- /dev/null +++ b/src/device.h @@ -0,0 +1,137 @@ +#ifndef INFINI_RT_DEVICE_H_ +#define INFINI_RT_DEVICE_H_ + +#include +#include + +#include "common/constexpr_map.h" +#include "common/traits.h" +#include "hash.h" + +namespace infini::rt { + +class Device { + public: + enum class Type { + kCpu = 0, + kNvidia = 1, + kCambricon = 2, + kAscend = 3, + kMetax = 4, + kMoore = 5, + kIluvatar = 6, + kKunlun = 7, + kHygon = 8, + kQy = 9, + kCount + }; + + Device() = default; + + Device(const Type& type, const int& index = 0) : type_{type}, index_{index} {} + + static const Type TypeFromString(const std::string& name) { + return kDescToDevice.at(name); + } + + static const std::string_view StringFromType(const Type& type) { + return kDeviceToDesc.at(type); + } + + const Type& type() const { return type_; } + + const int& index() const { return index_; } + + std::string ToString() const { + return std::string{StringFromType(type_)} + ":" + std::to_string(index_); + } + + bool operator==(const Device& other) const { + return type_ == other.type_ && index_ == other.index_; + } + + bool operator!=(const Device& other) const { return !(*this == other); } + + private: + Type type_{Type::kCpu}; + + static constexpr ConstexprMap(Device::Type::kCount)> + kDeviceToDesc{{{ + {Type::kCpu, "cpu"}, + {Type::kNvidia, "nvidia"}, + {Type::kCambricon, "cambricon"}, + {Type::kAscend, "ascend"}, + {Type::kMetax, "metax"}, + {Type::kMoore, "moore"}, + {Type::kIluvatar, "iluvatar"}, + {Type::kKunlun, "kunlun"}, + {Type::kHygon, "hygon"}, + {Type::kQy, "qy"}, + }}}; + + static constexpr ConstexprMap(Device::Type::kCount)> + kDescToDevice{{{ + {"cpu", Type::kCpu}, + {"nvidia", Type::kNvidia}, + {"cambricon", Type::kCambricon}, + {"ascend", Type::kAscend}, + {"metax", Type::kMetax}, + {"moore", Type::kMoore}, + {"iluvatar", Type::kIluvatar}, + {"kunlun", Type::kKunlun}, + {"hygon", Type::kHygon}, + {"qy", Type::kQy}, + }}}; + + int index_{0}; +}; + +// Primary template: Devices are disabled by default. Platform-specific +// headers (e.g. `cpu/device_.h`) specialize this to `std::true_type`. +template +struct DeviceEnabled : std::false_type {}; + +// Defines the common categories of devices using List. +using AllDeviceTypes = + List; + +// Deferred computation of active devices. The `Filter` and `FilterList` +// evaluation are nested inside a class template so that `DeviceEnabled` +// specializations from platform `device_.h` headers are visible at +// instantiation time. Use with a dependent type parameter +// (e.g. `ActiveDevices`) to ensure deferred instantiation. +template +struct ActiveDevicesImpl { + struct Filter { + template + std::enable_if_t::value> operator()( + ValueTag) const {} + }; + + using type = typename FilterList, AllDeviceTypes>::type; +}; + +template +using ActiveDevices = typename ActiveDevicesImpl::type; + +} // namespace infini::rt + +template <> +struct std::hash { + std::size_t operator()(const infini::rt::Device& device) const { + std::size_t seed{0}; + + HashCombine(seed, device.type()); + + HashCombine(seed, device.index()); + + return seed; + } +}; + +#endif diff --git a/src/hash.h b/src/hash.h new file mode 100644 index 0000000..b3a6598 --- /dev/null +++ b/src/hash.h @@ -0,0 +1,12 @@ +#ifndef INFINI_RT_HASH_H_ +#define INFINI_RT_HASH_H_ + +#include + +template +inline void HashCombine(std::size_t& seed, const T& v) { + std::hash> hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +#endif diff --git a/src/iluvatar/data_type_.h b/src/iluvatar/data_type_.h new file mode 100644 index 0000000..4511394 --- /dev/null +++ b/src/iluvatar/data_type_.h @@ -0,0 +1,30 @@ +#ifndef INFINI_RT_ILUVATAR_DATA_TYPE__H_ +#define INFINI_RT_ILUVATAR_DATA_TYPE__H_ + +// clang-format off +#include +#include +// clang-format on + +#include "data_type.h" +#include "iluvatar/device_.h" + +namespace infini::rt { + +using cuda_bfloat16 = nv_bfloat16; + +using cuda_bfloat162 = nv_bfloat162; + +template <> +struct TypeMap { + using type = half; +}; + +template <> +struct TypeMap { + using type = __nv_bfloat16; +}; + +} // namespace infini::rt + +#endif diff --git a/src/iluvatar/device_.h b/src/iluvatar/device_.h new file mode 100644 index 0000000..dc0bb88 --- /dev/null +++ b/src/iluvatar/device_.h @@ -0,0 +1,13 @@ +#ifndef INFINI_RT_ILUVATAR_DEVICE__H_ +#define INFINI_RT_ILUVATAR_DEVICE__H_ + +#include "device.h" + +namespace infini::rt { + +template <> +struct DeviceEnabled : std::true_type {}; + +} // namespace infini::rt + +#endif diff --git a/src/iluvatar/runtime_.h b/src/iluvatar/runtime_.h new file mode 100644 index 0000000..c442753 --- /dev/null +++ b/src/iluvatar/runtime_.h @@ -0,0 +1,41 @@ +#ifndef INFINI_RT_ILUVATAR_RUNTIME__H_ +#define INFINI_RT_ILUVATAR_RUNTIME__H_ + +#include + +// clang-format off +#include +// clang-format on + +#include "cuda/runtime.h" +#include "iluvatar/device_.h" + +namespace infini::rt { + +template <> +struct Runtime + : CudaRuntime> { + using Stream = cudaStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kIluvatar; + + static constexpr auto Malloc = [](auto&&... args) { + return cudaMalloc(std::forward(args)...); + }; + + static constexpr auto Memcpy = cudaMemcpy; + + static constexpr auto Free = cudaFree; + + static constexpr auto MemcpyHostToDevice = cudaMemcpyHostToDevice; + + static constexpr auto MemcpyDeviceToHost = cudaMemcpyDeviceToHost; + + static constexpr auto Memset = cudaMemset; +}; + +static_assert(Runtime::Validate()); + +} // namespace infini::rt + +#endif diff --git a/src/infini_rt/cpu/data_type_.h b/src/infini_rt/cpu/data_type_.h new file mode 100644 index 0000000..31d69df --- /dev/null +++ b/src/infini_rt/cpu/data_type_.h @@ -0,0 +1,6 @@ +#ifndef INFINI_RT_PUBLIC_CPU_DATA_TYPE__H_ +#define INFINI_RT_PUBLIC_CPU_DATA_TYPE__H_ + +#include "../../cpu/data_type_.h" + +#endif diff --git a/src/infini_rt/cpu/device_.h b/src/infini_rt/cpu/device_.h new file mode 100644 index 0000000..022d838 --- /dev/null +++ b/src/infini_rt/cpu/device_.h @@ -0,0 +1,6 @@ +#ifndef INFINI_RT_PUBLIC_CPU_DEVICE__H_ +#define INFINI_RT_PUBLIC_CPU_DEVICE__H_ + +#include "../../cpu/device_.h" + +#endif diff --git a/src/infini_rt/cpu/runtime_.h b/src/infini_rt/cpu/runtime_.h new file mode 100644 index 0000000..aab20b9 --- /dev/null +++ b/src/infini_rt/cpu/runtime_.h @@ -0,0 +1,6 @@ +#ifndef INFINI_RT_PUBLIC_CPU_RUNTIME__H_ +#define INFINI_RT_PUBLIC_CPU_RUNTIME__H_ + +#include "../../cpu/runtime_.h" + +#endif diff --git a/src/infini_rt/data_type.h b/src/infini_rt/data_type.h new file mode 100644 index 0000000..8fe5786 --- /dev/null +++ b/src/infini_rt/data_type.h @@ -0,0 +1,6 @@ +#ifndef INFINI_RT_PUBLIC_DATA_TYPE_H_ +#define INFINI_RT_PUBLIC_DATA_TYPE_H_ + +#include "../data_type.h" + +#endif diff --git a/src/infini_rt/device.h b/src/infini_rt/device.h new file mode 100644 index 0000000..9e9eb8b --- /dev/null +++ b/src/infini_rt/device.h @@ -0,0 +1,6 @@ +#ifndef INFINI_RT_PUBLIC_DEVICE_H_ +#define INFINI_RT_PUBLIC_DEVICE_H_ + +#include "../device.h" + +#endif diff --git a/src/infini_rt/runtime.h b/src/infini_rt/runtime.h new file mode 100644 index 0000000..ab6d557 --- /dev/null +++ b/src/infini_rt/runtime.h @@ -0,0 +1,6 @@ +#ifndef INFINI_RT_PUBLIC_RUNTIME_H_ +#define INFINI_RT_PUBLIC_RUNTIME_H_ + +#include "../runtime.h" + +#endif diff --git a/src/infini_rt/tensor_view.h b/src/infini_rt/tensor_view.h new file mode 100644 index 0000000..0193b2d --- /dev/null +++ b/src/infini_rt/tensor_view.h @@ -0,0 +1,6 @@ +#ifndef INFINI_RT_PUBLIC_TENSOR_VIEW_H_ +#define INFINI_RT_PUBLIC_TENSOR_VIEW_H_ + +#include "../tensor_view.h" + +#endif diff --git a/src/metax/data_type_.h b/src/metax/data_type_.h new file mode 100644 index 0000000..3c1b932 --- /dev/null +++ b/src/metax/data_type_.h @@ -0,0 +1,29 @@ +#ifndef INFINI_RT_METAX_DATA_TYPE__H_ +#define INFINI_RT_METAX_DATA_TYPE__H_ + +#include +#include +#include + +#include "data_type.h" +#include "metax/device_.h" + +namespace infini::rt { + +using cuda_bfloat16 = maca_bfloat16; + +using cuda_bfloat162 = maca_bfloat162; + +template <> +struct TypeMap { + using type = __half; +}; + +template <> +struct TypeMap { + using type = __maca_bfloat16; +}; + +} // namespace infini::rt + +#endif diff --git a/src/metax/device_.h b/src/metax/device_.h new file mode 100644 index 0000000..4fc8825 --- /dev/null +++ b/src/metax/device_.h @@ -0,0 +1,13 @@ +#ifndef INFINI_RT_METAX_DEVICE__H_ +#define INFINI_RT_METAX_DEVICE__H_ + +#include "device.h" + +namespace infini::rt { + +template <> +struct DeviceEnabled : std::true_type {}; + +} // namespace infini::rt + +#endif diff --git a/src/metax/runtime_.h b/src/metax/runtime_.h new file mode 100644 index 0000000..2d33123 --- /dev/null +++ b/src/metax/runtime_.h @@ -0,0 +1,35 @@ +#ifndef INFINI_RT_METAX_RUNTIME__H_ +#define INFINI_RT_METAX_RUNTIME__H_ + +#include + +#include "cuda/runtime.h" +#include "metax/device_.h" + +namespace infini::rt { + +template <> +struct Runtime + : CudaRuntime> { + using Stream = mcStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kMetax; + + static constexpr auto Malloc = mcMalloc; + + static constexpr auto Memcpy = mcMemcpy; + + static constexpr auto Free = mcFree; + + static constexpr auto MemcpyHostToDevice = mcMemcpyHostToDevice; + + static constexpr auto MemcpyDeviceToHost = mcMemcpyDeviceToHost; + + static constexpr auto Memset = mcMemset; +}; + +static_assert(Runtime::Validate()); + +} // namespace infini::rt + +#endif diff --git a/src/moore/data_type_.h b/src/moore/data_type_.h new file mode 100644 index 0000000..0fdeab9 --- /dev/null +++ b/src/moore/data_type_.h @@ -0,0 +1,28 @@ +#ifndef INFINI_RT_MOORE_DATA_TYPE__H_ +#define INFINI_RT_MOORE_DATA_TYPE__H_ + +#include +#include + +#include "data_type.h" +#include "moore/device_.h" + +namespace infini::rt { + +using cuda_bfloat16 = __mt_bfloat16; + +using cuda_bfloat162 = __mt_bfloat162; + +template <> +struct TypeMap { + using type = half; +}; + +template <> +struct TypeMap { + using type = __mt_bfloat16; +}; + +} // namespace infini::rt + +#endif diff --git a/src/moore/device_.h b/src/moore/device_.h new file mode 100644 index 0000000..2bb52a9 --- /dev/null +++ b/src/moore/device_.h @@ -0,0 +1,13 @@ +#ifndef INFINI_RT_MOORE_DEVICE__H_ +#define INFINI_RT_MOORE_DEVICE__H_ + +#include "device.h" + +namespace infini::rt { + +template <> +struct DeviceEnabled : std::true_type {}; + +} // namespace infini::rt + +#endif diff --git a/src/moore/runtime_.h b/src/moore/runtime_.h new file mode 100644 index 0000000..c268a8a --- /dev/null +++ b/src/moore/runtime_.h @@ -0,0 +1,43 @@ +#ifndef INFINI_RT_MOORE_RUNTIME__H_ +#define INFINI_RT_MOORE_RUNTIME__H_ + +#include + +#include + +#include "cuda/runtime.h" +#include "moore/device_.h" + +namespace infini::rt { + +template <> +struct Runtime + : CudaRuntime> { + using Stream = musaStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kMoore; + + static constexpr auto Malloc = [](auto&&... args) { + return musaMalloc(std::forward(args)...); + }; + + static constexpr auto Memcpy = [](auto&&... args) { + return musaMemcpy(std::forward(args)...); + }; + + static constexpr auto Free = [](auto&&... args) { + return musaFree(std::forward(args)...); + }; + + static constexpr auto MemcpyHostToDevice = musaMemcpyHostToDevice; + + static constexpr auto MemcpyDeviceToHost = musaMemcpyDeviceToHost; + + static constexpr auto Memset = musaMemset; +}; + +static_assert(Runtime::Validate()); + +} // namespace infini::rt + +#endif diff --git a/src/nvidia/data_type_.h b/src/nvidia/data_type_.h new file mode 100644 index 0000000..e9afbcd --- /dev/null +++ b/src/nvidia/data_type_.h @@ -0,0 +1,30 @@ +#ifndef INFINI_RT_NVIDIA_DATA_TYPE__H_ +#define INFINI_RT_NVIDIA_DATA_TYPE__H_ + +// clang-format off +#include +#include +// clang-format on + +#include "data_type.h" +#include "nvidia/device_.h" + +namespace infini::rt { + +using cuda_bfloat16 = nv_bfloat16; + +using cuda_bfloat162 = nv_bfloat162; + +template <> +struct TypeMap { + using type = half; +}; + +template <> +struct TypeMap { + using type = __nv_bfloat16; +}; + +} // namespace infini::rt + +#endif diff --git a/src/nvidia/device_.h b/src/nvidia/device_.h new file mode 100644 index 0000000..b89a4e9 --- /dev/null +++ b/src/nvidia/device_.h @@ -0,0 +1,13 @@ +#ifndef INFINI_RT_NVIDIA_DEVICE__H_ +#define INFINI_RT_NVIDIA_DEVICE__H_ + +#include "device.h" + +namespace infini::rt { + +template <> +struct DeviceEnabled : std::true_type {}; + +} // namespace infini::rt + +#endif diff --git a/src/nvidia/runtime_.h b/src/nvidia/runtime_.h new file mode 100644 index 0000000..f3d815f --- /dev/null +++ b/src/nvidia/runtime_.h @@ -0,0 +1,41 @@ +#ifndef INFINI_RT_NVIDIA_RUNTIME__H_ +#define INFINI_RT_NVIDIA_RUNTIME__H_ + +#include + +// clang-format off +#include +// clang-format on + +#include "cuda/runtime.h" +#include "nvidia/device_.h" + +namespace infini::rt { + +template <> +struct Runtime + : CudaRuntime> { + using Stream = cudaStream_t; + + static constexpr Device::Type kDeviceType = Device::Type::kNvidia; + + static constexpr auto Malloc = [](auto&&... args) { + return cudaMalloc(std::forward(args)...); + }; + + static constexpr auto Memcpy = cudaMemcpy; + + static constexpr auto Free = cudaFree; + + static constexpr auto MemcpyHostToDevice = cudaMemcpyHostToDevice; + + static constexpr auto MemcpyDeviceToHost = cudaMemcpyDeviceToHost; + + static constexpr auto Memset = cudaMemset; +}; + +static_assert(Runtime::Validate()); + +} // namespace infini::rt + +#endif diff --git a/src/runtime.h b/src/runtime.h new file mode 100644 index 0000000..839477b --- /dev/null +++ b/src/runtime.h @@ -0,0 +1,55 @@ +#ifndef INFINI_RT_RUNTIME_H_ +#define INFINI_RT_RUNTIME_H_ + +#include + +#include "device.h" + +namespace infini::rt { + +template +struct Runtime; + +/// ## Interface enforcement via CRTP. +/// +/// Inherit from the appropriate base to declare which interface level a +/// `Runtime` specialization implements. After the struct is fully defined, call +/// `static_assert(Runtime<...>::Validate())`. The chained `Validate()` checks +/// every required member's existence and signature at compile time, analogous +/// to how `override` catches signature mismatches for virtual functions. +/// +/// - `RuntimeBase`: `kDeviceType` only (e.g. CPU). +/// - `DeviceRuntime`: adds `Stream`, `Malloc`, and `Free` (e.g. Cambricon). + +/// Every Runtime must provide `static constexpr Device::Type kDeviceType`. +template +struct RuntimeBase { + static constexpr bool Validate() { + static_assert( + std::is_same_v, + Device::Type>, + "`Runtime` must define `static constexpr Device::Type kDeviceType`."); + return true; + } +}; + +/// Runtimes with device memory must additionally provide `Stream`, `Malloc`, +/// and `Free`. +template +struct DeviceRuntime : RuntimeBase { + static constexpr bool Validate() { + RuntimeBase::Validate(); + static_assert(sizeof(typename Derived::Stream) > 0, + "`Runtime` must define a `Stream` type alias."); + static_assert( + std::is_invocable_v, + "`Runtime::Malloc` must be callable with `(void**, size_t)`."); + static_assert(std::is_invocable_v, + "`Runtime::Free` must be callable with `(void*)`."); + return true; + } +}; + +} // namespace infini::rt + +#endif diff --git a/src/tensor_view.cc b/src/tensor_view.cc new file mode 100644 index 0000000..ffc8e04 --- /dev/null +++ b/src/tensor_view.cc @@ -0,0 +1,176 @@ +#include "tensor_view.h" + +#include +#include +#include + +namespace infini::rt { + +static TensorView::Index GetEffectiveIndex(TensorView::Index index, + TensorView::Size size) { + return index < 0 ? index + size : index; +} + +TensorView::TensorView(void* data, std::initializer_list shape, + const DataType& dtype, const Device& device, + std::initializer_list strides) + : TensorView{data, decltype(shape_){shape}, dtype, device, + decltype(strides_){strides}} {} + +TensorView TensorView::operator[](const Index& index) const { + return { + reinterpret_cast( + reinterpret_cast(data_) + + GetEffectiveIndex(index, shape_[0]) * strides_[0] * element_size()), + Shape{shape_.cbegin() + 1, shape_.cend()}, dtype_, device_, + Strides{strides_.cbegin() + 1, strides_.cend()}}; +} + +void*& TensorView::data() { return data_; } + +const void* TensorView::data() const { return data_; } + +const TensorView::Shape& TensorView::shape() const { return shape_; } + +const DataType& TensorView::dtype() const { return dtype_; } + +const Device& TensorView::device() const { return device_; } + +const TensorView::Strides& TensorView::strides() const { return strides_; } + +TensorView::Size TensorView::size(const Index& index) const { + return shape_[GetEffectiveIndex(index, shape_.size())]; +} + +TensorView::Stride TensorView::stride(const Index& index) const { + return strides_[GetEffectiveIndex(index, strides_.size())]; +} + +TensorView::Size TensorView::ndim() const { return shape_.size(); } + +TensorView::Size TensorView::element_size() const { + return kDataTypeToSize.at(dtype_); +} + +TensorView::Size TensorView::numel() const { + return std::accumulate( + shape_.begin(), shape_.end(), static_cast(1), + [](TensorView::Size a, TensorView::Size b) { return a * b; }); +} + +TensorView TensorView::T() const { + return {data_, + {shape_[1], shape_[0]}, + dtype_, + device_, + {strides_[1], strides_[0]}}; +} + +std::string TensorView::ToString() const { + return "tensor(" + ToStringHelper() + + ", dtype=" + std::string(kDataTypeToDesc.at(dtype_)) + ", device='" + + device_.ToString() + "')"; +} + +bool TensorView::HasBroadcastDim() const { + return std::any_of(shape_.begin(), shape_.end(), + [&, i = 0](const auto&) mutable { + return shape_[i] != 1 && strides_[i++] == 0; + }); +} + +bool TensorView::IsContiguous() const { + if (ndim() == 0) { + return true; + } + + if (!IsMergeable(0, ndim() - 1)) { + return false; + } + + return stride(ndim() - 1) == 1; +} + +const DataType TensorView::DefaultDataType() { return DataType::kFloat32; } + +Device TensorView::DefaultDevice() { return Device{Device::Type::kCpu}; } + +TensorView::Strides TensorView::DefaultStrides(const Shape& shape) { + if (shape.empty()) { + return {}; + } + + Strides strides(shape.size()); + + strides.back() = 1; + + for (auto i{shape.size() - 2}; i != -1; --i) { + strides[i] = strides[i + 1] * shape[i + 1]; + } + + return strides; +} + +std::string TensorView::ToStringHelper() const { + if (ndim() == 0) { + switch (dtype_) { + case DataType::kFloat16: + return std::to_string(static_cast(data_)->ToFloat()); + case DataType::kBFloat16: + return std::to_string(static_cast(data_)->ToFloat()); + case DataType::kFloat32: + return std::to_string(*static_cast(data_)); + case DataType::kFloat64: + return std::to_string(*static_cast(data_)); + case DataType::kInt8: + return std::to_string(*static_cast(data_)); + case DataType::kInt16: + return std::to_string(*static_cast(data_)); + case DataType::kInt32: + return std::to_string(*static_cast(data_)); + case DataType::kInt64: + return std::to_string(*static_cast(data_)); + case DataType::kUInt8: + return std::to_string(*static_cast(data_)); + case DataType::kUInt16: + return std::to_string(*static_cast(data_)); + case DataType::kUInt32: + return std::to_string(*static_cast(data_)); + case DataType::kUInt64: + return std::to_string(*static_cast(data_)); + default: + return "?"; + } + } + + std::string result{"["}; + + for (auto i{Index{0}}; i < shape_[0]; ++i) { + result += operator[](i).ToStringHelper() + ", "; + } + + result.pop_back(); + result.back() = ']'; + + return result; +} + +bool TensorView::IsMergeable(TensorView::Size dim_start, + TensorView::Size dim_end) const { + if (dim_start == dim_end) { + return true; + } + + for (TensorView::Size i = dim_start; i < dim_end; ++i) { + if (size(i) == 1 && stride(i) == 0) { + return false; + } + if (stride(i) != size(i + 1) * stride(i + 1)) { + return false; + } + } + + return true; +} + +} // namespace infini::rt diff --git a/src/tensor_view.h b/src/tensor_view.h new file mode 100644 index 0000000..e747d19 --- /dev/null +++ b/src/tensor_view.h @@ -0,0 +1,158 @@ +#ifndef INFINI_RT_TENSOR_VIEW_H_ +#define INFINI_RT_TENSOR_VIEW_H_ + +#include +#include +#include + +#include "data_type.h" +#include "device.h" +#include "hash.h" + +namespace infini::rt { + +class TensorView { + public: + using Size = std::size_t; + + using Stride = std::ptrdiff_t; + + using Index = Stride; + + using Shape = std::vector; + + using Strides = std::vector; + + template + TensorView(void* data, const Shape& shape) + : data_{data}, + shape_{shape}, + dtype_{DefaultDataType()}, + device_{DefaultDevice()}, + strides_{DefaultStrides(shape)} {} + + template + TensorView(void* data, const Shape& shape, const DataType& dtype) + : data_{data}, + shape_{shape}, + dtype_{dtype}, + device_{DefaultDevice()}, + strides_{DefaultStrides(shape)} {} + + template + TensorView(void* data, const Shape& shape, const Device& device) + : data_{data}, + shape_{shape}, + dtype_{DefaultDataType()}, + device_{device}, + strides_{DefaultStrides(shape)} {} + + template + TensorView(void* data, const Shape& shape, const DataType& dtype, + const Device& device) + : data_{data}, + shape_{shape}, + dtype_{dtype}, + device_{device}, + strides_{DefaultStrides(shape)} {} + + template + TensorView(void* data, const Shape& shape, const DataType& dtype, + const Device& device, const Strides& strides) + : data_{data}, + shape_{shape}, + dtype_{dtype}, + device_{device}, + strides_{strides} {} + + TensorView(void* data, std::initializer_list shape, + const DataType& dtype, const Device& device, + std::initializer_list strides); + + TensorView operator[](const Index& index) const; + + void*& data(); + + const void* data() const; + + const DataType& dtype() const; + + const Device& device() const; + + const Shape& shape() const; + + const Strides& strides() const; + + Size size(const Index& index) const; + + Stride stride(const Index& index) const; + + Size ndim() const; + + Size element_size() const; + + Size numel() const; + + TensorView T() const; + + std::string ToString() const; + + bool HasBroadcastDim() const; + + bool IsContiguous() const; + + private: + static const DataType DefaultDataType(); + + static Device DefaultDevice(); + + static Strides DefaultStrides(const Shape& shape); + + std::string ToStringHelper() const; + + bool IsMergeable(Size dim_start, Size dim_end) const; + + void* data_{nullptr}; + + Shape shape_; + + const DataType dtype_; + + Device device_; + + Strides strides_; +}; + +} // namespace infini::rt + +template <> +struct std::hash { + std::size_t operator()(const infini::rt::TensorView& tensor) const { + std::size_t seed{0}; + + for (const auto& size : tensor.shape()) { + HashCombine(seed, size); + } + + HashCombine(seed, tensor.dtype()); + + HashCombine(seed, tensor.device()); + + for (const auto& stride : tensor.strides()) { + HashCombine(seed, stride); + } + + return seed; + } +}; + +template <> +struct std::equal_to { + bool operator()(const infini::rt::TensorView& a, + const infini::rt::TensorView& b) const { + return a.dtype() == b.dtype() && a.device() == b.device() && + a.shape() == b.shape() && a.strides() == b.strides(); + } +}; + +#endif