Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions include/infiniop/ops/awq_marlin_repack.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#ifndef __INFINIOP_AWQ_MARLIN_REPACK_API_H__
#define __INFINIOP_AWQ_MARLIN_REPACK_API_H__

#include "../operator_descriptor.h"
#include <cstdint>

typedef struct InfiniopDescriptor *infiniopAwqMarlinRepackDescriptor_t;

/**
 * Creates a descriptor for the AWQ -> Marlin weight repack operator.
 *
 * @param handle      Library handle the descriptor is created against.
 * @param desc_ptr    Out-parameter that receives the new descriptor.
 * @param output_desc Tensor descriptor of the repacked output.
 * @param input_desc  Tensor descriptor of the packed input weights.
 * @param num_bits    Bit width of one quantized value (the packing factor is
 *                    derived as 32 / num_bits elsewhere in this operator).
 * @param is_a_8bit   Selects the alternative tile shape / output layout in the
 *                    repack kernel. NOTE(review): presumably "activations are
 *                    8-bit" -- confirm against the caller.
 * @return An infiniStatus_t status code.
 *
 * NOTE(review): the descriptor is presumably released with
 * infiniopDestroyAwqMarlinRepackDescriptor -- confirm ownership rules.
 */
__INFINI_C __export infiniStatus_t infiniopCreateAwqMarlinRepackDescriptor(infiniopHandle_t handle,
infiniopAwqMarlinRepackDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
int64_t num_bits,
bool is_a_8bit);

/**
 * Queries the workspace size associated with a repack descriptor.
 *
 * @param desc Descriptor to query.
 * @param size Out-parameter that receives the workspace size.
 *             NOTE(review): presumably in bytes -- confirm.
 * @return An infiniStatus_t status code.
 */
__INFINI_C __export infiniStatus_t infiniopGetAwqMarlinRepackWorkspaceSize(infiniopAwqMarlinRepackDescriptor_t desc, size_t *size);

/**
 * Runs the AWQ -> Marlin weight repack described by `desc`.
 *
 * @param desc           Descriptor describing the operation.
 * @param workspace      Scratch buffer supplied by the caller.
 * @param workspace_size Size of `workspace`.
 *                       NOTE(review): presumably must be at least the value
 *                       reported by infiniopGetAwqMarlinRepackWorkspaceSize -- confirm.
 * @param output         Destination buffer for the repacked weights.
 * @param input          Source buffer holding the packed weights (read-only).
 * @param stream         Opaque execution stream/queue handle.
 *                       NOTE(review): presumably a device stream such as
 *                       cudaStream_t on CUDA backends -- confirm.
 * @return An infiniStatus_t status code.
 */
__INFINI_C __export infiniStatus_t infiniopAwqMarlinRepack(infiniopAwqMarlinRepackDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *output,
const void *input,
void *stream);

/**
 * Destroys a descriptor for the AWQ -> Marlin repack operator.
 *
 * @param desc Descriptor to destroy.
 *             NOTE(review): presumably one obtained from
 *             infiniopCreateAwqMarlinRepackDescriptor and not used afterwards -- confirm.
 * @return An infiniStatus_t status code.
 */
__INFINI_C __export infiniStatus_t infiniopDestroyAwqMarlinRepackDescriptor(infiniopAwqMarlinRepackDescriptor_t desc);

#endif
48 changes: 48 additions & 0 deletions src/infiniop/ops/awq_marlin_repack/awq_marlin_repack.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#ifndef AWQ_MARLIN_REPACK_H
#define AWQ_MARLIN_REPACK_H

#include "../../operator.h"
#include "info.h"

// DESCRIPTOR(NAMESPACE) declares the class
// op::awq_marlin_repack::NAMESPACE::Descriptor, a final subclass of
// InfiniopDescriptor.
//
// The generated class stores:
//   _opaque         - pointer to a forward-declared, per-NAMESPACE `Opaque`
//                     struct (defined outside this header)
//   _info           - an AwqMarlinRepackInfo value
//   _workspace_size - the value returned by workspaceSize()
//
// The constructor is private, so instances can only be produced from inside
// the class; the public static create() is the declared factory. The
// destructor, create() and calculate() are only declared here -- their
// definitions are not part of this header (presumably supplied by each
// backend that expands the macro; confirm against the backend sources).
//
// Do not place `//` comments inside the macro body: every line ends in a
// backslash continuation, so a line comment would swallow the following lines.
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::awq_marlin_repack::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
AwqMarlinRepackInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
AwqMarlinRepackInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \
infiniopTensorDescriptor_t input_desc, \
int64_t num_bits, \
bool is_a_8bit); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *output, \
const void *input, \
void *stream) const; \
}; \
}

#endif // AWQ_MARLIN_REPACK_H
197 changes: 197 additions & 0 deletions src/infiniop/ops/awq_marlin_repack/cuda/kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
#include "../marlin/marlin.cuh"

namespace marlin {

// Repacks a packed quantized weight matrix tile by tile into `out_ptr`.
//
// Template parameters:
//   num_threads - not referenced in the body below (presumably the block size
//                 chosen by the launcher; confirm at the call site).
//   num_bits    - bits per quantized value; pack_factor = 32 / num_bits values
//                 are stored per uint32_t. The body has a dedicated path for
//                 num_bits == 4 and treats every other value with the
//                 byte-wide (shift-by-8) path.
//   is_a_8bit   - when true the n tile is halved and the k tile is doubled,
//                 and a different source/destination ordering is used.
//
// Parameters:
//   b_q_weight_ptr - input words, addressed as
//                    row * (size_n / pack_factor) + packed_column.
//   out_ptr        - output words; each (k_tile, n_tile) pair owns a
//                    contiguous run of `tile_size` words.
//   size_k, size_n - logical matrix extents. Tile counts are computed with
//                    integer division, so any remainder rows/columns are not
//                    processed by this function.
//
// Execution notes grounded in the body:
//   * Declared __device__ but reads blockIdx/gridDim/threadIdx directly, so it
//     is presumably invoked from a __global__ wrapper (confirm).
//   * k tiles are partitioned across blockIdx.x in contiguous chunks of
//     div_ceil(k_tiles, gridDim.x).
//   * Uses the dynamic shared-memory array `sh`, indexed up to
//     repack_stages * stage_size int4 elements; the launcher must provide it.
//   * The first `stage_size` threads perform the global->shared copies and
//     only warps 0..3 perform the repack, while every thread reaches the
//     __syncthreads() in wait_for_stage.
template <int const num_threads, int const num_bits, bool is_a_8bit>
__device__ void awq_marlin_repack_kernel(
uint32_t const *__restrict__ b_q_weight_ptr, uint32_t *__restrict__ out_ptr,
int size_k, int size_n) {
// Number of quantized values stored in one 32-bit word.
constexpr int pack_factor = 32 / num_bits;

// Tile shape actually used by this instantiation (n halved / k doubled when is_a_8bit).
constexpr int target_tile_n_size = tile_n_size / (is_a_8bit ? 2 : 1);
constexpr int target_tile_k_size = tile_k_size * (is_a_8bit ? 2 : 1);
int k_tiles = size_k / target_tile_k_size;
int n_tiles = size_n / target_tile_n_size;
// Number of consecutive k tiles assigned to each block.
int block_k_tiles = div_ceil(k_tiles, gridDim.x);

auto start_k_tile = blockIdx.x * block_k_tiles;
// Blocks whose chunk starts past the last k tile have nothing to do.
if (start_k_tile >= k_tiles) {
return;
}

int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles);

// Wait until the next thread tile has been loaded to shared memory.
auto wait_for_stage = [&]() {
// We only have `stages - 2` active fetches since we are double buffering
// and can only issue the next fetch when it is guaranteed that the previous
// shared memory load is fully complete (as it may otherwise be
// overwritten).
cp_async_wait<repack_stages - 2>();
__syncthreads();
};

// Dynamic shared memory holding the pipeline stages (stage_size int4 each).
extern __shared__ int4 sh[];

// 32-bit words per tile row.
constexpr int tile_n_ints = target_tile_n_size / pack_factor;

// Each fetching thread copies one int4 (four 32-bit words) of one tile row.
constexpr int stage_n_threads = tile_n_ints / 4;
constexpr int stage_k_threads = target_tile_k_size;
constexpr int stage_size = stage_k_threads * stage_n_threads;

// Issues the asynchronous copy of tile (k_tile_id, n_tile_id) into pipeline slot `pipe`.
auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
// Out-of-range tile: no copy, but a fence is still issued (presumably so the
// number of committed copy groups stays the same on every call -- confirm
// against cp_async_fence/cp_async_wait in marlin.cuh).
if (n_tile_id >= n_tiles) {
cp_async_fence();
return;
}

int first_n = n_tile_id * target_tile_n_size;
int first_n_packed = first_n / pack_factor;

int4 *sh_ptr = sh + stage_size * pipe;

// Only the first stage_size threads take part in the copy.
if (threadIdx.x < stage_size) {
auto k_id = threadIdx.x / stage_n_threads;
auto n_id = threadIdx.x % stage_n_threads;

int first_k = k_tile_id * target_tile_k_size;

cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
reinterpret_cast<int4 const *>(
&(b_q_weight_ptr[(first_k + k_id) * (size_n / pack_factor) + first_n_packed + (n_id * 4)])));
}

cp_async_fence();
};

// Reads the tile staged in slot `pipe` and writes its repacked words to out_ptr.
auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) {
if (n_tile_id >= n_tiles) {
return;
}

auto warp_id = threadIdx.x / 32;
auto th_id = threadIdx.x % 32;

// Only warps 0..3 repack. This `return` leaves the lambda only, so the
// caller's subsequent wait_for_stage() barrier is still reached by all threads.
if (warp_id >= 4) {
return;
}

// Per-lane position inside the tile: 8 columns x 4 row groups per warp.
int tc_col = th_id / 4;
int tc_row = (th_id % 4) * (is_a_8bit ? 4 : 2);

// Row offsets gathered by one lane in the non-8-bit path.
constexpr int tc_offsets[4] = {0, 1, 8, 9};

int cur_n = (warp_id / (is_a_8bit ? 2 : 1)) * 16 + tc_col;
int cur_n_packed = cur_n / pack_factor;
int cur_n_pos = cur_n % pack_factor;

constexpr int sh_stride = tile_n_ints;
// Low num_bits bits set; extracts one quantized value after shifting.
constexpr uint32_t mask = (1 << num_bits) - 1;

int4 *sh_stage_ptr = sh + stage_size * pipe;
uint32_t *sh_stage_int_ptr = reinterpret_cast<uint32_t *>(sh_stage_ptr);

// Undo interleaving
int cur_n_pos_unpacked;
if constexpr (num_bits == 4) {
constexpr int undo_pack[8] = {0, 4, 1, 5, 2, 6, 3, 7};
cur_n_pos_unpacked = undo_pack[cur_n_pos];
} else {
constexpr int undo_pack[4] = {0, 2, 1, 3};
cur_n_pos_unpacked = undo_pack[cur_n_pos];
}

// vals[0..3] and vals[4..7] hold the values taken from the two source words read per step.
uint32_t vals[8];
#pragma unroll
for (int i = 0; i < 4; i++) {
if constexpr (is_a_8bit) {
// Four consecutive rows; the second word comes from 16 rows further down.
int cur_elem = tc_row + i;

int packed_src_0 = sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) * (warp_id % 2) + sh_stride * cur_elem];
int packed_src_1 = sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) * (warp_id % 2) + sh_stride * (cur_elem + 16)];

vals[i] = (packed_src_0 >> (cur_n_pos_unpacked * num_bits)) & mask;
vals[4 + i] = (packed_src_1 >> (cur_n_pos_unpacked * num_bits)) & mask;
} else {
// Rows selected through tc_offsets; the second word is 8 columns to the right.
int cur_elem = tc_row + tc_offsets[i];

int packed_src_0 = sh_stage_int_ptr[cur_n_packed + sh_stride * cur_elem];
int packed_src_1 = sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) + sh_stride * cur_elem];

vals[i] = (packed_src_0 >> (cur_n_pos_unpacked * num_bits)) & mask;
vals[4 + i] = (packed_src_1 >> (cur_n_pos_unpacked * num_bits)) & mask;
}
}

// Output words per tile, and the start of this tile's run in out_ptr.
constexpr int tile_size = target_tile_k_size * target_tile_n_size / pack_factor;
int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;

// Result of:
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
if constexpr (!is_a_8bit && num_bits == 4) {
// 4-bit values: eight nibbles packed into one output word per lane.
int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};

uint32_t res = 0;
#pragma unroll
for (int i = 0; i < 8; i++) {
res |= vals[pack_idx[i]] << (i * 4);
}

out_ptr[out_offset + th_id * 4 + warp_id] = res;

} else if constexpr (is_a_8bit && num_bits == 4) {
// Same packing width as above but with a different nibble order.
int pack_idx[8] = {0, 4, 1, 5, 2, 6, 3, 7};

uint32_t res = 0;
#pragma unroll
for (int i = 0; i < 8; i++) {
res |= vals[pack_idx[i]] << (i * 4);
}

out_ptr[out_offset + th_id * 4 + warp_id] = res;

} else {
// Byte-wide values: two output words per lane, four bytes each.
constexpr int pack_idx[4] = {0, 2, 1, 3};

uint32_t res1 = 0;
uint32_t res2 = 0;
#pragma unroll
for (int i = 0; i < 4; i++) {
// The index permutation is applied only when is_a_8bit is false.
const int ii = is_a_8bit ? i : pack_idx[i];
res1 |= vals[ii] << (i * 8);
res2 |= vals[4 + ii] << (i * 8);
}

out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2;
}
};

// Primes the pipeline: issues the first repack_stages - 1 fetches, then waits.
auto start_pipes = [&](int k_tile_id, int n_tile_id) {
#pragma unroll
for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
}

wait_for_stage();
};
#pragma unroll
for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
int n_tile_id = 0;

start_pipes(k_tile_id, n_tile_id);

// Steady state: each step issues the fetch for the tile repack_stages - 1
// ahead, repacks the tile in slot `pipe`, then waits/synchronizes.
// n_tile_id advances by repack_stages; tiles beyond n_tiles are skipped
// inside fetch_to_shared / repack_tile.
while (n_tile_id < n_tiles) {
#pragma unroll
for (int pipe = 0; pipe < repack_stages; pipe++) {
fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id,
n_tile_id + pipe + repack_stages - 1);
repack_tile(pipe, k_tile_id, n_tile_id + pipe);
wait_for_stage();
}
n_tile_id += repack_stages;
}
}
}

} // namespace marlin
49 changes: 49 additions & 0 deletions src/infiniop/ops/awq_marlin_repack/info.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#ifndef __AWQ_MARLIN_REPACK_INFO_H__
#define __AWQ_MARLIN_REPACK_INFO_H__

#include "../../../utils.h"
#include "../../tensor.h"
#include "marlin/marlin.cuh"
#include <vector>

#include <cassert>

namespace op::awq_marlin_repack {

/**
 * Validated parameters of one AWQ -> Marlin repack operation.
 *
 * Instances are produced by create(), which checks the tensor descriptors and
 * derives the logical matrix extents from the packed input shape.
 */
class AwqMarlinRepackInfo {
    AwqMarlinRepackInfo() = default;

public:
    // Element types of the output and input tensors (create() requires both to be I32).
    infiniDtype_t output_dtype, input_dtype;
    // Logical extents: size_k = input dim 0, size_n = input dim 1 * (32 / num_bits).
    size_t size_k, size_n;
    // Bit width of one quantized value (4 or 8).
    int64_t num_bits;
    // Forwarded unchanged from the caller; selects the kernel's alternative layout.
    bool is_a_8bit;

    /**
     * Validates the descriptors and builds the info object.
     *
     * @param output_desc Descriptor of the repacked output; must be 2-D with
     *                    shape {size_k / tile_size, size_n * tile_size / pack_factor}.
     * @param input_desc  Descriptor of the packed input; must be 2-D with
     *                    shape {size_k, size_n / pack_factor}.
     * @param num_bits    Bits per quantized value; only 4 and 8 are accepted.
     * @param is_a_8bit   Stored as-is for the kernel launcher.
     * @return The populated info on success, otherwise a status:
     *         INFINI_STATUS_NULL_POINTER     - a descriptor is null,
     *         INFINI_STATUS_BAD_PARAM        - unsupported num_bits,
     *         INFINI_STATUS_BAD_TENSOR_SHAPE - wrong rank or output shape,
     *         or whatever CHECK_DTYPE reports for a dtype mismatch.
     */
    static utils::Result<AwqMarlinRepackInfo> create(
        infiniopTensorDescriptor_t output_desc,
        infiniopTensorDescriptor_t input_desc,
        int64_t num_bits,
        bool is_a_8bit) {
        CHECK_OR_RETURN(
            output_desc != nullptr && input_desc != nullptr,
            INFINI_STATUS_NULL_POINTER);

        // num_bits is used as a divisor below, so reject 0 (division by zero)
        // and every width the repack kernel has no lookup tables for.
        CHECK_OR_RETURN(
            num_bits == 4 || num_bits == 8,
            INFINI_STATUS_BAD_PARAM);

        const infiniDtype_t output_dtype = output_desc->dtype();
        const infiniDtype_t input_dtype = input_desc->dtype();
        CHECK_DTYPE(input_dtype, INFINI_DTYPE_I32);
        CHECK_DTYPE(input_dtype, output_dtype);

        // dim(0) and dim(1) are read below; make sure they exist.
        CHECK_OR_RETURN(
            input_desc->ndim() == 2 && output_desc->ndim() == 2,
            INFINI_STATUS_BAD_TENSOR_SHAPE);

        size_t size_k = input_desc->dim(0);
        int const pack_factor = 32 / num_bits;
        size_t size_n = input_desc->dim(1) * pack_factor;

        // Both output dimensions must match the expected repacked shape
        // (the previous `||` accepted a shape with only one matching dimension).
        CHECK_OR_RETURN(
            size_k / marlin::tile_size == output_desc->dim(0)
                && size_n * marlin::tile_size / pack_factor == output_desc->dim(1),
            INFINI_STATUS_BAD_TENSOR_SHAPE);

        return utils::Result<AwqMarlinRepackInfo>(
            AwqMarlinRepackInfo{output_dtype, input_dtype, size_k, size_n, num_bits, is_a_8bit});
    }
};

} // namespace op::awq_marlin_repack

#endif // __AWQ_MARLIN_REPACK_INFO_H__
Loading
Loading