InfiniTensor · xgqdut2016 · Jun 4, 2026
diff --git a/include/infiniop/ops/awq_marlin_repack.h b/include/infiniop/ops/awq_marlin_repack.h
@@ -0,0 +1,27 @@
+#ifndef __INFINIOP_AWQ_MARLIN_REPACK_API_H__
+#define __INFINIOP_AWQ_MARLIN_REPACK_API_H__
+
+#include "../operator_descriptor.h"
+#include <cstdint>
+
+typedef struct InfiniopDescriptor *infiniopAwqMarlinRepackDescriptor_t; // handle type used by every call below
+// Create: receives both tensor descriptors plus the quantization parameters and reports the new descriptor through *desc_ptr.
+__INFINI_C __export infiniStatus_t infiniopCreateAwqMarlinRepackDescriptor(infiniopHandle_t handle,
+                                                                           infiniopAwqMarlinRepackDescriptor_t *desc_ptr,
+                                                                           infiniopTensorDescriptor_t output_desc,
+                                                                           infiniopTensorDescriptor_t input_desc,
+                                                                           int64_t num_bits, // bits per quantized weight; 32 / num_bits values share one 32-bit word
+                                                                           bool is_a_8bit);  // NOTE(review): bool and <cstdint> are C++-only in an __INFINI_C header; confirm C callers are not expected
+// Workspace query: *size is presumably the byte count to pass as workspace_size to infiniopAwqMarlinRepack (TODO confirm).
+__INFINI_C __export infiniStatus_t infiniopGetAwqMarlinRepackWorkspaceSize(infiniopAwqMarlinRepackDescriptor_t desc, size_t *size);
+// Run: `input` is read-only and `output` is writable; `stream` is an untyped handle, presumably the device stream (TODO confirm).
+__INFINI_C __export infiniStatus_t infiniopAwqMarlinRepack(infiniopAwqMarlinRepackDescriptor_t desc,
+                                                           void *workspace,
+                                                           size_t workspace_size,
+                                                           void *output,
+                                                           const void *input,
+                                                           void *stream);
+// Destroy: counterpart of infiniopCreateAwqMarlinRepackDescriptor for a descriptor that is no longer needed.
+__INFINI_C __export infiniStatus_t infiniopDestroyAwqMarlinRepackDescriptor(infiniopAwqMarlinRepackDescriptor_t desc);
+
+#endif
diff --git a/src/infiniop/ops/awq_marlin_repack/awq_marlin_repack.h b/src/infiniop/ops/awq_marlin_repack/awq_marlin_repack.h
@@ -0,0 +1,48 @@
+#ifndef AWQ_MARLIN_REPACK_H
+#define AWQ_MARLIN_REPACK_H
+
+#include "../../operator.h"
+#include "info.h"
+
+#define DESCRIPTOR(NAMESPACE)                                    \
+    /* Backend-specific awq_marlin_repack Descriptor class. */   \
+    namespace op::awq_marlin_repack::NAMESPACE {                 \
+    class Descriptor final : public InfiniopDescriptor {         \
+        struct Opaque;                                           \
+        Opaque *_opaque;                                         \
+        AwqMarlinRepackInfo _info;                               \
+        size_t _workspace_size;                                  \
+        /* Private ctor; public entry point is create(). */      \
+        Descriptor(Opaque *backend_state,                        \
+                   AwqMarlinRepackInfo repack_info,              \
+                   size_t workspace_bytes,                       \
+                   infiniDevice_t dev_type,                      \
+                   int dev_id)                                   \
+            : InfiniopDescriptor{dev_type, dev_id},              \
+              _opaque(backend_state),                            \
+              _info(repack_info),                                \
+              _workspace_size(workspace_bytes) {}                \
+                                                                 \
+    public:                                                      \
+        ~Descriptor();                                           \
+        /* Workspace size stored at construction. */             \
+        size_t workspaceSize() const { return _workspace_size; } \
+        /* Declaration only; the body is defined elsewhere. */   \
+        static infiniStatus_t create(                            \
+            infiniopHandle_t handle,                             \
+            Descriptor **desc_ptr,                               \
+            infiniopTensorDescriptor_t output_desc,              \
+            infiniopTensorDescriptor_t input_desc,               \
+            int64_t num_bits,                                    \
+            bool is_a_8bit);                                     \
+        /* Also declaration-only; callable on a const object. */ \
+        infiniStatus_t calculate(                                \
+            void *workspace,                                     \
+            size_t workspace_size,                               \
+            void *output,                                        \
+            const void *input,                                   \
+            void *stream) const;                                 \
+    };                                                           \
+    }
+
+#endif // AWQ_MARLIN_REPACK_H
diff --git a/src/infiniop/ops/awq_marlin_repack/cuda/kernel.cuh b/src/infiniop/ops/awq_marlin_repack/cuda/kernel.cuh
@@ -0,0 +1,197 @@
+#include "../marlin/marlin.cuh"
+
+namespace marlin {
+// Repacks packed quantized weights from b_q_weight_ptr into per-tile 32-bit words in out_ptr. Declared __device__ yet indexed by gridDim/blockIdx, so a __global__ wrapper (not shown here) has to call it.
+template <int const num_threads, int const num_bits, bool is_a_8bit> // NOTE(review): num_threads is not referenced anywhere in the body
+__device__ void awq_marlin_repack_kernel(
+    uint32_t const *__restrict__ b_q_weight_ptr, uint32_t *__restrict__ out_ptr,
+    int size_k, int size_n) {
+    constexpr int pack_factor = 32 / num_bits; // quantized values per 32-bit word
+    // With is_a_8bit the tile is half as wide in N and twice as tall in K; partial tiles are dropped by the integer divisions.
+    constexpr int target_tile_n_size = tile_n_size / (is_a_8bit ? 2 : 1);
+    constexpr int target_tile_k_size = tile_k_size * (is_a_8bit ? 2 : 1);
+    int k_tiles = size_k / target_tile_k_size;
+    int n_tiles = size_n / target_tile_n_size;
+    int block_k_tiles = div_ceil(k_tiles, gridDim.x);
+    // Each block handles block_k_tiles consecutive K tiles; blocks whose range starts past the end return at once.
+    auto start_k_tile = blockIdx.x * block_k_tiles;
+    if (start_k_tile >= k_tiles) {
+        return;
+    }
+
+    int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles);
+
+    // Wait until the next thread tile has been loaded to shared memory.
+    auto wait_for_stage = [&]() {
+        // We only have `stages - 2` active fetches since we are double buffering
+        // and can only issue the next fetch when it is guaranteed that the previous
+        // shared memory load is fully complete (as it may otherwise be
+        // overwritten).
+        cp_async_wait<repack_stages - 2>();
+        __syncthreads();
+    };
+
+    extern __shared__ int4 sh[]; // staging area; buffer `pipe` begins at sh + stage_size * pipe
+
+    constexpr int tile_n_ints = target_tile_n_size / pack_factor; // 32-bit words per tile row
+    // One thread copies one int4 (four 32-bit words) per stage, so a stage occupies stage_size threads.
+    constexpr int stage_n_threads = tile_n_ints / 4;
+    constexpr int stage_k_threads = target_tile_k_size;
+    constexpr int stage_size = stage_k_threads * stage_n_threads;
+    // Starts the copy of tile (k_tile_id, n_tile_id) into buffer `pipe`; past the last N tile it only calls cp_async_fence().
+    auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
+        if (n_tile_id >= n_tiles) {
+            cp_async_fence();
+            return;
+        }
+
+        int first_n = n_tile_id * target_tile_n_size;
+        int first_n_packed = first_n / pack_factor;
+
+        int4 *sh_ptr = sh + stage_size * pipe;
+
+        if (threadIdx.x < stage_size) {
+            auto k_id = threadIdx.x / stage_n_threads;
+            auto n_id = threadIdx.x % stage_n_threads;
+
+            int first_k = k_tile_id * target_tile_k_size;
+            // Source rows are size_n / pack_factor words long; each thread moves the four words at column n_id * 4 of its row.
+            cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
+                      reinterpret_cast<int4 const *>(
+                          &(b_q_weight_ptr[(first_k + k_id) * (size_n / pack_factor) + first_n_packed + (n_id * 4)])));
+        }
+
+        cp_async_fence();
+    };
+    // Reorders the tile staged in buffer `pipe` and stores the packed result to out_ptr; out-of-range N tiles are skipped.
+    auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) {
+        if (n_tile_id >= n_tiles) {
+            return;
+        }
+
+        auto warp_id = threadIdx.x / 32;
+        auto th_id = threadIdx.x % 32;
+
+        if (warp_id >= 4) { // only the first four warps repack
+            return;
+        }
+
+        int tc_col = th_id / 4;
+        int tc_row = (th_id % 4) * (is_a_8bit ? 4 : 2);
+
+        constexpr int tc_offsets[4] = {0, 1, 8, 9};
+
+        int cur_n = (warp_id / (is_a_8bit ? 2 : 1)) * 16 + tc_col;
+        int cur_n_packed = cur_n / pack_factor;
+        int cur_n_pos = cur_n % pack_factor;
+
+        constexpr int sh_stride = tile_n_ints;
+        constexpr uint32_t mask = (1 << num_bits) - 1; // low num_bits bits set
+
+        int4 *sh_stage_ptr = sh + stage_size * pipe;
+        uint32_t *sh_stage_int_ptr = reinterpret_cast<uint32_t *>(sh_stage_ptr);
+
+        // Undo interleaving: translate the position inside the word through a fixed permutation table.
+        int cur_n_pos_unpacked;
+        if constexpr (num_bits == 4) {
+            constexpr int undo_pack[8] = {0, 4, 1, 5, 2, 6, 3, 7};
+            cur_n_pos_unpacked = undo_pack[cur_n_pos];
+        } else {
+            constexpr int undo_pack[4] = {0, 2, 1, 3};
+            cur_n_pos_unpacked = undo_pack[cur_n_pos];
+        }
+        // Gather eight values per thread: vals[0..3] from the first source word of each step, vals[4..7] from the second.
+        uint32_t vals[8];
+#pragma unroll
+        for (int i = 0; i < 4; i++) {
+            if constexpr (is_a_8bit) {
+                int cur_elem = tc_row + i;
+                // The second word sits 16 rows below the first; odd warps read 8 values further along N.
+                int packed_src_0 = sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) * (warp_id % 2) + sh_stride * cur_elem];
+                int packed_src_1 = sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) * (warp_id % 2) + sh_stride * (cur_elem + 16)];
+
+                vals[i] = (packed_src_0 >> (cur_n_pos_unpacked * num_bits)) & mask;
+                vals[4 + i] = (packed_src_1 >> (cur_n_pos_unpacked * num_bits)) & mask;
+            } else {
+                int cur_elem = tc_row + tc_offsets[i];
+                // Same row for both words; the second one is 8 values further along N.
+                int packed_src_0 = sh_stage_int_ptr[cur_n_packed + sh_stride * cur_elem];
+                int packed_src_1 = sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) + sh_stride * cur_elem];
+
+                vals[i] = (packed_src_0 >> (cur_n_pos_unpacked * num_bits)) & mask;
+                vals[4 + i] = (packed_src_1 >> (cur_n_pos_unpacked * num_bits)) & mask;
+            }
+        }
+        // Every (k_tile, n_tile) pair owns tile_size consecutive 32-bit words of out_ptr.
+        constexpr int tile_size = target_tile_k_size * target_tile_n_size / pack_factor;
+        int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
+
+        // Result of:
+        // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+        if constexpr (!is_a_8bit && num_bits == 4) {
+            int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+
+            uint32_t res = 0;
+#pragma unroll
+            for (int i = 0; i < 8; i++) {
+                res |= vals[pack_idx[i]] << (i * 4);
+            }
+            // 4-bit: all eight values fit one word, so each thread stores a single word.
+            out_ptr[out_offset + th_id * 4 + warp_id] = res;
+
+        } else if constexpr (is_a_8bit && num_bits == 4) {
+            int pack_idx[8] = {0, 4, 1, 5, 2, 6, 3, 7};
+
+            uint32_t res = 0;
+#pragma unroll
+            for (int i = 0; i < 8; i++) {
+                res |= vals[pack_idx[i]] << (i * 4);
+            }
+
+            out_ptr[out_offset + th_id * 4 + warp_id] = res;
+
+        } else {
+            constexpr int pack_idx[4] = {0, 2, 1, 3};
+            // Remaining case packs four values per word at 8-bit spacing, so each thread stores two words.
+            uint32_t res1 = 0;
+            uint32_t res2 = 0;
+#pragma unroll
+            for (int i = 0; i < 4; i++) {
+                const int ii = is_a_8bit ? i : pack_idx[i];
+                res1 |= vals[ii] << (i * 8);
+                res2 |= vals[4 + ii] << (i * 8);
+            }
+
+            out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
+            out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2;
+        }
+    };
+    // Issues the fetches for the first repack_stages - 1 N tiles of a K tile, then calls wait_for_stage().
+    auto start_pipes = [&](int k_tile_id, int n_tile_id) {
+#pragma unroll
+        for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
+            fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
+        }
+
+        wait_for_stage();
+    };
+#pragma unroll
+    for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
+        int n_tile_id = 0;
+
+        start_pipes(k_tile_id, n_tile_id);
+        // Main loop: fetch tile n + repack_stages - 1 into the buffer preceding `pipe` (mod repack_stages) while repacking tile n from `pipe`.
+        while (n_tile_id < n_tiles) {
+#pragma unroll
+            for (int pipe = 0; pipe < repack_stages; pipe++) {
+                fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id,
+                                n_tile_id + pipe + repack_stages - 1);
+                repack_tile(pipe, k_tile_id, n_tile_id + pipe);
+                wait_for_stage();
+            }
+            n_tile_id += repack_stages;
+        }
+    }
+}
+
+} // namespace marlin
diff --git a/src/infiniop/ops/awq_marlin_repack/info.h b/src/infiniop/ops/awq_marlin_repack/info.h
@@ -0,0 +1,49 @@
+#ifndef __AWQ_MARLIN_REPACK_INFO_H__
+#define __AWQ_MARLIN_REPACK_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include "marlin/marlin.cuh"
+#include <vector>
+
+#include <cassert>
+
+namespace op::awq_marlin_repack {
+// Validated shapes, dtypes and parameters for one AWQ -> Marlin repack; instances are obtained from create().
+class AwqMarlinRepackInfo {
+    AwqMarlinRepackInfo() = default;
+
+public:
+    infiniDtype_t output_dtype, input_dtype;
+    size_t size_k, size_n; // unpacked extents; the input tensor is [size_k, size_n / pack_factor]
+    int64_t num_bits;      // bits per quantized weight, 4 or 8
+    bool is_a_8bit;        // forwarded to the kernel, where it selects the tile shape
+    // Validates the descriptors and parameters; yields the info, or the status of the first failed check.
+    static utils::Result<AwqMarlinRepackInfo> create(
+        infiniopTensorDescriptor_t output_desc,
+        infiniopTensorDescriptor_t input_desc,
+        int64_t num_bits,
+        bool is_a_8bit) {
+        CHECK_OR_RETURN(output_desc != nullptr && input_desc != nullptr, INFINI_STATUS_NULL_POINTER);
+        const infiniDtype_t output_dtype = output_desc->dtype();
+        const infiniDtype_t input_dtype = input_desc->dtype();
+        CHECK_DTYPE(input_dtype, INFINI_DTYPE_I32);
+        CHECK_DTYPE(input_dtype, output_dtype);
+        // Only 4- and 8-bit weights have a repack path; this also keeps 32 / num_bits from dividing by zero.
+        CHECK_OR_RETURN(num_bits == 4 || num_bits == 8, INFINI_STATUS_BAD_PARAM);
+        CHECK_OR_RETURN(input_desc->ndim() == 2 && output_desc->ndim() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
+        size_t size_k = input_desc->dim(0);
+        int const pack_factor = 32 / num_bits;
+        size_t size_n = input_desc->dim(1) * pack_factor;
+        // The kernel counts tiles with integer division, so extents that leave a remainder would be partly skipped.
+        const size_t tile_k = marlin::tile_k_size * (is_a_8bit ? 2 : 1), tile_n = marlin::tile_n_size / (is_a_8bit ? 2 : 1);
+        CHECK_OR_RETURN(size_k % tile_k == 0 && size_n % tile_n == 0, INFINI_STATUS_BAD_TENSOR_SHAPE);
+        // Both output extents must match; joining the comparisons with `||` would accept one wrong dimension.
+        CHECK_OR_RETURN(size_k / marlin::tile_size == output_desc->dim(0) && size_n * marlin::tile_size / pack_factor == output_desc->dim(1), INFINI_STATUS_BAD_TENSOR_SHAPE);
+        return utils::Result<AwqMarlinRepackInfo>(AwqMarlinRepackInfo{output_dtype, input_dtype, size_k, size_n, num_bits, is_a_8bit});
+    }
+};
+
+} // namespace op::awq_marlin_repack
+
+#endif // __AWQ_MARLIN_REPACK_INFO_H__