Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/ContribOperators.md
Original file line number Diff line number Diff line change
Expand Up @@ -4942,6 +4942,8 @@
<dd>The limit used to clamp inputs in SwiGLU. It is infinite when limit is not provided.</dd>
<dt><tt>use_sparse_mixer</tt> : int</dt>
<dd>Whether to use sparse mixer</dd>
<dt><tt>weights_prepacked</tt> : int</dt>
<dd>Only meaningful when quant_type='int'. Tri-state control over whether the int4/int8 fc1/fc2 weight initializers are already laid out in the CUTLASS fpA_intB format expected by the runner. -1 (auto): let the execution provider choose its own backward-compatible default; the CUDA EP treats auto as prepacked. 1: the initializers are already prepacked (e.g. produced offline by pack_weights_for_cuda_mixed_gemm) and are consumed as-is. 0: the initializers are raw, un-prepacked [E, N, K/pack] tensors as produced by quantize_matmul_{4,8}bits; the kernel runs the CUTLASS layout transform itself in PrePack(), matching the behaviour of MatMulNBits and removing the offline pre-pack requirement from exporters. Defaults to -1 (auto) so each execution provider can pick its own backward-compatible default rather than the schema imposing one.</dd>

Check warning on line 4946 in docs/ContribOperators.md

View workflow job for this annotation

GitHub Actions / Optional Lint

[misspell] reported by reviewdog 🐶 "behaviour" is a misspelling of "behavior" Raw Output: ./docs/ContribOperators.md:4946:624: "behaviour" is a misspelling of "behavior"
</dl>

#### Inputs (6 - 21)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ void preprocess_weights_for_mixed_gemm_cuda(cudaStream_t stream,
int8_t* row_major_quantized_weight,
int32_t* d_permutation_map,
std::vector<size_t> const& shape,
QuantType quant_type);
QuantType quant_type,
bool synchronize = true);

} // namespace weight_only
} // namespace kernels
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,8 @@ void preprocess_weights_for_mixed_gemm_cuda(cudaStream_t stream,
int8_t* row_major_quantized_weight,
int32_t* d_permutation_map,
std::vector<size_t> const& shape,
QuantType quant_type) {
QuantType quant_type,
bool synchronize) {
LayoutDetails details = getLayoutDetailsForTransform(quant_type, arch);

ORT_ENFORCE(shape.size() == 2 || shape.size() == 3, "Shape must be 2-D or 3-D");
Expand Down Expand Up @@ -576,9 +577,17 @@ void preprocess_weights_for_mixed_gemm_cuda(cudaStream_t stream,
ORT_ENFORCE(copy_err == cudaSuccess, "cudaMemcpyAsync failed: ", cudaGetErrorString(copy_err));
}

// Synchronize the stream to ensure the permutation is complete before row_permutation memory is relased.
auto sync_err = cudaStreamSynchronize(stream);
ORT_ENFORCE(sync_err == cudaSuccess, "cudaStreamSynchronize failed: ", cudaGetErrorString(sync_err));
// Synchronize the stream so that all transform work is complete before the
// caller releases the (transient) scratch buffers. Callers that invoke this
// repeatedly on the same stream (e.g. QMoE looping over experts) can pass
// ``synchronize=false`` to skip the per-call host-blocking sync and issue a
// single ``cudaStreamSynchronize`` once after the final call instead. The
// device permutation source (``kPerm_*``) has static storage duration, so it
// is always safe regardless of when the async copy completes.
if (synchronize) {
auto sync_err = cudaStreamSynchronize(stream);
ORT_ENFORCE(sync_err == cudaSuccess, "cudaStreamSynchronize failed: ", cudaGetErrorString(sync_err));
}
}

} // namespace weight_only
Expand Down
182 changes: 174 additions & 8 deletions onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#include "contrib_ops/cuda/moe/qmoe_kernels.h"
#include "contrib_ops/cuda/llm/common/env_utils.h"
#include "contrib_ops/cuda/llm/common/logger.h"
#include "contrib_ops/cuda/llm/fpA_intB_gemm_adaptor.h"
#include "contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors.h"

#include "contrib_ops/cuda/utils/dump_cuda_tensor.h"
#include "contrib_ops/cpu/utils/debug_macros.h"
Expand Down Expand Up @@ -60,6 +62,19 @@ QMoE::QMoE(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info), MoE
this->quant_type_ = op_kernel_info.GetAttrOrDefault<std::string>("quant_type", "int");
ORT_ENFORCE(quant_type_ == "int" || quant_type_ == "fp4" || quant_type_ == "fp8" || quant_type_ == "wfp4afp8",
"quant_type must be 'int', 'fp4', 'fp8', or 'wfp4afp8', but got '", quant_type_, "'");
// ``weights_prepacked`` is an optional tri-state attribute that defaults to
// -1 (auto) in the schema, so each EP picks its own backward-compatible
// default rather than the schema imposing one:
// -1 (auto, also the schema default): the EP decides. The CUDA EP's
// backward-compatible default is "prepacked" because all pre-existing
// tooling ships CUTLASS-prepacked weights.
// 1: initializers are already prepacked; the compute path reads them as-is.
// 0: initializers are raw [E, N, K/pack]; the PrePack hook lays them out.
const int64_t weights_prepacked_mode =
op_kernel_info.GetAttrOrDefault<int64_t>("weights_prepacked", static_cast<int64_t>(-1));
ORT_ENFORCE(weights_prepacked_mode == -1 || weights_prepacked_mode == 0 || weights_prepacked_mode == 1,
"weights_prepacked must be -1 (auto), 0, or 1, but got ", weights_prepacked_mode);
weights_prepacked_ = (weights_prepacked_mode != 0);
#if !defined(ENABLE_FP4) || !defined(USE_FP4_QMOE)
ORT_ENFORCE(quant_type_ != "fp4", "QMoE quant_type='fp4' requires USE_FP4_QMOE with CUDA 12.8 or newer.");
ORT_ENFORCE(quant_type_ != "wfp4afp8",
Expand Down Expand Up @@ -199,10 +214,20 @@ Status QMoE::ComputeInternal(OpKernelContext* context) const {
const bool uses_global_weight_scales = is_fp4 || is_fp8 || is_wfp4afp8;
const Tensor* input = context->Input<Tensor>(0);
const Tensor* router_probs = context->Input<Tensor>(1);
const Tensor* fc1_experts_weights = context->Input<Tensor>(2);
// When PrePack consumed the int4/int8 expert-weight initializers
// (``weights_prepacked == false`` opt-in path), the original tensors
// were freed; ``context->Input<Tensor>(2)/(5)`` would return nothing.
// Mirror how ``MatMulNBits`` reads its prepacked B input.
// Gate on *both* prepacked buffers being present. If only fc1 were prepacked
// (e.g. a partial prepack from an earlier failure or a future refactor), this
// path must not null out fc2_experts_weights and feed a null fc2 weight/shape
// to the runner.
const bool int_weights_consumed_by_prepack =
is_int && !weights_prepacked_ && packed_fc1_weights_ != nullptr && packed_fc2_weights_ != nullptr;
Comment thread
tianleiwu marked this conversation as resolved.
const Tensor* fc1_experts_weights = int_weights_consumed_by_prepack ? nullptr : context->Input<Tensor>(2);
Comment thread
justinchuby marked this conversation as resolved.
Comment thread
tianleiwu marked this conversation as resolved.
const Tensor* fc1_scales = (is_int && !packed_fc1_scales_) ? context->Input<Tensor>(3) : nullptr;
const Tensor* fc1_experts_bias_optional = context->Input<Tensor>(4);
const Tensor* fc2_experts_weights = context->Input<Tensor>(5);
const Tensor* fc2_experts_weights = int_weights_consumed_by_prepack ? nullptr : context->Input<Tensor>(5);
const Tensor* fc2_scales = (is_int && !packed_fc2_scales_) ? context->Input<Tensor>(6) : nullptr;
const Tensor* fc2_experts_bias_optional = context->Input<Tensor>(7);
// The CUTLASS MoE runner has no separate FC3 GEMM — gate and up projection weights must be
Expand All @@ -224,8 +249,13 @@ Status QMoE::ComputeInternal(OpKernelContext* context) const {
return Status::OK();
};

ORT_RETURN_IF_ERROR(check_weight_type(fc1_experts_weights, "fc1_experts_weights", is_fp8));
ORT_RETURN_IF_ERROR(check_weight_type(fc2_experts_weights, "fc2_experts_weights", is_fp8));
// When PrePack consumed the int weight initializers, the dtype check
// is no longer applicable (we know they were uint8 — that's what
// PrePackIntExpertWeights validated and consumed).
if (!int_weights_consumed_by_prepack) {
ORT_RETURN_IF_ERROR(check_weight_type(fc1_experts_weights, "fc1_experts_weights", is_fp8));
ORT_RETURN_IF_ERROR(check_weight_type(fc2_experts_weights, "fc2_experts_weights", is_fp8));
}
Comment thread
justinchuby marked this conversation as resolved.

// Unified FP4 inputs: block scales in fc*_scales (3/6), global scales in 15/16.
const Tensor* fp4_fc1_block_scales = (uses_fp4_weight_scales && !packed_fp4_fc1_block_scales_) ? context->Input<Tensor>(3) : nullptr;
Expand Down Expand Up @@ -256,10 +286,13 @@ Status QMoE::ComputeInternal(OpKernelContext* context) const {
int64_t pack_size = expert_weight_bits_ == 4 ? 2 : 1;
bool is_fused_swiglu = activation_type_ == onnxruntime::llm::kernels::cutlass_kernels::ActivationType::Swiglu;
MoEParameters moe_params;
// Prefer the cached shapes when PrePack consumed the source initializer.
const TensorShape& fc1_shape = int_weights_consumed_by_prepack ? fc1_weights_shape_ : fc1_experts_weights->Shape();
const TensorShape& fc2_shape = int_weights_consumed_by_prepack ? fc2_weights_shape_ : fc2_experts_weights->Shape();
Comment thread
justinchuby marked this conversation as resolved.
ORT_RETURN_IF_ERROR(onnxruntime::contrib::moe_helper::CheckInputs<Tensor>(
moe_params, input, router_probs, fc1_experts_weights,
moe_params, input, router_probs, &fc1_shape,
fc1_experts_bias_optional, fc1_scales, fc1_zeros,
fc2_experts_weights, fc2_experts_bias_optional, fc2_scales, fc2_zeros,
&fc2_shape, fc2_experts_bias_optional, fc2_scales, fc2_zeros,
nullptr, nullptr, nullptr, nullptr,
pack_size, is_fused_swiglu, block_size_));

Expand Down Expand Up @@ -808,11 +841,22 @@ Status QMoE::ComputeInternal(OpKernelContext* context) const {

Tensor* output = context->Output(0, input->Shape());

const void* fc1_weight_data = fc1_experts_weights->DataRaw();
const void* fc2_weight_data = fc2_experts_weights->DataRaw();
const void* fc1_weight_data = fc1_experts_weights ? fc1_experts_weights->DataRaw() : nullptr;
const void* fc2_weight_data = fc2_experts_weights ? fc2_experts_weights->DataRaw() : nullptr;
if (is_wfp4afp8 && !use_wfp4afp8_dequant_fallback_) {
fc1_weight_data = packed_fp4_fc1_weights_ ? packed_fp4_fc1_weights_.get() : fc1_weight_data;
fc2_weight_data = packed_fp4_fc2_weights_ ? packed_fp4_fc2_weights_.get() : fc2_weight_data;
Comment thread
tianleiwu marked this conversation as resolved.
} else if (int_weights_consumed_by_prepack) {
// PrePack converted the raw int4/int8 weights to the CUTLASS fpA_intB
// layout that the runner consumes and freed the source initializer
// (``is_packed = true``). Gate on ``int_weights_consumed_by_prepack``
// (which already requires ``packed_fc1_weights_ != nullptr``) rather than
// just ``is_int && !weights_prepacked_``: when prepacking is disabled at
// the session level (``session.disable_prepacking``) PrePack never runs,
// the prepack buffers stay null, and the raw initializer pointers read
// above must be kept so the runner is not handed null weight pointers.
fc1_weight_data = packed_fc1_weights_.get();
fc2_weight_data = packed_fc2_weights_.get();
Comment thread
justinchuby marked this conversation as resolved.
}
IAllocatorUniquePtr<void> dequant_fc1_weights;
IAllocatorUniquePtr<void> dequant_fc2_weights;
Expand Down Expand Up @@ -972,6 +1016,19 @@ Status QMoE::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
} else if (input_idx == 5 && quant_type_ == "wfp4afp8" && !use_wfp4afp8_dequant_fallback_) {
PrePackRepackFP4Weights(tensor, stream, alloc, packed_fp4_fc2_weights_, is_packed);
is_packed = false;
} else if (input_idx == 2 && quant_type_ == "int" && !weights_prepacked_) {
// Caller opted in (``weights_prepacked=0`` attribute) to having ORT
// do the CUTLASS fpA_intB layout transform internally, instead of
// shipping pre-prepacked bytes. Mirrors ``MatMulNBits::PrePack_B``
// looped over the E experts of ``[E, N, K/pack]``. We cache the
// source shape in ``fc1_weights_shape_`` so ``CheckInputs`` can be
// satisfied without holding the original initializer alive, then
// set ``is_packed = true`` to let ORT free it.
fc1_weights_shape_ = tensor.Shape();
PrePackIntExpertWeights(tensor, stream, alloc, packed_fc1_weights_, is_packed);
} else if (input_idx == 5 && quant_type_ == "int" && !weights_prepacked_) {
fc2_weights_shape_ = tensor.Shape();
PrePackIntExpertWeights(tensor, stream, alloc, packed_fc2_weights_, is_packed);
} else if (input_idx == 3) { // fc1_scales
DUMP_TENSOR("fc1_scales", tensor);
if (quant_type_ == "wfp4afp8" && !use_wfp4afp8_dequant_fallback_) {
Expand Down Expand Up @@ -1078,6 +1135,115 @@ void QMoE::PrePackCopyToGpu(const Tensor& tensor, cudaStream_t stream, Allocator
is_packed = true;
}

// ---------------------------------------------------------------------------
// PrePack helper: int4/int8 per-expert weights → CUTLASS fpA_intB layout.
// ---------------------------------------------------------------------------
// Mirrors ``MatMulNBits::PrePack_B`` but loops over the leading E (experts)
// dimension. Input ``tensor`` is the row-major 3-D ``[E, N, K/(8/bits)]``
// quantized weight initializer; output is a GPU buffer in the
// kernel-expected ``[E, K, N/(8/bits)]`` layout.
void QMoE::PrePackIntExpertWeights(const Tensor& tensor, cudaStream_t stream, AllocatorPtr alloc,
IAllocatorUniquePtr<void>& packed_buf, bool& is_packed) {
ORT_ENFORCE(expert_weight_bits_ == 4 || expert_weight_bits_ == 8,
"PrePackIntExpertWeights: only 4 and 8 bits are supported, got ", expert_weight_bits_);
const auto& shape = tensor.Shape();
ORT_ENFORCE(shape.NumDimensions() == 3,
"PrePackIntExpertWeights: expected 3-D weight tensor [E, N, K/pack], got ndim=",
shape.NumDimensions());

const int bits = static_cast<int>(expert_weight_bits_);
const int pack_factor = 8 / bits;
const int64_t num_experts = shape[0];
const int64_t n = shape[1];
const int64_t k_packed = shape[2];
const int64_t k = k_packed * pack_factor;

// Weight packing is architecture-aware (see
// docs/contrib_ops/cuda/moe_qmoe.md §7 "Cross-Architecture Packing
// Compatibility"). SM90 (Hopper) uses its own Permuted-Linear layout that
// skips column interleaving, so it is its own compatibility group. Every
// other supported arch — SM75/80/86/89 and SM100/120 (Blackwell) — shares
// the SM80 fpA_intB layout, so they all pack as SM80. SM70 and older lack
// INT8 LDSM and are unsupported. The compute-side runner selects the same
// layout from this clamped arch, so the two cannot drift.
//
// SM75 is passed through unchanged (rather than clamped to 80) even though it
// shares SM80's layout: the compute-side dispatch (getLayoutDetailsForTransform)
// still has a distinct SM75 branch, so mirroring it here avoids confusing a
// reader into thinking prepack and dispatch disagree.
ORT_ENFORCE(sm_ >= 75,
"QMoE int4/int8 weight prepack requires SM75 or newer, got sm=", sm_);
const int packing_sm = (sm_ == 90 || sm_ == 75) ? sm_ : 80;

Comment thread
justinchuby marked this conversation as resolved.
// Per-expert sizes.
const size_t per_expert_bytes = static_cast<size_t>(n) * static_cast<size_t>(k) / pack_factor;
const size_t total_bytes = per_expert_bytes * static_cast<size_t>(num_experts);

// Output buffer holds all E prepacked experts back-to-back in
// [E, K, N/pack_factor] layout.
packed_buf = IAllocator::MakeUniquePtr<void>(alloc, total_bytes, /*use_reserve=*/true);
int8_t* dst_all = reinterpret_cast<int8_t*>(packed_buf.get());

// Two transient per-expert scratch buffers reused across experts.
IAllocatorUniquePtr<void> transposed_scratch =
this->GetTransientScratchBuffer<void>(per_expert_bytes);
int8_t* transposed_scratch_ptr = reinterpret_cast<int8_t*>(transposed_scratch.get());

IAllocatorUniquePtr<void> src_gpu_scratch;
const uint8_t* src_base_gpu = nullptr;
if (tensor.Location().device.Type() == OrtDevice::CPU) {
src_gpu_scratch = this->GetTransientScratchBuffer<void>(total_bytes);
CUDA_CALL_THROW(cudaMemcpyAsync(src_gpu_scratch.get(), tensor.DataRaw(), total_bytes,
cudaMemcpyHostToDevice, stream));
src_base_gpu = reinterpret_cast<const uint8_t*>(src_gpu_scratch.get());
} else {
src_base_gpu = reinterpret_cast<const uint8_t*>(tensor.DataRaw());
}

IAllocatorUniquePtr<int32_t> permutation_map = this->GetTransientScratchBuffer<int32_t>(32);

using onnxruntime::llm::kernels::weight_only::QuantType;
const QuantType quant_type = (bits == 4) ? QuantType::W4_A16 : QuantType::W8_A16;

for (int64_t e = 0; e < num_experts; ++e) {
const uint8_t* src_e = src_base_gpu + static_cast<size_t>(e) * per_expert_bytes;
int8_t* dst_e = dst_all + static_cast<size_t>(e) * per_expert_bytes;

// Step 1: transpose + (for int4) unpack/zero-point bias into the
// transposed-int8 scratch buffer. Mirrors MatMulNBits's PrePack_B.
if (bits == 4) {
onnxruntime::llm::kernels::fpA_intB_gemv::unpack_uint4_transposed_to_int8_direct_cuda(
stream, transposed_scratch_ptr, src_e, static_cast<int>(n), static_cast<int>(k));
} else {
onnxruntime::llm::kernels::fpA_intB_gemv::transpose_uint8_matrix_and_convert_to_int8(
stream, transposed_scratch_ptr, src_e, static_cast<int>(n), static_cast<int>(k));
}

// Step 2: apply the CUTLASS fpA_intB row-permutation / column-interleave /
// bias / pair-interleave transform into the per-expert output slot.
// ``synchronize=false``: avoid one host-blocking ``cudaStreamSynchronize``
// per expert (which would scale model-load time with ``num_experts``).
// Stream ordering guarantees expert e's transform finishes before expert
// e+1 reuses the shared transpose scratch, and a single sync after the loop
// makes the whole batch complete before the scratch buffers are freed.
onnxruntime::llm::kernels::weight_only::preprocess_weights_for_mixed_gemm_cuda(
stream,
packing_sm,
dst_e,
transposed_scratch_ptr,
permutation_map.get(),
{static_cast<size_t>(k), static_cast<size_t>(n)},
quant_type,
/*synchronize=*/false);
}

// Single host-blocking sync after all experts: this guarantees every
// per-expert transform (and the CPU->GPU staging copy above) is complete, so
// the transient scratch buffers are safe to free on return.
CUDA_CALL_THROW(cudaStreamSynchronize(stream));
is_packed = true;
}

// ---------------------------------------------------------------------------
// PrePack helper: Swizzle MXFP block scales for SM120 TMA layout using GPU kernel.
// ---------------------------------------------------------------------------
Expand Down
Loading
Loading