diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index 554ff5a1bf863..0cf15f33ec4ab 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -4942,6 +4942,8 @@ This version of the operator has been available since version 1 of the 'com.micr
The limit used to clamp inputs in SwiGLU. It is infinite when limit is not provided.
use_sparse_mixer : int
Whether to use sparse mixer
+weights_prepacked : int
+Only meaningful when quant_type='int'. Tri-state control over whether the int4/int8 fc1/fc2 weight initializers are already laid out in the CUTLASS fpA_intB format expected by the runner. -1 (auto): let the execution provider choose its own backward-compatible default; the CUDA EP treats auto as prepacked. 1: the initializers are already prepacked (e.g. produced offline by pack_weights_for_cuda_mixed_gemm) and are consumed as-is. 0: the initializers are raw, un-prepacked [E, N, K/pack] tensors as produced by quantize_matmul_{4,8}bits; the kernel runs the CUTLASS layout transform itself in PrePack(), matching the behaviour of MatMulNBits and removing the offline pre-pack requirement from exporters. Defaults to -1 (auto) so each execution provider can pick its own backward-compatible default rather than the schema imposing one.
#### Inputs (6 - 21)
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors.h b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors.h
index da6cc4dd2376b..b9e62443145e5 100644
--- a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors.h
+++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors.h
@@ -37,7 +37,8 @@ void preprocess_weights_for_mixed_gemm_cuda(cudaStream_t stream,
int8_t* row_major_quantized_weight,
int32_t* d_permutation_map,
std::vector const& shape,
- QuantType quant_type);
+ QuantType quant_type,
+ bool synchronize = true);
} // namespace weight_only
} // namespace kernels
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.cu
index 7cb0f6e91fc7d..a006612ddadc9 100644
--- a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.cu
@@ -527,7 +527,8 @@ void preprocess_weights_for_mixed_gemm_cuda(cudaStream_t stream,
int8_t* row_major_quantized_weight,
int32_t* d_permutation_map,
std::vector const& shape,
- QuantType quant_type) {
+ QuantType quant_type,
+ bool synchronize) {
LayoutDetails details = getLayoutDetailsForTransform(quant_type, arch);
ORT_ENFORCE(shape.size() == 2 || shape.size() == 3, "Shape must be 2-D or 3-D");
@@ -576,9 +577,17 @@ void preprocess_weights_for_mixed_gemm_cuda(cudaStream_t stream,
ORT_ENFORCE(copy_err == cudaSuccess, "cudaMemcpyAsync failed: ", cudaGetErrorString(copy_err));
}
- // Synchronize the stream to ensure the permutation is complete before row_permutation memory is relased.
- auto sync_err = cudaStreamSynchronize(stream);
- ORT_ENFORCE(sync_err == cudaSuccess, "cudaStreamSynchronize failed: ", cudaGetErrorString(sync_err));
+ // Synchronize the stream so that all transform work is complete before the
+ // caller releases the (transient) scratch buffers. Callers that invoke this
+ // repeatedly on the same stream (e.g. QMoE looping over experts) can pass
+ // ``synchronize=false`` to skip the per-call host-blocking sync and issue a
+ // single ``cudaStreamSynchronize`` once after the final call instead. The
+ // device permutation source (``kPerm_*``) has static storage duration, so it
+ // is always safe regardless of when the async copy completes.
+ if (synchronize) {
+ auto sync_err = cudaStreamSynchronize(stream);
+ ORT_ENFORCE(sync_err == cudaSuccess, "cudaStreamSynchronize failed: ", cudaGetErrorString(sync_err));
+ }
}
} // namespace weight_only
diff --git a/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc b/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc
index f6bf5bbb1f0e3..e1ddcac0cea4f 100644
--- a/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc
+++ b/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc
@@ -15,6 +15,8 @@
#include "contrib_ops/cuda/moe/qmoe_kernels.h"
#include "contrib_ops/cuda/llm/common/env_utils.h"
#include "contrib_ops/cuda/llm/common/logger.h"
+#include "contrib_ops/cuda/llm/fpA_intB_gemm_adaptor.h"
+#include "contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors.h"
#include "contrib_ops/cuda/utils/dump_cuda_tensor.h"
#include "contrib_ops/cpu/utils/debug_macros.h"
@@ -60,6 +62,19 @@ QMoE::QMoE(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info), MoE
this->quant_type_ = op_kernel_info.GetAttrOrDefault("quant_type", "int");
ORT_ENFORCE(quant_type_ == "int" || quant_type_ == "fp4" || quant_type_ == "fp8" || quant_type_ == "wfp4afp8",
"quant_type must be 'int', 'fp4', 'fp8', or 'wfp4afp8', but got '", quant_type_, "'");
+ // ``weights_prepacked`` is an optional tri-state attribute that defaults to
+ // -1 (auto) in the schema, so each EP picks its own backward-compatible
+ // default rather than the schema imposing one:
+ // -1 (auto, also the schema default): the EP decides. The CUDA EP's
+ // backward-compatible default is "prepacked" because all pre-existing
+ // tooling ships CUTLASS-prepacked weights.
+ // 1: initializers are already prepacked; the compute path reads them as-is.
+ // 0: initializers are raw [E, N, K/pack]; the PrePack hook lays them out.
+ const int64_t weights_prepacked_mode =
+ op_kernel_info.GetAttrOrDefault("weights_prepacked", static_cast(-1));
+ ORT_ENFORCE(weights_prepacked_mode == -1 || weights_prepacked_mode == 0 || weights_prepacked_mode == 1,
+ "weights_prepacked must be -1 (auto), 0, or 1, but got ", weights_prepacked_mode);
+ weights_prepacked_ = (weights_prepacked_mode != 0);
#if !defined(ENABLE_FP4) || !defined(USE_FP4_QMOE)
ORT_ENFORCE(quant_type_ != "fp4", "QMoE quant_type='fp4' requires USE_FP4_QMOE with CUDA 12.8 or newer.");
ORT_ENFORCE(quant_type_ != "wfp4afp8",
@@ -199,10 +214,20 @@ Status QMoE::ComputeInternal(OpKernelContext* context) const {
const bool uses_global_weight_scales = is_fp4 || is_fp8 || is_wfp4afp8;
const Tensor* input = context->Input(0);
const Tensor* router_probs = context->Input(1);
- const Tensor* fc1_experts_weights = context->Input(2);
+ // When PrePack consumed the int4/int8 expert-weight initializers
+ // (``weights_prepacked == false`` opt-in path), the original tensors
+ // were freed; ``context->Input(2)/(5)`` would return nothing.
+ // Mirror how ``MatMulNBits`` reads its prepacked B input.
+ // Gate on *both* prepacked buffers being present. If only fc1 were prepacked
+ // (e.g. a partial prepack from an earlier failure or a future refactor), this
+ // path must not null out fc2_experts_weights and feed a null fc2 weight/shape
+ // to the runner.
+ const bool int_weights_consumed_by_prepack =
+ is_int && !weights_prepacked_ && packed_fc1_weights_ != nullptr && packed_fc2_weights_ != nullptr;
+ const Tensor* fc1_experts_weights = int_weights_consumed_by_prepack ? nullptr : context->Input(2);
const Tensor* fc1_scales = (is_int && !packed_fc1_scales_) ? context->Input(3) : nullptr;
const Tensor* fc1_experts_bias_optional = context->Input(4);
- const Tensor* fc2_experts_weights = context->Input(5);
+ const Tensor* fc2_experts_weights = int_weights_consumed_by_prepack ? nullptr : context->Input(5);
const Tensor* fc2_scales = (is_int && !packed_fc2_scales_) ? context->Input(6) : nullptr;
const Tensor* fc2_experts_bias_optional = context->Input(7);
// The CUTLASS MoE runner has no separate FC3 GEMM — gate and up projection weights must be
@@ -224,8 +249,13 @@ Status QMoE::ComputeInternal(OpKernelContext* context) const {
return Status::OK();
};
- ORT_RETURN_IF_ERROR(check_weight_type(fc1_experts_weights, "fc1_experts_weights", is_fp8));
- ORT_RETURN_IF_ERROR(check_weight_type(fc2_experts_weights, "fc2_experts_weights", is_fp8));
+ // When PrePack consumed the int weight initializers, the dtype check
+ // is no longer applicable (we know they were uint8 — that's what
+ // PrePackIntExpertWeights validated and consumed).
+ if (!int_weights_consumed_by_prepack) {
+ ORT_RETURN_IF_ERROR(check_weight_type(fc1_experts_weights, "fc1_experts_weights", is_fp8));
+ ORT_RETURN_IF_ERROR(check_weight_type(fc2_experts_weights, "fc2_experts_weights", is_fp8));
+ }
// Unified FP4 inputs: block scales in fc*_scales (3/6), global scales in 15/16.
const Tensor* fp4_fc1_block_scales = (uses_fp4_weight_scales && !packed_fp4_fc1_block_scales_) ? context->Input(3) : nullptr;
@@ -256,10 +286,13 @@ Status QMoE::ComputeInternal(OpKernelContext* context) const {
int64_t pack_size = expert_weight_bits_ == 4 ? 2 : 1;
bool is_fused_swiglu = activation_type_ == onnxruntime::llm::kernels::cutlass_kernels::ActivationType::Swiglu;
MoEParameters moe_params;
+ // Prefer the cached shapes when PrePack consumed the source initializer.
+ const TensorShape& fc1_shape = int_weights_consumed_by_prepack ? fc1_weights_shape_ : fc1_experts_weights->Shape();
+ const TensorShape& fc2_shape = int_weights_consumed_by_prepack ? fc2_weights_shape_ : fc2_experts_weights->Shape();
ORT_RETURN_IF_ERROR(onnxruntime::contrib::moe_helper::CheckInputs(
- moe_params, input, router_probs, fc1_experts_weights,
+ moe_params, input, router_probs, &fc1_shape,
fc1_experts_bias_optional, fc1_scales, fc1_zeros,
- fc2_experts_weights, fc2_experts_bias_optional, fc2_scales, fc2_zeros,
+ &fc2_shape, fc2_experts_bias_optional, fc2_scales, fc2_zeros,
nullptr, nullptr, nullptr, nullptr,
pack_size, is_fused_swiglu, block_size_));
@@ -808,11 +841,22 @@ Status QMoE::ComputeInternal(OpKernelContext* context) const {
Tensor* output = context->Output(0, input->Shape());
- const void* fc1_weight_data = fc1_experts_weights->DataRaw();
- const void* fc2_weight_data = fc2_experts_weights->DataRaw();
+ const void* fc1_weight_data = fc1_experts_weights ? fc1_experts_weights->DataRaw() : nullptr;
+ const void* fc2_weight_data = fc2_experts_weights ? fc2_experts_weights->DataRaw() : nullptr;
if (is_wfp4afp8 && !use_wfp4afp8_dequant_fallback_) {
fc1_weight_data = packed_fp4_fc1_weights_ ? packed_fp4_fc1_weights_.get() : fc1_weight_data;
fc2_weight_data = packed_fp4_fc2_weights_ ? packed_fp4_fc2_weights_.get() : fc2_weight_data;
+ } else if (int_weights_consumed_by_prepack) {
+ // PrePack converted the raw int4/int8 weights to the CUTLASS fpA_intB
+ // layout that the runner consumes and freed the source initializer
+ // (``is_packed = true``). Gate on ``int_weights_consumed_by_prepack``
+ // (which already requires ``packed_fc1_weights_ != nullptr``) rather than
+ // just ``is_int && !weights_prepacked_``: when prepacking is disabled at
+ // the session level (``session.disable_prepacking``) PrePack never runs,
+ // the prepack buffers stay null, and the raw initializer pointers read
+ // above must be kept so the runner is not handed null weight pointers.
+ fc1_weight_data = packed_fc1_weights_.get();
+ fc2_weight_data = packed_fc2_weights_.get();
}
IAllocatorUniquePtr dequant_fc1_weights;
IAllocatorUniquePtr dequant_fc2_weights;
@@ -972,6 +1016,19 @@ Status QMoE::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
} else if (input_idx == 5 && quant_type_ == "wfp4afp8" && !use_wfp4afp8_dequant_fallback_) {
PrePackRepackFP4Weights(tensor, stream, alloc, packed_fp4_fc2_weights_, is_packed);
is_packed = false;
+ } else if (input_idx == 2 && quant_type_ == "int" && !weights_prepacked_) {
+ // Caller opted in (``weights_prepacked=0`` attribute) to having ORT
+ // do the CUTLASS fpA_intB layout transform internally, instead of
+ // shipping pre-prepacked bytes. Mirrors ``MatMulNBits::PrePack_B``
+ // looped over the E experts of ``[E, N, K/pack]``. We cache the
+ // source shape in ``fc1_weights_shape_`` so ``CheckInputs`` can be
+ // satisfied without holding the original initializer alive, then
+ // set ``is_packed = true`` to let ORT free it.
+ fc1_weights_shape_ = tensor.Shape();
+ PrePackIntExpertWeights(tensor, stream, alloc, packed_fc1_weights_, is_packed);
+ } else if (input_idx == 5 && quant_type_ == "int" && !weights_prepacked_) {
+ fc2_weights_shape_ = tensor.Shape();
+ PrePackIntExpertWeights(tensor, stream, alloc, packed_fc2_weights_, is_packed);
} else if (input_idx == 3) { // fc1_scales
DUMP_TENSOR("fc1_scales", tensor);
if (quant_type_ == "wfp4afp8" && !use_wfp4afp8_dequant_fallback_) {
@@ -1078,6 +1135,115 @@ void QMoE::PrePackCopyToGpu(const Tensor& tensor, cudaStream_t stream, Allocator
is_packed = true;
}
+// ---------------------------------------------------------------------------
+// PrePack helper: int4/int8 per-expert weights → CUTLASS fpA_intB layout.
+// ---------------------------------------------------------------------------
+// Mirrors ``MatMulNBits::PrePack_B`` but loops over the leading E (experts)
+// dimension. Input ``tensor`` is the row-major 3-D ``[E, N, K/(8/bits)]``
+// quantized weight initializer; output is a GPU buffer in the
+// kernel-expected ``[E, K, N/(8/bits)]`` layout.
+void QMoE::PrePackIntExpertWeights(const Tensor& tensor, cudaStream_t stream, AllocatorPtr alloc,
+ IAllocatorUniquePtr& packed_buf, bool& is_packed) {
+ ORT_ENFORCE(expert_weight_bits_ == 4 || expert_weight_bits_ == 8,
+ "PrePackIntExpertWeights: only 4 and 8 bits are supported, got ", expert_weight_bits_);
+ const auto& shape = tensor.Shape();
+ ORT_ENFORCE(shape.NumDimensions() == 3,
+ "PrePackIntExpertWeights: expected 3-D weight tensor [E, N, K/pack], got ndim=",
+ shape.NumDimensions());
+
+ const int bits = static_cast(expert_weight_bits_);
+ const int pack_factor = 8 / bits;
+ const int64_t num_experts = shape[0];
+ const int64_t n = shape[1];
+ const int64_t k_packed = shape[2];
+ const int64_t k = k_packed * pack_factor;
+
+ // Weight packing is architecture-aware (see
+ // docs/contrib_ops/cuda/moe_qmoe.md §7 "Cross-Architecture Packing
+ // Compatibility"). SM90 (Hopper) uses its own Permuted-Linear layout that
+ // skips column interleaving, so it is its own compatibility group. Every
+ // other supported arch — SM75/80/86/89 and SM100/120 (Blackwell) — shares
+ // the SM80 fpA_intB layout, so they all pack as SM80. SM70 and older lack
+ // INT8 LDSM and are unsupported. The compute-side runner selects the same
+ // layout from this clamped arch, so the two cannot drift.
+ //
+ // SM75 is passed through unchanged (rather than clamped to 80) even though it
+ // shares SM80's layout: the compute-side dispatch (getLayoutDetailsForTransform)
+ // still has a distinct SM75 branch, so mirroring it here avoids confusing a
+ // reader into thinking prepack and dispatch disagree.
+ ORT_ENFORCE(sm_ >= 75,
+ "QMoE int4/int8 weight prepack requires SM75 or newer, got sm=", sm_);
+ const int packing_sm = (sm_ == 90 || sm_ == 75) ? sm_ : 80;
+
+ // Per-expert sizes.
+ const size_t per_expert_bytes = static_cast(n) * static_cast(k) / pack_factor;
+ const size_t total_bytes = per_expert_bytes * static_cast(num_experts);
+
+ // Output buffer holds all E prepacked experts back-to-back in
+ // [E, K, N/pack_factor] layout.
+ packed_buf = IAllocator::MakeUniquePtr(alloc, total_bytes, /*use_reserve=*/true);
+ int8_t* dst_all = reinterpret_cast(packed_buf.get());
+
+ // Two transient per-expert scratch buffers reused across experts.
+ IAllocatorUniquePtr transposed_scratch =
+ this->GetTransientScratchBuffer(per_expert_bytes);
+ int8_t* transposed_scratch_ptr = reinterpret_cast(transposed_scratch.get());
+
+ IAllocatorUniquePtr src_gpu_scratch;
+ const uint8_t* src_base_gpu = nullptr;
+ if (tensor.Location().device.Type() == OrtDevice::CPU) {
+ src_gpu_scratch = this->GetTransientScratchBuffer(total_bytes);
+ CUDA_CALL_THROW(cudaMemcpyAsync(src_gpu_scratch.get(), tensor.DataRaw(), total_bytes,
+ cudaMemcpyHostToDevice, stream));
+ src_base_gpu = reinterpret_cast(src_gpu_scratch.get());
+ } else {
+ src_base_gpu = reinterpret_cast(tensor.DataRaw());
+ }
+
+ IAllocatorUniquePtr permutation_map = this->GetTransientScratchBuffer(32);
+
+ using onnxruntime::llm::kernels::weight_only::QuantType;
+ const QuantType quant_type = (bits == 4) ? QuantType::W4_A16 : QuantType::W8_A16;
+
+ for (int64_t e = 0; e < num_experts; ++e) {
+ const uint8_t* src_e = src_base_gpu + static_cast(e) * per_expert_bytes;
+ int8_t* dst_e = dst_all + static_cast(e) * per_expert_bytes;
+
+ // Step 1: transpose + (for int4) unpack/zero-point bias into the
+ // transposed-int8 scratch buffer. Mirrors MatMulNBits's PrePack_B.
+ if (bits == 4) {
+ onnxruntime::llm::kernels::fpA_intB_gemv::unpack_uint4_transposed_to_int8_direct_cuda(
+ stream, transposed_scratch_ptr, src_e, static_cast(n), static_cast(k));
+ } else {
+ onnxruntime::llm::kernels::fpA_intB_gemv::transpose_uint8_matrix_and_convert_to_int8(
+ stream, transposed_scratch_ptr, src_e, static_cast(n), static_cast(k));
+ }
+
+ // Step 2: apply the CUTLASS fpA_intB row-permutation / column-interleave /
+ // bias / pair-interleave transform into the per-expert output slot.
+ // ``synchronize=false``: avoid one host-blocking ``cudaStreamSynchronize``
+ // per expert (which would scale model-load time with ``num_experts``).
+ // Stream ordering guarantees expert e's transform finishes before expert
+ // e+1 reuses the shared transpose scratch, and a single sync after the loop
+ // makes the whole batch complete before the scratch buffers are freed.
+ onnxruntime::llm::kernels::weight_only::preprocess_weights_for_mixed_gemm_cuda(
+ stream,
+ packing_sm,
+ dst_e,
+ transposed_scratch_ptr,
+ permutation_map.get(),
+ {static_cast(k), static_cast(n)},
+ quant_type,
+ /*synchronize=*/false);
+ }
+
+ // Single host-blocking sync after all experts: this guarantees every
+ // per-expert transform (and the CPU->GPU staging copy above) is complete, so
+ // the transient scratch buffers are safe to free on return.
+ CUDA_CALL_THROW(cudaStreamSynchronize(stream));
+ is_packed = true;
+}
+
// ---------------------------------------------------------------------------
// PrePack helper: Swizzle MXFP block scales for SM120 TMA layout using GPU kernel.
// ---------------------------------------------------------------------------
diff --git a/onnxruntime/contrib_ops/cuda/moe/moe_quantization.h b/onnxruntime/contrib_ops/cuda/moe/moe_quantization.h
index afacaf45a65ba..5722ac41cc470 100644
--- a/onnxruntime/contrib_ops/cuda/moe/moe_quantization.h
+++ b/onnxruntime/contrib_ops/cuda/moe/moe_quantization.h
@@ -37,8 +37,35 @@ class QMoE final : public CudaKernel, public MoEBase {
IAllocatorUniquePtr& packed_buf, bool& is_packed);
void PrePackRepackFP4Weights(const Tensor& tensor, cudaStream_t stream, AllocatorPtr alloc,
IAllocatorUniquePtr& packed_buf, bool& is_packed);
+ // Prepacks int4/int8 expert weights into the CUTLASS fpA_intB layout so the
+ // QMoE runner can consume them directly. Mirrors what MatMulNBits.PrePack
+ // does, looped over the E expert dimension. ``tensor`` is the 3-D
+ // ``[E, N, K / (8 / bits)]`` weight initializer; ``packed_buf`` receives a
+ // GPU buffer in the kernel-expected ``[E, K, N / (8 / bits)]`` layout.
+ void PrePackIntExpertWeights(const Tensor& tensor, cudaStream_t stream, AllocatorPtr alloc,
+ IAllocatorUniquePtr& packed_buf, bool& is_packed);
int64_t expert_weight_bits_;
bool is_fp16_;
+ // When true (the schema default), the int4/int8 fc1/fc2 weight
+ // initializers are already in the CUTLASS fpA_intB layout — produced
+ // offline e.g. via ``pack_weights_for_cuda_mixed_gemm`` — and the
+ // compute path reads them as-is. When false, the raw schema-conformant
+ // ``[E, N, K/pack]`` layout (as produced by
+ // ``quantize_matmul_{4,8}bits``) is rewritten inside the PrePack hook
+ // via ``PrePackIntExpertWeights``, removing the offline prepack
+ // dependency. Only meaningful when ``quant_type_ == "int"``. Derived from
+ // the optional tri-state ``weights_prepacked`` attribute: -1/auto (or
+ // absent) maps to true on the CUDA EP, 1 maps to true, 0 maps to false.
+ bool weights_prepacked_ = true;
+ // Cached source weight shapes captured at PrePack time. When the
+ // PrePack hook consumed and released the original int4/int8 weight
+ // initializers (``is_packed = true``), ``context->Input(2)``
+ // and ``(5)`` return nothing, so ``moe_helper::CheckInputs`` can no
+ // longer read the shapes from the live tensors. We feed it these
+ // cached shapes instead via the ``TensorShape*`` overload, matching
+ // how ``MatMulNBits`` caches ``N_`` / ``K_`` in its constructor.
+ TensorShape fc1_weights_shape_;
+ TensorShape fc2_weights_shape_;
bool use_fp4_dequant_fallback_ = false;
// Dequantizes FP8 weights to FP16/BF16 scratch buffers before invoking the A16 MoE runner.
bool use_fp8_dequant_fallback_ = false;
@@ -54,6 +81,14 @@ class QMoE final : public CudaKernel, public MoEBase {
// PrePack logic:
// - Copies scales to GPU buffer (if in CPU) or just keeps them. For simplicity, we allocate and copy.
// - Computes Bias from ZP and Scale using PrePack kernel.
+ // - For ``quant_type == "int"``, also prepacks the per-expert int4/int8
+ // weight tensors into the CUTLASS fpA_intB layout, mirroring
+ // ``MatMulNBits.PrePack_B``. Without this, callers would have to
+ // pre-prepack the weights offline using ``pack_weights_for_cuda_mixed_gemm``,
+ // which is asymmetric with how ``MatMulNBits`` is consumed and forces
+ // a CUDA-enabled ORT build for any offline quantization tooling.
+ IAllocatorUniquePtr packed_fc1_weights_;
+ IAllocatorUniquePtr packed_fc2_weights_;
IAllocatorUniquePtr packed_fc1_scales_;
IAllocatorUniquePtr packed_fc1_bias_;
IAllocatorUniquePtr packed_fc2_scales_;
diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
index 0d9a53bb355c8..1054fd94ef423 100644
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
@@ -1519,6 +1519,21 @@ ONNX_MS_OPERATOR_SET_SCHEMA(
"fc*_scales inputs contain MXFP4 block scales, and fc*_global_scale inputs must be provided.",
AttributeProto::STRING,
std::string("int"))
+ .Attr("weights_prepacked",
+ "Only meaningful when quant_type='int'. Tri-state control over whether the "
+ "int4/int8 fc1/fc2 weight initializers are already laid out in the CUTLASS "
+ "fpA_intB format expected by the runner. -1 (auto): let the execution provider "
+ "choose its own backward-compatible default; the CUDA EP treats auto as "
+ "prepacked. 1: the initializers are already prepacked (e.g. produced offline by "
+ "pack_weights_for_cuda_mixed_gemm) and are consumed as-is. 0: the initializers "
+ "are raw, un-prepacked [E, N, K/pack] tensors as produced by "
+ "quantize_matmul_{4,8}bits; the kernel runs the CUTLASS layout transform itself "
+ "in PrePack(), matching the behaviour of MatMulNBits and removing the offline "
+ "pre-pack requirement from exporters. Defaults to -1 (auto) so each execution "
+ "provider can pick its own backward-compatible default rather than the schema "
+ "imposing one.",
+ AttributeProto::INT,
+ static_cast(-1))
.Input(0,
"input",
"2D tensor with shape (num_tokens, hidden_size), or "
diff --git a/onnxruntime/test/python/transformers/test_qmoe_cuda.py b/onnxruntime/test/python/transformers/test_qmoe_cuda.py
index 9fa10e4964e65..993716a4c80b0 100644
--- a/onnxruntime/test/python/transformers/test_qmoe_cuda.py
+++ b/onnxruntime/test/python/transformers/test_qmoe_cuda.py
@@ -2069,5 +2069,142 @@ def test_qmoe_swiglu_throughput_benchmark(self):
print("- Throughput: ORT throughput improvement (higher is better)")
+# ============================================================================
+# QMoE integer-weight PrePack smoke test.
+#
+# Validates the PrePack hook added in PR #28749: with `quant_type="int"`, the
+# QMoE op should be able to consume raw quantized weights — shape
+# `[E, N, K/(8/bits)]` as produced by `quantize_matmul_{4,8}bits` —
+# and internally run the CUTLASS fpA_intB layout transform that callers
+# previously had to do offline via `pack_weights_for_cuda_mixed_gemm`.
+#
+# Strategy: build a single ONNX graph with raw (un-prepacked) int4 weight
+# initializers and `weights_prepacked=0`, run it through ORT's CUDA QMoE
+# kernel, and assert the output is finite and has a plausible magnitude.
+# This is a smoke test, not a numerical parity check — see the class
+# docstring for why a bit-parity comparison is intentionally omitted.
+# ============================================================================
+
+
+@unittest.skipUnless(torch.cuda.is_available(), "QMoE PrePack smoke test requires CUDA")
+class TestQMoEIntPrePackSmoke(unittest.TestCase):
+ """Smoke test for the QMoE int4 PrePack hook (issue #28748 / PR #28749).
+
+ Builds a single QMoE node with raw, un-prepacked ``[E, N, K/2]`` int4
+ weights straight from ``quantize_matmul_4bits`` and runs it through
+ the CUDA QMoE kernel. With the new ``PrePackIntExpertWeights`` hook,
+ the kernel should:
+
+ 1. Accept the on-disk shape that matches the ``com.microsoft::QMoE``
+ schema (``[E, N, K/pack]``), where today's offline tooling has to
+ hand-write the transposed pre-prepacked shape ``[E, K, N/pack]``
+ and pre-pack the bytes itself via ``pack_weights_for_cuda_mixed_gemm``.
+ 2. Run the GEMM to completion and produce sensible output (no NaN /
+ Inf, output magnitudes consistent with a small weight + small
+ input matmul).
+
+ We deliberately do **not** include a bit-parity check against the
+ existing offline-pre-pack code path because the existing harness
+ (``quant_dequant_blockwise`` → ``pack_weights_for_cuda_mixed_gemm``)
+ hardcodes ``force_arch=80`` and produces incorrect output on SM>=90
+ hardware (the other ``test_swiglu_qmoe_parity_*`` cases in this file
+ fail on H200 / H100 with max-diff > 1.0 on plain main, by
+ inspection — pre-existing). A real parity check can be added once
+ that harness honours the runtime SM.
+ """
+
+ def _run_one(self, *, hidden_size, inter_size, num_experts, top_k, swiglu_fusion, batch_size):
+ torch.manual_seed(123)
+ numpy.random.seed(123)
+
+ onnx_dtype = TensorProto.FLOAT16
+ use_swiglu = True
+ # fc1 packs gate+up along the N axis when use_swiglu=True.
+ fc1_n = 2 * inter_size if use_swiglu else inter_size
+ fc1_k = hidden_size
+ fc2_n = hidden_size
+ fc2_k = inter_size
+
+ raw_fc1 = numpy.zeros((num_experts, fc1_n, fc1_k // 2), dtype=numpy.uint8)
+ raw_fc2 = numpy.zeros((num_experts, fc2_n, fc2_k // 2), dtype=numpy.uint8)
+ fc1_scales = numpy.zeros((num_experts, fc1_n), dtype=numpy.float16)
+ fc2_scales = numpy.zeros((num_experts, fc2_n), dtype=numpy.float16)
+
+ for e in range(num_experts):
+ w1 = (torch.randn(fc1_n, fc1_k) * 0.05).numpy().astype(numpy.float16)
+ w2 = (torch.randn(fc2_n, fc2_k) * 0.05).numpy().astype(numpy.float16)
+ qw1 = numpy.zeros((fc1_n, 1, fc1_k // 2), dtype=numpy.uint8)
+ qw2 = numpy.zeros((fc2_n, 1, fc2_k // 2), dtype=numpy.uint8)
+ sc1 = numpy.zeros((fc1_n, 1), dtype=numpy.float32)
+ sc2 = numpy.zeros((fc2_n, 1), dtype=numpy.float32)
+ zp1 = numpy.zeros((fc1_n, 1), dtype=numpy.uint8)
+ zp2 = numpy.zeros((fc2_n, 1), dtype=numpy.uint8)
+ _pybind.quantize_matmul_4bits(qw1, numpy.ascontiguousarray(w1.T), sc1, zp1, fc1_k, fc1_n, fc1_k, True)
+ _pybind.quantize_matmul_4bits(qw2, numpy.ascontiguousarray(w2.T), sc2, zp2, fc2_k, fc2_n, fc2_k, True)
+ raw_fc1[e] = qw1.reshape(fc1_n, fc1_k // 2)
+ raw_fc2[e] = qw2.reshape(fc2_n, fc2_k // 2)
+ fc1_scales[e] = numpy.abs(sc1).flatten().astype(numpy.float16)
+ fc2_scales[e] = numpy.abs(sc2).flatten().astype(numpy.float16)
+
+ qmoe = helper.make_node(
+ "QMoE",
+ inputs=["x", "router", "fc1_W", "fc1_S", "", "fc2_W", "fc2_S", ""],
+ outputs=["y"],
+ name="qmoe",
+ domain="com.microsoft",
+ k=top_k,
+ normalize_routing_weights=1,
+ activation_type="swiglu" if use_swiglu else "silu",
+ swiglu_fusion=swiglu_fusion,
+ expert_weight_bits=4,
+ quant_type="int",
+ # Opt in to the PrePack-hook path; the weights below are raw
+ # ``[E, N, K/2]`` outputs of ``quantize_matmul_4bits``, not
+ # CUTLASS-prepacked.
+ weights_prepacked=0,
+ )
+ graph = helper.make_graph(
+ nodes=[qmoe],
+ name="qmoe_only",
+ inputs=[
+ helper.make_tensor_value_info("x", onnx_dtype, [None, hidden_size]),
+ helper.make_tensor_value_info("router", onnx_dtype, [None, num_experts]),
+ ],
+ outputs=[helper.make_tensor_value_info("y", onnx_dtype, [None, hidden_size])],
+ initializer=[
+ helper.make_tensor("fc1_W", TensorProto.UINT8, list(raw_fc1.shape), raw_fc1.tobytes(), raw=True),
+ helper.make_tensor("fc2_W", TensorProto.UINT8, list(raw_fc2.shape), raw_fc2.tobytes(), raw=True),
+ helper.make_tensor("fc1_S", onnx_dtype, list(fc1_scales.shape), fc1_scales.flatten().tolist()),
+ helper.make_tensor("fc2_S", onnx_dtype, list(fc2_scales.shape), fc2_scales.flatten().tolist()),
+ ],
+ )
+ model = helper.make_model(
+ graph, opset_imports=[helper.make_opsetid("", 20), helper.make_opsetid("com.microsoft", 1)]
+ )
+ model.ir_version = 10
+
+ sess = onnxruntime.InferenceSession(model.SerializeToString(), providers=ort_provider)
+ x = numpy.random.randn(batch_size, hidden_size).astype(numpy.float16)
+ router = numpy.random.randn(batch_size, num_experts).astype(numpy.float16)
+ out = sess.run(None, {"x": x, "router": router})[0]
+
+ self.assertEqual(out.shape, (batch_size, hidden_size))
+ self.assertEqual(out.dtype, numpy.float16)
+ self.assertFalse(numpy.isnan(out).any(), "QMoE raw-weight output has NaN")
+ self.assertFalse(numpy.isinf(out).any(), "QMoE raw-weight output has Inf")
+ # With weights ~ N(0, 0.05) and input ~ N(0, 1), SwiGLU + routing
+ # output magnitudes land well below 10 per element. A loose bound
+ # catches accidental near-zero or runaway output that would
+ # indicate the PrePack hook silently produced wrong bytes.
+ self.assertGreater(numpy.abs(out).mean(), 1e-4, "Output is suspiciously close to zero")
+ self.assertLess(numpy.abs(out).max(), 10.0, "Output magnitude is implausibly large")
+
+ def test_int4_swiglu_interleaved_small(self):
+ self._run_one(hidden_size=64, inter_size=32, num_experts=4, top_k=2, swiglu_fusion=1, batch_size=8)
+
+ def test_int4_swiglu_interleaved_medium(self):
+ self._run_one(hidden_size=128, inter_size=64, num_experts=8, top_k=2, swiglu_fusion=1, batch_size=16)
+
+
if __name__ == "__main__":
unittest.main()