
Commit 8cd1800

feat: Add weight layout option for trtllm-gen fused moe (#1297)
## πŸ“Œ Description

Expose weight layout for BlockMajorK usage.

## πŸš€ Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### βœ… Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## πŸ§ͺ Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).
1 parent b152d41 commit 8cd1800

9 files changed, +9108 -1195 lines changed


csrc/trtllm_batched_gemm_runner.cu

Lines changed: 2 additions & 1 deletion
@@ -98,7 +98,8 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(
         (!doesRouteImplUseNoRoute(options.mRouteImpl)) == mOptions.routeAct &&
         options.mFusedAct == mOptions.fusedAct && options.mIsStaticBatch == mOptions.staticBatch &&
         tileSize == mOptions.tileSize &&
-        options.mUseShuffledMatrixA == mOptions.useShuffledMatrixA) {
+        options.mUseShuffledMatrixA == mOptions.useShuffledMatrixA &&
+        options.mLayoutA == mOptions.weightLayout) {
       if (mOptions.transposeMmaOutput && options.mEpilogueTileM == mOptions.epilogueTileM) {
         mPassingConfigIndices.push_back(i);
       }
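For illustration, here is a minimal Python sketch of the selection rule this hunk extends: a candidate kernel config is now accepted only if its A-matrix layout also matches the requested weight layout. The `CandidateConfig` fields below are hypothetical stand-ins for the C++ config metadata, not FlashInfer's actual API; only the layout values (MajorK = 0, MajorMn = 1, BlockMajorK = 2) come from this commit.

```python
from dataclasses import dataclass
from enum import IntEnum


class MatrixLayout(IntEnum):
    MajorK = 0       # weights stored as [Mn, K]
    MajorMn = 1      # weights stored as [K, Mn]
    BlockMajorK = 2  # weights stored as [K / blockK, Mn, blockK]


@dataclass
class CandidateConfig:
    # Hypothetical stand-in for one trtllm-gen batched-GEMM kernel config.
    use_shuffled_matrix_a: bool
    layout_a: MatrixLayout
    tile_size: int


def passes_filter(cfg: CandidateConfig, *, use_shuffled_a: bool,
                  weight_layout: MatrixLayout, tile_size: int) -> bool:
    # Mirrors the extended predicate: the shuffled-A flag, the tile size,
    # and (new in this commit) the weight layout must all agree before the
    # config index is added to the passing list.
    return (cfg.use_shuffled_matrix_a == use_shuffled_a
            and cfg.tile_size == tile_size
            and cfg.layout_a == weight_layout)
```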

csrc/trtllm_fused_moe_kernel_launcher.cu

Lines changed: 46 additions & 13 deletions
@@ -482,12 +482,26 @@ at::Tensor trtllm_fp8_block_scale_moe_launcher(
               "hidden_states_scale dim1 must match num_tokens.");
   TORCH_CHECK(gemm1_weights.scalar_type() == at::ScalarType::Float8_e4m3fn,
               "gemm1_weights must be fp8.");
-  TORCH_CHECK(gemm1_weights.dim() == 3, "gemm1_weights must be 3D.");
-  TORCH_CHECK(gemm1_weights.sizes()[1] % 2 == 0, "the second dimension of weights must be even.");
-  TORCH_CHECK(intermediate_size == gemm1_weights.sizes()[1] / 2,
-              "intermediate_size has incorrect shape.");
-  TORCH_CHECK(gemm1_weights.sizes()[2] == hidden_states.sizes()[1],
-              "the third dimension of weights must be equal to hidden_size.");
+
+  TORCH_CHECK(gemm1_weights.dim() == 3 || gemm1_weights.dim() == 4,
+              "gemm1_weights must be 3D or 4D.");
+  {
+    int64_t Mn = 0, K = 0;
+    if (gemm1_weights.dim() == 3) {
+      // MajorK [num_experts, M, K]
+      Mn = gemm1_weights.sizes()[1];
+      K = gemm1_weights.sizes()[2];
+    } else if (gemm1_weights.dim() == 4) {
+      // BlockMajorK [num_experts, K/block_k, M, block_k]
+      Mn = gemm1_weights.sizes()[2];
+      int64_t block_k = gemm1_weights.sizes()[3];
+      K = gemm1_weights.sizes()[1] * block_k;
+    }
+    TORCH_CHECK(Mn % 2 == 0, "the second dimension of weights must be even.");
+    TORCH_CHECK(intermediate_size == Mn / 2, "intermediate_size has incorrect shape.");
+    TORCH_CHECK(K == hidden_states.sizes()[1],
+                "the third dimension of weights must be equal to hidden_size.");
+  }
   TORCH_CHECK(gemm1_weights_scale.scalar_type() == at::ScalarType::Float,
               "gemm1_weights_scale must be float.");
   TORCH_CHECK(gemm1_weights_scale.dim() == 3, "gemm1_weights_scale must be 3D.");
@@ -502,9 +516,22 @@ at::Tensor trtllm_fp8_block_scale_moe_launcher(
               "gemm1_weights_scale has incorrect shape.");
   TORCH_CHECK(gemm2_weights.scalar_type() == at::ScalarType::Float8_e4m3fn,
               "gemm2_weights must be fp8.");
-  TORCH_CHECK(gemm2_weights.dim() == 3, "gemm2_weights must be 3D.");
-  TORCH_CHECK(gemm2_weights.sizes()[2] == intermediate_size,
-              "the third dimension of weights must be equal to intermediate_size.");
+
+  TORCH_CHECK(gemm2_weights.dim() == 3 || gemm2_weights.dim() == 4,
+              "gemm2_weights must be 3D or 4D.");
+  {
+    int64_t K = 0;
+    if (gemm2_weights.dim() == 3) {
+      // MajorK [num_experts, M, K]
+      K = gemm2_weights.sizes()[2];
+    } else if (gemm2_weights.dim() == 4) {
+      // BlockMajorK [num_experts, K/block_k, M, block_k]
+      int64_t block_k = gemm2_weights.sizes()[3];
+      K = gemm2_weights.sizes()[1] * block_k;
+    }
+    TORCH_CHECK(K == intermediate_size,
+                "the third dimension of weights must be equal to intermediate_size.");
+  }
   TORCH_CHECK(gemm2_weights_scale.scalar_type() == at::ScalarType::Float,
               "gemm2_weights_scale must be float.");
   TORCH_CHECK(gemm2_weights_scale.dim() == 3, "gemm2_weights_scale must be 3D.");
@@ -568,7 +595,8 @@ at::Tensor trtllm_fp8_block_scale_moe(
     at::Tensor const& gemm2_weights, at::Tensor const& gemm2_weights_scale, int64_t num_experts,
     int64_t top_k, int64_t n_group, int64_t topk_group, int64_t intermediate_size,
     int64_t local_expert_offset, int64_t local_num_experts, double routed_scaling_factor,
-    int64_t tile_tokens_dim, int64_t routing_method_type, bool use_shuffled_weight) {
+    int64_t tile_tokens_dim, int64_t routing_method_type, bool use_shuffled_weight,
+    int64_t weight_layout) {
   auto dtype = hidden_states.dtype();
   if (dtype == at::ScalarType::Half || dtype == at::ScalarType::BFloat16 ||
       dtype == at::ScalarType::Float8_e4m3fn) {
@@ -578,9 +606,13 @@ at::Tensor trtllm_fp8_block_scale_moe(
         batchedGemm::trtllm::gen::Dtype::E4m3};  // FP8 runner so hard-coded
     bool mUseDeepSeekFp8{true};  // Always true for BlockScaleMoe

+    TORCH_CHECK(0 <= weight_layout && weight_layout <= 2,
+                "the value of weight_layout is not recognized");
+
     // Properly initialize the runner using make_unique like in the original code
-    auto mRunner = std::make_unique<RunnerType>(mDtypeElt, mUseDeepSeekFp8, tile_tokens_dim,
-                                                use_shuffled_weight);
+    auto mRunner = std::make_unique<RunnerType>(
+        mDtypeElt, mUseDeepSeekFp8, tile_tokens_dim, use_shuffled_weight,
+        static_cast<batchedGemm::gemm::MatrixLayout>(weight_layout));

     // Always use fallback config (equivalent to moeConfigIndex == -1 case from original code)
     auto const num_tokens = hidden_states.sizes()[0];
@@ -929,7 +961,8 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe(

     // Properly initialize the runner using make_unique like in the original code
     auto mRunner = std::make_unique<RunnerType>(mDtypeElt, mUseDeepSeekFp8, tile_tokens_dim,
-                                                /*useShuffledMatrixA*/ true);
+                                                /*useShuffledMatrixA*/ true,
+                                                batchedGemm::gemm::MatrixLayout::MajorK);

     auto const num_tokens = hidden_states.sizes()[0];
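To make the accepted weight shapes concrete, here is a short Python sketch of the same shape bookkeeping the launcher now performs; the sizes are illustrative, and only the 3D/4D arithmetic mirrors the TORCH_CHECKs above.

```python
import torch

num_experts, intermediate_size, hidden_size, block_k = 8, 2048, 4096, 128

# MajorK: gemm1 weights are 3D, [num_experts, 2 * intermediate_size, hidden_size].
w1_major_k = torch.empty(num_experts, 2 * intermediate_size, hidden_size)

# BlockMajorK: the K dimension is blocked, giving a 4D tensor
# [num_experts, hidden_size // block_k, 2 * intermediate_size, block_k].
w1_block_major_k = torch.empty(
    num_experts, hidden_size // block_k, 2 * intermediate_size, block_k
)


def recover_mn_k(w: torch.Tensor) -> tuple[int, int]:
    """Recover (Mn, K) from gemm1 weights the way the launcher does."""
    if w.dim() == 3:   # MajorK:      [num_experts, Mn, K]
        return w.shape[1], w.shape[2]
    if w.dim() == 4:   # BlockMajorK: [num_experts, K / block_k, Mn, block_k]
        return w.shape[2], w.shape[1] * w.shape[3]
    raise ValueError("gemm1_weights must be 3D or 4D")


for w in (w1_major_k, w1_block_major_k):
    mn, k = recover_mn_k(w)
    # Same invariants the launcher enforces for gemm1 weights.
    assert mn % 2 == 0 and mn // 2 == intermediate_size and k == hidden_size
```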

csrc/trtllm_fused_moe_runner.cu

Lines changed: 22 additions & 21 deletions
@@ -174,10 +174,9 @@ void Runner::run(void* routingLogits, void* routingBias, int32_t numTokens, int3

 namespace PermuteGemm1 {

-tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions getOptions(btg::Dtype dtypeElt,
-                                                                    int32_t tileTokensDim,
-                                                                    bool useDeepSeekFp8,
-                                                                    bool useShuffledMatrixA) {
+tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions getOptions(
+    btg::Dtype dtypeElt, int32_t tileTokensDim, bool useDeepSeekFp8, bool useShuffledMatrixA,
+    batchedGemm::gemm::MatrixLayout weightLayout) {
   tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions options = {
       .eltType = dtypeElt,
       .outputType = dtypeElt,
@@ -188,15 +187,17 @@ tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions getOptions(btg::Dtype d
       .transposeMmaOutput = true,
       .tileSize = tileTokensDim,
       .epilogueTileM = useDeepSeekFp8 ? 64 : 128,
-      .useShuffledMatrixA = useShuffledMatrixA};
+      .useShuffledMatrixA = useShuffledMatrixA,
+      .weightLayout = weightLayout};
   return options;
 }

-Runner::Runner(btg::Dtype dtypeElt, bool useDeepSeekFp8, int tileTokensDim, bool useShuffledMatrixA)
+Runner::Runner(btg::Dtype dtypeElt, bool useDeepSeekFp8, int tileTokensDim, bool useShuffledMatrixA,
+               batchedGemm::gemm::MatrixLayout weightLayout)
     : mDtypeElt(dtypeElt),
       mTileTokensDim(tileTokensDim),
-      mRunner(tensorrt_llm::kernels::TrtllmGenBatchedGemmRunner(
-          getOptions(mDtypeElt, mTileTokensDim, useDeepSeekFp8, useShuffledMatrixA))) {}
+      mRunner(tensorrt_llm::kernels::TrtllmGenBatchedGemmRunner(getOptions(
+          mDtypeElt, mTileTokensDim, useDeepSeekFp8, useShuffledMatrixA, weightLayout))) {}

 void Runner::run(void* hiddenState, void* hiddenStateScale, void* weights, void* weightsScale,
                  void* expertWeights, float* outputScalesScalar, float* outputScalesGateScalar,
@@ -253,11 +254,9 @@ std::vector<int64_t> Runner::getPassingConfigIndices() const {
 }  // namespace PermuteGemm1

 namespace Gemm2 {
-tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions getOptions(btg::Dtype dtypeElt,
-                                                                    btg::Dtype dtypeOut,
-                                                                    int32_t tileTokensDim,
-                                                                    bool useDeepSeekFp8,
-                                                                    bool useShuffledMatrixA) {
+tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions getOptions(
+    btg::Dtype dtypeElt, btg::Dtype dtypeOut, int32_t tileTokensDim, bool useDeepSeekFp8,
+    bool useShuffledMatrixA, batchedGemm::gemm::MatrixLayout weightLayout) {
   tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions options = {
       .eltType = dtypeElt,
       .outputType = dtypeOut,
@@ -268,17 +267,19 @@ tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions getOptions(btg::Dtype d
       .transposeMmaOutput = true,
       .tileSize = tileTokensDim,
       .epilogueTileM = useDeepSeekFp8 ? 64 : 128,
-      .useShuffledMatrixA = useShuffledMatrixA};
+      .useShuffledMatrixA = useShuffledMatrixA,
+      .weightLayout = weightLayout};
   return options;
 }

 Runner::Runner(btg::Dtype dtypeElt, btg::Dtype outputDtype, bool useDeepSeekFp8, int tileTokensDim,
-               bool useShuffledMatrixA)
+               bool useShuffledMatrixA, batchedGemm::gemm::MatrixLayout weightLayout)
     : mDtypeElt(dtypeElt),
       mOutputDtype(outputDtype),
       mTileTokensDim(tileTokensDim),
-      mRunner(tensorrt_llm::kernels::TrtllmGenBatchedGemmRunner(getOptions(
-          mDtypeElt, mOutputDtype, mTileTokensDim, useDeepSeekFp8, useShuffledMatrixA))) {}
+      mRunner(tensorrt_llm::kernels::TrtllmGenBatchedGemmRunner(
+          getOptions(mDtypeElt, mOutputDtype, mTileTokensDim, useDeepSeekFp8, useShuffledMatrixA,
+                     weightLayout))) {}

 void Runner::run(void* permutedHiddenState, void* permutedHiddenStateScale, void* weights,
                  void* weightsScale, float* outputScalesScalar, void* output, void* outputScale,
@@ -336,11 +337,11 @@ std::vector<int64_t> Runner::getPassingConfigIndices() const {

 namespace MoE {
 Runner::Runner(btg::Dtype dtypeElt, bool useDeepSeekFp8, int32_t tileTokensDim,
-               bool useShuffledMatrixA)
-    : mPermuteGemm1(
-          PermuteGemm1::Runner(dtypeElt, useDeepSeekFp8, tileTokensDim, useShuffledMatrixA)),
+               bool useShuffledMatrixA, batchedGemm::gemm::MatrixLayout weightLayout)
+    : mPermuteGemm1(PermuteGemm1::Runner(dtypeElt, useDeepSeekFp8, tileTokensDim,
+                                         useShuffledMatrixA, weightLayout)),
       mGemm2(Gemm2::Runner(dtypeElt, btg::Dtype::Bfloat16, useDeepSeekFp8, tileTokensDim,
-                           useShuffledMatrixA)) {
+                           useShuffledMatrixA, weightLayout)) {
   auto const& gemm1PassingIndices = mPermuteGemm1.getPassingConfigIndices();
   auto const& gemm2PassingIndices = mGemm2.getPassingConfigIndices();
346347

flashinfer/fused_moe.py

Lines changed: 23 additions & 1 deletion
@@ -59,6 +59,17 @@ class RoutingMethodType(IntEnum):
     Unspecified = 5


+# See MatrixLayout from include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/Enums.h
+class WeightLayout(IntEnum):
+    # K-major layout (default). [Mn, K]
+    MajorK = 0
+    # M-major for A and N-major for B. [K, Mn]
+    MajorMn = 1
+    # Layout is blocked along the K dimension. [K / blockK, Mn, blockK]
+    # where blockK is fixed at 128B
+    BlockMajorK = 2
+
+
 def get_reorder_rows_for_gated_act_gemm_row_indices(x) -> torch.Tensor:
     """
     Reorders rows in the gemm/MOE_gemm weight matrix for min-latency
@@ -224,6 +235,12 @@ def shuffle_matrix_sf_a(
     return nvfp4_block_scale_interleave(w_shuffled)


+def convert_to_block_layout(input_tensor: torch.Tensor, blockK: int) -> torch.Tensor:
+    M, K = input_tensor.shape
+    assert K % blockK == 0, "K must be divisible by blockK"
+    return input_tensor.view(M, K // blockK, blockK).permute(1, 0, 2).contiguous()
+
+
 def gen_cutlass_fused_moe_sm100_module(use_fast_build: bool = False) -> JitSpec:
     return gen_jit_spec(
         "fused_moe_sm100",
@@ -884,6 +901,7 @@ def trtllm_fp8_block_scale_moe_op(
     tile_tokens_dim: int,
     routing_method_type: int,
     use_shuffled_weight: bool = False,
+    weight_layout: int = 0,
 ) -> torch.Tensor:

     # Call the C++ function for block scale MoE
@@ -907,6 +925,7 @@
         tile_tokens_dim,
         routing_method_type,
         use_shuffled_weight,
+        weight_layout,
     )

     return output
@@ -932,6 +951,7 @@ def _fake_trtllm_fp8_block_scale_moe(
     tile_tokens_dim: int = 8,
     routing_method_type: int = 0,
     use_shuffled_weight: bool = False,
+    weight_layout: int = 0,
 ):
     seq_len = hidden_states.shape[0]
     hidden_size = hidden_states.shape[1]
@@ -1121,7 +1141,8 @@ def trtllm_fp8_block_scale_moe(
     routed_scaling_factor: float,
     tile_tokens_dim: int = 8,
     routing_method_type: int = 0,
-    use_shuffled_weight: bool = True,
+    use_shuffled_weight: bool = False,
+    weight_layout: int = 0,
 ) -> torch.Tensor:
     """FP8 block scale MoE operation.

@@ -1168,6 +1189,7 @@
         tile_tokens_dim,
         routing_method_type,
         use_shuffled_weight,
+        weight_layout,
     )
11731195

include/flashinfer/trtllm/batched_gemm/KernelRunner.h

Lines changed: 2 additions & 0 deletions
@@ -23,6 +23,7 @@
 #include <vector>

 // #include "flashinfer/trtllm/common/Dtype.h"
+#include "trtllmGen_bmm_export/Enums.h"
 #include "trtllmGen_bmm_export/trtllm/gen/DtypeDecl.h"

 namespace tensorrt_llm {
@@ -39,6 +40,7 @@ struct TrtllmGenBatchedGemmRunnerOptions {
   int32_t tileSize{8};
   int32_t epilogueTileM{128};
   bool useShuffledMatrixA{false};
+  batchedGemm::gemm::MatrixLayout weightLayout{batchedGemm::gemm::MatrixLayout::MajorK};
 };

 class TrtllmGenBatchedGemmRunner {

include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmInterface.h

Lines changed: 1 addition & 1 deletion
@@ -645,7 +645,7 @@ int32_t BatchedGemmInterface::run(BatchedGemmConfig const& config, void* workspa

   auto fiModuleLoadData = [&](CUmodule* module) {
     const std::string sha256 = config.mHash ? config.mHash : "";
-    const std::string pipeline_hash = "39b7e49bfedde88ea29bfdc2547cbba659f2b236";
+    const std::string pipeline_hash = "991e7438224199de85ef08a2730ce18c12b4e0aa";
     const std::string cubin_path = pipeline_hash + "/" + std::string("batched_gemm-") +
                                    TLLM_GEN_COMMIT + "-" + TLLM_GEN_BATCHED_GEMM_CONFIG_HASH + "/";
     std::string fname_cubin = config.mFunctionName;
