豆豆友情提示:这是一个非官方 GitHub 代理镜像,主要用于网络测试或访问加速。请勿在此进行登录、注册或处理任何敏感信息。进行这些操作请务必访问官方网站 github.com。 Raw 内容也通过此代理提供。
Skip to content

Commit 6bef332

Browse files
zheanxu and LyricZhao authored
Mega MoE Scaling Factors (#227)
* Support SFB * Minor fix * Minor fix * Simplify * Simplify * Support SFA * Optimize * Minor fix * Optimize * Minor fix * Minor fix * Minor fix * Minor fix * Minor fix * Minor fix * Better overlapping * Change block size * Refactor weight interleaving * More stages * Print 2-digit speedup * Lint --------- Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com>
1 parent 3f05a98 commit 6bef332

File tree

9 files changed

+324
-110
lines changed

9 files changed

+324
-110
lines changed

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ find_package(CUDAToolkit REQUIRED)
1717
find_package(pybind11 REQUIRED)
1818
find_package(Torch REQUIRED)
1919

20-
set(CMAKE_CXX_STANDARD 17)
21-
set(CMAKE_CUDA_STANDARD 17)
20+
set(CMAKE_CXX_STANDARD 20)
21+
set(CMAKE_CUDA_STANDARD 20)
2222

2323
include_directories(deep_gemm/include third-party/cutlass/include third-party/cutlass/tools/util/include third-party/fmt/include)
2424
include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include/cccl ${TORCH_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS})

csrc/apis/mega.hpp

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
namespace deep_gemm::mega {
1313

14-
static std::tuple<int64_t, std::function<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>(const torch::Tensor&)>>
14+
static std::tuple<int64_t, std::function<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>(const torch::Tensor&)>>
1515
get_symm_buffer_size_for_mega_moe(
1616
const int& num_ranks, const int& num_experts,
1717
const int& num_max_tokens_per_rank, const int& num_topk,
@@ -27,6 +27,8 @@ get_symm_buffer_size_for_mega_moe(
2727
const auto fp8_token_layout = layout::Data(hidden);
2828
const auto bf16_token_layout = layout::Data(hidden * 2);
2929
const auto fp8_intermediate_token_layout = layout::Data(intermediate_hidden);
30+
const auto fp8_sf_layout = layout::Data(hidden / 32);
31+
const auto fp8_intermediate_sf_layout = layout::Data(intermediate_hidden / 32);
3032
const auto input_topk_idx_layout = layout::Data(num_topk * sizeof(int64_t), false);
3133
const auto input_topk_weights_layout = layout::Data(num_topk * sizeof(float), false);
3234
const auto l1_topk_weights_layout = layout::Data(sizeof(float), false);
@@ -35,10 +37,12 @@ get_symm_buffer_size_for_mega_moe(
3537
const auto input_token_buffer = layout::Buffer(
3638
fp8_token_layout, 1, num_max_tokens_per_rank,
3739
workspace.get_end_ptr());
38-
// TODO: add `input_sf_buffer`
40+
const auto input_sf_buffer = layout::Buffer(
41+
fp8_sf_layout, 1, num_max_tokens_per_rank,
42+
input_token_buffer.get_end_ptr());
3943
const auto input_topk_idx_buffer = layout::Buffer(
4044
input_topk_idx_layout, 1, num_max_tokens_per_rank,
41-
input_token_buffer.get_end_ptr());
45+
input_sf_buffer.get_end_ptr());
4246
const auto input_topk_weights_buffer = layout::Buffer(
4347
input_topk_weights_layout, 1, num_max_tokens_per_rank,
4448
input_topk_idx_buffer.get_end_ptr());
@@ -49,29 +53,41 @@ get_symm_buffer_size_for_mega_moe(
4953
const auto l1_token_buffer = layout::Buffer(
5054
fp8_token_layout, num_experts_per_rank, num_max_recv_tokens_per_expert,
5155
input_topk_weights_buffer.get_end_ptr());
52-
// TODO: add `l1_input_sf_buffer`
56+
const auto l1_sf_buffer = layout::Buffer(
57+
fp8_sf_layout, num_experts_per_rank, num_max_recv_tokens_per_expert,
58+
l1_token_buffer.get_end_ptr());
5359
const auto l1_topk_weights_buffer = layout::Buffer(
5460
l1_topk_weights_layout, num_experts_per_rank, num_max_recv_tokens_per_expert,
55-
l1_token_buffer.get_end_ptr());
61+
l1_sf_buffer.get_end_ptr());
5662

5763
// L2 input buffer
5864
const auto l2_token_buffer = layout::Buffer(
5965
fp8_intermediate_token_layout, num_experts_per_rank, num_max_recv_tokens_per_expert,
6066
l1_topk_weights_buffer.get_end_ptr());
67+
const auto l2_sf_buffer = layout::Buffer(
68+
fp8_intermediate_sf_layout, num_experts_per_rank, num_max_recv_tokens_per_expert,
69+
l2_token_buffer.get_end_ptr());
6170

6271
// Combine input buffer: BF16 tokens for cross-rank combine
6372
const auto combine_token_buffer = layout::Buffer(
6473
bf16_token_layout, num_topk, num_max_tokens_per_rank,
65-
l2_token_buffer.get_end_ptr());
74+
l2_sf_buffer.get_end_ptr());
6675

67-
// Slice function: creates `(x, x_sf, topk_weights, topk_idx, l1_acts, l2_acts)` tensor views from the raw buffer
76+
// Check SF buffer requirements
77+
DG_HOST_ASSERT(hidden % 128 == 0 and intermediate_hidden % 128 == 0);
78+
DG_HOST_ASSERT(num_max_recv_tokens_per_expert % 4 == 0);
79+
80+
// Slice function: creates `(x, x_sf, topk_weights, topk_idx, l1_acts, l1_acts_sf, l2_acts, l2_acts_sf)` tensor views from the raw buffer
81+
// NOTES: `x_sf` is K-major, while `l1_acts_sf` and `l2_acts_sf` are M-major
6882
auto slice_input_buffers = [=](const torch::Tensor& buffer) {
6983
auto x = torch::from_blob(
7084
math::advance_ptr(buffer.data_ptr(), reinterpret_cast<int64_t>(input_token_buffer.base)),
7185
{num_max_tokens_per_rank, hidden},
7286
torch::TensorOptions().dtype(torch::kFloat8_e4m3fn).device(buffer.device()));
73-
// TODO: create `x_sf` from buffer
74-
auto x_sf = torch::empty(0, torch::TensorOptions().device(buffer.device()));
87+
auto x_sf = torch::from_blob(
88+
math::advance_ptr(buffer.data_ptr(), reinterpret_cast<int64_t>(input_sf_buffer.base)),
89+
{num_max_tokens_per_rank, hidden / 128},
90+
torch::TensorOptions().dtype(torch::kInt).device(buffer.device()));
7591
auto topk_idx = torch::from_blob(
7692
math::advance_ptr(buffer.data_ptr(), reinterpret_cast<int64_t>(input_topk_idx_buffer.base)),
7793
{num_max_tokens_per_rank, num_topk},
@@ -84,11 +100,21 @@ get_symm_buffer_size_for_mega_moe(
84100
math::advance_ptr(buffer.data_ptr(), reinterpret_cast<int64_t>(l1_token_buffer.base)),
85101
{num_experts_per_rank * num_max_recv_tokens_per_expert, hidden},
86102
torch::TensorOptions().dtype(torch::kFloat8_e4m3fn).device(buffer.device()));
103+
auto l1_acts_sf = torch::from_blob(
104+
math::advance_ptr(buffer.data_ptr(), reinterpret_cast<int64_t>(l1_sf_buffer.base)),
105+
{num_max_recv_tokens_per_expert, hidden / 128 * num_experts_per_rank},
106+
{1, num_max_recv_tokens_per_expert},
107+
torch::TensorOptions().dtype(torch::kInt).device(buffer.device()));
87108
auto l2_acts = torch::from_blob(
88109
math::advance_ptr(buffer.data_ptr(), reinterpret_cast<int64_t>(l2_token_buffer.base)),
89110
{num_experts_per_rank * num_max_recv_tokens_per_expert, intermediate_hidden},
90111
torch::TensorOptions().dtype(torch::kFloat8_e4m3fn).device(buffer.device()));
91-
return std::make_tuple(x, x_sf, topk_idx, topk_weights, l1_acts, l2_acts);
112+
auto l2_acts_sf = torch::from_blob(
113+
math::advance_ptr(buffer.data_ptr(), reinterpret_cast<int64_t>(l2_sf_buffer.base)),
114+
{num_max_recv_tokens_per_expert, intermediate_hidden / 128 * num_experts_per_rank},
115+
{1, num_max_recv_tokens_per_expert},
116+
torch::TensorOptions().dtype(torch::kInt).device(buffer.device()));
117+
return std::make_tuple(x, x_sf, topk_idx, topk_weights, l1_acts, l1_acts_sf, l2_acts, l2_acts_sf);
92118
};
93119
return {reinterpret_cast<int64_t>(combine_token_buffer.get_end_ptr()), slice_input_buffers};
94120
}
@@ -133,6 +159,13 @@ static void fp8_fp4_mega_moe(
133159
DG_HOST_ASSERT(intermediate_hidden_2 == 2 * intermediate_hidden);
134160
DG_HOST_ASSERT(l1_weights.is_contiguous() and l2_weights.is_contiguous());
135161

162+
// Check weight SF layout for UE8M0 packing, MN-major, and TMA alignment
163+
constexpr int kGranMN = 1, kGranK = 32;
164+
check_sf_layout(l1_weights_sf, intermediate_hidden * 2, hidden, kGranMN, kGranK,
165+
num_experts_per_rank, true, false, torch::kInt);
166+
check_sf_layout(l2_weights_sf, hidden, intermediate_hidden, kGranMN, kGranK,
167+
num_experts_per_rank, true, false, torch::kInt);
168+
136169
// Check buffer bytes
137170
const auto num_ranks = static_cast<int>(sym_buffer_ptrs.size());
138171
const auto num_experts_ = num_experts_per_rank * num_ranks;
@@ -145,12 +178,13 @@ static void fp8_fp4_mega_moe(
145178
DG_HOST_ASSERT(num_experts == num_experts_);
146179

147180
// Already registered tensors
148-
const auto [x, x_sf, topk_idx, topk_weights, l1_acts, l2_acts] = slice(sym_buffer);
181+
const auto [x, x_sf, topk_idx, topk_weights, l1_acts, l1_acts_sf, l2_acts, l2_acts_sf] = slice(sym_buffer);
149182

150183
// Dispatch into different architectures
151184
if (arch_major == 10) {
152185
sm100_fp8_fp4_mega_moe(y,
153-
l1_acts, l2_acts,
186+
l1_acts, l1_acts_sf,
187+
l2_acts, l2_acts_sf,
154188
l1_weights, l2_weights,
155189
l1_weights_sf, l2_weights_sf,
156190
sym_buffer_ptrs,

csrc/jit_kernels/heuristics/mega_moe.hpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,13 @@ static std::pair<int, int> get_pipeline_config_for_mega_moe(
117117
// Tensor memory pointer
118118
const int smem_tmem_ptr = 4;
119119

120-
// Per-stage: A tile + B tile + full/empty barriers
121-
const int smem_per_stage = load_block_m * block_k + block_n * block_k + 2 * 8;
120+
// SF is aligned to UTCCP 128-element granularity
121+
const auto [sf_block_m, sf_block_n] = SM100ArchSpec::get_sf_uttcp_aligned_block_sizes(block_m, block_n, MmaKind::MXFP8FP4);
122+
const int smem_sfa_per_stage = sf_block_m * 4;
123+
const int smem_sfb_per_stage = sf_block_n * 4;
124+
125+
// Per-stage: A tile + B tile + SFA tile + SFB tile + full/empty/with_sf_full barriers
126+
const int smem_per_stage = load_block_m * block_k + block_n * block_k + smem_sfa_per_stage + smem_sfb_per_stage + 3 * 8;
122127

123128
// Fixed total
124129
const int smem_fixed = smem_dispatch_size + smem_cd + smem_amax_reduction + smem_barriers + smem_tmem_ptr;
@@ -140,7 +145,7 @@ static MegaMoEConfig get_mega_moe_config(
140145
const int block_k = 128;
141146
const int load_block_m = block_m / 2;
142147
const int load_block_n = block_n;
143-
const int store_block_m = 48;
148+
const int store_block_m = 32;
144149
// NOTES: FP8 activations and FP4 weights (unpacked to 8-bit in smem) both use 128B swizzle
145150
const int swizzle_acts_mode = 128;
146151
const int swizzle_weights_mode = 128;

csrc/jit_kernels/impls/sm100_fp8_fp4_mega_moe.hpp

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,14 @@ class SM100FP8FP4MegaMoERuntime final : public LaunchRuntime<SM100FP8FP4MegaMoER
3535

3636
// Tensormap
3737
CUtensorMap tensor_map_l1_acts;
38+
CUtensorMap tensor_map_l1_acts_sf;
3839
CUtensorMap tensor_map_l1_weights;
40+
CUtensorMap tensor_map_l1_weights_sf;
3941
CUtensorMap tensor_map_l1_output;
4042
CUtensorMap tensor_map_l2_acts;
43+
CUtensorMap tensor_map_l2_acts_sf;
4144
CUtensorMap tensor_map_l2_weights;
45+
CUtensorMap tensor_map_l2_weights_sf;
4246

4347
// Launch configs
4448
LaunchArgs launch_args;
@@ -85,17 +89,22 @@ static void __instantiate_kernel() {{
8589
args.num_tokens,
8690
args.sym_buffer_ptrs, args.rank_idx,
8791
args.tensor_map_l1_acts,
92+
args.tensor_map_l1_acts_sf,
8893
args.tensor_map_l1_weights,
94+
args.tensor_map_l1_weights_sf,
8995
args.tensor_map_l1_output,
9096
args.tensor_map_l2_acts,
91-
args.tensor_map_l2_weights
97+
args.tensor_map_l2_acts_sf,
98+
args.tensor_map_l2_weights,
99+
args.tensor_map_l2_weights_sf
92100
));
93101
}
94102
};
95103

96104
static void sm100_fp8_fp4_mega_moe(
97105
const torch::Tensor& y,
98-
const torch::Tensor& l1_acts, const torch::Tensor& l2_acts,
106+
const torch::Tensor& l1_acts, const torch::Tensor& l1_acts_sf,
107+
const torch::Tensor& l2_acts, const torch::Tensor& l2_acts_sf,
99108
const torch::Tensor& l1_weights, const torch::Tensor& l2_weights,
100109
const torch::Tensor& l1_weights_sf, const torch::Tensor& l2_weights_sf,
101110
const std::vector<uint64_t>& sym_buffer_ptrs,
@@ -116,17 +125,26 @@ static void sm100_fp8_fp4_mega_moe(
116125
num_max_tokens_per_rank, num_topk, hidden, intermediate_hidden);
117126

118127
// Make tensormap
128+
constexpr int kGranK = 32;
119129
const auto num_max_recv_tokens = num_experts_per_rank * num_max_recv_tokens_per_expert;
120130
const auto tensor_map_l1_acts = make_tma_2d_desc(l1_acts,
121131
hidden, num_max_recv_tokens,
122132
config.block_k, config.load_block_m,
123133
static_cast<int>(l1_acts.stride(-2)),
124134
config.swizzle_acts_mode);
135+
const auto tensor_map_l1_acts_sf = make_tma_sf_desc(cute::UMMA::Major::MN, l1_acts_sf,
136+
num_max_recv_tokens_per_expert, hidden,
137+
config.block_m, kGranK,
138+
num_experts_per_rank, 0);
125139
const auto tensor_map_l1_weights = make_tma_2d_desc(l1_weights,
126140
hidden, num_experts_per_rank * intermediate_hidden * 2,
127141
config.block_k, config.load_block_n,
128142
static_cast<int>(l1_weights.stride(-2)),
129143
config.swizzle_weights_mode);
144+
const auto tensor_map_l1_weights_sf = make_tma_sf_desc(cute::UMMA::Major::MN, l1_weights_sf,
145+
intermediate_hidden * 2, hidden,
146+
config.block_n, kGranK,
147+
num_experts_per_rank, 0);
130148
// NOTES: L1 output and L2 activations are essentially the same tensor.
131149
// Post-SwiGLU output has half the N width (`BLOCK_N / 2` per input tile),
132150
// so the swizzle mode is also halved (128 -> 64).
@@ -140,11 +158,19 @@ static void sm100_fp8_fp4_mega_moe(
140158
config.block_k, config.load_block_m,
141159
static_cast<int>(l2_acts.stride(-2)),
142160
config.swizzle_acts_mode);
161+
const auto tensor_map_l2_acts_sf = make_tma_sf_desc(cute::UMMA::Major::MN, l2_acts_sf,
162+
num_max_recv_tokens_per_expert, intermediate_hidden,
163+
config.block_m, kGranK,
164+
num_experts_per_rank, 0);
143165
const auto tensor_map_l2_weights = make_tma_2d_desc(l2_weights,
144166
intermediate_hidden, num_experts_per_rank * hidden,
145167
config.block_k, config.load_block_n,
146168
static_cast<int>(l2_weights.stride(-2)),
147169
config.swizzle_weights_mode);
170+
const auto tensor_map_l2_weights_sf = make_tma_sf_desc(cute::UMMA::Major::MN, l2_weights_sf,
171+
hidden, intermediate_hidden,
172+
config.block_n, kGranK,
173+
num_experts_per_rank, 0);
148174

149175
// Launch
150176
const auto num_sms = device_runtime->get_num_sms();
@@ -161,10 +187,14 @@ static void sm100_fp8_fp4_mega_moe(
161187
.sym_buffer_ptrs = layout::SymBuffer<>(sym_buffer_ptrs, rank_idx),
162188
.rank_idx = rank_idx,
163189
.tensor_map_l1_acts = tensor_map_l1_acts,
190+
.tensor_map_l1_acts_sf = tensor_map_l1_acts_sf,
164191
.tensor_map_l1_weights = tensor_map_l1_weights,
192+
.tensor_map_l1_weights_sf = tensor_map_l1_weights_sf,
165193
.tensor_map_l1_output = tensor_map_l1_output,
166194
.tensor_map_l2_acts = tensor_map_l2_acts,
195+
.tensor_map_l2_acts_sf = tensor_map_l2_acts_sf,
167196
.tensor_map_l2_weights = tensor_map_l2_weights,
197+
.tensor_map_l2_weights_sf = tensor_map_l2_weights_sf,
168198
.launch_args = LaunchArgs(num_sms,
169199
config.num_dispatch_threads + config.num_non_epilogue_threads + config.num_epilogue_threads,
170200
config.smem_size, 2)

0 commit comments

Comments
 (0)