豆豆友情提示:这是一个非官方 GitHub 代理镜像,主要用于网络测试或访问加速。请勿在此进行登录、注册或处理任何敏感信息。进行这些操作请务必访问官方网站 github.com。 Raw 内容也通过此代理提供。
Skip to content

Commit 4069fbe

Browse files
LyricZhao and zheanxu authored
SM100 Mega MoE (MMA part 2) (#183)
* Minor fix * Fix workspace API usages * Minor fix * Successful compilation * Align tokens * Dump profiling traces * Load token into shared memory * Store into remote buffers * Use 2 GiB to debug * Mega MoE Scheduler Update (#184) * Update scheduler * Fix ptx * Add scheduler init * Minor update * Update scheduler * Minor fixes * Refactor specialized ld/st PTX * Use scheduler in the kernel * Allocate slots indices together * Rename scheduler namespace * Process the last token block count * Fix bugs * Fix __fns * Fix dispatch and scheduler * Code lint * Add reference and fix CUDA 13 compilation --------- Co-authored-by: Zhean Xu <94977922+zheanxu@users.noreply.github.com> Co-authored-by: Zhean Xu <xza@deepseek.com>
1 parent 4fdadc8 commit 4069fbe

26 files changed

+647
-291
lines changed

CMakeLists.txt

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@ cmake_minimum_required(VERSION 3.10)
33
project(deep_gemm LANGUAGES CXX CUDA)
44
set(CMAKE_VERBOSE_MAKEFILE ON)
55

6-
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -fPIC -Wno-psabi")
7-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC -Wno-psabi")
6+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC -Wno-psabi -Wno-deprecated-declarations")
87
set(CUDA_SEPARABLE_COMPILATION ON)
98
list(APPEND CUDA_NVCC_FLAGS "-DENABLE_FAST_DEBUG")
109
list(APPEND CUDA_NVCC_FLAGS "-O3")
@@ -22,7 +21,7 @@ set(CMAKE_CXX_STANDARD 17)
2221
set(CMAKE_CUDA_STANDARD 17)
2322

2423
include_directories(deep_gemm/include third-party/cutlass/include third-party/cutlass/tools/util/include third-party/fmt/include)
25-
include_directories(${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/include ${TORCH_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS})
24+
include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include/cccl ${TORCH_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS})
2625
link_directories(${TORCH_INSTALL_PREFIX}/lib ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
2726

2827
# The main Python API entrance

csrc/apis/mega.hpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,13 @@ static int64_t get_symm_buffer_size_for_mega_moe(
1313
const int& hidden, const int& intermediate_hidden,
1414
const bool& use_fp8_dispatch, const std::string& activation) {
1515
// TODO: implement
16-
return 4096;
16+
// Currently, we use 16 GiB to debug
17+
return 16ll * 1024ll * 1024ll * 1024ll;
1718
}
1819

1920
static void fp8_fp4_mega_moe(
20-
const std::tuple<torch::Tensor, torch::Tensor>& hidden_states_,
21+
const std::tuple<torch::Tensor, torch::Tensor>& x_,
22+
const torch::Tensor& y,
2123
const std::tuple<torch::Tensor, torch::Tensor>& l1_weights_,
2224
const std::tuple<torch::Tensor, torch::Tensor>& l2_weights_,
2325
const torch::Tensor& topk_idx, const torch::Tensor& topk_weights,
@@ -26,7 +28,7 @@ static void fp8_fp4_mega_moe(
2628
const int& num_max_tokens_per_rank,
2729
const std::tuple<int, int, int>& recipe,
2830
const std::string& activation) {
29-
const auto [hidden_states, hidden_states_sf] = hidden_states_;
31+
const auto [x, x_sf] = x_;
3032
const auto [l1_weights, l1_weights_sf] = l1_weights_;
3133
const auto [l2_weights, l2_weights_sf] = l2_weights_;
3234

@@ -42,7 +44,7 @@ static void fp8_fp4_mega_moe(
4244
// Dispatch into different architectures
4345
const auto arch_major = device_runtime->get_arch_major();
4446
if (arch_major == 10) {
45-
sm100_fp8_fp4_mega_moe(hidden_states, hidden_states_sf,
47+
sm100_fp8_fp4_mega_moe(x, x_sf, y,
4648
l1_weights, l1_weights_sf,
4749
l2_weights, l2_weights_sf,
4850
topk_idx, topk_weights,

csrc/jit/handle.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ static void* get_driver_handle() {
2424
#define DECL_LAZY_CUDA_DRIVER_FUNCTION(name) \
2525
template <typename... Args> \
2626
static auto lazy_##name(Args&&... args) -> decltype(name(args...)) { \
27-
using FuncType = decltype(&name); \
27+
using FuncType = decltype(&(name)); \
2828
static FuncType func = nullptr; \
2929
if (func == nullptr) { \
3030
func = reinterpret_cast<FuncType>(dlsym(get_driver_handle(), #name)); \

csrc/jit_kernels/impls/sm100_fp8_fp4_mega_moe.hpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@ class SM100FP8FP4MegaMoERuntime final : public LaunchRuntime<SM100FP8FP4MegaMoER
2525
int num_ranks;
2626

2727
// Runtime arguments
28-
int num_tokens;
28+
void* x;
2929
int64_t* topk_idx;
30+
int num_tokens;
3031
layout::SymBuffer<> sym_buffer_ptrs;
3132
int rank_idx;
3233

@@ -64,18 +65,20 @@ static void __instantiate_kernel() {{
6465
// TODO: optimize `args` copy
6566
// TODO: tensor maps are missing
6667
DG_CUDA_UNIFIED_CHECK(launch_kernel(kernel, config,
67-
args.num_tokens, args.topk_idx, args.sym_buffer_ptrs, args.rank_idx));
68+
args.x, args.topk_idx,
69+
args.num_tokens,
70+
args.sym_buffer_ptrs, args.rank_idx));
6871
}
6972
};
7073

7174
static void sm100_fp8_fp4_mega_moe(
72-
const torch::Tensor& hidden_states, const torch::Tensor& hidden_states_sf,
75+
const torch::Tensor& x, const torch::Tensor& x_sf, const torch::Tensor& y,
7376
const torch::Tensor& l1_weights, const torch::Tensor& l1_weights_sf,
7477
const torch::Tensor& l2_weights, const torch::Tensor& l2_weights_sf,
7578
const torch::Tensor& topk_idx, const torch::Tensor& topk_weights,
7679
const std::vector<uint64_t>& sym_buffer_ptrs, const int& rank_idx,
7780
const int& num_max_tokens_per_rank) {
78-
const auto [num_tokens, hidden] = get_shape<2>(hidden_states);
81+
const auto [num_tokens, hidden] = get_shape<2>(x);
7982
const auto [num_experts_per_rank, intermediate_hidden, _] = get_shape<3>(l2_weights);
8083
const auto [__, num_topk] = get_shape<2>(topk_idx);
8184
const auto num_ranks = static_cast<int>(sym_buffer_ptrs.size());
@@ -92,11 +95,12 @@ static void sm100_fp8_fp4_mega_moe(
9295
.num_stages = 5,
9396
.num_dispatch_threads = 128, .num_mma_non_epilogue_threads = 128, .num_mma_epilogue_threads = 128,
9497
.num_ranks = num_ranks,
95-
.num_tokens = num_tokens,
98+
.x = x.data_ptr(),
9699
.topk_idx = topk_idx.data_ptr<int64_t>(),
100+
.num_tokens = num_tokens,
97101
.sym_buffer_ptrs = layout::SymBuffer<>(sym_buffer_ptrs),
98102
.rank_idx = rank_idx,
99-
.launch_args = LaunchArgs(num_sms, 256, 16384, 2)
103+
.launch_args = LaunchArgs(num_sms, 256, 232448, 2)
100104
};
101105
const auto code = SM100FP8FP4MegaMoERuntime::generate(args);
102106
const auto runtime = compiler->build("sm100_fp8_fp4_mega_moe", code);

csrc/jit_kernels/impls/smxx_fp8_paged_mqa_logits.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ class SMXXPagedMQALogitsMetadataRuntime final: public LaunchRuntime<SMXXPagedMQA
3131
using namespace deep_gemm;
3232
3333
static void __instantiate_kernel() {{
34-
auto ptr = reinterpret_cast<void*>(&scheduler::smxx_paged_mqa_logits_metadata<
34+
auto ptr = reinterpret_cast<void*>(&sched::smxx_paged_mqa_logits_metadata<
3535
{}, {}, {}
3636
>);
3737
}};

deep_gemm/include/deep_gemm/common/math.cuh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ __device__ __host__ constexpr T constexpr_ceil_div(T a, T b) {
2323
return (a + b - 1) / b;
2424
}
2525

26-
template <typename T>
27-
__device__ __host__ T align(T a, T b) {
28-
return ceil_div(a, b) * b;
26+
template <typename T, bool kDoCeilAlignment = true>
27+
__forceinline__ __device__ __host__ T align(T a, T b) {
28+
return (kDoCeilAlignment ? ceil_div(a, b) : (a / b)) * b;
2929
}
3030

3131
template <typename T>

deep_gemm/include/deep_gemm/impls/sm100_bf16_gemm.cuh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ sm100_bf16_gemm_impl(int* grouped_layout,
166166

167167
// Block scheduler
168168
uint32_t m_block_idx, n_block_idx;
169-
auto scheduler = scheduler::Scheduler<kGemmType, BLOCK_M, BLOCK_N, kNumGroups, kNumMulticast, kIsMulticastOnA, kNumSMs>(
169+
auto scheduler = sched::Scheduler<kGemmType, BLOCK_M, BLOCK_N, kNumGroups, kNumMulticast, kIsMulticastOnA, kNumSMs>(
170170
shape_m, shape_n, shape_k, grouped_layout);
171171

172172
// Pipeline and TMA phases
@@ -195,19 +195,19 @@ sm100_bf16_gemm_impl(int* grouped_layout,
195195

196196
// Compute offsets
197197
// NOTES: the group is always concatenated with the outer dimension
198-
uint32_t m_idx = scheduler.template get_global_idx<(kGemmType == GemmType::MGroupedMasked), scheduler::IndexType::MN> (
198+
uint32_t m_idx = scheduler.template get_global_idx<(kGemmType == GemmType::MGroupedMasked), sched::IndexType::MN> (
199199
shape_m, BLOCK_M, m_block_idx);
200-
uint32_t n_idx = scheduler.template get_global_idx<(kMajorB == cute::UMMA::Major::K), scheduler::IndexType::MN> (
200+
uint32_t n_idx = scheduler.template get_global_idx<(kMajorB == cute::UMMA::Major::K), sched::IndexType::MN> (
201201
shape_n, BLOCK_N, n_block_idx, m_block_idx);
202202

203203
// NOTES: `k_idx` is actually the k index default for K-major, while `k_b_idx` may be MN-major
204204
// And for all m-grouped GEMMs, A must be K-majored
205205
DG_STATIC_ASSERT(kGemmType == GemmType::Normal or kGemmType == GemmType::KGroupedContiguous or kGemmType == GemmType::Batched or
206206
kMajorA == cute::UMMA::Major::K, "Invalid major");
207207
uint32_t k_idx = k_block_idx * BLOCK_K;
208-
uint32_t k_a_idx = scheduler.template get_global_idx<(kMajorA == cute::UMMA::Major::MN), scheduler::IndexType::K> (
208+
uint32_t k_a_idx = scheduler.template get_global_idx<(kMajorA == cute::UMMA::Major::MN), sched::IndexType::K> (
209209
shape_k, BLOCK_K, k_block_idx, m_block_idx);
210-
uint32_t k_b_idx = scheduler.template get_global_idx<(kMajorB == cute::UMMA::Major::MN), scheduler::IndexType::K> (
210+
uint32_t k_b_idx = scheduler.template get_global_idx<(kMajorB == cute::UMMA::Major::MN), sched::IndexType::K> (
211211
shape_k, BLOCK_K, k_block_idx, m_block_idx);
212212

213213
// Add 2 CTA offsets
@@ -384,7 +384,7 @@ sm100_bf16_gemm_impl(int* grouped_layout,
384384
// Load from tensor memory into registers, and write shared memory with STSM
385385
const auto tmem_base_addr = accum_stage_idx * UMMA_N;
386386
const auto base_m_idx = scheduler.template get_global_idx<
387-
(not is_m_grouped_contiguous(kGemmType)), scheduler::IndexType::MN>(shape_m, BLOCK_M, m_block_idx);
387+
(not is_m_grouped_contiguous(kGemmType)), sched::IndexType::MN>(shape_m, BLOCK_M, m_block_idx);
388388
const auto base_n_idx = n_block_idx * BLOCK_N;
389389

390390
if constexpr (kSwapAB) {

deep_gemm/include/deep_gemm/impls/sm100_fp4_paged_mqa_logits.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ void sm100_fp4_paged_mqa_logits(const uint32_t batch_size,
152152

153153
// Scheduler
154154
constexpr uint32_t kNumBlocksPerSplit = SPLIT_KV / BLOCK_KV;
155-
auto scheduler = scheduler::PagedMQALogitsScheduler<kNextN, kIsContextLens2D, BLOCK_KV, kNumBlocksPerSplit>(
155+
auto scheduler = sched::PagedMQALogitsScheduler<kNextN, kIsContextLens2D, BLOCK_KV, kNumBlocksPerSplit>(
156156
batch_size, blockIdx.x, context_lens, schedule_meta);
157157
DG_STATIC_ASSERT(SPLIT_KV == BLOCK_KV * kNumBlocksPerSplit, "Invalid `SPLIT_KV`");
158158

deep_gemm/include/deep_gemm/impls/sm100_fp8_fp4_gemm_1d1d.cuh

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ sm100_fp8_fp4_gemm_1d1d_impl(int* grouped_layout,
180180

181181
// Block scheduler
182182
uint32_t m_block_idx, n_block_idx;
183-
auto scheduler = scheduler::Scheduler<kGemmType, BLOCK_M, BLOCK_N, kNumGroups, kNumMulticast, kIsMulticastOnA, kNumSMs>(
183+
auto scheduler = sched::Scheduler<kGemmType, BLOCK_M, BLOCK_N, kNumGroups, kNumMulticast, kIsMulticastOnA, kNumSMs>(
184184
shape_m, shape_n, shape_k, grouped_layout);
185185

186186
// Pipeline and TMA phases
@@ -209,19 +209,19 @@ sm100_fp8_fp4_gemm_1d1d_impl(int* grouped_layout,
209209

210210
// Compute offsets
211211
// NOTES: the group is always concatenated with the outer dimension
212-
uint32_t m_idx = scheduler.template get_global_idx<(kGemmType == GemmType::MGroupedMasked), scheduler::IndexType::MN> (
212+
uint32_t m_idx = scheduler.template get_global_idx<(kGemmType == GemmType::MGroupedMasked), sched::IndexType::MN> (
213213
shape_m, BLOCK_M, m_block_idx);
214-
uint32_t n_idx = scheduler.template get_global_idx<(kMajorB == cute::UMMA::Major::K), scheduler::IndexType::MN> (
214+
uint32_t n_idx = scheduler.template get_global_idx<(kMajorB == cute::UMMA::Major::K), sched::IndexType::MN> (
215215
shape_n, BLOCK_N, n_block_idx, m_block_idx);
216216

217217
// NOTES: `k_idx` is actually the k index default for K-major, while `k_b_idx` may be MN-major
218218
// And for all m-grouped GEMMs, A must be K-majored
219219
DG_STATIC_ASSERT(kGemmType == GemmType::Normal or kGemmType == GemmType::KGroupedContiguous or kGemmType == GemmType::Batched or
220220
kMajorA == cute::UMMA::Major::K, "Invalid major");
221221
uint32_t k_idx = k_block_idx * BLOCK_K;
222-
uint32_t k_a_idx = scheduler.template get_global_idx<(kMajorA == cute::UMMA::Major::MN), scheduler::IndexType::K> (
222+
uint32_t k_a_idx = scheduler.template get_global_idx<(kMajorA == cute::UMMA::Major::MN), sched::IndexType::K> (
223223
shape_k, BLOCK_K, k_block_idx, m_block_idx);
224-
uint32_t k_b_idx = scheduler.template get_global_idx<(kMajorB == cute::UMMA::Major::MN), scheduler::IndexType::K> (
224+
uint32_t k_b_idx = scheduler.template get_global_idx<(kMajorB == cute::UMMA::Major::MN), sched::IndexType::K> (
225225
shape_k, BLOCK_K, k_block_idx, m_block_idx);
226226

227227
// Add 2 CTA offsets
@@ -252,14 +252,14 @@ sm100_fp8_fp4_gemm_1d1d_impl(int* grouped_layout,
252252
// No swizzling, so one TMA for one SF is enough
253253
if (k_block_idx % kNumSFAStagesPerLoad == 0) {
254254
uint32_t sfa_m_idx = m_block_idx * BLOCK_M;
255-
uint32_t sfa_k_idx = scheduler.template get_global_idx<(not is_m_grouped_contiguous(kGemmType)), scheduler::IndexType::SF_K>(
255+
uint32_t sfa_k_idx = scheduler.template get_global_idx<(not is_m_grouped_contiguous(kGemmType)), sched::IndexType::SF_K>(
256256
shape_sfa_k, 1, math::ceil_div(k_idx, BLOCK_K * kNumSFAStagesPerLoad));
257257
tma::copy<BLOCK_M, 1, 0>(&tensor_map_sfa, full_barriers[stage_idx], smem_sfa[stage_idx], sfa_m_idx, sfa_k_idx);
258258
num_arrival_bytes += BLOCK_M * sizeof(uint32_t);
259259
}
260260
if (k_block_idx % kNumSFBStagesPerLoad == 0) {
261261
uint32_t sfb_n_idx = n_block_idx * BLOCK_N;
262-
uint32_t sfb_k_idx = scheduler.template get_global_idx<true, scheduler::IndexType::SF_K>(
262+
uint32_t sfb_k_idx = scheduler.template get_global_idx<true, sched::IndexType::SF_K>(
263263
shape_sfb_k, 1, math::ceil_div(k_idx, BLOCK_K * kNumSFBStagesPerLoad), m_block_idx);
264264
tma::copy<BLOCK_N, 1, 0>(&tensor_map_sfb, full_barriers[stage_idx], smem_sfb[stage_idx], sfb_n_idx, sfb_k_idx);
265265
num_arrival_bytes += BLOCK_N * sizeof(uint32_t);
@@ -460,7 +460,7 @@ sm100_fp8_fp4_gemm_1d1d_impl(int* grouped_layout,
460460
ptx::tcgen05_after_thread_sync();
461461

462462
const auto tmem_base_addr = accum_stage_idx * UMMA_N;
463-
const auto base_m_idx = scheduler.template get_global_idx<(not is_m_grouped_contiguous(kGemmType)), scheduler::IndexType::MN>(shape_m, BLOCK_M, m_block_idx);
463+
const auto base_m_idx = scheduler.template get_global_idx<(not is_m_grouped_contiguous(kGemmType)), sched::IndexType::MN>(shape_m, BLOCK_M, m_block_idx);
464464
const auto base_n_idx = n_block_idx * BLOCK_N;
465465

466466
if constexpr (kSwapAB) {

0 commit comments

Comments (0)