豆豆友情提示:这是一个非官方 GitHub 代理镜像,主要用于网络测试或访问加速。请勿在此进行登录、注册或处理任何敏感信息。进行这些操作请务必访问官方网站 github.com。 Raw 内容也通过此代理提供。
Skip to content

Commit 4fdadc8

Browse files
LyricZhao
and Chenggang Zhao authored
Add support for hc_bwd on SM90 (#170)
* Add support for sm90_hc_bwd * minor fix * Remove all unsafe refs * Remove all unsafe refs * same bar name and add __syncwarp() * additional transpose warp * additional transpose warp * Revert "additional transpose warp" This reverts commit 331df9f893998661002b7cdda968feae2efbdc51. * remove one barrier * improve bf16 to fp32 conversion * fix bug * remove one barrier and postpone wgmma waitgroup<0> * Loop Invariant Code Motion * increase block_M to 128 * Minor Fix * Minor Fix * Add one barrier for stress test correctness * Add comments * Some refactors with the new device APIs --------- Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com>
1 parent 9f4799d commit 4fdadc8

File tree

6 files changed

+641
-3
lines changed

6 files changed

+641
-3
lines changed

csrc/apis/hyperconnection.hpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#if DG_FP8_COMPATIBLE and DG_TENSORMAP_COMPATIBLE
66
#include "../jit_kernels/impls/sm90_tf32_hc_prenorm_gemm.hpp"
77
#include "../jit_kernels/impls/sm100_tf32_hc_prenorm_gemm.hpp"
8+
#include "../jit_kernels/impls/sm90_tf32_hc_prenorm_bwd_gemm.hpp" // oss-ignore-line
89
#include "../jit_kernels/impls/sm100_tf32_hc_prenorm_bwd_gemm.hpp" // oss-ignore-line
910
#endif
1011

@@ -97,7 +98,9 @@ static void tf32_hc_prenorm_bwd_gemm(const torch::Tensor& a,
9798

9899
// Dispatch into different implements
99100
const auto arch_major = device_runtime->get_arch_major();
100-
if (arch_major == 10) {
101+
if (arch_major == 9) {
102+
sm90_tf32_hc_prenorm_bwd_gemm(a, b, dd, ds, da, db, m, n, k, accumulate_on_da);
103+
} else if (arch_major == 10) {
101104
sm100_tf32_hc_prenorm_bwd_gemm(a, b, dd, ds, da, db, m, n, k, accumulate_on_da);
102105
} else {
103106
DG_HOST_UNREACHABLE("Unsupported architecture");

csrc/indexing/main.cu

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,10 @@
2424
// Hyperconnection kernels
2525
#include <deep_gemm/impls/sm90_tf32_hc_prenorm_gemm.cuh>
2626
#include <deep_gemm/impls/sm100_tf32_hc_prenorm_gemm.cuh>
27-
#include <deep_gemm/impls/sm100_tf32_hc_prenorm_bwd_gemm.cuh> // oss-ignore-line
27+
/* oss-ignore-begin */
28+
#include <deep_gemm/impls/sm90_tf32_hc_prenorm_bwd_gemm.cuh>
29+
#include <deep_gemm/impls/sm100_tf32_hc_prenorm_bwd_gemm.cuh>
30+
/* oss-ignore-end */
2831

2932
// Layout kernels
3033
#include <deep_gemm/impls/smxx_layout.cuh>
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
#pragma once
2+
3+
#include <torch/python.h>
4+
5+
#include "../../jit/compiler.hpp"
6+
#include "../../jit/device_runtime.hpp"
7+
#include "../../jit/kernel_runtime.hpp"
8+
#include "../../utils/exception.hpp"
9+
#include "../../utils/format.hpp"
10+
#include "../../utils/math.hpp"
11+
#include "../heuristics/sm90.hpp"
12+
#include "runtime_utils.hpp"
13+
14+
namespace deep_gemm {
15+
16+
// JIT launch runtime for the SM90 hyperconnection prenorm backward GEMM.
// Generates the source that instantiates the device kernel from the
// compile-time tuning parameters in `Args`, and launches the compiled
// kernel with the runtime arguments.
// NOTE(review): the class name says "BF16" while the kernel symbol says
// "tf32" — presumably operand A is BF16 while the rest of the pipeline is
// TF32/FP32 (the host launcher asserts BF16/TF32 swizzle equality for A);
// confirm against the device kernel.
class SM90BF16HCPrenormBwdGemmRuntime final: public LaunchRuntime<SM90BF16HCPrenormBwdGemmRuntime> {
public:
    struct Args {
        // Problem sizes
        int m, n, k;
        // Tile sizes along M/N/K
        int block_m, block_n, block_k;
        // TMA swizzle modes for the N-, K- and M-major descriptors
        int swizzle_n_mode, swizzle_k_mode, swizzle_m_mode;
        // Non-zero to accumulate into existing `da` instead of overwriting
        int accumulate_on_da;
        // Number of software-pipeline stages
        int num_stages;
        // Thread counts for the TMA warpgroup and the dA/dB compute warpgroups
        int num_tma_threads, num_da_threads, num_db_threads;

        // Grid/block/shared-memory launch configuration
        LaunchArgs launch_args;

        // Raw output pointers (no TMA descriptor needed for these)
        float* ds;
        float* db;
        // TMA descriptors for the tiled operands
        CUtensorMap tensor_map_a;
        CUtensorMap tensor_map_b;
        CUtensorMap tensor_map_dd;
        CUtensorMap tensor_map_da;
    };

    // Emits the CUDA translation unit that instantiates the device kernel
    // template with all compile-time parameters baked in. The `{{`/`}}`
    // pairs are fmt escapes for literal braces.
    static std::string generate_impl(const Args& args) {
        return fmt::format(R"(
#include <deep_gemm/impls/sm90_tf32_hc_prenorm_bwd_gemm.cuh>

using namespace deep_gemm;

static void __instantiate_kernel() {{
    auto ptr = reinterpret_cast<void*>(&sm90_tf32_hc_prenorm_bwd_gemm_impl<
        {}, {},
        {}, {}, {},
        {}, {}, {},
        {},
        {},
        {}, {}, {}
    >);
}};
)",
        // Argument order must match the kernel template parameter order
        args.n, args.k,
        args.block_m, args.block_n, args.block_k,
        args.swizzle_n_mode, args.swizzle_k_mode, args.swizzle_m_mode,
        args.accumulate_on_da,
        args.num_stages,
        args.num_tma_threads, args.num_da_threads, args.num_db_threads);
    }

    // Launches the compiled kernel. Only runtime-varying values are passed;
    // everything else was baked in at JIT-compile time by `generate_impl`.
    static void launch_impl(const KernelHandle& kernel, const LaunchConfigHandle& config, Args args) {
        // TODO: optimize `args` copy
        DG_CUDA_UNIFIED_CHECK(launch_kernel(kernel, config,
            args.m, args.ds, args.db,
            args.tensor_map_a, args.tensor_map_b, args.tensor_map_dd,
            args.tensor_map_da));
    }
};
69+
70+
// Host-side launcher for the SM90 hyperconnection prenorm backward GEMM.
//
// Builds TMA descriptors for the tiled operands (`a` as BF16, `b`, `dd`,
// `da` as FP32), picks the deepest pipeline stage count that fits in SM90
// shared memory, then JIT-compiles and launches the kernel.
//
// Parameters:
//   a, b            - forward operands (a is BF16; see swizzle assertion below)
//   dd              - incoming gradient tile (presumably d(output); confirm with the forward pass)
//   ds, db          - gradient outputs written through raw float pointers
//   da              - gradient output written through TMA
//   m, n, k         - problem sizes
//   accumulate_on_da - if true, the kernel accumulates into `da` instead of overwriting
static void sm90_tf32_hc_prenorm_bwd_gemm(const torch::Tensor& a,
                                          const torch::Tensor& b,
                                          const torch::Tensor& dd,
                                          const torch::Tensor& ds,
                                          const torch::Tensor& da,
                                          const torch::Tensor& db,
                                          const int& m, const int& n, const int& k,
                                          const bool& accumulate_on_da) {
    constexpr int block_m = 128;
    const int block_n = align(n, 16);
    DG_HOST_ASSERT(n <= block_n);
    // Only support small N for now
    DG_HOST_ASSERT(n <= 32 and n % 8 == 0);
    constexpr int block_k = 64;

    // One warpgroup each for TMA, dA computation and dB computation
    constexpr int num_tma_threads = 128;
    constexpr int num_da_threads = 128;
    constexpr int num_db_threads = 128;

    // NOTES: block K must be large enough (>= 64) to ensure TF32 and BF16 swizzling are the same
    const auto& swizzle_n_mode = get_swizzle_mode(block_n, sizeof(float));
    const auto& swizzle_k_mode = get_swizzle_mode(block_k, sizeof(float));
    const auto& swizzle_m_mode = get_swizzle_mode(block_m, sizeof(float));
    DG_HOST_ASSERT(swizzle_k_mode == get_swizzle_mode(block_k, sizeof(nv_bfloat16))); // for tma_a (BF16)

    const auto tensor_map_a = make_tma_b_desc(cute::UMMA::Major::MN, a, k, m,
                                              block_k, block_m,
                                              static_cast<int>(a.stride(0)), 1,
                                              swizzle_k_mode);
    const auto tensor_map_b = make_tma_b_desc(cute::UMMA::Major::MN, b, k, n,
                                              block_k, block_n,
                                              static_cast<int>(b.stride(0)), 1,
                                              swizzle_k_mode, 0, true);
    const auto tensor_map_dd = make_tma_a_desc(cute::UMMA::Major::MN, dd, n, m,
                                               block_n, block_m,
                                               static_cast<int>(dd.stride(0)), 1,
                                               swizzle_n_mode, 0, true);
    const auto tensor_map_da = make_tma_cd_desc(da, m, k, // (m, k) k inner major
                                                block_m, block_k,
                                                static_cast<int>(da.stride(0)), 1,
                                                swizzle_k_mode);

    // Calculate stages: start from the deepest pipeline and shrink until the
    // total shared-memory footprint fits. Per-stage sizes are stage-count
    // invariant, so compute them once outside the search loop.
    const int smem_dd_per_stage = block_m * block_n * sizeof(float);
    const int smem_a_per_stage = block_m * block_k * sizeof(nv_bfloat16);
    const int smem_ds_per_stage = block_m * sizeof(float);
    const int smem_b = block_n * block_k * sizeof(float);
    const int smem_da = block_m * block_k * sizeof(nv_bfloat16) * 2;
    int num_stages = 16, smem_size = 0;
    while (num_stages > 0) {
        // 4 barriers per stage plus 1 extra, 8 bytes each
        const int smem_barriers = (num_stages * 4 + 1) * 8;
        smem_size = (smem_dd_per_stage + smem_a_per_stage + smem_ds_per_stage) * num_stages +
                    smem_da + smem_b + smem_barriers;

        if (smem_size <= SM90ArchSpec::smem_capacity)
            break;
        -- num_stages;
    }
    DG_HOST_ASSERT(num_stages > 0);

    // Print configs
    if (get_env("DG_JIT_DEBUG", 0)) {
        printf("M: %d, N: %d, K: %d -> "
               "block M: %d, block N: %d, block K: %d, "
               "stages: %d, shared memory: %d, "
               "swizzle N: %d, swizzle K: %d, swizzle M: %d\n",
               m, n, k, block_m, block_n, block_k,
               num_stages, smem_size,
               swizzle_n_mode, swizzle_k_mode, swizzle_m_mode);
    }

    // Launch: grid is split along K; one block hosts all three warpgroups
    const SM90BF16HCPrenormBwdGemmRuntime::Args args = {
        .m = m, .n = n, .k = k,
        .block_m = block_m, .block_n = block_n, .block_k = block_k,
        .swizzle_n_mode = swizzle_n_mode, .swizzle_k_mode = swizzle_k_mode, .swizzle_m_mode = swizzle_m_mode,
        .accumulate_on_da = static_cast<int>(accumulate_on_da),
        .num_stages = num_stages,
        .num_tma_threads = num_tma_threads,
        .num_da_threads = num_da_threads,
        .num_db_threads = num_db_threads,
        .launch_args = LaunchArgs(ceil_div(k, block_k), num_tma_threads + num_da_threads + num_db_threads, smem_size, 1),
        .ds = ds.data_ptr<float>(),
        .db = db.data_ptr<float>(),
        .tensor_map_a = tensor_map_a,
        .tensor_map_b = tensor_map_b,
        .tensor_map_dd = tensor_map_dd,
        .tensor_map_da = tensor_map_da,
    };
    const auto code = SM90BF16HCPrenormBwdGemmRuntime::generate(args);
    const auto runtime = compiler->build("sm90_tf32_hc_prenorm_bwd_gemm", code);
    SM90BF16HCPrenormBwdGemmRuntime::launch(runtime, args);
}
163+
164+
} // namespace deep_gemm

deep_gemm/include/deep_gemm/common/math.cuh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,13 @@ __forceinline__ __device__ void swap(T& a, T& b) {
4545
b = temp;
4646
}
4747

48+
// Lane-wise fused multiply-add on packed float2 values: each component is
// a single round-to-nearest-even FMA, i.e. (a.x*b.x + c.x, a.y*b.y + c.y).
__device__ __forceinline__ float2 fma2(const float2& a, const float2& b, const float2& c) {
    float2 result;
    result.x = __fmaf_rn(a.x, b.x, c.x);
    result.y = __fmaf_rn(a.y, b.y, c.y);
    return result;
}
54+
4855
/// Casting
4956
template <typename old_t>
5057
__device__ __forceinline__ int cast_into_bf16_and_pack(old_t& x, old_t& y) {

0 commit comments

Comments
 (0)