豆豆友情提示:这是一个非官方 GitHub 代理镜像,主要用于网络测试或访问加速。请勿在此进行登录、注册或处理任何敏感信息。进行这些操作请务必访问官方网站 github.com。 Raw 内容也通过此代理提供。
Skip to content

Commit 9b6dccd

Browse files
Misc optimizations (#240)
* Remove unnecessary cute::min
* Use add.rn.f32.bf16 for mixed-precision addition
* Code tidy-up
* Use __fdividef for kFastMath
* Revert buggy optimization
1 parent 2b71c00 commit 9b6dccd

File tree

2 files changed

+15
-13
lines changed

2 files changed

+15
-13
lines changed

deep_gemm/include/deep_gemm/impls/sm100_fp8_fp4_mega_moe.cuh

Lines changed: 2 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -589,11 +589,6 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
589589
// Wait for token TMA store to complete
590590
cute::tma_store_arrive();
591591
ptx::tma_store_wait<0>();
592-
}
593-
__syncwarp();
594-
595-
// Notify finishing
596-
if (cute::elect_one_sync()) {
597592
ptx::red_add_rel(
598593
workspace.get_l1_arrival_count_ptr(current_expert_idx, token_idx_in_expert / BLOCK_M), 1);
599594
}
@@ -1025,7 +1020,7 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
10251020
const auto denom = __fadd2_rn(make_float2(1.0f, 1.0f), neg_gate_exp);
10261021
float2 silu_gate;
10271022
if constexpr (kFastMath) {
1028-
silu_gate = make_float2(__fdiv_rn(gate.x, denom.x), __fdiv_rn(gate.y, denom.y));
1023+
silu_gate = make_float2(__fdividef(gate.x, denom.x), __fdividef(gate.y, denom.y));
10291024
} else {
10301025
silu_gate = make_float2(gate.x / denom.x, gate.y / denom.y);
10311026
}
@@ -1349,7 +1344,7 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
13491344
const auto bf16_values = reinterpret_cast<const nv_bfloat162*>(&uint4_values);
13501345
#pragma unroll
13511346
for (uint32_t l = 0; l < kNumElemsPerUint4; ++ l)
1352-
reduced[j * kNumElemsPerUint4 + l] = __fadd2_rn(reduced[j * kNumElemsPerUint4 + l], __bfloat1622float2(bf16_values[l]));
1347+
ptx::accumulate(reduced[j * kNumElemsPerUint4 + l], bf16_values[l]);
13531348
}
13541349
combine_phase ^= load_stage_idx;
13551350
load_stage_idx ^= 1;

deep_gemm/include/deep_gemm/ptx/utils.cuh

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
#pragma once
22

33
#include <cuda/std/cstdint>
4+
#include <cuda_bf16.h>
45

56
#include <deep_gemm/common/exception.cuh>
67

@@ -14,14 +15,9 @@ CUTLASS_DEVICE uint32_t get_sm_idx() {
1415

1516
CUTLASS_DEVICE uint32_t get_lane_idx() {
1617
uint32_t lane_id;
17-
asm ("mov.u32 %0, %laneid;" : "=r"(lane_id));
18+
asm ("mov.u32 %0, %%laneid;" : "=r"(lane_id));
1819
return lane_id;
1920
}
20-
__forceinline__ __device__ float warp_reduce_amax(const float& value, const uint32_t& mask) {
21-
float result;
22-
asm volatile("redux.sync.max.abs.NaN.f32 %0, %1, %d;\n" : "=f"(result) : "f"(value), "r"(mask));
23-
return result;
24-
}
2521

2622
template <typename dtype_t>
2723
CUTLASS_DEVICE dtype_t exchange(dtype_t ptr, const uint32_t& src_lane_idx) {
@@ -35,4 +31,15 @@ CUTLASS_DEVICE dtype_t exchange(dtype_t ptr, const uint32_t& src_lane_idx) {
3531
return recv_dtype;
3632
}
3733

34+
CUTLASS_DEVICE void accumulate(float2& a, nv_bfloat162 b) {
35+
#if defined(__CUDA_ARCH__) and (__CUDA_ARCH__ >= 1000)
36+
// Use `add.rn.f32.bf16` instruction to perform fused (cast + add) operation on SM100
37+
asm("add.rn.f32.bf16 %0, %1, %0;\n" : "+f"(a.x) : "h"(*reinterpret_cast<uint16_t*>(&b.x)));
38+
asm("add.rn.f32.bf16 %0, %1, %0;\n" : "+f"(a.y) : "h"(*reinterpret_cast<uint16_t*>(&b.y)));
39+
#else
40+
const auto [x, y] = __bfloat1622float2(b);
41+
a.x += x, a.y += y;
42+
#endif
43+
}
44+
3845
} // namespace deep_gemm::ptx

0 commit comments

Comments (0)