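Remove a redundant register: drop the separate `rank_idx` kernel argument in favor of `sym_buffer.rank_idx` (also prints the rank in the NVLink barrier timeout message and raises `kNumMaxRanks` from 64 to 72)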

LyricZhao · LyricZhao · commit 2b71c004fc23 · 2026-04-10T16:49:45.000+08:00
diff --git a/csrc/jit_kernels/impls/sm100_fp8_fp4_mega_moe.hpp b/csrc/jit_kernels/impls/sm100_fp8_fp4_mega_moe.hpp
@@ -31,7 +31,6 @@ class SM100FP8FP4MegaMoERuntime final : public LaunchRuntime<SM100FP8FP4MegaMoER
         void* y;
         int num_tokens;
         layout::SymBuffer<> sym_buffer_ptrs;
-        int rank_idx;
 
         // Tensormap
         CUtensorMap tensor_map_l1_acts;
@@ -87,7 +86,7 @@ static void __instantiate_kernel() {{
         DG_CUDA_UNIFIED_CHECK(launch_kernel(kernel, config,
             args.y,
             args.num_tokens,
-            args.sym_buffer_ptrs, args.rank_idx,
+            args.sym_buffer_ptrs,
             args.tensor_map_l1_acts,
             args.tensor_map_l1_acts_sf,
             args.tensor_map_l1_weights,
@@ -185,7 +184,6 @@ static void sm100_fp8_fp4_mega_moe(
         .y = y.data_ptr(),
         .num_tokens = num_tokens,
         .sym_buffer_ptrs = layout::SymBuffer<>(sym_buffer_ptrs, rank_idx),
-        .rank_idx = rank_idx,
         .tensor_map_l1_acts = tensor_map_l1_acts,
         .tensor_map_l1_acts_sf = tensor_map_l1_acts_sf,
         .tensor_map_l1_weights = tensor_map_l1_weights,
diff --git a/deep_gemm/include/deep_gemm/comm/barrier.cuh b/deep_gemm/include/deep_gemm/comm/barrier.cuh
@@ -57,8 +57,8 @@ CUTLASS_DEVICE void nvlink_barrier(const layout::Workspace& workspace,
             const auto start_clock = clock64();
             while (ptx::ld_acq_sys(signal_ptr) != target) {
                 if (clock64() - start_clock >= kNumTimeoutCycles) {
-                    printf("DeepGEMM NVLink barrier timeout (30s): signal=%d, target=%d, phase=%d, sign=%d\n",
-                           ptx::ld_acq_sys(signal_ptr), target, signal_phase, signal_sign);
+                    printf("DeepGEMM NVLink barrier timeout (30s): rank=%d, signal=%d, target=%d, phase=%d, sign=%d\n",
+                           sym_buffer.rank_idx, ptx::ld_acq_sys(signal_ptr), target, signal_phase, signal_sign);
                     DG_DEVICE_ASSERT(false and "NVLink barrier timeout");
                 }
             }
diff --git a/deep_gemm/include/deep_gemm/impls/sm100_fp8_fp4_mega_moe.cuh b/deep_gemm/include/deep_gemm/impls/sm100_fp8_fp4_mega_moe.cuh
@@ -48,7 +48,6 @@ CUTLASS_GLOBAL __launch_bounds__(kNumThreads, 1) void
 sm100_fp8_fp4_mega_moe_impl(void* y,
                             const uint32_t num_tokens,
                             const __grid_constant__ layout::SymBuffer<kNumRanks> sym_buffer,
-                            const uint32_t rank_idx,
                             const __grid_constant__ cute::TmaDescriptor tensor_map_l1_acts,
                             const __grid_constant__ cute::TmaDescriptor tensor_map_l1_acts_sf,
                             const __grid_constant__ cute::TmaDescriptor tensor_map_l1_weights,
@@ -390,7 +389,7 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
             const auto dst_rank_idx = expert_idx / kNumExpertsPerRank;
             const auto dst_slot_idx = atomicAdd_block(smem_expert_count + expert_idx, 1);
             const auto dst_ptr = workspace.get_src_token_topk_idx_ptr(
-                expert_idx % kNumExpertsPerRank, rank_idx, dst_slot_idx);
+                expert_idx % kNumExpertsPerRank, sym_buffer.rank_idx, dst_slot_idx);
             *sym_buffer.map(dst_ptr, dst_rank_idx) = token_topk_idx;
         });
         cutlass::arch::NamedBarrier::sync(kNumDispatchThreads, kDispatchBarrierIdx);
@@ -409,7 +408,7 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
                 const auto dst_local_expert_idx = i % kNumExpertsPerRank;
                 const auto expert_status = *workspace.get_expert_send_count_ptr(i);
                 *sym_buffer.map(
-                    workspace.get_expert_recv_count_ptr(rank_idx, dst_local_expert_idx),
+                    workspace.get_expert_recv_count_ptr(sym_buffer.rank_idx, dst_local_expert_idx),
                     dst_rank_idx) = expert_status & 0xffffffff;
                 ptx::atomic_add_sys(
                     sym_buffer.map(workspace.get_expert_recv_count_sum_ptr(dst_local_expert_idx), dst_rank_idx),
diff --git a/deep_gemm/include/deep_gemm/layout/sym_buffer.cuh b/deep_gemm/include/deep_gemm/layout/sym_buffer.cuh
@@ -4,7 +4,7 @@
 
 namespace deep_gemm::layout {
 
-constexpr static uint32_t kNumMaxRanks = 64;
+constexpr static uint32_t kNumMaxRanks = 72;
 
 template <uint32_t kNumRanks = kNumMaxRanks>
 struct SymBuffer {

Original file line number	Diff line number	Diff line change
`@@ -57,8 +57,8 @@ CUTLASS_DEVICE void nvlink_barrier(const layout::Workspace& workspace,`
`57`	`57`	`const auto start_clock = clock64();`
`58`	`58`	`while (ptx::ld_acq_sys(signal_ptr) != target) {`
`59`	`59`	`if (clock64() - start_clock >= kNumTimeoutCycles) {`
`60`		`- printf("DeepGEMM NVLink barrier timeout (30s): signal=%d, target=%d, phase=%d, sign=%d\n",`
`61`		`- ptx::ld_acq_sys(signal_ptr), target, signal_phase, signal_sign);`
	`60`	`+ printf("DeepGEMM NVLink barrier timeout (30s): rank=%d, signal=%d, target=%d, phase=%d, sign=%d\n",`
	`61`	`+ sym_buffer.rank_idx, ptx::ld_acq_sys(signal_ptr), target, signal_phase, signal_sign);`
`62`	`62`	`DG_DEVICE_ASSERT(false and "NVLink barrier timeout");`
`63`	`63`	`}`
`64`	`64`	`}`