豆豆友情提示:这是一个非官方 GitHub 代理镜像,主要用于网络测试或访问加速。请勿在此进行登录、注册或处理任何敏感信息。进行这些操作请务必访问官方网站 github.com。 Raw 内容也通过此代理提供。
Skip to content

Commit ef5c7ee

Browse files
authored
Sync cluster before 2-CTA TMEM alloc (#192)
* Sync cluster before 2-CTA TMEM alloc
* Minor fix
* Minor fix
1 parent bd74376 commit ef5c7ee

File tree

2 files changed

+12
-2
lines changed

2 files changed

+12
-2
lines changed

deep_gemm/include/deep_gemm/impls/sm100_bf16_gemm.cuh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,9 @@ sm100_bf16_gemm_impl(int* grouped_layout,
9494
constexpr uint32_t kNumTmemCols = utils::get_num_aligned_tmem_cols<kNumAccumTmemCols>();
9595
DG_STATIC_ASSERT(32 <= kNumTmemCols and kNumTmemCols <= 512, "Invalid tensor memory columns");
9696

97+
// Synchronize the cluster before 2-CTA TMEM allocation
98+
kNumMulticast > 1 ? cute::cluster_sync() : void();
99+
97100
// Utils
98101
bool is_leader_cta = cute::block_rank_in_cluster() == 0;
99102
const auto warp_idx = cutlass::canonical_warp_idx_sync();
@@ -416,8 +419,10 @@ sm100_bf16_gemm_impl(int* grouped_layout,
416419
}
417420
}
418421

419-
// Deallocate tensor memory
422+
// TODO: Remove redundant synchronization
420423
kNumMulticast > 1 ? cute::cluster_sync() : __syncthreads();
424+
425+
// Deallocate tensor memory
421426
if (warp_idx == 0)
422427
Allocator().free(0, kNumTmemCols);
423428

deep_gemm/include/deep_gemm/impls/sm100_fp8_fp4_gemm_1d1d.cuh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,9 @@ sm100_fp8_fp4_gemm_1d1d_impl(int* grouped_layout,
9999
constexpr uint32_t kTmemStartColOfSFB = kNumAccumTmemCols + kNumSFATmemCols;
100100
DG_STATIC_ASSERT(32 <= kNumTmemCols and kNumTmemCols <= 512, "Invalid tensor memory columns");
101101

102+
// Synchronize the cluster before 2-CTA TMEM allocation
103+
kNumMulticast > 1 ? cute::cluster_sync() : void();
104+
102105
// Utils
103106
const bool is_leader_cta = cute::block_rank_in_cluster() == 0;
104107
const auto warp_idx = cutlass::canonical_warp_idx_sync();
@@ -494,8 +497,10 @@ sm100_fp8_fp4_gemm_1d1d_impl(int* grouped_layout,
494497
}
495498
}
496499

497-
// Deallocate tensor memory
500+
// TODO: Remove redundant synchronization
498501
kNumMulticast > 1 ? cute::cluster_sync() : __syncthreads();
502+
503+
// Deallocate tensor memory
499504
if (warp_idx == 0)
500505
Allocator().free(0, kNumTmemCols);
501506

0 commit comments

Comments (0)