豆豆友情提示:这是一个非官方 GitHub 代理镜像,主要用于网络测试或访问加速。请勿在此进行登录、注册或处理任何敏感信息。进行这些操作请务必访问官方网站 github.com。 Raw 内容也通过此代理提供。
Skip to content

Commit 0df67dc

Browse files
authored
Fix 2CTA TMEM Free (#189)
* Fix 2CTA TMEM
* Fix BF16 as well
1 parent e45e35d commit 0df67dc

File tree

2 files changed

+12
-10
lines changed

2 files changed

+12
-10
lines changed

deep_gemm/include/deep_gemm/impls/sm100_bf16_gemm.cuh

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -411,12 +411,13 @@ sm100_bf16_gemm_impl(int* grouped_layout,
411411
tensor_map_cd);
412412
}
413413
}
414-
415-
// Deallocate tensor memory by the last UMMA store warp
416-
// NOTES: warp 0 is waiting TMA store
417-
if (epilogue_warp_idx == kNumUMMAStoreThreads / 32 - 1)
418-
Allocator().free(0, kNumTmemCols);
419414
}
415+
416+
// Deallocate tensor memory
417+
kNumMulticast > 1 ? cute::cluster_sync() : __syncthreads();
418+
if (warp_idx == 0)
419+
Allocator().free(0, kNumTmemCols);
420+
420421
#else
421422
if (blockIdx.x == 0 and threadIdx.x == 0)
422423
DG_DEVICE_ASSERT(false and "This kernel only support sm_100f");

deep_gemm/include/deep_gemm/impls/sm100_fp8_fp4_gemm_1d1d.cuh

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -489,12 +489,13 @@ sm100_fp8_fp4_gemm_1d1d_impl(int* grouped_layout,
489489
tensor_map_cd);
490490
}
491491
}
492-
493-
// Deallocate tensor memory by the last UMMA store warp
494-
// NOTES: warp 0 is waiting TMA store
495-
if (epilogue_warp_idx == kNumUMMAStoreThreads / 32 - 1)
496-
Allocator().free(0, kNumTmemCols);
497492
}
493+
494+
// Deallocate tensor memory
495+
kNumMulticast > 1 ? cute::cluster_sync() : __syncthreads();
496+
if (warp_idx == 0)
497+
Allocator().free(0, kNumTmemCols);
498+
498499
#else
499500
if (blockIdx.x == 0 and threadIdx.x == 0)
500501
DG_DEVICE_ASSERT(false and "This kernel only support sm_100f");

0 commit comments

Comments (0)