豆豆友情提示:这是一个非官方 GitHub 代理镜像,主要用于网络测试或访问加速。请勿在此进行登录、注册或处理任何敏感信息。进行这些操作请务必访问官方网站 github.com。 Raw 内容也通过此代理提供。
Skip to content

Commit b29302e

Browse files
authored
fix b200 cu128 (#4)
1 parent eda35b4 commit b29302e

File tree

7 files changed

+19
-17
lines changed

7 files changed

+19
-17
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,9 @@ cat develop.sh
6666

6767
# Test all GEMM implementations
6868
python tests/test_layout.py
69-
python tests/test_core.py
69+
python tests/test_bf16.py
70+
python tests/test_fp8.py
71+
python tests/test_lazy_init.py
7072
```
7173

7274
### Installation

csrc/jit/compiler.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ class NVCCCompiler final: public Compiler {
155155
signature = fmt::format("NVCC{}.{}", nvcc_major, nvcc_minor);
156156

157157
// Override the compiler flags
158-
flags = fmt::format("{} -I{} --gpu-architecture=sm_{} "
158+
flags = fmt::format("{} -I{} --gpu-architecture=sm_{}a "
159159
"--compiler-options=-fPIC,-O3,-fconcepts,-Wno-deprecated-declarations,-Wno-abi "
160160
"-cubin -O3 --expt-relaxed-constexpr --expt-extended-lambda",
161161
flags, library_include_path.c_str(), device_runtime->get_arch());
@@ -205,7 +205,7 @@ class NVRTCCompiler final: public Compiler {
205205
}
206206

207207
// Override the compiler flags
208-
flags = fmt::format("{} {}--gpu-architecture=sm_{} -default-device {}",
208+
flags = fmt::format("{} {}--gpu-architecture=sm_{}a -default-device {}",
209209
flags, include_dirs, device_runtime->get_arch(), pch_flags);
210210
}
211211

csrc/jit/device_runtime.hpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,9 @@ class DeviceRuntime {
2525
return {prop->major, prop->minor};
2626
}
2727

28-
std::string get_arch() {
28+
int get_arch() {
2929
const auto& [major, minor] = get_arch_pair();
30-
if (major == 10 and minor != 1)
31-
return "100f";
32-
return std::to_string(major * 10 + minor) + "a";
30+
return major * 10 + minor;
3331
}
3432

3533
int get_arch_major() {

csrc/jit/kernel_runtime.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ class KernelRuntime final {
4646
std::istringstream iss(symbols);
4747
std::vector<std::string> symbol_names;
4848
for (std::string line; std::getline(iss, line); ) {
49-
if (line.find("STT_FUNC") == 0 and std::none_of(illegal_names.begin(), illegal_names.end(),
49+
if (line.find("STT_FUNC") == 0 and line.find("STO_ENTRY") != std::string::npos and
50+
std::none_of(illegal_names.begin(), illegal_names.end(),
5051
[&](const auto& name) { return line.find(name) != std::string::npos; })) {
5152
const auto& last_space = line.rfind(' ');
5253
symbol_names.push_back(line.substr(last_space + 1));

deep_gemm/include/deep_gemm/impls/sm100_bf16_gemm.cuh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ sm100_bf16_gemm_impl(int* grouped_layout,
3232
const __grid_constant__ cute::TmaDescriptor tensor_map_d) {
3333
#if (defined(__CUDA_ARCH__) and (__CUDA_ARCH__ >= 1000)) or defined(__CLION_IDE__)
3434
using Barrier = cutlass::arch::ClusterTransactionBarrier;
35-
using Allocator = cute::conditional_t<kNumMulticast == 1, cute::TMEM::Allocator1Sm, cute::TMEM::Allocator2Sm>;
3635

3736
// GEMM with accumulation must have FP32 output
3837
if constexpr (kWithAccumulation)
@@ -142,7 +141,7 @@ sm100_bf16_gemm_impl(int* grouped_layout,
142141
cutlass::arch::fence_barrier_init();
143142
} else if (threadIdx.x >= 32 and threadIdx.x < 64) {
144143
// Allocate tensor memory
145-
Allocator().allocate(kNumTmemCols, tmem_ptr_in_smem);
144+
cute::TMEM::Allocator1Sm().allocate(kNumTmemCols, tmem_ptr_in_smem);
146145
}
147146
kNumMulticast > 1 ? cute::cluster_sync() : __syncthreads();
148147

@@ -473,13 +472,15 @@ sm100_bf16_gemm_impl(int* grouped_layout,
473472
}
474473

475474
// Flush all stages in the pipeline to make TMA stores visible to the next kernel
475+
// TODO: do we actually need this?
476476
if (epilogue_thread_idx == 0)
477477
cute::tma_store_wait<0>();
478478

479479
// Deallocate tensor memory by warp 1
480480
// NOTES: warp 0 is waiting TMA store
481+
// TODO: do we need 2 SM allocation?
481482
if (epilogue_warp_idx == 1)
482-
Allocator().free(0, kNumTmemCols);
483+
cute::TMEM::Allocator1Sm().free(0, kNumTmemCols);
483484
}
484485

485486
// To safely deconstruct all barriers, we need a cluster sync

deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout,
3333
const __grid_constant__ cute::TmaDescriptor tensor_map_d) {
3434
#if (defined(__CUDA_ARCH__) and (__CUDA_ARCH__ >= 1000)) or defined(__CLION_IDE__)
3535
using Barrier = cutlass::arch::ClusterTransactionBarrier;
36-
using Allocator = cute::conditional_t<kNumMulticast == 1, cute::TMEM::Allocator1Sm, cute::TMEM::Allocator2Sm>;
3736

3837
// GEMM with accumulation must have FP32 output
3938
if constexpr (kWithAccumulation)
@@ -170,7 +169,7 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout,
170169
cutlass::arch::fence_barrier_init();
171170
} else if (threadIdx.x >= 32 and threadIdx.x < 64) {
172171
// Allocate tensor memory
173-
Allocator().allocate(kNumTmemCols, tmem_ptr_in_smem);
172+
cute::TMEM::Allocator1Sm().allocate(kNumTmemCols, tmem_ptr_in_smem);
174173
}
175174
kNumMulticast > 1 ? cute::cluster_sync() : __syncthreads();
176175

@@ -578,13 +577,15 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout,
578577
}
579578

580579
// Flush all stages in the pipeline to make TMA stores visible to the next kernel
580+
// TODO: do we actually need this?
581581
if (epilogue_thread_idx == 0)
582582
cute::tma_store_wait<0>();
583583

584584
// Deallocate tensor memory by warp 1
585585
// NOTES: warp 0 is waiting TMA store
586+
// TODO: do we need 2 SM allocation?
586587
if (epilogue_warp_idx == 1)
587-
Allocator().free(0, kNumTmemCols);
588+
cute::TMEM::Allocator1Sm().free(0, kNumTmemCols);
588589
}
589590

590591
// To safely deconstruct all barriers, we need a cluster sync

deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d2d.cuh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ sm100_fp8_gemm_1d2d_impl(float* sfb, int* grouped_layout,
3232
const __grid_constant__ cute::TmaDescriptor tensor_map_sfa) {
3333
#if (defined(__CUDA_ARCH__) and (__CUDA_ARCH__ >= 1000)) or defined(__CLION_IDE__)
3434
using Barrier = cutlass::arch::ClusterTransactionBarrier;
35-
using Allocator = cute::conditional_t<kNumMulticast == 1, cute::TMEM::Allocator1Sm, cute::TMEM::Allocator2Sm>;
3635

3736
// Scaling checks
3837
DG_STATIC_ASSERT(BLOCK_K == 128, "Only support per-128-channel FP8 scaling");
@@ -153,7 +152,7 @@ sm100_fp8_gemm_1d2d_impl(float* sfb, int* grouped_layout,
153152
cutlass::arch::fence_barrier_init();
154153
} else if (threadIdx.x >= 32 and threadIdx.x < 64) {
155154
// Allocate tensor memory
156-
Allocator().allocate(kNumTmemCols, tmem_ptr_in_smem);
155+
cute::TMEM::Allocator1Sm().allocate(kNumTmemCols, tmem_ptr_in_smem);
157156
}
158157
kNumMulticast > 1 ? cute::cluster_sync() : __syncthreads();
159158

@@ -519,7 +518,7 @@ sm100_fp8_gemm_1d2d_impl(float* sfb, int* grouped_layout,
519518
// Deallocate tensor memory by warp 1
520519
// NOTES: warp 0 is waiting TMA store
521520
if (epilogue_warp_idx == 1)
522-
Allocator().free(0, kNumTmemCols);
521+
cute::TMEM::Allocator1Sm().free(0, kNumTmemCols);
523522
}
524523

525524
// To safely deconstruct all barriers, we need a cluster sync

0 commit comments

Comments (0)