豆豆友情提示:这是一个非官方 GitHub 代理镜像,主要用于网络测试或访问加速。请勿在此进行登录、注册或处理任何敏感信息。进行这些操作请务必访问官方网站 github.com。 Raw 内容也通过此代理提供。
Skip to content

Commit 9f4799d

Browse files
authored
Loading with the Library Enumerate Kernels API (#181)
* Support Driver Library Enumerate Kernels * Minor fix * Many fixes * Lint * Minor fix
1 parent 88398ce commit 9f4799d

File tree

3 files changed

+38
-4
lines changed

3 files changed

+38
-4
lines changed

csrc/jit/handle.hpp

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ DECL_LAZY_CUDA_DRIVER_FUNCTION(cuFuncSetAttribute);
3939
DECL_LAZY_CUDA_DRIVER_FUNCTION(cuModuleLoad);
4040
DECL_LAZY_CUDA_DRIVER_FUNCTION(cuModuleUnload);
4141
DECL_LAZY_CUDA_DRIVER_FUNCTION(cuModuleGetFunction);
42+
DECL_LAZY_CUDA_DRIVER_FUNCTION(cuLibraryLoadFromFile);
43+
DECL_LAZY_CUDA_DRIVER_FUNCTION(cuLibraryUnload);
44+
DECL_LAZY_CUDA_DRIVER_FUNCTION(cuKernelGetFunction);
4245
DECL_LAZY_CUDA_DRIVER_FUNCTION(cuLaunchKernelEx);
4346
DECL_LAZY_CUDA_DRIVER_FUNCTION(cuTensorMapEncodeTiled);
4447

@@ -103,33 +106,57 @@ static auto launch_kernel(const KernelHandle& kernel, const LaunchConfigHandle&
103106
#else
104107

105108
// Use CUDA driver API
106-
using LibraryHandle = CUmodule;
107109
using KernelHandle = CUfunction;
108110
using LaunchConfigHandle = CUlaunchConfig;
109111
using LaunchAttrHandle = CUlaunchAttribute;
110112

113+
// `cuLibraryEnumerateKernels` is supported since CUDA Driver API 12.4
114+
#if CUDA_VERSION >= 12040
115+
#define DG_JIT_USE_LIBRARY_ENUM_KERNELS
116+
DECL_LAZY_CUDA_DRIVER_FUNCTION(cuLibraryGetKernelCount);
117+
DECL_LAZY_CUDA_DRIVER_FUNCTION(cuLibraryEnumerateKernels);
118+
using LibraryHandle = CUlibrary;
119+
#else
120+
using LibraryHandle = CUmodule;
121+
#endif
122+
111123
#define DG_CUDA_UNIFIED_CHECK DG_CUDA_DRIVER_CHECK
112124

113125
// Load a single kernel from a compiled cubin file via the CUDA driver API.
//
// On CUDA >= 12.4 (DG_JIT_USE_LIBRARY_ENUM_KERNELS) the cubin is loaded as a
// CUlibrary and the kernel is discovered by enumeration, so `func_name` is not
// consulted; the cubin is expected to contain exactly one kernel. On older
// drivers the cubin is loaded as a CUmodule and the kernel is resolved by
// `func_name`.
//
// Parameters:
//   cubin_path  - filesystem path of the cubin to load
//   func_name   - kernel symbol name (ignored on the enumeration path)
//   library_opt - if non-null, receives the loaded library/module handle so
//                 the caller can later release it with `unload_library`
// Returns the kernel handle ready for launching.
static KernelHandle load_kernel(const std::filesystem::path& cubin_path, const std::string& func_name,
                                LibraryHandle *library_opt = nullptr) {
    LibraryHandle library;
    KernelHandle kernel;

#ifdef DG_JIT_USE_LIBRARY_ENUM_KERNELS
    // The kernel is found by enumeration, not by name; silence the
    // unused-parameter warning this branch would otherwise emit.
    static_cast<void>(func_name);
    DG_CUDA_DRIVER_CHECK(lazy_cuLibraryLoadFromFile(&library, cubin_path.c_str(), nullptr, nullptr, 0, nullptr, nullptr, 0));
    unsigned int num_kernels;
    DG_CUDA_DRIVER_CHECK(lazy_cuLibraryGetKernelCount(&num_kernels, library));
    // The JIT pipeline emits exactly one kernel per cubin; anything else is a bug.
    DG_HOST_ASSERT(num_kernels == 1);
    CUkernel cu_kernel;
    DG_CUDA_DRIVER_CHECK(lazy_cuLibraryEnumerateKernels(&cu_kernel, 1, library));
    DG_CUDA_DRIVER_CHECK(lazy_cuKernelGetFunction(&kernel, cu_kernel));
#else
    DG_CUDA_DRIVER_CHECK(lazy_cuModuleLoad(&library, cubin_path.c_str()));
    DG_CUDA_DRIVER_CHECK(lazy_cuModuleGetFunction(&kernel, library, func_name.c_str()));
#endif

    // Hand the library handle back so the caller owns the unload.
    if (library_opt != nullptr)
        *library_opt = library;
    return kernel;
}
124147

125148
static void unload_library(const LibraryHandle& library) {
149+
#ifdef DG_JIT_USE_LIBRARY_ENUM_KERNELS
150+
const auto error = lazy_cuLibraryUnload(library);
151+
#else
126152
const auto error = lazy_cuModuleUnload(library);
153+
#endif
127154
DG_HOST_ASSERT(error == CUDA_SUCCESS or error == CUDA_ERROR_DEINITIALIZED);
128155
}
129156

130157
static LaunchConfigHandle construct_launch_config(const KernelHandle& kernel,
131-
const cudaStream_t& stream, const int& smem_size,
132-
const dim3& grid_dim, const dim3& block_dim, const int& cluster_dim) {
158+
const cudaStream_t& stream, const int& smem_size,
159+
const dim3& grid_dim, const dim3& block_dim, const int& cluster_dim) {
133160
if (smem_size > 0)
134161
DG_CUDA_DRIVER_CHECK(lazy_cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem_size));
135162

csrc/jit/kernel_runtime.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ class KernelRuntime final {
4646
if (get_env<int>("DG_JIT_DEBUG") or get_env<int>("DG_JIT_PRINT_LOAD_TIME"))
4747
start_time = std::chrono::high_resolution_clock::now();
4848

49+
#ifdef DG_JIT_USE_LIBRARY_ENUM_KERNELS
50+
// Load from the library
51+
kernel = load_kernel(cubin_path, {}, &library);
52+
#else
4953
// Find the only symbol
5054
// TODO: use kernel enumeration for newer drivers
5155
const std::vector<std::string> illegal_names = {"vprintf", "__instantiate_kernel", "__internal", "__assertfail"};
@@ -75,6 +79,7 @@ class KernelRuntime final {
7579

7680
// Load from the library
7781
kernel = load_kernel(cubin_path, symbol_names[0], &library);
82+
#endif
7883

7984
// Print load time
8085
if (get_env<int>("DG_JIT_DEBUG") or get_env<int>("DG_JIT_PRINT_LOAD_TIME")) {

csrc/jit_kernels/impls/runtime_utils.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,14 +72,16 @@ static CUtensorMapDataType aten_dtype_to_tensor_map_dtype(const at::ScalarType&
7272
case torch::kFloat: return CU_TENSOR_MAP_DATA_TYPE_FLOAT32;
7373
case torch::kBFloat16: return CU_TENSOR_MAP_DATA_TYPE_BFLOAT16;
7474
case torch::kFloat8_e4m3fn: return CU_TENSOR_MAP_DATA_TYPE_UINT8;
75+
#if CUDA_VERSION >= 12080
7576
case kPackedFP4: return fp4_unpacked_smem ? CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B
7677
: CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B;
78+
#endif
7779
default: DG_HOST_UNREACHABLE("Unsupported dtype");
7880
}
7981
}
8082

8183
static CUtensorMapSwizzle mode_into_tensor_map_swizzle(const int& mode, const int& base) {
82-
#if CUDART_VERSION >= 12080
84+
#if CUDA_VERSION >= 12080
8385
if (base != 0) {
8486
DG_HOST_ASSERT(base == 32 and mode == 128);
8587
return CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B;

0 commit comments

Comments (0)