豆豆友情提示:这是一个非官方 GitHub 代理镜像,主要用于网络测试或访问加速。请勿在此进行登录、注册或处理任何敏感信息。进行这些操作请务必访问官方网站 github.com。 Raw 内容也通过此代理提供。
Skip to content

Commit 198d857

Browse files
authored
Merge branch 'sgl-kernel' into main
2 parents e8e9ea4 + b29302e commit 198d857

20 files changed

+488
-189
lines changed

csrc/apis/gemm.hpp

Lines changed: 1 addition & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -403,69 +403,4 @@ static void m_grouped_bf16_gemm_nt_masked(const torch::Tensor& a, const torch::T
403403
}
404404
}
405405

406-
static void register_apis(pybind11::module_& m) {
407-
// FP8 GEMMs
408-
m.def("fp8_gemm_nt", &fp8_gemm_nt,
409-
py::arg("a"), py::arg("b"), py::arg("d"),
410-
py::arg("c") = std::nullopt, py::arg("recipe") = std::nullopt,
411-
py::arg("compiled_dims") = "nk",
412-
py::arg("disable_ue8m0_cast") = false);
413-
m.def("fp8_gemm_nn", &fp8_gemm_nn,
414-
py::arg("a"), py::arg("b"), py::arg("d"),
415-
py::arg("c") = std::nullopt, py::arg("recipe") = std::nullopt,
416-
py::arg("compiled_dims") = "nk",
417-
py::arg("disable_ue8m0_cast") = false);
418-
m.def("fp8_gemm_tn", &fp8_gemm_tn,
419-
py::arg("a"), py::arg("b"), py::arg("d"),
420-
py::arg("c") = std::nullopt, py::arg("recipe") = std::nullopt,
421-
py::arg("compiled_dims") = "mn",
422-
py::arg("disable_ue8m0_cast") = false);
423-
m.def("fp8_gemm_tt", &fp8_gemm_tt,
424-
py::arg("a"), py::arg("b"), py::arg("d"),
425-
py::arg("c") = std::nullopt, py::arg("recipe") = std::nullopt,
426-
py::arg("compiled_dims") = "mn",
427-
py::arg("disable_ue8m0_cast") = false);
428-
m.def("m_grouped_fp8_gemm_nt_contiguous", &m_grouped_fp8_gemm_nt_contiguous,
429-
py::arg("a"), py::arg("b"), py::arg("d"), py::arg("m_indices"),
430-
py::arg("recipe") = std::nullopt, py::arg("compiled_dims") = "nk",
431-
py::arg("disable_ue8m0_cast") = false);
432-
m.def("m_grouped_fp8_gemm_nn_contiguous", &m_grouped_fp8_gemm_nn_contiguous,
433-
py::arg("a"), py::arg("b"), py::arg("d"), py::arg("m_indices"),
434-
py::arg("recipe") = std::nullopt, py::arg("compiled_dims") = "nk",
435-
py::arg("disable_ue8m0_cast") = false);
436-
m.def("m_grouped_fp8_gemm_nt_masked", &m_grouped_fp8_gemm_nt_masked,
437-
py::arg("a"), py::arg("b"), py::arg("d"), py::arg("masked_m"),
438-
py::arg("expected_m"), py::arg("recipe") = std::nullopt,
439-
py::arg("compiled_dims") = "nk", py::arg("disable_ue8m0_cast") = false);
440-
m.def("k_grouped_fp8_gemm_tn_contiguous", &k_grouped_fp8_gemm_tn_contiguous,
441-
py::arg("a"), py::arg("b"), py::arg("d"), py::arg("ks"),
442-
py::arg("ks_tensor"), py::arg("c") = std::nullopt,
443-
py::arg("recipe") = std::make_tuple(1, 1, 128),
444-
py::arg("compiled_dims") = "mn");
445-
446-
// BF16 GEMMs
447-
m.def("bf16_gemm_nt", &bf16_gemm_nt,
448-
py::arg("a"), py::arg("b"), py::arg("d"),
449-
py::arg("c") = std::nullopt,
450-
py::arg("compiled_dims") = "nk");
451-
m.def("bf16_gemm_nn", &bf16_gemm_nn,
452-
py::arg("a"), py::arg("b"), py::arg("d"),
453-
py::arg("c") = std::nullopt,
454-
py::arg("compiled_dims") = "nk");
455-
m.def("bf16_gemm_tn", &bf16_gemm_tn,
456-
py::arg("a"), py::arg("b"), py::arg("d"),
457-
py::arg("c") = std::nullopt,
458-
py::arg("compiled_dims") = "mn");
459-
m.def("bf16_gemm_tt", &bf16_gemm_tt,
460-
py::arg("a"), py::arg("b"), py::arg("d"),
461-
py::arg("c") = std::nullopt,
462-
py::arg("compiled_dims") = "mn");
463-
m.def("m_grouped_bf16_gemm_nt_contiguous", &m_grouped_bf16_gemm_nt_contiguous,
464-
py::arg("a"), py::arg("b"), py::arg("d"), py::arg("m_indices"),
465-
py::arg("compiled_dims") = "nk");
466-
m.def("m_grouped_bf16_gemm_nt_masked", &m_grouped_bf16_gemm_nt_masked,
467-
py::arg("a"), py::arg("b"), py::arg("d"), py::arg("masked_m"),
468-
py::arg("expected_m"), py::arg("compiled_dims") = "nk");
469-
}
470-
471-
} // namespace deep_gemm::gemm
406+
} // namespace deep_gemm::gemm

csrc/apis/layout.hpp

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -69,17 +69,4 @@ static torch::Tensor transform_k_grouped_sf_into_required_layout(const torch::Te
6969
DG_HOST_UNREACHABLE("Unknown cases");
7070
}
7171

72-
static void register_apis(pybind11::module_& m) {
73-
m.def("transform_sf_into_required_layout", &transform_sf_into_required_layout,
74-
py::arg("sf"), py::arg("mn"), py::arg("k"), py::arg("recipe"),
75-
py::arg("num_groups") = std::nullopt, py::arg("is_sfa") = false,
76-
py::arg("disable_ue8m0_cast") = false);
77-
78-
m.def("get_tma_aligned_size", &get_tma_aligned_size);
79-
m.def("get_mk_alignment_for_contiguous_layout", &get_mk_alignment_for_contiguous_layout);
80-
m.def("get_mn_major_tma_aligned_tensor", &get_mn_major_tma_aligned_tensor);
81-
m.def("get_mn_major_tma_aligned_packed_ue8m0_tensor", &get_mn_major_tma_aligned_packed_ue8m0_tensor);
82-
m.def("get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor", &get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor);
83-
}
84-
8572
} // namespace deep_gemm::layout

csrc/apis/runtime.hpp

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,6 @@
55

66
namespace deep_gemm::runtime {
77

8-
static void register_apis(pybind11::module_& m) {
9-
m.def("set_num_sms", [&](const int& new_num_sms) {
10-
device_runtime->set_num_sms(new_num_sms);
11-
});
12-
m.def("get_num_sms", [&]() {
13-
return device_runtime->get_num_sms();
14-
});
15-
m.def("set_tc_util", [&](const int& new_tc_util) {
16-
device_runtime->set_tc_util(new_tc_util);
17-
});
18-
m.def("get_tc_util", [&]() {
19-
return device_runtime->get_tc_util();
20-
});
21-
22-
m.def("init", [&](const std::string& library_root_path, const std::string& cuda_home_path_by_python) {
23-
Compiler::prepare_init(library_root_path, cuda_home_path_by_python);
24-
KernelRuntime::prepare_init(cuda_home_path_by_python);
25-
});
26-
}
8+
// The init and other functions are now exposed via TORCH_LIBRARY in python_api.cpp
279

2810
} // namespace deep_gemm::runtime

csrc/jit/device_runtime.hpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,9 @@ class DeviceRuntime {
2525
return {prop->major, prop->minor};
2626
}
2727

28-
std::string get_arch() {
28+
int get_arch() {
2929
const auto& [major, minor] = get_arch_pair();
30-
if (major == 10 and minor != 1)
31-
return "100f";
32-
return std::to_string(major * 10 + minor) + "a";
30+
return major * 10 + minor;
3331
}
3432

3533
int get_arch_major() {

csrc/jit_kernels/impls/runtime_utils.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#pragma once
22

33
#include <cuda.h>
4-
#include <torch/python.h>
54

65
#include "../../utils/math.hpp"
76
#include "../../utils/exception.hpp"

csrc/jit_kernels/impls/sm100_bf16_gemm.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#pragma once
22

3-
#include <torch/python.h>
4-
53
#include "../../jit/compiler.hpp"
64
#include "../../jit/device_runtime.hpp"
75
#include "../../jit/kernel_runtime.hpp"

csrc/jit_kernels/impls/sm100_fp8_gemm_1d1d.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#pragma once
22

3-
#include <torch/python.h>
4-
53
#include "../../jit/compiler.hpp"
64
#include "../../jit/device_runtime.hpp"
75
#include "../../jit/kernel_runtime.hpp"

csrc/jit_kernels/impls/sm100_fp8_gemm_1d2d.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#pragma once
22

3-
#include <torch/python.h>
4-
53
#include "../../jit/compiler.hpp"
64
#include "../../jit/device_runtime.hpp"
75
#include "../../jit/kernel_runtime.hpp"

csrc/jit_kernels/impls/sm90_bf16_gemm.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#pragma once
22

3-
#include <torch/python.h>
4-
53
#include "../../jit/compiler.hpp"
64
#include "../../jit/kernel_runtime.hpp"
75
#include "../../utils/exception.hpp"

csrc/jit_kernels/impls/sm90_fp8_gemm_1d2d.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#pragma once
22

3-
#include <torch/python.h>
4-
53
#include "../../jit/compiler.hpp"
64
#include "../../jit/device_runtime.hpp"
75
#include "../../jit/kernel_runtime.hpp"

0 commit comments

Comments (0)