deepseek-ai
diff --git a/csrc/apis/attention.hpp b/csrc/apis/attention.hpp
Lines changed: 8 additions & 2 deletions
diff --git a/csrc/jit_kernels/impls/runtime_utils.hpp b/csrc/jit_kernels/impls/runtime_utils.hpp
Lines changed: 1 addition & 0 deletions
diff --git a/csrc/jit_kernels/impls/smxx_fp8_mqa_logits.hpp b/csrc/jit_kernels/impls/smxx_fp8_mqa_logits.hpp
Lines changed: 22 additions & 17 deletions
@@ -121,9 +121,15 @@ static torch::Tensor fp8_mqa_logits(const torch::Tensor& q,
     }
 
     // Dispatch implementation
+    torch::Tensor cu_seq_len_k_start_and_end = torch::stack({cu_seq_len_k_start, cu_seq_len_k_end}, 1).reshape({-1});
+    cu_seq_len_k_start_and_end = cu_seq_len_k_start_and_end.contiguous();
     const auto& arch_major = device_runtime->get_arch_major();
-    if (arch_major == 9 or arch_major == 10) {
-        smxx_fp8_mqa_logits(q, kv.first, kv.second, weights, cu_seq_len_k_start, cu_seq_len_k_end, logits,
+    if (arch_major == 9) {
+        smxx_fp8_mqa_logits(q, kv.first, kv.second, weights, cu_seq_len_k_start_and_end, logits,
+                            seq_len, seq_len_kv, max_seqlen_k, stride_logits, num_heads, head_dim, seq_len_alignment);
+    } else if (arch_major == 10) {
+        auto weights_fp16 = weights.to(torch::kFloat16).contiguous();
+        smxx_fp8_mqa_logits(q, kv.first, kv.second, weights_fp16, cu_seq_len_k_start_and_end, logits,
                             seq_len, seq_len_kv, max_seqlen_k, stride_logits, num_heads, head_dim, seq_len_alignment);
     } else {
         DG_HOST_UNREACHABLE("Unsupported architecture");
 
@@ -65,6 +65,7 @@ static CUtensorMapDataType aten_dtype_to_tensor_map_dtype(const at::ScalarType&
         case torch::kFloat:         return CU_TENSOR_MAP_DATA_TYPE_FLOAT32;
         case torch::kBFloat16:      return CU_TENSOR_MAP_DATA_TYPE_BFLOAT16;
         case torch::kFloat8_e4m3fn: return CU_TENSOR_MAP_DATA_TYPE_UINT8;
+        case torch::kFloat16:       return CU_TENSOR_MAP_DATA_TYPE_FLOAT16;
         default: DG_HOST_UNREACHABLE("Unsupported dtype");
     }
 }
 
@@ -24,8 +24,7 @@ class SMXXFP8MQALogitsRuntime final: public LaunchRuntime<SMXXFP8MQALogitsRuntim
         int block_q;
         int block_kv;
 
-        int* cu_seq_len_k_start;
-        int* cu_seq_len_k_end;
+        int* cu_seq_len_k_start_and_end;
         float* logits;
         float softmax_scale;
 
@@ -72,7 +71,7 @@ static void __instantiate_kernel() {{
         DG_CUDA_UNIFIED_CHECK(launch_kernel(kernel, config,
             args.seq_len, args.seq_len_kv,
             args.max_seqlen_k, static_cast<int64_t>(args.stride_logits),
-            args.cu_seq_len_k_start, args.cu_seq_len_k_end,
+            args.cu_seq_len_k_start_and_end,
             args.logits,
             args.tensor_map_q, args.tensor_map_kv,
             args.tensor_map_kv_scales, args.tensor_map_weights
@@ -83,8 +82,7 @@ static void __instantiate_kernel() {{
 static void smxx_fp8_mqa_logits(const torch::Tensor& q,
                                 const torch::Tensor& kv, const torch::Tensor& kv_scales,
                                 const torch::Tensor& weights,
-                                const torch::Tensor& cu_seq_len_k_start,
-                                const torch::Tensor& cu_seq_len_k_end,
+                                const torch::Tensor& cu_seq_len_k_start_and_end,
                                 const torch::Tensor& logits,
                                 const int& seq_len, const int& seq_len_kv,
                                 const int& max_seqlen_k, const int& stride_logits,
@@ -93,8 +91,15 @@ static void smxx_fp8_mqa_logits(const torch::Tensor& q,
     constexpr int block_qh = 128;
     constexpr int block_kv = 256;
     constexpr int num_specialized_threads = 128;
-    constexpr int num_q_stages = 3, num_kv_stages = 3;
-    const int num_math_threads = (device_runtime->get_arch_major() == 10 ? 256 : 512);
+    bool is_sm100 = device_runtime->get_arch_major() == 10;
+    int num_q_stages = 3, num_kv_stages = 3;
+    int num_splits = 1;
+    if (is_sm100) {
+        num_q_stages = 5;
+        num_kv_stages = 8;
+        num_splits = 2;
+    }
+    const int num_math_threads = (is_sm100 ? 256 : 512);
     const int block_q = block_qh / num_heads;
     DG_HOST_ASSERT(block_qh % num_heads == 0);
     DG_HOST_ASSERT(seq_len_alignment % block_q == 0);
@@ -107,27 +112,28 @@ static void smxx_fp8_mqa_logits(const torch::Tensor& q,
     const auto& tensor_map_q = make_tma_2d_desc(q, head_dim, seq_len * num_heads,
                                                 head_dim, block_qh, head_dim, head_dim);
     const auto& tensor_map_kv = make_tma_2d_desc(kv, head_dim, seq_len_kv,
-                                                 head_dim, block_kv, head_dim, head_dim);
+                                                 head_dim, block_kv / num_splits, head_dim, head_dim);
     // According to the driver API, the minimal alignment is 256 bytes
     // So it is safe for us to do a 16-byte OOB
     const auto& tensor_map_kv_scales = make_tma_2d_desc(kv_scales,
                                                         get_tma_aligned_size(seq_len_kv, static_cast<int>(kv_scales.element_size())),
-                                                        1, block_kv, 1, 0, 0);
+                                                        1, block_kv / num_splits, 1, 0, 0);
     const auto& tensor_map_weights = make_tma_2d_desc(weights, num_heads, seq_len,
                                                       num_heads, block_q, num_heads, 0);
 
     // Calculate shared memory size
     int smem_size = 0;
     const int smem_q_size_per_stage = block_q * num_heads * head_dim * static_cast<int>(q.element_size());
-    const int smem_weight_size_per_stage = block_q * num_heads * static_cast<int>(weights.element_size());
-    const int smem_kv_size_per_stage = block_kv * head_dim * static_cast<int>(kv.element_size());
-    const int kv_scale_size_per_stage = block_kv * static_cast<int>(kv_scales.element_size());
+    const int smem_weight_size_per_stage = num_splits * block_q * num_heads * static_cast<int>(weights.element_size());
+    const int smem_kv_size_per_stage = (block_kv / num_splits) * head_dim * static_cast<int>(kv.element_size());
+    const int kv_scale_size_per_stage = (block_kv / num_splits) * static_cast<int>(kv_scales.element_size());
     smem_size += num_q_stages * smem_q_size_per_stage;
     smem_size += num_kv_stages * smem_kv_size_per_stage;
     smem_size += num_q_stages * smem_weight_size_per_stage;
     smem_size += num_kv_stages * kv_scale_size_per_stage;
-    smem_size += (num_q_stages * 2 + num_kv_stages * 2 + (num_math_threads / 128) * 2) * 8;
-    smem_size += 4;
+    const int num_mma_stages = 2;
+    smem_size += (num_q_stages * 2 + num_kv_stages * 2 + num_mma_stages * 2) * 8;
+    smem_size += 256;
     DG_HOST_ASSERT(smem_size <= SM90ArchSpec::smem_capacity);
     DG_HOST_ASSERT(smem_size <= SM100ArchSpec::smem_capacity);
 
@@ -143,8 +149,7 @@ static void smxx_fp8_mqa_logits(const torch::Tensor& q,
         .num_kv_stages = num_kv_stages,
         .block_q = block_q,
         .block_kv = block_kv,
-        .cu_seq_len_k_start = cu_seq_len_k_start.data_ptr<int>(),
-        .cu_seq_len_k_end = cu_seq_len_k_end.data_ptr<int>(),
+        .cu_seq_len_k_start_and_end = cu_seq_len_k_start_and_end.data_ptr<int>(),
         .logits = logits.data_ptr<float>(),
         .tensor_map_q = tensor_map_q,
         .tensor_map_kv = tensor_map_kv,
@@ -154,7 +159,7 @@ static void smxx_fp8_mqa_logits(const torch::Tensor& q,
         .num_math_threads = num_math_threads,
         .launch_args = LaunchArgs(device_runtime->get_num_sms(),
                                   num_specialized_threads + num_math_threads,
-                                  smem_size)
+                                  smem_size, num_splits)
     };
     const auto& code = SMXXFP8MQALogitsRuntime::generate(args);
     const auto& runtime = compiler->build("smxx_fp8_mqa_logits", code);
Original file line number	Diff line number	Diff line change
`@@ -65,6 +65,7 @@ static CUtensorMapDataType aten_dtype_to_tensor_map_dtype(const at::ScalarType&`
`65`	`65`	`case torch::kFloat: return CU_TENSOR_MAP_DATA_TYPE_FLOAT32;`
`66`	`66`	`case torch::kBFloat16: return CU_TENSOR_MAP_DATA_TYPE_BFLOAT16;`
`67`	`67`	`case torch::kFloat8_e4m3fn: return CU_TENSOR_MAP_DATA_TYPE_UINT8;`
	`68`	`+ case torch::kFloat16: return CU_TENSOR_MAP_DATA_TYPE_FLOAT16;`
`68`	`69`	`default: DG_HOST_UNREACHABLE("Unsupported dtype");`
`69`	`70`	`}`
`70`	`71`	`}`