Store symmetric buffer addresses as offsets relative to the local rank's base instead of whole pointers (#239)

LyricZhao · web-flow · commit 67c7e2be78b6 · 2026-04-10T15:26:12.000+08:00
diff --git a/csrc/apis/mega.hpp b/csrc/apis/mega.hpp
@@ -124,7 +124,7 @@ static void fp8_fp4_mega_moe(
     const std::tuple<torch::Tensor, torch::Tensor>& l1_weights_,
     const std::tuple<torch::Tensor, torch::Tensor>& l2_weights_,
     const torch::Tensor& sym_buffer,
-    const std::vector<uint64_t>& sym_buffer_ptrs, const int& rank_idx,
+    const std::vector<int64_t>& sym_buffer_ptrs, const int& rank_idx,
     const int& num_max_tokens_per_rank,
     const int& num_experts, const int& num_topk,
     const std::tuple<int, int, int>& recipe,
diff --git a/csrc/jit_kernels/impls/sm100_fp8_fp4_mega_moe.hpp b/csrc/jit_kernels/impls/sm100_fp8_fp4_mega_moe.hpp
@@ -107,7 +107,7 @@ static void sm100_fp8_fp4_mega_moe(
     const torch::Tensor& l2_acts, const torch::Tensor& l2_acts_sf,
     const torch::Tensor& l1_weights, const torch::Tensor& l2_weights,
     const torch::Tensor& l1_weights_sf, const torch::Tensor& l2_weights_sf,
-    const std::vector<uint64_t>& sym_buffer_ptrs,
+    const std::vector<int64_t>& sym_buffer_ptrs,
     const int& rank_idx, const int& num_max_tokens_per_rank,
     const int& num_experts_per_rank,
     const int& num_tokens, const int& num_topk,
diff --git a/deep_gemm/include/deep_gemm/layout/sym_buffer.cuh b/deep_gemm/include/deep_gemm/layout/sym_buffer.cuh
@@ -8,31 +8,31 @@ constexpr static uint32_t kNumMaxRanks = 64;
 
 template <uint32_t kNumRanks = kNumMaxRanks>
 struct SymBuffer {
-    uint64_t offsets[kNumMaxRanks];
-
-    uint32_t rank_idx = 0;
+    int64_t base;
+    int64_t offsets[kNumMaxRanks];
+    uint32_t rank_idx;
 
     DG_STATIC_ASSERT(kNumRanks <= kNumMaxRanks, "Too many ranks");
 
     SymBuffer() = default;
 
     template <typename Container>
-    explicit SymBuffer(const Container& c, const uint32_t& rank_idx = 0): rank_idx(rank_idx) {
+    explicit SymBuffer(const Container& c, const uint32_t& rank_idx): rank_idx(rank_idx) {
         const auto size = static_cast<uint32_t>(c.size());
+        base = c[rank_idx];
         for (uint32_t i = 0; i < kNumMaxRanks; ++ i)
-            offsets[i] = i < size ? c[i] : 0;
+            offsets[i] = i < size ? (c[i] - base) : 0;
     }
 
 #if defined(__CUDA_ARCH__) or defined(__CLION_IDE__)
     template <typename ptr_t = void*>
     CUTLASS_DEVICE ptr_t get_base_ptr() const {
-        return reinterpret_cast<ptr_t>(offsets[rank_idx]);
+        return reinterpret_cast<ptr_t>(base);
     }
 
     template <typename ptr_t>
     CUTLASS_DEVICE ptr_t map(const ptr_t& ptr, const uint32_t& dst_rank_idx) const {
-        uint64_t mapped_ptr = offsets[dst_rank_idx] +
-            (reinterpret_cast<uint64_t>(ptr) - offsets[rank_idx]);
+        int64_t mapped_ptr = offsets[dst_rank_idx] + reinterpret_cast<int64_t>(ptr);
         return *reinterpret_cast<ptr_t*>(&mapped_ptr);
     }
 #endif