k2-fsa · danpovey · Sep 26, 2020 · Sep 24, 2020 · Sep 24, 2020 · Sep 26, 2020
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -101,6 +101,7 @@ if(USE_PYTORCH)
   include(torch)
 endif()
 include(cub)
+include(moderngpu)
 include(googletest)
 
 add_subdirectory(k2)
diff --git a/cmake/cub.cmake b/cmake/cub.cmake
@@ -24,7 +24,6 @@ function(download_cub)
   message(STATUS "cub is downloaded to ${cub_SOURCE_DIR}")
   add_library(cub INTERFACE)
   target_include_directories(cub INTERFACE ${cub_SOURCE_DIR})
-
 endfunction()
 
 download_cub()
diff --git a/cmake/moderngpu.cmake b/cmake/moderngpu.cmake
@@ -0,0 +1,31 @@
+# Copyright (c)  2020  Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
+# See ../LICENSE for clarification regarding multiple authors
+
+function(download_moderngpu)
+  if(CMAKE_VERSION VERSION_LESS 3.11)
+    list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)
+  endif()
+
+  include(FetchContent)
+
+  # this is the latest commit of modern gpu as of 2020-09-26
+  set(moderngpu_URL  "https://github.com/moderngpu/moderngpu/archive/2b3985541c8e88a133769598c406c33ddde9d0a5.zip")
+  set(moderngpu_HASH "SHA256=191546af18cd5fb858ecb561316f3af67537ab16f610fc8f1a5febbffc27755a")
+
+  FetchContent_Declare(moderngpu
+    URL               ${moderngpu_URL}
+    URL_HASH          ${moderngpu_HASH}
+  )
+
+  FetchContent_GetProperties(moderngpu)
+  if(NOT moderngpu)
+    message(STATUS "Downloading moderngpu")
+    FetchContent_Populate(moderngpu)
+  endif()
+  message(STATUS "moderngpu is downloaded to ${moderngpu_SOURCE_DIR}")
+  add_library(moderngpu INTERFACE)
+  target_include_directories(moderngpu INTERFACE ${moderngpu_SOURCE_DIR}/src)
+  target_compile_options(moderngpu INTERFACE -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w)
+endfunction()
+
+download_moderngpu()
diff --git a/k2/csrc/CMakeLists.txt b/k2/csrc/CMakeLists.txt
@@ -15,6 +15,7 @@ set(context_srcs
   fsa.cu
   fsa_algo.cu
   math.cu
+  moderngpu_allocator.cu
   ragged.cu
   tensor.cu
   tensor_ops.cu
@@ -33,6 +34,8 @@ set_target_properties(context PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 
 # lib deps
 target_link_libraries(context PUBLIC cub)
+target_link_libraries(context PUBLIC fsa)
+target_link_libraries(context PUBLIC moderngpu)
 if(USE_PYTORCH)
   target_link_libraries(context PUBLIC ${TORCH_LIBRARIES})
 endif()
@@ -41,14 +44,14 @@ endif()
 
 # please sort the source files alphabetically
 set(cuda_tests
-    array_ops_test
-    array_test
-    log_test
-    ragged_shape_test
-    ragged_test
-    tensor_test
-    utils_test
-    )
+  array_ops_test
+  array_test
+  log_test
+  ragged_shape_test
+  ragged_test
+  tensor_test
+  utils_test
+)
 
 # utility function to add gtest
 function(k2_add_cuda_test name)
@@ -58,16 +61,16 @@ function(k2_add_cuda_test name)
   add_executable(${target_name} "${name}.cu")
   set_target_properties(${target_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
   target_link_libraries(${target_name}
-      PRIVATE
-      context
-      fsa  # for code in k2/csrc/host
-      gtest
-      gtest_main
-      )
+    PRIVATE
+    context
+    fsa  # for code in k2/csrc/host
+    gtest
+    gtest_main
+  )
   add_test(NAME "Test.Cuda.${target_name}"
-      COMMAND
-      $<TARGET_FILE:${target_name}>
-      )
+    COMMAND
+    $<TARGET_FILE:${target_name}>
+  )
 endfunction()
 
 foreach (name IN LISTS cuda_tests)

diff --git a/k2/csrc/array_ops_inl.h b/k2/csrc/array_ops_inl.h
@@ -379,15 +379,17 @@ Array1<T> RandUniformArray1(ContextPtr &c, int32_t dim, T min_value,
   T *data = temp.Data();
   K2_CHECK(max_value >= min_value);
   if (max_value == min_value) {
-    for (int32_t i = 0; i < dim; i++) data[i] = 0;
+    for (int32_t i = 0; i < dim; ++i) data[i] = 0;
   } else if (std::is_floating_point<T>::value ||
              std::abs(min_value) > RAND_MAX || std::abs(max_value) > RAND_MAX) {
     for (int32_t i = 0; i < dim; i++)
       data[i] =
           min_value + (rand() * (max_value - min_value) / RAND_MAX);  // NOLINT
   } else {
-    for (int32_t i = 0; i < dim; i++)
-      data[i] = min_value + (rand() % (max_value + 1 - min_value));  // NOLINT
+    for (int32_t i = 0; i < dim; ++i)
+      data[i] =
+          min_value +
+          (rand() % static_cast<int32_t>(max_value + 1 - min_value));  // NOLINT
   }
   return temp.To(c);
 }

diff --git a/k2/csrc/moderngpu_allocator.cu b/k2/csrc/moderngpu_allocator.cu
@@ -0,0 +1,52 @@
+/**
+ * @brief A better memory allocator for moderngpu.
+ *
+ *
+ * @copyright
+ * Copyright (c)  2020  Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
+ *
+ * @copyright
+ * See LICENSE for clarification regarding multiple authors
+ */
+
+#include <utility>
+
+#include "k2/csrc/context.h"
+#include "k2/csrc/moderngpu_allocator.h"
+#include "moderngpu/context.hxx"
+
+namespace {
+
+class ModernGpuAllocator : public mgpu::standard_context_t {
+ public:
+  explicit ModernGpuAllocator(k2::ContextPtr context)
+      : mgpu::standard_context_t(false, context->GetCudaStream()),
+        context_(std::move(context)) {}
+
+  void *alloc(size_t size, mgpu::memory_space_t space) override {
+    K2_DCHECK_EQ(space, mgpu::memory_space_device);
+    void *deleter_ = nullptr;
+    void *p = context_->Allocate(size, &deleter_);
+    K2_DCHECK(deleter_ == nullptr);
+    return p;
+  }
+
+  void free(void *p, mgpu::memory_space_t space) override {
+    K2_DCHECK_EQ(space, mgpu::memory_space_device);
+    context_->Deallocate(p, nullptr);
+  }
+
+ private:
+  k2::ContextPtr context_;
+};
+
+}  // namespace
+
+namespace k2 {
+
+std::unique_ptr<mgpu::context_t> GetModernGpuAllocator(
+    int32_t device_id /*= -1*/) {
+  return std::make_unique<ModernGpuAllocator>(GetCudaContext(device_id));
+}
+
+}  // namespace k2
diff --git a/k2/csrc/moderngpu_allocator.h b/k2/csrc/moderngpu_allocator.h
@@ -0,0 +1,27 @@
+/**
+ * @brief This is an allocator for moderngpu only.
+ *
+ * Currently it is used by `SortSublists`.
+ *
+ * @copyright
+ * Copyright (c)  2020  AI Lab, Beijing, China (authors: Fangjun Kuang)
+ *
+ * @copyright
+ * See LICENSE for clarification regarding multiple authors
+ */
+
+#ifndef K2_CSRC_MODERNGPU_ALLOCATOR_H_
+#define K2_CSRC_MODERNGPU_ALLOCATOR_H_
+
+#include <memory>
+
+#include "moderngpu/context.hxx"
+
+namespace k2 {
+// Return a context for moderngpu that has a better memory allocator
+// than mgpu::standard_context_t
+std::unique_ptr<mgpu::context_t> GetModernGpuAllocator(int32_t device_id = -1);
+
+}  // namespace k2
+
+#endif  // K2_CSRC_MODERNGPU_ALLOCATOR_H_
diff --git a/k2/csrc/pytorch_context.cu b/k2/csrc/pytorch_context.cu
@@ -11,7 +11,10 @@
 
 #include <memory>
 
+#include "c10/cuda/CUDACachingAllocator.h"
 #include "c10/cuda/CUDAFunctions.h"
+#include "k2/csrc/context.h"
+#include "k2/csrc/log.h"
 #include "k2/csrc/pytorch_context.h"
 
 namespace k2 {
@@ -23,9 +26,6 @@ class PytorchCpuContext : public Context {
     K2_CHECK(allocator_->raw_deleter() != nullptr);
   }
 
-  // since the constructor is private, the only way to create an instance
-  // of PytorchCpuContext is via `Make`, which returns a `shared_ptr`.
-  // Thus it is safe to call `shared_from_this`.
   ContextPtr GetCpuContext() override { return shared_from_this(); }
 
   ContextPtr GetPinnedContext() override { return nullptr; }

diff --git a/k2/csrc/pytorch_context.h b/k2/csrc/pytorch_context.h
@@ -15,10 +15,7 @@
 
 #include <memory>
 
-#include "c10/cuda/CUDACachingAllocator.h"
-#include "c10/cuda/CUDAFunctions.h"
 #include "k2/csrc/context.h"
-#include "k2/csrc/log.h"
 #include "torch/torch.h"
 
 namespace k2 {

diff --git a/k2/csrc/ragged.cu b/k2/csrc/ragged.cu
@@ -294,7 +294,7 @@ int32_t RaggedShape::operator[](const std::vector<int32_t> &indexes) {
 void RaggedShape::Check() {
   ContextPtr c = Context();
   int32_t num_axes = axes_.size();
-  for (int32_t axis = 0; axis < axes_.size(); axis++) {
+  for (int32_t axis = 0; axis < num_axes; ++axis) {
     RaggedShapeDim &rsd = axes_[axis];
     K2_CHECK_GE(rsd.row_splits.Dim(), 0);
     if (rsd.cached_tot_size >= 0) {
@@ -343,7 +343,7 @@ void RaggedShape::Check() {
                       << " but cached_tot_size == " << rsd.cached_tot_size;
       }
     }
-    if (axis + 1 < axes_.size()) {
+    if (axis + 1 < num_axes) {
       int32_t next_num_rows = axes_[axis + 1].row_splits.Dim() - 1;
       if (num_elems != next_num_rows) {
         K2_LOG(FATAL) << "Ragged shape has num_elems for axes_[" << axis

diff --git a/k2/csrc/ragged.h b/k2/csrc/ragged.h
@@ -102,8 +102,7 @@ class RaggedShape {
   // row_splits on that  axis.
   int32_t MaxSize(int32_t axis);
 
-  ContextPtr &Context() { return axes_[0].row_splits.Context(); }
-  const ContextPtr &Context() const { return axes_[0].row_splits.Context(); }
+  ContextPtr &Context() const { return axes_[0].row_splits.Context(); }
 
   /*
     It is an error to call this if this.NumAxes() < 2.  This will return
@@ -127,7 +126,8 @@ class RaggedShape {
 
   RaggedShapeIndexIterator Iterator();
 
-  explicit RaggedShape(std::vector<RaggedShapeDim> &axes, bool check = true)
+  explicit RaggedShape(const std::vector<RaggedShapeDim> &axes,
+                       bool check = true)
       : axes_(axes) {
     if (check) Check();
   }
@@ -486,7 +486,6 @@ Ragged<T> Stack(int32_t axis, int32_t num_srcs, Ragged<T> **src);
 template <typename T>
 Ragged<T> Stack(int32_t axis, int32_t num_srcs, Ragged<T> *src);
 
-
 /*
   Construct a RaggedShape with 2 axes.
      @param [in] row_splits   row_splits, or NULL (at least one of this and
@@ -574,6 +573,19 @@ Ragged<T> RandomRagged(T min_value = static_cast<T>(0),
                        int32_t min_num_elements = 0,
                        int32_t max_num_elements = 2000);
 
+/*
+  Sort a ragged array in-place.
+
+     @param [inout]   The input array to be sorted.
+                      CAUTION: it is sorted in-place.
+     @param [out]     The indexes mapping from the sorted
+                      array to the input array. The caller
+                      has to pre-allocate memory for it
+                      on the same device as `src`.
+ */
+template <typename T, typename Op = LessThan<T>>
+void SortSublists(Ragged<T> *src, Array1<int32_t> *order);
+
 }  // namespace k2
 
 // TODO(dan): include guard maybe.

diff --git a/k2/csrc/ragged_inl.h b/k2/csrc/ragged_inl.h
@@ -7,6 +7,7 @@
  *
  * @copyright
  * Copyright (c)  2020  Xiaomi Corporation (authors: Daniel Povey)
+ *                      Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
  *
  * @copyright
  * See LICENSE for clarification regarding multiple authors
@@ -15,8 +16,12 @@
 #ifndef K2_CSRC_RAGGED_INL_H_
 #define K2_CSRC_RAGGED_INL_H_
 
+#include <memory>
 #include <vector>
 
+#include "k2/csrc/moderngpu_allocator.h"
+#include "moderngpu/kernel_segsort.hxx"
+
 namespace k2 {
 
 template <typename T>
@@ -95,6 +100,35 @@ Ragged<T> RandomRagged(T min_value, T max_value, int32_t min_num_axes,
   return Ragged<T>(shape, values);
 }
 
+template <typename T, typename Op /* = LessThan<T> */>
+void SortSublists(Ragged<T> *src, Array1<int32_t> *order) {
+  K2_DCHECK(IsCompatible(src->values, *order));
+  K2_DCHECK_EQ(src->values.Dim(), order->Dim());
+  K2_DCHECK_EQ(src->Context()->GetDeviceType(), kCuda)
+      << "It supports only CUDA at present";
+
+  std::unique_ptr<mgpu::context_t> context =
+      GetModernGpuAllocator(src->Context()->GetDeviceId());
+
+  Array1<int32_t> &segment = src->shape.RowSplits(src->NumAxes() - 1);
   Sort a ragged array in-place. 
      @param [inout]   The input array to be sorted. 
                       CAUTION: it is sorted in-place. 
      @param [out]     The indexes mapping from the sorted 
                       array to the input array. The caller 
                       has to pre-allocate memory for it 
                       on the same device as `src`. 
 RaggedShape() = default; 
 K2_DCHECK_GE(src->NumAxes(), 2); 
   Sort a ragged array in-place. 
  
      @param [inout]   The input array to be sorted. 
                       CAUTION: it is sorted in-place. 
      @param [out]     The indexes mapping from the sorted 
                       array to the input array. The caller 
                       has to pre-allocate memory for it 
                       on the same device as `src`. 
 RaggedShape() = default; 
 K2_DCHECK_GE(src->NumAxes(), 2); 
+  mgpu::segmented_sort_indices(src->values.Data(),  // keys
+                               order->Data(),       // indices
+                               src->values.Dim(),   // count
+                               segment.Data() + 1,  // segments
+                               segment.Dim() - 1,   // num_segments
+                               Op(),                // cmp
+                               *context);           // context
+  auto err = cudaGetLastError();
+  (void)err;
+  // TODO(fangjun): err is not cudaSuccess, but why was the data sorted
+  // correctly?
+  //
+  // Check failed: err == cudaSuccess (9 vs. 0)  Error: invalid configuration
+  // argument.
+  //
+  // K2_DCHECK_CUDA_ERROR(err);
+}
+
 }  // namespace k2
 
 #endif  // K2_CSRC_RAGGED_INL_H_