Add segsort from moderngpu #181

Merged
13 commits merged on Sep 26, 2020
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -101,6 +101,7 @@ if(USE_PYTORCH)
include(torch)
endif()
include(cub)
include(moderngpu)
include(googletest)

add_subdirectory(k2)
1 change: 0 additions & 1 deletion cmake/cub.cmake
@@ -24,7 +24,6 @@ function(download_cub)
message(STATUS "cub is downloaded to ${cub_SOURCE_DIR}")
add_library(cub INTERFACE)
target_include_directories(cub INTERFACE ${cub_SOURCE_DIR})

endfunction()

download_cub()
31 changes: 31 additions & 0 deletions cmake/moderngpu.cmake
@@ -0,0 +1,31 @@
# Copyright (c) 2020 Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
# See ../LICENSE for clarification regarding multiple authors

function(download_moderngpu)
if(CMAKE_VERSION VERSION_LESS 3.11)
list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)
endif()

include(FetchContent)

# this is the latest commit of moderngpu as of 2020-09-26
set(moderngpu_URL "https://github.com/moderngpu/moderngpu/archive/2b3985541c8e88a133769598c406c33ddde9d0a5.zip")
set(moderngpu_HASH "SHA256=191546af18cd5fb858ecb561316f3af67537ab16f610fc8f1a5febbffc27755a")

FetchContent_Declare(moderngpu
URL ${moderngpu_URL}
URL_HASH ${moderngpu_HASH}
)

FetchContent_GetProperties(moderngpu)
if(NOT moderngpu_POPULATED)
message(STATUS "Downloading moderngpu")
FetchContent_Populate(moderngpu)
endif()
message(STATUS "moderngpu is downloaded to ${moderngpu_SOURCE_DIR}")
add_library(moderngpu INTERFACE)
target_include_directories(moderngpu INTERFACE ${moderngpu_SOURCE_DIR}/src)
target_compile_options(moderngpu INTERFACE -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w)
endfunction()

download_moderngpu()
37 changes: 20 additions & 17 deletions k2/csrc/CMakeLists.txt
@@ -15,6 +15,7 @@ set(context_srcs
fsa.cu
fsa_algo.cu
math.cu
moderngpu_context.cu
ragged.cu
tensor.cu
tensor_ops.cu
@@ -33,6 +34,8 @@ set_target_properties(context PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

# lib deps
target_link_libraries(context PUBLIC cub)
target_link_libraries(context PUBLIC fsa)
target_link_libraries(context PUBLIC moderngpu)
if(USE_PYTORCH)
target_link_libraries(context PUBLIC ${TORCH_LIBRARIES})
endif()
@@ -41,14 +44,14 @@ endif()

# please sort the source files alphabetically
set(cuda_tests
array_ops_test
array_test
log_test
ragged_shape_test
ragged_test
tensor_test
utils_test
)
array_ops_test
array_test
log_test
ragged_shape_test
ragged_test
tensor_test
utils_test
)

# utility function to add gtest
function(k2_add_cuda_test name)
@@ -58,16 +61,16 @@ function(k2_add_cuda_test name)
add_executable(${target_name} "${name}.cu")
set_target_properties(${target_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(${target_name}
PRIVATE
context
fsa # for code in k2/csrc/host
gtest
gtest_main
)
PRIVATE
context
fsa # for code in k2/csrc/host
gtest
gtest_main
)
add_test(NAME "Test.Cuda.${target_name}"
COMMAND
$<TARGET_FILE:${target_name}>
)
COMMAND
$<TARGET_FILE:${target_name}>
)
endfunction()

foreach (name IN LISTS cuda_tests)
8 changes: 5 additions & 3 deletions k2/csrc/array_ops_inl.h
@@ -379,15 +379,17 @@ Array1<T> RandUniformArray1(ContextPtr &c, int32_t dim, T min_value,
T *data = temp.Data();
K2_CHECK(max_value >= min_value);
if (max_value == min_value) {
for (int32_t i = 0; i < dim; i++) data[i] = 0;
for (int32_t i = 0; i < dim; ++i) data[i] = 0;
} else if (std::is_floating_point<T>::value ||
std::abs(min_value) > RAND_MAX || std::abs(max_value) > RAND_MAX) {
for (int32_t i = 0; i < dim; i++)
data[i] =
min_value + (rand() * (max_value - min_value) / RAND_MAX); // NOLINT
} else {
for (int32_t i = 0; i < dim; i++)
data[i] = min_value + (rand() % (max_value + 1 - min_value)); // NOLINT
for (int32_t i = 0; i < dim; ++i)
data[i] =
min_value +
(rand() % static_cast<int32_t>(max_value + 1 - min_value)); // NOLINT
}
return temp.To(c);
}
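As a sanity check on the integer branch above: `min_value + (rand() % (max_value + 1 - min_value))` yields values in the closed range [min_value, max_value] (up to the usual modulo bias of rand()). For example, with min_value = -2 and max_value = 3 the divisor is 3 + 1 - (-2) = 6, rand() % 6 lies in {0, ..., 5}, and the result lies in {-2, ..., 3}.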
52 changes: 52 additions & 0 deletions k2/csrc/moderngpu_context.cu
@@ -0,0 +1,52 @@
/**
* @brief A context for moderngpu with a better memory allocator.
*
*
* @copyright
* Copyright (c) 2020 Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
*
* @copyright
* See LICENSE for clarification regarding multiple authors
*/

#include <utility>

#include "k2/csrc/context.h"
#include "k2/csrc/moderngpu_context.h"
#include "moderngpu/context.hxx"

namespace {

class ModernGpuContext : public mgpu::standard_context_t {
Collaborator:
I wonder if this should be called ModernGpuAllocator to clarify that it's not an instance of k2::Context.

Collaborator (Author):
Fixed.

public:
explicit ModernGpuContext(k2::ContextPtr context)
: mgpu::standard_context_t(false, context->GetCudaStream()),
context_(std::move(context)) {}

void *alloc(size_t size, mgpu::memory_space_t space) override {
K2_DCHECK_EQ(space, mgpu::memory_space_device);
void *deleter_ = nullptr;
void *p = context_->Allocate(size, &deleter_);
K2_DCHECK(deleter_ == nullptr);
return p;
}

void free(void *p, mgpu::memory_space_t space) override {
K2_DCHECK_EQ(space, mgpu::memory_space_device);
context_->Deallocate(p, nullptr);
}

private:
k2::ContextPtr context_;
};

} // namespace

namespace k2 {

std::unique_ptr<mgpu::context_t> GetModernGpuContext(
int32_t device_id /*= -1*/) {
return std::make_unique<ModernGpuContext>(GetCudaContext(device_id));
}

} // namespace k2
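A note on why this adapter exists (inferred from the code above, not stated explicitly in the PR): mgpu::standard_context_t allocates device memory with plain cudaMalloc/cudaFree, whereas overriding alloc/free as above routes every moderngpu allocation through k2::Context, i.e. through whatever allocator that context wraps (for example PyTorch's CUDA caching allocator when k2 is built with USE_PYTORCH). A minimal usage sketch, assuming a CUDA device is available; the Example function and the mgpu::transform call are illustrative and not part of this PR:

#include "k2/csrc/moderngpu_context.h"
#include "moderngpu/transform.hxx"

// Must be compiled as CUDA (a .cu file) with --expt-extended-lambda, which the
// cmake file above already adds for targets that link against moderngpu.
void Example() {
  // device_id = -1 selects the default CUDA device via GetCudaContext().
  std::unique_ptr<mgpu::context_t> mgpu_context = k2::GetModernGpuContext(-1);

  // Any moderngpu primitive taking an mgpu::context_t& can use it; temporary
  // allocations and the stream then come from the underlying k2 context.
  mgpu::transform([] MGPU_DEVICE(int index) { (void)index; }, 128, *mgpu_context);
}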
27 changes: 27 additions & 0 deletions k2/csrc/moderngpu_context.h
@@ -0,0 +1,27 @@
/**
* @brief This is a Context for moderngpu only.
*
* Currently it is used by `SortSublists`.
*
* @copyright
* Copyright (c) 2020 Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
*
* @copyright
* See LICENSE for clarification regarding multiple authors
*/

#ifndef K2_CSRC_MODERNGPU_CONTEXT_H_
#define K2_CSRC_MODERNGPU_CONTEXT_H_

#include <memory>

#include "moderngpu/context.hxx"

namespace k2 {
// Return a context for moderngpu that has a better memory allocator
// than mgpu::standard_context_t
std::unique_ptr<mgpu::context_t> GetModernGpuContext(int32_t device_id = -1);

} // namespace k2

#endif // K2_CSRC_MODERNGPU_CONTEXT_H_
6 changes: 3 additions & 3 deletions k2/csrc/pytorch_context.cu
@@ -11,7 +11,10 @@

#include <memory>

#include "c10/cuda/CUDACachingAllocator.h"
#include "c10/cuda/CUDAFunctions.h"
#include "k2/csrc/context.h"
#include "k2/csrc/log.h"
#include "k2/csrc/pytorch_context.h"

namespace k2 {
@@ -23,9 +26,6 @@ class PytorchCpuContext : public Context {
K2_CHECK(allocator_->raw_deleter() != nullptr);
}

// since the constructor is private, the only way to create an instance
// of PytorchCpuContext is via `Make`, which returns a `shared_ptr`.
// Thus it is safe to call `shared_from_this`.
ContextPtr GetCpuContext() override { return shared_from_this(); }

ContextPtr GetPinnedContext() override { return nullptr; }
3 changes: 0 additions & 3 deletions k2/csrc/pytorch_context.h
@@ -15,10 +15,7 @@

#include <memory>

#include "c10/cuda/CUDACachingAllocator.h"
#include "c10/cuda/CUDAFunctions.h"
#include "k2/csrc/context.h"
#include "k2/csrc/log.h"
#include "torch/torch.h"

namespace k2 {
4 changes: 2 additions & 2 deletions k2/csrc/ragged.cu
@@ -294,7 +294,7 @@ int32_t RaggedShape::operator[](const std::vector<int32_t> &indexes) {
void RaggedShape::Check() {
ContextPtr c = Context();
int32_t num_axes = axes_.size();
for (int32_t axis = 0; axis < axes_.size(); axis++) {
for (int32_t axis = 0; axis < num_axes; ++axis) {
RaggedShapeDim &rsd = axes_[axis];
K2_CHECK_GE(rsd.row_splits.Dim(), 0);
if (rsd.cached_tot_size >= 0) {
@@ -343,7 +343,7 @@ void RaggedShape::Check() {
<< " but cached_tot_size == " << rsd.cached_tot_size;
}
}
if (axis + 1 < axes_.size()) {
if (axis + 1 < num_axes) {
int32_t next_num_rows = axes_[axis + 1].row_splits.Dim() - 1;
if (num_elems != next_num_rows) {
K2_LOG(FATAL) << "Ragged shape has num_elems for axes_[" << axis
20 changes: 16 additions & 4 deletions k2/csrc/ragged.h
@@ -102,8 +102,7 @@ class RaggedShape {
// row_splits on that axis.
int32_t MaxSize(int32_t axis);

ContextPtr &Context() { return axes_[0].row_splits.Context(); }
const ContextPtr &Context() const { return axes_[0].row_splits.Context(); }
ContextPtr &Context() const { return axes_[0].row_splits.Context(); }

/*
It is an error to call this if this.NumAxes() < 2. This will return
@@ -127,7 +126,8 @@

RaggedShapeIndexIterator Iterator();

explicit RaggedShape(std::vector<RaggedShapeDim> &axes, bool check = true)
explicit RaggedShape(const std::vector<RaggedShapeDim> &axes,
bool check = true)
: axes_(axes) {
if (check) Check();
}
@@ -486,7 +486,6 @@ Ragged<T> Stack(int32_t axis, int32_t num_srcs, Ragged<T> **src);
template <typename T>
Ragged<T> Stack(int32_t axis, int32_t num_srcs, Ragged<T> *src);


/*
Construct a RaggedShape with 2 axes.
@param [in] row_splits row_splits, or NULL (at least one of this and
@@ -574,6 +573,19 @@ Ragged<T> RandomRagged(T min_value = static_cast<T>(0),
int32_t min_num_elements = 0,
int32_t max_num_elements = 2000);

/*
  Sort a ragged array in-place.

  @param [inout] src    The input array to be sorted.
                        CAUTION: it is sorted in-place.
                        It must have NumAxes() >= 2 and live on a CUDA device.
  @param [out]   order  The indexes mapping from the sorted
                        array to the input array. The caller
                        has to pre-allocate memory for it
                        on the same device as `src`.
*/
template <typename T, typename Op = LessThan<T>>
void SortSublists(Ragged<T> *src, Array1<int32_t> *order);

} // namespace k2

// TODO(dan): include guard maybe.
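For reference, a minimal calling sketch for SortSublists as declared above (illustrative only and not part of this PR; the names and values are assumptions, and the ragged array is assumed to already live on a CUDA context with `order` pre-allocated on the same device):

// Hypothetical example:
//   Ragged<int32_t> ragged = ...;   // NumAxes() >= 2, values on a CUDA context
//   Array1<int32_t> order(ragged.Context(), ragged.values.Dim());
//   SortSublists(&ragged, &order);  // default comparator LessThan<int32_t>
// Afterwards ragged.values is sorted within each sublist of the last axis and
// order[i] holds the pre-sort index of the element now at position i.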
34 changes: 34 additions & 0 deletions k2/csrc/ragged_inl.h
@@ -7,6 +7,7 @@
*
* @copyright
* Copyright (c) 2020 Xiaomi Corporation (authors: Daniel Povey)
* Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
*
* @copyright
* See LICENSE for clarification regarding multiple authors
@@ -15,8 +16,12 @@
#ifndef K2_CSRC_RAGGED_INL_H_
#define K2_CSRC_RAGGED_INL_H_

#include <memory>
#include <vector>

#include "k2/csrc/moderngpu_context.h"
#include "moderngpu/kernel_segsort.hxx"

namespace k2 {

template <typename T>
@@ -95,6 +100,35 @@ Ragged<T> RandomRagged(T min_value, T max_value, int32_t min_num_axes,
return Ragged<T>(shape, values);
}

template <typename T, typename Op /* = LessThan<T> */>
void SortSublists(Ragged<T> *src, Array1<int32_t> *order) {
K2_DCHECK(IsCompatible(src->values, *order));
K2_DCHECK_EQ(src->values.Dim(), order->Dim());
K2_DCHECK_EQ(src->Context()->GetDeviceType(), kCuda)
<< "It supports only CUDA at present";

std::unique_ptr<mgpu::context_t> context =
GetModernGpuContext(src->Context()->GetDeviceId());

Array1<int32_t> &segment = src->shape.RowSplits(src->NumAxes() - 1);
Collaborator:
It's better to add K2_CHECK_EQ(src->shape.NumAxes(), 2) at entry, since we just sort the values in src.

Collaborator (Author):
It supports ragged arrays with more than two axes.

Collaborator:
Well, at least check >= 2 here, otherwise there will be an error when calling shape.RowSplits(NumAxes() - 1). Note that you should add such requirements in the documentation:

k2/k2/csrc/ragged.h, lines 577 to 584 in a178524:

  Sort a ragged array in-place.
  @param [inout] The input array to be sorted.
                 CAUTION: it is sorted in-place.
  @param [out] The indexes mapping from the sorted
               array to the input array. The caller
               has to pre-allocate memory for it
               on the same device as `src`.

BTW, in other APIs, such as MaxPerSublist, we require NumAxes() == 2 now. @danpovey I think we should make those APIs consistent; should we require exactly NumAxes() == 2 or NumAxes() >= 2?

Collaborator:
In general I'd like to make the APIs as general as they can be made without requiring extra work.

Collaborator (Author):
It has no requirement about the number of axes a ragged array has. It always sorts the last axis.
This function works as long as the ragged array is not empty.

Collaborator:
Mm, it definitely requires NumAxes() >= 2... It's meaningless to call RowSplits(1) on a shape with NumAxes() < 2; actually it will crash.

Then I suggest we add documentation and requirements for those APIs with axes >= 2.

Collaborator (Author):
Could you give an example to show what a ragged array with NumAxes() < 2 looks like?

Collaborator:
Mm, the point here is that we currently allow a user to define an empty RaggedShape (RaggedShape() = default;), so it's better to check this and report the error as early as possible (i.e. at entry of the function).

Collaborator (Author):
I'll add a K2_DCHECK in the next commit.

Collaborator (Author):
> It's better to add K2_CHECK_EQ(src->shape.NumAxes(), 2) at entry, since we just sort the values in src.

@qindazhu Fixed in #218.

K2_DCHECK_GE(src->NumAxes(), 2);

mgpu::segmented_sort_indices(src->values.Data(), // keys
order->Data(), // indices
src->values.Dim(), // count
segment.Data() + 1, // segments
Collaborator:
Why the +1 here? The API of mgpu is strange.

Collaborator (Author):
I also tried segment.Data() and got the same result.

According to https://github.com/moderngpu/moderngpu/blob/2b3985541c8e88a133769598c406c33ddde9d0a5/tests/test_segsort.cu#L9-L15

std::vector<int> cpu_segsort(const std::vector<int>& data,
                             const std::vector<int>& segments) {
  std::vector<int> copy = data;
  int cur = 0;
  for (int seg = 0; seg < segments.size(); ++seg) {
    int next = segments[seg];
    std::sort(copy.data() + cur, copy.data() + next);
    cur = next;
  }
  std::sort(copy.data() + cur, copy.data() + data.size());
  return copy;
}

The start index of the first segment is 0. That's why I use segment.Data() + 1.

segment.Dim() - 1, // num_segments
Op(), // cmp
*context); // context
auto err = cudaGetLastError();
(void)err;
// TODO(fangjun): err is not cudaSuccess, but why was the data sorted
// correctly?
//
// Check failed: err == cudaSuccess (9 vs. 0) Error: invalid configuration
// argument.
//
// K2_DCHECK_CUDA_ERROR(err);
}

} // namespace k2

#endif // K2_CSRC_RAGGED_INL_H_
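To make the `segment.Data() + 1` convention discussed in the review thread above concrete, here is a small worked example (an illustration based on the cpu_segsort reference quoted there, not part of the PR):

// Suppose the last axis has row_splits = [0, 3, 5, 8]: three sublists covering
// elements [0,3), [3,5) and [5,8). SortSublists then passes
//   segments     = row_splits.Data() + 1  -> {3, 5, 8}
//   num_segments = row_splits.Dim() - 1   -> 3
// which splits [0, count) into [0,3), [3,5), [5,8) plus an empty trailing
// range [8,8). Passing row_splits.Data() = {0, 3, 5} instead adds a boundary
// at 0 and leaves the final boundary implicit at count = 8, which describes
// the same partition, consistent with the observation above that both
// variants give the same result.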