-
Notifications
You must be signed in to change notification settings - Fork 222
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add segsort from moderngpu #181
Changes from 10 commits
e0d17a0
a2f82b9
533406e
9cb83a5
c09a3ed
a8120f5
ce9eddd
01d1091
89c8595
8f2f8e0
6c14c1c
5a9fc1c
0eaf4df
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Copyright (c) 2020 Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang) | ||
# See ../LICENSE for clarification regarding multiple authors | ||
|
||
# Downloads a pinned snapshot of moderngpu at configure time and exposes it
# as the header-only INTERFACE target `moderngpu`.
#
# Targets that link against `moderngpu` inherit its include directory
# (<moderngpu source dir>/src) and the compile options listed below.
function(download_moderngpu)
  if(CMAKE_VERSION VERSION_LESS 3.11)
    # Older CMake: make modules under cmake/Modules visible to include().
    # NOTE(review): presumably a copy of FetchContent.cmake lives there --
    # confirm.
    list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)
  endif()

  include(FetchContent)

  # This is the latest commit of moderngpu as of 2020-09-26.
  set(moderngpu_URL "https://github.com/moderngpu/moderngpu/archive/2b3985541c8e88a133769598c406c33ddde9d0a5.zip")
  set(moderngpu_HASH "SHA256=191546af18cd5fb858ecb561316f3af67537ab16f610fc8f1a5febbffc27755a")

  FetchContent_Declare(moderngpu
    URL ${moderngpu_URL}
    URL_HASH ${moderngpu_HASH}
  )

  # FetchContent_GetProperties() reports the population state through
  # `<lowercaseName>_POPULATED`. Testing a plain `moderngpu` variable (never
  # set by FetchContent) made the guard unconditionally true.
  FetchContent_GetProperties(moderngpu)
  if(NOT moderngpu_POPULATED)
    message(STATUS "Downloading moderngpu")
    FetchContent_Populate(moderngpu)
  endif()
  message(STATUS "moderngpu is downloaded to ${moderngpu_SOURCE_DIR}")

  add_library(moderngpu INTERFACE)
  target_include_directories(moderngpu INTERFACE ${moderngpu_SOURCE_DIR}/src)
  # NOTE(review): these look like nvcc-specific flags; as INTERFACE options
  # they are passed to every target that links `moderngpu` -- confirm that no
  # host-only C++ target does.
  target_compile_options(moderngpu INTERFACE -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w)
endfunction()

download_moderngpu()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
/** | ||
* @brief A context for moderngpu with a better memory allocator. | ||
* | ||
* | ||
* @copyright | ||
* Copyright (c) 2020 Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang) | ||
* | ||
* @copyright | ||
* See LICENSE for clarification regarding multiple authors | ||
*/ | ||
|
||
#include "k2/csrc/context.h" | ||
#include "k2/csrc/moderngpu_context.h" | ||
#include "moderngpu/context.hxx" | ||
|
||
namespace { | ||
|
||
class ModernGpuContext : public mgpu::standard_context_t { | ||
public: | ||
explicit ModernGpuContext(k2::ContextPtr context) | ||
: mgpu::standard_context_t(false, context->GetCudaStream()), | ||
context_(std::move(context)) {} | ||
|
||
void *alloc(size_t size, mgpu::memory_space_t space) override { | ||
K2_DCHECK_EQ(space, mgpu::memory_space_device); | ||
void *deleter_ = nullptr; | ||
void *p = context_->Allocate(size, &deleter_); | ||
K2_DCHECK(deleter_ == nullptr); | ||
return p; | ||
} | ||
|
||
void free(void *p, mgpu::memory_space_t space) override { | ||
K2_DCHECK_EQ(space, mgpu::memory_space_device); | ||
context_->Deallocate(p, nullptr); | ||
} | ||
|
||
private: | ||
k2::ContextPtr context_; | ||
}; | ||
|
||
} // namespace | ||
|
||
namespace k2 { | ||
|
||
std::unique_ptr<mgpu::context_t> GetModernGpuContext( | ||
int32_t device_id /*= -1*/) { | ||
return std::make_unique<ModernGpuContext>(GetCudaContext(device_id)); | ||
} | ||
|
||
} // namespace k2 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
/** | ||
* @brief This is a Context for moderngpu only. | ||
* | ||
* Currently it is used by `SortSublists`. | ||
* | ||
* @copyright | ||
* Copyright (c) 2020 Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang) | ||
* | ||
* @copyright | ||
* See LICENSE for clarification regarding multiple authors | ||
*/ | ||
|
||
#ifndef K2_CSRC_MODERNGPU_CONTEXT_H_ | ||
#define K2_CSRC_MODERNGPU_CONTEXT_H_ | ||
|
||
#include <memory> | ||
|
||
#include "moderngpu/context.hxx" | ||
|
||
namespace k2 { | ||
// Returns a context for moderngpu that has a better memory allocator | ||
// than mgpu::standard_context_t. | ||
// | ||
// @param [in] device_id  Forwarded to GetCudaContext() to select the k2 | ||
//                        context that backs the allocator. | ||
//                        NOTE(review): presumably a CUDA device index where | ||
//                        -1 means "let k2 choose" -- confirm. | ||
// @return  An owning pointer to the newly created moderngpu context. | ||
std::unique_ptr<mgpu::context_t> GetModernGpuContext(int32_t device_id = -1); | ||
|
||
} // namespace k2 | ||
|
||
#endif // K2_CSRC_MODERNGPU_CONTEXT_H_ |
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -7,6 +7,7 @@ | |||||||||||||||||||||
* | ||||||||||||||||||||||
* @copyright | ||||||||||||||||||||||
* Copyright (c) 2020 Xiaomi Corporation (authors: Daniel Povey) | ||||||||||||||||||||||
* Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang) | ||||||||||||||||||||||
* | ||||||||||||||||||||||
* @copyright | ||||||||||||||||||||||
* See LICENSE for clarification regarding multiple authors | ||||||||||||||||||||||
|
@@ -17,6 +18,9 @@ | |||||||||||||||||||||
|
||||||||||||||||||||||
#include <vector> | ||||||||||||||||||||||
|
||||||||||||||||||||||
#include "k2/csrc/moderngpu_context.h" | ||||||||||||||||||||||
#include "moderngpu/kernel_segsort.hxx" | ||||||||||||||||||||||
|
||||||||||||||||||||||
namespace k2 { | ||||||||||||||||||||||
|
||||||||||||||||||||||
template <typename T> | ||||||||||||||||||||||
|
@@ -95,6 +99,35 @@ Ragged<T> RandomRagged(T min_value, T max_value, int32_t min_num_axes, | |||||||||||||||||||||
return Ragged<T>(shape, values); | ||||||||||||||||||||||
} | ||||||||||||||||||||||
|
||||||||||||||||||||||
// Sorts the sub-lists on the last axis of `src` by handing `src->values` | ||||||||||||||||||||||
// (keys) and `order` (indices) to mgpu::segmented_sort_indices. | ||||||||||||||||||||||
// | ||||||||||||||||||||||
// NOTE(review): presumably the keys are sorted in place and `order` receives | ||||||||||||||||||||||
// the pre-sort position of every element -- confirm against moderngpu. | ||||||||||||||||||||||
// | ||||||||||||||||||||||
// Preconditions asserted below: `src->values` and `*order` are compatible, | ||||||||||||||||||||||
// they have the same Dim(), and `src` lives on a CUDA device. | ||||||||||||||||||||||
// | ||||||||||||||||||||||
// NOTE(review): `src->NumAxes() - 1` is passed to RowSplits() unchecked; the | ||||||||||||||||||||||
// review thread asks for a NumAxes() >= 2 requirement -- confirm and add. | ||||||||||||||||||||||
// | ||||||||||||||||||||||
// @param [in,out] src    Ragged array whose values are passed as sort keys. | ||||||||||||||||||||||
// @param [in,out] order  Index array passed to moderngpu; must have | ||||||||||||||||||||||
//                        src->values.Dim() elements. | ||||||||||||||||||||||
template <typename T, typename Op /* = LessThan<T> */> | ||||||||||||||||||||||
void SortSublists(Ragged<T> *src, Array1<int32_t> *order) { | ||||||||||||||||||||||
K2_DCHECK(IsCompatible(src->values, *order)); | ||||||||||||||||||||||
K2_DCHECK_EQ(src->values.Dim(), order->Dim()); | ||||||||||||||||||||||
K2_DCHECK_EQ(src->Context()->GetDeviceType(), kCuda) | ||||||||||||||||||||||
<< "It supports only CUDA at present"; | ||||||||||||||||||||||
|
||||||||||||||||||||||
// moderngpu context whose allocations go through k2 (see | ||||||||||||||||||||||
// moderngpu_context.h). | ||||||||||||||||||||||
std::unique_ptr<mgpu::context_t> context = | ||||||||||||||||||||||
GetModernGpuContext(src->Context()->GetDeviceId()); | ||||||||||||||||||||||
|
||||||||||||||||||||||
// Row splits of the last axis; they delimit the sub-lists to be sorted. | ||||||||||||||||||||||
Array1<int32_t> &segment = src->shape.RowSplits(src->NumAxes() - 1); | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's better to add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It supports ragged arrays with more than two axes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well, at least check Lines 577 to 584 in a178524
BTW, in other APIs, such as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In general I'd like to make the APIs as general as they can be made without requiring extra work. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It has no requirement about the number of axes a ragged array has. It always sorts the last axis. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. mm.. definitely it requires NumAxes >=2...It's meaningless to call RowSplits(1) on a shape with Then I suggest we add documentation and requirements for those APIs with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you give an example to show what a ragged array with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. mm, the point here is, now we allow user define a empty RaggedShape technically, Line 137 in a178524
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll add a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Line 111 in d291daf
|
||||||||||||||||||||||
// The leading 0 of the row splits is skipped (`Data() + 1`, `Dim() - 1`): | ||||||||||||||||||||||
// per moderngpu's cpu_segsort reference test, the first segment implicitly | ||||||||||||||||||||||
// starts at index 0. | ||||||||||||||||||||||
mgpu::segmented_sort_indices(src->values.Data(), // keys | ||||||||||||||||||||||
order->Data(), // indices | ||||||||||||||||||||||
src->values.Dim(), // count | ||||||||||||||||||||||
segment.Data() + 1, // segments | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I also tried According to https://github.com/moderngpu/moderngpu/blob/2b3985541c8e88a133769598c406c33ddde9d0a5/tests/test_segsort.cu#L9-L15 std::vector<int> cpu_segsort(const std::vector<int>& data,
const std::vector<int>& segments) {
std::vector<int> copy = data;
int cur = 0;
for(int seg = 0; seg < segments.size(); ++seg) {
int next = segments[seg];
std::sort(copy.data() + cur, copy.data() + next);
cur = next;
}
std::sort(copy.data() + cur, copy.data() + data.size());
return copy;
} The start index of the first segment is 0. That's why I use |
||||||||||||||||||||||
segment.Dim() - 1, // num_segments | ||||||||||||||||||||||
Op(), // cmp | ||||||||||||||||||||||
*context); // context | ||||||||||||||||||||||
// The error code is fetched but deliberately ignored for now; see the TODO | ||||||||||||||||||||||
// below. | ||||||||||||||||||||||
auto err = cudaGetLastError(); | ||||||||||||||||||||||
(void)err; | ||||||||||||||||||||||
// TODO(fangjun): err is not cudaSuccess, but why was the data sorted | ||||||||||||||||||||||
// correctly? | ||||||||||||||||||||||
// | ||||||||||||||||||||||
// Check failed: err == cudaSuccess (9 vs. 0) Error: invalid configuration | ||||||||||||||||||||||
// argument. | ||||||||||||||||||||||
// | ||||||||||||||||||||||
// K2_DCHECK_CUDA_ERROR(err); | ||||||||||||||||||||||
} | ||||||||||||||||||||||
|
||||||||||||||||||||||
} // namespace k2 | ||||||||||||||||||||||
|
||||||||||||||||||||||
#endif // K2_CSRC_RAGGED_INL_H_ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if this should be called ModernGpuAllocator to clarify that it's not an instance of k2::Context.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed.