[ROCm] initial port #3126

Closed · wants to merge 49 commits

Commits (49):
36e1c57 stub rocm into all CMakeLists.txt (jeffdaily, Feb 7, 2023)
d44ff2f add faiss/gpu/hipify.sh (jeffdaily, Feb 20, 2023)
506f619 various updates (jeffdaily, Feb 21, 2023)
1056988 USE_ROCM section of faiss/gpu/utils/WarpShuffles.cuh (jeffdaily, Feb 21, 2023)
7c7cb03 USE_ROCM section of faiss/gpu/utils/MergeNetworkWarp.cuh (jeffdaily, Feb 21, 2023)
f19b7bc more updates (jeffdaily, Feb 21, 2023)
b8f48a7 allow kWarpSize 64 in BinaryDistance.cu (jeffdaily, Feb 21, 2023)
3dcf89d fix error: use of undeclared identifier 'half' (jeffdaily, Feb 21, 2023)
ed8fa1a fix error: use of undeclared identifier 'cublasSgemmEx' (jeffdaily, Feb 22, 2023)
6473115 warp 64 fixes for IVFInterleaved (jeffdaily, Feb 22, 2023)
97fd7e4 stub out asm as TODO (jeffdaily, Feb 22, 2023)
8151f75 warp 64 fixes for IVFUtilsSelect1 IVFUtilsSelect2 (jeffdaily, Feb 22, 2023)
2431d8f fix error: use of undeclared identifier 'half' (jeffdaily, Feb 22, 2023)
c072ed6 warp 64 fixes for L2Select.cu (jeffdaily, Feb 22, 2023)
cc5c384 missing half, math_constants.h in VectorResidual (jeffdaily, Feb 22, 2023)
fdc80e8 only compile warp 32 functions if warp size is 32 (jeffdaily, Feb 22, 2023)
5210714 cmake updates (jeffdaily, Feb 23, 2023)
9764491 build warp==32 dummy symbols to fix linking errors (jeffdaily, Feb 23, 2023)
8ecc1f6 gpu-rocm and python binding (Mar 26, 2023)
f2d7665 fix error: #include <faiss/gpu/*> (Mar 27, 2023)
115a0c5 Rewrite some asm code for ROCm in LoadStoreOperators (Apr 12, 2023)
c6bec4a Resolved TODO's in PQCodeLoad.cuh (May 2, 2023)
96da5fe Fix some bugs in LoadStoreOperators.cuh (May 2, 2023)
a6700a9 Port the code to Navi 2x/3x, whose warp size is 32. (xinyazhang, May 16, 2023)
729e929 Properly implement getBitfield and GET_BITFIELD_U32/64 on ROCM. (xinyazhang, May 16, 2023) [see the bitfield sketch after this list]
46f2b0a Fix LoadCode32<56> in ROCM, and put runtime safeguards in other speci… (xinyazhang, May 16, 2023)
ad74736 Fix the misuse of hip header in gpu/ (xinyazhang, May 17, 2023)
53bdd9e ROCM/Navi 2x: Fix LoadStore32 template and Float16 support in LoadStore. (xinyazhang, May 18, 2023)
4d5be04 Merge branch 'main_upstream' into xinyazhang/navi-21 (jeffdaily, Sep 12, 2023)
e1b7aa9 fix build (jeffdaily, Sep 12, 2023)
1778639 partial revert of using kWarpSize*2 etc (jeffdaily, Sep 13, 2023)
ebc1701 relax warp size 32 constraint (jeffdaily, Sep 13, 2023)
72ab993 add and use getWarpSizeCurrentDevice() (jeffdaily, Sep 13, 2023) [see the warp-size sketch after this list]
7e3b7ed compiles for warpSize 64, however failing tests (jeffdaily, Sep 13, 2023)
021722e TestGpuSelect passes for warpSize 64 (jeffdaily, Sep 13, 2023)
5bd71d9 modify __CUDA_ARCH__ or CUDA_VERSION checks with USE_ROCM (jeffdaily, Oct 12, 2023)
2d38b95 fix GeneralDistance for both 32 and 64 warp sizes (jeffdaily, Oct 12, 2023)
cff550e fix launch bounds for PQCodeDistances-inl.cuh (jeffdaily, Oct 12, 2023)
bf472e3 fix hammin20 read past end of array (jeffdaily, Nov 2, 2023)
da5a818 interleaved based on kWarpSize, not hard-coded to 32 (jeffdaily, Nov 2, 2023)
ededed6 cannot use kWarpSize in host code (jeffdaily, Nov 3, 2023)
f3c963e cannot use kLanes aka kWarpSize in host code (jeffdaily, Nov 3, 2023)
9551699 additional uses of kWarpSize found in host code (jeffdaily, Nov 3, 2023)
9cd52cf kWarpSize==64 fixes for WarpPackedBits 6bit and 4bit (jeffdaily, Nov 3, 2023)
ee8aea9 fix cuda build (jeffdaily, Nov 6, 2023)
8672522 clang-format (jeffdaily, Nov 6, 2023)
0b1751e fix raft build due to TILE_SIZE redefined (jeffdaily, Nov 7, 2023)
14cdf80 Merge branch 'main' into jeffdaily/rocm2 (jeffdaily, Nov 7, 2023)
97523ff fix include statement so it hipifies properly (jeffdaily, Nov 14, 2023)
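
Two recurring themes in this commit list are worth illustrating. First, several commits (relax warp size 32 constraint; add and use getWarpSizeCurrentDevice(); cannot use kWarpSize in host code) revolve around the fact that ROCm devices may have a wavefront size of 64 (CDNA) or 32 (Navi 2x/3x), so host code must query the warp size at runtime instead of relying on the compile-time kWarpSize constant. A minimal sketch of such a query, assuming only the standard CUDA/HIP runtime API (the helper body in the PR itself may differ):

    #include <cuda_runtime.h> // hipified to <hip/hip_runtime.h> under ROCm

    // Sketch: return the warp (wavefront) size of the active device.
    // Always 32 on CUDA; 64 on AMD CDNA, 32 on AMD Navi 2x/3x.
    inline int warpSizeOfCurrentDevice() {
        int device = 0;
        cudaGetDevice(&device);
        cudaDeviceProp prop{};
        cudaGetDeviceProperties(&prop, device);
        return prop.warpSize;
    }

Under ROCm, hipify rewrites these runtime calls to their hip equivalents (hipGetDevice, hipGetDeviceProperties), which is how one source tree can serve both backends.

Second, the commit "Properly implement getBitfield and GET_BITFIELD_U32/64 on ROCM" replaces NVIDIA-specific inline PTX with portable code. A plain-C++ equivalent of an unsigned bitfield extract, shown here as an illustrative sketch rather than the PR's exact implementation:

    // Sketch: extract `len` bits of `val` starting at bit `pos`,
    // matching PTX `bfe.u32` semantics for in-range arguments.
    __device__ inline unsigned int getBitfieldSketch(
            unsigned int val, int pos, int len) {
        if (len <= 0) return 0;
        if (len >= 32) return val >> pos;
        return (val >> pos) & ((1u << len) - 1u);
    }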
fix GeneralDistance for both 32 and 64 warp sizes
jeffdaily committed Oct 12, 2023

commit 2d38b957e16891ab5b4419e598183a3f6f145eb6
54 changes: 30 additions & 24 deletions faiss/gpu/impl/GeneralDistance.cuh
@@ -27,6 +27,12 @@
 // Kernels for non-L2 / inner product distances
 //
 
+// Initially kWarpSize was used for the x and y tile shape.
+// This works when kWarpSize is 32 but for kWarpSize 64,
+// this results in an invalid launch configuration of 64x64 block size.
+// 32 is a reasonable tile size for both kWarpSize options.
+#define TILE_SIZE 32
+
 namespace faiss {
 namespace gpu {
 
@@ -56,8 +62,8 @@ struct ReduceDistanceOp<DistanceOp, 1> {
 template <typename T, int Unroll, int DimMultiple, typename DistanceOp>
 inline __device__ DistanceOp
 reduce(const DistanceOp& in,
-       const T queryTile[kWarpSize][DimMultiple * kWarpSize + 1],
-       const T vecTile[kWarpSize][DimMultiple * kWarpSize + 1]) {
+       const T queryTile[TILE_SIZE][DimMultiple * TILE_SIZE + 1],
+       const T vecTile[TILE_SIZE][DimMultiple * TILE_SIZE + 1]) {
     DistanceOp accs[Unroll];
 #pragma unroll
     for (int i = 0; i < Unroll; ++i) {
@@ -70,8 +76,8 @@ reduce(const DistanceOp& in,
 #pragma unroll
     for (int i = 0; i < Unroll; ++i) {
 #pragma unroll
-        for (int j = 0; j < (kWarpSize * DimMultiple / Unroll); ++j) {
-            int idx = i * (kWarpSize * DimMultiple / Unroll) + j;
+        for (int j = 0; j < (TILE_SIZE * DimMultiple / Unroll); ++j) {
+            int idx = i * (TILE_SIZE * DimMultiple / Unroll) + j;
             accs[i].handle(
                     ConvertTo<float>::to(queryTileBase[idx]),
                     ConvertTo<float>::to(vecTileBase[idx]));
@@ -83,23 +89,23 @@ reduce(const DistanceOp& in,
 
 // Our general distance matrix "multiplication" kernel
 template <typename T, typename DistanceOp, bool InnerContig>
-__launch_bounds__(kWarpSize* kWarpSize) __global__ void generalDistance(
+__launch_bounds__(TILE_SIZE* TILE_SIZE) __global__ void generalDistance(
         Tensor<T, 2, InnerContig> query, // m x k
         Tensor<T, 2, InnerContig> vec, // n x k
         DistanceOp op,
         Tensor<float, 2, true> out) { // m x n
     constexpr int kDimMultiple = 1;
 
-    __shared__ T queryTile[kWarpSize][kWarpSize * kDimMultiple + 1];
-    __shared__ T vecTile[kWarpSize][kWarpSize * kDimMultiple + 1];
+    __shared__ T queryTile[TILE_SIZE][TILE_SIZE * kDimMultiple + 1];
+    __shared__ T vecTile[TILE_SIZE][TILE_SIZE * kDimMultiple + 1];
 
     // block y -> query
     // block x -> vector
 
-    idx_t queryBlock = idx_t(blockIdx.y) * kWarpSize;
+    idx_t queryBlock = idx_t(blockIdx.y) * TILE_SIZE;
     idx_t queryThread = queryBlock + threadIdx.y;
 
-    idx_t vecBlock = idx_t(blockIdx.x) * kWarpSize;
+    idx_t vecBlock = idx_t(blockIdx.x) * TILE_SIZE;
     idx_t vecThreadLoad = vecBlock + threadIdx.y;
     idx_t vecThreadSave = vecBlock + threadIdx.x;
 
@@ -116,16 +122,16 @@ __launch_bounds__(kWarpSize* kWarpSize) __global__ void generalDistance(
         // Interior tile
        //
        idx_t limit =
-                utils::roundDown(query.getSize(1), kWarpSize * kDimMultiple);
+                utils::roundDown(query.getSize(1), TILE_SIZE * kDimMultiple);
 
-        for (idx_t k = threadIdx.x; k < limit; k += kWarpSize * kDimMultiple) {
+        for (idx_t k = threadIdx.x; k < limit; k += TILE_SIZE * kDimMultiple) {
             // Load query tile
 #pragma unroll
             for (int i = 0; i < kDimMultiple; ++i) {
-                queryTileBase[threadIdx.x + i * kWarpSize] =
-                        queryBase[k + i * kWarpSize];
-                vecTileBase[threadIdx.x + i * kWarpSize] =
-                        vecBase[k + i * kWarpSize];
+                queryTileBase[threadIdx.x + i * TILE_SIZE] =
+                        queryBase[k + i * TILE_SIZE];
+                vecTileBase[threadIdx.x + i * TILE_SIZE] =
+                        vecBase[k + i * TILE_SIZE];
             }
 
             __syncthreads();
@@ -141,13 +147,13 @@ __launch_bounds__(kWarpSize* kWarpSize) __global__ void generalDistance(
         if (limit < query.getSize(1)) {
 #pragma unroll
             for (int i = 0; i < kDimMultiple; ++i) {
-                idx_t k = limit + threadIdx.x + i * kWarpSize;
+                idx_t k = limit + threadIdx.x + i * TILE_SIZE;
                 bool kInBounds = k < query.getSize(1);
 
-                queryTileBase[threadIdx.x + i * kWarpSize] =
+                queryTileBase[threadIdx.x + i * TILE_SIZE] =
                         kInBounds ? queryBase[k] : ConvertTo<T>::to(0);
 
-                vecTileBase[threadIdx.x + i * kWarpSize] =
+                vecTileBase[threadIdx.x + i * TILE_SIZE] =
                         kInBounds ? vecBase[k] : ConvertTo<T>::to(0);
             }
 
@@ -174,9 +180,9 @@ __launch_bounds__(kWarpSize* kWarpSize) __global__ void generalDistance(
         bool queryThreadInBounds = queryThread < query.getSize(0);
         bool vecThreadInBoundsLoad = vecThreadLoad < vec.getSize(0);
         bool vecThreadInBoundsSave = vecThreadSave < vec.getSize(0);
-        idx_t limit = utils::roundDown(query.getSize(1), kWarpSize);
+        idx_t limit = utils::roundDown(query.getSize(1), TILE_SIZE);
 
-        for (idx_t k = threadIdx.x; k < limit; k += kWarpSize) {
+        for (idx_t k = threadIdx.x; k < limit; k += TILE_SIZE) {
             // Load query tile
             queryTileBase[threadIdx.x] =
                     queryThreadInBounds ? queryBase[k] : ConvertTo<T>::to(0);
@@ -188,7 +194,7 @@ __launch_bounds__(kWarpSize* kWarpSize) __global__ void generalDistance(
 
         // thread (y, x) does (query y, vec x)
 #pragma unroll
-        for (int i = 0; i < kWarpSize; ++i) {
+        for (int i = 0; i < TILE_SIZE; ++i) {
             acc.handle(
                     ConvertTo<float>::to(queryTileBase[i]),
                     ConvertTo<float>::to(vecTile[threadIdx.x][i]));
@@ -242,10 +248,10 @@
     FAISS_ASSERT(out.getSize(1) == vecs.getSize(0));
 
     dim3 grid(
-            utils::divUp(vecs.getSize(0), kWarpSize),
-            utils::divUp(query.getSize(0), kWarpSize));
+            utils::divUp(vecs.getSize(0), TILE_SIZE),
+            utils::divUp(query.getSize(0), TILE_SIZE));
     FAISS_ASSERT(grid.y <= getMaxGridCurrentDevice().y);
-    dim3 block(kWarpSize, kWarpSize);
+    dim3 block(TILE_SIZE, TILE_SIZE);
 
     generalDistance<<<grid, block, 0, stream>>>(query, vecs, op, out);
 }
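
The TILE_SIZE comment at the top of this diff is a consequence of the per-block thread limit: current NVIDIA and AMD GPUs cap a block at 1024 threads, so a kWarpSize x kWarpSize block is 32 * 32 = 1024 threads (valid) when kWarpSize is 32, but 64 * 64 = 4096 threads (an invalid launch configuration) when kWarpSize is 64. Fixing the tile at 32 keeps the same launch shape on both. A small host-side check that makes the arithmetic concrete, assuming only the standard runtime API (this validation helper is illustrative, not part of the PR):

    #include <cuda_runtime.h>

    // Sketch: verify a block shape against the device thread limit.
    // dim3(32, 32) -> 1024 threads, launchable everywhere;
    // dim3(64, 64) -> 4096 threads, rejected at kernel launch.
    inline bool blockShapeIsLaunchable(dim3 block, int device = 0) {
        cudaDeviceProp prop{};
        cudaGetDeviceProperties(&prop, device);
        size_t threads = size_t(block.x) * block.y * block.z;
        return threads <= size_t(prop.maxThreadsPerBlock);
    }

With TILE_SIZE fixed at 32, the dim3 block(TILE_SIZE, TILE_SIZE) built in runGeneralDistanceKernel passes this check whether the device wavefront is 32 or 64.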