
Commit 27b1055

tarang-jain authored and facebook-github-bot committed
Integrate IVF-PQ from RAFT (facebookresearch#3044)
Summary:
Imports changes from facebookresearch#3133 and facebookresearch#3171, so this single PR adds all the changes together.

- [x] Implement RaftIVFPQ class
- [x] Update gtests to test correctness with RAFT enabled
- [x] All googleTests for RAFT enabled IVFPQ pass
- [x] Move some common functions in RaftIVFFlat and RaftIVFPQ to a helper: RaftUtils.h
- [x] Update the quantizer retroactively after building the RAFT index -- both IVFFlat and IVFPQ
- [x] Resolve failing LargeBatch (classical GPU)
- [x] Add checks for Pascal deprecation
- [x] Apply RMM changes from facebookresearch#3171
- [x] Apply robertmaynard's changes from facebookresearch#3133

Pull Request resolved: facebookresearch#3044

Reviewed By: junjieqi

Differential Revision: D51074065

Pulled By: algoriddle

fbshipit-source-id: 6871257921bcaff2064a20637e2ed358acbdc363
1 parent 87d43b9 commit 27b1055

40 files changed: +2199 −654 lines
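
For context on how the RAFT backend is selected at runtime: it is opted into through the GPU cloner options, which is exactly what the benchmark scripts in this commit do. Below is a minimal sketch, assuming a CUDA-capable build of Faiss compiled with FAISS_ENABLE_RAFT; the dimensionality, factory string, and random data are illustrative only.

import numpy as np
import faiss

d = 128                                     # illustrative dimensionality
index_cpu = faiss.index_factory(d, "IVF1024,PQ64x8np")   # CPU IVF-PQ index description

res = faiss.StandardGpuResources()
co = faiss.GpuMultipleClonerOptions()
co.use_raft = True                          # route the clone to the RAFT-backed implementation
index_gpu = faiss.index_cpu_to_gpu(res, 0, index_cpu, co)

xt = np.random.rand(100_000, d).astype('float32')
index_gpu.train(xt)                         # trained by the RAFT IVF-PQ path
index_gpu.add(xt)
D, I = index_gpu.search(xt[:5], 10)         # distances and ids of the 10 nearest neighbors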

CMakeLists.txt

+3-3
@@ -62,9 +62,9 @@ if(FAISS_ENABLE_GPU)
   enable_language(CUDA)
 endif()
 
-if(FAISS_ENABLE_RAFT)
-  find_package(raft COMPONENTS compiled distributed)
-endif()
+if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
+  find_package(raft COMPONENTS compiled distributed)
+endif()
 
 add_subdirectory(faiss)

benchs/bench_ivfflat_raft.py

+193
@@ -0,0 +1,193 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np
import faiss
import time
import argparse
import rmm

######################################################
# Command-line parsing
######################################################

parser = argparse.ArgumentParser()


def aa(*args, **kwargs):
    group.add_argument(*args, **kwargs)


group = parser.add_argument_group('benchmarking options')

aa('--bm_train', default=False, action='store_true',
   help='whether to benchmark train operation on GPU index')
aa('--bm_add', default=False, action='store_true',
   help='whether to benchmark add operation on GPU index')
aa('--bm_search', default=True,
   help='whether to benchmark search operation on GPU index')
aa('--raft_only', default=False, action='store_true',
   help='whether to only produce RAFT enabled benchmarks')


group = parser.add_argument_group('IVF options')
aa('--n_centroids', default=256, type=int,
   help="number of IVF centroids")


group = parser.add_argument_group('searching')

aa('--k', default=100, type=int, help='nb of nearest neighbors')
aa('--nprobe', default=50, type=int, help='nb of IVF lists to probe')

args = parser.parse_args()

print("args:", args)

rs = np.random.RandomState(123)

res = faiss.StandardGpuResources()

# Use an RMM pool memory resource for device allocations
mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(mr)


def bench_train_milliseconds(index, trainVecs, use_raft):
    co = faiss.GpuMultipleClonerOptions()
    co.use_raft = use_raft
    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
    t0 = time.time()
    index_gpu.train(trainVecs)
    return 1000*(time.time() - t0)


if args.bm_train:
    print("=" * 40)
    print("GPU Train Benchmarks")
    print("=" * 40)
    trainset_sizes = [5000, 10000, 100000, 1000000, 5000000]
    dataset_dims = [128, 256, 1024]
    for n_rows in trainset_sizes:
        for n_cols in dataset_dims:
            index = faiss.index_factory(n_cols, "IVF{},Flat".format(args.n_centroids))
            trainVecs = rs.rand(n_rows, n_cols).astype('float32')
            raft_gpu_train_time = bench_train_milliseconds(
                index, trainVecs, True)
            if args.raft_only:
                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, RAFT enabled GPU train time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_rows, raft_gpu_train_time))
            else:
                classical_gpu_train_time = bench_train_milliseconds(
                    index, trainVecs, False)
                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, classical GPU train time: %.3f milliseconds, RAFT enabled GPU train time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_rows, classical_gpu_train_time, raft_gpu_train_time))


def bench_add_milliseconds(index, addVecs, use_raft):
    co = faiss.GpuMultipleClonerOptions()
    co.use_raft = use_raft
    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
    index_gpu.copyFrom(index)
    t0 = time.time()
    index_gpu.add(addVecs)
    return 1000*(time.time() - t0)


if args.bm_add:
    print("=" * 40)
    print("GPU Add Benchmarks")
    print("=" * 40)
    addset_sizes = [5000, 10000, 100000, 1000000]
    dataset_dims = [128, 256, 1024]
    n_train = 10000
    for n_cols in dataset_dims:
        # build and train a fresh index for each dimensionality
        trainVecs = rs.rand(n_train, n_cols).astype('float32')
        index = faiss.index_factory(
            n_cols, "IVF" + str(args.n_centroids) + ",Flat")
        index.train(trainVecs)
        for n_rows in addset_sizes:
            addVecs = rs.rand(n_rows, n_cols).astype('float32')
            raft_gpu_add_time = bench_add_milliseconds(index, addVecs, True)
            if args.raft_only:
                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, RAFT enabled GPU add time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_rows, raft_gpu_add_time))
            else:
                classical_gpu_add_time = bench_add_milliseconds(
                    index, addVecs, False)
                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, classical GPU add time: %.3f milliseconds, RAFT enabled GPU add time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_rows, classical_gpu_add_time, raft_gpu_add_time))


def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
    co = faiss.GpuMultipleClonerOptions()
    co.use_raft = use_raft
    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
    index_gpu.copyFrom(index)
    index_gpu.add(addVecs)
    index_gpu.nprobe = nprobe
    t0 = time.time()
    index_gpu.search(queryVecs, k)
    return 1000*(time.time() - t0)


if args.bm_search:
    print("=" * 40)
    print("GPU Search Benchmarks")
    print("=" * 40)
    queryset_sizes = [5000, 10000, 100000, 500000]
    n_train = 10000
    n_add = 100000
    search_bm_dims = [8, 16, 32]
    for n_cols in search_bm_dims:
        index = faiss.index_factory(n_cols, "IVF{},Flat".format(args.n_centroids))
        trainVecs = rs.rand(n_train, n_cols).astype('float32')
        index.train(trainVecs)
        addVecs = rs.rand(n_add, n_cols).astype('float32')
        for n_rows in queryset_sizes:
            queryVecs = rs.rand(n_rows, n_cols).astype('float32')
            raft_gpu_search_time = bench_search_milliseconds(
                index, addVecs, queryVecs, args.nprobe, args.k, True)
            if args.raft_only:
                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
            else:
                classical_gpu_search_time = bench_search_milliseconds(
                    index, addVecs, queryVecs, args.nprobe, args.k, False)
                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, RAFT enabled GPU search time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, raft_gpu_search_time))

    print("=" * 40)
    print("Large RAFT Enabled Benchmarks")
    print("=" * 40)
    # Avoid classical GPU benchmarks for large datasets because of OOM for
    # more than 500000 queries and/or large dims as well as for large k
    queryset_sizes = [100000, 500000, 1000000]
    large_search_bm_dims = [128, 256, 1024]
    for n_cols in large_search_bm_dims:
        trainVecs = rs.rand(n_train, n_cols).astype('float32')
        index = faiss.index_factory(
            n_cols, "IVF" + str(args.n_centroids) + ",Flat")
        index.train(trainVecs)
        addVecs = rs.rand(n_add, n_cols).astype('float32')
        for n_rows in queryset_sizes:
            queryVecs = rs.rand(n_rows, n_cols).astype('float32')
            raft_gpu_search_time = bench_search_milliseconds(
                index, addVecs, queryVecs, args.nprobe, args.k, True)
            print("Method: IVFFlat, Operation: SEARCH, numTrain: %d, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
                n_train, n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
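
Usage note: with the options defined above, a typical invocation might look like python bench_ivfflat_raft.py --bm_train --bm_add --raft_only --n_centroids 1024 (the exact flags are illustrative); the script assumes a GPU build of Faiss with RAFT support and the rmm Python package for the pool allocator.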

benchs/bench_ivfpq_raft.py

+168
@@ -0,0 +1,168 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np
import faiss
import time
import argparse
import rmm

from datasets import load_sift1M, evaluate

######################################################
# Command-line parsing
######################################################

parser = argparse.ArgumentParser()


def aa(*args, **kwargs):
    group.add_argument(*args, **kwargs)


group = parser.add_argument_group('benchmarking options')
aa('--bm_search', default=True,
   help='whether to benchmark search operation on GPU index')
aa('--raft_only', default=False, action='store_true',
   help='whether to only produce RAFT enabled benchmarks')

group = parser.add_argument_group('IVF options')
aa('--bits_per_code', default=8, type=int,
   help='bits per code. Note that < 8 is only supported when RAFT is enabled')
aa('--pq_len', default=2, type=int,
   help='number of vector elements represented by one PQ code')
aa('--use_precomputed', default=True, type=bool,
   help='use precomputed codes (not with RAFT enabled)')

group = parser.add_argument_group('searching')
aa('--k', default=10, type=int, help='nb of nearest neighbors')
aa('--nprobe', default=50, type=int, help='nb of IVF lists to probe')

args = parser.parse_args()

print("args:", args)

print("load data")
xb, xq, xt, gt = load_sift1M()

rs = np.random.RandomState(123)

res = faiss.StandardGpuResources()

# Use an RMM pool memory resource for device allocations
mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(mr)


# A heuristic to select a suitable number of lists
def compute_nlist(numVecs):
    nlist = np.sqrt(numVecs)
    if (numVecs / nlist < 1000):
        nlist = numVecs / 1000
    return int(nlist)


def bench_train_milliseconds(index, trainVecs, use_raft):
    co = faiss.GpuMultipleClonerOptions()
    # use float 16 lookup tables to save space
    co.useFloat16LookupTables = True
    co.use_raft = use_raft
    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
    t0 = time.time()
    index_gpu.train(trainVecs)
    return 1000*(time.time() - t0)


n_rows, n_cols = xb.shape
n_train, _ = xt.shape
M = n_cols // args.pq_len
nlist = compute_nlist(n_rows)
index = faiss.index_factory(n_cols, "IVF{},PQ{}x{}np".format(nlist, M, args.bits_per_code))

print("=" * 40)
print("GPU Train Benchmarks")
print("=" * 40)
raft_gpu_train_time = bench_train_milliseconds(index, xt, True)
if args.raft_only:
    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, RAFT enabled GPU train time: %.3f milliseconds" % (
        n_cols, nlist, M, args.bits_per_code, n_train, raft_gpu_train_time))
else:
    classical_gpu_train_time = bench_train_milliseconds(
        index, xt, False)
    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, classical GPU train time: %.3f milliseconds, RAFT enabled GPU train time: %.3f milliseconds" % (
        n_cols, nlist, M, args.bits_per_code, n_train, classical_gpu_train_time, raft_gpu_train_time))


def bench_add_milliseconds(index, addVecs, use_raft):
    co = faiss.GpuMultipleClonerOptions()
    # use float 16 lookup tables to save space
    co.useFloat16LookupTables = True
    co.use_raft = use_raft
    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
    index_gpu.copyFrom(index)
    t0 = time.time()
    index_gpu.add(addVecs)
    return 1000*(time.time() - t0)


print("=" * 40)
print("GPU Add Benchmarks")
print("=" * 40)
index.train(xt)
raft_gpu_add_time = bench_add_milliseconds(index, xb, True)
if args.raft_only:
    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, RAFT enabled GPU add time: %.3f milliseconds" % (
        n_cols, nlist, M, args.bits_per_code, n_rows, raft_gpu_add_time))
else:
    classical_gpu_add_time = bench_add_milliseconds(
        index, xb, False)
    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, classical GPU add time: %.3f milliseconds, RAFT enabled GPU add time: %.3f milliseconds" % (
        n_cols, nlist, M, args.bits_per_code, n_rows, classical_gpu_add_time, raft_gpu_add_time))


def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
    co = faiss.GpuMultipleClonerOptions()
    co.use_raft = use_raft
    co.useFloat16LookupTables = True
    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
    index_gpu.copyFrom(index)
    index_gpu.add(addVecs)
    index_gpu.nprobe = nprobe
    t0 = time.time()
    index_gpu.search(queryVecs, k)
    return 1000*(time.time() - t0)


if args.bm_search:
    print("=" * 40)
    print("GPU Search Benchmarks")
    print("=" * 40)
    queryset_sizes = [1, 10, 100, 1000, 10000]
    n_train, n_cols = xt.shape
    n_add, _ = xb.shape
    print(xq.shape)
    M = n_cols // args.pq_len
    nlist = compute_nlist(n_add)
    index = faiss.index_factory(n_cols, "IVF{},PQ{}x{}np".format(nlist, M, args.bits_per_code))
    index.train(xt)
    for n_rows in queryset_sizes:
        queryVecs = xq[np.random.choice(xq.shape[0], n_rows, replace=False)]
        raft_gpu_search_time = bench_search_milliseconds(
            index, xb, queryVecs, args.nprobe, args.k, True)
        if args.raft_only:
            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
                n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
        else:
            classical_gpu_search_time = bench_search_milliseconds(
                index, xb, queryVecs, args.nprobe, args.k, False)
            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, RAFT enabled GPU search time: %.3f milliseconds" % (
                n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, raft_gpu_search_time))
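
Usage note: this benchmark loads SIFT1M through the local datasets helper (load_sift1M), so that dataset must be available to the module; an invocation such as python bench_ivfpq_raft.py --pq_len 2 --bits_per_code 8 --raft_only (flags illustrative) again assumes a RAFT-enabled GPU build of Faiss and the rmm package.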

cmake/thirdparty/fetch_rapids.cmake

+1-1
@@ -15,7 +15,7 @@
 # or implied. See the License for the specific language governing permissions and limitations under
 # the License.
 # =============================================================================
-set(RAPIDS_VERSION "23.12")
+set(RAPIDS_VERSION "24.02")
 
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake)
   file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake
