
Commit 1c1879b

algoriddle authored and facebook-github-bot committed
tiling bfKnn (facebookresearch#2865)
Summary:
Pull Request resolved: facebookresearch#2865

Introduces a tiling version of `bfKnn` called `bfKnn_tiling`, which can break up both the queries and the database vectors into tiles of at most queriesMemoryLimit and vectorsMemoryLimit bytes, respectively.

Reviewed By: wickedfoo

Differential Revision: D45944524

fbshipit-source-id: f9cd4c14dbf2d43def773124f19e92d25c86fc5a
1 parent 5c221ed commit 1c1879b
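For orientation (not part of the commit): a minimal Python sketch of how the new entry point could be driven through the SWIG bindings, mirroring the updated test further below. Array sizes and memory limits are illustrative only.

import numpy as np
import faiss

d, nb, nq, k = 64, 100000, 1000, 10
xb = np.random.rand(nb, d).astype('float32')   # database stays in CPU memory
xq = np.random.rand(nq, d).astype('float32')   # queries stay in CPU memory
out_d = np.empty((nq, k), dtype='float32')
out_i = np.empty((nq, k), dtype='int64')

res = faiss.StandardGpuResources()
params = faiss.GpuDistanceParams()
params.k = k
params.dims = d
params.vectors = faiss.swig_ptr(xb)
params.numVectors = nb
params.queries = faiss.swig_ptr(xq)
params.numQueries = nq
params.outDistances = faiss.swig_ptr(out_d)
params.outIndices = faiss.swig_ptr(out_i)

# break the database into ~16 MiB tiles and the queries into ~4 MiB tiles
faiss.bfKnn_tiling(res, params, 16 * 1024 * 1024, 4 * 1024 * 1024)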

File tree

5 files changed: +211 -4 lines changed


faiss/gpu/GpuDistance.cu

+150
@@ -24,6 +24,7 @@
 #include <faiss/gpu/GpuResources.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/impl/FaissAssert.h>
+#include <faiss/utils/Heap.h>
 #include <faiss/gpu/impl/Distance.cuh>
 #include <faiss/gpu/utils/ConversionOperators.cuh>
 #include <faiss/gpu/utils/CopyUtils.cuh>
@@ -368,6 +369,155 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
     }
 }
 
+template <class C>
+void bfKnn_shard_database(
+        GpuResourcesProvider* prov,
+        const GpuDistanceParams& args,
+        size_t shard_size,
+        size_t distance_size) {
+    std::vector<typename C::T> heaps_distances;
+    if (args.ignoreOutDistances) {
+        heaps_distances.resize(args.numQueries * args.k, 0);
+    }
+    HeapArray<C> heaps = {
+            (size_t)args.numQueries,
+            (size_t)args.k,
+            (typename C::TI*)args.outIndices,
+            args.ignoreOutDistances ? heaps_distances.data()
+                                    : args.outDistances};
+    heaps.heapify();
+    std::vector<typename C::TI> labels(args.numQueries * args.k);
+    std::vector<typename C::T> distances(args.numQueries * args.k);
+    GpuDistanceParams args_batch = args;
+    args_batch.outDistances = distances.data();
+    args_batch.ignoreOutDistances = false;
+    args_batch.outIndices = labels.data();
+    for (idx_t i = 0; i < args.numVectors; i += shard_size) {
+        args_batch.numVectors = min(shard_size, args.numVectors - i);
+        args_batch.vectors =
+                (char*)args.vectors + distance_size * args.dims * i;
+        args_batch.vectorNorms =
+                args.vectorNorms ? args.vectorNorms + i : nullptr;
+        bfKnn(prov, args_batch);
+        for (auto& label : labels) {
+            label += i;
+        }
+        heaps.addn_with_ids(args.k, distances.data(), labels.data(), args.k);
+    }
+    heaps.reorder();
+}
+
+void bfKnn_single_query_shard(
+        GpuResourcesProvider* prov,
+        const GpuDistanceParams& args,
+        size_t vectorsMemoryLimit) {
+    if (vectorsMemoryLimit == 0) {
+        bfKnn(prov, args);
+        return;
+    }
+    FAISS_THROW_IF_NOT_MSG(
+            args.numVectors > 0, "bfKnn_tiling: numVectors must be > 0");
+    FAISS_THROW_IF_NOT_MSG(
+            args.vectors,
+            "bfKnn_tiling: vectors must be provided (passed null)");
+    FAISS_THROW_IF_NOT_MSG(
+            getDeviceForAddress(args.vectors) == -1,
+            "bfKnn_tiling: vectors should be in CPU memory when vectorsMemoryLimit > 0");
+    FAISS_THROW_IF_NOT_MSG(
+            args.vectorsRowMajor,
+            "bfKnn_tiling: tiling vectors is only supported in row major mode");
+    FAISS_THROW_IF_NOT_MSG(
+            args.k > 0,
+            "bfKnn_tiling: tiling vectors is only supported for k > 0");
+    size_t distance_size = args.vectorType == DistanceDataType::F32 ? 4
+            : args.vectorType == DistanceDataType::F16              ? 2
+                                                                    : 0;
+    FAISS_THROW_IF_NOT_MSG(
+            distance_size > 0, "bfKnn_tiling: unknown vectorType");
+    size_t shard_size = vectorsMemoryLimit / (args.dims * distance_size);
+    FAISS_THROW_IF_NOT_MSG(
+            shard_size > 0, "bfKnn_tiling: vectorsMemoryLimit is too low");
+    if (args.numVectors <= shard_size) {
+        bfKnn(prov, args);
+        return;
+    }
+    if (is_similarity_metric(args.metric)) {
+        if (args.outIndicesType == IndicesDataType::I64) {
+            bfKnn_shard_database<CMin<float, int64_t>>(
+                    prov, args, shard_size, distance_size);
+        } else if (args.outIndicesType == IndicesDataType::I32) {
+            bfKnn_shard_database<CMin<float, int32_t>>(
+                    prov, args, shard_size, distance_size);
+        } else {
+            FAISS_THROW_MSG("bfKnn_tiling: unknown outIndicesType");
+        }
+    } else {
+        if (args.outIndicesType == IndicesDataType::I64) {
+            bfKnn_shard_database<CMax<float, int64_t>>(
+                    prov, args, shard_size, distance_size);
+        } else if (args.outIndicesType == IndicesDataType::I32) {
+            bfKnn_shard_database<CMax<float, int32_t>>(
+                    prov, args, shard_size, distance_size);
+        } else {
+            FAISS_THROW_MSG("bfKnn_tiling: unknown outIndicesType");
+        }
+    }
+}
+
+void bfKnn_tiling(
+        GpuResourcesProvider* prov,
+        const GpuDistanceParams& args,
+        size_t vectorsMemoryLimit,
+        size_t queriesMemoryLimit) {
+    if (queriesMemoryLimit == 0) {
+        bfKnn_single_query_shard(prov, args, vectorsMemoryLimit);
+        return;
+    }
+    FAISS_THROW_IF_NOT_MSG(
+            args.numQueries > 0, "bfKnn_tiling: numQueries must be > 0");
+    FAISS_THROW_IF_NOT_MSG(
+            args.queries,
+            "bfKnn_tiling: queries must be provided (passed null)");
+    FAISS_THROW_IF_NOT_MSG(
+            getDeviceForAddress(args.queries) == -1,
+            "bfKnn_tiling: queries should be in CPU memory when queriesMemoryLimit > 0");
+    FAISS_THROW_IF_NOT_MSG(
+            args.queriesRowMajor,
+            "bfKnn_tiling: tiling queries is only supported in row major mode");
+    FAISS_THROW_IF_NOT_MSG(
+            args.k > 0,
+            "bfKnn_tiling: tiling queries is only supported for k > 0");
+    size_t distance_size = args.queryType == DistanceDataType::F32 ? 4
+            : args.queryType == DistanceDataType::F16              ? 2
+                                                                   : 0;
+    FAISS_THROW_IF_NOT_MSG(
+            distance_size > 0, "bfKnn_tiling: unknown queryType");
+    size_t label_size = args.outIndicesType == IndicesDataType::I64 ? 8
+            : args.outIndicesType == IndicesDataType::I32           ? 4
+                                                                    : 0;
+    FAISS_THROW_IF_NOT_MSG(
+            label_size > 0, "bfKnn_tiling: unknown outIndicesType");
+    size_t shard_size = queriesMemoryLimit /
+            (args.k * (distance_size + label_size) + args.dims * distance_size);
+    FAISS_THROW_IF_NOT_MSG(
+            shard_size > 0, "bfKnn_tiling: queriesMemoryLimit is too low");
+    FAISS_THROW_IF_NOT_MSG(
+            args.outIndices,
+            "bfKnn_tiling: outIndices must be provided (passed null)");
+    for (idx_t i = 0; i < args.numQueries; i += shard_size) {
+        GpuDistanceParams args_batch = args;
+        args_batch.numQueries = min(shard_size, args.numQueries - i);
+        args_batch.queries =
+                (char*)args.queries + distance_size * args.dims * i;
+        if (!args_batch.ignoreOutDistances) {
+            args_batch.outDistances = args.outDistances + args.k * i;
+        }
+        args_batch.outIndices =
+                (char*)args.outIndices + args.k * label_size * i;
+        bfKnn_single_query_shard(prov, args_batch, vectorsMemoryLimit);
+    }
+}
+
 // legacy version
 void bruteForceKnn(
         GpuResourcesProvider* res,
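The database-sharding loop above computes a top-k for each vector tile, offsets the tile-local labels by the tile's starting index (`label += i`), and folds the partial results into one k-best heap per query before the final `reorder()`. Below is a rough Python model of that merge strategy, illustrative only: it is not how the commit implements it (the C++ code uses `HeapArray` and runs the per-tile search on the GPU), and the helper name is hypothetical.

import numpy as np

def knn_by_database_shards(xq, xb, k, shard_size):
    """Illustrative model of bfKnn_shard_database: top-k per shard, then merge."""
    nq = xq.shape[0]
    best_d = np.full((nq, k), np.inf, dtype=np.float32)
    best_i = np.full((nq, k), -1, dtype=np.int64)
    for start in range(0, xb.shape[0], shard_size):
        shard = xb[start:start + shard_size]
        # pairwise squared L2 distances between all queries and this shard
        d = ((xq[:, None, :] - shard[None, :, :]) ** 2).sum(-1).astype(np.float32)
        idx = np.argsort(d, axis=1)[:, :k]
        sd = np.take_along_axis(d, idx, axis=1)
        si = idx + start                       # offset shard-local ids, like `label += i`
        # merge with the running top-k (the C++ code keeps a per-query heap instead)
        all_d = np.concatenate([best_d, sd], axis=1)
        all_i = np.concatenate([best_i, si], axis=1)
        order = np.argsort(all_d, axis=1)[:, :k]
        best_d = np.take_along_axis(all_d, order, axis=1)
        best_i = np.take_along_axis(all_i, order, axis=1)
    return best_d, best_i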

faiss/gpu/GpuDistance.h

+18
@@ -123,6 +123,24 @@ struct GpuDistanceParams {
 /// nearest neighbors with respect to the given metric
 void bfKnn(GpuResourcesProvider* resources, const GpuDistanceParams& args);
 
+// bfKnn which takes two extra parameters to control the maximum GPU
+// memory allowed for vectors and queries, the latter including the
+// memory required for the results.
+// If 0, the corresponding input must fit into GPU memory.
+// If greater than 0, the function will use at most this much GPU
+// memory (in bytes) for vectors and queries respectively.
+// Vectors are broken up into chunks of at most vectorsMemoryLimit bytes,
+// and queries are broken up into chunks of at most queriesMemoryLimit bytes.
+// The tiles resulting from the product of the query and vector
+// chunks are processed sequentially on the GPU.
+// Only supported for row major matrices and k > 0. The input that
+// needs sharding must reside on the CPU.
+void bfKnn_tiling(
+        GpuResourcesProvider* resources,
+        const GpuDistanceParams& args,
+        size_t vectorsMemoryLimit,
+        size_t queriesMemoryLimit);
+
 /// Deprecated legacy implementation
 void bruteForceKnn(
         GpuResourcesProvider* resources,
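To make the limits concrete: the tile sizes follow the byte-size arithmetic in GpuDistance.cu above. A small worked example with illustrative values (float32 data, int64 output indices):

# Tile-size arithmetic used by bfKnn_tiling (illustrative values)
dims, k = 128, 10
distance_bytes = 4   # DistanceDataType::F32
label_bytes = 8      # IndicesDataType::I64

vectorsMemoryLimit = 256 * 1024 * 1024   # 256 MiB budget for database tiles
queriesMemoryLimit = 64 * 1024 * 1024    # 64 MiB budget for query tiles

# database tiles: only the vector data counts against the limit
vector_tile = vectorsMemoryLimit // (dims * distance_bytes)
# query tiles: query data plus k distances and k labels per query count
query_tile = queriesMemoryLimit // (k * (distance_bytes + label_bytes) + dims * distance_bytes)

print(vector_tile, query_tile)   # 524288 vectors per tile, 106184 queries per tile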

faiss/gpu/test/test_gpu_basics.py

+28 -2
@@ -225,6 +225,14 @@ def make_t(num, d, clamp=False, seed=None):
 
 class TestKnn(unittest.TestCase):
     def test_input_types(self):
+        self.do_test_input_types(0, 0)
+
+    def test_input_types_tiling(self):
+        self.do_test_input_types(0, 500)
+        self.do_test_input_types(1000, 0)
+        self.do_test_input_types(1000, 500)
+
+    def do_test_input_types(self, vectorsMemoryLimit, queriesMemoryLimit):
         d = 33
         k = 5
         nb = 1000
@@ -243,6 +251,8 @@ def test_input_types(self):
         out_d = np.empty((nq, k), dtype=np.float32)
         out_i = np.empty((nq, k), dtype=np.int64)
 
+        gpu_id = random.randrange(0, faiss.get_num_gpus())
+
         # Try f32 data/queries, i64 out indices
         params = faiss.GpuDistanceParams()
         params.k = k
@@ -253,9 +263,24 @@ def test_input_types(self):
         params.numQueries = nq
         params.outDistances = faiss.swig_ptr(out_d)
         params.outIndices = faiss.swig_ptr(out_i)
-        params.device = random.randrange(0, faiss.get_num_gpus())
+        params.device = gpu_id
+
+        if vectorsMemoryLimit > 0 or queriesMemoryLimit > 0:
+            faiss.bfKnn_tiling(
+                res,
+                params,
+                vectorsMemoryLimit,
+                queriesMemoryLimit)
+        else:
+            faiss.bfKnn(res, params)
 
-        faiss.bfKnn(res, params)
+        self.assertTrue(np.allclose(ref_d, out_d, atol=1e-5))
+        self.assertGreaterEqual((out_i == ref_i).sum(), ref_i.size)
+
+        out_d, out_i = faiss.knn_gpu(
+            res, qs, xs, k, device=gpu_id,
+            vectorsMemoryLimit=vectorsMemoryLimit,
+            queriesMemoryLimit=queriesMemoryLimit)
 
         self.assertTrue(np.allclose(ref_d, out_d, atol=1e-5))
         self.assertGreaterEqual((out_i == ref_i).sum(), ref_i.size)
@@ -266,6 +291,7 @@ def test_input_types(self):
         params.outIndicesType = faiss.IndicesDataType_I32
 
         faiss.bfKnn(res, params)
+
        self.assertEqual((out_i32 == ref_i).sum(), ref_i.size)
 
         # Try float16 data/queries, i64 out indices

faiss/python/gpu_wrappers.py

+13 -2
@@ -54,7 +54,7 @@ def index_cpu_to_gpus_list(index, co=None, gpus=None, ngpu=-1):
 # allows numpy ndarray usage with bfKnn
 
 
-def knn_gpu(res, xq, xb, k, D=None, I=None, metric=METRIC_L2, device=-1, use_raft=False):
+def knn_gpu(res, xq, xb, k, D=None, I=None, metric=METRIC_L2, device=-1, use_raft=False, vectorsMemoryLimit=0, queriesMemoryLimit=0):
     """
     Compute the k nearest neighbors of a vector on one GPU without constructing an index
 
@@ -82,6 +82,14 @@ def knn_gpu(res, xq, xb, k, D=None, I=None, metric=METRIC_L2, device=-1, use_raf
         (can also be set via torch.cuda.set_device in PyTorch)
         Otherwise, an integer 0 <= device < numDevices indicates the GPU on which
         the computation should be run
+    vectorsMemoryLimit: int, optional
+    queriesMemoryLimit: int, optional
+        Memory limits for vectors and queries.
+        If not 0, the GPU will use at most this amount of memory
+        for vectors and queries respectively.
+        Vectors are broken up into chunks of at most vectorsMemoryLimit bytes,
+        and queries are broken up into chunks of at most queriesMemoryLimit bytes,
+        the latter including the memory required for the results.
 
     Returns
     -------
@@ -172,7 +180,10 @@ def knn_gpu(res, xq, xb, k, D=None, I=None, metric=METRIC_L2, device=-1, use_raf
 
     # no stream synchronization needed, inputs and outputs are guaranteed to
     # be on the CPU (numpy arrays)
-    bfKnn(res, args)
+    if vectorsMemoryLimit > 0 or queriesMemoryLimit > 0:
+        bfKnn_tiling(res, args, vectorsMemoryLimit, queriesMemoryLimit)
+    else:
+        bfKnn(res, args)
 
     return D, I
 
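A minimal usage sketch of the wrapper path added here, with illustrative sizes (the two keyword arguments are the ones introduced above; everything else is the existing knn_gpu API):

import numpy as np
import faiss

xb = np.random.rand(1000000, 96).astype('float32')   # database kept in CPU RAM
xq = np.random.rand(10000, 96).astype('float32')     # queries kept in CPU RAM

res = faiss.StandardGpuResources()
# stream the database through the GPU in ~256 MiB tiles and the queries in ~64 MiB tiles
D, I = faiss.knn_gpu(
    res, xq, xb, 10,
    vectorsMemoryLimit=256 * 1024 * 1024,
    queriesMemoryLimit=64 * 1024 * 1024)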

faiss/utils/Heap.cpp

+2
@@ -136,6 +136,8 @@ void HeapArray<C>::per_line_extrema(T* out_val, TI* out_ids) const {
 
 template struct HeapArray<CMin<float, int64_t>>;
 template struct HeapArray<CMax<float, int64_t>>;
+template struct HeapArray<CMin<float, int32_t>>;
+template struct HeapArray<CMax<float, int32_t>>;
 template struct HeapArray<CMin<int, int64_t>>;
 template struct HeapArray<CMax<int, int64_t>>;
