
Commit 875c168

algoriddle authored and facebook-github-bot committed
tiling bfKnn
Summary: Adding tiling support for bfKnn, breaking up both queries and vectors into tiles of at most queriesMemoryLimit and vectorsMemoryLimit bytes, respectively.

Differential Revision: D45944524

fbshipit-source-id: 9dfab73338601c6278171a37282694273473ace7
1 parent 48d48a3 commit 875c168

File tree

faiss/gpu/GpuDistance.cu
faiss/gpu/GpuDistance.h
faiss/gpu/test/test_gpu_basics.py
faiss/python/gpu_wrappers.py
faiss/utils/Heap.cpp

5 files changed: +174 −5 lines changed
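
For orientation before the diff: the tiling described in the summary amounts to scanning the database (and the query set) in fixed-size chunks and merging each chunk's top-k results into a running per-query top-k, with chunk-local labels offset by the chunk's starting row. A minimal NumPy sketch of that merge idea follows; knn_tiled and chunk_rows are illustrative names, not part of FAISS, and the real implementation performs each per-chunk search on the GPU via bfKnn.

import numpy as np

def knn_tiled(xq, xb, k, chunk_rows):
    # Running top-k (squared L2) per query, merged across database chunks.
    nq = xq.shape[0]
    best_d = np.full((nq, k), np.inf)
    best_i = np.full((nq, k), -1, dtype=np.int64)
    for start in range(0, xb.shape[0], chunk_rows):
        chunk = xb[start:start + chunk_rows]
        # Brute-force distances from every query to this chunk only.
        d = ((xq[:, None, :] - chunk[None, :, :]) ** 2).sum(-1)
        i = np.broadcast_to(
            np.arange(start, start + chunk.shape[0]), d.shape)
        # Merge the chunk results into the running top-k and re-select the k best.
        cat_d = np.concatenate([best_d, d], axis=1)
        cat_i = np.concatenate([best_i, i], axis=1)
        order = np.argsort(cat_d, axis=1)[:, :k]
        best_d = np.take_along_axis(cat_d, order, axis=1)
        best_i = np.take_along_axis(cat_i, order, axis=1)
    return best_d, best_i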

faiss/gpu/GpuDistance.cu

+124 −1

@@ -24,6 +24,7 @@
 #include <faiss/gpu/GpuResources.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/impl/FaissAssert.h>
+#include <faiss/utils/Heap.h>
 #include <faiss/gpu/impl/Distance.cuh>
 #include <faiss/gpu/utils/ConversionOperators.cuh>
 #include <faiss/gpu/utils/CopyUtils.cuh>
@@ -218,7 +219,9 @@ void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
     fromDevice<float, 2>(tOutDistances, args.outDistances, stream);
 }
 
-void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
+void bfKnn_single_tile(
+        GpuResourcesProvider* prov,
+        const GpuDistanceParams& args) {
     // For now, both vectors and queries must be of the same data type
     FAISS_THROW_IF_NOT_MSG(
             args.vectorType == args.queryType,
@@ -368,6 +371,126 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
     }
 }
 
+template <class C>
+void bfKnn_shard_database(
+        GpuResourcesProvider* prov,
+        const GpuDistanceParams& args,
+        idx_t shard_size,
+        idx_t distance_size) {
+    std::vector<typename C::T> heaps_distances;
+    if (args.ignoreOutDistances) {
+        heaps_distances.resize(args.numQueries * args.k, 0);
+    }
+    HeapArray<C> heaps = {
+            (size_t)args.numQueries,
+            (size_t)args.k,
+            (typename C::TI*)args.outIndices,
+            args.ignoreOutDistances ? heaps_distances.data()
+                                    : args.outDistances};
+    heaps.heapify();
+    std::vector<typename C::TI> labels(args.numQueries * args.k, -1);
+    std::vector<typename C::T> distances(args.numQueries * args.k, 0);
+    GpuDistanceParams args_batch = args;
+    args_batch.outDistances = distances.data();
+    args_batch.ignoreOutDistances = false;
+    args_batch.outIndices = labels.data();
+    for (idx_t i = 0; i < args.numVectors; i += shard_size) {
+        args_batch.numVectors = min(shard_size, args.numVectors - i);
+        args_batch.vectors =
+                (char*)args.vectors + distance_size * args.dims * i;
+        args_batch.vectorNorms =
+                args.vectorNorms ? args.vectorNorms + i : nullptr;
+        bfKnn_single_tile(prov, args_batch);
+        for (auto& label : labels) {
+            label += i;
+        }
+        heaps.addn_with_ids(args.k, distances.data(), labels.data(), args.k);
+    }
+    heaps.reorder();
+}
+
+void bfKnn_single_query_shard(
+        GpuResourcesProvider* prov,
+        const GpuDistanceParams& args) {
+    if (args.vectorsMemoryLimit == 0) {
+        bfKnn_single_tile(prov, args);
+        return;
+    }
+    FAISS_THROW_IF_NOT_MSG(
+            args.vectorsRowMajor,
+            "sharding vectors is only supported in row major mode");
+    FAISS_THROW_IF_NOT_MSG(
+            args.k > 0, "sharding vectors is only supported for k > 0");
+    idx_t distance_size = args.vectorType == DistanceDataType::F32 ? 4
+            : args.vectorType == DistanceDataType::F16             ? 2
+                                                                   : 0;
+    FAISS_THROW_IF_NOT_MSG(distance_size > 0, "unknown vectorType");
+    idx_t shard_size = args.vectorsMemoryLimit / (args.dims * distance_size);
+    FAISS_THROW_IF_NOT_MSG(
+            shard_size > 0,
+            "vectorsMemoryLimit is too low, shard size would be zero");
+    if (args.numVectors <= shard_size) {
+        bfKnn_single_tile(prov, args);
+        return;
+    }
+    if (is_similarity_metric(args.metric)) {
+        if (args.outIndicesType == IndicesDataType::I64) {
+            bfKnn_shard_database<CMin<float, int64_t>>(
+                    prov, args, shard_size, distance_size);
+        } else if (args.outIndicesType == IndicesDataType::I32) {
+            bfKnn_shard_database<CMin<float, int32_t>>(
+                    prov, args, shard_size, distance_size);
+        } else {
+            FAISS_THROW_MSG("unknown outIndicesType");
+        }
+    } else {
+        if (args.outIndicesType == IndicesDataType::I64) {
+            bfKnn_shard_database<CMax<float, int64_t>>(
+                    prov, args, shard_size, distance_size);
+        } else if (args.outIndicesType == IndicesDataType::I32) {
+            bfKnn_shard_database<CMax<float, int32_t>>(
+                    prov, args, shard_size, distance_size);
+        } else {
+            FAISS_THROW_MSG("unknown outIndicesType");
+        }
+    }
+}
+
+void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
+    if (args.queriesMemoryLimit == 0) {
+        bfKnn_single_query_shard(prov, args);
+        return;
+    }
+    FAISS_THROW_IF_NOT_MSG(
+            args.queriesRowMajor,
+            "sharding queries is only supported in row major mode");
+    FAISS_THROW_IF_NOT_MSG(
+            args.k > 0, "sharding queries is only supported for k > 0");
+    idx_t distance_size = args.queryType == DistanceDataType::F32 ? 4
+            : args.queryType == DistanceDataType::F16             ? 2
+                                                                  : 0;
+    FAISS_THROW_IF_NOT_MSG(distance_size > 0, "unknown queryType");
+    idx_t label_size = args.outIndicesType == IndicesDataType::I64 ? 8
+            : args.outIndicesType == IndicesDataType::I32          ? 4
+                                                                   : 0;
+    FAISS_THROW_IF_NOT_MSG(label_size > 0, "unknown outIndicesType");
+    idx_t shard_size = args.queriesMemoryLimit /
+            (args.k * (distance_size + label_size) + args.dims * distance_size);
+    FAISS_THROW_IF_NOT_MSG(shard_size > 0, "queriesMemoryLimit is too low");
+    for (idx_t i = 0; i < args.numQueries; i += shard_size) {
+        GpuDistanceParams args_batch = args;
+        args_batch.numQueries = min(shard_size, args.numQueries - i);
+        args_batch.queries =
+                (char*)args.queries + distance_size * args.dims * i;
+        if (!args_batch.ignoreOutDistances) {
+            args_batch.outDistances = args.outDistances + args.k * i;
+        }
+        args_batch.outIndices =
+                (char*)args.outIndices + args.k * label_size * i;
+        bfKnn_single_query_shard(prov, args_batch);
+    }
+}
+
 // legacy version
 void bruteForceKnn(
         GpuResourcesProvider* res,
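
To make the sizing above concrete, here is a rough Python rendering of the tile-size arithmetic used in this file: a vector tile holds as many database rows as fit in vectorsMemoryLimit, while a query tile must also budget k distances and k labels of results per query. The element sizes below assume float32 data and int64 labels, and the numbers in the example are made up for illustration; the C++ code derives the actual sizes from vectorType/queryType and outIndicesType.

def vector_tile_rows(vectors_memory_limit, dims, elem_bytes=4):
    # Database rows whose raw data fits in the budget.
    return vectors_memory_limit // (dims * elem_bytes)

def query_tile_rows(queries_memory_limit, dims, k, elem_bytes=4, label_bytes=8):
    # Each query costs its raw data plus k (distance, label) result pairs.
    per_query = k * (elem_bytes + label_bytes) + dims * elem_bytes
    return queries_memory_limit // per_query

# Example: d=128 float32, k=10, int64 labels, 256 MiB / 64 MiB limits.
print(vector_tile_rows(256 << 20, dims=128))      # 524288 rows per vector tile
print(query_tile_rows(64 << 20, dims=128, k=10))  # 106184 queries per tile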

faiss/gpu/GpuDistance.h

+15 −2

@@ -46,7 +46,10 @@ struct GpuDistanceParams {
               ignoreOutDistances(false),
               outIndicesType(IndicesDataType::I64),
               outIndices(nullptr),
-              device(-1) {}
+              device(-1),
+              vectorsMemoryLimit(0),
+              queriesMemoryLimit(0),
+              use_raft(false) {}
 
     //
     // Search parameters
@@ -125,8 +128,18 @@ struct GpuDistanceParams {
     /// execution
     int device;
 
+    // Memory limits for vectors and queries.
+    // If not 0, the GPU will use at most this amount of memory
+    // for vectors and queries respectively.
+    // Vectors are broken up into chunks of size vectorsMemoryLimit,
+    // and queries are broken up into chunks of size queriesMemoryLimit,
+    // including the memory required for the results.
+    // Only supported for row major matrices.
+    uint64_t vectorsMemoryLimit;
+    uint64_t queriesMemoryLimit;
+
     /// Should the index dispatch down to RAFT?
-    bool use_raft = false;
+    bool use_raft;
 };
 
 /// A wrapper for gpu/impl/Distance.cuh to expose direct brute-force k-nearest

faiss/gpu/test/test_gpu_basics.py

+22 −1

@@ -225,6 +225,14 @@ def make_t(num, d, clamp=False, seed=None):
 
 class TestKnn(unittest.TestCase):
     def test_input_types(self):
+        self.do_test_input_types(0, 0)
+
+    def test_input_types_tiling(self):
+        self.do_test_input_types(0, 500)
+        self.do_test_input_types(1000, 0)
+        self.do_test_input_types(1000, 500)
+
+    def do_test_input_types(self, vectorsMemoryLimit, queriesMemoryLimit):
         d = 33
         k = 5
         nb = 1000
@@ -243,6 +251,8 @@ def test_input_types(self):
         out_d = np.empty((nq, k), dtype=np.float32)
         out_i = np.empty((nq, k), dtype=np.int64)
 
+        gpu_id = random.randrange(0, faiss.get_num_gpus())
+
         # Try f32 data/queries, i64 out indices
         params = faiss.GpuDistanceParams()
         params.k = k
@@ -253,19 +263,30 @@ def test_input_types(self):
         params.numQueries = nq
         params.outDistances = faiss.swig_ptr(out_d)
         params.outIndices = faiss.swig_ptr(out_i)
-        params.device = random.randrange(0, faiss.get_num_gpus())
+        params.device = gpu_id
+        params.vectorsMemoryLimit = vectorsMemoryLimit
+        params.queriesMemoryLimit = queriesMemoryLimit
 
         faiss.bfKnn(res, params)
 
         self.assertTrue(np.allclose(ref_d, out_d, atol=1e-5))
         self.assertGreaterEqual((out_i == ref_i).sum(), ref_i.size)
 
+        out_d, out_i = faiss.knn_gpu(
+            res, qs, xs, k, device=gpu_id,
+            vectorsMemoryLimit=vectorsMemoryLimit,
+            queriesMemoryLimit=queriesMemoryLimit)
+
+        self.assertTrue(np.allclose(ref_d, out_d, atol=1e-5))
+        self.assertGreaterEqual((out_i == ref_i).sum(), ref_i.size)
+
         # Try int32 out indices
         out_i32 = np.empty((nq, k), dtype=np.int32)
         params.outIndices = faiss.swig_ptr(out_i32)
         params.outIndicesType = faiss.IndicesDataType_I32
 
         faiss.bfKnn(res, params)
+
         self.assertEqual((out_i32 == ref_i).sum(), ref_i.size)
 
         # Try float16 data/queries, i64 out indices

faiss/python/gpu_wrappers.py

+11 −1

@@ -54,7 +54,7 @@ def index_cpu_to_gpus_list(index, co=None, gpus=None, ngpu=-1):
 # allows numpy ndarray usage with bfKnn
 
 
-def knn_gpu(res, xq, xb, k, D=None, I=None, metric=METRIC_L2, device=-1):
+def knn_gpu(res, xq, xb, k, D=None, I=None, metric=METRIC_L2, device=-1, vectorsMemoryLimit=0, queriesMemoryLimit=0):
     """
     Compute the k nearest neighbors of a vector on one GPU without constructing an index
 
@@ -82,6 +82,14 @@ def knn_gpu(res, xq, xb, k, D=None, I=None, metric=METRIC_L2, device=-1):
         (can also be set via torch.cuda.set_device in PyTorch)
         Otherwise, an integer 0 <= device < numDevices indicates the GPU on which
         the computation should be run
+    vectorsMemoryLimit: int, optional
+    queriesMemoryLimit: int, optional
+        Memory limits for vectors and queries.
+        If not 0, the GPU will use at most this amount of memory
+        for vectors and queries respectively.
+        Vectors are broken up into chunks of size vectorsMemoryLimit,
+        and queries are broken up into chunks of size queriesMemoryLimit,
+        including the memory required for the results.
 
     Returns
     -------
@@ -168,6 +176,8 @@ def knn_gpu(res, xq, xb, k, D=None, I=None, metric=METRIC_L2, device=-1):
     args.outIndices = I_ptr
     args.outIndicesType = I_type
     args.device = device
+    args.vectorsMemoryLimit = vectorsMemoryLimit
+    args.queriesMemoryLimit = queriesMemoryLimit
 
     # no stream synchronization needed, inputs and outputs are guaranteed to
     # be on the CPU (numpy arrays)
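
A usage sketch for the extended wrapper, assuming a GPU build of faiss and NumPy inputs; the array sizes and the 256 MiB / 64 MiB limits are arbitrary example values. With both limits left at their default of 0, the call behaves exactly as before (no tiling).

import numpy as np
import faiss  # GPU build

res = faiss.StandardGpuResources()
xb = np.random.rand(1_000_000, 128).astype('float32')  # database vectors
xq = np.random.rand(10_000, 128).astype('float32')     # query vectors

# Same results as an untiled call, but the database is scanned in
# <= 256 MiB tiles and the queries are processed in <= 64 MiB batches.
D, I = faiss.knn_gpu(
    res, xq, xb, k=10,
    vectorsMemoryLimit=256 * 1024 * 1024,
    queriesMemoryLimit=64 * 1024 * 1024)
print(D.shape, I.shape)  # (10000, 10) (10000, 10)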

faiss/utils/Heap.cpp

+2 −0

@@ -136,6 +136,8 @@ void HeapArray<C>::per_line_extrema(T* out_val, TI* out_ids) const {
 
 template struct HeapArray<CMin<float, int64_t>>;
 template struct HeapArray<CMax<float, int64_t>>;
+template struct HeapArray<CMin<float, int32_t>>;
+template struct HeapArray<CMax<float, int32_t>>;
 template struct HeapArray<CMin<int, int64_t>>;
 template struct HeapArray<CMax<int, int64_t>>;
 
