Large two-level clustering (facebookresearch#2882)

mdouze · facebook-github-bot · commit 90349f264b62 · 2023-05-31T00:15:03.000-07:00
Summary: Pull Request resolved: facebookresearch#2882 A two level clustering version where the training data does not need to fit in RAM. Reviewed By: algoriddle Differential Revision: D44557021 fbshipit-source-id: 892d4fec4588eb33da6e7a82c15040f39426485e
diff --git a/faiss/python/extra_wrappers.py b/faiss/python/extra_wrappers.py
@@ -107,17 +107,15 @@ def randn(n, seed=12345):
 def checksum(a):
     """ compute a checksum for quick-and-dirty comparisons of arrays """
     a = a.view('uint8')
-    n = a.size
-    n4 = n & ~3
-    cs = ivec_checksum(int(n4 / 4), swig_ptr(a[:n4].view('int32')))
-    for i in range(n4, n):
-        cs += x[i] * 33657
+    if a.ndim == 1:  # flat array: one scalar checksum over all bytes
+        return bvec_checksum(a.size, swig_ptr(a))
+    n, d = a.shape  # otherwise a 2D array is expected: one checksum per row
+    cs = np.zeros(n, dtype='uint64')  # output buffer filled by bvecs_checksum
+    bvecs_checksum(n, d, swig_ptr(a), swig_ptr(cs))
     return cs
 
-
 rand_smooth_vectors_c = rand_smooth_vectors
 
-
 def rand_smooth_vectors(n, d, seed=1234):
     res = np.empty((n, d), dtype='float32')
     rand_smooth_vectors_c(n, d, swig_ptr(res), seed)
@@ -422,7 +420,7 @@ def __init__(self, d, k, **kwargs):
          including niter=25, verbose=False, spherical = False
         """
         self.d = d
-        self.k = k
+        self.reset(k)
         self.gpu = False
         if "progressive_dim_steps" in kwargs:
             self.cp = ProgressiveDimClusteringParameters()
@@ -437,7 +435,32 @@ def __init__(self, d, k, **kwargs):
                 # if this raises an exception, it means that it is a non-existent field
                 getattr(self.cp, k)
                 setattr(self.cp, k, v)
+        self.set_index()
+
+    def set_index(self):  # build self.index / self.fac consumed by train()
+        d = self.d
+        if self.cp.__class__ == ClusteringParameters:  # exact class, no subclass
+            if self.cp.spherical:
+                self.index = IndexFlatIP(d)
+            else:
+                self.index = IndexFlatL2(d)
+            if self.gpu:  # truthy value is forwarded as ngpu
+                self.index = faiss.index_cpu_to_all_gpus(self.index, ngpu=self.gpu)
+        else:  # any other parameters class: train() uses a factory instead
+            if self.gpu:
+                fac = GpuProgressiveDimIndexFactory(ngpu=self.gpu)
+            else:
+                fac = ProgressiveDimIndexFactory()
+            self.fac = fac
+
+    def reset(self, k=None):
+        """ prepare k-means object to perform a new clustering, possibly
+        with another number of centroids """
+        if k is not None:
+            self.k = int(k)  # k=None keeps the current number of centroids
         self.centroids = None
+        self.obj = None  # NOTE(review): presumably set by train() -- confirm
+        self.iteration_stats = None  # NOTE(review): same, cleared with centroids
 
     def train(self, x, weights=None, init_centroids=None):
         """ Perform k-means clustering.
@@ -476,24 +499,14 @@ def train(self, x, weights=None, init_centroids=None):
                 nc, d2 = init_centroids.shape
                 assert d2 == d
                 faiss.copy_array_to_vector(init_centroids.ravel(), clus.centroids)
-            if self.cp.spherical:
-                self.index = IndexFlatIP(d)
-            else:
-                self.index = IndexFlatL2(d)
-            if self.gpu:
-                self.index = faiss.index_cpu_to_all_gpus(self.index, ngpu=self.gpu)
             clus.train(x, self.index, weights)
         else:
             # not supported for progressive dim
             assert weights is None
             assert init_centroids is None
             assert not self.cp.spherical
             clus = ProgressiveDimClustering(d, self.k, self.cp)
-            if self.gpu:
-                fac = GpuProgressiveDimIndexFactory(ngpu=self.gpu)
-            else:
-                fac = ProgressiveDimIndexFactory()
-            clus.train(n, swig_ptr(x), fac)
+            clus.train(n, swig_ptr(x), self.fac)
 
         centroids = faiss.vector_float_to_array(clus.centroids)
 
diff --git a/faiss/utils/utils.cpp b/faiss/utils/utils.cpp
@@ -428,15 +428,30 @@ void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist) {
     }
 }
 
-size_t ivec_checksum(size_t n, const int32_t* asigned) {
+uint64_t ivec_checksum(size_t n, const int32_t* asigned) {
     const uint32_t* a = reinterpret_cast<const uint32_t*>(asigned);
-    size_t cs = 112909;
+    uint64_t cs = 112909; // 64-bit accumulator regardless of sizeof(size_t)
     while (n--) {
         cs = cs * 65713 + a[n] * 1686049;
     }
     return cs;
 }
 
+uint64_t bvec_checksum(size_t n, const uint8_t* a) {
+    uint64_t cs = ivec_checksum(n / 4, (const int32_t*)a); // whole 32-bit words
+    for (size_t i = n / 4 * 4; i < n; i++) { // remaining 0-3 tail bytes
+        cs = cs * 65713 + a[i] * 1686049;
+    }
+    return cs;
+}
+
+void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs) {
+#pragma omp parallel for if (n > 1000)
+    for (size_t i = 0; i < n; i++) { // each row writes only its own cs[i]
+        cs[i] = bvec_checksum(d, a + i * d); // row i = d bytes at offset i * d
+    }
+}
+
 const float* fvecs_maybe_subsample(
         size_t d,
         size_t* n,
diff --git a/faiss/utils/utils.h b/faiss/utils/utils.h
@@ -121,7 +121,19 @@ int ivec_hist(size_t n, const int* v, int vmax, int* hist);
 void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist);
 
 /// compute a checksum on a table.
-size_t ivec_checksum(size_t n, const int32_t* a);
+uint64_t ivec_checksum(size_t n, const int32_t* a);
+
+/// compute a checksum on a table of n bytes.
+uint64_t bvec_checksum(size_t n, const uint8_t* a);
+
+/** compute one checksum per row of a byte matrix
+ *
+ * @param n   number of rows
+ * @param d   size per row, in bytes
+ * @param a   matrix to handle, size n * d
+ * @param cs  output checksums, size n
+ */
+void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs);
 
 /** random subsamples a set of vectors if there are too many of them
  *