Cleanup clustering code (facebookresearch#3030)

mdouze · facebook-github-bot · commit 5c4bd3feb3d7 · 2023-08-31T01:11:45.000-07:00
Summary: Pull Request resolved: facebookresearch#3030 Moved the default values of ClusteringParameters and MatrixStats into in-class member initializers in the .h files (these files were missed when the default arguments were migrated). MatrixStats now also logs a hash value of the input matrix, which is useful to check whether two runs really operate on the same matrix. Reviewed By: pemazare Differential Revision: D48834343 fbshipit-source-id: 7c1948464e66ada1f462f4486f7cf3159bbf9dfd
diff --git a/faiss/Clustering.cpp b/faiss/Clustering.cpp
@@ -27,20 +27,6 @@
 
 namespace faiss {
 
-ClusteringParameters::ClusteringParameters()
-        : niter(25),
-          nredo(1),
-          verbose(false),
-          spherical(false),
-          int_centroids(false),
-          update_index(false),
-          frozen_centroids(false),
-          min_points_per_centroid(39),
-          max_points_per_centroid(256),
-          seed(1234),
-          decode_block_size(32768) {}
-// 39 corresponds to 10000 / 256 -> to avoid warnings on PQ tests with randu10k
-
 Clustering::Clustering(int d, int k) : d(d), k(k) {}
 
 Clustering::Clustering(int d, int k, const ClusteringParameters& cp)
diff --git a/faiss/Clustering.h b/faiss/Clustering.h
@@ -5,7 +5,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// -*- c++ -*-
+/** Implementation of k-means clustering with many variants. */
 
 #ifndef FAISS_CLUSTERING_H
 #define FAISS_CLUSTERING_H
@@ -19,25 +19,35 @@ namespace faiss {
  * constructor of the Clustering object.
  */
 struct ClusteringParameters {
-    int niter; ///< clustering iterations
-    int nredo; ///< redo clustering this many times and keep best
-
-    bool verbose;
-    bool spherical;        ///< do we want normalized centroids?
-    bool int_centroids;    ///< round centroids coordinates to integer
-    bool update_index;     ///< re-train index after each iteration?
-    bool frozen_centroids; ///< use the centroids provided as input and do not
-                           ///< change them during iterations
-
-    int min_points_per_centroid; ///< otherwise you get a warning
-    int max_points_per_centroid; ///< to limit size of dataset
-
-    int seed; ///< seed for the random number generator
-
-    size_t decode_block_size; ///< how many vectors at a time to decode
-
-    /// sets reasonable defaults
-    ClusteringParameters();
+    /// number of clustering iterations
+    int niter = 25;
+    /// redo clustering this many times and keep the clusters with the best
+    /// objective
+    int nredo = 1;
+
+    bool verbose = false;
+    /// whether to normalize centroids after each iteration (useful for inner
+    /// product clustering)
+    bool spherical = false;
+    /// round centroid coordinates to integers after each iteration?
+    bool int_centroids = false;
+    /// re-train index after each iteration?
+    bool update_index = false;
+
+    /// Use the subset of centroids provided as input and do not change them
+    /// during iterations
+    bool frozen_centroids = false;
+    /// If fewer than this number of training vectors per centroid are provided,
+    /// writes a warning. Note that fewer than 1 point per centroid raises an
+    /// exception.
+    int min_points_per_centroid = 39;
+    /// to limit the size of the dataset: a larger training set is subsampled
+    int max_points_per_centroid = 256;
+    /// seed for the random number generator
+    int seed = 1234;
+
+    /// when the training set is encoded, batch size of the codec decoder
+    size_t decode_block_size = 32768;
 };
 
 struct ClusteringIterationStats {
@@ -94,7 +104,7 @@ struct Clustering : ClusteringParameters {
      * to decode the input vectors.
      *
      * @param codec      codec used to decode the vectors (nullptr =
-     *                   vectors are in fact floats)     *
+     *                   vectors are in fact floats)
      */
     void train_encoded(
             idx_t nx,
diff --git a/faiss/MatrixStats.cpp b/faiss/MatrixStats.cpp
@@ -12,6 +12,7 @@
 #include <stdarg.h> /* va_list, va_start, va_arg, va_end */
 
 #include <faiss/utils/utils.h>
+#include <inttypes.h>
 #include <cmath>
 #include <cstdio>
 
@@ -21,18 +22,6 @@ namespace faiss {
  * MatrixStats
  *********************************************************************/
 
-MatrixStats::PerDimStats::PerDimStats()
-        : n(0),
-          n_nan(0),
-          n_inf(0),
-          n0(0),
-          min(HUGE_VALF),
-          max(-HUGE_VALF),
-          sum(0),
-          sum2(0),
-          mean(NAN),
-          stddev(NAN) {}
-
 void MatrixStats::PerDimStats::add(float x) {
     n++;
     if (std::isnan(x)) {
@@ -74,26 +63,22 @@ void MatrixStats::do_comment(const char* fmt, ...) {
     buf += size;
 }
 
-MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
-        : n(n),
-          d(d),
-          n_collision(0),
-          n_valid(0),
-          n0(0),
-          min_norm2(HUGE_VAL),
-          max_norm2(0) {
+MatrixStats::MatrixStats(size_t n, size_t d, const float* x) : n(n), d(d) {
     std::vector<char> comment_buf(10000);
     buf = comment_buf.data();
     nbuf = comment_buf.size();
 
-    do_comment("analyzing %ld vectors of size %ld\n", n, d);
+    do_comment("analyzing %zd vectors of size %zd\n", n, d);
 
     if (d > 1024) {
         do_comment(
                 "indexing this many dimensions is hard, "
                 "please consider dimensionality reducution (with PCAMatrix)\n");
     }
 
+    hash_value = hash_bytes((const uint8_t*)x, n * d * sizeof(*x));
+    do_comment("hash value 0x%016" PRIx64 "\n", hash_value);
+
     size_t nbytes = sizeof(x[0]) * d;
     per_dim_stats.resize(d);
 
@@ -156,7 +141,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
 
         if (n_collision > 0) {
             do_comment(
-                    "%ld collisions in hash table, "
+                    "%zd collisions in hash table, "
                     "counts may be invalid\n",
                     n_collision);
         }
@@ -167,22 +152,22 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
                 max = it->second;
             }
         }
-        do_comment("vector %ld has %ld copies\n", max.first, max.count);
+        do_comment("vector %zd has %zd copies\n", max.first, max.count);
     }
 
     { // norm stats
         min_norm2 = sqrt(min_norm2);
         max_norm2 = sqrt(max_norm2);
         do_comment(
-                "range of L2 norms=[%g, %g] (%ld null vectors)\n",
+                "range of L2 norms=[%g, %g] (%zd null vectors)\n",
                 min_norm2,
                 max_norm2,
                 n0);
 
         if (max_norm2 < min_norm2 * 1.0001) {
             do_comment(
                     "vectors are normalized, inner product and "
-                    "L2  search are equivalent\n");
+                    "L2 search are equivalent\n");
         }
 
         if (max_norm2 > min_norm2 * 100) {
@@ -227,15 +212,15 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
             do_comment("no constant dimensions\n");
         } else {
             do_comment(
-                    "%ld dimensions are constant: they can be removed\n",
+                    "%zd dimensions are constant: they can be removed\n",
                     n_0_range);
         }
 
         if (n_dangerous_range == 0) {
             do_comment("no dimension has a too large mean\n");
         } else {
             do_comment(
-                    "%ld dimensions are too large "
+                    "%zd dimensions are too large "
                     "wrt. their variance, may loose precision "
                     "in IndexFlatL2 (use CenteringTransform)\n",
                     n_dangerous_range);
diff --git a/faiss/MatrixStats.h b/faiss/MatrixStats.h
@@ -10,6 +10,7 @@
 #pragma once
 
 #include <stdint.h>
+#include <cmath>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -26,20 +27,31 @@ struct MatrixStats {
     std::string comments;
 
     // raw statistics
-    size_t n, d;
-    size_t n_collision, n_valid, n0;
-    double min_norm2, max_norm2;
+    size_t n = 0, d = 0;
+    size_t n_collision = 0;
+    size_t n_valid = 0;
+    size_t n0 = 0;
+    double min_norm2 = HUGE_VAL; ///< starts at +infinity (double-typed macro)
+    double max_norm2 = 0;
+    uint64_t hash_value = 0; ///< hash of the n * d input floats, as raw bytes
 
     struct PerDimStats {
-        size_t n, n_nan, n_inf, n0;
+        /// entry counters: n is the total, the others count special values
+        size_t n = 0;
+        size_t n_nan = 0;
+        size_t n_inf = 0;
+        size_t n0 = 0;
 
-        float min, max;
-        double sum, sum2;
+        /// accumulators from which min/max, mean and stddev are obtained
+        float min = HUGE_VALF;
+        float max = -HUGE_VALF;
+        double sum = 0;
+        double sum2 = 0;
 
-        size_t n_valid;
-        double mean, stddev;
+        size_t n_valid = 0;
+        double mean = NAN;
+        double stddev = NAN;
 
-        PerDimStats();
         void add(float x);
         void compute_mean_std();
     };
diff --git a/tests/test_build_blocks.py b/tests/test_build_blocks.py
@@ -256,6 +256,14 @@ def test_normalized(self):
         print(comments)
         assert 'vectors are normalized' in comments
 
+    def test_hash(self):
+        """MatrixStats.hash_value must be the same for identical matrices."""
+        hashes = []
+        for _ in range(2):
+            m = np.random.RandomState(123).rand(40, 20).astype('float32')
+            hashes.append(faiss.MatrixStats(m).hash_value)
+        self.assertEqual(hashes[0], hashes[1])
+
 
 class TestScalarQuantizer(unittest.TestCase):