Skip to content

Commit 5c4bd3f

Browse files
mdouze authored and facebook-github-bot committed
Cleanup clustering code (facebookresearch#3030)
Summary: Pull Request resolved: facebookresearch#3030. Added default arguments to the .h file (for some reason I forgot this file when migrating default args). Logging a hash value in MatrixStats, useful to check whether two runs really run on the same matrix. Reviewed By: pemazare. Differential Revision: D48834343. fbshipit-source-id: 7c1948464e66ada1f462f4486f7cf3159bbf9dfd
1 parent 3888f9b commit 5c4bd3f

File tree

5 files changed

+72
-71
lines changed

5 files changed

+72
-71
lines changed

faiss/Clustering.cpp

-14
Original file line numberDiff line numberDiff line change
@@ -27,20 +27,6 @@
2727

2828
namespace faiss {
2929

30-
ClusteringParameters::ClusteringParameters()
31-
: niter(25),
32-
nredo(1),
33-
verbose(false),
34-
spherical(false),
35-
int_centroids(false),
36-
update_index(false),
37-
frozen_centroids(false),
38-
min_points_per_centroid(39),
39-
max_points_per_centroid(256),
40-
seed(1234),
41-
decode_block_size(32768) {}
42-
// 39 corresponds to 10000 / 256 -> to avoid warnings on PQ tests with randu10k
43-
4430
Clustering::Clustering(int d, int k) : d(d), k(k) {}
4531

4632
Clustering::Clustering(int d, int k, const ClusteringParameters& cp)

faiss/Clustering.h

+31-21
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
* LICENSE file in the root directory of this source tree.
66
*/
77

8-
// -*- c++ -*-
8+
/** Implementation of k-means clustering with many variants. */
99

1010
#ifndef FAISS_CLUSTERING_H
1111
#define FAISS_CLUSTERING_H
@@ -19,25 +19,35 @@ namespace faiss {
1919
* constructor of the Clustering object.
2020
*/
2121
struct ClusteringParameters {
22-
int niter; ///< clustering iterations
23-
int nredo; ///< redo clustering this many times and keep best
24-
25-
bool verbose;
26-
bool spherical; ///< do we want normalized centroids?
27-
bool int_centroids; ///< round centroids coordinates to integer
28-
bool update_index; ///< re-train index after each iteration?
29-
bool frozen_centroids; ///< use the centroids provided as input and do not
30-
///< change them during iterations
31-
32-
int min_points_per_centroid; ///< otherwise you get a warning
33-
int max_points_per_centroid; ///< to limit size of dataset
34-
35-
int seed; ///< seed for the random number generator
36-
37-
size_t decode_block_size; ///< how many vectors at a time to decode
38-
39-
/// sets reasonable defaults
40-
ClusteringParameters();
22+
/// number of clustering iterations
23+
int niter = 25;
24+
/// redo clustering this many times and keep the clusters with the best
25+
/// objective
26+
int nredo = 1;
27+
28+
bool verbose = false;
29+
/// whether to normalize centroids after each iteration (useful for inner
30+
/// product clustering)
31+
bool spherical = false;
32+
/// round centroids coordinates to integer after each iteration?
33+
bool int_centroids = false;
34+
/// re-train index after each iteration?
35+
bool update_index = false;
36+
37+
/// Use the subset of centroids provided as input and do not change them
38+
/// during iterations
39+
bool frozen_centroids = false;
40+
/// If fewer than this number of training vectors per centroid are provided,
41+
/// writes a warning. Note that fewer than 1 point per centroid raises an
42+
/// exception.
43+
int min_points_per_centroid = 39;
44+
/// to limit size of dataset, otherwise the training set is subsampled
45+
int max_points_per_centroid = 256;
46+
/// seed for the random number generator
47+
int seed = 1234;
48+
49+
/// when the training set is encoded, batch size of the codec decoder
50+
size_t decode_block_size = 32768;
4151
};
4252

4353
struct ClusteringIterationStats {
@@ -94,7 +104,7 @@ struct Clustering : ClusteringParameters {
94104
* to decode the input vectors.
95105
*
96106
* @param codec codec used to decode the vectors (nullptr =
97-
* vectors are in fact floats) *
107+
* vectors are in fact floats)
98108
*/
99109
void train_encoded(
100110
idx_t nx,

faiss/MatrixStats.cpp

+12-27
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <stdarg.h> /* va_list, va_start, va_arg, va_end */
1313

1414
#include <faiss/utils/utils.h>
15+
#include <inttypes.h>
1516
#include <cmath>
1617
#include <cstdio>
1718

@@ -21,18 +22,6 @@ namespace faiss {
2122
* MatrixStats
2223
*********************************************************************/
2324

24-
MatrixStats::PerDimStats::PerDimStats()
25-
: n(0),
26-
n_nan(0),
27-
n_inf(0),
28-
n0(0),
29-
min(HUGE_VALF),
30-
max(-HUGE_VALF),
31-
sum(0),
32-
sum2(0),
33-
mean(NAN),
34-
stddev(NAN) {}
35-
3625
void MatrixStats::PerDimStats::add(float x) {
3726
n++;
3827
if (std::isnan(x)) {
@@ -74,26 +63,22 @@ void MatrixStats::do_comment(const char* fmt, ...) {
7463
buf += size;
7564
}
7665

77-
MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
78-
: n(n),
79-
d(d),
80-
n_collision(0),
81-
n_valid(0),
82-
n0(0),
83-
min_norm2(HUGE_VAL),
84-
max_norm2(0) {
66+
MatrixStats::MatrixStats(size_t n, size_t d, const float* x) : n(n), d(d) {
8567
std::vector<char> comment_buf(10000);
8668
buf = comment_buf.data();
8769
nbuf = comment_buf.size();
8870

89-
do_comment("analyzing %ld vectors of size %ld\n", n, d);
71+
do_comment("analyzing %zd vectors of size %zd\n", n, d);
9072

9173
if (d > 1024) {
9274
do_comment(
9375
"indexing this many dimensions is hard, "
9476
"please consider dimensionality reducution (with PCAMatrix)\n");
9577
}
9678

79+
hash_value = hash_bytes((const uint8_t*)x, n * d * sizeof(*x));
80+
do_comment("hash value 0x%016" PRIx64 "\n", hash_value);
81+
9782
size_t nbytes = sizeof(x[0]) * d;
9883
per_dim_stats.resize(d);
9984

@@ -156,7 +141,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
156141

157142
if (n_collision > 0) {
158143
do_comment(
159-
"%ld collisions in hash table, "
144+
"%zd collisions in hash table, "
160145
"counts may be invalid\n",
161146
n_collision);
162147
}
@@ -167,22 +152,22 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
167152
max = it->second;
168153
}
169154
}
170-
do_comment("vector %ld has %ld copies\n", max.first, max.count);
155+
do_comment("vector %zd has %zd copies\n", max.first, max.count);
171156
}
172157

173158
{ // norm stats
174159
min_norm2 = sqrt(min_norm2);
175160
max_norm2 = sqrt(max_norm2);
176161
do_comment(
177-
"range of L2 norms=[%g, %g] (%ld null vectors)\n",
162+
"range of L2 norms=[%g, %g] (%zd null vectors)\n",
178163
min_norm2,
179164
max_norm2,
180165
n0);
181166

182167
if (max_norm2 < min_norm2 * 1.0001) {
183168
do_comment(
184169
"vectors are normalized, inner product and "
185-
"L2 search are equivalent\n");
170+
"L2 search are equivalent\n");
186171
}
187172

188173
if (max_norm2 > min_norm2 * 100) {
@@ -227,15 +212,15 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
227212
do_comment("no constant dimensions\n");
228213
} else {
229214
do_comment(
230-
"%ld dimensions are constant: they can be removed\n",
215+
"%zd dimensions are constant: they can be removed\n",
231216
n_0_range);
232217
}
233218

234219
if (n_dangerous_range == 0) {
235220
do_comment("no dimension has a too large mean\n");
236221
} else {
237222
do_comment(
238-
"%ld dimensions are too large "
223+
"%zd dimensions are too large "
239224
"wrt. their variance, may loose precision "
240225
"in IndexFlatL2 (use CenteringTransform)\n",
241226
n_dangerous_range);

faiss/MatrixStats.h

+21-9
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#pragma once
1111

1212
#include <stdint.h>
13+
#include <cmath>
1314
#include <string>
1415
#include <unordered_map>
1516
#include <vector>
@@ -26,20 +27,31 @@ struct MatrixStats {
2627
std::string comments;
2728

2829
// raw statistics
29-
size_t n, d;
30-
size_t n_collision, n_valid, n0;
31-
double min_norm2, max_norm2;
30+
size_t n = 0, d = 0;
31+
size_t n_collision = 0;
32+
size_t n_valid = 0;
33+
size_t n0 = 0;
34+
double min_norm2 = HUGE_VALF;
35+
double max_norm2 = 0;
36+
uint64_t hash_value = 0;
3237

3338
struct PerDimStats {
34-
size_t n, n_nan, n_inf, n0;
39+
/// counts of various special entries
40+
size_t n = 0;
41+
size_t n_nan = 0;
42+
size_t n_inf = 0;
43+
size_t n0 = 0;
3544

36-
float min, max;
37-
double sum, sum2;
45+
/// to get min/max and stddev values
46+
float min = HUGE_VALF;
47+
float max = -HUGE_VALF;
48+
double sum = 0;
49+
double sum2 = 0;
3850

39-
size_t n_valid;
40-
double mean, stddev;
51+
size_t n_valid = 0;
52+
double mean = NAN;
53+
double stddev = NAN;
4154

42-
PerDimStats();
4355
void add(float x);
4456
void compute_mean_std();
4557
};

tests/test_build_blocks.py

+8
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,14 @@ def test_normalized(self):
256256
print(comments)
257257
assert 'vectors are normalized' in comments
258258

259+
def test_hash(self):
260+
cc = []
261+
for _ in range(2):
262+
rs = np.random.RandomState(123)
263+
m = rs.rand(40, 20).astype('float32')
264+
cc.append(faiss.MatrixStats(m).hash_value)
265+
self.assertTrue(cc[0] == cc[1])
266+
259267

260268
class TestScalarQuantizer(unittest.TestCase):
261269

0 commit comments

Comments
 (0)