Skip to content

Commit 821a401

Browse files
algoriddle authored and facebook-github-bot committed
CodeSet for deduping large datasets (facebookresearch#2949)
Summary: Pull Request resolved: facebookresearch#2949 A more scalable alternative to `np.unique` for deduping large datasets with a quantized code. Reviewed By: mlomeli1 Differential Revision: D47443953 fbshipit-source-id: 4a1554d4d4200b5fa657e9d8b7395bba9856a8e3
1 parent 43d86e3 commit 821a401

File tree

5 files changed

+52
-0
lines changed

5 files changed

+52
-0
lines changed

faiss/python/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
class_wrappers.handle_IDSelectorSubset(IDSelectorBatch, class_owns=True)
4242
class_wrappers.handle_IDSelectorSubset(IDSelectorArray, class_owns=False)
4343
class_wrappers.handle_IDSelectorSubset(IDSelectorBitmap, class_owns=False, force_int64=False)
44+
# Swap CodeSet.insert for the numpy-array wrapper defined in class_wrappers.
class_wrappers.handle_CodeSet(CodeSet)
4445

4546
this_module = sys.modules[__name__]
4647

faiss/python/class_wrappers.py

+18
Original file line numberDiff line numberDiff line change
@@ -1102,3 +1102,21 @@ def replacement_init(self, *args):
11021102
self.original_init(*args)
11031103

11041104
the_class.__init__ = replacement_init
1105+
1106+
1107+
def handle_CodeSet(the_class):
    """Replace `the_class.insert` with a wrapper that takes numpy arrays."""

    def replacement_insert(self, codes, inserted=None):
        """Insert a batch of codes and report which ones were inserted.

        Parameters
        ----------
        codes : array
            2D array of shape (n, d) where d must equal `self.d`. It is
            converted to a C-contiguous uint8 array before the call.
        inserted : array, optional
            Output array of shape (n,). When None, a bool array of size n
            is allocated.

        Returns
        -------
        inserted : array
            The output array, filled in by the wrapped `insert_c` call.
        """
        n, d = codes.shape
        assert d == self.d
        codes = np.ascontiguousarray(codes, dtype=np.uint8)

        if inserted is None:
            # every entry is written by insert_c, so no need to initialize
            inserted = np.empty(n, dtype=bool)
        else:
            assert inserted.shape == (n, )

        # NOTE(review): insert_c is presumably the original SWIG method kept
        # under that name by replace_method -- confirm against replace_method.
        self.insert_c(n, swig_ptr(codes), swig_ptr(inserted))
        return inserted

    replace_method(the_class, 'insert', replacement_insert)

faiss/utils/utils.cpp

+9
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <omp.h>
2929

3030
#include <algorithm>
31+
#include <set>
3132
#include <type_traits>
3233
#include <vector>
3334

@@ -623,4 +624,12 @@ void CombinerRangeKNN<T>::write_result(T* D_res, int64_t* I_res) {
623624
template struct CombinerRangeKNN<float>;
624625
template struct CombinerRangeKNN<int16_t>;
625626

627+
void CodeSet::insert(size_t n, const uint8_t* codes, bool* inserted) {
628+
for (size_t i = 0; i < n; i++) {
629+
auto res = s.insert(
630+
std::vector<uint8_t>(codes + i * d, codes + i * d + d));
631+
inserted[i] = res.second;
632+
}
633+
}
634+
626635
} // namespace faiss

faiss/utils/utils.h

+10
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717
#define FAISS_utils_h
1818

1919
#include <stdint.h>
20+
#include <set>
2021
#include <string>
22+
#include <vector>
2123

2224
#include <faiss/impl/platform_macros.h>
2325
#include <faiss/utils/Heap.h>
@@ -209,6 +211,14 @@ struct CombinerRangeKNN {
209211
void write_result(T* D_res, int64_t* I_res);
210212
};
211213

214+
/// Set of fixed-size byte codes, stored as one byte vector per code.
struct CodeSet {
    /// size of one code, in bytes
    size_t d;
    /// the distinct codes held by the set
    std::set<std::vector<uint8_t>> s;

    /// @param d  size of one code, in bytes
    explicit CodeSet(size_t d) : d(d) {}

    /**
     * Insert n codes into the set.
     *
     * @param n         number of codes
     * @param codes     input buffer holding the n codes
     * @param inserted  output buffer of size n, one flag per input code
     */
    void insert(size_t n, const uint8_t* codes, bool* inserted);
};
221+
212222
} // namespace faiss
213223

214224
#endif /* FAISS_utils_h */

tests/test_contrib.py

+14
Original file line numberDiff line numberDiff line change
@@ -630,3 +630,17 @@ def test_hnsw_permute(self):
630630
np.testing.assert_equal(Dnew, Dref)
631631
Inew_remap = perm[Inew]
632632
np.testing.assert_equal(Inew_remap, Iref)
633+
634+
635+
class TestCodeSet(unittest.TestCase):

    def test_code_set(self):
        """ CodeSet and np.unique should produce the same output """
        d = 8
        n = 1000  # > 256 and using only 0 or 1 so there must be duplicates
        codes = np.random.randint(0, 2, (n, d), dtype=np.uint8)
        s = faiss.CodeSet(d)
        inserted = s.insert(codes)

        ref = np.unique(codes, axis=0)
        new = codes[inserted]
        # Same number of rows as the reference: no duplicate row was
        # flagged as inserted.
        self.assertEqual(new.shape, ref.shape)
        # Compare whole rows. Sorting the flattened bytes would only
        # compare how many 0s and 1s there are overall.
        np.testing.assert_equal(np.unique(new, axis=0), ref)

0 commit comments

Comments
 (0)