Skip to content

Commit fd09e51

Browse files
mdouze authored and facebook-github-bot committed
move by_residual to IndexIVF (facebookresearch#2870)
Summary: Pull Request resolved: facebookresearch#2870 Factor by_residual for all the IndexIVF inheritors. Some training code can be put in IndexIVF and `train_residual` is replaced with `train_encoder`. This will be used for the IndependentQuantizer work. Reviewed By: alexanderguzhva Differential Revision: D45987304 fbshipit-source-id: 7310a687b556b2faa15a76456b1d9000e21b58ce
1 parent 1c1879b commit fd09e51

29 files changed

+224
-269
lines changed

c_api/IndexIVF_c.cpp

+11
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,17 @@ void faiss_IndexIVF_invlists_get_ids(
165165
memcpy(invlist, list, list_size * sizeof(idx_t));
166166
}
167167

168+
int faiss_IndexIVF_train_encoder(
169+
FaissIndexIVF* index,
170+
idx_t n,
171+
const float* x,
172+
const idx_t* assign) {
173+
try {
174+
reinterpret_cast<IndexIVF*>(index)->train_encoder(n, x, assign);
175+
}
176+
CATCH_AND_HANDLE
177+
}
178+
168179
void faiss_IndexIVFStats_reset(FaissIndexIVFStats* stats) {
169180
reinterpret_cast<IndexIVFStats*>(stats)->reset();
170181
}

c_api/IndexIVF_c.h

+6
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,12 @@ void faiss_IndexIVF_invlists_get_ids(
154154
size_t list_no,
155155
idx_t* invlist);
156156

157+
int faiss_IndexIVF_train_encoder(
158+
FaissIndexIVF* index,
159+
idx_t n,
160+
const float* x,
161+
const idx_t* assign);
162+
157163
typedef struct FaissIndexIVFStats {
158164
size_t nq; // nb of queries run
159165
size_t nlist; // nb of inverted lists scanned

c_api/IndexScalarQuantizer_c.cpp

-10
Original file line numberDiff line numberDiff line change
@@ -110,13 +110,3 @@ int faiss_IndexIVFScalarQuantizer_add_core(
110110
}
111111
CATCH_AND_HANDLE
112112
}
113-
114-
int faiss_IndexIVFScalarQuantizer_train_residual(
115-
FaissIndexIVFScalarQuantizer* index,
116-
idx_t n,
117-
const float* x) {
118-
try {
119-
reinterpret_cast<IndexIVFScalarQuantizer*>(index)->train_residual(n, x);
120-
}
121-
CATCH_AND_HANDLE
122-
}

c_api/IndexScalarQuantizer_c.h

-5
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,6 @@ int faiss_IndexIVFScalarQuantizer_add_core(
8888
const idx_t* xids,
8989
const idx_t* precomputed_idx);
9090

91-
int faiss_IndexIVFScalarQuantizer_train_residual(
92-
FaissIndexIVFScalarQuantizer* index,
93-
idx_t n,
94-
const float* x);
95-
9691
#ifdef __cplusplus
9792
}
9893
#endif

faiss/IndexIVF.cpp

+36-6
Original file line numberDiff line numberDiff line change
@@ -1061,22 +1061,52 @@ void IndexIVF::update_vectors(int n, const idx_t* new_ids, const float* x) {
10611061
}
10621062

10631063
void IndexIVF::train(idx_t n, const float* x) {
1064-
if (verbose)
1064+
if (verbose) {
10651065
printf("Training level-1 quantizer\n");
1066+
}
10661067

10671068
train_q1(n, x, verbose, metric_type);
10681069

1069-
if (verbose)
1070+
if (verbose) {
10701071
printf("Training IVF residual\n");
1072+
}
1073+
1074+
// optional subsampling
1075+
idx_t max_nt = train_encoder_num_vectors();
1076+
if (max_nt <= 0) {
1077+
max_nt = (size_t)1 << 35;
1078+
}
1079+
1080+
TransformedVectors tv(
1081+
x, fvecs_maybe_subsample(d, (size_t*)&n, max_nt, x, verbose));
1082+
1083+
if (by_residual) {
1084+
std::vector<idx_t> assign(n);
1085+
quantizer->assign(n, tv.x, assign.data());
1086+
1087+
std::vector<float> residuals(n * d);
1088+
quantizer->compute_residual_n(n, tv.x, residuals.data(), assign.data());
1089+
1090+
train_encoder(n, residuals.data(), assign.data());
1091+
} else {
1092+
train_encoder(n, tv.x, nullptr);
1093+
}
10711094

1072-
train_residual(n, x);
10731095
is_trained = true;
10741096
}
10751097

1076-
void IndexIVF::train_residual(idx_t /*n*/, const float* /*x*/) {
1077-
if (verbose)
1078-
printf("IndexIVF: no residual training\n");
1098+
idx_t IndexIVF::train_encoder_num_vectors() const {
1099+
return 0;
1100+
}
1101+
1102+
void IndexIVF::train_encoder(
1103+
idx_t /*n*/,
1104+
const float* /*x*/,
1105+
const idx_t* assign) {
10791106
// does nothing by default
1107+
if (verbose) {
1108+
printf("IndexIVF: no residual training\n");
1109+
}
10801110
}
10811111

10821112
bool check_compatible_for_merge_expensive_check = true;

faiss/IndexIVF.h

+15-4
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ struct IndexIVF : Index, IndexIVFInterface {
177177
bool own_invlists = false;
178178

179179
size_t code_size = 0; ///< code size per vector in bytes
180+
180181
/** Parallel mode determines how queries are parallelized with OpenMP
181182
*
182183
* 0 (default): split over queries
@@ -194,6 +195,10 @@ struct IndexIVF : Index, IndexIVFInterface {
194195
* enables reconstruct() */
195196
DirectMap direct_map;
196197

198+
/// do the codes in the invlists encode the vectors relative to the
199+
/// centroids?
200+
bool by_residual = true;
201+
197202
/** The Inverted file takes a quantizer (an Index) on input,
198203
* which implements the function mapping a vector to a list
199204
* identifier.
@@ -207,7 +212,7 @@ struct IndexIVF : Index, IndexIVFInterface {
207212

208213
void reset() override;
209214

210-
/// Trains the quantizer and calls train_residual to train sub-quantizers
215+
/// Trains the quantizer and calls train_encoder to train sub-quantizers
211216
void train(idx_t n, const float* x) override;
212217

213218
/// Calls add_with_ids with NULL ids
@@ -252,9 +257,15 @@ struct IndexIVF : Index, IndexIVFInterface {
252257
*/
253258
void add_sa_codes(idx_t n, const uint8_t* codes, const idx_t* xids);
254259

255-
/// Sub-classes that encode the residuals can train their encoders here
256-
/// does nothing by default
257-
virtual void train_residual(idx_t n, const float* x);
260+
/** Train the encoder for the vectors.
261+
*
262+
* If by_residual then it is called with residuals and corresponding assign
263+
* array, otherwise x is the raw training vectors and assign=nullptr */
264+
virtual void train_encoder(idx_t n, const float* x, const idx_t* assign);
265+
266+
/// can be redefined by subclasses to indicate how many training vectors
267+
/// they need
268+
virtual idx_t train_encoder_num_vectors() const;
258269

259270
void search_preassigned(
260271
idx_t n,

faiss/IndexIVFAdditiveQuantizer.cpp

+8-18
Original file line numberDiff line numberDiff line change
@@ -37,30 +37,20 @@ IndexIVFAdditiveQuantizer::IndexIVFAdditiveQuantizer(
3737
IndexIVFAdditiveQuantizer::IndexIVFAdditiveQuantizer(AdditiveQuantizer* aq)
3838
: IndexIVF(), aq(aq) {}
3939

40-
void IndexIVFAdditiveQuantizer::train_residual(idx_t n, const float* x) {
41-
const float* x_in = x;
40+
void IndexIVFAdditiveQuantizer::train_encoder(
41+
idx_t n,
42+
const float* x,
43+
const idx_t* assign) {
44+
aq->train(n, x);
45+
}
4246

47+
idx_t IndexIVFAdditiveQuantizer::train_encoder_num_vectors() const {
4348
size_t max_train_points = 1024 * ((size_t)1 << aq->nbits[0]);
4449
// we need more data to train LSQ
4550
if (dynamic_cast<LocalSearchQuantizer*>(aq)) {
4651
max_train_points = 1024 * aq->M * ((size_t)1 << aq->nbits[0]);
4752
}
48-
49-
x = fvecs_maybe_subsample(
50-
d, (size_t*)&n, max_train_points, x, verbose, 1234);
51-
ScopeDeleter<float> del_x(x_in == x ? nullptr : x);
52-
53-
if (by_residual) {
54-
std::vector<idx_t> idx(n);
55-
quantizer->assign(n, x, idx.data());
56-
57-
std::vector<float> residuals(n * d);
58-
quantizer->compute_residual_n(n, x, residuals.data(), idx.data());
59-
60-
aq->train(n, residuals.data());
61-
} else {
62-
aq->train(n, x);
63-
}
53+
return max_train_points;
6454
}
6555

6656
void IndexIVFAdditiveQuantizer::encode_vectors(

faiss/IndexIVFAdditiveQuantizer.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ namespace faiss {
2626
struct IndexIVFAdditiveQuantizer : IndexIVF {
2727
// the quantizer
2828
AdditiveQuantizer* aq;
29-
bool by_residual = true;
3029
int use_precomputed_table = 0; // for future use
3130

3231
using Search_type_t = AdditiveQuantizer::Search_type_t;
@@ -40,7 +39,9 @@ struct IndexIVFAdditiveQuantizer : IndexIVF {
4039

4140
explicit IndexIVFAdditiveQuantizer(AdditiveQuantizer* aq);
4241

43-
void train_residual(idx_t n, const float* x) override;
42+
void train_encoder(idx_t n, const float* x, const idx_t* assign) override;
43+
44+
idx_t train_encoder_num_vectors() const override;
4445

4546
void encode_vectors(
4647
idx_t n,

faiss/IndexIVFAdditiveQuantizerFastScan.cpp

+10-36
Original file line numberDiff line numberDiff line change
@@ -131,45 +131,20 @@ IndexIVFAdditiveQuantizerFastScan::~IndexIVFAdditiveQuantizerFastScan() {}
131131
* Training
132132
*********************************************************/
133133

134-
void IndexIVFAdditiveQuantizerFastScan::train_residual(
134+
idx_t IndexIVFAdditiveQuantizerFastScan::train_encoder_num_vectors() const {
135+
return max_train_points;
136+
}
137+
138+
void IndexIVFAdditiveQuantizerFastScan::train_encoder(
135139
idx_t n,
136-
const float* x_in) {
140+
const float* x,
141+
const idx_t* assign) {
137142
if (aq->is_trained) {
138143
return;
139144
}
140145

141-
const int seed = 0x12345;
142-
size_t nt = n;
143-
const float* x = fvecs_maybe_subsample(
144-
d, &nt, max_train_points, x_in, verbose, seed);
145-
n = nt;
146146
if (verbose) {
147-
printf("training additive quantizer on %zd vectors\n", nt);
148-
}
149-
aq->verbose = verbose;
150-
151-
std::unique_ptr<float[]> del_x;
152-
if (x != x_in) {
153-
del_x.reset((float*)x);
154-
}
155-
156-
const float* trainset;
157-
std::vector<float> residuals(n * d);
158-
std::vector<idx_t> assign(n);
159-
160-
if (by_residual) {
161-
if (verbose) {
162-
printf("computing residuals\n");
163-
}
164-
quantizer->assign(n, x, assign.data());
165-
residuals.resize(n * d);
166-
for (idx_t i = 0; i < n; i++) {
167-
quantizer->compute_residual(
168-
x + i * d, residuals.data() + i * d, assign[i]);
169-
}
170-
trainset = residuals.data();
171-
} else {
172-
trainset = x;
147+
printf("training additive quantizer on %d vectors\n", int(n));
173148
}
174149

175150
if (verbose) {
@@ -181,17 +156,16 @@ void IndexIVFAdditiveQuantizerFastScan::train_residual(
181156
d);
182157
}
183158
aq->verbose = verbose;
184-
aq->train(n, trainset);
159+
aq->train(n, x);
185160

186161
// train norm quantizer
187162
if (by_residual && metric_type == METRIC_L2) {
188163
std::vector<float> decoded_x(n * d);
189164
std::vector<uint8_t> x_codes(n * aq->code_size);
190-
aq->compute_codes(residuals.data(), x_codes.data(), n);
165+
aq->compute_codes(x, x_codes.data(), n);
191166
aq->decode(x_codes.data(), decoded_x.data(), n);
192167

193168
// add coarse centroids
194-
FAISS_THROW_IF_NOT(assign.size() == n);
195169
std::vector<float> centroid(d);
196170
for (idx_t i = 0; i < n; i++) {
197171
auto xi = decoded_x.data() + i * d;

faiss/IndexIVFAdditiveQuantizerFastScan.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,9 @@ struct IndexIVFAdditiveQuantizerFastScan : IndexIVFFastScan {
6363
const IndexIVFAdditiveQuantizer& orig,
6464
int bbs = 32);
6565

66-
void train_residual(idx_t n, const float* x) override;
66+
void train_encoder(idx_t n, const float* x, const idx_t* assign) override;
67+
68+
idx_t train_encoder_num_vectors() const override;
6769

6870
void estimate_norm_scale(idx_t n, const float* x);
6971

faiss/IndexIVFFastScan.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,16 @@ IndexIVFFastScan::IndexIVFFastScan(
4343
size_t code_size,
4444
MetricType metric)
4545
: IndexIVF(quantizer, d, nlist, code_size, metric) {
46+
// unlike other indexes, we prefer no residuals for performance reasons.
47+
by_residual = false;
4648
FAISS_THROW_IF_NOT(metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT);
4749
}
4850

4951
IndexIVFFastScan::IndexIVFFastScan() {
5052
bbs = 0;
5153
M2 = 0;
5254
is_trained = false;
55+
by_residual = false;
5356
}
5457

5558
void IndexIVFFastScan::init_fastscan(

faiss/IndexIVFFastScan.h

-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ struct IndexIVFFastScan : IndexIVF {
4545
int implem = 0;
4646
// skip some parts of the computation (for timing)
4747
int skip = 0;
48-
bool by_residual = false;
4948

5049
// batching factors at search time (0 = default)
5150
int qbs = 0;

faiss/IndexIVFFlat.cpp

+7
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ IndexIVFFlat::IndexIVFFlat(
3636
MetricType metric)
3737
: IndexIVF(quantizer, d, nlist, sizeof(float) * d, metric) {
3838
code_size = sizeof(float) * d;
39+
by_residual = false;
40+
}
41+
42+
IndexIVFFlat::IndexIVFFlat() {
43+
by_residual = false;
3944
}
4045

4146
void IndexIVFFlat::add_core(
@@ -45,6 +50,7 @@ void IndexIVFFlat::add_core(
4550
const int64_t* coarse_idx) {
4651
FAISS_THROW_IF_NOT(is_trained);
4752
FAISS_THROW_IF_NOT(coarse_idx);
53+
FAISS_THROW_IF_NOT(!by_residual);
4854
assert(invlists);
4955
direct_map.check_can_add(xids);
5056

@@ -89,6 +95,7 @@ void IndexIVFFlat::encode_vectors(
8995
const idx_t* list_nos,
9096
uint8_t* codes,
9197
bool include_listnos) const {
98+
FAISS_THROW_IF_NOT(!by_residual);
9299
if (!include_listnos) {
93100
memcpy(codes, x, code_size * n);
94101
} else {

faiss/IndexIVFFlat.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ struct IndexIVFFlat : IndexIVF {
5050

5151
void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
5252

53-
IndexIVFFlat() {}
53+
IndexIVFFlat();
5454
};
5555

5656
struct IndexIVFFlatDedup : IndexIVFFlat {

0 commit comments

Comments
 (0)