From 61f1215ed51ff817ba5f63482c474d4518a975a0 Mon Sep 17 00:00:00 2001 From: Arnaud Gourlay Date: Mon, 4 Dec 2023 11:23:27 +0100 Subject: [PATCH 01/11] sparse vectors support --- Cargo.lock | 3 +-- Cargo.toml | 2 +- src/args.rs | 4 ++++ src/common.rs | 24 ++++++++++++++++++++++-- src/main.rs | 16 ++++++++++++++-- src/search.rs | 16 ++++++++++++---- src/upsert.rs | 15 +++++++++++---- 7 files changed, 65 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a1237b5..675ec11 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -893,8 +893,7 @@ dependencies = [ [[package]] name = "qdrant-client" version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "337c9a836a26e90a171fe8eb4c57e8d0a4e65a23cbac76685960df36c227081e" +source = "git+https://github.com/qdrant/rust-client.git?branch=dev#9462cc9e05f86b122cb6409bfc6ce78fe23d538c" dependencies = [ "anyhow", "futures-util", diff --git a/Cargo.toml b/Cargo.toml index 88eb1ec..a2211c2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ ctrlc = "3.4.1" futures = "0.3.29" indicatif = "0.17.7" memmap2 = "0.9.0" -qdrant-client = "1.6.0" +qdrant-client = { git = "https://github.com/qdrant/rust-client.git", branch = "dev" } rand = "0.8.5" serde = "1.0" serde_json = "1.0" diff --git a/src/args.rs b/src/args.rs index 5bd348a..2744a33 100644 --- a/src/args.rs +++ b/src/args.rs @@ -221,6 +221,10 @@ pub struct Args { /// Skip un-indexed segments during search #[clap(long)] pub indexed_only: Option, + + /// Whether to use sparse vectors + #[clap(long)] + pub sparse_vectors: Option, } #[derive(Copy, Clone, Debug)] diff --git a/src/common.rs b/src/common.rs index 12c948a..44582c0 100644 --- a/src/common.rs +++ b/src/common.rs @@ -1,8 +1,9 @@ +use crate::args::Args; use core::option::Option; use core::option::Option::{None, Some}; use qdrant_client::client::{Payload, QdrantClient}; use qdrant_client::qdrant::r#match::MatchValue; -use qdrant_client::qdrant::{FieldCondition, Filter, 
Match, Range}; +use qdrant_client::qdrant::{FieldCondition, Filter, Match, Range, Vector}; use rand::prelude::SliceRandom; use rand::Rng; @@ -111,7 +112,26 @@ pub fn random_filter( have_any.then_some(filter) } -pub fn random_vector(dim: usize) -> Vec { +pub fn random_vector(args: &Args) -> Vector { + if args.sparse_vectors == Some(true) { + random_sparse_vector(args.dim).into() + } else { + random_dense_vector(args.dim).into() + } +} + +pub fn random_sparse_vector(max_size: usize) -> Vec<(u32, f32)> { + let mut rng = rand::thread_rng(); + let size = rng.gen_range(1..max_size); + // (index, value) + let mut pairs = Vec::with_capacity(size); + for i in 1..=size { + pairs.push((i as u32, rng.gen_range(-1.0..1.0) as f32)); + } + pairs +} + +pub fn random_dense_vector(dim: usize) -> Vec { let mut rng = rand::thread_rng(); (0..dim).map(|_| rng.gen_range(-1.0..1.0)).collect() } diff --git a/src/main.rs b/src/main.rs index 78d8415..b96d117 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,7 +6,7 @@ mod upsert; use crate::args::QuantizationArg; use crate::common::{ - random_filter, random_payload, random_vector, INTEGERS_PAYLOAD_KEY, KEYWORD_PAYLOAD_KEY, + random_dense_vector, random_filter, random_payload, INTEGERS_PAYLOAD_KEY, KEYWORD_PAYLOAD_KEY, }; use crate::fbin_reader::FBinReader; use crate::search::SearchProcessor; @@ -23,7 +23,8 @@ use qdrant_client::qdrant::vectors_config::Config; use qdrant_client::qdrant::{ CollectionStatus, CompressionRatio, CreateCollection, Distance, FieldType, HnswConfigDiff, OptimizersConfigDiff, ProductQuantization, QuantizationConfig, QuantizationType, - ScalarQuantization, VectorParams, VectorParamsMap, VectorsConfig, + ScalarQuantization, SparseVectorConfig, SparseVectorParams, VectorParams, VectorParamsMap, + VectorsConfig, }; use rand::Rng; use serde::{Deserialize, Serialize}; @@ -134,6 +135,16 @@ async fn recreate_collection(args: &Args, stopped: Arc) -> Result<() Config::ParamsMap(VectorParamsMap { map: params }) }; + let 
sparse_vectors_config = if args.sparse_vectors == Some(true) { + let params = (0..args.vectors_per_point) + .map(|idx| (idx.to_string(), SparseVectorParams { index: None })) + .collect(); + + Some(SparseVectorConfig { map: params }) + } else { + None + }; + client .create_collection(&CreateCollection { collection_name: args.collection_name.clone(), @@ -200,6 +211,7 @@ async fn recreate_collection(args: &Args, stopped: Arc) -> Result<() }, None => None, }, + sparse_vectors_config, ..Default::default() }) .await?; diff --git a/src/search.rs b/src/search.rs index 11af787..06fcb6d 100644 --- a/src/search.rs +++ b/src/search.rs @@ -1,8 +1,8 @@ -use crate::common::{random_vector_name, retry_with_clients}; -use crate::{random_filter, random_vector, Args}; +use crate::common::{random_sparse_vector, random_vector_name, retry_with_clients}; +use crate::{random_dense_vector, random_filter, Args}; use indicatif::ProgressBar; use qdrant_client::client::QdrantClient; -use qdrant_client::qdrant::{QuantizationSearchParams, SearchParams, SearchPoints}; +use qdrant_client::qdrant::{QuantizationSearchParams, SearchParams, SearchPoints, Vector}; use std::sync::atomic::AtomicBool; use std::sync::{Arc, Mutex}; @@ -36,7 +36,12 @@ impl SearchProcessor { return Ok(()); } - let query_vector = random_vector(self.args.dim); + let (query_vector, sparse_indices) = if self.args.sparse_vectors == Some(true) { + let sparse_vector: Vector = random_sparse_vector(self.args.dim).into(); + (sparse_vector.data, sparse_vector.indices) + } else { + (random_dense_vector(self.args.dim), None) + }; let query_filter = random_filter( self.args.keywords, self.args.float_payloads, @@ -72,6 +77,9 @@ impl SearchProcessor { vector_name, with_vectors: None, read_consistency: self.args.read_consistency.map(Into::into), + timeout: None, + shard_key_selector: None, + sparse_indices, }; let res = diff --git a/src/upsert.rs b/src/upsert.rs index 37723f3..dc4861b 100644 --- a/src/upsert.rs +++ b/src/upsert.rs @@ -1,6 +1,6 
@@ -use crate::common::retry_with_clients; +use crate::common::{random_vector, retry_with_clients}; use crate::fbin_reader::FBinReader; -use crate::{random_payload, random_vector, Args}; +use crate::{random_dense_vector, random_payload, Args}; use anyhow::Error; use indicatif::ProgressBar; use qdrant_client::client::QdrantClient; @@ -75,13 +75,20 @@ impl UpsertProcessor { let vectors_map: HashMap<_, _> = (0..self.args.vectors_per_point) .map(|i| { let vector_name = format!("{}", i); - let vector = random_vector(self.args.dim); + let vector = random_vector(&self.args); (vector_name, vector) }) .collect(); vectors_map.into() + } else if self.args.sparse_vectors == Some(true) { + // single sparse vector must be named + let vectors_map: HashMap<_, _> = + vec![(format!("{}", i), random_vector(&self.args))] + .into_iter() + .collect(); + vectors_map.into() } else { - random_vector(self.args.dim).into() + random_dense_vector(self.args.dim).into() }; points.push(PointStruct::new( From 209f7147feec9ed96cac96e6e66c77c52d5abb63 Mon Sep 17 00:00:00 2001 From: Arnaud Gourlay Date: Mon, 4 Dec 2023 12:15:09 +0100 Subject: [PATCH 02/11] make it work --- src/args.rs | 4 ++-- src/common.rs | 2 +- src/main.rs | 3 ++- src/search.rs | 4 ++-- src/upsert.rs | 9 ++++----- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/args.rs b/src/args.rs index 2744a33..39d0d8e 100644 --- a/src/args.rs +++ b/src/args.rs @@ -223,8 +223,8 @@ pub struct Args { pub indexed_only: Option, /// Whether to use sparse vectors - #[clap(long)] - pub sparse_vectors: Option, + #[clap(long, default_value_t = false)] + pub sparse_vectors: bool, } #[derive(Copy, Clone, Debug)] diff --git a/src/common.rs b/src/common.rs index 44582c0..d7ee7ea 100644 --- a/src/common.rs +++ b/src/common.rs @@ -113,7 +113,7 @@ pub fn random_filter( } pub fn random_vector(args: &Args) -> Vector { - if args.sparse_vectors == Some(true) { + if args.sparse_vectors { random_sparse_vector(args.dim).into() } else { 
random_dense_vector(args.dim).into() diff --git a/src/main.rs b/src/main.rs index b96d117..ee70474 100644 --- a/src/main.rs +++ b/src/main.rs @@ -135,10 +135,11 @@ async fn recreate_collection(args: &Args, stopped: Arc) -> Result<() Config::ParamsMap(VectorParamsMap { map: params }) }; - let sparse_vectors_config = if args.sparse_vectors == Some(true) { + let sparse_vectors_config = if args.sparse_vectors { let params = (0..args.vectors_per_point) .map(|idx| (idx.to_string(), SparseVectorParams { index: None })) .collect(); + println!("Sparse vectors params: {:?}", params); Some(SparseVectorConfig { map: params }) } else { diff --git a/src/search.rs b/src/search.rs index 06fcb6d..4fc9c35 100644 --- a/src/search.rs +++ b/src/search.rs @@ -36,7 +36,7 @@ impl SearchProcessor { return Ok(()); } - let (query_vector, sparse_indices) = if self.args.sparse_vectors == Some(true) { + let (query_vector, sparse_indices) = if self.args.sparse_vectors { let sparse_vector: Vector = random_sparse_vector(self.args.dim).into(); (sparse_vector.data, sparse_vector.indices) } else { @@ -50,7 +50,7 @@ impl SearchProcessor { let start = std::time::Instant::now(); - let vector_name = if self.args.vectors_per_point > 1 { + let vector_name = if self.args.vectors_per_point > 1 || self.args.sparse_vectors { Some(random_vector_name(self.args.vectors_per_point)) } else { None diff --git a/src/upsert.rs b/src/upsert.rs index dc4861b..9edf8b7 100644 --- a/src/upsert.rs +++ b/src/upsert.rs @@ -80,12 +80,11 @@ impl UpsertProcessor { }) .collect(); vectors_map.into() - } else if self.args.sparse_vectors == Some(true) { + } else if self.args.sparse_vectors { // single sparse vector must be named - let vectors_map: HashMap<_, _> = - vec![(format!("{}", i), random_vector(&self.args))] - .into_iter() - .collect(); + let vectors_map: HashMap<_, _> = vec![("0".to_string(), random_vector(&self.args))] + .into_iter() + .collect(); vectors_map.into() } else { random_dense_vector(self.args.dim).into() From 
74697d6dc8487e063a940e848ada3a781d54989f Mon Sep 17 00:00:00 2001 From: Arnaud Gourlay Date: Mon, 4 Dec 2023 14:48:52 +0100 Subject: [PATCH 03/11] sparse only --- src/main.rs | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/main.rs b/src/main.rs index ee70474..5ff8a18 100644 --- a/src/main.rs +++ b/src/main.rs @@ -125,7 +125,7 @@ async fn recreate_collection(args: &Args, stopped: Arc) -> Result<() ..Default::default() }; - let vector_params = if args.vectors_per_point == 1 { + let dense_vector_params = if args.vectors_per_point == 1 { Config::Params(_vector_param) } else { let params = (0..args.vectors_per_point) @@ -135,11 +135,18 @@ async fn recreate_collection(args: &Args, stopped: Arc) -> Result<() Config::ParamsMap(VectorParamsMap { map: params }) }; + let vectors_config = if args.sparse_vectors { + Some(VectorsConfig { + config: Some(dense_vector_params), + }) + } else { + None + }; + let sparse_vectors_config = if args.sparse_vectors { let params = (0..args.vectors_per_point) .map(|idx| (idx.to_string(), SparseVectorParams { index: None })) .collect(); - println!("Sparse vectors params: {:?}", params); Some(SparseVectorConfig { map: params }) } else { @@ -149,9 +156,7 @@ async fn recreate_collection(args: &Args, stopped: Arc) -> Result<() client .create_collection(&CreateCollection { collection_name: args.collection_name.clone(), - vectors_config: Some(VectorsConfig { - config: Some(vector_params), - }), + vectors_config, hnsw_config: Some(HnswConfigDiff { on_disk: Some(args.on_disk_hnsw), m: args.hnsw_m.map(|x| x as u64), From 3221e34133ad8e0023ab61f2f4a41f43aed8e7c1 Mon Sep 17 00:00:00 2001 From: Arnaud Gourlay Date: Mon, 4 Dec 2023 15:55:10 +0100 Subject: [PATCH 04/11] fix --- src/main.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index 5ff8a18..3f77c50 100644 --- a/src/main.rs +++ b/src/main.rs @@ -135,12 +135,13 @@ async fn recreate_collection(args: &Args, 
stopped: Arc) -> Result<() Config::ParamsMap(VectorParamsMap { map: params }) }; + // TODO enable mix configurations (dense+sparse) let vectors_config = if args.sparse_vectors { + None + } else { Some(VectorsConfig { config: Some(dense_vector_params), }) - } else { - None }; let sparse_vectors_config = if args.sparse_vectors { From cf20c61adfab3b426202e341939db2a15ec4e9c0 Mon Sep 17 00:00:00 2001 From: Arnaud Gourlay Date: Mon, 4 Dec 2023 16:45:31 +0100 Subject: [PATCH 05/11] wired on_disk_index --- src/args.rs | 4 ++-- src/main.rs | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/args.rs b/src/args.rs index 39d0d8e..01aa6ab 100644 --- a/src/args.rs +++ b/src/args.rs @@ -115,9 +115,9 @@ pub struct Args { #[clap(long, default_value_t = false)] pub on_disk_payload: bool, - /// On disk hnsw + /// On disk index #[clap(long, default_value_t = false)] - pub on_disk_hnsw: bool, + pub on_disk_index: bool, /// On disk vectors #[clap(long)] diff --git a/src/main.rs b/src/main.rs index 3f77c50..1f8cde0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -23,8 +23,8 @@ use qdrant_client::qdrant::vectors_config::Config; use qdrant_client::qdrant::{ CollectionStatus, CompressionRatio, CreateCollection, Distance, FieldType, HnswConfigDiff, OptimizersConfigDiff, ProductQuantization, QuantizationConfig, QuantizationType, - ScalarQuantization, SparseVectorConfig, SparseVectorParams, VectorParams, VectorParamsMap, - VectorsConfig, + ScalarQuantization, SparseIndexConfig, SparseVectorConfig, SparseVectorParams, VectorParams, + VectorParamsMap, VectorsConfig, }; use rand::Rng; use serde::{Deserialize, Serialize}; @@ -146,7 +146,17 @@ async fn recreate_collection(args: &Args, stopped: Arc) -> Result<() let sparse_vectors_config = if args.sparse_vectors { let params = (0..args.vectors_per_point) - .map(|idx| (idx.to_string(), SparseVectorParams { index: None })) + .map(|idx| { + ( + idx.to_string(), + SparseVectorParams { + index: Some(SparseIndexConfig 
{ + full_scan_threshold: None, + on_disk: Some(args.on_disk_index), + }), + }, + ) + }) .collect(); Some(SparseVectorConfig { map: params }) @@ -159,7 +169,7 @@ async fn recreate_collection(args: &Args, stopped: Arc) -> Result<() collection_name: args.collection_name.clone(), vectors_config, hnsw_config: Some(HnswConfigDiff { - on_disk: Some(args.on_disk_hnsw), + on_disk: Some(args.on_disk_index), m: args.hnsw_m.map(|x| x as u64), ef_construct: args.hnsw_ef_construct.map(|x| x as u64), ..Default::default() From 20e86d549cd75d06870c6a47e1b5c0fe8046c99d Mon Sep 17 00:00:00 2001 From: Arnaud Gourlay Date: Tue, 5 Dec 2023 09:04:36 +0100 Subject: [PATCH 06/11] improve sparse vector generator --- src/common.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/common.rs b/src/common.rs index d7ee7ea..eaa6f22 100644 --- a/src/common.rs +++ b/src/common.rs @@ -120,13 +120,21 @@ pub fn random_vector(args: &Args) -> Vector { } } +/// Generate random sparse vector with random size and random values. +/// - `max_size` - maximum size of vector pub fn random_sparse_vector(max_size: usize) -> Vec<(u32, f32)> { let mut rng = rand::thread_rng(); let size = rng.gen_range(1..max_size); // (index, value) let mut pairs = Vec::with_capacity(size); for i in 1..=size { - pairs.push((i as u32, rng.gen_range(-1.0..1.0) as f32)); + // probability of skipping a dimension to make the vectors sparse + let skip = rng.gen_bool(0.05); + if skip { + continue; + } + // Only positive values are generated to make sure to hit the pruning path. 
+ pairs.push((i as u32, rng.gen_range(0.0..10.0) as f32)); } pairs } From a1de4d37adb6419cc6ab145b0a130d17e1ff7445 Mon Sep 17 00:00:00 2001 From: Arnaud Gourlay Date: Tue, 5 Dec 2023 10:15:00 +0100 Subject: [PATCH 07/11] generate sparser vectors --- src/args.rs | 2 +- src/common.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/args.rs b/src/args.rs index 01aa6ab..b02a0d0 100644 --- a/src/args.rs +++ b/src/args.rs @@ -40,7 +40,7 @@ pub struct Args { #[clap(short, long)] pub max_id: Option, - /// Number of dimensions in each vector + /// Number of dimensions in each dense vector or max dimension for sparse vectors #[clap(short, long, default_value_t = 128)] pub dim: usize, diff --git a/src/common.rs b/src/common.rs index eaa6f22..cc4ec2d 100644 --- a/src/common.rs +++ b/src/common.rs @@ -129,7 +129,7 @@ pub fn random_sparse_vector(max_size: usize) -> Vec<(u32, f32)> { let mut pairs = Vec::with_capacity(size); for i in 1..=size { // probability of skipping a dimension to make the vectors sparse - let skip = rng.gen_bool(0.05); + let skip = rng.gen_bool(0.1); if skip { continue; } From 96eac45df20e1cd54fa1b7e08b76c7599b5da495 Mon Sep 17 00:00:00 2001 From: timvisee Date: Tue, 5 Dec 2023 19:13:24 +0100 Subject: [PATCH 08/11] Change on-disk-index to optional boolean --- src/args.rs | 4 ++-- src/main.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/args.rs b/src/args.rs index b02a0d0..13d0fb9 100644 --- a/src/args.rs +++ b/src/args.rs @@ -116,8 +116,8 @@ pub struct Args { pub on_disk_payload: bool, /// On disk index - #[clap(long, default_value_t = false)] - pub on_disk_index: bool, + #[clap(long)] + pub on_disk_index: Option<bool>, /// On disk vectors #[clap(long)] diff --git a/src/main.rs b/src/main.rs index 1f8cde0..1c21e30 100644 --- a/src/main.rs +++ b/src/main.rs @@ -152,7 +152,7 @@ async fn recreate_collection(args: &Args, stopped: Arc) -> Result<() SparseVectorParams { index: Some(SparseIndexConfig { 
full_scan_threshold: None, - on_disk: Some(args.on_disk_index), + on_disk: args.on_disk_index, }), }, ) @@ -169,7 +169,7 @@ async fn recreate_collection(args: &Args, stopped: Arc) -> Result<() collection_name: args.collection_name.clone(), vectors_config, hnsw_config: Some(HnswConfigDiff { - on_disk: Some(args.on_disk_index), + on_disk: args.on_disk_index, m: args.hnsw_m.map(|x| x as u64), ef_construct: args.hnsw_ef_construct.map(|x| x as u64), ..Default::default() From cc37d7228219fbbc12b5fa4e09b1c9e639a65b54 Mon Sep 17 00:00:00 2001 From: generall Date: Wed, 6 Dec 2023 20:21:11 +0100 Subject: [PATCH 09/11] more advanced sparse config --- src/args.rs | 8 +++++-- src/common.rs | 11 ++++------ src/main.rs | 17 ++++++--------- src/search.rs | 58 +++++++++++++++++++++++++++++++++++++++------------ src/upsert.rs | 40 +++++++++++++++++++++++++++-------- 5 files changed, 92 insertions(+), 42 deletions(-) diff --git a/src/args.rs b/src/args.rs index 13d0fb9..f983dbe 100644 --- a/src/args.rs +++ b/src/args.rs @@ -223,8 +223,12 @@ pub struct Args { pub indexed_only: Option, /// Whether to use sparse vectors - #[clap(long, default_value_t = false)] - pub sparse_vectors: bool, + #[clap(long)] + pub sparse_vectors: Option<f64>, + + /// Number of named vectors per point + #[clap(long, default_value_t = 1)] + pub sparse_vectors_per_point: usize, } #[derive(Copy, Clone, Debug)] diff --git a/src/common.rs b/src/common.rs index cc4ec2d..e6f3761 100644 --- a/src/common.rs +++ b/src/common.rs @@ -113,23 +113,20 @@ pub fn random_filter( } pub fn random_vector(args: &Args) -> Vector { - if args.sparse_vectors { - random_sparse_vector(args.dim).into() - } else { - random_dense_vector(args.dim).into() - } + random_dense_vector(args.dim).into() } /// Generate random sparse vector with random size and random values. 
/// - `max_size` - maximum size of vector -pub fn random_sparse_vector(max_size: usize) -> Vec<(u32, f32)> { +/// - `sparsity` - how many non-zero values should be in vector +pub fn random_sparse_vector(max_size: usize, sparsity: f64) -> Vec<(u32, f32)> { let mut rng = rand::thread_rng(); let size = rng.gen_range(1..max_size); // (index, value) let mut pairs = Vec::with_capacity(size); for i in 1..=size { // probability of skipping a dimension to make the vectors sparse - let skip = rng.gen_bool(0.1); + let skip = !rng.gen_bool(sparsity); if skip { continue; } diff --git a/src/main.rs b/src/main.rs index 1c21e30..fec06d8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -135,20 +135,15 @@ async fn recreate_collection(args: &Args, stopped: Arc) -> Result<() Config::ParamsMap(VectorParamsMap { map: params }) }; - // TODO enable mix configurations (dense+sparse) - let vectors_config = if args.sparse_vectors { - None - } else { - Some(VectorsConfig { - config: Some(dense_vector_params), - }) - }; + let vectors_config = Some(VectorsConfig { + config: Some(dense_vector_params), + }); - let sparse_vectors_config = if args.sparse_vectors { - let params = (0..args.vectors_per_point) + let sparse_vectors_config = if args.sparse_vectors.is_some() { + let params = (0..args.sparse_vectors_per_point) .map(|idx| { ( - idx.to_string(), + format!("{idx}_sparse").to_string(), SparseVectorParams { index: Some(SparseIndexConfig { full_scan_threshold: None, diff --git a/src/search.rs b/src/search.rs index 4fc9c35..c21d23f 100644 --- a/src/search.rs +++ b/src/search.rs @@ -2,7 +2,9 @@ use crate::common::{random_sparse_vector, random_vector_name, retry_with_clients use crate::{random_dense_vector, random_filter, Args}; use indicatif::ProgressBar; use qdrant_client::client::QdrantClient; -use qdrant_client::qdrant::{QuantizationSearchParams, SearchParams, SearchPoints, Vector}; +use qdrant_client::qdrant::{ + QuantizationSearchParams, SearchParams, SearchPoints, SparseIndices, Vector, +}; use 
std::sync::atomic::AtomicBool; use std::sync::{Arc, Mutex}; @@ -27,6 +29,32 @@ impl SearchProcessor { } } + fn get_sparse_query(&self) -> (Vec<f32>, Option<SparseIndices>, Option<String>) { + if let Some(sparsity) = self.args.sparse_vectors { + let sparse_vector: Vector = random_sparse_vector(self.args.dim, sparsity).into(); + let query_vector = sparse_vector.data; + let sparse_indices = sparse_vector.indices; + let name = format!( + "{}_sparse", + random_vector_name(self.args.sparse_vectors_per_point) + ); + (query_vector, sparse_indices, Some(name)) + } else { + panic!("No sparse vectors configured") + } + } + + fn get_dense_query(&self) -> (Vec<f32>, Option<SparseIndices>, Option<String>) { + let query_vector = random_dense_vector(self.args.dim); + let sparse_indices = None; + if self.args.vectors_per_point > 1 { + let name = random_vector_name(self.args.vectors_per_point); + (query_vector, sparse_indices, Some(name)) + } else { + (query_vector, sparse_indices, None) + } + } + pub async fn search( &self, _req_id: usize, @@ -36,26 +64,30 @@ impl SearchProcessor { return Ok(()); } - let (query_vector, sparse_indices) = if self.args.sparse_vectors { - let sparse_vector: Vector = random_sparse_vector(self.args.dim).into(); - (sparse_vector.data, sparse_vector.indices) + let start = std::time::Instant::now(); + + let has_sparse = self.args.sparse_vectors.is_some(); + let has_dense = self.args.vectors_per_point > 0; + + let use_sparse = match (has_sparse, has_dense) { + (true, true) => rand::random::<bool>(), + (true, false) => true, + (false, true) => false, + (false, false) => panic!("No sparse or dense vectors"), + }; + + let (query_vector, sparse_indices, vector_name) = if use_sparse { + self.get_sparse_query() } else { - (random_dense_vector(self.args.dim), None) + self.get_dense_query() }; + let query_filter = random_filter( self.args.keywords, self.args.float_payloads, self.args.int_payloads, ); - let start = std::time::Instant::now(); - - let vector_name = if self.args.vectors_per_point > 1 || self.args.sparse_vectors { 
- Some(random_vector_name(self.args.vectors_per_point)) - } else { - None - }; - let request = SearchPoints { collection_name: self.args.collection_name.to_string(), vector: query_vector, diff --git a/src/upsert.rs b/src/upsert.rs index 9edf8b7..2bc8d2f 100644 --- a/src/upsert.rs +++ b/src/upsert.rs @@ -1,11 +1,12 @@ -use crate::common::{random_vector, retry_with_clients}; +use crate::common::{random_sparse_vector, random_vector, retry_with_clients}; use crate::fbin_reader::FBinReader; use crate::{random_dense_vector, random_payload, Args}; use anyhow::Error; use indicatif::ProgressBar; use qdrant_client::client::QdrantClient; use qdrant_client::qdrant::point_id::PointIdOptions; -use qdrant_client::qdrant::{PointId, PointStruct, PointsSelector, Vectors}; +use qdrant_client::qdrant::vectors::VectorsOptions; +use qdrant_client::qdrant::{PointId, PointStruct, PointsSelector, Vector, Vectors}; use rand::Rng; use std::cmp::min; use std::collections::HashMap; @@ -71,7 +72,7 @@ impl UpsertProcessor { let vectors: Vectors = if let Some(reader) = &self.reader { reader.read_vector(idx as usize).to_vec().into() - } else if self.args.vectors_per_point > 1 { + } else if self.args.vectors_per_point != 1 { let vectors_map: HashMap<_, _> = (0..self.args.vectors_per_point) .map(|i| { let vector_name = format!("{}", i); @@ -80,16 +81,37 @@ impl UpsertProcessor { }) .collect(); vectors_map.into() - } else if self.args.sparse_vectors { - // single sparse vector must be named - let vectors_map: HashMap<_, _> = vec![("0".to_string(), random_vector(&self.args))] - .into_iter() - .collect(); - vectors_map.into() } else { random_dense_vector(self.args.dim).into() }; + let vectors: Vectors = if let Some(sparsity) = self.args.sparse_vectors { + let mut vectors_map: HashMap<_, _> = Default::default(); + + for i in 0..self.args.sparse_vectors_per_point { + let vector_name = format!("{}_sparse", i); + let vector = Vector::from(random_sparse_vector(self.args.dim, sparsity)); + 
vectors_map.insert(vector_name, vector); + } + + match vectors.vectors_options { + None => {} + Some(vectors) => match vectors { + VectorsOptions::Vector(vector) => { + vectors_map.insert("".to_string(), vector); + } + VectorsOptions::Vectors(vectors) => { + for (name, vector) in vectors.vectors.into_iter() { + vectors_map.insert(name, vector); + } + } + }, + } + vectors_map.into() + } else { + vectors + }; + points.push(PointStruct::new( point_id, vectors, From c7a4c34e81be0d09a903f3a4e63bf046cd11de06 Mon Sep 17 00:00:00 2001 From: KShivendu Date: Thu, 7 Dec 2023 14:05:47 +0530 Subject: [PATCH 10/11] fix: Display --sparse-vector SPARSITY in help to clarify the usage --- src/args.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/args.rs b/src/args.rs index f983dbe..cc92e0a 100644 --- a/src/args.rs +++ b/src/args.rs @@ -223,7 +223,7 @@ pub struct Args { pub indexed_only: Option, /// Whether to use sparse vectors - #[clap(long)] + #[clap(long, value_name = "SPARSITY")] pub sparse_vectors: Option<f64>, /// Number of named vectors per point From ecdfc0de11b2d2ac9f4ff317afb905528be06a3b Mon Sep 17 00:00:00 2001 From: KShivendu Date: Thu, 7 Dec 2023 14:10:05 +0530 Subject: [PATCH 11/11] feat: Improve arg explanation --- src/args.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/args.rs b/src/args.rs index cc92e0a..42ba180 100644 --- a/src/args.rs +++ b/src/args.rs @@ -222,7 +222,7 @@ pub struct Args { #[clap(long)] pub indexed_only: Option, - /// Whether to use sparse vectors + /// Whether to use sparse vectors and with how much sparsity #[clap(long, value_name = "SPARSITY")] pub sparse_vectors: Option<f64>,