Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Text search API and VectorStoreTextSearch implementation #249

Merged
merged 2 commits into from
Oct 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -416,25 +416,25 @@ public void exactSearch(QueryProvider provider, String embeddingName) {

VectorSearchOptions options = VectorSearchOptions.builder()
.withVectorFieldName(embeddingName)
.withLimit(3)
.withTop(3)
.build();

// Embeddings similar to the third hotel
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS,
options).block();
options).block().getResults();
assertNotNull(results);
assertEquals(3, results.size());
// The third hotel should be the most similar
assertEquals(hotels.get(2).getId(), results.get(0).getRecord().getId());

options = VectorSearchOptions.builder()
.withVectorFieldName(embeddingName)
.withOffset(1)
.withLimit(-100)
.withSkip(1)
.withTop(-100)
.build();

// Skip the first result
results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, options).block();
results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, options).block().getResults();
assertNotNull(results);
assertEquals(1, results.size());
// The first hotel should be the most similar
Expand All @@ -453,12 +453,12 @@ public void approximateSearch(QueryProvider provider) {

VectorSearchOptions options = VectorSearchOptions.builder()
.withVectorFieldName("indexedEuclidean")
.withLimit(5)
.withTop(5)
.build();

// Embeddings similar to the third hotel
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS,
options).block();
options).block().getResults();
assertNotNull(results);
assertEquals(5, results.size());
// The third hotel should be the most similar
Expand All @@ -477,15 +477,15 @@ public void searchWithFilterEqualToFilter(QueryProvider provider, String embeddi

VectorSearchOptions options = VectorSearchOptions.builder()
.withVectorFieldName(embeddingName)
.withLimit(3)
.withTop(3)
.withVectorSearchFilter(
VectorSearchFilter.builder()
.equalTo("rating", 4.0).build())
.build();

// Embeddings similar to the third hotel, but as the filter is set to 4.0, the third hotel should not be returned
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS,
options).block();
options).block().getResults();
assertNotNull(results);
assertEquals(3, results.size());
// The first hotel should be the most similar
Expand All @@ -504,15 +504,15 @@ public void searchWithAnyTagEqualToFilter(QueryProvider provider, String embeddi

VectorSearchOptions options = VectorSearchOptions.builder()
.withVectorFieldName(embeddingName)
.withLimit(3)
.withTop(3)
.withVectorSearchFilter(
VectorSearchFilter.builder()
.anyTagEqualTo("tags", "city").build())
.build();

// Embeddings similar to the third hotel, but the filter only keeps hotels whose "tags" contain "city"
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS,
options).block();
options).block().getResults();
assertNotNull(results);
assertEquals(3, results.size());
// The first hotel should be the most similar
Expand All @@ -530,7 +530,7 @@ public void postgresSearchIncludeAndNotIncludeVectors() {
recordCollection.upsertBatchAsync(hotels, null).block();

List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS,
null).block();
null).block().getResults();
assertNotNull(results);
assertEquals(3, results.size());
// The third hotel should be the most similar
Expand All @@ -541,7 +541,7 @@ public void postgresSearchIncludeAndNotIncludeVectors() {
.withIncludeVectors(true)
.build();

results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, options).block();
results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, options).block().getResults();
assertNotNull(results);
assertEquals(3, results.size());
// The third hotel should be the most similar
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -387,9 +387,9 @@ public void search(RecordCollectionOptions options, String embeddingName) {
.build();

// Embeddings similar to the third hotel
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, searchOptions).block();
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, searchOptions).block().getResults();
assertNotNull(results);
assertEquals(VectorSearchOptions.DEFAULT_RESULT_LIMIT, results.size(), indexingFailureMessage);
assertEquals(VectorSearchOptions.DEFAULT_TOP, results.size(), indexingFailureMessage);
// The third hotel should be the most similar
assertEquals(hotels.get(2).getId(), results.get(0).getRecord().getId(), indexingFailureMessage);
// Score should be non-zero
Expand All @@ -412,9 +412,9 @@ public void searchWithVectors(RecordCollectionOptions options, String embeddingN
.build();

// Embeddings similar to the third hotel
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, searchOptions).block();
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, searchOptions).block().getResults();
assertNotNull(results);
assertEquals(VectorSearchOptions.DEFAULT_RESULT_LIMIT, results.size(), indexingFailureMessage);
assertEquals(VectorSearchOptions.DEFAULT_TOP, results.size(), indexingFailureMessage);
// The third hotel should be the most similar
assertEquals(hotels.get(2).getId(), results.get(0).getRecord().getId(), indexingFailureMessage);
assertNotNull(results.get(0).getRecord().getEuclidean());
Expand All @@ -431,12 +431,12 @@ public void searchWithOffSet(RecordCollectionOptions options, String embeddingNa

VectorSearchOptions searchOptions = VectorSearchOptions.builder()
.withVectorFieldName(embeddingName)
.withOffset(1)
.withLimit(4)
.withSkip(1)
.withTop(4)
.build();

// Embeddings similar to the third hotel
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, searchOptions).block();
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, searchOptions).block().getResults();
assertNotNull(results);
assertEquals(4, results.size(), indexingFailureMessage);
// The first hotel should be the most similar
Expand All @@ -454,15 +454,15 @@ public void searchWithFilterEqualToFilter(RecordCollectionOptions recordCollecti

VectorSearchOptions options = VectorSearchOptions.builder()
.withVectorFieldName(embeddingName)
.withLimit(3)
.withTop(3)
.withVectorSearchFilter(
VectorSearchFilter.builder()
.equalTo("rating", 4.0).build())
.build();

// Embeddings similar to the third hotel, but as the filter is set to 4.0, the third hotel should not be returned
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS,
options).block();
options).block().getResults();
assertNotNull(results);
assertEquals(3, results.size());
// The first hotel should be the most similar
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -387,9 +387,9 @@ public void search(RecordCollectionOptions options, String embeddingName) {
.build();

// Embeddings similar to the third hotel
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, searchOptions).block();
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, searchOptions).block().getResults();
assertNotNull(results);
assertEquals(VectorSearchOptions.DEFAULT_RESULT_LIMIT, results.size(), indexingFailureMessage);
assertEquals(VectorSearchOptions.DEFAULT_TOP, results.size(), indexingFailureMessage);
// The third hotel should be the most similar
assertEquals(hotels.get(2).getId(), results.get(0).getRecord().getId(), indexingFailureMessage);
// Score should be non-zero
Expand All @@ -412,9 +412,9 @@ public void searchWithVectors(RecordCollectionOptions options, String embeddingN
.build();

// Embeddings similar to the third hotel
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, searchOptions).block();
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, searchOptions).block().getResults();
assertNotNull(results);
assertEquals(VectorSearchOptions.DEFAULT_RESULT_LIMIT, results.size(), indexingFailureMessage);
assertEquals(VectorSearchOptions.DEFAULT_TOP, results.size(), indexingFailureMessage);
// The third hotel should be the most similar
assertEquals(hotels.get(2).getId(), results.get(0).getRecord().getId(), indexingFailureMessage);
assertNotNull(results.get(0).getRecord().getEuclidean());
Expand All @@ -431,12 +431,12 @@ public void searchWithOffSet(RecordCollectionOptions options, String embeddingNa

VectorSearchOptions searchOptions = VectorSearchOptions.builder()
.withVectorFieldName(embeddingName)
.withOffset(1)
.withLimit(4)
.withSkip(1)
.withTop(4)
.build();

// Embeddings similar to the third hotel
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, searchOptions).block();
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS, searchOptions).block().getResults();
assertNotNull(results);
assertEquals(4, results.size(), indexingFailureMessage);
// The first hotel should be the most similar
Expand All @@ -454,15 +454,15 @@ public void searchWithFilterEqualToFilter(RecordCollectionOptions recordCollecti

VectorSearchOptions options = VectorSearchOptions.builder()
.withVectorFieldName(embeddingName)
.withLimit(3)
.withTop(3)
.withVectorSearchFilter(
VectorSearchFilter.builder()
.equalTo("rating", 4.0).build())
.build();

// Embeddings similar to the third hotel, but as the filter is set to 4.0, the third hotel should not be returned
List<VectorSearchResult<Hotel>> results = recordCollection.searchAsync(SEARCH_EMBEDDINGS,
options).block();
options).block().getResults();
assertNotNull(results);
assertEquals(3, results.size());
// The first hotel should be the most similar
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,13 @@
import com.azure.core.credential.KeyCredential;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.microsoft.semantickernel.aiservices.openai.textembedding.OpenAITextEmbeddingGenerationService;
import com.microsoft.semantickernel.data.textsearch.TextSearchResultValue;
import com.microsoft.semantickernel.data.vectorsearch.VectorSearchResult;
import com.microsoft.semantickernel.data.vectorstorage.VectorStoreRecordCollection;
import com.microsoft.semantickernel.data.VolatileVectorStore;
import com.microsoft.semantickernel.data.VolatileVectorStoreRecordCollectionOptions;
import com.microsoft.semantickernel.data.vectorstorage.VectorStoreTextSearch;
import com.microsoft.semantickernel.data.vectorstorage.VectorStoreTextSearchOptions;
import com.microsoft.semantickernel.data.vectorstorage.annotations.VectorStoreRecordData;
import com.microsoft.semantickernel.data.vectorstorage.annotations.VectorStoreRecordKey;
import com.microsoft.semantickernel.data.vectorstorage.annotations.VectorStoreRecordVector;
Expand Down Expand Up @@ -38,12 +41,12 @@ public class InMemoryVolatileVectorStore {
private static final int EMBEDDING_DIMENSIONS = 1536;

static class GitHubFile {
@JsonProperty("fileId") // Set a different name for the storage field if needed
@VectorStoreRecordKey()
@VectorStoreRecordKey
private final String id;
@VectorStoreRecordData()
@VectorStoreRecordData
private final String description;
@VectorStoreRecordData
@TextSearchResultValue
private final String link;
@VectorStoreRecordVector(dimensions = EMBEDDING_DIMENSIONS, indexKind = IndexKind.HNSW, distanceFunction = DistanceFunction.COSINE_DISTANCE)
private final List<Float> embedding;
Expand Down Expand Up @@ -125,27 +128,24 @@ public static void inMemoryStoreAndSearch(
.then(storeData(collection, embeddingGeneration, sampleData()))
.block();

// Build a text search on top of the vector store collection, using the embedding service to vectorize queries
var vectorStoreTextSearch = VectorStoreTextSearch.<GitHubFile>builder()
.withVectorizedSearch(collection)
.withTextEmbeddingGenerationService(embeddingGeneration)
.build();

// Search for results
// Volatile store executes an exhaustive search, for approximate search use Azure AI Search, Redis or JDBC with PostgreSQL
var results = search("How to get started", collection, embeddingGeneration).block();
String query = "How to get started?";
var results = vectorStoreTextSearch.searchAsync(query, null)
.block();

if (results == null || results.isEmpty()) {
if (results == null || results.getTotalCount() == 0) {
System.out.println("No search results found.");
return;
}
var searchResult = results.get(0);
System.out.printf("Search result with score: %f.%n Link: %s, Description: %s%n",
searchResult.getScore(), searchResult.getRecord().link,
searchResult.getRecord().description);
}

private static Mono<List<VectorSearchResult<GitHubFile>>> search(
String searchText,
VectorStoreRecordCollection<String, GitHubFile> recordCollection,
OpenAITextEmbeddingGenerationService embeddingGeneration) {
// Generate embeddings for the search text and search for the closest records
return embeddingGeneration.generateEmbeddingsAsync(Collections.singletonList(searchText))
.flatMap(r -> recordCollection.searchAsync(r.get(0).getVector(), null));
System.out.printf("Best result for '%s': %s%n", query, results.getResults().get(0));
}

private static Mono<List<String>> storeData(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@
import com.microsoft.semantickernel.connectors.data.azureaisearch.AzureAISearchVectorStore;
import com.microsoft.semantickernel.connectors.data.azureaisearch.AzureAISearchVectorStoreOptions;
import com.microsoft.semantickernel.connectors.data.azureaisearch.AzureAISearchVectorStoreRecordCollectionOptions;
import com.microsoft.semantickernel.data.textsearch.TextSearchResultValue;
import com.microsoft.semantickernel.data.vectorsearch.VectorSearchResult;
import com.microsoft.semantickernel.data.vectorsearch.VectorizedSearch;
import com.microsoft.semantickernel.data.vectorstorage.VectorStoreRecordCollection;
import com.microsoft.semantickernel.data.vectorstorage.VectorStoreTextSearch;
import com.microsoft.semantickernel.data.vectorstorage.annotations.VectorStoreRecordData;
import com.microsoft.semantickernel.data.vectorstorage.annotations.VectorStoreRecordKey;
import com.microsoft.semantickernel.data.vectorstorage.annotations.VectorStoreRecordVector;
Expand Down Expand Up @@ -51,13 +54,12 @@ public class VectorStoreWithAzureAISearch {
private static final int EMBEDDING_DIMENSIONS = 1536;

static class GitHubFile {

@JsonProperty("fileId") // Set a different name for the storage field if needed
@VectorStoreRecordKey()
private final String id;
@VectorStoreRecordData()
private final String description;
@VectorStoreRecordData
@TextSearchResultValue
private final String link;
@VectorStoreRecordVector(dimensions = EMBEDDING_DIMENSIONS, indexKind = IndexKind.HNSW, distanceFunction = DistanceFunction.COSINE_SIMILARITY)
private final List<Float> embedding;
Expand Down Expand Up @@ -111,7 +113,6 @@ public static void main(String[] args) {
var searchClient = new SearchIndexClientBuilder()
.endpoint(AZURE_AI_SEARCH_ENDPOINT)
.credential(new AzureKeyCredential(AZURE_AISEARCH_KEY))
.clientOptions(clientOptions())
.buildAsyncClient();

storeAndSearch(searchClient, embeddingGeneration);
Expand Down Expand Up @@ -141,27 +142,24 @@ public static void storeAndSearch(
.then(storeData(collection, embeddingGeneration, sampleData()))
.block();

// Build a text search on top of the vector store collection, using the embedding service to vectorize queries
var vectorStoreTextSearch = VectorStoreTextSearch.<GitHubFile>builder()
.withVectorizedSearch(collection)
.withTextEmbeddingGenerationService(embeddingGeneration)
.build();

// Search for results
// Might need to wait for the data to be indexed
var results = search("How to get started", collection, embeddingGeneration).block();
String query = "How to get started?";
var results = vectorStoreTextSearch.searchAsync(query, null)
.block();

if (results == null || results.isEmpty()) {
if (results == null || results.getTotalCount() == 0) {
System.out.println("No search results found.");
return;
}
var searchResult = results.get(0);
System.out.printf("Search result with score: %f.%n Link: %s, Description: %s%n",
searchResult.getScore(), searchResult.getRecord().link,
searchResult.getRecord().description);
}

private static Mono<List<VectorSearchResult<GitHubFile>>> search(
String searchText,
VectorStoreRecordCollection<String, GitHubFile> recordCollection,
OpenAITextEmbeddingGenerationService embeddingGeneration) {
// Generate embeddings for the search text and search for the closest records
return embeddingGeneration.generateEmbeddingAsync(searchText)
.flatMap(r -> recordCollection.searchAsync(r.getVector(), null));
System.out.printf("Best result for '%s': %s%n", query, results.getResults().get(0));
}

private static Mono<List<String>> storeData(
Expand Down Expand Up @@ -204,11 +202,4 @@ private static Map<String, String> sampleData() {
"README: README associated with a sample chat summary react-based webapp" },
}).collect(Collectors.toMap(element -> element[0], element -> element[1]));
}

/**
 * Builds the {@link ClientOptions} for this sample.
 *
 * @return client options carrying freshly constructed tracing and metrics
 *         options and the application id {@code "Semantic-Kernel"}
 */
private static ClientOptions clientOptions() {
    ClientOptions options = new ClientOptions();
    options.setTracingOptions(new TracingOptions());
    options.setMetricsOptions(new MetricsOptions());
    options.setApplicationId("Semantic-Kernel");
    return options;
}
}
Loading