Skip to content

Commit 541ae49

Browse files
committed
search/embeddings: speed up similarity search by more than 50%
Exploit normalization of embeddings. Document this assumption and fulfill it in the mock embedding model.
1 parent bf76bb1 commit 541ae49

File tree

8 files changed

+15
-27
lines changed

8 files changed

+15
-27
lines changed

packages/SemanticText.package/OpenAIEmbeddingModel.class/instance/getEmbeddingsForAll.config..st

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
service
22
getEmbeddingsForAll: strings config: aConfigOrNil
3-
"Answer a collection with one embedding for each string. Each embedding vector is a Float32Array of numbers."
3+
"Answer a collection with one embedding for each string. Each embedding vector is a Float32Array of numbers. Each vector is normalized, i.e., has a length very close to 1."
44

55
| embeddingResult |
66
embeddingResult := self

packages/SemanticText.package/OpenAIEmbeddingModel.class/methodProperties.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"getEmbeddingFor:" : "ct 8/20/2023 12:49",
1313
"getEmbeddingFor:config:" : "ct 8/20/2023 13:03",
1414
"getEmbeddingsForAll:" : "ct 8/20/2023 22:55",
15-
"getEmbeddingsForAll:config:" : "ct 8/20/2023 19:43",
15+
"getEmbeddingsForAll:config:" : "ct 8/29/2023 17:16",
1616
"name:" : "ct 8/19/2023 22:15",
1717
"pathToEndpoint" : "ct 8/17/2023 18:07",
1818
"priceFor:" : "ct 8/27/2023 16:55",
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,7 @@
11
private
22
distanceBetween: embedding and: anotherEmbedding
3-
"cosine distance"
3+
"Answer the cosine distance between both embeddings. Both embeddings are assumed to be normalized (length very close to 1); their lengths are not divided out here, so senders must take care not to compare distances between pairs of vectors that have different magnitudes."
44

5-
| abs otherAbs |
65
anotherEmbedding ifNil: [^ Float infinity].
76

8-
abs := embedding squaredLength.
9-
abs = 0 ifTrue: [^ Float infinity].
10-
otherAbs := anotherEmbedding squaredLength.
11-
otherAbs = 0 ifTrue: [^ Float infinity].
12-
^ 1.0 -
13-
(
14-
(embedding dot: anotherEmbedding)
15-
/
16-
(
17-
abs
18-
*
19-
otherAbs
20-
) sqrt
21-
)
7+
^ 1.0 - (embedding dot: anotherEmbedding)

packages/SemanticText.package/SemanticCorpus.class/instance/findAllDocuments.nearEmbedding..st

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
search-embeddings
22
findAllDocuments: number nearEmbedding: embedding
3+
"Note: The ranking of results is invariant to the length of the embedding argument."
34

45
| relatednesses |
56
relatednesses := Array new: self documents size streamContents: [:stream |

packages/SemanticText.package/SemanticCorpus.class/instance/findDocuments.similarTo.collect.thenSelect..st

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ findDocuments: number similarTo: documents collect: collectBlock thenSelect: sel
1010
average := classifiedDocuments first embedding shallowCopy.
1111
2 to: classifiedDocuments size do: [:i |
1212
average += (classifiedDocuments at: i) embedding].
13-
"average /= documents size." "NOT required because cosine distance only depends on vector direction"
13+
"average /= documents size." "NOT required because ranking of results only depends on vector direction"
1414

1515
^ self
1616
findDocuments: number

packages/SemanticText.package/SemanticCorpus.class/methodProperties.json

+3-3
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"addFragmentDocumentsFrom:title:content:as:" : "ct 8/14/2023 16:15",
1313
"addFragmentDocumentsFromTitle:content:collect:" : "ct 8/14/2023 19:47",
1414
"allDocumentsForObject:" : "ct 8/16/2023 20:07",
15-
"distanceBetween:and:" : "ct 8/19/2023 22:09",
15+
"distanceBetween:and:" : "ct 8/29/2023 17:32",
1616
"documentClass" : "ct 8/14/2023 15:23",
1717
"documents" : "ct 7/2/2023 17:41",
1818
"documentsForObject:" : "ct 8/14/2023 16:30",
@@ -25,15 +25,15 @@
2525
"estimateTimeToInitializeEmbeddingsFor:" : "ct 8/20/2023 21:00",
2626
"estimateTokens" : "ct 8/20/2023 20:57",
2727
"estimateTokensFor:" : "ct 8/27/2023 16:57",
28-
"findAllDocuments:nearEmbedding:" : "ct 8/16/2023 19:03",
28+
"findAllDocuments:nearEmbedding:" : "ct 8/29/2023 17:30",
2929
"findClusteredDocuments:nearEmbedding:" : "ct 8/16/2023 19:12",
3030
"findClusters:nearEmbedding:" : "ct 8/19/2023 22:09",
3131
"findDocuments:nearEmbedding:" : "ct 8/16/2023 19:03",
3232
"findDocuments:nearEmbedding:collect:" : "ct 8/17/2023 12:02",
3333
"findDocuments:nearEmbedding:collect:thenSelect:" : "ct 8/19/2023 22:07",
3434
"findDocuments:nearEmbedding:useClusters:" : "ct 8/16/2023 19:03",
3535
"findDocuments:similarTo:collect:" : "ct 8/17/2023 12:03",
36-
"findDocuments:similarTo:collect:thenSelect:" : "ct 8/19/2023 22:02",
36+
"findDocuments:similarTo:collect:thenSelect:" : "ct 8/29/2023 19:51",
3737
"findDocuments:similarToObject:collect:" : "ct 8/16/2023 20:02",
3838
"findDocuments:similarToObjects:collect:" : "ct 8/16/2023 20:03",
3939
"findDocuments:similarToQuery:collect:" : "ct 8/17/2023 20:35",
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
service
22
getEmbeddingsForAll: strings config: aConfigOrNil
3-
"Answer a collection with one embedding for each string. Each embedding vector is an array of numbers, commonly represented as a Float32Array."
3+
"Answer a collection with one embedding for each string. Each embedding vector is an array of numbers, commonly represented as a Float32Array. Each vector is normalized, i.e., has a length very close to 1."
44

55
| config |
66
config := self baseConfig.
77
aConfigOrNil ifNotNil:
88
[config := config updatedWith: aConfigOrNil].
99

1010
^ strings collect: [:string |
11-
| words |
11+
| words vector |
1212
words := string substrings collect: [:word | word asLowercaseAlphabetic] as: Bag.
13-
self keywords
13+
vector := self keywords
1414
collect: [:keyword | (words occurrencesOf: keyword) / words size]
15-
as: Float32Array]
15+
as: Float32Array.
16+
vector /= vector length]

packages/SemanticText.package/SemanticMockEmbeddingModel.class/methodProperties.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
"getEmbeddingFor:" : "ct 8/20/2023 12:48",
99
"getEmbeddingFor:config:" : "ct 8/20/2023 12:52",
1010
"getEmbeddingsForAll:" : "ct 8/20/2023 12:47",
11-
"getEmbeddingsForAll:config:" : "ct 8/20/2023 13:03",
11+
"getEmbeddingsForAll:config:" : "ct 8/29/2023 17:16",
1212
"keywords" : "ct 8/17/2023 20:34",
1313
"truncateString:minusString:minusWords:to:" : "ct 8/20/2023 13:11",
1414
"truncateString:to:" : "ct 8/20/2023 13:11" } }

0 commit comments

Comments
 (0)