Skip to content

Commit 82b019b

Browse files
authored
Merge branch 'main' into fixup-github-lint
2 parents 5e89e85 + 66d7710 commit 82b019b

File tree

8 files changed

+1171
-0
lines changed

8 files changed

+1171
-0
lines changed

go.mod

+1
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ require (
158158
gitlab.com/golang-commonmark/linkify v0.0.0-20191026162114-a0c2df6c8f82 // indirect
159159
gitlab.com/golang-commonmark/mdurl v0.0.0-20191124015652-932350d1cb84 // indirect
160160
gitlab.com/golang-commonmark/puny v0.0.0-20191124015043-9f83538fa04f // indirect
161+
go.mongodb.org/mongo-driver/v2 v2.0.0-beta1 // indirect
161162
go.opencensus.io v0.24.0 // indirect
162163
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 // indirect
163164
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 // indirect

go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,8 @@ go.mongodb.org/mongo-driver v1.7.5/go.mod h1:VXEWRZ6URJIkUq2SCAyapmhH0ZLRBP+FT4x
783783
go.mongodb.org/mongo-driver v1.10.0/go.mod h1:wsihk0Kdgv8Kqu1Anit4sfK+22vSFbUrAVEYRhCXrA8=
784784
go.mongodb.org/mongo-driver v1.14.0 h1:P98w8egYRjYe3XDjxhYJagTokP/H6HzlsnojRgZRd80=
785785
go.mongodb.org/mongo-driver v1.14.0/go.mod h1:Vzb0Mk/pa7e6cWw85R4F/endUC3u0U9jGcNU603k65c=
786+
go.mongodb.org/mongo-driver/v2 v2.0.0-beta1 h1:vwKMYa9FCX1OW7efPaH0FUaD6o+WC0kiC7VtHtNX7UU=
787+
go.mongodb.org/mongo-driver/v2 v2.0.0-beta1/go.mod h1:pfndQmffp38kKjbwVfoavadsdC0Nsg/qb+INK01PNaM=
786788
go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
787789
go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
788790
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 h1:A3SayB3rNyt+1S6qpI9mHPkeHTZbD7XILEqWnYZb2l0=

vectorstores/mongovector/doc.go

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
// Package mongovector implements a vector store using MongoDB as the backend.
2+
//
3+
// The mongovector package provides a way to store and retrieve document embeddings
4+
// using MongoDB's vector search capabilities. It implements the VectorStore
5+
// interface from the vectorstores package, allowing it to be used interchangeably
6+
// with other vector store implementations.
7+
//
8+
// Key features:
9+
// - Store document embeddings in MongoDB
10+
// - Perform similarity searches on stored embeddings
11+
// - Configurable index and path settings
12+
// - Support for custom embedding functions
13+
//
14+
// Main types:
15+
// - Store: The main type that implements the VectorStore interface
16+
// - Option: A function type for configuring the Store
17+
//
18+
// Usage:
19+
//
20+
// import (
21+
// "github.com/tmc/langchaingo/vectorstores/mongovector"
22+
// "go.mongodb.org/mongo-driver/mongo"
23+
// )
24+
//
25+
// // Create a new Store
26+
// coll := // ... obtain a *mongo.Collection
27+
// embedder := // ... obtain an embeddings.Embedder
28+
// store := mongovector.New(coll, embedder)
29+
//
30+
// // Add documents
31+
// docs := []schema.Document{
32+
// {PageContent: "Document 1"},
33+
// {PageContent: "Document 2"},
34+
// }
35+
// ids, err := store.AddDocuments(context.Background(), docs)
36+
//
37+
// // Perform similarity search
38+
// results, err := store.SimilaritySearch(context.Background(), "query", 5)
39+
//
40+
// The package also provides options for customizing the Store:
41+
// - WithIndex: Set a custom index name
42+
// - WithPath: Set a custom path for the vector field
43+
// - WithNumCandidates: Set the number of candidates for similarity search
44+
//
45+
// For more detailed information, see the documentation for individual types and functions.
46+
package mongovector
+207
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
package mongovector
2+
3+
import (
4+
"context"
5+
"crypto/rand"
6+
"fmt"
7+
"math/big"
8+
"time"
9+
10+
"github.com/tmc/langchaingo/embeddings"
11+
"github.com/tmc/langchaingo/schema"
12+
"github.com/tmc/langchaingo/vectorstores"
13+
)
14+
15+
type mockEmbedder struct {
16+
queryVector []float32
17+
docs map[string]schema.Document
18+
docVectors map[string][]float32
19+
}
20+
21+
var _ embeddings.Embedder = &mockEmbedder{}
22+
23+
func newMockEmbedder(dim int) *mockEmbedder {
24+
emb := &mockEmbedder{
25+
queryVector: newNormalizedVector(dim),
26+
docs: make(map[string]schema.Document),
27+
docVectors: make(map[string][]float32),
28+
}
29+
30+
return emb
31+
}
32+
33+
// mockDocuments will add the given documents to the embedder, assigning each
34+
// a vector such that similarity score = 0.5 * ( 1 + vector * queryVector).
35+
func (emb *mockEmbedder) mockDocuments(doc ...schema.Document) {
36+
for _, d := range doc {
37+
emb.docs[d.PageContent] = d
38+
}
39+
}
40+
41+
// existingVectors returns all the vectors that have been added to the embedder.
42+
// The query vector is included in the list to maintain orthogonality.
43+
func (emb *mockEmbedder) existingVectors() [][]float32 {
44+
vectors := make([][]float32, 0, len(emb.docs)+1)
45+
for _, vec := range emb.docVectors {
46+
vectors = append(vectors, vec)
47+
}
48+
49+
return append(vectors, emb.queryVector)
50+
}
51+
52+
// EmbedDocuments will return the embedded vectors for the given texts. If the
53+
// text does not exist in the document set, a zero vector will be returned.
54+
func (emb *mockEmbedder) EmbedDocuments(_ context.Context, texts []string) ([][]float32, error) {
55+
vectors := make([][]float32, len(texts))
56+
for i := range vectors {
57+
// If the text does not exist in the document set, return a zero vector.
58+
doc, ok := emb.docs[texts[i]]
59+
if !ok {
60+
vectors[i] = make([]float32, len(emb.queryVector))
61+
}
62+
63+
// If the vector exists, use it.
64+
existing, ok := emb.docVectors[texts[i]]
65+
if ok {
66+
vectors[i] = existing
67+
68+
continue
69+
}
70+
71+
// If it does not exist, make a linearly independent vector.
72+
newVectorBasis := newOrthogonalVector(len(emb.queryVector), emb.existingVectors()...)
73+
74+
// Update the newVector to be scaled by the score.
75+
newVector := dotProductNormFn(doc.Score, emb.queryVector, newVectorBasis)
76+
77+
vectors[i] = newVector
78+
emb.docVectors[texts[i]] = newVector
79+
}
80+
81+
return vectors, nil
82+
}
83+
84+
// EmbedQuery returns the query vector.
85+
func (emb *mockEmbedder) EmbedQuery(context.Context, string) ([]float32, error) {
86+
return emb.queryVector, nil
87+
}
88+
89+
// Insert all of the mock documents collected by the embedder.
90+
func flushMockDocuments(ctx context.Context, store Store, emb *mockEmbedder) error {
91+
docs := make([]schema.Document, 0, len(emb.docs))
92+
for _, doc := range emb.docs {
93+
docs = append(docs, doc)
94+
}
95+
96+
_, err := store.AddDocuments(ctx, docs, vectorstores.WithEmbedder(emb))
97+
if err != nil {
98+
return err
99+
}
100+
101+
// Consistency on indexes is not synchronous.
102+
// nolint:mnd
103+
time.Sleep(10 * time.Second)
104+
105+
return nil
106+
}
107+
108+
// newNormalizedFloat32 generates a random float32 in the half-open interval
// [-1, 1) using crypto/rand.
//
// A uniform integer in [0, 2^24) is drawn and mapped linearly onto the
// interval. 2^24 is used because every integer below it is exactly
// representable as a float32, so the mapping introduces no rounding.
//
// On failure of the random source it returns 0 and an error wrapping the
// underlying cause.
// nolint:mnd
func newNormalizedFloat32() (float32, error) {
	const scale = 1 << 24

	n, err := rand.Int(rand.Reader, big.NewInt(scale))
	if err != nil {
		return 0.0, fmt.Errorf("failed to normalize float32: %w", err)
	}

	return 2.0*(float32(n.Int64())/float32(scale)) - 1.0, nil
}
120+
121+
// dotProduct computes the dot product of v1 and v2, accumulating in float32
// from the first component to the last. Only the first len(v1) components of
// v2 are read, so v2 must be at least as long as v1.
func dotProduct(v1, v2 []float32) float32 {
	var total float32

	for i, a := range v1 {
		total += a * v2[i]
	}

	return total
}
131+
132+
// linearlyIndependent reports whether v2 is NOT a scalar multiple of v1.
//
// It walks the components of v1 and compares the ratio v2[i]/v1[i] across
// every index where v1[i] is non-zero; any mismatch means the vectors are
// independent. At an index where v1[i] is zero, a non-zero v2[i] also means
// independent. Ratios are compared with exact float equality.
//
// v2 must be at least as long as v1. The check is intended for a non-zero
// v1; an all-zero v1 paired with a non-zero v2 is reported as independent.
func linearlyIndependent(v1, v2 []float32) bool {
	var (
		ratio    float32
		hasRatio bool
	)

	for i := range v1 {
		if v1[i] == 0 {
			// v1 contributes nothing here, so no multiple of it can match a
			// non-zero component of v2.
			if v2[i] != 0 {
				return true
			}

			continue
		}

		r := v2[i] / v1[i]

		// Remember the first ratio with an explicit flag. Using ratio == 0
		// as the "unset" marker would let a genuine ratio of zero be
		// silently replaced by a later one.
		if !hasRatio {
			ratio, hasRatio = r, true

			continue
		}

		if r != ratio {
			return true
		}
	}

	return false
}
160+
161+
// Create a vector of values between [-1, 1] of the specified size.
162+
func newNormalizedVector(dim int) []float32 {
163+
vector := make([]float32, dim)
164+
for i := range vector {
165+
vector[i], _ = newNormalizedFloat32()
166+
}
167+
168+
return vector
169+
}
170+
171+
// Use Gram Schmidt to return a vector orthogonal to the basis, so long as
172+
// the vectors in the basis are linearly independent.
173+
func newOrthogonalVector(dim int, basis ...[]float32) []float32 {
174+
candidate := newNormalizedVector(dim)
175+
176+
for _, b := range basis {
177+
dp := dotProduct(candidate, b)
178+
basisNorm := dotProduct(b, b)
179+
180+
for i := range candidate {
181+
candidate[i] -= (dp / basisNorm) * b[i]
182+
}
183+
}
184+
185+
return candidate
186+
}
187+
188+
// return a new vector such that v1 * v2 = 2S - 1.
189+
func dotProductNormFn(score float32, qvector, basis []float32) []float32 {
190+
var sum float32
191+
192+
// Populate v2 upto dim-1.
193+
for i := range qvector[:len(qvector)-1] {
194+
sum += qvector[i] * basis[i]
195+
}
196+
197+
// Calculate v_{2, dim} such that v1 * v2 = 2S - 1:
198+
basis[len(basis)-1] = (2*score - 1 - sum) / qvector[len(qvector)-1]
199+
200+
// If the vectors are linearly independent, regenerate the dim-1 elements
201+
// of v2.
202+
if !linearlyIndependent(qvector, basis) {
203+
return dotProductNormFn(score, qvector, basis)
204+
}
205+
206+
return basis
207+
}

vectorstores/mongovector/mock_llm.go

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
package mongovector
2+
3+
import (
4+
"context"
5+
6+
"github.com/tmc/langchaingo/embeddings"
7+
)
8+
9+
// mockLLM is a stand-in embedding client for tests. It produces vectors of a
// fixed dimension and answers repeated requests for the same text with the
// same vector, mocking the OpenAI text-embedding-3-small algorithm.
type mockLLM struct {
	// seen caches the vector previously produced for each text.
	seen map[string][]float32
	// dim is the number of components in every generated vector.
	dim int
}
15+
16+
var _ embeddings.EmbedderClient = &mockLLM{}
17+
18+
// createEmbedding will return vector embeddings for the mock LLM, maintaining
19+
// consistency.
20+
func (emb *mockLLM) CreateEmbedding(_ context.Context, texts []string) ([][]float32, error) {
21+
if emb.seen == nil {
22+
emb.seen = map[string][]float32{}
23+
}
24+
25+
vectors := make([][]float32, len(texts))
26+
for i, text := range texts {
27+
if f32s := emb.seen[text]; len(f32s) > 0 {
28+
vectors[i] = f32s
29+
30+
continue
31+
}
32+
33+
vectors[i] = newNormalizedVector(emb.dim)
34+
emb.seen[text] = vectors[i] // ensure consistency
35+
}
36+
37+
return vectors, nil
38+
}

0 commit comments

Comments
 (0)