|
| 1 | +package mongovector |
| 2 | + |
| 3 | +import ( |
| 4 | + "context" |
| 5 | + "crypto/rand" |
| 6 | + "fmt" |
| 7 | + "math/big" |
| 8 | + "time" |
| 9 | + |
| 10 | + "github.com/tmc/langchaingo/embeddings" |
| 11 | + "github.com/tmc/langchaingo/schema" |
| 12 | + "github.com/tmc/langchaingo/vectorstores" |
| 13 | +) |
| 14 | + |
| 15 | +type mockEmbedder struct { |
| 16 | + queryVector []float32 |
| 17 | + docs map[string]schema.Document |
| 18 | + docVectors map[string][]float32 |
| 19 | +} |
| 20 | + |
| 21 | +var _ embeddings.Embedder = &mockEmbedder{} |
| 22 | + |
| 23 | +func newMockEmbedder(dim int) *mockEmbedder { |
| 24 | + emb := &mockEmbedder{ |
| 25 | + queryVector: newNormalizedVector(dim), |
| 26 | + docs: make(map[string]schema.Document), |
| 27 | + docVectors: make(map[string][]float32), |
| 28 | + } |
| 29 | + |
| 30 | + return emb |
| 31 | +} |
| 32 | + |
| 33 | +// mockDocuments will add the given documents to the embedder, assigning each |
| 34 | +// a vector such that similarity score = 0.5 * ( 1 + vector * queryVector). |
| 35 | +func (emb *mockEmbedder) mockDocuments(doc ...schema.Document) { |
| 36 | + for _, d := range doc { |
| 37 | + emb.docs[d.PageContent] = d |
| 38 | + } |
| 39 | +} |
| 40 | + |
| 41 | +// existingVectors returns all the vectors that have been added to the embedder. |
| 42 | +// The query vector is included in the list to maintain orthogonality. |
| 43 | +func (emb *mockEmbedder) existingVectors() [][]float32 { |
| 44 | + vectors := make([][]float32, 0, len(emb.docs)+1) |
| 45 | + for _, vec := range emb.docVectors { |
| 46 | + vectors = append(vectors, vec) |
| 47 | + } |
| 48 | + |
| 49 | + return append(vectors, emb.queryVector) |
| 50 | +} |
| 51 | + |
| 52 | +// EmbedDocuments will return the embedded vectors for the given texts. If the |
| 53 | +// text does not exist in the document set, a zero vector will be returned. |
| 54 | +func (emb *mockEmbedder) EmbedDocuments(_ context.Context, texts []string) ([][]float32, error) { |
| 55 | + vectors := make([][]float32, len(texts)) |
| 56 | + for i := range vectors { |
| 57 | + // If the text does not exist in the document set, return a zero vector. |
| 58 | + doc, ok := emb.docs[texts[i]] |
| 59 | + if !ok { |
| 60 | + vectors[i] = make([]float32, len(emb.queryVector)) |
| 61 | + } |
| 62 | + |
| 63 | + // If the vector exists, use it. |
| 64 | + existing, ok := emb.docVectors[texts[i]] |
| 65 | + if ok { |
| 66 | + vectors[i] = existing |
| 67 | + |
| 68 | + continue |
| 69 | + } |
| 70 | + |
| 71 | + // If it does not exist, make a linearly independent vector. |
| 72 | + newVectorBasis := newOrthogonalVector(len(emb.queryVector), emb.existingVectors()...) |
| 73 | + |
| 74 | + // Update the newVector to be scaled by the score. |
| 75 | + newVector := dotProductNormFn(doc.Score, emb.queryVector, newVectorBasis) |
| 76 | + |
| 77 | + vectors[i] = newVector |
| 78 | + emb.docVectors[texts[i]] = newVector |
| 79 | + } |
| 80 | + |
| 81 | + return vectors, nil |
| 82 | +} |
| 83 | + |
| 84 | +// EmbedQuery returns the query vector. |
| 85 | +func (emb *mockEmbedder) EmbedQuery(context.Context, string) ([]float32, error) { |
| 86 | + return emb.queryVector, nil |
| 87 | +} |
| 88 | + |
| 89 | +// Insert all of the mock documents collected by the embedder. |
| 90 | +func flushMockDocuments(ctx context.Context, store Store, emb *mockEmbedder) error { |
| 91 | + docs := make([]schema.Document, 0, len(emb.docs)) |
| 92 | + for _, doc := range emb.docs { |
| 93 | + docs = append(docs, doc) |
| 94 | + } |
| 95 | + |
| 96 | + _, err := store.AddDocuments(ctx, docs, vectorstores.WithEmbedder(emb)) |
| 97 | + if err != nil { |
| 98 | + return err |
| 99 | + } |
| 100 | + |
| 101 | + // Consistency on indexes is not synchronous. |
| 102 | + // nolint:mnd |
| 103 | + time.Sleep(10 * time.Second) |
| 104 | + |
| 105 | + return nil |
| 106 | +} |
| 107 | + |
// newNormalizedFloat32 generates a random float32 in the half-open interval
// [-1, 1).
//
// It draws a uniform integer in [0, 2^24) from crypto/rand and maps it
// linearly onto the interval; 2^24 is used because every integer below it is
// exactly representable as a float32. If the random source fails, 0 is
// returned together with an error that wraps the underlying cause.
// nolint:mnd
func newNormalizedFloat32() (float32, error) {
	const resolution = 1 << 24

	n, err := rand.Int(rand.Reader, big.NewInt(resolution))
	if err != nil {
		return 0.0, fmt.Errorf("failed to normalize float32: %w", err)
	}

	return 2.0*(float32(n.Int64())/float32(resolution)) - 1.0, nil
}
| 120 | + |
// dotProduct computes the sum of the element-wise products of v1 and v2,
// walking the indices of v1. v2 must be at least as long as v1.
func dotProduct(v1, v2 []float32) float32 {
	total := float32(0)

	for idx, x := range v1 {
		total += x * v2[idx]
	}

	return total
}
| 131 | + |
// linearlyIndependent reports whether v2 is NOT a scalar multiple of v1,
// walking the indices of v1 (v2 must be at least as long).
//
// For every index where v1 is non-zero the ratio v2[i]/v1[i] must be the same;
// for every index where v1 is zero, v2 must be zero as well. Any violation
// means the vectors are independent. Comparisons are exact float32
// comparisons, with no tolerance.
//
// Note that an all-zero v1 paired with a non-zero v2 is reported as
// independent, because no ratio can be formed.
func linearlyIndependent(v1, v2 []float32) bool {
	var (
		ratio     float32
		haveRatio bool
	)

	for i := range v1 {
		if v1[i] == 0 {
			// v1 has no component here, so any component in v2 cannot be
			// explained by scaling v1.
			if v2[i] != 0 {
				return true
			}

			continue
		}

		r := v2[i] / v1[i]

		// Track "ratio seen" explicitly: zero is a legitimate ratio and must
		// not be mistaken for "not yet set".
		if !haveRatio {
			ratio, haveRatio = r, true

			continue
		}

		if r != ratio {
			return true
		}
	}

	return false
}
| 160 | + |
| 161 | +// Create a vector of values between [-1, 1] of the specified size. |
| 162 | +func newNormalizedVector(dim int) []float32 { |
| 163 | + vector := make([]float32, dim) |
| 164 | + for i := range vector { |
| 165 | + vector[i], _ = newNormalizedFloat32() |
| 166 | + } |
| 167 | + |
| 168 | + return vector |
| 169 | +} |
| 170 | + |
| 171 | +// Use Gram Schmidt to return a vector orthogonal to the basis, so long as |
| 172 | +// the vectors in the basis are linearly independent. |
| 173 | +func newOrthogonalVector(dim int, basis ...[]float32) []float32 { |
| 174 | + candidate := newNormalizedVector(dim) |
| 175 | + |
| 176 | + for _, b := range basis { |
| 177 | + dp := dotProduct(candidate, b) |
| 178 | + basisNorm := dotProduct(b, b) |
| 179 | + |
| 180 | + for i := range candidate { |
| 181 | + candidate[i] -= (dp / basisNorm) * b[i] |
| 182 | + } |
| 183 | + } |
| 184 | + |
| 185 | + return candidate |
| 186 | +} |
| 187 | + |
| 188 | +// return a new vector such that v1 * v2 = 2S - 1. |
| 189 | +func dotProductNormFn(score float32, qvector, basis []float32) []float32 { |
| 190 | + var sum float32 |
| 191 | + |
| 192 | + // Populate v2 upto dim-1. |
| 193 | + for i := range qvector[:len(qvector)-1] { |
| 194 | + sum += qvector[i] * basis[i] |
| 195 | + } |
| 196 | + |
| 197 | + // Calculate v_{2, dim} such that v1 * v2 = 2S - 1: |
| 198 | + basis[len(basis)-1] = (2*score - 1 - sum) / qvector[len(qvector)-1] |
| 199 | + |
| 200 | + // If the vectors are linearly independent, regenerate the dim-1 elements |
| 201 | + // of v2. |
| 202 | + if !linearlyIndependent(qvector, basis) { |
| 203 | + return dotProductNormFn(score, qvector, basis) |
| 204 | + } |
| 205 | + |
| 206 | + return basis |
| 207 | +} |
0 commit comments