22
22
import org .apache .lucene .index .MergeState ;
23
23
import org .apache .lucene .index .SegmentWriteState ;
24
24
import org .apache .lucene .index .Sorter ;
25
- import org .apache .lucene .search .DocIdSetIterator ;
26
25
import org .apache .lucene .util .IOUtils ;
27
26
import org .apache .lucene .util .InfoStream ;
28
- import org .opensearch .knn .index .KNNSettings ;
29
- import org .opensearch .knn .index .codec .KNN80Codec .KNN80DocValuesConsumer ;
30
- import org .opensearch .knn .index .codec .util .KNNCodecUtil ;
31
- import org .opensearch .knn .index .codec .util .SerializationMode ;
32
- import org .opensearch .knn .jni .JNICommons ;
27
+ import org .opensearch .knn .index .NativeIndexCreationManager ;
28
+ import org .opensearch .knn .index .vectorvalues .KNNVectorValuesFactory ;
33
29
34
30
import java .io .IOException ;
35
31
import java .util .ArrayList ;
36
- import java .util .Arrays ;
37
32
import java .util .List ;
38
33
39
34
@ RequiredArgsConstructor
@@ -69,6 +64,7 @@ public KnnFieldVectorsWriter<?> addField(FieldInfo fieldInfo) throws IOException
69
64
* @param sortMap {@link Sorter.DocMap}
70
65
*/
71
66
@ Override
67
+ @ SuppressWarnings ("unchecked" )
72
68
public void flush (int maxDoc , Sorter .DocMap sortMap ) throws IOException {
73
69
// simply write data in the flat file
74
70
flatVectorsWriter .flush (maxDoc , sortMap );
@@ -79,12 +75,11 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException {
79
75
// on the disk.
80
76
// getFloatsFromFloatVectorValues(fields);
81
77
for (NativeEnginesKNNVectorsWriter .FieldWriter <?> fieldWriter : fields ) {
82
- KNNCodecUtil .Pair pair = getFloatsFromFieldWriter (fieldWriter );
83
- if (pair .getVectorAddress () == 0 || pair .docs .length == 0 ) {
84
- log .info ("Skipping engine index creation as there are no vectors or docs in the segment" );
85
- continue ;
86
- }
87
- KNN80DocValuesConsumer .createNativeIndex (segmentWriteState , fieldWriter .fieldInfo , pair );
78
+ NativeIndexCreationManager .startIndexCreation (
79
+ segmentWriteState ,
80
+ KNNVectorValuesFactory .getFloatVectorValues (fieldWriter .docsWithField .iterator (), (List <float []>) fieldWriter .vectors ),
81
+ fieldWriter .fieldInfo
82
+ );
88
83
}
89
84
}
90
85
@@ -94,12 +89,11 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE
94
89
flatVectorsWriter .mergeOneField (fieldInfo , mergeState );
95
90
final FloatVectorValues floatVectorValues = KnnVectorsWriter .MergedVectorValues .mergeFloatVectorValues (fieldInfo , mergeState );
96
91
// merging the graphs here
97
- final KNNCodecUtil .Pair pair = getFloatsFromFloatVectorValues (floatVectorValues );
98
- if (pair .getVectorAddress () == 0 || pair .docs .length == 0 ) {
99
- log .info ("Skipping engine index creation as there are no vectors or docs to be merged" );
100
- return ;
101
- }
102
- KNN80DocValuesConsumer .createNativeIndex (segmentWriteState , fieldInfo , pair );
92
+ NativeIndexCreationManager .startIndexCreation (
93
+ segmentWriteState ,
94
+ KNNVectorValuesFactory .getFloatVectorValues (floatVectorValues ),
95
+ fieldInfo
96
+ );
103
97
}
104
98
105
99
/**
@@ -140,105 +134,6 @@ public long ramBytesUsed() {
140
134
return 0 ;
141
135
}
142
136
143
- private KNNCodecUtil .Pair getFloatsFromFloatVectorValues (FloatVectorValues floatVectorValues ) throws IOException {
144
- List <float []> vectorList = new ArrayList <>();
145
- List <Integer > docIdList = new ArrayList <>();
146
- long vectorAddress = 0 ;
147
- int dimension = 0 ;
148
-
149
- long totalLiveDocs = floatVectorValues .size ();
150
- long vectorsStreamingMemoryLimit = KNNSettings .getVectorStreamingMemoryLimit ().getBytes ();
151
- long vectorsPerTransfer = Integer .MIN_VALUE ;
152
-
153
- for (int doc = floatVectorValues .nextDoc (); doc != DocIdSetIterator .NO_MORE_DOCS ; doc = floatVectorValues .nextDoc ()) {
154
- float [] temp = floatVectorValues .vectorValue ();
155
- // This temp object and copy of temp object is required because when we map floats we read to a memory
156
- // location in heap always for floatVectorValues. Ref: OffHeapFloatVectorValues.vectorValue.
157
- float [] vector = Arrays .copyOf (floatVectorValues .vectorValue (), temp .length );
158
- dimension = vector .length ;
159
- if (vectorsPerTransfer == Integer .MIN_VALUE ) {
160
- vectorsPerTransfer = (dimension * Float .BYTES * totalLiveDocs ) / vectorsStreamingMemoryLimit ;
161
- // This condition comes if vectorsStreamingMemoryLimit is higher than total number floats to transfer
162
- // Doing this will reduce 1 extra trip to JNI layer.
163
- if (vectorsPerTransfer == 0 ) {
164
- vectorsPerTransfer = totalLiveDocs ;
165
- }
166
- }
167
-
168
- if (vectorList .size () == vectorsPerTransfer ) {
169
- vectorAddress = JNICommons .storeVectorData (vectorAddress , vectorList .toArray (new float [][] {}), totalLiveDocs * dimension );
170
- // We should probably come up with a better way to reuse the vectorList memory which we have
171
- // created. Problem here is doing like this can lead to a lot of list memory which is of no use and
172
- // will be garbage collected later on, but it creates pressure on JVM. We should revisit this.
173
- vectorList = new ArrayList <>();
174
- }
175
- vectorList .add (vector );
176
- docIdList .add (doc );
177
- }
178
-
179
- if (vectorList .isEmpty () == false ) {
180
- vectorAddress = JNICommons .storeVectorData (vectorAddress , vectorList .toArray (new float [][] {}), totalLiveDocs * dimension );
181
- }
182
- // SerializationMode.COLLECTION_OF_FLOATS is not getting used. I just added it to ensure code successfully
183
- // works.
184
- return new KNNCodecUtil .Pair (
185
- docIdList .stream ().mapToInt (Integer ::intValue ).toArray (),
186
- vectorAddress ,
187
- dimension ,
188
- SerializationMode .COLLECTION_OF_FLOATS
189
- );
190
- }
191
-
192
- private KNNCodecUtil .Pair getFloatsFromFieldWriter (NativeEnginesKNNVectorsWriter .FieldWriter <?> fieldWriter ) throws IOException {
193
- List <float []> vectorList = new ArrayList <>();
194
- List <Integer > docIdList = new ArrayList <>();
195
- long vectorAddress = 0 ;
196
- int dimension = 0 ;
197
-
198
- long totalLiveDocs = fieldWriter .vectors .size ();
199
- long vectorsStreamingMemoryLimit = KNNSettings .getVectorStreamingMemoryLimit ().getBytes ();
200
- long vectorsPerTransfer = Integer .MIN_VALUE ;
201
-
202
- DocIdSetIterator disi = fieldWriter .docsWithField .iterator ();
203
-
204
- for (int i = 0 ; i < fieldWriter .vectors .size (); i ++) {
205
- float [] vector = (float []) fieldWriter .vectors .get (i );
206
- dimension = vector .length ;
207
- if (vectorsPerTransfer == Integer .MIN_VALUE ) {
208
- vectorsPerTransfer = (dimension * Float .BYTES * totalLiveDocs ) / vectorsStreamingMemoryLimit ;
209
- // This condition comes if vectorsStreamingMemoryLimit is higher than total number floats to transfer
210
- // Doing this will reduce 1 extra trip to JNI layer.
211
- if (vectorsPerTransfer == 0 ) {
212
- vectorsPerTransfer = totalLiveDocs ;
213
- }
214
- }
215
-
216
- if (vectorList .size () == vectorsPerTransfer ) {
217
- vectorAddress = JNICommons .storeVectorData (vectorAddress , vectorList .toArray (new float [][] {}), totalLiveDocs * dimension );
218
- // We should probably come up with a better way to reuse the vectorList memory which we have
219
- // created. Problem here is doing like this can lead to a lot of list memory which is of no use and
220
- // will be garbage collected later on, but it creates pressure on JVM. We should revisit this.
221
- vectorList = new ArrayList <>();
222
- }
223
-
224
- vectorList .add (vector );
225
- docIdList .add (disi .nextDoc ());
226
-
227
- }
228
-
229
- if (vectorList .isEmpty () == false ) {
230
- vectorAddress = JNICommons .storeVectorData (vectorAddress , vectorList .toArray (new float [][] {}), totalLiveDocs * dimension );
231
- }
232
- // SerializationMode.COLLECTION_OF_FLOATS is not getting used. I just added it to ensure code successfully
233
- // works.
234
- return new KNNCodecUtil .Pair (
235
- docIdList .stream ().mapToInt (Integer ::intValue ).toArray (),
236
- vectorAddress ,
237
- dimension ,
238
- SerializationMode .COLLECTION_OF_FLOATS
239
- );
240
- }
241
-
242
137
private static class FieldWriter <T > extends KnnFieldVectorsWriter <T > {
243
138
private final FieldInfo fieldInfo ;
244
139
private final List <T > vectors ;
0 commit comments