Skip to content

Commit c890d06

Browse files
committed
Added the POC code for the abstraction layer for index builds, and also added the initial interface for iterating over the vector values
Signed-off-by: Navneet Verma <navneev@amazon.com>
1 parent 37c4a9d commit c890d06

7 files changed

+409
-120
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*
4+
* The OpenSearch Contributors require contributions made to
5+
* this file be licensed under the Apache-2.0 license or a
6+
* compatible open source license.
7+
*
8+
* Modifications Copyright OpenSearch Contributors. See
9+
* GitHub history for details.
10+
*/
11+
12+
package org.opensearch.knn.index;
13+
14+
import lombok.extern.log4j.Log4j2;
15+
import org.apache.lucene.index.FieldInfo;
16+
import org.apache.lucene.index.SegmentWriteState;
17+
import org.apache.lucene.search.DocIdSetIterator;
18+
import org.opensearch.knn.index.codec.KNN80Codec.KNN80DocValuesConsumer;
19+
import org.opensearch.knn.index.codec.util.KNNCodecUtil;
20+
import org.opensearch.knn.index.codec.util.SerializationMode;
21+
import org.opensearch.knn.index.vectorvalues.KNNVectorValues;
22+
import org.opensearch.knn.index.vectorvalues.KNNVectorValuesIterator;
23+
import org.opensearch.knn.jni.JNICommons;
24+
25+
import java.io.IOException;
26+
import java.util.ArrayList;
27+
import java.util.List;
28+
29+
/**
30+
* This is a single layer that will be responsible for creating the native indices. Right now this is just a POC code,
31+
* this needs to be fixed. Its more of a testing to see if everything works correctly.
32+
*/
33+
@Log4j2
34+
public class NativeIndexCreationManager {
35+
36+
public static void startIndexCreation(
37+
final SegmentWriteState segmentWriteState,
38+
final KNNVectorValues<float[]> vectorValues,
39+
final FieldInfo fieldInfo
40+
) throws IOException {
41+
KNNCodecUtil.Pair pair = streamFloatVectors(vectorValues);
42+
if (pair.getVectorAddress() == 0 || pair.docs.length == 0) {
43+
log.info("Skipping engine index creation as there are no vectors or docs in the segment");
44+
return;
45+
}
46+
createNativeIndex(segmentWriteState, fieldInfo, pair);
47+
}
48+
49+
private static void createNativeIndex(
50+
final SegmentWriteState segmentWriteState,
51+
final FieldInfo fieldInfo,
52+
final KNNCodecUtil.Pair pair
53+
) throws IOException {
54+
KNN80DocValuesConsumer.createNativeIndex(segmentWriteState, fieldInfo, pair);
55+
}
56+
57+
private static KNNCodecUtil.Pair streamFloatVectors(final KNNVectorValues<float[]> kNNVectorValues) throws IOException {
58+
List<float[]> vectorList = new ArrayList<>();
59+
List<Integer> docIdList = new ArrayList<>();
60+
long vectorAddress = 0;
61+
int dimension = 0;
62+
long totalLiveDocs = kNNVectorValues.totalLiveDocs();
63+
long vectorsStreamingMemoryLimit = KNNSettings.getVectorStreamingMemoryLimit().getBytes();
64+
long vectorsPerTransfer = Integer.MIN_VALUE;
65+
66+
KNNVectorValuesIterator iterator = kNNVectorValues.getVectorValuesIterator();
67+
68+
for (int doc = iterator.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.nextDoc()) {
69+
float[] vector = kNNVectorValues.getVector();
70+
dimension = vector.length;
71+
if (vectorsPerTransfer == Integer.MIN_VALUE) {
72+
vectorsPerTransfer = (dimension * Float.BYTES * totalLiveDocs) / vectorsStreamingMemoryLimit;
73+
// This condition comes if vectorsStreamingMemoryLimit is higher than total number floats to transfer
74+
// Doing this will reduce 1 extra trip to JNI layer.
75+
if (vectorsPerTransfer == 0) {
76+
vectorsPerTransfer = totalLiveDocs;
77+
}
78+
}
79+
80+
if (vectorList.size() == vectorsPerTransfer) {
81+
vectorAddress = JNICommons.storeVectorData(vectorAddress, vectorList.toArray(new float[][] {}), totalLiveDocs * dimension);
82+
// We should probably come up with a better way to reuse the vectorList memory which we have
83+
// created. Problem here is doing like this can lead to a lot of list memory which is of no use and
84+
// will be garbage collected later on, but it creates pressure on JVM. We should revisit this.
85+
vectorList = new ArrayList<>();
86+
}
87+
88+
vectorList.add(vector);
89+
docIdList.add(doc);
90+
}
91+
92+
if (vectorList.isEmpty() == false) {
93+
vectorAddress = JNICommons.storeVectorData(vectorAddress, vectorList.toArray(new float[][] {}), totalLiveDocs * dimension);
94+
}
95+
// SerializationMode.COLLECTION_OF_FLOATS is not getting used. I just added it to ensure code successfully
96+
// works.
97+
return new KNNCodecUtil.Pair(
98+
docIdList.stream().mapToInt(Integer::intValue).toArray(),
99+
vectorAddress,
100+
dimension,
101+
SerializationMode.COLLECTION_OF_FLOATS
102+
);
103+
}
104+
}

src/main/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEnginesKNNVectorsWriter.java

+13-118
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,13 @@
2222
import org.apache.lucene.index.MergeState;
2323
import org.apache.lucene.index.SegmentWriteState;
2424
import org.apache.lucene.index.Sorter;
25-
import org.apache.lucene.search.DocIdSetIterator;
2625
import org.apache.lucene.util.IOUtils;
2726
import org.apache.lucene.util.InfoStream;
28-
import org.opensearch.knn.index.KNNSettings;
29-
import org.opensearch.knn.index.codec.KNN80Codec.KNN80DocValuesConsumer;
30-
import org.opensearch.knn.index.codec.util.KNNCodecUtil;
31-
import org.opensearch.knn.index.codec.util.SerializationMode;
32-
import org.opensearch.knn.jni.JNICommons;
27+
import org.opensearch.knn.index.NativeIndexCreationManager;
28+
import org.opensearch.knn.index.vectorvalues.KNNVectorValuesFactory;
3329

3430
import java.io.IOException;
3531
import java.util.ArrayList;
36-
import java.util.Arrays;
3732
import java.util.List;
3833

3934
@RequiredArgsConstructor
@@ -69,6 +64,7 @@ public KnnFieldVectorsWriter<?> addField(FieldInfo fieldInfo) throws IOException
6964
* @param sortMap {@link Sorter.DocMap}
7065
*/
7166
@Override
67+
@SuppressWarnings("unchecked")
7268
public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException {
7369
// simply write data in the flat file
7470
flatVectorsWriter.flush(maxDoc, sortMap);
@@ -79,12 +75,11 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException {
7975
// on the disk.
8076
// getFloatsFromFloatVectorValues(fields);
8177
for (NativeEnginesKNNVectorsWriter.FieldWriter<?> fieldWriter : fields) {
82-
KNNCodecUtil.Pair pair = getFloatsFromFieldWriter(fieldWriter);
83-
if (pair.getVectorAddress() == 0 || pair.docs.length == 0) {
84-
log.info("Skipping engine index creation as there are no vectors or docs in the segment");
85-
continue;
86-
}
87-
KNN80DocValuesConsumer.createNativeIndex(segmentWriteState, fieldWriter.fieldInfo, pair);
78+
NativeIndexCreationManager.startIndexCreation(
79+
segmentWriteState,
80+
KNNVectorValuesFactory.getFloatVectorValues(fieldWriter.docsWithField.iterator(), (List<float[]>) fieldWriter.vectors),
81+
fieldWriter.fieldInfo
82+
);
8883
}
8984
}
9085

@@ -94,12 +89,11 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE
9489
flatVectorsWriter.mergeOneField(fieldInfo, mergeState);
9590
final FloatVectorValues floatVectorValues = KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
9691
// merging the graphs here
97-
final KNNCodecUtil.Pair pair = getFloatsFromFloatVectorValues(floatVectorValues);
98-
if (pair.getVectorAddress() == 0 || pair.docs.length == 0) {
99-
log.info("Skipping engine index creation as there are no vectors or docs to be merged");
100-
return;
101-
}
102-
KNN80DocValuesConsumer.createNativeIndex(segmentWriteState, fieldInfo, pair);
92+
NativeIndexCreationManager.startIndexCreation(
93+
segmentWriteState,
94+
KNNVectorValuesFactory.getFloatVectorValues(floatVectorValues),
95+
fieldInfo
96+
);
10397
}
10498

10599
/**
@@ -140,105 +134,6 @@ public long ramBytesUsed() {
140134
return 0;
141135
}
142136

143-
private KNNCodecUtil.Pair getFloatsFromFloatVectorValues(FloatVectorValues floatVectorValues) throws IOException {
144-
List<float[]> vectorList = new ArrayList<>();
145-
List<Integer> docIdList = new ArrayList<>();
146-
long vectorAddress = 0;
147-
int dimension = 0;
148-
149-
long totalLiveDocs = floatVectorValues.size();
150-
long vectorsStreamingMemoryLimit = KNNSettings.getVectorStreamingMemoryLimit().getBytes();
151-
long vectorsPerTransfer = Integer.MIN_VALUE;
152-
153-
for (int doc = floatVectorValues.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = floatVectorValues.nextDoc()) {
154-
float[] temp = floatVectorValues.vectorValue();
155-
// This temp object and copy of temp object is required because when we map floats we read to a memory
156-
// location in heap always for floatVectorValues. Ref: OffHeapFloatVectorValues.vectorValue.
157-
float[] vector = Arrays.copyOf(floatVectorValues.vectorValue(), temp.length);
158-
dimension = vector.length;
159-
if (vectorsPerTransfer == Integer.MIN_VALUE) {
160-
vectorsPerTransfer = (dimension * Float.BYTES * totalLiveDocs) / vectorsStreamingMemoryLimit;
161-
// This condition comes if vectorsStreamingMemoryLimit is higher than total number floats to transfer
162-
// Doing this will reduce 1 extra trip to JNI layer.
163-
if (vectorsPerTransfer == 0) {
164-
vectorsPerTransfer = totalLiveDocs;
165-
}
166-
}
167-
168-
if (vectorList.size() == vectorsPerTransfer) {
169-
vectorAddress = JNICommons.storeVectorData(vectorAddress, vectorList.toArray(new float[][] {}), totalLiveDocs * dimension);
170-
// We should probably come up with a better way to reuse the vectorList memory which we have
171-
// created. Problem here is doing like this can lead to a lot of list memory which is of no use and
172-
// will be garbage collected later on, but it creates pressure on JVM. We should revisit this.
173-
vectorList = new ArrayList<>();
174-
}
175-
vectorList.add(vector);
176-
docIdList.add(doc);
177-
}
178-
179-
if (vectorList.isEmpty() == false) {
180-
vectorAddress = JNICommons.storeVectorData(vectorAddress, vectorList.toArray(new float[][] {}), totalLiveDocs * dimension);
181-
}
182-
// SerializationMode.COLLECTION_OF_FLOATS is not getting used. I just added it to ensure code successfully
183-
// works.
184-
return new KNNCodecUtil.Pair(
185-
docIdList.stream().mapToInt(Integer::intValue).toArray(),
186-
vectorAddress,
187-
dimension,
188-
SerializationMode.COLLECTION_OF_FLOATS
189-
);
190-
}
191-
192-
private KNNCodecUtil.Pair getFloatsFromFieldWriter(NativeEnginesKNNVectorsWriter.FieldWriter<?> fieldWriter) throws IOException {
193-
List<float[]> vectorList = new ArrayList<>();
194-
List<Integer> docIdList = new ArrayList<>();
195-
long vectorAddress = 0;
196-
int dimension = 0;
197-
198-
long totalLiveDocs = fieldWriter.vectors.size();
199-
long vectorsStreamingMemoryLimit = KNNSettings.getVectorStreamingMemoryLimit().getBytes();
200-
long vectorsPerTransfer = Integer.MIN_VALUE;
201-
202-
DocIdSetIterator disi = fieldWriter.docsWithField.iterator();
203-
204-
for (int i = 0; i < fieldWriter.vectors.size(); i++) {
205-
float[] vector = (float[]) fieldWriter.vectors.get(i);
206-
dimension = vector.length;
207-
if (vectorsPerTransfer == Integer.MIN_VALUE) {
208-
vectorsPerTransfer = (dimension * Float.BYTES * totalLiveDocs) / vectorsStreamingMemoryLimit;
209-
// This condition comes if vectorsStreamingMemoryLimit is higher than total number floats to transfer
210-
// Doing this will reduce 1 extra trip to JNI layer.
211-
if (vectorsPerTransfer == 0) {
212-
vectorsPerTransfer = totalLiveDocs;
213-
}
214-
}
215-
216-
if (vectorList.size() == vectorsPerTransfer) {
217-
vectorAddress = JNICommons.storeVectorData(vectorAddress, vectorList.toArray(new float[][] {}), totalLiveDocs * dimension);
218-
// We should probably come up with a better way to reuse the vectorList memory which we have
219-
// created. Problem here is doing like this can lead to a lot of list memory which is of no use and
220-
// will be garbage collected later on, but it creates pressure on JVM. We should revisit this.
221-
vectorList = new ArrayList<>();
222-
}
223-
224-
vectorList.add(vector);
225-
docIdList.add(disi.nextDoc());
226-
227-
}
228-
229-
if (vectorList.isEmpty() == false) {
230-
vectorAddress = JNICommons.storeVectorData(vectorAddress, vectorList.toArray(new float[][] {}), totalLiveDocs * dimension);
231-
}
232-
// SerializationMode.COLLECTION_OF_FLOATS is not getting used. I just added it to ensure code successfully
233-
// works.
234-
return new KNNCodecUtil.Pair(
235-
docIdList.stream().mapToInt(Integer::intValue).toArray(),
236-
vectorAddress,
237-
dimension,
238-
SerializationMode.COLLECTION_OF_FLOATS
239-
);
240-
}
241-
242137
private static class FieldWriter<T> extends KnnFieldVectorsWriter<T> {
243138
private final FieldInfo fieldInfo;
244139
private final List<T> vectors;

src/main/java/org/opensearch/knn/index/codec/util/KNNCodecUtil.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ public static String buildEngineFileSuffix(String fieldName, String extension) {
145145
return String.format("_%s%s", fieldName, extension);
146146
}
147147

148-
private static long getTotalLiveDocsCount(final BinaryDocValues binaryDocValues) {
148+
public static long getTotalLiveDocsCount(final BinaryDocValues binaryDocValues) {
149149
long totalLiveDocs;
150150
if (binaryDocValues instanceof KNN80BinaryDocValues) {
151151
totalLiveDocs = ((KNN80BinaryDocValues) binaryDocValues).getTotalLiveDocs();

src/main/java/org/opensearch/knn/index/codec/util/KNNVectorSerializerFactory.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ public static KNNVectorSerializer getSerializerByStreamContent(final ByteArrayIn
5656
return getSerializerBySerializationMode(serializationMode);
5757
}
5858

59-
static SerializationMode serializerModeFromStream(ByteArrayInputStream byteStream) {
59+
public static SerializationMode serializerModeFromStream(ByteArrayInputStream byteStream) {
6060
int numberOfAvailableBytesInStream = byteStream.available();
6161
if (numberOfAvailableBytesInStream < ARRAY_HEADER_OFFSET) {
6262
return getSerializerOrThrowError(numberOfAvailableBytesInStream, COLLECTION_OF_FLOATS);

0 commit comments

Comments
 (0)