Merge branch 'main' into export-D66074156

asadoughi · web-flow · commit bccfdb44a8f3 · 2024-11-25T18:03:02.000-05:00
diff --git a/.github/actions/build_conda/action.yml b/.github/actions/build_conda/action.yml
@@ -34,8 +34,8 @@ runs:
     - name: Install conda build tools
       shell: ${{ steps.choose_shell.outputs.shell }}
       run: |
-        conda update -y -q conda
-        conda install -y -q conda-build
+        conda install -y -q "conda!=24.11.0"
+        conda install -y -q "conda-build!=24.11.0"
     - name: Enable anaconda uploads
       if: inputs.label != ''
       shell: ${{ steps.choose_shell.outputs.shell }}
@@ -80,7 +80,7 @@ runs:
       working-directory: conda
       run: |
         conda build faiss-gpu-cuvs --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \
-            -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge
+            -c pytorch -c rapidsai -c rapidsai-nightly -c conda-forge -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia
     - name: Conda build (GPU w/ cuVS) w/ anaconda upload
       if: inputs.label != '' && inputs.cuda != '' && inputs.cuvs != ''
       shell: ${{ steps.choose_shell.outputs.shell }}
@@ -89,4 +89,4 @@ runs:
         PACKAGE_TYPE: ${{ inputs.label }}
       run: |
         conda build faiss-gpu-cuvs --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \
-            --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge
+            --user pytorch --label ${{ inputs.label }} -c pytorch -c rapidsai -c rapidsai-nightly -c conda-forge -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -238,8 +238,8 @@ jobs:
         with:
           label: main
           cuda: "12.1.1"
-  linux-x86_64-GPU-CUVS-packages-CUDA12-1-1:
-    name: Linux x86_64 GPU w/ cuVS packages (CUDA 12.1.1)
+  linux-x86_64-GPU-CUVS-packages-CUDA12-4-0:
+    name: Linux x86_64 GPU w/ cuVS packages (CUDA 12.4.0)
     if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
     runs-on: 4-core-ubuntu-gpu-t4
     env:
@@ -257,7 +257,7 @@ jobs:
         with:
           label: main
           cuvs: "ON"
-          cuda: "12.1.1"
+          cuda: "12.4.0"
   windows-x86_64-packages:
     name: Windows x86_64 packages
     if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
@@ -73,8 +73,8 @@ jobs:
         with:
           label: nightly
           cuda: "12.1.1"
-  linux-x86_64-GPU-CUVS-CUDA12-1-1-nightly:
-    name: Linux x86_64 GPU w/ cuVS nightlies (CUDA 12.1.1)
+  linux-x86_64-GPU-CUVS-CUDA12-4-0-nightly:
+    name: Linux x86_64 GPU w/ cuVS nightlies (CUDA 12.4.0)
     runs-on: 4-core-ubuntu-gpu-t4
     env:
       CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
@@ -90,7 +90,7 @@ jobs:
         with:
           label: nightly
           cuvs: "ON"
-          cuda: "12.1.1"
+          cuda: "12.4.0"
   windows-x86_64-nightly:
     name: Windows x86_64 nightlies
     runs-on: windows-2019
diff --git a/conda/faiss-gpu-cuvs/meta.yaml b/conda/faiss-gpu-cuvs/meta.yaml
@@ -9,7 +9,7 @@
 {% if cudatoolkit == '11.8.0' %}
 {% set cuda_constraints=">=11.8,<12" %}
 {% set libcublas_constraints=">=11.11,<12" %}
-{% elif cudatoolkit == '12.1.1' %}
+{% elif cudatoolkit == '12.4.0' %}
 {% set cuda_constraints=">=12.1,<13" %}
 {% set libcublas_constraints=">=12.1,<13" %}
 {% endif %}
@@ -48,7 +48,7 @@ outputs:
         - {{ compiler('cxx') }}
         - sysroot_linux-64  # [linux64]
         - llvm-openmp  # [osx]
-        - cmake >=3.24.0
+        - cmake >=3.26.4
         - make  # [not win]
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - mkl =2023  # [x86_64]
diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
@@ -402,16 +402,11 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
     } else if (args.vectorType == DistanceDataType::F16) {
         bfKnnConvert<half>(prov, args);
     } else if (args.vectorType == DistanceDataType::BF16) {
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
         if (prov->getResources()->supportsBFloat16CurrentDevice()) {
             bfKnnConvert<__nv_bfloat16>(prov, args);
         } else {
             FAISS_THROW_MSG("not compiled with bfloat16 support");
         }
-#else
-        FAISS_THROW_MSG("no AMD bfloat16 support");
-#endif
     } else {
         FAISS_THROW_MSG("unknown vectorType");
     }
diff --git a/faiss/gpu/hipify.sh b/faiss/gpu/hipify.sh
@@ -3,17 +3,46 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
+#
+# Usage: ./gpu/hipify.sh
+#
 
 function hipify_dir()
 {
     # print dir name
     cd "$1" || exit
     echo "Hipifying $(pwd)"
 
+    if [ -d ./gpu-tmp ]; then
+        #Clearing out any leftover files and directories
+        echo "Removing old ./gpu-tmp"
+        rm -rf ./gpu-tmp
+    fi
+
+    if [ -d ./gpu ]; then
+        # Work on a scratch copy so the pre-hipify rewrites never modify ./gpu
+        echo "Creating ./gpu-tmp"
+        cp -r ./gpu ./gpu-tmp
+
+        # Rename __nv_bfloat162 / __nv_bfloat16 before hipify because of inaccurate
+        # conversions; the 'g' flag rewrites every occurrence on a line, not just the first.
+        for ext in hip cuh h cpp c cu
+        do
+            while IFS= read -r -d '' src
+            do
+                sed -i 's@__nv_bfloat162@__hip_bfloat162@g' "$src"
+                sed -i 's@__nv_bfloat16@__hip_bfloat16@g' "$src"
+            done <   <(find ./gpu-tmp -name "*.$ext" -print0)
+        done
+    else
+        echo "Can't find the gpu/ dir"
+        exit 1
+    fi
+
     # create all destination directories for hipified files into sibling 'gpu-rocm' directory
     while IFS= read -r -d '' src
     do
-        dst="${src//gpu/gpu-rocm}"
+        dst="${src//gpu-tmp/gpu-rocm}"
 
         if [ -d $dst ]; then
             #Clearing out any leftover files and directories
@@ -24,17 +53,17 @@ function hipify_dir()
         #Making directories
         echo "Creating $dst"
         mkdir -p "$dst"
-    done <   <(find ./gpu -type d -print0)
+    done <   <(find ./gpu-tmp -type d -print0)
 
     # run hipify-perl against all *.cu *.cuh *.h *.cpp files, no renaming
     # run all files in parallel to speed up
     for ext in cu cuh h cpp c
     do
         while IFS= read -r -d '' src
         do
-            dst="${src//\.\/gpu/\.\/gpu-rocm}"
+            dst="${src//\.\/gpu-tmp/\.\/gpu-rocm}"
             hipify-perl -o="$dst.tmp" "$src" &
-        done <   <(find ./gpu -name "*.$ext" -print0)
+        done <   <(find ./gpu-tmp -name "*.$ext" -print0)
     done
     wait
 
@@ -45,6 +74,12 @@ function hipify_dir()
         mv "$src" "$dst"
     done <   <(find ./gpu-rocm -name "*.cu.tmp" -print0)
 
+    if [ -d ./gpu-tmp ]; then
+        #Clearing out any leftover files and directories
+        echo "Removing ./gpu-tmp"
+        rm -rf ./gpu-tmp
+    fi
+
     # replace header include statements "<faiss/gpu/" with "<faiss/gpu-rocm"
     # replace thrust::cuda::par with thrust::hip::par
     # adjust header path location for hipblas.h to avoid unnecessary deprecation warnings
diff --git a/faiss/gpu/impl/Distance.cu b/faiss/gpu/impl/Distance.cu
@@ -504,8 +504,6 @@ void runAllPairwiseL2Distance(
             outDistances);
 }
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runAllPairwiseL2Distance(
         GpuResources* res,
         cudaStream_t stream,
@@ -526,7 +524,6 @@ void runAllPairwiseL2Distance(
             queriesRowMajor,
             outDistances);
 }
-#endif // USE_AMD_ROCM
 
 void runAllPairwiseIPDistance(
         GpuResources* res,
@@ -568,8 +565,6 @@ void runAllPairwiseIPDistance(
             outDistances);
 }
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runAllPairwiseIPDistance(
         GpuResources* res,
         cudaStream_t stream,
@@ -589,7 +584,6 @@ void runAllPairwiseIPDistance(
             queriesRowMajor,
             outDistances);
 }
-#endif // USE_AMD_ROCM
 
 void runL2Distance(
         GpuResources* res,
@@ -643,8 +637,6 @@ void runL2Distance(
             ignoreOutDistances);
 }
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runL2Distance(
         GpuResources* res,
         cudaStream_t stream,
@@ -670,7 +662,6 @@ void runL2Distance(
             outIndices,
             ignoreOutDistances);
 }
-#endif // USE_AMD_ROCM
 
 void runIPDistance(
         GpuResources* res,
@@ -716,8 +707,6 @@ void runIPDistance(
             outIndices);
 }
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runIPDistance(
         GpuResources* res,
         cudaStream_t stream,
@@ -739,7 +728,6 @@ void runIPDistance(
             outDistances,
             outIndices);
 }
-#endif // USE_AMD_ROCM
 
 } // namespace gpu
 } // namespace faiss
diff --git a/faiss/gpu/impl/Distance.cuh b/faiss/gpu/impl/Distance.cuh
@@ -41,8 +41,6 @@ void runAllPairwiseL2Distance(
         bool queriesRowMajor,
         Tensor<float, 2, true>& outDistances);
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runAllPairwiseL2Distance(
         GpuResources* res,
         cudaStream_t stream,
@@ -52,7 +50,6 @@ void runAllPairwiseL2Distance(
         Tensor<__nv_bfloat16, 2, true>& queries,
         bool queriesRowMajor,
         Tensor<float, 2, true>& outDistances);
-#endif // USE_AMD_ROCM
 
 void runAllPairwiseIPDistance(
         GpuResources* res,
@@ -72,8 +69,6 @@ void runAllPairwiseIPDistance(
         bool queriesRowMajor,
         Tensor<float, 2, true>& outDistances);
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runAllPairwiseIPDistance(
         GpuResources* res,
         cudaStream_t stream,
@@ -82,7 +77,6 @@ void runAllPairwiseIPDistance(
         Tensor<__nv_bfloat16, 2, true>& queries,
         bool queriesRowMajor,
         Tensor<float, 2, true>& outDistances);
-#endif // USE_AMD_ROCM
 
 /// Calculates brute-force L2 distance between `vectors` and
 /// `queries`, returning the k closest results seen
@@ -116,8 +110,6 @@ void runL2Distance(
         Tensor<idx_t, 2, true>& outIndices,
         bool ignoreOutDistances = false);
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runL2Distance(
         GpuResources* resources,
         cudaStream_t stream,
@@ -130,7 +122,6 @@ void runL2Distance(
         Tensor<float, 2, true>& outDistances,
         Tensor<idx_t, 2, true>& outIndices,
         bool ignoreOutDistances = false);
-#endif // USE_AMD_ROCM
 
 /// Calculates brute-force inner product distance between `vectors`
 /// and `queries`, returning the k closest results seen
@@ -156,8 +147,6 @@ void runIPDistance(
         Tensor<float, 2, true>& outDistances,
         Tensor<idx_t, 2, true>& outIndices);
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runIPDistance(
         GpuResources* resources,
         cudaStream_t stream,
@@ -168,7 +157,6 @@ void runIPDistance(
         int k,
         Tensor<float, 2, true>& outDistances,
         Tensor<idx_t, 2, true>& outIndices);
-#endif // USE_AMD_ROCM
 
 //
 // General distance implementation, assumes that all arguments are on the
diff --git a/faiss/gpu/impl/L2Norm.cu b/faiss/gpu/impl/L2Norm.cu
@@ -275,8 +275,6 @@ void runL2Norm(
     runL2Norm<half, half2>(input, inputRowMajor, output, normSquared, stream);
 }
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runL2Norm(
         Tensor<__nv_bfloat16, 2, true>& input,
         bool inputRowMajor,
@@ -286,7 +284,6 @@ void runL2Norm(
     runL2Norm<__nv_bfloat16, __nv_bfloat162>(
             input, inputRowMajor, output, normSquared, stream);
 }
-#endif
 
 } // namespace gpu
 } // namespace faiss
diff --git a/faiss/gpu/impl/L2Norm.cuh b/faiss/gpu/impl/L2Norm.cuh
@@ -27,15 +27,12 @@ void runL2Norm(
         bool normSquared,
         cudaStream_t stream);
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runL2Norm(
         Tensor<__nv_bfloat16, 2, true>& input,
         bool inputRowMajor,
         Tensor<float, 1, true>& output,
         bool normSquared,
         cudaStream_t stream);
-#endif
 
 } // namespace gpu
 } // namespace faiss
diff --git a/faiss/gpu/utils/ConversionOperators.cuh b/faiss/gpu/utils/ConversionOperators.cuh
@@ -38,12 +38,9 @@ struct ConvertTo<float> {
     static inline __device__ float to(half v) {
         return __half2float(v);
     }
-
-#ifndef USE_AMD_ROCM
     static inline __device__ float to(__nv_bfloat16 v) {
         return __bfloat162float(v);
     }
-#endif // !USE_AMD_ROCM
 };
 
 template <>
@@ -96,9 +93,6 @@ struct ConvertTo<Half4> {
     }
 };
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
-
 template <>
 struct ConvertTo<__nv_bfloat16> {
     static inline __device__ __nv_bfloat16 to(float v) {
@@ -112,8 +106,6 @@ struct ConvertTo<__nv_bfloat16> {
     }
 };
 
-#endif // USE_AMD_ROCM
-
 template <typename From, typename To>
 struct Convert {
     inline __device__ To operator()(From v) const {
diff --git a/faiss/gpu/utils/Float16.cuh b/faiss/gpu/utils/Float16.cuh
diff --git a/faiss/gpu/utils/MathOperators.cuh b/faiss/gpu/utils/MathOperators.cuh
diff --git a/faiss/gpu/utils/MatrixMult-inl.cuh b/faiss/gpu/utils/MatrixMult-inl.cuh

Original file line number	Diff line number	Diff line change
`@@ -275,8 +275,6 @@ void runL2Norm(`
`275`	`275`	`runL2Norm<half, half2>(input, inputRowMajor, output, normSquared, stream);`
`276`	`276`	`}`
`277`	`277`
`278`		`-// no bf16 support for AMD`
`279`		`-#ifndef USE_AMD_ROCM`
`280`	`278`	`void runL2Norm(`
`281`	`279`	`Tensor<__nv_bfloat16, 2, true>& input,`
`282`	`280`	`bool inputRowMajor,`
`@@ -286,7 +284,6 @@ void runL2Norm(`
`286`	`284`	`runL2Norm<__nv_bfloat16, __nv_bfloat162>(`
`287`	`285`	`input, inputRowMajor, output, normSquared, stream);`
`288`	`286`	`}`
`289`		`-#endif`
`290`	`287`
`291`	`288`	`} // namespace gpu`
`292`	`289`	`} // namespace faiss`