
Commit 37f52dc

ItsPitt authored and facebook-github-bot committed
ROCm support for bfloat16 (#4039)
Summary: Updated the hipify script to handle the bfloat16 conversion and removed the USE_AMD_ROCM guards that disabled bfloat16 for ROCm.

Pull Request resolved: #4039
Reviewed By: junjieqi
Differential Revision: D66413190
Pulled By: asadoughi
fbshipit-source-id: d1564f87e3c3466ff929dfd639bd544318371148
1 parent d1ae64e commit 37f52dc

10 files changed: +51, -67 lines

faiss/gpu/GpuDistance.cu (-5 lines)

@@ -402,16 +402,11 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
     } else if (args.vectorType == DistanceDataType::F16) {
         bfKnnConvert<half>(prov, args);
     } else if (args.vectorType == DistanceDataType::BF16) {
-        // no bf16 support for AMD
-#ifndef USE_AMD_ROCM
         if (prov->getResources()->supportsBFloat16CurrentDevice()) {
             bfKnnConvert<__nv_bfloat16>(prov, args);
         } else {
             FAISS_THROW_MSG("not compiled with bfloat16 support");
         }
-#else
-        FAISS_THROW_MSG("no AMD bfloat16 support");
-#endif
     } else {
         FAISS_THROW_MSG("unknown vectorType");
     }
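With the #ifndef USE_AMD_ROCM guard removed, the BF16 branch above is now reachable in ROCm builds as well. A minimal caller sketch (illustrative only, not part of this commit; the GpuDistanceParams field names follow the current faiss/gpu/GpuDistance.h, and xb/xq are hypothetical buffers of bf16 data prepared by the caller):

#include <faiss/gpu/GpuDistance.h>
#include <faiss/gpu/StandardGpuResources.h>

// Brute-force k-NN over bfloat16 vectors. Throws if the build or device
// cannot handle bf16; after this commit the same call works under ROCm.
void bf16Knn(
        const void* xb,           // numVectors * dims bf16 values
        const void* xq,           // numQueries * dims bf16 values
        int dims,
        faiss::idx_t numVectors,
        faiss::idx_t numQueries,
        int k,
        float* outDistances,      // numQueries * k
        faiss::idx_t* outIndices) // numQueries * k
{
    faiss::gpu::StandardGpuResources res;

    faiss::gpu::GpuDistanceParams args;
    args.metric = faiss::METRIC_L2;
    args.k = k;
    args.dims = dims;
    args.vectors = xb;
    args.vectorType = faiss::gpu::DistanceDataType::BF16;
    args.numVectors = numVectors;
    args.queries = xq;
    args.queryType = faiss::gpu::DistanceDataType::BF16;
    args.numQueries = numQueries;
    args.outDistances = outDistances;
    args.outIndices = outIndices;

    // Dispatches into the BF16 branch above; throws
    // "not compiled with bfloat16 support" on devices/builds without bf16.
    faiss::gpu::bfKnn(&res, args);
}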

faiss/gpu/hipify.sh (+39, -4 lines)

@@ -3,17 +3,46 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
+#
+# Usage: ./gpu/hipify.sh
+#

 function hipify_dir()
 {
     # print dir name
     cd "$1" || exit
     echo "Hipifying $(pwd)"

+    if [ -d ./gpu-tmp ]; then
+        #Clearing out any leftover files and directories
+        echo "Removing old ./gpu-tmp"
+        rm -rf ./gpu-tmp
+    fi
+
+    if [ -d ./gpu ]; then
+        #Making a temp directory to implement pre hipify rules
+        echo "Creating ./gpu-tmp"
+        cp -r ./gpu ./gpu-tmp
+
+        # adjust __nv_bfloat162 before hipify because of inaccurate conversions
+        # adjust __nv_bfloat16 before hipify because of inaccurate conversions
+        for ext in hip cuh h cpp c cu cuh
+        do
+            while IFS= read -r -d '' src
+            do
+                sed -i 's@__nv_bfloat162@__hip_bfloat162@' "$src"
+                sed -i 's@__nv_bfloat16@__hip_bfloat16@' "$src"
+            done < <(find ./gpu-tmp -name "*.$ext" -print0)
+        done
+    else
+        echo "Can't find the gpu/ dir"
+        exit
+    fi
+
     # create all destination directories for hipified files into sibling 'gpu-rocm' directory
     while IFS= read -r -d '' src
     do
-        dst="${src//gpu/gpu-rocm}"
+        dst="${src//gpu-tmp/gpu-rocm}"

         if [ -d $dst ]; then
             #Clearing out any leftover files and directories
@@ -24,17 +53,17 @@ function hipify_dir()
         #Making directories
         echo "Creating $dst"
         mkdir -p "$dst"
-    done < <(find ./gpu -type d -print0)
+    done < <(find ./gpu-tmp -type d -print0)

     # run hipify-perl against all *.cu *.cuh *.h *.cpp files, no renaming
     # run all files in parallel to speed up
     for ext in cu cuh h cpp c
     do
         while IFS= read -r -d '' src
         do
-            dst="${src//\.\/gpu/\.\/gpu-rocm}"
+            dst="${src//\.\/gpu-tmp/\.\/gpu-rocm}"
             hipify-perl -o="$dst.tmp" "$src" &
-        done < <(find ./gpu -name "*.$ext" -print0)
+        done < <(find ./gpu-tmp -name "*.$ext" -print0)
     done
     wait

@@ -45,6 +74,12 @@ function hipify_dir()
         mv "$src" "$dst"
     done < <(find ./gpu-rocm -name "*.cu.tmp" -print0)

+    if [ -d ./gpu-tmp ]; then
+        #Clearing out any leftover files and directories
+        echo "Removing ./gpu-tmp"
+        rm -rf ./gpu-tmp
+    fi
+
     # replace header include statements "<faiss/gpu/" with "<faiss/gpu-rocm"
     # replace thrust::cuda::par with thrust::hip::par
     # adjust header path location for hipblas.h to avoid unnecessary deprecation warnings

faiss/gpu/impl/Distance.cu (-12 lines)

@@ -504,8 +504,6 @@ void runAllPairwiseL2Distance(
             outDistances);
 }

-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runAllPairwiseL2Distance(
         GpuResources* res,
         cudaStream_t stream,
@@ -526,7 +524,6 @@ void runAllPairwiseL2Distance(
             queriesRowMajor,
             outDistances);
 }
-#endif // USE_AMD_ROCM

 void runAllPairwiseIPDistance(
         GpuResources* res,
@@ -568,8 +565,6 @@ void runAllPairwiseIPDistance(
             outDistances);
 }

-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runAllPairwiseIPDistance(
         GpuResources* res,
         cudaStream_t stream,
@@ -589,7 +584,6 @@ void runAllPairwiseIPDistance(
             queriesRowMajor,
             outDistances);
 }
-#endif // USE_AMD_ROCM

 void runL2Distance(
         GpuResources* res,
@@ -643,8 +637,6 @@ void runL2Distance(
             ignoreOutDistances);
 }

-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runL2Distance(
         GpuResources* res,
         cudaStream_t stream,
@@ -670,7 +662,6 @@ void runL2Distance(
             outIndices,
             ignoreOutDistances);
 }
-#endif // USE_AMD_ROCM

 void runIPDistance(
         GpuResources* res,
@@ -716,8 +707,6 @@ void runIPDistance(
             outIndices);
 }

-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runIPDistance(
         GpuResources* res,
         cudaStream_t stream,
@@ -739,7 +728,6 @@ void runIPDistance(
             outDistances,
             outIndices);
 }
-#endif // USE_AMD_ROCM

 } // namespace gpu
 } // namespace faiss

faiss/gpu/impl/Distance.cuh (-12 lines)

@@ -41,8 +41,6 @@ void runAllPairwiseL2Distance(
         bool queriesRowMajor,
         Tensor<float, 2, true>& outDistances);

-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runAllPairwiseL2Distance(
         GpuResources* res,
         cudaStream_t stream,
@@ -52,7 +50,6 @@ void runAllPairwiseL2Distance(
         Tensor<__nv_bfloat16, 2, true>& queries,
         bool queriesRowMajor,
         Tensor<float, 2, true>& outDistances);
-#endif // USE_AMD_ROCM

 void runAllPairwiseIPDistance(
         GpuResources* res,
@@ -72,8 +69,6 @@ void runAllPairwiseIPDistance(
         bool queriesRowMajor,
         Tensor<float, 2, true>& outDistances);

-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runAllPairwiseIPDistance(
         GpuResources* res,
         cudaStream_t stream,
@@ -82,7 +77,6 @@ void runAllPairwiseIPDistance(
         Tensor<__nv_bfloat16, 2, true>& queries,
         bool queriesRowMajor,
         Tensor<float, 2, true>& outDistances);
-#endif // USE_AMD_ROCM

 /// Calculates brute-force L2 distance between `vectors` and
 /// `queries`, returning the k closest results seen
@@ -116,8 +110,6 @@ void runL2Distance(
         Tensor<idx_t, 2, true>& outIndices,
         bool ignoreOutDistances = false);

-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runL2Distance(
         GpuResources* resources,
         cudaStream_t stream,
@@ -130,7 +122,6 @@ void runL2Distance(
         Tensor<float, 2, true>& outDistances,
         Tensor<idx_t, 2, true>& outIndices,
         bool ignoreOutDistances = false);
-#endif // USE_AMD_ROCM

 /// Calculates brute-force inner product distance between `vectors`
 /// and `queries`, returning the k closest results seen
@@ -156,8 +147,6 @@ void runIPDistance(
         Tensor<float, 2, true>& outDistances,
         Tensor<idx_t, 2, true>& outIndices);

-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runIPDistance(
         GpuResources* resources,
         cudaStream_t stream,
@@ -168,7 +157,6 @@ void runIPDistance(
         int k,
         Tensor<float, 2, true>& outDistances,
         Tensor<idx_t, 2, true>& outIndices);
-#endif // USE_AMD_ROCM

 //
 // General distance implementation, assumes that all arguments are on the

faiss/gpu/impl/L2Norm.cu (-3 lines)

@@ -275,8 +275,6 @@ void runL2Norm(
     runL2Norm<half, half2>(input, inputRowMajor, output, normSquared, stream);
 }

-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runL2Norm(
         Tensor<__nv_bfloat16, 2, true>& input,
         bool inputRowMajor,
@@ -286,7 +284,6 @@ void runL2Norm(
     runL2Norm<__nv_bfloat16, __nv_bfloat162>(
             input, inputRowMajor, output, normSquared, stream);
 }
-#endif

 } // namespace gpu
 } // namespace faiss

faiss/gpu/impl/L2Norm.cuh (-3 lines)

@@ -27,15 +27,12 @@ void runL2Norm(
         bool normSquared,
         cudaStream_t stream);

-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runL2Norm(
         Tensor<__nv_bfloat16, 2, true>& input,
         bool inputRowMajor,
         Tensor<float, 1, true>& output,
         bool normSquared,
         cudaStream_t stream);
-#endif

 } // namespace gpu
 } // namespace faiss

faiss/gpu/utils/ConversionOperators.cuh (-8 lines)

@@ -38,12 +38,9 @@ struct ConvertTo<float> {
     static inline __device__ float to(half v) {
         return __half2float(v);
     }
-
-#ifndef USE_AMD_ROCM
     static inline __device__ float to(__nv_bfloat16 v) {
         return __bfloat162float(v);
     }
-#endif // !USE_AMD_ROCM
 };

 template <>
@@ -96,9 +93,6 @@ struct ConvertTo<Half4> {
     }
 };

-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
-
 template <>
 struct ConvertTo<__nv_bfloat16> {
     static inline __device__ __nv_bfloat16 to(float v) {
@@ -112,8 +106,6 @@ struct ConvertTo<__nv_bfloat16> {
     }
 };

-#endif // USE_AMD_ROCM
-
 template <typename From, typename To>
 struct Convert {
     inline __device__ To operator()(From v) const {
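With the guards gone, ConvertTo<float>::to(__nv_bfloat16) and ConvertTo<__nv_bfloat16> compile identically for CUDA and ROCm (the hipify step rewrites the type name to __hip_bfloat16). A small illustrative kernel, not part of faiss, showing the usual pattern of reading bf16 storage through ConvertTo<float> and accumulating in float:

#include <faiss/gpu/utils/ConversionOperators.cuh>

#include <cstdint>

namespace faiss {
namespace gpu {

// Illustrative only: one thread per row; works for T = float, half, or
// __nv_bfloat16 thanks to the ConvertTo<float> overloads above.
template <typename T>
__global__ void squaredNormPerRow(
        const T* __restrict__ x,
        float* __restrict__ out,
        int dim,
        int numRows) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= numRows) {
        return;
    }

    float sum = 0.0f;
    for (int i = 0; i < dim; ++i) {
        float v = ConvertTo<float>::to(x[int64_t(row) * dim + i]);
        sum += v * v;
    }
    out[row] = sum;
}

} // namespace gpu
} // namespace faiss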

faiss/gpu/utils/Float16.cuh (+7, -10 lines)

@@ -12,25 +12,22 @@
 #include <faiss/gpu/utils/DeviceUtils.h>

 // Some compute capabilities have full float16 ALUs.
-#if __CUDA_ARCH__ >= 530 || defined(USE_AMD_ROCM)
+#if __CUDA_ARCH__ >= 530
 #define FAISS_USE_FULL_FLOAT16 1
 #endif // __CUDA_ARCH__ types

 // Some compute capabilities have full bfloat16 ALUs.
-// FIXME: no support in ROCm yet
-#if __CUDA_ARCH__ >= 800 // || defined(USE_AMD_ROCM)
+#if __CUDA_ARCH__ >= 800 || defined(USE_AMD_ROCM)
 #define FAISS_USE_FULL_BFLOAT16 1
 #endif // __CUDA_ARCH__ types

-#include <cuda_fp16.h>
 #if !defined(USE_AMD_ROCM)
 #include <cuda_bf16.h>
-#endif
-// #else
-// FIXME: no support in ROCm yet
-// #include <amd_hip_bf16.h>
-// #include <amd_hip_fp16.h>
-// #endif // !defined(USE_AMD_ROCM)
+#include <cuda_fp16.h>
+#else
+#include <hip/hip_bf16.h>
+#include <hip/hip_fp16.h>
+#endif // !defined(USE_AMD_ROCM)

 namespace faiss {
 namespace gpu {
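The include block now pulls in hip/hip_bf16.h and hip/hip_fp16.h on ROCm, and FAISS_USE_FULL_BFLOAT16 is defined there as well, so bf16 device code can take a native-ALU path. A sketch of the guard pattern this enables (illustrative; bf16Add is not a faiss function, and __hadd/__float2bfloat16/__bfloat162float are assumed to come from cuda_bf16.h or its hip_bf16.h counterpart):

#include <faiss/gpu/utils/Float16.cuh>

// Illustrative only: add two bf16 values, using native bf16 arithmetic where
// the target supports it (CUDA sm_80+, or ROCm after this change) and
// falling back to float math elsewhere. Note that hipify rewrites
// __nv_bfloat16 to __hip_bfloat16 for the ROCm build.
static inline __device__ __nv_bfloat16 bf16Add(__nv_bfloat16 a, __nv_bfloat16 b) {
#ifdef FAISS_USE_FULL_BFLOAT16
    return __hadd(a, b);
#else
    return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b));
#endif
}

This is the same pattern the Math<__nv_bfloat16> specializations in MathOperators.cuh below rely on.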

faiss/gpu/utils/MathOperators.cuh (+1, -5 lines)

@@ -556,8 +556,6 @@ struct Math<Half8> {
     }
 };

-#ifndef USE_AMD_ROCM
-
 template <>
 struct Math<__nv_bfloat16> {
     typedef __nv_bfloat16 ScalarType;
@@ -626,7 +624,6 @@ struct Math<__nv_bfloat16> {
     }

     static inline __device__ __nv_bfloat16 zero() {
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000 || defined(USE_AMD_ROCM)
         return 0.0f;
 #else
         __nv_bfloat16 h;
@@ -789,7 +787,5 @@ struct Math<__nv_bfloat162> {
     }
 };

-#endif // !USE_AMD_ROCM
-
 } // namespace gpu
 } // namespace faiss

faiss/gpu/utils/MatrixMult-inl.cuh (+4, -5 lines)

@@ -32,11 +32,10 @@ struct GetCudaType<half> {
     static constexpr hipblasDatatype_t Type = HIPBLAS_R_16F;
 };

-// FIXME: no AMD support for bf16
-// template <>
-// struct GetCudaType<__nv_bfloat16> {
-//     static constexpr hipblasDatatype_t Type = HIPBLAS_R_16B;
-// };
+template <>
+struct GetCudaType<__hip_bfloat16> {
+    static constexpr hipblasDatatype_t Type = HIPBLAS_R_16B;
+};

 #else
