Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

ROCm support for bfloat16 #4039

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions faiss/gpu/GpuDistance.cu
Original file line number Diff line number Diff line change
Expand Up @@ -402,16 +402,11 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
} else if (args.vectorType == DistanceDataType::F16) {
bfKnnConvert<half>(prov, args);
} else if (args.vectorType == DistanceDataType::BF16) {
// no bf16 support for AMD
#ifndef USE_AMD_ROCM
if (prov->getResources()->supportsBFloat16CurrentDevice()) {
bfKnnConvert<__nv_bfloat16>(prov, args);
} else {
FAISS_THROW_MSG("not compiled with bfloat16 support");
}
#else
FAISS_THROW_MSG("no AMD bfloat16 support");
#endif
} else {
FAISS_THROW_MSG("unknown vectorType");
}
Expand Down
43 changes: 39 additions & 4 deletions faiss/gpu/hipify.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,46 @@
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Usage: ./gpu/hipify.sh
#

function hipify_dir()
{
# print dir name
cd "$1" || exit
echo "Hipifying $(pwd)"

if [ -d ./gpu-tmp ]; then
#Clearing out any leftover files and directories
echo "Removing old ./gpu-tmp"
rm -rf ./gpu-tmp
fi

if [ -d ./gpu ]; then
#Making a temp directory to implement pre hipify rules
echo "Creating ./gpu-tmp"
cp -r ./gpu ./gpu-tmp

# adjust __nv_bfloat162 before hipify because of inaccurate conversions
# adjust __nv_bfloat16 before hipify because of inaccurate conversions
for ext in hip cuh h cpp c cu cuh
do
while IFS= read -r -d '' src
do
sed -i 's@__nv_bfloat162@__hip_bfloat162@' "$src"
sed -i 's@__nv_bfloat16@__hip_bfloat16@' "$src"
done < <(find ./gpu-tmp -name "*.$ext" -print0)
done
else
echo "Can't find the gpu/ dir"
exit
fi

# create all destination directories for hipified files into sibling 'gpu-rocm' directory
while IFS= read -r -d '' src
do
dst="${src//gpu/gpu-rocm}"
dst="${src//gpu-tmp/gpu-rocm}"

if [ -d $dst ]; then
#Clearing out any leftover files and directories
Expand All @@ -24,17 +53,17 @@ function hipify_dir()
#Making directories
echo "Creating $dst"
mkdir -p "$dst"
done < <(find ./gpu -type d -print0)
done < <(find ./gpu-tmp -type d -print0)

# run hipify-perl against all *.cu *.cuh *.h *.cpp files, no renaming
# run all files in parallel to speed up
for ext in cu cuh h cpp c
do
while IFS= read -r -d '' src
do
dst="${src//\.\/gpu/\.\/gpu-rocm}"
dst="${src//\.\/gpu-tmp/\.\/gpu-rocm}"
hipify-perl -o="$dst.tmp" "$src" &
done < <(find ./gpu -name "*.$ext" -print0)
done < <(find ./gpu-tmp -name "*.$ext" -print0)
done
wait

Expand All @@ -45,6 +74,12 @@ function hipify_dir()
mv "$src" "$dst"
done < <(find ./gpu-rocm -name "*.cu.tmp" -print0)

if [ -d ./gpu-tmp ]; then
#Clearing out any leftover files and directories
echo "Removing ./gpu-tmp"
rm -rf ./gpu-tmp
fi

# replace header include statements "<faiss/gpu/" with "<faiss/gpu-rocm"
# replace thrust::cuda::par with thrust::hip::par
# adjust header path location for hipblas.h to avoid unnecessary deprecation warnings
Expand Down
12 changes: 0 additions & 12 deletions faiss/gpu/impl/Distance.cu
Original file line number Diff line number Diff line change
Expand Up @@ -504,8 +504,6 @@ void runAllPairwiseL2Distance(
outDistances);
}

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runAllPairwiseL2Distance(
GpuResources* res,
cudaStream_t stream,
Expand All @@ -526,7 +524,6 @@ void runAllPairwiseL2Distance(
queriesRowMajor,
outDistances);
}
#endif // USE_AMD_ROCM

void runAllPairwiseIPDistance(
GpuResources* res,
Expand Down Expand Up @@ -568,8 +565,6 @@ void runAllPairwiseIPDistance(
outDistances);
}

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runAllPairwiseIPDistance(
GpuResources* res,
cudaStream_t stream,
Expand All @@ -589,7 +584,6 @@ void runAllPairwiseIPDistance(
queriesRowMajor,
outDistances);
}
#endif // USE_AMD_ROCM

void runL2Distance(
GpuResources* res,
Expand Down Expand Up @@ -643,8 +637,6 @@ void runL2Distance(
ignoreOutDistances);
}

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runL2Distance(
GpuResources* res,
cudaStream_t stream,
Expand All @@ -670,7 +662,6 @@ void runL2Distance(
outIndices,
ignoreOutDistances);
}
#endif // USE_AMD_ROCM

void runIPDistance(
GpuResources* res,
Expand Down Expand Up @@ -716,8 +707,6 @@ void runIPDistance(
outIndices);
}

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runIPDistance(
GpuResources* res,
cudaStream_t stream,
Expand All @@ -739,7 +728,6 @@ void runIPDistance(
outDistances,
outIndices);
}
#endif // USE_AMD_ROCM

} // namespace gpu
} // namespace faiss
12 changes: 0 additions & 12 deletions faiss/gpu/impl/Distance.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@ void runAllPairwiseL2Distance(
bool queriesRowMajor,
Tensor<float, 2, true>& outDistances);

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runAllPairwiseL2Distance(
GpuResources* res,
cudaStream_t stream,
Expand All @@ -52,7 +50,6 @@ void runAllPairwiseL2Distance(
Tensor<__nv_bfloat16, 2, true>& queries,
bool queriesRowMajor,
Tensor<float, 2, true>& outDistances);
#endif // USE_AMD_ROCM

void runAllPairwiseIPDistance(
GpuResources* res,
Expand All @@ -72,8 +69,6 @@ void runAllPairwiseIPDistance(
bool queriesRowMajor,
Tensor<float, 2, true>& outDistances);

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runAllPairwiseIPDistance(
GpuResources* res,
cudaStream_t stream,
Expand All @@ -82,7 +77,6 @@ void runAllPairwiseIPDistance(
Tensor<__nv_bfloat16, 2, true>& queries,
bool queriesRowMajor,
Tensor<float, 2, true>& outDistances);
#endif // USE_AMD_ROCM

/// Calculates brute-force L2 distance between `vectors` and
/// `queries`, returning the k closest results seen
Expand Down Expand Up @@ -116,8 +110,6 @@ void runL2Distance(
Tensor<idx_t, 2, true>& outIndices,
bool ignoreOutDistances = false);

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runL2Distance(
GpuResources* resources,
cudaStream_t stream,
Expand All @@ -130,7 +122,6 @@ void runL2Distance(
Tensor<float, 2, true>& outDistances,
Tensor<idx_t, 2, true>& outIndices,
bool ignoreOutDistances = false);
#endif // USE_AMD_ROCM

/// Calculates brute-force inner product distance between `vectors`
/// and `queries`, returning the k closest results seen
Expand All @@ -156,8 +147,6 @@ void runIPDistance(
Tensor<float, 2, true>& outDistances,
Tensor<idx_t, 2, true>& outIndices);

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runIPDistance(
GpuResources* resources,
cudaStream_t stream,
Expand All @@ -168,7 +157,6 @@ void runIPDistance(
int k,
Tensor<float, 2, true>& outDistances,
Tensor<idx_t, 2, true>& outIndices);
#endif // USE_AMD_ROCM

//
// General distance implementation, assumes that all arguments are on the
Expand Down
3 changes: 0 additions & 3 deletions faiss/gpu/impl/L2Norm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -275,8 +275,6 @@ void runL2Norm(
runL2Norm<half, half2>(input, inputRowMajor, output, normSquared, stream);
}

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runL2Norm(
Tensor<__nv_bfloat16, 2, true>& input,
bool inputRowMajor,
Expand All @@ -286,7 +284,6 @@ void runL2Norm(
runL2Norm<__nv_bfloat16, __nv_bfloat162>(
input, inputRowMajor, output, normSquared, stream);
}
#endif

} // namespace gpu
} // namespace faiss
3 changes: 0 additions & 3 deletions faiss/gpu/impl/L2Norm.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,12 @@ void runL2Norm(
bool normSquared,
cudaStream_t stream);

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runL2Norm(
Tensor<__nv_bfloat16, 2, true>& input,
bool inputRowMajor,
Tensor<float, 1, true>& output,
bool normSquared,
cudaStream_t stream);
#endif

} // namespace gpu
} // namespace faiss
8 changes: 0 additions & 8 deletions faiss/gpu/utils/ConversionOperators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,9 @@ struct ConvertTo<float> {
static inline __device__ float to(half v) {
return __half2float(v);
}

#ifndef USE_AMD_ROCM
static inline __device__ float to(__nv_bfloat16 v) {
return __bfloat162float(v);
}
#endif // !USE_AMD_ROCM
};

template <>
Expand Down Expand Up @@ -96,9 +93,6 @@ struct ConvertTo<Half4> {
}
};

// no bf16 support for AMD
#ifndef USE_AMD_ROCM

template <>
struct ConvertTo<__nv_bfloat16> {
static inline __device__ __nv_bfloat16 to(float v) {
Expand All @@ -112,8 +106,6 @@ struct ConvertTo<__nv_bfloat16> {
}
};

#endif // USE_AMD_ROCM

template <typename From, typename To>
struct Convert {
inline __device__ To operator()(From v) const {
Expand Down
17 changes: 7 additions & 10 deletions faiss/gpu/utils/Float16.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,22 @@
#include <faiss/gpu/utils/DeviceUtils.h>

// Some compute capabilities have full float16 ALUs.
#if __CUDA_ARCH__ >= 530 || defined(USE_AMD_ROCM)
#if __CUDA_ARCH__ >= 530
#define FAISS_USE_FULL_FLOAT16 1
#endif // __CUDA_ARCH__ types

// Some compute capabilities have full bfloat16 ALUs.
// FIXME: no support in ROCm yet
#if __CUDA_ARCH__ >= 800 // || defined(USE_AMD_ROCM)
#if __CUDA_ARCH__ >= 800 || defined(USE_AMD_ROCM)
#define FAISS_USE_FULL_BFLOAT16 1
#endif // __CUDA_ARCH__ types

#include <cuda_fp16.h>
#if !defined(USE_AMD_ROCM)
#include <cuda_bf16.h>
#endif
// #else
// FIXME: no support in ROCm yet
// #include <amd_hip_bf16.h>
// #include <amd_hip_fp16.h>
// #endif // !defined(USE_AMD_ROCM)
#include <cuda_fp16.h>
#else
#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
#endif // !defined(USE_AMD_ROCM)

namespace faiss {
namespace gpu {
Expand Down
6 changes: 1 addition & 5 deletions faiss/gpu/utils/MathOperators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -556,8 +556,6 @@ struct Math<Half8> {
}
};

#ifndef USE_AMD_ROCM

template <>
struct Math<__nv_bfloat16> {
typedef __nv_bfloat16 ScalarType;
Expand Down Expand Up @@ -626,7 +624,7 @@ struct Math<__nv_bfloat16> {
}

static inline __device__ __nv_bfloat16 zero() {
#if CUDA_VERSION >= 9000
#if CUDA_VERSION >= 9000 || defined(USE_AMD_ROCM)
return 0.0f;
#else
__nv_bfloat16 h;
Expand Down Expand Up @@ -789,7 +787,5 @@ struct Math<__nv_bfloat162> {
}
};

#endif // !USE_AMD_ROCM

} // namespace gpu
} // namespace faiss
9 changes: 4 additions & 5 deletions faiss/gpu/utils/MatrixMult-inl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,10 @@ struct GetCudaType<half> {
static constexpr hipblasDatatype_t Type = HIPBLAS_R_16F;
};

// FIXME: no AMD support for bf16
// template <>
// struct GetCudaType<__nv_bfloat16> {
// static constexpr hipblasDatatype_t Type = HIPBLAS_R_16B;
// };
// Map __hip_bfloat16 to the hipBLAS bf16 datatype enum on the ROCm path,
// paralleling the GetCudaType<half> -> HIPBLAS_R_16F specialization above.
template <>
struct GetCudaType<__hip_bfloat16> {
static constexpr hipblasDatatype_t Type = HIPBLAS_R_16B;
};

#else

Expand Down
Loading