Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROCm support #3462

Closed
Closed
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 28 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,13 @@ cmake_minimum_required(VERSION 3.24.0 FATAL_ERROR)
set(FAISS_LANGUAGES CXX)

if(FAISS_ENABLE_GPU)
list(APPEND FAISS_LANGUAGES CUDA)
# if ROCm install detected, assume ROCm/HIP is GPU device
if (EXISTS /opt/rocm)
set(USE_ROCM TRUE)
list(APPEND FAISS_LANGUAGES HIP)
else()
list(APPEND FAISS_LANGUAGES CUDA)
endif()
endif()

if(FAISS_ENABLE_RAFT)
Expand Down Expand Up @@ -58,8 +64,17 @@ option(FAISS_ENABLE_PYTHON "Build Python extension." ON)
option(FAISS_ENABLE_C_API "Build C API." OFF)

if(FAISS_ENABLE_GPU)
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
enable_language(CUDA)
if(USE_ROCM)
enable_language(HIP)
add_definitions(-DUSE_ROCM)
find_package(HIP REQUIRED)
find_package(hipBLAS REQUIRED)
set(GPU_EXT_PREFIX "hip")
else ()
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
enable_language(CUDA)
set(GPU_EXT_PREFIX "cu")
endif()
endif()

if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
Expand All @@ -69,7 +84,11 @@ if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
add_subdirectory(faiss)

if(FAISS_ENABLE_GPU)
add_subdirectory(faiss/gpu)
if(USE_ROCM)
add_subdirectory(faiss/gpu-rocm)
else()
add_subdirectory(faiss/gpu)
endif()
endif()

if(FAISS_ENABLE_PYTHON)
Expand All @@ -90,6 +109,10 @@ if(BUILD_TESTING)
add_subdirectory(tests)

if(FAISS_ENABLE_GPU)
add_subdirectory(faiss/gpu/test)
if(USE_ROCM)
add_subdirectory(faiss/gpu-rocm/test)
else()
add_subdirectory(faiss/gpu/test)
endif()
endif()
endif()
6 changes: 5 additions & 1 deletion c_api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,9 @@ add_executable(example_c EXCLUDE_FROM_ALL example_c.c)
target_link_libraries(example_c PRIVATE faiss_c)

if(FAISS_ENABLE_GPU)
add_subdirectory(gpu)
if(USE_ROCM)
add_subdirectory(gpu-rocm)
else ()
add_subdirectory(gpu)
endif()
endif()
6 changes: 6 additions & 0 deletions c_api/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,14 @@ target_sources(faiss_c PRIVATE
file(GLOB FAISS_C_API_GPU_HEADERS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.h")
faiss_install_headers("${FAISS_C_API_GPU_HEADERS}" c_api/gpu)

if (USE_ROCM)
find_package(HIP REQUIRED)
find_package(hipBLAS REQUIRED)
target_link_libraries(faiss_c PUBLIC hip::host roc::hipblas)
else()
find_package(CUDAToolkit REQUIRED)
target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass>)
endif()

add_executable(example_gpu_c EXCLUDE_FROM_ALL example_gpu_c.c)
target_link_libraries(example_gpu_c PRIVATE faiss_c)
51 changes: 32 additions & 19 deletions faiss/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,10 @@ function(generate_ivf_interleaved_code)
"64|2048|8"
)

if (USE_ROCM)
list(TRANSFORM FAISS_GPU_SRC REPLACE cu$ hip)
endif()

# Traverse through the Cartesian product of X and Y
foreach(sub_codec ${SUB_CODEC_TYPE})
foreach(metric_type ${SUB_METRIC_TYPE})
Expand All @@ -210,10 +214,10 @@ function(generate_ivf_interleaved_code)
set(filename "template_${sub_codec}_${metric_type}_${sub_threads}_${sub_num_warp_q}_${sub_num_thread_q}")
# Remove illegal characters from filename
string(REGEX REPLACE "[^A-Za-z0-9_]" "" filename ${filename})
set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.cu")
set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.${GPU_EXT_PREFIX}")

# Read the template file
file(READ "${CMAKE_CURRENT_SOURCE_DIR}/impl/scan/IVFInterleavedScanKernelTemplate.cu" template_content)
file(READ "${CMAKE_CURRENT_SOURCE_DIR}/impl/scan/IVFInterleavedScanKernelTemplate.${GPU_EXT_PREFIX}" template_content)

# Replace the placeholders
string(REPLACE "SUB_CODEC_TYPE" "${sub_codec}" template_content "${template_content}")
Expand Down Expand Up @@ -290,6 +294,10 @@ if(FAISS_ENABLE_RAFT)
target_compile_definitions(faiss_gpu PUBLIC USE_NVIDIA_RAFT=1)
endif()

if (USE_ROCM)
list(TRANSFORM FAISS_GPU_SRC REPLACE cu$ hip)
endif()

# Export FAISS_GPU_HEADERS variable to parent scope.
set(FAISS_GPU_HEADERS ${FAISS_GPU_HEADERS} PARENT_SCOPE)

Expand All @@ -305,21 +313,26 @@ foreach(header ${FAISS_GPU_HEADERS})
)
endforeach()

# Prepares a host linker script and enables host linker to support
# very large device object files.
# This is what CUDA 11.5+ `nvcc -hls=gen-lcs -aug-hls` would generate
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld"
[=[
SECTIONS
{
.nvFatBinSegment : { *(.nvFatBinSegment) }
__nv_relfatbin : { *(__nv_relfatbin) }
.nv_fatbin : { *(.nv_fatbin) }
}
]=]
)
target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
if (USE_ROCM)
target_link_libraries(faiss_gpu PRIVATE $<$<BOOL:${USE_ROCM}>:hip::host> $<$<BOOL:${USE_ROCM}>:roc::hipblas>)
target_compile_options(faiss_gpu PRIVATE)
else()
# Prepares a host linker script and enables host linker to support
# very large device object files.
# This is what CUDA 11.5+ `nvcc -hls=gen-lcs -aug-hls` would generate
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld"
[=[
SECTIONS
{
.nvFatBinSegment : { *(.nvFatBinSegment) }
__nv_relfatbin : { *(__nv_relfatbin) }
.nv_fatbin : { *(.nv_fatbin) }
}
]=]
)
target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")

find_package(CUDAToolkit REQUIRED)
target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_RAFT}>:OpenMP::OpenMP_CXX>)
target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_RAFT}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
find_package(CUDAToolkit REQUIRED)
target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_RAFT}>:OpenMP::OpenMP_CXX>)
target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_RAFT}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
endif()
2 changes: 1 addition & 1 deletion faiss/gpu/GpuFaissAssert.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
/// Assertions
///

#ifdef __CUDA_ARCH__
#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
#define GPU_FAISS_ASSERT(X) assert(X)
#define GPU_FAISS_ASSERT_MSG(X, MSG) assert(X)
#define GPU_FAISS_ASSERT_FMT(X, FMT, ...) assert(X)
Expand Down
9 changes: 9 additions & 0 deletions faiss/gpu/StandardGpuResources.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -363,11 +363,20 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
prop.major,
prop.minor);

#if USE_ROCM
// Our code is pre-built with and expects warpSize == 32 or 64, validate
// that
FAISS_ASSERT_FMT(
prop.warpSize == 32 || prop.warpSize == 64,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this ROCm specific? If so, can we allow 64 only for ROCm?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have both wavefront 32 (E.g. navi) and 64 (E.g. MI250) devices. So this offers support for both.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It sounds like Nvidia is 32 only and ROCm is 32 or 64. Should we lock it accordingly in code?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If that is desired, I could rework that assert using a ROCm flag to only allow a warpSize of 64 (and 32) on ROCm devices. It shouldn't be an issue at all!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think I would do that.

"Device id %d does not have expected warpSize of 32 or 64",
device);
#else
// Our code is pre-built with and expects warpSize == 32, validate that
FAISS_ASSERT_FMT(
prop.warpSize == 32,
"Device id %d does not have expected warpSize of 32",
device);
#endif

// Create streams
cudaStream_t defaultStream = nullptr;
Expand Down
Loading
Loading