Skip to content

Commit 581f23f

Browse files
iotamudeltaketor
authored andcommitted
ROCm support (facebookresearch#3462)
Summary: * add hipify at configure time * ROCm specific code paths behind USE_ROCM guards * support for wavefront 32 (Navi) and 64 (MI) * use builtins to match inline PTX * support C API on ROCm * support Python API on ROCm --------- Pull Request resolved: facebookresearch#3462 Reviewed By: asadoughi Differential Revision: D60431193 Pulled By: ramilbakhshyiev fbshipit-source-id: ac82d5ecb38f995c467e100ed583d5178ae489ee
1 parent e1668ba commit 581f23f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+1799
-863
lines changed

CMakeLists.txt

+28-5
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,13 @@ cmake_minimum_required(VERSION 3.24.0 FATAL_ERROR)
2323
set(FAISS_LANGUAGES CXX)
2424

2525
if(FAISS_ENABLE_GPU)
26-
list(APPEND FAISS_LANGUAGES CUDA)
26+
# if ROCm install detected, assume ROCm/HIP is GPU device
27+
if (EXISTS /opt/rocm)
28+
set(USE_ROCM TRUE)
29+
list(APPEND FAISS_LANGUAGES HIP)
30+
else()
31+
list(APPEND FAISS_LANGUAGES CUDA)
32+
endif()
2733
endif()
2834

2935
if(FAISS_ENABLE_RAFT)
@@ -58,8 +64,17 @@ option(FAISS_ENABLE_PYTHON "Build Python extension." ON)
5864
option(FAISS_ENABLE_C_API "Build C API." OFF)
5965

6066
if(FAISS_ENABLE_GPU)
61-
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
62-
enable_language(CUDA)
67+
if(USE_ROCM)
68+
enable_language(HIP)
69+
add_definitions(-DUSE_ROCM)
70+
find_package(HIP REQUIRED)
71+
find_package(hipBLAS REQUIRED)
72+
set(GPU_EXT_PREFIX "hip")
73+
else ()
74+
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
75+
enable_language(CUDA)
76+
set(GPU_EXT_PREFIX "cu")
77+
endif()
6378
endif()
6479

6580
if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
@@ -69,7 +84,11 @@ if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
6984
add_subdirectory(faiss)
7085

7186
if(FAISS_ENABLE_GPU)
72-
add_subdirectory(faiss/gpu)
87+
if(USE_ROCM)
88+
add_subdirectory(faiss/gpu-rocm)
89+
else()
90+
add_subdirectory(faiss/gpu)
91+
endif()
7392
endif()
7493

7594
if(FAISS_ENABLE_PYTHON)
@@ -90,6 +109,10 @@ if(BUILD_TESTING)
90109
add_subdirectory(tests)
91110

92111
if(FAISS_ENABLE_GPU)
93-
add_subdirectory(faiss/gpu/test)
112+
if(USE_ROCM)
113+
add_subdirectory(faiss/gpu-rocm/test)
114+
else()
115+
add_subdirectory(faiss/gpu/test)
116+
endif()
94117
endif()
95118
endif()

c_api/CMakeLists.txt

+5-1
Original file line numberDiff line numberDiff line change
@@ -56,5 +56,9 @@ add_executable(example_c EXCLUDE_FROM_ALL example_c.c)
5656
target_link_libraries(example_c PRIVATE faiss_c)
5757

5858
if(FAISS_ENABLE_GPU)
59-
add_subdirectory(gpu)
59+
if(USE_ROCM)
60+
add_subdirectory(gpu-rocm)
61+
else ()
62+
add_subdirectory(gpu)
63+
endif()
6064
endif()

c_api/gpu/CMakeLists.txt

+6
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,14 @@ target_sources(faiss_c PRIVATE
1515
file(GLOB FAISS_C_API_GPU_HEADERS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.h")
1616
faiss_install_headers("${FAISS_C_API_GPU_HEADERS}" c_api/gpu)
1717

18+
if (USE_ROCM)
19+
find_package(HIP REQUIRED)
20+
find_package(hipBLAS REQUIRED)
21+
target_link_libraries(faiss_c PUBLIC hip::host roc::hipblas)
22+
else()
1823
find_package(CUDAToolkit REQUIRED)
1924
target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass>)
25+
endif()
2026

2127
add_executable(example_gpu_c EXCLUDE_FROM_ALL example_gpu_c.c)
2228
target_link_libraries(example_gpu_c PRIVATE faiss_c)

faiss/gpu/CMakeLists.txt

+32-19
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,10 @@ function(generate_ivf_interleaved_code)
197197
"64|2048|8"
198198
)
199199

200+
if (USE_ROCM)
201+
list(TRANSFORM FAISS_GPU_SRC REPLACE cu$ hip)
202+
endif()
203+
200204
# Traverse through the Cartesian product of X and Y
201205
foreach(sub_codec ${SUB_CODEC_TYPE})
202206
foreach(metric_type ${SUB_METRIC_TYPE})
@@ -210,10 +214,10 @@ function(generate_ivf_interleaved_code)
210214
set(filename "template_${sub_codec}_${metric_type}_${sub_threads}_${sub_num_warp_q}_${sub_num_thread_q}")
211215
# Remove illegal characters from filename
212216
string(REGEX REPLACE "[^A-Za-z0-9_]" "" filename ${filename})
213-
set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.cu")
217+
set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.${GPU_EXT_PREFIX}")
214218

215219
# Read the template file
216-
file(READ "${CMAKE_CURRENT_SOURCE_DIR}/impl/scan/IVFInterleavedScanKernelTemplate.cu" template_content)
220+
file(READ "${CMAKE_CURRENT_SOURCE_DIR}/impl/scan/IVFInterleavedScanKernelTemplate.${GPU_EXT_PREFIX}" template_content)
217221

218222
# Replace the placeholders
219223
string(REPLACE "SUB_CODEC_TYPE" "${sub_codec}" template_content "${template_content}")
@@ -290,6 +294,10 @@ if(FAISS_ENABLE_RAFT)
290294
target_compile_definitions(faiss_gpu PUBLIC USE_NVIDIA_RAFT=1)
291295
endif()
292296

297+
if (USE_ROCM)
298+
list(TRANSFORM FAISS_GPU_SRC REPLACE cu$ hip)
299+
endif()
300+
293301
# Export FAISS_GPU_HEADERS variable to parent scope.
294302
set(FAISS_GPU_HEADERS ${FAISS_GPU_HEADERS} PARENT_SCOPE)
295303

@@ -305,21 +313,26 @@ foreach(header ${FAISS_GPU_HEADERS})
305313
)
306314
endforeach()
307315

308-
# Prepares a host linker script and enables host linker to support
309-
# very large device object files.
310-
# This is what CUDA 11.5+ `nvcc -hls=gen-lcs -aug-hls` would generate
311-
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld"
312-
[=[
313-
SECTIONS
314-
{
315-
.nvFatBinSegment : { *(.nvFatBinSegment) }
316-
__nv_relfatbin : { *(__nv_relfatbin) }
317-
.nv_fatbin : { *(.nv_fatbin) }
318-
}
319-
]=]
320-
)
321-
target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
316+
if (USE_ROCM)
317+
target_link_libraries(faiss_gpu PRIVATE $<$<BOOL:${USE_ROCM}>:hip::host> $<$<BOOL:${USE_ROCM}>:roc::hipblas>)
318+
target_compile_options(faiss_gpu PRIVATE)
319+
else()
320+
# Prepares a host linker script and enables host linker to support
321+
# very large device object files.
322+
# This is what CUDA 11.5+ `nvcc -hls=gen-lcs -aug-hls` would generate
323+
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld"
324+
[=[
325+
SECTIONS
326+
{
327+
.nvFatBinSegment : { *(.nvFatBinSegment) }
328+
__nv_relfatbin : { *(__nv_relfatbin) }
329+
.nv_fatbin : { *(.nv_fatbin) }
330+
}
331+
]=]
332+
)
333+
target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
322334

323-
find_package(CUDAToolkit REQUIRED)
324-
target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_RAFT}>:OpenMP::OpenMP_CXX>)
325-
target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_RAFT}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
335+
find_package(CUDAToolkit REQUIRED)
336+
target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_RAFT}>:OpenMP::OpenMP_CXX>)
337+
target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_RAFT}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
338+
endif()

faiss/gpu/GpuFaissAssert.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
/// Assertions
1616
///
1717

18-
#ifdef __CUDA_ARCH__
18+
#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
1919
#define GPU_FAISS_ASSERT(X) assert(X)
2020
#define GPU_FAISS_ASSERT_MSG(X, MSG) assert(X)
2121
#define GPU_FAISS_ASSERT_FMT(X, FMT, ...) assert(X)

faiss/gpu/StandardGpuResources.cpp

+9
Original file line numberDiff line numberDiff line change
@@ -363,11 +363,20 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
363363
prop.major,
364364
prop.minor);
365365

366+
#if USE_ROCM
367+
// Our code is pre-built with and expects warpSize == 32 or 64, validate
368+
// that
369+
FAISS_ASSERT_FMT(
370+
prop.warpSize == 32 || prop.warpSize == 64,
371+
"Device id %d does not have expected warpSize of 32 or 64",
372+
device);
373+
#else
366374
// Our code is pre-built with and expects warpSize == 32, validate that
367375
FAISS_ASSERT_FMT(
368376
prop.warpSize == 32,
369377
"Device id %d does not have expected warpSize of 32",
370378
device);
379+
#endif
371380

372381
// Create streams
373382
cudaStream_t defaultStream = nullptr;

0 commit comments

Comments
 (0)