Skip to content

Commit 87d43b9

Browse files
borrrdenfacebook-github-bot
authored andcommitted
Fix AVX2 build on Windows (#3238)
Summary: Tested with both MSVC (with /openmp:llvm) and clang-cl (no particular extra flags needed). This PR is separated into two commits (three after I found out that lines need to be 80 chars or less): 1. Changes needed for clang-cl (and probably stock clang too) 2. Changes needed for MSVC So FAISS can decide either to require using LLVM for Windows (not a hard thing to do these days since it is fully supported inside Visual Studio) and discarding the second commits, or taking them all and documenting the need to use /openmp:llvm Closes #3193 Pull Request resolved: #3238 Reviewed By: mdouze Differential Revision: D53479325 Pulled By: algoriddle fbshipit-source-id: e8628f44626b6f49c5d9d7f259a9e3061cfe5568
1 parent c577f43 commit 87d43b9

File tree

5 files changed

+26
-15
lines changed

5 files changed

+26
-15
lines changed

faiss/impl/LocalSearchQuantizer.cpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -628,7 +628,9 @@ void LocalSearchQuantizer::icm_encode_step(
628628
{
629629
size_t binary_idx = (other_m + 1) * M * K * K +
630630
m * K * K + code2 * K + code;
631-
_mm_prefetch(binaries + binary_idx, _MM_HINT_T0);
631+
_mm_prefetch(
632+
(const char*)(binaries + binary_idx),
633+
_MM_HINT_T0);
632634
}
633635
}
634636
#endif

faiss/impl/platform_macros.h

+9
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,13 @@
4040

4141
#include <intrin.h>
4242

43+
#ifndef __clang__
4344
inline int __builtin_ctzll(uint64_t x) {
4445
unsigned long ret;
4546
_BitScanForward64(&ret, x);
4647
return (int)ret;
4748
}
49+
#endif
4850

4951
// cudatoolkit provides __builtin_ctz for NVCC >= 11.0
5052
#if !defined(__CUDACC__) || __CUDACC_VER_MAJOR__ < 11
@@ -55,13 +57,20 @@ inline int __builtin_ctz(unsigned long x) {
5557
}
5658
#endif
5759

60+
#ifndef __clang__
5861
inline int __builtin_clzll(uint64_t x) {
5962
return (int)__lzcnt64(x);
6063
}
64+
#endif
6165

6266
#define __builtin_popcount __popcnt
6367
#define __builtin_popcountl __popcnt64
6468

69+
#ifndef __clang__
70+
#define __m128i_u __m128i
71+
#define __m256i_u __m256i
72+
#endif
73+
6574
// MSVC does not define __SSEx__, and _M_IX86_FP is only defined on 32-bit
6675
// processors cf.
6776
// https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros

faiss/utils/distances.cpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -417,8 +417,8 @@ void exhaustive_L2sqr_blas_cmax_avx2(
417417
for (int64_t i = i0; i < i1; i++) {
418418
float* ip_line = ip_block.get() + (i - i0) * (j1 - j0);
419419

420-
_mm_prefetch(ip_line, _MM_HINT_NTA);
421-
_mm_prefetch(ip_line + 16, _MM_HINT_NTA);
420+
_mm_prefetch((const char*)ip_line, _MM_HINT_NTA);
421+
_mm_prefetch((const char*)(ip_line + 16), _MM_HINT_NTA);
422422

423423
// constant
424424
const __m256 mul_minus2 = _mm256_set1_ps(-2);
@@ -445,8 +445,8 @@ void exhaustive_L2sqr_blas_cmax_avx2(
445445

446446
// process 16 elements per loop
447447
for (; idx_j < (count / 16) * 16; idx_j += 16, ip_line += 16) {
448-
_mm_prefetch(ip_line + 32, _MM_HINT_NTA);
449-
_mm_prefetch(ip_line + 48, _MM_HINT_NTA);
448+
_mm_prefetch((const char*)(ip_line + 32), _MM_HINT_NTA);
449+
_mm_prefetch((const char*)(ip_line + 48), _MM_HINT_NTA);
450450

451451
// load values for norms
452452
const __m256 y_norm_0 =

faiss/utils/distances_fused/simdlib_based.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ void kernel(
7373

7474
// prefetch the next point
7575
#if defined(__AVX2__)
76-
_mm_prefetch(xd_0 + DIM * sizeof(float), _MM_HINT_NTA);
76+
_mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA);
7777
#endif
7878

7979
// load a single point from x

faiss/utils/distances_simd.cpp

+9-9
Original file line numberDiff line numberDiff line change
@@ -439,14 +439,14 @@ void fvec_op_ny_D2<ElementOpIP>(
439439

440440
if (ny8 > 0) {
441441
// process 8 D2-vectors per loop.
442-
_mm_prefetch(y, _MM_HINT_T0);
443-
_mm_prefetch(y + 16, _MM_HINT_T0);
442+
_mm_prefetch((const char*)y, _MM_HINT_T0);
443+
_mm_prefetch((const char*)(y + 16), _MM_HINT_T0);
444444

445445
const __m256 m0 = _mm256_set1_ps(x[0]);
446446
const __m256 m1 = _mm256_set1_ps(x[1]);
447447

448448
for (i = 0; i < ny8 * 8; i += 8) {
449-
_mm_prefetch(y + 32, _MM_HINT_T0);
449+
_mm_prefetch((const char*)(y + 32), _MM_HINT_T0);
450450

451451
// load 8x2 matrix and transpose it in registers.
452452
// the typical bottleneck is memory access, so
@@ -496,14 +496,14 @@ void fvec_op_ny_D2<ElementOpL2>(
496496

497497
if (ny8 > 0) {
498498
// process 8 D2-vectors per loop.
499-
_mm_prefetch(y, _MM_HINT_T0);
500-
_mm_prefetch(y + 16, _MM_HINT_T0);
499+
_mm_prefetch((const char*)y, _MM_HINT_T0);
500+
_mm_prefetch((const char*)(y + 16), _MM_HINT_T0);
501501

502502
const __m256 m0 = _mm256_set1_ps(x[0]);
503503
const __m256 m1 = _mm256_set1_ps(x[1]);
504504

505505
for (i = 0; i < ny8 * 8; i += 8) {
506-
_mm_prefetch(y + 32, _MM_HINT_T0);
506+
_mm_prefetch((const char*)(y + 32), _MM_HINT_T0);
507507

508508
// load 8x2 matrix and transpose it in registers.
509509
// the typical bottleneck is memory access, so
@@ -1084,8 +1084,8 @@ size_t fvec_L2sqr_ny_nearest_D2(
10841084
// process 8 D2-vectors per loop.
10851085
const size_t ny8 = ny / 8;
10861086
if (ny8 > 0) {
1087-
_mm_prefetch(y, _MM_HINT_T0);
1088-
_mm_prefetch(y + 16, _MM_HINT_T0);
1087+
_mm_prefetch((const char*)y, _MM_HINT_T0);
1088+
_mm_prefetch((const char*)(y + 16), _MM_HINT_T0);
10891089

10901090
// track min distance and the closest vector independently
10911091
// for each of 8 AVX2 components.
@@ -1100,7 +1100,7 @@ size_t fvec_L2sqr_ny_nearest_D2(
11001100
const __m256 m1 = _mm256_set1_ps(x[1]);
11011101

11021102
for (; i < ny8 * 8; i += 8) {
1103-
_mm_prefetch(y + 32, _MM_HINT_T0);
1103+
_mm_prefetch((const char*)(y + 32), _MM_HINT_T0);
11041104

11051105
__m256 v0;
11061106
__m256 v1;

0 commit comments

Comments
 (0)