Skip to content

Commit 1cb1e54

Browse files
Alexandr Guzhva authored and facebook-github-bot committed
upgrade horizontal sum in distance_single_code for PQ/IVFPQ (facebookresearch#2830)
Summary:
Pull Request resolved: facebookresearch#2830

17 cycles per AVX2 horizontal sum instead of 19

Reviewed By: mdouze

Differential Revision: D45244153

fbshipit-source-id: 15accba2e8b4f12725dba41696c302e72f61c2db
1 parent d0ba4c0 commit 1cb1e54

File tree

1 file changed

+12
-14
lines changed

1 file changed

+12
-14
lines changed

faiss/impl/code_distance/code_distance-avx2.h

+12-14
Original file line numberDiff line numberDiff line change
@@ -17,21 +17,19 @@
1717

1818
namespace {
1919

20-
// Computes a horizontal sum over an __m256 register
21-
inline float horizontal_sum(const __m256 reg) {
22-
const __m256 h0 = _mm256_hadd_ps(reg, reg);
23-
const __m256 h1 = _mm256_hadd_ps(h0, h0);
24-
25-
// extract high and low __m128 regs from __m256
26-
const __m128 h2 = _mm256_extractf128_ps(h1, 1);
27-
const __m128 h3 = _mm256_castps256_ps128(h1);
28-
29-
// get a final hsum into all 4 regs
30-
const __m128 h4 = _mm_add_ss(h2, h3);
20+
// Computes a horizontal sum over an __m128 register
inline float horizontal_sum(const __m128 v) {
    // Move lanes 2 and 3 down to lanes 0 and 1, then add: lanes 0 and 1 of
    // `pairs` hold v[0] + v[2] and v[1] + v[3].
    const __m128 hi = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 3, 2));
    const __m128 pairs = _mm_add_ps(v, hi);
    // Move lane 1 down to lane 0 and add, so lane 0 holds the full sum.
    const __m128 lane1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(0, 0, 0, 1));
    const __m128 total = _mm_add_ps(pairs, lane1);
    // Only lane 0 is read; the remaining lanes are not used.
    return _mm_cvtss_f32(total);
}
3127

32-
// extract f[0] from __m128
33-
const float hsum = _mm_cvtss_f32(h4);
34-
return hsum;
28+
// Computes a horizontal sum over an __m256 register
29+
inline float horizontal_sum(const __m256 v) {
30+
const __m128 v0 =
31+
_mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
32+
return horizontal_sum(v0);
3533
}
3634

3735
} // namespace

0 commit comments

Comments
 (0)