@@ -101,8 +101,7 @@ struct Codec8bit {
101
101
}
102
102
float32x4_t res1 = vld1q_f32 (result);
103
103
float32x4_t res2 = vld1q_f32 (result + 4 );
104
- float32x4x2_t res = vzipq_f32 (res1, res2);
105
- return vuzpq_f32 (res.val [0 ], res.val [1 ]);
104
+ return {res1, res2};
106
105
}
107
106
#endif
108
107
};
@@ -153,8 +152,7 @@ struct Codec4bit {
153
152
}
154
153
float32x4_t res1 = vld1q_f32 (result);
155
154
float32x4_t res2 = vld1q_f32 (result + 4 );
156
- float32x4x2_t res = vzipq_f32 (res1, res2);
157
- return vuzpq_f32 (res.val [0 ], res.val [1 ]);
155
+ return {res1, res2};
158
156
}
159
157
#endif
160
158
};
@@ -266,8 +264,7 @@ struct Codec6bit {
266
264
}
267
265
float32x4_t res1 = vld1q_f32 (result);
268
266
float32x4_t res2 = vld1q_f32 (result + 4 );
269
- float32x4x2_t res = vzipq_f32 (res1, res2);
270
- return vuzpq_f32 (res.val [0 ], res.val [1 ]);
267
+ return {res1, res2};
271
268
}
272
269
#endif
273
270
};
@@ -345,16 +342,14 @@ struct QuantizerTemplate<Codec, true, 8> : QuantizerTemplate<Codec, true, 1> {
345
342
FAISS_ALWAYS_INLINE float32x4x2_t
346
343
reconstruct_8_components (const uint8_t * code, int i) const {
347
344
float32x4x2_t xi = Codec::decode_8_components (code, i);
348
- float32x4x2_t res = vzipq_f32 (
349
- vfmaq_f32 (
345
+ return {vfmaq_f32 (
350
346
vdupq_n_f32 (this ->vmin ),
351
347
xi.val [0 ],
352
348
vdupq_n_f32 (this ->vdiff )),
353
349
vfmaq_f32 (
354
350
vdupq_n_f32 (this ->vmin ),
355
351
xi.val [1 ],
356
- vdupq_n_f32 (this ->vdiff )));
357
- return vuzpq_f32 (res.val [0 ], res.val [1 ]);
352
+ vdupq_n_f32 (this ->vdiff ))};
358
353
}
359
354
};
360
355
@@ -431,10 +426,8 @@ struct QuantizerTemplate<Codec, false, 8> : QuantizerTemplate<Codec, false, 1> {
431
426
float32x4x2_t vmin_8 = vld1q_f32_x2 (this ->vmin + i);
432
427
float32x4x2_t vdiff_8 = vld1q_f32_x2 (this ->vdiff + i);
433
428
434
- float32x4x2_t res = vzipq_f32 (
435
- vfmaq_f32 (vmin_8.val [0 ], xi.val [0 ], vdiff_8.val [0 ]),
436
- vfmaq_f32 (vmin_8.val [1 ], xi.val [1 ], vdiff_8.val [1 ]));
437
- return vuzpq_f32 (res.val [0 ], res.val [1 ]);
429
+ return {vfmaq_f32 (vmin_8.val [0 ], xi.val [0 ], vdiff_8.val [0 ]),
430
+ vfmaq_f32 (vmin_8.val [1 ], xi.val [1 ], vdiff_8.val [1 ])};
438
431
}
439
432
};
440
433
@@ -496,10 +489,9 @@ struct QuantizerFP16<8> : QuantizerFP16<1> {
496
489
497
490
FAISS_ALWAYS_INLINE float32x4x2_t
498
491
reconstruct_8_components (const uint8_t * code, int i) const {
499
- uint16x4x2_t codei = vld2_u16 ((const uint16_t *)(code + 2 * i));
500
- return vzipq_f32 (
501
- vcvt_f32_f16 (vreinterpret_f16_u16 (codei.val [0 ])),
502
- vcvt_f32_f16 (vreinterpret_f16_u16 (codei.val [1 ])));
492
+ uint16x4x2_t codei = vld1_u16_x2 ((const uint16_t *)(code + 2 * i));
493
+ return {vcvt_f32_f16 (vreinterpret_f16_u16 (codei.val [0 ])),
494
+ vcvt_f32_f16 (vreinterpret_f16_u16 (codei.val [1 ]))};
503
495
}
504
496
};
505
497
#endif
@@ -568,8 +560,7 @@ struct Quantizer8bitDirect<8> : Quantizer8bitDirect<1> {
568
560
}
569
561
float32x4_t res1 = vld1q_f32 (result);
570
562
float32x4_t res2 = vld1q_f32 (result + 4 );
571
- float32x4x2_t res = vzipq_f32 (res1, res2);
572
- return vuzpq_f32 (res.val [0 ], res.val [1 ]);
563
+ return {res1, res2};
573
564
}
574
565
};
575
566
@@ -868,7 +859,7 @@ struct SimilarityL2<8> {
868
859
float32x4x2_t accu8;
869
860
870
861
FAISS_ALWAYS_INLINE void begin_8 () {
871
- accu8 = vzipq_f32 ( vdupq_n_f32 (0 .0f ), vdupq_n_f32 (0 .0f )) ;
862
+ accu8 = { vdupq_n_f32 (0 .0f ), vdupq_n_f32 (0 .0f )} ;
872
863
yi = y;
873
864
}
874
865
@@ -882,8 +873,7 @@ struct SimilarityL2<8> {
882
873
float32x4_t accu8_0 = vfmaq_f32 (accu8.val [0 ], sub0, sub0);
883
874
float32x4_t accu8_1 = vfmaq_f32 (accu8.val [1 ], sub1, sub1);
884
875
885
- float32x4x2_t accu8_temp = vzipq_f32 (accu8_0, accu8_1);
886
- accu8 = vuzpq_f32 (accu8_temp.val [0 ], accu8_temp.val [1 ]);
876
+ accu8 = {accu8_0, accu8_1};
887
877
}
888
878
889
879
FAISS_ALWAYS_INLINE void add_8_components_2 (
@@ -895,8 +885,7 @@ struct SimilarityL2<8> {
895
885
float32x4_t accu8_0 = vfmaq_f32 (accu8.val [0 ], sub0, sub0);
896
886
float32x4_t accu8_1 = vfmaq_f32 (accu8.val [1 ], sub1, sub1);
897
887
898
- float32x4x2_t accu8_temp = vzipq_f32 (accu8_0, accu8_1);
899
- accu8 = vuzpq_f32 (accu8_temp.val [0 ], accu8_temp.val [1 ]);
888
+ accu8 = {accu8_0, accu8_1};
900
889
}
901
890
902
891
FAISS_ALWAYS_INLINE float result_8 () {
@@ -996,7 +985,7 @@ struct SimilarityIP<8> {
996
985
float32x4x2_t accu8;
997
986
998
987
FAISS_ALWAYS_INLINE void begin_8 () {
999
- accu8 = vzipq_f32 ( vdupq_n_f32 (0 .0f ), vdupq_n_f32 (0 .0f )) ;
988
+ accu8 = { vdupq_n_f32 (0 .0f ), vdupq_n_f32 (0 .0f )} ;
1000
989
yi = y;
1001
990
}
1002
991
@@ -1006,28 +995,25 @@ struct SimilarityIP<8> {
1006
995
1007
996
float32x4_t accu8_0 = vfmaq_f32 (accu8.val [0 ], yiv.val [0 ], x.val [0 ]);
1008
997
float32x4_t accu8_1 = vfmaq_f32 (accu8.val [1 ], yiv.val [1 ], x.val [1 ]);
1009
- float32x4x2_t accu8_temp = vzipq_f32 (accu8_0, accu8_1);
1010
- accu8 = vuzpq_f32 (accu8_temp.val [0 ], accu8_temp.val [1 ]);
998
+ accu8 = {accu8_0, accu8_1};
1011
999
}
1012
1000
1013
1001
FAISS_ALWAYS_INLINE void add_8_components_2 (
1014
1002
float32x4x2_t x1,
1015
1003
float32x4x2_t x2) {
1016
1004
float32x4_t accu8_0 = vfmaq_f32 (accu8.val [0 ], x1.val [0 ], x2.val [0 ]);
1017
1005
float32x4_t accu8_1 = vfmaq_f32 (accu8.val [1 ], x1.val [1 ], x2.val [1 ]);
1018
- float32x4x2_t accu8_temp = vzipq_f32 (accu8_0, accu8_1);
1019
- accu8 = vuzpq_f32 (accu8_temp.val [0 ], accu8_temp.val [1 ]);
1006
+ accu8 = {accu8_0, accu8_1};
1020
1007
}
1021
1008
1022
1009
FAISS_ALWAYS_INLINE float result_8 () {
1023
- float32x4x2_t sum_tmp = vzipq_f32 (
1010
+ float32x4x2_t sum = {
1024
1011
vpaddq_f32 (accu8.val [0 ], accu8.val [0 ]),
1025
- vpaddq_f32 (accu8.val [1 ], accu8.val [1 ])) ;
1026
- float32x4x2_t sum = vuzpq_f32 (sum_tmp. val [ 0 ], sum_tmp. val [ 1 ]);
1027
- float32x4x2_t sum2_tmp = vzipq_f32 (
1012
+ vpaddq_f32 (accu8.val [1 ], accu8.val [1 ])} ;
1013
+
1014
+ float32x4x2_t sum2 = {
1028
1015
vpaddq_f32 (sum.val [0 ], sum.val [0 ]),
1029
- vpaddq_f32 (sum.val [1 ], sum.val [1 ]));
1030
- float32x4x2_t sum2 = vuzpq_f32 (sum2_tmp.val [0 ], sum2_tmp.val [1 ]);
1016
+ vpaddq_f32 (sum.val [1 ], sum.val [1 ])};
1031
1017
return vgetq_lane_f32 (sum2.val [0 ], 0 ) + vgetq_lane_f32 (sum2.val [1 ], 0 );
1032
1018
}
1033
1019
};
0 commit comments