@@ -767,8 +767,8 @@ struct SimilarityL2<8> {
767
767
float32x4_t sub0 = vsubq_f32 (yiv.val [0 ], x.val [0 ]);
768
768
float32x4_t sub1 = vsubq_f32 (yiv.val [1 ], x.val [1 ]);
769
769
770
- float32x4_t accu8_0 = vaddq_f32 (accu8.val [0 ], vmulq_f32 ( sub0, sub0) );
771
- float32x4_t accu8_1 = vaddq_f32 (accu8.val [1 ], vmulq_f32 ( sub1, sub1) );
770
+ float32x4_t accu8_0 = vfmaq_f32 (accu8.val [0 ], sub0, sub0);
771
+ float32x4_t accu8_1 = vfmaq_f32 (accu8.val [1 ], sub1, sub1);
772
772
773
773
float32x4x2_t accu8_temp = vzipq_f32 (accu8_0, accu8_1);
774
774
accu8 = vuzpq_f32 (accu8_temp.val [0 ], accu8_temp.val [1 ]);
@@ -780,8 +780,8 @@ struct SimilarityL2<8> {
780
780
float32x4_t sub0 = vsubq_f32 (y.val [0 ], x.val [0 ]);
781
781
float32x4_t sub1 = vsubq_f32 (y.val [1 ], x.val [1 ]);
782
782
783
- float32x4_t accu8_0 = vaddq_f32 (accu8.val [0 ], vmulq_f32 ( sub0, sub0) );
784
- float32x4_t accu8_1 = vaddq_f32 (accu8.val [1 ], vmulq_f32 ( sub1, sub1) );
783
+ float32x4_t accu8_0 = vfmaq_f32 (accu8.val [0 ], sub0, sub0);
784
+ float32x4_t accu8_1 = vfmaq_f32 (accu8.val [1 ], sub1, sub1);
785
785
786
786
float32x4x2_t accu8_temp = vzipq_f32 (accu8_0, accu8_1);
787
787
accu8 = vuzpq_f32 (accu8_temp.val [0 ], accu8_temp.val [1 ]);
@@ -892,21 +892,17 @@ struct SimilarityIP<8> {
892
892
float32x4x2_t yiv = vld1q_f32_x2 (yi);
893
893
yi += 8 ;
894
894
895
- float32x4_t accu8_0 =
896
- vaddq_f32 (accu8.val [0 ], vmulq_f32 (yiv.val [0 ], x.val [0 ]));
897
- float32x4_t accu8_1 =
898
- vaddq_f32 (accu8.val [1 ], vmulq_f32 (yiv.val [1 ], x.val [1 ]));
895
+ float32x4_t accu8_0 = vfmaq_f32 (accu8.val [0 ], yiv.val [0 ], x.val [0 ]);
896
+ float32x4_t accu8_1 = vfmaq_f32 (accu8.val [1 ], yiv.val [1 ], x.val [1 ]);
899
897
float32x4x2_t accu8_temp = vzipq_f32 (accu8_0, accu8_1);
900
898
accu8 = vuzpq_f32 (accu8_temp.val [0 ], accu8_temp.val [1 ]);
901
899
}
902
900
903
901
FAISS_ALWAYS_INLINE void add_8_components_2 (
904
902
float32x4x2_t x1,
905
903
float32x4x2_t x2) {
906
- float32x4_t accu8_0 =
907
- vaddq_f32 (accu8.val [0 ], vmulq_f32 (x1.val [0 ], x2.val [0 ]));
908
- float32x4_t accu8_1 =
909
- vaddq_f32 (accu8.val [1 ], vmulq_f32 (x1.val [1 ], x2.val [1 ]));
904
+ float32x4_t accu8_0 = vfmaq_f32 (accu8.val [0 ], x1.val [0 ], x2.val [0 ]);
905
+ float32x4_t accu8_1 = vfmaq_f32 (accu8.val [1 ], x1.val [1 ], x2.val [1 ]);
910
906
float32x4x2_t accu8_temp = vzipq_f32 (accu8_0, accu8_1);
911
907
accu8 = vuzpq_f32 (accu8_temp.val [0 ], accu8_temp.val [1 ]);
912
908
}
0 commit comments