[CIR][CIRGen][Builtin][Neon] Lower neon_vpadd_v and neon_vpaddq_v into llvm intrinsic (llvm#960)

This PR refactors the Neon builtin code in clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp a bit to make it cleaner.

It also changes the RUN options of the test file clang/test/CIR/CodeGen/AArch64/neon-arith.c to make the test more concise and easier to compare against OG CodeGen (to compare, just remove -fclangir from the LLVM-generation part of the RUN lines and the test should still pass), as illustrated below.
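For illustration only (not part of the patch), these are the LLVM-generation RUN lines added by this change with just -fclangir dropped; per the note above, the LLVM check lines should then match original CodeGen output as well:

// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -target-feature +neon \
// RUN: -disable-O0-optnone \
// RUN: -flax-vector-conversions=none -fno-clangir-call-conv-lowering \
// RUN: -emit-llvm -o - %s \
// RUN: | opt -S -passes=instcombine,mem2reg,simplifycfg -o %t.ll
// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s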
ghehg authored and lanza committed Oct 19, 2024
1 parent 90a9be6 commit 244734d
Showing 2 changed files with 202 additions and 61 deletions.
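For reference, a minimal illustrative snippet (not part of the patch) using the pairwise-add builtins this change lowers; as the new tests below verify, the integer vpadd/vpaddq forms map to llvm.aarch64.neon.addp and the floating-point forms to llvm.aarch64.neon.faddp:

#include <arm_neon.h>

// Adds adjacent pairs of elements from a and b (pairwise add).
int8x8_t pairwise_add_s8(int8x8_t a, int8x8_t b) {
  return vpadd_s8(a, b);   // lowers to llvm.aarch64.neon.addp.v8i8
}

float32x2_t pairwise_add_f32(float32x2_t a, float32x2_t b) {
  return vpadd_f32(a, b);  // lowers to llvm.aarch64.neon.faddp.v2f32
}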
60 changes: 42 additions & 18 deletions clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -11,6 +11,8 @@
//
//===----------------------------------------------------------------------===//

#include <utility>

#include "CIRGenCXXABI.h"
#include "CIRGenCall.h"
#include "CIRGenFunction.h"
@@ -2158,7 +2160,7 @@ static mlir::Value buildArmLdrexNon128Intrinsic(unsigned int builtinID,
}
}

mlir::Value buildNeonCall(unsigned int builtinID, CIRGenFunction &cgf,
mlir::Value buildNeonCall(CIRGenBuilderTy &builder,
llvm::SmallVector<mlir::Type> argTypes,
llvm::SmallVectorImpl<mlir::Value> &args,
llvm::StringRef intrinsicName, mlir::Type funcResTy,
@@ -2175,7 +2177,6 @@ mlir::Value buildNeonCall(unsigned int builtinID, CIRGenFunction &cgf,
if (shift > 0)
llvm_unreachable("Argument shift NYI");

CIRGenBuilderTy &builder = cgf.getBuilder();
for (unsigned j = 0; j < argTypes.size(); ++j) {
if (isConstrainedFPIntrinsic) {
assert(!MissingFeatures::buildConstrainedFPCall());
@@ -2205,6 +2206,24 @@ static int64_t getIntValueFromConstOp(mlir::Value val) {
.getSExtValue();
}

/// This function `buildCommonNeonCallPattern0` implements a common way
// to generate neon intrinsic call that has following pattern:
// 1. There is a need to cast result of the intrinsic call back to
// expression type.
// 2. Function arg types are given, not deduced from actual arg types.
static mlir::Value
buildCommonNeonCallPattern0(CIRGenFunction &cgf, std::string &intrincsName,
llvm::SmallVector<mlir::Type> argTypes,
llvm::SmallVectorImpl<mlir::Value> &ops,
mlir::Type funcResTy, const clang::CallExpr *e) {
CIRGenBuilderTy &builder = cgf.getBuilder();
mlir::Value res =
buildNeonCall(builder, std::move(argTypes), ops, intrincsName, funcResTy,
cgf.getLoc(e->getExprLoc()));
mlir::Type resultType = cgf.ConvertType(e->getType());
return builder.createBitcast(res, resultType);
}

mlir::Value CIRGenFunction::buildCommonNeonBuiltinExpr(
unsigned builtinID, unsigned llvmIntrinsic, unsigned altLLVMIntrinsic,
const char *nameHint, unsigned modifier, const CallExpr *e,
@@ -2269,18 +2288,25 @@ mlir::Value CIRGenFunction::buildCommonNeonBuiltinExpr(
default:
llvm::errs() << getAArch64SIMDIntrinsicString(builtinID) << " ";
llvm_unreachable("NYI");
case NEON::BI__builtin_neon_vqadd_v:
mlir::Value res = buildNeonCall(builtinID, *this, {vTy, vTy}, ops,
(intrinicId != altLLVMIntrinsic)
? "llvm.aarch64.neon.uqadd"
: "llvm.aarch64.neon.sqadd",
vTy, getLoc(e->getExprLoc()));
mlir::Type resultType = ConvertType(e->getType());
// AArch64 intrinsic one-element vector type cast to
// scalar type expected by the builtin
return builder.createBitcast(res, resultType);

case NEON::BI__builtin_neon_vpadd_v:
case NEON::BI__builtin_neon_vpaddq_v: {
std::string intrincsName = mlir::isa<mlir::FloatType>(vTy.getEltType())
? "llvm.aarch64.neon.faddp"
: "llvm.aarch64.neon.addp";
return buildCommonNeonCallPattern0(*this, intrincsName, {vTy, vTy}, ops,
vTy, e);
break;
}
case NEON::BI__builtin_neon_vqadd_v: {
std::string intrincsName = (intrinicId != altLLVMIntrinsic)
? "llvm.aarch64.neon.uqadd"
: "llvm.aarch64.neon.sqadd";
return buildCommonNeonCallPattern0(*this, intrincsName, {vTy, vTy}, ops,
vTy, e);
break;
}
}
return nullptr;
}

@@ -3085,9 +3111,8 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
// The prototype of builtin_neon_vqrshrun_n can be found at
// https://developer.arm.com/architectures/instruction-sets/intrinsics/
return buildNeonCall(
BuiltinID, *this,
{builder.getExtendedElementVectorType(ty, true), SInt32Ty}, Ops,
"llvm.aarch64.neon.sqrshrun", ty, getLoc(E->getExprLoc()));
builder, {builder.getExtendedElementVectorType(ty, true), SInt32Ty},
Ops, "llvm.aarch64.neon.sqrshrun", ty, getLoc(E->getExprLoc()));
case NEON::BI__builtin_neon_vqshrn_n_v:
llvm_unreachable("NYI");
case NEON::BI__builtin_neon_vrshrn_n_v:
@@ -3100,7 +3125,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
case NEON::BI__builtin_neon_vrnda_v:
case NEON::BI__builtin_neon_vrndaq_v: {
assert(!MissingFeatures::buildConstrainedFPCall());
return buildNeonCall(BuiltinID, *this, {ty}, Ops, "llvm.round", ty,
return buildNeonCall(builder, {ty}, Ops, "llvm.round", ty,
getLoc(E->getExprLoc()));
}
case NEON::BI__builtin_neon_vrndih_f16: {
@@ -3123,8 +3148,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
case NEON::BI__builtin_neon_vrndns_f32: {
mlir::Value arg0 = buildScalarExpr(E->getArg(0));
args.push_back(arg0);
return buildNeonCall(NEON::BI__builtin_neon_vrndns_f32, *this,
{arg0.getType()}, args, "llvm.roundeven.f32",
return buildNeonCall(builder, {arg0.getType()}, args, "llvm.roundeven.f32",
getCIRGenModule().FloatTy, getLoc(E->getExprLoc()));
}
case NEON::BI__builtin_neon_vrndph_f16: {
203 changes: 160 additions & 43 deletions clang/test/CIR/CodeGen/AArch64/neon-arith.c
@@ -1,8 +1,14 @@
// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
// RUN: -ffreestanding -emit-cir -fno-clangir-call-conv-lowering -target-feature +neon %s -o %t.cir
// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -target-feature +neon \
// RUN: -fclangir -disable-O0-optnone \
// RUN: -flax-vector-conversions=none -fno-clangir-call-conv-lowering \
// RUN: -emit-cir -o %t.cir %s
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
// RUN: -ffreestanding -emit-llvm -fno-clangir-call-conv-lowering -target-feature +neon %s -o %t.ll

// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -target-feature +neon \
// RUN: -fclangir -disable-O0-optnone \
// RUN: -flax-vector-conversions=none -fno-clangir-call-conv-lowering \
// RUN: -emit-llvm -o - %s \
// RUN: | opt -S -passes=instcombine,mem2reg,simplifycfg -o %t.ll
// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s

// REQUIRES: aarch64-registered-target || arm-registered-target
@@ -28,19 +34,9 @@ float32_t test_vrndns_f32(float32_t a) {
// CIR: [[RET_VAL:%.*]] = cir.load [[RET_P]] : !cir.ptr<!cir.float>, !cir.float
// CIR: cir.return [[RET_VAL]] : !cir.float loc

// LLVM: define dso_local float @test_vrndns_f32(float [[ARG:%.*]])
// LLVM: store float [[ARG]], ptr [[ARG_SAVE:%.*]], align 4
// LLVM: [[P0:%.*]] = load float, ptr [[ARG_SAVE]], align 4,
// LLVM: store float [[P0]], ptr [[P0_SAVE:%.*]], align 4,
// LLVM: [[INTRIN_ARG:%.*]] = load float, ptr [[P0_SAVE]], align 4,
// LLVM: [[INTRIN_RES:%.*]] = call float @llvm.roundeven.f32(float [[INTRIN_ARG]])
// LLVM: store float [[INTRIN_RES]], ptr [[RES_SAVE0:%.*]], align 4,
// LLVM: [[RES_COPY0:%.*]] = load float, ptr [[RES_SAVE0]], align 4,
// LLVM: store float [[RES_COPY0]], ptr [[RES_SAVE1:%.*]], align 4,
// LLVM: [[RES_COPY1:%.*]] = load float, ptr [[RES_SAVE1]], align 4,
// LLVM: store float [[RES_COPY1]], ptr [[RET_P:%.*]], align 4,
// LLVM: [[RET_VAL:%.*]] = load float, ptr [[RET_P]], align 4,
// LLVM: ret float [[RET_VAL]]
// LLVM: {{.*}}test_vrndns_f32(float{{.*}}[[ARG:%.*]])
// LLVM: [[INTRIN_RES:%.*]] = call float @llvm.roundeven.f32(float [[ARG]])
// LLVM: ret float [[INTRIN_RES]]

float32x2_t test_vrnda_f32(float32x2_t a) {
return vrnda_f32(a);
@@ -62,19 +58,9 @@ float32x2_t test_vrnda_f32(float32x2_t a) {
// CIR: [[RET_VAL:%.*]] = cir.load [[RET_P]] : !cir.ptr<!cir.vector<!cir.float x 2>>, !cir.vector<!cir.float x 2>
// CIR: cir.return [[RET_VAL]] : !cir.vector<!cir.float x 2>

// LLVM: define dso_local <2 x float> @test_vrnda_f32(<2 x float> [[ARG:%.*]])
// LLVM: store <2 x float> [[ARG]], ptr [[ARG_SAVE:%.*]], align 8
// LLVM: [[P0:%.*]] = load <2 x float>, ptr [[ARG_SAVE]], align 8,
// LLVM: store <2 x float> [[P0]], ptr [[P0_SAVE:%.*]], align 8,
// LLVM: [[INTRIN_ARG:%.*]] = load <2 x float>, ptr [[P0_SAVE]], align 8,
// LLVM: [[INTRIN_RES:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[INTRIN_ARG]])
// LLVM: store <2 x float> [[INTRIN_RES]], ptr [[RES_SAVE0:%.*]], align 8,
// LLVM: [[RES_COPY0:%.*]] = load <2 x float>, ptr [[RES_SAVE0]], align 8,
// LLVM: store <2 x float> [[RES_COPY0]], ptr [[RES_SAVE1:%.*]], align 8,
// LLVM: [[RES_COPY1:%.*]] = load <2 x float>, ptr [[RES_SAVE1]], align 8,
// LLVM: store <2 x float> [[RES_COPY1]], ptr [[RET_P:%.*]], align 8,
// LLVM: [[RET_VAL:%.*]] = load <2 x float>, ptr [[RET_P]], align 8,
// LLVM: ret <2 x float> [[RET_VAL]]
// LLVM: {{.*}}test_vrnda_f32(<2 x float>{{.*}}[[ARG:%.*]])
// LLVM: [[INTRIN_RES:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[ARG]])
// LLVM: ret <2 x float> [[INTRIN_RES]]

float32x4_t test_vrndaq_f32(float32x4_t a) {
return vrndaq_f32(a);
@@ -88,16 +74,147 @@ float32x4_t test_vrndaq_f32(float32x4_t a) {
// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.round" [[INTRIN_ARG_BACK]] : (!cir.vector<!cir.float x 4>) -> !cir.vector<!cir.float x 4>
// CIR: cir.return {{%.*}} : !cir.vector<!cir.float x 4>

// LLVM: define dso_local <4 x float> @test_vrndaq_f32(<4 x float> [[ARG:%.*]])
// LLVM: store <4 x float> [[ARG]], ptr [[ARG_SAVE:%.*]], align 16
// LLVM: [[P0:%.*]] = load <4 x float>, ptr [[ARG_SAVE]], align 16,
// LLVM: store <4 x float> [[P0]], ptr [[P0_SAVE:%.*]], align 16,
// LLVM: [[INTRIN_ARG:%.*]] = load <4 x float>, ptr [[P0_SAVE]], align 16,
// LLVM: [[INTRIN_RES:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[INTRIN_ARG]])
// LLVM: store <4 x float> [[INTRIN_RES]], ptr [[RES_SAVE0:%.*]], align 16,
// LLVM: [[RES_COPY0:%.*]] = load <4 x float>, ptr [[RES_SAVE0]], align 16,
// LLVM: store <4 x float> [[RES_COPY0]], ptr [[RES_SAVE1:%.*]], align 16,
// LLVM: [[RES_COPY1:%.*]] = load <4 x float>, ptr [[RES_SAVE1]], align 16,
// LLVM: store <4 x float> [[RES_COPY1]], ptr [[RET_P:%.*]], align 16,
// LLVM: [[RET_VAL:%.*]] = load <4 x float>, ptr [[RET_P]], align 16,
// LLVM: ret <4 x float> [[RET_VAL]]
// LLVM: {{.*}}test_vrndaq_f32(<4 x float>{{.*}}[[ARG:%.*]])
// LLVM: [[INTRIN_RES:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[ARG]])
// LLVM: ret <4 x float> [[INTRIN_RES]]

int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
return vpadd_s8(a, b);
}

// CIR-LABEL: vpadd_s8
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s8i x 8>, !cir.vector<!s8i x 8>) -> !cir.vector<!s8i x 8>

// LLVM: {{.*}}test_vpadd_s8(<8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
// LLVM: ret <8 x i8> [[RES]]


int8x16_t test_vpaddq_s8(int8x16_t a, int8x16_t b) {
return vpaddq_s8(a, b);
}

// CIR-LABEL: vpaddq_s8
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s8i x 16>, !cir.vector<!s8i x 16>) -> !cir.vector<!s8i x 16>

// LLVM: {{.*}}test_vpaddq_s8(<16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
// LLVM: ret <16 x i8> [[RES]]

uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
return vpadd_u8(a, b);
}

// CIR-LABEL: vpadd_u8
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!u8i x 8>, !cir.vector<!u8i x 8>) -> !cir.vector<!u8i x 8>

// LLVM: {{.*}}test_vpadd_u8(<8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
// LLVM: ret <8 x i8> [[RES]]

int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
return vpadd_s16(a, b);
}

// CIR-LABEL: vpadd_s16
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s16i x 4>, !cir.vector<!s16i x 4>) -> !cir.vector<!s16i x 4>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!s16i x 4>), !cir.vector<!s8i x 8>

// LLVM: {{.*}}test_vpadd_s16(<4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
// LLVM: ret <4 x i16> [[RES]]

int16x8_t test_vpaddq_s16(int16x8_t a, int16x8_t b) {
return vpaddq_s16(a, b);
}

// CIR-LABEL: vpaddq_s16
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s16i x 8>, !cir.vector<!s16i x 8>) -> !cir.vector<!s16i x 8>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!s16i x 8>), !cir.vector<!s8i x 16>

// LLVM: {{.*}}test_vpaddq_s16(<8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
// LLVM: ret <8 x i16> [[RES]]

uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
return vpadd_u16(a, b);
}

// CIR-LABEL: vpadd_u16
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!u16i x 4>, !cir.vector<!u16i x 4>) -> !cir.vector<!u16i x 4>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!u16i x 4>), !cir.vector<!s8i x 8>

// LLVM: {{.*}}test_vpadd_u16(<4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
// LLVM: ret <4 x i16> [[RES]]

int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
return vpadd_s32(a, b);
}

// CIR-LABEL: vpadd_s32
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s32i x 2>, !cir.vector<!s32i x 2>) -> !cir.vector<!s32i x 2>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!s32i x 2>), !cir.vector<!s8i x 8>

// LLVM: {{.*}}test_vpadd_s32(<2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
// LLVM: ret <2 x i32> [[RES]]

int32x4_t test_vpaddq_s32(int32x4_t a, int32x4_t b) {
return vpaddq_s32(a, b);
}

// CIR-LABEL: vpaddq_s32
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s32i x 4>, !cir.vector<!s32i x 4>) -> !cir.vector<!s32i x 4>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!s32i x 4>), !cir.vector<!s8i x 16>

// LLVM: {{.*}}test_vpaddq_s32(<4 x i32>{{.*}}[[A:%.*]], <4 x i32>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
// LLVM: ret <4 x i32> [[RES]]

float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
return vpadd_f32(a, b);
}

// CIR-LABEL: vpadd_f32
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!cir.float x 2>, !cir.vector<!cir.float x 2>) -> !cir.vector<!cir.float x 2>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!cir.float x 2>), !cir.vector<!s8i x 8>

// LLVM: {{.*}}test_vpadd_f32(<2 x float>{{.*}}[[A:%.*]], <2 x float>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> [[A]], <2 x float> [[B]])
// LLVM: ret <2 x float> [[RES]]

float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) {
return vpaddq_f32(a, b);
}

// CIR-LABEL: vpaddq_f32
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!cir.float x 4>, !cir.vector<!cir.float x 4>) -> !cir.vector<!cir.float x 4>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!cir.float x 4>), !cir.vector<!s8i x 16>

// LLVM: {{.*}}test_vpaddq_f32(<4 x float>{{.*}}[[A:%.*]], <4 x float>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> [[A]], <4 x float> [[B]])
// LLVM: ret <4 x float> [[RES]]

float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) {
return vpaddq_f64(a, b);
}

// CIR-LABEL: vpaddq_f64
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!cir.double x 2>, !cir.vector<!cir.double x 2>) -> !cir.vector<!cir.double x 2>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!cir.double x 2>), !cir.vector<!s8i x 16>

// LLVM: {{.*}}test_vpaddq_f64(<2 x double>{{.*}}[[A:%.*]], <2 x double>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> [[A]], <2 x double> [[B]])
// LLVM: ret <2 x double> [[RES]]
