From 77730a52b9a09eeb52951dbdc7adc16268ae50d5 Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Mon, 17 Feb 2025 18:18:38 +0100 Subject: [PATCH] instcountci: Improve reciprocal estimate and tests --- .../AFP/SVE256/Secondary_REP.json | 5 +-- .../InstructionCountCI/AFP/Secondary_REP.json | 5 +-- .../InstructionCountCI/AFP/VEX_map1.json | 5 +-- unittests/InstructionCountCI/DDD.json | 24 ++++++++----- unittests/InstructionCountCI/RPRES/DDD.json | 36 +++++++++++++------ .../RPRES/Secondary_REP_AFP.json | 13 ++++--- .../RPRES/VEX_map1_AFP.json | 13 ++++--- 7 files changed, 68 insertions(+), 33 deletions(-) diff --git a/unittests/InstructionCountCI/AFP/SVE256/Secondary_REP.json b/unittests/InstructionCountCI/AFP/SVE256/Secondary_REP.json index 9e4d6adf8d..740bd30532 100644 --- a/unittests/InstructionCountCI/AFP/SVE256/Secondary_REP.json +++ b/unittests/InstructionCountCI/AFP/SVE256/Secondary_REP.json @@ -61,14 +61,15 @@ ] }, "rcpss xmm0, xmm1": { - "ExpectedInstructionCount": 2, + "ExpectedInstructionCount": 3, "Comment": [ "FEAT_FPRES could make this more optimal", "0xf3 0x0f 0x53" ], "ExpectedArm64ASM": [ "fmov s0, #0x70 (1.0000)", - "fdiv s16, s0, s17" + "fdiv s0, s0, s17", + "mov v16.s[0], v0.s[0]" ] }, "addss xmm0, xmm1": { diff --git a/unittests/InstructionCountCI/AFP/Secondary_REP.json b/unittests/InstructionCountCI/AFP/Secondary_REP.json index 45633e7be2..c32c9efd90 100644 --- a/unittests/InstructionCountCI/AFP/Secondary_REP.json +++ b/unittests/InstructionCountCI/AFP/Secondary_REP.json @@ -61,14 +61,15 @@ ] }, "rcpss xmm0, xmm1": { - "ExpectedInstructionCount": 2, + "ExpectedInstructionCount": 3, "Comment": [ "FEAT_FPRES could make this more optimal", "0xf3 0x0f 0x53" ], "ExpectedArm64ASM": [ "fmov s0, #0x70 (1.0000)", - "fdiv s16, s0, s17" + "fdiv s0, s0, s17", + "mov v16.s[0], v0.s[0]" ] }, "addss xmm0, xmm1": { diff --git a/unittests/InstructionCountCI/AFP/VEX_map1.json b/unittests/InstructionCountCI/AFP/VEX_map1.json index 573588ffbe..6771736e24 100644 --- a/unittests/InstructionCountCI/AFP/VEX_map1.json +++ b/unittests/InstructionCountCI/AFP/VEX_map1.json @@ -46,7 +46,7 @@ ] }, "vrcpss xmm0, xmm1, xmm2": { - "ExpectedInstructionCount": 3, + "ExpectedInstructionCount": 4, "Comment": [ "FEAT_FPRES could make this more optimal", "Map 1 0b10 0x53 128-bit" @@ -54,7 +54,8 @@ "ExpectedArm64ASM": [ "mov v16.16b, v17.16b", "fmov s0, #0x70 (1.0000)", - "fdiv s16, s0, s18" + "fdiv s0, s0, s18", + "mov v16.s[0], v0.s[0]" ] }, "vcmpss xmm0, xmm1, xmm2, 0x00": { diff --git a/unittests/InstructionCountCI/DDD.json b/unittests/InstructionCountCI/DDD.json index fb2391a6bb..21560def89 100644 --- a/unittests/InstructionCountCI/DDD.json +++ b/unittests/InstructionCountCI/DDD.json @@ -90,16 +90,19 @@ ] }, "pfrsqrtv mm0, mm1": { - "ExpectedInstructionCount": 7, + "ExpectedInstructionCount": 10, "Comment": [ "0x0f 0x0f 0x87" ], "ExpectedArm64ASM": [ "ldr d2, [x28, #1056]", + "fabs v3.4s, v2.4s", "fmov v0.4s, #0x70 (1.0000)", - "fsqrt v1.4s, v2.4s", - "fdiv v2.4s, v0.4s, v1.4s", - "str d2, [x28, #1040]", + "fsqrt v1.4s, v3.4s", + "fdiv v3.4s, v0.4s, v1.4s", + "movi v0.2s, #0x80, lsl #24", + "bit v3.8b, v2.8b, v0.8b", + "str d3, [x28, #1040]", "mov w20, #0xffff", "strh w20, [x28, #1048]" ] @@ -174,16 +177,19 @@ ] }, "pfrsqrt mm0, mm1": { - "ExpectedInstructionCount": 8, + "ExpectedInstructionCount": 11, "Comment": [ "0x0f 0x0f 0x97" ], "ExpectedArm64ASM": [ "ldr d2, [x28, #1056]", - "fmov s0, #0x70 (1.0000)", - "fsqrt s1, s2", - "fdiv s2, s0, s1", - "dup v2.2s, v2.s[0]", + "fabs v3.4s, v2.4s", + "fmov v0.4s, #0x70 (1.0000)", + "fsqrt v1.4s, v3.4s", + "fdiv v3.4s, v0.4s, v1.4s", + "movi v0.2s, #0x80, lsl #24", + "bit v3.8b, v2.8b, v0.8b", + "dup v2.2s, v3.s[0]", "str d2, [x28, #1040]", "mov w20, #0xffff", "strh w20, [x28, #1048]" diff --git a/unittests/InstructionCountCI/RPRES/DDD.json b/unittests/InstructionCountCI/RPRES/DDD.json index 6166d2aa9f..f016ec0557 100644 --- a/unittests/InstructionCountCI/RPRES/DDD.json +++ b/unittests/InstructionCountCI/RPRES/DDD.json @@ -12,39 +12,49 @@ }, "Instructions": { "pfrcpv mm0, mm1": { - "ExpectedInstructionCount": 5, + "ExpectedInstructionCount": 7, "Comment": [ "0x0f 0x0f 0x86" ], "ExpectedArm64ASM": [ "ldr d2, [x28, #1056]", - "frecpe v2.2s, v2.2s", + "frecpe v0.2s, v2.2s", + "frecps v1.2s, v0.2s, v2.2s", + "fmul v2.2s, v0.2s, v1.2s", "str d2, [x28, #1040]", "mov w20, #0xffff", "strh w20, [x28, #1048]" ] }, "pfrsqrtv mm0, mm1": { - "ExpectedInstructionCount": 5, + "ExpectedInstructionCount": 11, "Comment": [ "0x0f 0x0f 0x87" ], "ExpectedArm64ASM": [ "ldr d2, [x28, #1056]", - "frsqrte v2.2s, v2.2s", - "str d2, [x28, #1040]", + "fabs v3.4s, v2.4s", + "frsqrte v0.2s, v3.2s", + "fmul v1.2s, v0.2s, v0.2s", + "frsqrts v1.2s, v1.2s, v3.2s", + "fmul v3.2s, v0.2s, v1.2s", + "movi v0.2s, #0x80, lsl #24", + "bit v3.8b, v2.8b, v0.8b", + "str d3, [x28, #1040]", "mov w20, #0xffff", "strh w20, [x28, #1048]" ] }, "pfrcp mm0, mm1": { - "ExpectedInstructionCount": 6, + "ExpectedInstructionCount": 8, "Comment": [ "0x0f 0x0f 0x96" ], "ExpectedArm64ASM": [ "ldr d2, [x28, #1056]", - "frecpe s2, s2", + "frecpe s0, s2", + "frecps s1, s0, s2", + "fmul s2, s0, s1", "dup v2.2s, v2.s[0]", "str d2, [x28, #1040]", "mov w20, #0xffff", @@ -52,14 +62,20 @@ ] }, "pfrsqrt mm0, mm1": { - "ExpectedInstructionCount": 6, + "ExpectedInstructionCount": 12, "Comment": [ "0x0f 0x0f 0x97" ], "ExpectedArm64ASM": [ "ldr d2, [x28, #1056]", - "frsqrte s2, s2", - "dup v2.2s, v2.s[0]", + "fabs v3.4s, v2.4s", + "frsqrte v0.2s, v3.2s", + "fmul v1.2s, v0.2s, v0.2s", + "frsqrts v1.2s, v1.2s, v3.2s", + "fmul v3.2s, v0.2s, v1.2s", + "movi v0.2s, #0x80, lsl #24", + "bit v3.8b, v2.8b, v0.8b", + "dup v2.2s, v3.s[0]", "str d2, [x28, #1040]", "mov w20, #0xffff", "strh w20, [x28, #1048]" diff --git a/unittests/InstructionCountCI/RPRES/Secondary_REP_AFP.json b/unittests/InstructionCountCI/RPRES/Secondary_REP_AFP.json index ecff22b523..f6e5632ee3 100644 --- a/unittests/InstructionCountCI/RPRES/Secondary_REP_AFP.json +++ b/unittests/InstructionCountCI/RPRES/Secondary_REP_AFP.json @@ -12,21 +12,26 @@ }, "Instructions": { "rsqrtss xmm0, xmm1": { - "ExpectedInstructionCount": 1, + "ExpectedInstructionCount": 5, "Comment": [ "0xf3 0x0f 0x52" ], "ExpectedArm64ASM": [ - "frsqrte s16, s17" + "frsqrte s0, s17", + "fmul s1, s0, s0", + "frsqrts s1, s1, s17", + "fmul s0, s0, s1", + "mov v16.s[0], v0.s[0]" ] }, "rcpss xmm0, xmm1": { - "ExpectedInstructionCount": 1, + "ExpectedInstructionCount": 2, "Comment": [ "0xf3 0x0f 0x53" ], "ExpectedArm64ASM": [ - "frecpe s16, s17" + "frecpe s0, s17", + "mov v16.s[0], v0.s[0]" ] } } diff --git a/unittests/InstructionCountCI/RPRES/VEX_map1_AFP.json b/unittests/InstructionCountCI/RPRES/VEX_map1_AFP.json index 40eeb98d42..92a4074241 100644 --- a/unittests/InstructionCountCI/RPRES/VEX_map1_AFP.json +++ b/unittests/InstructionCountCI/RPRES/VEX_map1_AFP.json @@ -29,14 +29,18 @@ ] }, "vrsqrtss xmm0, xmm1, xmm2": { - "ExpectedInstructionCount": 2, + "ExpectedInstructionCount": 6, "Comment": [ "AFP can make this more optimal", "Map 1 0b10 0x52 128-bit" ], "ExpectedArm64ASM": [ "mov v16.16b, v17.16b", - "frsqrte s16, s18" + "frsqrte s0, s18", + "fmul s1, s0, s0", + "frsqrts s1, s1, s18", + "fmul s0, s0, s1", + "mov v16.s[0], v0.s[0]" ] }, "vrcpps xmm0, xmm1": { @@ -58,13 +62,14 @@ ] }, "vrcpss xmm0, xmm1, xmm2": { - "ExpectedInstructionCount": 2, + "ExpectedInstructionCount": 3, "Comment": [ "Map 1 0b10 0x53 128-bit" ], "ExpectedArm64ASM": [ "mov v16.16b, v17.16b", - "frecpe s16, s18" + "frecpe s0, s18", + "mov v16.s[0], v0.s[0]" ] } }