[TargetLowering][RISCV][X86] Support even divisors in expandDIVREMByConstant.

If the divisor is even, we can first shift the dividend and divisor
right by the number of trailing zeros. Now the divisor is odd and we
can do the original algorithm to calculate a remainder. Then we shift
that remainder left by the number of trailing zeros and add the bits
that were shifted out of the dividend.
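
As an illustration, a minimal scalar sketch of the remainder fixup described
above (standalone C++ with a hypothetical helper name; the actual expansion
operates on a value split into two HiLoVT halves rather than a plain integer):

    #include <cassert>
    #include <cstdint>

    // Reduce X modulo an even divisor by reducing the shifted dividend
    // modulo the odd part, then reattaching the shifted-out low bits.
    uint64_t urem_even_const(uint64_t X, uint64_t Divisor) {
      unsigned TZ = __builtin_ctzll(Divisor); // trailing zeros of the divisor
      uint64_t Odd = Divisor >> TZ;           // now odd
      uint64_t Rem = (X >> TZ) % Odd;         // the original (odd) algorithm
      // Shift the remainder left and add the bits shifted out of X.
      return (Rem << TZ) + (X & ((1ULL << TZ) - 1));
    }

    int main() {
      for (uint64_t X : {0ULL, 1ULL, 11ULL, 12ULL, 13ULL, ~0ULL})
        assert(urem_even_const(X, 12) == X % 12);
    }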

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D135541
topperc committed Oct 10, 2022
1 parent 9eb1185 commit d4facda
Showing 5 changed files with 269 additions and 84 deletions.
59 changes: 51 additions & 8 deletions llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7168,8 +7168,15 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
// Remainder = Sum % Constant
// This is based on "Remainder by Summing Digits" from Hacker's Delight.
//
// For division, we can compute the remainder, subtract it from the dividend,
// and then multiply by the multiplicative inverse modulo (1 << (BitWidth / 2)).
// If Constant is even, we can shift the dividend and the divisor right by the
// number of trailing zeros in Constant before computing the remainder. Then we
// fix up the remainder by shifting it left by the number of trailing zeros and
// adding the bits that were shifted out of the dividend.
//
// For division, we can compute the remainder using the algorithm described
// above and subtract it from the dividend to get an exact multiple of
// Constant. Then multiply that exact multiple by the multiplicative inverse
// modulo (1 << (BitWidth / 2)).
bool TargetLowering::expandDIVREMByConstant(SDNode *N,
SmallVectorImpl<SDValue> &Result,
EVT HiLoVT, SelectionDAG &DAG,
@@ -7188,7 +7195,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
if (!CN)
return false;

const APInt &Divisor = CN->getAPIntValue();
APInt Divisor = CN->getAPIntValue();
unsigned BitWidth = Divisor.getBitWidth();
unsigned HBitWidth = BitWidth / 2;
assert(VT.getScalarSizeInBits() == BitWidth &&
@@ -7209,10 +7216,17 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
if (DAG.shouldOptForSize())
return false;

// Early out for 0, 1 or even divisors.
if (Divisor.ule(1) || Divisor[0] == 0)
// Early out for 0 or 1 divisors.
if (Divisor.ule(1))
return false;

// If the divisor is even, shift it until it becomes odd.
unsigned TrailingZeros = 0;
if (!Divisor[0]) {
TrailingZeros = Divisor.countTrailingZeros();
Divisor.lshrInPlace(TrailingZeros);
}

SDLoc dl(N);
SDValue Sum;

@@ -7229,17 +7243,35 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
DAG.getIntPtrConstant(1, dl));
}

SDValue ShiftedLL = LL;
SDValue ShiftedLH = LH;

// Shift the input by the number of TrailingZeros in the divisor. The
// shifted out bits will be added to the remainder later.
if (TrailingZeros) {
ShiftedLL = DAG.getNode(
ISD::OR, dl, HiLoVT,
DAG.getNode(ISD::SRL, dl, HiLoVT, ShiftedLL,
DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)),
DAG.getNode(ISD::SHL, dl, HiLoVT, ShiftedLH,
DAG.getShiftAmountConstant(HBitWidth - TrailingZeros,
HiLoVT, dl)));
ShiftedLH =
DAG.getNode(ISD::SRL, dl, HiLoVT, ShiftedLH,
DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
}

// Use addcarry if we can, otherwise use a compare to detect overflow.
EVT SetCCType =
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
if (isOperationLegalOrCustom(ISD::ADDCARRY, HiLoVT)) {
SDVTList VTList = DAG.getVTList(HiLoVT, SetCCType);
Sum = DAG.getNode(ISD::UADDO, dl, VTList, LL, LH);
Sum = DAG.getNode(ISD::UADDO, dl, VTList, ShiftedLL, ShiftedLH);
Sum = DAG.getNode(ISD::ADDCARRY, dl, VTList, Sum,
DAG.getConstant(0, dl, HiLoVT), Sum.getValue(1));
} else {
Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, LL, LH);
SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, LL, ISD::SETULT);
Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, ShiftedLL, ShiftedLH);
SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, ShiftedLL, ISD::SETULT);
// If the boolean for the target is 0 or 1, we can add the setcc result
// directly.
if (getBooleanContents(HiLoVT) ==
@@ -7263,6 +7295,17 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// High half of the remainder is 0.
SDValue RemH = DAG.getConstant(0, dl, HiLoVT);

// If we shifted the input, shift the remainder left and add the bits we
// shifted off the input.
if (TrailingZeros) {
APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
RemL = DAG.getNode(ISD::SHL, dl, HiLoVT, RemL,
DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL,
DAG.getNode(ISD::AND, dl, HiLoVT, LL,
DAG.getConstant(Mask, dl, HiLoVT)));
}

// If we only want remainder, we're done.
if (Opcode == ISD::UREM) {
Result.push_back(RemL);
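
Note: the Sum built above is the "remainder by summing digits" trick. When
(1 << HBitWidth) % Divisor == 1 (true for 3, 5, 17, ..., 65537, i.e. the
divisors of 0xFFFFFFFF when the halves are 32 bits wide), the two halves of
the dividend can be added with an end-around carry and the remainder taken at
half width; for an even divisor the shifted halves computed above are fed in
first. A minimal scalar sketch with 32-bit halves (hypothetical helper name,
mirroring the setcc-based carry path):

    #include <cassert>
    #include <cstdint>

    uint32_t urem_by_digit_sum(uint64_t X, uint32_t OddDivisor) {
      uint32_t Lo = (uint32_t)X, Hi = (uint32_t)(X >> 32);
      uint32_t Sum = Lo + Hi;
      Sum += Sum < Lo;         // end-around carry: Sum == X (mod 2^32 - 1)
      return Sum % OddDivisor; // valid because OddDivisor divides 2^32 - 1
    }

    int main() {
      assert(urem_by_digit_sum(0x123456789ABCDEF0ULL, 3) ==
             0x123456789ABCDEF0ULL % 3);
    }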
69 changes: 55 additions & 14 deletions llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -502,24 +502,65 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
; RV32-LABEL: test_udiv_12:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: li a2, 12
; RV32-NEXT: li a3, 0
; RV32-NEXT: call __udivdi3@plt
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: slli a2, a1, 30
; RV32-NEXT: srli a3, a0, 2
; RV32-NEXT: or a2, a3, a2
; RV32-NEXT: srli a3, a1, 2
; RV32-NEXT: add a3, a2, a3
; RV32-NEXT: sltu a2, a3, a2
; RV32-NEXT: add a2, a3, a2
; RV32-NEXT: lui a3, 699051
; RV32-NEXT: addi a4, a3, -1365
; RV32-NEXT: mulhu a5, a2, a4
; RV32-NEXT: srli a6, a5, 1
; RV32-NEXT: andi a5, a5, -2
; RV32-NEXT: add a5, a5, a6
; RV32-NEXT: sub a2, a2, a5
; RV32-NEXT: slli a2, a2, 2
; RV32-NEXT: andi a5, a0, 3
; RV32-NEXT: or a2, a2, a5
; RV32-NEXT: sub a5, a0, a2
; RV32-NEXT: addi a3, a3, -1366
; RV32-NEXT: mul a3, a5, a3
; RV32-NEXT: mulhu a6, a5, a4
; RV32-NEXT: add a3, a6, a3
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a0, a1, a0
; RV32-NEXT: mul a0, a0, a4
; RV32-NEXT: add a1, a3, a0
; RV32-NEXT: mul a0, a5, a4
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_12:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: li a2, 12
; RV64-NEXT: li a3, 0
; RV64-NEXT: call __udivti3@plt
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: slli a2, a1, 62
; RV64-NEXT: srli a3, a0, 2
; RV64-NEXT: or a2, a3, a2
; RV64-NEXT: srli a3, a1, 2
; RV64-NEXT: lui a4, %hi(.LCPI10_0)
; RV64-NEXT: ld a4, %lo(.LCPI10_0)(a4)
; RV64-NEXT: add a3, a2, a3
; RV64-NEXT: sltu a2, a3, a2
; RV64-NEXT: add a2, a3, a2
; RV64-NEXT: mulhu a3, a2, a4
; RV64-NEXT: srli a5, a3, 1
; RV64-NEXT: andi a3, a3, -2
; RV64-NEXT: add a3, a3, a5
; RV64-NEXT: sub a2, a2, a3
; RV64-NEXT: slli a2, a2, 2
; RV64-NEXT: lui a3, %hi(.LCPI10_1)
; RV64-NEXT: ld a3, %lo(.LCPI10_1)(a3)
; RV64-NEXT: andi a5, a0, 3
; RV64-NEXT: or a2, a2, a5
; RV64-NEXT: sub a5, a0, a2
; RV64-NEXT: mul a3, a5, a3
; RV64-NEXT: mulhu a6, a5, a4
; RV64-NEXT: add a3, a6, a3
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a0, a1, a0
; RV64-NEXT: mul a0, a0, a4
; RV64-NEXT: add a1, a3, a0
; RV64-NEXT: mul a0, a5, a4
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 12
ret iXLen2 %a
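
The magic constants in the output above come from the multiplicative inverse
of the odd part of the divisor: lui a3, 699051 plus addi a4, a3, -1365
materializes 0xAAAAAAAB on RV32, and the RV64 constant pool presumably holds
the 64-bit counterpart 0xAAAAAAAAAAAAAAAB, the inverse of 3 modulo 2^64. A
sketch of one way such an inverse can be computed (hypothetical helper name;
Newton-Raphson doubles the number of correct bits per step, so five steps
cover 64 bits):

    #include <cassert>
    #include <cstdint>

    uint64_t mul_inverse_mod_2_64(uint64_t D) { // D must be odd
      uint64_t Inv = D;             // correct to 3 bits: odd * odd == 1 mod 8
      for (int i = 0; i < 5; ++i)
        Inv *= 2 - D * Inv;         // 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits
      return Inv;
    }

    int main() {
      assert(mul_inverse_mod_2_64(3) == 0xAAAAAAAAAAAAAAABULL);
      // An exact multiple of 3 is divided exactly by multiplying:
      assert(3 * 1234567ULL * mul_inverse_mod_2_64(3) == 1234567ULL);
    }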
50 changes: 36 additions & 14 deletions llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -335,24 +335,46 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
; RV32-LABEL: test_urem_12:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: li a2, 12
; RV32-NEXT: li a3, 0
; RV32-NEXT: call __umoddi3@plt
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: slli a2, a1, 30
; RV32-NEXT: srli a3, a0, 2
; RV32-NEXT: or a2, a3, a2
; RV32-NEXT: srli a1, a1, 2
; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sltu a2, a1, a2
; RV32-NEXT: add a1, a1, a2
; RV32-NEXT: lui a2, 699051
; RV32-NEXT: addi a2, a2, -1365
; RV32-NEXT: mulhu a2, a1, a2
; RV32-NEXT: srli a3, a2, 1
; RV32-NEXT: andi a2, a2, -2
; RV32-NEXT: add a2, a2, a3
; RV32-NEXT: sub a1, a1, a2
; RV32-NEXT: slli a1, a1, 2
; RV32-NEXT: andi a0, a0, 3
; RV32-NEXT: or a0, a1, a0
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
;
; RV64-LABEL: test_urem_12:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: li a2, 12
; RV64-NEXT: li a3, 0
; RV64-NEXT: call __umodti3@plt
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: slli a2, a1, 62
; RV64-NEXT: srli a3, a0, 2
; RV64-NEXT: or a2, a3, a2
; RV64-NEXT: srli a1, a1, 2
; RV64-NEXT: lui a3, %hi(.LCPI10_0)
; RV64-NEXT: ld a3, %lo(.LCPI10_0)(a3)
; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sltu a2, a1, a2
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a2, a1, a3
; RV64-NEXT: srli a3, a2, 1
; RV64-NEXT: andi a2, a2, -2
; RV64-NEXT: add a2, a2, a3
; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: slli a1, a1, 2
; RV64-NEXT: andi a0, a0, 3
; RV64-NEXT: or a0, a1, a0
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
%a = urem iXLen2 %x, 12
ret iXLen2 %a
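
The mulhu/srli/andi/add/sub run in the output above is the standard
multiply-by-magic remainder: with M = 0xAAAAAAAB = (2^33 + 1) / 3, the high
half H of X * M satisfies H >> 1 == X / 3, and 3 * (X / 3) can be formed as
(H & ~1) + (H >> 1) without a second multiply. A scalar sketch (hypothetical
helper name):

    #include <cassert>
    #include <cstdint>

    uint32_t urem3(uint32_t X) {
      uint32_t H = (uint32_t)(((uint64_t)X * 0xAAAAAAABULL) >> 32); // mulhu
      uint32_t ThreeQ = (H & ~1u) + (H >> 1); // 2*(X/3) + (X/3) == 3*(X/3)
      return X - ThreeQ;
    }

    int main() {
      for (uint32_t X : {0u, 1u, 2u, 3u, 12u, 0xFFFFFFFFu})
        assert(urem3(X) == X % 3);
    }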
63 changes: 49 additions & 14 deletions llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -735,13 +735,24 @@ entry:
define i64 @urem_i64_12(i64 %x) nounwind {
; X32-LABEL: urem_i64_12:
; X32: # %bb.0: # %entry
; X32-NEXT: subl $12, %esp
; X32-NEXT: pushl $0
; X32-NEXT: pushl $12
; X32-NEXT: pushl {{[0-9]+}}(%esp)
; X32-NEXT: pushl {{[0-9]+}}(%esp)
; X32-NEXT: calll __umoddi3
; X32-NEXT: addl $28, %esp
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $2, %eax
; X32-NEXT: shldl $30, %esi, %ecx
; X32-NEXT: addl %eax, %ecx
; X32-NEXT: adcl $0, %ecx
; X32-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %edx
; X32-NEXT: shrl %edx
; X32-NEXT: leal (%edx,%edx,2), %eax
; X32-NEXT: subl %eax, %ecx
; X32-NEXT: andl $3, %esi
; X32-NEXT: leal (%esi,%ecx,4), %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
; X64-LABEL: urem_i64_12:
@@ -1116,13 +1127,37 @@ entry:
define i64 @udiv_i64_12(i64 %x) nounwind {
; X32-LABEL: udiv_i64_12:
; X32: # %bb.0: # %entry
; X32-NEXT: subl $12, %esp
; X32-NEXT: pushl $0
; X32-NEXT: pushl $12
; X32-NEXT: pushl {{[0-9]+}}(%esp)
; X32-NEXT: pushl {{[0-9]+}}(%esp)
; X32-NEXT: calll __udivdi3
; X32-NEXT: addl $28, %esp
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl %edi, %eax
; X32-NEXT: shrl $2, %eax
; X32-NEXT: movl %edi, %esi
; X32-NEXT: shldl $30, %ecx, %esi
; X32-NEXT: addl %eax, %esi
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB
; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %ebx
; X32-NEXT: shrl %edx
; X32-NEXT: leal (%edx,%edx,2), %eax
; X32-NEXT: subl %eax, %esi
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: andl $3, %eax
; X32-NEXT: leal (%eax,%esi,4), %eax
; X32-NEXT: subl %eax, %ecx
; X32-NEXT: sbbl $0, %edi
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %ebx
; X32-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
; X32-NEXT: addl %ecx, %edx
; X32-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
; X32-NEXT: addl %ecx, %edx
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X32-NEXT: retl
;
; X64-LABEL: udiv_i64_12:
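
Putting the pieces together for Divisor = 12 = 3 << 2, an algebraically
equivalent formulation of the whole expansion (a sanity check, not a
transcript of the exact DAG nodes it builds):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t Inv3 = 0xAAAAAAAAAAAAAAABULL; // 3 * Inv3 == 1 (mod 2^64)
      for (uint64_t X : {0ULL, 11ULL, 12ULL, 145ULL, ~0ULL}) {
        uint64_t Rem = (((X >> 2) % 3) << 2) | (X & 3); // X % 12
        uint64_t Quot = ((X - Rem) >> 2) * Inv3;        // exact (X - Rem) / 12
        assert(Rem == X % 12 && Quot == X / 12);
      }
    }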
