[TargetLowering][RISCV][X86] Support even divisors in expandDIVREMByConstant.

If the divisor is even, we can first shift the dividend and divisor
right by the number of trailing zeros. Now the divisor is odd and we
can do the original algorithm to calculate a remainder. Then we shift
that remainder left by the number of trailing zeros and add the bits
that were shifted out of the dividend.
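
As an illustration, a minimal scalar sketch of the remainder fixup described
above (standalone C++ with a hypothetical helper name; the actual expansion
operates on a value split into two HiLoVT halves rather than a plain integer):

    #include <cassert>
    #include <cstdint>

    // Reduce X modulo an even divisor by reducing the shifted dividend
    // modulo the odd part, then reattaching the shifted-out low bits.
    uint64_t urem_even_const(uint64_t X, uint64_t Divisor) {
      unsigned TZ = __builtin_ctzll(Divisor); // trailing zeros of the divisor
      uint64_t Odd = Divisor >> TZ;           // now odd
      uint64_t Rem = (X >> TZ) % Odd;         // the original (odd) algorithm
      // Shift the remainder left and add the bits shifted out of X.
      return (Rem << TZ) + (X & ((1ULL << TZ) - 1));
    }

    int main() {
      for (uint64_t X : {0ULL, 1ULL, 11ULL, 12ULL, 13ULL, ~0ULL})
        assert(urem_even_const(X, 12) == X % 12);
    }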

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D135541
topperc committed Oct 10, 2022
1 parent 9eb1185 commit d4facda
Showing 5 changed files with 269 additions and 84 deletions.
59 changes: 51 additions & 8 deletions llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7168,8 +7168,15 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
// Remainder = Sum % Constant
// This is based on "Remainder by Summing Digits" from Hacker's Delight.
//
// For division, we can compute the remainder, subtract it from the dividend,
// and then multiply by the multiplicative inverse modulo (1 << (BitWidth / 2)).
// If Constant is even, we can shift the dividend and the divisor right by the
// number of trailing zeros in Constant before computing the remainder. Then we
// fix up the remainder by shifting it left by the number of trailing zeros and
// adding the bits that were shifted out of the dividend.
//
// For division, we can compute the remainder using the algorithm described
// above and subtract it from the dividend to get an exact multiple of
// Constant. Then multiply that exact multiple by the multiplicative inverse
// modulo (1 << (BitWidth / 2)).
bool TargetLowering::expandDIVREMByConstant(SDNode *N,
SmallVectorImpl<SDValue> &Result,
EVT HiLoVT, SelectionDAG &DAG,
@@ -7188,7 +7195,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
if (!CN)
return false;

const APInt &Divisor = CN->getAPIntValue();
APInt Divisor = CN->getAPIntValue();
unsigned BitWidth = Divisor.getBitWidth();
unsigned HBitWidth = BitWidth / 2;
assert(VT.getScalarSizeInBits() == BitWidth &&
@@ -7209,10 +7216,17 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
if (DAG.shouldOptForSize())
return false;

// Early out for 0, 1 or even divisors.
if (Divisor.ule(1) || Divisor[0] == 0)
// Early out for 0 or 1 divisors.
if (Divisor.ule(1))
return false;

// If the divisor is even, shift it until it becomes odd.
unsigned TrailingZeros = 0;
if (!Divisor[0]) {
TrailingZeros = Divisor.countTrailingZeros();
Divisor.lshrInPlace(TrailingZeros);
}

SDLoc dl(N);
SDValue Sum;

@@ -7229,17 +7243,35 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
DAG.getIntPtrConstant(1, dl));
}

SDValue ShiftedLL = LL;
SDValue ShiftedLH = LH;

// Shift the input by the number of TrailingZeros in the divisor. The
// shifted out bits will be added to the remainder later.
if (TrailingZeros) {
ShiftedLL = DAG.getNode(
ISD::OR, dl, HiLoVT,
DAG.getNode(ISD::SRL, dl, HiLoVT, ShiftedLL,
DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)),
DAG.getNode(ISD::SHL, dl, HiLoVT, ShiftedLH,
DAG.getShiftAmountConstant(HBitWidth - TrailingZeros,
HiLoVT, dl)));
ShiftedLH =
DAG.getNode(ISD::SRL, dl, HiLoVT, ShiftedLH,
DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
}

// Use addcarry if we can, otherwise use a compare to detect overflow.
EVT SetCCType =
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
if (isOperationLegalOrCustom(ISD::ADDCARRY, HiLoVT)) {
SDVTList VTList = DAG.getVTList(HiLoVT, SetCCType);
Sum = DAG.getNode(ISD::UADDO, dl, VTList, LL, LH);
Sum = DAG.getNode(ISD::UADDO, dl, VTList, ShiftedLL, ShiftedLH);
Sum = DAG.getNode(ISD::ADDCARRY, dl, VTList, Sum,
DAG.getConstant(0, dl, HiLoVT), Sum.getValue(1));
} else {
Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, LL, LH);
SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, LL, ISD::SETULT);
Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, ShiftedLL, ShiftedLH);
SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, ShiftedLL, ISD::SETULT);
// If the boolean for the target is 0 or 1, we can add the setcc result
// directly.
if (getBooleanContents(HiLoVT) ==
@@ -7263,6 +7295,17 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// High half of the remainder is 0.
SDValue RemH = DAG.getConstant(0, dl, HiLoVT);

// If we shifted the input, shift the remainder left and add the bits we
// shifted off the input.
if (TrailingZeros) {
APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
RemL = DAG.getNode(ISD::SHL, dl, HiLoVT, RemL,
DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL,
DAG.getNode(ISD::AND, dl, HiLoVT, LL,
DAG.getConstant(Mask, dl, HiLoVT)));
}

// If we only want remainder, we're done.
if (Opcode == ISD::UREM) {
Result.push_back(RemL);
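
Note: the Sum built above is the "remainder by summing digits" trick. When
(1 << HBitWidth) % Divisor == 1 (true for 3, 5, 17, ..., 65537, i.e. the
divisors of 0xFFFFFFFF when the halves are 32 bits wide), the two halves of
the dividend can be added with an end-around carry and the remainder taken at
half width; for an even divisor the shifted halves computed above are fed in
first. A minimal scalar sketch with 32-bit halves (hypothetical helper name,
mirroring the setcc-based carry path):

    #include <cassert>
    #include <cstdint>

    uint32_t urem_by_digit_sum(uint64_t X, uint32_t OddDivisor) {
      uint32_t Lo = (uint32_t)X, Hi = (uint32_t)(X >> 32);
      uint32_t Sum = Lo + Hi;
      Sum += Sum < Lo;         // end-around carry: Sum == X (mod 2^32 - 1)
      return Sum % OddDivisor; // valid because OddDivisor divides 2^32 - 1
    }

    int main() {
      assert(urem_by_digit_sum(0x123456789ABCDEF0ULL, 3) ==
             0x123456789ABCDEF0ULL % 3);
    }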
69 changes: 55 additions & 14 deletions llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -502,24 +502,65 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
; RV32-LABEL: test_udiv_12:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: li a2, 12
; RV32-NEXT: li a3, 0
; RV32-NEXT: call __udivdi3@plt
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: slli a2, a1, 30
; RV32-NEXT: srli a3, a0, 2
; RV32-NEXT: or a2, a3, a2
; RV32-NEXT: srli a3, a1, 2
; RV32-NEXT: add a3, a2, a3
; RV32-NEXT: sltu a2, a3, a2
; RV32-NEXT: add a2, a3, a2
; RV32-NEXT: lui a3, 699051
; RV32-NEXT: addi a4, a3, -1365
; RV32-NEXT: mulhu a5, a2, a4
; RV32-NEXT: srli a6, a5, 1
; RV32-NEXT: andi a5, a5, -2
; RV32-NEXT: add a5, a5, a6
; RV32-NEXT: sub a2, a2, a5
; RV32-NEXT: slli a2, a2, 2
; RV32-NEXT: andi a5, a0, 3
; RV32-NEXT: or a2, a2, a5
; RV32-NEXT: sub a5, a0, a2
; RV32-NEXT: addi a3, a3, -1366
; RV32-NEXT: mul a3, a5, a3
; RV32-NEXT: mulhu a6, a5, a4
; RV32-NEXT: add a3, a6, a3
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a0, a1, a0
; RV32-NEXT: mul a0, a0, a4
; RV32-NEXT: add a1, a3, a0
; RV32-NEXT: mul a0, a5, a4
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_12:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: li a2, 12
; RV64-NEXT: li a3, 0
; RV64-NEXT: call __udivti3@plt
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: slli a2, a1, 62
; RV64-NEXT: srli a3, a0, 2
; RV64-NEXT: or a2, a3, a2
; RV64-NEXT: srli a3, a1, 2
; RV64-NEXT: lui a4, %hi(.LCPI10_0)
; RV64-NEXT: ld a4, %lo(.LCPI10_0)(a4)
; RV64-NEXT: add a3, a2, a3
; RV64-NEXT: sltu a2, a3, a2
; RV64-NEXT: add a2, a3, a2
; RV64-NEXT: mulhu a3, a2, a4
; RV64-NEXT: srli a5, a3, 1
; RV64-NEXT: andi a3, a3, -2
; RV64-NEXT: add a3, a3, a5
; RV64-NEXT: sub a2, a2, a3
; RV64-NEXT: slli a2, a2, 2
; RV64-NEXT: lui a3, %hi(.LCPI10_1)
; RV64-NEXT: ld a3, %lo(.LCPI10_1)(a3)
; RV64-NEXT: andi a5, a0, 3
; RV64-NEXT: or a2, a2, a5
; RV64-NEXT: sub a5, a0, a2
; RV64-NEXT: mul a3, a5, a3
; RV64-NEXT: mulhu a6, a5, a4
; RV64-NEXT: add a3, a6, a3
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a0, a1, a0
; RV64-NEXT: mul a0, a0, a4
; RV64-NEXT: add a1, a3, a0
; RV64-NEXT: mul a0, a5, a4
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 12
ret iXLen2 %a
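
The magic constants in the output above come from the multiplicative inverse
of the odd part of the divisor: lui a3, 699051 plus addi a4, a3, -1365
materializes 0xAAAAAAAB on RV32, and the RV64 constant pool presumably holds
the 64-bit counterpart 0xAAAAAAAAAAAAAAAB, the inverse of 3 modulo 2^64. A
sketch of one way such an inverse can be computed (hypothetical helper name;
Newton-Raphson doubles the number of correct bits per step, so five steps
cover 64 bits):

    #include <cassert>
    #include <cstdint>

    uint64_t mul_inverse_mod_2_64(uint64_t D) { // D must be odd
      uint64_t Inv = D;             // correct to 3 bits: odd * odd == 1 mod 8
      for (int i = 0; i < 5; ++i)
        Inv *= 2 - D * Inv;         // 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits
      return Inv;
    }

    int main() {
      assert(mul_inverse_mod_2_64(3) == 0xAAAAAAAAAAAAAAABULL);
      // An exact multiple of 3 is divided exactly by multiplying:
      assert(3 * 1234567ULL * mul_inverse_mod_2_64(3) == 1234567ULL);
    }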
50 changes: 36 additions & 14 deletions llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -335,24 +335,46 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
; RV32-LABEL: test_urem_12:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: li a2, 12
; RV32-NEXT: li a3, 0
; RV32-NEXT: call __umoddi3@plt
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: slli a2, a1, 30
; RV32-NEXT: srli a3, a0, 2
; RV32-NEXT: or a2, a3, a2
; RV32-NEXT: srli a1, a1, 2
; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sltu a2, a1, a2
; RV32-NEXT: add a1, a1, a2
; RV32-NEXT: lui a2, 699051
; RV32-NEXT: addi a2, a2, -1365
; RV32-NEXT: mulhu a2, a1, a2
; RV32-NEXT: srli a3, a2, 1
; RV32-NEXT: andi a2, a2, -2
; RV32-NEXT: add a2, a2, a3
; RV32-NEXT: sub a1, a1, a2
; RV32-NEXT: slli a1, a1, 2
; RV32-NEXT: andi a0, a0, 3
; RV32-NEXT: or a0, a1, a0
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
;
; RV64-LABEL: test_urem_12:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: li a2, 12
; RV64-NEXT: li a3, 0
; RV64-NEXT: call __umodti3@plt
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: slli a2, a1, 62
; RV64-NEXT: srli a3, a0, 2
; RV64-NEXT: or a2, a3, a2
; RV64-NEXT: srli a1, a1, 2
; RV64-NEXT: lui a3, %hi(.LCPI10_0)
; RV64-NEXT: ld a3, %lo(.LCPI10_0)(a3)
; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sltu a2, a1, a2
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a2, a1, a3
; RV64-NEXT: srli a3, a2, 1
; RV64-NEXT: andi a2, a2, -2
; RV64-NEXT: add a2, a2, a3
; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: slli a1, a1, 2
; RV64-NEXT: andi a0, a0, 3
; RV64-NEXT: or a0, a1, a0
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
%a = urem iXLen2 %x, 12
ret iXLen2 %a
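
The mulhu/srli/andi/add/sub run in the output above is the standard
multiply-by-magic remainder: with M = 0xAAAAAAAB = (2^33 + 1) / 3, the high
half H of X * M satisfies H >> 1 == X / 3, and 3 * (X / 3) can be formed as
(H & ~1) + (H >> 1) without a second multiply. A scalar sketch (hypothetical
helper name):

    #include <cassert>
    #include <cstdint>

    uint32_t urem3(uint32_t X) {
      uint32_t H = (uint32_t)(((uint64_t)X * 0xAAAAAAABULL) >> 32); // mulhu
      uint32_t ThreeQ = (H & ~1u) + (H >> 1); // 2*(X/3) + (X/3) == 3*(X/3)
      return X - ThreeQ;
    }

    int main() {
      for (uint32_t X : {0u, 1u, 2u, 3u, 12u, 0xFFFFFFFFu})
        assert(urem3(X) == X % 3);
    }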
63 changes: 49 additions & 14 deletions llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -735,13 +735,24 @@ entry:
define i64 @urem_i64_12(i64 %x) nounwind {
; X32-LABEL: urem_i64_12:
; X32: # %bb.0: # %entry
; X32-NEXT: subl $12, %esp
; X32-NEXT: pushl $0
; X32-NEXT: pushl $12
; X32-NEXT: pushl {{[0-9]+}}(%esp)
; X32-NEXT: pushl {{[0-9]+}}(%esp)
; X32-NEXT: calll __umoddi3
; X32-NEXT: addl $28, %esp
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $2, %eax
; X32-NEXT: shldl $30, %esi, %ecx
; X32-NEXT: addl %eax, %ecx
; X32-NEXT: adcl $0, %ecx
; X32-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %edx
; X32-NEXT: shrl %edx
; X32-NEXT: leal (%edx,%edx,2), %eax
; X32-NEXT: subl %eax, %ecx
; X32-NEXT: andl $3, %esi
; X32-NEXT: leal (%esi,%ecx,4), %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
; X64-LABEL: urem_i64_12:
@@ -1116,13 +1127,37 @@ entry:
define i64 @udiv_i64_12(i64 %x) nounwind {
; X32-LABEL: udiv_i64_12:
; X32: # %bb.0: # %entry
; X32-NEXT: subl $12, %esp
; X32-NEXT: pushl $0
; X32-NEXT: pushl $12
; X32-NEXT: pushl {{[0-9]+}}(%esp)
; X32-NEXT: pushl {{[0-9]+}}(%esp)
; X32-NEXT: calll __udivdi3
; X32-NEXT: addl $28, %esp
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl %edi, %eax
; X32-NEXT: shrl $2, %eax
; X32-NEXT: movl %edi, %esi
; X32-NEXT: shldl $30, %ecx, %esi
; X32-NEXT: addl %eax, %esi
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB
; X32-NEXT: movl %esi, %eax
; X32-NEXT: mull %ebx
; X32-NEXT: shrl %edx
; X32-NEXT: leal (%edx,%edx,2), %eax
; X32-NEXT: subl %eax, %esi
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: andl $3, %eax
; X32-NEXT: leal (%eax,%esi,4), %eax
; X32-NEXT: subl %eax, %ecx
; X32-NEXT: sbbl $0, %edi
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: mull %ebx
; X32-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
; X32-NEXT: addl %ecx, %edx
; X32-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
; X32-NEXT: addl %ecx, %edx
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X32-NEXT: retl
;
; X64-LABEL: udiv_i64_12:
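
Putting the pieces together for Divisor = 12 = 3 << 2, an algebraically
equivalent formulation of the whole expansion (a sanity check, not a
transcript of the exact DAG nodes it builds):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t Inv3 = 0xAAAAAAAAAAAAAAABULL; // 3 * Inv3 == 1 (mod 2^64)
      for (uint64_t X : {0ULL, 11ULL, 12ULL, 145ULL, ~0ULL}) {
        uint64_t Rem = (((X >> 2) % 3) << 2) | (X & 3); // X % 12
        uint64_t Quot = ((X - Rem) >> 2) * Inv3;        // exact (X - Rem) / 12
        assert(Rem == X % 12 && Quot == X / 12);
      }
    }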
