NanoMips: div-rem optimization #2

Merged
133 changes: 131 additions & 2 deletions llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -208,8 +208,11 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);

setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
if (!Subtarget.hasNanoMips()) {
setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
}

setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

if (Subtarget.hasNanoMips()) {
@@ -324,6 +327,11 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::SREM, MVT::i32, Legal);
setOperationAction(ISD::UDIV, MVT::i32, Legal);
setOperationAction(ISD::UREM, MVT::i32, Legal);

setLibcallName(RTLIB::UDIVREM_I64, "__udivmoddi4");
setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
setOperationAction(ISD::UDIV, MVT::i64, Custom);
setOperationAction(ISD::UREM, MVT::i64, Custom);
}

computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -504,6 +512,9 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
case ISD::SDIVREM: return lowerMulDiv(Op, MipsISD::DivRem, true, true, DAG);
case ISD::UDIVREM: return lowerMulDiv(Op, MipsISD::DivRemU, true, true,
DAG);
case ISD::UDIV:
case ISD::UREM:
return lowerRemOrDiv(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return lowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_W_CHAIN: return lowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return lowerINTRINSIC_VOID(Op, DAG);
@@ -1315,6 +1326,59 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
// MIPS32r6/MIPS64r6 removed accumulator based multiplies.
assert(!Subtarget.hasMips32r6());

unsigned Opcode = Op.getOpcode();
MVT SimpleVT = Op.getSimpleValueType().SimpleTy;
if (Subtarget.hasNanoMips() && Opcode == ISD::UDIVREM &&
SimpleVT == MVT::i64) {
bool isSigned = false;
RTLIB::Libcall LC = RTLIB::UDIVREM_I64;

SDValue InChain = DAG.getEntryNode();

EVT RetVT = Op.getValueType();
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());

TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
for (const SDValue &Operand : Op.getNode()->op_values()) {
EVT ArgVT = Operand.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Operand;
Entry.Ty = ArgTy;
Entry.IsSExt = isSigned;
Entry.IsZExt = !isSigned;
Args.push_back(Entry);
}

// __udivmoddi4 returns the quotient and stores the remainder through its
// third argument, so pass the address of a stack temporary for the remainder.
SDValue FIPtr = DAG.CreateStackTemporary(RetVT);
Entry.Node = FIPtr;
Entry.Ty = RetTy->getPointerTo();
Entry.IsSExt = isSigned;
Entry.IsZExt = !isSigned;
Args.push_back(Entry);

SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
getPointerTy(DAG.getDataLayout()));

SDLoc dl(Op);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(InChain)
.setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
.setSExtResult(isSigned)
.setZExtResult(!isSigned);

std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);

// Remainder is loaded back from the stack frame
SDValue Rem =
DAG.getLoad(RetVT, dl, CallInfo.second, FIPtr, MachinePointerInfo());

SDValue Vals[] = {CallInfo.first, Rem};
return DAG.getMergeValues(Vals, dl);
}

EVT Ty = Op.getOperand(0).getValueType();
SDLoc DL(Op);
SDValue Mult = DAG.getNode(NewOpc, DL, MVT::Untyped,
@@ -1333,6 +1397,71 @@
return DAG.getMergeValues(Vals, DL);
}

// This custom lowering hook prevents i64 DIV and REM nodes from being
// expanded into a DIVREM node on the NanoMips target and lowers them into
// the appropriate libcalls instead.
// During type legalization, DIV and REM nodes would normally be expanded into
// a DIVREM node, because i64 is an illegal value type and the action for
// DIVREM is set to "Custom" for NanoMips. We want to lower DIV and REM nodes
// into the corresponding libcalls instead of expanding them to DIVREM. To
// accomplish this, the actions for i64 DIV and REM are set to "Custom" rather
// than "LibCall", so this hook is called before the expansion happens; it
// bypasses the expansion while still lowering DIV and REM into the
// appropriate libcalls.
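// For example, an i64 division such as
//   %q = udiv i64 %a, %b
// is lowered directly to a __udivdi3 call (and a lone urem to __umoddi3),
// while a matching udiv/urem pair still reaches lowerMulDiv as a single
// UDIVREM node and becomes one __udivmoddi4 call, as exercised by
// llvm/test/CodeGen/Mips/nanomips/divrem.ll.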
SDValue MipsSETargetLowering::lowerRemOrDiv(SDValue Op,
SelectionDAG &DAG) const {

unsigned Opcode = Op.getOpcode();
MVT SimpleVT = Op.getSimpleValueType().SimpleTy;
if (Subtarget.hasNanoMips() && (Opcode == ISD::UDIV || Opcode == ISD::UREM) &&
SimpleVT == MVT::i64) {

SDLoc dl(Op.getNode());
EVT VT = Op.getNode()->getValueType(0);
SDValue Ops[2] = {Op.getNode()->getOperand(0), Op.getNode()->getOperand(1)};
SDValue Lo, Hi;
Lo = Hi = SDValue();

RTLIB::Libcall LC = Opcode == ISD::UDIV ? RTLIB::UDIV_I64 : RTLIB::UREM_I64;

TargetLowering::MakeLibCallOptions CallOptions;

SDValue LibcallOp = makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first;

EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(),
LibcallOp.getValueSizeInBits() / 2);

EVT LoVT, HiVT;
LoVT = HalfVT;
HiVT = HalfVT;

SDLoc DL(LibcallOp);

assert(LoVT.getSizeInBits() + HiVT.getSizeInBits() ==
LibcallOp.getValueSizeInBits() &&
"Invalid integer splitting!");

Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, LibcallOp);

unsigned ReqShiftAmountInBits =
Log2_32_Ceil(LibcallOp.getValueType().getSizeInBits());

MVT ShiftAmountTy =
getScalarShiftAmountTy(DAG.getDataLayout(), LibcallOp.getValueType());

assert(ReqShiftAmountInBits <= ShiftAmountTy.getSizeInBits());

Hi = DAG.getNode(ISD::SRL, DL, LibcallOp.getValueType(), LibcallOp,
DAG.getConstant(LoVT.getSizeInBits(), DL, ShiftAmountTy));

Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);

SDValue Vals[] = {LibcallOp, Lo, Hi};
return DAG.getMergeValues(Vals, dl);
}
return SDValue();
}

static SDValue initAccumulator(SDValue In, const SDLoc &DL, SelectionDAG &DAG) {
SDValue InLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
DAG.getConstant(0, DL, MVT::i32));
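Note: the i64 UDIVREM lowering above relies on the usual compiler-rt/libgcc
contract for __udivmoddi4, which returns the quotient and stores the remainder
through its third argument. A minimal C++ sketch of that contract follows; the
helper name divmod_u64 is illustrative and not part of the patch.

extern "C" unsigned long long __udivmoddi4(unsigned long long a,
                                           unsigned long long b,
                                           unsigned long long *rem);

// Conceptual equivalent of the lowered i64 UDIVREM node (sketch only):
static inline void divmod_u64(unsigned long long a, unsigned long long b,
                              unsigned long long *quot,
                              unsigned long long *rem) {
  unsigned long long r;            // plays the role of the stack temporary
  *quot = __udivmoddi4(a, b, &r);  // quotient comes back in the return value
  *rem = r;                        // remainder is read back from memory
}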
1 change: 1 addition & 0 deletions llvm/lib/Target/Mips/MipsSEISelLowering.h
@@ -75,6 +75,7 @@ class TargetRegisterClass;
SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;

SDValue lowerRemOrDiv(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi,
SelectionDAG &DAG) const;

4 changes: 4 additions & 0 deletions llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.cpp
@@ -97,3 +97,7 @@ void NanoMipsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
UP.Threshold = 60;
UP.OptSizeThreshold = 0;
}

bool NanoMipsTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
return F->hasOptSize() || (DataType->isIntegerTy(64) && !IsSigned);
}
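In effect, this hook tells passes such as DivRemPairs when a udiv/urem pair
over the same operands should be kept together instead of having the remainder
rewritten as a - (a / b) * b. A standalone restatement of the predicate above,
as a sketch: hasOptSize and isI64 stand in for F->hasOptSize() and
DataType->isIntegerTy(64).

// Keep every div/rem pair when optimizing for size; otherwise keep only
// unsigned 64-bit pairs, which lower to a single __udivmoddi4 call.
static bool nanomipsKeepsDivRemPair(bool hasOptSize, bool isI64, bool isSigned) {
  return hasOptSize || (isI64 && !isSigned);
}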
5 changes: 4 additions & 1 deletion llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.h
@@ -36,10 +36,12 @@ class NanoMipsTTIImpl : public BasicTTIImplBase<NanoMipsTTIImpl> {
const MipsSubtarget *getST() const { return ST; }
const MipsTargetLowering *getTLI() const { return TLI; }

const Function *F;

public:
explicit NanoMipsTTIImpl(const MipsTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
TLI(ST->getTargetLowering()), F(&F) {}

InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind);
@@ -49,6 +51,7 @@ class NanoMipsTTIImpl : public BasicTTIImplBase<NanoMipsTTIImpl> {
Instruction *Inst = nullptr);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
bool hasDivRemOp(Type *DataType, bool IsSigned);
};

} // end namespace llvm
149 changes: 149 additions & 0 deletions llvm/test/CodeGen/Mips/nanomips/divrem.ll
@@ -0,0 +1,149 @@
; RUN: llc -mtriple=nanomips -asm-show-inst -verify-machineinstrs < %s | FileCheck %s

; Make sure to generate a __udivmoddi4 libcall when udiv and urem
; instructions with the same operands are present
; and the operands are of type int64
define void @test1(i64 %a, i64 %b, i64* %divmod) {
; CHECK: save 16, $ra, $s0
; CHECK: move $s0, $a4
; CHECK: move $a4, $sp
; CHECK: balc __udivmoddi4
; CHECK: swm $a0, 0($s0), 2
; CHECK: lw $a0, 4($sp)
; CHECK: sw $a0, 12($s0)
; CHECK: lw $a0, 0($sp)
; CHECK: sw $a0, 8($s0)
; CHECK: restore.jrc 16, $ra, $s0
%div = udiv i64 %a, %b
store i64 %div, i64* %divmod, align 8
%rem = urem i64 %a, %b
%arrayidx1 = getelementptr inbounds i64, i64* %divmod, i32 1
store i64 %rem, i64* %arrayidx1, align 8
ret void
}

; Make sure to generate a __umoddi3 libcall when only urem is present
; and the operands are of type int64
define void @test2(i64 %a, i64 %b, i64* %divmod) {
; CHECK: save 16, $ra, $s0
; CHECK: move $s0, $a4
; CHECK: balc __umoddi3
; CHECK: swm $a0, 8($s0), 2
; CHECK: restore.jrc 16, $ra, $s0
%rem = urem i64 %a, %b
%arrayidx = getelementptr inbounds i64, i64* %divmod, i32 1
store i64 %rem, i64* %arrayidx, align 8
ret void
}

; Make sure to generate a __udivdi3 libcall when only udiv is present
; and the operands are of type int64
define void @test3(i64 %a, i64 %b, i64* %divmod) {
; CHECK: save 16, $ra, $s0
; CHECK: move $s0, $a4
; CHECK: balc __udivdi3
; CHECK: swm $a0, 0($s0), 2
; CHECK: restore.jrc 16, $ra, $s0
%div = udiv i64 %a, %b
store i64 %div, i64* %divmod, align 8
ret void
}

; If urem is expanded into mul+sub and the operands
; are of type int64, make sure it stays that way
define void @test4(i64 %a, i64 %b, i64* %divmod) {
; CHECK: save 32, $ra, $s0, $s1, $s2, $s3, $s4
; CHECK: movep $s1, $s0, $a3, $a4
; CHECK: movep $s4, $s2, $a1, $a2
; CHECK: move $s3, $a0
; CHECK: balc __udivdi3
; CHECK: mul $a2, $a0, $s2
; CHECK: subu $a3, $s3, $a2
; CHECK: sw $a3, 8($s0)
; CHECK: mul $a3, $a0, $s1
; CHECK: muhu $s1, $a0, $s2
; CHECK: addu $a3, $s1, $a3
; CHECK: swm $a0, 0($s0), 2
; CHECK: mul $a0, $a1, $s2
; CHECK: addu $a0, $a3, $a0
; CHECK: subu $a0, $s4, $a0
; CHECK: sltu $a1, $s3, $a2
; CHECK: subu $a0, $a0, $a1
; CHECK: sw $a0, 12($s0)
; CHECK: restore.jrc 32, $ra, $s0, $s1, $s2, $s3, $s4
%a.frozen = freeze i64 %a
%b.frozen = freeze i64 %b
%div = udiv i64 %a.frozen, %b.frozen
store i64 %div, i64* %divmod, align 8
%1 = mul i64 %div, %b.frozen
%rem.decomposed = sub i64 %a.frozen, %1
%arrayidx1 = getelementptr inbounds i64, i64* %divmod, i32 1
store i64 %rem.decomposed, i64* %arrayidx1, align 8
ret void
}

; Make sure to generate divu and modu when udiv and urem
; instructions with the same operands are present
; and the operands are of type int32
define void @test5(i32 %a, i32 %b, i32* %divmod) {
; CHECK: modu $a3, $a0, $a1
; CHECK: teq $zero, $a1, 7
; CHECK: sw $a3, 4($a2)
; CHECK: divu $a0, $a0, $a1
; CHECK: teq $zero, $a1, 7
; CHECK: sw $a0, 0($a2)
; CHECK: jrc $ra
%div = udiv i32 %a, %b
store i32 %div, i32* %divmod, align 4
%rem = urem i32 %a, %b
%arrayidx1 = getelementptr inbounds i32, i32* %divmod, i32 1
store i32 %rem, i32* %arrayidx1, align 4
ret void
}

; Make sure to generate modu when only urem is present
; and the operands are of type int32
define void @test6(i32 %a, i32 %b, i32* %divmod) {
; CHECK: modu $a0, $a0, $a1
; CHECK: teq $zero, $a1, 7
; CHECK: sw $a0, 4($a2)
; CHECK: jrc $ra
%rem = urem i32 %a, %b
%arrayidx = getelementptr inbounds i32, i32* %divmod, i32 1
store i32 %rem, i32* %arrayidx, align 4
ret void
}

; Make sure to generate divu when only udiv is present
; and the operands are of type int32
define void @test7(i32 %a, i32 %b, i32* %divmod) {
; CHECK: divu $a0, $a0, $a1
; CHECK: teq $zero, $a1, 7
; CHECK: sw $a0, 0($a2)
; CHECK: jrc $ra
%div = udiv i32 %a, %b
store i32 %div, i32* %divmod, align 4
ret void
}

; If urem is expanded into mul+sub and the operands
; are of type int32, make sure it stays that way.
define void @test8(i32 %a, i32 %b, i32* %divmod) {
; CHECK: divu $a3, $a0, $a1
; CHECK: teq $zero, $a1, 7
; CHECK: sw $a3, 0($a2)
; CHECK: mul $a1, $a3, $a1
; CHECK: subu $a0, $a0, $a1
; CHECK: sw $a0, 4($a2)
; CHECK: jrc $ra
%a.frozen = freeze i32 %a
%b.frozen = freeze i32 %b
%div = udiv i32 %a.frozen, %b.frozen
store i32 %div, i32* %divmod, align 4
%1 = mul i32 %div, %b.frozen
%rem.decomposed = sub i32 %a.frozen, %1
%arrayidx1 = getelementptr inbounds i32, i32* %divmod, i32 1
store i32 %rem.decomposed, i32* %arrayidx1, align 4
ret void
}
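For context, the udiv/urem pattern in test1 is roughly what a front end emits
for C++ source along these lines; this is a sketch, and the function and
parameter names are illustrative rather than taken from the patch.

// Both operations use the same operands, so the backend can emit a single
// __udivmoddi4 call instead of separate __udivdi3 and __umoddi3 calls.
void divmod64(unsigned long long a, unsigned long long b,
              unsigned long long out[2]) {
  out[0] = a / b;  // udiv i64
  out[1] = a % b;  // urem i64 with the same operands
}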