From 9867d1171183556007c5fefd29ca30ed501b7714 Mon Sep 17 00:00:00 2001
From: Nikola Peric
Date: Thu, 15 Dec 2022 12:39:56 +0100
Subject: [PATCH] NanoMips: div-rem optimization

Disable expansion of mod to mul-and-sub when Os or Oz.
Generate udivmoddi4 libcall when div-rem pairs of type uint64 are present.
---
 llvm/lib/Target/Mips/MipsSEISelLowering.cpp   | 133 +++++++++++++++-
 llvm/lib/Target/Mips/MipsSEISelLowering.h     |   1 +
 .../Mips/NanoMipsTargetTransformInfo.cpp      |   4 +
 .../Target/Mips/NanoMipsTargetTransformInfo.h |   5 +-
 llvm/test/CodeGen/Mips/nanomips/divrem.ll     | 149 ++++++++++++++++++
 5 files changed, 289 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/Mips/nanomips/divrem.ll

diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index d409046446775..c844b22479608 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -208,8 +208,11 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
 
-  setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
-  setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+  if (!Subtarget.hasNanoMips()) {
+    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
+    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+  }
+
   setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
 
   if (Subtarget.hasNanoMips()) {
@@ -324,6 +327,11 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
     setOperationAction(ISD::SREM, MVT::i32, Legal);
     setOperationAction(ISD::UDIV, MVT::i32, Legal);
     setOperationAction(ISD::UREM, MVT::i32, Legal);
+
+    setLibcallName(RTLIB::UDIVREM_I64, "__udivmoddi4");
+    setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
+    setOperationAction(ISD::UDIV, MVT::i64, Custom);
+    setOperationAction(ISD::UREM, MVT::i64, Custom);
   }
 
   computeRegisterProperties(Subtarget.getRegisterInfo());
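For context, the pattern the operation actions and the UDIVREM_I64 libcall
name registered in the hunk above are aimed at is an i64 division and
remainder over the same operands, which previously went out as two separate
libcalls. A minimal IR sketch of that pattern (the function and value names
are illustrative, not taken from the patch):

    ; Both results share the operands %a and %b, so a single runtime call
    ; can produce the quotient and the remainder.
    define void @quotrem(i64 %a, i64 %b, i64* %q, i64* %r) {
      %div = udiv i64 %a, %b
      %rem = urem i64 %a, %b
      store i64 %div, i64* %q, align 8
      store i64 %rem, i64* %r, align 8
      ret void
    }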
@@ -504,6 +512,9 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
   case ISD::SDIVREM:            return lowerMulDiv(Op, MipsISD::DivRem, true, true, DAG);
   case ISD::UDIVREM:            return lowerMulDiv(Op, MipsISD::DivRemU, true, true, DAG);
+  case ISD::UDIV:
+  case ISD::UREM:
+    return lowerRemOrDiv(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return lowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::INTRINSIC_W_CHAIN:  return lowerINTRINSIC_W_CHAIN(Op, DAG);
   case ISD::INTRINSIC_VOID:     return lowerINTRINSIC_VOID(Op, DAG);
@@ -1315,6 +1326,59 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
 
   // MIPS32r6/MIPS64r6 removed accumulator based multiplies.
   assert(!Subtarget.hasMips32r6());
+
+  unsigned Opcode = Op.getOpcode();
+  MVT SimpleVT = Op.getSimpleValueType().SimpleTy;
+  if (Subtarget.hasNanoMips() && Opcode == ISD::UDIVREM &&
+      SimpleVT == MVT::i64) {
+    bool isSigned = false;
+    RTLIB::Libcall LC = RTLIB::UDIVREM_I64;
+
+    SDValue InChain = DAG.getEntryNode();
+
+    EVT RetVT = Op.getValueType();
+    Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    for (const SDValue &Operand : Op.getNode()->op_values()) {
+      EVT ArgVT = Operand.getValueType();
+      Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+      Entry.Node = Operand;
+      Entry.Ty = ArgTy;
+      Entry.IsSExt = isSigned;
+      Entry.IsZExt = !isSigned;
+      Args.push_back(Entry);
+    }
+
+    // Pass the address of the stack slot that receives the remainder.
+    SDValue FIPtr = DAG.CreateStackTemporary(RetVT);
+    Entry.Node = FIPtr;
+    Entry.Ty = RetTy->getPointerTo();
+    Entry.IsSExt = isSigned;
+    Entry.IsZExt = !isSigned;
+    Args.push_back(Entry);
+
+    SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+                                           getPointerTy(DAG.getDataLayout()));
+
+    SDLoc dl(Op);
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(dl)
+        .setChain(InChain)
+        .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+        .setSExtResult(isSigned)
+        .setZExtResult(!isSigned);
+
+    std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+    // Remainder is loaded back from the stack frame
+    SDValue Rem =
+        DAG.getLoad(RetVT, dl, CallInfo.second, FIPtr, MachinePointerInfo());
+
+    SDValue Vals[] = {CallInfo.first, Rem};
+    return DAG.getMergeValues(Vals, dl);
+  }
+
   EVT Ty = Op.getOperand(0).getValueType();
   SDLoc DL(Op);
   SDValue Mult = DAG.getNode(NewOpc, DL, MVT::Untyped,
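The stack temporary and the post-call load in the hunk above mirror the
contract of the runtime helper registered earlier: assuming the standard
libgcc/compiler-rt signature
unsigned long long __udivmoddi4(unsigned long long n, unsigned long long d, unsigned long long *rem),
the quotient is the return value and the remainder is written through the
extra pointer argument. Roughly, the lowering builds the equivalent of this
IR sketch (names are illustrative, not produced by the patch):

    declare i64 @__udivmoddi4(i64, i64, i64*)

    define void @quotrem(i64 %a, i64 %b, i64* %q, i64* %r) {
      %rem.slot = alloca i64, align 8
      ; Quotient comes back in the return registers; the remainder is
      ; stored to the stack slot and loaded back after the call.
      %quot = call i64 @__udivmoddi4(i64 %a, i64 %b, i64* %rem.slot)
      %rem = load i64, i64* %rem.slot, align 8
      store i64 %quot, i64* %q, align 8
      store i64 %rem, i64* %r, align 8
      ret void
    }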
@@ -1333,6 +1397,71 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
   return DAG.getMergeValues(Vals, DL);
 }
 
+// This custom lowering hook prevents expansion of DIV and REM nodes with i64
+// value types into a DIVREM node for the NanoMips target and lowers them into
+// the appropriate libcall instead.
+// During type legalization, DIV and REM nodes are expanded into a DIVREM node
+// because i64 is an illegal value type and the action for the DIVREM node is
+// set to "Custom" for the NanoMips target. We want to lower DIV and REM nodes
+// into the appropriate libcalls instead of expanding them to DIVREM. To
+// accomplish this, we set the actions for DIV and REM nodes with MVT::i64 to
+// "Custom" instead of "LibCall". As a result, this hook is called before the
+// expansion happens, bypassing the expansion while still lowering DIV and REM
+// into the appropriate libcalls.
+SDValue MipsSETargetLowering::lowerRemOrDiv(SDValue Op,
+                                            SelectionDAG &DAG) const {
+
+  unsigned Opcode = Op.getOpcode();
+  MVT SimpleVT = Op.getSimpleValueType().SimpleTy;
+  if (Subtarget.hasNanoMips() && (Opcode == ISD::UDIV || Opcode == ISD::UREM) &&
+      SimpleVT == MVT::i64) {
+
+    SDLoc dl(Op.getNode());
+    EVT VT = Op.getNode()->getValueType(0);
+    SDValue Ops[2] = {Op.getNode()->getOperand(0), Op.getNode()->getOperand(1)};
+    SDValue Lo, Hi;
+    Lo = Hi = SDValue();
+
+    RTLIB::Libcall LC =
+        Opcode == ISD::UDIV ? RTLIB::UDIV_I64 : RTLIB::UREM_I64;
+
+    TargetLowering::MakeLibCallOptions CallOptions;
+
+    SDValue LibcallOp = makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first;
+
+    EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(),
+                                   LibcallOp.getValueSizeInBits() / 2);
+
+    EVT LoVT, HiVT;
+    LoVT = HalfVT;
+    HiVT = HalfVT;
+
+    SDLoc DL(LibcallOp);
+
+    assert(LoVT.getSizeInBits() + HiVT.getSizeInBits() ==
+               LibcallOp.getValueSizeInBits() &&
+           "Invalid integer splitting!");
+
+    Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, LibcallOp);
+
+    unsigned ReqShiftAmountInBits =
+        Log2_32_Ceil(LibcallOp.getValueType().getSizeInBits());
+
+    MVT ShiftAmountTy =
+        getScalarShiftAmountTy(DAG.getDataLayout(), LibcallOp.getValueType());
+
+    assert(ReqShiftAmountInBits <= ShiftAmountTy.getSizeInBits());
+
+    Hi = DAG.getNode(ISD::SRL, DL, LibcallOp.getValueType(), LibcallOp,
+                     DAG.getConstant(LoVT.getSizeInBits(), DL, ShiftAmountTy));
+
+    Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
+
+    SDValue Vals[] = {LibcallOp, Lo, Hi};
+    return DAG.getMergeValues(Vals, dl);
+  }
+  return SDValue();
+}
+
 static SDValue initAccumulator(SDValue In, const SDLoc &DL, SelectionDAG &DAG) {
   SDValue InLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
                              DAG.getConstant(0, DL, MVT::i32));
diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.h b/llvm/lib/Target/Mips/MipsSEISelLowering.h
index 0ee36ae9f9425..406e8080a60c0 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.h
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.h
@@ -75,6 +75,7 @@ class TargetRegisterClass;
     SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
 
+    SDValue lowerRemOrDiv(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi,
                         SelectionDAG &DAG) const;
 
diff --git a/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.cpp b/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.cpp
index 32c33999c1417..f05490ba82a4b 100644
--- a/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.cpp
@@ -97,3 +97,7 @@ void NanoMipsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   UP.Threshold = 60;
   UP.OptSizeThreshold = 0;
 }
+
+bool NanoMipsTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
+  return F->hasOptSize() || (DataType->isIntegerTy(64) && !IsSigned);
+}
diff --git a/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.h b/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.h
index 5c77c7e2feebf..941591bf13f3d 100644
--- a/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.h
+++ b/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.h
@@ -36,10 +36,12 @@ class NanoMipsTTIImpl : public BasicTTIImplBase<NanoMipsTTIImpl> {
   const MipsSubtarget *getST() const { return ST; }
   const MipsTargetLowering *getTLI() const { return TLI; }
 
+  const Function *F;
+
 public:
   explicit NanoMipsTTIImpl(const MipsTargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
-        TLI(ST->getTargetLowering()) {}
+        TLI(ST->getTargetLowering()), F(&F) {}
 
   InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
                                 TTI::TargetCostKind CostKind);
@@ -49,6 +51,7 @@ class NanoMipsTTIImpl : public BasicTTIImplBase<NanoMipsTTIImpl> {
                              Instruction *Inst = nullptr);
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
+  bool hasDivRemOp(Type *DataType, bool IsSigned);
 };
 
 } // end namespace llvm
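The hasDivRemOp hook added above is what LLVM's DivRemPairs pass consults:
when it returns false, a urem whose matching udiv is also live is decomposed
into a multiply and a subtract; when it returns true, the pair is kept
intact, which is what lets the unsigned i64 case reach the single
__udivmoddi4 lowering and what the Os/Oz case relies on to avoid the
mul-and-sub expansion. A sketch of the decomposed form being avoided
(illustrative IR, mirroring the %rem.decomposed pattern in the new test
below):

    define i64 @rem_decomposed(i64 %a, i64 %b) {
      ; Remainder recomputed from the quotient: a - (a / b) * b.
      %div = udiv i64 %a, %b
      %mul = mul i64 %div, %b
      %rem = sub i64 %a, %mul
      ret i64 %rem
    }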
diff --git a/llvm/test/CodeGen/Mips/nanomips/divrem.ll b/llvm/test/CodeGen/Mips/nanomips/divrem.ll
new file mode 100644
index 0000000000000..aaa2ce2984d4d
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/nanomips/divrem.ll
@@ -0,0 +1,149 @@
+; RUN: llc -mtriple=nanomips -asm-show-inst -verify-machineinstrs < %s | FileCheck %s
+
+; Make sure to generate a __udivmoddi4 libcall when udiv and urem
+; instructions with the same operands are present
+; and the operands are of type int64
+define void @test1(i64 %a, i64 %b, i64* %divmod) {
+  ; CHECK: save 16, $ra, $s0
+  ; CHECK: move $s0, $a4
+  ; CHECK: move $a4, $sp
+  ; CHECK: balc __udivmoddi4
+  ; CHECK: swm $a0, 0($s0), 2
+  ; CHECK: lw $a0, 4($sp)
+  ; CHECK: sw $a0, 12($s0)
+  ; CHECK: lw $a0, 0($sp)
+  ; CHECK: sw $a0, 8($s0)
+  ; CHECK: restore.jrc 16, $ra, $s0
+  %div = udiv i64 %a, %b
+  store i64 %div, i64* %divmod, align 8
+  %rem = urem i64 %a, %b
+  %arrayidx1 = getelementptr inbounds i64, i64* %divmod, i32 1
+  store i64 %rem, i64* %arrayidx1, align 8
+  ret void
+}
+
+; Make sure to generate a __umoddi3 libcall when only urem is present
+; and the operands are of type int64
+define void @test2(i64 %a, i64 %b, i64* %divmod) {
+  ; CHECK: save 16, $ra, $s0
+  ; CHECK: move $s0, $a4
+  ; CHECK: balc __umoddi3
+  ; CHECK: swm $a0, 8($s0), 2
+  ; CHECK: restore.jrc 16, $ra, $s0
+  %rem = urem i64 %a, %b
+  %arrayidx = getelementptr inbounds i64, i64* %divmod, i32 1
+  store i64 %rem, i64* %arrayidx, align 8
+  ret void
+}
+
+; Make sure to generate a __udivdi3 libcall when only udiv is present
+; and the operands are of type int64
+define void @test3(i64 %a, i64 %b, i64* %divmod) {
+  ; CHECK: save 16, $ra, $s0
+  ; CHECK: move $s0, $a4
+  ; CHECK: balc __udivdi3
+  ; CHECK: swm $a0, 0($s0), 2
+  ; CHECK: restore.jrc 16, $ra, $s0
+  %div = udiv i64 %a, %b
+  store i64 %div, i64* %divmod, align 8
+  ret void
+}
+
+; If urem is already expanded into mul+sub and the operands
+; are of type int64, make sure it stays that way
+define void @test4(i64 %a, i64 %b, i64* %divmod) {
+  ; CHECK: save 32, $ra, $s0, $s1, $s2, $s3, $s4
+  ; CHECK: movep $s1, $s0, $a3, $a4
+  ; CHECK: movep $s4, $s2, $a1, $a2
+  ; CHECK: move $s3, $a0
+  ; CHECK: balc __udivdi3
+  ; CHECK: mul $a2, $a0, $s2
+  ; CHECK: subu $a3, $s3, $a2
+  ; CHECK: sw $a3, 8($s0)
+  ; CHECK: mul $a3, $a0, $s1
+  ; CHECK: muhu $s1, $a0, $s2
+  ; CHECK: addu $a3, $s1, $a3
+  ; CHECK: swm $a0, 0($s0), 2
+  ; CHECK: mul $a0, $a1, $s2
+  ; CHECK: addu $a0, $a3, $a0
+  ; CHECK: subu $a0, $s4, $a0
+  ; CHECK: sltu $a1, $s3, $a2
+  ; CHECK: subu $a0, $a0, $a1
+  ; CHECK: sw $a0, 12($s0)
+  ; CHECK: restore.jrc 32, $ra, $s0, $s1, $s2, $s3, $s4
+  %a.frozen = freeze i64 %a
+  %b.frozen = freeze i64 %b
+  %div = udiv i64 %a.frozen, %b.frozen
+  store i64 %div, i64* %divmod, align 8
+  %1 = mul i64 %div, %b.frozen
+  %rem.decomposed = sub i64 %a.frozen, %1
+  %arrayidx1 = getelementptr inbounds i64, i64* %divmod, i32 1
+  store i64 %rem.decomposed, i64* %arrayidx1, align 8
+  ret void
+}
+
+; Make sure to generate divu and modu when udiv and urem
+; instructions with the same operands are present
+; and the operands are of type int32
+define void @test5(i32 %a, i32 %b, i32* %divmod) {
+  ; CHECK: modu $a3, $a0, $a1
+  ; CHECK: teq $zero, $a1, 7
+  ; CHECK: sw $a3, 4($a2)
+  ; CHECK: divu $a0, $a0, $a1
+  ; CHECK: teq $zero, $a1, 7
+  ; CHECK: sw $a0, 0($a2)
+  ; CHECK: jrc $ra
+  %div = udiv i32 %a, %b
+  store i32 %div, i32* %divmod, align 4
+  %rem = urem i32 %a, %b
+  %arrayidx1 = getelementptr inbounds i32, i32* %divmod, i32 1
+  store i32 %rem, i32* %arrayidx1, align 4
+  ret void
+}
+
+; Make sure to generate modu when only urem is present
+; and the operands are of type int32
+define void @test6(i32 %a, i32 %b, i32* %divmod) {
+  ; CHECK: modu $a0, $a0, $a1
+  ; CHECK: teq $zero, $a1, 7
+  ; CHECK: sw $a0, 4($a2)
+  ; CHECK: jrc $ra
+  %rem = urem i32 %a, %b
+  %arrayidx = getelementptr inbounds i32, i32* %divmod, i32 1
+  store i32 %rem, i32* %arrayidx, align 4
+  ret void
+}
+
+; Make sure to generate divu when only udiv is present
+; and the operands are of type int32
+define void @test7(i32 %a, i32 %b, i32* %divmod) {
+  ; CHECK: divu $a0, $a0, $a1
+  ; CHECK: teq $zero, $a1, 7
+  ; CHECK: sw $a0, 0($a2)
+  ; CHECK: jrc $ra
+  %div = udiv i32 %a, %b
+  store i32 %div, i32* %divmod, align 4
+  ret void
+}
+
+; If urem is already expanded into mul+sub and the operands
+; are of type int32, make sure it stays that way.
+define void @test8(i32 %a, i32 %b, i32* %divmod) {
+  ; CHECK: divu $a3, $a0, $a1
+  ; CHECK: teq $zero, $a1, 7
+  ; CHECK: sw $a3, 0($a2)
+  ; CHECK: mul $a1, $a3, $a1
+  ; CHECK: subu $a0, $a0, $a1
+  ; CHECK: sw $a0, 4($a2)
+  ; CHECK: jrc $ra
+  %a.frozen = freeze i32 %a
+  %b.frozen = freeze i32 %b
+  %div = udiv i32 %a.frozen, %b.frozen
+  store i32 %div, i32* %divmod, align 4
+  %1 = mul i32 %div, %b.frozen
+  %rem.decomposed = sub i32 %a.frozen, %1
+  %arrayidx1 = getelementptr inbounds i32, i32* %divmod, i32 1
+  store i32 %rem.decomposed, i32* %arrayidx1, align 4
+  ret void
+}