From 9867d1171183556007c5fefd29ca30ed501b7714 Mon Sep 17 00:00:00 2001
From: Nikola Peric
Date: Thu, 15 Dec 2022 12:39:56 +0100
Subject: [PATCH] NanoMips: div-rem optimization

Disable expansion of mod to mul-and-sub when Os or Oz.
Generate udivmoddi4 libcall when div-rem pairs of type uint64 are present.
---
 llvm/lib/Target/Mips/MipsSEISelLowering.cpp   | 133 +++++++++++++++-
 llvm/lib/Target/Mips/MipsSEISelLowering.h     |   1 +
 .../Mips/NanoMipsTargetTransformInfo.cpp      |   4 +
 .../Target/Mips/NanoMipsTargetTransformInfo.h |   5 +-
 llvm/test/CodeGen/Mips/nanomips/divrem.ll     | 149 ++++++++++++++++++
 5 files changed, 289 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/Mips/nanomips/divrem.ll

diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index d409046446775..c844b22479608 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -208,8 +208,11 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
 
-  setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
-  setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+  if (!Subtarget.hasNanoMips()) {
+    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
+    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+  }
+
   setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
 
   if (Subtarget.hasNanoMips()) {
@@ -324,6 +327,11 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
     setOperationAction(ISD::SREM, MVT::i32, Legal);
     setOperationAction(ISD::UDIV, MVT::i32, Legal);
     setOperationAction(ISD::UREM, MVT::i32, Legal);
+
+    setLibcallName(RTLIB::UDIVREM_I64, "__udivmoddi4");
+    setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
+    setOperationAction(ISD::UDIV, MVT::i64, Custom);
+    setOperationAction(ISD::UREM, MVT::i64, Custom);
   }
 
   computeRegisterProperties(Subtarget.getRegisterInfo());
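For context, the pattern the operation actions and the UDIVREM_I64 libcall
name registered in the hunk above are aimed at is an i64 division and
remainder over the same operands, which previously went out as two separate
libcalls. A minimal IR sketch of that pattern (the function and value names
are illustrative, not taken from the patch):

    ; Both results share the operands %a and %b, so a single runtime call
    ; can produce the quotient and the remainder.
    define void @quotrem(i64 %a, i64 %b, i64* %q, i64* %r) {
      %div = udiv i64 %a, %b
      %rem = urem i64 %a, %b
      store i64 %div, i64* %q, align 8
      store i64 %rem, i64* %r, align 8
      ret void
    }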
@@ -504,6 +512,9 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
   case ISD::SDIVREM:            return lowerMulDiv(Op, MipsISD::DivRem, true, true, DAG);
   case ISD::UDIVREM:            return lowerMulDiv(Op, MipsISD::DivRemU, true, true, DAG);
+  case ISD::UDIV:
+  case ISD::UREM:
+    return lowerRemOrDiv(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return lowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::INTRINSIC_W_CHAIN:  return lowerINTRINSIC_W_CHAIN(Op, DAG);
   case ISD::INTRINSIC_VOID:     return lowerINTRINSIC_VOID(Op, DAG);
@@ -1315,6 +1326,59 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
 
   // MIPS32r6/MIPS64r6 removed accumulator based multiplies.
   assert(!Subtarget.hasMips32r6());
+
+  unsigned Opcode = Op.getOpcode();
+  MVT SimpleVT = Op.getSimpleValueType().SimpleTy;
+  if (Subtarget.hasNanoMips() && Opcode == ISD::UDIVREM &&
+      SimpleVT == MVT::i64) {
+    bool isSigned = false;
+    RTLIB::Libcall LC = RTLIB::UDIVREM_I64;
+
+    SDValue InChain = DAG.getEntryNode();
+
+    EVT RetVT = Op.getValueType();
+    Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    for (const SDValue &Operand : Op.getNode()->op_values()) {
+      EVT ArgVT = Operand.getValueType();
+      Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+      Entry.Node = Operand;
+      Entry.Ty = ArgTy;
+      Entry.IsSExt = isSigned;
+      Entry.IsZExt = !isSigned;
+      Args.push_back(Entry);
+    }
+
+    // Pass the address of the stack slot that receives the remainder.
+    SDValue FIPtr = DAG.CreateStackTemporary(RetVT);
+    Entry.Node = FIPtr;
+    Entry.Ty = RetTy->getPointerTo();
+    Entry.IsSExt = isSigned;
+    Entry.IsZExt = !isSigned;
+    Args.push_back(Entry);
+
+    SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+                                           getPointerTy(DAG.getDataLayout()));
+
+    SDLoc dl(Op);
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(dl)
+        .setChain(InChain)
+        .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+        .setSExtResult(isSigned)
+        .setZExtResult(!isSigned);
+
+    std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+    // Remainder is loaded back from the stack frame
+    SDValue Rem =
+        DAG.getLoad(RetVT, dl, CallInfo.second, FIPtr, MachinePointerInfo());
+
+    SDValue Vals[] = {CallInfo.first, Rem};
+    return DAG.getMergeValues(Vals, dl);
+  }
+
   EVT Ty = Op.getOperand(0).getValueType();
   SDLoc DL(Op);
   SDValue Mult = DAG.getNode(NewOpc, DL, MVT::Untyped,
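The stack temporary and the post-call load in the hunk above mirror the
contract of the runtime helper registered earlier: assuming the standard
libgcc/compiler-rt signature
unsigned long long __udivmoddi4(unsigned long long n, unsigned long long d, unsigned long long *rem),
the quotient is the return value and the remainder is written through the
extra pointer argument. Roughly, the lowering builds the equivalent of this
IR sketch (names are illustrative, not produced by the patch):

    declare i64 @__udivmoddi4(i64, i64, i64*)

    define void @quotrem(i64 %a, i64 %b, i64* %q, i64* %r) {
      %rem.slot = alloca i64, align 8
      ; Quotient comes back in the return registers; the remainder is
      ; stored to the stack slot and loaded back after the call.
      %quot = call i64 @__udivmoddi4(i64 %a, i64 %b, i64* %rem.slot)
      %rem = load i64, i64* %rem.slot, align 8
      store i64 %quot, i64* %q, align 8
      store i64 %rem, i64* %r, align 8
      ret void
    }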
@@ -1333,6 +1397,71 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
   return DAG.getMergeValues(Vals, DL);
 }
 
+// This custom lowering hook prevents expansion of DIV and REM nodes with i64
+// value types into a DIVREM node for the NanoMips target and lowers them into
+// the appropriate libcall instead.
+// During type legalization, DIV and REM nodes are expanded into a DIVREM node
+// because i64 is an illegal value type and the action for the DIVREM node is
+// set to "Custom" for the NanoMips target. We want to lower DIV and REM nodes
+// into the appropriate libcalls instead of expanding them to DIVREM. To
+// accomplish this, we set the actions for DIV and REM nodes with MVT::i64 to
+// "Custom" instead of "LibCall". As a result, this hook is called before the
+// expansion happens, bypassing the expansion while still lowering DIV and REM
+// into the appropriate libcalls.
+SDValue MipsSETargetLowering::lowerRemOrDiv(SDValue Op,
+                                            SelectionDAG &DAG) const {
+
+  unsigned Opcode = Op.getOpcode();
+  MVT SimpleVT = Op.getSimpleValueType().SimpleTy;
+  if (Subtarget.hasNanoMips() && (Opcode == ISD::UDIV || Opcode == ISD::UREM) &&
+      SimpleVT == MVT::i64) {
+
+    SDLoc dl(Op.getNode());
+    EVT VT = Op.getNode()->getValueType(0);
+    SDValue Ops[2] = {Op.getNode()->getOperand(0), Op.getNode()->getOperand(1)};
+    SDValue Lo, Hi;
+    Lo = Hi = SDValue();
+
+    RTLIB::Libcall LC =
+        Opcode == ISD::UDIV ? RTLIB::UDIV_I64 : RTLIB::UREM_I64;
+
+    TargetLowering::MakeLibCallOptions CallOptions;
+
+    SDValue LibcallOp = makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first;
+
+    EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(),
+                                   LibcallOp.getValueSizeInBits() / 2);
+
+    EVT LoVT, HiVT;
+    LoVT = HalfVT;
+    HiVT = HalfVT;
+
+    SDLoc DL(LibcallOp);
+
+    assert(LoVT.getSizeInBits() + HiVT.getSizeInBits() ==
+               LibcallOp.getValueSizeInBits() &&
+           "Invalid integer splitting!");
+
+    Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, LibcallOp);
+
+    unsigned ReqShiftAmountInBits =
+        Log2_32_Ceil(LibcallOp.getValueType().getSizeInBits());
+
+    MVT ShiftAmountTy =
+        getScalarShiftAmountTy(DAG.getDataLayout(), LibcallOp.getValueType());
+
+    assert(ReqShiftAmountInBits <= ShiftAmountTy.getSizeInBits());
+
+    Hi = DAG.getNode(ISD::SRL, DL, LibcallOp.getValueType(), LibcallOp,
+                     DAG.getConstant(LoVT.getSizeInBits(), DL, ShiftAmountTy));
+
+    Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
+
+    SDValue Vals[] = {LibcallOp, Lo, Hi};
+    return DAG.getMergeValues(Vals, dl);
+  }
+  return SDValue();
+}
+
 static SDValue initAccumulator(SDValue In, const SDLoc &DL, SelectionDAG &DAG) {
   SDValue InLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
                              DAG.getConstant(0, DL, MVT::i32));
diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.h b/llvm/lib/Target/Mips/MipsSEISelLowering.h
index 0ee36ae9f9425..406e8080a60c0 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.h
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.h
@@ -75,6 +75,7 @@ class TargetRegisterClass;
     SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
 
+    SDValue lowerRemOrDiv(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi,
                         SelectionDAG &DAG) const;
 
diff --git a/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.cpp b/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.cpp
index 32c33999c1417..f05490ba82a4b 100644
--- a/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.cpp
@@ -97,3 +97,7 @@ void NanoMipsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   UP.Threshold = 60;
   UP.OptSizeThreshold = 0;
 }
+
+bool NanoMipsTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
+  return F->hasOptSize() || (DataType->isIntegerTy(64) && !IsSigned);
+}
diff --git a/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.h b/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.h
index 5c77c7e2feebf..941591bf13f3d 100644
--- a/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.h
+++ b/llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.h
@@ -36,10 +36,12 @@ class NanoMipsTTIImpl : public BasicTTIImplBase<NanoMipsTTIImpl> {
   const MipsSubtarget *getST() const { return ST; }
   const MipsTargetLowering *getTLI() const { return TLI; }
 
+  const Function *F;
+
 public:
   explicit NanoMipsTTIImpl(const MipsTargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
-        TLI(ST->getTargetLowering()) {}
+        TLI(ST->getTargetLowering()), F(&F) {}
 
   InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
                                 TTI::TargetCostKind CostKind);
@@ -49,6 +51,7 @@ class NanoMipsTTIImpl : public BasicTTIImplBase<NanoMipsTTIImpl> {
                              Instruction *Inst = nullptr);
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
+  bool hasDivRemOp(Type *DataType, bool IsSigned);
 };
 
 } // end namespace llvm
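The hasDivRemOp hook added above is what LLVM's DivRemPairs pass consults:
when it returns false, a urem whose matching udiv is also live is decomposed
into a multiply and a subtract; when it returns true, the pair is kept
intact, which is what lets the unsigned i64 case reach the single
__udivmoddi4 lowering and what the Os/Oz case relies on to avoid the
mul-and-sub expansion. A sketch of the decomposed form being avoided
(illustrative IR, mirroring the %rem.decomposed pattern in the new test
below):

    define i64 @rem_decomposed(i64 %a, i64 %b) {
      ; Remainder recomputed from the quotient: a - (a / b) * b.
      %div = udiv i64 %a, %b
      %mul = mul i64 %div, %b
      %rem = sub i64 %a, %mul
      ret i64 %rem
    }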
diff --git a/llvm/test/CodeGen/Mips/nanomips/divrem.ll b/llvm/test/CodeGen/Mips/nanomips/divrem.ll
new file mode 100644
index 0000000000000..aaa2ce2984d4d
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/nanomips/divrem.ll
@@ -0,0 +1,149 @@
+; RUN: llc -mtriple=nanomips -asm-show-inst -verify-machineinstrs < %s | FileCheck %s
+
+; Make sure to generate a __udivmoddi4 libcall when udiv and urem
+; instructions with the same operands are present
+; and the operands are of type int64
+define void @test1(i64 %a, i64 %b, i64* %divmod) {
+  ; CHECK: save 16, $ra, $s0
+  ; CHECK: move $s0, $a4
+  ; CHECK: move $a4, $sp
+  ; CHECK: balc __udivmoddi4
+  ; CHECK: swm $a0, 0($s0), 2
+  ; CHECK: lw $a0, 4($sp)
+  ; CHECK: sw $a0, 12($s0)
+  ; CHECK: lw $a0, 0($sp)
+  ; CHECK: sw $a0, 8($s0)
+  ; CHECK: restore.jrc 16, $ra, $s0
+  %div = udiv i64 %a, %b
+  store i64 %div, i64* %divmod, align 8
+  %rem = urem i64 %a, %b
+  %arrayidx1 = getelementptr inbounds i64, i64* %divmod, i32 1
+  store i64 %rem, i64* %arrayidx1, align 8
+  ret void
+}
+
+; Make sure to generate a __umoddi3 libcall when only urem is present
+; and the operands are of type int64
+define void @test2(i64 %a, i64 %b, i64* %divmod) {
+  ; CHECK: save 16, $ra, $s0
+  ; CHECK: move $s0, $a4
+  ; CHECK: balc __umoddi3
+  ; CHECK: swm $a0, 8($s0), 2
+  ; CHECK: restore.jrc 16, $ra, $s0
+  %rem = urem i64 %a, %b
+  %arrayidx = getelementptr inbounds i64, i64* %divmod, i32 1
+  store i64 %rem, i64* %arrayidx, align 8
+  ret void
+}
+
+; Make sure to generate a __udivdi3 libcall when only udiv is present
+; and the operands are of type int64
+define void @test3(i64 %a, i64 %b, i64* %divmod) {
+  ; CHECK: save 16, $ra, $s0
+  ; CHECK: move $s0, $a4
+  ; CHECK: balc __udivdi3
+  ; CHECK: swm $a0, 0($s0), 2
+  ; CHECK: restore.jrc 16, $ra, $s0
+  %div = udiv i64 %a, %b
+  store i64 %div, i64* %divmod, align 8
+  ret void
+}
+
+; If urem is already expanded into mul+sub and the operands
+; are of type int64, make sure it stays that way
+define void @test4(i64 %a, i64 %b, i64* %divmod) {
+  ; CHECK: save 32, $ra, $s0, $s1, $s2, $s3, $s4
+  ; CHECK: movep $s1, $s0, $a3, $a4
+  ; CHECK: movep $s4, $s2, $a1, $a2
+  ; CHECK: move $s3, $a0
+  ; CHECK: balc __udivdi3
+  ; CHECK: mul $a2, $a0, $s2
+  ; CHECK: subu $a3, $s3, $a2
+  ; CHECK: sw $a3, 8($s0)
+  ; CHECK: mul $a3, $a0, $s1
+  ; CHECK: muhu $s1, $a0, $s2
+  ; CHECK: addu $a3, $s1, $a3
+  ; CHECK: swm $a0, 0($s0), 2
+  ; CHECK: mul $a0, $a1, $s2
+  ; CHECK: addu $a0, $a3, $a0
+  ; CHECK: subu $a0, $s4, $a0
+  ; CHECK: sltu $a1, $s3, $a2
+  ; CHECK: subu $a0, $a0, $a1
+  ; CHECK: sw $a0, 12($s0)
+  ; CHECK: restore.jrc 32, $ra, $s0, $s1, $s2, $s3, $s4
+  %a.frozen = freeze i64 %a
+  %b.frozen = freeze i64 %b
+  %div = udiv i64 %a.frozen, %b.frozen
+  store i64 %div, i64* %divmod, align 8
+  %1 = mul i64 %div, %b.frozen
+  %rem.decomposed = sub i64 %a.frozen, %1
+  %arrayidx1 = getelementptr inbounds i64, i64* %divmod, i32 1
+  store i64 %rem.decomposed, i64* %arrayidx1, align 8
+  ret void
+}
+
+; Make sure to generate divu and modu when udiv and urem
+; instructions with the same operands are present
+; and the operands are of type int32
+define void @test5(i32 %a, i32 %b, i32* %divmod) {
+  ; CHECK: modu $a3, $a0, $a1
+  ; CHECK: teq $zero, $a1, 7
+  ; CHECK: sw $a3, 4($a2)
+  ; CHECK: divu $a0, $a0, $a1
+  ; CHECK: teq $zero, $a1, 7
+  ; CHECK: sw $a0, 0($a2)
+  ; CHECK: jrc $ra
+  %div = udiv i32 %a, %b
+  store i32 %div, i32* %divmod, align 4
+  %rem = urem i32 %a, %b
+  %arrayidx1 = getelementptr inbounds i32, i32* %divmod, i32 1
+  store i32 %rem, i32* %arrayidx1, align 4
+  ret void
+}
+
+; Make sure to generate modu when only urem is present
+; and the operands are of type int32
+define void @test6(i32 %a, i32 %b, i32* %divmod) {
+  ; CHECK: modu $a0, $a0, $a1
+  ; CHECK: teq $zero, $a1, 7
+  ; CHECK: sw $a0, 4($a2)
+  ; CHECK: jrc $ra
+  %rem = urem i32 %a, %b
+  %arrayidx = getelementptr inbounds i32, i32* %divmod, i32 1
+  store i32 %rem, i32* %arrayidx, align 4
+  ret void
+}
+
+; Make sure to generate divu when only udiv is present
+; and the operands are of type int32
+define void @test7(i32 %a, i32 %b, i32* %divmod) {
+  ; CHECK: divu $a0, $a0, $a1
+  ; CHECK: teq $zero, $a1, 7
+  ; CHECK: sw $a0, 0($a2)
+  ; CHECK: jrc $ra
+  %div = udiv i32 %a, %b
+  store i32 %div, i32* %divmod, align 4
+  ret void
+}
+
+; If urem is already expanded into mul+sub and the operands
+; are of type int32, make sure it stays that way.
+define void @test8(i32 %a, i32 %b, i32* %divmod) {
+  ; CHECK: divu $a3, $a0, $a1
+  ; CHECK: teq $zero, $a1, 7
+  ; CHECK: sw $a3, 0($a2)
+  ; CHECK: mul $a1, $a3, $a1
+  ; CHECK: subu $a0, $a0, $a1
+  ; CHECK: sw $a0, 4($a2)
+  ; CHECK: jrc $ra
+  %a.frozen = freeze i32 %a
+  %b.frozen = freeze i32 %b
+  %div = udiv i32 %a.frozen, %b.frozen
+  store i32 %div, i32* %divmod, align 4
+  %1 = mul i32 %div, %b.frozen
+  %rem.decomposed = sub i32 %a.frozen, %1
+  %arrayidx1 = getelementptr inbounds i32, i32* %divmod, i32 1
+  store i32 %rem.decomposed, i32* %arrayidx1, align 4
+  ret void
+}