diff --git a/eng/native/configureplatform.cmake b/eng/native/configureplatform.cmake index 7a845a5bf7be4a..274ab363bf7905 100644 --- a/eng/native/configureplatform.cmake +++ b/eng/native/configureplatform.cmake @@ -29,6 +29,8 @@ if(CLR_CMAKE_HOST_OS STREQUAL linux) set(CLR_CMAKE_HOST_UNIX_X86 1) elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL aarch64) set(CLR_CMAKE_HOST_UNIX_ARM64 1) + elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL riscv64) + set(CLR_CMAKE_HOST_UNIX_RISCV64 1) else() clr_unknown_arch() endif() diff --git a/src/coreclr/jit/codegenriscv64.cpp b/src/coreclr/jit/codegenriscv64.cpp index 9a4736eef737ed..b4ca00cb16c003 100644 --- a/src/coreclr/jit/codegenriscv64.cpp +++ b/src/coreclr/jit/codegenriscv64.cpp @@ -7223,7 +7223,7 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe if (leftFrameSize != 0) { - genStackPointerAdjustment(-leftFrameSize, REG_SCRATCH, nullptr, /* reportUnwindData */ true); + genStackPointerAdjustment(-leftFrameSize, REG_SCRATCH, nullptr, /* reportUnwindData */ false); } } diff --git a/src/coreclr/nativeaot/Common/src/Internal/Runtime/TransitionBlock.cs b/src/coreclr/nativeaot/Common/src/Internal/Runtime/TransitionBlock.cs index 7932ba9300dc3a..612430714bd2ca 100644 --- a/src/coreclr/nativeaot/Common/src/Internal/Runtime/TransitionBlock.cs +++ b/src/coreclr/nativeaot/Common/src/Internal/Runtime/TransitionBlock.cs @@ -366,8 +366,6 @@ internal struct ReturnBlock { private IntPtr returnValue; private IntPtr returnValue2; - private IntPtr returnValue3; - private IntPtr returnValue4; } [StructLayout(LayoutKind.Sequential)] diff --git a/src/coreclr/nativeaot/Runtime/CMakeLists.txt b/src/coreclr/nativeaot/Runtime/CMakeLists.txt index 8cf45e0018bfa8..ccf197b08f6602 100644 --- a/src/coreclr/nativeaot/Runtime/CMakeLists.txt +++ b/src/coreclr/nativeaot/Runtime/CMakeLists.txt @@ -236,7 +236,7 @@ endif() add_definitions(-DFEATURE_BASICFREEZE) add_definitions(-DFEATURE_CONSERVATIVE_GC) -if(CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_ARM64) +if(CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_ARM64 OR CLR_CMAKE_TARGET_ARCH_RISCV64) add_definitions(-DFEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP) add_definitions(-DFEATURE_MANUALLY_MANAGED_CARD_BUNDLES) endif() diff --git a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp index 302bd05861a509..78a39612f907c0 100644 --- a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp +++ b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp @@ -530,7 +530,7 @@ void StackFrameIterator::InternalInit(Thread * pThreadToWalk, PTR_PAL_LIMITED_CO // preserved floating-point registers // int32_t preservedFpIndices[] = {8, 9, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}; - for (int i = 0; i < sizeof(preservedFpIndices) / sizeof(preservedFpIndices[0]); i++) + for (int i = 0; i < ARRAY_SIZE(preservedFpIndices); i++) { m_RegDisplay.F[preservedFpIndices[i]] = pCtx->F[preservedFpIndices[i]]; } @@ -809,6 +809,8 @@ void StackFrameIterator::InternalInit(Thread * pThreadToWalk, NATIVE_CONTEXT* pC m_RegDisplay.pS9 = (PTR_uintptr_t)PTR_TO_REG(pCtx, S9); m_RegDisplay.pS10 = (PTR_uintptr_t)PTR_TO_REG(pCtx, S10); m_RegDisplay.pS11 = (PTR_uintptr_t)PTR_TO_REG(pCtx, S11); + m_RegDisplay.pFP = (PTR_uintptr_t)PTR_TO_REG(pCtx, Fp); + m_RegDisplay.pRA = (PTR_uintptr_t)PTR_TO_REG(pCtx, Ra); // // scratch regs @@ -1285,9 +1287,10 @@ void StackFrameIterator::UnwindFuncletInvokeThunk() #elif defined(TARGET_RISCV64) PTR_uint64_t f = (PTR_uint64_t)(m_RegDisplay.SP); - for (int i 
= 0; i < 32; i++) + int32_t preservedFpIndices[] = {8, 9, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}; + for (int i = 0; i < ARRAY_SIZE(preservedFpIndices); i++) { - m_RegDisplay.F[i] = *f++; + m_RegDisplay.F[preservedFpIndices[i]] = *f++; } SP = (PTR_uintptr_t)f; @@ -1496,12 +1499,12 @@ struct UniversalTransitionStackFrame // Conservative GC reporting must be applied to everything between the base of the // ReturnBlock and the top of the StackPassedArgs. private: - uintptr_t m_pushedRA; // ChildSP+000 CallerSP-0F0 (0x08 bytes) (ra) - uintptr_t m_pushedFP; // ChildSP+008 CallerSP-0E8 (0x08 bytes) (fp) - Fp128 m_fpArgRegs[8]; // ChildSP+010 CallerSP-0E0 (0x80 bytes) (fa0-fa7) - uintptr_t m_returnBlock[4]; // ChildSP+090 CallerSP-060 (0x20 bytes) - uintptr_t m_intArgRegs[8]; // ChildSP+0B0 CallerSP-040 (0x40 bytes) (a0-a7) - uintptr_t m_stackPassedArgs[1]; // ChildSP+0F0 CallerSP+000 (unknown size) + uintptr_t m_pushedFP; // ChildSP+000 CallerSP-0A0 (0x08 bytes) (fp) + uintptr_t m_pushedRA; // ChildSP+008 CallerSP-098 (0x08 bytes) (ra) + uint64_t m_fpArgRegs[8]; // ChildSP+010 CallerSP-090 (0x40 bytes) (fa0-fa7) + uintptr_t m_returnBlock[2]; // ChildSP+050 CallerSP-050 (0x10 bytes) + uintptr_t m_intArgRegs[8]; // ChildSP+060 CallerSP-040 (0x40 bytes) (a0-a7) + uintptr_t m_stackPassedArgs[1]; // ChildSP+0A0 CallerSP+000 (unknown size) public: PTR_uintptr_t get_CallerSP() { return GET_POINTER_TO_FIELD(m_stackPassedArgs[0]); } diff --git a/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp b/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp index 9f7211a2ee7623..01f2dbfec95348 100644 --- a/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp +++ b/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp @@ -25,7 +25,7 @@ #elif TARGET_LOONGARCH64 #define THUNK_SIZE 16 #elif TARGET_RISCV64 -#define THUNK_SIZE 12 +#define THUNK_SIZE 20 #else #define THUNK_SIZE (2 * OS_PAGE_SIZE) // This will cause RhpGetNumThunksPerBlock to return 0 #endif @@ -259,21 +259,27 @@ EXTERN_C void* QCALLTYPE RhAllocateThunksMapping() #elif defined(TARGET_RISCV64) - // auipc t0, %hi(delta) // Load upper immediate with address high bits - // ld t1, %lo(delta)(t0) // Load data from address in (t0 + lower immediate) - // jr t1 // Jump and don't link register + //auipc t1, hi() + //addi t1, t1, lo() + //auipc t0, hi() + //ld t0, (t0) + //jalr zero, t0, 0 int delta = (int)(pCurrentDataAddress - pCurrentThunkAddress); - uint32_t deltaHi = (delta + 0x800) & 0xfffff000; - uint32_t deltaLo = delta << (32 - 12); - - *((uint32_t*)pCurrentThunkAddress) = 0x00000297 | deltaHi; // auipc + *((uint32_t*)pCurrentThunkAddress) = 0x00000317 | ((((delta + 0x800) & 0xFFFFF000) >> 12) << 12); // auipc t1, delta[31:12] + pCurrentThunkAddress += 4; + + *((uint32_t*)pCurrentThunkAddress) = 0x00030313 | ((delta & 0xFFF) << 20); // addi t1, t1, delta[11:0] + pCurrentThunkAddress += 4; + + delta += OS_PAGE_SIZE - POINTER_SIZE - (i * POINTER_SIZE * 2) - 8; + *((uint32_t*)pCurrentThunkAddress) = 0x00000297 | ((((delta + 0x800) & 0xFFFFF000) >> 12) << 12); // auipc t0, delta[31:12] pCurrentThunkAddress += 4; - *((uint32_t*)pCurrentThunkAddress) = 0x0002B303 | deltaLo; // addi + *((uint32_t*)pCurrentThunkAddress) = 0x0002b283 | ((delta & 0xFFF) << 20); // ld t0, (delta[11:0])(t0) pCurrentThunkAddress += 4; - *((uint32_t*)pCurrentThunkAddress) = 0x00030067; // jr + *((uint32_t*)pCurrentThunkAddress) = 0x00008282; // jalr zero, t0, 0 pCurrentThunkAddress += 4; #else diff --git a/src/coreclr/nativeaot/Runtime/riscv64/AllocFast.S 
b/src/coreclr/nativeaot/Runtime/riscv64/AllocFast.S index f09e0452042734..4690b12c38dbfb 100644 --- a/src/coreclr/nativeaot/Runtime/riscv64/AllocFast.S +++ b/src/coreclr/nativeaot/Runtime/riscv64/AllocFast.S @@ -74,8 +74,8 @@ LOCAL_LABEL(RhpNewFast_RarePath): // a3: transition frame - // Preserve the MethodTable in s0 - mv s0, a0 + // Preserve the MethodTable in s2 + mv s2, a0 li a2, 0 // numElements @@ -96,7 +96,7 @@ LOCAL_LABEL(NewOutOfMemory): // This is the OOM failure path. We are going to tail-call to a managed helper that will throw // an out of memory exception that the caller of this allocator understands. - mv a0, s0 // MethodTable pointer + mv a0, s2 // MethodTable pointer li a1, 0 // Indicate that we should throw OOM. POP_COOP_PINVOKE_FRAME @@ -243,7 +243,7 @@ LOCAL_LABEL(RhpNewArray_Rare): PUSH_COOP_PINVOKE_FRAME a3 // Preserve data we will need later into the callee saved registers - mv s0, a0 // Preserve MethodTable + mv s2, a0 // Preserve MethodTable mv a2, a1 // numElements li a1, 0 // uFlags @@ -264,7 +264,7 @@ LOCAL_LABEL(ArrayOutOfMemory): // This is the OOM failure path. We are going to tail-call to a managed helper that will throw // an out of memory exception that the caller of this allocator understands. - mv a0, s0 // MethodTable Pointer + mv a0, s2 // MethodTable Pointer li a1, 0 // Indicate that we should throw OOM. POP_COOP_PINVOKE_FRAME diff --git a/src/coreclr/nativeaot/Runtime/riscv64/ExceptionHandling.S b/src/coreclr/nativeaot/Runtime/riscv64/ExceptionHandling.S index ff20aeb736cc88..8258325967821a 100644 --- a/src/coreclr/nativeaot/Runtime/riscv64/ExceptionHandling.S +++ b/src/coreclr/nativeaot/Runtime/riscv64/ExceptionHandling.S @@ -4,7 +4,7 @@ #include #include "AsmOffsets.inc" -#define STACKSIZEOF_ExInfo ((SIZEOF__ExInfo + 15)&(~15)) +#define STACKSIZEOF_ExInfo ((SIZEOF__ExInfo + 7) & ~7) #define HARDWARE_EXCEPTION 1 #define SOFTWARE_EXCEPTION 0 diff --git a/src/coreclr/nativeaot/Runtime/riscv64/GcProbe.S b/src/coreclr/nativeaot/Runtime/riscv64/GcProbe.S index f5f41c44c78fac..0d89b36e1aabdf 100644 --- a/src/coreclr/nativeaot/Runtime/riscv64/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/riscv64/GcProbe.S @@ -44,14 +44,13 @@ # Perform the rest of the PInvokeTransitionFrame initialization. 
sd \threadReg, OFFSETOF__PInvokeTransitionFrame__m_pThread(sp) # Thread * (unused by stackwalker) - sd \BITMASK, (OFFSETOF__PInvokeTransitionFrame__m_pThread + 8)(sp) # Save the register bitmask passed in by caller + sd \BITMASK, OFFSETOF__PInvokeTransitionFrame__m_Flags(sp) # Save the register bitmask passed in by caller addi \trashReg, sp, PROBE_FRAME_SIZE # Recover value of caller's SP sd \trashReg, 0x78(sp) # Save caller's SP # Link the frame into the Thread - mv \trashReg, sp - sd \trashReg, OFFSETOF__Thread__m_pDeferredTransitionFrame(\threadReg) + sd sp, OFFSETOF__Thread__m_pDeferredTransitionFrame(\threadReg) .endm @@ -84,7 +83,9 @@ .macro FixupHijackedCallstack // a2 <- GetThread() + mv t1, a0 INLINE_GETTHREAD a2 + mv a0, t1 // Fix the stack by restoring the original return address ld ra, OFFSETOF__Thread__m_pvHijackedReturnAddress(a2) @@ -100,14 +101,13 @@ NESTED_ENTRY RhpGcProbeHijack, _TEXT, NoHandler FixupHijackedCallstack - PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, a3 - andi t3, a3, 1 << TrapThreadsFlags_TrapThreads_Bit + PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, t3 + andi t3, t3, 1 << TrapThreadsFlags_TrapThreads_Bit bnez t3, LOCAL_LABEL(WaitForGC) jr ra LOCAL_LABEL(WaitForGC): - li t6, (DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_A0 + PTFF_SAVE_A1 + PTFF_THREAD_HIJACK_HI) - or t3, t3, t6 + li t3, (DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_A0 + PTFF_SAVE_A1 + (PTFF_THREAD_HIJACK_HI << 32)) tail C_FUNC(RhpWaitForGC) NESTED_END RhpGcProbeHijack diff --git a/src/coreclr/nativeaot/Runtime/riscv64/InteropThunksHelpers.S b/src/coreclr/nativeaot/Runtime/riscv64/InteropThunksHelpers.S index 04f28699dd2940..a19cf4c0010214 100644 --- a/src/coreclr/nativeaot/Runtime/riscv64/InteropThunksHelpers.S +++ b/src/coreclr/nativeaot/Runtime/riscv64/InteropThunksHelpers.S @@ -12,27 +12,29 @@ // // RhCommonStub // - // INPUT: tp: thunk's data block + // INPUT: t1: thunk's data block // - // TRASHES: t0, t1, tp + // TRASHES: t0, t1, t2 // LEAF_ENTRY RhCommonStub, _TEXT // There are arbitrary callers passing arguments with arbitrary signatures. 
// Custom calling convention: // tp pointer to the current thunk's data block (data contains 2 pointer values: context + target pointers) + mv t2, a0 INLINE_GET_TLS_VAR t0, C_FUNC(tls_thunkData) + mv a0, t2 // t0 = base address of TLS data - // tp = address of context cell in thunk's data + // t1 = address of context cell in thunk's data // Load the thunk address from the data block and store it in the thread's static storage - ld t1, 0(t0) // Load thunk address into t1 from the TLS base address - sd t1, 0(t0) // Store the thunk address in thread static storage + ld t2, 0(t1) // Load thunk data into t2 + sd t2, 0(t0) // Store the thunk address in thread static storage // Load the target address from the data block and jump to it - ld t1, POINTER_SIZE(t0) // Load target address into t1 from the data block - jalr t1 // Jump to the target address in t1 + ld t1, POINTER_SIZE(t1) // Load target address into t1 from the data block + jr t1 // Jump to the target address in t1 LEAF_END RhCommonStub, _TEXT diff --git a/src/coreclr/nativeaot/Runtime/riscv64/PInvoke.S b/src/coreclr/nativeaot/Runtime/riscv64/PInvoke.S index d1264271cc79c3..93b360ebda1c1e 100644 --- a/src/coreclr/nativeaot/Runtime/riscv64/PInvoke.S +++ b/src/coreclr/nativeaot/Runtime/riscv64/PInvoke.S @@ -19,7 +19,7 @@ NESTED_ENTRY RhpPInvoke, _TEXT, NoHandler sd fp, OFFSETOF__PInvokeTransitionFrame__m_FramePointer(a0) sd ra, OFFSETOF__PInvokeTransitionFrame__m_RIP(a0) - sd t0, OFFSETOF__PInvokeTransitionFrame__m_PreservedRegs(a0) + sd sp, OFFSETOF__PInvokeTransitionFrame__m_PreservedRegs(a0) li t0, PTFF_SAVE_SP sd t0, OFFSETOF__PInvokeTransitionFrame__m_Flags(a0) @@ -40,7 +40,7 @@ LEAF_ENTRY RhpPInvokeReturn, _TEXT PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, t0 - bnez t1, 0f // If TrapThreadsFlags_None is non-zero, branch + bnez t0, 0f // If TrapThreadsFlags_None is non-zero, branch ret 0: diff --git a/src/coreclr/nativeaot/Runtime/riscv64/UniversalTransition.S b/src/coreclr/nativeaot/Runtime/riscv64/UniversalTransition.S index 89691462c1231b..234e6b46357dd5 100644 --- a/src/coreclr/nativeaot/Runtime/riscv64/UniversalTransition.S +++ b/src/coreclr/nativeaot/Runtime/riscv64/UniversalTransition.S @@ -12,15 +12,12 @@ .global RhpFpTrashValues #endif // TRASH_SAVED_ARGUMENT_REGISTERS -// Padding to account for the odd number of saved integer registers -#define ALIGNMENT_PADDING_SIZE (8) - #define COUNT_ARG_REGISTERS (8) #define INTEGER_REGISTER_SIZE (8) #define ARGUMENT_REGISTERS_SIZE (COUNT_ARG_REGISTERS * INTEGER_REGISTER_SIZE) // Largest return block is 4 doubles -#define RETURN_BLOCK_SIZE (32) +#define RETURN_BLOCK_SIZE 16 #define COUNT_FLOAT_ARG_REGISTERS (8) #define FLOAT_REGISTER_SIZE (8) @@ -31,7 +28,6 @@ // From CallerSP to ChildSP, the stack frame is composed of the following adjacent regions: // -// ALIGNMENT_PADDING_SIZE // ARGUMENT_REGISTERS_SIZE // RETURN_BLOCK_SIZE // FLOAT_ARG_REGISTERS_SIZE @@ -41,7 +37,7 @@ #define DISTANCE_FROM_CHILDSP_TO_RETURN_BLOCK (PUSHED_FP_SIZE + PUSHED_RA_SIZE + FLOAT_ARG_REGISTERS_SIZE) -#define STACK_SIZE (ALIGNMENT_PADDING_SIZE + ARGUMENT_REGISTERS_SIZE + RETURN_BLOCK_SIZE + FLOAT_ARG_REGISTERS_SIZE + PUSHED_RA_SIZE + PUSHED_FP_SIZE) +#define STACK_SIZE (ARGUMENT_REGISTERS_SIZE + RETURN_BLOCK_SIZE + FLOAT_ARG_REGISTERS_SIZE + PUSHED_RA_SIZE + PUSHED_FP_SIZE) #define FLOAT_ARG_OFFSET (PUSHED_FP_SIZE + PUSHED_RA_SIZE) #define ARGUMENT_REGISTERS_OFFSET (FLOAT_ARG_OFFSET + FLOAT_ARG_REGISTERS_SIZE + RETURN_BLOCK_SIZE) @@ -63,9 +59,8 @@ // Frame layout is: // // {StackPassedArgs} ChildSP+100 
CallerSP+000 -// {AlignmentPad (0x8 bytes)} ChildSP+0F8 CallerSP-008 // {IntArgRegs (a0-a7) (0x40 bytes)} ChildSP+0B8 CallerSP-048 -// {ReturnBlock (0x20 bytes)} ChildSP+098 CallerSP-068 +// {ReturnBlock (0x10 bytes)} ChildSP+098 CallerSP-068 // -- The base address of the Return block is the TransitionBlock pointer, the floating point args are // in the neg space of the TransitionBlock pointer. Note that the callee has knowledge of the exact // layout of all pieces of the frame that lie at or above the pushed floating point registers. @@ -91,9 +86,7 @@ NESTED_ENTRY Rhp\FunctionName, _TEXT, NoHandler # FP and RA registers - addi sp, sp, -STACK_SIZE - sd s0, 0x0(sp) # Save frame pointer - sd ra, 0x08(sp) # Save return address + PROLOG_SAVE_REG_PAIR_INDEXED fp, ra, STACK_SIZE # Floating point registers fsd fa0, FLOAT_ARG_OFFSET(sp) @@ -105,7 +98,7 @@ fsd fa6, FLOAT_ARG_OFFSET + 0x30(sp) fsd fa7, FLOAT_ARG_OFFSET + 0x38(sp) - # Space for return buffer data (0x40 bytes) + # Space for return block data (0x10 bytes) # Save argument registers sd a0, ARGUMENT_REGISTERS_OFFSET(sp) diff --git a/src/coreclr/nativeaot/Runtime/riscv64/WriteBarriers.S b/src/coreclr/nativeaot/Runtime/riscv64/WriteBarriers.S index 1e9fedaa9f21c1..e063f549a398a1 100644 --- a/src/coreclr/nativeaot/Runtime/riscv64/WriteBarriers.S +++ b/src/coreclr/nativeaot/Runtime/riscv64/WriteBarriers.S @@ -31,7 +31,6 @@ la t3, g_GCShadow ld t3, 0(t3) beq t3, zero, 1f - li t4, 0 // Save destReg since we're about to modify it (and we need the original value both within the macro and // once we exit the macro). @@ -106,6 +105,7 @@ #ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP // Update the write watch table if necessary la t2, g_write_watch_table + ld t2, (t2) beqz t2, 2f srli t6, \destReg, 12 // SoftwareWriteWatch::AddressToTableByteIndexShift @@ -120,33 +120,40 @@ // We can skip the card table write if the reference is to // an object not on the ephemeral segment. la t2, g_ephemeral_low + ld t2, (t2) la t6, g_ephemeral_high - bgeu \refReg, t2, 0f - bltu \refReg, t6, 0f + ld t6, (t6) + bltu \refReg, t2, 0f + bgeu \refReg, t6, 0f // Set this object's card, if it has not already been set. la t2, g_card_table + ld t2, (t2) srli t6, \destReg, 11 add t6, t2, t6 // Check that this card has not already been written. Avoiding useless writes // is a big win on multi-proc systems since it avoids cache thrashing. - lb t2, 0(t6) - li t6, 0xFF - beq t2, t6, 0f + lbu t2, 0(t6) + addi t2, t2, -0xFF + beqz t2, 0f - sb t6, 0(t6) + li t2, 0xFF + sb t2, 0(t6) #ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES // Check if we need to update the card bundle table la t2, g_card_bundle_table + ld t2, (t2) + srli t6, \destReg, 21 add t6, t2, t6 - lb t2, 0(t6) - li t6, 0xFF - beq t2, t6, 0f + lbu t2, 0(t6) + addi t2, t2, -0xFF + beqz t2, 0f - sb t6, 0(t6) + li t2, 0xFF + sb t2, 0(t6) #endif 0: @@ -166,10 +173,12 @@ // If no, early out. 
la t2, g_lowest_address - bgeu \destReg, t2, 0f + ld t2, (t2) + bltu \destReg, t2, 0f la t2, g_highest_address - bltu \destReg, t2, 0f + ld t2, (t2) + bgeu \destReg, t2, 0f 1: INSERT_UNCHECKED_WRITE_BARRIER_CORE \destReg, \refReg @@ -220,8 +229,10 @@ LEAF_END RhpByRefAssignRef, _TEXT LEAF_ENTRY RhpCheckedAssignRef, _TEXT # Check if the destination is within the heap bounds - la t2, C_FUNC(g_lowest_address) - la t6, C_FUNC(g_highest_address) + la t2, C_FUNC(g_lowest_address) + ld t2, (t2) + la t6, C_FUNC(g_highest_address) + ld t6, (t6) bltu t3, t2, LOCAL_LABEL(NotInHeap) bgeu t3, t6, LOCAL_LABEL(NotInHeap) @@ -250,6 +261,7 @@ LEAF_END RhpCheckedAssignRef, _TEXT // t2, t6 : trashed // t3 : incremented by 8 LEAF_ENTRY RhpAssignRefRiscV64, _TEXT + fence rw, rw ALTERNATE_ENTRY RhpAssignRefAVLocation sd t4, 0(t3) @@ -295,14 +307,18 @@ LEAF_ENTRY RhpCheckedLockCmpXchg LOCAL_LABEL(CmpXchgRetry): // Load the current value at the destination address. - lr.d t0, (a0) // t0 = *dest + lr.d.aqrl t0, (a0) // t0 = *dest (load with acquire-release ordering) // Compare the loaded value with the comparand. bne t0, a2, LOCAL_LABEL(CmpXchgNoUpdate) // if (*dest != comparand) goto CmpXchgNoUpdate // Attempt to store the exchange value at the destination address. - sc.d t1, a1, (a0) // t1 = (store conditional result: 0 if successful) + sc.d.rl t1, a1, (a0) // t1 = (store conditional result: 0 if successful, with release ordering) bnez t1, LOCAL_LABEL(CmpXchgRetry) // if store conditional failed, retry + // See comment at the top of PalInterlockedOperationBarrier method for explanation why this memory + // barrier is necessary. + fence rw, rw + LOCAL_LABEL(DoCardsCmpXchg): // We have successfully updated the value of the objectref so now we need a GC write barrier. // The following barrier code takes the destination in a0 and the value in a1 so the arguments are @@ -331,13 +347,14 @@ LEAF_END RhpCheckedLockCmpXchg // // On exit: // a0: original value of objectref -// t1: trashed -// t3, t6, t4: trashed +// t1, t6: trashed // LEAF_ENTRY RhpCheckedXchg + amoswap.d.aqrl t1, a1, (a0) - ld t1, 0(a0) - sd a1, 0(a0) + // See comment at the top of PalInterlockedOperationBarrier method for explanation why this memory + // barrier is necessary. + fence rw, rw DoCardsXchg: // We have successfully updated the value of the objectref so now we need a GC write barrier. 
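
Reviewer note: a minimal C++ sketch of the card-marking logic that the write-barrier macro above implements in assembly, for readability only. The shift amounts (11 for cards, 21 for card bundles) and the 0xFF "already marked" test are taken from the hunk; the function and parameter names are illustrative and do not exist in the tree, and the software write-watch update that precedes this in the macro (its table is indexed by dest >> 12) is omitted here.

#include <cstdint>

static void MarkCardsForStore(uint8_t* cardTable, uint8_t* cardBundleTable,
                              uintptr_t ephemeralLow, uintptr_t ephemeralHigh,
                              uintptr_t dest, uintptr_t ref)
{
    // Only references into the ephemeral range need a card.
    if (ref < ephemeralLow || ref >= ephemeralHigh)
        return;

    uint8_t* card = cardTable + (dest >> 11);
    if (*card == 0xFF)
        return;                    // card already set: skip both the card and bundle writes

    *card = 0xFF;                  // set this object's card

    // FEATURE_MANUALLY_MANAGED_CARD_BUNDLES only
    uint8_t* bundle = cardBundleTable + (dest >> 21);
    if (*bundle != 0xFF)
        *bundle = 0xFF;            // and its card bundle
}

Skipping the store when the card byte is already 0xFF is what the hunk's lbu/addi/beqz sequence does; as the original comment notes, avoiding useless writes sidesteps cache thrashing on multi-processor systems.
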
diff --git a/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp b/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp index a1e2e507df816a..231be716316475 100644 --- a/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp +++ b/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp @@ -698,23 +698,24 @@ int UnixNativeCodeManager::IsInProlog(MethodInfo * pMethodInfo, PTR_VOID pvAddre #elif defined(TARGET_RISCV64) -// store pair with signed offset -// 0100 00xx xxxxxxxx xxxx xxxx xxxx xxxx -#define STW_PAIR_BITS 0x04000000 -#define STW_PAIR_MASK 0xFC000000 +// store doubleword with signed offset +#define SD_BITS 0x00003023 +#define SD_MASK 0x0000707F -// add fp, sp, x // addi fp, sp, x -// 0000 0001 100x xxxx xxxx xxxx 0000 0000 -#define ADD_FP_SP_BITS 0x01C00000 -#define ADD_FP_SP_MASK 0xFFFFE000 +#define ADD_FP_SP_BITS 0x00010413 +#define ADD_FP_SP_MASK 0x000FFFFF + +// addi sp, sp, x +#define ADD_SP_SP_BITS 0x00010113 +#define ADD_SP_SP_MASK 0x000FFFFF -#define STW_PAIR_RS1_MASK 0xF80 -#define STW_PAIR_RS1_SP 0xF80 -#define STW_PAIR_RS1_FP 0xF00 -#define STW_PAIR_RS2_MASK 0xF00 -#define STW_PAIR_RS2_FP 0xF00 -#define STW_PAIR_RS2_RA 0xF40 +#define SD_RS1_MASK 0xF8000 +#define SD_RS1_SP 0x10000 +#define SD_RS1_FP 0x40000 +#define SD_RS2_MASK 0x1F00000 +#define SD_RS2_FP 0x800000 +#define SD_RS2_RA 0x100000 UnixNativeMethodInfo * pNativeMethodInfo = (UnixNativeMethodInfo *)pMethodInfo; ASSERT(pNativeMethodInfo != NULL); @@ -728,19 +729,19 @@ int UnixNativeCodeManager::IsInProlog(MethodInfo * pMethodInfo, PTR_VOID pvAddre { uint32_t instr = *pInstr; - if (((instr & STW_PAIR_MASK) == STW_PAIR_BITS) && - ((instr & STW_PAIR_RS1_MASK) == STW_PAIR_RS1_SP || (instr & STW_PAIR_RS1_MASK) == STW_PAIR_RS1_FP) && - ((instr & STW_PAIR_RS2_MASK) == STW_PAIR_RS2_FP || (instr & STW_PAIR_RS2_MASK) == STW_PAIR_RS2_RA)) + if (((instr & SD_MASK) == SD_BITS) && + ((instr & SD_RS1_MASK) == SD_RS1_SP || (instr & SD_RS1_MASK) == SD_RS1_FP) && + ((instr & SD_RS2_MASK) == SD_RS2_FP || (instr & SD_RS2_MASK) == SD_RS2_RA)) { // SP/FP-relative store of pair of registers - savedFp |= (instr & STW_PAIR_RS2_MASK) == STW_PAIR_RS2_FP; - savedRa |= (instr & STW_PAIR_RS2_MASK) == STW_PAIR_RS2_RA; + savedFp |= (instr & SD_RS2_MASK) == SD_RS2_FP; + savedRa |= (instr & SD_RS2_MASK) == SD_RS2_RA; } else if ((instr & ADD_FP_SP_MASK) == ADD_FP_SP_BITS) { establishedFp = true; } - else + else if ((instr & ADD_SP_SP_MASK) != ADD_SP_SP_BITS) { // JIT generates other patterns into the prolog that we currently don't // recognize (saving unpaired register, stack pointer adjustments). We @@ -1185,21 +1186,13 @@ int UnixNativeCodeManager::TrailingEpilogueInstructionsCount(MethodInfo * pMetho #elif defined(TARGET_RISCV64) -// Load with immediate -// LUI, LD, etc. -// 0000 0000 0000 0000 1111 1111 1111 1111 -#define LUI_BITS 0x00000037 -#define LUI_MASK 0x0000007F - // Load with register offset // LD with register offset -// 0000 0000 0000 0000 0111 0000 0000 0000 #define LD_BITS 0x00000003 #define LD_MASK 0x0000007F -// Branches, Jumps, System calls -// BEQ, BNE, JAL, etc. -// 1100 0000 0000 0000 0000 0000 0000 0000 +// Branches +// BEQ, BNE, etc. 
#define BEGS_BITS 0x00000063 #define BEGS_MASK 0x0000007F @@ -1229,14 +1222,20 @@ int UnixNativeCodeManager::TrailingEpilogueInstructionsCount(MethodInfo * pMetho } // Check for restoring registers (FP or RA) with `ld` - int rd = (instr >> 7) & 0x1F; // Extract the destination register - if (rd == 8 || rd == 1) // Check for FP (x8) or RA (x1) + if ((instr & LD_MASK) == LD_BITS) // Match `ld` instruction { - if ((instr & LD_MASK) == LD_BITS) // Match `ld` instruction + int rd = (instr >> 7) & 0x1F; // Extract the destination register + if (rd == 8 || rd == 1) // Check for FP (x8) or RA (x1) { return -1; } } + + // Check for adjusting stack pointer + if ((instr & ADD_SP_SP_MASK) == ADD_SP_SP_BITS) + { + return -1; + } } #endif diff --git a/src/coreclr/nativeaot/Runtime/unix/UnwindHelpers.cpp b/src/coreclr/nativeaot/Runtime/unix/UnwindHelpers.cpp index e387f3440e329f..f71a4b1338a6ea 100644 --- a/src/coreclr/nativeaot/Runtime/unix/UnwindHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/unix/UnwindHelpers.cpp @@ -1140,8 +1140,13 @@ bool Registers_REGDISPLAY::validVectorRegister(int num) const inline uint64_t Registers_REGDISPLAY::getRegister(int regNum) const { switch (regNum) { + case UNW_REG_IP: + return IP; case UNW_RISCV_X1: return *pRA; + case UNW_REG_SP: + case UNW_RISCV_X2: + return SP; case UNW_RISCV_X3: return *pGP; case UNW_RISCV_X4: @@ -1255,60 +1260,79 @@ inline uint64_t Registers_REGDISPLAY::getRegister(int regNum) const { void Registers_REGDISPLAY::setRegister(int regNum, uint64_t value, uint64_t location) { switch (regNum) { + case UNW_REG_IP: + IP = (uintptr_t)value; + break; case UNW_RISCV_X1: - *pRA = value; + pRA = (PTR_uintptr_t)location; + break; + case UNW_REG_SP: + case UNW_RISCV_X2: + SP = (uintptr_t)value; break; case UNW_RISCV_X3: - *pGP = value; + pGP = (PTR_uintptr_t)location; break; case UNW_RISCV_X4: - *pTP = value; + pTP = (PTR_uintptr_t)location; break; case UNW_RISCV_X5: - *pT0 = value; + pT0 = (PTR_uintptr_t)location; break; case UNW_RISCV_X6: - *pT1 = value; + pT1 = (PTR_uintptr_t)location; break; case UNW_RISCV_X7: - *pT2 = value; + pT2 = (PTR_uintptr_t)location; break; case UNW_RISCV_X28: - *pT3 = value; + pT3 = (PTR_uintptr_t)location; break; case UNW_RISCV_X29: - *pT4 = value; + pT4 = (PTR_uintptr_t)location; break; case UNW_RISCV_X30: - *pT5 = value; + pT5 = (PTR_uintptr_t)location; break; case UNW_RISCV_X31: - *pT6 = value; + pT6 = (PTR_uintptr_t)location; break; case UNW_RISCV_X8: - *pFP = value; + pFP = (PTR_uintptr_t)location; break; case UNW_RISCV_X9: - *pS1 = value; + pS1 = (PTR_uintptr_t)location; break; case UNW_RISCV_X18: - *pS2 = value; + pS2 = (PTR_uintptr_t)location; break; case UNW_RISCV_X19: - *pS3 = value; + pS3 = (PTR_uintptr_t)location; break; case UNW_RISCV_X20: - *pS4 = value; + pS4 = (PTR_uintptr_t)location; break; case UNW_RISCV_X21: - *pS5 = value; + pS5 = (PTR_uintptr_t)location; break; case UNW_RISCV_X22: - *pS6 = value; + pS6 = (PTR_uintptr_t)location; break; case UNW_RISCV_X23: - *pS7 = value; + pS7 = (PTR_uintptr_t)location; + break; + case UNW_RISCV_X24: + pS8 = (PTR_uintptr_t)location; + break; + case UNW_RISCV_X25: + pS9 = (PTR_uintptr_t)location; + break; + case UNW_RISCV_X26: + pS10 = (PTR_uintptr_t)location; + break; + case UNW_RISCV_X27: + pS11 = (PTR_uintptr_t)location; break; // Add other general-purpose registers if needed diff --git a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosriscv64.inc b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosriscv64.inc index 787d80a4000dc9..cbfc289518db11 100644 --- 
a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosriscv64.inc +++ b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosriscv64.inc @@ -96,9 +96,8 @@ C_FUNC(\Name): .endm .macro PROLOG_SAVE_REG_PAIR_INDEXED reg1, reg2, ssize, __def_cfa_save=1 - addi sp, sp, \ssize - .cfi_adjust_cfa_offset -\ssize - .cfi_def_cfa sp, \ssize + addi sp, sp, -\ssize + .cfi_adjust_cfa_offset \ssize sd \reg1, 0(sp) sd \reg2, 8(sp) @@ -113,8 +112,7 @@ C_FUNC(\Name): .macro PROLOG_SAVE_REG_PAIR_NO_FP_INDEXED reg1, reg2, ssize addi sp, sp, -\ssize - //.cfi_adjust_cfa_offset \ssize - .cfi_def_cfa sp, \ssize + .cfi_adjust_cfa_offset \ssize sd \reg1, 0(sp) sd \reg2, 8(sp) @@ -163,31 +161,37 @@ C_FUNC(\Name): .error "target cannot be a0" .endif - addi sp, sp, -48 - sd ra, 40(sp) - sd t1, 32(sp) - sd a1, 24(sp) - sd a2, 16(sp) - sd a3, 8(sp) - sd a4, 0(sp) + addi sp, sp, -72 + sd ra, 64(sp) + sd t1, 56(sp) + sd a1, 48(sp) + sd a2, 40(sp) + sd a3, 32(sp) + sd a4, 24(sp) + sd a5, 16(sp) + sd a6, 8(sp) + sd a7, 0(sp) // global dynamic TLS, see https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/eb2b2962/riscv-elf.adoc#global-dynamic la.tls.gd a0, \var call C_FUNC(__tls_get_addr) - ld ra, 40(sp) - ld t1, 32(sp) - ld a1, 24(sp) - ld a2, 16(sp) - ld a3, 8(sp) - ld a4, 0(sp) - addi sp, sp, 48 + ld ra, 64(sp) + ld t1, 56(sp) + ld a1, 48(sp) + ld a2, 40(sp) + ld a3, 32(sp) + ld a4, 24(sp) + ld a5, 16(sp) + ld a6, 8(sp) + ld a7, 0(sp) + addi sp, sp, 72 mv \target, a0 /* - In the future we should switch to TLS descriptors. The support was added in 2024 in glibc, musl, llvm, gcc and binutils, - so its support is currently unavailable on majority devices. See https://maskray.me/blog/2024-01-23-riscv-tlsdesc-works + In the future we should switch to TLS descriptors. Its support was added in 2024 in glibc, musl, llvm, gcc and binutils, + which is currently unavailable on majority devices. See https://maskray.me/blog/2024-01-23-riscv-tlsdesc-works When the support for TLS descriptors is available in NativeAOT baseline, actions to perform: * Apply this patch: @@ -204,6 +208,7 @@ C_FUNC(\Name): add_subdirectory(Bootstrap) ``` + * Remove global dynamic code including prolog and epilog. * Uncomment the following code and remove these comments. 
// TLS descriptor, see https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/eb2b2962/riscv-elf.adoc#tls-descriptors @@ -244,12 +249,12 @@ C_FUNC(\Name): #define PTFF_SAVE_A0 0x00004000 #define PTFF_SAVE_A1 0x00008000 #define PTFF_SAVE_ALL_PRESERVED 0x000007FF // NOTE: S1-S11 -#define PTFF_THREAD_HIJACK_HI 0x00000002 // upper 32 bits of the PTFF_THREAD_HIJACK +#define PTFF_THREAD_HIJACK_HI 0x00000001 // upper 32 bits of the PTFF_THREAD_HIJACK #define DEFAULT_FRAME_SAVE_FLAGS PTFF_SAVE_ALL_PRESERVED + PTFF_SAVE_SP .macro PUSH_COOP_PINVOKE_FRAME trashReg - PROLOG_SAVE_REG_PAIR_INDEXED s0, ra, -128 // Push down stack pointer and store s0 (fp) and RA + PROLOG_SAVE_REG_PAIR_INDEXED s0, ra, 128 // Push down stack pointer and store s0 (fp) and RA // 16 bytes reserved for Thread* and flags @@ -259,10 +264,11 @@ C_FUNC(\Name): PROLOG_SAVE_REG_PAIR s5, s6, 64 PROLOG_SAVE_REG_PAIR s7, s8, 80 PROLOG_SAVE_REG_PAIR s9, s10, 96 + PROLOG_SAVE_REG s11, 112 // Save the value of SP before stack allocation to the last slot in the frame (slot #15) add \trashReg, sp, 128 - sd \trashReg, 112(sp) + sd \trashReg, 120(sp) // Record the bitmask of saved registers in the frame (slot #3) li \trashReg, DEFAULT_FRAME_SAVE_FLAGS @@ -278,6 +284,7 @@ C_FUNC(\Name): EPILOG_RESTORE_REG_PAIR s5, s6, 64 EPILOG_RESTORE_REG_PAIR s7, s8, 80 EPILOG_RESTORE_REG_PAIR s9, s10, 96 + EPILOG_RESTORE_REG s11, 112 EPILOG_RESTORE_REG_PAIR_INDEXED s0, ra, 128 .endm diff --git a/src/coreclr/tools/Common/Compiler/DependencyAnalysis/Target_RiscV64/RiscV64Emitter.cs b/src/coreclr/tools/Common/Compiler/DependencyAnalysis/Target_RiscV64/RiscV64Emitter.cs index caf1dad16a1af5..380163ed2fbf20 100644 --- a/src/coreclr/tools/Common/Compiler/DependencyAnalysis/Target_RiscV64/RiscV64Emitter.cs +++ b/src/coreclr/tools/Common/Compiler/DependencyAnalysis/Target_RiscV64/RiscV64Emitter.cs @@ -25,6 +25,11 @@ public void EmitBreak() Builder.EmitUInt(0x00100073); } + public void EmitFENCE_R_RW() + { + Builder.EmitUInt(0x0230000f); + } + public void EmitLI(Register regDst, int offset) { Debug.Assert((offset >= -2048) && (offset <= 2047)); diff --git a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/Target_RiscV64/RiscV64ReadyToRunGenericHelperNode.cs b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/Target_RiscV64/RiscV64ReadyToRunGenericHelperNode.cs index a285069cfada2d..c6e2364766fa64 100644 --- a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/Target_RiscV64/RiscV64ReadyToRunGenericHelperNode.cs +++ b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/Target_RiscV64/RiscV64ReadyToRunGenericHelperNode.cs @@ -47,8 +47,7 @@ protected void EmitDictionaryLookup(NodeFactory factory, ref RiscV64Emitter enco // should be reported by someone else - the system should not rely on it coming from here. if (!relocsOnly && _hasInvalidEntries) { - encoder.EmitXORI(encoder.TargetRegister.IntraProcedureCallScratch1, result, 0); - encoder.EmitJALR(Register.X0, encoder.TargetRegister.IntraProcedureCallScratch1, 0); + encoder.EmitJMPIfZero(result, GetBadSlotHelper(factory)); } } @@ -76,6 +75,7 @@ protected sealed override void EmitCode(NodeFactory factory, ref RiscV64Emitter // We need to trigger the cctor before returning the base. It is stored at the beginning of the non-GC statics region. 
encoder.EmitADDI(encoder.TargetRegister.Arg3, encoder.TargetRegister.Arg0, -NonGCStaticsNode.GetClassConstructorContextSize(factory.Target)); encoder.EmitLD(encoder.TargetRegister.Arg2, encoder.TargetRegister.Arg3, 0); + encoder.EmitFENCE_R_RW(); encoder.EmitRETIfZero(encoder.TargetRegister.Arg2); encoder.EmitMOV(encoder.TargetRegister.Arg1, encoder.TargetRegister.Result); @@ -107,6 +107,7 @@ protected sealed override void EmitCode(NodeFactory factory, ref RiscV64Emitter encoder.EmitADDI(encoder.TargetRegister.Arg2, encoder.TargetRegister.Arg2, -NonGCStaticsNode.GetClassConstructorContextSize(factory.Target)); encoder.EmitLD(encoder.TargetRegister.Arg3, encoder.TargetRegister.Arg2, 0); + encoder.EmitFENCE_R_RW(); encoder.EmitRETIfZero(encoder.TargetRegister.Arg3); encoder.EmitMOV(encoder.TargetRegister.Arg1, encoder.TargetRegister.Result); diff --git a/src/coreclr/vm/gcinfodecoder.cpp b/src/coreclr/vm/gcinfodecoder.cpp index c24a2bc14870bd..1b11a3e8f7ceee 100644 --- a/src/coreclr/vm/gcinfodecoder.cpp +++ b/src/coreclr/vm/gcinfodecoder.cpp @@ -1976,7 +1976,7 @@ OBJECTREF* GcInfoDecoder::GetRegisterSlot( _ASSERTE((regNum == 1) || (regNum >= 5 && regNum <= 31)); #ifdef FEATURE_NATIVEAOT - PTR_uintptr_t* ppReg = &pRD->pRA; + PTR_uintptr_t* ppReg = &pRD->pR0; return (OBJECTREF*)*(ppReg + regNum); #else diff --git a/src/installer/pkg/sfx/Microsoft.NETCore.App/Microsoft.NETCore.App.Runtime.CoreCLR.sfxproj b/src/installer/pkg/sfx/Microsoft.NETCore.App/Microsoft.NETCore.App.Runtime.CoreCLR.sfxproj index 14522c2c74e79e..b6d2d049dc91e6 100644 --- a/src/installer/pkg/sfx/Microsoft.NETCore.App/Microsoft.NETCore.App.Runtime.CoreCLR.sfxproj +++ b/src/installer/pkg/sfx/Microsoft.NETCore.App/Microsoft.NETCore.App.Runtime.CoreCLR.sfxproj @@ -27,6 +27,7 @@ false false + false true true $(PublishReadyToRun)
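
Reviewer note: the thunk emission in ThunksMapping.cpp splits a 32-bit PC-relative delta between a U-type auipc and an I-type addi/ld. A minimal C++ sketch of that split, assuming illustrative helper names (EncodeAuipc and EncodeIType are not functions in the tree):

#include <cstdint>

// baseWord carries opcode/rd (and rs1/funct3 for the I-type forms), e.g. 0x00000317 for
// "auipc t1, 0", 0x00030313 for "addi t1, t1, 0", 0x0002b283 for "ld t0, 0(t0)".
static uint32_t EncodeAuipc(uint32_t baseWord, int32_t delta)
{
    // The +0x800 rounds the upper 20 bits up when the low 12 bits become negative after
    // sign extension, so auipc plus the following addi/ld reproduce delta exactly.
    return baseWord | (((uint32_t)(delta + 0x800)) & 0xFFFFF000);
}

static uint32_t EncodeIType(uint32_t baseWord, int32_t delta)
{
    // The low 12 bits of delta land in the I-type immediate field, bits [31:20].
    return baseWord | (((uint32_t)delta & 0xFFF) << 20);
}

// Usage mirroring the emitter: auipc t1, hi(delta) ; addi t1, t1, lo(delta)  =>  t1 = pc + delta
//   uint32_t w0 = EncodeAuipc(0x00000317, delta);
//   uint32_t w1 = EncodeIType(0x00030313, delta);
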
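Reviewer note: similarly, the prolog scan in UnixNativeCodeManager.cpp matches RV64 "sd rs2, imm(rs1)" by opcode/funct3 and then inspects the rs1/rs2 fields; the SD_* constants above are exactly those field masks. A small sketch of that decoding, again with hypothetical helper names:

#include <cstdint>

static inline bool     IsSd(uint32_t instr)  { return (instr & 0x0000707F) == 0x00003023; } // opcode 0x23, funct3 011
static inline uint32_t SdRs1(uint32_t instr) { return (instr >> 15) & 0x1F; }               // base register field
static inline uint32_t SdRs2(uint32_t instr) { return (instr >> 20) & 0x1F; }               // stored register field

// Example: "sd ra, 8(sp)" has rs1 = x2 (sp) and rs2 = x1 (ra), so
// (instr & SD_RS1_MASK) == SD_RS1_SP (0x10000) and (instr & SD_RS2_MASK) == SD_RS2_RA (0x100000).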