Skip to content

Commit

Permalink
Checkpoint Jiterpreter SIMD support
Browse files Browse the repository at this point in the history
Fix bugs in the jiterpreter-support import management implementation
Enable interpreter SIMD for WASM

Minor cleanups

Fix merge

Checkpoint PackedSimd

Implement a few PackedSimd methods in the interp

Add packedsimd ops to jiterpreter

Move the interp simd opcode -> wasm opcode mapping into the C tables

Add intrinsic ids for most of the remaining packedsimd methods

Map most of the PackedSimd methods to intrinsics

Update genmintops

Fix merge damage

Fix build

Add more wasm opcodes

Add more wasm opcodes

Add bitmask intrinsics

Add missing opcodes to transform-simd

Use HOST_BROWSER instead of HOST_WASM to fix wasi build

Implement the pack-n-elements vector instructions

Disable bitselect because it's broken somehow
Simplify vector packing
Add browser-bench measurements for packing vectors

Disable more opcodes

Disable more opcodes

Fix PackedSimd feature detection on non-wasm targets

Maybe fix linux interp assertion

Don't fail transform for unsupported PackedSimd methods on non-browser targets

Fix i64 popcnt

Add basic R4 v128 intrinsics (add/sub/div/mul)
Re-enable more jiterpreter simd
  • Loading branch information
kg committed May 6, 2023
1 parent 754d9ec commit b3f7277
Show file tree
Hide file tree
Showing 23 changed files with 1,531 additions and 195 deletions.
1 change: 1 addition & 0 deletions src/mono/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ elseif(CLR_CMAKE_HOST_OS STREQUAL "emscripten")
add_compile_options(-Wno-strict-prototypes)
add_compile_options(-Wno-unused-but-set-variable)
add_compile_options(-Wno-single-bit-bitfield-constant-conversion)
add_compile_options(-msimd128)
set(DISABLE_EXECUTABLES 1)
# FIXME: Is there a cmake option for this ?
set(DISABLE_SHARED_LIBS 1)
Expand Down
8 changes: 7 additions & 1 deletion src/mono/mono/mini/interp/interp-internals.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ typedef enum {

#define PROFILE_INTERP 0

#if !HOST_BROWSER && __GNUC__
#if __GNUC__
#define INTERP_ENABLE_SIMD
#endif

Expand Down Expand Up @@ -342,6 +342,12 @@ mono_jiterp_stackval_from_data (MonoType *type, stackval *result, const void *da
gpointer
mono_jiterp_frame_data_allocator_alloc (FrameDataAllocator *stack, InterpFrame *frame, int size);

// Returns entry `index` of the interpreter SIMD intrinsic function table selected by
// `arity` (1, 2 or 3). Any other arity hits g_assert_not_reached.
gpointer
mono_jiterp_get_simd_intrinsic (int arity, int index);

// Returns entry `index` of the wasm opcode table selected by `arity` (1, 2 or 3).
// Any other arity hits g_assert_not_reached.
int
mono_jiterp_get_simd_opcode (int arity, int index);

#endif

static inline int
Expand Down
266 changes: 185 additions & 81 deletions src/mono/mono/mini/interp/interp-simd-intrins.def

Large diffs are not rendered by default.

137 changes: 128 additions & 9 deletions src/mono/mono/mini/interp/interp-simd.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
#include "interp-internals.h"
#include "interp-simd.h"

#if HOST_BROWSER
#include <wasm_simd128.h>
#endif

#ifdef INTERP_ENABLE_SIMD

typedef gint64 v128_i8 __attribute__ ((vector_size (SIZEOF_V128)));
Expand All @@ -12,6 +16,7 @@ typedef gint16 v128_i2 __attribute__ ((vector_size (SIZEOF_V128)));
typedef guint16 v128_u2 __attribute__ ((vector_size (SIZEOF_V128)));
typedef gint8 v128_i1 __attribute__ ((vector_size (SIZEOF_V128)));
typedef guint8 v128_u1 __attribute__ ((vector_size (SIZEOF_V128)));
typedef float v128_r4 __attribute__ ((vector_size (SIZEOF_V128)));

// get_AllBitsSet
static void
Expand Down Expand Up @@ -39,6 +44,12 @@ interp_v128_i4_op_addition (gpointer res, gpointer v1, gpointer v2)
*(v128_i4*)res = *(v128_i4*)v1 + *(v128_i4*)v2;
}

static void
interp_v128_r4_op_addition (gpointer res, gpointer v1, gpointer v2)
{
	// Lane-wise float addition: read both operands, then store the sum through res.
	v128_r4 lhs = *(v128_r4*)v1;
	v128_r4 rhs = *(v128_r4*)v2;

	*(v128_r4*)res = lhs + rhs;
}

// op_Subtraction
static void
interp_v128_i1_op_subtraction (gpointer res, gpointer v1, gpointer v2)
Expand All @@ -58,6 +69,12 @@ interp_v128_i4_op_subtraction (gpointer res, gpointer v1, gpointer v2)
*(v128_i4*)res = *(v128_i4*)v1 - *(v128_i4*)v2;
}

static void
interp_v128_r4_op_subtraction (gpointer res, gpointer v1, gpointer v2)
{
	// Lane-wise float subtraction (v1 - v2), stored through res.
	v128_r4 minuend = *(v128_r4*)v1;
	v128_r4 subtrahend = *(v128_r4*)v2;

	*(v128_r4*)res = minuend - subtrahend;
}

// op_BitwiseAnd
static void
interp_v128_op_bitwise_and (gpointer res, gpointer v1, gpointer v2)
Expand Down Expand Up @@ -124,6 +141,18 @@ interp_v128_i4_op_multiply (gpointer res, gpointer v1, gpointer v2)
*(v128_i4*)res = *(v128_i4*)v1 * *(v128_i4*)v2;
}

static void
interp_v128_r4_op_multiply (gpointer res, gpointer v1, gpointer v2)
{
	// Lane-wise float multiplication, stored through res.
	v128_r4 lhs = *(v128_r4*)v1;
	v128_r4 rhs = *(v128_r4*)v2;

	*(v128_r4*)res = lhs * rhs;
}

static void
interp_v128_r4_op_division (gpointer res, gpointer v1, gpointer v2)
{
	// Lane-wise float division (v1 / v2), stored through res.
	v128_r4 dividend = *(v128_r4*)v1;
	v128_r4 divisor = *(v128_r4*)v2;

	*(v128_r4*)res = dividend / divisor;
}

// op_UnaryNegation
static void
interp_v128_i1_op_negation (gpointer res, gpointer v1)
Expand Down Expand Up @@ -535,32 +564,122 @@ interp_v128_i8_shuffle (gpointer res, gpointer v1, gpointer v2)
V128_SHUFFLE (gint64, guint64);
}

#define INTERP_SIMD_INTRINSIC_P_P(a,b)
#define INTERP_SIMD_INTRINSIC_P_PP(a,b)
#define INTERP_SIMD_INTRINSIC_P_PPP(a,b)
#define INTERP_SIMD_INTRINSIC_P_P(a,b,c)
#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c)
#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c)

// For the wasm packed simd intrinsics we want to automatically generate the C implementations from
// their corresponding clang intrinsics. See also:
// https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/wasm_simd128.h
// In this context V means Vector128 and P means void* pointer.
#ifdef HOST_BROWSER

// Stand-in with a (v128_t, v128_t) -> v128_t signature that unconditionally hits
// g_assert_not_reached; both arguments are ignored and nothing is returned.
// NOTE(review): presumably named as the c_intrinsic for interp-simd-intrins.def entries
// that have no usable C implementation -- confirm against the .def file.
static v128_t
_interp_wasm_simd_assert_not_reached (v128_t lhs, v128_t rhs) {
g_assert_not_reached ();
}

// V_P (vector result, pointer argument): defines _mono_interp_simd_<id> (res, v1), which
// passes the raw pointer v1 to c_intrinsic and stores the v128_t result through res.
// The wasm_opcode argument is not used by this expansion.
#define INTERP_WASM_SIMD_INTRINSIC_V_P(id, c_intrinsic, wasm_opcode) \
static void \
_mono_interp_simd_ ## id (gpointer res, gpointer v1) { \
*((v128_t *)res) = c_intrinsic (v1); \
}

// V_V (vector result, vector argument): defines _mono_interp_simd_<id> (res, v1), which
// loads a v128_t from v1, applies c_intrinsic and stores the v128_t result through res.
// The wasm_opcode argument is not used by this expansion.
#define INTERP_WASM_SIMD_INTRINSIC_V_V(id, c_intrinsic, wasm_opcode) \
static void \
_mono_interp_simd_ ## id (gpointer res, gpointer v1) { \
*((v128_t *)res) = c_intrinsic (*((v128_t *)v1)); \
}

// I_V (integer result, vector argument): defines _mono_interp_simd_<id> (res, v1), which
// loads a v128_t from v1, applies c_intrinsic and stores the result as an int32_t through res.
// The wasm_opcode argument is not used by this expansion.
#define INTERP_WASM_SIMD_INTRINSIC_I_V(id, c_intrinsic, wasm_opcode) \
static void \
_mono_interp_simd_ ## id (gpointer res, gpointer v1) { \
*((int32_t *)res) = c_intrinsic (*((v128_t *)v1)); \
}

// V_VV (vector result, two vector arguments): defines _mono_interp_simd_<id> (res, v1, v2),
// which loads a v128_t from each of v1 and v2, applies c_intrinsic and stores the v128_t
// result through res. The wasm_opcode argument is not used by this expansion.
#define INTERP_WASM_SIMD_INTRINSIC_V_VV(id, c_intrinsic, wasm_opcode) \
static void \
_mono_interp_simd_ ## id (gpointer res, gpointer v1, gpointer v2) { \
*((v128_t *)res) = c_intrinsic (*((v128_t *)v1), *((v128_t *)v2)); \
}

// V_VI (vector result, vector + integer arguments): defines _mono_interp_simd_<id> (res, v1, v2),
// which loads a v128_t from v1 and an int from v2, applies c_intrinsic and stores the v128_t
// result through res. The wasm_opcode argument is not used by this expansion.
#define INTERP_WASM_SIMD_INTRINSIC_V_VI(id, c_intrinsic, wasm_opcode) \
static void \
_mono_interp_simd_ ## id (gpointer res, gpointer v1, gpointer v2) { \
*((v128_t *)res) = c_intrinsic (*((v128_t *)v1), *((int *)v2)); \
}

// V_VVV (vector result, three vector arguments): defines _mono_interp_simd_<id> (res, v1, v2, v3),
// which loads a v128_t from each of v1, v2 and v3, applies c_intrinsic and stores the v128_t
// result through res. The wasm_opcode argument is not used by this expansion.
#define INTERP_WASM_SIMD_INTRINSIC_V_VVV(id, c_intrinsic, wasm_opcode) \
static void \
_mono_interp_simd_ ## id (gpointer res, gpointer v1, gpointer v2, gpointer v3) { \
*((v128_t *)res) = c_intrinsic (*((v128_t *)v1), *((v128_t *)v2), *((v128_t *)v3)); \
}

#include "interp-simd-intrins.def"

#undef INTERP_WASM_SIMD_INTRINSIC_V_P
#undef INTERP_WASM_SIMD_INTRINSIC_V_V
#undef INTERP_WASM_SIMD_INTRINSIC_I_V
#undef INTERP_WASM_SIMD_INTRINSIC_V_VV
#undef INTERP_WASM_SIMD_INTRINSIC_V_VI
#undef INTERP_WASM_SIMD_INTRINSIC_V_VVV

// Now generate the wasm opcode tables for the intrinsics
// Each table below is built the same way: one INTERP_SIMD_INTRINSIC_* macro is redefined to
// emit its third argument followed by a comma, interp-simd-intrins.def is included inside the
// array initializer, and the macro is then reset to expand to nothing before the next table.

#undef INTERP_SIMD_INTRINSIC_P_P
#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) c,

// Third argument of every INTERP_SIMD_INTRINSIC_P_P entry, in .def order.
int interp_simd_p_p_wasm_opcode_table [] = {
#include "interp-simd-intrins.def"
};

#undef INTERP_SIMD_INTRINSIC_P_P
#define INTERP_SIMD_INTRINSIC_P_P(a,b,c)

#undef INTERP_SIMD_INTRINSIC_P_PP
#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) c,

// Third argument of every INTERP_SIMD_INTRINSIC_P_PP entry, in .def order.
int interp_simd_p_pp_wasm_opcode_table [] = {
#include "interp-simd-intrins.def"
};

#undef INTERP_SIMD_INTRINSIC_P_PP
#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c)

#undef INTERP_SIMD_INTRINSIC_P_PPP
#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) c,

// Third argument of every INTERP_SIMD_INTRINSIC_P_PPP entry, in .def order.
int interp_simd_p_ppp_wasm_opcode_table [] = {
#include "interp-simd-intrins.def"
};

#undef INTERP_SIMD_INTRINSIC_P_PPP
#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c)

#endif // HOST_BROWSER

#undef INTERP_SIMD_INTRINSIC_P_P
#define INTERP_SIMD_INTRINSIC_P_P(a,b) b,
#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) b,
PP_SIMD_Method interp_simd_p_p_table [] = {
#include "interp-simd-intrins.def"
};
#undef INTERP_SIMD_INTRINSIC_P_P
#define INTERP_SIMD_INTRINSIC_P_P(a,b)
#define INTERP_SIMD_INTRINSIC_P_P(a,b,c)

#undef INTERP_SIMD_INTRINSIC_P_PP
#define INTERP_SIMD_INTRINSIC_P_PP(a,b) b,
#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) b,
PPP_SIMD_Method interp_simd_p_pp_table [] = {
#include "interp-simd-intrins.def"
};
#undef INTERP_SIMD_INTRINSIC_P_PP
#define INTERP_SIMD_INTRINSIC_P_PP(a,b)
#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c)

#undef INTERP_SIMD_INTRINSIC_P_PPP
#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) b,
#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) b,
PPPP_SIMD_Method interp_simd_p_ppp_table [] = {
#include "interp-simd-intrins.def"
};
#undef INTERP_SIMD_INTRINSIC_P_PPP
#define INTERP_SIMD_INTRINSIC_P_PPP(a,b)
#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c)

#endif // INTERP_ENABLE_SIMD
6 changes: 6 additions & 0 deletions src/mono/mono/mini/interp/interp-simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ extern PP_SIMD_Method interp_simd_p_p_table [];
extern PPP_SIMD_Method interp_simd_p_pp_table [];
extern PPPP_SIMD_Method interp_simd_p_ppp_table [];

#if HOST_BROWSER
extern int interp_simd_p_p_wasm_opcode_table [];
extern int interp_simd_p_pp_wasm_opcode_table [];
extern int interp_simd_p_ppp_wasm_opcode_table [];
#endif

#endif /* __MONO_MINI_INTERP_SIMD_H__ */


38 changes: 38 additions & 0 deletions src/mono/mono/mini/interp/interp.c
Original file line number Diff line number Diff line change
Expand Up @@ -8907,4 +8907,42 @@ mono_jiterp_enum_hasflag (MonoClass *klass, gint32 *dest, stackval *sp1, stackva
*dest = mono_interp_enum_hasflag (sp1, sp2, klass);
}

EMSCRIPTEN_KEEPALIVE gpointer
mono_jiterp_get_simd_intrinsic (int arity, int index)
{
#ifdef INTERP_ENABLE_SIMD
	// Select the function table by arity (1 -> p_p, 2 -> p_pp, 3 -> p_ppp) and
	// return its entry at 'index'. The index is not range-checked here.
	if (arity == 1)
		return interp_simd_p_p_table [index];
	if (arity == 2)
		return interp_simd_p_pp_table [index];
	if (arity == 3)
		return interp_simd_p_ppp_table [index];

	// Any other arity is a caller bug.
	g_assert_not_reached();
#else
	// Without interpreter SIMD support there are no tables to look up.
	g_assert_not_reached();
#endif
}

EMSCRIPTEN_KEEPALIVE int
mono_jiterp_get_simd_opcode (int arity, int index)
{
#ifdef INTERP_ENABLE_SIMD
	// Select the wasm opcode table by arity (1 -> p_p, 2 -> p_pp, 3 -> p_ppp) and
	// return its entry at 'index'. The index is not range-checked here.
	if (arity == 1)
		return interp_simd_p_p_wasm_opcode_table [index];
	if (arity == 2)
		return interp_simd_p_pp_wasm_opcode_table [index];
	if (arity == 3)
		return interp_simd_p_ppp_wasm_opcode_table [index];

	// Any other arity is a caller bug.
	g_assert_not_reached();
#else
	// Without interpreter SIMD support there are no tables to look up.
	g_assert_not_reached();
#endif
}

#endif
18 changes: 9 additions & 9 deletions src/mono/mono/mini/interp/mintops.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,35 +41,35 @@ typedef enum {

/* SIMD opcodes, grouped by signature */

#define INTERP_SIMD_INTRINSIC_P_P(a,b)
#define INTERP_SIMD_INTRINSIC_P_PP(a,b)
#define INTERP_SIMD_INTRINSIC_P_PPP(a,b)
#define INTERP_SIMD_INTRINSIC_P_P(a,b,c)
#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c)
#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c)

#undef INTERP_SIMD_INTRINSIC_P_P
#define INTERP_SIMD_INTRINSIC_P_P(a,b) a,
#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) a,
typedef enum {
#include "interp-simd-intrins.def"
} MintSIMDOpsPP;
#undef INTERP_SIMD_INTRINSIC_P_P
#define INTERP_SIMD_INTRINSIC_P_P(a,b)
#define INTERP_SIMD_INTRINSIC_P_P(a,b,c)

#undef INTERP_SIMD_INTRINSIC_P_PP
#define INTERP_SIMD_INTRINSIC_P_PP(a,b) a,
#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) a,
typedef enum {
#include "interp-simd-intrins.def"
INTERP_SIMD_INTRINSIC_P_PP_LAST
} MintSIMDOpsPPP;
#undef INTERP_SIMD_INTRINSIC_P_PP
#define INTERP_SIMD_INTRINSIC_P_PP(a,b)
#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c)

#undef INTERP_SIMD_INTRINSIC_P_PPP
#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) a,
#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) a,
typedef enum {
#include "interp-simd-intrins.def"
INTERP_SIMD_INTRINSIC_P_PPP_LAST
} MintSIMDOpsPPPP;
#undef INTERP_SIMD_INTRINSIC_P_PPP
#define INTERP_SIMD_INTRINSIC_P_PPP(a,b)
#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c)

#if NO_UNALIGNED_ACCESS
# if G_BYTE_ORDER == G_LITTLE_ENDIAN
Expand Down
20 changes: 20 additions & 0 deletions src/mono/mono/mini/interp/simd-methods.def
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
SIMD_METHOD(get_Count)
SIMD_METHOD(get_AllBitsSet)
SIMD_METHOD(get_IsHardwareAccelerated)
SIMD_METHOD(get_IsSupported)
SIMD_METHOD(get_Item)
SIMD_METHOD(get_One)
SIMD_METHOD(get_Zero)
SIMD_METHOD(op_Addition)
SIMD_METHOD(op_BitwiseAnd)
SIMD_METHOD(op_BitwiseOr)
SIMD_METHOD(op_Division)
SIMD_METHOD(op_Equality)
SIMD_METHOD(op_ExclusiveOr)
SIMD_METHOD(op_Explicit)
Expand All @@ -24,6 +26,7 @@ SIMD_METHOD(ConditionalSelect)
SIMD_METHOD(Create)
SIMD_METHOD(CreateScalar)
SIMD_METHOD(CreateScalarUnsafe)

SIMD_METHOD(Equals)
SIMD_METHOD(ExtractMostSignificantBits)
SIMD_METHOD(GreaterThan)
Expand All @@ -36,3 +39,20 @@ SIMD_METHOD(ShiftRightLogical)
SIMD_METHOD(Shuffle)
SIMD_METHOD(WidenLower)
SIMD_METHOD(WidenUpper)

// PackedSimd
SIMD_METHOD(Splat)
SIMD_METHOD(ExtractLane)
SIMD_METHOD(ReplaceLane)
SIMD_METHOD(Swizzle)
SIMD_METHOD(Add)
SIMD_METHOD(Subtract)
SIMD_METHOD(Multiply)
SIMD_METHOD(Dot)
SIMD_METHOD(Negate)
SIMD_METHOD(And)
SIMD_METHOD(Bitmask)
SIMD_METHOD(CompareEqual)
SIMD_METHOD(CompareNotEqual)
SIMD_METHOD(ConvertNarrowingSignedSaturate)
SIMD_METHOD(ConvertNarrowingUnsignedSaturate)
Loading

0 comments on commit b3f7277

Please sign in to comment.