@@ -32,69 +32,6 @@ R"===(
32
32
#define cryptonight_conceal 14
33
33
#define cryptonight_v8_reversewaltz 17
34
34
35
- /* For Mesa clover support */
36
- #ifdef cl_clang_storage_class_specifiers
37
- # pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
38
- #endif
39
-
40
- #ifdef cl_amd_media_ops
41
- #pragma OPENCL EXTENSION cl_amd_media_ops : enable
42
- #else
43
- /* taken from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops.txt
44
- * Built-in Function
45
- * uintn amd_bitalign (uintn src0, uintn src1, uintn src2)
46
- * Description
47
- * dst.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2.s0 & 31))
48
- * similar operation applied to other components of the vectors.
49
- *
50
- * The implemented function is modified because the last argument (src2) is in our case always a scalar.
51
- * We can ignore the bitwise AND operation.
52
- */
53
- inline uint2 amd_bitalign ( const uint2 src0 , const uint2 src1 , const uint src2 )
54
- {
55
- uint2 result ;
56
- result .s0 = (uint ) (((((ulong )src0 .s0 ) << 32 ) | (ulong )src1 .s0 ) >> (src2 ));
57
- result .s1 = (uint ) (((((ulong )src0 .s1 ) << 32 ) | (ulong )src1 .s1 ) >> (src2 ));
58
- return result ;
59
- }
60
- #endif
61
-
62
- #ifdef cl_amd_media_ops2
63
- #pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
64
- #else
65
- /* taken from: https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops2.txt
66
- * Built-in Function:
67
- * uintn amd_bfe (uintn src0, uintn src1, uintn src2)
68
- * Description
69
- * NOTE: operator >> below represents a logical right shift
70
- * offset = src1.s0 & 31;
71
- * width = src2.s0 & 31;
72
- * if width = 0
73
- * dst.s0 = 0;
74
- * else if (offset + width) < 32
75
- * dst.s0 = (src0.s0 << (32 - offset - width)) >> (32 - width);
76
- * else
77
- * dst.s0 = src0.s0 >> offset;
78
- * similar operation applied to other components of the vectors
79
- */
80
- inline int amd_bfe (const uint src0 , const uint offset , const uint width )
81
- {
82
- /* casts are removed because we can implement everything as uint
83
- * int offset = src1;
84
- * int width = src2;
85
- * the check for the edge case is removed because this function is always called with
86
- * `width==8`
87
- * @code
88
- * if ( width == 0 )
89
- * return 0;
90
- * @endcode
91
- */
92
- if ( (offset + width ) < 32u )
93
- return (src0 << (32u - offset - width )) >> (32u - width );
94
-
95
- return src0 >> offset ;
96
- }
97
- #endif
98
35
99
36
static const __constant ulong keccakf_rndc [24 ] =
100
37
{
@@ -128,6 +65,8 @@ static const __constant uchar sbox[256] =
128
65
0x8C , 0xA1 , 0x89 , 0x0D , 0xBF , 0xE6 , 0x42 , 0x68 , 0x41 , 0x99 , 0x2D , 0x0F , 0xB0 , 0x54 , 0xBB , 0x16
129
66
};
130
67
68
+ //#include "opencl/wolf-aes.cl"
69
+ XMRSTAK_INCLUDE_WOLF_AES
131
70
132
71
void keccakf1600 (ulong * s )
133
72
{
@@ -355,8 +294,6 @@ inline uint getIdx()
355
294
XMRSTAK_INCLUDE_FAST_INT_MATH_V2
356
295
//#include "fast_div_heavy.cl"
357
296
XMRSTAK_INCLUDE_FAST_DIV_HEAVY
358
- //#include "opencl/wolf-aes.cl"
359
- XMRSTAK_INCLUDE_WOLF_AES
360
297
//#include "opencl/wolf-skein.cl"
361
298
XMRSTAK_INCLUDE_WOLF_SKEIN
362
299
//#include "opencl/jh.cl"
@@ -461,8 +398,6 @@ void CNKeccak(ulong *output, ulong *input)
461
398
462
399
static const __constant uchar rcon [8 ] = { 0x8d , 0x01 , 0x02 , 0x04 , 0x08 , 0x10 , 0x20 , 0x40 };
463
400
464
- #define BYTE (x , y ) (amd_bfe((x), (y) << 3U, 8U))
465
-
466
401
#define SubWord (inw ) ((sbox[BYTE(inw, 3)] << 24) | (sbox[BYTE(inw, 2)] << 16) | (sbox[BYTE(inw, 1)] << 8) | sbox[BYTE(inw, 0)])
467
402
468
403
void AESExpandKey256 (uint * keybuf )
@@ -539,6 +474,11 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
539
474
State [8 ] = input [8 ];
540
475
State [9 ] = input [9 ];
541
476
State [10 ] = input [10 ];
477
+ State [11 ] = input [11 ];
478
+ State [12 ] = input [12 ];
479
+ State [13 ] = input [13 ];
480
+ State [14 ] = input [14 ];
481
+ State [15 ] = input [15 ];
542
482
543
483
((__local uint * )State )[9 ] &= 0x00FFFFFFU ;
544
484
((__local uint * )State )[9 ] |= (((uint )get_global_id (0 )) & 0xFF ) << 24 ;
@@ -550,13 +490,13 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
550
490
*/
551
491
((__local uint * )State )[10 ] |= (((uint )get_global_id (0 ) >> 8 ));
552
492
553
- for (int i = 11 ; i < 25 ; ++ i ) {
554
- State [i ] = 0x00UL ;
555
- }
556
-
557
493
// Last bit of padding
558
494
State [16 ] = 0x8000000000000000UL ;
559
495
496
+ for (int i = 17 ; i < 25 ; ++ i ) {
497
+ State [i ] = 0x00UL ;
498
+ }
499
+
560
500
keccakf1600_2 (State );
561
501
562
502
#pragma unroll
@@ -1361,7 +1301,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
1361
1301
states += 25 * BranchBuf [idx ];
1362
1302
1363
1303
ulong State [8 ] = { 0UL , 0UL , 0UL , 0UL , 0UL , 0UL , 0UL , 0x0001000000000000UL };
1364
- #if defined(__clang__ ) && !defined(__NV_CL_C_VERSION )
1304
+ #if defined(__clang__ ) && !defined(__NV_CL_C_VERSION ) && ( IS_WINDOWS_OS != 1 )
1365
1305
// on ROCm we need volatile for AMD RX5xx cards to avoid invalid shares
1366
1306
volatile
1367
1307
#endif
0 commit comments