Skip to content

Commit ef276cf

Browse files
committed
Upgrade to 2.10.1-hide-3.1.1
1 parent 2990196 commit ef276cf

29 files changed

+219
-193
lines changed

README.md

+2
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this
6060
- [Stellite](https://stellite.cash/)
6161
- [TurtleCoin](https://turtlecoin.lol)
6262
- [Zelerius](https://zelerius.org/)
63+
- [X-CASH](https://x-network.io/)
6364

6465
Ryo currency is a way for us to implement the ideas that we were unable to in
6566
Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details.
@@ -78,6 +79,7 @@ If your prefered coin is not listed, you can choose one of the following algorit
7879
- cryptonight_v7
7980
- cryptonight_v7_stellite
8081
- cryptonight_v8
82+
- cryptonight_v8_double (used by X-CASH)
8183
- cryptonight_v8_half (used by masari and stellite)
8284
- cryptonight_v8_reversewaltz (used by graft)
8385
- cryptonight_v8_zelerius

doc/usage.md

-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ The miner will automatically detect if CUDA (for NVIDIA GPUs) or OpenCL (for AMD
5252
```
5353
xmr-stak --noCPU
5454
```
55-
**CUDA** is currently not supported. I am currently try to get some performance out it.
5655

5756
### NVIDIA via OpenCL
5857

scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ fi
88
if [ -d xmr-stak ]; then
99
git -C xmr-stak clean -fd
1010
else
11-
git clone https://github.com/rapid821/xmr-stak-hide.git
11+
git clone https://github.com/fireice-uk/xmr-stak.git
1212
fi
1313

1414
wget -c https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda_9.0.176_384.81_linux-run

xmrstak/backend/amd/OclCryptonightR_gen.cpp

+7-4
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ static cl_program CryptonightR_build_program(
134134
const GpuContext* ctx,
135135
xmrstak_algo algo,
136136
uint64_t height,
137+
uint32_t precompile_count,
137138
cl_kernel old_kernel,
138139
std::string source_code,
139140
std::string options)
@@ -151,7 +152,7 @@ static cl_program CryptonightR_build_program(
151152
for(size_t i = 0; i < CryptonightR_cache.size();)
152153
{
153154
const CacheEntry& entry = CryptonightR_cache[i];
154-
if ((entry.algo == algo) && (entry.height + 2 < height))
155+
if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height))
155156
{
156157
printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height);
157158
old_programs.push_back(entry.program);
@@ -252,10 +253,12 @@ static cl_program CryptonightR_build_program(
252253
return program;
253254
}
254255

255-
cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, bool background, cl_kernel old_kernel)
256+
cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, uint32_t precompile_count, bool background, cl_kernel old_kernel)
256257
{
258+
printer::inst()->print_msg(LDEBUG, "CryptonightR: start %llu released",height);
259+
257260
if (background) {
258-
background_exec([=](){ CryptonightR_get_program(ctx, algo, height, false, old_kernel); });
261+
background_exec([=](){ CryptonightR_get_program(ctx, algo, height, precompile_count, false, old_kernel); });
259262
return nullptr;
260263
}
261264

@@ -347,7 +350,7 @@ cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t
347350

348351
}
349352

350-
return CryptonightR_build_program(ctx, algo, height, old_kernel, source, options);
353+
return CryptonightR_build_program(ctx, algo, height, precompile_count, old_kernel, source, options);
351354
}
352355

353356
} // namespace amd

xmrstak/backend/amd/OclCryptonightR_gen.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ namespace amd
2020
{
2121

2222
cl_program CryptonightR_get_program(GpuContext* ctx, const xmrstak_algo algo,
23-
uint64_t height, bool background = false, cl_kernel old_kernel = nullptr);
23+
uint64_t height, uint32_t precompile_count, bool background = false, cl_kernel old_kernel = nullptr);
2424

2525
} // namespace amd
2626
} // namespace xmrstak

xmrstak/backend/amd/amd_gpu/gpu.cpp

+15-9
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
199199
return ERR_OCL_API;
200200
}
201201

202-
ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 88, NULL, &ret);
202+
ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 128, NULL, &ret);
203203
if(ret != CL_SUCCESS)
204204
{
205205
printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret));
@@ -334,6 +334,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
334334
*/
335335
options += " -DOPENCL_DRIVER_MAJOR=" + std::to_string(std::stoi(openCLDriverVer.data()) / 100);
336336

337+
uint32_t isWindowsOs = 0;
338+
#ifdef _WIN32
339+
isWindowsOs = 1;
340+
#endif
341+
options += " -DIS_WINDOWS_OS=" + std::to_string(isWindowsOs);
342+
337343
if(miner_algo == cryptonight_gpu)
338344
options += " -cl-fp32-correctly-rounded-divide-sqrt";
339345

@@ -889,15 +895,15 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
889895

890896
cl_int ret;
891897

892-
if(input_len > 84)
898+
if(input_len > 124)
893899
return ERR_STUPID_PARAMS;
894900

895901
input[input_len] = 0x01;
896-
memset(input + input_len + 1, 0, 88 - input_len - 1);
902+
memset(input + input_len + 1, 0, 128 - input_len - 1);
897903

898904
cl_uint numThreads = ctx->rawIntensity;
899905

900-
if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 88, input, 0, NULL, NULL)) != CL_SUCCESS)
906+
if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 128, input, 0, NULL, NULL)) != CL_SUCCESS)
901907
{
902908
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret));
903909
return ERR_OCL_API;
@@ -952,8 +958,10 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
952958

953959
if ((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow)) {
954960

961+
uint32_t PRECOMPILATION_DEPTH = 4;
962+
955963
// Get new kernel
956-
cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height);
964+
cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height, PRECOMPILATION_DEPTH);
957965

958966
if (program != ctx->ProgramCryptonightR) {
959967
cl_int ret;
@@ -969,12 +977,10 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
969977
}
970978
ctx->ProgramCryptonightR = program;
971979

972-
uint32_t PRECOMPILATION_DEPTH = 4;
973-
974980
// Precompile next program in background
975-
xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + 1, true, old_kernel);
981+
xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + 1, PRECOMPILATION_DEPTH, true, old_kernel);
976982
for (int i = 2; i <= PRECOMPILATION_DEPTH; ++i)
977-
xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, true, nullptr);
983+
xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, PRECOMPILATION_DEPTH, true, nullptr);
978984

979985
printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx);
980986
}

xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl

+12-72
Original file line numberDiff line numberDiff line change
@@ -32,69 +32,6 @@ R"===(
3232
#define cryptonight_conceal 14
3333
#define cryptonight_v8_reversewaltz 17
3434

35-
/* For Mesa clover support */
36-
#ifdef cl_clang_storage_class_specifiers
37-
# pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
38-
#endif
39-
40-
#ifdef cl_amd_media_ops
41-
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
42-
#else
43-
/* taken from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops.txt
44-
* Build-in Function
45-
* uintn amd_bitalign (uintn src0, uintn src1, uintn src2)
46-
* Description
47-
* dst.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2.s0 & 31))
48-
* similar operation applied to other components of the vectors.
49-
*
50-
* The implemented function is modified because the last is in our case always a scalar.
51-
* We can ignore the bitwise AND operation.
52-
*/
53-
inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
54-
{
55-
uint2 result;
56-
result.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2));
57-
result.s1 = (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2));
58-
return result;
59-
}
60-
#endif
61-
62-
#ifdef cl_amd_media_ops2
63-
#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
64-
#else
65-
/* taken from: https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops2.txt
66-
* Built-in Function:
67-
* uintn amd_bfe (uintn src0, uintn src1, uintn src2)
68-
* Description
69-
* NOTE: operator >> below represent logical right shift
70-
* offset = src1.s0 & 31;
71-
* width = src2.s0 & 31;
72-
* if width = 0
73-
* dst.s0 = 0;
74-
* else if (offset + width) < 32
75-
* dst.s0 = (src0.s0 << (32 - offset - width)) >> (32 - width);
76-
* else
77-
* dst.s0 = src0.s0 >> offset;
78-
* similar operation applied to other components of the vectors
79-
*/
80-
inline int amd_bfe(const uint src0, const uint offset, const uint width)
81-
{
82-
/* casts are removed because we can implement everything as uint
83-
* int offset = src1;
84-
* int width = src2;
85-
* remove check for edge case, this function is always called with
86-
* `width==8`
87-
* @code
88-
* if ( width == 0 )
89-
* return 0;
90-
* @endcode
91-
*/
92-
if ( (offset + width) < 32u )
93-
return (src0 << (32u - offset - width)) >> (32u - width);
94-
95-
return src0 >> offset;
96-
}
97-
#endif
9835

9936
static const __constant ulong keccakf_rndc[24] =
10037
{
@@ -128,6 +65,8 @@ static const __constant uchar sbox[256] =
12865
0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
12966
};
13067

68+
//#include "opencl/wolf-aes.cl"
69+
XMRSTAK_INCLUDE_WOLF_AES
13170

13271
void keccakf1600(ulong *s)
13372
{
@@ -355,8 +294,6 @@ inline uint getIdx()
355294
XMRSTAK_INCLUDE_FAST_INT_MATH_V2
356295
//#include "fast_div_heavy.cl"
357296
XMRSTAK_INCLUDE_FAST_DIV_HEAVY
358-
//#include "opencl/wolf-aes.cl"
359-
XMRSTAK_INCLUDE_WOLF_AES
360297
//#include "opencl/wolf-skein.cl"
361298
XMRSTAK_INCLUDE_WOLF_SKEIN
362299
//#include "opencl/jh.cl"
@@ -461,8 +398,6 @@ void CNKeccak(ulong *output, ulong *input)
461398

462399
static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 };
463400

464-
#define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U))
465-
466401
#define SubWord(inw) ((sbox[BYTE(inw, 3)] << 24) | (sbox[BYTE(inw, 2)] << 16) | (sbox[BYTE(inw, 1)] << 8) | sbox[BYTE(inw, 0)])
467402

468403
void AESExpandKey256(uint *keybuf)
@@ -539,6 +474,11 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
539474
State[8] = input[8];
540475
State[9] = input[9];
541476
State[10] = input[10];
477+
State[11] = input[11];
478+
State[12] = input[12];
479+
State[13] = input[13];
480+
State[14] = input[14];
481+
State[15] = input[15];
542482

543483
((__local uint *)State)[9] &= 0x00FFFFFFU;
544484
((__local uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24;
@@ -550,13 +490,13 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
550490
*/
551491
((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8));
552492

553-
for (int i = 11; i < 25; ++i) {
554-
State[i] = 0x00UL;
555-
}
556-
557493
// Last bit of padding
558494
State[16] = 0x8000000000000000UL;
559495

496+
for (int i = 17; i < 25; ++i) {
497+
State[i] = 0x00UL;
498+
}
499+
560500
keccakf1600_2(State);
561501

562502
#pragma unroll
@@ -1361,7 +1301,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
13611301
states += 25 * BranchBuf[idx];
13621302

13631303
ulong State[8] = { 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0x0001000000000000UL };
1364-
#if defined(__clang__) && !defined(__NV_CL_C_VERSION)
1304+
#if defined(__clang__) && !defined(__NV_CL_C_VERSION) && (IS_WINDOWS_OS != 1)
13651305
// on ROCM we need volatile for AMD RX5xx cards to avoid invalid shares
13661306
volatile
13671307
#endif

xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl

+64
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,70 @@ R"===(
22
#ifndef WOLF_AES_CL
33
#define WOLF_AES_CL
44

5+
/* For Mesa clover support */
6+
#ifdef cl_clang_storage_class_specifiers
7+
# pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
8+
#endif
9+
10+
#ifdef cl_amd_media_ops
11+
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
12+
#else
13+
/* taken from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops.txt
14+
* Build-in Function
15+
* uintn amd_bitalign (uintn src0, uintn src1, uintn src2)
16+
* Description
17+
* dst.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2.s0 & 31))
18+
* similar operation applied to other components of the vectors.
19+
*
20+
* The implemented function is modified because the last is in our case always a scalar.
21+
* We can ignore the bitwise AND operation.
22+
*/
23+
inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
24+
{
25+
uint2 result;
26+
result.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2));
27+
result.s1 = (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2));
28+
return result;
29+
}
30+
#endif
31+
32+
#ifdef cl_amd_media_ops2
33+
#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
34+
#else
35+
/* taken from: https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops2.txt
36+
* Built-in Function:
37+
* uintn amd_bfe (uintn src0, uintn src1, uintn src2)
38+
* Description
39+
* NOTE: operator >> below represent logical right shift
40+
* offset = src1.s0 & 31;
41+
* width = src2.s0 & 31;
42+
* if width = 0
43+
* dst.s0 = 0;
44+
* else if (offset + width) < 32
45+
* dst.s0 = (src0.s0 << (32 - offset - width)) >> (32 - width);
46+
* else
47+
* dst.s0 = src0.s0 >> offset;
48+
* similar operation applied to other components of the vectors
49+
*/
50+
inline int amd_bfe(const uint src0, const uint offset, const uint width)
51+
{
52+
/* casts are removed because we can implement everything as uint
53+
* int offset = src1;
54+
* int width = src2;
55+
* remove check for edge case, this function is always called with
56+
* `width==8`
57+
* @code
58+
* if ( width == 0 )
59+
* return 0;
60+
* @endcode
61+
*/
62+
if ( (offset + width) < 32u )
63+
return (src0 << (32u - offset - width)) >> (32u - width);
64+
65+
return src0 >> offset;
66+
}
67+
#endif
68+
569
// AES table - the other three are generated on the fly
670

771
static const __constant uint AES0_C[256] =

xmrstak/backend/amd/autoAdjust.hpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,8 @@ class autoAdjust
187187
memPerThread = std::min(memPerThread, memDoubleThread);
188188
}
189189

190-
// 224byte extra memory is used per thread for meta data
191-
size_t perThread = hashMemSize + 224u;
190+
// 240byte extra memory is used per thread for meta data
191+
size_t perThread = hashMemSize + 240u;
192192
size_t maxIntensity = memPerThread / perThread;
193193
size_t possibleIntensity = std::min( maxThreads , maxIntensity );
194194
// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group

xmrstak/backend/amd/minethd.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ void minethd::work_main()
273273

274274
for(size_t i = 0; i < results[0xFF]; i++)
275275
{
276-
uint8_t bWorkBlob[112];
276+
uint8_t bWorkBlob[128];
277277
uint8_t bResult[32];
278278

279279
memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);

0 commit comments

Comments
 (0)