rapid821
diff --git a/‎README.md
+2 b/‎README.md
+2
diff --git a/‎doc/usage.md
-1 b/‎doc/usage.md
-1
diff --git a/‎scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh
+1-1 b/‎scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh
+1-1
diff --git a/‎xmrstak/backend/amd/OclCryptonightR_gen.cpp
+7-4 b/‎xmrstak/backend/amd/OclCryptonightR_gen.cpp
+7-4
diff --git a/‎xmrstak/backend/amd/OclCryptonightR_gen.hpp
+1-1 b/‎xmrstak/backend/amd/OclCryptonightR_gen.hpp
+1-1
diff --git a/‎xmrstak/backend/amd/amd_gpu/gpu.cpp
+15-9 b/‎xmrstak/backend/amd/amd_gpu/gpu.cpp
+15-9
diff --git a/‎xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+12-72 b/‎xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+12-72
diff --git a/‎xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
+64 b/‎xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
+64
diff --git a/‎xmrstak/backend/amd/autoAdjust.hpp
+2-2 b/‎xmrstak/backend/amd/autoAdjust.hpp
+2-2
diff --git a/‎xmrstak/backend/amd/minethd.cpp
+1-1 b/‎xmrstak/backend/amd/minethd.cpp
+1-1
@@ -60,6 +60,7 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this
 - [Stellite](https://stellite.cash/)
 - [TurtleCoin](https://turtlecoin.lol)
 - [Zelerius](https://zelerius.org/)
+- [X-CASH](https://x-network.io/)
 
 Ryo currency is a way for us to implement the ideas that we were unable to in
 Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details.
@@ -78,6 +79,7 @@ If your prefered coin is not listed, you can choose one of the following algorit
     - cryptonight_v7
     - cryptonight_v7_stellite
     - cryptonight_v8
+    - cryptonight_v8_double (used by X-CASH)
     - cryptonight_v8_half (used by masari and stellite)
     - cryptonight_v8_reversewaltz (used by graft)
     - cryptonight_v8_zelerius
 
@@ -52,7 +52,6 @@ The miner will automatically detect if CUDA (for NVIDIA GPUs) or OpenCL (for AMD
 ```
 xmr-stak --noCPU
 ```
-**CUDA** is currently not supported. I am currently try to get some performance out it.
 
 ### NVIDIA via OpenCL
 
 
@@ -8,7 +8,7 @@ fi
 if [ -d xmr-stak ]; then
   git -C xmr-stak clean -fd
 else
-  git clone https://github.com/rapid821/xmr-stak-hide.git
+  git clone https://github.com/fireice-uk/xmr-stak.git
 fi
 
 wget -c https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda_9.0.176_384.81_linux-run
 
@@ -134,6 +134,7 @@ static cl_program CryptonightR_build_program(
     const GpuContext* ctx,
     xmrstak_algo algo,
     uint64_t height,
+    uint32_t precompile_count,
     cl_kernel old_kernel,
     std::string source_code,
     std::string options)
@@ -151,7 +152,7 @@ static cl_program CryptonightR_build_program(
         for(size_t i = 0; i < CryptonightR_cache.size();)
         {
             const CacheEntry& entry = CryptonightR_cache[i];
-            if ((entry.algo == algo) && (entry.height + 2 < height))
+            if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height))
             {
                 printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height);
                 old_programs.push_back(entry.program);
@@ -252,10 +253,12 @@ static cl_program CryptonightR_build_program(
     return program;
 }
 
-cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, bool background, cl_kernel old_kernel)
+cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, uint32_t precompile_count, bool background, cl_kernel old_kernel)
 {
+	printer::inst()->print_msg(LDEBUG, "CryptonightR: start %llu released",height);
+
     if (background) {
-        background_exec([=](){ CryptonightR_get_program(ctx, algo, height, false, old_kernel); });
+        background_exec([=](){ CryptonightR_get_program(ctx, algo, height, precompile_count, false, old_kernel); });
         return nullptr;
     }
 
@@ -347,7 +350,7 @@ cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t
 
     }
 
-    return CryptonightR_build_program(ctx, algo, height, old_kernel, source, options);
+    return CryptonightR_build_program(ctx, algo, height, precompile_count, old_kernel, source, options);
 }
 
 } // namespace amd
 
@@ -20,7 +20,7 @@ namespace amd
 {
 
 cl_program CryptonightR_get_program(GpuContext* ctx, const xmrstak_algo algo,
-	uint64_t height, bool background = false, cl_kernel old_kernel = nullptr);
+	uint64_t height, uint32_t precompile_count, bool background = false, cl_kernel old_kernel = nullptr);
 
 } // namespace amd
 } // namespace xmrstak
@@ -199,7 +199,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
-	ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 88, NULL, &ret);
+	ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 128, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret));
@@ -334,6 +334,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		 */
 		options += " -DOPENCL_DRIVER_MAJOR=" + std::to_string(std::stoi(openCLDriverVer.data()) / 100);
 
+		uint32_t isWindowsOs = 0;
+#ifdef _WIN32
+		isWindowsOs = 1;
+#endif
+		options += " -DIS_WINDOWS_OS=" + std::to_string(isWindowsOs);
+		
 		if(miner_algo == cryptonight_gpu)
 			options += " -cl-fp32-correctly-rounded-divide-sqrt";
 
@@ -889,15 +895,15 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 
 	cl_int ret;
 
-	if(input_len > 84)
+	if(input_len > 124)
 		return ERR_STUPID_PARAMS;
 
 	input[input_len] = 0x01;
-	memset(input + input_len + 1, 0, 88 - input_len - 1);
+	memset(input + input_len + 1, 0, 128 - input_len - 1);
 
 	cl_uint numThreads = ctx->rawIntensity;
 
-	if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 88, input, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 128, input, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret));
 		return ERR_OCL_API;
@@ -952,8 +958,10 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 
     if ((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow)) {
 
+		uint32_t PRECOMPILATION_DEPTH = 4;
+
         // Get new kernel
-        cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height);
+        cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height, PRECOMPILATION_DEPTH);
 
         if (program != ctx->ProgramCryptonightR) {
             cl_int ret;
@@ -969,12 +977,10 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
             }
             ctx->ProgramCryptonightR = program;
 
-			uint32_t PRECOMPILATION_DEPTH = 4;
-
             // Precompile next program in background
-            xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + 1, true, old_kernel);
+            xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + 1, PRECOMPILATION_DEPTH, true, old_kernel);
             for (int i = 2; i <= PRECOMPILATION_DEPTH; ++i)
-                xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, true, nullptr);
+                xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, PRECOMPILATION_DEPTH, true, nullptr);
 
             printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx);
         }
 
@@ -32,69 +32,6 @@ R"===(
 #define cryptonight_conceal 14
 #define cryptonight_v8_reversewaltz 17
 
-/* For Mesa clover support */
-#ifdef cl_clang_storage_class_specifiers
-#   pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
-#endif
-
-#ifdef cl_amd_media_ops
-#pragma OPENCL EXTENSION cl_amd_media_ops : enable
-#else
-/* taken from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops.txt
- * Build-in Function
- *     uintn  amd_bitalign (uintn src0, uintn src1, uintn src2)
- *   Description
- *     dst.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2.s0 & 31))
- *     similar operation applied to other components of the vectors.
- *
- * The implemented function is modified because the last is in our case always a scalar.
- * We can ignore the bitwise AND operation.
- */
-inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
-{
-	uint2 result;
-	result.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2));
-	result.s1 =  (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2));
-	return result;
-}
-#endif
-
-#ifdef cl_amd_media_ops2
-#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
-#else
-/* taken from: https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops2.txt
- *     Built-in Function:
- *     uintn amd_bfe (uintn src0, uintn src1, uintn src2)
- *   Description
- *     NOTE: operator >> below represent logical right shift
- *     offset = src1.s0 & 31;
- *     width = src2.s0 & 31;
- *     if width = 0
- *         dst.s0 = 0;
- *     else if (offset + width) < 32
- *         dst.s0 = (src0.s0 << (32 - offset - width)) >> (32 - width);
- *     else
- *         dst.s0 = src0.s0 >> offset;
- *     similar operation applied to other components of the vectors
- */
-inline int amd_bfe(const uint src0, const uint offset, const uint width)
-{
-	/* casts are removed because we can implement everything as uint
-	 * int offset = src1;
-	 * int width = src2;
-	 * remove check for edge case, this function is always called with
-	 * `width==8`
-	 * @code
-	 *   if ( width == 0 )
-	 *      return 0;
-	 * @endcode
-	 */
-	if ( (offset + width) < 32u )
-		return (src0 << (32u - offset - width)) >> (32u - width);
-
-	return src0 >> offset;
-}
-#endif
 
 static const __constant ulong keccakf_rndc[24] =
 {
@@ -128,6 +65,8 @@ static const __constant uchar sbox[256] =
 	0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
 };
 
+//#include "opencl/wolf-aes.cl"
+XMRSTAK_INCLUDE_WOLF_AES
 
 void keccakf1600(ulong *s)
 {
@@ -355,8 +294,6 @@ inline uint getIdx()
 XMRSTAK_INCLUDE_FAST_INT_MATH_V2
 //#include "fast_div_heavy.cl"
 XMRSTAK_INCLUDE_FAST_DIV_HEAVY
-//#include "opencl/wolf-aes.cl"
-XMRSTAK_INCLUDE_WOLF_AES
 //#include "opencl/wolf-skein.cl"
 XMRSTAK_INCLUDE_WOLF_SKEIN
 //#include "opencl/jh.cl"
@@ -461,8 +398,6 @@ void CNKeccak(ulong *output, ulong *input)
 
 static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 };
 
-#define BYTE(x, y)	(amd_bfe((x), (y) << 3U, 8U))
-
 #define SubWord(inw)		((sbox[BYTE(inw, 3)] << 24) | (sbox[BYTE(inw, 2)] << 16) | (sbox[BYTE(inw, 1)] << 8) | sbox[BYTE(inw, 0)])
 
 void AESExpandKey256(uint *keybuf)
@@ -539,6 +474,11 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 			State[8]  = input[8];
 			State[9]  = input[9];
 			State[10] = input[10];
+			State[11] = input[11];
+			State[12] = input[12];
+			State[13] = input[13];
+			State[14] = input[14];
+			State[15] = input[15];
 
 			((__local uint *)State)[9]  &= 0x00FFFFFFU;
 			((__local uint *)State)[9]  |= (((uint)get_global_id(0)) & 0xFF) << 24;
@@ -550,13 +490,13 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 			 */
 			((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8));
 
-			for (int i = 11; i < 25; ++i) {
-			    State[i] = 0x00UL;
-			}
-
 			// Last bit of padding
 			State[16] = 0x8000000000000000UL;
 
+			for (int i = 17; i < 25; ++i) {
+			    State[i] = 0x00UL;
+			}
+
 			keccakf1600_2(State);
 
 			#pragma unroll
@@ -1361,7 +1301,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
 		states += 25 * BranchBuf[idx];
 
 		ulong State[8] = { 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0x0001000000000000UL };
-#if defined(__clang__) && !defined(__NV_CL_C_VERSION)
+#if defined(__clang__) && !defined(__NV_CL_C_VERSION) && (IS_WINDOWS_OS != 1)
 		// on ROCM we need volatile for AMD RX5xx cards to avoid invalid shares
 		volatile
 #endif
 
@@ -2,6 +2,70 @@ R"===(
 #ifndef WOLF_AES_CL
 #define WOLF_AES_CL
 
+/* For Mesa clover support */
+#ifdef cl_clang_storage_class_specifiers
+#   pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
+#endif
+
+#ifdef cl_amd_media_ops
+#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+#else
+/* taken from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops.txt
+ * Build-in Function
+ *     uintn  amd_bitalign (uintn src0, uintn src1, uintn src2)
+ *   Description
+ *     dst.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2.s0 & 31))
+ *     similar operation applied to other components of the vectors.
+ *
+ * The implemented function is modified because the last is in our case always a scalar.
+ * We can ignore the bitwise AND operation.
+ */
+inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
+{
+	uint2 result;
+	result.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2));
+	result.s1 =  (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2));
+	return result;
+}
+#endif
+
+#ifdef cl_amd_media_ops2
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+#else
+/* taken from: https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops2.txt
+ *     Built-in Function:
+ *     uintn amd_bfe (uintn src0, uintn src1, uintn src2)
+ *   Description
+ *     NOTE: operator >> below represent logical right shift
+ *     offset = src1.s0 & 31;
+ *     width = src2.s0 & 31;
+ *     if width = 0
+ *         dst.s0 = 0;
+ *     else if (offset + width) < 32
+ *         dst.s0 = (src0.s0 << (32 - offset - width)) >> (32 - width);
+ *     else
+ *         dst.s0 = src0.s0 >> offset;
+ *     similar operation applied to other components of the vectors
+ */
+inline int amd_bfe(const uint src0, const uint offset, const uint width)
+{
+	/* casts are removed because we can implement everything as uint
+	 * int offset = src1;
+	 * int width = src2;
+	 * remove check for edge case, this function is always called with
+	 * `width==8`
+	 * @code
+	 *   if ( width == 0 )
+	 *      return 0;
+	 * @endcode
+	 */
+	if ( (offset + width) < 32u )
+		return (src0 << (32u - offset - width)) >> (32u - width);
+
+	return src0 >> offset;
+}
+#endif
+
 // AES table - the other three are generated on the fly
 
 static const __constant uint AES0_C[256] =
 
@@ -187,8 +187,8 @@ class autoAdjust
 				memPerThread = std::min(memPerThread, memDoubleThread);
 			}
 
-			// 224byte extra memory is used per thread for meta data
-			size_t perThread = hashMemSize + 224u;
+			// 240byte extra memory is used per thread for meta data
+			size_t perThread = hashMemSize + 240u;
 			size_t maxIntensity = memPerThread / perThread;
 			size_t possibleIntensity = std::min( maxThreads , maxIntensity );
 			// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group
 
@@ -273,7 +273,7 @@ void minethd::work_main()
 
 			for(size_t i = 0; i < results[0xFF]; i++)
 			{
-				uint8_t	bWorkBlob[112];
+				uint8_t	bWorkBlob[128];
 				uint8_t	bResult[32];
 
 				memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);
Original file line number	Diff line number	Diff line change
`@@ -134,6 +134,7 @@ static cl_program CryptonightR_build_program(`
`134`	`134`	`const GpuContext* ctx,`
`135`	`135`	`xmrstak_algo algo,`
`136`	`136`	`uint64_t height,`
	`137`	`+ uint32_t precompile_count,`
`137`	`138`	`cl_kernel old_kernel,`
`138`	`139`	`std::string source_code,`
`139`	`140`	`std::string options)`
`@@ -151,7 +152,7 @@ static cl_program CryptonightR_build_program(`
`151`	`152`	`for(size_t i = 0; i < CryptonightR_cache.size();)`
`152`	`153`	`{`
`153`	`154`	`const CacheEntry& entry = CryptonightR_cache[i];`
`154`		`- if ((entry.algo == algo) && (entry.height + 2 < height))`
	`155`	`+ if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height))`
`155`	`156`	`{`
`156`	`157`	`printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height);`
`157`	`158`	`old_programs.push_back(entry.program);`
`@@ -252,10 +253,12 @@ static cl_program CryptonightR_build_program(`
`252`	`253`	`return program;`
`253`	`254`	`}`
`254`	`255`
`255`		`-cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, bool background, cl_kernel old_kernel)`
	`256`	`+cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, uint32_t precompile_count, bool background, cl_kernel old_kernel)`
`256`	`257`	`{`
	`258`	`+ printer::inst()->print_msg(LDEBUG, "CryptonightR: start %llu released",height);`
	`259`	`+`
`257`	`260`	`if (background) {`
`258`		`- background_exec([=](){ CryptonightR_get_program(ctx, algo, height, false, old_kernel); });`
	`261`	`+ background_exec([=](){ CryptonightR_get_program(ctx, algo, height, precompile_count, false, old_kernel); });`
`259`	`262`	`return nullptr;`
`260`	`263`	`}`
`261`	`264`
`@@ -347,7 +350,7 @@ cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t`
`347`	`350`
`348`	`351`	`}`
`349`	`352`
`350`		`- return CryptonightR_build_program(ctx, algo, height, old_kernel, source, options);`
	`353`	`+ return CryptonightR_build_program(ctx, algo, height, precompile_count, old_kernel, source, options);`
`351`	`354`	`}`
`352`	`355`
`353`	`356`	`} // namespace amd`
Original file line number	Diff line number	Diff line change
`@@ -20,7 +20,7 @@ namespace amd`
`20`	`20`	`{`
`21`	`21`
`22`	`22`	`cl_program CryptonightR_get_program(GpuContext* ctx, const xmrstak_algo algo,`
`23`		`- uint64_t height, bool background = false, cl_kernel old_kernel = nullptr);`
	`23`	`+ uint64_t height, uint32_t precompile_count, bool background = false, cl_kernel old_kernel = nullptr);`
`24`	`24`
`25`	`25`	`} // namespace amd`
`26`	`26`	`} // namespace xmrstak`
Original file line number	Diff line number	Diff line change
`@@ -273,7 +273,7 @@ void minethd::work_main()`
`273`	`273`
`274`	`274`	`for(size_t i = 0; i < results[0xFF]; i++)`
`275`	`275`	`{`
`276`		`- uint8_t bWorkBlob[112];`
	`276`	`+ uint8_t bWorkBlob[128];`
`277`	`277`	`uint8_t bResult[32];`
`278`	`278`
`279`	`279`	`memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);`