Skip to content

Commit 74c094d

Browse files
committed
CryptonightR 2-5% faster
Tested on GTX Pascal cards and the 750 Ti. For Pascal cards, the stable overclock depends on the memory manufacturer: Samsung can do around +900 MHz, Micron +650 MHz, Hynix +500 MHz.
1 parent 5e8710a commit 74c094d

File tree

3 files changed

+37
-27
lines changed

3 files changed

+37
-27
lines changed

src/core/Config.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
* Copyright 2016 Jay D Dee <[email protected]>
77
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
88
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <[email protected]>
9-
*
10-
* This program is free software: you can redistribute it and/or modify
9+
* Copyright 2019 SP <https://github.com/sp-hash>
10+
* This program is free software: you can redistribute it and/or modify
1111
* it under the terms of the GNU General Public License as published by
1212
* the Free Software Foundation, either version 3 of the License, or
1313
* (at your option) any later version.
@@ -41,7 +41,7 @@
4141
xmrig::Config::Config() : xmrig::CommonConfig(),
4242
m_autoConf(false),
4343
m_shouldSave(false),
44-
m_maxGpuThreads(64),
44+
m_maxGpuThreads(128),
4545
m_maxGpuUsage(100)
4646
{
4747
}

src/core/ConfigLoader_default.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ R"===(
4646
"colors": true,
4747
"cuda-bfactor": null,
4848
"cuda-bsleep": null,
49-
"cuda-max-threads": 64,
49+
"cuda-max-threads": 128,
5050
"donate-level": 5,
5151
"log-file": null,
5252
"pools": [

src/nvidia/cuda_extra.cu

Lines changed: 33 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
1010
* Copyright 2019 Spudz76 <https://github.com/Spudz76>
1111
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <[email protected]>
12+
* Copyright 2019 SP <https://github.com/sp-hash>
1213
*
1314
* This program is free software: you can redistribute it and/or modify
1415
* it under the terms of the GNU General Public License as published by
@@ -117,7 +118,7 @@ __device__ __forceinline__ void mix_and_propagate( uint32_t* state )
117118

118119

119120
template<xmrig::Algo ALGO, xmrig::Variant VARIANT>
120-
__global__ void cryptonight_extra_gpu_prepare(
121+
__launch_bounds__(1024, 1) __global__ void cryptonight_extra_gpu_prepare(
121122
int threads,
122123
uint32_t *__restrict__ d_input,
123124
uint32_t len,
@@ -202,7 +203,7 @@ __global__ void cryptonight_extra_gpu_prepare(
202203

203204

204205
template<xmrig::Algo ALGO>
205-
__global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 )
206+
__launch_bounds__(1024,1) __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 )
206207
{
207208
const int thread = blockDim.x * blockIdx.x + threadIdx.x;
208209

@@ -275,7 +276,7 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
275276

276277

277278
template<xmrig::Algo ALGO>
278-
__global__ void cryptonight_gpu_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 )
279+
__launch_bounds__(1024, 1) __global__ void cryptonight_gpu_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 )
279280
{
280281
const int thread = blockDim.x * blockIdx.x + threadIdx.x;
281282

@@ -578,32 +579,40 @@ int cuda_get_deviceinfo(nvid_ctx* ctx, xmrig::Algo algo, bool isCNv2)
578579
ctx->device_pciDomainID = props.pciDomainID;
579580

580581
// set all device option those marked as auto (-1) to a valid value
581-
if (ctx->device_blocks == -1) {
582+
if (ctx->device_blocks == -1)
583+
{
582584
/* good values based on my experience
583585
* - 3 * SMX count >=sm_30
584586
* - 2 * SMX count for <sm_30
585587
*/
586-
ctx->device_blocks = props.multiProcessorCount * (props.major < 3 ? 2 : 3);
587-
588-
// increase bfactor for low end devices to avoid that the miner is killed by the OS
589-
# ifdef _WIN32
590-
if (props.multiProcessorCount <= 6 && ctx->device_bfactor == 6) {
591-
ctx->device_bfactor = 8;
592-
}
593-
# endif
588+
ctx->device_blocks = props.multiProcessorCount * (props.major < 3 ? 2 : 1);
594589
}
595590

596-
if (ctx->device_threads == -1) {
591+
if (ctx->device_threads == -1)
592+
{
597593
/* sm_20 devices can only run 512 threads per cuda block
598594
* `cryptonight_core_gpu_phase1` and `cryptonight_core_gpu_phase3` starts
599595
* `8 * ctx->device_threads` threads per block
600596
*/
601-
ctx->device_threads = 64;
597+
if (props.major < 6)
598+
{
599+
ctx->device_threads = 64;
600+
if ((ctx->device_arch[0] == 5) && ctx->device_arch[1] == 0)
601+
{
602+
ctx->device_threads = 40;
603+
}
604+
}
605+
else
606+
{
607+
ctx->device_threads = 128U;
608+
}
609+
602610
constexpr size_t byteToMiB = 1024u * 1024u;
603611

604612
// no limit by default 1TiB
605613
size_t maxMemUsage = byteToMiB * byteToMiB;
606-
if (props.major == 6) {
614+
/*if (props.major == 6)
615+
{
607616
if (props.multiProcessorCount < 15) {
608617
// limit memory usage for GPUs for pascal < GTX1070
609618
maxMemUsage = size_t(2048u) * byteToMiB;
@@ -613,6 +622,7 @@ int cuda_get_deviceinfo(nvid_ctx* ctx, xmrig::Algo algo, bool isCNv2)
613622
maxMemUsage = size_t(4096u) * byteToMiB;
614623
}
615624
}
625+
*/
616626

617627
if (props.major < 6) {
618628
// limit memory usage for GPUs before pascal
@@ -657,18 +667,20 @@ int cuda_get_deviceinfo(nvid_ctx* ctx, xmrig::Algo algo, bool isCNv2)
657667
perThread += 50 * 4; // state double buffer
658668
}
659669

660-
const size_t max_intensity = limitedMemory / perThread;
670+
// const size_t max_intensity = limitedMemory / perThread;
661671

662-
ctx->device_threads = max_intensity / ctx->device_blocks;
672+
// ctx->device_threads = max_intensity / ctx->device_blocks;
663673
// use only an even number of threads
664-
ctx->device_threads = ctx->device_threads & 0xFFFFFFFE;
674+
// ctx->device_threads = ctx->device_threads & 0xFFFFFFFE;
665675

666-
if (props.major == 2 && ctx->device_threads > 64) {
676+
if (props.major == 2 && ctx->device_threads > 64)
677+
{
667678
// Fermi gpus only support 512 threads per block (we need start 4 * configured threads)
668679
ctx->device_threads = 64;
669680
}
670681

671-
if (isCNv2 && props.major < 6) {
682+
if (isCNv2 && props.major < 6 && !(props.major== 5 && props.minor==0))
683+
{
672684
// 4 based on my test maybe it must be adjusted later
673685
size_t threads = 4;
674686
// 8 is chosen by checking the occupancy calculator
@@ -679,9 +691,7 @@ int cuda_get_deviceinfo(nvid_ctx* ctx, xmrig::Algo algo, bool isCNv2)
679691
ctx->device_blocks = blockOptimal;
680692
}
681693
}
682-
683-
}
684-
694+
}
685695
return 0;
686696
}
687697

0 commit comments

Comments
 (0)