@@ -9,6 +9,7 @@
  * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
  * Copyright 2019      Spudz76     <https://github.com/Spudz76>
  * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <[email protected]>
+ * Copyright 2019      SP          <https://github.com/sp-hash>
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -117,7 +118,7 @@ __device__ __forceinline__ void mix_and_propagate( uint32_t* state )


 template <xmrig::Algo ALGO, xmrig::Variant VARIANT>
-__global__ void cryptonight_extra_gpu_prepare(
+__launch_bounds__( 1024, 1 ) __global__ void cryptonight_extra_gpu_prepare(
     int threads,
     uint32_t *__restrict__ d_input,
     uint32_t len,
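For context on the change above: __launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor) is the standard CUDA qualifier that tells the compiler the largest block size a kernel will ever be launched with, so it can budget registers for that limit. A minimal sketch with a hypothetical kernel (not part of this commit):

    // Hypothetical kernel, only to illustrate the qualifier used above: the
    // compiler may assume at most 1024 threads per block and should keep at
    // least 1 block resident per SM when allocating registers.
    __launch_bounds__( 1024, 1 )
    __global__ void example_scale(const int *in, int *out, int n)
    {
        const int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < n) {
            out[i] = in[i] * 2;
        }
    }

    // Launching example_scale with more than 1024 threads per block would now
    // fail with an invalid-configuration error:
    // example_scale<<<grid, 1024>>>(d_in, d_out, n);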
@@ -202,7 +203,7 @@ __global__ void cryptonight_extra_gpu_prepare(


 template <xmrig::Algo ALGO>
-__global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t * __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 )
+__launch_bounds__( 1024, 1 ) __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t * __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 )
 {
     const int thread = blockDim.x * blockIdx.x + threadIdx.x;

@@ -275,7 +276,7 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3


 template <xmrig::Algo ALGO>
-__global__ void cryptonight_gpu_extra_gpu_final( int threads, uint64_t target, uint32_t * __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 )
+__launch_bounds__( 1024, 1 ) __global__ void cryptonight_gpu_extra_gpu_final( int threads, uint64_t target, uint32_t * __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 )
 {
     const int thread = blockDim.x * blockIdx.x + threadIdx.x;

@@ -578,32 +579,40 @@ int cuda_get_deviceinfo(nvid_ctx* ctx, xmrig::Algo algo, bool isCNv2)
     ctx->device_pciDomainID = props.pciDomainID;

     // set all device option those marked as auto (-1) to a valid value
-    if (ctx->device_blocks == -1) {
+    if (ctx->device_blocks == -1)
+    {
         /* good values based of my experience
          *   - 3 * SMX count >= sm_30
          *   - 2 * SMX count for <  sm_30
          */
-        ctx->device_blocks = props.multiProcessorCount * (props.major < 3 ? 2 : 3);
-
-        // increase bfactor for low end devices to avoid that the miner is killed by the OS
-#       ifdef _WIN32
-        if (props.multiProcessorCount <= 6 && ctx->device_bfactor == 6) {
-            ctx->device_bfactor = 8;
-        }
-#       endif
+        ctx->device_blocks = props.multiProcessorCount * (props.major < 3 ? 2 : 1);
     }

-    if (ctx->device_threads == -1) {
+    if (ctx->device_threads == -1)
+    {
        /* sm_20 devices can only run 512 threads per cuda block
         * `cryptonight_core_gpu_phase1` and `cryptonight_core_gpu_phase3` starts
         * `8 * ctx->device_threads` threads per block
         */
-        ctx->device_threads = 64;
+        if (props.major < 6)
+        {
+            ctx->device_threads = 64;
+            if ((ctx->device_arch[0] == 5) && ctx->device_arch[1] == 0)
+            {
+                ctx->device_threads = 40;
+            }
+        }
+        else
+        {
+            ctx->device_threads = 128U;
+        }
+
         constexpr size_t byteToMiB = 1024u * 1024u;

         // no limit by default 1TiB
         size_t maxMemUsage = byteToMiB * byteToMiB;
-        if (props.major == 6) {
+        /* if (props.major == 6)
+        {
             if (props.multiProcessorCount < 15) {
                 // limit memory usage for GPUs for pascal < GTX1070
                 maxMemUsage = size_t(2048u) * byteToMiB;
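Taken together, the new auto-configuration above derives blocks from the SM count and threads from the compute capability. A minimal host-side sketch of that selection, assuming only the fields used in the diff (multiProcessorCount and the major/minor compute capability held in device_arch); the struct and function names are illustrative, not the miner's API:

    #include <cstdio>

    struct DeviceInfo {                 // illustrative stand-in for nvid_ctx/props
        int multiProcessorCount;
        int major;                      // device_arch[0]
        int minor;                      // device_arch[1]
    };

    // Mirrors the defaults introduced above: 2 * SM count before sm_30,
    // otherwise 1 * SM count; 64 threads below sm_60 (40 on sm_50),
    // 128 threads from Pascal (sm_60) upwards.
    static void pick_defaults(const DeviceInfo &d, int &blocks, int &threads)
    {
        blocks  = d.multiProcessorCount * (d.major < 3 ? 2 : 1);
        threads = (d.major < 6) ? ((d.major == 5 && d.minor == 0) ? 40 : 64)
                                : 128;
    }

    int main()
    {
        DeviceInfo card{15, 6, 1};      // example values only
        int blocks = 0, threads = 0;
        pick_defaults(card, blocks, threads);
        std::printf("blocks=%d threads=%d\n", blocks, threads);
        return 0;
    }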
@@ -613,6 +622,7 @@ int cuda_get_deviceinfo(nvid_ctx* ctx, xmrig::Algo algo, bool isCNv2)
                 maxMemUsage = size_t(4096u) * byteToMiB;
             }
         }
+        */

         if (props.major < 6) {
             // limit memory usage for GPUs before pascal
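With the fixed Pascal memory caps commented out above, the memory actually available can still be inspected at runtime. A short sketch using the standard cudaMemGetInfo call (an illustration, not code from this commit):

    #include <cuda_runtime.h>
    #include <cstdio>

    int main()
    {
        size_t freeBytes = 0, totalBytes = 0;
        // Standard CUDA runtime query: free and total device memory in bytes.
        if (cudaMemGetInfo(&freeBytes, &totalBytes) != cudaSuccess) {
            std::fprintf(stderr, "cudaMemGetInfo failed\n");
            return 1;
        }
        const size_t byteToMiB = 1024u * 1024u;
        std::printf("free: %zu MiB, total: %zu MiB\n",
                    freeBytes / byteToMiB, totalBytes / byteToMiB);
        return 0;
    }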
@@ -657,18 +667,20 @@ int cuda_get_deviceinfo(nvid_ctx* ctx, xmrig::Algo algo, bool isCNv2)
             perThread += 50 * 4; // state double buffer
         }

-        const size_t max_intensity = limitedMemory / perThread;
+        // const size_t max_intensity = limitedMemory / perThread;

-        ctx->device_threads = max_intensity / ctx->device_blocks;
+        // ctx->device_threads = max_intensity / ctx->device_blocks;
         // use only odd number of threads
-        ctx->device_threads = ctx->device_threads & 0xFFFFFFFE;
+        // ctx->device_threads = ctx->device_threads & 0xFFFFFFFE;

-        if (props.major == 2 && ctx->device_threads > 64) {
+        if (props.major == 2 && ctx->device_threads > 64)
+        {
             // Fermi gpus only support 512 threads per block (we need start 4 * configured threads)
             ctx->device_threads = 64;
         }

-        if (isCNv2 && props.major < 6) {
+        if (isCNv2 && props.major < 6 && !(props.major == 5 && props.minor == 0))
+        {
             // 4 based on my test maybe it must be adjusted later
             size_t threads = 4;
             // 8 is chosen by checking the occupancy calculator
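The 64-thread cap above exists because the core kernels start a multiple of the configured thread count per CUDA block (the 8x and 4x factors mentioned in the comments), and that product must stay within the device's per-block limit. A small sanity-check sketch along those lines, with the multiplier passed in; this is not the miner's code:

    #include <cuda_runtime.h>
    #include <cstdio>

    // Clamp a requested thread count so that `expansion` (e.g. the 4x or 8x
    // factors from the comments above) times the configured value still fits
    // into the device's maxThreadsPerBlock.
    static int clamp_threads(int requested, int expansion, int device)
    {
        cudaDeviceProp props{};
        if (cudaGetDeviceProperties(&props, device) != cudaSuccess) {
            return requested;           // leave unchanged if the query fails
        }
        const int limit = props.maxThreadsPerBlock / expansion;
        return requested > limit ? limit : requested;
    }

    int main()
    {
        // Example: 128 configured threads with an 8x kernel expansion, device 0.
        std::printf("threads = %d\n", clamp_threads(128, 8, 0));
        return 0;
    }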
@@ -679,9 +691,7 @@ int cuda_get_deviceinfo(nvid_ctx* ctx, xmrig::Algo algo, bool isCNv2)
                 ctx->device_blocks = blockOptimal;
             }
         }
-
-    }
-
+    }
     return 0;
 }
