Skip to content

Commit 9267abb

Browse files
committed
NT-opencl: 64-bit binary size
Not only do we save memory, but we can also reverse much more, and reject early. We check the remaining bits in cold host code, for good measure. Closes #5245
1 parent 42f37d4 commit 9267abb

27 files changed

+1169
-560
lines changed

doc/NEWS

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,11 @@ Major changes from 1.9.0-jumbo-1 (May 2019) in this bleeding-edge version:
296296

297297
- Added support for cracking SNTP-MS "timeroast". [magnum; 2023]
298298

299+
- Added NT-long-opencl (password length of up to 125 bytes). [magnum; 2023]
300+
301+
- NT-opencl: 64-bit binary size. Some good performance boost depending on
302+
number of hashes loaded. [magnum; 2023]
303+
299304

300305
Major changes from 1.8.0-jumbo-1 (December 2014) to 1.9.0-jumbo-1 (May 2019):
301306

run/opencl/nt_kernel.cl

Lines changed: 107 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@
3535
#define SQRT_2 0x5a827999
3636
#define SQRT_3 0x6ed9eba1
3737

38+
/*
39+
* If enabled, will check bitmap after calculating just the
40+
* first 32 bits of 'b' (does not apply to nt-long-opencl).
41+
*/
42+
#define EARLY_REJECT 1
43+
3844
#if USE_LOCAL_BITMAPS
3945
#define BITMAPS_TYPE __local
4046
#else
@@ -50,13 +56,13 @@
5056
#define CACHE_TYPE __global
5157
#endif
5258

53-
#if BITMAP_SIZE_BITS_LESS_ONE < 0xffffffff
54-
#define BITMAP_SIZE_BITS (BITMAP_SIZE_BITS_LESS_ONE + 1)
59+
#if BITMAP_MASK < 0xffffffff
60+
#define BITMAP_SIZE_BITS (BITMAP_MASK + 1)
5561
#else
56-
#error BITMAP_SIZE_BITS_LESS_ONE too large
62+
#error BITMAP_MASK too large
5763
#endif
5864

59-
inline void nt_crypt(uint *hash, uint *nt_buffer, uint md4_size)
65+
inline int nt_crypt(uint *hash, uint *nt_buffer, uint md4_size, BITMAPS_TYPE uint *bitmaps)
6066
{
6167
/* Round 1 */
6268
hash[0] = 0xFFFFFFFF + nt_buffer[0] ; hash[0] = rotate(hash[0], 3u);
@@ -130,19 +136,38 @@ inline void nt_crypt(uint *hash, uint *nt_buffer, uint md4_size)
130136
hash[3] += MD4_H2(hash[0], hash[1], hash[2]) + nt_buffer[9] + SQRT_3; hash[3] = rotate(hash[3], 9u );
131137
hash[2] += MD4_H (hash[3], hash[0], hash[1]) + nt_buffer[5] + SQRT_3; hash[2] = rotate(hash[2], 11u);
132138
hash[1] += MD4_H2(hash[2], hash[3], hash[0]) + nt_buffer[13];
139+
140+
#if EARLY_REJECT && PLAINTEXT_LENGTH <= 27
141+
uint bitmap_index = hash[1] & BITMAP_MASK;
142+
uint tmp = (bitmaps[(BITMAP_SIZE_BITS >> 5) * 0 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
143+
#if SELECT_CMP_STEPS == 8
144+
bitmap_index = (hash[1] >> 8) & BITMAP_MASK;
145+
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 1 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
146+
bitmap_index = (hash[1] >> 16) & BITMAP_MASK;
147+
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 2 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
148+
bitmap_index = (hash[1] >> 24) & BITMAP_MASK;
149+
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 3 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
150+
#elif SELECT_CMP_STEPS == 4
151+
bitmap_index = (hash[1] >> 16) & BITMAP_MASK;
152+
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 1 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
153+
#endif /* SELECT_CMP_STEPS == 8 */
154+
if (likely(!tmp))
155+
return 0;
156+
#endif /* EARLY_REJECT && PLAINTEXT_LENGTH <= 27 */
157+
133158
uint hash1 = hash[1] + SQRT_3; hash1 = rotate(hash1, 15u);
134159

135160
hash[0] += MD4_H (hash[3], hash[2], hash1 ) + nt_buffer[3] + SQRT_3; hash[0] = rotate(hash[0], 3u );
136-
hash[3] += MD4_H2(hash[2], hash1, hash[0]) + nt_buffer[11] + SQRT_3; hash[3] = rotate(hash[3], 9u );
137-
hash[2] += MD4_H (hash1, hash[0], hash[3]) + nt_buffer[7] + SQRT_3; hash[2] = rotate(hash[2], 11u);
138161

139162
#if PLAINTEXT_LENGTH > 27
140163
if (likely(md4_size <= (27 << 4)))
141-
return;
164+
return 1;
142165

143166
/*
144167
* Complete the first of a multi-block MD4 (reversing steps not possible).
145168
*/
169+
hash[3] += MD4_H2(hash[2], hash1, hash[0]) + nt_buffer[11] + SQRT_3; hash[3] = rotate(hash[3], 9u );
170+
hash[2] += MD4_H (hash1, hash[0], hash[3]) + nt_buffer[7] + SQRT_3; hash[2] = rotate(hash[2], 11u);
146171
hash[1] = hash1 + MD4_H2(hash[2], hash[3], hash[0]) + nt_buffer[15] + SQRT_3; hash[1] = rotate(hash[1], 15u);
147172
hash[0] += INIT_A;
148173
hash[1] += INIT_B;
@@ -233,15 +258,16 @@ inline void nt_crypt(uint *hash, uint *nt_buffer, uint md4_size)
233258
* This bogus reverse adds a little work to long crypts instead
234259
* of losing the real reverse for single block crypts.
235260
*/
236-
hash[0] -= INIT_A;
237-
hash[1] -= INIT_B;
238-
hash[2] -= INIT_C;
239261
hash[3] -= INIT_D;
262+
hash[2] -= INIT_C;
263+
hash[1] -= INIT_B;
264+
hash[0] -= INIT_A;
240265
hash[1] = (hash[1] >> 15) | (hash[1] << 17);
241-
hash[1] -= SQRT_3 + (hash[2] ^ hash[3] ^ hash[0]);
242-
hash[1] = (hash[1] >> 15) | (hash[1] << 17);
266+
hash[1] -= SQRT_3 + MD4_H2(hash[2], hash[3], hash[0]);
267+
hash[1] = rotate(hash[1], -15u);
243268
hash[1] -= SQRT_3;
244269
#endif
270+
return 1;
245271
}
246272

247273
#if __OS_X__ && (cpu(DEVICE_INFO) || gpu_nvidia(DEVICE_INFO))
@@ -341,47 +367,32 @@ inline uint prepare_key(__global uint *key, uint length, uint *nt_buffer)
341367
#endif /* UTF_8 */
342368

343369
inline void cmp_final(uint gid,
344-
uint iter,
345-
uint *hash,
346-
__global uint *offset_table,
347-
__global uint *hash_table,
348-
__global uint *return_hashes,
349-
volatile __global uint *output,
350-
volatile __global uint *bitmap_dupe) {
351-
352-
uint t, offset_table_index, hash_table_index;
353-
unsigned long LO, HI;
354-
unsigned long p;
355-
356-
HI = ((unsigned long)hash[3] << 32) | (unsigned long)hash[2];
357-
LO = ((unsigned long)hash[1] << 32) | (unsigned long)hash[0];
358-
359-
p = (HI % OFFSET_TABLE_SIZE) * SHIFT64_OT_SZ;
360-
p += LO % OFFSET_TABLE_SIZE;
361-
p %= OFFSET_TABLE_SIZE;
362-
offset_table_index = (unsigned int)p;
363-
364-
//error: chances of overflow is extremely low.
365-
LO += (unsigned long)offset_table[offset_table_index];
366-
367-
p = (HI % HASH_TABLE_SIZE) * SHIFT64_HT_SZ;
368-
p += LO % HASH_TABLE_SIZE;
369-
p %= HASH_TABLE_SIZE;
370-
hash_table_index = (unsigned int)p;
371-
372-
if (hash_table[hash_table_index] == hash[0])
373-
if (hash_table[HASH_TABLE_SIZE + hash_table_index] == hash[1])
374-
{
375-
/*
376-
* Prevent duplicate keys from cracking same hash
377-
*/
378-
if (!(atomic_or(&bitmap_dupe[hash_table_index/32], (1U << (hash_table_index % 32))) & (1U << (hash_table_index % 32)))) {
370+
uint iter,
371+
uint *hash,
372+
__global uint *offset_table,
373+
__global uint *hash_table,
374+
volatile __global uint *output,
375+
volatile __global uint *bitmap_dupe)
376+
{
377+
378+
uint t, hash_table_index;
379+
ulong hash64;
380+
381+
hash64 = ((ulong)hash[1] << 32) | (ulong)hash[0];
382+
hash64 += (ulong)offset_table[hash64 % OFFSET_TABLE_SIZE];
383+
hash_table_index = hash64 % HASH_TABLE_SIZE;
384+
385+
if (hash_table[hash_table_index] == hash[0] &&
386+
hash_table[hash_table_index + HASH_TABLE_SIZE] == hash[1]) {
387+
/*
388+
* Prevent duplicate keys from cracking same hash
389+
*/
390+
if (!(atomic_or(&bitmap_dupe[hash_table_index / 32],
391+
(1U << (hash_table_index % 32))) & (1U << (hash_table_index % 32)))) {
379392
t = atomic_inc(&output[0]);
380-
output[1 + 3 * t] = gid;
381-
output[2 + 3 * t] = iter;
382-
output[3 + 3 * t] = hash_table_index;
383-
return_hashes[2 * t] = hash[2];
384-
return_hashes[2 * t + 1] = hash[3];
393+
output[3 * t + 1] = gid;
394+
output[3 * t + 2] = iter;
395+
output[3 * t + 3] = hash_table_index;
385396
}
386397
}
387398
}
@@ -392,55 +403,59 @@ inline void cmp(uint gid,
392403
BITMAPS_TYPE uint *bitmaps,
393404
__global uint *offset_table,
394405
__global uint *hash_table,
395-
__global uint *return_hashes,
396406
volatile __global uint *output,
397407
volatile __global uint *bitmap_dupe)
398408
{
399409
uint bitmap_index, tmp = 1;
400410

401-
/* hash[0] += 0x67452301;
402-
hash[1] += 0xefcdab89;
403-
hash[2] += 0x98badcfe;
404-
hash[3] += 0x10325476;*/
405-
406-
#if SELECT_CMP_STEPS > 4
407-
bitmap_index = hash[0] & (BITMAP_SIZE_BITS - 1);
408-
tmp &= (bitmaps[bitmap_index >> 5] >> (bitmap_index & 31)) & 1U;
409-
bitmap_index = (hash[0] >> 16) & (BITMAP_SIZE_BITS - 1);
410-
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
411-
bitmap_index = hash[1] & (BITMAP_SIZE_BITS - 1);
412-
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 4) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
413-
bitmap_index = (hash[1] >> 16) & (BITMAP_SIZE_BITS - 1);
411+
#if SELECT_CMP_STEPS == 8
412+
#if !EARLY_REJECT || PLAINTEXT_LENGTH > 27
413+
bitmap_index = hash[1] & BITMAP_MASK;
414+
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 0 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
415+
bitmap_index = (hash[1] >> 8) & BITMAP_MASK;
416+
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 1 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
417+
bitmap_index = (hash[1] >> 16) & BITMAP_MASK;
418+
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 2 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
419+
bitmap_index = (hash[1] >> 24) & BITMAP_MASK;
414420
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 3 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
415-
bitmap_index = hash[2] & (BITMAP_SIZE_BITS - 1);
416-
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 3) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
417-
bitmap_index = (hash[2] >> 16) & (BITMAP_SIZE_BITS - 1);
421+
#endif
422+
bitmap_index = hash[0] & BITMAP_MASK;
423+
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 4 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
424+
bitmap_index = (hash[0] >> 8) & BITMAP_MASK;
418425
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 5 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
419-
bitmap_index = hash[3] & (BITMAP_SIZE_BITS - 1);
426+
bitmap_index = (hash[0] >> 16) & BITMAP_MASK;
420427
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 6 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
421-
bitmap_index = (hash[3] >> 16) & (BITMAP_SIZE_BITS - 1);
428+
bitmap_index = (hash[0] >> 24) & BITMAP_MASK;
422429
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 7 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
423-
#elif SELECT_CMP_STEPS > 2
424-
bitmap_index = hash[3] & (BITMAP_SIZE_BITS - 1);
425-
tmp &= (bitmaps[bitmap_index >> 5] >> (bitmap_index & 31)) & 1U;
426-
bitmap_index = hash[2] & (BITMAP_SIZE_BITS - 1);
427-
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
428-
bitmap_index = hash[1] & (BITMAP_SIZE_BITS - 1);
429-
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 4) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
430-
bitmap_index = hash[0] & (BITMAP_SIZE_BITS - 1);
430+
431+
#elif SELECT_CMP_STEPS == 4
432+
#if !EARLY_REJECT || PLAINTEXT_LENGTH > 27
433+
bitmap_index = hash[1] & BITMAP_MASK;
434+
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 0 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
435+
bitmap_index = (hash[1] >> 16) & BITMAP_MASK;
436+
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 1 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
437+
#endif
438+
bitmap_index = hash[0] & BITMAP_MASK;
439+
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 2 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
440+
bitmap_index = (hash[0] >> 16) & BITMAP_MASK;
431441
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 3 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
432-
#elif SELECT_CMP_STEPS > 1
433-
bitmap_index = hash[3] & (BITMAP_SIZE_BITS - 1);
434-
tmp &= (bitmaps[bitmap_index >> 5] >> (bitmap_index & 31)) & 1U;
435-
bitmap_index = hash[2] & (BITMAP_SIZE_BITS - 1);
436-
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
437-
#else
438-
bitmap_index = hash[3] & BITMAP_SIZE_BITS_LESS_ONE;
439-
tmp &= (bitmaps[bitmap_index >> 5] >> (bitmap_index & 31)) & 1U;
442+
443+
#elif SELECT_CMP_STEPS == 2
444+
#if !EARLY_REJECT || PLAINTEXT_LENGTH > 27
445+
bitmap_index = hash[1] & BITMAP_MASK;
446+
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 0 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
440447
#endif
448+
bitmap_index = hash[0] & BITMAP_MASK;
449+
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 1 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
450+
451+
#elif !EARLY_REJECT || PLAINTEXT_LENGTH > 27 /* SELECT_CMP_STEPS == 1 */
452+
bitmap_index = hash[1] & BITMAP_MASK;
453+
tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 0 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
454+
455+
#endif /* SELECT_CMP_STEPS == 8 */
441456

442457
if (tmp)
443-
cmp_final(gid, iter, hash, offset_table, hash_table, return_hashes, output, bitmap_dupe);
458+
cmp_final(gid, iter, hash, offset_table, hash_table, output, bitmap_dupe);
444459
}
445460

446461
/*
@@ -455,7 +470,6 @@ __kernel void nt(__global uint *keys,
455470
__global uint *bitmaps,
456471
__global uint *offset_table,
457472
__global uint *hash_table,
458-
__global uint *return_hashes,
459473
volatile __global uint *out_hash_ids,
460474
volatile __global uint *bitmap_dupe)
461475
{
@@ -503,8 +517,8 @@ __kernel void nt(__global uint *keys,
503517
uint lws = get_local_size(0);
504518
__local uint s_bitmaps[(BITMAP_SIZE_BITS >> 5) * SELECT_CMP_STEPS];
505519

506-
for (i = 0; i < (((BITMAP_SIZE_BITS >> 5) * SELECT_CMP_STEPS) / lws); i++)
507-
s_bitmaps[i*lws + lid] = bitmaps[i*lws + lid];
520+
for (i = lid; i < (BITMAP_SIZE_BITS >> 5) * SELECT_CMP_STEPS; i+= lws)
521+
s_bitmaps[i] = bitmaps[i];
508522

509523
barrier(CLK_LOCAL_MEM_FENCE);
510524

@@ -540,7 +554,7 @@ __kernel void nt(__global uint *keys,
540554
#endif
541555
#endif
542556
#endif
543-
nt_crypt(hash, nt_buffer, md4_size);
544-
cmp(gid, i, hash, BITMAPS, offset_table, hash_table, return_hashes, out_hash_ids, bitmap_dupe);
557+
if (nt_crypt(hash, nt_buffer, md4_size, BITMAPS))
558+
cmp(gid, i, hash, BITMAPS, offset_table, hash_table, out_hash_ids, bitmap_dupe);
545559
}
546560
}

0 commit comments

Comments
 (0)