optimize dupe test

This commit is contained in:
John Tromp 2016-11-10 21:34:16 -05:00
parent 024aff4935
commit 13770ee55c
5 changed files with 58 additions and 79 deletions

View File

@ -65,7 +65,7 @@ Performance summary (on 4GHz i7-4790K and NVidia GTX980):
- 8 x eqavx21: 20.3 Sol/s - 8 x eqavx21: 20.3 Sol/s
- 8 x dev1: 20.6 Sol/s - 8 x dev1: 20.6 Sol/s
- eqcuda: 23.6 Sol/s - eqcuda: 24.7 Sol/s
And now, for something completely different: (144,5) taking 2.6 GB of memory And now, for something completely different: (144,5) taking 2.6 GB of memory

View File

@ -303,7 +303,7 @@ ALIGN(64) static const uint32_t indices[12][16] = {
} \ } \
} while(0) } while(0)
void blake2bip_final(const blake2b_state *S, uchar *out, u32 blockidx) { void blake2bx4_final(const blake2b_state *S, uchar *out, u32 blockidx) {
__m256i v[16], s[8], iv[8], w[16], counter, flag; __m256i v[16], s[8], iv[8], w[16], counter, flag;
uint32_t b, i, r; uint32_t b, i, r;

View File

@ -6,6 +6,7 @@
typedef uint32_t u32; typedef uint32_t u32;
typedef unsigned char uchar; typedef unsigned char uchar;
void blake2bip_final(const blake2b_state *midstate, uchar *hashout, u32 blockidx); void blake2bx4_final(const blake2b_state *midstate, uchar *hashout, u32 blockidx);
void blake2bx8_final(const blake2b_state *midstate, uchar *hashout, u32 blockidx);
#endif #endif

View File

@ -194,7 +194,7 @@ struct equi {
nslot = 0; nslot = 0;
return n; return n;
} }
__device__ void orderindices(u32 *indices, u32 size) { __device__ bool orderindices(u32 *indices, u32 size) {
if (indices[0] > indices[size]) { if (indices[0] > indices[size]) {
for (u32 i=0; i < size; i++) { for (u32 i=0; i < size; i++) {
const u32 tmp = indices[i]; const u32 tmp = indices[i];
@ -202,84 +202,84 @@ struct equi {
indices[size+i] = tmp; indices[size+i] = tmp;
} }
} }
return false;
} }
__device__ void listindices1(const tree t, u32 *indices) { __device__ bool listindices1(const tree t, u32 *indices) {
const bucket0 &buck = hta.trees0[0][t.bucketid()]; const bucket0 &buck = hta.trees0[0][t.bucketid()];
const u32 size = 1 << 0; const u32 size = 1 << 0;
indices[0] = buck[t.slotid0()].attr.getindex(); indices[0] = buck[t.slotid0()].attr.getindex();
indices[size] = buck[t.slotid1()].attr.getindex(); indices[size] = buck[t.slotid1()].attr.getindex();
orderindices(indices, size); orderindices(indices, size);
return false;
} }
__device__ void listindices2(const tree t, u32 *indices) { __device__ bool listindices2(const tree t, u32 *indices) {
const bucket1 &buck = hta.trees1[0][t.bucketid()]; const bucket1 &buck = hta.trees1[0][t.bucketid()];
const u32 size = 1 << 1; const u32 size = 1 << 1;
listindices1(buck[t.slotid0()].attr, indices); return listindices1(buck[t.slotid0()].attr, indices) ||
listindices1(buck[t.slotid1()].attr, indices+size); listindices1(buck[t.slotid1()].attr, indices+size) ||
orderindices(indices, size); orderindices(indices, size) || indices[0] == indices[size];
} }
__device__ void listindices3(const tree t, u32 *indices) { __device__ bool listindices3(const tree t, u32 *indices) {
const bucket0 &buck = hta.trees0[1][t.bucketid()]; const bucket0 &buck = hta.trees0[1][t.bucketid()];
const u32 size = 1 << 2; const u32 size = 1 << 2;
listindices2(buck[t.slotid0()].attr, indices); return listindices2(buck[t.slotid0()].attr, indices) ||
listindices2(buck[t.slotid1()].attr, indices+size); listindices2(buck[t.slotid1()].attr, indices+size) ||
orderindices(indices, size); orderindices(indices, size) || indices[0] == indices[size];
} }
__device__ void listindices4(const tree t, u32 *indices) { __device__ bool listindices4(const tree t, u32 *indices) {
const bucket1 &buck = hta.trees1[1][t.bucketid()]; const bucket1 &buck = hta.trees1[1][t.bucketid()];
const u32 size = 1 << 3; const u32 size = 1 << 3;
listindices3(buck[t.slotid0()].attr, indices); return listindices3(buck[t.slotid0()].attr, indices) ||
listindices3(buck[t.slotid1()].attr, indices+size); listindices3(buck[t.slotid1()].attr, indices+size) ||
orderindices(indices, size); orderindices(indices, size) || indices[0] == indices[size];
} }
__device__ void listindices5(const tree t, u32 *indices) { __device__ bool listindices5(const tree t, u32 *indices) {
const bucket0 &buck = hta.trees0[2][t.bucketid()]; const bucket0 &buck = hta.trees0[2][t.bucketid()];
const u32 size = 1 << 4; const u32 size = 1 << 4;
listindices4(buck[t.slotid0()].attr, indices); return listindices4(buck[t.slotid0()].attr, indices) ||
listindices4(buck[t.slotid1()].attr, indices+size); listindices4(buck[t.slotid1()].attr, indices+size) ||
orderindices(indices, size); orderindices(indices, size) || indices[0] == indices[size];
} }
#if WK == 9 #if WK == 9
__device__ void listindices6(const tree t, u32 *indices) { __device__ bool listindices6(const tree t, u32 *indices) {
const bucket1 &buck = hta.trees1[2][t.bucketid()]; const bucket1 &buck = hta.trees1[2][t.bucketid()];
const u32 size = 1 << 5; const u32 size = 1 << 5;
listindices5(buck[t.slotid0()].attr, indices); return listindices5(buck[t.slotid0()].attr, indices) ||
listindices5(buck[t.slotid1()].attr, indices+size); listindices5(buck[t.slotid1()].attr, indices+size) ||
orderindices(indices, size); orderindices(indices, size) || indices[0] == indices[size];
} }
__device__ void listindices7(const tree t, u32 *indices) { __device__ bool listindices7(const tree t, u32 *indices) {
const bucket0 &buck = hta.trees0[3][t.bucketid()]; const bucket0 &buck = hta.trees0[3][t.bucketid()];
const u32 size = 1 << 6; const u32 size = 1 << 6;
listindices6(buck[t.slotid0()].attr, indices); return listindices6(buck[t.slotid0()].attr, indices) ||
listindices6(buck[t.slotid1()].attr, indices+size); listindices6(buck[t.slotid1()].attr, indices+size) ||
orderindices(indices, size); orderindices(indices, size) || indices[0] == indices[size];
} }
__device__ void listindices8(const tree t, u32 *indices) { __device__ bool listindices8(const tree t, u32 *indices) {
const bucket1 &buck = hta.trees1[3][t.bucketid()]; const bucket1 &buck = hta.trees1[3][t.bucketid()];
const u32 size = 1 << 7; const u32 size = 1 << 7;
listindices7(buck[t.slotid0()].attr, indices); return listindices7(buck[t.slotid0()].attr, indices) ||
listindices7(buck[t.slotid1()].attr, indices+size); listindices7(buck[t.slotid1()].attr, indices+size) ||
orderindices(indices, size); orderindices(indices, size) || indices[0] == indices[size];
} }
__device__ void listindices9(const tree t, u32 *indices) { __device__ bool listindices9(const tree t, u32 *indices) {
const bucket0 &buck = hta.trees0[4][t.bucketid()]; const bucket0 &buck = hta.trees0[4][t.bucketid()];
const u32 size = 1 << 8; const u32 size = 1 << 8;
listindices8(buck[t.slotid0()].attr, indices); return listindices8(buck[t.slotid0()].attr, indices) ||
listindices8(buck[t.slotid1()].attr, indices+size); listindices8(buck[t.slotid1()].attr, indices+size) ||
orderindices(indices, size); orderindices(indices, size) || indices[0] == indices[size];
} }
#endif #endif
__device__ void candidate(const tree t) { __device__ void candidate(const tree t) {
proof prf; proof prf;
#if WK==9 #if WK==9
listindices9(t, prf); if (listindices9(t, prf)) return;
#elif WK==5 #elif WK==5
listindices5(t, prf); if (listindices5(t, prf)) return;
#else #else
#error not implemented #error not implemented
#endif #endif
if (probdupe(prf))
return;
u32 soli = atomicAdd(&nsols, 1); u32 soli = atomicAdd(&nsols, 1);
if (soli < MAXSOLS) if (soli < MAXSOLS)
#if WK==9 #if WK==9
@ -317,19 +317,6 @@ struct equi {
printf("\n"); printf("\n");
#endif #endif
} }
// proper dupe test is a little costly on GPU, so allow false negatives
__device__ bool probdupe(u32 *prf) {
unsigned short susp[PROOFSIZE];
memset(susp, 0xffff, PROOFSIZE * sizeof(unsigned short));
for (u32 i=0; i<PROOFSIZE; i++) {
u32 bin = prf[i] & (PROOFSIZE-1);
unsigned short msb = prf[i]>>WK;
if (msb == susp[bin])
return true;
susp[bin] = msb;
}
return false;
}
struct htlayout { struct htlayout {
htalloc hta; htalloc hta;
u32 prevhashunits; u32 prevhashunits;
@ -998,6 +985,7 @@ int main(int argc, char **argv) {
cudaEventElapsedTime(&duration, start, stop); cudaEventElapsedTime(&duration, start, stop);
printf("%d rounds completed in %.3f seconds.\n", WK, duration / 1000.0f); printf("%d rounds completed in %.3f seconds.\n", WK, duration / 1000.0f);
printf("%d candidate solutions\n", eq.nsols);
u32 s, nsols, maxsols = min(MAXSOLS, eq.nsols); u32 s, nsols, maxsols = min(MAXSOLS, eq.nsols);
for (s = nsols = 0; s < maxsols; s++) { for (s = nsols = 0; s < maxsols; s++) {
if (duped(sols[s])) { if (duped(sols[s])) {

View File

@ -396,52 +396,42 @@ struct equi {
} }
// if dupes != 0, list indices in arbitrary order and return true if dupe found // if dupes != 0, list indices in arbitrary order and return true if dupe found
// if dupes == 0, order indices as in Wagner condition // if dupes == 0, order indices as in Wagner condition
bool listindices0(u32 r, const tree t, u32 *indices, u32 *dupes) { bool listindices0(u32 r, const tree t, u32 *indices) {
if (r == 0) { if (r == 0) {
u32 idx = t.getindex(); u32 idx = t.getindex();
if (dupes) {
// recognize most dupes by storing last seen index
// with same K least significant bits in array dupes
u32 bin = idx & (PROOFSIZE-1);
if (idx == dupes[bin]) return true;
dupes[bin] = idx;
}
*indices = idx; *indices = idx;
return false; return false;
} }
const slot1 *buck = hta.heap1[t.bucketid()]; const slot1 *buck = hta.heap1[t.bucketid()];
const u32 size = 1 << --r; const u32 size = 1 << --r;
u32 tagi = hashwords(hashsize(r)); u32 tagi = hashwords(hashsize(r));
return listindices1(r, buck[t.slotid0()][tagi].tag, indices, dupes) return listindices1(r, buck[t.slotid0()][tagi].tag, indices)
|| listindices1(r, buck[t.slotid1()][tagi].tag, indices+size, dupes) || listindices1(r, buck[t.slotid1()][tagi].tag, indices+size)
|| (!dupes && orderindices(indices, size)); || orderindices(indices, size) || indices[0] == indices[size];
} }
// need separate instance for accessing (differently typed) heap1 // need separate instance for accessing (differently typed) heap1
bool listindices1(u32 r, const tree t, u32 *indices, u32 *dupes) { bool listindices1(u32 r, const tree t, u32 *indices) {
const slot0 *buck = hta.heap0[t.bucketid()]; const slot0 *buck = hta.heap0[t.bucketid()];
const u32 size = 1 << --r; const u32 size = 1 << --r;
u32 tagi = hashwords(hashsize(r)); u32 tagi = hashwords(hashsize(r));
return listindices0(r, buck[t.slotid0()][tagi].tag, indices, dupes) return listindices0(r, buck[t.slotid0()][tagi].tag, indices)
|| listindices0(r, buck[t.slotid1()][tagi].tag, indices+size, dupes) || listindices0(r, buck[t.slotid1()][tagi].tag, indices+size)
|| (!dupes && orderindices(indices, size)); || orderindices(indices, size) || indices[0] == indices[size];
} }
// check a candidate that resulted in 0 xor // check a candidate that resulted in 0 xor
// add as solution, with proper subtree ordering, if it has unique indices // add as solution, with proper subtree ordering, if it has unique indices
void candidate(const tree t) { void candidate(const tree t) {
proof prf, dupes; proof prf;
memset(dupes, 0xffff, sizeof(proof)); // listindices combines index tree reconstruction with probably dupe test
if (listindices1(WK, t, prf, dupes)) return; // assume WK odd if (listindices1(WK, t, prf) || duped(prf)) return; // assume WK odd
// it survived the probable dupe test, now check fully // and now we have ourselves a genuine solution
qsort(prf, PROOFSIZE, sizeof(u32), &compu32);
for (u32 i=1; i<PROOFSIZE; i++) if (prf[i] <= prf[i-1]) return;
// and now we have ourselves a genuine solution, not yet properly ordered
#ifdef ATOMIC #ifdef ATOMIC
u32 soli = std::atomic_fetch_add_explicit(&nsols, 1U, std::memory_order_relaxed); u32 soli = std::atomic_fetch_add_explicit(&nsols, 1U, std::memory_order_relaxed);
#else #else
u32 soli = nsols++; u32 soli = nsols++;
#endif #endif
// retrieve solution indices in correct order // copy solution into final place
if (soli < MAXSOLS) listindices1(WK, t, sols[soli], 0); // assume WK odd if (soli < MAXSOLS) memcpy(sols[soli], prf, sizeof(proof));
} }
#endif #endif
// show bucket stats and, if desired, size distribution // show bucket stats and, if desired, size distribution
@ -609,7 +599,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
blake2b_state state0 = blake_ctx; // local copy on stack can be copied faster blake2b_state state0 = blake_ctx; // local copy on stack can be copied faster
for (u32 block = id; block < NBLOCKS; block += nthreads) { for (u32 block = id; block < NBLOCKS; block += nthreads) {
#ifdef USE_AVX2 #ifdef USE_AVX2
blake2bip_final(&state0, hashes, block); blake2bx4_final(&state0, hashes, block);
#else #else
blake2b_state state = state0; // make another copy since blake2b_final modifies it blake2b_state state = state0; // make another copy since blake2b_final modifies it
u32 leb = htole32(block); u32 leb = htole32(block);