diff --git a/README.md b/README.md index e56b79a..a3d76b8 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ Performance summary (on 4GHz i7-4790K and NVidia GTX980): - 8 x eqavx21: 20.3 Sol/s - 8 x dev1: 20.6 Sol/s -- eqcuda: 23.6 Sol/s +- eqcuda: 24.7 Sol/s And now, for something completely different: (144,5) taking 2.6 GB of memory diff --git a/blake2-avx2/blake2bip.c b/blake2-avx2/blake2bip.c index 2abb7ea..e8288de 100644 --- a/blake2-avx2/blake2bip.c +++ b/blake2-avx2/blake2bip.c @@ -303,7 +303,7 @@ ALIGN(64) static const uint32_t indices[12][16] = { } \ } while(0) -void blake2bip_final(const blake2b_state *S, uchar *out, u32 blockidx) { +void blake2bx4_final(const blake2b_state *S, uchar *out, u32 blockidx) { __m256i v[16], s[8], iv[8], w[16], counter, flag; uint32_t b, i, r; diff --git a/blake2-avx2/blake2bip.h b/blake2-avx2/blake2bip.h index 11799c6..a046a5e 100644 --- a/blake2-avx2/blake2bip.h +++ b/blake2-avx2/blake2bip.h @@ -6,6 +6,7 @@ typedef uint32_t u32; typedef unsigned char uchar; -void blake2bip_final(const blake2b_state *midstate, uchar *hashout, u32 blockidx); +void blake2bx4_final(const blake2b_state *midstate, uchar *hashout, u32 blockidx); +void blake2bx8_final(const blake2b_state *midstate, uchar *hashout, u32 blockidx); #endif diff --git a/equi_miner.cu b/equi_miner.cu index 96b9d26..60ed138 100644 --- a/equi_miner.cu +++ b/equi_miner.cu @@ -194,7 +194,7 @@ struct equi { nslot = 0; return n; } - __device__ void orderindices(u32 *indices, u32 size) { + __device__ bool orderindices(u32 *indices, u32 size) { if (indices[0] > indices[size]) { for (u32 i=0; i < size; i++) { const u32 tmp = indices[i]; @@ -202,84 +202,84 @@ struct equi { indices[size+i] = tmp; } } + return false; } - __device__ void listindices1(const tree t, u32 *indices) { + __device__ bool listindices1(const tree t, u32 *indices) { const bucket0 &buck = hta.trees0[0][t.bucketid()]; const u32 size = 1 << 0; indices[0] = buck[t.slotid0()].attr.getindex(); indices[size] = buck[t.slotid1()].attr.getindex(); orderindices(indices, size); + return false; } - __device__ void listindices2(const tree t, u32 *indices) { + __device__ bool listindices2(const tree t, u32 *indices) { const bucket1 &buck = hta.trees1[0][t.bucketid()]; const u32 size = 1 << 1; - listindices1(buck[t.slotid0()].attr, indices); - listindices1(buck[t.slotid1()].attr, indices+size); - orderindices(indices, size); + return listindices1(buck[t.slotid0()].attr, indices) || + listindices1(buck[t.slotid1()].attr, indices+size) || + orderindices(indices, size) || indices[0] == indices[size]; } - __device__ void listindices3(const tree t, u32 *indices) { + __device__ bool listindices3(const tree t, u32 *indices) { const bucket0 &buck = hta.trees0[1][t.bucketid()]; const u32 size = 1 << 2; - listindices2(buck[t.slotid0()].attr, indices); - listindices2(buck[t.slotid1()].attr, indices+size); - orderindices(indices, size); + return listindices2(buck[t.slotid0()].attr, indices) || + listindices2(buck[t.slotid1()].attr, indices+size) || + orderindices(indices, size) || indices[0] == indices[size]; } - __device__ void listindices4(const tree t, u32 *indices) { + __device__ bool listindices4(const tree t, u32 *indices) { const bucket1 &buck = hta.trees1[1][t.bucketid()]; const u32 size = 1 << 3; - listindices3(buck[t.slotid0()].attr, indices); - listindices3(buck[t.slotid1()].attr, indices+size); - orderindices(indices, size); + return listindices3(buck[t.slotid0()].attr, indices) || + listindices3(buck[t.slotid1()].attr, indices+size) || + orderindices(indices, size) || indices[0] == indices[size]; } - __device__ void listindices5(const tree t, u32 *indices) { + __device__ bool listindices5(const tree t, u32 *indices) { const bucket0 &buck = hta.trees0[2][t.bucketid()]; const u32 size = 1 << 4; - listindices4(buck[t.slotid0()].attr, indices); - listindices4(buck[t.slotid1()].attr, indices+size); - orderindices(indices, size); + return listindices4(buck[t.slotid0()].attr, indices) || + listindices4(buck[t.slotid1()].attr, indices+size) || + orderindices(indices, size) || indices[0] == indices[size]; } #if WK == 9 - __device__ void listindices6(const tree t, u32 *indices) { + __device__ bool listindices6(const tree t, u32 *indices) { const bucket1 &buck = hta.trees1[2][t.bucketid()]; const u32 size = 1 << 5; - listindices5(buck[t.slotid0()].attr, indices); - listindices5(buck[t.slotid1()].attr, indices+size); - orderindices(indices, size); + return listindices5(buck[t.slotid0()].attr, indices) || + listindices5(buck[t.slotid1()].attr, indices+size) || + orderindices(indices, size) || indices[0] == indices[size]; } - __device__ void listindices7(const tree t, u32 *indices) { + __device__ bool listindices7(const tree t, u32 *indices) { const bucket0 &buck = hta.trees0[3][t.bucketid()]; const u32 size = 1 << 6; - listindices6(buck[t.slotid0()].attr, indices); - listindices6(buck[t.slotid1()].attr, indices+size); - orderindices(indices, size); + return listindices6(buck[t.slotid0()].attr, indices) || + listindices6(buck[t.slotid1()].attr, indices+size) || + orderindices(indices, size) || indices[0] == indices[size]; } - __device__ void listindices8(const tree t, u32 *indices) { + __device__ bool listindices8(const tree t, u32 *indices) { const bucket1 &buck = hta.trees1[3][t.bucketid()]; const u32 size = 1 << 7; - listindices7(buck[t.slotid0()].attr, indices); - listindices7(buck[t.slotid1()].attr, indices+size); - orderindices(indices, size); + return listindices7(buck[t.slotid0()].attr, indices) || + listindices7(buck[t.slotid1()].attr, indices+size) || + orderindices(indices, size) || indices[0] == indices[size]; } - __device__ void listindices9(const tree t, u32 *indices) { + __device__ bool listindices9(const tree t, u32 *indices) { const bucket0 &buck = hta.trees0[4][t.bucketid()]; const u32 size = 1 << 8; - listindices8(buck[t.slotid0()].attr, indices); - listindices8(buck[t.slotid1()].attr, indices+size); - orderindices(indices, size); + return listindices8(buck[t.slotid0()].attr, indices) || + listindices8(buck[t.slotid1()].attr, indices+size) || + orderindices(indices, size) || indices[0] == indices[size]; } #endif __device__ void candidate(const tree t) { proof prf; #if WK==9 - listindices9(t, prf); + if (listindices9(t, prf)) return; #elif WK==5 - listindices5(t, prf); + if (listindices5(t, prf)) return; #else #error not implemented #endif - if (probdupe(prf)) - return; u32 soli = atomicAdd(&nsols, 1); if (soli < MAXSOLS) #if WK==9 @@ -317,19 +317,6 @@ struct equi { printf("\n"); #endif } - // proper dupe test is a little costly on GPU, so allow false negatives - __device__ bool probdupe(u32 *prf) { - unsigned short susp[PROOFSIZE]; - memset(susp, 0xffff, PROOFSIZE * sizeof(unsigned short)); - for (u32 i=0; i>WK; - if (msb == susp[bin]) - return true; - susp[bin] = msb; - } - return false; - } struct htlayout { htalloc hta; u32 prevhashunits; @@ -998,6 +985,7 @@ int main(int argc, char **argv) { cudaEventElapsedTime(&duration, start, stop); printf("%d rounds completed in %.3f seconds.\n", WK, duration / 1000.0f); + printf("%d candidate solutions\n", eq.nsols); u32 s, nsols, maxsols = min(MAXSOLS, eq.nsols); for (s = nsols = 0; s < maxsols; s++) { if (duped(sols[s])) { diff --git a/equi_miner.h b/equi_miner.h index a11c39f..21eda46 100644 --- a/equi_miner.h +++ b/equi_miner.h @@ -396,52 +396,42 @@ struct equi { } // if dupes != 0, list indices in arbitrary order and return true if dupe found // if dupes == 0, order indices as in Wagner condition - bool listindices0(u32 r, const tree t, u32 *indices, u32 *dupes) { + bool listindices0(u32 r, const tree t, u32 *indices) { if (r == 0) { u32 idx = t.getindex(); - if (dupes) { - // recognize most dupes by storing last seen index - // with same K least significant bits in array dupes - u32 bin = idx & (PROOFSIZE-1); - if (idx == dupes[bin]) return true; - dupes[bin] = idx; - } *indices = idx; return false; } const slot1 *buck = hta.heap1[t.bucketid()]; const u32 size = 1 << --r; u32 tagi = hashwords(hashsize(r)); - return listindices1(r, buck[t.slotid0()][tagi].tag, indices, dupes) - || listindices1(r, buck[t.slotid1()][tagi].tag, indices+size, dupes) - || (!dupes && orderindices(indices, size)); + return listindices1(r, buck[t.slotid0()][tagi].tag, indices) + || listindices1(r, buck[t.slotid1()][tagi].tag, indices+size) + || orderindices(indices, size) || indices[0] == indices[size]; } // need separate instance for accessing (differently typed) heap1 - bool listindices1(u32 r, const tree t, u32 *indices, u32 *dupes) { + bool listindices1(u32 r, const tree t, u32 *indices) { const slot0 *buck = hta.heap0[t.bucketid()]; const u32 size = 1 << --r; u32 tagi = hashwords(hashsize(r)); - return listindices0(r, buck[t.slotid0()][tagi].tag, indices, dupes) - || listindices0(r, buck[t.slotid1()][tagi].tag, indices+size, dupes) - || (!dupes && orderindices(indices, size)); + return listindices0(r, buck[t.slotid0()][tagi].tag, indices) + || listindices0(r, buck[t.slotid1()][tagi].tag, indices+size) + || orderindices(indices, size) || indices[0] == indices[size]; } // check a candidate that resulted in 0 xor // add as solution, with proper subtree ordering, if it has unique indices void candidate(const tree t) { - proof prf, dupes; - memset(dupes, 0xffff, sizeof(proof)); - if (listindices1(WK, t, prf, dupes)) return; // assume WK odd - // it survived the probable dupe test, now check fully - qsort(prf, PROOFSIZE, sizeof(u32), &compu32); - for (u32 i=1; i