diff --git a/equi_miner.h b/equi_miner.h index 161ef61..da4b7dd 100644 --- a/equi_miner.h +++ b/equi_miner.h @@ -380,11 +380,11 @@ struct equi { #endif if (soli < MAXSOLS) listindices1(WK, t, sols[soli], 0); } -// this is a differrent way to recognize most (but not all) dupes -// unlike MERGESORT it doesn't end up sorting the indices, -// but the few remaining candidates can easily -// affort to have a qsort applied to them in order to find remaining dupes #else + // this is a differrent way to recognize most (but not all) dupes + // unlike MERGESORT it doesn't end up sorting the indices, + // but the few remaining candidates can easily + // affort to have a qsort applied to them in order to find remaining dupes bool orderindices(u32 *indices, u32 size) { if (indices[0] > indices[size]) { for (u32 i=0; i < size; i++) { @@ -526,13 +526,13 @@ struct equi { } }; - // this thread-local object performs in-bucket collissions + // this thread-local object performs in-bucket collisions // by linking together slots that have identical rest bits // (which is in essense a 2nd stage bucket sort) struct collisiondata { -// the bitmap is an early experiment in a bitmap encoding -// that works only for at most 64 slots -// it might as well be obsoleted as it performs worse even in that case + // the bitmap is an early experiment in a bitmap encoding + // that works only for at most 64 slots + // it might as well be obsoleted as it performs worse even in that case #ifdef XBITMAP #if NSLOTS > 64 #error cant use XBITMAP with more than 64 slots @@ -540,10 +540,10 @@ struct equi { u64 xhashmap[NRESTS]; u64 xmap; #else -// This maintains NRESTS = 2^RESTBITS lists whose starting slot -// are in xhashslots[] and where subsequent slots in each list -// are found through nextxhashslot[] -// since 0 is already a valid slot number, use ~0 as nil value + // This maintains NRESTS = 2^RESTBITS lists whose starting slot + // are in xhashslots[] and where subsequent (next-lower-numbered) + // slots in each list are found through nextxhashslot[] + // since 0 is already a valid slot number, use ~0 as nil value #if RESTBITS <= 6 typedef uchar xslot; #else @@ -607,12 +607,12 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; htlayout htl(this, 0); const u32 hashbytes = hashsize(0); uchar hashes[BLAKESINPARALLEL * 64]; - blake2b_state state0 = blake_ctx; + blake2b_state state0 = blake_ctx; // local copy on stack can be copied faster for (u32 block = id; block < NBLOCKS; block += nthreads) { #ifdef USE_AVX2 blake2bip_final(&state0, hashes, block); #else - blake2b_state state = state0; + blake2b_state state = state0; // make another copy since blake2b_final modifies it u32 leb = htole32(block); blake2b_update(&state, (uchar *)&leb, sizeof(u32)); blake2b_final(&state, hashes, HASHOUT); @@ -620,25 +620,27 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; for (u32 i = 0; i> 4; #elif BUCKBITS == 16 && RESTBITS == 4 const u32 bucketid = ((u32)ph[0] << 8) | ph[1]; #elif BUCKBITS == 20 && RESTBITS == 4 const u32 bucketid = ((((u32)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4; -#elif BUCKBITS == 12 && RESTBITS == 4 - const u32 bucketid = ((u32)ph[0] << 4) | ph[1] >> 4; - const u32 xhash = ph[1] & 0xf; #else #error not implemented #endif + // grab next available slot in that bucket const u32 slot = getslot0(bucketid); if (slot >= NSLOTS) { - bfull++; + bfull++; // this actually never seems to happen in round 0 due to uniformity continue; } + // location for slot's tag htunit *s = hta.heap0[bucketid][slot] + htl.nexthtunits; + // hash should end right before tag memcpy(s->bytes-hashbytes, ph+WN/8-hashbytes, hashbytes); + // round 0 tags store hash-generating index s->tag = tree((block * BLAKESINPARALLEL + i) * HASHESPERBLAKE + j); } } @@ -648,21 +650,22 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; void digitodd(const u32 r, const u32 id) { htlayout htl(this, r); collisiondata cd; + // threads process buckets in round-robin fashion for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += nthreads) { - cd.clear(); - slot0 *buck = htl.hta.heap0[bucketid]; - u32 bsize = getnslots0(bucketid); - for (u32 s1 = 0; s1 < bsize; s1++) { + cd.clear(); // could have made this the constructor, and declare here + slot0 *buck = htl.hta.heap0[bucketid]; // point to first slot of this bucket + u32 bsize = getnslots0(bucketid); // grab and reset bucket size + for (u32 s1 = 0; s1 < bsize; s1++) { // loop over slots const htunit *slot1 = buck[s1]; - cd.addslot(s1, htl.getxhash0(slot1)); + cd.addslot(s1, htl.getxhash0(slot1));// identify list of previous colliding slots for (; cd.nextcollision(); ) { const u32 s0 = cd.slot(); const htunit *slot0 = buck[s0]; - if (htl.equal(slot0, slot1)) { - hfull++; + if (htl.equal(slot0, slot1)) { // expect difference in last 32 bits unless duped + hfull++; // record discarding continue; } - u32 xorbucketid; + u32 xorbucketid; // determine bucket for s0 xor s1 const uchar *bytes0 = slot0->bytes, *bytes1 = slot1->bytes; #if WN == 200 && BUCKBITS == 12 && RESTBITS == 8 xorbucketid = (((u32)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) & 0xf) << 8) @@ -677,14 +680,18 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; #else #error not implemented #endif + // grab next available slot in that bucket const u32 xorslot = getslot1(xorbucketid); if (xorslot >= NSLOTS) { - bfull++; + bfull++; // SAVEMEM determines how often this happens continue; } + // start of slot for s0 ^ s1 htunit *xs = htl.hta.heap1[xorbucketid][xorslot]; + // store xor of hashes possibly minus initial 0 word due to collision for (u32 i=htl.dunits; i < htl.prevhtunits; i++) xs++->word = slot0[i].word ^ slot1[i].word; + // store tree node right after hash xs->tag = tree(bucketid, s0, s1); } } @@ -737,6 +744,9 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; } } + // functions digit1 through digit9 are unrolled versions specific to the + // (N=200,K=9) parameters with 8 RESTBITS + // and will be used with compile option -DUNROLL void digit1(const u32 id) { htalloc heaps = hta; collisiondata cd; @@ -997,6 +1007,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; } } + // final round looks simpler void digitK(const u32 id) { collisiondata cd; htlayout htl(this, WK); @@ -1010,14 +1021,14 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK; cd.addslot(s1, htl.getxhash0(slot1)); // assume WK odd for (; cd.nextcollision(); ) { const u32 s0 = cd.slot(); - if (htl.equal(buck[s0], slot1)) { // EASY OPTIMIZE - candidate(tree(bucketid, s0, s1)); + if (htl.equal(buck[s0], slot1)) { // there is only 1 word of hash left + candidate(tree(bucketid, s0, s1)); // so a match gives a solution candidate nc++; } } } } - // printf(" %d candidates ", nc); + // printf(" %d candidates ", nc); // this gets uncommented a lot for debugging } }; @@ -1035,6 +1046,7 @@ void barrier(pthread_barrier_t *barry) { } } +// do all rounds for each thread void *worker(void *vp) { thread_ctx *tp = (thread_ctx *)vp; equi *eq = tp->eq;