final code comments, i think

This commit is contained in:
tromp 2016-10-27 19:51:59 -04:00
parent e3db4ca99b
commit 9179de6aef
1 changed files with 42 additions and 30 deletions

View File

@ -380,11 +380,11 @@ struct equi {
#endif
if (soli < MAXSOLS) listindices1(WK, t, sols[soli], 0);
}
// this is a differrent way to recognize most (but not all) dupes
// unlike MERGESORT it doesn't end up sorting the indices,
// but the few remaining candidates can easily
// affort to have a qsort applied to them in order to find remaining dupes
#else
// this is a different way to recognize most (but not all) dupes
// unlike MERGESORT it doesn't end up sorting the indices,
// but the few remaining candidates can easily
// afford to have a qsort applied to them in order to find remaining dupes
bool orderindices(u32 *indices, u32 size) {
if (indices[0] > indices[size]) {
for (u32 i=0; i < size; i++) {
@ -526,13 +526,13 @@ struct equi {
}
};
// this thread-local object performs in-bucket collissions
// this thread-local object performs in-bucket collisions
// by linking together slots that have identical rest bits
// (which is in essence a 2nd stage bucket sort)
struct collisiondata {
// the bitmap is an early experiment in a bitmap encoding
// that works only for at most 64 slots
// it might as well be obsoleted as it performs worse even in that case
// the bitmap is an early experiment in a bitmap encoding
// that works only for at most 64 slots
// it might as well be obsoleted as it performs worse even in that case
#ifdef XBITMAP
#if NSLOTS > 64
#error cant use XBITMAP with more than 64 slots
@ -540,10 +540,10 @@ struct equi {
u64 xhashmap[NRESTS];
u64 xmap;
#else
// This maintains NRESTS = 2^RESTBITS lists whose starting slot
// are in xhashslots[] and where subsequent slots in each list
// are found through nextxhashslot[]
// since 0 is already a valid slot number, use ~0 as nil value
// This maintains NRESTS = 2^RESTBITS lists whose starting slots
// are in xhashslots[] and where subsequent (next-lower-numbered)
// slots in each list are found through nextxhashslot[]
// since 0 is already a valid slot number, use ~0 as nil value
#if RESTBITS <= 6
typedef uchar xslot;
#else
@ -607,12 +607,12 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
htlayout htl(this, 0);
const u32 hashbytes = hashsize(0);
uchar hashes[BLAKESINPARALLEL * 64];
blake2b_state state0 = blake_ctx;
blake2b_state state0 = blake_ctx; // local copy on stack can be copied faster
for (u32 block = id; block < NBLOCKS; block += nthreads) {
#ifdef USE_AVX2
blake2bip_final(&state0, hashes, block);
#else
blake2b_state state = state0;
blake2b_state state = state0; // make another copy since blake2b_final modifies it
u32 leb = htole32(block);
blake2b_update(&state, (uchar *)&leb, sizeof(u32));
blake2b_final(&state, hashes, HASHOUT);
@ -620,25 +620,27 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
for (u32 i = 0; i<BLAKESINPARALLEL; i++) {
for (u32 j = 0; j<HASHESPERBLAKE; j++) {
const uchar *ph = hashes + i * 64 + j * WN/8;
// figure out bucket for this hash by extracting leading BUCKBITS bits
#if BUCKBITS == 12 && RESTBITS == 8
const u32 bucketid = ((u32)ph[0] << 4) | ph[1] >> 4;
#elif BUCKBITS == 16 && RESTBITS == 4
const u32 bucketid = ((u32)ph[0] << 8) | ph[1];
#elif BUCKBITS == 20 && RESTBITS == 4
const u32 bucketid = ((((u32)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4;
#elif BUCKBITS == 12 && RESTBITS == 4
const u32 bucketid = ((u32)ph[0] << 4) | ph[1] >> 4;
const u32 xhash = ph[1] & 0xf;
#else
#error not implemented
#endif
// grab next available slot in that bucket
const u32 slot = getslot0(bucketid);
if (slot >= NSLOTS) {
bfull++;
bfull++; // this actually never seems to happen in round 0 due to uniformity
continue;
}
// location for slot's tag
htunit *s = hta.heap0[bucketid][slot] + htl.nexthtunits;
// hash should end right before tag
memcpy(s->bytes-hashbytes, ph+WN/8-hashbytes, hashbytes);
// round 0 tags store hash-generating index
s->tag = tree((block * BLAKESINPARALLEL + i) * HASHESPERBLAKE + j);
}
}
@ -648,21 +650,22 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
void digitodd(const u32 r, const u32 id) {
htlayout htl(this, r);
collisiondata cd;
// threads process buckets in round-robin fashion
for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += nthreads) {
cd.clear();
slot0 *buck = htl.hta.heap0[bucketid];
u32 bsize = getnslots0(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
cd.clear(); // could have made this the constructor, and declare here
slot0 *buck = htl.hta.heap0[bucketid]; // point to first slot of this bucket
u32 bsize = getnslots0(bucketid); // grab and reset bucket size
for (u32 s1 = 0; s1 < bsize; s1++) { // loop over slots
const htunit *slot1 = buck[s1];
cd.addslot(s1, htl.getxhash0(slot1));
cd.addslot(s1, htl.getxhash0(slot1));// identify list of previous colliding slots
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
if (htl.equal(slot0, slot1)) {
hfull++;
if (htl.equal(slot0, slot1)) { // expect difference in last 32 bits unless duped
hfull++; // record discarding
continue;
}
u32 xorbucketid;
u32 xorbucketid; // determine bucket for s0 xor s1
const uchar *bytes0 = slot0->bytes, *bytes1 = slot1->bytes;
#if WN == 200 && BUCKBITS == 12 && RESTBITS == 8
xorbucketid = (((u32)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) & 0xf) << 8)
@ -677,14 +680,18 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
#else
#error not implemented
#endif
// grab next available slot in that bucket
const u32 xorslot = getslot1(xorbucketid);
if (xorslot >= NSLOTS) {
bfull++;
bfull++; // SAVEMEM determines how often this happens
continue;
}
// start of slot for s0 ^ s1
htunit *xs = htl.hta.heap1[xorbucketid][xorslot];
// store xor of hashes possibly minus initial 0 word due to collision
for (u32 i=htl.dunits; i < htl.prevhtunits; i++)
xs++->word = slot0[i].word ^ slot1[i].word;
// store tree node right after hash
xs->tag = tree(bucketid, s0, s1);
}
}
@ -737,6 +744,9 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
}
}
// functions digit1 through digit9 are unrolled versions specific to the
// (N=200,K=9) parameters with 8 RESTBITS
// and will be used with compile option -DUNROLL
void digit1(const u32 id) {
htalloc heaps = hta;
collisiondata cd;
@ -997,6 +1007,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
}
}
// final round looks simpler
void digitK(const u32 id) {
collisiondata cd;
htlayout htl(this, WK);
@ -1010,14 +1021,14 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
cd.addslot(s1, htl.getxhash0(slot1)); // assume WK odd
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
if (htl.equal(buck[s0], slot1)) { // EASY OPTIMIZE
candidate(tree(bucketid, s0, s1));
if (htl.equal(buck[s0], slot1)) { // there is only 1 word of hash left
candidate(tree(bucketid, s0, s1)); // so a match gives a solution candidate
nc++;
}
}
}
}
// printf(" %d candidates ", nc);
// printf(" %d candidates ", nc); // this gets uncommented a lot for debugging
}
};
@ -1035,6 +1046,7 @@ void barrier(pthread_barrier_t *barry) {
}
}
// do all rounds for each thread
void *worker(void *vp) {
thread_ctx *tp = (thread_ctx *)vp;
equi *eq = tp->eq;