more code comments

This commit is contained in:
tromp 2016-10-27 19:04:18 -04:00
parent de8dbb92ab
commit e3db4ca99b
1 changed files with 62 additions and 33 deletions

View File

@ -82,24 +82,15 @@ typedef u32 au32;
// number of buckets
static const u32 NBUCKETS = 1<<BUCKBITS;
// corresponding bucket mask
static const u32 BUCKMASK = NBUCKETS-1;
// 2_log of number of slots per bucket
static const u32 SLOTBITS = RESTBITS+1+1;
// default bucket capacity
static const u32 SLOTRANGE = 1<<SLOTBITS;
// corresponding SLOTBITS mask
static const u32 SLOTMASK = SLOTRANGE-1;
// most significant bit in SLOTMASK
static const u32 SLOTMSB = 1<<(SLOTBITS-1);
// number of slots per bucket
static const u32 NSLOTS = SLOTRANGE * SAVEMEM;
// number of possible values of RESTBITS bits
static const u32 NRESTS = 1<<RESTBITS;
// more than 8 solutions are rare (less than one in 100000 runs)
static const u32 MAXSOLS = 8;
// bucket and slot geometry, all derived from BUCKBITS, RESTBITS and SAVEMEM
static const u32 NBUCKETS = 1<<BUCKBITS; // number of buckets
static const u32 BUCKMASK = NBUCKETS-1; // corresponding bucket mask
static const u32 SLOTBITS = RESTBITS+1+1; // 2_log of number of slots per bucket
static const u32 SLOTRANGE = 1<<SLOTBITS; // default bucket capacity
static const u32 SLOTMASK = SLOTRANGE-1; // corresponding SLOTBITS mask
static const u32 SLOTMSB = 1<<(SLOTBITS-1); // most significant bit in SLOTMASK
static const u32 NSLOTS = SLOTRANGE * SAVEMEM; // number of slots per bucket
static const u32 NRESTS = 1<<RESTBITS; // number of possible values of RESTBITS bits
static const u32 MAXSOLS = 8; // more than 8 solutions are rare
// tree node identifying its children as two different slots in
// a bucket on previous layer with matching rest bits (x-tra hash)
@ -267,16 +258,17 @@ struct htalloc {
// main solver object, shared between all threads
struct equi {
blake2b_state blake_ctx;
htalloc hta;
bsizes *nslots;
proof *sols;
au32 nsols;
blake2b_state blake_ctx; // holds blake2b midstate after call to setheadernonce
htalloc hta; // holds allocated heaps
bsizes *nslots; // counts number of slots used in buckets
proof *sols; // store found solutions here (only first MAXSOLS)
au32 nsols; // number of solutions found
u32 nthreads;
u32 bfull;
u32 hfull;
pthread_barrier_t barry;
u32 bfull; // count number of times bucket can't fit new item
u32 hfull; // count number of xor-ed hash with last 32 bits zero
pthread_barrier_t barry; // used to sync threads
equi(const u32 n_threads) {
assert(sizeof(htunit) == 4);
assert(WK&1); // assumed in candidate() calling indices1()
@ -292,11 +284,13 @@ struct equi {
// prepare blake2b midstate for new run and initialize counters
void setheadernonce(const char *headernonce, const u32 len) {
setheader(&blake_ctx, headernonce);
memset(nslots, 0, NBUCKETS * sizeof(au32)); // only nslots[0] needs zeroing
nsols = bfull = hfull = 0;
// get heap0 bucket size in threadsafe manner
u32 getslot0(const u32 bucketi) {
#ifdef ATOMIC
return std::atomic_fetch_add_explicit(&nslots[0][bucketi], 1U, std::memory_order_relaxed);
@ -304,6 +298,7 @@ struct equi {
return nslots[0][bucketi]++;
// get heap1 bucket size in threadsafe manner
u32 getslot1(const u32 bucketi) {
#ifdef ATOMIC
return std::atomic_fetch_add_explicit(&nslots[1][bucketi], 1U, std::memory_order_relaxed);
@ -311,18 +306,23 @@ struct equi {
return nslots[1][bucketi]++;
// get old heap0 bucket size and clear it for next round
u32 getnslots0(const u32 bid) {
au32 &nslot = nslots[0][bid];
const u32 n = min(nslot, NSLOTS);
nslot = 0;
return n;
// get old heap1 bucket size and clear it for next round
u32 getnslots1(const u32 bid) {
au32 &nslot = nslots[1][bid];
const u32 n = min(nslot, NSLOTS);
nslot = 0;
return n;
// this was an experiment that turned out to be a slowdown
// one can integrate a merge sort into the index recovery
// but due to the memcpy's it's slower at recognizing dupes
// if merged != 0, mergesort indices and return true if dupe found
// if merged == 0, order indices as in Wagner condition
@ -380,6 +380,10 @@ struct equi {
if (soli < MAXSOLS) listindices1(WK, t, sols[soli], 0);
// this is a different way to recognize most (but not all) dupes
// unlike MERGESORT it doesn't end up sorting the indices,
// but the few remaining candidates can easily
// afford to have a qsort applied to them in order to find remaining dupes
bool orderindices(u32 *indices, u32 size) {
if (indices[0] > indices[size]) {
@ -397,6 +401,8 @@ struct equi {
if (r == 0) {
u32 idx = t.getindex();
if (dupes) {
// recognize most dupes by storing last seen index
// with same K least significant bits in array dupes
u32 bin = idx & (PROOFSIZE-1);
if (idx == dupes[bin]) return true;
dupes[bin] = idx;
@ -411,6 +417,7 @@ struct equi {
|| listindices1(r, buck[t.slotid1()][tagi].tag, indices+size, dupes)
|| (!dupes && orderindices(indices, size));
// need separate instance for accessing (differently typed) heap1
bool listindices1(u32 r, const tree t, u32 *indices, u32 *dupes) {
const slot0 *buck = hta.heap0[t.bucketid()];
const u32 size = 1 << --r;
@ -419,24 +426,31 @@ struct equi {
|| listindices0(r, buck[t.slotid1()][tagi].tag, indices+size, dupes)
|| (!dupes && orderindices(indices, size));
// check a candidate that resulted in 0 xor
// add as solution, with proper subtree ordering, if it has unique indices
void candidate(const tree t) {
proof prf, dupes;
memset(dupes, 0xffff, sizeof(proof));
if (listindices1(WK, t, prf, dupes)) return; // assume WK odd
// it survived the probable dupe test, now check fully
qsort(prf, PROOFSIZE, sizeof(u32), &compu32);
for (u32 i=1; i<PROOFSIZE; i++) if (prf[i] <= prf[i-1]) return;
// and now we have ourselves a genuine solution, not yet properly ordered
#ifdef ATOMIC
u32 soli = std::atomic_fetch_add_explicit(&nsols, 1U, std::memory_order_relaxed);
u32 soli = nsols++;
// retrieve solution indices in correct order
if (soli < MAXSOLS) listindices1(WK, t, sols[soli], 0); // assume WK odd
// show bucket stats and, if desired, size distribution
void showbsizes(u32 r) {
printf(" b%d h%d\n", bfull, hfull);
bfull = hfull = 0;
#if defined(HIST) || defined(SPARK) || defined(LOGSPARK)
// group bucket sizes in 64 bins, from empty to full (ignoring SAVEMEM)
u32 binsizes[65];
memset(binsizes, 0, 65 * sizeof(u32));
for (u32 bucketid = 0; bucketid < NBUCKETS; bucketid++) {
@ -444,10 +458,10 @@ struct equi {
for (u32 i=0; i < 65; i++) {
#ifdef HIST
#ifdef HIST // exact counts are useful for debugging
printf(" %d:%d", i, binsizes[i]);
#ifdef SPARK
#ifdef SPARK // everybody loves sparklines
u32 sparks = binsizes[i] / SPARKSCALE;
u32 sparks = 0;
@ -462,6 +476,8 @@ struct equi {
printf("Digit %d", r+1);
// thread-local object that precomputes various slot metrics for each round
// facilitating access to various bits in the variable size slots
struct htlayout {
htalloc hta;
u32 prevhtunits;
@ -470,16 +486,17 @@ struct equi {
u32 prevbo;
htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), dunits(0) {
u32 nexthashbytes = hashsize(r);
nexthtunits = hashwords(nexthashbytes);
prevbo = 0;
if (r) {
u32 nexthashbytes = hashsize(r); // number of bytes occupied by round r hash
nexthtunits = hashwords(nexthashbytes); // number of 32bit words taken up by those bytes
prevbo = 0; // byte offset for accessing hash from previous round
if (r) { // similar measure for previous round
u32 prevhashbytes = hashsize(r-1);
prevhtunits = hashwords(prevhashbytes);
prevbo = prevhtunits * sizeof(htunit) - prevhashbytes; // 0-3
dunits = prevhtunits - nexthtunits;
dunits = prevhtunits - nexthtunits; // number of words by which hash shrinks
// extract the remaining digit bits that slots in the same bucket still need to collide on
u32 getxhash0(const htunit* slot) const {
#if WN == 200 && RESTBITS == 4
return slot->bytes[prevbo] >> 4;
@ -491,6 +508,7 @@ struct equi {
#error non implemented
// similar but accounting for possible change in hashsize modulo 4 bits
u32 getxhash1(const htunit* slot) const {
#if WN == 200 && RESTBITS == 4
return slot->bytes[prevbo] & 0xf;
@ -502,12 +520,19 @@ struct equi {
#error non implemented
// test whether two hashes match in last 32 bits
bool equal(const htunit *hash0, const htunit *hash1) const {
return hash0[prevhtunits-1].word == hash1[prevhtunits-1].word;
// this thread-local object performs in-bucket collisions
// by linking together slots that have identical rest bits
// (which is in essence a 2nd stage bucket sort)
struct collisiondata {
// the bitmap is an early experiment in a bitmap encoding
// that works only for at most 64 slots
// it might as well be obsoleted as it performs worse even in that case
#ifdef XBITMAP
#if NSLOTS > 64
#error cant use XBITMAP with more than 64 slots
@ -515,6 +540,10 @@ struct equi {
u64 xhashmap[NRESTS];
u64 xmap;
// This maintains NRESTS = 2^RESTBITS lists whose starting slots
// are in xhashslots[] and where subsequent slots in each list
// are found through nextxhashslot[].
// Since 0 is already a valid slot number, use ~0 as nil value
#if RESTBITS <= 6
typedef uchar xslot;