diff --git a/equi_miner.h b/equi_miner.h
index dea1b2d..161ef61 100644
--- a/equi_miner.h
+++ b/equi_miner.h
@@ -82,24 +82,15 @@ typedef u32 au32;
 #endif
 #endif
 
-// number of buckets
-static const u32 NBUCKETS = 1<<BUCKBITS;
-// corresponding bucket mask
-static const u32 BUCKMASK = NBUCKETS-1;
-// 2_log of number of slots per bucket
-static const u32 SLOTBITS = RESTBITS+1+1;
-// default bucket capacity
-static const u32 SLOTRANGE = 1<<SLOTBITS;
-// corresponding SLOTBITS mask
-static const u32 SLOTMASK = SLOTRANGE-1;
-// most significat bit in SLOTMASK
-static const u32 SLOTMSB = 1<<(SLOTBITS-1);
-// number of slots per bucket
-static const u32 NSLOTS = SLOTRANGE * SAVEMEM;
-// number of possible values of RESTBITS bits
-static const u32 NRESTS = 1<<RESTBITS;
-// more than 8 solutions are rare (less than one in 100000 runs)
-static const u32 MAXSOLS = 8;
+static const u32 NBUCKETS = 1<<BUCKBITS;    // number of buckets
+static const u32 BUCKMASK = NBUCKETS-1;     // corresponding bucket mask
+static const u32 SLOTBITS = RESTBITS+1+1;   // 2_log of number of slots per bucket
+static const u32 SLOTRANGE = 1<<SLOTBITS;   // default bucket capacity
+static const u32 SLOTMASK = SLOTRANGE-1;    // corresponding SLOTBITS mask
+static const u32 SLOTMSB = 1<<(SLOTBITS-1); // most significat bit in SLOTMASK
+static const u32 NSLOTS = SLOTRANGE * SAVEMEM; // number of slots per bucket
+static const u32 NRESTS = 1<<RESTBITS;      // number of possible values of RESTBITS bits
+static const u32 MAXSOLS = 8;               // more than 8 solutions are rare
 
 // tree node identifying its children as two different slots in
 // a bucket on previous layer with matching rest bits (x-tra hash)
@@ -267,16 +258,17 @@ struct htalloc {
   }
 };
 
+// main solver object, shared between all threads
 struct equi {
-  blake2b_state blake_ctx;
-  htalloc hta;
-  bsizes *nslots;
-  proof *sols;
-  au32 nsols;
+  blake2b_state blake_ctx; // holds blake2b midstate after call to setheadernounce
+  htalloc hta;             // holds allocated heaps
+  bsizes *nslots;          // counts number of slots used in buckets
+  proof *sols;             // store found solutions here (only first MAXSOLS)
+  au32 nsols;              // number of solutions found
   u32 nthreads;
-  u32 bfull;
-  u32 hfull;
-  pthread_barrier_t barry;
+  u32 bfull;               // count number of times bucket can't fit new item
+  u32 hfull;               // count number of xor-ed hash with last 32 bits zero
+  pthread_barrier_t barry; // used to sync threads
   equi(const u32 n_threads) {
     assert(sizeof(htunit) == 4);
     assert(WK&1); // assumed in candidate() calling indices1()
@@ -292,11 +284,13 @@ struct equi {
     free(nslots);
     free(sols);
   }
+  // prepare blake2b midstate for new run and initialize counters
   void setheadernonce(const char *headernonce, const u32 len) {
     setheader(&blake_ctx, headernonce);
     memset(nslots, 0, NBUCKETS * sizeof(au32)); // only nslots[0] needs zeroing
     nsols = bfull = hfull = 0;
   }
+  // get heap0 bucket size in threadsafe manner
   u32 getslot0(const u32 bucketi) {
 #ifdef ATOMIC
     return std::atomic_fetch_add_explicit(&nslots[0][bucketi], 1U, std::memory_order_relaxed);
@@ -304,6 +298,7 @@ struct equi {
     return nslots[0][bucketi]++;
 #endif
   }
+  // get heap1 bucket size in threadsafe manner
   u32 getslot1(const u32 bucketi) {
 #ifdef ATOMIC
     return std::atomic_fetch_add_explicit(&nslots[1][bucketi], 1U, std::memory_order_relaxed);
@@ -311,18 +306,23 @@ struct equi {
     return nslots[1][bucketi]++;
 #endif
   }
+  // get old heap0 bucket size and clear it for next round
   u32 getnslots0(const u32 bid) {
     au32 &nslot = nslots[0][bid];
     const u32 n = min(nslot, NSLOTS);
     nslot = 0;
     return n;
   }
+  // get old heap1 bucket size and clear it for next round
   u32 getnslots1(const u32 bid) {
     au32 &nslot = nslots[1][bid];
     const u32 n = min(nslot, NSLOTS);
     nslot = 0;
     return n;
   }
+// this was an experiment that turned out to be a slowdown
+// one can integrate a merge sort into the index recovery
+// but due to the memcpy's it's slower at recognizing dupes
 #ifdef MERGESORT
   // if merged != 0, mergesort indices and return true if dupe found
   // if merged == 0, order indices as in Wagner condition
@@ -380,6 +380,10 @@ struct equi {
 #endif
     if (soli < MAXSOLS) listindices1(WK, t, sols[soli], 0);
   }
+// this is a differrent way to recognize most (but not all) dupes
+// unlike MERGESORT it doesn't end up sorting the indices,
+// but the few remaining candidates can easily
+// affort to have a qsort applied to them in order to find remaining dupes
 #else
   bool orderindices(u32 *indices, u32 size) {
     if (indices[0] > indices[size]) {
@@ -397,6 +401,8 @@ struct equi {
     if (r == 0) {
       u32 idx = t.getindex();
       if (dupes) {
+      // recognize most dupes by storing last seen index
+      // with same K least significant bits in array dupes
         u32 bin = idx & (PROOFSIZE-1);
         if (idx == dupes[bin]) return true;
         dupes[bin] = idx;
@@ -411,6 +417,7 @@ struct equi {
         || listindices1(r, buck[t.slotid1()][tagi].tag, indices+size, dupes)
         || (!dupes && orderindices(indices, size));
   }
+  // need separate instance for accessing (differently typed) heap1
   bool listindices1(u32 r, const tree t, u32 *indices, u32 *dupes) {
     const slot0 *buck = hta.heap0[t.bucketid()];
     const u32 size = 1 << --r;
@@ -419,24 +426,31 @@ struct equi {
         || listindices0(r, buck[t.slotid1()][tagi].tag, indices+size, dupes)
         || (!dupes && orderindices(indices, size));
   }
+  // check a candidate that resulted in 0 xor
+  // add as solution, with proper subtree ordering, if it has unique indices
   void candidate(const tree t) {
     proof prf, dupes;
     memset(dupes, 0xffff, sizeof(proof));
     if (listindices1(WK, t, prf, dupes)) return; // assume WK odd
+    // it survived the probable dupe test, now check fully
     qsort(prf, PROOFSIZE, sizeof(u32), &compu32);
     for (u32 i=1; i<PROOFSIZE; i++) if (prf[i] <= prf[i-1]) return;
+    // and now we have ourselves a genuine solution, not yet properly ordered
 #ifdef ATOMIC
     u32 soli = std::atomic_fetch_add_explicit(&nsols, 1U, std::memory_order_relaxed);
 #else
     u32 soli = nsols++;
 #endif
+    // retrieve solution indices in correct order
     if (soli < MAXSOLS) listindices1(WK, t, sols[soli], 0); // assume WK odd
   }
 #endif
+  // show bucket stats and, if desired, size distribution
   void showbsizes(u32 r) {
     printf(" b%d h%d\n", bfull, hfull);
     bfull = hfull = 0;
 #if defined(HIST) || defined(SPARK) || defined(LOGSPARK)
+    // group bucket sizes in 64 bins, from empty to full (ignoring SAVEMEM)
     u32 binsizes[65];
     memset(binsizes, 0, 65 * sizeof(u32));
     for (u32 bucketid = 0; bucketid < NBUCKETS; bucketid++) {
@@ -444,10 +458,10 @@ struct equi {
       binsizes[bsize]++;
     }
     for (u32 i=0; i < 65; i++) {
-#ifdef HIST
+#ifdef HIST  // exact counts are useful for debugging
       printf(" %d:%d", i, binsizes[i]);
 #else
-#ifdef SPARK
+#ifdef SPARK // everybody loves sparklines
       u32 sparks = binsizes[i] / SPARKSCALE;
 #else
       u32 sparks = 0;
@@ -462,6 +476,8 @@ struct equi {
     printf("Digit %d", r+1);
   }
 
+  // thread-local object that precomputes various slot metrics for each round
+  // facilitating access to various bits in the variable size slots
   struct htlayout {
     htalloc hta;
     u32 prevhtunits;
@@ -470,16 +486,17 @@ struct equi {
     u32 prevbo;
   
     htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), dunits(0) {
-      u32 nexthashbytes = hashsize(r);
-      nexthtunits = hashwords(nexthashbytes);
-      prevbo = 0;
-      if (r) {
+      u32 nexthashbytes = hashsize(r);        // number of bytes occupied by round r hash
+      nexthtunits = hashwords(nexthashbytes); // number of 32bit words taken up by those bytes
+      prevbo = 0;                  // byte offset for accessing hash form previous round
+      if (r) {     // similar measure for previous round
         u32 prevhashbytes = hashsize(r-1);
         prevhtunits = hashwords(prevhashbytes);
         prevbo = prevhtunits * sizeof(htunit) - prevhashbytes; // 0-3
-        dunits = prevhtunits - nexthtunits;
+        dunits = prevhtunits - nexthtunits; // number of words by which hash shrinks
       }
     }
+    // extract remaining bits in digit slots in same bucket still need to collide on
     u32 getxhash0(const htunit* slot) const {
 #if WN == 200 && RESTBITS == 4
       return slot->bytes[prevbo] >> 4;
@@ -491,6 +508,7 @@ struct equi {
 #error non implemented
 #endif
     }
+    // similar but accounting for possible change in hashsize modulo 4 bits
     u32 getxhash1(const htunit* slot) const {
 #if WN == 200 && RESTBITS == 4
       return slot->bytes[prevbo] & 0xf;
@@ -502,12 +520,19 @@ struct equi {
 #error non implemented
 #endif
     }
+    // test whether two hashes match in last 32 bits
     bool equal(const htunit *hash0, const htunit *hash1) const {
       return hash0[prevhtunits-1].word == hash1[prevhtunits-1].word;
     }
   };
 
+  // this thread-local object performs in-bucket collissions
+  // by linking together slots that have identical rest bits
+  // (which is in essense a 2nd stage bucket sort)
   struct collisiondata {
+// the bitmap is an early experiment in a bitmap encoding
+// that works only for at most 64 slots
+// it might as well be obsoleted as it performs worse even in that case
 #ifdef XBITMAP
 #if NSLOTS > 64
 #error cant use XBITMAP with more than 64 slots
@@ -515,6 +540,10 @@ struct equi {
     u64 xhashmap[NRESTS];
     u64 xmap;
 #else
+// This maintains NRESTS = 2^RESTBITS lists whose starting slot
+// are in xhashslots[] and where subsequent slots in each list
+// are found through nextxhashslot[]
+// since 0 is already a valid slot number, use ~0 as nil value
 #if RESTBITS <= 6
     typedef uchar xslot;
 #else