diff --git a/Makefile b/Makefile index f996aa4..03254c0 100644 --- a/Makefile +++ b/Makefile @@ -5,10 +5,10 @@ GPP = g++ -march=native -m64 -std=c++11 $(FLAGS) all: equi equi1 verify test spark test1445 equi: equi.h equi_miner.h equi_miner.cpp Makefile - $(GPP) -DATOMIC equi_miner.cpp blake/blake2b.cpp -o equi + $(GPP) -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi equi1: equi.h equi_miner.h equi_miner.cpp Makefile - $(GPP) equi_miner.cpp blake/blake2b.cpp -o equi1 + $(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi1 equi1g: equi.h equi_miner.h equi_miner.cpp Makefile g++ -g -std=c++11 -DLOGSPARK -DSPARKSCALE=11 equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g diff --git a/equi_miner.h b/equi_miner.h index c76c994..1edda4d 100644 --- a/equi_miner.h +++ b/equi_miner.h @@ -1,22 +1,38 @@ // Equihash solver // Copyright (c) 2016 John Tromp -// Fix N, K, such that n = N/(k+1) is integer -// Fix M = 2^{n+1} hashes each of length N bits, -// H_0, ... , H_{M-1}, generated fom (n+1)-bit indices. -// Problem: find binary tree on 2^K distinct indices, -// for which the exclusive-or of leaf hashes is all 0s. +// Equihash presents the following problem +// +// Fix N, K, such that N is a multiple of K+1 +// Let integer n = N/(K+1), and view N-bit words +// as having K+1 "digits" of n bits each +// Fix M = 2^{n+1} N-bit hashes H_0, ... 
, H_{M-1} +// as outputs of a hash function applied to an (n+1)-bit index +// +// Problem: find a binary tree on 2^K distinct indices, +// for which the exclusive-or of leaf hashes is all 0s // Additionally, it should satisfy the Wagner conditions: -// for each height i subtree, the exclusive-or -// of its 2^i corresponding hashes starts with i*n 0 bits, -// and for i>0 the leftmost leaf of its left subtree -// is less than the leftmost leaf of its right subtree - -// The algorithm below solves this by maintaining the tree -// in a graph of K layers, each split into buckets -// with buckets indexed by the first n-RESTBITS bits following -// the i*n 0s, each bucket having 4 * 2^RESTBITS slots, -// twice the number of subtrees expected to land there. +// 1) for each height i subtree, the exclusive-or +// of its 2^i leaf hashes starts with i*n 0 bits, +// 2) the leftmost leaf of any left subtree is less +// than the leftmost leaf of the corresponding right subtree +// +// The algorithm below solves this by storing trees +// as a directed acyclic graph of K layers +// The n digit bits are split into +// n-RESTBITS bucket bits and RESTBITS leftover bits +// Each layer i, consisting of height i subtrees +// whose xor starts with i*n 0s, is partitioned into +// 2^{n-RESTBITS} buckets according to the next n-RESTBITS +// bits in the xor +// Within each bucket, trees whose xor match in the +// next RESTBITS bits are combined to produce trees +// in the next layer +// To eliminate trees with duplicated indices, +// we simply test if the last 32 bits of the xor are 0, +// and if so, assume that this is due to index duplication +// In practice this works very well to avoid bucket overflow +// and produces negligible false positives #include "equi.h" #include @@ -31,9 +47,11 @@ #define htobe32(x) OSSwapHostToBigInt32(x) #endif +// u32 already defined in equi.h typedef uint16_t u16; typedef uint64_t u64; +// required for avoiding multi-threading race conflicts #ifdef ATOMIC #include 
typedef std::atomic au32; @@ -48,47 +66,63 @@ typedef u32 au32; // 2_log of number of buckets #define BUCKBITS (DIGITBITS-RESTBITS) +// by default buckets have a capacity of twice their expected size +// but this factor reduces it accordingly #ifndef SAVEMEM #if RESTBITS == 4 // can't save memory in such small buckets #define SAVEMEM 1 #elif RESTBITS >= 8 -// take advantage of law of large numbers (sum of 2^8 random numbers) -// this reduces (200,9) memory to under 144MB, with negligible discarding +// an expected size of at least 512 has such relatively small +// standard deviation that we can reduce capacity with negligible discarding +// this value reduces (200,9) memory to under 144MB #define SAVEMEM 9/14 #endif #endif // number of buckets static const u32 NBUCKETS = 1<> (2 * SLOTBITS - 1); @@ -110,6 +146,7 @@ struct tree { return bid_s0_s1 >> (2 * SLOTBITS); #endif } + // retrieve first slot index u32 slotid0() const { #ifdef SLOTDIFF return (bid_s0_s1 >> (SLOTBITS-1)) & SLOTMASK; @@ -117,6 +154,7 @@ struct tree { return (bid_s0_s1 >> SLOTBITS) & SLOTMASK; #endif } + // retrieve second slot index u32 slotid1() const { #ifdef SLOTDIFF return (slotid0() + 1 + (bid_s0_s1 & (SLOTMASK>>1))) & SLOTMASK; @@ -126,6 +164,12 @@ struct tree { } }; +// each bucket slot occupies a variable number of hash/tree units, +// all but the last of which hold the xor over all leaf hashes, +// or what's left of it after stripping the initial i*n 0s +// the last unit holds the tree node itself +// the hash is sometimes accessed 32 bits at a time (word) +// and sometimes 8 bits at a time (bytes) union htunit { tree tag; u32 word; @@ -148,6 +192,43 @@ typedef bucket0 digit0[NBUCKETS]; typedef bucket1 digit1[NBUCKETS]; typedef au32 bsizes[NBUCKETS]; +// The algorithm proceeds in K+1 rounds, one for each digit +// All data is stored in two heaps, +// heap0 of type digit0, and heap1 of type digit1 +// The following table shows the layout of these heaps +// in each round, which is an 
optimized version +// of xenoncat's fixed memory layout, avoiding any waste +// Each line shows only a single slot, which is actually +// replicated NSLOTS * NBUCKETS times +// +// heap0 heap1 +// round hashes tree hashes tree +// 0 A A A A A A 0 . . . . . . +// 1 A A A A A A 0 B B B B B 1 +// 2 C C C C C 2 0 B B B B B 1 +// 3 C C C C C 2 0 D D D D 3 1 +// 4 E E E E 4 2 0 D D D D 3 1 +// 5 E E E E 4 2 0 F F F 5 3 1 +// 6 G G 6 . 4 2 0 F F F 5 3 1 +// 7 G G 6 . 4 2 0 H H 7 5 3 1 +// 8 I 8 6 . 4 2 0 H H 7 5 3 1 +// +// Round 0 generates hashes and stores them in the buckets +// of heap0 according to the initial n-RESTBITS bits +// These hashes are denoted A above and followed by the +// tree tag denoted 0 +// In round 1 we combine each pair of slots in the same bucket +// with matching RESTBITS of digit 0 and store the resulting +// 1-tree in heap1 with its xor hash denoted B +// Upon finishing round 1, the A space is no longer needed, +// and is re-used in round 2 to store both the shorter C hashes, +// and their tree tags denoted 2 +// Continuing in this manner, each round reads buckets from one +// heap, and writes buckets in the other heap. +// In the final round K, all pairs leading to 0 xors are identified +// and their leaves recovered through the DAG of tree nodes + +// convenience function u32 min(const u32 a, const u32 b) { return a < b ? a : b; } @@ -158,6 +239,7 @@ u32 hashsize(const u32 r) { return (hashbits + 7) / 8; } +// convert bytes into words, rounding up u32 hashwords(u32 bytes) { return (bytes + 3) / 4; } @@ -171,17 +253,6 @@ struct htalloc { alloced = 0; } void alloctrees() { -// optimize xenoncat's fixed memory layout, avoiding any waste -// digit hashes tree hashes tree -// 0 A A A A A A 0 . . . . . . -// 1 A A A A A A 0 B B B B B 1 -// 2 C C C C C 2 0 B B B B B 1 -// 3 C C C C C 2 0 D D D D 3 1 -// 4 E E E E 4 2 0 D D D D 3 1 -// 5 E E E E 4 2 0 F F F 5 3 1 -// 6 G G 6 . 4 2 0 F F F 5 3 1 -// 7 G G 6 . 4 2 0 H H 7 5 3 1 -// 8 I 8 6 . 
4 2 0 H H 7 5 3 1 assert(DIGITBITS >= 16); // ensures hashes shorten by 1 unit every 2 digits heap0 = (bucket0 *)alloc(NBUCKETS, sizeof(bucket0)); heap1 = (bucket1 *)alloc(NBUCKETS, sizeof(bucket1)); @@ -967,7 +1038,7 @@ void *worker(void *vp) { barrier(&eq->barry); if (tp->id == 0) eq->showbsizes(0); barrier(&eq->barry); -#if WN == 200 && WK == 9 && RESTBITS == 8 +#if WN == 200 && WK == 9 && RESTBITS == 8 && defined UNROLL eq->digit1(tp->id); barrier(&eq->barry); if (tp->id == 0) eq->showbsizes(1);