diff --git a/src/pow/tromp/equi.h b/src/pow/tromp/equi.h
new file mode 100644
index 000000000..7a0355a73
--- /dev/null
+++ b/src/pow/tromp/equi.h
@@ -0,0 +1,129 @@
+// Equihash solver
+// Copyright (c) 2016-2016 John Tromp
+
+#include "blake/blake2.h"
+#ifdef __APPLE__
+#include "osx_barrier.h"
+#include <machine/endian.h>
+#include <libkern/OSByteOrder.h>
+#define htole32(x) OSSwapHostToLittleInt32(x)
+#else
+#include <endian.h>
+#endif
+#include <stdint.h> // for types uint32_t,uint64_t
+#include <string.h> // for functions memset
+#include <stdlib.h> // for function qsort
+
+typedef uint32_t u32;
+typedef unsigned char uchar;
+
+// algorithm parameters, prefixed with W to reduce include file conflicts
+
+#ifndef WN
+#define WN 200
+#endif
+
+#ifndef WK
+#define WK 9
+#endif
+
+#define NDIGITS (WK+1)
+#define DIGITBITS (WN/(NDIGITS))
+
+static const u32 PROOFSIZE = 1<<WK;
+static const u32 BASE = 1<<DIGITBITS;
+static const u32 NHASHES = 2*BASE;
+static const u32 HASHESPERBLAKE = 512/WN;
+static const u32 HASHOUT = HASHESPERBLAKE*WN/8;
+
+typedef u32 proof[PROOFSIZE];
+
+void setheader(blake2b_state *ctx, const char *header, const u32 headerlen, const u32 nce) {
+  uint32_t le_N = htole32(WN);
+  uint32_t le_K = htole32(WK);
+  char personal[] = "ZcashPoW01230123";
+  memcpy(personal+8,  &le_N, 4);
+  memcpy(personal+12, &le_K, 4);
+  blake2b_param P[1];
+  P->digest_length = HASHOUT;
+  P->key_length    = 0;
+  P->fanout        = 1;
+  P->depth         = 1;
+  P->leaf_length   = 0;
+  P->node_offset   = 0;
+  P->node_depth    = 0;
+  P->inner_length  = 0;
+  memset(P->reserved, 0, sizeof(P->reserved));
+  memset(P->salt,     0, sizeof(P->salt));
+  memcpy(P->personal, (const uint8_t *)personal, 16);
+  blake2b_init_param(ctx, P);
+  blake2b_update(ctx, (const uchar *)header, headerlen);
+  uchar nonce[32];
+  memset(nonce, 0, 32);
+  uint32_t le_nonce = htole32(nce);
+  memcpy(nonce, &le_nonce, 4);
+  blake2b_update(ctx, nonce, 32);
+}
+
+enum verify_code { POW_OK, POW_DUPLICATE, POW_OUT_OF_ORDER, POW_NONZERO_XOR };
+const char *errstr[] = { "OK", "duplicate index", "indices out of order", "nonzero xor" };
+
+void genhash(blake2b_state *ctx, u32 idx, uchar *hash) {
+  blake2b_state state = *ctx;
+  u32 leb = htole32(idx / HASHESPERBLAKE);
+  blake2b_update(&state, (uchar *)&leb, sizeof(u32));
+  uchar blakehash[HASHOUT];
+  blake2b_final(&state, blakehash, HASHOUT);
+  memcpy(hash, blakehash + (idx % HASHESPERBLAKE) * WN/8, WN/8);
+}
+
+int verifyrec(blake2b_state *ctx, u32 *indices, uchar *hash, int r) {
+  if (r == 0) {
+    genhash(ctx, *indices, hash);
+    return POW_OK;
+  }
+  u32 *indices1 = indices + (1 << (r-1));
+  if (*indices >= *indices1)
+    return POW_OUT_OF_ORDER;
+  uchar hash0[WN/8], hash1[WN/8];
+  int vrf0 = verifyrec(ctx, indices, hash0, r-1);
+  if (vrf0 != POW_OK)
+    return vrf0;
+  int vrf1 = verifyrec(ctx, indices1, hash1, r-1);
+  if (vrf1 != POW_OK)
+    return vrf1;
+  for (int i=0; i < WN/8; i++)
+    hash[i] = hash0[i] ^ hash1[i];
+  int i, b = r * DIGITBITS;
+  for (i = 0; i < b/8; i++)
+    if (hash[i])
+      return POW_NONZERO_XOR;
+  if ((b%8) && hash[i] >> (8-(b%8)))
+    return POW_NONZERO_XOR;
+  return POW_OK;
+}
+
+int compu32(const void *pa, const void *pb) {
+  u32 a = *(u32 *)pa, b = *(u32 *)pb;
+  return a<b ? -1 : a==b ? 0 : +1;
+}
+
+bool duped(proof prf) {
+  proof sortprf;
+  memcpy(sortprf, prf, sizeof(proof));
+  qsort(sortprf, PROOFSIZE, sizeof(u32), &compu32);
+  for (u32 i=1; i<PROOFSIZE; i++)
+    if (sortprf[i] <= sortprf[i-1])
+      return true;
+  return false;
+}
+
+// verify Wagner conditions
+int verify(u32 indices[PROOFSIZE], const char *header, const u32 headerlen, const u32 nonce) {
+  if (duped(indices))
+    return POW_DUPLICATE;
+  blake2b_state ctx;
+  setheader(&ctx, header, headerlen, nonce);
+  uchar hash[WN/8];
+  return verifyrec(&ctx, indices, hash, WK);
+}
diff --git a/src/pow/tromp/equi_miner.h b/src/pow/tromp/equi_miner.h
new file mode 100644
--- /dev/null
+++ b/src/pow/tromp/equi_miner.h
+// Equihash solver
+// Copyright (c) 2016-2016 John Tromp
+
+// Fix N, K, such that n = N/(K+1) is integer
+// Fix M = 2^{n+1} hashes each of length N bits,
+// H_0, ... , H_{M-1}, generated from (n+1)-bit indices.
+// Problem: find binary tree on 2^K distinct indices,
+// for which the exclusive-or of leaf hashes is all 0s.
+// Additionally, it should satisfy the Wagner conditions:
+// for each height i subtree, the exclusive-or
+// of its 2^i corresponding hashes starts with i*n 0 bits,
+// and for i>0 the leftmost leaf of its left subtree
+// is less than the leftmost leaf of its right subtree
+
+// The algorithm below solves this by maintaining the trees
+// in a graph of K layers, each split into buckets
+// with buckets indexed by the first n-RESTBITS bits following
+// the i*n 0s, each bucket having 4 * 2^RESTBITS slots,
+// twice the number of subtrees expected to land there.
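+
+// (Worked example, added for illustration; every number follows from the
+// parameters above: with N=200, K=9 we get n = DIGITBITS = 20, so the
+// M = 2^21 hashes come from 21-bit indices and a solution names 2^9 = 512
+// distinct indices; with RESTBITS=8, buckets are indexed by 20-8 = 12 bits,
+// giving 4096 buckets of 4*2^8 = 1024 slots, trimmed by SAVEMEM = 9/14
+// to 658 allocated slots each.)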
+
+#include "equi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <assert.h>
+
+typedef uint16_t u16;
+typedef uint64_t u64;
+
+#ifdef ATOMIC
+#include <atomic>
+typedef std::atomic<u32> au32;
+#else
+typedef u32 au32;
+#endif
+
+#ifndef RESTBITS
+#define RESTBITS 8
+#endif
+
+// 2_log of number of buckets
+#define BUCKBITS (DIGITBITS-RESTBITS)
+
+#ifndef SAVEMEM
+#if RESTBITS == 4
+// can't save memory in such small buckets
+#define SAVEMEM 1
+#elif RESTBITS >= 8
+// take advantage of law of large numbers (sum of 2^8 random numbers)
+// this reduces (200,9) memory to under 144MB, with negligible discarding
+#define SAVEMEM 9/14
+#endif
+#endif
+
+// number of buckets
+static const u32 NBUCKETS = 1<<BUCKBITS;
+// bucket mask
+static const u32 BUCKMASK = NBUCKETS-1;
+// 2_log of number of slots per bucket
+static const u32 SLOTBITS = RESTBITS+1+1;
+// number of slots per bucket
+static const u32 NSLOTS = (1<<SLOTBITS) * SAVEMEM;
+// SLOTBITS mask
+static const u32 SLOTMASK = (1<<SLOTBITS)-1;
+// number of possible values of xhash (rest of n) bits
+static const u32 NRESTS = 1<<RESTBITS;
+// number of slots for same xhash in collision detection
+static const u32 XFULL = 16;
+// number of blake2b blocks needed to generate all hashes
+static const u32 NBLOCKS = (NHASHES+HASHESPERBLAKE-1)/HASHESPERBLAKE;
+// nothing larger found in 100000 runs
+static const u32 MAXSOLS = 8;
+
+// scaling factor for showing bucketsize histogram as sparkline
+#ifndef SPARKSCALE
+#define SPARKSCALE (40 << (BUCKBITS-12))
+#endif
+
+// tree node identifying its children as two different slots in
+// a bucket on previous layer with the same rest bits (x-tra hash)
+struct tree {
+  u32 bid_s0_s1; // manual bitfields: bucketid, slotid0, slotid1
+  tree(const u32 idx) {
+    bid_s0_s1 = idx;
+  }
+  tree(const u32 bid, const u32 s0, const u32 s1) {
+#ifdef SLOTDIFF
+    u32 ds10 = (s1 - s0) & SLOTMASK;
+    if (ds10 & (1 << (SLOTBITS-1)))
+      bid_s0_s1 = (((bid << SLOTBITS) | s1) << (SLOTBITS-1)) | (SLOTMASK & ~ds10);
+    else
+      bid_s0_s1 = (((bid << SLOTBITS) | s0) << (SLOTBITS-1)) | (ds10 - 1);
+#else
+    bid_s0_s1 = (((bid << SLOTBITS) | s0) << SLOTBITS) | s1;
+#endif
+  }
+  u32 getindex() const {
+    return bid_s0_s1;
+  }
+  u32 bucketid() const {
+#ifdef SLOTDIFF
+    return bid_s0_s1 >> (2 * SLOTBITS - 1);
+#else
+    return bid_s0_s1 >> (2 * SLOTBITS);
+#endif
+  }
+  u32 slotid0() const {
+#ifdef SLOTDIFF
+    return (bid_s0_s1 >> (SLOTBITS-1)) & SLOTMASK;
+#else
+    return (bid_s0_s1 >> SLOTBITS) & SLOTMASK;
+#endif
+  }
+  u32 slotid1() const {
+#ifdef SLOTDIFF
+    return (slotid0() + 1 + (bid_s0_s1 & (SLOTMASK>>1))) & SLOTMASK;
+#else
+    return bid_s0_s1 & SLOTMASK;
+#endif
+  }
+};
+
+union hashunit {
+  u32 word;
+  uchar bytes[sizeof(u32)];
+};
+
+#define WORDS(bits) ((bits + 31) / 32)
+#define HASHWORDS0 WORDS(WN - DIGITBITS + RESTBITS)
+#define HASHWORDS1 WORDS(WN - 2*DIGITBITS + RESTBITS)
+
+struct slot0 {
+  tree attr;
+  hashunit hash[HASHWORDS0];
+};
+
+struct slot1 {
+  tree attr;
+  hashunit hash[HASHWORDS1];
+};
+
+// a bucket is NSLOTS treenodes
+typedef slot0 bucket0[NSLOTS];
+typedef slot1 bucket1[NSLOTS];
+// the N-bit hash consists of K+1 n-bit "digits"
+// each of which corresponds to a layer of NBUCKETS buckets
+typedef bucket0 digit0[NBUCKETS];
+typedef bucket1 digit1[NBUCKETS];
+
+// size (in bytes) of hash in round 0 <= r < WK
+u32 hashsize(const u32 r) {
+  const u32 hashbits = WN - (r+1) * DIGITBITS + RESTBITS;
+  return (hashbits + 7) / 8;
+}
+
+u32 hashwords(u32 bytes) {
+  return (bytes + 3) / 4;
+}
+
+// manages hash and tree data
+struct htalloc {
+  u32 *heap0;
+  u32 *heap1;
+  bucket0 *trees0[(WK+1)/2];
+  bucket1 *trees1[WK/2];
+  u32 alloced;
+  htalloc() {
+    alloced = 0;
+  }
+  void alloctrees() {
+// optimize xenoncat's fixed memory layout, avoiding any waste
+// digit  trees  hashes        trees  hashes
+// 0      0 A A A A A A        . . . . . .
+// 1      0 A A A A A A        1 B B B B B
+// 2      0 2 C C C C C        1 B B B B B
+// 3      0 2 C C C C C        1 3 D D D D
+// 4      0 2 4 E E E E        1 3 D D D D
+// 5      0 2 4 E E E E        1 3 5 F F F
+// 6      0 2 4 6 . G G        1 3 5 F F F
+// 7      0 2 4 6 . G G        1 3 5 7 H H
+// 8      0 2 4 6 8 . I        1 3 5 7 H H
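+// (size check, added for illustration: for (200,9) with RESTBITS=8,
+// HASHWORDS0 = HASHWORDS1 = 6, so slots are 7 words = 28 bytes, and each
+// heap is 4096 buckets * 658 slots * 28 bytes = 75,464,704 bytes;
+// the two heaps together come to ~144MB, matching the SAVEMEM comment above)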
+    assert(DIGITBITS >= 16); // ensures hashes shorten by 1 unit every 2 digits
+    heap0 = (u32 *)alloc(1, sizeof(digit0));
+    heap1 = (u32 *)alloc(1, sizeof(digit1));
+    for (int r=0; r<WK; r++)
+      if ((r&1) == 0)
+        trees0[r/2] = (bucket0 *)(heap0 + r/2);
+      else
+        trees1[r/2] = (bucket1 *)(heap1 + r/2);
+  }
+  void dealloctrees() {
+    free(heap0);
+    free(heap1);
+  }
+  void *alloc(const u32 n, const u32 sz) {
+    void *mem = calloc(n, sz);
+    assert(mem);
+    alloced += n * sz;
+    return mem;
+  }
+};
+
+typedef au32 bsizes[NBUCKETS];
+
+u32 min(const u32 a, const u32 b) {
+  return a < b ? a : b;
+}
+
+struct equi {
+  blake2b_state blake_ctx;
+  htalloc hta;
+  bsizes *nslots;
+  proof *sols;
+  au32 nsols;
+  u32 nthreads;
+  u32 xfull;
+  u32 bfull;
+  u32 hfull;
+  pthread_barrier_t barry;
+  equi(const u32 n_threads) {
+    assert(sizeof(hashunit) == 4);
+    nthreads = n_threads;
+    const int err = pthread_barrier_init(&barry, NULL, nthreads);
+    assert(!err);
+    hta.alloctrees();
+    nslots = (bsizes *)hta.alloc(2 * NBUCKETS, sizeof(au32));
+    sols = (proof *)hta.alloc(MAXSOLS, sizeof(proof));
+  }
+  ~equi() {
+    hta.dealloctrees();
+    free(nslots);
+    free(sols);
+  }
+  void setnonce(const char *header, const u32 headerlen, const u32 nonce) {
+    setheader(&blake_ctx, header, headerlen, nonce);
+    nsols = 0;
+  }
+  u32 getslot(const u32 r, const u32 bucketi) {
+#ifdef ATOMIC
+    return std::atomic_fetch_add_explicit(&nslots[r&1][bucketi], 1U, std::memory_order_relaxed);
+#else
+    return nslots[r&1][bucketi]++;
+#endif
+  }
+  u32 getnslots(const u32 r, const u32 bid) {
+    au32 &nslot = nslots[r&1][bid];
+    const u32 n = min(nslot, NSLOTS);
+    nslot = 0;
+    return n;
+  }
+  void orderindices(u32 *indices, u32 size) {
+    if (indices[0] > indices[size]) {
+      for (u32 i=0; i < size; i++) {
+        const u32 tmp = indices[i];
+        indices[i] = indices[size+i];
+        indices[size+i] = tmp;
+      }
+    }
+  }
+  void listindices0(u32 r, const tree t, u32 *indices) {
+    if (r == 0) {
+      *indices = t.getindex();
+      return;
+    }
+    const bucket1 &buck = hta.trees1[--r/2][t.bucketid()];
+    const u32 size = 1 << r;
+    u32 *indices1 = indices + size;
+    listindices1(r, buck[t.slotid0()].attr, indices);
+    listindices1(r, buck[t.slotid1()].attr, indices1);
+    orderindices(indices, size);
+  }
+  void listindices1(u32 r, const tree t, u32 *indices) {
+    const bucket0 &buck = hta.trees0[--r/2][t.bucketid()];
+    const u32 size = 1 << r;
+    u32 *indices1 = indices + size;
+    listindices0(r, buck[t.slotid0()].attr, indices);
+    listindices0(r, buck[t.slotid1()].attr, indices1);
+    orderindices(indices, size);
+  }
+  void candidate(const tree t) {
+    proof prf;
+    listindices1(WK, t, prf); // assume WK odd
+    qsort(prf, PROOFSIZE, sizeof(u32), &compu32);
+    for (u32 i=1; i<PROOFSIZE; i++)
+      if (prf[i] <= prf[i-1])
+        return;
+#ifdef ATOMIC
+    u32 soli = std::atomic_fetch_add_explicit(&nsols, 1U, std::memory_order_relaxed);
+#else
+    u32 soli = nsols++;
+#endif
+    if (soli < MAXSOLS)
+      memcpy(sols[soli], prf, sizeof(proof));
+  }
+  void showbsizes(u32 r) {
+#if defined(HIST) || defined(SPARK) || defined(LOGSPARK)
+    u32 binsizes[65];
+    memset(binsizes, 0, 65 * sizeof(u32));
+    for (u32 bucketid = 0; bucketid < NBUCKETS; bucketid++) {
+      u32 bsize = min(nslots[r&1][bucketid], NSLOTS) >> (SLOTBITS-6);
+      binsizes[bsize]++;
+    }
+    for (u32 i=0; i < 65; i++) {
+#ifdef HIST
+      printf(" %d:%d", i, binsizes[i]);
+#else
+#ifdef SPARK
+      u32 sparks = binsizes[i] / SPARKSCALE;
+#else
+      u32 sparks = 0;
+      for (u32 bs = binsizes[i]; bs; bs >>= 1) sparks++;
+      sparks = sparks * 7 / SPARKSCALE;
+#endif
+      printf("\342\226%c", '\201' + sparks);
+#endif
+    }
+    printf("\n");
+#endif
+  }
+
+  struct htlayout {
+    htalloc hta;
+    u32 prevhashunits;
+    u32 nexthashunits;
+    u32 dunits;
+    u32 prevbo;
+    u32 nextbo;
+
+    htlayout(equi *eq, u32 r): hta(eq->hta), prevhashunits(0), dunits(0) {
+      u32 nexthashbytes = hashsize(r);
+      nexthashunits = hashwords(nexthashbytes);
+      prevbo = 0;
+      nextbo = nexthashunits * sizeof(hashunit) - nexthashbytes; // 0-3
+      if (r) {
+        u32 prevhashbytes = hashsize(r-1);
+        prevhashunits = hashwords(prevhashbytes);
+        prevbo = prevhashunits * sizeof(hashunit) - prevhashbytes; // 0-3
+        dunits = prevhashunits - nexthashunits;
+      }
+    }
+    u32 getxhash0(const slot0* pslot) const {
+#if WN == 200 && RESTBITS == 4
+      return pslot->hash->bytes[prevbo] >> 4;
+#elif WN == 200 && RESTBITS == 8
+      return (pslot->hash->bytes[prevbo] & 0xf) << 4 | pslot->hash->bytes[prevbo+1] >> 4;
+#elif WN == 200 && RESTBITS == 9
+      return (pslot->hash->bytes[prevbo] & 0x1f) << 4 | pslot->hash->bytes[prevbo+1] >> 4;
+#elif WN == 144 && RESTBITS == 4
+      return pslot->hash->bytes[prevbo] & 0xf;
+#else
+#error not implemented
+#endif
+    }
+    u32 getxhash1(const slot1* pslot) const {
+#if WN == 200 && RESTBITS == 4
+      return pslot->hash->bytes[prevbo] & 0xf;
+#elif WN == 200 && RESTBITS == 8
+      return pslot->hash->bytes[prevbo];
+#elif WN == 200 && RESTBITS == 9
+      return (pslot->hash->bytes[prevbo]&1) << 8 | pslot->hash->bytes[prevbo+1];
+#elif WN == 144 && RESTBITS == 4
+      return pslot->hash->bytes[prevbo] & 0xf;
+#else
+#error not implemented
+#endif
+    }
+    bool equal(const hashunit *hash0, const hashunit *hash1) const {
+      return hash0[prevhashunits-1].word == hash1[prevhashunits-1].word;
+    }
+  };
+
+  struct collisiondata {
+#ifdef XBITMAP
+#if NSLOTS > 64
+#error cant use XBITMAP with more than 64 slots
+#endif
+    u64 xhashmap[NRESTS];
+    u64 xmap;
+#else
+#if RESTBITS <= 6
+    typedef uchar xslot;
+#else
+    typedef u16 xslot;
+#endif
+    xslot nxhashslots[NRESTS];
+    xslot xhashslots[NRESTS][XFULL];
+    xslot *xx;
+    u32 n0;
+    u32 n1;
+#endif
+    u32 s0;
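+
+    // (usage sketch, added for clarity: per bucket, call clear(), then
+    // addslot(s1, xhash) for each slot s1 in order; while nextcollision()
+    // holds, slot() yields each earlier slot s0 with the same xhash, so
+    // (s0,s1) ranges over exactly the candidate collision pairs)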
+    void clear() {
+#ifdef XBITMAP
+      memset(xhashmap, 0, NRESTS * sizeof(u64));
+#else
+      memset(nxhashslots, 0, NRESTS * sizeof(xslot));
+#endif
+    }
+    bool addslot(u32 s1, u32 xh) {
+#ifdef XBITMAP
+      xmap = xhashmap[xh];
+      xhashmap[xh] |= (u64)1 << s1;
+      s0 = -1;
+      return true;
+#else
+      n1 = (u32)nxhashslots[xh]++;
+      if (n1 >= XFULL)
+        return false;
+      xx = xhashslots[xh];
+      xx[n1] = s1;
+      n0 = 0;
+      return true;
+#endif
+    }
+    bool nextcollision() const {
+#ifdef XBITMAP
+      return xmap != 0;
+#else
+      return n0 < n1;
+#endif
+    }
+    u32 slot() {
+#ifdef XBITMAP
+      const u32 ffs = __builtin_ffsll(xmap);
+      s0 += ffs; xmap >>= ffs;
+      return s0;
+#else
+      return (u32)xx[n0++];
+#endif
+    }
+  };
+
+  void digit0(const u32 id) {
+    uchar hash[HASHOUT];
+    blake2b_state state;
+    htlayout htl(this, 0);
+    const u32 hashbytes = hashsize(0);
+    for (u32 block = id; block < NBLOCKS; block += nthreads) {
+      state = blake_ctx;
+      u32 leb = htole32(block);
+      blake2b_update(&state, (uchar *)&leb, sizeof(u32));
+      blake2b_final(&state, hash, HASHOUT);
+      for (u32 i = 0; i<HASHESPERBLAKE; i++) {
+        const uchar *ph = hash + i * WN/8;
+#if BUCKBITS == 12 && RESTBITS == 8
+        const u32 bucketid = ((u32)ph[0] << 4) | ph[1] >> 4;
+#elif BUCKBITS == 11 && RESTBITS == 9
+        const u32 bucketid = ((u32)ph[0] << 3) | ph[1] >> 5;
+#elif BUCKBITS == 20 && RESTBITS == 4
+        const u32 bucketid = ((((u32)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4;
+#elif BUCKBITS == 12 && RESTBITS == 4
+        const u32 bucketid = ((u32)ph[0] << 4) | ph[1] >> 4;
+        const u32 xhash = ph[1] & 0xf;
+#else
+#error not implemented
+#endif
+        const u32 slot = getslot(0, bucketid);
+        if (slot >= NSLOTS) {
+          bfull++;
+          continue;
+        }
+        slot0 &s = hta.trees0[0][bucketid][slot];
+        s.attr = tree(block * HASHESPERBLAKE + i);
+        memcpy(s.hash->bytes+htl.nextbo, ph+WN/8-hashbytes, hashbytes);
+      }
+    }
+  }
+
+  void digitodd(const u32 r, const u32 id) {
+    htlayout htl(this, r);
+    collisiondata cd;
+    for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += nthreads) {
+      cd.clear();
+      slot0 *buck = htl.hta.trees0[(r-1)/2][bucketid]; // optimize by updating previous buck?!
+      u32 bsize = getnslots(r-1, bucketid); // optimize by putting bucketsize with block?!
+      for (u32 s1 = 0; s1 < bsize; s1++) {
+        const slot0 *pslot1 = buck + s1; // optimize by updating previous pslot1?!
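+        // register s1 under its rest bits; a full xhash list (xfull++)
+        // below simply discards the tree, trading a small chance of a
+        // lost solution for bounded memory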
+        if (!cd.addslot(s1, htl.getxhash0(pslot1))) {
+          xfull++;
+          continue;
+        }
+        for (; cd.nextcollision(); ) {
+          const u32 s0 = cd.slot();
+          const slot0 *pslot0 = buck + s0;
+          if (htl.equal(pslot0->hash, pslot1->hash)) {
+            hfull++;
+            continue;
+          }
+          u32 xorbucketid;
+          const uchar *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
+#if WN == 200 && BUCKBITS == 12 && RESTBITS == 8
+          xorbucketid = (((u32)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) & 0xf) << 8)
+                        | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]);
+#elif WN == 200 && BUCKBITS == 11 && RESTBITS == 9
+          xorbucketid = (((u32)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) & 0xf) << 7)
+                        | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 1;
+#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4
+          xorbucketid = ((((u32)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8)
+                          | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4)
+                        | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4;
+#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4
+          xorbucketid = ((u32)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4)
+                        | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
+#else
+#error not implemented
+#endif
+          const u32 xorslot = getslot(r, xorbucketid);
+          if (xorslot >= NSLOTS) {
+            bfull++;
+            continue;
+          }
+          slot1 &xs = htl.hta.trees1[r/2][xorbucketid][xorslot];
+          xs.attr = tree(bucketid, s0, s1);
+          for (u32 i=htl.dunits; i < htl.prevhashunits; i++)
+            xs.hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word;
+        }
+      }
+    }
+  }
+
+  void digiteven(const u32 r, const u32 id) {
+    htlayout htl(this, r);
+    collisiondata cd;
+    for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += nthreads) {
+      cd.clear();
+      slot1 *buck = htl.hta.trees1[(r-1)/2][bucketid]; // OPTIMIZE BY UPDATING PREVIOUS
+      u32 bsize = getnslots(r-1, bucketid);
+      for (u32 s1 = 0; s1 < bsize; s1++) {
+        const slot1 *pslot1 = buck + s1; // OPTIMIZE BY UPDATING PREVIOUS
+        if (!cd.addslot(s1, htl.getxhash1(pslot1))) {
+          xfull++;
+          continue;
+        }
+        for (; cd.nextcollision(); ) {
+          const u32 s0 = cd.slot();
+          const slot1 *pslot0 = buck + s0;
+          if (htl.equal(pslot0->hash, pslot1->hash)) {
+            hfull++;
+            continue;
+          }
+          u32 xorbucketid;
+          const uchar *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
+#if WN == 200 && BUCKBITS == 12 && RESTBITS == 8
+          xorbucketid = ((u32)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4)
+                        | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
+#elif WN == 200 && BUCKBITS == 11 && RESTBITS == 9
+          xorbucketid = ((u32)(bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) << 3)
+                        | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 5;
+#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4
+          xorbucketid = ((((u32)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8)
+                          | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4)
+                        | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4;
+#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4
+          xorbucketid = ((u32)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4)
+                        | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
+#else
+#error not implemented
+#endif
+          const u32 xorslot = getslot(r, xorbucketid);
+          if (xorslot >= NSLOTS) {
+            bfull++;
+            continue;
+          }
+          slot0 &xs = htl.hta.trees0[r/2][xorbucketid][xorslot];
+          xs.attr = tree(bucketid, s0, s1);
+          for (u32 i=htl.dunits; i < htl.prevhashunits; i++)
+            xs.hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word;
+        }
+      }
+    }
+  }
+
+  void digitK(const u32 id) {
+    collisiondata cd;
+    htlayout htl(this, WK);
+    u32 nc = 0;
+    for (u32 bucketid = id; bucketid < NBUCKETS; bucketid += nthreads) {
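+      // final round: colliding rest bits plus an equal last hash word mean
+      // the full WN-bit XOR is zero, so the pair goes to candidate() for
+      // index expansion and duplicate checking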
+      cd.clear();
+      slot0 *buck = htl.hta.trees0[(WK-1)/2][bucketid];
+      u32 bsize = getnslots(WK-1, bucketid);
+      for (u32 s1 = 0; s1 < bsize; s1++) {
+        const slot0 *pslot1 = buck + s1;
+        if (!cd.addslot(s1, htl.getxhash0(pslot1))) // assume WK odd
+          continue;
+        for (; cd.nextcollision(); ) {
+          const u32 s0 = cd.slot();
+          if (htl.equal(buck[s0].hash, pslot1->hash))
+            nc++, candidate(tree(bucketid, s0, s1));
+        }
+      }
+    }
+    printf(" %d candidates ", nc);
+  }
+};
+
+typedef struct {
+  u32 id;
+  pthread_t thread;
+  equi *eq;
+} thread_ctx;
+
+void barrier(pthread_barrier_t *barry) {
+  const int rc = pthread_barrier_wait(barry);
+  if (rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD) {
+    printf("Could not wait on barrier\n");
+    pthread_exit(NULL);
+  }
+}
+
+void *worker(void *vp) {
+  thread_ctx *tp = (thread_ctx *)vp;
+  equi *eq = tp->eq;
+
+  if (tp->id == 0)
+    printf("Digit 0\n");
+  barrier(&eq->barry);
+  eq->digit0(tp->id);
+  barrier(&eq->barry);
+  if (tp->id == 0) {
+    eq->xfull = eq->bfull = eq->hfull = 0;
+    eq->showbsizes(0);
+  }
+  barrier(&eq->barry);
+  for (u32 r = 1; r < WK; r++) {
+    if (tp->id == 0)
+      printf("Digit %d", r);
+    barrier(&eq->barry);
+    r&1 ? eq->digitodd(r, tp->id) : eq->digiteven(r, tp->id);
+    barrier(&eq->barry);
+    if (tp->id == 0) {
+      printf(" x%d b%d h%d\n", eq->xfull, eq->bfull, eq->hfull);
+      eq->xfull = eq->bfull = eq->hfull = 0;
+      eq->showbsizes(r);
+    }
+    barrier(&eq->barry);
+  }
+  if (tp->id == 0)
+    printf("Digit %d\n", WK);
+  eq->digitK(tp->id);
+  barrier(&eq->barry);
+  pthread_exit(NULL);
+  return 0;
+}
diff --git a/src/pow/tromp/osx_barrier.h b/src/pow/tromp/osx_barrier.h
new file mode 100644
index 000000000..da05b3552
--- /dev/null
+++ b/src/pow/tromp/osx_barrier.h
@@ -0,0 +1,70 @@
+#ifdef __APPLE__
+
+#ifndef PTHREAD_BARRIER_H_
+#define PTHREAD_BARRIER_H_
+
+#include <pthread.h>
+#include <errno.h>
+
+typedef int pthread_barrierattr_t;
+#define PTHREAD_BARRIER_SERIAL_THREAD 1
+
+typedef struct
+{
+  pthread_mutex_t mutex;
+  pthread_cond_t cond;
+  int count;
+  int tripCount;
+} pthread_barrier_t;
+
+
+int pthread_barrier_init(pthread_barrier_t *barrier, const pthread_barrierattr_t *attr, unsigned int count)
+{
+  if(count == 0)
+  {
+    errno = EINVAL;
+    return -1;
+  }
+  if(pthread_mutex_init(&barrier->mutex, 0) < 0)
+  {
+    return -1;
+  }
+  if(pthread_cond_init(&barrier->cond, 0) < 0)
+  {
+    pthread_mutex_destroy(&barrier->mutex);
+    return -1;
+  }
+  barrier->tripCount = count;
+  barrier->count = 0;
+
+  return 0;
+}
+
+int pthread_barrier_destroy(pthread_barrier_t *barrier)
+{
+  pthread_cond_destroy(&barrier->cond);
+  pthread_mutex_destroy(&barrier->mutex);
+  return 0;
+}
+
+int pthread_barrier_wait(pthread_barrier_t *barrier)
+{
+  pthread_mutex_lock(&barrier->mutex);
+  ++(barrier->count);
+  if(barrier->count >= barrier->tripCount)
+  {
+    barrier->count = 0;
+    pthread_cond_broadcast(&barrier->cond);
+    pthread_mutex_unlock(&barrier->mutex);
+    return PTHREAD_BARRIER_SERIAL_THREAD;
+  }
+  else
+  {
+    pthread_cond_wait(&barrier->cond, &(barrier->mutex));
+    pthread_mutex_unlock(&barrier->mutex);
+    return 0;
+  }
+}
+
+#endif // PTHREAD_BARRIER_H_
+#endif // __APPLE__
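
A minimal driver for the solver above -- an illustrative sketch, not part of
the patch; it uses only interfaces this diff defines (equi, thread_ctx,
worker) plus a hypothetical all-zero 140-byte header:

  #include "equi_miner.h"

  int main() {
    const u32 nthreads = 4;
    char header[140];
    memset(header, 0, sizeof(header)); // hypothetical all-zero header
    equi eq(nthreads);                 // allocates hash trees, inits eq.barry
    eq.setnonce(header, sizeof(header), 0);
    thread_ctx threads[nthreads];
    for (u32 i = 0; i < nthreads; i++) {
      threads[i].id = i;
      threads[i].eq = &eq;
      pthread_create(&threads[i].thread, NULL, worker, (void *)&threads[i]);
    }
    for (u32 i = 0; i < nthreads; i++) // workers sync on eq.barry per digit
      pthread_join(threads[i].thread, NULL);
    printf("%d solutions\n", eq.nsols < MAXSOLS ? (u32)eq.nsols : MAXSOLS);
    return 0;
  }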