From d3454d92285c02e3cd46a5b08e1889d00b6ac210 Mon Sep 17 00:00:00 2001
From: tromp
Date: Thu, 27 Oct 2016 15:55:09 -0400
Subject: [PATCH] separate make targets for AVX2

---
 Makefile           |   14 +-
 README.md          |   16 +-
 dev_miner.h        |   18 +-
 equi_dev_miner.cpp |   84 ----
 equi_dev_miner.h   | 1025 --------------------------------------------
 equi_miner.cpp     |    2 +-
 equi_miner.h       |    4 +-
 7 files changed, 28 insertions(+), 1135 deletions(-)
 delete mode 100644 equi_dev_miner.cpp
 delete mode 100644 equi_dev_miner.h

diff --git a/Makefile b/Makefile
index 22555de..84c74d3 100644
--- a/Makefile
+++ b/Makefile
@@ -4,11 +4,17 @@ GPP = g++ -march=native -m64 -std=c++11 $(FLAGS)
 
 all: equi equi1 verify test spark test1445
 
-equi: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
-	$(GPP) -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equi
+equi: equi.h equi_miner.h equi_miner.cpp Makefile
+	$(GPP) -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi
 
-equi1: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
-	$(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equi1
+equi1: equi.h equi_miner.h equi_miner.cpp Makefile
+	$(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi1
+
+eqavx2: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
+	$(GPP) -DUSE_AVX2 -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx2
+
+eqavx21: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
+	$(GPP) -DUSE_AVX2 -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o eqavx21
 
 equi1g: equi.h equi_miner.h equi_miner.cpp Makefile
 	g++ -g -std=c++11 -DLOGSPARK -DSPARKSCALE=11 equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g
diff --git a/README.md b/README.md
index 026bf57..2291a7a 100644
--- a/README.md
+++ b/README.md
@@ -51,10 +51,12 @@ More detailed documentation is available in the equi_miner.h source code.
 
 Performance summary (on 4GHz i7-4790K and NVidia GTX980):
 
-- equi1: 4.6 Sol/s - 5.9 Sol/s (with AVX2)
-- equi -t 8: 16.7 Sol/s
-- 8 x equi1: 20.3 Sol/s
-- dev1: 6.5 Sol/s (xenoncat's blake)
-- 8 x dev1: 20.6 Sol/s
-- dev -t 8: 17.2 Sol/s
-- eqcuda: 23.6 Sol/s
+- equi1: 4.6 Sol/s
+- eqavx21: 5.9 Sol/s
+- equi -t 8: 4.6 Sol/s
+- eqavx2 -t 8: TBA Sol/s
+- 8 x equi1: 20.3 Sol/s
+- dev1: 6.5 Sol/s (xenoncat's blake)
+- 8 x dev1: 20.6 Sol/s
+- dev -t 8: 17.2 Sol/s
+- eqcuda: 23.6 Sol/s
diff --git a/dev_miner.h b/dev_miner.h
index 69c045a..7af510b 100644
--- a/dev_miner.h
+++ b/dev_miner.h
@@ -83,14 +83,6 @@ typedef u32 au32;
 #endif
 #endif
 
-#ifdef __AVX2__
-#define BLAKESINPARALLEL 4
-#elif defined __AVX__
-#define BLAKESINPARALLEL 2
-#else
-#define BLAKESINPARALLEL 1
-#endif
-
 // number of buckets
 static const u32 NBUCKETS = 1<<BUCKBITS;
-
-int main(int argc, char **argv) {
-  int nthreads = 1;
-  int nonce = 0;
-  int range = 1;
-  bool showsol = false;
-  const char *header = "";
-  int c;
-  while ((c = getopt (argc, argv, "h:n:r:t:s")) != -1) {
-    switch (c) {
-      case 'h':
-        header = optarg;
-        break;
-      case 'n':
-        nonce = atoi(optarg);
-        break;
-      case 'r':
-        range = atoi(optarg);
-        break;
-      case 's':
-        showsol = true;
-        break;
-      case 't':
-        nthreads = atoi(optarg);
-        break;
-    }
-  }
-#ifndef XWITHASH
-  if (sizeof(tree) > 4)
-    printf("WARNING: please compile with -DXWITHASH to shrink tree!\n");
-#endif
-#ifdef ATOMIC
-  if (nthreads==1)
-    printf("WARNING: use of atomics hurts single threaded performance!\n");
-#else
-  assert(nthreads==1);
-#endif
-  printf("Looking for wagner-tree on (\"%s\",%d", header, nonce);
-  if (range > 1)
-    printf("-%d", nonce+range-1);
-  printf(") with %d %d-bit digits and %d threads\n", NDIGITS, DIGITBITS, nthreads);
-  thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx));
-  assert(threads);
-  equi eq(nthreads);
-  printf("Using %dMB of memory\n", 1 + eq.hta.alloced / 0x100000);
-  u32 sumnsols = 0;
-  char headernonce[HEADERNONCELEN];
-  u32 hdrlen = strlen(header);
-  memcpy(headernonce, header, hdrlen);
-  memset(headernonce+hdrlen, 0, sizeof(headernonce)-hdrlen);
-  for (int r = 0; r < range; r++) {
-    ((u32 *)headernonce)[32] = htole32(nonce+r);
-    eq.setheadernonce(headernonce, sizeof(headernonce));
-    for (int t = 0; t < nthreads; t++) {
-      threads[t].id = t;
-      threads[t].eq = &eq;
-      int err = pthread_create(&threads[t].thread, NULL, worker, (void *)&threads[t]);
-      assert(err == 0);
-    }
-    for (int t = 0; t < nthreads; t++) {
-      int err = pthread_join(threads[t].thread, NULL);
-      assert(err == 0);
-    }
-    u32 nsols = 0;
-    for (unsigned s = 0; s < eq.nsols; s++) {
-      nsols++;
-      if (showsol) {
-        printf("\nSolution");
-        for (u32 i = 0; i < PROOFSIZE; i++)
-          printf(" %jx", (uintmax_t)eq.sols[s][i]);
-      }
-    }
-    printf("\n%d solutions\n", nsols);
-    sumnsols += nsols;
-  }
-  free(threads);
-  printf("%d total solutions\n", sumnsols);
-  return 0;
-}
diff --git a/equi_dev_miner.h b/equi_dev_miner.h
deleted file mode 100644
index dab2440..0000000
--- a/equi_dev_miner.h
+++ /dev/null
@@ -1,1025 +0,0 @@
-// Equihash solver
-// Copyright (c) 2016 John Tromp
-
-// This semi-development version uses Samuel Neves'
-// AVX2-optimized blake2bp code from https://github.com/sneves/blake2-avx2
-// modified to perform a 4-way parallel blake2b_final
-
-// Fix N, K, such that n = N/(k+1) is integer
-// Fix M = 2^{n+1} hashes each of length N bits,
-// H_0, ... , H_{M-1}, generated fom (n+1)-bit indices.
-// Problem: find binary tree on 2^K distinct indices, -// for which the exclusive-or of leaf hashes is all 0s. -// Additionally, it should satisfy the Wagner conditions: -// for each height i subtree, the exclusive-or -// of its 2^i corresponding hashes starts with i*n 0 bits, -// and for i>0 the leftmost leaf of its left subtree -// is less than the leftmost leaf of its right subtree - -// The algorithm below solves this by maintaining the tree -// in a graph of K layers, each split into buckets -// with buckets indexed by the first n-RESTBITS bits following -// the i*n 0s, each bucket having 4 * 2^RESTBITS slots, -// twice the number of subtrees expected to land there. - -#include "equi.h" -#include -#include -#include - -#include "blake2-avx2/blake2bip.h" - -#if defined __builtin_bswap32 && defined __LITTLE_ENDIAN -#undef htobe32 -#define htobe32(x) __builtin_bswap32(x) -#elif defined __APPLE__ -#undef htobe32 -#define htobe32(x) OSSwapHostToBigInt32(x) -#endif - -typedef uint16_t u16; -typedef uint64_t u64; - -#ifdef ATOMIC -#include -typedef std::atomic au32; -#else -typedef u32 au32; -#endif - -#ifndef RESTBITS -#define RESTBITS 8 -#endif - -// 2_log of number of buckets -#define BUCKBITS (DIGITBITS-RESTBITS) - -#ifndef SAVEMEM -#if RESTBITS == 4 -// can't save memory in such small buckets -#define SAVEMEM 1 -#elif RESTBITS >= 8 -// take advantage of law of large numbers (sum of 2^8 random numbers) -// this reduces (200,9) memory to under 144MB, with negligible discarding -#define SAVEMEM 9/14 -#endif -#endif - -#ifdef __AVX2__ -#define BLAKESINPARALLEL 4 -#elif defined __AVX__ -#define BLAKESINPARALLEL 2 -#else -#define BLAKESINPARALLEL 1 -#endif - -// number of buckets -static const u32 NBUCKETS = 1<> (2 * SLOTBITS - 1); -#else - return bid_s0_s1 >> (2 * SLOTBITS); -#endif - } - u32 slotid0() const { -#ifdef SLOTDIFF - return (bid_s0_s1 >> (SLOTBITS-1)) & SLOTMASK; -#else - return (bid_s0_s1 >> SLOTBITS) & SLOTMASK; -#endif - } - u32 slotid1() const { -#ifdef SLOTDIFF - return (slotid0() + 1 + (bid_s0_s1 & (SLOTMASK>>1))) & SLOTMASK; -#else - return bid_s0_s1 & SLOTMASK; -#endif - } -}; - -union htunit { - tree tag; - u32 word; - uchar bytes[sizeof(u32)]; -}; - -#define WORDS(bits) ((bits + 31) / 32) -#define HASHWORDS0 WORDS(WN - DIGITBITS + RESTBITS) -#define HASHWORDS1 WORDS(WN - 2*DIGITBITS + RESTBITS) - -// A slot is up to HASHWORDS0 hash units followed by a tag -typedef htunit slot0[HASHWORDS0+1]; -typedef htunit slot1[HASHWORDS1+1]; -// a bucket is NSLOTS treenodes -typedef slot0 bucket0[NSLOTS]; -typedef slot1 bucket1[NSLOTS]; -// the N-bit hash consists of K+1 n-bit "digits" -// each of which corresponds to a layer of NBUCKETS buckets -typedef bucket0 digit0[NBUCKETS]; -typedef bucket1 digit1[NBUCKETS]; -typedef au32 bsizes[NBUCKETS]; - -u32 min(const u32 a, const u32 b) { - return a < b ? a : b; -} - -// size (in bytes) of hash in round 0 <= r < WK -u32 hashsize(const u32 r) { - const u32 hashbits = WN - (r+1) * DIGITBITS + RESTBITS; - return (hashbits + 7) / 8; -} - -u32 hashwords(u32 bytes) { - return (bytes + 3) / 4; -} - -// manages hash and tree data -struct htalloc { - bucket0 *heap0; - bucket1 *heap1; - u32 alloced; - htalloc() { - alloced = 0; - } - void alloctrees() { -// optimize xenoncat's fixed memory layout, avoiding any waste -// digit hashes tree hashes tree -// 0 A A A A A A 0 . . . . . . 
-// 1 A A A A A A 0 B B B B B 1 -// 2 C C C C C 2 0 B B B B B 1 -// 3 C C C C C 2 0 D D D D 3 1 -// 4 E E E E 4 2 0 D D D D 3 1 -// 5 E E E E 4 2 0 F F F 5 3 1 -// 6 G G 6 . 4 2 0 F F F 5 3 1 -// 7 G G 6 . 4 2 0 H H 7 5 3 1 -// 8 I 8 6 . 4 2 0 H H 7 5 3 1 - assert(DIGITBITS >= 16); // ensures hashes shorten by 1 unit every 2 digits - heap0 = (bucket0 *)alloc(NBUCKETS, sizeof(bucket0)); - heap1 = (bucket1 *)alloc(NBUCKETS, sizeof(bucket1)); - } - void dealloctrees() { - free(heap0); - free(heap1); - } - void *alloc(const u32 n, const u32 sz) { - void *mem = calloc(n, sz); - assert(mem); - alloced += n * sz; - return mem; - } -}; - -struct equi { - blake2b_state blake_ctx; - htalloc hta; - bsizes *nslots; - proof *sols; - au32 nsols; - u32 nthreads; - u32 xfull; - u32 bfull; - u32 hfull; - pthread_barrier_t barry; - equi(const u32 n_threads) { - assert(sizeof(htunit) == 4); - assert(WK&1); // assumed in candidate() calling indices1() - nthreads = n_threads; - const int err = pthread_barrier_init(&barry, NULL, nthreads); - assert(!err); - hta.alloctrees(); - nslots = (bsizes *)hta.alloc(2 * NBUCKETS, sizeof(au32)); - sols = (proof *)hta.alloc(MAXSOLS, sizeof(proof)); - } - ~equi() { - hta.dealloctrees(); - free(nslots); - free(sols); - } - void setheadernonce(const char *headernonce, const u32 len) { - setheader(&blake_ctx, headernonce); - memset(nslots, 0, NBUCKETS * sizeof(au32)); // only nslots[0] needs zeroing - nsols = xfull = bfull = hfull = 0; - } - u32 getslot0(const u32 bucketi) { -#ifdef ATOMIC - return std::atomic_fetch_add_explicit(&nslots[0][bucketi], 1U, std::memory_order_relaxed); -#else - return nslots[0][bucketi]++; -#endif - } - u32 getslot1(const u32 bucketi) { -#ifdef ATOMIC - return std::atomic_fetch_add_explicit(&nslots[1][bucketi], 1U, std::memory_order_relaxed); -#else - return nslots[1][bucketi]++; -#endif - } - u32 getnslots0(const u32 bid) { - au32 &nslot = nslots[0][bid]; - const u32 n = min(nslot, NSLOTS); - nslot = 0; - return n; - } - u32 getnslots1(const u32 bid) { - au32 &nslot = nslots[1][bid]; - const u32 n = min(nslot, NSLOTS); - nslot = 0; - return n; - } -#ifdef MERGESORT - // if merged != 0, mergesort indices and return true if dupe found - // if merged == 0, order indices as in Wagner condition - bool orderindices(u32 *indices, u32 size, u32 *merged) { - if (merged) { - u32 i = 0, j = 0, k; - for (k = 0; i indices[size]) { - for (u32 i=0; i < size; i++) { - const u32 tmp = indices[i]; - indices[i] = indices[size+i]; - indices[size+i] = tmp; - } - } - return false; - } - } - // return true if dupe found - bool listindices0(u32 r, const tree t, u32 *indices, u32 *merged) { - if (r == 0) { - *indices = t.getindex(); - return false; - } - const slot1 *buck = hta.heap1[t.bucketid()]; - const u32 size = 1 << --r; - u32 *indices1 = indices + size; - u32 tagi = hashwords(hashsize(r)); - return listindices1(r, buck[t.slotid0()][tagi].tag, indices, merged) - || listindices1(r, buck[t.slotid1()][tagi].tag, indices1, merged) - || orderindices(indices, size, merged); - } - bool listindices1(u32 r, const tree t, u32 *indices, u32 *merged) { - const slot0 *buck = hta.heap0[t.bucketid()]; - const u32 size = 1 << --r; - u32 *indices1 = indices + size; - u32 tagi = hashwords(hashsize(r)); - return listindices0(r, buck[t.slotid0()][tagi].tag, indices, merged) - || listindices0(r, buck[t.slotid1()][tagi].tag, indices1, merged) - || orderindices(indices, size, merged); - } - void candidate(const tree t) { - proof prf, merged; - if (listindices1(WK, t, prf, merged)) return; 
-#ifdef ATOMIC - u32 soli = std::atomic_fetch_add_explicit(&nsols, 1U, std::memory_order_relaxed); -#else - u32 soli = nsols++; -#endif - if (soli < MAXSOLS) listindices1(WK, t, sols[soli], 0); - } -#else - bool orderindices(u32 *indices, u32 size) { - if (indices[0] > indices[size]) { - for (u32 i=0; i < size; i++) { - const u32 tmp = indices[i]; - indices[i] = indices[size+i]; - indices[size+i] = tmp; - } - } - return false; - } - // if dupes != 0, list indices in arbitrary order and return true if dupe found - // if dupes == 0, order indices as in Wagner condition - bool listindices0(u32 r, const tree t, u32 *indices, u32 *dupes) { - if (r == 0) { - u32 idx = t.getindex(); - if (dupes) { - u32 bin = idx & (PROOFSIZE-1); - if (idx == dupes[bin]) return true; - dupes[bin] = idx; - } - *indices = idx; - return false; - } - const slot1 *buck = hta.heap1[t.bucketid()]; - const u32 size = 1 << --r; - u32 tagi = hashwords(hashsize(r)); - return listindices1(r, buck[t.slotid0()][tagi].tag, indices, dupes) - || listindices1(r, buck[t.slotid1()][tagi].tag, indices+size, dupes) - || (!dupes && orderindices(indices, size)); - } - bool listindices1(u32 r, const tree t, u32 *indices, u32 *dupes) { - const slot0 *buck = hta.heap0[t.bucketid()]; - const u32 size = 1 << --r; - u32 tagi = hashwords(hashsize(r)); - return listindices0(r, buck[t.slotid0()][tagi].tag, indices, dupes) - || listindices0(r, buck[t.slotid1()][tagi].tag, indices+size, dupes) - || (!dupes && orderindices(indices, size)); - } - void candidate(const tree t) { - proof prf, dupes; - memset(dupes, 0xffff, sizeof(proof)); - if (listindices1(WK, t, prf, dupes)) return; // assume WK odd - qsort(prf, PROOFSIZE, sizeof(u32), &compu32); - for (u32 i=1; i> (SLOTBITS-6); - binsizes[bsize]++; - } - for (u32 i=0; i < 65; i++) { -#ifdef HIST - printf(" %d:%d", i, binsizes[i]); -#else -#ifdef SPARK - u32 sparks = binsizes[i] / SPARKSCALE; -#else - u32 sparks = 0; - for (u32 bs = binsizes[i]; bs; bs >>= 1) sparks++; - sparks = sparks * 7 / SPARKSCALE; -#endif - printf("\342\226%c", '\201' + sparks); -#endif - } - printf("\n"); -#endif - printf("Digit %d", r+1); - } - - struct htlayout { - htalloc hta; - u32 prevhtunits; - u32 nexthtunits; - u32 dunits; - u32 prevbo; - - htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), dunits(0) { - u32 nexthashbytes = hashsize(r); - nexthtunits = hashwords(nexthashbytes); - prevbo = 0; - if (r) { - u32 prevhashbytes = hashsize(r-1); - prevhtunits = hashwords(prevhashbytes); - prevbo = prevhtunits * sizeof(htunit) - prevhashbytes; // 0-3 - dunits = prevhtunits - nexthtunits; - } - } - u32 getxhash0(const htunit* slot) const { -#if WN == 200 && RESTBITS == 4 - return slot->bytes[prevbo] >> 4; -#elif WN == 200 && RESTBITS == 8 - return (slot->bytes[prevbo] & 0xf) << 4 | slot->bytes[prevbo+1] >> 4; -#elif WN == 144 && RESTBITS == 4 - return slot->bytes[prevbo] & 0xf; -#else -#error non implemented -#endif - } - u32 getxhash1(const htunit* slot) const { -#if WN == 200 && RESTBITS == 4 - return slot->bytes[prevbo] & 0xf; -#elif WN == 200 && RESTBITS == 8 - return slot->bytes[prevbo]; -#elif WN == 144 && RESTBITS == 4 - return slot->bytes[prevbo] & 0xf; -#else -#error non implemented -#endif - } - bool equal(const htunit *hash0, const htunit *hash1) const { - return hash0[prevhtunits-1].word == hash1[prevhtunits-1].word; - } - }; - - struct collisiondata { -#ifdef XBITMAP -#if NSLOTS > 64 -#error cant use XBITMAP with more than 64 slots -#endif - u64 xhashmap[NRESTS]; - u64 xmap; -#else -#if RESTBITS <= 6 - 
typedef uchar xslot; -#else - typedef u16 xslot; -#endif - xslot nxhashslots[NRESTS]; - xslot xhashslots[NRESTS][XFULL]; - xslot *xx; - u32 n0; - u32 n1; -#endif - u32 s0; - - void clear() { -#ifdef XBITMAP - memset(xhashmap, 0, NRESTS * sizeof(u64)); -#else - memset(nxhashslots, 0, NRESTS * sizeof(xslot)); -#endif - } - bool addslot(u32 s1, u32 xh) { -#ifdef XBITMAP - xmap = xhashmap[xh]; - xhashmap[xh] |= (u64)1 << s1; - s0 = -1; - return true; -#else - n1 = (u32)nxhashslots[xh]++; - if (n1 >= XFULL) - return false; - xx = xhashslots[xh]; - xx[n1] = s1; - n0 = 0; - return true; -#endif - } - bool nextcollision() const { -#ifdef XBITMAP - return xmap != 0; -#else - return n0 < n1; -#endif - } - u32 slot() { -#ifdef XBITMAP - const u32 ffs = __builtin_ffsll(xmap); - s0 += ffs; xmap >>= ffs; - return s0; -#else - return (u32)xx[n0++]; -#endif - } - }; - - void digit0(const u32 id) { - htlayout htl(this, 0); -#ifndef HASHONLY - const u32 hashbytes = hashsize(0); -#endif - uchar hashes[BLAKESINPARALLEL * 64]; - blake2b_state state = blake_ctx; - for (u32 block = id; block < NBLOCKS; block += nthreads) { - blake2bip_final(&state, hashes, block); -#ifndef HASHONLY - for (u32 i = 0; i> 4; - const u32 slot = getslot0(bucketid); - if (slot >= NSLOTS) { - bfull++; - continue; - } - htunit *s = hta.heap0[bucketid][slot] + htl.nexthtunits; - memcpy(s->bytes-hashbytes, ph+WN/8-hashbytes, hashbytes); - s->tag = tree((block * BLAKESINPARALLEL + i) * HASHESPERBLAKE + j); - } - } -#endif - } - } - - void digitodd(const u32 r, const u32 id) { - htlayout htl(this, r); - collisiondata cd; - for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += nthreads) { - cd.clear(); - slot0 *buck = htl.hta.heap0[bucketid]; - u32 bsize = getnslots0(bucketid); - for (u32 s1 = 0; s1 < bsize; s1++) { - const htunit *slot1 = buck[s1]; - if (!cd.addslot(s1, htl.getxhash0(slot1))) { - xfull++; - continue; - } - for (; cd.nextcollision(); ) { - const u32 s0 = cd.slot(); - const htunit *slot0 = buck[s0]; - if (htl.equal(slot0, slot1)) { - hfull++; - continue; - } - u32 xorbucketid; - const uchar *bytes0 = slot0->bytes, *bytes1 = slot1->bytes; -#if WN == 200 && BUCKBITS == 12 && RESTBITS == 8 - xorbucketid = (((u32)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) & 0xf) << 8) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]); -#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4 - xorbucketid = ((((u32)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4) - | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4; -#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4 - xorbucketid = ((u32)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; -#else -#error not implemented -#endif - const u32 xorslot = getslot1(xorbucketid); - if (xorslot >= NSLOTS) { - bfull++; - continue; - } - htunit *xs = htl.hta.heap1[xorbucketid][xorslot]; - for (u32 i=htl.dunits; i < htl.prevhtunits; i++) - xs++->word = slot0[i].word ^ slot1[i].word; - xs->tag = tree(bucketid, s0, s1); - } - } - } - } - - void digiteven(const u32 r, const u32 id) { - htlayout htl(this, r); - collisiondata cd; - for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += nthreads) { - cd.clear(); - slot1 *buck = htl.hta.heap1[bucketid]; - u32 bsize = getnslots1(bucketid); - for (u32 s1 = 0; s1 < bsize; s1++) { - const htunit *slot1 = buck[s1]; - if (!cd.addslot(s1, htl.getxhash1(slot1))) { - xfull++; - continue; - } - for (; cd.nextcollision(); ) { - const u32 s0 = cd.slot(); - const 
htunit *slot0 = buck[s0]; - if (htl.equal(slot0, slot1)) { - hfull++; - continue; - } - u32 xorbucketid; - const uchar *bytes0 = slot0->bytes, *bytes1 = slot1->bytes; -#if WN == 200 && BUCKBITS == 12 && RESTBITS == 8 - xorbucketid = ((u32)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; -#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4 - xorbucketid = ((((u32)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4) - | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4; -#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4 - xorbucketid = ((u32)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; -#else -#error not implemented -#endif - const u32 xorslot = getslot0(xorbucketid); - if (xorslot >= NSLOTS) { - bfull++; - continue; - } - htunit *xs = htl.hta.heap0[xorbucketid][xorslot]; - for (u32 i=htl.dunits; i < htl.prevhtunits; i++) - xs++->word = slot0[i].word ^ slot1[i].word; - xs->tag = tree(bucketid, s0, s1); - } - } - } - } - - void digit1(const u32 id) { - htalloc heaps = hta; - collisiondata cd; - for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += nthreads) { - cd.clear(); - slot0 *buck = heaps.heap0[bucketid]; - u32 bsize = getnslots0(bucketid); - for (u32 s1 = 0; s1 < bsize; s1++) { - const htunit *slot1 = buck[s1]; - if (!cd.addslot(s1, htobe32(slot1->word) >> 20 & 0xff)) { - xfull++; - continue; - } - for (; cd.nextcollision(); ) { - const u32 s0 = cd.slot(); - const htunit *slot0 = buck[s0]; - if (slot0[5].word == slot1[5].word) { - hfull++; - continue; - } - u32 xorbucketid = htobe32(slot0->word ^ slot1->word) >> 8 & BUCKMASK; - const u32 xorslot = getslot1(xorbucketid); - if (xorslot >= NSLOTS) { - bfull++; - continue; - } - u64 *x = (u64 *)heaps.heap1[xorbucketid][xorslot]; - u64 *x0 = (u64 *)slot0, *x1 = (u64 *)slot1; - *x++ = x0[0] ^ x1[0]; - *x++ = x0[1] ^ x1[1]; - *x++ = x0[2] ^ x1[2]; - ((htunit *)x)->tag = tree(bucketid, s0, s1); - } - } - } - } - void digit2(const u32 id) { - htalloc heaps = hta; - collisiondata cd; - for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += nthreads) { - cd.clear(); - slot1 *buck = heaps.heap1[bucketid]; - u32 bsize = getnslots1(bucketid); - for (u32 s1 = 0; s1 < bsize; s1++) { - const htunit *slot1 = buck[s1]; - if (!cd.addslot(s1, slot1->bytes[3])) { - xfull++; - continue; - } - for (; cd.nextcollision(); ) { - const u32 s0 = cd.slot(); - const htunit *slot0 = buck[s0]; - if (slot0[5].word == slot1[5].word) { - hfull++; - continue; - } - u32 xorbucketid = htobe32(slot0[1].word ^ slot1[1].word) >> 20; - const u32 xorslot = getslot0(xorbucketid); - if (xorslot >= NSLOTS) { - bfull++; - continue; - } - htunit *xs = heaps.heap0[xorbucketid][xorslot]; - xs++->word = slot0[1].word ^ slot1[1].word; - u64 *x = (u64 *)xs, *x0 = (u64 *)slot0, *x1 = (u64 *)slot1; - *x++ = x0[1] ^ x1[1]; - *x++ = x0[2] ^ x1[2]; - ((htunit *)x)->tag = tree(bucketid, s0, s1); - } - } - } - } - void digit3(const u32 id) { - htalloc heaps = hta; - collisiondata cd; - for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += nthreads) { - cd.clear(); - slot0 *buck = heaps.heap0[bucketid]; - u32 bsize = getnslots0(bucketid); - for (u32 s1 = 0; s1 < bsize; s1++) { - const htunit *slot1 = buck[s1]; - if (!cd.addslot(s1, htobe32(slot1->word) >> 12 & 0xff)) { - xfull++; - continue; - } - for (; cd.nextcollision(); ) { - const u32 s0 = cd.slot(); - const htunit *slot0 = buck[s0]; - if (slot0[4].word == 
slot1[4].word) { - hfull++; - continue; - } - u32 xorbucketid = htobe32(slot0[0].word ^ slot1[0].word) & BUCKMASK; - const u32 xorslot = getslot1(xorbucketid); - if (xorslot >= NSLOTS) { - bfull++; - continue; - } - u64 *x = (u64 *)heaps.heap1[xorbucketid][xorslot]; - u64 *x0 = (u64 *)(slot0+1), *x1 = (u64 *)(slot1+1); - *x++ = x0[0] ^ x1[0]; - *x++ = x0[1] ^ x1[1]; - ((htunit *)x)->tag = tree(bucketid, s0, s1); - } - } - } - } - void digit4(const u32 id) { - htalloc heaps = hta; - collisiondata cd; - for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += nthreads) { - cd.clear(); - slot1 *buck = heaps.heap1[bucketid]; - u32 bsize = getnslots1(bucketid); - for (u32 s1 = 0; s1 < bsize; s1++) { - const htunit *slot1 = buck[s1]; - if (!cd.addslot(s1, slot1->bytes[0])) { - xfull++; - continue; - } - for (; cd.nextcollision(); ) { - const u32 s0 = cd.slot(); - const htunit *slot0 = buck[s0]; - if (slot0[3].word == slot1[3].word) { - hfull++; - continue; - } - u32 xorbucketid = htobe32(slot0[0].word ^ slot1[0].word) >> 12 & BUCKMASK; - const u32 xorslot = getslot0(xorbucketid); - if (xorslot >= NSLOTS) { - bfull++; - continue; - } - u64 *x = (u64 *)heaps.heap0[xorbucketid][xorslot]; - u64 *x0 = (u64 *)slot0, *x1 = (u64 *)slot1; - *x++ = x0[0] ^ x1[0]; - *x++ = x0[1] ^ x1[1]; - ((htunit *)x)->tag = tree(bucketid, s0, s1); - } - } - } - } - void digit5(const u32 id) { - htalloc heaps = hta; - collisiondata cd; - for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += nthreads) { - cd.clear(); - slot0 *buck = heaps.heap0[bucketid]; - u32 bsize = getnslots0(bucketid); - for (u32 s1 = 0; s1 < bsize; s1++) { - const htunit *slot1 = buck[s1]; - if (!cd.addslot(s1, htobe32(slot1->word) >> 4 & 0xff)) { - xfull++; - continue; - } - for (; cd.nextcollision(); ) { - const u32 s0 = cd.slot(); - const htunit *slot0 = buck[s0]; - if (slot0[3].word == slot1[3].word) { - hfull++; - continue; - } - u32 xor1 = slot0[1].word ^ slot1[1].word; - u32 xorbucketid = (((u32)(slot0->bytes[3] ^ slot1->bytes[3]) & 0xf) - << 8) | (xor1 & 0xff); - const u32 xorslot = getslot1(xorbucketid); - if (xorslot >= NSLOTS) { - bfull++; - continue; - } - htunit *xs = heaps.heap1[xorbucketid][xorslot]; - xs++->word = xor1; - u64 *x = (u64 *)xs, *x0 = (u64 *)slot0, *x1 = (u64 *)slot1; - *x++ = x0[1] ^ x1[1]; - ((htunit *)x)->tag = tree(bucketid, s0, s1); - } - } - } - } - void digit6(const u32 id) { - htalloc heaps = hta; - collisiondata cd; - for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += nthreads) { - cd.clear(); - slot1 *buck = heaps.heap1[bucketid]; - u32 bsize = getnslots1(bucketid); - for (u32 s1 = 0; s1 < bsize; s1++) { - const htunit *slot1 = buck[s1]; - if (!cd.addslot(s1, slot1->bytes[1])) { - xfull++; - continue; - } - for (; cd.nextcollision(); ) { - const u32 s0 = cd.slot(); - const htunit *slot0 = buck[s0]; - if (slot0[2].word == slot1[2].word) { - hfull++; - continue; - } - u32 xorbucketid = htobe32(slot0[0].word ^ slot1[0].word) >> 4 & BUCKMASK; - const u32 xorslot = getslot0(xorbucketid); - if (xorslot >= NSLOTS) { - bfull++; - continue; - } - htunit *xs = heaps.heap0[xorbucketid][xorslot]; - xs++->word = slot0[0].word ^ slot1[0].word; - u64 *x = (u64 *)xs, *x0 = (u64 *)(slot0+1), *x1 = (u64 *)(slot1+1); - *x++ = x0[0] ^ x1[0]; - ((htunit *)x)->tag = tree(bucketid, s0, s1); - } - } - } - } - void digit7(const u32 id) { - htalloc heaps = hta; - collisiondata cd; - for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += nthreads) { - cd.clear(); - slot0 *buck = heaps.heap0[bucketid]; - u32 bsize = 
getnslots0(bucketid); - for (u32 s1 = 0; s1 < bsize; s1++) { - const htunit *slot1 = buck[s1]; - if (!cd.addslot(s1, (slot1->bytes[3] & 0xf) << 4 | slot1->bytes[4] >> 4)) { - xfull++; - continue; - } - for (; cd.nextcollision(); ) { - const u32 s0 = cd.slot(); - const htunit *slot0 = buck[s0]; - if (slot0[2].word == slot1[2].word) { - hfull++; - continue; - } - u32 xorbucketid = htobe32(slot0[1].word ^ slot1[1].word) >> 16 & BUCKMASK; - const u32 xorslot = getslot1(xorbucketid); - if (xorslot >= NSLOTS) { - bfull++; - continue; - } - u64 *x = (u64 *)heaps.heap1[xorbucketid][xorslot]; - u64 *x0 = (u64 *)(slot0+1), *x1 = (u64 *)(slot1+1); - *x++ = x0[0] ^ x1[0]; - ((htunit *)x)->tag = tree(bucketid, s0, s1); - } - } - } - } - void digit8(const u32 id) { - htalloc heaps = hta; - collisiondata cd; - for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += nthreads) { - cd.clear(); - slot1 *buck = heaps.heap1[bucketid]; - u32 bsize = getnslots1(bucketid); - for (u32 s1 = 0; s1 < bsize; s1++) { - const htunit *slot1 = buck[s1]; - if (!cd.addslot(s1, slot1->bytes[2])) { - xfull++; - continue; - } - for (; cd.nextcollision(); ) { - const u32 s0 = cd.slot(); - const htunit *slot0 = buck[s0]; - u32 xor1 = slot0[1].word ^ slot1[1].word; - if (!xor1) { - hfull++; - continue; - } - u32 xorbucketid = ((u32)(slot0->bytes[3] ^ slot1->bytes[3]) << 4) - | (xor1 >> 4 & 0xf); - const u32 xorslot = getslot0(xorbucketid); - if (xorslot >= NSLOTS) { - bfull++; - continue; - } - htunit *xs = heaps.heap0[xorbucketid][xorslot]; - xs++->word = xor1; - xs->tag = tree(bucketid, s0, s1); - } - } - } - } - - void digitK(const u32 id) { - collisiondata cd; - htlayout htl(this, WK); - u32 nc = 0; - for (u32 bucketid = id; bucketid < NBUCKETS; bucketid += nthreads) { - cd.clear(); - slot0 *buck = htl.hta.heap0[bucketid]; - u32 bsize = getnslots0(bucketid); - for (u32 s1 = 0; s1 < bsize; s1++) { - const htunit *slot1 = buck[s1]; - if (!cd.addslot(s1, htl.getxhash0(slot1))) // assume WK odd - continue; - for (; cd.nextcollision(); ) { - const u32 s0 = cd.slot(); - if (htl.equal(buck[s0], slot1)) { // EASY OPTIMIZE - candidate(tree(bucketid, s0, s1)); - nc++; - } - } - } - } - printf(" %d candidates ", nc); - } -}; - -typedef struct { - u32 id; - pthread_t thread; - equi *eq; -} thread_ctx; - -void barrier(pthread_barrier_t *barry) { - const int rc = pthread_barrier_wait(barry); - if (rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD) { - printf("Could not wait on barrier\n"); - pthread_exit(NULL); - } -} - -void *worker(void *vp) { - thread_ctx *tp = (thread_ctx *)vp; - equi *eq = tp->eq; - - if (tp->id == 0) printf("Digit 0"); - eq->digit0(tp->id); -#ifdef HASHONLY - pthread_exit(NULL); -#endif - barrier(&eq->barry); - if (tp->id == 0) eq->showbsizes(0); - barrier(&eq->barry); -#if WN == 200 && WK == 9 && RESTBITS == 8 - eq->digit1(tp->id); - barrier(&eq->barry); - if (tp->id == 0) eq->showbsizes(1); - barrier(&eq->barry); - eq->digit2(tp->id); - barrier(&eq->barry); - if (tp->id == 0) eq->showbsizes(2); - barrier(&eq->barry); - eq->digit3(tp->id); - barrier(&eq->barry); - if (tp->id == 0) eq->showbsizes(3); - barrier(&eq->barry); - eq->digit4(tp->id); - barrier(&eq->barry); - if (tp->id == 0) eq->showbsizes(4); - barrier(&eq->barry); - eq->digit5(tp->id); - barrier(&eq->barry); - if (tp->id == 0) eq->showbsizes(5); - barrier(&eq->barry); - eq->digit6(tp->id); - barrier(&eq->barry); - if (tp->id == 0) eq->showbsizes(6); - barrier(&eq->barry); - eq->digit7(tp->id); - barrier(&eq->barry); - if (tp->id == 0) eq->showbsizes(7); - 
-  barrier(&eq->barry);
-  eq->digit8(tp->id);
-  barrier(&eq->barry);
-  if (tp->id == 0) eq->showbsizes(8);
-  barrier(&eq->barry);
-#else
-  for (u32 r = 1; r < WK; r++) {
-    r&1 ? eq->digitodd(r, tp->id) : eq->digiteven(r, tp->id);
-    barrier(&eq->barry);
-    if (tp->id == 0) eq->showbsizes(r);
-    barrier(&eq->barry);
-  }
-#endif
-  eq->digitK(tp->id);
-  pthread_exit(NULL);
-  return 0;
-}
diff --git a/equi_miner.cpp b/equi_miner.cpp
index 46bf3f7..31d0917 100644
--- a/equi_miner.cpp
+++ b/equi_miner.cpp
@@ -48,7 +48,7 @@ int main(int argc, char **argv) {
   assert(threads);
   equi eq(nthreads);
   printf("Using %dMB of memory", 1 + eq.hta.alloced / 0x100000);
-#ifdef __AVX2__
+#ifdef USE_AVX2
   printf(" and AVX2 intrinsics to compute 4-way blake2b\n");
 #else
   printf("; no AVX2 detected\n");
diff --git a/equi_miner.h b/equi_miner.h
index a823558..2d34e9d 100644
--- a/equi_miner.h
+++ b/equi_miner.h
@@ -564,7 +564,7 @@ struct equi {
   }
 };
 
-#ifdef __AVX2__
+#ifdef USE_AVX2
 static const u32 BLAKESINPARALLEL = 4;
 #else
 static const u32 BLAKESINPARALLEL = 1;
@@ -580,7 +580,7 @@ static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
     uchar hashes[BLAKESINPARALLEL * 64];
     blake2b_state state0 = blake_ctx;
     for (u32 block = id; block < NBLOCKS; block += nthreads) {
-#ifdef __AVX2__
+#ifdef USE_AVX2
       blake2bip_final(&state0, hashes, block);
 #else
       blake2b_state state = state0;
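
The following is a minimal, self-contained C++ sketch (not part of the patch) of the compile-time dispatch that the new eqavx2/eqavx21 targets enable by defining USE_AVX2: the AVX2 build produces BLAKESINPARALLEL = 4 blake2b digests per block via blake2bip_final from blake2-avx2/blake2bip.c, while the portable equi/equi1 build hashes one input at a time. The functions demo_blake2b_single and demo_blake2bip_4way are hypothetical placeholders standing in for blake/blake2b.cpp and blake2-avx2/blake2bip.c; only the #ifdef USE_AVX2 / BLAKESINPARALLEL structure mirrors the hunks above.

// Hedged sketch: compile-time selection of the blake2b batch width, mirroring
// the USE_AVX2 hunks in equi_miner.h above; demo_* functions are placeholders,
// not the real blake2b/blake2bip implementations.
#include <cstdint>
#include <cstdio>
#include <cstring>

#ifdef USE_AVX2
static const uint32_t BLAKESINPARALLEL = 4;  // eqavx2/eqavx21: 4-way blake2b per block
#else
static const uint32_t BLAKESINPARALLEL = 1;  // equi/equi1: one blake2b per block
#endif

// placeholder for blake/blake2b.cpp: "hash" one index into a 64-byte digest
static void demo_blake2b_single(uint8_t *out64, uint32_t idx) {
  memset(out64, 0, 64);
  memcpy(out64, &idx, sizeof idx);
}

// placeholder for blake2-avx2/blake2bip.c: 4 digests for indices 4*block .. 4*block+3
static void demo_blake2bip_4way(uint8_t *out4x64, uint32_t block) {
  for (uint32_t i = 0; i < 4; i++)
    demo_blake2b_single(out4x64 + 64 * i, 4 * block + i);
}

// one digit0-style hashing step for a given block of indices
static void hashblock(uint8_t *hashes /* BLAKESINPARALLEL*64 bytes */, uint32_t block) {
#ifdef USE_AVX2
  demo_blake2bip_4way(hashes, block);      // one call yields 4 digests
#else
  for (uint32_t i = 0; i < BLAKESINPARALLEL; i++)
    demo_blake2b_single(hashes + 64 * i, block * BLAKESINPARALLEL + i);
#endif
}

int main() {
  uint8_t hashes[4 * 64];
  hashblock(hashes, 0);
  printf("BLAKESINPARALLEL = %u, digest0 byte0 = %u\n",
         (unsigned)BLAKESINPARALLEL, (unsigned)hashes[0]);
  return 0;
}

Under these assumptions the sketch compiles either way, e.g. g++ -std=c++11 -DUSE_AVX2 sketch.cpp; the actual miners are built with make eqavx2 or make eqavx21 and invoked as in the README's performance summary, e.g. ./eqavx2 -t 8.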