change 2nd stage bucketsort to slot linking

This commit is contained in:
John Tromp 2016-10-27 13:17:41 -04:00
parent 7f5063ba72
commit fc72754ded
7 changed files with 123 additions and 195 deletions

View File

@ -4,11 +4,11 @@ GPP = g++ -march=native -m64 -std=c++11 $(FLAGS)
all: equi equi1 verify test spark test1445
equi: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi
equi: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
$(GPP) -DATOMIC -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equi
equi1: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp -o equi1
equi1: equi.h equi_miner.h equi_miner.cpp blake2-avx2/blake2bip.c Makefile
$(GPP) -DUNROLL equi_miner.cpp blake/blake2b.cpp blake2-avx2/blake2bip.c -o equi1
equi1g: equi.h equi_miner.h equi_miner.cpp Makefile
g++ -g -std=c++11 -DLOGSPARK -DSPARKSCALE=11 equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g

View File

@ -298,32 +298,23 @@ ALIGN(64) static const uint32_t indices[12][16] = {
} while(0)
void blake2bip_final(const blake2b_state *S, uchar *out, u32 blockidx) {
__m256i v[16], s[8], iv[8], w[16]; // 16 * 32, 8 * 32
union {
__m256i v;
uint64_t w[4];
} counter;
uint32_t b;
size_t inlen;
int i, r;
__m256i v[16], s[8], iv[8], w[16], counter, flag;
uint32_t b, i, r;
ALIGN(64) uint8_t buffer[4 * BLAKE2B_BLOCKBYTES]; // 4 * 128
memset(buffer, 0, 4 * BLAKE2B_BLOCKBYTES); // zero whole buffer
memset(buffer, 0, 4 * BLAKE2B_BLOCKBYTES);
for (i = 0; i < 4; i++) {
memcpy(buffer+128*i, S->buf, S->buflen);
b = htole32(4 * blockidx + i);
memcpy(buffer+128*i + S->buflen, &b, 4);
}
inlen = S->buflen + 4;
for(i = 0; i < 8; ++i) {
v[i] = _mm256_set1_epi64x(S->h[i]); // initialize all 256/64 = 4 lanes
v[i] = _mm256_set1_epi64x(S->h[i]);
}
__m256i x4 = _mm256_set1_epi64x(128+inlen);
__m256i f0 = _mm256_set1_epi64x(~0);
counter.v = _mm256_add_epi64(counter.v, x4);
counter = _mm256_set1_epi64x(128 + S->buflen + 4);
flag = _mm256_set1_epi64x(~0);
for(i = 0; i < 8; ++i) {
iv[i] = v[i];
@ -332,9 +323,9 @@ void blake2bip_final(const blake2b_state *S, uchar *out, u32 blockidx) {
v[ 9] = _mm256_set1_epi64x(blake2b_IV[1]);
v[10] = _mm256_set1_epi64x(blake2b_IV[2]);
v[11] = _mm256_set1_epi64x(blake2b_IV[3]);
v[12] = XOR(_mm256_set1_epi64x(blake2b_IV[4]), counter.v);
v[12] = XOR(_mm256_set1_epi64x(blake2b_IV[4]), counter);
v[13] = _mm256_set1_epi64x(blake2b_IV[5]);
v[14] = XOR(_mm256_set1_epi64x(blake2b_IV[6]), f0);
v[14] = XOR(_mm256_set1_epi64x(blake2b_IV[6]), flag);
v[15] = _mm256_set1_epi64x(blake2b_IV[7]);
BLAKE2B_LOADMSG_V4(w, buffer);
for(r = 0; r < 12; ++r) {
@ -346,7 +337,5 @@ void blake2bip_final(const blake2b_state *S, uchar *out, u32 blockidx) {
BLAKE2B_UNPACK_STATE_V4(s, v);
// big loop obsoleted as it would if(!inlen) break
memcpy(out, s, 256); // instead of return blake2b_root(out, (void *)s);
memcpy(out, s, 256);
}

View File

@ -101,8 +101,6 @@ static const u32 SLOTRANGE = 1<<SLOTBITS;
static const u32 SLOTMSB = 1<<(SLOTBITS-1);
// number of slots per bucket
static const u32 NSLOTS = SLOTRANGE * SAVEMEM;
// number of per-xhash slots
static const u32 XFULL = 16;
// SLOTBITS mask
static const u32 SLOTMASK = SLOTRANGE-1;
// number of possible values of xhash (rest of n) bits
@ -239,7 +237,6 @@ struct equi {
proof *sols;
au32 nsols;
u32 nthreads;
u32 xfull;
u32 bfull;
u32 hfull;
pthread_barrier_t barry;
@ -266,7 +263,7 @@ struct equi {
Blake2PrepareMidstate4(midstate, alignheader);
memcpy(&blake_ctx, midstate, 256);
memset(nslots, 0, NBUCKETS * sizeof(au32)); // only nslots[0] needs zeroing
nsols = xfull = bfull = hfull = 0;
nsols = bfull = hfull = 0;
}
u32 getslot0(const u32 bucketi) {
#ifdef ATOMIC
@ -405,8 +402,8 @@ struct equi {
}
#endif
void showbsizes(u32 r) {
printf(" x%d b%d h%d\n", xfull, bfull, hfull);
xfull = bfull = hfull = 0;
printf(" b%d h%d\n", bfull, hfull);
bfull = hfull = 0;
#if defined(HIST) || defined(SPARK) || defined(LOGSPARK)
u32 binsizes[65];
memset(binsizes, 0, 65 * sizeof(u32));
@ -491,11 +488,10 @@ struct equi {
#else
typedef u16 xslot;
#endif
xslot nxhashslots[NRESTS];
xslot xhashslots[NRESTS][XFULL];
xslot *xx;
u32 n0;
u32 n1;
static const xslot xnil = ~0;
xslot xhashslots[NRESTS];
xslot nextxhashslot[NSLOTS];
xslot nextslot;
#endif
u32 s0;
@ -503,40 +499,36 @@ struct equi {
#ifdef XBITMAP
memset(xhashmap, 0, NRESTS * sizeof(u64));
#else
memset(nxhashslots, 0, NRESTS * sizeof(xslot));
memset(xhashslots, xnil, NRESTS * sizeof(xslot));
memset(nextxhashslot, xnil, NSLOTS * sizeof(xslot));
#endif
}
bool addslot(u32 s1, u32 xh) {
void addslot(u32 s1, u32 xh) {
#ifdef XBITMAP
xmap = xhashmap[xh];
xhashmap[xh] |= (u64)1 << s1;
s0 = -1;
return true;
#else
n1 = (u32)nxhashslots[xh]++;
if (n1 >= XFULL)
return false;
xx = xhashslots[xh];
xx[n1] = s1;
n0 = 0;
return true;
nextslot = xhashslots[xh];
nextxhashslot[s1] = nextslot;
xhashslots[xh] = s1;
#endif
}
bool nextcollision() const {
#ifdef XBITMAP
return xmap != 0;
#else
return n0 < n1;
return nextslot != xnil;
#endif
}
u32 slot() {
#ifdef XBITMAP
const u32 ffs = __builtin_ffsll(xmap);
s0 += ffs; xmap >>= ffs;
return s0;
#else
return (u32)xx[n0++];
nextslot = nextxhashslot[s0 = nextslot];
#endif
return s0;
}
};
@ -578,10 +570,7 @@ struct equi {
u32 bsize = getnslots0(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, htl.getxhash0(slot1))) {
xfull++;
continue;
}
cd.addslot(s1, htl.getxhash0(slot1));
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -627,10 +616,7 @@ struct equi {
u32 bsize = getnslots1(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, htl.getxhash1(slot1))) {
xfull++;
continue;
}
cd.addslot(s1, htl.getxhash1(slot1));
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -676,10 +662,7 @@ struct equi {
u32 bsize = getnslots0(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, htobe32(slot1->word) >> 20 & 0xff)) {
xfull++;
continue;
}
cd.addslot(s1, htobe32(slot1->word) >> 20 & 0xff);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -712,10 +695,7 @@ struct equi {
u32 bsize = getnslots1(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, slot1->bytes[3])) {
xfull++;
continue;
}
cd.addslot(s1, slot1->bytes[3]);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -748,10 +728,7 @@ struct equi {
u32 bsize = getnslots0(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, htobe32(slot1->word) >> 12 & 0xff)) {
xfull++;
continue;
}
cd.addslot(s1, htobe32(slot1->word) >> 12 & 0xff);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -783,10 +760,7 @@ struct equi {
u32 bsize = getnslots1(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, slot1->bytes[0])) {
xfull++;
continue;
}
cd.addslot(s1, slot1->bytes[0]);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -818,10 +792,7 @@ struct equi {
u32 bsize = getnslots0(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, htobe32(slot1->word) >> 4 & 0xff)) {
xfull++;
continue;
}
cd.addslot(s1, htobe32(slot1->word) >> 4 & 0xff);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -855,10 +826,7 @@ struct equi {
u32 bsize = getnslots1(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, slot1->bytes[1])) {
xfull++;
continue;
}
cd.addslot(s1, slot1->bytes[1]);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -890,10 +858,7 @@ struct equi {
u32 bsize = getnslots0(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, (slot1->bytes[3] & 0xf) << 4 | slot1->bytes[4] >> 4)) {
xfull++;
continue;
}
cd.addslot(s1, (slot1->bytes[3] & 0xf) << 4 | slot1->bytes[4] >> 4);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -924,10 +889,7 @@ struct equi {
u32 bsize = getnslots1(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, slot1->bytes[2])) {
xfull++;
continue;
}
cd.addslot(s1, slot1->bytes[2]);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -961,8 +923,7 @@ struct equi {
u32 bsize = getnslots0(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, htl.getxhash0(slot1))) // assume WK odd
continue;
cd.addslot(s1, htl.getxhash0(slot1)); // assume WK odd
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
if (htl.equal(buck[s0], slot1)) { // EASY OPTIMIZE

View File

@ -70,7 +70,7 @@ typedef u32 au32;
#define BLAKESINPARALLEL 2
#else
#define BLAKESINPARALLEL 1
#endif
#endif
// number of buckets
static const u32 NBUCKETS = 1<<BUCKBITS;

View File

@ -47,7 +47,12 @@ int main(int argc, char **argv) {
thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx));
assert(threads);
equi eq(nthreads);
printf("Using %dMB of memory\n", 1 + eq.hta.alloced / 0x100000);
printf("Using %dMB of memory", 1 + eq.hta.alloced / 0x100000);
#ifdef __AVX2__
printf(" and AVX2 intrinsics to compute 4-way blake2b\n");
#else
printf("; no AVX2 detected\n");
#endif
u32 sumnsols = 0;
char headernonce[HEADERNONCELEN];
u32 hdrlen = strlen(header);

View File

@ -45,8 +45,6 @@ static const u32 SLOTBITS = RESTBITS+1+1;
static const u32 SLOTRANGE = 1<<SLOTBITS;
// number of slots per bucket
static const u32 NSLOTS = SLOTRANGE * SAVEMEM;
// number of per-xhash slots
static const u32 XFULL = 16;
// SLOTBITS mask
static const u32 SLOTMASK = SLOTRANGE-1;
// number of possible values of xhash (rest of n) bits
@ -399,11 +397,10 @@ struct equi {
#else
typedef u16 xslot;
#endif
xslot nxhashslots[NRESTS];
xslot xhashslots[NRESTS][XFULL];
xslot *xx;
u32 n0;
u32 n1;
static const xslot xnil = ~0;
xslot xhashslots[NRESTS];
xslot nextxhashslot[NSLOTS];
xslot nextslot;
#endif
u32 s0;
@ -411,7 +408,8 @@ struct equi {
#ifdef XBITMAP
memset(xhashmap, 0, NRESTS * sizeof(u64));
#else
memset(nxhashslots, 0, NRESTS * sizeof(xslot));
memset(xhashslots, xnil, NRESTS * sizeof(xslot));
memset(nextxhashslot, xnil, NSLOTS * sizeof(xslot));
#endif
}
__device__ bool addslot(u32 s1, u32 xh) {
@ -421,12 +419,9 @@ struct equi {
s0 = ~0;
return true;
#else
n1 = (u32)nxhashslots[xh]++;
if (n1 >= XFULL)
return false;
xx = xhashslots[xh];
xx[n1] = s1;
n0 = 0;
nextslot = xhashslots[xh];
nextxhashslot[s1] = nextslot;
xhashslots[xh] = s1;
return true;
#endif
}
@ -434,17 +429,17 @@ struct equi {
#ifdef XBITMAP
return xmap != 0;
#else
return n0 < n1;
return nextslot != xnil;
#endif
}
__device__ u32 slot() {
#ifdef XBITMAP
const u32 ffs = __ffsll(xmap);
s0 += ffs; xmap >>= ffs;
return s0;
#else
return (u32)xx[n0++];
nextslot = nextxhashslot[s0 = nextslot];
#endif
return s0;
}
};
};

View File

@ -39,6 +39,8 @@
#include <pthread.h>
#include <assert.h>
#include "blake2-avx2/blake2bip.h"
#if defined __builtin_bswap32 && defined __LITTLE_ENDIAN
#undef htobe32
#define htobe32(x) __builtin_bswap32(x)
@ -51,7 +53,7 @@
typedef uint16_t u16;
typedef uint64_t u64;
// rquired for avoiding multio-threading race conflicts
// required for avoiding multio-threading race conflicts
#ifdef ATOMIC
#include <atomic>
typedef std::atomic<u32> au32;
@ -94,12 +96,8 @@ static const u32 SLOTMASK = SLOTRANGE-1;
static const u32 SLOTMSB = 1<<(SLOTBITS-1);
// number of slots per bucket
static const u32 NSLOTS = SLOTRANGE * SAVEMEM;
// number of per-xhash slots
static const u32 XFULL = 16;
// number of possible values of RESTBITS bits
static const u32 NRESTS = 1<<RESTBITS;
// number of blocks of hashes extracted from single 512 bit blake2b output
static const u32 NBLOCKS = (NHASHES+HASHESPERBLAKE-1)/HASHESPERBLAKE;
// more than 8 solutions are rare (less than one in 100000 runs)
static const u32 MAXSOLS = 8;
@ -276,7 +274,6 @@ struct equi {
proof *sols;
au32 nsols;
u32 nthreads;
u32 xfull;
u32 bfull;
u32 hfull;
pthread_barrier_t barry;
@ -298,7 +295,7 @@ struct equi {
void setheadernonce(const char *headernonce, const u32 len) {
setheader(&blake_ctx, headernonce);
memset(nslots, 0, NBUCKETS * sizeof(au32)); // only nslots[0] needs zeroing
nsols = xfull = bfull = hfull = 0;
nsols = bfull = hfull = 0;
}
u32 getslot0(const u32 bucketi) {
#ifdef ATOMIC
@ -437,8 +434,8 @@ struct equi {
}
#endif
void showbsizes(u32 r) {
printf(" x%d b%d h%d\n", xfull, bfull, hfull);
xfull = bfull = hfull = 0;
printf(" b%d h%d\n", bfull, hfull);
bfull = hfull = 0;
#if defined(HIST) || defined(SPARK) || defined(LOGSPARK)
u32 binsizes[65];
memset(binsizes, 0, 65 * sizeof(u32));
@ -523,11 +520,10 @@ struct equi {
#else
typedef u16 xslot;
#endif
xslot nxhashslots[NRESTS];
xslot xhashslots[NRESTS][XFULL];
xslot *xx;
u32 n0;
u32 n1;
static const xslot xnil = ~0;
xslot xhashslots[NRESTS];
xslot nextxhashslot[NSLOTS];
xslot nextslot;
#endif
u32 s0;
@ -535,75 +531,88 @@ struct equi {
#ifdef XBITMAP
memset(xhashmap, 0, NRESTS * sizeof(u64));
#else
memset(nxhashslots, 0, NRESTS * sizeof(xslot));
memset(xhashslots, xnil, NRESTS * sizeof(xslot));
memset(nextxhashslot, xnil, NSLOTS * sizeof(xslot));
#endif
}
bool addslot(u32 s1, u32 xh) {
void addslot(u32 s1, u32 xh) {
#ifdef XBITMAP
xmap = xhashmap[xh];
xhashmap[xh] |= (u64)1 << s1;
s0 = -1;
return true;
#else
n1 = (u32)nxhashslots[xh]++;
if (n1 >= XFULL)
return false;
xx = xhashslots[xh];
xx[n1] = s1;
n0 = 0;
return true;
nextslot = xhashslots[xh];
nextxhashslot[s1] = nextslot;
xhashslots[xh] = s1;
#endif
}
bool nextcollision() const {
#ifdef XBITMAP
return xmap != 0;
#else
return n0 < n1;
return nextslot != xnil;
#endif
}
u32 slot() {
#ifdef XBITMAP
const u32 ffs = __builtin_ffsll(xmap);
s0 += ffs; xmap >>= ffs;
return s0;
#else
return (u32)xx[n0++];
nextslot = nextxhashslot[s0 = nextslot];
#endif
return s0;
}
};
#undef __AVX2__
#ifdef __AVX2__
static const u32 BLAKESINPARALLEL = 4;
#else
static const u32 BLAKESINPARALLEL = 1;
#endif
// number of hashes extracted from BLAKESINPARALLEL blake2b outputs
static const u32 HASHESPERBLOCK = BLAKESINPARALLEL*HASHESPERBLAKE;
// number of blocks of parallel blake2b calls
static const u32 NBLOCKS = (NHASHES+HASHESPERBLOCK-1)/HASHESPERBLOCK;
void digit0(const u32 id) {
uchar hash[HASHOUT];
blake2b_state state0 = blake_ctx;
htlayout htl(this, 0);
const u32 hashbytes = hashsize(0);
uchar hashes[BLAKESINPARALLEL * 64];
blake2b_state state0 = blake_ctx;
for (u32 block = id; block < NBLOCKS; block += nthreads) {
#ifdef __AVX2__
blake2bip_final(&state0, hashes, block);
#else
blake2b_state state = state0;
u32 leb = htole32(block);
blake2b_update(&state, (uchar *)&leb, sizeof(u32));
blake2b_final(&state, hash, HASHOUT);
for (u32 i = 0; i<HASHESPERBLAKE; i++) {
const uchar *ph = hash + i * WN/8;
blake2b_final(&state, hashes, HASHOUT);
#endif
for (u32 i = 0; i<BLAKESINPARALLEL; i++) {
for (u32 j = 0; j<HASHESPERBLAKE; j++) {
const uchar *ph = hashes+ i * 64 + j * WN/8;
#if BUCKBITS == 12 && RESTBITS == 8
const u32 bucketid = ((u32)ph[0] << 4) | ph[1] >> 4;
const u32 bucketid = ((u32)ph[0] << 4) | ph[1] >> 4;
#elif BUCKBITS == 16 && RESTBITS == 4
const u32 bucketid = ((u32)ph[0] << 8) | ph[1];
const u32 bucketid = ((u32)ph[0] << 8) | ph[1];
#elif BUCKBITS == 20 && RESTBITS == 4
const u32 bucketid = ((((u32)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4;
const u32 bucketid = ((((u32)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4;
#elif BUCKBITS == 12 && RESTBITS == 4
const u32 bucketid = ((u32)ph[0] << 4) | ph[1] >> 4;
const u32 xhash = ph[1] & 0xf;
const u32 bucketid = ((u32)ph[0] << 4) | ph[1] >> 4;
const u32 xhash = ph[1] & 0xf;
#else
#error not implemented
#endif
const u32 slot = getslot0(bucketid);
if (slot >= NSLOTS) {
bfull++;
continue;
const u32 slot = getslot0(bucketid);
if (slot >= NSLOTS) {
bfull++;
continue;
}
htunit *s = hta.heap0[bucketid][slot] + htl.nexthtunits;
memcpy(s->bytes-hashbytes, ph+WN/8-hashbytes, hashbytes);
s->tag = tree((block * BLAKESINPARALLEL + i) * HASHESPERBLAKE + j);
}
htunit *s = hta.heap0[bucketid][slot] + htl.nexthtunits;
memcpy(s->bytes-hashbytes, ph+WN/8-hashbytes, hashbytes);
s->tag = tree(block * HASHESPERBLAKE + i);
}
}
}
@ -617,10 +626,7 @@ struct equi {
u32 bsize = getnslots0(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, htl.getxhash0(slot1))) {
xfull++;
continue;
}
cd.addslot(s1, htl.getxhash0(slot1));
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -666,10 +672,7 @@ struct equi {
u32 bsize = getnslots1(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, htl.getxhash1(slot1))) {
xfull++;
continue;
}
cd.addslot(s1, htl.getxhash1(slot1));
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -715,10 +718,7 @@ struct equi {
u32 bsize = getnslots0(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, htobe32(slot1->word) >> 20 & 0xff)) {
xfull++;
continue;
}
cd.addslot(s1, htobe32(slot1->word) >> 20 & 0xff);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -751,10 +751,7 @@ struct equi {
u32 bsize = getnslots1(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, slot1->bytes[3])) {
xfull++;
continue;
}
cd.addslot(s1, slot1->bytes[3]);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -787,10 +784,7 @@ struct equi {
u32 bsize = getnslots0(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, htobe32(slot1->word) >> 12 & 0xff)) {
xfull++;
continue;
}
cd.addslot(s1, htobe32(slot1->word) >> 12 & 0xff);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -822,10 +816,7 @@ struct equi {
u32 bsize = getnslots1(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, slot1->bytes[0])) {
xfull++;
continue;
}
cd.addslot(s1, slot1->bytes[0]);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -857,10 +848,7 @@ struct equi {
u32 bsize = getnslots0(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, htobe32(slot1->word) >> 4 & 0xff)) {
xfull++;
continue;
}
cd.addslot(s1, htobe32(slot1->word) >> 4 & 0xff);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -894,10 +882,7 @@ struct equi {
u32 bsize = getnslots1(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, slot1->bytes[1])) {
xfull++;
continue;
}
cd.addslot(s1, slot1->bytes[1]);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -929,10 +914,7 @@ struct equi {
u32 bsize = getnslots0(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, (slot1->bytes[3] & 0xf) << 4 | slot1->bytes[4] >> 4)) {
xfull++;
continue;
}
cd.addslot(s1, (slot1->bytes[3] & 0xf) << 4 | slot1->bytes[4] >> 4);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -963,10 +945,7 @@ struct equi {
u32 bsize = getnslots1(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, slot1->bytes[2])) {
xfull++;
continue;
}
cd.addslot(s1, slot1->bytes[2]);
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *slot0 = buck[s0];
@ -1000,8 +979,7 @@ struct equi {
u32 bsize = getnslots0(bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *slot1 = buck[s1];
if (!cd.addslot(s1, htl.getxhash0(slot1))) // assume WK odd
continue;
cd.addslot(s1, htl.getxhash0(slot1)); // assume WK odd
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
if (htl.equal(buck[s0], slot1)) { // EASY OPTIMIZE