add solvers

tromp 2016-10-14 14:28:34 -04:00
parent e22714c5f6
commit 72db6ff8a5
16 changed files with 3534 additions and 0 deletions

44
Makefile Normal file

@@ -0,0 +1,44 @@
OPT = -O3
FLAGS = -Wall -Wno-deprecated-declarations -D_POSIX_C_SOURCE=200112L $(OPT) -pthread
GPP = g++ -march=native -m64 -maes -mavx -std=c++11 $(FLAGS)
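# Build-flag summary (derived from the sources in this commit): -DATOMIC uses
# atomic slot counters so multiple solver threads are safe, -DSPARK/-DHIST
# print per-round bucket-size sparklines/histograms, -DJOINHT stores each tree
# node next to its hash data, -DWN/-DWK select the equihash parameters
# (default 200,9), and -DXWITHASH moves the x-tra hash bits out of the tree
# node so it fits in 32 bits.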
all: equi equi1 faster faster1 verify test
equi: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DATOMIC equi_miner.cpp blake/blake2b.cpp -o equi
equi1: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DSPARK equi_miner.cpp blake/blake2b.cpp -o equi1
equi1g: equi.h equi_miner.h equi_miner.cpp Makefile
g++ -g -DSPARK equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g
faster: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DJOINHT -DATOMIC equi_miner.cpp blake/blake2b.cpp -o faster
faster1: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DJOINHT equi_miner.cpp blake/blake2b.cpp -o faster1
equi965: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DWN=96 -DWK=5 equi_miner.cpp blake/blake2b.cpp -o equi965
equi1445: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DWN=144 -DWK=5 -DXWITHASH equi_miner.cpp blake/blake2b.cpp -o equi1445
eqcuda: equi_miner.cu equi.h blake2b.cu Makefile
nvcc -arch sm_35 equi_miner.cu blake/blake2b.cpp -o eqcuda
eqcuda1445: equi_miner.cu equi.h blake2b.cu Makefile
nvcc -DWN=144 -DWK=5 -DXWITHASH -arch sm_35 equi_miner.cu blake/blake2b.cpp -o eqcuda1445
feqcuda: equi_miner.cu equi.h blake2b.cu Makefile
nvcc -DUNROLL -DJOINHT -arch sm_35 equi_miner.cu blake/blake2b.cpp -o feqcuda
verify: equi.h equi.c Makefile
g++ -g equi.c blake/blake2b.cpp -o verify
bench: faster
time for i in {0..9}; do ./faster -n $$i; done
test: equi verify Makefile
time ./equi -h "" -n 0 -t 1 -s | grep ^Sol | ./verify -h "" -n 0

72
blake/blake2-config.h Normal file

@@ -0,0 +1,72 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2_CONFIG_H__
#define __BLAKE2_CONFIG_H__
// These don't work everywhere
#if defined(__SSE2__)
#define HAVE_SSE2
#endif
#if defined(__SSSE3__)
#define HAVE_SSSE3
#endif
#if defined(__SSE4_1__)
#define HAVE_SSE41
#endif
#if defined(__AVX__)
#define HAVE_AVX
#endif
#if defined(__XOP__)
#define HAVE_XOP
#endif
#ifdef HAVE_AVX2
#ifndef HAVE_AVX
#define HAVE_AVX
#endif
#endif
#ifdef HAVE_XOP
#ifndef HAVE_AVX
#define HAVE_AVX
#endif
#endif
#ifdef HAVE_AVX
#ifndef HAVE_SSE41
#define HAVE_SSE41
#endif
#endif
#ifdef HAVE_SSE41
#ifndef HAVE_SSSE3
#define HAVE_SSSE3
#endif
#endif
#ifdef HAVE_SSSE3
#define HAVE_SSE2
#endif
#if !defined(HAVE_SSE2)
#error "This code requires at least SSE2."
#endif
#endif
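// Note (derived from the cascade above): each extension implies its
// predecessors, so HAVE_AVX2 / HAVE_XOP => HAVE_AVX => HAVE_SSE41 =>
// HAVE_SSSE3 => HAVE_SSE2.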

136
blake/blake2-impl.h Normal file

@@ -0,0 +1,136 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2_IMPL_H__
#define __BLAKE2_IMPL_H__
#include <stddef.h>
#include <stdint.h>
#include <string.h> // for memcpy used by the load/store helpers below
static inline uint32_t load32( const void *src )
{
#if defined(NATIVE_LITTLE_ENDIAN)
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
#else
const uint8_t *p = ( const uint8_t * )src;
uint32_t w = *p++;
w |= ( uint32_t )( *p++ ) << 8;
w |= ( uint32_t )( *p++ ) << 16;
w |= ( uint32_t )( *p++ ) << 24;
return w;
#endif
}
static inline uint64_t load64( const void *src )
{
#if defined(NATIVE_LITTLE_ENDIAN)
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
#else
const uint8_t *p = ( const uint8_t * )src;
uint64_t w = *p++;
w |= ( uint64_t )( *p++ ) << 8;
w |= ( uint64_t )( *p++ ) << 16;
w |= ( uint64_t )( *p++ ) << 24;
w |= ( uint64_t )( *p++ ) << 32;
w |= ( uint64_t )( *p++ ) << 40;
w |= ( uint64_t )( *p++ ) << 48;
w |= ( uint64_t )( *p++ ) << 56;
return w;
#endif
}
static inline void store32( void *dst, uint32_t w )
{
#if defined(NATIVE_LITTLE_ENDIAN)
memcpy(dst, &w, sizeof w);
#else
uint8_t *p = ( uint8_t * )dst;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w;
#endif
}
static inline void store64( void *dst, uint64_t w )
{
#if defined(NATIVE_LITTLE_ENDIAN)
memcpy(dst, &w, sizeof w);
#else
uint8_t *p = ( uint8_t * )dst;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w;
#endif
}
static inline uint64_t load48( const void *src )
{
const uint8_t *p = ( const uint8_t * )src;
uint64_t w = *p++;
w |= ( uint64_t )( *p++ ) << 8;
w |= ( uint64_t )( *p++ ) << 16;
w |= ( uint64_t )( *p++ ) << 24;
w |= ( uint64_t )( *p++ ) << 32;
w |= ( uint64_t )( *p++ ) << 40;
return w;
}
static inline void store48( void *dst, uint64_t w )
{
uint8_t *p = ( uint8_t * )dst;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w;
}
static inline uint32_t rotl32( const uint32_t w, const unsigned c )
{
return ( w << c ) | ( w >> ( 32 - c ) );
}
static inline uint64_t rotl64( const uint64_t w, const unsigned c )
{
return ( w << c ) | ( w >> ( 64 - c ) );
}
static inline uint32_t rotr32( const uint32_t w, const unsigned c )
{
return ( w >> c ) | ( w << ( 32 - c ) );
}
static inline uint64_t rotr64( const uint64_t w, const unsigned c )
{
return ( w >> c ) | ( w << ( 64 - c ) );
}
/* prevents compiler optimizing out memset() */
static inline void secure_zero_memory( void *v, size_t n )
{
volatile uint8_t *p = ( volatile uint8_t * )v;
while( n-- ) *p++ = 0;
}
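// Illustration (a sketch, not part of the package): these helpers pin the
// byte order to little-endian on any host, so the round-trip below holds
// with or without NATIVE_LITTLE_ENDIAN defined.
static inline void load32_example( void )
{
  const uint8_t b[4] = { 0x01, 0x02, 0x03, 0x04 };
  uint32_t w = load32( b );  /* always 0x04030201 */
  uint8_t c[4];
  store32( c, w );           /* writes back 01 02 03 04 */
  (void)w; (void)c;
}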
#endif

85
blake/blake2-round.h Normal file

@@ -0,0 +1,85 @@
#define _mm_roti_epi64(x, c) \
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
row1l = _mm_add_epi64(row1l, row2l); \
row1h = _mm_add_epi64(row1h, row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24);
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
row1l = _mm_add_epi64(row1l, row2l); \
row1h = _mm_add_epi64(row1h, row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63);
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
\
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
\
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
\
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);

156
blake/blake2.h Normal file

@@ -0,0 +1,156 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2_H__
#define __BLAKE2_H__
#include <stddef.h>
#include <stdint.h>
#if defined(_MSC_VER)
#define ALIGN(x) __declspec(align(x))
#else
#define ALIGN(x) __attribute__ ((__aligned__(x)))
#endif
#if defined(__cplusplus)
extern "C" {
#endif
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
BLAKE2S_OUTBYTES = 32,
BLAKE2S_KEYBYTES = 32,
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
enum blake2b_constant
{
BLAKE2B_BLOCKBYTES = 128,
BLAKE2B_OUTBYTES = 64,
BLAKE2B_KEYBYTES = 64,
BLAKE2B_SALTBYTES = 16,
BLAKE2B_PERSONALBYTES = 16
};
#pragma pack(push, 1)
typedef struct __blake2s_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[BLAKE2S_SALTBYTES]; // 24
uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
} blake2s_param;
ALIGN( 64 ) typedef struct __blake2s_state
{
uint32_t h[8];
uint32_t t[2];
uint32_t f[2];
uint8_t buf[2 * BLAKE2S_BLOCKBYTES];
size_t buflen;
uint8_t last_node;
} blake2s_state;
typedef struct __blake2b_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint64_t node_offset; // 16
uint8_t node_depth; // 17
uint8_t inner_length; // 18
uint8_t reserved[14]; // 32
uint8_t salt[BLAKE2B_SALTBYTES]; // 48
uint8_t personal[BLAKE2B_PERSONALBYTES]; // 64
} blake2b_param;
ALIGN( 64 ) typedef struct __blake2b_state
{
uint64_t h[8];
uint8_t buf[BLAKE2B_BLOCKBYTES];
uint16_t counter;
uint8_t buflen;
uint8_t lastblock;
} blake2b_state;
ALIGN( 64 ) typedef struct __blake2sp_state
{
blake2s_state S[8][1];
blake2s_state R[1];
uint8_t buf[8 * BLAKE2S_BLOCKBYTES];
size_t buflen;
} blake2sp_state;
ALIGN( 64 ) typedef struct __blake2bp_state
{
blake2b_state S[4][1];
blake2b_state R[1];
uint8_t buf[4 * BLAKE2B_BLOCKBYTES];
size_t buflen;
} blake2bp_state;
#pragma pack(pop)
// Streaming API
int blake2s_init( blake2s_state *S, const uint8_t outlen );
int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2s_init_param( blake2s_state *S, const blake2s_param *P );
int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen );
int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen );
int blake2b_init( blake2b_state *S, const uint8_t outlen );
int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2b_init_param( blake2b_state *S, const blake2b_param *P );
int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen );
int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen );
int blake2sp_init( blake2sp_state *S, const uint8_t outlen );
int blake2sp_init_key( blake2sp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2sp_update( blake2sp_state *S, const uint8_t *in, uint64_t inlen );
int blake2sp_final( blake2sp_state *S, uint8_t *out, uint8_t outlen );
int blake2bp_init( blake2bp_state *S, const uint8_t outlen );
int blake2bp_init_key( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2bp_update( blake2bp_state *S, const uint8_t *in, uint64_t inlen );
int blake2bp_final( blake2bp_state *S, uint8_t *out, uint8_t outlen );
// Simple API
int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
int blake2b_long(uint8_t *out, const void *in, const uint32_t outlen, const uint64_t inlen);
int blake2sp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
int blake2bp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
static inline int blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
{
return blake2b( out, in, key, outlen, inlen, keylen );
}
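// Usage sketch (blake2b_two_parts is a hypothetical helper, not part of this
// header): the streaming API composes as init/update/final, and hashing a
// message in pieces yields the same digest as one blake2b() call over the
// concatenation.
static inline int blake2b_two_parts( uint8_t *out, const uint8_t *p1, uint64_t n1,
                                     const uint8_t *p2, uint64_t n2 )
{
  blake2b_state S[1];
  if( blake2b_init( S, BLAKE2B_OUTBYTES ) < 0 ) return -1;
  blake2b_update( S, p1, n1 ); /* first piece */
  blake2b_update( S, p2, n2 ); /* second piece */
  return blake2b_final( S, out, BLAKE2B_OUTBYTES );
}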
#if defined(__cplusplus)
}
#endif
#endif

68
blake/blake2b-load-sse2.h Normal file

@@ -0,0 +1,68 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2B_LOAD_SSE2_H__
#define __BLAKE2B_LOAD_SSE2_H__
#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5)
#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2)
#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7)
#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1)
#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13)
#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2)
#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6)
#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8)
#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11)
#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15)
#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14)
#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14)
#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13)
#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9)
#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2)
#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12)
#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1)
#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8)
#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6)
#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11)
#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3)
#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1)
#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4)
#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7)
#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6)
#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3)
#define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12)
#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
#endif

402
blake/blake2b-load-sse41.h Normal file

@@ -0,0 +1,402 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2B_LOAD_SSE41_H__
#define __BLAKE2B_LOAD_SSE41_H__
#define LOAD_MSG_0_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_0_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_0_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_0_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_1_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)
#define LOAD_MSG_1_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)
#define LOAD_MSG_1_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)
#define LOAD_MSG_1_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)
#define LOAD_MSG_2_1(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m5, 8); \
b1 = _mm_unpackhi_epi64(m2, m7); \
} while(0)
#define LOAD_MSG_2_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m0); \
b1 = _mm_blend_epi16(m1, m6, 0xF0); \
} while(0)
#define LOAD_MSG_2_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m5, m1, 0xF0); \
b1 = _mm_unpackhi_epi64(m3, m4); \
} while(0)
#define LOAD_MSG_2_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m3); \
b1 = _mm_alignr_epi8(m2, m0, 8); \
} while(0)
#define LOAD_MSG_3_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_unpackhi_epi64(m6, m5); \
} while(0)
#define LOAD_MSG_3_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m0); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_3_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m2, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)
#define LOAD_MSG_3_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m5); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_4_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m2); \
b1 = _mm_unpacklo_epi64(m1, m5); \
} while(0)
#define LOAD_MSG_4_2(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m0, m3, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)
#define LOAD_MSG_4_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m7, m5, 0xF0); \
b1 = _mm_blend_epi16(m3, m1, 0xF0); \
} while(0)
#define LOAD_MSG_4_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m0, 8); \
b1 = _mm_blend_epi16(m4, m6, 0xF0); \
} while(0)
#define LOAD_MSG_5_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m3); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_5_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m5); \
b1 = _mm_unpackhi_epi64(m5, m1); \
} while(0)
#define LOAD_MSG_5_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m2, m3, 0xF0); \
b1 = _mm_unpackhi_epi64(m7, m0); \
} while(0)
#define LOAD_MSG_5_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m2); \
b1 = _mm_blend_epi16(m7, m4, 0xF0); \
} while(0)
#define LOAD_MSG_6_1(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m6, m0, 0xF0); \
b1 = _mm_unpacklo_epi64(m7, m2); \
} while(0)
#define LOAD_MSG_6_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_alignr_epi8(m5, m6, 8); \
} while(0)
#define LOAD_MSG_6_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m3); \
b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
} while(0)
#define LOAD_MSG_6_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_blend_epi16(m1, m5, 0xF0); \
} while(0)
#define LOAD_MSG_7_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m3); \
b1 = _mm_blend_epi16(m6, m1, 0xF0); \
} while(0)
#define LOAD_MSG_7_2(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpackhi_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_7_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_unpacklo_epi64(m4, m1); \
} while(0)
#define LOAD_MSG_7_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m2); \
b1 = _mm_unpacklo_epi64(m3, m5); \
} while(0)
#define LOAD_MSG_8_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m7); \
b1 = _mm_alignr_epi8(m0, m5, 8); \
} while(0)
#define LOAD_MSG_8_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_alignr_epi8(m4, m1, 8); \
} while(0)
#define LOAD_MSG_8_3(b0, b1) \
do \
{ \
b0 = m6; \
b1 = _mm_alignr_epi8(m5, m0, 8); \
} while(0)
#define LOAD_MSG_8_4(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m3, 0xF0); \
b1 = m2; \
} while(0)
#define LOAD_MSG_9_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_unpackhi_epi64(m3, m0); \
} while(0)
#define LOAD_MSG_9_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m2); \
b1 = _mm_blend_epi16(m3, m2, 0xF0); \
} while(0)
#define LOAD_MSG_9_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_unpackhi_epi64(m1, m6); \
} while(0)
#define LOAD_MSG_9_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpacklo_epi64(m6, m0); \
} while(0)
#define LOAD_MSG_10_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_10_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_10_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_10_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_11_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)
#define LOAD_MSG_11_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)
#define LOAD_MSG_11_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)
#define LOAD_MSG_11_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)
#endif

170
blake/blake2b-round.h Normal file

@@ -0,0 +1,170 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2B_ROUND_H__
#define __BLAKE2B_ROUND_H__
#define LOAD(p) _mm_load_si128( (const __m128i *)(p) )
#define STORE(p,r) _mm_store_si128((__m128i *)(p), r)
#define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) )
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
#define TOF(reg) _mm_castsi128_ps((reg))
#define TOI(reg) _mm_castps_si128((reg))
#define LIKELY(x) __builtin_expect((x),1)
/* Microarchitecture-specific macros */
#ifndef HAVE_XOP
#ifdef HAVE_SSSE3
#define _mm_roti_epi64(x, c) \
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
#else
#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-c) ))
#endif
#else
/* ... */
#endif
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24);
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63);
#if defined(HAVE_SSSE3)
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
#else
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = row4l;\
t1 = row2l;\
row4l = row3l;\
row3l = row3h;\
row3h = row4l;\
row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = row3l;\
row3l = row3h;\
row3h = t0;\
t0 = row2l;\
t1 = row4l;\
row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))
#endif
#if defined(HAVE_SSE41)
#include "blake2b-load-sse41.h"
#else
#include "blake2b-load-sse2.h"
#endif
#define ROUND(r) \
LOAD_MSG_ ##r ##_1(b0, b1); \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
LOAD_MSG_ ##r ##_2(b0, b1); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
LOAD_MSG_ ##r ##_3(b0, b1); \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
LOAD_MSG_ ##r ##_4(b0, b1); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
#endif
#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
\
DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
\
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
\
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);

339
blake/blake2b.cpp Normal file

@@ -0,0 +1,339 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "blake2.h"
#include "blake2-impl.h"
#include "blake2-config.h"
#include <emmintrin.h>
#if defined(HAVE_SSSE3)
#include <tmmintrin.h>
#endif
#if defined(HAVE_SSE41)
#include <smmintrin.h>
#endif
#if defined(HAVE_AVX)
#include <immintrin.h>
#endif
#if defined(HAVE_XOP)
#include <x86intrin.h>
#endif
#include "blake2b-round.h"
ALIGN( 64 ) static const uint64_t blake2b_IV[8] =
{
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
/* init xors IV with input parameter block */
int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
{
//blake2b_init0( S );
const uint8_t * v = ( const uint8_t * )( blake2b_IV );
const uint8_t * p = ( const uint8_t * )( P );
uint8_t * h = ( uint8_t * )( S->h );
/* IV XOR ParamBlock */
memset( S, 0, sizeof( blake2b_state ) );
for( int i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i];
return 0;
}
/* Some sort of default parameter block initialization, for sequential blake2b */
int blake2b_init( blake2b_state *S, const uint8_t outlen )
{
if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
const blake2b_param P =
{
outlen,
0,
1,
1,
0,
0,
0,
0,
{0},
{0},
{0}
};
return blake2b_init_param( S, &P );
}
int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
{
if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
if ( ( !keylen ) || keylen > BLAKE2B_KEYBYTES ) return -1;
const blake2b_param P =
{
outlen,
keylen,
1,
1,
0,
0,
0,
0,
{0},
{0},
{0}
};
if( blake2b_init_param( S, &P ) < 0 )
return -1;
{
uint8_t block[BLAKE2B_BLOCKBYTES];
memset( block, 0, BLAKE2B_BLOCKBYTES );
memcpy( block, key, keylen );
blake2b_update( S, block, BLAKE2B_BLOCKBYTES );
secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
}
return 0;
}
static inline int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
{
__m128i row1l, row1h;
__m128i row2l, row2h;
__m128i row3l, row3h;
__m128i row4l, row4h;
__m128i b0, b1;
__m128i t0, t1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
#endif
#if defined(HAVE_SSE41)
const __m128i m0 = LOADU( block + 00 );
const __m128i m1 = LOADU( block + 16 );
const __m128i m2 = LOADU( block + 32 );
const __m128i m3 = LOADU( block + 48 );
const __m128i m4 = LOADU( block + 64 );
const __m128i m5 = LOADU( block + 80 );
const __m128i m6 = LOADU( block + 96 );
const __m128i m7 = LOADU( block + 112 );
#else
const uint64_t m0 = ( ( uint64_t * )block )[ 0];
const uint64_t m1 = ( ( uint64_t * )block )[ 1];
const uint64_t m2 = ( ( uint64_t * )block )[ 2];
const uint64_t m3 = ( ( uint64_t * )block )[ 3];
const uint64_t m4 = ( ( uint64_t * )block )[ 4];
const uint64_t m5 = ( ( uint64_t * )block )[ 5];
const uint64_t m6 = ( ( uint64_t * )block )[ 6];
const uint64_t m7 = ( ( uint64_t * )block )[ 7];
const uint64_t m8 = ( ( uint64_t * )block )[ 8];
const uint64_t m9 = ( ( uint64_t * )block )[ 9];
const uint64_t m10 = ( ( uint64_t * )block )[10];
const uint64_t m11 = ( ( uint64_t * )block )[11];
const uint64_t m12 = ( ( uint64_t * )block )[12];
const uint64_t m13 = ( ( uint64_t * )block )[13];
const uint64_t m14 = ( ( uint64_t * )block )[14];
const uint64_t m15 = ( ( uint64_t * )block )[15];
#endif
row1l = LOADU( &S->h[0] );
row1h = LOADU( &S->h[2] );
row2l = LOADU( &S->h[4] );
row2h = LOADU( &S->h[6] );
row3l = LOADU( &blake2b_IV[0] );
row3h = LOADU( &blake2b_IV[2] );
row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), _mm_set_epi32(0,0,0,S->counter) );
row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), _mm_set_epi32(0,0,0L-S->lastblock,0L-S->lastblock) );
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );
ROUND( 4 );
ROUND( 5 );
ROUND( 6 );
ROUND( 7 );
ROUND( 8 );
ROUND( 9 );
ROUND( 10 );
ROUND( 11 );
row1l = _mm_xor_si128( row3l, row1l );
row1h = _mm_xor_si128( row3h, row1h );
STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) );
STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) );
row2l = _mm_xor_si128( row4l, row2l );
row2h = _mm_xor_si128( row4h, row2h );
STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) );
STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) );
return 0;
}
int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen )
{
while( inlen > 0 )
{
size_t left = S->buflen;
size_t fill = BLAKE2B_BLOCKBYTES - left;
if( inlen > fill )
{
memcpy( S->buf + left, in, fill ); // Fill buffer
in += fill;
inlen -= fill;
S->counter += BLAKE2B_BLOCKBYTES;
blake2b_compress( S, S->buf ); // Compress
S->buflen = 0;
}
else // inlen <= fill
{
memcpy( S->buf + left, in, inlen );
S->buflen += inlen; // not enough to compress
in += inlen;
inlen = 0;
}
}
return 0;
}
int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen )
{
if( outlen > BLAKE2B_OUTBYTES )
return -1;
if( S->buflen > BLAKE2B_BLOCKBYTES )
{
S->counter += BLAKE2B_BLOCKBYTES;
blake2b_compress( S, S->buf );
S->buflen -= BLAKE2B_BLOCKBYTES;
memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen );
}
S->counter += S->buflen;
S->lastblock = 1;
memset( S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
blake2b_compress( S, S->buf );
memcpy( out, &S->h[0], outlen );
S->lastblock = 0;
return 0;
}
int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
{
blake2b_state S[1];
/* Verify parameters */
if ( NULL == in ) return -1;
if ( NULL == out ) return -1;
if( NULL == key ) keylen = 0;
if( keylen )
{
if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1;
}
else
{
if( blake2b_init( S, outlen ) < 0 ) return -1;
}
blake2b_update( S, ( const uint8_t * )in, inlen );
blake2b_final( S, out, outlen );
return 0;
}
#if defined(SUPERCOP)
int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
{
return blake2b( out, in, NULL, BLAKE2B_OUTBYTES, inlen, 0 );
}
#endif
#if defined(BLAKE2B_SELFTEST)
#include <string.h>
#include "blake2-kat.h"
int main( int argc, char **argv )
{
uint8_t key[BLAKE2B_KEYBYTES];
uint8_t buf[KAT_LENGTH];
for( size_t i = 0; i < BLAKE2B_KEYBYTES; ++i )
key[i] = ( uint8_t )i;
for( size_t i = 0; i < KAT_LENGTH; ++i )
buf[i] = ( uint8_t )i;
for( size_t i = 0; i < KAT_LENGTH; ++i )
{
uint8_t hash[BLAKE2B_OUTBYTES];
blake2b( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES );
if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) )
{
puts( "error" );
return -1;
}
}
puts( "ok" );
return 0;
}
#endif
int blake2b_long(uint8_t *out, const void *in, const uint32_t outlen, const uint64_t inlen)
{
blake2b_state blake_state;
if (outlen <= BLAKE2B_OUTBYTES)
{
blake2b_init(&blake_state, outlen);
blake2b_update(&blake_state, (const uint8_t*)&outlen, sizeof(uint32_t));
blake2b_update(&blake_state, (const uint8_t *)in, inlen);
blake2b_final(&blake_state, out, outlen);
}
else
{
uint8_t out_buffer[BLAKE2B_OUTBYTES];
uint8_t in_buffer[BLAKE2B_OUTBYTES];
blake2b_init(&blake_state, BLAKE2B_OUTBYTES);
blake2b_update(&blake_state, (const uint8_t*)&outlen, sizeof(uint32_t));
blake2b_update(&blake_state, (const uint8_t *)in, inlen);
blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES);
memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
out += BLAKE2B_OUTBYTES / 2;
uint32_t toproduce = outlen - BLAKE2B_OUTBYTES / 2;
while (toproduce > BLAKE2B_OUTBYTES)
{
memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
blake2b(out_buffer, in_buffer, NULL, BLAKE2B_OUTBYTES, BLAKE2B_OUTBYTES, 0);
memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
out += BLAKE2B_OUTBYTES / 2;
toproduce -= BLAKE2B_OUTBYTES / 2;
}
memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
blake2b(out_buffer, in_buffer, NULL, toproduce, BLAKE2B_OUTBYTES, 0);
memcpy(out, out_buffer, toproduce);
}
return 0;
}
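// Worked trace of blake2b_long (derived from the loop above) for outlen = 100:
//   digest #1 over le32(outlen) || in -> emit bytes  0..31, toproduce = 68
//   digest #2 over digest #1          -> emit bytes 32..63, toproduce = 36
//   final digest of length 36         -> emit bytes 64..99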

166
blake2b.cu Normal file

@@ -0,0 +1,166 @@
// Blake2-B CUDA Implementation
// tpruvot@github July 2016
/**
* uint2 direct ops by c++ operator definitions
*/
static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) {
return make_uint2(a.x ^ b.x, a.y ^ b.y);
}
// uint2 ROR/ROL methods
__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset) {
uint2 result;
#if __CUDA_ARCH__ > 300
if (offset < 32) {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
} else /* if (offset < 64) */ {
/* offset SHOULD BE < 64 ! */
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
}
#else
if (!offset)
result = a;
else if (offset < 32) {
result.y = ((a.y >> offset) | (a.x << (32 - offset)));
result.x = ((a.x >> offset) | (a.y << (32 - offset)));
} else if (offset == 32) {
result.y = a.x;
result.x = a.y;
} else {
result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
}
#endif
return result;
}
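// Worked example (derived from ROR2 above): splitting a 64-bit value into
// (a.x = low word, a.y = high word), ROR2(a,32) just exchanges the two words,
// i.e. ROR2(a,32) == SWAPUINT2(a), and ROR2(a,0) leaves a unchanged.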
__device__ __forceinline__ uint2 SWAPUINT2(uint2 value) {
return make_uint2(value.y, value.x);
}
#ifdef __CUDA_ARCH__
__device__ __inline__ uint2 ROR24(const uint2 a) {
uint2 result;
result.x = __byte_perm(a.y, a.x, 0x2107);
result.y = __byte_perm(a.y, a.x, 0x6543);
return result;
}
__device__ __inline__ uint2 ROR16(const uint2 a) {
uint2 result;
result.x = __byte_perm(a.y, a.x, 0x1076);
result.y = __byte_perm(a.y, a.x, 0x5432);
return result;
}
#else
#define ROR24(u) ROR2(u,24)
#define ROR16(u) ROR2(u,16)
#endif
typedef uint64_t u64;
static __constant__ const int8_t blake2b_sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } ,
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
__device__ __forceinline__
static void G(const int r, const int i, u64 &a, u64 &b, u64 &c, u64 &d, u64 const m[16]) {
a = a + b + m[ blake2b_sigma[r][2*i] ];
((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
c = c + d;
((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] );
a = a + b + m[ blake2b_sigma[r][2*i+1] ];
((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
c = c + d;
((uint2*)&b)[0] = ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
}
#define ROUND(r) \
G(r, 0, v[0], v[4], v[ 8], v[12], m); \
G(r, 1, v[1], v[5], v[ 9], v[13], m); \
G(r, 2, v[2], v[6], v[10], v[14], m); \
G(r, 3, v[3], v[7], v[11], v[15], m); \
G(r, 4, v[0], v[5], v[10], v[15], m); \
G(r, 5, v[1], v[6], v[11], v[12], m); \
G(r, 6, v[2], v[7], v[ 8], v[13], m); \
G(r, 7, v[3], v[4], v[ 9], v[14], m);
__device__ void blake2b_gpu_hash(blake2b_state *state, u32 idx, uchar *hash, u32 outlen) {
const u32 leb = htole32(idx);
memcpy(state->buf + state->buflen, &leb, sizeof(u32));
state->buflen += sizeof(u32);
state->counter += state->buflen;
memset(state->buf + state->buflen, 0, BLAKE2B_BLOCKBYTES - state->buflen);
u64 *d_data = (u64 *)state->buf;
u64 m[16];
m[0] = d_data[0];
m[1] = d_data[1];
m[2] = d_data[2];
m[3] = d_data[3];
m[4] = d_data[4];
m[5] = d_data[5];
m[6] = d_data[6];
m[7] = d_data[7];
m[8] = d_data[8];
m[9] = d_data[9];
m[10] = d_data[10];
m[11] = d_data[11];
m[12] = d_data[12];
m[13] = d_data[13];
m[14] = d_data[14];
m[15] = d_data[15];
u64 v[16];
v[0] = state->h[0];
v[1] = state->h[1];
v[2] = state->h[2];
v[3] = state->h[3];
v[4] = state->h[4];
v[5] = state->h[5];
v[6] = state->h[6];
v[7] = state->h[7];
v[8] = 0x6a09e667f3bcc908;
v[9] = 0xbb67ae8584caa73b;
v[10] = 0x3c6ef372fe94f82b;
v[11] = 0xa54ff53a5f1d36f1;
v[12] = 0x510e527fade682d1 ^ state->counter;
v[13] = 0x9b05688c2b3e6c1f;
v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff;
v[15] = 0x5be0cd19137e2179;
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );
ROUND( 4 );
ROUND( 5 );
ROUND( 6 );
ROUND( 7 );
ROUND( 8 );
ROUND( 9 );
ROUND( 10 );
ROUND( 11 );
state->h[0] ^= v[0] ^ v[ 8];
state->h[1] ^= v[1] ^ v[ 9];
state->h[2] ^= v[2] ^ v[10];
state->h[3] ^= v[3] ^ v[11];
state->h[4] ^= v[4] ^ v[12];
state->h[5] ^= v[5] ^ v[13];
state->h[6] ^= v[6] ^ v[14];
state->h[7] ^= v[7] ^ v[15];
memcpy(hash, (uchar *)state->h, outlen);
}

36
equi.c Normal file

@@ -0,0 +1,36 @@
#include "equi.h"
#include <inttypes.h> // for SCNx64 macro
#include <stdio.h> // printf/scanf
#include <stdlib.h> // exit
#include <unistd.h> // getopt
#include <assert.h> // d'uh
int main(int argc, char **argv) {
const char *header = "";
int nonce = 0, c;
while ((c = getopt (argc, argv, "h:n:")) != -1) {
switch (c) {
case 'h':
header = optarg;
break;
case 'n':
nonce = atoi(optarg);
break;
}
}
printf("Verifying size %d proof for equi(\"%s\",%d)\n",
PROOFSIZE, header, nonce);
for (int nsols=0; scanf(" Solution") == 0; nsols++) {
u32 indices[PROOFSIZE];
for (int n = 0; n < PROOFSIZE; n++) {
int nscan = scanf(" %x", &indices[n]);
assert(nscan == 1);
}
int pow_rc = verify(indices, header, nonce);
if (pow_rc == POW_OK)
printf("Verified\n");
else
printf("FAILED due to %s\n", errstr[pow_rc]);
}
return 0;
}
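// For reference, the verifier pairs with the solver on stdin exactly as in
// the Makefile's test target:
//   ./equi -h "" -n 0 -t 1 -s | grep ^Sol | ./verify -h "" -n 0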

129
equi.h Normal file

@@ -0,0 +1,129 @@
// Equihash solver
// Copyright (c) 2016-2016 John Tromp
#include "blake/blake2.h"
#ifdef __APPLE__
#include "osx_barrier.h"
#include <machine/endian.h>
#include <libkern/OSByteOrder.h>
#define htole32(x) OSSwapHostToLittleInt32(x)
#else
#include <endian.h>
#endif
#include <stdint.h> // for types uint32_t,uint64_t
#include <string.h> // for functions strlen, memset
#include <stdlib.h> // for function qsort
typedef uint32_t u32;
typedef unsigned char uchar;
// algorithm parameters, prefixed with W to reduce include file conflicts
#ifndef WN
#define WN 200
#endif
#ifndef WK
#define WK 9
#endif
#define NDIGITS (WK+1)
#define DIGITBITS (WN/(NDIGITS))
static const u32 PROOFSIZE = 1<<WK;
static const u32 BASE = 1<<DIGITBITS;
static const u32 NHASHES = 2*BASE;
static const u32 HASHESPERBLAKE = 512/WN;
static const u32 HASHOUT = HASHESPERBLAKE*WN/8;
typedef u32 proof[PROOFSIZE];
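// Worked example (derived from the definitions above) for the default
// parameters WN=200, WK=9:
//   NDIGITS        = WK+1                = 10
//   DIGITBITS      = WN/NDIGITS          = 20
//   PROOFSIZE      = 1<<WK               = 512 indices per solution
//   BASE           = 1<<DIGITBITS        = 1048576
//   NHASHES        = 2*BASE              = 2097152 hashes generated
//   HASHESPERBLAKE = 512/WN              = 2 hashes per blake2b output
//   HASHOUT        = HASHESPERBLAKE*WN/8 = 50 digest bytes used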
void setheader(blake2b_state *ctx, const char *header, u32 nce) {
uint32_t le_N = htole32(WN);
uint32_t le_K = htole32(WK);
uchar personal[] = "ZcashPoW01230123";
memcpy(personal+8, &le_N, 4);
memcpy(personal+12, &le_K, 4);
blake2b_param P[1];
P->digest_length = HASHOUT;
P->key_length = 0;
P->fanout = 1;
P->depth = 1;
P->leaf_length = 0;
P->node_offset = 0;
P->node_depth = 0;
P->inner_length = 0;
memset(P->reserved, 0, sizeof(P->reserved));
memset(P->salt, 0, sizeof(P->salt));
memcpy(P->personal, (const uint8_t *)personal, 16);
blake2b_init_param(ctx, P);
blake2b_update(ctx, (const uchar *)header, strlen(header));
uchar nonce[32];
memset(nonce, 0, 32);
uint32_t le_nonce = htole32(nce);
memcpy(nonce, &le_nonce, 4);
blake2b_update(ctx, nonce, 32);
}
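// For the defaults WN=200, WK=9 the personalization built above is
// "ZcashPoW" || le32(200) || le32(9), i.e. the 16 bytes
//   5a 63 61 73 68 50 6f 57  c8 00 00 00  09 00 00 00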
enum verify_code { POW_OK, POW_DUPLICATE, POW_OUT_OF_ORDER, POW_NONZERO_XOR };
const char *errstr[] = { "OK", "duplicate index", "indices out of order", "nonzero xor" };
void genhash(blake2b_state *ctx, u32 idx, uchar *hash) {
blake2b_state state = *ctx;
u32 leb = htole32(idx / HASHESPERBLAKE);
blake2b_update(&state, (uchar *)&leb, sizeof(u32));
uchar blakehash[HASHOUT];
blake2b_final(&state, blakehash, HASHOUT);
memcpy(hash, blakehash + (idx % HASHESPERBLAKE) * WN/8, WN/8);
}
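// recursively check the Wagner conditions for the subtree of 2^r indices at
// *indices: strictly increasing index order at each join, and the xor of the
// two child hashes zero in its first r*DIGITBITS bits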
int verifyrec(blake2b_state *ctx, u32 *indices, uchar *hash, int r) {
if (r == 0) {
genhash(ctx, *indices, hash);
return POW_OK;
}
u32 *indices1 = indices + (1 << (r-1));
if (*indices >= *indices1)
return POW_OUT_OF_ORDER;
uchar hash0[WN/8], hash1[WN/8];
int vrf0 = verifyrec(ctx, indices, hash0, r-1);
if (vrf0 != POW_OK)
return vrf0;
int vrf1 = verifyrec(ctx, indices1, hash1, r-1);
if (vrf1 != POW_OK)
return vrf1;
for (int i=0; i < WN/8; i++)
hash[i] = hash0[i] ^ hash1[i];
int i, b = r * DIGITBITS;
for (i = 0; i < b/8; i++)
if (hash[i])
return POW_NONZERO_XOR;
if ((b%8) && hash[i] >> (8-(b%8)))
return POW_NONZERO_XOR;
return POW_OK;
}
int compu32(const void *pa, const void *pb) {
u32 a = *(u32 *)pa, b = *(u32 *)pb;
return a<b ? -1 : a==b ? 0 : +1;
}
bool duped(proof prf) {
proof sortprf;
memcpy(sortprf, prf, sizeof(proof));
qsort(sortprf, PROOFSIZE, sizeof(u32), &compu32);
for (u32 i=1; i<PROOFSIZE; i++)
if (sortprf[i] <= sortprf[i-1])
return true;
return false;
}
// verify Wagner conditions
int verify(u32 indices[PROOFSIZE], const char *header, int nonce) {
if (duped(indices))
return POW_DUPLICATE;
blake2b_state ctx;
setheader(&ctx, header, nonce);
uchar hash[WN/8];
return verifyrec(&ctx, indices, hash, WK);
}

79
equi_miner.cpp Normal file

@@ -0,0 +1,79 @@
// Wagner's algorithm for Generalized Birthday Paradox, a memory-hard proof-of-work
// Copyright (c) 2013-2016 John Tromp
#include "equi_miner.h"
#include <unistd.h>
int main(int argc, char **argv) {
int nthreads = 1;
int nonce = 0;
int range = 1;
bool showsol = false;
const char *header = "";
int c;
while ((c = getopt (argc, argv, "h:n:r:t:s")) != -1) {
switch (c) {
case 'h':
header = optarg;
break;
case 'n':
nonce = atoi(optarg);
break;
case 'r':
range = atoi(optarg);
break;
case 's':
showsol = true;
break;
case 't':
nthreads = atoi(optarg);
break;
}
}
#ifndef XWITHASH
if (sizeof(tree) > 4)
printf("WARNING: please compile with -DXWITHASH to shrink tree!\n");
#endif
#ifdef ATOMIC
if (nthreads==1)
printf("WARNING: use of atomics hurts single threaded performance!\n");
#else
assert(nthreads==1);
#endif
printf("Looking for wagner-tree on (\"%s\",%d", header, nonce);
if (range > 1)
printf("-%d", nonce+range-1);
printf(") with %d %d-bits digits and %d threads\n", NDIGITS, DIGITBITS, nthreads);
thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx));
assert(threads);
equi eq(nthreads);
u32 sumnsols = 0;
for (int r = 0; r < range; r++) {
eq.setnonce(header, nonce+r);
for (int t = 0; t < nthreads; t++) {
threads[t].id = t;
threads[t].eq = &eq;
int err = pthread_create(&threads[t].thread, NULL, worker, (void *)&threads[t]);
assert(err == 0);
}
for (int t = 0; t < nthreads; t++) {
int err = pthread_join(threads[t].thread, NULL);
assert(err == 0);
}
u32 nsols = 0;
for (unsigned s = 0; s < eq.nsols; s++) {
nsols++;
if (showsol) {
printf("Solution");
for (u32 i = 0; i < PROOFSIZE; i++)
printf(" %jx", (uintmax_t)eq.sols[s][i]);
printf("\n");
}
}
printf("%d solutions\n", nsols);
sumnsols += nsols;
}
free(threads);
printf("%d total solutions\n", sumnsols);
return 0;
}

994
equi_miner.cu Normal file

@@ -0,0 +1,994 @@
// Equihash CUDA solver
// Copyright (c) 2013-2016 John Tromp
#include "equi.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "blake2b.cu"
typedef uint64_t u64;
#define checkCudaErrors(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#ifndef RESTBITS
#define RESTBITS 4
#endif
// 2_log of number of buckets
#define BUCKBITS (DIGITBITS-RESTBITS)
// number of buckets
static const u32 NBUCKETS = 1<<BUCKBITS;
// 2_log of number of slots per bucket
static const u32 SLOTBITS = RESTBITS+1+1;
// number of slots per bucket
static const u32 NSLOTS = 1<<SLOTBITS;
// number of per-xhash slots
static const u32 XFULL = NSLOTS/4;
// SLOTBITS mask
static const u32 SLOTMASK = NSLOTS-1;
// number of possible values of xhash (rest of n) bits
static const u32 NRESTS = 1<<RESTBITS;
// number of blocks of hashes extracted from single 512 bit blake2b output
static const u32 NBLOCKS = (NHASHES+HASHESPERBLAKE-1)/HASHESPERBLAKE;
// nothing larger found in 100000 runs
static const u32 MAXSOLS = 8;
// scaling factor for showing the bucket-size histogram as a sparkline
#ifndef SPARKSCALE
#define SPARKSCALE (40 << (BUCKBITS-12))
#endif
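// Worked example (derived from the constants above) for the defaults
// WN=200, WK=9, RESTBITS=4:
//   BUCKBITS = 20-4 = 16, NBUCKETS = 65536
//   SLOTBITS = 4+1+1 = 6, NSLOTS = 64, SLOTMASK = 63, XFULL = 16
//   NRESTS = 16, NBLOCKS = (2097152+2-1)/2 = 1048576 blake2b calls in round 0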
// tree node identifying its children as two different slots in
// a bucket on previous layer with the same rest bits (x-tra hash)
struct tree {
unsigned bucketid : BUCKBITS;
unsigned slotid0 : SLOTBITS;
unsigned slotid1 : SLOTBITS;
#ifndef XWITHASH
unsigned xhash : RESTBITS;
#endif
// layer 0 has no children but needs to encode an index
__device__ u32 getindex() const {
return (bucketid << SLOTBITS) | slotid0;
}
__device__ void setindex(const u32 idx) {
slotid0 = idx & SLOTMASK;
bucketid = idx >> SLOTBITS;
}
};
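// Size check (derived, not in the original): without XWITHASH a node packs
// BUCKBITS + 2*SLOTBITS + RESTBITS bits; for the defaults 16+6+6+4 = 32 bits,
// so tree fits a u32, while e.g. WN=144, WK=5 gives 20+6+6+4 = 36 bits, which
// is why the Makefile builds equi1445 with -DXWITHASH.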
union htunit {
tree attr;
u32 hash;
uchar bytes[sizeof(u32)];
};
// a bucket is NSLOTS treenodes
typedef htunit bucket[NSLOTS];
// the N-bit hash consists of K+1 n-bit "digits"
// each of which corresponds to a layer of NBUCKETS buckets
typedef bucket digit[NBUCKETS];
// size (in bytes) of hash in round 0 <= r < WK
u32 hhashsize(const u32 r) {
#ifdef XWITHASH
const u32 hashbits = WN - (r+1) * DIGITBITS + RESTBITS;
#else
const u32 hashbits = WN - (r+1) * DIGITBITS;
#endif
return (hashbits + 7) / 8;
}
// size (in bytes) of hash in round 0 <= r < WK
__device__ u32 hashsize(const u32 r) {
#ifdef XWITHASH
const u32 hashbits = WN - (r+1) * DIGITBITS + RESTBITS;
#else
const u32 hashbits = WN - (r+1) * DIGITBITS;
#endif
return (hashbits + 7) / 8;
}
u32 hhtunits(u32 bytes) {
return (bytes + sizeof(htunit) - 1) / sizeof(htunit);
}
__device__ u32 htunits(u32 bytes) {
return (bytes + sizeof(htunit) - 1) / sizeof(htunit);
}
#ifdef JOINHT
__device__ u32 slotsize(const u32 r) {
return 1 + htunits(hashsize(r));
}
// size (in htunits) of bucket in round 0 <= r < WK
__device__ u32 bucketsize(const u32 r) {
return NSLOTS * slotsize(r);
}
#else
__device__ u32 slotsize(const u32 r) {
return 1;
}
#endif
// manages hash and tree data
struct htalloc {
#ifdef JOINHT
htunit *trees[WK];
#else
digit *trees;
htunit *hashes[2];
#endif
__device__ htunit *getbucket(u32 r, u32 bid) const {
#ifdef JOINHT
return &trees[r][bid * bucketsize(r)];
#else
return trees[r][bid];
#endif
}
};
typedef u32 bsizes[NBUCKETS];
//u32 __device__ min(const u32 a, const u32 b) {
// return a < b ? a : b;
//}
struct equi {
blake2b_state blake_ctx;
htalloc hta;
bsizes *nslots;
proof *sols;
u32 nsols;
u32 nthreads;
equi(const u32 n_threads) {
nthreads = n_threads;
}
void setnonce(const char *header, u32 nonce) {
setheader(&blake_ctx, header, nonce);
checkCudaErrors(cudaMemset(nslots, 0, NBUCKETS * sizeof(u32)));
nsols = 0;
}
__device__ u32 getnslots(const u32 r, const u32 bid) {
u32 &nslot = nslots[r&1][bid];
const u32 n = min(nslot, NSLOTS);
nslot = 0;
return n;
}
__device__ void orderindices(u32 *indices, u32 size) {
if (indices[0] > indices[size]) {
for (u32 i=0; i < size; i++) {
const u32 tmp = indices[i];
indices[i] = indices[size+i];
indices[size+i] = tmp;
}
}
}
__device__ void listindices(u32 r, const tree t, u32 *indices) {
if (r == 0) {
*indices = t.getindex();
return;
}
const htunit *bt = hta.getbucket(--r,t.bucketid);
const u32 size = 1 << r;
u32 *indices1 = indices + size;
listindices(r, bt[t.slotid0 * slotsize(r)].attr, indices);
listindices(r, bt[t.slotid1 * slotsize(r)].attr, indices1);
orderindices(indices, size);
}
__device__ void listindices1(const tree t, u32 *indices) {
const htunit *bt = hta.getbucket(0,t.bucketid);
const u32 size = 1 << 0;
indices[0] = bt[t.slotid0 * slotsize(0)].attr.getindex();
indices[size] = bt[t.slotid1 * slotsize(0)].attr.getindex();
orderindices(indices, size);
}
__device__ void listindices2(const tree t, u32 *indices) {
const htunit *bt = hta.getbucket(1,t.bucketid);
const u32 size = 1 << 1;
listindices1(bt[t.slotid0 * slotsize(1)].attr, indices);
listindices1(bt[t.slotid1 * slotsize(1)].attr, indices+size);
orderindices(indices, size);
}
__device__ void listindices3(const tree t, u32 *indices) {
const htunit *bt = hta.getbucket(2,t.bucketid);
const u32 size = 1 << 2;
listindices2(bt[t.slotid0 * slotsize(2)].attr, indices);
listindices2(bt[t.slotid1 * slotsize(2)].attr, indices+size);
orderindices(indices, size);
}
__device__ void listindices4(const tree t, u32 *indices) {
const htunit *bt = hta.getbucket(3,t.bucketid);
const u32 size = 1 << 3;
listindices3(bt[t.slotid0 * slotsize(3)].attr, indices);
listindices3(bt[t.slotid1 * slotsize(3)].attr, indices+size);
orderindices(indices, size);
}
__device__ void listindices5(const tree t, u32 *indices) {
const htunit *bt = hta.getbucket(4,t.bucketid);
const u32 size = 1 << 4;
listindices4(bt[t.slotid0 * slotsize(4)].attr, indices);
listindices4(bt[t.slotid1 * slotsize(4)].attr, indices+size);
orderindices(indices, size);
}
__device__ void listindices6(const tree t, u32 *indices) {
const htunit *bt = hta.getbucket(5,t.bucketid);
const u32 size = 1 << 5;
listindices5(bt[t.slotid0 * slotsize(5)].attr, indices);
listindices5(bt[t.slotid1 * slotsize(5)].attr, indices+size);
orderindices(indices, size);
}
__device__ void listindices7(const tree t, u32 *indices) {
const htunit *bt = hta.getbucket(6,t.bucketid);
const u32 size = 1 << 6;
listindices6(bt[t.slotid0 * slotsize(6)].attr, indices);
listindices6(bt[t.slotid1 * slotsize(6)].attr, indices+size);
orderindices(indices, size);
}
__device__ void listindices8(const tree t, u32 *indices) {
const htunit *bt = hta.getbucket(7,t.bucketid);
const u32 size = 1 << 7;
listindices7(bt[t.slotid0 * slotsize(7)].attr, indices);
listindices7(bt[t.slotid1 * slotsize(7)].attr, indices+size);
orderindices(indices, size);
}
__device__ void listindices9(const tree t, u32 *indices) {
const htunit *bt = hta.getbucket(8,t.bucketid);
const u32 size = 1 << 8;
listindices8(bt[t.slotid0 * slotsize(8)].attr, indices);
listindices8(bt[t.slotid1 * slotsize(8)].attr, indices+size);
orderindices(indices, size);
}
void showbsizes(u32 r) {
#if defined(HIST) || defined(SPARK)
u32 ns[NBUCKETS];
checkCudaErrors(cudaMemcpy(ns, nslots[r&1], NBUCKETS * sizeof(u32), cudaMemcpyDeviceToHost));
u32 bsizes[NSLOTS+1];
memset(bsizes, 0, (NSLOTS+1) * sizeof(u32));
for (u32 bucketid = 0; bucketid < NBUCKETS; bucketid++) {
u32 bsize = ns[bucketid];
if (bsize > NSLOTS)
bsize = NSLOTS;
bsizes[bsize]++;
}
for (u32 i=0; i<=NSLOTS; i++) {
#ifdef HIST
printf(" %d:%d", i, bsizes[i]);
#else
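// emit one UTF-8 lower block element (U+2581 = "\342\226\201" and up) per bucket size as a sparkline bar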
printf("\342\226%c", (uchar)'\201'+bsizes[i]/SPARKSCALE);
#endif
}
printf("\n");
#endif
}
// proper dupe test is a little costly on GPU, so allow false negatives
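// each index is binned by its low WK bits; matching high (>>WK) bits
// within a bin flag a duplicate, but a later index landing in the same
// bin overwrites the entry, so a real duplicate can go undetected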
__device__ bool probdupe(u32 *prf) {
unsigned short susp[PROOFSIZE];
memset(susp, 0xffff, PROOFSIZE * sizeof(unsigned short));
for (u32 i=0; i<PROOFSIZE; i++) {
u32 bin = prf[i] & (PROOFSIZE-1);
unsigned short msb = prf[i]>>WK;
if (msb == susp[bin])
return true;
susp[bin] = msb;
}
return false;
}
__device__ void candidate(const tree t) {
proof prf;
#if WK==9
listindices9(t, prf);
#elif WK==5
listindices5(t, prf);
#else
#error not implemented
#endif
if (probdupe(prf))
return;
u32 soli = atomicAdd(&nsols, 1);
if (soli < MAXSOLS)
#if WK==9
listindices9(t, sols[soli]);
#elif WK==5
listindices5(t, sols[soli]);
#else
#error not implemented
#endif
}
struct htlayout {
htalloc hta;
u32 prevhtunits;
u32 nexthtunits;
u32 dunits;
u32 prevbo;
u32 nextbo;
htunit *buck;
htunit *hashbase;
__device__ htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), dunits(0) {
u32 nexthashbytes = hashsize(r);
nexthtunits = htunits(nexthashbytes);
prevbo = 0;
nextbo = nexthtunits * sizeof(htunit) - nexthashbytes; // 0-3
if (r) {
u32 prevhashbytes = hashsize(r-1);
prevhtunits = htunits(prevhashbytes);
prevbo = prevhtunits * sizeof(htunit) - prevhashbytes; // 0-3
dunits = prevhtunits - nexthtunits;
}
#ifdef JOINHT
nexthtunits++;
prevhtunits++;
#endif
}
__device__ void setbucket(u32 r, u32 bid) {
buck = hta.getbucket(r, bid);
#ifdef JOINHT
hashbase = buck + 1;
#else
hashbase = hta.hashes[r&1] + (bid * NSLOTS) * prevhtunits;
#endif
}
__device__ u32 getxhash(const u32 slot, const htunit *hash) const {
#ifdef XWITHASH
return hash->bytes[prevbo] & 0xf;
#elif defined JOINHT
return buck[slot * prevhtunits].attr.xhash;
#else
return buck[slot].attr.xhash;
#endif
}
__device__ u32 prevhashunits() const {
#ifdef JOINHT
return prevhtunits - 1;
#else
return prevhtunits;
#endif
}
__device__ bool equal(const htunit *hash0, const htunit *hash1) const {
return hash0[prevhashunits()-1].hash == hash1[prevhashunits()-1].hash;
}
__device__ htunit *addtree(u32 r, tree t, u32 bid, u32 slot) {
htunit *buck = hta.getbucket(r,bid);
#ifdef JOINHT
htunit *slotree = buck + slot * nexthtunits;
slotree->attr = t;
return slotree + 1;
#else
buck[slot].attr = t;
return hta.hashes[r&1] + (bid * NSLOTS + slot) * nexthtunits;
#endif
}
};
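// per-bucket collision bookkeeping: with XBITMAP each xhash value keeps a
// 64-bit slot bitmap (requiring NSLOTS <= 64); otherwise up to XFULL
// colliding slot ids are recorded per xhash value and overflow is dropped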
struct collisiondata {
#ifdef XBITMAP
u64 xhashmap[NRESTS];
u64 xmap;
#else
typedef uchar xslot;
xslot nxhashslots[NRESTS];
xslot xhashslots[NRESTS][XFULL];
xslot *xx;
u32 n0;
u32 n1;
#endif
u32 s0;
__device__ void clear() {
#ifdef XBITMAP
memset(xhashmap, 0, NRESTS * sizeof(u64));
#else
memset(nxhashslots, 0, NRESTS * sizeof(xslot));
#endif
}
__device__ bool addslot(u32 s1, u32 xh) {
#ifdef XBITMAP
xmap = xhashmap[xh];
xhashmap[xh] |= (u64)1 << s1;
s0 = ~0;
return true;
#else
n1 = (u32)nxhashslots[xh]++;
if (n1 >= XFULL)
return false;
xx = xhashslots[xh];
xx[n1] = s1;
n0 = 0;
return true;
#endif
}
__device__ bool nextcollision() const {
#ifdef XBITMAP
return xmap != 0;
#else
return n0 < n1;
#endif
}
__device__ u32 slot() {
#ifdef XBITMAP
const u32 ffs = __ffsll(xmap);
s0 += ffs; xmap >>= ffs;
return s0;
#else
return (u32)xx[n0++];
#endif
}
};
};
__global__ void digit0(equi *eq) {
uchar hash[HASHOUT];
blake2b_state state;
equi::htlayout htl(eq, 0);
const u32 hashbytes = hashsize(0);
const u32 id = blockIdx.x * blockDim.x + threadIdx.x;
for (u32 block = id; block < NBLOCKS; block += eq->nthreads) {
state = eq->blake_ctx;
blake2b_gpu_hash(&state, block, hash, HASHOUT);
for (u32 i = 0; i<HASHESPERBLAKE; i++) {
const uchar *ph = hash + i * WN/8;
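// split the hash's leading BUCKBITS+RESTBITS bits into a bucket id
// and a rest ("x-tra") hash, per the configured layout below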
#if BUCKBITS == 16 && RESTBITS == 4
const u32 bucketid = ((u32)ph[0] << 8) | ph[1];
const u32 xhash = ph[2] >> 4;
#elif BUCKBITS == 20 && RESTBITS == 4
const u32 bucketid = ((((u32)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4;
#ifndef XWITHASH
const u32 xhash = ph[2] & 0xf;
#endif
#elif BUCKBITS == 12 && RESTBITS == 4
const u32 bucketid = ((u32)ph[0] << 4) | ph[1] >> 4;
const u32 xhash = ph[1] & 0xf;
#else
#error not implemented
#endif
const u32 slot = atomicAdd(&eq->nslots[0][bucketid], 1);
if (slot >= NSLOTS)
continue;
tree leaf;
leaf.setindex(block*HASHESPERBLAKE+i);
#ifndef XWITHASH
leaf.xhash = xhash;
#endif
htunit *dest = htl.addtree(0, leaf, bucketid, slot);
memcpy(dest->bytes+htl.nextbo, ph+WN/8-hashbytes, hashbytes);
}
}
}
__global__ void digitr(equi *eq, const u32 r) {
equi::htlayout htl(eq, r);
equi::collisiondata cd;
const u32 id = blockIdx.x * blockDim.x + threadIdx.x;
for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += eq->nthreads) {
cd.clear();
htl.setbucket(r-1, bucketid);
u32 bsize = eq->getnslots(r-1, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
continue;
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
if (htl.equal(hash0, hash1))
continue;
u32 xorbucketid;
u32 xhash;
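// derive the next round's bucket id and rest hash from the xor of the
// two colliding hashes; which bytes and nibbles are used depends on
// BUCKBITS/RESTBITS and, for BUCKBITS 16, on the round parity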
#if BUCKBITS == 16 && RESTBITS == 4
xorbucketid = ((u32)(hash0->bytes[htl.prevbo]^hash1->bytes[htl.prevbo]) << 8)
| (hash0->bytes[htl.prevbo+1]^hash1->bytes[htl.prevbo+1]);
xhash = hash0->bytes[htl.prevbo+2] ^ hash1->bytes[htl.prevbo+2];
if (r&1) {
xorbucketid = ((xorbucketid & 0xfff) << 4) | (xhash >> 4);
xhash &= 0xf;
} else xhash >>= 4;
#elif BUCKBITS == 20 && RESTBITS == 4 && defined XWITHASH
xhash = hash0->bytes[htl.prevbo+3] ^ hash1->bytes[htl.prevbo+3];
xorbucketid = ((((u32)(hash0->bytes[htl.prevbo+1]^hash1->bytes[htl.prevbo+1]) << 8) | (hash0->bytes[htl.prevbo+2]^hash1->bytes[htl.prevbo+2])) << 4) | xhash >> 4;
xhash &= 0xf;
#elif BUCKBITS == 12 && RESTBITS == 4
xhash = hash0->bytes[htl.prevbo+1] ^ hash1->bytes[htl.prevbo+1];
xorbucketid = ((u32)(hash0->bytes[htl.prevbo]^hash1->bytes[htl.prevbo]) << 4) | xhash >> 4;
xhash &= 0xf;
#else
#error not implemented
#endif
const u32 xorslot = atomicAdd(&eq->nslots[r&1][xorbucketid], 1);
if (xorslot >= NSLOTS)
continue;
tree xort; xort.bucketid = bucketid;
xort.slotid0 = s0; xort.slotid1 = s1;
#ifndef XWITHASH
xort.xhash = xhash;
#endif
htunit *xorhash = htl.addtree(r, xort, xorbucketid, xorslot);
for (u32 i=htl.dunits; i < htl.prevhashunits(); i++)
xorhash[i-htl.dunits].hash = hash0[i].hash ^ hash1[i].hash;
}
}
}
}
#ifdef UNROLL
__global__ void digit1(equi *eq) {
equi::htlayout htl(eq, 1);
equi::collisiondata cd;
const u32 id = blockIdx.x * blockDim.x + threadIdx.x;
for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += eq->nthreads) {
cd.clear();
htl.setbucket(0, bucketid);
u32 bsize = eq->getnslots(0, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
continue;
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
if (htl.equal(hash0, hash1))
continue;
u32 xorbucketid;
u32 xhash;
xorbucketid = ((u32)(hash0->bytes[htl.prevbo]^hash1->bytes[htl.prevbo]) << 8)
| (hash0->bytes[htl.prevbo+1]^hash1->bytes[htl.prevbo+1]);
xhash = hash0->bytes[htl.prevbo+2] ^ hash1->bytes[htl.prevbo+2];
xorbucketid = ((xorbucketid & 0xfff) << 4) | (xhash >> 4);
xhash &= 0xf;
const u32 xorslot = atomicAdd(&eq->nslots[1][xorbucketid], 1);
if (xorslot >= NSLOTS)
continue;
tree xort; xort.bucketid = bucketid;
xort.slotid0 = s0; xort.slotid1 = s1;
xort.xhash = xhash;
htunit *xorhash = htl.addtree(1, xort, xorbucketid, xorslot);
xorhash[0].hash = hash0[1].hash ^ hash1[1].hash;
xorhash[1].hash = hash0[2].hash ^ hash1[2].hash;
xorhash[2].hash = hash0[3].hash ^ hash1[3].hash;
xorhash[3].hash = hash0[4].hash ^ hash1[4].hash;
xorhash[4].hash = hash0[5].hash ^ hash1[5].hash;
}
}
}
}
__global__ void digit2(equi *eq) {
equi::htlayout htl(eq, 2);
equi::collisiondata cd;
const u32 id = blockIdx.x * blockDim.x + threadIdx.x;
for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += eq->nthreads) {
cd.clear();
htl.setbucket(1, bucketid);
u32 bsize = eq->getnslots(1, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
continue;
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
if (htl.equal(hash0, hash1))
continue;
u32 xorbucketid;
u32 xhash;
xorbucketid = ((u32)(hash0->bytes[htl.prevbo]^hash1->bytes[htl.prevbo]) << 8)
| (hash0->bytes[htl.prevbo+1]^hash1->bytes[htl.prevbo+1]);
xhash = (hash0->bytes[htl.prevbo+2] ^ hash1->bytes[htl.prevbo+2]) >> 4;
const u32 xorslot = atomicAdd(&eq->nslots[0][xorbucketid], 1);
if (xorslot >= NSLOTS)
continue;
tree xort; xort.bucketid = bucketid;
xort.slotid0 = s0; xort.slotid1 = s1;
xort.xhash = xhash;
htunit *xorhash = htl.addtree(2, xort, xorbucketid, xorslot);
xorhash[0].hash = hash0[0].hash ^ hash1[0].hash;
xorhash[1].hash = hash0[1].hash ^ hash1[1].hash;
xorhash[2].hash = hash0[2].hash ^ hash1[2].hash;
xorhash[3].hash = hash0[3].hash ^ hash1[3].hash;
xorhash[4].hash = hash0[4].hash ^ hash1[4].hash;
}
}
}
}
__global__ void digit3(equi *eq) {
equi::htlayout htl(eq, 3);
equi::collisiondata cd;
const u32 id = blockIdx.x * blockDim.x + threadIdx.x;
for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += eq->nthreads) {
cd.clear();
htl.setbucket(2, bucketid);
u32 bsize = eq->getnslots(2, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
continue;
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
if (htl.equal(hash0, hash1))
continue;
u32 xorbucketid;
u32 xhash;
xorbucketid = ((u32)(hash0->bytes[htl.prevbo]^hash1->bytes[htl.prevbo]) << 8)
| (hash0->bytes[htl.prevbo+1]^hash1->bytes[htl.prevbo+1]);
xhash = hash0->bytes[htl.prevbo+2] ^ hash1->bytes[htl.prevbo+2];
xorbucketid = ((xorbucketid & 0xfff) << 4) | (xhash >> 4);
xhash &= 0xf;
const u32 xorslot = atomicAdd(&eq->nslots[1][xorbucketid], 1);
if (xorslot >= NSLOTS)
continue;
tree xort; xort.bucketid = bucketid;
xort.slotid0 = s0; xort.slotid1 = s1;
xort.xhash = xhash;
htunit *xorhash = htl.addtree(3, xort, xorbucketid, xorslot);
xorhash[0].hash = hash0[1].hash ^ hash1[1].hash;
xorhash[1].hash = hash0[2].hash ^ hash1[2].hash;
xorhash[2].hash = hash0[3].hash ^ hash1[3].hash;
xorhash[3].hash = hash0[4].hash ^ hash1[4].hash;
}
}
}
}
__global__ void digit4(equi *eq) {
equi::htlayout htl(eq, 4);
equi::collisiondata cd;
const u32 id = blockIdx.x * blockDim.x + threadIdx.x;
for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += eq->nthreads) {
cd.clear();
htl.setbucket(3, bucketid);
u32 bsize = eq->getnslots(3, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
continue;
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
if (htl.equal(hash0, hash1))
continue;
u32 xorbucketid;
u32 xhash;
xorbucketid = ((u32)(hash0->bytes[htl.prevbo]^hash1->bytes[htl.prevbo]) << 8)
| (hash0->bytes[htl.prevbo+1]^hash1->bytes[htl.prevbo+1]);
xhash = (hash0->bytes[htl.prevbo+2] ^ hash1->bytes[htl.prevbo+2]) >> 4;
const u32 xorslot = atomicAdd(&eq->nslots[0][xorbucketid], 1);
if (xorslot >= NSLOTS)
continue;
tree xort; xort.bucketid = bucketid;
xort.slotid0 = s0; xort.slotid1 = s1;
xort.xhash = xhash;
htunit *xorhash = htl.addtree(4, xort, xorbucketid, xorslot);
xorhash[0].hash = hash0[0].hash ^ hash1[0].hash;
xorhash[1].hash = hash0[1].hash ^ hash1[1].hash;
xorhash[2].hash = hash0[2].hash ^ hash1[2].hash;
xorhash[3].hash = hash0[3].hash ^ hash1[3].hash;
}
}
}
}
__global__ void digit5(equi *eq) {
equi::htlayout htl(eq, 5);
equi::collisiondata cd;
const u32 id = blockIdx.x * blockDim.x + threadIdx.x;
for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += eq->nthreads) {
cd.clear();
htl.setbucket(4, bucketid);
u32 bsize = eq->getnslots(4, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
continue;
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
if (htl.equal(hash0, hash1))
continue;
u32 xorbucketid;
u32 xhash;
xorbucketid = ((u32)(hash0->bytes[htl.prevbo]^hash1->bytes[htl.prevbo]) << 8)
| (hash0->bytes[htl.prevbo+1]^hash1->bytes[htl.prevbo+1]);
xhash = hash0->bytes[htl.prevbo+2] ^ hash1->bytes[htl.prevbo+2];
xorbucketid = ((xorbucketid & 0xfff) << 4) | (xhash >> 4);
xhash &= 0xf;
const u32 xorslot = atomicAdd(&eq->nslots[1][xorbucketid], 1);
if (xorslot >= NSLOTS)
continue;
tree xort; xort.bucketid = bucketid;
xort.slotid0 = s0; xort.slotid1 = s1;
xort.xhash = xhash;
htunit *xorhash = htl.addtree(5, xort, xorbucketid, xorslot);
xorhash[0].hash = hash0[1].hash ^ hash1[1].hash;
xorhash[1].hash = hash0[2].hash ^ hash1[2].hash;
xorhash[2].hash = hash0[3].hash ^ hash1[3].hash;
}
}
}
}
__global__ void digit6(equi *eq) {
equi::htlayout htl(eq, 6);
equi::collisiondata cd;
const u32 id = blockIdx.x * blockDim.x + threadIdx.x;
for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += eq->nthreads) {
cd.clear();
htl.setbucket(5, bucketid);
u32 bsize = eq->getnslots(5, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
continue;
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
if (htl.equal(hash0, hash1))
continue;
u32 xorbucketid;
u32 xhash;
xorbucketid = ((u32)(hash0->bytes[htl.prevbo]^hash1->bytes[htl.prevbo]) << 8)
| (hash0->bytes[htl.prevbo+1]^hash1->bytes[htl.prevbo+1]);
xhash = (hash0->bytes[htl.prevbo+2] ^ hash1->bytes[htl.prevbo+2]) >> 4;
const u32 xorslot = atomicAdd(&eq->nslots[0][xorbucketid], 1);
if (xorslot >= NSLOTS)
continue;
tree xort; xort.bucketid = bucketid;
xort.slotid0 = s0; xort.slotid1 = s1;
xort.xhash = xhash;
htunit *xorhash = htl.addtree(6, xort, xorbucketid, xorslot);
xorhash[0].hash = hash0[1].hash ^ hash1[1].hash;
xorhash[1].hash = hash0[2].hash ^ hash1[2].hash;
}
}
}
}
__global__ void digit7(equi *eq) {
equi::htlayout htl(eq, 7);
equi::collisiondata cd;
const u32 id = blockIdx.x * blockDim.x + threadIdx.x;
for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += eq->nthreads) {
cd.clear();
htl.setbucket(6, bucketid);
u32 bsize = eq->getnslots(6, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
continue;
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
if (htl.equal(hash0, hash1))
continue;
u32 xorbucketid;
u32 xhash;
xorbucketid = ((u32)(hash0->bytes[htl.prevbo]^hash1->bytes[htl.prevbo]) << 8)
| (hash0->bytes[htl.prevbo+1]^hash1->bytes[htl.prevbo+1]);
xhash = hash0->bytes[htl.prevbo+2] ^ hash1->bytes[htl.prevbo+2];
xorbucketid = ((xorbucketid & 0xfff) << 4) | (xhash >> 4);
xhash &= 0xf;
const u32 xorslot = atomicAdd(&eq->nslots[1][xorbucketid], 1);
if (xorslot >= NSLOTS)
continue;
tree xort; xort.bucketid = bucketid;
xort.slotid0 = s0; xort.slotid1 = s1;
xort.xhash = xhash;
htunit *xorhash = htl.addtree(7, xort, xorbucketid, xorslot);
xorhash[0].hash = hash0[0].hash ^ hash1[0].hash;
xorhash[1].hash = hash0[1].hash ^ hash1[1].hash;
}
}
}
}
__global__ void digit8(equi *eq) {
equi::htlayout htl(eq, 8);
equi::collisiondata cd;
const u32 id = blockIdx.x * blockDim.x + threadIdx.x;
for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += eq->nthreads) {
cd.clear();
htl.setbucket(7, bucketid);
u32 bsize = eq->getnslots(7, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
continue;
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
if (htl.equal(hash0, hash1))
continue;
u32 xorbucketid;
u32 xhash;
xorbucketid = ((u32)(hash0->bytes[htl.prevbo]^hash1->bytes[htl.prevbo]) << 8)
| (hash0->bytes[htl.prevbo+1]^hash1->bytes[htl.prevbo+1]);
xhash = (hash0->bytes[htl.prevbo+2] ^ hash1->bytes[htl.prevbo+2]) >> 4;
const u32 xorslot = atomicAdd(&eq->nslots[0][xorbucketid], 1);
if (xorslot >= NSLOTS)
continue;
tree xort; xort.bucketid = bucketid;
xort.slotid0 = s0; xort.slotid1 = s1;
xort.xhash = xhash;
htunit *xorhash = htl.addtree(8, xort, xorbucketid, xorslot);
xorhash[0].hash = hash0[1].hash ^ hash1[1].hash;
}
}
}
}
#endif
__global__ void digitK(equi *eq) {
equi::collisiondata cd;
equi::htlayout htl(eq, WK);
const u32 id = blockIdx.x * blockDim.x + threadIdx.x;
for (u32 bucketid = id; bucketid < NBUCKETS; bucketid += eq->nthreads) {
cd.clear();
htl.setbucket(WK-1, bucketid);
u32 bsize = eq->getnslots(WK-1, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
continue;
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
if (htl.equal(hash0, hash1)) {
tree xort; xort.bucketid = bucketid;
xort.slotid0 = s0; xort.slotid1 = s1;
eq->candidate(xort);
}
}
}
}
}
#include <unistd.h>
int main(int argc, char **argv) {
int nthreads = 8192;
int nonce = 0;
int tpb = 0;
int range = 1;
bool showsol = false;
const char *header = "";
int c;
while ((c = getopt (argc, argv, "h:n:r:t:p:s")) != -1) {
switch (c) {
case 'h':
header = optarg;
break;
case 'n':
nonce = atoi(optarg);
break;
case 't':
nthreads = atoi(optarg);
break;
case 'p':
tpb = atoi(optarg);
break;
case 'r':
range = atoi(optarg);
break;
case 's':
showsol = true;
break;
}
}
if (!tpb) // if not set, default threads per block to the smallest power of two whose square covers nthreads
for (tpb = 1; tpb*tpb < nthreads; tpb *= 2) ;
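// example invocation (binary name per the Makefile's eqcuda target, assumed here):
//   ./eqcuda -n 0 -r 10 -s   solves nonces 0..9 and prints any solutions found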
printf("Looking for wagner-tree on (\"%s\",%d", header, nonce);
if (range > 1)
printf("-%d", nonce+range-1);
printf(") with %d %d-bits digits and %d threads (%d per block)\n", NDIGITS, DIGITBITS, nthreads, tpb);
equi eq(nthreads);
#ifdef JOINHT
for (u32 r=0; r < WK; r++)
checkCudaErrors(cudaMalloc((void**)&eq.hta.trees[r], NBUCKETS * NSLOTS * (1 + hhtunits(hhashsize(r))) * sizeof(htunit)));
#else
checkCudaErrors(cudaMalloc((void**)&eq.hta.trees, WK * NBUCKETS * NSLOTS * sizeof(tree)));
for (u32 r=0; r < 2; r++)
checkCudaErrors(cudaMalloc((void**)&eq.hta.hashes[r], NBUCKETS * NSLOTS * hhtunits(hhashsize(r)) * sizeof(htunit)));
#endif
checkCudaErrors(cudaMalloc((void**)&eq.nslots, 2 * NBUCKETS * sizeof(u32)));
checkCudaErrors(cudaMalloc((void**)&eq.sols, MAXSOLS * sizeof(proof)));
equi *device_eq;
checkCudaErrors(cudaMalloc((void**)&device_eq, sizeof(equi)));
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
proof sols[MAXSOLS];
u32 sumnsols = 0;
for (int r = 0; r < range; r++) {
cudaEventRecord(start, NULL);
eq.setnonce(header, nonce+r);
checkCudaErrors(cudaMemcpy(device_eq, &eq, sizeof(equi), cudaMemcpyHostToDevice));
printf("Digit 0\n");
digit0<<<nthreads/tpb,tpb >>>(device_eq);
eq.showbsizes(0);
#if BUCKBITS == 16 && RESTBITS == 4 && defined(UNROLL)
printf("Digit %d\n", 1);
digit1<<<nthreads/tpb,tpb >>>(device_eq);
eq.showbsizes(1);
printf("Digit %d\n", 2);
digit2<<<nthreads/tpb,tpb >>>(device_eq);
eq.showbsizes(2);
printf("Digit %d\n", 3);
digit3<<<nthreads/tpb,tpb >>>(device_eq);
eq.showbsizes(3);
printf("Digit %d\n", 4);
digit4<<<nthreads/tpb,tpb >>>(device_eq);
eq.showbsizes(4);
printf("Digit %d\n", 5);
digit5<<<nthreads/tpb,tpb >>>(device_eq);
eq.showbsizes(5);
printf("Digit %d\n", 6);
digit6<<<nthreads/tpb,tpb >>>(device_eq);
eq.showbsizes(6);
printf("Digit %d\n", 7);
digit7<<<nthreads/tpb,tpb >>>(device_eq);
eq.showbsizes(7);
printf("Digit %d\n", 8);
digit8<<<nthreads/tpb,tpb >>>(device_eq);
eq.showbsizes(8);
#else
for (u32 r=1; r < WK; r++) {
printf("Digit %d\n", r);
digitr<<<nthreads/tpb,tpb >>>(device_eq, r);
eq.showbsizes(r);
}
#endif
printf("Digit %d\n", WK);
digitK<<<nthreads/tpb,tpb >>>(device_eq);
checkCudaErrors(cudaMemcpy(&eq, device_eq, sizeof(equi), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(sols, eq.sols, MAXSOLS * sizeof(proof), cudaMemcpyDeviceToHost));
cudaEventRecord(stop, NULL);
cudaEventSynchronize(stop);
float duration;
cudaEventElapsedTime(&duration, start, stop);
printf("%d rounds completed in %.3f seconds.\n", WK, duration / 1000.0f);
u32 nsols = 0;
for (unsigned s = 0; s < eq.nsols; s++) {
if (duped(sols[s])) {
printf("Duped!\n");
continue;
}
nsols++;
if (showsol) {
printf("Solution");
for (int i = 0; i < PROOFSIZE; i++)
printf(" %jx", (uintmax_t)sols[s][i]);
printf("\n");
}
}
printf("%d solutions\n", nsols);
sumnsols += nsols;
}
checkCudaErrors(cudaFree(eq.nslots));
checkCudaErrors(cudaFree(eq.sols));
#ifdef JOINHT
for (u32 r=0; r < WK; r++)
checkCudaErrors(cudaFree(eq.hta.trees[r]));
#else
checkCudaErrors(cudaFree(eq.hta.trees));
for (u32 r=0; r < 2; r++)
checkCudaErrors(cudaFree(eq.hta.hashes[r]));
#endif
printf("%d total solutions\n", sumnsols);
return 0;
}

588
equi_miner.h Normal file
View File

@ -0,0 +1,588 @@
// Equihash solver
// Copyright (c) 2013-2016 John Tromp
// Fix N, K, such that n = N/(K+1) is an integer
// Fix M = 2^{n+1} hashes each of length N bits,
// H_0, ... , H_{M-1}, generated from (n+1)-bit indices.
// Problem: find binary tree on 2^K distinct indices,
// for which the exclusive-or of leaf hashes is all 0s.
// Additionally, it should satisfy the Wagner conditions:
// for each height i subtree, the exclusive-or
// of its 2^i corresponding hashes starts with i*n 0 bits,
// and for i>0 the leftmost leaf of its left subtree
// is less than the leftmost leaf of its right subtree
// The algorithm below solves this by maintaining the trees
// in a graph of K layers, each split into buckets
// with buckets indexed by the first n-RESTBITS bits following
// the i*n 0s, each bucket having 4 * 2^RESTBITS slots,
// twice the number of subtrees expected to land there.
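// A concrete instance (assuming the defaults WN=200, WK=9, RESTBITS=4 from equi.h):
// n = 20, M = 2^21 hashes, a solution has PROOFSIZE = 2^9 = 512 leaf indices
// of 21 bits each, BUCKBITS = 16 so NBUCKETS = 65536, and SLOTBITS = 6
// so NSLOTS = 64 slots per bucket.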
#include "equi.h"
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <assert.h>
typedef uint64_t u64;
#ifdef ATOMIC
#include <atomic>
typedef std::atomic<u32> au32;
#else
typedef u32 au32;
#endif
#ifndef RESTBITS
#define RESTBITS 4
#endif
// 2_log of number of buckets
#define BUCKBITS (DIGITBITS-RESTBITS)
// number of buckets
static const u32 NBUCKETS = 1<<BUCKBITS;
// 2_log of number of slots per bucket
static const u32 SLOTBITS = RESTBITS+1+1;
// number of slots per bucket
static const u32 NSLOTS = 1<<SLOTBITS;
// maximum number of colliding slots recorded per xhash value
static const u32 XFULL = NSLOTS/4;
// SLOTBITS mask
static const u32 SLOTMASK = NSLOTS-1;
// number of possible values of xhash (rest of n) bits
static const u32 NRESTS = 1<<RESTBITS;
// number of blake2b invocations needed; each (up to 512-bit) output yields HASHESPERBLAKE hashes
static const u32 NBLOCKS = (NHASHES+HASHESPERBLAKE-1)/HASHESPERBLAKE;
// cap on solutions stored per nonce; nothing larger found in 100000 runs
static const u32 MAXSOLS = 8;
// scaling factor for showing the bucket size histogram as a sparkline
#ifndef SPARKSCALE
#define SPARKSCALE (40 << (BUCKBITS-12))
#endif
// tree node identifying its children as two different slots in
// a bucket on the previous layer with the same rest bits (x-tra hash)
struct tree {
unsigned bucketid : BUCKBITS;
unsigned slotid0 : SLOTBITS;
unsigned slotid1 : SLOTBITS;
#ifndef XWITHASH
unsigned xhash : RESTBITS;
#endif
// layer 0 has no children but needs to encode the index
u32 getindex() const {
return (bucketid << SLOTBITS) | slotid0;
}
void setindex(const u32 idx) {
slotid0 = idx & SLOTMASK;
bucketid = idx >> SLOTBITS;
}
};
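// e.g. with BUCKBITS=16 and SLOTBITS=6 the packed index occupies 22 bits,
// enough to hold the n+1 = 21 bits of a leaf's hash generation index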
union htunit {
tree attr;
u32 hash;
uchar bytes[sizeof(u32)];
};
// a bucket is NSLOTS tree nodes
typedef htunit bucket[NSLOTS];
// the N-bit hash consists of K+1 n-bit "digits"
// each of which corresponds to a layer of NBUCKETS buckets
// size (in bytes) of hash in round 0 <= r < WK
u32 hashsize(const u32 r) {
#ifdef XWITHASH
const u32 hashbits = WN - (r+1) * DIGITBITS + RESTBITS;
#else
const u32 hashbits = WN - (r+1) * DIGITBITS;
#endif
return (hashbits + 7) / 8;
}
u32 htunits(u32 bytes) {
return (bytes + sizeof(htunit) - 1) / sizeof(htunit);
}
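// e.g. with the default WN=200 (and no XWITHASH): round 0 hashes have
// 200-20 = 180 bits, so hashsize(0) = 23 bytes and htunits(23) = 6
// four-byte units, leaving nextbo = 6*4-23 = 1 slack byte before the hash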
#ifdef JOINHT
u32 slotsize(const u32 r) {
return 1 + htunits(hashsize(r));
}
// size (in htunits) of bucket in round 0 <= r < WK
u32 bucketsize(const u32 r) {
return NSLOTS * slotsize(r);
}
#else
u32 slotsize(const u32 r) {
return 1;
}
#endif
// manages hash and tree data
struct htalloc {
#ifdef JOINHT
htunit *trees[WK];
#else
bucket *trees[WK];
htunit *hashes[WK];
#endif
u64 alloced;
htalloc() {
alloced = 0;
}
void alloctrees() {
#ifdef JOINHT
for (int r=0; r<WK; r++)
trees[r] = (htunit *)alloc(NBUCKETS * NSLOTS * (1 + htunits(hashsize(r))), sizeof(htunit));
#endif
}
void dealloctrees() {
#ifdef JOINHT
for (int r=0; r<WK; r++)
dealloc(trees[r], NBUCKETS * NSLOTS * (1 + htunits(hashsize(r))), sizeof(htunit));
#else
for (int r=0; r<WK; r++)
dealloc(trees[r], NBUCKETS, sizeof(bucket));
#endif
}
void alloc_ht(u32 r) {
#ifndef JOINHT
hashes[r] = (htunit *)alloc(NBUCKETS * NSLOTS * htunits(hashsize(r)), sizeof(htunit));
trees[r] = (bucket *)alloc(NBUCKETS, sizeof(bucket));
#endif
}
void dealloc_ht(u32 r) {
#ifndef JOINHT
dealloc(hashes[r], NBUCKETS * NSLOTS * htunits(hashsize(r)), sizeof(htunit));
#endif
}
htunit *getbucket(u32 r, u32 bid) const {
#ifdef JOINHT
return &trees[r][bid * bucketsize(r)];
#else
return trees[r][bid];
#endif
}
void *alloc(const u32 n, const u32 sz) {
void *mem = calloc(n, sz);
assert(mem);
alloced += (u64)n * sz;
return mem;
}
void dealloc(void *mem, const u32 n, const u32 sz) {
free(mem);
alloced -= (u64)n * sz;
}
};
typedef au32 bsizes[NBUCKETS];
u32 min(const u32 a, const u32 b) {
return a < b ? a : b;
}
struct equi {
blake2b_state blake_ctx;
htalloc hta;
bsizes *nslots;
proof *sols;
au32 nsols;
u32 nthreads;
u32 xfull;
u32 hfull;
u32 bfull;
pthread_barrier_t barry;
equi(const u32 n_threads) {
nthreads = n_threads;
const int err = pthread_barrier_init(&barry, NULL, nthreads);
assert(!err);
hta.alloctrees();
nslots = (bsizes *)hta.alloc(2 * NBUCKETS, sizeof(au32));
sols = (proof *)hta.alloc(MAXSOLS, sizeof(proof));
}
~equi() {
free(nslots);
free(sols);
}
void setnonce(const char *header, u32 nonce) {
setheader(&blake_ctx, header, nonce);
memset(nslots, 0, NBUCKETS * sizeof(au32)); // only nslots[0] needs zeroing
nsols = 0;
}
u32 findslot(const u32 r, const u32 bucketi) {
#ifdef ATOMIC
return std::atomic_fetch_add_explicit(&nslots[r&1][bucketi], 1U, std::memory_order_relaxed);
#else
return nslots[r&1][bucketi]++;
#endif
}
u32 getnslots(const u32 r, const u32 bid) {
au32 &nslot = nslots[r&1][bid];
const u32 n = min(nslot, NSLOTS);
nslot = 0;
return n;
}
void listindices(u32 r, const tree t, u32 *indices) {
if (r == 0) {
*indices = t.getindex();
return;
}
const htunit *bt = hta.getbucket(--r,t.bucketid);
const u32 size = 1 << r;
u32 *indices1 = indices + size;
listindices(r, bt[t.slotid0 * slotsize(r)].attr, indices);
listindices(r, bt[t.slotid1 * slotsize(r)].attr, indices1);
if (*indices > *indices1) {
for (u32 i=0; i < size; i++) {
const u32 tmp = indices[i];
indices[i] = indices1[i];
indices1[i] = tmp;
}
}
}
void candidate(const tree t) {
proof prf;
listindices(WK, t, prf);
qsort(prf, PROOFSIZE, sizeof(u32), &compu32);
for (u32 i=1; i<PROOFSIZE; i++)
if (prf[i] <= prf[i-1])
return;
#ifdef ATOMIC
u32 soli = std::atomic_fetch_add_explicit(&nsols, 1U, std::memory_order_relaxed);
#else
u32 soli = nsols++;
#endif
if (soli < MAXSOLS)
listindices(WK, t, sols[soli]);
}
void showbsizes(u32 r) {
#if defined(HIST) || defined(SPARK)
u32 bsizes[NSLOTS+1];
memset(bsizes, 0, (NSLOTS+1) * sizeof(u32));
for (u32 bucketid = 0; bucketid < NBUCKETS; bucketid++) {
u32 bsize = nslots[r&1][bucketid];
if (bsize < NSLOTS)
bsizes[bsize]++;
else
bsizes[NSLOTS]++;
}
for (u32 i=0; i<=NSLOTS; i++) {
#ifdef HIST
printf(" %d:%d", i, bsizes[i]);
#else
printf("\342\226%c", '\201'+bsizes[i]/SPARKSCALE);
#endif
}
printf(" %ld MB\n", hta.alloced >> 20);
#endif
}
struct htlayout {
htalloc hta;
u32 prevhtunits;
u32 nexthtunits;
u32 dunits;
u32 prevbo;
u32 nextbo;
htunit *buck;
htunit *hashbase;
htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), dunits(0) {
u32 nexthashbytes = hashsize(r);
nexthtunits = htunits(nexthashbytes);
prevbo = 0;
nextbo = nexthtunits * sizeof(htunit) - nexthashbytes; // 0-3
if (r) {
u32 prevhashbytes = hashsize(r-1);
prevhtunits = htunits(prevhashbytes);
prevbo = prevhtunits * sizeof(htunit) - prevhashbytes; // 0-3
dunits = prevhtunits - nexthtunits;
}
#ifdef JOINHT
nexthtunits++;
prevhtunits++;
#endif
}
void setbucket(u32 r, u32 bid) {
buck = hta.getbucket(r, bid);
#ifdef JOINHT
hashbase = buck + 1;
#else
hashbase = hta.hashes[r] + (bid * NSLOTS) * prevhtunits;
#endif
}
u32 getxhash(const u32 slot, const htunit *hash) const {
#ifdef XWITHASH
return hash->bytes[prevbo] & 0xf;
#elif defined JOINHT
return buck[slot * prevhtunits].attr.xhash;
#else
return buck[slot].attr.xhash;
#endif
}
u32 prevhashunits() const {
#ifdef JOINHT
return prevhtunits - 1;
#else
return prevhtunits;
#endif
}
bool equal(const htunit *hash0, const htunit *hash1) const {
return hash0[prevhashunits()-1].hash == hash1[prevhashunits()-1].hash;
}
htunit *addtree(u32 r, tree t, u32 bid, u32 slot) {
htunit *buck = hta.getbucket(r,bid);
#ifdef JOINHT
htunit *slotree = buck + slot * nexthtunits;
slotree->attr = t;
return slotree + 1;
#else
buck[slot].attr = t;
return hta.hashes[r] + (bid * NSLOTS + slot) * nexthtunits;
#endif
}
};
struct collisiondata {
#ifdef XBITMAP
u64 xhashmap[NRESTS];
u64 xmap;
#else
typedef uchar xslot;
xslot nxhashslots[NRESTS];
xslot xhashslots[NRESTS][XFULL];
xslot *xx;
u32 n0;
u32 n1;
#endif
u32 s0;
void clear() {
#ifdef XBITMAP
memset(xhashmap, 0, NRESTS * sizeof(u64));
#else
memset(nxhashslots, 0, NRESTS * sizeof(xslot));
#endif
}
bool addslot(u32 s1, u32 xh) {
#ifdef XBITMAP
xmap = xhashmap[xh];
xhashmap[xh] |= (u64)1 << s1;
s0 = -1;
return true;
#else
n1 = (u32)nxhashslots[xh]++;
if (n1 >= XFULL)
return false;
xx = xhashslots[xh];
xx[n1] = s1;
n0 = 0;
return true;
#endif
}
bool nextcollision() const {
#ifdef XBITMAP
return xmap != 0;
#else
return n0 < n1;
#endif
}
u32 slot() {
#ifdef XBITMAP
const u32 ffs = __builtin_ffsll(xmap);
s0 += ffs; xmap >>= ffs;
return s0;
#else
return (u32)xx[n0++];
#endif
}
};
void digit0(const u32 id) {
uchar hash[HASHOUT];
blake2b_state state;
htlayout htl(this, 0);
const u32 hashbytes = hashsize(0);
for (u32 block = id; block < NBLOCKS; block += nthreads) {
state = blake_ctx;
const u32 leb = htole32(block);
blake2b_update(&state, (uchar *)&leb, sizeof(u32));
blake2b_final(&state, hash, HASHOUT);
for (u32 i = 0; i<HASHESPERBLAKE; i++) {
const uchar *ph = hash + i * WN/8;
#if BUCKBITS == 16 && RESTBITS == 4
const u32 bucketid = ((u32)ph[0] << 8) | ph[1];
const u32 xhash = ph[2] >> 4;
#elif BUCKBITS == 20 && RESTBITS == 4
const u32 bucketid = ((((u32)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4;
#ifndef XWITHASH
const u32 xhash = ph[2] & 0xf;
#endif
#elif BUCKBITS == 12 && RESTBITS == 4
const u32 bucketid = ((u32)ph[0] << 4) | ph[1] >> 4;
const u32 xhash = ph[1] & 0xf;
#else
#error not implemented
#endif
const u32 slot = findslot(0, bucketid);
if (slot >= NSLOTS) {
bfull++;
continue;
}
tree leaf;
leaf.setindex(block*HASHESPERBLAKE+i);
#ifndef XWITHASH
leaf.xhash = xhash;
#endif
htunit *dest = htl.addtree(0, leaf, bucketid, slot);
memcpy(dest->bytes+htl.nextbo, ph+WN/8-hashbytes, hashbytes);
}
}
}
void digitr(const u32 r, const u32 id) {
htlayout htl(this, r);
collisiondata cd;
for (u32 bucketid=id; bucketid < NBUCKETS; bucketid += nthreads) {
cd.clear();
htl.setbucket(r-1, bucketid);
u32 bsize = getnslots(r-1, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1))) {
xfull++;
continue;
}
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
if (htl.equal(hash0, hash1)) {
hfull++;
continue;
}
u32 xorbucketid;
u32 xhash;
#if BUCKBITS == 16 && RESTBITS == 4
xorbucketid = ((u32)(hash0->bytes[htl.prevbo]^hash1->bytes[htl.prevbo]) << 8)
| (hash0->bytes[htl.prevbo+1]^hash1->bytes[htl.prevbo+1]);
xhash = hash0->bytes[htl.prevbo+2] ^ hash1->bytes[htl.prevbo+2];
if (r&1) {
xorbucketid = ((xorbucketid & 0xfff) << 4) | (xhash >> 4);
xhash &= 0xf;
} else xhash >>= 4;
#elif BUCKBITS == 20 && RESTBITS == 4 && defined XWITHASH
xhash = hash0->bytes[htl.prevbo+3] ^ hash1->bytes[htl.prevbo+3];
xorbucketid = ((((u32)(hash0->bytes[htl.prevbo+1]^hash1->bytes[htl.prevbo+1]) << 8) | (hash0->bytes[htl.prevbo+2]^hash1->bytes[htl.prevbo+2])) << 4) | xhash >> 4;
xhash &= 0xf;
#elif BUCKBITS == 12 && RESTBITS == 4
xhash = hash0->bytes[htl.prevbo+1] ^ hash1->bytes[htl.prevbo+1];
xorbucketid = ((u32)(hash0->bytes[htl.prevbo]^hash1->bytes[htl.prevbo]) << 4) | xhash >> 4;
xhash &= 0xf;
#else
#error not implemented
#endif
const u32 xorslot = findslot(r, xorbucketid);
if (xorslot >= NSLOTS) {
bfull++;
continue;
}
tree xort; xort.bucketid = bucketid;
xort.slotid0 = s0; xort.slotid1 = s1;
#ifndef XWITHASH
xort.xhash = xhash;
#endif
htunit *xorhash = htl.addtree(r, xort, xorbucketid, xorslot);
for (u32 i=htl.dunits; i < htl.prevhashunits(); i++)
xorhash[i-htl.dunits].hash = hash0[i].hash ^ hash1[i].hash;
}
}
}
}
void digitK(const u32 id) {
collisiondata cd;
htlayout htl(this, WK);
for (u32 bucketid = id; bucketid < NBUCKETS; bucketid += nthreads) {
cd.clear();
htl.setbucket(WK-1, bucketid);
u32 bsize = getnslots(WK-1, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
continue;
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
if (htl.equal(hash0, hash1)) {
tree xort; xort.bucketid = bucketid;
xort.slotid0 = s0; xort.slotid1 = s1;
candidate(xort);
}
}
}
}
}
};
typedef struct {
u32 id;
pthread_t thread;
equi *eq;
} thread_ctx;
void barrier(pthread_barrier_t *barry) {
const int rc = pthread_barrier_wait(barry);
if (rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD) {
printf("Could not wait on barrier\n");
pthread_exit(NULL);
}
}
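// worker threads stride over blake blocks / buckets in lockstep between
// barriers; thread 0 additionally allocates each round's hash table just
// before use and frees the previous round's right after, so at most two
// rounds of hashes are live at once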
void *worker(void *vp) {
thread_ctx *tp = (thread_ctx *)vp;
equi *eq = tp->eq;
if (tp->id == 0) {
printf("Digit 0\n");
eq->hta.alloc_ht(0);
}
barrier(&eq->barry);
eq->digit0(tp->id);
barrier(&eq->barry);
if (tp->id == 0) {
eq->xfull = eq->bfull = eq->hfull = 0;
eq->showbsizes(0);
}
barrier(&eq->barry);
for (u32 r = 1; r < WK; r++) {
if (tp->id == 0) {
printf("Digit %d", r);
eq->hta.alloc_ht(r);
}
barrier(&eq->barry);
eq->digitr(r, tp->id);
barrier(&eq->barry);
if (tp->id == 0) {
printf(" x%d b%d h%d\n", eq->xfull, eq->bfull, eq->hfull);
eq->xfull = eq->bfull = eq->hfull = 0;
eq->showbsizes(r);
eq->hta.dealloc_ht(r-1);
}
barrier(&eq->barry);
}
if (tp->id == 0)
printf("Digit %d\n", WK);
eq->digitK(tp->id);
barrier(&eq->barry);
if (tp->id == 0) {
eq->hta.dealloc_ht(WK-1);
eq->hta.dealloctrees();
}
pthread_exit(NULL);
return 0;
}

70
osx_barrier.h Normal file
View File

@ -0,0 +1,70 @@
#ifdef __APPLE__
#ifndef PTHREAD_BARRIER_H_
#define PTHREAD_BARRIER_H_
#include <pthread.h>
#include <errno.h>
typedef int pthread_barrierattr_t;
#define PTHREAD_BARRIER_SERIAL_THREAD 1
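// minimal emulation of the POSIX barrier API, which macOS lacks: a
// mutex/condvar pair blocks arrivals until count reaches tripCount, and
// the tripping thread broadcasts and returns PTHREAD_BARRIER_SERIAL_THREAD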
typedef struct
{
pthread_mutex_t mutex;
pthread_cond_t cond;
int count;
int tripCount;
} pthread_barrier_t;
int pthread_barrier_init(pthread_barrier_t *barrier, const pthread_barrierattr_t *attr, unsigned int count)
{
if(count == 0)
{
errno = EINVAL;
return -1;
}
if(pthread_mutex_init(&barrier->mutex, 0) < 0)
{
return -1;
}
if(pthread_cond_init(&barrier->cond, 0) < 0)
{
pthread_mutex_destroy(&barrier->mutex);
return -1;
}
barrier->tripCount = count;
barrier->count = 0;
return 0;
}
int pthread_barrier_destroy(pthread_barrier_t *barrier)
{
pthread_cond_destroy(&barrier->cond);
pthread_mutex_destroy(&barrier->mutex);
return 0;
}
int pthread_barrier_wait(pthread_barrier_t *barrier)
{
pthread_mutex_lock(&barrier->mutex);
++(barrier->count);
if(barrier->count >= barrier->tripCount)
{
barrier->count = 0;
pthread_cond_broadcast(&barrier->cond);
pthread_mutex_unlock(&barrier->mutex);
return PTHREAD_BARRIER_SERIAL_THREAD;
}
else
{
pthread_cond_wait(&barrier->cond, &(barrier->mutex));
pthread_mutex_unlock(&barrier->mutex);
return 0;
}
}
#endif // PTHREAD_BARRIER_H_
#endif // __APPLE__