fix Linux build, one cmake file (TODO make shared lib all in one build)

This commit is contained in:
Stanko Krstić 2017-01-12 21:26:59 +01:00
parent d56e5e8e26
commit af3a5c48ac
50 changed files with 2147 additions and 1575 deletions

198
CMakeLists.txt Normal file
View File

@ -0,0 +1,198 @@
project(nheqminer)
cmake_minimum_required(VERSION 3.5)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") # -Wall
## Enable solvers here
#### older slower
option(USE_CPU_TROMP "USE CPU_TROMP" OFF)
option(USE_CUDA_TROMP "USE CUDA_TROMP" OFF)
#### faster
option(USE_CPU_XENONCAT "USE CPU_XENONCAT" ON)
option(USE_CUDA_DJEZO "USE CUDA_DJEZO" ON)
## Add solvers here
if (USE_CPU_TROMP)
add_definitions(-DUSE_CPU_TROMP)
message("-- USE_CPU_TROMP DEFINED")
endif()
if (USE_CPU_XENONCAT)
add_definitions(-DUSE_CPU_XENONCAT)
message("-- USE_CPU_XENONCAT DEFINED")
endif()
if (USE_CUDA_TROMP)
add_definitions(-DUSE_CUDA_TROMP)
message("-- USE_CUDA_TROMP DEFINED")
endif()
if (USE_CUDA_DJEZO)
add_definitions(-DUSE_CUDA_DJEZO)
message("-- USE_CUDA_DJEZO DEFINED")
endif()
########
# LINUX
if(CMAKE_COMPILER_IS_GNUCXX)
# # use native cpu features
# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -fPIC")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -fPIC")
# # optimizations
# add_definitions(-O3)
# use
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m64 -msse2")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse2")
# optimizations
add_definitions(-O2)
endif()
# Common
include_directories(${nheqminer_SOURCE_DIR}/nheqminer)
# BOOST
#find_package(Threads REQUIRED COMPONENTS)
# compile boost staticaly
set(Boost_USE_STATIC_LIBS ON)
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
#set(BUILD_SHARED_LIBRARIES OFF)
#set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++ -static")
find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
if (Boost_FOUND)
# From the offical documentation:
# Add include directories to the build. [...] If the SYSTEM option is given,
# the compiler will be told the directories are meant as system include
# directories on some platforms (signalling this setting might achieve effects
# such as the compiler skipping warnings [...])."
include_directories (SYSTEM ${Boost_INCLUDE_DIR})
# From the offical documentation:
# "Specify directories in which the linker will look for libraries. [...] Note
# that this command is rarely necessary. Library locations returned by
# find_package() and find_library() are absolute paths. Pass these absolute
# library file paths directly to the target_link_libraries() command. CMake
# will ensure the linker finds them."
link_directories (${Boost_LIBRARY_DIRS})
else()
message("Boost_FOUND NOT FOUND")
endif ()
include_directories(${CMAKE_CURRENT_BINARY_DIR}/../)
set(SOURCE_FILES
# sources
nheqminer/amount.cpp
nheqminer/api.cpp
nheqminer/arith_uint256.cpp
nheqminer/crypto/sha256.cpp
nheqminer/json/json_spirit_reader.cpp
nheqminer/json/json_spirit_value.cpp
nheqminer/json/json_spirit_writer.cpp
nheqminer/libstratum/ZcashStratum.cpp
nheqminer/main.cpp
nheqminer/primitives/block.cpp
nheqminer/speed.cpp
nheqminer/uint256.cpp
nheqminer/utilstrencodings.cpp
# headers
nheqminer/amount.h
nheqminer/api.hpp
nheqminer/arith_uint256.h
nheqminer/crypto/sha256.h
nheqminer/hash.h
nheqminer/json/json_spirit.h
nheqminer/json/json_spirit_error_position.h
nheqminer/json/json_spirit_reader.h
nheqminer/json/json_spirit_reader_template.h
nheqminer/json/json_spirit_stream_reader.h
nheqminer/json/json_spirit_utils.h
nheqminer/json/json_spirit_value.h
nheqminer/json/json_spirit_writer.h
nheqminer/json/json_spirit_writer_template.h
nheqminer/libstratum/StratumClient.cpp
nheqminer/libstratum/StratumClient.h
nheqminer/libstratum/ZcashStratum.cpp
nheqminer/libstratum/ZcashStratum.h
nheqminer/primitives/block.h
nheqminer/primitives/transaction.h
nheqminer/script/script.h
nheqminer/serialize.h
nheqminer/speed.hpp
nheqminer/streams.h
nheqminer/support/allocators/zeroafterfree.h
nheqminer/tinyformat.h
nheqminer/uint252.h
nheqminer/uint256.h
nheqminer/utilstrencodings.h
nheqminer/version.h
nheqminer/zcash/JoinSplit.hpp
nheqminer/zcash/NoteEncryption.hpp
nheqminer/zcash/Proof.hpp
nheqminer/zcash/Zcash.h
nheqminer/SolverStub.h # just a stub
nheqminer/AvailableSolvers.h
nheqminer/ISolver.h
nheqminer/Solver.h
nheqminer/MinerFactory.h
nheqminer/MinerFactory.cpp
# make same path on windows
#blake shared
# src
blake2/blake2bx.cpp
# headers
blake2/blake2.h
blake2/blake2b-load-sse2.h
blake2/blake2b-load-sse41.h
blake2/blake2b-round.h
blake2/blake2-config.h
blake2/blake2-impl.h
blake2/blake2-round.h
)
#set(LIBS ${LIBS} ${Threads_LIBRARIES} ${Boost_LIBRARIES})
set(LIBS ${LIBS} ${Boost_LIBRARIES})
message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}")
message("-- LIBS: ${LIBS}")
if (USE_CPU_TROMP)
add_subdirectory(cpu_tromp)
endif()
if (USE_CPU_XENONCAT)
add_subdirectory(cpu_xenoncat)
endif()
if (USE_CUDA_TROMP)
add_subdirectory(cuda_tromp)
endif()
if (USE_CUDA_DJEZO)
add_subdirectory(cuda_djezo)
endif()
#add_subdirectory(cpu_xenoncat)
ADD_EXECUTABLE(${PROJECT_NAME} ${SOURCE_FILES})
#target_link_libraries(${PROJECT_NAME} ${LIBS} ${CUDA_LIBRARIES} )
target_link_libraries(${PROJECT_NAME} ${CMAKE_THREAD_LIBS_INIT} ${LIBS} )
# link libs
if (USE_CPU_TROMP)
target_link_libraries(${PROJECT_NAME} cpu_tromp)
endif()
if (USE_CPU_XENONCAT)
add_library ( xenoncat_avx1 SHARED IMPORTED GLOBAL )
set_target_properties ( xenoncat_avx1 PROPERTIES IMPORTED_LOCATION "../nheqminer/cpu_xenoncat/asm_linux/equihash_avx1.o" )
add_library ( xenoncat_avx2 SHARED IMPORTED GLOBAL )
set_target_properties ( xenoncat_avx2 PROPERTIES IMPORTED_LOCATION "../nheqminer/cpu_xenoncat/asm_linux/equihash_avx2.o" )
target_link_libraries(${PROJECT_NAME} cpu_xenoncat xenoncat_avx1 xenoncat_avx2)
endif()
if (USE_CUDA_TROMP)
target_link_libraries(${PROJECT_NAME} cuda_tromp)
endif()
if (USE_CUDA_DJEZO)
target_link_libraries(${PROJECT_NAME} cuda_djezo)
endif()

View File

@ -17,34 +17,7 @@ endif()
# Common
include_directories(${nheqminer_SOURCE_DIR})
#add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK)
# BOOST
#find_package(Threads REQUIRED COMPONENTS)
# compile boost staticaly
set(Boost_USE_STATIC_LIBS ON)
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
#set(BUILD_SHARED_LIBRARIES OFF)
#set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++ -static")
find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
if (Boost_FOUND)
# From the offical documentation:
# Add include directories to the build. [...] If the SYSTEM option is given,
# the compiler will be told the directories are meant as system include
# directories on some platforms (signalling this setting might achieve effects
# such as the compiler skipping warnings [...])."
include_directories (SYSTEM ${Boost_INCLUDE_DIR})
# From the offical documentation:
# "Specify directories in which the linker will look for libraries. [...] Note
# that this command is rarely necessary. Library locations returned by
# find_package() and find_library() are absolute paths. Pass these absolute
# library file paths directly to the target_link_libraries() command. CMake
# will ensure the linker finds them."
link_directories (${Boost_LIBRARY_DIRS})
else()
message("Boost_FOUND NOT FOUND")
endif ()
add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK)
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-m64;--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)
@ -75,6 +48,25 @@ else()
message("CUDA NOT FOUND")
endif()
if (Boost_FOUND)
# From the offical documentation:
# Add include directories to the build. [...] If the SYSTEM option is given,
# the compiler will be told the directories are meant as system include
# directories on some platforms (signalling this setting might achieve effects
# such as the compiler skipping warnings [...])."
include_directories (SYSTEM ${Boost_INCLUDE_DIR})
# From the offical documentation:
# "Specify directories in which the linker will look for libraries. [...] Note
# that this command is rarely necessary. Library locations returned by
# find_package() and find_library() are absolute paths. Pass these absolute
# library file paths directly to the target_link_libraries() command. CMake
# will ensure the linker finds them."
link_directories (${Boost_LIBRARY_DIRS})
else()
message("Boost_FOUND NOT FOUND")
endif ()
## Add solvers here
#add_definitions(-DUSE_CPU_XENONCAT)
#add_definitions(-DUSE_CPU_TROMP)
@ -141,11 +133,11 @@ set(SOURCE_FILES
../../nheqminer/SolverStub.h # just a stub
../../nheqminer/AvailableSolvers.h
../../nheqminer/ISolver.h
../../nheqminer/Solver.h
../../nheqminer/MinerFactory.h
../../nheqminer/MinerFactory.cpp
../../nheqminer/AvailableSolvers.h
../../nheqminer/ISolver.h
../../nheqminer/Solver.h
../../nheqminer/MinerFactory.h
../../nheqminer/MinerFactory.cpp
# # cpu tromp
# ../../cpu_tromp/blake2/blake2bx.cpp

72
blake2/blake2-config.h Normal file
View File

@ -0,0 +1,72 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2_CONFIG_H__
#define __BLAKE2_CONFIG_H__
// These don't work everywhere
#if (defined(__SSE2__) || defined(_M_AMD_64) || defined(_M_X64))
#define HAVE_SSE2
#endif
#if defined(__SSSE3__)
#define HAVE_SSSE3
#endif
#if defined(__SSE4_1__)
#define HAVE_SSE41
#endif
#if defined(__AVX__)
#define HAVE_AVX
#endif
#if defined(__XOP__)
#define HAVE_XOP
#endif
#ifdef HAVE_AVX2
#ifndef HAVE_AVX
#define HAVE_AVX
#endif
#endif
#ifdef HAVE_XOP
#ifndef HAVE_AVX
#define HAVE_AVX
#endif
#endif
#ifdef HAVE_AVX
#ifndef HAVE_SSE41
#define HAVE_SSE41
#endif
#endif
#ifdef HAVE_SSE41
#ifndef HAVE_SSSE3
#define HAVE_SSSE3
#endif
#endif
#ifdef HAVE_SSSE3
#define HAVE_SSE2
#endif
#if !defined(HAVE_SSE2)
#error "This code requires at least SSE2."
#endif
#endif

136
blake2/blake2-impl.h Normal file
View File

@ -0,0 +1,136 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2_IMPL_H__
#define __BLAKE2_IMPL_H__
#include <stdint.h>
static inline uint32_t load32( const void *src )
{
#if defined(NATIVE_LITTLE_ENDIAN)
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
#else
const uint8_t *p = ( const uint8_t * )src;
uint32_t w = *p++;
w |= ( uint32_t )( *p++ ) << 8;
w |= ( uint32_t )( *p++ ) << 16;
w |= ( uint32_t )( *p++ ) << 24;
return w;
#endif
}
static inline uint64_t load64( const void *src )
{
#if defined(NATIVE_LITTLE_ENDIAN)
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
#else
const uint8_t *p = ( const uint8_t * )src;
uint64_t w = *p++;
w |= ( uint64_t )( *p++ ) << 8;
w |= ( uint64_t )( *p++ ) << 16;
w |= ( uint64_t )( *p++ ) << 24;
w |= ( uint64_t )( *p++ ) << 32;
w |= ( uint64_t )( *p++ ) << 40;
w |= ( uint64_t )( *p++ ) << 48;
w |= ( uint64_t )( *p++ ) << 56;
return w;
#endif
}
static inline void store32( void *dst, uint32_t w )
{
#if defined(NATIVE_LITTLE_ENDIAN)
memcpy(dst, &w, sizeof w);
#else
uint8_t *p = ( uint8_t * )dst;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w;
#endif
}
static inline void store64( void *dst, uint64_t w )
{
#if defined(NATIVE_LITTLE_ENDIAN)
memcpy(dst, &w, sizeof w);
#else
uint8_t *p = ( uint8_t * )dst;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w;
#endif
}
static inline uint64_t load48( const void *src )
{
const uint8_t *p = ( const uint8_t * )src;
uint64_t w = *p++;
w |= ( uint64_t )( *p++ ) << 8;
w |= ( uint64_t )( *p++ ) << 16;
w |= ( uint64_t )( *p++ ) << 24;
w |= ( uint64_t )( *p++ ) << 32;
w |= ( uint64_t )( *p++ ) << 40;
return w;
}
static inline void store48( void *dst, uint64_t w )
{
uint8_t *p = ( uint8_t * )dst;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w;
}
static inline uint32_t rotl32( const uint32_t w, const unsigned c )
{
return ( w << c ) | ( w >> ( 32 - c ) );
}
static inline uint64_t rotl64( const uint64_t w, const unsigned c )
{
return ( w << c ) | ( w >> ( 64 - c ) );
}
static inline uint32_t rotr32( const uint32_t w, const unsigned c )
{
return ( w >> c ) | ( w << ( 32 - c ) );
}
static inline uint64_t rotr64( const uint64_t w, const unsigned c )
{
return ( w >> c ) | ( w << ( 64 - c ) );
}
/* prevents compiler optimizing out memset() */
static inline void secure_zero_memory( void *v, size_t n )
{
volatile uint8_t *p = ( volatile uint8_t * )v;
while( n-- ) *p++ = 0;
}
#endif

85
blake2/blake2-round.h Normal file
View File

@ -0,0 +1,85 @@
#define _mm_roti_epi64(x, c) \
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
row1l = _mm_add_epi64(row1l, row2l); \
row1h = _mm_add_epi64(row1h, row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24); \
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
row1l = _mm_add_epi64(row1l, row2l); \
row1h = _mm_add_epi64(row1h, row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63); \
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
\
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
\
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
\
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);

156
blake2/blake2.h Normal file
View File

@ -0,0 +1,156 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2_H__
#define __BLAKE2_H__
#include <stddef.h>
#include <stdint.h>
#if defined(_MSC_VER)
#define ALIGN(x) __declspec(align(x))
#else
#define ALIGN(x) __attribute__ ((__aligned__(x)))
#endif
#if defined(__cplusplus)
extern "C" {
#endif
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
BLAKE2S_OUTBYTES = 32,
BLAKE2S_KEYBYTES = 32,
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
enum blake2b_constant
{
BLAKE2B_BLOCKBYTES = 128,
BLAKE2B_OUTBYTES = 64,
BLAKE2B_KEYBYTES = 64,
BLAKE2B_SALTBYTES = 16,
BLAKE2B_PERSONALBYTES = 16
};
#pragma pack(push, 1)
typedef struct __blake2s_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[BLAKE2S_SALTBYTES]; // 24
uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
} blake2s_param;
ALIGN( 64 ) typedef struct __blake2s_state
{
uint32_t h[8];
uint32_t t[2];
uint32_t f[2];
uint8_t buf[2 * BLAKE2S_BLOCKBYTES];
size_t buflen;
uint8_t last_node;
} blake2s_state;
typedef struct __blake2b_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint64_t node_offset; // 16
uint8_t node_depth; // 17
uint8_t inner_length; // 18
uint8_t reserved[14]; // 32
uint8_t salt[BLAKE2B_SALTBYTES]; // 48
uint8_t personal[BLAKE2B_PERSONALBYTES]; // 64
} blake2b_param;
ALIGN( 64 ) typedef struct __blake2b_state
{
uint64_t h[8];
uint8_t buf[BLAKE2B_BLOCKBYTES];
uint16_t counter;
uint8_t buflen;
uint8_t lastblock;
} blake2b_state;
ALIGN( 64 ) typedef struct __blake2sp_state
{
blake2s_state S[8][1];
blake2s_state R[1];
uint8_t buf[8 * BLAKE2S_BLOCKBYTES];
size_t buflen;
} blake2sp_state;
ALIGN( 64 ) typedef struct __blake2bp_state
{
blake2b_state S[4][1];
blake2b_state R[1];
uint8_t buf[4 * BLAKE2B_BLOCKBYTES];
size_t buflen;
} blake2bp_state;
#pragma pack(pop)
// Streaming API
int blake2s_init( blake2s_state *S, const uint8_t outlen );
int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2s_init_param( blake2s_state *S, const blake2s_param *P );
int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen );
int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen );
int blake2b_init( blake2b_state *S, const uint8_t outlen );
int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2b_init_param( blake2b_state *S, const blake2b_param *P );
int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen );
int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen );
int blake2sp_init( blake2sp_state *S, const uint8_t outlen );
int blake2sp_init_key( blake2sp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2sp_update( blake2sp_state *S, const uint8_t *in, uint64_t inlen );
int blake2sp_final( blake2sp_state *S, uint8_t *out, uint8_t outlen );
int blake2bp_init( blake2bp_state *S, const uint8_t outlen );
int blake2bp_init_key( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2bp_update( blake2bp_state *S, const uint8_t *in, uint64_t inlen );
int blake2bp_final( blake2bp_state *S, uint8_t *out, uint8_t outlen );
// Simple API
int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
int blake2b_long(uint8_t *out, const void *in, const uint32_t outlen, const uint64_t inlen);
int blake2sp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
int blake2bp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
static inline int blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
{
return blake2b( out, in, key, outlen, inlen, keylen );
}
#if defined(__cplusplus)
}
#endif
#endif

View File

@ -0,0 +1,68 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2B_LOAD_SSE2_H__
#define __BLAKE2B_LOAD_SSE2_H__
#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5)
#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2)
#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7)
#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1)
#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13)
#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2)
#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6)
#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8)
#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11)
#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15)
#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14)
#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14)
#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13)
#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9)
#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2)
#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12)
#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1)
#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8)
#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6)
#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11)
#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3)
#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1)
#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4)
#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7)
#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6)
#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3)
#define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12)
#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
#endif

402
blake2/blake2b-load-sse41.h Normal file
View File

@ -0,0 +1,402 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2B_LOAD_SSE41_H__
#define __BLAKE2B_LOAD_SSE41_H__
#define LOAD_MSG_0_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_0_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_0_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_0_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_1_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)
#define LOAD_MSG_1_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)
#define LOAD_MSG_1_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)
#define LOAD_MSG_1_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)
#define LOAD_MSG_2_1(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m5, 8); \
b1 = _mm_unpackhi_epi64(m2, m7); \
} while(0)
#define LOAD_MSG_2_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m0); \
b1 = _mm_blend_epi16(m1, m6, 0xF0); \
} while(0)
#define LOAD_MSG_2_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m5, m1, 0xF0); \
b1 = _mm_unpackhi_epi64(m3, m4); \
} while(0)
#define LOAD_MSG_2_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m3); \
b1 = _mm_alignr_epi8(m2, m0, 8); \
} while(0)
#define LOAD_MSG_3_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_unpackhi_epi64(m6, m5); \
} while(0)
#define LOAD_MSG_3_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m0); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_3_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m2, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)
#define LOAD_MSG_3_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m5); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_4_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m2); \
b1 = _mm_unpacklo_epi64(m1, m5); \
} while(0)
#define LOAD_MSG_4_2(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m0, m3, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)
#define LOAD_MSG_4_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m7, m5, 0xF0); \
b1 = _mm_blend_epi16(m3, m1, 0xF0); \
} while(0)
#define LOAD_MSG_4_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m0, 8); \
b1 = _mm_blend_epi16(m4, m6, 0xF0); \
} while(0)
#define LOAD_MSG_5_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m3); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_5_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m5); \
b1 = _mm_unpackhi_epi64(m5, m1); \
} while(0)
#define LOAD_MSG_5_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m2, m3, 0xF0); \
b1 = _mm_unpackhi_epi64(m7, m0); \
} while(0)
#define LOAD_MSG_5_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m2); \
b1 = _mm_blend_epi16(m7, m4, 0xF0); \
} while(0)
#define LOAD_MSG_6_1(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m6, m0, 0xF0); \
b1 = _mm_unpacklo_epi64(m7, m2); \
} while(0)
#define LOAD_MSG_6_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_alignr_epi8(m5, m6, 8); \
} while(0)
#define LOAD_MSG_6_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m3); \
b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
} while(0)
#define LOAD_MSG_6_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_blend_epi16(m1, m5, 0xF0); \
} while(0)
#define LOAD_MSG_7_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m3); \
b1 = _mm_blend_epi16(m6, m1, 0xF0); \
} while(0)
#define LOAD_MSG_7_2(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpackhi_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_7_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_unpacklo_epi64(m4, m1); \
} while(0)
#define LOAD_MSG_7_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m2); \
b1 = _mm_unpacklo_epi64(m3, m5); \
} while(0)
#define LOAD_MSG_8_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m7); \
b1 = _mm_alignr_epi8(m0, m5, 8); \
} while(0)
#define LOAD_MSG_8_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_alignr_epi8(m4, m1, 8); \
} while(0)
#define LOAD_MSG_8_3(b0, b1) \
do \
{ \
b0 = m6; \
b1 = _mm_alignr_epi8(m5, m0, 8); \
} while(0)
#define LOAD_MSG_8_4(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m3, 0xF0); \
b1 = m2; \
} while(0)
#define LOAD_MSG_9_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_unpackhi_epi64(m3, m0); \
} while(0)
#define LOAD_MSG_9_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m2); \
b1 = _mm_blend_epi16(m3, m2, 0xF0); \
} while(0)
#define LOAD_MSG_9_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_unpackhi_epi64(m1, m6); \
} while(0)
#define LOAD_MSG_9_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpacklo_epi64(m6, m0); \
} while(0)
#define LOAD_MSG_10_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_10_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_10_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_10_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_11_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)
#define LOAD_MSG_11_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)
#define LOAD_MSG_11_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)
#define LOAD_MSG_11_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)
#endif

170
blake2/blake2b-round.h Normal file
View File

@ -0,0 +1,170 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2B_ROUND_H__
#define __BLAKE2B_ROUND_H__
#define LOAD(p) _mm_load_si128( (const __m128i *)(p) )
#define STORE(p,r) _mm_store_si128((__m128i *)(p), r)
#define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) )
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
#define TOF(reg) _mm_castsi128_ps((reg))
#define TOI(reg) _mm_castps_si128((reg))
#define LIKELY(x) __builtin_expect((x),1)
/* Microarchitecture-specific macros */
#ifndef HAVE_XOP
#ifdef HAVE_SSSE3
#define _mm_roti_epi64(x, c) \
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
#else
#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-c) ))
#endif
#else
/* ... */
#endif
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, (-32)); \
row4h = _mm_roti_epi64(row4h, (-32)); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, (-24)); \
row2h = _mm_roti_epi64(row2h, (-24)); \
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, (-16)); \
row4h = _mm_roti_epi64(row4h, (-16)); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, (-63)); \
row2h = _mm_roti_epi64(row2h, (-63)); \
#if defined(HAVE_SSSE3)
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
#else
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = row4l;\
t1 = row2l;\
row4l = row3l;\
row3l = row3h;\
row3h = row4l;\
row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = row3l;\
row3l = row3h;\
row3h = t0;\
t0 = row2l;\
t1 = row4l;\
row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))
#endif
#if defined(HAVE_SSE41)
#include "blake2b-load-sse41.h"
#else
#include "blake2b-load-sse2.h"
#endif
#define ROUND(r) \
LOAD_MSG_ ##r ##_1(b0, b1); \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
LOAD_MSG_ ##r ##_2(b0, b1); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
LOAD_MSG_ ##r ##_3(b0, b1); \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
LOAD_MSG_ ##r ##_4(b0, b1); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
#endif
#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
\
DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
\
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
\
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);

346
blake2/blake2bx.cpp Normal file
View File

@ -0,0 +1,346 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "blake2.h"
#include "blake2-impl.h"
#include "blake2-config.h"
#ifdef WIN32
#include <intrin.h>
#endif
#include <emmintrin.h>
#if defined(HAVE_SSSE3)
#include <tmmintrin.h>
#endif
#if defined(HAVE_SSE41)
#include <smmintrin.h>
#endif
#if defined(HAVE_AVX)
#include <immintrin.h>
#endif
#if defined(HAVE_XOP)
#include <x86intrin.h>
#endif
#include "blake2b-round.h"
ALIGN(64) static const uint64_t blake2b_IV[8] =
{
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
/* init xors IV with input parameter block */
int blake2b_init_param(blake2b_state *S, const blake2b_param *P)
{
//blake2b_init0( S );
const uint8_t * v = (const uint8_t *)(blake2b_IV);
const uint8_t * p = (const uint8_t *)(P);
uint8_t * h = (uint8_t *)(S->h);
/* IV XOR ParamBlock */
memset(S, 0, sizeof(blake2b_state));
for (int i = 0; i < BLAKE2B_OUTBYTES; ++i) h[i] = v[i] ^ p[i];
return 0;
}
/* Some sort of default parameter block initialization, for sequential blake2b */
int blake2b_init(blake2b_state *S, const uint8_t outlen)
{
if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1;
const blake2b_param P =
{
outlen,
0,
1,
1,
0,
0,
0,
0,
{ 0 },
{ 0 },
{ 0 }
};
return blake2b_init_param(S, &P);
}
int blake2b_init_key(blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen)
{
if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1;
if ((!keylen) || keylen > BLAKE2B_KEYBYTES) return -1;
const blake2b_param P =
{
outlen,
keylen,
1,
1,
0,
0,
0,
0,
{ 0 },
{ 0 },
{ 0 }
};
if (blake2b_init_param(S, &P) < 0)
return 0;
{
uint8_t block[BLAKE2B_BLOCKBYTES];
memset(block, 0, BLAKE2B_BLOCKBYTES);
memcpy(block, key, keylen);
blake2b_update(S, block, BLAKE2B_BLOCKBYTES);
secure_zero_memory(block, BLAKE2B_BLOCKBYTES); /* Burn the key from stack */
}
return 0;
}
static inline int blake2b_compress(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES])
{
__m128i row1l, row1h;
__m128i row2l, row2h;
__m128i row3l, row3h;
__m128i row4l, row4h;
__m128i b0, b1;
__m128i t0, t1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
#endif
#if defined(HAVE_SSE41)
const __m128i m0 = LOADU(block + 00);
const __m128i m1 = LOADU(block + 16);
const __m128i m2 = LOADU(block + 32);
const __m128i m3 = LOADU(block + 48);
const __m128i m4 = LOADU(block + 64);
const __m128i m5 = LOADU(block + 80);
const __m128i m6 = LOADU(block + 96);
const __m128i m7 = LOADU(block + 112);
#else
const uint64_t m0 = ( ( uint64_t * )block )[ 0];
const uint64_t m1 = ( ( uint64_t * )block )[ 1];
const uint64_t m2 = ( ( uint64_t * )block )[ 2];
const uint64_t m3 = ( ( uint64_t * )block )[ 3];
const uint64_t m4 = ( ( uint64_t * )block )[ 4];
const uint64_t m5 = ( ( uint64_t * )block )[ 5];
const uint64_t m6 = ( ( uint64_t * )block )[ 6];
const uint64_t m7 = ( ( uint64_t * )block )[ 7];
const uint64_t m8 = ( ( uint64_t * )block )[ 8];
const uint64_t m9 = ( ( uint64_t * )block )[ 9];
const uint64_t m10 = ( ( uint64_t * )block )[10];
const uint64_t m11 = ( ( uint64_t * )block )[11];
const uint64_t m12 = ( ( uint64_t * )block )[12];
const uint64_t m13 = ( ( uint64_t * )block )[13];
const uint64_t m14 = ( ( uint64_t * )block )[14];
const uint64_t m15 = ( ( uint64_t * )block )[15];
#endif
row1l = LOADU(&S->h[0]);
row1h = LOADU(&S->h[2]);
row2l = LOADU(&S->h[4]);
row2h = LOADU(&S->h[6]);
row3l = LOADU(&blake2b_IV[0]);
row3h = LOADU(&blake2b_IV[2]);
row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), _mm_set_epi32(0, 0, 0, S->counter));
row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), _mm_set_epi32(0, 0, 0L - S->lastblock, 0L - S->lastblock));
ROUND(0);
ROUND(1);
ROUND(2);
ROUND(3);
ROUND(4);
ROUND(5);
ROUND(6);
ROUND(7);
ROUND(8);
ROUND(9);
ROUND(10);
ROUND(11);
row1l = _mm_xor_si128(row3l, row1l);
row1h = _mm_xor_si128(row3h, row1h);
STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
row2l = _mm_xor_si128(row4l, row2l);
row2h = _mm_xor_si128(row4h, row2h);
STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
return 0;
}
int blake2b_update(blake2b_state *S, const uint8_t *in, uint64_t inlen)
{
while (inlen > 0)
{
size_t left = S->buflen;
size_t fill = BLAKE2B_BLOCKBYTES - left;
if (inlen > fill)
{
memcpy(S->buf + left, in, fill); // Fill buffer
in += fill;
inlen -= fill;
S->counter += BLAKE2B_BLOCKBYTES;
blake2b_compress(S, S->buf); // Compress
S->buflen = 0;
}
else // inlen <= fill
{
memcpy(S->buf + left, in, inlen);
S->buflen += inlen; // not enough to compress
in += inlen;
inlen = 0;
}
}
return 0;
}
int blake2b_final(blake2b_state *S, uint8_t *out, uint8_t outlen)
{
if (outlen > BLAKE2B_OUTBYTES)
return -1;
if (S->buflen > BLAKE2B_BLOCKBYTES)
{
S->counter += BLAKE2B_BLOCKBYTES;
blake2b_compress(S, S->buf);
S->buflen -= BLAKE2B_BLOCKBYTES;
memcpy(S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen);
}
S->counter += S->buflen;
S->lastblock = 1;
memset(S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
blake2b_compress(S, S->buf);
memcpy(out, &S->h[0], outlen);
S->lastblock = 0;
return 0;
}
int blake2b(uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen)
{
blake2b_state S[1];
/* Verify parameters */
if (NULL == in) return -1;
if (NULL == out) return -1;
if (NULL == key) keylen = 0;
if (keylen)
{
if (blake2b_init_key(S, outlen, key, keylen) < 0) return -1;
}
else
{
if (blake2b_init(S, outlen) < 0) return -1;
}
blake2b_update(S, (const uint8_t *)in, inlen);
blake2b_final(S, out, outlen);
return 0;
}
#if defined(SUPERCOP)
int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
{
return blake2b( out, in, NULL, BLAKE2B_OUTBYTES, inlen, 0 );
}
#endif
#if defined(BLAKE2B_SELFTEST)
#include <string.h>
#include "blake2-kat.h"
int main( int argc, char **argv )
{
uint8_t key[BLAKE2B_KEYBYTES];
uint8_t buf[KAT_LENGTH];
for( size_t i = 0; i < BLAKE2B_KEYBYTES; ++i )
key[i] = ( uint8_t )i;
for( size_t i = 0; i < KAT_LENGTH; ++i )
buf[i] = ( uint8_t )i;
for( size_t i = 0; i < KAT_LENGTH; ++i )
{
uint8_t hash[BLAKE2B_OUTBYTES];
blake2b( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES );
if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) )
{
puts( "error" );
return -1;
}
}
puts( "ok" );
return 0;
}
#endif
int blake2b_long(uint8_t *out, const void *in, const uint32_t outlen, const uint64_t inlen)
{
blake2b_state blake_state;
if (outlen <= BLAKE2B_OUTBYTES)
{
blake2b_init(&blake_state, outlen);
blake2b_update(&blake_state, (const uint8_t*)&outlen, sizeof(uint32_t));
blake2b_update(&blake_state, (const uint8_t *)in, inlen);
blake2b_final(&blake_state, out, outlen);
}
else
{
uint8_t out_buffer[BLAKE2B_OUTBYTES];
uint8_t in_buffer[BLAKE2B_OUTBYTES];
blake2b_init(&blake_state, BLAKE2B_OUTBYTES);
blake2b_update(&blake_state, (const uint8_t*)&outlen, sizeof(uint32_t));
blake2b_update(&blake_state, (const uint8_t *)in, inlen);
blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES);
memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
out += BLAKE2B_OUTBYTES / 2;
uint32_t toproduce = outlen - BLAKE2B_OUTBYTES / 2;
while (toproduce > BLAKE2B_OUTBYTES)
{
memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
blake2b(out_buffer, in_buffer, NULL, BLAKE2B_OUTBYTES, BLAKE2B_OUTBYTES, 0);
memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
out += BLAKE2B_OUTBYTES / 2;
toproduce -= BLAKE2B_OUTBYTES / 2;
}
memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
blake2b(out_buffer, in_buffer, NULL, toproduce, BLAKE2B_OUTBYTES, 0);
memcpy(out, out_buffer, toproduce);
}
return 0;
}

19
cpu_tromp/CMakeLists.txt Normal file
View File

@ -0,0 +1,19 @@
set(EXECUTABLE cpu_tromp)
#cpu_tromp/
file(GLOB SRC_LIST
cpu_tromp.cpp )
file(GLOB HEADERS
cpu_tromp.hpp
equi.h
equi_miner.h
)
include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${CUDA_INCLUDE_DIRS})
include_directories(..)
ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
TARGET_LINK_LIBRARIES(${EXECUTABLE} )
install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )

View File

@ -2,6 +2,7 @@
// Equihash solver
// Copyright (c) 2016-2016 John Tromp
#include "blake2/blake2.h"
#ifdef __APPLE__
#include "osx_barrier.h"
@ -131,3 +132,4 @@ int verify(u32 indices[PROOFSIZE], const char *header, const u32 headerlen, cons
uchar hash[WN/8];
return verifyrec(&ctx, indices, hash, WK);
}

View File

@ -0,0 +1,17 @@
set(EXECUTABLE cpu_xenoncat)
#cpu_xenoncat/
file(GLOB SRC_LIST
xenoncat.cpp )
file(GLOB HEADERS
cpu_xenoncat.hpp
)
include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${CUDA_INCLUDE_DIRS})
include_directories(..)
ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
TARGET_LINK_LIBRARIES(${EXECUTABLE} )
install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )

Binary file not shown.

View File

@ -1,2 +0,0 @@
fasm zcblake2_avx1.asm
fasm zcblake2_avx2.asm

View File

@ -1,36 +0,0 @@
xshufb_ror24 db 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10
xshufb_ror16 db 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9
xshufb_bswap8 db 7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8
xctrinc dd 0,2, 0,2
align 32
iv dq 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
dq 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
dq 0x510e527fade682d1, 0x9b05688c2b3e6c1f
dq 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
s0 dq 0x6a09e667f3bcc908 xor 0x1010032, 0xbb67ae8584caa73b ;0x32=50 bytes output
s2 dq 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
s4 dq 0x510e527fade682d1, 0x9b05688c2b3e6c1f
s6 dq 0x1f83d9abfb41bd6b xor 0x576f50687361635a ;Personalization
s7 dq 0x5be0cd19137e2179 xor 0x00000009000000c8 ;n=200, k=9
iv4xor128 dq 0x510e527fade682d1 xor 0x80, 0x9b05688c2b3e6c1f
dq 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
iv4xor144 dq 0x510e527fade682d1 xor 144, 0x9b05688c2b3e6c1f
iv6inverted dq 0xe07c265404be4294, 0x5be0cd19137e2179
align 32
yctrinit dd 0,0, 0,1, 0,2, 0,3
yctrinc dd 0,4, 0,4, 0,4, 0,4
blake2sigma db 0,2,4,6,1,3,5,7,8,10,12,14,9,11,13,15
db 14,4,9,13,10,8,15,6,1,0,11,5,12,2,7,3
db 11,12,5,15,8,0,2,13,10,3,7,9,14,6,1,4
db 7,3,13,11,9,1,12,14,2,5,4,15,6,10,0,8
db 9,5,2,10,0,7,4,15,14,11,6,3,1,12,8,13
db 2,6,0,8,12,10,11,3,4,7,15,1,13,5,14,9
db 12,1,14,4,5,15,13,10,0,6,9,8,7,3,2,11
db 13,7,12,3,11,14,1,9,5,15,8,2,0,4,6,10
db 6,14,11,0,15,9,3,8,12,13,1,10,2,7,4,5
db 10,8,7,1,2,4,6,5,15,9,3,13,11,14,12,0

View File

@ -1,349 +0,0 @@
macro hR0 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
{
vpaddq xmm0,xmm0,xmm4
vpaddq xmm1,xmm1,xmm5
vpaddq xmm2,xmm2,xmm6
vpaddq xmm3,xmm3,xmm7
if m0<lim
vpaddq xmm0,xmm0, xword [src+m0*16]
end if
if m1<lim
vpaddq xmm1,xmm1, xword [src+m1*16]
end if
if m2<lim
vpaddq xmm2,xmm2, xword [src+m2*16]
end if
if m3<lim
vpaddq xmm3,xmm3, xword [src+m3*16]
end if
vpxor xmm12,xmm12,xmm0
vpxor xmm13,xmm13,xmm1
vpxor xmm14,xmm14,xmm2
vpxor xmm15,xmm15,xmm3
vpshufd xmm12,xmm12,0xB1
vpshufd xmm13,xmm13,0xB1
vpshufd xmm14,xmm14,0xB1
vpshufd xmm15,xmm15,0xB1
vpaddq xmm8,xmm8,xmm12
vpaddq xmm9,xmm9,xmm13
vpaddq xmm10,xmm10,xmm14
vpaddq xmm11,xmm11,xmm15
vpxor xmm4,xmm4,xmm8
vpxor xmm5,xmm5,xmm9
vpxor xmm6,xmm6,xmm10
vpxor xmm7,xmm7,xmm11
vmovdqa [rsp], xmm8
vmovdqa xmm8, xword [xshufb_ror24]
vpshufb xmm4,xmm4,xmm8
vpshufb xmm5,xmm5,xmm8
vpshufb xmm6,xmm6,xmm8
vpshufb xmm7,xmm7,xmm8
vmovdqa xmm8, [rsp]
vpaddq xmm0,xmm0,xmm4
vpaddq xmm1,xmm1,xmm5
vpaddq xmm2,xmm2,xmm6
vpaddq xmm3,xmm3,xmm7
if m4<lim
vpaddq xmm0,xmm0, xword [src+m4*16]
end if
if m5<lim
vpaddq xmm1,xmm1, xword [src+m5*16]
end if
if m6<lim
vpaddq xmm2,xmm2, xword [src+m6*16]
end if
if m7<lim
vpaddq xmm3,xmm3, xword [src+m7*16]
end if
vpxor xmm12,xmm12,xmm0
vpxor xmm13,xmm13,xmm1
vpxor xmm14,xmm14,xmm2
vpxor xmm15,xmm15,xmm3
vmovdqa [rsp], xmm0
vmovdqa xmm0, xword [xshufb_ror16]
vpshufb xmm12,xmm12,xmm0
vpshufb xmm13,xmm13,xmm0
vpshufb xmm14,xmm14,xmm0
vpshufb xmm15,xmm15,xmm0
vpaddq xmm8,xmm8,xmm12
vpaddq xmm9,xmm9,xmm13
vpaddq xmm10,xmm10,xmm14
vpaddq xmm11,xmm11,xmm15
vpxor xmm4,xmm4,xmm8
vpxor xmm5,xmm5,xmm9
vpxor xmm6,xmm6,xmm10
vpxor xmm7,xmm7,xmm11
vpaddq xmm0,xmm4,xmm4
vpsrlq xmm4,xmm4,63
vpor xmm4,xmm4,xmm0
vpaddq xmm0,xmm5,xmm5
vpsrlq xmm5,xmm5,63
vpor xmm5,xmm5,xmm0
vpaddq xmm0,xmm6,xmm6
vpsrlq xmm6,xmm6,63
vpor xmm6,xmm6,xmm0
vpaddq xmm0,xmm7,xmm7
vpsrlq xmm7,xmm7,63
vpor xmm7,xmm7,xmm0
vmovdqa xmm0, [rsp]
}
macro hR1 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
{
vpaddq xmm0,xmm0,xmm5
vpaddq xmm1,xmm1,xmm6
vpaddq xmm2,xmm2,xmm7
vpaddq xmm3,xmm3,xmm4
if m0<lim
vpaddq xmm0,xmm0, xword [src+m0*16]
end if
if m1<lim
vpaddq xmm1,xmm1, xword [src+m1*16]
end if
if m2<lim
vpaddq xmm2,xmm2, xword [src+m2*16]
end if
if m3<lim
vpaddq xmm3,xmm3, xword [src+m3*16]
end if
vpxor xmm15,xmm15,xmm0
vpxor xmm12,xmm12,xmm1
vpxor xmm13,xmm13,xmm2
vpxor xmm14,xmm14,xmm3
vpshufd xmm15,xmm15,0xB1
vpshufd xmm12,xmm12,0xB1
vpshufd xmm13,xmm13,0xB1
vpshufd xmm14,xmm14,0xB1
vpaddq xmm10,xmm10,xmm15
vpaddq xmm11,xmm11,xmm12
vpaddq xmm8,xmm8,xmm13
vpaddq xmm9,xmm9,xmm14
vpxor xmm5,xmm5,xmm10
vpxor xmm6,xmm6,xmm11
vpxor xmm7,xmm7,xmm8
vpxor xmm4,xmm4,xmm9
vmovdqa [rsp], xmm10
vmovdqa xmm10, xword [xshufb_ror24]
vpshufb xmm5,xmm5,xmm10
vpshufb xmm6,xmm6,xmm10
vpshufb xmm7,xmm7,xmm10
vpshufb xmm4,xmm4,xmm10
vmovdqa xmm10, [rsp]
vpaddq xmm0,xmm0,xmm5
vpaddq xmm1,xmm1,xmm6
vpaddq xmm2,xmm2,xmm7
vpaddq xmm3,xmm3,xmm4
if m4<lim
vpaddq xmm0,xmm0, xword [src+m4*16]
end if
if m5<lim
vpaddq xmm1,xmm1, xword [src+m5*16]
end if
if m6<lim
vpaddq xmm2,xmm2, xword [src+m6*16]
end if
if m7<lim
vpaddq xmm3,xmm3, xword [src+m7*16]
end if
vpxor xmm15,xmm15,xmm0
vpxor xmm12,xmm12,xmm1
vpxor xmm13,xmm13,xmm2
vpxor xmm14,xmm14,xmm3
vmovdqa [rsp], xmm0
vmovdqa xmm0, xword [xshufb_ror16]
vpshufb xmm15,xmm15,xmm0
vpshufb xmm12,xmm12,xmm0
vpshufb xmm13,xmm13,xmm0
vpshufb xmm14,xmm14,xmm0
vpaddq xmm10,xmm10,xmm15
vpaddq xmm11,xmm11,xmm12
vpaddq xmm8,xmm8,xmm13
vpaddq xmm9,xmm9,xmm14
vpxor xmm5,xmm5,xmm10
vpxor xmm6,xmm6,xmm11
vpxor xmm7,xmm7,xmm8
vpxor xmm4,xmm4,xmm9
vpaddq xmm0,xmm5,xmm5
vpsrlq xmm5,xmm5,63
vpor xmm5,xmm5,xmm0
vpaddq xmm0,xmm6,xmm6
vpsrlq xmm6,xmm6,63
vpor xmm6,xmm6,xmm0
vpaddq xmm0,xmm7,xmm7
vpsrlq xmm7,xmm7,63
vpor xmm7,xmm7,xmm0
vpaddq xmm0,xmm4,xmm4
vpsrlq xmm4,xmm4,63
vpor xmm4,xmm4,xmm0
vmovdqa xmm0, [rsp]
}
macro Blake2bRounds2 lim,src
{
;ROUND 0
;hR0 0,2,4,6,1,3,5,7,lim,src
;hR1 8,10,12,14,9,11,13,15,lim,src
;ROUND 1
hR0 14,4,9,13,10,8,15,6,lim,src
hR1 1,0,11,5,12,2,7,3,lim,src
;ROUND 2
hR0 11,12,5,15,8,0,2,13,lim,src
hR1 10,3,7,9,14,6,1,4,lim,src
;ROUND 3
hR0 7,3,13,11,9,1,12,14,lim,src
hR1 2,5,4,15,6,10,0,8,lim,src
;ROUND 4
hR0 9,5,2,10,0,7,4,15,lim,src
hR1 14,11,6,3,1,12,8,13,lim,src
;ROUND 5
hR0 2,6,0,8,12,10,11,3,lim,src
hR1 4,7,15,1,13,5,14,9,lim,src
;ROUND 6
hR0 12,1,14,4,5,15,13,10,lim,src
hR1 0,6,9,8,7,3,2,11,lim,src
;ROUND 7
hR0 13,7,12,3,11,14,1,9,lim,src
hR1 5,15,8,2,0,4,6,10,lim,src
;ROUND 8
hR0 6,14,11,0,15,9,3,8,lim,src
hR1 12,13,1,10,2,7,4,5,lim,src
;ROUND 9
hR0 10,8,7,1,2,4,6,5,lim,src
hR1 15,9,3,13,11,14,12,0,lim,src
;ROUND 10
hR0 0,2,4,6,1,3,5,7,lim,src
hR1 8,10,12,14,9,11,13,15,lim,src
;ROUND 11
hR0 14,4,9,13,10,8,15,6,lim,src
hR1 1,0,11,5,12,2,7,3,lim,src
}
macro Blake2beq2of2 mids, src
{
vmovddup xmm0, qword [mids]
vpaddq xmm0,xmm0, xword [src+1*16]
vmovddup xmm12, qword [mids+0x08]
vpxor xmm12,xmm12,xmm0
vpshufb xmm12,xmm12, xword [xshufb_ror16]
vmovddup xmm8, qword [mids+0x10]
vpaddq xmm8,xmm8,xmm12
vmovddup xmm4, qword [mids+0x18]
vpxor xmm4,xmm4,xmm8
vpaddq xmm2,xmm4,xmm4 ;xmm2 is temp
vpsrlq xmm4,xmm4,63
vpor xmm4,xmm4,xmm2
vmovddup xmm5, qword [mids+0x20]
vpaddq xmm0,xmm0,xmm5
vmovddup xmm1, qword [mids+0x30]
vpxor xmm12,xmm12,xmm1
vpshufd xmm12,xmm12,0xB1
vmovddup xmm13, qword [mids+0x38]
vpaddq xmm8,xmm8,xmm13
vmovddup xmm3, qword [mids+0x60]
vpaddq xmm3,xmm3,xmm4
vmovddup xmm15, qword [mids+0x48]
vpxor xmm15,xmm15,xmm0
vpshufd xmm15,xmm15,0xB1
vmovddup xmm11, qword [mids+0x58]
vpaddq xmm11,xmm11,xmm12
vmovddup xmm7, qword [mids+0x68]
vpxor xmm7,xmm7,xmm8
vmovddup xmm14, qword [mids+0x40]
vpxor xmm14,xmm14,xmm3
vpshufd xmm14,xmm14,0xB1
vmovddup xmm10, qword [mids+0x50]
vpaddq xmm10,xmm10,xmm15
vmovddup xmm6, qword [mids+0x28]
vpxor xmm6,xmm6,xmm11
vmovddup xmm9, qword [mids+0x70]
vpaddq xmm9,xmm9,xmm14
vpxor xmm5,xmm5,xmm10
vpxor xmm4,xmm4,xmm9
vmovdqa xmm2, xword [xshufb_ror24] ;xmm2 is temp
vpshufb xmm5,xmm5,xmm2
vpshufb xmm6,xmm6,xmm2
vpshufb xmm7,xmm7,xmm2
vpshufb xmm4,xmm4,xmm2
vmovddup xmm2, qword [mids+0x78]
vpaddq xmm0,xmm0,xmm5
vpaddq xmm1,xmm1,xmm6
vpaddq xmm2,xmm2,xmm7
vpaddq xmm3,xmm3,xmm4
vpxor xmm15,xmm15,xmm0
vpxor xmm12,xmm12,xmm1
vpxor xmm13,xmm13,xmm2
vpxor xmm14,xmm14,xmm3
vmovdqa [rsp], xmm0
vmovdqa xmm0, xword [xshufb_ror16]
vpshufb xmm15,xmm15,xmm0
vpshufb xmm12,xmm12,xmm0
vpshufb xmm13,xmm13,xmm0
vpshufb xmm14,xmm14,xmm0
vpaddq xmm10,xmm10,xmm15
vpaddq xmm11,xmm11,xmm12
vpaddq xmm8,xmm8,xmm13
vpaddq xmm9,xmm9,xmm14
vpxor xmm5,xmm5,xmm10
vpxor xmm6,xmm6,xmm11
vpxor xmm7,xmm7,xmm8
vpxor xmm4,xmm4,xmm9
vpaddq xmm0,xmm5,xmm5
vpsrlq xmm5,xmm5,63
vpor xmm5,xmm5,xmm0
vpaddq xmm0,xmm6,xmm6
vpsrlq xmm6,xmm6,63
vpor xmm6,xmm6,xmm0
vpaddq xmm0,xmm7,xmm7
vpsrlq xmm7,xmm7,63
vpor xmm7,xmm7,xmm0
vpaddq xmm0,xmm4,xmm4
vpsrlq xmm4,xmm4,63
vpor xmm4,xmm4,xmm0
vmovdqa xmm0, [rsp]
Blake2bRounds2 2,src
vpxor xmm0, xmm0, xmm8
vpxor xmm1, xmm1, xmm9
vpxor xmm2, xmm2, xmm10
vpxor xmm3, xmm3, xmm11
vpxor xmm4, xmm4, xmm12
vpxor xmm5, xmm5, xmm13
vpxor xmm6, xmm6, xmm14
;vpxor xmm7, xmm7, xmm15
vmovddup xmm8, qword [mids+0x80]
vmovddup xmm9, qword [mids+0x88]
vmovddup xmm10, qword [mids+0x90]
vmovddup xmm11, qword [mids+0x98]
vmovddup xmm12, qword [mids+0xa0]
vmovddup xmm13, qword [mids+0xa8]
vmovddup xmm14, qword [mids+0xb0]
;vmovddup xmm15, qword [mids+0xb8]
vpxor xmm0, xmm0, xmm8
vpxor xmm1, xmm1, xmm9
vpxor xmm2, xmm2, xmm10
vpxor xmm3, xmm3, xmm11
vpxor xmm4, xmm4, xmm12
vpxor xmm5, xmm5, xmm13
vpxor xmm6, xmm6, xmm14
;vpxor xmm7, xmm7, xmm15
}

View File

@ -1,350 +0,0 @@
macro hR0 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
{
vpaddq ymm0,ymm0,ymm4
vpaddq ymm1,ymm1,ymm5
vpaddq ymm2,ymm2,ymm6
vpaddq ymm3,ymm3,ymm7
if m0<lim
vpaddq ymm0,ymm0, yword [src+m0*32]
end if
if m1<lim
vpaddq ymm1,ymm1, yword [src+m1*32]
end if
if m2<lim
vpaddq ymm2,ymm2, yword [src+m2*32]
end if
if m3<lim
vpaddq ymm3,ymm3, yword [src+m3*32]
end if
vpxor ymm12,ymm12,ymm0
vpxor ymm13,ymm13,ymm1
vpxor ymm14,ymm14,ymm2
vpxor ymm15,ymm15,ymm3
vpshufd ymm12,ymm12,0xB1
vpshufd ymm13,ymm13,0xB1
vpshufd ymm14,ymm14,0xB1
vpshufd ymm15,ymm15,0xB1
vpaddq ymm8,ymm8,ymm12
vpaddq ymm9,ymm9,ymm13
vpaddq ymm10,ymm10,ymm14
vpaddq ymm11,ymm11,ymm15
vpxor ymm4,ymm4,ymm8
vpxor ymm5,ymm5,ymm9
vpxor ymm6,ymm6,ymm10
vpxor ymm7,ymm7,ymm11
vmovdqa [rsp], ymm8
vbroadcasti128 ymm8, xword [xshufb_ror24]
vpshufb ymm4,ymm4,ymm8
vpshufb ymm5,ymm5,ymm8
vpshufb ymm6,ymm6,ymm8
vpshufb ymm7,ymm7,ymm8
vmovdqa ymm8, [rsp]
vpaddq ymm0,ymm0,ymm4
vpaddq ymm1,ymm1,ymm5
vpaddq ymm2,ymm2,ymm6
vpaddq ymm3,ymm3,ymm7
if m4<lim
vpaddq ymm0,ymm0, yword [src+m4*32]
end if
if m5<lim
vpaddq ymm1,ymm1, yword [src+m5*32]
end if
if m6<lim
vpaddq ymm2,ymm2, yword [src+m6*32]
end if
if m7<lim
vpaddq ymm3,ymm3, yword [src+m7*32]
end if
vpxor ymm12,ymm12,ymm0
vpxor ymm13,ymm13,ymm1
vpxor ymm14,ymm14,ymm2
vpxor ymm15,ymm15,ymm3
vmovdqa [rsp], ymm0
vbroadcasti128 ymm0, xword [xshufb_ror16]
vpshufb ymm12,ymm12,ymm0
vpshufb ymm13,ymm13,ymm0
vpshufb ymm14,ymm14,ymm0
vpshufb ymm15,ymm15,ymm0
vpaddq ymm8,ymm8,ymm12
vpaddq ymm9,ymm9,ymm13
vpaddq ymm10,ymm10,ymm14
vpaddq ymm11,ymm11,ymm15
vpxor ymm4,ymm4,ymm8
vpxor ymm5,ymm5,ymm9
vpxor ymm6,ymm6,ymm10
vpxor ymm7,ymm7,ymm11
vpaddq ymm0,ymm4,ymm4
vpsrlq ymm4,ymm4,63
vpor ymm4,ymm4,ymm0
vpaddq ymm0,ymm5,ymm5
vpsrlq ymm5,ymm5,63
vpor ymm5,ymm5,ymm0
vpaddq ymm0,ymm6,ymm6
vpsrlq ymm6,ymm6,63
vpor ymm6,ymm6,ymm0
vpaddq ymm0,ymm7,ymm7
vpsrlq ymm7,ymm7,63
vpor ymm7,ymm7,ymm0
vmovdqa ymm0, [rsp]
}
macro hR1 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
{
vpaddq ymm0,ymm0,ymm5
vpaddq ymm1,ymm1,ymm6
vpaddq ymm2,ymm2,ymm7
vpaddq ymm3,ymm3,ymm4
if m0<lim
vpaddq ymm0,ymm0, yword [src+m0*32]
end if
if m1<lim
vpaddq ymm1,ymm1, yword [src+m1*32]
end if
if m2<lim
vpaddq ymm2,ymm2, yword [src+m2*32]
end if
if m3<lim
vpaddq ymm3,ymm3, yword [src+m3*32]
end if
vpxor ymm15,ymm15,ymm0
vpxor ymm12,ymm12,ymm1
vpxor ymm13,ymm13,ymm2
vpxor ymm14,ymm14,ymm3
vpshufd ymm15,ymm15,0xB1
vpshufd ymm12,ymm12,0xB1
vpshufd ymm13,ymm13,0xB1
vpshufd ymm14,ymm14,0xB1
vpaddq ymm10,ymm10,ymm15
vpaddq ymm11,ymm11,ymm12
vpaddq ymm8,ymm8,ymm13
vpaddq ymm9,ymm9,ymm14
vpxor ymm5,ymm5,ymm10
vpxor ymm6,ymm6,ymm11
vpxor ymm7,ymm7,ymm8
vpxor ymm4,ymm4,ymm9
vmovdqa [rsp], ymm10
vbroadcasti128 ymm10, xword [xshufb_ror24]
vpshufb ymm5,ymm5,ymm10
vpshufb ymm6,ymm6,ymm10
vpshufb ymm7,ymm7,ymm10
vpshufb ymm4,ymm4,ymm10
vmovdqa ymm10, [rsp]
vpaddq ymm0,ymm0,ymm5
vpaddq ymm1,ymm1,ymm6
vpaddq ymm2,ymm2,ymm7
vpaddq ymm3,ymm3,ymm4
if m4<lim
vpaddq ymm0,ymm0, yword [src+m4*32]
end if
if m5<lim
vpaddq ymm1,ymm1, yword [src+m5*32]
end if
if m6<lim
vpaddq ymm2,ymm2, yword [src+m6*32]
end if
if m7<lim
vpaddq ymm3,ymm3, yword [src+m7*32]
end if
vpxor ymm15,ymm15,ymm0
vpxor ymm12,ymm12,ymm1
vpxor ymm13,ymm13,ymm2
vpxor ymm14,ymm14,ymm3
vmovdqa [rsp], ymm0
vbroadcasti128 ymm0, xword [xshufb_ror16]
vpshufb ymm15,ymm15,ymm0
vpshufb ymm12,ymm12,ymm0
vpshufb ymm13,ymm13,ymm0
vpshufb ymm14,ymm14,ymm0
vpaddq ymm10,ymm10,ymm15
vpaddq ymm11,ymm11,ymm12
vpaddq ymm8,ymm8,ymm13
vpaddq ymm9,ymm9,ymm14
vpxor ymm5,ymm5,ymm10
vpxor ymm6,ymm6,ymm11
vpxor ymm7,ymm7,ymm8
vpxor ymm4,ymm4,ymm9
vpaddq ymm0,ymm5,ymm5
vpsrlq ymm5,ymm5,63
vpor ymm5,ymm5,ymm0
vpaddq ymm0,ymm6,ymm6
vpsrlq ymm6,ymm6,63
vpor ymm6,ymm6,ymm0
vpaddq ymm0,ymm7,ymm7
vpsrlq ymm7,ymm7,63
vpor ymm7,ymm7,ymm0
vpaddq ymm0,ymm4,ymm4
vpsrlq ymm4,ymm4,63
vpor ymm4,ymm4,ymm0
vmovdqa ymm0, [rsp]
}
macro Blake2bRounds2 lim,src
{
;ROUND 0
;hR0 0,2,4,6,1,3,5,7,lim,src
;hR1 8,10,12,14,9,11,13,15,lim,src
;ROUND 1
hR0 14,4,9,13,10,8,15,6,lim,src
hR1 1,0,11,5,12,2,7,3,lim,src
;ROUND 2
hR0 11,12,5,15,8,0,2,13,lim,src
hR1 10,3,7,9,14,6,1,4,lim,src
;ROUND 3
hR0 7,3,13,11,9,1,12,14,lim,src
hR1 2,5,4,15,6,10,0,8,lim,src
;ROUND 4
hR0 9,5,2,10,0,7,4,15,lim,src
hR1 14,11,6,3,1,12,8,13,lim,src
;ROUND 5
hR0 2,6,0,8,12,10,11,3,lim,src
hR1 4,7,15,1,13,5,14,9,lim,src
;ROUND 6
hR0 12,1,14,4,5,15,13,10,lim,src
hR1 0,6,9,8,7,3,2,11,lim,src
;ROUND 7
hR0 13,7,12,3,11,14,1,9,lim,src
hR1 5,15,8,2,0,4,6,10,lim,src
;ROUND 8
hR0 6,14,11,0,15,9,3,8,lim,src
hR1 12,13,1,10,2,7,4,5,lim,src
;ROUND 9
hR0 10,8,7,1,2,4,6,5,lim,src
hR1 15,9,3,13,11,14,12,0,lim,src
;ROUND 10
hR0 0,2,4,6,1,3,5,7,lim,src
hR1 8,10,12,14,9,11,13,15,lim,src
;ROUND 11
hR0 14,4,9,13,10,8,15,6,lim,src
hR1 1,0,11,5,12,2,7,3,lim,src
}
macro Blake2beq2of2 mids, src
{
vpbroadcastq ymm0, qword [mids]
vpaddq ymm0,ymm0, yword [src+1*32]
vpbroadcastq ymm12, qword [mids+0x08]
vpxor ymm12,ymm12,ymm0
vbroadcasti128 ymm2, xword [xshufb_ror16] ;ymm2 is temp
vpshufb ymm12,ymm12,ymm2
vpbroadcastq ymm8, qword [mids+0x10]
vpaddq ymm8,ymm8,ymm12
vpbroadcastq ymm4, qword [mids+0x18]
vpxor ymm4,ymm4,ymm8
vpaddq ymm2,ymm4,ymm4 ;ymm2 is temp
vpsrlq ymm4,ymm4,63
vpor ymm4,ymm4,ymm2
vpbroadcastq ymm5, qword [mids+0x20]
vpaddq ymm0,ymm0,ymm5
vpbroadcastq ymm1, qword [mids+0x30]
vpxor ymm12,ymm12,ymm1
vpshufd ymm12,ymm12,0xB1
vpbroadcastq ymm13, qword [mids+0x38]
vpaddq ymm8,ymm8,ymm13
vpbroadcastq ymm3, qword [mids+0x60]
vpaddq ymm3,ymm3,ymm4
vpbroadcastq ymm15, qword [mids+0x48]
vpxor ymm15,ymm15,ymm0
vpshufd ymm15,ymm15,0xB1
vpbroadcastq ymm11, qword [mids+0x58]
vpaddq ymm11,ymm11,ymm12
vpbroadcastq ymm7, qword [mids+0x68]
vpxor ymm7,ymm7,ymm8
vpbroadcastq ymm14, qword [mids+0x40]
vpxor ymm14,ymm14,ymm3
vpshufd ymm14,ymm14,0xB1
vpbroadcastq ymm10, qword [mids+0x50]
vpaddq ymm10,ymm10,ymm15
vpbroadcastq ymm6, qword [mids+0x28]
vpxor ymm6,ymm6,ymm11
vpbroadcastq ymm9, qword [mids+0x70]
vpaddq ymm9,ymm9,ymm14
vpxor ymm5,ymm5,ymm10
vpxor ymm4,ymm4,ymm9
vbroadcasti128 ymm2, xword [xshufb_ror24] ;ymm2 is temp
vpshufb ymm5,ymm5,ymm2
vpshufb ymm6,ymm6,ymm2
vpshufb ymm7,ymm7,ymm2
vpshufb ymm4,ymm4,ymm2
vpbroadcastq ymm2, qword [mids+0x78]
vpaddq ymm0,ymm0,ymm5
vpaddq ymm1,ymm1,ymm6
vpaddq ymm2,ymm2,ymm7
vpaddq ymm3,ymm3,ymm4
vpxor ymm15,ymm15,ymm0
vpxor ymm12,ymm12,ymm1
vpxor ymm13,ymm13,ymm2
vpxor ymm14,ymm14,ymm3
vmovdqa [rsp], ymm0
vbroadcasti128 ymm0, xword [xshufb_ror16]
vpshufb ymm15,ymm15,ymm0
vpshufb ymm12,ymm12,ymm0
vpshufb ymm13,ymm13,ymm0
vpshufb ymm14,ymm14,ymm0
vpaddq ymm10,ymm10,ymm15
vpaddq ymm11,ymm11,ymm12
vpaddq ymm8,ymm8,ymm13
vpaddq ymm9,ymm9,ymm14
vpxor ymm5,ymm5,ymm10
vpxor ymm6,ymm6,ymm11
vpxor ymm7,ymm7,ymm8
vpxor ymm4,ymm4,ymm9
vpaddq ymm0,ymm5,ymm5
vpsrlq ymm5,ymm5,63
vpor ymm5,ymm5,ymm0
vpaddq ymm0,ymm6,ymm6
vpsrlq ymm6,ymm6,63
vpor ymm6,ymm6,ymm0
vpaddq ymm0,ymm7,ymm7
vpsrlq ymm7,ymm7,63
vpor ymm7,ymm7,ymm0
vpaddq ymm0,ymm4,ymm4
vpsrlq ymm4,ymm4,63
vpor ymm4,ymm4,ymm0
vmovdqa ymm0, [rsp]
Blake2bRounds2 2,src
vpxor ymm0, ymm0, ymm8
vpxor ymm1, ymm1, ymm9
vpxor ymm2, ymm2, ymm10
vpxor ymm3, ymm3, ymm11
vpxor ymm4, ymm4, ymm12
vpxor ymm5, ymm5, ymm13
vpxor ymm6, ymm6, ymm14
;vpxor ymm7, ymm7, ymm15
vpbroadcastq ymm8, qword [mids+0x80]
vpbroadcastq ymm9, qword [mids+0x88]
vpbroadcastq ymm10, qword [mids+0x90]
vpbroadcastq ymm11, qword [mids+0x98]
vpbroadcastq ymm12, qword [mids+0xa0]
vpbroadcastq ymm13, qword [mids+0xa8]
vpbroadcastq ymm14, qword [mids+0xb0]
;vpbroadcastq ymm15, qword [mids+0xb8]
vpxor ymm0, ymm0, ymm8
vpxor ymm1, ymm1, ymm9
vpxor ymm2, ymm2, ymm10
vpxor ymm3, ymm3, ymm11
vpxor ymm4, ymm4, ymm12
vpxor ymm5, ymm5, ymm13
vpxor ymm6, ymm6, ymm14
;vpxor ymm7, ymm7, ymm15
}

View File

@ -1,39 +0,0 @@
;void Blake2Run2(unsigned char *hashout, void *midstate, uint32_t indexctr);
;hashout: hash output buffer: 2*64 bytes
;midstate: 256 bytes from Blake2PrepareMidstate2
;indexctr: For n=200, k=9: {0, 2, 4, ..., 1048574}
include "macro_blake2b_avx1.asm"
Blake2Run2:
mov rax, rsp
sub rsp, 0x28
and rsp, -32
mov [rsp+0x20], rax
mov [rsi+0xd4], edx
add edx, 1
mov [rsi+0xdc], edx
Blake2beq2of2 rsi, rsi+0xc0
vpunpcklqdq xmm8, xmm0, xmm1
vpunpckhqdq xmm1, xmm0, xmm1
vpunpcklqdq xmm10, xmm2, xmm3
vpunpckhqdq xmm3, xmm2, xmm3
vpunpcklqdq xmm12, xmm4, xmm5
vpunpckhqdq xmm5, xmm4, xmm5
vpunpcklqdq xmm14, xmm6, xmm7
vpunpckhqdq xmm7, xmm6, xmm7
vmovdqa [rdi], xmm8
vmovdqa [rdi+0x10], xmm10
vmovdqa [rdi+0x20], xmm12
vmovdqa [rdi+0x30], xmm14
vmovdqa [rdi+0x40], xmm1
vmovdqa [rdi+0x50], xmm3
vmovdqa [rdi+0x60], xmm5
vmovdqa [rdi+0x70], xmm7
mov rsp, [rsp+0x20]
ret

View File

@ -1,49 +0,0 @@
;void Blake2Run4(unsigned char *hashout, void *midstate, uint32_t indexctr);
;hashout: hash output buffer: 4*64 bytes
;midstate: 256 bytes from Blake2PrepareMidstate4
;indexctr: For n=200, k=9: {0, 4, 8, ..., 1048572}
include "macro_blake2b_avx2.asm"
Blake2Run4:
mov rax, rsp
sub rsp, 0x28
and rsp, -32
mov [rsp+0x20], rax
vmovd xmm0, edx ;indexctr
vpbroadcastd ymm0, xmm0
vpaddd ymm0, ymm0, yword [yctrinit]
vpblendd ymm0, ymm0, yword [rsi+0xe0], 0x55
vmovdqa yword [rsi+0xe0], ymm0
Blake2beq2of2 rsi, rsi+0xc0
vpunpcklqdq ymm8, ymm0, ymm1
vpunpckhqdq ymm9, ymm0, ymm1
vpunpcklqdq ymm10, ymm2, ymm3
vpunpckhqdq ymm11, ymm2, ymm3
vpunpcklqdq ymm12, ymm4, ymm5
vpunpckhqdq ymm13, ymm4, ymm5
vpunpcklqdq ymm14, ymm6, ymm7
vpunpckhqdq ymm15, ymm6, ymm7
vperm2i128 ymm0, ymm8, ymm10, 0x20
vperm2i128 ymm1, ymm12, ymm14, 0x20
vperm2i128 ymm2, ymm9, ymm11, 0x20
vperm2i128 ymm3, ymm13, ymm15, 0x20
vperm2i128 ymm4, ymm8, ymm10, 0x31
vperm2i128 ymm5, ymm12, ymm14, 0x31
vperm2i128 ymm6, ymm9, ymm11, 0x31
vperm2i128 ymm7, ymm13, ymm15, 0x31
vmovdqa [rdi], ymm0
vmovdqa [rdi+0x20], ymm1
vmovdqa [rdi+0x40], ymm2
vmovdqa [rdi+0x60], ymm3
vmovdqa [rdi+0x80], ymm4
vmovdqa [rdi+0xa0], ymm5
vmovdqa [rdi+0xc0], ymm6
vmovdqa [rdi+0xe0], ymm7
mov rsp, [rsp+0x20]
ret

View File

@ -1,212 +0,0 @@
;void Blake2PrepareMidstate2(void *midstate, unsigned char *input);
;midstate: 256 bytes of buffer for output midstate, aligned by 32
;input: 140 bytes header, preferably aligned by 8
Blake2PrepareMidstate2:
sub rsp, 0x188
vmovdqa xmm10, xword [xshufb_ror24]
vmovdqa xmm11, xword [xshufb_ror16]
vmovdqa xmm0, xword [s0]
vmovdqa xmm1, xword [s2]
vmovdqa xmm2, xword [s4]
vmovdqa xmm3, xword [s6]
vmovdqa xmm4, xword [iv]
vmovdqa xmm5, xword [iv+0x10]
vmovdqa xmm6, xword [iv4xor128]
vmovdqa xmm7, xword [iv4xor128+0x10]
mov r8, rsp
lea r9, [blake2sigma]
lea r11, [blake2sigma+160]
call _ProcBlakeMsgSched
call _ProcBlakeRound
add r8, 0x80
add r9, 16
call _ProcBlakeMsgSched
call _ProcBlakeRound
add r8, 0x80
add r9, 16
_LoopEhPrepare1:
call _ProcBlakeMsgSched
call _ProcBlakeRound
add r9, 16
cmp r9, r11
jb _LoopEhPrepare1
mov r8, rsp
call _ProcBlakeRound
add r8, 0x80
call _ProcBlakeRound
vpxor xmm0, xmm0, xmm4
vpxor xmm1, xmm1, xmm5
vpxor xmm2, xmm2, xmm6
vpxor xmm3, xmm3, xmm7
vpxor xmm0, xmm0, xword [s0]
vpxor xmm1, xmm1, xword [s2]
vpxor xmm2, xmm2, xword [s4]
vpxor xmm3, xmm3, xword [s6]
vmovdqa xword [rdi+0x80], xmm0
vmovdqa xword [rdi+0x90], xmm1
vmovdqa xword [rdi+0xa0], xmm2
vmovdqa xword [rdi+0xb0], xmm3
vmovq xmm8, [rsi+0x80]
vpshufd xmm4, xmm8, 0x44
vmovdqa xword [rdi+0xc0], xmm4
vmovd xmm4, [rsi+0x88]
vpshufd xmm4, xmm4, 0x44
vmovdqa xword [rdi+0xd0], xmm4
;Begin second message block
vmovdqa xmm4, xword [iv]
vmovdqa xmm5, xword [iv+0x10]
vmovdqa xmm6, xword [iv4xor144]
vmovdqa xmm7, xword [iv6inverted]
vpaddq xmm0, xmm0, xmm2
vpaddq xmm1, xmm1, xmm3
vpaddq xmm0, xmm0, xmm8 ;xmm8[63:0]=message
vpxor xmm6, xmm6, xmm0
vpxor xmm7, xmm7, xmm1
vpshufd xmm6, xmm6, 0xb1
vmovq [rdi+0x08], xmm6 ;v12
vpshufd xmm7, xmm7, 0xb1
vpaddq xmm4, xmm4, xmm6
vmovq [rdi+0x10], xmm4 ;v8
vpaddq xmm5, xmm5, xmm7
vpxor xmm2, xmm2, xmm4
vpxor xmm3, xmm3, xmm5
vpshufb xmm2, xmm2, xmm10
vmovq [rdi+0x18], xmm2 ;v4
vpshufb xmm3, xmm3, xmm10
vpaddq xmm0, xmm0, xmm2
vmovq [rdi], xmm0 ;v0
vpaddq xmm1, xmm1, xmm3
vpextrq [rdi+0x60], xmm1, 1 ;v3
;add message (nonce, index) to xmm0 here, but we don't have
vpxor xmm6, xmm6, xmm0
vpxor xmm7, xmm7, xmm1
vpshufb xmm6, xmm6, xmm11
vpshufb xmm7, xmm7, xmm11
vmovdqa xword [rdi+0x40], xmm7 ;v14,15
vpaddq xmm4, xmm4, xmm6
vpextrq [rdi+0x70], xmm4, 1 ;v9
vpaddq xmm5, xmm5, xmm7
vmovdqa xword [rdi+0x50], xmm5 ;v10,11
vpxor xmm2, xmm2, xmm4
vpxor xmm3, xmm3, xmm5
vpaddq xmm8, xmm2, xmm2
vpsrlq xmm2, xmm2, 63
vpor xmm8, xmm2, xmm8 ;xmm8 takes xmm2
vpaddq xmm2, xmm3, xmm3 ;xmm2 is temp
vpsrlq xmm3, xmm3, 63
vpor xmm3, xmm3, xmm2
vpalignr xmm2, xmm3, xmm8, 8 ;xmm2 resume
vmovdqa xword [rdi+0x20], xmm2 ;v5,6
vpsrldq xmm3, xmm3, 8
vmovq [rdi+0x68], xmm3 ;v7
vpsrldq xmm7, xmm6, 8
vpaddq xmm0, xmm0, xmm2
vpextrq [rdi+0x30], xmm0, 1 ;v1
vpaddq xmm1, xmm1, xmm3
vmovq [rdi+0x78], xmm1 ;v2
vpxor xmm7, xmm7, xmm1
vpshufd xmm7, xmm7, 0xb1
vmovq [rdi+0x38], xmm7 ;v13
add rsp, 0x188
ret
align 16
_ProcBlakeMsgSched:
;rsi=src
;r8=dst
;r9=sigma table
xor r10d, r10d
_LoopBlakeMsgSched:
movzx eax, byte [r9+r10]
mov rax, [rsi+rax*8]
mov [r8+r10*8], rax
add r10d, 1
cmp r10d, 16
jb _LoopBlakeMsgSched
ret
align 16
_ProcBlakeRound:
vpaddq xmm0, xmm0, xmm2
vpaddq xmm1, xmm1, xmm3
vpaddq xmm0, xmm0, [r8]
vpaddq xmm1, xmm1, [r8+0x10]
vpxor xmm6, xmm6, xmm0
vpxor xmm7, xmm7, xmm1
vpshufd xmm6, xmm6, 0xb1
vpshufd xmm7, xmm7, 0xb1
vpaddq xmm4, xmm4, xmm6
vpaddq xmm5, xmm5, xmm7
vpxor xmm2, xmm2, xmm4
vpxor xmm3, xmm3, xmm5
vpshufb xmm2, xmm2, xmm10
vpshufb xmm3, xmm3, xmm10
vpaddq xmm0, xmm0, xmm2
vpaddq xmm1, xmm1, xmm3
vpaddq xmm0, xmm0, [r8+0x20]
vpaddq xmm1, xmm1, [r8+0x30]
vpxor xmm6, xmm6, xmm0
vpxor xmm7, xmm7, xmm1
vpshufb xmm9, xmm6, xmm11 ;xmm9 takes xmm6
vpshufb xmm7, xmm7, xmm11
vpaddq xmm4, xmm4, xmm9
vpaddq xmm5, xmm5, xmm7
vpxor xmm2, xmm2, xmm4
vpxor xmm3, xmm3, xmm5
vpaddq xmm8, xmm2, xmm2
vpsrlq xmm2, xmm2, 63
vpor xmm8, xmm2, xmm8 ;xmm8 takes xmm2
vpaddq xmm2, xmm3, xmm3 ;xmm2 is temp
vpsrlq xmm3, xmm3, 63
vpor xmm3, xmm3, xmm2
vpalignr xmm2, xmm3, xmm8, 8 ;xmm2 resume
vpalignr xmm3, xmm8, xmm3, 8
vpalignr xmm6, xmm9, xmm7, 8 ;xmm6 resume
vpalignr xmm7, xmm7, xmm9, 8
vpaddq xmm0, xmm0, xmm2
vpaddq xmm1, xmm1, xmm3
vpaddq xmm0, xmm0, [r8+0x40]
vpaddq xmm1, xmm1, [r8+0x50]
vpxor xmm6, xmm6, xmm0
vpxor xmm7, xmm7, xmm1
vpshufd xmm6, xmm6, 0xb1
vpshufd xmm7, xmm7, 0xb1
vpaddq xmm5, xmm5, xmm6
vpaddq xmm4, xmm4, xmm7
vpxor xmm2, xmm2, xmm5
vpxor xmm3, xmm3, xmm4
vpshufb xmm2, xmm2, xmm10
vpshufb xmm3, xmm3, xmm10
vpaddq xmm0, xmm0, xmm2
vpaddq xmm1, xmm1, xmm3
vpaddq xmm0, xmm0, [r8+0x60]
vpaddq xmm1, xmm1, [r8+0x70]
vpxor xmm6, xmm6, xmm0
vpxor xmm7, xmm7, xmm1
vpshufb xmm9, xmm6, xmm11 ;xmm9 takes xmm6
vpshufb xmm7, xmm7, xmm11
vpaddq xmm5, xmm5, xmm9
vpaddq xmm4, xmm4, xmm7
vpxor xmm2, xmm2, xmm5
vpxor xmm3, xmm3, xmm4
vpaddq xmm8, xmm2, xmm2
vpsrlq xmm2, xmm2, 63
vpor xmm8, xmm2, xmm8 ;xmm8 takes xmm2
vpaddq xmm2, xmm3, xmm3 ;xmm2 is temp
vpsrlq xmm3, xmm3, 63
vpor xmm3, xmm3, xmm2
vpalignr xmm2, xmm8, xmm3, 8 ;xmm2 resume
vpalignr xmm3, xmm3, xmm8, 8
vpalignr xmm6, xmm7, xmm9, 8 ;xmm6 resume
vpalignr xmm7, xmm9, xmm7, 8
ret

View File

@ -1,166 +0,0 @@
;void Blake2PrepareMidstate4(void *midstate, unsigned char *input);
;midstate: 256 bytes of buffer for output midstate, aligned by 32
;input: 140 bytes header, preferably aligned by 8
Blake2PrepareMidstate4:
sub rsp, 0x188
vbroadcasti128 ymm6, xword [xshufb_ror24]
vbroadcasti128 ymm7, xword [xshufb_ror16]
vmovdqa ymm0, yword [s0]
vmovdqa ymm1, yword [s4]
vmovdqa ymm2, yword [iv]
vmovdqa ymm3, yword [iv4xor128]
mov r8, rsp
lea r9, [blake2sigma]
lea r11, [blake2sigma+160]
call _ProcBlakeMsgSched
call _ProcBlakeRound
add r8, 0x80
add r9, 16
call _ProcBlakeMsgSched
call _ProcBlakeRound
add r8, 0x80
add r9, 16
_LoopEhPrepare1:
call _ProcBlakeMsgSched
call _ProcBlakeRound
add r9, 16
cmp r9, r11
jb _LoopEhPrepare1
mov r8, rsp
call _ProcBlakeRound
add r8, 0x80
call _ProcBlakeRound
vpxor ymm0, ymm0, ymm2
vpxor ymm1, ymm1, ymm3
vpxor ymm0, ymm0, yword [s0]
vpxor ymm1, ymm1, yword [s4]
vmovdqa yword [rdi+0x80], ymm0
vmovdqa yword [rdi+0xa0], ymm1
vmovq xmm5, [rsi+0x80]
vpbroadcastq ymm4, xmm5
vmovdqa yword [rdi+0xc0], ymm4
vmovd xmm4, [rsi+0x88]
vpbroadcastq ymm4, xmm4
vmovdqa yword [rdi+0xe0], ymm4
;Begin second message block
vmovdqa ymm2, yword [iv]
vmovdqa ymm3, yword [iv4xor144] ;also loads iv6inverted
vpaddq ymm0, ymm0, ymm1
vpaddq ymm0, ymm0, ymm5 ;ymm5[63:0]=message
vpxor ymm3, ymm3, ymm0
vpshufd ymm3, ymm3, 0xb1
vmovq [rdi+0x08], xmm3 ;v12
vpaddq ymm2, ymm2, ymm3
vmovq [rdi+0x10], xmm2 ;v8
vpxor ymm1, ymm1, ymm2
vpshufb ymm1, ymm1, ymm6
vmovq [rdi+0x18], xmm1 ;v4
vpaddq ymm0, ymm0, ymm1
vmovq [rdi], xmm0 ;v0, v3 ready
;add message (nonce, index) to xmm0 here, but we don't have
vpxor ymm3, ymm3, ymm0
vpshufb ymm3, ymm3, ymm7
vextracti128 xmm4, ymm3, 1
vmovdqa xword [rdi+0x40], xmm4 ;v14,15
vpaddq ymm2, ymm2, ymm3
vpextrq [rdi+0x70], xmm2, 1 ;v9
vextracti128 xmm5, ymm2, 1
vmovdqa xword [rdi+0x50], xmm5 ;v10,11
vpxor ymm1, ymm1, ymm2
vpaddq ymm4, ymm1, ymm1
vpsrlq ymm1, ymm1, 63
vpor ymm1, ymm1, ymm4
;Valid:
; v1 v2 v3
; v5 v6 v7
; v9 v10 v11
; v13 v14 v15
;
;v1 v2 <- v6 v7
;v13 <- v2
vpermq ymm1, ymm1, 0x39
vmovdqa xword [rdi+0x20], xmm1 ;v5,6
vextracti128 xmm4, ymm0, 1
vextracti128 xmm5, ymm1, 1
vpextrq [rdi+0x60], xmm4, 1 ;v3
vmovq [rdi+0x68], xmm5 ;v7
vpsrldq xmm3, xmm3, 8
vpaddq xmm0, xmm0, xmm1
vpextrq [rdi+0x30], xmm0, 1 ;v1
vpaddq xmm4, xmm4, xmm5
vmovq [rdi+0x78], xmm4 ;v2
vpxor xmm3, xmm3, xmm4
vpshufd xmm3, xmm3, 0xb1
vmovq [rdi+0x38], xmm3 ;v13
add rsp, 0x188
ret
align 16
_ProcBlakeMsgSched:
;rsi=src
;r8=dst
;r9=sigma table
xor r10d, r10d
_LoopBlakeMsgSched:
movzx eax, byte [r9+r10]
mov rax, [rsi+rax*8]
mov [r8+r10*8], rax
add r10d, 1
cmp r10d, 16
jb _LoopBlakeMsgSched
ret
align 16
_ProcBlakeRound:
vpaddq ymm0, ymm0, ymm1
vpaddq ymm0, ymm0, [r8]
vpxor ymm3, ymm3, ymm0
vpshufd ymm3, ymm3, 0xb1
vpaddq ymm2, ymm2, ymm3
vpxor ymm1, ymm1, ymm2
vpshufb ymm1, ymm1, ymm6 ;ror24
vpaddq ymm0, ymm0, ymm1
vpaddq ymm0, ymm0, [r8+0x20]
vpxor ymm3, ymm3, ymm0
vpshufb ymm3, ymm3, ymm7 ;ror16
vpaddq ymm2, ymm2, ymm3
vpxor ymm1, ymm1, ymm2
vpaddq ymm4, ymm1, ymm1
vpsrlq ymm1, ymm1, 63
vpor ymm1, ymm1, ymm4
vpermq ymm1, ymm1, 0x39
vpermq ymm2, ymm2, 0x4e
vpermq ymm3, ymm3, 0x93
vpaddq ymm0, ymm0, ymm1
vpaddq ymm0, ymm0, [r8+0x40]
vpxor ymm3, ymm3, ymm0
vpshufd ymm3, ymm3, 0xb1
vpaddq ymm2, ymm2, ymm3
vpxor ymm1, ymm1, ymm2
vpshufb ymm1, ymm1, ymm6 ;ror24
vpaddq ymm0, ymm0, ymm1
vpaddq ymm0, ymm0, [r8+0x60]
vpxor ymm3, ymm3, ymm0
vpshufb ymm3, ymm3, ymm7 ;ror16
vpaddq ymm2, ymm2, ymm3
vpxor ymm1, ymm1, ymm2
vpaddq ymm4, ymm1, ymm1
vpsrlq ymm1, ymm1, 63
vpor ymm1, ymm1, ymm4
vpermq ymm1, ymm1, 0x93
vpermq ymm2, ymm2, 0x4e
vpermq ymm3, ymm3, 0x39
ret

View File

@ -1,11 +0,0 @@
format elf64
public Blake2PrepareMidstate2
public Blake2Run2
section '.text' executable align 64
include "proc_prepmidstate_avx1.asm"
align 16
include "proc_blake2_avx1.asm"
section '.data' writeable align 64
include "data_blake2b.asm"

View File

@ -1,11 +0,0 @@
format elf64
public Blake2PrepareMidstate4
public Blake2Run4
section '.text' executable align 64
include "proc_prepmidstate_avx2.asm"
align 16
include "proc_blake2_avx2.asm"
section '.data' writeable align 64
include "data_blake2b.asm"

View File

@ -1,51 +0,0 @@
#include <stdio.h>
#include <stdint.h>
#include <string.h>
void Blake2PrepareMidstate2(void *midstate, unsigned char *input);
//midstate: 256 bytes of buffer for output midstate, aligned by 32
//input: 140 bytes header, preferably aligned by 8
void Blake2Run2(unsigned char *hashout, void *midstate, uint32_t indexctr);
//hashout: hash output buffer: 2*64 bytes
//midstate: 256 bytes from Blake2PrepareMidstate2
//indexctr: For n=200, k=9: {0, 2, 4, ..., 1048574}
unsigned char __attribute__((aligned(8))) testdata[140] =
{
0x04, 0x00, 0x00, 0x00, 0x91, 0x5F, 0xA6, 0x1C, 0x4F, 0xA5, 0x92, 0x3C, 0xE6, 0xEE, 0xAD, 0x06,
0x74, 0x6B, 0x61, 0x22, 0x54, 0x94, 0xEA, 0x5A, 0x2A, 0x97, 0xAE, 0x46, 0x6E, 0x6F, 0xAA, 0x9C,
0x6E, 0xF6, 0x3A, 0x0D, 0xA5, 0xFC, 0x67, 0xD7, 0xF8, 0xDC, 0x78, 0xC3, 0xC8, 0x70, 0xCA, 0x09,
0xBA, 0xAB, 0xAA, 0xF7, 0x02, 0x59, 0x68, 0xA8, 0x6F, 0xEB, 0x88, 0x75, 0xD3, 0xF3, 0xFF, 0xA7,
0x2E, 0xB0, 0x0F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x66, 0xCE, 0xD2, 0x57, 0x0F, 0x0F, 0x0F, 0x20, 0x00, 0x00, 0xF7, 0xF1,
0x94, 0xA2, 0x53, 0x8E, 0x42, 0x5F, 0x21, 0x33, 0xCF, 0xA8, 0xD3, 0xCB, 0xF4, 0xDF, 0x71, 0xEF,
0x38, 0x28, 0x51, 0x75, 0xCF, 0xED, 0xCB, 0x3E, 0x63, 0xA2, 0x00, 0x00
};
//expected output: 281dd5fc6d878538e640987b9bc597dbbd4af2cdf8bf5fb03bdfcefa40d8747d out.bin
int main(void)
{
unsigned char midstate_a[256+32];
void *pmidstate = (void *) (((long) midstate_a+31L) & -32L);
unsigned char hashout_a[128+32];
unsigned char *phashout = (unsigned char *) (((long) hashout_a+31L) & -32L);
unsigned char buf[128];
FILE *outfile;
int i;
Blake2PrepareMidstate2(pmidstate, testdata);
outfile = fopen("out.bin", "wb");
for (i=0; i<1048576; i+=2) {
Blake2Run2(phashout, pmidstate, i);
memcpy(buf, phashout, 50);
memcpy(buf+50, phashout+64, 50);
fwrite(buf, 100, 1, outfile);
}
fclose(outfile);
return 0;
}

View File

@ -1,53 +0,0 @@
#include <stdio.h>
#include <stdint.h>
#include <string.h>
void Blake2PrepareMidstate4(void *midstate, unsigned char *input);
//midstate: 256 bytes of buffer for output midstate, aligned by 32
//input: 140 bytes header, preferably aligned by 8
void Blake2Run4(unsigned char *hashout, void *midstate, uint32_t indexctr);
//hashout: hash output buffer: 4*64 bytes
//midstate: 256 bytes from Blake2PrepareMidstate4
//indexctr: For n=200, k=9: {0, 4, 8, ..., 1048572}
unsigned char __attribute__((aligned(8))) testdata[140] =
{
0x04, 0x00, 0x00, 0x00, 0x91, 0x5F, 0xA6, 0x1C, 0x4F, 0xA5, 0x92, 0x3C, 0xE6, 0xEE, 0xAD, 0x06,
0x74, 0x6B, 0x61, 0x22, 0x54, 0x94, 0xEA, 0x5A, 0x2A, 0x97, 0xAE, 0x46, 0x6E, 0x6F, 0xAA, 0x9C,
0x6E, 0xF6, 0x3A, 0x0D, 0xA5, 0xFC, 0x67, 0xD7, 0xF8, 0xDC, 0x78, 0xC3, 0xC8, 0x70, 0xCA, 0x09,
0xBA, 0xAB, 0xAA, 0xF7, 0x02, 0x59, 0x68, 0xA8, 0x6F, 0xEB, 0x88, 0x75, 0xD3, 0xF3, 0xFF, 0xA7,
0x2E, 0xB0, 0x0F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x66, 0xCE, 0xD2, 0x57, 0x0F, 0x0F, 0x0F, 0x20, 0x00, 0x00, 0xF7, 0xF1,
0x94, 0xA2, 0x53, 0x8E, 0x42, 0x5F, 0x21, 0x33, 0xCF, 0xA8, 0xD3, 0xCB, 0xF4, 0xDF, 0x71, 0xEF,
0x38, 0x28, 0x51, 0x75, 0xCF, 0xED, 0xCB, 0x3E, 0x63, 0xA2, 0x00, 0x00
};
//expected output: 281dd5fc6d878538e640987b9bc597dbbd4af2cdf8bf5fb03bdfcefa40d8747d out.bin
int main(void)
{
unsigned char midstate_a[256+32];
void *pmidstate = (void *) (((long) midstate_a+31L) & -32L);
unsigned char hashout_a[256+32];
unsigned char *phashout = (unsigned char *) (((long) hashout_a+31L) & -32L);
unsigned char buf[256];
FILE *outfile;
int i;
Blake2PrepareMidstate4(pmidstate, testdata);
outfile = fopen("out.bin", "wb");
for (i=0; i<1048576; i+=4) {
Blake2Run4(phashout, pmidstate, i);
memcpy(buf, phashout, 50);
memcpy(buf+50, phashout+64, 50);
memcpy(buf+100, phashout+128, 50);
memcpy(buf+150, phashout+192, 50);
fwrite(buf, 200, 1, outfile);
}
fclose(outfile);
return 0;
}

Binary file not shown.

View File

@ -1,78 +0,0 @@
//compile with
//gcc -o quickbench quickbench.c equihash_avx2.o
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#define CONTEXT_SIZE 178033152
#define ITERATIONS 10
//Linkage with assembly
//EhPrepare takes in 136 bytes of input. The remaining 4 bytes of input is fed as nonce to EhSolver.
//EhPrepare saves the 136 bytes in context, and EhSolver can be called repeatedly with different nonce.
void EhPrepare(void *context, void *input);
int32_t EhSolver(void *context, uint32_t nonce);
extern char testinput[];
int main(void)
{
void *context_alloc, *context, *context_end;
uint32_t *pu32;
uint64_t *pu64, previous_rdtsc;
uint8_t inputheader[144]; //140 byte header
FILE *infile, *outfile;
struct timespec time0, time1;
long t0, t1;
int32_t numsolutions, total_solutions;
uint32_t nonce, delta_time, total_time;
int i, j;
context_alloc = malloc(CONTEXT_SIZE+4096);
context = (void*) (((long) context_alloc+4095) & -4096);
context_end = context + CONTEXT_SIZE;
infile = 0;
infile = fopen("input.bin", "rb");
if (infile) {
puts("Reading input.bin");
fread(inputheader, 140, 1, infile);
fclose(infile);
} else {
puts("input.bin not found, use sample data (beta1 testnet block 2)");
memcpy(inputheader, testinput, 140);
}
EhPrepare(context, (void *) inputheader);
//Warm up, timing not taken into average
nonce = 0;
clock_gettime(CLOCK_MONOTONIC, &time0);
numsolutions = EhSolver(context, nonce);
clock_gettime(CLOCK_MONOTONIC, &time1);
delta_time = (uint32_t) ((time1.tv_sec * 1000000000 + time1.tv_nsec)
- (time0.tv_sec * 1000000000 + time0.tv_nsec))/1000000;
printf("(Warm up) Time: %u ms, solutions: %u\n", delta_time, numsolutions);
printf("Running %d iterations...\n", ITERATIONS);
nonce = 58; //arbritary number to get 19 solutions in 10 iterations (to match 1.88 solutions per run)
total_time = total_solutions = 0;
for (i=0; i<ITERATIONS; i++) {
clock_gettime(CLOCK_MONOTONIC, &time0);
numsolutions = EhSolver(context, nonce);
clock_gettime(CLOCK_MONOTONIC, &time1);
nonce++;
delta_time = (uint32_t) ((time1.tv_sec * 1000000000 + time1.tv_nsec)
- (time0.tv_sec * 1000000000 + time0.tv_nsec))/1000000;
total_time += delta_time;
total_solutions += numsolutions;
printf("Time: %u ms, solutions: %u\n", delta_time, numsolutions);
}
printf("Average time: %d ms; %.3f Sol/s\n", total_time/ITERATIONS, (double) 1000.0*total_solutions/total_time);
free(context_alloc);
return 0;
}

View File

@ -1,128 +0,0 @@
//compile with
//gcc -o solver solver.c equihash_avx2.o
//
//./solver
//sha256sum out2.bin
//Expected result with default input.bin (beta1 testnet block 2),
//257d3c3250c14978614ac169edcf72bd131a2e4c227c8d7e21a2cd6131a13dda out2.bin
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <time.h>
#include <x86intrin.h> //for rdtsc
#define CONTEXT_SIZE 178033152
//Linkage with assembly
//EhPrepare takes in 136 bytes of input. The remaining 4 bytes of input is fed as nonce to EhSolver.
//EhPrepare saves the 136 bytes in context, and EhSolver can be called repeatedly with different nonce.
void EhPrepare(void *context, void *input);
int32_t EhSolver(void *context, uint32_t nonce);
extern char testinput[];
//context is the memory used for Equihash computation. It should be allocated outside of SolverFunction, the size is defined by CONTEXT_SIZE, about 180MB.
//SolverFunction API has slight overhead in mining due to missing opportunity to run EhSolver multiple times after a single EhPrepare.
int SolverFunction(void* context, const unsigned char* input,
bool (*validBlock)(void*, const unsigned char*),
void* validBlockData,
bool (*cancelled)(void*),
void* cancelledData,
int numThreads,
int n, int k)
{
int numsolutions, i;
EhPrepare(context, (void *) input);
numsolutions = EhSolver(context, *(uint32_t *)(input+136));
for (i=0; i<numsolutions; i++) {
validBlock(validBlockData, (unsigned char*)(context+1344*i));
}
return numsolutions;
}
bool validBlock(void *validBlockData, const unsigned char *solution)
{
return 0;
}
bool cancelled(void *cancelledData)
{
return 0;
}
int main(void)
{
void *context_alloc, *context, *context_end;
uint32_t *pu32;
uint64_t *pu64, previous_rdtsc;
uint8_t inputheader[144]; //140 byte header
FILE *infile, *outfile;
struct timespec time0, time1;
uint64_t rdtsc0, rdtsc1;
long t0, t1;
int32_t numsolutions;
int i, j;
char outfilename[32];
context_alloc = malloc(CONTEXT_SIZE+4096);
context = (void*) (((long) context_alloc+4095) & -4096);
context_end = context + CONTEXT_SIZE;
//Init page tables. This is not necessary, but useful to get a more consistent single-run timing.
for (pu32=context; (void*) pu32<context_end; pu32+=1024)
*pu32 = 0;
infile = 0;
infile = fopen("input.bin", "rb");
if (infile) {
puts("Reading input.bin");
fread(inputheader, 140, 1, infile);
fclose(infile);
} else {
puts("input.bin not found, use sample data (beta1 testnet block 2)");
memcpy(inputheader, testinput, 140);
}
puts("Running solver...");
clock_gettime(CLOCK_MONOTONIC, &time0);
rdtsc0 = __rdtsc();
numsolutions = SolverFunction(context, inputheader, validBlock, 0, cancelled, 0, 1, 200, 9);
//EhPrepare(context, (void *) inputheader);
//numsolutions = EhSolver(context, *(uint32_t *)(inputheader+136));
clock_gettime(CLOCK_MONOTONIC, &time1);
rdtsc1 = __rdtsc();
//Print some debug information
pu64 = (uint64_t *) (context + 102408); //Read the debug area for statistics
printf("BLAKE2b rdtsc: %lu\n", pu64[1]-pu64[0]);
previous_rdtsc = pu64[1];
for (i=1, j=2; i<=9; i++, j+=2) {
printf("Stage %u, Output pairs %u, rdtsc: %lu\n", i, (uint32_t) pu64[j+1], pu64[j]-previous_rdtsc);
previous_rdtsc = pu64[j];
}
printf("Number of solutions before duplicate removal: %u\n", *(uint32_t *) (context+16384));
printf("Duplicate removal and tree expand rdtsc: %lu\n", pu64[j]-previous_rdtsc);
printf("Number of solutions: %d\n", numsolutions);
j = numsolutions < 4 ? numsolutions : 4;
for (i=0; i<j; i++) {
sprintf(outfilename, "out%d.bin", i);
outfile = fopen(outfilename, "wb");
fwrite(context+1344*i, 1344, 1, outfile);
fclose(outfile);
}
t0 = time0.tv_sec * 1000000000 + time0.tv_nsec;
t1 = time1.tv_sec * 1000000000 + time1.tv_nsec;
printf("Time: %ld ms\n", (t1-t0)/1000000);
t0 = (t1-t0)/1000;
printf("Measure rdtsc frequency = %.3f MHz\n", (double) (rdtsc1-rdtsc0)/t0);
free(context_alloc);
return 0;
}

View File

@ -1,7 +1,6 @@
format elf64
public EhPrepare as 'EhPrepareAVX1'
public EhSolver as 'EhSolverAVX1'
public testinput as 'testinputAVX1'
include "struct.inc"
include "params.inc"
@ -14,4 +13,3 @@ include "proc_ehsolver_avx1.asm"
section '.data' writeable align 64
include "data_blake2b.asm"
testinput file "t2.bin"

View File

@ -1,7 +1,6 @@
format elf64
public EhPrepare as 'EhPrepareAVX2'
public EhSolver as 'EhSolverAVX2'
public testinput as 'testinputAVX2'
include "struct.inc"
include "params.inc"
@ -14,4 +13,3 @@ include "proc_ehsolver_avx2.asm"
section '.data' writeable align 64
include "data_blake2b.asm"
testinput file "t2.bin"

View File

43
cuda_djezo/CMakeLists.txt Normal file
View File

@ -0,0 +1,43 @@
set(EXECUTABLE cuda_djezo)
option(ENABLE_CUDA "Enable the cuda build" ON)
# depending on gcc version
# ;-std=c++11 => Ubuntu 14.04 check gcc versions
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11)
file(GLOB SRC_LIST
cuda_djezo.cpp
equi_miner.cu )
file(GLOB HEADERS
cuda_djezo.hpp
eqcuda.hpp
)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-D_FORCE_INLINES;--disable-warnings;--ptxas-options=-v;-Xptxas=-dlcm=ca;-Xptxas=-dscm=cs; -O3)
FIND_PACKAGE(CUDA REQUIRED)
if(COMPUTE AND (COMPUTE GREATER 0))
LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
else(COMPUTE AND (COMPUTE GREATER 0))
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=sm_50; -gencode arch=compute_52,code=sm_52; -gencode arch=compute_60,code=sm_60 )
endif(COMPUTE AND (COMPUTE GREATER 0))
if(CUDA_FOUND)
message("CUDA FOUND")
else()
message("CUDA NOT FOUND")
endif()
include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${CUDA_INCLUDE_DIRS})
include_directories(..)
CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES} cuda)
message("-- CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )

336
cuda_djezo/blake2b.cu Normal file
View File

@ -0,0 +1,336 @@
// Blake2-B CUDA Implementation
// tpruvot@github July 2016
// permission granted to use under MIT license
// modified for use in Zcash by John Tromp September 2016
/**
* uint2 direct ops by c++ operator definitions
*/
static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) {
return make_uint2(a.x ^ b.x, a.y ^ b.y);
}
static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) {
return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
}
// uint2 ROR/ROL methods
__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset) {
uint2 result;
#if __CUDA_ARCH__ > 300
/* if (offset < 32) {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
} else *//* if (offset < 64) */ {
/* offset SHOULD BE < 64 ! */
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
}
#else
if (!offset)
result = a;
else if (offset < 32) {
result.y = ((a.y >> offset) | (a.x << (32 - offset)));
result.x = ((a.x >> offset) | (a.y << (32 - offset)));
} else if (offset == 32) {
result.y = a.x;
result.x = a.y;
} else {
result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
}
#endif
return result;
}
__device__ __forceinline__ uint2 SWAPUINT2(uint2 value) {
return make_uint2(value.y, value.x);
}
#ifdef __CUDA_ARCH__
__device__ __inline__ uint2 ROR24(const uint2 a) {
uint2 result;
result.x = __byte_perm(a.y, a.x, 0x2107);
result.y = __byte_perm(a.y, a.x, 0x6543);
return result;
}
__device__ __inline__ uint2 ROR16(const uint2 a) {
uint2 result;
result.x = __byte_perm(a.y, a.x, 0x1076);
result.y = __byte_perm(a.y, a.x, 0x5432);
return result;
}
#else
#define ROR24(u) ROR2(u,24)
#define ROR16(u) ROR2(u,16)
#endif
typedef uint64_t u64;
static __constant__ const int8_t blake2b_sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } ,
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
__device__ __constant__
static const u64 blake_iv[] =
{
0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f,
0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
};
__device__ __forceinline__
static void G(const int r, const int i, u64 &a, u64 &b, u64 &c, u64 &d, u64 const m[16]) {
a = a + b + m[ blake2b_sigma[r][2*i] ];
((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
c = c + d;
((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] );
a = a + b + m[ blake2b_sigma[r][2*i+1] ];
((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
c = c + d;
((uint2*)&b)[0] = ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
}
//__device__ __forceinline__
//static void G2(u64 &a, u64 &b, u64 &c, u64 &d, u64 x, u64 y) {
// a = a + b + x;
// ((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]);
// c = c + d;
// ((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]);
// a = a + b + y;
// ((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]);
// c = c + d;
// ((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
//}
__device__ __forceinline__
static void G2(u64 & a, u64 & b, u64 & c, u64 & d, u64 x, u64 y) {
a = a + b + x;
((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]);
c = c + d;
((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]);
a = a + b + y;
((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]);
c = c + d;
((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
}
#define ROUND(r) \
G(r, 0, v[0], v[4], v[ 8], v[12], m); \
G(r, 1, v[1], v[5], v[ 9], v[13], m); \
G(r, 2, v[2], v[6], v[10], v[14], m); \
G(r, 3, v[3], v[7], v[11], v[15], m); \
G(r, 4, v[0], v[5], v[10], v[15], m); \
G(r, 5, v[1], v[6], v[11], v[12], m); \
G(r, 6, v[2], v[7], v[ 8], v[13], m); \
G(r, 7, v[3], v[4], v[ 9], v[14], m);
__forceinline__ __device__ void blake2b_gpu_hash3(uint64_t* h, u32 idx, u32 nonce) {
u64 m = (u64)idx << 32 | (u64)nonce;
u64 v[16];
v[0] = h[0];
v[1] = h[1];
v[2] = h[2];
v[3] = h[3];
v[4] = h[4];
v[5] = h[5];
v[6] = h[6];
v[7] = h[7];
v[8] = blake_iv[0];
v[9] = blake_iv[1];
v[10] = blake_iv[2];
v[11] = blake_iv[3];
v[12] = blake_iv[4] ^ (128 + 16);
v[13] = blake_iv[5];
v[14] = blake_iv[6] ^ 0xffffffffffffffff;
v[15] = blake_iv[7];
// mix 1
G2(v[0], v[4], v[8], v[12], 0, m);
G2(v[1], v[5], v[9], v[13], 0, 0);
G2(v[2], v[6], v[10], v[14], 0, 0);
G2(v[3], v[7], v[11], v[15], 0, 0);
G2(v[0], v[5], v[10], v[15], 0, 0);
G2(v[1], v[6], v[11], v[12], 0, 0);
G2(v[2], v[7], v[8], v[13], 0, 0);
G2(v[3], v[4], v[9], v[14], 0, 0);
// mix 2
G2(v[0], v[4], v[8], v[12], 0, 0);
G2(v[1], v[5], v[9], v[13], 0, 0);
G2(v[2], v[6], v[10], v[14], 0, 0);
G2(v[3], v[7], v[11], v[15], 0, 0);
G2(v[0], v[5], v[10], v[15], m, 0);
G2(v[1], v[6], v[11], v[12], 0, 0);
G2(v[2], v[7], v[8], v[13], 0, 0);
G2(v[3], v[4], v[9], v[14], 0, 0);
// mix 3
G2(v[0], v[4], v[8], v[12], 0, 0);
G2(v[1], v[5], v[9], v[13], 0, 0);
G2(v[2], v[6], v[10], v[14], 0, 0);
G2(v[3], v[7], v[11], v[15], 0, 0);
G2(v[0], v[5], v[10], v[15], 0, 0);
G2(v[1], v[6], v[11], v[12], 0, 0);
G2(v[2], v[7], v[8], v[13], 0, m);
G2(v[3], v[4], v[9], v[14], 0, 0);
// mix 4
G2(v[0], v[4], v[8], v[12], 0, 0);
G2(v[1], v[5], v[9], v[13], 0, m);
G2(v[2], v[6], v[10], v[14], 0, 0);
G2(v[3], v[7], v[11], v[15], 0, 0);
G2(v[0], v[5], v[10], v[15], 0, 0);
G2(v[1], v[6], v[11], v[12], 0, 0);
G2(v[2], v[7], v[8], v[13], 0, 0);
G2(v[3], v[4], v[9], v[14], 0, 0);
// mix 5
G2(v[0], v[4], v[8], v[12], 0, 0);
G2(v[1], v[5], v[9], v[13], 0, 0);
G2(v[2], v[6], v[10], v[14], 0, 0);
G2(v[3], v[7], v[11], v[15], 0, 0);
G2(v[0], v[5], v[10], v[15], 0, m);
G2(v[1], v[6], v[11], v[12], 0, 0);
G2(v[2], v[7], v[8], v[13], 0, 0);
G2(v[3], v[4], v[9], v[14], 0, 0);
// mix 6
G2(v[0], v[4], v[8], v[12], 0, 0);
G2(v[1], v[5], v[9], v[13], 0, 0);
G2(v[2], v[6], v[10], v[14], 0, 0);
G2(v[3], v[7], v[11], v[15], 0, 0);
G2(v[0], v[5], v[10], v[15], 0, 0);
G2(v[1], v[6], v[11], v[12], 0, 0);
G2(v[2], v[7], v[8], v[13], 0, 0);
G2(v[3], v[4], v[9], v[14], m, 0);
// mix 7
G2(v[0], v[4], v[8], v[12], 0, 0);
G2(v[1], v[5], v[9], v[13], m, 0);
G2(v[2], v[6], v[10], v[14], 0, 0);
G2(v[3], v[7], v[11], v[15], 0, 0);
G2(v[0], v[5], v[10], v[15], 0, 0);
G2(v[1], v[6], v[11], v[12], 0, 0);
G2(v[2], v[7], v[8], v[13], 0, 0);
G2(v[3], v[4], v[9], v[14], 0, 0);
// mix 8
G2(v[0], v[4], v[8], v[12], 0, 0);
G2(v[1], v[5], v[9], v[13], 0, 0);
G2(v[2], v[6], v[10], v[14], 0, m);
G2(v[3], v[7], v[11], v[15], 0, 0);
G2(v[0], v[5], v[10], v[15], 0, 0);
G2(v[1], v[6], v[11], v[12], 0, 0);
G2(v[2], v[7], v[8], v[13], 0, 0);
G2(v[3], v[4], v[9], v[14], 0, 0);
// mix 9
G2(v[0], v[4], v[8], v[12], 0, 0);
G2(v[1], v[5], v[9], v[13], 0, 0);
G2(v[2], v[6], v[10], v[14], 0, 0);
G2(v[3], v[7], v[11], v[15], 0, 0);
G2(v[0], v[5], v[10], v[15], 0, 0);
G2(v[1], v[6], v[11], v[12], 0, 0);
G2(v[2], v[7], v[8], v[13], m, 0);
G2(v[3], v[4], v[9], v[14], 0, 0);
// mix 10
G2(v[0], v[4], v[8], v[12], 0, 0);
G2(v[1], v[5], v[9], v[13], 0, 0);
G2(v[2], v[6], v[10], v[14], 0, 0);
G2(v[3], v[7], v[11], v[15], m, 0);
G2(v[0], v[5], v[10], v[15], 0, 0);
G2(v[1], v[6], v[11], v[12], 0, 0);
G2(v[2], v[7], v[8], v[13], 0, 0);
G2(v[3], v[4], v[9], v[14], 0, 0);
// mix 11
G2(v[0], v[4], v[8], v[12], 0, m);
G2(v[1], v[5], v[9], v[13], 0, 0);
G2(v[2], v[6], v[10], v[14], 0, 0);
G2(v[3], v[7], v[11], v[15], 0, 0);
G2(v[0], v[5], v[10], v[15], 0, 0);
G2(v[1], v[6], v[11], v[12], 0, 0);
G2(v[2], v[7], v[8], v[13], 0, 0);
G2(v[3], v[4], v[9], v[14], 0, 0);
// mix 12
G2(v[0], v[4], v[8], v[12], 0, 0);
G2(v[1], v[5], v[9], v[13], 0, 0);
G2(v[2], v[6], v[10], v[14], 0, 0);
G2(v[3], v[7], v[11], v[15], 0, 0);
G2(v[0], v[5], v[10], v[15], m, 0);
G2(v[1], v[6], v[11], v[12], 0, 0);
G2(v[2], v[7], v[8], v[13], 0, 0);
G2(v[3], v[4], v[9], v[14], 0, 0);
h[0] ^= v[0] ^ v[8];
h[1] ^= v[1] ^ v[9];
h[2] ^= v[2] ^ v[10];
h[3] ^= v[3] ^ v[11];
h[4] ^= v[4] ^ v[12];
h[5] ^= v[5] ^ v[13];
h[6] ^= v[6] ^ v[14];
}
__forceinline__ __device__ void blake2b_gpu_hash2(uint64_t* h, u32 idx) {
u64 m[16] = { 0 };
u32* ptr = (u32*)&m[1];
ptr[1] = idx;
u64 v[16];
v[0] = h[0];
v[1] = h[1];
v[2] = h[2];
v[3] = h[3];
v[4] = h[4];
v[5] = h[5];
v[6] = h[6];
v[7] = h[7];
v[8] = 0x6a09e667f3bcc908;
v[9] = 0xbb67ae8584caa73b;
v[10] = 0x3c6ef372fe94f82b;
v[11] = 0xa54ff53a5f1d36f1;
v[12] = 0x510e527fade682d1 ^ (128 + 16);
v[13] = 0x9b05688c2b3e6c1f;
v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff;
v[15] = 0x5be0cd19137e2179;
ROUND(0);
ROUND(1);
ROUND(2);
ROUND(3);
ROUND(4);
ROUND(5);
ROUND(6);
ROUND(7);
ROUND(8);
ROUND(9);
ROUND(10);
ROUND(11);
h[0] ^= v[0] ^ v[8];
h[1] ^= v[1] ^ v[9];
h[2] ^= v[2] ^ v[10];
h[3] ^= v[3] ^ v[11];
h[4] ^= v[4] ^ v[12];
h[5] ^= v[5] ^ v[13];
h[6] ^= v[6] ^ v[14];
//h[7] ^= v[7] ^ v[15];
//memcpy(hash, (uchar *)h, outlen);
}

View File

@ -2156,4 +2156,4 @@ template class eq_cuda_context<CONFIG_MODE_2>;
#ifdef CONFIG_MODE_3
template class eq_cuda_context<CONFIG_MODE_3>;
#endif
#endif

57
cuda_tromp/CMakeLists.txt Normal file
View File

@ -0,0 +1,57 @@
set(EXECUTABLE cuda_tromp)
option(ENABLE_CUDA "Enable the cuda build" ON)
# depending on gcc version
# ;-std=c++11 => Ubuntu 14.04 check gcc versions
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11)
file(GLOB SRC_LIST
cuda_tromp.cpp
equi_miner.cu )
file(GLOB HEADERS
cuda_tromp.hpp
eqcuda.hpp
)
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-m64;--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)
add_definitions(-DHIST)
#add_definitions(-DXINTREE)
#add_definitions(-DUNROLL)
list(APPEND CUDA_NVCC_FLAGS_RELEASE -O3)
FIND_PACKAGE(CUDA REQUIRED)
if(COMPUTE AND (COMPUTE GREATER 0))
LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
else(COMPUTE AND (COMPUTE GREATER 0))
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};; -gencode arch=compute_20,code=sm_21; -gencode arch=compute_30,code=sm_30; -gencode arch=compute_35,code=sm_35; -gencode arch=compute_50,code=sm_50; -gencode arch=compute_52,code=sm_52; -gencode arch=compute_61,code=sm_61 )
endif(COMPUTE AND (COMPUTE GREATER 0))
include_directories(${CUDA_INCLUDE_DIRS})
find_package(Threads REQUIRED COMPONENTS)
find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
if(CUDA_FOUND)
message("CUDA FOUND")
else()
message("CUDA NOT FOUND")
endif()
include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${CUDA_INCLUDE_DIRS})
include_directories(..)
CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES})
message("-- CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )

View File

@ -56,6 +56,8 @@ void MinerFactory::ClearAllSolvers() {
}
ISolver * MinerFactory::GenCPUSolver(int use_opt) {
// TODO fix dynamic linking on Linux
#ifdef USE_CPU_XENONCAT
if (_use_xenoncat) {
_solvers.push_back(new CPUSolverXenoncat(use_opt));
return _solvers.back();
@ -63,6 +65,10 @@ ISolver * MinerFactory::GenCPUSolver(int use_opt) {
_solvers.push_back(new CPUSolverTromp(use_opt));
return _solvers.back();
}
#else
_solvers.push_back(new CPUSolverTromp(use_opt));
return _solvers.back();
#endif
}
ISolver * MinerFactory::GenCUDASolver(int dev_id, int blocks, int threadsperblock) {
@ -75,7 +81,7 @@ ISolver * MinerFactory::GenCUDASolver(int dev_id, int blocks, int threadsperbloc
return _solvers.back();
}
}
// no OpenCL solvers at the moment keep for future reference
ISolver * MinerFactory::GenOPENCLSolver(int platf_id, int dev_id) {
if (_use_silentarmy) {
_solvers.push_back(new OPENCLSolverSilentarmy(platf_id, dev_id));

View File

@ -108,7 +108,12 @@ void print_help()
void print_cuda_info()
{
int num_devices = cuda_tromp::getcount();
#if defined(USE_CUDA_DJEZO) || defined(USE_CUDA_TROMP)
#ifdef USE_CUDA_DJEZO
int num_devices = cuda_djezo::getcount();
#elif USE_CUDA_TROMP
int num_devices = cuda_tromp::getcount();
#endif
std::cout << "Number of CUDA devices found: " << num_devices << std::endl;
@ -123,6 +128,7 @@ void print_cuda_info()
#endif
std::cout << "\t#" << i << " " << gpuname << " | SM version: " << version << " | SM count: " << smcount << std::endl;
}
#endif
}
void print_opencl_info() {