fix Linux build, one cmake file (TODO make shared lib all in one build)
This commit is contained in:
parent
d56e5e8e26
commit
af3a5c48ac
|
@ -0,0 +1,198 @@
|
|||
project(nheqminer)
|
||||
cmake_minimum_required(VERSION 3.5)
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") # -Wall
|
||||
|
||||
## Enable solvers here
|
||||
#### older slower
|
||||
option(USE_CPU_TROMP "USE CPU_TROMP" OFF)
|
||||
option(USE_CUDA_TROMP "USE CUDA_TROMP" OFF)
|
||||
#### faster
|
||||
option(USE_CPU_XENONCAT "USE CPU_XENONCAT" ON)
|
||||
option(USE_CUDA_DJEZO "USE CUDA_DJEZO" ON)
|
||||
|
||||
## Add solvers here
|
||||
if (USE_CPU_TROMP)
|
||||
add_definitions(-DUSE_CPU_TROMP)
|
||||
message("-- USE_CPU_TROMP DEFINED")
|
||||
endif()
|
||||
if (USE_CPU_XENONCAT)
|
||||
add_definitions(-DUSE_CPU_XENONCAT)
|
||||
message("-- USE_CPU_XENONCAT DEFINED")
|
||||
endif()
|
||||
if (USE_CUDA_TROMP)
|
||||
add_definitions(-DUSE_CUDA_TROMP)
|
||||
message("-- USE_CUDA_TROMP DEFINED")
|
||||
endif()
|
||||
if (USE_CUDA_DJEZO)
|
||||
add_definitions(-DUSE_CUDA_DJEZO)
|
||||
message("-- USE_CUDA_DJEZO DEFINED")
|
||||
endif()
|
||||
|
||||
|
||||
########
|
||||
# LINUX
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
# # use native cpu features
|
||||
# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -fPIC")
|
||||
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -fPIC")
|
||||
|
||||
# # optimizations
|
||||
# add_definitions(-O3)
|
||||
|
||||
# use
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m64 -msse2")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse2")
|
||||
# optimizations
|
||||
add_definitions(-O2)
|
||||
endif()
|
||||
|
||||
# Common
|
||||
include_directories(${nheqminer_SOURCE_DIR}/nheqminer)
|
||||
|
||||
# BOOST
|
||||
#find_package(Threads REQUIRED COMPONENTS)
|
||||
# compile boost staticaly
|
||||
set(Boost_USE_STATIC_LIBS ON)
|
||||
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
|
||||
#set(BUILD_SHARED_LIBRARIES OFF)
|
||||
#set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++ -static")
|
||||
find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
|
||||
|
||||
if (Boost_FOUND)
|
||||
# From the offical documentation:
|
||||
# Add include directories to the build. [...] If the SYSTEM option is given,
|
||||
# the compiler will be told the directories are meant as system include
|
||||
# directories on some platforms (signalling this setting might achieve effects
|
||||
# such as the compiler skipping warnings [...])."
|
||||
include_directories (SYSTEM ${Boost_INCLUDE_DIR})
|
||||
|
||||
# From the offical documentation:
|
||||
# "Specify directories in which the linker will look for libraries. [...] Note
|
||||
# that this command is rarely necessary. Library locations returned by
|
||||
# find_package() and find_library() are absolute paths. Pass these absolute
|
||||
# library file paths directly to the target_link_libraries() command. CMake
|
||||
# will ensure the linker finds them."
|
||||
link_directories (${Boost_LIBRARY_DIRS})
|
||||
else()
|
||||
message("Boost_FOUND NOT FOUND")
|
||||
endif ()
|
||||
|
||||
include_directories(${CMAKE_CURRENT_BINARY_DIR}/../)
|
||||
|
||||
set(SOURCE_FILES
|
||||
# sources
|
||||
nheqminer/amount.cpp
|
||||
nheqminer/api.cpp
|
||||
nheqminer/arith_uint256.cpp
|
||||
nheqminer/crypto/sha256.cpp
|
||||
nheqminer/json/json_spirit_reader.cpp
|
||||
nheqminer/json/json_spirit_value.cpp
|
||||
nheqminer/json/json_spirit_writer.cpp
|
||||
nheqminer/libstratum/ZcashStratum.cpp
|
||||
nheqminer/main.cpp
|
||||
nheqminer/primitives/block.cpp
|
||||
nheqminer/speed.cpp
|
||||
nheqminer/uint256.cpp
|
||||
nheqminer/utilstrencodings.cpp
|
||||
# headers
|
||||
nheqminer/amount.h
|
||||
nheqminer/api.hpp
|
||||
nheqminer/arith_uint256.h
|
||||
nheqminer/crypto/sha256.h
|
||||
nheqminer/hash.h
|
||||
nheqminer/json/json_spirit.h
|
||||
nheqminer/json/json_spirit_error_position.h
|
||||
nheqminer/json/json_spirit_reader.h
|
||||
nheqminer/json/json_spirit_reader_template.h
|
||||
nheqminer/json/json_spirit_stream_reader.h
|
||||
nheqminer/json/json_spirit_utils.h
|
||||
nheqminer/json/json_spirit_value.h
|
||||
nheqminer/json/json_spirit_writer.h
|
||||
nheqminer/json/json_spirit_writer_template.h
|
||||
nheqminer/libstratum/StratumClient.cpp
|
||||
nheqminer/libstratum/StratumClient.h
|
||||
nheqminer/libstratum/ZcashStratum.cpp
|
||||
nheqminer/libstratum/ZcashStratum.h
|
||||
nheqminer/primitives/block.h
|
||||
nheqminer/primitives/transaction.h
|
||||
nheqminer/script/script.h
|
||||
nheqminer/serialize.h
|
||||
nheqminer/speed.hpp
|
||||
nheqminer/streams.h
|
||||
nheqminer/support/allocators/zeroafterfree.h
|
||||
nheqminer/tinyformat.h
|
||||
nheqminer/uint252.h
|
||||
nheqminer/uint256.h
|
||||
nheqminer/utilstrencodings.h
|
||||
nheqminer/version.h
|
||||
nheqminer/zcash/JoinSplit.hpp
|
||||
nheqminer/zcash/NoteEncryption.hpp
|
||||
nheqminer/zcash/Proof.hpp
|
||||
nheqminer/zcash/Zcash.h
|
||||
nheqminer/SolverStub.h # just a stub
|
||||
|
||||
nheqminer/AvailableSolvers.h
|
||||
nheqminer/ISolver.h
|
||||
nheqminer/Solver.h
|
||||
nheqminer/MinerFactory.h
|
||||
nheqminer/MinerFactory.cpp
|
||||
|
||||
# make same path on windows
|
||||
#blake shared
|
||||
# src
|
||||
blake2/blake2bx.cpp
|
||||
# headers
|
||||
blake2/blake2.h
|
||||
blake2/blake2b-load-sse2.h
|
||||
blake2/blake2b-load-sse41.h
|
||||
blake2/blake2b-round.h
|
||||
blake2/blake2-config.h
|
||||
blake2/blake2-impl.h
|
||||
blake2/blake2-round.h
|
||||
)
|
||||
|
||||
#set(LIBS ${LIBS} ${Threads_LIBRARIES} ${Boost_LIBRARIES})
|
||||
set(LIBS ${LIBS} ${Boost_LIBRARIES})
|
||||
|
||||
message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}")
|
||||
message("-- LIBS: ${LIBS}")
|
||||
|
||||
if (USE_CPU_TROMP)
|
||||
add_subdirectory(cpu_tromp)
|
||||
endif()
|
||||
if (USE_CPU_XENONCAT)
|
||||
add_subdirectory(cpu_xenoncat)
|
||||
endif()
|
||||
if (USE_CUDA_TROMP)
|
||||
add_subdirectory(cuda_tromp)
|
||||
endif()
|
||||
if (USE_CUDA_DJEZO)
|
||||
add_subdirectory(cuda_djezo)
|
||||
endif()
|
||||
|
||||
#add_subdirectory(cpu_xenoncat)
|
||||
|
||||
ADD_EXECUTABLE(${PROJECT_NAME} ${SOURCE_FILES})
|
||||
|
||||
#target_link_libraries(${PROJECT_NAME} ${LIBS} ${CUDA_LIBRARIES} )
|
||||
target_link_libraries(${PROJECT_NAME} ${CMAKE_THREAD_LIBS_INIT} ${LIBS} )
|
||||
|
||||
# link libs
|
||||
if (USE_CPU_TROMP)
|
||||
target_link_libraries(${PROJECT_NAME} cpu_tromp)
|
||||
endif()
|
||||
if (USE_CPU_XENONCAT)
|
||||
add_library ( xenoncat_avx1 SHARED IMPORTED GLOBAL )
|
||||
set_target_properties ( xenoncat_avx1 PROPERTIES IMPORTED_LOCATION "../nheqminer/cpu_xenoncat/asm_linux/equihash_avx1.o" )
|
||||
add_library ( xenoncat_avx2 SHARED IMPORTED GLOBAL )
|
||||
set_target_properties ( xenoncat_avx2 PROPERTIES IMPORTED_LOCATION "../nheqminer/cpu_xenoncat/asm_linux/equihash_avx2.o" )
|
||||
target_link_libraries(${PROJECT_NAME} cpu_xenoncat xenoncat_avx1 xenoncat_avx2)
|
||||
endif()
|
||||
if (USE_CUDA_TROMP)
|
||||
target_link_libraries(${PROJECT_NAME} cuda_tromp)
|
||||
endif()
|
||||
if (USE_CUDA_DJEZO)
|
||||
target_link_libraries(${PROJECT_NAME} cuda_djezo)
|
||||
endif()
|
||||
|
|
@ -17,34 +17,7 @@ endif()
|
|||
# Common
|
||||
include_directories(${nheqminer_SOURCE_DIR})
|
||||
|
||||
#add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK)
|
||||
# BOOST
|
||||
#find_package(Threads REQUIRED COMPONENTS)
|
||||
# compile boost staticaly
|
||||
set(Boost_USE_STATIC_LIBS ON)
|
||||
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
|
||||
#set(BUILD_SHARED_LIBRARIES OFF)
|
||||
#set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++ -static")
|
||||
find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
|
||||
|
||||
if (Boost_FOUND)
|
||||
# From the offical documentation:
|
||||
# Add include directories to the build. [...] If the SYSTEM option is given,
|
||||
# the compiler will be told the directories are meant as system include
|
||||
# directories on some platforms (signalling this setting might achieve effects
|
||||
# such as the compiler skipping warnings [...])."
|
||||
include_directories (SYSTEM ${Boost_INCLUDE_DIR})
|
||||
|
||||
# From the offical documentation:
|
||||
# "Specify directories in which the linker will look for libraries. [...] Note
|
||||
# that this command is rarely necessary. Library locations returned by
|
||||
# find_package() and find_library() are absolute paths. Pass these absolute
|
||||
# library file paths directly to the target_link_libraries() command. CMake
|
||||
# will ensure the linker finds them."
|
||||
link_directories (${Boost_LIBRARY_DIRS})
|
||||
else()
|
||||
message("Boost_FOUND NOT FOUND")
|
||||
endif ()
|
||||
add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK)
|
||||
|
||||
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-m64;--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)
|
||||
|
||||
|
@ -75,6 +48,25 @@ else()
|
|||
message("CUDA NOT FOUND")
|
||||
endif()
|
||||
|
||||
if (Boost_FOUND)
|
||||
# From the offical documentation:
|
||||
# Add include directories to the build. [...] If the SYSTEM option is given,
|
||||
# the compiler will be told the directories are meant as system include
|
||||
# directories on some platforms (signalling this setting might achieve effects
|
||||
# such as the compiler skipping warnings [...])."
|
||||
include_directories (SYSTEM ${Boost_INCLUDE_DIR})
|
||||
|
||||
# From the offical documentation:
|
||||
# "Specify directories in which the linker will look for libraries. [...] Note
|
||||
# that this command is rarely necessary. Library locations returned by
|
||||
# find_package() and find_library() are absolute paths. Pass these absolute
|
||||
# library file paths directly to the target_link_libraries() command. CMake
|
||||
# will ensure the linker finds them."
|
||||
link_directories (${Boost_LIBRARY_DIRS})
|
||||
else()
|
||||
message("Boost_FOUND NOT FOUND")
|
||||
endif ()
|
||||
|
||||
## Add solvers here
|
||||
#add_definitions(-DUSE_CPU_XENONCAT)
|
||||
#add_definitions(-DUSE_CPU_TROMP)
|
||||
|
@ -141,11 +133,11 @@ set(SOURCE_FILES
|
|||
../../nheqminer/SolverStub.h # just a stub
|
||||
|
||||
|
||||
../../nheqminer/AvailableSolvers.h
|
||||
../../nheqminer/ISolver.h
|
||||
../../nheqminer/Solver.h
|
||||
../../nheqminer/MinerFactory.h
|
||||
../../nheqminer/MinerFactory.cpp
|
||||
../../nheqminer/AvailableSolvers.h
|
||||
../../nheqminer/ISolver.h
|
||||
../../nheqminer/Solver.h
|
||||
../../nheqminer/MinerFactory.h
|
||||
../../nheqminer/MinerFactory.cpp
|
||||
|
||||
# # cpu tromp
|
||||
# ../../cpu_tromp/blake2/blake2bx.cpp
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
/*
|
||||
BLAKE2 reference source code package - optimized C implementations
|
||||
|
||||
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
|
||||
|
||||
To the extent possible under law, the author(s) have dedicated all copyright
|
||||
and related and neighboring rights to this software to the public domain
|
||||
worldwide. This software is distributed without any warranty.
|
||||
|
||||
You should have received a copy of the CC0 Public Domain Dedication along with
|
||||
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
#pragma once
|
||||
#ifndef __BLAKE2_CONFIG_H__
|
||||
#define __BLAKE2_CONFIG_H__
|
||||
|
||||
// These don't work everywhere
|
||||
#if (defined(__SSE2__) || defined(_M_AMD_64) || defined(_M_X64))
|
||||
#define HAVE_SSE2
|
||||
#endif
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
#define HAVE_SSSE3
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#define HAVE_SSE41
|
||||
#endif
|
||||
|
||||
#if defined(__AVX__)
|
||||
#define HAVE_AVX
|
||||
#endif
|
||||
|
||||
#if defined(__XOP__)
|
||||
#define HAVE_XOP
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
#ifndef HAVE_AVX
|
||||
#define HAVE_AVX
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_XOP
|
||||
#ifndef HAVE_AVX
|
||||
#define HAVE_AVX
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_AVX
|
||||
#ifndef HAVE_SSE41
|
||||
#define HAVE_SSE41
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SSE41
|
||||
#ifndef HAVE_SSSE3
|
||||
#define HAVE_SSSE3
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SSSE3
|
||||
#define HAVE_SSE2
|
||||
#endif
|
||||
|
||||
#if !defined(HAVE_SSE2)
|
||||
#error "This code requires at least SSE2."
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,136 @@
|
|||
/*
|
||||
BLAKE2 reference source code package - optimized C implementations
|
||||
|
||||
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
|
||||
|
||||
To the extent possible under law, the author(s) have dedicated all copyright
|
||||
and related and neighboring rights to this software to the public domain
|
||||
worldwide. This software is distributed without any warranty.
|
||||
|
||||
You should have received a copy of the CC0 Public Domain Dedication along with
|
||||
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
#pragma once
|
||||
#ifndef __BLAKE2_IMPL_H__
|
||||
#define __BLAKE2_IMPL_H__
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
static inline uint32_t load32( const void *src )
|
||||
{
|
||||
#if defined(NATIVE_LITTLE_ENDIAN)
|
||||
uint32_t w;
|
||||
memcpy(&w, src, sizeof w);
|
||||
return w;
|
||||
#else
|
||||
const uint8_t *p = ( const uint8_t * )src;
|
||||
uint32_t w = *p++;
|
||||
w |= ( uint32_t )( *p++ ) << 8;
|
||||
w |= ( uint32_t )( *p++ ) << 16;
|
||||
w |= ( uint32_t )( *p++ ) << 24;
|
||||
return w;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline uint64_t load64( const void *src )
|
||||
{
|
||||
#if defined(NATIVE_LITTLE_ENDIAN)
|
||||
uint64_t w;
|
||||
memcpy(&w, src, sizeof w);
|
||||
return w;
|
||||
#else
|
||||
const uint8_t *p = ( const uint8_t * )src;
|
||||
uint64_t w = *p++;
|
||||
w |= ( uint64_t )( *p++ ) << 8;
|
||||
w |= ( uint64_t )( *p++ ) << 16;
|
||||
w |= ( uint64_t )( *p++ ) << 24;
|
||||
w |= ( uint64_t )( *p++ ) << 32;
|
||||
w |= ( uint64_t )( *p++ ) << 40;
|
||||
w |= ( uint64_t )( *p++ ) << 48;
|
||||
w |= ( uint64_t )( *p++ ) << 56;
|
||||
return w;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void store32( void *dst, uint32_t w )
|
||||
{
|
||||
#if defined(NATIVE_LITTLE_ENDIAN)
|
||||
memcpy(dst, &w, sizeof w);
|
||||
#else
|
||||
uint8_t *p = ( uint8_t * )dst;
|
||||
*p++ = ( uint8_t )w; w >>= 8;
|
||||
*p++ = ( uint8_t )w; w >>= 8;
|
||||
*p++ = ( uint8_t )w; w >>= 8;
|
||||
*p++ = ( uint8_t )w;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void store64( void *dst, uint64_t w )
|
||||
{
|
||||
#if defined(NATIVE_LITTLE_ENDIAN)
|
||||
memcpy(dst, &w, sizeof w);
|
||||
#else
|
||||
uint8_t *p = ( uint8_t * )dst;
|
||||
*p++ = ( uint8_t )w; w >>= 8;
|
||||
*p++ = ( uint8_t )w; w >>= 8;
|
||||
*p++ = ( uint8_t )w; w >>= 8;
|
||||
*p++ = ( uint8_t )w; w >>= 8;
|
||||
*p++ = ( uint8_t )w; w >>= 8;
|
||||
*p++ = ( uint8_t )w; w >>= 8;
|
||||
*p++ = ( uint8_t )w; w >>= 8;
|
||||
*p++ = ( uint8_t )w;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline uint64_t load48( const void *src )
|
||||
{
|
||||
const uint8_t *p = ( const uint8_t * )src;
|
||||
uint64_t w = *p++;
|
||||
w |= ( uint64_t )( *p++ ) << 8;
|
||||
w |= ( uint64_t )( *p++ ) << 16;
|
||||
w |= ( uint64_t )( *p++ ) << 24;
|
||||
w |= ( uint64_t )( *p++ ) << 32;
|
||||
w |= ( uint64_t )( *p++ ) << 40;
|
||||
return w;
|
||||
}
|
||||
|
||||
static inline void store48( void *dst, uint64_t w )
|
||||
{
|
||||
uint8_t *p = ( uint8_t * )dst;
|
||||
*p++ = ( uint8_t )w; w >>= 8;
|
||||
*p++ = ( uint8_t )w; w >>= 8;
|
||||
*p++ = ( uint8_t )w; w >>= 8;
|
||||
*p++ = ( uint8_t )w; w >>= 8;
|
||||
*p++ = ( uint8_t )w; w >>= 8;
|
||||
*p++ = ( uint8_t )w;
|
||||
}
|
||||
|
||||
static inline uint32_t rotl32( const uint32_t w, const unsigned c )
|
||||
{
|
||||
return ( w << c ) | ( w >> ( 32 - c ) );
|
||||
}
|
||||
|
||||
static inline uint64_t rotl64( const uint64_t w, const unsigned c )
|
||||
{
|
||||
return ( w << c ) | ( w >> ( 64 - c ) );
|
||||
}
|
||||
|
||||
static inline uint32_t rotr32( const uint32_t w, const unsigned c )
|
||||
{
|
||||
return ( w >> c ) | ( w << ( 32 - c ) );
|
||||
}
|
||||
|
||||
static inline uint64_t rotr64( const uint64_t w, const unsigned c )
|
||||
{
|
||||
return ( w >> c ) | ( w << ( 64 - c ) );
|
||||
}
|
||||
|
||||
/* prevents compiler optimizing out memset() */
|
||||
static inline void secure_zero_memory( void *v, size_t n )
|
||||
{
|
||||
volatile uint8_t *p = ( volatile uint8_t * )v;
|
||||
while( n-- ) *p++ = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
#define _mm_roti_epi64(x, c) \
|
||||
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
|
||||
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
|
||||
: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
|
||||
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
|
||||
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
|
||||
|
||||
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
|
||||
row1l = _mm_add_epi64(row1l, row2l); \
|
||||
row1h = _mm_add_epi64(row1h, row2h); \
|
||||
\
|
||||
row4l = _mm_xor_si128(row4l, row1l); \
|
||||
row4h = _mm_xor_si128(row4h, row1h); \
|
||||
\
|
||||
row4l = _mm_roti_epi64(row4l, -32); \
|
||||
row4h = _mm_roti_epi64(row4h, -32); \
|
||||
\
|
||||
row3l = _mm_add_epi64(row3l, row4l); \
|
||||
row3h = _mm_add_epi64(row3h, row4h); \
|
||||
\
|
||||
row2l = _mm_xor_si128(row2l, row3l); \
|
||||
row2h = _mm_xor_si128(row2h, row3h); \
|
||||
\
|
||||
row2l = _mm_roti_epi64(row2l, -24); \
|
||||
row2h = _mm_roti_epi64(row2h, -24); \
|
||||
|
||||
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
|
||||
row1l = _mm_add_epi64(row1l, row2l); \
|
||||
row1h = _mm_add_epi64(row1h, row2h); \
|
||||
\
|
||||
row4l = _mm_xor_si128(row4l, row1l); \
|
||||
row4h = _mm_xor_si128(row4h, row1h); \
|
||||
\
|
||||
row4l = _mm_roti_epi64(row4l, -16); \
|
||||
row4h = _mm_roti_epi64(row4h, -16); \
|
||||
\
|
||||
row3l = _mm_add_epi64(row3l, row4l); \
|
||||
row3h = _mm_add_epi64(row3h, row4h); \
|
||||
\
|
||||
row2l = _mm_xor_si128(row2l, row3l); \
|
||||
row2h = _mm_xor_si128(row2h, row3h); \
|
||||
\
|
||||
row2l = _mm_roti_epi64(row2l, -63); \
|
||||
row2h = _mm_roti_epi64(row2h, -63); \
|
||||
|
||||
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
|
||||
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
|
||||
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
|
||||
row2l = t0; \
|
||||
row2h = t1; \
|
||||
\
|
||||
t0 = row3l; \
|
||||
row3l = row3h; \
|
||||
row3h = t0; \
|
||||
\
|
||||
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
|
||||
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
|
||||
row4l = t1; \
|
||||
row4h = t0;
|
||||
|
||||
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
|
||||
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
|
||||
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
|
||||
row2l = t0; \
|
||||
row2h = t1; \
|
||||
\
|
||||
t0 = row3l; \
|
||||
row3l = row3h; \
|
||||
row3h = t0; \
|
||||
\
|
||||
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
|
||||
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
|
||||
row4l = t1; \
|
||||
row4h = t0;
|
||||
|
||||
#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \
|
||||
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
||||
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
||||
\
|
||||
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
||||
\
|
||||
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
||||
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
||||
\
|
||||
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
|
@ -0,0 +1,156 @@
|
|||
/*
|
||||
BLAKE2 reference source code package - optimized C implementations
|
||||
|
||||
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
|
||||
|
||||
To the extent possible under law, the author(s) have dedicated all copyright
|
||||
and related and neighboring rights to this software to the public domain
|
||||
worldwide. This software is distributed without any warranty.
|
||||
|
||||
You should have received a copy of the CC0 Public Domain Dedication along with
|
||||
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
#pragma once
|
||||
#ifndef __BLAKE2_H__
|
||||
#define __BLAKE2_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define ALIGN(x) __declspec(align(x))
|
||||
#else
|
||||
#define ALIGN(x) __attribute__ ((__aligned__(x)))
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
enum blake2s_constant
|
||||
{
|
||||
BLAKE2S_BLOCKBYTES = 64,
|
||||
BLAKE2S_OUTBYTES = 32,
|
||||
BLAKE2S_KEYBYTES = 32,
|
||||
BLAKE2S_SALTBYTES = 8,
|
||||
BLAKE2S_PERSONALBYTES = 8
|
||||
};
|
||||
|
||||
enum blake2b_constant
|
||||
{
|
||||
BLAKE2B_BLOCKBYTES = 128,
|
||||
BLAKE2B_OUTBYTES = 64,
|
||||
BLAKE2B_KEYBYTES = 64,
|
||||
BLAKE2B_SALTBYTES = 16,
|
||||
BLAKE2B_PERSONALBYTES = 16
|
||||
};
|
||||
|
||||
#pragma pack(push, 1)
|
||||
typedef struct __blake2s_param
|
||||
{
|
||||
uint8_t digest_length; // 1
|
||||
uint8_t key_length; // 2
|
||||
uint8_t fanout; // 3
|
||||
uint8_t depth; // 4
|
||||
uint32_t leaf_length; // 8
|
||||
uint8_t node_offset[6];// 14
|
||||
uint8_t node_depth; // 15
|
||||
uint8_t inner_length; // 16
|
||||
// uint8_t reserved[0];
|
||||
uint8_t salt[BLAKE2S_SALTBYTES]; // 24
|
||||
uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
|
||||
} blake2s_param;
|
||||
|
||||
ALIGN( 64 ) typedef struct __blake2s_state
|
||||
{
|
||||
uint32_t h[8];
|
||||
uint32_t t[2];
|
||||
uint32_t f[2];
|
||||
uint8_t buf[2 * BLAKE2S_BLOCKBYTES];
|
||||
size_t buflen;
|
||||
uint8_t last_node;
|
||||
} blake2s_state;
|
||||
|
||||
typedef struct __blake2b_param
|
||||
{
|
||||
uint8_t digest_length; // 1
|
||||
uint8_t key_length; // 2
|
||||
uint8_t fanout; // 3
|
||||
uint8_t depth; // 4
|
||||
uint32_t leaf_length; // 8
|
||||
uint64_t node_offset; // 16
|
||||
uint8_t node_depth; // 17
|
||||
uint8_t inner_length; // 18
|
||||
uint8_t reserved[14]; // 32
|
||||
uint8_t salt[BLAKE2B_SALTBYTES]; // 48
|
||||
uint8_t personal[BLAKE2B_PERSONALBYTES]; // 64
|
||||
} blake2b_param;
|
||||
|
||||
ALIGN( 64 ) typedef struct __blake2b_state
|
||||
{
|
||||
uint64_t h[8];
|
||||
uint8_t buf[BLAKE2B_BLOCKBYTES];
|
||||
uint16_t counter;
|
||||
uint8_t buflen;
|
||||
uint8_t lastblock;
|
||||
} blake2b_state;
|
||||
|
||||
ALIGN( 64 ) typedef struct __blake2sp_state
|
||||
{
|
||||
blake2s_state S[8][1];
|
||||
blake2s_state R[1];
|
||||
uint8_t buf[8 * BLAKE2S_BLOCKBYTES];
|
||||
size_t buflen;
|
||||
} blake2sp_state;
|
||||
|
||||
ALIGN( 64 ) typedef struct __blake2bp_state
|
||||
{
|
||||
blake2b_state S[4][1];
|
||||
blake2b_state R[1];
|
||||
uint8_t buf[4 * BLAKE2B_BLOCKBYTES];
|
||||
size_t buflen;
|
||||
} blake2bp_state;
|
||||
#pragma pack(pop)
|
||||
|
||||
// Streaming API
|
||||
int blake2s_init( blake2s_state *S, const uint8_t outlen );
|
||||
int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
|
||||
int blake2s_init_param( blake2s_state *S, const blake2s_param *P );
|
||||
int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen );
|
||||
int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen );
|
||||
|
||||
int blake2b_init( blake2b_state *S, const uint8_t outlen );
|
||||
int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
|
||||
int blake2b_init_param( blake2b_state *S, const blake2b_param *P );
|
||||
int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen );
|
||||
int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen );
|
||||
|
||||
int blake2sp_init( blake2sp_state *S, const uint8_t outlen );
|
||||
int blake2sp_init_key( blake2sp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
|
||||
int blake2sp_update( blake2sp_state *S, const uint8_t *in, uint64_t inlen );
|
||||
int blake2sp_final( blake2sp_state *S, uint8_t *out, uint8_t outlen );
|
||||
|
||||
int blake2bp_init( blake2bp_state *S, const uint8_t outlen );
|
||||
int blake2bp_init_key( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
|
||||
int blake2bp_update( blake2bp_state *S, const uint8_t *in, uint64_t inlen );
|
||||
int blake2bp_final( blake2bp_state *S, uint8_t *out, uint8_t outlen );
|
||||
|
||||
// Simple API
|
||||
int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
|
||||
int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
|
||||
int blake2b_long(uint8_t *out, const void *in, const uint32_t outlen, const uint64_t inlen);
|
||||
|
||||
int blake2sp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
|
||||
int blake2bp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
|
||||
|
||||
static inline int blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
|
||||
{
|
||||
return blake2b( out, in, key, outlen, inlen, keylen );
|
||||
}
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
/*
|
||||
BLAKE2 reference source code package - optimized C implementations
|
||||
|
||||
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
|
||||
|
||||
To the extent possible under law, the author(s) have dedicated all copyright
|
||||
and related and neighboring rights to this software to the public domain
|
||||
worldwide. This software is distributed without any warranty.
|
||||
|
||||
You should have received a copy of the CC0 Public Domain Dedication along with
|
||||
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
#pragma once
|
||||
#ifndef __BLAKE2B_LOAD_SSE2_H__
|
||||
#define __BLAKE2B_LOAD_SSE2_H__
|
||||
|
||||
#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
|
||||
#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
|
||||
#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
|
||||
#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
|
||||
#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
|
||||
#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
|
||||
#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
|
||||
#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
|
||||
#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5)
|
||||
#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2)
|
||||
#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7)
|
||||
#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1)
|
||||
#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13)
|
||||
#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12)
|
||||
#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4)
|
||||
#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0)
|
||||
#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2)
|
||||
#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4)
|
||||
#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6)
|
||||
#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8)
|
||||
#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0)
|
||||
#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11)
|
||||
#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15)
|
||||
#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14)
|
||||
#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14)
|
||||
#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13)
|
||||
#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9)
|
||||
#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2)
|
||||
#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12)
|
||||
#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1)
|
||||
#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8)
|
||||
#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6)
|
||||
#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11)
|
||||
#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3)
|
||||
#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1)
|
||||
#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4)
|
||||
#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7)
|
||||
#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6)
|
||||
#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3)
|
||||
#define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12)
|
||||
#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
|
||||
#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
|
||||
#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
|
||||
#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
|
||||
#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
|
||||
#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
|
||||
#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
|
||||
#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
|
||||
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,402 @@
|
|||
/*
|
||||
BLAKE2 reference source code package - optimized C implementations
|
||||
|
||||
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
|
||||
|
||||
To the extent possible under law, the author(s) have dedicated all copyright
|
||||
and related and neighboring rights to this software to the public domain
|
||||
worldwide. This software is distributed without any warranty.
|
||||
|
||||
You should have received a copy of the CC0 Public Domain Dedication along with
|
||||
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
#pragma once
|
||||
#ifndef __BLAKE2B_LOAD_SSE41_H__
|
||||
#define __BLAKE2B_LOAD_SSE41_H__
|
||||
|
||||
#define LOAD_MSG_0_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m0, m1); \
|
||||
b1 = _mm_unpacklo_epi64(m2, m3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_0_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpackhi_epi64(m0, m1); \
|
||||
b1 = _mm_unpackhi_epi64(m2, m3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_0_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m4, m5); \
|
||||
b1 = _mm_unpacklo_epi64(m6, m7); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_0_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpackhi_epi64(m4, m5); \
|
||||
b1 = _mm_unpackhi_epi64(m6, m7); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_1_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m7, m2); \
|
||||
b1 = _mm_unpackhi_epi64(m4, m6); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_1_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m5, m4); \
|
||||
b1 = _mm_alignr_epi8(m3, m7, 8); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_1_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
|
||||
b1 = _mm_unpackhi_epi64(m5, m2); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_1_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m6, m1); \
|
||||
b1 = _mm_unpackhi_epi64(m3, m1); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_2_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_alignr_epi8(m6, m5, 8); \
|
||||
b1 = _mm_unpackhi_epi64(m2, m7); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_2_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m4, m0); \
|
||||
b1 = _mm_blend_epi16(m1, m6, 0xF0); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_2_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_blend_epi16(m5, m1, 0xF0); \
|
||||
b1 = _mm_unpackhi_epi64(m3, m4); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_2_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m7, m3); \
|
||||
b1 = _mm_alignr_epi8(m2, m0, 8); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_3_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpackhi_epi64(m3, m1); \
|
||||
b1 = _mm_unpackhi_epi64(m6, m5); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_3_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpackhi_epi64(m4, m0); \
|
||||
b1 = _mm_unpacklo_epi64(m6, m7); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_3_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_blend_epi16(m1, m2, 0xF0); \
|
||||
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_3_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m3, m5); \
|
||||
b1 = _mm_unpacklo_epi64(m0, m4); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_4_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpackhi_epi64(m4, m2); \
|
||||
b1 = _mm_unpacklo_epi64(m1, m5); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_4_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_blend_epi16(m0, m3, 0xF0); \
|
||||
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_4_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_blend_epi16(m7, m5, 0xF0); \
|
||||
b1 = _mm_blend_epi16(m3, m1, 0xF0); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_4_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_alignr_epi8(m6, m0, 8); \
|
||||
b1 = _mm_blend_epi16(m4, m6, 0xF0); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_5_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m1, m3); \
|
||||
b1 = _mm_unpacklo_epi64(m0, m4); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_5_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m6, m5); \
|
||||
b1 = _mm_unpackhi_epi64(m5, m1); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_5_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_blend_epi16(m2, m3, 0xF0); \
|
||||
b1 = _mm_unpackhi_epi64(m7, m0); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_5_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpackhi_epi64(m6, m2); \
|
||||
b1 = _mm_blend_epi16(m7, m4, 0xF0); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_6_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_blend_epi16(m6, m0, 0xF0); \
|
||||
b1 = _mm_unpacklo_epi64(m7, m2); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_6_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpackhi_epi64(m2, m7); \
|
||||
b1 = _mm_alignr_epi8(m5, m6, 8); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_6_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m0, m3); \
|
||||
b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_6_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpackhi_epi64(m3, m1); \
|
||||
b1 = _mm_blend_epi16(m1, m5, 0xF0); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_7_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpackhi_epi64(m6, m3); \
|
||||
b1 = _mm_blend_epi16(m6, m1, 0xF0); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_7_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_alignr_epi8(m7, m5, 8); \
|
||||
b1 = _mm_unpackhi_epi64(m0, m4); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_7_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpackhi_epi64(m2, m7); \
|
||||
b1 = _mm_unpacklo_epi64(m4, m1); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_7_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m0, m2); \
|
||||
b1 = _mm_unpacklo_epi64(m3, m5); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_8_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m3, m7); \
|
||||
b1 = _mm_alignr_epi8(m0, m5, 8); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_8_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpackhi_epi64(m7, m4); \
|
||||
b1 = _mm_alignr_epi8(m4, m1, 8); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_8_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = m6; \
|
||||
b1 = _mm_alignr_epi8(m5, m0, 8); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_8_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_blend_epi16(m1, m3, 0xF0); \
|
||||
b1 = m2; \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_9_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m5, m4); \
|
||||
b1 = _mm_unpackhi_epi64(m3, m0); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_9_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m1, m2); \
|
||||
b1 = _mm_blend_epi16(m3, m2, 0xF0); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_9_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpackhi_epi64(m7, m4); \
|
||||
b1 = _mm_unpackhi_epi64(m1, m6); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_9_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_alignr_epi8(m7, m5, 8); \
|
||||
b1 = _mm_unpacklo_epi64(m6, m0); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_10_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m0, m1); \
|
||||
b1 = _mm_unpacklo_epi64(m2, m3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_10_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpackhi_epi64(m0, m1); \
|
||||
b1 = _mm_unpackhi_epi64(m2, m3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_10_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m4, m5); \
|
||||
b1 = _mm_unpacklo_epi64(m6, m7); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_10_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpackhi_epi64(m4, m5); \
|
||||
b1 = _mm_unpackhi_epi64(m6, m7); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_11_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m7, m2); \
|
||||
b1 = _mm_unpackhi_epi64(m4, m6); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_11_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m5, m4); \
|
||||
b1 = _mm_alignr_epi8(m3, m7, 8); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_11_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
|
||||
b1 = _mm_unpackhi_epi64(m5, m2); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_11_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
b0 = _mm_unpacklo_epi64(m6, m1); \
|
||||
b1 = _mm_unpackhi_epi64(m3, m1); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,170 @@
|
|||
/*
|
||||
BLAKE2 reference source code package - optimized C implementations
|
||||
|
||||
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
|
||||
|
||||
To the extent possible under law, the author(s) have dedicated all copyright
|
||||
and related and neighboring rights to this software to the public domain
|
||||
worldwide. This software is distributed without any warranty.
|
||||
|
||||
You should have received a copy of the CC0 Public Domain Dedication along with
|
||||
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
#pragma once
|
||||
#ifndef __BLAKE2B_ROUND_H__
|
||||
#define __BLAKE2B_ROUND_H__
|
||||
|
||||
#define LOAD(p) _mm_load_si128( (const __m128i *)(p) )
|
||||
#define STORE(p,r) _mm_store_si128((__m128i *)(p), r)
|
||||
|
||||
#define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) )
|
||||
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
|
||||
|
||||
#define TOF(reg) _mm_castsi128_ps((reg))
|
||||
#define TOI(reg) _mm_castps_si128((reg))
|
||||
|
||||
#define LIKELY(x) __builtin_expect((x),1)
|
||||
|
||||
|
||||
/* Microarchitecture-specific macros */
|
||||
#ifndef HAVE_XOP
|
||||
#ifdef HAVE_SSSE3
|
||||
#define _mm_roti_epi64(x, c) \
|
||||
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
|
||||
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
|
||||
: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
|
||||
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
|
||||
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
|
||||
#else
|
||||
#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-c) ))
|
||||
#endif
|
||||
#else
|
||||
/* ... */
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
|
||||
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
|
||||
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
|
||||
\
|
||||
row4l = _mm_xor_si128(row4l, row1l); \
|
||||
row4h = _mm_xor_si128(row4h, row1h); \
|
||||
\
|
||||
row4l = _mm_roti_epi64(row4l, (-32)); \
|
||||
row4h = _mm_roti_epi64(row4h, (-32)); \
|
||||
\
|
||||
row3l = _mm_add_epi64(row3l, row4l); \
|
||||
row3h = _mm_add_epi64(row3h, row4h); \
|
||||
\
|
||||
row2l = _mm_xor_si128(row2l, row3l); \
|
||||
row2h = _mm_xor_si128(row2h, row3h); \
|
||||
\
|
||||
row2l = _mm_roti_epi64(row2l, (-24)); \
|
||||
row2h = _mm_roti_epi64(row2h, (-24)); \
|
||||
|
||||
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
|
||||
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
|
||||
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
|
||||
\
|
||||
row4l = _mm_xor_si128(row4l, row1l); \
|
||||
row4h = _mm_xor_si128(row4h, row1h); \
|
||||
\
|
||||
row4l = _mm_roti_epi64(row4l, (-16)); \
|
||||
row4h = _mm_roti_epi64(row4h, (-16)); \
|
||||
\
|
||||
row3l = _mm_add_epi64(row3l, row4l); \
|
||||
row3h = _mm_add_epi64(row3h, row4h); \
|
||||
\
|
||||
row2l = _mm_xor_si128(row2l, row3l); \
|
||||
row2h = _mm_xor_si128(row2h, row3h); \
|
||||
\
|
||||
row2l = _mm_roti_epi64(row2l, (-63)); \
|
||||
row2h = _mm_roti_epi64(row2h, (-63)); \
|
||||
|
||||
#if defined(HAVE_SSSE3)
|
||||
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
|
||||
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
|
||||
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
|
||||
row2l = t0; \
|
||||
row2h = t1; \
|
||||
\
|
||||
t0 = row3l; \
|
||||
row3l = row3h; \
|
||||
row3h = t0; \
|
||||
\
|
||||
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
|
||||
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
|
||||
row4l = t1; \
|
||||
row4h = t0;
|
||||
|
||||
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
|
||||
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
|
||||
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
|
||||
row2l = t0; \
|
||||
row2h = t1; \
|
||||
\
|
||||
t0 = row3l; \
|
||||
row3l = row3h; \
|
||||
row3h = t0; \
|
||||
\
|
||||
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
|
||||
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
|
||||
row4l = t1; \
|
||||
row4h = t0;
|
||||
#else
|
||||
|
||||
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
|
||||
t0 = row4l;\
|
||||
t1 = row2l;\
|
||||
row4l = row3l;\
|
||||
row3l = row3h;\
|
||||
row3h = row4l;\
|
||||
row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
|
||||
row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
|
||||
row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
|
||||
row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))
|
||||
|
||||
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
|
||||
t0 = row3l;\
|
||||
row3l = row3h;\
|
||||
row3h = t0;\
|
||||
t0 = row2l;\
|
||||
t1 = row4l;\
|
||||
row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
|
||||
row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
|
||||
row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
|
||||
row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_SSE41)
|
||||
#include "blake2b-load-sse41.h"
|
||||
#else
|
||||
#include "blake2b-load-sse2.h"
|
||||
#endif
|
||||
|
||||
#define ROUND(r) \
|
||||
LOAD_MSG_ ##r ##_1(b0, b1); \
|
||||
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
LOAD_MSG_ ##r ##_2(b0, b1); \
|
||||
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
||||
LOAD_MSG_ ##r ##_3(b0, b1); \
|
||||
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
LOAD_MSG_ ##r ##_4(b0, b1); \
|
||||
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
|
||||
#endif
|
||||
|
||||
#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \
|
||||
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
|
||||
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
|
||||
\
|
||||
DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
|
||||
\
|
||||
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
|
||||
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
|
||||
\
|
||||
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
|
|
@ -0,0 +1,346 @@
|
|||
/*
|
||||
BLAKE2 reference source code package - optimized C implementations
|
||||
|
||||
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
|
||||
|
||||
To the extent possible under law, the author(s) have dedicated all copyright
|
||||
and related and neighboring rights to this software to the public domain
|
||||
worldwide. This software is distributed without any warranty.
|
||||
|
||||
You should have received a copy of the CC0 Public Domain Dedication along with
|
||||
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
#include "blake2.h"
|
||||
#include "blake2-impl.h"
|
||||
|
||||
#include "blake2-config.h"
|
||||
|
||||
#ifdef WIN32
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include <emmintrin.h>
|
||||
#if defined(HAVE_SSSE3)
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
#if defined(HAVE_SSE41)
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
#if defined(HAVE_AVX)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#if defined(HAVE_XOP)
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#include "blake2b-round.h"
|
||||
|
||||
|
||||
|
||||
ALIGN(64) static const uint64_t blake2b_IV[8] =
|
||||
{
|
||||
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
|
||||
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
|
||||
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
|
||||
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
|
||||
};
|
||||
|
||||
/* init xors IV with input parameter block */
|
||||
int blake2b_init_param(blake2b_state *S, const blake2b_param *P)
|
||||
{
|
||||
//blake2b_init0( S );
|
||||
const uint8_t * v = (const uint8_t *)(blake2b_IV);
|
||||
const uint8_t * p = (const uint8_t *)(P);
|
||||
uint8_t * h = (uint8_t *)(S->h);
|
||||
/* IV XOR ParamBlock */
|
||||
memset(S, 0, sizeof(blake2b_state));
|
||||
|
||||
for (int i = 0; i < BLAKE2B_OUTBYTES; ++i) h[i] = v[i] ^ p[i];
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Some sort of default parameter block initialization, for sequential blake2b */
|
||||
int blake2b_init(blake2b_state *S, const uint8_t outlen)
|
||||
{
|
||||
if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1;
|
||||
|
||||
const blake2b_param P =
|
||||
{
|
||||
outlen,
|
||||
0,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
{ 0 },
|
||||
{ 0 },
|
||||
{ 0 }
|
||||
};
|
||||
return blake2b_init_param(S, &P);
|
||||
}
|
||||
|
||||
int blake2b_init_key(blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen)
|
||||
{
|
||||
if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1;
|
||||
|
||||
if ((!keylen) || keylen > BLAKE2B_KEYBYTES) return -1;
|
||||
|
||||
const blake2b_param P =
|
||||
{
|
||||
outlen,
|
||||
keylen,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
{ 0 },
|
||||
{ 0 },
|
||||
{ 0 }
|
||||
};
|
||||
|
||||
if (blake2b_init_param(S, &P) < 0)
|
||||
return 0;
|
||||
|
||||
{
|
||||
uint8_t block[BLAKE2B_BLOCKBYTES];
|
||||
memset(block, 0, BLAKE2B_BLOCKBYTES);
|
||||
memcpy(block, key, keylen);
|
||||
blake2b_update(S, block, BLAKE2B_BLOCKBYTES);
|
||||
secure_zero_memory(block, BLAKE2B_BLOCKBYTES); /* Burn the key from stack */
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int blake2b_compress(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES])
|
||||
{
|
||||
__m128i row1l, row1h;
|
||||
__m128i row2l, row2h;
|
||||
__m128i row3l, row3h;
|
||||
__m128i row4l, row4h;
|
||||
__m128i b0, b1;
|
||||
__m128i t0, t1;
|
||||
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
|
||||
const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
|
||||
const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
|
||||
#endif
|
||||
#if defined(HAVE_SSE41)
|
||||
const __m128i m0 = LOADU(block + 00);
|
||||
const __m128i m1 = LOADU(block + 16);
|
||||
const __m128i m2 = LOADU(block + 32);
|
||||
const __m128i m3 = LOADU(block + 48);
|
||||
const __m128i m4 = LOADU(block + 64);
|
||||
const __m128i m5 = LOADU(block + 80);
|
||||
const __m128i m6 = LOADU(block + 96);
|
||||
const __m128i m7 = LOADU(block + 112);
|
||||
#else
|
||||
const uint64_t m0 = ( ( uint64_t * )block )[ 0];
|
||||
const uint64_t m1 = ( ( uint64_t * )block )[ 1];
|
||||
const uint64_t m2 = ( ( uint64_t * )block )[ 2];
|
||||
const uint64_t m3 = ( ( uint64_t * )block )[ 3];
|
||||
const uint64_t m4 = ( ( uint64_t * )block )[ 4];
|
||||
const uint64_t m5 = ( ( uint64_t * )block )[ 5];
|
||||
const uint64_t m6 = ( ( uint64_t * )block )[ 6];
|
||||
const uint64_t m7 = ( ( uint64_t * )block )[ 7];
|
||||
const uint64_t m8 = ( ( uint64_t * )block )[ 8];
|
||||
const uint64_t m9 = ( ( uint64_t * )block )[ 9];
|
||||
const uint64_t m10 = ( ( uint64_t * )block )[10];
|
||||
const uint64_t m11 = ( ( uint64_t * )block )[11];
|
||||
const uint64_t m12 = ( ( uint64_t * )block )[12];
|
||||
const uint64_t m13 = ( ( uint64_t * )block )[13];
|
||||
const uint64_t m14 = ( ( uint64_t * )block )[14];
|
||||
const uint64_t m15 = ( ( uint64_t * )block )[15];
|
||||
#endif
|
||||
row1l = LOADU(&S->h[0]);
|
||||
row1h = LOADU(&S->h[2]);
|
||||
row2l = LOADU(&S->h[4]);
|
||||
row2h = LOADU(&S->h[6]);
|
||||
row3l = LOADU(&blake2b_IV[0]);
|
||||
row3h = LOADU(&blake2b_IV[2]);
|
||||
row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), _mm_set_epi32(0, 0, 0, S->counter));
|
||||
row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), _mm_set_epi32(0, 0, 0L - S->lastblock, 0L - S->lastblock));
|
||||
ROUND(0);
|
||||
ROUND(1);
|
||||
ROUND(2);
|
||||
ROUND(3);
|
||||
ROUND(4);
|
||||
ROUND(5);
|
||||
ROUND(6);
|
||||
ROUND(7);
|
||||
ROUND(8);
|
||||
ROUND(9);
|
||||
ROUND(10);
|
||||
ROUND(11);
|
||||
row1l = _mm_xor_si128(row3l, row1l);
|
||||
row1h = _mm_xor_si128(row3h, row1h);
|
||||
STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
|
||||
STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
|
||||
row2l = _mm_xor_si128(row4l, row2l);
|
||||
row2h = _mm_xor_si128(row4h, row2h);
|
||||
STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
|
||||
STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int blake2b_update(blake2b_state *S, const uint8_t *in, uint64_t inlen)
|
||||
{
|
||||
while (inlen > 0)
|
||||
{
|
||||
size_t left = S->buflen;
|
||||
size_t fill = BLAKE2B_BLOCKBYTES - left;
|
||||
|
||||
if (inlen > fill)
|
||||
{
|
||||
memcpy(S->buf + left, in, fill); // Fill buffer
|
||||
in += fill;
|
||||
inlen -= fill;
|
||||
S->counter += BLAKE2B_BLOCKBYTES;
|
||||
blake2b_compress(S, S->buf); // Compress
|
||||
S->buflen = 0;
|
||||
}
|
||||
else // inlen <= fill
|
||||
{
|
||||
memcpy(S->buf + left, in, inlen);
|
||||
S->buflen += inlen; // not enough to compress
|
||||
in += inlen;
|
||||
inlen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int blake2b_final(blake2b_state *S, uint8_t *out, uint8_t outlen)
|
||||
{
|
||||
if (outlen > BLAKE2B_OUTBYTES)
|
||||
return -1;
|
||||
|
||||
if (S->buflen > BLAKE2B_BLOCKBYTES)
|
||||
{
|
||||
S->counter += BLAKE2B_BLOCKBYTES;
|
||||
blake2b_compress(S, S->buf);
|
||||
S->buflen -= BLAKE2B_BLOCKBYTES;
|
||||
memcpy(S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen);
|
||||
}
|
||||
|
||||
S->counter += S->buflen;
|
||||
S->lastblock = 1;
|
||||
memset(S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
|
||||
blake2b_compress(S, S->buf);
|
||||
memcpy(out, &S->h[0], outlen);
|
||||
S->lastblock = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int blake2b(uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen)
|
||||
{
|
||||
blake2b_state S[1];
|
||||
|
||||
/* Verify parameters */
|
||||
if (NULL == in) return -1;
|
||||
|
||||
if (NULL == out) return -1;
|
||||
|
||||
if (NULL == key) keylen = 0;
|
||||
|
||||
if (keylen)
|
||||
{
|
||||
if (blake2b_init_key(S, outlen, key, keylen) < 0) return -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (blake2b_init(S, outlen) < 0) return -1;
|
||||
}
|
||||
|
||||
blake2b_update(S, (const uint8_t *)in, inlen);
|
||||
blake2b_final(S, out, outlen);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if defined(SUPERCOP)
|
||||
int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
|
||||
{
|
||||
return blake2b( out, in, NULL, BLAKE2B_OUTBYTES, inlen, 0 );
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(BLAKE2B_SELFTEST)
|
||||
#include <string.h>
|
||||
#include "blake2-kat.h"
|
||||
int main( int argc, char **argv )
|
||||
{
|
||||
uint8_t key[BLAKE2B_KEYBYTES];
|
||||
uint8_t buf[KAT_LENGTH];
|
||||
|
||||
for( size_t i = 0; i < BLAKE2B_KEYBYTES; ++i )
|
||||
key[i] = ( uint8_t )i;
|
||||
|
||||
for( size_t i = 0; i < KAT_LENGTH; ++i )
|
||||
buf[i] = ( uint8_t )i;
|
||||
|
||||
for( size_t i = 0; i < KAT_LENGTH; ++i )
|
||||
{
|
||||
uint8_t hash[BLAKE2B_OUTBYTES];
|
||||
blake2b( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES );
|
||||
|
||||
if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) )
|
||||
{
|
||||
puts( "error" );
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
puts( "ok" );
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
int blake2b_long(uint8_t *out, const void *in, const uint32_t outlen, const uint64_t inlen)
|
||||
{
|
||||
blake2b_state blake_state;
|
||||
if (outlen <= BLAKE2B_OUTBYTES)
|
||||
{
|
||||
blake2b_init(&blake_state, outlen);
|
||||
blake2b_update(&blake_state, (const uint8_t*)&outlen, sizeof(uint32_t));
|
||||
blake2b_update(&blake_state, (const uint8_t *)in, inlen);
|
||||
blake2b_final(&blake_state, out, outlen);
|
||||
}
|
||||
else
|
||||
{
|
||||
uint8_t out_buffer[BLAKE2B_OUTBYTES];
|
||||
uint8_t in_buffer[BLAKE2B_OUTBYTES];
|
||||
blake2b_init(&blake_state, BLAKE2B_OUTBYTES);
|
||||
blake2b_update(&blake_state, (const uint8_t*)&outlen, sizeof(uint32_t));
|
||||
blake2b_update(&blake_state, (const uint8_t *)in, inlen);
|
||||
blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES);
|
||||
memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
|
||||
out += BLAKE2B_OUTBYTES / 2;
|
||||
uint32_t toproduce = outlen - BLAKE2B_OUTBYTES / 2;
|
||||
while (toproduce > BLAKE2B_OUTBYTES)
|
||||
{
|
||||
memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
|
||||
blake2b(out_buffer, in_buffer, NULL, BLAKE2B_OUTBYTES, BLAKE2B_OUTBYTES, 0);
|
||||
memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
|
||||
out += BLAKE2B_OUTBYTES / 2;
|
||||
toproduce -= BLAKE2B_OUTBYTES / 2;
|
||||
}
|
||||
memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
|
||||
blake2b(out_buffer, in_buffer, NULL, toproduce, BLAKE2B_OUTBYTES, 0);
|
||||
memcpy(out, out_buffer, toproduce);
|
||||
|
||||
}
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
set(EXECUTABLE cpu_tromp)
|
||||
|
||||
#cpu_tromp/
|
||||
file(GLOB SRC_LIST
|
||||
cpu_tromp.cpp )
|
||||
file(GLOB HEADERS
|
||||
cpu_tromp.hpp
|
||||
equi.h
|
||||
equi_miner.h
|
||||
)
|
||||
|
||||
include_directories(${CMAKE_CURRENT_BINARY_DIR})
|
||||
include_directories(${CUDA_INCLUDE_DIRS})
|
||||
include_directories(..)
|
||||
ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
|
||||
TARGET_LINK_LIBRARIES(${EXECUTABLE} )
|
||||
|
||||
install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
|
||||
install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )
|
|
@ -2,6 +2,7 @@
|
|||
// Equihash solver
|
||||
// Copyright (c) 2016-2016 John Tromp
|
||||
|
||||
|
||||
#include "blake2/blake2.h"
|
||||
#ifdef __APPLE__
|
||||
#include "osx_barrier.h"
|
||||
|
@ -131,3 +132,4 @@ int verify(u32 indices[PROOFSIZE], const char *header, const u32 headerlen, cons
|
|||
uchar hash[WN/8];
|
||||
return verifyrec(&ctx, indices, hash, WK);
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
set(EXECUTABLE cpu_xenoncat)
|
||||
|
||||
#cpu_xenoncat/
|
||||
file(GLOB SRC_LIST
|
||||
xenoncat.cpp )
|
||||
file(GLOB HEADERS
|
||||
cpu_xenoncat.hpp
|
||||
)
|
||||
|
||||
include_directories(${CMAKE_CURRENT_BINARY_DIR})
|
||||
include_directories(${CUDA_INCLUDE_DIRS})
|
||||
include_directories(..)
|
||||
ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
|
||||
TARGET_LINK_LIBRARIES(${EXECUTABLE} )
|
||||
|
||||
install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
|
||||
install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )
|
Binary file not shown.
|
@ -1,2 +0,0 @@
|
|||
fasm zcblake2_avx1.asm
|
||||
fasm zcblake2_avx2.asm
|
|
@ -1,36 +0,0 @@
|
|||
xshufb_ror24 db 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10
|
||||
xshufb_ror16 db 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9
|
||||
xshufb_bswap8 db 7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8
|
||||
xctrinc dd 0,2, 0,2
|
||||
|
||||
align 32
|
||||
iv dq 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
|
||||
dq 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
|
||||
dq 0x510e527fade682d1, 0x9b05688c2b3e6c1f
|
||||
dq 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
|
||||
|
||||
s0 dq 0x6a09e667f3bcc908 xor 0x1010032, 0xbb67ae8584caa73b ;0x32=50 bytes output
|
||||
s2 dq 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
|
||||
s4 dq 0x510e527fade682d1, 0x9b05688c2b3e6c1f
|
||||
s6 dq 0x1f83d9abfb41bd6b xor 0x576f50687361635a ;Personalization
|
||||
s7 dq 0x5be0cd19137e2179 xor 0x00000009000000c8 ;n=200, k=9
|
||||
|
||||
iv4xor128 dq 0x510e527fade682d1 xor 0x80, 0x9b05688c2b3e6c1f
|
||||
dq 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
|
||||
iv4xor144 dq 0x510e527fade682d1 xor 144, 0x9b05688c2b3e6c1f
|
||||
iv6inverted dq 0xe07c265404be4294, 0x5be0cd19137e2179
|
||||
|
||||
align 32
|
||||
yctrinit dd 0,0, 0,1, 0,2, 0,3
|
||||
yctrinc dd 0,4, 0,4, 0,4, 0,4
|
||||
|
||||
blake2sigma db 0,2,4,6,1,3,5,7,8,10,12,14,9,11,13,15
|
||||
db 14,4,9,13,10,8,15,6,1,0,11,5,12,2,7,3
|
||||
db 11,12,5,15,8,0,2,13,10,3,7,9,14,6,1,4
|
||||
db 7,3,13,11,9,1,12,14,2,5,4,15,6,10,0,8
|
||||
db 9,5,2,10,0,7,4,15,14,11,6,3,1,12,8,13
|
||||
db 2,6,0,8,12,10,11,3,4,7,15,1,13,5,14,9
|
||||
db 12,1,14,4,5,15,13,10,0,6,9,8,7,3,2,11
|
||||
db 13,7,12,3,11,14,1,9,5,15,8,2,0,4,6,10
|
||||
db 6,14,11,0,15,9,3,8,12,13,1,10,2,7,4,5
|
||||
db 10,8,7,1,2,4,6,5,15,9,3,13,11,14,12,0
|
|
@ -1,349 +0,0 @@
|
|||
macro hR0 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
|
||||
{
|
||||
vpaddq xmm0,xmm0,xmm4
|
||||
vpaddq xmm1,xmm1,xmm5
|
||||
vpaddq xmm2,xmm2,xmm6
|
||||
vpaddq xmm3,xmm3,xmm7
|
||||
if m0<lim
|
||||
vpaddq xmm0,xmm0, xword [src+m0*16]
|
||||
end if
|
||||
if m1<lim
|
||||
vpaddq xmm1,xmm1, xword [src+m1*16]
|
||||
end if
|
||||
if m2<lim
|
||||
vpaddq xmm2,xmm2, xword [src+m2*16]
|
||||
end if
|
||||
if m3<lim
|
||||
vpaddq xmm3,xmm3, xword [src+m3*16]
|
||||
end if
|
||||
vpxor xmm12,xmm12,xmm0
|
||||
vpxor xmm13,xmm13,xmm1
|
||||
vpxor xmm14,xmm14,xmm2
|
||||
vpxor xmm15,xmm15,xmm3
|
||||
vpshufd xmm12,xmm12,0xB1
|
||||
vpshufd xmm13,xmm13,0xB1
|
||||
vpshufd xmm14,xmm14,0xB1
|
||||
vpshufd xmm15,xmm15,0xB1
|
||||
vpaddq xmm8,xmm8,xmm12
|
||||
vpaddq xmm9,xmm9,xmm13
|
||||
vpaddq xmm10,xmm10,xmm14
|
||||
vpaddq xmm11,xmm11,xmm15
|
||||
vpxor xmm4,xmm4,xmm8
|
||||
vpxor xmm5,xmm5,xmm9
|
||||
vpxor xmm6,xmm6,xmm10
|
||||
vpxor xmm7,xmm7,xmm11
|
||||
vmovdqa [rsp], xmm8
|
||||
vmovdqa xmm8, xword [xshufb_ror24]
|
||||
vpshufb xmm4,xmm4,xmm8
|
||||
vpshufb xmm5,xmm5,xmm8
|
||||
vpshufb xmm6,xmm6,xmm8
|
||||
vpshufb xmm7,xmm7,xmm8
|
||||
vmovdqa xmm8, [rsp]
|
||||
|
||||
vpaddq xmm0,xmm0,xmm4
|
||||
vpaddq xmm1,xmm1,xmm5
|
||||
vpaddq xmm2,xmm2,xmm6
|
||||
vpaddq xmm3,xmm3,xmm7
|
||||
if m4<lim
|
||||
vpaddq xmm0,xmm0, xword [src+m4*16]
|
||||
end if
|
||||
if m5<lim
|
||||
vpaddq xmm1,xmm1, xword [src+m5*16]
|
||||
end if
|
||||
if m6<lim
|
||||
vpaddq xmm2,xmm2, xword [src+m6*16]
|
||||
end if
|
||||
if m7<lim
|
||||
vpaddq xmm3,xmm3, xword [src+m7*16]
|
||||
end if
|
||||
vpxor xmm12,xmm12,xmm0
|
||||
vpxor xmm13,xmm13,xmm1
|
||||
vpxor xmm14,xmm14,xmm2
|
||||
vpxor xmm15,xmm15,xmm3
|
||||
vmovdqa [rsp], xmm0
|
||||
vmovdqa xmm0, xword [xshufb_ror16]
|
||||
vpshufb xmm12,xmm12,xmm0
|
||||
vpshufb xmm13,xmm13,xmm0
|
||||
vpshufb xmm14,xmm14,xmm0
|
||||
vpshufb xmm15,xmm15,xmm0
|
||||
vpaddq xmm8,xmm8,xmm12
|
||||
vpaddq xmm9,xmm9,xmm13
|
||||
vpaddq xmm10,xmm10,xmm14
|
||||
vpaddq xmm11,xmm11,xmm15
|
||||
vpxor xmm4,xmm4,xmm8
|
||||
vpxor xmm5,xmm5,xmm9
|
||||
vpxor xmm6,xmm6,xmm10
|
||||
vpxor xmm7,xmm7,xmm11
|
||||
|
||||
vpaddq xmm0,xmm4,xmm4
|
||||
vpsrlq xmm4,xmm4,63
|
||||
vpor xmm4,xmm4,xmm0
|
||||
vpaddq xmm0,xmm5,xmm5
|
||||
vpsrlq xmm5,xmm5,63
|
||||
vpor xmm5,xmm5,xmm0
|
||||
vpaddq xmm0,xmm6,xmm6
|
||||
vpsrlq xmm6,xmm6,63
|
||||
vpor xmm6,xmm6,xmm0
|
||||
vpaddq xmm0,xmm7,xmm7
|
||||
vpsrlq xmm7,xmm7,63
|
||||
vpor xmm7,xmm7,xmm0
|
||||
|
||||
vmovdqa xmm0, [rsp]
|
||||
}
|
||||
|
||||
macro hR1 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
|
||||
{
|
||||
vpaddq xmm0,xmm0,xmm5
|
||||
vpaddq xmm1,xmm1,xmm6
|
||||
vpaddq xmm2,xmm2,xmm7
|
||||
vpaddq xmm3,xmm3,xmm4
|
||||
if m0<lim
|
||||
vpaddq xmm0,xmm0, xword [src+m0*16]
|
||||
end if
|
||||
if m1<lim
|
||||
vpaddq xmm1,xmm1, xword [src+m1*16]
|
||||
end if
|
||||
if m2<lim
|
||||
vpaddq xmm2,xmm2, xword [src+m2*16]
|
||||
end if
|
||||
if m3<lim
|
||||
vpaddq xmm3,xmm3, xword [src+m3*16]
|
||||
end if
|
||||
vpxor xmm15,xmm15,xmm0
|
||||
vpxor xmm12,xmm12,xmm1
|
||||
vpxor xmm13,xmm13,xmm2
|
||||
vpxor xmm14,xmm14,xmm3
|
||||
vpshufd xmm15,xmm15,0xB1
|
||||
vpshufd xmm12,xmm12,0xB1
|
||||
vpshufd xmm13,xmm13,0xB1
|
||||
vpshufd xmm14,xmm14,0xB1
|
||||
vpaddq xmm10,xmm10,xmm15
|
||||
vpaddq xmm11,xmm11,xmm12
|
||||
vpaddq xmm8,xmm8,xmm13
|
||||
vpaddq xmm9,xmm9,xmm14
|
||||
vpxor xmm5,xmm5,xmm10
|
||||
vpxor xmm6,xmm6,xmm11
|
||||
vpxor xmm7,xmm7,xmm8
|
||||
vpxor xmm4,xmm4,xmm9
|
||||
vmovdqa [rsp], xmm10
|
||||
vmovdqa xmm10, xword [xshufb_ror24]
|
||||
vpshufb xmm5,xmm5,xmm10
|
||||
vpshufb xmm6,xmm6,xmm10
|
||||
vpshufb xmm7,xmm7,xmm10
|
||||
vpshufb xmm4,xmm4,xmm10
|
||||
vmovdqa xmm10, [rsp]
|
||||
|
||||
vpaddq xmm0,xmm0,xmm5
|
||||
vpaddq xmm1,xmm1,xmm6
|
||||
vpaddq xmm2,xmm2,xmm7
|
||||
vpaddq xmm3,xmm3,xmm4
|
||||
if m4<lim
|
||||
vpaddq xmm0,xmm0, xword [src+m4*16]
|
||||
end if
|
||||
if m5<lim
|
||||
vpaddq xmm1,xmm1, xword [src+m5*16]
|
||||
end if
|
||||
if m6<lim
|
||||
vpaddq xmm2,xmm2, xword [src+m6*16]
|
||||
end if
|
||||
if m7<lim
|
||||
vpaddq xmm3,xmm3, xword [src+m7*16]
|
||||
end if
|
||||
vpxor xmm15,xmm15,xmm0
|
||||
vpxor xmm12,xmm12,xmm1
|
||||
vpxor xmm13,xmm13,xmm2
|
||||
vpxor xmm14,xmm14,xmm3
|
||||
vmovdqa [rsp], xmm0
|
||||
vmovdqa xmm0, xword [xshufb_ror16]
|
||||
vpshufb xmm15,xmm15,xmm0
|
||||
vpshufb xmm12,xmm12,xmm0
|
||||
vpshufb xmm13,xmm13,xmm0
|
||||
vpshufb xmm14,xmm14,xmm0
|
||||
vpaddq xmm10,xmm10,xmm15
|
||||
vpaddq xmm11,xmm11,xmm12
|
||||
vpaddq xmm8,xmm8,xmm13
|
||||
vpaddq xmm9,xmm9,xmm14
|
||||
vpxor xmm5,xmm5,xmm10
|
||||
vpxor xmm6,xmm6,xmm11
|
||||
vpxor xmm7,xmm7,xmm8
|
||||
vpxor xmm4,xmm4,xmm9
|
||||
|
||||
vpaddq xmm0,xmm5,xmm5
|
||||
vpsrlq xmm5,xmm5,63
|
||||
vpor xmm5,xmm5,xmm0
|
||||
vpaddq xmm0,xmm6,xmm6
|
||||
vpsrlq xmm6,xmm6,63
|
||||
vpor xmm6,xmm6,xmm0
|
||||
vpaddq xmm0,xmm7,xmm7
|
||||
vpsrlq xmm7,xmm7,63
|
||||
vpor xmm7,xmm7,xmm0
|
||||
vpaddq xmm0,xmm4,xmm4
|
||||
vpsrlq xmm4,xmm4,63
|
||||
vpor xmm4,xmm4,xmm0
|
||||
|
||||
vmovdqa xmm0, [rsp]
|
||||
}
|
||||
|
||||
macro Blake2bRounds2 lim,src
|
||||
{
|
||||
;ROUND 0
|
||||
;hR0 0,2,4,6,1,3,5,7,lim,src
|
||||
;hR1 8,10,12,14,9,11,13,15,lim,src
|
||||
|
||||
;ROUND 1
|
||||
hR0 14,4,9,13,10,8,15,6,lim,src
|
||||
hR1 1,0,11,5,12,2,7,3,lim,src
|
||||
|
||||
;ROUND 2
|
||||
hR0 11,12,5,15,8,0,2,13,lim,src
|
||||
hR1 10,3,7,9,14,6,1,4,lim,src
|
||||
|
||||
;ROUND 3
|
||||
hR0 7,3,13,11,9,1,12,14,lim,src
|
||||
hR1 2,5,4,15,6,10,0,8,lim,src
|
||||
|
||||
;ROUND 4
|
||||
hR0 9,5,2,10,0,7,4,15,lim,src
|
||||
hR1 14,11,6,3,1,12,8,13,lim,src
|
||||
|
||||
;ROUND 5
|
||||
hR0 2,6,0,8,12,10,11,3,lim,src
|
||||
hR1 4,7,15,1,13,5,14,9,lim,src
|
||||
|
||||
;ROUND 6
|
||||
hR0 12,1,14,4,5,15,13,10,lim,src
|
||||
hR1 0,6,9,8,7,3,2,11,lim,src
|
||||
|
||||
;ROUND 7
|
||||
hR0 13,7,12,3,11,14,1,9,lim,src
|
||||
hR1 5,15,8,2,0,4,6,10,lim,src
|
||||
|
||||
;ROUND 8
|
||||
hR0 6,14,11,0,15,9,3,8,lim,src
|
||||
hR1 12,13,1,10,2,7,4,5,lim,src
|
||||
|
||||
;ROUND 9
|
||||
hR0 10,8,7,1,2,4,6,5,lim,src
|
||||
hR1 15,9,3,13,11,14,12,0,lim,src
|
||||
|
||||
;ROUND 10
|
||||
hR0 0,2,4,6,1,3,5,7,lim,src
|
||||
hR1 8,10,12,14,9,11,13,15,lim,src
|
||||
|
||||
;ROUND 11
|
||||
hR0 14,4,9,13,10,8,15,6,lim,src
|
||||
hR1 1,0,11,5,12,2,7,3,lim,src
|
||||
}
|
||||
|
||||
macro Blake2beq2of2 mids, src
|
||||
{
|
||||
vmovddup xmm0, qword [mids]
|
||||
vpaddq xmm0,xmm0, xword [src+1*16]
|
||||
vmovddup xmm12, qword [mids+0x08]
|
||||
vpxor xmm12,xmm12,xmm0
|
||||
vpshufb xmm12,xmm12, xword [xshufb_ror16]
|
||||
vmovddup xmm8, qword [mids+0x10]
|
||||
vpaddq xmm8,xmm8,xmm12
|
||||
vmovddup xmm4, qword [mids+0x18]
|
||||
vpxor xmm4,xmm4,xmm8
|
||||
vpaddq xmm2,xmm4,xmm4 ;xmm2 is temp
|
||||
vpsrlq xmm4,xmm4,63
|
||||
vpor xmm4,xmm4,xmm2
|
||||
|
||||
vmovddup xmm5, qword [mids+0x20]
|
||||
vpaddq xmm0,xmm0,xmm5
|
||||
vmovddup xmm1, qword [mids+0x30]
|
||||
vpxor xmm12,xmm12,xmm1
|
||||
vpshufd xmm12,xmm12,0xB1
|
||||
vmovddup xmm13, qword [mids+0x38]
|
||||
vpaddq xmm8,xmm8,xmm13
|
||||
vmovddup xmm3, qword [mids+0x60]
|
||||
vpaddq xmm3,xmm3,xmm4
|
||||
vmovddup xmm15, qword [mids+0x48]
|
||||
vpxor xmm15,xmm15,xmm0
|
||||
vpshufd xmm15,xmm15,0xB1
|
||||
vmovddup xmm11, qword [mids+0x58]
|
||||
vpaddq xmm11,xmm11,xmm12
|
||||
vmovddup xmm7, qword [mids+0x68]
|
||||
vpxor xmm7,xmm7,xmm8
|
||||
vmovddup xmm14, qword [mids+0x40]
|
||||
vpxor xmm14,xmm14,xmm3
|
||||
vpshufd xmm14,xmm14,0xB1
|
||||
vmovddup xmm10, qword [mids+0x50]
|
||||
vpaddq xmm10,xmm10,xmm15
|
||||
vmovddup xmm6, qword [mids+0x28]
|
||||
vpxor xmm6,xmm6,xmm11
|
||||
vmovddup xmm9, qword [mids+0x70]
|
||||
vpaddq xmm9,xmm9,xmm14
|
||||
vpxor xmm5,xmm5,xmm10
|
||||
vpxor xmm4,xmm4,xmm9
|
||||
vmovdqa xmm2, xword [xshufb_ror24] ;xmm2 is temp
|
||||
vpshufb xmm5,xmm5,xmm2
|
||||
vpshufb xmm6,xmm6,xmm2
|
||||
vpshufb xmm7,xmm7,xmm2
|
||||
vpshufb xmm4,xmm4,xmm2
|
||||
vmovddup xmm2, qword [mids+0x78]
|
||||
|
||||
vpaddq xmm0,xmm0,xmm5
|
||||
vpaddq xmm1,xmm1,xmm6
|
||||
vpaddq xmm2,xmm2,xmm7
|
||||
vpaddq xmm3,xmm3,xmm4
|
||||
vpxor xmm15,xmm15,xmm0
|
||||
vpxor xmm12,xmm12,xmm1
|
||||
vpxor xmm13,xmm13,xmm2
|
||||
vpxor xmm14,xmm14,xmm3
|
||||
vmovdqa [rsp], xmm0
|
||||
vmovdqa xmm0, xword [xshufb_ror16]
|
||||
vpshufb xmm15,xmm15,xmm0
|
||||
vpshufb xmm12,xmm12,xmm0
|
||||
vpshufb xmm13,xmm13,xmm0
|
||||
vpshufb xmm14,xmm14,xmm0
|
||||
vpaddq xmm10,xmm10,xmm15
|
||||
vpaddq xmm11,xmm11,xmm12
|
||||
vpaddq xmm8,xmm8,xmm13
|
||||
vpaddq xmm9,xmm9,xmm14
|
||||
vpxor xmm5,xmm5,xmm10
|
||||
vpxor xmm6,xmm6,xmm11
|
||||
vpxor xmm7,xmm7,xmm8
|
||||
vpxor xmm4,xmm4,xmm9
|
||||
vpaddq xmm0,xmm5,xmm5
|
||||
vpsrlq xmm5,xmm5,63
|
||||
vpor xmm5,xmm5,xmm0
|
||||
vpaddq xmm0,xmm6,xmm6
|
||||
vpsrlq xmm6,xmm6,63
|
||||
vpor xmm6,xmm6,xmm0
|
||||
vpaddq xmm0,xmm7,xmm7
|
||||
vpsrlq xmm7,xmm7,63
|
||||
vpor xmm7,xmm7,xmm0
|
||||
vpaddq xmm0,xmm4,xmm4
|
||||
vpsrlq xmm4,xmm4,63
|
||||
vpor xmm4,xmm4,xmm0
|
||||
vmovdqa xmm0, [rsp]
|
||||
|
||||
Blake2bRounds2 2,src
|
||||
|
||||
vpxor xmm0, xmm0, xmm8
|
||||
vpxor xmm1, xmm1, xmm9
|
||||
vpxor xmm2, xmm2, xmm10
|
||||
vpxor xmm3, xmm3, xmm11
|
||||
vpxor xmm4, xmm4, xmm12
|
||||
vpxor xmm5, xmm5, xmm13
|
||||
vpxor xmm6, xmm6, xmm14
|
||||
;vpxor xmm7, xmm7, xmm15
|
||||
vmovddup xmm8, qword [mids+0x80]
|
||||
vmovddup xmm9, qword [mids+0x88]
|
||||
vmovddup xmm10, qword [mids+0x90]
|
||||
vmovddup xmm11, qword [mids+0x98]
|
||||
vmovddup xmm12, qword [mids+0xa0]
|
||||
vmovddup xmm13, qword [mids+0xa8]
|
||||
vmovddup xmm14, qword [mids+0xb0]
|
||||
;vmovddup xmm15, qword [mids+0xb8]
|
||||
vpxor xmm0, xmm0, xmm8
|
||||
vpxor xmm1, xmm1, xmm9
|
||||
vpxor xmm2, xmm2, xmm10
|
||||
vpxor xmm3, xmm3, xmm11
|
||||
vpxor xmm4, xmm4, xmm12
|
||||
vpxor xmm5, xmm5, xmm13
|
||||
vpxor xmm6, xmm6, xmm14
|
||||
;vpxor xmm7, xmm7, xmm15
|
||||
}
|
|
@ -1,350 +0,0 @@
|
|||
macro hR0 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
|
||||
{
|
||||
vpaddq ymm0,ymm0,ymm4
|
||||
vpaddq ymm1,ymm1,ymm5
|
||||
vpaddq ymm2,ymm2,ymm6
|
||||
vpaddq ymm3,ymm3,ymm7
|
||||
if m0<lim
|
||||
vpaddq ymm0,ymm0, yword [src+m0*32]
|
||||
end if
|
||||
if m1<lim
|
||||
vpaddq ymm1,ymm1, yword [src+m1*32]
|
||||
end if
|
||||
if m2<lim
|
||||
vpaddq ymm2,ymm2, yword [src+m2*32]
|
||||
end if
|
||||
if m3<lim
|
||||
vpaddq ymm3,ymm3, yword [src+m3*32]
|
||||
end if
|
||||
vpxor ymm12,ymm12,ymm0
|
||||
vpxor ymm13,ymm13,ymm1
|
||||
vpxor ymm14,ymm14,ymm2
|
||||
vpxor ymm15,ymm15,ymm3
|
||||
vpshufd ymm12,ymm12,0xB1
|
||||
vpshufd ymm13,ymm13,0xB1
|
||||
vpshufd ymm14,ymm14,0xB1
|
||||
vpshufd ymm15,ymm15,0xB1
|
||||
vpaddq ymm8,ymm8,ymm12
|
||||
vpaddq ymm9,ymm9,ymm13
|
||||
vpaddq ymm10,ymm10,ymm14
|
||||
vpaddq ymm11,ymm11,ymm15
|
||||
vpxor ymm4,ymm4,ymm8
|
||||
vpxor ymm5,ymm5,ymm9
|
||||
vpxor ymm6,ymm6,ymm10
|
||||
vpxor ymm7,ymm7,ymm11
|
||||
vmovdqa [rsp], ymm8
|
||||
vbroadcasti128 ymm8, xword [xshufb_ror24]
|
||||
vpshufb ymm4,ymm4,ymm8
|
||||
vpshufb ymm5,ymm5,ymm8
|
||||
vpshufb ymm6,ymm6,ymm8
|
||||
vpshufb ymm7,ymm7,ymm8
|
||||
vmovdqa ymm8, [rsp]
|
||||
|
||||
vpaddq ymm0,ymm0,ymm4
|
||||
vpaddq ymm1,ymm1,ymm5
|
||||
vpaddq ymm2,ymm2,ymm6
|
||||
vpaddq ymm3,ymm3,ymm7
|
||||
if m4<lim
|
||||
vpaddq ymm0,ymm0, yword [src+m4*32]
|
||||
end if
|
||||
if m5<lim
|
||||
vpaddq ymm1,ymm1, yword [src+m5*32]
|
||||
end if
|
||||
if m6<lim
|
||||
vpaddq ymm2,ymm2, yword [src+m6*32]
|
||||
end if
|
||||
if m7<lim
|
||||
vpaddq ymm3,ymm3, yword [src+m7*32]
|
||||
end if
|
||||
vpxor ymm12,ymm12,ymm0
|
||||
vpxor ymm13,ymm13,ymm1
|
||||
vpxor ymm14,ymm14,ymm2
|
||||
vpxor ymm15,ymm15,ymm3
|
||||
vmovdqa [rsp], ymm0
|
||||
vbroadcasti128 ymm0, xword [xshufb_ror16]
|
||||
vpshufb ymm12,ymm12,ymm0
|
||||
vpshufb ymm13,ymm13,ymm0
|
||||
vpshufb ymm14,ymm14,ymm0
|
||||
vpshufb ymm15,ymm15,ymm0
|
||||
vpaddq ymm8,ymm8,ymm12
|
||||
vpaddq ymm9,ymm9,ymm13
|
||||
vpaddq ymm10,ymm10,ymm14
|
||||
vpaddq ymm11,ymm11,ymm15
|
||||
vpxor ymm4,ymm4,ymm8
|
||||
vpxor ymm5,ymm5,ymm9
|
||||
vpxor ymm6,ymm6,ymm10
|
||||
vpxor ymm7,ymm7,ymm11
|
||||
|
||||
vpaddq ymm0,ymm4,ymm4
|
||||
vpsrlq ymm4,ymm4,63
|
||||
vpor ymm4,ymm4,ymm0
|
||||
vpaddq ymm0,ymm5,ymm5
|
||||
vpsrlq ymm5,ymm5,63
|
||||
vpor ymm5,ymm5,ymm0
|
||||
vpaddq ymm0,ymm6,ymm6
|
||||
vpsrlq ymm6,ymm6,63
|
||||
vpor ymm6,ymm6,ymm0
|
||||
vpaddq ymm0,ymm7,ymm7
|
||||
vpsrlq ymm7,ymm7,63
|
||||
vpor ymm7,ymm7,ymm0
|
||||
|
||||
vmovdqa ymm0, [rsp]
|
||||
}
|
||||
|
||||
macro hR1 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
|
||||
{
|
||||
vpaddq ymm0,ymm0,ymm5
|
||||
vpaddq ymm1,ymm1,ymm6
|
||||
vpaddq ymm2,ymm2,ymm7
|
||||
vpaddq ymm3,ymm3,ymm4
|
||||
if m0<lim
|
||||
vpaddq ymm0,ymm0, yword [src+m0*32]
|
||||
end if
|
||||
if m1<lim
|
||||
vpaddq ymm1,ymm1, yword [src+m1*32]
|
||||
end if
|
||||
if m2<lim
|
||||
vpaddq ymm2,ymm2, yword [src+m2*32]
|
||||
end if
|
||||
if m3<lim
|
||||
vpaddq ymm3,ymm3, yword [src+m3*32]
|
||||
end if
|
||||
vpxor ymm15,ymm15,ymm0
|
||||
vpxor ymm12,ymm12,ymm1
|
||||
vpxor ymm13,ymm13,ymm2
|
||||
vpxor ymm14,ymm14,ymm3
|
||||
vpshufd ymm15,ymm15,0xB1
|
||||
vpshufd ymm12,ymm12,0xB1
|
||||
vpshufd ymm13,ymm13,0xB1
|
||||
vpshufd ymm14,ymm14,0xB1
|
||||
vpaddq ymm10,ymm10,ymm15
|
||||
vpaddq ymm11,ymm11,ymm12
|
||||
vpaddq ymm8,ymm8,ymm13
|
||||
vpaddq ymm9,ymm9,ymm14
|
||||
vpxor ymm5,ymm5,ymm10
|
||||
vpxor ymm6,ymm6,ymm11
|
||||
vpxor ymm7,ymm7,ymm8
|
||||
vpxor ymm4,ymm4,ymm9
|
||||
vmovdqa [rsp], ymm10
|
||||
vbroadcasti128 ymm10, xword [xshufb_ror24]
|
||||
vpshufb ymm5,ymm5,ymm10
|
||||
vpshufb ymm6,ymm6,ymm10
|
||||
vpshufb ymm7,ymm7,ymm10
|
||||
vpshufb ymm4,ymm4,ymm10
|
||||
vmovdqa ymm10, [rsp]
|
||||
|
||||
vpaddq ymm0,ymm0,ymm5
|
||||
vpaddq ymm1,ymm1,ymm6
|
||||
vpaddq ymm2,ymm2,ymm7
|
||||
vpaddq ymm3,ymm3,ymm4
|
||||
if m4<lim
|
||||
vpaddq ymm0,ymm0, yword [src+m4*32]
|
||||
end if
|
||||
if m5<lim
|
||||
vpaddq ymm1,ymm1, yword [src+m5*32]
|
||||
end if
|
||||
if m6<lim
|
||||
vpaddq ymm2,ymm2, yword [src+m6*32]
|
||||
end if
|
||||
if m7<lim
|
||||
vpaddq ymm3,ymm3, yword [src+m7*32]
|
||||
end if
|
||||
vpxor ymm15,ymm15,ymm0
|
||||
vpxor ymm12,ymm12,ymm1
|
||||
vpxor ymm13,ymm13,ymm2
|
||||
vpxor ymm14,ymm14,ymm3
|
||||
vmovdqa [rsp], ymm0
|
||||
vbroadcasti128 ymm0, xword [xshufb_ror16]
|
||||
vpshufb ymm15,ymm15,ymm0
|
||||
vpshufb ymm12,ymm12,ymm0
|
||||
vpshufb ymm13,ymm13,ymm0
|
||||
vpshufb ymm14,ymm14,ymm0
|
||||
vpaddq ymm10,ymm10,ymm15
|
||||
vpaddq ymm11,ymm11,ymm12
|
||||
vpaddq ymm8,ymm8,ymm13
|
||||
vpaddq ymm9,ymm9,ymm14
|
||||
vpxor ymm5,ymm5,ymm10
|
||||
vpxor ymm6,ymm6,ymm11
|
||||
vpxor ymm7,ymm7,ymm8
|
||||
vpxor ymm4,ymm4,ymm9
|
||||
|
||||
vpaddq ymm0,ymm5,ymm5
|
||||
vpsrlq ymm5,ymm5,63
|
||||
vpor ymm5,ymm5,ymm0
|
||||
vpaddq ymm0,ymm6,ymm6
|
||||
vpsrlq ymm6,ymm6,63
|
||||
vpor ymm6,ymm6,ymm0
|
||||
vpaddq ymm0,ymm7,ymm7
|
||||
vpsrlq ymm7,ymm7,63
|
||||
vpor ymm7,ymm7,ymm0
|
||||
vpaddq ymm0,ymm4,ymm4
|
||||
vpsrlq ymm4,ymm4,63
|
||||
vpor ymm4,ymm4,ymm0
|
||||
|
||||
vmovdqa ymm0, [rsp]
|
||||
}
|
||||
|
||||
macro Blake2bRounds2 lim,src
|
||||
{
|
||||
;ROUND 0
|
||||
;hR0 0,2,4,6,1,3,5,7,lim,src
|
||||
;hR1 8,10,12,14,9,11,13,15,lim,src
|
||||
|
||||
;ROUND 1
|
||||
hR0 14,4,9,13,10,8,15,6,lim,src
|
||||
hR1 1,0,11,5,12,2,7,3,lim,src
|
||||
|
||||
;ROUND 2
|
||||
hR0 11,12,5,15,8,0,2,13,lim,src
|
||||
hR1 10,3,7,9,14,6,1,4,lim,src
|
||||
|
||||
;ROUND 3
|
||||
hR0 7,3,13,11,9,1,12,14,lim,src
|
||||
hR1 2,5,4,15,6,10,0,8,lim,src
|
||||
|
||||
;ROUND 4
|
||||
hR0 9,5,2,10,0,7,4,15,lim,src
|
||||
hR1 14,11,6,3,1,12,8,13,lim,src
|
||||
|
||||
;ROUND 5
|
||||
hR0 2,6,0,8,12,10,11,3,lim,src
|
||||
hR1 4,7,15,1,13,5,14,9,lim,src
|
||||
|
||||
;ROUND 6
|
||||
hR0 12,1,14,4,5,15,13,10,lim,src
|
||||
hR1 0,6,9,8,7,3,2,11,lim,src
|
||||
|
||||
;ROUND 7
|
||||
hR0 13,7,12,3,11,14,1,9,lim,src
|
||||
hR1 5,15,8,2,0,4,6,10,lim,src
|
||||
|
||||
;ROUND 8
|
||||
hR0 6,14,11,0,15,9,3,8,lim,src
|
||||
hR1 12,13,1,10,2,7,4,5,lim,src
|
||||
|
||||
;ROUND 9
|
||||
hR0 10,8,7,1,2,4,6,5,lim,src
|
||||
hR1 15,9,3,13,11,14,12,0,lim,src
|
||||
|
||||
;ROUND 10
|
||||
hR0 0,2,4,6,1,3,5,7,lim,src
|
||||
hR1 8,10,12,14,9,11,13,15,lim,src
|
||||
|
||||
;ROUND 11
|
||||
hR0 14,4,9,13,10,8,15,6,lim,src
|
||||
hR1 1,0,11,5,12,2,7,3,lim,src
|
||||
}
|
||||
|
||||
macro Blake2beq2of2 mids, src
|
||||
{
|
||||
vpbroadcastq ymm0, qword [mids]
|
||||
vpaddq ymm0,ymm0, yword [src+1*32]
|
||||
vpbroadcastq ymm12, qword [mids+0x08]
|
||||
vpxor ymm12,ymm12,ymm0
|
||||
vbroadcasti128 ymm2, xword [xshufb_ror16] ;ymm2 is temp
|
||||
vpshufb ymm12,ymm12,ymm2
|
||||
vpbroadcastq ymm8, qword [mids+0x10]
|
||||
vpaddq ymm8,ymm8,ymm12
|
||||
vpbroadcastq ymm4, qword [mids+0x18]
|
||||
vpxor ymm4,ymm4,ymm8
|
||||
vpaddq ymm2,ymm4,ymm4 ;ymm2 is temp
|
||||
vpsrlq ymm4,ymm4,63
|
||||
vpor ymm4,ymm4,ymm2
|
||||
|
||||
vpbroadcastq ymm5, qword [mids+0x20]
|
||||
vpaddq ymm0,ymm0,ymm5
|
||||
vpbroadcastq ymm1, qword [mids+0x30]
|
||||
vpxor ymm12,ymm12,ymm1
|
||||
vpshufd ymm12,ymm12,0xB1
|
||||
vpbroadcastq ymm13, qword [mids+0x38]
|
||||
vpaddq ymm8,ymm8,ymm13
|
||||
vpbroadcastq ymm3, qword [mids+0x60]
|
||||
vpaddq ymm3,ymm3,ymm4
|
||||
vpbroadcastq ymm15, qword [mids+0x48]
|
||||
vpxor ymm15,ymm15,ymm0
|
||||
vpshufd ymm15,ymm15,0xB1
|
||||
vpbroadcastq ymm11, qword [mids+0x58]
|
||||
vpaddq ymm11,ymm11,ymm12
|
||||
vpbroadcastq ymm7, qword [mids+0x68]
|
||||
vpxor ymm7,ymm7,ymm8
|
||||
vpbroadcastq ymm14, qword [mids+0x40]
|
||||
vpxor ymm14,ymm14,ymm3
|
||||
vpshufd ymm14,ymm14,0xB1
|
||||
vpbroadcastq ymm10, qword [mids+0x50]
|
||||
vpaddq ymm10,ymm10,ymm15
|
||||
vpbroadcastq ymm6, qword [mids+0x28]
|
||||
vpxor ymm6,ymm6,ymm11
|
||||
vpbroadcastq ymm9, qword [mids+0x70]
|
||||
vpaddq ymm9,ymm9,ymm14
|
||||
vpxor ymm5,ymm5,ymm10
|
||||
vpxor ymm4,ymm4,ymm9
|
||||
vbroadcasti128 ymm2, xword [xshufb_ror24] ;ymm2 is temp
|
||||
vpshufb ymm5,ymm5,ymm2
|
||||
vpshufb ymm6,ymm6,ymm2
|
||||
vpshufb ymm7,ymm7,ymm2
|
||||
vpshufb ymm4,ymm4,ymm2
|
||||
vpbroadcastq ymm2, qword [mids+0x78]
|
||||
|
||||
vpaddq ymm0,ymm0,ymm5
|
||||
vpaddq ymm1,ymm1,ymm6
|
||||
vpaddq ymm2,ymm2,ymm7
|
||||
vpaddq ymm3,ymm3,ymm4
|
||||
vpxor ymm15,ymm15,ymm0
|
||||
vpxor ymm12,ymm12,ymm1
|
||||
vpxor ymm13,ymm13,ymm2
|
||||
vpxor ymm14,ymm14,ymm3
|
||||
vmovdqa [rsp], ymm0
|
||||
vbroadcasti128 ymm0, xword [xshufb_ror16]
|
||||
vpshufb ymm15,ymm15,ymm0
|
||||
vpshufb ymm12,ymm12,ymm0
|
||||
vpshufb ymm13,ymm13,ymm0
|
||||
vpshufb ymm14,ymm14,ymm0
|
||||
vpaddq ymm10,ymm10,ymm15
|
||||
vpaddq ymm11,ymm11,ymm12
|
||||
vpaddq ymm8,ymm8,ymm13
|
||||
vpaddq ymm9,ymm9,ymm14
|
||||
vpxor ymm5,ymm5,ymm10
|
||||
vpxor ymm6,ymm6,ymm11
|
||||
vpxor ymm7,ymm7,ymm8
|
||||
vpxor ymm4,ymm4,ymm9
|
||||
vpaddq ymm0,ymm5,ymm5
|
||||
vpsrlq ymm5,ymm5,63
|
||||
vpor ymm5,ymm5,ymm0
|
||||
vpaddq ymm0,ymm6,ymm6
|
||||
vpsrlq ymm6,ymm6,63
|
||||
vpor ymm6,ymm6,ymm0
|
||||
vpaddq ymm0,ymm7,ymm7
|
||||
vpsrlq ymm7,ymm7,63
|
||||
vpor ymm7,ymm7,ymm0
|
||||
vpaddq ymm0,ymm4,ymm4
|
||||
vpsrlq ymm4,ymm4,63
|
||||
vpor ymm4,ymm4,ymm0
|
||||
vmovdqa ymm0, [rsp]
|
||||
|
||||
Blake2bRounds2 2,src
|
||||
|
||||
vpxor ymm0, ymm0, ymm8
|
||||
vpxor ymm1, ymm1, ymm9
|
||||
vpxor ymm2, ymm2, ymm10
|
||||
vpxor ymm3, ymm3, ymm11
|
||||
vpxor ymm4, ymm4, ymm12
|
||||
vpxor ymm5, ymm5, ymm13
|
||||
vpxor ymm6, ymm6, ymm14
|
||||
;vpxor ymm7, ymm7, ymm15
|
||||
vpbroadcastq ymm8, qword [mids+0x80]
|
||||
vpbroadcastq ymm9, qword [mids+0x88]
|
||||
vpbroadcastq ymm10, qword [mids+0x90]
|
||||
vpbroadcastq ymm11, qword [mids+0x98]
|
||||
vpbroadcastq ymm12, qword [mids+0xa0]
|
||||
vpbroadcastq ymm13, qword [mids+0xa8]
|
||||
vpbroadcastq ymm14, qword [mids+0xb0]
|
||||
;vpbroadcastq ymm15, qword [mids+0xb8]
|
||||
vpxor ymm0, ymm0, ymm8
|
||||
vpxor ymm1, ymm1, ymm9
|
||||
vpxor ymm2, ymm2, ymm10
|
||||
vpxor ymm3, ymm3, ymm11
|
||||
vpxor ymm4, ymm4, ymm12
|
||||
vpxor ymm5, ymm5, ymm13
|
||||
vpxor ymm6, ymm6, ymm14
|
||||
;vpxor ymm7, ymm7, ymm15
|
||||
}
|
|
@ -1,39 +0,0 @@
|
|||
;void Blake2Run2(unsigned char *hashout, void *midstate, uint32_t indexctr);
|
||||
;hashout: hash output buffer: 2*64 bytes
|
||||
;midstate: 256 bytes from Blake2PrepareMidstate2
|
||||
;indexctr: For n=200, k=9: {0, 2, 4, ..., 1048574}
|
||||
|
||||
include "macro_blake2b_avx1.asm"
|
||||
|
||||
Blake2Run2:
|
||||
mov rax, rsp
|
||||
sub rsp, 0x28
|
||||
and rsp, -32
|
||||
mov [rsp+0x20], rax
|
||||
|
||||
mov [rsi+0xd4], edx
|
||||
add edx, 1
|
||||
mov [rsi+0xdc], edx
|
||||
|
||||
Blake2beq2of2 rsi, rsi+0xc0
|
||||
|
||||
vpunpcklqdq xmm8, xmm0, xmm1
|
||||
vpunpckhqdq xmm1, xmm0, xmm1
|
||||
vpunpcklqdq xmm10, xmm2, xmm3
|
||||
vpunpckhqdq xmm3, xmm2, xmm3
|
||||
vpunpcklqdq xmm12, xmm4, xmm5
|
||||
vpunpckhqdq xmm5, xmm4, xmm5
|
||||
vpunpcklqdq xmm14, xmm6, xmm7
|
||||
vpunpckhqdq xmm7, xmm6, xmm7
|
||||
|
||||
vmovdqa [rdi], xmm8
|
||||
vmovdqa [rdi+0x10], xmm10
|
||||
vmovdqa [rdi+0x20], xmm12
|
||||
vmovdqa [rdi+0x30], xmm14
|
||||
vmovdqa [rdi+0x40], xmm1
|
||||
vmovdqa [rdi+0x50], xmm3
|
||||
vmovdqa [rdi+0x60], xmm5
|
||||
vmovdqa [rdi+0x70], xmm7
|
||||
|
||||
mov rsp, [rsp+0x20]
|
||||
ret
|
|
@ -1,49 +0,0 @@
|
|||
;void Blake2Run4(unsigned char *hashout, void *midstate, uint32_t indexctr);
|
||||
;hashout: hash output buffer: 4*64 bytes
|
||||
;midstate: 256 bytes from Blake2PrepareMidstate4
|
||||
;indexctr: For n=200, k=9: {0, 4, 8, ..., 1048572}
|
||||
|
||||
include "macro_blake2b_avx2.asm"
|
||||
|
||||
Blake2Run4:
|
||||
mov rax, rsp
|
||||
sub rsp, 0x28
|
||||
and rsp, -32
|
||||
mov [rsp+0x20], rax
|
||||
|
||||
vmovd xmm0, edx ;indexctr
|
||||
vpbroadcastd ymm0, xmm0
|
||||
vpaddd ymm0, ymm0, yword [yctrinit]
|
||||
vpblendd ymm0, ymm0, yword [rsi+0xe0], 0x55
|
||||
vmovdqa yword [rsi+0xe0], ymm0
|
||||
|
||||
Blake2beq2of2 rsi, rsi+0xc0
|
||||
|
||||
vpunpcklqdq ymm8, ymm0, ymm1
|
||||
vpunpckhqdq ymm9, ymm0, ymm1
|
||||
vpunpcklqdq ymm10, ymm2, ymm3
|
||||
vpunpckhqdq ymm11, ymm2, ymm3
|
||||
vpunpcklqdq ymm12, ymm4, ymm5
|
||||
vpunpckhqdq ymm13, ymm4, ymm5
|
||||
vpunpcklqdq ymm14, ymm6, ymm7
|
||||
vpunpckhqdq ymm15, ymm6, ymm7
|
||||
vperm2i128 ymm0, ymm8, ymm10, 0x20
|
||||
vperm2i128 ymm1, ymm12, ymm14, 0x20
|
||||
vperm2i128 ymm2, ymm9, ymm11, 0x20
|
||||
vperm2i128 ymm3, ymm13, ymm15, 0x20
|
||||
vperm2i128 ymm4, ymm8, ymm10, 0x31
|
||||
vperm2i128 ymm5, ymm12, ymm14, 0x31
|
||||
vperm2i128 ymm6, ymm9, ymm11, 0x31
|
||||
vperm2i128 ymm7, ymm13, ymm15, 0x31
|
||||
|
||||
vmovdqa [rdi], ymm0
|
||||
vmovdqa [rdi+0x20], ymm1
|
||||
vmovdqa [rdi+0x40], ymm2
|
||||
vmovdqa [rdi+0x60], ymm3
|
||||
vmovdqa [rdi+0x80], ymm4
|
||||
vmovdqa [rdi+0xa0], ymm5
|
||||
vmovdqa [rdi+0xc0], ymm6
|
||||
vmovdqa [rdi+0xe0], ymm7
|
||||
|
||||
mov rsp, [rsp+0x20]
|
||||
ret
|
|
@ -1,212 +0,0 @@
|
|||
;void Blake2PrepareMidstate2(void *midstate, unsigned char *input);
|
||||
;midstate: 256 bytes of buffer for output midstate, aligned by 32
|
||||
;input: 140 bytes header, preferably aligned by 8
|
||||
|
||||
Blake2PrepareMidstate2:
|
||||
sub rsp, 0x188
|
||||
|
||||
vmovdqa xmm10, xword [xshufb_ror24]
|
||||
vmovdqa xmm11, xword [xshufb_ror16]
|
||||
|
||||
vmovdqa xmm0, xword [s0]
|
||||
vmovdqa xmm1, xword [s2]
|
||||
vmovdqa xmm2, xword [s4]
|
||||
vmovdqa xmm3, xword [s6]
|
||||
vmovdqa xmm4, xword [iv]
|
||||
vmovdqa xmm5, xword [iv+0x10]
|
||||
vmovdqa xmm6, xword [iv4xor128]
|
||||
vmovdqa xmm7, xword [iv4xor128+0x10]
|
||||
|
||||
mov r8, rsp
|
||||
lea r9, [blake2sigma]
|
||||
lea r11, [blake2sigma+160]
|
||||
call _ProcBlakeMsgSched
|
||||
call _ProcBlakeRound
|
||||
add r8, 0x80
|
||||
add r9, 16
|
||||
call _ProcBlakeMsgSched
|
||||
call _ProcBlakeRound
|
||||
add r8, 0x80
|
||||
add r9, 16
|
||||
_LoopEhPrepare1:
|
||||
call _ProcBlakeMsgSched
|
||||
call _ProcBlakeRound
|
||||
add r9, 16
|
||||
cmp r9, r11
|
||||
jb _LoopEhPrepare1
|
||||
mov r8, rsp
|
||||
call _ProcBlakeRound
|
||||
add r8, 0x80
|
||||
call _ProcBlakeRound
|
||||
|
||||
vpxor xmm0, xmm0, xmm4
|
||||
vpxor xmm1, xmm1, xmm5
|
||||
vpxor xmm2, xmm2, xmm6
|
||||
vpxor xmm3, xmm3, xmm7
|
||||
vpxor xmm0, xmm0, xword [s0]
|
||||
vpxor xmm1, xmm1, xword [s2]
|
||||
vpxor xmm2, xmm2, xword [s4]
|
||||
vpxor xmm3, xmm3, xword [s6]
|
||||
vmovdqa xword [rdi+0x80], xmm0
|
||||
vmovdqa xword [rdi+0x90], xmm1
|
||||
vmovdqa xword [rdi+0xa0], xmm2
|
||||
vmovdqa xword [rdi+0xb0], xmm3
|
||||
vmovq xmm8, [rsi+0x80]
|
||||
vpshufd xmm4, xmm8, 0x44
|
||||
vmovdqa xword [rdi+0xc0], xmm4
|
||||
vmovd xmm4, [rsi+0x88]
|
||||
vpshufd xmm4, xmm4, 0x44
|
||||
vmovdqa xword [rdi+0xd0], xmm4
|
||||
|
||||
;Begin second message block
|
||||
vmovdqa xmm4, xword [iv]
|
||||
vmovdqa xmm5, xword [iv+0x10]
|
||||
vmovdqa xmm6, xword [iv4xor144]
|
||||
vmovdqa xmm7, xword [iv6inverted]
|
||||
vpaddq xmm0, xmm0, xmm2
|
||||
vpaddq xmm1, xmm1, xmm3
|
||||
vpaddq xmm0, xmm0, xmm8 ;xmm8[63:0]=message
|
||||
vpxor xmm6, xmm6, xmm0
|
||||
vpxor xmm7, xmm7, xmm1
|
||||
vpshufd xmm6, xmm6, 0xb1
|
||||
vmovq [rdi+0x08], xmm6 ;v12
|
||||
vpshufd xmm7, xmm7, 0xb1
|
||||
vpaddq xmm4, xmm4, xmm6
|
||||
vmovq [rdi+0x10], xmm4 ;v8
|
||||
vpaddq xmm5, xmm5, xmm7
|
||||
vpxor xmm2, xmm2, xmm4
|
||||
vpxor xmm3, xmm3, xmm5
|
||||
vpshufb xmm2, xmm2, xmm10
|
||||
vmovq [rdi+0x18], xmm2 ;v4
|
||||
vpshufb xmm3, xmm3, xmm10
|
||||
|
||||
vpaddq xmm0, xmm0, xmm2
|
||||
vmovq [rdi], xmm0 ;v0
|
||||
vpaddq xmm1, xmm1, xmm3
|
||||
vpextrq [rdi+0x60], xmm1, 1 ;v3
|
||||
;add message (nonce, index) to xmm0 here, but we don't have
|
||||
vpxor xmm6, xmm6, xmm0
|
||||
vpxor xmm7, xmm7, xmm1
|
||||
vpshufb xmm6, xmm6, xmm11
|
||||
vpshufb xmm7, xmm7, xmm11
|
||||
vmovdqa xword [rdi+0x40], xmm7 ;v14,15
|
||||
vpaddq xmm4, xmm4, xmm6
|
||||
vpextrq [rdi+0x70], xmm4, 1 ;v9
|
||||
vpaddq xmm5, xmm5, xmm7
|
||||
vmovdqa xword [rdi+0x50], xmm5 ;v10,11
|
||||
vpxor xmm2, xmm2, xmm4
|
||||
vpxor xmm3, xmm3, xmm5
|
||||
vpaddq xmm8, xmm2, xmm2
|
||||
vpsrlq xmm2, xmm2, 63
|
||||
vpor xmm8, xmm2, xmm8 ;xmm8 takes xmm2
|
||||
vpaddq xmm2, xmm3, xmm3 ;xmm2 is temp
|
||||
vpsrlq xmm3, xmm3, 63
|
||||
vpor xmm3, xmm3, xmm2
|
||||
|
||||
vpalignr xmm2, xmm3, xmm8, 8 ;xmm2 resume
|
||||
vmovdqa xword [rdi+0x20], xmm2 ;v5,6
|
||||
vpsrldq xmm3, xmm3, 8
|
||||
vmovq [rdi+0x68], xmm3 ;v7
|
||||
vpsrldq xmm7, xmm6, 8
|
||||
vpaddq xmm0, xmm0, xmm2
|
||||
vpextrq [rdi+0x30], xmm0, 1 ;v1
|
||||
vpaddq xmm1, xmm1, xmm3
|
||||
vmovq [rdi+0x78], xmm1 ;v2
|
||||
vpxor xmm7, xmm7, xmm1
|
||||
vpshufd xmm7, xmm7, 0xb1
|
||||
vmovq [rdi+0x38], xmm7 ;v13
|
||||
|
||||
add rsp, 0x188
|
||||
ret
|
||||
|
||||
align 16
|
||||
_ProcBlakeMsgSched:
|
||||
;rsi=src
|
||||
;r8=dst
|
||||
;r9=sigma table
|
||||
xor r10d, r10d
|
||||
_LoopBlakeMsgSched:
|
||||
movzx eax, byte [r9+r10]
|
||||
mov rax, [rsi+rax*8]
|
||||
mov [r8+r10*8], rax
|
||||
add r10d, 1
|
||||
cmp r10d, 16
|
||||
jb _LoopBlakeMsgSched
|
||||
ret
|
||||
|
||||
align 16
|
||||
_ProcBlakeRound:
|
||||
vpaddq xmm0, xmm0, xmm2
|
||||
vpaddq xmm1, xmm1, xmm3
|
||||
vpaddq xmm0, xmm0, [r8]
|
||||
vpaddq xmm1, xmm1, [r8+0x10]
|
||||
vpxor xmm6, xmm6, xmm0
|
||||
vpxor xmm7, xmm7, xmm1
|
||||
vpshufd xmm6, xmm6, 0xb1
|
||||
vpshufd xmm7, xmm7, 0xb1
|
||||
vpaddq xmm4, xmm4, xmm6
|
||||
vpaddq xmm5, xmm5, xmm7
|
||||
vpxor xmm2, xmm2, xmm4
|
||||
vpxor xmm3, xmm3, xmm5
|
||||
vpshufb xmm2, xmm2, xmm10
|
||||
vpshufb xmm3, xmm3, xmm10
|
||||
vpaddq xmm0, xmm0, xmm2
|
||||
vpaddq xmm1, xmm1, xmm3
|
||||
vpaddq xmm0, xmm0, [r8+0x20]
|
||||
vpaddq xmm1, xmm1, [r8+0x30]
|
||||
vpxor xmm6, xmm6, xmm0
|
||||
vpxor xmm7, xmm7, xmm1
|
||||
vpshufb xmm9, xmm6, xmm11 ;xmm9 takes xmm6
|
||||
vpshufb xmm7, xmm7, xmm11
|
||||
vpaddq xmm4, xmm4, xmm9
|
||||
vpaddq xmm5, xmm5, xmm7
|
||||
vpxor xmm2, xmm2, xmm4
|
||||
vpxor xmm3, xmm3, xmm5
|
||||
vpaddq xmm8, xmm2, xmm2
|
||||
vpsrlq xmm2, xmm2, 63
|
||||
vpor xmm8, xmm2, xmm8 ;xmm8 takes xmm2
|
||||
vpaddq xmm2, xmm3, xmm3 ;xmm2 is temp
|
||||
vpsrlq xmm3, xmm3, 63
|
||||
vpor xmm3, xmm3, xmm2
|
||||
|
||||
vpalignr xmm2, xmm3, xmm8, 8 ;xmm2 resume
|
||||
vpalignr xmm3, xmm8, xmm3, 8
|
||||
vpalignr xmm6, xmm9, xmm7, 8 ;xmm6 resume
|
||||
vpalignr xmm7, xmm7, xmm9, 8
|
||||
vpaddq xmm0, xmm0, xmm2
|
||||
vpaddq xmm1, xmm1, xmm3
|
||||
vpaddq xmm0, xmm0, [r8+0x40]
|
||||
vpaddq xmm1, xmm1, [r8+0x50]
|
||||
vpxor xmm6, xmm6, xmm0
|
||||
vpxor xmm7, xmm7, xmm1
|
||||
vpshufd xmm6, xmm6, 0xb1
|
||||
vpshufd xmm7, xmm7, 0xb1
|
||||
vpaddq xmm5, xmm5, xmm6
|
||||
vpaddq xmm4, xmm4, xmm7
|
||||
vpxor xmm2, xmm2, xmm5
|
||||
vpxor xmm3, xmm3, xmm4
|
||||
vpshufb xmm2, xmm2, xmm10
|
||||
vpshufb xmm3, xmm3, xmm10
|
||||
vpaddq xmm0, xmm0, xmm2
|
||||
vpaddq xmm1, xmm1, xmm3
|
||||
vpaddq xmm0, xmm0, [r8+0x60]
|
||||
vpaddq xmm1, xmm1, [r8+0x70]
|
||||
vpxor xmm6, xmm6, xmm0
|
||||
vpxor xmm7, xmm7, xmm1
|
||||
vpshufb xmm9, xmm6, xmm11 ;xmm9 takes xmm6
|
||||
vpshufb xmm7, xmm7, xmm11
|
||||
vpaddq xmm5, xmm5, xmm9
|
||||
vpaddq xmm4, xmm4, xmm7
|
||||
vpxor xmm2, xmm2, xmm5
|
||||
vpxor xmm3, xmm3, xmm4
|
||||
vpaddq xmm8, xmm2, xmm2
|
||||
vpsrlq xmm2, xmm2, 63
|
||||
vpor xmm8, xmm2, xmm8 ;xmm8 takes xmm2
|
||||
vpaddq xmm2, xmm3, xmm3 ;xmm2 is temp
|
||||
vpsrlq xmm3, xmm3, 63
|
||||
vpor xmm3, xmm3, xmm2
|
||||
vpalignr xmm2, xmm8, xmm3, 8 ;xmm2 resume
|
||||
vpalignr xmm3, xmm3, xmm8, 8
|
||||
vpalignr xmm6, xmm7, xmm9, 8 ;xmm6 resume
|
||||
vpalignr xmm7, xmm9, xmm7, 8
|
||||
ret
|
|
@ -1,166 +0,0 @@
|
|||
;void Blake2PrepareMidstate4(void *midstate, unsigned char *input);
|
||||
;midstate: 256 bytes of buffer for output midstate, aligned by 32
|
||||
;input: 140 bytes header, preferably aligned by 8
|
||||
|
||||
Blake2PrepareMidstate4:
|
||||
sub rsp, 0x188
|
||||
vbroadcasti128 ymm6, xword [xshufb_ror24]
|
||||
vbroadcasti128 ymm7, xword [xshufb_ror16]
|
||||
|
||||
vmovdqa ymm0, yword [s0]
|
||||
vmovdqa ymm1, yword [s4]
|
||||
vmovdqa ymm2, yword [iv]
|
||||
vmovdqa ymm3, yword [iv4xor128]
|
||||
|
||||
mov r8, rsp
|
||||
lea r9, [blake2sigma]
|
||||
lea r11, [blake2sigma+160]
|
||||
call _ProcBlakeMsgSched
|
||||
call _ProcBlakeRound
|
||||
add r8, 0x80
|
||||
add r9, 16
|
||||
call _ProcBlakeMsgSched
|
||||
call _ProcBlakeRound
|
||||
add r8, 0x80
|
||||
add r9, 16
|
||||
_LoopEhPrepare1:
|
||||
call _ProcBlakeMsgSched
|
||||
call _ProcBlakeRound
|
||||
add r9, 16
|
||||
cmp r9, r11
|
||||
jb _LoopEhPrepare1
|
||||
mov r8, rsp
|
||||
call _ProcBlakeRound
|
||||
add r8, 0x80
|
||||
call _ProcBlakeRound
|
||||
|
||||
vpxor ymm0, ymm0, ymm2
|
||||
vpxor ymm1, ymm1, ymm3
|
||||
vpxor ymm0, ymm0, yword [s0]
|
||||
vpxor ymm1, ymm1, yword [s4]
|
||||
vmovdqa yword [rdi+0x80], ymm0
|
||||
vmovdqa yword [rdi+0xa0], ymm1
|
||||
vmovq xmm5, [rsi+0x80]
|
||||
vpbroadcastq ymm4, xmm5
|
||||
vmovdqa yword [rdi+0xc0], ymm4
|
||||
vmovd xmm4, [rsi+0x88]
|
||||
vpbroadcastq ymm4, xmm4
|
||||
vmovdqa yword [rdi+0xe0], ymm4
|
||||
|
||||
;Begin second message block
|
||||
vmovdqa ymm2, yword [iv]
|
||||
vmovdqa ymm3, yword [iv4xor144] ;also loads iv6inverted
|
||||
vpaddq ymm0, ymm0, ymm1
|
||||
vpaddq ymm0, ymm0, ymm5 ;ymm5[63:0]=message
|
||||
vpxor ymm3, ymm3, ymm0
|
||||
vpshufd ymm3, ymm3, 0xb1
|
||||
vmovq [rdi+0x08], xmm3 ;v12
|
||||
vpaddq ymm2, ymm2, ymm3
|
||||
vmovq [rdi+0x10], xmm2 ;v8
|
||||
vpxor ymm1, ymm1, ymm2
|
||||
vpshufb ymm1, ymm1, ymm6
|
||||
vmovq [rdi+0x18], xmm1 ;v4
|
||||
|
||||
vpaddq ymm0, ymm0, ymm1
|
||||
vmovq [rdi], xmm0 ;v0, v3 ready
|
||||
;add message (nonce, index) to xmm0 here, but we don't have
|
||||
vpxor ymm3, ymm3, ymm0
|
||||
vpshufb ymm3, ymm3, ymm7
|
||||
vextracti128 xmm4, ymm3, 1
|
||||
vmovdqa xword [rdi+0x40], xmm4 ;v14,15
|
||||
vpaddq ymm2, ymm2, ymm3
|
||||
vpextrq [rdi+0x70], xmm2, 1 ;v9
|
||||
vextracti128 xmm5, ymm2, 1
|
||||
vmovdqa xword [rdi+0x50], xmm5 ;v10,11
|
||||
vpxor ymm1, ymm1, ymm2
|
||||
vpaddq ymm4, ymm1, ymm1
|
||||
vpsrlq ymm1, ymm1, 63
|
||||
vpor ymm1, ymm1, ymm4
|
||||
;Valid:
|
||||
; v1 v2 v3
|
||||
; v5 v6 v7
|
||||
; v9 v10 v11
|
||||
; v13 v14 v15
|
||||
;
|
||||
;v1 v2 <- v6 v7
|
||||
;v13 <- v2
|
||||
|
||||
vpermq ymm1, ymm1, 0x39
|
||||
vmovdqa xword [rdi+0x20], xmm1 ;v5,6
|
||||
|
||||
vextracti128 xmm4, ymm0, 1
|
||||
vextracti128 xmm5, ymm1, 1
|
||||
vpextrq [rdi+0x60], xmm4, 1 ;v3
|
||||
vmovq [rdi+0x68], xmm5 ;v7
|
||||
|
||||
vpsrldq xmm3, xmm3, 8
|
||||
vpaddq xmm0, xmm0, xmm1
|
||||
vpextrq [rdi+0x30], xmm0, 1 ;v1
|
||||
vpaddq xmm4, xmm4, xmm5
|
||||
vmovq [rdi+0x78], xmm4 ;v2
|
||||
vpxor xmm3, xmm3, xmm4
|
||||
vpshufd xmm3, xmm3, 0xb1
|
||||
vmovq [rdi+0x38], xmm3 ;v13
|
||||
|
||||
add rsp, 0x188
|
||||
ret
|
||||
|
||||
align 16
|
||||
_ProcBlakeMsgSched:
|
||||
;rsi=src
|
||||
;r8=dst
|
||||
;r9=sigma table
|
||||
xor r10d, r10d
|
||||
_LoopBlakeMsgSched:
|
||||
movzx eax, byte [r9+r10]
|
||||
mov rax, [rsi+rax*8]
|
||||
mov [r8+r10*8], rax
|
||||
add r10d, 1
|
||||
cmp r10d, 16
|
||||
jb _LoopBlakeMsgSched
|
||||
ret
|
||||
|
||||
align 16
|
||||
_ProcBlakeRound:
|
||||
vpaddq ymm0, ymm0, ymm1
|
||||
vpaddq ymm0, ymm0, [r8]
|
||||
vpxor ymm3, ymm3, ymm0
|
||||
vpshufd ymm3, ymm3, 0xb1
|
||||
vpaddq ymm2, ymm2, ymm3
|
||||
vpxor ymm1, ymm1, ymm2
|
||||
vpshufb ymm1, ymm1, ymm6 ;ror24
|
||||
vpaddq ymm0, ymm0, ymm1
|
||||
vpaddq ymm0, ymm0, [r8+0x20]
|
||||
vpxor ymm3, ymm3, ymm0
|
||||
vpshufb ymm3, ymm3, ymm7 ;ror16
|
||||
vpaddq ymm2, ymm2, ymm3
|
||||
vpxor ymm1, ymm1, ymm2
|
||||
vpaddq ymm4, ymm1, ymm1
|
||||
vpsrlq ymm1, ymm1, 63
|
||||
vpor ymm1, ymm1, ymm4
|
||||
|
||||
vpermq ymm1, ymm1, 0x39
|
||||
vpermq ymm2, ymm2, 0x4e
|
||||
vpermq ymm3, ymm3, 0x93
|
||||
|
||||
vpaddq ymm0, ymm0, ymm1
|
||||
vpaddq ymm0, ymm0, [r8+0x40]
|
||||
vpxor ymm3, ymm3, ymm0
|
||||
vpshufd ymm3, ymm3, 0xb1
|
||||
vpaddq ymm2, ymm2, ymm3
|
||||
vpxor ymm1, ymm1, ymm2
|
||||
vpshufb ymm1, ymm1, ymm6 ;ror24
|
||||
vpaddq ymm0, ymm0, ymm1
|
||||
vpaddq ymm0, ymm0, [r8+0x60]
|
||||
vpxor ymm3, ymm3, ymm0
|
||||
vpshufb ymm3, ymm3, ymm7 ;ror16
|
||||
vpaddq ymm2, ymm2, ymm3
|
||||
vpxor ymm1, ymm1, ymm2
|
||||
vpaddq ymm4, ymm1, ymm1
|
||||
vpsrlq ymm1, ymm1, 63
|
||||
vpor ymm1, ymm1, ymm4
|
||||
|
||||
vpermq ymm1, ymm1, 0x93
|
||||
vpermq ymm2, ymm2, 0x4e
|
||||
vpermq ymm3, ymm3, 0x39
|
||||
ret
|
|
@ -1,11 +0,0 @@
|
|||
format elf64
|
||||
public Blake2PrepareMidstate2
|
||||
public Blake2Run2
|
||||
|
||||
section '.text' executable align 64
|
||||
include "proc_prepmidstate_avx1.asm"
|
||||
align 16
|
||||
include "proc_blake2_avx1.asm"
|
||||
|
||||
section '.data' writeable align 64
|
||||
include "data_blake2b.asm"
|
|
@ -1,11 +0,0 @@
|
|||
format elf64
|
||||
public Blake2PrepareMidstate4
|
||||
public Blake2Run4
|
||||
|
||||
section '.text' executable align 64
|
||||
include "proc_prepmidstate_avx2.asm"
|
||||
align 16
|
||||
include "proc_blake2_avx2.asm"
|
||||
|
||||
section '.data' writeable align 64
|
||||
include "data_blake2b.asm"
|
|
@ -1,51 +0,0 @@
|
|||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
void Blake2PrepareMidstate2(void *midstate, unsigned char *input);
|
||||
//midstate: 256 bytes of buffer for output midstate, aligned by 32
|
||||
//input: 140 bytes header, preferably aligned by 8
|
||||
|
||||
void Blake2Run2(unsigned char *hashout, void *midstate, uint32_t indexctr);
|
||||
//hashout: hash output buffer: 2*64 bytes
|
||||
//midstate: 256 bytes from Blake2PrepareMidstate2
|
||||
//indexctr: For n=200, k=9: {0, 2, 4, ..., 1048574}
|
||||
|
||||
unsigned char __attribute__((aligned(8))) testdata[140] =
|
||||
{
|
||||
0x04, 0x00, 0x00, 0x00, 0x91, 0x5F, 0xA6, 0x1C, 0x4F, 0xA5, 0x92, 0x3C, 0xE6, 0xEE, 0xAD, 0x06,
|
||||
0x74, 0x6B, 0x61, 0x22, 0x54, 0x94, 0xEA, 0x5A, 0x2A, 0x97, 0xAE, 0x46, 0x6E, 0x6F, 0xAA, 0x9C,
|
||||
0x6E, 0xF6, 0x3A, 0x0D, 0xA5, 0xFC, 0x67, 0xD7, 0xF8, 0xDC, 0x78, 0xC3, 0xC8, 0x70, 0xCA, 0x09,
|
||||
0xBA, 0xAB, 0xAA, 0xF7, 0x02, 0x59, 0x68, 0xA8, 0x6F, 0xEB, 0x88, 0x75, 0xD3, 0xF3, 0xFF, 0xA7,
|
||||
0x2E, 0xB0, 0x0F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x66, 0xCE, 0xD2, 0x57, 0x0F, 0x0F, 0x0F, 0x20, 0x00, 0x00, 0xF7, 0xF1,
|
||||
0x94, 0xA2, 0x53, 0x8E, 0x42, 0x5F, 0x21, 0x33, 0xCF, 0xA8, 0xD3, 0xCB, 0xF4, 0xDF, 0x71, 0xEF,
|
||||
0x38, 0x28, 0x51, 0x75, 0xCF, 0xED, 0xCB, 0x3E, 0x63, 0xA2, 0x00, 0x00
|
||||
};
|
||||
//expected output: 281dd5fc6d878538e640987b9bc597dbbd4af2cdf8bf5fb03bdfcefa40d8747d out.bin
|
||||
|
||||
int main(void)
|
||||
{
|
||||
unsigned char midstate_a[256+32];
|
||||
void *pmidstate = (void *) (((long) midstate_a+31L) & -32L);
|
||||
unsigned char hashout_a[128+32];
|
||||
unsigned char *phashout = (unsigned char *) (((long) hashout_a+31L) & -32L);
|
||||
unsigned char buf[128];
|
||||
FILE *outfile;
|
||||
int i;
|
||||
|
||||
Blake2PrepareMidstate2(pmidstate, testdata);
|
||||
outfile = fopen("out.bin", "wb");
|
||||
|
||||
for (i=0; i<1048576; i+=2) {
|
||||
Blake2Run2(phashout, pmidstate, i);
|
||||
memcpy(buf, phashout, 50);
|
||||
memcpy(buf+50, phashout+64, 50);
|
||||
fwrite(buf, 100, 1, outfile);
|
||||
}
|
||||
|
||||
fclose(outfile);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,53 +0,0 @@
|
|||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
void Blake2PrepareMidstate4(void *midstate, unsigned char *input);
|
||||
//midstate: 256 bytes of buffer for output midstate, aligned by 32
|
||||
//input: 140 bytes header, preferably aligned by 8
|
||||
|
||||
void Blake2Run4(unsigned char *hashout, void *midstate, uint32_t indexctr);
|
||||
//hashout: hash output buffer: 4*64 bytes
|
||||
//midstate: 256 bytes from Blake2PrepareMidstate4
|
||||
//indexctr: For n=200, k=9: {0, 4, 8, ..., 1048572}
|
||||
|
||||
unsigned char __attribute__((aligned(8))) testdata[140] =
|
||||
{
|
||||
0x04, 0x00, 0x00, 0x00, 0x91, 0x5F, 0xA6, 0x1C, 0x4F, 0xA5, 0x92, 0x3C, 0xE6, 0xEE, 0xAD, 0x06,
|
||||
0x74, 0x6B, 0x61, 0x22, 0x54, 0x94, 0xEA, 0x5A, 0x2A, 0x97, 0xAE, 0x46, 0x6E, 0x6F, 0xAA, 0x9C,
|
||||
0x6E, 0xF6, 0x3A, 0x0D, 0xA5, 0xFC, 0x67, 0xD7, 0xF8, 0xDC, 0x78, 0xC3, 0xC8, 0x70, 0xCA, 0x09,
|
||||
0xBA, 0xAB, 0xAA, 0xF7, 0x02, 0x59, 0x68, 0xA8, 0x6F, 0xEB, 0x88, 0x75, 0xD3, 0xF3, 0xFF, 0xA7,
|
||||
0x2E, 0xB0, 0x0F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x66, 0xCE, 0xD2, 0x57, 0x0F, 0x0F, 0x0F, 0x20, 0x00, 0x00, 0xF7, 0xF1,
|
||||
0x94, 0xA2, 0x53, 0x8E, 0x42, 0x5F, 0x21, 0x33, 0xCF, 0xA8, 0xD3, 0xCB, 0xF4, 0xDF, 0x71, 0xEF,
|
||||
0x38, 0x28, 0x51, 0x75, 0xCF, 0xED, 0xCB, 0x3E, 0x63, 0xA2, 0x00, 0x00
|
||||
};
|
||||
//expected output: 281dd5fc6d878538e640987b9bc597dbbd4af2cdf8bf5fb03bdfcefa40d8747d out.bin
|
||||
|
||||
int main(void)
|
||||
{
|
||||
unsigned char midstate_a[256+32];
|
||||
void *pmidstate = (void *) (((long) midstate_a+31L) & -32L);
|
||||
unsigned char hashout_a[256+32];
|
||||
unsigned char *phashout = (unsigned char *) (((long) hashout_a+31L) & -32L);
|
||||
unsigned char buf[256];
|
||||
FILE *outfile;
|
||||
int i;
|
||||
|
||||
Blake2PrepareMidstate4(pmidstate, testdata);
|
||||
outfile = fopen("out.bin", "wb");
|
||||
|
||||
for (i=0; i<1048576; i+=4) {
|
||||
Blake2Run4(phashout, pmidstate, i);
|
||||
memcpy(buf, phashout, 50);
|
||||
memcpy(buf+50, phashout+64, 50);
|
||||
memcpy(buf+100, phashout+128, 50);
|
||||
memcpy(buf+150, phashout+192, 50);
|
||||
fwrite(buf, 200, 1, outfile);
|
||||
}
|
||||
|
||||
fclose(outfile);
|
||||
|
||||
return 0;
|
||||
}
|
Binary file not shown.
|
@ -1,78 +0,0 @@
|
|||
//compile with
|
||||
//gcc -o quickbench quickbench.c equihash_avx2.o
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
#define CONTEXT_SIZE 178033152
|
||||
#define ITERATIONS 10
|
||||
|
||||
//Linkage with assembly
|
||||
//EhPrepare takes in 136 bytes of input. The remaining 4 bytes of input is fed as nonce to EhSolver.
|
||||
//EhPrepare saves the 136 bytes in context, and EhSolver can be called repeatedly with different nonce.
|
||||
void EhPrepare(void *context, void *input);
|
||||
int32_t EhSolver(void *context, uint32_t nonce);
|
||||
extern char testinput[];
|
||||
|
||||
int main(void)
|
||||
{
|
||||
void *context_alloc, *context, *context_end;
|
||||
uint32_t *pu32;
|
||||
uint64_t *pu64, previous_rdtsc;
|
||||
uint8_t inputheader[144]; //140 byte header
|
||||
FILE *infile, *outfile;
|
||||
struct timespec time0, time1;
|
||||
long t0, t1;
|
||||
int32_t numsolutions, total_solutions;
|
||||
uint32_t nonce, delta_time, total_time;
|
||||
int i, j;
|
||||
|
||||
context_alloc = malloc(CONTEXT_SIZE+4096);
|
||||
context = (void*) (((long) context_alloc+4095) & -4096);
|
||||
context_end = context + CONTEXT_SIZE;
|
||||
|
||||
infile = 0;
|
||||
infile = fopen("input.bin", "rb");
|
||||
if (infile) {
|
||||
puts("Reading input.bin");
|
||||
fread(inputheader, 140, 1, infile);
|
||||
fclose(infile);
|
||||
} else {
|
||||
puts("input.bin not found, use sample data (beta1 testnet block 2)");
|
||||
memcpy(inputheader, testinput, 140);
|
||||
}
|
||||
|
||||
|
||||
EhPrepare(context, (void *) inputheader);
|
||||
|
||||
//Warm up, timing not taken into average
|
||||
nonce = 0;
|
||||
clock_gettime(CLOCK_MONOTONIC, &time0);
|
||||
numsolutions = EhSolver(context, nonce);
|
||||
clock_gettime(CLOCK_MONOTONIC, &time1);
|
||||
delta_time = (uint32_t) ((time1.tv_sec * 1000000000 + time1.tv_nsec)
|
||||
- (time0.tv_sec * 1000000000 + time0.tv_nsec))/1000000;
|
||||
printf("(Warm up) Time: %u ms, solutions: %u\n", delta_time, numsolutions);
|
||||
|
||||
printf("Running %d iterations...\n", ITERATIONS);
|
||||
nonce = 58; //arbritary number to get 19 solutions in 10 iterations (to match 1.88 solutions per run)
|
||||
total_time = total_solutions = 0;
|
||||
for (i=0; i<ITERATIONS; i++) {
|
||||
clock_gettime(CLOCK_MONOTONIC, &time0);
|
||||
numsolutions = EhSolver(context, nonce);
|
||||
clock_gettime(CLOCK_MONOTONIC, &time1);
|
||||
nonce++;
|
||||
delta_time = (uint32_t) ((time1.tv_sec * 1000000000 + time1.tv_nsec)
|
||||
- (time0.tv_sec * 1000000000 + time0.tv_nsec))/1000000;
|
||||
total_time += delta_time;
|
||||
total_solutions += numsolutions;
|
||||
printf("Time: %u ms, solutions: %u\n", delta_time, numsolutions);
|
||||
}
|
||||
|
||||
printf("Average time: %d ms; %.3f Sol/s\n", total_time/ITERATIONS, (double) 1000.0*total_solutions/total_time);
|
||||
|
||||
free(context_alloc);
|
||||
return 0;
|
||||
}
|
|
@ -1,128 +0,0 @@
|
|||
//compile with
|
||||
//gcc -o solver solver.c equihash_avx2.o
|
||||
//
|
||||
//./solver
|
||||
//sha256sum out2.bin
|
||||
//Expected result with default input.bin (beta1 testnet block 2),
|
||||
//257d3c3250c14978614ac169edcf72bd131a2e4c227c8d7e21a2cd6131a13dda out2.bin
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <x86intrin.h> //for rdtsc
|
||||
|
||||
#define CONTEXT_SIZE 178033152
|
||||
|
||||
//Linkage with assembly
|
||||
//EhPrepare takes in 136 bytes of input. The remaining 4 bytes of input is fed as nonce to EhSolver.
|
||||
//EhPrepare saves the 136 bytes in context, and EhSolver can be called repeatedly with different nonce.
|
||||
void EhPrepare(void *context, void *input);
|
||||
int32_t EhSolver(void *context, uint32_t nonce);
|
||||
extern char testinput[];
|
||||
|
||||
//context is the memory used for Equihash computation. It should be allocated outside of SolverFunction, the size is defined by CONTEXT_SIZE, about 180MB.
|
||||
//SolverFunction API has slight overhead in mining due to missing opportunity to run EhSolver multiple times after a single EhPrepare.
|
||||
int SolverFunction(void* context, const unsigned char* input,
|
||||
bool (*validBlock)(void*, const unsigned char*),
|
||||
void* validBlockData,
|
||||
bool (*cancelled)(void*),
|
||||
void* cancelledData,
|
||||
int numThreads,
|
||||
int n, int k)
|
||||
{
|
||||
int numsolutions, i;
|
||||
|
||||
EhPrepare(context, (void *) input);
|
||||
numsolutions = EhSolver(context, *(uint32_t *)(input+136));
|
||||
|
||||
for (i=0; i<numsolutions; i++) {
|
||||
validBlock(validBlockData, (unsigned char*)(context+1344*i));
|
||||
}
|
||||
return numsolutions;
|
||||
}
|
||||
|
||||
bool validBlock(void *validBlockData, const unsigned char *solution)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool cancelled(void *cancelledData)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
void *context_alloc, *context, *context_end;
|
||||
uint32_t *pu32;
|
||||
uint64_t *pu64, previous_rdtsc;
|
||||
uint8_t inputheader[144]; //140 byte header
|
||||
FILE *infile, *outfile;
|
||||
struct timespec time0, time1;
|
||||
uint64_t rdtsc0, rdtsc1;
|
||||
long t0, t1;
|
||||
int32_t numsolutions;
|
||||
int i, j;
|
||||
char outfilename[32];
|
||||
|
||||
context_alloc = malloc(CONTEXT_SIZE+4096);
|
||||
context = (void*) (((long) context_alloc+4095) & -4096);
|
||||
context_end = context + CONTEXT_SIZE;
|
||||
|
||||
//Init page tables. This is not necessary, but useful to get a more consistent single-run timing.
|
||||
for (pu32=context; (void*) pu32<context_end; pu32+=1024)
|
||||
*pu32 = 0;
|
||||
|
||||
infile = 0;
|
||||
infile = fopen("input.bin", "rb");
|
||||
if (infile) {
|
||||
puts("Reading input.bin");
|
||||
fread(inputheader, 140, 1, infile);
|
||||
fclose(infile);
|
||||
} else {
|
||||
puts("input.bin not found, use sample data (beta1 testnet block 2)");
|
||||
memcpy(inputheader, testinput, 140);
|
||||
}
|
||||
|
||||
puts("Running solver...");
|
||||
clock_gettime(CLOCK_MONOTONIC, &time0);
|
||||
rdtsc0 = __rdtsc();
|
||||
numsolutions = SolverFunction(context, inputheader, validBlock, 0, cancelled, 0, 1, 200, 9);
|
||||
//EhPrepare(context, (void *) inputheader);
|
||||
//numsolutions = EhSolver(context, *(uint32_t *)(inputheader+136));
|
||||
clock_gettime(CLOCK_MONOTONIC, &time1);
|
||||
rdtsc1 = __rdtsc();
|
||||
|
||||
//Print some debug information
|
||||
pu64 = (uint64_t *) (context + 102408); //Read the debug area for statistics
|
||||
printf("BLAKE2b rdtsc: %lu\n", pu64[1]-pu64[0]);
|
||||
previous_rdtsc = pu64[1];
|
||||
for (i=1, j=2; i<=9; i++, j+=2) {
|
||||
printf("Stage %u, Output pairs %u, rdtsc: %lu\n", i, (uint32_t) pu64[j+1], pu64[j]-previous_rdtsc);
|
||||
previous_rdtsc = pu64[j];
|
||||
}
|
||||
printf("Number of solutions before duplicate removal: %u\n", *(uint32_t *) (context+16384));
|
||||
printf("Duplicate removal and tree expand rdtsc: %lu\n", pu64[j]-previous_rdtsc);
|
||||
|
||||
printf("Number of solutions: %d\n", numsolutions);
|
||||
|
||||
j = numsolutions < 4 ? numsolutions : 4;
|
||||
for (i=0; i<j; i++) {
|
||||
sprintf(outfilename, "out%d.bin", i);
|
||||
outfile = fopen(outfilename, "wb");
|
||||
fwrite(context+1344*i, 1344, 1, outfile);
|
||||
fclose(outfile);
|
||||
}
|
||||
|
||||
t0 = time0.tv_sec * 1000000000 + time0.tv_nsec;
|
||||
t1 = time1.tv_sec * 1000000000 + time1.tv_nsec;
|
||||
printf("Time: %ld ms\n", (t1-t0)/1000000);
|
||||
t0 = (t1-t0)/1000;
|
||||
printf("Measure rdtsc frequency = %.3f MHz\n", (double) (rdtsc1-rdtsc0)/t0);
|
||||
|
||||
free(context_alloc);
|
||||
return 0;
|
||||
}
|
0
cpu_xenoncat/Linux/asm/assemble.sh → cpu_xenoncat/asm_linux/assemble.sh
Executable file → Normal file
0
cpu_xenoncat/Linux/asm/assemble.sh → cpu_xenoncat/asm_linux/assemble.sh
Executable file → Normal file
|
@ -1,7 +1,6 @@
|
|||
format elf64
|
||||
public EhPrepare as 'EhPrepareAVX1'
|
||||
public EhSolver as 'EhSolverAVX1'
|
||||
public testinput as 'testinputAVX1'
|
||||
|
||||
include "struct.inc"
|
||||
include "params.inc"
|
||||
|
@ -14,4 +13,3 @@ include "proc_ehsolver_avx1.asm"
|
|||
|
||||
section '.data' writeable align 64
|
||||
include "data_blake2b.asm"
|
||||
testinput file "t2.bin"
|
|
@ -1,7 +1,6 @@
|
|||
format elf64
|
||||
public EhPrepare as 'EhPrepareAVX2'
|
||||
public EhSolver as 'EhSolverAVX2'
|
||||
public testinput as 'testinputAVX2'
|
||||
|
||||
include "struct.inc"
|
||||
include "params.inc"
|
||||
|
@ -14,4 +13,3 @@ include "proc_ehsolver_avx2.asm"
|
|||
|
||||
section '.data' writeable align 64
|
||||
include "data_blake2b.asm"
|
||||
testinput file "t2.bin"
|
|
@ -0,0 +1,43 @@
|
|||
set(EXECUTABLE cuda_djezo)
|
||||
|
||||
option(ENABLE_CUDA "Enable the cuda build" ON)
|
||||
|
||||
# depending on gcc version
|
||||
# ;-std=c++11 => Ubuntu 14.04 check gcc versions
|
||||
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11)
|
||||
|
||||
file(GLOB SRC_LIST
|
||||
cuda_djezo.cpp
|
||||
equi_miner.cu )
|
||||
file(GLOB HEADERS
|
||||
cuda_djezo.hpp
|
||||
eqcuda.hpp
|
||||
)
|
||||
|
||||
|
||||
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-D_FORCE_INLINES;--disable-warnings;--ptxas-options=-v;-Xptxas=-dlcm=ca;-Xptxas=-dscm=cs; -O3)
|
||||
|
||||
FIND_PACKAGE(CUDA REQUIRED)
|
||||
if(COMPUTE AND (COMPUTE GREATER 0))
|
||||
LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
|
||||
else(COMPUTE AND (COMPUTE GREATER 0))
|
||||
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=sm_50; -gencode arch=compute_52,code=sm_52; -gencode arch=compute_60,code=sm_60 )
|
||||
endif(COMPUTE AND (COMPUTE GREATER 0))
|
||||
|
||||
if(CUDA_FOUND)
|
||||
message("CUDA FOUND")
|
||||
else()
|
||||
message("CUDA NOT FOUND")
|
||||
endif()
|
||||
|
||||
|
||||
include_directories(${CMAKE_CURRENT_BINARY_DIR})
|
||||
include_directories(${CUDA_INCLUDE_DIRS})
|
||||
include_directories(..)
|
||||
CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
|
||||
TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES} cuda)
|
||||
|
||||
message("-- CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
|
||||
|
||||
install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
|
||||
install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )
|
|
@ -0,0 +1,336 @@
|
|||
// Blake2-B CUDA Implementation
|
||||
// tpruvot@github July 2016
|
||||
// permission granted to use under MIT license
|
||||
// modified for use in Zcash by John Tromp September 2016
|
||||
|
||||
/**
|
||||
* uint2 direct ops by c++ operator definitions
|
||||
*/
|
||||
static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) {
|
||||
return make_uint2(a.x ^ b.x, a.y ^ b.y);
|
||||
}
|
||||
static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) {
|
||||
return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
|
||||
}
|
||||
// uint2 ROR/ROL methods
|
||||
__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset) {
|
||||
uint2 result;
|
||||
#if __CUDA_ARCH__ > 300
|
||||
/* if (offset < 32) {
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
|
||||
} else *//* if (offset < 64) */ {
|
||||
/* offset SHOULD BE < 64 ! */
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
|
||||
}
|
||||
#else
|
||||
if (!offset)
|
||||
result = a;
|
||||
else if (offset < 32) {
|
||||
result.y = ((a.y >> offset) | (a.x << (32 - offset)));
|
||||
result.x = ((a.x >> offset) | (a.y << (32 - offset)));
|
||||
} else if (offset == 32) {
|
||||
result.y = a.x;
|
||||
result.x = a.y;
|
||||
} else {
|
||||
result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
|
||||
result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
|
||||
}
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
__device__ __forceinline__ uint2 SWAPUINT2(uint2 value) {
|
||||
return make_uint2(value.y, value.x);
|
||||
}
|
||||
#ifdef __CUDA_ARCH__
|
||||
__device__ __inline__ uint2 ROR24(const uint2 a) {
|
||||
uint2 result;
|
||||
result.x = __byte_perm(a.y, a.x, 0x2107);
|
||||
result.y = __byte_perm(a.y, a.x, 0x6543);
|
||||
return result;
|
||||
}
|
||||
__device__ __inline__ uint2 ROR16(const uint2 a) {
|
||||
uint2 result;
|
||||
result.x = __byte_perm(a.y, a.x, 0x1076);
|
||||
result.y = __byte_perm(a.y, a.x, 0x5432);
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
#define ROR24(u) ROR2(u,24)
|
||||
#define ROR16(u) ROR2(u,16)
|
||||
#endif
|
||||
|
||||
typedef uint64_t u64;
|
||||
|
||||
static __constant__ const int8_t blake2b_sigma[12][16] = {
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
|
||||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
|
||||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
|
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
|
||||
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
|
||||
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
|
||||
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
|
||||
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } ,
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
|
||||
};
|
||||
|
||||
__device__ __constant__
|
||||
static const u64 blake_iv[] =
|
||||
{
|
||||
0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
|
||||
0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
|
||||
0x510e527fade682d1, 0x9b05688c2b3e6c1f,
|
||||
0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
|
||||
};
|
||||
|
||||
__device__ __forceinline__
|
||||
static void G(const int r, const int i, u64 &a, u64 &b, u64 &c, u64 &d, u64 const m[16]) {
|
||||
a = a + b + m[ blake2b_sigma[r][2*i] ];
|
||||
((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
|
||||
c = c + d;
|
||||
((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] );
|
||||
a = a + b + m[ blake2b_sigma[r][2*i+1] ];
|
||||
((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
|
||||
c = c + d;
|
||||
((uint2*)&b)[0] = ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
|
||||
}
|
||||
|
||||
//__device__ __forceinline__
|
||||
//static void G2(u64 &a, u64 &b, u64 &c, u64 &d, u64 x, u64 y) {
|
||||
// a = a + b + x;
|
||||
// ((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]);
|
||||
// c = c + d;
|
||||
// ((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]);
|
||||
// a = a + b + y;
|
||||
// ((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]);
|
||||
// c = c + d;
|
||||
// ((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
|
||||
//}
|
||||
|
||||
__device__ __forceinline__
|
||||
static void G2(u64 & a, u64 & b, u64 & c, u64 & d, u64 x, u64 y) {
|
||||
a = a + b + x;
|
||||
((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]);
|
||||
c = c + d;
|
||||
((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]);
|
||||
a = a + b + y;
|
||||
((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]);
|
||||
c = c + d;
|
||||
((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
|
||||
}
|
||||
|
||||
#define ROUND(r) \
|
||||
G(r, 0, v[0], v[4], v[ 8], v[12], m); \
|
||||
G(r, 1, v[1], v[5], v[ 9], v[13], m); \
|
||||
G(r, 2, v[2], v[6], v[10], v[14], m); \
|
||||
G(r, 3, v[3], v[7], v[11], v[15], m); \
|
||||
G(r, 4, v[0], v[5], v[10], v[15], m); \
|
||||
G(r, 5, v[1], v[6], v[11], v[12], m); \
|
||||
G(r, 6, v[2], v[7], v[ 8], v[13], m); \
|
||||
G(r, 7, v[3], v[4], v[ 9], v[14], m);
|
||||
|
||||
|
||||
__forceinline__ __device__ void blake2b_gpu_hash3(uint64_t* h, u32 idx, u32 nonce) {
|
||||
u64 m = (u64)idx << 32 | (u64)nonce;
|
||||
|
||||
u64 v[16];
|
||||
|
||||
v[0] = h[0];
|
||||
v[1] = h[1];
|
||||
v[2] = h[2];
|
||||
v[3] = h[3];
|
||||
v[4] = h[4];
|
||||
v[5] = h[5];
|
||||
v[6] = h[6];
|
||||
v[7] = h[7];
|
||||
v[8] = blake_iv[0];
|
||||
v[9] = blake_iv[1];
|
||||
v[10] = blake_iv[2];
|
||||
v[11] = blake_iv[3];
|
||||
v[12] = blake_iv[4] ^ (128 + 16);
|
||||
v[13] = blake_iv[5];
|
||||
v[14] = blake_iv[6] ^ 0xffffffffffffffff;
|
||||
v[15] = blake_iv[7];
|
||||
|
||||
// mix 1
|
||||
G2(v[0], v[4], v[8], v[12], 0, m);
|
||||
G2(v[1], v[5], v[9], v[13], 0, 0);
|
||||
G2(v[2], v[6], v[10], v[14], 0, 0);
|
||||
G2(v[3], v[7], v[11], v[15], 0, 0);
|
||||
G2(v[0], v[5], v[10], v[15], 0, 0);
|
||||
G2(v[1], v[6], v[11], v[12], 0, 0);
|
||||
G2(v[2], v[7], v[8], v[13], 0, 0);
|
||||
G2(v[3], v[4], v[9], v[14], 0, 0);
|
||||
|
||||
// mix 2
|
||||
G2(v[0], v[4], v[8], v[12], 0, 0);
|
||||
G2(v[1], v[5], v[9], v[13], 0, 0);
|
||||
G2(v[2], v[6], v[10], v[14], 0, 0);
|
||||
G2(v[3], v[7], v[11], v[15], 0, 0);
|
||||
G2(v[0], v[5], v[10], v[15], m, 0);
|
||||
G2(v[1], v[6], v[11], v[12], 0, 0);
|
||||
G2(v[2], v[7], v[8], v[13], 0, 0);
|
||||
G2(v[3], v[4], v[9], v[14], 0, 0);
|
||||
|
||||
// mix 3
|
||||
G2(v[0], v[4], v[8], v[12], 0, 0);
|
||||
G2(v[1], v[5], v[9], v[13], 0, 0);
|
||||
G2(v[2], v[6], v[10], v[14], 0, 0);
|
||||
G2(v[3], v[7], v[11], v[15], 0, 0);
|
||||
G2(v[0], v[5], v[10], v[15], 0, 0);
|
||||
G2(v[1], v[6], v[11], v[12], 0, 0);
|
||||
G2(v[2], v[7], v[8], v[13], 0, m);
|
||||
G2(v[3], v[4], v[9], v[14], 0, 0);
|
||||
|
||||
// mix 4
|
||||
G2(v[0], v[4], v[8], v[12], 0, 0);
|
||||
G2(v[1], v[5], v[9], v[13], 0, m);
|
||||
G2(v[2], v[6], v[10], v[14], 0, 0);
|
||||
G2(v[3], v[7], v[11], v[15], 0, 0);
|
||||
G2(v[0], v[5], v[10], v[15], 0, 0);
|
||||
G2(v[1], v[6], v[11], v[12], 0, 0);
|
||||
G2(v[2], v[7], v[8], v[13], 0, 0);
|
||||
G2(v[3], v[4], v[9], v[14], 0, 0);
|
||||
|
||||
// mix 5
|
||||
G2(v[0], v[4], v[8], v[12], 0, 0);
|
||||
G2(v[1], v[5], v[9], v[13], 0, 0);
|
||||
G2(v[2], v[6], v[10], v[14], 0, 0);
|
||||
G2(v[3], v[7], v[11], v[15], 0, 0);
|
||||
G2(v[0], v[5], v[10], v[15], 0, m);
|
||||
G2(v[1], v[6], v[11], v[12], 0, 0);
|
||||
G2(v[2], v[7], v[8], v[13], 0, 0);
|
||||
G2(v[3], v[4], v[9], v[14], 0, 0);
|
||||
|
||||
// mix 6
|
||||
G2(v[0], v[4], v[8], v[12], 0, 0);
|
||||
G2(v[1], v[5], v[9], v[13], 0, 0);
|
||||
G2(v[2], v[6], v[10], v[14], 0, 0);
|
||||
G2(v[3], v[7], v[11], v[15], 0, 0);
|
||||
G2(v[0], v[5], v[10], v[15], 0, 0);
|
||||
G2(v[1], v[6], v[11], v[12], 0, 0);
|
||||
G2(v[2], v[7], v[8], v[13], 0, 0);
|
||||
G2(v[3], v[4], v[9], v[14], m, 0);
|
||||
|
||||
// mix 7
|
||||
G2(v[0], v[4], v[8], v[12], 0, 0);
|
||||
G2(v[1], v[5], v[9], v[13], m, 0);
|
||||
G2(v[2], v[6], v[10], v[14], 0, 0);
|
||||
G2(v[3], v[7], v[11], v[15], 0, 0);
|
||||
G2(v[0], v[5], v[10], v[15], 0, 0);
|
||||
G2(v[1], v[6], v[11], v[12], 0, 0);
|
||||
G2(v[2], v[7], v[8], v[13], 0, 0);
|
||||
G2(v[3], v[4], v[9], v[14], 0, 0);
|
||||
|
||||
// mix 8
|
||||
G2(v[0], v[4], v[8], v[12], 0, 0);
|
||||
G2(v[1], v[5], v[9], v[13], 0, 0);
|
||||
G2(v[2], v[6], v[10], v[14], 0, m);
|
||||
G2(v[3], v[7], v[11], v[15], 0, 0);
|
||||
G2(v[0], v[5], v[10], v[15], 0, 0);
|
||||
G2(v[1], v[6], v[11], v[12], 0, 0);
|
||||
G2(v[2], v[7], v[8], v[13], 0, 0);
|
||||
G2(v[3], v[4], v[9], v[14], 0, 0);
|
||||
|
||||
// mix 9
|
||||
G2(v[0], v[4], v[8], v[12], 0, 0);
|
||||
G2(v[1], v[5], v[9], v[13], 0, 0);
|
||||
G2(v[2], v[6], v[10], v[14], 0, 0);
|
||||
G2(v[3], v[7], v[11], v[15], 0, 0);
|
||||
G2(v[0], v[5], v[10], v[15], 0, 0);
|
||||
G2(v[1], v[6], v[11], v[12], 0, 0);
|
||||
G2(v[2], v[7], v[8], v[13], m, 0);
|
||||
G2(v[3], v[4], v[9], v[14], 0, 0);
|
||||
|
||||
// mix 10
|
||||
G2(v[0], v[4], v[8], v[12], 0, 0);
|
||||
G2(v[1], v[5], v[9], v[13], 0, 0);
|
||||
G2(v[2], v[6], v[10], v[14], 0, 0);
|
||||
G2(v[3], v[7], v[11], v[15], m, 0);
|
||||
G2(v[0], v[5], v[10], v[15], 0, 0);
|
||||
G2(v[1], v[6], v[11], v[12], 0, 0);
|
||||
G2(v[2], v[7], v[8], v[13], 0, 0);
|
||||
G2(v[3], v[4], v[9], v[14], 0, 0);
|
||||
|
||||
// mix 11
|
||||
G2(v[0], v[4], v[8], v[12], 0, m);
|
||||
G2(v[1], v[5], v[9], v[13], 0, 0);
|
||||
G2(v[2], v[6], v[10], v[14], 0, 0);
|
||||
G2(v[3], v[7], v[11], v[15], 0, 0);
|
||||
G2(v[0], v[5], v[10], v[15], 0, 0);
|
||||
G2(v[1], v[6], v[11], v[12], 0, 0);
|
||||
G2(v[2], v[7], v[8], v[13], 0, 0);
|
||||
G2(v[3], v[4], v[9], v[14], 0, 0);
|
||||
|
||||
// mix 12
|
||||
G2(v[0], v[4], v[8], v[12], 0, 0);
|
||||
G2(v[1], v[5], v[9], v[13], 0, 0);
|
||||
G2(v[2], v[6], v[10], v[14], 0, 0);
|
||||
G2(v[3], v[7], v[11], v[15], 0, 0);
|
||||
G2(v[0], v[5], v[10], v[15], m, 0);
|
||||
G2(v[1], v[6], v[11], v[12], 0, 0);
|
||||
G2(v[2], v[7], v[8], v[13], 0, 0);
|
||||
G2(v[3], v[4], v[9], v[14], 0, 0);
|
||||
|
||||
h[0] ^= v[0] ^ v[8];
|
||||
h[1] ^= v[1] ^ v[9];
|
||||
h[2] ^= v[2] ^ v[10];
|
||||
h[3] ^= v[3] ^ v[11];
|
||||
h[4] ^= v[4] ^ v[12];
|
||||
h[5] ^= v[5] ^ v[13];
|
||||
h[6] ^= v[6] ^ v[14];
|
||||
}
|
||||
|
||||
|
||||
__forceinline__ __device__ void blake2b_gpu_hash2(uint64_t* h, u32 idx) {
|
||||
u64 m[16] = { 0 };
|
||||
u32* ptr = (u32*)&m[1];
|
||||
|
||||
ptr[1] = idx;
|
||||
|
||||
u64 v[16];
|
||||
|
||||
v[0] = h[0];
|
||||
v[1] = h[1];
|
||||
v[2] = h[2];
|
||||
v[3] = h[3];
|
||||
v[4] = h[4];
|
||||
v[5] = h[5];
|
||||
v[6] = h[6];
|
||||
v[7] = h[7];
|
||||
v[8] = 0x6a09e667f3bcc908;
|
||||
v[9] = 0xbb67ae8584caa73b;
|
||||
v[10] = 0x3c6ef372fe94f82b;
|
||||
v[11] = 0xa54ff53a5f1d36f1;
|
||||
v[12] = 0x510e527fade682d1 ^ (128 + 16);
|
||||
v[13] = 0x9b05688c2b3e6c1f;
|
||||
v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff;
|
||||
v[15] = 0x5be0cd19137e2179;
|
||||
|
||||
ROUND(0);
|
||||
ROUND(1);
|
||||
ROUND(2);
|
||||
ROUND(3);
|
||||
ROUND(4);
|
||||
ROUND(5);
|
||||
ROUND(6);
|
||||
ROUND(7);
|
||||
ROUND(8);
|
||||
ROUND(9);
|
||||
ROUND(10);
|
||||
ROUND(11);
|
||||
|
||||
h[0] ^= v[0] ^ v[8];
|
||||
h[1] ^= v[1] ^ v[9];
|
||||
h[2] ^= v[2] ^ v[10];
|
||||
h[3] ^= v[3] ^ v[11];
|
||||
h[4] ^= v[4] ^ v[12];
|
||||
h[5] ^= v[5] ^ v[13];
|
||||
h[6] ^= v[6] ^ v[14];
|
||||
//h[7] ^= v[7] ^ v[15];
|
||||
//memcpy(hash, (uchar *)h, outlen);
|
||||
}
|
|
@ -2156,4 +2156,4 @@ template class eq_cuda_context<CONFIG_MODE_2>;
|
|||
|
||||
#ifdef CONFIG_MODE_3
|
||||
template class eq_cuda_context<CONFIG_MODE_3>;
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,57 @@
|
|||
set(EXECUTABLE cuda_tromp)
|
||||
|
||||
option(ENABLE_CUDA "Enable the cuda build" ON)
|
||||
|
||||
# depending on gcc version
|
||||
# ;-std=c++11 => Ubuntu 14.04 check gcc versions
|
||||
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11)
|
||||
|
||||
file(GLOB SRC_LIST
|
||||
cuda_tromp.cpp
|
||||
equi_miner.cu )
|
||||
file(GLOB HEADERS
|
||||
cuda_tromp.hpp
|
||||
eqcuda.hpp
|
||||
)
|
||||
|
||||
|
||||
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-m64;--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)
|
||||
|
||||
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)
|
||||
|
||||
add_definitions(-DHIST)
|
||||
#add_definitions(-DXINTREE)
|
||||
#add_definitions(-DUNROLL)
|
||||
|
||||
list(APPEND CUDA_NVCC_FLAGS_RELEASE -O3)
|
||||
|
||||
|
||||
FIND_PACKAGE(CUDA REQUIRED)
|
||||
if(COMPUTE AND (COMPUTE GREATER 0))
|
||||
LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
|
||||
else(COMPUTE AND (COMPUTE GREATER 0))
|
||||
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};; -gencode arch=compute_20,code=sm_21; -gencode arch=compute_30,code=sm_30; -gencode arch=compute_35,code=sm_35; -gencode arch=compute_50,code=sm_50; -gencode arch=compute_52,code=sm_52; -gencode arch=compute_61,code=sm_61 )
|
||||
endif(COMPUTE AND (COMPUTE GREATER 0))
|
||||
|
||||
include_directories(${CUDA_INCLUDE_DIRS})
|
||||
|
||||
find_package(Threads REQUIRED COMPONENTS)
|
||||
find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
|
||||
|
||||
if(CUDA_FOUND)
|
||||
message("CUDA FOUND")
|
||||
else()
|
||||
message("CUDA NOT FOUND")
|
||||
endif()
|
||||
|
||||
|
||||
include_directories(${CMAKE_CURRENT_BINARY_DIR})
|
||||
include_directories(${CUDA_INCLUDE_DIRS})
|
||||
include_directories(..)
|
||||
CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
|
||||
TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES})
|
||||
|
||||
message("-- CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
|
||||
|
||||
install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
|
||||
install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )
|
|
@ -56,6 +56,8 @@ void MinerFactory::ClearAllSolvers() {
|
|||
}
|
||||
|
||||
ISolver * MinerFactory::GenCPUSolver(int use_opt) {
|
||||
// TODO fix dynamic linking on Linux
|
||||
#ifdef USE_CPU_XENONCAT
|
||||
if (_use_xenoncat) {
|
||||
_solvers.push_back(new CPUSolverXenoncat(use_opt));
|
||||
return _solvers.back();
|
||||
|
@ -63,6 +65,10 @@ ISolver * MinerFactory::GenCPUSolver(int use_opt) {
|
|||
_solvers.push_back(new CPUSolverTromp(use_opt));
|
||||
return _solvers.back();
|
||||
}
|
||||
#else
|
||||
_solvers.push_back(new CPUSolverTromp(use_opt));
|
||||
return _solvers.back();
|
||||
#endif
|
||||
}
|
||||
|
||||
ISolver * MinerFactory::GenCUDASolver(int dev_id, int blocks, int threadsperblock) {
|
||||
|
@ -75,7 +81,7 @@ ISolver * MinerFactory::GenCUDASolver(int dev_id, int blocks, int threadsperbloc
|
|||
return _solvers.back();
|
||||
}
|
||||
}
|
||||
|
||||
// no OpenCL solvers at the moment keep for future reference
|
||||
ISolver * MinerFactory::GenOPENCLSolver(int platf_id, int dev_id) {
|
||||
if (_use_silentarmy) {
|
||||
_solvers.push_back(new OPENCLSolverSilentarmy(platf_id, dev_id));
|
||||
|
|
|
@ -108,7 +108,12 @@ void print_help()
|
|||
|
||||
void print_cuda_info()
|
||||
{
|
||||
int num_devices = cuda_tromp::getcount();
|
||||
#if defined(USE_CUDA_DJEZO) || defined(USE_CUDA_TROMP)
|
||||
#ifdef USE_CUDA_DJEZO
|
||||
int num_devices = cuda_djezo::getcount();
|
||||
#elif USE_CUDA_TROMP
|
||||
int num_devices = cuda_tromp::getcount();
|
||||
#endif
|
||||
|
||||
std::cout << "Number of CUDA devices found: " << num_devices << std::endl;
|
||||
|
||||
|
@ -123,6 +128,7 @@ void print_cuda_info()
|
|||
#endif
|
||||
std::cout << "\t#" << i << " " << gpuname << " | SM version: " << version << " | SM count: " << smcount << std::endl;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void print_opencl_info() {
|
||||
|
|
Loading…
Reference in New Issue