add other xenoncat blake2b files

This commit is contained in:
John Tromp 2016-10-23 15:21:30 -04:00
parent fa73e24c4b
commit ef21c94286
19 changed files with 1345 additions and 0 deletions

11
blake2b/Makefile Normal file
View File

@ -0,0 +1,11 @@
# Builds the AVX1 and AVX2 example programs.
# The assembled objects live in asm/ (both are checked in there); run
# asm/assemble.sh from inside asm/ to regenerate them after editing the sources.
CC = gcc

.PHONY: all clean

all: example_avx1 example_avx2

example_avx1: example_avx1.c asm/zcblake2_avx1.o
	$(CC) -o $@ example_avx1.c asm/zcblake2_avx1.o

example_avx2: example_avx2.c asm/zcblake2_avx2.o
	$(CC) -o $@ example_avx2.c asm/zcblake2_avx2.o

# -f: cleaning an already clean tree is not an error
clean:
	rm -f example_avx1 example_avx2

2
blake2b/asm/assemble.sh Normal file
View File

@ -0,0 +1,2 @@
#!/bin/sh
# Assemble both BLAKE2b objects with flat assembler (fasm).
# Run from the directory that contains the .asm files.
# Stop at the first failure so a broken AVX1 build is not hidden by the AVX2 step.
set -e
fasm zcblake2_avx1.asm
fasm zcblake2_avx2.asm

View File

@ -0,0 +1,36 @@
;Constant tables shared by the AVX1 and AVX2 BLAKE2b code.

;vpshufb masks, one 8-byte pattern repeated for both qwords of an xmm register.
;Result byte i takes source byte (i+3) mod 8 of the same qword: rotate right by 24 bits.
xshufb_ror24 db 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10
;Result byte i takes source byte (i+2) mod 8 of the same qword: rotate right by 16 bits.
xshufb_ror16 db 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9
;Reverses the byte order inside each qword.
xshufb_bswap8 db 7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8
xctrinc dd 0,2, 0,2
align 32
;The eight 64-bit BLAKE2b initialization vector words.
iv dq 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
dq 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
dq 0x510e527fade682d1, 0x9b05688c2b3e6c1f
dq 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
;s0..s7: the iv words with the hash parameters xored in (initial chaining value).
s0 dq 0x6a09e667f3bcc908 xor 0x1010032, 0xbb67ae8584caa73b ;0x32=50 bytes output
s2 dq 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
s4 dq 0x510e527fade682d1, 0x9b05688c2b3e6c1f
s6 dq 0x1f83d9abfb41bd6b xor 0x576f50687361635a ;Personalization
s7 dq 0x5be0cd19137e2179 xor 0x00000009000000c8 ;n=200, k=9
;iv[4..7] with 0x80 (128) xored into the first word; loaded as v12..v15 for the first block.
iv4xor128 dq 0x510e527fade682d1 xor 0x80, 0x9b05688c2b3e6c1f
dq 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
;iv[4..5] with 144 xored into the first word; loaded as v12,v13 for the second block.
iv4xor144 dq 0x510e527fade682d1 xor 144, 0x9b05688c2b3e6c1f
;First qword is the bitwise complement of iv[6]; must directly follow iv4xor144
;because the AVX2 code loads both labels with one 32-byte read.
iv6inverted dq 0xe07c265404be4294, 0x5be0cd19137e2179
align 32
yctrinit dd 0,0, 0,1, 0,2, 0,3
yctrinc dd 0,4, 0,4, 0,4, 0,4
;Message word order for 10 rounds, 16 bytes per round (160 bytes in total).
;Each row lists the words in the order the round code consumes them, which is why
;the first row reads 0,2,4,6,1,3,5,7,... instead of 0,1,2,3,...
blake2sigma db 0,2,4,6,1,3,5,7,8,10,12,14,9,11,13,15
db 14,4,9,13,10,8,15,6,1,0,11,5,12,2,7,3
db 11,12,5,15,8,0,2,13,10,3,7,9,14,6,1,4
db 7,3,13,11,9,1,12,14,2,5,4,15,6,10,0,8
db 9,5,2,10,0,7,4,15,14,11,6,3,1,12,8,13
db 2,6,0,8,12,10,11,3,4,7,15,1,13,5,14,9
db 12,1,14,4,5,15,13,10,0,6,9,8,7,3,2,11
db 13,7,12,3,11,14,1,9,5,15,8,2,0,4,6,10
db 6,14,11,0,15,9,3,8,12,13,1,10,2,7,4,5
db 10,8,7,1,2,4,6,5,15,9,3,13,11,14,12,0

View File

@ -0,0 +1,349 @@
;hR0: first half of a round. Runs the mixing function on the four register groups
;(a,b,c,d) = (xmm0,xmm4,xmm8,xmm12), (xmm1,xmm5,xmm9,xmm13),
;            (xmm2,xmm6,xmm10,xmm14), (xmm3,xmm7,xmm11,xmm15).
;Every xmm register carries two independent 64-bit lanes.
;m0..m3: message word indices added in the first half of the mix, m4..m7 in the second.
;lim:    assemble-time bound; the add for a word is only emitted when its index < lim.
;src:    base address of the message words, 16 bytes per word.
;[rsp] is a 16-byte spill slot accessed with vmovdqa, so rsp must be 16-byte aligned.
macro hR0 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
{
;a += b
vpaddq xmm0,xmm0,xmm4
vpaddq xmm1,xmm1,xmm5
vpaddq xmm2,xmm2,xmm6
vpaddq xmm3,xmm3,xmm7
;a += message word (first half)
if m0<lim
vpaddq xmm0,xmm0, xword [src+m0*16]
end if
if m1<lim
vpaddq xmm1,xmm1, xword [src+m1*16]
end if
if m2<lim
vpaddq xmm2,xmm2, xword [src+m2*16]
end if
if m3<lim
vpaddq xmm3,xmm3, xword [src+m3*16]
end if
;d ^= a
vpxor xmm12,xmm12,xmm0
vpxor xmm13,xmm13,xmm1
vpxor xmm14,xmm14,xmm2
vpxor xmm15,xmm15,xmm3
;d = d ror 32 (0xB1 swaps the two dwords of every qword)
vpshufd xmm12,xmm12,0xB1
vpshufd xmm13,xmm13,0xB1
vpshufd xmm14,xmm14,0xB1
vpshufd xmm15,xmm15,0xB1
;c += d
vpaddq xmm8,xmm8,xmm12
vpaddq xmm9,xmm9,xmm13
vpaddq xmm10,xmm10,xmm14
vpaddq xmm11,xmm11,xmm15
;b ^= c
vpxor xmm4,xmm4,xmm8
vpxor xmm5,xmm5,xmm9
vpxor xmm6,xmm6,xmm10
vpxor xmm7,xmm7,xmm11
;b = b ror 24; xmm8 is spilled while it holds the shuffle mask
vmovdqa [rsp], xmm8
vmovdqa xmm8, xword [xshufb_ror24]
vpshufb xmm4,xmm4,xmm8
vpshufb xmm5,xmm5,xmm8
vpshufb xmm6,xmm6,xmm8
vpshufb xmm7,xmm7,xmm8
vmovdqa xmm8, [rsp]
;a += b
vpaddq xmm0,xmm0,xmm4
vpaddq xmm1,xmm1,xmm5
vpaddq xmm2,xmm2,xmm6
vpaddq xmm3,xmm3,xmm7
;a += message word (second half)
if m4<lim
vpaddq xmm0,xmm0, xword [src+m4*16]
end if
if m5<lim
vpaddq xmm1,xmm1, xword [src+m5*16]
end if
if m6<lim
vpaddq xmm2,xmm2, xword [src+m6*16]
end if
if m7<lim
vpaddq xmm3,xmm3, xword [src+m7*16]
end if
;d ^= a
vpxor xmm12,xmm12,xmm0
vpxor xmm13,xmm13,xmm1
vpxor xmm14,xmm14,xmm2
vpxor xmm15,xmm15,xmm3
;d = d ror 16; xmm0 is spilled and serves as mask/temporary until the end of the macro
vmovdqa [rsp], xmm0
vmovdqa xmm0, xword [xshufb_ror16]
vpshufb xmm12,xmm12,xmm0
vpshufb xmm13,xmm13,xmm0
vpshufb xmm14,xmm14,xmm0
vpshufb xmm15,xmm15,xmm0
;c += d
vpaddq xmm8,xmm8,xmm12
vpaddq xmm9,xmm9,xmm13
vpaddq xmm10,xmm10,xmm14
vpaddq xmm11,xmm11,xmm15
;b ^= c
vpxor xmm4,xmm4,xmm8
vpxor xmm5,xmm5,xmm9
vpxor xmm6,xmm6,xmm10
vpxor xmm7,xmm7,xmm11
;b = b ror 63, built as (b+b) | (b>>63)
vpaddq xmm0,xmm4,xmm4
vpsrlq xmm4,xmm4,63
vpor xmm4,xmm4,xmm0
vpaddq xmm0,xmm5,xmm5
vpsrlq xmm5,xmm5,63
vpor xmm5,xmm5,xmm0
vpaddq xmm0,xmm6,xmm6
vpsrlq xmm6,xmm6,63
vpor xmm6,xmm6,xmm0
vpaddq xmm0,xmm7,xmm7
vpsrlq xmm7,xmm7,63
vpor xmm7,xmm7,xmm0
;restore the real xmm0
vmovdqa xmm0, [rsp]
}
;hR1: second half of a round. Same mixing steps as hR0, but on the register groups
;(a,b,c,d) = (xmm0,xmm5,xmm10,xmm15), (xmm1,xmm6,xmm11,xmm12),
;            (xmm2,xmm7,xmm8,xmm13),  (xmm3,xmm4,xmm9,xmm14).
;Every xmm register carries two independent 64-bit lanes.
;m0..m3: message word indices added in the first half of the mix, m4..m7 in the second.
;lim:    assemble-time bound; the add for a word is only emitted when its index < lim.
;src:    base address of the message words, 16 bytes per word.
;[rsp] is a 16-byte spill slot accessed with vmovdqa, so rsp must be 16-byte aligned.
macro hR1 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
{
;a += b
vpaddq xmm0,xmm0,xmm5
vpaddq xmm1,xmm1,xmm6
vpaddq xmm2,xmm2,xmm7
vpaddq xmm3,xmm3,xmm4
;a += message word (first half)
if m0<lim
vpaddq xmm0,xmm0, xword [src+m0*16]
end if
if m1<lim
vpaddq xmm1,xmm1, xword [src+m1*16]
end if
if m2<lim
vpaddq xmm2,xmm2, xword [src+m2*16]
end if
if m3<lim
vpaddq xmm3,xmm3, xword [src+m3*16]
end if
;d ^= a
vpxor xmm15,xmm15,xmm0
vpxor xmm12,xmm12,xmm1
vpxor xmm13,xmm13,xmm2
vpxor xmm14,xmm14,xmm3
;d = d ror 32 (0xB1 swaps the two dwords of every qword)
vpshufd xmm15,xmm15,0xB1
vpshufd xmm12,xmm12,0xB1
vpshufd xmm13,xmm13,0xB1
vpshufd xmm14,xmm14,0xB1
;c += d
vpaddq xmm10,xmm10,xmm15
vpaddq xmm11,xmm11,xmm12
vpaddq xmm8,xmm8,xmm13
vpaddq xmm9,xmm9,xmm14
;b ^= c
vpxor xmm5,xmm5,xmm10
vpxor xmm6,xmm6,xmm11
vpxor xmm7,xmm7,xmm8
vpxor xmm4,xmm4,xmm9
;b = b ror 24; xmm10 is spilled while it holds the shuffle mask
vmovdqa [rsp], xmm10
vmovdqa xmm10, xword [xshufb_ror24]
vpshufb xmm5,xmm5,xmm10
vpshufb xmm6,xmm6,xmm10
vpshufb xmm7,xmm7,xmm10
vpshufb xmm4,xmm4,xmm10
vmovdqa xmm10, [rsp]
;a += b
vpaddq xmm0,xmm0,xmm5
vpaddq xmm1,xmm1,xmm6
vpaddq xmm2,xmm2,xmm7
vpaddq xmm3,xmm3,xmm4
;a += message word (second half)
if m4<lim
vpaddq xmm0,xmm0, xword [src+m4*16]
end if
if m5<lim
vpaddq xmm1,xmm1, xword [src+m5*16]
end if
if m6<lim
vpaddq xmm2,xmm2, xword [src+m6*16]
end if
if m7<lim
vpaddq xmm3,xmm3, xword [src+m7*16]
end if
;d ^= a
vpxor xmm15,xmm15,xmm0
vpxor xmm12,xmm12,xmm1
vpxor xmm13,xmm13,xmm2
vpxor xmm14,xmm14,xmm3
;d = d ror 16; xmm0 is spilled and serves as mask/temporary until the end of the macro
vmovdqa [rsp], xmm0
vmovdqa xmm0, xword [xshufb_ror16]
vpshufb xmm15,xmm15,xmm0
vpshufb xmm12,xmm12,xmm0
vpshufb xmm13,xmm13,xmm0
vpshufb xmm14,xmm14,xmm0
;c += d
vpaddq xmm10,xmm10,xmm15
vpaddq xmm11,xmm11,xmm12
vpaddq xmm8,xmm8,xmm13
vpaddq xmm9,xmm9,xmm14
;b ^= c
vpxor xmm5,xmm5,xmm10
vpxor xmm6,xmm6,xmm11
vpxor xmm7,xmm7,xmm8
vpxor xmm4,xmm4,xmm9
;b = b ror 63, built as (b+b) | (b>>63)
vpaddq xmm0,xmm5,xmm5
vpsrlq xmm5,xmm5,63
vpor xmm5,xmm5,xmm0
vpaddq xmm0,xmm6,xmm6
vpsrlq xmm6,xmm6,63
vpor xmm6,xmm6,xmm0
vpaddq xmm0,xmm7,xmm7
vpsrlq xmm7,xmm7,63
vpor xmm7,xmm7,xmm0
vpaddq xmm0,xmm4,xmm4
vpsrlq xmm4,xmm4,63
vpor xmm4,xmm4,xmm0
;restore the real xmm0
vmovdqa xmm0, [rsp]
}
;Blake2bRounds2: expands rounds 1 through 11 as hR0/hR1 pairs.
;Round 0 is intentionally left commented out; the caller has to perform it itself
;before invoking this macro (Blake2beq2of2 does so with precomputed values).
;Each hR0/hR1 pair receives the two halves of one 16-entry message order row;
;rounds 10 and 11 repeat the orders of rounds 0 and 1.
;lim, src: passed through unchanged to hR0/hR1.
macro Blake2bRounds2 lim,src
{
;ROUND 0
;hR0 0,2,4,6,1,3,5,7,lim,src
;hR1 8,10,12,14,9,11,13,15,lim,src
;ROUND 1
hR0 14,4,9,13,10,8,15,6,lim,src
hR1 1,0,11,5,12,2,7,3,lim,src
;ROUND 2
hR0 11,12,5,15,8,0,2,13,lim,src
hR1 10,3,7,9,14,6,1,4,lim,src
;ROUND 3
hR0 7,3,13,11,9,1,12,14,lim,src
hR1 2,5,4,15,6,10,0,8,lim,src
;ROUND 4
hR0 9,5,2,10,0,7,4,15,lim,src
hR1 14,11,6,3,1,12,8,13,lim,src
;ROUND 5
hR0 2,6,0,8,12,10,11,3,lim,src
hR1 4,7,15,1,13,5,14,9,lim,src
;ROUND 6
hR0 12,1,14,4,5,15,13,10,lim,src
hR1 0,6,9,8,7,3,2,11,lim,src
;ROUND 7
hR0 13,7,12,3,11,14,1,9,lim,src
hR1 5,15,8,2,0,4,6,10,lim,src
;ROUND 8
hR0 6,14,11,0,15,9,3,8,lim,src
hR1 12,13,1,10,2,7,4,5,lim,src
;ROUND 9
hR0 10,8,7,1,2,4,6,5,lim,src
hR1 15,9,3,13,11,14,12,0,lim,src
;ROUND 10
hR0 0,2,4,6,1,3,5,7,lim,src
hR1 8,10,12,14,9,11,13,15,lim,src
;ROUND 11
hR0 14,4,9,13,10,8,15,6,lim,src
hR1 1,0,11,5,12,2,7,3,lim,src
}
;Blake2beq2of2: compresses the final message block for two lanes at once.
;mids: address of the precomputed midstate. Qwords at mids+0x00..+0x78 are
;      duplicated into both lanes (vmovddup) and used as partially computed
;      round-0 state; qwords at mids+0x80..+0xb0 are xored into the result.
;src:  address of the message words, 16 bytes per word; only words 0 and 1 are
;      ever added (word 1 directly below, and lim=2 for rounds 1..11).
;Result: xmm0..xmm6 hold output words 0..6 for both lanes. xmm7 is not
;finalized (its xors are commented out), so word 7 is not produced.
;Clobbers xmm0..xmm15 and uses [rsp] as a 16-byte aligned spill slot.
macro Blake2beq2of2 mids, src
{
;finish round 0 starting from the precomputed values, adding message word 1
vmovddup xmm0, qword [mids]
vpaddq xmm0,xmm0, xword [src+1*16]
vmovddup xmm12, qword [mids+0x08]
vpxor xmm12,xmm12,xmm0
vpshufb xmm12,xmm12, xword [xshufb_ror16]
vmovddup xmm8, qword [mids+0x10]
vpaddq xmm8,xmm8,xmm12
vmovddup xmm4, qword [mids+0x18]
vpxor xmm4,xmm4,xmm8
;xmm4 = xmm4 ror 63
vpaddq xmm2,xmm4,xmm4 ;xmm2 is temp
vpsrlq xmm4,xmm4,63
vpor xmm4,xmm4,xmm2
vmovddup xmm5, qword [mids+0x20]
vpaddq xmm0,xmm0,xmm5
vmovddup xmm1, qword [mids+0x30]
vpxor xmm12,xmm12,xmm1
vpshufd xmm12,xmm12,0xB1
vmovddup xmm13, qword [mids+0x38]
vpaddq xmm8,xmm8,xmm13
vmovddup xmm3, qword [mids+0x60]
vpaddq xmm3,xmm3,xmm4
vmovddup xmm15, qword [mids+0x48]
vpxor xmm15,xmm15,xmm0
vpshufd xmm15,xmm15,0xB1
vmovddup xmm11, qword [mids+0x58]
vpaddq xmm11,xmm11,xmm12
vmovddup xmm7, qword [mids+0x68]
vpxor xmm7,xmm7,xmm8
vmovddup xmm14, qword [mids+0x40]
vpxor xmm14,xmm14,xmm3
vpshufd xmm14,xmm14,0xB1
vmovddup xmm10, qword [mids+0x50]
vpaddq xmm10,xmm10,xmm15
vmovddup xmm6, qword [mids+0x28]
vpxor xmm6,xmm6,xmm11
vmovddup xmm9, qword [mids+0x70]
vpaddq xmm9,xmm9,xmm14
vpxor xmm5,xmm5,xmm10
vpxor xmm4,xmm4,xmm9
;ror 24 on xmm4..xmm7
vmovdqa xmm2, xword [xshufb_ror24] ;xmm2 is temp
vpshufb xmm5,xmm5,xmm2
vpshufb xmm6,xmm6,xmm2
vpshufb xmm7,xmm7,xmm2
vpshufb xmm4,xmm4,xmm2
vmovddup xmm2, qword [mids+0x78]
;from here on the sequence equals the second half of hR1 without message adds
vpaddq xmm0,xmm0,xmm5
vpaddq xmm1,xmm1,xmm6
vpaddq xmm2,xmm2,xmm7
vpaddq xmm3,xmm3,xmm4
vpxor xmm15,xmm15,xmm0
vpxor xmm12,xmm12,xmm1
vpxor xmm13,xmm13,xmm2
vpxor xmm14,xmm14,xmm3
vmovdqa [rsp], xmm0
vmovdqa xmm0, xword [xshufb_ror16]
vpshufb xmm15,xmm15,xmm0
vpshufb xmm12,xmm12,xmm0
vpshufb xmm13,xmm13,xmm0
vpshufb xmm14,xmm14,xmm0
vpaddq xmm10,xmm10,xmm15
vpaddq xmm11,xmm11,xmm12
vpaddq xmm8,xmm8,xmm13
vpaddq xmm9,xmm9,xmm14
vpxor xmm5,xmm5,xmm10
vpxor xmm6,xmm6,xmm11
vpxor xmm7,xmm7,xmm8
vpxor xmm4,xmm4,xmm9
vpaddq xmm0,xmm5,xmm5
vpsrlq xmm5,xmm5,63
vpor xmm5,xmm5,xmm0
vpaddq xmm0,xmm6,xmm6
vpsrlq xmm6,xmm6,63
vpor xmm6,xmm6,xmm0
vpaddq xmm0,xmm7,xmm7
vpsrlq xmm7,xmm7,63
vpor xmm7,xmm7,xmm0
vpaddq xmm0,xmm4,xmm4
vpsrlq xmm4,xmm4,63
vpor xmm4,xmm4,xmm0
vmovdqa xmm0, [rsp]
;rounds 1..11, only message words 0 and 1 are added
Blake2bRounds2 2,src
;fold the upper half of the working state into the lower half
vpxor xmm0, xmm0, xmm8
vpxor xmm1, xmm1, xmm9
vpxor xmm2, xmm2, xmm10
vpxor xmm3, xmm3, xmm11
vpxor xmm4, xmm4, xmm12
vpxor xmm5, xmm5, xmm13
vpxor xmm6, xmm6, xmm14
;vpxor xmm7, xmm7, xmm15
;xor in the qwords stored at mids+0x80..+0xb0
vmovddup xmm8, qword [mids+0x80]
vmovddup xmm9, qword [mids+0x88]
vmovddup xmm10, qword [mids+0x90]
vmovddup xmm11, qword [mids+0x98]
vmovddup xmm12, qword [mids+0xa0]
vmovddup xmm13, qword [mids+0xa8]
vmovddup xmm14, qword [mids+0xb0]
;vmovddup xmm15, qword [mids+0xb8]
vpxor xmm0, xmm0, xmm8
vpxor xmm1, xmm1, xmm9
vpxor xmm2, xmm2, xmm10
vpxor xmm3, xmm3, xmm11
vpxor xmm4, xmm4, xmm12
vpxor xmm5, xmm5, xmm13
vpxor xmm6, xmm6, xmm14
;vpxor xmm7, xmm7, xmm15
}

View File

@ -0,0 +1,350 @@
;hR0 (AVX2): first half of a round. Runs the mixing function on the register groups
;(a,b,c,d) = (ymm0,ymm4,ymm8,ymm12), (ymm1,ymm5,ymm9,ymm13),
;            (ymm2,ymm6,ymm10,ymm14), (ymm3,ymm7,ymm11,ymm15).
;Every ymm register carries four independent 64-bit lanes.
;m0..m3: message word indices added in the first half of the mix, m4..m7 in the second.
;lim:    assemble-time bound; the add for a word is only emitted when its index < lim.
;src:    base address of the message words, 32 bytes per word.
;[rsp] is a 32-byte spill slot accessed with vmovdqa, so rsp must be 32-byte aligned.
macro hR0 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
{
;a += b
vpaddq ymm0,ymm0,ymm4
vpaddq ymm1,ymm1,ymm5
vpaddq ymm2,ymm2,ymm6
vpaddq ymm3,ymm3,ymm7
;a += message word (first half)
if m0<lim
vpaddq ymm0,ymm0, yword [src+m0*32]
end if
if m1<lim
vpaddq ymm1,ymm1, yword [src+m1*32]
end if
if m2<lim
vpaddq ymm2,ymm2, yword [src+m2*32]
end if
if m3<lim
vpaddq ymm3,ymm3, yword [src+m3*32]
end if
;d ^= a
vpxor ymm12,ymm12,ymm0
vpxor ymm13,ymm13,ymm1
vpxor ymm14,ymm14,ymm2
vpxor ymm15,ymm15,ymm3
;d = d ror 32 (0xB1 swaps the two dwords of every qword)
vpshufd ymm12,ymm12,0xB1
vpshufd ymm13,ymm13,0xB1
vpshufd ymm14,ymm14,0xB1
vpshufd ymm15,ymm15,0xB1
;c += d
vpaddq ymm8,ymm8,ymm12
vpaddq ymm9,ymm9,ymm13
vpaddq ymm10,ymm10,ymm14
vpaddq ymm11,ymm11,ymm15
;b ^= c
vpxor ymm4,ymm4,ymm8
vpxor ymm5,ymm5,ymm9
vpxor ymm6,ymm6,ymm10
vpxor ymm7,ymm7,ymm11
;b = b ror 24; ymm8 is spilled while it holds the shuffle mask (16-byte mask copied to both halves)
vmovdqa [rsp], ymm8
vbroadcasti128 ymm8, xword [xshufb_ror24]
vpshufb ymm4,ymm4,ymm8
vpshufb ymm5,ymm5,ymm8
vpshufb ymm6,ymm6,ymm8
vpshufb ymm7,ymm7,ymm8
vmovdqa ymm8, [rsp]
;a += b
vpaddq ymm0,ymm0,ymm4
vpaddq ymm1,ymm1,ymm5
vpaddq ymm2,ymm2,ymm6
vpaddq ymm3,ymm3,ymm7
;a += message word (second half)
if m4<lim
vpaddq ymm0,ymm0, yword [src+m4*32]
end if
if m5<lim
vpaddq ymm1,ymm1, yword [src+m5*32]
end if
if m6<lim
vpaddq ymm2,ymm2, yword [src+m6*32]
end if
if m7<lim
vpaddq ymm3,ymm3, yword [src+m7*32]
end if
;d ^= a
vpxor ymm12,ymm12,ymm0
vpxor ymm13,ymm13,ymm1
vpxor ymm14,ymm14,ymm2
vpxor ymm15,ymm15,ymm3
;d = d ror 16; ymm0 is spilled and serves as mask/temporary until the end of the macro
vmovdqa [rsp], ymm0
vbroadcasti128 ymm0, xword [xshufb_ror16]
vpshufb ymm12,ymm12,ymm0
vpshufb ymm13,ymm13,ymm0
vpshufb ymm14,ymm14,ymm0
vpshufb ymm15,ymm15,ymm0
;c += d
vpaddq ymm8,ymm8,ymm12
vpaddq ymm9,ymm9,ymm13
vpaddq ymm10,ymm10,ymm14
vpaddq ymm11,ymm11,ymm15
;b ^= c
vpxor ymm4,ymm4,ymm8
vpxor ymm5,ymm5,ymm9
vpxor ymm6,ymm6,ymm10
vpxor ymm7,ymm7,ymm11
;b = b ror 63, built as (b+b) | (b>>63)
vpaddq ymm0,ymm4,ymm4
vpsrlq ymm4,ymm4,63
vpor ymm4,ymm4,ymm0
vpaddq ymm0,ymm5,ymm5
vpsrlq ymm5,ymm5,63
vpor ymm5,ymm5,ymm0
vpaddq ymm0,ymm6,ymm6
vpsrlq ymm6,ymm6,63
vpor ymm6,ymm6,ymm0
vpaddq ymm0,ymm7,ymm7
vpsrlq ymm7,ymm7,63
vpor ymm7,ymm7,ymm0
;restore the real ymm0
vmovdqa ymm0, [rsp]
}
;hR1 (AVX2): second half of a round. Same mixing steps as hR0, on the register groups
;(a,b,c,d) = (ymm0,ymm5,ymm10,ymm15), (ymm1,ymm6,ymm11,ymm12),
;            (ymm2,ymm7,ymm8,ymm13),  (ymm3,ymm4,ymm9,ymm14).
;Every ymm register carries four independent 64-bit lanes.
;m0..m3: message word indices added in the first half of the mix, m4..m7 in the second.
;lim:    assemble-time bound; the add for a word is only emitted when its index < lim.
;src:    base address of the message words, 32 bytes per word.
;[rsp] is a 32-byte spill slot accessed with vmovdqa, so rsp must be 32-byte aligned.
macro hR1 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
{
;a += b
vpaddq ymm0,ymm0,ymm5
vpaddq ymm1,ymm1,ymm6
vpaddq ymm2,ymm2,ymm7
vpaddq ymm3,ymm3,ymm4
;a += message word (first half)
if m0<lim
vpaddq ymm0,ymm0, yword [src+m0*32]
end if
if m1<lim
vpaddq ymm1,ymm1, yword [src+m1*32]
end if
if m2<lim
vpaddq ymm2,ymm2, yword [src+m2*32]
end if
if m3<lim
vpaddq ymm3,ymm3, yword [src+m3*32]
end if
;d ^= a
vpxor ymm15,ymm15,ymm0
vpxor ymm12,ymm12,ymm1
vpxor ymm13,ymm13,ymm2
vpxor ymm14,ymm14,ymm3
;d = d ror 32 (0xB1 swaps the two dwords of every qword)
vpshufd ymm15,ymm15,0xB1
vpshufd ymm12,ymm12,0xB1
vpshufd ymm13,ymm13,0xB1
vpshufd ymm14,ymm14,0xB1
;c += d
vpaddq ymm10,ymm10,ymm15
vpaddq ymm11,ymm11,ymm12
vpaddq ymm8,ymm8,ymm13
vpaddq ymm9,ymm9,ymm14
;b ^= c
vpxor ymm5,ymm5,ymm10
vpxor ymm6,ymm6,ymm11
vpxor ymm7,ymm7,ymm8
vpxor ymm4,ymm4,ymm9
;b = b ror 24; ymm10 is spilled while it holds the shuffle mask
vmovdqa [rsp], ymm10
vbroadcasti128 ymm10, xword [xshufb_ror24]
vpshufb ymm5,ymm5,ymm10
vpshufb ymm6,ymm6,ymm10
vpshufb ymm7,ymm7,ymm10
vpshufb ymm4,ymm4,ymm10
vmovdqa ymm10, [rsp]
;a += b
vpaddq ymm0,ymm0,ymm5
vpaddq ymm1,ymm1,ymm6
vpaddq ymm2,ymm2,ymm7
vpaddq ymm3,ymm3,ymm4
;a += message word (second half)
if m4<lim
vpaddq ymm0,ymm0, yword [src+m4*32]
end if
if m5<lim
vpaddq ymm1,ymm1, yword [src+m5*32]
end if
if m6<lim
vpaddq ymm2,ymm2, yword [src+m6*32]
end if
if m7<lim
vpaddq ymm3,ymm3, yword [src+m7*32]
end if
;d ^= a
vpxor ymm15,ymm15,ymm0
vpxor ymm12,ymm12,ymm1
vpxor ymm13,ymm13,ymm2
vpxor ymm14,ymm14,ymm3
;d = d ror 16; ymm0 is spilled and serves as mask/temporary until the end of the macro
vmovdqa [rsp], ymm0
vbroadcasti128 ymm0, xword [xshufb_ror16]
vpshufb ymm15,ymm15,ymm0
vpshufb ymm12,ymm12,ymm0
vpshufb ymm13,ymm13,ymm0
vpshufb ymm14,ymm14,ymm0
;c += d
vpaddq ymm10,ymm10,ymm15
vpaddq ymm11,ymm11,ymm12
vpaddq ymm8,ymm8,ymm13
vpaddq ymm9,ymm9,ymm14
;b ^= c
vpxor ymm5,ymm5,ymm10
vpxor ymm6,ymm6,ymm11
vpxor ymm7,ymm7,ymm8
vpxor ymm4,ymm4,ymm9
;b = b ror 63, built as (b+b) | (b>>63)
vpaddq ymm0,ymm5,ymm5
vpsrlq ymm5,ymm5,63
vpor ymm5,ymm5,ymm0
vpaddq ymm0,ymm6,ymm6
vpsrlq ymm6,ymm6,63
vpor ymm6,ymm6,ymm0
vpaddq ymm0,ymm7,ymm7
vpsrlq ymm7,ymm7,63
vpor ymm7,ymm7,ymm0
vpaddq ymm0,ymm4,ymm4
vpsrlq ymm4,ymm4,63
vpor ymm4,ymm4,ymm0
;restore the real ymm0
vmovdqa ymm0, [rsp]
}
;Blake2bRounds2 (AVX2): expands rounds 1 through 11 as hR0/hR1 pairs.
;Round 0 is intentionally left commented out; the caller has to perform it itself
;before invoking this macro (Blake2beq2of2 does so with precomputed values).
;Each hR0/hR1 pair receives the two halves of one 16-entry message order row;
;rounds 10 and 11 repeat the orders of rounds 0 and 1.
;lim, src: passed through unchanged to hR0/hR1.
macro Blake2bRounds2 lim,src
{
;ROUND 0
;hR0 0,2,4,6,1,3,5,7,lim,src
;hR1 8,10,12,14,9,11,13,15,lim,src
;ROUND 1
hR0 14,4,9,13,10,8,15,6,lim,src
hR1 1,0,11,5,12,2,7,3,lim,src
;ROUND 2
hR0 11,12,5,15,8,0,2,13,lim,src
hR1 10,3,7,9,14,6,1,4,lim,src
;ROUND 3
hR0 7,3,13,11,9,1,12,14,lim,src
hR1 2,5,4,15,6,10,0,8,lim,src
;ROUND 4
hR0 9,5,2,10,0,7,4,15,lim,src
hR1 14,11,6,3,1,12,8,13,lim,src
;ROUND 5
hR0 2,6,0,8,12,10,11,3,lim,src
hR1 4,7,15,1,13,5,14,9,lim,src
;ROUND 6
hR0 12,1,14,4,5,15,13,10,lim,src
hR1 0,6,9,8,7,3,2,11,lim,src
;ROUND 7
hR0 13,7,12,3,11,14,1,9,lim,src
hR1 5,15,8,2,0,4,6,10,lim,src
;ROUND 8
hR0 6,14,11,0,15,9,3,8,lim,src
hR1 12,13,1,10,2,7,4,5,lim,src
;ROUND 9
hR0 10,8,7,1,2,4,6,5,lim,src
hR1 15,9,3,13,11,14,12,0,lim,src
;ROUND 10
hR0 0,2,4,6,1,3,5,7,lim,src
hR1 8,10,12,14,9,11,13,15,lim,src
;ROUND 11
hR0 14,4,9,13,10,8,15,6,lim,src
hR1 1,0,11,5,12,2,7,3,lim,src
}
;Blake2beq2of2 (AVX2): compresses the final message block for four lanes at once.
;mids: address of the precomputed midstate. Qwords at mids+0x00..+0x78 are
;      broadcast to all four lanes (vpbroadcastq) and used as partially computed
;      round-0 state; qwords at mids+0x80..+0xb0 are xored into the result.
;src:  address of the message words, 32 bytes per word; only words 0 and 1 are
;      ever added (word 1 directly below, and lim=2 for rounds 1..11).
;Result: ymm0..ymm6 hold output words 0..6 for all four lanes. ymm7 is not
;finalized (its xors are commented out), so word 7 is not produced.
;Clobbers ymm0..ymm15 and uses [rsp] as a 32-byte aligned spill slot.
macro Blake2beq2of2 mids, src
{
;finish round 0 starting from the precomputed values, adding message word 1
vpbroadcastq ymm0, qword [mids]
vpaddq ymm0,ymm0, yword [src+1*32]
vpbroadcastq ymm12, qword [mids+0x08]
vpxor ymm12,ymm12,ymm0
vbroadcasti128 ymm2, xword [xshufb_ror16] ;ymm2 is temp
vpshufb ymm12,ymm12,ymm2
vpbroadcastq ymm8, qword [mids+0x10]
vpaddq ymm8,ymm8,ymm12
vpbroadcastq ymm4, qword [mids+0x18]
vpxor ymm4,ymm4,ymm8
;ymm4 = ymm4 ror 63
vpaddq ymm2,ymm4,ymm4 ;ymm2 is temp
vpsrlq ymm4,ymm4,63
vpor ymm4,ymm4,ymm2
vpbroadcastq ymm5, qword [mids+0x20]
vpaddq ymm0,ymm0,ymm5
vpbroadcastq ymm1, qword [mids+0x30]
vpxor ymm12,ymm12,ymm1
vpshufd ymm12,ymm12,0xB1
vpbroadcastq ymm13, qword [mids+0x38]
vpaddq ymm8,ymm8,ymm13
vpbroadcastq ymm3, qword [mids+0x60]
vpaddq ymm3,ymm3,ymm4
vpbroadcastq ymm15, qword [mids+0x48]
vpxor ymm15,ymm15,ymm0
vpshufd ymm15,ymm15,0xB1
vpbroadcastq ymm11, qword [mids+0x58]
vpaddq ymm11,ymm11,ymm12
vpbroadcastq ymm7, qword [mids+0x68]
vpxor ymm7,ymm7,ymm8
vpbroadcastq ymm14, qword [mids+0x40]
vpxor ymm14,ymm14,ymm3
vpshufd ymm14,ymm14,0xB1
vpbroadcastq ymm10, qword [mids+0x50]
vpaddq ymm10,ymm10,ymm15
vpbroadcastq ymm6, qword [mids+0x28]
vpxor ymm6,ymm6,ymm11
vpbroadcastq ymm9, qword [mids+0x70]
vpaddq ymm9,ymm9,ymm14
vpxor ymm5,ymm5,ymm10
vpxor ymm4,ymm4,ymm9
;ror 24 on ymm4..ymm7
vbroadcasti128 ymm2, xword [xshufb_ror24] ;ymm2 is temp
vpshufb ymm5,ymm5,ymm2
vpshufb ymm6,ymm6,ymm2
vpshufb ymm7,ymm7,ymm2
vpshufb ymm4,ymm4,ymm2
vpbroadcastq ymm2, qword [mids+0x78]
;from here on the sequence equals the second half of hR1 without message adds
vpaddq ymm0,ymm0,ymm5
vpaddq ymm1,ymm1,ymm6
vpaddq ymm2,ymm2,ymm7
vpaddq ymm3,ymm3,ymm4
vpxor ymm15,ymm15,ymm0
vpxor ymm12,ymm12,ymm1
vpxor ymm13,ymm13,ymm2
vpxor ymm14,ymm14,ymm3
vmovdqa [rsp], ymm0
vbroadcasti128 ymm0, xword [xshufb_ror16]
vpshufb ymm15,ymm15,ymm0
vpshufb ymm12,ymm12,ymm0
vpshufb ymm13,ymm13,ymm0
vpshufb ymm14,ymm14,ymm0
vpaddq ymm10,ymm10,ymm15
vpaddq ymm11,ymm11,ymm12
vpaddq ymm8,ymm8,ymm13
vpaddq ymm9,ymm9,ymm14
vpxor ymm5,ymm5,ymm10
vpxor ymm6,ymm6,ymm11
vpxor ymm7,ymm7,ymm8
vpxor ymm4,ymm4,ymm9
vpaddq ymm0,ymm5,ymm5
vpsrlq ymm5,ymm5,63
vpor ymm5,ymm5,ymm0
vpaddq ymm0,ymm6,ymm6
vpsrlq ymm6,ymm6,63
vpor ymm6,ymm6,ymm0
vpaddq ymm0,ymm7,ymm7
vpsrlq ymm7,ymm7,63
vpor ymm7,ymm7,ymm0
vpaddq ymm0,ymm4,ymm4
vpsrlq ymm4,ymm4,63
vpor ymm4,ymm4,ymm0
vmovdqa ymm0, [rsp]
;rounds 1..11, only message words 0 and 1 are added
Blake2bRounds2 2,src
;fold the upper half of the working state into the lower half
vpxor ymm0, ymm0, ymm8
vpxor ymm1, ymm1, ymm9
vpxor ymm2, ymm2, ymm10
vpxor ymm3, ymm3, ymm11
vpxor ymm4, ymm4, ymm12
vpxor ymm5, ymm5, ymm13
vpxor ymm6, ymm6, ymm14
;vpxor ymm7, ymm7, ymm15
;xor in the qwords stored at mids+0x80..+0xb0
vpbroadcastq ymm8, qword [mids+0x80]
vpbroadcastq ymm9, qword [mids+0x88]
vpbroadcastq ymm10, qword [mids+0x90]
vpbroadcastq ymm11, qword [mids+0x98]
vpbroadcastq ymm12, qword [mids+0xa0]
vpbroadcastq ymm13, qword [mids+0xa8]
vpbroadcastq ymm14, qword [mids+0xb0]
;vpbroadcastq ymm15, qword [mids+0xb8]
vpxor ymm0, ymm0, ymm8
vpxor ymm1, ymm1, ymm9
vpxor ymm2, ymm2, ymm10
vpxor ymm3, ymm3, ymm11
vpxor ymm4, ymm4, ymm12
vpxor ymm5, ymm5, ymm13
vpxor ymm6, ymm6, ymm14
;vpxor ymm7, ymm7, ymm15
}

View File

@ -0,0 +1,39 @@
;void Blake2Run2(unsigned char *hashout, void *midstate, uint32_t indexctr);
;Hashes two consecutive indices (indexctr and indexctr+1) in one call.
;hashout (rdi): hash output buffer: 2*64 bytes, written with vmovdqa and
;          therefore required to be 16-byte aligned. Hash for indexctr is at
;          +0x00, hash for indexctr+1 at +0x40. The last 8 bytes of each 64-byte
;          slot come from xmm7, which Blake2beq2of2 does not finalize.
;midstate (rsi): 256 bytes from Blake2PrepareMidstate2. This routine writes
;          the two indices into it at +0xd4 and +0xdc, so the buffer is
;          modified on every call.
;indexctr (edx): For n=200, k=9: {0, 2, 4, ..., 1048574}
include "macro_blake2b_avx1.asm"
Blake2Run2:
;build a 32-byte aligned frame: [rsp] is scratch for the macros,
;[rsp+0x20] keeps the caller's rsp
mov rax, rsp
sub rsp, 0x28
and rsp, -32
mov [rsp+0x20], rax
;upper dword of message word 1: indexctr for lane 0, indexctr+1 for lane 1
mov [rsi+0xd4], edx
add edx, 1
mov [rsi+0xdc], edx
Blake2beq2of2 rsi, rsi+0xc0
;xmm0..xmm7 hold one word for both lanes each; regroup them so that each
;lane's words become contiguous (low qwords = lane 0, high qwords = lane 1)
vpunpcklqdq xmm8, xmm0, xmm1
vpunpckhqdq xmm1, xmm0, xmm1
vpunpcklqdq xmm10, xmm2, xmm3
vpunpckhqdq xmm3, xmm2, xmm3
vpunpcklqdq xmm12, xmm4, xmm5
vpunpckhqdq xmm5, xmm4, xmm5
vpunpcklqdq xmm14, xmm6, xmm7
vpunpckhqdq xmm7, xmm6, xmm7
;lane 0
vmovdqa [rdi], xmm8
vmovdqa [rdi+0x10], xmm10
vmovdqa [rdi+0x20], xmm12
vmovdqa [rdi+0x30], xmm14
;lane 1
vmovdqa [rdi+0x40], xmm1
vmovdqa [rdi+0x50], xmm3
vmovdqa [rdi+0x60], xmm5
vmovdqa [rdi+0x70], xmm7
;restore the caller's stack pointer
mov rsp, [rsp+0x20]
ret

View File

@ -0,0 +1,49 @@
;void Blake2Run4(unsigned char *hashout, void *midstate, uint32_t indexctr);
;Hashes four consecutive indices (indexctr .. indexctr+3) in one call.
;hashout (rdi): hash output buffer: 4*64 bytes, written with vmovdqa and
;          therefore required to be 32-byte aligned. The hash for indexctr+j is
;          at +0x40*j. The last 8 bytes of each 64-byte slot come from ymm7,
;          which Blake2beq2of2 does not finalize.
;midstate (rsi): 256 bytes from Blake2PrepareMidstate4, 32-byte aligned
;          (accessed with vmovdqa). This routine rewrites midstate+0xe0 with the
;          four indices, so the buffer is modified on every call.
;indexctr (edx): For n=200, k=9: {0, 4, 8, ..., 1048572}
include "macro_blake2b_avx2.asm"
Blake2Run4:
;build a 32-byte aligned frame: [rsp] is scratch for the macros,
;[rsp+0x20] keeps the caller's rsp
mov rax, rsp
sub rsp, 0x28
and rsp, -32
mov [rsp+0x20], rax
vmovd xmm0, edx ;indexctr
vpbroadcastd ymm0, xmm0
;odd dwords become indexctr+0..3 (even dwords of yctrinit are zero and get replaced below)
vpaddd ymm0, ymm0, yword [yctrinit]
;0x55: even dwords come from the stored message word, odd dwords keep the indices
vpblendd ymm0, ymm0, yword [rsi+0xe0], 0x55
vmovdqa yword [rsi+0xe0], ymm0
Blake2beq2of2 rsi, rsi+0xc0
;ymm0..ymm7 hold one word for all four lanes each; transpose so that each
;lane's eight words become contiguous
vpunpcklqdq ymm8, ymm0, ymm1
vpunpckhqdq ymm9, ymm0, ymm1
vpunpcklqdq ymm10, ymm2, ymm3
vpunpckhqdq ymm11, ymm2, ymm3
vpunpcklqdq ymm12, ymm4, ymm5
vpunpckhqdq ymm13, ymm4, ymm5
vpunpcklqdq ymm14, ymm6, ymm7
vpunpckhqdq ymm15, ymm6, ymm7
vperm2i128 ymm0, ymm8, ymm10, 0x20
vperm2i128 ymm1, ymm12, ymm14, 0x20
vperm2i128 ymm2, ymm9, ymm11, 0x20
vperm2i128 ymm3, ymm13, ymm15, 0x20
vperm2i128 ymm4, ymm8, ymm10, 0x31
vperm2i128 ymm5, ymm12, ymm14, 0x31
vperm2i128 ymm6, ymm9, ymm11, 0x31
vperm2i128 ymm7, ymm13, ymm15, 0x31
vmovdqa [rdi], ymm0
vmovdqa [rdi+0x20], ymm1
vmovdqa [rdi+0x40], ymm2
vmovdqa [rdi+0x60], ymm3
vmovdqa [rdi+0x80], ymm4
vmovdqa [rdi+0xa0], ymm5
vmovdqa [rdi+0xc0], ymm6
vmovdqa [rdi+0xe0], ymm7
;clear the upper ymm halves before returning to code that may use legacy SSE,
;avoiding AVX/SSE transition penalties in the caller
vzeroupper
;restore the caller's stack pointer
mov rsp, [rsp+0x20]
ret

View File

@ -0,0 +1,212 @@
;void Blake2PrepareMidstate2(void *midstate, unsigned char *input);
;Runs 12 rounds over the first 128 input bytes and stores the resulting state,
;then precomputes the part of the second block's first round that is possible
;without the per-call index, so the run routine only has to finish it.
;midstate (rdi): 256 bytes of buffer for output midstate, aligned by 32
;          +0x00..+0x7f precomputed values of the second block's first round
;          +0x80..+0xbf state after the first block
;          +0xc0        input qword at offset 0x80, duplicated
;          +0xd0        input dword at offset 0x88 in dwords 0 and 2, zeros in 1 and 3
;input (rsi): 140 bytes header, preferably aligned by 8
;Uses 0x188 bytes of stack: message schedules at rsp, rsp+0x80 and rsp+0x100.
Blake2PrepareMidstate2:
sub rsp, 0x188
;xmm10/xmm11 stay loaded with the rotate masks for _ProcBlakeRound
vmovdqa xmm10, xword [xshufb_ror24]
vmovdqa xmm11, xword [xshufb_ror16]
;working state, two words per register: xmm0..3 = v0..7, xmm4..7 = v8..15
vmovdqa xmm0, xword [s0]
vmovdqa xmm1, xword [s2]
vmovdqa xmm2, xword [s4]
vmovdqa xmm3, xword [s6]
vmovdqa xmm4, xword [iv]
vmovdqa xmm5, xword [iv+0x10]
vmovdqa xmm6, xword [iv4xor128]
vmovdqa xmm7, xword [iv4xor128+0x10]
mov r8, rsp
lea r9, [blake2sigma]
lea r11, [blake2sigma+160]
;round 0, schedule kept at rsp
call _ProcBlakeMsgSched
call _ProcBlakeRound
add r8, 0x80
add r9, 16
;round 1, schedule kept at rsp+0x80
call _ProcBlakeMsgSched
call _ProcBlakeRound
add r8, 0x80
add r9, 16
;rounds 2..9, all reusing the schedule buffer at rsp+0x100
_LoopEhPrepare1:
call _ProcBlakeMsgSched
call _ProcBlakeRound
add r9, 16
cmp r9, r11
jb _LoopEhPrepare1
;rounds 10 and 11 reuse the schedules built for rounds 0 and 1
mov r8, rsp
call _ProcBlakeRound
add r8, 0x80
call _ProcBlakeRound
;new state = initial state xor lower half xor upper half of the working state
vpxor xmm0, xmm0, xmm4
vpxor xmm1, xmm1, xmm5
vpxor xmm2, xmm2, xmm6
vpxor xmm3, xmm3, xmm7
vpxor xmm0, xmm0, xword [s0]
vpxor xmm1, xmm1, xword [s2]
vpxor xmm2, xmm2, xword [s4]
vpxor xmm3, xmm3, xword [s6]
vmovdqa xword [rdi+0x80], xmm0
vmovdqa xword [rdi+0x90], xmm1
vmovdqa xword [rdi+0xa0], xmm2
vmovdqa xword [rdi+0xb0], xmm3
;message word 0 of the second block, duplicated for both lanes
vmovq xmm8, [rsi+0x80]
vpshufd xmm4, xmm8, 0x44
vmovdqa xword [rdi+0xc0], xmm4
;low dword of message word 1 for both lanes; the upper dwords stay zero here
vmovd xmm4, [rsi+0x88]
vpshufd xmm4, xmm4, 0x44
vmovdqa xword [rdi+0xd0], xmm4
;Begin second message block
vmovdqa xmm4, xword [iv]
vmovdqa xmm5, xword [iv+0x10]
vmovdqa xmm6, xword [iv4xor144]
vmovdqa xmm7, xword [iv6inverted]
vpaddq xmm0, xmm0, xmm2
vpaddq xmm1, xmm1, xmm3
vpaddq xmm0, xmm0, xmm8 ;xmm8[63:0]=message
vpxor xmm6, xmm6, xmm0
vpxor xmm7, xmm7, xmm1
vpshufd xmm6, xmm6, 0xb1
vmovq [rdi+0x08], xmm6 ;v12
vpshufd xmm7, xmm7, 0xb1
vpaddq xmm4, xmm4, xmm6
vmovq [rdi+0x10], xmm4 ;v8
vpaddq xmm5, xmm5, xmm7
vpxor xmm2, xmm2, xmm4
vpxor xmm3, xmm3, xmm5
vpshufb xmm2, xmm2, xmm10
vmovq [rdi+0x18], xmm2 ;v4
vpshufb xmm3, xmm3, xmm10
vpaddq xmm0, xmm0, xmm2
vmovq [rdi], xmm0 ;v0
vpaddq xmm1, xmm1, xmm3
vpextrq [rdi+0x60], xmm1, 1 ;v3
;add message (nonce, index) to xmm0 here, but we don't have
vpxor xmm6, xmm6, xmm0
vpxor xmm7, xmm7, xmm1
vpshufb xmm6, xmm6, xmm11
vpshufb xmm7, xmm7, xmm11
vmovdqa xword [rdi+0x40], xmm7 ;v14,15
vpaddq xmm4, xmm4, xmm6
vpextrq [rdi+0x70], xmm4, 1 ;v9
vpaddq xmm5, xmm5, xmm7
vmovdqa xword [rdi+0x50], xmm5 ;v10,11
vpxor xmm2, xmm2, xmm4
vpxor xmm3, xmm3, xmm5
;rotate right by 63: (x+x) | (x>>63)
vpaddq xmm8, xmm2, xmm2
vpsrlq xmm2, xmm2, 63
vpor xmm8, xmm2, xmm8 ;xmm8 takes xmm2
vpaddq xmm2, xmm3, xmm3 ;xmm2 is temp
vpsrlq xmm3, xmm3, 63
vpor xmm3, xmm3, xmm2
vpalignr xmm2, xmm3, xmm8, 8 ;xmm2 resume
vmovdqa xword [rdi+0x20], xmm2 ;v5,6
vpsrldq xmm3, xmm3, 8
vmovq [rdi+0x68], xmm3 ;v7
vpsrldq xmm7, xmm6, 8
vpaddq xmm0, xmm0, xmm2
vpextrq [rdi+0x30], xmm0, 1 ;v1
vpaddq xmm1, xmm1, xmm3
vmovq [rdi+0x78], xmm1 ;v2
vpxor xmm7, xmm7, xmm1
vpshufd xmm7, xmm7, 0xb1
vmovq [rdi+0x38], xmm7 ;v13
add rsp, 0x188
ret
align 16
;_ProcBlakeMsgSched: gathers 16 message qwords in the order given by one
;16-byte row of the sigma table: dst[i] = src[table[i]] for i = 0..15.
;Clobbers rax and r10.
_ProcBlakeMsgSched:
;rsi=src
;r8=dst
;r9=sigma table
xor r10d, r10d
_LoopBlakeMsgSched:
;eax = index of the next message word (zero-extended byte)
movzx eax, byte [r9+r10]
mov rax, [rsi+rax*8]
mov [r8+r10*8], rax
add r10d, 1
cmp r10d, 16
jb _LoopBlakeMsgSched
ret
align 16
;_ProcBlakeRound: one full round on a single state held two words per register:
;xmm0,xmm1 = v0..3, xmm2,xmm3 = v4..7, xmm4,xmm5 = v8..11, xmm6,xmm7 = v12..15.
;r8:    128-byte message schedule for this round (from _ProcBlakeMsgSched)
;xmm10: ror24 shuffle mask, xmm11: ror16 shuffle mask (set up by the caller)
;Clobbers xmm8 and xmm9.
_ProcBlakeRound:
;--- first half: registers used as loaded ---
vpaddq xmm0, xmm0, xmm2
vpaddq xmm1, xmm1, xmm3
vpaddq xmm0, xmm0, [r8]
vpaddq xmm1, xmm1, [r8+0x10]
vpxor xmm6, xmm6, xmm0
vpxor xmm7, xmm7, xmm1
;ror 32
vpshufd xmm6, xmm6, 0xb1
vpshufd xmm7, xmm7, 0xb1
vpaddq xmm4, xmm4, xmm6
vpaddq xmm5, xmm5, xmm7
vpxor xmm2, xmm2, xmm4
vpxor xmm3, xmm3, xmm5
;ror 24
vpshufb xmm2, xmm2, xmm10
vpshufb xmm3, xmm3, xmm10
vpaddq xmm0, xmm0, xmm2
vpaddq xmm1, xmm1, xmm3
vpaddq xmm0, xmm0, [r8+0x20]
vpaddq xmm1, xmm1, [r8+0x30]
vpxor xmm6, xmm6, xmm0
vpxor xmm7, xmm7, xmm1
;ror 16
vpshufb xmm9, xmm6, xmm11 ;xmm9 takes xmm6
vpshufb xmm7, xmm7, xmm11
vpaddq xmm4, xmm4, xmm9
vpaddq xmm5, xmm5, xmm7
vpxor xmm2, xmm2, xmm4
vpxor xmm3, xmm3, xmm5
;ror 63: (x+x) | (x>>63)
vpaddq xmm8, xmm2, xmm2
vpsrlq xmm2, xmm2, 63
vpor xmm8, xmm2, xmm8 ;xmm8 takes xmm2
vpaddq xmm2, xmm3, xmm3 ;xmm2 is temp
vpsrlq xmm3, xmm3, 63
vpor xmm3, xmm3, xmm2
;shift the qwords of v4..7 and v12..15 across register pairs for the second
;half; v8..11 is handled by using xmm5/xmm4 in swapped roles below
vpalignr xmm2, xmm3, xmm8, 8 ;xmm2 resume
vpalignr xmm3, xmm8, xmm3, 8
vpalignr xmm6, xmm9, xmm7, 8 ;xmm6 resume
vpalignr xmm7, xmm7, xmm9, 8
;--- second half ---
vpaddq xmm0, xmm0, xmm2
vpaddq xmm1, xmm1, xmm3
vpaddq xmm0, xmm0, [r8+0x40]
vpaddq xmm1, xmm1, [r8+0x50]
vpxor xmm6, xmm6, xmm0
vpxor xmm7, xmm7, xmm1
vpshufd xmm6, xmm6, 0xb1
vpshufd xmm7, xmm7, 0xb1
vpaddq xmm5, xmm5, xmm6
vpaddq xmm4, xmm4, xmm7
vpxor xmm2, xmm2, xmm5
vpxor xmm3, xmm3, xmm4
vpshufb xmm2, xmm2, xmm10
vpshufb xmm3, xmm3, xmm10
vpaddq xmm0, xmm0, xmm2
vpaddq xmm1, xmm1, xmm3
vpaddq xmm0, xmm0, [r8+0x60]
vpaddq xmm1, xmm1, [r8+0x70]
vpxor xmm6, xmm6, xmm0
vpxor xmm7, xmm7, xmm1
vpshufb xmm9, xmm6, xmm11 ;xmm9 takes xmm6
vpshufb xmm7, xmm7, xmm11
vpaddq xmm5, xmm5, xmm9
vpaddq xmm4, xmm4, xmm7
vpxor xmm2, xmm2, xmm5
vpxor xmm3, xmm3, xmm4
vpaddq xmm8, xmm2, xmm2
vpsrlq xmm2, xmm2, 63
vpor xmm8, xmm2, xmm8 ;xmm8 takes xmm2
vpaddq xmm2, xmm3, xmm3 ;xmm2 is temp
vpsrlq xmm3, xmm3, 63
vpor xmm3, xmm3, xmm2
;undo the qword shift so the registers are back in the entry layout
vpalignr xmm2, xmm8, xmm3, 8 ;xmm2 resume
vpalignr xmm3, xmm3, xmm8, 8
vpalignr xmm6, xmm7, xmm9, 8 ;xmm6 resume
vpalignr xmm7, xmm9, xmm7, 8
ret

View File

@ -0,0 +1,166 @@
;void Blake2PrepareMidstate4(void *midstate, unsigned char *input);
;Runs 12 rounds over the first 128 input bytes and stores the resulting state,
;then precomputes the part of the second block's first round that is possible
;without the per-call index, so the run routine only has to finish it.
;midstate (rdi): 256 bytes of buffer for output midstate, aligned by 32
;          (written with vmovdqa)
;          +0x00..+0x7f precomputed values of the second block's first round
;          +0x80..+0xbf state after the first block
;          +0xc0        input qword at offset 0x80, broadcast to four lanes
;          +0xe0        input dword at offset 0x88 in the even dwords, zeros in the odd ones
;input (rsi): 140 bytes header, preferably aligned by 8
;Uses 0x188 bytes of stack: message schedules at rsp, rsp+0x80 and rsp+0x100.
Blake2PrepareMidstate4:
sub rsp, 0x188
;ymm6/ymm7 stay loaded with the rotate masks for _ProcBlakeRound
vbroadcasti128 ymm6, xword [xshufb_ror24]
vbroadcasti128 ymm7, xword [xshufb_ror16]
;working state, four words per register: ymm0 = v0..3, ymm1 = v4..7, ymm2 = v8..11, ymm3 = v12..15
vmovdqa ymm0, yword [s0]
vmovdqa ymm1, yword [s4]
vmovdqa ymm2, yword [iv]
vmovdqa ymm3, yword [iv4xor128]
mov r8, rsp
lea r9, [blake2sigma]
lea r11, [blake2sigma+160]
;round 0, schedule kept at rsp
call _ProcBlakeMsgSched
call _ProcBlakeRound
add r8, 0x80
add r9, 16
;round 1, schedule kept at rsp+0x80
call _ProcBlakeMsgSched
call _ProcBlakeRound
add r8, 0x80
add r9, 16
;rounds 2..9, all reusing the schedule buffer at rsp+0x100
_LoopEhPrepare1:
call _ProcBlakeMsgSched
call _ProcBlakeRound
add r9, 16
cmp r9, r11
jb _LoopEhPrepare1
;rounds 10 and 11 reuse the schedules built for rounds 0 and 1
mov r8, rsp
call _ProcBlakeRound
add r8, 0x80
call _ProcBlakeRound
;new state = initial state xor lower half xor upper half of the working state
vpxor ymm0, ymm0, ymm2
vpxor ymm1, ymm1, ymm3
vpxor ymm0, ymm0, yword [s0]
vpxor ymm1, ymm1, yword [s4]
vmovdqa yword [rdi+0x80], ymm0
vmovdqa yword [rdi+0xa0], ymm1
;message word 0 of the second block, broadcast to all four lanes
vmovq xmm5, [rsi+0x80]
vpbroadcastq ymm4, xmm5
vmovdqa yword [rdi+0xc0], ymm4
;low dword of message word 1 for all lanes; the upper dwords stay zero here
vmovd xmm4, [rsi+0x88]
vpbroadcastq ymm4, xmm4
vmovdqa yword [rdi+0xe0], ymm4
;Begin second message block
vmovdqa ymm2, yword [iv]
vmovdqa ymm3, yword [iv4xor144] ;also loads iv6inverted
vpaddq ymm0, ymm0, ymm1
vpaddq ymm0, ymm0, ymm5 ;ymm5[63:0]=message
vpxor ymm3, ymm3, ymm0
vpshufd ymm3, ymm3, 0xb1
vmovq [rdi+0x08], xmm3 ;v12
vpaddq ymm2, ymm2, ymm3
vmovq [rdi+0x10], xmm2 ;v8
vpxor ymm1, ymm1, ymm2
vpshufb ymm1, ymm1, ymm6
vmovq [rdi+0x18], xmm1 ;v4
vpaddq ymm0, ymm0, ymm1
vmovq [rdi], xmm0 ;v0, v3 ready
;add message (nonce, index) to xmm0 here, but we don't have
vpxor ymm3, ymm3, ymm0
vpshufb ymm3, ymm3, ymm7
vextracti128 xmm4, ymm3, 1
vmovdqa xword [rdi+0x40], xmm4 ;v14,15
vpaddq ymm2, ymm2, ymm3
vpextrq [rdi+0x70], xmm2, 1 ;v9
vextracti128 xmm5, ymm2, 1
vmovdqa xword [rdi+0x50], xmm5 ;v10,11
vpxor ymm1, ymm1, ymm2
;rotate right by 63: (x+x) | (x>>63)
vpaddq ymm4, ymm1, ymm1
vpsrlq ymm1, ymm1, 63
vpor ymm1, ymm1, ymm4
;Valid:
; v1 v2 v3
; v5 v6 v7
; v9 v10 v11
; v13 v14 v15
;
;v1 v2 <- v6 v7
;v13 <- v2
vpermq ymm1, ymm1, 0x39
vmovdqa xword [rdi+0x20], xmm1 ;v5,6
vextracti128 xmm4, ymm0, 1
vextracti128 xmm5, ymm1, 1
vpextrq [rdi+0x60], xmm4, 1 ;v3
vmovq [rdi+0x68], xmm5 ;v7
vpsrldq xmm3, xmm3, 8
vpaddq xmm0, xmm0, xmm1
vpextrq [rdi+0x30], xmm0, 1 ;v1
vpaddq xmm4, xmm4, xmm5
vmovq [rdi+0x78], xmm4 ;v2
vpxor xmm3, xmm3, xmm4
vpshufd xmm3, xmm3, 0xb1
vmovq [rdi+0x38], xmm3 ;v13
;clear the upper ymm halves before returning to code that may use legacy SSE,
;avoiding AVX/SSE transition penalties in the caller
vzeroupper
add rsp, 0x188
ret
align 16
;_ProcBlakeMsgSched: gathers 16 message qwords in the order given by one
;16-byte row of the sigma table: dst[i] = src[table[i]] for i = 0..15.
;Clobbers rax and r10.
_ProcBlakeMsgSched:
;rsi=src
;r8=dst
;r9=sigma table
xor r10d, r10d
_LoopBlakeMsgSched:
;eax = index of the next message word (zero-extended byte)
movzx eax, byte [r9+r10]
mov rax, [rsi+rax*8]
mov [r8+r10*8], rax
add r10d, 1
cmp r10d, 16
jb _LoopBlakeMsgSched
ret
align 16
;_ProcBlakeRound: one full round on a single state held four words per register:
;ymm0 = v0..3, ymm1 = v4..7, ymm2 = v8..11, ymm3 = v12..15.
;r8:   128-byte message schedule for this round (from _ProcBlakeMsgSched)
;ymm6: ror24 shuffle mask, ymm7: ror16 shuffle mask (set up by the caller)
;Clobbers ymm4.
_ProcBlakeRound:
;--- first half: registers used as loaded ---
vpaddq ymm0, ymm0, ymm1
vpaddq ymm0, ymm0, [r8]
vpxor ymm3, ymm3, ymm0
;ror 32
vpshufd ymm3, ymm3, 0xb1
vpaddq ymm2, ymm2, ymm3
vpxor ymm1, ymm1, ymm2
vpshufb ymm1, ymm1, ymm6 ;ror24
vpaddq ymm0, ymm0, ymm1
vpaddq ymm0, ymm0, [r8+0x20]
vpxor ymm3, ymm3, ymm0
vpshufb ymm3, ymm3, ymm7 ;ror16
vpaddq ymm2, ymm2, ymm3
vpxor ymm1, ymm1, ymm2
;ror 63: (x+x) | (x>>63)
vpaddq ymm4, ymm1, ymm1
vpsrlq ymm1, ymm1, 63
vpor ymm1, ymm1, ymm4
;rotate the qwords of ymm1/ymm2/ymm3 by 1/2/3 positions for the second half
vpermq ymm1, ymm1, 0x39
vpermq ymm2, ymm2, 0x4e
vpermq ymm3, ymm3, 0x93
;--- second half ---
vpaddq ymm0, ymm0, ymm1
vpaddq ymm0, ymm0, [r8+0x40]
vpxor ymm3, ymm3, ymm0
vpshufd ymm3, ymm3, 0xb1
vpaddq ymm2, ymm2, ymm3
vpxor ymm1, ymm1, ymm2
vpshufb ymm1, ymm1, ymm6 ;ror24
vpaddq ymm0, ymm0, ymm1
vpaddq ymm0, ymm0, [r8+0x60]
vpxor ymm3, ymm3, ymm0
vpshufb ymm3, ymm3, ymm7 ;ror16
vpaddq ymm2, ymm2, ymm3
vpxor ymm1, ymm1, ymm2
vpaddq ymm4, ymm1, ymm1
vpsrlq ymm1, ymm1, 63
vpor ymm1, ymm1, ymm4
;undo the qword rotation so the registers are back in the entry layout
vpermq ymm1, ymm1, 0x93
vpermq ymm2, ymm2, 0x4e
vpermq ymm3, ymm3, 0x39
ret

View File

@ -0,0 +1,11 @@
;Top-level fasm source for the AVX1 object: emits an ELF64 relocatable object
;exporting Blake2PrepareMidstate2 and Blake2Run2.
format elf64
public Blake2PrepareMidstate2
public Blake2Run2
section '.text' executable align 64
include "proc_prepmidstate_avx1.asm"
align 16
include "proc_blake2_avx1.asm"
;constant tables; the 64-byte section alignment keeps the "align 32" directives
;inside data_blake2b.asm meaningful at run time
section '.data' writeable align 64
include "data_blake2b.asm"

BIN
blake2b/asm/zcblake2_avx1.o Normal file

Binary file not shown.

View File

@ -0,0 +1,11 @@
;Top-level fasm source for the AVX2 object: emits an ELF64 relocatable object
;exporting Blake2PrepareMidstate4 and Blake2Run4.
format elf64
public Blake2PrepareMidstate4
public Blake2Run4
section '.text' executable align 64
include "proc_prepmidstate_avx2.asm"
align 16
include "proc_blake2_avx2.asm"
;constant tables; the 64-byte section alignment keeps the "align 32" directives
;inside data_blake2b.asm meaningful at run time (the AVX2 code loads them with
;aligned 32-byte moves)
section '.data' writeable align 64
include "data_blake2b.asm"

BIN
blake2b/asm/zcblake2_avx2.o Normal file

Binary file not shown.

BIN
blake2b/example_avx1 Executable file

Binary file not shown.

51
blake2b/example_avx1.c Normal file
View File

@ -0,0 +1,51 @@
#include <stdio.h>
#include <stdint.h>
#include <string.h>
//Implemented in assembly (zcblake2_avx1.o).
//Prepares the reusable midstate from a block header.
void Blake2PrepareMidstate2(void *midstate, unsigned char *input);
//midstate: 256 bytes of buffer for output midstate, aligned by 32
//input: 140 bytes header, preferably aligned by 8
//Computes the hashes for indexctr and indexctr+1.
void Blake2Run2(unsigned char *hashout, void *midstate, uint32_t indexctr);
//hashout: hash output buffer: 2*64 bytes (one 64-byte slot per index);
//         written with aligned 16-byte stores, so it must be aligned by at least 16
//midstate: 256 bytes from Blake2PrepareMidstate2 (modified by the call)
//indexctr: For n=200, k=9: {0, 2, 4, ..., 1048574}
//140-byte header used as test input
unsigned char __attribute__((aligned(8))) testdata[140] =
{
0x04, 0x00, 0x00, 0x00, 0x91, 0x5F, 0xA6, 0x1C, 0x4F, 0xA5, 0x92, 0x3C, 0xE6, 0xEE, 0xAD, 0x06,
0x74, 0x6B, 0x61, 0x22, 0x54, 0x94, 0xEA, 0x5A, 0x2A, 0x97, 0xAE, 0x46, 0x6E, 0x6F, 0xAA, 0x9C,
0x6E, 0xF6, 0x3A, 0x0D, 0xA5, 0xFC, 0x67, 0xD7, 0xF8, 0xDC, 0x78, 0xC3, 0xC8, 0x70, 0xCA, 0x09,
0xBA, 0xAB, 0xAA, 0xF7, 0x02, 0x59, 0x68, 0xA8, 0x6F, 0xEB, 0x88, 0x75, 0xD3, 0xF3, 0xFF, 0xA7,
0x2E, 0xB0, 0x0F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x66, 0xCE, 0xD2, 0x57, 0x0F, 0x0F, 0x0F, 0x20, 0x00, 0x00, 0xF7, 0xF1,
0x94, 0xA2, 0x53, 0x8E, 0x42, 0x5F, 0x21, 0x33, 0xCF, 0xA8, 0xD3, 0xCB, 0xF4, 0xDF, 0x71, 0xEF,
0x38, 0x28, 0x51, 0x75, 0xCF, 0xED, 0xCB, 0x3E, 0x63, 0xA2, 0x00, 0x00
};
//Hashes every index pair of the n=200, k=9 range and writes the first 50 bytes
//of each hash to out.bin.
//expected output: 281dd5fc6d878538e640987b9bc597dbbd4af2cdf8bf5fb03bdfcefa40d8747d out.bin
//Returns 0 on success, 1 when out.bin cannot be created or written.
int main(void)
{
    unsigned char midstate_a[256+32];
    //round up to a 32-byte boundary; uintptr_t keeps the full pointer width
    //on platforms where long is narrower than a pointer
    void *pmidstate = (void *) (((uintptr_t) midstate_a + 31) & ~(uintptr_t) 31);
    unsigned char hashout_a[128+32];
    unsigned char *phashout = (unsigned char *) (((uintptr_t) hashout_a + 31) & ~(uintptr_t) 31);
    unsigned char buf[128];
    FILE *outfile;
    uint32_t i;

    Blake2PrepareMidstate2(pmidstate, testdata);
    outfile = fopen("out.bin", "wb");
    if (outfile == NULL) {
        perror("out.bin");
        return 1;
    }
    for (i = 0; i < 1048576; i += 2) {
        Blake2Run2(phashout, pmidstate, i);
        //each hash occupies a 64-byte slot, of which 50 bytes are the digest
        memcpy(buf, phashout, 50);
        memcpy(buf+50, phashout+64, 50);
        if (fwrite(buf, 100, 1, outfile) != 1) {
            perror("out.bin");
            fclose(outfile);
            return 1;
        }
    }
    //fclose flushes buffered data, so a failure here means lost output
    if (fclose(outfile) != 0) {
        perror("out.bin");
        return 1;
    }
    return 0;
}

BIN
blake2b/example_avx2 Executable file

Binary file not shown.

58
blake2b/example_avx2.c Normal file
View File

@ -0,0 +1,58 @@
#include <stdio.h>
#include <stdint.h>
#include <string.h>
//Implemented in assembly (zcblake2_avx2.o).
//Prepares the reusable midstate from a block header.
void Blake2PrepareMidstate4(void *midstate, unsigned char *input);
//midstate: 256 bytes of buffer for output midstate, aligned by 32
//          (the assembly stores to it with aligned 32-byte moves)
//input: 140 bytes header, preferably aligned by 8
//Computes the hashes for indexctr .. indexctr+3.
void Blake2Run4(unsigned char *hashout, void *midstate, uint32_t indexctr);
//hashout: hash output buffer: 4*64 bytes (one 64-byte slot per index);
//         written with aligned 32-byte stores, so it must be aligned by 32
//midstate: 256 bytes from Blake2PrepareMidstate4 (modified by the call)
//indexctr: For n=200, k=9: {0, 4, 8, ..., 1048572}
//140-byte header used as test input
unsigned char __attribute__((aligned(8))) testdata[140] =
{
0x04, 0x00, 0x00, 0x00, 0x91, 0x5F, 0xA6, 0x1C, 0x4F, 0xA5, 0x92, 0x3C, 0xE6, 0xEE, 0xAD, 0x06,
0x74, 0x6B, 0x61, 0x22, 0x54, 0x94, 0xEA, 0x5A, 0x2A, 0x97, 0xAE, 0x46, 0x6E, 0x6F, 0xAA, 0x9C,
0x6E, 0xF6, 0x3A, 0x0D, 0xA5, 0xFC, 0x67, 0xD7, 0xF8, 0xDC, 0x78, 0xC3, 0xC8, 0x70, 0xCA, 0x09,
0xBA, 0xAB, 0xAA, 0xF7, 0x02, 0x59, 0x68, 0xA8, 0x6F, 0xEB, 0x88, 0x75, 0xD3, 0xF3, 0xFF, 0xA7,
0x2E, 0xB0, 0x0F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x66, 0xCE, 0xD2, 0x57, 0x0F, 0x0F, 0x0F, 0x20, 0x00, 0x00, 0xF7, 0xF1,
0x94, 0xA2, 0x53, 0x8E, 0x42, 0x5F, 0x21, 0x33, 0xCF, 0xA8, 0xD3, 0xCB, 0xF4, 0xDF, 0x71, 0xEF,
0x38, 0x28, 0x51, 0x75, 0xCF, 0xED, 0xCB, 0x3E, 0x63, 0xA2, 0x00, 0x00
};
//Build with -DWRITE_OUTPUT=1 to write the first 50 bytes of every hash of the
//n=200, k=9 index range to out.bin. The default (0) is a benchmark: it hashes
//ten times that many indices, touches no file and prints a byte sum so the
//hashing cannot be optimized away.
//expected output (WRITE_OUTPUT=1): 281dd5fc6d878538e640987b9bc597dbbd4af2cdf8bf5fb03bdfcefa40d8747d out.bin
#ifndef WRITE_OUTPUT
#define WRITE_OUTPUT 0
#endif
//Returns 0 on success, 1 when out.bin cannot be created or written.
int main(void)
{
    unsigned char midstate_a[256+32];
    //the assembly accesses midstate with aligned 32-byte moves, so the buffer
    //must be rounded up to a 32-byte boundary just like hashout
    void *pmidstate = (void *) (((uintptr_t) midstate_a + 31) & ~(uintptr_t) 31);
    unsigned char hashout_a[256+32];
    unsigned char *phashout = (unsigned char *) (((uintptr_t) hashout_a + 31) & ~(uintptr_t) 31);
    uint32_t i;
#if WRITE_OUTPUT
    const uint32_t limit = 1048576;
    unsigned char buf[256];
    FILE *outfile;
#else
    const uint32_t limit = 10*1048576;
    int x = 0;
#endif

    Blake2PrepareMidstate4(pmidstate, testdata);
#if WRITE_OUTPUT
    outfile = fopen("out.bin", "wb");
    if (outfile == NULL) {
        perror("out.bin");
        return 1;
    }
#endif
    for (i = 0; i < limit; i += 4) {
        Blake2Run4(phashout, pmidstate, i);
#if WRITE_OUTPUT
        //each hash occupies a 64-byte slot, of which 50 bytes are the digest
        memcpy(buf, phashout, 50);
        memcpy(buf+50, phashout+64, 50);
        memcpy(buf+100, phashout+128, 50);
        memcpy(buf+150, phashout+192, 50);
        if (fwrite(buf, 200, 1, outfile) != 1) {
            perror("out.bin");
            fclose(outfile);
            return 1;
        }
#else
        x += phashout[0];
#endif
    }
#if WRITE_OUTPUT
    //fclose flushes buffered data, so a failure here means lost output
    if (fclose(outfile) != 0) {
        perror("out.bin");
        return 1;
    }
#else
    printf("x = %d\n", x);
#endif
    return 0;
}

0
blake2b/out.bin Normal file
View File

BIN
blake2b/zcblake2_avx1.o Normal file

Binary file not shown.