add other xenoncat blake2b files
This commit is contained in:
parent
fa73e24c4b
commit
ef21c94286
|
@ -0,0 +1,11 @@
|
||||||
|
# Builds the two example programs.
# The zcblake2_avx1.o / zcblake2_avx2.o objects have no rule here; they must
# already exist (presumably produced by assembling the .asm sources with fasm).
all: example_avx1 example_avx2

example_avx1: example_avx1.c zcblake2_avx1.o
	gcc -o example_avx1 example_avx1.c zcblake2_avx1.o

example_avx2: example_avx2.c zcblake2_avx2.o
	gcc -o example_avx2 example_avx2.c zcblake2_avx2.o

# Neither target names a real file, so both are declared phony.
.PHONY: all clean
# -f keeps clean from failing when the binaries have not been built yet.
clean:
	rm -f example_avx1 example_avx2
|
|
@ -0,0 +1,2 @@
|
||||||
|
# Runs fasm (flat assembler) once for each of the two BLAKE2b sources.
# NOTE(review): assumed to be a POSIX shell script - confirm before relying on '#' comments.
fasm zcblake2_avx1.asm
|
||||||
|
fasm zcblake2_avx2.asm
|
|
@ -0,0 +1,36 @@
|
||||||
|
;Constant tables for the BLAKE2b code.
;NOTE(review): xshufb_ror24 and xshufb_ror16 are loaded with vmovdqa by the AVX1 macros, which
;needs 16-byte alignment; nothing visible here aligns them - confirm the including file does.
xshufb_ror24 db 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10 ;vpshufb mask: rotate each qword right by 24 bits
|
||||||
|
xshufb_ror16 db 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9 ;vpshufb mask: rotate each qword right by 16 bits
|
||||||
|
xshufb_bswap8 db 7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8 ;vpshufb mask: reverse the byte order of each qword
|
||||||
|
xctrinc dd 0,2, 0,2 ;2 in the upper dword of each qword (presumably the index step for the 2-way code)
|
||||||
|
|
||||||
|
align 32
|
||||||
|
;BLAKE2b initialization vector IV[0..7]
iv dq 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
|
||||||
|
dq 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
|
||||||
|
dq 0x510e527fade682d1, 0x9b05688c2b3e6c1f
|
||||||
|
dq 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
|
||||||
|
|
||||||
|
;s0..s7: IV xor parameter block. In 0x1010032 the bytes are 0x32 (digest length),
;0x00 (key length), 0x01 (fanout), 0x01 (depth).
s0 dq 0x6a09e667f3bcc908 xor 0x1010032, 0xbb67ae8584caa73b ;0x32=50 bytes output
|
||||||
|
s2 dq 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
|
||||||
|
s4 dq 0x510e527fade682d1, 0x9b05688c2b3e6c1f
|
||||||
|
s6 dq 0x1f83d9abfb41bd6b xor 0x576f50687361635a ;Personalization
|
||||||
|
s7 dq 0x5be0cd19137e2179 xor 0x00000009000000c8 ;n=200, k=9
|
||||||
|
|
||||||
|
;IV[4..7] with 0x80 (=128) xored into IV[4]; Blake2PrepareMidstate2 loads these 32 bytes into xmm6/xmm7.
;Presumably 128 is the byte counter of the first message block.
iv4xor128 dq 0x510e527fade682d1 xor 0x80, 0x9b05688c2b3e6c1f
|
||||||
|
dq 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
|
||||||
|
;IV[4] xor 144 and IV[5]; presumably the byte counter of the final block. Not referenced in the code visible here.
iv4xor144 dq 0x510e527fade682d1 xor 144, 0x9b05688c2b3e6c1f
|
||||||
|
;bitwise complement of IV[6], then IV[7] (BLAKE2b inverts v[14] for the last block). Not referenced in the code visible here.
iv6inverted dq 0xe07c265404be4294, 0x5be0cd19137e2179
|
||||||
|
|
||||||
|
align 32
|
||||||
|
yctrinit dd 0,0, 0,1, 0,2, 0,3 ;lane offsets 0..3 in the upper dword of each qword; Blake2Run4 adds them to indexctr
|
||||||
|
yctrinc dd 0,4, 0,4, 0,4, 0,4 ;4 in the upper dword of each qword (presumably the index step for the 4-way code)
|
||||||
|
|
||||||
|
;Message schedule, one 16-byte row per round 0..9. Each row is laid out like the hR0/hR1 arguments:
;entries 0-7 are hR0's m0..m7 (column step), entries 8-15 are hR1's m0..m7 (diagonal step).
blake2sigma db 0,2,4,6,1,3,5,7,8,10,12,14,9,11,13,15
|
||||||
|
db 14,4,9,13,10,8,15,6,1,0,11,5,12,2,7,3
|
||||||
|
db 11,12,5,15,8,0,2,13,10,3,7,9,14,6,1,4
|
||||||
|
db 7,3,13,11,9,1,12,14,2,5,4,15,6,10,0,8
|
||||||
|
db 9,5,2,10,0,7,4,15,14,11,6,3,1,12,8,13
|
||||||
|
db 2,6,0,8,12,10,11,3,4,7,15,1,13,5,14,9
|
||||||
|
db 12,1,14,4,5,15,13,10,0,6,9,8,7,3,2,11
|
||||||
|
db 13,7,12,3,11,14,1,9,5,15,8,2,0,4,6,10
|
||||||
|
db 6,14,11,0,15,9,3,8,12,13,1,10,2,7,4,5
|
||||||
|
db 10,8,7,1,2,4,6,5,15,9,3,13,11,14,12,0
|
|
@ -0,0 +1,349 @@
|
||||||
|
;hR0: column step of one BLAKE2b round for two hashes at once (one hash per 64-bit lane).
;Registers: a=xmm0-3, b=xmm4-7, c=xmm8-11, d=xmm12-15; G runs on (xmm0,xmm4,xmm8,xmm12),
;(xmm1,xmm5,xmm9,xmm13), (xmm2,xmm6,xmm10,xmm14) and (xmm3,xmm7,xmm11,xmm15).
;m0..m3: message word indices added in the first half of G; m4..m7: in the second half.
;lim:    words whose index is >= lim are not added (presumably because they are zero - confirm at call sites).
;src:    base address of the message words, 16 bytes per word.
;[rsp] is used as a 16-byte spill slot through vmovdqa, so rsp must be 16-byte aligned.
macro hR0 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
|
||||||
|
{
|
||||||
|
vpaddq xmm0,xmm0,xmm4 ;a += b
|
||||||
|
vpaddq xmm1,xmm1,xmm5
|
||||||
|
vpaddq xmm2,xmm2,xmm6
|
||||||
|
vpaddq xmm3,xmm3,xmm7
|
||||||
|
if m0<lim ;a += message word, emitted only when the index is below lim
|
||||||
|
vpaddq xmm0,xmm0, xword [src+m0*16]
|
||||||
|
end if
|
||||||
|
if m1<lim
|
||||||
|
vpaddq xmm1,xmm1, xword [src+m1*16]
|
||||||
|
end if
|
||||||
|
if m2<lim
|
||||||
|
vpaddq xmm2,xmm2, xword [src+m2*16]
|
||||||
|
end if
|
||||||
|
if m3<lim
|
||||||
|
vpaddq xmm3,xmm3, xword [src+m3*16]
|
||||||
|
end if
|
||||||
|
vpxor xmm12,xmm12,xmm0 ;d ^= a
|
||||||
|
vpxor xmm13,xmm13,xmm1
|
||||||
|
vpxor xmm14,xmm14,xmm2
|
||||||
|
vpxor xmm15,xmm15,xmm3
|
||||||
|
vpshufd xmm12,xmm12,0xB1 ;d = ror(d,32): swaps the two dwords of each qword
|
||||||
|
vpshufd xmm13,xmm13,0xB1
|
||||||
|
vpshufd xmm14,xmm14,0xB1
|
||||||
|
vpshufd xmm15,xmm15,0xB1
|
||||||
|
vpaddq xmm8,xmm8,xmm12 ;c += d
|
||||||
|
vpaddq xmm9,xmm9,xmm13
|
||||||
|
vpaddq xmm10,xmm10,xmm14
|
||||||
|
vpaddq xmm11,xmm11,xmm15
|
||||||
|
vpxor xmm4,xmm4,xmm8 ;b ^= c
|
||||||
|
vpxor xmm5,xmm5,xmm9
|
||||||
|
vpxor xmm6,xmm6,xmm10
|
||||||
|
vpxor xmm7,xmm7,xmm11
|
||||||
|
vmovdqa [rsp], xmm8 ;spill xmm8 so it can hold the shuffle mask
|
||||||
|
vmovdqa xmm8, xword [xshufb_ror24]
|
||||||
|
vpshufb xmm4,xmm4,xmm8 ;b = ror(b,24)
|
||||||
|
vpshufb xmm5,xmm5,xmm8
|
||||||
|
vpshufb xmm6,xmm6,xmm8
|
||||||
|
vpshufb xmm7,xmm7,xmm8
|
||||||
|
vmovdqa xmm8, [rsp] ;restore xmm8
|
||||||
|
|
||||||
|
vpaddq xmm0,xmm0,xmm4 ;a += b
|
||||||
|
vpaddq xmm1,xmm1,xmm5
|
||||||
|
vpaddq xmm2,xmm2,xmm6
|
||||||
|
vpaddq xmm3,xmm3,xmm7
|
||||||
|
if m4<lim ;a += message word, emitted only when the index is below lim
|
||||||
|
vpaddq xmm0,xmm0, xword [src+m4*16]
|
||||||
|
end if
|
||||||
|
if m5<lim
|
||||||
|
vpaddq xmm1,xmm1, xword [src+m5*16]
|
||||||
|
end if
|
||||||
|
if m6<lim
|
||||||
|
vpaddq xmm2,xmm2, xword [src+m6*16]
|
||||||
|
end if
|
||||||
|
if m7<lim
|
||||||
|
vpaddq xmm3,xmm3, xword [src+m7*16]
|
||||||
|
end if
|
||||||
|
vpxor xmm12,xmm12,xmm0 ;d ^= a
|
||||||
|
vpxor xmm13,xmm13,xmm1
|
||||||
|
vpxor xmm14,xmm14,xmm2
|
||||||
|
vpxor xmm15,xmm15,xmm3
|
||||||
|
vmovdqa [rsp], xmm0 ;spill xmm0; it serves as the temp register until the end of the macro
|
||||||
|
vmovdqa xmm0, xword [xshufb_ror16]
|
||||||
|
vpshufb xmm12,xmm12,xmm0 ;d = ror(d,16)
|
||||||
|
vpshufb xmm13,xmm13,xmm0
|
||||||
|
vpshufb xmm14,xmm14,xmm0
|
||||||
|
vpshufb xmm15,xmm15,xmm0
|
||||||
|
vpaddq xmm8,xmm8,xmm12 ;c += d
|
||||||
|
vpaddq xmm9,xmm9,xmm13
|
||||||
|
vpaddq xmm10,xmm10,xmm14
|
||||||
|
vpaddq xmm11,xmm11,xmm15
|
||||||
|
vpxor xmm4,xmm4,xmm8 ;b ^= c
|
||||||
|
vpxor xmm5,xmm5,xmm9
|
||||||
|
vpxor xmm6,xmm6,xmm10
|
||||||
|
vpxor xmm7,xmm7,xmm11
|
||||||
|
|
||||||
|
vpaddq xmm0,xmm4,xmm4 ;b = ror(b,63), built as (b+b) | (b>>63)
|
||||||
|
vpsrlq xmm4,xmm4,63
|
||||||
|
vpor xmm4,xmm4,xmm0
|
||||||
|
vpaddq xmm0,xmm5,xmm5
|
||||||
|
vpsrlq xmm5,xmm5,63
|
||||||
|
vpor xmm5,xmm5,xmm0
|
||||||
|
vpaddq xmm0,xmm6,xmm6
|
||||||
|
vpsrlq xmm6,xmm6,63
|
||||||
|
vpor xmm6,xmm6,xmm0
|
||||||
|
vpaddq xmm0,xmm7,xmm7
|
||||||
|
vpsrlq xmm7,xmm7,63
|
||||||
|
vpor xmm7,xmm7,xmm0
|
||||||
|
|
||||||
|
vmovdqa xmm0, [rsp] ;restore xmm0
|
||||||
|
}
|
||||||
|
|
||||||
|
;hR1: diagonal step of one BLAKE2b round for two hashes at once (one hash per 64-bit lane).
;Same operations as the column step, but G runs on (xmm0,xmm5,xmm10,xmm15), (xmm1,xmm6,xmm11,xmm12),
;(xmm2,xmm7,xmm8,xmm13) and (xmm3,xmm4,xmm9,xmm14); no register is physically rotated.
;m0..m3: message word indices added in the first half of G; m4..m7: in the second half.
;lim:    words whose index is >= lim are not added (presumably because they are zero - confirm at call sites).
;src:    base address of the message words, 16 bytes per word.
;[rsp] is used as a 16-byte spill slot through vmovdqa, so rsp must be 16-byte aligned.
macro hR1 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
|
||||||
|
{
|
||||||
|
vpaddq xmm0,xmm0,xmm5 ;a += b
|
||||||
|
vpaddq xmm1,xmm1,xmm6
|
||||||
|
vpaddq xmm2,xmm2,xmm7
|
||||||
|
vpaddq xmm3,xmm3,xmm4
|
||||||
|
if m0<lim ;a += message word, emitted only when the index is below lim
|
||||||
|
vpaddq xmm0,xmm0, xword [src+m0*16]
|
||||||
|
end if
|
||||||
|
if m1<lim
|
||||||
|
vpaddq xmm1,xmm1, xword [src+m1*16]
|
||||||
|
end if
|
||||||
|
if m2<lim
|
||||||
|
vpaddq xmm2,xmm2, xword [src+m2*16]
|
||||||
|
end if
|
||||||
|
if m3<lim
|
||||||
|
vpaddq xmm3,xmm3, xword [src+m3*16]
|
||||||
|
end if
|
||||||
|
vpxor xmm15,xmm15,xmm0 ;d ^= a
|
||||||
|
vpxor xmm12,xmm12,xmm1
|
||||||
|
vpxor xmm13,xmm13,xmm2
|
||||||
|
vpxor xmm14,xmm14,xmm3
|
||||||
|
vpshufd xmm15,xmm15,0xB1 ;d = ror(d,32): swaps the two dwords of each qword
|
||||||
|
vpshufd xmm12,xmm12,0xB1
|
||||||
|
vpshufd xmm13,xmm13,0xB1
|
||||||
|
vpshufd xmm14,xmm14,0xB1
|
||||||
|
vpaddq xmm10,xmm10,xmm15 ;c += d
|
||||||
|
vpaddq xmm11,xmm11,xmm12
|
||||||
|
vpaddq xmm8,xmm8,xmm13
|
||||||
|
vpaddq xmm9,xmm9,xmm14
|
||||||
|
vpxor xmm5,xmm5,xmm10 ;b ^= c
|
||||||
|
vpxor xmm6,xmm6,xmm11
|
||||||
|
vpxor xmm7,xmm7,xmm8
|
||||||
|
vpxor xmm4,xmm4,xmm9
|
||||||
|
vmovdqa [rsp], xmm10 ;spill xmm10 so it can hold the shuffle mask
|
||||||
|
vmovdqa xmm10, xword [xshufb_ror24]
|
||||||
|
vpshufb xmm5,xmm5,xmm10 ;b = ror(b,24)
|
||||||
|
vpshufb xmm6,xmm6,xmm10
|
||||||
|
vpshufb xmm7,xmm7,xmm10
|
||||||
|
vpshufb xmm4,xmm4,xmm10
|
||||||
|
vmovdqa xmm10, [rsp] ;restore xmm10
|
||||||
|
|
||||||
|
vpaddq xmm0,xmm0,xmm5 ;a += b
|
||||||
|
vpaddq xmm1,xmm1,xmm6
|
||||||
|
vpaddq xmm2,xmm2,xmm7
|
||||||
|
vpaddq xmm3,xmm3,xmm4
|
||||||
|
if m4<lim ;a += message word, emitted only when the index is below lim
|
||||||
|
vpaddq xmm0,xmm0, xword [src+m4*16]
|
||||||
|
end if
|
||||||
|
if m5<lim
|
||||||
|
vpaddq xmm1,xmm1, xword [src+m5*16]
|
||||||
|
end if
|
||||||
|
if m6<lim
|
||||||
|
vpaddq xmm2,xmm2, xword [src+m6*16]
|
||||||
|
end if
|
||||||
|
if m7<lim
|
||||||
|
vpaddq xmm3,xmm3, xword [src+m7*16]
|
||||||
|
end if
|
||||||
|
vpxor xmm15,xmm15,xmm0 ;d ^= a
|
||||||
|
vpxor xmm12,xmm12,xmm1
|
||||||
|
vpxor xmm13,xmm13,xmm2
|
||||||
|
vpxor xmm14,xmm14,xmm3
|
||||||
|
vmovdqa [rsp], xmm0 ;spill xmm0; it serves as the temp register until the end of the macro
|
||||||
|
vmovdqa xmm0, xword [xshufb_ror16]
|
||||||
|
vpshufb xmm15,xmm15,xmm0 ;d = ror(d,16)
|
||||||
|
vpshufb xmm12,xmm12,xmm0
|
||||||
|
vpshufb xmm13,xmm13,xmm0
|
||||||
|
vpshufb xmm14,xmm14,xmm0
|
||||||
|
vpaddq xmm10,xmm10,xmm15 ;c += d
|
||||||
|
vpaddq xmm11,xmm11,xmm12
|
||||||
|
vpaddq xmm8,xmm8,xmm13
|
||||||
|
vpaddq xmm9,xmm9,xmm14
|
||||||
|
vpxor xmm5,xmm5,xmm10 ;b ^= c
|
||||||
|
vpxor xmm6,xmm6,xmm11
|
||||||
|
vpxor xmm7,xmm7,xmm8
|
||||||
|
vpxor xmm4,xmm4,xmm9
|
||||||
|
|
||||||
|
vpaddq xmm0,xmm5,xmm5 ;b = ror(b,63), built as (b+b) | (b>>63)
|
||||||
|
vpsrlq xmm5,xmm5,63
|
||||||
|
vpor xmm5,xmm5,xmm0
|
||||||
|
vpaddq xmm0,xmm6,xmm6
|
||||||
|
vpsrlq xmm6,xmm6,63
|
||||||
|
vpor xmm6,xmm6,xmm0
|
||||||
|
vpaddq xmm0,xmm7,xmm7
|
||||||
|
vpsrlq xmm7,xmm7,63
|
||||||
|
vpor xmm7,xmm7,xmm0
|
||||||
|
vpaddq xmm0,xmm4,xmm4
|
||||||
|
vpsrlq xmm4,xmm4,63
|
||||||
|
vpor xmm4,xmm4,xmm0
|
||||||
|
|
||||||
|
vmovdqa xmm0, [rsp] ;restore xmm0
|
||||||
|
}
|
||||||
|
|
||||||
|
;Blake2bRounds2: expands rounds 1..11 of BLAKE2b as one hR0 (columns) + one hR1 (diagonals) per round.
;Round 0 is left commented out: Blake2beq2of2 runs its own round-0 code before invoking this macro.
;Rounds 10 and 11 repeat the message schedules of rounds 0 and 1.
;lim, src: passed unchanged to hR0/hR1 (message index limit and message base address).
macro Blake2bRounds2 lim,src
|
||||||
|
{
|
||||||
|
;ROUND 0
|
||||||
|
;hR0 0,2,4,6,1,3,5,7,lim,src
|
||||||
|
;hR1 8,10,12,14,9,11,13,15,lim,src
|
||||||
|
|
||||||
|
;ROUND 1
|
||||||
|
hR0 14,4,9,13,10,8,15,6,lim,src
|
||||||
|
hR1 1,0,11,5,12,2,7,3,lim,src
|
||||||
|
|
||||||
|
;ROUND 2
|
||||||
|
hR0 11,12,5,15,8,0,2,13,lim,src
|
||||||
|
hR1 10,3,7,9,14,6,1,4,lim,src
|
||||||
|
|
||||||
|
;ROUND 3
|
||||||
|
hR0 7,3,13,11,9,1,12,14,lim,src
|
||||||
|
hR1 2,5,4,15,6,10,0,8,lim,src
|
||||||
|
|
||||||
|
;ROUND 4
|
||||||
|
hR0 9,5,2,10,0,7,4,15,lim,src
|
||||||
|
hR1 14,11,6,3,1,12,8,13,lim,src
|
||||||
|
|
||||||
|
;ROUND 5
|
||||||
|
hR0 2,6,0,8,12,10,11,3,lim,src
|
||||||
|
hR1 4,7,15,1,13,5,14,9,lim,src
|
||||||
|
|
||||||
|
;ROUND 6
|
||||||
|
hR0 12,1,14,4,5,15,13,10,lim,src
|
||||||
|
hR1 0,6,9,8,7,3,2,11,lim,src
|
||||||
|
|
||||||
|
;ROUND 7
|
||||||
|
hR0 13,7,12,3,11,14,1,9,lim,src
|
||||||
|
hR1 5,15,8,2,0,4,6,10,lim,src
|
||||||
|
|
||||||
|
;ROUND 8
|
||||||
|
hR0 6,14,11,0,15,9,3,8,lim,src
|
||||||
|
hR1 12,13,1,10,2,7,4,5,lim,src
|
||||||
|
|
||||||
|
;ROUND 9
|
||||||
|
hR0 10,8,7,1,2,4,6,5,lim,src
|
||||||
|
hR1 15,9,3,13,11,14,12,0,lim,src
|
||||||
|
|
||||||
|
;ROUND 10
|
||||||
|
hR0 0,2,4,6,1,3,5,7,lim,src
|
||||||
|
hR1 8,10,12,14,9,11,13,15,lim,src
|
||||||
|
|
||||||
|
;ROUND 11
|
||||||
|
hR0 14,4,9,13,10,8,15,6,lim,src
|
||||||
|
hR1 1,0,11,5,12,2,7,3,lim,src
|
||||||
|
}
|
||||||
|
|
||||||
|
;Blake2beq2of2: compresses one message block for two hashes at once, starting from a midstate.
;mids: address of the midstate. Qwords at +0x00..+0x78 are broadcast to both lanes and combined
;      in the prologue; qwords at +0x80..+0xb0 are xored into the result at the end.
;src:  base address of the message words, 16 bytes per word (one qword per hash).
;Flow: 1) prologue - the code that stands in for round 0; the only per-hash input it adds is
;         message word 1 at [src+1*16] (apparently the rest of round 0 is precomputed in mids);
;      2) Blake2bRounds2 2,src - rounds 1..11, adding only message words 0 and 1;
;      3) xmm0..xmm6 ^= xmm8..xmm14, then ^= mids+0x80..+0xb0.
;Result: xmm0..xmm6 hold output words 0..6 (lane 0 = first hash, lane 1 = second hash).
;xmm7 is NOT finalized (its xors are commented out). Clobbers xmm0-xmm15.
;[rsp] is used as a 16-byte spill slot through vmovdqa, so rsp must be 16-byte aligned.
macro Blake2beq2of2 mids, src
|
||||||
|
{
|
||||||
|
vmovddup xmm0, qword [mids]
|
||||||
|
vpaddq xmm0,xmm0, xword [src+1*16] ;per-hash message word 1
|
||||||
|
vmovddup xmm12, qword [mids+0x08]
|
||||||
|
vpxor xmm12,xmm12,xmm0
|
||||||
|
vpshufb xmm12,xmm12, xword [xshufb_ror16] ;ror 16
|
||||||
|
vmovddup xmm8, qword [mids+0x10]
|
||||||
|
vpaddq xmm8,xmm8,xmm12
|
||||||
|
vmovddup xmm4, qword [mids+0x18]
|
||||||
|
vpxor xmm4,xmm4,xmm8
|
||||||
|
vpaddq xmm2,xmm4,xmm4 ;xmm2 is temp
|
||||||
|
vpsrlq xmm4,xmm4,63
|
||||||
|
vpor xmm4,xmm4,xmm2 ;xmm4 = ror(xmm4,63)
|
||||||
|
|
||||||
|
vmovddup xmm5, qword [mids+0x20]
|
||||||
|
vpaddq xmm0,xmm0,xmm5
|
||||||
|
vmovddup xmm1, qword [mids+0x30]
|
||||||
|
vpxor xmm12,xmm12,xmm1
|
||||||
|
vpshufd xmm12,xmm12,0xB1 ;ror 32
|
||||||
|
vmovddup xmm13, qword [mids+0x38]
|
||||||
|
vpaddq xmm8,xmm8,xmm13
|
||||||
|
vmovddup xmm3, qword [mids+0x60]
|
||||||
|
vpaddq xmm3,xmm3,xmm4
|
||||||
|
vmovddup xmm15, qword [mids+0x48]
|
||||||
|
vpxor xmm15,xmm15,xmm0
|
||||||
|
vpshufd xmm15,xmm15,0xB1 ;ror 32
|
||||||
|
vmovddup xmm11, qword [mids+0x58]
|
||||||
|
vpaddq xmm11,xmm11,xmm12
|
||||||
|
vmovddup xmm7, qword [mids+0x68]
|
||||||
|
vpxor xmm7,xmm7,xmm8
|
||||||
|
vmovddup xmm14, qword [mids+0x40]
|
||||||
|
vpxor xmm14,xmm14,xmm3
|
||||||
|
vpshufd xmm14,xmm14,0xB1 ;ror 32
|
||||||
|
vmovddup xmm10, qword [mids+0x50]
|
||||||
|
vpaddq xmm10,xmm10,xmm15
|
||||||
|
vmovddup xmm6, qword [mids+0x28]
|
||||||
|
vpxor xmm6,xmm6,xmm11
|
||||||
|
vmovddup xmm9, qword [mids+0x70]
|
||||||
|
vpaddq xmm9,xmm9,xmm14
|
||||||
|
vpxor xmm5,xmm5,xmm10
|
||||||
|
vpxor xmm4,xmm4,xmm9
|
||||||
|
vmovdqa xmm2, xword [xshufb_ror24] ;xmm2 is temp
|
||||||
|
vpshufb xmm5,xmm5,xmm2
|
||||||
|
vpshufb xmm6,xmm6,xmm2
|
||||||
|
vpshufb xmm7,xmm7,xmm2
|
||||||
|
vpshufb xmm4,xmm4,xmm2
|
||||||
|
vmovddup xmm2, qword [mids+0x78]
|
||||||
|
|
||||||
|
;From here to the restore of xmm0: same instruction sequence as the second half of hR1, without message adds.
vpaddq xmm0,xmm0,xmm5
|
||||||
|
vpaddq xmm1,xmm1,xmm6
|
||||||
|
vpaddq xmm2,xmm2,xmm7
|
||||||
|
vpaddq xmm3,xmm3,xmm4
|
||||||
|
vpxor xmm15,xmm15,xmm0
|
||||||
|
vpxor xmm12,xmm12,xmm1
|
||||||
|
vpxor xmm13,xmm13,xmm2
|
||||||
|
vpxor xmm14,xmm14,xmm3
|
||||||
|
vmovdqa [rsp], xmm0 ;spill xmm0; it serves as the temp register until restored below
|
||||||
|
vmovdqa xmm0, xword [xshufb_ror16]
|
||||||
|
vpshufb xmm15,xmm15,xmm0
|
||||||
|
vpshufb xmm12,xmm12,xmm0
|
||||||
|
vpshufb xmm13,xmm13,xmm0
|
||||||
|
vpshufb xmm14,xmm14,xmm0
|
||||||
|
vpaddq xmm10,xmm10,xmm15
|
||||||
|
vpaddq xmm11,xmm11,xmm12
|
||||||
|
vpaddq xmm8,xmm8,xmm13
|
||||||
|
vpaddq xmm9,xmm9,xmm14
|
||||||
|
vpxor xmm5,xmm5,xmm10
|
||||||
|
vpxor xmm6,xmm6,xmm11
|
||||||
|
vpxor xmm7,xmm7,xmm8
|
||||||
|
vpxor xmm4,xmm4,xmm9
|
||||||
|
vpaddq xmm0,xmm5,xmm5 ;ror 63, built as (x+x) | (x>>63)
|
||||||
|
vpsrlq xmm5,xmm5,63
|
||||||
|
vpor xmm5,xmm5,xmm0
|
||||||
|
vpaddq xmm0,xmm6,xmm6
|
||||||
|
vpsrlq xmm6,xmm6,63
|
||||||
|
vpor xmm6,xmm6,xmm0
|
||||||
|
vpaddq xmm0,xmm7,xmm7
|
||||||
|
vpsrlq xmm7,xmm7,63
|
||||||
|
vpor xmm7,xmm7,xmm0
|
||||||
|
vpaddq xmm0,xmm4,xmm4
|
||||||
|
vpsrlq xmm4,xmm4,63
|
||||||
|
vpor xmm4,xmm4,xmm0
|
||||||
|
vmovdqa xmm0, [rsp] ;restore xmm0
|
||||||
|
|
||||||
|
Blake2bRounds2 2,src
|
||||||
|
|
||||||
|
;fold the upper half of the working state into the lower half
vpxor xmm0, xmm0, xmm8
|
||||||
|
vpxor xmm1, xmm1, xmm9
|
||||||
|
vpxor xmm2, xmm2, xmm10
|
||||||
|
vpxor xmm3, xmm3, xmm11
|
||||||
|
vpxor xmm4, xmm4, xmm12
|
||||||
|
vpxor xmm5, xmm5, xmm13
|
||||||
|
vpxor xmm6, xmm6, xmm14
|
||||||
|
;vpxor xmm7, xmm7, xmm15
|
||||||
|
;xor with the seven qwords at mids+0x80..+0xb0, broadcast to both lanes
vmovddup xmm8, qword [mids+0x80]
|
||||||
|
vmovddup xmm9, qword [mids+0x88]
|
||||||
|
vmovddup xmm10, qword [mids+0x90]
|
||||||
|
vmovddup xmm11, qword [mids+0x98]
|
||||||
|
vmovddup xmm12, qword [mids+0xa0]
|
||||||
|
vmovddup xmm13, qword [mids+0xa8]
|
||||||
|
vmovddup xmm14, qword [mids+0xb0]
|
||||||
|
;vmovddup xmm15, qword [mids+0xb8]
|
||||||
|
vpxor xmm0, xmm0, xmm8
|
||||||
|
vpxor xmm1, xmm1, xmm9
|
||||||
|
vpxor xmm2, xmm2, xmm10
|
||||||
|
vpxor xmm3, xmm3, xmm11
|
||||||
|
vpxor xmm4, xmm4, xmm12
|
||||||
|
vpxor xmm5, xmm5, xmm13
|
||||||
|
vpxor xmm6, xmm6, xmm14
|
||||||
|
;vpxor xmm7, xmm7, xmm15
|
||||||
|
}
|
|
@ -0,0 +1,350 @@
|
||||||
|
;hR0 (AVX2): column step of one BLAKE2b round for four hashes at once (one hash per 64-bit lane).
;Registers: a=ymm0-3, b=ymm4-7, c=ymm8-11, d=ymm12-15; G runs on (ymm0,ymm4,ymm8,ymm12),
;(ymm1,ymm5,ymm9,ymm13), (ymm2,ymm6,ymm10,ymm14) and (ymm3,ymm7,ymm11,ymm15).
;m0..m3: message word indices added in the first half of G; m4..m7: in the second half.
;lim:    words whose index is >= lim are not added (presumably because they are zero - confirm at call sites).
;src:    base address of the message words, 32 bytes per word.
;[rsp] is used as a 32-byte spill slot through vmovdqa, so rsp must be 32-byte aligned.
macro hR0 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
|
||||||
|
{
|
||||||
|
vpaddq ymm0,ymm0,ymm4 ;a += b
|
||||||
|
vpaddq ymm1,ymm1,ymm5
|
||||||
|
vpaddq ymm2,ymm2,ymm6
|
||||||
|
vpaddq ymm3,ymm3,ymm7
|
||||||
|
if m0<lim ;a += message word, emitted only when the index is below lim
|
||||||
|
vpaddq ymm0,ymm0, yword [src+m0*32]
|
||||||
|
end if
|
||||||
|
if m1<lim
|
||||||
|
vpaddq ymm1,ymm1, yword [src+m1*32]
|
||||||
|
end if
|
||||||
|
if m2<lim
|
||||||
|
vpaddq ymm2,ymm2, yword [src+m2*32]
|
||||||
|
end if
|
||||||
|
if m3<lim
|
||||||
|
vpaddq ymm3,ymm3, yword [src+m3*32]
|
||||||
|
end if
|
||||||
|
vpxor ymm12,ymm12,ymm0 ;d ^= a
|
||||||
|
vpxor ymm13,ymm13,ymm1
|
||||||
|
vpxor ymm14,ymm14,ymm2
|
||||||
|
vpxor ymm15,ymm15,ymm3
|
||||||
|
vpshufd ymm12,ymm12,0xB1 ;d = ror(d,32): swaps the two dwords of each qword
|
||||||
|
vpshufd ymm13,ymm13,0xB1
|
||||||
|
vpshufd ymm14,ymm14,0xB1
|
||||||
|
vpshufd ymm15,ymm15,0xB1
|
||||||
|
vpaddq ymm8,ymm8,ymm12 ;c += d
|
||||||
|
vpaddq ymm9,ymm9,ymm13
|
||||||
|
vpaddq ymm10,ymm10,ymm14
|
||||||
|
vpaddq ymm11,ymm11,ymm15
|
||||||
|
vpxor ymm4,ymm4,ymm8 ;b ^= c
|
||||||
|
vpxor ymm5,ymm5,ymm9
|
||||||
|
vpxor ymm6,ymm6,ymm10
|
||||||
|
vpxor ymm7,ymm7,ymm11
|
||||||
|
vmovdqa [rsp], ymm8 ;spill ymm8 so it can hold the shuffle mask
|
||||||
|
vbroadcasti128 ymm8, xword [xshufb_ror24] ;16-byte mask copied into both 128-bit halves
|
||||||
|
vpshufb ymm4,ymm4,ymm8 ;b = ror(b,24)
|
||||||
|
vpshufb ymm5,ymm5,ymm8
|
||||||
|
vpshufb ymm6,ymm6,ymm8
|
||||||
|
vpshufb ymm7,ymm7,ymm8
|
||||||
|
vmovdqa ymm8, [rsp] ;restore ymm8
|
||||||
|
|
||||||
|
vpaddq ymm0,ymm0,ymm4 ;a += b
|
||||||
|
vpaddq ymm1,ymm1,ymm5
|
||||||
|
vpaddq ymm2,ymm2,ymm6
|
||||||
|
vpaddq ymm3,ymm3,ymm7
|
||||||
|
if m4<lim ;a += message word, emitted only when the index is below lim
|
||||||
|
vpaddq ymm0,ymm0, yword [src+m4*32]
|
||||||
|
end if
|
||||||
|
if m5<lim
|
||||||
|
vpaddq ymm1,ymm1, yword [src+m5*32]
|
||||||
|
end if
|
||||||
|
if m6<lim
|
||||||
|
vpaddq ymm2,ymm2, yword [src+m6*32]
|
||||||
|
end if
|
||||||
|
if m7<lim
|
||||||
|
vpaddq ymm3,ymm3, yword [src+m7*32]
|
||||||
|
end if
|
||||||
|
vpxor ymm12,ymm12,ymm0 ;d ^= a
|
||||||
|
vpxor ymm13,ymm13,ymm1
|
||||||
|
vpxor ymm14,ymm14,ymm2
|
||||||
|
vpxor ymm15,ymm15,ymm3
|
||||||
|
vmovdqa [rsp], ymm0 ;spill ymm0; it serves as the temp register until the end of the macro
|
||||||
|
vbroadcasti128 ymm0, xword [xshufb_ror16]
|
||||||
|
vpshufb ymm12,ymm12,ymm0 ;d = ror(d,16)
|
||||||
|
vpshufb ymm13,ymm13,ymm0
|
||||||
|
vpshufb ymm14,ymm14,ymm0
|
||||||
|
vpshufb ymm15,ymm15,ymm0
|
||||||
|
vpaddq ymm8,ymm8,ymm12 ;c += d
|
||||||
|
vpaddq ymm9,ymm9,ymm13
|
||||||
|
vpaddq ymm10,ymm10,ymm14
|
||||||
|
vpaddq ymm11,ymm11,ymm15
|
||||||
|
vpxor ymm4,ymm4,ymm8 ;b ^= c
|
||||||
|
vpxor ymm5,ymm5,ymm9
|
||||||
|
vpxor ymm6,ymm6,ymm10
|
||||||
|
vpxor ymm7,ymm7,ymm11
|
||||||
|
|
||||||
|
vpaddq ymm0,ymm4,ymm4 ;b = ror(b,63), built as (b+b) | (b>>63)
|
||||||
|
vpsrlq ymm4,ymm4,63
|
||||||
|
vpor ymm4,ymm4,ymm0
|
||||||
|
vpaddq ymm0,ymm5,ymm5
|
||||||
|
vpsrlq ymm5,ymm5,63
|
||||||
|
vpor ymm5,ymm5,ymm0
|
||||||
|
vpaddq ymm0,ymm6,ymm6
|
||||||
|
vpsrlq ymm6,ymm6,63
|
||||||
|
vpor ymm6,ymm6,ymm0
|
||||||
|
vpaddq ymm0,ymm7,ymm7
|
||||||
|
vpsrlq ymm7,ymm7,63
|
||||||
|
vpor ymm7,ymm7,ymm0
|
||||||
|
|
||||||
|
vmovdqa ymm0, [rsp] ;restore ymm0
|
||||||
|
}
|
||||||
|
|
||||||
|
;hR1 (AVX2): diagonal step of one BLAKE2b round for four hashes at once (one hash per 64-bit lane).
;Same operations as the column step, but G runs on (ymm0,ymm5,ymm10,ymm15), (ymm1,ymm6,ymm11,ymm12),
;(ymm2,ymm7,ymm8,ymm13) and (ymm3,ymm4,ymm9,ymm14); no register is physically rotated.
;m0..m3: message word indices added in the first half of G; m4..m7: in the second half.
;lim:    words whose index is >= lim are not added (presumably because they are zero - confirm at call sites).
;src:    base address of the message words, 32 bytes per word.
;[rsp] is used as a 32-byte spill slot through vmovdqa, so rsp must be 32-byte aligned.
macro hR1 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
|
||||||
|
{
|
||||||
|
vpaddq ymm0,ymm0,ymm5 ;a += b
|
||||||
|
vpaddq ymm1,ymm1,ymm6
|
||||||
|
vpaddq ymm2,ymm2,ymm7
|
||||||
|
vpaddq ymm3,ymm3,ymm4
|
||||||
|
if m0<lim ;a += message word, emitted only when the index is below lim
|
||||||
|
vpaddq ymm0,ymm0, yword [src+m0*32]
|
||||||
|
end if
|
||||||
|
if m1<lim
|
||||||
|
vpaddq ymm1,ymm1, yword [src+m1*32]
|
||||||
|
end if
|
||||||
|
if m2<lim
|
||||||
|
vpaddq ymm2,ymm2, yword [src+m2*32]
|
||||||
|
end if
|
||||||
|
if m3<lim
|
||||||
|
vpaddq ymm3,ymm3, yword [src+m3*32]
|
||||||
|
end if
|
||||||
|
vpxor ymm15,ymm15,ymm0 ;d ^= a
|
||||||
|
vpxor ymm12,ymm12,ymm1
|
||||||
|
vpxor ymm13,ymm13,ymm2
|
||||||
|
vpxor ymm14,ymm14,ymm3
|
||||||
|
vpshufd ymm15,ymm15,0xB1 ;d = ror(d,32): swaps the two dwords of each qword
|
||||||
|
vpshufd ymm12,ymm12,0xB1
|
||||||
|
vpshufd ymm13,ymm13,0xB1
|
||||||
|
vpshufd ymm14,ymm14,0xB1
|
||||||
|
vpaddq ymm10,ymm10,ymm15 ;c += d
|
||||||
|
vpaddq ymm11,ymm11,ymm12
|
||||||
|
vpaddq ymm8,ymm8,ymm13
|
||||||
|
vpaddq ymm9,ymm9,ymm14
|
||||||
|
vpxor ymm5,ymm5,ymm10 ;b ^= c
|
||||||
|
vpxor ymm6,ymm6,ymm11
|
||||||
|
vpxor ymm7,ymm7,ymm8
|
||||||
|
vpxor ymm4,ymm4,ymm9
|
||||||
|
vmovdqa [rsp], ymm10 ;spill ymm10 so it can hold the shuffle mask
|
||||||
|
vbroadcasti128 ymm10, xword [xshufb_ror24] ;16-byte mask copied into both 128-bit halves
|
||||||
|
vpshufb ymm5,ymm5,ymm10 ;b = ror(b,24)
|
||||||
|
vpshufb ymm6,ymm6,ymm10
|
||||||
|
vpshufb ymm7,ymm7,ymm10
|
||||||
|
vpshufb ymm4,ymm4,ymm10
|
||||||
|
vmovdqa ymm10, [rsp] ;restore ymm10
|
||||||
|
|
||||||
|
vpaddq ymm0,ymm0,ymm5 ;a += b
|
||||||
|
vpaddq ymm1,ymm1,ymm6
|
||||||
|
vpaddq ymm2,ymm2,ymm7
|
||||||
|
vpaddq ymm3,ymm3,ymm4
|
||||||
|
if m4<lim ;a += message word, emitted only when the index is below lim
|
||||||
|
vpaddq ymm0,ymm0, yword [src+m4*32]
|
||||||
|
end if
|
||||||
|
if m5<lim
|
||||||
|
vpaddq ymm1,ymm1, yword [src+m5*32]
|
||||||
|
end if
|
||||||
|
if m6<lim
|
||||||
|
vpaddq ymm2,ymm2, yword [src+m6*32]
|
||||||
|
end if
|
||||||
|
if m7<lim
|
||||||
|
vpaddq ymm3,ymm3, yword [src+m7*32]
|
||||||
|
end if
|
||||||
|
vpxor ymm15,ymm15,ymm0 ;d ^= a
|
||||||
|
vpxor ymm12,ymm12,ymm1
|
||||||
|
vpxor ymm13,ymm13,ymm2
|
||||||
|
vpxor ymm14,ymm14,ymm3
|
||||||
|
vmovdqa [rsp], ymm0 ;spill ymm0; it serves as the temp register until the end of the macro
|
||||||
|
vbroadcasti128 ymm0, xword [xshufb_ror16]
|
||||||
|
vpshufb ymm15,ymm15,ymm0 ;d = ror(d,16)
|
||||||
|
vpshufb ymm12,ymm12,ymm0
|
||||||
|
vpshufb ymm13,ymm13,ymm0
|
||||||
|
vpshufb ymm14,ymm14,ymm0
|
||||||
|
vpaddq ymm10,ymm10,ymm15 ;c += d
|
||||||
|
vpaddq ymm11,ymm11,ymm12
|
||||||
|
vpaddq ymm8,ymm8,ymm13
|
||||||
|
vpaddq ymm9,ymm9,ymm14
|
||||||
|
vpxor ymm5,ymm5,ymm10 ;b ^= c
|
||||||
|
vpxor ymm6,ymm6,ymm11
|
||||||
|
vpxor ymm7,ymm7,ymm8
|
||||||
|
vpxor ymm4,ymm4,ymm9
|
||||||
|
|
||||||
|
vpaddq ymm0,ymm5,ymm5 ;b = ror(b,63), built as (b+b) | (b>>63)
|
||||||
|
vpsrlq ymm5,ymm5,63
|
||||||
|
vpor ymm5,ymm5,ymm0
|
||||||
|
vpaddq ymm0,ymm6,ymm6
|
||||||
|
vpsrlq ymm6,ymm6,63
|
||||||
|
vpor ymm6,ymm6,ymm0
|
||||||
|
vpaddq ymm0,ymm7,ymm7
|
||||||
|
vpsrlq ymm7,ymm7,63
|
||||||
|
vpor ymm7,ymm7,ymm0
|
||||||
|
vpaddq ymm0,ymm4,ymm4
|
||||||
|
vpsrlq ymm4,ymm4,63
|
||||||
|
vpor ymm4,ymm4,ymm0
|
||||||
|
|
||||||
|
vmovdqa ymm0, [rsp] ;restore ymm0
|
||||||
|
}
|
||||||
|
|
||||||
|
;Blake2bRounds2 (AVX2): expands rounds 1..11 of BLAKE2b as one hR0 (columns) + one hR1 (diagonals) per round.
;Round 0 is left commented out: Blake2beq2of2 runs its own round-0 code before invoking this macro.
;Rounds 10 and 11 repeat the message schedules of rounds 0 and 1.
;lim, src: passed unchanged to hR0/hR1 (message index limit and message base address).
macro Blake2bRounds2 lim,src
|
||||||
|
{
|
||||||
|
;ROUND 0
|
||||||
|
;hR0 0,2,4,6,1,3,5,7,lim,src
|
||||||
|
;hR1 8,10,12,14,9,11,13,15,lim,src
|
||||||
|
|
||||||
|
;ROUND 1
|
||||||
|
hR0 14,4,9,13,10,8,15,6,lim,src
|
||||||
|
hR1 1,0,11,5,12,2,7,3,lim,src
|
||||||
|
|
||||||
|
;ROUND 2
|
||||||
|
hR0 11,12,5,15,8,0,2,13,lim,src
|
||||||
|
hR1 10,3,7,9,14,6,1,4,lim,src
|
||||||
|
|
||||||
|
;ROUND 3
|
||||||
|
hR0 7,3,13,11,9,1,12,14,lim,src
|
||||||
|
hR1 2,5,4,15,6,10,0,8,lim,src
|
||||||
|
|
||||||
|
;ROUND 4
|
||||||
|
hR0 9,5,2,10,0,7,4,15,lim,src
|
||||||
|
hR1 14,11,6,3,1,12,8,13,lim,src
|
||||||
|
|
||||||
|
;ROUND 5
|
||||||
|
hR0 2,6,0,8,12,10,11,3,lim,src
|
||||||
|
hR1 4,7,15,1,13,5,14,9,lim,src
|
||||||
|
|
||||||
|
;ROUND 6
|
||||||
|
hR0 12,1,14,4,5,15,13,10,lim,src
|
||||||
|
hR1 0,6,9,8,7,3,2,11,lim,src
|
||||||
|
|
||||||
|
;ROUND 7
|
||||||
|
hR0 13,7,12,3,11,14,1,9,lim,src
|
||||||
|
hR1 5,15,8,2,0,4,6,10,lim,src
|
||||||
|
|
||||||
|
;ROUND 8
|
||||||
|
hR0 6,14,11,0,15,9,3,8,lim,src
|
||||||
|
hR1 12,13,1,10,2,7,4,5,lim,src
|
||||||
|
|
||||||
|
;ROUND 9
|
||||||
|
hR0 10,8,7,1,2,4,6,5,lim,src
|
||||||
|
hR1 15,9,3,13,11,14,12,0,lim,src
|
||||||
|
|
||||||
|
;ROUND 10
|
||||||
|
hR0 0,2,4,6,1,3,5,7,lim,src
|
||||||
|
hR1 8,10,12,14,9,11,13,15,lim,src
|
||||||
|
|
||||||
|
;ROUND 11
|
||||||
|
hR0 14,4,9,13,10,8,15,6,lim,src
|
||||||
|
hR1 1,0,11,5,12,2,7,3,lim,src
|
||||||
|
}
|
||||||
|
|
||||||
|
;Blake2beq2of2 (AVX2): compresses one message block for four hashes at once, starting from a midstate.
;mids: address of the midstate. Qwords at +0x00..+0x78 are broadcast to all four lanes and combined
;      in the prologue; qwords at +0x80..+0xb0 are xored into the result at the end.
;src:  base address of the message words, 32 bytes per word (one qword per hash).
;Flow: 1) prologue - the code that stands in for round 0; the only per-hash input it adds is
;         message word 1 at [src+1*32] (apparently the rest of round 0 is precomputed in mids);
;      2) Blake2bRounds2 2,src - rounds 1..11, adding only message words 0 and 1;
;      3) ymm0..ymm6 ^= ymm8..ymm14, then ^= mids+0x80..+0xb0.
;Result: ymm0..ymm6 hold output words 0..6, one hash per 64-bit lane.
;ymm7 is NOT finalized (its xors are commented out). Clobbers ymm0-ymm15.
;[rsp] is used as a 32-byte spill slot through vmovdqa, so rsp must be 32-byte aligned.
macro Blake2beq2of2 mids, src
|
||||||
|
{
|
||||||
|
vpbroadcastq ymm0, qword [mids]
|
||||||
|
vpaddq ymm0,ymm0, yword [src+1*32] ;per-hash message word 1
|
||||||
|
vpbroadcastq ymm12, qword [mids+0x08]
|
||||||
|
vpxor ymm12,ymm12,ymm0
|
||||||
|
vbroadcasti128 ymm2, xword [xshufb_ror16] ;ymm2 is temp
|
||||||
|
vpshufb ymm12,ymm12,ymm2 ;ror 16
|
||||||
|
vpbroadcastq ymm8, qword [mids+0x10]
|
||||||
|
vpaddq ymm8,ymm8,ymm12
|
||||||
|
vpbroadcastq ymm4, qword [mids+0x18]
|
||||||
|
vpxor ymm4,ymm4,ymm8
|
||||||
|
vpaddq ymm2,ymm4,ymm4 ;ymm2 is temp
|
||||||
|
vpsrlq ymm4,ymm4,63
|
||||||
|
vpor ymm4,ymm4,ymm2 ;ymm4 = ror(ymm4,63)
|
||||||
|
|
||||||
|
vpbroadcastq ymm5, qword [mids+0x20]
|
||||||
|
vpaddq ymm0,ymm0,ymm5
|
||||||
|
vpbroadcastq ymm1, qword [mids+0x30]
|
||||||
|
vpxor ymm12,ymm12,ymm1
|
||||||
|
vpshufd ymm12,ymm12,0xB1 ;ror 32
|
||||||
|
vpbroadcastq ymm13, qword [mids+0x38]
|
||||||
|
vpaddq ymm8,ymm8,ymm13
|
||||||
|
vpbroadcastq ymm3, qword [mids+0x60]
|
||||||
|
vpaddq ymm3,ymm3,ymm4
|
||||||
|
vpbroadcastq ymm15, qword [mids+0x48]
|
||||||
|
vpxor ymm15,ymm15,ymm0
|
||||||
|
vpshufd ymm15,ymm15,0xB1 ;ror 32
|
||||||
|
vpbroadcastq ymm11, qword [mids+0x58]
|
||||||
|
vpaddq ymm11,ymm11,ymm12
|
||||||
|
vpbroadcastq ymm7, qword [mids+0x68]
|
||||||
|
vpxor ymm7,ymm7,ymm8
|
||||||
|
vpbroadcastq ymm14, qword [mids+0x40]
|
||||||
|
vpxor ymm14,ymm14,ymm3
|
||||||
|
vpshufd ymm14,ymm14,0xB1 ;ror 32
|
||||||
|
vpbroadcastq ymm10, qword [mids+0x50]
|
||||||
|
vpaddq ymm10,ymm10,ymm15
|
||||||
|
vpbroadcastq ymm6, qword [mids+0x28]
|
||||||
|
vpxor ymm6,ymm6,ymm11
|
||||||
|
vpbroadcastq ymm9, qword [mids+0x70]
|
||||||
|
vpaddq ymm9,ymm9,ymm14
|
||||||
|
vpxor ymm5,ymm5,ymm10
|
||||||
|
vpxor ymm4,ymm4,ymm9
|
||||||
|
vbroadcasti128 ymm2, xword [xshufb_ror24] ;ymm2 is temp
|
||||||
|
vpshufb ymm5,ymm5,ymm2
|
||||||
|
vpshufb ymm6,ymm6,ymm2
|
||||||
|
vpshufb ymm7,ymm7,ymm2
|
||||||
|
vpshufb ymm4,ymm4,ymm2
|
||||||
|
vpbroadcastq ymm2, qword [mids+0x78]
|
||||||
|
|
||||||
|
;From here to the restore of ymm0: same instruction sequence as the second half of hR1, without message adds.
vpaddq ymm0,ymm0,ymm5
|
||||||
|
vpaddq ymm1,ymm1,ymm6
|
||||||
|
vpaddq ymm2,ymm2,ymm7
|
||||||
|
vpaddq ymm3,ymm3,ymm4
|
||||||
|
vpxor ymm15,ymm15,ymm0
|
||||||
|
vpxor ymm12,ymm12,ymm1
|
||||||
|
vpxor ymm13,ymm13,ymm2
|
||||||
|
vpxor ymm14,ymm14,ymm3
|
||||||
|
vmovdqa [rsp], ymm0 ;spill ymm0; it serves as the temp register until restored below
|
||||||
|
vbroadcasti128 ymm0, xword [xshufb_ror16]
|
||||||
|
vpshufb ymm15,ymm15,ymm0
|
||||||
|
vpshufb ymm12,ymm12,ymm0
|
||||||
|
vpshufb ymm13,ymm13,ymm0
|
||||||
|
vpshufb ymm14,ymm14,ymm0
|
||||||
|
vpaddq ymm10,ymm10,ymm15
|
||||||
|
vpaddq ymm11,ymm11,ymm12
|
||||||
|
vpaddq ymm8,ymm8,ymm13
|
||||||
|
vpaddq ymm9,ymm9,ymm14
|
||||||
|
vpxor ymm5,ymm5,ymm10
|
||||||
|
vpxor ymm6,ymm6,ymm11
|
||||||
|
vpxor ymm7,ymm7,ymm8
|
||||||
|
vpxor ymm4,ymm4,ymm9
|
||||||
|
vpaddq ymm0,ymm5,ymm5 ;ror 63, built as (x+x) | (x>>63)
|
||||||
|
vpsrlq ymm5,ymm5,63
|
||||||
|
vpor ymm5,ymm5,ymm0
|
||||||
|
vpaddq ymm0,ymm6,ymm6
|
||||||
|
vpsrlq ymm6,ymm6,63
|
||||||
|
vpor ymm6,ymm6,ymm0
|
||||||
|
vpaddq ymm0,ymm7,ymm7
|
||||||
|
vpsrlq ymm7,ymm7,63
|
||||||
|
vpor ymm7,ymm7,ymm0
|
||||||
|
vpaddq ymm0,ymm4,ymm4
|
||||||
|
vpsrlq ymm4,ymm4,63
|
||||||
|
vpor ymm4,ymm4,ymm0
|
||||||
|
vmovdqa ymm0, [rsp] ;restore ymm0
|
||||||
|
|
||||||
|
Blake2bRounds2 2,src
|
||||||
|
|
||||||
|
;fold the upper half of the working state into the lower half
vpxor ymm0, ymm0, ymm8
|
||||||
|
vpxor ymm1, ymm1, ymm9
|
||||||
|
vpxor ymm2, ymm2, ymm10
|
||||||
|
vpxor ymm3, ymm3, ymm11
|
||||||
|
vpxor ymm4, ymm4, ymm12
|
||||||
|
vpxor ymm5, ymm5, ymm13
|
||||||
|
vpxor ymm6, ymm6, ymm14
|
||||||
|
;vpxor ymm7, ymm7, ymm15
|
||||||
|
;xor with the seven qwords at mids+0x80..+0xb0, broadcast to all four lanes
vpbroadcastq ymm8, qword [mids+0x80]
|
||||||
|
vpbroadcastq ymm9, qword [mids+0x88]
|
||||||
|
vpbroadcastq ymm10, qword [mids+0x90]
|
||||||
|
vpbroadcastq ymm11, qword [mids+0x98]
|
||||||
|
vpbroadcastq ymm12, qword [mids+0xa0]
|
||||||
|
vpbroadcastq ymm13, qword [mids+0xa8]
|
||||||
|
vpbroadcastq ymm14, qword [mids+0xb0]
|
||||||
|
;vpbroadcastq ymm15, qword [mids+0xb8]
|
||||||
|
vpxor ymm0, ymm0, ymm8
|
||||||
|
vpxor ymm1, ymm1, ymm9
|
||||||
|
vpxor ymm2, ymm2, ymm10
|
||||||
|
vpxor ymm3, ymm3, ymm11
|
||||||
|
vpxor ymm4, ymm4, ymm12
|
||||||
|
vpxor ymm5, ymm5, ymm13
|
||||||
|
vpxor ymm6, ymm6, ymm14
|
||||||
|
;vpxor ymm7, ymm7, ymm15
|
||||||
|
}
|
|
@ -0,0 +1,39 @@
|
||||||
|
;void Blake2Run2(unsigned char *hashout, void *midstate, uint32_t indexctr);
|
||||||
|
;hashout: hash output buffer: 2*64 bytes
|
||||||
|
;midstate: 256 bytes from Blake2PrepareMidstate2
|
||||||
|
;indexctr: For n=200, k=9: {0, 2, 4, ..., 1048574}
;
;Arguments are read from rdi (hashout), rsi (midstate) and edx (indexctr).
;Output layout: bytes 0..63 belong to indexctr, bytes 64..127 to indexctr+1.
;Bytes 56..63 of each result come from xmm7, which Blake2beq2of2 leaves unfinalized.
;hashout is written with vmovdqa and therefore must be 16-byte aligned.
;Side effect: indexctr and indexctr+1 are stored into the midstate buffer at +0xd4 and +0xdc.
|
||||||
|
|
||||||
|
include "macro_blake2b_avx1.asm"
|
||||||
|
|
||||||
|
Blake2Run2:
|
||||||
|
mov rax, rsp ;keep the caller's rsp
|
||||||
|
sub rsp, 0x28
|
||||||
|
and rsp, -32 ;align the frame; [rsp] is the spill slot used by the macros
|
||||||
|
mov [rsp+0x20], rax ;caller's rsp is saved just above the spill slot
|
||||||
|
|
||||||
|
;message word 1 lives at rsi+0xd0 (src+1*16): the index goes into the upper dword of each lane
mov [rsi+0xd4], edx
|
||||||
|
add edx, 1
|
||||||
|
mov [rsi+0xdc], edx
|
||||||
|
|
||||||
|
Blake2beq2of2 rsi, rsi+0xc0
|
||||||
|
|
||||||
|
;transpose: low qwords (first hash) go to xmm8/10/12/14, high qwords (second hash) to xmm1/3/5/7
vpunpcklqdq xmm8, xmm0, xmm1
|
||||||
|
vpunpckhqdq xmm1, xmm0, xmm1
|
||||||
|
vpunpcklqdq xmm10, xmm2, xmm3
|
||||||
|
vpunpckhqdq xmm3, xmm2, xmm3
|
||||||
|
vpunpcklqdq xmm12, xmm4, xmm5
|
||||||
|
vpunpckhqdq xmm5, xmm4, xmm5
|
||||||
|
vpunpcklqdq xmm14, xmm6, xmm7
|
||||||
|
vpunpckhqdq xmm7, xmm6, xmm7
|
||||||
|
|
||||||
|
vmovdqa [rdi], xmm8 ;first hash: bytes 0x00..0x3f
|
||||||
|
vmovdqa [rdi+0x10], xmm10
|
||||||
|
vmovdqa [rdi+0x20], xmm12
|
||||||
|
vmovdqa [rdi+0x30], xmm14
|
||||||
|
vmovdqa [rdi+0x40], xmm1 ;second hash: bytes 0x40..0x7f
|
||||||
|
vmovdqa [rdi+0x50], xmm3
|
||||||
|
vmovdqa [rdi+0x60], xmm5
|
||||||
|
vmovdqa [rdi+0x70], xmm7
|
||||||
|
|
||||||
|
mov rsp, [rsp+0x20] ;restore the caller's rsp
|
||||||
|
ret
|
|
@ -0,0 +1,49 @@
|
||||||
|
;void Blake2Run4(unsigned char *hashout, void *midstate, uint32_t indexctr);
|
||||||
|
;hashout: hash output buffer: 4*64 bytes
|
||||||
|
;midstate: 256 bytes from Blake2PrepareMidstate4
|
||||||
|
;indexctr: For n=200, k=9: {0, 4, 8, ..., 1048572}
;
;Arguments are read from rdi (hashout), rsi (midstate) and edx (indexctr).
;Output layout: four consecutive 64-byte results for indexctr, indexctr+1, indexctr+2, indexctr+3.
;Bytes 56..63 of each result come from ymm7, which Blake2beq2of2 leaves unfinalized.
;hashout is written with vmovdqa and midstate+0xe0 is rewritten with vmovdqa: both need 32-byte alignment.
;Side effect: the four index values are stored into the midstate buffer (upper dwords at +0xe0..+0xff).
|
||||||
|
|
||||||
|
include "macro_blake2b_avx2.asm"
|
||||||
|
|
||||||
|
Blake2Run4:
|
||||||
|
mov rax, rsp ;keep the caller's rsp
|
||||||
|
sub rsp, 0x28
|
||||||
|
and rsp, -32 ;align the frame; [rsp] is the 32-byte spill slot used by the macros
|
||||||
|
mov [rsp+0x20], rax ;caller's rsp is saved just above the spill slot
|
||||||
|
|
||||||
|
vmovd xmm0, edx ;indexctr
|
||||||
|
vpbroadcastd ymm0, xmm0
|
||||||
|
vpaddd ymm0, ymm0, yword [yctrinit] ;odd dwords become indexctr+0..3
|
||||||
|
vpblendd ymm0, ymm0, yword [rsi+0xe0], 0x55 ;even dwords are taken from the existing message word 1
|
||||||
|
vmovdqa yword [rsi+0xe0], ymm0 ;message word 1 = src+1*32
|
||||||
|
|
||||||
|
Blake2beq2of2 rsi, rsi+0xc0
|
||||||
|
|
||||||
|
;transpose step 1: pair up qwords of neighbouring output words within each 128-bit half
vpunpcklqdq ymm8, ymm0, ymm1
|
||||||
|
vpunpckhqdq ymm9, ymm0, ymm1
|
||||||
|
vpunpcklqdq ymm10, ymm2, ymm3
|
||||||
|
vpunpckhqdq ymm11, ymm2, ymm3
|
||||||
|
vpunpcklqdq ymm12, ymm4, ymm5
|
||||||
|
vpunpckhqdq ymm13, ymm4, ymm5
|
||||||
|
vpunpcklqdq ymm14, ymm6, ymm7
|
||||||
|
vpunpckhqdq ymm15, ymm6, ymm7
|
||||||
|
;transpose step 2: 0x20 joins the low 128-bit halves (hashes 0 and 1), 0x31 the high halves (hashes 2 and 3)
vperm2i128 ymm0, ymm8, ymm10, 0x20
|
||||||
|
vperm2i128 ymm1, ymm12, ymm14, 0x20
|
||||||
|
vperm2i128 ymm2, ymm9, ymm11, 0x20
|
||||||
|
vperm2i128 ymm3, ymm13, ymm15, 0x20
|
||||||
|
vperm2i128 ymm4, ymm8, ymm10, 0x31
|
||||||
|
vperm2i128 ymm5, ymm12, ymm14, 0x31
|
||||||
|
vperm2i128 ymm6, ymm9, ymm11, 0x31
|
||||||
|
vperm2i128 ymm7, ymm13, ymm15, 0x31
|
||||||
|
|
||||||
|
vmovdqa [rdi], ymm0 ;hash 0: bytes 0x00..0x3f
|
||||||
|
vmovdqa [rdi+0x20], ymm1
|
||||||
|
vmovdqa [rdi+0x40], ymm2 ;hash 1: bytes 0x40..0x7f
|
||||||
|
vmovdqa [rdi+0x60], ymm3
|
||||||
|
vmovdqa [rdi+0x80], ymm4 ;hash 2: bytes 0x80..0xbf
|
||||||
|
vmovdqa [rdi+0xa0], ymm5
|
||||||
|
vmovdqa [rdi+0xc0], ymm6 ;hash 3: bytes 0xc0..0xff
|
||||||
|
vmovdqa [rdi+0xe0], ymm7
|
||||||
|
|
||||||
|
mov rsp, [rsp+0x20] ;restore the caller's rsp
|
||||||
|
ret
|
|
@ -0,0 +1,212 @@
|
||||||
|
;void Blake2PrepareMidstate2(void *midstate, unsigned char *input);
|
||||||
|
;midstate: 256 bytes of buffer for output midstate, aligned by 32
|
||||||
|
;input: 140 bytes header, preferably aligned by 8
;
;Overview (128-bit AVX, two 64-bit state words per xmm register):
; 1. Compress the first 128 input bytes: 12 calls to _ProcBlakeRound, with the
;    per-round message order built on the stack by _ProcBlakeMsgSched.
; 2. Store the resulting 64-byte state at midstate+0x80..0xbf, and the rest of
;    the input (8 bytes at input+0x80, 4 bytes at input+0x88), each duplicated
;    into both qwords, at midstate+0xc0 and midstate+0xd0.
; 3. Start the second message block: the v-words that can be computed without
;    the per-call index are stored at midstate+0x00..0x7f (see the ;vN notes).
;Registers: rdi=midstate, rsi=input. Uses rax, r8-r11, xmm0-xmm11 and 0x188
;bytes of stack.
|
||||||
|
|
||||||
|
Blake2PrepareMidstate2:
|
||||||
|
;stack scratch: three 0x80-byte message schedules (rsp, rsp+0x80, rsp+0x100)
sub rsp, 0x188
|
||||||
|
|
||||||
|
;xmm10/xmm11 = byte-shuffle masks used by _ProcBlakeRound and below
vmovdqa xmm10, xword [xshufb_ror24]
|
||||||
|
vmovdqa xmm11, xword [xshufb_ror16]
|
||||||
|
|
||||||
|
;xmm0-3 = s0..s7 (initial state), xmm4-7 = iv with 0x80 xored into iv4
vmovdqa xmm0, xword [s0]
|
||||||
|
vmovdqa xmm1, xword [s2]
|
||||||
|
vmovdqa xmm2, xword [s4]
|
||||||
|
vmovdqa xmm3, xword [s6]
|
||||||
|
vmovdqa xmm4, xword [iv]
|
||||||
|
vmovdqa xmm5, xword [iv+0x10]
|
||||||
|
vmovdqa xmm6, xword [iv4xor128]
|
||||||
|
vmovdqa xmm7, xword [iv4xor128+0x10]
|
||||||
|
|
||||||
|
;rounds 0 and 1: their schedules stay at rsp and rsp+0x80 for reuse below
mov r8, rsp
|
||||||
|
lea r9, [blake2sigma]
|
||||||
|
lea r11, [blake2sigma+160]
|
||||||
|
call _ProcBlakeMsgSched
|
||||||
|
call _ProcBlakeRound
|
||||||
|
add r8, 0x80
|
||||||
|
add r9, 16
|
||||||
|
call _ProcBlakeMsgSched
|
||||||
|
call _ProcBlakeRound
|
||||||
|
add r8, 0x80
|
||||||
|
add r9, 16
|
||||||
|
;rounds 2..9: schedule rebuilt at rsp+0x100 each pass until r9 reaches
;blake2sigma+160 (r8 is not advanced inside the loop)
_LoopEhPrepare1:
|
||||||
|
call _ProcBlakeMsgSched
|
||||||
|
call _ProcBlakeRound
|
||||||
|
add r9, 16
|
||||||
|
cmp r9, r11
|
||||||
|
jb _LoopEhPrepare1
|
||||||
|
;rounds 10 and 11: reuse the schedules already built at rsp and rsp+0x80
mov r8, rsp
|
||||||
|
call _ProcBlakeRound
|
||||||
|
add r8, 0x80
|
||||||
|
call _ProcBlakeRound
|
||||||
|
|
||||||
|
;fold: new state = xmm0-3 xor xmm4-7 xor initial state (s0..s7)
vpxor xmm0, xmm0, xmm4
|
||||||
|
vpxor xmm1, xmm1, xmm5
|
||||||
|
vpxor xmm2, xmm2, xmm6
|
||||||
|
vpxor xmm3, xmm3, xmm7
|
||||||
|
vpxor xmm0, xmm0, xword [s0]
|
||||||
|
vpxor xmm1, xmm1, xword [s2]
|
||||||
|
vpxor xmm2, xmm2, xword [s4]
|
||||||
|
vpxor xmm3, xmm3, xword [s6]
|
||||||
|
vmovdqa xword [rdi+0x80], xmm0
|
||||||
|
vmovdqa xword [rdi+0x90], xmm1
|
||||||
|
vmovdqa xword [rdi+0xa0], xmm2
|
||||||
|
vmovdqa xword [rdi+0xb0], xmm3
|
||||||
|
;input qword at +0x80, duplicated into both halves (vpshufd 0x44)
vmovq xmm8, [rsi+0x80]
|
||||||
|
vpshufd xmm4, xmm8, 0x44
|
||||||
|
vmovdqa xword [rdi+0xc0], xmm4
|
||||||
|
;input dword at +0x88 (zero-extended), duplicated the same way
vmovd xmm4, [rsi+0x88]
|
||||||
|
vpshufd xmm4, xmm4, 0x44
|
||||||
|
vmovdqa xword [rdi+0xd0], xmm4
|
||||||
|
|
||||||
|
;Begin second message block
|
||||||
|
vmovdqa xmm4, xword [iv]
|
||||||
|
vmovdqa xmm5, xword [iv+0x10]
|
||||||
|
vmovdqa xmm6, xword [iv4xor144]
|
||||||
|
vmovdqa xmm7, xword [iv6inverted]
|
||||||
|
vpaddq xmm0, xmm0, xmm2
|
||||||
|
vpaddq xmm1, xmm1, xmm3
|
||||||
|
vpaddq xmm0, xmm0, xmm8 ;xmm8[63:0]=message
|
||||||
|
vpxor xmm6, xmm6, xmm0
|
||||||
|
vpxor xmm7, xmm7, xmm1
|
||||||
|
vpshufd xmm6, xmm6, 0xb1
|
||||||
|
vmovq [rdi+0x08], xmm6 ;v12
|
||||||
|
vpshufd xmm7, xmm7, 0xb1
|
||||||
|
vpaddq xmm4, xmm4, xmm6
|
||||||
|
vmovq [rdi+0x10], xmm4 ;v8
|
||||||
|
vpaddq xmm5, xmm5, xmm7
|
||||||
|
vpxor xmm2, xmm2, xmm4
|
||||||
|
vpxor xmm3, xmm3, xmm5
|
||||||
|
vpshufb xmm2, xmm2, xmm10
|
||||||
|
vmovq [rdi+0x18], xmm2 ;v4
|
||||||
|
vpshufb xmm3, xmm3, xmm10
|
||||||
|
|
||||||
|
vpaddq xmm0, xmm0, xmm2
|
||||||
|
vmovq [rdi], xmm0 ;v0
|
||||||
|
vpaddq xmm1, xmm1, xmm3
|
||||||
|
vpextrq [rdi+0x60], xmm1, 1 ;v3
|
||||||
|
;add message (nonce, index) to xmm0 here, but we don't have
|
||||||
|
vpxor xmm6, xmm6, xmm0
|
||||||
|
vpxor xmm7, xmm7, xmm1
|
||||||
|
vpshufb xmm6, xmm6, xmm11
|
||||||
|
vpshufb xmm7, xmm7, xmm11
|
||||||
|
vmovdqa xword [rdi+0x40], xmm7 ;v14,15
|
||||||
|
vpaddq xmm4, xmm4, xmm6
|
||||||
|
vpextrq [rdi+0x70], xmm4, 1 ;v9
|
||||||
|
vpaddq xmm5, xmm5, xmm7
|
||||||
|
vmovdqa xword [rdi+0x50], xmm5 ;v10,11
|
||||||
|
vpxor xmm2, xmm2, xmm4
|
||||||
|
vpxor xmm3, xmm3, xmm5
|
||||||
|
;rotate each qword left by 1 (x+x | x>>63)
vpaddq xmm8, xmm2, xmm2
|
||||||
|
vpsrlq xmm2, xmm2, 63
|
||||||
|
vpor xmm8, xmm2, xmm8 ;xmm8 takes xmm2
|
||||||
|
vpaddq xmm2, xmm3, xmm3 ;xmm2 is temp
|
||||||
|
vpsrlq xmm3, xmm3, 63
|
||||||
|
vpor xmm3, xmm3, xmm2
|
||||||
|
|
||||||
|
vpalignr xmm2, xmm3, xmm8, 8 ;xmm2 resume
|
||||||
|
vmovdqa xword [rdi+0x20], xmm2 ;v5,6
|
||||||
|
vpsrldq xmm3, xmm3, 8
|
||||||
|
vmovq [rdi+0x68], xmm3 ;v7
|
||||||
|
vpsrldq xmm7, xmm6, 8
|
||||||
|
vpaddq xmm0, xmm0, xmm2
|
||||||
|
vpextrq [rdi+0x30], xmm0, 1 ;v1
|
||||||
|
vpaddq xmm1, xmm1, xmm3
|
||||||
|
vmovq [rdi+0x78], xmm1 ;v2
|
||||||
|
vpxor xmm7, xmm7, xmm1
|
||||||
|
vpshufd xmm7, xmm7, 0xb1
|
||||||
|
vmovq [rdi+0x38], xmm7 ;v13
|
||||||
|
|
||||||
|
;release the scratch area reserved on entry
add rsp, 0x188
|
||||||
|
ret
|
||||||
|
|
||||||
|
align 16
|
||||||
|
;Builds one permuted message schedule: for i = 0..15,
;  qword dst[i] = qword src[ sigma[i] ]
;where sigma[i] is the i-th byte at r9. Clobbers rax and r10 (r10d = 16 on
;return); rsi, r8 and r9 are left unchanged.
_ProcBlakeMsgSched:
|
||||||
|
;rsi=src
|
||||||
|
;r8=dst
|
||||||
|
;r9=sigma table
|
||||||
|
xor r10d, r10d
|
||||||
|
_LoopBlakeMsgSched:
|
||||||
|
;eax = source word index for output slot r10
movzx eax, byte [r9+r10]
|
||||||
|
mov rax, [rsi+rax*8]
|
||||||
|
mov [r8+r10*8], rax
|
||||||
|
add r10d, 1
|
||||||
|
cmp r10d, 16
|
||||||
|
jb _LoopBlakeMsgSched
|
||||||
|
ret
|
||||||
|
|
||||||
|
align 16
|
||||||
|
;One round over the 16-word state held two words per register in xmm0-xmm7.
;Inputs: r8 = 0x80 bytes of message words already in round order (as written
;        by _ProcBlakeMsgSched), xmm10/xmm11 = byte-shuffle masks loaded by the
;        caller from xshufb_ror24 / xshufb_ror16.
;Each half performs add / xor / rotate steps: vpshufd 0xb1 swaps the dwords of
;each qword, vpshufb applies the mask, and x+x | x>>63 rotates left by 1.
;Between the halves vpalignr regroups the words across register pairs, and the
;final vpalignr group regroups them again before returning.
;Clobbers xmm8 and xmm9; reads memory only through r8.
_ProcBlakeRound:
|
||||||
|
;first half: message words at [r8]..[r8+0x3f]
vpaddq xmm0, xmm0, xmm2
|
||||||
|
vpaddq xmm1, xmm1, xmm3
|
||||||
|
vpaddq xmm0, xmm0, [r8]
|
||||||
|
vpaddq xmm1, xmm1, [r8+0x10]
|
||||||
|
vpxor xmm6, xmm6, xmm0
|
||||||
|
vpxor xmm7, xmm7, xmm1
|
||||||
|
vpshufd xmm6, xmm6, 0xb1
|
||||||
|
vpshufd xmm7, xmm7, 0xb1
|
||||||
|
vpaddq xmm4, xmm4, xmm6
|
||||||
|
vpaddq xmm5, xmm5, xmm7
|
||||||
|
vpxor xmm2, xmm2, xmm4
|
||||||
|
vpxor xmm3, xmm3, xmm5
|
||||||
|
vpshufb xmm2, xmm2, xmm10
|
||||||
|
vpshufb xmm3, xmm3, xmm10
|
||||||
|
vpaddq xmm0, xmm0, xmm2
|
||||||
|
vpaddq xmm1, xmm1, xmm3
|
||||||
|
vpaddq xmm0, xmm0, [r8+0x20]
|
||||||
|
vpaddq xmm1, xmm1, [r8+0x30]
|
||||||
|
vpxor xmm6, xmm6, xmm0
|
||||||
|
vpxor xmm7, xmm7, xmm1
|
||||||
|
vpshufb xmm9, xmm6, xmm11 ;xmm9 takes xmm6
|
||||||
|
vpshufb xmm7, xmm7, xmm11
|
||||||
|
vpaddq xmm4, xmm4, xmm9
|
||||||
|
vpaddq xmm5, xmm5, xmm7
|
||||||
|
vpxor xmm2, xmm2, xmm4
|
||||||
|
vpxor xmm3, xmm3, xmm5
|
||||||
|
vpaddq xmm8, xmm2, xmm2
|
||||||
|
vpsrlq xmm2, xmm2, 63
|
||||||
|
vpor xmm8, xmm2, xmm8 ;xmm8 takes xmm2
|
||||||
|
vpaddq xmm2, xmm3, xmm3 ;xmm2 is temp
|
||||||
|
vpsrlq xmm3, xmm3, 63
|
||||||
|
vpor xmm3, xmm3, xmm2
|
||||||
|
|
||||||
|
;regroup words across the register pairs for the second half
vpalignr xmm2, xmm3, xmm8, 8 ;xmm2 resume
|
||||||
|
vpalignr xmm3, xmm8, xmm3, 8
|
||||||
|
vpalignr xmm6, xmm9, xmm7, 8 ;xmm6 resume
|
||||||
|
vpalignr xmm7, xmm7, xmm9, 8
|
||||||
|
;second half: message words at [r8+0x40]..[r8+0x7f]; note xmm4/xmm5 are
;used in swapped positions here compared with the first half
vpaddq xmm0, xmm0, xmm2
|
||||||
|
vpaddq xmm1, xmm1, xmm3
|
||||||
|
vpaddq xmm0, xmm0, [r8+0x40]
|
||||||
|
vpaddq xmm1, xmm1, [r8+0x50]
|
||||||
|
vpxor xmm6, xmm6, xmm0
|
||||||
|
vpxor xmm7, xmm7, xmm1
|
||||||
|
vpshufd xmm6, xmm6, 0xb1
|
||||||
|
vpshufd xmm7, xmm7, 0xb1
|
||||||
|
vpaddq xmm5, xmm5, xmm6
|
||||||
|
vpaddq xmm4, xmm4, xmm7
|
||||||
|
vpxor xmm2, xmm2, xmm5
|
||||||
|
vpxor xmm3, xmm3, xmm4
|
||||||
|
vpshufb xmm2, xmm2, xmm10
|
||||||
|
vpshufb xmm3, xmm3, xmm10
|
||||||
|
vpaddq xmm0, xmm0, xmm2
|
||||||
|
vpaddq xmm1, xmm1, xmm3
|
||||||
|
vpaddq xmm0, xmm0, [r8+0x60]
|
||||||
|
vpaddq xmm1, xmm1, [r8+0x70]
|
||||||
|
vpxor xmm6, xmm6, xmm0
|
||||||
|
vpxor xmm7, xmm7, xmm1
|
||||||
|
vpshufb xmm9, xmm6, xmm11 ;xmm9 takes xmm6
|
||||||
|
vpshufb xmm7, xmm7, xmm11
|
||||||
|
vpaddq xmm5, xmm5, xmm9
|
||||||
|
vpaddq xmm4, xmm4, xmm7
|
||||||
|
vpxor xmm2, xmm2, xmm5
|
||||||
|
vpxor xmm3, xmm3, xmm4
|
||||||
|
vpaddq xmm8, xmm2, xmm2
|
||||||
|
vpsrlq xmm2, xmm2, 63
|
||||||
|
vpor xmm8, xmm2, xmm8 ;xmm8 takes xmm2
|
||||||
|
vpaddq xmm2, xmm3, xmm3 ;xmm2 is temp
|
||||||
|
vpsrlq xmm3, xmm3, 63
|
||||||
|
vpor xmm3, xmm3, xmm2
|
||||||
|
;regroup the words again (operand order mirrored from the mid-round group)
vpalignr xmm2, xmm8, xmm3, 8 ;xmm2 resume
|
||||||
|
vpalignr xmm3, xmm3, xmm8, 8
|
||||||
|
vpalignr xmm6, xmm7, xmm9, 8 ;xmm6 resume
|
||||||
|
vpalignr xmm7, xmm9, xmm7, 8
|
||||||
|
ret
|
|
@ -0,0 +1,166 @@
|
||||||
|
;void Blake2PrepareMidstate4(void *midstate, unsigned char *input);
;midstate: 256 bytes of buffer for output midstate, aligned by 32
;input: 140 bytes header, preferably aligned by 8
;
;Overview (AVX2, four 64-bit state words per ymm register):
; 1. Compress the first 128 input bytes: 12 calls to _ProcBlakeRound, with the
;    per-round message order built on the stack by _ProcBlakeMsgSched.
; 2. Store the resulting 64-byte state at midstate+0x80..0xbf, and the rest of
;    the input (8 bytes at input+0x80, 4 bytes at input+0x88), each broadcast
;    to four qwords, at midstate+0xc0 and midstate+0xe0.
; 3. Start the second message block: the v-words that can be computed without
;    the per-call index are stored at midstate+0x00..0x7f (see the ;vN notes).
;Registers: rdi=midstate, rsi=input. Uses rax, r8-r11, ymm0-ymm7 and 0x188
;bytes of stack.

Blake2PrepareMidstate4:
        ;stack scratch: three 0x80-byte message schedules
        sub     rsp, 0x188
        ;ymm6/ymm7 = byte-shuffle masks used by _ProcBlakeRound and below
        vbroadcasti128 ymm6, xword [xshufb_ror24]
        vbroadcasti128 ymm7, xword [xshufb_ror16]

        ;ymm0:ymm1 = s0..s7 (initial state), ymm2:ymm3 = iv with 0x80 xored
        ;into iv4
        vmovdqa ymm0, yword [s0]
        vmovdqa ymm1, yword [s4]
        vmovdqa ymm2, yword [iv]
        vmovdqa ymm3, yword [iv4xor128]

        ;rounds 0 and 1: schedules stay at rsp and rsp+0x80 for reuse below
        mov     r8, rsp
        lea     r9, [blake2sigma]
        lea     r11, [blake2sigma+160]
        call    _ProcBlakeMsgSched
        call    _ProcBlakeRound
        add     r8, 0x80
        add     r9, 16
        call    _ProcBlakeMsgSched
        call    _ProcBlakeRound
        add     r8, 0x80
        add     r9, 16
        ;rounds 2..9: schedule rebuilt at rsp+0x100 each pass until r9
        ;reaches blake2sigma+160 (r8 is not advanced inside the loop)
_LoopEhPrepare1:
        call    _ProcBlakeMsgSched
        call    _ProcBlakeRound
        add     r9, 16
        cmp     r9, r11
        jb      _LoopEhPrepare1
        ;rounds 10 and 11: reuse the schedules built at rsp and rsp+0x80
        mov     r8, rsp
        call    _ProcBlakeRound
        add     r8, 0x80
        call    _ProcBlakeRound

        ;fold: new state = ymm0:1 xor ymm2:3 xor initial state (s0..s7)
        vpxor   ymm0, ymm0, ymm2
        vpxor   ymm1, ymm1, ymm3
        vpxor   ymm0, ymm0, yword [s0]
        vpxor   ymm1, ymm1, yword [s4]
        vmovdqa yword [rdi+0x80], ymm0
        vmovdqa yword [rdi+0xa0], ymm1
        ;input qword at +0x80, broadcast to all four lanes
        vmovq   xmm5, [rsi+0x80]
        vpbroadcastq ymm4, xmm5
        vmovdqa yword [rdi+0xc0], ymm4
        ;input dword at +0x88 (zero-extended to a qword), broadcast likewise
        vmovd   xmm4, [rsi+0x88]
        vpbroadcastq ymm4, xmm4
        vmovdqa yword [rdi+0xe0], ymm4

        ;Begin second message block
        vmovdqa ymm2, yword [iv]
        vmovdqa ymm3, yword [iv4xor144]         ;also loads iv6inverted
        vpaddq  ymm0, ymm0, ymm1
        vpaddq  ymm0, ymm0, ymm5                ;ymm5[63:0]=message
        vpxor   ymm3, ymm3, ymm0
        vpshufd ymm3, ymm3, 0xb1
        vmovq   [rdi+0x08], xmm3                ;v12
        vpaddq  ymm2, ymm2, ymm3
        vmovq   [rdi+0x10], xmm2                ;v8
        vpxor   ymm1, ymm1, ymm2
        vpshufb ymm1, ymm1, ymm6
        vmovq   [rdi+0x18], xmm1                ;v4

        vpaddq  ymm0, ymm0, ymm1
        vmovq   [rdi], xmm0                     ;v0, v3 ready
        ;The second message word of this step (nonce, index) would be added
        ;to xmm0 here, but it is not available at prepare time (presumably it
        ;is supplied per call by Blake2Run4).
        vpxor   ymm3, ymm3, ymm0
        vpshufb ymm3, ymm3, ymm7
        vextracti128 xmm4, ymm3, 1
        vmovdqa xword [rdi+0x40], xmm4          ;v14,15
        vpaddq  ymm2, ymm2, ymm3
        vpextrq [rdi+0x70], xmm2, 1             ;v9
        vextracti128 xmm5, ymm2, 1
        vmovdqa xword [rdi+0x50], xmm5          ;v10,11
        vpxor   ymm1, ymm1, ymm2
        ;rotate each qword left by 1 (x+x | x>>63)
        vpaddq  ymm4, ymm1, ymm1
        vpsrlq  ymm1, ymm1, 63
        vpor    ymm1, ymm1, ymm4
        ;Valid:
        ; v1 v2 v3
        ; v5 v6 v7
        ; v9 v10 v11
        ; v13 v14 v15
        ;
        ;v1 v2 <- v6 v7
        ;v13 <- v2

        vpermq  ymm1, ymm1, 0x39
        vmovdqa xword [rdi+0x20], xmm1          ;v5,6

        vextracti128 xmm4, ymm0, 1
        vextracti128 xmm5, ymm1, 1
        vpextrq [rdi+0x60], xmm4, 1             ;v3
        vmovq   [rdi+0x68], xmm5                ;v7

        vpsrldq xmm3, xmm3, 8
        vpaddq  xmm0, xmm0, xmm1
        vpextrq [rdi+0x30], xmm0, 1             ;v1
        vpaddq  xmm4, xmm4, xmm5
        vmovq   [rdi+0x78], xmm4                ;v2
        vpxor   xmm3, xmm3, xmm4
        vpshufd xmm3, xmm3, 0xb1
        vmovq   [rdi+0x38], xmm3                ;v13

        add     rsp, 0x188
        ;All results are in memory; clear the upper ymm halves so the caller
        ;(typically SSE-encoded compiler output) does not pay AVX/SSE
        ;transition penalties. Vector registers are caller-saved.
        vzeroupper
        ret
|
||||||
|
|
||||||
|
align 16
|
||||||
|
;Builds one permuted message schedule: for i = 0..15,
;  qword dst[i] = qword src[ sigma[i] ]
;where sigma[i] is the i-th byte at r9. Clobbers rax and r10 (r10d = 16 on
;return); rsi, r8 and r9 are left unchanged.
_ProcBlakeMsgSched:
|
||||||
|
;rsi=src
|
||||||
|
;r8=dst
|
||||||
|
;r9=sigma table
|
||||||
|
xor r10d, r10d
|
||||||
|
_LoopBlakeMsgSched:
|
||||||
|
;eax = source word index for output slot r10
movzx eax, byte [r9+r10]
|
||||||
|
mov rax, [rsi+rax*8]
|
||||||
|
mov [r8+r10*8], rax
|
||||||
|
add r10d, 1
|
||||||
|
cmp r10d, 16
|
||||||
|
jb _LoopBlakeMsgSched
|
||||||
|
ret
|
||||||
|
|
||||||
|
align 16
|
||||||
|
;One round over the 16-word state held four words per register in ymm0-ymm3.
;Inputs: r8 = 0x80 bytes of message words already in round order (as written
;        by _ProcBlakeMsgSched), ymm6 = ror24 shuffle mask, ymm7 = ror16
;        shuffle mask (both broadcast by the caller).
;Each half performs add / xor / rotate steps: vpshufd 0xb1 swaps the dwords of
;each qword, vpshufb applies the mask, and x+x | x>>63 rotates left by 1.
;vpermq rotates the qwords of ymm1-ymm3 between the halves and rotates them
;back (0x39 <-> 0x93, 0x4e twice) before returning.
;Clobbers ymm4; reads memory only through r8.
_ProcBlakeRound:
|
||||||
|
;first half: message words at [r8] and [r8+0x20]
vpaddq ymm0, ymm0, ymm1
|
||||||
|
vpaddq ymm0, ymm0, [r8]
|
||||||
|
vpxor ymm3, ymm3, ymm0
|
||||||
|
vpshufd ymm3, ymm3, 0xb1
|
||||||
|
vpaddq ymm2, ymm2, ymm3
|
||||||
|
vpxor ymm1, ymm1, ymm2
|
||||||
|
vpshufb ymm1, ymm1, ymm6 ;ror24
|
||||||
|
vpaddq ymm0, ymm0, ymm1
|
||||||
|
vpaddq ymm0, ymm0, [r8+0x20]
|
||||||
|
vpxor ymm3, ymm3, ymm0
|
||||||
|
vpshufb ymm3, ymm3, ymm7 ;ror16
|
||||||
|
vpaddq ymm2, ymm2, ymm3
|
||||||
|
vpxor ymm1, ymm1, ymm2
|
||||||
|
vpaddq ymm4, ymm1, ymm1
|
||||||
|
vpsrlq ymm1, ymm1, 63
|
||||||
|
vpor ymm1, ymm1, ymm4
|
||||||
|
|
||||||
|
;rotate the qword lanes of ymm1/ymm2/ymm3 by 1/2/3 positions
vpermq ymm1, ymm1, 0x39
|
||||||
|
vpermq ymm2, ymm2, 0x4e
|
||||||
|
vpermq ymm3, ymm3, 0x93
|
||||||
|
|
||||||
|
;second half: message words at [r8+0x40] and [r8+0x60]
vpaddq ymm0, ymm0, ymm1
|
||||||
|
vpaddq ymm0, ymm0, [r8+0x40]
|
||||||
|
vpxor ymm3, ymm3, ymm0
|
||||||
|
vpshufd ymm3, ymm3, 0xb1
|
||||||
|
vpaddq ymm2, ymm2, ymm3
|
||||||
|
vpxor ymm1, ymm1, ymm2
|
||||||
|
vpshufb ymm1, ymm1, ymm6 ;ror24
|
||||||
|
vpaddq ymm0, ymm0, ymm1
|
||||||
|
vpaddq ymm0, ymm0, [r8+0x60]
|
||||||
|
vpxor ymm3, ymm3, ymm0
|
||||||
|
vpshufb ymm3, ymm3, ymm7 ;ror16
|
||||||
|
vpaddq ymm2, ymm2, ymm3
|
||||||
|
vpxor ymm1, ymm1, ymm2
|
||||||
|
vpaddq ymm4, ymm1, ymm1
|
||||||
|
vpsrlq ymm1, ymm1, 63
|
||||||
|
vpor ymm1, ymm1, ymm4
|
||||||
|
|
||||||
|
;undo the lane rotation applied after the first half
vpermq ymm1, ymm1, 0x93
|
||||||
|
vpermq ymm2, ymm2, 0x4e
|
||||||
|
vpermq ymm3, ymm3, 0x39
|
||||||
|
ret
|
|
@ -0,0 +1,11 @@
|
||||||
|
;Top-level fasm unit for the AVX1 variant (presumably zcblake2_avx1.asm, which
;the Makefile links as zcblake2_avx1.o). Produces an ELF64 object exporting the
;two entry points declared in example_avx1.c.
format elf64
|
||||||
|
public Blake2PrepareMidstate2
|
||||||
|
public Blake2Run2
|
||||||
|
|
||||||
|
;code: midstate preparation first, then the hashing routine
section '.text' executable align 64
|
||||||
|
include "proc_prepmidstate_avx1.asm"
|
||||||
|
align 16
|
||||||
|
include "proc_blake2_avx1.asm"
|
||||||
|
|
||||||
|
;constants (iv, sigma table, shuffle masks); section alignment 64 keeps the
;"align 32" directives inside data_blake2b.asm meaningful
section '.data' writeable align 64
|
||||||
|
include "data_blake2b.asm"
|
Binary file not shown.
|
@ -0,0 +1,11 @@
|
||||||
|
;Top-level fasm unit for the AVX2 variant (presumably zcblake2_avx2.asm, which
;the Makefile links as zcblake2_avx2.o). Produces an ELF64 object exporting the
;two entry points declared in example_avx2.c.
format elf64
|
||||||
|
public Blake2PrepareMidstate4
|
||||||
|
public Blake2Run4
|
||||||
|
|
||||||
|
;code: midstate preparation first, then the hashing routine
section '.text' executable align 64
|
||||||
|
include "proc_prepmidstate_avx2.asm"
|
||||||
|
align 16
|
||||||
|
include "proc_blake2_avx2.asm"
|
||||||
|
|
||||||
|
;constants (iv, sigma table, shuffle masks); the AVX2 code loads several of
;them with 32-byte vmovdqa, so the section is aligned to 64
section '.data' writeable align 64
|
||||||
|
include "data_blake2b.asm"
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,51 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
void Blake2PrepareMidstate2(void *midstate, unsigned char *input);
|
||||||
|
//midstate: 256 bytes of buffer for output midstate, aligned by 32
|
||||||
|
//input: 140 bytes header, preferably aligned by 8
|
||||||
|
|
||||||
|
void Blake2Run2(unsigned char *hashout, void *midstate, uint32_t indexctr);
|
||||||
|
//hashout: hash output buffer: 2*64 bytes
|
||||||
|
//midstate: 256 bytes from Blake2PrepareMidstate2
|
||||||
|
//indexctr: For n=200, k=9: {0, 2, 4, ..., 1048574}
|
||||||
|
|
||||||
|
// Sample 140-byte input header passed to Blake2PrepareMidstate2() in main().
// Aligned to 8 bytes because the assembly routine prefers 8-byte aligned input.
unsigned char __attribute__((aligned(8))) testdata[140] =
|
||||||
|
{
|
||||||
|
0x04, 0x00, 0x00, 0x00, 0x91, 0x5F, 0xA6, 0x1C, 0x4F, 0xA5, 0x92, 0x3C, 0xE6, 0xEE, 0xAD, 0x06,
|
||||||
|
0x74, 0x6B, 0x61, 0x22, 0x54, 0x94, 0xEA, 0x5A, 0x2A, 0x97, 0xAE, 0x46, 0x6E, 0x6F, 0xAA, 0x9C,
|
||||||
|
0x6E, 0xF6, 0x3A, 0x0D, 0xA5, 0xFC, 0x67, 0xD7, 0xF8, 0xDC, 0x78, 0xC3, 0xC8, 0x70, 0xCA, 0x09,
|
||||||
|
0xBA, 0xAB, 0xAA, 0xF7, 0x02, 0x59, 0x68, 0xA8, 0x6F, 0xEB, 0x88, 0x75, 0xD3, 0xF3, 0xFF, 0xA7,
|
||||||
|
0x2E, 0xB0, 0x0F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||||
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||||
|
0x00, 0x00, 0x00, 0x00, 0x66, 0xCE, 0xD2, 0x57, 0x0F, 0x0F, 0x0F, 0x20, 0x00, 0x00, 0xF7, 0xF1,
|
||||||
|
0x94, 0xA2, 0x53, 0x8E, 0x42, 0x5F, 0x21, 0x33, 0xCF, 0xA8, 0xD3, 0xCB, 0xF4, 0xDF, 0x71, 0xEF,
|
||||||
|
0x38, 0x28, 0x51, 0x75, 0xCF, 0xED, 0xCB, 0x3E, 0x63, 0xA2, 0x00, 0x00
|
||||||
|
};
|
||||||
|
//expected output: 281dd5fc6d878538e640987b9bc597dbbd4af2cdf8bf5fb03bdfcefa40d8747d out.bin
|
||||||
|
|
||||||
|
int main(void)
|
||||||
|
{
|
||||||
|
unsigned char midstate_a[256+32];
|
||||||
|
void *pmidstate = (void *) (((long) midstate_a+31L) & -32L);
|
||||||
|
unsigned char hashout_a[128+32];
|
||||||
|
unsigned char *phashout = (unsigned char *) (((long) hashout_a+31L) & -32L);
|
||||||
|
unsigned char buf[128];
|
||||||
|
FILE *outfile;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
Blake2PrepareMidstate2(pmidstate, testdata);
|
||||||
|
outfile = fopen("out.bin", "wb");
|
||||||
|
|
||||||
|
for (i=0; i<1048576; i+=2) {
|
||||||
|
Blake2Run2(phashout, pmidstate, i);
|
||||||
|
memcpy(buf, phashout, 50);
|
||||||
|
memcpy(buf+50, phashout+64, 50);
|
||||||
|
fwrite(buf, 100, 1, outfile);
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(outfile);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
Binary file not shown.
|
@ -0,0 +1,58 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
void Blake2PrepareMidstate4(void *midstate, unsigned char *input);
|
||||||
|
//midstate: 256 bytes of buffer for output midstate, aligned by 32
|
||||||
|
//input: 140 bytes header, preferably aligned by 8
|
||||||
|
|
||||||
|
void Blake2Run4(unsigned char *hashout, void *midstate, uint32_t indexctr);
|
||||||
|
//hashout: hash output buffer: 4*64 bytes
|
||||||
|
//midstate: 256 bytes from Blake2PrepareMidstate4
|
||||||
|
//indexctr: For n=200, k=9: {0, 4, 8, ..., 1048572}
|
||||||
|
|
||||||
|
// Sample 140-byte input header passed to Blake2PrepareMidstate4() in main().
// Aligned to 8 bytes because the assembly routine prefers 8-byte aligned input.
unsigned char __attribute__((aligned(8))) testdata[140] =
|
||||||
|
{
|
||||||
|
0x04, 0x00, 0x00, 0x00, 0x91, 0x5F, 0xA6, 0x1C, 0x4F, 0xA5, 0x92, 0x3C, 0xE6, 0xEE, 0xAD, 0x06,
|
||||||
|
0x74, 0x6B, 0x61, 0x22, 0x54, 0x94, 0xEA, 0x5A, 0x2A, 0x97, 0xAE, 0x46, 0x6E, 0x6F, 0xAA, 0x9C,
|
||||||
|
0x6E, 0xF6, 0x3A, 0x0D, 0xA5, 0xFC, 0x67, 0xD7, 0xF8, 0xDC, 0x78, 0xC3, 0xC8, 0x70, 0xCA, 0x09,
|
||||||
|
0xBA, 0xAB, 0xAA, 0xF7, 0x02, 0x59, 0x68, 0xA8, 0x6F, 0xEB, 0x88, 0x75, 0xD3, 0xF3, 0xFF, 0xA7,
|
||||||
|
0x2E, 0xB0, 0x0F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||||
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||||
|
0x00, 0x00, 0x00, 0x00, 0x66, 0xCE, 0xD2, 0x57, 0x0F, 0x0F, 0x0F, 0x20, 0x00, 0x00, 0xF7, 0xF1,
|
||||||
|
0x94, 0xA2, 0x53, 0x8E, 0x42, 0x5F, 0x21, 0x33, 0xCF, 0xA8, 0xD3, 0xCB, 0xF4, 0xDF, 0x71, 0xEF,
|
||||||
|
0x38, 0x28, 0x51, 0x75, 0xCF, 0xED, 0xCB, 0x3E, 0x63, 0xA2, 0x00, 0x00
|
||||||
|
};
|
||||||
|
//expected output: 281dd5fc6d878538e640987b9bc597dbbd4af2cdf8bf5fb03bdfcefa40d8747d out.bin
|
||||||
|
|
||||||
|
int main(void)
|
||||||
|
{
|
||||||
|
unsigned char midstate_a[256+32];
|
||||||
|
void *pmidstate = midstate_a; // (void *) (((long) midstate_a+31L) & -32L);
|
||||||
|
unsigned char hashout_a[256+32];
|
||||||
|
unsigned char *phashout = (unsigned char *) (((long) hashout_a+31L) & -32L);
|
||||||
|
unsigned char buf[256];
|
||||||
|
FILE *outfile;
|
||||||
|
int i,x;
|
||||||
|
|
||||||
|
Blake2PrepareMidstate4(pmidstate, testdata);
|
||||||
|
outfile = fopen("out.bin", "wb");
|
||||||
|
|
||||||
|
for (i=0; i<10*1048576; i+=4) {
|
||||||
|
Blake2Run4(phashout, pmidstate, i);
|
||||||
|
#if 0
|
||||||
|
memcpy(buf, phashout, 50);
|
||||||
|
memcpy(buf+50, phashout+64, 50);
|
||||||
|
memcpy(buf+100, phashout+128, 50);
|
||||||
|
memcpy(buf+150, phashout+192, 50);
|
||||||
|
fwrite(buf, 200, 1, outfile);
|
||||||
|
#else
|
||||||
|
x += phashout[0];
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(outfile);
|
||||||
|
printf("x = %d\n", x);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
Binary file not shown.
Loading…
Reference in New Issue