;-----------------------------------------------------------------------
; BLAKE2b midstate preparation -- x86-64, NASM syntax, VEX/AVX encodings
; Data labels (s0..s6, iv, iv4xor*, blake2sigma, xshufb_ror*) are
; defined elsewhere in this file/module.
;-----------------------------------------------------------------------
;void Blake2PrepareMidstate2(void *midstate, unsigned char *input);
;
;midstate: 256 bytes of buffer for output midstate, aligned by 32
;input:    140 bytes header, preferably aligned by 8
;-----------------------------------------------------------------------
; void Blake2PrepareMidstate2(void *midstate /*rdi*/, unsigned char *input /*rsi*/)
; ABI: SysV AMD64. Clobbers rax, r8-r11, xmm0-xmm11 (all volatile);
; no callee-saved register is touched.
;
; Compresses the first 128-byte block of the 140-byte header through the
; full 12-round BLAKE2b F function, stores the resulting chaining value
; at midstate[0x80..0xbf] and the header tail at [0xc0..0xdf], then
; pre-computes as much of the FINAL block's first round as is possible
; without the still-unknown message words (see the "nonce, index" note
; below), scattering those partial v values into the midstate.
;
; Register layout during the rounds (BLAKE2b work vector v0..v15, two
; 64-bit words per xmm register):
;   xmm0:xmm1 = v0..v3     xmm2:xmm3 = v4..v7
;   xmm4:xmm5 = v8..v11    xmm6:xmm7 = v12..v15
;   xmm8,xmm9 = scratch; xmm10/xmm11 = pshufb masks for ror24/ror16
;-----------------------------------------------------------------------
Blake2PrepareMidstate2:
sub rsp, 0x188 ;3*0x80 bytes message-schedule scratch (+8 pad keeps rsp 16-aligned at each call)
;byte-granular rotations are done with pshufb through these masks
vmovdqa xmm10, xword [xshufb_ror24]
vmovdqa xmm11, xword [xshufb_ror16]
;v0..v7 = h (s0..s6: precomputed initial state, defined elsewhere)
vmovdqa xmm0, xword [s0]
vmovdqa xmm1, xword [s2]
vmovdqa xmm2, xword [s4]
vmovdqa xmm3, xword [s6]
;v8..v15 = IV with the block counter folded in; iv4xor128 presumably
;holds IV[4..7] with t=128 (first full block) xored in -- the constant
;is defined outside this chunk, confirm there
vmovdqa xmm4, xword [iv]
vmovdqa xmm5, xword [iv+0x10]
vmovdqa xmm6, xword [iv4xor128]
vmovdqa xmm7, xword [iv4xor128+0x10]
;rounds 0 and 1: their schedules are built at [rsp] and [rsp+0x80] and
;KEPT, because BLAKE2b rounds 10/11 reuse sigma rows 0/1
mov r8, rsp ;r8 = schedule buffer for the current round
lea r9, [blake2sigma] ;r9 = current 16-byte sigma row
lea r11, [blake2sigma+160] ;r11 = end of the 10 sigma rows
call _ProcBlakeMsgSched
call _ProcBlakeRound
add r8, 0x80
add r9, 16
call _ProcBlakeMsgSched
call _ProcBlakeRound
add r8, 0x80 ;rounds 2..9 all share the scratch slot at rsp+0x100
add r9, 16
_LoopEhPrepare1:
call _ProcBlakeMsgSched
call _ProcBlakeRound
add r9, 16
cmp r9, r11
jb _LoopEhPrepare1
;rounds 10 and 11: replay the schedules saved for rounds 0 and 1
mov r8, rsp
call _ProcBlakeRound
add r8, 0x80
call _ProcBlakeRound
;finalize block 1: h'[i] = h[i] ^ v[i] ^ v[i+8]
vpxor xmm0, xmm0, xmm4
vpxor xmm1, xmm1, xmm5
vpxor xmm2, xmm2, xmm6
vpxor xmm3, xmm3, xmm7
vpxor xmm0, xmm0, xword [s0]
vpxor xmm1, xmm1, xword [s2]
vpxor xmm2, xmm2, xword [s4]
vpxor xmm3, xmm3, xword [s6]
;store the new chaining value h' at midstate+0x80..0xbf
vmovdqa xword [rdi+0x80], xmm0
vmovdqa xword [rdi+0x90], xmm1
vmovdqa xword [rdi+0xa0], xmm2
vmovdqa xword [rdi+0xb0], xmm3
;stash the header tail input[128..139] into the midstate, each value
;duplicated into both xmm lanes
vmovq xmm8, [rsi+0x80] ;qword input[128..135]; xmm8 is reused below
vpshufd xmm4, xmm8, 0x44 ;broadcast low qword to both lanes
vmovdqa xword [rdi+0xc0], xmm4
vmovd xmm4, [rsi+0x88] ;dword input[136..139], zero-extended to a qword
vpshufd xmm4, xmm4, 0x44
vmovdqa xword [rdi+0xd0], xmm4
;Begin second message block (the final, partial one).
;v8..v15 reloaded from IV with final-block tweaks: iv6inverted = ~IV[6]
;(the last-block flag f0, presumably). NOTE(review): iv4xor144's name
;implies t=144, yet the header is 140 bytes so t=140 would be expected;
;the constant's value is defined outside this chunk -- verify it there.
vmovdqa xmm4, xword [iv]
vmovdqa xmm5, xword [iv+0x10]
vmovdqa xmm6, xword [iv4xor144]
vmovdqa xmm7, xword [iv6inverted]
;column step of the final block's round 0, using the one message word
;already known (input[128..135] in xmm8). Each v lane is stored into
;the midstate at the last point it can be computed without the missing
;message words; the midstate consumer resumes from these values.
vpaddq xmm0, xmm0, xmm2 ;a += b
vpaddq xmm1, xmm1, xmm3
vpaddq xmm0, xmm0, xmm8 ;xmm8[63:0]=message word m0
vpxor xmm6, xmm6, xmm0 ;d ^= a
vpxor xmm7, xmm7, xmm1
vpshufd xmm6, xmm6, 0xb1 ;d = ror64(d,32) -- dword swap per qword
vmovq [rdi+0x08], xmm6 ;v12
vpshufd xmm7, xmm7, 0xb1
vpaddq xmm4, xmm4, xmm6 ;c += d
vmovq [rdi+0x10], xmm4 ;v8
vpaddq xmm5, xmm5, xmm7
vpxor xmm2, xmm2, xmm4 ;b ^= c
vpxor xmm3, xmm3, xmm5
vpshufb xmm2, xmm2, xmm10 ;b = ror64(b,24)
vmovq [rdi+0x18], xmm2 ;v4
vpshufb xmm3, xmm3, xmm10
vpaddq xmm0, xmm0, xmm2 ;a += b (message word still missing, see below)
vmovq [rdi], xmm0 ;v0
vpaddq xmm1, xmm1, xmm3
vpextrq [rdi+0x60], xmm1, 1 ;v3
;add message (nonce, index) to xmm0 here, but we don't have it yet --
;the midstate consumer must add it when it becomes known
vpxor xmm6, xmm6, xmm0 ;d ^= a
vpxor xmm7, xmm7, xmm1
vpshufb xmm6, xmm6, xmm11 ;d = ror64(d,16)
vpshufb xmm7, xmm7, xmm11
vmovdqa xword [rdi+0x40], xmm7 ;v14,15
vpaddq xmm4, xmm4, xmm6 ;c += d
vpextrq [rdi+0x70], xmm4, 1 ;v9
vpaddq xmm5, xmm5, xmm7
vpxor xmm2, xmm2, xmm4 ;b ^= c
vpxor xmm3, xmm3, xmm5
;b = ror64(b,63) == rotl1: (b<<1) | (b>>63)
vpaddq xmm8, xmm2, xmm2 ;b<<1
vpsrlq xmm2, xmm2, 63
vpor xmm8, xmm2, xmm8 ;xmm8 takes xmm2
vpaddq xmm2, xmm3, xmm3 ;xmm2 is temp
vpsrlq xmm3, xmm3, 63
vpor xmm3, xmm3, xmm2
;regroup the rotated b lanes into storage order
vpalignr xmm2, xmm3, xmm8, 8 ;xmm2 resume
vmovdqa xword [rdi+0x20], xmm2 ;v5,6
vpsrldq xmm3, xmm3, 8
vmovq [rdi+0x68], xmm3 ;v7
vpsrldq xmm7, xmm6, 8 ;pull v13 (high lane of d) down for its last computable update
vpaddq xmm0, xmm0, xmm2 ;a += b
vpextrq [rdi+0x30], xmm0, 1 ;v1
vpaddq xmm1, xmm1, xmm3
vmovq [rdi+0x78], xmm1 ;v2
vpxor xmm7, xmm7, xmm1 ;d ^= a
vpshufd xmm7, xmm7, 0xb1 ;d = ror64(d,32)
vmovq [rdi+0x38], xmm7 ;v13
add rsp, 0x188
ret
align 16
;-----------------------------------------------------------------------
; _ProcBlakeMsgSched -- gather the 16-qword message schedule for one round
; In:    rsi = message words (array of qwords)
;        r9  = 16-byte sigma permutation row for this round
;        r8  = 128-byte destination buffer
; Out:   qword [r8 + i*8] = src[sigma[i]] for i = 0..15
; Clobb: rax, r10, flags; no xmm register is touched
;-----------------------------------------------------------------------
_ProcBlakeMsgSched:
        xor     r10d, r10d              ; i = 0
.gather:
        movzx   eax, byte [r9+r10]      ; eax = sigma[i]
        mov     rax, [rsi+rax*8]        ; rax = src[sigma[i]]
        mov     [r8+r10*8], rax         ; dst[i] = src[sigma[i]]
        lea     r10d, [r10+1]           ; ++i
        cmp     r10d, 16
        jb      .gather
        ret
align 16
;-----------------------------------------------------------------------
; _ProcBlakeRound -- one full BLAKE2b round (column step + diagonal step)
; on the two-qwords-per-xmm state layout used by Blake2PrepareMidstate2.
; In:    xmm0..xmm7 = work vector v0..v15
;        [r8+0x00..0x7f] = this round's pre-permuted 16-qword schedule
;        xmm10/xmm11 = pshufb masks for ror24/ror16 (read only)
; Out:   xmm0..xmm7 updated in place
; Clobb: xmm8, xmm9; writes no general-purpose register
; The G rotations are the BLAKE2b constants 32, 24, 16, 63:
;   ror32 = vpshufd 0xb1 (swap the dwords inside each qword)
;   ror24 / ror16 = vpshufb with the mask registers
;   ror63 = rotl1 = (x<<1)|(x>>63) via vpaddq / vpsrlq / vpor
;-----------------------------------------------------------------------
_ProcBlakeRound:
;---- column step: G on the four columns ----
vpaddq xmm0, xmm0, xmm2 ;a += b
vpaddq xmm1, xmm1, xmm3
vpaddq xmm0, xmm0, [r8] ;a += m (schedule words 0..3)
vpaddq xmm1, xmm1, [r8+0x10]
vpxor xmm6, xmm6, xmm0 ;d ^= a
vpxor xmm7, xmm7, xmm1
vpshufd xmm6, xmm6, 0xb1 ;d = ror32(d)
vpshufd xmm7, xmm7, 0xb1
vpaddq xmm4, xmm4, xmm6 ;c += d
vpaddq xmm5, xmm5, xmm7
vpxor xmm2, xmm2, xmm4 ;b ^= c
vpxor xmm3, xmm3, xmm5
vpshufb xmm2, xmm2, xmm10 ;b = ror24(b)
vpshufb xmm3, xmm3, xmm10
vpaddq xmm0, xmm0, xmm2 ;second half of G: a += b + m
vpaddq xmm1, xmm1, xmm3
vpaddq xmm0, xmm0, [r8+0x20] ;schedule words 4..7
vpaddq xmm1, xmm1, [r8+0x30]
vpxor xmm6, xmm6, xmm0 ;d ^= a
vpxor xmm7, xmm7, xmm1
vpshufb xmm9, xmm6, xmm11 ;d = ror16(d); xmm9 takes xmm6
vpshufb xmm7, xmm7, xmm11
vpaddq xmm4, xmm4, xmm9 ;c += d
vpaddq xmm5, xmm5, xmm7
vpxor xmm2, xmm2, xmm4 ;b ^= c
vpxor xmm3, xmm3, xmm5
;b = ror63(b) = (b<<1)|(b>>63)
vpaddq xmm8, xmm2, xmm2
vpsrlq xmm2, xmm2, 63
vpor xmm8, xmm2, xmm8 ;xmm8 takes xmm2
vpaddq xmm2, xmm3, xmm3 ;xmm2 is temp
vpsrlq xmm3, xmm3, 63
vpor xmm3, xmm3, xmm2
;---- diagonalize: rotate the b and d lane pairs so the four diagonals
;---- line up as columns ----
vpalignr xmm2, xmm3, xmm8, 8 ;xmm2 resume
vpalignr xmm3, xmm8, xmm3, 8
vpalignr xmm6, xmm9, xmm7, 8 ;xmm6 resume
vpalignr xmm7, xmm7, xmm9, 8
;---- diagonal step: same G, but note the crossed c/d register pairing
;---- (xmm5 with xmm6, xmm4 with xmm7) produced by the lane rotation ----
vpaddq xmm0, xmm0, xmm2 ;a += b
vpaddq xmm1, xmm1, xmm3
vpaddq xmm0, xmm0, [r8+0x40] ;a += m (schedule words 8..11)
vpaddq xmm1, xmm1, [r8+0x50]
vpxor xmm6, xmm6, xmm0 ;d ^= a
vpxor xmm7, xmm7, xmm1
vpshufd xmm6, xmm6, 0xb1 ;d = ror32(d)
vpshufd xmm7, xmm7, 0xb1
vpaddq xmm5, xmm5, xmm6 ;c += d (crossed pairing)
vpaddq xmm4, xmm4, xmm7
vpxor xmm2, xmm2, xmm5 ;b ^= c
vpxor xmm3, xmm3, xmm4
vpshufb xmm2, xmm2, xmm10 ;b = ror24(b)
vpshufb xmm3, xmm3, xmm10
vpaddq xmm0, xmm0, xmm2 ;second half of G: a += b + m
vpaddq xmm1, xmm1, xmm3
vpaddq xmm0, xmm0, [r8+0x60] ;schedule words 12..15
vpaddq xmm1, xmm1, [r8+0x70]
vpxor xmm6, xmm6, xmm0 ;d ^= a
vpxor xmm7, xmm7, xmm1
vpshufb xmm9, xmm6, xmm11 ;d = ror16(d); xmm9 takes xmm6
vpshufb xmm7, xmm7, xmm11
vpaddq xmm5, xmm5, xmm9 ;c += d (crossed pairing)
vpaddq xmm4, xmm4, xmm7
vpxor xmm2, xmm2, xmm5 ;b ^= c
vpxor xmm3, xmm3, xmm4
;b = ror63(b)
vpaddq xmm8, xmm2, xmm2
vpsrlq xmm2, xmm2, 63
vpor xmm8, xmm2, xmm8 ;xmm8 takes xmm2
vpaddq xmm2, xmm3, xmm3 ;xmm2 is temp
vpsrlq xmm3, xmm3, 63
vpor xmm3, xmm3, xmm2
;---- undiagonalize: inverse lane rotation, back to column order ----
vpalignr xmm2, xmm8, xmm3, 8 ;xmm2 resume
vpalignr xmm3, xmm3, xmm8, 8
vpalignr xmm6, xmm7, xmm9, 8 ;xmm6 resume
vpalignr xmm7, xmm9, xmm7, 8
ret