HeavyThing - sha2.inc

Jeff Marrison

Table of functions

	; ------------------------------------------------------------------------
	; HeavyThing x86_64 assembly language library and showcase programs
	; Copyright © 2015-2018 2 Ton Digital 
	; Homepage: https://2ton.com.au/
	; Author: Jeff Marrison <jeff@2ton.com.au>
	;       
	; This file is part of the HeavyThing library.
	;       
	; HeavyThing is free software: you can redistribute it and/or modify
	; it under the terms of the GNU General Public License, or
	; (at your option) any later version.
	;       
	; HeavyThing is distributed in the hope that it will be useful, 
	; but WITHOUT ANY WARRANTY; without even the implied warranty of
	; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
	; GNU General Public License for more details.
	;       
	; You should have received a copy of the GNU General Public License along
	; with the HeavyThing library. If not, see <http://www.gnu.org/licenses/>.
	; ------------------------------------------------------------------------
	;       
	; sha2.inc: SHA2-{224,256,384,512} goods
	;
	;
	; lots of different ways to skin this cat floating around on the net..
	; someday when I am bored, implement the different ones ;-)
	;
	; these perform as good or better than anything else i could find that
	; was non-SSE4/AVX/AVX2
	; translated loosely from some of the public domain goods from Wei Dai
	; and modified to suit my environment
	;


sha224_state_size = 144
sha256_state_size = 144
sha384_state_size = 240
sha512_state_size = 240

sha_stateptr_ofs = 0
sha_bitcountptr_ofs = 8
sha_bufferptr_ofs = 16

; sha224/sha256 == 32 bytes for stateptr, 16 bytes for bitcount, 64 bytes for buffer, _after_ our three pointers
; sha384/sha512 == 64 bytes for stateptr, 16 bytes for bitcount, 128 bytes for buffer, _after_ our three pointers
;
; on init, we make sure all three of the pointer values are 16 byte aligned (this assumes the state itself is at least 8 byte aligned)

if used sha224$new | defined include_everything
	; sha224$new: allocate and initialise a fresh SHA-224 state
	; takes no arguments; asks heap$alloc for sha224_state_size bytes and runs
	; sha224$init over the returned block
	; returns: rax == the initialised state
falign
sha224$new:
	prolog	sha224$new
	mov	edi, sha224_state_size
	call	heap$alloc
	mov	rdi, rax
	push	rax			; keep the state pointer across sha224$init
	call	sha224$init
	pop	rax
	epilog

end if

if used sha224$init | defined include_everything
	; sha224$init: lay out and reset a SHA-224 state
	; single argument in rdi: the sha224_state_size byte state block
	; void return
	;
	; the block starts with three pointers (state words, bit count, data buffer)
	; which are aimed at the storage that follows them: that storage starts at
	; rdi+32 when rdi is 16 byte aligned, and at rdi+24 when it is not
falign
sha224$init:
	prolog	sha224$init
	; begin with the rdi+24 layout, then switch to the rdi+32 one if rdi is 16 byte aligned
	lea	rax, [rdi+24]		; state words (32 bytes)
	lea	rcx, [rdi+56]		; bit count (16 bytes)
	lea	rdx, [rdi+72]		; data buffer (64 bytes)
	lea	r8, [rdi+32]
	lea	r9, [rdi+64]
	lea	r10, [rdi+80]
	test	edi, 0xf
	cmovz	rax, r8
	cmovz	rcx, r9
	cmovz	rdx, r10
	mov	[rdi+sha_stateptr_ofs], rax
	mov	[rdi+sha_bitcountptr_ofs], rcx
	mov	[rdi+sha_bufferptr_ofs], rdx
	; memset32 with a zero fill over everything behind the three pointers ...
	push	rax
	lea	rdi, [rdi+24]
	xor	esi, esi
	mov	edx, sha224_state_size - 24
	call	memset32
	; ... then copy the SHA-224 initial hash values into the state words
	pop	rdi
	mov	edx, 32
	mov	rsi, .initial_hash
	call	memcpy
	epilog
dalign
.initial_hash:
	dd	0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4

end if


if used sha256$new | defined include_everything
	; sha256$new: allocate and initialise a fresh SHA-256 state
	; takes no arguments; asks heap$alloc for sha256_state_size bytes and runs
	; sha256$init over the returned block
	; returns: rax == the initialised state
falign
sha256$new:
	prolog	sha256$new
	mov	edi, sha256_state_size
	call	heap$alloc
	mov	rdi, rax
	push	rax			; keep the state pointer across sha256$init
	call	sha256$init
	pop	rax
	epilog

end if


if used sha256$init | defined include_everything
	; sha256$init: lay out and reset a SHA-256 state
	; single argument in rdi: the sha256_state_size byte state block
	; void return
	;
	; the block starts with three pointers (state words, bit count, data buffer)
	; which are aimed at the storage that follows them: that storage starts at
	; rdi+32 when rdi is 16 byte aligned, and at rdi+24 when it is not
falign
sha256$init:
	prolog	sha256$init
	; begin with the rdi+24 layout, then switch to the rdi+32 one if rdi is 16 byte aligned
	lea	rax, [rdi+24]		; state words (32 bytes)
	lea	rcx, [rdi+56]		; bit count (16 bytes)
	lea	rdx, [rdi+72]		; data buffer (64 bytes)
	lea	r8, [rdi+32]
	lea	r9, [rdi+64]
	lea	r10, [rdi+80]
	test	edi, 0xf
	cmovz	rax, r8
	cmovz	rcx, r9
	cmovz	rdx, r10
	mov	[rdi+sha_stateptr_ofs], rax
	mov	[rdi+sha_bitcountptr_ofs], rcx
	mov	[rdi+sha_bufferptr_ofs], rdx
	; memset32 with a zero fill over everything behind the three pointers ...
	push	rax
	lea	rdi, [rdi+24]
	xor	esi, esi
	mov	edx, sha256_state_size - 24
	call	memset32
	; ... then copy the SHA-256 initial hash values into the state words
	pop	rdi
	mov	edx, 32
	mov	rsi, .initial_hash
	call	memcpy
	epilog
dalign
.initial_hash:
	dd	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19


end if



if used sha256$update | used sha224$update | defined include_everything
	; sha256$update / sha224$update: feed message bytes into the hash
	; three arguments: rdi == sha state, rsi == byte buffer, rdx == length of same
	; void return
	;
	; bytes that do not complete a 64 byte block are parked in the state's data
	; buffer; the low bits of the running bit count tell us how many are parked
falign
sha256$update:
sha224$update:
	prolog	sha256$update
	mov	r8, [rdi+sha_bitcountptr_ofs]
	test	rdx, rdx
	jz	.nothingtodo
	mov	rcx, [r8]
	shr	rcx, 3
	mov	r9d, 64
	and	ecx, 0x3f			; rcx == bytes already parked in the buffer
	jz	.noused
	sub	r9d, ecx			; r9 == room left in the buffer
	cmp	rdx, r9
	jb	.needmore
	; the input completes the parked block: split it into the part that tops up
	; the buffer (r9 bytes) and the remainder, which is saved for afterward
	lea	r10, [rsi+r9]			; first input byte after the top-up
	mov	r11, rdx
	sub	r11, r9				; input bytes left after the top-up
	push	rdi r10 r11
	mov	rdx, r9
	shl	r9, 3
	add	qword [r8], r9			; account for the top-up bits
	mov	rdi, [rdi+sha_bufferptr_ofs]
	add	rdi, rcx
	call	memcpy
	; hash the now complete 64 byte buffer
	mov	rdi, [rsp+16]
	mov	rsi, [rdi+sha_bufferptr_ofs]
	mov	edx, 64
	call	sha256$transform
	pop	rdx rsi rdi
	mov	r8, [rdi+sha_bitcountptr_ofs]
	jmp	.noused
calign
.needmore:
	; the input fits into the buffer without completing it: account for it and append
	lea	r10, [rdx*8]
	add	qword [r8], r10
	mov	rdi, [rdi+sha_bufferptr_ofs]
	add	rdi, rcx
	call	memcpy
	epilog
calign
.noused:
	; buffer is empty here: account for all of the (remaining) input up front
	lea	rcx, [rdx*8]
	add	qword [r8], rcx
	cmp	rdx, 64
	jb	.partial
	; hash the whole blocks straight from the caller's memory; sha256$transform
	; comes back with rsi/rdx describing the unprocessed tail
	call	sha256$transform
	test	rdx, rdx
	jz	.nothingtodo
calign
.partial:
	; park the tail in the buffer for next time
	mov	rdi, [rdi+sha_bufferptr_ofs]
	call	memcpy
calign
.nothingtodo:
	epilog


end if


if used sha256$transform | defined include_everything
	; note: not meant to be called externally, but for profiling reasons down the track
	; is made with the normal profiler/public symbol entries
	; called from sha256$update and sha256$final
	;
	; arguments: rdi == sha state, rsi == message bytes, rdx == byte count (callers in
	; this file always pass at least 64)
	; processes 64 byte blocks for as long as at least 64 bytes remain
	; on return: rdi is unchanged, rsi points past the last block processed, rdx holds
	; the number of bytes left over; rbx and r12..r15 are restored from the frame


	; note: keeping the state (8 dwords) in registers is marginally faster than doing it
	; entirely on the stack... though some quality time really needs to be spent in here
	; actually optimizing it properly, hahah, crazy that decoder speed really does matter
	; through here...
sha256_stateregisters = 1

	; stack frame, relative to rsp once it has been adjusted below:
	;   0x00..0x1f	working state dwords (only used when sha256_stateregisters is 0)
	;   0x20..0x5f	16 dword message schedule; word i of the block sits at 0x20 + (15-i)*4
	;   0x68	STATE_SAVE: pointer to the 8 state dwords
	;   0x80	byte count to add back to rsp on exit (288, or 280 if rsp was bumped by 8)
	;   0x88..0xa0	saved rbx, rdi, rsi, rdx
	;   0xa8..0xc0	saved r12..r15 (only when sha256_stateregisters is nonzero)

falign
sha256$transform:
	prolog	sha256$transform
	; rdi == our state, rsi == (dd) data
	; we must preserve rdi, rsi, and rdx (updating rsi/rdx as we go), but we are free to kill everything else
	sub	rsp, 288
	mov	eax, 288
	mov	ecx, 280
	lea	r8, [rsp+8]
	; if rsp is not 16 byte aligned, move it up by 8 and remember the smaller release amount
	test	rsp, 0xf
	cmovnz	rsp, r8
	cmovnz	eax, ecx

	mov	[rsp+0x88], rbx
	mov	[rsp+0x90], rdi
	mov	[rsp+0x98], rsi
	mov	[rsp+0xa0], rdx

	mov	rdi, [rdi+sha_stateptr_ofs]

	mov	qword [rsp+0x80], rax		; amount to add to the stack when we are done
	; so now, we have an aligned 16 stack with the ability to correctly replace it when we are done
if sha256_stateregisters
	; save four more of our callee-saves
	mov	[rsp+0xa8], r12
	mov	[rsp+0xb0], r13
	mov	[rsp+0xb8], r14
	mov	[rsp+0xc0], r15
end if

	; so now we have an aligned-16 working block at rsp on our stack
	; rsi still pointed at our data
	
	mov	qword [rsp+0x68], rdi	; STATE_SAVE

if sha256_stateregisters
	; we want to use r8d..r15d for our state instead of the aligned first 32 bytes of our stack
	mov	r8d, [rdi]
	mov	r9d, [rdi+4]
	mov	r10d, [rdi+8]
	mov	r11d, [rdi+12]
	mov	r12d, [rdi+16]
	mov	r13d, [rdi+20]
	mov	r14d, [rdi+24]
	mov	r15d, [rdi+28]
else
	movdqa	xmm0, [rdi]
	movdqa	xmm1, [rdi+16]
	movdqa	[rsp], xmm0
	movdqa	[rsp+16], xmm1
end if

calign
.nextblock:	; here is where we jump to from the bottom if there was more to do

	; setup rest of message from our data
	; the 64 byte block is read as 8 qwords, byte swapped, and stored back to front
	; (first qword at 0x58, last at 0x20), which yields the schedule layout noted above
if use_movbe
	mov	rax, [rsi]
	mov	rbx, [rsi+0x8]
	mov	rcx, [rsi+0x10]
	movbe	[rsp+0x58], rax
	movbe	[rsp+0x50], rbx
	movbe	[rsp+0x48], rcx
	mov	rdx, [rsi+0x18]
	mov	rax, [rsi+0x20]
	mov	rbx, [rsi+0x28]
	movbe	[rsp+0x40], rdx
	movbe	[rsp+0x38], rax
	movbe	[rsp+0x30], rbx
	mov	rcx, [rsi+0x30]
	mov	rdx, [rsi+0x38]
	movbe	[rsp+0x28], rcx
	movbe	[rsp+0x20], rdx
else
	mov	rax, [rsi]
	mov	rbx, [rsi+0x8]
	mov	rcx, [rsi+0x10]
	bswap	rax
	bswap	rbx
	bswap	rcx
	mov	[rsp+0x58], rax
	mov	[rsp+0x50], rbx
	mov	[rsp+0x48], rcx
	mov	rdx, [rsi+0x18]
	mov	rax, [rsi+0x20]
	mov	rbx, [rsi+0x28]
	bswap	rdx
	bswap	rax
	bswap	rbx
	mov	[rsp+0x40], rdx
	mov	[rsp+0x38], rax
	mov	[rsp+0x30], rbx
	mov	rcx, [rsi+0x30]
	mov	rdx, [rsi+0x38]
	bswap	rcx
	bswap	rdx
	mov	[rsp+0x28], rcx
	mov	[rsp+0x20], rdx
end if

	; seed the registers the round macros expect on entry to round 0:
	; eax == B^C, ecx == A, edi == E
if sha256_stateregisters
	mov	eax, r9d		; B
	mov	edi, r12d		; E
	mov	ecx, r8d		; A
	xor	eax, r10d		; B^C
else
	mov	eax, dword [rsp+0x4]	; B
	mov	edi, [rsp+0x10]		; E
	mov	ecx, dword [rsp]	; A
	xor	eax, dword [rsp+0x8]	; B^C
end if
; sha256_rb1: message schedule step used by sha256_round for rounds 16 and up
;   i    == round index within the current group of 16 (0..15)
;   r1   == accumulator register; receives k + state slot H + the new schedule word
;   r2   == scratch register (overwritten)
;   kofs == byte offset of the current group of 16 constants inside .k
; also uses esi and ebx as scratch
; the schedule is a 16 entry ring on the stack: Wt is the slot being replaced, and
; Wt2/Wt7/Wt15 are the slots written 2, 7 and 15 rounds earlier
macro sha256_rb1 i*, r1*, r2*, kofs* {
	local	H,Hr,Wt,Wt2,Wt7,Wt15
	H = ((1024 + 7 - i) and 7)
	Wt= (((1024 + 15 - i) and 15) * 4) + 32
	Wt2=(((1024 + 15 - (i - 2)) and 15) * 4) + 32
	Wt7=(((1024 + 15 - (i - 7)) and 15) * 4) + 32
	Wt15=(((1024 + 15 - (i - 15)) and 15) * 4) + 32
	
	; ebx accumulates (ror 17 ^ ror 19 ^ shr 10) of the Wt2 word, plus the Wt7 and Wt words
	; esi accumulates (ror 7 ^ ror 18 ^ shr 3) of the Wt15 word
	mov	esi, [rsp+Wt2]				; W reference
	mov	r2, [rsp+Wt15]				; W reference
	mov	ebx, esi
	shr	esi, 10
	ror	ebx, 17
	xor	esi, ebx
	ror	ebx, 2
	xor	ebx, esi
	mov	esi, r2
	add	ebx, [rsp+Wt7]				; W reference
	shr	esi, 3
	ror	r2, 7
	add	ebx, [rsp+Wt]				; W reference
	xor	esi, r2
	add	r1, [.k + kofs + i*4]				; k reference
	ror	r2, 11
if sha256_stateregisters
	; add	r1, [rsp+H*4]				; state reference
if H = 0
	add	r1, r8d
else if H = 1
	add	r1, r9d
else if H = 2
	add	r1, r10d
else if H = 3
	add	r1, r11d
else if H = 4
	add	r1, r12d
else if H = 5
	add	r1, r13d
else if H = 6
	add	r1, r14d
else if H = 7
	add	r1, r15d
end if
else
	add	r1, [rsp+H*4]				; state reference
end if
	xor	esi, r2
	add	esi, ebx
	; esi is the new schedule word: store it in the ring and fold it into r1
	mov	[rsp+Wt], esi				; W reference
	add	r1, esi
}

; sha256_round: one SHA-256 round
;   i    == round index within the current group of 16 (0..15)
;   r    == 0 for the first 16 rounds (schedule words come straight from the block),
;           1 afterward (sha256_rb1 produces the schedule word)
;   r1..r4 == the four 32 bit work registers, see below; callers alternate
;           eax/ecx and edi/edx from one round to the next
;   kofs == byte offset of the current group of 16 constants inside .k
; the eight state words are never moved: A..H are slot numbers computed from i, so
; the role of each slot (r8d..r15d, or the dwords at rsp) shifts by one every round
; on entry r1 == b^c, r2 == a, r3 == e; r4, esi and ebx are scratch
; on exit slot D holds the new e (also left in r4), slot H holds the new a, and
; r2 == a^b, which is what the next round expects in its r1
macro sha256_round i*, r*, r1*, r2*, r3*, r4*, kofs* {
	; r1 == eax
	; r2 == ecx
	; r3 == edi
	; r4 == edx
	local	H,G,F,E,D,C,B,A,Wt
	H = ((1024 + 7 - i) and 7)
	G = ((1024 + 7 - (i + 1)) and 7)
	F = ((1024 + 7 - (i + 2)) and 7)
	E = ((1024 + 7 - (i + 3)) and 7)
	D = ((1024 + 7 - (i + 4)) and 7)
	C = ((1024 + 7 - (i + 5)) and 7)
	B = ((1024 + 7 - (i + 6)) and 7)
	A = ((1024 + 7 - (i + 7)) and 7)
	Wt= (((1024 + 15 - i) and 15) * 4) + 32
	
	; r4 = ((f ^ g) & e) ^ g, while esi collects e ror 25 ^ e ror 6 ^ e ror 11
	mov	esi, r3
if sha256_stateregisters
	; mov	r4, [rsp+F*4]				; state reference
if F = 0
	mov	r4, r8d
else if F = 1
	mov	r4, r9d
else if F = 2
	mov	r4, r10d
else if F = 3
	mov	r4, r11d
else if F = 4
	mov	r4, r12d
else if F = 5
	mov	r4, r13d
else if F = 6
	mov	r4, r14d
else if F = 7
	mov	r4, r15d
end if
else
	mov	r4, [rsp+F*4]				; state reference
end if
if sha256_stateregisters
	; xor	r4, [rsp+G*4]				; state reference
if G = 0
	xor	r4, r8d
else if G = 1
	xor	r4, r9d
else if G = 2
	xor	r4, r10d
else if G = 3
	xor	r4, r11d
else if G = 4
	xor	r4, r12d
else if G = 5
	xor	r4, r13d
else if G = 6
	xor	r4, r14d
else if G = 7
	xor	r4, r15d
end if
else
	xor	r4, [rsp+G*4]				; state reference
end if
	ror	esi, 25
	and	r4, r3
if sha256_stateregisters
	; xor	r4, [rsp+G*4]				; state reference
if G = 0
	xor	r4, r8d
else if G = 1
	xor	r4, r9d
else if G = 2
	xor	r4, r10d
else if G = 3
	xor	r4, r11d
else if G = 4
	xor	r4, r12d
else if G = 5
	xor	r4, r13d
else if G = 6
	xor	r4, r14d
else if G = 7
	xor	r4, r15d
end if
else
	xor	r4, [rsp+G*4]				; state reference
end if

	ror	r3, 6
	; for the first 16 rounds the constant, schedule word and h are added right here;
	; for later rounds sha256_rb1 (below) adds them
if r = 0
	add	r4, [.k + kofs + i*4]				; k reference
end if
	xor	esi, r3
if r = 0
	add	r4, [rsp+Wt]				; W reference
end if
	ror	r3, 5
if r = 0
if sha256_stateregisters
	; add	r4, [rsp+H*4]				; state reference
if H = 0
	add	r4, r8d
else if H = 1
	add	r4, r9d
else if H = 2
	add	r4, r10d
else if H = 3
	add	r4, r11d
else if H = 4
	add	r4, r12d
else if H = 5
	add	r4, r13d
else if H = 6
	add	r4, r14d
else if H = 7
	add	r4, r15d
end if
else
	add	r4, [rsp+H*4]				; state reference
end if

end if
	xor	esi, r3
	add	r4, esi

if r = 1
	sha256_rb1 i, r4, r3, kofs
end if

	; r1 = ((b ^ c) & (a ^ b)) ^ b, while esi collects a ror 22 ^ a ror 2 ^ a ror 13
	mov	ebx, r2
	mov	esi, r2
if sha256_stateregisters
	; xor	r2, [rsp+B*4]				; state reference
if B = 0
	xor	r2, r8d
else if B = 1
	xor	r2, r9d
else if B = 2
	xor	r2, r10d
else if B = 3
	xor	r2, r11d
else if B = 4
	xor	r2, r12d
else if B = 5
	xor	r2, r13d
else if B = 6
	xor	r2, r14d
else if B = 7
	xor	r2, r15d
end if
else
	xor	r2, [rsp+B*4]				; state reference
end if
	and	r1, r2
	ror	ebx, 2
if sha256_stateregisters
	; xor	r1, [rsp+B*4]				; state reference
if B = 0
	xor	r1, r8d
else if B = 1
	xor	r1, r9d
else if B = 2
	xor	r1, r10d
else if B = 3
	xor	r1, r11d
else if B = 4
	xor	r1, r12d
else if B = 5
	xor	r1, r13d
else if B = 6
	xor	r1, r14d
else if B = 7
	xor	r1, r15d
end if
else
	xor	r1, [rsp+B*4]				; state reference
end if
	add	r1, r4
	ror	esi, 22
if sha256_stateregisters
	; add	r4, [rsp+D*4]				; state reference
if D = 0
	add	r4, r8d
else if D = 1
	add	r4, r9d
else if D = 2
	add	r4, r10d
else if D = 3
	add	r4, r11d
else if D = 4
	add	r4, r12d
else if D = 5
	add	r4, r13d
else if D = 6
	add	r4, r14d
else if D = 7
	add	r4, r15d
end if
else
	add	r4, [rsp+D*4]				; state reference
end if
	xor	esi, ebx
if sha256_stateregisters
	; mov	[rsp+D*4], r4				; state reference (write)
if D = 0
	mov	r8d, r4
else if D = 1
	mov	r9d, r4
else if D = 2
	mov	r10d, r4
else if D = 3
	mov	r11d, r4
else if D = 4
	mov	r12d, r4
else if D = 5
	mov	r13d, r4
else if D = 6
	mov	r14d, r4
else if D = 7
	mov	r15d, r4
end if
else
	mov	[rsp+D*4], r4				; state reference (write)
end if
	ror	ebx, 11
	xor	esi, ebx
	add	r1, esi
if sha256_stateregisters
	; mov	[rsp+H*4], r1				; state reference (write)
if H = 0
	mov	r8d, r1
else if H = 1
	mov	r9d, r1
else if H = 2
	mov	r10d, r1
else if H = 3
	mov	r11d, r1
else if H = 4
	mov 	r12d, r1
else if H = 5
	mov	r13d, r1
else if H = 6
	mov	r14d, r1
else if H = 7
	mov	r15d, r1
end if
else
	mov	[rsp+H*4], r1				; state reference (write)
end if
}
	; the 64 rounds, fully unrolled: four groups of 16, each group using the next
	; 0x40 bytes of .k; only the first group takes its schedule words unmodified
        sha256_round 0, 0, eax, ecx, edi, edx, 0x0
        sha256_round 1, 0, ecx, eax, edx, edi, 0x0
        sha256_round 2, 0, eax, ecx, edi, edx, 0x0
        sha256_round 3, 0, ecx, eax, edx, edi, 0x0
        sha256_round 4, 0, eax, ecx, edi, edx, 0x0
        sha256_round 5, 0, ecx, eax, edx, edi, 0x0
        sha256_round 6, 0, eax, ecx, edi, edx, 0x0
        sha256_round 7, 0, ecx, eax, edx, edi, 0x0
        sha256_round 8, 0, eax, ecx, edi, edx, 0x0
        sha256_round 9, 0, ecx, eax, edx, edi, 0x0
        sha256_round 10, 0, eax, ecx, edi, edx, 0x0
        sha256_round 11, 0, ecx, eax, edx, edi, 0x0
        sha256_round 12, 0, eax, ecx, edi, edx, 0x0
        sha256_round 13, 0, ecx, eax, edx, edi, 0x0
        sha256_round 14, 0, eax, ecx, edi, edx, 0x0
        sha256_round 15, 0, ecx, eax, edx, edi, 0x0

        sha256_round 0, 1, eax, ecx, edi, edx, 0x40
        sha256_round 1, 1, ecx, eax, edx, edi, 0x40
        sha256_round 2, 1, eax, ecx, edi, edx, 0x40
        sha256_round 3, 1, ecx, eax, edx, edi, 0x40
        sha256_round 4, 1, eax, ecx, edi, edx, 0x40
        sha256_round 5, 1, ecx, eax, edx, edi, 0x40
        sha256_round 6, 1, eax, ecx, edi, edx, 0x40
        sha256_round 7, 1, ecx, eax, edx, edi, 0x40
        sha256_round 8, 1, eax, ecx, edi, edx, 0x40
        sha256_round 9, 1, ecx, eax, edx, edi, 0x40
        sha256_round 10, 1, eax, ecx, edi, edx, 0x40
        sha256_round 11, 1, ecx, eax, edx, edi, 0x40
        sha256_round 12, 1, eax, ecx, edi, edx, 0x40
        sha256_round 13, 1, ecx, eax, edx, edi, 0x40
        sha256_round 14, 1, eax, ecx, edi, edx, 0x40
        sha256_round 15, 1, ecx, eax, edx, edi, 0x40

        sha256_round 0, 1, eax, ecx, edi, edx, 0x80
        sha256_round 1, 1, ecx, eax, edx, edi, 0x80
        sha256_round 2, 1, eax, ecx, edi, edx, 0x80
        sha256_round 3, 1, ecx, eax, edx, edi, 0x80
        sha256_round 4, 1, eax, ecx, edi, edx, 0x80
        sha256_round 5, 1, ecx, eax, edx, edi, 0x80
        sha256_round 6, 1, eax, ecx, edi, edx, 0x80
        sha256_round 7, 1, ecx, eax, edx, edi, 0x80
        sha256_round 8, 1, eax, ecx, edi, edx, 0x80
        sha256_round 9, 1, ecx, eax, edx, edi, 0x80
        sha256_round 10, 1, eax, ecx, edi, edx, 0x80
        sha256_round 11, 1, ecx, eax, edx, edi, 0x80
        sha256_round 12, 1, eax, ecx, edi, edx, 0x80
        sha256_round 13, 1, ecx, eax, edx, edi, 0x80
        sha256_round 14, 1, eax, ecx, edi, edx, 0x80
        sha256_round 15, 1, ecx, eax, edx, edi, 0x80

        sha256_round 0, 1, eax, ecx, edi, edx, 0xc0
        sha256_round 1, 1, ecx, eax, edx, edi, 0xc0
        sha256_round 2, 1, eax, ecx, edi, edx, 0xc0
        sha256_round 3, 1, ecx, eax, edx, edi, 0xc0
        sha256_round 4, 1, eax, ecx, edi, edx, 0xc0
        sha256_round 5, 1, ecx, eax, edx, edi, 0xc0
        sha256_round 6, 1, eax, ecx, edi, edx, 0xc0
        sha256_round 7, 1, ecx, eax, edx, edi, 0xc0
        sha256_round 8, 1, eax, ecx, edi, edx, 0xc0
        sha256_round 9, 1, ecx, eax, edx, edi, 0xc0
        sha256_round 10, 1, eax, ecx, edi, edx, 0xc0
        sha256_round 11, 1, ecx, eax, edx, edi, 0xc0
        sha256_round 12, 1, eax, ecx, edi, edx, 0xc0
        sha256_round 13, 1, ecx, eax, edx, edi, 0xc0
        sha256_round 14, 1, eax, ecx, edi, edx, 0xc0
        sha256_round 15, 1, ecx, eax, edx, edi, 0xc0


	; block done: reload the caller's data pointer/length from the frame and step
	; them past the 64 bytes just processed
	mov	rdx, [rsp+0xa0]
	mov	rcx, [rsp+0x68]		; STATE_SAVE
	mov	rsi, [rsp+0x98]
	sub	rdx, 64
	add	rsi, 64
	cmp	rdx, 64
	jae	.moretogo
	; else, remaining bytes is < a full block, so bailout

	; store our updated state and restore our goods
	; (the working values are added into the state dwords in memory; rsi/rdx keep
	; their advanced values for the caller)
if sha256_stateregisters
	add	dword [rcx], r8d
	add	dword [rcx+4], r9d
	add	dword [rcx+8], r10d

	mov	rax, [rsp+0x80]			; amount to add to the stack
	mov	rbx, [rsp+0x88]
	mov	rdi, [rsp+0x90]

	add	dword [rcx+12], r11d
	add	dword [rcx+16], r12d
	add	dword [rcx+20], r13d
	add	dword [rcx+24], r14d
	add	dword [rcx+28], r15d

	mov	r12, [rsp+0xa8]
	mov	r13, [rsp+0xb0]
	mov	r14, [rsp+0xb8]
	mov	r15, [rsp+0xc0]
else
	movdqa	xmm0, [rcx]
	movdqa	xmm1, [rcx+16]

	mov	rax, [rsp+0x80]			; amount to add to the stack
	mov	rbx, [rsp+0x88]
	mov	rdi, [rsp+0x90]

	paddd	xmm0, [rsp]
	paddd	xmm1, [rsp+16]
	movdqa	[rcx], xmm0
	movdqa	[rcx+16], xmm1
end if
	add	rsp, rax
	epilog
calign
.moretogo:
if sha256_stateregisters
	; at the end of each block, we still need to add the original state:
	; we need to put them back too
	add	r8d, dword [rcx]
	add	r9d, dword [rcx+4]
	add	r10d, dword [rcx+8]

	mov	dword [rcx], r8d
	mov	dword [rcx+4], r9d
	mov	dword [rcx+8], r10d

	add	r11d, dword [rcx+12]
	add	r12d, dword [rcx+16]
	add	r13d, dword [rcx+20]

	mov	dword [rcx+12], r11d
	mov	dword [rcx+16], r12d
	mov	dword [rcx+20], r13d

	add	r14d, dword [rcx+24]
	add	r15d, dword [rcx+28]

	mov	dword [rcx+24], r14d
	mov	dword [rcx+28], r15d
else
	movdqa	xmm0, [rcx]
	movdqa	xmm1, [rcx+16]
	paddd	xmm0, [rsp]
	paddd	xmm1, [rsp+16]
	movdqa	[rcx], xmm0
	movdqa	[rcx+16], xmm1
	; also need to update them in our state/stackframe:
	movdqa	[rsp], xmm0
	movdqa	[rsp+16], xmm1

end if

	; and we need to store our updated rsi/rdx for the next fallthrough
	mov	[rsp+0x98], rsi
	mov	[rsp+0xa0], rdx
	jmp	.nextblock
	; the 64 round constants, in round order; the round macros index this table
	; with kofs + i*4
align 16
.k:
	dd	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
        dd	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
        dd	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
        dd	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
        dd	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
        dd	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
        dd	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
        dd	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2


end if

if used sha224$final | defined include_everything
	; sha224$final: finish the hash and hand back the 28 byte digest
	; three arguments: rdi == sha state, rsi == pointer to 28 byte buffer for the final digest, bool in edx as to whether we should heap$free the state
	; void return (rdi and rsi are restored to their entry values)
	;
	; the real work is done by sha256$final into a 32 byte scratch buffer, of which
	; the first 28 bytes are the SHA-224 digest. sha256$final re-arms the state with
	; sha256$init before returning, which would leave a state we were asked to keep
	; primed with the SHA-256 initial values; so when the state was not freed we
	; re-run sha224$init on it to make it usable for another SHA-224 hash
falign
sha224$final:
	prolog	sha224$final
	push	rdi rsi rdx
	sub	rsp, 40
	; frame: [rsp] == 32 byte scratch digest (+8 bytes of padding),
	; [rsp+40] == free flag, [rsp+48] == caller's digest buffer, [rsp+56] == state
	mov	rsi, rsp
	call	sha256$final
	mov	rdi, [rsp+48]	; caller's 28 byte buffer
	mov	rsi, rsp
	mov	edx, 28
	call	memcpy
	cmp	dword [rsp+40], 0
	jne	.done		; the state was freed by sha256$final, nothing to re-arm
	mov	rdi, [rsp+56]
	call	sha224$init
calign
.done:
	add	rsp, 48		; scratch digest, padding and the saved free flag
	pop	rsi rdi
	epilog
end if


if used sha224$mgf1 | defined include_everything
	; one-pass MGF1 as defined by rfc2437 (one-pass == we do stack-based state)
	; four arguments: rdi == seed, rsi == seed length, rdx == destination, rcx == dest length
	;
	; each pass hashes seed || 32 bit big endian counter and copies up to 28 bytes
	; of the digest to the destination, until dest length bytes have been produced
	;
	; stack frame:
	;   [rsp]                       sha224_state_size bytes of hash state
	;   [rsp+sha224_state_size]     28 byte digest of the current pass
	;   [rsp+sha224_state_size+28]  counter (native dword)
	;   [rsp+sha224_state_size+32]  counter in big endian form, as fed to the hash
	;   [rsp+sha224_state_size+36]  4 bytes of padding so that rsp stays 8 byte aligned
falign
sha224$mgf1:
	prolog	sha224$mgf1
	push	r12 r13 r14 r15
	mov	r12, rdi
	mov	r13, rsi
	mov	r14, rdx
	mov	r15, rcx
	sub	rsp, sha224_state_size + 28 + 12
	mov	qword [rsp+sha224_state_size+28], 0
calign
.doit:
	; start every pass from a fresh SHA-224 state: sha224$final hands the state to
	; sha256$final, which re-arms it with sha256$init, so the state left behind by
	; the previous pass must not be trusted to carry the SHA-224 initial values
	mov	rdi, rsp
	call	sha224$init
	mov	rdi, rsp
	mov	rsi, r12
	mov	rdx, r13
	call	sha224$update
	; append the counter in big endian form, and bump it for the next pass
	mov	eax, [rsp+sha224_state_size+28]
if use_movbe
	add	dword [rsp+sha224_state_size+28], 1
	movbe	[rsp+sha224_state_size+32], eax
else
	bswap	eax
	add	dword [rsp+sha224_state_size+28], 1
	mov	[rsp+sha224_state_size+32], eax
end if
	mov	rdi, rsp
	lea	rsi, [rsp+sha224_state_size+32]
	mov	edx, 4
	call	sha224$update
	mov	rdi, rsp
	lea	rsi, [rsp+sha224_state_size]
	xor	edx, edx			; the state lives on our stack: do not free it
	call	sha224$final
	; copy min(28, bytes still wanted) of the digest to the destination
	mov	rdi, r14
	lea	rsi, [rsp+sha224_state_size]
	mov	edx, 28
	cmp	rdx, r15
	cmova	rdx, r15
	add	r14, rdx
	sub	r15, rdx
	call	memcpy
	test	r15, r15
	jnz	.doit
	add	rsp, sha224_state_size + 28 + 12
	pop	r15 r14 r13 r12
	epilog

end if


if used sha256$final | defined include_everything
	; sha256$final: pad the message, run the last block(s) and emit the digest
	; three arguments: rdi == sha state, rsi == pointer to 32 byte buffer for the final digest, bool in edx as to whether we should heap$free the state
	; void return
	;
	; the state is re-armed with sha256$init before returning (and before it is
	; freed, when edx is nonzero)
falign
sha256$final:
	prolog	sha256$final
	push	rdx rsi rdi
	; frame: [rsp] == state, [rsp+8] == digest buffer, [rsp+16] == free flag

	; swap the bit count to big endian in place (that is the form the final block
	; wants) and work out how many bytes are sitting in the data buffer
	mov	r8, [rdi+sha_bitcountptr_ofs]
	mov	rcx, [r8]
if use_movbe
	movbe	[r8], rcx
	shr	ecx, 3
	and	ecx, 0x3f
else
	mov	r9, rcx
	shr	rcx, 3
	bswap	r9
	and	rcx, 0x3f
	mov	[r8], r9
end if
	test	ecx, ecx
	jz	.noused

	; buffered data present: terminate it with the 0x80 marker byte
	mov	r10, [rdi+sha_bufferptr_ofs]
	mov	byte [r10+rcx], 0x80
	add	rcx, 1
	cmp	rcx, 56
	jle	.zeroremaining			; the 8 byte length still fits into this block
	cmp	rcx, 64
	jae	.dosecondtolast			; block is exactly full, nothing to zero
	; no room for the length: zero the rest of this block
	lea	rdi, [r10+rcx]
	mov	edx, 64
	xor	esi, esi
	sub	edx, ecx
	call	memset
	mov	rdi, [rsp]
calign
.dosecondtolast:
	; hash the block that holds the end of the data ...
	mov	rsi, [rdi+sha_bufferptr_ofs]
	mov	edx, 64
	call	sha256$transform
	; ... and prepare an all zero block for the length
	mov	rdi, [rsp]
	mov	rdi, [rdi+sha_bufferptr_ofs]
	xor	esi, esi
	mov	edx, 56
	call	memset32
	mov	rdi, [rsp]
	jmp	.dofinal
calign
.zeroremaining:
	; zero from behind the marker up to where the length goes
	lea	rdi, [r10+rcx]
	mov	edx, 56
	xor	esi, esi
	sub	edx, ecx
	call	memset
	mov	rdi, [rsp]
	jmp	.dofinal
calign
.noused:
	; buffer is empty: the final block is the marker, zeroes, and the length
	mov	rdi, [rdi+sha_bufferptr_ofs]
	xor	esi, esi
	mov	edx, 56
	call	memset32
	mov	rdi, [rsp]
	mov	r9, [rdi+sha_bufferptr_ofs]
	mov	dword [r9], 0x80
calign
.dofinal:
	; drop the big endian bit count into the last 8 bytes and hash the final block
	mov	r9, [rdi+sha_bufferptr_ofs]
	mov	r8, [rdi+sha_bitcountptr_ofs]
	mov	rcx, [r8]
	mov	qword [r9+56], rcx
	mov	rsi, r9
	mov	edx, 64
	call	sha256$transform
	; rdi is still the state after that call; write the 8 state dwords to the
	; caller's buffer in big endian byte order
	mov	rsi, [rsp+8]
	mov	rdx, [rdi+sha_stateptr_ofs]
if use_movbe
repeat 8
	mov	eax, dword [rdx+(%-1)*4]
	movbe	dword [rsi+(%-1)*4], eax
end repeat
else
repeat 8
	mov	eax, dword [rdx+(%-1)*4]
	bswap	eax
	mov	dword [rsi+(%-1)*4], eax
end repeat
end if
	; last but not least, reinitialize our state for further use
	call	sha256$init
	cmp	dword [rsp+16], 0
	je	.done
	mov	rdi, [rsp]
	call	heap$free
calign
.done:
	add	rsp, 24
	epilog

end if

if used sha256$mgf1 | defined include_everything
	; one-pass MGF1 as defined by rfc2437 (one-pass == we do stack-based state)
	; four arguments: rdi == seed, rsi == seed length, rdx == destination, rcx == dest length
	;
	; each pass hashes seed || 32 bit big endian counter and copies up to 32 bytes
	; of the digest to the destination, until dest length bytes have been produced
	;
	; stack frame:
	;   [rsp]                       sha256_state_size bytes of hash state
	;   [rsp+sha256_state_size]     32 byte digest of the current pass
	;   [rsp+sha256_state_size+32]  counter (native dword)
	;   [rsp+sha256_state_size+36]  counter in big endian form, as fed to the hash
falign
sha256$mgf1:
	prolog	sha256$mgf1
	push	r12 r13 r14 r15
	sub	rsp, sha256_state_size + 32 + 8
	mov	r15, rcx			; bytes still to produce
	mov	r14, rdx			; where the next bytes go
	mov	r13, rsi			; seed length
	mov	r12, rdi			; seed
	mov	rdi, rsp
	call	sha256$init
	mov	qword [rsp+sha256_state_size+32], 0
calign
.doit:
	mov	rdx, r13
	mov	rsi, r12
	mov	rdi, rsp
	call	sha256$update
	; append the counter in big endian form, and bump it for the next pass
	mov	eax, [rsp+sha256_state_size+32]
	add	dword [rsp+sha256_state_size+32], 1
if use_movbe
	movbe	[rsp+sha256_state_size+36], eax
else
	bswap	eax
	mov	[rsp+sha256_state_size+36], eax
end if
	lea	rsi, [rsp+sha256_state_size+36]
	mov	edx, 4
	mov	rdi, rsp
	call	sha256$update
	; sha256$final also re-arms the state for the next pass
	lea	rsi, [rsp+sha256_state_size]
	xor	edx, edx			; the state lives on our stack: do not free it
	mov	rdi, rsp
	call	sha256$final
	; copy min(32, bytes still wanted) of the digest to the destination
	mov	eax, 32
	mov	rdx, r15
	cmp	r15, rax
	cmova	rdx, rax
	lea	rsi, [rsp+sha256_state_size]
	mov	rdi, r14
	sub	r15, rdx
	add	r14, rdx
	call	memcpy
	test	r15, r15
	jnz	.doit
	add	rsp, sha256_state_size + 32 + 8
	pop	r15 r14 r13 r12
	epilog

end if



if used sha384$new | defined include_everything
	; sha384$new: allocate and initialise a fresh SHA-384 state
	; takes no arguments; asks heap$alloc for sha384_state_size bytes and runs
	; sha384$init over the returned block
	; returns: rax == the initialised state
falign
sha384$new:
	prolog	sha384$new
	mov	edi, sha384_state_size
	call	heap$alloc
	mov	rdi, rax
	push	rax			; keep the state pointer across sha384$init
	call	sha384$init
	pop	rax
	epilog

end if


if used sha384$init | defined include_everything
	; sha384$init: lay out and reset a SHA-384 state
	; single argument in rdi: the sha384_state_size byte state block
	; void return
	;
	; the block starts with three pointers (state words, bit count, data buffer)
	; which are aimed at the storage that follows them: that storage starts at
	; rdi+32 when rdi is 16 byte aligned, and at rdi+24 when it is not
falign
sha384$init:
	prolog	sha384$init
	; begin with the rdi+24 layout, then switch to the rdi+32 one if rdi is 16 byte aligned
	lea	rax, [rdi+24]		; state words (64 bytes)
	lea	rcx, [rdi+88]		; bit count (16 bytes)
	lea	rdx, [rdi+104]		; data buffer (128 bytes)
	lea	r8, [rdi+32]
	lea	r9, [rdi+96]
	lea	r10, [rdi+112]
	test	edi, 0xf
	cmovz	rax, r8
	cmovz	rcx, r9
	cmovz	rdx, r10
	mov	[rdi+sha_stateptr_ofs], rax
	mov	[rdi+sha_bitcountptr_ofs], rcx
	mov	[rdi+sha_bufferptr_ofs], rdx
	; memset32 with a zero fill over everything behind the three pointers ...
	push	rax
	lea	rdi, [rdi+24]
	xor	esi, esi
	mov	edx, sha384_state_size - 24
	call	memset32
	; ... then copy the SHA-384 initial hash values into the state words
	pop	rdi
	mov	edx, 64
	mov	rsi, .initial_hash
	call	memcpy
	epilog
dalign
.initial_hash:
	dq	0xcbbb9d5dc1059ed8, 0x629a292a367cd507, 0x9159015a3070dd17, 0x152fecd8f70e5939, 0x67332667ffc00b31, 0x8eb44a8768581511, 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4

end if


if used sha384$final | defined include_everything
	; sha384$final: finish the hash and hand back the 48 byte digest
	; three arguments: rdi == sha state, rsi == pointer to 48 byte buffer for the final digest, bool in edx as to whether we should heap$free the state
	; void return (rdi and rsi are restored to their entry values)
	;
	; the real work is done by sha512$final into a 64 byte scratch buffer, of which
	; the first 48 bytes are the SHA-384 digest.
	; NOTE(review): sha512$final presumably re-arms the state with sha512$init the
	; same way sha256$final does with sha256$init -- confirm. Either way, a state we
	; were asked to keep is re-armed here with sha384$init so that it is usable for
	; another SHA-384 hash
falign
sha384$final:
	prolog	sha384$final
	push	rdi rsi rdx
	sub	rsp, 72
	; frame: [rsp] == 64 byte scratch digest (+8 bytes of padding),
	; [rsp+72] == free flag, [rsp+80] == caller's digest buffer, [rsp+88] == state
	mov	rsi, rsp
	call	sha512$final
	mov	rdi, [rsp+80]	; caller's 48 byte buffer
	mov	rsi, rsp
	mov	edx, 48
	call	memcpy
	cmp	dword [rsp+72], 0
	jne	.done		; we asked for the state to be freed, nothing to re-arm
	mov	rdi, [rsp+88]
	call	sha384$init
calign
.done:
	add	rsp, 80		; scratch digest, padding and the saved free flag
	pop	rsi rdi
	epilog

end if


if used sha384$mgf1 | defined include_everything
	; one-pass MGF1 as defined by rfc2437 (one-pass == we do stack-based state)
	; four arguments: rdi == seed, rsi == seed length, rdx == destination, rcx == dest length
	;
	; each pass hashes seed || 32 bit big endian counter and copies up to 48 bytes
	; of the digest to the destination, until dest length bytes have been produced
	;
	; stack frame:
	;   [rsp]                       sha384_state_size bytes of hash state
	;   [rsp+sha384_state_size]     48 byte digest of the current pass
	;   [rsp+sha384_state_size+48]  counter (native dword)
	;   [rsp+sha384_state_size+52]  counter in big endian form, as fed to the hash
falign
sha384$mgf1:
	prolog	sha384$mgf1
	push	r12 r13 r14 r15
	mov	r12, rdi
	mov	r13, rsi
	mov	r14, rdx
	mov	r15, rcx
	sub	rsp, sha384_state_size + 48 + 8
	mov	qword [rsp+sha384_state_size+48], 0
calign
.doit:
	; start every pass from a fresh SHA-384 state rather than from whatever the
	; previous pass left behind: sha384$final hands the state to sha512$final
	; NOTE(review): which presumably re-arms it with the SHA-512 initial values,
	; as sha256$final does for SHA-256 -- confirm
	mov	rdi, rsp
	call	sha384$init
	mov	rdi, rsp
	mov	rsi, r12
	mov	rdx, r13
	call	sha384$update
	; append the counter in big endian form, and bump it for the next pass
	mov	eax, [rsp+sha384_state_size+48]
if use_movbe
	add	dword [rsp+sha384_state_size+48], 1
	movbe	[rsp+sha384_state_size+52], eax
else
	bswap	eax
	add	dword [rsp+sha384_state_size+48], 1
	mov	[rsp+sha384_state_size+52], eax
end if
	mov	rdi, rsp
	lea	rsi, [rsp+sha384_state_size+52]
	mov	edx, 4
	call	sha384$update
	mov	rdi, rsp
	lea	rsi, [rsp+sha384_state_size]
	xor	edx, edx			; the state lives on our stack: do not free it
	call	sha384$final
	; copy min(48, bytes still wanted) of the digest to the destination
	mov	rdi, r14
	lea	rsi, [rsp+sha384_state_size]
	mov	edx, 48
	cmp	rdx, r15
	cmova	rdx, r15
	add	r14, rdx
	sub	r15, rdx
	call	memcpy
	test	r15, r15
	jnz	.doit
	add	rsp, sha384_state_size + 48 + 8
	pop	r15 r14 r13 r12
	epilog

end if



if used sha512$new | defined include_everything
	; sha512$new: allocate and initialise a fresh SHA-512 state
	; takes no arguments; asks heap$alloc for sha512_state_size bytes and runs
	; sha512$init over the returned block
	; returns: rax == the initialised state
falign
sha512$new:
	prolog	sha512$new
	mov	edi, sha512_state_size
	call	heap$alloc
	mov	rdi, rax
	push	rax			; keep the state pointer across sha512$init
	call	sha512$init
	pop	rax
	epilog

end if


if used sha512$init | defined include_everything
	; sha512$init: lay out and reset a SHA-512 state
	; single argument in rdi: the sha512_state_size byte state block
	; void return
	;
	; the block starts with three pointers (state words, bit count, data buffer)
	; which are aimed at the storage that follows them: that storage starts at
	; rdi+32 when rdi is 16 byte aligned, and at rdi+24 when it is not
falign
sha512$init:
	prolog	sha512$init
	; begin with the rdi+24 layout, then switch to the rdi+32 one if rdi is 16 byte aligned
	lea	rax, [rdi+24]		; state words (64 bytes)
	lea	rcx, [rdi+88]		; bit count (16 bytes)
	lea	rdx, [rdi+104]		; data buffer (128 bytes)
	lea	r8, [rdi+32]
	lea	r9, [rdi+96]
	lea	r10, [rdi+112]
	test	edi, 0xf
	cmovz	rax, r8
	cmovz	rcx, r9
	cmovz	rdx, r10
	mov	[rdi+sha_stateptr_ofs], rax
	mov	[rdi+sha_bitcountptr_ofs], rcx
	mov	[rdi+sha_bufferptr_ofs], rdx
	; memset32 with a zero fill over everything behind the three pointers ...
	push	rax
	lea	rdi, [rdi+24]
	xor	esi, esi
	mov	edx, sha512_state_size - 24
	call	memset32
	; ... then copy the SHA-512 initial hash values into the state words
	pop	rdi
	mov	edx, 64
	mov	rsi, .initial_hash
	call	memcpy
	epilog
dalign
.initial_hash:
	dq	0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179


end if


if used sha512$update | used sha384$update | defined include_everything
	; sha512$update / sha384$update: feed message bytes into the hash
	; three arguments: rdi == sha state, rsi == byte buffer, rdx == length of same
	; void return
	;
	; bytes that do not complete a 128 byte block are parked in the state's data
	; buffer; the low bits of the running bit count tell us how many are parked
falign
sha512$update:
sha384$update:
	prolog	sha512$update
	test	rdx, rdx
	jz	.nothingtodo
	mov	r8, [rdi+sha_bitcountptr_ofs]
	mov	rcx, [r8]
	shr	rcx, 3
	and	ecx, 0x7f			; rcx == bytes already parked in the buffer
	jz	.noused
	mov	r9d, 128
	sub	r9d, ecx			; r9 == room left in the buffer
	cmp	rdx, r9
	jb	.needmore
	; the input completes the parked block: split it into the part that tops up
	; the buffer (r9 bytes) and the remainder, which is saved for afterward
	lea	r10, [rsi+r9]			; first input byte after the top-up
	mov	r11, rdx
	sub	r11, r9				; input bytes left after the top-up
	push	rdi r10 r11
	mov	rdx, r9
	shl	r9, 3
	add	qword [r8], r9			; account for the top-up bits
	mov	rdi, [rdi+sha_bufferptr_ofs]
	add	rdi, rcx
	call	memcpy
	; hash the now complete 128 byte buffer
	mov	rdi, [rsp+16]
	mov	edx, 128
	mov	rsi, [rdi+sha_bufferptr_ofs]
	call	sha512$transform
	pop	rdx rsi rdi
	mov	r8, [rdi+sha_bitcountptr_ofs]
	jmp	.noused
calign
.needmore:
	; the input fits into the buffer without completing it: account for it and append
	lea	r10, [rdx*8]
	add	qword [r8], r10
	mov	rdi, [rdi+sha_bufferptr_ofs]
	add	rdi, rcx
	call	memcpy
	epilog
calign
.noused:
	; buffer is empty here: account for all of the (remaining) input up front
	; TODO: check overflow of 2^64-1 and update the SECOND bitcount (none of my goods use 2^64-1+ messages, hahah)
	lea	rcx, [rdx*8]
	add	qword [r8], rcx
	cmp	rdx, 128
	jb	.partial
	; hash the whole blocks straight from the caller's memory; sha512$transform
	; comes back with rsi/rdx describing the unprocessed tail
	call	sha512$transform
	test	rdx, rdx
	jz	.nothingtodo
calign
.partial:
	; park the tail in the buffer for next time
	mov	rdi, [rdi+sha_bufferptr_ofs]
	call	memcpy
calign
.nothingtodo:
	epilog

end if



if used sha512$transform | defined include_everything
	; sha512$transform: run the SHA-512 compression function over consecutive
	; 128 byte blocks of input
	; three arguments: rdi == sha state, rsi == input bytes, rdx == byte count
	; rdx must be >= 128 on entry: a block is always processed before the
	; remaining count is examined (the callers in view here guarantee that)
	; on return: rdi == sha state again, rsi == first unprocessed input byte,
	; rdx == number of unprocessed bytes (< 128)
	; rbx and r12..r15 are saved and restored; rax, rcx and r8..r11 come back modified
	;
	; note: not meant to be called externally, but for profiling reasons down the track
	; is made with the normal profiler/public symbol entries
	; called from sha512$update and sha512$final

	; TODO: see if a loop w/ .k reference is actually faster than the currently unrolled kofs method

falign
sha512$transform:
	prolog	sha512$transform

	; reserve 576 bytes of scratch; if rsp does not end up 16 byte aligned,
	; move it up by 8 and remember 568 as the amount to hand back instead
	; (this yields a 16 byte aligned frame provided rsp was 8 byte aligned)
	;
	; frame layout, offsets from the adjusted rsp:
	;   +64 .. +191		W, the 16 qword message schedule window
	;   +0x1d0		byte count to add back to rsp on exit
	;   +0x1d8		saved rbx
	;   +0x1e0		saved rdi (sha state)
	;   +0x1e8		input pointer of the block being processed
	;   +0x1f0		byte count still to go, including the block being processed
	;   +0x1f8 .. +0x210	saved r12, r13, r14, r15
	;   +0x218		pointer to the eight hash state qwords (STATE_SAVE)
	mov	eax, 576
	mov	ecx, 568
	sub	rsp, 576
	mov	r8, rsp
	add	r8, 8
	test	rsp, 0xf
	cmovnz	rsp, r8
	cmovnz	eax, ecx
	mov	qword [rsp+0x1d0], rax		; amount to add to the stack when we are done
	mov	[rsp+0x1d8], rbx
	mov	[rsp+0x1e0], rdi
	mov	[rsp+0x1e8], rsi
	mov	[rsp+0x1f0], rdx
	; save four more of our callee-saves
	mov	[rsp+0x1f8], r12
	mov	[rsp+0x200], r13
	mov	[rsp+0x208], r14
	mov	[rsp+0x210], r15

	mov	rdi, [rdi+sha_stateptr_ofs]
	mov	[rsp+0x218], rdi		; STATE_SAVE

	; state -> working vars
	; a..h == r8..r15 at the start of every block
	mov	r8, [rdi]
	mov	r9, [rdi+8]
	mov	r10, [rdi+16]
	mov	r11, [rdi+24]
	mov	r12, [rdi+32]
	mov	r13, [rdi+40]
	mov	r14, [rdi+48]
	mov	r15, [rdi+56]

calign
.nextblock:
	; per block entry point: rsi == the 128 input bytes, r8..r15 == current a..h

	; set W to our 16 qwords of input data
	; each qword is byte swapped on the way in (input words are big endian),
	; either via movbe stores or via explicit bswap
if use_movbe
	mov	rax, [rsi]
	mov	rbx, [rsi+8]
	mov	rcx, [rsi+16]
	movbe	[rsp+64], rax
	movbe	[rsp+72], rbx
	movbe	[rsp+80], rcx
	mov	rdx, [rsi+24]
	mov	rax, [rsi+32]
	mov	rbx, [rsi+40]
	movbe	[rsp+88], rdx
	movbe	[rsp+96], rax
	movbe	[rsp+104], rbx
	mov	rcx, [rsi+48]
	mov	rdx, [rsi+56]
	mov	rax, [rsi+64]
	movbe	[rsp+112], rcx
	movbe	[rsp+120], rdx
	movbe	[rsp+128], rax
	mov	rbx, [rsi+72]
	mov	rcx, [rsi+80]
	mov	rdx, [rsi+88]
	movbe	[rsp+136], rbx
	movbe	[rsp+144], rcx
	movbe	[rsp+152], rdx
	mov	rax, [rsi+96]
	mov	rbx, [rsi+104]
	mov	rcx, [rsi+112]
	movbe	[rsp+160], rax
	movbe	[rsp+168], rbx
	movbe	[rsp+176], rcx
	mov	rdx, [rsi+120]
	movbe	[rsp+184], rdx
else

	mov	rax, [rsi]
	mov	rbx, [rsi+8]
	mov	rcx, [rsi+16]
	bswap	rax
	bswap	rbx
	bswap	rcx
	mov	[rsp+64], rax
	mov	[rsp+72], rbx
	mov	[rsp+80], rcx

	mov	rdx, [rsi+24]
	mov	rax, [rsi+32]
	mov	rbx, [rsi+40]
	bswap	rdx
	bswap	rax
	bswap	rbx
	mov	[rsp+88], rdx
	mov	[rsp+96], rax
	mov	[rsp+104], rbx
	
	mov	rcx, [rsi+48]
	mov	rdx, [rsi+56]
	mov	rax, [rsi+64]
	bswap	rcx
	bswap	rdx
	bswap	rax
	mov	[rsp+112], rcx
	mov	[rsp+120], rdx
	mov	[rsp+128], rax

	mov	rbx, [rsi+72]
	mov	rcx, [rsi+80]
	mov	rdx, [rsi+88]
	bswap	rbx
	bswap	rcx
	bswap	rdx
	mov	[rsp+136], rbx
	mov	[rsp+144], rcx
	mov	[rsp+152], rdx

	mov	rax, [rsi+96]
	mov	rbx, [rsi+104]
	mov	rcx, [rsi+112]
	mov	rdx, [rsi+120]
	bswap	rax
	bswap	rbx
	bswap	rcx
	bswap	rdx
	mov	[rsp+160], rax
	mov	[rsp+168], rbx
	mov	[rsp+176], rcx
	mov	[rsp+184], rdx

end if

	; prime the scratch registers the first round expects:
	; rax == b xor c, rcx == copy of a, rdi == copy of e
	mov	rax, r9				; B
	mov	rdi, r12			; E
	mov	rcx, r8				; A
	xor	rax, r10			; B^C

; sha512_rb1: message schedule step that is folded into rounds 16..79
;   i    == position inside the current 16 round group (0..15); W[i] lives at [rsp+64+i*8]
;   r1   == register accumulating this round's T1 sum (only ever added to)
;   r2   == scratch register, destroyed
;   kofs == byte offset of the current 16 round group into the .k table
; replaces W[i] with s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i] (indices mod 16, so the
; old W[i] is the value from 16 rounds ago), and adds K + h + the new W[i] to r1
; rsi and rbx are clobbered
; the two sigma computations and the r1 additions are interleaved on purpose
macro sha512_rb1 i*, r1*, r2*, kofs* {
	local	H,Wt,Wt2,Wt7,Wt15
	; H == index (0 == r8 .. 7 == r15) of the register holding h for this round,
	; the W* values are frame offsets of the schedule entries we need
	H = ((1024 + 7 - i) and 7)
	Wt= (i * 8) + 64
	Wt2 = (((i - 2) and 15) * 8) + 64
	Wt7 = (((i - 7) and 15) * 8) + 64
	Wt15 = (((i - 15) and 15) * 8) + 64

	mov	rsi, [rsp+Wt2]		; Wt2 into rsi
	mov	r2, [rsp+Wt15]		; Wt15 into r2
	mov	rbx, rsi		; rsi into rbx
	shr	rsi, 6			; shr rsi, 6 (Wt2 >> 6)
	ror	rbx, 19			; ror rbx, 19 (Wt2 >>> 19)
	xor	rsi, rbx		; xor rsi, rbx (so in rsi we have: (Wt2 >> 6) xor (Wt2 >>> 19))
	ror	rbx, 42			; ror rbx, 42 (Wt2 >>> 61)
	xor	rbx, rsi		; s1 complete

	mov	rsi, r2
	add	rbx, [rsp+Wt7]		; rbx == s1 + W[i-7]

	shr	rsi, 7			; Wt15 >> 7
	ror	r2, 1			; Wt15 >>> 1
	add	rbx, [rsp+Wt]		; rbx == s1 + W[i-7] + old W[i]

	xor	rsi, r2			; (Wt15 >> 7) xor (Wt15 >>> 1)
	add	r1, [.k + kofs + i*8]	; T1 += K for this round
	ror	r2, 7			; Wt15 >>> 8
	; T1 += h, from whichever of r8..r15 holds it this round
if H = 0
	add	r1, r8
else if H = 1
	add	r1, r9
else if H = 2
	add	r1, r10
else if H = 3
	add	r1, r11
else if H = 4
	add	r1, r12
else if H = 5
	add	r1, r13
else if H = 6
	add	r1, r14
else if H = 7
	add	r1, r15
end if
	xor	rsi, r2			; s0 complete
	add	rsi, rbx		; rsi == the new W[i]
	mov	[rsp+Wt], rsi		; store it over the entry from 16 rounds ago
	add	r1, rsi			; T1 += W[i]
}

; sha512_round: one SHA-512 round
;   i    == position inside the current 16 round group (0..15)
;   r    == 0: W[i] is used as it sits in the frame (first 16 rounds)
;           1: W[i] is recomputed first by sha512_rb1 (all later rounds)
;   r1   == on entry b xor c, on exit the new a
;   r2   == on entry a copy of a, on exit a xor b (== b xor c of the following round)
;   r3   == on entry a copy of e, destroyed
;   r4   == scratch on entry, on exit the new e
;   kofs == byte offset of the current 16 round group into the .k table
; rsi and rbx are clobbered
;
; the roles a..h are not moved between registers from round to round: the local
; constants give, for round i, the index (0 == r8 .. 7 == r15) of the register
; that currently holds each role. a round only writes two of them: the register
; holding d receives the new e, and the register holding h receives the new a.
; because of the r1/r2 and r3/r4 exit values above, consecutive invocations
; simply swap those two register pairs.
; (A, C and E are computed below but not referenced in the body)
macro sha512_round i*, r*, r1*, r2*, r3*, r4*, kofs* {
	local	H,G,F,E,D,C,B,A,Wt
	H = ((1024 + 7 - i) and 7)
	G = ((1024 + 7 - (i + 1)) and 7)
	F = ((1024 + 7 - (i + 2)) and 7)
	E = ((1024 + 7 - (i + 3)) and 7)
	D = ((1024 + 7 - (i + 4)) and 7)
	C = ((1024 + 7 - (i + 5)) and 7)
	B = ((1024 + 7 - (i + 6)) and 7)
	A = ((1024 + 7 - (i + 7)) and 7)
	Wt = (i * 8) + 64
	
	mov	rsi, r3
	; Ch(e,f,g) is built as ((f xor g) and e) xor g, starting with r4 = f
if F = 0
	mov	r4, r8
else if F = 1
	mov	r4, r9
else if F = 2
	mov	r4, r10
else if F = 3
	mov	r4, r11
else if F = 4
	mov	r4, r12
else if F = 5
	mov	r4, r13
else if F = 6
	mov	r4, r14
else if F = 7
	mov	r4, r15
end if
	; r4 = f xor g
if G = 0
	xor	r4, r8
else if G = 1
	xor	r4, r9
else if G = 2
	xor	r4, r10
else if G = 3
	xor	r4, r11
else if G = 4
	xor	r4, r12
else if G = 5
	xor	r4, r13
else if G = 6
	xor	r4, r14
else if G = 7
	xor	r4, r15
end if
	ror	rsi, 41			; S1, e >>> 41
	and	r4, r3			; (f xor g) and e
	; xor g back in: r4 = Ch(e,f,g)
if G = 0
	xor	r4, r8
else if G = 1
	xor	r4, r9
else if G = 2
	xor	r4, r10
else if G = 3
	xor	r4, r11
else if G = 4
	xor	r4, r12
else if G = 5
	xor	r4, r13
else if G = 6
	xor	r4, r14
else if G = 7
	xor	r4, r15
end if
	ror	r3, 14			; S1, e >>> 14
	; for r == 0 the K, W[i] and h terms of T1 are added right here, interleaved
	; with the S1 rotations; for r == 1 sha512_rb1 adds them further down
if r = 0
	add	r4, [.k + kofs + i*8]
end if
	xor	rsi, r3
if r = 0
	add	r4, [rsp+Wt]
end if
	ror	r3, 4			; S1, e >>> 18
if r = 0
if H = 0
	add	r4, r8
else if H = 1
	add	r4, r9
else if H = 2
	add	r4, r10
else if H = 3
	add	r4, r11
else if H = 4
	add	r4, r12
else if H = 5
	add	r4, r13
else if H = 6
	add	r4, r14
else if H = 7
	add	r4, r15
end if
end if
	xor	rsi, r3			; rsi == S1(e)
	add	r4, rsi			; T1 += S1(e)
if r = 1
	; recompute W[i] and add K + h + W[i] to T1 (r3 is free to be used as scratch now)
	sha512_rb1 i, r4, r3, kofs
end if
	; r4 == T1 from here on
	mov	rbx, r2
	mov	rsi, r2
	; Maj(a,b,c) is built as ((b xor c) and (a xor b)) xor b; r2 = a xor b
if B = 0
	xor	r2, r8
else if B = 1
	xor	r2, r9
else if B = 2
	xor	r2, r10
else if B = 3
	xor	r2, r11
else if B = 4
	xor	r2, r12
else if B = 5
	xor	r2, r13
else if B = 6
	xor	r2, r14
else if B = 7
	xor	r2, r15
end if
	and	r1, r2			; (b xor c) and (a xor b)
	ror	rbx, 28			; S0, a >>> 28
	; xor b back in: r1 = Maj(a,b,c)
if B = 0
	xor	r1, r8
else if B = 1
	xor	r1, r9
else if B = 2
	xor	r1, r10
else if B = 3
	xor	r1, r11
else if B = 4
	xor	r1, r12
else if B = 5
	xor	r1, r13
else if B = 6
	xor	r1, r14
else if B = 7
	xor	r1, r15
end if
	add	r1, r4			; r1 == Maj + T1
	ror	rsi, 39			; S0, a >>> 39
	; r4 = d + T1, which is the new e
if D = 0
	add	r4, r8
else if D = 1
	add	r4, r9
else if D = 2
	add	r4, r10
else if D = 3
	add	r4, r11
else if D = 4
	add	r4, r12
else if D = 5
	add	r4, r13
else if D = 6
	add	r4, r14
else if D = 7
	add	r4, r15
end if
	xor	rsi, rbx
	; the register that held d now holds the new e
if D = 0
	mov	r8, r4
else if D = 1
	mov	r9, r4
else if D = 2
	mov	r10, r4
else if D = 3
	mov	r11, r4
else if D = 4
	mov	r12, r4
else if D = 5
	mov	r13, r4
else if D = 6
	mov	r14, r4
else if D = 7
	mov	r15, r4
end if
	; rbx is already ror'd 28 (which in the original is 2), we need 34
	ror	rbx, 6			; S0, a >>> 34
	xor	rsi, rbx		; rsi == S0(a)
	add	r1, rsi			; r1 == T1 + Maj + S0 == the new a
	; the register that held h now holds the new a
if H = 0
	mov	r8, r1
else if H = 1
	mov	r9, r1
else if H = 2
	mov	r10, r1
else if H = 3
	mov	r11, r1
else if H = 4
	mov	r12, r1
else if H = 5
	mov	r13, r1
else if H = 6
	mov	r14, r1
else if H = 7
	mov	r15, r1
end if
}
	; the 80 rounds, fully unrolled as 5 groups of 16
	; the register arguments alternate (rax/rcx and rdi/rdx) because each round
	; leaves its results in the registers the next round wants as inputs
	;
	; rounds 0..15: W[i] is the byte swapped input as loaded above (r == 0)
	sha512_round 0, 0, rax, rcx, rdi, rdx, 0x0
	sha512_round 1, 0, rcx, rax, rdx, rdi, 0x0
	sha512_round 2, 0, rax, rcx, rdi, rdx, 0x0
	sha512_round 3, 0, rcx, rax, rdx, rdi, 0x0
	sha512_round 4, 0, rax, rcx, rdi, rdx, 0x0
	sha512_round 5, 0, rcx, rax, rdx, rdi, 0x0
	sha512_round 6, 0, rax, rcx, rdi, rdx, 0x0
	sha512_round 7, 0, rcx, rax, rdx, rdi, 0x0
	sha512_round 8, 0, rax, rcx, rdi, rdx, 0x0
	sha512_round 9, 0, rcx, rax, rdx, rdi, 0x0
	sha512_round 10, 0, rax, rcx, rdi, rdx, 0x0
	sha512_round 11, 0, rcx, rax, rdx, rdi, 0x0
	sha512_round 12, 0, rax, rcx, rdi, rdx, 0x0
	sha512_round 13, 0, rcx, rax, rdx, rdi, 0x0
	sha512_round 14, 0, rax, rcx, rdi, rdx, 0x0
	sha512_round 15, 0, rcx, rax, rdx, rdi, 0x0

	; rounds 16..31: from here on every round first extends the schedule (r == 1),
	; and kofs steps through .k by 0x80 bytes (16 constants) per group
	sha512_round 0, 1, rax, rcx, rdi, rdx, 0x80
	sha512_round 1, 1, rcx, rax, rdx, rdi, 0x80
	sha512_round 2, 1, rax, rcx, rdi, rdx, 0x80
	sha512_round 3, 1, rcx, rax, rdx, rdi, 0x80
	sha512_round 4, 1, rax, rcx, rdi, rdx, 0x80
	sha512_round 5, 1, rcx, rax, rdx, rdi, 0x80
	sha512_round 6, 1, rax, rcx, rdi, rdx, 0x80
	sha512_round 7, 1, rcx, rax, rdx, rdi, 0x80
	sha512_round 8, 1, rax, rcx, rdi, rdx, 0x80
	sha512_round 9, 1, rcx, rax, rdx, rdi, 0x80
	sha512_round 10, 1, rax, rcx, rdi, rdx, 0x80
	sha512_round 11, 1, rcx, rax, rdx, rdi, 0x80
	sha512_round 12, 1, rax, rcx, rdi, rdx, 0x80
	sha512_round 13, 1, rcx, rax, rdx, rdi, 0x80
	sha512_round 14, 1, rax, rcx, rdi, rdx, 0x80
	sha512_round 15, 1, rcx, rax, rdx, rdi, 0x80

	; rounds 32..47
	sha512_round 0, 1, rax, rcx, rdi, rdx, 0x100
	sha512_round 1, 1, rcx, rax, rdx, rdi, 0x100
	sha512_round 2, 1, rax, rcx, rdi, rdx, 0x100
	sha512_round 3, 1, rcx, rax, rdx, rdi, 0x100
	sha512_round 4, 1, rax, rcx, rdi, rdx, 0x100
	sha512_round 5, 1, rcx, rax, rdx, rdi, 0x100
	sha512_round 6, 1, rax, rcx, rdi, rdx, 0x100
	sha512_round 7, 1, rcx, rax, rdx, rdi, 0x100
	sha512_round 8, 1, rax, rcx, rdi, rdx, 0x100
	sha512_round 9, 1, rcx, rax, rdx, rdi, 0x100
	sha512_round 10, 1, rax, rcx, rdi, rdx, 0x100
	sha512_round 11, 1, rcx, rax, rdx, rdi, 0x100
	sha512_round 12, 1, rax, rcx, rdi, rdx, 0x100
	sha512_round 13, 1, rcx, rax, rdx, rdi, 0x100
	sha512_round 14, 1, rax, rcx, rdi, rdx, 0x100
	sha512_round 15, 1, rcx, rax, rdx, rdi, 0x100

	; rounds 48..63
	sha512_round 0, 1, rax, rcx, rdi, rdx, 0x180
	sha512_round 1, 1, rcx, rax, rdx, rdi, 0x180
	sha512_round 2, 1, rax, rcx, rdi, rdx, 0x180
	sha512_round 3, 1, rcx, rax, rdx, rdi, 0x180
	sha512_round 4, 1, rax, rcx, rdi, rdx, 0x180
	sha512_round 5, 1, rcx, rax, rdx, rdi, 0x180
	sha512_round 6, 1, rax, rcx, rdi, rdx, 0x180
	sha512_round 7, 1, rcx, rax, rdx, rdi, 0x180
	sha512_round 8, 1, rax, rcx, rdi, rdx, 0x180
	sha512_round 9, 1, rcx, rax, rdx, rdi, 0x180
	sha512_round 10, 1, rax, rcx, rdi, rdx, 0x180
	sha512_round 11, 1, rcx, rax, rdx, rdi, 0x180
	sha512_round 12, 1, rax, rcx, rdi, rdx, 0x180
	sha512_round 13, 1, rcx, rax, rdx, rdi, 0x180
	sha512_round 14, 1, rax, rcx, rdi, rdx, 0x180
	sha512_round 15, 1, rcx, rax, rdx, rdi, 0x180

	; rounds 64..79
	sha512_round 0, 1, rax, rcx, rdi, rdx, 0x200
	sha512_round 1, 1, rcx, rax, rdx, rdi, 0x200
	sha512_round 2, 1, rax, rcx, rdi, rdx, 0x200
	sha512_round 3, 1, rcx, rax, rdx, rdi, 0x200
	sha512_round 4, 1, rax, rcx, rdi, rdx, 0x200
	sha512_round 5, 1, rcx, rax, rdx, rdi, 0x200
	sha512_round 6, 1, rax, rcx, rdi, rdx, 0x200
	sha512_round 7, 1, rcx, rax, rdx, rdi, 0x200
	sha512_round 8, 1, rax, rcx, rdi, rdx, 0x200
	sha512_round 9, 1, rcx, rax, rdx, rdi, 0x200
	sha512_round 10, 1, rax, rcx, rdi, rdx, 0x200
	sha512_round 11, 1, rcx, rax, rdx, rdi, 0x200
	sha512_round 12, 1, rax, rcx, rdi, rdx, 0x200
	sha512_round 13, 1, rcx, rax, rdx, rdi, 0x200
	sha512_round 14, 1, rax, rcx, rdi, rdx, 0x200
	sha512_round 15, 1, rcx, rax, rdx, rdi, 0x200

	; block done: a..h are back in r8..r15 in that order (80 is a multiple of 8)
	; step the input pointer/count past this block and see whether another
	; whole block remains
	mov	rcx, [rsp+0x218]		; STATE_SAVE
	mov	rsi, [rsp+0x1e8]
	mov	rdx, [rsp+0x1f0]
	add	rsi, 128
	sub	rdx, 128
	cmp	rdx, 128
	jae	.moretogo
	; else, remaining bytes < a full block, so bailout

	; state += working vars, done directly in memory since we are leaving
	add	[rcx], r8
	add	[rcx+8], r9
	add	[rcx+16], r10
	add	[rcx+24], r11
	add	[rcx+32], r12
	add	[rcx+40], r13
	add	[rcx+48], r14
	add	[rcx+56], r15

	; restore what we saved on entry; rsi/rdx are deliberately left at their
	; advanced values, they are what sha512$update uses to find the leftovers
	mov	rax, [rsp+0x1d0]		; amount to add to the stack
	mov	rbx, [rsp+0x1d8]
	mov	rdi, [rsp+0x1e0]
	mov	r12, [rsp+0x1f8]
	mov	r13, [rsp+0x200]
	mov	r14, [rsp+0x208]
	mov	r15, [rsp+0x210]

	add	rsp, rax
	epilog
calign
.moretogo:
	; another whole block follows: working vars += state, so r8..r15 already
	; hold the starting a..h for the next block
	add	r8, [rcx]
	add	r9, [rcx+8]
	add	r10, [rcx+16]
	add	r11, [rcx+24]
	add	r12, [rcx+32]
	add	r13, [rcx+40]
	add	r14, [rcx+48]
	add	r15, [rcx+56]

	; we need to put them back too
	mov	[rcx], r8
	mov	[rcx+8], r9
	mov	[rcx+16], r10
	mov	[rcx+24], r11
	mov	[rcx+32], r12
	mov	[rcx+40], r13
	mov	[rcx+48], r14
	mov	[rcx+56], r15

	; store our updated rsi/rdx for the next fallthrough
	mov	[rsp+0x1e8], rsi
	mov	[rsp+0x1f0], rdx
	jmp	.nextblock
align 16
	; the 80 SHA-512 round constants; the rounds address them as .k + kofs + i*8
.k:
	dq	0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, 0x3956c25bf348b538, 0x59f111f1b605d019
	dq	0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
	dq	0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694, 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
	dq	0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
	dq	0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4, 0xc6e00bf33da88fc2, 0xd5a79147930aa725
	dq	0x06ca6351e003826f, 0x142929670a0e6e70, 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
	dq	0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b, 0xa2bfe8a14cf10364, 0xa81a664bbc423001
	dq	0xc24b8b70d0f89791, 0xc76c51a30654be30, 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8
	dq	0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
	dq	0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec
	dq	0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b, 0xca273eceea26619c, 0xd186b8c721c0c207
	dq	0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b
	dq	0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c, 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
	dq	0x5fcb6fab3ad6faec, 0x6c44198c4a475817 
	

end if



if used sha512$final | defined include_everything
	; sha512$final: pad whatever is pending, run the closing block(s), write out the digest
	; three arguments: rdi == sha state, rsi == pointer to 64 byte buffer for the final digest, bool in edx as to whether we should heap$free the state
	; void return
	; the state is put through sha512$init again before we return (or free it)
falign
sha512$final:
	prolog	sha512$final
	; our arguments live on the stack for the duration:
	; [rsp] == sha state, [rsp+8] == digest destination, [rsp+16] == free flag
	push	rdx rsi rdi

	; turn the 128 bit bitcount big endian in place (it gets copied verbatim
	; into the closing block), and work out how many bytes are pending
	mov	r8, [rdi+sha_bitcountptr_ofs]
	mov	rcx, [r8]
	mov	rax, [r8+8]
if use_movbe
	movbe	[r8], rcx
	movbe	[r8+8], rax
else
	mov	r9, rcx
	bswap	r9
	bswap	rax
	mov	[r8], r9		; low 64 bits, reversed
	mov	[r8+8], rax		; high 64 bits, reversed
end if
	shr	ecx, 3
	and	ecx, 0x7f		; pending bytes in the block buffer; ZF set if there are none
	jz	.emptybuffer
	; pending data is followed by the 0x80 marker (a single 1 bit)
	mov	r10, [rdi+sha_bufferptr_ofs]
	mov	byte [r10+rcx], 0x80
	add	rcx, 1
	cmp	rcx, 112		; short block length == 112
	jle	.padtolength
	cmp	rcx, 128
	jae	.flushblock
	; no room left for the 16 byte length in this block: zero through to its end
	lea	rdi, [r10+rcx]
	xor	esi, esi
	mov	edx, 128
	sub	edx, ecx
	call	memset
	mov	rdi, [rsp]
calign
.flushblock:
	; compress the block that carries the marker ...
	mov	rsi, [rdi+sha_bufferptr_ofs]
	mov	edx, 128
	call	sha512$transform
	; ... and start the closing block out as 112 zero bytes
	mov	rdi, [rsp]
	mov	rdi, [rdi+sha_bufferptr_ofs]
	xor	esi, esi
	mov	edx, 112
	call	memset32
	mov	rdi, [rsp]
	jmp	.appendlength
calign
.padtolength:
	; data and marker end at or before byte 112: zero from there up to the length field
	lea	rdi, [r10+rcx]
	xor	esi, esi
	mov	edx, 112
	sub	edx, ecx
	call	memset
	mov	rdi, [rsp]
	jmp	.appendlength
calign
.emptybuffer:
	; nothing pending: the closing block is the marker, zeros, and the length
	mov	rdi, [rdi+sha_bufferptr_ofs]
	xor	esi, esi
	mov	edx, 112
	call	memset32
	mov	rdi, [rsp]
	mov	r9, [rdi+sha_bufferptr_ofs]
	mov	dword [r9], 0x80
calign
.appendlength:
	; rdi == sha state here; drop the (already big endian) bitcount into the
	; last 16 bytes of the block and compress it
	mov	r9, [rdi+sha_bufferptr_ofs]
	mov	r8, [rdi+sha_bitcountptr_ofs]
	mov	rax, [r8+8]
	mov	rcx, [r8]
	mov	qword [r9+112], rax		; high first
	mov	qword [r9+120], rcx		; low after
	mov	rsi, r9
	mov	edx, 128
	call	sha512$transform
	; transform hands rdi back as our sha state, rsi it has moved along
	mov	rsi, [rsp+8]
	mov	rdx, [rdi+sha_stateptr_ofs]

	; the digest is the eight state qwords, each written out big endian
if use_movbe
	mov	rax, [rdx]
	mov	r8, [rdx+8]
	movbe	[rsi], rax
	movbe	[rsi+8], r8
	mov	rax, [rdx+16]
	mov	r8, [rdx+24]
	movbe	[rsi+16], rax
	movbe	[rsi+24], r8
	mov	rax, [rdx+32]
	mov	r8, [rdx+40]
	movbe	[rsi+32], rax
	movbe	[rsi+40], r8
	mov	rax, [rdx+48]
	mov	r8, [rdx+56]
	movbe	[rsi+48], rax
	movbe	[rsi+56], r8
else
	mov	rax, [rdx]
	mov	r8, [rdx+8]
	bswap	rax
	bswap	r8
	mov	[rsi], rax
	mov	[rsi+8], r8
	mov	rax, [rdx+16]
	mov	r8, [rdx+24]
	bswap	rax
	bswap	r8
	mov	[rsi+16], rax
	mov	[rsi+24], r8
	mov	rax, [rdx+32]
	mov	r8, [rdx+40]
	bswap	rax
	bswap	r8
	mov	[rsi+32], rax
	mov	[rsi+40], r8
	mov	rax, [rdx+48]
	mov	r8, [rdx+56]
	bswap	rax
	bswap	r8
	mov	[rsi+48], rax
	mov	[rsi+56], r8
end if
	; reset the state so that it can be used again straight away
	call	sha512$init
	cmp	dword [rsp+16], 0
	je	.done
	; caller asked us to get rid of the state
	mov	rdi, [rsp]
	call	heap$free
calign
.done:
	add	rsp, 24
	epilog

end if


if used sha512$mgf1 | defined include_everything
	; sha512$mgf1: MGF1 mask generation as defined by rfc2437, over SHA-512
	; one-pass: the hash state lives in our own stack frame, nothing is allocated
	; four arguments: rdi == seed, rsi == seed length, rdx == destination, rcx == dest length
	;
	; stack frame, from rsp upward:
	;   sha512_state_size bytes	the sha state
	;   64 bytes			the digest of the current round
	;   4 bytes			the round counter
	;   4 bytes			big endian copy of the counter, this is what gets hashed
falign
sha512$mgf1:
	prolog	sha512$mgf1
	push	r12 r13 r14 r15
	sub	rsp, sha512_state_size + 64 + 8
	; r12 == seed, r13 == seed length, r14 == write position, r15 == bytes still wanted
	mov	r15, rcx
	mov	r14, rdx
	mov	r13, rsi
	mov	r12, rdi
	mov	rdi, rsp
	call	sha512$init
	mov	qword [rsp+sha512_state_size+64], 0	; counter (and its big endian copy) start at zero
calign
.nextblock:
	; digest == SHA-512(seed || counter as 4 big endian bytes)
	mov	rdx, r13
	mov	rsi, r12
	mov	rdi, rsp
	call	sha512$update
	; stash the current counter big endian, and bump it for the next time around
	mov	eax, [rsp+sha512_state_size+64]
	add	dword [rsp+sha512_state_size+64], 1
if use_movbe
	movbe	[rsp+sha512_state_size+68], eax
else
	bswap	eax
	mov	[rsp+sha512_state_size+68], eax
end if
	mov	edx, 4
	lea	rsi, [rsp+sha512_state_size+68]
	mov	rdi, rsp
	call	sha512$update
	; edx == 0: sha512$final leaves the (reinitialized) state in place for the next round
	xor	edx, edx
	lea	rsi, [rsp+sha512_state_size]
	mov	rdi, rsp
	call	sha512$final
	; hand over min(64, bytes still wanted) of the digest
	mov	edx, 64
	cmp	r15, rdx
	cmovb	rdx, r15
	lea	rsi, [rsp+sha512_state_size]
	mov	rdi, r14
	add	r14, rdx
	sub	r15, rdx
	call	memcpy
	test	r15, r15
	jnz	.nextblock
	add	rsp, sha512_state_size + 64 + 8
	pop	r15 r14 r13 r12
	epilog

end if