HeavyThing - md5.inc

Jeff Marrison

Table of functions

	; ------------------------------------------------------------------------
	; HeavyThing x86_64 assembly language library and showcase programs
	; Copyright © 2015-2018 2 Ton Digital 
	; Homepage: https://2ton.com.au/
	; Author: Jeff Marrison <jeff@2ton.com.au>
	;       
	; This file is part of the HeavyThing library.
	;       
	; HeavyThing is free software: you can redistribute it and/or modify
	; it under the terms of the GNU General Public License, or
	; (at your option) any later version.
	;       
	; HeavyThing is distributed in the hope that it will be useful, 
	; but WITHOUT ANY WARRANTY; without even the implied warranty of
	; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
	; GNU General Public License for more details.
	;       
	; You should have received a copy of the GNU General Public License along
	; with the HeavyThing library. If not, see <http://www.gnu.org/licenses/>.
	; ------------------------------------------------------------------------
	;       
	; md5.inc: MD5 goods required by TLS 1.0/1.1
	;
	; copy-paste from the other hashes, guts of this one are Marc Bevand's public domain method
	; 
md5_state_size = 128

if used md5$new | defined include_everything
	; md5$new: heap$alloc an md5 state of md5_state_size bytes and run md5$init on it
	; takes no arguments
	; returns the initialized state pointer in rax
falign
md5$new:
	prolog	md5$new
	mov	edi, md5_state_size
	call	heap$alloc
	mov	rdi, rax			; the fresh block is the sole argument to md5$init
	push	rax				; md5$init is documented as void, so keep the pointer on the stack
	call	md5$init
	pop	rax				; hand the state back to our caller
	epilog

end if

if used md5$init | defined include_everything
	; md5$init: (re)initialize an md5 state in place
	; one argument: rdi == md5 state (md5_state_size bytes)
	; no return value
	;
	; three pointers are written at sha_stateptr_ofs, sha_bitcountptr_ofs and
	; sha_bufferptr_ofs (presumably all within the first 24 bytes of the state, since
	; only the bytes from +24 onward are cleared below); they point further into the
	; same state:
	;	rdi 16 byte aligned:	hash words at +32, bit counter at +48, block buffer at +64
	;	otherwise:		hash words at +24, bit counter at +40, block buffer at +56
	; provided rdi is at least 8 byte aligned, all three targets end up 16 byte aligned
falign
md5$init:
	prolog	md5$init
	mov	eax, 32
	mov	ecx, 24
	test	rdi, 0xf
	cmovnz	eax, ecx			; eax == offset of the hash words for this alignment
	add	rax, rdi			; rax == hash words pointer
	lea	rcx, [rax+16]			; rcx == bit counter pointer
	lea	rdx, [rax+32]			; rdx == block buffer pointer
	mov	[rdi+sha_stateptr_ofs], rax
	mov	[rdi+sha_bitcountptr_ofs], rcx
	mov	[rdi+sha_bufferptr_ofs], rdx
	push	rax				; hash words pointer is needed again after the fill
	; fill value zero, length md5_state_size - 24, starting 24 bytes into the state
	lea	rdi, [rdi+24]
	xor	esi, esi
	mov	edx, md5_state_size - 24
	call	memset32
	pop	rdi				; rdi == hash words pointer
	; two qword moves copy all four dwords of .initial_hash
	mov	rax, qword [.initial_hash]
	mov	rcx, qword [.initial_hash+8]
	mov	[rdi], rax
	mov	[rdi+8], rcx
	epilog
dalign
.initial_hash:
	dd	0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476

end if

if used md5$update | defined include_everything
	; md5$update: feed more message bytes into an md5 state
	; three arguments: rdi == md5 state, rsi == source bytes, rdx == source length in bytes
	; no return value
	;
	; whole 64 byte blocks are handed to md5$transform; bytes that do not make up a
	; full block are copied into the state's block buffer for a later call
falign
md5$update:
	prolog	md5$update
	test	rdx, rdx
	jz	.done
	mov	r8, [rdi+sha_bitcountptr_ofs]
	mov	rcx, [r8]
	shr	rcx, 3
	and	rcx, 0x3f			; rcx == bytes already sitting in the block buffer
	jz	.direct
	mov	r9d, 64
	sub	r9d, ecx			; r9 == room left in the block buffer
	cmp	rdx, r9
	jb	.stash
	; the new data completes the buffered block: top it up, transform it, then
	; continue with the rest of the input as if the buffer had been empty
	push	rdi rsi rdx
	add	qword [rsp+8], r9		; saved rsi skips the bytes used for the top-up
	sub	qword [rsp], r9			; saved rdx shrinks by the same amount
	mov	rdi, [rdi+sha_bufferptr_ofs]
	mov	rdx, r9
	add	rdi, rcx
	shl	r9, 3
	add	qword [r8], r9			; count the top-up bytes (as bits)
	call	memcpy
	mov	rdi, [rsp+16]
	mov	edx, 64				; exactly one block for md5$transform
	mov	rsi, [rdi+sha_bufferptr_ofs]
	call	md5$transform
	pop	rdx rsi rdi
	mov	r8, [rdi+sha_bitcountptr_ofs]
	jmp	.direct
calign
.stash:
	; fewer bytes than the buffer has room for: append them and leave
	mov	rdi, [rdi+sha_bufferptr_ofs]
	lea	r10, [rdx*8]
	add	rdi, rcx
	add	qword [r8], r10
	call	memcpy
	epilog
calign
.direct:
	; nothing buffered at this point; count all of the remaining input up front
	lea	rcx, [rdx*8]
	add	qword [r8], rcx
	cmp	rdx, 64
	jb	.keeptail
	call	md5$transform
	; md5$transform comes back with rsi/rdx describing the bytes it did not consume
	test	rdx, rdx
	jz	.done
calign
.keeptail:
	mov	rdi, [rdi+sha_bufferptr_ofs]
	call	memcpy
calign
.done:
	epilog

end if


if used md5$transform | defined include_everything
	; note: not meant to be called externally, but for profiling reasons down the track
	; is made with the normal profiler/public symbol entries
	; called from md5$update and md5$final
	; NONSTANDARD returns/register preservation
	;
	; in:	rdi == md5 state, rsi == input bytes, rdx == input length in bytes
	;	the remaining length is only tested after a block has been processed,
	;	so rdx must be >= 64 on entry
	; out:	rdi == md5 state (restored), rsi == first input byte not consumed,
	;	rdx == number of bytes not consumed (< 64), rcx == pointer to the a,b,c,d words,
	;	rax == the stack adjustment that was undone on exit
	;	rbx, r12, r14 and r15 are saved and restored; r8-r11 are left modified
falign
md5$transform:
	prolog	md5$transform
	; rdi == our state, rsi == (dd) data
	; TODO: md5 uses much less stack than the other hashing algos, come back and clean this one up from the copy-paste-modify mess i made, haha
	; we preserve rdi, rsi and rdx (updating rsi/rdx as we go)
	; reserve 288 bytes; if that leaves rsp off a 16 byte boundary, give 8 back
	; and remember 280 as the amount to release instead
	; (this yields 16 byte alignment provided rsp was 8 byte aligned to begin with)
	mov	eax, 288
	mov	ecx, 280
	sub	rsp, 288
	mov	r8, rsp
	add	r8, 8
	test	rsp, 0xf
	cmovnz	rsp, r8
	cmovnz	eax, ecx
	mov	qword [rsp+0x80], rax		; amount to add to the stack when we are done
	; so now, we have an aligned 16 stack with the ability to correctly replace it when we are done
	; frame slots used below (offsets from rsp):
	;	0x68 pointer to a,b,c,d		0x80 bytes to release on exit
	;	0x88 rbx	0x90 rdi	0x98 rsi	0xa0 rdx (remaining length)
	;	0xa8 r14	0xb0 r15	0xb8 r12
	mov	[rsp+0x88], rbx
	mov	[rsp+0x90], rdi
	mov	[rsp+0x98], rsi			; stored but not read back: rsi is returned advanced
	mov	[rsp+0xa0], rdx
	mov	[rsp+0xa8], r14
	mov	[rsp+0xb0], r15
	mov	[rsp+0xb8], r12
	mov	rdi, [rdi+sha_stateptr_ofs]
	mov	[rsp+0x68], rdi			; STATE_SAVE

	; the four working words live in registers across all blocks of this call
	mov	eax, [rdi]			; a
	mov	ebx, [rdi+4]			; b
	mov	ecx, [rdi+8]			; c
	mov	edx, [rdi+12]			; d

calign
.nextblock:	; here is where we jump to from the bottom if there was more to do
	; save the old values of a-d
	mov	r8d, eax
	mov	r9d, ebx
	mov	r14d, ecx
	mov	r15d, edx

	; rsi[0..15] is our input data that we don't need to molest

	; the four step macros share one parameter list:
	;	initial	1 on the first step of a round: emits the loads that prime r10d/r11d (and r12d)
	;	dest	working word updated by this step
	;	x, y, z	the other three working words, in the role order of the round function
	;	next	index of the message dword to preload into r10d for the following step
	;	data	additive constant for this step
	;	s	left rotate count
	; each step performs dest = x + rol(dest + f(x,y,z) + message dword + data, s)
	; r10d always carries the message dword for the current step, and r11d (plus r12d in
	; md5_round1) carries a value prepared by the previous step for the round function
	; the last step of every round passes next = 0; that preload is not consumed

	; f = z xor (x and (y xor z)); r11d enters holding z
macro md5_round0 initial*, dest*, x*, y*, z*, next*, data*, s* {
	if initial = 1
		mov	r10d, [rsi]
		mov	r11d, edx
	end if
	xor	r11d, y
	lea	dest, [dest+r10d+data]
	and	r11d, x
	xor	r11d, z
	mov	r10d, [rsi+next*4]
	add	dest, r11d
	rol	dest, s
	mov	r11d, y
	add	dest, x
}

	; f = (x and z) or (y and not z); r11d and r12d both enter holding z
	; (the z parameter itself is not referenced in the body)
macro md5_round1 initial*, dest*, x*, y*, z*, next*, data*, s* {
	if initial = 1
		mov	r10d, [rsi+4]
		mov	r11d, edx
		mov	r12d, edx
	end if
	not	r11d
	lea	dest, [dest+r10d+data]
	and	r12d, x
	and	r11d, y
	mov	r10d, [rsi+next*4]
	or	r12d, r11d
	mov	r11d, y
	add	dest, r12d
	mov	r12d, y
	rol	dest, s
	add	dest, x
}

	; f = x xor y xor z; r11d enters holding y
	; (the y parameter itself is not referenced in the body)
macro md5_round2 initial*, dest*, x*, y*, z*, next*, data*, s* {
	if initial = 1
		mov	r10d, [rsi+20]
		mov	r11d, ecx
	end if
	lea	dest, [dest+r10d+data]
	mov	r10d, [rsi+next*4]
	xor	r11d, z
	xor	r11d, x
	add	dest, r11d
	rol	dest, s
	mov	r11d, x
	add	dest, x
}

	; f = y xor (x or not z); r11d enters holding not z
	; (the z parameter itself is not referenced in the body)
macro md5_round3 initial*, dest*, x*, y*, z*, next*, data*, s* {
	if initial = 1
		mov	r10d, [rsi]
		mov	r11d, 0xffffffff
		xor	r11d, edx
	end if
	lea	dest, [dest+r10d+data]
	or	r11d, x
	xor	r11d, y
	add	dest, r11d
	mov	r10d, [rsi+next*4]
	mov	r11d, 0xffffffff
	rol	dest, s
	xor	r11d, y
	add	dest, x
}


	; steps 1-16: message dwords taken in order 0..15
	md5_round0 1, eax, ebx, ecx, edx, 1, 0xd76aa478, 7
	md5_round0 0, edx, eax, ebx, ecx, 2, 0xe8c7b756, 12
	md5_round0 0, ecx, edx, eax, ebx, 3, 0x242070db, 17
	md5_round0 0, ebx, ecx, edx, eax, 4, 0xc1bdceee, 22
	md5_round0 0, eax, ebx, ecx, edx, 5, 0xf57c0faf, 7
	md5_round0 0, edx, eax, ebx, ecx, 6, 0x4787c62a, 12
	md5_round0 0, ecx, edx, eax, ebx, 7, 0xa8304613, 17
	md5_round0 0, ebx, ecx, edx, eax, 8, 0xfd469501, 22
	md5_round0 0, eax, ebx, ecx, edx, 9, 0x698098d8, 7
	md5_round0 0, edx, eax, ebx, ecx, 10, 0x8b44f7af, 12
	md5_round0 0, ecx, edx, eax, ebx, 11, 0xffff5bb1, 17
	md5_round0 0, ebx, ecx, edx, eax, 12, 0x895cd7be, 22
	md5_round0 0, eax, ebx, ecx, edx, 13, 0x6b901122, 7
	md5_round0 0, edx, eax, ebx, ecx, 14, 0xfd987193, 12
	md5_round0 0, ecx, edx, eax, ebx, 15, 0xa679438e, 17
	md5_round0 0, ebx, ecx, edx, eax, 0, 0x49b40821, 22

	; steps 17-32: message dwords 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12
	md5_round1 1, eax, ebx, ecx, edx, 6, 0xf61e2562, 5
	md5_round1 0, edx, eax, ebx, ecx, 11, 0xc040b340, 9
	md5_round1 0, ecx, edx, eax, ebx, 0, 0x265e5a51, 14
	md5_round1 0, ebx, ecx, edx, eax, 5, 0xe9b6c7aa, 20
	md5_round1 0, eax, ebx, ecx, edx, 10, 0xd62f105d, 5
	md5_round1 0, edx, eax, ebx, ecx, 15, 0x2441453, 9
	md5_round1 0, ecx, edx, eax, ebx, 4, 0xd8a1e681, 14
	md5_round1 0, ebx, ecx, edx, eax, 9, 0xe7d3fbc8, 20
	md5_round1 0, eax, ebx, ecx, edx, 14, 0x21e1cde6, 5
	md5_round1 0, edx, eax, ebx, ecx, 3, 0xc33707d6, 9
	md5_round1 0, ecx, edx, eax, ebx, 8, 0xf4d50d87, 14
	md5_round1 0, ebx, ecx, edx, eax, 13, 0x455a14ed, 20
	md5_round1 0, eax, ebx, ecx, edx, 2, 0xa9e3e905, 5
	md5_round1 0, edx, eax, ebx, ecx, 7, 0xfcefa3f8, 9
	md5_round1 0, ecx, edx, eax, ebx, 12, 0x676f02d9, 14
	md5_round1 0, ebx, ecx, edx, eax, 0, 0x8d2a4c8a, 20

	; steps 33-48: message dwords 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2
	md5_round2 1, eax, ebx, ecx, edx, 8, 0xfffa3942, 4
	md5_round2 0, edx, eax, ebx, ecx, 11, 0x8771f681, 11
	md5_round2 0, ecx, edx, eax, ebx, 14, 0x6d9d6122, 16
	md5_round2 0, ebx, ecx, edx, eax, 1, 0xfde5380c, 23
	md5_round2 0, eax, ebx, ecx, edx, 4, 0xa4beea44, 4
	md5_round2 0, edx, eax, ebx, ecx, 7, 0x4bdecfa9, 11
	md5_round2 0, ecx, edx, eax, ebx, 10, 0xf6bb4b60, 16
	md5_round2 0, ebx, ecx, edx, eax, 13, 0xbebfbc70, 23
	md5_round2 0, eax, ebx, ecx, edx, 0, 0x289b7ec6, 4
	md5_round2 0, edx, eax, ebx, ecx, 3, 0xeaa127fa, 11
	md5_round2 0, ecx, edx, eax, ebx, 6, 0xd4ef3085, 16
	md5_round2 0, ebx, ecx, edx, eax, 9, 0x4881d05, 23
	md5_round2 0, eax, ebx, ecx, edx, 12, 0xd9d4d039, 4
	md5_round2 0, edx, eax, ebx, ecx, 15, 0xe6db99e5, 11
	md5_round2 0, ecx, edx, eax, ebx, 2, 0x1fa27cf8, 16
	md5_round2 0, ebx, ecx, edx, eax, 0, 0xc4ac5665, 23

	; steps 49-64: message dwords 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9
	md5_round3 1, eax, ebx, ecx, edx, 7, 0xf4292244, 6
	md5_round3 0, edx, eax, ebx, ecx, 14, 0x432aff97, 10
	md5_round3 0, ecx, edx, eax, ebx, 5, 0xab9423a7, 15
	md5_round3 0, ebx, ecx, edx, eax, 12, 0xfc93a039, 21
	md5_round3 0, eax, ebx, ecx, edx, 3, 0x655b59c3, 6
	md5_round3 0, edx, eax, ebx, ecx, 10, 0x8f0ccc92, 10
	md5_round3 0, ecx, edx, eax, ebx, 1, 0xffeff47d, 15
	md5_round3 0, ebx, ecx, edx, eax, 8, 0x85845dd1, 21
	md5_round3 0, eax, ebx, ecx, edx, 15, 0x6fa87e4f, 6
	md5_round3 0, edx, eax, ebx, ecx, 6, 0xfe2ce6e0, 10
	md5_round3 0, ecx, edx, eax, ebx, 13, 0xa3014314, 15
	md5_round3 0, ebx, ecx, edx, eax, 4, 0x4e0811a1, 21
	md5_round3 0, eax, ebx, ecx, edx, 11, 0xf7537e82, 6
	md5_round3 0, edx, eax, ebx, ecx, 2, 0xbd3af235, 10
	md5_round3 0, ecx, edx, eax, ebx, 9, 0x2ad7d2bb, 15
	md5_round3 0, ebx, ecx, edx, eax, 0, 0xeb86d391, 21

	; add old values of a-d
	add	eax, r8d
	add	ebx, r9d
	add	ecx, r14d
	add	edx, r15d

	; one block consumed: shrink the remaining length, advance the input pointer,
	; and go again for as long as a whole block is still available
	sub	qword [rsp+0xa0], 64
	add	rsi, 64
	cmp	qword [rsp+0xa0], 64
	jae	.nextblock
	; else, remaining bytes is < a full block, so bailout
	; write a,b,c,d back to the state only now, after the last block
	mov	r8, [rsp+0x68]			; STATE_SAVE
	mov	dword [r8], eax
	mov	dword [r8+4], ebx
	mov	dword [r8+8], ecx
	mov	dword [r8+12], edx
	mov	rcx, r8				; rcx == pointer to a,b,c,d on return
	mov	rdx, [rsp+0xa0]			; rdx == bytes left over on return
	mov	rax, [rsp+0x80]			; 288 or 280, as decided on entry
	mov	rbx, [rsp+0x88]	
	mov	rdi, [rsp+0x90]
	mov	r12, [rsp+0xb8]
	mov	r14, [rsp+0xa8]
	mov	r15, [rsp+0xb0]
	add	rsp, rax
	epilog

end if

if used md5$final | defined include_everything
	; md5$final: pad the message, run the closing transform(s) and write out the digest
	; three arguments: rdi == md5 state, rsi == 16 byte destination for the digest,
	; edx == bool, nonzero means heap$free the state once the digest has been written
	; no return value; the state always goes through md5$init again before we return
	; (also ahead of a heap$free), so with edx == 0 it is ready for a new message
falign
md5$final:
	prolog	md5$final
	push	rdx rsi rdi			; [rsp] == state, [rsp+8] == digest destination, [rsp+16] == free flag
	mov	r8, [rdi+sha_bitcountptr_ofs]
	mov	rcx, [r8]
	mov	[r8], rcx			; md5 keeps the length little endian, so it is stored back as-is
	shr	rcx, 3
	and	rcx, 0x3f			; rcx == bytes sitting in the block buffer
	jz	.emptybuffer

	; the 0x80 marker goes directly behind the buffered bytes
	mov	r10, [rdi+sha_bufferptr_ofs]
	mov	byte [r10+rcx], 0x80
	add	rcx, 1
	cmp	rcx, 56
	jle	.padshort
	cmp	rcx, 64
	jae	.flushblock
	; no room for the 8 byte length in this block: zero from the marker to offset 64
	lea	rdi, [r10+rcx]
	mov	edx, 64
	xor	esi, esi
	sub	edx, ecx
	call	memset
	mov	rdi, [rsp]
calign
.flushblock:
	mov	rsi, [rdi+sha_bufferptr_ofs]
	mov	edx, 64
	call	md5$transform
	; the length then travels in one more block whose first 56 bytes are zero
	mov	rdi, [rsp]
	mov	edx, 56
	xor	esi, esi
	mov	rdi, [rdi+sha_bufferptr_ofs]
	call	memset32
	mov	rdi, [rsp]
	jmp	.lastblock
calign
.padshort:
	; marker and length fit into the same block: zero the gap up to offset 56
	lea	rdi, [r10+rcx]
	mov	edx, 56
	xor	esi, esi
	sub	edx, ecx
	call	memset
	mov	rdi, [rsp]
	jmp	.lastblock
calign
.emptybuffer:
	; nothing buffered: the closing block is the marker, zeros, and the length
	mov	rdi, [rdi+sha_bufferptr_ofs]
	mov	edx, 56
	xor	esi, esi
	call	memset32
	mov	rdi, [rsp]
	mov	r9, [rdi+sha_bufferptr_ofs]
	mov	dword [r9], 0x80
calign
.lastblock:
	; drop the 64 bit length into the last 8 bytes of the block and transform it
	mov	r8, [rdi+sha_bitcountptr_ofs]
	mov	r9, [rdi+sha_bufferptr_ofs]
	mov	rcx, [r8]
	mov	qword [r9+56], rcx
	mov	rsi, r9
	mov	edx, 64
	call	md5$transform
	; md5$transform hands rdi back unchanged; the digest destination is reloaded from the stack
	mov	rdx, [rdi+sha_stateptr_ofs]
	mov	rsi, [rsp+8]
	; the digest is the four hash words copied out as they sit in memory
	mov	rax, qword [rdx]
	mov	rcx, qword [rdx+8]
	mov	qword [rsi], rax
	mov	qword [rsi+8], rcx
	; reinitialize the state, then release it if the caller asked for that
	mov	rdi, [rsp]
	call	md5$init
	cmp	dword [rsp+16], 0
	je	.alldone
	mov	rdi, [rsp]
	call	heap$free
calign
.alldone:
	add	rsp, 24
	epilog

end if



if used md5$mgf1 | defined include_everything
	; md5$mgf1: one-pass MGF1 as defined by rfc2437, with the hash state kept on our stack
	; four arguments: rdi == seed, rsi == seed length, rdx == destination, rcx == destination length
	;
	; stack layout while running (offsets from rsp):
	;	0			md5 state, md5_state_size bytes
	;	md5_state_size		16 byte digest of the current round
	;	md5_state_size+16	dword round counter
	;	md5_state_size+20	the same counter byte-swapped, which is what gets hashed
	; every round hashes the seed followed by the 4 byte swapped counter and copies
	; up to 16 digest bytes to the destination, until the destination is full
falign
md5$mgf1:
	prolog	md5$mgf1
	push	r12 r13 r14 r15
	sub	rsp, md5_state_size + 16 + 8
	mov	r12, rdi			; r12 == seed
	mov	r13, rsi			; r13 == seed length
	mov	r14, rdx			; r14 == next destination byte
	mov	r15, rcx			; r15 == destination bytes still wanted
	mov	rdi, rsp
	call	md5$init
	mov	qword [rsp+md5_state_size+16], 0
calign
.nextround:
	mov	rdx, r13
	mov	rsi, r12
	mov	rdi, rsp
	call	md5$update
	; fetch this round's counter and leave counter+1 behind for the next one
	mov	eax, [rsp+md5_state_size+16]
	lea	ecx, [rax+1]
	mov	[rsp+md5_state_size+16], ecx
if use_movbe
	movbe	[rsp+md5_state_size+20], eax
else
	bswap	eax
	mov	[rsp+md5_state_size+20], eax
end if
	mov	edx, 4
	lea	rsi, [rsp+md5_state_size+20]
	mov	rdi, rsp
	call	md5$update
	xor	edx, edx			; keep the state: md5$final leaves it initialized for the next round
	lea	rsi, [rsp+md5_state_size]
	mov	rdi, rsp
	call	md5$final
	; copy min(16, bytes still wanted) of the digest
	mov	edx, 16
	cmp	r15, rdx
	cmovb	rdx, r15
	mov	rdi, r14
	lea	rsi, [rsp+md5_state_size]
	add	r14, rdx
	sub	r15, rdx
	call	memcpy
	test	r15, r15
	jnz	.nextround
	add	rsp, md5_state_size + 16 + 8
	pop	r15 r14 r13 r12
	epilog

end if