HeavyThing - poly1305.inc

Jeff Marrison

Table of functions

	; ------------------------------------------------------------------------
	; HeavyThing x86_64 assembly language library and showcase programs
	; Copyright © 2015-2018 2 Ton Digital 
	; Homepage: https://2ton.com.au/
	; Author: Jeff Marrison <jeff@2ton.com.au>
	;       
	; This file is part of the HeavyThing library.
	;       
	; HeavyThing is free software: you can redistribute it and/or modify
	; it under the terms of the GNU General Public License, or
	; (at your option) any later version.
	;       
	; HeavyThing is distributed in the hope that it will be useful, 
	; but WITHOUT ANY WARRANTY; without even the implied warranty of
	; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
	; GNU General Public License for more details.
	;       
	; You should have received a copy of the GNU General Public License along
	; with the HeavyThing library. If not, see <http://www.gnu.org/licenses/>.
	; ------------------------------------------------------------------------
	; poly1305.inc: Implementation of the SSE2 public domain variant at
	; https://github.com/floodyberry/poly1305-opt ... same as his but modified
	; to better suit our library requirements
	;


if used poly1305$new | used poly1305$init | defined include_everything


poly1305_h_ofs = 0
poly1305_r_ofs = 40
poly1305_r2_ofs = 60
poly1305_r4_ofs = 80
poly1305_pad_ofs = 100
poly1305_flags_ofs = 120
poly1305_leftover_ofs = 124
poly1305_buffer_ofs = 128

poly1305_state_size =  poly1305_buffer_ofs + 32

poly1305_started = 1
poly1305_final_shift8 = 4
poly1305_final_shift16 = 8
poly1305_final_r2_r = 16
poly1305_final_r_1 = 32

end if


if used poly1305$new | defined include_everything
	; single argument in rdi: 32 byte key or null for no init
falign
poly1305$new:
	prolog	poly1305$new
	push	rdi
	mov	edi, poly1305_state_size
	call	heap$alloc
	pop	rsi
	test	rsi, rsi
	jnz	.withinit
	epilog
.withinit:
	push	rax
	mov	rdi, rax
	call	poly1305$init
	pop	rax
	epilog

end if


if used poly1305$init | defined include_everything
	; two arguments: rdi == state, rsi == 32 byte key
falign
poly1305$init:
	prolog	poly1305$init
	pxor	xmm0, xmm0
	push	rbp rbx r12
	mov	r11, -1
	mov	r9, 0xffc0fffffff
	mov	rbp, 0xfffffffffff
	push	r13 r14 r15
	mov	r13, r11
	xor	ebx, ebx
	mov	rax, [rsi]
	mov	rcx, [rsi+8]
	mov	r14, 0xfffffc0ffff
	mov	r15, 0xffffffc0f
	movups	[rdi], xmm0
	movups	[rdi+0x10], xmm0
	movups	[rdi+0x20], xmm0
	mov	rdx, rcx
	shr	rcx, 0x18
	and	r9, rax
	shl	rdx, 0x14
	shr	rax, 0x2c
	mov	r8, r9
	or	rdx, rax
	mov	eax, r9d
	shr	r8, 0x1a
	and	rdx, r14
	and	rcx, r15
	and	eax, 0x3ffffff
	mov	[rdi+0x28], eax
	mov	eax, edx
	shl	eax, 0x12
	or	eax, r8d
	mov	r8, rdx
	and	eax, 0x3ffffff
	shr	r8, 0x22
	mov	[rdi+0x2c], eax
	mov	rax, rdx
	shr	rax, 8
	and	eax, 0x3ffffff
	mov	[rdi+0x30], eax
	mov	eax, ecx
	shl	eax, 10
	or	eax, r8d
	mov	r8, rdi
	and	eax, 0x3ffffff
	mov	[rdi+0x34], eax
	mov	rax, rcx
	mov	r14, [rsi+0x10]
	mov	r15, [rsi+0x18]
	shr	rax, 16
	mov	[rdi+0x38], eax
	mov	[rdi+0x68], r14		; pad0
	mov	[rdi+0x70], r15		; pad1
	mov	rsi, rdx
.outermost:
	test	rbx, rbx	
	jnz	.outermost_nonzero
	cmp	r13, 16
	jbe	.bailout
	xor	eax, eax
	lea	rdi, [r8+0x3c]
	jmp	.innerbulk
calign
.outermost_nonzero:
	cmp	r13, 96
	jb	.bailout
	xor	eax, eax
	lea	rdi, [r8+0x50]
.innerbulk:
	imul	r10, rcx, 0x14
	mov	[rsp-0x30], rax
	mov	[rsp-0x20], rax
	lea	r14, [rsi+rsi]
	lea	r11, [r9+r9]
	mov	rax, r10
	mul	r14
	mov	r14, rax
	mov	rax, r9
	mov	r15, rdx
	mul	r9
	add	r14, rax
	mov	rax, r14
	adc	r15, rdx
	lea	rdx, [rcx+rcx]
	and	rax, rbp
	mov	[rsp-0x10], rax
	mov	rax, r11
	mov	[rsp-0x18], rdx
	mul	rsi
	mov	r11, rax
	mov	rax, r10
	mov	r12, rdx
	mul	rcx
	mov	rcx, [rsp-0x10]
	add	r11, rax
	mov	rax, r14
	adc	r12, rdx
	shrd	rax, r15, 0x2c
	mov	[rsp-0x38], rax
	mov	rax, [rsp-0x18]
	add	r11, [rsp-0x38]
	adc	r12, [rsp-0x30]

	mul	r9
	mov	r14, r11
	and	r14, rbp
	mov	r9, rax
	mov	rax, rsi
	mov	r10, rdx
	mul	rsi
	add	r9, rax
	mov	rax, r11
	adc	r10, rdx
	shrd	rax, r12, 0x2c
	mov	[rsp-0x28], rax
	mov	rax, 0x3ffffffffff
	add	r9, [rsp-0x28]
	adc	r10, [rsp-0x20]

	and	rax, r9
	add	rbx, 1
	shrd	r9, r10, 0x2a
	lea	r9, [r9+r9*4]
	add	rcx, r9
	mov	r9, rcx
	shr	rcx, 0x2c
	add	rcx, r14
	and	r9, rbp
	mov	rsi, rcx
	shr	rcx, 0x2c
	mov	rdx, r9
	add	rcx, rax
	mov	eax, r9d
	and	rsi, rbp
	and	eax, 0x3ffffff
	shr	rdx, 0x1a
	mov	[rdi], eax

	mov	eax, esi
	shl	eax, 0x12
	or	eax, edx
	mov	rdx, rsi
	and	eax, 0x3ffffff
	shr	rdx, 0x22
	mov	[rdi+4], eax
	mov	rax, rsi
	shr	rax, 8
	and	eax, 0x3ffffff
	mov	[rdi+8], eax
	mov	eax, ecx
	shl	eax, 0xa
	or	eax, edx
	and	eax, 0x3ffffff
	mov	[rdi+12], eax

	mov	rax, rcx
	shr	rax, 16
	cmp	rbx, 2
	mov	[rdi+16], eax
	jne	.outermost

.bailout:
	mov	qword [r8+poly1305_flags_ofs], 0	; blasts leftover too
	pop	r15 r14 r13 r12 rbx rbp
	epilog

end if


if used poly1305$update | defined include_everything
	; three arguments: rdi == state, rsi == message, rdx == length of same
falign
poly1305$update:
	prolog	poly1305$update
	mov	eax, 32
	mov	r8d, [rdi+poly1305_leftover_ofs]
	push	rbx r12 r13
	mov	rbx, rdi
	sub	eax, r8d
	mov	r12, rsi
	mov	r13, rdx
	push	r14 r15
	test	r8d, r8d
	jz	.noleftovers
	cmp	rax, rdx
	cmova	eax, edx
	mov	r14d, eax
	mov	r15d, r8d
	lea	rdi, [rdi+r8+poly1305_buffer_ofs]
	mov	edx, eax
	call	memcpy
	sub	r13, r14
	add	r12, r14
	add	[rbx+poly1305_leftover_ofs], r14d
	cmp	dword [rbx+poly1305_leftover_ofs], 32
	jb	.bailout
	mov	rdi, rbx
	lea	rsi, [rbx+poly1305_buffer_ofs]
	mov	edx, 32
	call	poly1305$blocks
	mov	dword [rbx+poly1305_leftover_ofs], 0
.noleftovers:
	mov	rcx, r13
	test	r13, r13
	jz	.bailout
	cmp	r13, 32
	jb	.nofullblocks
	and	rcx, not 31
	; consume(state, in, want)
	test	r12, 7
	jnz	.maligned
	mov	rdi, rbx
	mov	rsi, r12
	mov	rdx, rcx
	sub	r13, rcx
	add	r12, rcx
	call	poly1305$blocks
	test	r13, r13
	jnz	.addleftover
	pop	r15 r14 r13 r12 rbx
	epilog
.addleftover:
	lea	rdi, [rbx+poly1305_buffer_ofs]
	mov	rsi, r12
	mov	rdx, r13
	call	memcpy
	add	[rbx+poly1305_leftover_ofs], r13d
	pop	r15 r14 r13 r12 rbx
	epilog
.maligned:
	; copy it to a word-aligned stackbuffer first
	sub	rsp, 1024
	mov	edx, 1024
	cmp	r13, rdx
	cmovb	rdx, r13
	mov	rdi, rsp
	mov	rsi, r12
	mov	r14, rdx
	call	memcpy
	mov	rdi, rbx
	mov	rsi, rsp
	mov	rdx, r14
	call	poly1305$blocks
	add	r12, r14
	sub	r13, r14
	add	rsp, 1024
	jnz	.maligned
	; hmmm, I am not sure his implementation deals with this correctly, no ~not 31 here
	; and no leftover additions like the aligned cousin above...
	; ... breakpointing this for now...
	breakpoint
	pop	r15 r14 r13 r12 rbx
	epilog

.nofullblocks:
	mov	r8d, [rbx+poly1305_leftover_ofs]
	mov	rsi, r12
	mov	edx, r13d
	lea	rdi, [rbx+r8+poly1305_buffer_ofs]
	add	[rbx+poly1305_leftover_ofs], r13d
	call	memcpy
.bailout:
	pop	r15 r14 r13 r12 rbx
	epilog

end if

if used poly1305$final | defined include_everything
	; three arguments: rdi == state, rsi == ptr to 16 byte buffer for the mac, edx == bool as to whether we should destroy (heap$free/clear)
	; the state when we're done
falign
poly1305$final:
	prolog	poly1305$final
	push	rbp rbx r12 r13
	pxor	xmm0, xmm0
	mov	ebp, [rdi+poly1305_leftover_ofs]
	mov	rbx, rdi
	mov	r12, rsi
	mov	r13d, edx
	lea	rsi, [rdi+poly1305_buffer_ofs]
	sub	rsp, 32
	test	ebp, ebp
	jz	.noleftover
	xor	eax, eax
	movups	[rsp], xmm0
	movups	[rsp+0x10], xmm0
	sub	rsi, rsp
	test	ebp, 0x10
	mov	rax, rsp
	jz	.b8
	movups	xmm0, [rsp+rsi]
	add	rax, 0x10
	movups	[rsp], xmm0
.b8:
	test	ebp, 8
	jz	.b4
	mov	rdx, [rax+rsi]
	mov	[rax], rdx
	add	rax, 8
.b4:
	test	ebp, 4
	jz	.b2
	mov	edx, [rax+rsi]
	mov	[rax], edx
	add	rax, 4
.b2:
	test	ebp, 2
	jz	.b1
	movzx	edx, word [rax+rsi]
	mov	[rax], dx
	add	rax, 2
.b1:
	test	ebp, 1
	jz	.bleftover
	movzx	edx, byte [rax+rsi]
	mov	[rax], dl
.bleftover:
	cmp	ebp, 0x10
	je	.nolastbyte
	mov	byte [rsp+rbp], 1
.nolastbyte:
	cmp	ebp, 0x10
	mov	edx, 0x20
	mov	rsi, rsp
	sbb	rax, rax
	mov	rdi, rbx
	and	eax, 4
	add	rax, 4
	or	dword [rbx+0x78], eax	; flags
	call	poly1305$blocks
.noleftover:
	mov	eax, [rbx+0x78]		; flags
	test	eax, 1			; started?
	jz	.notstarted
	sub	ebp, 1
	mov	ecx, eax
	mov	edx, eax
	or	ecx, 0x10
	or	edx, 0x20

	cmp	ebp, 0xf
	cmovbe	eax, edx
	cmova	eax, ecx
	mov	edx, 0x20
	xor	esi, esi
	mov	[rbx+0x78], eax
	mov	rdi, rbx
	call	poly1305$blocks
.notstarted:
	pxor	xmm0, xmm0
	mov	rax, [rbx+8]
	mov	rsi, [rbx+0x70]
	mov	rdx, rax
	mov	rcx, rax
	mov	rax, [rbx+0x10]
	shr	rcx, 0x14
	shl	rdx, 0x2c
	or	rdx, [rbx]
	shl	rax, 0x18
	or	rax, rcx
	mov	rcx, [rbx+0x68]
	add	rdx, rcx
	adc	rax, rsi
	movups	[rbx], xmm0
	movups	[rbx+0x10], xmm0
	movups	[rbx+0x20], xmm0
	movups	[rbx+0x30], xmm0
	movups	[rbx+0x40], xmm0
	movups	[rbx+0x50], xmm0
	movups	[rbx+0x60], xmm0
	movups	[rbx+0x70], xmm0
	mov	[r12], rdx
	mov	[r12+0x8], rax
	add	rsp, 0x20
	test	r13d, r13d
	jnz	.withfree
	pop	r13 r12 rbx rbp
	epilog
.withfree:
	; state has already been cleared
	mov	rdi, rbx
	call	heap$free
	pop	r13 r12 rbx rbp
	epilog

end if

if used poly1305$blocks | defined include_everything

falign
poly1305$blocks:
	prolog	poly1305$blocks
	push	rbx
	mov	rax, rsp
	mov	r8, 0x1000000
	mov	r9, 0x3ffffff
	and	rax, 0xf
	sub	rsp, rax
	sub	rsp, 0x158
	movq	xmm0, r9
	movq	xmm1, r8
	mov	[rsp+0x150], rax
	pshufd	xmm0, xmm0, 0x44
	pshufd	xmm1, xmm1, 0x44
	mov	eax, [rdi+poly1305_flags_ofs]
	movaps	[rsp+0x138], xmm1
	test	eax, poly1305_final_shift8
	jz	.noshift8
	psrldq	xmm1, 8
	movaps	[rsp+0x138], xmm1
.noshift8:
	test	eax, poly1305_final_shift16
	jz	.noshift16
	pxor	xmm1, xmm1
	movaps	[rsp+0x138], xmm1
.noshift16:
	test	eax, poly1305_started
	jnz	.alreadystarted
	movq	xmm1, [rsi+0x10]
	movaps	xmm3, xmm0
	movaps	xmm9, xmm0
	movq	xmm15, [rsi]
	or	eax, poly1305_started
	sub	rdx, 32
	movq	xmm12, [rsi+0x8]
	punpcklqdq xmm15, xmm1
	movq	xmm1, [rsi+0x18]
	movaps	xmm8, xmm15
	pand	xmm3, xmm15
	psrlq	xmm15, 0x34
	add	rsi, 0x20
	punpcklqdq xmm12, xmm1
	movaps	xmm1, xmm12
	psrlq	xmm8, 0x1a
	psllq	xmm1, 0xc
	pand	xmm8, xmm0
	mov	[rdi+poly1305_flags_ofs], eax
	por	xmm15, xmm1
	psrlq	xmm12, 0x28
	pand	xmm9, xmm15
	por	xmm12, [rsp+0x138]
	psrlq	xmm15, 0x1a
	pand	xmm15, xmm0
	jmp	.onit
.alreadystarted:
	movups	xmm8, [rdi]
	movups	xmm15, [rdi+0x10]
	movups	xmm12, [rdi+0x20]
	pshufd	xmm3, xmm8, 0x50
	pshufd	xmm8, xmm8, 0xfa
	pshufd	xmm9, xmm15, 0x50
	pshufd	xmm15, xmm15, 0xfa
	pshufd	xmm12, xmm12, 0x50
.onit:
	test	eax, poly1305_final_r2_r
	jnz	.final_r2_r
	test	eax, poly1305_final_r_1
	jnz	.final_r_1
	; otherwise, r squares
	movups	xmm1, [rdi+0x3c]
	movd	xmm2, [rdi+0x4c]
	pshufd	xmm11, xmm1, 0
	pshufd	xmm2, xmm2, 0
	pshufd	xmm7, xmm1, 0x55
	pshufd	xmm4, xmm1, 0xaa
	movaps	[rsp+0xa8], xmm11
	pshufd	xmm1, xmm1, 0xff
.rset:
	mov	r8, 5
	movaps	xmm5, xmm7
	movaps	xmm13, xmm4
	movaps	xmm14, xmm1
	movaps	[rsp+0x108], xmm1
	movaps	xmm1, xmm2
	cmp	rdx, 0x3f
	movq	xmm6, r8
	movaps	[rsp+0x128], xmm4
	pshufd	xmm6, xmm6, 0x44
	movaps	[rsp+0x98], xmm2
	pmuludq xmm5, xmm6
	pmuludq	xmm13, xmm6
	pmuludq xmm14, xmm6
	pmuludq xmm1, xmm6
	movaps	[rsp+0x58], xmm5
	movaps	[rsp+0x48], xmm13
	movaps	[rsp+0x38], xmm14
	movaps	[rsp+0x28], xmm1
	jbe	.check32
	; otherwise >= 64 bytes
	movups	xmm1, [rdi+0x50]
	movd	xmm2, [rdi+0x60]
	mov	rcx, rdx
	pshufd	xmm2, xmm2, 0
	movaps	[rsp+0x18], xmm2
	pmuludq	xmm2, xmm6
	pshufd	xmm4, xmm1, 0x55
	movaps	[rsp+0x118], xmm4
	pmuludq	xmm4, xmm6
	pshufd	xmm13, xmm1, 0xff
	pshufd	xmm5, xmm1, 0xaa
	movaps	xmm14, [rsp+0x48]

	movaps	[rsp+0xd8], xmm5
	pmuludq	xmm5, xmm6
	mov	rax, rsi
	movaps	[rsp-0x18], xmm4
	movaps	xmm4, xmm13
	pshufd	xmm1, xmm1, 0
	pmuludq	xmm4, xmm6
	movaps	[rsp-0x8], xmm14
	movaps	[rsp+8], xmm5
	movaps	xmm5, [rsp+0xa8]
	movaps	[rsp+0xf8], xmm1
	movaps	xmm1, [rsp+0x38]
	movaps	[rsp+0x78], xmm4
	movaps	xmm4, [rsp+0x28]
	movaps	[rsp+0x88], xmm13
	movaps	[rsp+0xc8], xmm2
	movaps	[rsp+0x68], xmm1
	movaps	[rsp+0xb8], xmm4
	movaps	[rsp+0xe8], xmm5

calign
.while64:
	movaps	xmm5, [rsp-0x18]
	movaps	xmm13, xmm8
	sub	rcx, 0x40
	movaps	xmm4, [rsp+0x8]
	movaps	xmm10, [rsp+0x78]
	pmuludq xmm5, xmm12
	pmuludq xmm4, xmm15
	movaps	xmm2, [rsp+0x8]
	pmuludq xmm10, xmm9
	movaps	xmm11, [rsp+0x78]
	movaps	xmm14, [rsp+0xc8]
	pmuludq xmm2, xmm12
	paddq	xmm5, xmm4
	pmuludq xmm11, xmm15
	movaps	xmm1, [rsp+0x78]
	paddq	xmm5, xmm10
	pmuludq xmm14, xmm8
	movaps	xmm10, [rsp+0xc8]
	movaps	xmm4, [rsp+0xc8]
	pmuludq xmm1, xmm12
	movaps	xmm8, [rsp+0xf8]
	pmuludq xmm10, xmm15
	paddq	xmm2, xmm11
	pmuludq xmm4, xmm12
	paddq	xmm5, xmm14
	movaps	xmm11, [rsp+0xc8]
	movaps	xmm14, [rsp+0xf8]
	pmuludq xmm8, xmm15
	pmuludq xmm12, [rsp+0xf8]
	pmuludq xmm11, xmm9
	paddq	xmm1, xmm10
	movaps	xmm10, [rsp+0xf8]
	pmuludq xmm15, [rsp+0x118]
	pmuludq xmm14, xmm3
	paddq	xmm12, xmm15
	paddq	xmm4, xmm8
	pmuludq xmm10, xmm13
	movq	xmm15, [rax+0x18]
	movaps	xmm8, [rsp+0xf8]
	paddq	xmm2, xmm11
	movaps	xmm11, xmm3
	movaps	xmm3, [rsp+0x118]
	paddq	xmm5, xmm14
	pmuludq xmm8, xmm9
	paddq	xmm2, xmm10
	movq	xmm14, [rax+0x10]
	movaps	xmm10, [rsp+0x118]
	pmuludq xmm3, xmm9
	pmuludq xmm9, [rsp+0xd8]
	paddq	xmm12, xmm9
	paddq	xmm1, xmm8
	movq	xmm8, [rax]
	pmuludq xmm10, xmm11
	paddq	xmm4, xmm3
	movaps	xmm3, [rsp+0xd8]
	punpcklqdq xmm8, xmm14
	movaps	xmm14, [rsp+0x118]
	pmuludq xmm3, xmm13
	paddq	xmm2, xmm10
	movq	xmm10, [rax+0x8]
	pmuludq xmm14, xmm13
	pmuludq xmm13, [rsp+0x88]
	paddq	xmm12, xmm13
	punpcklqdq xmm10, xmm15
	movaps	xmm9, xmm10
	movaps	xmm15, [rsp+0xd8]
	paddq	xmm4, xmm3
	psllq	xmm9, 0xc
	movaps	xmm3, xmm0
	paddq	xmm1, xmm14
	pmuludq xmm15, xmm11
	pand	xmm3, xmm8
	movaps	xmm14, [rsp+0x88]
	movaps	[rsp-0x28], xmm3
	movaps	xmm3, xmm8
	movups	xmm13, [rax+0x30]
	psrlq	xmm8, 0x34
	pmuludq xmm14, xmm11
	paddq	xmm1, xmm15
	por	xmm8, xmm9
	pmuludq xmm11, [rsp+0x18]
	paddq	xmm12, xmm11
	movups	xmm11, [rax+0x20]
	movaps	xmm9, xmm10
	psrlq	xmm10, 0x28
	pand	xmm8, xmm0
	movaps	xmm15, xmm11
	paddq	xmm4, xmm14
	pxor	xmm14, xmm14
	punpckldq xmm15, xmm13
	psrlq	xmm9, 0xe
	add	rax, 0x40
	pand	xmm9, xmm0
	psrlq	xmm3, 0x1a
	cmp	rcx, 0x3f
	por	xmm10, [rsp+0x138]
	movaps	[rsp-0x48], xmm13
	movaps	xmm13, xmm15
	punpckldq xmm13, xmm14
	punpckhdq xmm11, [rsp-0x48]
	movaps	[rsp-0x38], xmm13
	movaps	xmm13, xmm11
	punpckhdq xmm11, xmm14
	pand	xmm3, xmm0
	psllq	xmm11, 0x12
	punpckhdq xmm15, xmm14
	punpckldq xmm13, xmm14
	paddq	xmm4, xmm11
	movaps	xmm11, [rsp-0x8]
	psllq	xmm15, 0x6
	psllq	xmm13, 0xc
	movaps	xmm14, [rsp+0x58]
	paddq	xmm2, xmm15
	pmuludq xmm11, xmm10
	paddq	xmm1, xmm13
	movaps	xmm13, [rsp-0x8]
	pmuludq xmm14, xmm10
	paddq	xmm5, [rsp-0x38]
	paddq	xmm12, [rsp+0x138]
	pmuludq xmm13, xmm9
	movaps	xmm15, [rsp+0x68]
	paddq	xmm2, xmm11
	movaps	xmm11, [rsp+0xb8]
	paddq	xmm5, xmm14
	movaps	xmm14, [rsp+0x68]
	pmuludq xmm15, xmm9
	pmuludq xmm11, xmm10
	paddq	xmm5, xmm13
	movaps	xmm13, [rsp+0x68]
	pmuludq xmm14, xmm10
	pmuludq xmm10, [rsp+0xe8]
	paddq	xmm12, xmm10
	pmuludq xmm13, xmm8
	paddq	xmm2, xmm15
	movaps	xmm10, xmm8
	paddq	xmm4, xmm11
	pmuludq xmm10, xmm7
	movaps	xmm11, [rsp+0xe8]
	movaps	xmm15, [rsp+0xb8]
	paddq	xmm1, xmm14
	pmuludq xmm11, xmm9
	paddq	xmm5, xmm13
	movaps	xmm13, [rsp+0xb8]
	movaps	xmm14, [rsp+0xb8]
	pmuludq xmm15, xmm3
	pmuludq xmm13, xmm9
	paddq	xmm4, xmm11
	pmuludq xmm14, xmm8
	movaps	xmm11, [rsp+0xe8]
	paddq	xmm4, xmm10
	paddq	xmm5, xmm15
	pmuludq xmm9, xmm7
	pmuludq xmm11, xmm8
	paddq	xmm1, xmm13
	movaps	xmm13, [rsp+0xe8]
	movaps	xmm10, [rsp+0x128]
	paddq	xmm2, xmm14
	pmuludq xmm8, [rsp+0x128]
	movaps	xmm14, [rsp-0x28]
	pmuludq xmm13, xmm3
	paddq	xmm12, xmm9
	paddq	xmm1, xmm11
	movaps	xmm11, xmm3
	paddq	xmm12, xmm8
	movaps	xmm15, [rsp+0xe8]
	pmuludq xmm11, xmm7
	pmuludq xmm10, xmm3
	paddq	xmm2, xmm13
	movaps	xmm13, xmm14
	movaps	xmm9, [rsp+0x128]
	pmuludq xmm15, xmm14
	pmuludq xmm3, [rsp+0x108]
	paddq	xmm1, xmm11
	pmuludq xmm13, xmm7
	paddq	xmm12, xmm3
	movaps	xmm11, [rsp+0x108]
	paddq	xmm4, xmm10
	pmuludq xmm9, xmm14
	paddq	xmm5, xmm15
	pmuludq xmm11, xmm14
	movaps	xmm8, xmm5
	paddq	xmm2, xmm13
	psrlq	xmm8, 0x1a
	paddq	xmm1, xmm9
	pand	xmm5, xmm0
	pmuludq xmm14, [rsp+0x98]
	paddq	xmm12, xmm14
	paddq	xmm2, xmm8
	paddq	xmm4, xmm11
	movaps	xmm9, xmm2
	movaps	xmm8, xmm2
	movaps	xmm3, xmm4
	psrlq	xmm9, 0x1a
	pand	xmm4, xmm0
	psrlq	xmm3, 0x1a
	paddq	xmm1, xmm9
	pand	xmm8, xmm0
	paddq	xmm12, xmm3
	movaps	xmm10, xmm1
	movaps	xmm9, xmm1
	movaps	xmm3, xmm12
	psrlq	xmm10, 0x1a
	pand	xmm12, xmm0
	psrlq	xmm3, 0x1a
	paddq	xmm4, xmm10
	pand	xmm9, xmm0
	pmuludq xmm3, xmm6
	movaps	xmm1, xmm4
	movaps	xmm15, xmm4
	psrlq	xmm1, 0x1a
	pand	xmm15, xmm0
	paddq	xmm12, xmm1
	paddq	xmm5, xmm3
	movaps	xmm2, xmm5
	movaps	xmm3, xmm5
	psrlq	xmm2, 0x1a
	pand	xmm3, xmm0
	paddq	xmm8, xmm2
	ja	.while64
	lea	rax, [rdx-0x40]
	and	edx, 0x3f
	and	rax, 0xffffffffffffffc0
	lea	rsi, [rsi+rax+0x40]
.check32:
	cmp	rdx, 0x1f
	jbe	.checkm
	movaps	xmm11, [rsp+0x38]
	movaps	xmm1, xmm15
	movaps	xmm14, xmm15
	movaps	xmm5, [rsp+0x48]
	movaps	xmm4, xmm12
	movaps	xmm10, xmm15
	movaps	xmm2, [rsp+0x58]
	pmuludq xmm14, xmm11
	movaps	xmm15, xmm8
	pmuludq xmm1, xmm5
	movaps	xmm13, [rsp+0x28]
	test	rsi, rsi
	pmuludq xmm2, xmm12
	pmuludq xmm5, xmm12
	pmuludq xmm4, xmm11
	paddq	xmm2, xmm1
	pmuludq xmm11, xmm9
	movaps	xmm1, xmm12
	paddq	xmm5, xmm14
	pmuludq xmm15, xmm13
	movaps	xmm14, xmm9
	pmuludq xmm14, xmm13
	pmuludq xmm1, xmm13
	paddq	xmm2, xmm11
	movaps	xmm11, [rsp+0xa8]
	pmuludq xmm13, xmm10
	paddq	xmm2, xmm15
	movaps	xmm15, xmm9
	paddq	xmm5, xmm14
	pmuludq xmm12, xmm11
	movaps	xmm14, xmm3
	pmuludq xmm14, xmm11
	movaps	[rsp+0xf8], xmm13
	movaps	xmm13, xmm10
	pmuludq xmm15, xmm7
	paddq	xmm4, [rsp+0xf8]
	pmuludq xmm13, xmm11
	pmuludq xmm10, xmm7
	paddq	xmm2, xmm14
	movaps	[rsp+0x118], xmm13
	movaps	xmm13, xmm8
	pmuludq xmm13, xmm11
	paddq	xmm12, xmm10
	movaps	xmm10, [rsp+0x128]
	paddq	xmm1, [rsp+0x118]
	pmuludq xmm11, xmm9
	pmuludq xmm9, [rsp+0x128]
	pmuludq xmm10, xmm3
	paddq	xmm12, xmm9
	paddq	xmm5, xmm13
	movaps	xmm13, xmm3
	paddq	xmm1, xmm15
	pmuludq xmm13, xmm7
	paddq	xmm4, xmm11
	movaps	xmm11, [rsp+0x128]
	pmuludq xmm7, xmm8
	pmuludq xmm11, xmm8
	pmuludq xmm8, [rsp+0x108]
	paddq	xmm12, xmm8
	paddq	xmm5, xmm13
	paddq	xmm4, xmm7
	movaps	xmm7, [rsp+0x108]
	paddq	xmm1, xmm11
	paddq	xmm4, xmm10
	pmuludq xmm7, xmm3
	pmuludq xmm3, [rsp+0x98]
	paddq	xmm12, xmm3
	paddq	xmm1, xmm7
	jz	.check32_nom
	movups	xmm7, [rsi]
	pxor	xmm3, xmm3
	paddq	xmm12, [rsp+0x138]
	movups	xmm8, [rsi+0x10]
	movaps	xmm9, xmm7
	punpckldq xmm9, xmm8
	punpckhdq xmm7, xmm8
	movaps	xmm10, xmm9
	movaps	xmm8, xmm7
	punpckldq xmm10, xmm3
	punpckhdq xmm9, xmm3
	punpckhdq xmm7, xmm3
	punpckldq xmm8, xmm3
	movaps	xmm3, xmm8
	psllq	xmm9, 0x6
	paddq	xmm2, xmm10
	psllq	xmm3, 0xc
	paddq	xmm5, xmm9
	psllq	xmm7, 0x12
	paddq	xmm4, xmm3
	paddq	xmm1, xmm7
.check32_nom:
	movaps	xmm8, xmm2
	movaps	xmm3, xmm1
	movaps	xmm15, xmm1
	psrlq	xmm8, 0x1a
	pand	xmm2, xmm0
	pand	xmm15, xmm0
	psrlq	xmm3, 0x1a
	paddq	xmm8, xmm5
	paddq	xmm3, xmm12
	movaps	xmm9, xmm8
	pand	xmm8, xmm0
	movaps	xmm1, xmm3
	psrlq	xmm9, 0x1a
	movaps	xmm12, xmm3
	psrlq	xmm1, 0x1a
	paddq	xmm9, xmm4
	pand	xmm12, xmm0
	pmuludq xmm6, xmm1
	movaps	xmm3, xmm9
	pand	xmm9, xmm0
	psrlq	xmm3, 0x1a
	paddq	xmm15, xmm3
	paddq	xmm2, xmm6
	movaps	xmm3, xmm15
	pand	xmm15, xmm0
	movaps	xmm1, xmm2
	psrlq	xmm3, 0x1a
	psrlq	xmm1, 0x1a
	paddq	xmm12, xmm3
	movaps	xmm3, xmm0
	paddq	xmm8, xmm1
	pand	xmm3, xmm2

.checkm:
	test	rsi, rsi
	jz	.nom
	pshufd	xmm3, xmm3, 0x8
	pshufd	xmm8, xmm8, 0x8
	pshufd	xmm9, xmm9, 0x8
	pshufd	xmm15, xmm15, 0x8
	pshufd	xmm12, xmm12, 0x8
	punpcklqdq xmm3, xmm8
	punpcklqdq xmm9, xmm15
	movups	[rdi], xmm3
	movups	[rdi+0x10], xmm9
	movq	[rdi+0x20], xmm12

	mov	rcx, [rsp+0x150]
	add	rsp, 0x158
	add	rsp, rcx
	pop	rbx
	epilog

calign
.nom:
	movaps	xmm0, xmm3
	movaps	xmm4, xmm8
	movaps	xmm2, xmm9
	psrldq	xmm0, 0x8
	movaps	xmm10, xmm15
	paddq	xmm3, xmm0
	psrldq	xmm4, 0x8
	movaps	xmm0, xmm12
	movd	edx, xmm3
	paddq	xmm8, xmm4
	psrldq	xmm2, 0x8
	mov	ecx, edx
	movd	eax, xmm8
	paddq	xmm9, xmm2
	shr	ecx, 0x1a
	psrldq	xmm10, 0x8
	and	edx, 0x3ffffff
	add	eax, ecx
	movd	ecx, xmm9
	paddq	xmm15, xmm10
	mov	r9d, eax
	shr	eax, 0x1a
	psrldq	xmm0, 0x8
	add	eax, ecx
	movd	ecx, xmm15
	paddq	xmm12, xmm0
	mov	esi, eax
	and	r9d, 0x3ffffff
	movd	r10d, xmm12
	shr	esi, 0x1a
	and	eax, 0x3ffffff
	add	esi, ecx
	shl	rax, 0x8
	mov	ecx, r9d
	shr	r9d, 0x12
	mov	r8d, esi
	shr	esi, 0x1a
	and	r8d, 0x3ffffff
	add	esi, r10d
	or	rax, r9
	shl	rsi, 0x10
	mov	r9, r8
	shr	r8d, 0xa
	shl	rcx, 0x1a
	or	rsi, r8
	shl	r9, 0x22
	or	rcx, rdx
	mov	r11, rsi
	shr	rsi, 0x2a
	mov	rdx, 0xfffffffffff
	or	rax, r9
	mov	r8, 0x3ffffffffff
	and	rcx, rdx
	and	rax, rdx
	and	r11, r8
	lea	rsi, [rsi+rsi*4]
	add	rcx, rsi
	mov	r10, rcx
	shr	rcx, 0x2c
	add	rax, rcx
	and	r10, rdx
	mov	r9, rax
	shr	rax, 0x2c
	add	rax, r11
	and	r9, rdx
	mov	r11, 0xfffffc0000000000
	mov	rcx, rax
	and	rcx, r8
	shr	rax, 0x2a
	lea	rsi, [rax+rax*4]
	add	r11, rcx
	add	rsi, r10
	mov	r8, rsi
	shr	rsi, 0x2c
	and	r8, rdx
	add	rsi, r9
	lea	r9, [r8+0x5]
	mov	rbx, r9
	and	r9, rdx
	shr	rbx, 0x2c
	add	rbx, rsi
	mov	rax, rbx
	and	rdx, rbx
	shr	rax, 0x2c
	add	r11, rax
	mov	rax, r11
	shr	rax, 0x3f
	sub	rax, 1
	mov	r10, rax
	and	r9, rax
	and	rdx, rax
	not	r10
	and	rax, r11
	and	r8, r10
	and	rsi, r10
	and	rcx, r10
	or	r8, r9
	or	rsi, rdx
	or	rcx, rax
	mov	[rdi], r8
	mov	[rdi+0x8], rsi
	mov	[rdi+0x10], rcx

	mov	rcx, [rsp+0x150]
	add	rsp, 0x158
	add	rsp, rcx
	pop	rbx
	epilog

calign
.final_r2_r:
	movd	xmm2, [rdi+0x38]
	lea	rax, [rdi+0x28]

	movups	xmm1, [rdi+0x3c]
	movups	xmm4, [rax]
	movd	eax, xmm2
	movd	xmm2, [rdi+0x4c]
	movaps	xmm7, xmm1
	movd	xmm5, eax
	punpckldq xmm7, xmm4
	punpckhdq xmm1, xmm4
	punpcklqdq xmm2, xmm5

	pshufd	xmm11, xmm7, 0x50
	pshufd	xmm4, xmm1, 0x50
	pshufd	xmm7, xmm7, 0xfa
	movaps	[rsp+0xa8], xmm11

	pshufd	xmm1, xmm1, 0xfa
	jmp	.rset

calign
.final_r_1:
	movd	xmm2, [rdi+0x38]
	lea	rax, [rdi+0x28]

	movups	xmm1, [rax]
	mov	r8d, 1
	movd	xmm4, r8d
	movaps	xmm7, xmm1
	punpckldq xmm7, xmm4
	punpckhdq xmm1, xmm4

	pshufd	xmm11, xmm7, 0x50
	pshufd	xmm4, xmm1, 0x50
	pshufd	xmm7, xmm7, 0xfa
	movaps	[rsp+0xa8], xmm11

	pshufd	xmm1, xmm1, 0xfa
	jmp	.rset

	

end if