HeavyThing - curve25519.inc

Jeff Marrison

Table of functions

	; ------------------------------------------------------------------------
	; HeavyThing x86_64 assembly language library and showcase programs
	; Copyright © 2015-2018 2 Ton Digital 
	; Homepage: https://2ton.com.au/
	; Author: Jeff Marrison <jeff@2ton.com.au>
	;       
	; This file is part of the HeavyThing library.
	;       
	; HeavyThing is free software: you can redistribute it and/or modify
	; it under the terms of the GNU General Public License, or
	; (at your option) any later version.
	;       
	; HeavyThing is distributed in the hope that it will be useful, 
	; but WITHOUT ANY WARRANTY; without even the implied warranty of
	; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
	; GNU General Public License for more details.
	;       
	; You should have received a copy of the GNU General Public License along
	; with the HeavyThing library. If not, see <http://www.gnu.org/licenses/>.
	; ------------------------------------------------------------------------
	; curve25519.inc: Implementation of the SSE2 public domain variant at
	; https://github.com/floodyberry/curve25519-donna
	;
	; To generate a 32 byte public key from 32 random bytes (as the upstream
	; docs mention, clamping beforehand is not necessary), use curve25519$donna_basepoint.
	;
	; To generate a 32 byte shared key, use curve25519$donna.
	;

if used curve25519$donna_basepoint | defined include_everything
	; curve25519$donna_basepoint: derive a 32 byte public key from 32 bytes of rng output
	;   rdi == ptr to 32 byte public key (output)
	;   rsi == ptr to 32 bytes of rng output (the secret scalar)
	; NOTE: the buffer at rsi is clamped in place here, whereas the C version
	; clamps a stack copy and leaves the caller's buffer alone.
falign
curve25519$donna_basepoint:
	prolog	curve25519$donna_basepoint
	; clamp the scalar: drop the top bit of the last byte and the low three
	; bits of the first byte, then force bit 6 of the last byte on
	and	byte [rsi+31], 01111111b
	and	byte [rsi], 11111000b
	or	byte [rsi+31], 01000000b
	lea	rdx, [.basepoint]	; third argument: the fixed 32 byte point below
	call	curve25519$scalarmult_donna
	epilog
dalign
.basepoint:
	; a single 9 followed by 31 zero bytes (32 bytes total)
	db	9
	times 31 db 0

end if

if used curve25519$donna | defined include_everything
	; curve25519$donna: compute a 32 byte shared key
	;   rdi == ptr to 32 byte shared key (output)
	;   rsi == ptr to 32 byte secret
	;   rdx == ptr to the other side's 32 byte public key
	; NOTE: the secret at rsi is clamped in place before the multiply.
falign
curve25519$donna:
	prolog	curve25519$donna
	; clamp the secret: drop the top bit of the last byte and the low three
	; bits of the first byte, then force bit 6 of the last byte on
	and	byte [rsi+31], 01111111b
	and	byte [rsi], 11111000b
	or	byte [rsi+31], 01000000b
	; rdi/rsi/rdx are passed through untouched
	call	curve25519$scalarmult_donna
	epilog

end if


if used curve25519$scalarmult_donna | defined include_everything
	; three args (see above)
falign
curve25519$scalarmult_donna:
	prolog	curve25519$scalarmult_donna
	mov	rcx, rsp
	pxor	xmm0, xmm0
	pxor	xmm15, xmm15
	push	rbp rbx r12
	and	rcx, 0xf		; misaligned stack on entry?
	mov	rbp, rdi
	xor	eax, eax
	add	rcx, 0x4d0
	push	r13 r14 r15
	sub	rsp, rcx
	mov	[rsp+0x4c8], rcx	; our stack modification amount

	mov	r12d, [rdx]
	mov	r11d, [rdx+4]
	mov	r10d, [rdx+8]
	movaps	[rsp+0x310], xmm0
	movaps	[rsp+0x320], xmm0
	movaps	[rsp+0x330], xmm0
	mov	r9d, [rdx+12]
	mov	r8d, [rdx+16]
	lea	rbx, [rsp+0x310]
	movaps	[rsp+0x340], xmm0
	movaps	[rsp+0x350], xmm0
	movaps	[rsp+0x360], xmm0
	movaps	xmm7, [.sse2_top64bitmask]
	movaps	xmm8, [.packednineteen]
	mov	eax, r12d
	mov	edi, [rdx+0x14]

	mov	dword [rsp+0x310], 1
	mov	dword [rsp+0x340], 1
	mov	[rsp+0x3c8], rax	; dwords 3c8 and 3cc == 0
	
	mov	ecx, [rdx+0x18]
	and	eax, 0x3ffffff
	movaps	xmm9, xmm7
	mov 	edx, [rdx+0x1c]
	mov	dword [rsp+0x3a0], eax
	
	mov	eax, r11d
	shl	rax, 0x20
	or	rax, r12
	shr	rax, 0x1a
	and	eax, 0x1ffffff
	mov	dword [rsp+0x3a4], eax
	
	mov	eax, r10d
	shl	rax, 0x20
	or	rax, r11
	shr	rax, 0x13
	and	eax, 0x3ffffff
	mov	dword [rsp+0x3a8], eax

	mov	eax, r9d
	shr	r9d, 0x6
	shl	rax, 0x20
	mov	dword [rsp+0x3b0], r9d
	or	rax, r10
	shr	rax, 0xd
	and	eax, 0x1ffffff
	mov	dword [rsp+0x3ac], eax

	mov	eax, r8d
	and	eax, 0x1ffffff
	movaps	xmm5, [rsp+0x3a0]
	mov	dword [rsp+0x3b4], eax

	mov	eax, edi
	shl	rax, 0x20
	pshufd	xmm0, xmm5, 0x55
	or	rax, r8
	pshufd	xmm2, xmm5, 0xaa
	shr	rax, 0x19
	xor	r8d, r8d
	and	eax, 0x3ffffff
	pand	xmm7, xmm0
	mov	dword [rsp+0x3b8], eax

	mov	eax, ecx
	shl	rax, 0x20
	movaps	xmm6, xmm2
	or	rax, rdi
	paddq	xmm7, xmm0
	pshufd	xmm0, xmm5, 0xff
	shr	rax, 0x13
	movaps	xmm10, xmm6
	and	eax, 0x1ffffff
	mov	dword [rsp+0x3bc], eax

	mov	eax, edx
	shl	rax, 0x20
	or	rax, rcx
	shr	rax, 0xc
	and	eax, 0x3ffffff
	mov	dword [rsp+0x3c0], eax

	mov	eax, edx
	mov	edx, 0xfe
	shr	eax, 0x6
	and	eax, 0x1ffffff
	mov	dword [rsp+0x3c4], eax
	movaps	[rsp+0xb0], xmm2

	
	; xmm1, 3, 4, 9, 11, 12, 13, 14 are all unused so far
	pmuludq	xmm10, xmm8
	pshufd	xmm2, xmm5, 0
	movaps	[rsp+0x260], xmm7
	pshufd	xmm7, xmm7, 0xa
	movaps	[rsp+0x50], xmm2
	movaps	xmm3, xmm9
	pand	xmm3, xmm0
	movaps	xmm13, [rsp+0x3b0]
	paddq	xmm3, xmm0
	pshufd	xmm0, xmm13, 0x55
	movaps	xmm2, xmm9
	pshufd	xmm4, xmm13, 0xaa
	pand	xmm2, xmm0
	movaps	xmm1, xmm9
	movaps	xmm14, xmm4
	paddq	xmm2, xmm0
	pshufd	xmm0, xmm13, 0xff
	movaps	[rsp+0x1c0], xmm4
	
	pshufd	xmm4, xmm13, 0
	pand	xmm1, xmm0
	movaps	xmm6, xmm14
	movaps	[rsp+0xc0], xmm3
	movaps	xmm11, xmm4
	pmuludq	xmm6, xmm8
	movaps	[rsp+0x110], xmm4

	paddq	xmm1, xmm0
	pshufd	xmm3, xmm3, 0xa
	movaps	[rsp+0x120], xmm2
	pshufd	xmm2, xmm2, 0xa
	movaps	[rsp+0x1d0], xmm1
	pmuludq	xmm3, xmm8
	pshufd	xmm1, xmm1, 0xa
	movaps	xmm9, xmm11
	movaps	xmm11, xmm7
	pmuludq	xmm2, xmm8
	
	movaps	xmm14, xmm15
	movaps	xmm12, [rsp+0x3c0]
	pmuludq	xmm11, xmm8
	pmuludq	xmm9, xmm8
	pmuludq	xmm1, xmm8
	
	pshufd	xmm4, xmm12, 0xdd
	movaps	xmm0, [.sse2_top64bitmask]
	pand	xmm0, xmm4
	movaps	xmm8, [rsp+0x340]
	paddq	xmm0, xmm4
	pshufd	xmm4, xmm12, 0xcc
	movaps	[rsp+0x270], xmm0
	pshufd	xmm0, xmm0, 0xa
	movaps	[rsp+0x250], xmm4
	pmuludq	xmm4, [.packednineteen]
	pmuludq	xmm0, [.packednineteen]
	movaps	[rsp+0x2a0], xmm2
	movaps	xmm2, xmm15
	movaps	[rsp+0x2c0], xmm3
	movaps	xmm3, xmm15
	movaps	[rsp+0x2d0], xmm6
	movaps	xmm6, xmm8
	movaps	[rsp+0x280], xmm0
	movaps	[rsp+0x290], xmm1
	movaps	[rsp+0x2b0], xmm4
	movaps	[rsp+0x2e0], xmm9
	
	movaps	xmm7, [rsp+0x310]
	movaps	[rsp+0x2f0], xmm10
	movaps	xmm10, xmm12
	movaps	[rsp+0x300], xmm11
	movaps	xmm12, xmm13
	movaps	[rsp+0x1a0], xmm15
	movaps	xmm11, xmm15
	movaps	[rsp+0x1b0], xmm15
	
	movaps	xmm13, xmm10
	movaps	xmm15, xmm5
	movaps	xmm10, xmm14
	movaps	xmm5, xmm14
	jmp	.highentry
align 16
.packednineteen:
	dq	19, 19
.packedmask26:
	dq	0x3ffffff, 0x3ffffff
.packedmask25:
	dq	0x1ffffff, 0x1ffffff
.packed32zeromodp0:
	dd	0x7ffffda, 0x7ffffda, 0x3fffffe, 0x3fffffe
.packed32zeromodp1:
	dd	0x7fffffe, 0x7fffffe, 0x3fffffe, 0x3fffffe
.packedmask26262626:
	dd	0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff
.packedmask25252525:
	dd	0x1ffffff, 0x1ffffff, 0x1ffffff, 0x1ffffff
.packedthirtyeight:
	dq	38, 38
.packed121666121665:
	dq	121666, 121665
.packed2p0:
	dd	0x7ffffda, 0x3fffffe, 0x7fffffe, 0x3fffffe
.packed2p2:
	dd	0x7fffffe, 0x3fffffe, 0x0000000, 0x0000000
.packed2p1:
	dd	0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe
.sse2_bot32bitmask:
	dd	0xffffffff, 0x00000000, 0xffffffff, 0x00000000
.sse2_top64bitmask:
	dd	0x00000000, 0x00000000, 0xffffffff, 0xffffffff
.packed3819:
	dq	38, 19
.sse2_bot64bitmask:
	dd	0xffffffff, 0xffffffff, 0x00000000, 0x00000000
calign
.highloop:
	movaps	xmm6, [rsp+0x200]
	mov	r8, r9
	movaps	xmm13, [rsp+0x210]
	movaps	xmm12, [rsp+0x1f0]
	movaps	xmm15, [rsp+0x1e0]
	movaps	xmm10, [rsp+0x1b0]
calign
.highentry:
	mov	rax, rdx
	mov	ecx, edx
	sub	rdx, 1
	shr	rax, 3
	and	ecx, 7
	movzx	eax, byte [rsi+rax]
	pxor	xmm13, xmm11
	pxor	xmm15, xmm7
	sar	eax, cl
	mov	ecx, r8d

	pxor	xmm12, xmm5
	movaps	xmm9, xmm13

	and	eax, 1
	pxor	xmm6, xmm2
	movaps	xmm1, xmm15
	xor	ecx, eax
	neg	ecx
	pxor	xmm10, xmm3
	movaps	xmm0, xmm12
	movd	xmm4, ecx
	cmp	rdx, 2
	movsxd	r9, eax

	pshufd	xmm4, xmm4, 0
	pand	xmm9, xmm4
	pand	xmm1, xmm4
	pand	xmm0, xmm4
	pxor	xmm9, xmm11
	movaps	xmm11, [rsp+0x1a0]

	pxor	xmm1, xmm7
	pxor	xmm11, xmm14
	movaps	xmm7, xmm6
	pxor	xmm0, xmm5
	pand	xmm7, xmm4
	movaps	xmm5, xmm11
	pxor	xmm15, xmm1
	pxor	xmm12, xmm0
	pand	xmm5, xmm4
	pxor	xmm7, xmm2
	movaps	xmm2, xmm0
	pand	xmm4, xmm10
	pxor	xmm5, xmm14
	pxor	xmm6, xmm7
	movaps	xmm8, xmm7

	punpckldq xmm2, xmm12
	pxor	xmm11, xmm5
	punpckldq xmm8, xmm6
	punpckhdq xmm7, xmm6
	movaps	xmm6, xmm5
	pxor	xmm4, xmm3
	punpckldq xmm6, xmm11
	movaps	xmm3, xmm1
	punpckhdq xmm0, xmm12
	movaps	xmm14, xmm2
	punpckhdq xmm5, xmm11
	paddd	xmm2, [.packed32zeromodp1]

	punpckldq xmm3, xmm15
	psubd	xmm2, xmm6
	pxor	xmm13, xmm9
	paddd	xmm14, xmm6
	movaps	xmm11, xmm14
	movaps	xmm14, xmm0
	paddd	xmm0, [.packed32zeromodp1]
	
	psubd	xmm0, xmm5
	punpckhdq xmm1, xmm15
	pxor	xmm10, xmm4
	paddd	xmm14, xmm5
	punpckldq xmm9, xmm13
	movaps	xmm13, xmm3
	paddd	xmm3, [.packed32zeromodp0]

	psubd	xmm3, xmm8
	movaps	xmm5, xmm3
	punpckldq xmm4, xmm10
	movaps	xmm10, xmm1
	paddd	xmm1, [.packed32zeromodp1]

	punpcklqdq xmm5, xmm2
	psubd	xmm1, xmm7
	paddd	xmm13, xmm8
	punpckhqdq xmm3, xmm2
	movaps	xmm2, xmm1
	paddd	xmm10, xmm7
	movaps	xmm15, xmm9
	paddd	xmm9, [.packed32zeromodp1]

	psubd	xmm9, xmm4
	punpcklqdq xmm2, xmm0
	punpckhqdq xmm1, xmm0
	paddd	xmm15, xmm4
	movaps	[rsp+0x60], xmm14
	movaps	xmm6, xmm2
	movaps	xmm2, xmm5
	pshufd	xmm14, xmm10, 0xfa
	movaps	[rsp+0x70], xmm15
	psrld	xmm2, 0x1a
	movaps	xmm4, xmm2
	movaps	xmm2, xmm6
	pshufd	xmm15, xmm10, 0x50
	paddd	xmm4, xmm3
	movaps	xmm0, xmm4
	pand	xmm5, [.packedmask26262626]
	
	psrld	xmm2, 0x1a
	paddd	xmm2, xmm1
	movaps	xmm1, xmm2
	psrld	xmm0, 0x19
	pand	xmm4, [.packedmask25252525]

	psrld	xmm1, 0x19
	movaps	xmm3, xmm1
	pslldq	xmm1, 0x8
	pand	xmm6, [.packedmask26262626]

	paddd	xmm5, xmm1
	psrldq	xmm3, 0x8
	paddd	xmm3, xmm9
	movaps	xmm8, xmm3
	movaps	xmm3, xmm5
	pand	xmm2, [.packedmask25252525]

	movaps	[rsp+0x140], xmm10
	movaps	[rsp+0x130], xmm13
	punpcklqdq xmm3, xmm4
	paddd	xmm0, xmm6
	pshufd	xmm12, xmm13, 0xfa
	movaps	xmm7, xmm3
	movaps	xmm3, xmm0
	pshufd	xmm9, xmm11, 0x50
	movaps	[rsp+0x150], xmm11
	movaps	[rsp+0x170], xmm7
	punpcklqdq xmm3, xmm2

	pshufd	xmm11, xmm11, 0xfa
	movaps	xmm1, xmm3
	movaps	xmm3, xmm5
	punpckhqdq xmm3, xmm4
	movaps	xmm6, [rsp+0x60]
	movaps	xmm4, [rsp+0x70]
	movaps	xmm5, xmm3
	movaps	xmm3, xmm0
	pshufd	xmm0, xmm6, 0x50
	pshufd	xmm6, xmm6, 0xfa
	punpckhqdq xmm3, xmm2
	pshufd	xmm2, xmm4, 0x50
	pshufd	xmm4, xmm4, 0xfa
	movaps	[rsp+0x190], xmm5
	movaps	[rsp+0x80], xmm3
	movaps	[rsp+0xa0], xmm6
	pshufd	xmm6, xmm5, 0x5
	movaps	[rsp+0x90], xmm0
	movaps	xmm0, xmm1
	movaps	xmm10, xmm6
	pshufd	xmm6, [rsp+0x190], 0xaf
	movaps	[rsp+0x1a0], xmm4
	movaps	[rsp+0x180], xmm0

	pshufd	xmm4, xmm7, 0x5
	pshufd	xmm3, xmm13, 0x50
	movaps	[rsp+0xd0], xmm2
	pshufd	xmm13, [rsp+0x80], 0xaf
	pshufd	xmm2, [rsp+0x170], 0xaf
	pshufd	xmm0, [rsp+0x180], 0xaf
	movaps	xmm5, [rsp+0x80]

	pshufd	xmm1, xmm1, 0x5
	pshufd	xmm7, xmm5, 0x5
	pshufd	xmm5, xmm8, 0x5
	movaps	[rsp], xmm6
	movaps	[rsp+0x20], xmm7
	movaps	xmm7, xmm15
	movaps	[rsp+0x40], xmm5
	movaps	[rsp+0x160], xmm8
	movaps	xmm5, xmm3
	pmuludq	xmm7, xmm4
	pshufd	xmm8, xmm8, 0xaf
	pmuludq	xmm5, xmm4
	movaps	[rsp+0xe0], xmm5
	movaps	[rsp+0x30], xmm8
	movaps	xmm5, xmm12
	movaps	xmm8, xmm3
	pmuludq	xmm5, xmm4
	movaps	[rsp+0x10], xmm10
	pmuludq	xmm8, xmm2
	paddq	xmm5, xmm8
	movaps	xmm8, xmm12
	movaps	[rsp+0xf0], xmm5

	pslld	xmm8, 1
	movaps	xmm5, xmm8
	movaps	xmm6, xmm8
	movaps	xmm8, xmm3
	pmuludq	xmm6, xmm2
	pmuludq	xmm8, xmm1
	paddq	xmm6, xmm7
	movaps	xmm7, xmm15
	paddq	xmm6, xmm8
	movaps	[rsp+0x100], xmm6
	
	movaps	xmm6, xmm14
	pmuludq	xmm7, xmm2
	movaps	xmm8, xmm3
	pmuludq	xmm6, xmm4
	paddq	xmm7, xmm6
	movaps	xmm6, xmm12
	pmuludq	xmm8, xmm0
	pmuludq	xmm6, xmm1
	paddq	xmm6, xmm7
	movaps	xmm7, xmm6
	paddq	xmm7, xmm8
	movaps	xmm8, xmm14
	movaps	[rsp+0x210], xmm7
	
	pslld	xmm8, 1
	movaps	xmm6, xmm8
	movaps	xmm8, xmm9
	movaps	xmm7, xmm6
	pmuludq	xmm8, xmm4
	pmuludq	xmm7, xmm2
	paddq	xmm7, xmm8
	movaps	xmm8, xmm15
	pmuludq	xmm8, xmm1
	paddq	xmm8, xmm7
	movaps	xmm7, xmm5
	pmuludq	xmm7, xmm0
	paddq	xmm7, xmm8
	movaps	xmm8, xmm10
	pmuludq	xmm8, xmm3
	paddq	xmm7, xmm8
	movaps	xmm8, xmm9
	movaps	[rsp+0x1e0], xmm7
	
	movaps	xmm7, xmm11
	pmuludq	xmm8, xmm2
	pmuludq	xmm7, xmm4
	paddq	xmm8, xmm7
	movaps	xmm7, xmm14
	pmuludq	xmm7, xmm1
	paddq	xmm7, xmm8
	movaps	xmm8, xmm15
	pmuludq	xmm8, xmm0
	paddq	xmm8, xmm7
	movaps	xmm7, xmm10
	pmuludq	xmm7, xmm12
	paddq	xmm7, xmm8
	movaps	xmm8, [rsp]
	movaps	xmm10, xmm7
	pmuludq	xmm8, xmm3
	paddq	xmm10, xmm8
	movaps	xmm8, xmm11
	movaps	[rsp+0x1f0], xmm10
	
	pslld	xmm8, 1
	movaps	xmm10, xmm8
	movaps	xmm7, xmm10
	pmuludq	xmm10, xmm0
	movaps	xmm8, [rsp+0x90]
	
	pmuludq	xmm7, xmm2
	pmuludq	xmm8, xmm4
	paddq	xmm7, xmm8
	movaps	xmm8, xmm9
	pmuludq	xmm8, xmm1
	paddq	xmm8, xmm7
	movaps	xmm7, xmm6
	pmuludq	xmm7, xmm0
	paddq	xmm7, xmm8
	movaps	xmm8, [rsp+0x10]
	pmuludq	xmm8, xmm15
	paddq	xmm8, xmm7
	movaps	xmm7, [rsp]
	pmuludq	xmm7, xmm5
	paddq	xmm7, xmm8
	movaps	xmm8, [rsp+0x20]
	pmuludq	xmm5, xmm13
	pmuludq	xmm8, xmm3
	paddq	xmm7, xmm8
	movaps	xmm8, [rsp+0x90]
	
	movaps	[rsp+0x200], xmm7
	movaps	xmm7, [rsp+0xa0]
	
	pmuludq	xmm8, xmm2
	
	pmuludq	xmm7, xmm4
	paddq	xmm8, xmm7
	movaps	xmm7, xmm11
	pmuludq	xmm7, xmm1
	paddq	xmm7, xmm8
	movaps	xmm8, xmm9
	pmuludq	xmm8, xmm0
	paddq	xmm8, xmm7
	movaps	xmm7, [rsp+0x10]
	pmuludq	xmm7, xmm14
	paddq	xmm7, xmm8
	movaps	xmm8, [rsp]
	pmuludq	xmm8, xmm15
	paddq	xmm8, xmm7
	movaps	xmm7, [rsp+0x20]
	pmuludq	xmm7, xmm12
	paddq	xmm7, xmm8
	movaps	xmm8, xmm3
	pmuludq	xmm8, xmm13
	paddq	xmm7, xmm8
	movaps	[rsp+0x220], xmm7
	
	movaps	xmm7, [rsp+0xa0]
	
	pslld	xmm7, 1
	movaps	xmm8, xmm7
	movaps	xmm7, [rsp+0xd0]
	
	pmuludq	xmm8, xmm2
	pmuludq	xmm7, xmm4
	paddq	xmm8, xmm7
	movaps	xmm7, [rsp+0x90]
	
	pmuludq	xmm4, [rsp+0x1a0]
	
	pmuludq	xmm7, xmm1
	paddq	xmm7, xmm8
	movaps	xmm8, [rsp+0x10]
	paddq	xmm10, xmm7
	movaps	xmm7, xmm8
	pmuludq	xmm7, xmm9
	paddq	xmm7, xmm10
	movaps	xmm10, [rsp]
	pmuludq	xmm6, xmm10
	paddq	xmm6, xmm7
	movaps	xmm7, [rsp+0x20]
	pmuludq	xmm7, xmm15
	paddq	xmm7, xmm6
	movaps	xmm6, [rsp+0x40]
	paddq	xmm5, xmm7
	movaps	xmm7, xmm6
	pmuludq	xmm7, xmm3
	paddq	xmm5, xmm7
	movaps	[rsp+0x230], xmm5
	
	movaps	xmm5, [rsp+0xd0]
	movaps	xmm7, [rsp+0x90]
	
	pmuludq	xmm5, xmm2
	paddq	xmm5, xmm4
	movaps	xmm4, [rsp+0xa0]
	
	pmuludq	xmm4, xmm1
	paddq	xmm4, xmm5
	movaps	xmm5, xmm7
	pmuludq	xmm5, xmm0
	paddq	xmm5, xmm4
	movaps	xmm4, xmm8
	pmuludq	xmm4, xmm11
	paddq	xmm4, xmm5
	movaps	xmm5, xmm10
	pmuludq	xmm5, xmm9
	paddq	xmm5, xmm4
	movaps	xmm4, [rsp+0x20]
	pmuludq	xmm4, xmm14
	paddq	xmm4, xmm5
	movaps	xmm5, xmm15
	pmuludq	xmm5, xmm13
	paddq	xmm5, xmm4
	movaps	xmm4, xmm6
	movaps	xmm6, xmm3
	pmuludq	xmm4, xmm12
	paddq	xmm4, xmm5
	pmuludq	xmm6, [rsp+0x30]
	paddq	xmm4, xmm6
	movaps	[rsp+0x240], xmm4
	
	pmuludq	xmm14, [.packednineteen]
	pmuludq	xmm15, [.packednineteen]
	pmuludq	xmm12, [.packednineteen]
	movaps	xmm3, xmm14
	movaps	xmm4, xmm7
	pslld	xmm12, 1
	pslld	xmm3, 1

	pmuludq	xmm9, [.packednineteen]
	pmuludq	xmm4, [.packednineteen]
	movaps	[rsp+0x1b0], xmm3
	
	movaps	xmm3, xmm11
	pmuludq	xmm12, [rsp+0x30]
	pmuludq	xmm14, [rsp+0x40]
	pmuludq	xmm3, [.packednineteen]
	movaps	xmm11, [rsp+0xa0]
	movaps	xmm6, [rsp+0x1a0]
	movaps	xmm10, xmm3
	pslld	xmm3, 1
	
	pmuludq	xmm11, [.packednineteen]
	pmuludq	xmm6, [.packednineteen]
	movaps	xmm8, xmm11
	movaps	[rsp+0x90], xmm3
	pslld	xmm11, 1
	movaps	xmm7, xmm11
	movaps	xmm11, xmm6
	
	pslld	xmm11, 1
	movaps	xmm5, xmm11
	pmuludq	xmm2, xmm5
	movaps	xmm11, [rsp+0x40]
	pmuludq	xmm11, xmm15
	paddq	xmm11, xmm12
	movaps	xmm12, [rsp+0x1b0]
	
	pmuludq	xmm15, [rsp+0x30]
	paddq	xmm14, xmm15
	movaps	xmm15, xmm9
	pmuludq	xmm12, xmm13
	paddq	xmm12, xmm11
	movaps	xmm11, [rsp+0x20]
	pmuludq	xmm15, xmm13
	pmuludq	xmm11, xmm9
	paddq	xmm11, xmm12

	movaps	xmm12, [rsp+0x90]
	pmuludq	xmm12, [rsp]
	paddq	xmm12, xmm11

	movaps	xmm11, [rsp+0x10]
	movaps	xmm3, [rsp+0xd0]
	
	pmuludq	xmm11, xmm4
	paddq	xmm11, xmm12
	movaps	xmm12, xmm7
	pmuludq	xmm3, [.packednineteen]
	pmuludq	xmm12, xmm0
	paddq	xmm12, xmm11
	movaps	xmm11, xmm3
	pmuludq	xmm11, xmm1
	paddq	xmm11, xmm12
	paddq	xmm2, xmm11
	paddq	xmm2, [rsp+0xe0]
	
	pmuludq	xmm1, xmm6

	movaps	[rsp+0xe0], xmm2
	movaps	xmm2, xmm15
	paddq	xmm2, xmm14
	movaps	xmm15, [rsp+0x20]
	movaps	xmm12, [rsp]
	movaps	xmm14, [rsp+0x10]
	movaps	xmm11, xmm15
	pmuludq	xmm11, xmm10
	paddq	xmm11, xmm2
	movaps	xmm2, xmm12

	pmuludq	xmm2, xmm4
	paddq	xmm2, xmm11
	movaps	xmm11, xmm14
	pmuludq	xmm11, xmm8
	paddq	xmm11, xmm2
	movaps	xmm2, xmm3
	pmuludq	xmm2, xmm0
	paddq	xmm2, xmm11
	movaps	xmm11, [rsp+0x40]
	paddq	xmm1, xmm2
	paddq	xmm1, [rsp+0xf0]
	
	pmuludq	xmm0, xmm5
	movaps	[rsp+0xf0], xmm1
	
	pmuludq	xmm10, xmm11
	movaps	xmm1, [rsp+0x1b0]
	
	pmuludq	xmm1, [rsp+0x30]
	movaps	xmm2, xmm1
	movaps	xmm1, xmm11
	pmuludq	xmm1, xmm9
	paddq	xmm1, xmm2
	movaps	xmm2, [rsp+0x90]
	
	pmuludq	xmm2, xmm13
	paddq	xmm2, xmm1
	movaps	xmm1, xmm15
	pmuludq	xmm1, xmm4
	paddq	xmm1, xmm2
	movaps	xmm2, xmm12
	pmuludq	xmm2, xmm7
	paddq	xmm2, xmm1
	movaps	xmm1, xmm14
	pmuludq	xmm1, xmm3
	paddq	xmm1, xmm2
	paddq	xmm0, xmm1
	movaps	xmm1, xmm4
	paddq	xmm0, [rsp+0x100]
	
	pmuludq	xmm9, [rsp+0x30]
	movaps	[rsp+0x100], xmm0
	
	paddq	xmm10, xmm9
	pmuludq	xmm1, xmm13
	movaps	xmm9, xmm12
	movaps	xmm0, xmm1
	movaps	xmm1, xmm15
	pmuludq	xmm9, xmm3
	movaps	xmm2, xmm12
	paddq	xmm0, xmm10
	pmuludq	xmm1, xmm8
	paddq	xmm1, xmm0
	movaps	xmm0, xmm9
	pmuludq	xmm8, xmm11
	movaps	xmm9, xmm11
	movaps	xmm12, [rsp+0x30]
	paddq	xmm0, xmm1
	movaps	xmm1, xmm14
	pmuludq	xmm9, xmm4
	movaps	xmm10, xmm15
	pmuludq	xmm4, xmm12
	pmuludq	xmm1, xmm6
	paddq	xmm1, xmm0
	movaps	xmm0, xmm9
	paddq	xmm1, [rsp+0x210]
	
	movaps	xmm14, xmm1
	movaps	xmm1, [rsp+0x90]
	
	movaps	xmm9, xmm15
	paddq	xmm8, xmm4
	pmuludq	xmm1, xmm12
	pmuludq	xmm9, xmm3
	movaps	xmm4, xmm12
	paddq	xmm0, xmm1
	movaps	xmm1, xmm7
	pmuludq	xmm7, xmm12
	pmuludq	xmm1, xmm13
	movaps	xmm12, [rsp+0x200]
	paddq	xmm1, xmm0
	movaps	xmm0, xmm9
	movaps	xmm9, [rsp+0x1f0]
	
	paddq	xmm0, xmm1
	movaps	xmm1, xmm2
	movaps	xmm2, xmm11
	
	pmuludq	xmm1, xmm5
	paddq	xmm1, xmm0
	paddq	xmm1, [rsp+0x1e0]
	
	movaps	xmm15, xmm1
	movaps	xmm1, xmm3
	pmuludq	xmm1, xmm13
	movaps	xmm0, xmm1
	movaps	xmm1, xmm10
	paddq	xmm0, xmm8
	pmuludq	xmm1, xmm6
	paddq	xmm1, xmm0
	paddq	xmm9, xmm1
	movaps	xmm1, xmm11
	pmuludq	xmm6, xmm2
	pmuludq	xmm1, xmm3
	movaps	xmm0, xmm1
	pmuludq	xmm3, xmm4
	movaps	xmm1, xmm13
	paddq	xmm6, xmm3
	paddq	xmm0, xmm7
	paddq	xmm6, [rsp+0x220]
	pmuludq	xmm1, xmm5
	movaps	xmm7, xmm6
	paddq	xmm1, xmm0
	paddq	xmm12, xmm1
	pmuludq	xmm5, xmm4
	movaps	xmm11, xmm12
	movaps	xmm12, [rsp+0x230]
	
	paddq	xmm12, xmm5
	movaps	xmm4, [rsp+0xe0]
	movaps	xmm10, xmm12
	movaps	xmm12, xmm15
	
	psrlq	xmm12, 0x1a
	movaps	xmm0, xmm12
	movaps	xmm12, [.packedmask26]
	movaps	xmm1, xmm4
	pand	xmm4, xmm12
	psrlq	xmm1, 0x1a
	paddq	xmm1, [rsp+0xf0]
	
	pand	xmm12, xmm15
	movaps	xmm15, xmm1
	pand	xmm1, [.packedmask25]
	movaps	xmm2, xmm12
	psrlq	xmm15, 0x19
	movaps	xmm12, xmm0
	movaps	xmm5, xmm15
	movaps	xmm15, [.packedmask25]
	paddq	xmm12, xmm9
	movaps	xmm3, xmm12
	pand	xmm15, xmm12
	movaps	xmm12, [rsp+0x100]
	
	psrlq	xmm3, 0x19
	paddq	xmm3, xmm11
	paddq	xmm12, xmm5
	movaps	xmm0, xmm15
	movaps	xmm6, xmm12
	movaps	xmm15, xmm3
	psrlq	xmm3, 0x1a
	movaps	xmm9, xmm12
	psrlq	xmm6, 0x1a
	paddq	xmm6, xmm14
	pand	xmm15, [.packedmask26]
	movaps	xmm12, xmm6
	paddq	xmm3, xmm7
	movaps	xmm11, xmm3
	psrlq	xmm12, 0x19
	movaps	xmm7, xmm12
	movaps	xmm12, [.packedmask25]
	pand	xmm9, [.packedmask26]
	paddq	xmm2, xmm7
	movaps	xmm5, xmm15
	movaps	xmm15, xmm6
	movaps	xmm6, xmm3
	pand	xmm12, xmm15
	psrlq	xmm6, 0x19
	paddq	xmm6, xmm10
	movaps	xmm15, xmm6
	movaps	xmm3, xmm12
	psrlq	xmm15, 0x1a
	movaps	xmm12, [.packedmask25]
	movaps	xmm7, xmm15
	pand	xmm12, xmm11
	movaps	xmm15, [.packedmask26]
	pand	xmm15, xmm6
	movaps	xmm8, xmm12
	movaps	xmm12, [rsp+0x240]
	
	movaps	xmm6, xmm15
	paddq	xmm12, xmm7
	movaps	xmm15, xmm12
	psrlq	xmm15, 0x19
	movaps	xmm7, xmm15
	movaps	xmm15, [.packedmask25]
	pmuludq	xmm7, [.packednineteen]
	paddq	xmm4, xmm7
	movaps	xmm11, xmm4
	pand	xmm4, [.packedmask26]
	psrlq	xmm11, 0x1a
	paddq	xmm1, xmm11
	pand	xmm15, xmm12
	movaps	xmm12, xmm2
	pand	xmm2, [.packedmask26]
	psrlq	xmm12, 0x1a
	paddq	xmm0, xmm12
	movaps	xmm12, xmm4
	punpckhqdq xmm4, xmm1
	movaps	xmm10, xmm15
	punpcklqdq xmm12, xmm1
	movaps	xmm1, xmm9
	movaps	xmm15, xmm5
	movaps	xmm7, xmm4
	punpcklqdq xmm1, xmm3
	movaps	xmm4, xmm12
	paddd	xmm12, [.packed32zeromodp0]
	punpcklqdq xmm15, xmm8
	psubd	xmm12, xmm7
	paddd	xmm4, xmm7
	movaps	xmm11, xmm1
	movaps	xmm1, xmm9
	movaps	xmm9, xmm2
	punpckhqdq xmm2, xmm0
	punpcklqdq xmm9, xmm0
	movaps	xmm0, xmm15
	movaps	xmm15, xmm5
	punpckhqdq xmm1, xmm3
	movaps	xmm3, xmm6
	punpckhqdq xmm15, xmm8
	punpcklqdq xmm3, xmm10
	punpckhqdq xmm6, xmm10
	movaps	xmm13, xmm15
	movaps	xmm15, xmm11
	paddd	xmm11, [.packed32zeromodp1]
	psubd	xmm11, xmm1
	paddd	xmm15, xmm1
	movaps	xmm1, xmm12
	movaps	xmm5, xmm15
	movaps	xmm15, xmm9
	paddd	xmm9, [.packed32zeromodp1]
	psubd	xmm9, xmm2
	punpcklqdq xmm1, xmm9
	paddd	xmm15, xmm2
	movaps	xmm10, xmm15
	movaps	xmm15, xmm0
	paddd	xmm0, [.packed32zeromodp1]
	psubd	xmm0, xmm13
	movaps	xmm14, xmm6
	movaps	xmm7, xmm1
	movaps	xmm1, xmm11
	paddd	xmm15, xmm13
	movaps	xmm6, xmm3
	paddd	xmm3, [.packed32zeromodp1]
	psubd	xmm3, xmm14
	punpcklqdq xmm1, xmm0
	movaps	xmm8, xmm15
	paddd	xmm6, xmm14
	punpckhqdq xmm12, xmm9
	punpckhqdq xmm11, xmm0
	movaps	xmm15, [.packedmask26262626]
	movaps	xmm14, xmm1
	movaps	xmm1, xmm7
	pand	xmm15, xmm7
	movaps	xmm9, [.packedmask25252525]
	psrld	xmm1, 0x1a
	movaps	xmm2, xmm1
	movaps	xmm1, xmm14
	pand	xmm14, [.packedmask26262626]
	paddd	xmm2, xmm12
	movaps	xmm12, xmm2
	psrld	xmm1, 0x1a
	paddd	xmm1, xmm11
	pand	xmm9, xmm2
	movaps	xmm13, xmm6
	psrld	xmm12, 0x19
	movaps	xmm0, xmm12
	movaps	xmm12, xmm1
	pand	xmm1, [.packedmask25252525]
	paddd	xmm0, xmm14
	movaps	xmm11, xmm0
	psrld	xmm12, 0x19
	movaps	xmm7, xmm12
	psrldq	xmm12, 0x8
	paddd	xmm3, xmm12
	punpcklqdq xmm13, xmm3
	movaps	xmm12, xmm7
	punpcklqdq xmm11, xmm1
	pslldq	xmm12, 0x8
	movaps	xmm2, xmm12
	paddd	xmm2, xmm15
	movaps	xmm12, xmm2
	punpckhqdq xmm2, xmm9
	movaps	xmm15, xmm8
	punpcklqdq xmm12, xmm9
	movaps	xmm9, xmm0
	punpckhqdq xmm9, xmm1
	movaps	xmm1, xmm4
	punpckhqdq xmm4, xmm12
	punpcklqdq xmm1, xmm12
	movaps	xmm12, xmm5
	punpckhqdq xmm5, xmm11
	punpcklqdq xmm15, xmm9
	punpcklqdq xmm12, xmm11
	movaps	xmm11, xmm8
	punpckhqdq xmm11, xmm9
	movaps	xmm7, xmm12
	movaps	xmm12, xmm10
	movaps	xmm9, xmm6
	punpcklqdq xmm12, xmm2
	punpckhqdq xmm10, xmm2
	movaps	xmm2, xmm15
	punpckhqdq xmm9, xmm3
	movaps	xmm0, xmm12
	movaps	xmm12, xmm1
	movaps	xmm3, xmm4
	pmuludq	xmm12, xmm1
	pslld	xmm1, 1
	movaps	xmm15, xmm1
	movaps	xmm6, xmm1
	pslld	xmm3, 1
	pmuludq	xmm15, xmm4
	pmuludq	xmm4, xmm3
	movaps	[rsp], xmm9
	pmuludq	xmm6, xmm7
	paddq	xmm4, xmm6
	movaps	xmm6, xmm1
	movaps	[rsp+0x1a0], xmm4
	
	pmuludq	xmm6, xmm5
	movaps	xmm4, xmm3
	movaps	xmm8, xmm6
	movaps	xmm9, xmm7
	pmuludq	xmm4, xmm7
	movaps	xmm6, xmm3
	paddq	xmm8, xmm4
	movaps	xmm4, xmm5
	pmuludq	xmm9, xmm7
	pslld	xmm7, 1
	movaps	[rsp+0x20], xmm12
	pslld	xmm4, 1
	pmuludq	xmm6, xmm4
	paddq	xmm6, xmm9
	movaps	xmm9, xmm1
	movaps	[rsp+0x30], xmm15
	pmuludq	xmm9, xmm0
	paddq	xmm6, xmm9
	movaps	xmm9, xmm7
	movaps	[rsp+0xf0], xmm6
	
	movaps	xmm6, xmm3
	pmuludq	xmm9, xmm5
	pmuludq	xmm5, xmm4
	movaps	xmm12, xmm10
	movaps	xmm15, xmm1
	pmuludq	xmm6, xmm0
	paddq	xmm6, xmm9
	movaps	xmm9, xmm1
	pslld	xmm12, 1
	pmuludq	xmm15, xmm2
	movaps	[rsp+0x40], xmm13
	pmuludq	xmm9, xmm10
	paddq	xmm6, xmm9
	movaps	xmm9, xmm7
	movaps	[rsp+0x100], xmm6
	
	pmuludq	xmm9, xmm0
	paddq	xmm9, xmm5
	movaps	xmm5, xmm3
	movaps	xmm6, xmm12
	pmuludq	xmm5, xmm12
	movaps	xmm12, xmm1
	paddq	xmm5, xmm9
	movaps	xmm9, xmm7
	paddq	xmm15, xmm5
	movaps	xmm5, xmm4
	pmuludq	xmm12, xmm11
	movaps	xmm14, xmm12
	pmuludq	xmm5, xmm0
	pmuludq	xmm9, xmm10
	paddq	xmm9, xmm5
	movaps	xmm5, xmm3
	movaps	xmm12, xmm11
	pmuludq	xmm5, xmm2
	paddq	xmm5, xmm9
	movaps	xmm9, xmm0
	paddq	xmm14, xmm5
	movaps	xmm5, xmm4
	pslld	xmm12, 1
	pmuludq	xmm9, xmm0
	movaps	[rsp+0xa0], xmm12
	
	pmuludq	xmm5, xmm6
	paddq	xmm5, xmm9
	movaps	xmm9, xmm7
	pmuludq	xmm9, xmm2
	paddq	xmm9, xmm5
	movaps	xmm5, xmm12
	pmuludq	xmm5, xmm3
	paddq	xmm5, xmm9
	movaps	xmm9, xmm13
	pmuludq	xmm9, xmm1
	movaps	xmm12, xmm9
	movaps	xmm9, xmm0
	paddq	xmm12, xmm5
	movaps	xmm5, xmm4
	pmuludq	xmm9, xmm6
	pmuludq	xmm5, xmm2
	paddq	xmm5, xmm9
	movaps	xmm9, xmm7
	pmuludq	xmm9, xmm11
	paddq	xmm9, xmm5
	movaps	xmm5, xmm13
	pmuludq	xmm5, xmm3
	paddq	xmm5, xmm9
	movaps	xmm9, xmm1
	movaps	xmm1, [rsp]
	pmuludq	xmm9, xmm1
	paddq	xmm5, xmm9
	movaps	xmm9, xmm1
	movaps	[rsp+0x1b0], xmm5
	
	movaps	xmm5, xmm10
	pmuludq	xmm9, [.packedthirtyeight]
	movaps	xmm1, xmm9
	movaps	xmm9, xmm0
	pmuludq	xmm5, [.packedthirtyeight]
	movaps	[rsp+0xd0], xmm5
	
	movaps	xmm5, xmm2
	pslld	xmm9, 1
	pmuludq	xmm3, xmm1
	movaps	[rsp+0x90], xmm9
	
	pmuludq	xmm5, [.packednineteen]
	movaps	xmm13, xmm5
	movaps	xmm5, xmm11
	movaps	xmm9, xmm2
	pmuludq	xmm5, [.packedthirtyeight]
	movaps	[rsp+0x10], xmm5
	pslld	xmm9, 1
	movaps	[rsp+0xe0], xmm9
	
	movaps	xmm9, xmm10
	pmuludq	xmm9, [rsp+0xd0]
	
	movaps	xmm10, [rsp+0x90]
	
	pmuludq	xmm10, xmm13
	paddq	xmm10, xmm9
	movaps	xmm9, [rsp+0x10]
	pmuludq	xmm9, xmm4
	paddq	xmm10, xmm9
	movaps	[rsp+0xd0], xmm10
	
	movaps	xmm5, [rsp+0x40]
	movaps	xmm10, [rsp+0xd0]
	
	pmuludq	xmm5, [.packednineteen]
	movaps	xmm9, xmm5
	pmuludq	xmm9, xmm7
	paddq	xmm10, xmm9
	movaps	xmm9, xmm13
	paddq	xmm3, xmm10
	paddq	xmm3, [rsp+0x20]
	movaps	[rsp+0x20], xmm3
	pmuludq	xmm9, xmm6
	movaps	xmm3, xmm9
	psrld	xmm7, 1
	pmuludq	xmm7, xmm1
	movaps	xmm9, [rsp+0x10]
	pmuludq	xmm9, xmm0
	movaps	xmm10, xmm9
	pmuludq	xmm0, xmm1
	movaps	xmm9, xmm5
	paddq	xmm10, xmm3
	pmuludq	xmm9, xmm4
	movaps	xmm3, xmm9
	pmuludq	xmm4, xmm1
	movaps	xmm9, xmm13
	paddq	xmm3, xmm10
	movaps	xmm10, [rsp+0x10]
	paddq	xmm7, xmm3
	pmuludq	xmm9, xmm2
	paddq	xmm7, [rsp+0x30]
	movaps	[rsp+0x30], xmm7
	movaps	xmm7, xmm9
	movaps	xmm9, xmm10
	pmuludq	xmm9, xmm6
	movaps	xmm3, xmm9
	movaps	xmm9, [rsp+0x90]
	
	paddq	xmm3, xmm7
	pmuludq	xmm9, xmm5
	movaps	xmm13, xmm9
	movaps	xmm9, [rsp+0x1a0]
	
	paddq	xmm13, xmm3
	paddq	xmm4, xmm13
	paddq	xmm9, xmm4
	movaps	xmm7, xmm9
	movaps	xmm9, xmm10
	movaps	xmm13, [rsp+0x40]
	pmuludq	xmm9, xmm2
	movaps	xmm4, xmm9
	pmuludq	xmm2, xmm1
	movaps	xmm9, xmm5
	pmuludq	xmm9, xmm6
	movaps	xmm3, xmm9
	pmuludq	xmm6, xmm1
	movaps	xmm9, xmm10
	paddq	xmm3, xmm4
	movaps	xmm4, [rsp+0xa0]
	
	paddq	xmm0, xmm3
	pmuludq	xmm9, xmm11
	movaps	xmm3, xmm9
	paddq	xmm8, xmm0
	movaps	xmm9, [rsp+0xe0]
	
	movaps	xmm11, xmm4
	pmuludq	xmm9, xmm5
	movaps	xmm0, xmm9
	pmuludq	xmm11, xmm1
	movaps	xmm9, xmm4
	paddq	xmm0, xmm3
	movaps	xmm3, [rsp+0x20]
	paddq	xmm6, xmm0
	pmuludq	xmm9, xmm5
	paddq	xmm2, xmm9
	movaps	xmm9, xmm13
	paddq	xmm6, [rsp+0xf0]
	
	movaps	xmm4, xmm6
	pmuludq	xmm5, xmm13
	pmuludq	xmm9, xmm1
	pmuludq	xmm1, [rsp]
	paddq	xmm12, xmm1
	movaps	xmm1, xmm3
	psrlq	xmm4, 0x1a
	paddq	xmm2, [rsp+0x100]
	
	pand	xmm3, [.packedmask26]
	paddq	xmm11, xmm5
	paddq	xmm15, xmm11
	movaps	xmm11, [.packedmask26]
	psrlq	xmm1, 0x1a
	paddq	xmm1, [rsp+0x30]
	paddq	xmm14, xmm9
	pand	xmm11, xmm6
	movaps	xmm13, xmm3
	movaps	xmm3, xmm4
	paddq	xmm3, xmm2
	movaps	xmm4, xmm3
	movaps	xmm3, xmm1
	psrlq	xmm1, 0x19
	paddq	xmm1, xmm7
	movaps	xmm6, xmm4
	pand	xmm3, [.packedmask25]
	psrlq	xmm6, 0x19
	paddq	xmm15, xmm6
	movaps	xmm9, xmm15
	pand	xmm15, [.packedmask26]
	psrlq	xmm9, 0x1a
	paddq	xmm14, xmm9
	movaps	xmm6, [.packedmask25]
	movaps	xmm2, xmm3
	movaps	xmm3, [.packedmask25]
	movaps	xmm5, xmm15
	movaps	xmm15, xmm14
	pand	xmm3, xmm4
	psrlq	xmm15, 0x19
	movaps	xmm4, xmm1
	movaps	xmm0, xmm15
	psrlq	xmm1, 0x1a
	paddq	xmm8, xmm1
	movaps	xmm1, xmm8
	paddq	xmm12, xmm0
	pand	xmm4, [.packedmask26]
	psrlq	xmm1, 0x19
	paddq	xmm11, xmm1
	movaps	xmm1, xmm12
	pand	xmm12, [.packedmask26]
	psrlq	xmm1, 0x1a
	movaps	xmm0, xmm1
	movaps	xmm15, [.packedmask25]
	pand	xmm6, xmm8
	paddq	xmm0, [rsp+0x1b0]
	
	movaps	xmm1, xmm0
	movaps	xmm10, xmm0
	pand	xmm15, xmm14
	psrlq	xmm1, 0x19
	movaps	xmm0, xmm1
	movaps	xmm8, xmm12
	movaps	xmm12, xmm11
	pmuludq	xmm0, [.packednineteen]
	paddq	xmm13, xmm0
	movaps	xmm9, xmm13
	pand	xmm13, [.packedmask26]
	psrlq	xmm12, 0x1a
	paddq	xmm3, xmm12
	psrlq	xmm9, 0x1a
	movaps	xmm12, xmm4
	paddq	xmm2, xmm9
	pand	xmm11, [.packedmask26]
	movaps	xmm7, xmm15
	movaps	xmm1, [.packedmask25]
	punpckldq xmm12, xmm6
	movaps	xmm9, xmm13
	punpckhdq xmm4, xmm6
	punpckhdq xmm13, xmm2
	punpckldq xmm9, xmm2
	movaps	xmm0, xmm12
	pand	xmm1, xmm10
	punpcklqdq xmm13, xmm4
	movaps	xmm12, xmm9
	movaps	xmm9, xmm5
	punpckhdq xmm5, xmm7
	punpckldq xmm9, xmm15
	movaps	xmm15, xmm8
	punpcklqdq xmm12, xmm0
	pshufd	xmm6, xmm13, 0xfa
	movaps	xmm0, xmm9
	movaps	xmm9, xmm11
	punpckhdq xmm11, xmm3
	punpckldq xmm9, xmm3
	movaps	[rsp+0x1e0], xmm12
	
	punpcklqdq xmm11, xmm5
	pshufd	xmm5, xmm13, 0xd8
	punpckldq xmm15, xmm1
	movaps	xmm3, [rsp+0x260]
	
	punpcklqdq xmm9, xmm0
	punpckhdq xmm8, xmm1
	movaps	xmm0, [rsp+0xc0]
	
	movaps	xmm1, xmm3
	pshufd	xmm7, xmm11, 0xd8
	pmuludq	xmm0, xmm5
	pmuludq	xmm1, xmm6
	movaps	xmm2, xmm3
	paddq	xmm1, xmm0
	movaps	xmm0, [rsp+0xc0]
	
	movaps	xmm4, xmm3
	pmuludq	xmm2, xmm7
	pshufd	xmm12, xmm11, 0xa5
	movaps	[rsp+0x1f0], xmm9
	
	pmuludq	xmm0, xmm6
	paddq	xmm2, xmm0
	pshufd	xmm9, xmm11, 0xfa
	movaps	[rsp+0x210], xmm15
	
	pshufd	xmm14, xmm13, 0xa5
	psrldq	xmm13, 0xc
	movaps	[rsp+0x220], xmm15
	
	pmuludq	xmm4, xmm9
	movaps	xmm15, xmm3
	punpcklqdq xmm13, xmm11
	psrldq	xmm11, 0xc
	punpcklqdq xmm11, xmm8
	pmuludq	xmm15, xmm5
	movaps	xmm0, [rsp+0xc0]
	
	pshufd	xmm8, xmm8, 0xd8
	pmuludq	xmm0, xmm7
	paddq	xmm4, xmm0
	movaps	xmm0, xmm3
	movaps	xmm3, [rsp+0x120]
	
	pshufd	xmm10, xmm8, 0xfa
	pmuludq	xmm0, xmm8
	pmuludq	xmm3, xmm5
	paddq	xmm2, xmm3
	movaps	xmm3, [rsp+0x120]
	
	pmuludq	xmm3, xmm6
	paddq	xmm4, xmm3
	movaps	xmm3, [rsp+0xc0]
	
	pmuludq	xmm3, xmm9
	paddq	xmm0, xmm3
	movaps	xmm3, [rsp+0x1d0]
	
	pmuludq	xmm3, xmm5
	paddq	xmm4, xmm3
	movaps	xmm3, [rsp+0x120]
	
	pmuludq	xmm3, xmm7
	paddq	xmm0, xmm3
	movaps	xmm3, [rsp+0x1d0]
	
	pmuludq	xmm3, xmm6
	paddq	xmm0, xmm3
	movaps	xmm3, [rsp+0x270]
	
	pmuludq	xmm3, xmm5
	paddq	xmm0, xmm3
	movaps	xmm3, xmm4
	pslldq	xmm0, 0x8
	pslldq	xmm4, 0x8
	punpckhqdq xmm3, xmm0
	movaps	xmm0, xmm2
	pslldq	xmm2, 0x8
	punpckhqdq xmm0, xmm4
	movaps	xmm4, xmm0
	movaps	xmm0, xmm1
	pslldq	xmm1, 0x8
	punpckhqdq xmm0, xmm2
	movaps	xmm2, xmm0
	movaps	xmm0, xmm15
	pslldq	xmm15, 0x8
	punpckhqdq xmm0, xmm1
	movaps	xmm1, xmm0
	pxor	xmm0, xmm0
	punpckhqdq xmm0, xmm15
	movaps	xmm15, [rsp+0x50]
	pmuludq	xmm15, xmm5
	paddq	xmm0, xmm15
	movaps	xmm15, [rsp+0x50]
	pmuludq	xmm15, xmm6
	paddq	xmm1, xmm15
	movaps	xmm15, [rsp+0x50]
	pmuludq	xmm15, xmm7
	paddq	xmm2, xmm15
	movaps	xmm15, [rsp+0x50]
	pmuludq	xmm15, xmm9
	paddq	xmm4, xmm15
	movaps	xmm15, [rsp+0xb0]
	
	pmuludq	xmm15, xmm5
	paddq	xmm1, xmm15
	movaps	xmm15, [rsp+0xb0]
	
	pmuludq	xmm15, xmm6
	paddq	xmm2, xmm15
	movaps	xmm15, [rsp+0x110]
	
	pmuludq	xmm15, xmm6
	paddq	xmm4, xmm15
	movaps	xmm15, [rsp+0x50]
	pmuludq	xmm15, xmm8
	paddq	xmm3, xmm15
	movaps	xmm15, [rsp+0x110]
	
	pmuludq	xmm15, xmm5
	paddq	xmm2, xmm15
	movaps	xmm15, [rsp+0xb0]
	
	pmuludq	xmm15, xmm7
	paddq	xmm4, xmm15
	movaps	xmm15, [rsp+0xb0]
	
	pmuludq	xmm15, xmm9
	paddq	xmm3, xmm15
	movaps	xmm15, [rsp+0x1c0]
	
	pmuludq	xmm15, xmm5
	paddq	xmm4, xmm15
	movaps	xmm15, [rsp+0x110]
	
	pmuludq	xmm5, [rsp+0x250]
	
	pmuludq	xmm15, xmm7
	paddq	xmm3, xmm15
	movaps	xmm15, [rsp+0x1c0]
	
	pmuludq	xmm15, xmm6
	paddq	xmm3, xmm15
	movaps	xmm15, [rsp+0x280]
	
	paddq	xmm3, xmm5
	movaps	xmm5, xmm15
	pmuludq	xmm14, xmm15
	paddq	xmm0, xmm14
	movaps	xmm14, [rsp+0x290]
	
	pmuludq	xmm5, xmm13
	paddq	xmm1, xmm5
	movaps	xmm5, xmm15
	pmuludq	xmm13, xmm14
	paddq	xmm0, xmm13
	pmuludq	xmm5, xmm12
	paddq	xmm2, xmm5
	movaps	xmm5, xmm15
	pmuludq	xmm5, xmm11
	paddq	xmm4, xmm5
	movaps	xmm5, xmm14
	pmuludq	xmm5, xmm12
	paddq	xmm1, xmm5
	movaps	xmm5, xmm14
	pmuludq	xmm5, xmm11
	paddq	xmm2, xmm5
	movaps	xmm5, xmm14
	movaps	xmm14, [rsp+0x2a0]
	
	pmuludq	xmm5, xmm10
	paddq	xmm4, xmm5
	pmuludq	xmm12, xmm14
	paddq	xmm0, xmm12
	movaps	xmm12, xmm14
	movaps	xmm5, [rsp+0x2c0]
	
	pmuludq	xmm12, xmm11
	paddq	xmm1, xmm12
	movaps	xmm12, xmm14
	movaps	xmm14, [rsp+0x2b0]
	
	pmuludq	xmm11, xmm5
	paddq	xmm0, xmm11
	pmuludq	xmm12, xmm10
	paddq	xmm2, xmm12
	movaps	xmm12, xmm14
	pmuludq	xmm6, xmm14
	paddq	xmm0, xmm6
	movaps	xmm11, [rsp+0x2d0]
	
	pmuludq	xmm12, xmm8
	paddq	xmm4, xmm12
	movaps	xmm12, xmm5
	movaps	xmm6, [rsp+0x2e0]
	
	pmuludq	xmm12, xmm10
	paddq	xmm1, xmm12
	movaps	xmm12, xmm11
	pmuludq	xmm12, xmm8
	paddq	xmm2, xmm12
	movaps	xmm12, xmm15
	pmuludq	xmm12, xmm10
	paddq	xmm3, xmm12
	movaps	xmm12, xmm14
	pmuludq	xmm10, [rsp+0x300]
	
	pmuludq	xmm12, xmm7
	paddq	xmm1, xmm12
	movaps	xmm12, xmm14
	pmuludq	xmm7, xmm11
	paddq	xmm0, xmm7
	pmuludq	xmm12, xmm9
	paddq	xmm2, xmm12
	movaps	xmm12, xmm11
	pmuludq	xmm12, xmm9
	pmuludq	xmm9, xmm6
	paddq	xmm0, xmm9
	movaps	xmm9, xmm6
	paddq	xmm1, xmm12
	pmuludq	xmm9, xmm8
	pmuludq	xmm8, [rsp+0x2f0]
	
	paddq	xmm0, xmm8
	paddq	xmm0, xmm10
	movaps	xmm5, xmm0
	paddq	xmm1, xmm9
	movaps	xmm6, xmm1
	punpckhqdq xmm1, xmm4
	punpcklqdq xmm5, xmm2
	punpcklqdq xmm6, xmm4
	punpckhqdq xmm0, xmm2
	movaps	xmm2, xmm3
	movaps	xmm4, xmm5
	punpcklqdq xmm2, xmm3
	psrlq	xmm4, 0x1a
	movaps	xmm7, xmm4
	movaps	xmm4, xmm6
	pand	xmm6, [.packedmask26]
	paddq	xmm0, xmm7
	psrlq	xmm4, 0x1a
	paddq	xmm1, xmm4
	movaps	xmm4, xmm0
	punpckhqdq xmm3, xmm3
	psrlq	xmm4, 0x19
	movaps	xmm7, xmm4
	movaps	xmm4, xmm1
	pand	xmm5, [.packedmask26]
	paddq	xmm6, xmm7
	psrlq	xmm4, 0x19
	paddq	xmm2, xmm4
	movaps	xmm7, xmm2
	pslldq	xmm4, 0x8
	pand	xmm1, [.packedmask25]
	psrlq	xmm7, 0x1a
	paddq	xmm3, xmm7
	movaps	xmm7, xmm3
	pand	xmm0, [.packedmask25]
	psrlq	xmm7, 0x19
	pmuludq	xmm7, [.packednineteen]
	punpckhqdq xmm7, xmm4
	pand	xmm3, [.packedmask25]
	paddq	xmm5, xmm7
	movaps	xmm4, xmm5
	pand	xmm5, [.packedmask26]
	psrlq	xmm4, 0x1a
	movaps	xmm7, xmm4
	movaps	xmm4, xmm6
	pand	xmm6, [.packedmask26]
	paddq	xmm0, xmm7
	psrlq	xmm4, 0x1a
	paddq	xmm1, xmm4
	pand	xmm2, [.packedmask26]
	movaps	xmm4, xmm5
	punpckhdq xmm5, xmm0
	punpckldq xmm4, xmm0
	movaps	xmm0, xmm6
	punpckhdq xmm6, xmm1
	punpckldq xmm0, xmm1
	movaps	xmm1, xmm5
	punpckhdq xmm2, xmm3
	punpcklqdq xmm1, xmm6
	punpcklqdq xmm4, xmm0
	movaps	[rsp+0x200], xmm4
	
	movaps	[rsp+0x1a0], xmm1
	
	movaps	[rsp+0x1b0], xmm2
	
	movaps	xmm13, [rsp+0x130]
	
	movaps	xmm7, [rsp+0x170]
	
	movaps	xmm1, xmm13
	movaps	xmm10, [rsp+0x140]
	
	punpcklqdq xmm1, xmm7
	movaps	xmm0, [rsp+0x180]
	
	movaps	xmm4, xmm1
	movaps	xmm1, xmm13
	movaps	xmm14, [rsp+0x150]
	
	punpckhqdq xmm1, xmm7
	movaps	xmm7, xmm10
	movaps	xmm11, [rsp+0x80]
	
	movaps	xmm5, xmm1
	movaps	xmm1, xmm10
	punpcklqdq xmm7, xmm0
	movaps	xmm10, xmm14
	punpckhqdq xmm1, xmm0
	movaps	xmm0, xmm14
	movaps	xmm14, [rsp+0x60]
	movaps	xmm9, xmm7
	movaps	xmm3, xmm14
	movaps	xmm13, [rsp+0x70]
	pmuludq	xmm9, xmm7
	punpckhqdq xmm3, xmm11
	movaps	xmm2, [rsp+0x190]
	
	movaps	xmm12, xmm3
	movaps	xmm8, [rsp+0x160]
	
	movaps	xmm3, xmm13
	punpcklqdq xmm0, xmm2
	punpckhqdq xmm10, xmm2
	movaps	xmm2, xmm14
	movaps	xmm14, xmm4
	punpcklqdq xmm3, xmm8
	punpcklqdq xmm2, xmm11
	pmuludq	xmm14, xmm4
	pslld	xmm4, 1
	punpckhqdq xmm13, xmm8
	movaps	xmm11, xmm4
	movaps	xmm8, xmm5
	movaps	xmm15, xmm4
	pmuludq	xmm11, xmm5
	pslld	xmm8, 1
	movaps	[rsp], xmm3
	pmuludq	xmm5, xmm8
	movaps	xmm3, xmm8
	pmuludq	xmm15, xmm7
	paddq	xmm15, xmm5
	movaps	xmm5, xmm8
	movaps	xmm8, xmm4
	movaps	xmm6, xmm3
	pmuludq	xmm8, xmm1
	pmuludq	xmm5, xmm7
	paddq	xmm5, xmm8
	movaps	[rsp+0xf0], xmm5
	
	pslld	xmm7, 1
	movaps	xmm5, xmm1
	movaps	[rsp+0x60], xmm11
	pslld	xmm5, 1
	pmuludq	xmm6, xmm5
	paddq	xmm6, xmm9
	movaps	xmm9, xmm4
	movaps	xmm8, xmm10
	movaps	xmm11, xmm4
	pmuludq	xmm9, xmm0
	paddq	xmm6, xmm9
	movaps	xmm9, xmm7
	movaps	[rsp+0xd0], xmm6
	
	pslld	xmm8, 1
	pmuludq	xmm11, xmm2
	movaps	xmm6, xmm3
	pmuludq	xmm9, xmm1
	pmuludq	xmm1, xmm5
	movaps	[rsp+0x40], xmm14
	pmuludq	xmm6, xmm0
	paddq	xmm6, xmm9
	movaps	xmm9, xmm7
	movaps	xmm14, xmm12
	pmuludq	xmm9, xmm0
	paddq	xmm9, xmm1
	movaps	xmm1, xmm3
	movaps	[rsp+0x10], xmm13
	pmuludq	xmm14, xmm4
	pmuludq	xmm1, xmm8
	paddq	xmm1, xmm9
	movaps	xmm9, xmm7
	paddq	xmm11, xmm1
	movaps	xmm1, xmm5
	movaps	xmm13, xmm4
	pmuludq	xmm9, xmm10
	pmuludq	xmm1, xmm0
	paddq	xmm9, xmm1
	movaps	xmm1, xmm3
	pmuludq	xmm13, xmm10
	paddq	xmm6, xmm13
	movaps	xmm13, xmm12
	pmuludq	xmm1, xmm2
	paddq	xmm1, xmm9
	paddq	xmm14, xmm1
	movaps	xmm1, xmm12
	movaps	[rsp+0xe0], xmm6
	
	pslld	xmm1, 1
	movaps	xmm12, xmm1
	movaps	xmm1, xmm0
	movaps	xmm6, xmm8
	movaps	[rsp+0x90], xmm12
	
	pmuludq	xmm1, xmm0
	movaps	xmm9, xmm1
	movaps	xmm1, xmm5
	movaps	[rsp+0x70], xmm13
	pmuludq	xmm1, xmm8
	paddq	xmm1, xmm9
	movaps	xmm9, xmm7
	pmuludq	xmm9, xmm2
	paddq	xmm9, xmm1
	movaps	xmm1, xmm12
	movaps	xmm8, [rsp]
	pmuludq	xmm1, xmm3
	paddq	xmm1, xmm9
	movaps	xmm12, xmm8
	pmuludq	xmm12, xmm4
	paddq	xmm12, xmm1
	movaps	xmm1, xmm0
	pmuludq	xmm1, xmm6
	movaps	xmm9, xmm1
	movaps	xmm1, xmm5
	pmuludq	xmm1, xmm2
	paddq	xmm1, xmm9
	movaps	xmm9, xmm13
	movaps	xmm13, [rsp+0x10]
	pmuludq	xmm9, xmm7
	paddq	xmm9, xmm1
	movaps	xmm1, xmm8
	pmuludq	xmm1, xmm3
	paddq	xmm1, xmm9
	movaps	xmm9, xmm4
	movaps	xmm4, xmm10
	pmuludq	xmm9, xmm13
	paddq	xmm9, xmm1
	movaps	xmm1, xmm8
	pmuludq	xmm4, [.packedthirtyeight]
	movaps	[rsp+0x80], xmm4
	
	movaps	xmm4, xmm2
	pmuludq	xmm1, [.packednineteen]
	movaps	xmm8, xmm0
	pmuludq	xmm4, [.packednineteen]
	movaps	[rsp+0x20], xmm4
	pmuludq	xmm10, [rsp+0x80]
	
	pslld	xmm8, 1
	movaps	xmm4, [rsp+0x70]
	pmuludq	xmm4, [.packedthirtyeight]
	movaps	[rsp+0x30], xmm4
	movaps	xmm4, xmm1
	movaps	xmm1, xmm13
	movaps	xmm13, xmm2
	pmuludq	xmm1, [.packedthirtyeight]
	movaps	[rsp+0xa0], xmm10
	
	pmuludq	xmm3, xmm1
	pslld	xmm13, 1
	movaps	[rsp+0x80], xmm8
	
	movaps	xmm10, [rsp+0x20]
	pmuludq	xmm10, xmm8
	movaps	xmm8, [rsp+0x30]
	paddq	xmm10, [rsp+0xa0]
	
	pmuludq	xmm8, xmm5
	paddq	xmm10, xmm8
	movaps	xmm8, xmm10
	movaps	xmm10, xmm4
	pmuludq	xmm10, xmm7
	paddq	xmm10, xmm8
	movaps	xmm8, [rsp+0x20]
	paddq	xmm3, xmm10
	paddq	xmm3, [rsp+0x40]
	psrld	xmm7, 1
	movaps	[rsp+0x40], xmm3
	pmuludq	xmm8, xmm6
	pmuludq	xmm7, xmm1
	movaps	xmm3, xmm8
	movaps	xmm8, [rsp+0x30]
	pmuludq	xmm8, xmm0
	movaps	xmm10, xmm8
	pmuludq	xmm0, xmm1
	movaps	xmm8, xmm4
	paddq	xmm10, xmm3
	pmuludq	xmm8, xmm5
	movaps	xmm3, xmm8
	pmuludq	xmm5, xmm1
	movaps	xmm8, [rsp+0x20]
	paddq	xmm3, xmm10
	paddq	xmm7, xmm3
	paddq	xmm7, [rsp+0x60]
	movaps	xmm10, [rsp+0x30]
	pmuludq	xmm8, xmm2
	movaps	[rsp+0x60], xmm7
	movaps	xmm7, xmm8
	movaps	xmm8, xmm10
	pmuludq	xmm8, xmm6
	movaps	xmm3, xmm8
	paddq	xmm3, xmm7
	movaps	xmm7, [rsp+0x80]
	
	pmuludq	xmm7, xmm4
	paddq	xmm7, xmm3
	paddq	xmm5, xmm7
	movaps	xmm7, xmm10
	paddq	xmm15, xmm5
	movaps	xmm5, xmm10
	movaps	xmm10, xmm4
	pmuludq	xmm10, xmm6
	movaps	xmm3, xmm10
	pmuludq	xmm5, xmm2
	pmuludq	xmm6, xmm1
	pmuludq	xmm2, xmm1
	paddq	xmm3, xmm5
	paddq	xmm0, xmm3
	paddq	xmm0, [rsp+0xf0]
	
	movaps	xmm8, xmm0
	movaps	xmm0, xmm7
	movaps	xmm7, [rsp+0x90]
	
	pmuludq	xmm0, [rsp+0x70]
	movaps	xmm3, xmm0
	movaps	xmm0, xmm13
	pmuludq	xmm0, xmm4
	paddq	xmm0, xmm3
	paddq	xmm6, xmm0
	movaps	xmm0, xmm7
	paddq	xmm6, [rsp+0xd0]
	
	movaps	xmm3, [rsp]
	movaps	xmm10, xmm6
	pmuludq	xmm0, xmm4
	paddq	xmm2, xmm0
	movaps	xmm0, xmm7
	pmuludq	xmm4, xmm3
	paddq	xmm2, [rsp+0xe0]
	
	movaps	xmm5, xmm2
	pmuludq	xmm0, xmm1
	paddq	xmm0, xmm4
	paddq	xmm11, xmm0
	movaps	xmm0, xmm3
	movaps	xmm3, [rsp+0x40]
	pmuludq	xmm0, xmm1
	pmuludq	xmm1, [rsp+0x10]
	paddq	xmm12, xmm1
	movaps	xmm1, xmm3
	paddq	xmm14, xmm0
	movaps	xmm0, xmm6
	pand	xmm3, [.packedmask26]
	psrlq	xmm1, 0x1a
	paddq	xmm1, [rsp+0x60]
	movaps	xmm2, xmm1
	psrlq	xmm0, 0x1a
	psrlq	xmm1, 0x19
	paddq	xmm0, xmm5
	paddq	xmm15, xmm1
	movaps	xmm13, xmm0
	psrlq	xmm0, 0x19
	paddq	xmm11, xmm0
	movaps	xmm1, xmm11
	pand	xmm2, [.packedmask25]
	movaps	xmm0, [.packedmask26]
	psrlq	xmm1, 0x1a
	paddq	xmm14, xmm1
	movaps	xmm1, xmm14
	pand	xmm10, [.packedmask26]
	psrlq	xmm1, 0x19
	paddq	xmm12, xmm1
	movaps	xmm1, xmm12
	movaps	xmm4, xmm2
	movaps	xmm2, xmm15
	psrlq	xmm1, 0x1a
	paddq	xmm9, xmm1
	movaps	xmm1, xmm9
	pand	xmm14, [.packedmask25]
	psrlq	xmm2, 0x1a
	paddq	xmm2, xmm8
	psrlq	xmm1, 0x19
	movaps	xmm5, xmm2
	pmuludq	xmm1, [.packednineteen]
	pand	xmm12, [.packedmask26]
	paddq	xmm3, xmm1
	psrlq	xmm2, 0x19
	paddq	xmm10, xmm2
	movaps	xmm2, xmm3
	movaps	xmm8, [.packedmask25]
	pand	xmm3, [.packedmask26]
	movaps	xmm1, xmm10
	psrlq	xmm2, 0x1a
	paddq	xmm2, xmm4
	pand	xmm10, [.packedmask26]
	movaps	xmm7, xmm14
	movaps	xmm4, xmm2
	psrlq	xmm1, 0x1a
	pand	xmm11, [.packedmask26]
	movaps	xmm14, xmm12
	pand	xmm9, [.packedmask25]
	pand	xmm8, xmm5
	pand	xmm13, [.packedmask25]
	movaps	xmm5, xmm3
	movaps	xmm6, xmm11
	pand	xmm0, xmm15
	pmuludq	xmm5, [.packed121666121665]
	movaps	xmm15, xmm9
	movaps	[rsp], xmm2
	pmuludq	xmm4, [.packed121666121665]
	pmuludq	xmm6, [.packed121666121665]
	movaps	xmm2, xmm4
	movaps	[rsp+0x10], xmm8
	paddq	xmm13, xmm1
	movaps	xmm9, xmm8
	movaps	[rsp+0x40], xmm7
	movaps	xmm4, xmm10
	pmuludq	xmm9, [.packed121666121665]
	movaps	[rsp+0x60], xmm14
	movaps	xmm8, xmm7
	pmuludq	xmm4, [.packed121666121665]
	movaps	xmm7, xmm14
	movaps	xmm14, xmm5
	movaps	[rsp+0x30], xmm11
	pmuludq	xmm8, [.packed121666121665]
	movaps	xmm1, xmm13
	psrlq	xmm14, 0x1a
	pmuludq	xmm7, [.packed121666121665]
	movaps	[rsp+0x70], xmm15
	movaps	xmm11, xmm6
	pmuludq	xmm1, [.packed121666121665]
	movaps	xmm6, xmm15
	movaps	xmm15, xmm14
	movaps	xmm14, xmm4
	paddq	xmm2, xmm15
	movaps	xmm12, xmm0
	pand	xmm5, [.packedmask26]
	psrlq	xmm14, 0x1a
	paddq	xmm1, xmm14
	movaps	xmm14, xmm2
	pmuludq	xmm12, [.packed121666121665]
	pmuludq	xmm6, [.packed121666121665]
	pand	xmm4, [.packedmask26]
	psrlq	xmm14, 0x19
	movaps	xmm15, xmm14
	movaps	xmm14, xmm1
	pand	xmm1, [.packedmask25]
	paddq	xmm12, xmm15
	psrlq	xmm14, 0x19
	paddq	xmm11, xmm14
	movaps	xmm14, xmm12
	pand	xmm2, [.packedmask25]
	psrlq	xmm14, 0x1a
	movaps	xmm15, xmm14
	movaps	xmm14, xmm11
	pand	xmm12, [.packedmask26]
	paddq	xmm9, xmm15
	psrlq	xmm14, 0x1a
	paddq	xmm8, xmm14
	movaps	xmm14, xmm9
	pand	xmm9, [.packedmask25]
	psrlq	xmm14, 0x19
	movaps	xmm15, xmm14
	movaps	xmm14, xmm8
	pand	xmm11, [.packedmask26]
	paddq	xmm4, xmm15
	psrlq	xmm14, 0x19
	paddq	xmm7, xmm14
	movaps	xmm14, xmm7
	pand	xmm8, [.packedmask25]
	psrlq	xmm14, 0x1a
	paddq	xmm6, xmm14
	movaps	xmm14, xmm6
	pand	xmm7, [.packedmask26]
	psrlq	xmm14, 0x19
	pmuludq	xmm14, [.packednineteen]
	paddq	xmm5, xmm14
	movaps	xmm14, xmm5
	pand	xmm5, [.packedmask26]
	psrlq	xmm14, 0x1a
	movaps	xmm15, xmm14
	movaps	xmm14, xmm4
	pand	xmm4, [.packedmask26]
	paddq	xmm2, xmm15
	psrlq	xmm14, 0x1a
	paddq	xmm1, xmm14
	pand	xmm6, [.packedmask25]
	movaps	xmm14, xmm5
	punpckhqdq xmm5, xmm2
	punpcklqdq xmm14, xmm2
	movaps	xmm2, xmm3
	punpckhqdq xmm2, [rsp]
	pslldq	xmm5, 0x4
	movaps	xmm15, xmm14
	movaps	xmm14, xmm3
	pslldq	xmm15, 0x4
	punpcklqdq xmm14, [rsp]
	por	xmm2, xmm5
	movaps	xmm5, xmm12
	punpckhqdq xmm12, xmm9
	punpcklqdq xmm5, xmm9
	movaps	xmm9, xmm0
	por	xmm14, xmm15
	pslldq	xmm12, 0x4
	punpckhqdq xmm9, [rsp+0x10]
	movaps	xmm15, xmm5
	paddd	xmm14, [.packed32zeromodp0]
	psubd	xmm14, xmm2
	movaps	xmm5, xmm0
	movaps	xmm2, xmm14
	pslldq	xmm15, 0x4
	punpcklqdq xmm5, [rsp+0x10]
	por	xmm9, xmm12
	movaps	xmm12, xmm4
	punpckhqdq xmm4, xmm1
	punpcklqdq xmm12, xmm1
	por	xmm5, xmm15
	movaps	xmm1, xmm10
	pslldq	xmm4, 0x4
	movaps	xmm15, xmm12
	movaps	xmm12, xmm10
	punpckhqdq xmm1, xmm13
	paddd	xmm5, [.packed32zeromodp1]
	pslldq	xmm15, 0x4
	psubd	xmm5, xmm9
	punpcklqdq xmm12, xmm13
	por	xmm12, xmm15
	movaps	xmm15, xmm1
	movaps	xmm1, xmm11
	punpckhqdq xmm11, xmm8
	por	xmm15, xmm4
	movaps	xmm4, [rsp+0x30]
	paddd	xmm12, [.packed32zeromodp1]
	punpcklqdq xmm1, xmm8
	punpcklqdq xmm4, [rsp+0x40]
	pslldq	xmm11, 0x4
	movaps	xmm8, [rsp+0x60]
	psubd	xmm12, xmm15
	pslldq	xmm1, 0x4
	punpcklqdq xmm2, xmm12
	punpcklqdq xmm8, [rsp+0x70]
	movaps	xmm15, [rsp]
	por	xmm4, xmm1
	movaps	xmm9, xmm2
	movaps	xmm1, [rsp+0x30]
	movaps	xmm2, xmm5
	punpckhqdq xmm14, xmm12
	punpckhqdq xmm1, [rsp+0x40]
	por	xmm1, xmm11
	movaps	xmm11, xmm7
	punpckhqdq xmm7, xmm6
	punpcklqdq xmm11, xmm6
	movaps	xmm6, [rsp+0x60]
	pslldq	xmm7, 0x4
	punpckhqdq xmm6, [rsp+0x70]
	pslldq	xmm11, 0x4
	por	xmm8, xmm11
	por	xmm6, xmm7
	movaps	xmm7, [.packed32zeromodp1]
	paddd	xmm7, xmm4
	psubd	xmm7, xmm1
	punpcklqdq xmm2, xmm7
	movaps	xmm4, [.packed32zeromodp1]
	punpckhqdq xmm5, xmm7
	paddd	xmm4, xmm8
	psubd	xmm4, xmm6
	movaps	xmm6, xmm2
	movaps	xmm2, xmm9
	movaps	xmm7, [.packedmask26262626]
	pand	xmm9, [.packedmask26262626]
	movaps	xmm1, xmm6
	psrld	xmm2, 0x1a
	paddd	xmm2, xmm14
	movaps	xmm11, xmm2
	pand	xmm7, xmm6
	pand	xmm2, [.packedmask25252525]
	psrld	xmm1, 0x1a
	paddd	xmm1, xmm5
	psrld	xmm11, 0x19
	movaps	xmm5, xmm1
	movaps	xmm6, xmm11
	pand	xmm1, [.packedmask25252525]
	psrld	xmm5, 0x19
	paddd	xmm6, xmm7
	movaps	xmm7, xmm5
	pslldq	xmm5, 0x8
	paddd	xmm9, xmm5
	movaps	xmm5, xmm9
	movaps	xmm8, xmm6
	punpckhqdq xmm9, xmm2
	psrldq	xmm7, 0x8
	paddd	xmm4, xmm7
	punpcklqdq xmm5, xmm2
	movaps	xmm2, xmm3
	punpcklqdq xmm8, xmm1
	punpckhqdq xmm6, xmm1
	pshufd	xmm12, xmm5, 0x50
	pshufd	xmm1, xmm5, 0xfa
	pshufd	xmm14, xmm8, 0x50
	punpcklqdq xmm2, xmm12
	punpckhqdq xmm3, xmm12
	movaps	xmm12, xmm15
	punpckhqdq xmm15, xmm1
	punpcklqdq xmm12, xmm1
	pshufd	xmm8, xmm8, 0xfa
	movaps	xmm1, xmm15
	movaps	xmm15, xmm0
	punpckhqdq xmm0, xmm14
	punpcklqdq xmm15, xmm14
	pshufd	xmm11, xmm9, 0x50
	movaps	[rsp], xmm0
	pshufd	xmm9, xmm9, 0xfa
	pshufd	xmm7, xmm6, 0x50
	pshufd	xmm6, xmm6, 0xfa
	movaps	xmm0, [rsp+0x10]
	pshufd	xmm5, xmm4, 0x50
	pshufd	xmm4, xmm4, 0xfa
	movaps	xmm14, xmm0
	punpckhqdq xmm0, xmm8
	punpcklqdq xmm14, xmm8
	movaps	xmm8, xmm10
	punpckhqdq xmm10, xmm11
	punpcklqdq xmm8, xmm11
	movaps	xmm11, xmm13
	movaps	[rsp+0x10], xmm10
	punpckhqdq xmm11, xmm9
	movaps	xmm10, xmm13
	movaps	xmm13, [rsp+0x30]
	punpcklqdq xmm10, xmm9
	movaps	[rsp+0x20], xmm11
	movaps	xmm11, xmm13
	punpckhqdq xmm13, xmm7
	punpcklqdq xmm11, xmm7
	movaps	xmm9, [rsp+0x40]
	movaps	xmm7, xmm9
	movaps	[rsp+0x30], xmm13
	punpcklqdq xmm7, xmm6
	punpckhqdq xmm9, xmm6
	movaps	xmm13, [rsp+0x60]
	movaps	[rsp+0x80], xmm7
	
	movaps	xmm7, xmm13
	punpckhqdq xmm13, xmm5
	punpcklqdq xmm7, xmm5
	movaps	xmm6, [rsp+0x70]
	movaps	xmm5, xmm6
	punpckhqdq xmm6, xmm4
	punpcklqdq xmm5, xmm4
	movaps	xmm4, xmm2
	movaps	[rsp+0x60], xmm13
	pmuludq	xmm4, xmm3
	movaps	xmm13, xmm6
	movaps	[rsp+0x90], xmm4
	
	movaps	xmm6, xmm2
	movaps	xmm4, xmm12
	pmuludq	xmm6, xmm1
	movaps	[rsp+0x100], xmm5
	
	pmuludq	xmm4, xmm3
	paddq	xmm4, xmm6
	movaps	xmm6, xmm12
	movaps	[rsp+0xa0], xmm4
	
	pslld	xmm6, 1
	movaps	xmm4, xmm6
	movaps	xmm6, xmm15
	movaps	xmm5, xmm4
	movaps	[rsp+0x40], xmm9
	pmuludq	xmm6, xmm3
	pmuludq	xmm5, xmm1
	paddq	xmm5, xmm6
	movaps	xmm6, xmm2
	movaps	[rsp+0xf0], xmm7
	
	movaps	xmm9, [rsp]
	pmuludq	xmm6, xmm9
	paddq	xmm5, xmm6
	movaps	xmm6, xmm15
	movaps	[rsp+0xd0], xmm5
	
	movaps	xmm5, xmm14
	pmuludq	xmm6, xmm1
	pmuludq	xmm5, xmm3
	paddq	xmm6, xmm5
	movaps	xmm5, xmm12
	pmuludq	xmm5, xmm9
	paddq	xmm5, xmm6
	movaps	xmm6, xmm0
	movaps	xmm9, [rsp+0x10]
	pmuludq	xmm6, xmm2
	paddq	xmm5, xmm6
	movaps	xmm6, xmm14
	movaps	[rsp+0xe0], xmm5
	
	pslld	xmm6, 1
	movaps	xmm5, xmm6
	movaps	xmm6, xmm8
	pmuludq	xmm6, xmm3
	movaps	xmm7, xmm6
	movaps	xmm6, xmm5
	pmuludq	xmm6, xmm1
	paddq	xmm6, xmm7
	movaps	xmm7, xmm15
	pmuludq	xmm7, [rsp]
	paddq	xmm7, xmm6
	movaps	xmm6, xmm0
	pmuludq	xmm6, xmm4
	paddq	xmm6, xmm7
	movaps	xmm7, xmm9
	pmuludq	xmm7, xmm2
	paddq	xmm6, xmm7
	movaps	xmm7, xmm8
	movaps	[rsp+0x130], xmm6
	
	movaps	xmm6, xmm10
	pmuludq	xmm7, xmm1
	pmuludq	xmm6, xmm3
	paddq	xmm7, xmm6
	movaps	xmm6, xmm14
	pmuludq	xmm6, [rsp]
	paddq	xmm6, xmm7
	movaps	xmm7, xmm0
	pmuludq	xmm7, xmm15
	paddq	xmm7, xmm6
	movaps	xmm6, xmm9
	pmuludq	xmm6, xmm12
	paddq	xmm6, xmm7
	movaps	xmm7, [rsp+0x20]
	pmuludq	xmm7, xmm2
	paddq	xmm6, xmm7
	movaps	xmm7, xmm10
	movaps	[rsp+0x140], xmm6
	
	pslld	xmm7, 1
	movaps	xmm6, xmm11
	movaps	xmm9, xmm7
	pmuludq	xmm6, xmm3
	movaps	xmm7, xmm6
	movaps	xmm6, xmm9
	pmuludq	xmm9, xmm0
	pmuludq	xmm6, xmm1
	paddq	xmm6, xmm7
	movaps	xmm7, xmm8
	pmuludq	xmm7, [rsp]
	paddq	xmm7, xmm6
	movaps	xmm6, xmm0
	pmuludq	xmm6, xmm5
	paddq	xmm6, xmm7
	movaps	xmm7, [rsp+0x10]
	pmuludq	xmm7, xmm15
	paddq	xmm7, xmm6
	movaps	xmm6, [rsp+0x20]
	pmuludq	xmm6, xmm4
	paddq	xmm6, xmm7
	movaps	xmm7, [rsp+0x30]
	pmuludq	xmm7, xmm2
	paddq	xmm6, xmm7
	movaps	xmm7, xmm11
	movaps	[rsp+0x150], xmm6
	
	pmuludq	xmm7, xmm1
	movaps	xmm6, [rsp+0x80]
	
	pmuludq	xmm6, xmm3
	paddq	xmm7, xmm6
	movaps	xmm6, xmm10
	pmuludq	xmm6, [rsp]
	paddq	xmm6, xmm7
	movaps	xmm7, xmm0
	pmuludq	xmm7, xmm8
	paddq	xmm7, xmm6
	movaps	xmm6, [rsp+0x10]
	pmuludq	xmm6, xmm14
	paddq	xmm6, xmm7
	movaps	xmm7, [rsp+0x20]
	pmuludq	xmm7, xmm15
	paddq	xmm7, xmm6
	movaps	xmm6, [rsp+0x30]
	pmuludq	xmm6, xmm12
	paddq	xmm6, xmm7
	movaps	xmm7, [rsp+0x40]
	pmuludq	xmm7, xmm2
	paddq	xmm6, xmm7
	movaps	[rsp+0x160], xmm6
	
	pmuludq	xmm5, [rsp+0x20]
	pmuludq	xmm4, [rsp+0x40]
	movaps	xmm7, [rsp+0x80]
	
	movaps	xmm6, [rsp+0xf0]
	
	pslld	xmm7, 1
	pmuludq	xmm7, xmm1
	pmuludq	xmm6, xmm3
	paddq	xmm7, xmm6
	movaps	xmm6, xmm11
	pmuludq	xmm3, [rsp+0x100]
	
	pmuludq	xmm6, [rsp]
	paddq	xmm6, xmm7
	paddq	xmm9, xmm6
	movaps	xmm6, [rsp+0x10]
	pmuludq	xmm6, xmm8
	paddq	xmm6, xmm9
	movaps	xmm9, [rsp+0x60]
	paddq	xmm5, xmm6
	movaps	xmm6, [rsp+0x30]
	pmuludq	xmm6, xmm15
	paddq	xmm6, xmm5
	movaps	xmm5, [rsp+0xf0]
	
	paddq	xmm4, xmm6
	movaps	xmm6, xmm9
	movaps	xmm7, [rsp+0x80]
	
	pmuludq	xmm6, xmm2
	paddq	xmm6, xmm4
	movaps	xmm4, xmm5
	pmuludq	xmm2, xmm13
	pmuludq	xmm4, xmm1
	paddq	xmm4, xmm3
	movaps	xmm3, xmm7
	pmuludq	xmm7, [.packednineteen]
	pmuludq	xmm3, [rsp]
	paddq	xmm3, xmm4
	movaps	xmm4, xmm0
	pmuludq	xmm4, xmm11
	paddq	xmm4, xmm3
	movaps	xmm3, [rsp+0x10]
	pmuludq	xmm3, xmm10
	paddq	xmm3, xmm4
	movaps	xmm4, [rsp+0x20]
	pmuludq	xmm10, [.packednineteen]
	pmuludq	xmm4, xmm8
	paddq	xmm4, xmm3
	movaps	xmm3, [rsp+0x30]
	pmuludq	xmm8, [.packednineteen]
	pmuludq	xmm3, xmm14
	paddq	xmm3, xmm4
	movaps	xmm4, [rsp+0x40]
	pmuludq	xmm14, [.packednineteen]
	pmuludq	xmm4, xmm15
	paddq	xmm4, xmm3
	movaps	xmm3, xmm9
	movaps	xmm9, xmm2
	movaps	xmm2, xmm14
	pmuludq	xmm15, [.packednineteen]
	pmuludq	xmm3, xmm12
	paddq	xmm3, xmm4
	movaps	xmm4, xmm7
	pslld	xmm2, 1
	movaps	[rsp+0xf0], xmm2
	
	paddq	xmm9, xmm3
	movaps	xmm2, xmm10
	pslld	xmm4, 1
	pmuludq	xmm12, [.packednineteen]
	movaps	xmm3, xmm11
	pslld	xmm12, 1
	pmuludq	xmm12, xmm13
	pslld	xmm2, 1
	movaps	[rsp+0x70], xmm2
	movaps	xmm2, xmm5
	pmuludq	xmm3, [.packednineteen]
	movaps	[rsp+0x80], xmm4
	
	pmuludq	xmm2, [.packednineteen]
	movaps	xmm5, [rsp+0x100]
	
	pmuludq	xmm5, [.packednineteen]
	movaps	xmm11, xmm5
	pslld	xmm11, 1
	movaps	xmm4, xmm11
	movaps	xmm11, [rsp+0x60]
	pmuludq	xmm1, xmm4
	pmuludq	xmm11, xmm15
	paddq	xmm11, xmm12
	movaps	xmm12, [rsp+0xf0]
	
	pmuludq	xmm15, xmm13
	pmuludq	xmm12, [rsp+0x40]
	paddq	xmm12, xmm11
	movaps	xmm11, [rsp+0x30]
	pmuludq	xmm11, xmm8
	paddq	xmm11, xmm12
	movaps	xmm12, [rsp+0x70]
	pmuludq	xmm12, [rsp+0x20]
	paddq	xmm12, xmm11
	movaps	xmm11, [rsp+0x10]
	pmuludq	xmm11, xmm3
	paddq	xmm11, xmm12
	movaps	xmm12, [rsp+0x80]
	
	pmuludq	xmm12, xmm0
	paddq	xmm12, xmm11
	movaps	xmm11, xmm2
	pmuludq	xmm11, [rsp]
	paddq	xmm11, xmm12
	paddq	xmm1, xmm11
	paddq	xmm1, [rsp+0x90]
	
	movaps	[rsp+0x90], xmm1
	
	movaps	xmm12, [rsp+0x60]
	pmuludq	xmm14, xmm12
	paddq	xmm14, xmm15
	movaps	xmm15, [rsp+0x40]
	movaps	xmm1, xmm15
	movaps	xmm11, [rsp+0x30]
	pmuludq	xmm1, xmm8
	paddq	xmm1, xmm14
	movaps	xmm14, [rsp+0x20]
	pmuludq	xmm11, xmm10
	paddq	xmm11, xmm1
	pmuludq	xmm14, xmm3
	movaps	xmm1, xmm14
	movaps	xmm14, xmm0
	paddq	xmm1, xmm11
	movaps	xmm11, [rsp+0x10]
	pmuludq	xmm14, xmm2
	pmuludq	xmm11, xmm7
	movaps	[rsp+0x40], xmm0
	paddq	xmm11, xmm1
	movaps	xmm1, xmm14
	paddq	xmm1, xmm11
	movaps	xmm11, xmm15
	movaps	xmm0, [rsp]
	movaps	xmm14, [rsp+0xf0]
	
	pmuludq	xmm0, xmm5
	paddq	xmm0, xmm1
	paddq	xmm0, [rsp+0xa0]
	
	pmuludq	xmm14, xmm13
	movaps	[rsp+0xa0], xmm0
	
	movaps	xmm1, xmm14
	movaps	xmm0, xmm12
	movaps	xmm14, xmm12
	pmuludq	xmm0, xmm8
	paddq	xmm0, xmm1
	movaps	xmm1, [rsp+0x70]
	pmuludq	xmm8, xmm13
	pmuludq	xmm10, xmm14
	paddq	xmm10, xmm8
	pmuludq	xmm1, xmm15
	movaps	xmm15, [rsp+0x30]
	paddq	xmm1, xmm0
	movaps	xmm8, xmm11
	movaps	xmm0, xmm15
	movaps	xmm12, [rsp+0x10]
	pmuludq	xmm0, xmm3
	paddq	xmm0, xmm1
	movaps	xmm1, [rsp+0x80]
	
	pmuludq	xmm1, [rsp+0x20]
	paddq	xmm1, xmm0
	movaps	xmm0, xmm12
	pmuludq	xmm0, xmm2
	paddq	xmm0, xmm1
	movaps	xmm1, [rsp+0x40]
	pmuludq	xmm1, xmm4
	paddq	xmm1, xmm0
	movaps	xmm0, xmm11
	paddq	xmm1, [rsp+0xd0]
	
	movaps	xmm11, xmm15
	pmuludq	xmm0, xmm3
	paddq	xmm0, xmm10
	movaps	xmm10, xmm15
	movaps	[rsp+0xd0], xmm1
	
	pmuludq	xmm10, xmm7
	movaps	xmm1, xmm10
	movaps	xmm10, xmm12
	paddq	xmm1, xmm0
	movaps	xmm15, [rsp+0x20]
	pmuludq	xmm10, xmm5
	movaps	xmm0, xmm15
	movaps	xmm12, [rsp+0x80]
	
	pmuludq	xmm0, xmm2
	paddq	xmm0, xmm1
	movaps	xmm1, xmm10
	movaps	xmm10, [rsp+0xe0]
	
	paddq	xmm1, xmm0
	movaps	xmm0, [rsp+0x70]
	paddq	xmm10, xmm1
	pmuludq	xmm0, xmm13
	movaps	xmm1, xmm0
	movaps	xmm0, xmm14
	movaps	[rsp+0xe0], xmm10
	
	pmuludq	xmm0, xmm3
	paddq	xmm0, xmm1
	movaps	xmm1, xmm12
	movaps	xmm10, xmm14
	movaps	xmm14, xmm8
	pmuludq	xmm3, xmm13
	pmuludq	xmm1, xmm8
	movaps	xmm8, xmm11
	paddq	xmm1, xmm0
	movaps	xmm0, xmm11
	movaps	xmm11, xmm15
	pmuludq	xmm7, xmm10
	paddq	xmm7, xmm3
	movaps	xmm3, xmm8
	pmuludq	xmm11, xmm4
	pmuludq	xmm0, xmm2
	paddq	xmm0, xmm1
	movaps	xmm1, xmm11
	pmuludq	xmm3, xmm5
	pmuludq	xmm5, xmm10
	movaps	xmm8, [.packedmask25]
	paddq	xmm1, xmm0
	movaps	xmm0, [rsp+0x130]
	
	paddq	xmm0, xmm1
	movaps	xmm1, xmm3
	movaps	xmm11, xmm0
	movaps	xmm0, xmm14
	movaps	xmm3, [rsp+0x140]
	
	pmuludq	xmm0, xmm2
	paddq	xmm0, xmm7
	paddq	xmm1, xmm0
	movaps	xmm0, xmm12
	paddq	xmm3, xmm1
	movaps	xmm7, xmm14
	movaps	xmm14, [.packedmask26]
	pmuludq	xmm0, xmm13
	movaps	xmm1, xmm0
	movaps	xmm0, xmm10
	pmuludq	xmm7, xmm4
	pmuludq	xmm4, xmm13
	paddq	xmm6, xmm4
	movaps	xmm4, [rsp+0x90]
	
	pand	xmm14, xmm11
	pmuludq	xmm0, xmm2
	paddq	xmm0, xmm1
	movaps	xmm1, xmm7
	movaps	xmm7, [rsp+0x150]
	
	pmuludq	xmm2, xmm13
	paddq	xmm5, xmm2
	paddq	xmm1, xmm0
	movaps	xmm0, xmm11
	paddq	xmm5, [rsp+0x160]
	
	paddq	xmm7, xmm1
	movaps	xmm1, xmm4
	pand	xmm4, [.packedmask26]
	psrlq	xmm0, 0x1a
	paddq	xmm0, xmm3
	movaps	xmm2, xmm0
	psrlq	xmm1, 0x1a
	paddq	xmm1, [rsp+0xa0]
	
	movaps	xmm3, xmm1
	psrlq	xmm2, 0x19
	paddq	xmm2, xmm7
	movaps	xmm12, xmm2
	psrlq	xmm2, 0x1a
	paddq	xmm2, xmm5
	movaps	xmm10, xmm2
	psrlq	xmm2, 0x19
	paddq	xmm6, xmm2
	movaps	xmm2, xmm6
	psrlq	xmm3, 0x19
	paddq	xmm3, [rsp+0xd0]
	
	movaps	xmm13, xmm3
	psrlq	xmm2, 0x1a
	paddq	xmm9, xmm2
	movaps	xmm2, xmm9
	pand	xmm13, [.packedmask26]
	psrlq	xmm3, 0x1a
	paddq	xmm3, [rsp+0xe0]
	
	psrlq	xmm2, 0x19
	movaps	xmm15, xmm3
	pmuludq	xmm2, [.packednineteen]
	pand	xmm0, [.packedmask25]
	paddq	xmm4, xmm2
	movaps	xmm5, xmm4
	psrlq	xmm3, 0x19
	paddq	xmm14, xmm3
	movaps	xmm2, xmm14
	pand	xmm4, [.packedmask26]
	psrlq	xmm5, 0x1a
	psrlq	xmm2, 0x1a
	movaps	xmm3, [.packedmask26]
	pand	xmm15, [.packedmask25]
	paddq	xmm0, xmm2
	movaps	xmm2, xmm13
	pand	xmm1, [.packedmask25]
	movaps	xmm7, xmm4
	pand	xmm12, [.packedmask26]
	pand	xmm14, [.packedmask26]
	paddq	xmm1, xmm5
	pand	xmm10, [.packedmask25]
	punpckldq xmm2, xmm15
	punpckldq xmm7, xmm1
	movaps	xmm5, xmm14
	pand	xmm3, xmm6
	pand	xmm8, xmm9
	punpcklqdq xmm7, xmm2
	movaps	xmm2, xmm12
	punpckldq xmm5, xmm0
	punpckldq xmm2, xmm10
	movaps	xmm11, xmm3
	punpckhdq xmm4, xmm1
	punpckhdq xmm14, xmm0
	punpckldq xmm11, xmm8
	punpckhdq xmm13, xmm15
	punpckhdq xmm12, xmm10
	punpckhdq xmm3, xmm8
	movaps	xmm6, xmm11
	punpcklqdq xmm5, xmm2
	movaps	xmm2, xmm4
	punpcklqdq xmm14, xmm12
	movaps	xmm0, xmm3
	punpcklqdq xmm2, xmm13
	jne    .highloop

	; ---- after .highloop: branch-free masked select on six 128-bit values ----
	; eax is negated and broadcast to all four dwords of xmm1 to form a mask.
	; NOTE(review): eax is presumably a 0/1 bit on entry (set outside this view),
	; which would make the mask all-zeros or all-ones -- confirm against the
	; code that precedes .highloop's branch.
	; For each pair the sequence is:  t = mem ^ reg ; t &= mask ; t ^= reg
	; so t == mem where mask bits are 1 and t == reg where mask bits are 0:
	;	xmm2  <- select([rsp+0x200], xmm2)	(computed in xmm8, then copied)
	;	xmm3  <- select([rsp+0x1e0], xmm7)
	;	xmm13 <- select([rsp+0x1f0], xmm5)	(xmm5 is copied to xmm11 first)
	;	xmm4  <- select([rsp+0x220], xmm6)
	;	xmm12 <- select([rsp+0x1a0], xmm14)
	;	xmm15 <- select([rsp+0x1b0], xmm0)
	; The instructions of the six selects are interleaved below.
	neg    eax
	movaps	xmm8, [rsp+0x200]
	
	movd   xmm1, eax
	movaps	xmm11, xmm5
	pxor	xmm8, xmm2
	; NOTE(review): this dword store to [rsp] is overwritten by the movaps to
	; [rsp] inside .lowloop before any visible read -- looks like a leftover, confirm
	mov	[rsp], eax
	pshufd	xmm1, xmm1, 0	; replicate dword 0 (the negated eax) into all four lanes
	; eax = 3; rax is decremented by one near the top of the .lowloop body
	; (presumably its iteration counter -- the loop branch is outside this view)
	mov	eax, 0x3
	movaps	xmm3, [rsp+0x1e0]
	
	movaps	xmm13, [rsp+0x1f0]
	
	pxor	xmm3, xmm7
	movaps	xmm4, [rsp+0x220]
	
	pxor	xmm13, xmm11
	pand	xmm8, xmm1
	movaps	xmm12, [rsp+0x1a0]
	
	pxor	xmm4, xmm6
	pand	xmm3, xmm1
	movaps	xmm15, [rsp+0x1b0]
	
	pxor	xmm12, xmm14
	pand	xmm13, xmm1
	pxor	xmm15, xmm0
	pand	xmm4, xmm1
	pand	xmm12, xmm1
	pand	xmm15, xmm1
	; second xor of each pair completes the select
	pxor	xmm8, xmm2
	pxor	xmm12, xmm14
	pxor	xmm15, xmm0
	pxor	xmm4, xmm6
	movaps	xmm2, xmm8	; selected value for the [rsp+0x200]/xmm2 pair ends up in xmm2
	pxor	xmm13, xmm11
	pxor	xmm3, xmm7
calign
.lowloop:
	movaps	xmm7, xmm3
	paddd	xmm3, [.packed2p0]
	psubd	xmm3, xmm2
	pshufd	xmm5, xmm3, 0xa0
	movaps	xmm8, xmm4
	paddd	xmm4, [.packed2p2]
	pshufd	xmm3, xmm3, 0xf5
	paddd	xmm7, xmm2
	psubd	xmm4, xmm15
	paddd	xmm8, xmm15
	movaps	xmm1, xmm13
	paddd	xmm13, [.packed2p1]
	pand	xmm5, [.sse2_bot32bitmask]
	psubd	xmm13, xmm12
	sub	rax, 1
	paddd	xmm1, xmm12
	pand	xmm3, [.sse2_bot32bitmask]
	movaps	xmm0, xmm5
	pand	xmm5, [.packedmask26]
	movaps	xmm2, xmm3
	psrld	xmm0, 0x1a
	pand	xmm3, [.packedmask25]
	psrld	xmm2, 0x19
	movaps	xmm6, xmm2
	psrldq	xmm2, 0x8
	paddd	xmm13, xmm2
	movaps	xmm2, xmm7
	pslldq	xmm6, 0x8
	paddd	xmm5, xmm6
	paddd	xmm0, xmm3
	movaps	xmm3, xmm5
	punpckldq xmm5, xmm0
	punpckhdq xmm3, xmm0
	movaps	xmm0, xmm5
	punpcklqdq xmm0, xmm3
	movaps	xmm3, xmm8
	punpcklqdq xmm3, xmm4
	punpckhqdq xmm7, xmm0
	punpcklqdq xmm2, xmm0
	movaps	xmm0, xmm1
	movaps	xmm6, xmm3
	punpckhqdq xmm1, xmm13
	movaps	[rsp], xmm6
	punpcklqdq xmm0, xmm13
	pshufd	xmm4, xmm2, 0xf5
	pshufd	xmm13, [rsp], 0xf5
	pshufd	xmm3, xmm1, 0xf5
	pshufd	xmm5, xmm7, 0xf5
	movaps	[rsp+0x10], xmm13
	movaps	xmm13, xmm2
	movaps	xmm15, xmm3
	movaps	xmm3, xmm4
	pmuludq	xmm13, xmm2
	pslld	xmm2, 1
	movaps	xmm10, xmm2
	movaps	xmm14, xmm2
	pslld	xmm3, 1
	movaps	xmm12, xmm2
	pmuludq	xmm10, xmm4
	pmuludq	xmm4, xmm3
	movaps	[rsp+0x30], xmm13
	pmuludq	xmm14, xmm7
	paddq	xmm4, xmm14
	pmuludq	xmm12, xmm5
	movaps	[rsp+0x90], xmm4
	
	movaps	xmm4, xmm3
	movaps	[rsp+0x40], xmm10
	pmuludq	xmm4, xmm7
	movaps	xmm10, xmm7
	paddq	xmm12, xmm4
	movaps	xmm6, xmm3
	movaps	xmm4, xmm5
	movaps	xmm13, xmm2
	pmuludq	xmm10, xmm7
	pslld	xmm7, 1
	pslld	xmm4, 1
	pmuludq	xmm6, xmm4
	paddq	xmm6, xmm10
	pmuludq	xmm13, xmm0
	movaps	xmm10, xmm7
	paddq	xmm13, xmm6
	movaps	xmm6, xmm3
	pshufd	xmm11, xmm0, 0xf5
	pmuludq	xmm10, xmm5
	movaps	xmm14, xmm2
	pmuludq	xmm5, xmm4
	pmuludq	xmm6, xmm0
	paddq	xmm6, xmm10
	movaps	xmm10, xmm7
	pmuludq	xmm14, xmm11
	movaps	xmm8, xmm2
	paddq	xmm6, xmm14
	pmuludq	xmm10, xmm0
	movaps	xmm14, xmm11
	paddq	xmm10, xmm5
	movaps	xmm5, xmm3
	pmuludq	xmm8, xmm1
	movaps	xmm9, xmm15
	pslld	xmm14, 1
	movaps	[rsp+0x80], xmm6
	
	pmuludq	xmm5, xmm14
	paddq	xmm5, xmm10
	movaps	xmm10, xmm7
	paddq	xmm5, xmm8
	movaps	[rsp+0xa0], xmm5
	
	pmuludq	xmm9, xmm2
	movaps	xmm5, xmm4
	pmuludq	xmm10, xmm11
	movaps	xmm6, xmm14
	pmuludq	xmm5, xmm0
	paddq	xmm10, xmm5
	movaps	xmm5, xmm3
	movaps	[rsp+0x50], xmm15
	pmuludq	xmm5, xmm1
	paddq	xmm5, xmm10
	paddq	xmm5, xmm9
	movaps	[rsp+0xb0], xmm5
	
	movaps	xmm5, xmm15
	pslld	xmm5, 1
	movaps	xmm14, xmm5
	movaps	xmm5, xmm0
	movaps	xmm9, [rsp]
	pmuludq	xmm5, xmm0
	movaps	xmm10, xmm5
	movaps	xmm5, xmm4
	movaps	xmm8, [rsp+0x10]
	pmuludq	xmm5, xmm6
	paddq	xmm5, xmm10
	movaps	xmm10, xmm7
	movaps	[rsp+0x70], xmm14
	pmuludq	xmm10, xmm1
	paddq	xmm10, xmm5
	movaps	xmm5, xmm14
	movaps	xmm14, xmm9
	pmuludq	xmm5, xmm3
	paddq	xmm5, xmm10
	pmuludq	xmm14, xmm2
	paddq	xmm14, xmm5
	movaps	xmm5, xmm0
	pmuludq	xmm5, xmm6
	movaps	xmm10, xmm5
	movaps	xmm5, xmm4
	pmuludq	xmm5, xmm1
	paddq	xmm5, xmm10
	movaps	xmm10, xmm15
	pmuludq	xmm10, xmm7
	paddq	xmm10, xmm5
	movaps	xmm5, xmm9
	pmuludq	xmm5, xmm3
	paddq	xmm5, xmm10
	movaps	xmm10, xmm2
	movaps	xmm2, xmm11
	pmuludq	xmm10, xmm8
	paddq	xmm10, xmm5
	movaps	xmm5, xmm9
	pmuludq	xmm2, [.packedthirtyeight]
	movaps	xmm15, xmm2
	movaps	xmm2, xmm1
	pmuludq	xmm5, [.packednineteen]
	movaps	xmm9, xmm1
	pmuludq	xmm2, [.packednineteen]
	movaps	[rsp+0x20], xmm2
	pmuludq	xmm15, xmm11
	pslld	xmm9, 1
	movaps	xmm2, [rsp+0x50]
	pmuludq	xmm2, [.packedthirtyeight]
	movaps	[rsp+0x60], xmm2
	movaps	xmm2, xmm8
	movaps	xmm8, xmm0
	pmuludq	xmm2, [.packedthirtyeight]
	pmuludq	xmm3, xmm2
	movaps	xmm11, [rsp+0x20]
	pslld	xmm8, 1
	pmuludq	xmm11, xmm8
	paddq	xmm11, xmm15
	movaps	xmm15, [rsp+0x60]
	pmuludq	xmm15, xmm4
	paddq	xmm15, xmm11
	movaps	xmm11, xmm5
	pmuludq	xmm11, xmm7
	paddq	xmm11, xmm15
	movaps	xmm15, [rsp+0x60]
	paddq	xmm3, xmm11
	psrld	xmm7, 1
	pmuludq	xmm7, xmm2
	movaps	xmm11, [rsp+0x30]
	paddq	xmm11, xmm3
	movaps	xmm3, [rsp+0x20]
	movaps	[rsp+0x30], xmm11
	pmuludq	xmm3, xmm6
	movaps	xmm11, xmm15
	pmuludq	xmm11, xmm0
	paddq	xmm11, xmm3
	movaps	xmm3, xmm5
	pmuludq	xmm0, xmm2
	pmuludq	xmm3, xmm4
	paddq	xmm3, xmm11
	paddq	xmm7, xmm3
	movaps	xmm3, xmm15
	paddq	xmm7, [rsp+0x40]
	pmuludq	xmm4, xmm2
	movaps	[rsp+0x40], xmm7
	pmuludq	xmm3, xmm6
	movaps	xmm7, [rsp+0x20]
	pmuludq	xmm7, xmm1
	paddq	xmm3, xmm7
	movaps	xmm7, xmm8
	movaps	xmm8, [rsp+0x90]
	
	pmuludq	xmm7, xmm5
	paddq	xmm7, xmm3
	movaps	xmm3, xmm5
	paddq	xmm4, xmm7
	paddq	xmm8, xmm4
	movaps	xmm4, xmm15
	pmuludq	xmm3, xmm6
	pmuludq	xmm6, xmm2
	pmuludq	xmm4, xmm1
	paddq	xmm3, xmm4
	paddq	xmm0, xmm3
	paddq	xmm12, xmm0
	movaps	xmm0, xmm15
	pmuludq	xmm1, xmm2
	movaps	xmm4, [rsp+0x70]
	pmuludq	xmm0, [rsp+0x50]
	movaps	xmm3, xmm0
	movaps	xmm0, xmm9
	movaps	xmm9, [.packedmask25]
	pmuludq	xmm0, xmm5
	paddq	xmm0, xmm3
	paddq	xmm6, xmm0
	movaps	xmm0, xmm4
	paddq	xmm13, xmm6
	movaps	xmm3, [rsp]
	pmuludq	xmm0, xmm5
	paddq	xmm1, xmm0
	movaps	xmm0, xmm4
	pmuludq	xmm5, xmm3
	paddq	xmm1, [rsp+0x80]
	
	movaps	xmm15, xmm1
	pmuludq	xmm0, xmm2
	paddq	xmm0, xmm5
	paddq	xmm0, [rsp+0xa0]
	
	movaps	xmm7, xmm0
	movaps	xmm0, xmm3
	movaps	xmm5, [rsp+0x30]
	pmuludq	xmm0, xmm2
	paddq	xmm0, [rsp+0xb0]
	
	movaps	xmm6, xmm0
	movaps	xmm0, xmm13
	pmuludq	xmm2, [rsp+0x10]
	paddq	xmm14, xmm2
	movaps	xmm1, xmm5
	pand	xmm13, [.packedmask26]
	psrlq	xmm0, 0x1a
	paddq	xmm0, xmm15
	movaps	xmm2, xmm0
	pand	xmm5, [.packedmask26]
	psrlq	xmm0, 0x19
	paddq	xmm0, xmm7
	movaps	xmm3, xmm0
	psrlq	xmm0, 0x1a
	paddq	xmm0, xmm6
	pand	xmm2, [.packedmask25]
	movaps	xmm6, xmm0
	psrlq	xmm0, 0x19
	paddq	xmm14, xmm0
	movaps	xmm0, xmm14
	psrlq	xmm1, 0x1a
	paddq	xmm1, [rsp+0x40]
	movaps	xmm15, xmm1
	psrlq	xmm1, 0x19
	psrlq	xmm0, 0x1a
	paddq	xmm1, xmm8
	paddq	xmm10, xmm0
	movaps	xmm0, xmm10
	movaps	xmm4, xmm2
	movaps	xmm2, xmm1
	psrlq	xmm1, 0x1a
	paddq	xmm12, xmm1
	movaps	xmm1, xmm12
	psrlq	xmm0, 0x19
	pand	xmm12, [.packedmask25]
	pmuludq	xmm0, [.packednineteen]
	paddq	xmm5, xmm0
	movaps	xmm8, [.packedmask26]
	psrlq	xmm1, 0x19
	paddq	xmm13, xmm1
	pand	xmm2, [.packedmask26]
	movaps	xmm1, xmm5
	movaps	xmm0, xmm13
	pand	xmm5, [.packedmask26]
	psrlq	xmm1, 0x1a
	psrlq	xmm0, 0x1a
	paddq	xmm0, xmm4
	movaps	[rsp+0x10], xmm0
	pand	xmm13, [.packedmask26]
	pmuludq	xmm0, [.packed121666121665]
	movaps	xmm11, xmm2
	pand	xmm8, xmm3
	movaps	xmm3, xmm12
	pand	xmm10, [.packedmask25]
	pmuludq	xmm11, [.packed121666121665]
	movaps	xmm4, xmm5
	pand	xmm15, [.packedmask25]
	movaps	[rsp], xmm3
	pmuludq	xmm4, [.packed121666121665]
	pand	xmm14, [.packedmask26]
	movaps	[rsp+0x20], xmm8
	pand	xmm9, xmm6
	paddq	xmm15, xmm1
	movaps	xmm6, xmm10
	movaps	xmm1, xmm15
	movaps	xmm10, xmm8
	movaps	xmm7, xmm9
	movaps	xmm9, xmm3
	pmuludq	xmm1, [.packed121666121665]
	movaps	xmm3, xmm13
	pmuludq	xmm10, [.packed121666121665]
	pmuludq	xmm9, [.packed121666121665]
	movaps	xmm8, xmm7
	pmuludq	xmm3, [.packed121666121665]
	movaps	[rsp+0x30], xmm7
	movaps	xmm7, xmm14
	pmuludq	xmm8, [.packed121666121665]
	movaps	[rsp+0x70], xmm14
	movaps	xmm14, xmm6
	pmuludq	xmm7, [.packed121666121665]
	movaps	[rsp+0x80], xmm6
	
	pmuludq	xmm14, [.packed121666121665]
	movaps	xmm12, xmm3
	movaps	xmm6, xmm14
	movaps	xmm14, xmm4
	psrlq	xmm12, 0x1a
	paddq	xmm0, xmm12
	movaps	xmm12, xmm0
	psrlq	xmm14, 0x1a
	paddq	xmm1, xmm14
	movaps	xmm14, xmm1
	pand	xmm3, [.packedmask26]
	psrlq	xmm12, 0x19
	paddq	xmm10, xmm12
	psrlq	xmm14, 0x19
	movaps	xmm12, xmm10
	paddq	xmm11, xmm14
	movaps	xmm14, xmm11
	pand	xmm4, [.packedmask26]
	psrlq	xmm12, 0x1a
	paddq	xmm8, xmm12
	movaps	xmm12, xmm8
	psrlq	xmm14, 0x1a
	paddq	xmm9, xmm14
	movaps	xmm14, xmm9
	psrlq	xmm12, 0x19
	paddq	xmm7, xmm12
	pand	xmm1, [.packedmask25]
	psrlq	xmm14, 0x19
	paddq	xmm3, xmm14
	movaps	xmm14, xmm7
	pand	xmm11, [.packedmask26]
	psrlq	xmm14, 0x1a
	paddq	xmm6, xmm14
	movaps	xmm14, xmm6
	pand	xmm6, [.packedmask25]
	psrlq	xmm14, 0x19
	pmuludq	xmm14, [.packednineteen]
	paddq	xmm4, xmm14
	movaps	xmm14, xmm4
	pand	xmm4, [.packedmask26]
	psrlq	xmm14, 0x1a
	paddq	xmm1, xmm14
	movaps	xmm12, [rsp]
	pand	xmm9, [.packedmask25]
	movaps	[rsp+0x40], xmm6
	pand	xmm0, [.packedmask25]
	movaps	xmm14, xmm4
	punpckhqdq xmm4, xmm1
	movaps	xmm6, xmm3
	punpcklqdq xmm14, xmm1
	movaps	xmm1, xmm5
	pand	xmm3, [.packedmask26]
	psrlq	xmm6, 0x1a
	pslldq	xmm4, 0x4
	paddq	xmm0, xmm6
	punpckhqdq xmm1, xmm15
	movaps	xmm6, xmm14
	movaps	xmm14, xmm5
	pand	xmm10, [.packedmask26]
	pslldq	xmm6, 0x4
	por	xmm1, xmm4
	movaps	xmm4, xmm11
	punpckhqdq xmm11, xmm9
	punpcklqdq xmm14, xmm15
	punpcklqdq xmm4, xmm9
	movaps	xmm9, xmm2
	pslldq	xmm11, 0x4
	pand	xmm8, [.packedmask25]
	punpckhqdq xmm9, xmm12
	por	xmm14, xmm6
	movaps	xmm6, xmm4
	movaps	xmm4, xmm2
	por	xmm9, xmm11
	movaps	xmm11, xmm3
	pslldq	xmm6, 0x4
	punpcklqdq xmm4, xmm12
	paddd	xmm14, [.packed32zeromodp0]
	psubd	xmm14, xmm1
	punpcklqdq xmm11, xmm0
	movaps	xmm12, [rsp+0x10]
	punpckhqdq xmm3, xmm0
	por	xmm4, xmm6
	movaps	xmm0, xmm13
	movaps	xmm6, xmm11
	movaps	xmm11, xmm13
	punpckhqdq xmm0, xmm12
	pslldq	xmm3, 0x4
	pslldq	xmm6, 0x4
	pand	xmm7, [.packedmask26]
	paddd	xmm4, [.packed32zeromodp1]
	psubd	xmm4, xmm9
	movaps	xmm1, xmm4
	punpcklqdq xmm11, xmm12
	movaps	xmm12, xmm0
	movaps	xmm0, xmm10
	por	xmm12, xmm3
	por	xmm11, xmm6
	movaps	xmm6, [rsp+0x20]
	punpcklqdq xmm0, xmm8
	movaps	xmm3, xmm6
	punpckhqdq xmm10, xmm8
	movaps	xmm8, xmm7
	paddd	xmm11, [.packed32zeromodp1]
	psubd	xmm11, xmm12
	punpcklqdq xmm3, [rsp+0x30]
	pslldq	xmm0, 0x4
	pslldq	xmm10, 0x4
	punpcklqdq xmm8, [rsp+0x40]
	punpckhqdq xmm7, [rsp+0x40]
	por	xmm3, xmm0
	movaps	xmm0, xmm6
	pslldq	xmm8, 0x4
	punpckhqdq xmm0, [rsp+0x30]
	paddd	xmm3, [.packed32zeromodp1]
	pslldq	xmm7, 0x4
	por	xmm0, xmm10
	movaps	xmm10, [rsp+0x70]
	movaps	xmm6, xmm10
	psubd	xmm3, xmm0
	punpcklqdq xmm1, xmm3
	punpcklqdq xmm6, [rsp+0x80]
	
	movaps	xmm9, xmm1
	punpckhqdq xmm4, xmm3
	movaps	xmm3, xmm9
	pand	xmm9, [.packedmask26262626]
	por	xmm6, xmm8
	psrld	xmm3, 0x1a
	movaps	xmm8, xmm10
	movaps	xmm0, xmm3
	punpckhqdq xmm8, [rsp+0x80]
	
	paddd	xmm0, xmm4
	paddd	xmm6, [.packed32zeromodp1]
	movaps	xmm3, xmm0
	pand	xmm0, [.packedmask25252525]
	psrld	xmm3, 0x19
	por	xmm8, xmm7
	movaps	xmm7, xmm14
	punpckhqdq xmm14, xmm11
	punpcklqdq xmm7, xmm11
	psubd	xmm6, xmm8
	movaps	xmm1, xmm7
	pand	xmm7, [.packedmask26262626]
	psrld	xmm1, 0x1a
	paddd	xmm1, xmm14
	movaps	xmm4, xmm1
	pand	xmm1, [.packedmask25252525]
	movaps	xmm14, xmm15
	psrld	xmm4, 0x19
	movaps	xmm8, xmm4
	movaps	xmm4, xmm3
	paddd	xmm8, xmm9
	psrldq	xmm4, 0x8
	paddd	xmm6, xmm4
	movaps	xmm4, xmm3
	pslldq	xmm4, 0x8
	movaps	xmm11, xmm4
	movaps	xmm4, xmm8
	punpckhqdq xmm8, xmm0
	paddd	xmm11, xmm7
	movaps	xmm3, xmm11
	punpcklqdq xmm4, xmm0
	punpcklqdq xmm3, xmm1
	punpckhqdq xmm11, xmm1
	pshufd	xmm9, xmm4, 0x50
	pshufd	xmm12, xmm3, 0x50
	pshufd	xmm3, xmm3, 0xfa
	pshufd	xmm1, xmm4, 0xfa
	movaps	xmm4, xmm5
	pshufd	xmm0, xmm11, 0x50
	punpckhqdq xmm15, xmm3
	punpcklqdq xmm14, xmm3
	punpcklqdq xmm4, xmm12
	movaps	xmm3, xmm15
	movaps	xmm15, xmm2
	punpckhqdq xmm2, xmm9
	punpcklqdq xmm15, xmm9
	movaps	xmm9, [rsp]
	punpckhqdq xmm5, xmm12
	movaps	xmm12, xmm14
	movaps	xmm14, xmm9
	punpckhqdq xmm9, xmm1
	punpcklqdq xmm14, xmm1
	pshufd	xmm11, xmm11, 0xfa
	movaps	xmm1, xmm9
	movaps	xmm9, xmm13
	punpckhqdq xmm13, xmm0
	punpcklqdq xmm9, xmm0
	pshufd	xmm10, xmm8, 0x50
	movaps	[rsp], xmm13
	pshufd	xmm8, xmm8, 0xfa
	pshufd	xmm7, xmm6, 0x50
	pshufd	xmm6, xmm6, 0xfa
	movaps	xmm0, [rsp+0x10]
	movaps	xmm13, xmm0
	punpcklqdq xmm13, xmm11
	movaps	[rsp+0x40], xmm13
	movaps	xmm13, xmm0
	movaps	xmm0, xmm12
	punpckhqdq xmm13, xmm11
	movaps	xmm11, [rsp+0x20]
	pslld	xmm0, 1
	movaps	[rsp+0x10], xmm13
	movaps	xmm13, xmm11
	punpckhqdq xmm11, xmm10
	punpcklqdq xmm13, xmm10
	movaps	[rsp+0x20], xmm11
	movaps	[rsp+0x50], xmm13
	movaps	xmm11, [rsp+0x30]
	movaps	xmm10, xmm11
	punpckhqdq xmm11, xmm8
	punpcklqdq xmm10, xmm8
	movaps	xmm8, [rsp+0x70]
	movaps	xmm13, xmm11
	movaps	[rsp+0x60], xmm10
	movaps	xmm10, xmm8
	punpckhqdq xmm8, xmm7
	punpcklqdq xmm10, xmm7
	movaps	xmm11, [rsp+0x40]
	movaps	[rsp+0x70], xmm10
	movaps	[rsp+0x90], xmm8
	
	movaps	xmm8, xmm15
	pmuludq	xmm8, xmm5
	movaps	xmm10, [rsp+0x80]
	
	movaps	xmm7, xmm10
	punpcklqdq xmm7, xmm6
	movaps	[rsp+0xd0], xmm7
	
	movaps	xmm7, xmm10
	movaps	xmm10, xmm4
	punpckhqdq xmm7, xmm6
	movaps	xmm6, xmm4
	pmuludq	xmm10, xmm3
	pmuludq	xmm6, xmm5
	movaps	[rsp+0x80], xmm6
	
	movaps	xmm6, xmm12
	movaps	[rsp+0x30], xmm7
	movaps	xmm7, xmm0
	pmuludq	xmm6, xmm5
	paddq	xmm6, xmm10
	movaps	[rsp+0xa0], xmm6
	
	pmuludq	xmm7, xmm3
	movaps	xmm6, xmm0
	paddq	xmm7, xmm8
	movaps	xmm8, xmm15
	movaps	xmm0, xmm7
	movaps	xmm7, xmm14
	movaps	xmm10, xmm4
	pmuludq	xmm8, xmm3
	pmuludq	xmm7, xmm5
	paddq	xmm8, xmm7
	movaps	xmm7, xmm12
	pmuludq	xmm10, xmm2
	paddq	xmm0, xmm10
	movaps	xmm10, xmm4
	movaps	[rsp+0xb0], xmm0
	
	pmuludq	xmm7, xmm2
	paddq	xmm7, xmm8
	movaps	xmm0, xmm7
	pmuludq	xmm10, xmm1
	movaps	xmm7, xmm14
	paddq	xmm0, xmm10
	movaps	xmm10, xmm9
	pslld	xmm7, 1
	movaps	xmm8, xmm7
	pmuludq	xmm10, xmm5
	movaps	[rsp+0xc0], xmm0
	
	pmuludq	xmm8, xmm3
	paddq	xmm8, xmm10
	movaps	xmm10, xmm15
	pmuludq	xmm10, xmm2
	paddq	xmm10, xmm8
	movaps	xmm8, xmm6
	movaps	xmm0, [rsp]
	pmuludq	xmm8, xmm1
	paddq	xmm8, xmm10
	movaps	xmm10, xmm4
	pmuludq	xmm10, xmm0
	paddq	xmm8, xmm10
	movaps	xmm10, xmm9
	movaps	[rsp+0xf0], xmm8
	
	movaps	xmm8, xmm11
	pmuludq	xmm10, xmm3
	pslld	xmm11, 1
	pmuludq	xmm8, xmm5
	paddq	xmm10, xmm8
	movaps	xmm8, xmm14
	pmuludq	xmm8, xmm2
	paddq	xmm8, xmm10
	movaps	xmm10, xmm15
	pmuludq	xmm10, xmm1
	paddq	xmm10, xmm8
	movaps	xmm8, xmm12
	pmuludq	xmm8, [rsp]
	paddq	xmm8, xmm10
	movaps	xmm10, [rsp+0x10]
	movaps	xmm0, xmm8
	movaps	xmm8, xmm11
	pmuludq	xmm11, xmm1
	pmuludq	xmm10, xmm4
	paddq	xmm0, xmm10
	movaps	xmm10, [rsp+0x50]
	pmuludq	xmm8, xmm3
	pmuludq	xmm10, xmm5
	paddq	xmm8, xmm10
	movaps	xmm10, xmm9
	movaps	[rsp+0x100], xmm0
	
	pmuludq	xmm10, xmm2
	paddq	xmm10, xmm8
	movaps	xmm8, xmm7
	pmuludq	xmm8, xmm1
	paddq	xmm8, xmm10
	movaps	xmm10, xmm15
	movaps	xmm0, [rsp+0x20]
	pmuludq	xmm10, [rsp]
	paddq	xmm10, xmm8
	movaps	xmm8, [rsp+0x10]
	pmuludq	xmm8, xmm6
	paddq	xmm8, xmm10
	movaps	xmm10, xmm0
	pmuludq	xmm6, xmm13
	pmuludq	xmm10, xmm4
	paddq	xmm8, xmm10
	movaps	xmm10, [rsp+0x50]
	movaps	[rsp+0x110], xmm8
	
	pmuludq	xmm10, xmm3
	movaps	xmm8, [rsp+0x60]
	pmuludq	xmm8, xmm5
	paddq	xmm10, xmm8
	movaps	xmm8, [rsp+0x40]
	pmuludq	xmm8, xmm2
	paddq	xmm8, xmm10
	movaps	xmm10, xmm9
	pmuludq	xmm10, xmm1
	paddq	xmm10, xmm8
	movaps	xmm8, xmm14
	pmuludq	xmm8, [rsp]
	paddq	xmm8, xmm10
	movaps	xmm10, [rsp+0x10]
	pmuludq	xmm10, xmm15
	paddq	xmm10, xmm8
	movaps	xmm8, xmm0
	pmuludq	xmm8, xmm12
	paddq	xmm8, xmm10
	movaps	xmm10, xmm4
	movaps	xmm0, xmm8
	pmuludq	xmm10, xmm13
	paddq	xmm0, xmm10
	movaps	[rsp+0x120], xmm0
	
	movaps	xmm10, [rsp+0x60]
	movaps	xmm8, [rsp+0x70]
	pslld	xmm10, 1
	pmuludq	xmm10, xmm3
	pmuludq	xmm8, xmm5
	paddq	xmm10, xmm8
	movaps	xmm8, [rsp+0x50]
	pmuludq	xmm5, [rsp+0xd0]
	
	pmuludq	xmm8, xmm2
	paddq	xmm8, xmm10
	paddq	xmm11, xmm8
	movaps	xmm8, xmm9
	movaps	xmm0, [rsp+0x10]
	pmuludq	xmm8, [rsp]
	paddq	xmm8, xmm11
	movaps	xmm11, [rsp+0x20]
	pmuludq	xmm7, xmm0
	paddq	xmm7, xmm8
	movaps	xmm8, xmm11
	pmuludq	xmm8, xmm15
	paddq	xmm8, xmm7
	paddq	xmm6, xmm8
	movaps	xmm8, [rsp+0x90]
	
	movaps	xmm10, xmm8
	movaps	xmm7, [rsp+0x50]
	pmuludq	xmm10, xmm4
	paddq	xmm6, xmm10
	movaps	[rsp+0x130], xmm6
	
	movaps	xmm6, [rsp+0x70]
	pmuludq	xmm6, xmm3
	paddq	xmm6, xmm5
	movaps	xmm5, [rsp+0x60]
	movaps	xmm10, [rsp+0x40]
	pmuludq	xmm5, xmm2
	paddq	xmm5, xmm6
	movaps	xmm6, xmm7
	pmuludq	xmm6, xmm1
	paddq	xmm6, xmm5
	movaps	xmm5, xmm10
	pmuludq	xmm10, [.packednineteen]
	pmuludq	xmm5, [rsp]
	paddq	xmm5, xmm6
	movaps	xmm6, xmm0
	movaps	xmm0, [rsp+0x30]
	pmuludq	xmm6, xmm9
	paddq	xmm6, xmm5
	movaps	xmm5, xmm11
	pmuludq	xmm4, xmm0
	movaps	xmm11, xmm8
	pmuludq	xmm9, [.packednineteen]
	pmuludq	xmm5, xmm14
	paddq	xmm5, xmm6
	movaps	xmm6, xmm15
	pmuludq	xmm14, [.packednineteen]
	movaps	xmm0, xmm11
	pmuludq	xmm15, [.packednineteen]
	pmuludq	xmm6, xmm13
	paddq	xmm6, xmm5
	movaps	xmm5, xmm8
	pmuludq	xmm0, xmm15
	movaps	xmm11, xmm0
	movaps	xmm8, [rsp+0x60]
	pmuludq	xmm5, xmm12
	paddq	xmm5, xmm6
	paddq	xmm5, xmm4
	movaps	xmm4, xmm14
	pmuludq	xmm12, [.packednineteen]
	pslld	xmm12, 1
	pmuludq	xmm12, [rsp+0x30]
	paddq	xmm11, xmm12
	pmuludq	xmm8, [.packednineteen]
	pslld	xmm4, 1
	movaps	[rsp+0xe0], xmm4
	
	movaps	xmm4, xmm10
	movaps	xmm6, xmm8
	pslld	xmm4, 1
	movaps	[rsp+0x40], xmm4
	pslld	xmm6, 1
	movaps	[rsp+0x140], xmm5
	
	movaps	xmm5, xmm7
	movaps	[rsp+0x50], xmm6
	pmuludq	xmm5, [.packednineteen]
	movaps	xmm0, [rsp+0xe0]
	
	pmuludq	xmm0, xmm13
	movaps	xmm12, xmm0
	movaps	xmm0, [rsp+0x20]
	paddq	xmm12, xmm11
	pmuludq	xmm0, xmm9
	movaps	xmm11, xmm0
	movaps	xmm0, [rsp+0x40]
	paddq	xmm11, xmm12
	pmuludq	xmm0, [rsp+0x10]
	movaps	xmm12, xmm0
	movaps	xmm0, xmm5
	movaps	xmm4, [rsp+0x70]
	paddq	xmm12, xmm11
	pmuludq	xmm0, [rsp]
	movaps	xmm11, xmm0
	pmuludq	xmm4, [.packednineteen]
	movaps	xmm0, [rsp+0x50]
	paddq	xmm11, xmm12
	pmuludq	xmm0, xmm1
	movaps	xmm7, [rsp+0xd0]
	
	movaps	xmm12, xmm0
	movaps	xmm0, xmm4
	pmuludq	xmm7, [.packednineteen]
	movaps	xmm6, xmm7
	paddq	xmm12, xmm11
	pmuludq	xmm0, xmm2
	movaps	xmm11, xmm0
	movaps	xmm0, xmm9
	pslld	xmm6, 1
	pmuludq	xmm3, xmm6
	paddq	xmm11, xmm12
	paddq	xmm3, xmm11
	paddq	xmm3, [rsp+0x80]
	
	pmuludq	xmm0, xmm13
	movaps	[rsp+0x80], xmm3
	
	pmuludq	xmm15, [rsp+0x30]
	movaps	xmm3, xmm0
	pmuludq	xmm2, xmm7
	movaps	xmm12, [rsp+0x90]
	
	pmuludq	xmm14, xmm12
	paddq	xmm14, xmm15
	paddq	xmm3, xmm14
	movaps	xmm14, [rsp+0x20]
	movaps	xmm15, [rsp+0x10]
	movaps	xmm0, xmm14
	pmuludq	xmm0, xmm10
	movaps	xmm11, xmm0
	movaps	xmm0, xmm15
	paddq	xmm11, xmm3
	pmuludq	xmm0, xmm5
	movaps	xmm3, xmm0
	movaps	xmm0, xmm4
	paddq	xmm3, xmm11
	movaps	xmm11, xmm8
	pmuludq	xmm0, xmm1
	pmuludq	xmm1, xmm6
	pmuludq	xmm11, [rsp]
	paddq	xmm11, xmm3
	movaps	xmm3, xmm0
	movaps	xmm0, [rsp+0x40]
	paddq	xmm3, xmm11
	paddq	xmm2, xmm3
	paddq	xmm2, [rsp+0xa0]
	
	movaps	[rsp+0xa0], xmm2
	
	pmuludq	xmm0, xmm13
	movaps	xmm11, xmm12
	pmuludq	xmm10, xmm11
	movaps	xmm2, [rsp+0xe0]
	
	pmuludq	xmm2, [rsp+0x30]
	movaps	xmm3, xmm2
	movaps	xmm2, xmm12
	movaps	xmm12, [rsp+0x30]
	pmuludq	xmm2, xmm9
	paddq	xmm2, xmm3
	movaps	xmm3, xmm0
	movaps	xmm0, [rsp+0x50]
	pmuludq	xmm9, xmm12
	paddq	xmm10, xmm9
	paddq	xmm3, xmm2
	movaps	xmm2, xmm14
	pmuludq	xmm0, xmm15
	pmuludq	xmm2, xmm5
	paddq	xmm2, xmm3
	movaps	xmm3, xmm0
	movaps	xmm0, xmm15
	paddq	xmm3, xmm2
	movaps	xmm2, xmm4
	pmuludq	xmm0, xmm4
	pmuludq	xmm2, [rsp]
	paddq	xmm2, xmm3
	paddq	xmm1, xmm2
	movaps	xmm2, xmm5
	paddq	xmm1, [rsp+0xb0]
	
	movaps	[rsp+0xb0], xmm1
	
	pmuludq	xmm2, xmm13
	movaps	xmm1, xmm2
	movaps	xmm2, xmm14
	movaps	xmm3, xmm14
	paddq	xmm1, xmm10
	movaps	xmm14, xmm15
	pmuludq	xmm2, xmm8
	movaps	xmm10, [rsp+0xc0]
	
	paddq	xmm2, xmm1
	movaps	xmm1, xmm0
	pmuludq	xmm14, xmm6
	movaps	xmm15, xmm12
	movaps	xmm0, [rsp]
	paddq	xmm1, xmm2
	pmuludq	xmm0, xmm7
	movaps	xmm2, [rsp+0x40]
	paddq	xmm0, xmm1
	paddq	xmm10, xmm0
	movaps	xmm0, xmm11
	movaps	[rsp+0xc0], xmm10
	
	pmuludq	xmm2, xmm12
	movaps	xmm10, xmm11
	pmuludq	xmm0, xmm5
	paddq	xmm0, xmm2
	pmuludq	xmm5, xmm15
	pmuludq	xmm8, xmm10
	paddq	xmm8, xmm5
	movaps	xmm11, [rsp+0x50]
	movaps	xmm2, xmm11
	pmuludq	xmm2, xmm13
	movaps	xmm1, xmm2
	movaps	xmm2, xmm3
	paddq	xmm1, xmm0
	movaps	xmm0, xmm3
	pmuludq	xmm2, xmm7
	pmuludq	xmm7, xmm10
	pmuludq	xmm0, xmm4
	paddq	xmm0, xmm1
	movaps	xmm1, xmm14
	movaps	xmm14, xmm15
	movaps	xmm15, [rsp+0x100]
	
	paddq	xmm1, xmm0
	movaps	xmm0, [rsp+0xf0]
	
	paddq	xmm0, xmm1
	movaps	xmm1, xmm2
	movaps	xmm12, xmm0
	movaps	xmm0, xmm4
	movaps	xmm2, xmm11
	pmuludq	xmm0, xmm13
	pmuludq	xmm13, xmm6
	paddq	xmm0, xmm8
	paddq	xmm1, xmm0
	movaps	xmm0, xmm10
	paddq	xmm15, xmm1
	movaps	xmm1, xmm13
	movaps	xmm13, [rsp+0x110]
	
	pmuludq	xmm2, xmm14
	pmuludq	xmm0, xmm4
	paddq	xmm0, xmm2
	pmuludq	xmm4, xmm14
	paddq	xmm1, xmm0
	paddq	xmm13, xmm1
	movaps	xmm1, [rsp+0x80]
	
	paddq	xmm7, xmm4
	movaps	xmm2, xmm12
	pmuludq	xmm6, xmm14
	movaps	xmm3, xmm1
	movaps	xmm4, [rsp+0x130]
	
	paddq	xmm7, [rsp+0x120]
	
	pand	xmm1, [.packedmask26]
	psrlq	xmm2, 0x1a
	paddq	xmm2, xmm15
	movaps	xmm0, [.packedmask26]
	paddq	xmm4, xmm6
	movaps	xmm5, xmm2
	movaps	xmm14, xmm4
	psrlq	xmm3, 0x1a
	paddq	xmm3, [rsp+0xa0]
	
	movaps	xmm4, xmm3
	pand	xmm0, xmm12
	movaps	xmm2, xmm3
	psrlq	xmm4, 0x19
	movaps	xmm3, xmm5
	paddq	xmm4, [rsp+0xb0]
	
	movaps	xmm8, xmm4
	pand	xmm2, [.packedmask25]
	psrlq	xmm4, 0x1a
	psrlq	xmm3, 0x19
	paddq	xmm4, [rsp+0xc0]
	
	paddq	xmm3, xmm13
	pand	xmm8, [.packedmask26]
	movaps	xmm9, xmm3
	psrlq	xmm3, 0x1a
	paddq	xmm3, xmm7
	movaps	xmm10, xmm4
	movaps	xmm6, xmm3
	psrlq	xmm4, 0x19
	pand	xmm5, [.packedmask25]
	paddq	xmm0, xmm4
	psrlq	xmm3, 0x19
	paddq	xmm3, xmm14
	movaps	xmm15, xmm3
	psrlq	xmm3, 0x1a
	paddq	xmm3, [rsp+0x140]
	
	movaps	xmm4, xmm3
	psrlq	xmm3, 0x19
	pmuludq	xmm3, [.packednineteen]
	paddq	xmm1, xmm3
	movaps	xmm7, xmm1
	pand	xmm1, [.packedmask26]
	movaps	xmm3, xmm0
	psrlq	xmm7, 0x1a
	paddq	xmm2, xmm7
	movaps	xmm7, xmm8
	pand	xmm10, [.packedmask25]
	psrlq	xmm3, 0x1a
	paddq	xmm5, xmm3
	pand	xmm9, [.packedmask26]
	movaps	xmm3, xmm1
	pand	xmm0, [.packedmask26]
	pand	xmm6, [.packedmask25]
	punpckldq xmm7, xmm10
	punpckldq xmm3, xmm2
	movaps	xmm13, xmm0
	pand	xmm15, [.packedmask26]
	punpcklqdq xmm3, xmm7
	movaps	xmm7, xmm9
	pand	xmm4, [.packedmask25]
	punpckldq xmm7, xmm6
	punpckldq xmm13, xmm5
	punpckhdq xmm8, xmm10
	punpckhdq xmm1, xmm2
	punpckhdq xmm9, xmm6
	punpckhdq xmm0, xmm5
	punpcklqdq xmm13, xmm7
	movaps	xmm7, xmm15
	punpcklqdq xmm1, xmm8
	punpckldq xmm7, xmm4
	punpcklqdq xmm0, xmm9
	punpckhdq xmm15, xmm4
	je     .lowloop_done
	movaps	xmm12, xmm0
	movaps	xmm2, xmm1
	movaps	xmm4, xmm7
	jmp	.lowloop
calign
	; Exit of .lowloop. Six result vectors are written to the stack frame:
	;   xmm3 -> [rsp+0x310], xmm13 -> [rsp+0x320], xmm7 -> [rsp+0x330]
	;   xmm1 -> [rsp+0x370], xmm0  -> [rsp+0x380], xmm15 -> [rsp+0x390]
	; and curve25519$square_times is called with rdi = rsp+0x3d0,
	; rsi = rsp+0x370, edx = 1.
	; NOTE(review): presumably square_times(rdi = dest, rsi = src, edx = count),
	; and 0x310 / 0x370 are the x and z results of the ladder (nqx / nqz in the
	; upstream donna source) -- confirm against the helper and the loop above.
.lowloop_done:
	movaps	xmm2, xmm1
	lea	rsi, [rsp+0x370]
	
	movaps	xmm12, xmm0
	lea	rdi, [rsp+0x3d0]
	
	movaps	xmm0, xmm7
	mov	edx, 1
	movaps	[rsp+0x310], xmm3
	movaps	[rsp+0x330], xmm0
	movaps	[rsp+0x370], xmm2
	movaps	[rsp+0x380], xmm12
	movaps	[rsp+0x390], xmm15
	movaps	[rsp+0x320], xmm13
	call	curve25519$square_times
	; Load the 48 bytes at [rsp+0x3d0] and spread them into the registers the
	; inline squaring loop below carries: xmm3/xmm15 from the first row,
	; xmm12/xmm10 from the second, xmm14 from the third.
	; pshufd 0xd8 selects dwords (0,2,1,3); pshufd 0xfa selects dwords (2,2,3,3).
	pshufd	xmm9, [rsp+0x3f0], 0xd8
	
	mov	eax, 2				; pass count for .squaretimes2
	movaps	xmm0, [rsp+0x3d0]
	
	movaps	xmm14, xmm9
	pshufd	xmm2, xmm0, 0xd8
	pshufd	xmm15, xmm0, 0xfa
	movaps	xmm0, [rsp+0x3e0]
	
	pshufd	xmm1, xmm0, 0xd8
	pshufd	xmm0, xmm0, 0xfa
	movaps	xmm3, xmm2
	movaps	xmm12, xmm1
	movaps	xmm10, xmm0
calign
	; Inline squaring loop, entered with eax = 2 (two passes).
	; Loop-carried state: xmm3, xmm15, xmm12, xmm10, xmm14 hold the operand on
	; entry to a pass and the carried/masked result at the end of it; every
	; product operand in the body is derived from these five registers.
	; [rsp] .. [rsp+0xbf] is scratch: re-arranged, doubled (pslld 1) and
	; constant-scaled (.packednineteen / .packed3819) copies of the operand.
	; 32x32->64 products (pmuludq) are summed with paddq in xmm0-xmm3 and xmm8.
.squaretimes2:
	movaps	xmm0, xmm15
	sub	eax, 1				; ZF from here is consumed by the jne at the loop bottom
	movaps	xmm2, xmm3
	pslldq	xmm0, 0x8
	pshufd	xmm1, xmm15, 0
	movaps	xmm4, xmm15
	punpckhqdq xmm2, xmm0
	pshufd	xmm0, xmm3, 0
	movaps	xmm7, [.sse2_top64bitmask]
	pshufd	xmm6, xmm3, 0xaa
	movaps	[rsp], xmm2
	pshufd	xmm5, xmm15, 0xaa
	pslld	xmm6, 1
	movaps	xmm2, [.sse2_top64bitmask]
	pslld	xmm5, 1
	pand	xmm2, xmm0
	paddq	xmm2, xmm0
	movaps	xmm0, [.sse2_top64bitmask]
	pshufd	xmm8, xmm2, 0xe6
	pmuludq	xmm2, xmm3
	movaps	xmm3, xmm12
	pand	xmm0, xmm1
	pmuludq	xmm3, xmm8
	paddq	xmm0, xmm1
	movaps	xmm1, xmm12
	pshufd	xmm13, xmm0, 0xe6
	pmuludq	xmm0, xmm15
	paddq	xmm3, xmm0
	pslldq	xmm1, 0x8
	punpckhqdq xmm4, xmm1
	movaps	xmm0, xmm15
	pshufd	xmm1, xmm12, 0
	pslld	xmm0, 1
	movaps	[rsp+0xb0], xmm0
	
	movaps	xmm0, xmm10
	pand	xmm7, xmm1
	movaps	[rsp+0x10], xmm4
	pmuludq	xmm0, xmm8
	movaps	xmm4, xmm10
	movaps	xmm11, xmm7
	movaps	xmm7, xmm12
	paddq	xmm11, xmm1
	movaps	xmm1, xmm10
	pslldq	xmm1, 0x8
	punpckhqdq xmm7, xmm1
	movaps	xmm1, xmm7
	pmuludq	xmm1, [.packednineteen]
	pshufd	xmm9, xmm1, 0x54
	pshufd	xmm1, xmm1, 0xee
	movaps	[rsp+0x20], xmm9
	movaps	xmm9, xmm10
	movaps	[rsp+0x30], xmm1
	movaps	xmm1, xmm14
	pslldq	xmm1, 0x8
	punpckhqdq xmm9, xmm1
	pxor	xmm1, xmm1
	movaps	[rsp+0x40], xmm9
	pshufd	xmm9, xmm10, 0xaa
	punpcklqdq xmm4, xmm1
	pmuludq	xmm9, [.packed3819]
	pshufd	xmm1, xmm9, 0xfe
	movaps	[rsp+0x50], xmm4
	pxor	xmm4, xmm4
	movaps	[rsp+0x60], xmm1
	movaps	xmm1, xmm14
	punpcklqdq xmm1, xmm4
	movaps	[rsp+0x70], xmm1
	pshufd	xmm1, xmm14, 0
	movaps	xmm4, xmm1
	pshufd	xmm1, xmm14, 0xfe
	pmuludq	xmm4, [.packednineteen]
	movaps	[rsp+0x80], xmm4
	
	pshufd	xmm4, xmm14, 0xaa
	pslld	xmm1, 1
	movaps	[rsp+0x90], xmm1
	
	pmuludq	xmm4, [.packed3819]
	pshufd	xmm1, xmm4, 0xaa
	movaps	[rsp+0xa0], xmm1
	
	movaps	xmm1, xmm15
	movaps	xmm15, xmm12
	pmuludq	xmm1, xmm8
	pmuludq	xmm8, xmm14
	movaps	xmm14, xmm10
	pmuludq	xmm15, xmm13
	pmuludq	xmm13, xmm10
	movaps	xmm10, xmm11
	paddq	xmm8, xmm13
	movaps	xmm11, [rsp+0x10]
	paddq	xmm0, xmm15
	pmuludq	xmm10, xmm12
	paddq	xmm8, xmm10
	pslld	xmm12, 1
	movaps	xmm10, [.sse2_bot64bitmask]
	pslld	xmm14, 1
	pand	xmm10, xmm6
	movaps	xmm15, [rsp]
	paddq	xmm10, xmm6
	pmuludq	xmm6, xmm15
	paddq	xmm1, xmm6
	movaps	xmm6, xmm11
	movaps	xmm13, [.sse2_bot64bitmask]
	pmuludq	xmm6, xmm10
	paddq	xmm3, xmm6
	movaps	xmm6, xmm7
	pand	xmm13, xmm5
	pmuludq	xmm6, xmm10
	paddq	xmm0, xmm6
	movaps	xmm6, xmm15
	movaps	xmm15, [rsp+0xb0]
	
	paddq	xmm13, xmm5
	pslld	xmm6, 1
	pmuludq	xmm5, xmm11
	pmuludq	xmm6, xmm4
	paddq	xmm0, xmm5
	pslld	xmm11, 1
	movaps	xmm5, [rsp+0x40]
	paddq	xmm2, xmm6
	pmuludq	xmm13, xmm7
	movaps	xmm6, xmm11
	pslld	xmm7, 1
	pmuludq	xmm11, xmm4
	pmuludq	xmm10, xmm5
	paddq	xmm8, xmm10
	paddq	xmm8, xmm13
	pmuludq	xmm6, xmm9
	movaps	xmm13, [rsp+0x30]
	paddq	xmm2, xmm6
	pmuludq	xmm9, xmm7
	pslld	xmm5, 1
	paddq	xmm1, xmm11
	movaps	xmm6, [rsp+0x20]
	paddq	xmm1, xmm9
	pmuludq	xmm6, xmm7
	pmuludq	xmm7, xmm4
	paddq	xmm2, xmm6
	paddq	xmm3, xmm7
	movaps	xmm6, xmm13
	pmuludq	xmm4, xmm5
	movaps	xmm7, [rsp+0x80]
	
	paddq	xmm0, xmm4
	pmuludq	xmm6, xmm12
	pmuludq	xmm15, xmm7
	paddq	xmm2, xmm15
	paddq	xmm2, xmm6
	movaps	xmm6, [rsp+0x50]
	pmuludq	xmm12, xmm7
	paddq	xmm1, xmm12
	movaps	xmm12, xmm2
	pmuludq	xmm14, xmm7
	pmuludq	xmm6, xmm13
	movaps	xmm4, [rsp+0x70]
	paddq	xmm1, xmm6
	movaps	xmm10, xmm1
	movaps	xmm6, [rsp+0x60]
	pmuludq	xmm4, xmm7
	paddq	xmm0, xmm4
	punpcklqdq xmm10, xmm0
	pmuludq	xmm6, xmm5
	movaps	xmm4, [rsp+0x90]
	
	paddq	xmm3, xmm6
	paddq	xmm3, xmm14
	punpcklqdq xmm12, xmm3
	punpckhqdq xmm2, xmm3
	pmuludq	xmm4, [rsp+0xa0]
	
	paddq	xmm8, xmm4
	punpckhqdq xmm1, xmm0
	; Carry propagation: carries are extracted with psrlq by 0x1a / 0x19 and
	; added to the next lane, low bits are kept with .packedmask26 /
	; .packedmask25, and the carry out of the top lane is multiplied by
	; .packednineteen before being added back into xmm12.
	movaps	xmm14, xmm8
	movaps	xmm0, xmm10
	pand	xmm10, [.packedmask26]
	movaps	xmm3, xmm12
	psrlq	xmm0, 0x1a
	paddq	xmm1, xmm0
	movaps	xmm0, xmm1
	punpcklqdq xmm14, xmm8
	psrlq	xmm3, 0x1a
	paddq	xmm2, xmm3
	movaps	xmm3, xmm2
	punpckhqdq xmm8, xmm8
	psrlq	xmm0, 0x19
	pand	xmm12, [.packedmask26]
	psrlq	xmm3, 0x19
	paddq	xmm14, xmm0
	paddq	xmm10, xmm3
	movaps	xmm3, xmm0
	movaps	xmm0, xmm14
	pand	xmm2, [.packedmask25]
	pslldq	xmm3, 0x8
	psrlq	xmm0, 0x1a
	paddq	xmm8, xmm0
	movaps	xmm0, xmm8
	pand	xmm1, [.packedmask25]
	psrlq	xmm0, 0x19
	pmuludq	xmm0, [.packednineteen]
	punpckhqdq xmm0, xmm3
	pand	xmm14, [.packedmask26]
	paddq	xmm12, xmm0
	movaps	xmm0, xmm10
	movaps	xmm3, xmm12
	pand	xmm10, [.packedmask26]
	psrlq	xmm0, 0x1a
	paddq	xmm1, xmm0
	pand	xmm12, [.packedmask26]
	psrlq	xmm3, 0x1a
	paddq	xmm2, xmm3
	pand	xmm8, [.packedmask25]
	; Re-form the loop-carried registers for the next pass.
	movaps	xmm15, xmm10
	punpckhqdq xmm10, xmm1
	movaps	xmm3, xmm12
	punpcklqdq xmm15, xmm1
	punpckhqdq xmm12, xmm2
	punpcklqdq xmm3, xmm2
	punpckhqdq xmm14, xmm8
	; Every instruction since the sub eax, 1 at the top is an SSE2 op that
	; leaves EFLAGS untouched, so this branches on that decrement.
	jne	.squaretimes2
	; .squaretimes2 has finished: repack the five working registers into three
	; 16-byte rows (pshufd 0xf8 / 0x8f, merged with por) and store them at
	; [rsp+0x400], [rsp+0x410], [rsp+0x420]. The lea instructions interleaved
	; with the shuffles load the arguments of the first call below.
	; NOTE(review): the "dest = a * b" annotations below read the helpers as
	; mul(rdi = dest, rsi, rdx = operands) and
	; square_times(rdi = dest, rsi = src, edx = repeat count). That is inferred
	; from the argument set-up and buffer reuse, and matches the inversion chain
	; (curve25519_recip) of the upstream donna source -- confirm against the
	; helper definitions.
	pshufd	xmm0, xmm12, 0xf8
	lea	rdx, [rsp+0x370]
	
	pshufd	xmm15, xmm15, 0x8f
	lea	rsi, [rsp+0x400]
	
	pshufd	xmm1, xmm10, 0x8f
	lea	rdi, [rsp+0x430]
	
	pshufd	xmm2, xmm3, 0xf8
	por	xmm1, xmm0
	por	xmm2, xmm15
	pshufd	xmm0, xmm14, 0xf8
	movaps	[rsp+0x410], xmm1
	movaps	[rsp+0x400], xmm2
	movaps	[rsp+0x420], xmm0
	call	curve25519$mul			; [0x430] = [0x400] * [0x370]
	lea	rdx, [rsp+0x3d0]
	lea	rsi, [rsp+0x430]
	mov	rdi, rdx
	call	curve25519$mul			; [0x3d0] = [0x430] * [0x3d0]
	lea	rsi, [rsp+0x3d0]
	lea	rdi, [rsp+0x400]
	mov	edx, 1
	call	curve25519$square_times		; [0x400] = [0x3d0] squared once
	lea	rdx, [rsp+0x430]
	lea	rsi, [rsp+0x400]
	mov	rdi, rdx
	call	curve25519$mul			; [0x430] = [0x400] * [0x430]
	lea	rsi, [rsp+0x430]
	lea	rdi, [rsp+0x460]
	mov	edx, 5
	call	curve25519$square_times		; [0x460] = [0x430] squared 5 times
	lea	rdx, [rsp+0x430]
	lea	rsi, [rsp+0x460]
	mov	rdi, rdx
	call	curve25519$mul			; [0x430] = [0x460] * [0x430]
	lea	rsi, [rsp+0x430]
	lea	rdi, [rsp+0x460]
	mov	edx, 10
	call	curve25519$square_times		; [0x460] = [0x430] squared 10 times
	lea	rdx, [rsp+0x430]
	lea	rsi, [rsp+0x460]
	lea	rdi, [rsp+0x490]
	call	curve25519$mul			; [0x490] = [0x460] * [0x430]
	; Load the 48 bytes at [rsp+0x490] into the loop-carried registers of the
	; next inline squaring loop (xmm3, xmm15, xmm12, xmm10, xmm14).
	; pshufd 0xd8 selects dwords (0,2,1,3); pshufd 0xfa selects dwords (2,2,3,3).
	movaps	xmm0, [rsp+0x490]
	
	mov	eax, 20				; pass count for .squaretimes20
	pshufd	xmm9, [rsp+0x4b0], 0xd8
	
	pshufd	xmm2, xmm0, 0xd8
	pshufd	xmm15, xmm0, 0xfa
	movaps	xmm0, [rsp+0x4a0]
	
	movaps	xmm14, xmm9
	pshufd	xmm1, xmm0, 0xd8
	movaps	xmm3, xmm2
	pshufd	xmm0, xmm0, 0xfa
	movaps	xmm12, xmm1
	movaps	xmm10, xmm0

calign
	; Inline squaring loop, entered with eax = 20 (twenty passes).
	; Loop-carried state: xmm3, xmm15, xmm12, xmm10, xmm14 hold the operand on
	; entry to a pass and the carried/masked result at the end of it; every
	; product operand in the body is derived from these five registers.
	; [rsp] .. [rsp+0xbf] is scratch: re-arranged, doubled (pslld 1) and
	; constant-scaled (.packednineteen / .packed3819) copies of the operand.
	; 32x32->64 products (pmuludq) are summed with paddq in xmm0-xmm3 and xmm8.
.squaretimes20:
	movaps	xmm0, xmm15
	sub	eax, 1				; ZF from here is consumed by the jne at the loop bottom
	movaps	xmm2, xmm3
	pslldq	xmm0, 0x8
	pshufd	xmm1, xmm15, 0
	movaps	xmm4, xmm15
	punpckhqdq xmm2, xmm0
	pshufd	xmm0, xmm3, 0
	movaps	xmm7, [.sse2_top64bitmask]
	pshufd	xmm6, xmm3, 0xaa
	movaps	[rsp], xmm2
	pshufd	xmm5, xmm15, 0xaa
	pslld	xmm6, 1
	movaps	xmm2, [.sse2_top64bitmask]
	pslld	xmm5, 1
	pand	xmm2, xmm0
	paddq	xmm2, xmm0
	movaps	xmm0, [.sse2_top64bitmask]
	pshufd	xmm8, xmm2, 0xe6
	pmuludq	xmm2, xmm3
	movaps	xmm3, xmm12
	pand	xmm0, xmm1
	pmuludq	xmm3, xmm8
	paddq	xmm0, xmm1
	movaps	xmm1, xmm12
	pshufd	xmm13, xmm0, 0xe6
	pmuludq	xmm0, xmm15
	paddq	xmm3, xmm0
	pslldq	xmm1, 0x8
	punpckhqdq xmm4, xmm1
	movaps	xmm0, xmm15
	pshufd	xmm1, xmm12, 0
	pslld	xmm0, 1
	movaps	[rsp+0xb0], xmm0
	
	movaps	xmm0, xmm10
	pand	xmm7, xmm1
	movaps	[rsp+0x10], xmm4
	pmuludq	xmm0, xmm8
	movaps	xmm4, xmm10
	movaps	xmm11, xmm7
	movaps	xmm7, xmm12
	paddq	xmm11, xmm1
	movaps	xmm1, xmm10
	pslldq	xmm1, 0x8
	punpckhqdq xmm7, xmm1
	movaps	xmm1, xmm7
	pmuludq	xmm1, [.packednineteen]
	pshufd	xmm9, xmm1, 0x54
	pshufd	xmm1, xmm1, 0xee
	movaps	[rsp+0x20], xmm9
	movaps	xmm9, xmm10
	movaps	[rsp+0x30], xmm1
	movaps	xmm1, xmm14
	pslldq	xmm1, 0x8
	punpckhqdq xmm9, xmm1
	pxor	xmm1, xmm1
	movaps	[rsp+0x40], xmm9
	pshufd	xmm9, xmm10, 0xaa
	punpcklqdq xmm4, xmm1
	pmuludq	xmm9, [.packed3819]
	pshufd	xmm1, xmm9, 0xfe
	movaps	[rsp+0x50], xmm4
	pxor	xmm4, xmm4
	movaps	[rsp+0x60], xmm1
	movaps	xmm1, xmm14
	punpcklqdq xmm1, xmm4
	movaps	[rsp+0x70], xmm1
	pshufd	xmm1, xmm14, 0
	movaps	xmm4, xmm1
	pshufd	xmm1, xmm14, 0xfe
	pmuludq	xmm4, [.packednineteen]
	movaps	[rsp+0x80], xmm4
	
	pshufd	xmm4, xmm14, 0xaa
	pslld	xmm1, 1
	movaps	[rsp+0x90], xmm1
	
	pmuludq	xmm4, [.packed3819]
	pshufd	xmm1, xmm4, 0xaa
	movaps	[rsp+0xa0], xmm1
	
	movaps	xmm1, xmm15
	movaps	xmm15, xmm12
	pmuludq	xmm1, xmm8
	pmuludq	xmm8, xmm14
	movaps	xmm14, xmm10
	pmuludq	xmm15, xmm13
	pmuludq	xmm13, xmm10
	movaps	xmm10, xmm11
	paddq	xmm8, xmm13
	movaps	xmm11, [rsp+0x10]
	paddq	xmm0, xmm15
	pmuludq	xmm10, xmm12
	paddq	xmm8, xmm10
	pslld	xmm12, 1
	movaps	xmm10, [.sse2_bot64bitmask]
	pslld	xmm14, 1
	pand	xmm10, xmm6
	movaps	xmm15, [rsp]
	paddq	xmm10, xmm6
	pmuludq	xmm6, xmm15
	paddq	xmm1, xmm6
	movaps	xmm6, xmm11
	movaps	xmm13, [.sse2_bot64bitmask]
	pmuludq	xmm6, xmm10
	paddq	xmm3, xmm6
	movaps	xmm6, xmm7
	pand	xmm13, xmm5
	pmuludq	xmm6, xmm10
	paddq	xmm0, xmm6
	movaps	xmm6, xmm15
	movaps	xmm15, [rsp+0xb0]
	
	paddq	xmm13, xmm5
	pslld	xmm6, 1
	pmuludq	xmm5, xmm11
	pmuludq	xmm6, xmm4
	paddq	xmm0, xmm5
	pslld	xmm11, 1
	movaps	xmm5, [rsp+0x40]
	paddq	xmm2, xmm6
	pmuludq	xmm13, xmm7
	movaps	xmm6, xmm11
	pslld	xmm7, 1
	pmuludq	xmm11, xmm4
	pmuludq	xmm10, xmm5
	paddq	xmm8, xmm10
	paddq	xmm8, xmm13
	pmuludq	xmm6, xmm9
	movaps	xmm13, [rsp+0x30]
	paddq	xmm2, xmm6
	pmuludq	xmm9, xmm7
	pslld	xmm5, 1
	paddq	xmm1, xmm11
	movaps	xmm6, [rsp+0x20]
	paddq	xmm1, xmm9
	pmuludq	xmm6, xmm7
	pmuludq	xmm7, xmm4
	paddq	xmm2, xmm6
	paddq	xmm3, xmm7
	movaps	xmm6, xmm13
	pmuludq	xmm4, xmm5
	movaps	xmm7, [rsp+0x80]
	
	paddq	xmm0, xmm4
	pmuludq	xmm6, xmm12
	pmuludq	xmm15, xmm7
	paddq	xmm2, xmm15
	paddq	xmm2, xmm6
	movaps	xmm6, [rsp+0x50]
	pmuludq	xmm12, xmm7
	paddq	xmm1, xmm12
	movaps	xmm12, xmm2
	pmuludq	xmm14, xmm7
	pmuludq	xmm6, xmm13
	movaps	xmm4, [rsp+0x70]
	paddq	xmm1, xmm6
	movaps	xmm10, xmm1
	movaps	xmm6, [rsp+0x60]
	pmuludq	xmm4, xmm7
	paddq	xmm0, xmm4
	punpcklqdq xmm10, xmm0
	pmuludq	xmm6, xmm5
	movaps	xmm4, [rsp+0x90]
	
	paddq	xmm3, xmm6
	paddq	xmm3, xmm14
	punpcklqdq xmm12, xmm3
	punpckhqdq xmm2, xmm3
	pmuludq	xmm4, [rsp+0xa0]
	
	paddq	xmm8, xmm4
	punpckhqdq xmm1, xmm0
	; Carry propagation: carries are extracted with psrlq by 0x1a / 0x19 and
	; added to the next lane, low bits are kept with .packedmask26 /
	; .packedmask25, and the carry out of the top lane is multiplied by
	; .packednineteen before being added back into xmm12.
	movaps	xmm14, xmm8
	movaps	xmm0, xmm10
	pand	xmm10, [.packedmask26]
	movaps	xmm3, xmm12
	psrlq	xmm0, 0x1a
	paddq	xmm1, xmm0
	movaps	xmm0, xmm1
	punpcklqdq xmm14, xmm8
	psrlq	xmm3, 0x1a
	paddq	xmm2, xmm3
	movaps	xmm3, xmm2
	punpckhqdq xmm8, xmm8
	psrlq	xmm0, 0x19
	pand	xmm12, [.packedmask26]
	psrlq	xmm3, 0x19
	paddq	xmm14, xmm0
	paddq	xmm10, xmm3
	movaps	xmm3, xmm0
	movaps	xmm0, xmm14
	pand	xmm2, [.packedmask25]
	pslldq	xmm3, 0x8
	psrlq	xmm0, 0x1a
	paddq	xmm8, xmm0
	movaps	xmm0, xmm8
	pand	xmm1, [.packedmask25]
	psrlq	xmm0, 0x19
	pmuludq	xmm0, [.packednineteen]
	punpckhqdq xmm0, xmm3
	pand	xmm14, [.packedmask26]
	paddq	xmm12, xmm0
	movaps	xmm0, xmm10
	movaps	xmm3, xmm12
	pand	xmm10, [.packedmask26]
	psrlq	xmm0, 0x1a
	paddq	xmm1, xmm0
	pand	xmm12, [.packedmask26]
	psrlq	xmm3, 0x1a
	paddq	xmm2, xmm3
	pand	xmm8, [.packedmask25]
	; Re-form the loop-carried registers for the next pass.
	movaps	xmm15, xmm10
	punpckhqdq xmm10, xmm1
	movaps	xmm3, xmm12
	punpcklqdq xmm15, xmm1
	punpckhqdq xmm12, xmm2
	punpcklqdq xmm3, xmm2
	punpckhqdq xmm14, xmm8
	; Every instruction since the sub eax, 1 at the top is an SSE2 op that
	; leaves EFLAGS untouched, so this branches on that decrement.
	jne	.squaretimes20
	; .squaretimes20 has finished: repack the five working registers into three
	; 16-byte rows (pshufd 0xf8 / 0x8f, merged with por) and store them at
	; [rsp+0x460], [rsp+0x470], [rsp+0x480]. The lea/mov instructions
	; interleaved with the shuffles load the arguments of the first call below.
	; NOTE(review): the "dest = a * b" annotations below read the helpers as
	; mul(rdi = dest, rsi, rdx = operands) and
	; square_times(rdi = dest, rsi = src, edx = repeat count), inferred from the
	; argument set-up and buffer reuse -- confirm against the helper definitions.
	pshufd	xmm0, xmm12, 0xf8
	lea	rsi, [rsp+0x460]
	
	pshufd	xmm15, xmm15, 0x8f
	lea	rdx, [rsp+0x490]
	
	pshufd	xmm1, xmm10, 0x8f
	mov	rdi, rsi
	pshufd	xmm2, xmm3, 0xf8
	por	xmm1, xmm0
	por	xmm2, xmm15
	pshufd	xmm0, xmm14, 0xf8
	movaps	[rsp+0x470], xmm1
	movaps	[rsp+0x460], xmm2
	movaps	[rsp+0x480], xmm0
	call	curve25519$mul			; [0x460] = [0x460] * [0x490]
	lea	rsi, [rsp+0x460]
	mov	rdi, rsi
	mov	edx, 10
	call	curve25519$square_times		; [0x460] squared 10 times, in place
	lea	rdx, [rsp+0x430]
	lea	rsi, [rsp+0x460]
	mov	rdi, rdx
	call	curve25519$mul			; [0x430] = [0x460] * [0x430]
	lea	rsi, [rsp+0x430]
	lea	rdi, [rsp+0x460]
	mov	edx, 50
	call	curve25519$square_times		; [0x460] = [0x430] squared 50 times
	lea	rdx, [rsp+0x430]
	lea	rsi, [rsp+0x460]
	lea	rdi, [rsp+0x490]
	call	curve25519$mul			; [0x490] = [0x460] * [0x430]
	; Load the 48 bytes at [rsp+0x490] into the loop-carried registers of the
	; next inline squaring loop (xmm3, xmm15, xmm12, xmm10, xmm14).
	; pshufd 0xd8 selects dwords (0,2,1,3); pshufd 0xfa selects dwords (2,2,3,3).
	movaps	xmm0, [rsp+0x490]
	
	mov	eax, 100			; pass count for .squaretimes100
	pshufd	xmm9, [rsp+0x4b0], 0xd8
	
	pshufd	xmm2, xmm0, 0xd8
	pshufd	xmm15, xmm0, 0xfa
	movaps	xmm0, [rsp+0x4a0]
	
	movaps	xmm14, xmm9
	pshufd	xmm1, xmm0, 0xd8
	movaps	xmm3, xmm2
	pshufd	xmm0, xmm0, 0xfa
	movaps	xmm12, xmm1
	movaps	xmm10, xmm0
	
calign
	; Inline squaring loop, entered with eax = 100 (one hundred passes).
	; Loop-carried state: xmm3, xmm15, xmm12, xmm10, xmm14 hold the operand on
	; entry to a pass and the carried/masked result at the end of it; every
	; product operand in the body is derived from these five registers.
	; [rsp] .. [rsp+0xbf] is scratch: re-arranged, doubled (pslld 1) and
	; constant-scaled (.packednineteen / .packed3819) copies of the operand.
	; 32x32->64 products (pmuludq) are summed with paddq in xmm0-xmm3 and xmm8.
.squaretimes100:
	movaps	xmm0, xmm15
	sub	eax, 1				; ZF from here is consumed by the jne at the loop bottom
	movaps	xmm2, xmm3
	pslldq	xmm0, 0x8
	pshufd	xmm1, xmm15, 0
	movaps	xmm4, xmm15
	punpckhqdq xmm2, xmm0
	pshufd	xmm0, xmm3, 0
	movaps	xmm7, [.sse2_top64bitmask]
	pshufd	xmm6, xmm3, 0xaa
	movaps	[rsp], xmm2
	pshufd	xmm5, xmm15, 0xaa
	pslld	xmm6, 1
	movaps	xmm2, [.sse2_top64bitmask]
	pslld	xmm5, 1
	pand	xmm2, xmm0
	paddq	xmm2, xmm0
	movaps	xmm0, [.sse2_top64bitmask]
	pshufd	xmm8, xmm2, 0xe6
	pmuludq	xmm2, xmm3
	movaps	xmm3, xmm12
	pand	xmm0, xmm1
	pmuludq	xmm3, xmm8
	paddq	xmm0, xmm1
	movaps	xmm1, xmm12
	pshufd	xmm13, xmm0, 0xe6
	pmuludq	xmm0, xmm15
	paddq	xmm3, xmm0
	pslldq	xmm1, 0x8
	punpckhqdq xmm4, xmm1
	movaps	xmm0, xmm15
	pshufd	xmm1, xmm12, 0
	pslld	xmm0, 1
	movaps	[rsp+0xb0], xmm0
	
	movaps	xmm0, xmm10
	pand	xmm7, xmm1
	movaps	[rsp+0x10], xmm4
	pmuludq	xmm0, xmm8
	movaps	xmm4, xmm10
	movaps	xmm11, xmm7
	movaps	xmm7, xmm12
	paddq	xmm11, xmm1
	movaps	xmm1, xmm10
	pslldq	xmm1, 0x8
	punpckhqdq xmm7, xmm1
	movaps	xmm1, xmm7
	pmuludq	xmm1, [.packednineteen]
	pshufd	xmm9, xmm1, 0x54
	pshufd	xmm1, xmm1, 0xee
	movaps	[rsp+0x20], xmm9
	movaps	xmm9, xmm10
	movaps	[rsp+0x30], xmm1
	movaps	xmm1, xmm14
	pslldq	xmm1, 0x8
	punpckhqdq xmm9, xmm1
	pxor	xmm1, xmm1
	movaps	[rsp+0x40], xmm9
	pshufd	xmm9, xmm10, 0xaa
	punpcklqdq xmm4, xmm1
	pmuludq	xmm9, [.packed3819]
	pshufd	xmm1, xmm9, 0xfe
	movaps	[rsp+0x50], xmm4
	pxor	xmm4, xmm4
	movaps	[rsp+0x60], xmm1
	movaps	xmm1, xmm14
	punpcklqdq xmm1, xmm4
	movaps	[rsp+0x70], xmm1
	pshufd	xmm1, xmm14, 0
	movaps	xmm4, xmm1
	pshufd	xmm1, xmm14, 0xfe
	pmuludq	xmm4, [.packednineteen]
	movaps	[rsp+0x80], xmm4
	
	pshufd	xmm4, xmm14, 0xaa
	pslld	xmm1, 1
	movaps	[rsp+0x90], xmm1
	
	pmuludq	xmm4, [.packed3819]
	pshufd	xmm1, xmm4, 0xaa
	movaps	[rsp+0xa0], xmm1
	
	movaps	xmm1, xmm15
	movaps	xmm15, xmm12
	pmuludq	xmm1, xmm8
	pmuludq	xmm8, xmm14
	movaps	xmm14, xmm10
	pmuludq	xmm15, xmm13
	pmuludq	xmm13, xmm10
	movaps	xmm10, xmm11
	paddq	xmm8, xmm13
	movaps	xmm11, [rsp+0x10]
	paddq	xmm0, xmm15
	pmuludq	xmm10, xmm12
	paddq	xmm8, xmm10
	pslld	xmm12, 1
	movaps	xmm10, [.sse2_bot64bitmask]
	pslld	xmm14, 1
	pand	xmm10, xmm6
	movaps	xmm15, [rsp]
	paddq	xmm10, xmm6
	pmuludq	xmm6, xmm15
	paddq	xmm1, xmm6
	movaps	xmm6, xmm11
	movaps	xmm13, [.sse2_bot64bitmask]
	pmuludq	xmm6, xmm10
	paddq	xmm3, xmm6
	movaps	xmm6, xmm7
	pand	xmm13, xmm5
	pmuludq	xmm6, xmm10
	paddq	xmm0, xmm6
	movaps	xmm6, xmm15
	movaps	xmm15, [rsp+0xb0]
	
	paddq	xmm13, xmm5
	pslld	xmm6, 1
	pmuludq	xmm5, xmm11
	pmuludq	xmm6, xmm4
	paddq	xmm0, xmm5
	pslld	xmm11, 1
	movaps	xmm5, [rsp+0x40]
	paddq	xmm2, xmm6
	pmuludq	xmm13, xmm7
	movaps	xmm6, xmm11
	pslld	xmm7, 1
	pmuludq	xmm11, xmm4
	pmuludq	xmm10, xmm5
	paddq	xmm8, xmm10
	paddq	xmm8, xmm13
	pmuludq	xmm6, xmm9
	movaps	xmm13, [rsp+0x30]
	paddq	xmm2, xmm6
	pmuludq	xmm9, xmm7
	pslld	xmm5, 1
	paddq	xmm1, xmm11
	movaps	xmm6, [rsp+0x20]
	paddq	xmm1, xmm9
	pmuludq	xmm6, xmm7
	pmuludq	xmm7, xmm4
	paddq	xmm2, xmm6
	paddq	xmm3, xmm7
	movaps	xmm6, xmm13
	pmuludq	xmm4, xmm5
	movaps	xmm7, [rsp+0x80]
	
	paddq	xmm0, xmm4
	pmuludq	xmm6, xmm12
	pmuludq	xmm15, xmm7
	paddq	xmm2, xmm15
	paddq	xmm2, xmm6
	movaps	xmm6, [rsp+0x50]
	pmuludq	xmm12, xmm7
	paddq	xmm1, xmm12
	movaps	xmm12, xmm2
	pmuludq	xmm14, xmm7
	pmuludq	xmm6, xmm13
	movaps	xmm4, [rsp+0x70]
	paddq	xmm1, xmm6
	movaps	xmm10, xmm1
	movaps	xmm6, [rsp+0x60]
	pmuludq	xmm4, xmm7
	paddq	xmm0, xmm4
	punpcklqdq xmm10, xmm0
	pmuludq	xmm6, xmm5
	movaps	xmm4, [rsp+0x90]
	
	paddq	xmm3, xmm6
	paddq	xmm3, xmm14
	punpcklqdq xmm12, xmm3
	punpckhqdq xmm2, xmm3
	pmuludq	xmm4, [rsp+0xa0]
	
	paddq	xmm8, xmm4
	punpckhqdq xmm1, xmm0
	; Carry propagation: carries are extracted with psrlq by 0x1a / 0x19 and
	; added to the next lane, low bits are kept with .packedmask26 /
	; .packedmask25, and the carry out of the top lane is multiplied by
	; .packednineteen before being added back into xmm12.
	movaps	xmm14, xmm8
	movaps	xmm0, xmm10
	pand	xmm10, [.packedmask26]
	movaps	xmm3, xmm12
	psrlq	xmm0, 0x1a
	paddq	xmm1, xmm0
	movaps	xmm0, xmm1
	punpcklqdq xmm14, xmm8
	psrlq	xmm3, 0x1a
	paddq	xmm2, xmm3
	movaps	xmm3, xmm2
	punpckhqdq xmm8, xmm8
	psrlq	xmm0, 0x19
	pand	xmm12, [.packedmask26]
	psrlq	xmm3, 0x19
	paddq	xmm14, xmm0
	paddq	xmm10, xmm3
	movaps	xmm3, xmm0
	movaps	xmm0, xmm14
	pand	xmm2, [.packedmask25]
	pslldq	xmm3, 0x8
	psrlq	xmm0, 0x1a
	paddq	xmm8, xmm0
	movaps	xmm0, xmm8
	pand	xmm1, [.packedmask25]
	psrlq	xmm0, 0x19
	pmuludq	xmm0, [.packednineteen]
	punpckhqdq xmm0, xmm3
	pand	xmm14, [.packedmask26]
	paddq	xmm12, xmm0
	movaps	xmm0, xmm10
	movaps	xmm3, xmm12
	pand	xmm10, [.packedmask26]
	psrlq	xmm0, 0x1a
	paddq	xmm1, xmm0
	pand	xmm12, [.packedmask26]
	psrlq	xmm3, 0x1a
	paddq	xmm2, xmm3
	pand	xmm8, [.packedmask25]
	; Re-form the loop-carried registers for the next pass.
	movaps	xmm15, xmm10
	punpckhqdq xmm10, xmm1
	movaps	xmm3, xmm12
	punpcklqdq xmm15, xmm1
	punpckhqdq xmm12, xmm2
	punpcklqdq xmm3, xmm2
	punpckhqdq xmm14, xmm8
	; Every instruction since the sub eax, 1 at the top is an SSE2 op that
	; leaves EFLAGS untouched, so this branches on that decrement.
	jne	.squaretimes100
	; .squaretimes100 has finished: repack the five working registers into three
	; 16-byte rows (pshufd 0xf8 / 0x8f, merged with por) and store them at
	; [rsp+0x460], [rsp+0x470], [rsp+0x480]. The lea/mov instructions
	; interleaved with the shuffles load the arguments of the first call below.
	; NOTE(review): the "dest = a * b" annotations below read the helpers as
	; mul(rdi = dest, rsi, rdx = operands) and
	; square_times(rdi = dest, rsi = src, edx = repeat count), inferred from the
	; argument set-up and buffer reuse -- confirm against the helper definitions.
	pshufd	xmm0, xmm12, 0xf8
	lea	rsi, [rsp+0x460]
	
	pshufd	xmm15, xmm15, 0x8f
	lea	rdx, [rsp+0x490]
	
	pshufd	xmm1, xmm10, 0x8f
	mov	rdi, rsi
	pshufd	xmm2, xmm3, 0xf8
	por	xmm1, xmm0
	por	xmm2, xmm15
	pshufd	xmm0, xmm14, 0xf8
	movaps	[rsp+0x470], xmm1
	movaps	[rsp+0x460], xmm2
	movaps	[rsp+0x480], xmm0
	call	curve25519$mul			; [0x460] = [0x460] * [0x490]
	lea	rsi, [rsp+0x460]
	mov	rdi, rsi
	mov	edx, 50
	call	curve25519$square_times		; [0x460] squared 50 times, in place
	lea	rdx, [rsp+0x430]
	lea	rsi, [rsp+0x460]
	mov	rdi, rdx
	call	curve25519$mul			; [0x430] = [0x460] * [0x430]
	lea	rsi, [rsp+0x430]
	mov	rdi, rsi
	mov	edx, 5
	call	curve25519$square_times		; [0x430] squared 5 times, in place
	lea	rdx, [rsp+0x3d0]
	lea	rsi, [rsp+0x430]
	lea	rdi, [rsp+0x460]
	call	curve25519$mul			; [0x460] = [0x430] * [0x3d0]
	; Last multiply: result goes to [rsp+0x370]; one operand is [rsp+0x460], the
	; other is whatever rbx points at (rbx is set outside this excerpt --
	; presumably the x result stored at [rsp+0x310]; TODO confirm).
	lea	rdx, [rsp+0x460]
	lea	rdi, [rsp+0x370]
	mov	rsi, rbx
	call	curve25519$mul
	; Scalar normalisation of the ten 32-bit limbs at [rsp+0x370 .. rsp+0x397]
	; followed by packing into the buffer addressed by rbp (rbp is set outside
	; this excerpt; the sequence continues past the last line shown here).
	; The three rows are copied to [rsp+0x490 ..] via xmm2 and the limbs are read
	; back as dwords. Limbs alternate 26 and 25 bits: masks 0x3ffffff /
	; 0x1ffffff, carry shifts 0x1a / 0x19.
	; Pass 1: ripple carries from limb 0 up to limb 9.
	mov	r11d, [rsp+0x370]
	
	mov	edx, [rsp+0x380]
	movaps	xmm2, [rsp+0x370]
	
	mov	eax, [rsp+0x390]
	mov	r9d, r11d
	and	r11d, 0x3ffffff
	movaps	[rsp+0x490], xmm2
	
	shr	r9d, 0x1a
	add	r9d, [rsp+0x494]
	
	movaps	xmm2, [rsp+0x380]
	
	mov	r8d, r9d
	and	r9d, 0x1ffffff
	shr	r8d, 0x19
	add	r8d, [rsp+0x498]
	
	movaps	[rsp+0x4a0], xmm2
	
	mov	edi, r8d
	and	r8d, 0x3ffffff
	shr	edi, 0x1a
	add	edi, [rsp+0x49c]
	movaps	xmm2, [rsp+0x390]
	
	movaps	[rsp+0x4b0], xmm2
	
	mov	r14d, edi
	shr	r14d, 0x19
	add	r14d, edx
	mov	esi, r14d
	shr	esi, 0x1a
	add	esi, [rsp+0x4a4]
	mov	ecx, esi
	shr	ecx, 0x19
	add	ecx, [rsp+0x4a8]
	mov	edx, ecx
	shr	edx, 0x1a
	add	edx, [rsp+0x4ac]
	mov	r12d, edx
	shr	r12d, 0x19
	add	r12d, eax
	mov	eax, r12d
	shr	eax, 0x1a
	add	eax, [rsp+0x4b4]
	mov	r10d, eax
	shr	r10d, 0x19
	; Carry out of limb 9 wraps to limb 0 multiplied by 19:
	; ebx = r10*9, then ebx = r10 + 2*ebx = 19*r10.
	lea	ebx, [r10+r10*8]
	lea	ebx, [r10+rbx*2]
	add	ebx, r11d
	; Pass 2: same ripple over the masked limbs (masking is interleaved).
	mov	r11d, ebx
	shr	r11d, 0x1a
	add	r11d, r9d
	mov	r10d, r11d
	shr	r10d, 0x19
	add	r10d, r8d
	mov	r13d, r10d
	shr	r13d, 0x1a
	and	edi, 0x1ffffff
	and	r14d, 0x3ffffff
	add	r13d, edi
	and	esi, 0x1ffffff
	and	ecx, 0x3ffffff
	mov	r9d, r13d
	and	edx, 0x1ffffff
	and	r12d, 0x3ffffff
	shr	r9d, 0x19
	and	eax, 0x1ffffff
	and	ebx, 0x3ffffff
	add	r9d, r14d
	and	r11d, 0x1ffffff
	and	r10d, 0x3ffffff
	mov	r14d, r9d
	and	r13d, 0x1ffffff
	shr	r14d, 0x1a
	add	r14d, esi
	mov	r8d, r14d
	shr	r8d, 0x19
	add	r8d, ecx
	mov	esi, r8d
	shr	esi, 0x1a
	add	esi, edx
	mov	edx, esi
	shr	edx, 0x19
	add	r12d, edx
	mov	r15d, r12d
	shr	r15d, 0x1a
	add	r15d, eax
	mov	eax, r15d
	shr	eax, 0x19
	; 19 * top carry again (rax*9, then rax + 2*that), and limb 0 additionally
	; gets +0x13 (19) before pass 3.
	lea	edx, [rax*8]
	add	edx, eax
	lea	eax, [rax+rdx*2]
	lea	edi, [rbx+rax+0x13]
	; Pass 3: ripple again after the +19 offset.
	mov	eax, edi
	shr	eax, 0x1a
	add	eax, r11d
	mov	edx, eax
	shr	edx, 0x19
	add	r10d, edx
	mov	r11d, r10d
	shr	r11d, 0x1a
	add	r11d, r13d
	mov	ebx, r11d
	shr	ebx, 0x19
	and	r9d, 0x3ffffff
	and	r14d, 0x1ffffff
	add	ebx, r9d
	and	r8d, 0x3ffffff
	and	esi, 0x1ffffff
	mov	ecx, ebx
	and	r12d, 0x3ffffff
	and	r15d, 0x1ffffff
	shr	ecx, 0x1a
	and	edi, 0x3ffffff
	and	eax, 0x1ffffff
	add	ecx, r14d
	and	r10d, 0x3ffffff
	and	r11d, 0x1ffffff
	mov	edx, ecx
	and	ebx, 0x3ffffff
	and	ecx, 0x1ffffff
	shr	edx, 0x19
	add	edx, r8d
	mov	r13d, edx
	shr	r13d, 0x1a
	add	r13d, esi
	mov	esi, r13d
	shr	esi, 0x19
	add	r12d, esi
	mov	esi, r12d
	shr	esi, 0x1a
	add	esi, r15d
	mov	r8d, esi
	shr	r8d, 0x19
	; 19 * top carry once more; limb 0 also gets +0x3ffffed (= 2^26 - 19).
	lea	r9d, [r8*8]
	
	add	r9d, r8d
	lea	r8d, [r8+r9*2]
	lea	r9d, [rdi+r8+0x3ffffed]
	
	; Final pass: every following limb gets its carry plus 0x1ffffff (2^25 - 1)
	; or 0x3ffffff (2^26 - 1) before being masked.
	; NOTE(review): in the upstream donna contract routine this +19 / +(2^26-19)
	; / +(2^k-1) sequence is the final reduction below 2^255 - 19 -- confirm.
	mov	edi, r9d
	shr	edi, 0x1a
	lea	eax, [rax+rdi+0x1ffffff]
	mov	edi, eax
	and	eax, 0x1ffffff
	shr	edi, 0x19
	lea	edi, [r10+rdi+0x3ffffff]
	
	mov	r10d, eax
	mov	eax, edi
	and	edi, 0x3ffffff
	shr	eax, 0x1a
	lea	eax, [r11+rax+0x1ffffff]
	
	mov	r11d, edi
	mov	edi, eax
	and	eax, 0x1ffffff
	shr	edi, 0x19
	lea	ebx, [rbx+rdi+0x3ffffff]
	mov	edi, ebx
	shr	edi, 0x1a
	and	edx, 0x3ffffff
	and	r13d, 0x1ffffff
	lea	r8d, [rcx+rdi+0x1ffffff]
	
	and	r12d, 0x3ffffff
	and	r9d, 0x3ffffff
	shl	ebx, 0x6
	and	esi, 0x1ffffff
	mov	ecx, r8d
	and	r8d, 0x1ffffff
	shr	ecx, 0x19
	lea	edx, [rdx+rcx+0x3ffffff]
	mov	ecx, edx
	and	edx, 0x3ffffff
	shr	ecx, 0x1a
	lea	edi, [r13+rcx+0x1ffffff]
	
	mov	r13d, edx
	mov	edx, edi
	and	edi, 0x1ffffff
	shr	edx, 0x19
	lea	ecx, [r12+rdx+0x3ffffff]
	
	; Pack the limbs into consecutive output dwords at [rbp]: each dword ORs the
	; remaining high bits of one limb with the next limb shifted left into place.
	mov	r12d, r10d
	shr	r10d, 0x6
	shl	r12d, 0x1a
	or	r12d, r9d
	mov	r9d, r11d
	mov	edx, ecx
	shl	r9d, 0x13
	and	edx, 0x3ffffff
	shr	ecx, 0x1a
	or	r9d, r10d
	shr	r11d, 0xd
	mov	[rbp], r12d			; limb0 | limb1 << 26
	mov	[rbp+0x4], r9d			; limb1 >> 6 | limb2 << 19
	mov	r9d, eax
	shr	eax, 0x13
	or	ebx, eax
	mov	eax, r13d
	shr	r13d, 0x7
	shl	eax, 0x19
	shl	r9d, 0xd
	mov	[rbp+0xc], ebx			; limb3 >> 19 | limb4 << 6
	or	eax, r8d
	or	r9d, r11d
	mov	[rbp+0x10], eax			; limb5 | limb6 << 25
	mov	eax, edi
	shr	edi, 0xd
	shl	eax, 0x13
	mov	[rbp+0x8], r9d			; limb2 >> 13 | limb3 << 13
	or	eax, r13d
	mov	[rbp+0x14], eax			; limb6 >> 7 | limb7 << 19
	mov	eax, edx
	shl	eax, 0xc
	or	eax, edi
	mov	[rbp+0x18], eax			; limb7 >> 13 | limb8 << 12
	lea	eax, [rsi+rcx+0x1ffffff]
	and	eax, 0x1ffffff
	shr	edx, 0x14
	shl	eax, 6
	or	eax, edx
	mov	[rbp+0x1c], eax

	; finish:
	mov	rcx, [rsp+0x4c8]
	add	rsp, rcx
	pop	r15 r14 r13 r12 rbx rbp
	epilog

end if


if used curve25519$square_times | defined include_everything
	; three arguments: rdi == r, rsi == in, edx == count
	;
	; Going by the name and by the loop below (every product is formed from
	; limbs of one and the same value), this squares the field element at
	; rsi `count` times in a row and stores the final value at rdi.
	;
	; rsi is read and rdi is written as 3 x 16 bytes with movaps/pshufd, so
	; both must be 16 byte aligned and 0x30 bytes long.  rdi == rsi is fine:
	; the input is fully loaded before the loop starts and the output is
	; only stored after the loop has finished.
	;
	; count must be >= 1: the counter is decremented before it is tested, so
	; count == 0 would wrap around and run 2^32 iterations.
	;
	; clobbers rax, rcx, xmm0-xmm15 and flags.  There is no call instruction
	; in here (prolog/epilog macros aside).  Scratch slots are addressed from
	; [rsp-0x78] up to [rsp+0x38], i.e. most of them live BELOW rsp.
	; NOTE(review): that relies on a 128 byte red zone being available in
	; the environment this runs in -- confirm before reusing elsewhere.
falign
curve25519$square_times:
	prolog	curve25519$square_times
	; reserve 0x60 bytes, then lower rsp by a further (rsp & 0xf) + 8 so
	; that rsp == 8 (mod 16).  Every scratch offset used below is also
	; 8 (mod 16), which makes each movaps slot 16 byte aligned.  The extra
	; amount is kept at [rsp+0x50] so it can be undone on the way out.
	sub	rsp, 0x60
	mov	rax, rsp
	and	rax, 0xf
	add	rax, 8
	sub	rsp, rax
	mov	[rsp+0x50], rax
	; eax == remaining iteration count
	mov	eax,edx
	; load the 0x30 byte input and shuffle it into the five loop-carried
	; registers: xmm3, xmm15, xmm12, xmm10 and xmm14
	movaps	xmm15, [rsi]
	pshufd	xmm9, [rsi+0x20], 0xd8
	movaps	xmm0, [rsi+0x10]
	pshufd	xmm2, xmm15, 0xd8
	movaps	xmm14, xmm9
	pshufd	xmm1, xmm0, 0xd8
	pshufd	xmm0, xmm0, 0xfa
	pshufd	xmm15, xmm15, 0xfa
	movaps	xmm3, xmm2
	movaps	xmm12, xmm1
	movaps	xmm10, xmm0
calign
.square_times_loop:
	movaps	xmm0, xmm15
	; the flags produced by this sub are consumed by the jne at the very
	; bottom of the loop; everything in between is SSE2, which does not
	; touch EFLAGS.  Do not insert flag-writing instructions in the body.
	sub	eax, 1
	; first part of the body: build the multiplier operands.  The recurring
	; "pand topmask/botmask + paddq" pair computes x + (x AND mask), which
	; leaves one 64 bit lane as-is and doubles the other one.  Further
	; copies are pre-multiplied by .packednineteen / .packed3819 and
	; spilled to the stack scratch slots.
	movaps	xmm5, xmm3
	pslldq	xmm0, 0x8
	movaps	xmm2, [.sse2_top64bitmask]
	pshufd	xmm1, xmm15, 0
	punpckhqdq xmm5, xmm0
	pshufd	xmm0, xmm3, 0
	movaps	xmm7, xmm15
	movaps	xmm4, [.sse2_top64bitmask]
	pxor	xmm9, xmm9
	pand	xmm2, xmm0
	pshufd	xmm6, xmm3, 0xaa
	movaps	[rsp-0x78], xmm5
	paddq	xmm2, xmm0
	pshufd	xmm8, xmm2, 0xe6
	pmuludq xmm2, xmm3
	movaps	xmm3, xmm12
	pshufd	xmm5, xmm15, 0xaa
	pslld	xmm6, 1
	movaps	xmm0, [.sse2_top64bitmask]
	pmuludq xmm3, xmm8
	pslld	xmm5, 1
	pand	xmm0, xmm1
	paddq	xmm0, xmm1
	movaps	xmm1, xmm12
	pshufd	xmm13, xmm0, 0xe6
	pmuludq xmm0, xmm15
	paddq	xmm3, xmm0
	pslldq	xmm1, 0x8
	punpckhqdq xmm7, xmm1
	movaps	xmm0, xmm15
	pshufd	xmm1, xmm12, 0
	pslld	xmm0, 1
	movaps	[rsp+0x38], xmm0
	movaps	xmm0, xmm10
	pand	xmm4, xmm1
	movaps	[rsp-0x68], xmm7
	pmuludq xmm0, xmm8
	movaps	xmm7, xmm12
	movaps	xmm11, xmm4
	paddq	xmm11, xmm1
	movaps	xmm1, xmm10
	pslldq	xmm1, 0x8
	punpckhqdq xmm7, xmm1
	movaps	xmm1, xmm7
	pmuludq xmm1, [.packednineteen]
	pshufd	xmm4, xmm1, 0x54
	movaps	[rsp-0x58], xmm4
	pshufd	xmm4, xmm1, 0xee
	movaps	xmm1, xmm14
	movaps	[rsp-0x48], xmm4
	pslldq	xmm1, 0x8
	movaps	xmm4, xmm10
	punpckhqdq xmm4, xmm1
	movaps	xmm1, xmm10
	punpcklqdq xmm1, xmm9
	pshufd	xmm9, xmm10, 0xaa
	movaps	[rsp-0x38], xmm4
	pxor	xmm4, xmm4
	pmuludq xmm9, [.packed3819]
	movaps	[rsp-0x28], xmm1
	pshufd	xmm1, xmm9, 0xfe
	movaps	[rsp-0x18], xmm1
	movaps	xmm1, xmm14
	punpcklqdq xmm1, xmm4
	movaps	[rsp-0x8], xmm1
	pshufd	xmm1, xmm14, 0
	movaps	xmm4, xmm1
	pshufd	xmm1, xmm14, 0xfe
	pmuludq xmm4, [.packednineteen]
	movaps	[rsp+0x8], xmm4
	pshufd	xmm4, xmm14, 0xaa
	pslld	xmm1, 1
	movaps	[rsp+0x18], xmm1
	pmuludq xmm4, [.packed3819]
	pshufd	xmm1, xmm4, 0xaa
	movaps	[rsp+0x28], xmm1
	; second part: 32x32->64 bit partial products (pmuludq) are summed
	; (paddq) into the accumulators xmm0-xmm3 and xmm8
	movaps	xmm1, xmm15
	movaps	xmm15, xmm12
	pmuludq xmm1, xmm8
	pmuludq xmm8, xmm14
	movaps	xmm14, xmm10
	pmuludq xmm15, xmm13
	pmuludq xmm13, xmm10
	movaps	xmm10, xmm11
	paddq	xmm8, xmm13
	movaps	xmm11, [rsp-0x68]
	paddq	xmm0, xmm15
	pmuludq xmm10, xmm12
	paddq	xmm8, xmm10
	pslld	xmm12, 1
	movaps	xmm10, [.sse2_bot64bitmask]
	pslld	xmm14, 1
	pand	xmm10, xmm6
	movaps	xmm15, [rsp-0x78]
	paddq	xmm10, xmm6
	pmuludq xmm6, xmm15
	paddq	xmm1, xmm6
	movaps	xmm6, xmm11
	movaps	xmm13, [.sse2_bot64bitmask]
	pmuludq xmm6, xmm10
	paddq	xmm3, xmm6
	movaps	xmm6, xmm7
	pand	xmm13, xmm5
	pmuludq xmm6, xmm10
	paddq	xmm0, xmm6
	movaps	xmm6, xmm15
	movaps	xmm15, [rsp+0x38]
	paddq	xmm13, xmm5
	pslld	xmm6, 1
	pmuludq xmm5, xmm11
	pmuludq xmm6, xmm4
	pslld	xmm11, 1
	paddq	xmm2, xmm6
	movaps	xmm6, xmm11
	pmuludq xmm13, xmm7
	pmuludq xmm11, xmm4
	pslld	xmm7, 1
	paddq	xmm1, xmm11
	paddq	xmm0, xmm5
	pmuludq xmm6, xmm9
	pmuludq xmm9, xmm7
	paddq	xmm2, xmm6
	paddq	xmm1, xmm9
	movaps	xmm6, [rsp-0x58]
	movaps	xmm9, [rsp-0x48]
	pmuludq xmm6, xmm7
	pmuludq xmm7, xmm4
	paddq	xmm2, xmm6
	paddq	xmm3, xmm7
	movaps	xmm6, xmm9
	movaps	xmm7, [rsp+0x8]
	pmuludq xmm6, xmm12
	movaps	xmm5, [rsp-0x38]
	pmuludq xmm15, xmm7
	paddq	xmm2, xmm15
	paddq	xmm2, xmm6
	pmuludq xmm12, xmm7
	paddq	xmm1, xmm12
	movaps	xmm6, [rsp-0x28]
	pmuludq xmm10, xmm5
	pslld	xmm5, 1
	paddq	xmm8, xmm10
	pmuludq xmm4, xmm5
	paddq	xmm0, xmm4
	pmuludq xmm6, xmm9
	movaps	xmm4, [rsp-0x8]
	paddq	xmm1, xmm6
	movaps	xmm10, xmm1
	movaps	xmm12, xmm2
	pmuludq xmm14, xmm7
	movaps	xmm6, [rsp-0x18]
	pmuludq xmm4, xmm7
	paddq	xmm0, xmm4
	punpcklqdq xmm10, xmm0
	paddq	xmm8, xmm13
	pmuludq xmm6, xmm5
	movaps	xmm4, [rsp+0x18]
	paddq	xmm3, xmm6
	paddq	xmm3, xmm14
	punpcklqdq xmm12, xmm3
	punpckhqdq xmm2, xmm3
	pmuludq xmm4, [rsp+0x28]
	paddq	xmm8, xmm4
	punpckhqdq xmm1, xmm0
	; third part: carry propagation.  Sums are shifted right by 26 / 25
	; bits (psrlq 0x1a / 0x19), the shifted-out part is added to the
	; neighbouring accumulator and the remainder is masked with
	; .packedmask26 / .packedmask25.  The carry leaving the topmost limb is
	; multiplied by 19 and added back at the bottom.
	movaps	xmm14, xmm8
	movaps	xmm0, xmm10
	pand	xmm10, [.packedmask26]
	movaps	xmm3, xmm12
	psrlq	xmm0, 0x1a
	paddq	xmm1, xmm0
	movaps	xmm0, xmm1
	punpcklqdq xmm14, xmm8
	psrlq	xmm3, 0x1a
	paddq	xmm2, xmm3
	movaps	xmm3, xmm2
	punpckhqdq xmm8, xmm8
	psrlq	xmm0, 0x19
	pand	xmm12, [.packedmask26]
	psrlq	xmm3, 0x19
	paddq	xmm14, xmm0
	paddq	xmm10, xmm3
	movaps	xmm3, xmm0
	movaps	xmm0, xmm14
	pand	xmm2, [.packedmask25]
	pslldq	xmm3, 0x8
	psrlq	xmm0, 0x1a
	paddq	xmm8, xmm0
	movaps	xmm0, xmm8
	pand	xmm1, [.packedmask25]
	psrlq	xmm0, 0x19
	pmuludq xmm0, [.packednineteen]
	punpckhqdq xmm0, xmm3
	pand	xmm14, [.packedmask26]
	paddq	xmm12, xmm0
	movaps	xmm0, xmm10
	movaps	xmm3, xmm12
	pand	xmm10, [.packedmask26]
	psrlq	xmm0, 0x1a
	paddq	xmm1, xmm0
	pand	xmm12, [.packedmask26]
	psrlq	xmm3, 0x1a
	paddq	xmm2, xmm3
	pand	xmm8, [.packedmask25]
	; re-pack into the loop-carried registers (xmm3, xmm15, xmm12, xmm10,
	; xmm14) for the next iteration
	movaps	xmm15, xmm10
	punpckhqdq xmm10, xmm1
	movaps	xmm3, xmm12
	punpcklqdq xmm15, xmm1
	punpckhqdq xmm12, xmm2
	punpcklqdq xmm3, xmm2
	punpckhqdq xmm14, xmm8
	; ZF still comes from the "sub eax, 1" at the top of the loop
	jne	.square_times_loop
	; merge the loop-carried registers back into three 16 byte words and
	; store them to the output
	pshufd	xmm15, xmm15, 0x8f
	pshufd	xmm2, xmm3, 0xf8
	pshufd	xmm0, xmm10, 0x8f
	pshufd	xmm9, xmm14, 0xf8
	por	xmm2, xmm15
	pshufd	xmm15, xmm12, 0xf8
	movaps	[rdi+0x20], xmm9
	por	xmm0, xmm15
	movaps	[rdi], xmm2
	movaps	[rdi+0x10], xmm0

	; undo the alignment adjustment, then the fixed 0x60 byte reservation
	mov	rcx, [rsp+0x50]
	add	rsp, rcx
	add	rsp, 0x60
	epilog
	; constants below are used as 16 byte memory operands of legacy SSE
	; instructions, hence the 16 byte alignment
align 16
.sse2_top64bitmask:
	; selects the upper 64 bit lane
	dd	0x00000000, 0x00000000, 0xffffffff, 0xffffffff
.packednineteen:
	dq	19, 19
.packed3819:
	; 38 in the lower lane, 19 in the upper lane
	dq	38, 19
.sse2_bot64bitmask:
	; selects the lower 64 bit lane
	dd	0xffffffff, 0xffffffff, 0x00000000, 0x00000000
.packedmask26:
	; low 26 bits of each 64 bit lane
	dq	0x3ffffff, 0x3ffffff
.packedmask25:
	; low 25 bits of each 64 bit lane
	dq	0x1ffffff, 0x1ffffff

end if


if used curve25519$mul | defined include_everything
	; three arguments: rdi == out, rsi == r, rdx == s
	;
	; Going by the name and the structure below (limbs of r are broadcast and
	; multiplied against shuffled copies of s), this computes the field
	; product out = r * s.
	;
	; All three pointers are accessed as 3 x 16 bytes with movaps, so each
	; must be 16 byte aligned and 0x30 bytes long.  out may alias r and/or s:
	; both inputs are completely loaded before the first store to rdi.
	;
	; clobbers rax, rcx, xmm0-xmm15 and flags.  There is no call instruction
	; in here (prolog/epilog macros aside).  Scratch slots are addressed from
	; [rsp-0x78] up to [rsp+0x28], i.e. most of them live BELOW rsp.
	; NOTE(review): that relies on a 128 byte red zone being available in
	; the environment this runs in -- confirm before reusing elsewhere.
falign
curve25519$mul:
	prolog	curve25519$mul
	; reserve 0x50 bytes, round rsp down to a 16 byte boundary (the amount
	; removed is kept at [rsp+0x40]), then drop 8 more so that
	; rsp == 8 (mod 16).  Every scratch offset used below is also
	; 8 (mod 16), which makes each movaps slot 16 byte aligned.
	sub	rsp, 0x50
	movaps	xmm14, [rdx]
	mov	rax, rsp
	and	rax, 0xf
	sub	rsp, rax
	mov	[rsp+0x40], rax
	sub	rsp, 8
	; load s (rdx) and keep several differently shuffled copies of it in
	; registers and in stack scratch slots
	pshufd	xmm3, xmm14, 0xa5
	pshufd	xmm0, xmm14, 0xd8
	movaps	xmm8, [rdx+0x10]
	pshufd	xmm1, xmm14, 0xfa
	psrldq	xmm14, 0xc
	movaps	[rsp-0x8], xmm3
	movaps	xmm3, xmm14
	pshufd	xmm6, xmm8, 0xa5
	punpcklqdq xmm3, xmm8
	movaps	xmm10, [rdx+0x20]
	movaps	[rsp-0x68], xmm6
	pshufd	xmm6, xmm8, 0xfa
	movaps	[rsp-0x38], xmm3
	pshufd	xmm3, xmm8, 0xd8
	psrldq	xmm8, 0xc
	movaps	xmm7, xmm8
	movaps	[rsp-0x58], xmm6
	punpcklqdq xmm7, xmm10
	movaps	[rsp-0x78], xmm7
	pshufd	xmm7, xmm10, 0xd8
	; load r (rsi) and broadcast its individual 32 bit limbs.  The recurring
	; "pand topmask + paddq" pair computes x + (x AND topmask), which leaves
	; the lower 64 bit lane as-is and doubles the upper one.
	movaps	xmm11, [rsi]
	pshufd	xmm4, xmm7, 0xfa
	movaps	xmm15, xmm7
	movaps	xmm2, [.sse2_top64bitmask]
	pshufd	xmm5, xmm11, 0x55
	movaps	[rsp+0x8], xmm4
	movaps	xmm4, xmm2
	movaps	xmm7, xmm2
	movaps	xmm8, xmm2
	pand	xmm4, xmm5
	movaps	xmm12, [rsi+0x10]
	movaps	xmm9, xmm2
	pshufd	xmm10, xmm11, 0xaa
	paddq	xmm4, xmm5
	pshufd	xmm5, xmm11, 0xff
	movaps	[rsp-0x28], xmm15
	pshufd	xmm11, xmm11, 0
	pand	xmm7, xmm5
	paddq	xmm7, xmm5
	pshufd	xmm5, xmm12, 0x55
	pand	xmm8, xmm5
	paddq	xmm8, xmm5
	pshufd	xmm5, xmm12, 0xaa
	movaps	[rsp-0x48], xmm5
	pshufd	xmm5, xmm12, 0xff
	pshufd	xmm12, xmm12, 0
	pand	xmm9, xmm5
	movaps	xmm14, xmm9
	paddq	xmm14, xmm5
	movaps	xmm5, [rsi+0x20]
	pshufd	xmm9, xmm5, 0xdd
	pand	xmm2, xmm9
	movaps	xmm13, xmm2
	movaps	xmm2, xmm9
	pshufd	xmm9, xmm5, 0xcc
	movaps	xmm5, xmm4
	paddq	xmm2, xmm13
	movaps	xmm13, xmm4
	movaps	[rsp-0x18], xmm2
	; accumulate the 32x32->64 bit partial products (pmuludq) with paddq
	pmuludq xmm5, xmm0
	movaps	xmm2, xmm7
	pmuludq xmm13, xmm1
	movaps	[rsp+0x18], xmm9
	pmuludq xmm2, xmm0
	movaps	xmm9, xmm4
	paddq	xmm13, xmm2
	movaps	xmm2, xmm7
	movaps	[rsp+0x28], xmm5
	pmuludq xmm9, xmm3
	pmuludq xmm2, xmm1
	movaps	xmm5, xmm6
	paddq	xmm9, xmm2
	movaps	xmm2, xmm7
	pmuludq xmm5, xmm4
	pmuludq xmm2, xmm3
	paddq	xmm5, xmm2
	movaps	xmm2, xmm15
	movaps	xmm15, xmm8
	pmuludq xmm2, xmm4
	pshufd	xmm4, xmm4, 0xa
	pmuludq xmm15, xmm0
	paddq	xmm9, xmm15
	movaps	xmm15, xmm8
	pmuludq xmm15, xmm1
	paddq	xmm5, xmm15
	movaps	xmm15, xmm6
	movaps	xmm6, xmm9
	pslldq	xmm9, 0x8
	pmuludq xmm15, xmm7
	paddq	xmm2, xmm15
	movaps	xmm15, xmm14
	pshufd	xmm7, xmm7, 0xa
	pmuludq xmm15, xmm0
	paddq	xmm5, xmm15
	movaps	xmm15, xmm8
	pshufd	xmm8, xmm8, 0xa
	pmuludq xmm15, xmm3
	paddq	xmm2, xmm15
	movaps	xmm15, xmm14
	pmuludq xmm15, xmm1
	paddq	xmm2, xmm15
	movaps	xmm15, [rsp-0x18]
	pmuludq xmm15, xmm0
	paddq	xmm2, xmm15
	movaps	xmm15, xmm5
	pslldq	xmm2, 0x8
	pslldq	xmm5, 0x8
	punpckhqdq xmm6, xmm5
	punpckhqdq xmm15, xmm2
	movaps	xmm2, xmm13
	pslldq	xmm13, 0x8
	punpckhqdq xmm2, xmm9
	movaps	xmm9, xmm2
	movaps	xmm2, [rsp+0x28]
	movaps	xmm5, xmm2
	punpckhqdq xmm5, xmm13
	movaps	xmm13, xmm2
	pxor	xmm2, xmm2
	pslldq	xmm13, 0x8
	punpckhqdq xmm2, xmm13
	; from here on the running sums live in xmm2, xmm5, xmm9, xmm6, xmm15
	movaps	xmm13, xmm11
	pmuludq xmm13, xmm0
	paddq	xmm2, xmm13
	movaps	xmm13, xmm11
	pmuludq xmm13, xmm1
	paddq	xmm5, xmm13
	movaps	xmm13, xmm11
	pmuludq xmm13, xmm3
	paddq	xmm9, xmm13
	movaps	xmm13, [rsp-0x58]
	pmuludq xmm13, xmm11
	paddq	xmm6, xmm13
	movaps	xmm13, xmm10
	pmuludq xmm11, [rsp-0x28]
	paddq	xmm15, xmm11
	movaps	xmm11, xmm12
	pmuludq xmm13, xmm0
	paddq	xmm5, xmm13
	movaps	xmm13, xmm10
	pmuludq xmm11, xmm0
	pmuludq xmm13, xmm1
	paddq	xmm9, xmm13
	movaps	xmm13, xmm12
	paddq	xmm9, xmm11
	movaps	xmm11, xmm10
	pmuludq xmm13, xmm1
	paddq	xmm6, xmm13
	movaps	xmm13, [rsp-0x48]
	pmuludq xmm11, xmm3
	paddq	xmm6, xmm11
	movaps	xmm11, [rsp-0x58]
	pmuludq xmm11, xmm10
	paddq	xmm15, xmm11
	movaps	xmm11, xmm13
	pmuludq xmm11, xmm0
	paddq	xmm6, xmm11
	movaps	xmm11, xmm12
	pmuludq xmm11, xmm3
	paddq	xmm15, xmm11
	movaps	xmm11, xmm13
	movaps	xmm13, xmm10
	movaps	xmm10, [rsp-0x48]
	pmuludq xmm11, xmm1
	paddq	xmm15, xmm11
	movaps	xmm11, [rsp+0x18]
	pmuludq xmm0, xmm11
	paddq	xmm15, xmm0
	; the remaining products use copies of the r limbs that are first
	; multiplied by 19 (xmm0 holds .packednineteen from here until the
	; final carry fold)
	movaps	xmm0, [.packednineteen]
	pmuludq xmm13, xmm0
	pmuludq xmm10, xmm0
	movaps	[rsp+0x18], xmm13
	movaps	xmm13, xmm10
	pmuludq xmm8, xmm0
	pmuludq xmm7, xmm0
	movaps	xmm10, xmm11
	movaps	xmm11, xmm4
	pmuludq xmm12, xmm0
	pshufd	xmm4, [rsp-0x18], 0xa
	pmuludq xmm11, xmm0
	movaps	[rsp-0x48], xmm11
	pmuludq xmm10, xmm0
	pshufd	xmm11, xmm14, 0xa
	pmuludq xmm1, xmm10
	pmuludq xmm4, xmm0
	movaps	xmm14, [rsp-0x8]
	pmuludq xmm11, xmm0
	pmuludq xmm14, xmm4
	paddq	xmm2, xmm14
	movaps	xmm14, [rsp-0x38]
	pmuludq xmm14, xmm4
	paddq	xmm5, xmm14
	movaps	xmm14, [rsp-0x68]
	pmuludq xmm14, xmm4
	paddq	xmm9, xmm14
	movaps	xmm14, [rsp-0x78]
	pmuludq xmm14, xmm4
	paddq	xmm6, xmm14
	movaps	xmm14, [rsp-0x38]
	pmuludq xmm14, xmm11
	paddq	xmm2, xmm14
	movaps	xmm14, [rsp-0x68]
	pmuludq xmm14, xmm11
	paddq	xmm5, xmm14
	movaps	xmm14, [rsp-0x78]
	pmuludq xmm14, xmm11
	paddq	xmm9, xmm14
	movaps	xmm14, [rsp+0x8]
	pmuludq xmm11, xmm14
	paddq	xmm6, xmm11
	movaps	xmm11, [rsp-0x68]
	pmuludq xmm11, xmm8
	paddq	xmm2, xmm11
	movaps	xmm11, [rsp-0x78]
	pmuludq xmm11, xmm8
	pmuludq xmm8, xmm14
	paddq	xmm5, xmm11
	movaps	xmm11, xmm14
	paddq	xmm9, xmm8
	movaps	xmm14, [rsp-0x28]
	pmuludq xmm4, xmm11
	paddq	xmm15, xmm4
	movaps	xmm8, xmm14
	movaps	xmm4, [rsp-0x48]
	pmuludq xmm8, xmm10
	paddq	xmm6, xmm8
	movaps	xmm8, [rsp-0x78]
	pmuludq xmm4, xmm11
	pmuludq xmm8, xmm7
	pmuludq xmm7, xmm11
	paddq	xmm5, xmm7
	movaps	xmm7, xmm14
	paddq	xmm2, xmm8
	paddq	xmm2, xmm1
	movaps	xmm1, xmm10
	pmuludq xmm7, xmm13
	paddq	xmm9, xmm7
	movaps	xmm7, [rsp-0x58]
	pmuludq xmm1, xmm3
	paddq	xmm5, xmm1
	pmuludq xmm3, xmm13
	pmuludq xmm10, xmm7
	movaps	xmm1, xmm7
	paddq	xmm9, xmm10
	movaps	xmm10, [rsp+0x18]
	paddq	xmm2, xmm3
	pmuludq xmm13, xmm7
	pmuludq xmm1, xmm12
	paddq	xmm5, xmm13
	paddq	xmm2, xmm1
	pmuludq xmm12, xmm14
	pmuludq xmm10, xmm14
	paddq	xmm5, xmm12
	paddq	xmm2, xmm10
	movaps	xmm3, xmm5
	paddq	xmm2, xmm4
	; carry propagation: sums are shifted right by 26 / 25 bits (psrlq
	; 0x1a / 0x19), the shifted-out part is added to the neighbouring
	; accumulator and the remainder is masked with .packedmask26 (xmm1) /
	; .packedmask25 (xmm7).  The carry leaving the topmost limb is
	; multiplied by 19 (still in xmm0) and added back at the bottom.
	movaps	xmm4, xmm2
	movaps	xmm1, [.packedmask26]
	punpcklqdq xmm3, xmm6
	punpcklqdq xmm4, xmm9
	punpckhqdq xmm2, xmm9
	punpckhqdq xmm5, xmm6
	movaps	xmm6, xmm15
	movaps	xmm8, xmm4
	movaps	xmm7, xmm3
	punpcklqdq xmm6, xmm15
	pand	xmm3, xmm1
	psrlq	xmm8, 0x1a
	paddq	xmm2, xmm8
	psrlq	xmm7, 0x1a
	movaps	xmm9, xmm2
	paddq	xmm5, xmm7
	movaps	xmm8, xmm5
	punpckhqdq xmm15, xmm15
	psrlq	xmm9, 0x19
	paddq	xmm3, xmm9
	pand	xmm4, xmm1
	psrlq	xmm8, 0x19
	paddq	xmm6, xmm8
	movaps	xmm9, xmm6
	pslldq	xmm8, 0x8
	movaps	xmm7, [.packedmask25]
	pand	xmm6, xmm1
	psrlq	xmm9, 0x1a
	paddq	xmm15, xmm9
	movaps	xmm9, xmm15
	pand	xmm2, xmm7
	psrlq	xmm9, 0x19
	pmuludq xmm0, xmm9
	punpckhqdq xmm0, xmm8
	pand	xmm5, xmm7
	pand	xmm15, xmm7
	paddq	xmm4, xmm0
	movaps	xmm7, xmm4
	pand	xmm4, xmm1
	movaps	xmm0, xmm3
	pand	xmm1, xmm3
	psrlq	xmm7, 0x1a
	paddq	xmm2, xmm7
	psrlq	xmm0, 0x1a
	paddq	xmm5, xmm0
	; interleave the 64 bit lanes back into packed 32 bit limbs and store
	; the 0x30 byte result
	movaps	xmm0, xmm4
	punpckhdq xmm4, xmm2
	punpckldq xmm0, xmm2
	; undo the "sub rsp, 8" from the prologue; no scratch slot is touched
	; after this point, and [rsp+0x40] below is relative to the new rsp
	add	rsp, 8
	movaps	xmm2, xmm1
	punpckhdq xmm1, xmm5
	punpckldq xmm2, xmm5
	punpckhdq xmm6, xmm15
	punpcklqdq xmm4, xmm1
	punpcklqdq xmm0, xmm2
	mov	rcx, [rsp+0x40]
	movaps	[rdi], xmm0
	movaps	[rdi+0x10], xmm4
	movaps	[rdi+0x20], xmm6
	; undo the alignment adjustment, then the fixed 0x50 byte reservation
	add	rsp, rcx
	add	rsp, 0x50
	epilog
	; constants below are used as 16 byte memory operands of legacy SSE
	; instructions, hence the 16 byte alignment
align 16
.sse2_top64bitmask:
	; selects the upper 64 bit lane
	dd	0x00000000, 0x00000000, 0xffffffff, 0xffffffff
.packednineteen:
	dq	19, 19
.packedmask26:
	; low 26 bits of each 64 bit lane
	dq	0x3ffffff, 0x3ffffff
.packedmask25:
	; low 25 bits of each 64 bit lane
	dq	0x1ffffff, 0x1ffffff

end if

if used curve25519$contract | defined include_everything
	; two arguments: rdi == out buffer (32 bytes), rsi == ALIGNED 16 input number
	;
	; Serialises the 0x30 byte limb representation at rsi (ten 32 bit limbs,
	; alternately masked to 26 and 25 bits below) into 32 bytes at rdi.
	; The input is read with movaps and therefore must be 16 byte aligned;
	; the output is written one byte at a time and needs no alignment.
	;
	; rbp, rbx and r12-r15 are saved and restored; rax, rcx, rdx, rsi,
	; r8-r11, xmm0-xmm2 and flags are clobbered.  rdi is left unchanged.
	; Scratch copies of the input live at [rsp-0x68] .. [rsp-0x9], i.e.
	; BELOW rsp.  NOTE(review): that relies on a 128 byte red zone being
	; available in the environment this runs in -- confirm.
falign
curve25519$contract:
	prolog	curve25519$contract

	; this is one long scalar dependency chain (each carry feeds the next
	; limb); left as-is, could be revisited for scheduling some day

	movaps	xmm2, [rsi]
	movaps	xmm0, [rsi+0x10]
	movaps	xmm1, [rsi+0x20]
	push	rbp rbx r12 r13 r14 r15
	; reserve 0x50 bytes, then lower rsp by a further (rsp & 0xf) + 8 so
	; that rsp == 8 (mod 16) and the movaps scratch slots below are 16 byte
	; aligned; the extra amount is kept at [rsp+0x40] for the epilogue
	sub	rsp, 0x50
	mov	rax, rsp
	and	rax, 0xf
	add	rax, 8
	sub	rsp, rax
	mov	[rsp+0x40], rax
	
	; each 16 byte input word is spilled twice; limb 0 of a word is read
	; from the first copy, limbs 1..3 from the second
	movaps	[rsp-0x48], xmm2
	movaps	[rsp-0x38], xmm2

	mov	r12d, [rsp-0x48]

	movaps	[rsp-0x68], xmm0
	movaps	[rsp-0x28], xmm0

	; first carry pass over limbs 0..9: the bits above 26 / 25 of each limb
	; are added to the next limb
	mov	r9d, r12d
	and	r12d, 0x3ffffff
	mov	edx, [rsp-0x68]
	shr	r9d, 0x1a
	add	r9d, [rsp-0x34]

	movaps	[rsp-0x58], xmm1
	movaps	[rsp-0x18], xmm1
	
	mov	r8d, r9d
	and	r9d, 0x1ffffff
	mov	eax, [rsp-0x58]
	shr	r8d, 0x19
	add	r8d, [rsp-0x30]

	mov	r10d, r8d
	and	r8d, 0x3ffffff
	shr	r10d, 0x1a
	add	r10d, [rsp-0x2c]
	
	mov	ebx, r10d
	and	r10d, 0x1ffffff
	shr	ebx, 0x19
	add	ebx, edx
	mov	esi, ebx
	and	ebx, 0x3ffffff
	shr	esi, 0x1a
	add	esi, [rsp-0x24]
	
	mov	ecx, esi
	shr	ecx, 0x19
	add	ecx, [rsp-0x20]
	mov	edx, ecx
	shr	edx, 0x1a
	add	edx, [rsp-0x1c]

	mov	r13d, edx
	shr	r13d, 0x19
	add	r13d, eax
	mov	eax, r13d
	shr	eax, 0x1a
	add	eax, [rsp-0x14]

	; carry out of limb 9 is multiplied by 19 (x*9, then x + 2*(x*9)) and
	; folded into limb 0; a second carry pass follows
	mov	r11d, eax
	shr	r11d, 0x19
	lea	ebp, [r11+r11*8]
	lea	ebp, [r11+rbp*2]
	add	ebp, r12d
	mov	r12d, ebp
	shr	r12d, 0x1a
	add	r12d, r9d
	mov	r9d, r12d
	shr	r9d, 0x19
	add	r9d, r8d
	mov	r8d, r9d
	shr	r8d, 0x1a
	add	r8d, r10d
	mov	r15d, r8d
	shr	r15d, 0x19
	add	r15d, ebx
	mov	r10d, r15d
	shr	r10d, 0x1a
	and	esi, 0x1ffffff
	and	ecx, 0x3ffffff
	add	esi, r10d
	and	edx, 0x1ffffff
	and	r13d, 0x3ffffff
	
	mov	r10d, esi
	and	eax, 0x1ffffff
	and	ebp, 0x3ffffff
	shr	r10d, 0x19
	and	r12d, 0x1ffffff
	and	r9d, 0x3ffffff
	add	ecx, r10d
	and	r8d, 0x1ffffff
	and	r15d, 0x3ffffff
	mov	r10d, ecx
	and	esi, 0x1ffffff
	shr	r10d, 0x1a
	add	edx, r10d
	mov	r14d, edx
	shr	r14d, 0x19
	add	r14d, r13d
	mov	r10d, r14d
	shr	r10d, 0x1a
	add	eax, r10d
	mov	r10d, eax

	; fold the top carry (x19) into limb 0 again, this time also adding the
	; constant 0x13 (19), and run a third carry pass
	shr	r10d, 0x19
	lea	r11d, [r10*8]
	add	r11d, r10d
	lea	r10d, [r10+r11*2]
	lea	ebp, [rbp+r10+0x13]
	mov	ebx, ebp
	shr	ebx, 0x1a
	add	ebx, r12d
	mov	r11d, ebx
	shr	r11d, 0x19
	add	r11d, r9d
	mov	r13d, r11d
	shr	r13d, 0x1a
	add	r13d, r8d
	mov	r10d, r13d
	shr	r10d, 0x19
	add	r10d, r15d
	mov	r9d, r10d
	shr	r9d, 0x1a
	add	r9d, esi
	mov	r8d, r9d
	shr	r8d, 0x19
	and	ecx, 0x3ffffff
	
	and	edx, 0x1ffffff
	add	r8d, ecx
	and	r14d, 0x3ffffff
	and	eax, 0x1ffffff
	mov	esi, r8d
	and	ebp, 0x3ffffff
	and	ebx, 0x1ffffff
	shr	esi, 0x1a
	and	r11d, 0x3ffffff
	and	r13d, 0x1ffffff
	add	esi, edx
	and	r10d, 0x3ffffff
	and	r9d, 0x1ffffff
	mov	edx, esi
	and	r8d, 0x3ffffff
	and	esi, 0x1ffffff
	shr	edx, 0x19

	add	edx, r14d
	mov	r12d, edx
	and	edx, 0x3ffffff
	shr	r12d, 0x1a
	add	r12d, eax
	mov	eax, r12d
	shr	eax, 0x19
	lea	ecx, [rax*8]
	add	ecx, eax

	; final pass: besides the x19 fold, 0x3ffffed (2^26 - 19) is added to
	; limb 0, 0x1ffffff (2^25 - 1) to the odd limbs and 0x3ffffff (2^26 - 1)
	; to the even limbs, carrying along the way.
	; NOTE(review): together with the +19 above this looks like the usual
	; final reduction to the canonical value modulo 2^255 - 19 -- confirm
	; against the upstream donna contract routine.
	lea	eax, [rax+rcx*2]
	lea	ebp, [rbp+rax+0x3ffffed]
	mov	eax, ebp
	and	ebp, 0x3ffffff
	shr	eax, 0x1a
	; out[0] can be written right away, limb 0 is final
	mov	[rdi], bpl
	lea	ebx, [rbx+rax+0x1ffffff]
	mov	eax, ebx
	
	shr	eax, 0x19
	lea	r11d, [r11+rax+0x3ffffff]
	mov	eax, r11d
	shr	eax, 0x1a
	lea	ecx, [r13+rax+0x1ffffff]
	mov	eax, ecx
	shr	eax, 0x19
	lea	r10d, [r10+rax+0x3ffffff]
	
	mov	eax, r10d
	shr	eax, 0x1a
	lea	r9d, [r9+rax+0x1ffffff]
	mov	eax, r9d
	and	r9d, 0x1ffffff
	shr	eax, 0x19
	lea	r8d, [r8+rax+0x3ffffff]
	mov	eax, r8d
	shr	eax, 0x1a
	lea	esi, [rsi+rax+0x1ffffff]
	mov	eax, esi
	shr	eax, 0x19
	; from here the masking of the final limbs is interleaved with shifting
	; them to their bit position inside the output: limb 1 << 2, limb 2 << 3,
	; limb 3 << 5, limb 4 << 6, limb 6 << 1, limb 7 << 3, limb 8 << 4,
	; limb 9 << 6 (limbs 0 and 5 start on a byte boundary)
	and	ebx, 0x1ffffff
	and	r12d, 0x1ffffff
	lea	eax, [rdx+rax+0x3ffffff]
	shl	ebx, 2
	and	r11d, 0x3ffffff
	shl	r11d, 3
	and	ecx, 0x1ffffff
	; no explicit 26 bit mask for r10d: the 32 bit shift by 6 discards
	; everything above bit 25
	shl	r10d, 6
	mov	edx, eax
	shr	eax, 0x1a
	shl	ecx, 5
	lea	eax, [r12+rax+0x1ffffff]

	; write out[1..31] byte by byte; a byte shared by two limbs is formed
	; by OR-ing the top byte of the lower limb with the shifted next limb
	mov	r12d, ebp
	and	r8d, 0x3ffffff
	shr	r12d, 0x8
	and	esi, 0x1ffffff
	and	edx, 0x3ffffff
	mov [rdi+1], r12b
	mov	r12d, ebp
	shr	ebp, 0x18
	or	ebp, ebx
	shr	r12d, 0x10
	and	eax, 0x1ffffff
	mov	[rdi+3], bpl
	mov	ebp, ebx
	mov	[rdi+2], r12b
	shr	ebp, 8
	add	r8d, r8d
	shl	esi, 3
	mov	[rdi+4], bpl
	mov	ebp, ebx
	shr	ebx, 0x18
	or	ebx, r11d
	shr	ebp, 0x10
	shl	edx, 4
	mov	[rdi+6], bl
	mov	ebx, r11d
	mov	[rdi+5], bpl
	shr	ebx, 8
	shl	eax, 6
	mov	[rdi+7], bl
	mov	ebx, r11d
	shr	r11d, 0x18
	or	r11d, ecx
	shr	ebx, 0x10
	mov	[rdi+8], bl
	mov	[rdi+9], r11b

	mov	r11d, ecx
	shr	r11d, 8
	mov	[rdi+0x10], r9b
	mov	[rdi+0xa], r11b
	mov	r11d, ecx
	shr	ecx, 0x18
	or	ecx, r10d
	shr	r11d, 0x10
	mov	[rdi+0xc], cl
	mov	ecx, r10d
	mov	[rdi+0xb], r11b
	shr	ecx, 8
	mov	[rdi+0xd], cl
	mov	ecx, r10d
	shr	r10d, 0x18
	shr	ecx, 0x10
	mov	[rdi+0xf], r10b
	mov	[rdi+0xe], cl

	; second half of the output (bytes 0x10..0x1f) comes from limbs 5..9
	mov	ecx, r9d
	shr	ecx, 8
	mov	[rdi+0x11], cl
	mov	ecx, r9d
	shr	r9d, 0x18
	shr	ecx, 0x10
	or	r9d, r8d
	mov	[rdi+0x12], cl
	mov	ecx, r8d
	mov	[rdi+0x13], r9b
	shr	ecx, 8
	mov	[rdi+0x14], cl
	mov	ecx, r8d
	shr	r8d, 0x18
	shr	ecx, 0x10
	or	r8d, esi
	mov	[rdi+0x15], cl

	mov	ecx, esi
	mov	[rdi+0x16], r8b
	shr	ecx, 8
	mov	[rdi+0x17], cl
	mov	ecx, esi
	shr	esi, 0x18
	shr	ecx, 0x10
	or	esi, edx
	mov	[rdi+0x18], cl
	mov	ecx, edx
	mov	[rdi+0x19], sil
	shr	ecx, 8
	mov	[rdi+0x1a], cl
	mov	ecx, edx
	shr	edx, 0x18
	or	edx, eax
	shr	ecx, 0x10
	mov	[rdi+0x1c], dl
	mov	edx, eax
	mov	[rdi+0x1b], cl
	shr	edx, 8
	mov	[rdi+0x1d], dl
	mov	edx, eax
	shr	eax, 0x18
	shr	edx, 0x10
	mov	[rdi+0x1f], al
	mov	[rdi+0x1e], dl

	; undo the alignment adjustment and the 0x50 byte reservation, then
	; restore the callee-saved registers in reverse push order
	mov	rcx, [rsp+0x40]
	add	rsp, rcx
	add	rsp, 0x50
	pop	r15 r14 r13 r12 rbx rbp
	epilog

end if