HeavyThing - rng.inc

Jeff Marrison

Table of functions

	; ------------------------------------------------------------------------
	; HeavyThing x86_64 assembly language library and showcase programs
	; Copyright © 2015-2018 2 Ton Digital 
	; Homepage: https://2ton.com.au/
	; Author: Jeff Marrison <jeff@2ton.com.au>
	;       
	; This file is part of the HeavyThing library.
	;       
	; HeavyThing is free software: you can redistribute it and/or modify
	; it under the terms of the GNU General Public License, or
	; (at your option) any later version.
	;       
	; HeavyThing is distributed in the hope that it will be useful, 
	; but WITHOUT ANY WARRANTY; without even the implied warranty of
	; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
	; GNU General Public License for more details.
	;       
	; You should have received a copy of the GNU General Public License along
	; with the HeavyThing library. If not, see <http://www.gnu.org/licenses/>.
	; ------------------------------------------------------------------------
	;       
	; rng.inc: RNG goodies
	;
	; this is a loose transcode from a loose transcode from a different assembler enviro
	; which was a loose transcode from agners rng goods, which is a transcode from
	; the SFMT, combined with the mother-of-all rng, which agner states is sufficient
	; for security applications (due to very long cycle lengths, and when used with
	; rng_heavy_init, trying every seed possibility is not viable either).
	; see agner's papers on rng theory and his choices for further info... good enough
	; for agner, good enough for me :-)
	;
	; Our library defaults to make use of a heavy state initialization, which uses
	; an HMAC-DRBG to generate the initial state.
	;
	; Note that we use the combined output of SFMT and Mother-of-all, per Agner's
	; recommendation.
	;
	; Agner states that it is possible to reconstruct a full sequence from a
	; state-sized subsequence, which in our case is 1408 bytes long, and that safety
	; is improved by combining the output of two or more generators only if combined
	; output ONLY is accessible to the attacker. For our purposes, this means that
	; for use in our applications, we must not ever expose contiguous output from our
	; RNG to any would-be attacker. Care must be taken as a direct result of this, and
	; if you are really paranoid, use hmac_drbg instead, it is just very very slow
	; compared to the methods we have chosen here.
	;
	; In summary, using this RNG for our intended crypto use cases is safe, PROVIDED
	; that care is appropriately taken (read: throw away bits, re-init periodically, etc)
	; functions that are used from TLS/SSH (rng$block, rng$block_nzb) automatically
	; throw away bits so that we never expose a complete subsequence.
	;
	; Also, as with most things in this library, calls to here are not thread-safe.
	;
	;
	;
	; someday when I am bored I will come back and recode this properly so it isn't quite
	; so messy, haha
	;
        ; Portions of this file are heavily modified routines that Agner Fog released
        ; also under the GPL in his excellent random number resources.
        ; Because the heavily modified functions herein are from various source files
        ; of his, and that each of his source files contains its own unique copyright
        ; notice, included below is the randoma.h header that covers them all.
        ;
        ; Cheers Agner for some of this as usual; All Hail Agner!
        ;
        ; Agner's randoma copyright appears below:
	;/*****************************   randoma.h   **********************************
	;* Author:        Agner Fog
	;* Date created:  1997
	;* Last modified: 2013-05-04
	;* Project:       randoma
	;* Source URL:    www.agner.org/random
	;*
	;* Description:
	;* This header file contains function prototypes and class declarations for
	;* the randoma library of uniform random number generators optimized in 
	;* assembly language.
	;*
	;* The following random number generators are available:
	;* Mersenne Twister, Mother-Of-All generator and SFMT generator.
	;*
	;* This library is available in many versions for different platforms,
	;* see ran-instructions.pdf for details.
	;*
	;* Copyright 1997-2010 by Agner Fog. 
	;* GNU General Public License http://www.gnu.org/licenses/gpl.html
	;*******************************************************************************
	;

if used rng$init | defined include_everything

globals
{
	; note: these are "weird" on purpose, the previous rendition ordered symbols alphabetically
	; in the data seg
	;
	; the ORDER and SIZE of everything in here is load-bearing, several routines read/write
	; across neighbouring symbols with 64 and 128 bit accesses:
	;   - [_rng0_m3] as a dqword covers m3 and m2, [_rng2_m1] as a dqword covers m1, m0 and mc
	;   - [_rng3_m0] as a qword covers m0 (low dword) and mc (high dword)
	;   - [_rng4_mc] as a qword covers mc and the low dword of _rng5_one (rng$init zeroes it)
	;   - [_rng7_mf3] and [_rng9_mf1] as dqwords cover the four multipliers pairwise
	; do not reorder or resize without revisiting every one of those accesses.

	; Mother-of-all history words, one per qword (only the low dword takes part in pmuludq)
	align 16
	_rng0_m3                dd ?,?
	_rng1_m2                dd ?,?
	_rng2_m1                dd ?,?
	_rng3_m0                dd ?
	; Mother-of-all carry, must directly follow m0
	_rng4_mc                dd ?
	; bit pattern of the double 1.0 (set up by rng$init), used by rng$double
	_rng5_one               dq ?
	_rng6_tempran           dq ?
	; Mother-of-all multipliers, copied from _rngh_initmother by rng$init
	align 16
	_rng7_mf3               dd ?,?
	_rng8_mf2               dd ?,?
	_rng9_mf1               dd ?,?
	_rnga_mf0               dd ?,?
	; rng$int caches the last interval and its rejection limit here
	_rngb_lastinterval      dd ?
	_rngc_rlimit            dd ?
	_rngd_unused            dd ?
	; BYTE offset of the next unread word inside _rngg_state
	_rnge_ix                dd ?
	; SFMT AND-mask, copied from _rngi_initmask by rng$init
	_rngf_amask             dd ?,?,?,?
	; SFMT state: 352 dwords == 88 x 128 bits == 1408 bytes, 16 byte aligned (accessed with movdqa)
	_rngg_state             rd 352
	; read-only initialisation constants
	align 16
	_rngh_initmother	dd 2111111111, 0, 1492, 0, 1776, 0, 5115, 0
	_rngi_initmask		dd 0effff7fbH,0ffffffefH,0dfdfbfffH,07fffdbfdH
	_rngj_initparity	dd 1,0,0e8148000H,0d0c7afa3H
}


if rng_heavy_init

	; Agner's original seed expansion is all well and good, but since we have the much slower
	; and crypto-safe hmac_drbg, we use that to fill both our 1408 byte main (SFMT) state and
	; our 32 byte mother-of-all state with entropy

	; (this is versus the non rng_heavy_init version that just uses a single 32 bit seed and expands that per Agner's
	; original methods).

falign
rng$init:
	prolog_silent	rng$init

	; rng$init, heavy flavour: takes no arguments, results land in the _rng* globals.
	; Gathers a 64 byte seed, keys an HMAC_DRBG (SHA512) with it, and lets the DRBG fill the
	; 1408 byte SFMT state and the 32 byte Mother-of-all state. Afterwards both generators
	; are stepped once and _rnge_ix is reset to 0.
	;
	; our HMAC_DRBG setup needs non-predictable input of 64 bytes
	; to accomplish that, we do 16 bytes of rdtsc + gettimeofday at either end of the 64 bytes
	; and 32 bytes read from /dev/urandom in the middle to get our initial Key, V
	; NOTE: if the rng_paranoid setting is enabled, we'll pull our 32 bytes from /dev/random, which may stall.
	;
	; stack frame (256 bytes):
	;   [rsp+0   .. rsp+63]   the 64 byte seed ([rsp] is reused for the DRBG object afterwards)
	;   [rsp+128 .. rsp+143]  struct timeval scratch, also reused for the fd and the read() result

	sub	rsp, 256
	; readtsc is a macro defined elsewhere; its result is taken from rax
	readtsc
	mov	[rsp], rax
	lea	rdi, [rsp+128]
	xor	esi, esi
	; instead of using the VDSO gettimeofday, we are intentionally using the syscall version because
	; it takes a non-deterministic amount of time, thus adding more entropy than otherwise
	mov	eax, syscall_gettimeofday
	syscall
	; tv_usec runs 0..999999, which needs 20 bits, so scoot the seconds over by 20 before
	; combining (a smaller shift would OR the top microsecond bits into the low seconds bits)
	mov	rax, [rsp+128]
	shl	rax, 20
	or	rax, [rsp+136]
	mov	[rsp+8], rax

	; so we have a rdtsc + gettimeofday for the first 16 bytes, next add /dev/urandom for 32
	mov	eax, syscall_open
	mov	rdi, .devurandom
	xor	esi, esi
	syscall
	cmp	eax, 0
	jl	.bad_devurandom
	; park the fd in the timeval scratch so it survives the read
	mov	[rsp+128], eax
	mov	edi, eax
	lea	rsi, [rsp+16]
	mov	edx, 32
	mov	eax, syscall_read
	syscall
	; remember the read result, close the fd first, then judge the result
	mov	[rsp+136], eax
	mov	edi, [rsp+128]
	mov	eax, syscall_close
	syscall
	; NOTE(review): any positive byte count is accepted; on a short read the tail of the
	; 32 byte window keeps whatever happened to be on the stack
	cmp	dword [rsp+136], 0
	jle	.bad_devurandom
	; that will take a non-deterministic amount of time to perform, so we can add another rdtsc+gettimeofday
	; for the last 16 bytes of our entropy seed to hmac_drbg
calign
.initialseed_ready:
	lea	rdi, [rsp+128]
	xor	esi, esi
	; instead of using the VDSO gettimeofday, we are intentionally using the syscall version because
	; it takes a non-deterministic amount of time, thus adding more entropy than otherwise
	mov	eax, syscall_gettimeofday
	syscall
	; seconds shifted past the 20 bits that microseconds can occupy, then combined
	mov	rax, [rsp+128]
	shl	rax, 20
	or	rax, [rsp+136]
	mov	[rsp+48], rax
	; one last readtsc for our final 8 bytes
	readtsc
	mov	[rsp+56], rax

	; so now we have our seed material, fire up an HMAC_DRBG
	; (rdi = hmac init function, rsi = seed, edx = seed length; object comes back in rax)
	mov	rdi, hmac$init_sha512
	mov	rsi, rsp
	mov	edx, 64
	call	hmac_drbg$new
	mov	[rsp], rax

	; so now we can fill up our state
	mov	rdi, rax
	mov	rsi, _rngg_state
	mov	edx, 1408
	call	hmac_drbg$generate

	; m0..m3 next (32 bytes starting at _rng0_m3 also covers the carry _rng4_mc)
	mov	rdi, [rsp]
	mov	rsi, _rng0_m3
	mov	edx, 32
	call	hmac_drbg$generate

	; we are finished seeding/expanding
	mov	rdi, [rsp]
	call	hmac_drbg$destroy

	; done with our stackframe
	add	rsp, 256

	; install the Mother-of-all multipliers and the SFMT mask from their constant tables
	movaps	xmm0, dqword [_rngh_initmother]
	movaps	dqword [_rng7_mf3], xmm0
	movaps	xmm0, dqword [_rngh_initmother+16]
	movaps	dqword [_rng9_mf1], xmm0

	movaps	xmm0, dqword [_rngi_initmask]
	movaps	dqword [_rngf_amask], xmm0

	; _rng5_one = bit pattern of the double 1.0; rng$int's cached interval is invalidated
	xor	eax, eax
	mov	dword [_rng5_one], eax
	mov	dword [_rng5_one+4], 3FF00000H
	mov	dword [_rngb_lastinterval], eax

	; parity check: AND the first 128 bits of state with the parity vector and fold
	; the result down to a single byte whose parity flag tells us odd/even
	movaps	xmm1, dqword [_rngg_state]
	andps	xmm1, dqword [_rngj_initparity]
	movhlps	xmm2, xmm1
	xorps	xmm1, xmm2
	pshufd	xmm2, xmm1, 1
	xorps	xmm1, xmm2
	movd	eax, xmm1
	mov	esi, eax
	shr	eax, 16
	xor	eax, esi
	; hmm TODO: redo so we can access upper 8 bit regs, previous rendition didn't have this luxury
	mov	esi, eax
	shr	esi, 8
	xor	al, sil
	; odd parity: state is fine as it is
	jpo	.l8
	; even parity: flip the lowest set bit position of the first nonzero parity word
	xor	edi, edi
	lea	rsi, [_rngj_initparity]
calign
.l5:
	mov	eax, [rsi+rdi*4]
	test	eax, eax
	jnz	.l6
	inc	edi
	jmp	.l5
calign
.l6:
	bsf	esi, eax
	btc	[_rngg_state+rdi*4], esi
.l8:
	; mothernext inline
	; x = mf3*m3 + mf2*m2 + mf1*m1 + mf0*m0 + mc; history shifts down one slot,
	; low dword of x becomes the new m0, high dword the new carry mc
	movdqa	xmm1, dqword [_rng0_m3]
	movdqa	xmm2, dqword [_rng2_m1]
	movhps	qword [_rng0_m3], xmm1
	movq	qword [_rng1_m2], xmm2
	movhps	qword [_rng2_m1], xmm2
	pmuludq	xmm1, dqword [_rng7_mf3]
	pmuludq	xmm2, dqword [_rng9_mf1]
	paddq	xmm1, xmm2
	movhlps	xmm2, xmm1
	; qword read of mc also picks up the low dword of _rng5_one, which was zeroed above
	movq	xmm3, qword [_rng4_mc]
	paddq	xmm1, xmm3
	paddq	xmm1, xmm2
	movq	qword [_rng3_m0], xmm1
	; end mothernext inline


	; copy of _rng_sfmt_generate
	; regenerates all 88 128-bit words of state in place; xmm1/xmm2 carry the two most
	; recently produced words (initially the last two words of the old state)
	movdqa	xmm1, dqword [_rngg_state+1376]
	movdqa	xmm2, dqword [_rngg_state+1392]
	mov	rsi, _rngg_state
	mov	rax, 320 + _rngg_state
	mov	rdi, 1408 + _rngg_state
calign
.ll1:
	; first 320 bytes: the partner word sits 1088 bytes ahead
	movdqa	xmm0, dqword [rsi+1088]
	psrld	xmm0, 7
	pand	xmm0, dqword [_rngf_amask]
	movdqa	xmm3, dqword [rsi]
	pxor	xmm0, xmm3
	pslldq	xmm3, 3
	psrldq	xmm1, 3
	pxor	xmm0, xmm3
	pxor	xmm0, xmm1
	movdqa	xmm1, xmm2
	pslld	xmm2, 14
	pxor	xmm2, xmm0
	movdqa	dqword [rsi], xmm2
	add	rsi, 16
	cmp	rsi, rax
	jb	.ll1
calign
.ll2:
	; remainder: the partner word wraps around and sits 320 bytes behind
	movdqa	xmm0, dqword [rsi-320]
	psrld	xmm0, 7
	pand	xmm0, dqword [_rngf_amask]
	movdqa	xmm3, dqword [rsi]
	pxor	xmm0, xmm3
	pslldq	xmm3, 3
	psrldq	xmm1, 3
	pxor	xmm0, xmm3
	pxor	xmm0, xmm1
	movdqa	xmm1, xmm2
	pslld	xmm2, 14
	pxor	xmm2, xmm0
	movdqa	[rsi], xmm2
	add	rsi, 16
	cmp	rsi, rdi
	jb	.ll2

	; sanity check that the mask still holds its expected first dword
	cmp	dword [_rngf_amask], 0effff7fbH
	jne	.llerror
	mov	dword [_rnge_ix], 0
	epilog
calign
.llerror:
	xor	eax, eax
	div	eax		; die a thousand deaths quite intentionally
	epilog
dalign
.devurandom:
if rng_paranoid
	db	'/dev/random',0
else
	db	'/dev/urandom',0
end if
calign
.bad_devurandom:
	; we could not open (or read from) the random device, so fill our middle 32 bytes
	; with two more rounds of rdtsc+gtod instead
	readtsc
	mov	[rsp+16], rax
	lea	rdi, [rsp+128]
	xor	esi, esi
	; instead of using the VDSO gettimeofday, we are intentionally using the syscall version because
	; it takes a non-deterministic amount of time, thus adding more entropy than otherwise
	mov	eax, syscall_gettimeofday
	syscall
	; seconds shifted past the 20 bits that microseconds can occupy, then combined
	mov	rax, [rsp+128]
	shl	rax, 20
	or	rax, [rsp+136]
	mov	[rsp+24], rax

	readtsc
	mov	[rsp+32], rax
	lea	rdi, [rsp+128]
	xor	esi, esi
	; instead of using the VDSO gettimeofday, we are intentionally using the syscall version because
	; it takes a non-deterministic amount of time, thus adding more entropy than otherwise
	mov	eax, syscall_gettimeofday
	syscall
	; seconds shifted past the 20 bits that microseconds can occupy, then combined
	mov	rax, [rsp+128]
	shl	rax, 20
	or	rax, [rsp+136]
	mov	[rsp+40], rax
	jmp	.initialseed_ready



else

	; rng$init, light flavour (assembled when rng_heavy_init is off): takes no arguments,
	; results land in the _rng* globals. A single 32 bit seed is expanded into the 352 dword
	; SFMT state plus the five Mother-of-all words, then both generators are stepped once
	; and _rnge_ix is reset to 0.
	;
	; note: we make a call to readtsc for the seed
	; i dont do fixed seed values anyway and this seems
	; as good or better than gettimeofday calls
	; RDRAND/RDSEED don't seem to be supported on much of my gear
	; and the low order word from readtsc when cascade-multiplied out here
	; seems to be quite sufficient
falign
rng$init:
	prolog_silent	rng$init
	; readtsc is a macro defined elsewhere; only eax is consumed as the seed here
	readtsc
	; xmm0..3
	; edi = index of the state dword being written, eax = its value
	xor	edi, edi
	jmp	.l2
calign
.l1:
	; initsubf0:
	; eax = 1812433253 * (eax xor (eax shr 30)) + (++edi)
	mov	esi, eax
	shr	eax, 30
	xor	eax, esi
	imul	eax, 1812433253
	inc	edi
	add	eax, edi
	; end initsubf0
.l2:
	; store, and keep going until index 351 (the last of 352 dwords) has been written
	mov	[_rngg_state+rdi*4], eax
	cmp	edi, 351
	jb	.l1
	; the same recurrence carries on for five more values: m0, m1, m2, m3 and the carry mc
	; initsubf0:
	mov	esi, eax
	shr	eax, 30
	xor	eax, esi
	imul	eax, 1812433253
	inc	edi
	add	eax, edi
	; end initsubf0
	mov	[_rng3_m0], eax
	; initsubf0:
	mov	esi, eax
	shr	eax, 30
	xor	eax, esi
	imul	eax, 1812433253
	inc	edi
	add	eax, edi
	; end initsubf0
	mov	[_rng2_m1], eax
	; initsubf0:
	mov	esi, eax
	shr	eax, 30
	xor	eax, esi
	imul	eax, 1812433253
	inc	edi
	add	eax, edi
	; end initsubf0
	mov	[_rng1_m2], eax
	; initsubf0:
	mov	esi, eax
	shr	eax, 30
	xor	eax, esi
	imul	eax, 1812433253
	inc	edi
	add	eax, edi
	; end initsubf0
	mov	[_rng0_m3], eax
	; initsubf0:
	mov	esi, eax
	shr	eax, 30
	xor	eax, esi
	imul	eax, 1812433253
	inc	edi
	add	eax, edi
	; end initsubf0
	mov	[_rng4_mc], eax
	
	; install the Mother-of-all multipliers and the SFMT mask from their constant tables
	movaps	xmm0, dqword [_rngh_initmother]
	movaps	dqword [_rng7_mf3], xmm0
	movaps	xmm0, dqword [_rngh_initmother+16]
	movaps	dqword [_rng9_mf1], xmm0

	movaps	xmm0, dqword [_rngi_initmask]
	movaps	dqword [_rngf_amask], xmm0
	
	; _rng5_one = bit pattern of the double 1.0; rng$int's cached interval is invalidated
	xor	eax, eax
	mov	dword [_rng5_one], eax
	mov	dword [_rng5_one+4], 3FF00000H
	mov	dword [_rngb_lastinterval], eax

	; parity check: AND the first 128 bits of state with the parity vector and fold
	; the result down to a single byte whose parity flag tells us odd/even
	movaps	xmm1, dqword [_rngg_state]
	andps	xmm1, dqword [_rngj_initparity]
	movhlps	xmm2, xmm1
	xorps	xmm1, xmm2
	pshufd	xmm2, xmm1, 1
	xorps	xmm1, xmm2
	movd	eax, xmm1
	mov	esi, eax
	shr	eax, 16
	xor	eax, esi
	; hmm TODO: redo so we can access upper 8 bit regs, previous rendition didn't have this luxury
	mov	esi, eax
	shr	esi, 8
	xor	al, sil
	; odd parity: state is fine as it is
	jpo	.l8
	; even parity: flip the lowest set bit position of the first nonzero parity word
	xor	edi, edi
	lea	rsi, [_rngj_initparity]
calign
.l5:
	mov	eax, [rsi+rdi*4]
	test	eax, eax
	jnz	.l6
	inc	edi
	jmp	.l5
calign
.l6:
	bsf	esi, eax
	btc	[_rngg_state+rdi*4], esi
.l8:
	; mothernext inline
	; x = mf3*m3 + mf2*m2 + mf1*m1 + mf0*m0 + mc; history shifts down one slot,
	; low dword of x becomes the new m0, high dword the new carry mc
	movdqa	xmm1, dqword [_rng0_m3]
	movdqa	xmm2, dqword [_rng2_m1]
	movhps	qword [_rng0_m3], xmm1
	movq	qword [_rng1_m2], xmm2
	movhps	qword [_rng2_m1], xmm2
	pmuludq	xmm1, dqword [_rng7_mf3]
	pmuludq	xmm2, dqword [_rng9_mf1]
	paddq	xmm1, xmm2
	movhlps	xmm2, xmm1
	; qword read of mc also picks up the low dword of _rng5_one, which was zeroed above
	movq	xmm3, qword [_rng4_mc]
	paddq	xmm1, xmm3
	paddq	xmm1, xmm2
	movq	qword [_rng3_m0], xmm1
	; end mothernext inline


	; copy of _rng_sfmt_generate
	; regenerates all 88 128-bit words of state in place; xmm1/xmm2 carry the two most
	; recently produced words (initially the last two words of the old state)
	movdqa	xmm1, dqword [_rngg_state+1376]
	movdqa	xmm2, dqword [_rngg_state+1392]
	mov	rsi, _rngg_state
	mov	rax, 320 + _rngg_state
	mov	rdi, 1408 + _rngg_state
calign
.ll1:
	; first 320 bytes: the partner word sits 1088 bytes ahead
	movdqa	xmm0, dqword [rsi+1088]
	psrld	xmm0, 7
	pand	xmm0, dqword [_rngf_amask]
	movdqa	xmm3, dqword [rsi]
	pxor	xmm0, xmm3
	pslldq	xmm3, 3
	psrldq	xmm1, 3
	pxor	xmm0, xmm3
	pxor	xmm0, xmm1
	movdqa	xmm1, xmm2
	pslld	xmm2, 14
	pxor	xmm2, xmm0
	movdqa	dqword [rsi], xmm2
	add	rsi, 16
	cmp	rsi, rax
	jb	.ll1
calign
.ll2:
	; remainder: the partner word wraps around and sits 320 bytes behind
	movdqa	xmm0, dqword [rsi-320]
	psrld	xmm0, 7
	pand	xmm0, dqword [_rngf_amask]
	movdqa	xmm3, dqword [rsi]
	pxor	xmm0, xmm3
	pslldq	xmm3, 3
	psrldq	xmm1, 3
	pxor	xmm0, xmm3
	pxor	xmm0, xmm1
	movdqa	xmm1, xmm2
	pslld	xmm2, 14
	pxor	xmm2, xmm0
	movdqa	[rsi], xmm2
	add	rsi, 16
	cmp	rsi, rdi
	jb	.ll2
	
	; sanity check that the mask still holds its expected first dword
	cmp	[_rngf_amask], 0effff7fbH
	jne	.llerror
	mov	[_rnge_ix], 0
	epilog
calign
.llerror:
	; deliberate divide-by-zero trap
	xor	eax, eax
	div	eax		; die a thousand deaths quite intentionally
	epilog

end if
	
	; eax (32 bits of seed) == a
	; rdi == b
	; rsi == d

end if



if used rng$int | defined include_everything
	; rng$int: uniformly distributed integer in [min, max], both inclusive
	; arguments: rdi == min, rsi == max
	;   the two are compared as full 64 bit SIGNED values, so a negative min must arrive
	;   sign-extended to 64 bits
	; returns: eax == min + r (32 bit result, upper half of rax cleared)
	;   if max <= min nothing is drawn and rax == max is returned
	;   if the span holds 2^32 or more values it cannot be narrowed with 32 bit arithmetic;
	;   the low 32 bits of min plus one raw rng$u32 word are returned (this is exactly
	;   uniform for a span of 2^32 values, e.g. 0..0xffffffff)
	; the interval and its rejection limit are cached in _rngb_lastinterval/_rngc_rlimit
falign
rng$int:
	prolog	rng$int
	push	rbx
	mov	rbx, rsi
	sub	rbx, rdi		; rbx = max - min
	jle	.m30
	; past the jle, rbx read as unsigned is the true difference (1 .. 2^64-1)
	mov	eax, 0xffffffff		; rax = 2^32 - 1 (32 bit write zero-extends)
	cmp	rbx, rax
	jae	.m40			; interval would not fit into 32 bits
	add	ebx, 1			; interval = max - min + 1, in 2 .. 2^32-1
	cmp	ebx, [_rngb_lastinterval]
	je	.m10
	; new interval: rlimit = floor(2^32 / interval) * interval - 1
	; (interval >= 2, so dividing edx:eax = 2^32 can neither fault nor overflow)
	xor	eax, eax
	mov	edx, 1
	div	ebx
	mul	ebx
	sub	eax, 1
	mov	[_rngc_rlimit], eax
	mov	[_rngb_lastinterval], ebx
calign
.m10:
	; rng$u32 may clobber rdi, so min rides in callee-saved rbx from here on
	mov	rbx, rdi
calign
.m20:
	call	rng$u32
	; edx:eax = random * interval; edx is the candidate, eax decides acceptance
	mul	dword [_rngb_lastinterval]
	cmp	eax, [_rngc_rlimit]
	ja	.m20			; rejected, draw again
	lea	eax, [rbx+rdx]
	pop	rbx
	epilog
calign
.m40:
	; span of 2^32 values or more: every 32 bit offset is acceptable
	mov	rbx, rdi
	call	rng$u32
	add	eax, ebx
	pop	rbx
	epilog
calign
.m30:
	mov	rax, rsi
	pop	rbx
	epilog

end if


if used rng$intmax | defined include_everything
	; rng$intmax: convenience wrapper, same as rng$int with min == 0
	; argument: rdi == max (inclusive)
	; returns: whatever rng$int leaves in rax for (0, max)
falign
rng$intmax:
	prolog	rng$intmax
	; shuffle into rng$int's argument order: rsi = max, rdi = min = 0
	mov	rsi, rdi
	xor	edi, edi
	call	rng$int
	epilog
end if


if used rng$u32 | defined include_everything
	; rng$u32: returns random 32 bits in eax (upper half of rax is zero)
	; no arguments; clobbers xmm1-xmm3, and on a state refill also xmm0, rsi and rdi
	; output = next unread SFMT state dword + Mother-of-all m0; the Mother generator is
	; then advanced one step, on EVERY call, including the one that refills the state, so
	; no two outputs are ever combined with the same Mother word
	; _rnge_ix is a byte offset into _rngg_state; the state is regenerated once it reaches 1404
falign
rng$u32:
	prolog	rng$u32
	mov	eax, [_rnge_ix]
	cmp	eax, 1404
	jnb	.morebits
.extract:
	; eax = byte offset of the word to hand out (0 when arriving from .morebits)
	add	eax, 4
	mov	[_rnge_ix], eax
	mov	eax, [rax+_rngg_state-4]
	add	eax, [_rng3_m0]
	; mothernext inline
	; x = mf3*m3 + mf2*m2 + mf1*m1 + mf0*m0 + mc; history shifts down one slot,
	; low dword of x becomes the new m0, high dword the new carry mc
	movdqa	xmm1, dqword [_rng0_m3]
	movdqa	xmm2, dqword [_rng2_m1]
	movhps	qword [_rng0_m3], xmm1
	movq	qword [_rng1_m2], xmm2
	movhps	qword [_rng2_m1], xmm2
	pmuludq	xmm1, dqword [_rng7_mf3]
	pmuludq	xmm2, dqword [_rng9_mf1]
	paddq	xmm1, xmm2
	movhlps	xmm2, xmm1
	; qword read of mc also picks up the low dword of _rng5_one, which rng$init zeroes
	movq	xmm3, qword [_rng4_mc]
	paddq	xmm1, xmm3
	paddq	xmm1, xmm2
	movq	qword [_rng3_m0], xmm1
	; end mothernext inline
	epilog
calign
.morebits:
	; modified copy of _rng_sfmt_generate
	; regenerates all 88 128-bit words of state in place; xmm1/xmm2 carry the two most
	; recently produced words (initially the last two words of the old state)
	movdqa	xmm1, dqword [_rngg_state+1376]
	movdqa	xmm2, dqword [_rngg_state+1392]
	mov	rsi, _rngg_state
	mov	rax, 320 + _rngg_state
	mov	rdi, 1408 + _rngg_state
calign
.ll1:
	; first 320 bytes: the partner word sits 1088 bytes ahead
	movdqa	xmm0, dqword [rsi+1088]
	psrld	xmm0, 7
	pand	xmm0, dqword [_rngf_amask]
	movdqa	xmm3, dqword [rsi]
	pxor	xmm0, xmm3
	pslldq	xmm3, 3
	psrldq	xmm1, 3
	pxor	xmm0, xmm3
	pxor	xmm0, xmm1
	movdqa	xmm1, xmm2
	pslld	xmm2, 14
	pxor	xmm2, xmm0
	movdqa	dqword [rsi], xmm2
	add	rsi, 16
	cmp	rsi, rax
	jb	.ll1
calign
.ll2:
	; remainder: the partner word wraps around and sits 320 bytes behind
	movdqa	xmm0, dqword [rsi-320]
	psrld	xmm0, 7
	pand	xmm0, dqword [_rngf_amask]
	movdqa	xmm3, dqword [rsi]
	pxor	xmm0, xmm3
	pslldq	xmm3, 3
	psrldq	xmm1, 3
	pxor	xmm0, xmm3
	pxor	xmm0, xmm1
	movdqa	xmm1, xmm2
	pslld	xmm2, 14
	pxor	xmm2, xmm0
	movdqa	[rsi], xmm2
	add	rsi, 16
	cmp	rsi, rdi
	jb	.ll2

	; the mask's first dword must hold its expected constant, otherwise trap
	cmp	dword [_rngf_amask], 0effff7fbH
	jne	.llerror
	; fresh state: restart at offset 0 and rejoin the normal path, which also steps
	; the Mother generator
	xor	eax, eax
	jmp	.extract
calign
.llerror:
	xor	eax, eax
	div	eax		; die a thousand deaths quite intentionally
	epilog
end if

if used rng$u64 | defined include_everything
	; rng$u64: returns random 64 bits in rax
	; no arguments; clobbers xmm1-xmm3, and on a state refill also xmm0, rsi and rdi
	; output = next unread 8 bytes of SFMT state + the qword at _rng3_m0 (Mother-of-all m0
	; in the low dword, its carry mc in the high dword); the Mother generator is then
	; advanced one step, on EVERY call, including the one that refills the state, so no
	; two outputs are ever combined with the same Mother words
	; _rnge_ix is a byte offset into _rngg_state; the state is regenerated once it reaches 1396
falign
rng$u64:
	prolog	rng$u64
	mov	eax, [_rnge_ix]
	cmp	eax, 1396
	jnb	.morebits
.extract:
	; eax = byte offset of the qword to hand out (0 when arriving from .morebits)
	add	eax, 8
	mov	[_rnge_ix], eax
	mov	rax, qword [rax+_rngg_state-8]
	add	rax, qword [_rng3_m0]
	; mothernext inline
	; x = mf3*m3 + mf2*m2 + mf1*m1 + mf0*m0 + mc; history shifts down one slot,
	; low dword of x becomes the new m0, high dword the new carry mc
	movdqa	xmm1, dqword [_rng0_m3]
	movdqa	xmm2, dqword [_rng2_m1]
	movhps	qword [_rng0_m3], xmm1
	movq	qword [_rng1_m2], xmm2
	movhps	qword [_rng2_m1], xmm2
	pmuludq	xmm1, dqword [_rng7_mf3]
	pmuludq	xmm2, dqword [_rng9_mf1]
	paddq	xmm1, xmm2
	movhlps	xmm2, xmm1
	; qword read of mc also picks up the low dword of _rng5_one, which rng$init zeroes
	movq	xmm3, qword [_rng4_mc]
	paddq	xmm1, xmm3
	paddq	xmm1, xmm2
	movq	qword [_rng3_m0], xmm1
	; end mothernext inline
	epilog
calign
.morebits:
	; modified copy of _rng_sfmt_generate
	; regenerates all 88 128-bit words of state in place; xmm1/xmm2 carry the two most
	; recently produced words (initially the last two words of the old state)
	movdqa	xmm1, dqword [_rngg_state+1376]
	movdqa	xmm2, dqword [_rngg_state+1392]
	mov	rsi, _rngg_state
	mov	rax, 320 + _rngg_state
	mov	rdi, 1408 + _rngg_state
calign
.ll1:
	; first 320 bytes: the partner word sits 1088 bytes ahead
	movdqa	xmm0, dqword [rsi+1088]
	psrld	xmm0, 7
	pand	xmm0, dqword [_rngf_amask]
	movdqa	xmm3, dqword [rsi]
	pxor	xmm0, xmm3
	pslldq	xmm3, 3
	psrldq	xmm1, 3
	pxor	xmm0, xmm3
	pxor	xmm0, xmm1
	movdqa	xmm1, xmm2
	pslld	xmm2, 14
	pxor	xmm2, xmm0
	movdqa	dqword [rsi], xmm2
	add	rsi, 16
	cmp	rsi, rax
	jb	.ll1
calign
.ll2:
	; remainder: the partner word wraps around and sits 320 bytes behind
	movdqa	xmm0, dqword [rsi-320]
	psrld	xmm0, 7
	pand	xmm0, dqword [_rngf_amask]
	movdqa	xmm3, dqword [rsi]
	pxor	xmm0, xmm3
	pslldq	xmm3, 3
	psrldq	xmm1, 3
	pxor	xmm0, xmm3
	pxor	xmm0, xmm1
	movdqa	xmm1, xmm2
	pslld	xmm2, 14
	pxor	xmm2, xmm0
	movdqa	[rsi], xmm2
	add	rsi, 16
	cmp	rsi, rdi
	jb	.ll2

	; the mask's first dword must hold its expected constant, otherwise trap
	cmp	dword [_rngf_amask], 0effff7fbH
	jne	.llerror
	; fresh state: restart at offset 0 and rejoin the normal path, which also steps
	; the Mother generator
	xor	eax, eax
	jmp	.extract
calign
.llerror:
	xor	eax, eax
	div	eax		; die a thousand deaths quite intentionally
	epilog
end if

if used rng$double | defined include_everything
	; rng$double: returns a random double in xmm0, 0.0 <= x < 1.0
	; no arguments; clobbers rdi and xmm1-xmm3, and on a state refill also rax and rsi
	; 64 bits are assembled from the SFMT state plus the Mother-of-all words, the top 52 of
	; them become the mantissa of a value in [1.0, 2.0), and 1.0 is subtracted at the end
falign
rng$double:
	prolog	rng$double
	mov	edi, [_rnge_ix]
	cmp	edi, 1396
	jnb	.morebits
.fetch:
	; edi = byte offset of the qword to consume (0 when arriving from .morebits)
	movq	xmm0, qword [rdi+_rngg_state]
	add	edi, 8
	mov	[_rnge_ix], edi
	; m0|mc with its 16 bit words shuffled so that m0 ends up in the upper dword
	movq	xmm1, qword [_rng3_m0]
	pshuflw	xmm1, xmm1, 01001011B
	paddq	xmm0, xmm1
	; mothernext inline
	; x = mf3*m3 + mf2*m2 + mf1*m1 + mf0*m0 + mc; history shifts down one slot,
	; low dword of x becomes the new m0, high dword the new carry mc
	movdqa	xmm1, dqword [_rng0_m3]
	movdqa	xmm2, dqword [_rng2_m1]
	movhps	qword [_rng0_m3], xmm1
	movq	qword [_rng1_m2], xmm2
	movhps	qword [_rng2_m1], xmm2
	pmuludq	xmm1, dqword [_rng7_mf3]
	pmuludq	xmm2, dqword [_rng9_mf1]
	paddq	xmm1, xmm2
	movhlps	xmm2, xmm1
	movq	xmm3, qword [_rng4_mc]
	paddq	xmm1, xmm3
	paddq	xmm1, xmm2
	movq	qword [_rng3_m0], xmm1
	; end mothernext inline
	; keep the top 52 random bits as mantissa, OR in the exponent of 1.0, subtract 1.0
	psrlq	xmm0, 12
	movsd	xmm1, [_rng5_one]
	por	xmm0, xmm1
	subsd	xmm0, xmm1
	epilog
calign
.morebits:
	; modified copy of _rng_sfmt_generate
	; regenerates all 88 128-bit words of state in place; xmm1/xmm2 carry the two most
	; recently produced words (initially the last two words of the old state)
	movdqa	xmm1, dqword [_rngg_state+1376]
	movdqa	xmm2, dqword [_rngg_state+1392]
	mov	rsi, _rngg_state
	mov	rax, 320 + _rngg_state
	mov	rdi, 1408 + _rngg_state
calign
.ll1:
	; first 320 bytes: the partner word sits 1088 bytes ahead
	movdqa	xmm0, dqword [rsi+1088]
	psrld	xmm0, 7
	pand	xmm0, dqword [_rngf_amask]
	movdqa	xmm3, dqword [rsi]
	pxor	xmm0, xmm3
	pslldq	xmm3, 3
	psrldq	xmm1, 3
	pxor	xmm0, xmm3
	pxor	xmm0, xmm1
	movdqa	xmm1, xmm2
	pslld	xmm2, 14
	pxor	xmm2, xmm0
	movdqa	dqword [rsi], xmm2
	add	rsi, 16
	cmp	rsi, rax
	jb	.ll1
calign
.ll2:
	; remainder: the partner word wraps around and sits 320 bytes behind
	movdqa	xmm0, dqword [rsi-320]
	psrld	xmm0, 7
	pand	xmm0, dqword [_rngf_amask]
	movdqa	xmm3, dqword [rsi]
	pxor	xmm0, xmm3
	pslldq	xmm3, 3
	psrldq	xmm1, 3
	pxor	xmm0, xmm3
	pxor	xmm0, xmm1
	movdqa	xmm1, xmm2
	pslld	xmm2, 14
	pxor	xmm2, xmm0
	movdqa	[rsi], xmm2
	add	rsi, 16
	cmp	rsi, rdi
	jb	.ll2

	; the mask's first dword must hold its expected constant, otherwise trap
	cmp	dword [_rngf_amask], 0effff7fbH
	jne	.llerror

	; fresh state: consume its first qword through the regular path
	xor	edi, edi
	jmp	.fetch
calign
.llerror:
	xor	eax, eax
	div	eax		; die a thousand deaths quite intentionally
	epilog

end if



if used rng$block | defined include_everything

	; rng$block: fill a buffer with random bytes
	; two arguments: rdi == byte buffer, rsi == length of same (0 is fine, nothing happens)
	; the buffer is filled 8 bytes at a time from rng$u64, the 1..7 byte tail comes out of
	; one more word, low bytes first
	; so that a contiguous run of generator output never ends up in the buffer, the 48th
	; word and every 49th one after that is thrown away, as is one extra word at the end
	; TODO: make me go faster
falign
rng$block:
	prolog	rng$block
	test	rsi, rsi
	jz	.nothingtodo
	push	rbx rbp r12
	mov	rbx, rdi		; rbx = write cursor
	mov	rbp, rsi		; rbp = bytes still to write
	mov	r12d, 48		; r12d = words left until the next discard
calign
.do8:
	call	rng$u64
	; countdown hit zero: rearm it with 49 and replace this word with a fresh one
	mov	ecx, 49
	sub	r12d, 1
	cmovz	r12d, ecx
	jz	.do8
	cmp	rbp, 8
	jb	.tail4
	mov	[rbx], rax
	add	rbx, 8
	sub	rbp, 8
	jnz	.do8
	jmp	.alldone
calign
.tail4:
	; rbp is 1..7 here, its bits say which pieces are left: 4, 2 and 1 bytes
	test	ebp, 4
	jz	.tail2
	mov	[rbx], eax
	shr	rax, 32
	add	rbx, 4
.tail2:
	test	ebp, 2
	jz	.tail1
	mov	[rbx], ax
	shr	rax, 16
	add	rbx, 2
.tail1:
	test	ebp, 1
	jz	.alldone
	mov	[rbx], al
.alldone:
	pop	r12 rbp rbx
	; throw away 64 bits
	call	rng$u64
	epilog
calign
.nothingtodo:
	epilog

end if

if used rng$block_nzb | defined include_everything
	; rng$block_nzb: same as rng$block, only makes sure they are non zero bytes throughout
	; two arguments: rdi == byte buffer, rsi == length of same (0 is fine, nothing happens)
	; works 4 bytes at a time from rng$u32; any word containing a zero byte is rejected as a
	; whole and redrawn; the 1..3 byte tail is taken from one more accepted word
	; exactly rsi bytes are written, never more
falign
rng$block_nzb:
	prolog	rng$block_nzb
	test	rsi, rsi
	jz	.nothingtodo
	push	rbx rbp
	mov	rbx, rdi		; rbx = write cursor
	mov	rbp, rsi		; rbp = bytes still to write, > 0 whenever .do4 is entered
calign
.do4:
	call	rng$u32
	; reject the word if any of its four bytes is zero
	test	eax, 0xff
	jz	.do4
	test	eax, 0xff00
	jz	.do4
	test	eax, 0xff0000
	jz	.do4
	test	eax, 0xff000000
	jz	.do4
	cmp	rbp, 4
	jb	.do2
	mov	[rbx], eax
	add	rbx, 4
	sub	rbp, 4
	; stop right here when the buffer is full, falling into the tail code with
	; zero bytes left would store one byte past the end of the buffer
	jz	.alldone
	jmp	.do4
calign
.do2:
	; 1..3 bytes left, all of them come out of the word accepted above
	cmp	rbp, 2
	jb	.do1
	mov	[rbx], ax
	shr	rax, 16
	add	rbx, 2
	sub	rbp, 2
	jz	.alldone
.do1:
	mov	[rbx], al
.alldone:
	pop	rbp rbx
	; throw away 64 bits
	call	rng$u64
	epilog
calign
.nothingtodo:
	epilog

end if