HeavyThing - zlib_deflate.inc

Jeff Marrison

Table of functions

	; ------------------------------------------------------------------------
	; HeavyThing x86_64 assembly language library and showcase programs
	; Copyright © 2015-2018 2 Ton Digital 
	; Homepage: https://2ton.com.au/
	; Author: Jeff Marrison <jeff@2ton.com.au>
	;       
	; This file is part of the HeavyThing library.
	;       
	; HeavyThing is free software: you can redistribute it and/or modify
	; it under the terms of the GNU General Public License, or
	; (at your option) any later version.
	;       
	; HeavyThing is distributed in the hope that it will be useful, 
	; but WITHOUT ANY WARRANTY; without even the implied warranty of
	; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
	; GNU General Public License for more details.
	;       
	; You should have received a copy of the GNU General Public License along
	; with the HeavyThing library. If not, see <http://www.gnu.org/licenses/>.
	; ------------------------------------------------------------------------
	;       
	; zlib_deflate.inc: port of zlib, uses buffer goods
	;
	; This is quite literally a hand compilation (and thus interpretation/
	; modification) of the "reference zlib."
	; As such, the original zlib.h copyright appears below, although I am not
	; sure that is really necessary. Either way, cheers to Jean-loup Gailly and
	; the legend Mark Adler are definitely in order!
	;
	; NOTE: hahah, I really must be crazy...
	; Burning Purpose behind this entire kit of goodies though is to be
	; standalone, so either I go without, or I hand-compile it ;-)
	; HAHAH
	; 
	; Note to self: I wrote the maniacal HAHAH _before_ I did any of the work
	; how about "le Grunt" instead
	;
	; So, this deflate routine is mostly a "reference" version; even so, it is faster
	; than zlib-1.2.8 and the default gzip supplied with my primary Linux distro
	;
	; NOTE: I didn't bother to do RLE and pure Huffman... none of my stuff uses them
	;
	; zlib.h copyright notice appears below:
	;/* zlib.h -- interface of the 'zlib' general purpose compression library
	;  version 1.2.8, April 28th, 2013
	;
	;  Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
	;
	;  This software is provided 'as-is', without any express or implied
	;  warranty.  In no event will the authors be held liable for any damages
	;  arising from the use of this software.
	;
	;  Permission is granted to anyone to use this software for any purpose,
	;  including commercial applications, and to alter it and redistribute it
	;  freely, subject to the following restrictions:
	;
	;  1. The origin of this software must not be misrepresented; you must not
	;     claim that you wrote the original software. If you use this software
	;     in a product, an acknowledgment in the product documentation would be
	;     appreciated but is not required.
	;  2. Altered source versions must be plainly marked as such, and must not be
	;     misrepresented as being the original software.
	;  3. This notice may not be removed or altered from any source distribution.
	;
	;  Jean-loup Gailly        Mark Adler
	;  jloup@gzip.org          madler@alumni.caltech.edu
	;
	;
	;  The data format used by the zlib library is described by RFCs (Request for
	;  Comments) 1950 to 1952 in the files http://tools.ietf.org/html/rfc1950
	;  (zlib format), rfc1951 (deflate format) and rfc1952 (gzip format).
	;*/

	; TODO: cleanup dstate/remove unused items

	; various settings apply, see the default settings for further details.

	; profile the internal zlib function calls?
profile_zlib_internals = 0



zlib_inbuf_ofs = 0
zlib_outbuf_ofs = 8
zlib_totalin_ofs = 16
zlib_totalout_ofs = 24
zlib_state_ofs = 32
zlib_datatype_ofs = 40
zlib_adler_ofs = 48

zlib_stream_size = 56
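	; note: inbuf/outbuf are library buffer objects, standing in for reference
	; zlib's next_in/avail_in/next_out/avail_out members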


zlib_tdesc_dyn_tree_ofs = 0
zlib_tdesc_max_code_ofs = 8
zlib_tdesc_stat_desc_ofs = 16

zlib_dstate_streamp_ofs = 0			; dq
zlib_dstate_status_ofs = 8			; dd
zlib_dstate_pending_buf_ofs = 16		; dq
zlib_dstate_wrap_ofs = 24			; dd (passed in at deflateInit)
zlib_dstate_gzhead_ofs = 32			; dq
zlib_dstate_gzindex_ofs = 40			; dd
zlib_dstate_last_flush_ofs = 48			; dd
zlib_dstate_w_size_ofs = 56			; dd-- notused (constant)
zlib_dstate_w_bits_ofs = 64			; dd-- notused (constant)
zlib_dstate_w_mask_ofs = 72			; dd-- notused (constant)
zlib_dstate_window_ofs = 80			; dq
zlib_dstate_window_size_ofs = 88		; dq-- notused (constant)
zlib_dstate_prev_ofs = 96			; dq
zlib_dstate_head_ofs = 104			; dq
zlib_dstate_ins_h_ofs = 112			; dd
zlib_dstate_hash_size_ofs = 120			; dd-- notused (constant)
zlib_dstate_hash_bits_ofs = 128			; dd-- notused
zlib_dstate_hash_mask_ofs = 136			; dd-- notused (constant)
zlib_dstate_hash_shift_ofs = 144		; dd-- notused (constant)
zlib_dstate_block_start_ofs = 152		; dq
zlib_dstate_match_length_ofs = 160		; dd
zlib_dstate_prev_match_ofs = 168		; dd
zlib_dstate_match_available_ofs = 176		; dd
zlib_dstate_strstart_ofs = 184			; dd
zlib_dstate_match_start_ofs = 192		; dd
zlib_dstate_lookahead_ofs = 200			; dd
zlib_dstate_prev_length_ofs = 208		; dd
zlib_dstate_max_chain_length_ofs = 216		; dd-- notused at all
zlib_dstate_max_lazy_match_ofs = 224		; dd-- notused (constant)
zlib_dstate_level_ofs = 232			; dd
zlib_dstate_strategy_ofs = 240			; dd
zlib_dstate_good_match_ofs = 248		; dd-- notused (constant)
zlib_dstate_nice_match_ofs = 256		; dd-- notused (constant)
zlib_dstate_dyn_ltree_ofs = 264			; array of struct ct_data_s
zlib_dstate_dyn_dtree_ofs = 2556		; array of struct ct_data_s
zlib_dstate_bl_tree_ofs = 2800			; array of struct ct_data_s
zlib_dstate_l_desc_ofs = 2956			; struct tree_desc_s
zlib_dstate_d_desc_ofs = 2980			; struct tree_desc_s
zlib_dstate_bl_desc_ofs = 3004			; struct tree_desc_s
zlib_dstate_bl_count_ofs = 3028			; array of dw
zlib_dstate_heap_ofs = 3060			; array of dd
zlib_dstate_heap_len_ofs = 5352			; dd
zlib_dstate_heap_max_ofs = 5360			; dd
zlib_dstate_depth_ofs = 5368			; array of db
zlib_dstate_l_buf_ofs = 5944			; dq
zlib_dstate_lit_bufsize_ofs = 5952		; dd-- notused (constant)
zlib_dstate_last_lit_ofs = 5960			; dd
zlib_dstate_d_buf_ofs = 5968			; dq
zlib_dstate_opt_len_ofs = 5976			; dq
zlib_dstate_static_len_ofs = 5984		; dq
zlib_dstate_matches_ofs = 5992			; dd
zlib_dstate_insert_ofs = 6000			; dd
zlib_dstate_bi_buf_ofs = 6008			; dw in reference zlib	... NOTE: changed to dq here
zlib_dstate_bi_valid_ofs = 6016			; dd
zlib_dstate_high_water_ofs = 6024		; dq
zlib_dstate_pending_out_ofs = 6032		; dq
zlib_dstate_pending_ofs = 6040			; dd

zlib_dstate_size = 6048
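	; note: every scalar member above occupies 8 bytes regardless of its declared
	; dd/dw width (keeping all offsets 8 byte aligned); only the embedded arrays
	; pack tighter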


	; standard config goods here:
zlib_window_bits = 15
zlib_memlevel = 8


	; and calcs based on them:
zlib_wsize = 1 shl zlib_window_bits
zlib_wmask = zlib_wsize - 1

zlib_hashbits = zlib_memlevel + 7
zlib_hashsize = 1 shl zlib_hashbits
zlib_hashmask = zlib_hashsize - 1

zlib_min_match = 3

zlib_hashshift = (zlib_hashbits + zlib_min_match - 1) / zlib_min_match

zlib_litbufsize = 1 shl (zlib_memlevel + 6)

zlib_wsize_bytes = zlib_wsize shl 1
zlib_prev_bytes = zlib_wsize shl 1
zlib_head_bytes = zlib_hashsize shl 1
zlib_overlay_bytes = zlib_litbufsize shl 2
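
	; worked out for reference with the defaults above (window_bits = 15, memlevel = 8):
	; zlib_wsize = 32768, zlib_wmask = 0x7fff
	; zlib_hashbits = 15, zlib_hashsize = 32768, zlib_hashmask = 0x7fff, zlib_hashshift = 5
	; zlib_litbufsize = 16384, zlib_overlay_bytes = 65536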

	; bi_buf/bi_valid sizing, in bits:
zlib_buf_size = 64
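	; (reference zlib's Buf_size is 16; this port widens bi_buf to a full qword,
	; per the bi_buf note in the dstate offsets above)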

	; configuration based on zlib_deflate_level:
if zlib_deflate_level = 0
	zlib_good_length = 0
	zlib_max_lazy = 0
	zlib_nice_length = 0
	zlib_max_chain = 0
else if zlib_deflate_level = 1
	zlib_good_length = 4
	zlib_max_lazy = 4
	zlib_nice_length = 8
	zlib_max_chain = 4
else if zlib_deflate_level = 2
	zlib_good_length = 4
	zlib_max_lazy = 5
	zlib_nice_length = 16
	zlib_max_chain = 8
else if zlib_deflate_level = 3
	zlib_good_length = 4
	zlib_max_lazy = 6
	zlib_nice_length = 32
	zlib_max_chain = 32
else if zlib_deflate_level = 4
	zlib_good_length = 4
	zlib_max_lazy = 4
	zlib_nice_length = 16
	zlib_max_chain = 16
else if zlib_deflate_level = 5
	zlib_good_length = 8
	zlib_max_lazy = 16
	zlib_nice_length = 32
	zlib_max_chain = 32
else if zlib_deflate_level = 6
	zlib_good_length = 8
	zlib_max_lazy = 16
	zlib_nice_length = 128
	zlib_max_chain = 128
else if zlib_deflate_level = 7
	zlib_good_length = 8
	zlib_max_lazy = 32
	zlib_nice_length = 128
	zlib_max_chain = 256
else if zlib_deflate_level = 8
	zlib_good_length = 32
	zlib_max_lazy = 128
	zlib_nice_length = 258
	zlib_max_chain = 1024
else if zlib_deflate_level = 9
	zlib_good_length = 32
	zlib_max_lazy = 258
	zlib_nice_length = 258
	zlib_max_chain = 4096
else
	display 'invalid zlib_deflate_level',13,10
	err
end if


	; deflate states
zlib_dstate_init = 42
zlib_dstate_extra = 69
zlib_dstate_name = 73
zlib_dstate_comment = 91
zlib_dstate_hcrc = 103
zlib_dstate_busy = 113
zlib_dstate_finish = 666
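	; (same sentinel values as reference zlib's INIT_STATE, EXTRA_STATE, NAME_STATE,
	; COMMENT_STATE, HCRC_STATE, BUSY_STATE and FINISH_STATE)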



macro zlib_debug preface*, reg* {
	local ..continue, ..string
	push	rax rcx rdx rdi rsi r8 r9 r10 r11
	sub	rsp, 8
	mov	rdi, reg
	mov	esi, 10
	call	string$from_unsigned
	mov	[rsp], rax
	mov	rdi, ..string
	call	string$to_stderr
	mov	rdi, [rsp]
	call	string$to_stderrln
	mov	rdi, [rsp]
	call	heap$free
	add	rsp, 8
	pop	r11 r10 r9 r8 rsi rdi rdx rcx rax
	jmp	..continue
cleartext ..string, preface
calign
..continue:
}

if used zlib$deflateEnd | defined include_everything

	; single argument in rdi: a zlib_stream pointer
	; all we do is free our state, otherwise, we leave things well enough alone
falign
zlib$deflateEnd:
	prolog	zlib$deflateEnd
	mov	rdi, [rdi+zlib_state_ofs]
	call	heap$free
	epilog

end if

if used zlib$deflateInit | defined include_everything
	; two arguments: rdi == a zlib_stream_size memory chunk for our state, esi == "wrap", see below
	; we do not mess with inbuf or outbuf
	; wrap == 0 == no headers whatsoever
	; wrap == 1 == zlib (suitable for all my streaming goods, SSH, etc)
	; wrap == 2 == gzip headers
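	;
	; a minimal usage sketch (assumes buffer$new from the library's buffer goods;
	; it is not defined in this file):
	;	mov	edi, zlib_stream_size
	;	call	heap$alloc
	;	mov	rbx, rax			; our z_stream
	;	call	buffer$new
	;	mov	[rbx+zlib_inbuf_ofs], rax
	;	call	buffer$new
	;	mov	[rbx+zlib_outbuf_ofs], rax
	;	mov	rdi, rbx
	;	mov	esi, 1				; zlib wrapping
	;	call	zlib$deflateInit
	;	; ... buffer$append plaintext into the inbuf, then:
	;	mov	rdi, rbx
	;	mov	esi, zlib_finish		; or zlib_sync_flush mid-stream
	;	call	zlib$deflate			; bool result in eax
	;	; ... compressed output has been appended to the outbuf; when done:
	;	mov	rdi, rbx
	;	call	zlib$deflateEnd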
falign
zlib$deflateInit:
	prolog	zlib$deflateInit
	xor	ecx, ecx
	sub	rsp, 24
	mov	[rsp], rdi
	mov	[rsp+16], esi
	mov	[rdi+zlib_totalin_ofs], rcx
	mov	[rdi+zlib_totalout_ofs], rcx
	mov	qword [rdi+zlib_datatype_ofs], 2

	mov	qword [rdi+zlib_adler_ofs], 1

	mov	edi, zlib_dstate_size + zlib_wsize_bytes + zlib_prev_bytes + zlib_head_bytes + zlib_overlay_bytes + 64
	call	heap$alloc
	mov	[rsp+8], rax
	mov	rdi, rax
	xor	esi, esi
	mov	edx, zlib_dstate_size		; note: we are _not_ clearing the buffers, no sense in that.. TODO: do we even need to do it for the rest of them?
	call	memset32
	mov	rdi, [rsp+8]			; our zlib_dstate block, zeroed
	add	rdi, zlib_dstate_size + zlib_wsize_bytes + zlib_prev_bytes	; the head location
	xor	esi, esi
	mov	edx, zlib_head_bytes
	call	memset32			; we may consider just combining and zeroing the entire lot in one call, hmmm
	mov	rsi, [rsp+8]			; our zlib_dstate block, zeroed
	mov	rdi, [rsp]			; our original z_stream block
	mov	rdx, rsi
	mov	rcx, rsi
	mov	r8, rsi
	mov	r9, rsi

	add	rdx, zlib_dstate_size
	add	rcx, zlib_dstate_size
	add	r8, zlib_dstate_size
	add	r9, zlib_dstate_size
	add	rcx, zlib_wsize_bytes + 15
	add	r8, zlib_wsize_bytes + 15
	add	r9, zlib_wsize_bytes + 15
	and	rcx, not 15
	and	r8, not 15
	and	r9, not 15

	add	r8, zlib_prev_bytes + 15
	add	r9, zlib_prev_bytes + 15
	and	r8, not 15
	and	r9, not 15
	add	r9, zlib_head_bytes + 15
	and	r9, not 15

	mov	r10d, [rsp+16]
	mov	[rdi+zlib_state_ofs], rsi
	mov	[rsi+zlib_dstate_streamp_ofs], rdi
	mov	[rsi+zlib_dstate_wrap_ofs], r10d
	; mov	dword [rsi+zlib_dstate_w_bits_ofs], zlib_window_bits
	; mov	dword [rsi+zlib_dstate_w_size_ofs], zlib_wsize
	; mov	qword [rsi+zlib_dstate_window_size_ofs], zlib_wsize shl 1
	; mov	dword [rsi+zlib_dstate_w_mask_ofs], zlib_wmask
	; mov	dword [rsi+zlib_dstate_hash_bits_ofs], zlib_hashbits
	; mov	dword [rsi+zlib_dstate_hash_size_ofs], zlib_hashsize
	; mov	dword [rsi+zlib_dstate_hash_mask_ofs], zlib_hashmask
	; mov	dword [rsi+zlib_dstate_hash_shift_ofs], zlib_hashshift
	; mov	dword [rsi+zlib_dstate_lit_bufsize_ofs], zlib_litbufsize
	mov	[rsi+zlib_dstate_window_ofs], rdx
	mov	[rsi+zlib_dstate_prev_ofs], rcx
	mov	[rsi+zlib_dstate_head_ofs], r8			; fill_window requires these to be adjacent
	mov	[rsi+zlib_dstate_pending_buf_ofs], r9
	mov	[rsi+zlib_dstate_pending_out_ofs], r9
	xor	r8d, r8d
	; mov	qword [rsi+zlib_dstate_pending_buf_size_ofs], zlib_overlay_bytes
	mov	rdx, r9
	mov	rcx, r9
	add	rdx, zlib_overlay_bytes shr 2			; d_buf now at byte offset 16384 instead
	add	rcx, 3 * zlib_litbufsize
	mov	[rsi+zlib_dstate_d_buf_ofs], rdx
	mov	[rsi+zlib_dstate_l_buf_ofs], rcx
	mov	dword [rsi+zlib_dstate_level_ofs], zlib_deflate_level
	mov	[rsi+zlib_dstate_pending_ofs], r8
	mov	dword [rsi+zlib_dstate_status_ofs], zlib_dstate_init

	; _tr_init(rsi) is next
	xor	r9d, r9d
	mov	rdx, rsi
	mov	rcx, rsi
	mov	r8, rsi
	add	rdx, zlib_dstate_dyn_ltree_ofs
	add	rcx, zlib_dstate_dyn_dtree_ofs
	add	r8, zlib_dstate_bl_tree_ofs
	mov	[rsi+zlib_dstate_l_desc_ofs + zlib_tdesc_dyn_tree_ofs], rdx
	mov	qword [rsi+zlib_dstate_l_desc_ofs + zlib_tdesc_stat_desc_ofs], zlib_static_l_desc
	mov	[rsi+zlib_dstate_d_desc_ofs + zlib_tdesc_dyn_tree_ofs], rcx
	mov	qword [rsi+zlib_dstate_d_desc_ofs + zlib_tdesc_stat_desc_ofs], zlib_static_d_desc
	mov	[rsi+zlib_dstate_bl_desc_ofs + zlib_tdesc_dyn_tree_ofs], r8
	mov	qword [rsi+zlib_dstate_bl_desc_ofs + zlib_tdesc_stat_desc_ofs], zlib_static_bl_desc
	mov	[rsi+zlib_dstate_bi_buf_ofs], r9
	mov	[rsi+zlib_dstate_bi_valid_ofs], r9

	; our entire dstate is already memset to 0, so we don't need to do our Freq = 0 or any of the rest of the clearing inside init_block
	mov	word [rsi+zlib_dstate_dyn_ltree_ofs + 256*4], 1		; dyn_ltree[END_BLOCK].Freq = 1

	; lm_init sprinkled above, and here
	mov	dword [rsi+zlib_dstate_match_length_ofs], 2		; min_match - 1
	mov	dword [rsi+zlib_dstate_prev_length_ofs], 2		; ""
	; mov	dword [rsi+zlib_dstate_max_lazy_match_ofs], zlib_max_lazy
	; mov	dword [rsi+zlib_dstate_good_match_ofs], zlib_good_length
	; mov	dword [rsi+zlib_dstate_nice_match_ofs], zlib_nice_length
	; mov	dword [rsi+zlib_dstate_max_chain_length_ofs], zlib_max_chain

	; CLEAR_HASH(s) was taken care of by the memset above
	mov	rax, rdi
	add	rsp, 24
	epilog
dalign
zlib_static_l_desc:
	dq	zlib_static_ltree, extra_lbits, 257, 286, 15
dalign
zlib_static_d_desc:
	dq	zlib_static_dtree, zlib_extra_dbits, 0, 30, 15
dalign
zlib_static_bl_desc:
	dq	0, zlib_extra_blbits, 0, 19, 7
dalign
extra_lbits:
	dd	0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0
dalign
zlib_extra_dbits:
	dd	0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13
dalign
zlib_extra_blbits:
	dd	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,7
dalign
zlib_static_ltree:
	dw	 12,  8, 140,  8,  76,  8, 204,  8,  44,  8
	dw	172,  8, 108,  8, 236,  8,  28,  8, 156,  8
	dw	92,  8, 220,  8,  60,  8, 188,  8, 124,  8
	dw	252,  8,   2,  8, 130,  8,  66,  8, 194,  8
	dw	34,  8, 162,  8,  98,  8, 226,  8,  18,  8
	dw	146,  8,  82,  8, 210,  8,  50,  8, 178,  8
	dw	114,  8, 242,  8,  10,  8, 138,  8,  74,  8
	dw	202,  8,  42,  8, 170,  8, 106,  8, 234,  8
	dw	26,  8, 154,  8,  90,  8, 218,  8,  58,  8
	dw	186,  8, 122,  8, 250,  8,   6,  8, 134,  8
	dw	70,  8, 198,  8,  38,  8, 166,  8, 102,  8
	dw	230,  8,  22,  8, 150,  8,  86,  8, 214,  8
	dw	54,  8, 182,  8, 118,  8, 246,  8,  14,  8
	dw	142,  8,  78,  8, 206,  8,  46,  8, 174,  8
	dw	110,  8, 238,  8,  30,  8, 158,  8,  94,  8
	dw	222,  8,  62,  8, 190,  8, 126,  8, 254,  8
	dw	1,  8, 129,  8,  65,  8, 193,  8,  33,  8
	dw	161,  8,  97,  8, 225,  8,  17,  8, 145,  8
	dw	81,  8, 209,  8,  49,  8, 177,  8, 113,  8
	dw	241,  8,   9,  8, 137,  8,  73,  8, 201,  8
	dw	41,  8, 169,  8, 105,  8, 233,  8,  25,  8
	dw	153,  8,  89,  8, 217,  8,  57,  8, 185,  8
	dw	121,  8, 249,  8,   5,  8, 133,  8,  69,  8
	dw	197,  8,  37,  8, 165,  8, 101,  8, 229,  8
	dw	21,  8, 149,  8,  85,  8, 213,  8,  53,  8
	dw	181,  8, 117,  8, 245,  8,  13,  8, 141,  8
	dw	77,  8, 205,  8,  45,  8, 173,  8, 109,  8
	dw	237,  8,  29,  8, 157,  8,  93,  8, 221,  8
	dw	61,  8, 189,  8, 125,  8, 253,  8,  19,  9
	dw	275,  9, 147,  9, 403,  9,  83,  9, 339,  9
	dw	211,  9, 467,  9,  51,  9, 307,  9, 179,  9
	dw	435,  9, 115,  9, 371,  9, 243,  9, 499,  9
	dw	11,  9, 267,  9, 139,  9, 395,  9,  75,  9
	dw	331,  9, 203,  9, 459,  9,  43,  9, 299,  9
	dw	171,  9, 427,  9, 107,  9, 363,  9, 235,  9
	dw	491,  9,  27,  9, 283,  9, 155,  9, 411,  9
	dw	91,  9, 347,  9, 219,  9, 475,  9,  59,  9
	dw	315,  9, 187,  9, 443,  9, 123,  9, 379,  9
	dw	251,  9, 507,  9,   7,  9, 263,  9, 135,  9
	dw	391,  9,  71,  9, 327,  9, 199,  9, 455,  9
	dw	39,  9, 295,  9, 167,  9, 423,  9, 103,  9
	dw	359,  9, 231,  9, 487,  9,  23,  9, 279,  9
	dw	151,  9, 407,  9,  87,  9, 343,  9, 215,  9
	dw	471,  9,  55,  9, 311,  9, 183,  9, 439,  9
	dw	119,  9, 375,  9, 247,  9, 503,  9,  15,  9
	dw	271,  9, 143,  9, 399,  9,  79,  9, 335,  9
	dw	207,  9, 463,  9,  47,  9, 303,  9, 175,  9
	dw	431,  9, 111,  9, 367,  9, 239,  9, 495,  9
	dw	31,  9, 287,  9, 159,  9, 415,  9,  95,  9
	dw	351,  9, 223,  9, 479,  9,  63,  9, 319,  9
	dw	191,  9, 447,  9, 127,  9, 383,  9, 255,  9
	dw	511,  9,   0,  7,  64,  7,  32,  7,  96,  7
	dw	16,  7,  80,  7,  48,  7, 112,  7,   8,  7
	dw	72,  7,  40,  7, 104,  7,  24,  7,  88,  7
	dw	56,  7, 120,  7,   4,  7,  68,  7,  36,  7
	dw	100,  7,  20,  7,  84,  7,  52,  7, 116,  7
	dw	3,  8, 131,  8,  67,  8, 195,  8,  35,  8
	dw	163,  8,  99,  8, 227,  8
dalign
zlib_static_dtree:
	dw	 0, 5, 16, 5,  8, 5, 24, 5,  4, 5
	dw	20, 5, 12, 5, 28, 5,  2, 5, 18, 5
	dw	10, 5, 26, 5,  6, 5, 22, 5, 14, 5
	dw	30, 5,  1, 5, 17, 5,  9, 5, 25, 5
 	dw	5, 5, 21, 5, 13, 5, 29, 5,  3, 5
	dw	19, 5, 11, 5, 27, 5,  7, 5, 23, 5
dalign
zlib_dist_code:
 	db	0,  1,  2,  3,  4,  4,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  8,  8,  8,  8
 	db	8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10
	db	10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11
	db	11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
	db	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13
	db	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13
	db	13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14
	db	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14
	db	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14
	db	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15
	db	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15
	db	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15
	db	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  0,  0, 16, 17
	db	18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22
	db	23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24
	db	24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
	db	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26
	db	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27
	db	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27
	db	27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28
	db	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28
	db	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28
	db	28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29
	db	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29
	db	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29
	db	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29
dalign
zlib_length_code:
 	db	0,  1,  2,  3,  4,  5,  6,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 12, 12
	db	13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16
	db	17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19
	db	19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
	db	21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22
	db	22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23
	db	23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24
	db	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24
	db	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
	db	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26
	db	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26
	db	26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27
	db	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28
dalign
zlib_base_length:
	dd	0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56
	dd	64, 80, 96, 112, 128, 160, 192, 224, 0
dalign
zlib_base_dist:
    	dd	0,     1,     2,     3,     4,     6,     8,    12,    16,    24
   	dd	32,    48,    64,    96,   128,   192,   256,   384,   512,   768
 	dd	1024,  1536,  2048,  3072,  4096,  6144,  8192, 12288, 16384, 24576


end if

	; flush flags can be one of:
zlib_no_flush = 0
zlib_partial_flush = 1
zlib_sync_flush = 2
zlib_full_flush = 3
zlib_finish = 4
zlib_block = 5
	; zlib_trees is used for inflate side not this one:
zlib_trees = 6
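	; (these values line up with reference zlib's Z_NO_FLUSH through Z_TREES)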


if used zlib$deflate | defined include_everything

	; two arguments: rdi == z_stream pointer, esi == flush flags

	; we return a bool in eax (unlike the actual zlib), 1 == Z_OK equiv, 0 == fail
	; and in our implementation, we really don't care WHY it failed, only that it did.
	
		; NOTE: we use the input buffer's 16 user bytes for our own state information
		; TODO: maybe someday when I am bored I can just add these to the dstate information
		; and eliminate having to carry around r14/r15, hmm
falign
zlib$deflate:
	prolog	zlib$deflate
	push	rbx r12 r13 r14 r15
	mov	rbx, rdi
	mov	r12, [rdi+zlib_state_ofs]
	mov	r13d, esi
	mov	r14, [rdi+zlib_inbuf_ofs]
	mov	r15, [rdi+zlib_outbuf_ofs]
	test	r12, r12
	jz	.error_return
	cmp	r13d, 0
	jl	.error_return
	cmp	r13d, zlib_block
	jg	.error_return
	test	r14, r14
	jz	.error_return
	test	r15, r15
	jz	.error_return

	mov	rdi, r15
	mov	esi, zlib_deflate_reserve
	call	buffer$reserve

	; set up our user-space vars inside the inbuf so that we don't have to consume
	; from the head of it (which would be bad for large buffers of course)
	mov	rax, [r14+buffer_length_ofs]
	mov	rcx, [r14+buffer_itself_ofs]
	mov	[r14+buffer_user_ofs], rcx		; user_ofs == current pointer
	mov	[r14+buffer_user_ofs+8], rax		; user_ofs+8 == remaining bytes

	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]
	mov	ecx, [r12+zlib_dstate_wrap_ofs]
	; so now, z_stream is in rbx, dstate is in r12, r13d has our flush flags, r14 has our inbuf, r15 has our outbuf
	mov	eax, [r12+zlib_dstate_status_ofs]
	; this is one big fallthrough mess
	cmp	eax, zlib_dstate_init
	jne	.state_not_init
	jmp	qword [rcx*8+.header_wrap_table]
dalign
.header_wrap_table:
	dq	.noheader, .zheader, .gzheader
calign
.gzheader:
	cmp	qword [r12+zlib_dstate_gzhead_ofs], 0
	jne	.state_init_wrap2_withgzhead
	mov	qword [rbx+zlib_adler_ofs], 0	; crc32(0, null, 0) == 0
	mov	r10, [.state_init_wrap2_gzhead]
	; possible values for the 9th byte:
	xor	ecx, ecx			; we'll use this one
	mov	edx, [r12+zlib_dstate_level_ofs]
	mov	r8d, 2
	mov	r9d, 4
	cmp	edx, 9
	cmove	ecx, r8d
	cmp	edx, 2
	cmovl	ecx, r9d
	cmp	dword [r12+zlib_dstate_strategy_ofs], 2
	cmovae	ecx, r9d			; TODO: if we already set ecx to nonzero, we shouldn't do this, eh?
	mov	edx, 3				; OS_CODE == unix
	mov	qword [rdi+rsi], r10
	mov	byte [rdi+rsi+8], cl
	mov	byte [rdi+rsi+9], dl
	add	rsi, 10
	mov	[r12+zlib_dstate_pending_ofs], rsi
	mov	eax, zlib_dstate_busy
	mov	[r12+zlib_dstate_status_ofs], eax
	jmp	.state_not_init
dalign
.state_init_wrap2_gzhead:
	db	31, 139, 8, 0, 0, 0, 0, 0	; 8 bytes
calign
.state_init_wrap2_withgzhead:
	; we don't really use any of this functionality... TODO, someday when I am bored, fill this out.
	; since this would require the use of a user supplied buffer to dump the gzhead stuff into,
	; and since we never set gzhead ourselves, this won't break during normal runtime operations
	breakpoint
calign
.state_init_wrapnot2:
.zheader:
.noheader:
	; wrap was not two
	; for us, strategy should always be zero on entry, TODO: redo these cmovs?
	mov	eax, zlib_window_bits
	mov	r10d, [r12+zlib_dstate_level_ofs]
	sub	eax, 8
	mov	ecx, 3
	shl	eax, 4
	mov	edx, 1
	mov	r8d, 2
	add	eax, 8
	xor	r9d, r9d
	shl	eax, 8

	cmp	dword [r12+zlib_dstate_strategy_ofs], 2	; Z_HUFFMAN_ONLY
	cmovae	ecx, r9d
	cmp	r10d, 6
	cmove	ecx, r8d
	cmovb	ecx, edx
	cmp	r10d, 2
	cmovb	ecx, r9d			; see above comment re: strategy always being zero for us/TODO
	; so ecx now has our level_flags, eax has our header
	shl	ecx, 6
	or	eax, ecx
	mov	edx, eax
	or	edx, 0x20			; PRESET_DICT
	cmp	dword [r12+zlib_dstate_strstart_ofs], 0
	cmovne	eax, edx
	; next up: header += 31 - (header % 31), wtf?
	; this is some funky goods here
	mov	ecx, eax
	mov	edx, 0x8421085
	mul	edx
	xor	eax, eax
	sub	ecx, edx
	shr	ecx, 1
	add	ecx, edx
	shr	ecx, 4
	mov	eax, ecx
	sal	eax, 5
	sub	eax, ecx
	add	eax, 31
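	; (the funkiness above is just unsigned division by 31 via a magic-number
	; multiply: with t = (header * 0x8421085) >> 32, q = (((header - t) >> 1) + t) >> 4
	; equals header / 31, and q*31 + 31 == header + 31 - (header % 31), i.e. the
	; FCHECK adjustment that makes the 16 bit zlib header a multiple of 31 per RFC 1950)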
	; so eax is our header ushort
	xchg	ah, al
	mov	word [rdi+rsi], ax
	add	rsi, 2
	mov	[r12+zlib_dstate_pending_ofs], rsi
	mov	eax, zlib_dstate_busy
	mov	dword [r12+zlib_dstate_status_ofs], eax
	mov	rcx, [rbx+zlib_adler_ofs]
	mov	qword [rbx+zlib_adler_ofs], 1	; adler32(0, null, 0) == 1
	cmp	dword [r12+zlib_dstate_strstart_ofs], 0
	je	.state_not_init
	; else, we have two more shorts to put in there, derived from rcx
	mov	edx, ecx		; save it
	shr	ecx, 16
	xchg	ch, cl
	xchg	dh, dl
	mov	word [rdi+rsi], cx
	mov	word [rdi+rsi+2], dx
	add	rsi, 4
	mov	[r12+zlib_dstate_pending_ofs], rsi
	; fallthrough to state_not_init okay
calign
.state_not_init:
	; NOTE: we are skipping EXTRA_STATE, NAME_STATE, COMMENT_STATE, and HCRC_STATE
	; because none of my streaming goods need them... TODO: revisit when I am bored? haha

	; so here, he flushes the pending output... and I am not sure I see the reason behind emptying it, and then
	; he checks avail_out for zero, and returns OK, waiting for the caller to call here again...

	; in further consideration, especially considering the way I use these routines
	; pretty sure we can just ensure that avail_out will _never_ be zero (and thus just reserve the amount of space
	; we need)... I appreciate the motivation and reason behind his choices on that... and his way works a treat

	; so at this point, we need to determine whether or not we go ahead with the deflate_* or not, based on
	; whether we have actual data in our inbuf (which we _should_)
	mov	eax, [r12+zlib_dstate_strategy_ofs]

	; r14 is our inbuf
	cmp	qword [r14+buffer_length_ofs], 0			; this is still valid, since we haven't touched our user vars yet
	jne	.doblock
	cmp	dword [r12+zlib_dstate_lookahead_ofs], 0		; will this ever be true the way we are using it?
	jne	.doblock
	; r13d is our flush flags
	test	r13d, r13d		; zlib_no_flush == 0
	jz	.block_done_or_no_block
	cmp	dword [r12+zlib_dstate_status_ofs], zlib_dstate_finish
	je	.block_done_or_no_block
calign
.doblock:
	; a note here: our strategy is fixed at zero, so during normal operations, these won't occur
	; (and only would if you are playing around with it)
	cmp	eax, 2			; Z_HUFFMAN_ONLY
	je	.deflate_huff
	cmp	eax, 3			; Z_RLE
	je	.deflate_rle
	; otherwise, depends on our configuration level
	if zlib_deflate_level = 0
		jmp	.deflate_stored
	else if zlib_deflate_level = 1 | zlib_deflate_level = 2 | zlib_deflate_level = 3
		jmp	.deflate_fast
	else
		jmp	.deflate_slow
	end if
calign
.bstate_done:
	; effectively the "return" of our previous jump to deflate_{huff,rle,stored,fast,slow}
	; and our return is in eax
zlib_bstate_need_more = 0
zlib_bstate_block_done = 1
zlib_bstate_finish_started = 2
zlib_bstate_finish_done = 3
	; restore/make sure rdi/rsi are valid and pointing to the pending buffer (so that all of the various places that jump back to here don't have to worry about it)
	; one way or another, the above call to deflate_* exhausted the input buffer, so we reset it here (though I suppose we don't really have to even touch it)
	; reset doesn't do much, and for my use-case scenarios, this works well
	mov	rdi, r14
	call	buffer$reset

	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]

	mov	edx, [r12+zlib_dstate_status_ofs]
	mov	ecx, zlib_dstate_finish
	cmp	eax, zlib_bstate_finish_done
	cmove	edx, ecx
	cmp	eax, zlib_bstate_finish_started
	cmove	edx, ecx
	mov	[r12+zlib_dstate_status_ofs], edx
	je	.success_return
	cmp	eax, zlib_bstate_need_more
	je	.success_return
	cmp	eax, zlib_bstate_block_done
	jne	.block_done_or_no_block
	; so bstate is block_done here.... check for Z_PARTIAL_FLUSH, and flush != Z_BLOCK, then move to the trailer check
	cmp	r13d, zlib_partial_flush
	je	.bstate_done_partial_flush
	cmp	r13d, zlib_block
	je	.block_done_or_no_block		; Z_BLOCK just goes straight to the trailer
	; else, FULL_FLUSH or SYNC_FLUSH
	cmp	r13d, zlib_full_flush
	je	.bstate_done_full_flush
	; else, SYNC_FLUSH, so just store an empty block, and go to the trailer
	; _tr_stored_block(s, (char *)0, 0L, 0);	args are: s, buf, stored_len, last
	; which does: send_bits(s, (STORED_BLOCK<<1)+last, 3);
	;   and then: copy_block(s, buf, (unsigned)stored_len, 1);
	;
	; STORED_BLOCK=0
	; send_bits(s, (STORED_BLOCK << 1) + last, 3);

macro send_bits_lit value*,length* {
	; length is meant to be a literal value, not a register/computed
	; rdi/rsi must be pointing to the correct pending buffer
	; value must be a reg
	; r12 must be our deflate state block
	; NOTE: no MSB conversion is done for putting these into the buffer
	; we blast ecx, edx, and r8d
	local	.overflow,.enoughroom,.exit,.binfname

	mov	ecx, dword [r12+zlib_dstate_bi_valid_ofs]
	cmp	ecx, zlib_buf_size - length
	jg	.overflow

	shl	value, cl
	or	qword [r12+zlib_dstate_bi_buf_ofs], value
	add	ecx, length
	mov	[r12+zlib_dstate_bi_valid_ofs], ecx
	; note: x86 masks 64 bit shift counts to 6 bits, so shr/shl reg64, 64 is a
	; no-op (and |= val << 64 goes astray); hence the explicit flush at == 64
	cmp	ecx, zlib_buf_size 
	jl	.exit
	; otherwise, clear it
	mov	rdx, [r12+zlib_dstate_bi_buf_ofs]
	mov	[rdi+rsi], rdx
	add	rsi, 8
	mov	dword [r12+zlib_dstate_bi_valid_ofs], 0
	mov	qword [r12+zlib_dstate_bi_buf_ofs], 0
	mov	qword [r12+zlib_dstate_pending_ofs], rsi
	jmp	.exit
calign
.overflow:
	; bi_buf |= value << bi_valid
	; save bi_buf for 8 bytes into output
	; bi_buf = value >> (zlib_buf_size - bi_valid)
	; bi_valid += length - zlib_buf_size (which will always be negative)
	mov	ecx, dword [r12+zlib_dstate_bi_valid_ofs]
	mov	rdx, value
	shl	value, cl		; value << bi_valid
	mov	r8d, zlib_buf_size
	sub	r8d, ecx		; buf_size - bi_valid
	mov	ecx, r8d
	shr	rdx, cl			; value >> (zlib_buf_size - bi_valid)
	mov	rcx, [r12+zlib_dstate_bi_buf_ofs]
	or	rcx, value				; value to send = old value | (value << bi_valid)
	mov	[r12+zlib_dstate_bi_buf_ofs], rdx	; bi_buf = value >> (zlib_buf_size - bi_valid)

	; put qword
	mov	qword [rdi+rsi], rcx
	add	rsi, 8
	mov	[r12+zlib_dstate_pending_ofs], rsi
	; set new bi_valid
	mov	ecx, [r12+zlib_dstate_bi_valid_ofs]
	mov	edx, length
	sub	edx, zlib_buf_size
	add	ecx, edx
	mov	[r12+zlib_dstate_bi_valid_ofs], ecx
calign
.exit:
}
macro send_bits value*,length* {
	; length must be a register, and not ecx, edx, r8d, or r9d
	; rdi/rsi must be pointing to the correct pending buffer
	; value must be a reg also
	; r12 must be our deflate state block
	; NOTE: no MSB conversion is done for putting these into the buffer
	; we blast ecx, edx, r8d, and r9d
	local	.overflow,.enoughroom,.exit,.binfname

	mov	ecx, [r12+zlib_dstate_bi_valid_ofs]

	mov	r9d, zlib_buf_size
	sub	r9d, length
	cmp	ecx, r9d
	jg	.overflow

	shl	value, cl
	or	qword [r12+zlib_dstate_bi_buf_ofs], value
	add	ecx, length
	mov	[r12+zlib_dstate_bi_valid_ofs], ecx
	; note: x86 masks 64 bit shift counts to 6 bits, so shr/shl reg64, 64 is a
	; no-op (and |= val << 64 goes astray); hence the explicit flush at == 64
	cmp	ecx, zlib_buf_size
	jl	.exit
	; otherwise, clear it
	mov	rdx, [r12+zlib_dstate_bi_buf_ofs]
	mov	[rdi+rsi], rdx
	add	rsi, 8
	mov	qword [r12+zlib_dstate_bi_buf_ofs], 0
	mov	dword [r12+zlib_dstate_bi_valid_ofs], 0
	mov	qword [r12+zlib_dstate_pending_ofs], rsi
	jmp	.exit
calign
.overflow:
	; bi_buf |= value << bi_valid
	; save bi_buf for 8 bytes into output
	; bi_buf = value >> (zlib_buf_size - bi_valid)
	; bi_valid += length - zlib_buf_size (which will always be negative)
	mov	rdx, value
	mov	r9d, zlib_buf_size
	sub	r9d, ecx		; buf_size - bi_valid
	shl	value, cl		; value << bi_valid
	mov	ecx, r9d
	shr	rdx, cl			; value >> (zlib_buf_size - bi_valid)
	mov	rcx, [r12+zlib_dstate_bi_buf_ofs]
	or	rcx, value
	mov	qword [r12+zlib_dstate_bi_buf_ofs], rdx

	; put qword
	mov	qword [rdi+rsi], rcx
	add	rsi, 8
	mov	[r12+zlib_dstate_pending_ofs], rsi
	; set new bi_valid
	mov	ecx, [r12+zlib_dstate_bi_valid_ofs]
	mov	edx, length
	sub	edx, zlib_buf_size
	add	ecx, edx
	mov	[r12+zlib_dstate_bi_valid_ofs], ecx
calign
.exit:
}
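
	; roughly, in C, what the two send_bits macros above implement (a sketch only,
	; with reference zlib's 16 bit Buf_size widened to 64, and put_qword standing
	; in for the 8 byte store into the pending buffer):
	;	if (bi_valid > 64 - length) {
	;		out = bi_buf | (value << bi_valid);
	;		put_qword(out);
	;		bi_buf = value >> (64 - bi_valid);
	;		bi_valid += length - 64;
	;	} else {
	;		bi_buf |= value << bi_valid;
	;		bi_valid += length;
	;		if (bi_valid == 64) { put_qword(bi_buf); bi_buf = 0; bi_valid = 0; }
	;	}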
	
macro bi_windup {
	; this flushes whatever is in bi_buf, "aligned output on a byte boundary", heh
	; rdi/rsi must be pointing to the correct pending buffer
	; r12 must be our deflate state block
	; we blast ecx, edx, r8d, r9d (to avoid branching)
	mov	ecx, [r12+zlib_dstate_bi_valid_ofs]
	mov	rdx, [r12+zlib_dstate_bi_buf_ofs]
	add	ecx, 7
	and	ecx, not 7
	shr	ecx, 3
	mov	qword [rdi+rsi], rdx
	add	rsi, rcx
	mov	[r12+zlib_dstate_pending_ofs], rsi
	xor	edx, edx
	mov	[r12+zlib_dstate_bi_buf_ofs], rdx
	mov	[r12+zlib_dstate_bi_valid_ofs], edx
}
macro bi_flush {
	; if there are >= 8 bits in bi_buf, flush the whole bytes and keep the remaining 0..7
	local ..exit

	; rdi/rsi must be pointing to the correct pending buffer
	; r12 must be our deflate state block
	; we blast eax, ecx, edx, r8d, r9d, r10d, r11d (to avoid branching)
	mov	ecx, [r12+zlib_dstate_bi_valid_ofs]
	mov	r8d, ecx
	mov	rdx, [r12+zlib_dstate_bi_buf_ofs]
	cmp	ecx, 8
	jb	..exit
	
	mov	[rdi+rsi], rdx			; regardless of how many, doesn't hurt
	shr	ecx, 3				; how many bytes we actually added
	add	rsi, rcx
	shl	ecx, 3				; back to bit count
	sub	r8d, ecx
	mov	[r12+zlib_dstate_bi_valid_ofs], r8d
	shr	rdx, cl
	mov	[r12+zlib_dstate_bi_buf_ofs], rdx

	mov	[r12+zlib_dstate_pending_ofs], rsi
calign
..exit:
}
	xor	eax, eax
	send_bits_lit rax, 3
	; next up: copy_block(0, 0, 1) where buf, stored_len are 0, and last = 1
	bi_windup
	; because buf and length are both zero, but the header is required, we only add the two shorts (LEN == 0 and NLEN == ~0)
	xor	eax, eax
	xor	ecx, ecx
	not	eax
	mov	word [rdi+rsi], cx
	mov	word [rdi+rsi+2], ax
	add	rsi, 4
	mov	[r12+zlib_dstate_pending_ofs], rsi
	; _tr_stored_block is now complete.
	jmp	.block_done_or_no_block		; do the trailer bit next, which will flush_pending for all possible branches here
calign
.bstate_done_full_flush:
	; _tr_stored_block(s, (char*)0, 0L, 0);
	xor	eax, eax
	send_bits_lit rax, 3
	; next up: copy_block(0, 0, 1) where buf, stored_len are 0, and last = 1
	bi_windup
	; because buf and length are both zero, but the header is required, we only add the two shorts (LEN == 0 and NLEN == ~0)
	xor	eax, eax
	xor	ecx, ecx
	not	eax
	mov	word [rdi+rsi], cx
	mov	word [rdi+rsi+2], ax
	add	rsi, 4
	mov	[r12+zlib_dstate_pending_ofs], rsi
	; _tr_stored_block is now complete.
	; unlike SYNC_FLUSH, we need to CLEAR_HASH(s), and also do:
	; if (s->lookahead == 0) {
	;   s->strstart = 0;
	;   s->block_start = 0;
	;   s->insert = 0;
	; }
	; CLEAR_HASH(s) == memset(s->head, 0, zlib_head_bytes)
	mov	rdi, [r12+zlib_dstate_head_ofs]
	xor	esi, esi
	mov	edx, zlib_head_bytes
	call	memset32
	; restore rdi/rsi back to our pending buffer
	xor	ecx, ecx
	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]
	; if lookahead is nonzero, jump straight to block_done_or_no_block
	cmp	dword [r12+zlib_dstate_lookahead_ofs], 0
	jne	.block_done_or_no_block
	mov	dword [r12+zlib_dstate_strstart_ofs], ecx
	mov	[r12+zlib_dstate_block_start_ofs], rcx
	mov	dword [r12+zlib_dstate_insert_ofs], ecx
	jmp	.block_done_or_no_block
calign
.bstate_done_partial_flush:
	; _tr_align(s), then jmp to .block_done_or_no_block
	mov	eax, 2			; STATIC_TREES << 1
	send_bits_lit rax, 3
	; send_code(END_BLOCK, zlib_static_ltree) is next, END_BLOCK = 256, so we need to load up offset 256 * 4 from zlib_static_ltree
	mov	eax, dword [zlib_static_ltree + 1024]
	; the code, then length are encoded as shorts, code first, then length
	; so the low order word of eax is the code, and the high order is the length
	mov	r10d, eax
	and	eax, 0xffff
	shr	r10d, 16
	send_bits rax, r10d
	; bi_flush is deferred: every exit path reachable from here performs it
	; fallthrough to block_done_or_no_block
calign
.block_done_or_no_block:
	; Assert(strm->avail_out > 0, "bug2");
	mov	eax, [r12+zlib_dstate_wrap_ofs]
	cmp	r13d, zlib_finish
	jne	.success_return
	jmp	qword [rax*8+.block_done_wrapjump]
dalign
.block_done_wrapjump:
	dq	.blockdone_nowrap, .blockdone_zwrap, .blockdone_gzwrap
calign
.blockdone_nowrap:
	; success return copy (NOTE: this is Z_STREAM_END return)
	; we flush all pending output here
	bi_flush
	mov	rdi, r15
	mov	rsi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rdx, [r12+zlib_dstate_pending_ofs]
	call	buffer$append
	xor	ecx, ecx
	mov	[r12+zlib_dstate_pending_ofs], rcx
	mov	eax, 1
	pop	r15 r14 r13 r12 rbx
	epilog
calign
.blockdone_gzwrap:
	; two little-endian uint32s (CRC-32, then total_in) get dumped into the buffer here
	mov	rcx, [rbx+zlib_adler_ofs]
	mov	rdx, [rbx+zlib_totalin_ofs]
	mov	dword [rdi+rsi], ecx
	mov	dword [rdi+rsi+4], edx
	add	rsi, 8
	mov	[r12+zlib_dstate_pending_ofs], rsi
	; copy of .success_return
	; we flush all pending output here
	bi_flush
	mov	rdi, r15
	mov	rsi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rdx, [r12+zlib_dstate_pending_ofs]
	call	buffer$append
	xor	ecx, ecx
	mov	[r12+zlib_dstate_pending_ofs], rcx
	mov	eax, 1
	pop	r15 r14 r13 r12 rbx
	epilog
calign
.blockdone_zwrap:
	; else, putShortMSB(strm->adler >> 16)
	; and   putShortMSB(strm->adler & 0xffff)
	; then set wrap = -wrap and be done
	mov	rcx, [rbx+zlib_adler_ofs]
	mov	edx, ecx		; save it
	shr	ecx, 16
	xchg	ch, cl
	xchg	dh, dl
	mov	word [rdi+rsi], cx
	mov	word [rdi+rsi+2], dx
	add	rsi, 4
	mov	[r12+zlib_dstate_pending_ofs], rsi
	; copy of .success_return fallthrough to avoid the extra jump
	; we flush all pending output here
	bi_flush
	mov	rdi, r15
	mov	rsi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rdx, [r12+zlib_dstate_pending_ofs]
	call	buffer$append
	xor	ecx, ecx
	mov	[r12+zlib_dstate_pending_ofs], rcx
	mov	eax, 1
	pop	r15 r14 r13 r12 rbx
	epilog
calign
.success_return:
	; we flush all pending output here
	bi_flush
	mov	rdi, r15
	mov	rsi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rdx, [r12+zlib_dstate_pending_ofs]
	call	buffer$append
	xor	ecx, ecx
	mov	[r12+zlib_dstate_pending_ofs], rcx
	mov	eax, 1
	pop	r15 r14 r13 r12 rbx
	epilog
calign
.error_return:
	xor	eax, eax
	pop	r15 r14 r13 r12 rbx
	epilog


	;
	; for all of the deflate_ methods, they are jumped to, not called
	; which means when they are done doing their business, they must put one of the zlib_bstate constants into eax
	; and then do a direct jump again to .bstate_done
	;
	; bstate constants:
	; zlib_bstate_need_more = 0
	; zlib_bstate_block_done = 1
	; zlib_bstate_finish_started = 2
	; zlib_bstate_finish_done = 3
	;
;--------------------------------------------------- deflate_stored -------------------------------------------------
calign
.deflate_stored:
	; on entry:
	; r12 == our dstate, r13d == flush, r14 == inbuf, r15 == outbuf, rbx == z_stream, rdi == pending buffer, rsi == pending offset
	; see commentary above re: return method
	;
	cmp	dword [r12+zlib_dstate_lookahead_ofs], 1
	ja	.deflate_stored_windowokay
	call	.fill_window
	cmp	dword [r12+zlib_dstate_lookahead_ofs], 0
	jne	.deflate_stored_windowokay
	mov	eax, zlib_bstate_need_more
	cmp	r13d, zlib_no_flush
	je	.bstate_done
	jmp	.deflate_stored_loopdone
calign
.deflate_stored_windowokay:
	mov	eax, [r12+zlib_dstate_lookahead_ofs]
	add	dword [r12+zlib_dstate_strstart_ofs], eax
	mov	dword [r12+zlib_dstate_lookahead_ofs], 0
	; figure out max_block_size
	mov	eax, 0xffff
	mov	ecx, zlib_overlay_bytes - 5
	cmp	eax, zlib_overlay_bytes - 5
	cmova	eax, ecx
	; max_block_size in rax
	mov	rcx, [r12+zlib_dstate_block_start_ofs]
	add	rcx, rax
	mov	edx, [r12+zlib_dstate_strstart_ofs]
	cmp	rdx, rcx
	jae	.deflate_stored_loop_case1
calign
.deflate_stored_loop_case1_continue:
	; strstart is still in edx
	sub	rdx, qword [r12+zlib_dstate_block_start_ofs]	; strstart - block_start
	cmp	edx, zlib_wsize - 262	
	jb	.deflate_stored		; go back around again
	; else, FLUSH_BLOCK(0)
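	; for reference, each of the inline FLUSH_BLOCK expansions below implements
	; zlib's FLUSH_BLOCK_ONLY (with flush_pending done via buffer$append), roughly:
	;   _tr_flush_block(s, (s->block_start >= 0L ?
	;		(charf *)&s->window[(unsigned)s->block_start] : (charf *)Z_NULL),
	;		(ulg)((long)s->strstart - s->block_start), (last));
	;   s->block_start = s->strstart;
	;   flush_pending(s->strm);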

	mov	r8, [r12+zlib_dstate_block_start_ofs]
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	xor	r9d, r9d
	mov	rdi, [r12+zlib_dstate_window_ofs]
	add	rdi, r8
	cmp	r8, 0
	cmovl	rdi, r9
	; stored_len is next
	mov	rsi, r10
	sub	rsi, r8
	xor	edx, edx		; last
	call	.tr_flush_block

	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	mov	[r12+zlib_dstate_block_start_ofs], r10

	; flush_pending: (tr_flush_bits just calls bi_flush)

	; restore rdi/rsi as our pending buffer for bi_flush
	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]
	bi_flush

	mov	rdx, [r12+zlib_dstate_pending_ofs]
	add	qword [rbx+zlib_totalout_ofs], rdx
	mov	rsi, [r12+zlib_dstate_pending_out_ofs]
	mov	rdi, r15
	call	buffer$append

	; we know that will always succeed, so we can leave pending_out_ofs alone
	; and we can just clear pending entirely
	mov	qword [r12+zlib_dstate_pending_ofs], 0
	; FLUSH_BLOCK(0) done

	; then back around again
	jmp	.deflate_stored
calign
.deflate_stored_loop_case1:
	; lookahead = (s->strstart - max_start)
	; strstart = max_start
	; flush_block(0)
	; then make sure strstart is still in edx, and jump to deflate_stored_loop_case1_continue
	; strstart is in edx, max_start is in rcx
	mov	r8, rdx
	sub	r8, rcx
	mov	dword [r12+zlib_dstate_lookahead_ofs], r8d	; lookahead = (strstart - max_start)
	mov	rdx, rcx
	mov	dword [r12+zlib_dstate_strstart_ofs], edx	; strstart = (max_start)
	; FLUSH_BLOCK(0)
	mov	r8, [r12+zlib_dstate_block_start_ofs]
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	xor	r9d, r9d
	mov	rdi, [r12+zlib_dstate_window_ofs]
	add	rdi, r8
	cmp	r8, 0
	cmovl	rdi, r9
	; stored_len is next
	mov	rsi, r10
	sub	rsi, r8
	xor	edx, edx		; last
	call	.tr_flush_block

	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	mov	[r12+zlib_dstate_block_start_ofs], r10

	; flush_pending: (tr_flush_bits just calls bi_flush)

	; restore rdi/rsi as our pending buffer for bi_flush
	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]
	bi_flush

	mov	rdx, [r12+zlib_dstate_pending_ofs]
	add	qword [rbx+zlib_totalout_ofs], rdx
	mov	rsi, [r12+zlib_dstate_pending_out_ofs]
	mov	rdi, r15
	call	buffer$append

	; we know that will always succeed, so we can leave pending_out_ofs alone
	; and we can just clear pending entirely
	mov	qword [r12+zlib_dstate_pending_ofs], 0
	; FLUSH_BLOCK(0) done
	mov	edx, [r12+zlib_dstate_strstart_ofs]
	jmp	.deflate_stored_loop_case1_continue

calign
.deflate_stored_loopdone:
	mov	dword [r12+zlib_dstate_insert_ofs], 0
	cmp	r13d, zlib_finish
	je	.deflate_stored_loopdone_finish
	mov	eax, zlib_bstate_block_done
	mov	edx, [r12+zlib_dstate_strstart_ofs]
	cmp	rdx, qword [r12+zlib_dstate_block_start_ofs]
	jle	.bstate_done
	; else, FLUSH_BLOCK(0)
	; FLUSH_BLOCK(0)
	mov	r8, [r12+zlib_dstate_block_start_ofs]
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	xor	r9d, r9d
	mov	rdi, [r12+zlib_dstate_window_ofs]
	add	rdi, r8
	cmp	r8, 0
	cmovl	rdi, r9
	; stored_len is next
	mov	rsi, r10
	sub	rsi, r8
	xor	edx, edx		; last
	call	.tr_flush_block

	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	mov	[r12+zlib_dstate_block_start_ofs], r10

	; flush_pending: (tr_flush_bits just calls bi_flush)

	; restore rdi/rsi as our pending buffer for bi_flush
	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]
	bi_flush

	mov	rdx, [r12+zlib_dstate_pending_ofs]
	add	qword [rbx+zlib_totalout_ofs], rdx
	mov	rsi, [r12+zlib_dstate_pending_out_ofs]
	mov	rdi, r15
	call	buffer$append

	; we know that will always succeed, so we can leave pending_out_ofs alone
	; and we can just clear pending entirely
	mov	qword [r12+zlib_dstate_pending_ofs], 0
	; FLUSH_BLOCK(0) done
	mov	eax, zlib_bstate_block_done
	jmp	.bstate_done
calign
.deflate_slow_finish:			; slow finish is the same, just calls flush_block(1) and returns .bstate_done
.deflate_fast_finish:			; fast finish is the same, just calls flush_block(1) and returns .bstate_done
.deflate_stored_loopdone_finish:
	; flush flags said finish, so FLUSH_BLOCK(1) and return zlib_bstate_finish_done
	; FLUSH_BLOCK(1)

	mov	r8, [r12+zlib_dstate_block_start_ofs]
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	xor	r9d, r9d
	mov	rdi, [r12+zlib_dstate_window_ofs]
	add	rdi, r8
	cmp	r8, 0
	cmovl	rdi, r9
	; stored_len is next
	mov	rsi, r10
	sub	rsi, r8
	mov	edx, 1			; last
	call	.tr_flush_block

	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	mov	[r12+zlib_dstate_block_start_ofs], r10

	; flush_pending: (tr_flush_bits just calls bi_flush)

	; restore rdi/rsi as our pending buffer for bi_flush
	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]

	bi_flush

	mov	rdx, [r12+zlib_dstate_pending_ofs]
	add	qword [rbx+zlib_totalout_ofs], rdx
	mov	rsi, [r12+zlib_dstate_pending_out_ofs]
	mov	rdi, r15
	call	buffer$append

	; we know that will always succeed, so we can leave pending_out_ofs alone
	; and we can just clear pending entirely
	mov	qword [r12+zlib_dstate_pending_ofs], 0
	; FLUSH_BLOCK(1) done
	mov	eax, zlib_bstate_finish_done
	jmp	.bstate_done


;--------------------------------------------------- deflate_fast ---------------------------------------------------
calign
.deflate_fast:
	; on entry:
	; r12 == our dstate, r13d == flush, r14 == inbuf, r15 == outbuf, rbx == z_stream, rdi == pending buffer, rsi == pending offset
	; see commentary above deflate_stored re: return method
	;
	cmp	dword [r12+zlib_dstate_lookahead_ofs], 262
	jae	.deflate_fast_windowokay
	call	.fill_window
	cmp	dword [r12+zlib_dstate_lookahead_ofs], 262
	jb	.deflate_fast_checkwindow
calign
.deflate_fast_windowokay:
	; UPDATE_HASH(s, h, c) = (h = (((h) << s->hash_shift) ^ (c)) & s->hash_mask)
	;
	;
	; INSERT_STRING(s, str, match_head) = 
	; (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), 
	; match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h],
	; s->head[s->ins_h] = (Pos)(str))
	mov	eax, [r12+zlib_dstate_strstart_ofs]		; load this up preemptively for noinitialinsert as well
	xor	r10d, r10d	; hash_head
	cmp	dword [r12+zlib_dstate_lookahead_ofs], 3
	jb	.deflate_fast_noinitialinsert
	; INSERT_STRING(s, s->strstart, hash_head)
	; UPDATE_HASH(s, s->ins_h, s->window[(s->strstart) + (MIN_MATCH-1)])
	mov	rdx, [r12+zlib_dstate_window_ofs]
	; eax already strstart
	add	eax, 2					; MIN_MATCH - 1
	movzx	eax, byte [rdx+rax]			; c for UPDATE_HASH
	mov	edx, dword [r12+zlib_dstate_ins_h_ofs]	; h for UPDATE_HASH
	shl	edx, zlib_hashshift
	xor	edx, eax
	and	edx, zlib_hashmask			; h = (((h) << hash_shift) ^ (c)) & hash_mask
	mov	dword [r12+zlib_dstate_ins_h_ofs], edx

	; next up: match_head (hash_head, r10d) = s->prev[(s->strstart) & s->w_mask] = s->head[s->ins_h]
	mov	r8, [r12+zlib_dstate_head_ofs]
	mov	r9, [r12+zlib_dstate_prev_ofs]
	mov	ecx, dword [r12+zlib_dstate_strstart_ofs]
	movzx	r10d, word [r8+rdx*2]
	and	ecx, zlib_wmask
	mov	word [r9+rcx*2], r10w

	; next up: load back up strstart and set s->head[s->ins_h] to it
	mov	eax, [r12+zlib_dstate_strstart_ofs]
	mov	word [r8+rdx*2], ax
	; END INSERT_STRING(s, s->strstart, hash_head)
calign
.deflate_fast_noinitialinsert:
	test	r10d, r10d
	jz	.deflate_fast_check_match_length
	sub	eax, r10d
	cmp	eax, zlib_wsize - 262
	ja	.deflate_fast_check_match_length
	; else, s->match_length = longest_match(s, hash_head)
	; we'll go ahead and use rdi as our hash_head argument
	mov	edi, r10d
	push	r10
	call	.longest_match
	pop	r10
	mov	dword [r12+zlib_dstate_match_length_ofs], eax
calign
.deflate_fast_check_match_length:
	cmp	dword [r12+zlib_dstate_match_length_ofs], 3
	jb	.deflate_fast_literalonly
	push	r10
if defined zlib_debug_wedontdothis
	mov	edi, dword [r12+zlib_dstate_strstart_ofs]
	mov	esi, dword [r12+zlib_dstate_match_start_ofs]
	mov	edx, dword [r12+zlib_dstate_match_length_ofs]
	call	.check_match
end if
	; _tr_tally_dist(s, s->strstart - s->match_start, s->match_length - MIN_MATCH, bflush)
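	; which in reference zlib's trees.h reads, roughly:
	;   s->d_buf[s->last_lit] = (ush)dist;
	;   s->l_buf[s->last_lit++] = (uch)len;
	;   dist--;
	;   s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++;
	;   s->dyn_dtree[d_code(dist)].Freq++;
	;   bflush = (s->last_lit == s->lit_bufsize-1);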

	mov	r8, [r12+zlib_dstate_l_buf_ofs]
	mov	r9, [r12+zlib_dstate_d_buf_ofs]
	mov	ecx, dword [r12+zlib_dstate_last_lit_ofs]

	; and it is a macro
	mov	edi, dword [r12+zlib_dstate_strstart_ofs]
	mov	esi, dword [r12+zlib_dstate_match_start_ofs]
	sub	edi, esi
	mov	esi, dword [r12+zlib_dstate_match_length_ofs]
	sub	esi, 3
	; edi == distance
	; esi == length
	; last_lit is a u32

	; we're using r10 as our hash_head, and bflush needs to be set by _tr_tally_dist macro
	; probably easiest way is to push it onto the stack along with hash_head and then end of loop check can pop them both
	; so we are free to blast eax, ecx, edx, r8d, r9d, and if we push r10 that too
	mov	word [r9+rcx*2], di			; s->d_buf[last_lit] = dist (word)
	mov	byte [r8+rcx], sil			; s->l_buf[last_lit] = len (byte)

	add	ecx, 1					; last_lit++
	sub	edi, 1					; distance--
	movzx	edx, byte [rsi+zlib_length_code]		; acquire length code[len]
	mov	dword [r12+zlib_dstate_last_lit_ofs], ecx	; put last_lit back
	lea	r8, [r12+zlib_dstate_dyn_ltree_ofs]
	add	edx, 257				; LITERALS + 1
	lea	r9, [r12+zlib_dstate_dyn_dtree_ofs]
	add	word [r8+rdx*4], 1			; s->dyn_ltree[_zlib_length_code[len]+LITERALS+1].Freq++
	; for the dyn_dtree, we need d_code(dist), and dist is in edi
	mov	r8d, edi
	shr	r8d, 7
	add	r8d, 256				; 256+(dist>>7)
	cmp	edi, 256
	cmovb	r8d, edi
	movzx	eax, byte [r8+zlib_dist_code]
	; so now eax == d_code(dist)
	add	word [r9+rax*4], 1			; s->dyn_dtree[d_code(dist)].Freq++
	; r10 was already pushed above, next is to determine whether to flush or not

	xor	edx, edx
	mov	eax, 1
	cmp	ecx, zlib_litbufsize - 1
	cmovne	eax, edx

	; eax == flush
	push	rax
	; end of _tr_tally_dist

	; next up: s->lookahead -= s->match_length
	mov	edx, dword [r12+zlib_dstate_match_length_ofs]
	sub	dword [r12+zlib_dstate_lookahead_ofs], edx

	; next up: if (s->match_length <= s->max_insert_length && s->lookahead >= MIN_MATCH) ...
	; max_insert_length == same as max_lazy_match
	cmp	edx, zlib_max_lazy
	ja	.deflate_fast_insert_nonew
	cmp	dword [r12+zlib_dstate_lookahead_ofs], 3	; MIN_MATCH
	jb	.deflate_fast_insert_nonew
	; otherwise, insert new strings in hash table
	sub	dword [r12+zlib_dstate_match_length_ofs], 1	; s->match_length--
	; because we aren't calling check_match, this isn't necessary:
	;	mov	r10, [rsp+8]					; get back hash_head (flush is at [rsp])
calign
.deflate_fast_insert_newstrings:
	mov	eax, dword [r12+zlib_dstate_strstart_ofs]
	add	eax, 1
	mov	dword [r12+zlib_dstate_strstart_ofs], eax	; s->strstart++
	
	; INSERT_STRING(s, s->strstart, hash_head)
	; UPDATE_HASH(s, s->ins_h, s->window[(s->strstart) + (MIN_MATCH-1)])
	mov	rdx, [r12+zlib_dstate_window_ofs]
	; eax already strstart
	add	eax, 2					; MIN_MATCH - 1
	movzx	eax, byte [rdx+rax]			; c for UPDATE_HASH
	mov	edx, dword [r12+zlib_dstate_ins_h_ofs]	; h for UPDATE_HASH
	shl	edx, zlib_hashshift
	xor	edx, eax
	and	edx, zlib_hashmask			; h = (((h) << hash_shift) ^ (c)) & hash_mask
	mov	dword [r12+zlib_dstate_ins_h_ofs], edx

	; next up: match_head (hash_head, r10d) = s->prev[(s->strstart) & s->w_mask] = s->head[s->ins_h]
	mov	r8, [r12+zlib_dstate_head_ofs]
	mov	r9, [r12+zlib_dstate_prev_ofs]
	mov	ecx, dword [r12+zlib_dstate_strstart_ofs]
	movzx	r10d, word [r8+rdx*2]
	and	ecx, zlib_wmask
	mov	word [r9+rcx*2], r10w

	; next up: load back up strstart and set s->head[s->ins_h] to it
	mov	eax, [r12+zlib_dstate_strstart_ofs]
	mov	word [r8+rdx*2], ax
	; END INSERT_STRING(s, s->strstart, hash_head)

	sub	dword [r12+zlib_dstate_match_length_ofs], 1
	jnz	.deflate_fast_insert_newstrings

	; r10 (hash_head) got updated, store it back in the stack as well
	mov	[rsp+8], r10

	add	eax, 1
	mov	dword [r12+zlib_dstate_strstart_ofs], eax	; s->strstart++
	; pop our flush and hash_head
	pop	rax						; flush
	pop	r10						; hash_head
	test	eax, eax
	jz	.deflate_fast
	; else, FLUSH_BLOCK(s, 0)
	mov	r8, [r12+zlib_dstate_block_start_ofs]
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	xor	r9d, r9d
	mov	rdi, [r12+zlib_dstate_window_ofs]
	add	rdi, r8
	cmp	r8, 0
	cmovl	rdi, r9
	; stored_len is next
	mov	rsi, r10
	sub	rsi, r8
	xor	edx, edx		; last
	call	.tr_flush_block
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	mov	[r12+zlib_dstate_block_start_ofs], r10

	; flush_pending: (tr_flush_bits just calls bi_flush)

	; restore rdi/rsi as our pending buffer for bi_flush
	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]
	bi_flush

	mov	rdx, [r12+zlib_dstate_pending_ofs]
	add	qword [rbx+zlib_totalout_ofs], rdx
	mov	rsi, [r12+zlib_dstate_pending_out_ofs]
	mov	rdi, r15
	call	buffer$append

	; we know that will always succeed, so we can leave pending_out_ofs alone
	; and we can just clear pending entirely
	mov	qword [r12+zlib_dstate_pending_ofs], 0
	jmp	.deflate_fast
calign
.deflate_fast_insert_nonew:
	; match_length is already in edx
	; s->strstart += s->match_length
	mov	r8, [r12+zlib_dstate_window_ofs]
	xor	eax, eax
	add	dword [r12+zlib_dstate_strstart_ofs], edx
	; s->match_length = 0
	mov	dword [r12+zlib_dstate_match_length_ofs], eax
	; s->ins_h = s->window[s->strstart];
	mov	ecx, dword [r12+zlib_dstate_strstart_ofs]
	movzx	eax, byte [r8+rcx]
	mov	[r12+zlib_dstate_ins_h_ofs], eax		; ins_h = window[strstart]
	; next up: UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1])
	; UPDATE_HASH(s, h, c) = (h = (((h) << s->hash_shift) ^ (c)) & s->hash_mask)

	; eax == ins_h == h
	movzx	edx, byte [r8+rcx+1]				; s->window[s->strstart+1]
	; edx == c
	shl	eax, zlib_hashshift
	xor	eax, edx
	and	eax, zlib_hashmask
	mov	dword [r12+zlib_dstate_ins_h_ofs], eax
	; end of UPDATE_HASH(s, h, c)

	pop	rax						; flush
	pop	r10						; hash_head
	test	eax, eax
	jz	.deflate_fast
	; else, FLUSH_BLOCK(s, 0)
	
	mov	r8, [r12+zlib_dstate_block_start_ofs]
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	xor	r9d, r9d
	mov	rdi, [r12+zlib_dstate_window_ofs]
	add	rdi, r8
	cmp	r8, 0
	cmovl	rdi, r9
	; stored_len is next
	mov	rsi, r10
	sub	rsi, r8
	xor	edx, edx		; last
	call	.tr_flush_block
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	mov	[r12+zlib_dstate_block_start_ofs], r10

	; flush_pending: (tr_flush_bits just calls bi_flush)

	; restore rdi/rsi as our pending buffer for bi_flush
	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]
	bi_flush

	mov	rdx, [r12+zlib_dstate_pending_ofs]
	add	qword [rbx+zlib_totalout_ofs], rdx
	mov	rsi, [r12+zlib_dstate_pending_out_ofs]
	mov	rdi, r15
	call	buffer$append

	; we know that will always succeed, so we can leave pending_out_ofs alone
	; and we can just clear pending entirely
	mov	qword [r12+zlib_dstate_pending_ofs], 0
	jmp	.deflate_fast
calign
.deflate_fast_literalonly:
	; _tr_tally_lit(s, s->window[s->strstart], bflush)
	mov	edi, dword [r12+zlib_dstate_strstart_ofs]
	mov	rdx, [r12+zlib_dstate_window_ofs]
	mov	ecx, dword [r12+zlib_dstate_last_lit_ofs]
	mov	r8, [r12+zlib_dstate_l_buf_ofs]
	mov	r9, [r12+zlib_dstate_d_buf_ofs]

	movzx	eax, byte [rdx+rdi]			; s->window[s->strstart]
	add	edi, 1					; strstart++, we'll put it back after we're done
	mov	word [r9+rcx*2], 0
	mov	byte [r8+rcx], al

	add	ecx, 1
	lea	r8, [r12+zlib_dstate_dyn_ltree_ofs]
	add	word [r8+rax*4], 1		; s->dyn_ltree[cc].Freq++
	mov	[r12+zlib_dstate_last_lit_ofs], ecx	; last_lit++

	; flush == (s->last_lit == s->lit_bufsize-1)
	xor	edx, edx
	mov	eax, 1
	cmp	ecx, zlib_litbufsize - 1
	cmovne	eax, edx
	; eax == flush

	mov	dword [r12+zlib_dstate_strstart_ofs], edi	; s->strstart++ (from above)
	sub	dword [r12+zlib_dstate_lookahead_ofs], 1	; s->lookahead--

	test	eax, eax
	jz	.deflate_fast

	; FLUSH_BLOCK(s, 0);
	
	mov	r8, [r12+zlib_dstate_block_start_ofs]
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	xor	r9d, r9d
	mov	rdi, [r12+zlib_dstate_window_ofs]
	add	rdi, r8
	cmp	r8, 0
	cmovl	rdi, r9
	; stored_len is next
	mov	rsi, r10
	sub	rsi, r8
	xor	edx, edx		; last
	call	.tr_flush_block
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	mov	[r12+zlib_dstate_block_start_ofs], r10

	; flush_pending: (tr_flush_bits just calls bi_flush)

	; restore rdi/rsi as our pending buffer for bi_flush
	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]
	bi_flush

	mov	rdx, [r12+zlib_dstate_pending_ofs]
	add	qword [rbx+zlib_totalout_ofs], rdx
	mov	rsi, [r12+zlib_dstate_pending_out_ofs]
	mov	rdi, r15
	call	buffer$append

	; we know that will always succeed, so we can leave pending_out_ofs alone
	; and we can just clear pending entirely
	mov	qword [r12+zlib_dstate_pending_ofs], 0
	jmp	.deflate_fast
calign
.deflate_fast_checkwindow:
	; lookahead < MIN_LOOKAHEAD (262) ..
	mov	eax, zlib_bstate_need_more
	cmp	r13d, zlib_no_flush
	je	.bstate_done
	cmp	dword [r12+zlib_dstate_lookahead_ofs], 0
	jne	.deflate_fast_windowokay

	; so we are all done with the for (;;)
	; next up is: s->insert = s->strstart < MIN_MATCH-1 ? s->strstart : MIN_MATCH-1
	mov	ecx, dword [r12+zlib_dstate_strstart_ofs]
	mov	edx, 2					; MIN_MATCH-1
	cmp	ecx, 2
	cmova	ecx, edx
	mov	dword [r12+zlib_dstate_insert_ofs], ecx
	cmp	r13d, zlib_finish
	je	.deflate_fast_finish			; stored_only finish does the same exact thing, so it is declared way above
	mov	eax, zlib_bstate_block_done
	cmp	dword [r12+zlib_dstate_last_lit_ofs], 0
	je	.bstate_done
	; FLUSH_BLOCK(s, 0)
	
	mov	r8, [r12+zlib_dstate_block_start_ofs]
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	xor	r9d, r9d
	mov	rdi, [r12+zlib_dstate_window_ofs]
	add	rdi, r8
	cmp	r8, 0
	cmovl	rdi, r9
	; stored_len is next
	mov	rsi, r10
	sub	rsi, r8
	xor	edx, edx		; last
	call	.tr_flush_block
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	mov	[r12+zlib_dstate_block_start_ofs], r10

	; flush_pending: (tr_flush_bits just calls bi_flush)

	; restore rdi/rsi as our pending buffer for bi_flush
	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]
	bi_flush

	mov	rdx, [r12+zlib_dstate_pending_ofs]
	add	qword [rbx+zlib_totalout_ofs], rdx
	mov	rsi, [r12+zlib_dstate_pending_out_ofs]
	mov	rdi, r15
	call	buffer$append

	; we know that will always succeed, so we can leave pending_out_ofs alone
	; and we can just clear pending entirely
	mov	qword [r12+zlib_dstate_pending_ofs], 0
	mov	eax, zlib_bstate_block_done
	jmp	.bstate_done




;--------------------------------------------------- deflate_slow ---------------------------------------------------
calign
.deflate_slow:
	; on entry:
	; r12 == our dstate, r13d == flush, r14 == inbuf, r15 == outbuf, rbx == z_stream, rdi == pending buffer, rsi == pending offset
	; see commentary above deflate_stored re: return method
	;
	cmp	dword [r12+zlib_dstate_lookahead_ofs], 262
	jae	.deflate_slow_windowokay
	call	.fill_window
	cmp	dword [r12+zlib_dstate_lookahead_ofs], 262
	jb	.deflate_slow_checkwindow
calign
.deflate_slow_windowokay:
	; UPDATE_HASH(s, h, c) = (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
	;
	;
	; INSERT_STRING(s, str, match_head) = 
	; (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), 
	; match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h],
	; s->head[s->ins_h] = (Pos)(str))
	mov	eax, [r12+zlib_dstate_strstart_ofs]		; load this up preemptively for noinitialinsert as well
	xor	r10d, r10d	; hash_head
	cmp	dword [r12+zlib_dstate_lookahead_ofs], 3
	jb	.deflate_slow_noinitialinsert
	; INSERT_STRING(s, s->strstart, hash_head)
	; UPDATE_HASH(s, s->ins_h, s->window[(s->strstart) + (MIN_MATCH-1)])
	mov	rdx, [r12+zlib_dstate_window_ofs]
	; eax already strstart
	add	eax, 2					; MIN_MATCH - 1
	movzx	eax, byte [rdx+rax]			; c for UPDATE_HASH
	mov	edx, dword [r12+zlib_dstate_ins_h_ofs]	; h for UPDATE_HASH
	shl	edx, zlib_hashshift
	xor	edx, eax
	and	edx, zlib_hashmask			; h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask
	mov	dword [r12+zlib_dstate_ins_h_ofs], edx

	; next up: match_head (hash_head, r10d) = s->prev[(s->strstart) & s->w_mask] = s->head[s->ins_h]
	mov	r8, [r12+zlib_dstate_head_ofs]
	mov	r9, [r12+zlib_dstate_prev_ofs]
	mov	ecx, dword [r12+zlib_dstate_strstart_ofs]
	movzx	r10d, word [r8+rdx*2]
	and	ecx, zlib_wmask
	mov	word [r9+rcx*2], r10w

	; next up: load back up strstart and set s->head[s->ins_h] to it
	mov	eax, [r12+zlib_dstate_strstart_ofs]
	mov	word [r8+rdx*2], ax
	; END INSERT_STRING(s, s->strstart, hash_head)
calign
.deflate_slow_noinitialinsert:
	; different from deflate_fast:
	; s->prev_length = s->match_length, s->prev_match = s->match_start;
	; s->match_length = MIN_MATCH-1;
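	; lazy-match sketch per the reference deflate_slow: find the longest match at
	; strstart; if the previous position's match (prev_length) was >= MIN_MATCH and
	; the new one is no longer, emit the previous match and re-insert the covered
	; positions; otherwise hold the current byte back (match_available) and try one
	; position further before deciding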
	mov	ecx, [r12+zlib_dstate_match_length_ofs]
	mov	edx, [r12+zlib_dstate_match_start_ofs]
	mov	r8d, 2

	mov	[r12+zlib_dstate_prev_length_ofs], ecx
	mov	[r12+zlib_dstate_prev_match_ofs], edx
	mov	[r12+zlib_dstate_match_length_ofs], r8d

	test	r10d, r10d
	jz	.deflate_slow_check_match_length		; !hash_head ?
	cmp	ecx, zlib_max_lazy
	jae	.deflate_slow_check_match_length		; prev_length >= max_lazy_match?

	sub	eax, r10d
	cmp	eax, zlib_wsize - 262
	ja	.deflate_slow_check_match_length

	; else, s->match_length = longest_match(s, hash_head)
	; we'll go ahead and use rdi as our hash_head argument
	mov	edi, r10d
	push	r10
	call	.longest_match
	pop	r10
	mov	dword [r12+zlib_dstate_match_length_ofs], eax

	; if match_length <= 5 and (strategy == filtered or (match_length == 3 && strstart - match_start > 4096))
	mov	ecx, [r12+zlib_dstate_match_length_ofs]
	cmp	ecx, 5
	ja	.deflate_slow_check_match_length
	cmp	dword [r12+zlib_dstate_strategy_ofs], 1		; Z_FILTERED
	je	.deflate_slow_force_match_length
	cmp	ecx, 3						; match_length == MIN_MATCH ?
	jne	.deflate_slow_check_match_length
	mov	eax, [r12+zlib_dstate_strstart_ofs]
	mov	edx, [r12+zlib_dstate_match_start_ofs]
	sub	eax, edx
	cmp	eax, 4096					; strstart - match_start > TOO_FAR?
	jle	.deflate_slow_check_match_length
calign
.deflate_slow_force_match_length:
	mov	dword [r12+zlib_dstate_match_length_ofs], 2	; MIN_MATCH-1
calign
.deflate_slow_check_match_length:
	mov	ecx, [r12+zlib_dstate_prev_length_ofs]
	mov	edx, [r12+zlib_dstate_match_length_ofs]
	mov	r8d, [r12+zlib_dstate_match_start_ofs]
	
	cmp	ecx, 3
	jb	.deflate_slow_check_match_available
	cmp	edx, ecx
	ja	.deflate_slow_check_match_available

	if profile_zlib_internals
		prolog_inner .deflate_slow_check_match_length
	end if

	push	r10
	; _tr_tally_dist(s, s->strstart - 1 - s->prev_match, s->prev_length - MIN_MATCH, bflush)
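	; the reference macro (deflate.h, zlib 1.2.8), hand-expanded below:
	;   #define _tr_tally_dist(s, distance, length, flush) \
	;     { uch len = (length); \
	;       ush dist = (distance); \
	;       s->d_buf[s->last_lit] = dist; \
	;       s->l_buf[s->last_lit++] = len; \
	;       dist--; \
	;       s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; \
	;       s->dyn_dtree[d_code(dist)].Freq++; \
	;       flush = (s->last_lit == s->lit_bufsize-1); }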
	mov	r8, [r12+zlib_dstate_l_buf_ofs]
	mov	r9, [r12+zlib_dstate_d_buf_ofs]
	mov	ecx, dword [r12+zlib_dstate_last_lit_ofs]

	; and it is a macro
	mov	edi, dword [r12+zlib_dstate_strstart_ofs]
	sub	edi, 1
	sub	edi, dword [r12+zlib_dstate_prev_match_ofs]
	mov	esi, dword [r12+zlib_dstate_prev_length_ofs]
	sub	esi, 3
	; edi == distance
	; esi == length
	; last_lit is a u32

	; we're using r10 as our hash_head, and bflush needs to be set by _tr_tally_dist macro
	; probably easiest way is to push it onto the stack along with hash_head and then end of loop check can pop them both
	; so we are free to blast eax, ecx, edx, r8d, r9d, and if we push r10 that too
	mov	word [r9+rcx*2], di			; s->d_buf[last_lit] = dist (word)
	mov	byte [r8+rcx], sil			; s->l_buf[last_lit] = len (byte)
	add	ecx, 1					; last_lit++
	sub	edi, 1					; distance--
	movzx	edx, byte [rsi+zlib_length_code]		; acquire length code[len]
	mov	dword [r12+zlib_dstate_last_lit_ofs], ecx	; put last_lit back
	lea	r8, [r12+zlib_dstate_dyn_ltree_ofs]
	add	edx, 257				; LITERALS + 1
	lea	r9, [r12+zlib_dstate_dyn_dtree_ofs]
	add	word [r8+rdx*4], 1			; s->dyn_ltree[_zlib_length_code[len]+LITERALS+1].Freq++

	; for the dyn_dtree, we need d_code(dist), and dist is in edi
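	; d_code per the reference:
	;   #define d_code(dist) ((dist) < 256 ? _dist_code[dist] : _dist_code[256+((dist)>>7)])
	; computed branchlessly below with cmovb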
	mov	r8d, edi
	shr	r8d, 7
	add	r8d, 256				; 256+(dist>>7)

	cmp	edi, 256
	cmovb	r8d, edi
	movzx	eax, byte [r8+zlib_dist_code]

	; so now eax == d_code(dist)
	add	word [r9+rax*4], 1			; s->dyn_dtree[d_code(dist)].Freq++

	; r10 was already pushed above, next is to determine whether to flush or not
	xor	edx, edx
	mov	eax, 1
	cmp	ecx, zlib_litbufsize - 1
	cmovne	eax, edx
	; eax == flush
	push	rax
	; end of _tr_tally_dist

	mov	r11d, dword [r12+zlib_dstate_lookahead_ofs]

	; next up: s->lookahead -= s->prev_length -1
	mov	edx, dword [r12+zlib_dstate_prev_length_ofs]
	sub	edx, 1
	sub	dword [r12+zlib_dstate_lookahead_ofs], edx
	; s->prev_length -= 2
	sub	edx, 1
	mov	dword [r12+zlib_dstate_prev_length_ofs], edx

	push	rbx

	; next up: do if (++strstart <= max_insert) INSERT_STRING(s, s->strstart, hash_head) while (--prev_length)
	; compute max_insert (pre the above mods) first

	mov	ebx, dword [r12+zlib_dstate_strstart_ofs]
	sub	r11d, 3
	add	r11d, ebx					; max_insert
calign
.deflate_slow_insert_newstrings:
	add	ebx, 1						; ++strstart
	cmp	ebx, r11d					; <= max_insert
	ja	.deflate_slow_insert_newstrings_next

	; INSERT_STRING(s, s->strstart, hash_head)
	; UPDATE_HASH(s, s->ins_h, s->window[(s->strstart) + (MIN_MATCH-1)])
	mov	rdx, [r12+zlib_dstate_window_ofs]
	movzx	eax, byte [rdx+rbx+2]			; c for UPDATE_HASH
	mov	edx, dword [r12+zlib_dstate_ins_h_ofs]	; h for UPDATE_HASH
	shl	edx, zlib_hashshift
	xor	edx, eax
	and	edx, zlib_hashmask			; h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask
	mov	dword [r12+zlib_dstate_ins_h_ofs], edx

	; next up: match_head (hash_head, r10d) = s->prev[(s->strstart) & s->w_mask] = s->head[s->ins_h]
	mov	r8, [r12+zlib_dstate_head_ofs]
	mov	r9, [r12+zlib_dstate_prev_ofs]
	mov	ecx, ebx
	movzx	r10d, word [r8+rdx*2]
	and	ecx, zlib_wmask
	mov	word [r9+rcx*2], r10w

	; next up: load back up strstart and set s->head[s->ins_h] to it
	mov	word [r8+rdx*2], bx
	; END INSERT_STRING(s, s->strstart, hash_head)
calign
.deflate_slow_insert_newstrings_next:
	sub	dword [r12+zlib_dstate_prev_length_ofs], 1
	jnz	.deflate_slow_insert_newstrings
	xor	edx, edx

	add	ebx, 1
	mov	[r12+zlib_dstate_strstart_ofs], ebx
	pop	rbx

	; r10 (hash_head) got updated, store it back in the stack as well
	mov	[rsp+8], r10
	mov	[r12+zlib_dstate_match_available_ofs], edx	; match_available = 0
	mov	dword [r12+zlib_dstate_match_length_ofs], 2	; match_length = MIN_MATCH-1

	; pop our flush and hash_head
	pop	rax						; flush
	pop	r10						; hash_head

	if profile_zlib_internals
		epilog_inner
	end if

	test	eax, eax
	jz	.deflate_slow

	; else, FLUSH_BLOCK(s, 0)
	mov	r8, [r12+zlib_dstate_block_start_ofs]
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	xor	r9d, r9d
	mov	rdi, [r12+zlib_dstate_window_ofs]
	add	rdi, r8
	cmp	r8, 0
	cmovl	rdi, r9
	; stored_len is next
	mov	rsi, r10
	sub	rsi, r8
	xor	edx, edx		; last
	call	.tr_flush_block
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	mov	[r12+zlib_dstate_block_start_ofs], r10

	; flush_pending: (tr_flush_bits just calls bi_flush)

	; restore rdi/rsi as our pending buffer for bi_flush
	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]
	bi_flush

	mov	rdx, [r12+zlib_dstate_pending_ofs]
	add	qword [rbx+zlib_totalout_ofs], rdx
	mov	rsi, [r12+zlib_dstate_pending_out_ofs]
	mov	rdi, r15
	call	buffer$append

	; we know that will always succeed, so we can leave pending_out_ofs alone
	; and we can just clear pending entirely
	mov	qword [r12+zlib_dstate_pending_ofs], 0
	jmp	.deflate_slow
calign
.deflate_slow_check_match_available:
	cmp	dword [r12+zlib_dstate_match_available_ofs], 0
	jne	.deflate_slow_match_available
	; otherwise, set match_available = 1, strstart++, lookahead-- and go back to the top
	mov	dword [r12+zlib_dstate_match_available_ofs], 1
	add	dword [r12+zlib_dstate_strstart_ofs], 1
	sub	dword [r12+zlib_dstate_lookahead_ofs], 1
	jmp	.deflate_slow
calign
.deflate_slow_match_available:
	if profile_zlib_internals
		prolog_inner .deflate_slow_match_available
	end if

	; _tr_tally_lit(s, s->window[s->strstart-1], bflush)
	mov	edi, dword [r12+zlib_dstate_strstart_ofs]
	mov	rdx, [r12+zlib_dstate_window_ofs]
	sub	edi, 1
	mov	ecx, dword [r12+zlib_dstate_last_lit_ofs]
	mov	r8, [r12+zlib_dstate_l_buf_ofs]
	mov	r9, [r12+zlib_dstate_d_buf_ofs]

	movzx	eax, byte [rdx+rdi]			; s->window[s->strstart-1]
	add	edi, 2					; strstart++, we'll put it back after we're done
	mov	word [r9+rcx*2], 0
	mov	byte [r8+rcx], al
	add	ecx, 1
	lea	r8, [r12+zlib_dstate_dyn_ltree_ofs]
	add	word [r8+rax*4], 1		; s->dyn_ltree[cc].Freq++
	mov	[r12+zlib_dstate_last_lit_ofs], ecx	; last_lit++

	; flush == (s->last_lit == s->lit_bufsize-1)
	xor	edx, edx
	mov	eax, 1
	cmp	ecx, zlib_litbufsize - 1
	cmovne	eax, edx
	; eax == flush

	mov	dword [r12+zlib_dstate_strstart_ofs], edi	; s->strstart++ (from above)
	sub	dword [r12+zlib_dstate_lookahead_ofs], 1	; s->lookahead--

	if profile_zlib_internals
		epilog_inner
	end if

	test	eax, eax
	jz	.deflate_slow

	; FLUSH_BLOCK_ONLY(s, 0);
	
	mov	r8, [r12+zlib_dstate_block_start_ofs]
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	xor	r9d, r9d
	mov	rdi, [r12+zlib_dstate_window_ofs]
	add	rdi, r8
	cmp	r8, 0
	cmovl	rdi, r9
	; stored_len is next
	mov	rsi, r10
	sub	rsi, r8
	xor	edx, edx		; last
	call	.tr_flush_block
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	mov	[r12+zlib_dstate_block_start_ofs], r10

	; flush_pending: (tr_flush_bits just calls bi_flush)

	; restore rdi/rsi as our pending buffer for bi_flush
	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]

	bi_flush

	mov	rdx, [r12+zlib_dstate_pending_ofs]
	add	qword [rbx+zlib_totalout_ofs], rdx
	mov	rsi, [r12+zlib_dstate_pending_out_ofs]
	mov	rdi, r15
	call	buffer$append

	; we know that will always succeed, so we can leave pending_out_ofs alone
	; and we can just clear pending entirely
	mov	qword [r12+zlib_dstate_pending_ofs], 0
	jmp	.deflate_slow
calign
.deflate_slow_checkwindow:
	; lookahead < MIN_LOOKAHEAD (262) ..
	mov	eax, zlib_bstate_need_more
	cmp	r13d, zlib_no_flush
	je	.bstate_done
	cmp	dword [r12+zlib_dstate_lookahead_ofs], 0
	jne	.deflate_slow_windowokay


	; so we are all done with the for (;;)
	cmp	dword [r12+zlib_dstate_match_available_ofs], 0
	je	.deflate_slow_alldone

	; _tr_tally_lit(s, s->window[s->strstart-1], bflush)
	mov	edi, dword [r12+zlib_dstate_strstart_ofs]
	mov	rdx, [r12+zlib_dstate_window_ofs]
	sub	edi, 1
	mov	ecx, dword [r12+zlib_dstate_last_lit_ofs]
	mov	r8, [r12+zlib_dstate_l_buf_ofs]
	mov	r9, [r12+zlib_dstate_d_buf_ofs]

	movzx	eax, byte [rdx+rdi]			; s->window[s->strstart-1]
	add	edi, 1					; dead here: strstart is neither advanced nor stored back in this final-tally path
	mov	word [r9+rcx*2], 0
	mov	byte [r8+rcx], al
	add	ecx, 1
	lea	r8, [r12+zlib_dstate_dyn_ltree_ofs]
	add	word [r8+rax*4], 1		; s->dyn_ltree[cc].Freq++
	mov	[r12+zlib_dstate_last_lit_ofs], ecx	; last_lit++

	; flush == (s->last_lit == s->lit_bufsize-1)
	xor	edx, edx
	mov	eax, 1
	cmp	ecx, zlib_litbufsize - 1
	cmovne	eax, edx
	; eax == flush

	mov	dword [r12+zlib_dstate_match_available_ofs], 0
	
calign
.deflate_slow_alldone:
	; next up is: s->insert = s->strstart < MIN_MATCH-1 ? s->strstart : MIN_MATCH-1
	mov	ecx, dword [r12+zlib_dstate_strstart_ofs]
	mov	edx, 2					; MIN_MATCH-1
	cmp	ecx, 2
	cmova	ecx, edx
	mov	dword [r12+zlib_dstate_insert_ofs], ecx
	cmp	r13d, zlib_finish
	je	.deflate_slow_finish			; stored_only finish does the same exact thing, so it is declared way above
	mov	eax, zlib_bstate_block_done
	cmp	dword [r12+zlib_dstate_last_lit_ofs], 0
	je	.bstate_done
	; FLUSH_BLOCK(s, 0)
	
	mov	r8, [r12+zlib_dstate_block_start_ofs]
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	xor	r9d, r9d
	mov	rdi, [r12+zlib_dstate_window_ofs]
	add	rdi, r8
	cmp	r8, 0
	cmovl	rdi, r9
	; stored_len is next
	mov	rsi, r10
	sub	rsi, r8
	xor	edx, edx		; last
	call	.tr_flush_block
	mov	r10d, [r12+zlib_dstate_strstart_ofs]
	mov	[r12+zlib_dstate_block_start_ofs], r10

	; flush_pending: (tr_flush_bits just calls bi_flush)

	; restore rdi/rsi as our pending buffer for bi_flush
	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]
	bi_flush

	mov	rdx, [r12+zlib_dstate_pending_ofs]
	add	qword [rbx+zlib_totalout_ofs], rdx
	mov	rsi, [r12+zlib_dstate_pending_out_ofs]
	mov	rdi, r15
	call	buffer$append

	; we know that will always succeed, so we can leave pending_out_ofs alone
	; and we can just clear pending entirely
	mov	qword [r12+zlib_dstate_pending_ofs], 0
	mov	eax, zlib_bstate_block_done
	jmp	.bstate_done


;--------------------------------------------------- deflate_huff ---------------------------------------------------
calign
.deflate_huff:
	; on entry:
	; r12 == our dstate, r13d == flush, r14 == inbuf, r15 == outbuf, rbx == z_stream, rdi == pending buffer, rsi == pending offset
	;
	; do I really need to even write this? none of my stuff uses it
	;
	; a note here: our strategy is fixed at zero, so during normal operations, these won't occur
	; (and only would if you are playing around with it)
	breakpoint


;--------------------------------------------------- deflate_rle ----------------------------------------------------
calign
.deflate_rle:
	; on entry:
	; r12 == our dstate, r13d == flush, r14 == inbuf, r15 == outbuf, rbx == z_stream, rdi == pending buffer, rsi == pending offset
	;
	; do I really need to even write this? none of my stuff uses it
	;
	; a note here: our strategy is fixed at zero, so during normal operations, these won't occur
	; (and only would if you are playing around with it)
	breakpoint

;
; for deflate levels in the FAST area, longest_match is not where most time is spent
; for deflate level == 9, this is where the most time gets spent.
;
; this is basically identical to the above, only we avoid byte compares where possible and do the outer loop
; with words only, and the inner loop in 8 byte all-at-once sections
;
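; the 8-at-a-time inner loop below is equivalent to this C sketch (not the
; reference implementation, which compares byte pairs):
;   uint64_t a = *(uint64_t *)match, b = *(uint64_t *)scan;
;   if (a != b) { scan += __builtin_ctzll(a ^ b) >> 3; break; }  /* step to the first differing byte */
;   match += 8; scan += 8;                                       /* all 8 equal, keep going */
;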

; TODO: rewrite me, optimize me, haha as this reference-method is a hideous mess
;

;--------------------------------------------------- longest_match --------------------------------------------------
falign
.longest_match:
	if profile_zlib_internals
		prolog	.longest_match
	end if
	; so on entry, rdi == cur_match (IPos), r12 == our dstate, r13d == flush, r14 == inbuf, r15 == outbuf, rbx == z_stream
	; we need to return match length
	push	rbx r13 r14 r15

	mov	rbx, [r12+zlib_dstate_window_ofs]
	mov	esi, dword [r12+zlib_dstate_strstart_ofs]
	mov	ecx, zlib_max_chain				; chain_length
	mov	r15, rbx					; window
	add	rbx, rsi					; scan


	mov	r14, rbx
	add	r14, 258					; s->window + s->strstart + MAX_MATCH == strend

	mov	edx, dword [r12+zlib_dstate_prev_length_ofs]	; best_len

	mov	rax, rbx
	mov	r8d, zlib_nice_length				; nice_match
	add	rax, rdx

	sub	rax, 1
	mov	r10d, esi
	
	xor	r11d, r11d
	sub	r10d, zlib_wsize - 262				; - (MAX_DIST(s))
	cmp	esi, zlib_wsize - 262
	cmova	r9d, r10d
	cmovbe	r9d, r11d					; limit

	movzx	r11d, word [rax]				; word in r11d covers both scan_end1 (scan[best_len-1]) and scan_end (scan[best_len])

	mov	eax, ecx
	mov	r10, [r12+zlib_dstate_prev_ofs]			; prev
	shr	eax, 2

	cmp	edx, zlib_good_length
	cmovae	ecx, eax

	mov	eax, dword [r12+zlib_dstate_lookahead_ofs]

	cmp	r8d, eax
	cmova	r8d, eax
calign
.longest_match_outer_loop:
	mov	rax, r15
	mov	r13, r15
	add	rax, rdi
	add	r13, rdi					; match = s->window + cur_match
	add	rax, rdx
	sub	rax, 1

	; rbx == scan, r13 == match, rax == &match[best_len-1]
	cmp	word [rax], r11w				; scan_end1 and scan_end
	jne	.longest_match_outer_next

	movzx	eax, word [r13]
	movzx	esi, word [rbx]
	cmp	ax, si
	jne	.longest_match_outer_next
	add	r13, 2
	add	rbx, 2
calign
.longest_match_inner_loop:
	add	r13, 1
	add	rbx, 1
	mov	rax, [r13]
	xor	rax, [rbx]
	jz	.longest_match_inner_nextq
	bsf	rax, rax
	shr	rax, 3			; the byte # that was different
	add	r13, rax
	add	rbx, rax
	jmp	.longest_match_inner_done
calign
.longest_match_inner_nextq:
	add	r13, 7
	add	rbx, 7
	cmp	rbx, r14					; scan < strend
	jb	.longest_match_inner_loop
	; special case here for when we actually DO run to the end, as there is no further byte checking to be done

	mov	rax, r14
	sub	rax, rbx
	mov	esi, 258
	sub	esi, eax					; len = (MAX_MATCH) - (int)(strend - scan)
	mov	rbx, r14
	sub	rbx, 258					; scan = strend - (MAX_MATCH)
	cmp	esi, edx					; len > best_len ?
	jbe	.longest_match_outer_next
	mov	dword [r12+zlib_dstate_match_start_ofs], edi	; s->match_start = cur_match
	mov	edx, esi					; best_len = len
	cmp	esi, r8d					; len >= nice_match
	jae	.longest_match_outer_done
	; else, scan_end1 = scan[best_len-1], scan_end = scan[best_len]
	movzx	r11d, word [rbx+rdx-1]
	jmp	.longest_match_outer_next
calign
.longest_match_inner_done:
	mov	rsi, rbx
	movzx	eax, byte [rbx]
	add	rsi, 1
	cmp	al, byte [r13]
	cmove	rbx, rsi
	mov	rax, r14
	sub	rax, rbx
	mov	esi, 258
	sub	esi, eax					; len = (MAX_MATCH) - (int)(strend - scan)
	mov	rbx, r14
	sub	rbx, 258					; scan = strend - (MAX_MATCH)
	cmp	esi, edx					; len > best_len ?
	jbe	.longest_match_outer_next

	mov	dword [r12+zlib_dstate_match_start_ofs], edi	; s->match_start = cur_match
	mov	edx, esi					; best_len = len
	cmp	esi, r8d					; len >= nice_match
	jae	.longest_match_outer_done
	; else, scan_end1 = scan[best_len-1], scan_end = scan[best_len]
	movzx	r11d, word [rbx+rdx-1]
calign
.longest_match_outer_next:
	and	edi, zlib_wmask					; cur_match & wmask
	movzx	edi, word [r10+rdi*2]				; prev[cur_match & wmask]
	cmp	edi, r9d					; > limit?
	jbe	.longest_match_outer_done
	sub	ecx, 1						; --chain_length != 0
	jnz	.longest_match_outer_loop
calign
.longest_match_outer_done:
	mov	ecx, dword [r12+zlib_dstate_lookahead_ofs]
	cmp	edx, ecx
	cmovbe	eax, edx
	cmova	eax, ecx
	pop	r15 r14 r13 rbx
if profile_zlib_internals
	epilog
else
	ret
end if




;--------------------------------------------------- fill_window ----------------------------------------------------
falign
.fill_window:
	if profile_zlib_internals
		prolog	.fill_window
	end if
	; all callee-saves are assumed to be valid, we blast pretty much everything else

	mov	edx, zlib_wsize shl 1
	mov	eax, [r12+zlib_dstate_strstart_ofs]
	sub	rdx, qword [r12+zlib_dstate_lookahead_ofs]
	sub	rdx, rax					; more == amount of space at the end of the window

	cmp	eax, zlib_wsize + (zlib_wsize - 262)
	; MAX_MATCH is 258
	; MIN_MATCH is 3
	; MIN_LOOKAHEAD = (MAX_MATCH + MIN_MATCH + 1) == 258 + 3 + 1 == 262
	; MAX_DIST(s) = (w_size - MIN_LOOKAHEAD)
	jl	.fill_window_upperhalf_okay

	; else, move the upper half to the lower one to make room in the upper half
	mov	rdi, [r12+zlib_dstate_window_ofs]
	mov	edx, zlib_wsize
	mov	rsi, rdi
	add	rsi, zlib_wsize					; window + w_size
	call	memcpy

if defined zlib_fillwindow_reference

	mov	edx, zlib_hashsize				; n
	mov	rsi, [r12+zlib_dstate_head_ofs]			; the actual head buffer
	xor	ecx, ecx
	sub	dword [r12+zlib_dstate_match_start_ofs], zlib_wsize
	sub	dword [r12+zlib_dstate_strstart_ofs], zlib_wsize
	sub	qword [r12+zlib_dstate_block_start_ofs], zlib_wsize
	mov	r8d, zlib_wsize

	; slide the hash table
	; we need the address of the word at head[n]
	lea	rsi, [rsi+rdx*2]				; p
calign
.fill_window_slide_loop:
	sub	rsi, 2
	movzx	eax, word [rsi]					; m = *--p
	mov	r9d, eax
	sub	r9d, r8d					; m - w_size
	cmp	eax, r8d
	cmovae	eax, r9d
	cmovb	eax, ecx
	mov	word [rsi], ax
	sub	edx, 1
	jnz	.fill_window_slide_loop

	mov	rsi, [r12+zlib_dstate_prev_ofs]
	mov	edx, r8d					; n = w_size
	; now we have to do the same with prev
	lea	rsi, [rsi+rdx*2]
calign
.fill_window_slide_prev_loop:
	sub	rsi, 2
	movzx	eax, word [rsi]					; m = *--p
	mov	r9d, eax
	sub	r9d, r8d					; m - w_size
	cmp	eax, r8d
	cmovae	eax, r9d
	cmovb	eax, ecx
	mov	word [rsi], ax
	sub	edx, 1
	jnz	.fill_window_slide_prev_loop

else

	; this does the same thing as above, cleaner/faster though

	; NOTE, haha, I used psubusw before I actually saw Intel's later patch to the C reference that does the same, hahah
	; good to know I picked the right way to do it
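	; per 16-bit lane, psubusw computes m = (m >= w_size) ? m - w_size : 0
	; (unsigned saturating subtract), which is exactly the cmovae/cmovb pair in the
	; reference loop above, eight entries per instruction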

	sub	dword [r12+zlib_dstate_match_start_ofs], zlib_wsize
	sub	dword [r12+zlib_dstate_strstart_ofs], zlib_wsize
	sub	qword [r12+zlib_dstate_block_start_ofs], zlib_wsize

	mov	rdi, [r12+zlib_dstate_prev_ofs]			; prev & head are adjacent, so we can do all of them in one pass, 16 bytes at a time
	movdqa	xmm8, dqword [.wsizeby8]
	mov	ecx, 1024
calign
.fill_window_slide_loop:
	movdqa	xmm0, [rdi]
	movdqa	xmm1, [rdi+16]
	movdqa	xmm2, [rdi+32]
	movdqa	xmm3, [rdi+48]
	movdqa	xmm4, [rdi+64]
	movdqa	xmm5, [rdi+80]
	movdqa	xmm6, [rdi+96]
	movdqa	xmm7, [rdi+112]
	psubusw	xmm0, xmm8
	psubusw	xmm1, xmm8
	psubusw	xmm2, xmm8
	psubusw	xmm3, xmm8
	psubusw	xmm4, xmm8
	psubusw xmm5, xmm8
	psubusw xmm6, xmm8
	psubusw	xmm7, xmm8
	movdqa	[rdi], xmm0
	movdqa	[rdi+16], xmm1
	movdqa	[rdi+32], xmm2
	movdqa	[rdi+48], xmm3
	movdqa	[rdi+64], xmm4
	movdqa	[rdi+80], xmm5
	movdqa	[rdi+96], xmm6
	movdqa	[rdi+112], xmm7
	add	rdi, 128
	sub	ecx, 1
	jnz	.fill_window_slide_loop
end if

	; reset more cuz we made calls out and blasted it anyway
	mov	edx, zlib_wsize shl 1
	sub	rdx, qword [r12+zlib_dstate_lookahead_ofs]
	sub	rdx, qword [r12+zlib_dstate_strstart_ofs]	; more == amount of space at the end of the window

	; fallthrough to fill_window_upperhalf_okay
calign
.fill_window_upperhalf_okay:

	cmp	qword [r14+buffer_user_ofs+8], 0		; # of bytes remaining to be processed in inbuf
	je	.fill_window_nothingtoread

	; else, we need to call read_buf to fill our window
	mov	rdi, [r12+zlib_dstate_window_ofs]
	mov	rsi, rdx					; more
	add	rdi, qword [r12+zlib_dstate_strstart_ofs]
	add	rdi, qword [r12+zlib_dstate_lookahead_ofs]
	call	.read_buf

	; rax now contains now much we read
	add	qword [r12+zlib_dstate_lookahead_ofs], rax
	mov	ecx, [r12+zlib_dstate_lookahead_ofs]
	add	ecx, dword [r12+zlib_dstate_insert_ofs]

	cmp	ecx, 3						; MIN_MATCH
	jb	.fill_window_readcheck
	; else, initialize the hash value now that we have some input
	mov	rdi, [r12+zlib_dstate_window_ofs]
	mov	rsi, [r12+zlib_dstate_head_ofs]
	mov	r8, [r12+zlib_dstate_prev_ofs]
	mov	r9d, [r12+zlib_dstate_strstart_ofs]
	sub	r9d, dword [r12+zlib_dstate_insert_ofs]	; str
	movzx	r10d, byte [rdi+r9]			; ins_h
	
	; s->ins_h = ((s->ins_h << s->hash_shift) ^ s->window[str+1]) & s->hash_mask
	shl	r10d, zlib_hashshift
	xor	r10b, byte [rdi+r9+1]
	and	r10d, zlib_hashmask
	
	mov	dword [r12+zlib_dstate_ins_h_ofs], r10d	; s->ins_h =
calign
.fill_window_readloop_hashinitloop:
	cmp	dword [r12+zlib_dstate_insert_ofs], 0
	je	.fill_window_readcheck

	shl	r10d, zlib_hashshift
	xor	r10b, byte [rdi+r9+2]
	and	r10d, zlib_hashmask

	mov	dword [r12+zlib_dstate_ins_h_ofs], r10d	; s->ins_h =

	; next is s->prev[str & s->w_mask] = s->head[s->ins_h]
	movzx	eax, word [rsi+r10*2]				; eax = s->head[s->ins_h]
	mov	r11d, r9d
	and	r11d, zlib_wmask
	mov	word [r8+r11*2], ax
	; next is: s->head[s->ins_h] = (Pos)str
	mov	word [rsi+r10*2], r9w
	add	r9d, 1
	sub	dword [r12+zlib_dstate_insert_ofs], 1
	mov	eax, [r12+zlib_dstate_lookahead_ofs]
	add	eax, dword [r12+zlib_dstate_insert_ofs]
	cmp	eax, 3
	jb	.fill_window_readcheck
	jmp	.fill_window_readloop_hashinitloop
align 16
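; 8 word lanes of w_size (0x8000 == 32768) for the psubusw slide above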
.wsizeby8	dw	0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000
calign
.fill_window_readcheck:
	mov	rax, [r12+zlib_dstate_lookahead_ofs]

	; while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0), jmp up to to top again
	cmp	dword [r12+zlib_dstate_lookahead_ofs], 262	; MIN_LOOKAHEAD
	jae	.fill_window_nothingtoread

	cmp	qword [r14+buffer_user_ofs+8], 0		; # of bytes remaining to be processed in inbuf
	jne	.fill_window
	; else, fallthrough, top while loop bit is done now
calign
.fill_window_nothingtoread:
	; top half read while loop done...
	mov	rax, [r12+zlib_dstate_high_water_ofs]		; high_water
	mov	ecx, zlib_wsize shl 1				; window_size
	cmp	rax, rcx
if profile_zlib_internals
	jae	.profiled_retonly
else
	jae	.retonly
end if
	mov	edx, [r12+zlib_dstate_strstart_ofs]		; curr = strstart
	add	rdx, qword [r12+zlib_dstate_lookahead_ofs]	; + lookahead
	cmp	rax, rdx
	jae	.fill_window_highwater_checktwo

	; previous high water mark below current data -- zero WIN_INIT
	; bytes or up to the end of the window, whichever is less
	mov	r9, rdx
	sub	rcx, rdx
	mov	r8, 258						; MAX_MATCH (aka WIN_INIT)
	cmp	rcx, r8
	cmova	rcx, r8
	add	r9, rcx
	mov	rdi, [r12+zlib_dstate_window_ofs]
	add	rdi, rdx
	xor	esi, esi
	mov	rdx, rcx
	; high_water = curr + init
	mov	[r12+zlib_dstate_high_water_ofs], r9
	call	memset
if profile_zlib_internals
	epilog
else
	ret
end if
calign
.fill_window_highwater_checktwo:
	; high_water still sitting in rax
	; curr still sitting in rdx
	add	rdx, 258					; MAX_MATCH (aka WIN_INIT)
	cmp	rax, rdx
if profile_zlib_internals
	jae	.profiled_retonly
else
	jae	.retonly
end if

	; else, high_water < curr + WIN_INIT
	sub	rdx, rax					; init = curr + WIN_INIT - high_water
	; window_size still in rcx, sub high_water from it and boundscheck
	sub	rcx, rax
	cmp	rdx, rcx
	cmova	rdx, rcx
	; we have to add init to high_water
	mov	rdi, [r12+zlib_dstate_window_ofs]
	add	rdi, rax
	xor	esi, esi
	add	qword [r12+zlib_dstate_high_water_ofs], rdx
	call	memset
if profile_zlib_internals
	epilog
else
	ret
end if

if profile_zlib_internals
calign
.profiled_retonly:
	epilog
end if




;--------------------------------------------------- read_buf -------------------------------------------------------
calign
.read_buf:
	if profile_zlib_internals
		prolog	.read_buf
	end if
	; arguments: rdi == destination buffer, rsi == size of spot to put it, all callee-saves are assumed to be valid
	; we'll consume from r14, return in rax/eax, r12 is still valid dstate
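	; for reference, read_buf in zlib 1.2.8 deflate.c does roughly:
	;   len = MIN(strm->avail_in, size);
	;   if (len == 0) return 0;
	;   zmemcpy(buf, strm->next_in, len);
	;   if (wrap == 1)      strm->adler = adler32(strm->adler, buf, len);
	;   else if (wrap == 2) strm->adler = crc32(strm->adler, buf, len);
	;   strm->next_in += len; strm->avail_in -= len; strm->total_in += len;
	; here the jump table below dispatches on wrap instead of branching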
	xor	eax, eax
	mov	rdx, [r14+buffer_user_ofs+8]	; # of bytes remaining to be processed in inbuf
	cmp	rdx, rsi
	cmova	rdx, rsi
	test	rdx, rdx
if profile_zlib_internals
	jz	.profiled_retonly
else
	jz	.retonly
end if
	sub	rsp, 16
	mov	rsi, [r14+buffer_user_ofs]	; current pointer into the inbuf
	mov	[rsp], rdi
	mov	[rsp+8], rdx
	call	memcpy
	mov	eax, [r12+zlib_dstate_wrap_ofs]
	mov	rdi, [rbx+zlib_adler_ofs]
	mov	rsi, [rsp]
	mov	rdx, [rsp+8]
	jmp	qword [rax*8+.read_buf_wrapjump]
dalign
.read_buf_wrapjump:
	dq	.read_buf_nowrap, .read_buf_zwrap, .read_buf_gzwrap
calign
.read_buf_zwrap:
	call	adler32
	mov	[rbx+zlib_adler_ofs], rax
	mov	rax, [rsp+8]
	add	qword [r14+buffer_user_ofs], rax
	sub	qword [r14+buffer_user_ofs+8], rax
	add	qword [rbx+zlib_totalin_ofs], rax
	add	rsp, 16
if profile_zlib_internals
	epilog
else
	ret
end if
calign
.read_buf_gzwrap:
	call	crc$32
	mov	[rbx+zlib_adler_ofs], rax
	mov	rax, [rsp+8]
	add	qword [r14+buffer_user_ofs], rax
	sub	qword [r14+buffer_user_ofs+8], rax
	add	qword [rbx+zlib_totalin_ofs], rax
	add	rsp, 16
if profile_zlib_internals
	epilog
else
	ret
end if
.read_buf_nowrap:
	; else, no wrap
	add	qword [rbx+zlib_totalin_ofs], rdx
	; rdx has the # of bytes we consumed
	add	qword [r14+buffer_user_ofs], rdx	; increment the current pointer by how many we read
	sub	qword [r14+buffer_user_ofs+8], rdx	; decrement the remaining bytes available by how many we read
	mov	rax, rdx
	add	rsp, 16
if profile_zlib_internals
	epilog
else
	ret
end if
calign
.retonly:
	; common jump point for when we just want a ret and nothing else
	ret

;--------------------------------------------------- tr_flush_block -------------------------------------------------
falign
.tr_flush_block:
	; callee-saved are presumed good, args are: rdi == buf, rsi == stored_len, edx == last (bool)
	if profile_zlib_internals
		prolog	.tr_flush_block
	end if
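	; decision logic per the reference _tr_flush_block (trees.c, zlib 1.2.8), in brief:
	;   if (s->level > 0) {
	;       build_tree(s, &s->l_desc); build_tree(s, &s->d_desc);
	;       max_blindex = build_bl_tree(s);
	;       opt_lenb = (s->opt_len+3+7)>>3; static_lenb = (s->static_len+3+7)>>3;
	;       if (static_lenb <= opt_lenb) opt_lenb = static_lenb;
	;   } else opt_lenb = static_lenb = stored_len + 5;
	;   if (stored_len+4 <= opt_lenb && buf != NULL)              /* store raw */
	;   else if (strategy == Z_FIXED || static_lenb == opt_lenb)  /* static trees */
	;   else                                                      /* dynamic trees */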

	mov	r8, rsi		; opt_lenb
	mov	r9, rsi		; static_lenb
	xor	r10d, r10d	; max_blindex
	add	r8, 5
	add	r9, 5

	cmp	dword [r12+zlib_dstate_level_ofs], 0
	je	.tr_flush_block_notrees

	; save our args
	push	rdi rsi rdx
	; verify that our datatype has been set
	call	.maybe_set_data_type
	; build_tree(s, (tree_desc *)(&(s->l_desc)))
	lea	rdi, [r12+zlib_dstate_l_desc_ofs]
	call	.build_tree

	mov	r8, [r12+zlib_dstate_opt_len_ofs]
	mov	r9, [r12+zlib_dstate_static_len_ofs]

	; build_tree(s, (tree_desc *)(&(s->d_desc)))
	lea	rdi, [r12+zlib_dstate_d_desc_ofs]
	call	.build_tree

	mov	r8, [r12+zlib_dstate_opt_len_ofs]
	mov	r9, [r12+zlib_dstate_static_len_ofs]

	; max_blindex = build_bl_tree(s)
	call	.build_bl_tree
	mov	r10d, eax	; max_blindex

	; opt_lenb = (s->opt_len+3+7)>>3
	mov	r8, [r12+zlib_dstate_opt_len_ofs]
	add	r8, 10
	shr	r8, 3

	; static_lenb = (s->static_len+3+7)>>3
	mov	r9, [r12+zlib_dstate_static_len_ofs]
	add	r9, 10
	shr	r9, 3

	; if static_lenb <= opt_lenb then opt_lenb = static_lenb
	cmp	r9, r8
	cmovl	r8, r9

	pop	rdx rsi rdi
	; fallthrough to _tr_flush_block_notrees
calign
.tr_flush_block_notrees:

	mov	rax, rsi
	add	rax, 4
	cmp	rax, r8
	ja	.tr_flush_block_notrees_notstored
	test	rdi, rdi
	jz	.tr_flush_block_notrees_notstored

	push	rdi			; save the buffer, because send_bits_lit blasts r8, r9

	mov	r10, rsi		; save them temporarily, because send_bits_lit needs them to be pointing to our pending buffer goods
	mov	r11, rdx

	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]

	mov	eax, edx		; (STORED_BLOCK << 1) + last (STORED_BLOCK == 0)
	send_bits_lit rax, 3

	; copy_block(buf, stored_len, 1) is next
	bi_windup
	mov	eax, r10d
	mov	ecx, r10d
	not	eax
	mov	word [rdi+rsi], cx
	mov	word [rdi+rsi+2], ax
	add	rsi, 4
	; done below: mov	[r12+zlib_dstate_pending_ofs], rsi

	; preserve last across call to memcpy
	lea	rdi, [rdi+rsi]
	add	rsi, r10
	mov	[r12+zlib_dstate_pending_ofs], rsi
	mov	rdx, r10
	mov	rsi, [rsp]
	push	r11
	call	memcpy

	; init_block is next
	; L_CODES == 286
	; D_CODES == 30
	; BL_CODES == 19
	lea	rdi, [r12+zlib_dstate_dyn_ltree_ofs]
	lea	rsi, [r12+zlib_dstate_dyn_dtree_ofs]
	lea	rdx, [r12+zlib_dstate_bl_tree_ofs]
	mov	ecx, 19
calign
.initblock1:
	mov	word [rdi], 0
	mov	word [rsi], 0
	mov	word [rdx], 0
	add	rdi, 4
	add	rsi, 4
	add	rdx, 4
	sub	ecx, 1
	jnz	.initblock1
	mov	ecx, 11
calign
.initblock2:
	mov	word [rdi], 0
	mov	word [rsi], 0
	add	rdi, 4
	add	rsi, 4
	sub	ecx, 1
	jnz	.initblock2
	mov	ecx, 256
calign
.initblock3:
	mov	word [rdi], 0
	add	rdi, 4
	sub	ecx, 1
	jnz	.initblock3

	xor	eax, eax
	pop	r11
	add	rsp, 8			; undo the previous buffer store, we aren't interested in it anymore
	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]

	mov	word [r12+zlib_dstate_dyn_ltree_ofs + 256*4], 1	; dyn_ltree[END_BLOCK].Freq = 1
	mov	[r12+zlib_dstate_opt_len_ofs], rax
	mov	[r12+zlib_dstate_static_len_ofs], rax
	mov	[r12+zlib_dstate_last_lit_ofs], eax
	mov	[r12+zlib_dstate_matches_ofs], eax
	; if last then bi_windup
	test	r11d, r11d
if profile_zlib_internals
	jz	.profiled_retonly
else
	jz	.retonly
end if
	bi_windup
if profile_zlib_internals
	epilog
else
	ret
end if
calign
.tr_flush_block_notrees_notstored:
	; if (strategy == Z_FIXED || static_lenb == opt_lenb)
	cmp	dword [r12+zlib_dstate_strategy_ofs], 4		; Z_FIXED
	je	.tr_flush_block_notrees_static
	cmp	r9, r8
	je	.tr_flush_block_notrees_static
	sub	rsp, 56
	mov	[rsp], rdi
	mov	[rsp+8], rsi
	mov	[rsp+16], rdx
	mov	[rsp+24], r13
	mov	[rsp+32], r14
	mov	[rsp+40], r10				; max_blindex
	mov	[rsp+48], r15

	lea	r13, [r12+zlib_dstate_l_desc_ofs]
	lea	r14, [r12+zlib_dstate_d_desc_ofs]

	; send_bits( (DYN_TREES << 1) + last, 3)	sends block type
	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]

	mov	eax, 4
	add	eax, edx		; (DYN_TREES << 1) + last
	send_bits_lit rax, 3

	; send_all_trees(s->l_desc.max_code+1, s->d_desc.max_code+1, max_blindex+1) is next
	mov	eax, [r13+zlib_tdesc_max_code_ofs]	; hmmm
	sub	eax, 256
	send_bits_lit rax, 5				; send_bits(s, lcodes-257, 5) (-256 here cuz we skipped the +1 arg)
	mov	eax, [r14+zlib_tdesc_max_code_ofs]	; hmmm
	send_bits_lit rax, 5				; send_bits(s, dcodes-1, 5) (no minus here cuz we skipped the +1 arg)
	mov	rax, [rsp+40]				; max_blindex
	sub	eax, 3
	send_bits_lit rax, 4				; send_bits(s, blcodes-4, 4) (-3 here cuz we skipped the +1 arg)

	xor	r13d, r13d				; rank
	mov	r14, [rsp+40]				; max_blindex
	lea	r15, [r12+zlib_dstate_bl_tree_ofs]
calign
.tr_flush_block_notrees_notstored_loop1:
	mov	ecx, [r13*4+.bl_order]
	movzx	eax, word [r15+rcx*4+2]
	send_bits_lit rax, 3				; send_bits(s->bl_tree[bl_order[rank]].Len, 3)
	add	r13d, 1
	cmp	r13d, r14d
	jle	.tr_flush_block_notrees_notstored_loop1

	; we'll leave rdi and rsi pointing to our pending buffer
	lea	r13, [r12+zlib_dstate_l_desc_ofs]
	lea	r14, [r12+zlib_dstate_d_desc_ofs]
	
	; send_tree(s->dyn_ltree, lcodes-1) is next
	lea	rdx, [r12+zlib_dstate_dyn_ltree_ofs]
	mov	ecx, [r13+zlib_tdesc_max_code_ofs]
	; so now, rdi is pending buffer, rsi is pending offset, rdx is the dyn_tree, ecx is the count
	call	.send_tree

	lea	rdx, [r12+zlib_dstate_dyn_dtree_ofs]
	mov	ecx, [r14+zlib_tdesc_max_code_ofs]
	call	.send_tree

	; compress_block(s, (const ct_data *)s->dyn_ltree, (const ct_data *)s->dyn_dtree) is next
	; we'll leave rdi and rsi pointing to our pending buffer
	lea	rdx, [r12+zlib_dstate_dyn_ltree_ofs]
	lea	rcx, [r12+zlib_dstate_dyn_dtree_ofs]
	call	.compress_block

	; then:
	; init_block is next
	lea	rdi, [r12+zlib_dstate_dyn_ltree_ofs]
	lea	rsi, [r12+zlib_dstate_dyn_dtree_ofs]
	lea	rdx, [r12+zlib_dstate_bl_tree_ofs]
	mov	ecx, 19
calign
.initblock1a:
	mov	word [rdi], 0
	mov	word [rsi], 0
	mov	word [rdx], 0
	add	rdi, 4
	add	rsi, 4
	add	rdx, 4
	sub	ecx, 1
	jnz	.initblock1a
	mov	ecx, 11
calign
.initblock2a:
	mov	word [rdi], 0
	mov	word [rsi], 0
	add	rdi, 4
	add	rsi, 4
	sub	ecx, 1
	jnz	.initblock2a
	mov	ecx, 256
calign
.initblock3a:
	mov	word [rdi], 0
	add	rdi, 4
	sub	ecx, 1
	jnz	.initblock3a

	xor	eax, eax

	mov	r11, [rsp+16]
	mov	r13, [rsp+24]
	mov	r14, [rsp+32]
	mov	r15, [rsp+48]
	add	rsp, 56

	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]
	mov	word [r12+zlib_dstate_dyn_ltree_ofs + 256*4], 1	; dyn_ltree[END_BLOCK].Freq = 1
	mov	[r12+zlib_dstate_opt_len_ofs], rax
	mov	[r12+zlib_dstate_static_len_ofs], rax
	mov	[r12+zlib_dstate_last_lit_ofs], eax
	mov	[r12+zlib_dstate_matches_ofs], eax
	; if last then bi_windup
	test	r11d, r11d
if profile_zlib_internals
	jz	.profiled_retonly
else
	jz	.retonly
end if
	bi_windup
if profile_zlib_internals
	epilog
else
	ret
end if
calign
.tr_flush_block_notrees_static:
	; send_bits( (STATIC_TREES << 1) + last, 3)	sends block type

	mov	r11, rdx

	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]

	mov	eax, 2
	add	eax, edx		; (STATIC_TREES << 1) + last
	send_bits_lit rax, 3

	push	r11

	; compress_block(s, (const ct_data *)zlib_static_ltree, (const ct_data *)zlib_static_dtree) is next
	; we'll leave rdi and rsi pointing to our pending buffer
	mov	rdx, zlib_static_ltree
	mov	rcx, zlib_static_dtree
	call	.compress_block

	; then:
	; init_block is next
	lea	rdi, [r12+zlib_dstate_dyn_ltree_ofs]
	lea	rsi, [r12+zlib_dstate_dyn_dtree_ofs]
	lea	rdx, [r12+zlib_dstate_bl_tree_ofs]
	mov	ecx, 19
calign
.initblock1b:
	mov	word [rdi], 0
	mov	word [rsi], 0
	mov	word [rdx], 0
	add	rdi, 4
	add	rsi, 4
	add	rdx, 4
	sub	ecx, 1
	jnz	.initblock1b
	mov	ecx, 11
calign
.initblock2b:
	mov	word [rdi], 0
	mov	word [rsi], 0
	add	rdi, 4
	add	rsi, 4
	sub	ecx, 1
	jnz	.initblock2b
	mov	ecx, 256
calign
.initblock3b:
	mov	word [rdi], 0
	add	rdi, 4
	sub	ecx, 1
	jnz	.initblock3b

	xor	eax, eax
	pop	r11
	mov	rdi, [r12+zlib_dstate_pending_buf_ofs]
	mov	rsi, [r12+zlib_dstate_pending_ofs]
	mov	word [r12+zlib_dstate_dyn_ltree_ofs + 256*4], 1	; dyn_ltree[END_BLOCK].Freq = 1
	mov	[r12+zlib_dstate_opt_len_ofs], rax
	mov	[r12+zlib_dstate_static_len_ofs], rax
	mov	[r12+zlib_dstate_last_lit_ofs], eax
	mov	[r12+zlib_dstate_matches_ofs], eax
	; if last then bi_windup
	test	r11d, r11d
if profile_zlib_internals
	jz	.profiled_retonly
else
	jz	.retonly
end if
	bi_windup
if profile_zlib_internals
	epilog
else
	ret
end if


	; send_code(s, c, tree) == send_bits(s, tree[c].Code, tree[c].Len)
;--------------------------------------------------- compress_block -------------------------------------------------
falign
.compress_block:
	if profile_zlib_internals
		prolog	.compress_block
	end if
	; rdi == pending buffer
	; rsi == pending offset
	; rdx == ltree
	; rcx == dtree

	; so... we can safely hang onto r10, r11 cuz send_bits doesn't chew it
	; but we'll need the rest of our callee-saves, I think
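	; reference compress_block (trees.c) walks d_buf/l_buf in lockstep: dist == 0
	; means a literal; otherwise lc is match length - MIN_MATCH and the pair is:
	;   send_code(s, _length_code[lc]+LITERALS+1, ltree);  /* + extra length bits */
	;   send_code(s, d_code(dist-1), dtree);               /* + extra distance bits */
	; finishing with send_code(s, END_BLOCK, ltree)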
	cmp	dword [r12+zlib_dstate_last_lit_ofs], 0
	je	.compress_block_endonly
	sub	rsp, 48
	mov	[rsp], rbx
	mov	[rsp+8], r13
	mov	[rsp+16], r14
	mov	[rsp+24], r15

	mov	r14, rdx		; ltree
	mov	r15, rcx		; dtree
	xor	ebx, ebx		; lx
calign
.compress_block_loop:
	; dbuf and lbuf are both pointers
	mov	rdx, [r12+zlib_dstate_d_buf_ofs]
	mov	rcx, [r12+zlib_dstate_l_buf_ofs]
	; r8d == dist == d_buf
	; eax == lc == l_buf
	movzx	r8d, word [rdx+rbx*2]
	movzx	eax, byte [rcx+rbx]

	add	ebx, 1
	test	r8d, r8d
	jz	.compress_block_literal
	; because we are register starved here, save dist and lc on the stack
	mov	dword [rsp+32], r8d		; dist
	mov	dword [rsp+40], eax		; lc
	; he say:
	; Here, lc is the match length - MIN_MATCH (3)
	movzx	r13d, byte [rax+zlib_length_code]	; code = zlib_length_code[lc]
	mov	edx, r13d
	add	edx, 257			; LITERALS + 1

	mov	eax, dword [r14+rdx*4]		; get the tree entry
	mov	r10d, eax
	and	eax, 0xffff
	shr	r10d, 16
	send_bits rax, r10d			; send_code(code+LITERALS+1, ltree)
	
	mov	r10d, dword [r13*4+extra_lbits]
	test	r10d, r10d
	jz	.compress_block_loop_noextra
	mov	eax, dword [rsp+40]		; lc
	sub	eax, dword [r13*4+zlib_base_length]
	mov	dword [rsp+40], eax
	
	send_bits rax, r10d			; send_bits(lc, extra)
calign
.compress_block_loop_noextra:
	mov	r8d, dword [rsp+32]
	sub	r8d, 1
	mov	dword [rsp+32], r8d		; dist--

	; we need code = d_code(dist)
	; d_code(dist) == ((dist) < 256 ? _zlib_dist_code[dist] : _zlib_dist_code[256+((dist)>>7)]

	; dist is in r8d
	mov	r9d, r8d
	shr	r9d, 7
	add	r9d, 256
	; 256+(dist>>7)

	cmp	r8d, 256
	cmovb	r9d, r8d
	movzx	r13d, byte [r9+zlib_dist_code]

	; so now r13d (code) is d_code(dist)
	; now we need to send_code(code, dtree) (dtree in r15)
	mov	eax, dword [r15+r13*4]
	mov	r10d, eax
	and	eax, 0xffff
	shr	r10d, 16
	send_bits rax, r10d

	; next up: extra = zlib_extra_dbits[code]
	mov	r10d, [r13*4+zlib_extra_dbits]
	test	r10d, r10d
	jz	.compress_block_donext
	
	; otherwise, extra != 0, so dist -= zlib_base_dist[code]
	; and then send_bits dist, extra
	mov	eax, dword [rsp+32]		; dist
	sub	eax, dword [r13*4+zlib_base_dist]	; -= zlib_base_dist[code]
	send_bits rax, r10d			; send_bits(dist, extra)

	; and donext (copy of to avoid nop fill)
	cmp	ebx, dword [r12+zlib_dstate_last_lit_ofs]
	jb	.compress_block_loop
	; else, send the END_BLOCK code and be done
	mov	eax, dword [r14+1024]
	mov	r10d, eax
	and	eax, 0xffff
	shr	r10d, 16
	send_bits rax, r10d

	mov	rbx, [rsp]
	mov	r13, [rsp+8]
	mov	r14, [rsp+16]
	mov	r15, [rsp+24]
	add	rsp, 48
if profile_zlib_internals
	epilog
else
	ret
end if
calign
.compress_block_donext:
	cmp	ebx, dword [r12+zlib_dstate_last_lit_ofs]
	jb	.compress_block_loop
	; else, send the END_BLOCK code and be done
	mov	eax, dword [r14+1024]
	mov	r10d, eax
	and	eax, 0xffff
	shr	r10d, 16
	send_bits rax, r10d

	mov	rbx, [rsp]
	mov	r13, [rsp+8]
	mov	r14, [rsp+16]
	mov	r15, [rsp+24]
	add	rsp, 48
if profile_zlib_internals
	epilog
else
	ret
end if
calign
.compress_block_literal:
	; send_code(s, lc, ltree)
	mov	eax, dword [r14+rax*4]
	mov	r10d, eax
	and	eax, 0xffff
	shr	r10d, 16
	send_bits rax, r10d
	cmp	ebx, dword [r12+zlib_dstate_last_lit_ofs]
	jb	.compress_block_loop
	; else, send the END_BLOCK code and be done
	mov	eax, dword [r14+1024]
	mov	r10d, eax
	and	eax, 0xffff
	shr	r10d, 16
	send_bits rax, r10d

	mov	rbx, [rsp]
	mov	r13, [rsp+8]
	mov	r14, [rsp+16]
	mov	r15, [rsp+24]
	add	rsp, 48
if profile_zlib_internals
	epilog
else
	ret
end if

calign
.compress_block_endonly:
	; not part of the loop, jumps here only if last_lit was zero on entry, hence no stack cleanup/mods
	mov	eax, dword [rdx+1024]			; tree[END_BLOCK] dword (END_BLOCK == 256)
	mov	r10d, eax
	and	eax, 0xffff
	shr	r10d, 16
	send_bits rax, r10d
if profile_zlib_internals
	epilog
else
	ret
end if




	; send_code(s, c, tree) == send_bits(s, tree[c].Code, tree[c].Len)
;--------------------------------------------------- send_tree ------------------------------------------------------
falign
.send_tree:
	if profile_zlib_internals
		prolog	.send_tree
	end if
	; rdi == pending buffer
	; rsi == pending offset
	; rdx == tree
	; ecx == max_code
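	; run-length cases per the reference send_tree (trees.c, zlib 1.2.8):
	;   if (count < min_count)  do { send_code(s, curlen, s->bl_tree); } while (--count);
	;   else if (curlen != 0) { if (curlen != prevlen) { send_code(s, curlen, ...); count--; }
	;                           send_code(s, REP_3_6, ...);     send_bits(s, count-3, 2); }
	;   else if (count <= 10) { send_code(s, REPZ_3_10, ...);   send_bits(s, count-3, 3); }
	;   else                  { send_code(s, REPZ_11_138, ...); send_bits(s, count-11, 7); }
	; each case below ends with a copy of the common "resetcount" sequence, duplicated
	; to avoid extra jumps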

	movzx	eax, word [rdx+2]			; tree[0].Len

	sub	rsp, 72
	mov	[rsp], rbx
	mov	[rsp+8], r13
	mov	[rsp+16], r14
	mov	[rsp+24], r15
	mov	[rsp+32], rbp
	mov	dword [rsp+40], -1			; prevlen
	mov	dword [rsp+48], eax			; nextlen

	xor	ebp, ebp				; n
	mov	ebx, ecx				; max_code
	xor	r13d, r13d				; count
	mov	r14, rdx				; tree
	lea	r15, [r12+zlib_dstate_bl_tree_ofs]	; bl_tree
	
	; r11d == curlen, [rsp+40] == prevlen, [rsp+48] == nextlen, [rsp+56] == max_count, [rsp+64] == min_count
	mov	ecx, 7
	mov	r8d, 4
	mov	r9d, 138
	mov	r10d, 3
	test	eax, eax				; nextlen == 0?
	cmovz	ecx, r9d
	cmovz	r8d, r10d
	mov	dword [rsp+56], ecx			; max_count
	mov	dword [rsp+64], r8d			; min_count
calign
.send_tree_loop:
	cmp	ebp, ebx				; n > max_code?
	jg	.send_tree_loop_alldone
	add	ebp, 1					; n++
	movzx	eax, word [r14+rbp*4+2]			; tree[n].Len
	mov	r11d, dword [rsp+48]			; curlen = nextlen
	mov	dword [rsp+48], eax			; nextlen = tree[n].Len
	add	r13d, 1					; ++count
	cmp	r11d, eax				; curlen == nextlen?
	jne	.send_tree_loop_topcase1
	; otherwise, curlen == nextlen, so check if count < max_count
	cmp	r13d, dword [rsp+56]
	jl	.send_tree_loop				; yep, continue
calign
.send_tree_loop_topcase1:
	cmp	r13d, dword [rsp+64]			; count < min_count ?
	jl	.send_tree_loop_case1
	test	r11d, r11d				; curlen != 0 ?
	jnz	.send_tree_loop_case2
	cmp	r13d, 10				; count <= 10 ?
	jle	.send_tree_loop_case3	

	; last else, so:
	; send_code(REPZ_11_138, s->bl_tree);		REPZ_11_138 == 18, so we need to load the full code from r15+18*4(72)
	mov	eax, dword [r15+72]
	mov	r10d, eax
	and	eax, 0xffff
	shr	r10d, 16
	send_bits rax, r10d
	; send_bits(s, count-11, 7);
	mov	eax, r13d
	sub	eax, 11
	send_bits_lit rax, 7

	; resetcount:
	xor	r13d, r13d				; count = 0
	mov	dword [rsp+40], r11d			; prevlen = curlen
	cmp	dword [rsp+48], 0			; nextlen == 0?
	je	.send_tree_loop_case4
	cmp	r11d, dword [rsp+48]			; curlen == nextlen
	je	.send_tree_loop_case5
	; else, max_count = 7, min_count = 4
	mov	dword [rsp+56], 7
	mov	dword [rsp+64], 4
	jmp	.send_tree_loop
calign
.send_tree_loop_case1:
	; count < min_count
	; which says: do { send_code(curlen, s->bl_tree); } while (--count != 0);
	mov	eax, dword [r15+r11*4]
	mov	r10d, eax
	and	eax, 0xffff
	shr	r10d, 16
	send_bits rax, r10d
	sub	r13d, 1
	jnz	.send_tree_loop_case1

	; resetcount:
	xor	r13d, r13d				; count = 0
	mov	dword [rsp+40], r11d			; prevlen = curlen
	cmp	dword [rsp+48], 0			; nextlen == 0?
	je	.send_tree_loop_case4
	cmp	r11d, dword [rsp+48]			; curlen == nextlen
	je	.send_tree_loop_case5
	; else, max_count = 7, min_count = 4
	mov	dword [rsp+56], 7
	mov	dword [rsp+64], 4
	jmp	.send_tree_loop
calign
.send_tree_loop_case2:
	; curlen != 0
	; which says: if (curlen != prevlen) { send_code(curlen, s->bl_tree); count--; }
	;             send_code(REP_3_6, s->bl_tree); send_bits(count-3, 2);
	cmp	r11d, dword [rsp+40]				; curlen != prevlen?
	jne	.send_tree_loop_case2_sub
	mov	eax, dword [r15+64]			; REP_3_6 == 16 * 4 == 64
	mov	r10d, eax
	and	eax, 0xffff
	shr	r10d, 16
	send_bits rax, r10d				; send_code(REP_3_6, bl_tree)
	mov	eax, r13d
	sub	eax, 3
	send_bits_lit rax, 2

	; resetcount:
	xor	r13d, r13d				; count = 0
	mov	dword [rsp+40], r11d			; prevlen = curlen
	cmp	dword [rsp+48], 0			; nextlen == 0?
	je	.send_tree_loop_case4
	cmp	r11d, dword [rsp+48]			; curlen == nextlen
	je	.send_tree_loop_case5
	; else, max_count = 7, min_count = 4
	mov	dword [rsp+56], 7
	mov	dword [rsp+64], 4
	jmp	.send_tree_loop
calign
.send_tree_loop_case2_sub:
	; same as above, only we do the extra step of sending curlen code first and decrementing count
	mov	eax, dword [r15+r11*4]
	mov	r10d, eax
	and	eax, 0xffff
	shr	r10d, 16
	send_bits rax, r10d				; send_code(curlen, bl_tree)
	sub	r13d, 1					; count--

	; copy of above:
	mov	eax, dword [r15+64]			; REP_3_6 == 16 * 4 == 64
	mov	r10d, eax
	and	eax, 0xffff
	shr	r10d, 16
	send_bits rax, r10d				; send_code(REP_3_6, bl_tree)
	mov	eax, r13d
	sub	eax, 3
	send_bits_lit rax, 2

	; resetcount:
	xor	r13d, r13d				; count = 0
	mov	dword [rsp+40], r11d			; prevlen = curlen
	cmp	dword [rsp+48], 0			; nextlen == 0?
	je	.send_tree_loop_case4
	cmp	r11d, dword [rsp+48]			; curlen == nextlen
	je	.send_tree_loop_case5
	; else, max_count = 7, min_count = 4
	mov	dword [rsp+56], 7
	mov	dword [rsp+64], 4
	jmp	.send_tree_loop
calign
.send_tree_loop_case3:
	; count <= 10
	; which says: send_code(REPZ_3_10, s->bl_tree); send_bits(count-3, 3);
	mov	eax, dword [r15+68]			; REPZ_3_10 == 17 * 4 == 68
	mov	r10d, eax
	and	eax, 0xffff
	shr	r10d, 16
	send_bits rax, r10d				; send_code(REPZ_3_10, bl_tree)
	mov	eax, r13d
	sub	eax, 3
	send_bits_lit rax, 3

	; resetcount:
	xor	r13d, r13d				; count = 0
	mov	dword [rsp+40], r11d			; prevlen = curlen
	cmp	dword [rsp+48], 0			; nextlen == 0?
	je	.send_tree_loop_case4
	cmp	r11d, dword [rsp+48]			; curlen == nextlen
	je	.send_tree_loop_case5
	; else, max_count = 7, min_count = 4
	mov	dword [rsp+56], 7
	mov	dword [rsp+64], 4
	jmp	.send_tree_loop
calign
.send_tree_loop_case4:
	; max_count = 138, min_count = 3
	mov	dword [rsp+56], 138
	mov	dword [rsp+64], 3
	jmp	.send_tree_loop
calign
.send_tree_loop_case5:
	; max_count = 6, min_count = 3
	mov	dword [rsp+56], 6
	mov	dword [rsp+64], 3
	jmp	.send_tree_loop
calign
.send_tree_loop_alldone:
	mov	rbx, [rsp]
	mov	r13, [rsp+8]
	mov	r14, [rsp+16]
	mov	r15, [rsp+24]
	mov	rbp, [rsp+32]
	add	rsp, 72
if profile_zlib_internals
	epilog
else
	ret
end if






;--------------------------------------------------- build_tree -----------------------------------------------------
falign
.build_tree:
	if profile_zlib_internals
		prolog	.build_tree
	end if
	push	rbx r13 r14 r15				; we need a few more temporaries here
	mov	rsi, [rdi+zlib_tdesc_dyn_tree_ofs]	; desc->dyn_tree
	mov	rdx, [rdi+zlib_tdesc_stat_desc_ofs]	; desc->stat_desc
	; stat_desc offsets:
	; static_tree = ofs 0
	; extra_bits = ofs 8
	; extra_base = ofs 16
	; elems = ofs 24
	; max_length = ofs 32
	mov	rcx, [rdx]				; static_tree
	mov	r8d, [rdx+24]				; elems
	xor	r9d, r9d				; n
	xor	r10d, r10d
	mov	edx, -1					; max_code
	; r10d == m, r11d == node

	; we'll use r10d as our heap_len, and r11 temporary as a pointer to the heap
	lea	r11, [r12+zlib_dstate_heap_ofs]
	
	mov	dword [r12+zlib_dstate_heap_len_ofs], r9d
	mov	dword [r12+zlib_dstate_heap_max_ofs], 573	; HEAP_MAX
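	; build_tree sketch per the reference (trees.c): push every n with Freq != 0
	; onto the heap (others get Len = 0); force at least two codes so the decoder
	; always has a valid tree; heapify via pqdownheap for n = heap_len/2 .. 1; then
	; repeatedly remove the two smallest nodes n and m and create a father node
	; with Freq = tree[n].Freq + tree[m].Freq until one element remains; finish
	; with gen_bitlen and gen_codes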


calign
.build_tree_loop1:
	cmp	r9d, r8d				; n >= elems? -> done scanning freqs
	jae	.build_tree_loop2

	cmp	word [rsi+r9*4], 0
	jne	.build_tree_loop1_freq
	mov	word [rsi+r9*4+2], 0			; tree[n].Len = 0
	add	r9d, 1
	jmp	.build_tree_loop1
calign
.build_tree_loop1_freq:
	add	r10d, 1					; heap_len++
	movzx	eax, word [rsi+r9*4]
	mov	edx, r9d				; max_code = n
	mov	dword [r11+r10*4], r9d			; heap[heap_len] = n
	mov	byte [r12+r9+zlib_dstate_depth_ofs], 0	; s->depth[n] = 0
	add	r9d, 1					; n++
	jmp	.build_tree_loop1
calign
.build_tree_loop2:
	cmp	r10d, 2
	jae	.build_tree_loop2_done
	xor	eax, eax
	cmp	edx, 2
	jl	.build_tree_loop2_incmax
	add	r10d, 1
	mov	dword [r11+r10*4], 0
	mov	word [rsi], 1
	mov	byte [r12+zlib_dstate_depth_ofs], 0
	sub	qword [r12+zlib_dstate_opt_len_ofs], 1
	test	rcx, rcx
	jz	.build_tree_loop2
	mov	ax, word [rcx+2]
	sub	qword [r12+zlib_dstate_static_len_ofs], rax
	jmp	.build_tree_loop2
calign
.build_tree_loop2_incmax:
	add	edx, 1					; max_code
	add	r10d, 1
	mov	dword [r11+r10*4], edx
	mov	word [rsi+rdx*4], 1
	mov	byte [r12+rdx+zlib_dstate_depth_ofs], 0
	sub	qword [r12+zlib_dstate_opt_len_ofs], 1
	test	rcx, rcx
	jz	.build_tree_loop2
	mov	ax, word [rcx+rdx*4+2]
	sub	qword [r12+zlib_dstate_static_len_ofs], rax
	jmp	.build_tree_loop2
calign
.build_tree_loop2_done:
	mov	dword [r12+zlib_dstate_heap_len_ofs], r10d
	mov	dword [rdi+zlib_tdesc_max_code_ofs], edx

	mov	r9d, r10d
	shr	r9d, 1
calign
.build_tree_loop3:
	cmp	r9d, 1
	jl	.build_tree_final_loop
	; pqdownheap(s, tree, n)
	; r11 still pointing to our heap
	; rsi is our "tree"
	; r9d is n
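	; reference comparator and sift-down (trees.c):
	;   #define smaller(tree, n, m, depth) \
	;     (tree[n].Freq < tree[m].Freq || \
	;      (tree[n].Freq == tree[m].Freq && depth[n] <= depth[m]))
	; pqdownheap sifts heap[k] down: while j <= heap_len, pick the smaller of the
	; two sons, stop when v is smaller than both, else exchange v with heap[j]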
macro pqdownheap {
	; edx is the k argument, rsi must be the tree, r11 must be pointing to the heap, r12 must be our dstate, r10d must be our heap_len
	; eax is our v
	; ebx is our j
	local	.loop,.notless,.keepgoing,.checkdepth,.done,.less_incj
	mov	eax, dword [r11+rdx*4]
	mov	ebx, edx
	shl	ebx, 1
calign
.loop:
	cmp	ebx, r10d
	jg	.done
	je	.notless
	; so, j < s->heap_len
	; if smaller(tree, s->heap[j+1], s->heap[j], s->depth)
	;   j++
	mov	r13d, dword [r11+rbx*4+4]		; heap[j+1]		n
	mov	r14d, dword [r11+rbx*4]			; heap[j]		m
	movzx	r15d, word [rsi+r13*4]			; tree[heap[j+1]].Freq	tree[n].Freq
	cmp	r15w, word [rsi+r14*4]			; cmp tree[heap[j+1]].Freq with tree[heap[j]].Freq	with tree[m].Freq
	jl	.less_incj
	jne	.notless
	; otherwise, freq was equal, check depth equality
	movzx	r15d, byte [r12+r13+zlib_dstate_depth_ofs]
	cmp	r15b, byte [r12+r14+zlib_dstate_depth_ofs]		; yuck.
	ja	.notless
calign
.less_incj:
	add	ebx, 1
calign
.notless:
	; if (smaller(tree, v, s->heap[j], s->depth)) break;
	mov	r13d, eax				; v
	mov	r14d, dword [r11+rbx*4]			; heap[j]
	movzx	r15d, word [rsi+r13*4]			; tree[v].Freq
	cmp	r15w, word [rsi+r14*4]
	; so, if tree[v].Freq < tree[heap[j]].Freq, break.
	; or, if tree[v].Freq == tree[heap[j]].Freq && depth[v] == depth[heap[j]], break.
	; otherwise, keep going.
	jl	.done
	je	.checkdepth
calign
.keepgoing:
	mov	r13d, dword [r11+rbx*4]		; r13d = heap[j]
	mov	dword [r11+rdx*4], r13d		; heap[k] = heap[j]
	mov	edx, ebx			; k = j
	shl	ebx, 1				; j <<= 1
	jmp	.loop
calign
.checkdepth:
	movzx	r15d, byte [r12+r13+zlib_dstate_depth_ofs]
	cmp	r15b, byte [r12+r14+zlib_dstate_depth_ofs]		; yuck.
	ja	.keepgoing
	; else, it was less than or equal, so break
calign
.done:
	mov	dword [r11+rdx*4], eax
}

	mov	edx, r9d
	pqdownheap
	sub	r9d, 1
	jmp	.build_tree_loop3			; TODO: redo this loop

macro pqremove {
	; rsi == tree argument
	; r9d == top argument
	mov	edx, 1
	mov	r9d, dword [r11+4]			; top = heap[SMALLEST]
	mov	eax, dword [r11+r10*4]			; heap[heap_len--]
	mov	dword [r11+4], eax			; heap[SMALLEST] = s->heap[s->heap_len--]
	sub	r10d, 1
	mov	dword [r12+zlib_dstate_heap_len_ofs], r10d
	pqdownheap
}

calign
.build_tree_final_loop:


	; so at this point:

	; rcx is our static tree (stree)
	; rdi is our tree desc (desc)
	; rsi is our tree
	; r8d is elems
	; r9d must be zero (n) at this point
	; r10d is still heap_len
	; r11 is still our heap
	; eax, ebx, edx, r13d, r14d, r15d are all free to use

	; we can use r8d for our node var since elems isn't referenced again
	; stree isn't referenced from this point forward, so we can blast rcx
	; pqremove is a #define, which modifies its top parameter, pqdownheap which it calls does not modify, and takes an argument

	; since pqdownheap uses edx as its arg, we can use r9d for our n argument, which is already zero
	pqremove

	; n (r9d) is now set, and we are free to blast all our temporaries now... r8d is still our node, r10d still our heap_len which got reduced
	mov	r13d, dword [r11+4]			; m = heap[SMALLEST]
	mov	r14d, dword [r12+zlib_dstate_heap_max_ofs]
	sub	r14d, 1
	mov	dword [r11+r14*4], r9d
	sub	r14d, 1
	mov	dword [r11+r14*4], r13d
	mov	dword [r12+zlib_dstate_heap_max_ofs], r14d
	; create a new node father of n and m
	movzx	eax, word [rsi+r9*4]			; tree[n].Freq
	movzx	ecx, word [rsi+r13*4]			; tree[m].Freq
	add	eax, ecx
	mov	word [rsi+r8*4], ax			; tree[node].Freq = tree[n].Freq + tree[m].Freq
	
	lea	rbx, [r12+zlib_dstate_depth_ofs]	; offset to the depth byte table
	movzx	eax, byte [rbx+r9]			; depth[n]
	movzx	ecx, byte [rbx+r13]			; depth[m]

	cmp	eax, ecx
	cmovb	eax, ecx
	add	eax, 1

	mov	byte [rbx+r8], al
	mov	word [rsi+r9*4+2], r8w			; tree[n].Dad = node
	mov	word [rsi+r13*4+2], r8w			; tree[m].Dad = node
	mov	dword [r11+4], r8d			; heap[SMALLEST] = node
	add	r8d, 1					; node++
	mov	edx, 1
	pqdownheap
	cmp	r10d, 2
	jae	.build_tree_final_loop

	mov	dword [r12+zlib_dstate_heap_len_ofs], r10d	; put heap_len back

	mov	eax, dword [r11+4]			; heap[SMALLEST]
	mov	r14d, dword [r12+zlib_dstate_heap_max_ofs]
	sub	r14d, 1
	mov	dword [r11+r14*4], eax			; heap[--heap_max] = heap[SMALLEST]
	mov	dword [r12+zlib_dstate_heap_max_ofs], r14d

	; gen_bitlen:

	; rdi is still our desc
	mov	rsi, [rdi+zlib_tdesc_dyn_tree_ofs]	; desc->dyn_tree	(tree)
	mov	rdx, [rdi+zlib_tdesc_stat_desc_ofs]	; desc->stat_desc
	mov	rcx, [rdx]				; static_tree		(stree)
	; extra_bits is at [rdx+8]
	; extra_base is at [rdx+16]
	; elems is at [rdx+24]
	; max_length is at [rdx+32]

	; stat_desc offsets:
	; static_tree = ofs 0
	; extra_bits = ofs 8
	; extra_base = ofs 16
	; elems = ofs 24
	; max_length = ofs 32
	xor	eax, eax
	mov	r8, [rdx+8]				; extra
	mov	r9d, dword [rdx+16]			; extra_base
	mov	r10d, dword [rdx+32]			; max_length
	lea	rdx, [r12+zlib_dstate_bl_count_ofs]	; rdx now pointing to our bl_count
	mov	[rdx], rax
	mov	[rdx+8], rax
	mov	[rdx+16], rax
	mov	[rdx+24], rax				; zero all of our bl_count entries (array of dw, 16 entries in all)

	; r11 is still our heap ... rdi (desc), rsi (tree), rdx (bl_count), rcx (stree), r8 (extra*), r9d (extra_bits), r10d (max_length), r11 (heap), r12(dstate), r14d (heap_max)
	; so we have clear eax, ebx, r13d, r14d, r15d
	; r14d still == heap_max
	mov	eax, [r11+r14*4]			; s->heap[s->heap_max]
	mov	word [rsi+rax*4+2], 0			; s->tree[s->heap[s->heap_max]].Len = 0
	add	r14d, 1					; heap_max + 1 (which we'll use as h)
	xor	ebx, ebx				; overflow = 0
calign
.gen_bitlen_loop1:
	cmp	r14d, 573				; h < HEAP_SIZE ?
	jae	.gen_bitlen_loop1_done
	lea	rdx, [r12+zlib_dstate_bl_count_ofs]	; rdx now pointing to our bl_count
	mov	eax, [r11+r14*4]			; n = s->heap[h]
	movzx	r15d, word [rsi+rax*4+2]		; tree[n].Dad
	movzx	r13d, word [rsi+r15*4+2]		; tree[tree[n].Dad].Len
	add	r13d, 1					; + 1 (bits)

	; 15 is clear, n = eax, h = r14d, overflow = ebx, bits = r13d
	mov	r15d, ebx
	add	r15d, 1
	cmp	r13d, r10d				; bits > max_length?
	cmova	r13d, r10d				; if so, bits = max_length
	cmova	ebx, r15d				; and overflow++

	mov	word [rsi+rax*4+2], r13w		; tree[n].Len = bits
	; if (n > max_code) continue;
	mov	r15d, r14d
	add	r15d, 1					; h++ temporary
	cmp	eax, dword [rdi+zlib_tdesc_max_code_ofs]
	cmova	r14d, r15d
	ja	.gen_bitlen_loop1			; continue

	add	word [rdx+r13*2], 1			; bl_count[bits]++

	xor	r15d, r15d				; xbits = 0
	mov	edx, eax
	sub	edx, r9d
	cmp	eax, r9d				; n >= base?
	jb	.gen_bitlen_loop1_noxbits
	; NOTE: cmov with a memory source always performs the load, even when the
	; condition is false (CMOVcc reads its source operand unconditionally), so
	; pointing it at an invalid address when the condition fails still segfaults:
	; cmovae	r15d, dword [r8+rdx*4]			; if so, xbits = extra[n-base]
	mov	r15d, dword [r8+rdx*4]
calign
.gen_bitlen_loop1_noxbits:
	; we are safe to blast r13d now, it will be set again next loop iteration
	add	r13d, r15d				; bits += xbits
	push	rax					; save n
	movzx	eax, word [rsi+rax*4]			; tree[n].Freq
	mul	r13
	add	qword [r12+zlib_dstate_opt_len_ofs], rax
	pop	rax
	add	r14d, 1
	test	rcx, rcx
	jz	.gen_bitlen_loop1
	; else, stree exists, so we need to do static_len += f * (stree[n].Len + xbits)
	movzx	r13d, word [rcx+rax*4+2]		; stree[n].Len into r13d
	add	r13d, r15d				; + xbits
	push	rax
	movzx	eax, word [rsi+rax*4]			; tree[n].Freq again (f)
	mul	r13
	add	qword [r12+zlib_dstate_static_len_ofs], rax
	pop	rax
	jmp	.gen_bitlen_loop1
calign
.gen_bitlen_loop1_done:
	test	ebx, ebx
	jz	.gen_bitlen_alldone
	lea	rdx, [r12+zlib_dstate_bl_count_ofs]	; rdx now pointing to our bl_count
calign
.gen_bitlen_loop2:
	mov	r13d, r10d				; bits = max_length
	sub	r13d, 1
	cmp	word [rdx+r13*2], 0
	jne	.gen_bitlen_loop2_continue
	sub	r13d, 1
calign
.gen_bitlen_loop2_bc:
	cmp	word [rdx+r13*2], 0
	jne	.gen_bitlen_loop2_continue
	sub	r13d, 1
	jmp	.gen_bitlen_loop2_bc
calign
.gen_bitlen_loop2_continue:
	sub	word [rdx+r13*2], 1			; bl_count[bits]-- (move one leaf down the tree)
	add	word [rdx+r13*2+2], 2			; bl_count[bits+1] += 2 (move one overflow item as its brother)
	sub	word [rdx+r10*2], 1			; bl_count[max_length]--
	sub	ebx, 2					; overflow -= 2
	cmp	ebx, 0
	jg	.gen_bitlen_loop2
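	; the loop above is zlib's overflow redistribution, roughly:
	;   do {
	;       bits = max_length-1;
	;       while (s->bl_count[bits] == 0) bits--;
	;       s->bl_count[bits]--;        /* move one leaf down the tree */
	;       s->bl_count[bits+1] += 2;   /* move one overflow item as its brother */
	;       s->bl_count[max_length]--;
	;       overflow -= 2;
	;   } while (overflow > 0);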
	; he say:
	; now recompute all bit lengths, scanning in increasing frequency
	; rsi == tree, r11 == heap, rdx == bl_count, r14d = h (HEAP_SIZE at the moment)
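	; in the reference this recompute is, approximately:
	;   for (bits = max_length; bits != 0; bits--) {
	;       n = s->bl_count[bits];
	;       while (n != 0) {
	;           m = s->heap[--h];
	;           if (m > max_code) continue;
	;           if (tree[m].Len != (unsigned)bits) {
	;               s->opt_len += ((ulg)bits - tree[m].Len) * tree[m].Freq;
	;               tree[m].Len = (ush)bits;
	;           }
	;           n--;
	;       }
	;   }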
	mov	r13d, r10d				; bits = max_length
calign
.gen_bitlen_loop3:
	test	r13d, r13d
	jz	.gen_bitlen_alldone
	movzx	ebx, word [rdx+r13*2]			; n = bl_count[bits]
calign
.gen_bitlen_loop3_inner:
	test	ebx, ebx
	jz	.gen_bitlen_loop3_inner_done
	sub	r14d, 1
	mov	r15d, dword [r11+r14*4]			; m = s->heap[--h]
	cmp	r15d, dword [rdi+zlib_tdesc_max_code_ofs]
	ja	.gen_bitlen_loop3_inner
	sub	ebx, 1					; n--
	movzx	ecx, word [rsi+r15*4+2]			; tree[m].Len
	cmp	r13d, ecx				; if (bits != tree[m].Len)
	je	.gen_bitlen_loop3_inner
	movzx	eax, word [rsi+r15*4]			; tree[m].Freq
	mov	edx, r13d
	sub	edx, ecx
	mul	rdx					; tree[m].Freq * (bits - tree[m].Len)
	mov	word [rsi+r15*4+2], r13w		; tree[m].Len = bits
	add	qword [r12+zlib_dstate_opt_len_ofs], rax	; opt_len += the difference
	jmp	.gen_bitlen_loop3_inner
calign
.gen_bitlen_loop3_inner_done:
	sub	r13d, 1
	lea	rdx, [r12+zlib_dstate_bl_count_ofs]	; rdx now pointing to our bl_count
	jmp	.gen_bitlen_loop3

calign
.gen_bitlen_alldone:
	; .gen_codes expects: rdi == tree, esi == max_code, rdx == pointer to s->bl_count
	mov	rbx, rsi				; (tree, which .gen_bitlen left alone)
	mov	esi, [rdi+zlib_tdesc_max_code_ofs]
	mov	rdi, rbx
	lea	rdx, [r12+zlib_dstate_bl_count_ofs]
	call	.gen_codes
	
	pop	r15 r14 r13 rbx
if profile_zlib_internals
	epilog
else
	ret
end if

;--------------------------------------------------- gen_codes ------------------------------------------------------
falign
.gen_codes:
	if profile_zlib_internals
		prolog	.gen_codes
	end if
	; on entry: rdi == tree, esi == max_code, rdx == pointer to bl_count array of dw
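	; the reference gen_codes is, roughly (from-memory sketch):
	;   for (bits = 1; bits <= MAX_BITS; bits++)
	;       next_code[bits] = code = (code + bl_count[bits-1]) << 1;
	;   for (n = 0; n <= max_code; n++) {
	;       int len = tree[n].Len;
	;       if (len == 0) continue;
	;       tree[n].Code = bi_reverse(next_code[len]++, len);
	;   }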
	sub	rsp, 48					; stack space for next_code (MAX_BITS+1 words, rounded up)
	xor	ecx, ecx				; code
	mov	r8d, 1					; bits
	xor	r9d, r9d				; bits-1 (index into bl_count)
calign
.gen_codes_ncloop:
	movzx	eax, word [rdx+r9*2]			; bl_count[bits-1]
	add	ecx, eax
	shl	ecx, 1
	mov	word [rsp+r8*2], cx
	add	r8d, 1
	add	r9d, 1
	cmp	r8d, 16
	jl	.gen_codes_ncloop
	xor	r9d, r9d				; n = 0
calign
.gen_codes_loop:
	cmp	r9d, esi
	ja	.gen_codes_bailout			; n > max_code == bailout
	mov	r8d, r9d
	add	r8d, 1
	movzx	eax, word [rdi+r9*4+2]			; tree[n].Len
	test	eax, eax
	cmovz	r9d, r8d
	jz	.gen_codes_loop
	; len nonzero, reverse the bits
	; tree[n].Code = bi_reverse(next_code[len]++, len)
	; then, n++ and continue
	movzx	ecx, word [rsp+rax*2]			; next_code[len]
	add	word [rsp+rax*2], 1			; next_code[len]++
	; ecx == code, we'll use r8d for our res
	xor	r8d, r8d
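	; bi_reverse in the reference is, roughly:
	;   res = 0;
	;   do { res |= code & 1; code >>= 1, res <<= 1; } while (--len > 0);
	;   return res >> 1;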
calign
.gen_codes_bi_reverse:
	mov	r10d, ecx
	and	r10d, 1
	or	r8d, r10d
	shr	ecx, 1
	shl	r8d, 1
	sub	eax, 1
	jnz	.gen_codes_bi_reverse
	shr	r8d, 1
	; r8d now contains the reverse, which we need to stick in tree[n].Code
	mov	word [rdi+r9*4], r8w
	add	r9d, 1
	jmp	.gen_codes_loop
calign
.gen_codes_bailout:
	add	rsp, 48
if profile_zlib_internals
	epilog
else
	ret
end if



;--------------------------------------------------- build_bl_tree --------------------------------------------------
falign
.build_bl_tree:
	if profile_zlib_internals
		prolog .build_bl_tree
	end if
	; use eax as our max_blindex (it is what we must return anyway)
	; no arguments, r12 == our dstate on entry
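	; the reference build_bl_tree, roughly (from-memory sketch):
	;   scan_tree(s, s->dyn_ltree, s->l_desc.max_code);
	;   scan_tree(s, s->dyn_dtree, s->d_desc.max_code);
	;   build_tree(s, &s->bl_desc);
	;   for (max_blindex = BL_CODES-1; max_blindex >= 3; max_blindex--)
	;       if (s->bl_tree[bl_order[max_blindex]].Len != 0) break;
	;   s->opt_len += 3*(max_blindex+1) + 5+5+4;
	;   return max_blindex;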
	lea	rdi, [r12+zlib_dstate_dyn_ltree_ofs]
	lea	rdx, [r12+zlib_dstate_l_desc_ofs]
	mov	rsi, [rdx+zlib_tdesc_max_code_ofs]	; desc->max_code (qword load of a dword field; relies on the adjacent high dword being zero)
	call	.scan_tree
	lea	rdi, [r12+zlib_dstate_dyn_dtree_ofs]
	lea	rdx, [r12+zlib_dstate_d_desc_ofs]
	mov	rsi, [rdx+zlib_tdesc_max_code_ofs]	; desc->max_code (same qword-load caveat as above)
	call	.scan_tree

	lea	rdi, [r12+zlib_dstate_bl_desc_ofs]
	call	.build_tree


	mov	eax, 18					; max_blindex = BL_CODES - 1
	lea	rdi, [r12+zlib_dstate_bl_tree_ofs]
calign
.build_bl_tree_loop1:
	cmp	eax, 3
	jl	.build_bl_tree_loop1_done
	; if (s->bl_tree[bl_order[max_blindex]].Len != 0) break
	mov	esi, [rax*4+.bl_order]
	cmp	word [rdi+rsi*4+2], 0
	jne	.build_bl_tree_loop1_done
	sub	eax, 1
	jmp	.build_bl_tree_loop1
dalign
.bl_order:
	dd	16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15
calign
.build_bl_tree_loop1_done:
	mov	edx, 3
	mov	r8d, eax				; save our max_blindex
	add	eax, 1
	mul	rdx					; 3 * (max_blindex + 1)
	add	rax, 14					; + 5 + 5 + 4
	add	qword [r12+zlib_dstate_opt_len_ofs], rax	; opt_len += 3*(max_blindex+1) + 5+5+4
	mov	eax, r8d				; restore our max_blindex
if profile_zlib_internals
	epilog
else
	ret
end if

;--------------------------------------------------- scan_tree ------------------------------------------------------
falign
.scan_tree:
	if profile_zlib_internals
		prolog	.scan_tree
	end if
	; rdi == tree, esi == max_code
	push	r13
	lea	rdx, [r12+zlib_dstate_bl_tree_ofs]
	; so we are free to blast eax, ecx, r8d, r9d, r10d, r11d
	
	; ecx == n
	; eax == prevlen
	; r8d == curlen
	; r9d == nextlen
	; r10d == count
	; r11d == max_count
	; r13d == min_count
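	; the reference scan_tree loop body, roughly (the cases below map to this):
	;   if (++count < max_count && curlen == nextlen) continue;
	;   else if (count < min_count) s->bl_tree[curlen].Freq += count;
	;   else if (curlen != 0) {
	;       if (curlen != prevlen) s->bl_tree[curlen].Freq++;
	;       s->bl_tree[REP_3_6].Freq++;
	;   } else if (count <= 10) s->bl_tree[REPZ_3_10].Freq++;
	;   else s->bl_tree[REPZ_11_138].Freq++;
	;   count = 0; prevlen = curlen;
	;   if (nextlen == 0)            max_count = 138, min_count = 3;
	;   else if (curlen == nextlen)  max_count = 6, min_count = 3;
	;   else                         max_count = 7, min_count = 4;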
	mov	eax, 138
	mov	ecx, 3
	movzx	r9d, word [rdi+2]	; tree[0].Len
	mov	word [rdi+rsi*4+6], 0xffff	; guard (tree[max_code+1].Len = 0xffff)
	xor	r10d, r10d
	mov	r11d, 7
	mov	r13d, 4
	test	r9d, r9d
	cmovz	r11d, eax
	cmovz	r13d, ecx		; if (nextlen == 0) max_count = 138, min_count = 3
	xor	ecx, ecx		; n = 0
	mov	eax, -1			; prevlen = -1
calign
.scan_tree_loop:
	cmp	ecx, esi
	jg	.scan_tree_loop_done
	add	ecx, 1
	mov	r8d, r9d		; curlen = nextlen
	movzx	r9d, word [rdi+rcx*4+2]	; tree[n+1].Len
	add	r10d, 1			; ++count
	cmp	r8d, r9d
	jne	.scan_tree_loop_topcase1
	; otherwise, curlen == nextlen, so check if count < max_count
	cmp	r10d, r11d
	jl	.scan_tree_loop		; yep, continue
calign
.scan_tree_loop_topcase1:
	cmp	r10d, r13d		; count < min_count
	jl	.scan_tree_loop_case1	
	test	r8d, r8d		; curlen != 0
	jnz	.scan_tree_loop_case2
	cmp	r10d, 10		; count <= 10
	jle	.scan_tree_loop_case3
	; last else
	add	word [rdx+72], 1	; s->bl_tree[REPZ_11_138].Freq++ (REPZ_11_138 == 18 * 4 == 72)
	; resetcount:
	xor	r10d, r10d		; count = 0
	mov	eax, r8d		; prevlen = curlen
	test	r9d, r9d
	jz	.scan_tree_loop_case4
	cmp	r8d, r9d
	je	.scan_tree_loop_case5
	mov	r11d, 7
	mov	r13d, 4
	jmp	.scan_tree_loop
calign
.scan_tree_loop_case1:
	add	word [rdx+r8*4], r10w	; s->bl_tree[curlen].Freq += count
	; resetcount:
	xor	r10d, r10d		; count = 0
	mov	eax, r8d		; prevlen = curlen
	test	r9d, r9d
	jz	.scan_tree_loop_case4
	cmp	r8d, r9d
	je	.scan_tree_loop_case5
	mov	r11d, 7
	mov	r13d, 4
	jmp	.scan_tree_loop
calign
.scan_tree_loop_case2:
	add	word [rdx+64], 1	; s->bl_tree[REP_3_6].Freq++ (REP_3_6 == 16 * 4 == 64)
	cmp	r8d, eax		; curlen != prevlen
	je	.scan_tree_loop_rset
	add	word [rdx+r8*4], 1	; s->bl_tree[curlen].Freq++
	; resetcount:
	xor	r10d, r10d		; count = 0
	mov	eax, r8d		; prevlen = curlen
	test	r9d, r9d
	jz	.scan_tree_loop_case4
	cmp	r8d, r9d
	je	.scan_tree_loop_case5
	mov	r11d, 7
	mov	r13d, 4
	jmp	.scan_tree_loop
calign
.scan_tree_loop_case3:
	add	word [rdx+68], 1	; s->bl_tree[REPZ_3_10].Freq++ (REPZ_3_10 == 17 * 4 == 68)
	; resetcount:
	xor	r10d, r10d		; count = 0
	mov	eax, r8d		; prevlen = curlen
	test	r9d, r9d
	jz	.scan_tree_loop_case4
	cmp	r8d, r9d
	je	.scan_tree_loop_case5
	mov	r11d, 7
	mov	r13d, 4
	jmp	.scan_tree_loop
calign
.scan_tree_loop_rset:
	; resetcount:
	xor	r10d, r10d		; count = 0
	mov	eax, r8d		; prevlen = curlen
	test	r9d, r9d
	jz	.scan_tree_loop_case4
	cmp	r8d, r9d
	je	.scan_tree_loop_case5
	mov	r11d, 7
	mov	r13d, 4
	jmp	.scan_tree_loop
calign
.scan_tree_loop_case4:
	mov	r11d, 138
	mov	r13d, 3
	jmp	.scan_tree_loop
calign
.scan_tree_loop_case5:
	mov	r11d, 6
	mov	r13d, 3
	jmp	.scan_tree_loop
calign
.scan_tree_loop_done:
	pop	r13
if profile_zlib_internals
	epilog
else
	ret
end if




;--------------------------------------------------- maybe_set_data_type --------------------------------------------
falign
.maybe_set_data_type:
	if profile_zlib_internals
		prolog	.maybe_set_data_type
	end if
	cmp	dword [rbx+zlib_datatype_ofs], 2	; Z_UNKNOWN
if profile_zlib_internals
	jne	.profiled_retonly
else
	jne	.retonly
end if
	; Z_BINARY = 0, Z_TEXT = 1
	; r12+zlib_dstate_dyn_ltree_ofs is what we need to load up
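	; this follows zlib-1.2.8's detect_data_type, roughly:
	;   block_mask = 0xf3ffc07f;  /* blacklisted bytes 0..6, 14..25, 28..31 */
	;   for (n = 0; n <= 31; n++, block_mask >>= 1)
	;       if ((block_mask & 1) && s->dyn_ltree[n].Freq != 0) return Z_BINARY;
	;   if (s->dyn_ltree[9].Freq || s->dyn_ltree[10].Freq || s->dyn_ltree[13].Freq)
	;       return Z_TEXT;
	;   for (n = 32; n < LITERALS; n++)
	;       if (s->dyn_ltree[n].Freq != 0) return Z_TEXT;
	;   return Z_BINARY;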
	lea	rsi, [r12+zlib_dstate_dyn_ltree_ofs]
	mov	edx, 0xf3ffc07f
	mov	ecx, 32
calign
.maybe_set_data_type_nontext_loop:
	test	rdx, 1
	jz	.maybe_set_data_type_nontext_next
	cmp	word [rsi], 0
	je	.maybe_set_data_type_nontext_next
	mov	dword [rbx+zlib_datatype_ofs], 0	; Z_BINARY
if profile_zlib_internals
	epilog
else
	ret
end if
calign
.maybe_set_data_type_nontext_next:
	add	rsi, 4
	shr	rdx, 1
	sub	ecx, 1
	jnz	.maybe_set_data_type_nontext_loop
	; else, check for textual white-listed bytes
	lea	rsi, [r12+zlib_dstate_dyn_ltree_ofs]
	; set it to Z_TEXT so we can just jump to retonly
	mov	dword [rbx+zlib_datatype_ofs], 1	; Z_TEXT
	cmp	word [rsi+36], 0			; dyn_ltree[9].Freq (TAB)
if profile_zlib_internals
	jne	.profiled_retonly
else
	jne	.retonly
end if
	cmp	word [rsi+40], 0			; dyn_ltree[10].Freq (LF)
if profile_zlib_internals
	jne	.profiled_retonly
else
	jne	.retonly
end if
	cmp	word [rsi+52], 0			; dyn_ltree[13].Freq (CR)
if profile_zlib_internals
	jne	.profiled_retonly
else
	jne	.retonly
end if
	add	rsi, 128				; [32]
	mov	ecx, 224				; LITERALS - 32
calign
.maybe_set_data_type_text_loop:
	cmp	word [rsi], 0
if profile_zlib_internals
	jne	.profiled_retonly
else
	jne	.retonly
end if
	add	rsi, 4
	sub	ecx, 1
	jnz	.maybe_set_data_type_text_loop
	; otherwise, no blacklisted or whitelisted bytes, stream is empty or graylisted only
	mov	dword [rbx+zlib_datatype_ofs], 0	; Z_BINARY
if profile_zlib_internals
	epilog
else
	ret
end if
	
end if




if used adler32 | defined include_everything
	; TODO: redo me with a bit more consideration
	; edi == adler32 accumulator, rsi == buffer, rdx == length
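	; conceptually (BASE == 65521, the largest prime < 2^16; NMAX == 5552 is
	; the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1):
	;   a = adler & 0xffff; b = adler >> 16;
	;   while (len--) { a += *buf++; b += a; }   /* reduce mod BASE every NMAX bytes */
	;   return a | (b << 16);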
falign
adler32:
	prolog	adler32
	mov	r8d, edi
	mov	r9d, edi		
	and	r8d, 0xffff		; low order word
	shr	r9d, 16			; high order word
	cmp	rdx, 1
	je	.singlebyte
calign
.chunkloop:
	cmp	rdx, 5552
	jb	.chunkdone
	mov	ecx, 5552
	mov	r10d, r8d		; low order copy
	mov	r11d, r9d		; high order copy
calign
.chunkinner:
	cmp	ecx, 16
	jl	.chunkinnerdone
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+1]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+2]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+3]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+4]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+5]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+6]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+7]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+8]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+9]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+10]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+11]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+12]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+13]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+14]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+15]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 16
	sub	ecx, 16
	sub	rdx, 16
	jmp	.chunkinner
calign
.chunkinnerdone:
	; ecx < 16 bytes remain in this chunk: dispatch to the handler that consumes
	; exactly that many bytes; each handler then falls into .chunk0 for the mod-65521 reduction
	shl	ecx, 3
	add	rcx, .chunkjumptable
	jmp	qword [rcx]
dalign
.chunkjumptable:
	dq	.chunk0, .chunk1, .chunk2, .chunk3, .chunk4, .chunk5, .chunk6, .chunk7
	dq	.chunk8, .chunk9, .chunk10, .chunk11, .chunk12, .chunk13, .chunk14, .chunk15
calign
.chunk0:
	; r8d = r10d % 65521
	; r9d = r11d % 65521
	; save rdx cuz unsigned divide blasts it
	mov	rcx, rdx
	mov	eax, r10d
	mov	r10d, 65521
	xor	edx, edx
	div	r10d
	; remainder now in edx
	mov	r8d, edx
	xor	edx, edx
	mov	eax, r11d
	div	r10d
	mov	r9d, edx
	; restore rdx, then check for zero
	mov	rdx, rcx
	test	rdx, rdx
	jz	.chunkreallydone
	; otherwise, go back to the top
	jmp	.chunkloop
calign
.chunk1:
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 1
	sub	rdx, 1
	jmp	.chunk0
calign
.chunk2:
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+1]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 2
	sub	rdx, 2
	jmp	.chunk0
calign
.chunk3:
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+1]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+2]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 3
	sub	rdx, 3
	jmp	.chunk0
calign
.chunk4:
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+1]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+2]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+3]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 4
	sub	rdx, 4
	jmp	.chunk0
calign
.chunk5:
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+1]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+2]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+3]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+4]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 5
	sub	rdx, 5
	jmp	.chunk0
calign
.chunk6:
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+1]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+2]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+3]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+4]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+5]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 6
	sub	rdx, 6
	jmp	.chunk0
calign
.chunk7:
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+1]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+2]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+3]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+4]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+5]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+6]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 7
	sub	rdx, 7
	jmp	.chunk0
calign
.chunk8:
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+1]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+2]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+3]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+4]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+5]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+6]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+7]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 8
	sub	rdx, 8
	jmp	.chunk0
calign
.chunk9:
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+1]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+2]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+3]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+4]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+5]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+6]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+7]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+8]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 9
	sub	rdx, 9
	jmp	.chunk0
calign
.chunk10:
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+1]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+2]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+3]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+4]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+5]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+6]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+7]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+8]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+9]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 10
	sub	rdx, 10
	jmp	.chunk0
calign
.chunk11:
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+1]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+2]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+3]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+4]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+5]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+6]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+7]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+8]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+9]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+10]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 11
	sub	rdx, 11
	jmp	.chunk0
calign
.chunk12:
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+1]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+2]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+3]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+4]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+5]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+6]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+7]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+8]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+9]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+10]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+11]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 12
	sub	rdx, 12
	jmp	.chunk0
calign
.chunk13:
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+1]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+2]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+3]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+4]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+5]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+6]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+7]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+8]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+9]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+10]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+11]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+12]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 13
	sub	rdx, 13
	jmp	.chunk0
calign
.chunk14:
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+1]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+2]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+3]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+4]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+5]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+6]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+7]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+8]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+9]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+10]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+11]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+12]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+13]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 14
	sub	rdx, 14
	jmp	.chunk0
calign
.chunk15:
	movzx	eax, byte [rsi]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+1]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+2]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+3]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+4]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+5]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+6]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+7]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+8]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+9]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+10]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+11]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+12]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+13]
	add	r10d, eax
	add	r11d, r10d
	movzx	eax, byte [rsi+14]
	add	r10d, eax
	add	r11d, r10d
	add	rsi, 15
	sub	rdx, 15
	jmp	.chunk0
calign
.chunkdone:
	; if rdx < 5552, we end up here
	mov	ecx, edx		; however many are left, which we know is <5552
	mov	r10d, r8d		; low order copy
	mov	r11d, r9d		; high order copy
	; when this pass finishes, .chunk0 will see rdx == 0 and jump to .chunkreallydone
	jmp	.chunkinner
calign
.chunkreallydone:
	; rdx went to zero, which means we are all done and can recombine our r8d and r9d for our return
	shl	r9d, 16
	or	r8d, r9d
	mov	eax, r8d
	epilog
calign
.singlebyte:
	movzx	eax, byte [rsi]
	add	r8d, eax		; adler += buf[0]

	mov	ecx, r8d
	sub	ecx, 65521
	cmp	r8d, 65521
	cmovae	r8d, ecx		; if (adler >= BASE) adler -= BASE

	add	r9d, r8d		; sum2 += adler
	mov	ecx, r9d
	sub	ecx, 65521
	cmp	r9d, 65521
	cmovae	r9d, ecx		; if (sum2 >= BASE) sum2 -= BASE
	shl	r9d, 16
	or	r8d, r9d
	mov	eax, r8d
	epilog
	
	
end if