HeavyThing - xmlparser.inc

Jeff Marrison

Table of functions

	; ------------------------------------------------------------------------
	; HeavyThing x86_64 assembly language library and showcase programs
	; Copyright © 2015-2018 2 Ton Digital 
	; Homepage: https://2ton.com.au/
	; Author: Jeff Marrison <jeff@2ton.com.au>
	;       
	; This file is part of the HeavyThing library.
	;       
	; HeavyThing is free software: you can redistribute it and/or modify
	; it under the terms of the GNU General Public License, or
	; (at your option) any later version.
	;       
	; HeavyThing is distributed in the hope that it will be useful, 
	; but WITHOUT ANY WARRANTY; without even the implied warranty of
	; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
	; GNU General Public License for more details.
	;       
	; You should have received a copy of the GNU General Public License along
	; with the HeavyThing library. If not, see <http://www.gnu.org/licenses/>.
	; ------------------------------------------------------------------------
	; xmlparser.inc: A very fast XML parser/navigator that will handle and
	;   parse UTF8, UTF16, and UTF32 (passed as a parameter to xmlparser$new)
	;   Optionally, xmlparser$new_string will take a native string as well.
	;
	; This is a "streaming parser" that allows forward and backward movement
	; through an XML document/fragment (see xmlparser$next and xmlparser$prev)
	; (pull style).
	;
	; Update v1.22+: xmlparser$prev now behaves itself correctly. Previously
	; it choked on <!-- > --> and <![CDATA[>>>>>]]> situations, thanks
	; to it paying attention to > and nesting levels, etc. The new version
	; correctly deals with these and now sanely parses them. Additionally,
	; we also deal with unescaped > in text/attrs (messy haha).
	;
	; Since a pointer, length and character width are passed to xmlparser$new,
	; we do not create actual library strings for each, and instead keep
	; start and end offsets for everywhere would-be strings are, and present
	; convenience functions to create library strings from them for when they
	; are desired. (this allows heap$alloc-free parsing/traversal speed)
	;


if used xmltag$new | used xmltag$reset | defined include_everything

; this define sets the upper bound on the maximum number of attributes permitted
; our default here is pretty reasonable, it is rare for an xml element to contain
; more than 32 attributes ... if you encounter the xmlparser_tagattributeoverflow
; error return from the xmlparser, increase this value...
xmltag_maxattr			= 32

; when unescaping strings, this setting determines max size to use for stack-based
; unescape temporary buffer space (note this size is in characters, not bytes)
xmltag_unescape_stacklimit	= 16384

; defines for possible tag types
xmltag_notype			= 0
xmltag_element			= 1
xmltag_textnode			= 2
xmltag_cdata			= 3
xmltag_processinginstruction	= 4
xmltag_comment			= 5
xmltag_doctypedeclaration	= 6
xmltag_xmldeclaration		= 7

; each individual attribute contains:
xmltagattr_namestart_ofs	= 0		; offset to start of the name
xmltagattr_nameend_ofs		= 8		; offset to end of the name
xmltagattr_valuestart_ofs	= 16		; offset to start of value
xmltagattr_valueend_ofs		= 24		; offset to end of value
xmltagattr_namecolon_ofs	= 32		; dword offset (relative to nameend-namestart) of colon in the attribute name (-1 otherwise)

xmltagattr_size = 40				; wasted 4 bytes each

; and our xmltag itself needs:
xmltag_textstart_ofs		= 0		; offset to start of tag text
xmltag_textend_ofs		= 8		; offset to end of tag text
xmltag_textcolon_ofs		= 16		; dword offset (relative to textend-textstart) of colon in the tag name (-1 otherwise)
xmltag_empty_ofs		= 20		; dword bool whether tag is empty or not (think <br/>)
xmltag_nodetype_ofs		= 24		; dword one of the 8 nodetypes listed above
xmltag_attrcount_ofs		= 28		; dword count of how many attributes we have
xmltag_realstart_ofs		= 32		; offset to the real start of the tag
xmltag_realend_ofs		= 40		; offset to the real end of the tag
xmltag_base_ofs			= 48		; copy of the base pointer from the xmlparser
xmltag_width_ofs		= 56		; copy of the width setting from the xmlparser (wasting 4 bytes)
xmltag_attributes_ofs		= 64		; xmltag_maxattr * xmltagattr_size bytes

xmltag_size = xmltag_attributes_ofs + (xmltag_maxattr * xmltagattr_size)

	; NOTE: this can of course also be stack allocated, but you should call xmltag$reset on it before use

	; no arguments, heap$alloc's and initialises an xmltag object
	; returns new xmltag object in rax
	; only the first 32 bytes (text offsets, colon, empty, nodetype, attrcount)
	; are initialised here; realstart/realend/base/width and the attribute slots
	; are left as heap$alloc returned them
falign
xmltag$new:
	prolog	xmltag$new
	mov	edi, xmltag_size
	call	heap$alloc
	xor	ecx, ecx
	mov	edx, -1					; rdx == 0x00000000ffffffff
	mov	[rax+xmltag_textstart_ofs], rcx
	mov	[rax+xmltag_textend_ofs], rcx
	mov	[rax+xmltag_textcolon_ofs], rdx		; textcolon = -1, writes 0 into empty too
	mov	[rax+xmltag_nodetype_ofs], rcx		; nodetype = 0, writes over attrcount too
	epilog

end if


if used xmltag$destroy | defined include_everything
	; placeholder function only really, just does a heap$free
	; single argument in rdi: an xmltag object
falign
xmltag$destroy:
	prolog	xmltag$destroy
	call	heap$free
	epilog

end if


if used xmltag$reset | defined include_everything
	; single argument in rdi: an xmltag object to reset
	; puts the same four fields back to the values xmltag$new gives them
	; (text offsets 0, textcolon -1, empty 0, nodetype 0, attrcount 0)
falign
xmltag$reset:
	prolog	xmltag$reset
	xor	ecx, ecx
	mov	edx, -1					; rdx == 0x00000000ffffffff
	mov	[rdi+xmltag_textstart_ofs], rcx
	mov	[rdi+xmltag_textend_ofs], rcx
	mov	[rdi+xmltag_textcolon_ofs], rdx		; textcolon = -1, writes 0 into empty too
	mov	[rdi+xmltag_nodetype_ofs], rcx		; nodetype = 0, writes over attrcount too
	epilog

end if


if used xmltag$newattr | defined include_everything
	; six arguments: rdi == xmltag object, rsi == namestart, rdx == nameend, ecx == colonpos, r8 == valuestart, r9 == valueend
	; returns bool in eax as to whether we succeeded or not (0 == maxattr reached)
	; (leaves rdx as a pointer to the xmltagattr structure if eax = 1)
falign
xmltag$newattr:
	prolog	xmltag$newattr
	mov	eax, [rdi+xmltag_attrcount_ofs]
	cmp	eax, xmltag_maxattr
	jae	.zeroret				; table full (jae so a bad count can never index past it)
	imul	eax, eax, xmltagattr_size
	add	dword [rdi+xmltag_attrcount_ofs], 1
	; offset into rdi for our new attribute is rdi+xmltag_attributes_ofs+rax
	lea	rax, [rdi+rax+xmltag_attributes_ofs]
	mov	[rax+xmltagattr_namestart_ofs], rsi
	mov	[rax+xmltagattr_nameend_ofs], rdx
	mov	[rax+xmltagattr_valuestart_ofs], r8
	mov	[rax+xmltagattr_valueend_ofs], r9
	mov	[rax+xmltagattr_namecolon_ofs], rcx	; 8 byte store: colon dword plus the 4 spare bytes of the slot
	mov	rdx, rax				; hand the slot pointer back in rdx
	mov	eax, 1
	epilog
.zeroret:
	xor	eax, eax
	epilog

end if


if used xmltag$getattr | defined include_everything
	; two arguments: rdi == xmltag object, esi == attribute # to get
	; returns pointer to xmltagattr structure for esi or 0 on error
	; (error == esi is not below the tag's attribute count, unsigned compare)
falign
xmltag$getattr:
	prolog	xmltag$getattr
	cmp	esi, [rdi+xmltag_attrcount_ofs]
	jae	.zeroret
	imul	esi, esi, xmltagattr_size
	lea	rax, [rdi+rsi+xmltag_attributes_ofs]
	epilog
.zeroret:
	xor	eax, eax
	epilog

end if


if used xmltag$escape_string | defined include_everything
	; single argument in rdi: string to escape
	; returns a new heap$alloc'd string of escaped text
	; note that this is only for the 5 lt, gt, amp, apos, quot
	;
	; layout relied on below: qword character count at [string], characters
	; from [string+8], each string_bits wide
	;
	; two passes: the first totals the escaped length (1 for ordinary characters,
	; 4 for < and >, 5 for &, 6 for ' and "), the second fills a buffer of exactly
	; that size. An empty input returns string$new's result straight away; both
	; loops below test their counter only after processing a character, so they
	; must never be entered with a zero length.
falign
xmltag$escape_string:
	prolog	xmltag$escape_string
	mov	rcx, [rdi]				; character count
	test	rcx, rcx
	jz	.emptystring
	lea	rsi, [rdi+8]
	xor	edx, edx				; rdx accumulates the escaped length
	; determine the size we need first
	mov	r9d, 4					; length of &lt; and &gt;
	mov	r10d, 5					; length of &amp;
	mov	r11d, 6					; length of &apos; and &quot;
calign
.firstpass:
if string_bits = 32
	mov	eax, [rsi]
	add	rsi, 4
else
	movzx	eax, word [rsi]
	add	rsi, 2
end if
	mov	r8d, 1
	cmp	eax, '<'
	cmove	r8d, r9d
	cmp	eax, '>'
	cmove	r8d, r9d
	cmp	eax, '&'
	cmove	r8d, r10d
	cmp	eax, 0x27
	cmove	r8d, r11d
	cmp	eax, '"'
	cmove	r8d, r11d
	add	rdx, r8
	sub	rcx, 1
	jnz	.firstpass
	; so now rdx has the new string length we are after
	push	rdx
if string_bits = 32
	shl	rdx, 2
else
	shl	rdx, 1
end if
	push	rdi
	add	rdx, 8					; room for the length qword
	mov	rdi, rdx
	call	heap$alloc
	pop	rsi					; source string
	mov	rdi, rax				; rax stays untouched from here: it is our return
	mov	rcx, [rsi]
	add	rsi, 8
	pop	rdx
	mov	[rdi], rdx				; escaped length
	add	rdi, 8
calign
.secondpass:
if string_bits = 32
	mov	edx, [rsi]
	add	rsi, 4
else
	movzx	edx, word [rsi]
	add	rsi, 2
end if
	cmp	edx, '<'
	je	.lt
	cmp	edx, '>'
	je	.gt
	cmp	edx, '&'
	je	.amp
	cmp	edx, 0x27
	je	.apos
	cmp	edx, '"'
	je	.quot
if string_bits = 32
	mov	[rdi], edx
	add	rdi, 4
else
	mov	[rdi], dx
	add	rdi, 2
end if
.next:
	sub	rcx, 1
	jnz	.secondpass
	epilog
calign
.emptystring:
	call	string$new
	epilog
calign
.lt:
if string_bits = 32
	mov	dword [rdi], '&'
	mov	dword [rdi+4], 'l'
	mov	dword [rdi+8], 't'
	mov	dword [rdi+12], ';'
	add	rdi, 16
else
	mov	word [rdi], '&'
	mov	word [rdi+2], 'l'
	mov	word [rdi+4], 't'
	mov	word [rdi+6], ';'
	add	rdi, 8
end if
	jmp	.next
calign
.gt:
if string_bits = 32
	mov	dword [rdi], '&'
	mov	dword [rdi+4], 'g'
	mov	dword [rdi+8], 't'
	mov	dword [rdi+12], ';'
	add	rdi, 16
else
	mov	word [rdi], '&'
	mov	word [rdi+2], 'g'
	mov	word [rdi+4], 't'
	mov	word [rdi+6], ';'
	add	rdi, 8
end if
	jmp	.next
calign
.amp:
if string_bits = 32
	mov	dword [rdi], '&'
	mov	dword [rdi+4], 'a'
	mov	dword [rdi+8], 'm'
	mov	dword [rdi+12], 'p'
	mov	dword [rdi+16], ';'
	add	rdi, 20
else
	mov	word [rdi], '&'
	mov	word [rdi+2], 'a'
	mov	word [rdi+4], 'm'
	mov	word [rdi+6], 'p'
	mov	word [rdi+8], ';'
	add	rdi, 10
end if
	jmp	.next
calign
.apos:
if string_bits = 32
	mov	dword [rdi], '&'
	mov	dword [rdi+4], 'a'
	mov	dword [rdi+8], 'p'
	mov	dword [rdi+12], 'o'
	mov	dword [rdi+16], 's'
	mov	dword [rdi+20], ';'
	add	rdi, 24
else
	mov	word [rdi], '&'
	mov	word [rdi+2], 'a'
	mov	word [rdi+4], 'p'
	mov	word [rdi+6], 'o'
	mov	word [rdi+8], 's'
	mov	word [rdi+10], ';'
	add	rdi, 12
end if
	jmp	.next
calign
.quot:
if string_bits = 32
	mov	dword [rdi], '&'
	mov	dword [rdi+4], 'q'
	mov	dword [rdi+8], 'u'
	mov	dword [rdi+12], 'o'
	mov	dword [rdi+16], 't'
	mov	dword [rdi+20], ';'
	add	rdi, 24
else
	mov	word [rdi], '&'
	mov	word [rdi+2], 'q'
	mov	word [rdi+4], 'u'
	mov	word [rdi+6], 'o'
	mov	word [rdi+8], 't'
	mov	word [rdi+10], ';'
	add	rdi, 12
end if
	jmp	.next

end if


if used xmltag$unescape | defined include_everything
	; three arguments: rdi == xmltag object, rsi == text start offset, rdx == text end offset
; returns a new heap$alloc'd string of unescaped text
	;
	; offsets are in characters; the tag's width field (0 == utf8, 1 == utf16,
	; 2 == utf32) selects one of three parallel implementations below.
	;
	; handled: &lt; &gt; &amp; &quot; &apos; &nbsp; and numeric references
	; &#NNN; / &#xHHH;  Anything else between & and ; is copied through as-is,
	; as is a trailing & that has no ; after it.
	;
	; each implementation first scans for an ampersand; if there is none the
	; text is converted directly. Otherwise the text is rebuilt into a scratch
	; buffer (on the stack if the text is at most xmltag_unescape_stacklimit
	; characters, else heap$alloc'd) and that buffer is converted.
	;
	; register roles once rebuilding starts:
	;   rbx == write pointer into the scratch buffer
	;   r12 == read pointer into the source text
	;   r13 == source units (bytes for utf8, characters otherwise) remaining at r12
	;   r14 == heap scratch buffer, or 0 when the scratch buffer is at rsp
	;   rdx == unit offset from rsi of the ; or & the current scan stopped on
falign
xmltag$unescape:
	prolog	xmltag$unescape
	mov	r8, [rdi+xmltag_base_ofs]
	mov	r9d, [rdi+xmltag_width_ofs]
	mov	rcx, rdx
	lea	r10, [rsi*2]
	lea	r11, [rsi*4]
	sub	rcx, rsi				; rcx == character count
	jz	.emptystring
	cmp	r9d, 1
	cmove	rsi, r10				; utf16: byte offset = start*2
	cmp	r9d, 2
	cmove	rsi, r11				; utf32: byte offset = start*4
	mov	r10, rcx				; r10 keeps the total character count
	lea	rax, [r8+rsi]
	jmp	qword [r9*8+.dispatch]
dalign
.dispatch:
	dq	.utf8, .utf16, .utf32
calign
.emptystring:
	call	string$new
	epilog
calign
.utf8:
	; prescan the string searching for ampersand
	cmp	byte [rax], '&'
	je	.utf8_unescape
	add	rax, 1
	sub	rcx, 1
	jnz	.utf8
	; if we made it here, no ampersands were found
	lea	rdi, [r8+rsi]
	mov	rsi, r10
	call	string$from_utf8
	epilog
calign
.utf16:
	; prescan the string searching for ampersand
	cmp	word [rax], '&'
	je	.utf16_unescape
	add	rax, 2
	sub	rcx, 1
	jnz	.utf16
	; if we made it here, no ampersands were found
	lea	rdi, [r8+rsi]
	lea	rsi, [r10*2]
	call	string$from_utf16
	epilog
calign
.utf32:
	; prescan the string searching for ampersand
	cmp	dword [rax], '&'
	je	.utf32_unescape
	add	rax, 4
	sub	rcx, 1
	jnz	.utf32
	; if we made it here, no ampersands were found
	lea	rdi, [r8+rsi]
	lea	rsi, [r10*4]
	call	string$from_utf32
	epilog

	; ---------------------------------------------------------------- utf8
calign
.utf8_unescape:
	; if our text is small enough, unescape on the stack
	; otherwise, do expensive buffer construction
	cmp	r10, xmltag_unescape_stacklimit
	ja	.utf8_unescape_buffer
	mov	rdx, r10
	push	rbx r12 r13 r14
	xor	r14d, r14d				; 0 == scratch buffer lives on the stack
	sub	rsp, xmltag_unescape_stacklimit
	lea	rsi, [r8+rsi]				; beginning of string
	sub	rdx, rcx				; bytes in front of the first ampersand
	mov	rdi, rsp
	mov	r12, rax				; pointer to first ampersand
	mov	r13, rcx				; bytes left
	lea	rbx, [rsp+rdx]
	call	memcpy
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
calign
.utf8_unescape_semicolon_scan:
	cmp	byte [rsi+rdx], ';'
	je	.utf8_unescape_semicolon
	add	rdx, 1
	sub	rcx, 1
	jnz	.utf8_unescape_semicolon_scan
	; if we made it to here, an ampersand was found but no trailing semicolon
	; despite this being an error, just return the remaining of the string as-is
	mov	rdi, rbx
	mov	rsi, r12
	mov	rdx, r13
	add	rbx, r13
	call	memcpy
.utf8_unescape_return:
	test	r14, r14
	jnz	.utf8_unescape_buffer_return
	; rbx - rsp is how many bytes we have accumulated
	mov	rdi, rsp
	mov	rsi, rbx
	sub	rsi, rsp
	call	string$from_utf8
	add	rsp, xmltag_unescape_stacklimit
	pop	r14 r13 r12 rbx
	epilog
calign
.utf8_unescape_buffer_return:
	mov	rdi, r14
	mov	rsi, rbx
	sub	rsi, r14
	call	string$from_utf8
	mov	rdi, r14
	mov	r14, rax				; keep the result across heap$free
	call	heap$free
	mov	rax, r14
	pop	r14 r13 r12 rbx
	epilog
calign
.utf8_unescape_semicolon:
	; rdx is our length (offset of the ; from the & at rsi)
	cmp	byte [rsi+1], '#'
	je	.utf8_unescape_codepoint
	cmp	rdx, 3
	je	.utf8_unescape_ltgt
	cmp	rdx, 4
	je	.utf8_unescape_ampersand
	cmp	rdx, 5
	jne	.utf8_unescape_straightin
	; quot or apos check
	mov	eax, '"'
	cmp	dword [rsi+1], 'quot'
	je	.utf8_unescape_char
	mov	eax, 0x27
	cmp	dword [rsi+1], 'apos'
	je	.utf8_unescape_char
	; nbsp is U+00A0, which is two bytes in UTF-8, so it must go through the
	; encoder rather than being stored as a single raw 0xa0 byte
	mov	eax, 0xa0
	cmp	dword [rsi+1], 'nbsp'
	je	.utf8_unescape_codepoint_encoded
.utf8_unescape_straightin:
	; add from r12 to r12+rdx+1
	mov	rdi, rbx
	mov	rsi, r12
	add	rdx, 1
	add	rbx, rdx
	add	r12, rdx
	sub	r13, rdx
	call	memcpy
	test	r13, r13
	jz	.utf8_unescape_return
	; find the next ampersand
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
calign
.utf8_unescape_amp_scan:
	cmp	byte [rsi+rdx], '&'
	je	.utf8_unescape_amp
	add	rdx, 1
	sub	rcx, 1
	jnz	.utf8_unescape_amp_scan
	; if we made it to here no ampersand was found
	test	rdx, rdx
	jz	.utf8_unescape_return
	mov	rdi, rbx
	mov	rsi, r12
	mov	rdx, r13
	add	rbx, r13
	call	memcpy
	jmp	.utf8_unescape_return
calign
.utf8_unescape_codepoint:
	; word at [rsi] == '&#'
	; byte at [rsi+rdx] == ';'
	lea	rdi, [rsi+2]
	lea	rcx, [rdx-2]				; digit count
	cmp	rdx, 2					; if it was &#; put it straight in as-is
	je	.utf8_unescape_straightin
	cmp	byte [rdi], 'x'
	je	.utf8_unescape_codepoint_base16
	xor	eax, eax
calign
.utf8_unescape_codepoint_loop:
	movzx	r8d, byte [rdi]
	add	rdi, 1
	sub	r8d, '0'
	cmp	r8d, 10
	jae	.utf8_unescape_straightin
	imul	eax, eax, 10
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf8_unescape_codepoint_loop
.utf8_unescape_codepoint_ready:
	; only 0..127 are single bytes in UTF-8; everything else gets encoded
	cmp	eax, 128
	jb	.utf8_unescape_char
	; fall through to utf8_unescape_codepoint_encoded
.utf8_unescape_codepoint_encoded:
	; build a temporary one character string (length qword, then the character)
	; in properly reserved stack space and let string$to_utf8 encode it at rbx.
	; (it must not sit below rsp: the call itself writes there)
	sub	rsp, 16
	mov	qword [rsp], 1
	mov	qword [rsp+8], rax
	mov	rdi, rsp
	mov	rsi, rbx
	; setup values for next round:
	add	rdx, 1
	add	r12, rdx
	sub	r13, rdx
	call	string$to_utf8
	add	rsp, 16
	add	rbx, rax				; advance by the byte count it returned
	test	r13, r13
	jz	.utf8_unescape_return
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf8_unescape_amp_scan
calign
.utf8_unescape_codepoint_base16:
	add	rdi, 1
	sub	rcx, 1
	jz	.utf8_unescape_straightin		; &#x;
	xor	eax, eax
calign
.utf8_unescape_codepoint_base16_loop:
	movzx	r8d, byte [rdi]
	add	rdi, 1
	cmp	r8d, 'a'
	jae	.utf8_unescape_codepoint_base16_lc
	cmp	r8d, 'A'
	jae	.utf8_unescape_codepoint_base16_uc
	sub	r8d, '0'
	cmp	r8d, 10					; only '0'..'9' here (':' through '?' are not hex digits)
	jae	.utf8_unescape_straightin
.utf8_unescape_codepoint_base16_acc:
	imul	eax, eax, 16
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf8_unescape_codepoint_base16_loop
	jmp	.utf8_unescape_codepoint_ready
calign
.utf8_unescape_codepoint_base16_uc:
	sub	r8d, '0'+7				; 'A' -> 10
	cmp	r8d, 16
	jae	.utf8_unescape_straightin
	jmp	.utf8_unescape_codepoint_base16_acc
calign
.utf8_unescape_codepoint_base16_lc:
	sub	r8d, '0'+7+0x20				; 'a' -> 10
	cmp	r8d, 16
	jae	.utf8_unescape_straightin
	jmp	.utf8_unescape_codepoint_base16_acc
calign
.utf8_unescape_ampersand:
	; rdx == 4, so the four bytes at rsi must be '&amp' (the ; follows them)
	mov	eax, '&'
	cmp	dword [rsi], '&amp'
	je	.utf8_unescape_char
	jmp	.utf8_unescape_straightin
calign
.utf8_unescape_ltgt:
	; rdx == 3, so the four bytes at rsi are the whole entity including its ;
	mov	eax, '<'
	cmp	dword [rsi], '&lt;'
	je	.utf8_unescape_char
	mov	eax, '>'
	cmp	dword [rsi], '&gt;'
	jne	.utf8_unescape_straightin
	; fall through to utf8_unescape_char
calign
.utf8_unescape_char:
	; al == replacement byte, rdx == offset of the ; that ends the entity
	mov	[rbx], al
	add	rdx, 1
	add	rbx, 1
	add	r12, rdx
	sub	r13, rdx
	jz	.utf8_unescape_return
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf8_unescape_amp_scan
calign
.utf8_unescape_amp:
	; if rdx is 0, no memcpy
	test	rdx, rdx
	jz	.utf8_unescape_amp_nocopy
	; otherwise, rdx bytes go straight in
	mov	rdi, rbx
	mov	rsi, r12
	add	rbx, rdx
	add	r12, rdx
	sub	r13, rdx
	call	memcpy
.utf8_unescape_amp_nocopy:
	; so now the byte at r12 is an ampersand
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf8_unescape_semicolon_scan
calign
.utf8_unescape_buffer:
	; string is too big to do on the stack, so create a temporary buffer for it
	mov	rdx, r10
	push	rbx r12 r13 r14
	lea	rsi, [r8+rsi]				; beginning of string
	sub	rdx, rcx				; bytes in front of the first ampersand
	mov	r12, rax				; pointer to first ampersand
	mov	r13, rcx				; bytes left
	push	rsi rdx
	mov	rdi, r10
	call	heap$alloc
	mov	r14, rax
	pop	rdx rsi
	mov	rdi, rax
	lea	rbx, [r14+rdx]
	call	memcpy
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf8_unescape_semicolon_scan

	; --------------------------------------------------------------- utf16
calign
.utf16_unescape:
	; if our text is small enough, unescape on the stack
	; otherwise, do expensive buffer construction
	cmp	r10, xmltag_unescape_stacklimit
	ja	.utf16_unescape_buffer
	mov	rdx, r10
	push	rbx r12 r13 r14
	xor	r14d, r14d				; 0 == scratch buffer lives on the stack
	sub	rsp, xmltag_unescape_stacklimit shl 1
	lea	rsi, [r8+rsi]				; beginning of string
	sub	rdx, rcx				; characters in front of the first ampersand
	mov	rdi, rsp
	mov	r12, rax				; pointer to first ampersand
	mov	r13, rcx				; characters left
	lea	rbx, [rsp+rdx*2]
	shl	rdx, 1
	call	memcpy
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
calign
.utf16_unescape_semicolon_scan:
	cmp	word [rsi+rdx*2], ';'
	je	.utf16_unescape_semicolon
	add	rdx, 1
	sub	rcx, 1
	jnz	.utf16_unescape_semicolon_scan
	; if we made it to here, an ampersand was found but no trailing semicolon
	; despite this being an error, just return the remaining of the string as-is
	; r13 is the number of characters left not bytes:
	shl	r13, 1
	mov	rdi, rbx
	mov	rsi, r12
	mov	rdx, r13
	add	rbx, r13
	call	memcpy
.utf16_unescape_return:
	test	r14, r14
	jnz	.utf16_unescape_buffer_return
	; rbx - rsp is how many bytes we have accumulated
	mov	rdi, rsp
	mov	rsi, rbx
	sub	rsi, rsp
	call	string$from_utf16
	add	rsp, xmltag_unescape_stacklimit shl 1
	pop	r14 r13 r12 rbx
	epilog
calign
.utf16_unescape_buffer_return:
	mov	rdi, r14
	mov	rsi, rbx
	sub	rsi, r14
	call	string$from_utf16
	mov	rdi, r14
	mov	r14, rax				; keep the result across heap$free
	call	heap$free
	mov	rax, r14
	pop	r14 r13 r12 rbx
	epilog
dalign
.qquot:
	dw	'q', 'u', 'o', 't'
.qapos:
	dw	'a', 'p', 'o', 's'
.qnbsp:
	dw	'n', 'b', 's', 'p'
.qamp:
	dw	'&', 'a', 'm', 'p'
.qlt:
	dw	'&', 'l', 't', ';'
.qgt:
	dw	'&', 'g', 't', ';'
calign
.utf16_unescape_semicolon:
	; rdx is our length (offset in characters of the ; from the & at rsi)
	cmp	word [rsi+2], '#'
	je	.utf16_unescape_codepoint
	cmp	rdx, 3
	je	.utf16_unescape_ltgt
	cmp	rdx, 4
	je	.utf16_unescape_ampersand
	cmp	rdx, 5
	jne	.utf16_unescape_straightin
	; quot or apos check (the four characters after the & compared as one qword)
	mov	eax, '"'
	mov	r8, [rsi+2]
	cmp	r8, [.qquot]
	je	.utf16_unescape_char
	mov	eax, 0x27
	cmp	r8, [.qapos]
	je	.utf16_unescape_char
	mov	eax, 0xa0
	cmp	r8, [.qnbsp]
	je	.utf16_unescape_char
.utf16_unescape_straightin:
	; add from r12 to r12+rdx+1
	mov	rdi, rbx
	mov	rsi, r12
	add	rdx, 1
	sub	r13, rdx
	shl	rdx, 1
	add	rbx, rdx
	add	r12, rdx
	call	memcpy
	test	r13, r13
	jz	.utf16_unescape_return
	; find the next ampersand
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
calign
.utf16_unescape_amp_scan:
	cmp	word [rsi+rdx*2], '&'
	je	.utf16_unescape_amp
	add	rdx, 1
	sub	rcx, 1
	jnz	.utf16_unescape_amp_scan
	; if we made it to here no ampersand was found
	test	rdx, rdx
	jz	.utf16_unescape_return
	; r13 is number of characters left:
	shl	r13, 1
	mov	rdi, rbx
	mov	rsi, r12
	mov	rdx, r13
	add	rbx, r13
	call	memcpy
	jmp	.utf16_unescape_return
calign
.utf16_unescape_codepoint:
	; dword at [rsi] == '&#'
	; word at [rsi+rdx*2] == ';'
	lea	rdi, [rsi+4]
	lea	rcx, [rdx-2]				; digit count
	cmp	rdx, 2					; if it was &#; put it straight in as-is
	je	.utf16_unescape_straightin
	cmp	word [rdi], 'x'
	je	.utf16_unescape_codepoint_base16
	xor	eax, eax
calign
.utf16_unescape_codepoint_loop:
	movzx	r8d, word [rdi]
	add	rdi, 2
	sub	r8d, '0'
	cmp	r8d, 10
	jae	.utf16_unescape_straightin
	imul	eax, eax, 10
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf16_unescape_codepoint_loop
.utf16_unescape_codepoint_ready:
	; below 0xd800 is a single utf16 unit, stored directly
	cmp	eax, 0xd800
	jb	.utf16_unescape_char
	; fall through to utf16_unescape_codepoint_encoded
.utf16_unescape_codepoint_encoded:
	; build a temporary one character string (length qword, then the character)
	; in properly reserved stack space and let string$to_utf16 encode it at rbx.
	; (it must not sit below rsp: the call itself writes there)
	sub	rsp, 16
	mov	qword [rsp], 1
	mov	qword [rsp+8], rax
	mov	rdi, rsp
	mov	rsi, rbx
	; setup values for next round:
	add	rdx, 1
	sub	r13, rdx
	shl	rdx, 1
	add	r12, rdx
	call	string$to_utf16
	add	rsp, 16
	shl	rax, 1					; returned count is scaled to bytes here
	add	rbx, rax
	test	r13, r13
	jz	.utf16_unescape_return
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf16_unescape_amp_scan
calign
.utf16_unescape_codepoint_base16:
	add	rdi, 2
	sub	rcx, 1
	jz	.utf16_unescape_straightin		; &#x;
	xor	eax, eax
calign
.utf16_unescape_codepoint_base16_loop:
	movzx	r8d, word [rdi]
	add	rdi, 2
	cmp	r8d, 'a'
	jae	.utf16_unescape_codepoint_base16_lc
	cmp	r8d, 'A'
	jae	.utf16_unescape_codepoint_base16_uc
	sub	r8d, '0'
	cmp	r8d, 10					; only '0'..'9' here (':' through '?' are not hex digits)
	jae	.utf16_unescape_straightin
.utf16_unescape_codepoint_base16_acc:
	imul	eax, eax, 16
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf16_unescape_codepoint_base16_loop
	jmp	.utf16_unescape_codepoint_ready
calign
.utf16_unescape_codepoint_base16_uc:
	sub	r8d, '0'+7				; 'A' -> 10
	cmp	r8d, 16
	jae	.utf16_unescape_straightin
	jmp	.utf16_unescape_codepoint_base16_acc
calign
.utf16_unescape_codepoint_base16_lc:
	sub	r8d, '0'+7+0x20				; 'a' -> 10
	cmp	r8d, 16
	jae	.utf16_unescape_straightin
	jmp	.utf16_unescape_codepoint_base16_acc
calign
.utf16_unescape_ampersand:
	mov	r8, [.qamp]
	mov	eax, '&'
	cmp	r8, [rsi]
	je	.utf16_unescape_char
	jmp	.utf16_unescape_straightin
calign
.utf16_unescape_ltgt:
	mov	r8, [rsi]
	mov	eax, '<'
	cmp	r8, [.qlt]
	je	.utf16_unescape_char
	mov	eax, '>'
	cmp	r8, [.qgt]
	jne	.utf16_unescape_straightin
	; fall through to utf16_unescape_char
calign
.utf16_unescape_char:
	; ax == replacement character, rdx == offset of the ; that ends the entity
	mov	[rbx], ax
	add	rdx, 1
	sub	r13, rdx
	shl	rdx, 1
	add	rbx, 2
	add	r12, rdx
	test	r13, r13
	jz	.utf16_unescape_return
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf16_unescape_amp_scan
calign
.utf16_unescape_amp:
	; if rdx is 0, no memcpy
	test	rdx, rdx
	jz	.utf16_unescape_amp_nocopy
	; otherwise, rdx characters go straight in
	mov	rdi, rbx
	mov	rsi, r12
	sub	r13, rdx
	shl	rdx, 1
	add	rbx, rdx
	add	r12, rdx
	call	memcpy
.utf16_unescape_amp_nocopy:
	; so now the word at r12 is an ampersand
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf16_unescape_semicolon_scan
calign
.utf16_unescape_buffer:
	; string is too big to do on the stack, so create a temporary buffer for it
	mov	rdx, r10
	push	rbx r12 r13 r14
	lea	rsi, [r8+rsi]				; beginning of string
	sub	rdx, rcx				; characters in front of the first ampersand
	mov	r12, rax				; pointer to first ampersand
	mov	r13, rcx				; characters left
	push	rsi rdx
	mov	rdi, r10
	shl	rdi, 1
	call	heap$alloc
	mov	r14, rax
	pop	rdx rsi
	mov	rdi, rax
	lea	rbx, [r14+rdx*2]			; rdx is in characters: write pointer goes past the copied prefix
	shl	rdx, 1
	call	memcpy
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf16_unescape_semicolon_scan

	; --------------------------------------------------------------- utf32
calign
.utf32_unescape:
	; if our text is small enough, unescape on the stack
	; otherwise, do expensive buffer construction
	cmp	r10, xmltag_unescape_stacklimit
	ja	.utf32_unescape_buffer
	mov	rdx, r10
	push	rbx r12 r13 r14
	xor	r14d, r14d				; 0 == scratch buffer lives on the stack
	sub	rsp, xmltag_unescape_stacklimit shl 2
	lea	rsi, [r8+rsi]				; beginning of string
	sub	rdx, rcx				; characters in front of the first ampersand
	mov	rdi, rsp
	mov	r12, rax				; pointer to first ampersand
	mov	r13, rcx				; characters left
	lea	rbx, [rsp+rdx*4]
	shl	rdx, 2
	call	memcpy
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
calign
.utf32_unescape_semicolon_scan:
	cmp	dword [rsi+rdx*4], ';'
	je	.utf32_unescape_semicolon
	add	rdx, 1
	sub	rcx, 1
	jnz	.utf32_unescape_semicolon_scan
	; if we made it to here, an ampersand was found but no trailing semicolon
	; despite this being an error, just return the remaining of the string as-is
	; r13 is the number of characters left not bytes:
	shl	r13, 2
	mov	rdi, rbx
	mov	rsi, r12
	mov	rdx, r13
	add	rbx, r13
	call	memcpy
.utf32_unescape_return:
	test	r14, r14
	jnz	.utf32_unescape_buffer_return
	; rbx - rsp is how many bytes we have accumulated
	mov	rdi, rsp
	mov	rsi, rbx
	sub	rsi, rsp
	call	string$from_utf32
	add	rsp, xmltag_unescape_stacklimit shl 2
	pop	r14 r13 r12 rbx
	epilog
calign
.utf32_unescape_buffer_return:
	mov	rdi, r14
	mov	rsi, rbx
	sub	rsi, r14
	call	string$from_utf32
	mov	rdi, r14
	mov	r14, rax				; keep the result across heap$free
	call	heap$free
	mov	rax, r14
	pop	r14 r13 r12 rbx
	epilog
dalign
.dqquot:
	dd	'q', 'u', 'o', 't'
.dqapos:
	dd	'a', 'p', 'o', 's'
.dqnbsp:
	dd	'n', 'b', 's', 'p'
.dqamp:
	dd	'&', 'a', 'm', 'p'
.dqlt:
	dd	'&', 'l', 't', ';'
.dqgt:
	dd	'&', 'g', 't', ';'
calign
.utf32_unescape_semicolon:
	; rdx is our length (offset in characters of the ; from the & at rsi)
	cmp	dword [rsi+4], '#'
	je	.utf32_unescape_codepoint
	cmp	rdx, 3
	je	.utf32_unescape_ltgt
	cmp	rdx, 4
	je	.utf32_unescape_ampersand
	cmp	rdx, 5
	jne	.utf32_unescape_straightin
	; quot or apos check: the four characters after the & are 16 bytes, so each
	; name is compared as two qwords (r8 == first pair, r9 == second pair)
	mov	r8, [rsi+4]
	mov	r9, [rsi+12]
	mov	eax, '"'
	cmp	r8, [.dqquot]
	jne	.utf32_unescape_notquot
	cmp	r9, [.dqquot+8]
	je	.utf32_unescape_char
.utf32_unescape_notquot:
	mov	eax, 0x27
	cmp	r8, [.dqapos]
	jne	.utf32_unescape_notapos
	cmp	r9, [.dqapos+8]
	je	.utf32_unescape_char
.utf32_unescape_notapos:
	mov	eax, 0xa0
	cmp	r8, [.dqnbsp]
	jne	.utf32_unescape_straightin
	cmp	r9, [.dqnbsp+8]
	je	.utf32_unescape_char
.utf32_unescape_straightin:
	; add from r12 to r12+rdx+1
	mov	rdi, rbx
	mov	rsi, r12
	add	rdx, 1
	sub	r13, rdx
	shl	rdx, 2
	add	rbx, rdx
	add	r12, rdx
	call	memcpy
	test	r13, r13
	jz	.utf32_unescape_return
	; find the next ampersand
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
calign
.utf32_unescape_amp_scan:
	cmp	dword [rsi+rdx*4], '&'
	je	.utf32_unescape_amp
	add	rdx, 1
	sub	rcx, 1
	jnz	.utf32_unescape_amp_scan
	; if we made it to here no ampersand was found
	test	rdx, rdx
	jz	.utf32_unescape_return
	; r13 is number of characters left:
	shl	r13, 2
	mov	rdi, rbx
	mov	rsi, r12
	mov	rdx, r13
	add	rbx, r13
	call	memcpy
	jmp	.utf32_unescape_return
calign
.utf32_unescape_codepoint:
	; qword at [rsi] == '&#'
	; dword at [rsi+rdx*4] == ';'
	lea	rdi, [rsi+8]
	lea	rcx, [rdx-2]				; digit count
	cmp	rdx, 2					; if it was &#; put it straight in as-is
	je	.utf32_unescape_straightin
	cmp	dword [rdi], 'x'
	je	.utf32_unescape_codepoint_base16
	xor	eax, eax
calign
.utf32_unescape_codepoint_loop:
	mov	r8d, dword [rdi]
	add	rdi, 4
	sub	r8d, '0'
	cmp	r8d, 10
	jae	.utf32_unescape_straightin
	imul	eax, eax, 10
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf32_unescape_codepoint_loop
	; every code point is a single utf32 unit, so it is stored like any entity
	jmp	.utf32_unescape_codepoint_ready
calign
.utf32_unescape_codepoint_base16:
	add	rdi, 4
	sub	rcx, 1
	jz	.utf32_unescape_straightin		; &#x;
	xor	eax, eax
calign
.utf32_unescape_codepoint_base16_loop:
	mov	r8d, dword [rdi]
	add	rdi, 4
	cmp	r8d, 'a'
	jae	.utf32_unescape_codepoint_base16_lc
	cmp	r8d, 'A'
	jae	.utf32_unescape_codepoint_base16_uc
	sub	r8d, '0'
	cmp	r8d, 10					; only '0'..'9' here (':' through '?' are not hex digits)
	jae	.utf32_unescape_straightin
.utf32_unescape_codepoint_base16_acc:
	imul	eax, eax, 16
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf32_unescape_codepoint_base16_loop
	jmp	.utf32_unescape_codepoint_ready
calign
.utf32_unescape_codepoint_base16_uc:
	sub	r8d, '0'+7				; 'A' -> 10
	cmp	r8d, 16
	jae	.utf32_unescape_straightin
	jmp	.utf32_unescape_codepoint_base16_acc
calign
.utf32_unescape_codepoint_base16_lc:
	sub	r8d, '0'+7+0x20				; 'a' -> 10
	cmp	r8d, 16
	jae	.utf32_unescape_straightin
	jmp	.utf32_unescape_codepoint_base16_acc
calign
.utf32_unescape_ampersand:
	; '&amp' is 16 bytes at rsi, compared as two qwords
	mov	r8, [.dqamp]
	mov	r9, [.dqamp+8]
	mov	eax, '&'
	cmp	r8, [rsi]
	jne	.utf32_unescape_straightin
	cmp	r9, [rsi+8]
	je	.utf32_unescape_char
	jmp	.utf32_unescape_straightin
calign
.utf32_unescape_ltgt:
	; the whole entity including its ; is 16 bytes at rsi, compared as two qwords
	mov	r8, [rsi]
	mov	r9, [rsi+8]
	mov	eax, '<'
	cmp	r8, [.dqlt]
	jne	.utf32_unescape_notlt
	cmp	r9, [.dqlt+8]
	je	.utf32_unescape_char
.utf32_unescape_notlt:
	mov	eax, '>'
	cmp	r8, [.dqgt]
	jne	.utf32_unescape_straightin
	cmp	r9, [.dqgt+8]
	jne	.utf32_unescape_straightin
	; fall through to utf32_unescape_char
calign
.utf32_unescape_codepoint_ready:
.utf32_unescape_char:
	; eax == replacement character, rdx == offset of the ; that ends the entity
	mov	[rbx], eax
	add	rdx, 1
	sub	r13, rdx
	shl	rdx, 2
	add	rbx, 4
	add	r12, rdx
	test	r13, r13
	jz	.utf32_unescape_return
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf32_unescape_amp_scan
calign
.utf32_unescape_amp:
	; if rdx is 0, no memcpy
	test	rdx, rdx
	jz	.utf32_unescape_amp_nocopy
	; otherwise, rdx characters go straight in
	mov	rdi, rbx
	mov	rsi, r12
	sub	r13, rdx
	shl	rdx, 2
	add	rbx, rdx
	add	r12, rdx
	call	memcpy
.utf32_unescape_amp_nocopy:
	; so now the dword at r12 is an ampersand
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf32_unescape_semicolon_scan
calign
.utf32_unescape_buffer:
	; string is too big to do on the stack, so create a temporary buffer for it
	mov	rdx, r10
	push	rbx r12 r13 r14
	lea	rsi, [r8+rsi]				; beginning of string
	sub	rdx, rcx				; characters in front of the first ampersand
	mov	r12, rax				; pointer to first ampersand
	mov	r13, rcx				; characters left
	push	rsi rdx
	mov	rdi, r10
	shl	rdi, 2
	call	heap$alloc
	mov	r14, rax
	pop	rdx rsi
	mov	rdi, rax
	lea	rbx, [r14+rdx*4]			; rdx is in characters: write pointer goes past the copied prefix
	shl	rdx, 2
	call	memcpy
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf32_unescape_semicolon_scan

end if


if used xmltag$unescape_string | defined include_everything
	; single argument in rdi: string to unescape
	; returns a new heap$alloc'd string of unescaped text
	; NOTE: we cheat a bit here and use the xmltag$unescape version to do the dirty work
	; (a partial xmltag is built on the stack with only base and width filled in,
	; pointing at the string's characters, and the whole string is passed as the
	; range 0..length)
falign
xmltag$unescape_string:
	prolog	xmltag$unescape_string
	lea	rcx, [rdi+8]				; first character of the string
	xor	esi, esi				; start offset 0
	mov	rdx, [rdi]				; end offset == length
	sub	rsp, xmltag_attributes_ofs		; we don't need a full xmltag for this
	mov	[rsp+xmltag_base_ofs], rcx
if string_bits = 32
	mov	dword [rsp+xmltag_width_ofs], xmlparser_utf32
else
	mov	dword [rsp+xmltag_width_ofs], xmlparser_utf16
end if
	mov	rdi, rsp
	call	xmltag$unescape
	add	rsp, xmltag_attributes_ofs
	epilog

end if


if used xmltag$text | defined include_everything
	; single argument in rdi: an xmltag object
	; returns a new heap$alloc'd string
; of the text, possibly unescaped
	; (text nodes go through xmltag$unescape, every other node type is converted
	; verbatim; a zero length text range gives string$new's result either way)
falign
xmltag$text:
	prolog	xmltag$text
	mov	rsi, [rdi+xmltag_textstart_ofs]
	mov	rdx, [rdi+xmltag_textend_ofs]
	cmp	rsi, rdx
	je	.nothing
	cmp	dword [rdi+xmltag_nodetype_ofs], xmltag_textnode
	jne	.verbatim
	; rdi/rsi/rdx are already the tag, start offset and end offset
	call	xmltag$unescape
	epilog
calign
.nothing:
	call	string$new
	epilog
calign
.verbatim:
	mov	eax, [rdi+xmltag_width_ofs]
	mov	rdi, [rdi+xmltag_base_ofs]
	sub	rdx, rsi				; rdx == character count
	jmp	qword [rax*8+.verbatim_by_width]
dalign
.verbatim_by_width:
	dq	.verbatim_utf8, .verbatim_utf16, .verbatim_utf32
calign
.verbatim_utf8:
	add	rdi, rsi
	mov	rsi, rdx
	call	string$from_utf8
	epilog
calign
.verbatim_utf16:
	lea	rdi, [rdi+rsi*2]
	lea	rsi, [rdx*2]
	call	string$from_utf16
	epilog
calign
.verbatim_utf32:
	lea	rdi, [rdi+rsi*4]
	lea	rsi, [rdx*4]
	call	string$from_utf32
	epilog

end if


if used xmltag$debug | defined include_everything
	; single argument in rdi: an xmltag object
	; writes a one line description of the tag to stdout: the node type label,
	; the tag text, and for elements the empty flag, attribute count and each
	; attribute as name="value"
	;
	; held for the whole function: rbx == base pointer, r12 == the xmltag,
	; r13 == character width (selects the converter inside .textout)
falign
xmltag$debug:
	prolog	xmltag$debug
	push	rbx r12 r13
	mov	r12, rdi
	mov	eax, [rdi+xmltag_nodetype_ofs]
	mov	rbx, [rdi+xmltag_base_ofs]
	mov	r13d, [rdi+xmltag_width_ofs]
	jmp	qword [rax*8+.by_nodetype]
dalign
.by_nodetype:
	dq	.show_notype, .show_element, .show_textnode, .show_cdata, .show_pi, .show_comment, .show_doctype, .show_xmldecl
cleartext .tagtype_notype, '(no type) (no output as a result)'
cleartext .tagtype_element, 'Element: '
cleartext .tagtype_textnode, 'Text: '
cleartext .tagtype_cdata, 'CData: '
cleartext .tagtype_processinginstruction, 'ProcessingInstruction: '
cleartext .tagtype_comment, 'Comment: '
cleartext .tagtype_doctype, 'DOCTYPE: '
cleartext .tagtype_xmldecl, 'XMLDeclaration: '
cleartext .space, ' '
cleartext .empty, ' Empty? '
cleartext .true, 'true'
cleartext .false, 'false'
cleartext .attrs, ' attrcount: '
cleartext .equalquote, '="'
cleartext .quote, '"'
cleartext .lf, 10
calign
.show_notype:
	mov	rdi, .tagtype_notype
	call	string$to_stdoutln
	pop	r13 r12 rbx
	epilog
	; the six node types below differ only in their label
calign
.show_textnode:
	mov	rdi, .tagtype_textnode
	jmp	.show_labelled
calign
.show_cdata:
	mov	rdi, .tagtype_cdata
	jmp	.show_labelled
calign
.show_pi:
	mov	rdi, .tagtype_processinginstruction
	jmp	.show_labelled
calign
.show_comment:
	mov	rdi, .tagtype_comment
	jmp	.show_labelled
calign
.show_doctype:
	mov	rdi, .tagtype_doctype
	jmp	.show_labelled
calign
.show_xmldecl:
	mov	rdi, .tagtype_xmldecl
	; fall through to .show_labelled
calign
.show_labelled:
	; rdi == label: label, tag text, linefeed
	call	string$to_stdout
	mov	rdi, [r12+xmltag_textstart_ofs]
	mov	rsi, [r12+xmltag_textend_ofs]
	call	.textout
.finish_line:
	mov	rdi, .lf
	call	string$to_stdout
	pop	r13 r12 rbx
	epilog
calign
.show_element:
	mov	rdi, .tagtype_element
	call	string$to_stdout
	mov	rdi, [r12+xmltag_textstart_ofs]
	mov	rsi, [r12+xmltag_textend_ofs]
	call	.textout
	mov	rdi, .empty
	call	string$to_stdout
	; 'false' unless the empty flag is nonzero
	mov	rdi, .false
	mov	rax, .true
	cmp	dword [r12+xmltag_empty_ofs], 0
	cmovne	rdi, rax
	call	string$to_stdout
	mov	rdi, .attrs
	call	string$to_stdout
	; attribute count in decimal, via a temporary string
	mov	edi, [r12+xmltag_attrcount_ofs]
	mov	esi, 10
	call	string$from_unsigned
	push	rax
	mov	rdi, rax
	call	string$to_stdout
	pop	rdi
	call	heap$free
	cmp	dword [r12+xmltag_attrcount_ofs], 0
	je	.finish_line
	push	r14
	xor	r14d, r14d				; r14d == attribute index
calign
.each_attr:
	mov	rdi, .space
	call	string$to_stdout
	mov	esi, r14d
	mov	rdi, r12
	call	xmltag$getattr
	push	rax					; keep the attribute pointer across the name output
	; attribute name first:
	mov	rsi, [rax+xmltagattr_nameend_ofs]
	mov	rdi, [rax+xmltagattr_namestart_ofs]
	call	.textout
	mov	rdi, .equalquote
	call	string$to_stdout
	pop	rax
	; value
	mov	rsi, [rax+xmltagattr_valueend_ofs]
	mov	rdi, [rax+xmltagattr_valuestart_ofs]
	call	.textout
	mov	rdi, .quote
	call	string$to_stdout
	add	r14d, 1
	cmp	r14d, dword [r12+xmltag_attrcount_ofs]
	jb	.each_attr
	pop	r14
	jmp	.finish_line

	; local helper: rdi == start offset, rsi == end offset (both in characters)
	; converts that range of the document to a string, writes it to stdout and
	; frees the string again
falign
.textout:
	jmp	qword [r13*8+.textout_by_width]
dalign
.textout_by_width:
	dq	.textout8, .textout16, .textout32
calign
.textout8:
	sub	rsi, rdi
	add	rdi, rbx
	call	string$from_utf8
	jmp	.textout_emit
calign
.textout16:
	sub	rsi, rdi
	lea	rdi, [rbx+rdi*2]
	add	rsi, rsi				; characters to bytes
	call	string$from_utf16
	jmp	.textout_emit
calign
.textout32:
	sub	rsi, rdi
	lea	rdi, [rbx+rdi*4]
	shl	rsi, 2					; characters to bytes
	call	string$from_utf32
	; fall through to .textout_emit
.textout_emit:
	push	rax
	mov	rdi, rax
	call	string$to_stdout
	pop	rdi
	call	heap$free
	ret

end if


if used xmlparser$new | used xmlparser$new_string | used xmlparser$init | used xmlparser$init_string | defined include_everything

; defines for the character width (as passed to xmlparser$new)
xmlparser_utf8			= 0
xmlparser_utf16			= 1
xmlparser_utf32			= 2

; defines for flags (as passed to xmlparser$new)
xmlparser_ignorewhite		= 1
xmlparser_condensewhite		= 2

; defines for the return of xmlparser$next and xmlparser$prev
xmlparser_noerror		= 0
xmlparser_endofdocument		= 1
xmlparser_unterminatedcdatasection	= 2
xmlparser_unterminatedxmldeclaration =
3 xmlparser_unterminateddoctypedeclaration = 4 xmlparser_unterminatedcomment = 5 xmlparser_malformedelement = 6 xmlparser_unterminatedattributevalue = 7 xmlparser_unterminatedelement = 8 xmlparser_unterminatedprocessinginstruction = 9 xmlparser_tagattributeoverflow = 10 xmlparser_badqname = 11 xmlparser_prefixnotbound = 12 xmlparser_duplicateattribute = 13 ; xmlparser object itself: xmlparser_base_ofs = 0 ; pointer to xml xmlparser_size_ofs = 8 ; size in characters (not bytes) xmlparser_pos_ofs = 16 ; our current position (in characters not bytes) xmlparser_end_ofs = 24 ; our end (might be size, or less if whitespace trunc, etc) xmlparser_flags_ofs = 32 ; dword flags xmlparser_width_ofs = 36 ; dword character width xmlparser_size = 40 end if if used xmlparser$new | defined include_everything ; four arguments: rdi == ptr to xml, rsi == length (in characters) of same, edx == character width, ecx == flags ; returns new xmlparser object in rax falign xmlparser$new: prolog xmlparser$new push rdi rsi rdx rcx mov edi, xmlparser_size call heap$alloc xor r8d, r8d pop rcx rdx rsi rdi mov [rax+xmlparser_base_ofs], rdi mov [rax+xmlparser_size_ofs], rsi mov [rax+xmlparser_pos_ofs], r8 mov [rax+xmlparser_end_ofs], rsi mov [rax+xmlparser_flags_ofs], ecx mov [rax+xmlparser_width_ofs], edx epilog end if if used xmlparser$new_string | defined include_everything ; two arguments: rdi == string (native library string), esi == flags ; returns new xmlparser object in rax falign xmlparser$new_string: prolog xmlparser$new_string push rdi rsi mov edi, xmlparser_size call heap$alloc xor r8d, r8d pop rsi rdi mov rcx, [rdi] lea rdx, [rdi+8] mov [rax+xmlparser_base_ofs], rdx mov [rax+xmlparser_size_ofs], rcx mov [rax+xmlparser_pos_ofs], r8 mov [rax+xmlparser_end_ofs], rcx mov [rax+xmlparser_flags_ofs], esi if string_bits = 32 mov dword [rax+xmlparser_width_ofs], xmlparser_utf32 else mov dword [rax+xmlparser_width_ofs], xmlparser_utf16 end if epilog end if if used xmlparser$init | defined 
include_everything ; five arguments: rdi == xmlparser object to init, rsi == ptr to xml, rdx == length (in chars) of same, ecx == char width, r8d == flags falign xmlparser$init: prolog xmlparser$init mov rax, rdi xor r9d, r9d mov [rdi+xmlparser_base_ofs], rsi mov [rdi+xmlparser_size_ofs], rdx mov [rdi+xmlparser_pos_ofs], r9 mov [rdi+xmlparser_end_ofs], rdx mov [rdi+xmlparser_flags_ofs], r8d mov [rdi+xmlparser_width_ofs], ecx epilog end if if used xmlparser$init_string | defined include_everything ; three arguments: rdi == xmlparser object, rsi == string (native library string), edx == flags falign xmlparser$init_string: prolog xmlparser$init_string lea rcx, [rsi+8] mov r8, [rsi] xor r9d, r9d mov rax, rdi mov [rdi+xmlparser_base_ofs], rcx mov [rdi+xmlparser_size_ofs], r8 mov [rdi+xmlparser_pos_ofs], r9 mov [rdi+xmlparser_end_ofs], r8 mov [rdi+xmlparser_flags_ofs], edx if string_bits = 32 mov dword [rdi+xmlparser_width_ofs], xmlparser_utf32 else mov dword [rdi+xmlparser_width_ofs], xmlparser_utf16 end if epilog end if if used xmlparser$errortext | defined include_everything ; single argument in edi: one of the xmlparser_ numeric return values ; returns a static string (not heap$alloc'd) falign xmlparser$errortext: prolog xmlparser$errortext mov rax, [rdi*8+.dispatch] epilog cleartext .e0, 'No Error' cleartext .e1, 'End of Document' cleartext .e2, 'Unterminated CDATA Section' cleartext .e3, 'Unterminated XML Declaration' cleartext .e4, 'Unterminated DOCTYPE Declaration' cleartext .e5, 'Unterminated Comment' cleartext .e6, 'Malformed Element' cleartext .e7, 'Unterminated Attribute Value' cleartext .e8, 'Unterminated Element' cleartext .e9, 'Unterminated Processing Instruction' cleartext .e10, 'Tag Attribute Count Overflow' cleartext .e11, 'Bad QName' cleartext .e12, 'Prefix Not Bound' cleartext .e13, 'Duplicate Attribute' dalign .dispatch: dq .e0, .e1, .e2, .e3, .e4, .e5, .e6, .e7, .e8, .e9, .e10, .e11, .e12, .e13 end if if used xmlparser$next | defined 
include_everything ; two arguments: rdi == xmlparser object, rsi == xmltag object (we'll call reset on it first) ; returns one of the above xmlparser_ values in eax falign xmlparser$next: prolog xmlparser$next mov eax, [rdi+xmlparser_width_ofs] mov ecx, [rdi+xmlparser_flags_ofs] push rbx rdi mov rbx, rsi mov rsi, [rdi+xmlparser_pos_ofs] mov r10, [rdi+xmlparser_end_ofs] mov rdi, [rdi+xmlparser_base_ofs] ; copy the base and width to the tag itself: sub r10, rsi mov [rbx+xmltag_base_ofs], rdi mov dword [rbx+xmltag_width_ofs], eax jmp qword [rax*8+.dispatch] dalign .dispatch: dq .utf8, .utf16, .utf32 falign .utf8: test r10, r10 jz .utf8_endofdoc test ecx, xmlparser_ignorewhite jz .utf8_noskipwhite ; otherwise, skip whitespace and check for eod calign .utf8_skipwhite: movzx ecx, byte [rdi+rsi] mov r8d, 1 cmp ecx, 32 ja .utf8_noskipwhite sub ecx, 1 shl r8d, cl test r8d, 2147488512 jz .utf8_noskipwhite ; otherwise, we hit a 32, 9, 10, or 13 add rsi, 1 sub r10, 1 jnz .utf8_skipwhite ; fallthrough to end of document calign .utf8_endofdoc: ; save our position pop rdi rbx mov [rdi+xmlparser_pos_ofs], rsi mov eax, xmlparser_endofdocument epilog calign .utf8_noskipwhite: ; reset our xmltag (no need to call xmltag$reset, inline here is fine) xor ecx, ecx mov edx, -1 movzx eax, byte [rdi+rsi] mov [rbx+xmltag_textstart_ofs], rcx mov [rbx+xmltag_textend_ofs], rcx mov [rbx+xmltag_textcolon_ofs], rdx ; writes 0 into empty too cmp eax, '<' mov [rbx+xmltag_nodetype_ofs], rcx ; writes over attrcount too mov [rbx+xmltag_realstart_ofs], rsi jne .utf8_textnode cmp r10, 6 jb .utf8_notcdata cmp dword [rdi+rsi], ' je .utf8_maybexmldecl .utf8_notxmldecl: cmp dword [rdi+rsi], ' je .utf8_maybedoctype .utf8_notdoctype: cmp dword [rdi+rsi], ' je .utf8_maybecdata .utf8_notcdata: cmp r10, 2 jb .utf8_element cmp word [rdi+rsi], ' je .utf8_pi cmp r10, 4 jb .utf8_element cmp dword [rdi+rsi], '