; ------------------------------------------------------------------------
; HeavyThing x86_64 assembly language library and showcase programs
; Copyright © 2015-2018 2 Ton Digital
; Homepage: https://2ton.com.au/
; Author: Jeff Marrison <jeff@2ton.com.au>
;
; This file is part of the HeavyThing library.
;
; HeavyThing is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License, or
; (at your option) any later version.
;
; HeavyThing is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License along
; with the HeavyThing library. If not, see <http://www.gnu.org/licenses/>.
; ------------------------------------------------------------------------
; xmlparser.inc: A very fast XML parser/navigator that will handle and
; parse UTF8, UTF16, and UTF32 (passed as a parameter to xmlparser$new)
; Optionally, xmlparser$new_string will take a native string as well.
;
; This is a "streaming parser" that allows forward and backward movement
; through an XML document/fragment (see xmlparser$next and xmlparser$prev)
; (pull style).
;
; Update v1.22+: xmlparser$prev now behaves itself correctly. Previously
; it choked on markup whose content contains a literal > (for example
; <![CDATA[>>>>>]]> situations), thanks to it paying attention to > and
; nesting levels, etc. The new version correctly deals with these and now
; sanely parses them. Additionally, we also deal with unescaped > in
; text/attrs (messy haha).
;
; Since a pointer, length and character width are passed to xmlparser$new,
; we do not create actual library strings for each, and instead keep
; start and end offsets for everywhere would-be strings are, and present
; convenience functions to create library strings from them for when they
; are desired. (this allows heap$alloc-free parsing/traversal speed)
;
if used xmltag$new | used xmltag$reset | defined include_everything
; this define sets the upper bound on the maximum number of attributes permitted
; our default here is pretty reasonable, it is rare for an xml element to contain
; more than 32 attributes ... if you encounter the xmlparser_tagattributeoverflow
; error return from the xmlparser, increase this value...
xmltag_maxattr = 32
; when unescaping strings, this setting determines max size to use for stack-based
; unescape temporary buffer space (note this size is in characters, not bytes)
xmltag_unescape_stacklimit = 16384
; defines for possible tag types
xmltag_notype = 0
xmltag_element = 1
xmltag_textnode = 2
xmltag_cdata = 3
xmltag_processinginstruction = 4
xmltag_comment = 5
xmltag_doctypedeclaration = 6
xmltag_xmldeclaration = 7
; each individual attribute contains:
xmltagattr_namestart_ofs = 0 ; offset to start of the name
xmltagattr_nameend_ofs = 8 ; offset to end of the name
xmltagattr_valuestart_ofs = 16 ; offset to start of value
xmltagattr_valueend_ofs = 24 ; offset to end of value
xmltagattr_namecolon_ofs = 32 ; dword offset (relative to nameend-namestart) of colon in the attribute name (-1 otherwise)
xmltagattr_size = 40 ; wasted 4 bytes each
; and our xmltag itself needs:
xmltag_textstart_ofs = 0 ; offset to start of tag text
xmltag_textend_ofs = 8 ; offset to end of tag text
xmltag_textcolon_ofs = 16 ; dword offset (relative to textend-textstart) of colon in the tag name (-1 otherwise)
xmltag_empty_ofs = 20 ; dword bool whether tag is empty or not (think of a self-closing tag)
xmltag_nodetype_ofs = 24 ; dword one of the 8 nodetypes listed above
xmltag_attrcount_ofs = 28 ; dword count of how many attributes we have
xmltag_realstart_ofs = 32 ; offset to the real start of the tag
xmltag_realend_ofs = 40 ; offset to the real end of the tag
xmltag_base_ofs = 48 ; copy of the base pointer from the xmlparser
xmltag_width_ofs = 56 ; copy of the width setting from the xmlparser (wasting 4 bytes)
xmltag_attributes_ofs = 64 ; xmltag_maxattr * xmltagattr_size bytes
xmltag_size = xmltag_attributes_ofs + (xmltag_maxattr * xmltagattr_size)
; xmltag$new: no arguments
; heap$alloc's xmltag_size bytes and initialises the text offsets, colon position,
; empty flag, node type and attribute count (the same fields xmltag$reset touches)
; returns the new xmltag object in rax
; NOTE: an xmltag can of course also be stack allocated, but you should call
; xmltag$reset on it before use
falign
xmltag$new:
	prolog	xmltag$new
	mov	edi, xmltag_size
	call	heap$alloc
	mov	edx, -1					; 32 bit write: rdx == 0x00000000ffffffff
	xor	ecx, ecx
	mov	[rax+xmltag_nodetype_ofs], rcx		; nodetype = xmltag_notype, and attrcount = 0
	mov	[rax+xmltag_textcolon_ofs], rdx		; textcolon = -1, and empty = 0
	mov	[rax+xmltag_textend_ofs], rcx
	mov	[rax+xmltag_textstart_ofs], rcx
	epilog
end if
if used xmltag$destroy | defined include_everything
; xmltag$destroy: release an xmltag object
; placeholder function only really, just does a heap$free
; single argument in rdi: an xmltag object
; (rdi is passed to heap$free untouched, so presumably this is only meant for
; objects that came from xmltag$new and not for stack allocated ones)
falign
xmltag$destroy:
prolog xmltag$destroy
call heap$free
epilog
end if
if used xmltag$reset | defined include_everything
; xmltag$reset: put an existing xmltag back into its just-created state
; single argument in rdi: an xmltag object to reset
; clears the text offsets, sets the colon position to -1, clears the empty flag,
; sets the node type to xmltag_notype and the attribute count to 0
falign
xmltag$reset:
	prolog	xmltag$reset
	mov	edx, -1					; 32 bit write: rdx == 0x00000000ffffffff
	xor	ecx, ecx
	mov	[rdi+xmltag_nodetype_ofs], rcx		; nodetype = xmltag_notype, and attrcount = 0
	mov	[rdi+xmltag_textcolon_ofs], rdx		; textcolon = -1, and empty = 0
	mov	[rdi+xmltag_textend_ofs], rcx
	mov	[rdi+xmltag_textstart_ofs], rcx
	epilog
end if
if used xmltag$newattr | defined include_everything
; xmltag$newattr: append one attribute record to an xmltag
; six arguments: rdi == xmltag object, rsi == namestart, rdx == nameend, ecx == colonpos, r8 == valuestart, r9 == valueend
; returns bool in eax as to whether we succeeded or not (0 == maxattr reached)
; (leaves rdx as a pointer to the xmltagattr structure if eax = 1)
falign
xmltag$newattr:
	prolog	xmltag$newattr
	mov	eax, [rdi+xmltag_attrcount_ofs]
	cmp	eax, xmltag_maxattr
	jne	.haveroom
	; every slot is taken
	xor	eax, eax
	epilog
.haveroom:
	; the new record lives at rdi + xmltag_attributes_ofs + (old count * xmltagattr_size)
	imul	eax, eax, xmltagattr_size
	add	dword [rdi+xmltag_attrcount_ofs], 1
	lea	rax, [rdi+rax+xmltag_attributes_ofs]
	mov	[rax+xmltagattr_namecolon_ofs], rcx	; qword store, spills into the unused tail of the record
	mov	[rax+xmltagattr_valueend_ofs], r9
	mov	[rax+xmltagattr_valuestart_ofs], r8
	mov	[rax+xmltagattr_nameend_ofs], rdx
	mov	[rax+xmltagattr_namestart_ofs], rsi
	mov	rdx, rax				; hand the record pointer back in rdx
	mov	eax, 1
	epilog
end if
if used xmltag$getattr | defined include_everything
; xmltag$getattr: look up an attribute record by index
; two arguments: rdi == xmltag object, esi == attribute # to get
; returns pointer to xmltagattr structure for esi or 0 on error
; (error == index is not below the tag's attribute count, compared unsigned)
falign
xmltag$getattr:
	prolog	xmltag$getattr
	cmp	esi, [rdi+xmltag_attrcount_ofs]
	jb	.inrange
	xor	eax, eax
	epilog
.inrange:
	imul	esi, esi, xmltagattr_size		; rsi == byte offset of the record
	lea	rax, [rdi+rsi+xmltag_attributes_ofs]
	epilog
end if
if used xmltag$escape_string | defined include_everything
; xmltag$escape_string: XML-escape a native string
; single argument in rdi: string to escape (qword character count at [rdi],
; characters of string_bits width starting at [rdi+8])
; returns a new heap$alloc'd string of escaped text
; note that this is only for the 5 lt, gt, amp, apos, quot
; an empty input string returns whatever string$new gives us
falign
xmltag$escape_string:
	prolog	xmltag$escape_string
	mov	rcx, [rdi]
	; both passes below are do-while loops, so a zero length must never reach
	; them (the counter would wrap and we would run off the end of the string)
	test	rcx, rcx
	jz	.emptystring
	lea	rsi, [rdi+8]
	xor	edx, edx
	; determine the size we need first
	; r9/r10/r11 == output lengths of the 4, 5 and 6 character entities
	mov	r9d, 4
	mov	r10d, 5
	mov	r11d, 6
	calign
.firstpass:
if string_bits = 32
	mov	eax, [rsi]
	add	rsi, 4
else
	movzx	eax, word [rsi]
	add	rsi, 2
end if
	mov	r8d, 1			; an ordinary character costs one output character
	cmp	eax, '<'
	cmove	r8d, r9d
	cmp	eax, '>'
	cmove	r8d, r9d
	cmp	eax, '&'
	cmove	r8d, r10d
	cmp	eax, 0x27
	cmove	r8d, r11d
	cmp	eax, '"'
	cmove	r8d, r11d
	add	rdx, r8
	sub	rcx, 1
	jnz	.firstpass
	; so now rdx has the new string length we are after
	push	rdx
if string_bits = 32
	shl	rdx, 2
else
	shl	rdx, 1
end if
	push	rdi
	add	rdx, 8			; room for the length preface
	mov	rdi, rdx
	call	heap$alloc
	pop	rsi			; rsi == source string
	mov	rdi, rax		; rax stays untouched from here on: it is our return
	mov	rcx, [rsi]
	add	rsi, 8
	pop	rdx			; rdx == escaped length in characters
	mov	[rdi], rdx
	add	rdi, 8
	calign
.secondpass:
if string_bits = 32
	mov	edx, [rsi]
	add	rsi, 4
else
	movzx	edx, word [rsi]
	add	rsi, 2
end if
	cmp	edx, '<'
	je	.lt
	cmp	edx, '>'
	je	.gt
	cmp	edx, '&'
	je	.amp
	cmp	edx, 0x27
	je	.apos
	cmp	edx, '"'
	je	.quot
	; nothing special, copy it across
if string_bits = 32
	mov	[rdi], edx
	add	rdi, 4
else
	mov	[rdi], dx
	add	rdi, 2
end if
.advance:
	sub	rcx, 1
	jnz	.secondpass
	epilog
	calign
.emptystring:
	call	string$new
	epilog
	calign
.lt:
if string_bits = 32
	mov	dword [rdi], '&'
	mov	dword [rdi+4], 'l'
	mov	dword [rdi+8], 't'
	mov	dword [rdi+12], ';'
	add	rdi, 16
else
	mov	word [rdi], '&'
	mov	word [rdi+2], 'l'
	mov	word [rdi+4], 't'
	mov	word [rdi+6], ';'
	add	rdi, 8
end if
	jmp	.advance
	calign
.gt:
if string_bits = 32
	mov	dword [rdi], '&'
	mov	dword [rdi+4], 'g'
	mov	dword [rdi+8], 't'
	mov	dword [rdi+12], ';'
	add	rdi, 16
else
	mov	word [rdi], '&'
	mov	word [rdi+2], 'g'
	mov	word [rdi+4], 't'
	mov	word [rdi+6], ';'
	add	rdi, 8
end if
	jmp	.advance
	calign
.amp:
if string_bits = 32
	mov	dword [rdi], '&'
	mov	dword [rdi+4], 'a'
	mov	dword [rdi+8], 'm'
	mov	dword [rdi+12], 'p'
	mov	dword [rdi+16], ';'
	add	rdi, 20
else
	mov	word [rdi], '&'
	mov	word [rdi+2], 'a'
	mov	word [rdi+4], 'm'
	mov	word [rdi+6], 'p'
	mov	word [rdi+8], ';'
	add	rdi, 10
end if
	jmp	.advance
	calign
.apos:
if string_bits = 32
	mov	dword [rdi], '&'
	mov	dword [rdi+4], 'a'
	mov	dword [rdi+8], 'p'
	mov	dword [rdi+12], 'o'
	mov	dword [rdi+16], 's'
	mov	dword [rdi+20], ';'
	add	rdi, 24
else
	mov	word [rdi], '&'
	mov	word [rdi+2], 'a'
	mov	word [rdi+4], 'p'
	mov	word [rdi+6], 'o'
	mov	word [rdi+8], 's'
	mov	word [rdi+10], ';'
	add	rdi, 12
end if
	jmp	.advance
	calign
.quot:
if string_bits = 32
	mov	dword [rdi], '&'
	mov	dword [rdi+4], 'q'
	mov	dword [rdi+8], 'u'
	mov	dword [rdi+12], 'o'
	mov	dword [rdi+16], 't'
	mov	dword [rdi+20], ';'
	add	rdi, 24
else
	mov	word [rdi], '&'
	mov	word [rdi+2], 'q'
	mov	word [rdi+4], 'u'
	mov	word [rdi+6], 'o'
	mov	word [rdi+8], 't'
	mov	word [rdi+10], ';'
	add	rdi, 12
end if
	jmp	.advance
end if
if used xmltag$unescape | defined include_everything
; xmltag$unescape: build a native string from a span of the source text, decoding
; entity references on the way
; three arguments: rdi == xmltag object, rsi == text start offset, rdx == text end offset
; (offsets are in characters relative to the tag's base pointer; the tag's width
; field picks the worker: 0 == utf8, 1 == utf16, 2 == utf32)
; returns a new heap$alloc'd string of unescaped text
;
; recognised: &lt; &gt; &amp; &quot; &apos; &nbsp; plus numeric &#NNN; and &#xHHH;
; anything else between an ampersand and the next semicolon is copied through as-is,
; as is a trailing ampersand that has no semicolon after it
;
; register roles inside the .utfN_unescape workers:
;   rbx == output write pointer
;   r12 == input read pointer
;   r13 == input characters left
;   r14 == heap temporary buffer, or 0 when the output is being built at rsp
falign
xmltag$unescape:
	prolog	xmltag$unescape
	mov	r8, [rdi+xmltag_base_ofs]
	mov	r9d, [rdi+xmltag_width_ofs]
	mov	rcx, rdx
	lea	r10, [rsi*2]
	lea	r11, [rsi*4]
	sub	rcx, rsi			; rcx == character count
	jz	.emptystring
	cmp	r9d, 1
	cmove	rsi, r10
	cmp	r9d, 2
	cmove	rsi, r11			; rsi == byte offset of the first character
	mov	r10, rcx			; r10 == total character count
	lea	rax, [r8+rsi]			; rax == prescan pointer
	jmp	qword [r9*8+.dispatch]
	dalign
.dispatch:
	dq	.utf8, .utf16, .utf32
	calign
.emptystring:
	call	string$new
	epilog
	calign
.utf8:
	; prescan the string searching for ampersand
	cmp	byte [rax], '&'
	je	.utf8_unescape
	add	rax, 1
	sub	rcx, 1
	jnz	.utf8
	; if we made it here, no ampersands were found
	lea	rdi, [r8+rsi]
	mov	rsi, r10
	call	string$from_utf8
	epilog
	calign
.utf16:
	; prescan the string searching for ampersand
	cmp	word [rax], '&'
	je	.utf16_unescape
	add	rax, 2
	sub	rcx, 1
	jnz	.utf16
	; if we made it here, no ampersands were found
	lea	rdi, [r8+rsi]
	lea	rsi, [r10*2]
	call	string$from_utf16
	epilog
	calign
.utf32:
	; prescan the string searching for ampersand
	cmp	dword [rax], '&'
	je	.utf32_unescape
	add	rax, 4
	sub	rcx, 1
	jnz	.utf32
	; if we made it here, no ampersands were found
	lea	rdi, [r8+rsi]
	lea	rsi, [r10*4]
	call	string$from_utf32
	epilog
	calign
.utf8_unescape:
	; if our text is small enough, unescape on the stack
	; otherwise, do expensive buffer construction
	cmp	r10, xmltag_unescape_stacklimit
	ja	.utf8_unescape_buffer
	mov	rdx, r10
	push	rbx r12 r13 r14
	xor	r14d, r14d
	sub	rsp, xmltag_unescape_stacklimit
	lea	rsi, [r8+rsi]			; beginning of string
	sub	rdx, rcx			; bytes that precede the first ampersand
	mov	rdi, rsp
	mov	r12, rax			; pointer to first ampersand
	mov	r13, rcx			; bytes left
	lea	rbx, [rsp+rdx]
	call	memcpy
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	calign
.utf8_unescape_semicolon_scan:
	cmp	byte [rsi+rdx], ';'
	je	.utf8_unescape_semicolon
	add	rdx, 1
	sub	rcx, 1
	jnz	.utf8_unescape_semicolon_scan
	; if we made it to here, an ampersand was found but no trailing semicolon
	; despite this being an error, just return the remaining of the string as-is
	mov	rdi, rbx
	mov	rsi, r12
	mov	rdx, r13
	add	rbx, r13
	call	memcpy
.utf8_unescape_return:
	test	r14, r14
	jnz	.utf8_unescape_buffer_return
	; rbx - rsp is how many bytes we have accumulated
	mov	rdi, rsp
	mov	rsi, rbx
	sub	rsi, rsp
	call	string$from_utf8
	add	rsp, xmltag_unescape_stacklimit
	pop	r14 r13 r12 rbx
	epilog
	calign
.utf8_unescape_buffer_return:
	mov	rdi, r14
	mov	rsi, rbx
	sub	rsi, r14
	call	string$from_utf8
	mov	rdi, r14
	mov	r14, rax			; hang on to our return across the heap$free
	call	heap$free
	mov	rax, r14
	pop	r14 r13 r12 rbx
	epilog
	calign
.utf8_unescape_semicolon:
	; rdx is our length (distance from the ampersand at rsi to its semicolon)
	cmp	byte [rsi+1], '#'
	je	.utf8_unescape_codepoint
	cmp	rdx, 3
	je	.utf8_unescape_ltgt
	cmp	rdx, 4
	je	.utf8_unescape_ampersand
	cmp	rdx, 5
	jne	.utf8_unescape_straightin
	; quot, apos or nbsp check
	mov	eax, '"'
	cmp	dword [rsi+1], 'quot'
	je	.utf8_unescape_char
	mov	eax, 0x27
	cmp	dword [rsi+1], 'apos'
	je	.utf8_unescape_char
	mov	eax, 0xa0
	cmp	dword [rsi+1], 'nbsp'
	; 0xa0 is not a single byte in UTF8, so it has to go through the encoder
	je	.utf8_unescape_codepoint_encoded
.utf8_unescape_straightin:
	; add from r12 to r12+rdx+1
	mov	rdi, rbx
	mov	rsi, r12
	add	rdx, 1
	add	rbx, rdx
	add	r12, rdx
	sub	r13, rdx
	call	memcpy
	test	r13, r13
	jz	.utf8_unescape_return
	; find the next ampersand
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	calign
.utf8_unescape_amp_scan:
	cmp	byte [rsi+rdx], '&'
	je	.utf8_unescape_amp
	add	rdx, 1
	sub	rcx, 1
	jnz	.utf8_unescape_amp_scan
	; if we made it to here no ampersand was found
	test	rdx, rdx
	jz	.utf8_unescape_return
	mov	rdi, rbx
	mov	rsi, r12
	mov	rdx, r13
	add	rbx, r13
	call	memcpy
	jmp	.utf8_unescape_return
	calign
.utf8_unescape_codepoint:
	; word at [rsi] == '&#'
	; byte at [rsi+rdx] == ';'
	lea	rdi, [rsi+2]
	lea	rcx, [rdx-2]			; number of digit characters
	cmp	rdx, 2				; if it was &#; put it straight in as-is
	je	.utf8_unescape_straightin
	cmp	byte [rdi], 'x'
	je	.utf8_unescape_codepoint_base16
	xor	eax, eax
	calign
.utf8_unescape_codepoint_loop:
	movzx	r8d, byte [rdi]
	add	rdi, 1
	sub	r8d, '0'
	cmp	r8d, 10
	jae	.utf8_unescape_straightin
	imul	eax, eax, 10
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf8_unescape_codepoint_loop
.utf8_unescape_codepoint_ready:
	; only 0..127 are single bytes in UTF8, everything else needs encoding
	cmp	eax, 128
	jae	.utf8_unescape_codepoint_encoded
	mov	byte [rbx], al
	add	rbx, 1
	add	rdx, 1
	add	r12, rdx
	sub	r13, rdx
	jz	.utf8_unescape_return
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf8_unescape_amp_scan
	calign
.utf8_unescape_codepoint_encoded:
	; we can cheat here and create a string object below our current stackframe:
	; NOTE(review): this relies on string$to_utf8 not using enough stack of its
	; own to reach [rsp-64] before it has read the character, confirm
	mov	qword [rsp-64], 1
	mov	qword [rsp-56], rax
	lea	rdi, [rsp-64]
	mov	rsi, rbx
	; setup values for next round:
	add	rdx, 1
	add	r12, rdx
	sub	r13, rdx
	call	string$to_utf8
	add	rbx, rax
	test	r13, r13
	jz	.utf8_unescape_return
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf8_unescape_amp_scan
	calign
.utf8_unescape_codepoint_base16:
	add	rdi, 1
	sub	rcx, 1
	jz	.utf8_unescape_straightin	; &#x; goes straight in as-is
	xor	eax, eax
	calign
.utf8_unescape_codepoint_base16_loop:
	movzx	r8d, byte [rdi]
	add	rdi, 1
	cmp	r8d, 'a'
	jae	.utf8_unescape_codepoint_base16_lc
	cmp	r8d, 'A'
	jae	.utf8_unescape_codepoint_base16_uc
	sub	r8d, '0'
	cmp	r8d, 10				; 0..9 only, ':' through '@' are not hex digits
	jae	.utf8_unescape_straightin
	imul	eax, eax, 16
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf8_unescape_codepoint_base16_loop
	jmp	.utf8_unescape_codepoint_ready
	calign
.utf8_unescape_codepoint_base16_uc:
	sub	r8d, '0'+7
	cmp	r8d, 16
	jae	.utf8_unescape_straightin
	imul	eax, eax, 16
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf8_unescape_codepoint_base16_loop
	jmp	.utf8_unescape_codepoint_ready
	calign
.utf8_unescape_codepoint_base16_lc:
	sub	r8d, '0'+7+0x20
	cmp	r8d, 16
	jae	.utf8_unescape_straightin
	imul	eax, eax, 16
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf8_unescape_codepoint_base16_loop
	jmp	.utf8_unescape_codepoint_ready
	calign
.utf8_unescape_ampersand:
	; length 4, so the semicolon is already known to sit at [rsi+4]
	mov	eax, '&'
	cmp	dword [rsi], '&amp'
	je	.utf8_unescape_char
	jmp	.utf8_unescape_straightin
	calign
.utf8_unescape_ltgt:
	; length 3, so the dword at rsi includes the semicolon
	mov	eax, '<'
	cmp	dword [rsi], '&lt;'
	je	.utf8_unescape_char
	mov	eax, '>'
	cmp	dword [rsi], '&gt;'
	jne	.utf8_unescape_straightin
	; fall through to utf8_unescape_char
	calign
.utf8_unescape_char:
	mov	[rbx], al
	add	rdx, 1
	add	rbx, 1
	add	r12, rdx
	sub	r13, rdx
	jz	.utf8_unescape_return
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf8_unescape_amp_scan
	calign
.utf8_unescape_amp:
	; if rdx is 0, no memcpy
	test	rdx, rdx
	jz	.utf8_unescape_amp_nocopy
	; otherwise, rdx bytes go straight in
	mov	rdi, rbx
	mov	rsi, r12
	add	rbx, rdx
	add	r12, rdx
	sub	r13, rdx
	call	memcpy
.utf8_unescape_amp_nocopy:
	; so now the byte at r12 is an ampersand
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf8_unescape_semicolon_scan
	calign
.utf8_unescape_buffer:
	; string is too big to do on the stack, so create a temporary buffer for it
	mov	rdx, r10
	push	rbx r12 r13 r14
	lea	rsi, [r8+rsi]			; beginning of string
	sub	rdx, rcx			; bytes that precede the first ampersand
	mov	r12, rax			; pointer to first ampersand
	mov	r13, rcx			; bytes left
	push	rsi rdx
	mov	rdi, r10
	call	heap$alloc
	mov	r14, rax
	pop	rdx rsi
	mov	rdi, rax
	lea	rbx, [r14+rdx]
	call	memcpy
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf8_unescape_semicolon_scan
	calign
.utf16_unescape:
	; if our text is small enough, unescape on the stack
	; otherwise, do expensive buffer construction
	cmp	r10, xmltag_unescape_stacklimit
	ja	.utf16_unescape_buffer
	mov	rdx, r10
	push	rbx r12 r13 r14
	xor	r14d, r14d
	sub	rsp, xmltag_unescape_stacklimit shl 1
	lea	rsi, [r8+rsi]			; beginning of string
	sub	rdx, rcx			; characters that precede the first ampersand
	mov	rdi, rsp
	mov	r12, rax			; pointer to first ampersand
	mov	r13, rcx			; characters left
	lea	rbx, [rsp+rdx*2]
	shl	rdx, 1
	call	memcpy
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	calign
.utf16_unescape_semicolon_scan:
	cmp	word [rsi+rdx*2], ';'
	je	.utf16_unescape_semicolon
	add	rdx, 1
	sub	rcx, 1
	jnz	.utf16_unescape_semicolon_scan
	; if we made it to here, an ampersand was found but no trailing semicolon
	; despite this being an error, just return the remaining of the string as-is
	; r13 is the number of characters left not bytes:
	shl	r13, 1
	mov	rdi, rbx
	mov	rsi, r12
	mov	rdx, r13
	add	rbx, r13
	call	memcpy
.utf16_unescape_return:
	test	r14, r14
	jnz	.utf16_unescape_buffer_return
	; rbx - rsp is how many bytes we have accumulated
	mov	rdi, rsp
	mov	rsi, rbx
	sub	rsi, rsp
	call	string$from_utf16
	add	rsp, xmltag_unescape_stacklimit shl 1
	pop	r14 r13 r12 rbx
	epilog
	calign
.utf16_unescape_buffer_return:
	mov	rdi, r14
	mov	rsi, rbx
	sub	rsi, r14
	call	string$from_utf16
	mov	rdi, r14
	mov	r14, rax			; hang on to our return across the heap$free
	call	heap$free
	mov	rax, r14
	pop	r14 r13 r12 rbx
	epilog
	dalign
.qquot:
	dw	'q', 'u', 'o', 't'
.qapos:
	dw	'a', 'p', 'o', 's'
.qnbsp:
	dw	'n', 'b', 's', 'p'
	calign
.utf16_unescape_semicolon:
	; rdx is our length (distance from the ampersand at rsi to its semicolon)
	cmp	word [rsi+2], '#'
	je	.utf16_unescape_codepoint
	cmp	rdx, 3
	je	.utf16_unescape_ltgt
	cmp	rdx, 4
	je	.utf16_unescape_ampersand
	cmp	rdx, 5
	jne	.utf16_unescape_straightin
	; quot, apos or nbsp check
	mov	eax, '"'
	mov	r8, [rsi+2]			; the four characters after the ampersand
	cmp	r8, [.qquot]
	je	.utf16_unescape_char
	mov	eax, 0x27
	cmp	r8, [.qapos]
	je	.utf16_unescape_char
	mov	eax, 0xa0
	cmp	r8, [.qnbsp]
	je	.utf16_unescape_char
.utf16_unescape_straightin:
	; add from r12 to r12+rdx+1
	mov	rdi, rbx
	mov	rsi, r12
	add	rdx, 1
	sub	r13, rdx
	shl	rdx, 1
	add	rbx, rdx
	add	r12, rdx
	call	memcpy
	test	r13, r13
	jz	.utf16_unescape_return
	; find the next ampersand
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	calign
.utf16_unescape_amp_scan:
	cmp	word [rsi+rdx*2], '&'
	je	.utf16_unescape_amp
	add	rdx, 1
	sub	rcx, 1
	jnz	.utf16_unescape_amp_scan
	; if we made it to here no ampersand was found
	test	rdx, rdx
	jz	.utf16_unescape_return
	; r13 is number of characters left:
	shl	r13, 1
	mov	rdi, rbx
	mov	rsi, r12
	mov	rdx, r13
	add	rbx, r13
	call	memcpy
	jmp	.utf16_unescape_return
	calign
.utf16_unescape_codepoint:
	; dword at [rsi] == '&#'
	; word at [rsi+rdx*2] == ';'
	lea	rdi, [rsi+4]
	lea	rcx, [rdx-2]			; number of digit characters
	cmp	rdx, 2				; if it was &#; put it straight in as-is
	je	.utf16_unescape_straightin
	cmp	word [rdi], 'x'
	je	.utf16_unescape_codepoint_base16
	xor	eax, eax
	calign
.utf16_unescape_codepoint_loop:
	movzx	r8d, word [rdi]
	add	rdi, 2
	sub	r8d, '0'
	cmp	r8d, 10
	jae	.utf16_unescape_straightin
	imul	eax, eax, 10
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf16_unescape_codepoint_loop
.utf16_unescape_codepoint_ready:
	; anything below the surrogate range is a single UTF16 unit
	cmp	eax, 0xd800
	jae	.utf16_unescape_codepoint_encoded
	mov	word [rbx], ax
	add	rbx, 2
	add	rdx, 1
	sub	r13, rdx
	shl	rdx, 1
	add	r12, rdx
	test	r13, r13
	jz	.utf16_unescape_return
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf16_unescape_amp_scan
	calign
.utf16_unescape_codepoint_encoded:
	; we can cheat here and create a string object below our current stackframe:
	; NOTE(review): this relies on string$to_utf16 not using enough stack of its
	; own to reach [rsp-64] before it has read the character, confirm
	mov	qword [rsp-64], 1
	mov	qword [rsp-56], rax
	lea	rdi, [rsp-64]
	mov	rsi, rbx
	; setup values for next round:
	add	rdx, 1
	sub	r13, rdx
	shl	rdx, 1
	add	r12, rdx
	call	string$to_utf16
	shl	rax, 1
	add	rbx, rax
	test	r13, r13
	jz	.utf16_unescape_return
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf16_unescape_amp_scan
	calign
.utf16_unescape_codepoint_base16:
	add	rdi, 2
	sub	rcx, 1
	jz	.utf16_unescape_straightin	; &#x; goes straight in as-is
	xor	eax, eax
	calign
.utf16_unescape_codepoint_base16_loop:
	movzx	r8d, word [rdi]
	add	rdi, 2
	cmp	r8d, 'a'
	jae	.utf16_unescape_codepoint_base16_lc
	cmp	r8d, 'A'
	jae	.utf16_unescape_codepoint_base16_uc
	sub	r8d, '0'
	cmp	r8d, 10				; 0..9 only, ':' through '@' are not hex digits
	jae	.utf16_unescape_straightin
	imul	eax, eax, 16
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf16_unescape_codepoint_base16_loop
	jmp	.utf16_unescape_codepoint_ready
	calign
.utf16_unescape_codepoint_base16_uc:
	sub	r8d, '0'+7
	cmp	r8d, 16
	jae	.utf16_unescape_straightin
	imul	eax, eax, 16
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf16_unescape_codepoint_base16_loop
	jmp	.utf16_unescape_codepoint_ready
	calign
.utf16_unescape_codepoint_base16_lc:
	sub	r8d, '0'+7+0x20
	cmp	r8d, 16
	jae	.utf16_unescape_straightin
	imul	eax, eax, 16
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf16_unescape_codepoint_base16_loop
	jmp	.utf16_unescape_codepoint_ready
	dalign
.qamp:
	dw	'&', 'a', 'm', 'p'
.qlt:
	dw	'&', 'l', 't', ';'
.qgt:
	dw	'&', 'g', 't', ';'
	calign
.utf16_unescape_ampersand:
	mov	r8, [.qamp]
	mov	eax, '&'
	cmp	r8, [rsi]
	je	.utf16_unescape_char
	jmp	.utf16_unescape_straightin
	calign
.utf16_unescape_ltgt:
	mov	r8, [rsi]
	mov	eax, '<'
	cmp	r8, [.qlt]
	je	.utf16_unescape_char
	mov	eax, '>'
	cmp	r8, [.qgt]
	jne	.utf16_unescape_straightin
	; fall through to utf16_unescape_char
	calign
.utf16_unescape_char:
	mov	[rbx], ax
	add	rdx, 1
	sub	r13, rdx
	shl	rdx, 1
	add	rbx, 2
	add	r12, rdx
	test	r13, r13
	jz	.utf16_unescape_return
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf16_unescape_amp_scan
	calign
.utf16_unescape_amp:
	; if rdx is 0, no memcpy
	test	rdx, rdx
	jz	.utf16_unescape_amp_nocopy
	; otherwise, rdx characters go straight in
	mov	rdi, rbx
	mov	rsi, r12
	sub	r13, rdx
	shl	rdx, 1
	add	rbx, rdx
	add	r12, rdx
	call	memcpy
.utf16_unescape_amp_nocopy:
	; so now the word at r12 is an ampersand
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf16_unescape_semicolon_scan
	calign
.utf16_unescape_buffer:
	; string is too big to do on the stack, so create a temporary buffer for it
	mov	rdx, r10
	push	rbx r12 r13 r14
	lea	rsi, [r8+rsi]			; beginning of string
	sub	rdx, rcx			; characters that precede the first ampersand
	mov	r12, rax			; pointer to first ampersand
	mov	r13, rcx			; characters left
	push	rsi rdx
	mov	rdi, r10
	shl	rdi, 1
	call	heap$alloc
	mov	r14, rax
	pop	rdx rsi
	mov	rdi, rax
	lea	rbx, [r14+rdx*2]		; rdx is in characters here, same as the stack path
	shl	rdx, 1
	call	memcpy
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf16_unescape_semicolon_scan
	calign
.utf32_unescape:
	; if our text is small enough, unescape on the stack
	; otherwise, do expensive buffer construction
	cmp	r10, xmltag_unescape_stacklimit
	ja	.utf32_unescape_buffer
	mov	rdx, r10
	push	rbx r12 r13 r14
	xor	r14d, r14d
	sub	rsp, xmltag_unescape_stacklimit shl 2
	lea	rsi, [r8+rsi]			; beginning of string
	sub	rdx, rcx			; characters that precede the first ampersand
	mov	rdi, rsp
	mov	r12, rax			; pointer to first ampersand
	mov	r13, rcx			; characters left
	lea	rbx, [rsp+rdx*4]
	shl	rdx, 2
	call	memcpy
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	calign
.utf32_unescape_semicolon_scan:
	cmp	dword [rsi+rdx*4], ';'
	je	.utf32_unescape_semicolon
	add	rdx, 1
	sub	rcx, 1
	jnz	.utf32_unescape_semicolon_scan
	; if we made it to here, an ampersand was found but no trailing semicolon
	; despite this being an error, just return the remaining of the string as-is
	; r13 is the number of characters left not bytes:
	shl	r13, 2
	mov	rdi, rbx
	mov	rsi, r12
	mov	rdx, r13
	add	rbx, r13
	call	memcpy
.utf32_unescape_return:
	test	r14, r14
	jnz	.utf32_unescape_buffer_return
	; rbx - rsp is how many bytes we have accumulated
	mov	rdi, rsp
	mov	rsi, rbx
	sub	rsi, rsp
	call	string$from_utf32
	add	rsp, xmltag_unescape_stacklimit shl 2
	pop	r14 r13 r12 rbx
	epilog
	calign
.utf32_unescape_buffer_return:
	mov	rdi, r14
	mov	rsi, rbx
	sub	rsi, r14
	call	string$from_utf32
	mov	rdi, r14
	mov	r14, rax			; hang on to our return across the heap$free
	call	heap$free
	mov	rax, r14
	pop	r14 r13 r12 rbx
	epilog
	dalign
.dqquot:
	dd	'q', 'u', 'o', 't'
.dqapos:
	dd	'a', 'p', 'o', 's'
.dqnbsp:
	dd	'n', 'b', 's', 'p'
	calign
.utf32_unescape_maybequot:
	; first two characters matched, r9 holds the third and fourth
	cmp	r9, [.dqquot+8]
	jne	.utf32_unescape_notquot
	jmp	.utf32_unescape_char
	calign
.utf32_unescape_maybeapos:
	cmp	r9, [.dqapos+8]
	jne	.utf32_unescape_notapos
	jmp	.utf32_unescape_char
	calign
.utf32_unescape_maybenbsp:
	cmp	r9, [.dqnbsp+8]
	jne	.utf32_unescape_straightin
	jmp	.utf32_unescape_char
	calign
.utf32_unescape_semicolon:
	; rdx is our length (distance from the ampersand at rsi to its semicolon)
	cmp	dword [rsi+4], '#'
	je	.utf32_unescape_codepoint
	cmp	rdx, 3
	je	.utf32_unescape_ltgt
	cmp	rdx, 4
	je	.utf32_unescape_ampersand
	cmp	rdx, 5
	jne	.utf32_unescape_straightin
	; quot, apos or nbsp check
	mov	eax, '"'
	mov	r8, [rsi+4]			; first and second character after the ampersand
	mov	r9, [rsi+12]			; third and fourth
	cmp	r8, [.dqquot]
	je	.utf32_unescape_maybequot
.utf32_unescape_notquot:
	mov	eax, 0x27
	cmp	r8, [.dqapos]
	je	.utf32_unescape_maybeapos
.utf32_unescape_notapos:
	mov	eax, 0xa0
	cmp	r8, [.dqnbsp]
	je	.utf32_unescape_maybenbsp
.utf32_unescape_straightin:
	; add from r12 to r12+rdx+1
	mov	rdi, rbx
	mov	rsi, r12
	add	rdx, 1
	sub	r13, rdx
	shl	rdx, 2
	add	rbx, rdx
	add	r12, rdx
	call	memcpy
	test	r13, r13
	jz	.utf32_unescape_return
	; find the next ampersand
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	calign
.utf32_unescape_amp_scan:
	cmp	dword [rsi+rdx*4], '&'
	je	.utf32_unescape_amp
	add	rdx, 1
	sub	rcx, 1
	jnz	.utf32_unescape_amp_scan
	; if we made it to here no ampersand was found
	test	rdx, rdx
	jz	.utf32_unescape_return
	; r13 is number of characters left:
	shl	r13, 2
	mov	rdi, rbx
	mov	rsi, r12
	mov	rdx, r13
	add	rbx, r13
	call	memcpy
	jmp	.utf32_unescape_return
	calign
.utf32_unescape_codepoint:
	; qword at [rsi] == '&#'
	; dword at [rsi+rdx*4] == ';'
	lea	rdi, [rsi+8]
	lea	rcx, [rdx-2]			; number of digit characters
	cmp	rdx, 2				; if it was &#; put it straight in as-is
	je	.utf32_unescape_straightin
	cmp	dword [rdi], 'x'
	je	.utf32_unescape_codepoint_base16
	xor	eax, eax
	calign
.utf32_unescape_codepoint_loop:
	mov	r8d, dword [rdi]
	add	rdi, 4
	sub	r8d, '0'
	cmp	r8d, 10
	jae	.utf32_unescape_straightin
	imul	eax, eax, 10
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf32_unescape_codepoint_loop
.utf32_unescape_codepoint_ready:
	mov	dword [rbx], eax
	add	rbx, 4
	add	rdx, 1
	sub	r13, rdx
	shl	rdx, 2
	add	r12, rdx
	test	r13, r13
	jz	.utf32_unescape_return
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf32_unescape_amp_scan
	calign
.utf32_unescape_codepoint_base16:
	add	rdi, 4
	sub	rcx, 1
	jz	.utf32_unescape_straightin	; &#x; goes straight in as-is
	xor	eax, eax
	calign
.utf32_unescape_codepoint_base16_loop:
	mov	r8d, dword [rdi]
	add	rdi, 4
	cmp	r8d, 'a'
	jae	.utf32_unescape_codepoint_base16_lc
	cmp	r8d, 'A'
	jae	.utf32_unescape_codepoint_base16_uc
	sub	r8d, '0'
	cmp	r8d, 10				; 0..9 only, ':' through '@' are not hex digits
	jae	.utf32_unescape_straightin
	imul	eax, eax, 16
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf32_unescape_codepoint_base16_loop
	jmp	.utf32_unescape_codepoint_ready
	calign
.utf32_unescape_codepoint_base16_uc:
	sub	r8d, '0'+7
	cmp	r8d, 16
	jae	.utf32_unescape_straightin
	imul	eax, eax, 16
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf32_unescape_codepoint_base16_loop
	jmp	.utf32_unescape_codepoint_ready
	calign
.utf32_unescape_codepoint_base16_lc:
	sub	r8d, '0'+7+0x20
	cmp	r8d, 16
	jae	.utf32_unescape_straightin
	imul	eax, eax, 16
	add	eax, r8d
	sub	rcx, 1
	jnz	.utf32_unescape_codepoint_base16_loop
	jmp	.utf32_unescape_codepoint_ready
	dalign
.dqamp:
	dd	'&', 'a', 'm', 'p'
.dqlt:
	dd	'&', 'l', 't', ';'
.dqgt:
	dd	'&', 'g', 't', ';'
	calign
.utf32_unescape_maybeamp:
	cmp	r9, [rsi+8]
	je	.utf32_unescape_char
	jmp	.utf32_unescape_straightin
	calign
.utf32_unescape_ampersand:
	mov	r8, [.dqamp]
	mov	r9, [.dqamp+8]
	mov	eax, '&'
	cmp	r8, [rsi]
	je	.utf32_unescape_maybeamp
	jmp	.utf32_unescape_straightin
	calign
.utf32_unescape_maybelt:
	cmp	r9, [.dqlt+8]
	je	.utf32_unescape_char
	jmp	.utf32_unescape_notlt
	calign
.utf32_unescape_maybegt:
	cmp	r9, [.dqgt+8]
	je	.utf32_unescape_char
	jmp	.utf32_unescape_straightin
	calign
.utf32_unescape_ltgt:
	mov	r8, [rsi]
	mov	r9, [rsi+8]
	mov	eax, '<'
	cmp	r8, [.dqlt]
	je	.utf32_unescape_maybelt
.utf32_unescape_notlt:
	mov	eax, '>'
	cmp	r8, [.dqgt]
	je	.utf32_unescape_maybegt
	jmp	.utf32_unescape_straightin
	calign
.utf32_unescape_char:
	mov	[rbx], eax
	add	rdx, 1
	sub	r13, rdx
	shl	rdx, 2
	add	rbx, 4
	add	r12, rdx
	test	r13, r13
	jz	.utf32_unescape_return
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf32_unescape_amp_scan
	calign
.utf32_unescape_amp:
	; if rdx is 0, no memcpy
	test	rdx, rdx
	jz	.utf32_unescape_amp_nocopy
	; otherwise, rdx characters go straight in
	mov	rdi, rbx
	mov	rsi, r12
	sub	r13, rdx
	shl	rdx, 2
	add	rbx, rdx
	add	r12, rdx
	call	memcpy
.utf32_unescape_amp_nocopy:
	; so now the dword at r12 is an ampersand
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf32_unescape_semicolon_scan
	calign
.utf32_unescape_buffer:
	; string is too big to do on the stack, so create a temporary buffer for it
	mov	rdx, r10
	push	rbx r12 r13 r14
	lea	rsi, [r8+rsi]			; beginning of string
	sub	rdx, rcx			; characters that precede the first ampersand
	mov	r12, rax			; pointer to first ampersand
	mov	r13, rcx			; characters left
	push	rsi rdx
	mov	rdi, r10
	shl	rdi, 2
	call	heap$alloc
	mov	r14, rax
	pop	rdx rsi
	mov	rdi, rax
	lea	rbx, [r14+rdx*4]		; rdx is in characters here, same as the stack path
	shl	rdx, 2
	call	memcpy
	mov	rsi, r12
	mov	rcx, r13
	xor	edx, edx
	jmp	.utf32_unescape_semicolon_scan
end if
if used xmltag$unescape_string | defined include_everything
; xmltag$unescape_string: entity-decode a native string
; single argument in rdi: string to unescape
; returns a new heap$alloc'd string of unescaped text
; NOTE: we cheat a bit here and let xmltag$unescape do the dirty work, by handing
; it a partial xmltag on the stack whose base is the string's character data and
; whose width matches string_bits, with offsets 0 .. string length
falign
xmltag$unescape_string:
	prolog	xmltag$unescape_string
	sub	rsp, xmltag_attributes_ofs		; we don't need a full xmltag for this
	mov	rdx, [rdi]				; end offset == character count
	lea	rcx, [rdi+8]				; base == first character
	mov	[rsp+xmltag_base_ofs], rcx
if string_bits = 32
	mov	dword [rsp+xmltag_width_ofs], xmlparser_utf32
else
	mov	dword [rsp+xmltag_width_ofs], xmlparser_utf16
end if
	xor	esi, esi				; start offset == 0
	mov	rdi, rsp
	call	xmltag$unescape
	add	rsp, xmltag_attributes_ofs
	epilog
end if
if used xmltag$text | defined include_everything
; xmltag$text: turn the tag's text span into a native string
; single argument in rdi: an xmltag object
; returns a new heap$alloc'd string of the text, possibly unescaped
; (only xmltag_textnode goes through xmltag$unescape, every other node type is
; converted as-is according to the tag's width; an empty span gives string$new)
falign
xmltag$text:
	prolog	xmltag$text
	mov	rdx, [rdi+xmltag_textstart_ofs]
	mov	rcx, [rdi+xmltag_textend_ofs]
	; start == end means empty no matter what kind of node we are
	cmp	rcx, rdx
	je	.emptystring
	cmp	dword [rdi+xmltag_nodetype_ofs], xmltag_textnode
	je	.escaped
	; raw conversion, pick the converter by character width
	mov	eax, [rdi+xmltag_width_ofs]
	mov	rsi, [rdi+xmltag_base_ofs]
	sub	rcx, rdx				; rcx == character count
	jmp	qword [rax*8+.raw_dispatch]
	dalign
.raw_dispatch:
	dq	.raw_utf8, .raw_utf16, .raw_utf32
	calign
.escaped:
	; rdi is still our xmltag, offsets move into rsi/rdx
	mov	rsi, rdx
	mov	rdx, rcx
	call	xmltag$unescape
	epilog
	calign
.emptystring:
	call	string$new
	epilog
	calign
.raw_utf8:
	lea	rdi, [rsi+rdx]
	mov	rsi, rcx
	call	string$from_utf8
	epilog
	calign
.raw_utf16:
	lea	rdi, [rsi+rdx*2]
	lea	rsi, [rcx*2]				; length goes across in bytes
	call	string$from_utf16
	epilog
	calign
.raw_utf32:
	lea	rdi, [rsi+rdx*4]
	lea	rsi, [rcx*4]				; length goes across in bytes
	call	string$from_utf32
	epilog
end if
if used xmltag$debug | defined include_everything
	; xmltag$debug: write a short human readable description of an xmltag to stdout
	; single argument in rdi: an xmltag object
	;
	; register roles for the whole function (all pushed on entry, popped before every epilog):
	;   rbx == base pointer of the xml text the tag refers to (xmltag_base_ofs)
	;   r12 == the xmltag object itself
	;   r13 == character width of the xml text (xmltag_width_ofs), used as a jump table index
	;
	; neither the nodetype nor the width is range checked before indexing its jump table
falign
xmltag$debug:
	prolog	xmltag$debug
	push	rbx r12 r13
	mov	r12, rdi
	mov	rbx, [rdi+xmltag_base_ofs]
	mov	r13d, [rdi+xmltag_width_ofs]
	mov	eax, [rdi+xmltag_nodetype_ofs]
	jmp	qword [rax*8+.nodetypes]
dalign
.nodetypes:
	dq	.notype, .element, .textnode, .cdata, .processinginstruction, .comment, .doctype, .xmldecl
	; every static string used below lives here, behind the jmp, so it is never executed
	cleartext .tagtype_notype, '(no type) (no output as a result)'
	cleartext .tagtype_element, 'Element: '
	cleartext .tagtype_textnode, 'Text: '
	cleartext .tagtype_cdata, 'CData: '
	cleartext .tagtype_processinginstruction, 'ProcessingInstruction: '
	cleartext .tagtype_comment, 'Comment: '
	cleartext .tagtype_doctype, 'DOCTYPE: '
	cleartext .tagtype_xmldecl, 'XMLDeclaration: '
	cleartext .space, ' '
	cleartext .empty, ' Empty? '
	cleartext .true, 'true'
	cleartext .false, 'false'
	cleartext .attrs, ' attrcount: '
	cleartext .equalquote, '="'
	cleartext .quote, '"'
	cleartext .lf, 10
calign
.notype:
	; nothing to show for an untyped tag, just say so (with a trailing newline)
	mov	rdi, .tagtype_notype
	call	string$to_stdoutln
	pop	r13 r12 rbx
	epilog
	; all of the non-element node types produce the same shape of output:
	; their prefix, then the tag's textstart..textend range, then a linefeed
calign
.textnode:
	mov	rdi, .tagtype_textnode
	jmp	.prefixed_text
calign
.cdata:
	mov	rdi, .tagtype_cdata
	jmp	.prefixed_text
calign
.processinginstruction:
	mov	rdi, .tagtype_processinginstruction
	jmp	.prefixed_text
calign
.comment:
	mov	rdi, .tagtype_comment
	jmp	.prefixed_text
calign
.doctype:
	mov	rdi, .tagtype_doctype
	jmp	.prefixed_text
calign
.xmldecl:
	mov	rdi, .tagtype_xmldecl
	; fallthrough to .prefixed_text
calign
.prefixed_text:
	; rdi == prefix string for this node type
	call	string$to_stdout
	mov	rdi, [r12+xmltag_textstart_ofs]
	mov	rsi, [r12+xmltag_textend_ofs]
	call	.textout
	; fallthrough to .newline_and_return
calign
.newline_and_return:
	mov	rdi, .lf
	call	string$to_stdout
	pop	r13 r12 rbx
	epilog
calign
.element:
	; prefix, then the textstart..textend range (presumably the element name)
	mov	rdi, .tagtype_element
	call	string$to_stdout
	mov	rdi, [r12+xmltag_textstart_ofs]
	mov	rsi, [r12+xmltag_textend_ofs]
	call	.textout
	; ' Empty? ' followed by true/false: false only when the empty dword is zero
	mov	rdi, .empty
	call	string$to_stdout
	mov	rdi, .false
	mov	rsi, .true
	cmp	dword [r12+xmltag_empty_ofs], 0
	cmovne	rdi, rsi
	call	string$to_stdout
	; ' attrcount: ' followed by the count in base 10
	mov	rdi, .attrs
	call	string$to_stdout
	mov	edi, [r12+xmltag_attrcount_ofs]
	mov	esi, 10
	call	string$from_unsigned
	call	.print_and_free
	cmp	dword [r12+xmltag_attrcount_ofs], 0
	je	.newline_and_return
	; r14d walks the attribute indexes 0..attrcount-1, saved/restored around the loop only
	push	r14
	xor	r14d, r14d
calign
.next_attr:
	; each attribute is emitted as: space name="value"
	mov	rdi, .space
	call	string$to_stdout
	mov	esi, r14d
	mov	rdi, r12
	call	xmltag$getattr
	; the attribute pointer has to survive the two calls needed to print its name
	push	rax
	mov	rsi, [rax+xmltagattr_nameend_ofs]
	mov	rdi, [rax+xmltagattr_namestart_ofs]
	call	.textout
	mov	rdi, .equalquote
	call	string$to_stdout
	pop	rax
	mov	rsi, [rax+xmltagattr_valueend_ofs]
	mov	rdi, [rax+xmltagattr_valuestart_ofs]
	call	.textout
	mov	rdi, .quote
	call	string$to_stdout
	add	r14d, 1
	cmp	r14d, dword [r12+xmltag_attrcount_ofs]
	jne	.next_attr
	pop	r14
	jmp	.newline_and_return
	; local helper: print a range of the underlying xml text
	; rdi == start offset, rsi == end offset, both in characters relative to rbx
	; relies on rbx (base) and r13 (width) as set up at function entry
falign
.textout:
	; rsi becomes the character count, common to all three widths
	sub	rsi, rdi
	jmp	qword [r13*8+.widths]
dalign
.widths:
	dq	.text8, .text16, .text32
calign
.text8:
	add	rdi, rbx
	call	string$from_utf8
	jmp	.print_and_free
calign
.text16:
	; two bytes per character: scale both the start offset and the count
	lea	rdi, [rbx+rdi*2]
	add	rsi, rsi
	call	string$from_utf16
	jmp	.print_and_free
calign
.text32:
	; four bytes per character: scale both the start offset and the count
	lea	rdi, [rbx+rdi*4]
	shl	rsi, 2
	call	string$from_utf32
	; fallthrough to .print_and_free
calign
.print_and_free:
	; local helper: rax == temporary string, write it to stdout then heap$free it
	push	rax
	mov	rdi, rax
	call	string$to_stdout
	pop	rdi
	call	heap$free
	ret
end if
if used xmlparser$new | used xmlparser$new_string | used xmlparser$init | used xmlparser$init_string | defined include_everything
; The symbols in this section are only assembled when at least one of the
; xmlparser constructors/initialisers is used (or include_everything is defined).
;
; defines for the character width (as passed to xmlparser$new)
; NOTE: these are used directly as indexes into 8-byte-entry jump tables
; (utf8, utf16, utf32 in that order), so the values 0, 1, 2 must not change.
xmlparser_utf8 = 0
xmlparser_utf16 = 1
xmlparser_utf32 = 2
; defines for flags (as passed to xmlparser$new)
; these are bit values stored in the dword at xmlparser_flags_ofs.
; xmlparser_ignorewhite is tested by xmlparser$next, which then skips leading
; 32, 9, 10 and 13 characters before looking at the next node.
; NOTE(review): the effect of xmlparser_condensewhite is not visible in this
; part of the file -- confirm against the code that tests it.
xmlparser_ignorewhite = 1
xmlparser_condensewhite = 2
; defines for the return of xmlparser$next and xmlparser$prev
; these are consecutive from 0 and double as the index into the
; xmlparser$errortext string table, so the two must be kept in step.
xmlparser_noerror = 0
xmlparser_endofdocument = 1
xmlparser_unterminatedcdatasection = 2
xmlparser_unterminatedxmldeclaration = 3
xmlparser_unterminateddoctypedeclaration = 4
xmlparser_unterminatedcomment = 5
xmlparser_malformedelement = 6
xmlparser_unterminatedattributevalue = 7
xmlparser_unterminatedelement = 8
xmlparser_unterminatedprocessinginstruction = 9
xmlparser_tagattributeoverflow = 10
xmlparser_badqname = 11
xmlparser_prefixnotbound = 12
xmlparser_duplicateattribute = 13
; xmlparser object itself:
; four qwords followed by two dwords, xmlparser_size bytes in total.
; the constructors/initialisers set pos to 0 and both size and end to the
; supplied length.
xmlparser_base_ofs = 0 ; pointer to xml
xmlparser_size_ofs = 8 ; size in characters (not bytes)
xmlparser_pos_ofs = 16 ; our current position (in characters not bytes)
xmlparser_end_ofs = 24 ; our end (might be size, or less if whitespace trunc, etc)
xmlparser_flags_ofs = 32 ; dword flags
xmlparser_width_ofs = 36 ; dword character width
xmlparser_size = 40
end if
if used xmlparser$new | defined include_everything
	; xmlparser$new: heap$alloc an xmlparser object and aim it at the caller's xml
	; four arguments: rdi == ptr to xml, rsi == length (in characters) of same, edx == character width, ecx == flags
	; returns new xmlparser object in rax
	; only the pointer is stored, the xml itself is not copied
falign
xmlparser$new:
	prolog	xmlparser$new
	; all four arguments have to survive the heap$alloc call
	push	rcx rdx rsi rdi
	mov	edi, xmlparser_size
	call	heap$alloc
	pop	rdi rsi rdx rcx
	xor	r8d, r8d
	; fill in the object: position starts at 0, size and end both start at the full length
	mov	[rax+xmlparser_width_ofs], edx
	mov	[rax+xmlparser_flags_ofs], ecx
	mov	[rax+xmlparser_end_ofs], rsi
	mov	[rax+xmlparser_pos_ofs], r8
	mov	[rax+xmlparser_size_ofs], rsi
	mov	[rax+xmlparser_base_ofs], rdi
	epilog
end if
if used xmlparser$new_string | defined include_everything
	; xmlparser$new_string: heap$alloc an xmlparser object that parses a native library string
	; two arguments: rdi == string (native library string), esi == flags
	; returns new xmlparser object in rax
	; the string's characters are referenced in place (not copied): the qword at [rdi]
	; is taken as the length in characters and the text is taken to start at rdi+8
falign
xmlparser$new_string:
	prolog	xmlparser$new_string
	; both arguments have to survive the heap$alloc call
	push	rsi rdi
	mov	edi, xmlparser_size
	call	heap$alloc
	pop	rdi rsi
	xor	r8d, r8d
	lea	rdx, [rdi+8]
	mov	rcx, [rdi]
	; the character width follows however the library's strings were built
	if string_bits = 32
		mov	dword [rax+xmlparser_width_ofs], xmlparser_utf32
	else
		mov	dword [rax+xmlparser_width_ofs], xmlparser_utf16
	end if
	; position starts at 0, size and end both start at the full length
	mov	[rax+xmlparser_flags_ofs], esi
	mov	[rax+xmlparser_end_ofs], rcx
	mov	[rax+xmlparser_pos_ofs], r8
	mov	[rax+xmlparser_size_ofs], rcx
	mov	[rax+xmlparser_base_ofs], rdx
	epilog
end if
if used xmlparser$init | defined include_everything
	; xmlparser$init: same as xmlparser$new, but fills in an xmlparser object supplied by the caller
	; five arguments: rdi == xmlparser object to init, rsi == ptr to xml, rdx == length (in chars) of same, ecx == char width, r8d == flags
	; returns the same xmlparser object in rax
	; nothing is allocated and the xml itself is not copied
falign
xmlparser$init:
	prolog	xmlparser$init
	xor	r9d, r9d
	; the two dwords first, then the qwords from the top of the object down
	mov	[rdi+xmlparser_width_ofs], ecx
	mov	[rdi+xmlparser_flags_ofs], r8d
	mov	[rdi+xmlparser_end_ofs], rdx
	mov	[rdi+xmlparser_pos_ofs], r9		; parsing starts at character 0
	mov	[rdi+xmlparser_size_ofs], rdx
	mov	[rdi+xmlparser_base_ofs], rsi
	mov	rax, rdi
	epilog
end if
if used xmlparser$init_string | defined include_everything
	; xmlparser$init_string: same as xmlparser$new_string, but fills in an xmlparser object supplied by the caller
	; three arguments: rdi == xmlparser object, rsi == string (native library string), edx == flags
	; returns the same xmlparser object in rax
	; the string's characters are referenced in place (not copied): the qword at [rsi]
	; is taken as the length in characters and the text is taken to start at rsi+8
falign
xmlparser$init_string:
	prolog	xmlparser$init_string
	mov	r8, [rsi]
	lea	rcx, [rsi+8]
	xor	r9d, r9d
	; the character width follows however the library's strings were built
	if string_bits = 32
		mov	dword [rdi+xmlparser_width_ofs], xmlparser_utf32
	else
		mov	dword [rdi+xmlparser_width_ofs], xmlparser_utf16
	end if
	mov	[rdi+xmlparser_flags_ofs], edx
	mov	[rdi+xmlparser_end_ofs], r8
	mov	[rdi+xmlparser_pos_ofs], r9		; parsing starts at character 0
	mov	[rdi+xmlparser_size_ofs], r8
	mov	[rdi+xmlparser_base_ofs], rcx
	mov	rax, rdi
	epilog
end if
if used xmlparser$errortext | defined include_everything
	; xmlparser$errortext: translate an xmlparser$next/xmlparser$prev return value into text
	; single argument in edi: one of the xmlparser_ numeric return values
	; returns a static string (not heap$alloc'd) in rax, the caller must not free it
	;
	; only edi is looked at: the upper half of rdi is ignored, so a caller that
	; loaded just the 32 bit register gets the right answer. Values that are not
	; in the table (including negative ones) return 'Unknown Error' instead of
	; reading past the end of the table.
falign
xmlparser$errortext:
	prolog	xmlparser$errortext
	; writing eax zero-extends into rax, giving us a clean 64 bit table index
	mov	eax, edi
	; unsigned compare against the number of table entries, so negatives land in .unknown too
	cmp	eax, (.dispatch_end - .dispatch) / 8
	jae	.unknown
	mov	rax, [rax*8+.dispatch]
	epilog
calign
.unknown:
	mov	rax, .eunknown
	epilog
	cleartext .e0, 'No Error'
	cleartext .e1, 'End of Document'
	cleartext .e2, 'Unterminated CDATA Section'
	cleartext .e3, 'Unterminated XML Declaration'
	cleartext .e4, 'Unterminated DOCTYPE Declaration'
	cleartext .e5, 'Unterminated Comment'
	cleartext .e6, 'Malformed Element'
	cleartext .e7, 'Unterminated Attribute Value'
	cleartext .e8, 'Unterminated Element'
	cleartext .e9, 'Unterminated Processing Instruction'
	cleartext .e10, 'Tag Attribute Count Overflow'
	cleartext .e11, 'Bad QName'
	cleartext .e12, 'Prefix Not Bound'
	cleartext .e13, 'Duplicate Attribute'
	cleartext .eunknown, 'Unknown Error'
	; one entry per xmlparser_ return value, in numeric order
dalign
.dispatch:
	dq	.e0, .e1, .e2, .e3, .e4, .e5, .e6, .e7, .e8, .e9, .e10, .e11, .e12, .e13
.dispatch_end:
end if
if used xmlparser$next | defined include_everything
; two arguments: rdi == xmlparser object, rsi == xmltag object (we'll call reset on it first)
; returns one of the above xmlparser_ values in eax
falign
xmlparser$next:
prolog xmlparser$next
mov eax, [rdi+xmlparser_width_ofs]
mov ecx, [rdi+xmlparser_flags_ofs]
push rbx rdi
mov rbx, rsi
mov rsi, [rdi+xmlparser_pos_ofs]
mov r10, [rdi+xmlparser_end_ofs]
mov rdi, [rdi+xmlparser_base_ofs]
; copy the base and width to the tag itself:
sub r10, rsi
mov [rbx+xmltag_base_ofs], rdi
mov dword [rbx+xmltag_width_ofs], eax
jmp qword [rax*8+.dispatch]
dalign
.dispatch:
dq .utf8, .utf16, .utf32
falign
.utf8:
test r10, r10
jz .utf8_endofdoc
test ecx, xmlparser_ignorewhite
jz .utf8_noskipwhite
; otherwise, skip whitespace and check for eod
calign
.utf8_skipwhite:
movzx ecx, byte [rdi+rsi]
mov r8d, 1
cmp ecx, 32
ja .utf8_noskipwhite
sub ecx, 1
shl r8d, cl
test r8d, 2147488512
jz .utf8_noskipwhite
; otherwise, we hit a 32, 9, 10, or 13
add rsi, 1
sub r10, 1
jnz .utf8_skipwhite
; fallthrough to end of document
calign
.utf8_endofdoc:
; save our position
pop rdi rbx
mov [rdi+xmlparser_pos_ofs], rsi
mov eax, xmlparser_endofdocument
epilog
calign
.utf8_noskipwhite:
; reset our xmltag (no need to call xmltag$reset, inline here is fine)
xor ecx, ecx
mov edx, -1
movzx eax, byte [rdi+rsi]
mov [rbx+xmltag_textstart_ofs], rcx
mov [rbx+xmltag_textend_ofs], rcx
mov [rbx+xmltag_textcolon_ofs], rdx ; writes 0 into empty too
cmp eax, '<'
mov [rbx+xmltag_nodetype_ofs], rcx ; writes over attrcount too
mov [rbx+xmltag_realstart_ofs], rsi
jne .utf8_textnode
cmp r10, 6
jb .utf8_notcdata
cmp dword [rdi+rsi], '
je .utf8_maybexmldecl
.utf8_notxmldecl:
cmp dword [rdi+rsi], '
je .utf8_maybedoctype
.utf8_notdoctype:
cmp dword [rdi+rsi], '
je .utf8_maybecdata
.utf8_notcdata:
cmp r10, 2
jb .utf8_element
cmp word [rdi+rsi], ''
je .utf8_pi
cmp r10, 4
jb .utf8_element
cmp dword [rdi+rsi], '