HeavyThing - cleartext.inc

Jeff Marrison

	; ------------------------------------------------------------------------
	; HeavyThing x86_64 assembly language library and showcase programs
	; Copyright © 2015-2018 2 Ton Digital 
	; Homepage: https://2ton.com.au/
	; Author: Jeff Marrison <jeff@2ton.com.au>
	;       
	; This file is part of the HeavyThing library.
	;       
	; HeavyThing is free software: you can redistribute it and/or modify
	; it under the terms of the GNU General Public License, or
	; (at your option) any later version.
	;       
	; HeavyThing is distributed in the hope that it will be useful, 
	; but WITHOUT ANY WARRANTY; without even the implied warranty of
	; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
	; GNU General Public License for more details.
	;       
	; You should have received a copy of the GNU General Public License along
	; with the HeavyThing library. If not, see <http://www.gnu.org/licenses/>.
	; ------------------------------------------------------------------------
	;       
	; cleartext.inc: static string encoding macros
	;
	; v1.22+ update: This now correctly deals with UTF8 input in fasm source.
	;    Versions prior to this one simply promoted the string (much like
	;    fasm does with a du 'string' declaration) to either 16 bit or 32 bit
	;    characters.
	;
	;    This version validates and correctly interprets UTF8.
	;    NOTE: Requires little-endian byte order (see load ..ch lines below).

; cleartext name, val [, val ...]
;
; Emits a static, length-prefixed wide string at the current position:
;
;	name:	dq	<number of code units>
;		<code units, 2 bytes each, or 4 bytes each when string_bits = 32>
;
; The val arguments are handed to db as-is inside a virtual block, and the
; resulting bytes are interpreted as UTF8.
;
; Pass 1 validates the UTF8 and counts the code units that will be emitted.
; Assembly is aborted with an "invalid UTF8" message on:
;	- a continuation byte (0x80..0xBF) where a lead byte is expected
;	- a truncated multi-byte sequence
;	- a continuation byte that is not of the form 10xxxxxx
;	- a lead byte of 0xF8 or above
;	- an overlong encoding
;	- a three-byte sequence that decodes to a surrogate (U+D800..U+DFFF)
;	- a four-byte sequence that decodes above U+10FFFF
; Pass 2 decodes again (no validation needed) and stores each code unit into
; the space reserved after the length.  When string_bits is not 32, codepoints
; of U+10000 and above are written as a surrogate pair (two 16 bit units).
;
; string_bits and dalign are defined elsewhere in the library; dalign is
; invoked immediately before the name label (presumably for data alignment).
macro cleartext name*, [val*] {
common
	local ..str,..sz,..ch,..ci,..cj,..cc,..ign,..dat,..idx
	; assemble the raw source bytes into a virtual block so they can be
	; inspected with load without being emitted into the output
	virtual at 0
		..str:: db val
		..sz = $
	end virtual
	; ..cc  = number of output code units
	; ..ign = number of upcoming bytes to skip (tail of the current sequence)
	..cc = 0
	..ign = 0
	; first step: count how many codepoints there are and validate the UTF8
	repeat ..sz
		if ..ign > 0
			..ign = ..ign - 1
		else
			; fetch up to four bytes starting at offset %-1, without reading
			; past the end of the virtual block; byte 0 of ..ch is the lead
			if %+3 <= ..sz
				load ..ch dword from ..str:(%-1)
			else if %+2 <= ..sz
				load ..ch word from ..str:(%-1)
				load ..ci byte from ..str:(%+1)
				..ch = ..ch or (..ci shl 16)
			else if %+1 <= ..sz
				load ..ch word from ..str:(%-1)
			else
				load ..ch byte from ..str:(%-1)
			end if
			..ci = ..ch
			..cj = ..ch
			; classify by the upper nibble of the lead byte
			..ch = (..ch and 0xf0) shr 4
			if ..ch < 8
				; ascii
				..cj = ..ci and 0xff
				; codepoint is in ..cj
				..cc = ..cc + 1
			else if ..ch < 12
				; continuation byte in lead position
				display 'invalid UTF8',10
				err
			else if ..ch < 14
				; two-byte sequence (lead nibble 0xC or 0xD)
				if %+1 > ..sz
					; not enough bytes left
					display 'invalid UTF8',10
					err
				end if
				..ch = ..ci
				if (..cj shr 8) and 0xc0 <> 0x80
					display 'invalid UTF8',10
					err
				end if
				..cj = ((..ch and 0xff) shl 6) and 0x7c0
				..ch = (..ch shr 8) and 0x3f
				..cj = ..cj or ..ch
				if ..cj < 0x80
					; overlong encoding
					display 'invalid UTF8',10
					err
				end if
				; codepoint is in ..cj
				..cc = ..cc + 1
				..ign = 1
			else if ..ch = 14
				; three-byte sequence (lead nibble 0xE)
				if %+2 > ..sz
					; not enough bytes left
					display 'invalid UTF8',10
					err
				end if
				..ch = ..ci
				if (..cj shr 8) and 0xc0 <> 0x80
					display 'invalid UTF8',10
					err
				end if
				if (..cj shr 16) and 0xc0 <> 0x80
					display 'invalid UTF8',10
					err
				end if
				..cj = ((..ch and 0xff) shl 12) and 0xf000
				..ci = (((..ch shr 8) and 0xff) shl 6) and 0xfc0
				..cj = ..cj or ..ci
				..ch = (..ch shr 16) and 0x3f
				..cj = ..cj or ..ch
				if ..cj < 0x800
					; overlong encoding
					display 'invalid UTF8',10
					err
				end if
				if ..cj >= 0xd800 & ..cj <= 0xdfff
					; surrogates are not valid scalar values in UTF8, and
					; would be stored as a lone surrogate code unit
					display 'invalid UTF8',10
					err
				end if
				; codepoint is in ..cj
				..cc = ..cc + 1
				..ign = 2
			else
				; four-byte sequence (lead nibble 0xF)
				if %+3 > ..sz
					; not enough bytes left
					display 'invalid UTF8',10
					err
				end if
				..ch = ..ci
				if ..ch and 0x08 > 0
					; lead byte 0xF8 or above
					display 'invalid UTF8',10
					err
				end if
				if (..ch shr 8) and 0xc0 <> 0x80
					display 'invalid UTF8',10
					err
				end if
				if (..ch shr 16) and 0xc0 <> 0x80
					display 'invalid UTF8',10
					err
				end if
				if (..ch shr 24) and 0xc0 <> 0x80
					display 'invalid UTF8',10
					err
				end if
				..cj = ((..ch and 0xff) shl 18) and 0x1c0000
				..ci = (((..ch shr 8) and 0xff) shl 12) and 0x3f000
				..cj = ..cj or ..ci
				..ci = (((..ch shr 16) and 0xff) shl 6) and 0xfc0
				..cj = ..cj or ..ci
				..ci = (..ch shr 24) and 0x3f
				..cj = ..cj or ..ci
				if ..cj < 0x10000
					; overlong encoding
					display 'invalid UTF8',10
					err
				end if
				if ..cj > 0x10ffff
					; beyond the last Unicode codepoint; the surrogate pair
					; math in the second pass would silently truncate it
					display 'invalid UTF8',10
					err
				end if
				; one dword, or a surrogate pair of two words
if string_bits = 32
				..cc = ..cc + 1
else
				..cc = ..cc + 2
end if
				..ign = 3
			end if
		end if
	end repeat
	; our actual string storage
	dalign
	name:
		dq	..cc
	..dat:
	; reserve zeroed space for ..cc code units; filled in with store below
if string_bits = 32
		db	(..cc shl 2) dup 0
else
		db	(..cc shl 1) dup 0
end if
	; and repeat again, only without the validation
	; ..idx = byte offset of the next code unit relative to ..dat
	..ign = 0
	..idx = 0
	repeat ..sz
		if ..ign > 0
			..ign = ..ign - 1
		else
			if %+3 <= ..sz
				load ..ch dword from ..str:(%-1)
			else if %+2 <= ..sz
				load ..ch word from ..str:(%-1)
				load ..ci byte from ..str:(%+1)
				..ch = ..ch or (..ci shl 16)
			else if %+1 <= ..sz
				load ..ch word from ..str:(%-1)
			else
				load ..ch byte from ..str:(%-1)
			end if
			..ci = ..ch
			..cj = ..ch
			..ch = (..ch and 0xf0) shr 4
			if ..ch < 8
				; ascii
				..cj = ..ci and 0xff
				; codepoint is in ..cj
if string_bits = 32
				store dword ..cj at ..dat+..idx
				..idx = ..idx + 4
else
				store word ..cj at ..dat+..idx
				..idx = ..idx + 2
end if
			else if ..ch < 14
				; two-byte sequence (nibbles 8..11 were rejected in pass 1)
				..ch = ..ci
				..cj = ((..ch and 0xff) shl 6) and 0x7c0
				..ch = (..ch shr 8) and 0x3f
				..cj = ..cj or ..ch
				; codepoint is in ..cj
if string_bits = 32
				store dword ..cj at ..dat+..idx
				..idx = ..idx + 4
else
				store word ..cj at ..dat+..idx
				..idx = ..idx + 2
end if
				..ign = 1
			else if ..ch = 14
				; three-byte sequence
				..ch = ..ci
				..cj = ((..ch and 0xff) shl 12) and 0xf000
				..ci = (((..ch shr 8) and 0xff) shl 6) and 0xfc0
				..cj = ..cj or ..ci
				..ch = (..ch shr 16) and 0x3f
				..cj = ..cj or ..ch
				; codepoint is in ..cj
if string_bits = 32
				store dword ..cj at ..dat+..idx
				..idx = ..idx + 4
else
				store word ..cj at ..dat+..idx
				..idx = ..idx + 2
end if
				..ign = 2
			else
				; four-byte sequence
				..ch = ..ci
				..cj = ((..ch and 0xff) shl 18) and 0x1c0000
				..ci = (((..ch shr 8) and 0xff) shl 12) and 0x3f000
				..cj = ..cj or ..ci
				..ci = (((..ch shr 16) and 0xff) shl 6) and 0xfc0
				..cj = ..cj or ..ci
				..ci = (..ch shr 24) and 0x3f
				..cj = ..cj or ..ci
if string_bits = 32
				store dword ..cj at ..dat+..idx
				..idx = ..idx + 4
else
				; split into a surrogate pair: high ten bits then low ten bits
				..ci = (((..cj - 0x10000) shr 10) and 0x3ff) + 0xd800
				; first in ci
				store word ..ci at ..dat+..idx
				..idx = ..idx + 2
				..cj = ((..cj - 0x10000) and 0x3ff) + 0xdc00
				; second in ..cj
				store word ..cj at ..dat+..idx
				..idx = ..idx + 2
end if
				..ign = 3
			end if
		end if
	end repeat
}