HeavyThing - profiler.inc

Jeff Marrison

Table of functions

	; ------------------------------------------------------------------------
	; HeavyThing x86_64 assembly language library and showcase programs
	; Copyright © 2015-2018 2 Ton Digital 
	; Homepage: https://2ton.com.au/
	; Author: Jeff Marrison <jeff@2ton.com.au>
	;       
	; This file is part of the HeavyThing library.
	;       
	; HeavyThing is free software: you can redistribute it and/or modify
	; it under the terms of the GNU General Public License, or
	; (at your option) any later version.
	;       
	; HeavyThing is distributed in the hope that it will be useful, 
	; but WITHOUT ANY WARRANTY; without even the implied warranty of
	; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
	; GNU General Public License for more details.
	;       
	; You should have received a copy of the GNU General Public License along
	; with the HeavyThing library. If not, see <http://www.gnu.org/licenses/>.
	; ------------------------------------------------------------------------
	;       
	; profiler.inc: automatic callgraph/profiling goods
	;
	; SOME NOTES HERE:
	;
	; This isn't 100% accurate of course... but does provide excellent per-cycle
	; function timings and of course the actual callgraph itself.
	;
	; In other words, if this profiler says something takes X clock cycles
	; some of those clocks are still buried in the unpleasantness in profiler itself
	; and while measures were taken to reduce the exponential nesting results of this
	; profiling a single line function does not equal <10 clock cycles... a quick
	; look at how the profiling function enter/leave macros looks is sufficient to
	; understand why this is, hahaha
	;
	; that being said: for optimization efforts, this does precisely what I wanted it
	; to, and provides clock timing quite suitable for optimization efforts.
	;
	; finer-grained-than-this requires a bit more custom efforts.
	;
	; ALSO NOTE: profiler$init does some precomputing for delta timing, and tries
	; to wakeup the power management goods PRIOR to doing so.
	; this of course comes at a penalty that you'll see in the profiler output of ht$init
	; 
	; if you are on some crazy-fast machine, or if this annoys you, play with the values
	; in profiler$init to adjust its delta timing and how much effort is spent doing so
	;
	; TODO: with profiling enabled, it would also be trivial to use the profiler
	; state tables to generate named-function stack traces at any given time.
	; might be useful to toss in.
	; 
	;


; prolog NAME: standard function entry sequence.
;   public_funcs   : NAME is exported
;   framepointers  : an rbp frame is established
;   profiling      : the function name is embedded as a cleartext string right
;                    behind a short jump, and its address is handed to
;                    profiler$enter in rdi
;   calltracing    : (only consulted when profiling is off) the line
;                    "calltrace: NAME" followed by a newline is written to
;                    file descriptor 1 with syscall_write
; All registers touched by the instrumentation are pushed before and popped
; after it, so the function body still sees its incoming arguments.
macro prolog name {
	local .fn_name,.fn_body,.fn_namelen
	if public_funcs
		public name
	end if
	if framepointers
		push	rbp
		mov	rbp, rsp
	end if
	if profiling
		jmp	.fn_body			; skip over the embedded name string
		cleartext .fn_name, `name
		calign
	.fn_body:
		; the argument registers must reach the function body untouched
		push	rax rcx rdx rdi rsi r8 r9 r10 r11
		mov	rdi, .fn_name
		call	profiler$enter			; rdi == function name
		pop	r11 r10 r9 r8 rsi rdi rdx rcx rax
	else if calltracing
		jmp	.fn_body			; skip over the embedded trace line
	.fn_name:
		db	'calltrace: '
		db	`name
		db	10
		.fn_namelen = $ - .fn_name
		calign
	.fn_body:
		; same save set as the profiling variant (the syscall itself eats rcx and r11)
		push	rax rcx rdx rdi rsi r8 r9 r10 r11
		mov	eax, syscall_write
		mov	edi, 1
		mov	rsi, .fn_name
		mov	edx, .fn_namelen
		syscall
		pop	r11 r10 r9 r8 rsi rdi rdx rcx rax
	end if
}

; prolog_noprofile NAME: function entry without any profiler or calltrace hook.
; Exports NAME when public_funcs is enabled and sets up an rbp frame when
; framepointers is enabled; emits nothing else.
macro prolog_noprofile name {
	if public_funcs
		public name
	end if
	if framepointers
		push	rbp
		mov	rbp, rsp
	end if
}

; prolog_noprofile_silent NAME: like prolog_noprofile, but NAME is never exported.
; The only thing it can emit is the rbp frame setup (when framepointers is enabled);
; the NAME argument is accepted but not used by the macro body.
macro prolog_noprofile_silent name {
	if framepointers
		push	rbp
		mov	rbp, rsp
	end if
}

; prolog_silent NAME: function entry that is profiled but never exported.
; Sets up an rbp frame when framepointers is enabled and, in profiling builds,
; embeds the function name and reports the entry to profiler$enter.
; There is no calltracing variant here.
macro prolog_silent name {
	local .fn_name,.fn_body
	if framepointers
		push	rbp
		mov	rbp, rsp
	end if
	if profiling
		jmp	.fn_body			; skip over the embedded name string
		cleartext .fn_name, `name
		calign
	.fn_body:
		; the argument registers must reach the function body untouched
		push	rax rcx rdx rdi rsi r8 r9 r10 r11
		mov	rdi, .fn_name
		call	profiler$enter			; rdi == function name
		pop	r11 r10 r9 r8 rsi rdi rdx rcx rax
	end if
}

; epilog: standard function exit, the counterpart of prolog/prolog_silent.
; In profiling builds it reports the exit to profiler$leave first, with the
; listed registers saved around the call (rax among them, so the function's
; return value survives). It then tears down the rbp frame when framepointers
; is enabled and returns.
macro epilog {
        if profiling
                ; important for obvious reasons that we preserve _all_ registers
                push    rax rcx rdx rdi rsi r8 r9 r10 r11
                call    profiler$leave
                pop     r11 r10 r9 r8 rsi rdi rdx rcx rax
        end if
        if framepointers
                leave
        end if
        ret
}

; these two inner-ones are useful for profiling individual codesections inside a function
; such that new framepointers and return are not necessary, but that entries aka calls
; can be tracked as well as cycle counts
; CARE MUST BE TAKEN to make sure these stay 1:1 (or the callgraph and everything gets all jacked up)
; prolog_inner NAME: opens a profiled region in the middle of a function.
; No export, no frame setup; in profiling builds it embeds NAME and calls
; profiler$enter exactly like prolog does. It emits nothing when profiling is off.
; Every prolog_inner must be matched by exactly one epilog_inner.
macro prolog_inner name {
	local .region_name,.region_body
	if profiling
		jmp	.region_body			; skip over the embedded name string
		cleartext .region_name, `name
		calign
	.region_body:
		; the surrounding code must not notice that we were here
		push	rax rcx rdx rdi rsi r8 r9 r10 r11
		mov	rdi, .region_name
		call	profiler$enter			; rdi == region name
		pop	r11 r10 r9 r8 rsi rdi rdx rcx rax
	end if
}

; epilog_inner: closes a region opened with prolog_inner.
; In profiling builds it calls profiler$leave with the listed registers saved
; around the call; unlike epilog it neither touches the frame nor returns.
; Emits nothing when profiling is off.
macro epilog_inner {
        if profiling
                ; important for obvious reasons that we preserve _all_ registers
                push    rax rcx rdx rdi rsi r8 r9 r10 r11
                call    profiler$leave
                pop     r11 r10 r9 r8 rsi rdi rdx rcx rax
        end if
}

; epilog_noprofile: function exit without a profiler hook (pairs with the
; prolog_noprofile variants). Tears down the rbp frame when framepointers is
; enabled, then returns.
macro epilog_noprofile {
	if framepointers
		leave
	end if
	ret
}

if profiling

; profiler state, all initially zero:
;   _profiler_tscaccum    running sum of _profiler_deltamod, bumped once per timestamp taken in
;                         profiler$enter/profiler$leave and subtracted from the raw readtsc value
;   _profiler_deltamod    per-readtsc correction computed by profiler$init
;   _profiler_last_tsc    the previous (corrected) timestamp; deltas are measured against it
;   _profiler_current     pointer to the record currently being charged, 0 == not set up yet
;   _profiler_block_free  address of the most recently handed out record (records are carved
;                         off from the top of the block downward)
;   _profiler_block       base address of the mmap'd record block
globals
{
	_profiler_tscaccum	dq	0
	_profiler_deltamod	dq	0
	_profiler_last_tsc	dq	0
	_profiler_current	dq	0
	_profiler_block_free	dq	0
	_profiler_block		dq	0
}

falign
public profiler$init
profiler$init:
	; no arguments, no meaningful return value
	; sole purpose: work out _profiler_deltamod, the correction that enter/leave
	; apply for every timestamp they take
	; overwrites rax, rcx, rdx (plus whatever the readtsc macro touches) and
	; leaves the calibration start time behind in _profiler_last_tsc

	; step 1: keep the cpu busy for a while so that power management has ramped the
	; clock up BEFORE anything gets measured. if we skip this, the clock can change
	; underneath us after calibration, the correction ends up wrong, and negative CPC
	; values can show up in the profiler output.
	xor	eax, eax
	mov	ecx, 320000000
calign
.warmup:
	xor	eax, ecx
	add	eax, ecx
	sub	ecx, 1
	jnz	.warmup

	; step 2: time one million back to back readtscs
	readtsc
	mov	[_profiler_last_tsc], rax
	mov	ecx, 1000000
calign
.sample:
	readtsc
	sub	ecx, 1
	jnz	.sample
	; rax still holds the final reading, turn it into elapsed time for the whole run
	sub	rax, qword [_profiler_last_tsc]
	; step 3: elapsed / iterations == average cost of a single readtsc iteration
	mov	ecx, 1000000
	xor	edx, edx
	div	rcx
	mov	[_profiler_deltamod], rax
	ret


; profiler$reset: no arguments, no return value, touches no registers
; if attempting to clear the profiler from inside a child process, this causes the slate to be wiped clean
; (noting that we toss (leak) the previous profiler mmap segment)
; mechanism: _profiler_current is zeroed, which makes the next profiler$enter take its
; first-call path and map a brand new record block. the other profiler globals are left as they are.
falign
public profiler$reset
profiler$reset:
	mov	qword [_profiler_current], 0
	ret



	; profiler$enter: charge the elapsed time to the current record, then descend into the
	; child record for the function being entered (creating it on first sight)
	; single argument, rdi == function name
	; NOTE: records are matched by comparing the name POINTER, not the string contents
	; no return value; overwrites rax, rcx, rdx, r8, r9, r10 (the first-call path also
	; issues an mmap syscall), which is why the prolog macros save registers around the call
	; if the record block is exhausted or cannot be mapped, the process exits with status 98
falign
public profiler$enter
profiler$enter:
	mov	rcx, [_profiler_current]
	test	rcx, rcx
	jz	.virgin
	; corrected timestamp == raw tsc minus the accumulated per-readtsc correction
	readtsc
	mov	rdx, [_profiler_deltamod]
	mov	r9, [_profiler_tscaccum]
	add	r9, rdx
	mov	[_profiler_tscaccum], r9
	sub	rax, r9
	mov	r8, rax
	sub	rax, qword [_profiler_last_tsc]
	mov	[_profiler_last_tsc], r8
	; rax has the delta, we need to add this to our current item before we make a new current
	add	qword [_profiler_rcx_cycles], rax
	cmp	qword [_profiler_rcx_children], 0
	jz	.newchild
	mov	r10, rcx	; save for linking back to parent
	mov	rcx, [_profiler_rcx_children]
calign
.searchloop:
	; walk the sibling list looking for a record that already belongs to this function
	cmp	[_profiler_rcx_name], rdi
	je	.foundit
	mov	r9, rcx				; save our prev pointer
	mov	rcx, [_profiler_rcx_next]	; move to next one
	test	rcx, rcx
	jnz	.searchloop
	; addchild: not in the list, carve a new record off the block (allocation runs downward)
	mov	r8, [_profiler_block_free]
	sub	r8, profiler_record_size
	cmp	r8, [_profiler_block]
	jbe	.kakked
	mov	[_profiler_block_free], r8
	mov	rcx, r9				; our prev
	; r8 is our spanking new pointer, link to rcx's next
	mov	[_profiler_rcx_next], r8
	mov	rcx, r8
	; set our goods
	mov	qword [_profiler_rcx_name], rdi
	mov	qword [_profiler_rcx_calls], 1
	mov	qword [_profiler_rcx_cycles], 0
	mov	qword [_profiler_rcx_children], 0
	mov	qword [_profiler_rcx_parent], r10
	mov	qword [_profiler_rcx_next], 0
	mov	[_profiler_current], rcx
	; restart the clock so that the bookkeeping above is not charged to the new record
	readtsc
	mov	rdx, [_profiler_deltamod]
	mov	r9, [_profiler_tscaccum]
	add	r9, rdx
	mov	[_profiler_tscaccum], r9
	sub	rax, r9
	mov	[_profiler_last_tsc], rax
	ret
calign
.foundit:
	; rcx matches our function name
	; set current to it, add calls
	add	qword [_profiler_rcx_calls], 1
	mov	[_profiler_current], rcx
	ret
calign
.kakked:
	; out of records, or the block could not be mapped: nothing sensible left to do
	mov	eax, syscall_exit
	mov	edi, 98
	syscall
calign
.newchild:
	; rcx is still loaded with the node we need to add to/set
	mov	r8, [_profiler_block_free]
	sub	r8, profiler_record_size
	cmp	r8, [_profiler_block]
	jbe	.kakked
	mov	[_profiler_block_free], r8
	; r8 is our spanking new pointer, link to rcx's children list
	mov	[_profiler_rcx_children], r8
	mov	r9, rcx				; save our parent pointer
	mov	rcx, r8
	; set our goods
	mov	qword [_profiler_rcx_name], rdi
	mov	qword [_profiler_rcx_calls], 1
	mov	qword [_profiler_rcx_cycles], 0
	mov	qword [_profiler_rcx_children], 0
	mov	qword [_profiler_rcx_parent], r9
	mov	qword [_profiler_rcx_next], 0
	mov	[_profiler_current], rcx
	; restart the clock so that the bookkeeping above is not charged to the new record
	readtsc
	mov	rdx, [_profiler_deltamod]
	mov	r9, [_profiler_tscaccum]
	add	r9, rdx
	mov	[_profiler_tscaccum], r9
	sub	rax, r9
	mov	[_profiler_last_tsc], rax
	ret
calign
.virgin:
	; no profiler current exists (which means by default it is our very first call, so do some setup)
	push	rdi				; save our one and only argument
	mov	eax, syscall_mmap
	xor	edi, edi
	mov	esi, profiler_record_size * profiler_recordcount
	mov	edx, 0x3
	mov	r10d, 0x22
	mov	r8, -1
	xor	r9d, r9d
	syscall
	; a raw syscall reports failure as -errno (-4095..-1), never as the libc style -1,
	; so every value in that range has to be treated as an error
	cmp	rax, -4095
	jae	.kakked
	mov	[_profiler_block], rax
	add	rax, profiler_record_size * (profiler_recordcount - 1)
	mov	rcx, rax
	mov	[_profiler_current], rax	; this is our "global" record
	mov	[_profiler_block_free], rax	; this is the starting point for our next free one
	pop	rdi
	readtsc
	mov	rdx, [_profiler_deltamod]
	add	qword [_profiler_tscaccum], rdx
	sub	rax, qword [_profiler_tscaccum]
	mov	[_profiler_last_tsc], rax
	; zero our record in its entirety
	mov	qword [_profiler_rcx_name], 0
	mov	qword [_profiler_rcx_calls], 0
	mov	qword [_profiler_rcx_cycles], 0
	mov	qword [_profiler_rcx_children], 0
	mov	qword [_profiler_rcx_parent], 0
	mov	qword [_profiler_rcx_next], 0
	jmp	profiler$enter			; it will have better luck this time

	; profiler$leave: charge the elapsed time to the current record and step back up to its parent
	; no arguments, no return value
	; overwrites rax, rcx, rdx, r8, r9 (the epilog macros save registers around the call)
	;
	; two unbalanced situations are tolerated instead of blowing up:
	;  - no current record at all (leave before the first enter, or straight after
	;    profiler$reset): nothing is recorded, we simply return
	;  - the current record is the top level one (parent == 0): its cycles are still
	;    updated, but current stays put. storing the zero parent into _profiler_current
	;    would make the next profiler$enter map a fresh block and discard everything
	;    collected so far.
falign
public profiler$leave
profiler$leave:
	mov	rcx, [_profiler_current]
	test	rcx, rcx
	jz	.done
	; corrected timestamp == raw tsc minus the accumulated per-readtsc correction
	readtsc
	mov	rdx, [_profiler_deltamod]
	mov	r9, [_profiler_tscaccum]
	add	r9, rdx
	mov	[_profiler_tscaccum], r9
	sub	rax, r9
	mov	r8, rax
	sub	rax, qword [_profiler_last_tsc]
	mov	qword [_profiler_last_tsc], r8
	; rax has the delta since the previous timestamp, it belongs to the record we are leaving
	add	qword [_profiler_rcx_cycles], rax
	mov	rcx, [_profiler_rcx_parent]
	test	rcx, rcx
	jz	.done
	mov	[_profiler_current], rcx
.done:
	ret

; size in bytes of one callgraph record: the six qword fields declared below
profiler_record_size = 48

; field layout of a callgraph record, addressed relative to rcx throughout the profiler
virtual at rcx
	; function name as handed to profiler$enter (0 in the top level record)
	_profiler_rcx_name dq ?
	; number of times this record was entered
	_profiler_rcx_calls dq ?
	; accumulated corrected tsc delta charged to this record
	_profiler_rcx_cycles dq ?
	; first child record, 0 if none
	_profiler_rcx_children dq ?
	; parent record, 0 for the top level record
	_profiler_rcx_parent dq ?
	; next sibling record, 0 at the end of the list
	_profiler_rcx_next dq ?
end virtual


if used profiler$to_json | defined include_everything
	; no arguments
	; returns current profiler data as a new json OBJECT... with two children:
	; profiler, a json ARRAY (array of object, suitable for dumping into a tui_datagrid)
	; callgraph, a json ARRAY, same
	;
	; register roles for the whole routine (the local helpers further down rely on them too):
	;   rbx == heap copy of the record table (used for the per function totals)
	;   r12 == scratch map first, later the "profiler" json array
	;   r13 == scratch map first, later the "callgraph" json array
	;   r14/r15 == scratch for the helpers (object being built / key string)
falign
profiler$to_json:
	prolog	profiler$to_json	; yes, we profile ourself
	; save all our callee-save goods
	push	rbx r12 r13 r14 r15

	; we need a copy of the information table (so that it doesn't change in flight while we are doing our thing)
	; the call to heap$alloc below is itself profiled and may hand out new records, which moves
	; _profiler_block_free further down. so: snapshot the pointer once, size the buffer from the
	; snapshot and copy from the snapshot. that way the copy can never be larger than the allocation.
	mov	r12, [_profiler_block_free]
	mov	rdi, [_profiler_block]
	add	rdi, profiler_record_size * profiler_recordcount
	sub	rdi, r12
	call	heap$alloc
	mov	r11, [_profiler_block]
	add	r11, profiler_record_size * profiler_recordcount	; the end
	mov	rbx, rax
	mov	rdi, rax
	mov	rsi, r12
calign
.copyloop:
	mov	rcx, [rsi]
	add	rsi, 8
	mov	[rdi], rcx
	add	rdi, 8
	cmp	rsi, r11
	jb	.copyloop

	; so now we have a copy of the relevant goods in rbx
	; in order to get our per function based totals, we need to iterate the blocks in rbx
	; stop when name == 0, add function name itself to a unique unsignedmap
	; and then iterate that list to generate our per cycle list
	; (cuz there will likely be duplicate same-func names in the list and we are not generating a callgraph yet)
	; (the name == 0 record is the top level one, which is the last record of the copy)
	xor	edi, edi	; sort order
	call	unsignedmap$new
	mov	r12, rax

	mov	rcx, rbx
calign
.functionload:
	mov	rdi, r12
	mov	rsi, [_profiler_rcx_name]
	push	rcx
	call	unsignedmap$insert_unique
	pop	rcx
	add	rcx, profiler_record_size
	cmp	[_profiler_rcx_name], 0
	jne	.functionload
	; ok so now our unsignedmap in r12 is full of our unique function names
	; now we want to iterate through those, and create a second map with cycle counts
	xor	edi, edi	; sort order
	call	unsignedmap$new
	mov	r13, rax
	mov	rax, [r12+_avlofs_next]
	; we know that there is valid functions, so we don't have to check the first one for null
calign
.cycleload:
	mov	rdi, [rax+_avlofs_key]
	call	.gettotals_for_function
	; so at this point, rax still has our node, rdi has our function index, rdx has our total cycle count
	mov	rsi, rdx	; cycle count is our key
	mov	rdx, rdi	; value is our function index
	mov	rdi, r13
	push	rax
	call	unsignedmap$insert	; duplicates fine by me
	pop	rax
	mov	rax, [rax+_avlofs_next]
	test	rax, rax
	jnz	.cycleload

	; we can safely destroy our map in r12, and reuse the spot for our json array
	mov	rdi, r12
	xor	esi, esi	; no clear function
	call	unsignedmap$clear
	mov	rdi, r12
	call	heap$free
	; now, create a new json array called profiler

	mov	rdi, .profilerstr
	call	json$newarray
	mov	r12, rax

	; so now our unsignedmap in r13 has our by-cycle count, go through it backwards
	; we need to tally up the total cycles for all of our program
	xor	r9d, r9d	; accum them here
	mov	rax, [r13+_avlofs_prev]
calign
.gettotalcycles:
	add	r9, [rax+_avlofs_key]
	mov	rax, [rax+_avlofs_prev]
	test	rax, rax
	jnz	.gettotalcycles

	mov	rax, [r13+_avlofs_prev]
calign
.populatecycles:
	mov	rdi, [rax+_avlofs_value]	; function index
	call	.gettotals_for_function		; even though we have cycles, we still need call count
	mov	rsi, rdx
	mov	rdx, r8				; so now, rdi == function, rsi == cycles, rdx == calls, r9 == total cycles
	call	.add_profiler_function
	mov	rax, [rax+_avlofs_prev]
	test	rax, rax
	jnz	.populatecycles

	; we are no longer interested in our map in r13
	mov	rdi, r13
	xor	esi, esi	; no clear function
	call	unsignedmap$clear
	mov	rdi, r13
	call	heap$free

	; we need a new callgraph json array
	mov	rdi, .callgraphstr
	call	json$newarray
	mov	r13, rax


	; ok so, if we use our COPY in rbx, due to all of our own functions calls, it is no longer valid
	; so, we have to make use of the "live" one (cough)
	mov	rcx, [_profiler_block]
	add	rcx, profiler_record_size * (profiler_recordcount-1)	; the "list node" (master root), of which its children list has the goods

	mov	rcx, [_profiler_rcx_children]
	; now, all we have to do is iterate through this list
	; (never empty: this very function was entered through prolog, so its record hangs off the tree)
calign
.graphloop:
	push	rcx
	mov	rdi, rcx	; first arg: the profiler block
	xor	esi, esi	; second arg: the nest level
	call	.add_individual_callgraph
	pop	rcx
	mov	rcx, [_profiler_rcx_next]
	test	rcx, rcx
	jnz	.graphloop

	; done with the table copy
	mov	rdi, rbx
	call	heap$free

	; construct our unnamed object return
	call	string$new
	mov	rdi, rax
	call	json$newobject_nocopy

	mov	r14, rax
	mov	rdi, rax
	mov	rsi, r12
	call	json$appendchild
	mov	rdi, r14
	mov	rsi, r13
	call	json$appendchild
	mov	rax, r14

	pop	r15 r14 r13 r12 rbx
	epilog
cleartext .profilerstr, 'profiler'
cleartext .callgraphstr, 'callgraph'
cleartext .functionstr, 'function'
cleartext .callsstr, 'calls'
cleartext .cyclesstr, 'cycles'
cleartext .cpcstr, 'cpc'
cleartext .totcyclesstr, 'totcycles'
cleartext .percstr, '%'
calign
.add_individual_callgraph:
	; local helper of profiler$to_json, recursive
	; rdi == pointer to our profiler block, rsi == nestlevel
	; builds one json object (function/calls/cycles/cpc/totcycles) for the record, appends it
	; to the callgraph array in r13, then does the same for every child one nest level deeper
	; r14 (object being built) and r15 (key string) are used as scratch
	; stack: [rsp] == record we are working on, [rsp+8] == nest level
	sub	rsp, 16
	mov	[rsp], rdi
	mov	[rsp+8], rsi

	call	string$new
	mov	rdi, rax
	call	json$newobject_nocopy
	mov	r14, rax

	mov	rdi, .functionstr
	call	string$copy
	mov	r15, rax

	mov	rcx, [rsp]

	; the function name is left padded with one space per nest level, which is what gives
	; the callgraph its indentation
	mov	rdi, [_profiler_rcx_name]
	mov	rsi, [rsp+8]
	add	rsi, qword [rdi]		; width + leading spaces for padding
	mov	edx, ' '			; padchar
	call	string$lpad
	mov	rdi, r15
	mov	rsi, rax			; the value
	call	json$newvalue_nocopy
	mov	rdi, r14
	mov	rsi, rax
	call	json$appendchild		; object now has a "function":"name" child.

	mov	rsi, [rsp+8]
	add	rsi, 1				; for the next level down
	mov	[rsp+8], rsi

	mov	rdi, .callsstr
	call	string$copy
	mov	r15, rax

	mov	rcx, [rsp]

	mov	rdi, [_profiler_rcx_calls]
	mov	esi, 10		; radix
	call	string$from_int
	mov	rdi, r15
	mov	rsi, rax
	call	json$newvalue_nocopy
	mov	rdi, r14
	mov	rsi, rax
	call	json$appendchild		; object now has a "calls":"12345" child.

	mov	rdi, .cyclesstr
	call	string$copy
	mov	r15, rax

	mov	rcx, [rsp]

	mov	rdi, [_profiler_rcx_cycles]
	mov	esi, 10		; radix
	call	string$from_int
	mov	rdi, r15
	mov	rsi, rax
	call	json$newvalue_nocopy
	mov	rdi, r14
	mov	rsi, rax
	call	json$appendchild		; object now has a "cycles":"12345" child.

	mov	rdi, .cpcstr
	call	string$copy
	mov	r15, rax

	mov	rcx, [rsp]

if cpc_integers
	mov	rax, [_profiler_rcx_cycles]
	mov	rcx, [_profiler_rcx_calls]
	; div works on rdx:rax, and rdx holds leftovers from the calls above.
	; without clearing it the quotient is garbage or the divide faults outright.
	xor	edx, edx
	div	rcx
	mov	rdi, rax
	mov	esi, 10
	call	string$from_int
	mov	rdi, r15
	mov	rsi, rax
	call	json$newvalue_nocopy
	mov	rdi, r14
	mov	rsi, rax
	call	json$appendchild
else
	mov	rsi, [_profiler_rcx_cycles]
	mov	rdx, [_profiler_rcx_calls]

	cvtsi2sd	xmm0, rsi
	cvtsi2sd	xmm1, rdx
	divsd	xmm0, xmm1			; cycles / calls
	mov	edi, double_string_fixed
	mov	esi, 1				; precision of 1 decimal digit should be plenty
	call	string$from_double
	mov	rdi, r15
	mov	rsi, rax
	call	json$newvalue_nocopy
	mov	rdi, r14
	mov	rsi, rax
	call	json$appendchild		; object now has a "cpc":"123.456" child.
end if

	mov	rdi, .totcyclesstr
	call	string$copy
	mov	r15, rax

	; total == our own cycles plus those of everything underneath us
	mov	rdi, [rsp]
	xor	rax, rax
	call	.get_total_cycles

	mov	rdi, rax
	mov	esi, 10		; radix
	call	string$from_int
	mov	rdi, r15
	mov	rsi, rax
	call	json$newvalue_nocopy
	mov	rdi, r14
	mov	rsi, rax
	call	json$appendchild		; object now has a "totcycles":"12345" child.

	; now our object in r14 is complete, add it to r13
	mov	rdi, r13
	mov	rsi, r14
	call	json$appendchild

	; now we need to "nest"
	mov	rcx, [rsp]

	mov	rcx, [_profiler_rcx_children]
	test	rcx, rcx
	jz	.individual_done
calign
.individual_loop:
	; [rsp] is reused to remember which child we are on across the recursive call
	mov	[rsp], rcx
	mov	rdi, rcx
	mov	rsi, [rsp+8]
	call	.add_individual_callgraph
	mov	rcx, [rsp]
	mov	rcx, [_profiler_rcx_next]
	test	rcx, rcx
	jnz	.individual_loop
calign
.individual_done:
	mov	rdi, [rsp]
	mov	rsi, [rsp+8]
	add	rsp, 16
	ret
calign
.get_total_cycles:
	; local helper of profiler$to_json, recursive
	; in:  rdi == record, rax == running total (the caller starts it at zero)
	; out: rax == running total plus the cycles of the record and of all its descendants
	; rcx and rdi are used as the cursor, rsi is carried through untouched
	mov	rcx, rdi
	add	rax, [_profiler_rcx_cycles]
	mov	rcx, [_profiler_rcx_children]
calign
.gtc_child:
	; rcx == next child to visit, 0 once the sibling list is exhausted
	test	rcx, rcx
	jz	.gtc_done
	push	rcx rsi
	mov	rdi, rcx
	call	.get_total_cycles
	pop	rsi rcx
	mov	rcx, [_profiler_rcx_next]
	jmp	.gtc_child
calign
.gtc_done:
	ret
	
calign
.add_profiler_function:
	; local helper of profiler$to_json
	; rdi == function, rsi == cycles, rdx == calls, r9 == total cycles
	; builds one json object (function/calls/cycles/cpc/totcycles) and appends it to the
	; profiler array in r12. here totcycles is a percentage string: cycles / total cycles * 100
	; r14 (object being built) and r15 (key string) are used as scratch
	; rdi, rsi, rdx, r9 and rax are handed back exactly as they came in
	; stack: [rsp] function, [rsp+8] cycles, [rsp+16] calls, [rsp+24] total cycles, [rsp+32] caller's rax
	sub	rsp, 40
	mov	[rsp], rdi
	mov	[rsp+8], rsi
	mov	[rsp+16], rdx
	mov	[rsp+24], r9
	mov	[rsp+32], rax		; we are not allowed to mash rax or r9

	call	string$new
	mov	rdi, rax
	call	json$newobject_nocopy
	mov	r14, rax

	mov	rdi, .functionstr
	call	string$copy
	mov	r15, rax
	mov	rdi, [rsp]
	call	string$copy
	mov	rdi, r15
	mov	rsi, rax
	call	json$newvalue_nocopy
	mov	rdi, r14
	mov	rsi, rax
	call	json$appendchild	; object now has a "function":"name" child

	mov	rdi, .callsstr
	call	string$copy
	mov	r15, rax
	mov	rdi, [rsp+16]
	mov	esi, 10
	call	string$from_int
	mov	rdi, r15
	mov	rsi, rax
	call	json$newvalue_nocopy
	mov	rdi, r14
	mov	rsi, rax
	call	json$appendchild	; object now has a "calls":"12345" child

	mov	rdi, .cyclesstr
	call	string$copy
	mov	r15, rax
	mov	rdi, [rsp+8]
	mov	esi, 10
	call	string$from_int
	mov	rdi, r15
	mov	rsi, rax
	call	json$newvalue_nocopy
	mov	rdi, r14
	mov	rsi, rax
	call	json$appendchild	; object now has a "cycles":"12345" child

	mov	rdi, .cpcstr
	call	string$copy
	mov	r15, rax

if cpc_integers
	mov	rax, [rsp+8]
	mov	rcx, [rsp+16]
	; div works on rdx:rax, and rdx holds leftovers from the calls above.
	; without clearing it the quotient is garbage or the divide faults outright.
	xor	edx, edx
	div	rcx
	mov	rdi, rax
	mov	esi, 10
	call	string$from_int
else

	mov	rsi, [rsp+8]
	mov	rdx, [rsp+16]

	cvtsi2sd	xmm0, rsi
	cvtsi2sd	xmm1, rdx
	divsd	xmm0, xmm1			; cycles / calls
	mov	edi, double_string_fixed
	mov	esi, 1				; precision of 1 decimal digit should be plenty
	call	string$from_double
end if
	mov	rdi, r15
	mov	rsi, rax
	call	json$newvalue_nocopy
	mov	rdi, r14
	mov	rsi, rax
	call	json$appendchild		; object now has a "cpc":"123.456" child.

	mov	rdi, .totcyclesstr
	call	string$copy
	mov	r15, rax

	; for the total cycles field here, we want to add a percentage of total time spent
	mov	rsi, [rsp+8]
	mov	rdx, [rsp+24]
	cvtsi2sd	xmm0, rsi
	cvtsi2sd	xmm1, rdx
	divsd	xmm0, xmm1
	movq	xmm1, [_math_onehundred]
	mulsd	xmm0, xmm1

	mov	edi, double_string_fixed
	mov	esi, 3				; precision argument for our percentage string
						; NOTE(review): presumably 3 decimal digits, confirm against string$from_double
	call	string$from_double

	; string$concat hands back a new string, so hang on to the number-only one to free it afterward
	push	rax
	mov	rdi, rax
	mov	rsi, .percstr
	call	string$concat

	mov	rdi, r15
	mov	rsi, rax
	call	json$newvalue_nocopy
	mov	rdi, r14
	mov	rsi, rax
	call	json$appendchild		; object now has a "totcycles":"99.99%" child.

	pop	rdi
	call	heap$free			; free the first bit of our string

	; our object is complete, now we need to add our unnamed object to the array in r12
	mov	rdi, r12
	mov	rsi, r14
	call	json$appendchild

	; restore our vars
	mov	rdi, [rsp]
	mov	rsi, [rsp+8]
	mov	rdx, [rsp+16]
	mov	r9, [rsp+24]
	mov	rax, [rsp+32]
	add	rsp, 40
	ret

calign
.gettotals_for_function:
	; local helper of profiler$to_json
	; in:  rdi == function name (compared as a pointer), rbx == the copied record table
	; out: rdx == summed cycles, r8 == summed calls over every record carrying that name
	; rax and rdi are left alone, rcx is used as the cursor
	; the walk ends at the first record whose name is 0
	mov	rcx, rbx
	xor	edx, edx
	xor	r8d, r8d
calign
.gffloop:
	cmp	qword [_profiler_rcx_name], rdi
	jne	.gffnext
	; one of ours, fold it into the totals
	add	rdx, qword [_profiler_rcx_cycles]
	add	r8, qword [_profiler_rcx_calls]
.gffnext:
	; step to the following record, stop once we run into the nameless one
	add	rcx, profiler_record_size
	cmp	qword [_profiler_rcx_name], 0
	jne	.gffloop
	ret
end if
	
	
	
if used profiler$to_string | defined include_everything
	; profiler$to_string: no arguments
	; returns (in rax) a new string holding the current profiler data as plain text,
	; suitable for dumping to stdout/stderr: a header line, one line per function,
	; a second header line, one line per callgraph entry
	; stack: [rsp] == the buffer we accumulate into, [rsp+8] == the json tree (later the result string)
falign
profiler$to_string:
	prolog	profiler$to_string	; yes, we profile ourself

	sub	rsp, 16
	call	buffer$new

	; first section header plus linefeed
	mov	rdi, rax
	mov	[rsp], rax
	mov	rsi, .profilerheader
	call	buffer$append_string
	mov	esi, 10
	mov	rdi, [rsp]
	call	buffer$append_byte

	; grab the data itself
	call	profiler$to_json
	mov	rdi, rax
	mov	[rsp+8], rax

	; one output line per entry of the "profiler" array
	mov	rsi, .profilerstr
	call	json$getvaluebyname
	mov	rdx, [rsp]
	mov	rsi, .eachcallgraph
	mov	rdi, [rax+json_contents_ofs]
	call	list$foreach_arg

	; second section header plus linefeed
	mov	rsi, .callgraphheader
	mov	rdi, [rsp]
	call	buffer$append_string
	mov	esi, 10
	mov	rdi, [rsp]
	call	buffer$append_byte

	; one output line per entry of the "callgraph" array
	mov	rsi, .callgraphstr
	mov	rdi, [rsp+8]
	call	json$getvaluebyname
	mov	rdx, [rsp]
	mov	rsi, .eachcallgraph
	mov	rdi, [rax+json_contents_ofs]
	call	list$foreach_arg

	; the json tree has served its purpose
	mov	rdi, [rsp+8]
	call	json$destroy

	; turn the accumulated bytes into our return string, then drop the buffer
	mov	rcx, [rsp]
	mov	rsi, [rcx+buffer_length_ofs]
	mov	rdi, [rcx+buffer_itself_ofs]
	call	string$from_utf8
	mov	[rsp+8], rax
	mov	rdi, [rsp]
	call	buffer$destroy
	mov	rax, [rsp+8]
	add	rsp, 16
	epilog
cleartext .profilerstr, 'profiler'
cleartext .callgraphstr, 'callgraph'
cleartext .profilerheader, 'Function calls listed by total time spent descending:'
cleartext .callgraphheader, 'Actual Call Graph, Listed in Call Order and Dependency Order'
cleartext .functionstr, 'function'
cleartext .callsstr, 'calls'
cleartext .cyclesstr, 'cycles'
cleartext .cpcstr, 'cpc'
cleartext .totcyclesstr, 'totcycles'
cleartext .ind1, ': calls: '
cleartext .ind2, ', cycles: '
cleartext .ind3, ', cpc: '
cleartext .ind4, ', tot_cycles: '
calign
.eachcallgraph:
	; list$foreach_arg callback of profiler$to_string, used for both json arrays
	; rdi == our entry json object, rsi == destination buffer
	; appends one line to the buffer:
	;   <function>: calls: <calls>, cycles: <cycles>, cpc: <cpc>[, tot_cycles: <totcycles>] LF
	; stack: [rsp] == entry object, [rsp+8] == buffer, [rsp+16] == totcycles value object
	sub	rsp, 24
	mov	[rsp+8], rsi
	mov	[rsp], rdi
	mov	rdi, rsi
	mov	esi, 1024
	call	buffer$reserve

	; function name
	mov	rsi, .functionstr
	mov	rdi, [rsp]
	call	json$getvaluebyname
	mov	rdi, [rsp+8]
	mov	rsi, [rax+json_value_ofs]
	call	buffer$append_string

	mov	rsi, .ind1
	mov	rdi, [rsp+8]
	call	buffer$append_string

	; call count
	mov	rsi, .callsstr
	mov	rdi, [rsp]
	call	json$getvaluebyname
	mov	rdi, [rsp+8]
	mov	rsi, [rax+json_value_ofs]
	call	buffer$append_string

	mov	rsi, .ind2
	mov	rdi, [rsp+8]
	call	buffer$append_string

	; cycle count
	mov	rsi, .cyclesstr
	mov	rdi, [rsp]
	call	json$getvaluebyname
	mov	rdi, [rsp+8]
	mov	rsi, [rax+json_value_ofs]
	call	buffer$append_string

	mov	rsi, .ind3
	mov	rdi, [rsp+8]
	call	buffer$append_string

	; cycles per call
	mov	rsi, .cpcstr
	mov	rdi, [rsp]
	call	json$getvaluebyname
	mov	rdi, [rsp+8]
	mov	rsi, [rax+json_value_ofs]
	call	buffer$append_string

	; the total is the only field we treat as optional
	mov	rsi, .totcyclesstr
	mov	rdi, [rsp]
	call	json$getvaluebyname
	test	rax, rax
	jz	.finishline
	mov	[rsp+16], rax

	mov	rsi, .ind4
	mov	rdi, [rsp+8]
	call	buffer$append_string

	mov	rax, [rsp+16]
	mov	rdi, [rsp+8]
	mov	rsi, [rax+json_value_ofs]
	call	buffer$append_string
.finishline:
	; terminate the line
	mov	esi, 10
	mov	rdi, [rsp+8]
	call	buffer$append_byte
	add	rsp, 24
	ret
end if



if used tui_profiler$vtable | defined include_everything
	; in order to create a component, we are going to wrap our goods inside a custom tui_object
	; and keep track of our header label, and allow flipping/refreshing of the profiler data.
	; so this means we need a custom vtable so we can catch keyevents and do a custom clone/cleanup
	;
	; only three slots differ from the plain tui_object entries: the first two
	; (tui_profiler$cleanup, tui_profiler$clone) and the first one of the third row
	; (tui_profiler$keyevent). the order of the entries is the interface, so it must not change.
dalign
tui_profiler$vtable:
        dq      tui_profiler$cleanup, tui_profiler$clone, tui_object$draw, tui_object$redraw, tui_object$updatedisplaylist, tui_object$sizechanged
        dq      tui_object$timer, tui_object$layoutchanged, tui_object$move, tui_object$setfocus, tui_object$gotfocus, tui_object$lostfocus
        dq      tui_profiler$keyevent, tui_object$domodal, tui_object$endmodal, tui_object$exit, tui_object$calcbounds, tui_object$calcchildbounds
        dq      tui_object$appendchild, tui_object$appendbastard, tui_object$prependchild, tui_object$contains, tui_object$getchildindex
        dq      tui_object$removechild, tui_object$removebastard, tui_object$removeallchildren, tui_object$removeallbastards
        dq      tui_object$getobjectsunderpoint, tui_object$flatten, tui_object$firekeyevent, tui_object$ontab, tui_object$onshifttab
        dq      tui_object$setcursor, tui_object$showcursor, tui_object$hidecursor, tui_object$click, tui_object$clicked
end if

; a tui_profiler is a tui_object with four extra qwords tacked onto the end:
; the datagrid showing the data
tui_profiler_dg_ofs = tui_object_size
; the json object returned by profiler$to_json (owned by us, see tui_profiler$cleanup)
tui_profiler_json_ofs = tui_object_size + 8
; which view is showing: 0 == profiler (by function), 1 == callgraph
tui_profiler_state_ofs = tui_object_size + 16
; the header label whose text changes along with the view
tui_profiler_text_ofs = tui_object_size + 24

tui_profiler_size = tui_object_size + 32

if used tui_profiler$new | defined include_everything
	; no arguments, returns a new tui_profiler component in rax (100% width/height)
	; returns a new profiler tui component in rax suitable for adding to an existing tui object,
	; or as an argument to terminal$new, etc.
	; NOTE: this sets the initial datagrid goods with the current profiler data
	;
	; what gets built: the tui_profiler object itself, containing a one line header hbox
	; (a label we retitle later plus a static controls label) and a datagrid with five columns
	; the new object is kept on the stack for the duration: it is [rsp], except while the
	; header hbox is being put together, when the hbox is [rsp] and the object is [rsp+8]
falign
tui_profiler$new:
	prolog	tui_profiler$new
	mov	edi, tui_profiler_size
	call	heap$alloc
	push	rax				; [rsp] == our new tui_profiler object
	mov	rdi, rax
	movq	xmm0, [_math_onehundred]
	movq	xmm1, [_math_onehundred]
	call	tui_object$init_dd
	mov	rax, [rsp]
	mov	qword [rax], tui_profiler$vtable
	call	profiler$to_json
	mov	rdi, [rsp]
	mov	[rdi+tui_profiler_json_ofs], rax	; store the json object itself

	; next up, we need our header line
	; which is an hbox, 100%x1 dims, and in it
	; two labels, one of which we need to be able to modify
	; and the other of which will remain static for the life of our component
	mov	edi, tui_object_size
	call	heap$alloc
	mov	qword [rax], tui_object$default_vtable
	push	rax				; [rsp] == header hbox, [rsp+8] == tui_profiler object
	mov	rdi, rax
	movq	xmm0, [_math_onehundred]
	mov	esi, 1
	call	tui_object$init_di
	mov	rdi, [rsp]
	mov	dword [rdi+tui_layout_ofs], tui_layout_horizontal
	; now we need to create/add a label to it
	movq	xmm0, [_math_onehundred]
	mov	edi, 1
	mov	rsi, .profilerlabel
	ansi_colors edx, 'lightgray', 'black'
	mov	ecx, tui_textalign_left
	call	tui_label$new_di
	; [rsp+8] is our tui_profiler object, we need to set its text label to this one
	mov	rsi, [rsp+8]
	mov	[rsi+tui_profiler_text_ofs], rax
	mov	rsi, rax
	mov	rdi, [rsp]	; the header hbox
	mov	rdx, [rdi]
	call	qword [rdx+tui_vappendchild]
	; now we need our static right hand side text
	mov	edi, 48
	mov	esi, 1
	mov	rdx, .controlslabel
	ansi_colors ecx, 'lightgray', 'black'
	mov	r8d, tui_textalign_right
	call	tui_label$new_ii
	mov	rdi, [rsp]	; the header hbox
	mov	rsi, rax
	mov	rdx, [rdi]
	call	qword [rdx+tui_vappendchild]
	; so now, we have to add the header hbox to our tui_profiler object
	pop	rsi				; the hbox comes off, [rsp] == tui_profiler object again
	mov	rdi, [rsp]
	mov	rdx, [rdi]
	call	qword [rdx+tui_vappendchild]
	

	; next up, we need a datagrid
	movq	xmm0, [_math_onehundred]
	movq	xmm1, [_math_onehundred]
	ansi_colors edi, 'black', 'lightgray'
	ansi_colors esi, 'lightgray', 'black'
	ansi_colors edx, 'lightgray', 'blue'
	call	tui_datagrid$new_dd
	mov	rdi, [rsp]
	mov	[rdi+tui_profiler_dg_ofs], rax
	; set its initial data to the profiler json array object contained in our goods
	mov	rsi, .profilerstr
	mov	rdi, [rdi+tui_profiler_json_ofs]
	call	json$getvaluebyname
	; this should be a json array object sitting in rax
	; (we hold on to the json ourselves and destroy it in tui_profiler$cleanup, hence the _notowner call)
	mov	rdx, [rsp]
	mov	rdi, [rdx+tui_profiler_dg_ofs]
	mov	rsi, rax
	call	tui_datagrid$nvsetdata_notowner

	; so now we have a datagrid with profiler data, the DG needs its headers
	; each column is given a header label, a width, an alignment and the json key it displays
	mov	rdx, [rsp]
	mov	rdi, [rdx+tui_profiler_dg_ofs]
	mov	rsi, .functionlabel
	movq	xmm0, [_math_onehundred]
	mov	edx, tui_textalign_left
	mov	rcx, .functionstr
	call	tui_datagrid$nvaddproperty_d

	mov	rdx, [rsp]
	mov	rdi, [rdx+tui_profiler_dg_ofs]
	mov	rsi, .callslabel
	mov	edx, 14
	mov	ecx, tui_textalign_right
	mov	r8, .callsstr
	call	tui_datagrid$nvaddproperty_i

	mov	rdx, [rsp]
	mov	rdi, [rdx+tui_profiler_dg_ofs]
	mov	rsi, .cycleslabel
	mov	edx, 14
	mov	ecx, tui_textalign_right
	mov	r8, .cyclesstr
	call	tui_datagrid$nvaddproperty_i

	mov	rdx, [rsp]
	mov	rdi, [rdx+tui_profiler_dg_ofs]
	mov	rsi, .cpclabel
	mov	edx, 14
	mov	ecx, tui_textalign_right
	mov	r8, .cpcstr
	call	tui_datagrid$nvaddproperty_i

	mov	rdx, [rsp]
	mov	rdi, [rdx+tui_profiler_dg_ofs]
	mov	rsi, .totlabel
	mov	edx, 14
	mov	ecx, tui_textalign_right
	mov	r8, .totcyclesstr
	call	tui_datagrid$nvaddproperty_i

	; the finished datagrid goes in underneath the header
	mov	rdi, [rsp]
	mov	rsi, [rdi+tui_profiler_dg_ofs]
	mov	rdx, [rdi]
	call	qword [rdx+tui_vappendchild]
	
	; so now, we have added/set everything we need, except our state variable
	; (state 0 == the profiler view, which is what we just loaded)
	mov	rax, [rsp]
	mov	qword [rax+tui_profiler_state_ofs], 0
	add	rsp, 8				; drop our saved object pointer, rax is the return
	
	epilog
cleartext .profilerstr, 'profiler'
cleartext .functionlabel, 'Function Name'
cleartext .functionstr, 'function'
cleartext .callslabel, 'Calls'
cleartext .callsstr, 'calls'
cleartext .cycleslabel, 'Cycles'
cleartext .cyclesstr, 'cycles'
cleartext .cpclabel, 'CPC'
cleartext .cpcstr, 'cpc'
cleartext .totlabel, 'Total Cycles'
cleartext .totcyclesstr, 'totcycles'
cleartext .profilerlabel, ' Functions by time spent descending'
cleartext .controlslabel, 'Space-Profiler/CallGraph Up/Dn-Peruse R-Refresh '

end if

if used tui_profiler$cleanup | defined include_everything
	; single argument in rdi: the tui_profiler object we are cleaning up
	; we call tui_object's cleanup to let it deal with everything but our json, which is the only one we hold onto
	; that isn't handled by tui_object's method
	; NOTE(review): the object's json field is read AFTER tui_object$cleanup returns, so this
	; relies on tui_object$cleanup not releasing the object's own memory -- confirm
falign
tui_profiler$cleanup:
	prolog	tui_profiler$cleanup
	push	rdi				; keep the object pointer across the call
	call	tui_object$cleanup
	pop	rsi
	mov	rdi, [rsi+tui_profiler_json_ofs]
	call	json$destroy
	epilog
end if

if used tui_profiler$clone | defined include_everything
	; single argument in rdi: the tui_profiler object we are making a copy of
	; returns the "copy" in rax. nothing is actually copied from the argument: the result is
	; whatever tui_profiler$new builds, i.e. a fresh component loaded with the profiler data
	; as of right now and showing the profiler view
falign
tui_profiler$clone:
	prolog	tui_profiler$clone
	; because we dont actually care, we just return a spanking new one
	call	tui_profiler$new
	epilog
end if

if used tui_profiler$keyevent | defined include_everything
	; three arguments: rdi == tui_profiler object, esi == key, edx == esc_key
	; returns 1 in eax if we handled it, 0 if not
	; keys we care about:
	;   space (32)  flip between the profiler view and the callgraph view
	;   r or R      throw away the json we hold, regenerate it, and show the profiler view
	; anything else is reported as unhandled
falign
tui_profiler$keyevent:
	prolog	tui_profiler$keyevent
	xor	eax, eax
	cmp	esi, 32
	je	.toggleview
	cmp	esi, 'r'
	je	.refresh
	cmp	esi, 'R'
	je	.refresh
	epilog

calign
.refresh:
	push	rdi				; every path below expects the object at [rsp]
	mov	rdi, [rdi+tui_profiler_json_ofs]
	call	json$destroy
	call	profiler$to_json
	mov	rdi, [rsp]			; .showprofiler wants the object in rdi as well
	mov	[rdi+tui_profiler_json_ofs], rax
	jmp	.showprofiler
calign
.toggleview:
	push	rdi
	cmp	qword [rdi+tui_profiler_state_ofs], 0
	je	.showcallgraph
calign
.showprofiler:
	; in: rdi == the object, which is also sitting at [rsp]
	; point the datagrid at the "profiler" array
	mov	rsi, .profilerstr
	mov	rdi, [rdi+tui_profiler_json_ofs]
	call	json$getvaluebyname

	mov	rdx, [rsp]
	mov	rsi, rax
	mov	rdi, [rdx+tui_profiler_dg_ofs]
	call	tui_datagrid$nvsetdata_notowner
	mov	rdi, [rsp]

	; the header text only needs attention if we were showing the callgraph
	cmp	qword [rdi+tui_profiler_state_ofs], 0
	je	.handled
	mov	qword [rdi+tui_profiler_state_ofs], 0
	mov	rsi, .profilerlabel
	mov	rdi, [rdi+tui_profiler_text_ofs]
	call	tui_label$nvsettext
calign
.handled:
	pop	rdi
	mov	eax, 1
	epilog
calign
.showcallgraph:
	; point the datagrid at the "callgraph" array, flag the state, retitle the header
	mov	rsi, .callgraphstr
	mov	rdi, [rdi+tui_profiler_json_ofs]
	call	json$getvaluebyname
	mov	rdx, [rsp]
	mov	rsi, rax
	mov	rdi, [rdx+tui_profiler_dg_ofs]
	call	tui_datagrid$nvsetdata_notowner
	mov	rdi, [rsp]
	mov	qword [rdi+tui_profiler_state_ofs], 1
	mov	rsi, .callgraphlabel
	mov	rdi, [rdi+tui_profiler_text_ofs]
	call	tui_label$nvsettext
	jmp	.handled
cleartext .profilerstr, 'profiler'
cleartext .callgraphstr, 'callgraph'
cleartext .profilerlabel, ' Functions by time spent descending'
cleartext .callgraphlabel, ' Actual Call Graph'
end if


end if