; ------------------------------------------------------------------------
; HeavyThing x86_64 assembly language library and showcase programs
; Copyright © 2015-2018 2 Ton Digital
; Homepage: https://2ton.com.au/
; Author: Jeff Marrison <jeff@2ton.com.au>
;
; This file is part of the HeavyThing library.
;
; HeavyThing is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, or (at your option) any later version.
;
; HeavyThing is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License along
; with the HeavyThing library. If not, see <http://www.gnu.org/licenses/>.
; ------------------------------------------------------------------------
;
; memfuncs.inc: strlen_latin1, memset/memset16/memset32, memcmp/memcmp16/memcmp32,
; memmove, memcpy (plus a memcpy_inline macro), memreverse, memxor
;
; Portions of this file are heavily modified routines that Agner Fog released,
; also under the GPL, in his excellent optimization resources.
; Because the heavily modified functions herein come from various source files
; of his, and each of his source files contains its own unique copyright
; notice, the asmlib.h header that covers them all is included below.
;
; Cheers Agner for some of this as usual; All Hail Agner!
;
; NOTE re: Agner: where I did in fact follow his fine lead, these are not verbatim
; copies... because of the way I enforce 16 byte aligned jump targets _everywhere_,
; we had to take some liberties...
;
; Agner's asmlib copyright appears below:
;/*************************** asmlib.h ***************************************
;* Author: Agner Fog
;* Date created: 2003-12-12
;* Last modified: 2012-03-10
;* Project: asmlib.zip
;* Source URL: www.agner.org/optimize
;*
;* Description:
;* Header file for the asmlib function library.
;* This library is available in many versions for different platforms.
;* See asmlib-instructions.pdf for details.
;*
;* (c) Copyright 2003 - 2012 by Agner Fog.
;* GNU General Public License http://www.gnu.org/licenses/gpl.html
;*****************************************************************************/
; we also include a strlen_latin1 function in here for C strings
if used strlen_latin1 | defined include_everything
; rdi == c string
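;
; the dword loop below uses the classic SWAR zero-byte test: a dword v contains
; a zero byte iff (v - 0x01010101) & ~v & 0x80808080 is nonzero (no false
; positives, per the Bit Twiddling Hacks derivation).
; a minimal C sketch of the same scan, assuming s is already 4-byte aligned
; (the assembly below also walks the misaligned head), illustrative only:
;
;   #include <stddef.h>
;   #include <stdint.h>
;   #define HASZERO(v) (((v) - 0x01010101u) & ~(v) & 0x80808080u)
;   static size_t strlen_latin1_sketch(const char *s) {
;       const uint32_t *w = (const uint32_t *)s;
;       while (!HASZERO(*w)) w++;          /* 4 bytes at a time */
;       const char *p = (const char *)w;
;       while (*p) p++;                    /* locate the exact zero byte */
;       return (size_t)(p - s);
;   }
;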
falign
strlen_latin1:
prolog strlen_latin1
mov rax, rdi
sub rax, 1
calign
.top:
add rax, 1
test rax, 3
jnz .misaligned
calign
.inner:
mov ecx, [rax]
add rax, 4
mov edx, ecx
not ecx
sub edx, 0x01010101
and ecx, 0x80808080
and ecx, edx
jz .inner
sub rax, 4
calign
.misaligned:
cmp byte [rax], 0
jne .top
sub rax, rdi
epilog
end if
if used memset | defined include_everything
; rdi == dest, esi == char, rdx == count
; rcx == copy of dest (for the tail stores)
; r8 == copy of count
; xmm0 used
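;
; for counts <= 16 we jump through a 17-entry dispatch table to a branch-free,
; exactly-sized store sequence; the imul by 0x01010101 first replicates the fill
; byte into all four bytes of esi (0x5a becomes 0x5a5a5a5a).
; a hedged C sketch of the small-count idea (hypothetical helper, illustrative
; only; the odd sizes use overlapping 4/2/1 byte stores):
;
;   #include <stdint.h>
;   #include <string.h>
;   static void memset_small(uint8_t *d, uint8_t c, unsigned n) { /* n <= 16 */
;       uint32_t v = c * 0x01010101u;
;       switch (n) {                     /* compiles to a jump table, as here */
;       case 16: memcpy(d + 12, &v, 4);  /* fall through */
;       case 12: memcpy(d + 8,  &v, 4);  /* fall through */
;       case 8:  memcpy(d + 4,  &v, 4);  /* fall through */
;       case 4:  memcpy(d,      &v, 4);  /* fall through */
;       case 0:  break;
;       /* ... remaining sizes elided ... */
;       }
;   }
;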
falign
memset:
prolog memset
mov r8, rdx
imul esi, 0x01010101
mov rcx, rdi
cmp rdx, 16
ja .m100
shl rdx, 3
add rdx, .AlignmentDispatch
jmp qword [rdx]
calign
.m16:
mov [rdi+12], esi
mov [rdi+8], esi
mov [rdi+4], esi
mov [rdi], esi
epilog
calign
.m12:
mov [rdi+8], esi
mov [rdi+4], esi
mov [rdi], esi
epilog
calign
.m08:
mov [rdi+4], esi
mov [rdi], esi
epilog
calign
.m04:
mov [rdi], esi
epilog
calign
.m15:
mov [rdi+11], esi
mov [rdi+7], esi
mov [rdi+3], esi
mov [rdi+1], si
mov [rdi], sil
epilog
; yeah wow I don't get that.
calign
.m11:
mov [rdi+7], esi
mov [rdi+3], esi
mov [rdi+1], si
mov [rdi], sil
epilog
calign
.m07:
mov [rdi+3], esi
mov [rdi+1], si
mov [rdi], sil
epilog
calign
.m03:
mov [rdi+1], si
mov [rdi], sil
epilog
calign
.m01:
mov [rdi], sil
epilog
calign
.m14:
mov [rdi+10], esi
mov [rdi+6], esi
mov [rdi+2], esi
mov [rdi], si
epilog
calign
.m10:
mov [rdi+6], esi
mov [rdi+2], esi
mov [rdi], si
epilog
calign
.m06:
mov [rdi+2], esi
mov [rdi], si
epilog
calign
.m02:
mov [rdi], si
epilog
calign
.m13:
mov [rdi+9], esi
mov [rdi+5], esi
mov [rdi+1], esi
mov [rdi], sil
epilog
calign
.m09:
mov [rdi+5], esi
mov [rdi+1], esi
mov [rdi], sil
epilog
calign
.m05:
mov [rdi+1], esi
mov [rdi], sil
epilog
calign
.m00:
epilog
dalign
.AlignmentDispatch:
dq .m00, .m01, .m02, .m03, .m04, .m05, .m06, .m07, .m08, .m09, .m10, .m11, .m12, .m13, .m14, .m15, .m16
; note here: I don't typically do memsets with sizes big enough to warrant Agner's
; non-temporal store goods, so we don't bother checking for them here
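;
; large-count path below: write the first and last 16 bytes with unaligned
; stores, then fill the 16-byte-aligned interior with movdqa; the head/tail
; stores may overlap the aligned body, which is harmless for a fill.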
calign
.m100:
movd xmm0, esi
pshufd xmm0, xmm0, 0
movq [rdi], xmm0
movq [rdi+8], xmm0
lea rdx, [rdi+rdx-1]
and rdx, -10H
add rdi, 10H
and rdi, -10H
sub rdi, rdx
jnl .m300
calign
.m200:
movdqa [rdx+rdi], xmm0
add rdi, 10H
jnz .m200
calign
.m300:
movq [rcx+r8-10H], xmm0
movq [rcx+r8-8], xmm0
epilog
end if
if used memset16 | defined include_everything
; NOTE NOTE NOTE: this is the same as memset only we do utf16 character setting instead of byte setting
; it is ASSUMED that rdx & 1 == 0
; rdi == dest, esi == char, rdx == count in BYTES (not characters, which can be misleading)
; rcx == copy of dest (for the tail stores)
; r8 == copy of count
; xmm0 used
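; the imul by 0x10001 replicates the 16-bit character into both halves of esi
; (0x0041 becomes 0x00410041); from there the stores are identical to memset's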
falign
memset16:
prolog memset16
mov r8, rdx
imul esi, 0x10001
mov rcx, rdi
cmp rdx, 16
ja .m100
shl rdx, 3
add rdx, .AlignmentDispatch
jmp qword [rdx]
calign
.m16:
mov [rdi+12], esi
mov [rdi+8], esi
mov [rdi+4], esi
mov [rdi], esi
epilog
calign
.m12:
mov [rdi+8], esi
mov [rdi+4], esi
mov [rdi], esi
epilog
calign
.m08:
mov [rdi+4], esi
mov [rdi], esi
epilog
calign
.m04:
mov [rdi], esi
epilog
; the odd sizes here and below cannot be reached given the rdx & 1 == 0 assumption,
; but we leave them so the dispatch table stays complete
calign
.m15:
mov [rdi+11], esi
mov [rdi+7], esi
mov [rdi+3], esi
mov [rdi+1], si
mov [rdi], sil
epilog
; yeah wow I don't get that.
calign
.m11:
mov [rdi+7], esi
mov [rdi+3], esi
mov [rdi+1], si
mov [rdi], sil
epilog
calign
.m07:
mov [rdi+3], esi
mov [rdi+1], si
mov [rdi], sil
epilog
calign
.m03:
mov [rdi+1], si
mov [rdi], sil
epilog
calign
.m01:
mov [rdi], sil
epilog
calign
.m14:
mov [rdi+10], esi
mov [rdi+6], esi
mov [rdi+2], esi
mov [rdi], si
epilog
calign
.m10:
mov [rdi+6], esi
mov [rdi+2], esi
mov [rdi], si
epilog
calign
.m06:
mov [rdi+2], esi
mov [rdi], si
epilog
calign
.m02:
mov [rdi], si
epilog
calign
.m13:
mov [rdi+9], esi
mov [rdi+5], esi
mov [rdi+1], esi
mov [rdi], sil
epilog
calign
.m09:
mov [rdi+5], esi
mov [rdi+1], esi
mov [rdi], sil
epilog
calign
.m05:
mov [rdi+1], esi
mov [rdi], sil
epilog
calign
.m00:
epilog
dalign
.AlignmentDispatch:
dq .m00, .m01, .m02, .m03, .m04, .m05, .m06, .m07, .m08, .m09, .m10, .m11, .m12, .m13, .m14, .m15, .m16
; note here: I don't typically do memsets with sizes big enough to warrant Agner's
; non-temporal store goods, so we don't bother checking for them here
calign
.m100:
movd xmm0, esi
pshufd xmm0, xmm0, 0
movq [rdi], xmm0
movq [rdi+8], xmm0
lea rdx, [rdi+rdx-1]
and rdx, -10H
add rdi, 10H
and rdi, -10H
sub rdi, rdx
jnl .m300
calign
.m200:
movdqa [rdx+rdi], xmm0
add rdi, 10H
jnz .m200
calign
.m300:
movq [rcx+r8-10H], xmm0
movq [rcx+r8-8], xmm0
epilog
end if
if used memset32 | defined include_everything
; NOTE NOTE NOTE: this is the same as memset only we do utf32 character setting instead of byte setting
; it is ASSUMED that rdx & 3 == 0
; rdi == dest, esi == char, rdx == count in BYTES (not characters, which can be misleading)
; rcx == copy of dest (for the tail stores)
; r8 == copy of count
; xmm0 used
falign
memset32:
prolog memset32
mov r8, rdx
mov rcx, rdi
cmp rdx, 16
ja .m100
shl rdx, 3
add rdx, .AlignmentDispatch
jmp qword [rdx]
calign
.m16:
mov [rdi+12], esi
mov [rdi+8], esi
mov [rdi+4], esi
mov [rdi], esi
epilog
calign
.m12:
mov [rdi+8], esi
mov [rdi+4], esi
mov [rdi], esi
epilog
calign
.m08:
mov [rdi+4], esi
mov [rdi], esi
epilog
calign
.m04:
mov [rdi], esi
epilog
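; given the rdx & 3 == 0 assumption, only the multiple-of-4 targets can be
; reached; the other sizes are kept so the dispatch table stays complete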
calign
.m15:
mov [rdi+11], esi
mov [rdi+7], esi
mov [rdi+3], esi
mov [rdi+1], si
mov [rdi], sil
epilog
calign
.m11:
mov [rdi+7], esi
mov [rdi+3], esi
mov [rdi+1], si
mov [rdi], sil
epilog
calign
.m07:
mov [rdi+3], esi
mov [rdi+1], si
mov [rdi], sil
epilog
calign
.m03:
mov [rdi+1], si
mov [rdi], sil
epilog
calign
.m01:
mov [rdi], sil
epilog
calign
.m14:
mov [rdi+10], esi
mov [rdi+6], esi
mov [rdi+2], esi
mov [rdi], si
epilog
calign
.m10:
mov [rdi+6], esi
mov [rdi+2], esi
mov [rdi], si
epilog
calign
.m06:
mov [rdi+2], esi
mov [rdi], si
epilog
calign
.m02:
mov [rdi], si
epilog
calign
.m13:
mov [rdi+9], esi
mov [rdi+5], esi
mov [rdi+1], esi
mov [rdi], sil
epilog
calign
.m09:
mov [rdi+5], esi
mov [rdi+1], esi
mov [rdi], sil
epilog
calign
.m05:
mov [rdi+1], esi
mov [rdi], sil
epilog
calign
.m00:
epilog
dalign
.AlignmentDispatch:
dq .m00, .m01, .m02, .m03, .m04, .m05, .m06, .m07, .m08, .m09, .m10, .m11, .m12, .m13, .m14, .m15, .m16
; note here: I don't typically do memsets with sizes big enough to warrant Agner's
; non-temporal store goods, so we don't bother checking for them here
calign
.m100:
movd xmm0, esi
pshufd xmm0, xmm0, 0
movq [rdi], xmm0
movq [rdi+8], xmm0
lea rdx, [rdi+rdx-1]
and rdx, -10H
add rdi, 10H
and rdi, -10H
sub rdi, rdx
jnl .m300
calign
.m200:
movdqa [rdx+rdi], xmm0
add rdi, 10H
jnz .m200
calign
.m300:
movq [rcx+r8-10H], xmm0
movq [rcx+r8-8], xmm0
epilog
end if
if used memcmp | defined include_everything
; ok so, this probably is not the best way to do this, but since we have a decently high
; probability of ALL our memory regions being aligned (thanks to heap$alloc), and since
; the various methods for doing this all perform similarly, this way isn't TERRIBLE by any means
; TODO: study timings better
; rdi == left, rsi == right, rdx == count
; return in rax: <0, 0, >0, same sense as C memcmp
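;
; technique: walk a negative index up toward zero (the loop bound is then a simple
; signed compare), and on a mismatch, bsf over (left xor right) finds the first
; differing bit; since little-endian loads put the lowest-addressed byte in the
; least significant position, bit# / 8 is the first differing byte.
; a minimal C sketch of the mismatch locator (illustrative only, GCC/Clang builtin):
;
;   #include <stdint.h>
;   static int first_diff_byte(uint64_t left, uint64_t right) {
;       uint64_t x = left ^ right;         /* caller ensures x != 0 */
;       return __builtin_ctzll(x) >> 3;    /* ctz == bsf; bit/8 -> byte index */
;   }
;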
falign
memcmp:
prolog memcmp
add rdi, rdx
add rsi, rdx
neg rdx
calign
.do32:
cmp rdx, -32
jg .do16
mov rax, [rdi+rdx]
xor rax, [rsi+rdx]
jnz .inequal64
mov rax, [rdi+rdx+8]
xor rax, [rsi+rdx+8]
jnz .inequal64_8
mov rax, [rdi+rdx+0x10]
xor rax, [rsi+rdx+0x10]
jnz .inequal64_16
mov rax, [rdi+rdx+0x18]
xor rax, [rsi+rdx+0x18]
jnz .inequal64_24
add rdx, 0x20
cmp rdx, -32
jbe .do32
calign
.do16:
cmp rdx, -16
jg .do8
mov rax, [rdi+rdx]
xor rax, [rsi+rdx]
jnz .inequal64
mov rax, [rdi+rdx+8]
xor rax, [rsi+rdx+8]
jnz .inequal64_8
add rdx, 0x10
calign
.do8:
cmp rdx, -8
jg .do4
mov rax, [rdi+rdx]
xor rax, [rsi+rdx]
jnz .inequal64
add rdx, 8
calign
.do4:
cmp rdx, -4
jg .do2
mov eax, [rdi+rdx]
xor eax, [rsi+rdx]
jnz .inequal64 ; this is still okay cuz we are doing BSF
add rdx, 4
calign
.do2:
cmp rdx, -2
jg .do1
xor eax, eax
mov ax, [rdi+rdx]
xor ax, [rsi+rdx]
jnz .inequal64 ; this is still okay cuz we are doing BSF
add rdx, 2
calign
.do1:
cmp rdx, -1
jg .alldone
xor eax, eax
mov al, [rdi+rdx]
xor al, [rsi+rdx]
jnz .inequal64 ; this is still okay cuz we are doing BSF
; rax must be zero
epilog
calign
.alldone:
xor eax, eax
epilog
calign
.inequal64:
; ok so we did an xor of the right into the left, and it said the result was nonzero
; so now we can bit scan forward the value in rax to determine which byte # it was
bsf rax, rax
; so now rax contains a 0..63 bit #, and shifting right by 3 converts it to a 0..7 byte number
shr rax, 3
add rdx, rax
movzx rax, byte [rdi+rdx]
movzx rdi, byte [rsi+rdx]
sub rax, rdi
epilog
calign
.inequal64_8:
bsf rax, rax
shr rax, 3
add rdx, rax
movzx rax, byte [rdi+rdx+8]
movzx rdi, byte [rsi+rdx+8]
sub rax, rdi
epilog
calign
.inequal64_16:
bsf rax, rax
shr rax, 3
add rdx, rax
movzx rax, byte [rdi+rdx+0x10]
movzx rdi, byte [rsi+rdx+0x10]
sub rax, rdi
epilog
calign
.inequal64_24:
bsf rax, rax
shr rax, 3
add rdx, rax
movzx rax, byte [rdi+rdx+0x18]
movzx rdi, byte [rsi+rdx+0x18]
sub rax, rdi
epilog
end if
if used memcmp16 | defined include_everything
; this is the same as memcmp, but does 16-bit instead of byte comparisons, useful for my UTF16 goods
; NOTE: this ASSUMES rdx & 1 is 0
; rdi == left, rsi == right, rdx == count
falign
memcmp16:
prolog memcmp16
add rdi, rdx
add rsi, rdx
neg rdx
calign
.do32:
cmp rdx, -32
jg .do16
mov rax, [rdi+rdx]
xor rax, [rsi+rdx]
jnz .inequal64
mov rax, [rdi+rdx+8]
xor rax, [rsi+rdx+8]
jnz .inequal64_8
mov rax, [rdi+rdx+0x10]
xor rax, [rsi+rdx+0x10]
jnz .inequal64_16
mov rax, [rdi+rdx+0x18]
xor rax, [rsi+rdx+0x18]
jnz .inequal64_24
add rdx, 0x20
cmp rdx, -32
jbe .do32
calign
.do16:
cmp rdx, -16
jg .do8
mov rax, [rdi+rdx]
xor rax, [rsi+rdx]
jnz .inequal64
mov rax, [rdi+rdx+8]
xor rax, [rsi+rdx+8]
jnz .inequal64_8
add rdx, 0x10
calign
.do8:
cmp rdx, -8
jg .do4
mov rax, [rdi+rdx]
xor rax, [rsi+rdx]
jnz .inequal64
add rdx, 8
calign
.do4:
cmp rdx, -4
jg .do2
mov eax, [rdi+rdx]
xor eax, [rsi+rdx]
jnz .inequal64 ; this is still okay cuz we are doing BSF
add rdx, 4
calign
.do2:
cmp rdx, -2
jg .do1
xor eax, eax
mov ax, [rdi+rdx]
xor ax, [rsi+rdx]
jnz .inequal64 ; this is still okay cuz we are doing BSF
add rdx, 2
calign
.do1:
; NEVER REACHED (unless our previous assumption was not adhered to)
cmp rdx, -1
jg .alldone
xor eax, eax
mov al, [rdi+rdx]
xor al, [rsi+rdx]
jnz .inequal64 ; this is still okay cuz we are doing BSF
; rax must be zero
epilog
calign
.alldone:
xor eax, eax
epilog
calign
.inequal64:
; ok so we did an xor of the right into the left, and it said the result was nonzero
; so now we can bit scan forward the value in rax to determine which word # it was
bsf rax, rax
; so now rax contains a 0..63 bit #; shr 4 converts it to a 0..3 word number,
; and the shl 1 turns that back into a byte offset
shr rax, 4
shl rax, 1 ; hmm, shr 3 and then an and instead? or two adds? TODO
add rdx, rax
movzx rax, word [rdi+rdx]
movzx rdi, word [rsi+rdx]
sub rax, rdi
epilog
calign
.inequal64_8:
bsf rax, rax
shr rax, 4
shl rax, 1
add rdx, rax
movzx rax, word [rdi+rdx+8]
movzx rdi, word [rsi+rdx+8]
sub rax, rdi
epilog
calign
.inequal64_16:
bsf rax, rax
shr rax, 4
shl rax, 1
add rdx, rax
movzx rax, word [rdi+rdx+0x10]
movzx rdi, word [rsi+rdx+0x10]
sub rax, rdi
epilog
calign
.inequal64_24:
bsf rax, rax
shr rax, 4
shl rax, 1
add rdx, rax
movzx rax, word [rdi+rdx+0x18]
movzx rdi, word [rsi+rdx+0x18]
sub rax, rdi
epilog
end if
if used memcmp32 | defined include_everything
; this is the same as memcmp, but does 32-bit instead of byte comparisons, useful for my UTF32 goods
; NOTE: this ASSUMES rdx & 3 is 0
; rdi == left, rsi == right, rdx == count
falign
memcmp32:
prolog memcmp32
add rdi, rdx
add rsi, rdx
neg rdx
calign
.do32:
cmp rdx, -32
jg .do16
mov rax, [rdi+rdx]
xor rax, [rsi+rdx]
jnz .inequal64
mov rax, [rdi+rdx+8]
xor rax, [rsi+rdx+8]
jnz .inequal64_8
mov rax, [rdi+rdx+0x10]
xor rax, [rsi+rdx+0x10]
jnz .inequal64_16
mov rax, [rdi+rdx+0x18]
xor rax, [rsi+rdx+0x18]
jnz .inequal64_24
add rdx, 0x20
cmp rdx, -32
jbe .do32
calign
.do16:
cmp rdx, -16
jg .do8
mov rax, [rdi+rdx]
xor rax, [rsi+rdx]
jnz .inequal64
mov rax, [rdi+rdx+8]
xor rax, [rsi+rdx+8]
jnz .inequal64_8
add rdx, 0x10
calign
.do8:
cmp rdx, -8
jg .do4
mov rax, [rdi+rdx]
xor rax, [rsi+rdx]
jnz .inequal64
add rdx, 8
calign
.do4:
cmp rdx, -4
jg .do2
mov eax, [rdi+rdx]
xor eax, [rsi+rdx]
jnz .inequal64 ; this is still okay cuz we are doing BSF
add rdx, 4
calign
.do2:
; NEVER REACHED (unless our previous assumption was not adhered to)
cmp rdx, -2
jg .do1
xor eax, eax
mov ax, [rdi+rdx]
xor ax, [rsi+rdx]
jnz .inequal64 ; this is still okay cuz we are doing BSF
add rdx, 2
calign
.do1:
; NEVER REACHED (unless our previous assumption was not adhered to)
cmp rdx, -1
jg .alldone
xor eax, eax
mov al, [rdi+rdx]
xor al, [rsi+rdx]
jnz .inequal64 ; this is still okay cuz we are doing BSF
; rax must be zero
epilog
calign
.alldone:
xor eax, eax
epilog
calign
.inequal64:
; ok so we did an xor of the right into the left, and it said the result was nonzero
; so now we can bit scan forward the value in rax to determine which dword # it was
bsf rax, rax
; so now rax contains a 0..63 bit #; shr 5 converts it to a 0..1 dword number,
; and the shl 2 turns that back into a byte offset
shr rax, 5
shl rax, 2 ; hmm, shr 3 and then an and instead? or two adds? TODO
add rdx, rax
mov eax, dword [rdi+rdx]
mov edi, dword [rsi+rdx]
sub rax, rdi
epilog
calign
.inequal64_8:
bsf rax, rax
shr rax, 5
shl rax, 2
add rdx, rax
mov eax, dword [rdi+rdx+8]
mov edi, dword [rsi+rdx+8]
sub rax, rdi
epilog
calign
.inequal64_16:
bsf rax, rax
shr rax, 5
shl rax, 2
add rdx, rax
mov eax, dword [rdi+rdx+0x10]
mov edi, dword [rsi+rdx+0x10]
sub rax, rdi
epilog
calign
.inequal64_24:
bsf rax, rax
shr rax, 5
shl rax, 2
add rdx, rax
mov eax, dword [rdi+rdx+0x18]
mov edi, dword [rsi+rdx+0x18]
sub rax, rdi
epilog
end if
if used memmove | defined include_everything
; rdi == dest, rsi == source, rdx == count
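;
; overlap test: using unsigned arithmetic, (dest - source) >= count means a forward
; copy never overwrites source bytes it has yet to read (dest below source wraps to
; a huge value and passes too), so memcpy is safe; otherwise we copy backward.
; a minimal C sketch of the test (illustrative only):
;
;   #include <stdint.h>
;   static int forward_copy_safe(const void *dest, const void *source, uint64_t count) {
;       return (uintptr_t)dest - (uintptr_t)source >= count;
;   }
;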
falign
memmove:
prolog memmove
mov rax, rdi
sub rax, rsi
cmp rax, rdx
jae .memcpyisokay
mov rcx, rdx
mov r9, rdi
cmp rcx, 0x40
jae .b0100
test ecx, 0x20
jz .a100
sub ecx, 0x20
mov rax, [rsi+rcx+0x18]
mov rdx, [rsi+rcx+0x10]
mov [rdi+rcx+0x18], rax
mov [rdi+rcx+0x10], rdx
mov rax, [rsi+rcx+8]
mov rdx, [rsi+rcx]
mov [rdi+rcx+8], rax
mov [rdi+rcx], rdx
calign
.a100:
test ecx, 0x10
jz .a200
sub ecx, 0x10
mov rax, [rsi+rcx+8]
mov rdx, [rsi+rcx]
mov [rdi+rcx+8], rax
mov [rdi+rcx], rdx
calign
.a200:
test ecx, 8
jz .a300
sub ecx, 8
mov rax, [rsi+rcx]
mov [rdi+rcx], rax
calign
.a300:
test ecx, 4
jz .a400
sub ecx, 4
mov eax, [rsi+rcx]
mov [rdi+rcx], eax
jz .a900
calign
.a400:
test ecx, 2
jz .a500
sub ecx, 2
movzx eax, word [rsi+rcx]
mov [rdi+rcx], ax
calign
.a500:
test ecx, 1
jz .a900
movzx eax, byte [rsi+rcx]
mov [rdi+rcx], al
calign
.a900:
mov rax, r9 ; return value == dest, same as C memmove
epilog
calign
.b0100:
; count >= 64
lea edx, [rdi+rcx]
and edx, 0xf
jz .b0300
test edx, 3
jz .b0210
test edx, 1
jz .b0201
sub rcx, 1
movzx eax, byte [rsi+rcx]
mov [rdi+rcx], al
calign
.b0200:
test edx, 2
jz .b0210
calign
.b0201:
sub rcx, 2
movzx eax, word [rsi+rcx]
mov [rdi+rcx], ax
calign
.b0210:
test edx, 4
jz .b0220
sub rcx, 4
mov eax, [rsi+rcx]
mov [rdi+rcx], eax
calign
.b0220:
test edx, 8
jz .b0300
sub rcx, 8
mov rax, [rsi+rcx]
mov [rdi+rcx], rax
calign
.b0300:
; dest aligned 16
lea eax, [rsi+rcx]
and eax, 0xf
mov edx, ecx
and rcx, -20H
sub edx, ecx
sub rsi, rax
add rsi, rdx
add rdi, rdx
mov r8, .alignmentdispatch
jmp qword [r8+rax*8]
dalign
.alignmentdispatch:
dq .c100, .d101, .d102, .d103, .d104, .d105, .d106, .d107
dq .d108, .d109, .d10a, .d10b, .d10c, .d10d, .d10e, .d10f
calign
.c100:
movaps xmm0, [rsi+rcx-10h]
movaps xmm1, [rsi+rcx-20h]
movaps [rdi+rcx-10h], xmm0
movaps [rdi+rcx-20h], xmm1
sub rcx, 20h
jnz .c100
test edx, edx
jz .c500
test edx, 10h
jz .c200
sub rcx, 10h
movaps xmm0, [rsi+rcx]
movaps [rdi+rcx], xmm0
calign
.c200:
test edx, edx
jz .c500
test edx, 8
jz .c210
sub rcx, 8
mov rax, [rsi+rcx]
mov [rdi+rcx], rax
calign
.c210:
test edx, 4
jz .c220
sub rcx, 4
mov eax, [rsi+rcx]
mov [rdi+rcx], eax
jz .c500
calign
.c220:
test edx, 2
jz .c230
sub rcx, 2
movzx eax, word [rsi+rcx]
mov [rdi+rcx], ax
calign
.c230:
test edx, 1
jz .c500
movzx eax, byte [rsi+rcx-1]
mov [rdi+rcx-1], al
calign
.c500:
mov rax, r9 ; return value == dest, same as C memmove
epilog
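; the move_reverse* macros below are the backward-copy counterparts of memcpy's
; unaligned forward loops: each destination block is stitched from two aligned
; source loads while rcx walks down toward zero; the _4/_8/_12 variants use
; movss/movsd + shufps rotations, the generic one uses pslldq/psrldq/por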
macro move_reverse_4 {
local .l1,.l2
movaps xmm0, [rsi+rcx]
calign
.l1:
sub rcx, 20h
movaps xmm1, [rsi+rcx+10h]
movaps xmm2, [rsi+rcx]
movaps xmm3, xmm0
movaps xmm0, xmm2
movss xmm2, xmm1
shufps xmm2, xmm2, 00111001b
movss xmm1, xmm3
shufps xmm1, xmm1, 00111001b
movaps [rdi+rcx+10h], xmm1
movaps [rdi+rcx], xmm2
jnz .l1
test edx, 10h
jz .l2
sub rcx, 10h
movaps xmm1, [rsi+rcx]
movss xmm1, xmm0
shufps xmm1, xmm1, 00111001b
movaps [rdi+rcx], xmm1
calign
.l2:
add rsi, rax
jmp .c200
}
macro move_reverse_8 {
local .l1,.l2
movaps xmm0, [rsi+rcx]
shufps xmm0, xmm0, 01001110b
calign
.l1:
sub rcx, 20h
movaps xmm1, [rsi+rcx+10h]
shufps xmm1, xmm1, 01001110b
movsd xmm0, xmm1
movaps [rdi+rcx+10h], xmm0
movaps xmm0, [rsi+rcx]
shufps xmm0, xmm0, 01001110b
movsd xmm1, xmm0
movaps [rdi+rcx], xmm1
jnz .l1
test edx, 10h
jz .l2
sub rcx, 10h
movaps xmm1, [rsi+rcx]
shufps xmm1, xmm1, 01001110b
movsd xmm0, xmm1
movaps [rdi+rcx], xmm0
calign
.l2:
add rsi, rax
jmp .c200
}
macro move_reverse_12 {
local .l1,.l2
movaps xmm0, [rsi+rcx]
shufps xmm0, xmm0, 10010011b
calign
.l1:
sub rcx, 20h
movaps xmm1, [rsi+rcx+10h]
shufps xmm1, xmm1, 10010011b
movss xmm0, xmm1
movaps [rdi+rcx+10h], xmm0
movaps xmm0, [rsi+rcx]
shufps xmm0, xmm0, 10010011b
movss xmm1, xmm0
movaps [rdi+rcx], xmm1
jnz .l1
test edx, 10h
jz .l2
sub rcx, 10h
movaps xmm1, [rsi+rcx]
shufps xmm1, xmm1, 10010011b
movss xmm0, xmm1
movaps [rdi+rcx], xmm0
calign
.l2:
add rsi, rax
jmp .c200
}
macro move_reverse u {
local .l1,.l2
movdqa xmm0, [rsi+rcx]
calign
.l1:
sub rcx, 20h
movdqa xmm1, [rsi+rcx+10h]
movdqa xmm2, [rsi+rcx]
movdqa xmm3, xmm1
pslldq xmm0, 16-u
psrldq xmm1, u
por xmm0, xmm1
movdqa [rdi+rcx+10h], xmm0
movdqa xmm0, xmm2
pslldq xmm3, 16-u
psrldq xmm2, u
por xmm3, xmm2
movdqa [rdi+rcx], xmm3
jnz .l1
test edx, 10h
jz .l2
sub rcx, 10h
movdqa xmm1, [rsi+rcx]
pslldq xmm0, 16-u
psrldq xmm1, u
por xmm0, xmm1
movdqa [rdi+rcx], xmm0
calign
.l2:
add rsi, rax
jmp .c200
}
calign
.d104:
move_reverse_4
calign
.d108:
move_reverse_8
calign
.d10c:
move_reverse_12
calign
.d101:
move_reverse 1
calign
.d102:
move_reverse 2
calign
.d103:
move_reverse 3
calign
.d105:
move_reverse 5
calign
.d106:
move_reverse 6
calign
.d107:
move_reverse 7
calign
.d109:
move_reverse 9
calign
.d10a:
move_reverse 0xa
calign
.d10b:
move_reverse 0xb
calign
.d10d:
move_reverse 0xd
calign
.d10e:
move_reverse 0xe
calign
.d10f:
move_reverse 0xf
calign
.memcpyisokay:
call memcpy
epilog
end if
; memcpy itself is quite large and, for moves of 64 bytes or more, very fast; if you
; know that your copy is going to be small, it may be better to use an inline version:
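; usage sketch for the macro that follows (dest/source/count are placeholders;
; clobbers rdi, rsi, rdx, rcx, rax):
;   mov rdi, dest
;   mov rsi, source
;   mov rdx, count
;   memcpy_inline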
macro memcpy_inline {
local .do32, .do16, .do8, .do4, .do2, .do1, .alldone
add rdi, rdx
add rsi, rdx
neg rdx
calign
.do32:
cmp rdx, -32
jg .do16
mov rcx, [rsi+rdx]
mov rax, [rsi+rdx+8]
mov [rdi+rdx], rcx
mov [rdi+rdx+8], rax
mov rcx, [rsi+rdx+0x10]
mov rax, [rsi+rdx+0x18]
mov [rdi+rdx+0x10], rcx
mov [rdi+rdx+0x18], rax
add rdx, 0x20
jz .alldone
cmp rdx, -32
jle .do32
calign
.do16:
cmp rdx, -16
jg .do8
mov rcx, [rsi+rdx]
mov rax, [rsi+rdx+8]
mov [rdi+rdx], rcx
mov [rdi+rdx+8], rax
add rdx, 0x10
jz .alldone
calign
.do8:
cmp rdx, -8
jg .do4
mov rcx, [rsi+rdx]
mov [rdi+rdx], rcx
add rdx, 8
jz .alldone
calign
.do4:
cmp rdx, -4
jg .do2
mov ecx, [rsi+rdx]
mov [rdi+rdx], ecx
add rdx, 4
jz .alldone
calign
.do2:
cmp rdx, -2
jg .do1
movzx ecx, word [rsi+rdx]
mov [rdi+rdx], cx
add rdx, 2
jz .alldone
calign
.do1:
cmp rdx, -1
jg .alldone
movzx ecx, byte [rsi+rdx]
mov [rdi+rdx], cl
calign
.alldone:
}
if used memcpy | defined include_everything
; rdi == dest, rsi == source, rdx == count
falign
memcpy:
prolog memcpy
mov r9, rdi
cmp rdx, 0x40
jae .loopy
; count < 64: per Agner, better to do 32/16/8/4/2/1 byte chunks for small counts, the main loop for bigger
add rdi, rdx
add rsi, rdx
neg rdx
calign
.do32:
cmp rdx, -32
jg .do16
mov rcx, [rsi+rdx]
mov r8, [rsi+rdx+8]
mov [rdi+rdx], rcx
mov [rdi+rdx+8], r8
mov rcx, [rsi+rdx+0x10]
mov r8, [rsi+rdx+0x18]
mov [rdi+rdx+0x10], rcx
mov [rdi+rdx+0x18], r8
add rdx, 0x20
calign
.do16:
cmp rdx, -16
jg .do8
mov rcx, [rsi+rdx]
mov r8, [rsi+rdx+8]
mov [rdi+rdx], rcx
mov [rdi+rdx+8], r8
add rdx, 0x10
calign
.do8:
cmp rdx, -8
jg .do4
mov rcx, [rsi+rdx]
mov [rdi+rdx], rcx
add rdx, 8
calign
.do4:
cmp rdx, -4
jg .do2
mov ecx, [rsi+rdx]
mov [rdi+rdx], ecx
add rdx, 4
jz .alldone
calign
.do2:
cmp rdx, -2
jg .do1
movzx ecx, word [rsi+rdx]
mov [rdi+rdx], cx
add rdx, 2
jz .alldone
calign
.do1:
cmp rdx, -1
jg .alldone
movzx ecx, byte [rsi+rdx]
mov [rdi+rdx], cl
mov rax, r9
epilog
; rcx == alignment scratch
; r8 == remaining count after the 32 byte main loop
; xmm0..3 used
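;
; strategy for counts >= 64: advance with 1/2/4/8 byte copies until rdi is 16 byte
; aligned, then dispatch on the remaining source misalignment (rsi & 15) to a loop
; of aligned 16 byte loads whose results are stitched together with psrldq/pslldq/por
; (movss/movsd + shufps rotations for the 4/8/12 cases)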
calign
.loopy:
; count >= 64
mov ecx, edi
neg ecx
and ecx, 0xf
jz .l0200
test ecx, 3
jz .l0030
test ecx, 1
jz .l0020
movzx eax, byte [rsi]
mov [rdi], al
add rsi, 1
add rdi, 1
calign
.l0020:
test ecx, 2
jz .l0030
movzx eax, word [rsi]
mov [rdi], ax
add rsi, 2
add rdi, 2
calign
.l0030:
test ecx, 4
jz .l0040
mov eax, [rsi]
mov [rdi], eax
add rsi, 4
add rdi, 4
calign
.l0040:
test ecx, 8
jz .l0050
mov rax, [rsi]
mov [rdi], rax
add rsi, 8
add rdi, 8
calign
.l0050:
sub rdx, rcx
calign
.l0200:
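; note: rdi is already 16 byte aligned on every path into this block, so the
; recheck normally exits straight to .l300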
mov ecx, edi
neg ecx
and ecx, 0xf
jz .l300
add rsi, rcx
add rdi, rcx
sub rdx, rcx
neg rcx
cmp ecx, -8
jg .l200
mov rax, [rsi+rcx]
mov [rdi+rcx], rax
add rcx, 8
calign
.l200:
cmp ecx, -4
jg .l210
mov eax, [rsi+rcx]
mov [rdi+rcx], eax
add rcx, 4
jz .l300
calign
.l210:
cmp ecx, -2
jg .l220
movzx eax, word [rsi+rcx]
mov [rdi+rcx], ax
add rcx, 2
calign
.l220:
cmp ecx, -1
jg .l300
movzx eax, byte [rsi+rcx]
mov [rdi+rcx], al
calign
.l300:
mov eax, esi
and eax, 0xf
mov r8d, edx
and rdx, -20H
add rsi, rdx
add rdi, rdx
sub r8d, edx ; remaining data after loop
sub rsi, rax
neg rdx
shl rax, 3
add rax, .alignmentdispatch
jmp qword [rax]
dalign
.alignmentdispatch:
dq .c100, .d101, .d102, .d103, .d104, .d105, .d106, .d107
dq .d108, .d109, .d10a, .d10b, .d10c, .d10d, .d10e, .d10f
calign
.c100:
movaps xmm0, [rsi+rdx]
movaps xmm1, [rsi+rdx+0x10]
movaps [rdi+rdx], xmm0
movaps [rdi+rdx+0x10], xmm1
add rdx, 0x20
jnz .c100
add rsi, r8
add rdi, r8
neg r8
jz .alldone
cmp r8d, -16
jg .c200
movaps xmm0, [rsi+r8]
movaps [rdi+r8], xmm0
add r8, 0x10
calign
.c200:
cmp r8d, -8
jg .c210
mov rcx, [rsi+r8]
mov [rdi+r8], rcx
add r8, 8
jz .alldone
calign
.c210:
cmp r8d, -4
jg .c220
mov ecx, [rsi+r8]
mov [rdi+r8], ecx
add r8, 4
jz .alldone
calign
.c220:
cmp r8d, -2
jg .c230
movzx ecx, word [rsi+r8]
mov [rdi+r8], cx
add r8, 2
calign
.c230:
cmp r8d, -1
jg .alldone
movzx ecx, byte [rsi+r8]
mov [rdi+r8], cl
mov rax, r9
epilog
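; the unaligned-source loop below forms each 16 byte destination block from two
; aligned loads: lower block shifted right by u bytes, upper block shifted left by
; 16-u, OR'd together. a hedged C intrinsics sketch of one combine step
; (illustrative only; u must be a compile-time constant, 1..15):
;
;   #include <emmintrin.h>
;   #define COMBINE(lo, hi, u) \
;       _mm_or_si128(_mm_srli_si128((lo), (u)), _mm_slli_si128((hi), 16 - (u)))
;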
macro .move_unaligned_sse2 u* {
movdqa xmm0, [rsi+rdx]
calign
@@:
movdqa xmm1, [rsi+rdx+0x10]
movdqa xmm2, [rsi+rdx+0x20]
movdqa xmm3, xmm1
psrldq xmm0, u
pslldq xmm1, 16-u
por xmm0, xmm1
movdqa [rdi+rdx], xmm0
movdqa xmm0, xmm2
psrldq xmm3, u
pslldq xmm2, 16-u
por xmm3, xmm2
movdqa [rdi+rdx+0x10], xmm3
add rdx, 0x20
jnz @b
add rsi, r8
add rdi, r8
neg r8
cmp r8d, -16
jg @f
movdqa xmm1, [rsi+r8+0x10]
psrldq xmm0, u
pslldq xmm1, 16-u
por xmm0, xmm1
movdqa [rdi+r8], xmm0
add r8, 0x10
calign
@@:
add rsi, u
jmp .c200
}
calign
.d101:
.move_unaligned_sse2 1
calign
.d102:
.move_unaligned_sse2 2
calign
.d103:
.move_unaligned_sse2 3
calign
.d104:
movaps xmm0, [rsi+rdx]
calign
@@:
movaps xmm1, [rsi+rdx+0x10]
movss xmm0, xmm1
shufps xmm0, xmm0, 00111001b ; rotate
movaps [rdi+rdx], xmm0
movaps xmm0, [rsi+rdx+0x20]
movss xmm1, xmm0
shufps xmm1, xmm1, 00111001b
movaps [rdi+rdx+0x10], xmm1
add rdx, 0x20
jnz @b
add rsi, r8
add rdi, r8
neg r8
cmp r8d, -10h
jg @f
movaps xmm1, [rsi+r8+0x10]
movss xmm0, xmm1
shufps xmm0, xmm0, 00111001b
movaps [rdi+r8], xmm0
add r8, 10h
calign
@@:
add rsi, 4
jmp .c200
calign
.d105:
.move_unaligned_sse2 5
calign
.d106:
.move_unaligned_sse2 6
calign
.d107:
.move_unaligned_sse2 7
calign
.d108:
movaps xmm0, [rsi+rdx]
calign
@@:
movaps xmm1, [rsi+rdx+0x10]
movsd xmm0, xmm1
shufps xmm0, xmm0, 01001110b
movaps [rdi+rdx], xmm0
movaps xmm0, [rsi+rdx+0x20]
movsd xmm1, xmm0
shufps xmm1, xmm1, 01001110b
movaps [rdi+rdx+0x10], xmm1
add rdx, 0x20
jnz @b
add rsi, r8
add rdi, r8
neg r8
cmp r8d, -10H
jg @f
movaps xmm1, [rsi+r8+0x10]
movsd xmm0, xmm1
shufps xmm0, xmm0, 01001110b
movaps [rdi+r8], xmm0
add r8, 0x10
calign
@@:
add rsi, 8
jmp .c200
calign
.d109:
.move_unaligned_sse2 9
calign
.d10a:
.move_unaligned_sse2 0xa
calign
.d10b:
.move_unaligned_sse2 0xb
calign
.d10c:
movaps xmm0, [rsi+rdx]
shufps xmm0, xmm0, 10010011b
calign
@@:
movaps xmm1, [rsi+rdx+0x10]
movaps xmm2, [rsi+rdx+0x20]
shufps xmm1, xmm1, 10010011b
shufps xmm2, xmm2, 10010011b
movaps xmm3, xmm2
movss xmm2, xmm1
movss xmm1, xmm0
movaps [rdi+rdx], xmm1
movaps [rdi+rdx+0x10], xmm2
movaps xmm0, xmm3
add rdx, 0x20
jnz @b
add rsi, r8
add rdi, r8
neg r8
cmp r8d, -10H
jg @f
movaps xmm1, [rsi+r8+0x10]
shufps xmm1, xmm1, 10010011b
movss xmm1, xmm0
movdqa [rdi+r8], xmm1
add r8, 0x10
calign
@@:
add rsi, 12
jmp .c200
calign
.d10d:
.move_unaligned_sse2 0xd
calign
.d10e:
.move_unaligned_sse2 0xe
calign
.d10f:
.move_unaligned_sse2 0xf
calign
.alldone:
mov rax, r9
epilog
end if
if used memreverse | defined include_everything
; rdi == byte buffer, rsi == length of same (must be >0)
; this reverses the bytes, not particularly efficient, but does the deed
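;
; equivalent C, a straightforward two-pointer swap (illustrative only):
;
;   static void memreverse_sketch(unsigned char *p, unsigned long len) {
;       unsigned char *q = p + len - 1;
;       while (p < q) {
;           unsigned char t = *p;
;           *p++ = *q;
;           *q-- = t;
;       }
;   }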
falign
memreverse:
prolog memreverse
lea rsi, [rdi+rsi]
sub rsi, 1 ; last character pointer
calign
.doit:
movzx ecx, byte [rdi]
movzx edx, byte [rsi]
mov byte [rdi], dl
mov byte [rsi], cl
add rdi, 1
sub rsi, 1
cmp rdi, rsi
jb .doit
epilog
end if
if used memxor | defined include_everything
; rdi == dest, rsi == source, rdx == count
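;
; XORs count bytes of source into dest (dest ^= source), handy for masking and
; combining keystreams; equivalent C (illustrative only):
;
;   static void memxor_sketch(unsigned char *d, const unsigned char *s, unsigned long n) {
;       while (n--) *d++ ^= *s++;
;   }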
falign
memxor:
prolog memxor
add rdi, rdx
add rsi, rdx
neg rdx
calign
.do32:
cmp rdx, -32
jg .do16
mov rcx, [rsi+rdx]
mov rax, [rsi+rdx+8]
xor [rdi+rdx], rcx
xor [rdi+rdx+8], rax
mov rcx, [rsi+rdx+0x10]
mov rax, [rsi+rdx+0x18]
xor [rdi+rdx+0x10], rcx
xor [rdi+rdx+0x18], rax
add rdx, 0x20
jmp .do32
calign
.do16:
cmp rdx, -16
jg .do8
mov rcx, [rsi+rdx]
mov rax, [rsi+rdx+8]
xor [rdi+rdx], rcx
xor [rdi+rdx+8], rax
add rdx, 0x10
calign
.do8:
cmp rdx, -8
jg .do4
mov rcx, [rsi+rdx]
xor [rdi+rdx], rcx
add rdx, 8
calign
.do4:
cmp rdx, -4
jg .do2
mov ecx, [rsi+rdx]
xor [rdi+rdx], ecx
add rdx, 4
jz .alldone
calign
.do2:
cmp rdx, -2
jg .do1
movzx ecx, word [rsi+rdx]
xor [rdi+rdx], cx
add rdx, 2
jz .alldone
calign
.do1:
cmp rdx, -1
jg .alldone
movzx ecx, byte [rsi+rdx]
xor [rdi+rdx], cl
epilog
calign
.alldone:
epilog
end if