; ------------------------------------------------------------------------
; HeavyThing x86_64 assembly language library and showcase programs
; Copyright © 2015-2018 2 Ton Digital
; Homepage: https://2ton.com.au/
; Author: Jeff Marrison <jeff@2ton.com.au>
;
; This file is part of the HeavyThing library.
;
; HeavyThing is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License, or
; (at your option) any later version.
;
; HeavyThing is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License along
; with the HeavyThing library. If not, see <http://www.gnu.org/licenses/>.
; ------------------------------------------------------------------------
;
; aes.inc: aes128/aes192/aes256 goodies, based on public domain implementation from Wei Dai
;
; NOTE: in keeping with my overall library requirement of SSE2-only operation, if AESNI is available
; we happily make use of it, but otherwise, falls back to plain-old x86_64 version.
;
; Some notes here on timing attacks and the like for the non-AESNI version only:
; I have implemented the Wei Dai timing countermeasures; there really isn't a performance hit,
; and they appear to directly address the issues raised by Joseph Bonneau and Ilya Mironov back in 2006
; (32M block samples required).
;
; Considering that Wei Dai's own library which has been externally validated is _not_ using
; the countermeasure (thanks to gcc optimizing it out wholly), I very much doubt this matters a great deal,
; and it certainly doesn't matter for the way I use these tidbits.
;
; YMMV. hahah
;
; layout of the aes state object; all offsets are relative to the 16-byte-aligned base that the init functions compute
aes_rounds_ofs = 0 ; dword: round count (10, 12, or 14), written by aes$init_common
aes_loopidx_ofs = 8 ; dispatch offset written by aes$init_common: (0, 1, or 2) << 3 for software AES128, AES192, or AES256, and (3, 4, or 5) << 3 for the AESNI equivalents
aes_roundkeys_ofs = 16 ; round keys themselves, room for 15 x 16 bytes worth
aes_size = 264 ; we actually only need 256, but all of the aesni goods require our object to be aligned 16, so on entry, we force-align rdi
; NOTE(review): the 8 spare bytes only cover the force-align if the caller's pointer is already 8-byte aligned -- confirm against the allocators in use
if used aes$tls | defined include_everything
	; aes$tls: table of pointers to the four AES entry points, in the fixed order
	; init_encrypt, encrypt, init_decrypt, decrypt, so that a caller can hook them by index
dalign
aes$tls:
	dq	aes$init_encrypt
	dq	aes$encrypt
	dq	aes$init_decrypt
	dq	aes$decrypt
end if
if public_funcs & (used aes$Td | used aes$Te | used aes$Sd | used aes$Se)
	; aes$data: exported marker label, emitted only for public_funcs builds that reference at least one of the AES lookup tables
dalign
public aes$data
aes$data:
end if
if used aes$Se | defined include_everything
; aes$Se: the AES forward S-box (SubBytes), 256 bytes indexed directly by the input byte value
; (laid out 20 bytes per source line, so rows here do not correspond to the usual 16x16 presentation)
dalign
aes$Se:
db 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d
db 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
db 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2
db 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
db 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb
db 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
db 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d
db 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
db 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d
db 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
db 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9
db 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
db 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
end if
if used aes$Sd | defined include_everything
; aes$Sd: the AES inverse S-box (InvSubBytes), 256 bytes indexed directly by the input byte value
; (laid out 20 bytes per source line, so rows here do not correspond to the usual 16x16 presentation)
dalign
aes$Sd:
db 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, 0x7c, 0xe3, 0x39, 0x82
db 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
db 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49
db 0x6d, 0x8b, 0xd1, 0x25, 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
db 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, 0x90, 0xd8, 0xab, 0x00
db 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
db 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce
db 0xf0, 0xb4, 0xe6, 0x73, 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
db 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, 0xfc, 0x56, 0x3e, 0x4b
db 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
db 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f
db 0x93, 0xc9, 0x9c, 0xef, 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
db 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
end if
if used aes$Te | defined include_everything
; aes$Te: encryption lookup table, 256 entries of 8 bytes each (two dwords per entry, four entries per source line)
; For entry i, both dwords carry aes$Se[i] in their two middle bytes and share the same top byte;
; the first dword has a zero low byte, the second a non-zero one.
; The top byte appears to be 2*S and the low byte of the second dword 3*S in GF(2^8) (spot-checked on the first entries only).
; NOTE(review): the code that reads this table is not in this part of the file -- presumably it loads dwords at
; byte offsets inside an entry to obtain the rotated column values; confirm against aes$encrypt.
dalign
aes$Te:
dd 0xc6636300, 0xc66363a5, 0xf87c7c00, 0xf87c7c84, 0xee777700, 0xee777799, 0xf67b7b00, 0xf67b7b8d
dd 0xfff2f200, 0xfff2f20d, 0xd66b6b00, 0xd66b6bbd, 0xde6f6f00, 0xde6f6fb1, 0x91c5c500, 0x91c5c554
dd 0x60303000, 0x60303050, 0x02010100, 0x02010103, 0xce676700, 0xce6767a9, 0x562b2b00, 0x562b2b7d
dd 0xe7fefe00, 0xe7fefe19, 0xb5d7d700, 0xb5d7d762, 0x4dabab00, 0x4dababe6, 0xec767600, 0xec76769a
dd 0x8fcaca00, 0x8fcaca45, 0x1f828200, 0x1f82829d, 0x89c9c900, 0x89c9c940, 0xfa7d7d00, 0xfa7d7d87
dd 0xeffafa00, 0xeffafa15, 0xb2595900, 0xb25959eb, 0x8e474700, 0x8e4747c9, 0xfbf0f000, 0xfbf0f00b
dd 0x41adad00, 0x41adadec, 0xb3d4d400, 0xb3d4d467, 0x5fa2a200, 0x5fa2a2fd, 0x45afaf00, 0x45afafea
dd 0x239c9c00, 0x239c9cbf, 0x53a4a400, 0x53a4a4f7, 0xe4727200, 0xe4727296, 0x9bc0c000, 0x9bc0c05b
dd 0x75b7b700, 0x75b7b7c2, 0xe1fdfd00, 0xe1fdfd1c, 0x3d939300, 0x3d9393ae, 0x4c262600, 0x4c26266a
dd 0x6c363600, 0x6c36365a, 0x7e3f3f00, 0x7e3f3f41, 0xf5f7f700, 0xf5f7f702, 0x83cccc00, 0x83cccc4f
dd 0x68343400, 0x6834345c, 0x51a5a500, 0x51a5a5f4, 0xd1e5e500, 0xd1e5e534, 0xf9f1f100, 0xf9f1f108
dd 0xe2717100, 0xe2717193, 0xabd8d800, 0xabd8d873, 0x62313100, 0x62313153, 0x2a151500, 0x2a15153f
dd 0x08040400, 0x0804040c, 0x95c7c700, 0x95c7c752, 0x46232300, 0x46232365, 0x9dc3c300, 0x9dc3c35e
dd 0x30181800, 0x30181828, 0x37969600, 0x379696a1, 0x0a050500, 0x0a05050f, 0x2f9a9a00, 0x2f9a9ab5
dd 0x0e070700, 0x0e070709, 0x24121200, 0x24121236, 0x1b808000, 0x1b80809b, 0xdfe2e200, 0xdfe2e23d
dd 0xcdebeb00, 0xcdebeb26, 0x4e272700, 0x4e272769, 0x7fb2b200, 0x7fb2b2cd, 0xea757500, 0xea75759f
dd 0x12090900, 0x1209091b, 0x1d838300, 0x1d83839e, 0x582c2c00, 0x582c2c74, 0x341a1a00, 0x341a1a2e
dd 0x361b1b00, 0x361b1b2d, 0xdc6e6e00, 0xdc6e6eb2, 0xb45a5a00, 0xb45a5aee, 0x5ba0a000, 0x5ba0a0fb
dd 0xa4525200, 0xa45252f6, 0x763b3b00, 0x763b3b4d, 0xb7d6d600, 0xb7d6d661, 0x7db3b300, 0x7db3b3ce
dd 0x52292900, 0x5229297b, 0xdde3e300, 0xdde3e33e, 0x5e2f2f00, 0x5e2f2f71, 0x13848400, 0x13848497
dd 0xa6535300, 0xa65353f5, 0xb9d1d100, 0xb9d1d168, 0x00000000, 0x00000000, 0xc1eded00, 0xc1eded2c
dd 0x40202000, 0x40202060, 0xe3fcfc00, 0xe3fcfc1f, 0x79b1b100, 0x79b1b1c8, 0xb65b5b00, 0xb65b5bed
dd 0xd46a6a00, 0xd46a6abe, 0x8dcbcb00, 0x8dcbcb46, 0x67bebe00, 0x67bebed9, 0x72393900, 0x7239394b
dd 0x944a4a00, 0x944a4ade, 0x984c4c00, 0x984c4cd4, 0xb0585800, 0xb05858e8, 0x85cfcf00, 0x85cfcf4a
dd 0xbbd0d000, 0xbbd0d06b, 0xc5efef00, 0xc5efef2a, 0x4faaaa00, 0x4faaaae5, 0xedfbfb00, 0xedfbfb16
dd 0x86434300, 0x864343c5, 0x9a4d4d00, 0x9a4d4dd7, 0x66333300, 0x66333355, 0x11858500, 0x11858594
dd 0x8a454500, 0x8a4545cf, 0xe9f9f900, 0xe9f9f910, 0x04020200, 0x04020206, 0xfe7f7f00, 0xfe7f7f81
dd 0xa0505000, 0xa05050f0, 0x783c3c00, 0x783c3c44, 0x259f9f00, 0x259f9fba, 0x4ba8a800, 0x4ba8a8e3
dd 0xa2515100, 0xa25151f3, 0x5da3a300, 0x5da3a3fe, 0x80404000, 0x804040c0, 0x058f8f00, 0x058f8f8a
dd 0x3f929200, 0x3f9292ad, 0x219d9d00, 0x219d9dbc, 0x70383800, 0x70383848, 0xf1f5f500, 0xf1f5f504
dd 0x63bcbc00, 0x63bcbcdf, 0x77b6b600, 0x77b6b6c1, 0xafdada00, 0xafdada75, 0x42212100, 0x42212163
dd 0x20101000, 0x20101030, 0xe5ffff00, 0xe5ffff1a, 0xfdf3f300, 0xfdf3f30e, 0xbfd2d200, 0xbfd2d26d
dd 0x81cdcd00, 0x81cdcd4c, 0x180c0c00, 0x180c0c14, 0x26131300, 0x26131335, 0xc3ecec00, 0xc3ecec2f
dd 0xbe5f5f00, 0xbe5f5fe1, 0x35979700, 0x359797a2, 0x88444400, 0x884444cc, 0x2e171700, 0x2e171739
dd 0x93c4c400, 0x93c4c457, 0x55a7a700, 0x55a7a7f2, 0xfc7e7e00, 0xfc7e7e82, 0x7a3d3d00, 0x7a3d3d47
dd 0xc8646400, 0xc86464ac, 0xba5d5d00, 0xba5d5de7, 0x32191900, 0x3219192b, 0xe6737300, 0xe6737395
dd 0xc0606000, 0xc06060a0, 0x19818100, 0x19818198, 0x9e4f4f00, 0x9e4f4fd1, 0xa3dcdc00, 0xa3dcdc7f
dd 0x44222200, 0x44222266, 0x542a2a00, 0x542a2a7e, 0x3b909000, 0x3b9090ab, 0x0b888800, 0x0b888883
dd 0x8c464600, 0x8c4646ca, 0xc7eeee00, 0xc7eeee29, 0x6bb8b800, 0x6bb8b8d3, 0x28141400, 0x2814143c
dd 0xa7dede00, 0xa7dede79, 0xbc5e5e00, 0xbc5e5ee2, 0x160b0b00, 0x160b0b1d, 0xaddbdb00, 0xaddbdb76
dd 0xdbe0e000, 0xdbe0e03b, 0x64323200, 0x64323256, 0x743a3a00, 0x743a3a4e, 0x140a0a00, 0x140a0a1e
dd 0x92494900, 0x924949db, 0x0c060600, 0x0c06060a, 0x48242400, 0x4824246c, 0xb85c5c00, 0xb85c5ce4
dd 0x9fc2c200, 0x9fc2c25d, 0xbdd3d300, 0xbdd3d36e, 0x43acac00, 0x43acacef, 0xc4626200, 0xc46262a6
dd 0x39919100, 0x399191a8, 0x31959500, 0x319595a4, 0xd3e4e400, 0xd3e4e437, 0xf2797900, 0xf279798b
dd 0xd5e7e700, 0xd5e7e732, 0x8bc8c800, 0x8bc8c843, 0x6e373700, 0x6e373759, 0xda6d6d00, 0xda6d6db7
dd 0x018d8d00, 0x018d8d8c, 0xb1d5d500, 0xb1d5d564, 0x9c4e4e00, 0x9c4e4ed2, 0x49a9a900, 0x49a9a9e0
dd 0xd86c6c00, 0xd86c6cb4, 0xac565600, 0xac5656fa, 0xf3f4f400, 0xf3f4f407, 0xcfeaea00, 0xcfeaea25
dd 0xca656500, 0xca6565af, 0xf47a7a00, 0xf47a7a8e, 0x47aeae00, 0x47aeaee9, 0x10080800, 0x10080818
dd 0x6fbaba00, 0x6fbabad5, 0xf0787800, 0xf0787888, 0x4a252500, 0x4a25256f, 0x5c2e2e00, 0x5c2e2e72
dd 0x381c1c00, 0x381c1c24, 0x57a6a600, 0x57a6a6f1, 0x73b4b400, 0x73b4b4c7, 0x97c6c600, 0x97c6c651
dd 0xcbe8e800, 0xcbe8e823, 0xa1dddd00, 0xa1dddd7c, 0xe8747400, 0xe874749c, 0x3e1f1f00, 0x3e1f1f21
dd 0x964b4b00, 0x964b4bdd, 0x61bdbd00, 0x61bdbddc, 0x0d8b8b00, 0x0d8b8b86, 0x0f8a8a00, 0x0f8a8a85
dd 0xe0707000, 0xe0707090, 0x7c3e3e00, 0x7c3e3e42, 0x71b5b500, 0x71b5b5c4, 0xcc666600, 0xcc6666aa
dd 0x90484800, 0x904848d8, 0x06030300, 0x06030305, 0xf7f6f600, 0xf7f6f601, 0x1c0e0e00, 0x1c0e0e12
dd 0xc2616100, 0xc26161a3, 0x6a353500, 0x6a35355f, 0xae575700, 0xae5757f9, 0x69b9b900, 0x69b9b9d0
dd 0x17868600, 0x17868691, 0x99c1c100, 0x99c1c158, 0x3a1d1d00, 0x3a1d1d27, 0x279e9e00, 0x279e9eb9
dd 0xd9e1e100, 0xd9e1e138, 0xebf8f800, 0xebf8f813, 0x2b989800, 0x2b9898b3, 0x22111100, 0x22111133
dd 0xd2696900, 0xd26969bb, 0xa9d9d900, 0xa9d9d970, 0x078e8e00, 0x078e8e89, 0x33949400, 0x339494a7
dd 0x2d9b9b00, 0x2d9b9bb6, 0x3c1e1e00, 0x3c1e1e22, 0x15878700, 0x15878792, 0xc9e9e900, 0xc9e9e920
dd 0x87cece00, 0x87cece49, 0xaa555500, 0xaa5555ff, 0x50282800, 0x50282878, 0xa5dfdf00, 0xa5dfdf7a
dd 0x038c8c00, 0x038c8c8f, 0x59a1a100, 0x59a1a1f8, 0x09898900, 0x09898980, 0x1a0d0d00, 0x1a0d0d17
dd 0x65bfbf00, 0x65bfbfda, 0xd7e6e600, 0xd7e6e631, 0x84424200, 0x844242c6, 0xd0686800, 0xd06868b8
dd 0x82414100, 0x824141c3, 0x29999900, 0x299999b0, 0x5a2d2d00, 0x5a2d2d77, 0x1e0f0f00, 0x1e0f0f11
dd 0x7bb0b000, 0x7bb0b0cb, 0xa8545400, 0xa85454fc, 0x6dbbbb00, 0x6dbbbbd6, 0x2c161600, 0x2c16163a
end if
if used aes$Td | defined include_everything
; aes$Td: decryption lookup table, 256 entries of 8 bytes each (two dwords per entry, four entries per source line)
; For entry i, both dwords share their upper three bytes; the low byte of the first dword is aes$Sd[i].
; The upper bytes appear to be the InvMixColumns multiples (0x0e, 0x09, 0x0d) of aes$Sd[i] and the low byte of the
; second dword its 0x0b multiple (spot-checked on the first entry only).
; The inverse key schedule in aes$init_decrypt reads it as dword [index*8 + aes$Td + k], with k a byte offset of 1..4.
dalign
aes$Td:
dd 0x51f4a752, 0x51f4a750, 0x7e416509, 0x7e416553, 0x1a17a46a, 0x1a17a4c3, 0x3a275ed5, 0x3a275e96
dd 0x3bab6b30, 0x3bab6bcb, 0x1f9d4536, 0x1f9d45f1, 0xacfa58a5, 0xacfa58ab, 0x4be30338, 0x4be30393
dd 0x2030fabf, 0x2030fa55, 0xad766d40, 0xad766df6, 0x88cc76a3, 0x88cc7691, 0xf5024c9e, 0xf5024c25
dd 0x4fe5d781, 0x4fe5d7fc, 0xc52acbf3, 0xc52acbd7, 0x263544d7, 0x26354480, 0xb562a3fb, 0xb562a38f
dd 0xdeb15a7c, 0xdeb15a49, 0x25ba1be3, 0x25ba1b67, 0x45ea0e39, 0x45ea0e98, 0x5dfec082, 0x5dfec0e1
dd 0xc32f759b, 0xc32f7502, 0x814cf02f, 0x814cf012, 0x8d4697ff, 0x8d4697a3, 0x6bd3f987, 0x6bd3f9c6
dd 0x038f5f34, 0x038f5fe7, 0x15929c8e, 0x15929c95, 0xbf6d7a43, 0xbf6d7aeb, 0x95525944, 0x955259da
dd 0xd4be83c4, 0xd4be832d, 0x587421de, 0x587421d3, 0x49e069e9, 0x49e06929, 0x8ec9c8cb, 0x8ec9c844
dd 0x75c28954, 0x75c2896a, 0xf48e797b, 0xf48e7978, 0x99583e94, 0x99583e6b, 0x27b97132, 0x27b971dd
dd 0xbee14fa6, 0xbee14fb6, 0xf088adc2, 0xf088ad17, 0xc920ac23, 0xc920ac66, 0x7dce3a3d, 0x7dce3ab4
dd 0x63df4aee, 0x63df4a18, 0xe51a314c, 0xe51a3182, 0x97513395, 0x97513360, 0x62537f0b, 0x62537f45
dd 0xb1647742, 0xb16477e0, 0xbb6baefa, 0xbb6bae84, 0xfe81a0c3, 0xfe81a01c, 0xf9082b4e, 0xf9082b94
dd 0x70486808, 0x70486858, 0x8f45fd2e, 0x8f45fd19, 0x94de6ca1, 0x94de6c87, 0x527bf866, 0x527bf8b7
dd 0xab73d328, 0xab73d323, 0x724b02d9, 0x724b02e2, 0xe31f8f24, 0xe31f8f57, 0x6655abb2, 0x6655ab2a
dd 0xb2eb2876, 0xb2eb2807, 0x2fb5c25b, 0x2fb5c203, 0x86c57ba2, 0x86c57b9a, 0xd3370849, 0xd33708a5
dd 0x3028876d, 0x302887f2, 0x23bfa58b, 0x23bfa5b2, 0x02036ad1, 0x02036aba, 0xed168225, 0xed16825c
dd 0x8acf1c72, 0x8acf1c2b, 0xa779b4f8, 0xa779b492, 0xf307f2f6, 0xf307f2f0, 0x4e69e264, 0x4e69e2a1
dd 0x65daf486, 0x65daf4cd, 0x0605be68, 0x0605bed5, 0xd1346298, 0xd134621f, 0xc4a6fe16, 0xc4a6fe8a
dd 0x342e53d4, 0x342e539d, 0xa2f355a4, 0xa2f355a0, 0x058ae15c, 0x058ae132, 0xa4f6ebcc, 0xa4f6eb75
dd 0x0b83ec5d, 0x0b83ec39, 0x4060ef65, 0x4060efaa, 0x5e719fb6, 0x5e719f06, 0xbd6e1092, 0xbd6e1051
dd 0x3e218a6c, 0x3e218af9, 0x96dd0670, 0x96dd063d, 0xdd3e0548, 0xdd3e05ae, 0x4de6bd50, 0x4de6bd46
dd 0x91548dfd, 0x91548db5, 0x71c45ded, 0x71c45d05, 0x0406d4b9, 0x0406d46f, 0x605015da, 0x605015ff
dd 0x1998fb5e, 0x1998fb24, 0xd6bde915, 0xd6bde997, 0x89404346, 0x894043cc, 0x67d99e57, 0x67d99e77
dd 0xb0e842a7, 0xb0e842bd, 0x07898b8d, 0x07898b88, 0xe7195b9d, 0xe7195b38, 0x79c8ee84, 0x79c8eedb
dd 0xa17c0a90, 0xa17c0a47, 0x7c420fd8, 0x7c420fe9, 0xf8841eab, 0xf8841ec9, 0x00000000, 0x00000000
dd 0x0980868c, 0x09808683, 0x322bedbc, 0x322bed48, 0x1e1170d3, 0x1e1170ac, 0x6c5a720a, 0x6c5a724e
dd 0xfd0efff7, 0xfd0efffb, 0x0f8538e4, 0x0f853856, 0x3daed558, 0x3daed51e, 0x362d3905, 0x362d3927
dd 0x0a0fd9b8, 0x0a0fd964, 0x685ca6b3, 0x685ca621, 0x9b5b5445, 0x9b5b54d1, 0x24362e06, 0x24362e3a
dd 0x0c0a67d0, 0x0c0a67b1, 0x9357e72c, 0x9357e70f, 0xb4ee961e, 0xb4ee96d2, 0x1b9b918f, 0x1b9b919e
dd 0x80c0c5ca, 0x80c0c54f, 0x61dc203f, 0x61dc20a2, 0x5a774b0f, 0x5a774b69, 0x1c121a02, 0x1c121a16
dd 0xe293bac1, 0xe293ba0a, 0xc0a02aaf, 0xc0a02ae5, 0x3c22e0bd, 0x3c22e043, 0x121b1703, 0x121b171d
dd 0x0e090d01, 0x0e090d0b, 0xf28bc713, 0xf28bc7ad, 0x2db6a88a, 0x2db6a8b9, 0x141ea96b, 0x141ea9c8
dd 0x57f1193a, 0x57f11985, 0xaf750791, 0xaf75074c, 0xee99dd11, 0xee99ddbb, 0xa37f6041, 0xa37f60fd
dd 0xf701264f, 0xf701269f, 0x5c72f567, 0x5c72f5bc, 0x44663bdc, 0x44663bc5, 0x5bfb7eea, 0x5bfb7e34
dd 0x8b432997, 0x8b432976, 0xcb23c6f2, 0xcb23c6dc, 0xb6edfccf, 0xb6edfc68, 0xb8e4f1ce, 0xb8e4f163
dd 0xd731dcf0, 0xd731dcca, 0x426385b4, 0x42638510, 0x139722e6, 0x13972240, 0x84c61173, 0x84c61120
dd 0x854a2496, 0x854a247d, 0xd2bb3dac, 0xd2bb3df8, 0xaef93274, 0xaef93211, 0xc729a122, 0xc729a16d
dd 0x1d9e2fe7, 0x1d9e2f4b, 0xdcb230ad, 0xdcb230f3, 0x0d865235, 0x0d8652ec, 0x77c1e385, 0x77c1e3d0
dd 0x2bb316e2, 0x2bb3166c, 0xa970b9f9, 0xa970b999, 0x11944837, 0x119448fa, 0x47e964e8, 0x47e96422
dd 0xa8fc8c1c, 0xa8fc8cc4, 0xa0f03f75, 0xa0f03f1a, 0x567d2cdf, 0x567d2cd8, 0x2233906e, 0x223390ef
dd 0x87494e47, 0x87494ec7, 0xd938d1f1, 0xd938d1c1, 0x8ccaa21a, 0x8ccaa2fe, 0x98d40b71, 0x98d40b36
dd 0xa6f5811d, 0xa6f581cf, 0xa57ade29, 0xa57ade28, 0xdab78ec5, 0xdab78e26, 0x3fadbf89, 0x3fadbfa4
dd 0x2c3a9d6f, 0x2c3a9de4, 0x507892b7, 0x5078920d, 0x6a5fcc62, 0x6a5fcc9b, 0x547e460e, 0x547e4662
dd 0xf68d13aa, 0xf68d13c2, 0x90d8b818, 0x90d8b8e8, 0x2e39f7be, 0x2e39f75e, 0x82c3af1b, 0x82c3aff5
dd 0x9f5d80fc, 0x9f5d80be, 0x69d09356, 0x69d0937c, 0x6fd52d3e, 0x6fd52da9, 0xcf25124b, 0xcf2512b3
dd 0xc8ac99c6, 0xc8ac993b, 0x10187dd2, 0x10187da7, 0xe89c6379, 0xe89c636e, 0xdb3bbb20, 0xdb3bbb7b
dd 0xcd26789a, 0xcd267809, 0x6e5918db, 0x6e5918f4, 0xec9ab7c0, 0xec9ab701, 0x834f9afe, 0x834f9aa8
dd 0xe6956e78, 0xe6956e65, 0xaaffe6cd, 0xaaffe67e, 0x21bccf5a, 0x21bccf08, 0xef15e8f4, 0xef15e8e6
dd 0xbae79b1f, 0xbae79bd9, 0x4a6f36dd, 0x4a6f36ce, 0xea9f09a8, 0xea9f09d4, 0x29b07c33, 0x29b07cd6
dd 0x31a4b288, 0x31a4b2af, 0x2a3f2307, 0x2a3f2331, 0xc6a594c7, 0xc6a59430, 0x35a26631, 0x35a266c0
dd 0x744ebcb1, 0x744ebc37, 0xfc82ca12, 0xfc82caa6, 0xe090d010, 0xe090d0b0, 0x33a7d859, 0x33a7d815
dd 0xf1049827, 0xf104984a, 0x41ecda80, 0x41ecdaf7, 0x7fcd50ec, 0x7fcd500e, 0x1791f65f, 0x1791f62f
dd 0x764dd660, 0x764dd68d, 0x43efb051, 0x43efb04d, 0xccaa4d7f, 0xccaa4d54, 0xe49604a9, 0xe49604df
dd 0x9ed1b519, 0x9ed1b5e3, 0x4c6a88b5, 0x4c6a881b, 0xc12c1f4a, 0xc12c1fb8, 0x4665510d, 0x4665517f
dd 0x9d5eea2d, 0x9d5eea04, 0x018c35e5, 0x018c355d, 0xfa87747a, 0xfa877473, 0xfb0b419f, 0xfb0b412e
dd 0xb3671d93, 0xb3671d5a, 0x92dbd2c9, 0x92dbd252, 0xe910569c, 0xe9105633, 0x6dd647ef, 0x6dd64713
dd 0x9ad761a0, 0x9ad7618c, 0x37a10ce0, 0x37a10c7a, 0x59f8143b, 0x59f8148e, 0xeb133c4d, 0xeb133c89
dd 0xcea927ae, 0xcea927ee, 0xb761c92a, 0xb761c935, 0xe11ce5f5, 0xe11ce5ed, 0x7a47b1b0, 0x7a47b13c
dd 0x9cd2dfc8, 0x9cd2df59, 0x55f273eb, 0x55f2733f, 0x1814cebb, 0x1814ce79, 0x73c7373c, 0x73c737bf
dd 0x53f7cd83, 0x53f7cdea, 0x5ffdaa53, 0x5ffdaa5b, 0xdf3d6f99, 0xdf3d6f14, 0x7844db61, 0x7844db86
dd 0xcaaff317, 0xcaaff381, 0xb968c42b, 0xb968c43e, 0x38243404, 0x3824342c, 0xc2a3407e, 0xc2a3405f
dd 0x161dc3ba, 0x161dc372, 0xbce22577, 0xbce2250c, 0x283c49d6, 0x283c498b, 0xff0d9526, 0xff0d9541
dd 0x39a801e1, 0x39a80171, 0x080cb369, 0x080cb3de, 0xd8b4e414, 0xd8b4e49c, 0x6456c163, 0x6456c190
dd 0x7bcb8455, 0x7bcb8461, 0xd532b621, 0xd532b670, 0x486c5c0c, 0x486c5c74, 0xd0b8577d, 0xd0b85742
end if
if used aes$init_common | defined include_everything
	; aes$init_common: key schedule setup shared by the encrypt and decrypt inits
	; in:  rdi == aes object (MUST BE ALIGNED 16), rsi == ptr to key, edx == key length in bytes (16, 24, or 32)
	; stores the round count and the dispatch offset into the object, then jumps to the matching
	; unrolled key schedule (software or AESNI flavour, chosen by has_AESNI)
	; NOTE: rdi is never modified through here, the individual inits rely on this fact
falign
aes$init_common:
	prolog	aes$init_common
	mov	eax, edx
	shr	eax, 2				; key length in 32 bit words: 4, 6, or 8
	add	eax, 6				; round count: 10, 12, or 14
	mov	[rdi+aes_rounds_ofs], eax
	shr	edx, 3				; 2, 3, or 4
	sub	edx, 2				; software variant index: 0, 1, or 2
	lea	r8d, [rdx+3]			; AESNI variant index: 3, 4, or 5
	cmp	dword [has_AESNI], 1
	cmove	edx, r8d			; prefer the AESNI variant when has_AESNI == 1
	shl	edx, 3				; index -> byte offset into .dispatch: 0, 8, 16, 24, 32, 40
	mov	[rdi+aes_loopidx_ofs], rdx
	jmp	qword [rdx+.dispatch]
dalign
.dispatch:
	dq	.aes128, .aes192, .aes256
	dq	.aesni128, .aesni192, .aesni256
; aes_keygen kl, ofs, rcon: one unrolled step of the software AES key expansion
;   kl   == key length in 32 bit words (4, 6, or 8)
;   ofs  == word index, within the round key array, of the window this step starts from (rk[0] below)
;   rcon == round constant for this step, already positioned in the top byte of the word
; expects rdi == aes object with rk[0..kl-1] of the window already populated
; writes rk[kl..kl+3], plus rk[10..11] for kl == 6 and rk[12..15] for kl == 8 unless this is the final step
; clobbers eax, ecx, edx, r8d, r9d, r10d, r11d and flags; rdi and rsi are left alone
macro aes_keygen kl*, ofs*, rcon* {
	local rk
	rk = ofs
	; temp = rk[kl-1], split into its four bytes
	mov	eax, [rdi+rk*4+(kl-1)*4+aes_roundkeys_ofs]
	mov	ecx, eax
	mov	edx, eax
	mov	r8d, eax
	and	eax, 0xff0000
	and	ecx, 0xff00
	and	edx, 0xff
	and	r8d, 0xff000000
	shr	eax, 16
	shr	ecx, 8
	shr	r8d, 24
	; x = Se[byte 2] << 24 | Se[byte 1] << 16 | Se[byte 0] << 8 | Se[byte 3], i.e. S-box each byte and rotate the word
	movzx	r9d, byte [rax+aes$Se]		; 2
	movzx	r10d, byte [rcx+aes$Se]		; 1
	movzx	r11d, byte [rdx+aes$Se]		; 0
	shl	r9d, 24
	shl	r10d, 16
	shl	r11d, 8
	movzx	eax, byte [r8+aes$Se]		; 3
	xor	r9d, r10d			; the four bytes occupy disjoint positions, so xor here is the same as or
	xor	r9d, r11d
	xor	r9d, eax			; x
	mov	ecx, [rdi+rk*4+aes_roundkeys_ofs]		; rk[0]
	mov	edx, [rdi+rk*4+4+aes_roundkeys_ofs]		; rk[1]
	mov	r8d, [rdi+rk*4+8+aes_roundkeys_ofs]		; rk[2]
	xor	ecx, r9d
	xor	ecx, rcon			; rk[kl] = rk[0] ^ x ^ rcon
	mov	r9d, [rdi+rk*4+12+aes_roundkeys_ofs]		; rk[3]
	mov	[rdi+rk*4+kl*4+aes_roundkeys_ofs], ecx
	xor	edx, ecx			; rk[kl+1] = rk[1] ^ rk[kl]
	mov	[rdi+rk*4+kl*4+4+aes_roundkeys_ofs], edx
	xor	r8d, edx			; rk[kl+2] = rk[2] ^ rk[kl+1]
	mov	[rdi+rk*4+kl*4+8+aes_roundkeys_ofs], r8d
	xor	r9d, r8d			; rk[kl+3] = rk[3] ^ rk[kl+2]
	mov	[rdi+rk*4+kl*4+12+aes_roundkeys_ofs], r9d
	if kl = 6 & ofs < 42
		; AES192 only (skipped on the final step): rk[10] = rk[4] ^ rk[9], rk[11] = rk[5] ^ rk[10]
		mov	eax, [rdi+rk*4+16+aes_roundkeys_ofs]	; rk[4]
		mov	ecx, [rdi+rk*4+20+aes_roundkeys_ofs]	; rk[5]
		xor	eax, [rdi+rk*4+36+aes_roundkeys_ofs]	; rk[4] ^ rk[9] == rk[10]
		; rk[10] has not been stored yet at this point, so it must come from eax and not from memory
		xor	ecx, eax				; rk[5] ^ rk[10] == rk[11]
		mov	[rdi+rk*4+40+aes_roundkeys_ofs], eax
		mov	[rdi+rk*4+44+aes_roundkeys_ofs], ecx
	else if kl = 8 & ofs < 48
		; AES256 only (skipped on the final step): rk[12] = rk[4] ^ SubWord(rk[11]), then chain rk[13..15]
		mov	eax, [rdi+rk*4+44+aes_roundkeys_ofs]	; temp
		mov	ecx, eax
		mov	edx, eax
		mov	r8d, eax
		and	eax, 0xff000000
		and	ecx, 0xff0000
		and	edx, 0xff00
		and	r8d, 0xff
		shr	eax, 24
		shr	ecx, 16
		shr	edx, 8
		movzx	r9d, byte [rax+aes$Se]		; 3
		movzx	r10d, byte [rcx+aes$Se]		; 2
		movzx	r11d, byte [rdx+aes$Se]		; 1
		shl	r9d, 24
		shl	r10d, 16
		mov	ecx, [rdi+rk*4+16+aes_roundkeys_ofs]	; rk[4]
		mov	edx, [rdi+rk*4+20+aes_roundkeys_ofs]	; rk[5]
		shl	r11d, 8
		movzx	eax, byte [r8+aes$Se]		; 0
		xor	r9d, r10d
		xor	r9d, r11d
		xor	r9d, eax			; S-boxed temp, no rotation this time
		xor	ecx, r9d			; 4 ^ SubWord(11)
		xor	edx, ecx			; 5 ^ 12
		mov	r8d, [rdi+rk*4+24+aes_roundkeys_ofs]	; rk[6]
		mov	r9d, [rdi+rk*4+28+aes_roundkeys_ofs]	; rk[7]
		xor	r8d, edx			; 6 ^ 13
		xor	r9d, r8d			; 7 ^ 14
		mov	[rdi+rk*4+48+aes_roundkeys_ofs], ecx	; rk[12] =
		mov	[rdi+rk*4+52+aes_roundkeys_ofs], edx	; rk[13] =
		mov	[rdi+rk*4+56+aes_roundkeys_ofs], r8d	; rk[14] =
		mov	[rdi+rk*4+60+aes_roundkeys_ofs], r9d	; rk[15] =
	end if
}
calign
.aes128:
	; software AES128: copy the 4 key words into rk[0..3] as big-endian values, then expand to 44 words
	if use_movbe
		movbe	eax, [rsi]
		movbe	ecx, [rsi+4]
		movbe	edx, [rsi+8]
		mov	[rdi+aes_roundkeys_ofs], eax
		mov	[rdi+aes_roundkeys_ofs+4], ecx
		mov	[rdi+aes_roundkeys_ofs+8], edx
		movbe	eax, [rsi+12]
		mov	[rdi+aes_roundkeys_ofs+12], eax
	else
		mov	eax, [rsi]
		bswap	eax
		mov	ecx, [rsi+4]
		bswap	ecx
		mov	edx, [rsi+8]
		bswap	edx
		mov	[rdi+aes_roundkeys_ofs], eax
		mov	[rdi+aes_roundkeys_ofs+4], ecx
		mov	[rdi+aes_roundkeys_ofs+8], edx
		mov	eax, [rsi+12]
		bswap	eax
		mov	[rdi+aes_roundkeys_ofs+12], eax
	end if
	; ten expansion steps, each starting 4 words further along (keylen/4 == 4)
	aes_keygen	4, 0, 0x01000000
	aes_keygen	4, 4, 0x02000000
	aes_keygen	4, 8, 0x04000000
	aes_keygen	4, 12, 0x08000000
	aes_keygen	4, 16, 0x10000000
	aes_keygen	4, 20, 0x20000000
	aes_keygen	4, 24, 0x40000000
	aes_keygen	4, 28, 0x80000000
	aes_keygen	4, 32, 0x1B000000
	aes_keygen	4, 36, 0x36000000
	epilog
calign
.aes192:
	; software AES192: copy the 6 key words into rk[0..5] as big-endian values, then expand to 52 words
	if use_movbe
		movbe	eax, [rsi]
		movbe	ecx, [rsi+4]
		movbe	edx, [rsi+8]
		mov	[rdi+aes_roundkeys_ofs], eax
		mov	[rdi+aes_roundkeys_ofs+4], ecx
		mov	[rdi+aes_roundkeys_ofs+8], edx
		movbe	eax, [rsi+12]
		movbe	ecx, [rsi+16]
		movbe	edx, [rsi+20]
		mov	[rdi+aes_roundkeys_ofs+12], eax
		mov	[rdi+aes_roundkeys_ofs+16], ecx
		mov	[rdi+aes_roundkeys_ofs+20], edx
	else
		mov	eax, [rsi]
		bswap	eax
		mov	ecx, [rsi+4]
		bswap	ecx
		mov	edx, [rsi+8]
		bswap	edx
		mov	[rdi+aes_roundkeys_ofs], eax
		mov	[rdi+aes_roundkeys_ofs+4], ecx
		mov	[rdi+aes_roundkeys_ofs+8], edx
		mov	eax, [rsi+12]
		bswap	eax
		mov	ecx, [rsi+16]
		bswap	ecx
		mov	edx, [rsi+20]
		bswap	edx
		mov	[rdi+aes_roundkeys_ofs+12], eax
		mov	[rdi+aes_roundkeys_ofs+16], ecx
		mov	[rdi+aes_roundkeys_ofs+20], edx
	end if
	; eight expansion steps, each starting 6 words further along (keylen/4 == 6)
	aes_keygen	6, 0, 0x01000000
	aes_keygen	6, 6, 0x02000000
	aes_keygen	6, 12, 0x04000000
	aes_keygen	6, 18, 0x08000000
	aes_keygen	6, 24, 0x10000000
	aes_keygen	6, 30, 0x20000000
	aes_keygen	6, 36, 0x40000000
	aes_keygen	6, 42, 0x80000000	; ofs + keylen/4 + 4 == 52 == end, so this step writes only 4 words
	epilog
calign
.aes256:
	; software AES256: copy the 8 key words into rk[0..7] as big-endian values, then expand to 60 words
	if use_movbe
		movbe	eax, [rsi]
		movbe	ecx, [rsi+4]
		movbe	edx, [rsi+8]
		mov	[rdi+aes_roundkeys_ofs], eax
		mov	[rdi+aes_roundkeys_ofs+4], ecx
		mov	[rdi+aes_roundkeys_ofs+8], edx
		movbe	eax, [rsi+12]
		movbe	ecx, [rsi+16]
		movbe	edx, [rsi+20]
		mov	[rdi+aes_roundkeys_ofs+12], eax
		mov	[rdi+aes_roundkeys_ofs+16], ecx
		mov	[rdi+aes_roundkeys_ofs+20], edx
		movbe	eax, [rsi+24]
		movbe	ecx, [rsi+28]
		mov	[rdi+aes_roundkeys_ofs+24], eax
		mov	[rdi+aes_roundkeys_ofs+28], ecx
	else
		mov	eax, [rsi]
		bswap	eax
		mov	ecx, [rsi+4]
		bswap	ecx
		mov	edx, [rsi+8]
		bswap	edx
		mov	[rdi+aes_roundkeys_ofs], eax
		mov	[rdi+aes_roundkeys_ofs+4], ecx
		mov	[rdi+aes_roundkeys_ofs+8], edx
		mov	eax, [rsi+12]
		bswap	eax
		mov	ecx, [rsi+16]
		bswap	ecx
		mov	edx, [rsi+20]
		bswap	edx
		mov	[rdi+aes_roundkeys_ofs+12], eax
		mov	[rdi+aes_roundkeys_ofs+16], ecx
		mov	[rdi+aes_roundkeys_ofs+20], edx
		mov	eax, [rsi+24]
		bswap	eax
		mov	ecx, [rsi+28]
		bswap	ecx
		mov	[rdi+aes_roundkeys_ofs+24], eax
		mov	[rdi+aes_roundkeys_ofs+28], ecx
	end if
	; seven expansion steps, each starting 8 words further along (keylen/4 == 8)
	aes_keygen	8, 0, 0x01000000
	aes_keygen	8, 8, 0x02000000
	aes_keygen	8, 16, 0x04000000
	aes_keygen	8, 24, 0x08000000
	aes_keygen	8, 32, 0x10000000
	aes_keygen	8, 40, 0x20000000
	aes_keygen	8, 48, 0x40000000	; ofs + keylen/4 + 4 == 60 == end, so this step writes only 4 words
	epilog
; aesni_keygen kl, ofs, rcon: one unrolled step of the AESNI-flavoured key expansion (round keys kept in native byte order)
;   kl   == key length in 32 bit words (4, 6, or 8)
;   ofs  == word index, within the round key array, of the window this step starts from
;   rcon == round constant for this step, in the low byte of the word
; expects rdi == aes object and dword 3 of xmm0 == the last round key word produced so far
; clobbers eax, ecx, edx, r8d, r9d (plus r10d, r11d for kl == 6), xmm1, flags; refreshes dword 3 of xmm0 for the next step
macro aesni_keygen kl*, ofs*, rcon* {
	local	w
	w = aes_roundkeys_ofs + ofs*4			; byte offset of this step's first word within the object
	aeskeygenassist	xmm1, xmm0, 0
	pextrd	edx, xmm1, 3				; rotated and S-boxed copy of the previous last word
	mov	eax, [rdi+w]
	xor	eax, edx
	xor	eax, rcon
	mov	[rdi+w+kl*4], eax
	mov	ecx, [rdi+w+4]
	xor	ecx, eax
	mov	[rdi+w+kl*4+4], ecx
	mov	r8d, [rdi+w+8]
	xor	r8d, ecx
	mov	[rdi+w+kl*4+8], r8d
	mov	r9d, [rdi+w+12]
	xor	r9d, r8d
	mov	[rdi+w+kl*4+12], r9d
	if kl = 4 & ofs < 36
		pinsrd	xmm0, r9d, 3			; newest word feeds the next step
	else if kl = 6 & ofs < 42
		; AES192: two more words chained off the four just written
		mov	r10d, [rdi+w+16]
		xor	r10d, r9d
		mov	[rdi+w+40], r10d
		mov	r11d, [rdi+w+20]
		xor	r11d, r10d
		mov	[rdi+w+44], r11d
		pinsrd	xmm0, r11d, 3
	else if kl = 8 & ofs < 48
		; AES256: second half of the step uses the S-boxed (not rotated) copy of the word just written
		pinsrd	xmm0, r9d, 3
		mov	eax, [rdi+w+16]
		mov	ecx, [rdi+w+20]
		mov	r8d, [rdi+w+24]
		mov	r9d, [rdi+w+28]
		aeskeygenassist	xmm1, xmm0, 0
		pextrd	edx, xmm1, 2
		xor	eax, edx
		mov	[rdi+w+48], eax
		xor	ecx, eax
		mov	[rdi+w+52], ecx
		xor	r8d, ecx
		mov	[rdi+w+56], r8d
		xor	r9d, r8d
		mov	[rdi+w+60], r9d
		pinsrd	xmm0, r9d, 3
	end if
}
calign
.aesni128:
	; AESNI AES128: the raw key is round key 0; xmm0 keeps it so that dword 3 seeds the first step
	movdqu	xmm0, [rsi]
	movdqu	[rdi+aes_roundkeys_ofs], xmm0
	; ten expansion steps, round constants 0x01 through 0x36
	aesni_keygen	4, 0, 0x01
	aesni_keygen	4, 4, 0x02
	aesni_keygen	4, 8, 0x04
	aesni_keygen	4, 12, 0x08
	aesni_keygen	4, 16, 0x10
	aesni_keygen	4, 20, 0x20
	aesni_keygen	4, 24, 0x40
	aesni_keygen	4, 28, 0x80
	aesni_keygen	4, 32, 0x1B
	aesni_keygen	4, 36, 0x36
	epilog
calign
.aesni192:
	; AESNI AES192: copy all 24 key bytes; xmm0 holds key bytes 8..23 so that dword 3 is the sixth key word
	movdqu	xmm0, [rsi+8]
	mov	rax, [rsi]
	movdqu	[rdi+aes_roundkeys_ofs+8], xmm0
	mov	[rdi+aes_roundkeys_ofs], rax
	; eight expansion steps, the last one writing only 4 words
	aesni_keygen	6, 0, 0x01
	aesni_keygen	6, 6, 0x02
	aesni_keygen	6, 12, 0x04
	aesni_keygen	6, 18, 0x08
	aesni_keygen	6, 24, 0x10
	aesni_keygen	6, 30, 0x20
	aesni_keygen	6, 36, 0x40
	aesni_keygen	6, 42, 0x80
	epilog
calign
.aesni256:
	; AESNI AES256: copy all 32 key bytes; xmm0 holds the upper half so that dword 3 is the eighth key word
	; (xmm1 is only a temporary here, the keygen steps overwrite it)
	movdqu	xmm0, [rsi+16]
	movdqu	xmm1, [rsi]
	movdqu	[rdi+aes_roundkeys_ofs+16], xmm0
	movdqu	[rdi+aes_roundkeys_ofs], xmm1
	; seven expansion steps, the last one writing only 4 words
	aesni_keygen	8, 0, 0x01
	aesni_keygen	8, 8, 0x02
	aesni_keygen	8, 16, 0x04
	aesni_keygen	8, 24, 0x08
	aesni_keygen	8, 32, 0x10
	aesni_keygen	8, 40, 0x20
	aesni_keygen	8, 48, 0x40
	epilog
end if
if used aes$init_encrypt | defined include_everything
	; aes$init_encrypt: prepare an aes object for encryption
	; in: rdi == aes object, rsi == ptr to key, edx == length of same in bytes (16, 24, or 32)
	; the key length is not validated here, that is up to the caller
	; rdi is rounded up to the next 16 byte boundary before anything is stored through it
falign
aes$init_encrypt:
	prolog	aes$init_encrypt
	add	rdi, 0xf
	and	rdi, not 0xf			; force align rdi to 16
	call	aes$init_common			; rdi stays intact across this call
	cmp	dword [has_AESNI], 1
	jne	.noaesni
	; AESNI present: the expanded key is used as-is, all done
	epilog
calign
.noaesni:
	; software path: byteswap the first and the last round key in place (4 words each)
	mov	eax, [rdi+aes_rounds_ofs]	; number of rounds
	shl	eax, 4				; rounds * 16 == byte offset of the last round key
	if use_movbe
		; first round key
		mov	ecx, [rdi+aes_roundkeys_ofs]
		movbe	[rdi+aes_roundkeys_ofs], ecx
		mov	edx, [rdi+aes_roundkeys_ofs+4]
		movbe	[rdi+aes_roundkeys_ofs+4], edx
		mov	ecx, [rdi+aes_roundkeys_ofs+8]
		movbe	[rdi+aes_roundkeys_ofs+8], ecx
		mov	edx, [rdi+aes_roundkeys_ofs+12]
		movbe	[rdi+aes_roundkeys_ofs+12], edx
		; last round key
		mov	r8d, [rdi+rax+aes_roundkeys_ofs]
		movbe	[rdi+rax+aes_roundkeys_ofs], r8d
		mov	ecx, [rdi+rax+aes_roundkeys_ofs+4]
		movbe	[rdi+rax+aes_roundkeys_ofs+4], ecx
		mov	edx, [rdi+rax+aes_roundkeys_ofs+8]
		movbe	[rdi+rax+aes_roundkeys_ofs+8], edx
		mov	r8d, [rdi+rax+aes_roundkeys_ofs+12]
		movbe	[rdi+rax+aes_roundkeys_ofs+12], r8d
	else
		; first round key
		mov	ecx, [rdi+aes_roundkeys_ofs]
		bswap	ecx
		mov	[rdi+aes_roundkeys_ofs], ecx
		mov	edx, [rdi+aes_roundkeys_ofs+4]
		bswap	edx
		mov	[rdi+aes_roundkeys_ofs+4], edx
		mov	ecx, [rdi+aes_roundkeys_ofs+8]
		bswap	ecx
		mov	[rdi+aes_roundkeys_ofs+8], ecx
		mov	edx, [rdi+aes_roundkeys_ofs+12]
		bswap	edx
		mov	[rdi+aes_roundkeys_ofs+12], edx
		; last round key
		mov	r8d, [rdi+rax+aes_roundkeys_ofs]
		bswap	r8d
		mov	[rdi+rax+aes_roundkeys_ofs], r8d
		mov	ecx, [rdi+rax+aes_roundkeys_ofs+4]
		bswap	ecx
		mov	[rdi+rax+aes_roundkeys_ofs+4], ecx
		mov	edx, [rdi+rax+aes_roundkeys_ofs+8]
		bswap	edx
		mov	[rdi+rax+aes_roundkeys_ofs+8], edx
		mov	r8d, [rdi+rax+aes_roundkeys_ofs+12]
		bswap	r8d
		mov	[rdi+rax+aes_roundkeys_ofs+12], r8d
	end if
	epilog
end if
if used aes$init_decrypt | defined include_everything
; three arguments: rdi == aes object, rsi == ptr to key, edx == length of same (16, 24, or 32)
; no actual bounds checking on key length validity is done, up to the caller
; NOTE: unrolling these certainly causes a fair bit of bloat, but there are no conditional branches in here
falign
aes$init_decrypt:
prolog aes$init_decrypt
; force align rdi to 16
add rdi, 0xf
and rdi, not 0xf
call aes$init_common
mov eax, [rdi+aes_loopidx_ofs]
jmp qword [rax+.dispatch]
dalign
.dispatch:
dq .aes128, .aes192, .aes256, .aesni128, .aesni192, .aesni256
macro inverseinner i*, j* {
mov eax, [rdi+i*4+aes_roundkeys_ofs]
mov ecx, eax
mov edx, eax
mov r8d, eax
and eax, 0xff000000
and ecx, 0xff0000
and edx, 0xff00
and r8d, 0xff
shr eax, 24
shr ecx, 16
shr edx, 8
movzx r9d, byte [rax+aes$Se] ; 3
movzx r10d, byte [rcx+aes$Se] ; 2
movzx r11d, byte [rdx+aes$Se] ; 1
movzx eax, byte [r8+aes$Se] ; 0
mov esi, dword [r9*8+aes$Td + (3 mod 4) + 1]
xor esi, dword [r10*8+aes$Td + (4 mod 4) + 1]
xor esi, dword [r11*8+aes$Td + (5 mod 4) + 1]
xor esi, dword [rax*8+aes$Td + (6 mod 4) + 1]
; esi now has the i side, do the j side next
mov eax, [rdi+j*4+aes_roundkeys_ofs]
mov ecx, eax
mov edx, eax
mov r8d, eax
and eax, 0xff000000
and ecx, 0xff0000
and edx, 0xff00
and r8d, 0xff
shr eax, 24
shr ecx, 16
shr edx, 8
movzx r9d, byte [rax+aes$Se] ; 3
movzx r10d, byte [rcx+aes$Se] ; 2
movzx r11d, byte [rdx+aes$Se] ; 1
movzx eax, byte [r8+aes$Se] ; 0
mov ecx, dword [r9*8+aes$Td + (3 mod 4) + 1]
xor ecx, dword [r10*8+aes$Td + (4 mod 4) + 1]
xor ecx, dword [r11*8+aes$Td + (5 mod 4) + 1]
xor ecx, dword [rax*8+aes$Td + (6 mod 4) + 1]
; now ecx has the j side, put them back
mov [rdi+j*4+aes_roundkeys_ofs], esi
mov [rdi+i*4+aes_roundkeys_ofs], ecx
}
macro inverseouter i* {
mov eax, [rdi+i*4+aes_roundkeys_ofs]
mov ecx, eax
mov edx, eax
mov r8d, eax
and eax, 0xff000000
and ecx, 0xff0000
and edx, 0xff00
and r8d, 0xff
shr eax, 24
shr ecx, 16
shr edx, 8
movzx r9d, byte [rax+aes$Se] ; 3
movzx r10d, byte [rcx+aes$Se] ; 2
movzx r11d, byte [rdx+aes$Se] ; 1
movzx eax, byte [r8+aes$Se] ; 0
mov ecx, dword [r9*8+aes$Td + (3 mod 4) + 1]
xor ecx, dword [r10*8+aes$Td + (4 mod 4) + 1]
xor ecx, dword [r11*8+aes$Td + (5 mod 4) + 1]
xor ecx, dword [rax*8+aes$Td + (6 mod 4) + 1]
mov [rdi+i*4+aes_roundkeys_ofs], ecx
}
macro doswap l* {
	; Exchange the first round key (words 0..3) with the round key that starts
	; at word index l, reversing the byte order of every 32 bit word as it is
	; moved.  After the macro:
	;   key[l+n] = bswap(old key[n])   and   key[n] = bswap(old key[l+n]),  n = 0..3
	; In:    rdi = base that aes_roundkeys_ofs is relative to; l must be >= 4
	;        so the two 16 byte regions do not overlap
	; Clobb: eax, ecx, edx, r8d (the bswap variant also leaves them byte swapped)
	;
	; Both variants must read a word from each side BEFORE either side is
	; written: once slot l+n has been stored it holds the swapped first-key
	; word, and reading it back would write the first key onto itself.
	if use_movbe
		; words 0 and 1
		mov	eax, [rdi+aes_roundkeys_ofs]
		mov	ecx, [rdi+aes_roundkeys_ofs+4]
		mov	edx, [rdi+l*4+aes_roundkeys_ofs]
		mov	r8d, [rdi+l*4+aes_roundkeys_ofs+4]
		movbe	[rdi+l*4+aes_roundkeys_ofs], eax
		movbe	[rdi+l*4+aes_roundkeys_ofs+4], ecx
		movbe	[rdi+aes_roundkeys_ofs], edx
		movbe	[rdi+aes_roundkeys_ofs+4], r8d
		; words 2 and 3
		mov	eax, [rdi+aes_roundkeys_ofs+8]
		mov	ecx, [rdi+aes_roundkeys_ofs+12]
		mov	edx, [rdi+l*4+aes_roundkeys_ofs+8]
		mov	r8d, [rdi+l*4+aes_roundkeys_ofs+12]
		movbe	[rdi+l*4+aes_roundkeys_ofs+8], eax
		movbe	[rdi+l*4+aes_roundkeys_ofs+12], ecx
		movbe	[rdi+aes_roundkeys_ofs+8], edx
		movbe	[rdi+aes_roundkeys_ofs+12], r8d
	else
		; words 0 and 1
		mov	eax, [rdi+aes_roundkeys_ofs]
		mov	ecx, [rdi+aes_roundkeys_ofs+4]
		bswap	eax
		bswap	ecx
		mov	edx, [rdi+l*4+aes_roundkeys_ofs]
		mov	r8d, [rdi+l*4+aes_roundkeys_ofs+4]
		bswap	edx
		bswap	r8d
		mov	[rdi+l*4+aes_roundkeys_ofs], eax
		mov	[rdi+l*4+aes_roundkeys_ofs+4], ecx
		; words 2 and 3 (loads of the first key interleaved with the stores above)
		mov	eax, [rdi+aes_roundkeys_ofs+8]
		mov	ecx, [rdi+aes_roundkeys_ofs+12]
		bswap	eax
		bswap	ecx
		mov	[rdi+aes_roundkeys_ofs], edx
		mov	[rdi+aes_roundkeys_ofs+4], r8d
		mov	edx, [rdi+l*4+aes_roundkeys_ofs+8]
		mov	r8d, [rdi+l*4+aes_roundkeys_ofs+12]
		bswap	edx
		bswap	r8d
		mov	[rdi+l*4+aes_roundkeys_ofs+8], eax
		mov	[rdi+l*4+aes_roundkeys_ofs+12], ecx
		mov	[rdi+aes_roundkeys_ofs+8], edx
		mov	[rdi+aes_roundkeys_ofs+12], r8d
	end if
}
calign
.aes128:
inverseinner 4, 36
inverseinner 5, 37
inverseinner 6, 38
inverseinner 7, 39
inverseinner 8, 32
inverseinner 9, 33
inverseinner 10, 34
inverseinner 11, 35
inverseinner 12, 28
inverseinner 13, 29
inverseinner 14, 30
inverseinner 15, 31
inverseinner 16, 24
inverseinner 17, 25
inverseinner 18, 26
inverseinner 19, 27
inverseouter 20
inverseouter 21
inverseouter 22
inverseouter 23
doswap 40
epilog
calign
.aes192:
inverseinner 4, 44
inverseinner 5, 45
inverseinner 6, 46
inverseinner 7, 47
inverseinner 8, 40
inverseinner 9, 41
inverseinner 10, 42
inverseinner 11, 43
inverseinner 12, 36
inverseinner 13, 37
inverseinner 14, 38
inverseinner 15, 39
inverseinner 16, 32
inverseinner 17, 33
inverseinner 18, 34
inverseinner 19, 35
inverseinner 20, 28
inverseinner 21, 29
inverseinner 22, 30
inverseinner 23, 31
inverseouter 24
inverseouter 25
inverseouter 26
inverseouter 27
doswap 48
epilog
calign
.aes256:
inverseinner 4, 52
inverseinner 5, 53
inverseinner 6, 54
inverseinner 7, 55
inverseinner 8, 48
inverseinner 9, 49
inverseinner 10, 50
inverseinner 11, 51
inverseinner 12, 44
inverseinner 13, 45
inverseinner 14, 46
inverseinner 15, 47
inverseinner 16, 40
inverseinner 17, 41
inverseinner 18, 42
inverseinner 19, 43
inverseinner 20, 36
inverseinner 21, 37
inverseinner 22, 38
inverseinner 23, 39
inverseinner 24, 32
inverseinner 25, 33
inverseinner 26, 34
inverseinner 27, 35
inverseouter 28
inverseouter 29
inverseouter 30
inverseouter 31
doswap 56
epilog
macro aesni_inverseinner i*, j* {
	; Run AESIMC over the two 16 byte round keys that start at word indices
	; i and j, and store each result in the other key's slot, so the pair is
	; transformed and exchanged in one step.
	; In:    rdi = base that aes_roundkeys_ofs is relative to
	; Clobb: xmm0, xmm1
	; NOTE(review): the memory-operand form of aesimc presumably needs both
	; slots 16 byte aligned -- confirm rdi is aligned by the enclosing routine.
	aesimc	xmm1, [rdi+j*4+aes_roundkeys_ofs]
	aesimc	xmm0, [rdi+i*4+aes_roundkeys_ofs]
	; both sources are loaded before either slot is written
	movdqu	[rdi+i*4+aes_roundkeys_ofs], xmm1
	movdqu	[rdi+j*4+aes_roundkeys_ofs], xmm0
}
; aesni_inversemiddle: run AESIMC over the 16 byte round key that starts at
; word index i and write the result back to the same slot (no exchange).
; In:    rdi = base that aes_roundkeys_ofs is relative to
; Clobb: xmm0
macro aesni_inversemiddle i* {
aesimc xmm0, [rdi+i*4+aes_roundkeys_ofs]
movdqu [rdi+i*4+aes_roundkeys_ofs], xmm0
}
calign
.aesni128:
; inversemix required, but considerably simpler than the above methods
; swap the first and last key
movdqu xmm0, [rdi+aes_roundkeys_ofs]
movdqu xmm1, [rdi+aes_roundkeys_ofs+160]
movdqu [rdi+aes_roundkeys_ofs+160], xmm0
movdqu [rdi+aes_roundkeys_ofs], xmm1
aesni_inverseinner 4, 36
aesni_inverseinner 8, 32
aesni_inverseinner 12, 28
aesni_inverseinner 16, 24
aesni_inversemiddle 20
epilog
calign
.aesni192:
; swap the first and last key
movdqu xmm0, [rdi+aes_roundkeys_ofs]
movdqu xmm1, [rdi+aes_roundkeys_ofs+192]
movdqu [rdi+aes_roundkeys_ofs+192], xmm0
movdqu [rdi+aes_roundkeys_ofs], xmm1
aesni_inverseinner 4, 44
aesni_inverseinner 8, 40
aesni_inverseinner 12, 36
aesni_inverseinner 16, 32
aesni_inverseinner 20, 28
aesni_inversemiddle 24
epilog
calign
.aesni256:
; swap the first and last key
movdqu xmm0, [rdi+aes_roundkeys_ofs]
movdqu xmm1, [rdi+aes_roundkeys_ofs+224]
movdqu [rdi+aes_roundkeys_ofs+224], xmm0
movdqu [rdi+aes_roundkeys_ofs], xmm1
aesni_inverseinner 4, 52
aesni_inverseinner 8, 48
aesni_inverseinner 12, 44
aesni_inverseinner 16, 40
aesni_inverseinner 20, 36
aesni_inverseinner 24, 32
aesni_inversemiddle 28
epilog
end if
if used aes$encrypt | defined include_everything
; aes$encrypt: encrypt a single 16 byte block in place
; two arguments: rdi == aes object, rsi == ptr to block to encrypt in place
; rdi is rounded up to the next 16 byte boundary before any field is accessed.
; The variant that runs is selected through the .dispatch table using the
; dword stored at aes_loopidx_ofs as a byte offset.
; Table driven path clobbers rax, rcx, rdx, r8-r11 (r12-r15 are saved/restored);
; AESNI path clobbers rax and xmm0.
falign
aes$encrypt:
prolog aes$encrypt
; force align rdi to 16
add rdi, 0xf
and rdi, not 0xf
; with AESNI none of the table setup below is needed, go straight to the dispatch
cmp dword [has_AESNI], 1
je .aesni
push r12 r13 r14 r15
; r8d..r11d = the four 32 bit words of the input block
mov r8d, [rsi]
mov r9d, [rsi+4]
mov r10d, [rsi+8]
xor eax, eax
xor r12d, r12d
mov rdx, aes$Te
; ecx = stride for the table walk below
; NOTE(review): presumably the cache line size in bytes -- a value of zero would
; make the loop below spin forever; confirm where cpu_L1_size is initialised
mov ecx, [cpu_L1_size]
mov r11d, [rsi+12]
; preload the Te table into the L1 cache per the exhaustive commentary on cache timing
; though per my notes atop, my use cases for this library don't seem to be vulnerable
; (walks the first 2048 bytes of aes$Te, then touches its last qword at offset 2040;
; eax starts at zero so the and-accumulated value stays zero and is overwritten by
; the first movzx below -- only the memory reads matter)
calign
.timingcountermeasure:
and eax, dword [rdx+r12]
add r12d, ecx
cmp r12d, 2048
jb .timingcountermeasure
and rax, qword [rdx+2040]
; initial whitening: block words xor round key words 0..3
xor r8d, [rdi+aes_roundkeys_ofs]
xor r9d, [rdi+aes_roundkeys_ofs+4]
xor r10d, [rdi+aes_roundkeys_ofs+8]
xor r11d, [rdi+aes_roundkeys_ofs+12]
; r12d..r15d start out as round key words 4..7 and accumulate the first round
mov r12d, [rdi+aes_roundkeys_ofs+16]
mov r13d, [rdi+aes_roundkeys_ofs+20]
mov r14d, [rdi+aes_roundkeys_ofs+24]
mov r15d, [rdi+aes_roundkeys_ofs+28]
; quarter_round_fe: first-round quarter step.
; t is a 32 bit state word register (destroyed), tb its low byte alias.
; The bytes of t, lowest first, index aes$Te (8 bytes per entry); the dword at
; entry byte offset 4, 1, 2, 3 is xored into a, b, c, d respectively.
; Note the destination parameters are declared in the order d, c, b, a.
; Clobbers rax and t.
macro quarter_round_fe t*, tb*, d*, c*, b*, a* {
movzx eax, tb
shr t, 8
xor a, [aes$Te+rax*8+ ((6-3) mod 4) + 1]
movzx eax, tb
shr t, 8
xor b, [aes$Te+rax*8+ ((6-2) mod 4) + 1]
movzx eax, tb
shr t, 8
xor c, [aes$Te+rax*8+ ((6-1) mod 4) + 1]
xor d, [aes$Te+t*8+ ((6-0) mod 4) + 1]
}
; quarter_round_e: same shape as quarter_round_fe for the remaining full rounds,
; but the dword is taken at entry byte offset 3, 2, 1, 4 for a, b, c, d.
; Clobbers rax and t.
macro quarter_round_e t*, tb*, a*, b*, c*, d* {
movzx eax, tb
shr t, 8
xor a, [aes$Te+rax*8+ ((3+3) mod 4) + 1]
movzx eax, tb
shr t, 8
xor b, [aes$Te+rax*8+ ((2+3) mod 4) + 1]
movzx eax, tb
shr t, 8
xor c, [aes$Te+rax*8+ ((1+3) mod 4) + 1]
xor d, [aes$Te+t*8+ ((0+3) mod 4) + 1]
}
; quarter_round_le: last-round quarter step.
; a, b, c, d are byte offsets into the block at rsi.  For each byte of t,
; lowest first, the single byte at offset 1 of the matching aes$Te entry is
; stored to [rsi+a], [rsi+b], [rsi+c], [rsi+d].
; Clobbers rax, rcx, rdx and t.
macro quarter_round_le t*, tb*, a*, b*, c*, d* {
movzx eax, tb
shr t, 8
movzx ecx, byte [aes$Te+rax*8+1]
movzx eax, tb
shr t, 8
movzx edx, byte [aes$Te+rax*8+1]
movzx eax, tb
shr t, 8
mov byte [rsi+a], cl
mov byte [rsi+b], dl
movzx ecx, byte [aes$Te+rax*8+1]
movzx edx, byte [aes$Te+t*8+1]
mov byte [rsi+c], cl
mov byte [rsi+d], dl
}
; aesenc_round: two table passes per invocation; r is the word index of the
; first of eight consecutive round key words.
; pass 1: r8d..r11d = key words r..r+3, state in r12d..r15d is folded into them
; pass 2: r12d..r15d = key words r+4..r+7, state in r8d..r11d is folded into them
; The destination registers rotate by one position on each quarter step.
macro aesenc_round r* {
mov r8d, [rdi+r*4+aes_roundkeys_ofs]
mov r9d, [rdi+r*4+aes_roundkeys_ofs+4]
mov r10d, [rdi+r*4+aes_roundkeys_ofs+8]
mov r11d, [rdi+r*4+aes_roundkeys_ofs+12]
quarter_round_e r15d, r15b, r8d, r9d, r10d, r11d
quarter_round_e r14d, r14b, r11d, r8d, r9d, r10d
quarter_round_e r13d, r13b, r10d, r11d, r8d, r9d
quarter_round_e r12d, r12b, r9d, r10d, r11d, r8d
mov r12d, [rdi+r*4+aes_roundkeys_ofs+16]
mov r13d, [rdi+r*4+aes_roundkeys_ofs+20]
mov r14d, [rdi+r*4+aes_roundkeys_ofs+24]
mov r15d, [rdi+r*4+aes_roundkeys_ofs+28]
quarter_round_e r11d, r11b, r12d, r13d, r14d, r15d
quarter_round_e r10d, r10b, r15d, r12d, r13d, r14d
quarter_round_e r9d, r9b, r14d, r15d, r12d, r13d
quarter_round_e r8d, r8b, r13d, r14d, r15d, r12d
}
; aesenc_xor_roundkeys: xor the 16 byte block at rsi with the four round key
; words starting at word index r.  Clobbers eax, ecx, edx, r8d.
macro aesenc_xor_roundkeys r* {
; we have to xor [rsi] with round keys (16 bytes only)
mov eax, [rdi+r*4+aes_roundkeys_ofs]
mov ecx, [rdi+r*4+aes_roundkeys_ofs+4]
mov edx, [rdi+r*4+aes_roundkeys_ofs+8]
mov r8d, [rdi+r*4+aes_roundkeys_ofs+12]
xor [rsi], eax
xor [rsi+4], ecx
xor [rsi+8], edx
xor [rsi+12], r8d
}
; first round: fold the whitened block (r8d..r11d) into r12d..r15d, common to all key sizes
quarter_round_fe r11d, r11b, r12d, r13d, r14d, r15d
quarter_round_fe r10d, r10b, r15d, r12d, r13d, r14d
quarter_round_fe r9d, r9b, r14d, r15d, r12d, r13d
quarter_round_fe r8d, r8b, r13d, r14d, r15d, r12d
calign
.aesni: ; jumps here if AESNI to skip all the non AESNI goodies
; the stored loop index is used directly as a byte offset into the table below
; NOTE(review): entries 3..5 (byte offsets 24/32/40) are the AESNI variants, so
; key setup presumably stores such an offset when has_AESNI is set -- confirm,
; since the table driven entries pop r12-r15 that the AESNI jump never pushed
mov eax, [rdi+aes_loopidx_ofs]
jmp qword [rax+.dispatch]
dalign
.dispatch:
dq .aes128, .aes192, .aes256, .aesni128, .aesni192, .aesni256
calign
.aes128:
; 4 unrolls
aesenc_round 8
aesenc_round 16
aesenc_round 24
aesenc_round 32
; last round: scatter the table bytes into the block at rsi, then xor key words 40..43
quarter_round_le r14d, r14b, 15, 2, 5, 8
quarter_round_le r13d, r13b, 11, 14, 1, 4
quarter_round_le r12d, r12b, 7, 10, 13, 0
quarter_round_le r15d, r15b, 3, 6, 9, 12
aesenc_xor_roundkeys 40
pop r15 r14 r13 r12
epilog
calign
.aes192:
; 5 unrolls
aesenc_round 8
aesenc_round 16
aesenc_round 24
aesenc_round 32
aesenc_round 40
; last round: scatter the table bytes into the block at rsi, then xor key words 48..51
quarter_round_le r14d, r14b, 15, 2, 5, 8
quarter_round_le r13d, r13b, 11, 14, 1, 4
quarter_round_le r12d, r12b, 7, 10, 13, 0
quarter_round_le r15d, r15b, 3, 6, 9, 12
aesenc_xor_roundkeys 48
pop r15 r14 r13 r12
epilog
calign
.aes256:
; 6 unrolls
aesenc_round 8
aesenc_round 16
aesenc_round 24
aesenc_round 32
aesenc_round 40
aesenc_round 48
; last round: scatter the table bytes into the block at rsi, then xor key words 56..59
quarter_round_le r14d, r14b, 15, 2, 5, 8
quarter_round_le r13d, r13b, 11, 14, 1, 4
quarter_round_le r12d, r12b, 7, 10, 13, 0
quarter_round_le r15d, r15b, 3, 6, 9, 12
aesenc_xor_roundkeys 56
pop r15 r14 r13 r12
epilog
; AESNI helpers: here i is a round key index (16 bytes each), not a word index
macro aesni_enc i* {
aesenc xmm0, [rdi+i*16+aes_roundkeys_ofs]
}
macro aesni_enclast i* {
aesenclast xmm0, [rdi+i*16+aes_roundkeys_ofs]
}
calign
.aesni128:
movdqu xmm0, [rsi] ; load up our block
pxor xmm0, [rdi+aes_roundkeys_ofs]
; % is the repeat counter, so this applies round keys 1..9, then key 10 as the last round
repeat 9
aesni_enc %
end repeat
aesni_enclast 10
movdqu [rsi], xmm0 ; store the goods
epilog
calign
.aesni192:
movdqu xmm0, [rsi] ; load up our block
pxor xmm0, [rdi+aes_roundkeys_ofs]
; round keys 1..11, then key 12 as the last round
repeat 11
aesni_enc %
end repeat
aesni_enclast 12
movdqu [rsi], xmm0 ; store the goods
epilog
calign
.aesni256:
movdqu xmm0, [rsi] ; load up our block
pxor xmm0, [rdi+aes_roundkeys_ofs]
; round keys 1..13, then key 14 as the last round
repeat 13
aesni_enc %
end repeat
aesni_enclast 14
movdqu [rsi], xmm0 ; store the goods
epilog
end if
if used aes$decrypt | defined include_everything
; aes$decrypt: decrypt a single 16 byte block in place
; two arguments: rdi == aes object, rsi == ptr to block to decrypt in place
; rdi is rounded up to the next 16 byte boundary before any field is accessed.
; The variant that runs is selected through the .dispatch table using the
; dword stored at aes_loopidx_ofs as a byte offset.
; NOTE(review): the round keys are consumed front to back exactly as in the
; encrypt routine, so the object presumably has to hold a key schedule already
; prepared for decryption -- confirm against the key setup routines.
; Table driven path clobbers rax, rcx, rdx, r8-r11 (r12-r15 are saved/restored);
; AESNI path clobbers rax and xmm0.
falign
aes$decrypt:
prolog aes$decrypt
; force align rdi to 16
add rdi, 0xf
and rdi, not 0xf
; with AESNI none of the table setup below is needed, go straight to the dispatch
cmp dword [has_AESNI], 1
je .aesni
push r12 r13 r14 r15
; r8d..r11d = the four 32 bit words of the input block
mov r8d, [rsi]
mov r9d, [rsi+4]
mov r10d, [rsi+8]
xor eax, eax
xor r12d, r12d
mov rdx, aes$Td
; ecx = stride for the table walk below
; NOTE(review): presumably the cache line size in bytes -- a value of zero would
; make the loop below spin forever; confirm where cpu_L1_size is initialised
mov ecx, [cpu_L1_size]
mov r11d, [rsi+12]
; preload the Td table into the L1 cache per the exhaustive commentary on cache timing
; though per my notes atop, my use cases for this library don't seem to be vulnerable
; (walks the first 2048 bytes of aes$Td, then touches its last qword at offset 2040;
; eax starts at zero so the and-accumulated value stays zero and is overwritten by
; the first movzx below -- only the memory reads matter)
calign
.timingcountermeasure:
and eax, dword [rdx+r12]
add r12d, ecx
cmp r12d, 2048
jb .timingcountermeasure
and rax, qword [rdx+2040]
; initial whitening: block words xor round key words 0..3
xor r8d, [rdi+aes_roundkeys_ofs]
xor r9d, [rdi+aes_roundkeys_ofs+4]
xor r10d, [rdi+aes_roundkeys_ofs+8]
xor r11d, [rdi+aes_roundkeys_ofs+12]
; r12d..r15d start out as round key words 4..7 and accumulate the first round
mov r12d, [rdi+aes_roundkeys_ofs+16]
mov r13d, [rdi+aes_roundkeys_ofs+20]
mov r14d, [rdi+aes_roundkeys_ofs+24]
mov r15d, [rdi+aes_roundkeys_ofs+28]
; quarter_round_fd: first-round quarter step.
; t is a 32 bit state word register (destroyed), tb its low byte alias.
; The bytes of t, lowest first, index aes$Td (8 bytes per entry); the dword at
; entry byte offset 4, 1, 2, 3 is xored into a, b, c, d respectively.
; Note the destination parameters are declared in the order d, c, b, a.
; Clobbers rax and t.
macro quarter_round_fd t*, tb*, d*, c*, b*, a* {
movzx eax, tb
shr t, 8
xor a, [aes$Td+rax*8+ ((6-3) mod 4) + 1]
movzx eax, tb
shr t, 8
xor b, [aes$Td+rax*8+ ((6-2) mod 4) + 1]
movzx eax, tb
shr t, 8
xor c, [aes$Td+rax*8+ ((6-1) mod 4) + 1]
xor d, [aes$Td+t*8+ ((6-0) mod 4) + 1]
}
; quarter_round_d: same shape as quarter_round_fd for the remaining full rounds,
; but the dword is taken at entry byte offset 3, 2, 1, 4 for a, b, c, d.
; Clobbers rax and t.
macro quarter_round_d t*, tb*, a*, b*, c*, d* {
movzx eax, tb
shr t, 8
xor a, [aes$Td+rax*8+ ((3+3) mod 4) + 1]
movzx eax, tb
shr t, 8
xor b, [aes$Td+rax*8+ ((2+3) mod 4) + 1]
movzx eax, tb
shr t, 8
xor c, [aes$Td+rax*8+ ((1+3) mod 4) + 1]
xor d, [aes$Td+t*8+ ((0+3) mod 4) + 1]
}
; quarter_round_ld: last-round quarter step.
; a, b, c, d are byte offsets into the block at rsi.  For each byte of t,
; lowest first, the single byte at offset 0 of the matching aes$Td entry is
; stored to [rsi+a], [rsi+b], [rsi+c], [rsi+d].
; Clobbers rax, rcx, rdx and t.
macro quarter_round_ld t*, tb*, a*, b*, c*, d* {
movzx eax, tb
shr t, 8
movzx ecx, byte [aes$Td+rax*8]
movzx eax, tb
shr t, 8
movzx edx, byte [aes$Td+rax*8]
movzx eax, tb
shr t, 8
mov byte [rsi+a], cl
mov byte [rsi+b], dl
movzx ecx, byte [aes$Td+rax*8]
movzx edx, byte [aes$Td+t*8]
mov byte [rsi+c], cl
mov byte [rsi+d], dl
}
; aesdec_round: two table passes per invocation; r is the word index of the
; first of eight consecutive round key words.
; pass 1: r8d..r11d = key words r..r+3, state in r12d..r15d is folded into them
; pass 2: r12d..r15d = key words r+4..r+7, state in r8d..r11d is folded into them
; The destination registers rotate in the opposite direction to the encrypt rounds.
macro aesdec_round r* {
mov r8d, [rdi+r*4+aes_roundkeys_ofs]
mov r9d, [rdi+r*4+aes_roundkeys_ofs+4]
mov r10d, [rdi+r*4+aes_roundkeys_ofs+8]
mov r11d, [rdi+r*4+aes_roundkeys_ofs+12]
quarter_round_d r15d, r15b, r10d, r9d, r8d, r11d
quarter_round_d r14d, r14b, r9d, r8d, r11d, r10d
quarter_round_d r13d, r13b, r8d, r11d, r10d, r9d
quarter_round_d r12d, r12b, r11d, r10d, r9d, r8d
mov r12d, [rdi+r*4+aes_roundkeys_ofs+16]
mov r13d, [rdi+r*4+aes_roundkeys_ofs+20]
mov r14d, [rdi+r*4+aes_roundkeys_ofs+24]
mov r15d, [rdi+r*4+aes_roundkeys_ofs+28]
quarter_round_d r11d, r11b, r14d, r13d, r12d, r15d
quarter_round_d r10d, r10b, r13d, r12d, r15d, r14d
quarter_round_d r9d, r9b, r12d, r15d, r14d, r13d
quarter_round_d r8d, r8b, r15d, r14d, r13d, r12d
}
; aesdec_xor_roundkeys: xor the 16 byte block at rsi with the four round key
; words starting at word index r.  Clobbers eax, ecx, edx, r8d.
macro aesdec_xor_roundkeys r* {
; we have to xor [rsi] with round keys (16 bytes only)
mov eax, [rdi+r*4+aes_roundkeys_ofs]
mov ecx, [rdi+r*4+aes_roundkeys_ofs+4]
mov edx, [rdi+r*4+aes_roundkeys_ofs+8]
mov r8d, [rdi+r*4+aes_roundkeys_ofs+12]
xor [rsi], eax
xor [rsi+4], ecx
xor [rsi+8], edx
xor [rsi+12], r8d
}
; first round: fold the whitened block (r8d..r11d) into r12d..r15d, common to all key sizes
quarter_round_fd r11d, r11b, r14d, r13d, r12d, r15d
quarter_round_fd r10d, r10b, r13d, r12d, r15d, r14d
quarter_round_fd r9d, r9b, r12d, r15d, r14d, r13d
quarter_round_fd r8d, r8b, r15d, r14d, r13d, r12d
calign
.aesni: ; jumps here and skips all the unnecessary nonAESNI goods if has_AESNI
; the stored loop index is used directly as a byte offset into the table below
; NOTE(review): entries 3..5 (byte offsets 24/32/40) are the AESNI variants, so
; key setup presumably stores such an offset when has_AESNI is set -- confirm,
; since the table driven entries pop r12-r15 that the AESNI jump never pushed
mov eax, [rdi+aes_loopidx_ofs]
jmp qword [rax+.dispatch]
dalign
.dispatch:
dq .aes128, .aes192, .aes256, .aesni128, .aesni192, .aesni256
calign
.aes128:
; 4 unrolls
aesdec_round 8
aesdec_round 16
aesdec_round 24
aesdec_round 32
; last round: scatter the table bytes into the block at rsi, then xor key words 40..43
quarter_round_ld r14d, r14b, 7, 2, 13, 8
quarter_round_ld r13d, r13b, 3, 14, 9, 4
quarter_round_ld r12d, r12b, 15, 10, 5, 0
quarter_round_ld r15d, r15b, 11, 6, 1, 12
aesdec_xor_roundkeys 40
pop r15 r14 r13 r12
epilog
calign
.aes192:
; 5 unrolls
aesdec_round 8
aesdec_round 16
aesdec_round 24
aesdec_round 32
aesdec_round 40
; last round: scatter the table bytes into the block at rsi, then xor key words 48..51
quarter_round_ld r14d, r14b, 7, 2, 13, 8
quarter_round_ld r13d, r13b, 3, 14, 9, 4
quarter_round_ld r12d, r12b, 15, 10, 5, 0
quarter_round_ld r15d, r15b, 11, 6, 1, 12
aesdec_xor_roundkeys 48
pop r15 r14 r13 r12
epilog
calign
.aes256:
; 6 unrolls
aesdec_round 8
aesdec_round 16
aesdec_round 24
aesdec_round 32
aesdec_round 40
aesdec_round 48
; last round: scatter the table bytes into the block at rsi, then xor key words 56..59
quarter_round_ld r14d, r14b, 7, 2, 13, 8
quarter_round_ld r13d, r13b, 3, 14, 9, 4
quarter_round_ld r12d, r12b, 15, 10, 5, 0
quarter_round_ld r15d, r15b, 11, 6, 1, 12
aesdec_xor_roundkeys 56
pop r15 r14 r13 r12
epilog
; AESNI helpers: here i is a round key index (16 bytes each), not a word index
macro aesni_dec i* {
aesdec xmm0, [rdi+i*16+aes_roundkeys_ofs]
}
macro aesni_declast i* {
aesdeclast xmm0, [rdi+i*16+aes_roundkeys_ofs]
}
calign
.aesni128:
movdqu xmm0, [rsi] ; load up our block
pxor xmm0, [rdi+aes_roundkeys_ofs]
; % is the repeat counter, so this applies round keys 1..9, then key 10 as the last round
repeat 9
aesni_dec %
end repeat
aesni_declast 10
movdqu [rsi], xmm0 ; store the goods
epilog
calign
.aesni192:
movdqu xmm0, [rsi] ; load up our block
pxor xmm0, [rdi+aes_roundkeys_ofs]
; round keys 1..11, then key 12 as the last round
repeat 11
aesni_dec %
end repeat
aesni_declast 12
movdqu [rsi], xmm0 ; store the goods
epilog
calign
.aesni256:
movdqu xmm0, [rsi] ; load up our block
pxor xmm0, [rdi+aes_roundkeys_ofs]
; round keys 1..13, then key 14 as the last round
repeat 13
aesni_dec %
end repeat
aesni_declast 14
movdqu [rsi], xmm0 ; store the goods
epilog
end if