From 40ab870c59569d72d47397a72121923d0b998a3a Mon Sep 17 00:00:00 2001 From: Jeffrey Crowell Date: Fri, 4 Dec 2015 10:48:38 -0500 Subject: [PATCH] sync with chromium boringssl https://chromium.googlesource.com/chromium/src.git/+/10f4f96db4f6e67316f5e5b6aa92b73947509ac5 https://boringssl.googlesource.com/boringssl/+log/fde89b43c347155798dee8b1210c2c5faabe25f8..6d9e5a74482bb832a6b4d9ae3d20d8f10f250bbd --- DEPS | 5 +- third_party/serf/err_data.c | 2360 ++++------------ .../aes/{aesv8-armx.S => aesv8-armx64.S} | 4 +- .../serf/linux-aarch64/crypto/bn/armv8-mont.S | 1406 +++++++++ .../{ghashv8-armx.S => ghashv8-armx64.S} | 26 +- .../linux-aarch64/crypto/sha/sha1-armv8.S | 4 +- .../linux-aarch64/crypto/sha/sha256-armv8.S | 4 +- .../linux-aarch64/crypto/sha/sha512-armv8.S | 4 +- .../serf/linux-arm/crypto/aes/aes-armv4.S | 4 +- .../aes/{aesv8-armx.S => aesv8-armx32.S} | 4 +- .../serf/linux-arm/crypto/aes/bsaes-armv7.S | 6 +- .../serf/linux-arm/crypto/bn/armv4-mont.S | 4 +- .../serf/linux-arm/crypto/modes/ghash-armv4.S | 7 +- .../{ghashv8-armx.S => ghashv8-armx32.S} | 26 +- .../linux-arm/crypto/sha/sha1-armv4-large.S | 4 +- .../serf/linux-arm/crypto/sha/sha256-armv4.S | 4 +- .../serf/linux-arm/crypto/sha/sha512-armv4.S | 4 +- .../serf/linux-x86/crypto/cpu-x86-asm.S | 322 --- .../serf/linux-x86/crypto/sha/sha1-586.S | 1365 +++++++-- .../serf/linux-x86/crypto/sha/sha256-586.S | 1394 +++++++-- .../linux-x86_64/crypto/bn/x86_64-mont5.S | 9 + .../serf/linux-x86_64/crypto/cpu-x86_64-asm.S | 143 - .../linux-x86_64/crypto/ec/p256-x86_64-asm.S | 1780 ++++++++++++ .../linux-x86_64/crypto/rand/rdrand-x86_64.S | 39 +- .../linux-x86_64/crypto/sha/sha1-x86_64.S | 1121 ++++++++ .../linux-x86_64/crypto/sha/sha256-x86_64.S | 1062 +++++++ .../linux-x86_64/crypto/sha/sha512-x86_64.S | 2241 +++++++++++++++ third_party/serf/mac-x86/crypto/cpu-x86-asm.S | 309 -- .../serf/mac-x86/crypto/sha/sha1-586.S | 1359 +++++++-- .../serf/mac-x86/crypto/sha/sha256-586.S | 1394 +++++++-- .../serf/mac-x86_64/crypto/bn/x86_64-mont5.S | 9 + .../serf/mac-x86_64/crypto/cpu-x86_64-asm.S | 143 - .../mac-x86_64/crypto/ec/p256-x86_64-asm.S | 1779 ++++++++++++ .../mac-x86_64/crypto/rand/rdrand-x86_64.S | 39 +- .../serf/mac-x86_64/crypto/sha/sha1-x86_64.S | 1121 ++++++++ .../mac-x86_64/crypto/sha/sha256-x86_64.S | 1062 +++++++ .../mac-x86_64/crypto/sha/sha512-x86_64.S | 2241 +++++++++++++++ third_party/serf/openssl.gyp | 11 +- third_party/serf/openssl.gypi | 94 +- .../serf/win-x86/crypto/cpu-x86-asm.asm | 303 -- .../serf/win-x86/crypto/sha/sha1-586.asm | 1355 +++++++-- .../serf/win-x86/crypto/sha/sha256-586.asm | 1394 +++++++-- .../win-x86_64/crypto/bn/x86_64-mont5.asm | 9 + .../serf/win-x86_64/crypto/cpu-x86_64-asm.asm | 154 - .../win-x86_64/crypto/ec/p256-x86_64-asm.asm | 1916 +++++++++++++ .../win-x86_64/crypto/rand/rdrand-x86_64.asm | 50 +- .../win-x86_64/crypto/sha/sha1-x86_64.asm | 1152 ++++++++ .../win-x86_64/crypto/sha/sha256-x86_64.asm | 1088 +++++++ .../win-x86_64/crypto/sha/sha512-x86_64.asm | 2501 ++++++++++++++++- 49 files changed, 28317 insertions(+), 4518 deletions(-) rename third_party/serf/linux-aarch64/crypto/aes/{aesv8-armx.S => aesv8-armx64.S} (99%) create mode 100644 third_party/serf/linux-aarch64/crypto/bn/armv8-mont.S rename third_party/serf/linux-aarch64/crypto/modes/{ghashv8-armx.S => ghashv8-armx64.S} (91%) rename third_party/serf/linux-arm/crypto/aes/{aesv8-armx.S => aesv8-armx32.S} (99%) rename third_party/serf/linux-arm/crypto/modes/{ghashv8-armx.S => ghashv8-armx32.S} (89%) delete mode 100644 third_party/serf/linux-x86/crypto/cpu-x86-asm.S delete mode 100644 third_party/serf/linux-x86_64/crypto/cpu-x86_64-asm.S create mode 100644 third_party/serf/linux-x86_64/crypto/ec/p256-x86_64-asm.S delete mode 100644 third_party/serf/mac-x86/crypto/cpu-x86-asm.S delete mode 100644 third_party/serf/mac-x86_64/crypto/cpu-x86_64-asm.S create mode 100644 third_party/serf/mac-x86_64/crypto/ec/p256-x86_64-asm.S delete mode 100644 third_party/serf/win-x86/crypto/cpu-x86-asm.asm delete mode 100644 third_party/serf/win-x86_64/crypto/cpu-x86_64-asm.asm create mode 100644 third_party/serf/win-x86_64/crypto/ec/p256-x86_64-asm.asm diff --git a/DEPS b/DEPS index d732ec567b..94d4631a2f 100644 --- a/DEPS +++ b/DEPS @@ -76,9 +76,10 @@ vars = { # TODO(jmarantz): create an easy way to choose this option from the # 'gclient' command, without having to edit the gyp & DEPS files. # - # BoringSSL commit picked on Jun 10, 2015 + # BoringSSL commit picked on Dec 4, 2015 + # https://chromium.googlesource.com/chromium/src.git/+/10f4f96db4f6e67316f5e5b6aa92b73947509ac5 "boringssl_src": "https://boringssl.googlesource.com/boringssl.git", - "boringssl_git_revision": "@3a9e1fba0e3bcf014caa2df143a70475068447dc", + "boringssl_git_revision": "@6d9e5a74482bb832a6b4d9ae3d20d8f10f250bbd", "domain_registry_provider_src": "https://domain-registry-provider.googlecode.com/svn/trunk/", diff --git a/third_party/serf/err_data.c b/third_party/serf/err_data.c index 811f180d95..38580d1070 100644 --- a/third_party/serf/err_data.c +++ b/third_party/serf/err_data.c @@ -11,10 +11,14 @@ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + /* This file was generated by err_data_generate.go. */ + #include #include #include + + OPENSSL_COMPILE_ASSERT(ERR_LIB_NONE == 1, library_values_changed_1); OPENSSL_COMPILE_ASSERT(ERR_LIB_SYS == 2, library_values_changed_2); OPENSSL_COMPILE_ASSERT(ERR_LIB_BN == 3, library_values_changed_3); @@ -45,1462 +49,183 @@ OPENSSL_COMPILE_ASSERT(ERR_LIB_ECDH == 27, library_values_changed_27); OPENSSL_COMPILE_ASSERT(ERR_LIB_HMAC == 28, library_values_changed_28); OPENSSL_COMPILE_ASSERT(ERR_LIB_DIGEST == 29, library_values_changed_29); OPENSSL_COMPILE_ASSERT(ERR_LIB_CIPHER == 30, library_values_changed_30); -OPENSSL_COMPILE_ASSERT(ERR_LIB_USER == 31, library_values_changed_31); -OPENSSL_COMPILE_ASSERT(ERR_LIB_HKDF == 32, library_values_changed_32); +OPENSSL_COMPILE_ASSERT(ERR_LIB_HKDF == 31, library_values_changed_31); +OPENSSL_COMPILE_ASSERT(ERR_LIB_USER == 32, library_values_changed_32); OPENSSL_COMPILE_ASSERT(ERR_NUM_LIBS == 33, library_values_changed_num); -const uint32_t kOpenSSLFunctionValues[] = { - 0xc320540, - 0xc32854b, - 0xc330556, - 0xc338563, - 0xc34056d, - 0xc348577, - 0xc35057e, - 0xc35858a, - 0xc360591, - 0xc3685a7, - 0xc3705bc, - 0xc3785cd, - 0xc3805dd, - 0xc3885f7, - 0xc39060c, - 0xc39861b, - 0xc3a0634, - 0xc3a8648, - 0xc3b0654, - 0xc3b865b, - 0xc3c0663, - 0xc3c8671, - 0xc3d0679, - 0xc3d8681, - 0xc3e068c, - 0x10321744, - 0x1032975b, - 0x10331774, - 0x1033978a, - 0x1034179a, - 0x103497ad, - 0x103517bb, - 0x103597ca, - 0x103617ea, - 0x10369809, - 0x10371826, - 0x10379843, - 0x10381858, - 0x1038987a, - 0x10391899, - 0x103998b8, - 0x103a18cf, - 0x103a98e6, - 0x103b18ef, - 0x103b98fa, - 0x103c1914, - 0x103c991c, - 0x103d1924, - 0x103d992b, - 0x103e193e, - 0x103e9950, - 0x103f1963, - 0x103f996c, - 0x143209cb, - 0x143289d9, - 0x143309e5, - 0x143389f2, - 0x1832100b, - 0x18329023, - 0x18331045, - 0x18339057, - 0x18341068, - 0x18349081, - 0x18351092, - 0x183590a8, - 0x183610b8, - 0x183690cd, - 0x183710e6, - 0x183790f7, - 0x1838110d, - 0x1838911e, - 0x18391130, - 0x18399145, - 0x183a1157, - 0x183a9167, - 0x183b117c, - 0x183b9189, - 0x183c119b, - 0x183c91a9, - 0x183d11bc, - 0x183d91cc, - 0x183e11e1, - 0x183e91f2, - 0x183f1205, - 0x183f9214, - 0x18401224, - 0x18409231, - 0x18411240, - 0x18419251, - 0x18421264, - 0x18429276, - 0x18431288, - 0x18439299, - 0x184412aa, - 0x184492bb, - 0x184512cc, - 0x184592d9, - 0x184612e7, - 0x184692fa, - 0x1847130e, - 0x1847931b, - 0x1848132a, - 0x18489339, - 0x1849134a, - 0x18499357, - 0x184a1365, - 0x184a9376, - 0x184b1387, - 0x184b9395, - 0x184c13a5, - 0x184c93cb, - 0x184d13da, - 0x184d93ea, - 0x184e13fa, - 0x184e9409, - 0x1c320699, - 0x1c3286a5, - 0x1c3306b0, - 0x1c3386bc, - 0x2032141d, - 0x20329428, - 0x20331430, - 0x2033943c, - 0x24321448, - 0x24329456, - 0x24331468, - 0x24339477, - 0x2434148a, - 0x2434949d, - 0x243514b4, - 0x243594cc, - 0x243614da, - 0x243694f2, - 0x243714fb, - 0x2437950d, - 0x24381521, - 0x2438952e, - 0x24391544, - 0x2439955c, - 0x243a1574, - 0x243a957e, - 0x243b1593, - 0x243b95a1, - 0x243c15b9, - 0x243c95d0, - 0x243d15db, - 0x243d95e9, - 0x28320a2b, - 0x28328a3a, - 0x28330a45, - 0x28338a4a, - 0x28340a55, - 0x2c322850, - 0x2c32a85c, - 0x2c33286f, - 0x2c33a880, - 0x2c342899, - 0x2c34a8c1, - 0x2c3528d8, - 0x2c35a8f5, - 0x2c362912, - 0x2c36a92f, - 0x2c372948, - 0x2c37a961, - 0x2c382977, - 0x2c38a985, - 0x2c392997, - 0x2c39a9b4, - 0x2c3a29d1, - 0x2c3aa9df, - 0x2c3b29fd, - 0x2c3baa1b, - 0x2c3c2a36, - 0x2c3caa4a, - 0x2c3d2a5c, - 0x2c3daa6c, - 0x2c3e2a7a, - 0x2c3eaa8a, - 0x2c3f2a9a, - 0x2c3faab5, - 0x2c402ac6, - 0x2c40aae1, - 0x2c412af5, - 0x2c41ab08, - 0x2c422b27, - 0x2c42ab3b, - 0x2c432b4e, - 0x2c43ab5d, - 0x2c442b6c, - 0x2c44ab83, - 0x2c452b9e, - 0x2c45abb6, - 0x2c462bca, - 0x2c46abdd, - 0x2c472bee, - 0x2c47abff, - 0x2c482c10, - 0x2c48ac21, - 0x2c492c30, - 0x2c49ac3d, - 0x2c4a2c4a, - 0x2c4aac57, - 0x2c4b2c60, - 0x2c4bac74, - 0x2c4c2c83, - 0x2c4cac91, - 0x2c4d2cb3, - 0x2c4dacc4, - 0x2c4e2cd5, - 0x2c4eaca0, - 0x2c4f28b2, - 0x30320000, - 0x30328018, - 0x3033002c, - 0x30338042, - 0x3034005b, - 0x3034806c, - 0x3035007f, - 0x3035808f, - 0x3036009d, - 0x303680b3, - 0x303700c3, - 0x303780d8, - 0x303800e6, - 0x303880f7, - 0x30390103, - 0x3039810c, - 0x303a011d, - 0x303a812d, - 0x303b013a, - 0x303b8146, - 0x303c0157, - 0x303c8165, - 0x303d0176, - 0x303d8188, - 0x303e0199, - 0x303e81a8, - 0x303f01b9, - 0x303f81cd, - 0x304001df, - 0x304081ec, - 0x30410202, - 0x30418215, - 0x30420225, - 0x30428239, - 0x3043024a, - 0x3043825a, - 0x30440265, - 0x3044826d, - 0x3045027d, - 0x30458294, - 0x304602a1, - 0x304682b7, - 0x304702c9, - 0x304782d5, - 0x304802e1, - 0x304882ef, - 0x30490308, - 0x30498316, - 0x304a032b, - 0x304a8343, - 0x304b034d, - 0x304b8361, - 0x304c0372, - 0x304c8382, - 0x304d038f, - 0x304d83a0, - 0x304e03b0, - 0x304e83c2, - 0x304f03d3, - 0x304f83e2, - 0x305003f6, - 0x30508404, - 0x30510413, - 0x3051841c, - 0x3432095d, - 0x3432896d, - 0x34330978, - 0x34338985, - 0x3832098e, - 0x383289a1, - 0x383309ab, - 0x383389bd, - 0x3c320a5c, - 0x3c328a6a, - 0x3c330a81, - 0x3c338a95, - 0x3c340ab0, - 0x3c348ac1, - 0x3c350acd, - 0x3c358ae1, - 0x3c360af3, - 0x3c368b1c, - 0x3c370b29, - 0x3c378b36, - 0x3c380b44, - 0x3c388b51, - 0x3c390b5e, - 0x3c398b82, - 0x3c3a0b92, - 0x3c3a8baa, - 0x3c3b0bbf, - 0x3c3b8bd4, - 0x3c3c0be1, - 0x3c3c8bf4, - 0x3c3d0c07, - 0x3c3d8c2b, - 0x3c3e0c53, - 0x3c3e8c6c, - 0x3c3f0c82, - 0x3c3f8c8f, - 0x3c400ca2, - 0x3c408cb3, - 0x3c410cc4, - 0x3c418cdd, - 0x3c420cf6, - 0x3c428d0c, - 0x3c430d29, - 0x3c438d3f, - 0x3c440d5b, - 0x3c448d82, - 0x3c450da0, - 0x3c458dba, - 0x3c460dd2, - 0x3c468dea, - 0x3c470e15, - 0x3c478e40, - 0x3c480e61, - 0x3c488e8a, - 0x3c490ea5, - 0x3c498ec0, - 0x3c4a0ecd, - 0x3c4a8ee4, - 0x3c4b0efb, - 0x3c4b8f24, - 0x3c4c0f34, - 0x3c4c8f40, - 0x3c4d0f58, - 0x3c4d8f6b, - 0x3c4e0f7c, - 0x3c4e8f8d, - 0x3c4f0f9d, - 0x40321977, - 0x40329991, - 0x4033199d, - 0x403399b5, - 0x403419d3, - 0x403499f2, - 0x40351a09, - 0x40359a25, - 0x40361a41, - 0x40369a5b, - 0x40371a7a, - 0x40379a99, - 0x40381ab1, - 0x40389ace, - 0x40391af1, - 0x40399b0e, - 0x403a1b2c, - 0x403a9b3c, - 0x403b1b51, - 0x403b9b6d, - 0x403c1b87, - 0x403c9b92, - 0x403d1bb5, - 0x403d9bd9, - 0x403e1bef, - 0x403e9bf9, - 0x403f1c05, - 0x403f9c16, - 0x40401c2e, - 0x40409c36, - 0x40411c3f, - 0x40419c48, - 0x40421c58, - 0x40429c6c, - 0x40431c77, - 0x40439c83, - 0x40441c9e, - 0x40449caa, - 0x40451cb7, - 0x40459cca, - 0x40461ce2, - 0x40469cfa, - 0x40471d10, - 0x40479d2b, - 0x40481d46, - 0x40489d5a, - 0x40491d73, - 0x40499d8c, - 0x404a1da6, - 0x404a9db0, - 0x404b1dc0, - 0x404b9de1, - 0x404c1dfc, - 0x404c9e0a, - 0x404d1e17, - 0x404d9e2b, - 0x404e1e43, - 0x404e9e51, - 0x404f1e7b, - 0x404f9e92, - 0x40501ea4, - 0x40509ed5, - 0x40511f06, - 0x40519f1b, - 0x40521f2c, - 0x40529f4c, - 0x40531f67, - 0x40539f77, - 0x40541f83, - 0x40549f96, - 0x40551fac, - 0x40559fca, - 0x40561fd7, - 0x40569fe1, - 0x40571fef, - 0x4057a00a, - 0x40582025, - 0x4058a044, - 0x40592059, - 0x4059a06e, - 0x405a208b, - 0x405aa09f, - 0x405b20bb, - 0x405ba0d1, - 0x405c20ee, - 0x405ca100, - 0x405d2117, - 0x405da128, - 0x405e2144, - 0x405ea158, - 0x405f2168, - 0x405fa184, - 0x40602199, - 0x4060a1af, - 0x406121cc, - 0x4061a1e5, - 0x406221f8, - 0x4062a201, - 0x40632211, - 0x4063a21d, - 0x40642233, - 0x4064a251, - 0x40652266, - 0x4065a283, - 0x4066229a, - 0x4066a2b8, - 0x406722d5, - 0x4067a2ec, - 0x4068230a, - 0x4068a321, - 0x40692339, - 0x4069a34a, - 0x406a235d, - 0x406aa370, - 0x406b2384, - 0x406ba3a8, - 0x406c23c3, - 0x406ca3e4, - 0x406d2408, - 0x406da423, - 0x406e2444, - 0x406ea459, - 0x406f2472, - 0x406fa47f, - 0x4070248d, - 0x4070a49a, - 0x407124b7, - 0x4071a4d7, - 0x407224f2, - 0x4072a50b, - 0x40732522, - 0x4073a53c, - 0x40742560, - 0x4074a576, - 0x4075258a, - 0x4075a59f, - 0x407625b9, - 0x4076a5cb, - 0x407725e0, - 0x4077a606, - 0x40782623, - 0x4078a646, - 0x4079266c, - 0x4079a689, - 0x407a26ac, - 0x407aa6c8, - 0x407b26e4, - 0x407ba6f6, - 0x407c2703, - 0x407ca710, - 0x407d272d, - 0x407da744, - 0x407e2760, - 0x407ea776, - 0x407f278e, - 0x407fa7a1, - 0x408027b6, - 0x4080a7cf, - 0x408127ed, - 0x4081a80d, - 0x40822816, - 0x4082a832, - 0x4083283b, - 0x40839e60, - 0x40841eef, - 0x40849ebf, - 0x4432042a, - 0x4432843c, - 0x44330445, - 0x4433844d, - 0x4434045a, - 0x4434846a, - 0x44350485, - 0x443584a5, - 0x443604c1, - 0x443684e2, - 0x443704e9, - 0x443784f7, - 0x44380501, - 0x4438850d, - 0x44390517, - 0x44398522, - 0x443a052c, - 0x443a8536, - 0x4c3215f1, - 0x4c329600, - 0x4c33160f, - 0x4c339628, - 0x4c341643, - 0x4c34965f, - 0x4c351671, - 0x4c35967f, - 0x4c361694, - 0x4c3696a5, - 0x4c3716b3, - 0x4c3796c1, - 0x4c3816d3, - 0x4c3896e3, - 0x4c3916ed, - 0x4c399705, - 0x4c3a171d, - 0x4c3a9730, - 0x50322ce6, - 0x5032acfb, - 0x50332d0c, - 0x5033ad1f, - 0x50342d30, - 0x5034ad43, - 0x50352d52, - 0x5035ad67, - 0x50362d77, - 0x5036ad86, - 0x50372d97, - 0x5037ada7, - 0x50382db8, - 0x5038adcb, - 0x50392ddd, - 0x5039adf3, - 0x503a2e05, - 0x503aae16, - 0x503b2e27, - 0x503bae38, - 0x503c2e43, - 0x503cae4f, - 0x503d2e5a, - 0x503dae65, - 0x503e2e72, - 0x503eae87, - 0x503f2e95, - 0x503faea9, - 0x50402ebc, - 0x5040aecd, - 0x50412ee7, - 0x5041aef6, - 0x50422eff, - 0x5042af0e, - 0x50432f20, - 0x5043af2c, - 0x50442f34, - 0x5044af47, - 0x50452f58, - 0x5045af6e, - 0x50462f7a, - 0x5046af8e, - 0x50472f9c, - 0x5047afb0, - 0x50482fca, - 0x5048afde, - 0x50492ff4, - 0x5049b00b, - 0x504a301d, - 0x504ab031, - 0x504b3046, - 0x504bb05d, - 0x504c3071, - 0x504cb07a, - 0x504d3082, - 0x504db091, - 0x504e30a1, - 0x68320fbe, - 0x68328fcf, - 0x68330fdf, - 0x68338fed, - 0x68340ffa, - 0x6c320fad, - 0x74320a06, - 0x74328a18, - 0x783206c9, - 0x783286fc, - 0x7833070e, - 0x78338720, - 0x78340734, - 0x78348748, - 0x78350766, - 0x78358778, - 0x7836078c, - 0x783687a0, - 0x783707b2, - 0x783787c4, - 0x783807d6, - 0x783887ed, - 0x78390804, - 0x7839881b, - 0x783a0837, - 0x783a8853, - 0x783b086f, - 0x783b8885, - 0x783c089b, - 0x783c88b1, - 0x783d08ce, - 0x783d88dd, - 0x783e08ec, - 0x783e88fb, - 0x783f0917, - 0x783f8925, - 0x78400933, - 0x78408941, - 0x7841094e, - 0x784186db, - 0x80321418, -}; -const size_t kOpenSSLFunctionValuesLen = sizeof(kOpenSSLFunctionValues) / sizeof(kOpenSSLFunctionValues[0]); -const char kOpenSSLFunctionStringData[] = - "ASN1_BIT_STRING_set_bit\0" - "ASN1_ENUMERATED_set\0" - "ASN1_ENUMERATED_to_BN\0" - "ASN1_GENERALIZEDTIME_adj\0" - "ASN1_INTEGER_set\0" - "ASN1_INTEGER_to_BN\0" - "ASN1_OBJECT_new\0" - "ASN1_PCTX_new\0" - "ASN1_STRING_TABLE_add\0" - "ASN1_STRING_set\0" - "ASN1_STRING_type_new\0" - "ASN1_TIME_adj\0" - "ASN1_UTCTIME_adj\0" - "ASN1_d2i_fp\0" - "ASN1_dup\0" - "ASN1_generate_v3\0" - "ASN1_get_object\0" - "ASN1_i2d_bio\0" - "ASN1_i2d_fp\0" - "ASN1_item_d2i_fp\0" - "ASN1_item_dup\0" - "ASN1_item_ex_d2i\0" - "ASN1_item_i2d_bio\0" - "ASN1_item_i2d_fp\0" - "ASN1_item_pack\0" - "ASN1_item_unpack\0" - "ASN1_mbstring_ncopy\0" - "ASN1_template_new\0" - "BIO_new_NDEF\0" - "BN_to_ASN1_ENUMERATED\0" - "BN_to_ASN1_INTEGER\0" - "a2d_ASN1_OBJECT\0" - "a2i_ASN1_ENUMERATED\0" - "a2i_ASN1_INTEGER\0" - "a2i_ASN1_STRING\0" - "append_exp\0" - "asn1_cb\0" - "asn1_check_tlen\0" - "asn1_collate_primitive\0" - "asn1_collect\0" - "asn1_d2i_ex_primitive\0" - "asn1_d2i_read_bio\0" - "asn1_do_adb\0" - "asn1_ex_c2i\0" - "asn1_find_end\0" - "asn1_item_ex_combine_new\0" - "asn1_str2type\0" - "asn1_template_ex_d2i\0" - "asn1_template_noexp_d2i\0" - "bitstr_cb\0" - "c2i_ASN1_BIT_STRING\0" - "c2i_ASN1_INTEGER\0" - "c2i_ASN1_OBJECT\0" - "collect_data\0" - "d2i_ASN1_BOOLEAN\0" - "d2i_ASN1_OBJECT\0" - "d2i_ASN1_UINTEGER\0" - "d2i_ASN1_UTCTIME\0" - "d2i_ASN1_bytes\0" - "d2i_ASN1_type_bytes\0" - "i2d_ASN1_TIME\0" - "i2d_PrivateKey\0" - "long_c2i\0" - "parse_tagging\0" - "BIO_callback_ctrl\0" - "BIO_ctrl\0" - "BIO_new\0" - "BIO_new_file\0" - "BIO_new_mem_buf\0" - "BIO_zero_copy_get_read_buf\0" - "BIO_zero_copy_get_read_buf_done\0" - "BIO_zero_copy_get_write_buf\0" - "BIO_zero_copy_get_write_buf_done\0" - "bio_io\0" - "bio_make_pair\0" - "bio_write\0" - "buffer_ctrl\0" - "conn_ctrl\0" - "conn_state\0" - "file_ctrl\0" - "file_read\0" - "mem_write\0" - "BN_CTX_get\0" - "BN_CTX_new\0" - "BN_CTX_start\0" - "BN_bn2dec\0" - "BN_bn2hex\0" - "BN_div\0" - "BN_div_recp\0" - "BN_exp\0" - "BN_generate_dsa_nonce\0" - "BN_generate_prime_ex\0" - "BN_mod_exp2_mont\0" - "BN_mod_exp_mont\0" - "BN_mod_exp_mont_consttime\0" - "BN_mod_exp_mont_word\0" - "BN_mod_inverse\0" - "BN_mod_inverse_no_branch\0" - "BN_mod_lshift_quick\0" - "BN_mod_sqrt\0" - "BN_new\0" - "BN_rand\0" - "BN_rand_range\0" - "BN_sqrt\0" - "BN_usub\0" - "bn_wexpand\0" - "mod_exp_recp\0" - "BUF_MEM_new\0" - "BUF_memdup\0" - "BUF_strndup\0" - "buf_mem_grow\0" - "EVP_AEAD_CTX_init\0" - "EVP_AEAD_CTX_init_with_direction\0" - "EVP_AEAD_CTX_open\0" - "EVP_AEAD_CTX_seal\0" - "EVP_CIPHER_CTX_copy\0" - "EVP_CIPHER_CTX_ctrl\0" - "EVP_CIPHER_CTX_set_key_length\0" - "EVP_CipherInit_ex\0" - "EVP_DecryptFinal_ex\0" - "EVP_EncryptFinal_ex\0" - "aead_aes_gcm_init\0" - "aead_aes_gcm_open\0" - "aead_aes_gcm_seal\0" - "aead_aes_key_wrap_init\0" - "aead_aes_key_wrap_open\0" - "aead_aes_key_wrap_seal\0" - "aead_chacha20_poly1305_init\0" - "aead_chacha20_poly1305_open\0" - "aead_chacha20_poly1305_seal\0" - "aead_rc4_md5_tls_init\0" - "aead_rc4_md5_tls_open\0" - "aead_rc4_md5_tls_seal\0" - "aead_ssl3_ensure_cipher_init\0" - "aead_ssl3_init\0" - "aead_ssl3_open\0" - "aead_ssl3_seal\0" - "aead_tls_ensure_cipher_init\0" - "aead_tls_init\0" - "aead_tls_open\0" - "aead_tls_seal\0" - "aes_init_key\0" - "aesni_init_key\0" - "CONF_parse_list\0" - "NCONF_load\0" - "def_load_bio\0" - "str_copy\0" - "CRYPTO_set_ex_data\0" - "get_class\0" - "get_func_pointers\0" - "get_new_index\0" - "DH_new_method\0" - "compute_key\0" - "generate_key\0" - "generate_parameters\0" - "EVP_DigestInit_ex\0" - "EVP_MD_CTX_copy_ex\0" - "DSA_new_method\0" - "dsa_sig_cb\0" - "sign\0" - "sign_setup\0" - "verify\0" - "EC_GROUP_copy\0" - "EC_GROUP_get_curve_GFp\0" - "EC_GROUP_get_degree\0" - "EC_GROUP_new_by_curve_name\0" - "EC_KEY_check_key\0" - "EC_KEY_copy\0" - "EC_KEY_generate_key\0" - "EC_KEY_new_method\0" - "EC_KEY_set_public_key_affine_coordinates\0" - "EC_POINT_add\0" - "EC_POINT_cmp\0" - "EC_POINT_copy\0" - "EC_POINT_dbl\0" - "EC_POINT_dup\0" - "EC_POINT_get_affine_coordinates_GFp\0" - "EC_POINT_invert\0" - "EC_POINT_is_at_infinity\0" - "EC_POINT_is_on_curve\0" - "EC_POINT_make_affine\0" - "EC_POINT_new\0" - "EC_POINT_oct2point\0" - "EC_POINT_point2oct\0" - "EC_POINT_set_affine_coordinates_GFp\0" - "EC_POINT_set_compressed_coordinates_GFp\0" - "EC_POINT_set_to_infinity\0" - "EC_POINTs_make_affine\0" - "compute_wNAF\0" - "d2i_ECPKParameters\0" - "d2i_ECParameters\0" - "d2i_ECPrivateKey\0" - "ec_GFp_mont_field_decode\0" - "ec_GFp_mont_field_encode\0" - "ec_GFp_mont_field_mul\0" - "ec_GFp_mont_field_set_to_one\0" - "ec_GFp_mont_field_sqr\0" - "ec_GFp_mont_group_set_curve\0" - "ec_GFp_simple_group_check_discriminant\0" - "ec_GFp_simple_group_set_curve\0" - "ec_GFp_simple_make_affine\0" - "ec_GFp_simple_oct2point\0" - "ec_GFp_simple_point2oct\0" - "ec_GFp_simple_point_get_affine_coordinates\0" - "ec_GFp_simple_point_set_affine_coordinates\0" - "ec_GFp_simple_points_make_affine\0" - "ec_GFp_simple_set_compressed_coordinates\0" - "ec_asn1_group2pkparameters\0" - "ec_asn1_pkparameters2group\0" - "ec_group_new\0" - "ec_group_new_curve_GFp\0" - "ec_group_new_from_data\0" - "ec_point_set_Jprojective_coordinates_GFp\0" - "ec_pre_comp_new\0" - "ec_wNAF_mul\0" - "ec_wNAF_precompute_mult\0" - "i2d_ECPKParameters\0" - "i2d_ECParameters\0" - "i2d_ECPrivateKey\0" - "i2o_ECPublicKey\0" - "o2i_ECPublicKey\0" - "ECDH_compute_key\0" - "ECDSA_do_sign_ex\0" - "ECDSA_do_verify\0" - "ECDSA_sign_ex\0" - "digest_to_bn\0" - "ecdsa_sign_setup\0" - "EVP_DigestSignAlgorithm\0" - "EVP_DigestVerifyInitFromAlgorithm\0" - "EVP_PKEY_CTX_ctrl\0" - "EVP_PKEY_CTX_dup\0" - "EVP_PKEY_copy_parameters\0" - "EVP_PKEY_decrypt\0" - "EVP_PKEY_decrypt_init\0" - "EVP_PKEY_derive\0" - "EVP_PKEY_derive_init\0" - "EVP_PKEY_derive_set_peer\0" - "EVP_PKEY_encrypt\0" - "EVP_PKEY_encrypt_init\0" - "EVP_PKEY_get1_DH\0" - "EVP_PKEY_get1_DSA\0" - "EVP_PKEY_get1_EC_KEY\0" - "EVP_PKEY_get1_RSA\0" - "EVP_PKEY_keygen\0" - "EVP_PKEY_keygen_init\0" - "EVP_PKEY_new\0" - "EVP_PKEY_set_type\0" - "EVP_PKEY_sign\0" - "EVP_PKEY_sign_init\0" - "EVP_PKEY_verify\0" - "EVP_PKEY_verify_init\0" - "check_padding_md\0" - "d2i_AutoPrivateKey\0" - "d2i_PrivateKey\0" - "do_EC_KEY_print\0" - "do_rsa_print\0" - "do_sigver_init\0" - "eckey_param2type\0" - "eckey_param_decode\0" - "eckey_priv_decode\0" - "eckey_priv_encode\0" - "eckey_pub_decode\0" - "eckey_pub_encode\0" - "eckey_type2param\0" - "evp_pkey_ctx_new\0" - "hmac_signctx\0" - "i2d_PublicKey\0" - "old_ec_priv_decode\0" - "old_rsa_priv_decode\0" - "pkey_ec_ctrl\0" - "pkey_ec_derive\0" - "pkey_ec_keygen\0" - "pkey_ec_paramgen\0" - "pkey_ec_sign\0" - "pkey_rsa_ctrl\0" - "pkey_rsa_decrypt\0" - "pkey_rsa_encrypt\0" - "pkey_rsa_sign\0" - "rsa_algor_to_md\0" - "rsa_digest_verify_init_from_algorithm\0" - "rsa_mgf1_to_md\0" - "rsa_priv_decode\0" - "rsa_priv_encode\0" - "rsa_pss_to_ctx\0" - "rsa_pub_decode\0" - "HKDF\0" - "OBJ_create\0" - "OBJ_dup\0" - "OBJ_nid2obj\0" - "OBJ_txt2obj\0" - "PEM_ASN1_read\0" - "PEM_ASN1_read_bio\0" - "PEM_ASN1_write\0" - "PEM_ASN1_write_bio\0" - "PEM_X509_INFO_read\0" - "PEM_X509_INFO_read_bio\0" - "PEM_X509_INFO_write_bio\0" - "PEM_do_header\0" - "PEM_get_EVP_CIPHER_INFO\0" - "PEM_read\0" - "PEM_read_DHparams\0" - "PEM_read_PrivateKey\0" - "PEM_read_bio\0" - "PEM_read_bio_DHparams\0" - "PEM_read_bio_Parameters\0" - "PEM_read_bio_PrivateKey\0" - "PEM_write\0" - "PEM_write_PrivateKey\0" - "PEM_write_bio\0" - "d2i_PKCS8PrivateKey_bio\0" - "d2i_PKCS8PrivateKey_fp\0" - "do_pk8pkey\0" - "do_pk8pkey_fp\0" - "load_iv\0" - "EVP_PKCS82PKEY\0" - "EVP_PKEY2PKCS8\0" - "PKCS12_get_key_and_certs\0" - "PKCS12_handle_content_info\0" - "PKCS12_handle_content_infos\0" - "PKCS5_pbe2_set_iv\0" - "PKCS5_pbe_set\0" - "PKCS5_pbe_set0_algor\0" - "PKCS5_pbkdf2_set\0" - "PKCS8_decrypt\0" - "PKCS8_encrypt\0" - "PKCS8_encrypt_pbe\0" - "pbe_cipher_init\0" - "pbe_crypt\0" - "pkcs12_item_decrypt_d2i\0" - "pkcs12_item_i2d_encrypt\0" - "pkcs12_key_gen_raw\0" - "pkcs12_pbe_keyivgen\0" - "BN_BLINDING_convert_ex\0" - "BN_BLINDING_create_param\0" - "BN_BLINDING_invert_ex\0" - "BN_BLINDING_new\0" - "BN_BLINDING_update\0" - "RSA_check_key\0" - "RSA_new_method\0" - "RSA_padding_add_PKCS1_OAEP_mgf1\0" - "RSA_padding_add_PKCS1_PSS_mgf1\0" - "RSA_padding_add_PKCS1_type_1\0" - "RSA_padding_add_PKCS1_type_2\0" - "RSA_padding_add_none\0" - "RSA_padding_check_PKCS1_OAEP_mgf1\0" - "RSA_padding_check_PKCS1_type_1\0" - "RSA_padding_check_PKCS1_type_2\0" - "RSA_padding_check_none\0" - "RSA_recover_crt_params\0" - "RSA_sign\0" - "RSA_verify\0" - "RSA_verify_PKCS1_PSS_mgf1\0" - "decrypt\0" - "encrypt\0" - "keygen\0" - "pkcs1_prefixed_msg\0" - "private_transform\0" - "rsa_setup_blinding\0" - "sign_raw\0" - "verify_raw\0" - "SSL_CTX_check_private_key\0" - "SSL_CTX_new\0" - "SSL_CTX_set_cipher_list\0" - "SSL_CTX_set_cipher_list_tls11\0" - "SSL_CTX_set_session_id_context\0" - "SSL_CTX_use_PrivateKey\0" - "SSL_CTX_use_PrivateKey_ASN1\0" - "SSL_CTX_use_PrivateKey_file\0" - "SSL_CTX_use_RSAPrivateKey\0" - "SSL_CTX_use_RSAPrivateKey_ASN1\0" - "SSL_CTX_use_RSAPrivateKey_file\0" - "SSL_CTX_use_certificate\0" - "SSL_CTX_use_certificate_ASN1\0" - "SSL_CTX_use_certificate_chain_file\0" - "SSL_CTX_use_certificate_file\0" - "SSL_CTX_use_psk_identity_hint\0" - "SSL_SESSION_new\0" - "SSL_SESSION_print_fp\0" - "SSL_SESSION_set1_id_context\0" - "SSL_SESSION_to_bytes_full\0" - "SSL_accept\0" - "SSL_add_dir_cert_subjects_to_stack\0" - "SSL_add_file_cert_subjects_to_stack\0" - "SSL_check_private_key\0" - "SSL_clear\0" - "SSL_connect\0" - "SSL_do_handshake\0" - "SSL_load_client_CA_file\0" - "SSL_new\0" - "SSL_peek\0" - "SSL_read\0" - "SSL_renegotiate\0" - "SSL_set_cipher_list\0" - "SSL_set_fd\0" - "SSL_set_rfd\0" - "SSL_set_session_id_context\0" - "SSL_set_wfd\0" - "SSL_shutdown\0" - "SSL_use_PrivateKey\0" - "SSL_use_PrivateKey_ASN1\0" - "SSL_use_PrivateKey_file\0" - "SSL_use_RSAPrivateKey\0" - "SSL_use_RSAPrivateKey_ASN1\0" - "SSL_use_RSAPrivateKey_file\0" - "SSL_use_certificate\0" - "SSL_use_certificate_ASN1\0" - "SSL_use_certificate_file\0" - "SSL_use_psk_identity_hint\0" - "SSL_write\0" - "d2i_SSL_SESSION\0" - "d2i_SSL_SESSION_get_octet_string\0" - "d2i_SSL_SESSION_get_string\0" - "do_ssl3_write\0" - "dtls1_accept\0" - "dtls1_buffer_record\0" - "dtls1_check_timeout_num\0" - "dtls1_connect\0" - "dtls1_do_write\0" - "dtls1_get_buffered_message\0" - "dtls1_get_hello_verify\0" - "dtls1_get_message\0" - "dtls1_get_message_fragment\0" - "dtls1_hm_fragment_new\0" - "dtls1_preprocess_fragment\0" - "dtls1_process_fragment\0" - "dtls1_process_record\0" - "dtls1_read_bytes\0" - "dtls1_send_hello_verify_request\0" - "dtls1_write_app_data_bytes\0" - "i2d_SSL_SESSION\0" - "ssl3_accept\0" - "ssl3_callback_ctrl\0" - "ssl3_cert_verify_hash\0" - "ssl3_check_cert_and_algorithm\0" - "ssl3_connect\0" - "ssl3_ctrl\0" - "ssl3_ctx_ctrl\0" - "ssl3_digest_cached_records\0" - "ssl3_do_change_cipher_spec\0" - "ssl3_expect_change_cipher_spec\0" - "ssl3_get_cert_status\0" - "ssl3_get_cert_verify\0" - "ssl3_get_certificate_request\0" - "ssl3_get_channel_id\0" - "ssl3_get_client_certificate\0" - "ssl3_get_client_hello\0" - "ssl3_get_client_key_exchange\0" - "ssl3_get_finished\0" - "ssl3_get_initial_bytes\0" - "ssl3_get_message\0" - "ssl3_get_new_session_ticket\0" - "ssl3_get_next_proto\0" - "ssl3_get_record\0" - "ssl3_get_server_certificate\0" - "ssl3_get_server_done\0" - "ssl3_get_server_hello\0" - "ssl3_get_server_key_exchange\0" - "ssl3_get_v2_client_hello\0" - "ssl3_handshake_mac\0" - "ssl3_prf\0" - "ssl3_read_bytes\0" - "ssl3_read_n\0" - "ssl3_send_cert_verify\0" - "ssl3_send_certificate_request\0" - "ssl3_send_channel_id\0" - "ssl3_send_client_certificate\0" - "ssl3_send_client_hello\0" - "ssl3_send_client_key_exchange\0" - "ssl3_send_server_certificate\0" - "ssl3_send_server_hello\0" - "ssl3_send_server_key_exchange\0" - "ssl3_setup_read_buffer\0" - "ssl3_setup_write_buffer\0" - "ssl3_write_bytes\0" - "ssl3_write_pending\0" - "ssl_add_cert_chain\0" - "ssl_add_cert_to_buf\0" - "ssl_add_clienthello_renegotiate_ext\0" - "ssl_add_clienthello_tlsext\0" - "ssl_add_clienthello_use_srtp_ext\0" - "ssl_add_serverhello_renegotiate_ext\0" - "ssl_add_serverhello_tlsext\0" - "ssl_add_serverhello_use_srtp_ext\0" - "ssl_build_cert_chain\0" - "ssl_bytes_to_cipher_list\0" - "ssl_cert_dup\0" - "ssl_cert_inst\0" - "ssl_cert_new\0" - "ssl_check_serverhello_tlsext\0" - "ssl_check_srvr_ecc_cert_and_alg\0" - "ssl_cipher_process_rulestr\0" - "ssl_cipher_strength_sort\0" - "ssl_create_cipher_list\0" - "ssl_ctx_log_master_secret\0" - "ssl_ctx_log_rsa_client_key_exchange\0" - "ssl_ctx_make_profiles\0" - "ssl_get_new_session\0" - "ssl_get_prev_session\0" - "ssl_get_server_cert_index\0" - "ssl_get_sign_pkey\0" - "ssl_init_wbio_buffer\0" - "ssl_parse_clienthello_renegotiate_ext\0" - "ssl_parse_clienthello_tlsext\0" - "ssl_parse_clienthello_use_srtp_ext\0" - "ssl_parse_serverhello_renegotiate_ext\0" - "ssl_parse_serverhello_tlsext\0" - "ssl_parse_serverhello_use_srtp_ext\0" - "ssl_scan_clienthello_tlsext\0" - "ssl_scan_serverhello_tlsext\0" - "ssl_sess_cert_new\0" - "ssl_set_cert\0" - "ssl_set_pkey\0" - "ssl_undefined_const_function\0" - "ssl_undefined_function\0" - "ssl_undefined_void_function\0" - "ssl_verify_cert_chain\0" - "tls12_check_peer_sigalg\0" - "tls1_aead_ctx_init\0" - "tls1_cert_verify_mac\0" - "tls1_change_cipher_state\0" - "tls1_change_cipher_state_aead\0" - "tls1_check_duplicate_extensions\0" - "tls1_enc\0" - "tls1_export_keying_material\0" - "tls1_prf\0" - "tls1_setup_key_block\0" - "ASN1_digest\0" - "ASN1_item_sign_ctx\0" - "ASN1_item_verify\0" - "NETSCAPE_SPKI_b64_decode\0" - "NETSCAPE_SPKI_b64_encode\0" - "PKCS7_get_CRLs\0" - "PKCS7_get_certificates\0" - "X509_ATTRIBUTE_create_by_NID\0" - "X509_ATTRIBUTE_create_by_OBJ\0" - "X509_ATTRIBUTE_create_by_txt\0" - "X509_ATTRIBUTE_get0_data\0" - "X509_ATTRIBUTE_set1_data\0" - "X509_CRL_add0_revoked\0" - "X509_CRL_diff\0" - "X509_CRL_print_fp\0" - "X509_EXTENSION_create_by_NID\0" - "X509_EXTENSION_create_by_OBJ\0" - "X509_INFO_new\0" - "X509_NAME_ENTRY_create_by_NID\0" - "X509_NAME_ENTRY_create_by_txt\0" - "X509_NAME_ENTRY_set_object\0" - "X509_NAME_add_entry\0" - "X509_NAME_oneline\0" - "X509_NAME_print\0" - "X509_PKEY_new\0" - "X509_PUBKEY_get\0" - "X509_PUBKEY_set\0" - "X509_REQ_check_private_key\0" - "X509_REQ_to_X509\0" - "X509_STORE_CTX_get1_issuer\0" - "X509_STORE_CTX_init\0" - "X509_STORE_CTX_new\0" - "X509_STORE_CTX_purpose_inherit\0" - "X509_STORE_add_cert\0" - "X509_STORE_add_crl\0" - "X509_TRUST_add\0" - "X509_TRUST_set\0" - "X509_check_private_key\0" - "X509_get_pubkey_parameters\0" - "X509_load_cert_crl_file\0" - "X509_load_cert_file\0" - "X509_load_crl_file\0" - "X509_print_ex_fp\0" - "X509_to_X509_REQ\0" - "X509_verify_cert\0" - "X509at_add1_attr\0" - "X509v3_add_ext\0" - "add_cert_dir\0" - "by_file_ctrl\0" - "check_policy\0" - "dir_ctrl\0" - "get_cert_by_subject\0" - "i2d_DSA_PUBKEY\0" - "i2d_EC_PUBKEY\0" - "i2d_RSA_PUBKEY\0" - "pkcs7_parse_header\0" - "x509_name_encode\0" - "x509_name_ex_d2i\0" - "x509_name_ex_new\0" - "SXNET_add_id_INTEGER\0" - "SXNET_add_id_asc\0" - "SXNET_add_id_ulong\0" - "SXNET_get_id_asc\0" - "SXNET_get_id_ulong\0" - "X509V3_EXT_add\0" - "X509V3_EXT_add_alias\0" - "X509V3_EXT_free\0" - "X509V3_EXT_i2d\0" - "X509V3_EXT_nconf\0" - "X509V3_add1_i2d\0" - "X509V3_add_value\0" - "X509V3_get_section\0" - "X509V3_get_string\0" - "X509V3_get_value_bool\0" - "X509V3_parse_list\0" - "X509_PURPOSE_add\0" - "X509_PURPOSE_set\0" - "a2i_GENERAL_NAME\0" - "copy_email\0" - "copy_issuer\0" - "do_dirname\0" - "do_ext_i2d\0" - "do_ext_nconf\0" - "gnames_from_sectname\0" - "hex_to_string\0" - "i2s_ASN1_ENUMERATED\0" - "i2s_ASN1_IA5STRING\0" - "i2s_ASN1_INTEGER\0" - "i2v_AUTHORITY_INFO_ACCESS\0" - "notice_section\0" - "nref_nos\0" - "policy_section\0" - "process_pci_value\0" - "r2i_certpol\0" - "r2i_pci\0" - "s2i_ASN1_IA5STRING\0" - "s2i_ASN1_INTEGER\0" - "s2i_ASN1_OCTET_STRING\0" - "s2i_skey_id\0" - "set_dist_point_name\0" - "string_to_hex\0" - "v2i_ASN1_BIT_STRING\0" - "v2i_AUTHORITY_INFO_ACCESS\0" - "v2i_AUTHORITY_KEYID\0" - "v2i_BASIC_CONSTRAINTS\0" - "v2i_EXTENDED_KEY_USAGE\0" - "v2i_GENERAL_NAMES\0" - "v2i_GENERAL_NAME_ex\0" - "v2i_NAME_CONSTRAINTS\0" - "v2i_POLICY_CONSTRAINTS\0" - "v2i_POLICY_MAPPINGS\0" - "v2i_crld\0" - "v2i_idp\0" - "v2i_issuer_alt\0" - "v2i_subject_alt\0" - "v3_generic_extension\0" - ""; + const uint32_t kOpenSSLReasonValues[] = { 0xc3207ba, - 0xc3287c7, - 0xc3307d6, - 0xc3387e6, - 0xc3407f5, - 0xc34880e, - 0xc35081a, - 0xc358837, - 0xc360849, - 0xc368857, - 0xc370867, - 0xc378874, - 0xc380884, - 0xc38888f, - 0xc3908a5, - 0xc3988b4, - 0xc3a08c8, - 0x10321417, - 0x10329423, - 0x1033143c, - 0x1033944f, - 0x10340da9, - 0x10349462, - 0x10351477, - 0x1035948a, - 0x103614a3, - 0x103694b8, - 0x103714d6, - 0x103794e5, - 0x10381501, - 0x1038951c, - 0x1039152b, - 0x10399547, - 0x103a1562, - 0x103a9579, - 0x103b158a, - 0x103b959e, - 0x103c15bd, - 0x103c95cc, - 0x103d15e3, - 0x103d95f6, - 0x103e0b5f, - 0x103e9609, - 0x103f161c, - 0x103f9636, - 0x10401646, - 0x1040965a, - 0x10411670, - 0x10419688, - 0x1042169d, - 0x104296b1, - 0x104316c3, + 0xc3287d4, + 0xc3307e3, + 0xc3387f3, + 0xc340802, + 0xc34881b, + 0xc350827, + 0xc358844, + 0xc360856, + 0xc368864, + 0xc370874, + 0xc378881, + 0xc380891, + 0xc38889c, + 0xc3908b2, + 0xc3988c1, + 0xc3a08d5, + 0xc3a87c7, + 0xc3b00b0, + 0x10321478, + 0x10329484, + 0x1033149d, + 0x103394b0, + 0x10340de1, + 0x103494cf, + 0x103514e4, + 0x10359516, + 0x1036152f, + 0x10369544, + 0x10371562, + 0x10379571, + 0x1038158d, + 0x103895a8, + 0x103915b7, + 0x103995d3, + 0x103a15ee, + 0x103a9605, + 0x103b1616, + 0x103b962a, + 0x103c1649, + 0x103c9658, + 0x103d166f, + 0x103d9682, + 0x103e0b6c, + 0x103e96b3, + 0x103f16c6, + 0x103f96e0, + 0x104016f0, + 0x10409704, + 0x1041171a, + 0x10419732, + 0x10421747, + 0x1042975b, + 0x1043176d, 0x104385d0, - 0x104408b4, - 0x104496d8, - 0x104516ef, - 0x10459704, - 0x10461712, - 0x14320b42, - 0x14328b50, - 0x14330b5f, - 0x14338b71, + 0x104408c1, + 0x10449782, + 0x10451799, + 0x104597ae, + 0x104617bc, + 0x10469695, + 0x104714f7, + 0x104787c7, + 0x104800b0, + 0x104894c3, + 0x14320b4f, + 0x14328b5d, + 0x14330b6c, + 0x14338b7e, 0x18320083, - 0x18328dff, - 0x18330e15, - 0x18338094, - 0x18340e2d, - 0x18348e41, - 0x18350e56, - 0x18358e78, - 0x18360e90, - 0x18368ea5, - 0x18370eb8, - 0x18378ecc, - 0x18380ef0, - 0x18388efe, - 0x18390f14, - 0x18398f28, - 0x183a0f38, - 0x183a89cc, - 0x183b0f48, - 0x183b8f5d, - 0x183c0f74, - 0x183c8f88, - 0x183d0f9c, - 0x183d8fac, - 0x183e0b8e, - 0x183e8fb9, - 0x183f0fcb, - 0x183f8fd6, - 0x18400fe6, - 0x18408ff7, - 0x18411008, - 0x1841901a, - 0x18421043, - 0x1842905c, - 0x1843106b, - 0x1843907f, - 0x184410a0, - 0x184490b8, - 0x184510d4, - 0x184590ea, - 0x18461105, - 0x1846866b, - 0x18471120, - 0x1847913b, - 0x20321162, - 0x2432116e, - 0x243288fa, - 0x24331180, - 0x2433918d, - 0x2434119a, - 0x243491ac, - 0x243511bb, - 0x243591d8, - 0x243611e5, - 0x243691f3, - 0x24371201, - 0x2437920f, - 0x24381218, - 0x24389225, - 0x24391238, - 0x28320b82, - 0x28328b8e, - 0x28330b5f, - 0x28338ba1, - 0x2c3229d2, - 0x2c32a9e0, - 0x2c3329f2, - 0x2c33aa04, - 0x2c342a18, - 0x2c34aa2a, - 0x2c352a45, - 0x2c35aa57, - 0x2c362a6a, + 0x18328e47, + 0x18340e75, + 0x18348e89, + 0x18358ec0, + 0x18368eed, + 0x18370f00, + 0x18378f14, + 0x18380f38, + 0x18388f46, + 0x18390f5c, + 0x18398f70, + 0x183a0f80, + 0x183b0f90, + 0x183b8fa5, + 0x183c8fd0, + 0x183d0fe4, + 0x183d8ff4, + 0x183e0b9b, + 0x183e9001, + 0x183f1013, + 0x183f901e, + 0x1840102e, + 0x1840903f, + 0x18411050, + 0x18419062, + 0x1842108b, + 0x184290bd, + 0x184310cc, + 0x18451135, + 0x1845914b, + 0x18461166, + 0x18468ed8, + 0x184709d9, + 0x18478094, + 0x18480fbc, + 0x18489101, + 0x18490e5d, + 0x18498e9e, + 0x184a119c, + 0x184a9119, + 0x184b10e0, + 0x184b8e37, + 0x184c10a4, + 0x184c866b, + 0x184d1181, + 0x203211c3, + 0x243211cf, + 0x24328907, + 0x243311e1, + 0x243391ee, + 0x243411fb, + 0x2434920d, + 0x2435121c, + 0x24359239, + 0x24361246, + 0x24369254, + 0x24371262, + 0x24379270, + 0x24381279, + 0x24389286, + 0x24391299, + 0x28320b8f, + 0x28328b9b, + 0x28330b6c, + 0x28338bae, + 0x2c322c0b, + 0x2c32ac19, + 0x2c332c2b, + 0x2c33ac3d, + 0x2c342c51, + 0x2c34ac63, + 0x2c352c7e, + 0x2c35ac90, + 0x2c362ca3, 0x2c3682f3, - 0x2c372a77, - 0x2c37aa89, - 0x2c382a9c, - 0x2c38aaaa, - 0x2c392aba, - 0x2c39aacc, - 0x2c3a2ae0, - 0x2c3aaaf1, - 0x2c3b12f8, - 0x2c3bab02, - 0x2c3c2b16, - 0x2c3cab2c, - 0x2c3d2b45, - 0x2c3dab73, - 0x2c3e2b81, - 0x2c3eab99, - 0x2c3f2bb1, - 0x2c3fabbe, - 0x2c402be1, - 0x2c40ac00, - 0x2c411162, - 0x2c41ac11, - 0x2c422c24, - 0x2c4290d4, - 0x2c432c35, + 0x2c372cb0, + 0x2c37acc2, + 0x2c382cd5, + 0x2c38ace3, + 0x2c392cf3, + 0x2c39ad05, + 0x2c3a2d19, + 0x2c3aad2a, + 0x2c3b1359, + 0x2c3bad3b, + 0x2c3c2d4f, + 0x2c3cad65, + 0x2c3d2d7e, + 0x2c3dadac, + 0x2c3e2dba, + 0x2c3eadd2, + 0x2c3f2dea, + 0x2c3fadf7, + 0x2c402e1a, + 0x2c40ae39, + 0x2c4111c3, + 0x2c41ae4a, + 0x2c422e5d, + 0x2c429135, + 0x2c432e6e, 0x2c4386a2, - 0x2c442b62, + 0x2c442d9b, 0x30320000, 0x30328015, 0x3033001f, @@ -1589,240 +314,258 @@ const uint32_t kOpenSSLReasonValues[] = { 0x305c8687, 0x305d0698, 0x305d86a2, - 0x34320abc, - 0x34328ad0, - 0x34330aed, - 0x34338b00, - 0x34340b0f, - 0x34348b2c, + 0x34320ac9, + 0x34328add, + 0x34330afa, + 0x34338b0d, + 0x34340b1c, + 0x34348b39, 0x3c320083, - 0x3c328bb7, - 0x3c330bd0, - 0x3c338beb, - 0x3c340c08, - 0x3c348c23, - 0x3c350c3e, - 0x3c358c53, - 0x3c360c6c, - 0x3c368c84, - 0x3c370c95, - 0x3c378ca3, - 0x3c380cb0, - 0x3c388cc4, - 0x3c390b8e, - 0x3c398cd8, - 0x3c3a0cec, - 0x3c3a8874, - 0x3c3b0cfc, - 0x3c3b8d17, - 0x3c3c0d29, - 0x3c3c8d3f, - 0x3c3d0d49, - 0x3c3d8d5d, - 0x3c3e0d6b, - 0x3c3e8d79, - 0x40321729, - 0x4032973f, - 0x4033176d, - 0x40339777, - 0x4034178e, - 0x403497ac, - 0x403517bc, - 0x403597ce, - 0x403617db, - 0x403697e7, - 0x403717fc, - 0x40379811, - 0x40381823, - 0x4038982e, - 0x40391840, - 0x40398da9, - 0x403a1850, - 0x403a9863, - 0x403b1884, - 0x403b9895, - 0x403c18a5, + 0x3c328bd8, + 0x3c330bf1, + 0x3c338c0c, + 0x3c340c29, + 0x3c348c44, + 0x3c350c5f, + 0x3c358c74, + 0x3c360c8d, + 0x3c368ca5, + 0x3c370cb6, + 0x3c378cc4, + 0x3c380cd1, + 0x3c388ce5, + 0x3c390b9b, + 0x3c398cf9, + 0x3c3a0d0d, + 0x3c3a8881, + 0x3c3b0d1d, + 0x3c3b8d38, + 0x3c3c0d4a, + 0x3c3c8d60, + 0x3c3d0d6a, + 0x3c3d8d7e, + 0x3c3e0d8c, + 0x3c3e8db1, + 0x3c3f0bc4, + 0x3c3f8d9a, + 0x403217d3, + 0x403297e9, + 0x40331817, + 0x40339821, + 0x40341838, + 0x40349856, + 0x40351866, + 0x40359878, + 0x40361885, + 0x40369891, + 0x403718a6, + 0x403798bb, + 0x403818cd, + 0x403898d8, + 0x403918ea, + 0x40398de1, + 0x403a18fa, + 0x403a990d, + 0x403b192e, + 0x403b993f, + 0x403c194f, 0x403c8064, - 0x403d18b1, - 0x403d98cd, - 0x403e18e3, - 0x403e98f2, - 0x403f1905, - 0x403f991f, - 0x4040192d, - 0x40409942, - 0x40411956, - 0x40419973, - 0x4042198c, - 0x404299a7, - 0x404319c0, - 0x404399d3, - 0x404419e7, - 0x404499ff, - 0x40451a0f, - 0x40459a1d, - 0x40461a3b, + 0x403d195b, + 0x403d9977, + 0x403e198d, + 0x403e999c, + 0x403f19af, + 0x403f99c9, + 0x404019d7, + 0x404099ec, + 0x40411a00, + 0x40419a1d, + 0x40421a36, + 0x40429a51, + 0x40431a6a, + 0x40439a7d, + 0x40441a91, + 0x40449aa9, + 0x40451af4, + 0x40459b02, + 0x40461b20, 0x40468094, - 0x40471a50, - 0x40479a62, - 0x40481a86, - 0x40489aa6, - 0x40491aba, - 0x40499acf, - 0x404a1ae8, - 0x404a9b0b, - 0x404b1b25, - 0x404b9b43, - 0x404c1b5e, - 0x404c9b78, - 0x404d1b8f, - 0x404d9bb7, - 0x404e1bce, - 0x404e9bea, - 0x404f1c06, - 0x404f9c27, - 0x40501c49, - 0x40509c65, - 0x40511c79, - 0x40519c86, - 0x40521c9d, - 0x40529cad, - 0x40531cbd, - 0x40539cd1, - 0x40541cec, - 0x40549cfc, - 0x40551d13, - 0x40559d22, - 0x40561d3d, - 0x40569d55, - 0x40571d71, - 0x40579d8a, - 0x40581d9d, - 0x40589db2, - 0x40591dd5, - 0x40599de3, - 0x405a1df0, - 0x405a9e09, - 0x405b1e21, - 0x405b9e34, - 0x405c1e49, - 0x405c9e5b, - 0x405d1e70, - 0x405d9e80, - 0x405e1e99, - 0x405e9ead, - 0x405f1ebd, - 0x405f9ed5, - 0x40601ee6, - 0x40609ef9, - 0x40611f0a, - 0x40619f28, - 0x40621f39, - 0x40629f46, - 0x40631f5d, - 0x40639f7d, - 0x40641f94, - 0x40649fa1, - 0x40651faf, - 0x40659fd1, - 0x40661ff9, - 0x4066a00e, - 0x40672025, - 0x4067a036, - 0x40682047, - 0x4068a058, - 0x4069206d, - 0x4069a084, - 0x406a2095, - 0x406aa0ae, - 0x406b20c9, - 0x406ba0e0, - 0x406c20f8, - 0x406ca119, - 0x406d212c, - 0x406da14d, - 0x406e2168, - 0x406ea183, - 0x406f21a4, - 0x406fa1ca, - 0x407021ea, - 0x4070a206, - 0x40712393, - 0x4071a3b6, - 0x407223cc, - 0x4072a3eb, - 0x40732403, - 0x4073a423, - 0x4074264d, - 0x4074a672, - 0x4075268d, - 0x4075a6ac, - 0x407626db, - 0x4076a703, - 0x4077271c, - 0x4077a73b, - 0x40782760, - 0x4078a777, - 0x4079278a, - 0x4079a7a7, + 0x40471b35, + 0x40479b47, + 0x40481b6b, + 0x40489b99, + 0x40491bad, + 0x40499bc2, + 0x404a1bdb, + 0x404a9c15, + 0x404b1c46, + 0x404b9c7c, + 0x404c1c97, + 0x404c9cb1, + 0x404d1cc8, + 0x404d9cf0, + 0x404e1d07, + 0x404e9d23, + 0x404f1d3f, + 0x404f9d60, + 0x40501d82, + 0x40509d9e, + 0x40511db2, + 0x40519dbf, + 0x40521dd6, + 0x40529de6, + 0x40531df6, + 0x40539e0a, + 0x40541e25, + 0x40549e35, + 0x40551e4c, + 0x40559e5b, + 0x40561e88, + 0x40569ea0, + 0x40571ebc, + 0x40579ed5, + 0x40581ee8, + 0x40589efd, + 0x40591f20, + 0x40599f4b, + 0x405a1f58, + 0x405a9f71, + 0x405b1f89, + 0x405b9f9c, + 0x405c1fb1, + 0x405c9fc3, + 0x405d1fd8, + 0x405d9fe8, + 0x405e2001, + 0x405ea015, + 0x405f2025, + 0x405fa03d, + 0x4060204e, + 0x4060a061, + 0x40612072, + 0x4061a090, + 0x406220a1, + 0x4062a0ae, + 0x406320c5, + 0x4063a106, + 0x4064211d, + 0x4064a12a, + 0x40652138, + 0x4065a15a, + 0x40662182, + 0x4066a197, + 0x406721ae, + 0x4067a1bf, + 0x406821d0, + 0x4068a1e1, + 0x406921f6, + 0x4069a20d, + 0x406a221e, + 0x406aa237, + 0x406b2252, + 0x406ba269, + 0x406c22d6, + 0x406ca2f7, + 0x406d230a, + 0x406da32b, + 0x406e2346, + 0x406ea38f, + 0x406f23b0, + 0x406fa3d6, + 0x407023f6, + 0x4070a412, + 0x4071259f, + 0x4071a5c2, + 0x407225d8, + 0x4072a5f7, + 0x4073260f, + 0x4073a62f, + 0x40742859, + 0x4074a87e, + 0x40752899, + 0x4075a8b8, + 0x407628e7, + 0x4076a90f, + 0x40772940, + 0x4077a95f, + 0x40782999, + 0x4078a9b0, + 0x407929c3, + 0x4079a9e0, 0x407a0782, - 0x407aa7b9, - 0x407b27cc, - 0x407ba7e5, - 0x407c27fd, - 0x407c905c, - 0x407d2811, - 0x407da82b, - 0x407e283c, - 0x407ea850, - 0x407f285e, - 0x407fa879, - 0x40801225, - 0x4080a89e, - 0x408128c0, - 0x4081a8db, - 0x408228f0, - 0x4082a908, - 0x40832920, - 0x4083a937, - 0x4084294d, - 0x4084a959, - 0x4085296c, - 0x4085a981, - 0x40862993, - 0x4086a9a8, - 0x408729b1, - 0x40879ba5, - 0x41f422be, - 0x41f92350, - 0x41fe2243, - 0x41fea474, - 0x41ff2565, - 0x420322d7, - 0x420822f9, - 0x4208a335, - 0x42092227, - 0x4209a36f, - 0x420a227e, - 0x420aa25e, - 0x420b229e, - 0x420ba317, - 0x420c2581, - 0x420ca441, - 0x420d245b, - 0x420da492, - 0x421224ac, - 0x42172548, - 0x4217a4ee, - 0x421c2510, - 0x421f24cb, - 0x42212598, - 0x4226252b, - 0x422b2631, - 0x422ba5fa, - 0x422c2619, - 0x422ca5d4, - 0x422d25b3, + 0x407aa9f2, + 0x407b2a05, + 0x407baa1e, + 0x407c2a36, + 0x407c90bd, + 0x407d2a4a, + 0x407daa64, + 0x407e2a75, + 0x407eaa89, + 0x407f2a97, + 0x407faab2, + 0x40801286, + 0x4080aad7, + 0x40812af9, + 0x4081ab14, + 0x40822b29, + 0x4082ab41, + 0x40832b59, + 0x4083ab70, + 0x40842b86, + 0x4084ab92, + 0x40852ba5, + 0x4085abba, + 0x40862bcc, + 0x4086abe1, + 0x40872bea, + 0x40879cde, + 0x40880083, + 0x4088a0e5, + 0x40890a17, + 0x4089a281, + 0x408a1bfe, + 0x408aa2ab, + 0x408b2928, + 0x408ba984, + 0x408c2361, + 0x408c9c2f, + 0x408d1c64, + 0x408d9e76, + 0x408e1ab9, + 0x408e9add, + 0x408f1f2e, + 0x408f9b8b, + 0x41f424ca, + 0x41f9255c, + 0x41fe244f, + 0x41fea680, + 0x41ff2771, + 0x420324e3, + 0x42082505, + 0x4208a541, + 0x42092433, + 0x4209a57b, + 0x420a248a, + 0x420aa46a, + 0x420b24aa, + 0x420ba523, + 0x420c278d, + 0x420ca64d, + 0x420d2667, + 0x420da69e, + 0x421226b8, + 0x42172754, + 0x4217a6fa, + 0x421c271c, + 0x421f26d7, + 0x422127a4, + 0x42262737, + 0x422b283d, + 0x422ba806, + 0x422c2825, + 0x422ca7e0, + 0x422d27bf, 0x443206ad, 0x443286bc, 0x443306c8, @@ -1840,132 +583,135 @@ const uint32_t kOpenSSLReasonValues[] = { 0x44390782, 0x44398790, 0x443a07a3, - 0x4c32124f, - 0x4c32925f, - 0x4c331272, - 0x4c339292, + 0x4c3212b0, + 0x4c3292c0, + 0x4c3312d3, + 0x4c3392f3, 0x4c340094, 0x4c3480b0, - 0x4c35129e, - 0x4c3592ac, - 0x4c3612c8, - 0x4c3692db, - 0x4c3712ea, - 0x4c3792f8, - 0x4c38130d, - 0x4c389319, - 0x4c391339, - 0x4c399363, - 0x4c3a137c, - 0x4c3a9395, + 0x4c3512ff, + 0x4c35930d, + 0x4c361329, + 0x4c36933c, + 0x4c37134b, + 0x4c379359, + 0x4c38136e, + 0x4c38937a, + 0x4c39139a, + 0x4c3993c4, + 0x4c3a13dd, + 0x4c3a93f6, 0x4c3b05d0, - 0x4c3b93ae, - 0x4c3c13c0, - 0x4c3c93cf, - 0x4c3d105c, - 0x4c3d93e8, - 0x4c3e13f5, - 0x50322c47, - 0x5032ac56, - 0x50332c61, - 0x5033ac71, - 0x50342c8a, - 0x5034aca4, - 0x50352cb2, - 0x5035acc8, - 0x50362cda, - 0x5036acf0, - 0x50372d09, - 0x5037ad1c, - 0x50382d34, - 0x5038ad45, - 0x50392d5a, - 0x5039ad6e, - 0x503a2d8e, - 0x503aada4, - 0x503b2dbc, - 0x503badce, - 0x503c2dea, - 0x503cae01, - 0x503d2e1a, - 0x503dae30, - 0x503e2e3d, - 0x503eae53, - 0x503f2e65, + 0x4c3b940f, + 0x4c3c1421, + 0x4c3c9430, + 0x4c3d10bd, + 0x4c3d9449, + 0x4c3e1456, + 0x50322e80, + 0x5032ae8f, + 0x50332e9a, + 0x5033aeaa, + 0x50342ec3, + 0x5034aedd, + 0x50352eeb, + 0x5035af01, + 0x50362f13, + 0x5036af29, + 0x50372f42, + 0x5037af55, + 0x50382f6d, + 0x5038af7e, + 0x50392f93, + 0x5039afa7, + 0x503a2fc7, + 0x503aafdd, + 0x503b2ff5, + 0x503bb007, + 0x503c3023, + 0x503cb03a, + 0x503d3053, + 0x503db069, + 0x503e3076, + 0x503eb08c, + 0x503f309e, 0x503f8348, - 0x50402e78, - 0x5040ae88, - 0x50412ea2, - 0x5041aeb1, - 0x50422ecb, - 0x5042aee8, - 0x50432ef8, - 0x5043af08, - 0x50442f17, + 0x504030b1, + 0x5040b0c1, + 0x504130db, + 0x5041b0ea, + 0x50423104, + 0x5042b121, + 0x50433131, + 0x5043b141, + 0x50443150, 0x50448414, - 0x50452f2b, - 0x5045af49, - 0x50462f5c, - 0x5046af72, - 0x50472f84, - 0x5047af99, - 0x50482fbf, - 0x5048afcd, - 0x50492fe0, - 0x5049aff5, - 0x504a300b, - 0x504ab01b, - 0x504b303b, - 0x504bb04e, - 0x504c3071, - 0x504cb09f, - 0x504d30b1, - 0x504db0ce, - 0x504e30e9, - 0x504eb105, - 0x504f3117, - 0x504fb12e, - 0x5050313d, + 0x50453164, + 0x5045b182, + 0x50463195, + 0x5046b1ab, + 0x504731bd, + 0x5047b1d2, + 0x504831f8, + 0x5048b206, + 0x50493219, + 0x5049b22e, + 0x504a3244, + 0x504ab254, + 0x504b3274, + 0x504bb287, + 0x504c32aa, + 0x504cb2d8, + 0x504d32ea, + 0x504db307, + 0x504e3322, + 0x504eb33e, + 0x504f3350, + 0x504fb367, + 0x50503376, 0x50508687, - 0x50513150, - 0x58320de7, - 0x68320da9, - 0x68328b8e, - 0x68330ba1, - 0x68338db7, - 0x68340dc7, - 0x6c320d85, - 0x6c328b71, - 0x6c330d90, - 0x74320980, - 0x783208e5, - 0x783288fa, - 0x78330906, + 0x50513389, + 0x58320e1f, + 0x68320de1, + 0x68328b9b, + 0x68330bae, + 0x68338def, + 0x68340dff, + 0x683480b0, + 0x6c320dbd, + 0x6c328b7e, + 0x6c330dc8, + 0x7432098d, + 0x783208f2, + 0x78328907, + 0x78330913, 0x78338083, - 0x78340915, - 0x7834892a, - 0x78350949, - 0x7835896b, - 0x78360980, - 0x78368996, - 0x783709a6, - 0x783789b9, - 0x783809cc, - 0x783889de, - 0x783909eb, - 0x78398a0a, - 0x783a0a1f, - 0x783a8a2d, - 0x783b0a37, - 0x783b8a4b, - 0x783c0a62, - 0x783c8a77, - 0x783d0a8e, - 0x783d8aa3, - 0x783e09f9, - 0x80321151, + 0x78340922, + 0x78348937, + 0x78350956, + 0x78358978, + 0x7836098d, + 0x783689a3, + 0x783709b3, + 0x783789c6, + 0x783809d9, + 0x783889eb, + 0x783909f8, + 0x78398a17, + 0x783a0a2c, + 0x783a8a3a, + 0x783b0a44, + 0x783b8a58, + 0x783c0a6f, + 0x783c8a84, + 0x783d0a9b, + 0x783d8ab0, + 0x783e0a06, + 0x7c3211b2, }; + const size_t kOpenSSLReasonValuesLen = sizeof(kOpenSSLReasonValues) / sizeof(kOpenSSLReasonValues[0]); + const char kOpenSSLReasonStringData[] = "ASN1_LENGTH_MISMATCH\0" "AUX_ERROR\0" @@ -2073,6 +819,7 @@ const char kOpenSSLReasonStringData[] = "UNSUPPORTED_METHOD\0" "WRITE_TO_READ_ONLY_BIO\0" "ARG2_LT_ARG3\0" + "BAD_ENCODING\0" "BAD_RECIPROCAL\0" "BIGNUM_TOO_LONG\0" "BITS_TOO_SMALL\0" @@ -2126,6 +873,7 @@ const char kOpenSSLReasonStringData[] = "BAD_Q_VALUE\0" "MISSING_PARAMETERS\0" "NEED_NEW_SETUP_VALUES\0" + "BIGNUM_OUT_OF_RANGE\0" "COORDINATES_OUT_OF_RANGE\0" "D2I_ECPKPARAMETERS_FAILURE\0" "EC_GROUP_NEW_BY_NAME_FAILURE\0" @@ -2148,6 +896,7 @@ const char kOpenSSLReasonStringData[] = "UNDEFINED_GENERATOR\0" "UNKNOWN_GROUP\0" "UNKNOWN_ORDER\0" + "WRONG_CURVE_PARAMETERS\0" "WRONG_ORDER\0" "KDF_FAILED\0" "POINT_ARITHMETIC_FAILURE\0" @@ -2155,6 +904,7 @@ const char kOpenSSLReasonStringData[] = "NOT_IMPLEMENTED\0" "RANDOM_NUMBER_GENERATION_FAILED\0" "OPERATION_NOT_SUPPORTED\0" + "BN_DECODE_ERROR\0" "COMMAND_NOT_SUPPORTED\0" "CONTEXT_NOT_INITIALISED\0" "DIFFERENT_KEY_TYPES\0" @@ -2184,6 +934,7 @@ const char kOpenSSLReasonStringData[] = "NO_PARAMETERS_SET\0" "OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE\0" "OPERATON_NOT_INITIALIZED\0" + "PARAMETER_ENCODING_ERROR\0" "UNKNOWN_DIGEST\0" "UNKNOWN_MASK_DIGEST\0" "UNKNOWN_MESSAGE_DIGEST_ALGORITHM\0" @@ -2235,8 +986,10 @@ const char kOpenSSLReasonStringData[] = "BAD_FIXED_HEADER_DECRYPT\0" "BAD_PAD_BYTE_COUNT\0" "BAD_RSA_PARAMETERS\0" + "BAD_VERSION\0" "BLOCK_TYPE_IS_NOT_01\0" "BN_NOT_INITIALIZED\0" + "CANNOT_RECOVER_MULTI_PRIME_KEY\0" "CRT_PARAMS_ALREADY_GIVEN\0" "CRT_VALUES_INCORRECT\0" "DATA_LEN_NOT_EQUAL_TO_MOD_LEN\0" @@ -2254,6 +1007,7 @@ const char kOpenSSLReasonStringData[] = "INVALID_MESSAGE_LENGTH\0" "KEY_SIZE_TOO_SMALL\0" "LAST_OCTET_INVALID\0" + "MUST_HAVE_AT_LEAST_TWO_PRIMES\0" "NO_PUBLIC_EXPONENT\0" "NULL_BEFORE_BLOCK_MISSING\0" "N_NOT_EQUAL_P_Q\0" @@ -2304,18 +1058,24 @@ const char kOpenSSLReasonStringData[] = "CONNECTION_REJECTED\0" "CONNECTION_TYPE_NOT_SET\0" "COOKIE_MISMATCH\0" + "CUSTOM_EXTENSION_CONTENTS_TOO_LARGE\0" + "CUSTOM_EXTENSION_ERROR\0" "D2I_ECDSA_SIG\0" "DATA_BETWEEN_CCS_AND_FINISHED\0" "DATA_LENGTH_TOO_LONG\0" "DECRYPTION_FAILED\0" "DECRYPTION_FAILED_OR_BAD_RECORD_MAC\0" "DH_PUBLIC_VALUE_LENGTH_IS_WRONG\0" + "DH_P_TOO_LONG\0" "DIGEST_CHECK_FAILED\0" "DTLS_MESSAGE_TOO_BIG\0" "ECC_CERT_NOT_FOR_SIGNING\0" "EMPTY_SRTP_PROTECTION_PROFILE_LIST\0" + "EMS_STATE_INCONSISTENT\0" "ENCRYPTED_LENGTH_TOO_LONG\0" + "ERROR_ADDING_EXTENSION\0" "ERROR_IN_RECEIVED_CIPHER_LIST\0" + "ERROR_PARSING_EXTENSION\0" "EVP_DIGESTSIGNFINAL_FAILED\0" "EVP_DIGESTSIGNINIT_FAILED\0" "EXCESSIVE_MESSAGE_SIZE\0" @@ -2338,6 +1098,7 @@ const char kOpenSSLReasonStringData[] = "LIBRARY_HAS_NO_CIPHERS\0" "MISSING_DH_KEY\0" "MISSING_ECDSA_SIGNING_CERT\0" + "MISSING_EXTENSION\0" "MISSING_RSA_CERTIFICATE\0" "MISSING_RSA_ENCRYPTING_CERT\0" "MISSING_RSA_SIGNING_CERT\0" @@ -2345,6 +1106,7 @@ const char kOpenSSLReasonStringData[] = "MISSING_TMP_ECDH_KEY\0" "MIXED_SPECIAL_OPERATOR_WITH_GROUPS\0" "MTU_TOO_SMALL\0" + "NEGOTIATED_BOTH_NPN_AND_ALPN\0" "NESTED_GROUP\0" "NO_CERTIFICATES_RETURNED\0" "NO_CERTIFICATE_ASSIGNED\0" @@ -2365,6 +1127,7 @@ const char kOpenSSLReasonStringData[] = "NULL_SSL_CTX\0" "NULL_SSL_METHOD_PASSED\0" "OLD_SESSION_CIPHER_NOT_RETURNED\0" + "OLD_SESSION_VERSION_NOT_RETURNED\0" "PACKET_LENGTH_TOO_LONG\0" "PARSE_TLSEXT\0" "PATH_TOO_LONG\0" @@ -2382,11 +1145,14 @@ const char kOpenSSLReasonStringData[] = "RENEGOTIATION_ENCODING_ERR\0" "RENEGOTIATION_MISMATCH\0" "REQUIRED_CIPHER_MISSING\0" + "RESUMED_EMS_SESSION_WITHOUT_EMS_EXTENSION\0" + "RESUMED_NON_EMS_SESSION_WITH_EMS_EXTENSION\0" "SCSV_RECEIVED_WHEN_RENEGOTIATING\0" "SERVERHELLO_TLSEXT\0" "SESSION_ID_CONTEXT_UNINITIALIZED\0" "SESSION_MAY_NOT_BE_CREATED\0" "SIGNATURE_ALGORITHMS_ERROR\0" + "SIGNATURE_ALGORITHMS_EXTENSION_SENT_BY_SERVER\0" "SRTP_COULD_NOT_ALLOCATE_PROFILES\0" "SRTP_PROTECTION_PROFILE_LIST_TOO_LONG\0" "SRTP_UNKNOWN_PROTECTION_PROFILE\0" @@ -2434,8 +1200,10 @@ const char kOpenSSLReasonStringData[] = "TLS_PEER_DID_NOT_RESPOND_WITH_CERTIFICATE_LIST\0" "TLS_RSA_ENCRYPTED_VALUE_LENGTH_IS_WRONG\0" "TOO_MANY_EMPTY_FRAGMENTS\0" + "TOO_MANY_WARNING_ALERTS\0" "UNABLE_TO_FIND_ECDH_PARAMETERS\0" "UNABLE_TO_FIND_PUBLIC_KEY_PARAMETERS\0" + "UNEXPECTED_EXTENSION\0" "UNEXPECTED_GROUP_CLOSE\0" "UNEXPECTED_MESSAGE\0" "UNEXPECTED_OPERATOR_IN_GROUP\0" diff --git a/third_party/serf/linux-aarch64/crypto/aes/aesv8-armx.S b/third_party/serf/linux-aarch64/crypto/aes/aesv8-armx64.S similarity index 99% rename from third_party/serf/linux-aarch64/crypto/aes/aesv8-armx.S rename to third_party/serf/linux-aarch64/crypto/aes/aesv8-armx64.S index 9c63291070..fa2abbccc0 100644 --- a/third_party/serf/linux-aarch64/crypto/aes/aesv8-armx.S +++ b/third_party/serf/linux-aarch64/crypto/aes/aesv8-armx64.S @@ -1,4 +1,5 @@ -#include "arm_arch.h" +#if defined(__aarch64__) +#include #if __ARM_MAX_ARCH__>=7 .text @@ -747,3 +748,4 @@ aes_v8_ctr32_encrypt_blocks: ret .size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks #endif +#endif \ No newline at end of file diff --git a/third_party/serf/linux-aarch64/crypto/bn/armv8-mont.S b/third_party/serf/linux-aarch64/crypto/bn/armv8-mont.S new file mode 100644 index 0000000000..9355ce79c9 --- /dev/null +++ b/third_party/serf/linux-aarch64/crypto/bn/armv8-mont.S @@ -0,0 +1,1406 @@ +#if defined(__aarch64__) +.text + +.globl bn_mul_mont +.type bn_mul_mont,%function +.align 5 +bn_mul_mont: + tst x5,#7 + b.eq __bn_sqr8x_mont + tst x5,#3 + b.eq __bn_mul4x_mont +.Lmul_mont: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr x9,[x2],#8 // bp[0] + sub x22,sp,x5,lsl#3 + ldp x7,x8,[x1],#16 // ap[0..1] + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + and x22,x22,#-16 // ABI says so + ldp x13,x14,[x3],#16 // np[0..1] + + mul x6,x7,x9 // ap[0]*bp[0] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + mul x10,x8,x9 // ap[1]*bp[0] + umulh x11,x8,x9 + + mul x15,x6,x4 // "tp[0]"*n0 + mov sp,x22 // alloca + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // x6 being non-zero. So that carry can be calculated + // by adding -1 to x6. That's what next instruction does. + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr + cbz x21,.L1st_skip + +.L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,.L1st + +.L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] + +.Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,.Linner_skip + +.Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 + ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,.Linner + +.Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,.Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 +.Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,.Lsub + + sbcs x8,x23,x14 + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +.Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] + cbnz x5,.Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret +.size bn_mul_mont,.-bn_mul_mont +.type __bn_sqr8x_mont,%function +.align 5 +__bn_sqr8x_mont: + cmp x1,x2 + b.ne __bn_mul4x_mont +.Lsqr8x_mont: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b .Lsqr8x_zero_start + +.Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +.Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,.Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +.Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs x24,x24,x14 + umulh x14,x8,x6 + adcs x25,x25,x15 + umulh x15,x9,x6 + adcs x26,x26,x16 + umulh x16,x10,x6 + stp x19,x20,[x2],#8*2 // t[0..1] + adc x19,xzr,xzr // t[8] + adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) + umulh x17,x11,x6 + adcs x22,x22,x14 + umulh x14,x12,x6 + adcs x23,x23,x15 + umulh x15,x13,x6 + adcs x24,x24,x16 + mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) + adcs x25,x25,x17 + mul x17,x9,x7 + adcs x26,x26,x14 + mul x14,x10,x7 + adc x19,x19,x15 + + mul x15,x11,x7 + adds x22,x22,x16 + mul x16,x12,x7 + adcs x23,x23,x17 + mul x17,x13,x7 + adcs x24,x24,x14 + umulh x14,x8,x7 // hi(a[2..7]*a[1]) + adcs x25,x25,x15 + umulh x15,x9,x7 + adcs x26,x26,x16 + umulh x16,x10,x7 + adcs x19,x19,x17 + umulh x17,x11,x7 + stp x21,x22,[x2],#8*2 // t[2..3] + adc x20,xzr,xzr // t[9] + adds x23,x23,x14 + umulh x14,x12,x7 + adcs x24,x24,x15 + umulh x15,x13,x7 + adcs x25,x25,x16 + mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) + adcs x26,x26,x17 + mul x17,x10,x8 + adcs x19,x19,x14 + mul x14,x11,x8 + adc x20,x20,x15 + + mul x15,x12,x8 + adds x24,x24,x16 + mul x16,x13,x8 + adcs x25,x25,x17 + umulh x17,x9,x8 // hi(a[3..7]*a[2]) + adcs x26,x26,x14 + umulh x14,x10,x8 + adcs x19,x19,x15 + umulh x15,x11,x8 + adcs x20,x20,x16 + umulh x16,x12,x8 + stp x23,x24,[x2],#8*2 // t[4..5] + adc x21,xzr,xzr // t[10] + adds x25,x25,x17 + umulh x17,x13,x8 + adcs x26,x26,x14 + mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) + adcs x19,x19,x15 + mul x15,x11,x9 + adcs x20,x20,x16 + mul x16,x12,x9 + adc x21,x21,x17 + + mul x17,x13,x9 + adds x26,x26,x14 + umulh x14,x10,x9 // hi(a[4..7]*a[3]) + adcs x19,x19,x15 + umulh x15,x11,x9 + adcs x20,x20,x16 + umulh x16,x12,x9 + adcs x21,x21,x17 + umulh x17,x13,x9 + stp x25,x26,[x2],#8*2 // t[6..7] + adc x22,xzr,xzr // t[11] + adds x19,x19,x14 + mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) + adcs x20,x20,x15 + mul x15,x12,x10 + adcs x21,x21,x16 + mul x16,x13,x10 + adc x22,x22,x17 + + umulh x17,x11,x10 // hi(a[5..7]*a[4]) + adds x20,x20,x14 + umulh x14,x12,x10 + adcs x21,x21,x15 + umulh x15,x13,x10 + adcs x22,x22,x16 + mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) + adc x23,xzr,xzr // t[12] + adds x21,x21,x17 + mul x17,x13,x11 + adcs x22,x22,x14 + umulh x14,x12,x11 // hi(a[6..7]*a[5]) + adc x23,x23,x15 + + umulh x15,x13,x11 + adds x22,x22,x16 + mul x16,x13,x12 // lo(a[7]*a[6]) (vii) + adcs x23,x23,x17 + umulh x17,x13,x12 // hi(a[7]*a[6]) + adc x24,xzr,xzr // t[13] + adds x23,x23,x14 + sub x27,x3,x1 // done yet? + adc x24,x24,x15 + + adds x24,x24,x16 + sub x14,x3,x5 // rewinded ap + adc x25,xzr,xzr // t[14] + add x25,x25,x17 + + cbz x27,.Lsqr8x_outer_break + + mov x4,x6 + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x0,x1 + adcs x26,xzr,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved below + mov x27,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ +.Lsqr8x_mul: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,.Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp x1,x3 // done yet? + b.eq .Lsqr8x_break + + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + ldr x4,[x0,#-8*8] + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b .Lsqr8x_mul + +.align 4 +.Lsqr8x_break: + ldp x6,x7,[x0,#8*0] + add x1,x0,#8*8 + ldp x8,x9,[x0,#8*2] + sub x14,x3,x1 // is it last iteration? + ldp x10,x11,[x0,#8*4] + sub x15,x2,x14 + ldp x12,x13,[x0,#8*6] + cbz x14,.Lsqr8x_outer_loop + + stp x19,x20,[x2,#8*0] + ldp x19,x20,[x15,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x15,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x15,#8*4] + stp x25,x26,[x2,#8*6] + mov x2,x15 + ldp x25,x26,[x15,#8*6] + b .Lsqr8x_outer_loop + +.align 4 +.Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] + ldp x15,x16,[sp,#8*1] + ldp x11,x13,[x14,#8*2] + add x1,x14,#8*4 + ldp x17,x14,[sp,#8*3] + + stp x19,x20,[x2,#8*0] + mul x19,x7,x7 + stp x21,x22,[x2,#8*2] + umulh x7,x7,x7 + stp x23,x24,[x2,#8*4] + mul x8,x9,x9 + stp x25,x26,[x2,#8*6] + mov x2,sp + umulh x9,x9,x9 + adds x20,x7,x15,lsl#1 + extr x15,x16,x15,#63 + sub x27,x5,#8*4 + +.Lsqr4x_shift_n_add: + adcs x21,x8,x15 + extr x16,x17,x16,#63 + sub x27,x27,#8*4 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + ldp x7,x9,[x1],#8*2 + umulh x11,x11,x11 + mul x12,x13,x13 + umulh x13,x13,x13 + extr x17,x14,x17,#63 + stp x19,x20,[x2,#8*0] + adcs x23,x10,x17 + extr x14,x15,x14,#63 + stp x21,x22,[x2,#8*2] + adcs x24,x11,x14 + ldp x17,x14,[x2,#8*7] + extr x15,x16,x15,#63 + adcs x25,x12,x15 + extr x16,x17,x16,#63 + adcs x26,x13,x16 + ldp x15,x16,[x2,#8*9] + mul x6,x7,x7 + ldp x11,x13,[x1],#8*2 + umulh x7,x7,x7 + mul x8,x9,x9 + umulh x9,x9,x9 + stp x23,x24,[x2,#8*4] + extr x17,x14,x17,#63 + stp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + adcs x19,x6,x17 + extr x14,x15,x14,#63 + adcs x20,x7,x14 + ldp x17,x14,[x2,#8*3] + extr x15,x16,x15,#63 + cbnz x27,.Lsqr4x_shift_n_add + ldp x1,x4,[x29,#104] // pull np and n0 + + adcs x21,x8,x15 + extr x16,x17,x16,#63 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + umulh x11,x11,x11 + stp x19,x20,[x2,#8*0] + mul x12,x13,x13 + umulh x13,x13,x13 + stp x21,x22,[x2,#8*2] + extr x17,x14,x17,#63 + adcs x23,x10,x17 + extr x14,x15,x14,#63 + ldp x19,x20,[sp,#8*0] + adcs x24,x11,x14 + extr x15,x16,x15,#63 + ldp x6,x7,[x1,#8*0] + adcs x25,x12,x15 + extr x16,xzr,x16,#63 + ldp x8,x9,[x1,#8*2] + adc x26,x13,x16 + ldp x10,x11,[x1,#8*4] + + // Reduce by 512 bits per iteration + mul x28,x4,x19 // t[0]*n0 + ldp x12,x13,[x1,#8*6] + add x3,x1,x5 + ldp x21,x22,[sp,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[sp,#8*4] + stp x25,x26,[x2,#8*6] + ldp x25,x26,[sp,#8*6] + add x1,x1,#8*8 + mov x30,xzr // initial top-most carry + mov x2,sp + mov x27,#8 + +.Lsqr8x_reduction: + // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) + mul x15,x7,x28 + sub x27,x27,#1 + mul x16,x8,x28 + str x28,[x2],#8 // put aside t[0]*n0 for tail processing + mul x17,x9,x28 + // (*) adds xzr,x19,x14 + subs xzr,x19,#1 // (*) + mul x14,x10,x28 + adcs x19,x20,x15 + mul x15,x11,x28 + adcs x20,x21,x16 + mul x16,x12,x28 + adcs x21,x22,x17 + mul x17,x13,x28 + adcs x22,x23,x14 + umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) + adcs x23,x24,x15 + umulh x15,x7,x28 + adcs x24,x25,x16 + umulh x16,x8,x28 + adcs x25,x26,x17 + umulh x17,x9,x28 + adc x26,xzr,xzr + adds x19,x19,x14 + umulh x14,x10,x28 + adcs x20,x20,x15 + umulh x15,x11,x28 + adcs x21,x21,x16 + umulh x16,x12,x28 + adcs x22,x22,x17 + umulh x17,x13,x28 + mul x28,x4,x19 // next t[0]*n0 + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adc x26,x26,x17 + cbnz x27,.Lsqr8x_reduction + + ldp x14,x15,[x2,#8*0] + ldp x16,x17,[x2,#8*2] + mov x0,x2 + sub x27,x3,x1 // done yet? + adds x19,x19,x14 + adcs x20,x20,x15 + ldp x14,x15,[x2,#8*4] + adcs x21,x21,x16 + adcs x22,x22,x17 + ldp x16,x17,[x2,#8*6] + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adcs x26,x26,x17 + //adc x28,xzr,xzr // moved below + cbz x27,.Lsqr8x8_post_condition + + ldr x4,[x2,#-8*8] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + mov x27,#-8*8 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + +.Lsqr8x_tail: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,.Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp x6,x7,[x2,#8*0] + sub x27,x3,x1 // done yet? + sub x16,x3,x5 // rewinded np + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + cbz x27,.Lsqr8x_tail_break + + ldr x4,[x0,#-8*8] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b .Lsqr8x_tail + +.align 4 +.Lsqr8x_tail_break: + ldr x4,[x29,#112] // pull n0 + add x27,x2,#8*8 // end of current t[num] window + + subs xzr,x30,#1 // "move" top-most carry to carry bit + adcs x14,x19,x6 + adcs x15,x20,x7 + ldp x19,x20,[x0,#8*0] + adcs x21,x21,x8 + ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + adcs x22,x22,x9 + ldp x8,x9,[x16,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x16,#8*4] + adcs x25,x25,x12 + adcs x26,x26,x13 + ldp x12,x13,[x16,#8*6] + add x1,x16,#8*8 + adc x30,xzr,xzr // top-most carry + mul x28,x4,x19 + stp x14,x15,[x2,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x0,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x0,#8*4] + cmp x27,x29 // did we hit the bottom? + stp x25,x26,[x2,#8*6] + mov x2,x0 // slide the window + ldp x25,x26,[x0,#8*6] + mov x27,#8 + b.ne .Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr x0,[x29,#96] // pull rp + add x2,x2,#8*8 + subs x14,x19,x6 + sbcs x15,x20,x7 + sub x27,x5,#8*8 + mov x3,x0 // x0 copy + +.Lsqr8x_sub: + sbcs x16,x21,x8 + ldp x6,x7,[x1,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x1,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x10,x11,[x1,#8*4] + sbcs x17,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + ldp x19,x20,[x2,#8*0] + sub x27,x27,#8*8 + ldp x21,x22,[x2,#8*2] + ldp x23,x24,[x2,#8*4] + ldp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + stp x14,x15,[x0,#8*4] + sbcs x14,x19,x6 + stp x16,x17,[x0,#8*6] + add x0,x0,#8*8 + sbcs x15,x20,x7 + cbnz x27,.Lsqr8x_sub + + sbcs x16,x21,x8 + mov x2,sp + add x1,sp,x5 + ldp x6,x7,[x3,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x3,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x19,x20,[x1,#8*0] + sbcs x17,x26,x13 + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + stp x14,x15,[x0,#8*4] + stp x16,x17,[x0,#8*6] + + sub x27,x5,#8*4 +.Lsqr4x_cond_copy: + sub x27,x27,#8*4 + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + ldp x6,x7,[x3,#8*4] + ldp x19,x20,[x1,#8*4] + csel x16,x21,x8,lo + stp xzr,xzr,[x2,#8*2] + add x2,x2,#8*4 + csel x17,x22,x9,lo + ldp x8,x9,[x3,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + stp xzr,xzr,[x1,#8*0] + stp xzr,xzr,[x1,#8*2] + cbnz x27,.Lsqr4x_cond_copy + + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + stp xzr,xzr,[x2,#8*2] + csel x16,x21,x8,lo + csel x17,x22,x9,lo + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + + b .Lsqr8x_done + +.align 4 +.Lsqr8x8_post_condition: + adc x28,xzr,xzr + ldr x30,[x29,#8] // pull return address + // x19-7,x28 hold result, x6-7 hold modulus + subs x6,x19,x6 + ldr x1,[x29,#96] // pull rp + sbcs x7,x20,x7 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x8 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x9 + stp xzr,xzr,[sp,#8*4] + sbcs x10,x23,x10 + stp xzr,xzr,[sp,#8*6] + sbcs x11,x24,x11 + stp xzr,xzr,[sp,#8*8] + sbcs x12,x25,x12 + stp xzr,xzr,[sp,#8*10] + sbcs x13,x26,x13 + stp xzr,xzr,[sp,#8*12] + sbcs x28,x28,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // x6-7 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + csel x10,x23,x10,lo + csel x11,x24,x11,lo + stp x8,x9,[x1,#8*2] + csel x12,x25,x12,lo + csel x13,x26,x13,lo + stp x10,x11,[x1,#8*4] + stp x12,x13,[x1,#8*6] + +.Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + ret +.size __bn_sqr8x_mont,.-__bn_sqr8x_mont +.type __bn_mul4x_mont,%function +.align 5 +__bn_mul4x_mont: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub x26,sp,x5,lsl#3 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + sub sp,x26,#8*4 // alloca + + add x10,x2,x5 + add x27,x1,x5 + stp x0,x10,[x29,#96] // offload rp and &b[num] + + ldr x24,[x2,#8*0] // b[0] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + ldp x14,x15,[x3,#8*0] // n[0..3] + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + mov x28,#0 + mov x26,sp + +.Loop_mul4x_1st_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[0]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[0]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + sub x10,x27,x1 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_1st_reduction + + cbz x10,.Lmul4x4_post_condition + + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldr x25,[sp] // a[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.Loop_mul4x_1st_tail: + mul x10,x6,x24 // lo(a[4..7]*b[i]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[i]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + adcs x23,x23,x0 + umulh x13,x17,x25 + adc x0,xzr,xzr + ldr x25,[sp,x28] // next t[0]*n0 + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_1st_tail + + sub x11,x27,x5 // rewinded x1 + cbz x10,.Lmul4x_proceed + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b .Loop_mul4x_1st_tail + +.align 5 +.Lmul4x_proceed: + ldr x24,[x2,#8*4]! // *++b + adc x30,x0,xzr + ldp x6,x7,[x11,#8*0] // a[0..3] + sub x3,x3,x5 // rewind np + ldp x8,x9,[x11,#8*2] + add x1,x11,#8*4 + + stp x19,x20,[x26,#8*0] // result!!! + ldp x19,x20,[sp,#8*4] // t[0..3] + stp x21,x22,[x26,#8*2] // result!!! + ldp x21,x22,[sp,#8*6] + + ldp x14,x15,[x3,#8*0] // n[0..3] + mov x26,sp + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + +.align 4 +.Loop_mul4x_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[4]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + // (*) mul x10,x14,x25 + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_reduction + + adc x0,x0,xzr + ldp x10,x11,[x26,#8*4] // t[4..7] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + + ldr x25,[sp] // t[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.align 4 +.Loop_mul4x_tail: + mul x10,x6,x24 // lo(a[4..7]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[4]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + umulh x13,x17,x25 + adcs x23,x23,x0 + ldr x25,[sp,x28] // next a[0]*n0 + adc x0,xzr,xzr + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_tail + + sub x11,x3,x5 // rewinded np? + adc x0,x0,xzr + cbz x10,.Loop_mul4x_break + + ldp x10,x11,[x26,#8*4] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b .Loop_mul4x_tail + +.align 4 +.Loop_mul4x_break: + ldp x12,x13,[x29,#96] // pull rp and &b[num] + adds x19,x19,x30 + add x2,x2,#8*4 // bp++ + adcs x20,x20,xzr + sub x1,x1,x5 // rewind ap + adcs x21,x21,xzr + stp x19,x20,[x26,#8*0] // result!!! + adcs x22,x22,xzr + ldp x19,x20,[sp,#8*4] // t[0..3] + adc x30,x0,xzr + stp x21,x22,[x26,#8*2] // result!!! + cmp x2,x13 // done yet? + ldp x21,x22,[sp,#8*6] + ldp x14,x15,[x11,#8*0] // n[0..3] + ldp x16,x17,[x11,#8*2] + add x3,x11,#8*4 + b.eq .Lmul4x_post + + ldr x24,[x2] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + adds x1,x1,#8*4 // clear carry bit + mov x0,xzr + mov x26,sp + b .Loop_mul4x_reduction + +.align 4 +.Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + mov x0,x12 + mov x27,x12 // x0 copy + subs x10,x19,x14 + add x26,sp,#8*8 + sbcs x11,x20,x15 + sub x28,x5,#8*4 + +.Lmul4x_sub: + sbcs x12,x21,x16 + ldp x14,x15,[x3,#8*0] + sub x28,x28,#8*4 + ldp x19,x20,[x26,#8*0] + sbcs x13,x22,x17 + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + ldp x21,x22,[x26,#8*2] + add x26,x26,#8*4 + stp x10,x11,[x0,#8*0] + sbcs x10,x19,x14 + stp x12,x13,[x0,#8*2] + add x0,x0,#8*4 + sbcs x11,x20,x15 + cbnz x28,.Lmul4x_sub + + sbcs x12,x21,x16 + mov x26,sp + add x1,sp,#8*4 + ldp x6,x7,[x27,#8*0] + sbcs x13,x22,x17 + stp x10,x11,[x0,#8*0] + ldp x8,x9,[x27,#8*2] + stp x12,x13,[x0,#8*2] + ldp x19,x20,[x1,#8*0] + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub x28,x5,#8*4 +.Lmul4x_cond_copy: + sub x28,x28,#8*4 + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + ldp x6,x7,[x27,#8*4] + ldp x19,x20,[x1,#8*4] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*2] + add x26,x26,#8*4 + csel x13,x22,x9,lo + ldp x8,x9,[x27,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + add x27,x27,#8*4 + cbnz x28,.Lmul4x_cond_copy + + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + stp xzr,xzr,[x26,#8*2] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*3] + csel x13,x22,x9,lo + stp xzr,xzr,[x26,#8*4] + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + + b .Lmul4x_done + +.align 4 +.Lmul4x4_post_condition: + adc x0,x0,xzr + ldr x1,[x29,#96] // pull rp + // x19-3,x0 hold result, x14-7 hold modulus + subs x6,x19,x14 + ldr x30,[x29,#8] // pull return address + sbcs x7,x20,x15 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x16 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x17 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,x0,xzr // did it borrow? + stp xzr,xzr,[sp,#8*6] + + // x6-3 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + stp x8,x9,[x1,#8*2] + +.Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + ret +.size __bn_mul4x_mont,.-__bn_mul4x_mont +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 4 +#endif \ No newline at end of file diff --git a/third_party/serf/linux-aarch64/crypto/modes/ghashv8-armx.S b/third_party/serf/linux-aarch64/crypto/modes/ghashv8-armx64.S similarity index 91% rename from third_party/serf/linux-aarch64/crypto/modes/ghashv8-armx.S rename to third_party/serf/linux-aarch64/crypto/modes/ghashv8-armx64.S index ad190747a9..8d44667ec2 100644 --- a/third_party/serf/linux-aarch64/crypto/modes/ghashv8-armx.S +++ b/third_party/serf/linux-aarch64/crypto/modes/ghashv8-armx64.S @@ -1,4 +1,5 @@ -#include "arm_arch.h" +#if defined(__aarch64__) +#include .text #if !defined(__clang__) @@ -67,10 +68,10 @@ gcm_gmult_v8: #endif ext v3.16b,v17.16b,v17.16b,#8 - pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing - pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi - pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b @@ -134,7 +135,7 @@ gcm_ghash_v8: #endif ext v7.16b,v17.16b,v17.16b,#8 eor v3.16b,v3.16b,v0.16b //I[i]^=Xi - pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing pmull2 v6.1q,v20.2d,v7.2d b .Loop_mod2x_v8 @@ -143,14 +144,14 @@ gcm_ghash_v8: .Loop_mod2x_v8: ext v18.16b,v3.16b,v3.16b,#8 subs x3,x3,#32 //is there more data? - pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo + pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo csel x12,xzr,x12,lo //is it time to zero x12? pmull v5.1q,v21.1d,v17.1d eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing - pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi + pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi eor v0.16b,v0.16b,v4.16b //accumulate - pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] eor v2.16b,v2.16b,v6.16b @@ -175,7 +176,7 @@ gcm_ghash_v8: ext v7.16b,v17.16b,v17.16b,#8 ext v3.16b,v16.16b,v16.16b,#8 eor v0.16b,v1.16b,v18.16b - pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction @@ -196,10 +197,10 @@ gcm_ghash_v8: eor v3.16b,v3.16b,v0.16b //inp^=Xi eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi - pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing - pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi - pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b @@ -228,3 +229,4 @@ gcm_ghash_v8: .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 +#endif \ No newline at end of file diff --git a/third_party/serf/linux-aarch64/crypto/sha/sha1-armv8.S b/third_party/serf/linux-aarch64/crypto/sha/sha1-armv8.S index ab6aa98a17..6cf9877e23 100644 --- a/third_party/serf/linux-aarch64/crypto/sha/sha1-armv8.S +++ b/third_party/serf/linux-aarch64/crypto/sha/sha1-armv8.S @@ -1,4 +1,5 @@ -#include "arm_arch.h" +#if defined(__aarch64__) +#include .text @@ -1211,3 +1212,4 @@ sha1_block_armv8: .align 2 .align 2 .comm OPENSSL_armcap_P,4,4 +#endif \ No newline at end of file diff --git a/third_party/serf/linux-aarch64/crypto/sha/sha256-armv8.S b/third_party/serf/linux-aarch64/crypto/sha/sha256-armv8.S index ec572e9503..0fad009460 100644 --- a/third_party/serf/linux-aarch64/crypto/sha/sha256-armv8.S +++ b/third_party/serf/linux-aarch64/crypto/sha/sha256-armv8.S @@ -1,4 +1,5 @@ -#include "arm_arch.h" +#if defined(__aarch64__) +#include .text @@ -1141,3 +1142,4 @@ sha256_block_armv8: ret .size sha256_block_armv8,.-sha256_block_armv8 .comm OPENSSL_armcap_P,4,4 +#endif \ No newline at end of file diff --git a/third_party/serf/linux-aarch64/crypto/sha/sha512-armv8.S b/third_party/serf/linux-aarch64/crypto/sha/sha512-armv8.S index 8fc342a545..517c033fb0 100644 --- a/third_party/serf/linux-aarch64/crypto/sha/sha512-armv8.S +++ b/third_party/serf/linux-aarch64/crypto/sha/sha512-armv8.S @@ -1,4 +1,5 @@ -#include "arm_arch.h" +#if defined(__aarch64__) +#include .text @@ -1021,3 +1022,4 @@ sha512_block_data_order: .align 2 .align 2 .comm OPENSSL_armcap_P,4,4 +#endif \ No newline at end of file diff --git a/third_party/serf/linux-arm/crypto/aes/aes-armv4.S b/third_party/serf/linux-arm/crypto/aes/aes-armv4.S index 113502037c..c4d70657e6 100644 --- a/third_party/serf/linux-arm/crypto/aes/aes-armv4.S +++ b/third_party/serf/linux-arm/crypto/aes/aes-armv4.S @@ -1,3 +1,4 @@ +#if defined(__arm__) @ ==================================================================== @ Written by Andy Polyakov for the OpenSSL @@ -33,7 +34,7 @@ #if defined(__arm__) #ifndef __KERNEL__ -# include "arm_arch.h" +# include #else # define __ARM_ARCH__ __LINUX_ARM_ARCH__ #endif @@ -1196,3 +1197,4 @@ _armv4_AES_decrypt: .align 2 #endif +#endif \ No newline at end of file diff --git a/third_party/serf/linux-arm/crypto/aes/aesv8-armx.S b/third_party/serf/linux-arm/crypto/aes/aesv8-armx32.S similarity index 99% rename from third_party/serf/linux-arm/crypto/aes/aesv8-armx.S rename to third_party/serf/linux-arm/crypto/aes/aesv8-armx32.S index 006300c380..6012b0ce28 100644 --- a/third_party/serf/linux-arm/crypto/aes/aesv8-armx.S +++ b/third_party/serf/linux-arm/crypto/aes/aesv8-armx32.S @@ -1,4 +1,5 @@ -#include "arm_arch.h" +#if defined(__arm__) +#include #if __ARM_MAX_ARCH__>=7 .text @@ -752,3 +753,4 @@ aes_v8_ctr32_encrypt_blocks: ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc} .size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks #endif +#endif \ No newline at end of file diff --git a/third_party/serf/linux-arm/crypto/aes/bsaes-armv7.S b/third_party/serf/linux-arm/crypto/aes/bsaes-armv7.S index 204ee3e17f..a3ebec8884 100644 --- a/third_party/serf/linux-arm/crypto/aes/bsaes-armv7.S +++ b/third_party/serf/linux-arm/crypto/aes/bsaes-armv7.S @@ -1,3 +1,4 @@ +#if defined(__arm__) @ ==================================================================== @ Written by Andy Polyakov for the OpenSSL @@ -46,9 +47,8 @@ @ @ -#if defined(__arm__) #ifndef __KERNEL__ -# include "arm_arch.h" +# include # define VFP_ABI_PUSH vstmdb sp!,{d8-d15} # define VFP_ABI_POP vldmia sp!,{d8-d15} @@ -2574,4 +2574,4 @@ bsaes_xts_decrypt: .size bsaes_xts_decrypt,.-bsaes_xts_decrypt #endif -#endif +#endif \ No newline at end of file diff --git a/third_party/serf/linux-arm/crypto/bn/armv4-mont.S b/third_party/serf/linux-arm/crypto/bn/armv4-mont.S index 81dcbebc3b..fc671e8d6a 100644 --- a/third_party/serf/linux-arm/crypto/bn/armv4-mont.S +++ b/third_party/serf/linux-arm/crypto/bn/armv4-mont.S @@ -1,4 +1,5 @@ -#include "arm_arch.h" +#if defined(__arm__) +#include .text .code 32 @@ -585,3 +586,4 @@ bn_mul8x_mont_neon: .comm OPENSSL_armcap_P,4,4 .hidden OPENSSL_armcap_P #endif +#endif \ No newline at end of file diff --git a/third_party/serf/linux-arm/crypto/modes/ghash-armv4.S b/third_party/serf/linux-arm/crypto/modes/ghash-armv4.S index b6c7e9b7b9..d955d827a5 100644 --- a/third_party/serf/linux-arm/crypto/modes/ghash-armv4.S +++ b/third_party/serf/linux-arm/crypto/modes/ghash-armv4.S @@ -1,12 +1,12 @@ #if defined(__arm__) -#include "arm_arch.h" +#include .syntax unified .text .code 32 -#ifdef __APPLE__ +#ifdef __clang__ #define ldrplb ldrbpl #define ldrneb ldrbne #endif @@ -535,5 +535,4 @@ gcm_ghash_neon: .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 - -#endif +#endif \ No newline at end of file diff --git a/third_party/serf/linux-arm/crypto/modes/ghashv8-armx.S b/third_party/serf/linux-arm/crypto/modes/ghashv8-armx32.S similarity index 89% rename from third_party/serf/linux-arm/crypto/modes/ghashv8-armx.S rename to third_party/serf/linux-arm/crypto/modes/ghashv8-armx32.S index 71913fb174..9a38ded484 100644 --- a/third_party/serf/linux-arm/crypto/modes/ghashv8-armx.S +++ b/third_party/serf/linux-arm/crypto/modes/ghashv8-armx32.S @@ -1,4 +1,5 @@ -#include "arm_arch.h" +#if defined(__arm__) +#include .text .fpu neon @@ -66,10 +67,10 @@ gcm_gmult_v8: #endif vext.8 q3,q9,q9,#8 -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo +.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) +.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi +.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) vext.8 q9,q0,q2,#8 @ Karatsuba post-processing veor q10,q0,q2 @@ -134,7 +135,7 @@ gcm_ghash_v8: #endif vext.8 q7,q9,q9,#8 veor q3,q3,q0 @ I[i]^=Xi -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 +.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 veor q9,q9,q7 @ Karatsuba pre-processing .byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 b .Loop_mod2x_v8 @@ -143,14 +144,14 @@ gcm_ghash_v8: .Loop_mod2x_v8: vext.8 q10,q3,q3,#8 subs r3,r3,#32 @ is there more data? -.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo +.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo movlo r12,#0 @ is it time to zero r12? .byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9 veor q10,q10,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi +.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi veor q0,q0,q4 @ accumulate -.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) +.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2] veor q2,q2,q6 @@ -175,7 +176,7 @@ gcm_ghash_v8: vext.8 q7,q9,q9,#8 vext.8 q3,q8,q8,#8 veor q0,q1,q10 -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 +.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 veor q3,q3,q2 @ accumulate q3 early vext.8 q10,q0,q0,#8 @ 2nd phase of reduction @@ -196,10 +197,10 @@ gcm_ghash_v8: veor q3,q3,q0 @ inp^=Xi veor q9,q8,q10 @ q9 is rotated inp^Xi -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo +.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) +.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi +.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) vext.8 q9,q0,q2,#8 @ Karatsuba post-processing veor q10,q0,q2 @@ -229,3 +230,4 @@ gcm_ghash_v8: .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 +#endif \ No newline at end of file diff --git a/third_party/serf/linux-arm/crypto/sha/sha1-armv4-large.S b/third_party/serf/linux-arm/crypto/sha/sha1-armv4-large.S index 52c99bf1ba..66d0ef3e56 100644 --- a/third_party/serf/linux-arm/crypto/sha/sha1-armv4-large.S +++ b/third_party/serf/linux-arm/crypto/sha/sha1-armv4-large.S @@ -1,4 +1,5 @@ -#include "arm_arch.h" +#if defined(__arm__) +#include .text .code 32 @@ -1458,3 +1459,4 @@ sha1_block_data_order_armv8: .comm OPENSSL_armcap_P,4,4 .hidden OPENSSL_armcap_P #endif +#endif \ No newline at end of file diff --git a/third_party/serf/linux-arm/crypto/sha/sha256-armv4.S b/third_party/serf/linux-arm/crypto/sha/sha256-armv4.S index 114aa43f22..9fc3e0b8b6 100644 --- a/third_party/serf/linux-arm/crypto/sha/sha256-armv4.S +++ b/third_party/serf/linux-arm/crypto/sha/sha256-armv4.S @@ -1,3 +1,4 @@ +#if defined(__arm__) @ ==================================================================== @ Written by Andy Polyakov for the OpenSSL @@ -37,7 +38,7 @@ @ Add ARMv8 code path performing at 2.0 cpb on Apple A7. #ifndef __KERNEL__ -# include "arm_arch.h" +# include #else # define __ARM_ARCH__ __LINUX_ARM_ARCH__ # define __ARM_MAX_ARCH__ 7 @@ -2814,3 +2815,4 @@ sha256_block_data_order_armv8: .comm OPENSSL_armcap_P,4,4 .hidden OPENSSL_armcap_P #endif +#endif \ No newline at end of file diff --git a/third_party/serf/linux-arm/crypto/sha/sha512-armv4.S b/third_party/serf/linux-arm/crypto/sha/sha512-armv4.S index 1a3d467a68..834ede9c30 100644 --- a/third_party/serf/linux-arm/crypto/sha/sha512-armv4.S +++ b/third_party/serf/linux-arm/crypto/sha/sha512-armv4.S @@ -1,3 +1,4 @@ +#if defined(__arm__) @ ==================================================================== @ Written by Andy Polyakov for the OpenSSL @@ -46,7 +47,7 @@ @ was reflected in below two parameters as 0 and 4. Now caller is @ expected to maintain native byte order for whole 64-bit values. #ifndef __KERNEL__ -# include "arm_arch.h" +# include # define VFP_ABI_PUSH vstmdb sp!,{d8-d15} # define VFP_ABI_POP vldmia sp!,{d8-d15} #else @@ -1865,3 +1866,4 @@ sha512_block_data_order_neon: .comm OPENSSL_armcap_P,4,4 .hidden OPENSSL_armcap_P #endif +#endif \ No newline at end of file diff --git a/third_party/serf/linux-x86/crypto/cpu-x86-asm.S b/third_party/serf/linux-x86/crypto/cpu-x86-asm.S deleted file mode 100644 index 24a8dd4d42..0000000000 --- a/third_party/serf/linux-x86/crypto/cpu-x86-asm.S +++ /dev/null @@ -1,322 +0,0 @@ -#if defined(__i386__) -.file "crypto/cpu-x86-asm.S" -.text -.globl OPENSSL_ia32_cpuid -.hidden OPENSSL_ia32_cpuid -.type OPENSSL_ia32_cpuid,@function -.align 16 -OPENSSL_ia32_cpuid: -.L_OPENSSL_ia32_cpuid_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - xorl %edx,%edx - pushfl - popl %eax - movl %eax,%ecx - xorl $2097152,%eax - pushl %eax - popfl - pushfl - popl %eax - xorl %eax,%ecx - xorl %eax,%eax - btl $21,%ecx - jnc .L000nocpuid - movl 20(%esp),%esi - movl %eax,8(%esi) - .byte 0x0f,0xa2 - movl %eax,%edi - xorl %eax,%eax - cmpl $1970169159,%ebx - setne %al - movl %eax,%ebp - cmpl $1231384169,%edx - setne %al - orl %eax,%ebp - cmpl $1818588270,%ecx - setne %al - orl %eax,%ebp - jz .L001intel - cmpl $1752462657,%ebx - setne %al - movl %eax,%esi - cmpl $1769238117,%edx - setne %al - orl %eax,%esi - cmpl $1145913699,%ecx - setne %al - orl %eax,%esi - jnz .L001intel - movl $2147483648,%eax - .byte 0x0f,0xa2 - cmpl $2147483649,%eax - jb .L001intel - movl %eax,%esi - movl $2147483649,%eax - .byte 0x0f,0xa2 - orl %ecx,%ebp - andl $2049,%ebp - cmpl $2147483656,%esi - jb .L001intel - movl $2147483656,%eax - .byte 0x0f,0xa2 - movzbl %cl,%esi - incl %esi - movl $1,%eax - xorl %ecx,%ecx - .byte 0x0f,0xa2 - btl $28,%edx - jnc .L002generic - shrl $16,%ebx - andl $255,%ebx - cmpl %esi,%ebx - ja .L002generic - andl $4026531839,%edx - jmp .L002generic -.L001intel: - cmpl $7,%edi - jb .L003cacheinfo - movl 20(%esp),%esi - movl $7,%eax - xorl %ecx,%ecx - .byte 0x0f,0xa2 - movl %ebx,8(%esi) -.L003cacheinfo: - cmpl $4,%edi - movl $-1,%edi - jb .L004nocacheinfo - movl $4,%eax - movl $0,%ecx - .byte 0x0f,0xa2 - movl %eax,%edi - shrl $14,%edi - andl $4095,%edi -.L004nocacheinfo: - movl $1,%eax - xorl %ecx,%ecx - .byte 0x0f,0xa2 - andl $3220176895,%edx - cmpl $0,%ebp - jne .L005notintel - orl $1073741824,%edx -.L005notintel: - btl $28,%edx - jnc .L002generic - andl $4026531839,%edx - cmpl $0,%edi - je .L002generic - orl $268435456,%edx - shrl $16,%ebx - cmpb $1,%bl - ja .L002generic - andl $4026531839,%edx -.L002generic: - andl $2048,%ebp - andl $4294965247,%ecx - movl %edx,%esi - orl %ecx,%ebp - btl $27,%ecx - jnc .L006clear_avx - xorl %ecx,%ecx -.byte 15,1,208 - andl $6,%eax - cmpl $6,%eax - je .L007done - cmpl $2,%eax - je .L006clear_avx -.L008clear_xmm: - andl $4261412861,%ebp - andl $4278190079,%esi -.L006clear_avx: - andl $4026525695,%ebp - movl 20(%esp),%edi - andl $4294967263,8(%edi) -.L007done: - movl %esi,%eax - movl %ebp,%edx -.L000nocpuid: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size OPENSSL_ia32_cpuid,.-.L_OPENSSL_ia32_cpuid_begin -.globl OPENSSL_rdtsc -.hidden OPENSSL_rdtsc -.type OPENSSL_rdtsc,@function -.align 16 -OPENSSL_rdtsc: -.L_OPENSSL_rdtsc_begin: - xorl %eax,%eax - xorl %edx,%edx - call .L009PIC_me_up -.L009PIC_me_up: - popl %ecx - leal OPENSSL_ia32cap_P-.L009PIC_me_up(%ecx),%ecx - btl $4,(%ecx) - jnc .L010notsc - .byte 0x0f,0x31 -.L010notsc: - ret -.size OPENSSL_rdtsc,.-.L_OPENSSL_rdtsc_begin -.globl OPENSSL_instrument_halt -.hidden OPENSSL_instrument_halt -.type OPENSSL_instrument_halt,@function -.align 16 -OPENSSL_instrument_halt: -.L_OPENSSL_instrument_halt_begin: - call .L011PIC_me_up -.L011PIC_me_up: - popl %ecx - leal OPENSSL_ia32cap_P-.L011PIC_me_up(%ecx),%ecx - btl $4,(%ecx) - jnc .L012nohalt -.long 2421723150 - andl $3,%eax - jnz .L012nohalt - pushfl - popl %eax - btl $9,%eax - jnc .L012nohalt - .byte 0x0f,0x31 - pushl %edx - pushl %eax - hlt - .byte 0x0f,0x31 - subl (%esp),%eax - sbbl 4(%esp),%edx - addl $8,%esp - ret -.L012nohalt: - xorl %eax,%eax - xorl %edx,%edx - ret -.size OPENSSL_instrument_halt,.-.L_OPENSSL_instrument_halt_begin -.globl OPENSSL_far_spin -.hidden OPENSSL_far_spin -.type OPENSSL_far_spin,@function -.align 16 -OPENSSL_far_spin: -.L_OPENSSL_far_spin_begin: - pushfl - popl %eax - btl $9,%eax - jnc .L013nospin - movl 4(%esp),%eax - movl 8(%esp),%ecx -.long 2430111262 - xorl %eax,%eax - movl (%ecx),%edx - jmp .L014spin -.align 16 -.L014spin: - incl %eax - cmpl (%ecx),%edx - je .L014spin -.long 529567888 - ret -.L013nospin: - xorl %eax,%eax - xorl %edx,%edx - ret -.size OPENSSL_far_spin,.-.L_OPENSSL_far_spin_begin -.globl OPENSSL_wipe_cpu -.hidden OPENSSL_wipe_cpu -.type OPENSSL_wipe_cpu,@function -.align 16 -OPENSSL_wipe_cpu: -.L_OPENSSL_wipe_cpu_begin: - xorl %eax,%eax - xorl %edx,%edx - call .L015PIC_me_up -.L015PIC_me_up: - popl %ecx - leal OPENSSL_ia32cap_P-.L015PIC_me_up(%ecx),%ecx - movl (%ecx),%ecx - btl $1,(%ecx) - jnc .L016no_x87 - andl $83886080,%ecx - cmpl $83886080,%ecx - jne .L017no_sse2 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 -.L017no_sse2: -.long 4007259865,4007259865,4007259865,4007259865,2430851995 -.L016no_x87: - leal 4(%esp),%eax - ret -.size OPENSSL_wipe_cpu,.-.L_OPENSSL_wipe_cpu_begin -.globl OPENSSL_atomic_add -.hidden OPENSSL_atomic_add -.type OPENSSL_atomic_add,@function -.align 16 -OPENSSL_atomic_add: -.L_OPENSSL_atomic_add_begin: - movl 4(%esp),%edx - movl 8(%esp),%ecx - pushl %ebx - nop - movl (%edx),%eax -.L018spin: - leal (%eax,%ecx,1),%ebx - nop -.long 447811568 - jne .L018spin - movl %ebx,%eax - popl %ebx - ret -.size OPENSSL_atomic_add,.-.L_OPENSSL_atomic_add_begin -.globl OPENSSL_indirect_call -.hidden OPENSSL_indirect_call -.type OPENSSL_indirect_call,@function -.align 16 -OPENSSL_indirect_call: -.L_OPENSSL_indirect_call_begin: - pushl %ebp - movl %esp,%ebp - subl $28,%esp - movl 12(%ebp),%ecx - movl %ecx,(%esp) - movl 16(%ebp),%edx - movl %edx,4(%esp) - movl 20(%ebp),%eax - movl %eax,8(%esp) - movl 24(%ebp),%eax - movl %eax,12(%esp) - movl 28(%ebp),%eax - movl %eax,16(%esp) - movl 32(%ebp),%eax - movl %eax,20(%esp) - movl 36(%ebp),%eax - movl %eax,24(%esp) - call *8(%ebp) - movl %ebp,%esp - popl %ebp - ret -.size OPENSSL_indirect_call,.-.L_OPENSSL_indirect_call_begin -.globl OPENSSL_ia32_rdrand -.hidden OPENSSL_ia32_rdrand -.type OPENSSL_ia32_rdrand,@function -.align 16 -OPENSSL_ia32_rdrand: -.L_OPENSSL_ia32_rdrand_begin: - movl $8,%ecx -.L019loop: -.byte 15,199,240 - jc .L020break - loop .L019loop -.L020break: - cmpl $0,%eax - cmovel %ecx,%eax - ret -.size OPENSSL_ia32_rdrand,.-.L_OPENSSL_ia32_rdrand_begin -.hidden OPENSSL_ia32cap_P -#endif diff --git a/third_party/serf/linux-x86/crypto/sha/sha1-586.S b/third_party/serf/linux-x86/crypto/sha/sha1-586.S index 808ccac517..58d0bc1277 100644 --- a/third_party/serf/linux-x86/crypto/sha/sha1-586.S +++ b/third_party/serf/linux-x86/crypto/sha/sha1-586.S @@ -23,8 +23,11 @@ sha1_block_data_order: movl 8(%esi),%ecx testl $16777216,%eax jz .L001x86 - testl $536870912,%ecx - jnz .Lshaext_shortcut + andl $268435456,%edx + andl $1073741824,%eax + orl %edx,%eax + cmpl $1342177280,%eax + je .Lavx_shortcut jmp .Lssse3_shortcut .align 16 .L001x86: @@ -1393,177 +1396,6 @@ sha1_block_data_order: popl %ebp ret .size sha1_block_data_order,.-.L_sha1_block_data_order_begin -.hidden _sha1_block_data_order_shaext -.type _sha1_block_data_order_shaext,@function -.align 16 -_sha1_block_data_order_shaext: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - call .L003pic_point -.L003pic_point: - popl %ebp - leal .LK_XX_XX-.L003pic_point(%ebp),%ebp -.Lshaext_shortcut: - movl 20(%esp),%edi - movl %esp,%ebx - movl 24(%esp),%esi - movl 28(%esp),%ecx - subl $32,%esp - movdqu (%edi),%xmm0 - movd 16(%edi),%xmm1 - andl $-32,%esp - movdqa 80(%ebp),%xmm3 - movdqu (%esi),%xmm4 - pshufd $27,%xmm0,%xmm0 - movdqu 16(%esi),%xmm5 - pshufd $27,%xmm1,%xmm1 - movdqu 32(%esi),%xmm6 -.byte 102,15,56,0,227 - movdqu 48(%esi),%xmm7 -.byte 102,15,56,0,235 -.byte 102,15,56,0,243 -.byte 102,15,56,0,251 - jmp .L004loop_shaext -.align 16 -.L004loop_shaext: - decl %ecx - leal 64(%esi),%eax - movdqa %xmm1,(%esp) - paddd %xmm4,%xmm1 - cmovnel %eax,%esi - movdqa %xmm0,16(%esp) -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 - movdqu (%esi),%xmm4 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,213 - movdqu 16(%esi),%xmm5 -.byte 102,15,56,0,227 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,206 - movdqu 32(%esi),%xmm6 -.byte 102,15,56,0,235 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,215 - movdqu 48(%esi),%xmm7 -.byte 102,15,56,0,243 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 - movdqa (%esp),%xmm2 -.byte 102,15,56,0,251 -.byte 15,56,200,202 - paddd 16(%esp),%xmm0 - jnz .L004loop_shaext - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm1,%xmm1 - movdqu %xmm0,(%edi) - movd %xmm1,16(%edi) - movl %ebx,%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size _sha1_block_data_order_shaext,.-_sha1_block_data_order_shaext .hidden _sha1_block_data_order_ssse3 .type _sha1_block_data_order_ssse3,@function .align 16 @@ -1572,10 +1404,10 @@ _sha1_block_data_order_ssse3: pushl %ebx pushl %esi pushl %edi - call .L005pic_point -.L005pic_point: + call .L003pic_point +.L003pic_point: popl %ebp - leal .LK_XX_XX-.L005pic_point(%ebp),%ebp + leal .LK_XX_XX-.L003pic_point(%ebp),%ebp .Lssse3_shortcut: movdqa (%ebp),%xmm7 movdqa 16(%ebp),%xmm0 @@ -1628,9 +1460,9 @@ _sha1_block_data_order_ssse3: xorl %edx,%ebp pshufd $238,%xmm0,%xmm4 andl %ebp,%esi - jmp .L006loop + jmp .L004loop .align 16 -.L006loop: +.L004loop: rorl $2,%ebx xorl %edx,%esi movl %eax,%ebp @@ -2533,7 +2365,7 @@ _sha1_block_data_order_ssse3: addl %edx,%ecx movl 196(%esp),%ebp cmpl 200(%esp),%ebp - je .L007done + je .L005done movdqa 160(%esp),%xmm7 movdqa 176(%esp),%xmm6 movdqu (%ebp),%xmm0 @@ -2668,9 +2500,9 @@ _sha1_block_data_order_ssse3: pshufd $238,%xmm0,%xmm4 andl %ebx,%esi movl %ebp,%ebx - jmp .L006loop + jmp .L004loop .align 16 -.L007done: +.L005done: addl 16(%esp),%ebx xorl %edi,%esi movl %ecx,%ebp @@ -2784,6 +2616,1177 @@ _sha1_block_data_order_ssse3: popl %ebp ret .size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3 +.hidden _sha1_block_data_order_avx +.type _sha1_block_data_order_avx,@function +.align 16 +_sha1_block_data_order_avx: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call .L006pic_point +.L006pic_point: + popl %ebp + leal .LK_XX_XX-.L006pic_point(%ebp),%ebp +.Lavx_shortcut: + vzeroall + vmovdqa (%ebp),%xmm7 + vmovdqa 16(%ebp),%xmm0 + vmovdqa 32(%ebp),%xmm1 + vmovdqa 48(%ebp),%xmm2 + vmovdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp + movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + vmovdqa %xmm0,112(%esp) + vmovdqa %xmm1,128(%esp) + vmovdqa %xmm2,144(%esp) + shll $6,%edx + vmovdqa %xmm7,160(%esp) + addl %ebp,%edx + vmovdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + vmovdqu -64(%ebp),%xmm0 + vmovdqu -48(%ebp),%xmm1 + vmovdqu -32(%ebp),%xmm2 + vmovdqu -16(%ebp),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vmovdqa %xmm7,96(%esp) + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm7,%xmm0,%xmm4 + vpaddd %xmm7,%xmm1,%xmm5 + vpaddd %xmm7,%xmm2,%xmm6 + vmovdqa %xmm4,(%esp) + movl %ecx,%ebp + vmovdqa %xmm5,16(%esp) + xorl %edx,%ebp + vmovdqa %xmm6,32(%esp) + andl %ebp,%esi + jmp .L007loop +.align 16 +.L007loop: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%ebp + addl (%esp),%edi + vpaddd %xmm3,%xmm7,%xmm7 + vmovdqa %xmm0,64(%esp) + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%edi + vpxor %xmm2,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vmovdqa %xmm7,48(%esp) + movl %edi,%esi + addl 4(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%edi,%edi + addl %ebp,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm6 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm0 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrld $30,%xmm0,%xmm7 + vpor %xmm6,%xmm4,%xmm4 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + vpslld $2,%xmm0,%xmm0 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vpxor %xmm7,%xmm4,%xmm4 + movl %ecx,%esi + addl 12(%esp),%ebx + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpxor %xmm0,%xmm4,%xmm4 + addl %ebp,%ebx + andl %edx,%esi + vmovdqa 96(%esp),%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%ebp + addl 16(%esp),%eax + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqa %xmm1,80(%esp) + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vmovdqa %xmm0,(%esp) + movl %eax,%esi + addl 20(%esp),%edi + vpxor %xmm7,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %ebp,%edi + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm7 + xorl %ecx,%ebx + addl %eax,%edi + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm1 + vpaddd %xmm5,%xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm0 + vpor %xmm7,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpxor %xmm0,%xmm5,%xmm5 + movl %edx,%esi + addl 28(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpxor %xmm1,%xmm5,%xmm5 + addl %ebp,%ecx + andl %edi,%esi + vmovdqa 112(%esp),%xmm1 + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%ebp + addl 32(%esp),%ebx + vpaddd %xmm5,%xmm1,%xmm1 + vmovdqa %xmm2,96(%esp) + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + vpxor %xmm2,%xmm6,%xmm6 + xorl %edi,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%ecx,%ecx + xorl %edi,%ebp + vmovdqa %xmm1,16(%esp) + movl %ebx,%esi + addl 36(%esp),%eax + vpxor %xmm0,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + addl %ebp,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm0 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm2 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%ebp + addl 40(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm1 + vpor %xmm0,%xmm6,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + vmovdqa 64(%esp),%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vpxor %xmm1,%xmm6,%xmm6 + movl %edi,%esi + addl 44(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpxor %xmm2,%xmm6,%xmm6 + addl %ebp,%edx + andl %eax,%esi + vmovdqa 112(%esp),%xmm2 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%ebp + addl 48(%esp),%ecx + vpaddd %xmm6,%xmm2,%xmm2 + vmovdqa %xmm3,64(%esp) + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm1 + addl %esi,%ecx + andl %edi,%ebp + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%edi + addl %edx,%ecx + vpxor %xmm5,%xmm1,%xmm1 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vmovdqa %xmm2,32(%esp) + movl %ecx,%esi + addl 52(%esp),%ebx + vpxor %xmm1,%xmm7,%xmm7 + xorl %edi,%edx + shldl $5,%ecx,%ecx + addl %ebp,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm1 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpslldq $12,%xmm7,%xmm3 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm2 + vpor %xmm1,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + vmovdqa 80(%esp),%xmm1 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vpxor %xmm2,%xmm7,%xmm7 + movl %eax,%esi + addl 60(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpxor %xmm3,%xmm7,%xmm7 + addl %ebp,%edi + andl %ebx,%esi + vmovdqa 112(%esp),%xmm3 + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,80(%esp) + xorl %ebx,%eax + shldl $5,%edi,%edi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + addl %esi,%edx + andl %eax,%ebp + vpxor %xmm2,%xmm0,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + vpor %xmm2,%xmm0,%xmm0 + xorl %edi,%edx + shldl $5,%ecx,%ecx + vmovdqa 96(%esp),%xmm2 + addl %esi,%ebx + andl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm3,%xmm1,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm3,%xmm1,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + vmovdqa 64(%esp),%xmm3 + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + vmovdqa 128(%esp),%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm4,%xmm2,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vpor %xmm4,%xmm2,%xmm2 + addl 44(%esp),%ecx + xorl %eax,%ebp + vmovdqa 80(%esp),%xmm4 + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,80(%esp) + addl %esi,%ebx + xorl %edi,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + vmovdqa 96(%esp),%xmm5 + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpalignr $8,%xmm2,%xmm3,%xmm6 + vpxor %xmm0,%xmm4,%xmm4 + addl (%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + vmovdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + vmovdqa %xmm7,%xmm0 + vpaddd %xmm3,%xmm7,%xmm7 + shrdl $7,%edi,%edi + addl %edx,%ecx + vpxor %xmm6,%xmm4,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm6 + vmovdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm6,%xmm4,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + vmovdqa 64(%esp),%xmm6 + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpalignr $8,%xmm3,%xmm4,%xmm7 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + vpxor %xmm6,%xmm5,%xmm5 + vmovdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + vmovdqa %xmm0,%xmm1 + vpaddd %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + addl %edi,%edx + vpxor %xmm7,%xmm5,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm7 + vmovdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm7,%xmm5,%xmm5 + addl 28(%esp),%eax + vmovdqa 80(%esp),%xmm7 + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + vmovdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + vmovdqa %xmm1,%xmm2 + vpaddd %xmm5,%xmm1,%xmm1 + shldl $5,%eax,%eax + addl %esi,%edi + vpxor %xmm0,%xmm6,%xmm6 + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + vpsrld $30,%xmm6,%xmm0 + vmovdqa %xmm1,16(%esp) + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + vpor %xmm0,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%edi,%edi + vmovdqa 96(%esp),%xmm0 + movl %edx,%ebp + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm1 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + vmovdqa %xmm3,96(%esp) + movl %ebx,%ebp + xorl %edx,%esi + vmovdqa 144(%esp),%xmm3 + vpaddd %xmm6,%xmm2,%xmm2 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + vpsrld $30,%xmm7,%xmm1 + vmovdqa %xmm2,32(%esp) + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + vpor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vmovdqa 64(%esp),%xmm1 + movl %edi,%ebp + xorl %ebx,%esi + shldl $5,%edi,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + addl (%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm2,%xmm0,%xmm0 + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + vpor %xmm2,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vmovdqa 80(%esp),%xmm2 + movl %eax,%ebp + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%edi,%edi + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm3,%xmm1,%xmm1 + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + vpor %xmm3,%xmm1,%xmm1 + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vmovdqa 96(%esp),%xmm3 + movl %ebx,%ebp + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + vmovdqa %xmm5,%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shldl $5,%edi,%edi + addl %esi,%edx + vpxor %xmm4,%xmm2,%xmm2 + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + vpor %xmm4,%xmm2,%xmm2 + xorl %eax,%edi + shrdl $7,%edx,%edx + vmovdqa 64(%esp),%xmm4 + movl %ecx,%ebp + xorl %edi,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl (%esp),%eax + vpaddd %xmm3,%xmm7,%xmm7 + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm7,48(%esp) + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je .L008done + vmovdqa 160(%esp),%xmm7 + vmovdqa 176(%esp),%xmm6 + vmovdqu (%ebp),%xmm0 + vmovdqu 16(%ebp),%xmm1 + vmovdqu 32(%ebp),%xmm2 + vmovdqu 48(%ebp),%xmm3 + addl $64,%ebp + vpshufb %xmm6,%xmm0,%xmm0 + movl %ebp,196(%esp) + vmovdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpaddd %xmm7,%xmm0,%xmm4 + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,(%esp) + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%ebp + shldl $5,%edx,%edx + vpaddd %xmm7,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vmovdqa %xmm5,16(%esp) + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %edi,%ebp + shldl $5,%edi,%edi + vpaddd %xmm7,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vmovdqa %xmm6,32(%esp) + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,%ebx + movl %ecx,8(%ebp) + xorl %edx,%ebx + movl %edx,12(%ebp) + movl %edi,16(%ebp) + movl %esi,%ebp + andl %ebx,%esi + movl %ebp,%ebx + jmp .L007loop +.align 16 +.L008done: + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroall + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx .align 64 .LK_XX_XX: .long 1518500249,1518500249,1518500249,1518500249 diff --git a/third_party/serf/linux-x86/crypto/sha/sha256-586.S b/third_party/serf/linux-x86/crypto/sha/sha256-586.S index 08d948432b..38acbd8374 100644 --- a/third_party/serf/linux-x86/crypto/sha/sha256-586.S +++ b/third_party/serf/linux-x86/crypto/sha/sha256-586.S @@ -37,11 +37,10 @@ sha256_block_data_order: jz .L003no_xmm andl $1073741824,%ecx andl $268435968,%ebx - testl $536870912,%edx - jnz .L004shaext orl %ebx,%ecx andl $1342177280,%ecx cmpl $1342177280,%ecx + je .L004AVX testl $512,%ebx jnz .L005SSSE3 .L003no_xmm: @@ -3167,204 +3166,6 @@ sha256_block_data_order: popl %ebp ret .align 32 -.L004shaext: - subl $32,%esp - movdqu (%esi),%xmm1 - leal 128(%ebp),%ebp - movdqu 16(%esi),%xmm2 - movdqa 128(%ebp),%xmm7 - pshufd $27,%xmm1,%xmm0 - pshufd $177,%xmm1,%xmm1 - pshufd $27,%xmm2,%xmm2 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - jmp .L010loop_shaext -.align 16 -.L010loop_shaext: - movdqu (%edi),%xmm3 - movdqu 16(%edi),%xmm4 - movdqu 32(%edi),%xmm5 -.byte 102,15,56,0,223 - movdqu 48(%edi),%xmm6 - movdqa %xmm2,16(%esp) - movdqa -128(%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 102,15,56,0,231 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - nop - movdqa %xmm1,(%esp) -.byte 15,56,203,202 - movdqa -112(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 102,15,56,0,239 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - leal 64(%edi),%edi -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa -96(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 102,15,56,0,247 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa -80(%ebp),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa -64(%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa -48(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa -32(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa -16(%ebp),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa (%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 16(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 32(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 48(%ebp),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 64(%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 80(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 -.byte 15,56,203,202 - paddd %xmm7,%xmm6 - movdqa 96(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 -.byte 15,56,205,245 - movdqa 128(%ebp),%xmm7 -.byte 15,56,203,202 - movdqa 112(%ebp),%xmm0 - paddd %xmm6,%xmm0 - nop -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - cmpl %edi,%eax - nop -.byte 15,56,203,202 - paddd 16(%esp),%xmm2 - paddd (%esp),%xmm1 - jnz .L010loop_shaext - pshufd $177,%xmm2,%xmm2 - pshufd $27,%xmm1,%xmm7 - pshufd $177,%xmm1,%xmm1 - punpckhqdq %xmm2,%xmm1 -.byte 102,15,58,15,215,8 - movl 44(%esp),%esp - movdqu %xmm1,(%esi) - movdqu %xmm2,16(%esi) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 32 .L005SSSE3: leal -96(%esp),%esp movl (%esi),%eax @@ -3384,9 +3185,9 @@ sha256_block_data_order: movl %ecx,24(%esp) movl %esi,28(%esp) movdqa 256(%ebp),%xmm7 - jmp .L011grand_ssse3 + jmp .L010grand_ssse3 .align 16 -.L011grand_ssse3: +.L010grand_ssse3: movdqu (%edi),%xmm0 movdqu 16(%edi),%xmm1 movdqu 32(%edi),%xmm2 @@ -3409,9 +3210,9 @@ sha256_block_data_order: paddd %xmm3,%xmm7 movdqa %xmm6,64(%esp) movdqa %xmm7,80(%esp) - jmp .L012ssse3_00_47 + jmp .L011ssse3_00_47 .align 16 -.L012ssse3_00_47: +.L011ssse3_00_47: addl $64,%ebp movl %edx,%ecx movdqa %xmm1,%xmm4 @@ -4054,7 +3855,7 @@ sha256_block_data_order: addl %ecx,%eax movdqa %xmm6,80(%esp) cmpl $66051,64(%ebp) - jne .L012ssse3_00_47 + jne .L011ssse3_00_47 movl %edx,%ecx rorl $14,%edx movl 20(%esp),%esi @@ -4568,8 +4369,1189 @@ sha256_block_data_order: movdqa 64(%ebp),%xmm7 subl $192,%ebp cmpl 104(%esp),%edi - jb .L011grand_ssse3 + jb .L010grand_ssse3 + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L004AVX: + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp .L012grand_avx +.align 32 +.L012grand_avx: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp .L013avx_00_47 +.align 16 +.L013avx_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm2,%xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm0,%xmm0 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm0,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd (%ebp),%xmm0,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm3,%xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm1,%xmm1 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm1,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm0,%xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm2,%xmm2 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm2,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm1,%xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm3,%xmm3 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm3,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L013avx_00_47 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L012grand_avx movl 108(%esp),%esp + vzeroall popl %edi popl %esi popl %ebx diff --git a/third_party/serf/linux-x86_64/crypto/bn/x86_64-mont5.S b/third_party/serf/linux-x86_64/crypto/bn/x86_64-mont5.S index 02edc69b36..214064e61d 100644 --- a/third_party/serf/linux-x86_64/crypto/bn/x86_64-mont5.S +++ b/third_party/serf/linux-x86_64/crypto/bn/x86_64-mont5.S @@ -1561,6 +1561,15 @@ sqr8x_reduction: .align 32 .L8x_tail_done: addq (%rdx),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + xorq %rax,%rax negq %rsi diff --git a/third_party/serf/linux-x86_64/crypto/cpu-x86_64-asm.S b/third_party/serf/linux-x86_64/crypto/cpu-x86_64-asm.S deleted file mode 100644 index 9eef154e9e..0000000000 --- a/third_party/serf/linux-x86_64/crypto/cpu-x86_64-asm.S +++ /dev/null @@ -1,143 +0,0 @@ -#if defined(__x86_64__) -.text - -.globl OPENSSL_ia32_cpuid -.hidden OPENSSL_ia32_cpuid -.type OPENSSL_ia32_cpuid,@function -.align 16 -OPENSSL_ia32_cpuid: - - - movq %rdi,%rdi - movq %rbx,%r8 - - xorl %eax,%eax - movl %eax,8(%rdi) - cpuid - movl %eax,%r11d - - xorl %eax,%eax - cmpl $1970169159,%ebx - setne %al - movl %eax,%r9d - cmpl $1231384169,%edx - setne %al - orl %eax,%r9d - cmpl $1818588270,%ecx - setne %al - orl %eax,%r9d - jz .Lintel - - cmpl $1752462657,%ebx - setne %al - movl %eax,%r10d - cmpl $1769238117,%edx - setne %al - orl %eax,%r10d - cmpl $1145913699,%ecx - setne %al - orl %eax,%r10d - jnz .Lintel - - - - - movl $2147483648,%eax - cpuid - - - cmpl $2147483649,%eax - jb .Lintel - movl %eax,%r10d - movl $2147483649,%eax - cpuid - - - orl %ecx,%r9d - andl $2049,%r9d - - cmpl $2147483656,%r10d - jb .Lintel - - movl $2147483656,%eax - cpuid - - movzbq %cl,%r10 - incq %r10 - - movl $1,%eax - cpuid - - btl $28,%edx - jnc .Lgeneric - shrl $16,%ebx - cmpb %r10b,%bl - ja .Lgeneric - andl $4026531839,%edx - jmp .Lgeneric - -.Lintel: - cmpl $4,%r11d - movl $-1,%r10d - jb .Lnocacheinfo - - movl $4,%eax - movl $0,%ecx - cpuid - movl %eax,%r10d - shrl $14,%r10d - andl $4095,%r10d - - cmpl $7,%r11d - jb .Lnocacheinfo - - movl $7,%eax - xorl %ecx,%ecx - cpuid - movl %ebx,8(%rdi) - -.Lnocacheinfo: - movl $1,%eax - cpuid - - andl $3220176895,%edx - cmpl $0,%r9d - jne .Lnotintel - orl $1073741824,%edx -.Lnotintel: - btl $28,%edx - jnc .Lgeneric - andl $4026531839,%edx - cmpl $0,%r10d - je .Lgeneric - - orl $268435456,%edx - shrl $16,%ebx - cmpb $1,%bl - ja .Lgeneric - andl $4026531839,%edx -.Lgeneric: - andl $2048,%r9d - andl $4294965247,%ecx - orl %ecx,%r9d - - movl %edx,%r10d - btl $27,%r9d - jnc .Lclear_avx - xorl %ecx,%ecx -.byte 0x0f,0x01,0xd0 - andl $6,%eax - cmpl $6,%eax - je .Ldone -.Lclear_avx: - movl $4026525695,%eax - andl %eax,%r9d - andl $4294967263,8(%rdi) -.Ldone: - movl %r9d,4(%rdi) - movl %r10d,0(%rdi) - movq %r8,%rbx - .byte 0xf3,0xc3 -.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid - -#endif diff --git a/third_party/serf/linux-x86_64/crypto/ec/p256-x86_64-asm.S b/third_party/serf/linux-x86_64/crypto/ec/p256-x86_64-asm.S new file mode 100644 index 0000000000..2884c69bc7 --- /dev/null +++ b/third_party/serf/linux-x86_64/crypto/ec/p256-x86_64-asm.S @@ -0,0 +1,1780 @@ +#if defined(__x86_64__) +.text +.extern OPENSSL_ia32cap_P +.hidden OPENSSL_ia32cap_P + + +.align 64 +.Lpoly: +.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 + +.LOne: +.long 1,1,1,1,1,1,1,1 +.LTwo: +.long 2,2,2,2,2,2,2,2 +.LThree: +.long 3,3,3,3,3,3,3,3 +.LONE_mont: +.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + +.type ecp_nistz256_mul_by_2,@function +.align 64 +ecp_nistz256_mul_by_2: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + addq %r8,%r8 + movq 16(%rsi),%r10 + adcq %r9,%r9 + movq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + movq %r8,%rax + adcq %r10,%r10 + adcq %r11,%r11 + movq %r9,%rdx + sbbq %r13,%r13 + + subq 0(%rsi),%r8 + movq %r10,%rcx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r12 + sbbq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + + + +.globl ecp_nistz256_neg +.hidden ecp_nistz256_neg +.type ecp_nistz256_neg,@function +.align 32 +ecp_nistz256_neg: + pushq %r12 + pushq %r13 + + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r13,%r13 + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r8,%rax + sbbq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + movq %r9,%rdx + sbbq $0,%r13 + + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_neg,.-ecp_nistz256_neg + + + + + + +.globl ecp_nistz256_mul_mont +.hidden ecp_nistz256_mul_mont +.type ecp_nistz256_mul_mont,@function +.align 32 +ecp_nistz256_mul_mont: +.Lmul_mont: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx,%rbx + movq 0(%rdx),%rax + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + call __ecp_nistz256_mul_montq +.Lmul_mont_done: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +.type __ecp_nistz256_mul_montq,@function +.align 32 +__ecp_nistz256_mul_montq: + + + movq %rax,%rbp + mulq %r9 + movq .Lpoly+8(%rip),%r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r10 + movq .Lpoly+24(%rip),%r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r11 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + xorq %r13,%r13 + movq %rdx,%r12 + + + + + + + + + + + movq %r8,%rbp + shlq $32,%r8 + mulq %r15 + shrq $32,%rbp + addq %r8,%r9 + adcq %rbp,%r10 + adcq %rax,%r11 + movq 8(%rbx),%rax + adcq %rdx,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + + movq %r9,%rbp + shlq $32,%r9 + mulq %r15 + shrq $32,%rbp + addq %r9,%r10 + adcq %rbp,%r11 + adcq %rax,%r12 + movq 16(%rbx),%rax + adcq %rdx,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + + movq %r10,%rbp + shlq $32,%r10 + mulq %r15 + shrq $32,%rbp + addq %r10,%r11 + adcq %rbp,%r12 + adcq %rax,%r13 + movq 24(%rbx),%rax + adcq %rdx,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + + movq %r11,%rbp + shlq $32,%r11 + mulq %r15 + shrq $32,%rbp + addq %r11,%r12 + adcq %rbp,%r13 + movq %r12,%rcx + adcq %rax,%r8 + adcq %rdx,%r9 + movq %r13,%rbp + adcq $0,%r10 + + + + subq $-1,%r12 + movq %r8,%rbx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rdx + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rcx,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rbx,%r8 + movq %r13,8(%rdi) + cmovcq %rdx,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq + + + + + + + + +.globl ecp_nistz256_sqr_mont +.hidden ecp_nistz256_sqr_mont +.type ecp_nistz256_sqr_mont,@function +.align 32 +ecp_nistz256_sqr_mont: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq 0(%rsi),%rax + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + + call __ecp_nistz256_sqr_montq +.Lsqr_mont_done: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +.type __ecp_nistz256_sqr_montq,@function +.align 32 +__ecp_nistz256_sqr_montq: + movq %rax,%r13 + mulq %r14 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + + mulq %r13 + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r13 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %r14 + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + + mulq %r15 + xorq %r15,%r15 + addq %rax,%r13 + movq 0(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + mulq %rax + movq %rax,%r8 + movq 8(%rsi),%rax + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r9 + adcq %rax,%r10 + movq 16(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r11 + adcq %rax,%r12 + movq 24(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r13 + adcq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r15 + + movq .Lpoly+8(%rip),%rsi + movq .Lpoly+24(%rip),%rbp + + + + + movq %r8,%rcx + shlq $32,%r8 + mulq %rbp + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %rbp + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %rbp + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %rbp + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + adcq %rax,%r10 + adcq $0,%rdx + xorq %r11,%r11 + + + + addq %r8,%r12 + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %rdx,%r15 + movq %r13,%r9 + adcq $0,%r11 + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%rcx + sbbq %rbp,%r15 + sbbq $0,%r11 + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %rcx,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq + + + + + + +.globl ecp_nistz256_from_mont +.hidden ecp_nistz256_from_mont +.type ecp_nistz256_from_mont,@function +.align 32 +ecp_nistz256_from_mont: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%rax + movq .Lpoly+24(%rip),%r13 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %rax,%r8 + movq .Lpoly+8(%rip),%r12 + + + + movq %rax,%rcx + shlq $32,%r8 + mulq %r13 + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %r13 + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %r13 + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %r13 + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + movq %r8,%rcx + adcq %rax,%r10 + movq %r9,%rsi + adcq $0,%rdx + + subq $-1,%r8 + movq %r10,%rax + sbbq %r12,%r9 + sbbq $0,%r10 + movq %rdx,%r11 + sbbq %r13,%rdx + sbbq %r13,%r13 + + cmovnzq %rcx,%r8 + cmovnzq %rsi,%r9 + movq %r8,0(%rdi) + cmovnzq %rax,%r10 + movq %r9,8(%rdi) + cmovzq %rdx,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont + + +.globl ecp_nistz256_select_w5 +.hidden ecp_nistz256_select_w5 +.type ecp_nistz256_select_w5,@function +.align 32 +ecp_nistz256_select_w5: + movdqa .LOne(%rip),%xmm0 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + + movdqa %xmm0,%xmm8 + pshufd $0,%xmm1,%xmm1 + + movq $16,%rax +.Lselect_loop_sse_w5: + + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + pcmpeqd %xmm1,%xmm15 + + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + movdqa 64(%rsi),%xmm13 + movdqa 80(%rsi),%xmm14 + leaq 96(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + pand %xmm15,%xmm13 + por %xmm12,%xmm5 + pand %xmm15,%xmm14 + por %xmm13,%xmm6 + por %xmm14,%xmm7 + + decq %rax + jnz .Lselect_loop_sse_w5 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + movdqu %xmm6,64(%rdi) + movdqu %xmm7,80(%rdi) + .byte 0xf3,0xc3 +.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 + + + +.globl ecp_nistz256_select_w7 +.hidden ecp_nistz256_select_w7 +.type ecp_nistz256_select_w7,@function +.align 32 +ecp_nistz256_select_w7: + movdqa .LOne(%rip),%xmm8 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + movdqa %xmm8,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq $64,%rax + +.Lselect_loop_sse_w7: + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + pcmpeqd %xmm1,%xmm15 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + leaq 64(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + prefetcht0 255(%rsi) + por %xmm12,%xmm5 + + decq %rax + jnz .Lselect_loop_sse_w7 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + .byte 0xf3,0xc3 +.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 +.globl ecp_nistz256_avx2_select_w7 +.hidden ecp_nistz256_avx2_select_w7 +.type ecp_nistz256_avx2_select_w7,@function +.align 32 +ecp_nistz256_avx2_select_w7: +.byte 0x0f,0x0b + .byte 0xf3,0xc3 +.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 +.type __ecp_nistz256_add_toq,@function +.align 32 +__ecp_nistz256_add_toq: + addq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq + +.type __ecp_nistz256_sub_fromq,@function +.align 32 +__ecp_nistz256_sub_fromq: + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + addq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq + +.type __ecp_nistz256_subq,@function +.align 32 +__ecp_nistz256_subq: + subq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq %r11,%r11 + + addq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + testq %r11,%r11 + + cmovnzq %rax,%r12 + cmovnzq %rbp,%r13 + cmovnzq %rcx,%r8 + cmovnzq %r10,%r9 + + .byte 0xf3,0xc3 +.size __ecp_nistz256_subq,.-__ecp_nistz256_subq + +.type __ecp_nistz256_mul_by_2q,@function +.align 32 +__ecp_nistz256_mul_by_2q: + addq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q +.globl ecp_nistz256_point_double +.hidden ecp_nistz256_point_double +.type ecp_nistz256_point_double,@function +.align 32 +ecp_nistz256_point_double: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $160+8,%rsp + + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq .Lpoly+8(%rip),%r14 + movq .Lpoly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-0(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 32(%rbx),%rax + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-0(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rax + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 0+32(%rsp),%rax + movq 8+32(%rsp),%r14 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subq + + movq 32(%rsp),%rax + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-0(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + addq $160+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +.globl ecp_nistz256_point_add +.hidden ecp_nistz256_point_add +.type ecp_nistz256_point_add,@function +.align 32 +ecp_nistz256_point_add: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $576+8,%rsp + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + por %xmm0,%xmm1 + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + por %xmm2,%xmm3 + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm1,%xmm3 + + movdqu 0(%rsi),%xmm0 + pshufd $177,%xmm3,%xmm5 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $30,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + movq %rax,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $177,%xmm3,%xmm4 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $30,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rax + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 + + leaq 64-0(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 416(%rsp),%rax + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 512(%rsp),%rax + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq 0+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 480(%rsp),%rax + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 0x3e + jnz .Ladd_proceedq +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + testq %r8,%r8 + jnz .Ladd_proceedq + testq %r9,%r9 + jz .Ladd_proceedq + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp .Ladd_doneq + +.align 32 +.Ladd_proceedq: + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq 0+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%rax + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 160(%rsp),%rax + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + movq 0(%rsi),%rax + cmovzq %rbp,%r13 + movq 8(%rsi),%rbp + cmovzq %rcx,%r8 + movq 16(%rsi),%rcx + cmovzq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +.Ladd_doneq: + addq $576+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +.globl ecp_nistz256_point_add_affine +.hidden ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,@function +.align 32 +ecp_nistz256_point_add_affine: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $480+8,%rsp + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + por %xmm0,%xmm1 + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + por %xmm2,%xmm3 + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm1,%xmm3 + + movdqu 0(%rbx),%xmm0 + pshufd $177,%xmm3,%xmm5 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $30,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $177,%xmm3,%xmm4 + movq 0(%rbx),%rax + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $30,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-0(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+96(%rsp),%rax + movq 8+96(%rsp),%r14 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq 0+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + movq 0(%rsi),%rax + cmovzq %rbp,%r13 + movq 8(%rsi),%rbp + cmovzq %rcx,%r8 + movq 16(%rsi),%rcx + cmovzq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rax + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq 0+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand .LONE_mont(%rip),%xmm2 + pand .LONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + addq $480+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +#endif diff --git a/third_party/serf/linux-x86_64/crypto/rand/rdrand-x86_64.S b/third_party/serf/linux-x86_64/crypto/rand/rdrand-x86_64.S index 622ae55c1e..94aab9c19b 100644 --- a/third_party/serf/linux-x86_64/crypto/rand/rdrand-x86_64.S +++ b/third_party/serf/linux-x86_64/crypto/rand/rdrand-x86_64.S @@ -1,11 +1,48 @@ #if defined(__x86_64__) .text + + + .globl CRYPTO_rdrand .hidden CRYPTO_rdrand .type CRYPTO_rdrand,@function .align 16 CRYPTO_rdrand: -.byte 0x48, 0x0f, 0xc7, 0xf0 + xorq %rax,%rax + + +.byte 0x48, 0x0f, 0xc7, 0xf1 + + adcq %rax,%rax + movq %rcx,0(%rdi) + .byte 0xf3,0xc3 + + + + + +.globl CRYPTO_rdrand_multiple8_buf +.hidden CRYPTO_rdrand_multiple8_buf +.type CRYPTO_rdrand_multiple8_buf,@function +.align 16 +CRYPTO_rdrand_multiple8_buf: + testq %rsi,%rsi + jz .Lout + movq $8,%rdx +.Lloop: + + +.byte 0x48, 0x0f, 0xc7, 0xf1 + jnc .Lerr + movq %rcx,0(%rdi) + addq %rdx,%rdi + subq %rdx,%rsi + jnz .Lloop +.Lout: + movq $1,%rax + .byte 0xf3,0xc3 +.Lerr: + xorq %rax,%rax .byte 0xf3,0xc3 #endif diff --git a/third_party/serf/linux-x86_64/crypto/sha/sha1-x86_64.S b/third_party/serf/linux-x86_64/crypto/sha/sha1-x86_64.S index 7668c2b1f5..d830b534de 100644 --- a/third_party/serf/linux-x86_64/crypto/sha/sha1-x86_64.S +++ b/third_party/serf/linux-x86_64/crypto/sha/sha1-x86_64.S @@ -13,6 +13,11 @@ sha1_block_data_order: movl OPENSSL_ia32cap_P+8(%rip),%r10d testl $512,%r8d jz .Lialu + andl $268435456,%r8d + andl $1073741824,%r9d + orl %r9d,%r8d + cmpl $1342177280,%r8d + je _avx_shortcut jmp _ssse3_shortcut .align 16 @@ -2408,6 +2413,1122 @@ _ssse3_shortcut: .Lepilogue_ssse3: .byte 0xf3,0xc3 .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 +.type sha1_block_data_order_avx,@function +.align 16 +sha1_block_data_order_avx: +_avx_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + leaq -64(%rsp),%rsp + vzeroupper + movq %rax,%r14 + andq $-64,%rsp + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + shlq $6,%r10 + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r11 + + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl %ebx,%esi + movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + vmovdqa 64(%r11),%xmm6 + vmovdqa -64(%r11),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm11,%xmm0,%xmm4 + vpaddd %xmm11,%xmm1,%xmm5 + vpaddd %xmm11,%xmm2,%xmm6 + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + jmp .Loop_avx +.align 16 +.Loop_avx: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%edi + addl 0(%rsp),%ebp + vpaddd %xmm3,%xmm11,%xmm9 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm8 + addl %esi,%ebp + andl %ebx,%edi + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%ebp + vpxor %xmm2,%xmm8,%xmm8 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + vpxor %xmm8,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vmovdqa %xmm9,48(%rsp) + addl %edi,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm8 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm10 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%edi + addl 8(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm4,%xmm4 + addl %esi,%ecx + andl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm4,%xmm4 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + vpxor %xmm10,%xmm4,%xmm4 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %edi,%ebx + andl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%edi + addl 16(%rsp),%eax + vpaddd %xmm4,%xmm11,%xmm9 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm8 + addl %esi,%eax + andl %ecx,%edi + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm8,%xmm8 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + vpxor %xmm8,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vmovdqa %xmm9,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm10 + vpaddd %xmm5,%xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm5,%xmm5 + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + vpxor %xmm10,%xmm5,%xmm5 + xorl %eax,%ebp + shldl $5,%edx,%edx + vmovdqa -32(%r11),%xmm11 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%edi + addl 32(%rsp),%ebx + vpaddd %xmm5,%xmm11,%xmm9 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm8 + addl %esi,%ebx + andl %edx,%edi + vpxor %xmm2,%xmm6,%xmm6 + xorl %ebp,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm8,%xmm8 + shrdl $7,%ecx,%ecx + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + vpxor %xmm8,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vmovdqa %xmm9,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm8 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm10 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%edi + addl 40(%rsp),%ebp + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm6,%xmm6 + addl %esi,%ebp + andl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + vpxor %xmm10,%xmm6,%xmm6 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + addl %edi,%edx + andl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%edi + addl 48(%rsp),%ecx + vpaddd %xmm6,%xmm11,%xmm9 + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm8 + addl %esi,%ecx + andl %ebp,%edi + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%ebp + addl %edx,%ecx + vpxor %xmm5,%xmm8,%xmm8 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + vpxor %xmm8,%xmm7,%xmm7 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vmovdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm8 + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpslldq $12,%xmm7,%xmm10 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + vpxor %xmm10,%xmm7,%xmm7 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %edi,%ebp + andl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + vpxor %xmm1,%xmm0,%xmm0 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpaddd %xmm7,%xmm11,%xmm9 + addl %esi,%edx + andl %eax,%edi + vpxor %xmm8,%xmm0,%xmm0 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + movl %edx,%esi + addl 4(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + vpor %xmm8,%xmm0,%xmm0 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm0,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm1,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm1,%xmm1 + addl 28(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + addl %esi,%eax + xorl %edx,%edi + vpaddd %xmm1,%xmm11,%xmm9 + vmovdqa 0(%r11),%xmm11 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm8,%xmm2,%xmm2 + addl 36(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpslld $2,%xmm2,%xmm2 + addl 40(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpor %xmm8,%xmm2,%xmm2 + addl 44(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpalignr $8,%xmm2,%xmm3,%xmm8 + vpxor %xmm0,%xmm4,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + addl %esi,%ecx + xorl %eax,%edi + vpaddd %xmm3,%xmm11,%xmm9 + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpxor %xmm8,%xmm4,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm8 + vmovdqa %xmm9,48(%rsp) + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm8,%xmm4,%xmm4 + addl 12(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm3,%xmm4,%xmm8 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpxor %xmm6,%xmm5,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + vpaddd %xmm4,%xmm11,%xmm9 + shrdl $7,%eax,%eax + addl %ebp,%edx + vpxor %xmm8,%xmm5,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm8,%xmm5,%xmm5 + addl 28(%rsp),%eax + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm8 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + movl %eax,%edi + xorl %ecx,%esi + vpaddd %xmm5,%xmm11,%xmm9 + shldl $5,%eax,%eax + addl %esi,%ebp + vpxor %xmm8,%xmm6,%xmm6 + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 36(%rsp),%edx + vpsrld $30,%xmm6,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + addl 40(%rsp),%ecx + andl %eax,%esi + vpor %xmm8,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%edi + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm8 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + movl %ebx,%edi + xorl %edx,%esi + vpaddd %xmm6,%xmm11,%xmm9 + vmovdqa 32(%r11),%xmm11 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm8,%xmm7,%xmm7 + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%rsp),%ebp + vpsrld $30,%xmm7,%xmm8 + vmovdqa %xmm9,32(%rsp) + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + addl 56(%rsp),%edx + andl %ebx,%esi + vpor %xmm8,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%edi + xorl %ebx,%esi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + movl %ecx,%edi + xorl %ebp,%esi + vpaddd %xmm7,%xmm11,%xmm9 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm8,%xmm0,%xmm0 + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 4(%rsp),%eax + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%rsp),%ebp + andl %ecx,%esi + vpor %xmm8,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%edi + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + vpxor %xmm2,%xmm1,%xmm1 + movl %edx,%edi + xorl %eax,%esi + vpaddd %xmm0,%xmm11,%xmm9 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 20(%rsp),%ebx + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + addl 24(%rsp),%eax + andl %edx,%esi + vpor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%edi + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + movl %ebp,%edi + xorl %ebx,%esi + vpaddd %xmm1,%xmm11,%xmm9 + shldl $5,%ebp,%ebp + addl %esi,%edx + vpxor %xmm8,%xmm2,%xmm2 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 36(%rsp),%ecx + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + addl 40(%rsp),%ebx + andl %ebp,%esi + vpor %xmm8,%xmm2,%xmm2 + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%edi + xorl %ebp,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + vpaddd %xmm3,%xmm11,%xmm9 + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm9,48(%rsp) + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + cmpq %r10,%r9 + je .Ldone_avx + vmovdqa 64(%r11),%xmm6 + vmovdqa -64(%r11),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + addl 16(%rsp),%ebx + xorl %ebp,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpaddd %xmm11,%xmm0,%xmm4 + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,0(%rsp) + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%edi + shldl $5,%edx,%edx + vpaddd %xmm11,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vmovdqa %xmm5,16(%rsp) + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpaddd %xmm11,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vmovdqa %xmm6,32(%rsp) + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + addl 12(%r8),%edx + movl %eax,0(%r8) + addl 16(%r8),%ebp + movl %esi,4(%r8) + movl %esi,%ebx + movl %ecx,8(%r8) + movl %ecx,%edi + movl %edx,12(%r8) + xorl %edx,%edi + movl %ebp,16(%r8) + andl %edi,%esi + jmp .Loop_avx + +.align 16 +.Ldone_avx: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroupper + + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + movl %eax,0(%r8) + addl 12(%r8),%edx + movl %esi,4(%r8) + addl 16(%r8),%ebp + movl %ecx,8(%r8) + movl %edx,12(%r8) + movl %ebp,16(%r8) + leaq (%r14),%rsi + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.size sha1_block_data_order_avx,.-sha1_block_data_order_avx .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 diff --git a/third_party/serf/linux-x86_64/crypto/sha/sha256-x86_64.S b/third_party/serf/linux-x86_64/crypto/sha/sha256-x86_64.S index f526de51ad..445b497e88 100644 --- a/third_party/serf/linux-x86_64/crypto/sha/sha256-x86_64.S +++ b/third_party/serf/linux-x86_64/crypto/sha/sha256-x86_64.S @@ -12,6 +12,11 @@ sha256_block_data_order: movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je .Lavx_shortcut testl $512,%r10d jnz .Lssse3_shortcut pushq %rbx @@ -2841,4 +2846,1061 @@ sha256_block_data_order_ssse3: .Lepilogue_ssse3: .byte 0xf3,0xc3 .size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 +.type sha256_block_data_order_avx,@function +.align 64 +sha256_block_data_order_avx: +.Lavx_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %r11,64+24(%rsp) +.Lprologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lavx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_avx + + movq 64+24(%rsp),%rsi + vzeroupper + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.size sha256_block_data_order_avx,.-sha256_block_data_order_avx #endif diff --git a/third_party/serf/linux-x86_64/crypto/sha/sha512-x86_64.S b/third_party/serf/linux-x86_64/crypto/sha/sha512-x86_64.S index ca3a3a1644..d65743fd52 100644 --- a/third_party/serf/linux-x86_64/crypto/sha/sha512-x86_64.S +++ b/third_party/serf/linux-x86_64/crypto/sha/sha512-x86_64.S @@ -8,6 +8,17 @@ .type sha512_block_data_order,@function .align 16 sha512_block_data_order: + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 0(%r11),%r9d + movl 4(%r11),%r10d + movl 8(%r11),%r11d + testl $2048,%r10d + jnz .Lxop_shortcut + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je .Lavx_shortcut pushq %rbx pushq %rbp pushq %r12 @@ -1784,4 +1795,2234 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.type sha512_block_data_order_xop,@function +.align 64 +sha512_block_data_order_xop: +.Lxop_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %r11,128+24(%rsp) +.Lprologue_xop: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_xop +.align 16 +.Lloop_xop: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lxop_00_47 + +.align 16 +.Lxop_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm0,%xmm0 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,223,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm7,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm0,%xmm0 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm1,%xmm1 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,216,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm0,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm1,%xmm1 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm2,%xmm2 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,217,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm1,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm2,%xmm2 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm3,%xmm3 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,218,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm2,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm3,%xmm3 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm4,%xmm4 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,219,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm3,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm4,%xmm4 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm5,%xmm5 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,220,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm4,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm5,%xmm5 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm6,%xmm6 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,221,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm5,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm6,%xmm6 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm7,%xmm7 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,222,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm6,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm7,%xmm7 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lxop_00_47 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_xop + + movq 128+24(%rsp),%rsi + vzeroupper + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lepilogue_xop: + .byte 0xf3,0xc3 +.size sha512_block_data_order_xop,.-sha512_block_data_order_xop +.type sha512_block_data_order_avx,@function +.align 64 +sha512_block_data_order_avx: +.Lavx_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %r11,128+24(%rsp) +.Lprologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lavx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_avx + + movq 128+24(%rsp),%rsi + vzeroupper + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.size sha512_block_data_order_avx,.-sha512_block_data_order_avx #endif diff --git a/third_party/serf/mac-x86/crypto/cpu-x86-asm.S b/third_party/serf/mac-x86/crypto/cpu-x86-asm.S deleted file mode 100644 index bfb292c893..0000000000 --- a/third_party/serf/mac-x86/crypto/cpu-x86-asm.S +++ /dev/null @@ -1,309 +0,0 @@ -#if defined(__i386__) -.file "crypto/cpu-x86-asm.S" -.text -.globl _OPENSSL_ia32_cpuid -.private_extern _OPENSSL_ia32_cpuid -.align 4 -_OPENSSL_ia32_cpuid: -L_OPENSSL_ia32_cpuid_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - xorl %edx,%edx - pushfl - popl %eax - movl %eax,%ecx - xorl $2097152,%eax - pushl %eax - popfl - pushfl - popl %eax - xorl %eax,%ecx - xorl %eax,%eax - btl $21,%ecx - jnc L000nocpuid - movl 20(%esp),%esi - movl %eax,8(%esi) - .byte 0x0f,0xa2 - movl %eax,%edi - xorl %eax,%eax - cmpl $1970169159,%ebx - setne %al - movl %eax,%ebp - cmpl $1231384169,%edx - setne %al - orl %eax,%ebp - cmpl $1818588270,%ecx - setne %al - orl %eax,%ebp - jz L001intel - cmpl $1752462657,%ebx - setne %al - movl %eax,%esi - cmpl $1769238117,%edx - setne %al - orl %eax,%esi - cmpl $1145913699,%ecx - setne %al - orl %eax,%esi - jnz L001intel - movl $2147483648,%eax - .byte 0x0f,0xa2 - cmpl $2147483649,%eax - jb L001intel - movl %eax,%esi - movl $2147483649,%eax - .byte 0x0f,0xa2 - orl %ecx,%ebp - andl $2049,%ebp - cmpl $2147483656,%esi - jb L001intel - movl $2147483656,%eax - .byte 0x0f,0xa2 - movzbl %cl,%esi - incl %esi - movl $1,%eax - xorl %ecx,%ecx - .byte 0x0f,0xa2 - btl $28,%edx - jnc L002generic - shrl $16,%ebx - andl $255,%ebx - cmpl %esi,%ebx - ja L002generic - andl $4026531839,%edx - jmp L002generic -L001intel: - cmpl $7,%edi - jb L003cacheinfo - movl 20(%esp),%esi - movl $7,%eax - xorl %ecx,%ecx - .byte 0x0f,0xa2 - movl %ebx,8(%esi) -L003cacheinfo: - cmpl $4,%edi - movl $-1,%edi - jb L004nocacheinfo - movl $4,%eax - movl $0,%ecx - .byte 0x0f,0xa2 - movl %eax,%edi - shrl $14,%edi - andl $4095,%edi -L004nocacheinfo: - movl $1,%eax - xorl %ecx,%ecx - .byte 0x0f,0xa2 - andl $3220176895,%edx - cmpl $0,%ebp - jne L005notintel - orl $1073741824,%edx -L005notintel: - btl $28,%edx - jnc L002generic - andl $4026531839,%edx - cmpl $0,%edi - je L002generic - orl $268435456,%edx - shrl $16,%ebx - cmpb $1,%bl - ja L002generic - andl $4026531839,%edx -L002generic: - andl $2048,%ebp - andl $4294965247,%ecx - movl %edx,%esi - orl %ecx,%ebp - btl $27,%ecx - jnc L006clear_avx - xorl %ecx,%ecx -.byte 15,1,208 - andl $6,%eax - cmpl $6,%eax - je L007done - cmpl $2,%eax - je L006clear_avx -L008clear_xmm: - andl $4261412861,%ebp - andl $4278190079,%esi -L006clear_avx: - andl $4026525695,%ebp - movl 20(%esp),%edi - andl $4294967263,8(%edi) -L007done: - movl %esi,%eax - movl %ebp,%edx -L000nocpuid: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _OPENSSL_rdtsc -.private_extern _OPENSSL_rdtsc -.align 4 -_OPENSSL_rdtsc: -L_OPENSSL_rdtsc_begin: - xorl %eax,%eax - xorl %edx,%edx - call L009PIC_me_up -L009PIC_me_up: - popl %ecx - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L009PIC_me_up(%ecx),%ecx - btl $4,(%ecx) - jnc L010notsc - .byte 0x0f,0x31 -L010notsc: - ret -.globl _OPENSSL_instrument_halt -.private_extern _OPENSSL_instrument_halt -.align 4 -_OPENSSL_instrument_halt: -L_OPENSSL_instrument_halt_begin: - call L011PIC_me_up -L011PIC_me_up: - popl %ecx - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L011PIC_me_up(%ecx),%ecx - btl $4,(%ecx) - jnc L012nohalt -.long 2421723150 - andl $3,%eax - jnz L012nohalt - pushfl - popl %eax - btl $9,%eax - jnc L012nohalt - .byte 0x0f,0x31 - pushl %edx - pushl %eax - hlt - .byte 0x0f,0x31 - subl (%esp),%eax - sbbl 4(%esp),%edx - addl $8,%esp - ret -L012nohalt: - xorl %eax,%eax - xorl %edx,%edx - ret -.globl _OPENSSL_far_spin -.private_extern _OPENSSL_far_spin -.align 4 -_OPENSSL_far_spin: -L_OPENSSL_far_spin_begin: - pushfl - popl %eax - btl $9,%eax - jnc L013nospin - movl 4(%esp),%eax - movl 8(%esp),%ecx -.long 2430111262 - xorl %eax,%eax - movl (%ecx),%edx - jmp L014spin -.align 4,0x90 -L014spin: - incl %eax - cmpl (%ecx),%edx - je L014spin -.long 529567888 - ret -L013nospin: - xorl %eax,%eax - xorl %edx,%edx - ret -.globl _OPENSSL_wipe_cpu -.private_extern _OPENSSL_wipe_cpu -.align 4 -_OPENSSL_wipe_cpu: -L_OPENSSL_wipe_cpu_begin: - xorl %eax,%eax - xorl %edx,%edx - call L015PIC_me_up -L015PIC_me_up: - popl %ecx - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L015PIC_me_up(%ecx),%ecx - movl (%ecx),%ecx - btl $1,(%ecx) - jnc L016no_x87 - andl $83886080,%ecx - cmpl $83886080,%ecx - jne L017no_sse2 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 -L017no_sse2: -.long 4007259865,4007259865,4007259865,4007259865,2430851995 -L016no_x87: - leal 4(%esp),%eax - ret -.globl _OPENSSL_atomic_add -.private_extern _OPENSSL_atomic_add -.align 4 -_OPENSSL_atomic_add: -L_OPENSSL_atomic_add_begin: - movl 4(%esp),%edx - movl 8(%esp),%ecx - pushl %ebx - nop - movl (%edx),%eax -L018spin: - leal (%eax,%ecx,1),%ebx - nop -.long 447811568 - jne L018spin - movl %ebx,%eax - popl %ebx - ret -.globl _OPENSSL_indirect_call -.private_extern _OPENSSL_indirect_call -.align 4 -_OPENSSL_indirect_call: -L_OPENSSL_indirect_call_begin: - pushl %ebp - movl %esp,%ebp - subl $28,%esp - movl 12(%ebp),%ecx - movl %ecx,(%esp) - movl 16(%ebp),%edx - movl %edx,4(%esp) - movl 20(%ebp),%eax - movl %eax,8(%esp) - movl 24(%ebp),%eax - movl %eax,12(%esp) - movl 28(%ebp),%eax - movl %eax,16(%esp) - movl 32(%ebp),%eax - movl %eax,20(%esp) - movl 36(%ebp),%eax - movl %eax,24(%esp) - call *8(%ebp) - movl %ebp,%esp - popl %ebp - ret -.globl _OPENSSL_ia32_rdrand -.private_extern _OPENSSL_ia32_rdrand -.align 4 -_OPENSSL_ia32_rdrand: -L_OPENSSL_ia32_rdrand_begin: - movl $8,%ecx -L019loop: -.byte 15,199,240 - jc L020break - loop L019loop -L020break: - cmpl $0,%eax - cmovel %ecx,%eax - ret -.section __IMPORT,__pointers,non_lazy_symbol_pointers -L_OPENSSL_ia32cap_P$non_lazy_ptr: -.indirect_symbol _OPENSSL_ia32cap_P -.long 0 -#endif diff --git a/third_party/serf/mac-x86/crypto/sha/sha1-586.S b/third_party/serf/mac-x86/crypto/sha/sha1-586.S index 97aafbf198..72a7205dea 100644 --- a/third_party/serf/mac-x86/crypto/sha/sha1-586.S +++ b/third_party/serf/mac-x86/crypto/sha/sha1-586.S @@ -22,8 +22,11 @@ L000pic_point: movl 8(%esi),%ecx testl $16777216,%eax jz L001x86 - testl $536870912,%ecx - jnz Lshaext_shortcut + andl $268435456,%edx + andl $1073741824,%eax + orl %edx,%eax + cmpl $1342177280,%eax + je Lavx_shortcut jmp Lssse3_shortcut .align 4,0x90 L001x86: @@ -1391,9 +1394,9 @@ L002loop: popl %ebx popl %ebp ret -.private_extern __sha1_block_data_order_shaext +.private_extern __sha1_block_data_order_ssse3 .align 4 -__sha1_block_data_order_shaext: +__sha1_block_data_order_ssse3: pushl %ebp pushl %ebx pushl %esi @@ -1402,175 +1405,6 @@ __sha1_block_data_order_shaext: L003pic_point: popl %ebp leal LK_XX_XX-L003pic_point(%ebp),%ebp -Lshaext_shortcut: - movl 20(%esp),%edi - movl %esp,%ebx - movl 24(%esp),%esi - movl 28(%esp),%ecx - subl $32,%esp - movdqu (%edi),%xmm0 - movd 16(%edi),%xmm1 - andl $-32,%esp - movdqa 80(%ebp),%xmm3 - movdqu (%esi),%xmm4 - pshufd $27,%xmm0,%xmm0 - movdqu 16(%esi),%xmm5 - pshufd $27,%xmm1,%xmm1 - movdqu 32(%esi),%xmm6 -.byte 102,15,56,0,227 - movdqu 48(%esi),%xmm7 -.byte 102,15,56,0,235 -.byte 102,15,56,0,243 -.byte 102,15,56,0,251 - jmp L004loop_shaext -.align 4,0x90 -L004loop_shaext: - decl %ecx - leal 64(%esi),%eax - movdqa %xmm1,(%esp) - paddd %xmm4,%xmm1 - cmovnel %eax,%esi - movdqa %xmm0,16(%esp) -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 - movdqu (%esi),%xmm4 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,213 - movdqu 16(%esi),%xmm5 -.byte 102,15,56,0,227 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,206 - movdqu 32(%esi),%xmm6 -.byte 102,15,56,0,235 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,215 - movdqu 48(%esi),%xmm7 -.byte 102,15,56,0,243 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 - movdqa (%esp),%xmm2 -.byte 102,15,56,0,251 -.byte 15,56,200,202 - paddd 16(%esp),%xmm0 - jnz L004loop_shaext - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm1,%xmm1 - movdqu %xmm0,(%edi) - movd %xmm1,16(%edi) - movl %ebx,%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.private_extern __sha1_block_data_order_ssse3 -.align 4 -__sha1_block_data_order_ssse3: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - call L005pic_point -L005pic_point: - popl %ebp - leal LK_XX_XX-L005pic_point(%ebp),%ebp Lssse3_shortcut: movdqa (%ebp),%xmm7 movdqa 16(%ebp),%xmm0 @@ -1623,9 +1457,9 @@ Lssse3_shortcut: xorl %edx,%ebp pshufd $238,%xmm0,%xmm4 andl %ebp,%esi - jmp L006loop + jmp L004loop .align 4,0x90 -L006loop: +L004loop: rorl $2,%ebx xorl %edx,%esi movl %eax,%ebp @@ -2528,7 +2362,7 @@ L006loop: addl %edx,%ecx movl 196(%esp),%ebp cmpl 200(%esp),%ebp - je L007done + je L005done movdqa 160(%esp),%xmm7 movdqa 176(%esp),%xmm6 movdqu (%ebp),%xmm0 @@ -2663,9 +2497,9 @@ L006loop: pshufd $238,%xmm0,%xmm4 andl %ebx,%esi movl %ebp,%ebx - jmp L006loop + jmp L004loop .align 4,0x90 -L007done: +L005done: addl 16(%esp),%ebx xorl %edi,%esi movl %ecx,%ebp @@ -2778,6 +2612,1175 @@ L007done: popl %ebx popl %ebp ret +.private_extern __sha1_block_data_order_avx +.align 4 +__sha1_block_data_order_avx: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call L006pic_point +L006pic_point: + popl %ebp + leal LK_XX_XX-L006pic_point(%ebp),%ebp +Lavx_shortcut: + vzeroall + vmovdqa (%ebp),%xmm7 + vmovdqa 16(%ebp),%xmm0 + vmovdqa 32(%ebp),%xmm1 + vmovdqa 48(%ebp),%xmm2 + vmovdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp + movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + vmovdqa %xmm0,112(%esp) + vmovdqa %xmm1,128(%esp) + vmovdqa %xmm2,144(%esp) + shll $6,%edx + vmovdqa %xmm7,160(%esp) + addl %ebp,%edx + vmovdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + vmovdqu -64(%ebp),%xmm0 + vmovdqu -48(%ebp),%xmm1 + vmovdqu -32(%ebp),%xmm2 + vmovdqu -16(%ebp),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vmovdqa %xmm7,96(%esp) + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm7,%xmm0,%xmm4 + vpaddd %xmm7,%xmm1,%xmm5 + vpaddd %xmm7,%xmm2,%xmm6 + vmovdqa %xmm4,(%esp) + movl %ecx,%ebp + vmovdqa %xmm5,16(%esp) + xorl %edx,%ebp + vmovdqa %xmm6,32(%esp) + andl %ebp,%esi + jmp L007loop +.align 4,0x90 +L007loop: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%ebp + addl (%esp),%edi + vpaddd %xmm3,%xmm7,%xmm7 + vmovdqa %xmm0,64(%esp) + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%edi + vpxor %xmm2,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vmovdqa %xmm7,48(%esp) + movl %edi,%esi + addl 4(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%edi,%edi + addl %ebp,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm6 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm0 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrld $30,%xmm0,%xmm7 + vpor %xmm6,%xmm4,%xmm4 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + vpslld $2,%xmm0,%xmm0 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vpxor %xmm7,%xmm4,%xmm4 + movl %ecx,%esi + addl 12(%esp),%ebx + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpxor %xmm0,%xmm4,%xmm4 + addl %ebp,%ebx + andl %edx,%esi + vmovdqa 96(%esp),%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%ebp + addl 16(%esp),%eax + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqa %xmm1,80(%esp) + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vmovdqa %xmm0,(%esp) + movl %eax,%esi + addl 20(%esp),%edi + vpxor %xmm7,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %ebp,%edi + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm7 + xorl %ecx,%ebx + addl %eax,%edi + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm1 + vpaddd %xmm5,%xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm0 + vpor %xmm7,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpxor %xmm0,%xmm5,%xmm5 + movl %edx,%esi + addl 28(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpxor %xmm1,%xmm5,%xmm5 + addl %ebp,%ecx + andl %edi,%esi + vmovdqa 112(%esp),%xmm1 + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%ebp + addl 32(%esp),%ebx + vpaddd %xmm5,%xmm1,%xmm1 + vmovdqa %xmm2,96(%esp) + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + vpxor %xmm2,%xmm6,%xmm6 + xorl %edi,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%ecx,%ecx + xorl %edi,%ebp + vmovdqa %xmm1,16(%esp) + movl %ebx,%esi + addl 36(%esp),%eax + vpxor %xmm0,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + addl %ebp,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm0 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm2 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%ebp + addl 40(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm1 + vpor %xmm0,%xmm6,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + vmovdqa 64(%esp),%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vpxor %xmm1,%xmm6,%xmm6 + movl %edi,%esi + addl 44(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpxor %xmm2,%xmm6,%xmm6 + addl %ebp,%edx + andl %eax,%esi + vmovdqa 112(%esp),%xmm2 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%ebp + addl 48(%esp),%ecx + vpaddd %xmm6,%xmm2,%xmm2 + vmovdqa %xmm3,64(%esp) + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm1 + addl %esi,%ecx + andl %edi,%ebp + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%edi + addl %edx,%ecx + vpxor %xmm5,%xmm1,%xmm1 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vmovdqa %xmm2,32(%esp) + movl %ecx,%esi + addl 52(%esp),%ebx + vpxor %xmm1,%xmm7,%xmm7 + xorl %edi,%edx + shldl $5,%ecx,%ecx + addl %ebp,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm1 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpslldq $12,%xmm7,%xmm3 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm2 + vpor %xmm1,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + vmovdqa 80(%esp),%xmm1 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vpxor %xmm2,%xmm7,%xmm7 + movl %eax,%esi + addl 60(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpxor %xmm3,%xmm7,%xmm7 + addl %ebp,%edi + andl %ebx,%esi + vmovdqa 112(%esp),%xmm3 + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,80(%esp) + xorl %ebx,%eax + shldl $5,%edi,%edi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + addl %esi,%edx + andl %eax,%ebp + vpxor %xmm2,%xmm0,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + vpor %xmm2,%xmm0,%xmm0 + xorl %edi,%edx + shldl $5,%ecx,%ecx + vmovdqa 96(%esp),%xmm2 + addl %esi,%ebx + andl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm3,%xmm1,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm3,%xmm1,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + vmovdqa 64(%esp),%xmm3 + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + vmovdqa 128(%esp),%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm4,%xmm2,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vpor %xmm4,%xmm2,%xmm2 + addl 44(%esp),%ecx + xorl %eax,%ebp + vmovdqa 80(%esp),%xmm4 + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,80(%esp) + addl %esi,%ebx + xorl %edi,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + vmovdqa 96(%esp),%xmm5 + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpalignr $8,%xmm2,%xmm3,%xmm6 + vpxor %xmm0,%xmm4,%xmm4 + addl (%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + vmovdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + vmovdqa %xmm7,%xmm0 + vpaddd %xmm3,%xmm7,%xmm7 + shrdl $7,%edi,%edi + addl %edx,%ecx + vpxor %xmm6,%xmm4,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm6 + vmovdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm6,%xmm4,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + vmovdqa 64(%esp),%xmm6 + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpalignr $8,%xmm3,%xmm4,%xmm7 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + vpxor %xmm6,%xmm5,%xmm5 + vmovdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + vmovdqa %xmm0,%xmm1 + vpaddd %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + addl %edi,%edx + vpxor %xmm7,%xmm5,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm7 + vmovdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm7,%xmm5,%xmm5 + addl 28(%esp),%eax + vmovdqa 80(%esp),%xmm7 + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + vmovdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + vmovdqa %xmm1,%xmm2 + vpaddd %xmm5,%xmm1,%xmm1 + shldl $5,%eax,%eax + addl %esi,%edi + vpxor %xmm0,%xmm6,%xmm6 + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + vpsrld $30,%xmm6,%xmm0 + vmovdqa %xmm1,16(%esp) + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + vpor %xmm0,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%edi,%edi + vmovdqa 96(%esp),%xmm0 + movl %edx,%ebp + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm1 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + vmovdqa %xmm3,96(%esp) + movl %ebx,%ebp + xorl %edx,%esi + vmovdqa 144(%esp),%xmm3 + vpaddd %xmm6,%xmm2,%xmm2 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + vpsrld $30,%xmm7,%xmm1 + vmovdqa %xmm2,32(%esp) + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + vpor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vmovdqa 64(%esp),%xmm1 + movl %edi,%ebp + xorl %ebx,%esi + shldl $5,%edi,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + addl (%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm2,%xmm0,%xmm0 + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + vpor %xmm2,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vmovdqa 80(%esp),%xmm2 + movl %eax,%ebp + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%edi,%edi + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm3,%xmm1,%xmm1 + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + vpor %xmm3,%xmm1,%xmm1 + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vmovdqa 96(%esp),%xmm3 + movl %ebx,%ebp + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + vmovdqa %xmm5,%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shldl $5,%edi,%edi + addl %esi,%edx + vpxor %xmm4,%xmm2,%xmm2 + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + vpor %xmm4,%xmm2,%xmm2 + xorl %eax,%edi + shrdl $7,%edx,%edx + vmovdqa 64(%esp),%xmm4 + movl %ecx,%ebp + xorl %edi,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl (%esp),%eax + vpaddd %xmm3,%xmm7,%xmm7 + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm7,48(%esp) + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je L008done + vmovdqa 160(%esp),%xmm7 + vmovdqa 176(%esp),%xmm6 + vmovdqu (%ebp),%xmm0 + vmovdqu 16(%ebp),%xmm1 + vmovdqu 32(%ebp),%xmm2 + vmovdqu 48(%ebp),%xmm3 + addl $64,%ebp + vpshufb %xmm6,%xmm0,%xmm0 + movl %ebp,196(%esp) + vmovdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpaddd %xmm7,%xmm0,%xmm4 + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,(%esp) + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%ebp + shldl $5,%edx,%edx + vpaddd %xmm7,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vmovdqa %xmm5,16(%esp) + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %edi,%ebp + shldl $5,%edi,%edi + vpaddd %xmm7,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vmovdqa %xmm6,32(%esp) + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,%ebx + movl %ecx,8(%ebp) + xorl %edx,%ebx + movl %edx,12(%ebp) + movl %edi,16(%ebp) + movl %esi,%ebp + andl %ebx,%esi + movl %ebp,%ebx + jmp L007loop +.align 4,0x90 +L008done: + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroall + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret .align 6,0x90 LK_XX_XX: .long 1518500249,1518500249,1518500249,1518500249 diff --git a/third_party/serf/mac-x86/crypto/sha/sha256-586.S b/third_party/serf/mac-x86/crypto/sha/sha256-586.S index f0ba612fab..841854f7a9 100644 --- a/third_party/serf/mac-x86/crypto/sha/sha256-586.S +++ b/third_party/serf/mac-x86/crypto/sha/sha256-586.S @@ -36,11 +36,10 @@ L000pic_point: jz L003no_xmm andl $1073741824,%ecx andl $268435968,%ebx - testl $536870912,%edx - jnz L004shaext orl %ebx,%ecx andl $1342177280,%ecx cmpl $1342177280,%ecx + je L004AVX testl $512,%ebx jnz L005SSSE3 L003no_xmm: @@ -3166,204 +3165,6 @@ L009grand_loop: popl %ebp ret .align 5,0x90 -L004shaext: - subl $32,%esp - movdqu (%esi),%xmm1 - leal 128(%ebp),%ebp - movdqu 16(%esi),%xmm2 - movdqa 128(%ebp),%xmm7 - pshufd $27,%xmm1,%xmm0 - pshufd $177,%xmm1,%xmm1 - pshufd $27,%xmm2,%xmm2 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - jmp L010loop_shaext -.align 4,0x90 -L010loop_shaext: - movdqu (%edi),%xmm3 - movdqu 16(%edi),%xmm4 - movdqu 32(%edi),%xmm5 -.byte 102,15,56,0,223 - movdqu 48(%edi),%xmm6 - movdqa %xmm2,16(%esp) - movdqa -128(%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 102,15,56,0,231 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - nop - movdqa %xmm1,(%esp) -.byte 15,56,203,202 - movdqa -112(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 102,15,56,0,239 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - leal 64(%edi),%edi -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa -96(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 102,15,56,0,247 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa -80(%ebp),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa -64(%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa -48(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa -32(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa -16(%ebp),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa (%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 16(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 32(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 48(%ebp),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 64(%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 80(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 -.byte 15,56,203,202 - paddd %xmm7,%xmm6 - movdqa 96(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 -.byte 15,56,205,245 - movdqa 128(%ebp),%xmm7 -.byte 15,56,203,202 - movdqa 112(%ebp),%xmm0 - paddd %xmm6,%xmm0 - nop -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - cmpl %edi,%eax - nop -.byte 15,56,203,202 - paddd 16(%esp),%xmm2 - paddd (%esp),%xmm1 - jnz L010loop_shaext - pshufd $177,%xmm2,%xmm2 - pshufd $27,%xmm1,%xmm7 - pshufd $177,%xmm1,%xmm1 - punpckhqdq %xmm2,%xmm1 -.byte 102,15,58,15,215,8 - movl 44(%esp),%esp - movdqu %xmm1,(%esi) - movdqu %xmm2,16(%esi) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 5,0x90 L005SSSE3: leal -96(%esp),%esp movl (%esi),%eax @@ -3383,9 +3184,9 @@ L005SSSE3: movl %ecx,24(%esp) movl %esi,28(%esp) movdqa 256(%ebp),%xmm7 - jmp L011grand_ssse3 + jmp L010grand_ssse3 .align 4,0x90 -L011grand_ssse3: +L010grand_ssse3: movdqu (%edi),%xmm0 movdqu 16(%edi),%xmm1 movdqu 32(%edi),%xmm2 @@ -3408,9 +3209,9 @@ L011grand_ssse3: paddd %xmm3,%xmm7 movdqa %xmm6,64(%esp) movdqa %xmm7,80(%esp) - jmp L012ssse3_00_47 + jmp L011ssse3_00_47 .align 4,0x90 -L012ssse3_00_47: +L011ssse3_00_47: addl $64,%ebp movl %edx,%ecx movdqa %xmm1,%xmm4 @@ -4053,7 +3854,7 @@ L012ssse3_00_47: addl %ecx,%eax movdqa %xmm6,80(%esp) cmpl $66051,64(%ebp) - jne L012ssse3_00_47 + jne L011ssse3_00_47 movl %edx,%ecx rorl $14,%edx movl 20(%esp),%esi @@ -4567,8 +4368,1189 @@ L012ssse3_00_47: movdqa 64(%ebp),%xmm7 subl $192,%ebp cmpl 104(%esp),%edi - jb L011grand_ssse3 + jb L010grand_ssse3 + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 5,0x90 +L004AVX: + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp L012grand_avx +.align 5,0x90 +L012grand_avx: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp L013avx_00_47 +.align 4,0x90 +L013avx_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm2,%xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm0,%xmm0 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm0,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd (%ebp),%xmm0,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm3,%xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm1,%xmm1 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm1,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm0,%xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm2,%xmm2 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm2,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm1,%xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm3,%xmm3 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm3,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne L013avx_00_47 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb L012grand_avx movl 108(%esp),%esp + vzeroall popl %edi popl %esi popl %ebx diff --git a/third_party/serf/mac-x86_64/crypto/bn/x86_64-mont5.S b/third_party/serf/mac-x86_64/crypto/bn/x86_64-mont5.S index 2e8f469c14..461bfb2233 100644 --- a/third_party/serf/mac-x86_64/crypto/bn/x86_64-mont5.S +++ b/third_party/serf/mac-x86_64/crypto/bn/x86_64-mont5.S @@ -1560,6 +1560,15 @@ L$8x_tail: .p2align 5 L$8x_tail_done: addq (%rdx),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + xorq %rax,%rax negq %rsi diff --git a/third_party/serf/mac-x86_64/crypto/cpu-x86_64-asm.S b/third_party/serf/mac-x86_64/crypto/cpu-x86_64-asm.S deleted file mode 100644 index 0dde04d8fc..0000000000 --- a/third_party/serf/mac-x86_64/crypto/cpu-x86_64-asm.S +++ /dev/null @@ -1,143 +0,0 @@ -#if defined(__x86_64__) -.text - -.globl _OPENSSL_ia32_cpuid -.private_extern _OPENSSL_ia32_cpuid - -.p2align 4 -_OPENSSL_ia32_cpuid: - - - movq %rdi,%rdi - movq %rbx,%r8 - - xorl %eax,%eax - movl %eax,8(%rdi) - cpuid - movl %eax,%r11d - - xorl %eax,%eax - cmpl $1970169159,%ebx - setne %al - movl %eax,%r9d - cmpl $1231384169,%edx - setne %al - orl %eax,%r9d - cmpl $1818588270,%ecx - setne %al - orl %eax,%r9d - jz L$intel - - cmpl $1752462657,%ebx - setne %al - movl %eax,%r10d - cmpl $1769238117,%edx - setne %al - orl %eax,%r10d - cmpl $1145913699,%ecx - setne %al - orl %eax,%r10d - jnz L$intel - - - - - movl $2147483648,%eax - cpuid - - - cmpl $2147483649,%eax - jb L$intel - movl %eax,%r10d - movl $2147483649,%eax - cpuid - - - orl %ecx,%r9d - andl $2049,%r9d - - cmpl $2147483656,%r10d - jb L$intel - - movl $2147483656,%eax - cpuid - - movzbq %cl,%r10 - incq %r10 - - movl $1,%eax - cpuid - - btl $28,%edx - jnc L$generic - shrl $16,%ebx - cmpb %r10b,%bl - ja L$generic - andl $4026531839,%edx - jmp L$generic - -L$intel: - cmpl $4,%r11d - movl $-1,%r10d - jb L$nocacheinfo - - movl $4,%eax - movl $0,%ecx - cpuid - movl %eax,%r10d - shrl $14,%r10d - andl $4095,%r10d - - cmpl $7,%r11d - jb L$nocacheinfo - - movl $7,%eax - xorl %ecx,%ecx - cpuid - movl %ebx,8(%rdi) - -L$nocacheinfo: - movl $1,%eax - cpuid - - andl $3220176895,%edx - cmpl $0,%r9d - jne L$notintel - orl $1073741824,%edx -L$notintel: - btl $28,%edx - jnc L$generic - andl $4026531839,%edx - cmpl $0,%r10d - je L$generic - - orl $268435456,%edx - shrl $16,%ebx - cmpb $1,%bl - ja L$generic - andl $4026531839,%edx -L$generic: - andl $2048,%r9d - andl $4294965247,%ecx - orl %ecx,%r9d - - movl %edx,%r10d - btl $27,%r9d - jnc L$clear_avx - xorl %ecx,%ecx -.byte 0x0f,0x01,0xd0 - andl $6,%eax - cmpl $6,%eax - je L$done -L$clear_avx: - movl $4026525695,%eax - andl %eax,%r9d - andl $4294967263,8(%rdi) -L$done: - movl %r9d,4(%rdi) - movl %r10d,0(%rdi) - movq %r8,%rbx - .byte 0xf3,0xc3 - - -#endif diff --git a/third_party/serf/mac-x86_64/crypto/ec/p256-x86_64-asm.S b/third_party/serf/mac-x86_64/crypto/ec/p256-x86_64-asm.S new file mode 100644 index 0000000000..bed1130561 --- /dev/null +++ b/third_party/serf/mac-x86_64/crypto/ec/p256-x86_64-asm.S @@ -0,0 +1,1779 @@ +#if defined(__x86_64__) +.text + + + +.p2align 6 +L$poly: +.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 + +L$One: +.long 1,1,1,1,1,1,1,1 +L$Two: +.long 2,2,2,2,2,2,2,2 +L$Three: +.long 3,3,3,3,3,3,3,3 +L$ONE_mont: +.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + + +.p2align 6 +ecp_nistz256_mul_by_2: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + addq %r8,%r8 + movq 16(%rsi),%r10 + adcq %r9,%r9 + movq 24(%rsi),%r11 + leaq L$poly(%rip),%rsi + movq %r8,%rax + adcq %r10,%r10 + adcq %r11,%r11 + movq %r9,%rdx + sbbq %r13,%r13 + + subq 0(%rsi),%r8 + movq %r10,%rcx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r12 + sbbq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 + + + + +.globl _ecp_nistz256_neg +.private_extern _ecp_nistz256_neg + +.p2align 5 +_ecp_nistz256_neg: + pushq %r12 + pushq %r13 + + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r13,%r13 + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r8,%rax + sbbq 24(%rsi),%r11 + leaq L$poly(%rip),%rsi + movq %r9,%rdx + sbbq $0,%r13 + + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 + + + + + + + +.globl _ecp_nistz256_mul_mont +.private_extern _ecp_nistz256_mul_mont + +.p2align 5 +_ecp_nistz256_mul_mont: +L$mul_mont: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx,%rbx + movq 0(%rdx),%rax + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + call __ecp_nistz256_mul_montq +L$mul_mont_done: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 + + + +.p2align 5 +__ecp_nistz256_mul_montq: + + + movq %rax,%rbp + mulq %r9 + movq L$poly+8(%rip),%r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r10 + movq L$poly+24(%rip),%r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r11 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + xorq %r13,%r13 + movq %rdx,%r12 + + + + + + + + + + + movq %r8,%rbp + shlq $32,%r8 + mulq %r15 + shrq $32,%rbp + addq %r8,%r9 + adcq %rbp,%r10 + adcq %rax,%r11 + movq 8(%rbx),%rax + adcq %rdx,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + + movq %r9,%rbp + shlq $32,%r9 + mulq %r15 + shrq $32,%rbp + addq %r9,%r10 + adcq %rbp,%r11 + adcq %rax,%r12 + movq 16(%rbx),%rax + adcq %rdx,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + + movq %r10,%rbp + shlq $32,%r10 + mulq %r15 + shrq $32,%rbp + addq %r10,%r11 + adcq %rbp,%r12 + adcq %rax,%r13 + movq 24(%rbx),%rax + adcq %rdx,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + + movq %r11,%rbp + shlq $32,%r11 + mulq %r15 + shrq $32,%rbp + addq %r11,%r12 + adcq %rbp,%r13 + movq %r12,%rcx + adcq %rax,%r8 + adcq %rdx,%r9 + movq %r13,%rbp + adcq $0,%r10 + + + + subq $-1,%r12 + movq %r8,%rbx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rdx + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rcx,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rbx,%r8 + movq %r13,8(%rdi) + cmovcq %rdx,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + + + + + + + + + +.globl _ecp_nistz256_sqr_mont +.private_extern _ecp_nistz256_sqr_mont + +.p2align 5 +_ecp_nistz256_sqr_mont: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq 0(%rsi),%rax + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + + call __ecp_nistz256_sqr_montq +L$sqr_mont_done: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 + + + +.p2align 5 +__ecp_nistz256_sqr_montq: + movq %rax,%r13 + mulq %r14 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + + mulq %r13 + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r13 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %r14 + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + + mulq %r15 + xorq %r15,%r15 + addq %rax,%r13 + movq 0(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + mulq %rax + movq %rax,%r8 + movq 8(%rsi),%rax + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r9 + adcq %rax,%r10 + movq 16(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r11 + adcq %rax,%r12 + movq 24(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r13 + adcq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r15 + + movq L$poly+8(%rip),%rsi + movq L$poly+24(%rip),%rbp + + + + + movq %r8,%rcx + shlq $32,%r8 + mulq %rbp + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %rbp + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %rbp + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %rbp + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + adcq %rax,%r10 + adcq $0,%rdx + xorq %r11,%r11 + + + + addq %r8,%r12 + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %rdx,%r15 + movq %r13,%r9 + adcq $0,%r11 + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%rcx + sbbq %rbp,%r15 + sbbq $0,%r11 + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %rcx,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + .byte 0xf3,0xc3 + + + + + + + +.globl _ecp_nistz256_from_mont +.private_extern _ecp_nistz256_from_mont + +.p2align 5 +_ecp_nistz256_from_mont: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%rax + movq L$poly+24(%rip),%r13 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %rax,%r8 + movq L$poly+8(%rip),%r12 + + + + movq %rax,%rcx + shlq $32,%r8 + mulq %r13 + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %r13 + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %r13 + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %r13 + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + movq %r8,%rcx + adcq %rax,%r10 + movq %r9,%rsi + adcq $0,%rdx + + subq $-1,%r8 + movq %r10,%rax + sbbq %r12,%r9 + sbbq $0,%r10 + movq %rdx,%r11 + sbbq %r13,%rdx + sbbq %r13,%r13 + + cmovnzq %rcx,%r8 + cmovnzq %rsi,%r9 + movq %r8,0(%rdi) + cmovnzq %rax,%r10 + movq %r9,8(%rdi) + cmovzq %rdx,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 + + + +.globl _ecp_nistz256_select_w5 +.private_extern _ecp_nistz256_select_w5 + +.p2align 5 +_ecp_nistz256_select_w5: + movdqa L$One(%rip),%xmm0 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + + movdqa %xmm0,%xmm8 + pshufd $0,%xmm1,%xmm1 + + movq $16,%rax +L$select_loop_sse_w5: + + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + pcmpeqd %xmm1,%xmm15 + + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + movdqa 64(%rsi),%xmm13 + movdqa 80(%rsi),%xmm14 + leaq 96(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + pand %xmm15,%xmm13 + por %xmm12,%xmm5 + pand %xmm15,%xmm14 + por %xmm13,%xmm6 + por %xmm14,%xmm7 + + decq %rax + jnz L$select_loop_sse_w5 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + movdqu %xmm6,64(%rdi) + movdqu %xmm7,80(%rdi) + .byte 0xf3,0xc3 + + + + +.globl _ecp_nistz256_select_w7 +.private_extern _ecp_nistz256_select_w7 + +.p2align 5 +_ecp_nistz256_select_w7: + movdqa L$One(%rip),%xmm8 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + movdqa %xmm8,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq $64,%rax + +L$select_loop_sse_w7: + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + pcmpeqd %xmm1,%xmm15 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + leaq 64(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + prefetcht0 255(%rsi) + por %xmm12,%xmm5 + + decq %rax + jnz L$select_loop_sse_w7 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + .byte 0xf3,0xc3 + +.globl _ecp_nistz256_avx2_select_w7 +.private_extern _ecp_nistz256_avx2_select_w7 + +.p2align 5 +_ecp_nistz256_avx2_select_w7: +.byte 0x0f,0x0b + .byte 0xf3,0xc3 + + +.p2align 5 +__ecp_nistz256_add_toq: + addq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + + + +.p2align 5 +__ecp_nistz256_sub_fromq: + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + addq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + + + +.p2align 5 +__ecp_nistz256_subq: + subq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq %r11,%r11 + + addq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + testq %r11,%r11 + + cmovnzq %rax,%r12 + cmovnzq %rbp,%r13 + cmovnzq %rcx,%r8 + cmovnzq %r10,%r9 + + .byte 0xf3,0xc3 + + + +.p2align 5 +__ecp_nistz256_mul_by_2q: + addq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + +.globl _ecp_nistz256_point_double +.private_extern _ecp_nistz256_point_double + +.p2align 5 +_ecp_nistz256_point_double: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $160+8,%rsp + + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq L$poly+8(%rip),%r14 + movq L$poly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-0(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 32(%rbx),%rax + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-0(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rax + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 0+32(%rsp),%rax + movq 8+32(%rsp),%r14 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subq + + movq 32(%rsp),%rax + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-0(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + addq $160+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 + +.globl _ecp_nistz256_point_add +.private_extern _ecp_nistz256_point_add + +.p2align 5 +_ecp_nistz256_point_add: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $576+8,%rsp + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + por %xmm0,%xmm1 + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + por %xmm2,%xmm3 + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm1,%xmm3 + + movdqu 0(%rsi),%xmm0 + pshufd $177,%xmm3,%xmm5 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $30,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + movq %rax,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $177,%xmm3,%xmm4 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $30,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rax + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 + + leaq 64-0(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 416(%rsp),%rax + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 512(%rsp),%rax + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq 0+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 480(%rsp),%rax + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 0x3e + jnz L$add_proceedq +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + testq %r8,%r8 + jnz L$add_proceedq + testq %r9,%r9 + jz L$add_proceedq + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp L$add_doneq + +.p2align 5 +L$add_proceedq: + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq 0+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%rax + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 160(%rsp),%rax + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + movq 0(%rsi),%rax + cmovzq %rbp,%r13 + movq 8(%rsi),%rbp + cmovzq %rcx,%r8 + movq 16(%rsi),%rcx + cmovzq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +L$add_doneq: + addq $576+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 + +.globl _ecp_nistz256_point_add_affine +.private_extern _ecp_nistz256_point_add_affine + +.p2align 5 +_ecp_nistz256_point_add_affine: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $480+8,%rsp + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + por %xmm0,%xmm1 + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + por %xmm2,%xmm3 + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm1,%xmm3 + + movdqu 0(%rbx),%xmm0 + pshufd $177,%xmm3,%xmm5 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $30,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $177,%xmm3,%xmm4 + movq 0(%rbx),%rax + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $30,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-0(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+96(%rsp),%rax + movq 8+96(%rsp),%r14 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq 0+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + movq 0(%rsi),%rax + cmovzq %rbp,%r13 + movq 8(%rsi),%rbp + cmovzq %rcx,%r8 + movq 16(%rsi),%rcx + cmovzq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rax + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq 0+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand L$ONE_mont(%rip),%xmm2 + pand L$ONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + addq $480+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 + +#endif diff --git a/third_party/serf/mac-x86_64/crypto/rand/rdrand-x86_64.S b/third_party/serf/mac-x86_64/crypto/rand/rdrand-x86_64.S index 1ba990fbfb..f0df296e1a 100644 --- a/third_party/serf/mac-x86_64/crypto/rand/rdrand-x86_64.S +++ b/third_party/serf/mac-x86_64/crypto/rand/rdrand-x86_64.S @@ -1,11 +1,48 @@ #if defined(__x86_64__) .text + + + .globl _CRYPTO_rdrand .private_extern _CRYPTO_rdrand .p2align 4 _CRYPTO_rdrand: -.byte 0x48, 0x0f, 0xc7, 0xf0 + xorq %rax,%rax + + +.byte 0x48, 0x0f, 0xc7, 0xf1 + + adcq %rax,%rax + movq %rcx,0(%rdi) + .byte 0xf3,0xc3 + + + + + +.globl _CRYPTO_rdrand_multiple8_buf +.private_extern _CRYPTO_rdrand_multiple8_buf + +.p2align 4 +_CRYPTO_rdrand_multiple8_buf: + testq %rsi,%rsi + jz L$out + movq $8,%rdx +L$loop: + + +.byte 0x48, 0x0f, 0xc7, 0xf1 + jnc L$err + movq %rcx,0(%rdi) + addq %rdx,%rdi + subq %rdx,%rsi + jnz L$loop +L$out: + movq $1,%rax + .byte 0xf3,0xc3 +L$err: + xorq %rax,%rax .byte 0xf3,0xc3 #endif diff --git a/third_party/serf/mac-x86_64/crypto/sha/sha1-x86_64.S b/third_party/serf/mac-x86_64/crypto/sha/sha1-x86_64.S index 044dc5b7cd..0509d45163 100644 --- a/third_party/serf/mac-x86_64/crypto/sha/sha1-x86_64.S +++ b/third_party/serf/mac-x86_64/crypto/sha/sha1-x86_64.S @@ -12,6 +12,11 @@ _sha1_block_data_order: movl _OPENSSL_ia32cap_P+8(%rip),%r10d testl $512,%r8d jz L$ialu + andl $268435456,%r8d + andl $1073741824,%r9d + orl %r9d,%r8d + cmpl $1342177280,%r8d + je _avx_shortcut jmp _ssse3_shortcut .p2align 4 @@ -2407,6 +2412,1122 @@ L$done_ssse3: L$epilogue_ssse3: .byte 0xf3,0xc3 + +.p2align 4 +sha1_block_data_order_avx: +_avx_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + leaq -64(%rsp),%rsp + vzeroupper + movq %rax,%r14 + andq $-64,%rsp + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + shlq $6,%r10 + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r11 + + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl %ebx,%esi + movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + vmovdqa 64(%r11),%xmm6 + vmovdqa -64(%r11),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm11,%xmm0,%xmm4 + vpaddd %xmm11,%xmm1,%xmm5 + vpaddd %xmm11,%xmm2,%xmm6 + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + jmp L$oop_avx +.p2align 4 +L$oop_avx: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%edi + addl 0(%rsp),%ebp + vpaddd %xmm3,%xmm11,%xmm9 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm8 + addl %esi,%ebp + andl %ebx,%edi + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%ebp + vpxor %xmm2,%xmm8,%xmm8 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + vpxor %xmm8,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vmovdqa %xmm9,48(%rsp) + addl %edi,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm8 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm10 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%edi + addl 8(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm4,%xmm4 + addl %esi,%ecx + andl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm4,%xmm4 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + vpxor %xmm10,%xmm4,%xmm4 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %edi,%ebx + andl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%edi + addl 16(%rsp),%eax + vpaddd %xmm4,%xmm11,%xmm9 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm8 + addl %esi,%eax + andl %ecx,%edi + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm8,%xmm8 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + vpxor %xmm8,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vmovdqa %xmm9,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm10 + vpaddd %xmm5,%xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm5,%xmm5 + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + vpxor %xmm10,%xmm5,%xmm5 + xorl %eax,%ebp + shldl $5,%edx,%edx + vmovdqa -32(%r11),%xmm11 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%edi + addl 32(%rsp),%ebx + vpaddd %xmm5,%xmm11,%xmm9 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm8 + addl %esi,%ebx + andl %edx,%edi + vpxor %xmm2,%xmm6,%xmm6 + xorl %ebp,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm8,%xmm8 + shrdl $7,%ecx,%ecx + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + vpxor %xmm8,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vmovdqa %xmm9,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm8 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm10 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%edi + addl 40(%rsp),%ebp + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm6,%xmm6 + addl %esi,%ebp + andl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + vpxor %xmm10,%xmm6,%xmm6 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + addl %edi,%edx + andl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%edi + addl 48(%rsp),%ecx + vpaddd %xmm6,%xmm11,%xmm9 + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm8 + addl %esi,%ecx + andl %ebp,%edi + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%ebp + addl %edx,%ecx + vpxor %xmm5,%xmm8,%xmm8 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + vpxor %xmm8,%xmm7,%xmm7 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vmovdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm8 + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpslldq $12,%xmm7,%xmm10 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + vpxor %xmm10,%xmm7,%xmm7 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %edi,%ebp + andl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + vpxor %xmm1,%xmm0,%xmm0 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpaddd %xmm7,%xmm11,%xmm9 + addl %esi,%edx + andl %eax,%edi + vpxor %xmm8,%xmm0,%xmm0 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + movl %edx,%esi + addl 4(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + vpor %xmm8,%xmm0,%xmm0 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm0,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm1,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm1,%xmm1 + addl 28(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + addl %esi,%eax + xorl %edx,%edi + vpaddd %xmm1,%xmm11,%xmm9 + vmovdqa 0(%r11),%xmm11 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm8,%xmm2,%xmm2 + addl 36(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpslld $2,%xmm2,%xmm2 + addl 40(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpor %xmm8,%xmm2,%xmm2 + addl 44(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpalignr $8,%xmm2,%xmm3,%xmm8 + vpxor %xmm0,%xmm4,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + addl %esi,%ecx + xorl %eax,%edi + vpaddd %xmm3,%xmm11,%xmm9 + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpxor %xmm8,%xmm4,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm8 + vmovdqa %xmm9,48(%rsp) + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm8,%xmm4,%xmm4 + addl 12(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm3,%xmm4,%xmm8 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpxor %xmm6,%xmm5,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + vpaddd %xmm4,%xmm11,%xmm9 + shrdl $7,%eax,%eax + addl %ebp,%edx + vpxor %xmm8,%xmm5,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm8,%xmm5,%xmm5 + addl 28(%rsp),%eax + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm8 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + movl %eax,%edi + xorl %ecx,%esi + vpaddd %xmm5,%xmm11,%xmm9 + shldl $5,%eax,%eax + addl %esi,%ebp + vpxor %xmm8,%xmm6,%xmm6 + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 36(%rsp),%edx + vpsrld $30,%xmm6,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + addl 40(%rsp),%ecx + andl %eax,%esi + vpor %xmm8,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%edi + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm8 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + movl %ebx,%edi + xorl %edx,%esi + vpaddd %xmm6,%xmm11,%xmm9 + vmovdqa 32(%r11),%xmm11 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm8,%xmm7,%xmm7 + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%rsp),%ebp + vpsrld $30,%xmm7,%xmm8 + vmovdqa %xmm9,32(%rsp) + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + addl 56(%rsp),%edx + andl %ebx,%esi + vpor %xmm8,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%edi + xorl %ebx,%esi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + movl %ecx,%edi + xorl %ebp,%esi + vpaddd %xmm7,%xmm11,%xmm9 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm8,%xmm0,%xmm0 + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 4(%rsp),%eax + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%rsp),%ebp + andl %ecx,%esi + vpor %xmm8,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%edi + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + vpxor %xmm2,%xmm1,%xmm1 + movl %edx,%edi + xorl %eax,%esi + vpaddd %xmm0,%xmm11,%xmm9 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 20(%rsp),%ebx + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + addl 24(%rsp),%eax + andl %edx,%esi + vpor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%edi + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + movl %ebp,%edi + xorl %ebx,%esi + vpaddd %xmm1,%xmm11,%xmm9 + shldl $5,%ebp,%ebp + addl %esi,%edx + vpxor %xmm8,%xmm2,%xmm2 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 36(%rsp),%ecx + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + addl 40(%rsp),%ebx + andl %ebp,%esi + vpor %xmm8,%xmm2,%xmm2 + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%edi + xorl %ebp,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + vpaddd %xmm3,%xmm11,%xmm9 + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm9,48(%rsp) + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + cmpq %r10,%r9 + je L$done_avx + vmovdqa 64(%r11),%xmm6 + vmovdqa -64(%r11),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + addl 16(%rsp),%ebx + xorl %ebp,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpaddd %xmm11,%xmm0,%xmm4 + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,0(%rsp) + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%edi + shldl $5,%edx,%edx + vpaddd %xmm11,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vmovdqa %xmm5,16(%rsp) + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpaddd %xmm11,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vmovdqa %xmm6,32(%rsp) + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + addl 12(%r8),%edx + movl %eax,0(%r8) + addl 16(%r8),%ebp + movl %esi,4(%r8) + movl %esi,%ebx + movl %ecx,8(%r8) + movl %ecx,%edi + movl %edx,12(%r8) + xorl %edx,%edi + movl %ebp,16(%r8) + andl %edi,%esi + jmp L$oop_avx + +.p2align 4 +L$done_avx: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroupper + + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + movl %eax,0(%r8) + addl 12(%r8),%edx + movl %esi,4(%r8) + addl 16(%r8),%ebp + movl %ecx,8(%r8) + movl %edx,12(%r8) + movl %ebp,16(%r8) + leaq (%r14),%rsi + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +L$epilogue_avx: + .byte 0xf3,0xc3 + .p2align 6 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 diff --git a/third_party/serf/mac-x86_64/crypto/sha/sha256-x86_64.S b/third_party/serf/mac-x86_64/crypto/sha/sha256-x86_64.S index da02d4c2dc..0146ff5cfc 100644 --- a/third_party/serf/mac-x86_64/crypto/sha/sha256-x86_64.S +++ b/third_party/serf/mac-x86_64/crypto/sha/sha256-x86_64.S @@ -11,6 +11,11 @@ _sha256_block_data_order: movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je L$avx_shortcut testl $512,%r10d jnz L$ssse3_shortcut pushq %rbx @@ -2840,4 +2845,1061 @@ L$ssse3_00_47: L$epilogue_ssse3: .byte 0xf3,0xc3 + +.p2align 6 +sha256_block_data_order_avx: +L$avx_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %r11,64+24(%rsp) +L$prologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp L$loop_avx +.p2align 4 +L$loop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp L$avx_00_47 + +.p2align 4 +L$avx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne L$avx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop_avx + + movq 64+24(%rsp),%rsi + vzeroupper + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +L$epilogue_avx: + .byte 0xf3,0xc3 + #endif diff --git a/third_party/serf/mac-x86_64/crypto/sha/sha512-x86_64.S b/third_party/serf/mac-x86_64/crypto/sha/sha512-x86_64.S index 2f5d912425..aeabd3f43a 100644 --- a/third_party/serf/mac-x86_64/crypto/sha/sha512-x86_64.S +++ b/third_party/serf/mac-x86_64/crypto/sha/sha512-x86_64.S @@ -7,6 +7,17 @@ .p2align 4 _sha512_block_data_order: + leaq _OPENSSL_ia32cap_P(%rip),%r11 + movl 0(%r11),%r9d + movl 4(%r11),%r10d + movl 8(%r11),%r11d + testl $2048,%r10d + jnz L$xop_shortcut + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je L$avx_shortcut pushq %rbx pushq %rbp pushq %r12 @@ -1783,4 +1794,2234 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + +.p2align 6 +sha512_block_data_order_xop: +L$xop_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %r11,128+24(%rsp) +L$prologue_xop: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp L$loop_xop +.p2align 4 +L$loop_xop: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp L$xop_00_47 + +.p2align 4 +L$xop_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm0,%xmm0 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,223,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm7,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm0,%xmm0 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm1,%xmm1 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,216,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm0,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm1,%xmm1 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm2,%xmm2 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,217,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm1,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm2,%xmm2 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm3,%xmm3 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,218,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm2,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm3,%xmm3 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm4,%xmm4 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,219,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm3,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm4,%xmm4 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm5,%xmm5 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,220,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm4,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm5,%xmm5 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm6,%xmm6 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,221,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm5,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm6,%xmm6 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm7,%xmm7 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,222,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm6,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm7,%xmm7 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne L$xop_00_47 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb L$loop_xop + + movq 128+24(%rsp),%rsi + vzeroupper + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +L$epilogue_xop: + .byte 0xf3,0xc3 + + +.p2align 6 +sha512_block_data_order_avx: +L$avx_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %r11,128+24(%rsp) +L$prologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp L$loop_avx +.p2align 4 +L$loop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp L$avx_00_47 + +.p2align 4 +L$avx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne L$avx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb L$loop_avx + + movq 128+24(%rsp),%rsi + vzeroupper + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +L$epilogue_avx: + .byte 0xf3,0xc3 + #endif diff --git a/third_party/serf/openssl.gyp b/third_party/serf/openssl.gyp index 1523c50f4a..928edbcad0 100644 --- a/third_party/serf/openssl.gyp +++ b/third_party/serf/openssl.gyp @@ -58,12 +58,15 @@ ], }, 'sources': [ - '<@(boringssl_lib_sources)', + '<@(boringssl_crypto_sources)', + '<@(boringssl_ssl_sources)', ], 'cflags': [ '-D_POSIX_C_SOURCE=200112L', ], - 'defines': [ 'BORINGSSL_IMPLEMENTATION' ], + 'defines': [ 'BORINGSSL_IMPLEMENTATION', + 'BORINGSSL_NO_STATIC_INITIALIZER', + ], 'conditions': [ ['component == "shared_library"', { 'defines': [ @@ -123,6 +126,9 @@ 'include_dirs': [ '<(DEPTH)/third_party/boringssl/src/include', ], + 'defines' : [ + 'OPENSSL_SMALL', + ], 'conditions': [ ['component == "shared_library"', { 'defines': [ @@ -135,4 +141,3 @@ ], } - diff --git a/third_party/serf/openssl.gypi b/third_party/serf/openssl.gypi index c943efa542..d5b72f5e50 100644 --- a/third_party/serf/openssl.gypi +++ b/third_party/serf/openssl.gypi @@ -6,7 +6,39 @@ { 'variables': { - 'boringssl_lib_sources': [ + 'boringssl_ssl_sources': [ + '<(openssl_root)/ssl/custom_extensions.c', + '<(openssl_root)/ssl/d1_both.c', + '<(openssl_root)/ssl/d1_clnt.c', + '<(openssl_root)/ssl/d1_lib.c', + '<(openssl_root)/ssl/d1_meth.c', + '<(openssl_root)/ssl/d1_pkt.c', + '<(openssl_root)/ssl/d1_srtp.c', + '<(openssl_root)/ssl/d1_srvr.c', + '<(openssl_root)/ssl/dtls_record.c', + '<(openssl_root)/ssl/pqueue/pqueue.c', + '<(openssl_root)/ssl/s3_both.c', + '<(openssl_root)/ssl/s3_clnt.c', + '<(openssl_root)/ssl/s3_enc.c', + '<(openssl_root)/ssl/s3_lib.c', + '<(openssl_root)/ssl/s3_meth.c', + '<(openssl_root)/ssl/s3_pkt.c', + '<(openssl_root)/ssl/s3_srvr.c', + '<(openssl_root)/ssl/ssl_aead_ctx.c', + '<(openssl_root)/ssl/ssl_asn1.c', + '<(openssl_root)/ssl/ssl_buffer.c', + '<(openssl_root)/ssl/ssl_cert.c', + '<(openssl_root)/ssl/ssl_cipher.c', + '<(openssl_root)/ssl/ssl_file.c', + '<(openssl_root)/ssl/ssl_lib.c', + '<(openssl_root)/ssl/ssl_rsa.c', + '<(openssl_root)/ssl/ssl_session.c', + '<(openssl_root)/ssl/ssl_stat.c', + '<(openssl_root)/ssl/t1_enc.c', + '<(openssl_root)/ssl/t1_lib.c', + '<(openssl_root)/ssl/tls_record.c', + ], + 'boringssl_crypto_sources': [ 'err_data.c', '<(openssl_root)/crypto/aes/aes.c', '<(openssl_root)/crypto/aes/mode_wrappers.c', @@ -62,6 +94,7 @@ '<(openssl_root)/crypto/bn/add.c', '<(openssl_root)/crypto/bn/asm/x86_64-gcc.c', '<(openssl_root)/crypto/bn/bn.c', + '<(openssl_root)/crypto/bn/bn_asn1.c', '<(openssl_root)/crypto/bn/cmp.c', '<(openssl_root)/crypto/bn/convert.c', '<(openssl_root)/crypto/bn/ctx.c', @@ -100,11 +133,11 @@ '<(openssl_root)/crypto/cpu-arm.c', '<(openssl_root)/crypto/cpu-intel.c', '<(openssl_root)/crypto/crypto.c', + '<(openssl_root)/crypto/curve25519/curve25519.c', '<(openssl_root)/crypto/des/des.c', '<(openssl_root)/crypto/dh/check.c', '<(openssl_root)/crypto/dh/dh.c', '<(openssl_root)/crypto/dh/dh_asn1.c', - '<(openssl_root)/crypto/dh/dh_impl.c', '<(openssl_root)/crypto/dh/params.c', '<(openssl_root)/crypto/digest/digest.c', '<(openssl_root)/crypto/digest/digests.c', @@ -112,13 +145,14 @@ '<(openssl_root)/crypto/directory_win.c', '<(openssl_root)/crypto/dsa/dsa.c', '<(openssl_root)/crypto/dsa/dsa_asn1.c', - '<(openssl_root)/crypto/dsa/dsa_impl.c', '<(openssl_root)/crypto/ec/ec.c', '<(openssl_root)/crypto/ec/ec_asn1.c', '<(openssl_root)/crypto/ec/ec_key.c', '<(openssl_root)/crypto/ec/ec_montgomery.c', '<(openssl_root)/crypto/ec/oct.c', + '<(openssl_root)/crypto/ec/p224-64.c', '<(openssl_root)/crypto/ec/p256-64.c', + '<(openssl_root)/crypto/ec/p256-x86_64.c', '<(openssl_root)/crypto/ec/simple.c', '<(openssl_root)/crypto/ec/util-64.c', '<(openssl_root)/crypto/ec/wnaf.c', @@ -128,15 +162,13 @@ '<(openssl_root)/crypto/engine/engine.c', '<(openssl_root)/crypto/err/err.c', '<(openssl_root)/crypto/evp/algorithm.c', - '<(openssl_root)/crypto/evp/asn1.c', '<(openssl_root)/crypto/evp/digestsign.c', '<(openssl_root)/crypto/evp/evp.c', + '<(openssl_root)/crypto/evp/evp_asn1.c', '<(openssl_root)/crypto/evp/evp_ctx.c', '<(openssl_root)/crypto/evp/p_dsa_asn1.c', '<(openssl_root)/crypto/evp/p_ec.c', '<(openssl_root)/crypto/evp/p_ec_asn1.c', - '<(openssl_root)/crypto/evp/p_hmac.c', - '<(openssl_root)/crypto/evp/p_hmac_asn1.c', '<(openssl_root)/crypto/evp/p_rsa.c', '<(openssl_root)/crypto/evp/p_rsa_asn1.c', '<(openssl_root)/crypto/evp/pbkdf.c', @@ -170,7 +202,6 @@ '<(openssl_root)/crypto/poly1305/poly1305.c', '<(openssl_root)/crypto/poly1305/poly1305_arm.c', '<(openssl_root)/crypto/poly1305/poly1305_vec.c', - '<(openssl_root)/crypto/rand/hwrand.c', '<(openssl_root)/crypto/rand/rand.c', '<(openssl_root)/crypto/rand/urandom.c', '<(openssl_root)/crypto/rand/windows.c', @@ -201,6 +232,7 @@ '<(openssl_root)/crypto/x509/i2d_pr.c', '<(openssl_root)/crypto/x509/pkcs7.c', '<(openssl_root)/crypto/x509/t_crl.c', + '<(openssl_root)/crypto/x509/t_req.c', '<(openssl_root)/crypto/x509/t_x509.c', '<(openssl_root)/crypto/x509/t_x509a.c', '<(openssl_root)/crypto/x509/x509.c', @@ -271,54 +303,28 @@ '<(openssl_root)/crypto/x509v3/v3_skey.c', '<(openssl_root)/crypto/x509v3/v3_sxnet.c', '<(openssl_root)/crypto/x509v3/v3_utl.c', - '<(openssl_root)/ssl/d1_both.c', - '<(openssl_root)/ssl/d1_clnt.c', - '<(openssl_root)/ssl/d1_lib.c', - '<(openssl_root)/ssl/d1_meth.c', - '<(openssl_root)/ssl/d1_pkt.c', - '<(openssl_root)/ssl/d1_srtp.c', - '<(openssl_root)/ssl/d1_srvr.c', - '<(openssl_root)/ssl/pqueue/pqueue.c', - '<(openssl_root)/ssl/s3_both.c', - '<(openssl_root)/ssl/s3_clnt.c', - '<(openssl_root)/ssl/s3_enc.c', - '<(openssl_root)/ssl/s3_lib.c', - '<(openssl_root)/ssl/s3_meth.c', - '<(openssl_root)/ssl/s3_pkt.c', - '<(openssl_root)/ssl/s3_srvr.c', - '<(openssl_root)/ssl/ssl_aead_ctx.c', - '<(openssl_root)/ssl/ssl_algs.c', - '<(openssl_root)/ssl/ssl_asn1.c', - '<(openssl_root)/ssl/ssl_cert.c', - '<(openssl_root)/ssl/ssl_cipher.c', - '<(openssl_root)/ssl/ssl_lib.c', - '<(openssl_root)/ssl/ssl_rsa.c', - '<(openssl_root)/ssl/ssl_sess.c', - '<(openssl_root)/ssl/ssl_stat.c', - '<(openssl_root)/ssl/ssl_txt.c', - '<(openssl_root)/ssl/t1_enc.c', - '<(openssl_root)/ssl/t1_lib.c', - '<(openssl_root)/ssl/t1_reneg.c', ], 'boringssl_linux_aarch64_sources': [ - 'linux-aarch64/crypto/aes/aesv8-armx.S', - 'linux-aarch64/crypto/modes/ghashv8-armx.S', + 'linux-aarch64/crypto/aes/aesv8-armx64.S', + 'linux-aarch64/crypto/bn/armv8-mont.S', + 'linux-aarch64/crypto/modes/ghashv8-armx64.S', 'linux-aarch64/crypto/sha/sha1-armv8.S', 'linux-aarch64/crypto/sha/sha256-armv8.S', 'linux-aarch64/crypto/sha/sha512-armv8.S', ], 'boringssl_linux_arm_sources': [ 'linux-arm/crypto/aes/aes-armv4.S', - 'linux-arm/crypto/aes/aesv8-armx.S', + 'linux-arm/crypto/aes/aesv8-armx32.S', 'linux-arm/crypto/aes/bsaes-armv7.S', 'linux-arm/crypto/bn/armv4-mont.S', 'linux-arm/crypto/modes/ghash-armv4.S', - 'linux-arm/crypto/modes/ghashv8-armx.S', + 'linux-arm/crypto/modes/ghashv8-armx32.S', 'linux-arm/crypto/sha/sha1-armv4-large.S', 'linux-arm/crypto/sha/sha256-armv4.S', 'linux-arm/crypto/sha/sha512-armv4.S', '<(openssl_root)/crypto/chacha/chacha_vec_arm.S', '<(openssl_root)/crypto/cpu-arm-asm.S', + '<(openssl_root)/crypto/curve25519/asm/x25519-arm.S', '<(openssl_root)/crypto/poly1305/poly1305_arm_asm.S', ], 'boringssl_linux_x86_sources': [ @@ -328,7 +334,6 @@ 'linux-x86/crypto/bn/bn-586.S', 'linux-x86/crypto/bn/co-586.S', 'linux-x86/crypto/bn/x86-mont.S', - 'linux-x86/crypto/cpu-x86-asm.S', 'linux-x86/crypto/md5/md5-586.S', 'linux-x86/crypto/modes/ghash-x86.S', 'linux-x86/crypto/rc4/rc4-586.S', @@ -345,7 +350,7 @@ 'linux-x86_64/crypto/bn/rsaz-x86_64.S', 'linux-x86_64/crypto/bn/x86_64-mont.S', 'linux-x86_64/crypto/bn/x86_64-mont5.S', - 'linux-x86_64/crypto/cpu-x86_64-asm.S', + 'linux-x86_64/crypto/ec/p256-x86_64-asm.S', 'linux-x86_64/crypto/md5/md5-x86_64.S', 'linux-x86_64/crypto/modes/aesni-gcm-x86_64.S', 'linux-x86_64/crypto/modes/ghash-x86_64.S', @@ -363,7 +368,6 @@ 'mac-x86/crypto/bn/bn-586.S', 'mac-x86/crypto/bn/co-586.S', 'mac-x86/crypto/bn/x86-mont.S', - 'mac-x86/crypto/cpu-x86-asm.S', 'mac-x86/crypto/md5/md5-586.S', 'mac-x86/crypto/modes/ghash-x86.S', 'mac-x86/crypto/rc4/rc4-586.S', @@ -380,7 +384,7 @@ 'mac-x86_64/crypto/bn/rsaz-x86_64.S', 'mac-x86_64/crypto/bn/x86_64-mont.S', 'mac-x86_64/crypto/bn/x86_64-mont5.S', - 'mac-x86_64/crypto/cpu-x86_64-asm.S', + 'mac-x86_64/crypto/ec/p256-x86_64-asm.S', 'mac-x86_64/crypto/md5/md5-x86_64.S', 'mac-x86_64/crypto/modes/aesni-gcm-x86_64.S', 'mac-x86_64/crypto/modes/ghash-x86_64.S', @@ -398,7 +402,6 @@ 'win-x86/crypto/bn/bn-586.asm', 'win-x86/crypto/bn/co-586.asm', 'win-x86/crypto/bn/x86-mont.asm', - 'win-x86/crypto/cpu-x86-asm.asm', 'win-x86/crypto/md5/md5-586.asm', 'win-x86/crypto/modes/ghash-x86.asm', 'win-x86/crypto/rc4/rc4-586.asm', @@ -415,7 +418,7 @@ 'win-x86_64/crypto/bn/rsaz-x86_64.asm', 'win-x86_64/crypto/bn/x86_64-mont.asm', 'win-x86_64/crypto/bn/x86_64-mont5.asm', - 'win-x86_64/crypto/cpu-x86_64-asm.asm', + 'win-x86_64/crypto/ec/p256-x86_64-asm.asm', 'win-x86_64/crypto/md5/md5-x86_64.asm', 'win-x86_64/crypto/modes/aesni-gcm-x86_64.asm', 'win-x86_64/crypto/modes/ghash-x86_64.asm', @@ -428,4 +431,3 @@ ], } } - diff --git a/third_party/serf/win-x86/crypto/cpu-x86-asm.asm b/third_party/serf/win-x86/crypto/cpu-x86-asm.asm deleted file mode 100644 index 4317a73bc8..0000000000 --- a/third_party/serf/win-x86/crypto/cpu-x86-asm.asm +++ /dev/null @@ -1,303 +0,0 @@ -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -%ifdef __YASM_VERSION_ID__ -%if __YASM_VERSION_ID__ < 01010000h -%error yasm version 1.1.0 or later needed. -%endif -; Yasm automatically includes .00 and complains about redefining it. -; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html -%else -$@feat.00 equ 1 -%endif -section .text code align=64 -%else -section .text code -%endif -global _OPENSSL_ia32_cpuid -align 16 -_OPENSSL_ia32_cpuid: -L$_OPENSSL_ia32_cpuid_begin: - push ebp - push ebx - push esi - push edi - xor edx,edx - pushfd - pop eax - mov ecx,eax - xor eax,2097152 - push eax - popfd - pushfd - pop eax - xor ecx,eax - xor eax,eax - bt ecx,21 - jnc NEAR L$000nocpuid - mov esi,DWORD [20+esp] - mov DWORD [8+esi],eax - cpuid - mov edi,eax - xor eax,eax - cmp ebx,1970169159 - setne al - mov ebp,eax - cmp edx,1231384169 - setne al - or ebp,eax - cmp ecx,1818588270 - setne al - or ebp,eax - jz NEAR L$001intel - cmp ebx,1752462657 - setne al - mov esi,eax - cmp edx,1769238117 - setne al - or esi,eax - cmp ecx,1145913699 - setne al - or esi,eax - jnz NEAR L$001intel - mov eax,2147483648 - cpuid - cmp eax,2147483649 - jb NEAR L$001intel - mov esi,eax - mov eax,2147483649 - cpuid - or ebp,ecx - and ebp,2049 - cmp esi,2147483656 - jb NEAR L$001intel - mov eax,2147483656 - cpuid - movzx esi,cl - inc esi - mov eax,1 - xor ecx,ecx - cpuid - bt edx,28 - jnc NEAR L$002generic - shr ebx,16 - and ebx,255 - cmp ebx,esi - ja NEAR L$002generic - and edx,4026531839 - jmp NEAR L$002generic -L$001intel: - cmp edi,7 - jb NEAR L$003cacheinfo - mov esi,DWORD [20+esp] - mov eax,7 - xor ecx,ecx - cpuid - mov DWORD [8+esi],ebx -L$003cacheinfo: - cmp edi,4 - mov edi,-1 - jb NEAR L$004nocacheinfo - mov eax,4 - mov ecx,0 - cpuid - mov edi,eax - shr edi,14 - and edi,4095 -L$004nocacheinfo: - mov eax,1 - xor ecx,ecx - cpuid - and edx,3220176895 - cmp ebp,0 - jne NEAR L$005notintel - or edx,1073741824 -L$005notintel: - bt edx,28 - jnc NEAR L$002generic - and edx,4026531839 - cmp edi,0 - je NEAR L$002generic - or edx,268435456 - shr ebx,16 - cmp bl,1 - ja NEAR L$002generic - and edx,4026531839 -L$002generic: - and ebp,2048 - and ecx,4294965247 - mov esi,edx - or ebp,ecx - bt ecx,27 - jnc NEAR L$006clear_avx - xor ecx,ecx -db 15,1,208 - and eax,6 - cmp eax,6 - je NEAR L$007done - cmp eax,2 - je NEAR L$006clear_avx -L$008clear_xmm: - and ebp,4261412861 - and esi,4278190079 -L$006clear_avx: - and ebp,4026525695 - mov edi,DWORD [20+esp] - and DWORD [8+edi],4294967263 -L$007done: - mov eax,esi - mov edx,ebp -L$000nocpuid: - pop edi - pop esi - pop ebx - pop ebp - ret -;extern _OPENSSL_ia32cap_P -global _OPENSSL_rdtsc -align 16 -_OPENSSL_rdtsc: -L$_OPENSSL_rdtsc_begin: - xor eax,eax - xor edx,edx - lea ecx,[_OPENSSL_ia32cap_P] - bt DWORD [ecx],4 - jnc NEAR L$009notsc - rdtsc -L$009notsc: - ret -global _OPENSSL_instrument_halt -align 16 -_OPENSSL_instrument_halt: -L$_OPENSSL_instrument_halt_begin: - lea ecx,[_OPENSSL_ia32cap_P] - bt DWORD [ecx],4 - jnc NEAR L$010nohalt -dd 2421723150 - and eax,3 - jnz NEAR L$010nohalt - pushfd - pop eax - bt eax,9 - jnc NEAR L$010nohalt - rdtsc - push edx - push eax - hlt - rdtsc - sub eax,DWORD [esp] - sbb edx,DWORD [4+esp] - add esp,8 - ret -L$010nohalt: - xor eax,eax - xor edx,edx - ret -global _OPENSSL_far_spin -align 16 -_OPENSSL_far_spin: -L$_OPENSSL_far_spin_begin: - pushfd - pop eax - bt eax,9 - jnc NEAR L$011nospin - mov eax,DWORD [4+esp] - mov ecx,DWORD [8+esp] -dd 2430111262 - xor eax,eax - mov edx,DWORD [ecx] - jmp NEAR L$012spin -align 16 -L$012spin: - inc eax - cmp edx,DWORD [ecx] - je NEAR L$012spin -dd 529567888 - ret -L$011nospin: - xor eax,eax - xor edx,edx - ret -global _OPENSSL_wipe_cpu -align 16 -_OPENSSL_wipe_cpu: -L$_OPENSSL_wipe_cpu_begin: - xor eax,eax - xor edx,edx - lea ecx,[_OPENSSL_ia32cap_P] - mov ecx,DWORD [ecx] - bt DWORD [ecx],1 - jnc NEAR L$013no_x87 - and ecx,83886080 - cmp ecx,83886080 - jne NEAR L$014no_sse2 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - pxor xmm6,xmm6 - pxor xmm7,xmm7 -L$014no_sse2: -dd 4007259865,4007259865,4007259865,4007259865,2430851995 -L$013no_x87: - lea eax,[4+esp] - ret -global _OPENSSL_atomic_add -align 16 -_OPENSSL_atomic_add: -L$_OPENSSL_atomic_add_begin: - mov edx,DWORD [4+esp] - mov ecx,DWORD [8+esp] - push ebx - nop - mov eax,DWORD [edx] -L$015spin: - lea ebx,[ecx*1+eax] - nop -dd 447811568 - jne NEAR L$015spin - mov eax,ebx - pop ebx - ret -global _OPENSSL_indirect_call -align 16 -_OPENSSL_indirect_call: -L$_OPENSSL_indirect_call_begin: - push ebp - mov ebp,esp - sub esp,28 - mov ecx,DWORD [12+ebp] - mov DWORD [esp],ecx - mov edx,DWORD [16+ebp] - mov DWORD [4+esp],edx - mov eax,DWORD [20+ebp] - mov DWORD [8+esp],eax - mov eax,DWORD [24+ebp] - mov DWORD [12+esp],eax - mov eax,DWORD [28+ebp] - mov DWORD [16+esp],eax - mov eax,DWORD [32+ebp] - mov DWORD [20+esp],eax - mov eax,DWORD [36+ebp] - mov DWORD [24+esp],eax - call DWORD [8+ebp] - mov esp,ebp - pop ebp - ret -global _OPENSSL_ia32_rdrand -align 16 -_OPENSSL_ia32_rdrand: -L$_OPENSSL_ia32_rdrand_begin: - mov ecx,8 -L$016loop: -db 15,199,240 - jc NEAR L$017break - loop L$016loop -L$017break: - cmp eax,0 - cmove eax,ecx - ret -segment .bss -common _OPENSSL_ia32cap_P 16 diff --git a/third_party/serf/win-x86/crypto/sha/sha1-586.asm b/third_party/serf/win-x86/crypto/sha/sha1-586.asm index e24449d200..cee8c62626 100644 --- a/third_party/serf/win-x86/crypto/sha/sha1-586.asm +++ b/third_party/serf/win-x86/crypto/sha/sha1-586.asm @@ -35,8 +35,11 @@ L$000pic_point: mov ecx,DWORD [8+esi] test eax,16777216 jz NEAR L$001x86 - test ecx,536870912 - jnz NEAR L$shaext_shortcut + and edx,268435456 + and eax,1073741824 + or eax,edx + cmp eax,1342177280 + je NEAR L$avx_shortcut jmp NEAR L$ssse3_shortcut align 16 L$001x86: @@ -1405,7 +1408,7 @@ L$002loop: pop ebp ret align 16 -__sha1_block_data_order_shaext: +__sha1_block_data_order_ssse3: push ebp push ebx push esi @@ -1414,174 +1417,6 @@ __sha1_block_data_order_shaext: L$003pic_point: pop ebp lea ebp,[(L$K_XX_XX-L$003pic_point)+ebp] -L$shaext_shortcut: - mov edi,DWORD [20+esp] - mov ebx,esp - mov esi,DWORD [24+esp] - mov ecx,DWORD [28+esp] - sub esp,32 - movdqu xmm0,[edi] - movd xmm1,DWORD [16+edi] - and esp,-32 - movdqa xmm3,[80+ebp] - movdqu xmm4,[esi] - pshufd xmm0,xmm0,27 - movdqu xmm5,[16+esi] - pshufd xmm1,xmm1,27 - movdqu xmm6,[32+esi] -db 102,15,56,0,227 - movdqu xmm7,[48+esi] -db 102,15,56,0,235 -db 102,15,56,0,243 -db 102,15,56,0,251 - jmp NEAR L$004loop_shaext -align 16 -L$004loop_shaext: - dec ecx - lea eax,[64+esi] - movdqa [esp],xmm1 - paddd xmm1,xmm4 - cmovne esi,eax - movdqa [16+esp],xmm0 -db 15,56,201,229 - movdqa xmm2,xmm0 -db 15,58,204,193,0 -db 15,56,200,213 - pxor xmm4,xmm6 -db 15,56,201,238 -db 15,56,202,231 - movdqa xmm1,xmm0 -db 15,58,204,194,0 -db 15,56,200,206 - pxor xmm5,xmm7 -db 15,56,202,236 -db 15,56,201,247 - movdqa xmm2,xmm0 -db 15,58,204,193,0 -db 15,56,200,215 - pxor xmm6,xmm4 -db 15,56,201,252 -db 15,56,202,245 - movdqa xmm1,xmm0 -db 15,58,204,194,0 -db 15,56,200,204 - pxor xmm7,xmm5 -db 15,56,202,254 -db 15,56,201,229 - movdqa xmm2,xmm0 -db 15,58,204,193,0 -db 15,56,200,213 - pxor xmm4,xmm6 -db 15,56,201,238 -db 15,56,202,231 - movdqa xmm1,xmm0 -db 15,58,204,194,1 -db 15,56,200,206 - pxor xmm5,xmm7 -db 15,56,202,236 -db 15,56,201,247 - movdqa xmm2,xmm0 -db 15,58,204,193,1 -db 15,56,200,215 - pxor xmm6,xmm4 -db 15,56,201,252 -db 15,56,202,245 - movdqa xmm1,xmm0 -db 15,58,204,194,1 -db 15,56,200,204 - pxor xmm7,xmm5 -db 15,56,202,254 -db 15,56,201,229 - movdqa xmm2,xmm0 -db 15,58,204,193,1 -db 15,56,200,213 - pxor xmm4,xmm6 -db 15,56,201,238 -db 15,56,202,231 - movdqa xmm1,xmm0 -db 15,58,204,194,1 -db 15,56,200,206 - pxor xmm5,xmm7 -db 15,56,202,236 -db 15,56,201,247 - movdqa xmm2,xmm0 -db 15,58,204,193,2 -db 15,56,200,215 - pxor xmm6,xmm4 -db 15,56,201,252 -db 15,56,202,245 - movdqa xmm1,xmm0 -db 15,58,204,194,2 -db 15,56,200,204 - pxor xmm7,xmm5 -db 15,56,202,254 -db 15,56,201,229 - movdqa xmm2,xmm0 -db 15,58,204,193,2 -db 15,56,200,213 - pxor xmm4,xmm6 -db 15,56,201,238 -db 15,56,202,231 - movdqa xmm1,xmm0 -db 15,58,204,194,2 -db 15,56,200,206 - pxor xmm5,xmm7 -db 15,56,202,236 -db 15,56,201,247 - movdqa xmm2,xmm0 -db 15,58,204,193,2 -db 15,56,200,215 - pxor xmm6,xmm4 -db 15,56,201,252 -db 15,56,202,245 - movdqa xmm1,xmm0 -db 15,58,204,194,3 -db 15,56,200,204 - pxor xmm7,xmm5 -db 15,56,202,254 - movdqu xmm4,[esi] - movdqa xmm2,xmm0 -db 15,58,204,193,3 -db 15,56,200,213 - movdqu xmm5,[16+esi] -db 102,15,56,0,227 - movdqa xmm1,xmm0 -db 15,58,204,194,3 -db 15,56,200,206 - movdqu xmm6,[32+esi] -db 102,15,56,0,235 - movdqa xmm2,xmm0 -db 15,58,204,193,3 -db 15,56,200,215 - movdqu xmm7,[48+esi] -db 102,15,56,0,243 - movdqa xmm1,xmm0 -db 15,58,204,194,3 - movdqa xmm2,[esp] -db 102,15,56,0,251 -db 15,56,200,202 - paddd xmm0,[16+esp] - jnz NEAR L$004loop_shaext - pshufd xmm0,xmm0,27 - pshufd xmm1,xmm1,27 - movdqu [edi],xmm0 - movd DWORD [16+edi],xmm1 - mov esp,ebx - pop edi - pop esi - pop ebx - pop ebp - ret -align 16 -__sha1_block_data_order_ssse3: - push ebp - push ebx - push esi - push edi - call L$005pic_point -L$005pic_point: - pop ebp - lea ebp,[(L$K_XX_XX-L$005pic_point)+ebp] L$ssse3_shortcut: movdqa xmm7,[ebp] movdqa xmm0,[16+ebp] @@ -1634,9 +1469,9 @@ db 102,15,56,0,222 xor ebp,edx pshufd xmm4,xmm0,238 and esi,ebp - jmp NEAR L$006loop + jmp NEAR L$004loop align 16 -L$006loop: +L$004loop: ror ebx,2 xor esi,edx mov ebp,eax @@ -2539,7 +2374,7 @@ L$006loop: add ecx,edx mov ebp,DWORD [196+esp] cmp ebp,DWORD [200+esp] - je NEAR L$007done + je NEAR L$005done movdqa xmm7,[160+esp] movdqa xmm6,[176+esp] movdqu xmm0,[ebp] @@ -2674,9 +2509,9 @@ db 102,15,56,0,222 pshufd xmm4,xmm0,238 and esi,ebx mov ebx,ebp - jmp NEAR L$006loop + jmp NEAR L$004loop align 16 -L$007done: +L$005done: add ebx,DWORD [16+esp] xor esi,edi mov ebp,ecx @@ -2789,6 +2624,1174 @@ L$007done: pop ebx pop ebp ret +align 16 +__sha1_block_data_order_avx: + push ebp + push ebx + push esi + push edi + call L$006pic_point +L$006pic_point: + pop ebp + lea ebp,[(L$K_XX_XX-L$006pic_point)+ebp] +L$avx_shortcut: + vzeroall + vmovdqa xmm7,[ebp] + vmovdqa xmm0,[16+ebp] + vmovdqa xmm1,[32+ebp] + vmovdqa xmm2,[48+ebp] + vmovdqa xmm6,[64+ebp] + mov edi,DWORD [20+esp] + mov ebp,DWORD [24+esp] + mov edx,DWORD [28+esp] + mov esi,esp + sub esp,208 + and esp,-64 + vmovdqa [112+esp],xmm0 + vmovdqa [128+esp],xmm1 + vmovdqa [144+esp],xmm2 + shl edx,6 + vmovdqa [160+esp],xmm7 + add edx,ebp + vmovdqa [176+esp],xmm6 + add ebp,64 + mov DWORD [192+esp],edi + mov DWORD [196+esp],ebp + mov DWORD [200+esp],edx + mov DWORD [204+esp],esi + mov eax,DWORD [edi] + mov ebx,DWORD [4+edi] + mov ecx,DWORD [8+edi] + mov edx,DWORD [12+edi] + mov edi,DWORD [16+edi] + mov esi,ebx + vmovdqu xmm0,[ebp-64] + vmovdqu xmm1,[ebp-48] + vmovdqu xmm2,[ebp-32] + vmovdqu xmm3,[ebp-16] + vpshufb xmm0,xmm0,xmm6 + vpshufb xmm1,xmm1,xmm6 + vpshufb xmm2,xmm2,xmm6 + vmovdqa [96+esp],xmm7 + vpshufb xmm3,xmm3,xmm6 + vpaddd xmm4,xmm0,xmm7 + vpaddd xmm5,xmm1,xmm7 + vpaddd xmm6,xmm2,xmm7 + vmovdqa [esp],xmm4 + mov ebp,ecx + vmovdqa [16+esp],xmm5 + xor ebp,edx + vmovdqa [32+esp],xmm6 + and esi,ebp + jmp NEAR L$007loop +align 16 +L$007loop: + shrd ebx,ebx,2 + xor esi,edx + vpalignr xmm4,xmm1,xmm0,8 + mov ebp,eax + add edi,DWORD [esp] + vpaddd xmm7,xmm7,xmm3 + vmovdqa [64+esp],xmm0 + xor ebx,ecx + shld eax,eax,5 + vpsrldq xmm6,xmm3,4 + add edi,esi + and ebp,ebx + vpxor xmm4,xmm4,xmm0 + xor ebx,ecx + add edi,eax + vpxor xmm6,xmm6,xmm2 + shrd eax,eax,7 + xor ebp,ecx + vmovdqa [48+esp],xmm7 + mov esi,edi + add edx,DWORD [4+esp] + vpxor xmm4,xmm4,xmm6 + xor eax,ebx + shld edi,edi,5 + add edx,ebp + and esi,eax + vpsrld xmm6,xmm4,31 + xor eax,ebx + add edx,edi + shrd edi,edi,7 + xor esi,ebx + vpslldq xmm0,xmm4,12 + vpaddd xmm4,xmm4,xmm4 + mov ebp,edx + add ecx,DWORD [8+esp] + xor edi,eax + shld edx,edx,5 + vpsrld xmm7,xmm0,30 + vpor xmm4,xmm4,xmm6 + add ecx,esi + and ebp,edi + xor edi,eax + add ecx,edx + vpslld xmm0,xmm0,2 + shrd edx,edx,7 + xor ebp,eax + vpxor xmm4,xmm4,xmm7 + mov esi,ecx + add ebx,DWORD [12+esp] + xor edx,edi + shld ecx,ecx,5 + vpxor xmm4,xmm4,xmm0 + add ebx,ebp + and esi,edx + vmovdqa xmm0,[96+esp] + xor edx,edi + add ebx,ecx + shrd ecx,ecx,7 + xor esi,edi + vpalignr xmm5,xmm2,xmm1,8 + mov ebp,ebx + add eax,DWORD [16+esp] + vpaddd xmm0,xmm0,xmm4 + vmovdqa [80+esp],xmm1 + xor ecx,edx + shld ebx,ebx,5 + vpsrldq xmm7,xmm4,4 + add eax,esi + and ebp,ecx + vpxor xmm5,xmm5,xmm1 + xor ecx,edx + add eax,ebx + vpxor xmm7,xmm7,xmm3 + shrd ebx,ebx,7 + xor ebp,edx + vmovdqa [esp],xmm0 + mov esi,eax + add edi,DWORD [20+esp] + vpxor xmm5,xmm5,xmm7 + xor ebx,ecx + shld eax,eax,5 + add edi,ebp + and esi,ebx + vpsrld xmm7,xmm5,31 + xor ebx,ecx + add edi,eax + shrd eax,eax,7 + xor esi,ecx + vpslldq xmm1,xmm5,12 + vpaddd xmm5,xmm5,xmm5 + mov ebp,edi + add edx,DWORD [24+esp] + xor eax,ebx + shld edi,edi,5 + vpsrld xmm0,xmm1,30 + vpor xmm5,xmm5,xmm7 + add edx,esi + and ebp,eax + xor eax,ebx + add edx,edi + vpslld xmm1,xmm1,2 + shrd edi,edi,7 + xor ebp,ebx + vpxor xmm5,xmm5,xmm0 + mov esi,edx + add ecx,DWORD [28+esp] + xor edi,eax + shld edx,edx,5 + vpxor xmm5,xmm5,xmm1 + add ecx,ebp + and esi,edi + vmovdqa xmm1,[112+esp] + xor edi,eax + add ecx,edx + shrd edx,edx,7 + xor esi,eax + vpalignr xmm6,xmm3,xmm2,8 + mov ebp,ecx + add ebx,DWORD [32+esp] + vpaddd xmm1,xmm1,xmm5 + vmovdqa [96+esp],xmm2 + xor edx,edi + shld ecx,ecx,5 + vpsrldq xmm0,xmm5,4 + add ebx,esi + and ebp,edx + vpxor xmm6,xmm6,xmm2 + xor edx,edi + add ebx,ecx + vpxor xmm0,xmm0,xmm4 + shrd ecx,ecx,7 + xor ebp,edi + vmovdqa [16+esp],xmm1 + mov esi,ebx + add eax,DWORD [36+esp] + vpxor xmm6,xmm6,xmm0 + xor ecx,edx + shld ebx,ebx,5 + add eax,ebp + and esi,ecx + vpsrld xmm0,xmm6,31 + xor ecx,edx + add eax,ebx + shrd ebx,ebx,7 + xor esi,edx + vpslldq xmm2,xmm6,12 + vpaddd xmm6,xmm6,xmm6 + mov ebp,eax + add edi,DWORD [40+esp] + xor ebx,ecx + shld eax,eax,5 + vpsrld xmm1,xmm2,30 + vpor xmm6,xmm6,xmm0 + add edi,esi + and ebp,ebx + xor ebx,ecx + add edi,eax + vpslld xmm2,xmm2,2 + vmovdqa xmm0,[64+esp] + shrd eax,eax,7 + xor ebp,ecx + vpxor xmm6,xmm6,xmm1 + mov esi,edi + add edx,DWORD [44+esp] + xor eax,ebx + shld edi,edi,5 + vpxor xmm6,xmm6,xmm2 + add edx,ebp + and esi,eax + vmovdqa xmm2,[112+esp] + xor eax,ebx + add edx,edi + shrd edi,edi,7 + xor esi,ebx + vpalignr xmm7,xmm4,xmm3,8 + mov ebp,edx + add ecx,DWORD [48+esp] + vpaddd xmm2,xmm2,xmm6 + vmovdqa [64+esp],xmm3 + xor edi,eax + shld edx,edx,5 + vpsrldq xmm1,xmm6,4 + add ecx,esi + and ebp,edi + vpxor xmm7,xmm7,xmm3 + xor edi,eax + add ecx,edx + vpxor xmm1,xmm1,xmm5 + shrd edx,edx,7 + xor ebp,eax + vmovdqa [32+esp],xmm2 + mov esi,ecx + add ebx,DWORD [52+esp] + vpxor xmm7,xmm7,xmm1 + xor edx,edi + shld ecx,ecx,5 + add ebx,ebp + and esi,edx + vpsrld xmm1,xmm7,31 + xor edx,edi + add ebx,ecx + shrd ecx,ecx,7 + xor esi,edi + vpslldq xmm3,xmm7,12 + vpaddd xmm7,xmm7,xmm7 + mov ebp,ebx + add eax,DWORD [56+esp] + xor ecx,edx + shld ebx,ebx,5 + vpsrld xmm2,xmm3,30 + vpor xmm7,xmm7,xmm1 + add eax,esi + and ebp,ecx + xor ecx,edx + add eax,ebx + vpslld xmm3,xmm3,2 + vmovdqa xmm1,[80+esp] + shrd ebx,ebx,7 + xor ebp,edx + vpxor xmm7,xmm7,xmm2 + mov esi,eax + add edi,DWORD [60+esp] + xor ebx,ecx + shld eax,eax,5 + vpxor xmm7,xmm7,xmm3 + add edi,ebp + and esi,ebx + vmovdqa xmm3,[112+esp] + xor ebx,ecx + add edi,eax + vpalignr xmm2,xmm7,xmm6,8 + vpxor xmm0,xmm0,xmm4 + shrd eax,eax,7 + xor esi,ecx + mov ebp,edi + add edx,DWORD [esp] + vpxor xmm0,xmm0,xmm1 + vmovdqa [80+esp],xmm4 + xor eax,ebx + shld edi,edi,5 + vmovdqa xmm4,xmm3 + vpaddd xmm3,xmm3,xmm7 + add edx,esi + and ebp,eax + vpxor xmm0,xmm0,xmm2 + xor eax,ebx + add edx,edi + shrd edi,edi,7 + xor ebp,ebx + vpsrld xmm2,xmm0,30 + vmovdqa [48+esp],xmm3 + mov esi,edx + add ecx,DWORD [4+esp] + xor edi,eax + shld edx,edx,5 + vpslld xmm0,xmm0,2 + add ecx,ebp + and esi,edi + xor edi,eax + add ecx,edx + shrd edx,edx,7 + xor esi,eax + mov ebp,ecx + add ebx,DWORD [8+esp] + vpor xmm0,xmm0,xmm2 + xor edx,edi + shld ecx,ecx,5 + vmovdqa xmm2,[96+esp] + add ebx,esi + and ebp,edx + xor edx,edi + add ebx,ecx + add eax,DWORD [12+esp] + xor ebp,edi + mov esi,ebx + shld ebx,ebx,5 + add eax,ebp + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + vpalignr xmm3,xmm0,xmm7,8 + vpxor xmm1,xmm1,xmm5 + add edi,DWORD [16+esp] + xor esi,ecx + mov ebp,eax + shld eax,eax,5 + vpxor xmm1,xmm1,xmm2 + vmovdqa [96+esp],xmm5 + add edi,esi + xor ebp,ecx + vmovdqa xmm5,xmm4 + vpaddd xmm4,xmm4,xmm0 + shrd ebx,ebx,7 + add edi,eax + vpxor xmm1,xmm1,xmm3 + add edx,DWORD [20+esp] + xor ebp,ebx + mov esi,edi + shld edi,edi,5 + vpsrld xmm3,xmm1,30 + vmovdqa [esp],xmm4 + add edx,ebp + xor esi,ebx + shrd eax,eax,7 + add edx,edi + vpslld xmm1,xmm1,2 + add ecx,DWORD [24+esp] + xor esi,eax + mov ebp,edx + shld edx,edx,5 + add ecx,esi + xor ebp,eax + shrd edi,edi,7 + add ecx,edx + vpor xmm1,xmm1,xmm3 + add ebx,DWORD [28+esp] + xor ebp,edi + vmovdqa xmm3,[64+esp] + mov esi,ecx + shld ecx,ecx,5 + add ebx,ebp + xor esi,edi + shrd edx,edx,7 + add ebx,ecx + vpalignr xmm4,xmm1,xmm0,8 + vpxor xmm2,xmm2,xmm6 + add eax,DWORD [32+esp] + xor esi,edx + mov ebp,ebx + shld ebx,ebx,5 + vpxor xmm2,xmm2,xmm3 + vmovdqa [64+esp],xmm6 + add eax,esi + xor ebp,edx + vmovdqa xmm6,[128+esp] + vpaddd xmm5,xmm5,xmm1 + shrd ecx,ecx,7 + add eax,ebx + vpxor xmm2,xmm2,xmm4 + add edi,DWORD [36+esp] + xor ebp,ecx + mov esi,eax + shld eax,eax,5 + vpsrld xmm4,xmm2,30 + vmovdqa [16+esp],xmm5 + add edi,ebp + xor esi,ecx + shrd ebx,ebx,7 + add edi,eax + vpslld xmm2,xmm2,2 + add edx,DWORD [40+esp] + xor esi,ebx + mov ebp,edi + shld edi,edi,5 + add edx,esi + xor ebp,ebx + shrd eax,eax,7 + add edx,edi + vpor xmm2,xmm2,xmm4 + add ecx,DWORD [44+esp] + xor ebp,eax + vmovdqa xmm4,[80+esp] + mov esi,edx + shld edx,edx,5 + add ecx,ebp + xor esi,eax + shrd edi,edi,7 + add ecx,edx + vpalignr xmm5,xmm2,xmm1,8 + vpxor xmm3,xmm3,xmm7 + add ebx,DWORD [48+esp] + xor esi,edi + mov ebp,ecx + shld ecx,ecx,5 + vpxor xmm3,xmm3,xmm4 + vmovdqa [80+esp],xmm7 + add ebx,esi + xor ebp,edi + vmovdqa xmm7,xmm6 + vpaddd xmm6,xmm6,xmm2 + shrd edx,edx,7 + add ebx,ecx + vpxor xmm3,xmm3,xmm5 + add eax,DWORD [52+esp] + xor ebp,edx + mov esi,ebx + shld ebx,ebx,5 + vpsrld xmm5,xmm3,30 + vmovdqa [32+esp],xmm6 + add eax,ebp + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + vpslld xmm3,xmm3,2 + add edi,DWORD [56+esp] + xor esi,ecx + mov ebp,eax + shld eax,eax,5 + add edi,esi + xor ebp,ecx + shrd ebx,ebx,7 + add edi,eax + vpor xmm3,xmm3,xmm5 + add edx,DWORD [60+esp] + xor ebp,ebx + vmovdqa xmm5,[96+esp] + mov esi,edi + shld edi,edi,5 + add edx,ebp + xor esi,ebx + shrd eax,eax,7 + add edx,edi + vpalignr xmm6,xmm3,xmm2,8 + vpxor xmm4,xmm4,xmm0 + add ecx,DWORD [esp] + xor esi,eax + mov ebp,edx + shld edx,edx,5 + vpxor xmm4,xmm4,xmm5 + vmovdqa [96+esp],xmm0 + add ecx,esi + xor ebp,eax + vmovdqa xmm0,xmm7 + vpaddd xmm7,xmm7,xmm3 + shrd edi,edi,7 + add ecx,edx + vpxor xmm4,xmm4,xmm6 + add ebx,DWORD [4+esp] + xor ebp,edi + mov esi,ecx + shld ecx,ecx,5 + vpsrld xmm6,xmm4,30 + vmovdqa [48+esp],xmm7 + add ebx,ebp + xor esi,edi + shrd edx,edx,7 + add ebx,ecx + vpslld xmm4,xmm4,2 + add eax,DWORD [8+esp] + xor esi,edx + mov ebp,ebx + shld ebx,ebx,5 + add eax,esi + xor ebp,edx + shrd ecx,ecx,7 + add eax,ebx + vpor xmm4,xmm4,xmm6 + add edi,DWORD [12+esp] + xor ebp,ecx + vmovdqa xmm6,[64+esp] + mov esi,eax + shld eax,eax,5 + add edi,ebp + xor esi,ecx + shrd ebx,ebx,7 + add edi,eax + vpalignr xmm7,xmm4,xmm3,8 + vpxor xmm5,xmm5,xmm1 + add edx,DWORD [16+esp] + xor esi,ebx + mov ebp,edi + shld edi,edi,5 + vpxor xmm5,xmm5,xmm6 + vmovdqa [64+esp],xmm1 + add edx,esi + xor ebp,ebx + vmovdqa xmm1,xmm0 + vpaddd xmm0,xmm0,xmm4 + shrd eax,eax,7 + add edx,edi + vpxor xmm5,xmm5,xmm7 + add ecx,DWORD [20+esp] + xor ebp,eax + mov esi,edx + shld edx,edx,5 + vpsrld xmm7,xmm5,30 + vmovdqa [esp],xmm0 + add ecx,ebp + xor esi,eax + shrd edi,edi,7 + add ecx,edx + vpslld xmm5,xmm5,2 + add ebx,DWORD [24+esp] + xor esi,edi + mov ebp,ecx + shld ecx,ecx,5 + add ebx,esi + xor ebp,edi + shrd edx,edx,7 + add ebx,ecx + vpor xmm5,xmm5,xmm7 + add eax,DWORD [28+esp] + vmovdqa xmm7,[80+esp] + shrd ecx,ecx,7 + mov esi,ebx + xor ebp,edx + shld ebx,ebx,5 + add eax,ebp + xor esi,ecx + xor ecx,edx + add eax,ebx + vpalignr xmm0,xmm5,xmm4,8 + vpxor xmm6,xmm6,xmm2 + add edi,DWORD [32+esp] + and esi,ecx + xor ecx,edx + shrd ebx,ebx,7 + vpxor xmm6,xmm6,xmm7 + vmovdqa [80+esp],xmm2 + mov ebp,eax + xor esi,ecx + vmovdqa xmm2,xmm1 + vpaddd xmm1,xmm1,xmm5 + shld eax,eax,5 + add edi,esi + vpxor xmm6,xmm6,xmm0 + xor ebp,ebx + xor ebx,ecx + add edi,eax + add edx,DWORD [36+esp] + vpsrld xmm0,xmm6,30 + vmovdqa [16+esp],xmm1 + and ebp,ebx + xor ebx,ecx + shrd eax,eax,7 + mov esi,edi + vpslld xmm6,xmm6,2 + xor ebp,ebx + shld edi,edi,5 + add edx,ebp + xor esi,eax + xor eax,ebx + add edx,edi + add ecx,DWORD [40+esp] + and esi,eax + vpor xmm6,xmm6,xmm0 + xor eax,ebx + shrd edi,edi,7 + vmovdqa xmm0,[96+esp] + mov ebp,edx + xor esi,eax + shld edx,edx,5 + add ecx,esi + xor ebp,edi + xor edi,eax + add ecx,edx + add ebx,DWORD [44+esp] + and ebp,edi + xor edi,eax + shrd edx,edx,7 + mov esi,ecx + xor ebp,edi + shld ecx,ecx,5 + add ebx,ebp + xor esi,edx + xor edx,edi + add ebx,ecx + vpalignr xmm1,xmm6,xmm5,8 + vpxor xmm7,xmm7,xmm3 + add eax,DWORD [48+esp] + and esi,edx + xor edx,edi + shrd ecx,ecx,7 + vpxor xmm7,xmm7,xmm0 + vmovdqa [96+esp],xmm3 + mov ebp,ebx + xor esi,edx + vmovdqa xmm3,[144+esp] + vpaddd xmm2,xmm2,xmm6 + shld ebx,ebx,5 + add eax,esi + vpxor xmm7,xmm7,xmm1 + xor ebp,ecx + xor ecx,edx + add eax,ebx + add edi,DWORD [52+esp] + vpsrld xmm1,xmm7,30 + vmovdqa [32+esp],xmm2 + and ebp,ecx + xor ecx,edx + shrd ebx,ebx,7 + mov esi,eax + vpslld xmm7,xmm7,2 + xor ebp,ecx + shld eax,eax,5 + add edi,ebp + xor esi,ebx + xor ebx,ecx + add edi,eax + add edx,DWORD [56+esp] + and esi,ebx + vpor xmm7,xmm7,xmm1 + xor ebx,ecx + shrd eax,eax,7 + vmovdqa xmm1,[64+esp] + mov ebp,edi + xor esi,ebx + shld edi,edi,5 + add edx,esi + xor ebp,eax + xor eax,ebx + add edx,edi + add ecx,DWORD [60+esp] + and ebp,eax + xor eax,ebx + shrd edi,edi,7 + mov esi,edx + xor ebp,eax + shld edx,edx,5 + add ecx,ebp + xor esi,edi + xor edi,eax + add ecx,edx + vpalignr xmm2,xmm7,xmm6,8 + vpxor xmm0,xmm0,xmm4 + add ebx,DWORD [esp] + and esi,edi + xor edi,eax + shrd edx,edx,7 + vpxor xmm0,xmm0,xmm1 + vmovdqa [64+esp],xmm4 + mov ebp,ecx + xor esi,edi + vmovdqa xmm4,xmm3 + vpaddd xmm3,xmm3,xmm7 + shld ecx,ecx,5 + add ebx,esi + vpxor xmm0,xmm0,xmm2 + xor ebp,edx + xor edx,edi + add ebx,ecx + add eax,DWORD [4+esp] + vpsrld xmm2,xmm0,30 + vmovdqa [48+esp],xmm3 + and ebp,edx + xor edx,edi + shrd ecx,ecx,7 + mov esi,ebx + vpslld xmm0,xmm0,2 + xor ebp,edx + shld ebx,ebx,5 + add eax,ebp + xor esi,ecx + xor ecx,edx + add eax,ebx + add edi,DWORD [8+esp] + and esi,ecx + vpor xmm0,xmm0,xmm2 + xor ecx,edx + shrd ebx,ebx,7 + vmovdqa xmm2,[80+esp] + mov ebp,eax + xor esi,ecx + shld eax,eax,5 + add edi,esi + xor ebp,ebx + xor ebx,ecx + add edi,eax + add edx,DWORD [12+esp] + and ebp,ebx + xor ebx,ecx + shrd eax,eax,7 + mov esi,edi + xor ebp,ebx + shld edi,edi,5 + add edx,ebp + xor esi,eax + xor eax,ebx + add edx,edi + vpalignr xmm3,xmm0,xmm7,8 + vpxor xmm1,xmm1,xmm5 + add ecx,DWORD [16+esp] + and esi,eax + xor eax,ebx + shrd edi,edi,7 + vpxor xmm1,xmm1,xmm2 + vmovdqa [80+esp],xmm5 + mov ebp,edx + xor esi,eax + vmovdqa xmm5,xmm4 + vpaddd xmm4,xmm4,xmm0 + shld edx,edx,5 + add ecx,esi + vpxor xmm1,xmm1,xmm3 + xor ebp,edi + xor edi,eax + add ecx,edx + add ebx,DWORD [20+esp] + vpsrld xmm3,xmm1,30 + vmovdqa [esp],xmm4 + and ebp,edi + xor edi,eax + shrd edx,edx,7 + mov esi,ecx + vpslld xmm1,xmm1,2 + xor ebp,edi + shld ecx,ecx,5 + add ebx,ebp + xor esi,edx + xor edx,edi + add ebx,ecx + add eax,DWORD [24+esp] + and esi,edx + vpor xmm1,xmm1,xmm3 + xor edx,edi + shrd ecx,ecx,7 + vmovdqa xmm3,[96+esp] + mov ebp,ebx + xor esi,edx + shld ebx,ebx,5 + add eax,esi + xor ebp,ecx + xor ecx,edx + add eax,ebx + add edi,DWORD [28+esp] + and ebp,ecx + xor ecx,edx + shrd ebx,ebx,7 + mov esi,eax + xor ebp,ecx + shld eax,eax,5 + add edi,ebp + xor esi,ebx + xor ebx,ecx + add edi,eax + vpalignr xmm4,xmm1,xmm0,8 + vpxor xmm2,xmm2,xmm6 + add edx,DWORD [32+esp] + and esi,ebx + xor ebx,ecx + shrd eax,eax,7 + vpxor xmm2,xmm2,xmm3 + vmovdqa [96+esp],xmm6 + mov ebp,edi + xor esi,ebx + vmovdqa xmm6,xmm5 + vpaddd xmm5,xmm5,xmm1 + shld edi,edi,5 + add edx,esi + vpxor xmm2,xmm2,xmm4 + xor ebp,eax + xor eax,ebx + add edx,edi + add ecx,DWORD [36+esp] + vpsrld xmm4,xmm2,30 + vmovdqa [16+esp],xmm5 + and ebp,eax + xor eax,ebx + shrd edi,edi,7 + mov esi,edx + vpslld xmm2,xmm2,2 + xor ebp,eax + shld edx,edx,5 + add ecx,ebp + xor esi,edi + xor edi,eax + add ecx,edx + add ebx,DWORD [40+esp] + and esi,edi + vpor xmm2,xmm2,xmm4 + xor edi,eax + shrd edx,edx,7 + vmovdqa xmm4,[64+esp] + mov ebp,ecx + xor esi,edi + shld ecx,ecx,5 + add ebx,esi + xor ebp,edx + xor edx,edi + add ebx,ecx + add eax,DWORD [44+esp] + and ebp,edx + xor edx,edi + shrd ecx,ecx,7 + mov esi,ebx + xor ebp,edx + shld ebx,ebx,5 + add eax,ebp + xor esi,edx + add eax,ebx + vpalignr xmm5,xmm2,xmm1,8 + vpxor xmm3,xmm3,xmm7 + add edi,DWORD [48+esp] + xor esi,ecx + mov ebp,eax + shld eax,eax,5 + vpxor xmm3,xmm3,xmm4 + vmovdqa [64+esp],xmm7 + add edi,esi + xor ebp,ecx + vmovdqa xmm7,xmm6 + vpaddd xmm6,xmm6,xmm2 + shrd ebx,ebx,7 + add edi,eax + vpxor xmm3,xmm3,xmm5 + add edx,DWORD [52+esp] + xor ebp,ebx + mov esi,edi + shld edi,edi,5 + vpsrld xmm5,xmm3,30 + vmovdqa [32+esp],xmm6 + add edx,ebp + xor esi,ebx + shrd eax,eax,7 + add edx,edi + vpslld xmm3,xmm3,2 + add ecx,DWORD [56+esp] + xor esi,eax + mov ebp,edx + shld edx,edx,5 + add ecx,esi + xor ebp,eax + shrd edi,edi,7 + add ecx,edx + vpor xmm3,xmm3,xmm5 + add ebx,DWORD [60+esp] + xor ebp,edi + mov esi,ecx + shld ecx,ecx,5 + add ebx,ebp + xor esi,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [esp] + vpaddd xmm7,xmm7,xmm3 + xor esi,edx + mov ebp,ebx + shld ebx,ebx,5 + add eax,esi + vmovdqa [48+esp],xmm7 + xor ebp,edx + shrd ecx,ecx,7 + add eax,ebx + add edi,DWORD [4+esp] + xor ebp,ecx + mov esi,eax + shld eax,eax,5 + add edi,ebp + xor esi,ecx + shrd ebx,ebx,7 + add edi,eax + add edx,DWORD [8+esp] + xor esi,ebx + mov ebp,edi + shld edi,edi,5 + add edx,esi + xor ebp,ebx + shrd eax,eax,7 + add edx,edi + add ecx,DWORD [12+esp] + xor ebp,eax + mov esi,edx + shld edx,edx,5 + add ecx,ebp + xor esi,eax + shrd edi,edi,7 + add ecx,edx + mov ebp,DWORD [196+esp] + cmp ebp,DWORD [200+esp] + je NEAR L$008done + vmovdqa xmm7,[160+esp] + vmovdqa xmm6,[176+esp] + vmovdqu xmm0,[ebp] + vmovdqu xmm1,[16+ebp] + vmovdqu xmm2,[32+ebp] + vmovdqu xmm3,[48+ebp] + add ebp,64 + vpshufb xmm0,xmm0,xmm6 + mov DWORD [196+esp],ebp + vmovdqa [96+esp],xmm7 + add ebx,DWORD [16+esp] + xor esi,edi + vpshufb xmm1,xmm1,xmm6 + mov ebp,ecx + shld ecx,ecx,5 + vpaddd xmm4,xmm0,xmm7 + add ebx,esi + xor ebp,edi + shrd edx,edx,7 + add ebx,ecx + vmovdqa [esp],xmm4 + add eax,DWORD [20+esp] + xor ebp,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,ebp + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + add edi,DWORD [24+esp] + xor esi,ecx + mov ebp,eax + shld eax,eax,5 + add edi,esi + xor ebp,ecx + shrd ebx,ebx,7 + add edi,eax + add edx,DWORD [28+esp] + xor ebp,ebx + mov esi,edi + shld edi,edi,5 + add edx,ebp + xor esi,ebx + shrd eax,eax,7 + add edx,edi + add ecx,DWORD [32+esp] + xor esi,eax + vpshufb xmm2,xmm2,xmm6 + mov ebp,edx + shld edx,edx,5 + vpaddd xmm5,xmm1,xmm7 + add ecx,esi + xor ebp,eax + shrd edi,edi,7 + add ecx,edx + vmovdqa [16+esp],xmm5 + add ebx,DWORD [36+esp] + xor ebp,edi + mov esi,ecx + shld ecx,ecx,5 + add ebx,ebp + xor esi,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [40+esp] + xor esi,edx + mov ebp,ebx + shld ebx,ebx,5 + add eax,esi + xor ebp,edx + shrd ecx,ecx,7 + add eax,ebx + add edi,DWORD [44+esp] + xor ebp,ecx + mov esi,eax + shld eax,eax,5 + add edi,ebp + xor esi,ecx + shrd ebx,ebx,7 + add edi,eax + add edx,DWORD [48+esp] + xor esi,ebx + vpshufb xmm3,xmm3,xmm6 + mov ebp,edi + shld edi,edi,5 + vpaddd xmm6,xmm2,xmm7 + add edx,esi + xor ebp,ebx + shrd eax,eax,7 + add edx,edi + vmovdqa [32+esp],xmm6 + add ecx,DWORD [52+esp] + xor ebp,eax + mov esi,edx + shld edx,edx,5 + add ecx,ebp + xor esi,eax + shrd edi,edi,7 + add ecx,edx + add ebx,DWORD [56+esp] + xor esi,edi + mov ebp,ecx + shld ecx,ecx,5 + add ebx,esi + xor ebp,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [60+esp] + xor ebp,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,ebp + shrd ecx,ecx,7 + add eax,ebx + mov ebp,DWORD [192+esp] + add eax,DWORD [ebp] + add esi,DWORD [4+ebp] + add ecx,DWORD [8+ebp] + mov DWORD [ebp],eax + add edx,DWORD [12+ebp] + mov DWORD [4+ebp],esi + add edi,DWORD [16+ebp] + mov ebx,ecx + mov DWORD [8+ebp],ecx + xor ebx,edx + mov DWORD [12+ebp],edx + mov DWORD [16+ebp],edi + mov ebp,esi + and esi,ebx + mov ebx,ebp + jmp NEAR L$007loop +align 16 +L$008done: + add ebx,DWORD [16+esp] + xor esi,edi + mov ebp,ecx + shld ecx,ecx,5 + add ebx,esi + xor ebp,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [20+esp] + xor ebp,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,ebp + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + add edi,DWORD [24+esp] + xor esi,ecx + mov ebp,eax + shld eax,eax,5 + add edi,esi + xor ebp,ecx + shrd ebx,ebx,7 + add edi,eax + add edx,DWORD [28+esp] + xor ebp,ebx + mov esi,edi + shld edi,edi,5 + add edx,ebp + xor esi,ebx + shrd eax,eax,7 + add edx,edi + add ecx,DWORD [32+esp] + xor esi,eax + mov ebp,edx + shld edx,edx,5 + add ecx,esi + xor ebp,eax + shrd edi,edi,7 + add ecx,edx + add ebx,DWORD [36+esp] + xor ebp,edi + mov esi,ecx + shld ecx,ecx,5 + add ebx,ebp + xor esi,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [40+esp] + xor esi,edx + mov ebp,ebx + shld ebx,ebx,5 + add eax,esi + xor ebp,edx + shrd ecx,ecx,7 + add eax,ebx + add edi,DWORD [44+esp] + xor ebp,ecx + mov esi,eax + shld eax,eax,5 + add edi,ebp + xor esi,ecx + shrd ebx,ebx,7 + add edi,eax + add edx,DWORD [48+esp] + xor esi,ebx + mov ebp,edi + shld edi,edi,5 + add edx,esi + xor ebp,ebx + shrd eax,eax,7 + add edx,edi + add ecx,DWORD [52+esp] + xor ebp,eax + mov esi,edx + shld edx,edx,5 + add ecx,ebp + xor esi,eax + shrd edi,edi,7 + add ecx,edx + add ebx,DWORD [56+esp] + xor esi,edi + mov ebp,ecx + shld ecx,ecx,5 + add ebx,esi + xor ebp,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [60+esp] + xor ebp,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,ebp + shrd ecx,ecx,7 + add eax,ebx + vzeroall + mov ebp,DWORD [192+esp] + add eax,DWORD [ebp] + mov esp,DWORD [204+esp] + add esi,DWORD [4+ebp] + add ecx,DWORD [8+ebp] + mov DWORD [ebp],eax + add edx,DWORD [12+ebp] + mov DWORD [4+ebp],esi + add edi,DWORD [16+ebp] + mov DWORD [8+ebp],ecx + mov DWORD [12+ebp],edx + mov DWORD [16+ebp],edi + pop edi + pop esi + pop ebx + pop ebp + ret align 64 L$K_XX_XX: dd 1518500249,1518500249,1518500249,1518500249 diff --git a/third_party/serf/win-x86/crypto/sha/sha256-586.asm b/third_party/serf/win-x86/crypto/sha/sha256-586.asm index fe36bc5c9a..3e7cfcca07 100644 --- a/third_party/serf/win-x86/crypto/sha/sha256-586.asm +++ b/third_party/serf/win-x86/crypto/sha/sha256-586.asm @@ -49,11 +49,10 @@ L$000pic_point: jz NEAR L$003no_xmm and ecx,1073741824 and ebx,268435968 - test edx,536870912 - jnz NEAR L$004shaext or ecx,ebx and ecx,1342177280 cmp ecx,1342177280 + je NEAR L$004AVX test ebx,512 jnz NEAR L$005SSSE3 L$003no_xmm: @@ -3179,204 +3178,6 @@ L$009grand_loop: pop ebp ret align 32 -L$004shaext: - sub esp,32 - movdqu xmm1,[esi] - lea ebp,[128+ebp] - movdqu xmm2,[16+esi] - movdqa xmm7,[128+ebp] - pshufd xmm0,xmm1,27 - pshufd xmm1,xmm1,177 - pshufd xmm2,xmm2,27 -db 102,15,58,15,202,8 - punpcklqdq xmm2,xmm0 - jmp NEAR L$010loop_shaext -align 16 -L$010loop_shaext: - movdqu xmm3,[edi] - movdqu xmm4,[16+edi] - movdqu xmm5,[32+edi] -db 102,15,56,0,223 - movdqu xmm6,[48+edi] - movdqa [16+esp],xmm2 - movdqa xmm0,[ebp-128] - paddd xmm0,xmm3 -db 102,15,56,0,231 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - nop - movdqa [esp],xmm1 -db 15,56,203,202 - movdqa xmm0,[ebp-112] - paddd xmm0,xmm4 -db 102,15,56,0,239 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - lea edi,[64+edi] -db 15,56,204,220 -db 15,56,203,202 - movdqa xmm0,[ebp-96] - paddd xmm0,xmm5 -db 102,15,56,0,247 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm6 -db 102,15,58,15,253,4 - nop - paddd xmm3,xmm7 -db 15,56,204,229 -db 15,56,203,202 - movdqa xmm0,[ebp-80] - paddd xmm0,xmm6 -db 15,56,205,222 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm3 -db 102,15,58,15,254,4 - nop - paddd xmm4,xmm7 -db 15,56,204,238 -db 15,56,203,202 - movdqa xmm0,[ebp-64] - paddd xmm0,xmm3 -db 15,56,205,227 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm4 -db 102,15,58,15,251,4 - nop - paddd xmm5,xmm7 -db 15,56,204,243 -db 15,56,203,202 - movdqa xmm0,[ebp-48] - paddd xmm0,xmm4 -db 15,56,205,236 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm5 -db 102,15,58,15,252,4 - nop - paddd xmm6,xmm7 -db 15,56,204,220 -db 15,56,203,202 - movdqa xmm0,[ebp-32] - paddd xmm0,xmm5 -db 15,56,205,245 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm6 -db 102,15,58,15,253,4 - nop - paddd xmm3,xmm7 -db 15,56,204,229 -db 15,56,203,202 - movdqa xmm0,[ebp-16] - paddd xmm0,xmm6 -db 15,56,205,222 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm3 -db 102,15,58,15,254,4 - nop - paddd xmm4,xmm7 -db 15,56,204,238 -db 15,56,203,202 - movdqa xmm0,[ebp] - paddd xmm0,xmm3 -db 15,56,205,227 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm4 -db 102,15,58,15,251,4 - nop - paddd xmm5,xmm7 -db 15,56,204,243 -db 15,56,203,202 - movdqa xmm0,[16+ebp] - paddd xmm0,xmm4 -db 15,56,205,236 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm5 -db 102,15,58,15,252,4 - nop - paddd xmm6,xmm7 -db 15,56,204,220 -db 15,56,203,202 - movdqa xmm0,[32+ebp] - paddd xmm0,xmm5 -db 15,56,205,245 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm6 -db 102,15,58,15,253,4 - nop - paddd xmm3,xmm7 -db 15,56,204,229 -db 15,56,203,202 - movdqa xmm0,[48+ebp] - paddd xmm0,xmm6 -db 15,56,205,222 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm3 -db 102,15,58,15,254,4 - nop - paddd xmm4,xmm7 -db 15,56,204,238 -db 15,56,203,202 - movdqa xmm0,[64+ebp] - paddd xmm0,xmm3 -db 15,56,205,227 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm4 -db 102,15,58,15,251,4 - nop - paddd xmm5,xmm7 -db 15,56,204,243 -db 15,56,203,202 - movdqa xmm0,[80+ebp] - paddd xmm0,xmm4 -db 15,56,205,236 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm5 -db 102,15,58,15,252,4 -db 15,56,203,202 - paddd xmm6,xmm7 - movdqa xmm0,[96+ebp] - paddd xmm0,xmm5 -db 15,56,203,209 - pshufd xmm0,xmm0,14 -db 15,56,205,245 - movdqa xmm7,[128+ebp] -db 15,56,203,202 - movdqa xmm0,[112+ebp] - paddd xmm0,xmm6 - nop -db 15,56,203,209 - pshufd xmm0,xmm0,14 - cmp eax,edi - nop -db 15,56,203,202 - paddd xmm2,[16+esp] - paddd xmm1,[esp] - jnz NEAR L$010loop_shaext - pshufd xmm2,xmm2,177 - pshufd xmm7,xmm1,27 - pshufd xmm1,xmm1,177 - punpckhqdq xmm1,xmm2 -db 102,15,58,15,215,8 - mov esp,DWORD [44+esp] - movdqu [esi],xmm1 - movdqu [16+esi],xmm2 - pop edi - pop esi - pop ebx - pop ebp - ret -align 32 L$005SSSE3: lea esp,[esp-96] mov eax,DWORD [esi] @@ -3396,9 +3197,9 @@ L$005SSSE3: mov DWORD [24+esp],ecx mov DWORD [28+esp],esi movdqa xmm7,[256+ebp] - jmp NEAR L$011grand_ssse3 + jmp NEAR L$010grand_ssse3 align 16 -L$011grand_ssse3: +L$010grand_ssse3: movdqu xmm0,[edi] movdqu xmm1,[16+edi] movdqu xmm2,[32+edi] @@ -3421,9 +3222,9 @@ db 102,15,56,0,223 paddd xmm7,xmm3 movdqa [64+esp],xmm6 movdqa [80+esp],xmm7 - jmp NEAR L$012ssse3_00_47 + jmp NEAR L$011ssse3_00_47 align 16 -L$012ssse3_00_47: +L$011ssse3_00_47: add ebp,64 mov ecx,edx movdqa xmm4,xmm1 @@ -4066,7 +3867,7 @@ db 102,15,58,15,249,4 add eax,ecx movdqa [80+esp],xmm6 cmp DWORD [64+ebp],66051 - jne NEAR L$012ssse3_00_47 + jne NEAR L$011ssse3_00_47 mov ecx,edx ror edx,14 mov esi,DWORD [20+esp] @@ -4580,8 +4381,1189 @@ db 102,15,58,15,249,4 movdqa xmm7,[64+ebp] sub ebp,192 cmp edi,DWORD [104+esp] - jb NEAR L$011grand_ssse3 + jb NEAR L$010grand_ssse3 + mov esp,DWORD [108+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +align 32 +L$004AVX: + lea esp,[esp-96] + vzeroall + mov eax,DWORD [esi] + mov ebx,DWORD [4+esi] + mov ecx,DWORD [8+esi] + mov edi,DWORD [12+esi] + mov DWORD [4+esp],ebx + xor ebx,ecx + mov DWORD [8+esp],ecx + mov DWORD [12+esp],edi + mov edx,DWORD [16+esi] + mov edi,DWORD [20+esi] + mov ecx,DWORD [24+esi] + mov esi,DWORD [28+esi] + mov DWORD [20+esp],edi + mov edi,DWORD [100+esp] + mov DWORD [24+esp],ecx + mov DWORD [28+esp],esi + vmovdqa xmm7,[256+ebp] + jmp NEAR L$012grand_avx +align 32 +L$012grand_avx: + vmovdqu xmm0,[edi] + vmovdqu xmm1,[16+edi] + vmovdqu xmm2,[32+edi] + vmovdqu xmm3,[48+edi] + add edi,64 + vpshufb xmm0,xmm0,xmm7 + mov DWORD [100+esp],edi + vpshufb xmm1,xmm1,xmm7 + vpshufb xmm2,xmm2,xmm7 + vpaddd xmm4,xmm0,[ebp] + vpshufb xmm3,xmm3,xmm7 + vpaddd xmm5,xmm1,[16+ebp] + vpaddd xmm6,xmm2,[32+ebp] + vpaddd xmm7,xmm3,[48+ebp] + vmovdqa [32+esp],xmm4 + vmovdqa [48+esp],xmm5 + vmovdqa [64+esp],xmm6 + vmovdqa [80+esp],xmm7 + jmp NEAR L$013avx_00_47 +align 16 +L$013avx_00_47: + add ebp,64 + vpalignr xmm4,xmm1,xmm0,4 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [20+esp] + vpalignr xmm7,xmm3,xmm2,4 + xor edx,ecx + mov edi,DWORD [24+esp] + xor esi,edi + vpsrld xmm6,xmm4,7 + shrd edx,edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + vpaddd xmm0,xmm0,xmm7 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrld xmm7,xmm4,3 + mov ecx,eax + add edx,edi + mov edi,DWORD [4+esp] + vpslld xmm5,xmm4,14 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [esp],eax + vpxor xmm4,xmm7,xmm6 + xor ecx,eax + xor eax,edi + add edx,DWORD [28+esp] + vpshufd xmm7,xmm3,250 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpsrld xmm6,xmm6,11 + add edx,DWORD [32+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpxor xmm4,xmm4,xmm5 + add ebx,edx + add edx,DWORD [12+esp] + add ebx,ecx + vpslld xmm5,xmm5,11 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [16+esp] + vpxor xmm4,xmm4,xmm6 + xor edx,ecx + mov edi,DWORD [20+esp] + xor esi,edi + vpsrld xmm6,xmm7,10 + shrd edx,edx,5 + and esi,ecx + mov DWORD [12+esp],ecx + vpxor xmm4,xmm4,xmm5 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,ebx + add edx,edi + mov edi,DWORD [esp] + vpaddd xmm0,xmm0,xmm4 + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [28+esp],ebx + vpxor xmm6,xmm6,xmm5 + xor ecx,ebx + xor ebx,edi + add edx,DWORD [24+esp] + vpsrlq xmm7,xmm7,19 + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + vpxor xmm6,xmm6,xmm7 + add edx,DWORD [36+esp] + xor eax,edi + shrd ecx,ecx,2 + vpshufd xmm7,xmm6,132 + add eax,edx + add edx,DWORD [8+esp] + add eax,ecx + vpsrldq xmm7,xmm7,8 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [12+esp] + vpaddd xmm0,xmm0,xmm7 + xor edx,ecx + mov edi,DWORD [16+esp] + xor esi,edi + vpshufd xmm7,xmm0,80 + shrd edx,edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + vpsrld xmm6,xmm7,10 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + vpxor xmm6,xmm6,xmm5 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [24+esp],eax + vpsrlq xmm7,xmm7,19 + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + vpxor xmm6,xmm6,xmm7 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpshufd xmm7,xmm6,232 + add edx,DWORD [40+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpslldq xmm7,xmm7,8 + add ebx,edx + add edx,DWORD [4+esp] + add ebx,ecx + vpaddd xmm0,xmm0,xmm7 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [8+esp] + vpaddd xmm6,xmm0,[ebp] + xor edx,ecx + mov edi,DWORD [12+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [4+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [20+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [44+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + vmovdqa [32+esp],xmm6 + vpalignr xmm4,xmm2,xmm1,4 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [4+esp] + vpalignr xmm7,xmm0,xmm3,4 + xor edx,ecx + mov edi,DWORD [8+esp] + xor esi,edi + vpsrld xmm6,xmm4,7 + shrd edx,edx,5 + and esi,ecx + mov DWORD [esp],ecx + vpaddd xmm1,xmm1,xmm7 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrld xmm7,xmm4,3 + mov ecx,eax + add edx,edi + mov edi,DWORD [20+esp] + vpslld xmm5,xmm4,14 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [16+esp],eax + vpxor xmm4,xmm7,xmm6 + xor ecx,eax + xor eax,edi + add edx,DWORD [12+esp] + vpshufd xmm7,xmm0,250 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpsrld xmm6,xmm6,11 + add edx,DWORD [48+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpxor xmm4,xmm4,xmm5 + add ebx,edx + add edx,DWORD [28+esp] + add ebx,ecx + vpslld xmm5,xmm5,11 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [esp] + vpxor xmm4,xmm4,xmm6 + xor edx,ecx + mov edi,DWORD [4+esp] + xor esi,edi + vpsrld xmm6,xmm7,10 + shrd edx,edx,5 + and esi,ecx + mov DWORD [28+esp],ecx + vpxor xmm4,xmm4,xmm5 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,ebx + add edx,edi + mov edi,DWORD [16+esp] + vpaddd xmm1,xmm1,xmm4 + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [12+esp],ebx + vpxor xmm6,xmm6,xmm5 + xor ecx,ebx + xor ebx,edi + add edx,DWORD [8+esp] + vpsrlq xmm7,xmm7,19 + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + vpxor xmm6,xmm6,xmm7 + add edx,DWORD [52+esp] + xor eax,edi + shrd ecx,ecx,2 + vpshufd xmm7,xmm6,132 + add eax,edx + add edx,DWORD [24+esp] + add eax,ecx + vpsrldq xmm7,xmm7,8 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [28+esp] + vpaddd xmm1,xmm1,xmm7 + xor edx,ecx + mov edi,DWORD [esp] + xor esi,edi + vpshufd xmm7,xmm1,80 + shrd edx,edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + vpsrld xmm6,xmm7,10 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,eax + add edx,edi + mov edi,DWORD [12+esp] + vpxor xmm6,xmm6,xmm5 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [8+esp],eax + vpsrlq xmm7,xmm7,19 + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + vpxor xmm6,xmm6,xmm7 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpshufd xmm7,xmm6,232 + add edx,DWORD [56+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpslldq xmm7,xmm7,8 + add ebx,edx + add edx,DWORD [20+esp] + add ebx,ecx + vpaddd xmm1,xmm1,xmm7 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [24+esp] + vpaddd xmm6,xmm1,[16+ebp] + xor edx,ecx + mov edi,DWORD [28+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [20+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [60+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + vmovdqa [48+esp],xmm6 + vpalignr xmm4,xmm3,xmm2,4 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [20+esp] + vpalignr xmm7,xmm1,xmm0,4 + xor edx,ecx + mov edi,DWORD [24+esp] + xor esi,edi + vpsrld xmm6,xmm4,7 + shrd edx,edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + vpaddd xmm2,xmm2,xmm7 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrld xmm7,xmm4,3 + mov ecx,eax + add edx,edi + mov edi,DWORD [4+esp] + vpslld xmm5,xmm4,14 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [esp],eax + vpxor xmm4,xmm7,xmm6 + xor ecx,eax + xor eax,edi + add edx,DWORD [28+esp] + vpshufd xmm7,xmm1,250 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpsrld xmm6,xmm6,11 + add edx,DWORD [64+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpxor xmm4,xmm4,xmm5 + add ebx,edx + add edx,DWORD [12+esp] + add ebx,ecx + vpslld xmm5,xmm5,11 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [16+esp] + vpxor xmm4,xmm4,xmm6 + xor edx,ecx + mov edi,DWORD [20+esp] + xor esi,edi + vpsrld xmm6,xmm7,10 + shrd edx,edx,5 + and esi,ecx + mov DWORD [12+esp],ecx + vpxor xmm4,xmm4,xmm5 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,ebx + add edx,edi + mov edi,DWORD [esp] + vpaddd xmm2,xmm2,xmm4 + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [28+esp],ebx + vpxor xmm6,xmm6,xmm5 + xor ecx,ebx + xor ebx,edi + add edx,DWORD [24+esp] + vpsrlq xmm7,xmm7,19 + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + vpxor xmm6,xmm6,xmm7 + add edx,DWORD [68+esp] + xor eax,edi + shrd ecx,ecx,2 + vpshufd xmm7,xmm6,132 + add eax,edx + add edx,DWORD [8+esp] + add eax,ecx + vpsrldq xmm7,xmm7,8 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [12+esp] + vpaddd xmm2,xmm2,xmm7 + xor edx,ecx + mov edi,DWORD [16+esp] + xor esi,edi + vpshufd xmm7,xmm2,80 + shrd edx,edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + vpsrld xmm6,xmm7,10 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + vpxor xmm6,xmm6,xmm5 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [24+esp],eax + vpsrlq xmm7,xmm7,19 + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + vpxor xmm6,xmm6,xmm7 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpshufd xmm7,xmm6,232 + add edx,DWORD [72+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpslldq xmm7,xmm7,8 + add ebx,edx + add edx,DWORD [4+esp] + add ebx,ecx + vpaddd xmm2,xmm2,xmm7 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [8+esp] + vpaddd xmm6,xmm2,[32+ebp] + xor edx,ecx + mov edi,DWORD [12+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [4+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [20+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [76+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + vmovdqa [64+esp],xmm6 + vpalignr xmm4,xmm0,xmm3,4 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [4+esp] + vpalignr xmm7,xmm2,xmm1,4 + xor edx,ecx + mov edi,DWORD [8+esp] + xor esi,edi + vpsrld xmm6,xmm4,7 + shrd edx,edx,5 + and esi,ecx + mov DWORD [esp],ecx + vpaddd xmm3,xmm3,xmm7 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrld xmm7,xmm4,3 + mov ecx,eax + add edx,edi + mov edi,DWORD [20+esp] + vpslld xmm5,xmm4,14 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [16+esp],eax + vpxor xmm4,xmm7,xmm6 + xor ecx,eax + xor eax,edi + add edx,DWORD [12+esp] + vpshufd xmm7,xmm2,250 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpsrld xmm6,xmm6,11 + add edx,DWORD [80+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpxor xmm4,xmm4,xmm5 + add ebx,edx + add edx,DWORD [28+esp] + add ebx,ecx + vpslld xmm5,xmm5,11 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [esp] + vpxor xmm4,xmm4,xmm6 + xor edx,ecx + mov edi,DWORD [4+esp] + xor esi,edi + vpsrld xmm6,xmm7,10 + shrd edx,edx,5 + and esi,ecx + mov DWORD [28+esp],ecx + vpxor xmm4,xmm4,xmm5 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,ebx + add edx,edi + mov edi,DWORD [16+esp] + vpaddd xmm3,xmm3,xmm4 + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [12+esp],ebx + vpxor xmm6,xmm6,xmm5 + xor ecx,ebx + xor ebx,edi + add edx,DWORD [8+esp] + vpsrlq xmm7,xmm7,19 + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + vpxor xmm6,xmm6,xmm7 + add edx,DWORD [84+esp] + xor eax,edi + shrd ecx,ecx,2 + vpshufd xmm7,xmm6,132 + add eax,edx + add edx,DWORD [24+esp] + add eax,ecx + vpsrldq xmm7,xmm7,8 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [28+esp] + vpaddd xmm3,xmm3,xmm7 + xor edx,ecx + mov edi,DWORD [esp] + xor esi,edi + vpshufd xmm7,xmm3,80 + shrd edx,edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + vpsrld xmm6,xmm7,10 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,eax + add edx,edi + mov edi,DWORD [12+esp] + vpxor xmm6,xmm6,xmm5 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [8+esp],eax + vpsrlq xmm7,xmm7,19 + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + vpxor xmm6,xmm6,xmm7 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpshufd xmm7,xmm6,232 + add edx,DWORD [88+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpslldq xmm7,xmm7,8 + add ebx,edx + add edx,DWORD [20+esp] + add ebx,ecx + vpaddd xmm3,xmm3,xmm7 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [24+esp] + vpaddd xmm6,xmm3,[48+ebp] + xor edx,ecx + mov edi,DWORD [28+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [20+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [92+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + vmovdqa [80+esp],xmm6 + cmp DWORD [64+ebp],66051 + jne NEAR L$013avx_00_47 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [20+esp] + xor edx,ecx + mov edi,DWORD [24+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [4+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [28+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [32+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [12+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [16+esp] + xor edx,ecx + mov edi,DWORD [20+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [12+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [28+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [24+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [36+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [8+esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [12+esp] + xor edx,ecx + mov edi,DWORD [16+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [24+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [40+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [4+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [8+esp] + xor edx,ecx + mov edi,DWORD [12+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [4+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [20+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [44+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [4+esp] + xor edx,ecx + mov edi,DWORD [8+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [20+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [16+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [12+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [48+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [28+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [esp] + xor edx,ecx + mov edi,DWORD [4+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [28+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [16+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [12+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [8+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [52+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [28+esp] + xor edx,ecx + mov edi,DWORD [esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [12+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [8+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [56+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [20+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [24+esp] + xor edx,ecx + mov edi,DWORD [28+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [20+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [60+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [20+esp] + xor edx,ecx + mov edi,DWORD [24+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [4+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [28+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [64+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [12+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [16+esp] + xor edx,ecx + mov edi,DWORD [20+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [12+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [28+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [24+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [68+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [8+esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [12+esp] + xor edx,ecx + mov edi,DWORD [16+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [24+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [72+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [4+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [8+esp] + xor edx,ecx + mov edi,DWORD [12+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [4+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [20+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [76+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [4+esp] + xor edx,ecx + mov edi,DWORD [8+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [20+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [16+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [12+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [80+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [28+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [esp] + xor edx,ecx + mov edi,DWORD [4+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [28+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [16+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [12+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [8+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [84+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [28+esp] + xor edx,ecx + mov edi,DWORD [esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [12+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [8+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [88+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [20+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [24+esp] + xor edx,ecx + mov edi,DWORD [28+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [20+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [92+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + mov esi,DWORD [96+esp] + xor ebx,edi + mov ecx,DWORD [12+esp] + add eax,DWORD [esi] + add ebx,DWORD [4+esi] + add edi,DWORD [8+esi] + add ecx,DWORD [12+esi] + mov DWORD [esi],eax + mov DWORD [4+esi],ebx + mov DWORD [8+esi],edi + mov DWORD [12+esi],ecx + mov DWORD [4+esp],ebx + xor ebx,edi + mov DWORD [8+esp],edi + mov DWORD [12+esp],ecx + mov edi,DWORD [20+esp] + mov ecx,DWORD [24+esp] + add edx,DWORD [16+esi] + add edi,DWORD [20+esi] + add ecx,DWORD [24+esi] + mov DWORD [16+esi],edx + mov DWORD [20+esi],edi + mov DWORD [20+esp],edi + mov edi,DWORD [28+esp] + mov DWORD [24+esi],ecx + add edi,DWORD [28+esi] + mov DWORD [24+esp],ecx + mov DWORD [28+esi],edi + mov DWORD [28+esp],edi + mov edi,DWORD [100+esp] + vmovdqa xmm7,[64+ebp] + sub ebp,192 + cmp edi,DWORD [104+esp] + jb NEAR L$012grand_avx mov esp,DWORD [108+esp] + vzeroall pop edi pop esi pop ebx diff --git a/third_party/serf/win-x86_64/crypto/bn/x86_64-mont5.asm b/third_party/serf/win-x86_64/crypto/bn/x86_64-mont5.asm index 284318aae3..560b384e8a 100644 --- a/third_party/serf/win-x86_64/crypto/bn/x86_64-mont5.asm +++ b/third_party/serf/win-x86_64/crypto/bn/x86_64-mont5.asm @@ -1616,6 +1616,15 @@ $L$8x_tail: ALIGN 32 $L$8x_tail_done: add r8,QWORD[rdx] + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + + xor rax,rax neg rsi diff --git a/third_party/serf/win-x86_64/crypto/cpu-x86_64-asm.asm b/third_party/serf/win-x86_64/crypto/cpu-x86_64-asm.asm deleted file mode 100644 index c92d7bbc1f..0000000000 --- a/third_party/serf/win-x86_64/crypto/cpu-x86_64-asm.asm +++ /dev/null @@ -1,154 +0,0 @@ -default rel -%define XMMWORD -%define YMMWORD -%define ZMMWORD -section .text code align=64 - - -global OPENSSL_ia32_cpuid - -ALIGN 16 -OPENSSL_ia32_cpuid: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_OPENSSL_ia32_cpuid: - mov rdi,rcx - - - - - mov rdi,rcx - mov r8,rbx - - xor eax,eax - mov DWORD[8+rdi],eax - cpuid - mov r11d,eax - - xor eax,eax - cmp ebx,0x756e6547 - setne al - mov r9d,eax - cmp edx,0x49656e69 - setne al - or r9d,eax - cmp ecx,0x6c65746e - setne al - or r9d,eax - jz NEAR $L$intel - - cmp ebx,0x68747541 - setne al - mov r10d,eax - cmp edx,0x69746E65 - setne al - or r10d,eax - cmp ecx,0x444D4163 - setne al - or r10d,eax - jnz NEAR $L$intel - - - - - mov eax,0x80000000 - cpuid - - - cmp eax,0x80000001 - jb NEAR $L$intel - mov r10d,eax - mov eax,0x80000001 - cpuid - - - or r9d,ecx - and r9d,0x00000801 - - cmp r10d,0x80000008 - jb NEAR $L$intel - - mov eax,0x80000008 - cpuid - - movzx r10,cl - inc r10 - - mov eax,1 - cpuid - - bt edx,28 - jnc NEAR $L$generic - shr ebx,16 - cmp bl,r10b - ja NEAR $L$generic - and edx,0xefffffff - jmp NEAR $L$generic - -$L$intel: - cmp r11d,4 - mov r10d,-1 - jb NEAR $L$nocacheinfo - - mov eax,4 - mov ecx,0 - cpuid - mov r10d,eax - shr r10d,14 - and r10d,0xfff - - cmp r11d,7 - jb NEAR $L$nocacheinfo - - mov eax,7 - xor ecx,ecx - cpuid - mov DWORD[8+rdi],ebx - -$L$nocacheinfo: - mov eax,1 - cpuid - - and edx,0xbfefffff - cmp r9d,0 - jne NEAR $L$notintel - or edx,0x40000000 -$L$notintel: - bt edx,28 - jnc NEAR $L$generic - and edx,0xefffffff - cmp r10d,0 - je NEAR $L$generic - - or edx,0x10000000 - shr ebx,16 - cmp bl,1 - ja NEAR $L$generic - and edx,0xefffffff -$L$generic: - and r9d,0x00000800 - and ecx,0xfffff7ff - or r9d,ecx - - mov r10d,edx - bt r9d,27 - jnc NEAR $L$clear_avx - xor ecx,ecx -DB 0x0f,0x01,0xd0 - and eax,6 - cmp eax,6 - je NEAR $L$done -$L$clear_avx: - mov eax,0xefffe7ff - and r9d,eax - and DWORD[8+rdi],0xffffffdf -$L$done: - mov DWORD[4+rdi],r9d - mov DWORD[rdi],r10d - mov rbx,r8 - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_OPENSSL_ia32_cpuid: - diff --git a/third_party/serf/win-x86_64/crypto/ec/p256-x86_64-asm.asm b/third_party/serf/win-x86_64/crypto/ec/p256-x86_64-asm.asm new file mode 100644 index 0000000000..45ba686c7b --- /dev/null +++ b/third_party/serf/win-x86_64/crypto/ec/p256-x86_64-asm.asm @@ -0,0 +1,1916 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + +EXTERN OPENSSL_ia32cap_P + + +ALIGN 64 +$L$poly: + DQ 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 + +$L$One: + DD 1,1,1,1,1,1,1,1 +$L$Two: + DD 2,2,2,2,2,2,2,2 +$L$Three: + DD 3,3,3,3,3,3,3,3 +$L$ONE_mont: + DQ 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe + + +ALIGN 64 +ecp_nistz256_mul_by_2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_mul_by_2: + mov rdi,rcx + mov rsi,rdx + + + push r12 + push r13 + + mov r8,QWORD[rsi] + mov r9,QWORD[8+rsi] + add r8,r8 + mov r10,QWORD[16+rsi] + adc r9,r9 + mov r11,QWORD[24+rsi] + lea rsi,[$L$poly] + mov rax,r8 + adc r10,r10 + adc r11,r11 + mov rdx,r9 + sbb r13,r13 + + sub r8,QWORD[rsi] + mov rcx,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r12,r11 + sbb r11,QWORD[24+rsi] + test r13,r13 + + cmovz r8,rax + cmovz r9,rdx + mov QWORD[rdi],r8 + cmovz r10,rcx + mov QWORD[8+rdi],r9 + cmovz r11,r12 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + + pop r13 + pop r12 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_mul_by_2: + + + +global ecp_nistz256_neg + +ALIGN 32 +ecp_nistz256_neg: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_neg: + mov rdi,rcx + mov rsi,rdx + + + push r12 + push r13 + + xor r8,r8 + xor r9,r9 + xor r10,r10 + xor r11,r11 + xor r13,r13 + + sub r8,QWORD[rsi] + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov rax,r8 + sbb r11,QWORD[24+rsi] + lea rsi,[$L$poly] + mov rdx,r9 + sbb r13,0 + + add r8,QWORD[rsi] + mov rcx,r10 + adc r9,QWORD[8+rsi] + adc r10,QWORD[16+rsi] + mov r12,r11 + adc r11,QWORD[24+rsi] + test r13,r13 + + cmovz r8,rax + cmovz r9,rdx + mov QWORD[rdi],r8 + cmovz r10,rcx + mov QWORD[8+rdi],r9 + cmovz r11,r12 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + + pop r13 + pop r12 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_neg: + + + + + + +global ecp_nistz256_mul_mont + +ALIGN 32 +ecp_nistz256_mul_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_mul_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +$L$mul_mont: + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + mov rbx,rdx + mov rax,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + + call __ecp_nistz256_mul_montq +$L$mul_mont_done: + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_mul_mont: + + +ALIGN 32 +__ecp_nistz256_mul_montq: + + + mov rbp,rax + mul r9 + mov r14,QWORD[(($L$poly+8))] + mov r8,rax + mov rax,rbp + mov r9,rdx + + mul r10 + mov r15,QWORD[(($L$poly+24))] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul r11 + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul r12 + add r11,rax + mov rax,r8 + adc rdx,0 + xor r13,r13 + mov r12,rdx + + + + + + + + + + + mov rbp,r8 + shl r8,32 + mul r15 + shr rbp,32 + add r9,r8 + adc r10,rbp + adc r11,rax + mov rax,QWORD[8+rbx] + adc r12,rdx + adc r13,0 + xor r8,r8 + + + + mov rbp,rax + mul QWORD[rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,r9 + adc r13,rdx + adc r8,0 + + + + mov rbp,r9 + shl r9,32 + mul r15 + shr rbp,32 + add r10,r9 + adc r11,rbp + adc r12,rax + mov rax,QWORD[16+rbx] + adc r13,rdx + adc r8,0 + xor r9,r9 + + + + mov rbp,rax + mul QWORD[rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r13,rcx + adc rdx,0 + add r13,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + + + mov rbp,r10 + shl r10,32 + mul r15 + shr rbp,32 + add r11,r10 + adc r12,rbp + adc r13,rax + mov rax,QWORD[24+rbx] + adc r8,rdx + adc r9,0 + xor r10,r10 + + + + mov rbp,rax + mul QWORD[rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r13,rcx + adc rdx,0 + add r13,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r8,rcx + adc rdx,0 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + + + mov rbp,r11 + shl r11,32 + mul r15 + shr rbp,32 + add r12,r11 + adc r13,rbp + mov rcx,r12 + adc r8,rax + adc r9,rdx + mov rbp,r13 + adc r10,0 + + + + sub r12,-1 + mov rbx,r8 + sbb r13,r14 + sbb r8,0 + mov rdx,r9 + sbb r9,r15 + sbb r10,0 + + cmovc r12,rcx + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rbx + mov QWORD[8+rdi],r13 + cmovc r9,rdx + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + + + + + + + +global ecp_nistz256_sqr_mont + +ALIGN 32 +ecp_nistz256_sqr_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_sqr_mont: + mov rdi,rcx + mov rsi,rdx + + + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + mov rax,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + + call __ecp_nistz256_sqr_montq +$L$sqr_mont_done: + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_sqr_mont: + + +ALIGN 32 +__ecp_nistz256_sqr_montq: + mov r13,rax + mul r14 + mov r9,rax + mov rax,r15 + mov r10,rdx + + mul r13 + add r10,rax + mov rax,r8 + adc rdx,0 + mov r11,rdx + + mul r13 + add r11,rax + mov rax,r15 + adc rdx,0 + mov r12,rdx + + + mul r14 + add r11,rax + mov rax,r8 + adc rdx,0 + mov rbp,rdx + + mul r14 + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,rbp + mov r13,rdx + adc r13,0 + + + mul r15 + xor r15,r15 + add r13,rax + mov rax,QWORD[rsi] + mov r14,rdx + adc r14,0 + + add r9,r9 + adc r10,r10 + adc r11,r11 + adc r12,r12 + adc r13,r13 + adc r14,r14 + adc r15,0 + + mul rax + mov r8,rax + mov rax,QWORD[8+rsi] + mov rcx,rdx + + mul rax + add r9,rcx + adc r10,rax + mov rax,QWORD[16+rsi] + adc rdx,0 + mov rcx,rdx + + mul rax + add r11,rcx + adc r12,rax + mov rax,QWORD[24+rsi] + adc rdx,0 + mov rcx,rdx + + mul rax + add r13,rcx + adc r14,rax + mov rax,r8 + adc r15,rdx + + mov rsi,QWORD[(($L$poly+8))] + mov rbp,QWORD[(($L$poly+24))] + + + + + mov rcx,r8 + shl r8,32 + mul rbp + shr rcx,32 + add r9,r8 + adc r10,rcx + adc r11,rax + mov rax,r9 + adc rdx,0 + + + + mov rcx,r9 + shl r9,32 + mov r8,rdx + mul rbp + shr rcx,32 + add r10,r9 + adc r11,rcx + adc r8,rax + mov rax,r10 + adc rdx,0 + + + + mov rcx,r10 + shl r10,32 + mov r9,rdx + mul rbp + shr rcx,32 + add r11,r10 + adc r8,rcx + adc r9,rax + mov rax,r11 + adc rdx,0 + + + + mov rcx,r11 + shl r11,32 + mov r10,rdx + mul rbp + shr rcx,32 + add r8,r11 + adc r9,rcx + adc r10,rax + adc rdx,0 + xor r11,r11 + + + + add r12,r8 + adc r13,r9 + mov r8,r12 + adc r14,r10 + adc r15,rdx + mov r9,r13 + adc r11,0 + + sub r12,-1 + mov r10,r14 + sbb r13,rsi + sbb r14,0 + mov rcx,r15 + sbb r15,rbp + sbb r11,0 + + cmovc r12,r8 + cmovc r13,r9 + mov QWORD[rdi],r12 + cmovc r14,r10 + mov QWORD[8+rdi],r13 + cmovc r15,rcx + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + + DB 0F3h,0C3h ;repret + + + + + + + +global ecp_nistz256_from_mont + +ALIGN 32 +ecp_nistz256_from_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_from_mont: + mov rdi,rcx + mov rsi,rdx + + + push r12 + push r13 + + mov rax,QWORD[rsi] + mov r13,QWORD[(($L$poly+24))] + mov r9,QWORD[8+rsi] + mov r10,QWORD[16+rsi] + mov r11,QWORD[24+rsi] + mov r8,rax + mov r12,QWORD[(($L$poly+8))] + + + + mov rcx,rax + shl r8,32 + mul r13 + shr rcx,32 + add r9,r8 + adc r10,rcx + adc r11,rax + mov rax,r9 + adc rdx,0 + + + + mov rcx,r9 + shl r9,32 + mov r8,rdx + mul r13 + shr rcx,32 + add r10,r9 + adc r11,rcx + adc r8,rax + mov rax,r10 + adc rdx,0 + + + + mov rcx,r10 + shl r10,32 + mov r9,rdx + mul r13 + shr rcx,32 + add r11,r10 + adc r8,rcx + adc r9,rax + mov rax,r11 + adc rdx,0 + + + + mov rcx,r11 + shl r11,32 + mov r10,rdx + mul r13 + shr rcx,32 + add r8,r11 + adc r9,rcx + mov rcx,r8 + adc r10,rax + mov rsi,r9 + adc rdx,0 + + sub r8,-1 + mov rax,r10 + sbb r9,r12 + sbb r10,0 + mov r11,rdx + sbb rdx,r13 + sbb r13,r13 + + cmovnz r8,rcx + cmovnz r9,rsi + mov QWORD[rdi],r8 + cmovnz r10,rax + mov QWORD[8+rdi],r9 + cmovz r11,rdx + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + + pop r13 + pop r12 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_from_mont: + + +global ecp_nistz256_select_w5 + +ALIGN 32 +ecp_nistz256_select_w5: + lea rax,[((-136))+rsp] +$L$SEH_begin_ecp_nistz256_select_w5: +DB 0x48,0x8d,0x60,0xe0 +DB 0x0f,0x29,0x70,0xe0 +DB 0x0f,0x29,0x78,0xf0 +DB 0x44,0x0f,0x29,0x00 +DB 0x44,0x0f,0x29,0x48,0x10 +DB 0x44,0x0f,0x29,0x50,0x20 +DB 0x44,0x0f,0x29,0x58,0x30 +DB 0x44,0x0f,0x29,0x60,0x40 +DB 0x44,0x0f,0x29,0x68,0x50 +DB 0x44,0x0f,0x29,0x70,0x60 +DB 0x44,0x0f,0x29,0x78,0x70 + movdqa xmm0,XMMWORD[$L$One] + movd xmm1,r8d + + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + + movdqa xmm8,xmm0 + pshufd xmm1,xmm1,0 + + mov rax,16 +$L$select_loop_sse_w5: + + movdqa xmm15,xmm8 + paddd xmm8,xmm0 + pcmpeqd xmm15,xmm1 + + movdqa xmm9,XMMWORD[rdx] + movdqa xmm10,XMMWORD[16+rdx] + movdqa xmm11,XMMWORD[32+rdx] + movdqa xmm12,XMMWORD[48+rdx] + movdqa xmm13,XMMWORD[64+rdx] + movdqa xmm14,XMMWORD[80+rdx] + lea rdx,[96+rdx] + + pand xmm9,xmm15 + pand xmm10,xmm15 + por xmm2,xmm9 + pand xmm11,xmm15 + por xmm3,xmm10 + pand xmm12,xmm15 + por xmm4,xmm11 + pand xmm13,xmm15 + por xmm5,xmm12 + pand xmm14,xmm15 + por xmm6,xmm13 + por xmm7,xmm14 + + dec rax + jnz NEAR $L$select_loop_sse_w5 + + movdqu XMMWORD[rcx],xmm2 + movdqu XMMWORD[16+rcx],xmm3 + movdqu XMMWORD[32+rcx],xmm4 + movdqu XMMWORD[48+rcx],xmm5 + movdqu XMMWORD[64+rcx],xmm6 + movdqu XMMWORD[80+rcx],xmm7 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] +$L$SEH_end_ecp_nistz256_select_w5: + DB 0F3h,0C3h ;repret + + + + +global ecp_nistz256_select_w7 + +ALIGN 32 +ecp_nistz256_select_w7: + lea rax,[((-136))+rsp] +$L$SEH_begin_ecp_nistz256_select_w7: +DB 0x48,0x8d,0x60,0xe0 +DB 0x0f,0x29,0x70,0xe0 +DB 0x0f,0x29,0x78,0xf0 +DB 0x44,0x0f,0x29,0x00 +DB 0x44,0x0f,0x29,0x48,0x10 +DB 0x44,0x0f,0x29,0x50,0x20 +DB 0x44,0x0f,0x29,0x58,0x30 +DB 0x44,0x0f,0x29,0x60,0x40 +DB 0x44,0x0f,0x29,0x68,0x50 +DB 0x44,0x0f,0x29,0x70,0x60 +DB 0x44,0x0f,0x29,0x78,0x70 + movdqa xmm8,XMMWORD[$L$One] + movd xmm1,r8d + + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + + movdqa xmm0,xmm8 + pshufd xmm1,xmm1,0 + mov rax,64 + +$L$select_loop_sse_w7: + movdqa xmm15,xmm8 + paddd xmm8,xmm0 + movdqa xmm9,XMMWORD[rdx] + movdqa xmm10,XMMWORD[16+rdx] + pcmpeqd xmm15,xmm1 + movdqa xmm11,XMMWORD[32+rdx] + movdqa xmm12,XMMWORD[48+rdx] + lea rdx,[64+rdx] + + pand xmm9,xmm15 + pand xmm10,xmm15 + por xmm2,xmm9 + pand xmm11,xmm15 + por xmm3,xmm10 + pand xmm12,xmm15 + por xmm4,xmm11 + prefetcht0 [255+rdx] + por xmm5,xmm12 + + dec rax + jnz NEAR $L$select_loop_sse_w7 + + movdqu XMMWORD[rcx],xmm2 + movdqu XMMWORD[16+rcx],xmm3 + movdqu XMMWORD[32+rcx],xmm4 + movdqu XMMWORD[48+rcx],xmm5 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] +$L$SEH_end_ecp_nistz256_select_w7: + DB 0F3h,0C3h ;repret + +global ecp_nistz256_avx2_select_w7 + +ALIGN 32 +ecp_nistz256_avx2_select_w7: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_avx2_select_w7: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +DB 0x0f,0x0b + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_avx2_select_w7: + +ALIGN 32 +__ecp_nistz256_add_toq: + add r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + mov rax,r12 + adc r8,QWORD[16+rbx] + adc r9,QWORD[24+rbx] + mov rbp,r13 + sbb r11,r11 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + test r11,r11 + + cmovz r12,rax + cmovz r13,rbp + mov QWORD[rdi],r12 + cmovz r8,rcx + mov QWORD[8+rdi],r13 + cmovz r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__ecp_nistz256_sub_fromq: + sub r12,QWORD[rbx] + sbb r13,QWORD[8+rbx] + mov rax,r12 + sbb r8,QWORD[16+rbx] + sbb r9,QWORD[24+rbx] + mov rbp,r13 + sbb r11,r11 + + add r12,-1 + mov rcx,r8 + adc r13,r14 + adc r8,0 + mov r10,r9 + adc r9,r15 + test r11,r11 + + cmovz r12,rax + cmovz r13,rbp + mov QWORD[rdi],r12 + cmovz r8,rcx + mov QWORD[8+rdi],r13 + cmovz r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__ecp_nistz256_subq: + sub rax,r12 + sbb rbp,r13 + mov r12,rax + sbb rcx,r8 + sbb r10,r9 + mov r13,rbp + sbb r11,r11 + + add rax,-1 + mov r8,rcx + adc rbp,r14 + adc rcx,0 + mov r9,r10 + adc r10,r15 + test r11,r11 + + cmovnz r12,rax + cmovnz r13,rbp + cmovnz r8,rcx + cmovnz r9,r10 + + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__ecp_nistz256_mul_by_2q: + add r12,r12 + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + sbb r11,r11 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + test r11,r11 + + cmovz r12,rax + cmovz r13,rbp + mov QWORD[rdi],r12 + cmovz r8,rcx + mov QWORD[8+rdi],r13 + cmovz r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + +global ecp_nistz256_point_double + +ALIGN 32 +ecp_nistz256_point_double: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_double: + mov rdi,rcx + mov rsi,rdx + + + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*5+8 + + movdqu xmm0,XMMWORD[rsi] + mov rbx,rsi + movdqu xmm1,XMMWORD[16+rsi] + mov r12,QWORD[((32+0))+rsi] + mov r13,QWORD[((32+8))+rsi] + mov r8,QWORD[((32+16))+rsi] + mov r9,QWORD[((32+24))+rsi] + mov r14,QWORD[(($L$poly+8))] + mov r15,QWORD[(($L$poly+24))] + movdqa XMMWORD[96+rsp],xmm0 + movdqa XMMWORD[(96+16)+rsp],xmm1 + lea r10,[32+rdi] + lea r11,[64+rdi] +DB 102,72,15,110,199 +DB 102,73,15,110,202 +DB 102,73,15,110,211 + + lea rdi,[rsp] + call __ecp_nistz256_mul_by_2q + + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + lea rsi,[((64-0))+rsi] + lea rdi,[64+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[32+rbx] + mov r9,QWORD[((64+0))+rbx] + mov r10,QWORD[((64+8))+rbx] + mov r11,QWORD[((64+16))+rbx] + mov r12,QWORD[((64+24))+rbx] + lea rsi,[((64-0))+rbx] + lea rbx,[32+rbx] +DB 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_toq + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] +DB 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xor r9,r9 + mov rax,r12 + add r12,-1 + mov r10,r13 + adc r13,rsi + mov rcx,r14 + adc r14,0 + mov r8,r15 + adc r15,rbp + adc r9,0 + xor rsi,rsi + test rax,1 + + cmovz r12,rax + cmovz r13,r10 + cmovz r14,rcx + cmovz r15,r8 + cmovz r9,rsi + + mov rax,r13 + shr r12,1 + shl rax,63 + mov r10,r14 + shr r13,1 + or r12,rax + shl r10,63 + mov rcx,r15 + shr r14,1 + or r13,r10 + shl rcx,63 + mov QWORD[rdi],r12 + shr r15,1 + mov QWORD[8+rdi],r13 + shl r9,63 + or r14,rcx + or r15,r9 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + mov rax,QWORD[64+rsp] + lea rbx,[64+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2q + + lea rbx,[32+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_toq + + mov rax,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2q + + mov rax,QWORD[((0+32))+rsp] + mov r14,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r15,QWORD[((16+32))+rsp] + mov r8,QWORD[((24+32))+rsp] +DB 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + lea rbx,[128+rsp] + mov r8,r14 + mov r9,r15 + mov r14,rsi + mov r15,rbp + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_subq + + mov rax,QWORD[32+rsp] + lea rbx,[32+rsp] + mov r14,r12 + xor ecx,ecx + mov QWORD[((0+0))+rsp],r12 + mov r10,r13 + mov QWORD[((0+8))+rsp],r13 + cmovz r11,r8 + mov QWORD[((0+16))+rsp],r8 + lea rsi,[((0-0))+rsp] + cmovz r12,r9 + mov QWORD[((0+24))+rsp],r9 + mov r9,r14 + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + +DB 102,72,15,126,203 +DB 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + add rsp,32*5+8 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_point_double: +global ecp_nistz256_point_add + +ALIGN 32 +ecp_nistz256_point_add: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*18+8 + + movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rbx,rsi + mov rsi,rdx + movdqa XMMWORD[384+rsp],xmm0 + movdqa XMMWORD[(384+16)+rsp],xmm1 + por xmm1,xmm0 + movdqa XMMWORD[416+rsp],xmm2 + movdqa XMMWORD[(416+16)+rsp],xmm3 + por xmm3,xmm2 + movdqa XMMWORD[448+rsp],xmm4 + movdqa XMMWORD[(448+16)+rsp],xmm5 + por xmm3,xmm1 + + movdqu xmm0,XMMWORD[rsi] + pshufd xmm5,xmm3,0xb1 + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rsi] + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[480+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(480+16)+rsp],xmm1 + por xmm1,xmm0 +DB 102,72,15,110,199 + movdqa XMMWORD[512+rsp],xmm2 + movdqa XMMWORD[(512+16)+rsp],xmm3 + por xmm3,xmm2 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm3,xmm1 + + lea rsi,[((64-0))+rsi] + mov QWORD[((544+0))+rsp],rax + mov QWORD[((544+8))+rsp],r14 + mov QWORD[((544+16))+rsp],r15 + mov QWORD[((544+24))+rsp],r8 + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montq + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm3,0xb1 + por xmm4,xmm3 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + por xmm4,xmm3 + pxor xmm3,xmm3 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + mov rax,QWORD[((64+0))+rbx] + mov r14,QWORD[((64+8))+rbx] + mov r15,QWORD[((64+16))+rbx] + mov r8,QWORD[((64+24))+rbx] + + lea rsi,[((64-0))+rbx] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[416+rsp] + lea rbx,[416+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((0+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[512+rsp] + lea rbx,[512+rsp] + mov r9,QWORD[((0+256))+rsp] + mov r10,QWORD[((8+256))+rsp] + lea rsi,[((0+256))+rsp] + mov r11,QWORD[((16+256))+rsp] + mov r12,QWORD[((24+256))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[224+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + or r12,r13 + movdqa xmm2,xmm4 + or r12,r8 + or r12,r9 + por xmm2,xmm5 +DB 102,73,15,110,220 + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[480+rsp] + lea rbx,[480+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[160+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sub_fromq + + or r12,r13 + or r12,r8 + or r12,r9 + +DB 0x3e + jnz NEAR $L$add_proceedq +DB 102,73,15,126,208 +DB 102,73,15,126,217 + test r8,r8 + jnz NEAR $L$add_proceedq + test r9,r9 + jz NEAR $L$add_proceedq + +DB 102,72,15,126,199 + pxor xmm0,xmm0 + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm0 + movdqu XMMWORD[32+rdi],xmm0 + movdqu XMMWORD[48+rdi],xmm0 + movdqu XMMWORD[64+rdi],xmm0 + movdqu XMMWORD[80+rdi],xmm0 + jmp NEAR $L$add_doneq + +ALIGN 32 +$L$add_proceedq: + mov rax,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+352))+rsp] + mov r10,QWORD[((8+352))+rsp] + lea rsi,[((0+352))+rsp] + mov r11,QWORD[((16+352))+rsp] + mov r12,QWORD[((24+352))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[rsp] + lea rbx,[rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[160+rsp] + lea rbx,[160+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montq + + + + + add r12,r12 + lea rsi,[96+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + sbb r11,r11 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + test r11,r11 + + cmovz r12,rax + mov rax,QWORD[rsi] + cmovz r13,rbp + mov rbp,QWORD[8+rsi] + cmovz r8,rcx + mov rcx,QWORD[16+rsi] + cmovz r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subq + + lea rbx,[128+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((192+0))+rsp] + mov rbp,QWORD[((192+8))+rsp] + mov rcx,QWORD[((192+16))+rsp] + mov r10,QWORD[((192+24))+rsp] + lea rdi,[320+rsp] + + call __ecp_nistz256_subq + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rax,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((0+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[256+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_sub_fromq + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[352+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((352+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[544+rsp] + pand xmm3,XMMWORD[((544+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[480+rsp] + pand xmm3,XMMWORD[((480+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[320+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((320+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[512+rsp] + pand xmm3,XMMWORD[((512+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + +$L$add_doneq: + add rsp,32*18+8 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_point_add: +global ecp_nistz256_point_add_affine + +ALIGN 32 +ecp_nistz256_point_add_affine: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_affine: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*15+8 + + movdqu xmm0,XMMWORD[rsi] + mov rbx,rdx + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[320+rsp],xmm0 + movdqa XMMWORD[(320+16)+rsp],xmm1 + por xmm1,xmm0 + movdqa XMMWORD[352+rsp],xmm2 + movdqa XMMWORD[(352+16)+rsp],xmm3 + por xmm3,xmm2 + movdqa XMMWORD[384+rsp],xmm4 + movdqa XMMWORD[(384+16)+rsp],xmm5 + por xmm3,xmm1 + + movdqu xmm0,XMMWORD[rbx] + pshufd xmm5,xmm3,0xb1 + movdqu xmm1,XMMWORD[16+rbx] + movdqu xmm2,XMMWORD[32+rbx] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rbx] + movdqa XMMWORD[416+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(416+16)+rsp],xmm1 + por xmm1,xmm0 +DB 102,72,15,110,199 + movdqa XMMWORD[448+rsp],xmm2 + movdqa XMMWORD[(448+16)+rsp],xmm3 + por xmm3,xmm2 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm3,xmm1 + + lea rsi,[((64-0))+rsi] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm3,0xb1 + mov rax,QWORD[rbx] + + mov r9,r12 + por xmm4,xmm3 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + mov r10,r13 + por xmm4,xmm3 + pxor xmm3,xmm3 + mov r11,r14 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + + lea rsi,[((32-0))+rsp] + mov r12,r15 + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[320+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[352+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[((0+96))+rsp] + mov r14,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r15,QWORD[((16+96))+rsp] + mov r8,QWORD[((24+96))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+128))+rsp] + mov r10,QWORD[((8+128))+rsp] + lea rsi,[((0+128))+rsp] + mov r11,QWORD[((16+128))+rsp] + mov r12,QWORD[((24+128))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + + + + add r12,r12 + lea rsi,[192+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + sbb r11,r11 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + test r11,r11 + + cmovz r12,rax + mov rax,QWORD[rsi] + cmovz r13,rbp + mov rbp,QWORD[8+rsi] + cmovz r8,rcx + mov rcx,QWORD[16+rsi] + cmovz r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subq + + lea rbx,[160+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[64+rsp] + + call __ecp_nistz256_subq + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rax,QWORD[352+rsp] + lea rbx,[352+rsp] + mov r9,QWORD[((0+160))+rsp] + mov r10,QWORD[((8+160))+rsp] + lea rsi,[((0+160))+rsp] + mov r11,QWORD[((16+160))+rsp] + mov r12,QWORD[((24+160))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[32+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_sub_fromq + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[$L$ONE_mont] + pand xmm3,XMMWORD[(($L$ONE_mont+16))] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[224+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((224+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[320+rsp] + pand xmm3,XMMWORD[((320+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[256+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((256+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[352+rsp] + pand xmm3,XMMWORD[((352+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + + add rsp,32*15+8 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_point_add_affine: diff --git a/third_party/serf/win-x86_64/crypto/rand/rdrand-x86_64.asm b/third_party/serf/win-x86_64/crypto/rand/rdrand-x86_64.asm index a63ea69d4c..4c03791b48 100644 --- a/third_party/serf/win-x86_64/crypto/rand/rdrand-x86_64.asm +++ b/third_party/serf/win-x86_64/crypto/rand/rdrand-x86_64.asm @@ -5,6 +5,9 @@ default rel section .text code align=64 + + + global CRYPTO_rdrand ALIGN 16 @@ -16,7 +19,52 @@ $L$SEH_begin_CRYPTO_rdrand: mov rdi,rcx -DB 0x48,0x0f,0xc7,0xf0 + xor rax,rax + + +DB 0x48,0x0f,0xc7,0xf1 + + adc rax,rax + mov QWORD[rdi],rcx + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + + + + + +global CRYPTO_rdrand_multiple8_buf + +ALIGN 16 +CRYPTO_rdrand_multiple8_buf: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_CRYPTO_rdrand_multiple8_buf: + mov rdi,rcx + mov rsi,rdx + + + test rsi,rsi + jz NEAR $L$out + mov rdx,8 +$L$loop: + + +DB 0x48,0x0f,0xc7,0xf1 + jnc NEAR $L$err + mov QWORD[rdi],rcx + add rdi,rdx + sub rsi,rdx + jnz NEAR $L$loop +$L$out: + mov rax,1 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$err: + xor rax,rax mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret diff --git a/third_party/serf/win-x86_64/crypto/sha/sha1-x86_64.asm b/third_party/serf/win-x86_64/crypto/sha/sha1-x86_64.asm index 0f5361a1b5..168f78db3f 100644 --- a/third_party/serf/win-x86_64/crypto/sha/sha1-x86_64.asm +++ b/third_party/serf/win-x86_64/crypto/sha/sha1-x86_64.asm @@ -24,6 +24,11 @@ $L$SEH_begin_sha1_block_data_order: mov r10d,DWORD[((OPENSSL_ia32cap_P+8))] test r8d,512 jz NEAR $L$ialu + and r8d,268435456 + and r9d,1073741824 + or r8d,r9d + cmp r8d,1342177280 + je NEAR _avx_shortcut jmp NEAR _ssse3_shortcut ALIGN 16 @@ -2445,6 +2450,1146 @@ $L$epilogue_ssse3: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_sha1_block_data_order_ssse3: + +ALIGN 16 +sha1_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha1_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +_avx_shortcut: + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + lea rsp,[((-160))+rsp] + vzeroupper + vmovaps XMMWORD[(-40-96)+rax],xmm6 + vmovaps XMMWORD[(-40-80)+rax],xmm7 + vmovaps XMMWORD[(-40-64)+rax],xmm8 + vmovaps XMMWORD[(-40-48)+rax],xmm9 + vmovaps XMMWORD[(-40-32)+rax],xmm10 + vmovaps XMMWORD[(-40-16)+rax],xmm11 +$L$prologue_avx: + mov r14,rax + and rsp,-64 + mov r8,rdi + mov r9,rsi + mov r10,rdx + + shl r10,6 + add r10,r9 + lea r11,[((K_XX_XX+64))] + + mov eax,DWORD[r8] + mov ebx,DWORD[4+r8] + mov ecx,DWORD[8+r8] + mov edx,DWORD[12+r8] + mov esi,ebx + mov ebp,DWORD[16+r8] + mov edi,ecx + xor edi,edx + and esi,edi + + vmovdqa xmm6,XMMWORD[64+r11] + vmovdqa xmm11,XMMWORD[((-64))+r11] + vmovdqu xmm0,XMMWORD[r9] + vmovdqu xmm1,XMMWORD[16+r9] + vmovdqu xmm2,XMMWORD[32+r9] + vmovdqu xmm3,XMMWORD[48+r9] + vpshufb xmm0,xmm0,xmm6 + add r9,64 + vpshufb xmm1,xmm1,xmm6 + vpshufb xmm2,xmm2,xmm6 + vpshufb xmm3,xmm3,xmm6 + vpaddd xmm4,xmm0,xmm11 + vpaddd xmm5,xmm1,xmm11 + vpaddd xmm6,xmm2,xmm11 + vmovdqa XMMWORD[rsp],xmm4 + vmovdqa XMMWORD[16+rsp],xmm5 + vmovdqa XMMWORD[32+rsp],xmm6 + jmp NEAR $L$oop_avx +ALIGN 16 +$L$oop_avx: + shrd ebx,ebx,2 + xor esi,edx + vpalignr xmm4,xmm1,xmm0,8 + mov edi,eax + add ebp,DWORD[rsp] + vpaddd xmm9,xmm11,xmm3 + xor ebx,ecx + shld eax,eax,5 + vpsrldq xmm8,xmm3,4 + add ebp,esi + and edi,ebx + vpxor xmm4,xmm4,xmm0 + xor ebx,ecx + add ebp,eax + vpxor xmm8,xmm8,xmm2 + shrd eax,eax,7 + xor edi,ecx + mov esi,ebp + add edx,DWORD[4+rsp] + vpxor xmm4,xmm4,xmm8 + xor eax,ebx + shld ebp,ebp,5 + vmovdqa XMMWORD[48+rsp],xmm9 + add edx,edi + and esi,eax + vpsrld xmm8,xmm4,31 + xor eax,ebx + add edx,ebp + shrd ebp,ebp,7 + xor esi,ebx + vpslldq xmm10,xmm4,12 + vpaddd xmm4,xmm4,xmm4 + mov edi,edx + add ecx,DWORD[8+rsp] + xor ebp,eax + shld edx,edx,5 + vpsrld xmm9,xmm10,30 + vpor xmm4,xmm4,xmm8 + add ecx,esi + and edi,ebp + xor ebp,eax + add ecx,edx + vpslld xmm10,xmm10,2 + vpxor xmm4,xmm4,xmm9 + shrd edx,edx,7 + xor edi,eax + mov esi,ecx + add ebx,DWORD[12+rsp] + vpxor xmm4,xmm4,xmm10 + xor edx,ebp + shld ecx,ecx,5 + add ebx,edi + and esi,edx + xor edx,ebp + add ebx,ecx + shrd ecx,ecx,7 + xor esi,ebp + vpalignr xmm5,xmm2,xmm1,8 + mov edi,ebx + add eax,DWORD[16+rsp] + vpaddd xmm9,xmm11,xmm4 + xor ecx,edx + shld ebx,ebx,5 + vpsrldq xmm8,xmm4,4 + add eax,esi + and edi,ecx + vpxor xmm5,xmm5,xmm1 + xor ecx,edx + add eax,ebx + vpxor xmm8,xmm8,xmm3 + shrd ebx,ebx,7 + xor edi,edx + mov esi,eax + add ebp,DWORD[20+rsp] + vpxor xmm5,xmm5,xmm8 + xor ebx,ecx + shld eax,eax,5 + vmovdqa XMMWORD[rsp],xmm9 + add ebp,edi + and esi,ebx + vpsrld xmm8,xmm5,31 + xor ebx,ecx + add ebp,eax + shrd eax,eax,7 + xor esi,ecx + vpslldq xmm10,xmm5,12 + vpaddd xmm5,xmm5,xmm5 + mov edi,ebp + add edx,DWORD[24+rsp] + xor eax,ebx + shld ebp,ebp,5 + vpsrld xmm9,xmm10,30 + vpor xmm5,xmm5,xmm8 + add edx,esi + and edi,eax + xor eax,ebx + add edx,ebp + vpslld xmm10,xmm10,2 + vpxor xmm5,xmm5,xmm9 + shrd ebp,ebp,7 + xor edi,ebx + mov esi,edx + add ecx,DWORD[28+rsp] + vpxor xmm5,xmm5,xmm10 + xor ebp,eax + shld edx,edx,5 + vmovdqa xmm11,XMMWORD[((-32))+r11] + add ecx,edi + and esi,ebp + xor ebp,eax + add ecx,edx + shrd edx,edx,7 + xor esi,eax + vpalignr xmm6,xmm3,xmm2,8 + mov edi,ecx + add ebx,DWORD[32+rsp] + vpaddd xmm9,xmm11,xmm5 + xor edx,ebp + shld ecx,ecx,5 + vpsrldq xmm8,xmm5,4 + add ebx,esi + and edi,edx + vpxor xmm6,xmm6,xmm2 + xor edx,ebp + add ebx,ecx + vpxor xmm8,xmm8,xmm4 + shrd ecx,ecx,7 + xor edi,ebp + mov esi,ebx + add eax,DWORD[36+rsp] + vpxor xmm6,xmm6,xmm8 + xor ecx,edx + shld ebx,ebx,5 + vmovdqa XMMWORD[16+rsp],xmm9 + add eax,edi + and esi,ecx + vpsrld xmm8,xmm6,31 + xor ecx,edx + add eax,ebx + shrd ebx,ebx,7 + xor esi,edx + vpslldq xmm10,xmm6,12 + vpaddd xmm6,xmm6,xmm6 + mov edi,eax + add ebp,DWORD[40+rsp] + xor ebx,ecx + shld eax,eax,5 + vpsrld xmm9,xmm10,30 + vpor xmm6,xmm6,xmm8 + add ebp,esi + and edi,ebx + xor ebx,ecx + add ebp,eax + vpslld xmm10,xmm10,2 + vpxor xmm6,xmm6,xmm9 + shrd eax,eax,7 + xor edi,ecx + mov esi,ebp + add edx,DWORD[44+rsp] + vpxor xmm6,xmm6,xmm10 + xor eax,ebx + shld ebp,ebp,5 + add edx,edi + and esi,eax + xor eax,ebx + add edx,ebp + shrd ebp,ebp,7 + xor esi,ebx + vpalignr xmm7,xmm4,xmm3,8 + mov edi,edx + add ecx,DWORD[48+rsp] + vpaddd xmm9,xmm11,xmm6 + xor ebp,eax + shld edx,edx,5 + vpsrldq xmm8,xmm6,4 + add ecx,esi + and edi,ebp + vpxor xmm7,xmm7,xmm3 + xor ebp,eax + add ecx,edx + vpxor xmm8,xmm8,xmm5 + shrd edx,edx,7 + xor edi,eax + mov esi,ecx + add ebx,DWORD[52+rsp] + vpxor xmm7,xmm7,xmm8 + xor edx,ebp + shld ecx,ecx,5 + vmovdqa XMMWORD[32+rsp],xmm9 + add ebx,edi + and esi,edx + vpsrld xmm8,xmm7,31 + xor edx,ebp + add ebx,ecx + shrd ecx,ecx,7 + xor esi,ebp + vpslldq xmm10,xmm7,12 + vpaddd xmm7,xmm7,xmm7 + mov edi,ebx + add eax,DWORD[56+rsp] + xor ecx,edx + shld ebx,ebx,5 + vpsrld xmm9,xmm10,30 + vpor xmm7,xmm7,xmm8 + add eax,esi + and edi,ecx + xor ecx,edx + add eax,ebx + vpslld xmm10,xmm10,2 + vpxor xmm7,xmm7,xmm9 + shrd ebx,ebx,7 + xor edi,edx + mov esi,eax + add ebp,DWORD[60+rsp] + vpxor xmm7,xmm7,xmm10 + xor ebx,ecx + shld eax,eax,5 + add ebp,edi + and esi,ebx + xor ebx,ecx + add ebp,eax + vpalignr xmm8,xmm7,xmm6,8 + vpxor xmm0,xmm0,xmm4 + shrd eax,eax,7 + xor esi,ecx + mov edi,ebp + add edx,DWORD[rsp] + vpxor xmm0,xmm0,xmm1 + xor eax,ebx + shld ebp,ebp,5 + vpaddd xmm9,xmm11,xmm7 + add edx,esi + and edi,eax + vpxor xmm0,xmm0,xmm8 + xor eax,ebx + add edx,ebp + shrd ebp,ebp,7 + xor edi,ebx + vpsrld xmm8,xmm0,30 + vmovdqa XMMWORD[48+rsp],xmm9 + mov esi,edx + add ecx,DWORD[4+rsp] + xor ebp,eax + shld edx,edx,5 + vpslld xmm0,xmm0,2 + add ecx,edi + and esi,ebp + xor ebp,eax + add ecx,edx + shrd edx,edx,7 + xor esi,eax + mov edi,ecx + add ebx,DWORD[8+rsp] + vpor xmm0,xmm0,xmm8 + xor edx,ebp + shld ecx,ecx,5 + add ebx,esi + and edi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[12+rsp] + xor edi,ebp + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + vpalignr xmm8,xmm0,xmm7,8 + vpxor xmm1,xmm1,xmm5 + add ebp,DWORD[16+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + vpxor xmm1,xmm1,xmm2 + add ebp,esi + xor edi,ecx + vpaddd xmm9,xmm11,xmm0 + shrd ebx,ebx,7 + add ebp,eax + vpxor xmm1,xmm1,xmm8 + add edx,DWORD[20+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + vpsrld xmm8,xmm1,30 + vmovdqa XMMWORD[rsp],xmm9 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + vpslld xmm1,xmm1,2 + add ecx,DWORD[24+rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + vpor xmm1,xmm1,xmm8 + add ebx,DWORD[28+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + vpalignr xmm8,xmm1,xmm0,8 + vpxor xmm2,xmm2,xmm6 + add eax,DWORD[32+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + vpxor xmm2,xmm2,xmm3 + add eax,esi + xor edi,edx + vpaddd xmm9,xmm11,xmm1 + vmovdqa xmm11,XMMWORD[r11] + shrd ecx,ecx,7 + add eax,ebx + vpxor xmm2,xmm2,xmm8 + add ebp,DWORD[36+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + vpsrld xmm8,xmm2,30 + vmovdqa XMMWORD[16+rsp],xmm9 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + vpslld xmm2,xmm2,2 + add edx,DWORD[40+rsp] + xor esi,ebx + mov edi,ebp + shld ebp,ebp,5 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + vpor xmm2,xmm2,xmm8 + add ecx,DWORD[44+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + vpalignr xmm8,xmm2,xmm1,8 + vpxor xmm3,xmm3,xmm7 + add ebx,DWORD[48+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + vpxor xmm3,xmm3,xmm4 + add ebx,esi + xor edi,ebp + vpaddd xmm9,xmm11,xmm2 + shrd edx,edx,7 + add ebx,ecx + vpxor xmm3,xmm3,xmm8 + add eax,DWORD[52+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + vpsrld xmm8,xmm3,30 + vmovdqa XMMWORD[32+rsp],xmm9 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + vpslld xmm3,xmm3,2 + add ebp,DWORD[56+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + add ebp,esi + xor edi,ecx + shrd ebx,ebx,7 + add ebp,eax + vpor xmm3,xmm3,xmm8 + add edx,DWORD[60+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + vpalignr xmm8,xmm3,xmm2,8 + vpxor xmm4,xmm4,xmm0 + add ecx,DWORD[rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + vpxor xmm4,xmm4,xmm5 + add ecx,esi + xor edi,eax + vpaddd xmm9,xmm11,xmm3 + shrd ebp,ebp,7 + add ecx,edx + vpxor xmm4,xmm4,xmm8 + add ebx,DWORD[4+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + vpsrld xmm8,xmm4,30 + vmovdqa XMMWORD[48+rsp],xmm9 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + vpslld xmm4,xmm4,2 + add eax,DWORD[8+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + vpor xmm4,xmm4,xmm8 + add ebp,DWORD[12+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + vpalignr xmm8,xmm4,xmm3,8 + vpxor xmm5,xmm5,xmm1 + add edx,DWORD[16+rsp] + xor esi,ebx + mov edi,ebp + shld ebp,ebp,5 + vpxor xmm5,xmm5,xmm6 + add edx,esi + xor edi,ebx + vpaddd xmm9,xmm11,xmm4 + shrd eax,eax,7 + add edx,ebp + vpxor xmm5,xmm5,xmm8 + add ecx,DWORD[20+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + vpsrld xmm8,xmm5,30 + vmovdqa XMMWORD[rsp],xmm9 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + vpslld xmm5,xmm5,2 + add ebx,DWORD[24+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + vpor xmm5,xmm5,xmm8 + add eax,DWORD[28+rsp] + shrd ecx,ecx,7 + mov esi,ebx + xor edi,edx + shld ebx,ebx,5 + add eax,edi + xor esi,ecx + xor ecx,edx + add eax,ebx + vpalignr xmm8,xmm5,xmm4,8 + vpxor xmm6,xmm6,xmm2 + add ebp,DWORD[32+rsp] + and esi,ecx + xor ecx,edx + shrd ebx,ebx,7 + vpxor xmm6,xmm6,xmm7 + mov edi,eax + xor esi,ecx + vpaddd xmm9,xmm11,xmm5 + shld eax,eax,5 + add ebp,esi + vpxor xmm6,xmm6,xmm8 + xor edi,ebx + xor ebx,ecx + add ebp,eax + add edx,DWORD[36+rsp] + vpsrld xmm8,xmm6,30 + vmovdqa XMMWORD[16+rsp],xmm9 + and edi,ebx + xor ebx,ecx + shrd eax,eax,7 + mov esi,ebp + vpslld xmm6,xmm6,2 + xor edi,ebx + shld ebp,ebp,5 + add edx,edi + xor esi,eax + xor eax,ebx + add edx,ebp + add ecx,DWORD[40+rsp] + and esi,eax + vpor xmm6,xmm6,xmm8 + xor eax,ebx + shrd ebp,ebp,7 + mov edi,edx + xor esi,eax + shld edx,edx,5 + add ecx,esi + xor edi,ebp + xor ebp,eax + add ecx,edx + add ebx,DWORD[44+rsp] + and edi,ebp + xor ebp,eax + shrd edx,edx,7 + mov esi,ecx + xor edi,ebp + shld ecx,ecx,5 + add ebx,edi + xor esi,edx + xor edx,ebp + add ebx,ecx + vpalignr xmm8,xmm6,xmm5,8 + vpxor xmm7,xmm7,xmm3 + add eax,DWORD[48+rsp] + and esi,edx + xor edx,ebp + shrd ecx,ecx,7 + vpxor xmm7,xmm7,xmm0 + mov edi,ebx + xor esi,edx + vpaddd xmm9,xmm11,xmm6 + vmovdqa xmm11,XMMWORD[32+r11] + shld ebx,ebx,5 + add eax,esi + vpxor xmm7,xmm7,xmm8 + xor edi,ecx + xor ecx,edx + add eax,ebx + add ebp,DWORD[52+rsp] + vpsrld xmm8,xmm7,30 + vmovdqa XMMWORD[32+rsp],xmm9 + and edi,ecx + xor ecx,edx + shrd ebx,ebx,7 + mov esi,eax + vpslld xmm7,xmm7,2 + xor edi,ecx + shld eax,eax,5 + add ebp,edi + xor esi,ebx + xor ebx,ecx + add ebp,eax + add edx,DWORD[56+rsp] + and esi,ebx + vpor xmm7,xmm7,xmm8 + xor ebx,ecx + shrd eax,eax,7 + mov edi,ebp + xor esi,ebx + shld ebp,ebp,5 + add edx,esi + xor edi,eax + xor eax,ebx + add edx,ebp + add ecx,DWORD[60+rsp] + and edi,eax + xor eax,ebx + shrd ebp,ebp,7 + mov esi,edx + xor edi,eax + shld edx,edx,5 + add ecx,edi + xor esi,ebp + xor ebp,eax + add ecx,edx + vpalignr xmm8,xmm7,xmm6,8 + vpxor xmm0,xmm0,xmm4 + add ebx,DWORD[rsp] + and esi,ebp + xor ebp,eax + shrd edx,edx,7 + vpxor xmm0,xmm0,xmm1 + mov edi,ecx + xor esi,ebp + vpaddd xmm9,xmm11,xmm7 + shld ecx,ecx,5 + add ebx,esi + vpxor xmm0,xmm0,xmm8 + xor edi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[4+rsp] + vpsrld xmm8,xmm0,30 + vmovdqa XMMWORD[48+rsp],xmm9 + and edi,edx + xor edx,ebp + shrd ecx,ecx,7 + mov esi,ebx + vpslld xmm0,xmm0,2 + xor edi,edx + shld ebx,ebx,5 + add eax,edi + xor esi,ecx + xor ecx,edx + add eax,ebx + add ebp,DWORD[8+rsp] + and esi,ecx + vpor xmm0,xmm0,xmm8 + xor ecx,edx + shrd ebx,ebx,7 + mov edi,eax + xor esi,ecx + shld eax,eax,5 + add ebp,esi + xor edi,ebx + xor ebx,ecx + add ebp,eax + add edx,DWORD[12+rsp] + and edi,ebx + xor ebx,ecx + shrd eax,eax,7 + mov esi,ebp + xor edi,ebx + shld ebp,ebp,5 + add edx,edi + xor esi,eax + xor eax,ebx + add edx,ebp + vpalignr xmm8,xmm0,xmm7,8 + vpxor xmm1,xmm1,xmm5 + add ecx,DWORD[16+rsp] + and esi,eax + xor eax,ebx + shrd ebp,ebp,7 + vpxor xmm1,xmm1,xmm2 + mov edi,edx + xor esi,eax + vpaddd xmm9,xmm11,xmm0 + shld edx,edx,5 + add ecx,esi + vpxor xmm1,xmm1,xmm8 + xor edi,ebp + xor ebp,eax + add ecx,edx + add ebx,DWORD[20+rsp] + vpsrld xmm8,xmm1,30 + vmovdqa XMMWORD[rsp],xmm9 + and edi,ebp + xor ebp,eax + shrd edx,edx,7 + mov esi,ecx + vpslld xmm1,xmm1,2 + xor edi,ebp + shld ecx,ecx,5 + add ebx,edi + xor esi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[24+rsp] + and esi,edx + vpor xmm1,xmm1,xmm8 + xor edx,ebp + shrd ecx,ecx,7 + mov edi,ebx + xor esi,edx + shld ebx,ebx,5 + add eax,esi + xor edi,ecx + xor ecx,edx + add eax,ebx + add ebp,DWORD[28+rsp] + and edi,ecx + xor ecx,edx + shrd ebx,ebx,7 + mov esi,eax + xor edi,ecx + shld eax,eax,5 + add ebp,edi + xor esi,ebx + xor ebx,ecx + add ebp,eax + vpalignr xmm8,xmm1,xmm0,8 + vpxor xmm2,xmm2,xmm6 + add edx,DWORD[32+rsp] + and esi,ebx + xor ebx,ecx + shrd eax,eax,7 + vpxor xmm2,xmm2,xmm3 + mov edi,ebp + xor esi,ebx + vpaddd xmm9,xmm11,xmm1 + shld ebp,ebp,5 + add edx,esi + vpxor xmm2,xmm2,xmm8 + xor edi,eax + xor eax,ebx + add edx,ebp + add ecx,DWORD[36+rsp] + vpsrld xmm8,xmm2,30 + vmovdqa XMMWORD[16+rsp],xmm9 + and edi,eax + xor eax,ebx + shrd ebp,ebp,7 + mov esi,edx + vpslld xmm2,xmm2,2 + xor edi,eax + shld edx,edx,5 + add ecx,edi + xor esi,ebp + xor ebp,eax + add ecx,edx + add ebx,DWORD[40+rsp] + and esi,ebp + vpor xmm2,xmm2,xmm8 + xor ebp,eax + shrd edx,edx,7 + mov edi,ecx + xor esi,ebp + shld ecx,ecx,5 + add ebx,esi + xor edi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[44+rsp] + and edi,edx + xor edx,ebp + shrd ecx,ecx,7 + mov esi,ebx + xor edi,edx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + add eax,ebx + vpalignr xmm8,xmm2,xmm1,8 + vpxor xmm3,xmm3,xmm7 + add ebp,DWORD[48+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + vpxor xmm3,xmm3,xmm4 + add ebp,esi + xor edi,ecx + vpaddd xmm9,xmm11,xmm2 + shrd ebx,ebx,7 + add ebp,eax + vpxor xmm3,xmm3,xmm8 + add edx,DWORD[52+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + vpsrld xmm8,xmm3,30 + vmovdqa XMMWORD[32+rsp],xmm9 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + vpslld xmm3,xmm3,2 + add ecx,DWORD[56+rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + vpor xmm3,xmm3,xmm8 + add ebx,DWORD[60+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[rsp] + vpaddd xmm9,xmm11,xmm3 + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + vmovdqa XMMWORD[48+rsp],xmm9 + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[4+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[8+rsp] + xor esi,ebx + mov edi,ebp + shld ebp,ebp,5 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[12+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + cmp r9,r10 + je NEAR $L$done_avx + vmovdqa xmm6,XMMWORD[64+r11] + vmovdqa xmm11,XMMWORD[((-64))+r11] + vmovdqu xmm0,XMMWORD[r9] + vmovdqu xmm1,XMMWORD[16+r9] + vmovdqu xmm2,XMMWORD[32+r9] + vmovdqu xmm3,XMMWORD[48+r9] + vpshufb xmm0,xmm0,xmm6 + add r9,64 + add ebx,DWORD[16+rsp] + xor esi,ebp + vpshufb xmm1,xmm1,xmm6 + mov edi,ecx + shld ecx,ecx,5 + vpaddd xmm4,xmm0,xmm11 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + vmovdqa XMMWORD[rsp],xmm4 + add eax,DWORD[20+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[24+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + add ebp,esi + xor edi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[28+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[32+rsp] + xor esi,eax + vpshufb xmm2,xmm2,xmm6 + mov edi,edx + shld edx,edx,5 + vpaddd xmm5,xmm1,xmm11 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + vmovdqa XMMWORD[16+rsp],xmm5 + add ebx,DWORD[36+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[40+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[44+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[48+rsp] + xor esi,ebx + vpshufb xmm3,xmm3,xmm6 + mov edi,ebp + shld ebp,ebp,5 + vpaddd xmm6,xmm2,xmm11 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + vmovdqa XMMWORD[32+rsp],xmm6 + add ecx,DWORD[52+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + add ebx,DWORD[56+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[60+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + shrd ecx,ecx,7 + add eax,ebx + add eax,DWORD[r8] + add esi,DWORD[4+r8] + add ecx,DWORD[8+r8] + add edx,DWORD[12+r8] + mov DWORD[r8],eax + add ebp,DWORD[16+r8] + mov DWORD[4+r8],esi + mov ebx,esi + mov DWORD[8+r8],ecx + mov edi,ecx + mov DWORD[12+r8],edx + xor edi,edx + mov DWORD[16+r8],ebp + and esi,edi + jmp NEAR $L$oop_avx + +ALIGN 16 +$L$done_avx: + add ebx,DWORD[16+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[20+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[24+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + add ebp,esi + xor edi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[28+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[32+rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + add ebx,DWORD[36+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[40+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[44+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[48+rsp] + xor esi,ebx + mov edi,ebp + shld ebp,ebp,5 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[52+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + add ebx,DWORD[56+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[60+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + shrd ecx,ecx,7 + add eax,ebx + vzeroupper + + add eax,DWORD[r8] + add esi,DWORD[4+r8] + add ecx,DWORD[8+r8] + mov DWORD[r8],eax + add edx,DWORD[12+r8] + mov DWORD[4+r8],esi + add ebp,DWORD[16+r8] + mov DWORD[8+r8],ecx + mov DWORD[12+r8],edx + mov DWORD[16+r8],ebp + movaps xmm6,XMMWORD[((-40-96))+r14] + movaps xmm7,XMMWORD[((-40-80))+r14] + movaps xmm8,XMMWORD[((-40-64))+r14] + movaps xmm9,XMMWORD[((-40-48))+r14] + movaps xmm10,XMMWORD[((-40-32))+r14] + movaps xmm11,XMMWORD[((-40-16))+r14] + lea rsi,[r14] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_sha1_block_data_order_avx: ALIGN 64 K_XX_XX: DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -2605,6 +3750,9 @@ ALIGN 4 DD $L$SEH_begin_sha1_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_end_sha1_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_info_sha1_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_begin_sha1_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_sha1_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_sha1_block_data_order_avx wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_sha1_block_data_order: @@ -2614,3 +3762,7 @@ $L$SEH_info_sha1_block_data_order_ssse3: DB 9,0,0,0 DD ssse3_handler wrt ..imagebase DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase +$L$SEH_info_sha1_block_data_order_avx: +DB 9,0,0,0 + DD ssse3_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase diff --git a/third_party/serf/win-x86_64/crypto/sha/sha256-x86_64.asm b/third_party/serf/win-x86_64/crypto/sha/sha256-x86_64.asm index e6193c5b9d..efaf9b55fc 100644 --- a/third_party/serf/win-x86_64/crypto/sha/sha256-x86_64.asm +++ b/third_party/serf/win-x86_64/crypto/sha/sha256-x86_64.asm @@ -23,6 +23,11 @@ $L$SEH_begin_sha256_block_data_order: mov r9d,DWORD[r11] mov r10d,DWORD[4+r11] mov r11d,DWORD[8+r11] + and r9d,1073741824 + and r10d,268435968 + or r10d,r9d + cmp r10d,1342177792 + je NEAR $L$avx_shortcut test r10d,512 jnz NEAR $L$ssse3_shortcut push rbx @@ -2877,6 +2882,1082 @@ $L$epilogue_ssse3: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_sha256_block_data_order_ssse3: + +ALIGN 64 +sha256_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha256_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +$L$avx_shortcut: + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + mov r11,rsp + shl rdx,4 + sub rsp,160 + lea rdx,[rdx*4+rsi] + and rsp,-64 + mov QWORD[((64+0))+rsp],rdi + mov QWORD[((64+8))+rsp],rsi + mov QWORD[((64+16))+rsp],rdx + mov QWORD[((64+24))+rsp],r11 + movaps XMMWORD[(64+32)+rsp],xmm6 + movaps XMMWORD[(64+48)+rsp],xmm7 + movaps XMMWORD[(64+64)+rsp],xmm8 + movaps XMMWORD[(64+80)+rsp],xmm9 +$L$prologue_avx: + + vzeroupper + mov eax,DWORD[rdi] + mov ebx,DWORD[4+rdi] + mov ecx,DWORD[8+rdi] + mov edx,DWORD[12+rdi] + mov r8d,DWORD[16+rdi] + mov r9d,DWORD[20+rdi] + mov r10d,DWORD[24+rdi] + mov r11d,DWORD[28+rdi] + vmovdqa xmm8,XMMWORD[((K256+512+32))] + vmovdqa xmm9,XMMWORD[((K256+512+64))] + jmp NEAR $L$loop_avx +ALIGN 16 +$L$loop_avx: + vmovdqa xmm7,XMMWORD[((K256+512))] + vmovdqu xmm0,XMMWORD[rsi] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm0,xmm0,xmm7 + lea rbp,[K256] + vpshufb xmm1,xmm1,xmm7 + vpshufb xmm2,xmm2,xmm7 + vpaddd xmm4,xmm0,XMMWORD[rbp] + vpshufb xmm3,xmm3,xmm7 + vpaddd xmm5,xmm1,XMMWORD[32+rbp] + vpaddd xmm6,xmm2,XMMWORD[64+rbp] + vpaddd xmm7,xmm3,XMMWORD[96+rbp] + vmovdqa XMMWORD[rsp],xmm4 + mov r14d,eax + vmovdqa XMMWORD[16+rsp],xmm5 + mov edi,ebx + vmovdqa XMMWORD[32+rsp],xmm6 + xor edi,ecx + vmovdqa XMMWORD[48+rsp],xmm7 + mov r13d,r8d + jmp NEAR $L$avx_00_47 + +ALIGN 16 +$L$avx_00_47: + sub rbp,-128 + vpalignr xmm4,xmm1,xmm0,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm3,xmm2,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + vpaddd xmm0,xmm0,xmm7 + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm3,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm0,xmm0,xmm4 + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm0,xmm0,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + vpshufd xmm7,xmm0,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm0,xmm0,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd xmm6,xmm0,XMMWORD[rbp] + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + vmovdqa XMMWORD[rsp],xmm6 + vpalignr xmm4,xmm2,xmm1,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm0,xmm3,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm1,xmm1,xmm7 + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm0,250 + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm1,xmm1,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + vpaddd xmm1,xmm1,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + vpshufd xmm7,xmm1,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm1,xmm1,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm1,XMMWORD[32+rbp] + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[16+rsp],xmm6 + vpalignr xmm4,xmm3,xmm2,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm1,xmm0,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + vpaddd xmm2,xmm2,xmm7 + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm1,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm2,xmm2,xmm4 + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm2,xmm2,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + vpshufd xmm7,xmm2,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm2,xmm2,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd xmm6,xmm2,XMMWORD[64+rbp] + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + vmovdqa XMMWORD[32+rsp],xmm6 + vpalignr xmm4,xmm0,xmm3,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm2,xmm1,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm3,xmm3,xmm7 + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm2,250 + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm3,xmm3,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + vpaddd xmm3,xmm3,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + vpshufd xmm7,xmm3,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm3,xmm3,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm3,XMMWORD[96+rbp] + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[48+rsp],xmm6 + cmp BYTE[131+rbp],0 + jne NEAR $L$avx_00_47 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD[((64+0))+rsp] + mov eax,r14d + + add eax,DWORD[rdi] + lea rsi,[64+rsi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + jb NEAR $L$loop_avx + + mov rsi,QWORD[((64+24))+rsp] + vzeroupper + movaps xmm6,XMMWORD[((64+32))+rsp] + movaps xmm7,XMMWORD[((64+48))+rsp] + movaps xmm8,XMMWORD[((64+64))+rsp] + movaps xmm9,XMMWORD[((64+80))+rsp] + mov r15,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r13,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + mov rbp,QWORD[32+rsi] + mov rbx,QWORD[40+rsi] + lea rsp,[48+rsi] +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_sha256_block_data_order_avx: EXTERN __imp_RtlVirtualUnwind ALIGN 16 @@ -2982,6 +4063,9 @@ ALIGN 4 DD $L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_begin_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_avx wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_sha256_block_data_order: @@ -2992,3 +4076,7 @@ $L$SEH_info_sha256_block_data_order_ssse3: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase +$L$SEH_info_sha256_block_data_order_avx: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase diff --git a/third_party/serf/win-x86_64/crypto/sha/sha512-x86_64.asm b/third_party/serf/win-x86_64/crypto/sha/sha512-x86_64.asm index b76cc0edb9..71449cd24f 100644 --- a/third_party/serf/win-x86_64/crypto/sha/sha512-x86_64.asm +++ b/third_party/serf/win-x86_64/crypto/sha/sha512-x86_64.asm @@ -19,6 +19,17 @@ $L$SEH_begin_sha512_block_data_order: mov rdx,r8 + lea r11,[OPENSSL_ia32cap_P] + mov r9d,DWORD[r11] + mov r10d,DWORD[4+r11] + mov r11d,DWORD[8+r11] + test r10d,2048 + jnz NEAR $L$xop_shortcut + and r9d,1073741824 + and r10d,268435968 + or r10d,r9d + cmp r10d,1342177792 + je NEAR $L$avx_shortcut push rbx push rbp push r12 @@ -1801,111 +1812,2401 @@ DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 DB 111,114,103,62,0 -EXTERN __imp_RtlVirtualUnwind -ALIGN 16 -se_handler: - push rsi - push rdi +ALIGN 64 +sha512_block_data_order_xop: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha512_block_data_order_xop: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +$L$xop_shortcut: push rbx push rbp push r12 push r13 push r14 push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - mov rsi,QWORD[8+r9] - mov r11,QWORD[56+r9] - - mov r10d,DWORD[r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jb NEAR $L$in_prologue - - mov rax,QWORD[152+r8] - - mov r10d,DWORD[4+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$in_prologue - mov rsi,rax - mov rax,QWORD[((128+24))+rax] - lea rax,[48+rax] - - mov rbx,QWORD[((-8))+rax] - mov rbp,QWORD[((-16))+rax] - mov r12,QWORD[((-24))+rax] - mov r13,QWORD[((-32))+rax] - mov r14,QWORD[((-40))+rax] - mov r15,QWORD[((-48))+rax] - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - mov QWORD[240+r8],r15 - - lea r10,[$L$epilogue] - cmp rbx,r10 - jb NEAR $L$in_prologue - - lea rsi,[((128+32))+rsi] - lea rdi,[512+r8] - mov ecx,12 - DD 0xa548f3fc - -$L$in_prologue: - mov rdi,QWORD[8+rax] - mov rsi,QWORD[16+rax] - mov QWORD[152+r8],rax - mov QWORD[168+r8],rsi - mov QWORD[176+r8],rdi - - mov rdi,QWORD[40+r9] - mov rsi,r8 - mov ecx,154 - DD 0xa548f3fc - - mov rsi,r9 - xor rcx,rcx - mov rdx,QWORD[8+rsi] - mov r8,QWORD[rsi] - mov r9,QWORD[16+rsi] - mov r10,QWORD[40+rsi] - lea r11,[56+rsi] - lea r12,[24+rsi] - mov QWORD[32+rsp],r10 - mov QWORD[40+rsp],r11 - mov QWORD[48+rsp],r12 - mov QWORD[56+rsp],rcx - call QWORD[__imp_RtlVirtualUnwind] - - mov eax,1 - add rsp,64 - popfq - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - pop rdi - pop rsi - DB 0F3h,0C3h ;repret + mov r11,rsp + shl rdx,4 + sub rsp,256 + lea rdx,[rdx*8+rsi] + and rsp,-64 + mov QWORD[((128+0))+rsp],rdi + mov QWORD[((128+8))+rsp],rsi + mov QWORD[((128+16))+rsp],rdx + mov QWORD[((128+24))+rsp],r11 + movaps XMMWORD[(128+32)+rsp],xmm6 + movaps XMMWORD[(128+48)+rsp],xmm7 + movaps XMMWORD[(128+64)+rsp],xmm8 + movaps XMMWORD[(128+80)+rsp],xmm9 + movaps XMMWORD[(128+96)+rsp],xmm10 + movaps XMMWORD[(128+112)+rsp],xmm11 +$L$prologue_xop: + + vzeroupper + mov rax,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov rcx,QWORD[16+rdi] + mov rdx,QWORD[24+rdi] + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + mov r11,QWORD[56+rdi] + jmp NEAR $L$loop_xop +ALIGN 16 +$L$loop_xop: + vmovdqa xmm11,XMMWORD[((K512+1280))] + vmovdqu xmm0,XMMWORD[rsi] + lea rbp,[((K512+128))] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vpshufb xmm0,xmm0,xmm11 + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm1,xmm1,xmm11 + vmovdqu xmm4,XMMWORD[64+rsi] + vpshufb xmm2,xmm2,xmm11 + vmovdqu xmm5,XMMWORD[80+rsi] + vpshufb xmm3,xmm3,xmm11 + vmovdqu xmm6,XMMWORD[96+rsi] + vpshufb xmm4,xmm4,xmm11 + vmovdqu xmm7,XMMWORD[112+rsi] + vpshufb xmm5,xmm5,xmm11 + vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] + vpshufb xmm6,xmm6,xmm11 + vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] + vpshufb xmm7,xmm7,xmm11 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] + vmovdqa XMMWORD[rsp],xmm8 + vpaddq xmm8,xmm4,XMMWORD[rbp] + vmovdqa XMMWORD[16+rsp],xmm9 + vpaddq xmm9,xmm5,XMMWORD[32+rbp] + vmovdqa XMMWORD[32+rsp],xmm10 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + vmovdqa XMMWORD[48+rsp],xmm11 + vpaddq xmm11,xmm7,XMMWORD[96+rbp] + vmovdqa XMMWORD[64+rsp],xmm8 + mov r14,rax + vmovdqa XMMWORD[80+rsp],xmm9 + mov rdi,rbx + vmovdqa XMMWORD[96+rsp],xmm10 + xor rdi,rcx + vmovdqa XMMWORD[112+rsp],xmm11 + mov r13,r8 + jmp NEAR $L$xop_00_47 -section .pdata rdata align=4 -ALIGN 4 - DD $L$SEH_begin_sha512_block_data_order wrt ..imagebase - DD $L$SEH_end_sha512_block_data_order wrt ..imagebase - DD $L$SEH_info_sha512_block_data_order wrt ..imagebase -section .xdata rdata align=8 -ALIGN 8 -$L$SEH_info_sha512_block_data_order: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase +ALIGN 16 +$L$xop_00_47: + add rbp,256 + vpalignr xmm8,xmm1,xmm0,8 + ror r13,23 + mov rax,r14 + vpalignr xmm11,xmm5,xmm4,8 + mov r12,r9 + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,r8 + xor r12,r10 + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,rax + vpaddq xmm0,xmm0,xmm11 + and r12,r8 + xor r13,r8 + add r11,QWORD[rsp] + mov r15,rax +DB 143,72,120,195,209,7 + xor r12,r10 + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,rbx + add r11,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,223,3 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + ror r14,28 + vpsrlq xmm10,xmm7,6 + add rdx,r11 + add r11,rdi + vpaddq xmm0,xmm0,xmm8 + mov r13,rdx + add r14,r11 +DB 143,72,120,195,203,42 + ror r13,23 + mov r11,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,r8 + ror r14,5 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + vpaddq xmm0,xmm0,xmm11 + add r10,QWORD[8+rsp] + mov rdi,r11 + xor r12,r9 + ror r14,6 + vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] + xor rdi,rax + add r10,r12 + ror r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + ror r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[rsp],xmm10 + vpalignr xmm8,xmm2,xmm1,8 + ror r13,23 + mov r10,r14 + vpalignr xmm11,xmm6,xmm5,8 + mov r12,rdx + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,rcx + xor r12,r8 + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,r10 + vpaddq xmm1,xmm1,xmm11 + and r12,rcx + xor r13,rcx + add r9,QWORD[16+rsp] + mov r15,r10 +DB 143,72,120,195,209,7 + xor r12,r8 + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,r11 + add r9,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,216,3 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + ror r14,28 + vpsrlq xmm10,xmm0,6 + add rbx,r9 + add r9,rdi + vpaddq xmm1,xmm1,xmm8 + mov r13,rbx + add r14,r9 +DB 143,72,120,195,203,42 + ror r13,23 + mov r9,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,rcx + ror r14,5 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + vpaddq xmm1,xmm1,xmm11 + add r8,QWORD[24+rsp] + mov rdi,r9 + xor r12,rdx + ror r14,6 + vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp] + xor rdi,r10 + add r8,r12 + ror r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + ror r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[16+rsp],xmm10 + vpalignr xmm8,xmm3,xmm2,8 + ror r13,23 + mov r8,r14 + vpalignr xmm11,xmm7,xmm6,8 + mov r12,rbx + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,rax + xor r12,rcx + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,r8 + vpaddq xmm2,xmm2,xmm11 + and r12,rax + xor r13,rax + add rdx,QWORD[32+rsp] + mov r15,r8 +DB 143,72,120,195,209,7 + xor r12,rcx + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,r9 + add rdx,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,217,3 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + ror r14,28 + vpsrlq xmm10,xmm1,6 + add r11,rdx + add rdx,rdi + vpaddq xmm2,xmm2,xmm8 + mov r13,r11 + add r14,rdx +DB 143,72,120,195,203,42 + ror r13,23 + mov rdx,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,rax + ror r14,5 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + vpaddq xmm2,xmm2,xmm11 + add rcx,QWORD[40+rsp] + mov rdi,rdx + xor r12,rbx + ror r14,6 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + xor rdi,r8 + add rcx,r12 + ror r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + ror r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[32+rsp],xmm10 + vpalignr xmm8,xmm4,xmm3,8 + ror r13,23 + mov rcx,r14 + vpalignr xmm11,xmm0,xmm7,8 + mov r12,r11 + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,r10 + xor r12,rax + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,rcx + vpaddq xmm3,xmm3,xmm11 + and r12,r10 + xor r13,r10 + add rbx,QWORD[48+rsp] + mov r15,rcx +DB 143,72,120,195,209,7 + xor r12,rax + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,rdx + add rbx,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,218,3 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + ror r14,28 + vpsrlq xmm10,xmm2,6 + add r9,rbx + add rbx,rdi + vpaddq xmm3,xmm3,xmm8 + mov r13,r9 + add r14,rbx +DB 143,72,120,195,203,42 + ror r13,23 + mov rbx,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,r10 + ror r14,5 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + vpaddq xmm3,xmm3,xmm11 + add rax,QWORD[56+rsp] + mov rdi,rbx + xor r12,r11 + ror r14,6 + vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp] + xor rdi,rcx + add rax,r12 + ror r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + ror r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[48+rsp],xmm10 + vpalignr xmm8,xmm5,xmm4,8 + ror r13,23 + mov rax,r14 + vpalignr xmm11,xmm1,xmm0,8 + mov r12,r9 + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,r8 + xor r12,r10 + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,rax + vpaddq xmm4,xmm4,xmm11 + and r12,r8 + xor r13,r8 + add r11,QWORD[64+rsp] + mov r15,rax +DB 143,72,120,195,209,7 + xor r12,r10 + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,rbx + add r11,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,219,3 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + ror r14,28 + vpsrlq xmm10,xmm3,6 + add rdx,r11 + add r11,rdi + vpaddq xmm4,xmm4,xmm8 + mov r13,rdx + add r14,r11 +DB 143,72,120,195,203,42 + ror r13,23 + mov r11,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,r8 + ror r14,5 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + vpaddq xmm4,xmm4,xmm11 + add r10,QWORD[72+rsp] + mov rdi,r11 + xor r12,r9 + ror r14,6 + vpaddq xmm10,xmm4,XMMWORD[rbp] + xor rdi,rax + add r10,r12 + ror r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + ror r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[64+rsp],xmm10 + vpalignr xmm8,xmm6,xmm5,8 + ror r13,23 + mov r10,r14 + vpalignr xmm11,xmm2,xmm1,8 + mov r12,rdx + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,rcx + xor r12,r8 + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,r10 + vpaddq xmm5,xmm5,xmm11 + and r12,rcx + xor r13,rcx + add r9,QWORD[80+rsp] + mov r15,r10 +DB 143,72,120,195,209,7 + xor r12,r8 + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,r11 + add r9,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,220,3 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + ror r14,28 + vpsrlq xmm10,xmm4,6 + add rbx,r9 + add r9,rdi + vpaddq xmm5,xmm5,xmm8 + mov r13,rbx + add r14,r9 +DB 143,72,120,195,203,42 + ror r13,23 + mov r9,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,rcx + ror r14,5 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + vpaddq xmm5,xmm5,xmm11 + add r8,QWORD[88+rsp] + mov rdi,r9 + xor r12,rdx + ror r14,6 + vpaddq xmm10,xmm5,XMMWORD[32+rbp] + xor rdi,r10 + add r8,r12 + ror r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + ror r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[80+rsp],xmm10 + vpalignr xmm8,xmm7,xmm6,8 + ror r13,23 + mov r8,r14 + vpalignr xmm11,xmm3,xmm2,8 + mov r12,rbx + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,rax + xor r12,rcx + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,r8 + vpaddq xmm6,xmm6,xmm11 + and r12,rax + xor r13,rax + add rdx,QWORD[96+rsp] + mov r15,r8 +DB 143,72,120,195,209,7 + xor r12,rcx + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,r9 + add rdx,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,221,3 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + ror r14,28 + vpsrlq xmm10,xmm5,6 + add r11,rdx + add rdx,rdi + vpaddq xmm6,xmm6,xmm8 + mov r13,r11 + add r14,rdx +DB 143,72,120,195,203,42 + ror r13,23 + mov rdx,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,rax + ror r14,5 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + vpaddq xmm6,xmm6,xmm11 + add rcx,QWORD[104+rsp] + mov rdi,rdx + xor r12,rbx + ror r14,6 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + xor rdi,r8 + add rcx,r12 + ror r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + ror r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[96+rsp],xmm10 + vpalignr xmm8,xmm0,xmm7,8 + ror r13,23 + mov rcx,r14 + vpalignr xmm11,xmm4,xmm3,8 + mov r12,r11 + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,r10 + xor r12,rax + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,rcx + vpaddq xmm7,xmm7,xmm11 + and r12,r10 + xor r13,r10 + add rbx,QWORD[112+rsp] + mov r15,rcx +DB 143,72,120,195,209,7 + xor r12,rax + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,rdx + add rbx,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,222,3 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + ror r14,28 + vpsrlq xmm10,xmm6,6 + add r9,rbx + add rbx,rdi + vpaddq xmm7,xmm7,xmm8 + mov r13,r9 + add r14,rbx +DB 143,72,120,195,203,42 + ror r13,23 + mov rbx,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,r10 + ror r14,5 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + vpaddq xmm7,xmm7,xmm11 + add rax,QWORD[120+rsp] + mov rdi,rbx + xor r12,r11 + ror r14,6 + vpaddq xmm10,xmm7,XMMWORD[96+rbp] + xor rdi,rcx + add rax,r12 + ror r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + ror r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[112+rsp],xmm10 + cmp BYTE[135+rbp],0 + jne NEAR $L$xop_00_47 + ror r13,23 + mov rax,r14 + mov r12,r9 + ror r14,5 + xor r13,r8 + xor r12,r10 + ror r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[rsp] + mov r15,rax + xor r12,r10 + ror r14,6 + xor r15,rbx + add r11,r12 + ror r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + ror r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + ror r13,23 + mov r11,r14 + mov r12,r8 + ror r14,5 + xor r13,rdx + xor r12,r9 + ror r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[8+rsp] + mov rdi,r11 + xor r12,r9 + ror r14,6 + xor rdi,rax + add r10,r12 + ror r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + ror r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + ror r13,23 + mov r10,r14 + mov r12,rdx + ror r14,5 + xor r13,rcx + xor r12,r8 + ror r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[16+rsp] + mov r15,r10 + xor r12,r8 + ror r14,6 + xor r15,r11 + add r9,r12 + ror r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + ror r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + ror r13,23 + mov r9,r14 + mov r12,rcx + ror r14,5 + xor r13,rbx + xor r12,rdx + ror r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[24+rsp] + mov rdi,r9 + xor r12,rdx + ror r14,6 + xor rdi,r10 + add r8,r12 + ror r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + ror r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + ror r13,23 + mov r8,r14 + mov r12,rbx + ror r14,5 + xor r13,rax + xor r12,rcx + ror r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[32+rsp] + mov r15,r8 + xor r12,rcx + ror r14,6 + xor r15,r9 + add rdx,r12 + ror r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + ror r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + ror r13,23 + mov rdx,r14 + mov r12,rax + ror r14,5 + xor r13,r11 + xor r12,rbx + ror r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[40+rsp] + mov rdi,rdx + xor r12,rbx + ror r14,6 + xor rdi,r8 + add rcx,r12 + ror r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + ror r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + ror r13,23 + mov rcx,r14 + mov r12,r11 + ror r14,5 + xor r13,r10 + xor r12,rax + ror r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[48+rsp] + mov r15,rcx + xor r12,rax + ror r14,6 + xor r15,rdx + add rbx,r12 + ror r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + ror r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + ror r13,23 + mov rbx,r14 + mov r12,r10 + ror r14,5 + xor r13,r9 + xor r12,r11 + ror r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[56+rsp] + mov rdi,rbx + xor r12,r11 + ror r14,6 + xor rdi,rcx + add rax,r12 + ror r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + ror r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + ror r13,23 + mov rax,r14 + mov r12,r9 + ror r14,5 + xor r13,r8 + xor r12,r10 + ror r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[64+rsp] + mov r15,rax + xor r12,r10 + ror r14,6 + xor r15,rbx + add r11,r12 + ror r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + ror r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + ror r13,23 + mov r11,r14 + mov r12,r8 + ror r14,5 + xor r13,rdx + xor r12,r9 + ror r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[72+rsp] + mov rdi,r11 + xor r12,r9 + ror r14,6 + xor rdi,rax + add r10,r12 + ror r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + ror r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + ror r13,23 + mov r10,r14 + mov r12,rdx + ror r14,5 + xor r13,rcx + xor r12,r8 + ror r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[80+rsp] + mov r15,r10 + xor r12,r8 + ror r14,6 + xor r15,r11 + add r9,r12 + ror r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + ror r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + ror r13,23 + mov r9,r14 + mov r12,rcx + ror r14,5 + xor r13,rbx + xor r12,rdx + ror r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[88+rsp] + mov rdi,r9 + xor r12,rdx + ror r14,6 + xor rdi,r10 + add r8,r12 + ror r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + ror r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + ror r13,23 + mov r8,r14 + mov r12,rbx + ror r14,5 + xor r13,rax + xor r12,rcx + ror r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[96+rsp] + mov r15,r8 + xor r12,rcx + ror r14,6 + xor r15,r9 + add rdx,r12 + ror r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + ror r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + ror r13,23 + mov rdx,r14 + mov r12,rax + ror r14,5 + xor r13,r11 + xor r12,rbx + ror r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[104+rsp] + mov rdi,rdx + xor r12,rbx + ror r14,6 + xor rdi,r8 + add rcx,r12 + ror r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + ror r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + ror r13,23 + mov rcx,r14 + mov r12,r11 + ror r14,5 + xor r13,r10 + xor r12,rax + ror r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[112+rsp] + mov r15,rcx + xor r12,rax + ror r14,6 + xor r15,rdx + add rbx,r12 + ror r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + ror r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + ror r13,23 + mov rbx,r14 + mov r12,r10 + ror r14,5 + xor r13,r9 + xor r12,r11 + ror r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[120+rsp] + mov rdi,rbx + xor r12,r11 + ror r14,6 + xor rdi,rcx + add rax,r12 + ror r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + ror r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + mov rdi,QWORD[((128+0))+rsp] + mov rax,r14 + + add rax,QWORD[rdi] + lea rsi,[128+rsi] + add rbx,QWORD[8+rdi] + add rcx,QWORD[16+rdi] + add rdx,QWORD[24+rdi] + add r8,QWORD[32+rdi] + add r9,QWORD[40+rdi] + add r10,QWORD[48+rdi] + add r11,QWORD[56+rdi] + + cmp rsi,QWORD[((128+16))+rsp] + + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rcx + mov QWORD[24+rdi],rdx + mov QWORD[32+rdi],r8 + mov QWORD[40+rdi],r9 + mov QWORD[48+rdi],r10 + mov QWORD[56+rdi],r11 + jb NEAR $L$loop_xop + + mov rsi,QWORD[((128+24))+rsp] + vzeroupper + movaps xmm6,XMMWORD[((128+32))+rsp] + movaps xmm7,XMMWORD[((128+48))+rsp] + movaps xmm8,XMMWORD[((128+64))+rsp] + movaps xmm9,XMMWORD[((128+80))+rsp] + movaps xmm10,XMMWORD[((128+96))+rsp] + movaps xmm11,XMMWORD[((128+112))+rsp] + mov r15,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r13,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + mov rbp,QWORD[32+rsi] + mov rbx,QWORD[40+rsi] + lea rsp,[48+rsi] +$L$epilogue_xop: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_sha512_block_data_order_xop: + +ALIGN 64 +sha512_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha512_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +$L$avx_shortcut: + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + mov r11,rsp + shl rdx,4 + sub rsp,256 + lea rdx,[rdx*8+rsi] + and rsp,-64 + mov QWORD[((128+0))+rsp],rdi + mov QWORD[((128+8))+rsp],rsi + mov QWORD[((128+16))+rsp],rdx + mov QWORD[((128+24))+rsp],r11 + movaps XMMWORD[(128+32)+rsp],xmm6 + movaps XMMWORD[(128+48)+rsp],xmm7 + movaps XMMWORD[(128+64)+rsp],xmm8 + movaps XMMWORD[(128+80)+rsp],xmm9 + movaps XMMWORD[(128+96)+rsp],xmm10 + movaps XMMWORD[(128+112)+rsp],xmm11 +$L$prologue_avx: + + vzeroupper + mov rax,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov rcx,QWORD[16+rdi] + mov rdx,QWORD[24+rdi] + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + mov r11,QWORD[56+rdi] + jmp NEAR $L$loop_avx +ALIGN 16 +$L$loop_avx: + vmovdqa xmm11,XMMWORD[((K512+1280))] + vmovdqu xmm0,XMMWORD[rsi] + lea rbp,[((K512+128))] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vpshufb xmm0,xmm0,xmm11 + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm1,xmm1,xmm11 + vmovdqu xmm4,XMMWORD[64+rsi] + vpshufb xmm2,xmm2,xmm11 + vmovdqu xmm5,XMMWORD[80+rsi] + vpshufb xmm3,xmm3,xmm11 + vmovdqu xmm6,XMMWORD[96+rsi] + vpshufb xmm4,xmm4,xmm11 + vmovdqu xmm7,XMMWORD[112+rsi] + vpshufb xmm5,xmm5,xmm11 + vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] + vpshufb xmm6,xmm6,xmm11 + vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] + vpshufb xmm7,xmm7,xmm11 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] + vmovdqa XMMWORD[rsp],xmm8 + vpaddq xmm8,xmm4,XMMWORD[rbp] + vmovdqa XMMWORD[16+rsp],xmm9 + vpaddq xmm9,xmm5,XMMWORD[32+rbp] + vmovdqa XMMWORD[32+rsp],xmm10 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + vmovdqa XMMWORD[48+rsp],xmm11 + vpaddq xmm11,xmm7,XMMWORD[96+rbp] + vmovdqa XMMWORD[64+rsp],xmm8 + mov r14,rax + vmovdqa XMMWORD[80+rsp],xmm9 + mov rdi,rbx + vmovdqa XMMWORD[96+rsp],xmm10 + xor rdi,rcx + vmovdqa XMMWORD[112+rsp],xmm11 + mov r13,r8 + jmp NEAR $L$avx_00_47 + +ALIGN 16 +$L$avx_00_47: + add rbp,256 + vpalignr xmm8,xmm1,xmm0,8 + shrd r13,r13,23 + mov rax,r14 + vpalignr xmm11,xmm5,xmm4,8 + mov r12,r9 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r8 + xor r12,r10 + vpaddq xmm0,xmm0,xmm11 + shrd r13,r13,4 + xor r14,rax + vpsrlq xmm11,xmm8,7 + and r12,r8 + xor r13,r8 + vpsllq xmm9,xmm8,56 + add r11,QWORD[rsp] + mov r15,rax + vpxor xmm8,xmm11,xmm10 + xor r12,r10 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rbx + add r11,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + shrd r14,r14,28 + vpsrlq xmm11,xmm7,6 + add rdx,r11 + add r11,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rdx + add r14,r11 + vpsllq xmm10,xmm7,3 + shrd r13,r13,23 + mov r11,r14 + vpaddq xmm0,xmm0,xmm8 + mov r12,r8 + shrd r14,r14,5 + vpsrlq xmm9,xmm7,19 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r11 + vpsllq xmm10,xmm10,42 + and r12,rdx + xor r13,rdx + vpxor xmm11,xmm11,xmm9 + add r10,QWORD[8+rsp] + mov rdi,r11 + vpsrlq xmm9,xmm9,42 + xor r12,r9 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rax + add r10,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm0,xmm0,xmm11 + xor r14,r11 + add r10,r13 + vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[rsp],xmm10 + vpalignr xmm8,xmm2,xmm1,8 + shrd r13,r13,23 + mov r10,r14 + vpalignr xmm11,xmm6,xmm5,8 + mov r12,rdx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rcx + xor r12,r8 + vpaddq xmm1,xmm1,xmm11 + shrd r13,r13,4 + xor r14,r10 + vpsrlq xmm11,xmm8,7 + and r12,rcx + xor r13,rcx + vpsllq xmm9,xmm8,56 + add r9,QWORD[16+rsp] + mov r15,r10 + vpxor xmm8,xmm11,xmm10 + xor r12,r8 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r11 + add r9,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + shrd r14,r14,28 + vpsrlq xmm11,xmm0,6 + add rbx,r9 + add r9,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rbx + add r14,r9 + vpsllq xmm10,xmm0,3 + shrd r13,r13,23 + mov r9,r14 + vpaddq xmm1,xmm1,xmm8 + mov r12,rcx + shrd r14,r14,5 + vpsrlq xmm9,xmm0,19 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r9 + vpsllq xmm10,xmm10,42 + and r12,rbx + xor r13,rbx + vpxor xmm11,xmm11,xmm9 + add r8,QWORD[24+rsp] + mov rdi,r9 + vpsrlq xmm9,xmm9,42 + xor r12,rdx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r10 + add r8,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm1,xmm1,xmm11 + xor r14,r9 + add r8,r13 + vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp] + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[16+rsp],xmm10 + vpalignr xmm8,xmm3,xmm2,8 + shrd r13,r13,23 + mov r8,r14 + vpalignr xmm11,xmm7,xmm6,8 + mov r12,rbx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rax + xor r12,rcx + vpaddq xmm2,xmm2,xmm11 + shrd r13,r13,4 + xor r14,r8 + vpsrlq xmm11,xmm8,7 + and r12,rax + xor r13,rax + vpsllq xmm9,xmm8,56 + add rdx,QWORD[32+rsp] + mov r15,r8 + vpxor xmm8,xmm11,xmm10 + xor r12,rcx + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r9 + add rdx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + shrd r14,r14,28 + vpsrlq xmm11,xmm1,6 + add r11,rdx + add rdx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r11 + add r14,rdx + vpsllq xmm10,xmm1,3 + shrd r13,r13,23 + mov rdx,r14 + vpaddq xmm2,xmm2,xmm8 + mov r12,rax + shrd r14,r14,5 + vpsrlq xmm9,xmm1,19 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rdx + vpsllq xmm10,xmm10,42 + and r12,r11 + xor r13,r11 + vpxor xmm11,xmm11,xmm9 + add rcx,QWORD[40+rsp] + mov rdi,rdx + vpsrlq xmm9,xmm9,42 + xor r12,rbx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r8 + add rcx,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm2,xmm2,xmm11 + xor r14,rdx + add rcx,r13 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[32+rsp],xmm10 + vpalignr xmm8,xmm4,xmm3,8 + shrd r13,r13,23 + mov rcx,r14 + vpalignr xmm11,xmm0,xmm7,8 + mov r12,r11 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r10 + xor r12,rax + vpaddq xmm3,xmm3,xmm11 + shrd r13,r13,4 + xor r14,rcx + vpsrlq xmm11,xmm8,7 + and r12,r10 + xor r13,r10 + vpsllq xmm9,xmm8,56 + add rbx,QWORD[48+rsp] + mov r15,rcx + vpxor xmm8,xmm11,xmm10 + xor r12,rax + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rdx + add rbx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + shrd r14,r14,28 + vpsrlq xmm11,xmm2,6 + add r9,rbx + add rbx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r9 + add r14,rbx + vpsllq xmm10,xmm2,3 + shrd r13,r13,23 + mov rbx,r14 + vpaddq xmm3,xmm3,xmm8 + mov r12,r10 + shrd r14,r14,5 + vpsrlq xmm9,xmm2,19 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rbx + vpsllq xmm10,xmm10,42 + and r12,r9 + xor r13,r9 + vpxor xmm11,xmm11,xmm9 + add rax,QWORD[56+rsp] + mov rdi,rbx + vpsrlq xmm9,xmm9,42 + xor r12,r11 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rcx + add rax,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm3,xmm3,xmm11 + xor r14,rbx + add rax,r13 + vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp] + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[48+rsp],xmm10 + vpalignr xmm8,xmm5,xmm4,8 + shrd r13,r13,23 + mov rax,r14 + vpalignr xmm11,xmm1,xmm0,8 + mov r12,r9 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r8 + xor r12,r10 + vpaddq xmm4,xmm4,xmm11 + shrd r13,r13,4 + xor r14,rax + vpsrlq xmm11,xmm8,7 + and r12,r8 + xor r13,r8 + vpsllq xmm9,xmm8,56 + add r11,QWORD[64+rsp] + mov r15,rax + vpxor xmm8,xmm11,xmm10 + xor r12,r10 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rbx + add r11,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + shrd r14,r14,28 + vpsrlq xmm11,xmm3,6 + add rdx,r11 + add r11,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rdx + add r14,r11 + vpsllq xmm10,xmm3,3 + shrd r13,r13,23 + mov r11,r14 + vpaddq xmm4,xmm4,xmm8 + mov r12,r8 + shrd r14,r14,5 + vpsrlq xmm9,xmm3,19 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r11 + vpsllq xmm10,xmm10,42 + and r12,rdx + xor r13,rdx + vpxor xmm11,xmm11,xmm9 + add r10,QWORD[72+rsp] + mov rdi,r11 + vpsrlq xmm9,xmm9,42 + xor r12,r9 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rax + add r10,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm4,xmm4,xmm11 + xor r14,r11 + add r10,r13 + vpaddq xmm10,xmm4,XMMWORD[rbp] + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[64+rsp],xmm10 + vpalignr xmm8,xmm6,xmm5,8 + shrd r13,r13,23 + mov r10,r14 + vpalignr xmm11,xmm2,xmm1,8 + mov r12,rdx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rcx + xor r12,r8 + vpaddq xmm5,xmm5,xmm11 + shrd r13,r13,4 + xor r14,r10 + vpsrlq xmm11,xmm8,7 + and r12,rcx + xor r13,rcx + vpsllq xmm9,xmm8,56 + add r9,QWORD[80+rsp] + mov r15,r10 + vpxor xmm8,xmm11,xmm10 + xor r12,r8 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r11 + add r9,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + shrd r14,r14,28 + vpsrlq xmm11,xmm4,6 + add rbx,r9 + add r9,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rbx + add r14,r9 + vpsllq xmm10,xmm4,3 + shrd r13,r13,23 + mov r9,r14 + vpaddq xmm5,xmm5,xmm8 + mov r12,rcx + shrd r14,r14,5 + vpsrlq xmm9,xmm4,19 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r9 + vpsllq xmm10,xmm10,42 + and r12,rbx + xor r13,rbx + vpxor xmm11,xmm11,xmm9 + add r8,QWORD[88+rsp] + mov rdi,r9 + vpsrlq xmm9,xmm9,42 + xor r12,rdx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r10 + add r8,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm5,xmm5,xmm11 + xor r14,r9 + add r8,r13 + vpaddq xmm10,xmm5,XMMWORD[32+rbp] + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[80+rsp],xmm10 + vpalignr xmm8,xmm7,xmm6,8 + shrd r13,r13,23 + mov r8,r14 + vpalignr xmm11,xmm3,xmm2,8 + mov r12,rbx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rax + xor r12,rcx + vpaddq xmm6,xmm6,xmm11 + shrd r13,r13,4 + xor r14,r8 + vpsrlq xmm11,xmm8,7 + and r12,rax + xor r13,rax + vpsllq xmm9,xmm8,56 + add rdx,QWORD[96+rsp] + mov r15,r8 + vpxor xmm8,xmm11,xmm10 + xor r12,rcx + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r9 + add rdx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + shrd r14,r14,28 + vpsrlq xmm11,xmm5,6 + add r11,rdx + add rdx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r11 + add r14,rdx + vpsllq xmm10,xmm5,3 + shrd r13,r13,23 + mov rdx,r14 + vpaddq xmm6,xmm6,xmm8 + mov r12,rax + shrd r14,r14,5 + vpsrlq xmm9,xmm5,19 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rdx + vpsllq xmm10,xmm10,42 + and r12,r11 + xor r13,r11 + vpxor xmm11,xmm11,xmm9 + add rcx,QWORD[104+rsp] + mov rdi,rdx + vpsrlq xmm9,xmm9,42 + xor r12,rbx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r8 + add rcx,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm6,xmm6,xmm11 + xor r14,rdx + add rcx,r13 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[96+rsp],xmm10 + vpalignr xmm8,xmm0,xmm7,8 + shrd r13,r13,23 + mov rcx,r14 + vpalignr xmm11,xmm4,xmm3,8 + mov r12,r11 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r10 + xor r12,rax + vpaddq xmm7,xmm7,xmm11 + shrd r13,r13,4 + xor r14,rcx + vpsrlq xmm11,xmm8,7 + and r12,r10 + xor r13,r10 + vpsllq xmm9,xmm8,56 + add rbx,QWORD[112+rsp] + mov r15,rcx + vpxor xmm8,xmm11,xmm10 + xor r12,rax + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rdx + add rbx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + shrd r14,r14,28 + vpsrlq xmm11,xmm6,6 + add r9,rbx + add rbx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r9 + add r14,rbx + vpsllq xmm10,xmm6,3 + shrd r13,r13,23 + mov rbx,r14 + vpaddq xmm7,xmm7,xmm8 + mov r12,r10 + shrd r14,r14,5 + vpsrlq xmm9,xmm6,19 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rbx + vpsllq xmm10,xmm10,42 + and r12,r9 + xor r13,r9 + vpxor xmm11,xmm11,xmm9 + add rax,QWORD[120+rsp] + mov rdi,rbx + vpsrlq xmm9,xmm9,42 + xor r12,r11 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rcx + add rax,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm7,xmm7,xmm11 + xor r14,rbx + add rax,r13 + vpaddq xmm10,xmm7,XMMWORD[96+rbp] + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[112+rsp],xmm10 + cmp BYTE[135+rbp],0 + jne NEAR $L$avx_00_47 + shrd r13,r13,23 + mov rax,r14 + mov r12,r9 + shrd r14,r14,5 + xor r13,r8 + xor r12,r10 + shrd r13,r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[rsp] + mov r15,rax + xor r12,r10 + shrd r14,r14,6 + xor r15,rbx + add r11,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + shrd r14,r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + shrd r13,r13,23 + mov r11,r14 + mov r12,r8 + shrd r14,r14,5 + xor r13,rdx + xor r12,r9 + shrd r13,r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[8+rsp] + mov rdi,r11 + xor r12,r9 + shrd r14,r14,6 + xor rdi,rax + add r10,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + shrd r13,r13,23 + mov r10,r14 + mov r12,rdx + shrd r14,r14,5 + xor r13,rcx + xor r12,r8 + shrd r13,r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[16+rsp] + mov r15,r10 + xor r12,r8 + shrd r14,r14,6 + xor r15,r11 + add r9,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + shrd r14,r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + shrd r13,r13,23 + mov r9,r14 + mov r12,rcx + shrd r14,r14,5 + xor r13,rbx + xor r12,rdx + shrd r13,r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[24+rsp] + mov rdi,r9 + xor r12,rdx + shrd r14,r14,6 + xor rdi,r10 + add r8,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + shrd r13,r13,23 + mov r8,r14 + mov r12,rbx + shrd r14,r14,5 + xor r13,rax + xor r12,rcx + shrd r13,r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[32+rsp] + mov r15,r8 + xor r12,rcx + shrd r14,r14,6 + xor r15,r9 + add rdx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + shrd r14,r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + shrd r13,r13,23 + mov rdx,r14 + mov r12,rax + shrd r14,r14,5 + xor r13,r11 + xor r12,rbx + shrd r13,r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[40+rsp] + mov rdi,rdx + xor r12,rbx + shrd r14,r14,6 + xor rdi,r8 + add rcx,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + shrd r13,r13,23 + mov rcx,r14 + mov r12,r11 + shrd r14,r14,5 + xor r13,r10 + xor r12,rax + shrd r13,r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[48+rsp] + mov r15,rcx + xor r12,rax + shrd r14,r14,6 + xor r15,rdx + add rbx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + shrd r14,r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + shrd r13,r13,23 + mov rbx,r14 + mov r12,r10 + shrd r14,r14,5 + xor r13,r9 + xor r12,r11 + shrd r13,r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[56+rsp] + mov rdi,rbx + xor r12,r11 + shrd r14,r14,6 + xor rdi,rcx + add rax,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + shrd r13,r13,23 + mov rax,r14 + mov r12,r9 + shrd r14,r14,5 + xor r13,r8 + xor r12,r10 + shrd r13,r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[64+rsp] + mov r15,rax + xor r12,r10 + shrd r14,r14,6 + xor r15,rbx + add r11,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + shrd r14,r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + shrd r13,r13,23 + mov r11,r14 + mov r12,r8 + shrd r14,r14,5 + xor r13,rdx + xor r12,r9 + shrd r13,r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[72+rsp] + mov rdi,r11 + xor r12,r9 + shrd r14,r14,6 + xor rdi,rax + add r10,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + shrd r13,r13,23 + mov r10,r14 + mov r12,rdx + shrd r14,r14,5 + xor r13,rcx + xor r12,r8 + shrd r13,r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[80+rsp] + mov r15,r10 + xor r12,r8 + shrd r14,r14,6 + xor r15,r11 + add r9,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + shrd r14,r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + shrd r13,r13,23 + mov r9,r14 + mov r12,rcx + shrd r14,r14,5 + xor r13,rbx + xor r12,rdx + shrd r13,r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[88+rsp] + mov rdi,r9 + xor r12,rdx + shrd r14,r14,6 + xor rdi,r10 + add r8,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + shrd r13,r13,23 + mov r8,r14 + mov r12,rbx + shrd r14,r14,5 + xor r13,rax + xor r12,rcx + shrd r13,r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[96+rsp] + mov r15,r8 + xor r12,rcx + shrd r14,r14,6 + xor r15,r9 + add rdx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + shrd r14,r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + shrd r13,r13,23 + mov rdx,r14 + mov r12,rax + shrd r14,r14,5 + xor r13,r11 + xor r12,rbx + shrd r13,r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[104+rsp] + mov rdi,rdx + xor r12,rbx + shrd r14,r14,6 + xor rdi,r8 + add rcx,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + shrd r13,r13,23 + mov rcx,r14 + mov r12,r11 + shrd r14,r14,5 + xor r13,r10 + xor r12,rax + shrd r13,r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[112+rsp] + mov r15,rcx + xor r12,rax + shrd r14,r14,6 + xor r15,rdx + add rbx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + shrd r14,r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + shrd r13,r13,23 + mov rbx,r14 + mov r12,r10 + shrd r14,r14,5 + xor r13,r9 + xor r12,r11 + shrd r13,r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[120+rsp] + mov rdi,rbx + xor r12,r11 + shrd r14,r14,6 + xor rdi,rcx + add rax,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + mov rdi,QWORD[((128+0))+rsp] + mov rax,r14 + + add rax,QWORD[rdi] + lea rsi,[128+rsi] + add rbx,QWORD[8+rdi] + add rcx,QWORD[16+rdi] + add rdx,QWORD[24+rdi] + add r8,QWORD[32+rdi] + add r9,QWORD[40+rdi] + add r10,QWORD[48+rdi] + add r11,QWORD[56+rdi] + + cmp rsi,QWORD[((128+16))+rsp] + + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rcx + mov QWORD[24+rdi],rdx + mov QWORD[32+rdi],r8 + mov QWORD[40+rdi],r9 + mov QWORD[48+rdi],r10 + mov QWORD[56+rdi],r11 + jb NEAR $L$loop_avx + + mov rsi,QWORD[((128+24))+rsp] + vzeroupper + movaps xmm6,XMMWORD[((128+32))+rsp] + movaps xmm7,XMMWORD[((128+48))+rsp] + movaps xmm8,XMMWORD[((128+64))+rsp] + movaps xmm9,XMMWORD[((128+80))+rsp] + movaps xmm10,XMMWORD[((128+96))+rsp] + movaps xmm11,XMMWORD[((128+112))+rsp] + mov r15,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r13,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + mov rbp,QWORD[32+rsi] + mov rbx,QWORD[40+rsi] + lea rsp,[48+rsi] +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_sha512_block_data_order_avx: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$in_prologue + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_prologue + mov rsi,rax + mov rax,QWORD[((128+24))+rax] + lea rax,[48+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + + lea r10,[$L$epilogue] + cmp rbx,r10 + jb NEAR $L$in_prologue + + lea rsi,[((128+32))+rsi] + lea rdi,[512+r8] + mov ecx,12 + DD 0xa548f3fc + +$L$in_prologue: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_sha512_block_data_order wrt ..imagebase + DD $L$SEH_end_sha512_block_data_order wrt ..imagebase + DD $L$SEH_info_sha512_block_data_order wrt ..imagebase + DD $L$SEH_begin_sha512_block_data_order_xop wrt ..imagebase + DD $L$SEH_end_sha512_block_data_order_xop wrt ..imagebase + DD $L$SEH_info_sha512_block_data_order_xop wrt ..imagebase + DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_sha512_block_data_order: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase +$L$SEH_info_sha512_block_data_order_xop: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_xop wrt ..imagebase,$L$epilogue_xop wrt ..imagebase +$L$SEH_info_sha512_block_data_order_avx: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase