From 5df1b26631e9cd1ce9f2278e4d7631f381ee84f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Fri, 26 Jan 2024 13:50:41 +0100 Subject: [PATCH] Add flint_mpn_mulhigh_normalised_n --- dev/gen_mulhigh_basecase.jl | 308 ++++++++++++++ doc/source/mpn_extras.rst | 6 +- src/mpn_extras.h | 13 + .../broadwell/mulhigh_normalised_1.asm | 36 ++ .../broadwell/mulhigh_normalised_10.asm | 284 +++++++++++++ .../broadwell/mulhigh_normalised_11.asm | 324 +++++++++++++++ .../broadwell/mulhigh_normalised_12.asm | 378 ++++++++++++++++++ .../broadwell/mulhigh_normalised_2.asm | 51 +++ .../broadwell/mulhigh_normalised_3.asm | 74 ++++ .../broadwell/mulhigh_normalised_4.asm | 95 +++++ .../broadwell/mulhigh_normalised_5.asm | 119 ++++++ .../broadwell/mulhigh_normalised_6.asm | 146 +++++++ .../broadwell/mulhigh_normalised_7.asm | 176 ++++++++ .../broadwell/mulhigh_normalised_8.asm | 209 ++++++++++ .../broadwell/mulhigh_normalised_9.asm | 245 ++++++++++++ src/mpn_extras/mulhigh.c | 29 ++ src/mpn_extras/profile/p-mulhigh_normalised.c | 55 +++ src/mpn_extras/test/main.c | 2 + src/mpn_extras/test/t-mulhigh_normalised_n.c | 99 +++++ 19 files changed, 2647 insertions(+), 2 deletions(-) create mode 100644 src/mpn_extras/broadwell/mulhigh_normalised_1.asm create mode 100644 src/mpn_extras/broadwell/mulhigh_normalised_10.asm create mode 100644 src/mpn_extras/broadwell/mulhigh_normalised_11.asm create mode 100644 src/mpn_extras/broadwell/mulhigh_normalised_12.asm create mode 100644 src/mpn_extras/broadwell/mulhigh_normalised_2.asm create mode 100644 src/mpn_extras/broadwell/mulhigh_normalised_3.asm create mode 100644 src/mpn_extras/broadwell/mulhigh_normalised_4.asm create mode 100644 src/mpn_extras/broadwell/mulhigh_normalised_5.asm create mode 100644 src/mpn_extras/broadwell/mulhigh_normalised_6.asm create mode 100644 src/mpn_extras/broadwell/mulhigh_normalised_7.asm create mode 100644 src/mpn_extras/broadwell/mulhigh_normalised_8.asm create mode 100644 src/mpn_extras/broadwell/mulhigh_normalised_9.asm create mode 100644 src/mpn_extras/profile/p-mulhigh_normalised.c create mode 100644 src/mpn_extras/test/t-mulhigh_normalised_n.c diff --git a/dev/gen_mulhigh_basecase.jl b/dev/gen_mulhigh_basecase.jl index f1f5194650..707bc7b581 100644 --- a/dev/gen_mulhigh_basecase.jl +++ b/dev/gen_mulhigh_basecase.jl @@ -376,6 +376,295 @@ function mulhigh(n::Int; debug::Bool = false) end end +############################################################################### +# mulhigh, normalised +############################################################################### + +function mulhigh_normalised_1() + r0 = _regs[9] # Important that r0 is rax + r1 = _regs[4] + + res = _regs[1] + ap = _regs[2] + bp = _regs[3] + + body = "" + + body *= "\tmov\t0*8($bp), %rdx\n" + body *= "\tmulx\t0*8($ap), $r0, $r1\n" + + # Check if normalised + body *= "\tmov\t\$0, %rdx\n" + body *= "\ttest\t$r1, $r1\n" + body *= "\tsetns\t$(R8("%rdx"))\n" + body *= "\tjs\t.Lcontinue\n" + + # If not normalised, shift by one + body *= "\tadd\t$r0, $r0\n" + body *= "\tadc\t$r1, $r1\n" + + body *= ".Lcontinue:\n" + body *= "\tmov\t$r1, 0*8($res)\n" + + return body * "\n\tret\n" +end + +function mulhigh_normalised_2() + r0 = _regs[9] + r1 = _regs[4] + r2 = _regs[5] + + sc = _regs[6] + zr = _regs[7] + + res = _regs[1] + ap = _regs[2] + bp_old = _regs[3] + b1 = _regs[8] + + body = "" + + body *= "\tmov\t1*8($bp_old), $b1\n" + body *= "\tmov\t0*8($bp_old), %rdx\n" + body *= "\txor\t$(R32(zr)), $(R32(zr))\n" + body *= "\tmulx\t0*8($ap), $sc, $r0\n" + body *= "\tmulx\t1*8($ap), $sc, $r1\n" + body *= "\tadcx\t$sc, $r0\n" + body *= "\tadcx\t$zr, $r1\n" + + body *= "\tmov\t$b1, %rdx\n" + body *= "\tmulx\t0*8($ap), $sc, $r2\n" + body *= "\tadcx\t$sc, $r0\n" + body *= "\tadcx\t$r2, $r1\n" + body *= "\tmulx\t1*8($ap), $sc, $r2\n" + body *= "\tadox\t$sc, $r1\n" + body *= "\tadox\t$zr, $r2\n" + body *= "\tadcx\t$zr, $r2\n" + + # Check if normalised + body *= "\tmov\t\$0, %rdx\n" + body *= "\ttest\t$r2, $r2\n" + body *= "\tsetns\t$(R8("%rdx"))\n" + body *= "\tjs\t.Lcontinue\n" + + # If not normalised, shift by one + body *= "\tadd\t$r0, $r0\n" + body *= "\tadc\t$r1, $r1\n" + body *= "\tadc\t$r2, $r2\n" + + body *= ".Lcontinue:\n" + body *= "\tmov\t$r1, 0*8($res)\n" + body *= "\tmov\t$r2, 1*8($res)\n" + + return body * "\n\tret\n" +end + +function mulhigh_normalised(n::Int; debug::Bool = false) + if n < 1 + error() + elseif n == 1 + return mulhigh_normalised_1() + elseif n == 2 + return mulhigh_normalised_2() + elseif n <= 12 + # Continue + else + error() + end + + if debug + res = "res" + ap = "ap" + bp_old = "bp_old" + bp = "bp" + sc = "sc" + zr = "zr" + else + res = _regs[1] + ap = _regs[2] + bp_old = _regs[3] # rdx + bp = _regs[4] # rdx is used by mulx, so we need to switch register for bp + + if n != 12 + sc = _regs[5] # scrap register + zr = (n < 10) ? _regs[6] : "%rsp" # zero + else + sc = "%rsp" + zr = sc + end + + if n < 9 + _r = [_regs[9]; _regs[7:8]; __regs[1:n - 2]] + elseif n == 9 + _r = [_regs[9]; _regs[7:8]; __regs[1:6]; res] + elseif n == 10 + _r = [_regs[9]; _regs[6:8]; __regs[1:6]; res] + elseif n == 11 + _r = [_regs[9]; _regs[6:8]; __regs[1:6]; res; bp] + elseif n == 12 + _r = [_regs[9]; _regs[5:8]; __regs[1:6]; res; bp] + end + end + + r(ix::Int) = debug ? "r$ix" : _r[ix + 1] + + # Push + body = "" + for ix in 1:min(n - 2, 6) + body *= "\tpush\t$(__regs[ix])\n" + end + if n == 9 + body *= "\tpush\t$res\n" + elseif n >= 10 + body *= "\tvmovq\t%rsp, %xmm0\n" + body *= "\tvmovq\t$res, %xmm1\n" + end + body *= "\n" + + # Prepare + body *= "\tmov\t$bp_old, $bp\n" + body *= "\tmov\t0*8($bp_old), %rdx\n" + if n != 12 + body *= "\txor\t$(R32(zr)), $(R32(zr))\n" + end + body *= "\n" + + # First multiplication chain + body *= "\tmulx\t$(n - 2)*8($ap), $sc, $(r(0))\n" + body *= "\tmulx\t$(n - 1)*8($ap), $sc, $(r(1))\n" + if n != 12 + body *= "\tadcx\t$sc, $(r(0))\n" + body *= "\tadcx\t$zr, $(r(1))\n" + else + body *= "\tadd\t$sc, $(r(0))\n" + body *= "\tadc\t\$0, $(r(1))\n" + body *= "\ttest\t%al, %al\n" + end + body *= "\n" + + # Intermediate multiplication chains + for ix in 1:min(n - 2, (n != 12) ? 8 : 9) + body *= "\tmov\t$ix*8($bp), %rdx\n" + + body *= "\tmulx\t$(n - 2 - ix)*8($ap), $sc, $(r(ix + 2))\n" + body *= "\tmulx\t$(n - 1 - ix)*8($ap), $sc, $(r(ix + 1))\n" + body *= "\tadcx\t$(r(ix + 2)), $(r(0))\n" + body *= "\tadox\t$sc, $(r(0))\n" + body *= "\tadcx\t$(r(ix + 1)), $(r(1))\n" + + for jx in 1:ix - 1 + body *= "\tmulx\t$(n - 1 - ix + jx)*8($ap), $sc, $(r(ix + 1))\n" + body *= "\tadox\t$sc, $(r(jx + 0))\n" + body *= "\tadcx\t$(r(ix + 1)), $(r(jx + 1))\n" + end + + body *= "\tmulx\t$(n - 1)*8($ap), $sc, $(r(ix + 1))\n" + body *= "\tadox\t$sc, $(r(ix + 0))\n" + if n == 12 + body *= "\tmov\t\$0, $(R32(zr))\n" + end + body *= "\tadcx\t$zr, $(r(ix + 1))\n" + body *= "\tadox\t$zr, $(r(ix + 1))\n" + + body *= "\n" + end + + if n >= 11 + N = n - 1 + body *= "\tmov\t$(n - 2)*8($bp), %rdx\n" + for ix in 0:n - 2 + body *= "\tmulx\t$ix*8($ap), $sc, $(r(N))\n" + if ix == 0 + body *= "\tadcx\t$(r(N)), $(r(ix + 0))\n" + else + body *= "\tadox\t$sc, $(r(ix - 1))\n" + body *= "\tadcx\t$(r(N)), $(r(ix + 0))\n" + end + end + body *= "\tmulx\t$(n - 1)*8($ap), $sc, $(r(N))\n" + body *= "\tadox\t$sc, $(r(N - 1))\n" + if n == 12 + body *= "\tmov\t\$0, $(R32(zr))\n" + end + body *= "\tadcx\t$zr, $(r(N))\n" + body *= "\tadox\t$zr, $(r(N))\n" + body *= "\n" + end + + # Last multiplication chain + body *= "\tmov\t$(n - 1)*8($bp), %rdx\n" + for ix in 0:n - 2 + body *= "\tmulx\t$ix*8($ap), $sc, $(r(n))\n" + if ix % 2 == 0 + body *= "\tadcx\t$sc, $(r(ix + 0))\n" + body *= "\tadcx\t$(r(n)), $(r(ix + 1))\n" + else + body *= "\tadox\t$sc, $(r(ix + 0))\n" + body *= "\tadox\t$(r(n)), $(r(ix + 1))\n" + end + end + body *= "\tmulx\t$(n - 1)*8($ap), $sc, $(r(n))\n" + if (n - 1) % 2 == 0 + body *= "\tadcx\t$sc, $(r(n - 1))\n" + else + body *= "\tadox\t$sc, $(r(n - 1))\n" + end + if n == 12 + body *= "\tmov\t\$0, $(R32(zr))\n" + end + body *= "\tadcx\t$zr, $(r(n))\n" + if n == 9 + # Use scrap register for storing pointer to res + res = sc + body *= "\tpop\t$res\n" + end + body *= "\tadox\t$zr, $(r(n))\n" + body *= "\n" + + # Check if normalised + body *= "\tmov\t\$0, %rdx\n" + body *= "\ttest\t$(r(n)), $(r(n))\n" + body *= "\tsetns\t$(R8("%rdx"))\n" + body *= "\tjs\t.Lcontinue\n" + + # If not normalised, shift by one + body *= "\tadd\t$(r(0)), $(r(0))\n" + for ix in 1:n + body *= "\tadc\t$(r(ix)), $(r(ix))\n" + end + + body *= ".Lcontinue:\n" + if n == 10 || n == 11 + res, zr = sc, "error zr" + body *= "\tvmovq\t%xmm1, $res\n" + body *= "\tvmovq\t%xmm0, %rsp\n" + elseif n == 12 + res = sc + body *= "\tvmovq\t%xmm1, $res\n" + end + + # Store result + for ix in 1:n + body *= "\tmov\t$(r(ix)), $(ix - 1)*8($res)\n" + end + body *= "\n" + + # Pop + if n == 12 + body *= "\tvmovq\t%xmm0, %rsp\n" + end + for ix in min(n - 2, 6):-1:1 + body *= "\tpop\t$(__regs[ix])\n" + end + body *= "\n" + + if debug + print(body * "\tret\n") + else + return body * "\tret\n" + end +end + ############################################################################### # Generate file ############################################################################### @@ -396,8 +685,27 @@ function gen_mulhigh(m::Int, nofile::Bool = false) end end +function gen_mulhigh_normalised(m::Int, nofile::Bool = false) + (pre, post) = function_pre_post("flint_mpn_mulhigh_normalised_$m") + functionbody = mulhigh_normalised(m) + + str = "$copyright\n$preamble\n$pre$functionbody$post" + + if nofile + print(str) + else + path = String(@__DIR__) * "/../src/mpn_extras/broadwell/mulhigh_normalised_$m.asm" + file = open(path, "w") + write(file, str) + close(file) + end +end + function gen_all() for m in 1:12 gen_mulhigh(m) end + for m in 1:12 + gen_mulhigh_normalised(m) + end end diff --git a/doc/source/mpn_extras.rst b/doc/source/mpn_extras.rst index 2551af3845..689ec4e30a 100644 --- a/doc/source/mpn_extras.rst +++ b/doc/source/mpn_extras.rst @@ -77,8 +77,10 @@ Multiplication is typically non-exact for sizes larger than one. The highest error is *n + 2* ULP in the returned limb. - .. note:: This function may not exist on processors not supporting the ADX - instruction set. +.. note:: + + This function may not exist on processors not supporting the ADX instruction + set. Divisibility diff --git a/src/mpn_extras.h b/src/mpn_extras.h index 7594cbe6b2..5963c7d3a1 100644 --- a/src/mpn_extras.h +++ b/src/mpn_extras.h @@ -237,7 +237,11 @@ flint_mpn_sqr(mp_ptr r, mp_srcptr x, mp_size_t n) #define FLINT_HAVE_MULHIGH_N_FUNC(n) ((n) <= FLINT_MPN_MULHIGH_N_FUNC_TAB_WIDTH) +struct mp_limb_pair_t { mp_limb_t m1; mp_limb_t m2; }; +typedef struct mp_limb_pair_t (* flint_mpn_mulhigh_normalised_func_t)(mp_ptr, mp_srcptr, mp_srcptr); + FLINT_DLL extern const flint_mpn_mul_func_t flint_mpn_mulhigh_n_func_tab[]; +FLINT_DLL extern const flint_mpn_mulhigh_normalised_func_t flint_mpn_mulhigh_normalised_n_func_tab[]; /* NOTE: Aliasing is allowed! */ /* FIXME: How do we proceed for bigger n? */ @@ -250,6 +254,15 @@ mp_limb_t flint_mpn_mulhigh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n return flint_mpn_mulhigh_n_func_tab[n - 1](rp, xp, yp); } +FLINT_FORCE_INLINE +struct mp_limb_pair_t flint_mpn_mulhigh_normalised_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) +{ + FLINT_ASSERT(n >= 1); + FLINT_ASSERT(FLINT_HAVE_MULHIGH_N_FUNC(n)); + + return flint_mpn_mulhigh_normalised_n_func_tab[n - 1](rp, xp, yp); +} + /* return the high limb of a two limb left shift by n < GMP_LIMB_BITS bits. Note: if GMP_NAIL_BITS != 0, the rest of flint is already broken anyways. diff --git a/src/mpn_extras/broadwell/mulhigh_normalised_1.asm b/src/mpn_extras/broadwell/mulhigh_normalised_1.asm new file mode 100644 index 0000000000..39093a6cab --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_normalised_1.asm @@ -0,0 +1,36 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_normalised_1) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_normalised_1) + +FUNC(flint_mpn_mulhigh_normalised_1): + .cfi_startproc + mov 0*8(%rdx), %rdx + mulx 0*8(%rsi), %rax, %rcx + mov $0, %rdx + test %rcx, %rcx + setns %dl + js .Lcontinue + add %rax, %rax + adc %rcx, %rcx +.Lcontinue: + mov %rcx, 0*8(%rdi) + + ret +.flint_mpn_mulhigh_normalised_1_end: +SIZE(flint_mpn_mulhigh_normalised_1, .flint_mpn_mulhigh_normalised_1_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_normalised_10.asm b/src/mpn_extras/broadwell/mulhigh_normalised_10.asm new file mode 100644 index 0000000000..6680c531c0 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_normalised_10.asm @@ -0,0 +1,284 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_normalised_10) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_normalised_10) + +FUNC(flint_mpn_mulhigh_normalised_10): + .cfi_startproc + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + vmovq %rsp, %xmm0 + vmovq %rdi, %xmm1 + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %esp, %esp + + mulx 8*8(%rsi), %r8, %rax + mulx 9*8(%rsi), %r8, %r9 + adcx %r8, %rax + adcx %rsp, %r9 + + mov 1*8(%rcx), %rdx + mulx 7*8(%rsi), %r8, %r11 + mulx 8*8(%rsi), %r8, %r10 + adcx %r11, %rax + adox %r8, %rax + adcx %r10, %r9 + mulx 9*8(%rsi), %r8, %r10 + adox %r8, %r9 + adcx %rsp, %r10 + adox %rsp, %r10 + + mov 2*8(%rcx), %rdx + mulx 6*8(%rsi), %r8, %rbx + mulx 7*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r9 + mulx 8*8(%rsi), %r8, %r11 + adox %r8, %r9 + adcx %r11, %r10 + mulx 9*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %rsp, %r11 + adox %rsp, %r11 + + mov 3*8(%rcx), %rdx + mulx 5*8(%rsi), %r8, %rbp + mulx 6*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r9 + mulx 7*8(%rsi), %r8, %rbx + adox %r8, %r9 + adcx %rbx, %r10 + mulx 8*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 9*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %rsp, %rbx + adox %rsp, %rbx + + mov 4*8(%rcx), %rdx + mulx 4*8(%rsi), %r8, %r12 + mulx 5*8(%rsi), %r8, %rbp + adcx %r12, %rax + adox %r8, %rax + adcx %rbp, %r9 + mulx 6*8(%rsi), %r8, %rbp + adox %r8, %r9 + adcx %rbp, %r10 + mulx 7*8(%rsi), %r8, %rbp + adox %r8, %r10 + adcx %rbp, %r11 + mulx 8*8(%rsi), %r8, %rbp + adox %r8, %r11 + adcx %rbp, %rbx + mulx 9*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %rsp, %rbp + adox %rsp, %rbp + + mov 5*8(%rcx), %rdx + mulx 3*8(%rsi), %r8, %r13 + mulx 4*8(%rsi), %r8, %r12 + adcx %r13, %rax + adox %r8, %rax + adcx %r12, %r9 + mulx 5*8(%rsi), %r8, %r12 + adox %r8, %r9 + adcx %r12, %r10 + mulx 6*8(%rsi), %r8, %r12 + adox %r8, %r10 + adcx %r12, %r11 + mulx 7*8(%rsi), %r8, %r12 + adox %r8, %r11 + adcx %r12, %rbx + mulx 8*8(%rsi), %r8, %r12 + adox %r8, %rbx + adcx %r12, %rbp + mulx 9*8(%rsi), %r8, %r12 + adox %r8, %rbp + adcx %rsp, %r12 + adox %rsp, %r12 + + mov 6*8(%rcx), %rdx + mulx 2*8(%rsi), %r8, %r14 + mulx 3*8(%rsi), %r8, %r13 + adcx %r14, %rax + adox %r8, %rax + adcx %r13, %r9 + mulx 4*8(%rsi), %r8, %r13 + adox %r8, %r9 + adcx %r13, %r10 + mulx 5*8(%rsi), %r8, %r13 + adox %r8, %r10 + adcx %r13, %r11 + mulx 6*8(%rsi), %r8, %r13 + adox %r8, %r11 + adcx %r13, %rbx + mulx 7*8(%rsi), %r8, %r13 + adox %r8, %rbx + adcx %r13, %rbp + mulx 8*8(%rsi), %r8, %r13 + adox %r8, %rbp + adcx %r13, %r12 + mulx 9*8(%rsi), %r8, %r13 + adox %r8, %r12 + adcx %rsp, %r13 + adox %rsp, %r13 + + mov 7*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %r15 + mulx 2*8(%rsi), %r8, %r14 + adcx %r15, %rax + adox %r8, %rax + adcx %r14, %r9 + mulx 3*8(%rsi), %r8, %r14 + adox %r8, %r9 + adcx %r14, %r10 + mulx 4*8(%rsi), %r8, %r14 + adox %r8, %r10 + adcx %r14, %r11 + mulx 5*8(%rsi), %r8, %r14 + adox %r8, %r11 + adcx %r14, %rbx + mulx 6*8(%rsi), %r8, %r14 + adox %r8, %rbx + adcx %r14, %rbp + mulx 7*8(%rsi), %r8, %r14 + adox %r8, %rbp + adcx %r14, %r12 + mulx 8*8(%rsi), %r8, %r14 + adox %r8, %r12 + adcx %r14, %r13 + mulx 9*8(%rsi), %r8, %r14 + adox %r8, %r13 + adcx %rsp, %r14 + adox %rsp, %r14 + + mov 8*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rdi + mulx 1*8(%rsi), %r8, %r15 + adcx %rdi, %rax + adox %r8, %rax + adcx %r15, %r9 + mulx 2*8(%rsi), %r8, %r15 + adox %r8, %r9 + adcx %r15, %r10 + mulx 3*8(%rsi), %r8, %r15 + adox %r8, %r10 + adcx %r15, %r11 + mulx 4*8(%rsi), %r8, %r15 + adox %r8, %r11 + adcx %r15, %rbx + mulx 5*8(%rsi), %r8, %r15 + adox %r8, %rbx + adcx %r15, %rbp + mulx 6*8(%rsi), %r8, %r15 + adox %r8, %rbp + adcx %r15, %r12 + mulx 7*8(%rsi), %r8, %r15 + adox %r8, %r12 + adcx %r15, %r13 + mulx 8*8(%rsi), %r8, %r15 + adox %r8, %r13 + adcx %r15, %r14 + mulx 9*8(%rsi), %r8, %r15 + adox %r8, %r14 + adcx %rsp, %r15 + adox %rsp, %r15 + + mov 9*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rdi + adcx %r8, %rax + adcx %rdi, %r9 + mulx 1*8(%rsi), %r8, %rdi + adox %r8, %r9 + adox %rdi, %r10 + mulx 2*8(%rsi), %r8, %rdi + adcx %r8, %r10 + adcx %rdi, %r11 + mulx 3*8(%rsi), %r8, %rdi + adox %r8, %r11 + adox %rdi, %rbx + mulx 4*8(%rsi), %r8, %rdi + adcx %r8, %rbx + adcx %rdi, %rbp + mulx 5*8(%rsi), %r8, %rdi + adox %r8, %rbp + adox %rdi, %r12 + mulx 6*8(%rsi), %r8, %rdi + adcx %r8, %r12 + adcx %rdi, %r13 + mulx 7*8(%rsi), %r8, %rdi + adox %r8, %r13 + adox %rdi, %r14 + mulx 8*8(%rsi), %r8, %rdi + adcx %r8, %r14 + adcx %rdi, %r15 + mulx 9*8(%rsi), %r8, %rdi + adox %r8, %r15 + adcx %rsp, %rdi + adox %rsp, %rdi + + mov $0, %rdx + test %rdi, %rdi + setns %dl + js .Lcontinue + add %rax, %rax + adc %r9, %r9 + adc %r10, %r10 + adc %r11, %r11 + adc %rbx, %rbx + adc %rbp, %rbp + adc %r12, %r12 + adc %r13, %r13 + adc %r14, %r14 + adc %r15, %r15 + adc %rdi, %rdi +.Lcontinue: + vmovq %xmm1, %r8 + vmovq %xmm0, %rsp + mov %r9, 0*8(%r8) + mov %r10, 1*8(%r8) + mov %r11, 2*8(%r8) + mov %rbx, 3*8(%r8) + mov %rbp, 4*8(%r8) + mov %r12, 5*8(%r8) + mov %r13, 6*8(%r8) + mov %r14, 7*8(%r8) + mov %r15, 8*8(%r8) + mov %rdi, 9*8(%r8) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_normalised_10_end: +SIZE(flint_mpn_mulhigh_normalised_10, .flint_mpn_mulhigh_normalised_10_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_normalised_11.asm b/src/mpn_extras/broadwell/mulhigh_normalised_11.asm new file mode 100644 index 0000000000..bb61d7f2aa --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_normalised_11.asm @@ -0,0 +1,324 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_normalised_11) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_normalised_11) + +FUNC(flint_mpn_mulhigh_normalised_11): + .cfi_startproc + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + vmovq %rsp, %xmm0 + vmovq %rdi, %xmm1 + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %esp, %esp + + mulx 9*8(%rsi), %r8, %rax + mulx 10*8(%rsi), %r8, %r9 + adcx %r8, %rax + adcx %rsp, %r9 + + mov 1*8(%rcx), %rdx + mulx 8*8(%rsi), %r8, %r11 + mulx 9*8(%rsi), %r8, %r10 + adcx %r11, %rax + adox %r8, %rax + adcx %r10, %r9 + mulx 10*8(%rsi), %r8, %r10 + adox %r8, %r9 + adcx %rsp, %r10 + adox %rsp, %r10 + + mov 2*8(%rcx), %rdx + mulx 7*8(%rsi), %r8, %rbx + mulx 8*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r9 + mulx 9*8(%rsi), %r8, %r11 + adox %r8, %r9 + adcx %r11, %r10 + mulx 10*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %rsp, %r11 + adox %rsp, %r11 + + mov 3*8(%rcx), %rdx + mulx 6*8(%rsi), %r8, %rbp + mulx 7*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r9 + mulx 8*8(%rsi), %r8, %rbx + adox %r8, %r9 + adcx %rbx, %r10 + mulx 9*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 10*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %rsp, %rbx + adox %rsp, %rbx + + mov 4*8(%rcx), %rdx + mulx 5*8(%rsi), %r8, %r12 + mulx 6*8(%rsi), %r8, %rbp + adcx %r12, %rax + adox %r8, %rax + adcx %rbp, %r9 + mulx 7*8(%rsi), %r8, %rbp + adox %r8, %r9 + adcx %rbp, %r10 + mulx 8*8(%rsi), %r8, %rbp + adox %r8, %r10 + adcx %rbp, %r11 + mulx 9*8(%rsi), %r8, %rbp + adox %r8, %r11 + adcx %rbp, %rbx + mulx 10*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %rsp, %rbp + adox %rsp, %rbp + + mov 5*8(%rcx), %rdx + mulx 4*8(%rsi), %r8, %r13 + mulx 5*8(%rsi), %r8, %r12 + adcx %r13, %rax + adox %r8, %rax + adcx %r12, %r9 + mulx 6*8(%rsi), %r8, %r12 + adox %r8, %r9 + adcx %r12, %r10 + mulx 7*8(%rsi), %r8, %r12 + adox %r8, %r10 + adcx %r12, %r11 + mulx 8*8(%rsi), %r8, %r12 + adox %r8, %r11 + adcx %r12, %rbx + mulx 9*8(%rsi), %r8, %r12 + adox %r8, %rbx + adcx %r12, %rbp + mulx 10*8(%rsi), %r8, %r12 + adox %r8, %rbp + adcx %rsp, %r12 + adox %rsp, %r12 + + mov 6*8(%rcx), %rdx + mulx 3*8(%rsi), %r8, %r14 + mulx 4*8(%rsi), %r8, %r13 + adcx %r14, %rax + adox %r8, %rax + adcx %r13, %r9 + mulx 5*8(%rsi), %r8, %r13 + adox %r8, %r9 + adcx %r13, %r10 + mulx 6*8(%rsi), %r8, %r13 + adox %r8, %r10 + adcx %r13, %r11 + mulx 7*8(%rsi), %r8, %r13 + adox %r8, %r11 + adcx %r13, %rbx + mulx 8*8(%rsi), %r8, %r13 + adox %r8, %rbx + adcx %r13, %rbp + mulx 9*8(%rsi), %r8, %r13 + adox %r8, %rbp + adcx %r13, %r12 + mulx 10*8(%rsi), %r8, %r13 + adox %r8, %r12 + adcx %rsp, %r13 + adox %rsp, %r13 + + mov 7*8(%rcx), %rdx + mulx 2*8(%rsi), %r8, %r15 + mulx 3*8(%rsi), %r8, %r14 + adcx %r15, %rax + adox %r8, %rax + adcx %r14, %r9 + mulx 4*8(%rsi), %r8, %r14 + adox %r8, %r9 + adcx %r14, %r10 + mulx 5*8(%rsi), %r8, %r14 + adox %r8, %r10 + adcx %r14, %r11 + mulx 6*8(%rsi), %r8, %r14 + adox %r8, %r11 + adcx %r14, %rbx + mulx 7*8(%rsi), %r8, %r14 + adox %r8, %rbx + adcx %r14, %rbp + mulx 8*8(%rsi), %r8, %r14 + adox %r8, %rbp + adcx %r14, %r12 + mulx 9*8(%rsi), %r8, %r14 + adox %r8, %r12 + adcx %r14, %r13 + mulx 10*8(%rsi), %r8, %r14 + adox %r8, %r13 + adcx %rsp, %r14 + adox %rsp, %r14 + + mov 8*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %rdi + mulx 2*8(%rsi), %r8, %r15 + adcx %rdi, %rax + adox %r8, %rax + adcx %r15, %r9 + mulx 3*8(%rsi), %r8, %r15 + adox %r8, %r9 + adcx %r15, %r10 + mulx 4*8(%rsi), %r8, %r15 + adox %r8, %r10 + adcx %r15, %r11 + mulx 5*8(%rsi), %r8, %r15 + adox %r8, %r11 + adcx %r15, %rbx + mulx 6*8(%rsi), %r8, %r15 + adox %r8, %rbx + adcx %r15, %rbp + mulx 7*8(%rsi), %r8, %r15 + adox %r8, %rbp + adcx %r15, %r12 + mulx 8*8(%rsi), %r8, %r15 + adox %r8, %r12 + adcx %r15, %r13 + mulx 9*8(%rsi), %r8, %r15 + adox %r8, %r13 + adcx %r15, %r14 + mulx 10*8(%rsi), %r8, %r15 + adox %r8, %r14 + adcx %rsp, %r15 + adox %rsp, %r15 + + mov 9*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rdi + adcx %rdi, %rax + mulx 1*8(%rsi), %r8, %rdi + adox %r8, %rax + adcx %rdi, %r9 + mulx 2*8(%rsi), %r8, %rdi + adox %r8, %r9 + adcx %rdi, %r10 + mulx 3*8(%rsi), %r8, %rdi + adox %r8, %r10 + adcx %rdi, %r11 + mulx 4*8(%rsi), %r8, %rdi + adox %r8, %r11 + adcx %rdi, %rbx + mulx 5*8(%rsi), %r8, %rdi + adox %r8, %rbx + adcx %rdi, %rbp + mulx 6*8(%rsi), %r8, %rdi + adox %r8, %rbp + adcx %rdi, %r12 + mulx 7*8(%rsi), %r8, %rdi + adox %r8, %r12 + adcx %rdi, %r13 + mulx 8*8(%rsi), %r8, %rdi + adox %r8, %r13 + adcx %rdi, %r14 + mulx 9*8(%rsi), %r8, %rdi + adox %r8, %r14 + adcx %rdi, %r15 + mulx 10*8(%rsi), %r8, %rdi + adox %r8, %r15 + adcx %rsp, %rdi + adox %rsp, %rdi + + mov 10*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rcx + adcx %r8, %rax + adcx %rcx, %r9 + mulx 1*8(%rsi), %r8, %rcx + adox %r8, %r9 + adox %rcx, %r10 + mulx 2*8(%rsi), %r8, %rcx + adcx %r8, %r10 + adcx %rcx, %r11 + mulx 3*8(%rsi), %r8, %rcx + adox %r8, %r11 + adox %rcx, %rbx + mulx 4*8(%rsi), %r8, %rcx + adcx %r8, %rbx + adcx %rcx, %rbp + mulx 5*8(%rsi), %r8, %rcx + adox %r8, %rbp + adox %rcx, %r12 + mulx 6*8(%rsi), %r8, %rcx + adcx %r8, %r12 + adcx %rcx, %r13 + mulx 7*8(%rsi), %r8, %rcx + adox %r8, %r13 + adox %rcx, %r14 + mulx 8*8(%rsi), %r8, %rcx + adcx %r8, %r14 + adcx %rcx, %r15 + mulx 9*8(%rsi), %r8, %rcx + adox %r8, %r15 + adox %rcx, %rdi + mulx 10*8(%rsi), %r8, %rcx + adcx %r8, %rdi + adcx %rsp, %rcx + adox %rsp, %rcx + + mov $0, %rdx + test %rcx, %rcx + setns %dl + js .Lcontinue + add %rax, %rax + adc %r9, %r9 + adc %r10, %r10 + adc %r11, %r11 + adc %rbx, %rbx + adc %rbp, %rbp + adc %r12, %r12 + adc %r13, %r13 + adc %r14, %r14 + adc %r15, %r15 + adc %rdi, %rdi + adc %rcx, %rcx +.Lcontinue: + vmovq %xmm1, %r8 + vmovq %xmm0, %rsp + mov %r9, 0*8(%r8) + mov %r10, 1*8(%r8) + mov %r11, 2*8(%r8) + mov %rbx, 3*8(%r8) + mov %rbp, 4*8(%r8) + mov %r12, 5*8(%r8) + mov %r13, 6*8(%r8) + mov %r14, 7*8(%r8) + mov %r15, 8*8(%r8) + mov %rdi, 9*8(%r8) + mov %rcx, 10*8(%r8) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_normalised_11_end: +SIZE(flint_mpn_mulhigh_normalised_11, .flint_mpn_mulhigh_normalised_11_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_normalised_12.asm b/src/mpn_extras/broadwell/mulhigh_normalised_12.asm new file mode 100644 index 0000000000..f9c4940dc4 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_normalised_12.asm @@ -0,0 +1,378 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_normalised_12) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_normalised_12) + +FUNC(flint_mpn_mulhigh_normalised_12): + .cfi_startproc + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + vmovq %rsp, %xmm0 + vmovq %rdi, %xmm1 + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + + mulx 10*8(%rsi), %rsp, %rax + mulx 11*8(%rsi), %rsp, %r8 + add %rsp, %rax + adc $0, %r8 + test %al, %al + + mov 1*8(%rcx), %rdx + mulx 9*8(%rsi), %rsp, %r10 + mulx 10*8(%rsi), %rsp, %r9 + adcx %r10, %rax + adox %rsp, %rax + adcx %r9, %r8 + mulx 11*8(%rsi), %rsp, %r9 + adox %rsp, %r8 + mov $0, %esp + adcx %rsp, %r9 + adox %rsp, %r9 + + mov 2*8(%rcx), %rdx + mulx 8*8(%rsi), %rsp, %r11 + mulx 9*8(%rsi), %rsp, %r10 + adcx %r11, %rax + adox %rsp, %rax + adcx %r10, %r8 + mulx 10*8(%rsi), %rsp, %r10 + adox %rsp, %r8 + adcx %r10, %r9 + mulx 11*8(%rsi), %rsp, %r10 + adox %rsp, %r9 + mov $0, %esp + adcx %rsp, %r10 + adox %rsp, %r10 + + mov 3*8(%rcx), %rdx + mulx 7*8(%rsi), %rsp, %rbx + mulx 8*8(%rsi), %rsp, %r11 + adcx %rbx, %rax + adox %rsp, %rax + adcx %r11, %r8 + mulx 9*8(%rsi), %rsp, %r11 + adox %rsp, %r8 + adcx %r11, %r9 + mulx 10*8(%rsi), %rsp, %r11 + adox %rsp, %r9 + adcx %r11, %r10 + mulx 11*8(%rsi), %rsp, %r11 + adox %rsp, %r10 + mov $0, %esp + adcx %rsp, %r11 + adox %rsp, %r11 + + mov 4*8(%rcx), %rdx + mulx 6*8(%rsi), %rsp, %rbp + mulx 7*8(%rsi), %rsp, %rbx + adcx %rbp, %rax + adox %rsp, %rax + adcx %rbx, %r8 + mulx 8*8(%rsi), %rsp, %rbx + adox %rsp, %r8 + adcx %rbx, %r9 + mulx 9*8(%rsi), %rsp, %rbx + adox %rsp, %r9 + adcx %rbx, %r10 + mulx 10*8(%rsi), %rsp, %rbx + adox %rsp, %r10 + adcx %rbx, %r11 + mulx 11*8(%rsi), %rsp, %rbx + adox %rsp, %r11 + mov $0, %esp + adcx %rsp, %rbx + adox %rsp, %rbx + + mov 5*8(%rcx), %rdx + mulx 5*8(%rsi), %rsp, %r12 + mulx 6*8(%rsi), %rsp, %rbp + adcx %r12, %rax + adox %rsp, %rax + adcx %rbp, %r8 + mulx 7*8(%rsi), %rsp, %rbp + adox %rsp, %r8 + adcx %rbp, %r9 + mulx 8*8(%rsi), %rsp, %rbp + adox %rsp, %r9 + adcx %rbp, %r10 + mulx 9*8(%rsi), %rsp, %rbp + adox %rsp, %r10 + adcx %rbp, %r11 + mulx 10*8(%rsi), %rsp, %rbp + adox %rsp, %r11 + adcx %rbp, %rbx + mulx 11*8(%rsi), %rsp, %rbp + adox %rsp, %rbx + mov $0, %esp + adcx %rsp, %rbp + adox %rsp, %rbp + + mov 6*8(%rcx), %rdx + mulx 4*8(%rsi), %rsp, %r13 + mulx 5*8(%rsi), %rsp, %r12 + adcx %r13, %rax + adox %rsp, %rax + adcx %r12, %r8 + mulx 6*8(%rsi), %rsp, %r12 + adox %rsp, %r8 + adcx %r12, %r9 + mulx 7*8(%rsi), %rsp, %r12 + adox %rsp, %r9 + adcx %r12, %r10 + mulx 8*8(%rsi), %rsp, %r12 + adox %rsp, %r10 + adcx %r12, %r11 + mulx 9*8(%rsi), %rsp, %r12 + adox %rsp, %r11 + adcx %r12, %rbx + mulx 10*8(%rsi), %rsp, %r12 + adox %rsp, %rbx + adcx %r12, %rbp + mulx 11*8(%rsi), %rsp, %r12 + adox %rsp, %rbp + mov $0, %esp + adcx %rsp, %r12 + adox %rsp, %r12 + + mov 7*8(%rcx), %rdx + mulx 3*8(%rsi), %rsp, %r14 + mulx 4*8(%rsi), %rsp, %r13 + adcx %r14, %rax + adox %rsp, %rax + adcx %r13, %r8 + mulx 5*8(%rsi), %rsp, %r13 + adox %rsp, %r8 + adcx %r13, %r9 + mulx 6*8(%rsi), %rsp, %r13 + adox %rsp, %r9 + adcx %r13, %r10 + mulx 7*8(%rsi), %rsp, %r13 + adox %rsp, %r10 + adcx %r13, %r11 + mulx 8*8(%rsi), %rsp, %r13 + adox %rsp, %r11 + adcx %r13, %rbx + mulx 9*8(%rsi), %rsp, %r13 + adox %rsp, %rbx + adcx %r13, %rbp + mulx 10*8(%rsi), %rsp, %r13 + adox %rsp, %rbp + adcx %r13, %r12 + mulx 11*8(%rsi), %rsp, %r13 + adox %rsp, %r12 + mov $0, %esp + adcx %rsp, %r13 + adox %rsp, %r13 + + mov 8*8(%rcx), %rdx + mulx 2*8(%rsi), %rsp, %r15 + mulx 3*8(%rsi), %rsp, %r14 + adcx %r15, %rax + adox %rsp, %rax + adcx %r14, %r8 + mulx 4*8(%rsi), %rsp, %r14 + adox %rsp, %r8 + adcx %r14, %r9 + mulx 5*8(%rsi), %rsp, %r14 + adox %rsp, %r9 + adcx %r14, %r10 + mulx 6*8(%rsi), %rsp, %r14 + adox %rsp, %r10 + adcx %r14, %r11 + mulx 7*8(%rsi), %rsp, %r14 + adox %rsp, %r11 + adcx %r14, %rbx + mulx 8*8(%rsi), %rsp, %r14 + adox %rsp, %rbx + adcx %r14, %rbp + mulx 9*8(%rsi), %rsp, %r14 + adox %rsp, %rbp + adcx %r14, %r12 + mulx 10*8(%rsi), %rsp, %r14 + adox %rsp, %r12 + adcx %r14, %r13 + mulx 11*8(%rsi), %rsp, %r14 + adox %rsp, %r13 + mov $0, %esp + adcx %rsp, %r14 + adox %rsp, %r14 + + mov 9*8(%rcx), %rdx + mulx 1*8(%rsi), %rsp, %rdi + mulx 2*8(%rsi), %rsp, %r15 + adcx %rdi, %rax + adox %rsp, %rax + adcx %r15, %r8 + mulx 3*8(%rsi), %rsp, %r15 + adox %rsp, %r8 + adcx %r15, %r9 + mulx 4*8(%rsi), %rsp, %r15 + adox %rsp, %r9 + adcx %r15, %r10 + mulx 5*8(%rsi), %rsp, %r15 + adox %rsp, %r10 + adcx %r15, %r11 + mulx 6*8(%rsi), %rsp, %r15 + adox %rsp, %r11 + adcx %r15, %rbx + mulx 7*8(%rsi), %rsp, %r15 + adox %rsp, %rbx + adcx %r15, %rbp + mulx 8*8(%rsi), %rsp, %r15 + adox %rsp, %rbp + adcx %r15, %r12 + mulx 9*8(%rsi), %rsp, %r15 + adox %rsp, %r12 + adcx %r15, %r13 + mulx 10*8(%rsi), %rsp, %r15 + adox %rsp, %r13 + adcx %r15, %r14 + mulx 11*8(%rsi), %rsp, %r15 + adox %rsp, %r14 + mov $0, %esp + adcx %rsp, %r15 + adox %rsp, %r15 + + mov 10*8(%rcx), %rdx + mulx 0*8(%rsi), %rsp, %rdi + adcx %rdi, %rax + mulx 1*8(%rsi), %rsp, %rdi + adox %rsp, %rax + adcx %rdi, %r8 + mulx 2*8(%rsi), %rsp, %rdi + adox %rsp, %r8 + adcx %rdi, %r9 + mulx 3*8(%rsi), %rsp, %rdi + adox %rsp, %r9 + adcx %rdi, %r10 + mulx 4*8(%rsi), %rsp, %rdi + adox %rsp, %r10 + adcx %rdi, %r11 + mulx 5*8(%rsi), %rsp, %rdi + adox %rsp, %r11 + adcx %rdi, %rbx + mulx 6*8(%rsi), %rsp, %rdi + adox %rsp, %rbx + adcx %rdi, %rbp + mulx 7*8(%rsi), %rsp, %rdi + adox %rsp, %rbp + adcx %rdi, %r12 + mulx 8*8(%rsi), %rsp, %rdi + adox %rsp, %r12 + adcx %rdi, %r13 + mulx 9*8(%rsi), %rsp, %rdi + adox %rsp, %r13 + adcx %rdi, %r14 + mulx 10*8(%rsi), %rsp, %rdi + adox %rsp, %r14 + adcx %rdi, %r15 + mulx 11*8(%rsi), %rsp, %rdi + adox %rsp, %r15 + mov $0, %esp + adcx %rsp, %rdi + adox %rsp, %rdi + + mov 11*8(%rcx), %rdx + mulx 0*8(%rsi), %rsp, %rcx + adcx %rsp, %rax + adcx %rcx, %r8 + mulx 1*8(%rsi), %rsp, %rcx + adox %rsp, %r8 + adox %rcx, %r9 + mulx 2*8(%rsi), %rsp, %rcx + adcx %rsp, %r9 + adcx %rcx, %r10 + mulx 3*8(%rsi), %rsp, %rcx + adox %rsp, %r10 + adox %rcx, %r11 + mulx 4*8(%rsi), %rsp, %rcx + adcx %rsp, %r11 + adcx %rcx, %rbx + mulx 5*8(%rsi), %rsp, %rcx + adox %rsp, %rbx + adox %rcx, %rbp + mulx 6*8(%rsi), %rsp, %rcx + adcx %rsp, %rbp + adcx %rcx, %r12 + mulx 7*8(%rsi), %rsp, %rcx + adox %rsp, %r12 + adox %rcx, %r13 + mulx 8*8(%rsi), %rsp, %rcx + adcx %rsp, %r13 + adcx %rcx, %r14 + mulx 9*8(%rsi), %rsp, %rcx + adox %rsp, %r14 + adox %rcx, %r15 + mulx 10*8(%rsi), %rsp, %rcx + adcx %rsp, %r15 + adcx %rcx, %rdi + mulx 11*8(%rsi), %rsp, %rcx + adox %rsp, %rdi + mov $0, %esp + adcx %rsp, %rcx + adox %rsp, %rcx + + mov $0, %rdx + test %rcx, %rcx + setns %dl + js .Lcontinue + add %rax, %rax + adc %r8, %r8 + adc %r9, %r9 + adc %r10, %r10 + adc %r11, %r11 + adc %rbx, %rbx + adc %rbp, %rbp + adc %r12, %r12 + adc %r13, %r13 + adc %r14, %r14 + adc %r15, %r15 + adc %rdi, %rdi + adc %rcx, %rcx +.Lcontinue: + vmovq %xmm1, %rsp + mov %r8, 0*8(%rsp) + mov %r9, 1*8(%rsp) + mov %r10, 2*8(%rsp) + mov %r11, 3*8(%rsp) + mov %rbx, 4*8(%rsp) + mov %rbp, 5*8(%rsp) + mov %r12, 6*8(%rsp) + mov %r13, 7*8(%rsp) + mov %r14, 8*8(%rsp) + mov %r15, 9*8(%rsp) + mov %rdi, 10*8(%rsp) + mov %rcx, 11*8(%rsp) + + vmovq %xmm0, %rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_normalised_12_end: +SIZE(flint_mpn_mulhigh_normalised_12, .flint_mpn_mulhigh_normalised_12_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_normalised_2.asm b/src/mpn_extras/broadwell/mulhigh_normalised_2.asm new file mode 100644 index 0000000000..cc2e550893 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_normalised_2.asm @@ -0,0 +1,51 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_normalised_2) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_normalised_2) + +FUNC(flint_mpn_mulhigh_normalised_2): + .cfi_startproc + mov 1*8(%rdx), %r11 + mov 0*8(%rdx), %rdx + xor %r10d, %r10d + mulx 0*8(%rsi), %r9, %rax + mulx 1*8(%rsi), %r9, %rcx + adcx %r9, %rax + adcx %r10, %rcx + mov %r11, %rdx + mulx 0*8(%rsi), %r9, %r8 + adcx %r9, %rax + adcx %r8, %rcx + mulx 1*8(%rsi), %r9, %r8 + adox %r9, %rcx + adox %r10, %r8 + adcx %r10, %r8 + mov $0, %rdx + test %r8, %r8 + setns %dl + js .Lcontinue + add %rax, %rax + adc %rcx, %rcx + adc %r8, %r8 +.Lcontinue: + mov %rcx, 0*8(%rdi) + mov %r8, 1*8(%rdi) + + ret +.flint_mpn_mulhigh_normalised_2_end: +SIZE(flint_mpn_mulhigh_normalised_2, .flint_mpn_mulhigh_normalised_2_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_normalised_3.asm b/src/mpn_extras/broadwell/mulhigh_normalised_3.asm new file mode 100644 index 0000000000..c3390b14fb --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_normalised_3.asm @@ -0,0 +1,74 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_normalised_3) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_normalised_3) + +FUNC(flint_mpn_mulhigh_normalised_3): + .cfi_startproc + push %rbx + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %r9d, %r9d + + mulx 1*8(%rsi), %r8, %rax + mulx 2*8(%rsi), %r8, %r10 + adcx %r8, %rax + adcx %r9, %r10 + + mov 1*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rbx + mulx 1*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r10 + mulx 2*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %r9, %r11 + adox %r9, %r11 + + mov 2*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rbx + adcx %r8, %rax + adcx %rbx, %r10 + mulx 1*8(%rsi), %r8, %rbx + adox %r8, %r10 + adox %rbx, %r11 + mulx 2*8(%rsi), %r8, %rbx + adcx %r8, %r11 + adcx %r9, %rbx + adox %r9, %rbx + + mov $0, %rdx + test %rbx, %rbx + setns %dl + js .Lcontinue + add %rax, %rax + adc %r10, %r10 + adc %r11, %r11 + adc %rbx, %rbx +.Lcontinue: + mov %r10, 0*8(%rdi) + mov %r11, 1*8(%rdi) + mov %rbx, 2*8(%rdi) + + pop %rbx + + ret +.flint_mpn_mulhigh_normalised_3_end: +SIZE(flint_mpn_mulhigh_normalised_3, .flint_mpn_mulhigh_normalised_3_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_normalised_4.asm b/src/mpn_extras/broadwell/mulhigh_normalised_4.asm new file mode 100644 index 0000000000..2169e2b829 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_normalised_4.asm @@ -0,0 +1,95 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_normalised_4) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_normalised_4) + +FUNC(flint_mpn_mulhigh_normalised_4): + .cfi_startproc + push %rbx + push %rbp + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %r9d, %r9d + + mulx 2*8(%rsi), %r8, %rax + mulx 3*8(%rsi), %r8, %r10 + adcx %r8, %rax + adcx %r9, %r10 + + mov 1*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %rbx + mulx 2*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r10 + mulx 3*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %r9, %r11 + adox %r9, %r11 + + mov 2*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rbp + mulx 1*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r10 + mulx 2*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 3*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %r9, %rbx + adox %r9, %rbx + + mov 3*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rbp + adcx %r8, %rax + adcx %rbp, %r10 + mulx 1*8(%rsi), %r8, %rbp + adox %r8, %r10 + adox %rbp, %r11 + mulx 2*8(%rsi), %r8, %rbp + adcx %r8, %r11 + adcx %rbp, %rbx + mulx 3*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %r9, %rbp + adox %r9, %rbp + + mov $0, %rdx + test %rbp, %rbp + setns %dl + js .Lcontinue + add %rax, %rax + adc %r10, %r10 + adc %r11, %r11 + adc %rbx, %rbx + adc %rbp, %rbp +.Lcontinue: + mov %r10, 0*8(%rdi) + mov %r11, 1*8(%rdi) + mov %rbx, 2*8(%rdi) + mov %rbp, 3*8(%rdi) + + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_normalised_4_end: +SIZE(flint_mpn_mulhigh_normalised_4, .flint_mpn_mulhigh_normalised_4_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_normalised_5.asm b/src/mpn_extras/broadwell/mulhigh_normalised_5.asm new file mode 100644 index 0000000000..4029f8d091 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_normalised_5.asm @@ -0,0 +1,119 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_normalised_5) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_normalised_5) + +FUNC(flint_mpn_mulhigh_normalised_5): + .cfi_startproc + push %rbx + push %rbp + push %r12 + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %r9d, %r9d + + mulx 3*8(%rsi), %r8, %rax + mulx 4*8(%rsi), %r8, %r10 + adcx %r8, %rax + adcx %r9, %r10 + + mov 1*8(%rcx), %rdx + mulx 2*8(%rsi), %r8, %rbx + mulx 3*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r10 + mulx 4*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %r9, %r11 + adox %r9, %r11 + + mov 2*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %rbp + mulx 2*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r10 + mulx 3*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 4*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %r9, %rbx + adox %r9, %rbx + + mov 3*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r12 + mulx 1*8(%rsi), %r8, %rbp + adcx %r12, %rax + adox %r8, %rax + adcx %rbp, %r10 + mulx 2*8(%rsi), %r8, %rbp + adox %r8, %r10 + adcx %rbp, %r11 + mulx 3*8(%rsi), %r8, %rbp + adox %r8, %r11 + adcx %rbp, %rbx + mulx 4*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %r9, %rbp + adox %r9, %rbp + + mov 4*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r12 + adcx %r8, %rax + adcx %r12, %r10 + mulx 1*8(%rsi), %r8, %r12 + adox %r8, %r10 + adox %r12, %r11 + mulx 2*8(%rsi), %r8, %r12 + adcx %r8, %r11 + adcx %r12, %rbx + mulx 3*8(%rsi), %r8, %r12 + adox %r8, %rbx + adox %r12, %rbp + mulx 4*8(%rsi), %r8, %r12 + adcx %r8, %rbp + adcx %r9, %r12 + adox %r9, %r12 + + mov $0, %rdx + test %r12, %r12 + setns %dl + js .Lcontinue + add %rax, %rax + adc %r10, %r10 + adc %r11, %r11 + adc %rbx, %rbx + adc %rbp, %rbp + adc %r12, %r12 +.Lcontinue: + mov %r10, 0*8(%rdi) + mov %r11, 1*8(%rdi) + mov %rbx, 2*8(%rdi) + mov %rbp, 3*8(%rdi) + mov %r12, 4*8(%rdi) + + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_normalised_5_end: +SIZE(flint_mpn_mulhigh_normalised_5, .flint_mpn_mulhigh_normalised_5_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_normalised_6.asm b/src/mpn_extras/broadwell/mulhigh_normalised_6.asm new file mode 100644 index 0000000000..0dcdf64fb6 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_normalised_6.asm @@ -0,0 +1,146 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_normalised_6) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_normalised_6) + +FUNC(flint_mpn_mulhigh_normalised_6): + .cfi_startproc + push %rbx + push %rbp + push %r12 + push %r13 + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %r9d, %r9d + + mulx 4*8(%rsi), %r8, %rax + mulx 5*8(%rsi), %r8, %r10 + adcx %r8, %rax + adcx %r9, %r10 + + mov 1*8(%rcx), %rdx + mulx 3*8(%rsi), %r8, %rbx + mulx 4*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r10 + mulx 5*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %r9, %r11 + adox %r9, %r11 + + mov 2*8(%rcx), %rdx + mulx 2*8(%rsi), %r8, %rbp + mulx 3*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r10 + mulx 4*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 5*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %r9, %rbx + adox %r9, %rbx + + mov 3*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %r12 + mulx 2*8(%rsi), %r8, %rbp + adcx %r12, %rax + adox %r8, %rax + adcx %rbp, %r10 + mulx 3*8(%rsi), %r8, %rbp + adox %r8, %r10 + adcx %rbp, %r11 + mulx 4*8(%rsi), %r8, %rbp + adox %r8, %r11 + adcx %rbp, %rbx + mulx 5*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %r9, %rbp + adox %r9, %rbp + + mov 4*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r13 + mulx 1*8(%rsi), %r8, %r12 + adcx %r13, %rax + adox %r8, %rax + adcx %r12, %r10 + mulx 2*8(%rsi), %r8, %r12 + adox %r8, %r10 + adcx %r12, %r11 + mulx 3*8(%rsi), %r8, %r12 + adox %r8, %r11 + adcx %r12, %rbx + mulx 4*8(%rsi), %r8, %r12 + adox %r8, %rbx + adcx %r12, %rbp + mulx 5*8(%rsi), %r8, %r12 + adox %r8, %rbp + adcx %r9, %r12 + adox %r9, %r12 + + mov 5*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r13 + adcx %r8, %rax + adcx %r13, %r10 + mulx 1*8(%rsi), %r8, %r13 + adox %r8, %r10 + adox %r13, %r11 + mulx 2*8(%rsi), %r8, %r13 + adcx %r8, %r11 + adcx %r13, %rbx + mulx 3*8(%rsi), %r8, %r13 + adox %r8, %rbx + adox %r13, %rbp + mulx 4*8(%rsi), %r8, %r13 + adcx %r8, %rbp + adcx %r13, %r12 + mulx 5*8(%rsi), %r8, %r13 + adox %r8, %r12 + adcx %r9, %r13 + adox %r9, %r13 + + mov $0, %rdx + test %r13, %r13 + setns %dl + js .Lcontinue + add %rax, %rax + adc %r10, %r10 + adc %r11, %r11 + adc %rbx, %rbx + adc %rbp, %rbp + adc %r12, %r12 + adc %r13, %r13 +.Lcontinue: + mov %r10, 0*8(%rdi) + mov %r11, 1*8(%rdi) + mov %rbx, 2*8(%rdi) + mov %rbp, 3*8(%rdi) + mov %r12, 4*8(%rdi) + mov %r13, 5*8(%rdi) + + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_normalised_6_end: +SIZE(flint_mpn_mulhigh_normalised_6, .flint_mpn_mulhigh_normalised_6_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_normalised_7.asm b/src/mpn_extras/broadwell/mulhigh_normalised_7.asm new file mode 100644 index 0000000000..556a2fc867 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_normalised_7.asm @@ -0,0 +1,176 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_normalised_7) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_normalised_7) + +FUNC(flint_mpn_mulhigh_normalised_7): + .cfi_startproc + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %r9d, %r9d + + mulx 5*8(%rsi), %r8, %rax + mulx 6*8(%rsi), %r8, %r10 + adcx %r8, %rax + adcx %r9, %r10 + + mov 1*8(%rcx), %rdx + mulx 4*8(%rsi), %r8, %rbx + mulx 5*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r10 + mulx 6*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %r9, %r11 + adox %r9, %r11 + + mov 2*8(%rcx), %rdx + mulx 3*8(%rsi), %r8, %rbp + mulx 4*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r10 + mulx 5*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 6*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %r9, %rbx + adox %r9, %rbx + + mov 3*8(%rcx), %rdx + mulx 2*8(%rsi), %r8, %r12 + mulx 3*8(%rsi), %r8, %rbp + adcx %r12, %rax + adox %r8, %rax + adcx %rbp, %r10 + mulx 4*8(%rsi), %r8, %rbp + adox %r8, %r10 + adcx %rbp, %r11 + mulx 5*8(%rsi), %r8, %rbp + adox %r8, %r11 + adcx %rbp, %rbx + mulx 6*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %r9, %rbp + adox %r9, %rbp + + mov 4*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %r13 + mulx 2*8(%rsi), %r8, %r12 + adcx %r13, %rax + adox %r8, %rax + adcx %r12, %r10 + mulx 3*8(%rsi), %r8, %r12 + adox %r8, %r10 + adcx %r12, %r11 + mulx 4*8(%rsi), %r8, %r12 + adox %r8, %r11 + adcx %r12, %rbx + mulx 5*8(%rsi), %r8, %r12 + adox %r8, %rbx + adcx %r12, %rbp + mulx 6*8(%rsi), %r8, %r12 + adox %r8, %rbp + adcx %r9, %r12 + adox %r9, %r12 + + mov 5*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r14 + mulx 1*8(%rsi), %r8, %r13 + adcx %r14, %rax + adox %r8, %rax + adcx %r13, %r10 + mulx 2*8(%rsi), %r8, %r13 + adox %r8, %r10 + adcx %r13, %r11 + mulx 3*8(%rsi), %r8, %r13 + adox %r8, %r11 + adcx %r13, %rbx + mulx 4*8(%rsi), %r8, %r13 + adox %r8, %rbx + adcx %r13, %rbp + mulx 5*8(%rsi), %r8, %r13 + adox %r8, %rbp + adcx %r13, %r12 + mulx 6*8(%rsi), %r8, %r13 + adox %r8, %r12 + adcx %r9, %r13 + adox %r9, %r13 + + mov 6*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r14 + adcx %r8, %rax + adcx %r14, %r10 + mulx 1*8(%rsi), %r8, %r14 + adox %r8, %r10 + adox %r14, %r11 + mulx 2*8(%rsi), %r8, %r14 + adcx %r8, %r11 + adcx %r14, %rbx + mulx 3*8(%rsi), %r8, %r14 + adox %r8, %rbx + adox %r14, %rbp + mulx 4*8(%rsi), %r8, %r14 + adcx %r8, %rbp + adcx %r14, %r12 + mulx 5*8(%rsi), %r8, %r14 + adox %r8, %r12 + adox %r14, %r13 + mulx 6*8(%rsi), %r8, %r14 + adcx %r8, %r13 + adcx %r9, %r14 + adox %r9, %r14 + + mov $0, %rdx + test %r14, %r14 + setns %dl + js .Lcontinue + add %rax, %rax + adc %r10, %r10 + adc %r11, %r11 + adc %rbx, %rbx + adc %rbp, %rbp + adc %r12, %r12 + adc %r13, %r13 + adc %r14, %r14 +.Lcontinue: + mov %r10, 0*8(%rdi) + mov %r11, 1*8(%rdi) + mov %rbx, 2*8(%rdi) + mov %rbp, 3*8(%rdi) + mov %r12, 4*8(%rdi) + mov %r13, 5*8(%rdi) + mov %r14, 6*8(%rdi) + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_normalised_7_end: +SIZE(flint_mpn_mulhigh_normalised_7, .flint_mpn_mulhigh_normalised_7_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_normalised_8.asm b/src/mpn_extras/broadwell/mulhigh_normalised_8.asm new file mode 100644 index 0000000000..421139c6ea --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_normalised_8.asm @@ -0,0 +1,209 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_normalised_8) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_normalised_8) + +FUNC(flint_mpn_mulhigh_normalised_8): + .cfi_startproc + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %r9d, %r9d + + mulx 6*8(%rsi), %r8, %rax + mulx 7*8(%rsi), %r8, %r10 + adcx %r8, %rax + adcx %r9, %r10 + + mov 1*8(%rcx), %rdx + mulx 5*8(%rsi), %r8, %rbx + mulx 6*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r10 + mulx 7*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %r9, %r11 + adox %r9, %r11 + + mov 2*8(%rcx), %rdx + mulx 4*8(%rsi), %r8, %rbp + mulx 5*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r10 + mulx 6*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 7*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %r9, %rbx + adox %r9, %rbx + + mov 3*8(%rcx), %rdx + mulx 3*8(%rsi), %r8, %r12 + mulx 4*8(%rsi), %r8, %rbp + adcx %r12, %rax + adox %r8, %rax + adcx %rbp, %r10 + mulx 5*8(%rsi), %r8, %rbp + adox %r8, %r10 + adcx %rbp, %r11 + mulx 6*8(%rsi), %r8, %rbp + adox %r8, %r11 + adcx %rbp, %rbx + mulx 7*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %r9, %rbp + adox %r9, %rbp + + mov 4*8(%rcx), %rdx + mulx 2*8(%rsi), %r8, %r13 + mulx 3*8(%rsi), %r8, %r12 + adcx %r13, %rax + adox %r8, %rax + adcx %r12, %r10 + mulx 4*8(%rsi), %r8, %r12 + adox %r8, %r10 + adcx %r12, %r11 + mulx 5*8(%rsi), %r8, %r12 + adox %r8, %r11 + adcx %r12, %rbx + mulx 6*8(%rsi), %r8, %r12 + adox %r8, %rbx + adcx %r12, %rbp + mulx 7*8(%rsi), %r8, %r12 + adox %r8, %rbp + adcx %r9, %r12 + adox %r9, %r12 + + mov 5*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %r14 + mulx 2*8(%rsi), %r8, %r13 + adcx %r14, %rax + adox %r8, %rax + adcx %r13, %r10 + mulx 3*8(%rsi), %r8, %r13 + adox %r8, %r10 + adcx %r13, %r11 + mulx 4*8(%rsi), %r8, %r13 + adox %r8, %r11 + adcx %r13, %rbx + mulx 5*8(%rsi), %r8, %r13 + adox %r8, %rbx + adcx %r13, %rbp + mulx 6*8(%rsi), %r8, %r13 + adox %r8, %rbp + adcx %r13, %r12 + mulx 7*8(%rsi), %r8, %r13 + adox %r8, %r12 + adcx %r9, %r13 + adox %r9, %r13 + + mov 6*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r15 + mulx 1*8(%rsi), %r8, %r14 + adcx %r15, %rax + adox %r8, %rax + adcx %r14, %r10 + mulx 2*8(%rsi), %r8, %r14 + adox %r8, %r10 + adcx %r14, %r11 + mulx 3*8(%rsi), %r8, %r14 + adox %r8, %r11 + adcx %r14, %rbx + mulx 4*8(%rsi), %r8, %r14 + adox %r8, %rbx + adcx %r14, %rbp + mulx 5*8(%rsi), %r8, %r14 + adox %r8, %rbp + adcx %r14, %r12 + mulx 6*8(%rsi), %r8, %r14 + adox %r8, %r12 + adcx %r14, %r13 + mulx 7*8(%rsi), %r8, %r14 + adox %r8, %r13 + adcx %r9, %r14 + adox %r9, %r14 + + mov 7*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r15 + adcx %r8, %rax + adcx %r15, %r10 + mulx 1*8(%rsi), %r8, %r15 + adox %r8, %r10 + adox %r15, %r11 + mulx 2*8(%rsi), %r8, %r15 + adcx %r8, %r11 + adcx %r15, %rbx + mulx 3*8(%rsi), %r8, %r15 + adox %r8, %rbx + adox %r15, %rbp + mulx 4*8(%rsi), %r8, %r15 + adcx %r8, %rbp + adcx %r15, %r12 + mulx 5*8(%rsi), %r8, %r15 + adox %r8, %r12 + adox %r15, %r13 + mulx 6*8(%rsi), %r8, %r15 + adcx %r8, %r13 + adcx %r15, %r14 + mulx 7*8(%rsi), %r8, %r15 + adox %r8, %r14 + adcx %r9, %r15 + adox %r9, %r15 + + mov $0, %rdx + test %r15, %r15 + setns %dl + js .Lcontinue + add %rax, %rax + adc %r10, %r10 + adc %r11, %r11 + adc %rbx, %rbx + adc %rbp, %rbp + adc %r12, %r12 + adc %r13, %r13 + adc %r14, %r14 + adc %r15, %r15 +.Lcontinue: + mov %r10, 0*8(%rdi) + mov %r11, 1*8(%rdi) + mov %rbx, 2*8(%rdi) + mov %rbp, 3*8(%rdi) + mov %r12, 4*8(%rdi) + mov %r13, 5*8(%rdi) + mov %r14, 6*8(%rdi) + mov %r15, 7*8(%rdi) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_normalised_8_end: +SIZE(flint_mpn_mulhigh_normalised_8, .flint_mpn_mulhigh_normalised_8_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_normalised_9.asm b/src/mpn_extras/broadwell/mulhigh_normalised_9.asm new file mode 100644 index 0000000000..1532a484b5 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_normalised_9.asm @@ -0,0 +1,245 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_normalised_9) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_normalised_9) + +FUNC(flint_mpn_mulhigh_normalised_9): + .cfi_startproc + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + push %rdi + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %r9d, %r9d + + mulx 7*8(%rsi), %r8, %rax + mulx 8*8(%rsi), %r8, %r10 + adcx %r8, %rax + adcx %r9, %r10 + + mov 1*8(%rcx), %rdx + mulx 6*8(%rsi), %r8, %rbx + mulx 7*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r10 + mulx 8*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %r9, %r11 + adox %r9, %r11 + + mov 2*8(%rcx), %rdx + mulx 5*8(%rsi), %r8, %rbp + mulx 6*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r10 + mulx 7*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 8*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %r9, %rbx + adox %r9, %rbx + + mov 3*8(%rcx), %rdx + mulx 4*8(%rsi), %r8, %r12 + mulx 5*8(%rsi), %r8, %rbp + adcx %r12, %rax + adox %r8, %rax + adcx %rbp, %r10 + mulx 6*8(%rsi), %r8, %rbp + adox %r8, %r10 + adcx %rbp, %r11 + mulx 7*8(%rsi), %r8, %rbp + adox %r8, %r11 + adcx %rbp, %rbx + mulx 8*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %r9, %rbp + adox %r9, %rbp + + mov 4*8(%rcx), %rdx + mulx 3*8(%rsi), %r8, %r13 + mulx 4*8(%rsi), %r8, %r12 + adcx %r13, %rax + adox %r8, %rax + adcx %r12, %r10 + mulx 5*8(%rsi), %r8, %r12 + adox %r8, %r10 + adcx %r12, %r11 + mulx 6*8(%rsi), %r8, %r12 + adox %r8, %r11 + adcx %r12, %rbx + mulx 7*8(%rsi), %r8, %r12 + adox %r8, %rbx + adcx %r12, %rbp + mulx 8*8(%rsi), %r8, %r12 + adox %r8, %rbp + adcx %r9, %r12 + adox %r9, %r12 + + mov 5*8(%rcx), %rdx + mulx 2*8(%rsi), %r8, %r14 + mulx 3*8(%rsi), %r8, %r13 + adcx %r14, %rax + adox %r8, %rax + adcx %r13, %r10 + mulx 4*8(%rsi), %r8, %r13 + adox %r8, %r10 + adcx %r13, %r11 + mulx 5*8(%rsi), %r8, %r13 + adox %r8, %r11 + adcx %r13, %rbx + mulx 6*8(%rsi), %r8, %r13 + adox %r8, %rbx + adcx %r13, %rbp + mulx 7*8(%rsi), %r8, %r13 + adox %r8, %rbp + adcx %r13, %r12 + mulx 8*8(%rsi), %r8, %r13 + adox %r8, %r12 + adcx %r9, %r13 + adox %r9, %r13 + + mov 6*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %r15 + mulx 2*8(%rsi), %r8, %r14 + adcx %r15, %rax + adox %r8, %rax + adcx %r14, %r10 + mulx 3*8(%rsi), %r8, %r14 + adox %r8, %r10 + adcx %r14, %r11 + mulx 4*8(%rsi), %r8, %r14 + adox %r8, %r11 + adcx %r14, %rbx + mulx 5*8(%rsi), %r8, %r14 + adox %r8, %rbx + adcx %r14, %rbp + mulx 6*8(%rsi), %r8, %r14 + adox %r8, %rbp + adcx %r14, %r12 + mulx 7*8(%rsi), %r8, %r14 + adox %r8, %r12 + adcx %r14, %r13 + mulx 8*8(%rsi), %r8, %r14 + adox %r8, %r13 + adcx %r9, %r14 + adox %r9, %r14 + + mov 7*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rdi + mulx 1*8(%rsi), %r8, %r15 + adcx %rdi, %rax + adox %r8, %rax + adcx %r15, %r10 + mulx 2*8(%rsi), %r8, %r15 + adox %r8, %r10 + adcx %r15, %r11 + mulx 3*8(%rsi), %r8, %r15 + adox %r8, %r11 + adcx %r15, %rbx + mulx 4*8(%rsi), %r8, %r15 + adox %r8, %rbx + adcx %r15, %rbp + mulx 5*8(%rsi), %r8, %r15 + adox %r8, %rbp + adcx %r15, %r12 + mulx 6*8(%rsi), %r8, %r15 + adox %r8, %r12 + adcx %r15, %r13 + mulx 7*8(%rsi), %r8, %r15 + adox %r8, %r13 + adcx %r15, %r14 + mulx 8*8(%rsi), %r8, %r15 + adox %r8, %r14 + adcx %r9, %r15 + adox %r9, %r15 + + mov 8*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rdi + adcx %r8, %rax + adcx %rdi, %r10 + mulx 1*8(%rsi), %r8, %rdi + adox %r8, %r10 + adox %rdi, %r11 + mulx 2*8(%rsi), %r8, %rdi + adcx %r8, %r11 + adcx %rdi, %rbx + mulx 3*8(%rsi), %r8, %rdi + adox %r8, %rbx + adox %rdi, %rbp + mulx 4*8(%rsi), %r8, %rdi + adcx %r8, %rbp + adcx %rdi, %r12 + mulx 5*8(%rsi), %r8, %rdi + adox %r8, %r12 + adox %rdi, %r13 + mulx 6*8(%rsi), %r8, %rdi + adcx %r8, %r13 + adcx %rdi, %r14 + mulx 7*8(%rsi), %r8, %rdi + adox %r8, %r14 + adox %rdi, %r15 + mulx 8*8(%rsi), %r8, %rdi + adcx %r8, %r15 + adcx %r9, %rdi + pop %r8 + adox %r9, %rdi + + mov $0, %rdx + test %rdi, %rdi + setns %dl + js .Lcontinue + add %rax, %rax + adc %r10, %r10 + adc %r11, %r11 + adc %rbx, %rbx + adc %rbp, %rbp + adc %r12, %r12 + adc %r13, %r13 + adc %r14, %r14 + adc %r15, %r15 + adc %rdi, %rdi +.Lcontinue: + mov %r10, 0*8(%r8) + mov %r11, 1*8(%r8) + mov %rbx, 2*8(%r8) + mov %rbp, 3*8(%r8) + mov %r12, 4*8(%r8) + mov %r13, 5*8(%r8) + mov %r14, 6*8(%r8) + mov %r15, 7*8(%r8) + mov %rdi, 8*8(%r8) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_normalised_9_end: +SIZE(flint_mpn_mulhigh_normalised_9, .flint_mpn_mulhigh_normalised_9_end) +.cfi_endproc diff --git a/src/mpn_extras/mulhigh.c b/src/mpn_extras/mulhigh.c index 57365b5cfd..58614e7161 100644 --- a/src/mpn_extras/mulhigh.c +++ b/src/mpn_extras/mulhigh.c @@ -25,6 +25,19 @@ mp_limb_t flint_mpn_mulhigh_10(mp_ptr, mp_srcptr, mp_srcptr); mp_limb_t flint_mpn_mulhigh_11(mp_ptr, mp_srcptr, mp_srcptr); mp_limb_t flint_mpn_mulhigh_12(mp_ptr, mp_srcptr, mp_srcptr); +struct mp_limb_pair_t flint_mpn_mulhigh_normalised_1(mp_ptr, mp_srcptr, mp_srcptr); +struct mp_limb_pair_t flint_mpn_mulhigh_normalised_2(mp_ptr, mp_srcptr, mp_srcptr); +struct mp_limb_pair_t flint_mpn_mulhigh_normalised_3(mp_ptr, mp_srcptr, mp_srcptr); +struct mp_limb_pair_t flint_mpn_mulhigh_normalised_4(mp_ptr, mp_srcptr, mp_srcptr); +struct mp_limb_pair_t flint_mpn_mulhigh_normalised_5(mp_ptr, mp_srcptr, mp_srcptr); +struct mp_limb_pair_t flint_mpn_mulhigh_normalised_6(mp_ptr, mp_srcptr, mp_srcptr); +struct mp_limb_pair_t flint_mpn_mulhigh_normalised_7(mp_ptr, mp_srcptr, mp_srcptr); +struct mp_limb_pair_t flint_mpn_mulhigh_normalised_8(mp_ptr, mp_srcptr, mp_srcptr); +struct mp_limb_pair_t flint_mpn_mulhigh_normalised_9(mp_ptr, mp_srcptr, mp_srcptr); +struct mp_limb_pair_t flint_mpn_mulhigh_normalised_10(mp_ptr, mp_srcptr, mp_srcptr); +struct mp_limb_pair_t flint_mpn_mulhigh_normalised_11(mp_ptr, mp_srcptr, mp_srcptr); +struct mp_limb_pair_t flint_mpn_mulhigh_normalised_12(mp_ptr, mp_srcptr, mp_srcptr); + const flint_mpn_mul_func_t flint_mpn_mulhigh_n_func_tab[] = { flint_mpn_mulhigh_1, @@ -40,6 +53,22 @@ const flint_mpn_mul_func_t flint_mpn_mulhigh_n_func_tab[] = flint_mpn_mulhigh_11, flint_mpn_mulhigh_12 }; + +const flint_mpn_mulhigh_normalised_func_t flint_mpn_mulhigh_normalised_n_func_tab[] = +{ + flint_mpn_mulhigh_normalised_1, + flint_mpn_mulhigh_normalised_2, + flint_mpn_mulhigh_normalised_3, + flint_mpn_mulhigh_normalised_4, + flint_mpn_mulhigh_normalised_5, + flint_mpn_mulhigh_normalised_6, + flint_mpn_mulhigh_normalised_7, + flint_mpn_mulhigh_normalised_8, + flint_mpn_mulhigh_normalised_9, + flint_mpn_mulhigh_normalised_10, + flint_mpn_mulhigh_normalised_11, + flint_mpn_mulhigh_normalised_12 +}; #else typedef int this_file_is_empty; #endif diff --git a/src/mpn_extras/profile/p-mulhigh_normalised.c b/src/mpn_extras/profile/p-mulhigh_normalised.c new file mode 100644 index 0000000000..ee1e250aad --- /dev/null +++ b/src/mpn_extras/profile/p-mulhigh_normalised.c @@ -0,0 +1,55 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. See . +*/ + +#include "mpn_extras.h" +#include "profiler.h" + +#define N_MAX FLINT_MPN_MULHIGH_N_FUNC_TAB_WIDTH + +int main(void) +{ + mp_limb_t rn[N_MAX]; + mp_limb_t ru[N_MAX]; + mp_limb_t xp[N_MAX]; + mp_limb_t yp[N_MAX]; + mp_size_t n; + + flint_printf("Times: Normalised / Unnormalised\n"); + for (n = 1; n <= N_MAX; n++) + { + flint_printf("n = %2wd:", n); + + for (slong ix = 0; ix < 10; ix++) + { + double t1, t2, __attribute__((unused)) __; + + mpn_random2(xp, n); + mpn_random2(yp, n); + xp[n - 1] |= (UWORD(1) << (FLINT_BITS - 1)); + yp[n - 1] |= (UWORD(1) << (FLINT_BITS - 1)); + + TIMEIT_START + flint_mpn_mulhigh_n(ru, xp, yp, n); + TIMEIT_STOP_VALUES(__, t1) + + TIMEIT_START + flint_mpn_mulhigh_normalised_n(rn, xp, yp, n); + TIMEIT_STOP_VALUES(__, t2) + + flint_printf("%7.2fx", t2 / t1); + } + flint_printf("\n"); + } + + flint_cleanup_master(); + + return 0; +} diff --git a/src/mpn_extras/test/main.c b/src/mpn_extras/test/main.c index fb7a0307d4..d56122db9c 100644 --- a/src/mpn_extras/test/main.c +++ b/src/mpn_extras/test/main.c @@ -24,6 +24,7 @@ #include "t-mul_n.c" #include "t-mul_basecase.c" #include "t-mulhigh_n.c" +#include "t-mulhigh_normalised_n.c" #include "t-mulmod_2expp1.c" #include "t-mulmod_preinv1.c" #include "t-mulmod_preinvn.c" @@ -45,6 +46,7 @@ test_struct tests[] = TEST_FUNCTION(flint_mpn_mul_n), TEST_FUNCTION(flint_mpn_mul_basecase), TEST_FUNCTION(flint_mpn_mulhigh_n), + TEST_FUNCTION(flint_mpn_mulhigh_normalised_n), TEST_FUNCTION(flint_mpn_mulmod_2expp1), TEST_FUNCTION(flint_mpn_mulmod_preinv1), TEST_FUNCTION(flint_mpn_mulmod_preinvn), diff --git a/src/mpn_extras/test/t-mulhigh_normalised_n.c b/src/mpn_extras/test/t-mulhigh_normalised_n.c new file mode 100644 index 0000000000..206c4b285e --- /dev/null +++ b/src/mpn_extras/test/t-mulhigh_normalised_n.c @@ -0,0 +1,99 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. See . +*/ + +#include "test_helpers.h" +#include "mpn_extras.h" + +#if FLINT_MPN_MULHIGH_N_FUNC_TAB_WIDTH != 0 + +# define N_MAX FLINT_MPN_MULHIGH_N_FUNC_TAB_WIDTH + +void mpfr_mulhigh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n); + +TEST_FUNCTION_START(flint_mpn_mulhigh_normalised_n, state) +{ + slong ix; + int result; + + for (ix = 0; ix < 100000 * flint_test_multiplier(); ix++) + { + mp_limb_t rp_n[N_MAX + 1] = {UWORD(0)}; + mp_limb_t rp_u[N_MAX + 1] = {UWORD(0)}; + mp_limb_t xp[N_MAX]; + mp_limb_t yp[N_MAX]; + mp_size_t n; + struct mp_limb_pair_t res_norm; + mp_limb_t retlimb, normalised; + + n = 1 + n_randint(state, N_MAX); + + mpn_random2(xp, n); + mpn_random2(yp, n); + xp[n - 1] |= (UWORD(1) << (FLINT_BITS - 1)); + yp[n - 1] |= (UWORD(1) << (FLINT_BITS - 1)); + + rp_u[0] = flint_mpn_mulhigh_n(rp_u + 1, xp, yp, n); + res_norm = flint_mpn_mulhigh_normalised_n(rp_n + 1, xp, yp, n); + retlimb = res_norm.m1; + normalised = res_norm.m2; + rp_n[0] = retlimb; + + result = ((rp_n[n] & (UWORD(1) << (FLINT_BITS - 1))) != UWORD(0)); + if (!result) + TEST_FUNCTION_FAIL( + "Top bit not set in normalised result\n" + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "yp = %{ulong*}\n" + "rp_n = %{ulong*}\n" + "rp_u = %{ulong*}\n", + ix, n, xp, n, yp, n, rp_n, n + 1, rp_u, n + 1); + + if (normalised) + { + result = (mpn_lshift(rp_u, rp_u, n + 1, 1) == 0); + result = result && (mpn_cmp(rp_n, rp_u, n + 1) == 0); + if (!result) + TEST_FUNCTION_FAIL( + "rp_n != rp_u << 1 when normalised\n" + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "yp = %{ulong*}\n" + "rp_n = %{ulong*}\n" + "rp_u = %{ulong*}\n", + ix, n, xp, n, yp, n, rp_n, n + 1, rp_u, n + 1); + } + else + { + result = (mpn_cmp(rp_n, rp_u, n + 1) == 0); + if (!result) + TEST_FUNCTION_FAIL( + "rp_n != rp_u when unnormalised\n" + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "yp = %{ulong*}\n" + "rp_n = %{ulong*}\n" + "rp_u = %{ulong*}\n", + ix, n, xp, n, yp, n, rp_n, n + 1, rp_u, n + 1); + } + } + + TEST_FUNCTION_END(state); +} +#else +TEST_FUNCTION_START(flint_mpn_mulhigh_normalised_n, state) +{ + TEST_FUNCTION_END_SKIPPED(state); +} +#endif