diff --git a/AUTHORS b/AUTHORS index 6ef3b793e2..297c0f9f44 100644 --- a/AUTHORS +++ b/AUTHORS @@ -455,7 +455,7 @@ Mathieu Gouttenoire Primality testing for Gaussian integers. - github math-gout + github math-gout Michael Abshoff diff --git a/doc/source/fmpz_vec.rst b/doc/source/fmpz_vec.rst index 23957dcc3f..1c7f3d8c78 100644 --- a/doc/source/fmpz_vec.rst +++ b/doc/source/fmpz_vec.rst @@ -408,13 +408,22 @@ Gaussian content Dot product -------------------------------------------------------------------------------- +.. function:: void _fmpz_vec_dot_general_naive(fmpz_t res, const fmpz_t initial, int subtract, const fmpz * a, const fmpz * b, int reverse, slong len) + void _fmpz_vec_dot_general(fmpz_t res, const fmpz_t initial, int subtract, const fmpz * a, const fmpz * b, int reverse, slong len) + + Computes the dot product of the vectors *a* and *b*, setting + *res* to `s + (-1)^{subtract} \sum_{i=0}^{len-1} a_i b_i`. + The initial term *s* is optional and can be + omitted by passing *NULL* (equivalently, `s = 0`). + The parameter *subtract* must be 0 or 1. + If the *reverse* flag is 1, the second vector is reversed. + + Aliasing is allowed between ``res`` and ``initial`` but not + between ``res`` and the entries of ``a`` and ``b``. + + The *naive* version is used for testing purposes. .. function:: void _fmpz_vec_dot(fmpz_t res, const fmpz * vec1, const fmpz * vec2, slong len2) Sets ``res`` to the dot product of ``(vec1, len2)`` and ``(vec2, len2)``. - -.. function:: void _fmpz_vec_dot_ptr(fmpz_t res, const fmpz * vec1, fmpz ** const vec2, slong offset, slong len) - - Sets ``res`` to the dot product of ``len`` values at ``vec1`` and the - ``len`` values ``vec2[i] + offset`` for `0 \leq i < len`. diff --git a/src/arith/stirling1.c b/src/arith/stirling1.c index c389724fda..4d9058bbca 100644 --- a/src/arith/stirling1.c +++ b/src/arith/stirling1.c @@ -16,18 +16,15 @@ #include "arith.h" /* compute single coefficient in polynomial product */ -static void +FLINT_FORCE_INLINE void _fmpz_poly_mulmid_single(fmpz_t res, const fmpz * poly1, slong len1, const fmpz * poly2, slong len2, slong i) { - slong j, top1, top2; + slong top1, top2; top1 = FLINT_MIN(len1 - 1, i); top2 = FLINT_MIN(len2 - 1, i); - fmpz_mul(res, poly1 + i - top2, poly2 + top2); - - for (j = 1; j < top1 + top2 - i + 1; j++) - fmpz_addmul(res, poly1 + i - top2 + j, poly2 + top2 - j); + _fmpz_vec_dot_general(res, NULL, 0, poly1 + i - top2, poly2 + i - top1, 1, top1 + top2 - i + 1); } #define MAX_BASECASE 16 diff --git a/src/fmpq_poly/exp_series.c b/src/fmpq_poly/exp_series.c index 88f8b3d142..6d4b62a85a 100644 --- a/src/fmpq_poly/exp_series.c +++ b/src/fmpq_poly/exp_series.c @@ -40,7 +40,7 @@ _fmpq_poly_exp_series_basecase_deriv(fmpz * B, fmpz_t Bden, const fmpz * Aprime, const fmpz_t Aden, slong Alen, slong n) { fmpz_t t, u; - slong j, k; + slong k; Alen = FLINT_MIN(Alen, n); @@ -55,11 +55,8 @@ _fmpq_poly_exp_series_basecase_deriv(fmpz * B, fmpz_t Bden, for (k = 1; k < n; k++) { - fmpz_mul(t, Aprime, B + k - 1); - - for (j = 2; j < FLINT_MIN(Alen, k + 1); j++) - fmpz_addmul(t, Aprime + j - 1, B + k - j); - + slong l = FLINT_MIN(Alen - 1, k); + _fmpz_vec_dot_general(t, NULL, 0, Aprime, B + k - l, 1, l); fmpz_mul_ui(u, Aden, k); fmpz_divexact(B + k, t, u); } diff --git a/src/fmpq_poly/power_sums.c b/src/fmpq_poly/power_sums.c index 2eac351e65..d808656b76 100644 --- a/src/fmpq_poly/power_sums.c +++ b/src/fmpq_poly/power_sums.c @@ -9,6 +9,7 @@ (at your option) any later version. See . */ +#include "fmpz_vec.h" #include "fmpz_poly.h" #include "fmpq.h" #include "fmpq_poly.h" @@ -61,29 +62,20 @@ _fmpq_poly_power_sums(fmpz * res, fmpz_t rden, const fmpz * poly, slong len, for (k = 1; k < FLINT_MIN(n, len); k++) { - fmpz_mul_ui(res + k, poly + len - 1 - k, k); + fmpz_mul_si(res + k, poly + len - 1 - k, -k); fmpz_mul(res + k, res + k, rden); - - for (i = 1; i < k - 1; i++) - fmpz_mul(res + i, res + i, poly + len - 1); - for (i = 1; i < k; i++) - fmpz_addmul(res + k, poly + len - 1 - k + i, res + i); - fmpz_neg(res + k, res + k); + _fmpz_vec_scalar_mul_fmpz(res + 1, res + 1, k - 2, poly + len - 1); + _fmpz_vec_dot_general(res + k, res + k, 1, poly + len - 1 - k + 1, res + 1, 0, k - 1); fmpz_mul(rden, rden, poly + len - 1); } for (k = len; k < n; k++) { - fmpz_zero(res + k); - for (i = k - len + 1; i < k - 1; i++) - fmpz_mul(res + i, res + i, poly + len - 1); - for (i = k - len + 1; i < k; i++) - fmpz_addmul(res + k, poly + len - 1 - k + i, res + i); - fmpz_neg(res + k, res + k); + _fmpz_vec_scalar_mul_fmpz(res + k - len + 1, res + k - len + 1, len - 2, poly + len - 1); + _fmpz_vec_dot_general(res + k, NULL, 1, poly, res + k - len + 1, 0, len - 1); } - for (i = n - len + 1; i < n - 1; i++) - fmpz_mul(res + i, res + i, poly + len - 1); + _fmpz_vec_scalar_mul_fmpz(res + n - len + 1, res + n - len + 1, len - 2, poly + len - 1); fmpz_one(rden); for (i = n - len; i > 0; i--) diff --git a/src/fmpq_poly/power_sums_to_poly.c b/src/fmpq_poly/power_sums_to_poly.c index dbea185cd9..b9f82a0dda 100644 --- a/src/fmpq_poly/power_sums_to_poly.c +++ b/src/fmpq_poly/power_sums_to_poly.c @@ -11,6 +11,7 @@ #include "ulong_extras.h" #include "fmpz.h" +#include "fmpz_vec.h" #include "fmpz_poly.h" #include "fmpq_poly.h" @@ -30,12 +31,16 @@ _fmpq_poly_power_sums_to_poly(fmpz * res, const fmpz * poly, const fmpz_t den, fmpz_one(f); for (k = 1; k <= d; k++) { - if(k < len) + if (k < len) + { fmpz_mul(res + d - k, poly + k, f); + _fmpz_vec_dot_general(res + d - k, res + d - k, 0, res + d - k + 1, poly + 1, 0, k - 1); + + } else - fmpz_zero(res + d - k); - for (i = 1; i < FLINT_MIN(k, len); i++) - fmpz_addmul(res + d - k, res + d - k + i, poly + i); + { + _fmpz_vec_dot_general(res + d - k, NULL, 0, res + d - k + 1, poly + 1, 0, len - 1); + } a = n_gcd(FLINT_ABS(fmpz_fdiv_ui(res + d - k, k)), k); fmpz_divexact_ui(res + d - k, res + d - k, a); diff --git a/src/fmpq_poly/revert_series_lagrange_fast.c b/src/fmpq_poly/revert_series_lagrange_fast.c index 5117f1cef7..d78af92bea 100644 --- a/src/fmpq_poly/revert_series_lagrange_fast.c +++ b/src/fmpq_poly/revert_series_lagrange_fast.c @@ -44,9 +44,9 @@ void _fmpq_poly_revert_series_lagrange_fast(fmpz * Qinv, fmpz_t den, const fmpz * Q, const fmpz_t Qden, slong Qlen, slong n) { - slong i, j, k, m; + slong i, j, m; fmpz *R, *Rden, *S, *T, *dens, *tmp; - fmpz_t Sden, Tden, t; + fmpz_t Sden, Tden; if (Qlen <= 2) { @@ -65,7 +65,6 @@ _fmpq_poly_revert_series_lagrange_fast(fmpz * Qinv, fmpz_t den, m = n_sqrt(n); - fmpz_init(t); dens = _fmpz_vec_init(n); R = _fmpz_vec_init((n - 1) * m); S = _fmpz_vec_init(n - 1); @@ -103,12 +102,7 @@ _fmpq_poly_revert_series_lagrange_fast(fmpz * Qinv, fmpz_t den, for (j = 1; j < m && i + j < n; j++) { - fmpz_mul(t, S + 0, Ri(j) + i + j - 1); - - for (k = 1; k <= i + j - 1; k++) - fmpz_addmul(t, S + k, Ri(j) + i + j - 1 - k); - - fmpz_set(Qinv + i + j, t); + _fmpz_vec_dot_general(Qinv + i + j, NULL, 0, S, Ri(j), 1, i + j); fmpz_mul(dens + i + j, Sden, Rdeni(j)); fmpz_mul_ui(dens + i + j, dens + i + j, i + j); } @@ -126,7 +120,6 @@ _fmpq_poly_revert_series_lagrange_fast(fmpz * Qinv, fmpz_t den, _set_vec(Qinv, den, Qinv, dens, n); _fmpq_poly_canonicalise(Qinv, den, n); - fmpz_clear(t); _fmpz_vec_clear(dens, n); _fmpz_vec_clear(R, (n - 1) * m); _fmpz_vec_clear(S, n - 1); diff --git a/src/fmpq_poly/sin_cos_series.c b/src/fmpq_poly/sin_cos_series.c index ceace1fec0..796783eee2 100644 --- a/src/fmpq_poly/sin_cos_series.c +++ b/src/fmpq_poly/sin_cos_series.c @@ -61,6 +61,7 @@ _fmpq_poly_sin_cos_series_basecase_can(fmpz * S, fmpz_t Sden, fmpz_zero(t); fmpz_zero(u); + /* todo: precompute A[j] * j, use dot products */ for (j = 1; j < FLINT_MIN(Alen, k + 1); j++) { fmpz_mul_ui(v, A + j, j); diff --git a/src/fmpz_mat.h b/src/fmpz_mat.h index e379e74c55..f7009d2c92 100644 --- a/src/fmpz_mat.h +++ b/src/fmpz_mat.h @@ -174,8 +174,7 @@ void fmpz_mat_mul_classical(fmpz_mat_t C, const fmpz_mat_t A, void fmpz_mat_mul_strassen(fmpz_mat_t C, const fmpz_mat_t A, const fmpz_mat_t B); -void fmpz_mat_mul_classical_inline(fmpz_mat_t C, const fmpz_mat_t A, - const fmpz_mat_t B); +#define fmpz_mat_mul_classical_inline _Pragma("GCC error \"'fmpz_mat_mul_classical_inline' is deprecated. Use 'fmpz_mat_mul_classical' instead.\"") void _fmpz_mat_mul_fft(fmpz_mat_t C, const fmpz_mat_t A, slong abits, diff --git a/src/fmpz_mat/charpoly.c b/src/fmpz_mat/charpoly.c index 66c3664b4e..c43d5763bc 100644 --- a/src/fmpz_mat/charpoly.c +++ b/src/fmpz_mat/charpoly.c @@ -17,6 +17,8 @@ #include "fmpz_vec.h" #include "fmpz_mat.h" #include "fmpz_poly.h" +#include "gr.h" +#include "gr_mat.h" /* Assumes that \code{mat} is an $n \times n$ matrix and sets \code{(cp,n+1)} @@ -27,78 +29,9 @@ void _fmpz_mat_charpoly_berkowitz(fmpz *cp, const fmpz_mat_t mat) { - const slong n = mat->r; - - if (n == 0) - { - fmpz_one(cp); - } - else if (n == 1) - { - fmpz_neg(cp + 0, fmpz_mat_entry(mat, 0, 0)); - fmpz_one(cp + 1); - } - else - { - slong i, j, k, t; - fmpz *a, *A, *s; - - a = _fmpz_vec_init(n * n); - A = a + (n - 1) * n; - - _fmpz_vec_zero(cp, n + 1); - fmpz_neg(cp + 0, fmpz_mat_entry(mat, 0, 0)); - - for (t = 1; t < n; t++) - { - for (i = 0; i <= t; i++) - { - fmpz_set(a + 0 * n + i, fmpz_mat_entry(mat, i, t)); - } - - fmpz_set(A + 0, fmpz_mat_entry(mat, t, t)); - - for (k = 1; k < t; k++) - { - for (i = 0; i <= t; i++) - { - s = a + k * n + i; - fmpz_zero(s); - for (j = 0; j <= t; j++) - { - fmpz_addmul(s, fmpz_mat_entry(mat, i, j), a + (k - 1) * n + j); - } - } - fmpz_set(A + k, a + k * n + t); - } - - fmpz_zero(A + t); - for (j = 0; j <= t; j++) - { - fmpz_addmul(A + t, fmpz_mat_entry(mat, t, j), a + (t - 1) * n + j); - } - - for (k = 0; k <= t; k++) - { - for (j = 0; j < k; j++) - { - fmpz_submul(cp + k, A + j, cp + (k - j - 1)); - } - fmpz_sub(cp + k, cp + k, A + k); - } - } - - /* Shift all coefficients up by one */ - for (i = n; i > 0; i--) - { - fmpz_swap(cp + i, cp + (i - 1)); - } - fmpz_one(cp + 0); - - _fmpz_poly_reverse(cp, cp, n + 1, n + 1); - - _fmpz_vec_clear(a, n * n); - } + gr_ctx_t ctx; + gr_ctx_init_fmpz(ctx); + GR_MUST_SUCCEED(_gr_mat_charpoly_berkowitz(cp, (const gr_mat_struct *) mat, ctx)); } void fmpz_mat_charpoly_berkowitz(fmpz_poly_t cp, const fmpz_mat_t mat) diff --git a/src/fmpz_mat/mul.c b/src/fmpz_mat/mul.c index a2cf467ac7..bbdfa2afb5 100644 --- a/src/fmpz_mat/mul.c +++ b/src/fmpz_mat/mul.c @@ -279,6 +279,6 @@ fmpz_mat_mul(fmpz_mat_t C, const fmpz_mat_t A, const fmpz_mat_t B) else if (abits >= 500 && bbits >= 500 && dim >= 8) /* tuning param */ fmpz_mat_mul_strassen(C, A, B); else - fmpz_mat_mul_classical_inline(C, A, B); + fmpz_mat_mul_classical(C, A, B); } } diff --git a/src/fmpz_mat/mul_classical.c b/src/fmpz_mat/mul_classical.c index 299ac130d3..af6572091a 100644 --- a/src/fmpz_mat/mul_classical.c +++ b/src/fmpz_mat/mul_classical.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2010,2011 Fredrik Johansson + Copyright (C) 2010, 2011, 2024 Fredrik Johansson This file is part of FLINT. @@ -10,38 +10,71 @@ */ #include "fmpz.h" +#include "fmpz_vec.h" #include "fmpz_mat.h" void fmpz_mat_mul_classical(fmpz_mat_t C, const fmpz_mat_t A, const fmpz_mat_t B) { - slong ar, bc, br; - slong i, j, k; + slong ar, br, bc, i, j; - ar = A->r; - br = B->r; - bc = B->c; + ar = fmpz_mat_nrows(A); + br = fmpz_mat_nrows(B); + bc = fmpz_mat_ncols(B); - if (br == 0) + if (ar == 0 || br == 0 || bc == 0) { fmpz_mat_zero(C); return; } - for (i = 0; i < ar; i++) + if (br == 1) { - for (j = 0; j < bc; j++) + for (i = 0; i < ar; i++) { - fmpz_mul(fmpz_mat_entry(C, i, j), - fmpz_mat_entry(A, i, 0), - fmpz_mat_entry(B, 0, j)); + for (j = 0; j < bc; j++) + { + fmpz_mul(fmpz_mat_entry(C, i, j), + fmpz_mat_entry(A, i, 0), + fmpz_mat_entry(B, 0, j)); + } + } + } + else if (br == 2) + { + for (i = 0; i < ar; i++) + { + for (j = 0; j < bc; j++) + { + fmpz_fmma(fmpz_mat_entry(C, i, j), + fmpz_mat_entry(A, i, 0), + fmpz_mat_entry(B, 0, j), + fmpz_mat_entry(A, i, 1), + fmpz_mat_entry(B, 1, j)); + } + } + } + else + { + fmpz * tmp; + TMP_INIT; + + TMP_START; + tmp = TMP_ALLOC(sizeof(fmpz) * br * bc); - for (k = 1; k < br; k++) + for (i = 0; i < br; i++) + for (j = 0; j < bc; j++) + tmp[j * br + i] = *fmpz_mat_entry(B, i, j); + + for (i = 0; i < ar; i++) + { + for (j = 0; j < bc; j++) { - fmpz_addmul(fmpz_mat_entry(C, i, j), - fmpz_mat_entry(A, i, k), - fmpz_mat_entry(B, k, j)); + _fmpz_vec_dot_general(fmpz_mat_entry(C, i, j), + NULL, 0, fmpz_mat_entry(A, i, 0), tmp + j * br, 0, br); } } + + TMP_END; } } diff --git a/src/fmpz_mat/mul_classical_inline.c b/src/fmpz_mat/mul_classical_inline.c deleted file mode 100644 index 1626b54d8a..0000000000 --- a/src/fmpz_mat/mul_classical_inline.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - Copyright (C) 2011 Fredrik Johansson - - This file is part of FLINT. - - FLINT is free software: you can redistribute it and/or modify it under - the terms of the GNU Lesser General Public License (LGPL) as published - by the Free Software Foundation; either version 2.1 of the License, or - (at your option) any later version. See . -*/ - -#include "gmpcompat.h" -#include "fmpz.h" -#include "fmpz_mat.h" - -void -fmpz_mat_mul_classical_inline(fmpz_mat_t C, const fmpz_mat_t A, - const fmpz_mat_t B) -{ - slong ar, bc, br; - slong i, j, k; - - fmpz a, b; - mpz_t t; - - mp_limb_t au, bu; - mp_limb_t pos[3]; - mp_limb_t neg[3]; - - ar = A->r; - br = B->r; - bc = B->c; - - mpz_init(t); - - for (i = 0; i < ar; i++) - { - for (j = 0; j < bc; j++) - { - flint_mpz_set_ui(t, UWORD(0)); - - pos[2] = pos[1] = pos[0] = neg[2] = neg[1] = neg[0] = UWORD(0); - - for (k = 0; k < br; k++) - { - a = A->rows[i][k]; - b = B->rows[k][j]; - - if (a == 0 || b == 0) - continue; - - if (!COEFF_IS_MPZ(a)) /* a is small */ - { - if (!COEFF_IS_MPZ(b)) /* both are small */ - { - au = FLINT_ABS(a); - bu = FLINT_ABS(b); - - umul_ppmm(au, bu, au, bu); - - if ((a ^ b) >= WORD(0)) - add_sssaaaaaa(pos[2], pos[1], pos[0], - pos[2], pos[1], pos[0], 0, au, bu); - else - add_sssaaaaaa(neg[2], neg[1], neg[0], - neg[2], neg[1], neg[0], 0, au, bu); - } - else - { - if (a >= 0) - flint_mpz_addmul_ui(t, COEFF_TO_PTR(b), a); - else - flint_mpz_submul_ui(t, COEFF_TO_PTR(b), -a); - } - } - else if (!COEFF_IS_MPZ(b)) /* b is small */ - { - if (b >= 0) - flint_mpz_addmul_ui(t, COEFF_TO_PTR(a), b); - else - flint_mpz_submul_ui(t, COEFF_TO_PTR(a), -b); - } - else - { - mpz_addmul(t, COEFF_TO_PTR(a), COEFF_TO_PTR(b)); - } - } - - if (mpz_sgn(t) != 0 || pos[2] || neg[2] || pos[1] || neg[1]) - { - __mpz_struct r; - - r._mp_size = pos[2] ? 3 : (pos[1] ? 2 : pos[0] != 0); - r._mp_alloc = r._mp_size; - r._mp_d = pos; - - mpz_add(t, t, &r); - - r._mp_size = neg[2] ? 3 : (neg[1] ? 2 : neg[0] != 0); - r._mp_alloc = r._mp_size; - r._mp_d = neg; - - mpz_sub(t, t, &r); - - fmpz_set_mpz(fmpz_mat_entry(C, i, j), t); - } - else - { - if (neg[0] > pos[0]) - fmpz_neg_ui(fmpz_mat_entry(C, i, j), neg[0] - pos[0]); - else - fmpz_set_ui(fmpz_mat_entry(C, i, j), pos[0] - neg[0]); - } - } - } - - mpz_clear(t); -} diff --git a/src/fmpz_mat/profile/p-mul.c b/src/fmpz_mat/profile/p-mul.c index 3c1240e302..e999601a37 100644 --- a/src/fmpz_mat/profile/p-mul.c +++ b/src/fmpz_mat/profile/p-mul.c @@ -50,9 +50,6 @@ void sample(void * arg, ulong count) else if (algorithm == 1) for (i = 0; i < count; i++) fmpz_mat_mul_classical(C, A, B); - else if (algorithm == 2) - for (i = 0; i < count; i++) - fmpz_mat_mul_classical_inline(C, A, B); else if (algorithm == 3) for (i = 0; i < count; i++) fmpz_mat_mul_multi_mod(C, A, B); @@ -71,7 +68,7 @@ void sample(void * arg, ulong count) int main(void) { - double min_default, min_classical, min_inline, min_multi_mod, min_strassen, max; + double min_default, min_classical, min_multi_mod, min_strassen, max; mat_mul_t params; slong bits, dim; @@ -93,28 +90,22 @@ int main(void) params.algorithm = 1; prof_repeat(&min_classical, &max, sample, ¶ms); - params.algorithm = 2; - prof_repeat(&min_inline, &max, sample, ¶ms); - params.algorithm = 3; prof_repeat(&min_multi_mod, &max, sample, ¶ms); params.algorithm = 4; prof_repeat(&min_strassen, &max, sample, ¶ms); - flint_printf("dim = %wd default/classical/inline/multi_mod/strassen %.2f %.2f %.2f %.2f %.2f (us)\n", - dim, min_default, min_classical, min_inline, min_multi_mod, min_strassen); + flint_printf("dim = %wd default/classical/multi_mod/strassen %.2f %.2f %.2f %.2f (us)\n", + dim, min_default, min_classical, min_multi_mod, min_strassen); if (min_multi_mod < 0.6*min_default) flint_printf("BAD!\n"); - if (min_inline < 0.6*min_default) - flint_printf("BAD!\n"); - if (min_strassen < 0.7*min_default) flint_printf("BAD!\n"); - if (min_multi_mod < 0.7*min_inline) + if (min_multi_mod < 0.7*min_classical) break; } } diff --git a/src/fmpz_mat/profile/p-mul_double_word.c b/src/fmpz_mat/profile/p-mul_double_word.c index 2631b056af..38c11f354e 100644 --- a/src/fmpz_mat/profile/p-mul_double_word.c +++ b/src/fmpz_mat/profile/p-mul_double_word.c @@ -71,7 +71,7 @@ int main(void) if (dim < 150) { - fmpz_mat_mul_classical_inline(D, A, B); + fmpz_mat_mul_classical(D, A, B); if (!fmpz_mat_equal(D, E)) { diff --git a/src/fmpz_mat/profile/p-mul_multi_mod.c b/src/fmpz_mat/profile/p-mul_multi_mod.c index b6cf3167c9..135a4d1e4a 100644 --- a/src/fmpz_mat/profile/p-mul_multi_mod.c +++ b/src/fmpz_mat/profile/p-mul_multi_mod.c @@ -61,7 +61,7 @@ int main(void) if (dim < 150) { - fmpz_mat_mul_classical_inline(D, A, B); + fmpz_mat_mul_classical(D, A, B); if (!fmpz_mat_equal(D, E)) { diff --git a/src/fmpz_mat/test/t-mul.c b/src/fmpz_mat/test/t-mul.c index ba71570382..2456038115 100644 --- a/src/fmpz_mat/test/t-mul.c +++ b/src/fmpz_mat/test/t-mul.c @@ -55,7 +55,7 @@ TEST_FUNCTION_START(fmpz_mat_mul, state) fmpz_mat_randtest(C, state, n_randint(state, 200) + 1); fmpz_mat_mul(C, A, B); - fmpz_mat_mul_classical_inline(D, A, B); + fmpz_mat_mul_classical(D, A, B); if (!fmpz_mat_equal(C, D)) { @@ -169,8 +169,8 @@ TEST_FUNCTION_START(fmpz_mat_mul, state) if (!fmpz_mat_equal(A, B)) { flint_printf("FAIL: window aliasing failed\n"); - fmpz_mat_print(A); flint_printf("\n\n"); - fmpz_mat_print(B); flint_printf("\n\n"); + fmpz_mat_print(A); flint_printf("\n\n"); + fmpz_mat_print(B); flint_printf("\n\n"); fflush(stdout); flint_abort(); } diff --git a/src/fmpz_mat/test/t-mul_blas.c b/src/fmpz_mat/test/t-mul_blas.c index cb856966c6..d06b2a5c26 100644 --- a/src/fmpz_mat/test/t-mul_blas.c +++ b/src/fmpz_mat/test/t-mul_blas.c @@ -44,7 +44,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_blas, state) /* Make sure noise in the output is ok */ fmpz_mat_randtest(C, state, n_randint(state, 200) + 1); - fmpz_mat_mul_classical_inline(C, A, B); + fmpz_mat_mul_classical(C, A, B); if (fmpz_mat_mul_blas(D, A, B)) { if (!fmpz_mat_equal(C, D)) @@ -92,7 +92,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_blas, state) fmpz_mat_randtest(C, state, n_randint(state, 200) + 1); fmpz_mat_randtest(D, state, n_randint(state, 200) + 1); - fmpz_mat_mul_classical_inline(C, A, B); + fmpz_mat_mul_classical(C, A, B); if (fmpz_mat_mul_blas(D, A, B)) { if (!fmpz_mat_equal(C, D)) diff --git a/src/fmpz_mat/test/t-mul_classical.c b/src/fmpz_mat/test/t-mul_classical.c index 679ce9494e..e28bbab32c 100644 --- a/src/fmpz_mat/test/t-mul_classical.c +++ b/src/fmpz_mat/test/t-mul_classical.c @@ -19,6 +19,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_classical, state) for (i = 0; i < 100 * flint_test_multiplier(); i++) { slong m, n, k; + slong i, j, h; m = n_randint(state, 50); n = n_randint(state, 50); @@ -36,7 +37,13 @@ TEST_FUNCTION_START(fmpz_mat_mul_classical, state) fmpz_mat_randtest(C, state, n_randint(state, 200) + 1); fmpz_mat_mul_classical(C, A, B); - fmpz_mat_mul_classical_inline(D, A, B); + + for (i = 0; i < D->r; i++) + for (j = 0; j < D->c; j++) + for (h = 0; h < B->r; h++) + fmpz_addmul(fmpz_mat_entry(D, i, j), + fmpz_mat_entry(A, i, h), + fmpz_mat_entry(B, h, j)); if (!fmpz_mat_equal(C, D)) { diff --git a/src/fmpz_mat/test/t-mul_double_word.c b/src/fmpz_mat/test/t-mul_double_word.c index 0d63157a11..93d3f9b355 100644 --- a/src/fmpz_mat/test/t-mul_double_word.c +++ b/src/fmpz_mat/test/t-mul_double_word.c @@ -87,7 +87,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_double_word, state) fmpz_mat_randtest(C, state, n_randint(state, 200) + 1); _fmpz_mat_mul_double_word(C, A, B); - fmpz_mat_mul_classical_inline(D, A, B); + fmpz_mat_mul_classical(D, A, B); if (!fmpz_mat_equal(C, D)) { diff --git a/src/fmpz_mat/test/t-mul_fft.c b/src/fmpz_mat/test/t-mul_fft.c index dc60e0ac89..56fc9fabdf 100644 --- a/src/fmpz_mat/test/t-mul_fft.c +++ b/src/fmpz_mat/test/t-mul_fft.c @@ -38,7 +38,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_fft, state) fmpz_mat_randtest(C, state, n_randint(state, 2000) + 1); fmpz_mat_mul_fft(C, A, B); - fmpz_mat_mul_classical_inline(D, A, B); + fmpz_mat_mul_classical(D, A, B); if (!fmpz_mat_equal(C, D)) { diff --git a/src/fmpz_mat/test/t-mul_multi_mod.c b/src/fmpz_mat/test/t-mul_multi_mod.c index 07df123128..24ecf6098d 100644 --- a/src/fmpz_mat/test/t-mul_multi_mod.c +++ b/src/fmpz_mat/test/t-mul_multi_mod.c @@ -36,7 +36,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_multi_mod, state) /* Make sure noise in the output is ok */ fmpz_mat_randtest(C, state, n_randint(state, 200) + 1); - fmpz_mat_mul_classical_inline(C, A, B); + fmpz_mat_mul_classical(C, A, B); fmpz_mat_mul_multi_mod(D, A, B); if (!fmpz_mat_equal(C, D)) @@ -71,7 +71,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_multi_mod, state) /* Make sure noise in the output is ok */ fmpz_mat_randtest(C, state, n_randint(state, 200) + 1); - fmpz_mat_mul_classical_inline(C, A, B); + fmpz_mat_mul_classical(C, A, B); fmpz_mat_mul_multi_mod(D, A, B); if (!fmpz_mat_equal(C, D)) diff --git a/src/fmpz_mat/test/t-mul_small.c b/src/fmpz_mat/test/t-mul_small.c index 1db309bc00..2bda60b825 100644 --- a/src/fmpz_mat/test/t-mul_small.c +++ b/src/fmpz_mat/test/t-mul_small.c @@ -47,7 +47,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_small, state) fmpz_mat_randtest(D, state, n_randint(state, 200) + 1); _fmpz_mat_mul_small(C, A, B); - fmpz_mat_mul_classical_inline(D, A, B); + fmpz_mat_mul_classical(D, A, B); if (!fmpz_mat_equal(C, D)) { diff --git a/src/fmpz_mod_mat/mul_classical_threaded.c b/src/fmpz_mod_mat/mul_classical_threaded.c index dcab679382..c2679b9036 100644 --- a/src/fmpz_mod_mat/mul_classical_threaded.c +++ b/src/fmpz_mod_mat/mul_classical_threaded.c @@ -24,6 +24,17 @@ with op = 1, computes D = C + A*B with op = -1, computes D = C - A*B */ +static inline void +_fmpz_vec_dot_ptr(fmpz_t c, const fmpz * vec1, fmpz ** const vec2, + slong offset, slong len) +{ + slong i; + + fmpz_zero(c); + for (i = 0; i < len; i++) + fmpz_addmul(c, vec1 + i, vec2[i] + offset); +} + static inline void _fmpz_mod_mat_addmul_basic_op(fmpz ** D, fmpz ** const C, fmpz ** const A, fmpz ** const B, slong m, slong k, slong n, int op, fmpz_t p) diff --git a/src/fmpz_poly/2norm.c b/src/fmpz_poly/2norm.c index 2ec047c296..e0fc9dabe7 100644 --- a/src/fmpz_poly/2norm.c +++ b/src/fmpz_poly/2norm.c @@ -10,15 +10,13 @@ */ #include "fmpz.h" +#include "fmpz_vec.h" #include "fmpz_poly.h" void _fmpz_poly_2norm(fmpz_t res, const fmpz * poly, slong len) { - slong i; - fmpz_zero(res); - for (i = 0; i < len; i++) - fmpz_addmul(res, poly + i, poly + i); + _fmpz_vec_dot(res, poly, poly, len); fmpz_sqrt(res, res); } diff --git a/src/fmpz_poly/div_series_basecase.c b/src/fmpz_poly/div_series_basecase.c index b412f0ba88..479852d459 100644 --- a/src/fmpz_poly/div_series_basecase.c +++ b/src/fmpz_poly/div_series_basecase.c @@ -180,10 +180,9 @@ _fmpz_poly_div_series_basecase(fmpz * Q, const fmpz * A, slong Alen, } else { - fmpz_mul(Q + i, B + 1, Q + i - 1); - - for (j = 2; j < FLINT_MIN(i + 1, Blen); j++) - fmpz_addmul(Q + i, B + j, Q + i - j); + slong l = FLINT_MIN(i, Blen - 1); + /* todo: merge final subtraction */ + _fmpz_vec_dot_general(Q + i, NULL, 0, B + 1, Q + i - l, 1, l); } if (i < Alen) diff --git a/src/fmpz_poly/gcd_modular.c b/src/fmpz_poly/gcd_modular.c index fc6390e33f..c6dfb47d95 100644 --- a/src/fmpz_poly/gcd_modular.c +++ b/src/fmpz_poly/gcd_modular.c @@ -67,15 +67,12 @@ void _fmpz_poly_gcd_modular(fmpz * res, const fmpz * poly1, slong len1, if (len1 < 64 && len2 < 64) /* compute the squares of the 2-norms */ { - fmpz_set_ui(l, 0); - for (i = 0; i < len1; i++) - fmpz_addmul(l, A + i, A + i); + _fmpz_vec_dot(l, A, A, len1); nb1 = fmpz_bits(l); - fmpz_set_ui(l, 0); - for (i = 0; i < len2; i++) - fmpz_addmul(l, B + i, B + i); + _fmpz_vec_dot(l, B, B, len2); nb2 = fmpz_bits(l); - } else /* approximate to save time */ + } + else /* approximate to save time */ { nb1 = 2*bits1 + FLINT_BIT_COUNT(len1); nb2 = 2*bits2 + FLINT_BIT_COUNT(len2); diff --git a/src/fmpz_poly/inv_series.c b/src/fmpz_poly/inv_series.c index e9affd9771..eae17bf875 100644 --- a/src/fmpz_poly/inv_series.c +++ b/src/fmpz_poly/inv_series.c @@ -220,13 +220,8 @@ _fmpz_poly_inv_series_basecase(fmpz * Qinv, const fmpz * Q, slong Qlen, slong n) } else { - fmpz_mul(Qinv + i, Q + 1, Qinv + i - 1); - - for (j = 2; j < FLINT_MIN(i + 1, Qlen); j++) - fmpz_addmul(Qinv + i, Q + j, Qinv + i - j); - - if (neg) - fmpz_neg(Qinv + i, Qinv + i); + slong l = FLINT_MIN(i, Qlen - 1); + _fmpz_vec_dot_general(Qinv + i, NULL, neg, Q + 1, Qinv + i - l, 1, l); } } diff --git a/src/fmpz_poly/mul_classical.c b/src/fmpz_poly/mul_classical.c index 844ddc135d..71d5e36fc1 100644 --- a/src/fmpz_poly/mul_classical.c +++ b/src/fmpz_poly/mul_classical.c @@ -1,5 +1,6 @@ /* Copyright (C) 2008, 2009 William Hart + Copyright (C) 2024 Fredrik Johansson This file is part of FLINT. @@ -18,26 +19,37 @@ void _fmpz_poly_mul_classical(fmpz * res, const fmpz * poly1, slong len1, const fmpz * poly2, slong len2) { - if (len1 == 1 && len2 == 1) /* Special case if the length of both inputs is 1 */ + slong i, top1, top2; + + if (len1 == 1 && len2 == 1) { fmpz_mul(res, poly1, poly2); + return; } - else /* Ordinary case */ + + if (len1 == 1) { - slong i; + _fmpz_vec_scalar_mul_fmpz(res, poly2, len2, poly1); + return; + } - /* Set res[i] = poly1[i]*poly2[0] */ + if (len2 == 1) + { _fmpz_vec_scalar_mul_fmpz(res, poly1, len1, poly2); + return; + } - /* Set res[i+len1-1] = in1[len1-1]*in2[i] */ - _fmpz_vec_scalar_mul_fmpz(res + len1, poly2 + 1, len2 - 1, - poly1 + len1 - 1); + fmpz_mul(res, poly1, poly2); - /* out[i+j] += in1[i]*in2[j] */ - for (i = 0; i < len1 - 1; i++) - _fmpz_vec_scalar_addmul_fmpz(res + i + 1, poly2 + 1, len2 - 1, - poly1 + i); + for (i = 1; i < len1 + len2 - 2; i++) + { + top1 = FLINT_MIN(len1 - 1, i); + top2 = FLINT_MIN(len2 - 1, i); + + _fmpz_vec_dot_general(res + i, NULL, 0, poly1 + i - top2, poly2 + i - top1, 1, top1 + top2 - i + 1); } + + fmpz_mul(res + len1 + len2 - 2, poly1 + len1 - 1, poly2 + len2 - 1); } void diff --git a/src/fmpz_poly/mullow_classical.c b/src/fmpz_poly/mullow_classical.c index 94553b1bc1..bcfa94e3e3 100644 --- a/src/fmpz_poly/mullow_classical.c +++ b/src/fmpz_poly/mullow_classical.c @@ -1,5 +1,6 @@ /* Copyright (C) 2008, 2009 William Hart + Copyright (C) 2024 Fredrik Johansson This file is part of FLINT. @@ -20,27 +21,37 @@ void _fmpz_poly_mullow_classical(fmpz * res, const fmpz * poly1, slong len1, const fmpz * poly2, slong len2, slong n) { - if ((len1 == 1 && len2 == 1) || n == 1) /* Special case if the length of output is 1 */ + slong i, top1, top2; + + len1 = FLINT_MIN(len1, n); + len2 = FLINT_MIN(len2, n); + + if (n == 1) { fmpz_mul(res, poly1, poly2); + return; } - else /* Ordinary case */ + + if (len1 == 1) { - slong i; + _fmpz_vec_scalar_mul_fmpz(res, poly2, n, poly1); + return; + } - /* Set res[i] = poly1[i]*poly2[0] */ - _fmpz_vec_scalar_mul_fmpz(res, poly1, FLINT_MIN(len1, n), poly2); + if (len2 == 1) + { + _fmpz_vec_scalar_mul_fmpz(res, poly1, n, poly2); + return; + } - /* Set res[i+len1-1] = in1[len1-1]*in2[i] */ - if (n > len1) - _fmpz_vec_scalar_mul_fmpz(res + len1, poly2 + 1, n - len1, - poly1 + len1 - 1); + fmpz_mul(res, poly1, poly2); + + for (i = 1; i < n; i++) + { + top1 = FLINT_MIN(len1 - 1, i); + top2 = FLINT_MIN(len2 - 1, i); - /* out[i+j] += in1[i]*in2[j] */ - for (i = 0; i < FLINT_MIN(len1, n) - 1; i++) - _fmpz_vec_scalar_addmul_fmpz(res + i + 1, poly2 + 1, - FLINT_MIN(len2, n - i) - 1, - poly1 + i); + _fmpz_vec_dot_general(res + i, NULL, 0, poly1 + i - top2, poly2 + i - top1, 1, top1 + top2 - i + 1); } } diff --git a/src/fmpz_poly/power_sums_naive.c b/src/fmpz_poly/power_sums_naive.c index 16a6c5ccaa..a1fee2d1e1 100644 --- a/src/fmpz_poly/power_sums_naive.c +++ b/src/fmpz_poly/power_sums_naive.c @@ -10,28 +10,24 @@ */ #include "fmpz.h" +#include "fmpz_vec.h" #include "fmpz_poly.h" void _fmpz_poly_power_sums_naive(fmpz * res, const fmpz * poly, slong len, slong n) { - slong i, k; + slong k; fmpz_set_ui(res, len - 1); + for (k = 1; k < FLINT_MIN(n, len); k++) { - fmpz_mul_ui(res + k, poly + len - 1 - k, k); - for (i = 1; i < k; i++) - fmpz_addmul(res + k, poly + len - 1 - k + i, res + i); - fmpz_neg(res + k, res + k); + fmpz_mul_si(res + k, poly + len - 1 - k, -k); + _fmpz_vec_dot_general(res + k, res + k, 1, poly + len - 1 - k + 1, res + 1, 0, k - 1); } + for (k = len; k < n; k++) - { - fmpz_zero(res + k); - for (i = k - len + 1; i < k; i++) - fmpz_addmul(res + k, poly + len - 1 - k + i, res + i); - fmpz_neg(res + k, res + k); - } + _fmpz_vec_dot_general(res + k, NULL, 1, poly, res + k - len + 1, 0, len - 1); } void diff --git a/src/fmpz_poly/power_sums_to_poly.c b/src/fmpz_poly/power_sums_to_poly.c index df00b67ea4..b1cfeb17a0 100644 --- a/src/fmpz_poly/power_sums_to_poly.c +++ b/src/fmpz_poly/power_sums_to_poly.c @@ -10,30 +10,25 @@ */ #include "fmpz.h" +#include "fmpz_vec.h" #include "fmpz_poly.h" void _fmpz_poly_power_sums_to_poly(fmpz * res, const fmpz * poly, slong len) { - slong i, k; + slong k; slong d = fmpz_get_ui(poly); fmpz_one(res + d); for (k = 1; k < FLINT_MIN(d + 1, len); k++) { - fmpz_set(res + d - k, poly + k); - for (i = 1; i < k; i++) - fmpz_addmul(res + d - k, res + d - k + i, poly + i); - fmpz_divexact_si(res + d - k, res + d - k, k); - fmpz_neg(res + d - k, res + d - k); + _fmpz_vec_dot_general(res + d - k, poly + k, 0, res + d - k + 1, poly + 1, 0, k - 1); + fmpz_divexact_si(res + d - k, res + d - k, -k); } for (k = len; k <= d; k++) { - fmpz_zero(res + d - k); - for (i = 1; i < len; i++) - fmpz_addmul(res + d - k, res + d - k + i, poly + i); - fmpz_divexact_si(res + d - k, res + d - k, k); - fmpz_neg(res + d - k, res + d - k); + _fmpz_vec_dot_general(res + d - k, NULL, 0, res + d - k + 1, poly + 1, 0, len - 1); + fmpz_divexact_si(res + d - k, res + d - k, -k); } } diff --git a/src/fmpz_poly/resultant_modular.c b/src/fmpz_poly/resultant_modular.c index 3ed5bf4f6c..8ea6c4c6e9 100644 --- a/src/fmpz_poly/resultant_modular.c +++ b/src/fmpz_poly/resultant_modular.c @@ -68,10 +68,8 @@ void _fmpz_poly_resultant_modular(fmpz_t res, const fmpz * poly1, slong len1, fmpz_init(b1); fmpz_init(b2); - for (i = 0; i < len1; i++) - fmpz_addmul(b1, A + i, A + i); - for (i = 0; i < len2; i++) - fmpz_addmul(b2, B + i, B + i); + _fmpz_vec_dot(b1, A, A, len1); + _fmpz_vec_dot(b2, B, B, len2); fmpz_pow_ui(b1, b1, len2 - 1); fmpz_pow_ui(b2, b2, len1 - 1); diff --git a/src/fmpz_poly/sqr_classical.c b/src/fmpz_poly/sqr_classical.c index d9b112b6ae..2352dfd719 100644 --- a/src/fmpz_poly/sqr_classical.c +++ b/src/fmpz_poly/sqr_classical.c @@ -1,6 +1,7 @@ /* Copyright (C) 2008, 2009 William Hart Copyright (C) 2011 Sebastian Pancratz + Copyright (C) 2024 Fredrik Johansson This file is part of FLINT. @@ -15,29 +16,39 @@ #include "fmpz_poly.h" /* Assumes len > 0. */ -void _fmpz_poly_sqr_classical(fmpz *rop, const fmpz *op, slong len) +void _fmpz_poly_sqr_classical(fmpz *res, const fmpz *op, slong len) { - if (len == 1) /* Special case */ + slong i, start, stop; + + if (len == 1) { - fmpz_mul(rop, op, op); + fmpz_mul(res, op, op); + return; } - else /* Ordinary case */ - { - slong i; - _fmpz_vec_scalar_mul_fmpz(rop, op, len, op); + fmpz_mul(res, op, op); + fmpz_mul(res + 1, op, op + 1); + fmpz_mul_2exp(res + 1, res + 1, 1); - _fmpz_vec_scalar_mul_fmpz(rop + len, op + 1, len - 1, op + len - 1); + for (i = 2; i < 2 * len - 3; i++) + { + start = FLINT_MAX(0, i - len + 1); + stop = FLINT_MIN(len - 1, (i + 1) / 2 - 1); - for (i = 1; i < len - 1; i++) - _fmpz_vec_scalar_addmul_fmpz(rop + i + 1, op + 1, i - 1, op + i); + _fmpz_vec_dot_general(res + i, NULL, 0, op + start, op + i - stop, 1, stop - start + 1); + fmpz_mul_2exp(res + i, res + i, 1); - for (i = 1; i < 2 * len - 2; i++) - fmpz_mul_ui(rop + i, rop + i, 2); + if (i % 2 == 0 && i / 2 < len) + fmpz_addmul(res + i, op + i / 2, op + i / 2); + } - for (i = 1; i < len - 1; i++) - fmpz_addmul(rop + 2 * i, op + i, op + i); + if (len > 2) + { + fmpz_mul(res + 2 * len - 3, op + len - 1, op + len - 2); + fmpz_mul_2exp(res + 2 * len - 3, res + 2 * len - 3, 1); } + + fmpz_mul(res + 2 * len - 2, op + len - 1, op + len - 1); } void fmpz_poly_sqr_classical(fmpz_poly_t rop, const fmpz_poly_t op) diff --git a/src/fmpz_poly/sqrlow_classical.c b/src/fmpz_poly/sqrlow_classical.c index adf9f1afef..c55390fa02 100644 --- a/src/fmpz_poly/sqrlow_classical.c +++ b/src/fmpz_poly/sqrlow_classical.c @@ -1,5 +1,6 @@ /* Copyright (C) 2011 Sebastian Pancratz + Copyright (C) 2024 Fredrik Johansson This file is part of FLINT. @@ -16,30 +17,42 @@ /* Assumes len > 0 and 0 < n <= 2 * len - 1. */ -void _fmpz_poly_sqrlow_classical(fmpz *rop, const fmpz *op, slong len, slong n) +void _fmpz_poly_sqrlow_classical(fmpz *res, const fmpz *op, slong len, slong n) { - if (len == 1 || n == 1) /* Special case */ + slong i, start, stop; + + len = FLINT_MIN(len, n); + + if (n == 1) { - fmpz_mul(rop, op, op); + fmpz_mul(res, op, op); + return; } - else /* Ordinary case */ - { - slong i; - _fmpz_vec_scalar_mul_fmpz(rop, op, FLINT_MIN(len, n), op); + fmpz_mul(res, op, op); + fmpz_mul(res + 1, op, op + 1); + fmpz_mul_2exp(res + 1, res + 1, 1); - _fmpz_vec_scalar_mul_fmpz(rop + len, op + 1, n - len, op + len - 1); + for (i = 2; i < FLINT_MIN(n, 2 * len - 3); i++) + { + start = FLINT_MAX(0, i - len + 1); + stop = FLINT_MIN(len - 1, (i + 1) / 2 - 1); - for (i = 1; i < len - 1; i++) - _fmpz_vec_scalar_addmul_fmpz(rop + i + 1, - op + 1, FLINT_MIN(i - 1, n - (i + 1)), op + i); + _fmpz_vec_dot_general(res + i, NULL, 0, op + start, op + i - stop, 1, stop - start + 1); + fmpz_mul_2exp(res + i, res + i, 1); - for (i = 1; i < FLINT_MIN(2 * len - 2, n); i++) - fmpz_mul_ui(rop + i, rop + i, 2); + if (i % 2 == 0 && i / 2 < len) + fmpz_addmul(res + i, op + i / 2, op + i / 2); + } - for (i = 1; i < FLINT_MIN(len - 1, (n + 1) / 2); i++) - fmpz_addmul(rop + 2 * i, op + i, op + i); + if (len > 2 && n >= 2 * len - 2) + { + fmpz_mul(res + 2 * len - 3, op + len - 1, op + len - 2); + fmpz_mul_2exp(res + 2 * len - 3, res + 2 * len - 3, 1); } + + if (n >= 2 * len - 1) + fmpz_mul(res + 2 * len - 2, op + len - 1, op + len - 1); } void diff --git a/src/fmpz_vec.h b/src/fmpz_vec.h index 8250567429..4606e43c60 100644 --- a/src/fmpz_vec.h +++ b/src/fmpz_vec.h @@ -256,10 +256,16 @@ void _fmpz_vec_lcm(fmpz_t res, const fmpz * vec, slong len); /* Dot product *************************************************************/ -void _fmpz_vec_dot(fmpz_t res, const fmpz * vec1, const fmpz * vec2, slong len2); +void _fmpz_vec_dot_general_naive(fmpz_t res, const fmpz_t initial, int subtract, + const fmpz * a, const fmpz * b, int reverse, slong len); +void _fmpz_vec_dot_general(fmpz_t res, const fmpz_t initial, int subtract, + const fmpz * a, const fmpz * b, int reverse, slong len); -void _fmpz_vec_dot_ptr(fmpz_t c, const fmpz * vec1, - fmpz ** const vec2, slong offset, slong len); +FMPZ_VEC_INLINE +void _fmpz_vec_dot(fmpz_t res, const fmpz * vec1, const fmpz * vec2, slong len2) +{ + _fmpz_vec_dot_general(res, NULL, 0, vec1, vec2, 0, len2); +} #ifdef __cplusplus } diff --git a/src/fmpz_vec/dot.c b/src/fmpz_vec/dot.c index 5c0e1ca75a..548c71bf2a 100644 --- a/src/fmpz_vec/dot.c +++ b/src/fmpz_vec/dot.c @@ -1,7 +1,5 @@ /* - Copyright (C) 2010, 2020 William Hart - Copyright (C) 2010, 2011 Fredrik Johansson - Copyright (C) 2014 Abhinav Baid + Copyright (C) 2024 Fredrik Johansson This file is part of FLINT. @@ -11,26 +9,394 @@ (at your option) any later version. See . */ +#include "mpn_extras.h" #include "fmpz.h" +#include "fmpz_extras.h" #include "fmpz_vec.h" void -_fmpz_vec_dot(fmpz_t res, const fmpz * vec1, const fmpz * vec2, slong len2) +_fmpz_vec_dot_general_naive(fmpz_t res, const fmpz_t initial, + int subtract, const fmpz * a, const fmpz * b, int reverse, slong len) { slong i; - fmpz_zero(res); - for (i = 0; i < len2; i++) - fmpz_addmul(res, vec1 + i, vec2 + i); + if (initial == NULL) + { + if (len == 0) + { + fmpz_zero(res); + return; + } + + fmpz_mul(res, a, reverse ? b + len - 1 : b); + + if (subtract) + { + fmpz_neg(res, res); + for (i = 1; i < len; i++) + fmpz_submul(res, a + i, reverse ? b + len - 1 - i : b + i); + } + else + { + for (i = 1; i < len; i++) + fmpz_addmul(res, a + i, reverse ? b + len - 1 - i : b + i); + } + } + else + { + if (res != initial) + fmpz_set(res, initial); + + if (subtract) + for(i = 0; i < len; i++) + fmpz_submul(res, a + i, reverse ? b + len - 1 - i : b + i); + else + for(i = 0; i < len; i++) + fmpz_addmul(res, a + i, reverse ? b + len - 1 - i : b + i); + } +} + +#define INITIAL_ALLOC 64 + +#define FMPZ_GET_MPN(xptr, xn, xneg, xtmplimb, x) \ + if (!COEFF_IS_MPZ(x)) \ + { \ + (xtmplimb) = FLINT_ABS(x); \ + (xptr) = &(xtmplimb); \ + (xn) = 1; \ + (xneg) = ((x) < 0); \ + } \ + else \ + { \ + __mpz_struct * __z1 = COEFF_TO_PTR(x); \ + (xptr) = __z1->_mp_d; \ + (xn) = FLINT_ABS(__z1->_mp_size); \ + (xneg) = (__z1->_mp_size < 0); \ + } + +/* (s,sn) = (a,an) + (b,bn). Allows an == 0 but not bn == 0. */ +#define MPN_ADD(s, sn, a, an, b, bn) \ + do { \ + if ((an) == 0) \ + { \ + FLINT_SWAP(mp_ptr, s, b); \ + (sn) = (bn); \ + } \ + else if ((an) >= (bn)) \ + { \ + mp_limb_t __cy; \ + (s)[(an)] = __cy = mpn_add((s), (a), (an), (b), (bn)); \ + (sn) = (an) + (__cy != 0); \ + } \ + else \ + { \ + mp_limb_t __cy; \ + (s)[(bn)] = __cy = mpn_add((s), (b), (bn), (a), (an)); \ + (sn) = (bn) + (__cy != 0); \ + } \ + } while (0) + +/* (s,sn) = (s,sn) + (a,an) * b. Allows sn == 0 but not an == 0. */ +#define MPN_ADDMUL_1(s, sn, a, an, b) \ + do { \ + mp_limb_t __cy; \ + if ((sn) >= (an)) \ + { \ + FLINT_ASSERT((an) != 0); \ + __cy = mpn_addmul_1((s), (a), (an), (b)); \ + if ((sn) > (an)) \ + __cy = mpn_add_1((s) + (an), (s) + (an), (sn) - (an), __cy); \ + (s)[(sn)] = __cy; \ + (sn) += (__cy != 0); \ + } \ + else \ + { \ + (s)[(an)] = mpn_mul_1((s) + (sn), (a) + (sn), (an) - (sn), (b)); \ + if ((sn) != 0) \ + { \ + __cy = mpn_addmul_1((s), (a), (sn), (b)); \ + (s)[(an)] += mpn_add_1((s) + (sn), (s) + (sn), (an) - (sn), __cy); \ + } \ + (sn) = (an) + ((s)[(an)] != 0); \ + } \ + } while (0) \ + + +FLINT_STATIC_NOINLINE +void _fmpz_set_mpn(fmpz_t res, mp_srcptr x, mp_size_t xn, int neg) +{ + if (xn <= 1 && x[0] <= COEFF_MAX) + { + _fmpz_demote(res); + *res = neg ? -(slong) x[0] : x[0]; + } + else + { + fmpz_set_mpn_large(res, x, xn, neg); + } } void -_fmpz_vec_dot_ptr(fmpz_t c, const fmpz * vec1, fmpz ** const vec2, - slong offset, slong len) +_fmpz_vec_dot_general(fmpz_t res, const fmpz_t initial, int subtract, + const fmpz * a, const fmpz * b, int reverse, slong len) { + mp_limb_t tmp1[INITIAL_ALLOC + 2]; + mp_limb_t tmp2[INITIAL_ALLOC + 2]; + mp_limb_t tmp3[INITIAL_ALLOC + 2]; + mp_size_t alloc = INITIAL_ALLOC; + mp_size_t new_alloc; + + /* We maintain separate sums for small terms, large positive terms, + and large negative terms, the idea being to have fewer + adjustments in the main loop in exchange for some added + complexity combining things in the end. Should profile + alternative strategies. */ + mp_limb_t s0 = 0, s1 = 0, s2 = 0; + mp_ptr neg = tmp1; + mp_ptr pos = tmp2; + mp_size_t posn = 0, negn = 0; + + /* Temporary space for products. */ + mp_ptr t = tmp3; + mp_size_t tn; + + mp_ptr tmp_heap = NULL; + slong i; - fmpz_zero(c); + if (len <= 1) + { + if (initial == NULL) + { + if (len == 1) + { + fmpz_mul(res, a, b); + if (subtract) + fmpz_neg(res, res); + } + else + fmpz_zero(res); + } + else + { + if (res != initial) + fmpz_set(res, initial); + + if (subtract) + { + if (len == 1) + fmpz_submul(res, a, b); + } + else + { + if (len == 1) + fmpz_addmul(res, a, b); + } + } + return; + } + + if (initial != NULL) + { + fmpz ca; + mp_limb_t atmp; + mp_srcptr ap; + mp_size_t an; + int aneg; + + ca = *initial; + FMPZ_GET_MPN(ap, an, aneg, atmp, ca); + + if (an <= 2) + { + s0 = ap[0]; + if (an == 2) + s1 = ap[1]; + + if (aneg ^ subtract) + sub_dddmmmsss(s2, s1, s0, 0, 0, 0, 0, s1, s0); + } + else + { + if (an > INITIAL_ALLOC) + { + new_alloc = an + 4; + + tmp_heap = flint_malloc(3 * (new_alloc + 2) * sizeof(mp_limb_t)); + + t = tmp_heap; + pos = t + (new_alloc + 2); + neg = pos + (new_alloc + 2); + + alloc = new_alloc; + } + + if (aneg ^ subtract) + { + flint_mpn_copyi(neg, ap, an); + negn = an; + } + else + { + flint_mpn_copyi(pos, ap, an); + posn = an; + } + } + } + for (i = 0; i < len; i++) - fmpz_addmul(c, vec1 + i, vec2[i] + offset); + { + fmpz ca, cb; + mp_limb_t atmp, btmp; + mp_srcptr ap, bp; + mp_size_t an, bn; + mp_limb_t cy; + int aneg, bneg; + + ca = a[i]; + if (ca == 0) + continue; + + cb = reverse ? b[len - 1 - i] : b[i]; + if (cb == 0) + continue; + + if (!COEFF_IS_MPZ(ca) && !COEFF_IS_MPZ(cb)) + { + mp_limb_t hi, lo; + smul_ppmm(hi, lo, ca, cb); + add_sssaaaaaa(s2, s1, s0, s2, s1, s0, FLINT_SIGN_EXT(hi), hi, lo); + continue; + } + + FMPZ_GET_MPN(ap, an, aneg, atmp, ca); + FMPZ_GET_MPN(bp, bn, bneg, btmp, cb); + tn = an + bn; + + if (tn > alloc) + { + mp_ptr p1, p2, p3; + + new_alloc = FLINT_MAX(3 * alloc / 2, tn + 4); + + p1 = flint_malloc(3 * (new_alloc + 2) * sizeof(mp_limb_t)); + p2 = p1 + (new_alloc + 2); + p3 = p2 + (new_alloc + 2); + + flint_mpn_copyi(p2, pos, posn); + flint_mpn_copyi(p3, neg, negn); + t = p1; + pos = p2; + neg = p3; + + FLINT_SWAP(mp_ptr, tmp_heap, p1); + + if (p1 != NULL) + flint_free(p1); + + alloc = new_alloc; + } + + if (an < bn) + { + FLINT_SWAP(mp_srcptr, ap, bp); + FLINT_SWAP(mp_size_t, an, bn); + } + + if (bn == 1) + { + mp_limb_t b0 = bp[0]; + + if (aneg ^ bneg) + MPN_ADDMUL_1(neg, negn, ap, an, b0); + else + MPN_ADDMUL_1(pos, posn, ap, an, b0); + continue; + } + + if (ap == bp && an == bn) + { + flint_mpn_sqr(t, ap, an); + cy = t[tn - 1]; + } + else + { + cy = flint_mpn_mul(t, ap, an, bp, bn); + } + + tn -= (cy == 0); + + if (aneg ^ bneg) + MPN_ADD(neg, negn, neg, negn, t, tn); + else + MPN_ADD(pos, posn, pos, posn, t, tn); + } + + /* There are only small terms. */ + if (posn == 0 && negn == 0) + { + if (subtract) + sub_dddmmmsss(s2, s1, s0, 0, 0, 0, s2, s1, s0); + + fmpz_set_signed_uiuiui(res, s2, s1, s0); + return; + } + + /* Add small terms to large ones. */ + if (((slong) s2 >= WORD(0))) + { + t[2] = s2; + t[1] = s1; + t[0] = s0; + MPN_ADD(pos, posn, pos, posn, t, 3); + } + else + { + sub_dddmmmsss(t[2], t[1], t[0], 0, 0, 0, s2, s1, s0); + MPN_ADD(neg, negn, neg, negn, t, 3); + } + + MPN_NORM(pos, posn); + MPN_NORM(neg, negn); + + if (negn == 0) + { + _fmpz_set_mpn(res, pos, posn, 0 ^ subtract); + } + else if (posn == 0) + { + _fmpz_set_mpn(res, neg, negn, 1 ^ subtract); + } + else + { + /* Do subtraction */ + int tneg = 0; + + if (posn > negn) + { + tn = posn; + } + else if (negn > posn) + { + tn = negn; + tneg = 1; + } + else if (posn != 0) + { + tn = posn; + if (mpn_cmp(pos, neg, tn) < 0) + tneg = 1; + } + + if (tneg) + mpn_sub(t, neg, negn, pos, posn); + else + mpn_sub(t, pos, posn, neg, negn); + + MPN_NORM(t, tn); + _fmpz_set_mpn(res, t, tn, tneg ^ subtract); + } + + if (tmp_heap != NULL) + flint_free(tmp_heap); } diff --git a/src/fmpz_vec/test/main.c b/src/fmpz_vec/test/main.c index 9a4681abc3..3fb5d04c70 100644 --- a/src/fmpz_vec/test/main.c +++ b/src/fmpz_vec/test/main.c @@ -17,6 +17,7 @@ #include "t-add.c" #include "t-content.c" #include "t-dot.c" +#include "t-dot_general.c" #include "t-get_d_vec_2exp.c" #include "t-get_set_fft.c" #include "t-get_set_nmod_vec.c" @@ -61,6 +62,7 @@ test_struct tests[] = TEST_FUNCTION(fmpz_vec_add), TEST_FUNCTION(fmpz_vec_content), TEST_FUNCTION(fmpz_vec_dot), + TEST_FUNCTION(fmpz_vec_dot_general), TEST_FUNCTION(fmpz_vec_get_d_vec_2exp), TEST_FUNCTION(fmpz_vec_get_set_fft), TEST_FUNCTION(fmpz_vec_get_set_nmod_vec), diff --git a/src/fmpz_vec/test/t-dot_general.c b/src/fmpz_vec/test/t-dot_general.c new file mode 100644 index 0000000000..d78baef966 --- /dev/null +++ b/src/fmpz_vec/test/t-dot_general.c @@ -0,0 +1,82 @@ +/* + Copyright (C) 2024 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. See . +*/ + +#include "test_helpers.h" +#include "fmpz.h" +#include "fmpz_vec.h" + +TEST_FUNCTION_START(fmpz_vec_dot_general, state) +{ + int iter; + + for (iter = 0; iter < 100000 * flint_test_multiplier(); iter++) + { + fmpz * a, * b; + fmpz_t s, t, c; + slong n, bits1, bits2; + int alias, negate, initial, reverse; + + initial = n_randint(state, 2); + alias = n_randint(state, 2); + negate = n_randint(state, 2); + reverse = n_randint(state, 2); + n = n_randint(state, 8); + + if (n_randint(state, 30) == 0) + bits1 = 2 + n_randint(state, 20000); + else + bits1 = 2 + n_randint(state, 1000); + + bits2 = 2 + n_randint(state, 1000); + + a = _fmpz_vec_init(n); + b = _fmpz_vec_init(n); + fmpz_init(s); + fmpz_init(t); + fmpz_init(c); + + fmpz_randtest(c, state, bits1); + _fmpz_vec_randtest(a, state, n, bits1); + _fmpz_vec_randtest(b, state, n, bits2); + + if (initial && alias) + { + fmpz_set(s, c); + fmpz_set(t, c); + _fmpz_vec_dot_general(s, s, negate, a, b, reverse, n); + _fmpz_vec_dot_general_naive(t, t, negate, a, b, reverse, n); + } + else + { + _fmpz_vec_dot_general(s, initial ? c : NULL, negate, a, b, reverse, n); + _fmpz_vec_dot_general_naive(t, initial ? c : NULL, negate, a, b, reverse, n); + } + + if (!fmpz_equal(s, t) || !_fmpz_is_canonical(s)) + { + flint_printf("negate = %d, initial = %d, reverse = %d, alias = %d\n", negate, initial, reverse, alias); + flint_printf("c = %{fmpz}\n\n", c); + flint_printf("a = %{fmpz*}\n\n", a, n); + flint_printf("b = %{fmpz*}\n\n", b, n); + flint_printf("s = %{fmpz}\n\n", s); + flint_printf("t = %{fmpz}\n\n", t); + flint_abort(); + } + + fmpz_clear(s); + fmpz_clear(t); + fmpz_clear(c); + _fmpz_vec_clear(a, n); + _fmpz_vec_clear(b, n); + } + + TEST_FUNCTION_END(state); +} diff --git a/src/gr/fmpz.c b/src/gr/fmpz.c index 80ec5dbb1c..d24743ce57 100644 --- a/src/gr/fmpz.c +++ b/src/gr/fmpz.c @@ -15,6 +15,7 @@ #include "fmpz_factor.h" #include "fmpz_poly.h" #include "fmpz_poly_factor.h" +#include "fmpz_vec.h" #include "fmpz_mat.h" #include "fmpq.h" #include "gr.h" @@ -917,74 +918,14 @@ _gr_fmpz_vec_sum(fmpz_t res, const fmpz * vec, slong len, gr_ctx_t ctx) int _gr_fmpz_vec_dot(fmpz_t res, const fmpz_t initial, int subtract, const fmpz * vec1, const fmpz * vec2, slong len, gr_ctx_t ctx) { - slong i; - - if (len <= 0) - { - if (initial == NULL) - fmpz_zero(res); - else - fmpz_set(res, initial); - return GR_SUCCESS; - } - - if (initial == NULL) - { - fmpz_mul(res, vec1, vec2); - } - else - { - if (subtract) - fmpz_neg(res, initial); - else - fmpz_set(res, initial); - - fmpz_addmul(res, vec1, vec2); - } - - for (i = 1; i < len; i++) - fmpz_addmul(res, vec1 + i, vec2 + i); - - if (subtract) - fmpz_neg(res, res); - + _fmpz_vec_dot_general(res, initial, subtract, vec1, vec2, 0, len); return GR_SUCCESS; } int _gr_fmpz_vec_dot_rev(fmpz_t res, const fmpz_t initial, int subtract, const fmpz * vec1, const fmpz * vec2, slong len, gr_ctx_t ctx) { - slong i; - - if (len <= 0) - { - if (initial == NULL) - fmpz_zero(res); - else - fmpz_set(res, initial); - return GR_SUCCESS; - } - - if (initial == NULL) - { - fmpz_mul(res, vec1, vec2 + len - 1); - } - else - { - if (subtract) - fmpz_neg(res, initial); - else - fmpz_set(res, initial); - - fmpz_addmul(res, vec1, vec2 + len - 1); - } - - for (i = 1; i < len; i++) - fmpz_addmul(res, vec1 + i, vec2 + len - 1 - i); - - if (subtract) - fmpz_neg(res, res); - + _fmpz_vec_dot_general(res, initial, subtract, vec1, vec2, 1, len); return GR_SUCCESS; } diff --git a/src/gr/fmpz_mod.c b/src/gr/fmpz_mod.c index 99101bc6d6..58e3b97e3a 100644 --- a/src/gr/fmpz_mod.c +++ b/src/gr/fmpz_mod.c @@ -11,6 +11,7 @@ #include "fmpz.h" #include "fmpz_factor.h" +#include "fmpz_vec.h" #include "fmpz_mod.h" #include "fmpz_mod_mat.h" #include "fmpz_mod_poly.h" @@ -403,8 +404,6 @@ _gr_fmpz_mod_pow_fmpz(fmpz_t res, const fmpz_t x, const fmpz_t exp, const gr_ctx int _gr_fmpz_mod_vec_dot(fmpz_t res, const fmpz_t initial, int subtract, const fmpz * vec1, const fmpz * vec2, slong len, gr_ctx_t ctx) { - slong i; - if (len <= 0) { if (initial == NULL) @@ -414,28 +413,8 @@ _gr_fmpz_mod_vec_dot(fmpz_t res, const fmpz_t initial, int subtract, const fmpz return GR_SUCCESS; } - if (initial == NULL) - { - fmpz_mul(res, vec1, vec2); - } - else - { - if (subtract) - fmpz_neg(res, initial); - else - fmpz_set(res, initial); - - fmpz_addmul(res, vec1, vec2); - } - - for (i = 1; i < len; i++) - fmpz_addmul(res, vec1 + i, vec2 + i); - - if (subtract) - fmpz_neg(res, res); - + _fmpz_vec_dot_general(res, initial, subtract, vec1, vec2, 0, len); fmpz_mod_set_fmpz(res, res, FMPZ_MOD_CTX(ctx)); - return GR_SUCCESS; } @@ -443,8 +422,6 @@ _gr_fmpz_mod_vec_dot(fmpz_t res, const fmpz_t initial, int subtract, const fmpz int _gr_fmpz_mod_vec_dot_rev(fmpz_t res, const fmpz_t initial, int subtract, const fmpz * vec1, const fmpz * vec2, slong len, gr_ctx_t ctx) { - slong i; - if (len <= 0) { if (initial == NULL) @@ -454,28 +431,8 @@ _gr_fmpz_mod_vec_dot_rev(fmpz_t res, const fmpz_t initial, int subtract, const f return GR_SUCCESS; } - if (initial == NULL) - { - fmpz_mul(res, vec1, vec2 + len - 1); - } - else - { - if (subtract) - fmpz_neg(res, initial); - else - fmpz_set(res, initial); - - fmpz_addmul(res, vec1, vec2 + len - 1); - } - - for (i = 1; i < len; i++) - fmpz_addmul(res, vec1 + i, vec2 + len - 1 - i); - - if (subtract) - fmpz_neg(res, res); - + _fmpz_vec_dot_general(res, initial, subtract, vec1, vec2, 1, len); fmpz_mod_set_fmpz(res, res, FMPZ_MOD_CTX(ctx)); - return GR_SUCCESS; } diff --git a/src/nmod_poly/power_sums.c b/src/nmod_poly/power_sums.c index af187d0e2d..78625f504d 100644 --- a/src/nmod_poly/power_sums.c +++ b/src/nmod_poly/power_sums.c @@ -78,6 +78,7 @@ nmod_poly_power_sums(nmod_poly_t res, const nmod_poly_t poly, slong n) } } +/* todo: should use dot products */ void _nmod_poly_power_sums_naive(mp_ptr res, mp_srcptr poly, slong len, slong n, nmod_t mod) diff --git a/src/nmod_poly/power_sums_to_poly.c b/src/nmod_poly/power_sums_to_poly.c index ab351b89b3..1281b55dec 100644 --- a/src/nmod_poly/power_sums_to_poly.c +++ b/src/nmod_poly/power_sums_to_poly.c @@ -56,6 +56,7 @@ nmod_poly_power_sums_to_poly(nmod_poly_t res, const nmod_poly_t Q) } } +/* todo: should use dot products */ void _nmod_poly_power_sums_to_poly_naive(mp_ptr res, mp_srcptr poly, slong len, nmod_t mod)