From 83c2748b31171c78b500b558efb351980951d92b Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 1 Oct 2024 16:48:28 -0600 Subject: [PATCH] bytes_from_utf8: Copy initial invariants as-is The paradigm used in this commit is already used in several other places in core. When dealing with UTF-8, it may well be that the first part of a string contains only characters whose representation is the same whether or not the string is encoded as UTF-8 (that is, UTF-8 invariants). There is a function that finds the first position in a string that is not such an invariant. It works on a whole word at a time instead of per-byte, effectively speeding things up by a factor of 8. In this case, calling that function tells us that we can use memcpy() to do the initial part of our task, before having to switch to looking at individual bytes. --- utf8.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/utf8.c b/utf8.c index 7877e6c76ecb..d3cd16e04353 100644 --- a/utf8.c +++ b/utf8.c @@ -2679,12 +2679,22 @@ Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** f } const U8 * const s0 = s; - const U8 * send = s + *lenp; + const U8 * const send = s + *lenp; + const U8 * first_variant; + + /* The initial portion of 's' that consists of invariants can be Copied + * as-is. If it is entirely invariant, the whole thing can be Copied. */ + if (is_utf8_invariant_string_loc(s, *lenp, &first_variant)) { + first_variant = send; + } U8 *d; Newx(d, (*lenp) + 1, U8); + Copy(s, d, first_variant - s, U8); U8 *converted_start = d; + d += first_variant - s; + s = first_variant; while (s < send) { U8 c = *s++;