From 534434065472ce3ef47847d8d84d0ce1d28963e2 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sat, 26 Oct 2024 06:30:28 +0000 Subject: [PATCH] utf8_count --- Objects/unicodeobject.c | 70 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 4 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 9995ff8dc7c79e4..44f051028c21d0e 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4978,12 +4978,17 @@ PyUnicode_DecodeUTF8(const char *s, #include "stringlib/codecs.h" #include "stringlib/undef.h" +#if (SIZEOF_SIZE_T == 8) /* Mask to quickly check whether a C 'size_t' contains a non-ASCII, UTF8-encoded char. */ -#if (SIZEOF_SIZE_T == 8) # define ASCII_CHAR_MASK 0x8080808080808080ULL +// used to count codepoints in UTF-8 string. +# define VECTOR_0101 0x0101010101010101ULL +# define VECTOR_00FF 0x00ff00ff00ff00ffULL #elif (SIZEOF_SIZE_T == 4) # define ASCII_CHAR_MASK 0x80808080U +# define VECTOR_0101 0x01010101U +# define VECTOR_00FF 0x00ff00ffU #else # error C 'size_t' size should be either 4 or 8! #endif @@ -5087,6 +5092,61 @@ find_first_nonascii(const char *start, const char *end) } +#if SIZEOF_SIZE_T == 4 + const size_t vector_01 = 0x01010101; + const size_t vector_00ff = 0x00ff00ff; +#else + const size_t vector_01 = 0x0101010101010101; + const size_t vector_00ff = 0x00ff00ff00ff00ff; +#endif + +static inline int scalar_utf8_start_char(unsigned int ch) +{ + // 0xxxxxxx or 11xxxxxx are first byte. + return (~ch >> 7 | ch >> 6) & 1; +} + +static inline size_t vector_utf8_start_chars(size_t v) +{ + return ((~v>>7) | (v>>6)) & VECTOR_0101; +} + +static Py_ssize_t utf8_count(const unsigned char *s, Py_ssize_t size) +{ + Py_ssize_t len = 0; + const unsigned char *end = s+size; + + if (end - s > SIZEOF_SIZE_T*2) { + while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { + len += scalar_utf8_start_char(*s++); + } + + while (s + SIZEOF_SIZE_T <= end) { + const unsigned char *e = end; + if (e - s > SIZEOF_SIZE_T * 255) { + e = s + SIZEOF_SIZE_T * 255; + } + Py_ssize_t vstart = 0; + while (s + SIZEOF_SIZE_T <= e) { + size_t v = *(size_t*)s; + size_t vs = vector_utf8_start_chars(v); + vstart += vs; + s += SIZEOF_SIZE_T; + } + vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF); + vstart += vstart >> 16; +#if SIZEOF_SIZE_T == 8 + vstart += vstart >> 32; +#endif + len += vstart & 0x7ff; + } + } + while (s < end) { + len += scalar_utf8_start_char(*s++); + } + return len; +} + static int unicode_decode_utf8_impl(_PyUnicodeWriter *writer, const char *starts, const char *s, const char *end, @@ -5234,8 +5294,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, const char *end = s + size; Py_ssize_t pos = find_first_nonascii(starts, end); - if (pos == size) { - // fast path: ASCII + if (pos == size) { // fast path: ASCII string. PyObject *u = PyUnicode_New(size, 127); if (u == NULL) { return NULL; @@ -5248,8 +5307,11 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, } int maxchr = 127; + Py_ssize_t maxsize = size; + unsigned char ch = (unsigned char)s[pos]; if (error_handler == _Py_ERROR_STRICT && ch >= 0xc2) { + maxsize = utf8_count((const unsigned char *)s, size); if (ch < 0xc4) { // latin1 maxchr = 255; } @@ -5260,7 +5322,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, maxchr = 0x10ffff; } } - PyObject *u = PyUnicode_New(size, maxchr); + PyObject *u = PyUnicode_New(maxsize, maxchr); if (!u) { return NULL; }