Document new utf8_to_uv function family

Perl · Oct 23, 2024 · c95a178 · c95a178
1 parent f34d969
commit c95a178
Show file tree

Hide file tree

Showing 4 changed files with 442 additions and 263 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -890,7 +890,7 @@ CTp	|Signal_t|csighandler1	|int sig
 CTp	|Signal_t|csighandler3	|int sig				\
 				|NULLOK Siginfo_t *info 		\
 				|NULLOK void *uap
-ATmp	|bool	|c9strict_utf8_to_uv					\
+ATdmp	|bool	|c9strict_utf8_to_uv					\
 				|NN const U8 * const s			\
 				|NN const U8 * const e			\
 				|NN UV *cp_p				\
@@ -1150,7 +1150,7 @@ AOdp	|SV *	|eval_pv	|NN const char *p			\
 				|I32 croak_on_error
 AOdp	|SSize_t|eval_sv	|NN SV *sv				\
 				|I32 flags
-ATmp	|bool	|extended_utf8_to_uv					\
+ATdmp	|bool	|extended_utf8_to_uv					\
 				|NN const U8 * const s			\
 				|NN const U8 * const e			\
 				|NN UV *cp_p				\
@@ -3051,7 +3051,7 @@ dopx	|PerlIO *|start_glob	|NN SV *tmpglob 			\
 				|NN IO *io
 Adp	|I32	|start_subparse |I32 is_format				\
 				|U32 flags
-ATmp	|bool	|strict_utf8_to_uv					\
+ATdmp	|bool	|strict_utf8_to_uv					\
 				|NN const U8 * const s			\
 				|NN const U8 * const e			\
 				|NN UV *cp_p				\
@@ -3687,7 +3687,7 @@ EMXp	|U8 *	|utf16_to_utf8_reversed 				\
 				|NN U8 *d				\
 				|Size_t bytelen 			\
 				|NN Size_t *newlen
-ATmp	|bool	|utf8_to_uv	|NN const U8 * const s			\
+ATdmp	|bool	|utf8_to_uv	|NN const U8 * const s			\
 				|NN const U8 * const e			\
 				|NN UV *cp_p				\
 				|NULLOK Size_t *advance_p
@@ -3697,20 +3697,20 @@ AMdip	|UV	|utf8_to_uvchr_buf					\
 				|NN const U8 *s 			\
 				|NN const U8 *send			\
 				|NULLOK STRLEN *retlen
-ATmp	|bool	|utf8_to_uv_errors					\
+ATdmp	|bool	|utf8_to_uv_errors					\
 				|NN const U8 * const s			\
 				|NN const U8 * const e			\
 				|NN UV *cp_p				\
 				|NULLOK Size_t *advance_p		\
 				|const U32 flags			\
 				|NULLOK U32 *errors
-ATmp	|bool	|utf8_to_uv_flags					\
+ATdmp	|bool	|utf8_to_uv_flags					\
 				|NN const U8 * const s			\
 				|NN const U8 * const e			\
 				|NN UV *cp_p				\
 				|NULLOK Size_t *advance_p		\
 				|const U32 flags
-ATip	|bool	|utf8_to_uv_msgs|NN const U8 * const s0 		\
+ATdip	|bool	|utf8_to_uv_msgs|NN const U8 * const s0 		\
 				|NN const U8 *e 			\
 				|NN UV *cp_p				\
 				|NULLOK Size_t *advance_p		\

diff --git a/inline.h b/inline.h
@@ -1231,17 +1231,6 @@ Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
     }
 }
 
-/*
-=for apidoc valid_utf8_to_uvchr
-Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
-known that the next character in the input UTF-8 string C<s> is well-formed
-(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>.  Surrogates, non-character code
-points, and non-Unicode code points are allowed.
-
-=cut
-
- */
-
 PERL_STATIC_INLINE UV
 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
 {
@@ -2053,7 +2042,7 @@ C<L</is_strict_utf8_string>> (and kin); and if C<flags> is
 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, they give the same results as
 C<L</is_c9strict_utf8_string>> (and kin).  Otherwise C<flags> may be any
 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
-C<L</utf8n_to_uvchr>>, with the same meanings.
+C<L</utf8_to_uv>>, with the same meanings.
 
 It's better to use one of the non-C<_flags> functions if they give you the
 desired strictness, as those have a better chance of being inlined by the C
@@ -2306,7 +2295,7 @@ as C<L</isSTRICT_UTF8_CHAR>>;
 and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
 the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
 Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
-understood by C<L</utf8n_to_uvchr>>, with the same meanings.
+understood by C<L</utf8_to_uv>>, with the same meanings.
 
 The three alternative macros are for the most commonly needed validations; they
 are likely to run somewhat faster than this more general one, as they can be
@@ -2855,7 +2844,7 @@ C<is_utf8_valid_partial_char_flags> when the latter is called with a zero
 C<flags> parameter.  This parameter is used to restrict the classes of code
 points that are considered to be valid.  When zero, Perl's extended UTF-8 is
 used.  Otherwise C<flags> can be any combination of the C<UTF8_DISALLOW_I<foo>>
-flags accepted by C<L</utf8n_to_uvchr>>.  If there is any sequence of bytes
+flags accepted by C<L</utf8_to_uv>>.  If there is any sequence of bytes
 that can complete the input partial character in such a way that a
 non-prohibited character is formed, the function returns TRUE; otherwise FALSE.
 Non-character code points cannot be determined based on partial character
@@ -2927,7 +2916,7 @@ complete code point, this will return TRUE anyway, provided that
 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
 
 C<flags> can be zero or any combination of the C<UTF8_DISALLOW_I<foo>> flags
-accepted by C<L</utf8n_to_uvchr>>, and with the same meanings.
+accepted by C<L</utf8_to_uv>>, and with the same meanings.
 
 The functions differ from C<L</is_utf8_string_flags>> only in that the latter
 returns FALSE if the final few bytes of the string don't form a complete code

diff --git a/mathoms.c b/mathoms.c
@@ -884,29 +884,6 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
     return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
 }
 
-/*
-=for apidoc_section $unicode
-=for apidoc utf8_to_uvchr
-
-Returns the native code point of the first character in the string C<s>
-which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
-length, in bytes, of that character.
-
-Some, but not all, UTF-8 malformations are detected, and in fact, some
-malformed input could cause reading beyond the end of the input buffer, which
-is why this function is deprecated.  Use L</utf8_to_uvchr_buf> instead.
-
-If C<s> points to one of the detected malformations, and UTF8 warnings are
-enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
-C<NULL>) to -1.  If those warnings are off, the computed value if well-defined (or
-the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
-is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
-next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
-
-=cut
-*/
-
 UV
 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 {