Add new API function utf8_to_uv()

This is designed to replace utf8_to_uvchr(), which is problematic. Its behavior varies depending on whether <utf8> warnings are enabled or not, and no code in core actually takes that into account. If warnings are enabled: A zero return can mean either success or failure. Hence a zero return must be disambiguated. Success would come from the next character being a NUL. If failure, <retlen> will be -1, so can't be used to find where to start parsing again. If disabled: Both the return and <retlen> will be usable values, but the return of the REPLACEMENT CHARACTER is ambiguous. It could mean failure, or it could mean that that was the next character in the input and was successfully decoded. utf8_to_uv() solves these problems. This commit includes a few changes to use it, to show it works. I have WIP that changes the rest of core to use it. I found that it makes coding simpler. The new function returns true upon success; false on failure. And it is passed pointers to return the computed code point and byte length into. These values always contain the correct information, regardless of whether the input is malformed or not. It is easy to test for failure in a conditional and then to take appropriate action. However, most often it seems the appropriate action is to use, going forward, the REPLACEMENT CHARACTER returned in failure cases. And if you don't care particularly whether it succeeds or not, you just use it without testing the result. This happens when you are confident that the input is well-formed, or, say, in converting a string for display. There is another function, utf8_to_uv_flags(), which merely extends this API for more flexible use, and doesn't offer the advantages over the existing API function that does the same thing. I included it because the main function is just a small wrapper around it, and the API is similar and some may prefer it.
Perl · Aug 25, 2024 · b3321c0 · b3321c0
1 parent 34d9693
commit b3321c0
Show file tree

Hide file tree

Showing 5 changed files with 158 additions and 13 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -3676,6 +3676,10 @@ EMXp	|U8 *	|utf16_to_utf8_reversed 				\
 				|NN U8 *d				\
 				|Size_t bytelen 			\
 				|NN Size_t *newlen
+Adip	|bool	|utf8_to_uv	|NN const U8 *s 			\
+				|NN const U8 *send			\
+				|NN UV *cp				\
+				|NN STRLEN *advance
 ADbdp	|UV	|utf8_to_uvchr	|NN const U8 *s 			\
 				|NULLOK STRLEN *retlen
 AMdp	|UV	|utf8_to_uvchr_buf					\
@@ -3686,6 +3690,12 @@ Cip	|UV	|utf8_to_uvchr_buf_helper				\
 				|NN const U8 *s 			\
 				|NN const U8 *send			\
 				|NULLOK STRLEN *retlen
+Adip	|I32	|utf8_to_uv_flags					\
+				|NN const U8 *s 			\
+				|NN const U8 *send			\
+				|NN UV *cp				\
+				|NN STRLEN *advance			\
+				|U32 flags
 CDbdp	|UV	|utf8_to_uvuni	|NN const U8 *s 			\
 				|NULLOK STRLEN *retlen
 : Used in perly.y

diff --git a/embed.h b/embed.h
@@ -788,6 +788,8 @@
 # define utf8_hop_safe                          Perl_utf8_hop_safe
 # define utf8_length(a,b)                       Perl_utf8_length(aTHX_ a,b)
 # define utf8_to_bytes(a,b)                     Perl_utf8_to_bytes(aTHX_ a,b)
+# define utf8_to_uv(a,b,c,d)                    Perl_utf8_to_uv(aTHX_ a,b,c,d)
+# define utf8_to_uv_flags(a,b,c,d,e)            Perl_utf8_to_uv_flags(aTHX_ a,b,c,d,e)
 # define utf8_to_uvchr_buf_helper(a,b,c)        Perl_utf8_to_uvchr_buf_helper(aTHX_ a,b,c)
 # define utf8n_to_uvchr_msgs                    Perl_utf8n_to_uvchr_msgs
 # define uvoffuni_to_utf8_flags_msgs(a,b,c,d)   Perl_uvoffuni_to_utf8_flags_msgs(aTHX_ a,b,c,d)

diff --git a/inline.h b/inline.h
@@ -3090,6 +3090,94 @@ Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
     }
 }
 
+/*
+=for apidoc utf8_to_uv
+
+Convert the first character in the UTF-8-encoded string starting at C<s> to the
+native code point it represents, storing its value into C<*cp>, and the number
+of bytes of C<s> it occupies into C<*advance>.
+
+Only the bytes from C<s> to S<C<send -1>> are examined.  B<It is an error to call
+this function without> S<C<send E<gt> s>>.
+
+If the bytes starting at C<s> do not represent at least one well-formed UTF-8
+character, this function returns C<false> and sets C<*cp> to the Unicode
+C<REPLACEMENT CHARACTER>.
+
+Otherwise it returns C<true>.
+
+With both success and failure, the value of C<*advance> is the number of bytes
+to add to C<s> to look for the next code point in it.  You might be tempted to
+use C<UTF8_SKIP> instead for this purpose, but B<don't>; it requires extra
+work, and is likely wrong when this function returns C<false>.
+
+Starting in Perl v5.42, this is the preferred function to call to convert from
+UTF-8 for code that doesn't want to get into the weeds of possible
+malformations.
+
+=cut
+*/
+
+PERL_STATIC_INLINE  bool
+Perl_utf8_to_uv(pTHX_ const U8 *s, const U8 *send, UV *cp, STRLEN *advance)
+{
+    PERL_ARGS_ASSERT_UTF8_TO_UV;
+    /* Thin wrapper: no extra flags are passed; a negative result is failure */
+    return utf8_to_uv_flags(s, send, cp, advance, 0) >= 0;
+}
+
+/*
+=for apidoc utf8_to_uv_flags
+
+Convert the first character in the UTF-8-encoded string starting at C<s> to the
+native code point it represents, storing its value into C<*cp>, and the number
+of bytes of C<s> it occupies into C<*advance>.
+
+Only the bytes from C<s> to S<C<send -1>> are examined.  B<It is an error to call
+this function without> S<C<send E<gt> s>>.
+
+If the bytes starting at C<s> do not represent at least one well-formed UTF-8
+character, this function returns a negative value and sets C<*cp> to the
+Unicode C<REPLACEMENT CHARACTER>.
+
+Otherwise it returns a value greater than or equal to zero.
+
+With both success and failure, the value of C<*advance> is the number of bytes
+to add to C<s> to look for the next code point in it.  You might be tempted to
+use C<UTF8_SKIP> instead for this purpose, but B<don't>; it requires extra
+work, and is likely wrong when this function returns a negative value.
+
+You can pass in the C<flags> parameter any of the C<DISALLOW>-type flags
+accepted by C<L</utf8n_to_uvchr>> to restrict the types of code points accepted
+by this function.
+
+A negative return from this function indicates an error.  If you take its
+negative (turning it into a positive), the value will be the same as that
+returned in the C<*errors> parameter to C<L</utf8n_to_uvchr_error>>, so you
+can determine the exact malformations.
+
+This function is an alternative to that function, but with an API more
+consistent with that of C<L</utf8_to_uv>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE  I32
+Perl_utf8_to_uv_flags(pTHX_ const U8 *s, const U8 *send, UV *cp,
+                            STRLEN *advance, U32 flags)
+{
+    PERL_ARGS_ASSERT_UTF8_TO_UV_FLAGS;
+    assert(s < send);   /* The pod requires at least one byte to examine */
+
+    U32 errors;         /* Filled in by utf8n_to_uvchr_error() below */
+
+    /* Same logic as in utf8_to_uvchr_buf_helper for why we allow EMPTY here */
+    *cp = utf8n_to_uvchr_error(s, send - s, advance,
+                               (flags | (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY)),
+                               &errors);
+    return -errors;     /* Negated: the pod documents failure as negative */
+}
+
 /* ------------------------------- perl.h ----------------------------- */
 
 /*

diff --git a/proto.h b/proto.h
diff --git a/utf8.c b/utf8.c
@@ -1030,8 +1030,7 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
 =for apidoc utf8n_to_uvchr
 
 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
-Most code should use L</utf8_to_uvchr_buf>() rather than call this
-directly.
+Most code should use L</utf8_to_uv>() rather than call this directly.
 
 Bottom level UTF-8 decode routine.
 Returns the native code point value of the first character in the string C<s>,
@@ -1160,8 +1159,7 @@ Perl_utf8n_to_uvchr(const U8 *s,
 =for apidoc utf8n_to_uvchr_error
 
 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
-Most code should use L</utf8_to_uvchr_buf>() rather than call this
-directly.
+Most code should use L</utf8_to_uv>() rather than call this directly.
 
 This function is for code that needs to know what the precise malformation(s)
 are when an error is found.  If you also need to know the generated warning
@@ -1331,8 +1329,7 @@ Perl_utf8n_to_uvchr_error(const U8 *s,
 =for apidoc utf8n_to_uvchr_msgs
 
 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
-Most code should use L</utf8_to_uvchr_buf>() rather than call this
-directly.
+Most code should use L</utf8_to_uv>() rather than call this directly.
 
 This function is for code that needs to know what the precise malformation(s)
 are when an error is found, and wants the corresponding warning and/or error
@@ -2107,6 +2104,8 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
 /*
 =for apidoc utf8_to_uvchr_buf
 
+It is better to use C<L</utf8_to_uv>> instead of this function.
+
 Returns the native code point of the first character in the string C<s> which
 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
 C<*retlen> will be set to the length, in bytes, of that character.
@@ -2120,13 +2119,45 @@ the next possible position in C<s> that could begin a non-malformed character.
 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
 returned.
 
+This function is problematic.  The behavior varies depending on if C<utf8>
+warnings are enabled or not.
+
+=over
+
+=item If enabled:
+
+=over
+
+=item A zero return can mean either success or failure
+
+Hence a zero return must be disambiguated.  Success would come from the next
+character being a NUL.
+
+=item If failure, C<retlen> will be -1, so can't be used to find where to start
+parsing again.
+
+=back
+
+=item If disabled:
+
+Both the return and C<retlen> will be usable values, but the return of the
+S<C<REPLACEMENT CHARACTER>> is ambiguous.  It could mean failure, or it could
+mean that that was the next character in the input and was successfully
+decoded.
+
+=back
+
+In both cases, there are problems, and in practice no one has bothered to
+notice if warnings are enabled or not.
+
+C<L</utf8_to_uv>> was invented to solve these problems.
+
 =cut
 
 Also implemented as a macro in utf8.h
 
 */
 
-
 UV
 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
 {
@@ -3718,13 +3749,14 @@ S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
             /* For the dot above to modify the 'I', it must be part of a
              * combining sequence immediately following the 'I', and no other
              * modifier with a ccc of 230 may intervene */
-            cp = utf8_to_uvchr_buf(p, e, NULL);
+            STRLEN advance;
+            utf8_to_uv(p, e, &cp, &advance);
             if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
                 break;
             }
 
             /* Here the combining sequence continues */
-            p += UTF8SKIP(p);
+            p += advance;
         }
     }
 
@@ -4116,7 +4148,9 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
     PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
 
     while (s < e) {
-        if (UTF8SKIP(s) > len) {
+        STRLEN advance = UTF8SKIP(s);
+
+        if (advance > len) {
             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
                            "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
             return FALSE;
@@ -4138,10 +4172,11 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
                     /* This has a different warning than the one the called
                      * function would output, so can't just call it, unlike we
                      * do for the non-chars and above-unicodes */
-                    UV uv = utf8_to_uvchr_buf(s, e, NULL);
+                    UV uv;
+                    utf8_to_uv(s, e, &uv, &advance);
                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
                         "Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
-                                             uv);
+                        uv);
                     ok = FALSE;
                 }
             }
@@ -4153,7 +4188,7 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
                 ok = FALSE;
             }
         }
-        s += UTF8SKIP(s);
+        s += advance;
     }
 
     return ok;