grok_atoUV: allow non-C strings and document

This changes the internal function grok_atoUV() to not require its input to be NUL-terminated. That means the existing calls to it must be changed to set the ending position before calling it, as some did already. This function is recommended to use in a couple of pods, but it wasn't documented in perlintern. This commit does that as well.
Perl · Jun 25, 2018 · 5d4a52b · 5d4a52b
1 parent 6928bed
commit 5d4a52b
Show file tree

Hide file tree

Showing 10 changed files with 72 additions and 39 deletions.
diff --git a/ext/XS-APItest/APItest.pm b/ext/XS-APItest/APItest.pm
@@ -5,7 +5,7 @@ use strict;
 use warnings;
 use Carp;
 
-our $VERSION = '0.98';
+our $VERSION = '0.99';
 
 require XSLoader;
 

diff --git a/ext/XS-APItest/numeric.xs b/ext/XS-APItest/numeric.xs
@@ -40,7 +40,7 @@ grok_atoUV(number, endsv)
 	const char *pv = SvPV(number, len);
 	UV value = 0xdeadbeef;
 	bool result;
-	const char* endptr = NULL;
+	const char* endptr = pv + len;
     PPCODE:
 	EXTEND(SP,2);
 	if (endsv == &PL_sv_undef) {

diff --git a/mg.c b/mg.c
@@ -3170,7 +3170,7 @@ Perl_magic_set(pTHX_ SV *sv, MAGIC *mg)
 	{
 	    const char *p = SvPV_const(sv, len);
             Groups_t *gary = NULL;
-            const char* endptr;
+            const char* endptr = p + len;
             UV uv;
 #ifdef _SC_NGROUPS_MAX
            int maxgrp = sysconf(_SC_NGROUPS_MAX);

diff --git a/numeric.c b/numeric.c
@@ -1049,31 +1049,39 @@ Perl_grok_number_flags(pTHX_ const char *pv, STRLEN len, UV *valuep, U32 flags)
 }
 
 /*
-grok_atoUV
+=for apidoc grok_atoUV
 
-grok_atoUV parses a C-style zero-byte terminated string, looking for
-a decimal unsigned integer.
+parse a string, looking for a decimal unsigned integer.
 
-Returns the unsigned integer, if a valid value can be parsed
-from the beginning of the string.
+On entry, C<pv> points to the beginning of the string;
+C<valptr> points to a UV that will receive the converted value, if found;
+C<endptr> is either NULL or points to a variable that points to one byte
+beyond the point in C<pv> that this routine should examine.
+If C<endptr> is NULL, C<pv> is assumed to be NUL-terminated.
 
-Accepts only the decimal digits '0'..'9'.
+Returns FALSE if C<pv> doesn't represent a valid unsigned integer value (with
+no leading zeros).  Otherwise it returns TRUE, and sets C<*valptr> to that
+value.
 
-As opposed to atoi or strtol, grok_atoUV does NOT allow optional
-leading whitespace, or negative inputs.  If such features are
-required, the calling code needs to explicitly implement those.
+If you constrain the portion of C<pv> that is looked at by this function (by
+passing a non-NULL C<endptr>), and if the intial bytes of that portion form a
+valid value, it will return TRUE, setting C<*endptr> to the byte following the
+final digit of the value.  But if there is no constraint at what's looked at,
+all of C<pv> must be valid in order for TRUE to be returned.
 
-Returns true if a valid value could be parsed. In that case, valptr
-is set to the parsed value, and endptr (if provided) is set to point
-to the character after the last digit.
+The only characters this accepts are the decimal digits '0'..'9'.
 
-Returns false otherwise. This can happen if a) there is a leading zero
-followed by another digit; b) the digits would overflow a UV; or c)
-there are trailing non-digits AND endptr is not provided.
+As opposed to L<atoi(3)> or L<strtol(3)>, C<grok_atoUV> does NOT allow optional
+leading whitespace, nor negative inputs.  If such features are required, the
+calling code needs to explicitly implement those.
 
-Background: atoi has severe problems with illegal inputs, it cannot be
+Note that this function returns FALSE for inputs that would overflow a UV,
+or have leading zeros.  Thus a single C<0> is accepted, but not C<00> nor
+C<01>, C<002>, I<etc>.
+
+Background: C<atoi> has severe problems with illegal inputs, it cannot be
 used for incremental parsing, and therefore should be avoided
-atoi and strtol are also affected by locale settings, which can also be
+C<atoi> and C<strtol> are also affected by locale settings, which can also be
 seen as a bug (global state controlled by user environment).
 
 */
@@ -1088,15 +1096,27 @@ Perl_grok_atoUV(const char *pv, UV *valptr, const char** endptr)
 
     PERL_ARGS_ASSERT_GROK_ATOUV;
 
-    eptr = endptr ? endptr : &end2;
-    if (isDIGIT(*s)) {
+    if (endptr) {
+        eptr = endptr;
+    }
+    else {
+        end2 = s + strlen(s);
+        eptr = &end2;
+    }
+
+    if (   *eptr <= s
+        || ! isDIGIT(*s))
+    {
+        return FALSE;
+    }
+
         /* Single-digit inputs are quite common. */
         val = *s++ - '0';
-        if (isDIGIT(*s)) {
+        if (s < *eptr && isDIGIT(*s)) {
             /* Fail on extra leading zeros. */
             if (val == 0)
                 return FALSE;
-            while (isDIGIT(*s)) {
+            while (s < *eptr && isDIGIT(*s)) {
                 /* This could be unrolled like in grok_number(), but
                  * the expected uses of this are not speed-needy, and
                  * unlikely to need full 64-bitness. */
@@ -1109,12 +1129,14 @@ Perl_grok_atoUV(const char *pv, UV *valptr, const char** endptr)
                 }
             }
         }
+    if (endptr == NULL) {
+        if (*s) {
+            return FALSE; /* If endptr is NULL, no trailing non-digits allowed. */
+        }
+    }
+    else {
+        *endptr = s;
     }
-    if (s == pv)
-        return FALSE;
-    if (endptr == NULL && *s)
-        return FALSE; /* If endptr is NULL, no trailing non-digits allowed. */
-    *eptr = s;
     *valptr = val;
     return TRUE;
 }

diff --git a/perl.c b/perl.c
@@ -3351,7 +3351,7 @@ Perl_get_debug_opts(pTHX_ const char **s, bool givehelp)
 	}
     }
     else if (isDIGIT(**s)) {
-        const char* e;
+        const char* e = *s + strlen(*s);
 	if (grok_atoUV(*s, &uv, &e))
             *s = e;
 	for (; isWORDCHAR(**s); (*s)++) ;
@@ -3946,6 +3946,7 @@ S_open_script(pTHX_ const char *scriptname, bool dosearch, bool *suidscript)
         UV uv;
 	/* if find_script() returns, it returns a malloc()-ed value */
 	scriptname = PL_origfilename = find_script(scriptname, dosearch, NULL, 1);
+        s = scriptname + strlen(scriptname);
 
 	if (strBEGINs(scriptname, "/dev/fd/")
             && isDIGIT(scriptname[8])

diff --git a/pod/perlclib.pod b/pod/perlclib.pod
@@ -211,7 +211,8 @@ C<toUPPER_uni>, as described in L<perlapi/Character case changing>.)
 
 Typical use is to do range checks on C<uv> before casting:
 
-  int i; UV uv; char* end_ptr;
+  int i; UV uv;
+  char* end_ptr = input_end;
   if (grok_atoUV(input, &uv, &end_ptr)
       && uv <= INT_MAX)
     i = (int)uv;

diff --git a/regcomp.c b/regcomp.c
@@ -11247,6 +11247,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         RExC_parse++;
                         is_neg = TRUE;
                     }
+                    endptr = RExC_end;
                     if (grok_atoUV(RExC_parse, &unum, &endptr)
                         && unum <= I32_MAX
                     ) {
@@ -11485,6 +11486,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     }
                     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
                         UV uv;
+                        endptr = RExC_end;
                         if (grok_atoUV(RExC_parse, &uv, &endptr)
                             && uv <= I32_MAX
                         ) {
@@ -11520,6 +11522,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     /* (?(1)...) */
 		    char c;
                     UV uv;
+                    endptr = RExC_end;
                     if (grok_atoUV(RExC_parse, &uv, &endptr)
                         && uv <= I32_MAX
                     ) {
@@ -12029,6 +12032,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 		maxpos = next;
 	    RExC_parse++;
             if (isDIGIT(*RExC_parse)) {
+                endptr = RExC_end;
                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
                     vFAIL("Invalid quantifier in {,}");
                 if (uv >= REG_INFTY)
@@ -12042,6 +12046,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 	    else
 		maxpos = RExC_parse;
             if (isDIGIT(*maxpos)) {
+                endptr = RExC_end;
                 if (!grok_atoUV(maxpos, &uv, &endptr))
                     vFAIL("Invalid quantifier in {,}");
                 if (uv >= REG_INFTY)
@@ -12799,9 +12804,9 @@ S_new_regcurly(const char *s, const char *e)
  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
 
 static I32
-S_backref_value(char *p)
+S_backref_value(char *p, char *e)
 {
-    const char* endptr;
+    const char* endptr = e;
     UV val;
     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
         return (I32)val;
@@ -13347,7 +13352,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     if (RExC_parse >= RExC_end) {
                         goto unterminated_g;
                     }
-                    num = S_backref_value(RExC_parse);
+                    num = S_backref_value(RExC_parse, RExC_end);
                     if (num == 0)
                         vFAIL("Reference to invalid group 0");
                     else if (num == I32_MAX) {
@@ -13365,7 +13370,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     }
                 }
                 else {
-                    num = S_backref_value(RExC_parse);
+                    num = S_backref_value(RExC_parse, RExC_end);
                     /* bare \NNN might be backref or octal - if it is larger
                      * than or equal RExC_npar then it is assumed to be an
                      * octal escape. Note RExC_npar is +1 from the actual
@@ -13742,7 +13747,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         /* NOTE, RExC_npar is 1 more than the actual number of
                          * parens we have seen so far, hence the < RExC_npar below. */
 
-                        if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
+                        if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
                         {  /* Not to be treated as an octal constant, go
                                    find backref */
                             --p;

diff --git a/t/porting/known_pod_issues.dat b/t/porting/known_pod_issues.dat
@@ -23,6 +23,7 @@ Apache::SmallProf
 Archive::Extract
 Array::Base
 atan2(3)
+atoi(3)
 Attribute::Constant
 autobox
 B::Generate
@@ -283,6 +284,7 @@ strftime(3)
 strictures
 String::Base
 String::Scanf
+strtol(3)
 Switch
 tar(1)
 Template::Declare

diff --git a/utf8.c b/utf8.c
@@ -5201,6 +5201,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
             /* Get the 0th element, which is needed to setup the inversion list
              * */
             while (isSPACE(*l)) l++;
+            after_atou = (char *) lend;
             if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
                 Perl_croak(aTHX_ "panic: Expecting a valid 0th element for"
                                  " inversion list");
@@ -5217,6 +5218,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
                                      " elements than available", elements);
                 }
                 while (isSPACE(*l)) l++;
+                after_atou = (char *) lend;
                 if (!grok_atoUV((const char *)l, other_elements_ptr++,
                                  &after_atou))
                 {

diff --git a/util.c b/util.c
@@ -4320,7 +4320,7 @@ Perl_parse_unicode_opts(pTHX_ const char **popt)
 
   if (*p) {
        if (isDIGIT(*p)) {
-            const char* endptr;
+            const char* endptr = p + strlen(p);
             UV uv;
             if (grok_atoUV(p, &uv, &endptr) && uv <= U32_MAX) {
                 opt = (U32)uv;
@@ -4707,7 +4707,7 @@ S_mem_log_common(enum mem_log_type mlt, const UV n,
 	 * timeval. */
 	{
 	    STRLEN len;
-            const char* endptr;
+            const char* endptr = pmlenv + stren(pmlenv);
 	    int fd;
             UV uv;
             if (grok_atoUV(pmlenv, &uv, &endptr) /* Ignore endptr. */
@@ -5989,7 +5989,7 @@ static const char* atos_parse(const char* p,
      * The matched regular expression is roughly "\(.*:\d+\)\s*$" */
     const char* source_number_start;
     const char* source_name_end;
-    const char* source_line_end;
+    const char* source_line_end = start;
     const char* close_paren;
     UV uv;