Skip to content

Commit

Permalink
grok_atoUV: allow non-C strings and document
Browse files Browse the repository at this point in the history
This changes the internal function grok_atoUV() to not require its input
to be NUL-terminated.  That means the existing calls to it must be
changed to set the ending position before calling it, as some did
already.

This function is recommended to use in a couple of pods, but it wasn't
documented in perlintern.  This commit does that as well.
  • Loading branch information
khwilliamson committed Jun 25, 2018
1 parent 6928bed commit 5d4a52b
Show file tree
Hide file tree
Showing 10 changed files with 72 additions and 39 deletions.
2 changes: 1 addition & 1 deletion ext/XS-APItest/APItest.pm
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use strict;
use warnings;
use Carp;

our $VERSION = '0.98';
our $VERSION = '0.99';

require XSLoader;

Expand Down
2 changes: 1 addition & 1 deletion ext/XS-APItest/numeric.xs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ grok_atoUV(number, endsv)
const char *pv = SvPV(number, len);
UV value = 0xdeadbeef;
bool result;
const char* endptr = NULL;
const char* endptr = pv + len;
PPCODE:
EXTEND(SP,2);
if (endsv == &PL_sv_undef) {
Expand Down
2 changes: 1 addition & 1 deletion mg.c
Original file line number Diff line number Diff line change
Expand Up @@ -3170,7 +3170,7 @@ Perl_magic_set(pTHX_ SV *sv, MAGIC *mg)
{
const char *p = SvPV_const(sv, len);
Groups_t *gary = NULL;
const char* endptr;
const char* endptr = p + len;
UV uv;
#ifdef _SC_NGROUPS_MAX
int maxgrp = sysconf(_SC_NGROUPS_MAX);
Expand Down
74 changes: 48 additions & 26 deletions numeric.c
Original file line number Diff line number Diff line change
Expand Up @@ -1049,31 +1049,39 @@ Perl_grok_number_flags(pTHX_ const char *pv, STRLEN len, UV *valuep, U32 flags)
}

/*
grok_atoUV
=for apidoc grok_atoUV
grok_atoUV parses a C-style zero-byte terminated string, looking for
a decimal unsigned integer.
parse a string, looking for a decimal unsigned integer.
Returns the unsigned integer, if a valid value can be parsed
from the beginning of the string.
On entry, C<pv> points to the beginning of the string;
C<valptr> points to a UV that will receive the converted value, if found;
C<endptr> is either NULL or points to a variable that points to one byte
beyond the point in C<pv> that this routine should examine.
If C<endptr> is NULL, C<pv> is assumed to be NUL-terminated.
Accepts only the decimal digits '0'..'9'.
Returns FALSE if C<pv> doesn't represent a valid unsigned integer value (with
no leading zeros). Otherwise it returns TRUE, and sets C<*valptr> to that
value.
As opposed to atoi or strtol, grok_atoUV does NOT allow optional
leading whitespace, or negative inputs. If such features are
required, the calling code needs to explicitly implement those.
If you constrain the portion of C<pv> that is looked at by this function (by
passing a non-NULL C<endptr>), and if the intial bytes of that portion form a
valid value, it will return TRUE, setting C<*endptr> to the byte following the
final digit of the value. But if there is no constraint at what's looked at,
all of C<pv> must be valid in order for TRUE to be returned.
Returns true if a valid value could be parsed. In that case, valptr
is set to the parsed value, and endptr (if provided) is set to point
to the character after the last digit.
The only characters this accepts are the decimal digits '0'..'9'.
Returns false otherwise. This can happen if a) there is a leading zero
followed by another digit; b) the digits would overflow a UV; or c)
there are trailing non-digits AND endptr is not provided.
As opposed to L<atoi(3)> or L<strtol(3)>, C<grok_atoUV> does NOT allow optional
leading whitespace, nor negative inputs. If such features are required, the
calling code needs to explicitly implement those.
Background: atoi has severe problems with illegal inputs, it cannot be
Note that this function returns FALSE for inputs that would overflow a UV,
or have leading zeros. Thus a single C<0> is accepted, but not C<00> nor
C<01>, C<002>, I<etc>.
Background: C<atoi> has severe problems with illegal inputs, it cannot be
used for incremental parsing, and therefore should be avoided
atoi and strtol are also affected by locale settings, which can also be
C<atoi> and C<strtol> are also affected by locale settings, which can also be
seen as a bug (global state controlled by user environment).
*/
Expand All @@ -1088,15 +1096,27 @@ Perl_grok_atoUV(const char *pv, UV *valptr, const char** endptr)

PERL_ARGS_ASSERT_GROK_ATOUV;

eptr = endptr ? endptr : &end2;
if (isDIGIT(*s)) {
if (endptr) {
eptr = endptr;
}
else {
end2 = s + strlen(s);
eptr = &end2;
}

if ( *eptr <= s
|| ! isDIGIT(*s))
{
return FALSE;
}

/* Single-digit inputs are quite common. */
val = *s++ - '0';
if (isDIGIT(*s)) {
if (s < *eptr && isDIGIT(*s)) {
/* Fail on extra leading zeros. */
if (val == 0)
return FALSE;
while (isDIGIT(*s)) {
while (s < *eptr && isDIGIT(*s)) {
/* This could be unrolled like in grok_number(), but
* the expected uses of this are not speed-needy, and
* unlikely to need full 64-bitness. */
Expand All @@ -1109,12 +1129,14 @@ Perl_grok_atoUV(const char *pv, UV *valptr, const char** endptr)
}
}
}
if (endptr == NULL) {
if (*s) {
return FALSE; /* If endptr is NULL, no trailing non-digits allowed. */
}
}
else {
*endptr = s;
}
if (s == pv)
return FALSE;
if (endptr == NULL && *s)
return FALSE; /* If endptr is NULL, no trailing non-digits allowed. */
*eptr = s;
*valptr = val;
return TRUE;
}
Expand Down
3 changes: 2 additions & 1 deletion perl.c
Original file line number Diff line number Diff line change
Expand Up @@ -3351,7 +3351,7 @@ Perl_get_debug_opts(pTHX_ const char **s, bool givehelp)
}
}
else if (isDIGIT(**s)) {
const char* e;
const char* e = *s + strlen(*s);
if (grok_atoUV(*s, &uv, &e))
*s = e;
for (; isWORDCHAR(**s); (*s)++) ;
Expand Down Expand Up @@ -3946,6 +3946,7 @@ S_open_script(pTHX_ const char *scriptname, bool dosearch, bool *suidscript)
UV uv;
/* if find_script() returns, it returns a malloc()-ed value */
scriptname = PL_origfilename = find_script(scriptname, dosearch, NULL, 1);
s = scriptname + strlen(scriptname);

if (strBEGINs(scriptname, "/dev/fd/")
&& isDIGIT(scriptname[8])
Expand Down
3 changes: 2 additions & 1 deletion pod/perlclib.pod
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,8 @@ C<toUPPER_uni>, as described in L<perlapi/Character case changing>.)

Typical use is to do range checks on C<uv> before casting:

int i; UV uv; char* end_ptr;
int i; UV uv;
char* end_ptr = input_end;
if (grok_atoUV(input, &uv, &end_ptr)
&& uv <= INT_MAX)
i = (int)uv;
Expand Down
15 changes: 10 additions & 5 deletions regcomp.c
Original file line number Diff line number Diff line change
Expand Up @@ -11247,6 +11247,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
RExC_parse++;
is_neg = TRUE;
}
endptr = RExC_end;
if (grok_atoUV(RExC_parse, &unum, &endptr)
&& unum <= I32_MAX
) {
Expand Down Expand Up @@ -11485,6 +11486,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
}
else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
UV uv;
endptr = RExC_end;
if (grok_atoUV(RExC_parse, &uv, &endptr)
&& uv <= I32_MAX
) {
Expand Down Expand Up @@ -11520,6 +11522,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
/* (?(1)...) */
char c;
UV uv;
endptr = RExC_end;
if (grok_atoUV(RExC_parse, &uv, &endptr)
&& uv <= I32_MAX
) {
Expand Down Expand Up @@ -12029,6 +12032,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
maxpos = next;
RExC_parse++;
if (isDIGIT(*RExC_parse)) {
endptr = RExC_end;
if (!grok_atoUV(RExC_parse, &uv, &endptr))
vFAIL("Invalid quantifier in {,}");
if (uv >= REG_INFTY)
Expand All @@ -12042,6 +12046,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
else
maxpos = RExC_parse;
if (isDIGIT(*maxpos)) {
endptr = RExC_end;
if (!grok_atoUV(maxpos, &uv, &endptr))
vFAIL("Invalid quantifier in {,}");
if (uv >= REG_INFTY)
Expand Down Expand Up @@ -12799,9 +12804,9 @@ S_new_regcurly(const char *s, const char *e)
* in which case return I32_MAX (rather than possibly 32-bit wrapping) */

static I32
S_backref_value(char *p)
S_backref_value(char *p, char *e)
{
const char* endptr;
const char* endptr = e;
UV val;
if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
return (I32)val;
Expand Down Expand Up @@ -13347,7 +13352,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
if (RExC_parse >= RExC_end) {
goto unterminated_g;
}
num = S_backref_value(RExC_parse);
num = S_backref_value(RExC_parse, RExC_end);
if (num == 0)
vFAIL("Reference to invalid group 0");
else if (num == I32_MAX) {
Expand All @@ -13365,7 +13370,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
}
}
else {
num = S_backref_value(RExC_parse);
num = S_backref_value(RExC_parse, RExC_end);
/* bare \NNN might be backref or octal - if it is larger
* than or equal RExC_npar then it is assumed to be an
* octal escape. Note RExC_npar is +1 from the actual
Expand Down Expand Up @@ -13742,7 +13747,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
/* NOTE, RExC_npar is 1 more than the actual number of
* parens we have seen so far, hence the < RExC_npar below. */

if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
{ /* Not to be treated as an octal constant, go
find backref */
--p;
Expand Down
2 changes: 2 additions & 0 deletions t/porting/known_pod_issues.dat
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Apache::SmallProf
Archive::Extract
Array::Base
atan2(3)
atoi(3)
Attribute::Constant
autobox
B::Generate
Expand Down Expand Up @@ -283,6 +284,7 @@ strftime(3)
strictures
String::Base
String::Scanf
strtol(3)
Switch
tar(1)
Template::Declare
Expand Down
2 changes: 2 additions & 0 deletions utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -5201,6 +5201,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
/* Get the 0th element, which is needed to setup the inversion list
* */
while (isSPACE(*l)) l++;
after_atou = (char *) lend;
if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
Perl_croak(aTHX_ "panic: Expecting a valid 0th element for"
" inversion list");
Expand All @@ -5217,6 +5218,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
" elements than available", elements);
}
while (isSPACE(*l)) l++;
after_atou = (char *) lend;
if (!grok_atoUV((const char *)l, other_elements_ptr++,
&after_atou))
{
Expand Down
6 changes: 3 additions & 3 deletions util.c
Original file line number Diff line number Diff line change
Expand Up @@ -4320,7 +4320,7 @@ Perl_parse_unicode_opts(pTHX_ const char **popt)

if (*p) {
if (isDIGIT(*p)) {
const char* endptr;
const char* endptr = p + strlen(p);
UV uv;
if (grok_atoUV(p, &uv, &endptr) && uv <= U32_MAX) {
opt = (U32)uv;
Expand Down Expand Up @@ -4707,7 +4707,7 @@ S_mem_log_common(enum mem_log_type mlt, const UV n,
* timeval. */
{
STRLEN len;
const char* endptr;
const char* endptr = pmlenv + stren(pmlenv);
int fd;
UV uv;
if (grok_atoUV(pmlenv, &uv, &endptr) /* Ignore endptr. */
Expand Down Expand Up @@ -5989,7 +5989,7 @@ static const char* atos_parse(const char* p,
* The matched regular expression is roughly "\(.*:\d+\)\s*$" */
const char* source_number_start;
const char* source_name_end;
const char* source_line_end;
const char* source_line_end = start;
const char* close_paren;
UV uv;

Expand Down

0 comments on commit 5d4a52b

Please sign in to comment.