Skip to content

Commit

Permalink
Fix syntax error caused by "ss" in look-behind
Browse files Browse the repository at this point in the history
Fixes #92.

This fix was ported from oniguruma:
kkos/oniguruma@257082d
  • Loading branch information
k-takata committed Jan 25, 2019
1 parent cf3bc70 commit b1a5445
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 15 deletions.
37 changes: 22 additions & 15 deletions regcomp.c
Original file line number Diff line number Diff line change
Expand Up @@ -3264,6 +3264,14 @@ setup_subexp_call(Node* node, ScanEnv* env)
}
#endif

/* Bit flags OR-ed together into the `state` argument that setup_tree()
   threads through its recursive calls, and that is forwarded to
   expand_case_fold_string(). Each flag occupies its own bit so that
   several can be set at once. */
#define IN_ALT (1<<0)
/* Visible use: OR-ed into `state` before recursing into an anchor target
   together with IN_LOOK_BEHIND (presumably the negative look-behind
   case -- confirm against the enclosing case label). */
#define IN_NOT (1<<1)
#define IN_REPEAT (1<<2)
#define IN_VAR_REPEAT (1<<3)
#define IN_CALL (1<<4)
#define IN_RECCALL (1<<5)
/* OR-ed into `state` when setup_tree() recurses into the target of an
   anchor that is then handed to setup_look_behind().
   expand_case_fold_string() tests this bit and, when it is set, takes the
   same branch as for characters with no variable-length case folds. */
#define IN_LOOK_BEHIND (1<<6)

/* Divide alternatives of different lengths in a look-behind.
(?<=A|B) ==> (?<=A)|(?<=B)
(?<!A|B) ==> (?<!A)(?<!B)
Expand Down Expand Up @@ -3560,24 +3568,29 @@ expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
return ONIGERR_MEMORY;
}

static int
expand_case_fold_string(Node* node, regex_t* reg)
{
#define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8

static int
expand_case_fold_string(Node* node, regex_t* reg, int state)
{
int r, n, len, alt_num;
int varlen = 0;
int is_in_look_behind;
UChar *start, *end, *p;
Node *top_root, *root, *snode, *prev_node;
OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
StrNode* sn = NSTR(node);
StrNode* sn;

if (NSTRING_IS_AMBIG(node)) return 0;

sn = NSTR(node);

start = sn->s;
end = sn->end;
if (start >= end) return 0;

is_in_look_behind = (state & IN_LOOK_BEHIND) != 0;

r = 0;
top_root = root = prev_node = snode = NULL_NODE;
alt_num = 1;
Expand All @@ -3593,7 +3606,7 @@ expand_case_fold_string(Node* node, regex_t* reg)
len = enclen(reg->enc, p, end);

varlen = is_case_fold_variable_len(n, items, len);
if (n == 0 || varlen == 0) {
if (n == 0 || varlen == 0 || is_in_look_behind) {
if (IS_NULL(snode)) {
if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
onig_node_free(top_root);
Expand Down Expand Up @@ -3854,13 +3867,6 @@ setup_comb_exp_check(Node* node, int state, ScanEnv* env)
}
#endif

/* Bit flags OR-ed together into the `state` argument of setup_tree();
   each flag occupies its own bit so that several can be set at once. */
#define IN_ALT (1<<0)
#define IN_NOT (1<<1)
#define IN_REPEAT (1<<2)
#define IN_VAR_REPEAT (1<<3)
#define IN_CALL (1<<4)
#define IN_RECCALL (1<<5)

/* setup_tree does the following work.
1. check empty loop. (set qn->target_empty_info)
2. expand ignore-case in char class.
Expand Down Expand Up @@ -3902,7 +3908,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)

case NT_STR:
if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
r = expand_case_fold_string(node, reg);
r = expand_case_fold_string(node, reg, state);
}
break;

Expand Down Expand Up @@ -4145,7 +4151,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
if (r < 0) return r;
if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
if (NTYPE(node) != NT_ANCHOR) goto restart;
r = setup_tree(an->target, reg, state, env);
r = setup_tree(an->target, reg, (state | IN_LOOK_BEHIND), env);
if (r != 0) return r;
r = setup_look_behind(node, reg, env);
}
Expand All @@ -4158,7 +4164,8 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
if (r < 0) return r;
if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
if (NTYPE(node) != NT_ANCHOR) goto restart;
r = setup_tree(an->target, reg, (state | IN_NOT), env);
r = setup_tree(an->target, reg, (state | IN_NOT | IN_LOOK_BEHIND),
env);
if (r != 0) return r;
r = setup_look_behind(node, reg, env);
}
Expand Down
22 changes: 22 additions & 0 deletions testpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -1172,6 +1172,28 @@ def main():
x2("(?i)(?<=\u0149)a", "\u02bcna", 2, 3) # with look-behind
# Other Unicode tests
x2("\\x{25771}", "\U00025771", 0, 1)
x2("(?i:ss)", "ss", 0, 2)
x2("(?i:ss)", "Ss", 0, 2)
x2("(?i:ss)", "SS", 0, 2)
if is_unicode_encoding(onig_encoding):
x2("(?i:ss)", "\u017fS", 0, 2) # LATIN SMALL LETTER LONG S
x2("(?i:ss)", "s\u017f", 0, 2)
x2("(?i:ss)", "\u00df", 0, 1) # LATIN SMALL LETTER SHARP S
x2("(?i:ss)", "\u1e9e", 0, 1) # LATIN CAPITAL LETTER SHARP S
x2("(?i:xssy)", "xssy", 0, 4)
x2("(?i:xssy)", "xSsy", 0, 4)
x2("(?i:xssy)", "xSSy", 0, 4)
if is_unicode_encoding(onig_encoding):
x2("(?i:xssy)", "x\u017fSy", 0, 4)
x2("(?i:xssy)", "xs\u017fy", 0, 4)
x2("(?i:xssy)", "x\u00dfy", 0, 3)
x2("(?i:xssy)", "x\u1e9ey", 0, 3)
x2("(?i:\u00df)", "ss", 0, 2)
x2("(?i:\u00df)", "SS", 0, 2)
x2("(?i:[\u00df])", "ss", 0, 2)
x2("(?i:[\u00df])", "SS", 0, 2)
x2("(?i)(?<!ss)z", "qqz", 2, 3) # Issue #92
x2("(?i)(?<!xss)z", "qqz", 2, 3)
x2("[0-9-a]+", " 0123456789-a ", 1, 13) # same as [0-9\-a]
x2("[0-9-\\s]+", " 0123456789-a ", 0, 12) # same as [0-9\-\s]
n("[0-9-a]", "", syn=onigmo.ONIG_SYNTAX_GREP, err=onigmo.ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS)
Expand Down

0 comments on commit b1a5445

Please sign in to comment.