Fix parsing of + and ? following intervals ...

These characters were wrongly treated as possessive or reluctant mode flags for the interval quantifier. Ruby/Onigmo does not support these modes for intervals, so it treats them as extra, chained quantifiers instead. Cf. #3, #69
ammar · May 1, 2022 · 1302812 · 1302812
1 parent 2e580dc
commit 1302812
Show file tree

Hide file tree

Showing 6 changed files with 30 additions and 29 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,11 @@
   - `#conditional_level`, `#level`, `#nesting_level` , `#set_level`
   - this allows a more unified handling with `Expression::Base` instances
 
+### Fixed
+
+- fixed interpretation of `+` and `?` following interval quantifiers (`{n,n}`)
+  - treat as chained quantifiers like Ruby does instead of possessive/lazy mode
+
 ## [2.3.1] - 2022-04-24 - [Janosch Müller](mailto:[email protected])
 
 ### Fixed

diff --git a/README.md b/README.md
@@ -367,12 +367,12 @@ _Note that not all of these are available in all versions of Ruby_
 | **POSIX Classes**                     | `[:alpha:]`, `[:^digit:]`                               | &#x2713; |
 | **Quantifiers**                       |                                                         | &#x22f1; |
 | &emsp;&nbsp;_**Greedy**_              | `?`, `*`, `+`, `{m,M}`                                  | &#x2713; |
-| &emsp;&nbsp;_**Reluctant** (Lazy)_    | `??`, `*?`, `+?`, `{m,M}?`                              | &#x2713; |
-| &emsp;&nbsp;_**Possessive**_          | `?+`, `*+`, `++`, `{m,M}+`                              | &#x2713; |
+| &emsp;&nbsp;_**Reluctant** (Lazy)_    | `??`, `*?`, `+?` \[1\]                                  | &#x2713; |
+| &emsp;&nbsp;_**Possessive**_          | `?+`, `*+`, `++` \[1\]                                  | &#x2713; |
 | **String Escapes**                    |                                                         | &#x22f1; |
-| &emsp;&nbsp;_**Control** \[1\]_       | `\C-C`, `\cD`                                           | &#x2713; |
+| &emsp;&nbsp;_**Control** \[2\]_       | `\C-C`, `\cD`                                           | &#x2713; |
 | &emsp;&nbsp;_**Hex**_                 | `\x20`, `\x{701230}`                                    | &#x2713; |
-| &emsp;&nbsp;_**Meta** \[1\]_          | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C`        | &#x2713; |
+| &emsp;&nbsp;_**Meta** \[2\]_          | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C`        | &#x2713; |
 | &emsp;&nbsp;_**Octal**_               | `\0`, `\01`, `\012`                                     | &#x2713; |
 | &emsp;&nbsp;_**Unicode**_             | `\uHHHH`, `\u{H+ H+}`                                   | &#x2713; |
 | **Unicode Properties**                | _<sub>([Unicode 13.0.0](https://www.unicode.org/versions/Unicode13.0.0/))</sub>_ | &#x22f1; |
@@ -384,7 +384,11 @@ _Note that not all of these are available in all versions of Ruby_
 | &emsp;&nbsp;_**Scripts**_             | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}`              | &#x2713; |
 | &emsp;&nbsp;_**Simple**_              | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}`               | &#x2713; |
 
-**\[1\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex escapes when used in Regexp literals](
+**\[1\]**: Ruby does not support lazy or possessive interval quantifiers. Any `+` or `?` that follows an interval
+quantifier will be treated as another, chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issues/3),
+[#69](https://github.com/ammar/regexp_parser/pull/69).
+
+**\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex escapes when used in Regexp literals](
  https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9 ), so they will only reach the
 scanner and will only be emitted if a String or a Regexp that has been built with the `::new` constructor is scanned.
 

diff --git a/lib/regexp_parser/expression/quantifier.rb b/lib/regexp_parser/expression/quantifier.rb
@@ -34,7 +34,7 @@ def initialize(*args)
         when /zero_or_one/  then @min, @max = 0, 1
         when /zero_or_more/ then @min, @max = 0, -1
         when /one_or_more/  then @min, @max = 1, -1
-        when /interval/
+        when :interval
           int_min = token.text[/\{(\d*)/, 1]
           int_max = token.text[/,?(\d*)\}/, 1]
           @min, @max = int_min.to_i, (int_max.empty? ? -1 : int_max.to_i)

diff --git a/lib/regexp_parser/parser.rb b/lib/regexp_parser/parser.rb
@@ -499,13 +499,6 @@ def quantifier(token)
       target_node = new_group
     end
 
-    # TODO: in v3.0.0, solve in scanner.rl and remove this code block
-    if token.text =~ /\{.*\?/
-      token.token = :interval_reluctant
-    elsif token.text =~ /\{.*\+/
-      token.token = :interval_possessive
-    end
-
     unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
                              (?:_greedy|_reluctant|_possessive)?\z/x
       raise UnknownTokenError.new('Quantifier', token)

diff --git a/lib/regexp_parser/scanner/scanner.rl b/lib/regexp_parser/scanner/scanner.rl
@@ -74,8 +74,7 @@
   quantity_maximum      = ',' . (digit+);
   quantity_range        = (digit+) . ',' . (digit+);
   quantifier_interval   = range_open . ( quantity_exact | quantity_minimum |
-                          quantity_maximum | quantity_range ) . range_close .
-                          quantifier_mode?;
+                          quantity_maximum | quantity_range ) . range_close;
 
   quantifiers           = quantifier_greedy | quantifier_reluctant |
                           quantifier_possessive | quantifier_interval;
@@ -605,7 +604,6 @@
       end
     };
 
-    # TODO: in v3.0.0 emit token with mode included (e.g. :interval_greedy)
     quantifier_interval {
       emit(:quantifier, :interval, copy(data, ts, te))
     };

diff --git a/spec/parser/quantifiers_spec.rb b/spec/parser/quantifiers_spec.rb
@@ -5,6 +5,7 @@
     it "parses the quantifier in #{pattern} as #{mode} #{token}" do
       root = RP.parse(pattern, '*')
       exp = root[0]
+      exp = exp[0] if exp.is_a?(Group::Passive) && exp.implicit?
 
       expect(exp).to be_quantified
       expect(exp.quantifier.token).to eq token
@@ -25,34 +26,34 @@
   include_examples 'quantifier', /a+?b/,     '+?',     :reluctant,  :one_or_more,  1, -1
   include_examples 'quantifier', /a++b/,     '++',     :possessive, :one_or_more,  1, -1
   include_examples 'quantifier', /a{2,4}b/,  '{2,4}',  :greedy,     :interval,     2, 4
-  include_examples 'quantifier', /a{2,4}?b/, '{2,4}?', :reluctant,  :interval,     2, 4
-  include_examples 'quantifier', /a{2,4}+b/, '{2,4}+', :possessive, :interval,     2, 4
   include_examples 'quantifier', /a{2,}b/,   '{2,}',   :greedy,     :interval,     2, -1
-  include_examples 'quantifier', /a{2,}?b/,  '{2,}?',  :reluctant,  :interval,     2, -1
-  include_examples 'quantifier', /a{2,}+b/,  '{2,}+',  :possessive, :interval,     2, -1
   include_examples 'quantifier', /a{,3}b/,   '{,3}',   :greedy,     :interval,     0, 3
-  include_examples 'quantifier', /a{,3}?b/,  '{,3}?',  :reluctant,  :interval,     0, 3
-  include_examples 'quantifier', /a{,3}+b/,  '{,3}+',  :possessive, :interval,     0, 3
   include_examples 'quantifier', /a{4}b/,    '{4}',    :greedy,     :interval,     4, 4
-  include_examples 'quantifier', /a{4}?b/,   '{4}?',   :reluctant,  :interval,     4, 4
-  include_examples 'quantifier', /a{4}+b/,   '{4}+',   :possessive, :interval,     4, 4
-  include_examples 'quantifier', /a{004}+b/, '{004}+', :possessive, :interval,     4, 4
+  include_examples 'quantifier', /a{004}b/,  '{004}',  :greedy,     :interval,     4, 4
 
   # special case: exps with chained quantifiers are wrapped in implicit passive groups
-  include_examples 'parse', /a+{2}{3}/,
+  include_examples 'parse', /a+{2}{3}+/,
     0 => [
       :group, :passive, Group::Passive, implicit?: true, level: 0,
-      quantifier: Quantifier.new(Regexp::Token.new(:quantifier, :interval, '{3}', 0, 0, 0, 0, 0))
+      quantifier: Quantifier.new(Regexp::Token.new(:quantifier, :one_or_more, '+', 0, 0, 0, 0, 0))
     ],
     [0, 0] => [
       :group, :passive, Group::Passive, implicit?: true, level: 1,
-      quantifier: Quantifier.new(Regexp::Token.new(:quantifier, :interval, '{2}', 0, 0, 0, 0, 0))
+      quantifier: Quantifier.new(Regexp::Token.new(:quantifier, :interval, '{3}', 0, 0, 0, 0, 0))
     ],
     [0, 0, 0] => [
-      :literal, :literal, Literal, text: 'a', level: 2,
+      :group, :passive, Group::Passive, implicit?: true, level: 2,
+      quantifier: Quantifier.new(Regexp::Token.new(:quantifier, :interval, '{2}', 0, 0, 0, 0, 0))
+    ],
+    [0, 0, 0, 0] => [
+      :literal, :literal, Literal, text: 'a', level: 3,
       quantifier: Quantifier.new(Regexp::Token.new(:quantifier, :one_or_more, '+', 0, 0, 0, 0, 0))
     ]
 
+  # Ruby does not support modes for intervals; a following `?` or `+` is read as a chained quantifier
+  include_examples 'quantifier', /a{2,4}?b/, '{2,4}', :greedy, :interval, 2, 4
+  include_examples 'quantifier', /a{2,4}+b/, '{2,4}', :greedy, :interval, 2, 4
+
   specify('mode-checking methods') do
     exp = RP.parse(/a??/).first