Skip to content

Commit

Permalink
Deduplicate Regexp literals
Browse files Browse the repository at this point in the history
Ruby ticket: https://bugs.ruby-lang.org/issues/16557

Real world application contain many duplicated Regexp literals.

From a rails/console in Redmine:

```
>> ObjectSpace.each_object(Regexp).count
=> 6828
>> ObjectSpace.each_object(Regexp).uniq.count
=> 4162
>> ObjectSpace.each_object(Regexp).to_a.map { |r| ObjectSpace.memsize_of(r) }.sum
=> 4611957 # 4.4 MB total
>> ObjectSpace.each_object(Regexp).to_a.map { |r| ObjectSpace.memsize_of(r) }.sum - ObjectSpace.each_object(Regexp).to_a.uniq.map { |r| ObjectSpace.memsize_of(r) }.sum
=> 1490601 # 1.42 MB could be saved
```

Here's the to 10 duplicated regexps in Redmine:

```
147: /"/
107: /\s+/
103: //
89: /\n/
83: /'/
76: /\s+/m
37: /\d+/
35: /\[/
33: /./
33: /\\./
```

Any empty Rails application will have a similar amount of regexps.

Since https://bugs.ruby-lang.org/issues/16377 made literal regexps frozen, it is possible to deduplicate literal regexps without changing any semantic.

This patch is heavily inspired by the `frozen_strings` table, but applied to literal regexps.
  • Loading branch information
byroot committed Jun 26, 2020
1 parent ad1ebef commit c083076
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 5 deletions.
43 changes: 38 additions & 5 deletions re.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "ruby/encoding.h"
#include "ruby/re.h"
#include "ruby/util.h"
#include "vm_core.h"

VALUE rb_eRegexpError;

Expand Down Expand Up @@ -2956,19 +2957,49 @@ rb_reg_new(const char *s, long len, int options)
return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
}

static VALUE
rb_reg_lookup_literal(VALUE str, int options)
{
VALUE cache = GET_VM()->regexp_literals_cache;
VALUE options_cache = rb_hash_lookup(cache, INT2FIX(options));
if (RTEST(options_cache)) {
return rb_hash_lookup(options_cache, str);
}
return Qnil;
}

static void
rb_reg_cache_literal(VALUE str, int options, VALUE re)
{
VALUE cache = GET_VM()->regexp_literals_cache;
VALUE options_cache = rb_hash_lookup(cache, INT2FIX(options));
if (!RTEST(options_cache)) {
options_cache = rb_hash_new();
rb_hash_aset(cache, INT2FIX(options), options_cache);
}
rb_hash_aset(options_cache, str, re);
}

VALUE
rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline)
{
VALUE re = rb_reg_alloc();
onig_errmsg_buffer err = "";

if (!str) str = rb_str_new(0,0);

VALUE re = rb_reg_lookup_literal(str, options);
if (RTEST(re)) {
return re;
}

re = rb_reg_alloc();
onig_errmsg_buffer err = "";
if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
rb_set_errinfo(rb_reg_error_desc(str, options, err));
return Qnil;
rb_set_errinfo(rb_reg_error_desc(str, options, err));
return Qnil;
}
FL_SET(re, REG_LITERAL);
rb_obj_freeze(re);
rb_reg_cache_literal(str, options, re);

return re;
}

Expand Down Expand Up @@ -4111,4 +4142,6 @@ Init_Regexp(void)
rb_define_method(rb_cMatch, "hash", match_hash, 0);
rb_define_method(rb_cMatch, "eql?", match_equal, 1);
rb_define_method(rb_cMatch, "==", match_equal, 1);

rb_gc_register_mark_object(GET_VM()->regexp_literals_cache = rb_hash_new());
}
7 changes: 7 additions & 0 deletions test/ruby/test_regexp.rb
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,13 @@ def test_assert_normal_exit
Regexp.union("a", "a")
end

def test_literal_deduplication
assert_same(/a/, /a/)
refute_same(/a/, /a/m)
refute_same(/a/, Regexp.new('a'))
assert_equal(/a/, Regexp.new('a'))
end

def test_to_s
assert_equal '(?-mix:\x00)', Regexp.new("\0").to_s

Expand Down
1 change: 1 addition & 0 deletions vm_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -634,6 +634,7 @@ typedef struct rb_vm_struct {

VALUE *defined_strings;
st_table *frozen_strings;
VALUE regexp_literals_cache;

const struct rb_builtin_function *builtin_function_table;
int builtin_inline_index;
Expand Down

0 comments on commit c083076

Please sign in to comment.