From 11302eea88b4b81c4a16c628bc5259c8aba1e11b Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Fri, 26 Jun 2020 13:45:39 +0200 Subject: [PATCH] Deduplicate Regexp literals Ruby ticket: https://bugs.ruby-lang.org/issues/16557 Real world application contain many duplicated Regexp literals. From a rails/console in Redmine: ``` >> ObjectSpace.each_object(Regexp).count => 6828 >> ObjectSpace.each_object(Regexp).uniq.count => 4162 >> ObjectSpace.each_object(Regexp).to_a.map { |r| ObjectSpace.memsize_of(r) }.sum => 4611957 # 4.4 MB total >> ObjectSpace.each_object(Regexp).to_a.map { |r| ObjectSpace.memsize_of(r) }.sum - ObjectSpace.each_object(Regexp).to_a.uniq.map { |r| ObjectSpace.memsize_of(r) }.sum => 1490601 # 1.42 MB could be saved ``` Here's the to 10 duplicated regexps in Redmine: ``` 147: /"/ 107: /\s+/ 103: // 89: /\n/ 83: /'/ 76: /\s+/m 37: /\d+/ 35: /\[/ 33: /./ 33: /\\./ ``` Any empty Rails application will have a similar amount of regexps. Since https://bugs.ruby-lang.org/issues/16377 made literal regexps frozen, it is possible to deduplicate literal regexps without changing any semantic. This patch is heavily inspired by the `frozen_strings` table, but applied to literal regexps. --- re.c | 44 +++++++++++++++++++++++++++++++++++----- test/ruby/test_regexp.rb | 7 +++++++ vm_core.h | 1 + 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/re.c b/re.c index b5d0bf74cf1e05..b7b449174113cc 100644 --- a/re.c +++ b/re.c @@ -23,6 +23,7 @@ #include "ruby/encoding.h" #include "ruby/re.h" #include "ruby/util.h" +#include "vm_core.h" VALUE rb_eRegexpError; @@ -2956,19 +2957,50 @@ rb_reg_new(const char *s, long len, int options) return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options); } +static VALUE +rb_reg_lookup_literal(VALUE str, int options) +{ + VALUE cache = GET_VM()->regexp_literals_cache; + VALUE options_cache = rb_hash_lookup(cache, INT2FIX(options)); + if (RTEST(options_cache)) { + return rb_hash_lookup(options_cache, str); + } + return Qnil; +} + +static void +rb_reg_cache_literal(VALUE str, int options, VALUE re) +{ + VALUE cache = GET_VM()->regexp_literals_cache; + VALUE options_cache = rb_hash_lookup(cache, INT2FIX(options)); + if (!RTEST(options_cache)) { + options_cache = rb_ident_hash_new(); + rb_hash_aset(cache, INT2FIX(options), options_cache); + } + rb_hash_aset(options_cache, str, re); +} + VALUE rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline) { - VALUE re = rb_reg_alloc(); - onig_errmsg_buffer err = ""; - if (!str) str = rb_str_new(0,0); + str = rb_fstring(str); + + VALUE re = rb_reg_lookup_literal(str, options); + if (RTEST(re)) { + return re; + } + + re = rb_reg_alloc(); + onig_errmsg_buffer err = ""; if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) { - rb_set_errinfo(rb_reg_error_desc(str, options, err)); - return Qnil; + rb_set_errinfo(rb_reg_error_desc(str, options, err)); + return Qnil; } FL_SET(re, REG_LITERAL); rb_obj_freeze(re); + rb_reg_cache_literal(str, options, re); + return re; } @@ -4111,4 +4143,6 @@ Init_Regexp(void) rb_define_method(rb_cMatch, "hash", match_hash, 0); rb_define_method(rb_cMatch, "eql?", match_equal, 1); rb_define_method(rb_cMatch, "==", match_equal, 1); + + rb_gc_register_mark_object(GET_VM()->regexp_literals_cache = rb_hash_new()); } diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb index 231fd392d17d0e..3411746cc9ebec 100644 --- a/test/ruby/test_regexp.rb +++ b/test/ruby/test_regexp.rb @@ -62,6 +62,13 @@ def test_assert_normal_exit Regexp.union("a", "a") end + def test_literal_deduplication + assert_same(/a/, /a/) + refute_same(/a/, /a/m) + refute_same(/a/, Regexp.new('a')) + assert_equal(/a/, Regexp.new('a')) + end + def test_to_s assert_equal '(?-mix:\x00)', Regexp.new("\0").to_s diff --git a/vm_core.h b/vm_core.h index fd49c243e3aee4..75f2f45c4b490c 100644 --- a/vm_core.h +++ b/vm_core.h @@ -634,6 +634,7 @@ typedef struct rb_vm_struct { VALUE *defined_strings; st_table *frozen_strings; + VALUE regexp_literals_cache; const struct rb_builtin_function *builtin_function_table; int builtin_inline_index;