From a79ac8b4b42a9efabe33a0be31bd82d33fd50347 Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 11:18:11 +0900 Subject: [PATCH] Fix performance issue caused by using repeated `>` characters inside `<!DOCTYPE root [<!-- PATTERN -->]>` (#174) A `<` is treated as a string delimiter. In certain cases, if `<` is used in succession, read and match are repeated, which slows down the process. Therefore, the following is used to read ahead to a specific part of the string in advance. --- lib/rexml/parsers/baseparser.rb | 2 +- test/parse/test_document_type_declaration.rb | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 7fe6c4e8..4fcdaba7 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -378,7 +378,7 @@ def pull_event raise REXML::ParseException.new(message, @source) end return [:notationdecl, name, *id] - elsif md = @source.match(/--(.*?)-->/um, true) + elsif md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM) case md[1] when /--/, /-\z/ raise REXML::ParseException.new("Malformed comment", @source) diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 61c3f04d..3c3371ea 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -290,6 +290,13 @@ def test_gt_linear_performance_malformed_entity end end + def test_gt_linear_performance_comment + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('<!DOCTYPE root [<!-- ' + ">" * n + ' -->]>') + end + end + private def parse(internal_subset) super(<<-DOCTYPE)