Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize the parse_attributes method to use Source#match to parse XML. #119

Merged
merged 2 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 44 additions & 72 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ class BaseParser

module Private
INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
TAG_PATTERN = /((?>#{QNAME_STR}))/um
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
NAME_PATTERN = /\s*#{NAME}/um
Expand All @@ -128,7 +128,6 @@ module Private
# Sets up the parser for +source+: hands it to the stream= writer and starts
# with an empty list of listeners.
def initialize( source )
self.stream = source
@listeners = []
# StringScanner created over an empty string.
# NOTE(review): the hunk header (-128,7 +128,6) suggests this is the line
# removed by this change — confirm against the real file.
@attributes_scanner = StringScanner.new('')
end

def add_listener( listener )
Expand Down Expand Up @@ -614,87 +613,60 @@ def process_instruction(start_position)
def parse_attributes(prefixes, curr_ns)
attributes = {}
closed = false
match_data = @source.match(/^(.*?)(\/)?>/um, true)
if match_data.nil?
message = "Start tag isn't ended"
raise REXML::ParseException.new(message, @source)
end

raw_attributes = match_data[1]
closed = !match_data[2].nil?
return attributes, closed if raw_attributes.nil?
return attributes, closed if raw_attributes.empty?

@attributes_scanner.string = raw_attributes
scanner = @attributes_scanner
until scanner.eos?
if scanner.scan(/\s+/)
break if scanner.eos?
end

start_position = scanner.pos
while true
break if scanner.scan(ATTRIBUTE_PATTERN)
unless scanner.scan(QNAME)
message = "Invalid attribute name: <#{scanner.rest}>"
raise REXML::ParseException.new(message, @source)
end
name = scanner[0]
unless scanner.scan(/\s*=\s*/um)
while true
if @source.match(">", true)
return attributes, closed
elsif @source.match("/>", true)
closed = true
return attributes, closed
elsif match = @source.match(QNAME, true)
name = match[1]
prefix = match[2]
local_part = match[3]

unless @source.match(/\s*=\s*/um, true)
message = "Missing attribute equal: <#{name}>"
raise REXML::ParseException.new(message, @source)
end
quote = scanner.scan(/['"]/)
unless quote
message = "Missing attribute value start quote: <#{name}>"
raise REXML::ParseException.new(message, @source)
end
unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
@source.ensure_buffer
match_data = @source.match(/^(.*?)(\/)?>/um, true)
if match_data
scanner << "/" if closed
scanner << ">"
scanner << match_data[1]
scanner.pos = start_position
closed = !match_data[2].nil?
next
unless match = @source.match(/(['"])(.*?)\1\s*/um, true)
if match = @source.match(/(['"])/, true)
message =
"Missing attribute value end quote: <#{name}>: <#{match[1]}>"
raise REXML::ParseException.new(message, @source)
else
message = "Missing attribute value start quote: <#{name}>"
raise REXML::ParseException.new(message, @source)
end
message =
"Missing attribute value end quote: <#{name}>: <#{quote}>"
raise REXML::ParseException.new(message, @source)
end
end
name = scanner[1]
prefix = scanner[2]
local_part = scanner[3]
# quote = scanner[4]
value = scanner[5]
if prefix == "xmlns"
if local_part == "xml"
if value != "http://www.w3.org/XML/1998/namespace"
msg = "The 'xml' prefix must not be bound to any other namespace "+
value = match[2]
if prefix == "xmlns"
if local_part == "xml"
if value != "http://www.w3.org/XML/1998/namespace"
msg = "The 'xml' prefix must not be bound to any other namespace "+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
raise REXML::ParseException.new( msg, @source, self )
end
elsif local_part == "xmlns"
msg = "The 'xmlns' prefix must not be declared "+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
raise REXML::ParseException.new( msg, @source, self )
raise REXML::ParseException.new( msg, @source, self)
end
elsif local_part == "xmlns"
msg = "The 'xmlns' prefix must not be declared "+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
raise REXML::ParseException.new( msg, @source, self)
curr_ns << local_part
elsif prefix
prefixes << prefix unless prefix == "xml"
end
curr_ns << local_part
elsif prefix
prefixes << prefix unless prefix == "xml"
end

if attributes.has_key?(name)
msg = "Duplicate attribute #{name.inspect}"
raise REXML::ParseException.new(msg, @source, self)
end
if attributes.has_key?(name)
msg = "Duplicate attribute #{name.inspect}"
raise REXML::ParseException.new(msg, @source, self)
end

attributes[name] = value
attributes[name] = value
else
message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
raise REXML::ParseException.new(message, @source)
end
end
return attributes, closed
end
end
end
Expand Down
4 changes: 2 additions & 2 deletions test/parse/test_element.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ def test_empty_namespace_attribute_name
assert_equal(<<-DETAIL.chomp, exception.to_s)
Invalid attribute name: <:a="">
Line: 1
Position: 9
Position: 13
Last 80 unconsumed characters:

:a=""></x>
DETAIL
end

Expand Down
20 changes: 18 additions & 2 deletions test/test_core.rb
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,12 @@ def test_attribute

def test_attribute_namespace_conflict
# https://www.w3.org/TR/xml-names/#uniqAttrs
message = <<-MESSAGE
message = <<-MESSAGE.chomp
Duplicate attribute "a"
Line: 4
Position: 140
Last 80 unconsumed characters:
/>
MESSAGE
assert_raise(REXML::ParseException.new(message)) do
Document.new(<<-XML)
Expand Down Expand Up @@ -1323,11 +1324,26 @@ def test_ticket_21
exception = assert_raise(ParseException) do
Document.new(src)
end
assert_equal(<<-DETAIL, exception.to_s)
assert_equal(<<-DETAIL.chomp, exception.to_s)
Missing attribute value start quote: <bar>
Line: 1
Position: 16
Last 80 unconsumed characters:
value/>
DETAIL
end

# Checks that an attribute value with an opening quote but no closing quote
# is rejected: building a Document from '<foo bar="value/>' must raise
# ParseException, and the exception text must name the attribute (<bar>) and
# the unmatched quote character (<">).
def test_parse_exception_on_missing_attribute_end_quote
# The double quote after bar= is never closed in this input.
src = '<foo bar="value/>'
exception = assert_raise(ParseException) do
Document.new(src)
end
# The whole exception text is compared (message, line, position, unconsumed
# input); .chomp drops the heredoc's trailing newline before the comparison.
assert_equal(<<-DETAIL.chomp, exception.to_s)
Missing attribute value end quote: <bar>: <">
Line: 1
Position: 17
Last 80 unconsumed characters:
value/>
DETAIL
end

Expand Down