Skip to content

Commit

Permalink
fix: Extra content at the end of the document
Browse files Browse the repository at this point in the history
## Why?

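An XML document with additional content after the root element is not well-formed: only comments, processing instructions, and whitespace (`Misc*`) may follow the root element.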

https://www.w3.org/TR/2006/REC-xml11-20060816/#document

```
[1]   document   ::=   ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc

```
[27]   	Misc	   ::=   	Comment | PI | S
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI

```
[16]   	PI	   ::=   	'<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget

```
[17]   	PITarget	   ::=   	Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
```
  • Loading branch information
naitoh committed Jul 7, 2024
1 parent c4fb89b commit c094825
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 0 deletions.
9 changes: 9 additions & 0 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -460,15 +460,24 @@ def pull_event
@closed = tag
@nsstack.shift
else
if @tags.empty? and @have_root
raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
end
@tags.push( tag )
end
@have_root = true
return [ :start_element, tag, attributes ]
end
else
text = @source.read_until("<")
if text.chomp!("<")
@source.position -= "<".bytesize
end
if @tags.empty? and @have_root
unless /\A\s*\z/.match?(text)
raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
end
end
return [ :text, text ]
end
rescue REXML::UndefinedNamespaceException
Expand Down
12 changes: 12 additions & 0 deletions test/parse/test_comment.rb
Original file line number Diff line number Diff line change
Expand Up @@ -105,5 +105,17 @@ def test_after_doctype_malformed_comment_end
DETAIL
end
end

# A comment placed after the closing root tag must still be parsed and
# surfaced as a :comment event carrying the comment body.
def test_after_root
  parser = REXML::Parsers::BaseParser.new('<a></a><!-- ok comment -->')

  # Keep the last payload seen for each event type.
  payload_by_type = {}
  while parser.has_next?
    type, payload = parser.pull
    payload_by_type[type] = payload
  end

  assert_equal(" ok comment ", payload_by_type[:comment])
end
end
end
34 changes: 34 additions & 0 deletions test/parse/test_element.rb
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,40 @@ def test_garbage_less_than_slash_before_end_tag_at_line_start
</ </x>
DETAIL
end

# A second start tag after the root element has been closed must be
# rejected with a ParseException naming the offending tag.
def test_after_root
  error = assert_raise(REXML::ParseException) do
    parser = REXML::Parsers::BaseParser.new('<a></a><b>')
    # Drain the parser; the exception is expected while pulling events.
    parser.pull while parser.has_next?
  end

  expected = <<~DETAIL.chomp
    Malformed XML: Extra tag at the end of the document (got '<b')
    Line: 1
    Position: 10
    Last 80 unconsumed characters:
  DETAIL
  assert_equal(expected, error.to_s)
end

# Same check as for a regular root element, but the root is written as an
# empty-element tag (<a/>): a following start tag must raise.
def test_after_empty_element_tag_root
  error = assert_raise(REXML::ParseException) do
    parser = REXML::Parsers::BaseParser.new('<a/><b>')
    # Drain the parser; the exception is expected while pulling events.
    parser.pull while parser.has_next?
  end

  expected = <<~DETAIL.chomp
    Malformed XML: Extra tag at the end of the document (got '<b')
    Line: 1
    Position: 7
    Last 80 unconsumed characters:
  DETAIL
  assert_equal(expected, error.to_s)
end
end
end
end
12 changes: 12 additions & 0 deletions test/parse/test_processing_instruction.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,17 @@ def test_garbage_text
])
end
end

# A processing instruction placed after the closing root tag must still be
# parsed and surfaced as a :processing_instruction event with its target.
def test_after_root
  parser = REXML::Parsers::BaseParser.new('<a></a><?abc version="1.0" ?>')

  # Keep the last payload seen for each event type.
  payload_by_type = {}
  while parser.has_next?
    type, payload = parser.pull
    payload_by_type[type] = payload
  end

  assert_equal("abc", payload_by_type[:processing_instruction])
end
end
end
25 changes: 25 additions & 0 deletions test/parse/test_text.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
require "test/unit"
require 'rexml/parsers/baseparser'

module REXMLTests
  # Parser-level tests for text (character data) handling.
  class TestParseText < Test::Unit::TestCase
    # Inputs the parser is expected to reject.
    class TestInvalid < self
      # Non-whitespace text after the root element has been closed must
      # raise a ParseException that quotes the offending text.
      def test_after_root
        error = assert_raise(REXML::ParseException) do
          parser = REXML::Parsers::BaseParser.new('<a></a>c')
          # Drain the parser; the exception is expected while pulling events.
          parser.pull while parser.has_next?
        end

        expected = <<~DETAIL.chomp
          Malformed XML: Extra content at the end of the document (got 'c')
          Line: 1
          Position: 8
          Last 80 unconsumed characters:
        DETAIL
        assert_equal(expected, error.to_s)
      end
    end
  end
end

0 comments on commit c094825

Please sign in to comment.