From 53dd46c695bcb9ab6a94a31784a5bd6f86a92bb7 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 6 Jul 2024 09:11:08 +0900 Subject: [PATCH] Add position check for XML declaration ## Why? XML declaration must be the first item. https://www.w3.org/TR/2006/REC-xml11-20060816/#document ``` [1] document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* ) ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog ``` [22] prolog ::= XMLDecl Misc* (doctypedecl Misc*)? ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl ``` [23] XMLDecl ::= '' ``` See: https://github.com/ruby/rexml/pull/161#discussion_r1666118193 --- lib/rexml/parsers/baseparser.rb | 5 ++++- test/parse/test_processing_instruction.rb | 17 +++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 02759e70..4e34a183 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -635,7 +635,10 @@ def process_instruction(start_position) @source.position = start_position raise REXML::ParseException.new(message, @source) end - if @document_status.nil? and match_data[1] == "xml" + if match_data[1] == "xml" + if @document_status + raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) + end content = match_data[2] version = VERSION.match(content) version = version[1] unless version.nil? diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index f0c0c24e..5e4b1fc9 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -39,6 +39,23 @@ def test_garbage_text pi.content, ]) end + + def test_xml_declaration_not_at_document_start + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration is not at the start + Line: 1 + Position: 25 + Last 80 unconsumed characters: + + DETAIL + end end end end