diff --git a/lib/rspec/support/encoded_string.rb b/lib/rspec/support/encoded_string.rb
index f15021704..8b6a9ff3b 100644
--- a/lib/rspec/support/encoded_string.rb
+++ b/lib/rspec/support/encoded_string.rb
@@ -3,16 +3,32 @@ module Support
     # @private
     class EncodedString
       # Reduce allocations by storing constants.
-      UTF_8 = "UTF-8"
-      US_ASCII = 'US-ASCII'
-      # else: '?' 63.chr ("\x3F")
+      UTF_8    = "UTF-8"
+      US_ASCII = "US-ASCII"
+      #
+      # In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
+      # see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
+      #     https://www.ruby-forum.com/topic/6861247
+      #     https://twitter.com/nalsh/status/553413844685438976
+      #
+      # For example, given:
+      #   "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
+      #
+      # On MRI 2.1 or above: 63  # '?'
+      # else               : 128 # "\x80"
+      #
+      # Ruby's default replacement string is:
+      #   U+FFFD ("\xEF\xBF\xBD"), for Unicode encoding forms, else
+      #   ?      ("\x3F")
       REPLACE = "?"
       ENCODE_UNCONVERTABLE_BYTES = {
         :invalid => :replace,
-        :undef   => :replace
+        :undef   => :replace,
+        :replace => REPLACE
       }
       ENCODE_NO_CONVERTER = {
         :invalid => :replace,
+        :replace => REPLACE
       }
 
       def initialize(string, encoding=nil)
@@ -64,13 +80,13 @@ def to_s
         #     vs "\x80".encode('UTF-8','US-ASCII', invalid: :replace, replace: '')
         #     # => ''
         #   ArgumentError
-        #    when operating on a string with invalid bytes
-        #    e.g."\xEF".split("\n")
+        #     when operating on a string with invalid bytes
+        #     e.g."\x80".split("\n")
         #   TypeError
-        #    when a symbol is passed as an encoding
-        #    Encoding.find(:"utf-8")
-        #    when calling force_encoding on an object
-        #    that doesn't respond to #to_str
+        #     when a symbol is passed as an encoding
+        #     Encoding.find(:"UTF-8")
+        #     when calling force_encoding on an object
+        #     that doesn't respond to #to_str
         #
         # Raised by transcoding methods:
         #   Encoding::ConverterNotFoundError:
@@ -80,25 +96,38 @@ def to_s
         #     e.g. "\x80".force_encoding('ASCII-8BIT').encode('Emacs-Mule')
         #
         # Raised by byte <-> char conversions
-        #  RangeError: out of char range
-        #  e.g. the UTF-16LE emoji: 128169.chr
+        #   RangeError: out of char range
+        #     e.g. the UTF-16LE emoji: 128169.chr
         def matching_encoding(string)
+          string = remove_invalid_bytes(string)
           string.encode(@encoding)
         rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
-          normalize_missing(string.encode(@encoding, ENCODE_UNCONVERTABLE_BYTES))
+          string.encode(@encoding, ENCODE_UNCONVERTABLE_BYTES)
         rescue Encoding::ConverterNotFoundError
-          normalize_missing(string.dup.force_encoding(@encoding).encode(ENCODE_NO_CONVERTER))
+          string.dup.force_encoding(@encoding).encode(ENCODE_NO_CONVERTER)
         end
 
-        # Ruby's default replacement string is:
-        #   for Unicode encoding forms: U+FFFD ("\xEF\xBF\xBD")
-        MRI_UNICODE_UNKOWN_CHARACTER = "\xEF\xBF\xBD".force_encoding(UTF_8)
-
-        def normalize_missing(string)
-          if @encoding.to_s == UTF_8
-            string.gsub(MRI_UNICODE_UNKOWN_CHARACTER, REPLACE)
-          else
+        # Work around bad bytes with a double conversion
+        # Prevents raising ArgumentError
+        #
+        # Emulates Ruby 2.1 String#scrub
+        # see https://github.com/hsbt/string-scrub
+        #     https://github.com/ruby/ruby/blob/eeb05e8c11/doc/NEWS-2.1.0#L120-L123
+        #     https://speakerdeck.com/samsaffron/why-ruby-2-dot-1-excites-me?slide=48
+        #
+        # Force UTF-8 encoding,
+        # Converting it to a higher character set (UTF-16) and then
+        # back (to UTF-8) ensures that you will strip away invalid or undefined byte sequences,
+        # Restore original encoding
+        def remove_invalid_bytes(string)
+          if string.valid_encoding?
             string
+          else
+            string.dup.
+              force_encoding(UTF_8).
+              encode(Encoding::UTF_16LE, UTF_8, ENCODE_NO_CONVERTER).
+              encode(UTF_8, Encoding::UTF_16LE).
+              force_encoding(string.encoding)
           end
         end
 
diff --git a/spec/rspec/support/encoded_string_spec.rb b/spec/rspec/support/encoded_string_spec.rb
index 47da1aff5..3b77b0fb2 100644
--- a/spec/rspec/support/encoded_string_spec.rb
+++ b/spec/rspec/support/encoded_string_spec.rb
@@ -80,14 +80,8 @@ module RSpec::Support
               }.to raise_error(Encoding::ConverterNotFoundError)
             end
 
-            # In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
-            # see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
-            #     https://www.ruby-forum.com/topic/6861247
-            #     https://twitter.com/nalsh/status/553413844685438976
-            # For example, given:
-            #   "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
-            # On MRI 2.1 or above: 63  # '?'
-            # else               : 128 # "\x80"
+            # See comment above ENCODE_UNCONVERTABLE_BYTES in encoded_string.rb
+            # for why the behavior differs by (MRI) Ruby version.
             if RUBY_VERSION < '2.1'
               it 'does nothing' do
                 resulting_string = build_encoded_string(string, no_converter_encoding).to_s
@@ -220,6 +214,25 @@ module RSpec::Support
           ]
         end
       end
+
+      context 'when the string has an invalid byte sequence' do
+        let(:message_with_invalid_byte_sequence) { "\xEF \255 \xAD I have bad bytes".force_encoding(utf8_encoding) }
+
+        it 'normally raises an ArgumentError' do
+          expect(message_with_invalid_byte_sequence).not_to be_valid_encoding
+          expect {
+            message_with_invalid_byte_sequence.split("\n")
+          }.to raise_error(ArgumentError)
+        end
+
+        it 'replaces invalid bytes with the REPLACE string' do
+          resulting_array = build_encoded_string(message_with_invalid_byte_sequence, utf8_encoding).split("\n")
+          expected_string = "? ? ? I have bad bytes"
+          expect(resulting_array).to match [
+            a_string_identical_to(expected_string)
+          ]
+        end
+      end
     end
 
     def build_encoded_string(string, target_encoding = string.encoding)