Fix invalid byte sequence on EncodedString#split

Map string chars with invalid encoding to '?'.
Format the identical-string expectation message so it is easier to read.

Refs:
- rspec/rspec-core#1760
- via rspec#134
bf4 · Feb 8, 2015 · 8260a59 · 8260a59
1 parent b666ad5
commit 8260a59
Show file tree

Hide file tree

Showing 2 changed files with 76 additions and 34 deletions.
diff --git a/lib/rspec/support/encoded_string.rb b/lib/rspec/support/encoded_string.rb
@@ -3,16 +3,32 @@ module Support
     # @private
     class EncodedString
       # Reduce allocations by storing constants.
-      UTF_8 = "UTF-8"
-      US_ASCII = 'US-ASCII'
-      #  else: '?' 63.chr ("\x3F")
+      UTF_8    = "UTF-8"
+      US_ASCII = "US-ASCII"
+      #
+      # In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
+      # see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
+      # https://www.ruby-forum.com/topic/6861247
+      # https://twitter.com/nalsh/status/553413844685438976
+      #
+      # For example, given:
+      #  "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
+      #
+      # On MRI 2.1 or above: 63  # '?'
+      # else               : 128 # "\x80"
+      #
+      # Ruby's default replacement string is:
+      #   U+FFFD ("\xEF\xBF\xBD"), for Unicode encoding forms, else
+      #   ?      ("\x3F")
       REPLACE = "?"
       ENCODE_UNCONVERTABLE_BYTES =  {
         :invalid => :replace,
-        :undef   => :replace
+        :undef   => :replace,
+        :replace => REPLACE
       }
       ENCODE_NO_CONVERTER = {
         :invalid => :replace,
+        :replace => REPLACE
       }
 
       def initialize(string, encoding=nil)
@@ -54,7 +70,7 @@ def to_s
         #     vs "\x80".encode('UTF-8','ASCII-8BIT', undef: :replace, replace: '<undef>')
         #     # => '<undef>'
         #   Encoding::CompatibilityError
-        #    when Enconding.compatbile?(str1, str2) is false
+        #    when Encoding.compatible?(str1, str2) is nil
         #     e.g. utf_16le_emoji_string.split("\n")
         #     e.g. valid_unicode_string.encode(utf8_encoding) << ascii_string
         #   Encoding::InvalidByteSequenceError:
@@ -64,13 +80,13 @@ def to_s
         #     vs "\x80".encode('UTF-8','US-ASCII', invalid: :replace, replace: '<byte>')
         #     # => '<byte>'
         #   ArgumentError
-        #    when operating on a string with invalid bytes
-        #     e.g."\xEF".split("\n")
+        #     when operating on a string with invalid bytes
+        #     e.g. "\x80".split("\n")
         #   TypeError
-        #    when a symbol is passed as an encoding
-        #    Encoding.find(:"utf-8")
-        #    when calling force_encoding on an object
-        #    that doesn't respond to #to_str
+        #     when a symbol is passed as an encoding
+        #     Encoding.find(:"UTF-8")
+        #     when calling force_encoding on an object
+        #     that doesn't respond to #to_str
         #
         # Raised by transcoding methods:
         #   Encoding::ConverterNotFoundError:
@@ -80,25 +96,35 @@ def to_s
         #     e.g. "\x80".force_encoding('ASCII-8BIT').encode('Emacs-Mule')
         #
         # Raised by byte <-> char conversions
-        #  RangeError: out of char range
-        #   e.g. the UTF-16LE emoji: 128169.chr
+        #   RangeError: out of char range
+        #    e.g. the UTF-16LE emoji: 128169.chr
         def matching_encoding(string)
+          string = remove_invalid_bytes(string)
           string.encode(@encoding)
         rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
-          normalize_missing(string.encode(@encoding, ENCODE_UNCONVERTABLE_BYTES))
+          string.encode(@encoding, ENCODE_UNCONVERTABLE_BYTES)
         rescue Encoding::ConverterNotFoundError
-          normalize_missing(string.dup.force_encoding(@encoding).encode(ENCODE_NO_CONVERTER))
+          string.dup.force_encoding(@encoding).encode(ENCODE_NO_CONVERTER)
         end
 
-        # Ruby's default replacement string is:
-        # for Unicode encoding forms: U+FFFD ("\xEF\xBF\xBD")
-        MRI_UNICODE_UNKOWN_CHARACTER = "\xEF\xBF\xBD".force_encoding(UTF_8)
-
-        def normalize_missing(string)
-          if @encoding.to_s == UTF_8
-            string.gsub(MRI_UNICODE_UNKOWN_CHARACTER, REPLACE)
-          else
-            string
+        # Prevents raising ArgumentError
+        if String.method_defined?(:scrub)
+          # https://github.com/ruby/ruby/blob/eeb05e8c11/doc/NEWS-2.1.0#L120-L123
+          # https://github.com/ruby/ruby/blob/v2_1_0/string.c#L8242
+          # https://github.com/hsbt/string-scrub
+          # https://github.com/rubinius/rubinius/blob/v2.5.2/kernel/common/string.rb#L1913-L1972
+          def remove_invalid_bytes(string)
+            string.scrub(REPLACE)
+          end
+        else
+          # http://stackoverflow.com/a/8711118/879854
+          # Loop over chars in a string replacing chars
+          # with invalid encoding, which is a pretty good proxy
+          # for the invalid byte sequence that causes an ArgumentError
+          def remove_invalid_bytes(string)
+            string.chars.map do |char|
+              char.valid_encoding? ? char : REPLACE
+            end.join
           end
         end
 

diff --git a/spec/rspec/support/encoded_string_spec.rb b/spec/rspec/support/encoded_string_spec.rb
@@ -15,8 +15,11 @@
     end
 
     failure_message do
-      "expected #{actual.inspect} (#{actual.encoding.name}) to be identical to "\
-        "#{expected.inspect} (#{expected.encoding.name})"
+      "expected\n#{actual.inspect} (#{actual.encoding.name}) to be identical to\n"\
+        "#{expected.inspect} (#{expected.encoding.name})\n"\
+        "The exact bytes are printed below for more detail:\n"\
+        "#{actual.bytes.to_a}\n"\
+        "#{expected.bytes.to_a}\n"\
     end
   else
     match do |actual|
@@ -80,14 +83,8 @@ module RSpec::Support
             }.to raise_error(Encoding::ConverterNotFoundError)
           end
 
-          # In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
-          # see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
-          # https://www.ruby-forum.com/topic/6861247
-          # https://twitter.com/nalsh/status/553413844685438976
-          # For example, given:
-          #  "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
-          # On MRI 2.1 or above: 63 # '?'
-          # else               : 128 # "\x80"
+          # See comment above ENCODE_UNCONVERTABLE_BYTES in encoded_string.rb
+          # for why the behavior differs by (MRI) Ruby version.
           if RUBY_VERSION < '2.1'
             it 'does nothing' do
               resulting_string = build_encoded_string(string, no_converter_encoding).to_s
@@ -220,6 +217,25 @@ module RSpec::Support
             ]
           end
         end
+
+        context 'when the string has an invalid byte sequence' do
+          let(:message_with_invalid_byte_sequence) { "\xEF \255 \xAD I have bad bytes".force_encoding(utf8_encoding) }
+
+          it 'normally raises an ArgumentError' do
+            expect(message_with_invalid_byte_sequence).not_to be_valid_encoding
+            expect {
+              message_with_invalid_byte_sequence.split("\n")
+            }.to raise_error(ArgumentError)
+          end
+
+          it 'replaces invalid bytes with the REPLACE string' do
+            resulting_array = build_encoded_string(message_with_invalid_byte_sequence, utf8_encoding).split("\n")
+            expected_string = "? ? ? I have bad bytes"
+            expect(resulting_array).to match [
+              a_string_identical_to(expected_string)
+            ]
+          end
+        end
       end
 
       def build_encoded_string(string, target_encoding = string.encoding)