Skip to content

Commit

Permalink
Fix invalid byte sequence on EncodedString#split
Browse files Browse the repository at this point in the history
Map string char with invalid encoding to '?'
Format identical string expectation to read easier

Refs:
- rspec/rspec-core#1760
- via rspec#134
  • Loading branch information
bf4 committed Feb 8, 2015
1 parent b666ad5 commit 8260a59
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 34 deletions.
74 changes: 50 additions & 24 deletions lib/rspec/support/encoded_string.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,32 @@ module Support
# @private
class EncodedString
# Reduce allocations by storing constants.
UTF_8 = "UTF-8"
US_ASCII = 'US-ASCII'
# else: '?' 63.chr ("\x3F")
UTF_8 = "UTF-8"
US_ASCII = "US-ASCII"
#
# In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
# see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
# https://www.ruby-forum.com/topic/6861247
# https://twitter.com/nalsh/status/553413844685438976
#
# For example, given:
# "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
#
# On MRI 2.1 or above: 63 # '?'
# else : 128 # "\x80"
#
# Ruby's default replacement string is:
# U+FFFD ("\xEF\xBF\xBD"), for Unicode encoding forms, else
# ? ("\x3F")
REPLACE = "?"
ENCODE_UNCONVERTABLE_BYTES = {
:invalid => :replace,
:undef => :replace
:undef => :replace,
:replace => REPLACE
}
ENCODE_NO_CONVERTER = {
:invalid => :replace,
:replace => REPLACE
}

def initialize(string, encoding=nil)
Expand Down Expand Up @@ -54,7 +70,7 @@ def to_s
# vs "\x80".encode('UTF-8','ASCII-8BIT', undef: :replace, replace: '<undef>')
# # => '<undef>'
# Encoding::CompatibilityError
# when Enconding.compatbile?(str1, str2) is false
# when Encoding.compatible?(str1, str2) is nil
# e.g. utf_16le_emoji_string.split("\n")
# e.g. valid_unicode_string.encode(utf8_encoding) << ascii_string
# Encoding::InvalidByteSequenceError:
Expand All @@ -64,13 +80,13 @@ def to_s
# vs "\x80".encode('UTF-8','US-ASCII', invalid: :replace, replace: '<byte>')
# # => '<byte>'
# ArgumentError
# when operating on a string with invalid bytes
# e.g."\xEF".split("\n")
# when operating on a string with invalid bytes
# e.g."\x80".split("\n")
# TypeError
# when a symbol is passed as an encoding
# Encoding.find(:"utf-8")
# when calling force_encoding on an object
# that doesn't respond to #to_str
# when a symbol is passed as an encoding
# Encoding.find(:"UTF-8")
# when calling force_encoding on an object
# that doesn't respond to #to_str
#
# Raised by transcoding methods:
# Encoding::ConverterNotFoundError:
Expand All @@ -80,25 +96,35 @@ def to_s
# e.g. "\x80".force_encoding('ASCII-8BIT').encode('Emacs-Mule')
#
# Raised by byte <-> char conversions
# RangeError: out of char range
# e.g. the UTF-16LE emoji: 128169.chr
# RangeError: out of char range
# e.g. the UTF-16LE emoji: 128169.chr
def matching_encoding(string)
string = remove_invalid_bytes(string)
string.encode(@encoding)
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
normalize_missing(string.encode(@encoding, ENCODE_UNCONVERTABLE_BYTES))
string.encode(@encoding, ENCODE_UNCONVERTABLE_BYTES)
rescue Encoding::ConverterNotFoundError
normalize_missing(string.dup.force_encoding(@encoding).encode(ENCODE_NO_CONVERTER))
string.dup.force_encoding(@encoding).encode(ENCODE_NO_CONVERTER)
end

# Ruby's default replacement string is:
# for Unicode encoding forms: U+FFFD ("\xEF\xBF\xBD")
MRI_UNICODE_UNKOWN_CHARACTER = "\xEF\xBF\xBD".force_encoding(UTF_8)

def normalize_missing(string)
if @encoding.to_s == UTF_8
string.gsub(MRI_UNICODE_UNKOWN_CHARACTER, REPLACE)
else
string
# Prevents raising ArgumentError
if String.method_defined?(:scrub)
# https://github.com/ruby/ruby/blob/eeb05e8c11/doc/NEWS-2.1.0#L120-L123
# https://github.com/ruby/ruby/blob/v2_1_0/string.c#L8242
# https://github.com/hsbt/string-scrub
# https://github.com/rubinius/rubinius/blob/v2.5.2/kernel/common/string.rb#L1913-L1972
# Replaces invalid byte sequences in +string+ with REPLACE by delegating
# to String#scrub and returns the result.
def remove_invalid_bytes(string)
string.scrub(REPLACE)
end
else
# http://stackoverflow.com/a/8711118/879854
# Loop over chars in a string replacing chars
# with invalid encoding, which is a pretty good proxy
# for the invalid byte sequence that causes an ArgumentError
def remove_invalid_bytes(string)
  # Visit the string one character at a time; a character that fails
  # valid_encoding? is swapped for REPLACE, everything else is kept as-is.
  cleaned = string.each_char.map do |character|
    if character.valid_encoding?
      character
    else
      REPLACE
    end
  end
  # Stitch the per-character pieces back into a single string.
  cleaned.join
end
end

Expand Down
36 changes: 26 additions & 10 deletions spec/rspec/support/encoded_string_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@
end

failure_message do
"expected #{actual.inspect} (#{actual.encoding.name}) to be identical to "\
"#{expected.inspect} (#{expected.encoding.name})"
"expected\n#{actual.inspect} (#{actual.encoding.name}) to be identical to\n"\
"#{expected.inspect} (#{expected.encoding.name})\n"\
"The exact bytes are printed below for more detail:\n"\
"#{actual.bytes.to_a}\n"\
"#{expected.bytes.to_a}\n"\
end
else
match do |actual|
Expand Down Expand Up @@ -80,14 +83,8 @@ module RSpec::Support
}.to raise_error(Encoding::ConverterNotFoundError)
end

# In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
# see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
# https://www.ruby-forum.com/topic/6861247
# https://twitter.com/nalsh/status/553413844685438976
# For example, given:
# "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
# On MRI 2.1 or above: 63 # '?'
# else : 128 # "\x80"
# See comment above ENCODE_UNCONVERTABLE_BYTES in encoded_string.rb
# for why the behavior differs by (MRI) Ruby version.
if RUBY_VERSION < '2.1'
it 'does nothing' do
resulting_string = build_encoded_string(string, no_converter_encoding).to_s
Expand Down Expand Up @@ -220,6 +217,25 @@ module RSpec::Support
]
end
end

context 'when the string has an invalid byte sequence' do
  # "\255" is the octal escape for byte 0xAD, so the message carries three
  # stray bytes (0xEF, 0xAD, 0xAD) separated by spaces. The first example
  # below asserts that the resulting string is not validly encoded.
  let(:message_with_invalid_byte_sequence) do
    "\xEF \255 \xAD I have bad bytes".force_encoding(utf8_encoding)
  end

  # Baseline: a plain String with these bytes cannot be split.
  it 'normally raises an ArgumentError' do
    expect(message_with_invalid_byte_sequence).not_to be_valid_encoding
    expect { message_with_invalid_byte_sequence.split("\n") }.to raise_error(ArgumentError)
  end

  # Going through the encoded string wrapper, each bad byte becomes "?"
  # and splitting succeeds.
  it 'replaces invalid bytes with the REPLACE string' do
    encoded = build_encoded_string(message_with_invalid_byte_sequence, utf8_encoding)
    lines = encoded.split("\n")
    expect(lines).to match [a_string_identical_to("? ? ? I have bad bytes")]
  end
end
end

def build_encoded_string(string, target_encoding = string.encoding)
Expand Down

0 comments on commit 8260a59

Please sign in to comment.