Skip to content

Commit

Permalink
Fix invalid byte sequence on EncodedString#split
Browse files Browse the repository at this point in the history
  • Loading branch information
bf4 committed Feb 8, 2015
1 parent b666ad5 commit e327630
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 30 deletions.
73 changes: 51 additions & 22 deletions lib/rspec/support/encoded_string.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,32 @@ module Support
# @private
class EncodedString
# Reduce allocations by storing constants.
UTF_8 = "UTF-8"
US_ASCII = 'US-ASCII'
# else: '?' 63.chr ("\x3F")
UTF_8 = "UTF-8"
US_ASCII = "US-ASCII"
#
# In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
# see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
# https://www.ruby-forum.com/topic/6861247
# https://twitter.com/nalsh/status/553413844685438976
#
# For example, given:
# "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
#
# On MRI 2.1 or above: 63 # '?'
# else : 128 # "\x80"
#
# Ruby's default replacement string is:
# U+FFFD ("\xEF\xBF\xBD"), for Unicode encoding forms, else
# ? ("\x3F")
REPLACE = "?"
ENCODE_UNCONVERTABLE_BYTES = {
:invalid => :replace,
:undef => :replace
:undef => :replace,
:replace => REPLACE
}
ENCODE_NO_CONVERTER = {
:invalid => :replace,
:replace => REPLACE
}

def initialize(string, encoding=nil)
Expand Down Expand Up @@ -64,13 +80,13 @@ def to_s
# vs "\x80".encode('UTF-8','US-ASCII', invalid: :replace, replace: '<byte>')
# # => '<byte>'
# ArgumentError
# when operating on a string with invalid bytes
# e.g."\xEF".split("\n")
# when operating on a string with invalid bytes
# e.g."\x80".split("\n")
# TypeError
# when a symbol is passed as an encoding
# Encoding.find(:"utf-8")
# when calling force_encoding on an object
# that doesn't respond to #to_str
# when a symbol is passed as an encoding
# Encoding.find(:"UTF-8")
# when calling force_encoding on an object
# that doesn't respond to #to_str
#
# Raised by transcoding methods:
# Encoding::ConverterNotFoundError:
Expand All @@ -80,25 +96,38 @@ def to_s
# e.g. "\x80".force_encoding('ASCII-8BIT').encode('Emacs-Mule')
#
# Raised by byte <-> char conversions
# RangeError: out of char range
# e.g. the UTF-16LE emoji: 128169.chr
# RangeError: out of char range
# e.g. the UTF-16LE emoji: 128169.chr
def matching_encoding(string)
string = remove_invalid_bytes(string)
string.encode(@encoding)
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
normalize_missing(string.encode(@encoding, ENCODE_UNCONVERTABLE_BYTES))
string.encode(@encoding, ENCODE_UNCONVERTABLE_BYTES)
rescue Encoding::ConverterNotFoundError
normalize_missing(string.dup.force_encoding(@encoding).encode(ENCODE_NO_CONVERTER))
string.dup.force_encoding(@encoding).encode(ENCODE_NO_CONVERTER)
end

# Ruby's default replacement string is:
# for Unicode encoding forms: U+FFFD ("\xEF\xBF\xBD")
MRI_UNICODE_UNKOWN_CHARACTER = "\xEF\xBF\xBD".force_encoding(UTF_8)

def normalize_missing(string)
if @encoding.to_s == UTF_8
string.gsub(MRI_UNICODE_UNKOWN_CHARACTER, REPLACE)
else
# Work around bad bytes with a double conversion
# Prevents raising ArgumentError
#
# Emulates Ruby 2.1 String#scrub
# see https://github.com/hsbt/string-scrub
# https://github.com/ruby/ruby/blob/eeb05e8c11/doc/NEWS-2.1.0#L120-L123
# https://speakerdeck.com/samsaffron/why-ruby-2-dot-1-excites-me?slide=48
#
# Force UTF-8 encoding,
# Converting it to a higher character set (UTF-16) and then
# back (to UTF-8) ensures that you will strip away invalid or undefined byte sequences,
# Restore original encoding
# Replaces invalid byte sequences in +string+ so that later String
# operations (e.g. String#split) do not raise ArgumentError.
#
# @param string [String] the string to scrub
# @return [String] +string+ itself when its encoding is already valid;
#   otherwise a copy, tagged with the original encoding, in which each
#   invalid byte sequence has been replaced by REPLACE
def remove_invalid_bytes(string)
  return string if string.valid_encoding?

  # Every link of this chain needs its trailing dot: without it the
  # statement ends early and the following `encode` is sent to self
  # instead of to the string being scrubbed.
  #
  # The options are written inline (no braces) so they are passed as
  # keyword arguments on every Ruby version; String#encode rejects a
  # positional options Hash on Ruby 3.0+.
  string.dup.
    force_encoding(UTF_8).
    encode(Encoding::UTF_16, UTF_8, :invalid => :replace, :replace => REPLACE).
    encode(UTF_8, Encoding::UTF_16).
    force_encoding(string.encoding)
end

Expand Down
29 changes: 21 additions & 8 deletions spec/rspec/support/encoded_string_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,8 @@ module RSpec::Support
}.to raise_error(Encoding::ConverterNotFoundError)
end

# In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
# see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
# https://www.ruby-forum.com/topic/6861247
# https://twitter.com/nalsh/status/553413844685438976
# For example, given:
# "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
# On MRI 2.1 or above: 63 # '?'
# else : 128 # "\x80"
# See comment above ENCODE_UNCONVERTABLE_BYTES in encoded_string.rb
# for why the behavior differs by (MRI) Ruby version.
if RUBY_VERSION < '2.1'
it 'does nothing' do
resulting_string = build_encoded_string(string, no_converter_encoding).to_s
Expand Down Expand Up @@ -220,6 +214,25 @@ module RSpec::Support
]
end
end

context 'when the string has an invalid byte sequence' do
let(:message_with_invalid_byte_sequence) { "\xEF \255 \xAD I have bad bytes".force_encoding(utf8_encoding) }

it 'normally raises an ArgumentError' do
expect(message_with_invalid_byte_sequence).not_to be_valid_encoding
expect {
message_with_invalid_byte_sequence.split("\n")
}.to raise_error(ArgumentError)
end

it 'replaces invalid bytes with the REPLACE string' do
resulting_array = build_encoded_string(message_with_invalid_byte_sequence, utf8_encoding).split("\n")
expected_string = "? ? ? I have bad bytes"
expect(resulting_array).to match [
a_string_identical_to(expected_string)
]
end
end
end

def build_encoded_string(string, target_encoding = string.encoding)
Expand Down

0 comments on commit e327630

Please sign in to comment.