Skip to content

Commit

Permalink
Fix invalid byte sequence on EncodedString#split
Browse files Browse the repository at this point in the history
  • Loading branch information
bf4 committed Jan 6, 2015
1 parent 2615f3d commit 5c54a6e
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 13 deletions.
40 changes: 28 additions & 12 deletions lib/rspec/support/encoded_string.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ module Support
class EncodedString
# Ruby's default replacement string is U+FFFD ("\xEF\xBF\xBD") for Unicode encoding forms;
# otherwise it is '?' ("\x3F")
MRI_UNICODE_UNKOWN_CHARACTER = "\xEF\xBF\xBD"
REPLACE = "\x3F"

def initialize(string, encoding=nil)
Expand Down Expand Up @@ -36,6 +35,24 @@ def to_s

private

# Option hashes handed to String#encode by matching_encoding, keyed by the
# situation being handled. Every strategy substitutes REPLACE for whatever
# it replaces:
#   :bad_bytes      - used for the initial UTF-16LE transcode; replaces
#                     invalid byte sequences (:invalid => :replace)
#   :cannot_convert - used after a failed conversion to @encoding; replaces
#                     characters undefined in the target (:undef => :replace)
#   :no_converter   - used after force_encoding when no converter exists;
#                     replaces invalid byte sequences (:invalid => :replace)
# NOTE(review): the commented-out entries appear to record the option that is
# intentionally left unset in each strategy - confirm with the author.
ENCODING_STRATEGY = {
:bad_bytes => {
:invalid => :replace,
# :undef => :nil,
:replace => REPLACE
},
:cannot_convert => {
# :invalid => :nil,
:undef => :replace,
:replace => REPLACE
},
:no_converter => {
:invalid => :replace,
# :undef => :nil,
:replace => REPLACE
}
}

# Raised by Encoding and String methods:
# Encoding::UndefinedConversionError:
# when a transcoding operation fails
Expand All @@ -51,20 +68,19 @@ def to_s
# Encoding::CompatibilityError
#
def matching_encoding(string)
string.encode(@encoding)
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
normalize_missing(string.encode(@encoding, :invalid => :replace, :undef => :replace))
# Converting it to a higher character set (UTF-16) and then back (to the target @encoding)
# ensures that we strip away invalid or undefined byte sequences
# => no need to rescue Encoding::InvalidByteSequenceError, ArgumentError
string.encode(::Encoding::UTF_16LE, ENCODING_STRATEGY[:bad_bytes]).
encode(@encoding)
rescue Encoding::UndefinedConversionError, Encoding::CompatibilityError
string.encode(@encoding, ENCODING_STRATEGY[:cannot_convert])
# Begin: Needed for 1.9.2
rescue Encoding::ConverterNotFoundError
normalize_missing(string.force_encoding(@encoding).encode(:invalid => :replace))
string.force_encoding(@encoding).encode(ENCODING_STRATEGY[:no_converter])
end
# End: Needed for 1.9.2

def normalize_missing(string)
if @encoding.to_s == "UTF-8"
string.gsub(MRI_UNICODE_UNKOWN_CHARACTER.force_encoding(@encoding), REPLACE)
else
string
end
end

def detect_source_encoding(string)
string.encoding
Expand Down
1 change: 0 additions & 1 deletion spec/rspec/support/encoded_string_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,6 @@ module RSpec::Support
end

it 'replaces invalid bytes with the REPLACE string' do
pending 'but is currently failing'
resulting_array = build_encoded_string(message_with_invalid_byte_sequence, utf8_encoding).split("\n")
expected_array = ["? ? ? I have bad bytes"]
expect(resulting_array).to eq(expected_array)
Expand Down

0 comments on commit 5c54a6e

Please sign in to comment.