Skip to content

Commit

Permalink
Clarify encoding failures and provide better replacement strings for …
Browse files Browse the repository at this point in the history
…undefined conversions
  • Loading branch information
bf4 committed Dec 26, 2014
1 parent 08416fd commit c8b0521
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 45 deletions.
28 changes: 15 additions & 13 deletions lib/rspec/support/encoded_string.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,24 @@ def to_s

private

# Exception classes that matching_encoding rescues (via a splat) when
# transcoding a string fails.
# NOTE(review): "UNCOVERTABLE" looks like a misspelling of "unconvertible";
# the name is left unchanged because the rescue clause refers to it.
UNCOVERTABLE_ENCODINGS = [
Encoding::UndefinedConversionError,
Encoding::InvalidByteSequenceError,
Encoding::CompatibilityError
]
# Supplies the text substituted for invalid byte sequences while transcoding.
# It is called with the source encoding and currently answers '?' for every
# encoding. Ruby's own default replacement for Unicode encoding forms is
# U+FFFD; to follow that default, have the block use its argument (enc), e.g.
#   Encoding.compatible?(enc, Encoding::UTF_8) && "\uFFFD" || "?"
REPLACE = proc do |_enc|
  '?'
end
# Lookup table passed to String#encode as its :fallback option; the default
# block builds a printable stand-in for a character that has no mapping.
# (see https://github.com/ruby/ruby/blob/trunk/test/ruby/test_transcode.rb)
#
# * When the key unpacks as a UTF-8 code point, the result is "U+XXXX="
#   (upper-case hex, at least four digits), e.g. U+6968 becomes "U+6968=".
# * When formatting raises ArgumentError (e.g. a malformed byte such as
#   "\xAE"), the result is a literal backslash-x followed by the hex digits
#   found in the key's #inspect output, e.g. "\\xAE".
#
# The block never stores its result in the hash, so it runs on every lookup.
UNDEF_FALLBACK = Hash.new do |_table, char|
  begin
    format("U+%.4X=", *char.unpack("U"))
  rescue ArgumentError
    '\x' << char.inspect.gsub(/[^0-9A-F]/, '')
  end
end

# Transcodes +string+ to @encoding, substituting replacement text for bytes
# that cannot be converted.
#
# NOTE(review): this span comes from a rendered diff whose +/- markers were
# stripped. The previous implementation (the UTF-16LE round trip and both
# rescue clauses) appears together with the single added line (the final
# encode call using REPLACE and UNDEF_FALLBACK). Confirm against the
# repository which lines make up the current method body.
def matching_encoding(string)
# Converting it to a wider character set (UTF-16LE) and then back to
# @encoding replaces invalid or undefined byte sequences with '?'.
string.
encode(::Encoding::UTF_16LE, :invalid => :replace, :undef => :replace, :replace => '?').
encode(@encoding)
rescue *UNCOVERTABLE_ENCODINGS
string.encode(@encoding, :invalid => :replace, :undef => :replace, :replace => '?')
rescue Encoding::ConverterNotFoundError
string.force_encoding(@encoding).encode(:invalid => :replace, :replace => '?')
# Added line: invalid bytes become REPLACE[source_encoding]; characters
# undefined in the target encoding are looked up in UNDEF_FALLBACK.
encode(@encoding, invalid: :replace, replace: REPLACE[source_encoding], fallback: UNDEF_FALLBACK)
end

def detect_source_encoding(string)
Expand Down
76 changes: 44 additions & 32 deletions spec/rspec/support/encoded_string_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,22 +42,28 @@ module RSpec::Support
end
else

it 'replaces invalid byte sequences with either a ? or a unicode ?' do
source_encoding = Encoding.find('UTF8-MAC')
incompatible_encoding = Encoding.find('IBM737')
string = "\xEF hi I am not going to work".force_encoding(source_encoding)
resulting_string = build_encoded_string(string, incompatible_encoding)

expect(resulting_string.to_s).to eq("? hi I am not going to work")
context 'Encoding::InvalidByteSequenceError: "\xEF" followed by " " on UTF8-MAC' do
it 'replaces invalid byte sequences with the REPLACE string' do
source_encoding = Encoding.find('UTF8-MAC')
incompatible_encoding = Encoding.find('IBM737')
string = "\xEF hi I am not going to work".force_encoding(source_encoding)
expect{ string.encode(incompatible_encoding) }.to raise_error(Encoding::InvalidByteSequenceError)
resulting_string = build_encoded_string(string, incompatible_encoding)

expect(resulting_string.to_s).to eq("? hi I am not going to work")
end
end

it 'replaces all characters with either a ? or a unicode ?' do
source_encoding = Encoding.find('UTF-16LE')
incompatible_encoding = Encoding.find('IBM737')
string = "\xEF hi I am not going to work".force_encoding(source_encoding)
resulting_string = build_encoded_string(string, incompatible_encoding)
context 'Encoding::UndefinedConversionError: U+20EF to IBM737 in conversion from UTF-16LE to UTF-8 to IBM737' do
it 'replaces all bytes with a unicode representation in the format e.g. U+20EF=' do
source_encoding = Encoding.find('UTF-16LE')
incompatible_encoding = Encoding.find('IBM737')
string = "hi I am not going to work".force_encoding(source_encoding)
expect{ string.encode(incompatible_encoding) }.to raise_error(Encoding::UndefinedConversionError)
resulting_string = build_encoded_string(string, incompatible_encoding)

expect(resulting_string.to_s).to eq("??????????????")
expect(resulting_string.to_s).to eq("U+6968=U+4920=U+6120=U+206D=U+6F6E=U+2074=U+6F67=U+6E69=U+2067=U+6F74=U+7720=U+726F=?")
end
end
end
end
Expand All @@ -78,15 +84,15 @@ module RSpec::Support
end

context 'with a string that cannot be converted to the target encoding' do
it 'replaces undefined characters with either a ? or a unicode ?' do
ascii_string = ascii_arrow_symbol.force_encoding("ASCII-8BIT")
valid_unicode_string = utf_8_euro_symbol.force_encoding('UTF-8')

resulting_string = build_encoded_string(valid_unicode_string, target_encoding) << ascii_string
expected_bytes = utf_8_euro_symbol.each_byte.to_a + ["?".unpack("c").first]
actual_bytes = resulting_string.each_byte.to_a

expect(actual_bytes).to eq(expected_bytes)
context 'Encoding::CompatibilityError: incompatible character encodings: UTF-8 and ASCII-8BIT' do
it 'replaces unconvertable characters with a string representation of their hex value' do
ascii_string = ascii_arrow_symbol.force_encoding("ASCII-8BIT")
valid_unicode_string = utf_8_euro_symbol.force_encoding('UTF-8')
expect{ valid_unicode_string.encode(target_encoding) << ascii_string }.to raise_error(Encoding::CompatibilityError)

resulting_string = build_encoded_string(valid_unicode_string, target_encoding) << ascii_string
expect(resulting_string).to eq("#{utf_8_euro_symbol}\\xAE")
end
end
end

Expand All @@ -96,24 +102,30 @@ module RSpec::Support
other_ascii_string = '123'.force_encoding("ASCII-8BIT")

resulting_string = build_encoded_string(ascii_string, target_encoding) << other_ascii_string
expect(resulting_string.encoding.to_s).to eq 'UTF-8'
expect(resulting_string.encoding.to_s).to eq('UTF-8')
end
end
end

describe '#split' do
it 'fails to split a string with an invalid byte sequence' do
message_with_invalid_byte_sequence = "\xEF \255 \xAD I have bad bytes".force_encoding(target_encoding)
resulting_string = build_encoded_string(message_with_invalid_byte_sequence, target_encoding)
expect(resulting_string.split("\n")).to eq(["? ? ? I have bad bytes"])
context 'ArgumentError: invalid byte sequence in UTF-8' do
it 'fails to split a string with an invalid byte sequence' do
message_with_invalid_byte_sequence = "\xEF \255 \xAD I have bad bytes".force_encoding(target_encoding)
expect{ message_with_invalid_byte_sequence.split("\n") }.to raise_error(ArgumentError)
resulting_string = build_encoded_string(message_with_invalid_byte_sequence, target_encoding)
expect(resulting_string.split("\n")).to eq(["? ? ? I have bad bytes"])
end
end

it 'splits the string based on the delimiter accounting for encoding' do
wrapped_string = "aaaaaaaaaaa#{ascii_arrow_symbol}aaaaa".force_encoding("ASCII-8BIT")
context 'Encoding::UndefinedConversionError: "\xAE" from ASCII-8BIT to UTF-8' do
it 'splits the string based on the delimiter accounting for encoding' do
wrapped_string = "aaaaaaaaaaa#{ascii_arrow_symbol}aaaaa".force_encoding("ASCII-8BIT")

expect {
build_encoded_string(wrapped_string, target_encoding).split(utf_8_euro_symbol.force_encoding("UTF-8"))
}.not_to raise_error
expect{ wrapped_string.encode(target_encoding).split(utf_8_euro_symbol) }.to raise_error(Encoding::UndefinedConversionError)
expect {
build_encoded_string(wrapped_string, target_encoding).split(utf_8_euro_symbol.force_encoding("UTF-8"))
}.not_to raise_error
end
end
end

Expand Down

0 comments on commit c8b0521

Please sign in to comment.