Skip to content

Commit

Permalink
Merge pull request #151 from bf4/encoding_specs
Browse files Browse the repository at this point in the history
Extracting EncodedString specs from #134
  • Loading branch information
myronmarston committed Jan 23, 2015
2 parents 68bce4d + 9eeedb9 commit eb27b60
Show file tree
Hide file tree
Showing 2 changed files with 231 additions and 32 deletions.
62 changes: 57 additions & 5 deletions lib/rspec/support/encoded_string.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,21 @@ module RSpec
module Support
# @private
class EncodedString
# Reduce allocations by storing constants.
UTF_8 = "UTF-8"
US_ASCII = 'US-ASCII'
# Ruby's default replacement string is:
# for Unicode encoding forms: U+FFFD ("\xEF\xBF\xBD")
MRI_UNICODE_UNKOWN_CHARACTER = "\xEF\xBF\xBD"
# else: '?' 63.chr ("\x3F")
REPLACE = "?"
ENCODE_UNCONVERTABLE_BYTES = {
:invalid => :replace,
:undef => :replace
}
ENCODE_NO_CONVERTER = {
:invalid => :replace,
}

def initialize(string, encoding=nil)
@encoding = encoding
Expand Down Expand Up @@ -33,17 +47,55 @@ def to_s

private

# Encoding Exceptions:
#
# Raised by Encoding and String methods:
# Encoding::UndefinedConversionError:
# when a transcoding operation fails
# if the String contains characters invalid for the target encoding
# e.g. "\x80".encode('UTF-8','ASCII-8BIT')
# vs "\x80".encode('UTF-8','ASCII-8BIT', undef: :replace, replace: '<undef>')
# # => '<undef>'
# Encoding::CompatibilityError
# when Enconding.compatbile?(str1, str2) is false
# e.g. utf_16le_emoji_string.split("\n")
# e.g. valid_unicode_string.encode(utf8_encoding) << ascii_string
# Encoding::InvalidByteSequenceError:
# when the string being transcoded contains a byte invalid for
# either the source or target encoding
# e.g. "\x80".encode('UTF-8','US-ASCII')
# vs "\x80".encode('UTF-8','US-ASCII', invalid: :replace, replace: '<byte>')
# # => '<byte>'
# ArgumentError
# when operating on a string with invalid bytes
# e.g."\xEF".split("\n")
# TypeError
# when a symbol is passed as an encoding
# Encoding.find(:"utf-8")
# when calling force_encoding on an object
# that doesn't respond to #to_str
#
# Raised by transcoding methods:
# Encoding::ConverterNotFoundError:
# when a named encoding does not correspond with a known converter
# e.g. 'abc'.force_encoding('UTF-8').encode('foo')
# or a converter path cannot be found
# e.g. "\x80".force_encoding('ASCII-8BIT').encode('Emacs-Mule')
#
# Raised by byte <-> char conversions
# RangeError: out of char range
# e.g. the UTF-16LE emoji: 128169.chr
def matching_encoding(string)
string.encode(@encoding)
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
normalize_missing(string.encode(@encoding, :invalid => :replace, :undef => :replace))
normalize_missing(string.encode(@encoding, ENCODE_UNCONVERTABLE_BYTES))
rescue Encoding::ConverterNotFoundError
normalize_missing(string.force_encoding(@encoding).encode(:invalid => :replace))
normalize_missing(string.force_encoding(@encoding).encode(ENCODE_NO_CONVERTER))
end

def normalize_missing(string)
if @encoding.to_s == "UTF-8"
string.gsub(MRI_UNICODE_UNKOWN_CHARACTER.force_encoding(@encoding), "?")
if @encoding.to_s == UTF_8
string.gsub(MRI_UNICODE_UNKOWN_CHARACTER.force_encoding(@encoding), REPLACE)
else
string
end
Expand All @@ -61,7 +113,7 @@ def matching_encoding(string)
end

def detect_source_encoding(_string)
'US-ASCII'
US_ASCII
end
end
end
Expand Down
201 changes: 174 additions & 27 deletions spec/rspec/support/encoded_string_spec.rb
Original file line number Diff line number Diff line change
@@ -1,51 +1,162 @@
# encoding: utf-8
require 'spec_helper'
require 'rspec/support/encoded_string'

# Special matcher for comparing encoded strings so that
# we don't run any expectation failures through the Differ,
# which also relies on EncodedString. Instead, confirm the
# strings have the same encoding and same bytes.
RSpec::Matchers.define :be_identical_string do |expected|

if String.method_defined?(:encoding)
match do
actual.encoding == expected.encoding &&
actual.bytes.to_a == expected.bytes.to_a
end

failure_message do
"expected #{actual.inspect} (#{actual.encoding.name}) to be identical to "\
"#{expected.inspect} (#{expected.encoding.name})"
end
else
match do |actual|
actual.split(//) == expected.split(//)
end
end
end
RSpec::Matchers.alias_matcher :a_string_identical_to, :be_identical_string

module RSpec::Support
describe EncodedString do
let(:target_encoding) { 'UTF-8' }
let(:utf8_encoding) { 'UTF-8' }

delegated_methods = String.instance_methods.map(&:to_s) & %w[eql? lines == encoding empty?]
delegated_methods.each do |delegated_method|
it "responds to #{delegated_method}" do
encoded_string = EncodedString.new("abc", target_encoding)
encoded_string = EncodedString.new("abc", utf8_encoding)
expect(encoded_string).to respond_to(delegated_method)
end
end

if String.method_defined?(:encoding)

describe '#source_encoding' do
it 'knows the original encoding of the string' do
str = EncodedString.new("abc".encode('ASCII-8BIT'), "UTF-8")
expect( str.source_encoding.to_s ).to eq('ASCII-8BIT')
expect(str.source_encoding.to_s).to eq('ASCII-8BIT')
end
end

let(:ascii_arrow_symbol) { "\xAE" }
describe '#to_s' do
context 'when encoding a string with invalid bytes in the target encoding' do
# see https://github.com/jruby/jruby/blob/c1be61a501/test/mri/ruby/test_transcode.rb#L13
let(:source_encoding) { Encoding.find('US-ASCII') }
let(:target_encoding) { Encoding.find('UTF-8') }
let(:string) { "I have a bad byté\x80".force_encoding(source_encoding) }

it 'normally raises an EncodedString::InvalidByteSequenceError' do
expect {
string.encode(target_encoding)
}.to raise_error(Encoding::InvalidByteSequenceError)
end

it 'replaces invalid byte sequences with the REPLACE string' do
resulting_string = build_encoded_string(string, target_encoding).to_s
replacement = EncodedString::REPLACE * 3
expected_string = "I have a bad byt#{replacement}".force_encoding(target_encoding)
expect(resulting_string).to be_identical_string(expected_string)
end
end

context 'when no converter is known for an encoding' do
# see https://github.com/rubyspec/rubyspec/blob/91ce9f6549/core/string/shared/encode.rb#L12
let(:source_encoding) { Encoding.find('ASCII-8BIT') }
let(:no_converter_encoding) { Encoding::Emacs_Mule }
let(:string) { "\x80".force_encoding(source_encoding) }

it 'normally raises an Encoding::ConverterNotFoundError' do
expect {
string.encode(no_converter_encoding)
}.to raise_error(Encoding::ConverterNotFoundError)
end

# In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
# see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
# https://www.ruby-forum.com/topic/6861247
# https://twitter.com/nalsh/status/553413844685438976
# For example, given:
# "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
# On MRI 2.1 or above: 63 # '?'
# else : 128 # "\x80"
if RUBY_VERSION < '2.1'
it 'does nothing' do
resulting_string = build_encoded_string(string, no_converter_encoding).to_s
expected_string = "\x80".force_encoding(no_converter_encoding)
expect(resulting_string).to be_identical_string(expected_string)
end
else
it 'forces the encoding and replaces invalid characters with the REPLACE string' do
resulting_string = build_encoded_string(string, no_converter_encoding).to_s
expected_string = EncodedString::REPLACE.force_encoding(no_converter_encoding)
expect(resulting_string).to be_identical_string(expected_string)
end
end
end

# see https://github.com/ruby/ruby/blob/34fbf57aaa/transcode.c#L4289
# ISO-8859-1 -> UTF-8 -> EUC-JP
# "\xa0" NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP
context 'when there is an undefined conversion to the target encoding' do
let(:source_encoding) { Encoding.find('ISO-8859-1') }
let(:incompatible_encoding) { Encoding.find('EUC-JP') }
let(:string) { "\xa0 hi I am not going to work".force_encoding(source_encoding) }

it 'normally raises an Encoding::UndefinedConversionError' do
expect {
string.encode(incompatible_encoding)
}.to raise_error(Encoding::UndefinedConversionError)
end

it 'replaces all undefines conversions with the REPLACE string' do
resulting_string = build_encoded_string(string, incompatible_encoding).to_s
replacement = EncodedString::REPLACE
expected_string = "#{replacement} hi I am not going to work".force_encoding('EUC-JP')
expect(resulting_string).to be_identical_string(expected_string)
end
end
end

let(:ascii_arrow_symbol) { "\xAE" }
let(:utf_8_euro_symbol) { "\xE2\x82\xAC" }

describe '#<<' do
context 'with strings that can be converted to the target encoding' do
it 'encodes and appends the string' do
valid_ascii_string = "abc".force_encoding("ASCII-8BIT")
valid_unicode_string = utf_8_euro_symbol.force_encoding('UTF-8')
let(:valid_ascii_string) { "abcde".force_encoding("ASCII-8BIT") }
let(:valid_unicode_string) { utf_8_euro_symbol.force_encoding('UTF-8') }

resulting_string = build_encoded_string(valid_unicode_string, target_encoding) << valid_ascii_string
expect(resulting_string).to eq "#{utf_8_euro_symbol}abc".force_encoding('UTF-8')
it 'encodes and appends the string' do
resulting_string = build_encoded_string(valid_unicode_string, utf8_encoding) << valid_ascii_string
expected_string = "#{utf_8_euro_symbol}abcde".force_encoding('UTF-8')
expect(resulting_string).to be_identical_string(expected_string)
end
end

context 'with a string that cannot be converted to the target encoding' do
it 'replaces undefined characters with either a ? or a unicode ?' do
ascii_string = ascii_arrow_symbol.force_encoding("ASCII-8BIT")
valid_unicode_string = utf_8_euro_symbol.force_encoding('UTF-8')

resulting_string = build_encoded_string(valid_unicode_string, target_encoding) << ascii_string
expected_bytes = utf_8_euro_symbol.each_byte.to_a + ["?".unpack("c").first]
actual_bytes = resulting_string.each_byte.to_a

expect(actual_bytes).to eq(expected_bytes)
context 'when appending a string with an incompatible character encoding' do
let(:ascii_string) { ascii_arrow_symbol.force_encoding("ASCII-8BIT") }
let(:valid_unicode_string) { utf_8_euro_symbol.force_encoding('UTF-8') }

it "normally raises an Encoding::CompatibilityError" do
expect {
valid_unicode_string.encode(utf8_encoding) << ascii_string
}.to raise_error(Encoding::CompatibilityError)
end

it 'replaces unconvertable characters with the REPLACE string' do
resulting_string = build_encoded_string(valid_unicode_string, utf8_encoding) << ascii_string
expected_string = "#{utf_8_euro_symbol}#{EncodedString::REPLACE}"
expect(resulting_string).to be_identical_string(expected_string)
end
end
end

Expand All @@ -54,31 +165,67 @@ module RSpec::Support
ascii_string = 'abc'.force_encoding("ASCII-8BIT")
other_ascii_string = '123'.force_encoding("ASCII-8BIT")

resulting_string = build_encoded_string(ascii_string, target_encoding) << other_ascii_string
expect(resulting_string.encoding.to_s).to eq 'UTF-8'
resulting_string = build_encoded_string(ascii_string, utf8_encoding) << other_ascii_string
expected_string = 'abc123'.force_encoding(utf8_encoding)
expect(resulting_string).to be_identical_string(expected_string)
end
end
end

describe '#split' do
it 'splits the string based on the delimiter accounting for encoding' do
wrapped_string = "aaaaaaaaaaa#{ascii_arrow_symbol}aaaaa".force_encoding("ASCII-8BIT")
context 'when there is an undefined conversion to the target encoding' do
let(:wrapped_string_template) { "abaaaaaaaaaa%saaaaa" }
let(:wrapped_string) { sprintf(wrapped_string_template, ascii_arrow_symbol).force_encoding("ASCII-8BIT") }

it 'normally raises an Encoding::UndefinedConversionError' do
expect {
wrapped_string.encode(utf8_encoding)
}.to raise_error(Encoding::UndefinedConversionError)
end

it 'splits the string based on the delimiter accounting for encoding' do
delimiter = "b".force_encoding(utf8_encoding)
resulting_string = build_encoded_string(wrapped_string, utf8_encoding).
split(delimiter)
exp1, exp2 = sprintf(wrapped_string_template, EncodedString::REPLACE).force_encoding(utf8_encoding).split(delimiter)
expect(resulting_string).to match [
a_string_identical_to(exp1),
a_string_identical_to(exp2)
]
end
end

expect {
build_encoded_string(wrapped_string, target_encoding).split(utf_8_euro_symbol.force_encoding("UTF-8"))
}.not_to raise_error
# see https://github.com/rspec/rspec-expectations/blob/f8a1232/spec/rspec/expectations/fail_with_spec.rb#L50
# https://github.com/rspec/rspec-expectations/issues/201
# https://github.com/rspec/rspec-expectations/pull/220
context 'with a string that cannot be converted to the target encoding' do
let(:binary_poop) {'💩' } # [128169] "\u{1F4A9}"
let(:non_ascii_compatible_string) { "This is a pile of poo: #{binary_poop}, yuck".encode("UTF-16LE") }

it 'normally raises an Encoding::CompatibilityError' do
expect {
non_ascii_compatible_string.split("\n")
}.to raise_error(Encoding::CompatibilityError)
end

it 'makes no changes to the resulting string' do
resulting_array = build_encoded_string(non_ascii_compatible_string).split("\n")
expect(resulting_array).to match [
a_string_identical_to(non_ascii_compatible_string)
]
end
end
end

def build_encoded_string(string, target_encoding)
def build_encoded_string(string, target_encoding = string.encoding)
EncodedString.new(string, target_encoding)
end
else

describe '#source_encoding' do
it 'defaults to US-ASCII' do
str = EncodedString.new("abc", "UTF-8")
expect( str.source_encoding ).to eq('US-ASCII')
expect(str.source_encoding).to eq('US-ASCII')
end
end
end
Expand Down

0 comments on commit eb27b60

Please sign in to comment.