From ecce488daed676351b561e39a2dae1147983c939 Mon Sep 17 00:00:00 2001 From: Andi Bachmann Date: Mon, 14 Dec 2015 14:21:20 +0100 Subject: [PATCH] . adds correct UTF-8 encoding --- lib/net/ber.rb | 38 ++++++++++++++++++++++++++++++++++---- test/ber/test_ber.rb | 10 +++++++++- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/lib/net/ber.rb b/lib/net/ber.rb index b4b9e9da..498b8aaf 100644 --- a/lib/net/ber.rb +++ b/lib/net/ber.rb @@ -293,13 +293,43 @@ def to_arr ## # A String object with a BER identifier attached. +# class Net::BER::BerIdentifiedString < String attr_accessor :ber_identifier + + # The binary data provided when parsing the result of the LDAP search + # has the encoding 'ASCII-8BIT' (which is basically 'BINARY', or 'unknown'). + # + # This is a kind of backtrace showing how the binary `data` comes to + # BerIdentifiedString.new(data): + # + # @conn.read_ber(syntax) + # -> StringIO.new(self).read_ber(syntax), i.e. included from module + # -> Net::BER::BERParser.read_ber(syntax) + # -> (private)Net::BER::BERParser.parse_ber_object(syntax, id, data) + # + # In the `#parse_ber_object` method `data`, according to its OID, is being + # 'cast' to one of the Net::BER::BerIdentifiedXXX classes. + # + # As we are using LDAP v3 we can safely assume that the data is encoded + # in UTF-8 and therefore the only thing to be done when instantiating is to + # switch the encoding from 'ASCII-8BIT' to 'UTF-8'. + # + # Unfortunately, there are some ActiveDirectory specific attributes + # (like `objectguid`) that should remain binary (do they really?). + # Using `#valid_encoding?` we can trap these cases. Special cases like + # Japanese, Korean, etc. encodings might also profit from this. However + # I have no clue how these encodings function. def initialize args - super begin - args.respond_to?(:encode) ? 
args.encode('UTF-8') : args - rescue - args + super + # + # Check the encoding of the newly created String and set the encoding + # to 'UTF-8' (NOTE: we do NOT change the bytes, but only set the + # encoding to 'UTF-8'). + current_encoding = encoding + if current_encoding == Encoding::BINARY + force_encoding('UTF-8') + force_encoding(current_encoding) unless valid_encoding? end end end diff --git a/test/ber/test_ber.rb b/test/ber/test_ber.rb index 92b3902d..ae17ddd1 100644 --- a/test/ber/test_ber.rb +++ b/test/ber/test_ber.rb @@ -130,12 +130,20 @@ def test_binary_data def test_ascii_data_in_utf8 data = "some text".force_encoding("UTF-8") bis = Net::BER::BerIdentifiedString.new(data) + + assert bis.valid_encoding?, "should be a valid encoding" + assert_equal "UTF-8", bis.encoding.name + end + + def test_umlaut_data_in_utf8 + data = "Müller".force_encoding("UTF-8") + bis = Net::BER::BerIdentifiedString.new(data) assert bis.valid_encoding?, "should be a valid encoding" assert_equal "UTF-8", bis.encoding.name end - def test_ut8_data_in_utf8 + def test_utf8_data_in_utf8 data = ["e4b8ad"].pack("H*").force_encoding("UTF-8") bis = Net::BER::BerIdentifiedString.new(data)