From 054aeb06759776fdb60fa49e08ea2c235fb8c959 Mon Sep 17 00:00:00 2001 From: Scott Paul Jones Date: Wed, 22 Apr 2015 11:50:33 -0400 Subject: [PATCH] (fixes #10919) Fixes error in utf16() when handling Unicode characters >= 0x100000 --- base/utf16.jl | 6 ++++-- test/unicode.jl | 21 +++++++++++---------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/base/utf16.jl b/base/utf16.jl index 0e114955ca903..b631311ef032a 100644 --- a/base/utf16.jl +++ b/base/utf16.jl @@ -55,9 +55,11 @@ function encode16(s::AbstractString) c = reinterpret(UInt32, ch) if c < 0x10000 push!(buf, UInt16(c)) + elseif c <= 0x10ffff + push!(buf, UInt16(0xd7c0 + (c>>10))) + push!(buf, UInt16(0xdc00 + (c & 0x3ff))) else - push!(buf, UInt16(0xd7c0 + (c>>10) & 0x3ff)) - push!(buf, UInt16(0xdc00 + c & 0x3ff)) + throw(ArgumentError("invalid Unicode character (0x$(hex(c)) > 0x10ffff)")) end end push!(buf, 0) # NULL termination diff --git a/test/unicode.jl b/test/unicode.jl index c0342b46d1cac..c672e1bcb65e5 100644 --- a/test/unicode.jl +++ b/test/unicode.jl @@ -1,27 +1,28 @@ # UTF16 -u8 = "\U1d565\U1d7f6\U00066\U2008a" +u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a" u16 = utf16(u8) -@test sizeof(u16) == 14 -@test length(u16.data) == 8 && u16.data[end] == 0 -@test length(u16) == 4 +@test sizeof(u16) == 18 +@test length(u16.data) == 10 && u16.data[end] == 0 +@test length(u16) == 5 @test utf8(u16) == u8 @test collect(u8) == collect(u16) -@test u8 == utf16(u16.data[1:end-1]) == utf16(copy!(Array(UInt8, 14), 1, reinterpret(UInt8, u16.data), 1, 14)) +@test u8 == utf16(u16.data[1:end-1]) == utf16(copy!(Array(UInt8, 18), 1, reinterpret(UInt8, u16.data), 1, 18)) @test u8 == utf16(pointer(u16)) == utf16(convert(Ptr{Int16}, pointer(u16))) +@test_throws ArgumentError utf16(utf32(Char(0x120000))) # UTF32 u32 = utf32(u8) -@test sizeof(u32) == 16 -@test length(u32.data) == 5 && u32.data[end] == Char(0) -@test length(u32) == 4 +@test sizeof(u32) == 20 +@test length(u32.data) == 6 && u32.data[end] == Char(0) 
+@test length(u32) == 5 @test utf8(u32) == u8 @test collect(u8) == collect(u32) -@test u8 == utf32(u32.data[1:end-1]) == utf32(copy!(Array(UInt8, 16), 1, reinterpret(UInt8, u32.data), 1, 16)) +@test u8 == utf32(u32.data[1:end-1]) == utf32(copy!(Array(UInt8, 20), 1, reinterpret(UInt8, u32.data), 1, 20)) @test u8 == utf32(pointer(u32)) == utf32(convert(Ptr{Int32}, pointer(u32))) # Wstring w = wstring(u8) -@test length(w) == 4 && utf8(w) == u8 && collect(u8) == collect(w) +@test length(w) == 5 && utf8(w) == u8 && collect(u8) == collect(w) @test u8 == WString(w.data) if !success(`iconv --version`)