Use explicit UTF-8 <-> UTF-16 transcoding helpers for Windows API calls

Summary of the changes carried by this patch:

  * base/c.jl: add utf8to16(::Vector{UInt8}) and utf16to8(::Vector{UInt16}),
    plus a Windows-only cwstring(s) that throws ArgumentError on embedded NULs
    and returns the UTF-16 code units of s with a trailing 0 appended. The
    embedded-NUL error message of unsafe_convert(Cstring, ...) is shortened.
  * base/env.jl, base/file.jl, base/interactiveutil.jl, base/libc.jl,
    base/path.jl: pass cwstring(...) buffers to the W-suffixed Windows APIs as
    Ptr{UInt16} instead of Cwstring / utf16(...), and decode results with
    UTF8String(utf16to8(buf)) on buffers sized without the trailing NUL.
    The temp-file prefix passed to GetTempFileNameW changes from "jul" to "jl_".
  * base/filesystem.jl: import utf16to8, utf8to16 and (Windows only) cwstring.
  * test/misc.jl: add round-trip tests for valid, overlong and invalid
    UTF-8 / UTF-16 sequences.

  NOTE(review): this patch was recovered from a copy whose newlines and
  indentation had been collapsed. Line breaks and blank lines were restored so
  that every hunk matches the line counts in its @@ header; indentation depth,
  runs of spaces inside a line, and the exact position of blank context lines
  are a best-effort reconstruction. If a context line fails to match, apply
  with `git apply --ignore-whitespace` and confirm against the target tree.

diff --git a/base/c.jl b/base/c.jl
index 061599a11b2ce..f30fc62434a42 100644
--- a/base/c.jl
+++ b/base/c.jl
@@ -84,7 +84,7 @@ containsnul(p::Ptr, len) = C_NULL != ccall(:memchr, Ptr{Cchar}, (Ptr{Cchar}, Cin
 function unsafe_convert(::Type{Cstring}, s::ByteString)
     p = unsafe_convert(Ptr{Cchar}, s)
     if containsnul(p, sizeof(s))
-        throw(ArgumentError("embedded NUL chars are not allowed in C strings: $(repr(s))"))
+        throw(ArgumentError("embedded NULs are not allowed in C strings: $(repr(s))"))
     end
     return Cstring(p)
 end
@@ -94,6 +94,105 @@ convert(::Type{Cstring}, s::Symbol) = Cstring(unsafe_convert(Ptr{Cchar}, s))
 
 # in string.jl: unsafe_convert(::Type{Cwstring}, s::WString)
 
+# FIXME: this should be handled by implicit conversion to Cwstring, but good luck with that
+@windows_only function cwstring(s::AbstractString)
+    bytes = bytestring(s).data
+    0 in bytes && throw(ArgumentError("embedded NULs are not allowed in C strings: $(repr(s))"))
+    return push!(utf8to16(bytes), 0)
+end
+
+# conversions between UTF-8 and UTF-16 for Windows APIs
+
+function utf8to16(src::Vector{UInt8})
+    dst = UInt16[]
+    i, n = 1, length(src)
+    n > 0 || return dst
+    sizehint!(dst, 2n)
+    a = src[1]
+    while true
+        if i < n && -64 <= a % Int8 <= -12 # multi-byte character
+            b = src[i += 1]
+            if -64 <= (b % Int8) || a == 0xf4 && 0x8f < b
+                # invalid UTF-8 (non-continuation or too-high code point)
+                push!(dst, a)
+                a = b; continue
+            elseif a < 0xe0 # 2-byte UTF-8
+                push!(dst, 0x3080 $ (UInt16(a) << 6) $ b)
+            elseif i < n # 3/4-byte character
+                c = src[i += 1]
+                if -64 <= (c % Int8) # invalid UTF-8 (non-continuation)
+                    push!(dst, a, b)
+                    a = c; continue
+                elseif a < 0xf0 # 3-byte UTF-8
+                    push!(dst, 0x2080 $ (UInt16(a) << 12) $ (UInt16(b) << 6) $ c)
+                elseif i < n
+                    d = src[i += 1]
+                    if -64 <= (d % Int8) # invalid UTF-8 (non-continuation)
+                        push!(dst, a, b, c)
+                        a = d; continue
+                    elseif a == 0xf0 && b < 0x90 # overlong encoding
+                        push!(dst, 0x2080 $ (UInt16(b) << 12) $ (UInt16(c) << 6) $ d)
+                    else # 4-byte UTF-8
+                        push!(dst, 0xe5b8 + (UInt16(a) << 8) + (UInt16(b) << 2) + (c >> 4),
+                                   0xdc80 $ (UInt16(c & 0xf) << 6) $ d)
+                    end
+                else # too short
+                    push!(dst, a, b, c)
+                    break
+                end
+            else # too short
+                push!(dst, a, b)
+                break
+            end
+        else # ASCII or invalid UTF-8 (continuation byte or too-high code point)
+            push!(dst, a)
+        end
+        i < n || break
+        a = src[i += 1]
+    end
+    return dst
+end
+
+function utf16to8(src::Vector{UInt16})
+    dst = UInt8[]
+    i, n = 1, length(src)
+    n > 0 || return dst
+    sizehint!(dst, n)
+    a = src[1]
+    while true
+        if a < 0x80 # ASCII
+            push!(dst, a % UInt8)
+        elseif a < 0x800 # 2-byte UTF-8
+            push!(dst, 0xc0 | ((a >> 6) % UInt8),
+                       0x80 | ((a % UInt8) & 0x3f))
+        elseif a & 0xfc00 == 0xd800 && i < n
+            b = src[i += 1]
+            if (b & 0xfc00) == 0xdc00
+                # 2-unit UTF-16 sequence => 4-byte UTF-8
+                a += 0x2840
+                push!(dst, 0xf0 | ((a >> 8) % UInt8),
+                           0x80 | ((a % UInt8) >> 2),
+                           0xf0 $ ((((a % UInt8) << 4) & 0x3f) $ (b >> 6) % UInt8),
+                           0x80 | ((b % UInt8) & 0x3f))
+            else
+                push!(dst, 0xe0 | ((a >> 12) % UInt8),
+                           0x80 | (((a >> 6) % UInt8) & 0x3f),
+                           0x80 | ((a % UInt8) & 0x3f))
+                a = b; continue
+            end
+        else
+            # 1-unit high UTF-16 or unpaired high surrogate
+            # either way, encode as 3-byte UTF-8 code point
+            push!(dst, 0xe0 | ((a >> 12) % UInt8),
+                       0x80 | (((a >> 6) % UInt8) & 0x3f),
+                       0x80 | ((a % UInt8) & 0x3f))
+        end
+        i < n || break
+        a = src[i += 1]
+    end
+    return dst
+end
+
 # deferring (or un-deferring) ctrl-c handler for external C code that
 # is not interrupt safe (see also issue #2622). The sigatomic_begin/end
 # functions should always be called in matched pairs, ideally via:
diff --git a/base/env.jl b/base/env.jl
index 0f47a04202e78..876e32b807b41 100644
--- a/base/env.jl
+++ b/base/env.jl
@@ -26,33 +26,37 @@ end # @unix_only
 
 const ERROR_ENVVAR_NOT_FOUND = UInt32(203)
 
-_getenvlen(var::AbstractString) = ccall(:GetEnvironmentVariableW,stdcall,UInt32,(Cwstring,Ptr{UInt8},UInt32),var,C_NULL,0)
-_hasenv(s::AbstractString) = _getenvlen(s)!=0 || Libc.GetLastError()!=ERROR_ENVVAR_NOT_FOUND
+_getenvlen(var::Vector{UInt16}) = ccall(:GetEnvironmentVariableW,stdcall,UInt32,(Ptr{UInt16},Ptr{UInt16},UInt32),var,C_NULL,0)
+_hasenv(s::Vector{UInt16}) = _getenvlen(s) != 0 || Libc.GetLastError() != ERROR_ENVVAR_NOT_FOUND
+_hasenv(s::AbstractString) = _hasenv(cwstring(s))
 
 function access_env(onError::Function, str::AbstractString)
-    var = utf16(str)
+    var = cwstring(str)
     len = _getenvlen(var)
     if len == 0
         return Libc.GetLastError() != ERROR_ENVVAR_NOT_FOUND ? utf8("") : onError(str)
     end
     val = zeros(UInt16,len)
-    ret = ccall(:GetEnvironmentVariableW,stdcall,UInt32,(Cwstring,Ptr{UInt16},UInt32),var,val,len)
+    ret = ccall(:GetEnvironmentVariableW,stdcall,UInt32,(Ptr{UInt16},Ptr{UInt16},UInt32),var,val,len)
     if (ret == 0 && len != 1) || ret != len-1 || val[end] != 0
         error(string("getenv: ", str, ' ', len, "-1 != ", ret, ": ", Libc.FormatMessage()))
     end
-    return utf8(UTF16String(val))
+    pop!(val) # NUL
+    return UTF8String(utf16to8(val))
 end
 
-function _setenv(var::AbstractString, val::AbstractString, overwrite::Bool=true)
-    var = utf16(var)
+function _setenv(svar::AbstractString, sval::AbstractString, overwrite::Bool=true)
+    var = cwstring(svar)
+    val = cwstring(sval)
     if overwrite || !_hasenv(var)
-        ret = ccall(:SetEnvironmentVariableW,stdcall,Int32,(Cwstring,Cwstring),var,val)
+        ret = ccall(:SetEnvironmentVariableW,stdcall,Int32,(Ptr{UInt16},Ptr{UInt16}),var,val)
         systemerror(:setenv, ret == 0)
     end
 end
 
-function _unsetenv(var::AbstractString)
-    ret = ccall(:SetEnvironmentVariableW,stdcall,Int32,(Cwstring,Ptr{UInt16}),var,C_NULL)
+function _unsetenv(svar::AbstractString)
+    var = cwstring(svar)
+    ret = ccall(:SetEnvironmentVariableW,stdcall,Int32,(Ptr{UInt16},Ptr{UInt16}),var,C_NULL)
     systemerror(:setenv, ret == 0)
 end
 
@@ -105,10 +109,10 @@ end
 function next(hash::EnvHash, block::Tuple{Ptr{UInt16},Ptr{UInt16}})
     pos = block[1]
     blk = block[2]
-    len = ccall(:wcslen, UInt, (Ptr{UInt16},), pos)+1
+    len = ccall(:wcslen, UInt, (Ptr{UInt16},), pos)
     buf = Array(UInt16, len)
     unsafe_copy!(pointer(buf), pos, len)
-    env = utf8(UTF16String(buf))
+    env = UTF8String(utf16to8(buf))
     m = match(r"^(=?[^=]+)=(.*)$"s, env)
     if m === nothing
         error("malformed environment entry: $env")
diff --git a/base/file.jl b/base/file.jl
index 8ca14f1807620..87df9b99e64f0 100644
--- a/base/file.jl
+++ b/base/file.jl
@@ -218,19 +218,21 @@ function tempdir()
     if lentemppath >= length(temppath) || lentemppath == 0
         error("GetTempPath failed: $(Libc.FormatMessage())")
     end
-    resize!(temppath,lentemppath+1)
-    return utf8(UTF16String(temppath))
+    resize!(temppath,lentemppath)
+    return UTF8String(utf16to8(temppath))
 end
 tempname(uunique::UInt32=UInt32(0)) = tempname(tempdir(), uunique)
+const temp_prefix = cwstring("jl_")
 function tempname(temppath::AbstractString,uunique::UInt32)
+    tempp = cwstring(temppath)
     tname = Array(UInt16,32767)
-    uunique = ccall(:GetTempFileNameW,stdcall,UInt32,(Cwstring,Ptr{UInt16},UInt32,Ptr{UInt16}), temppath,utf16("jul"),uunique,tname)
+    uunique = ccall(:GetTempFileNameW,stdcall,UInt32,(Ptr{UInt16},Ptr{UInt16},UInt32,Ptr{UInt16}), tempp,temp_prefix,uunique,tname)
     lentname = findfirst(tname,0)-1
     if uunique == 0 || lentname <= 0
         error("GetTempFileName failed: $(Libc.FormatMessage())")
     end
-    resize!(tname,lentname+1)
-    return utf8(UTF16String(tname))
+    resize!(tname,lentname)
+    return UTF8String(utf16to8(tname))
 end
 function mktemp(parent=tempdir())
     filename = tempname(parent, UInt32(0))
@@ -243,7 +245,7 @@ function mktempdir(parent=tempdir())
             seed += 1
         end
         filename = tempname(parent, seed)
-        ret = ccall(:_wmkdir, Int32, (Ptr{UInt16},), utf16(filename))
+        ret = ccall(:_wmkdir, Int32, (Ptr{UInt16},), cwstring(filename))
         if ret == 0
             return filename
         end
diff --git a/base/filesystem.jl b/base/filesystem.jl
index 94a4d4e22815e..50755ceef8cf1 100644
--- a/base/filesystem.jl
+++ b/base/filesystem.jl
@@ -38,10 +38,13 @@ export File,
        S_IRGRP, S_IWGRP, S_IXGRP, S_IRWXG,
        S_IROTH, S_IWOTH, S_IXOTH, S_IRWXO
 
-import Base: uvtype, uvhandle, eventloop, fd, position, stat, close,
-             write, read, unsafe_write, unsafe_read, readavailable, read!,
-             isopen, show, seek, seekend, skip, eof, nb_available,
-             check_open, _sizeof_uv_fs, uv_error, UVError
+import Base:
+    UVError, _sizeof_uv_fs, check_open, close, eof, eventloop, fd, isopen,
+    nb_available, position, read, read!, readavailable, seek, seekend, show,
+    skip, stat, unsafe_read, unsafe_write, utf16to8, utf8to16, uv_error,
+    uvhandle, uvtype, write
+
+@windows_only import Base: cwstring
 
 include("path.jl")
 include("stat.jl")
diff --git a/base/interactiveutil.jl b/base/interactiveutil.jl
index e29176063d326..709230a88ed8e 100644
--- a/base/interactiveutil.jl
+++ b/base/interactiveutil.jl
@@ -131,13 +131,13 @@ end
         end
         systemerror(:OpenClipboard, 0==ccall((:OpenClipboard, "user32"), stdcall, Cint, (Ptr{Void},), C_NULL))
         systemerror(:EmptyClipboard, 0==ccall((:EmptyClipboard, "user32"), stdcall, Cint, ()))
-        x_u16 = utf16(x)
+        x_u16 = cwstring(x)
         # copy data to locked, allocated space
-        p = ccall((:GlobalAlloc, "kernel32"), stdcall, Ptr{UInt16}, (UInt16, Int32), 2, sizeof(x_u16)+2)
+        p = ccall((:GlobalAlloc, "kernel32"), stdcall, Ptr{UInt16}, (UInt16, Int32), 2, sizeof(x_u16))
         systemerror(:GlobalAlloc, p==C_NULL)
         plock = ccall((:GlobalLock, "kernel32"), stdcall, Ptr{UInt16}, (Ptr{UInt16},), p)
         systemerror(:GlobalLock, plock==C_NULL)
-        ccall(:memcpy, Ptr{UInt16}, (Ptr{UInt16},Ptr{UInt16},Int), plock, x_u16, sizeof(x_u16)+2)
+        ccall(:memcpy, Ptr{UInt16}, (Ptr{UInt16},Ptr{UInt16},Int), plock, x_u16, sizeof(x_u16))
         systemerror(:GlobalUnlock, 0==ccall((:GlobalUnlock, "kernel32"), stdcall, Cint, (Ptr{Void},), plock))
         pdata = ccall((:SetClipboardData, "user32"), stdcall, Ptr{UInt16}, (UInt32, Ptr{UInt16}), 13, p)
         systemerror(:SetClipboardData, pdata!=p)
@@ -152,7 +152,9 @@ end
         systemerror(:CloseClipboard, 0==ccall((:CloseClipboard, "user32"), stdcall, Cint, ()))
         plock = ccall((:GlobalLock, "kernel32"), stdcall, Ptr{UInt16}, (Ptr{UInt16},), pdata)
         systemerror(:GlobalLock, plock==C_NULL)
-        s = utf8(utf16(plock))
+        len = 0
+        while unsafe_load(plock, len+1) != 0; len += 1; end
+        s = UTF8String(utf16to8(pointer_to_array(plock, len)))
         systemerror(:GlobalUnlock, 0==ccall((:GlobalUnlock, "kernel32"), stdcall, Cint, (Ptr{UInt16},), plock))
         return s
     end
diff --git a/base/libc.jl b/base/libc.jl
index ae55eb50ad2aa..7e0b0bef582c4 100644
--- a/base/libc.jl
+++ b/base/libc.jl
@@ -6,6 +6,8 @@ export FILE, TmStruct, strftime, strptime, getpid, gethostname, free, malloc, ca
     errno, strerror, flush_cstdio, systemsleep, time
 @windows_only export GetLastError, FormatMessage
 
+import Base: utf16to8
+
 include(string(length(Core.ARGS)>=2?Core.ARGS[2]:"","errno_h.jl")) # include($BUILDROOT/base/errno_h.jl)
 
 ## RawFD ##
@@ -258,11 +260,10 @@ function FormatMessage end
                     C_NULL, e, 0, lpMsgBuf, 0, C_NULL)
         p = lpMsgBuf[1]
         len == 0 && return utf8("")
-        len = len + 1
         buf = Array(UInt16, len)
         unsafe_copy!(pointer(buf), p, len)
         ccall(:LocalFree,stdcall,Ptr{Void},(Ptr{Void},),p)
-        return utf8(UTF16String(buf))
+        return UTF8String(utf16to8(buf))
     end
 end
 
diff --git a/base/path.jl b/base/path.jl
index 6f191a0cd67c5..4cd8ebfa3c338 100644
--- a/base/path.jl
+++ b/base/path.jl
@@ -124,36 +124,31 @@ normpath(a::AbstractString, b::AbstractString...) = normpath(joinpath(a,b...))
 abspath(a::AbstractString) = normpath(isabspath(a) ? a : joinpath(pwd(),a))
 abspath(a::AbstractString, b::AbstractString...) = abspath(joinpath(a,b...))
 
-@windows_only realpath(path::AbstractString) = realpath(utf16(path))
-@windows_only function realpath(path::UTF16String)
-    p::UInt32 = sizeof(path)>>1
+@windows_only function realpath(path::AbstractString)
+    path = cwstring(path)
+    buf = zeros(UInt16, length(path))
     while true
-        buf = zeros(UInt16, p + 1)
-        p = ccall((:GetFullPathNameW, "kernel32"), stdcall,
-            UInt32, (Cwstring, UInt32, Ptr{UInt16}, Ptr{Void}),
+        n = ccall((:GetFullPathNameW, "kernel32"), stdcall,
+            UInt32, (Ptr{UInt16}, UInt32, Ptr{UInt16}, Ptr{Void}),
             path, length(buf), buf, C_NULL)
-        systemerror(:realpath, p == 0)
-        if (p < length(buf))
-            resize!(buf, p + 1)
-            return utf8(UTF16String(buf))
-        end
+        systemerror(:realpath, n == 0)
+        x = n < length(buf) # is the buffer big enough?
+        resize!(buf, n) # shrink if x, grow if !x
+        x && return UTF8String(utf16to8(buf))
     end
 end
 
-@windows_only longpath(path::AbstractString) = longpath(utf16(path))
-@windows_only function longpath(path::UTF16String)
-    p::UInt32 = sizeof(path)>>1
+@windows_only function longpath(path::AbstractString)
+    path = cwstring(path)
+    buf = zeros(UInt16, length(path))
     while true
-        buf = zeros(UInt16, p + 1)
-        p = ccall((:GetLongPathNameW, "kernel32"), stdcall, UInt32,
-            (Cwstring, Ptr{UInt16}, UInt32),
+        n = ccall((:GetLongPathNameW, "kernel32"), stdcall,
+            UInt32, (Ptr{UInt16}, Ptr{UInt16}, UInt32),
             path, buf, length(buf))
-        systemerror(:longpath, p == 0)
-        # Buffer wasn't big enough, in which case `p` is the necessary buffer size
-        if (p < length(buf))
-            resize!(buf, p + 1)
-            return utf8(UTF16String(buf))
-        end
+        systemerror(:longpath, n == 0)
+        x = n < length(buf) # is the buffer big enough?
+        resize!(buf, n) # shrink if x, grow if !x
+        x && return UTF8String(utf16to8(buf))
     end
 end
 
diff --git a/test/misc.jl b/test/misc.jl
index e8f55e8f78ffa..260f7d19ac1e9 100644
--- a/test/misc.jl
+++ b/test/misc.jl
@@ -210,3 +210,170 @@ end
 whos(IOBuffer(), Tmp14173) # warm up
 @test @allocated(whos(IOBuffer(), Tmp14173)) < 10000
 
+## test conversion from UTF-8 to UTF-16 (for Windows APIs)
+import Base: utf8to16, utf16to8
+
+# empty arrays
+@test utf8to16(UInt8[]) == UInt16[]
+@test utf16to8(UInt16[]) == UInt8[]
+
+# UTF-8-like sequences
+V8 = [
+    # 1-byte (ASCII)
+    ([0x00],[0x0000])
+    ([0x0a],[0x000a])
+    ([0x7f],[0x007f])
+    # 2-byte
+    ([0xc0,0x80],[0x0000]) # overlong encoding
+    ([0xc1,0xbf],[0x007f]) # overlong encoding
+    ([0xc2,0x80],[0x0080])
+    ([0xc3,0xbf],[0x00ff])
+    ([0xc4,0x80],[0x0100])
+    ([0xc4,0xa3],[0x0123])
+    ([0xdf,0xbf],[0x07ff])
+    # 3-byte
+    ([0xe0,0x80,0x80],[0x0000]) # overlong encoding
+    ([0xe0,0x81,0xbf],[0x007f]) # overlong encoding
+    ([0xe0,0x82,0x80],[0x0080]) # overlong encoding
+    ([0xe0,0x9f,0xbf],[0x07ff]) # overlong encoding
+    ([0xe0,0xa0,0x80],[0x0800])
+    ([0xe0,0xa2,0x9a],[0x089a])
+    ([0xe1,0x88,0xb4],[0x1234])
+    ([0xea,0xaf,0x8d],[0xabcd])
+    ([0xed,0x9f,0xbf],[0xd7ff])
+    ([0xed,0xa0,0x80],[0xd800]) # invalid code point – high surrogate
+    ([0xed,0xaf,0xbf],[0xdbff]) # invalid code point – high surrogate
+    ([0xed,0xb0,0x80],[0xdc00]) # invalid code point – low surrogate
+    ([0xed,0xbf,0xbf],[0xdfff]) # invalid code point – low surrogate
+    ([0xee,0x80,0x80],[0xe000])
+    ([0xef,0xbf,0xbf],[0xffff])
+    # 4-byte
+    ([0xf0,0x80,0x80,0x80],[0x0000]) # overlong encoding
+    ([0xf0,0x80,0x81,0xbf],[0x007f]) # overlong encoding
+    ([0xf0,0x80,0x82,0x80],[0x0080]) # overlong encoding
+    ([0xf0,0x80,0x9f,0xbf],[0x07ff]) # overlong encoding
+    ([0xf0,0x80,0xa0,0x80],[0x0800]) # overlong encoding
+    ([0xf0,0x8f,0xbf,0xbf],[0xffff]) # overlong encoding
+    ([0xf0,0x90,0x80,0x80],[0xd800,0xdc00]) # U+10000
+    ([0xf0,0x90,0x8d,0x88],[0xd800,0xdf48]) # U+10348
+    ([0xf0,0x90,0x90,0xb7],[0xd801,0xdc37]) # U+10437
+    ([0xf0,0xa4,0xad,0xa2],[0xd852,0xdf62]) # U+24b62
+    ([0xf2,0xab,0xb3,0x9e],[0xda6f,0xdcde]) # U+abcde
+    ([0xf3,0xbf,0xbf,0xbf],[0xdbbf,0xdfff]) # U+fffff
+    ([0xf4,0x80,0x80,0x80],[0xdbc0,0xdc00]) # U+100000
+    ([0xf4,0x8a,0xaf,0x8d],[0xdbea,0xdfcd]) # U+10abcd
+    ([0xf4,0x8f,0xbf,0xbf],[0xdbff,0xdfff]) # U+10ffff
+]
+
+# non UTF-8-like sequences
+X8 = Vector{UInt8}[
+    # invalid 1-byte sequences
+    [0x80], # 1 leading ones
+    [0xbf],
+    [0xc0], # 2 leading ones
+    [0xdf],
+    [0xe0], # 3 leading ones
+    [0xef],
+    [0xf0], # 4 leading ones
+    [0xf7],
+    [0xf8], # 5 leading ones
+    [0xfb],
+    [0xfc], # 6 leading ones
+    [0xfd],
+    [0xfe], # 7 leading ones
+    [0xff], # 8 leading ones
+    # other invalid sequences
+    [0xf4,0x90,0xbf,0xbf],
+    [0xf4,0x91,0x80,0x80],
+    [0xf7,0x80,0x80,0x80],
+    [0xf7,0xbf,0xbf,0xbf],
+    [0xf8,0x80,0x80,0x80],
+    [0xf8,0xbf,0xbf,0xbf],
+    [0xff,0x80,0x80,0x80],
+    [0xff,0xbf,0xbf,0xbf],
+]
+
+for s in [map(first,V8); X8],
+    i = 1:length(s)-1,
+    j = i+1:length(s)-(i==1)
+    ss = s[i:j]
+    ss in X8 || push!(X8, ss)
+end
+sort!(X8, lt=lexless)
+sort!(X8, by=length)
+
+I8 = [(s,map(UInt16,s)) for s in X8]
+
+for (X,Y,Z) in ((V8,V8,V8), (I8,V8,I8), (V8,I8,V8), (V8,V8,I8), (I8,V8,V8))
+    for (a8, a16) in X
+        @test utf8to16(a8) == a16
+        for (b8, b16) in Y
+            ab8 = [a8; b8]
+            ab16 = [a16; b16]
+            @test utf8to16(ab8) == ab16
+            for (c8, c16) in Z
+                abc8 = [ab8; c8]
+                abc16 = [ab16; c16]
+                @test utf8to16(abc8) == abc16
+            end
+        end
+    end
+end
+
+# UTF-16-like sequences
+V16 = [
+    # 1-unit UTF-16, 1-byte UTF-8 (ASCII)
+    ([0x0000],[0x00])
+    ([0x000a],[0x0a])
+    ([0x007f],[0x7f])
+    # 1-unit UTF-16, 2-byte UTF-8
+    ([0x0080],[0xc2,0x80])
+    ([0x00ff],[0xc3,0xbf])
+    ([0x0100],[0xc4,0x80])
+    ([0x0123],[0xc4,0xa3])
+    ([0x07ff],[0xdf,0xbf])
+    # 1-unit UTF-16, 3-byte UTF-8
+    ([0x0800],[0xe0,0xa0,0x80])
+    ([0x089a],[0xe0,0xa2,0x9a])
+    ([0x1234],[0xe1,0x88,0xb4])
+    ([0xabcd],[0xea,0xaf,0x8d])
+    ([0xd7ff],[0xed,0x9f,0xbf])
+    ([0xe000],[0xee,0x80,0x80])
+    ([0xffff],[0xef,0xbf,0xbf])
+    # 2-unit UTF-16, 4-byte UTF-8
+    ([0xd800,0xdc00],[0xf0,0x90,0x80,0x80]) # U+10000
+    ([0xd800,0xdf48],[0xf0,0x90,0x8d,0x88]) # U+10348
+    ([0xd801,0xdc37],[0xf0,0x90,0x90,0xb7]) # U+10437
+    ([0xd852,0xdf62],[0xf0,0xa4,0xad,0xa2]) # U+24b62
+    ([0xda6f,0xdcde],[0xf2,0xab,0xb3,0x9e]) # U+abcde
+    ([0xdbbf,0xdfff],[0xf3,0xbf,0xbf,0xbf]) # U+fffff
+    ([0xdbc0,0xdc00],[0xf4,0x80,0x80,0x80]) # U+100000
+    ([0xdbea,0xdfcd],[0xf4,0x8a,0xaf,0x8d]) # U+10abcd
+    ([0xdbff,0xdfff],[0xf4,0x8f,0xbf,0xbf]) # U+10ffff
+]
+
+I16 = [
+    ([0xd800],[0xed,0xa0,0x80]) # high surrogate
+    ([0xdbff],[0xed,0xaf,0xbf]) # high surrogate
+    ([0xdc00],[0xed,0xb0,0x80]) # low surrogate
+    ([0xdfff],[0xed,0xbf,0xbf]) # low surrogate
+]
+
+for (X,Y,Z) in ((V16,V16,V16), (I16,V16,I16), (V16,I16,V16), (V16,V16,I16), (I16,V16,V16))
+    for (a16, a8) in X
+        @test utf16to8(a16) == a8
+        @test utf8to16(a8) == a16
+        for (b16, b8) in Y
+            ab16 = [a16; b16]
+            ab8 = [a8; b8]
+            @test utf16to8(ab16) == ab8
+            @test utf8to16(ab8) == ab16
+            for (c16, c8) in Z
+                abc16 = [ab16; c16]
+                abc8 = [ab8; c8]
+                @test utf16to8(abc16) == abc8
+                @test utf8to16(abc8) == abc16
+            end
+        end
+    end
+end