Skip to content

Commit

Permalink
export and document transcode (JuliaLang#17323)
Browse files Browse the repository at this point in the history
* export and document transcode from JuliaLang#16974, add transcode(String, x) and transcode(T, ::String) convenience methods

* docs

* support UTF-32 in transcode

* don't use splatting for UTF-32 to String conversion

* typo

* eliminate method ambiguities

* re-run genstdlib

* doc clarification

* typo
  • Loading branch information
stevengj authored and mfasi committed Sep 5, 2016
1 parent eec4d67 commit 1fc7018
Show file tree
Hide file tree
Showing 11 changed files with 63 additions and 22 deletions.
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,9 @@ Library improvements
`String(s)`, `unsafe_string(ptr)` (formerly `bytestring(ptr)`), and
`unsafe_wrap(String, ptr)` (formerly `pointer_to_string`) ([#16731]).

* A `transcode(T, src)` function is now exported for converting data
between UTF-xx Unicode encodings ([#17323]).

* Most of the combinatorics functions have been moved from `Base`
to the [Combinatorics.jl package](https://github.com/JuliaLang/Combinatorics.jl) ([#13897]).

Expand Down Expand Up @@ -334,4 +337,5 @@ Deprecated or removed
[#17075]: https://github.com/JuliaLang/julia/issues/17075
[#17266]: https://github.com/JuliaLang/julia/issues/17266
[#17300]: https://github.com/JuliaLang/julia/issues/17300
[#17323]: https://github.com/JuliaLang/julia/issues/17323
[#17374]: https://github.com/JuliaLang/julia/issues/17374
37 changes: 28 additions & 9 deletions base/c.jl
Original file line number Diff line number Diff line change
Expand Up @@ -128,20 +128,39 @@ function cwstring(s::AbstractString)
end
end

# transcoding between data in UTF-8 and UTF-16 for Windows APIs
# transcoding between data in UTF-8 and UTF-16 for Windows APIs,
# and also UTF-32 for APIs using Cwchar_t on other platforms.

"""
Base.transcode(T,src::Vector{U})
transcode(T, src)
Convert string data between Unicode encodings. `src` is either a
`String` or a `Vector{UIntXX}` of UTF-XX code units, where
`XX` is 8, 16, or 32. `T` indicates the encoding of the return value:
`String` to return a (UTF-8 encoded) `String` or `UIntXX`
to return a `Vector{UIntXX}` of UTF-`XX` data. (The alias `Cwchar_t`
can also be used as the integer type, for converting `wchar_t*` strings
used by external C libraries.)
Transcodes unicode data `src` to a different encoding, where `U` and `T` are the integers
denoting the input and output code units. Currently supported are UTF-8 and UTF-16, which
are denoted by integers `UInt8` and `UInt16`, respectively.
The `transcode` function succeeds as long as the input data can be
reasonably represented in the target encoding; it always succeeds for
conversions between UTF-XX encodings, even for invalid Unicode data.
NULs are handled like any other character (i.e. the output will be NUL-terminated if and
only if the `src` is).
Only conversion to/from UTF-8 is currently supported; to convert between UTF-16 and UTF-32 data, transcode through UTF-8 as an intermediate step.
"""
function transcode end
# Requested code unit type equals the element type of `src` (UInt8 or UInt16):
# nothing to convert, so `src` itself is returned.
transcode{T<:Union{UInt8,UInt16}}(::Type{T}, src::Vector{T}) = src
# UInt32 data requested as Int32: reinterpret the elements as Int32.
transcode(::Type{Int32}, src::Vector{UInt32}) = reinterpret(Int32, src)

# Requested code unit type equals the element type of `src` (any of the
# 8-, 16- or 32-bit types listed): `src` itself is returned.
transcode{T<:Union{UInt8,UInt16,UInt32,Int32}}(::Type{T}, src::Vector{T}) = src
# String to 32-bit code units: convert each character `c` of `src` to `T`.
transcode{T<:Union{Int32,UInt32}}(::Type{T}, src::String) = T[T(c) for c in src]
# UInt8 data to 32-bit code units: wrap the bytes in a `String` and reuse the
# String method above.
transcode{T<:Union{Int32,UInt32}}(::Type{T}, src::Vector{UInt8}) = transcode(T, String(src))
# Convert 32-bit code units (`Int32` or `UInt32`) to UTF-8 bytes: each element
# of `src` is turned into a `Char` and printed to an in-memory buffer, whose
# contents are returned as a byte vector.
function transcode{S<:Union{Int32,UInt32}}(::Type{UInt8}, src::Vector{S})
    io = IOBuffer()
    for codeunit in src
        print(io, Char(codeunit))
    end
    return takebuf_array(io)
end
# String requested from a String: return `src` unchanged.
transcode(::Type{String}, src::String) = src
# Any other target type from a String: delegate to the method for `src.data`
# (presumably the string's UTF-8 byte vector; confirm against the String type).
transcode(T, src::String) = transcode(T, src.data)
# String requested from any other source: transcode to UInt8 first, then wrap
# the result in a String.
transcode(::Type{String}, src) = String(transcode(UInt8, src))

function transcode(::Type{UInt16}, src::Vector{UInt8})
dst = UInt16[]
Expand Down
4 changes: 2 additions & 2 deletions base/env.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ function access_env(onError::Function, str::AbstractString)
error(string("getenv: ", str, ' ', len, "-1 != ", ret, ": ", Libc.FormatMessage()))
end
pop!(val) # NUL
return String(transcode(UInt8, val))
return transcode(String, val)
end

function _setenv(svar::AbstractString, sval::AbstractString, overwrite::Bool=true)
Expand Down Expand Up @@ -97,7 +97,7 @@ function next(hash::EnvHash, block::Tuple{Ptr{UInt16},Ptr{UInt16}})
len = ccall(:wcslen, UInt, (Ptr{UInt16},), pos)
buf = Array{UInt16}(len)
unsafe_copy!(pointer(buf), pos, len)
env = String(transcode(UInt8, buf))
env = transcode(String, buf)
m = match(r"^(=?[^=]+)=(.*)$"s, env)
if m === nothing
error("malformed environment entry: $env")
Expand Down
1 change: 1 addition & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -874,6 +874,7 @@ export
strip,
strwidth,
summary,
transcode,
ucfirst,
unescape_string,
uppercase,
Expand Down
4 changes: 2 additions & 2 deletions base/file.jl
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ function tempdir()
error("GetTempPath failed: $(Libc.FormatMessage())")
end
resize!(temppath,lentemppath)
return String(transcode(UInt8, temppath))
return transcode(String, temppath)
end
# One-argument form: forward to the two-argument `tempname`, using `tempdir()`
# as the directory. `uunique` defaults to 0.
tempname(uunique::UInt32=UInt32(0)) = tempname(tempdir(), uunique)
# The prefix "jl_" converted with `cwstring`; presumably the file-name prefix
# handed to GetTempFileName by the two-argument `tempname` (confirm at the call site).
const temp_prefix = cwstring("jl_")
Expand All @@ -216,7 +216,7 @@ function tempname(temppath::AbstractString,uunique::UInt32)
error("GetTempFileName failed: $(Libc.FormatMessage())")
end
resize!(tname,lentname)
return String(transcode(UInt8, tname))
return transcode(String, tname)
end
function mktemp(parent=tempdir())
filename = tempname(parent, UInt32(0))
Expand Down
2 changes: 1 addition & 1 deletion base/interactiveutil.jl
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ elseif is_windows()
len = 0
while unsafe_load(plock, len+1) != 0; len += 1; end
# get Vector{UInt16}, transcode data to UTF-8, make a String of it
s = String(transcode(UInt8, unsafe_wrap(Array, plock, len)))
s = transcode(String, unsafe_wrap(Array, plock, len))
systemerror(:GlobalUnlock, 0==ccall((:GlobalUnlock, "kernel32"), stdcall, Cint, (Ptr{UInt16},), plock))
return s
end
Expand Down
2 changes: 1 addition & 1 deletion base/libc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ if is_windows()
buf = Array{UInt16}(len)
unsafe_copy!(pointer(buf), p, len)
ccall(:LocalFree,stdcall,Ptr{Void},(Ptr{Void},),p)
return String(transcode(UInt8, buf))
return transcode(String, buf)
end
end

Expand Down
4 changes: 2 additions & 2 deletions base/path.jl
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ function realpath(path::AbstractString)
systemerror(:realpath, n == 0)
x = n < length(buf) # is the buffer big enough?
resize!(buf, n) # shrink if x, grow if !x
x && return String(transcode(UInt8, buf))
x && return transcode(String, buf)
end
end

Expand All @@ -150,7 +150,7 @@ function longpath(path::AbstractString)
systemerror(:longpath, n == 0)
x = n < length(buf) # is the buffer big enough?
resize!(buf, n) # shrink if x, grow if !x
x && return String(transcode(UInt8, buf))
x && return transcode(String, buf)
end
end

Expand Down
8 changes: 5 additions & 3 deletions doc/manual/strings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -352,14 +352,16 @@ exception handling required:
<BLANKLINE>
y

Julia uses UTF-8 encoding by default, and support for new encodings can
Julia uses the UTF-8 encoding by default, and support for new encodings can
be added by packages. For example, the `LegacyStrings.jl
<https://github.com/JuliaArchive/LegacyStrings.jl>`_ package implements
``UTF16String`` and ``UTF32String`` types. Additional discussion of other
encodings and how to implement support for them is beyond the scope of this
document for the time being. For further discussion of UTF-8 encoding issues,
see the section below on `byte array literals <#Byte+Array+Literals>`_,
which goes into some greater detail.
see the section below on `byte array literals <#Byte+Array+Literals>`_.
The :func:`transcode` function is provided to convert data between
the various UTF-xx encodings, primarily for working with external
data and libraries.

.. _man-string-interpolation:

Expand Down
11 changes: 10 additions & 1 deletion doc/stdlib/strings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,16 @@
Convert a string to a contiguous byte array representation encoded as UTF-8 bytes. This representation is often appropriate for passing strings to C.

.. function:: transcode(T, src)

.. Docstring generated from Julia source
Convert string data between Unicode encodings. ``src`` is either a ``String`` or a ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8, 16, or 32. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of UTF-``XX`` data. (The alias ``Cwchar_t`` can also be used as the integer type, for converting ``wchar_t*`` strings used by external C libraries.)

The ``transcode`` function succeeds as long as the input data can be reasonably represented in the target encoding; it always succeeds for conversions between UTF-XX encodings, even for invalid Unicode data.

Only conversion to/from UTF-8 is currently supported.

.. function:: unsafe_string(p::Ptr{UInt8}, [length::Integer])

.. Docstring generated from Julia source
Expand Down Expand Up @@ -472,4 +482,3 @@
.. Docstring generated from Julia source
General unescaping of traditional C and Unicode escape sequences. Reverse of :func:`escape_string`\ . See also :func:`unescape_string`\ .

8 changes: 7 additions & 1 deletion test/misc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,6 @@ whos(IOBuffer(), Tmp14173) # warm up
@test @allocated(whos(IOBuffer(), Tmp14173)) < 10000

## test conversion from UTF-8 to UTF-16 (for Windows APIs)
import Base.Libc: transcode

# empty arrays
@test transcode(UInt16, UInt8[]) == UInt16[]
Expand Down Expand Up @@ -376,6 +375,13 @@ for (X,Y,Z) in ((V16,V16,V16), (I16,V16,I16), (V16,I16,V16), (V16,V16,I16), (I16
end
end

# Round-trip check for every supported code unit type, using a string with
# ASCII, multi-byte and embedded NUL characters: transcoding the String must
# match transcoding its `data`, and converting back must reproduce the string.
let str = "abcα🐨\0x\0"
    for U in (UInt8, UInt16, UInt32, Int32)
        units = transcode(U, str)
        @test units == transcode(U, str.data)
        @test transcode(String, units) == str
    end
end

# clipboard functionality
if is_windows()
for str in ("Hello, world.", "∀ x ∃ y", "")
Expand Down

0 comments on commit 1fc7018

Please sign in to comment.