Handle embedded NUL characters in string length, width and normalization
(#10958).

length(::UTF8String) and utf8proc_map now pass an explicit byte count to the
C routines (u8_charnum, utf8proc_map) instead of a bare pointer / the
UTF8PROC_NULLTERM flag, and the pointer-only strwidth(::ByteString) method is
removed so that strwidth falls back to the generic per-character method.
Tests for "\0"-containing strings are added to test/unicode.jl.

NOTE(review): line breaks and leading indentation below were reconstructed
from a whitespace-collapsed copy of this patch; confirm against the target
tree (or apply with --ignore-whitespace) before relying on exact context.

diff --git a/base/string.jl b/base/string.jl
index 3c363eda1e4c2..306ac61858a63 100644
--- a/base/string.jl
+++ b/base/string.jl
@@ -541,8 +541,6 @@ startswith(a::Array{UInt8,1}, b::Array{UInt8,1}) =
 ## character column width function ##
 
 strwidth(s::AbstractString) = (w=0; for c in s; w += charwidth(c); end; w)
-strwidth(s::ByteString) = Int(ccall(:u8_strwidth, Csize_t, (Ptr{UInt8},), s.data))
-# TODO: implement and use u8_strnwidth that takes a length argument
 
 isascii(c::Char) = c < Char(0x80)
 isascii(s::AbstractString) = all(isascii, s)
diff --git a/base/utf8.jl b/base/utf8.jl
index c254a50c0060d..ef36d6afe59f4 100644
--- a/base/utf8.jl
+++ b/base/utf8.jl
@@ -37,7 +37,8 @@ function endof(s::UTF8String)
     end
     i
 end
-length(s::UTF8String) = Int(ccall(:u8_strlen, Csize_t, (Ptr{UInt8},), s.data))
+length(s::UTF8String) = Int(ccall(:u8_charnum, Csize_t, (Ptr{UInt8}, Csize_t),
+                                  s.data, length(s.data)))
 
 function next(s::UTF8String, i::Int)
     # potentially faster version
diff --git a/base/utf8proc.jl b/base/utf8proc.jl
index f07328a812768..7b514feb0e1f5 100644
--- a/base/utf8proc.jl
+++ b/base/utf8proc.jl
@@ -46,7 +46,6 @@ const UTF8PROC_CATEGORY_CF = 27
 const UTF8PROC_CATEGORY_CS = 28
 const UTF8PROC_CATEGORY_CO = 29
 
-const UTF8PROC_NULLTERM = (1<<0)
 const UTF8PROC_STABLE = (1<<1)
 const UTF8PROC_COMPAT = (1<<2)
 const UTF8PROC_COMPOSE = (1<<3)
@@ -64,22 +63,21 @@ const UTF8PROC_STRIPMARK = (1<<13)
 
 ############################################################################
 
-let
-    const p = Array(Ptr{UInt8}, 1)
-    global utf8proc_map
-    function utf8proc_map(s::AbstractString, flags::Integer)
-        result = ccall(:utf8proc_map, Cssize_t,
-                       (Ptr{UInt8}, Cssize_t, Ptr{Ptr{UInt8}}, Cint),
-                       s, 0, p, flags | UTF8PROC_NULLTERM)
-        result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{UInt8},
-                                             (Cssize_t,), result)))
-        a = ccall(:jl_ptr_to_array_1d, Vector{UInt8},
-                  (Any, Ptr{UInt8}, Csize_t, Cint),
-                  Vector{UInt8}, p[1], result, true)
-        ccall(:jl_array_to_string, Any, (Any,), a)::ByteString
-    end
+function utf8proc_map(s::ByteString, flags::Integer)
+    p = Ref{Ptr{UInt8}}()
+    result = ccall(:utf8proc_map, Cssize_t,
+                   (Ptr{UInt8}, Cssize_t, Ref{Ptr{UInt8}}, Cint),
+                   s, sizeof(s), p, flags)
+    result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{UInt8},
+                                         (Cssize_t,), result)))
+    a = ccall(:jl_ptr_to_array_1d, Vector{UInt8},
+              (Any, Ptr{UInt8}, Csize_t, Cint),
+              Vector{UInt8}, p[], result, true)
+    ccall(:jl_array_to_string, Any, (Any,), a)::ByteString
 end
 
+utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(bytestring(s), flags)
+
 function normalize_string(s::AbstractString; stable::Bool=false, compat::Bool=false, compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false)
     flags = 0
     stable && (flags = flags | UTF8PROC_STABLE)
diff --git a/test/unicode.jl b/test/unicode.jl
index c672e1bcb65e5..d2fc1961d627d 100644
--- a/test/unicode.jl
+++ b/test/unicode.jl
@@ -129,3 +129,8 @@ end
 # up-to-date character widths (#3721, #6939)
 @test charwidth('\U1f355') == strwidth("\U1f355") == strwidth(utf16("\U1f355")) ==
       strwidth("\U1f355\u0302") == strwidth(utf16("\U1f355\u0302")) == 2
+
+# handling of embedded NUL chars (#10958)
+@test length("\0w") == length("\0α") == 2
+@test strwidth("\0w") == strwidth("\0α") == 1
+@test normalize_string("\0W", casefold=true) == "\0w"