From b192bf0e2f0a0c5b232c3352c6e996e5f11053f2 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Fri, 24 Apr 2015 11:47:59 -0400 Subject: [PATCH] fix #10958: buggy handling of embedded NUL chars (cherry picked from commit 1d90e973869075a2bd4476dda3c6480a4fce9aa2) ref PR #10991 Conflicts: base/string.jl base/utf8.jl base/utf8proc.jl test/unicode.jl --- base/string.jl | 2 -- base/utf8.jl | 3 ++- base/utf8proc.jl | 7 ++++--- test/unicode.jl | 5 +++++ 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/base/string.jl b/base/string.jl index 4bc4d5633f876..221f5207eeb79 100644 --- a/base/string.jl +++ b/base/string.jl @@ -538,8 +538,6 @@ beginswith(a::Array{Uint8,1}, b::Array{Uint8,1}) = charwidth(c::Char) = max(0,int(ccall(:wcwidth, Int32, (Uint32,), c))) strwidth(s::String) = (w=0; for c in s; w += charwidth(c); end; w) -strwidth(s::ByteString) = int(ccall(:u8_strwidth, Csize_t, (Ptr{Uint8},), s.data)) -# TODO: implement and use u8_strnwidth that takes a length argument ## libc character class predicates ## diff --git a/base/utf8.jl b/base/utf8.jl index 684a3b5e437a8..ac7ded8a3336f 100644 --- a/base/utf8.jl +++ b/base/utf8.jl @@ -37,7 +37,8 @@ function endof(s::UTF8String) end i end -length(s::UTF8String) = int(ccall(:u8_strlen, Csize_t, (Ptr{Uint8},), s.data)) +length(s::UTF8String) = int(ccall(:u8_charnum, Csize_t, (Ptr{Uint8}, Csize_t), + s.data, length(s.data))) function next(s::UTF8String, i::Int) # potentially faster version diff --git a/base/utf8proc.jl b/base/utf8proc.jl index 594dfe08d8ba2..8656e0c1d70f5 100644 --- a/base/utf8proc.jl +++ b/base/utf8proc.jl @@ -41,7 +41,6 @@ const UTF8PROC_CATEGORY_CS = 28 const UTF8PROC_CATEGORY_CO = 29 const UTF8PROC_CATEGORY_CN = 30 -const UTF8PROC_NULLTERM = (1<<0) const UTF8PROC_STABLE = (1<<1) const UTF8PROC_COMPAT = (1<<2) const UTF8PROC_COMPOSE = (1<<3) @@ -60,10 +59,10 @@ const UTF8PROC_STRIPMARK = (1<<13) let const p = Array(Ptr{Uint8}, 1) global utf8proc_map - function utf8proc_map(s::String, flags::Integer) + function utf8proc_map(s::ByteString, flags::Integer) result = ccall(:utf8proc_map, Cssize_t, (Ptr{Uint8}, Cssize_t, Ptr{Ptr{Uint8}}, Cint), - s, 0, p, flags | UTF8PROC_NULLTERM) + s, sizeof(s), p, flags) result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{Uint8}, (Cssize_t,), result))) a = ccall(:jl_ptr_to_array_1d, Vector{Uint8}, @@ -73,6 +72,8 @@ let end end +utf8proc_map(s::String, flags::Integer) = utf8proc_map(bytestring(s), flags) + function normalize_string(s::String; stable::Bool=false, compat::Bool=false, compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false) flags = 0 stable && (flags = flags | UTF8PROC_STABLE) diff --git a/test/unicode.jl b/test/unicode.jl index ec99f04c1102b..724574f26f7bd 100644 --- a/test/unicode.jl +++ b/test/unicode.jl @@ -99,3 +99,8 @@ let c_ll = 'β', c_cn = '\u038B' # check codepoint with category code CN @test Base.UTF8proc.category_code(c_cn) == Base.UTF8proc.UTF8PROC_CATEGORY_CN end + +# handling of embedded NUL chars (#10958) +@test length("\0w") == length("\0α") == 2 +@test strwidth("\0w") == strwidth("\0α") == 1 +@test normalize_string("\0W", casefold=true) == "\0w"