Skip to content

Commit

Permalink
Merge pull request #10991 from stevengj/nullsafe
Browse files Browse the repository at this point in the history
fix #10958: buggy handling of embedded NUL chars
  • Loading branch information
stevengj committed Apr 24, 2015
2 parents cad0325 + 1d90e97 commit 8bcdb3f
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 18 deletions.
2 changes: 0 additions & 2 deletions base/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -541,8 +541,6 @@ startswith(a::Array{UInt8,1}, b::Array{UInt8,1}) =
## character column width function ##

# Generic method: the width of a string is the sum of charwidth(c) over every
# character c obtained by iterating s (0 for an empty string).
strwidth(s::AbstractString) = (w=0; for c in s; w += charwidth(c); end; w)
strwidth(s::ByteString) = Int(ccall(:u8_strwidth, Csize_t, (Ptr{UInt8},), s.data))
# TODO: implement and use u8_strnwidth that takes a length argument

# A character is ASCII when it compares below Char(0x80).
isascii(c::Char) = c < Char(0x80)
# A string is ASCII when every one of its characters is ASCII.
isascii(s::AbstractString) = all(isascii, s)
Expand Down
3 changes: 2 additions & 1 deletion base/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ function endof(s::UTF8String)
end
i
end
length(s::UTF8String) = Int(ccall(:u8_strlen, Csize_t, (Ptr{UInt8},), s.data))
# Number of characters in a UTF8String, obtained from the C routine u8_charnum.
# The call passes the byte pointer together with an explicit byte count
# (length(s.data)).
# NOTE(review): presumably the explicit count is what lets the result include
# bytes after an embedded NUL — confirm against the u8_charnum implementation.
length(s::UTF8String) = Int(ccall(:u8_charnum, Csize_t, (Ptr{UInt8}, Csize_t),
s.data, length(s.data)))

function next(s::UTF8String, i::Int)
# potentially faster version
Expand Down
28 changes: 13 additions & 15 deletions base/utf8proc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ const UTF8PROC_CATEGORY_CF = 27
const UTF8PROC_CATEGORY_CS = 28
const UTF8PROC_CATEGORY_CO = 29

const UTF8PROC_NULLTERM = (1<<0)
const UTF8PROC_STABLE = (1<<1)
const UTF8PROC_COMPAT = (1<<2)
const UTF8PROC_COMPOSE = (1<<3)
Expand All @@ -64,22 +63,21 @@ const UTF8PROC_STRIPMARK = (1<<13)

############################################################################

let
const p = Array(Ptr{UInt8}, 1)
global utf8proc_map
function utf8proc_map(s::AbstractString, flags::Integer)
result = ccall(:utf8proc_map, Cssize_t,
(Ptr{UInt8}, Cssize_t, Ptr{Ptr{UInt8}}, Cint),
s, 0, p, flags | UTF8PROC_NULLTERM)
result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{UInt8},
(Cssize_t,), result)))
a = ccall(:jl_ptr_to_array_1d, Vector{UInt8},
(Any, Ptr{UInt8}, Csize_t, Cint),
Vector{UInt8}, p[1], result, true)
ccall(:jl_array_to_string, Any, (Any,), a)::ByteString
end
# Run the C function utf8proc_map over the bytes of `s` with the option bits in
# `flags`, and return the mapped text as a ByteString.
# Raises an error carrying the utf8proc_errmsg text when the C call returns a
# negative result.
function utf8proc_map(s::ByteString, flags::Integer)
# out-parameter: the C call stores the pointer to its output buffer in p
p = Ref{Ptr{UInt8}}()
# the input length is passed explicitly as sizeof(s); flags are forwarded unchanged
result = ccall(:utf8proc_map, Cssize_t,
(Ptr{UInt8}, Cssize_t, Ref{Ptr{UInt8}}, Cint),
s, sizeof(s), p, flags)
# a negative return value is treated as an error code and turned into a message
result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{UInt8},
(Cssize_t,), result)))
# wrap the `result` bytes at p[] as a Vector{UInt8}
# NOTE(review): the trailing `true` presumably hands ownership of the buffer to
# Julia so it is freed by the GC — confirm against jl_ptr_to_array_1d
a = ccall(:jl_ptr_to_array_1d, Vector{UInt8},
(Any, Ptr{UInt8}, Csize_t, Cint),
Vector{UInt8}, p[], result, true)
# convert the byte vector into a string object
ccall(:jl_array_to_string, Any, (Any,), a)::ByteString
end

# Fallback for any other AbstractString: convert with bytestring(s) and
# dispatch to the ByteString method.
utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(bytestring(s), flags)

function normalize_string(s::AbstractString; stable::Bool=false, compat::Bool=false, compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false)
flags = 0
stable && (flags = flags | UTF8PROC_STABLE)
Expand Down
5 changes: 5 additions & 0 deletions test/unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,8 @@ end

# up-to-date character widths (#3721, #6939)
@test charwidth('\U1f355') == strwidth("\U1f355") == strwidth(utf16("\U1f355")) == strwidth("\U1f355\u0302") == strwidth(utf16("\U1f355\u0302")) == 2

# handling of embedded NUL chars (#10958)
# length must count both the NUL and the character that follows it
@test length("\0w") == length("\0α") == 2
# the total width of each two-character string is expected to be 1
@test strwidth("\0w") == strwidth("\0α") == 1
# case folding must still be applied to the character after the NUL
@test normalize_string("\0W", casefold=true) == "\0w"

0 comments on commit 8bcdb3f

Please sign in to comment.