From b192bf0e2f0a0c5b232c3352c6e996e5f11053f2 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@mit.edu>
Date: Fri, 24 Apr 2015 11:47:59 -0400
Subject: [PATCH] fix #10958: buggy handling of embedded NUL chars

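Stop treating string data as NUL-terminated when calling into C:
length(::UTF8String) now calls u8_charnum with length(s.data) instead of
u8_strlen, utf8proc_map passes sizeof(s) and no longer sets
UTF8PROC_NULLTERM, and the strwidth(::ByteString) method that called
u8_strwidth is removed so the per-character strwidth(::String) is used.

(cherry picked from commit 1d90e973869075a2bd4476dda3c6480a4fce9aa2)
ref PR #10991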

Conflicts:
	base/string.jl
	base/utf8.jl
	base/utf8proc.jl
	test/unicode.jl
---
 base/string.jl   | 2 --
 base/utf8.jl     | 3 ++-
 base/utf8proc.jl | 7 ++++---
 test/unicode.jl  | 5 +++++
 4 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/base/string.jl b/base/string.jl
index 4bc4d5633f876..221f5207eeb79 100644
--- a/base/string.jl
+++ b/base/string.jl
@@ -538,8 +538,6 @@ beginswith(a::Array{Uint8,1}, b::Array{Uint8,1}) =
 
 charwidth(c::Char) = max(0,int(ccall(:wcwidth, Int32, (Uint32,), c)))
 strwidth(s::String) = (w=0; for c in s; w += charwidth(c); end; w)
-strwidth(s::ByteString) = int(ccall(:u8_strwidth, Csize_t, (Ptr{Uint8},), s.data))
-# TODO: implement and use u8_strnwidth that takes a length argument
 
 ## libc character class predicates ##
 
diff --git a/base/utf8.jl b/base/utf8.jl
index 684a3b5e437a8..ac7ded8a3336f 100644
--- a/base/utf8.jl
+++ b/base/utf8.jl
@@ -37,7 +37,8 @@ function endof(s::UTF8String)
     end
     i
 end
-length(s::UTF8String) = int(ccall(:u8_strlen, Csize_t, (Ptr{Uint8},), s.data))
+length(s::UTF8String) = int(ccall(:u8_charnum, Csize_t, (Ptr{Uint8}, Csize_t),
+                                  s.data, length(s.data)))
 
 function next(s::UTF8String, i::Int)
     # potentially faster version
diff --git a/base/utf8proc.jl b/base/utf8proc.jl
index 594dfe08d8ba2..8656e0c1d70f5 100644
--- a/base/utf8proc.jl
+++ b/base/utf8proc.jl
@@ -41,7 +41,6 @@ const UTF8PROC_CATEGORY_CS = 28
 const UTF8PROC_CATEGORY_CO = 29
 const UTF8PROC_CATEGORY_CN = 30
 
-const UTF8PROC_NULLTERM  = (1<<0)
 const UTF8PROC_STABLE    = (1<<1)
 const UTF8PROC_COMPAT    = (1<<2)
 const UTF8PROC_COMPOSE   = (1<<3)
@@ -60,10 +59,10 @@ const UTF8PROC_STRIPMARK = (1<<13)
 let
     const p = Array(Ptr{Uint8}, 1)
     global utf8proc_map
-    function utf8proc_map(s::String, flags::Integer)
+    function utf8proc_map(s::ByteString, flags::Integer)
         result = ccall(:utf8proc_map, Cssize_t,
                        (Ptr{Uint8}, Cssize_t, Ptr{Ptr{Uint8}}, Cint),
-                       s, 0, p, flags | UTF8PROC_NULLTERM)
+                       s, sizeof(s), p, flags)
         result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{Uint8},
                                              (Cssize_t,), result)))
         a = ccall(:jl_ptr_to_array_1d, Vector{Uint8},
@@ -73,6 +72,8 @@ let
     end
 end
 
+utf8proc_map(s::String, flags::Integer) = utf8proc_map(bytestring(s), flags)
+
 function normalize_string(s::String; stable::Bool=false, compat::Bool=false, compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false)
     flags = 0
     stable && (flags = flags | UTF8PROC_STABLE)
diff --git a/test/unicode.jl b/test/unicode.jl
index ec99f04c1102b..724574f26f7bd 100644
--- a/test/unicode.jl
+++ b/test/unicode.jl
@@ -99,3 +99,8 @@ let c_ll = 'β', c_cn = '\u038B'
     # check codepoint with category code CN
     @test Base.UTF8proc.category_code(c_cn) == Base.UTF8proc.UTF8PROC_CATEGORY_CN
 end
+
+# handling of embedded NUL chars (#10958)
+@test length("\0w") == length("\0α") == 2
+@test strwidth("\0w") == strwidth("\0α") == 1
+@test normalize_string("\0W", casefold=true) == "\0w"