diff --git a/base/strings/annotated.jl b/base/strings/annotated.jl index a85cdf1b08bbb..28aa092c578bd 100644 --- a/base/strings/annotated.jl +++ b/base/strings/annotated.jl @@ -399,6 +399,51 @@ Get all annotations of `chr`, in the form of a vector of annotation pairs. """ annotations(c::AnnotatedChar) = c.annotations +## Character transformation helper function, cf. `unicode.jl`. + +""" + annotated_chartransform(f::Function, str::AnnotatedString, state=nothing) + +Transform every character in `str` with `f`, adjusting annotation regions as +appropriate. `f` must take one of two forms, either: +- `f(c::Char) -> Char` (used when `state` is `nothing`), or +- `f(c::Char, state) -> (Char, state)` (used when `state` is not `nothing`). + +This works by comparing the number of code units of each character before and +after transforming with `f`, recording and aggregating any differences, then +applying them to the annotation regions. + +Returns an `AnnotatedString{String}` (regardless of the original underlying +string type of `str`). +""" +function annotated_chartransform(f::Function, str::AnnotatedString, state=nothing) + outstr = IOBuffer() + annots = Tuple{UnitRange{Int}, Pair{Symbol, Any}}[] + bytepos = firstindex(str) - 1 + offsets = [bytepos => 0] + for c in str.string + oldnb = ncodeunits(c) + bytepos += oldnb + if isnothing(state) + c = f(c) + else + c, state = f(c, state) + end + nb = write(outstr, c) + if nb != oldnb + push!(offsets, bytepos => last(last(offsets)) + nb - oldnb) + end + end + for annot in str.annotations + region, value = annot + start, stop = first(region), last(region) + start_offset = last(offsets[findlast(<=(start) ∘ first, offsets)::Int]) + stop_offset = last(offsets[findlast(<=(stop) ∘ first, offsets)::Int]) + push!(annots, ((start + start_offset):(stop + stop_offset), value)) + end + AnnotatedString(String(take!(outstr)), annots) +end + ## AnnotatedIOBuffer struct AnnotatedIOBuffer <: AbstractPipe diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index 2e04633b87487..a3b06063b98ac 100644 --- 
a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -4,7 +4,8 @@ module Unicode import Base: show, ==, hash, string, Symbol, isless, length, eltype, - convert, isvalid, ismalformed, isoverlong, iterate + convert, isvalid, ismalformed, isoverlong, iterate, + AnnotatedString, AnnotatedChar, annotated_chartransform # whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff @@ -271,6 +272,8 @@ julia> textwidth("March") """ textwidth(s::AbstractString) = mapreduce(textwidth, +, s; init=0) +textwidth(s::AnnotatedString) = textwidth(s.string) + """ lowercase(c::AbstractChar) @@ -290,6 +293,8 @@ julia> lowercase('Ö') lowercase(c::T) where {T<:AbstractChar} = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) : T(ccall(:utf8proc_tolower, UInt32, (UInt32,), c)) +lowercase(c::AnnotatedChar) = AnnotatedChar(lowercase(c.char), annotations(c)) + """ uppercase(c::AbstractChar) @@ -309,6 +314,8 @@ julia> uppercase('ê') uppercase(c::T) where {T<:AbstractChar} = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) : T(ccall(:utf8proc_toupper, UInt32, (UInt32,), c)) +uppercase(c::AnnotatedChar) = AnnotatedChar(uppercase(c.char), annotations(c)) + """ titlecase(c::AbstractChar) @@ -332,6 +339,8 @@ julia> uppercase('dž') titlecase(c::T) where {T<:AbstractChar} = isascii(c) ? ('a' <= c <= 'z' ? 
c - 0x20 : c) : T(ccall(:utf8proc_totitle, UInt32, (UInt32,), c)) +titlecase(c::AnnotatedChar) = AnnotatedChar(titlecase(c.char), annotations(c)) + ############################################################################ # returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category @@ -606,6 +615,7 @@ julia> uppercase("Julia") ``` """ uppercase(s::AbstractString) = map(uppercase, s) +uppercase(s::AnnotatedString) = annotated_chartransform(uppercase, s) """ lowercase(s::AbstractString) @@ -621,6 +631,7 @@ julia> lowercase("STRINGS AND THINGS") ``` """ lowercase(s::AbstractString) = map(lowercase, s) +lowercase(s::AnnotatedString) = annotated_chartransform(lowercase, s) """ titlecase(s::AbstractString; [wordsep::Function], strict::Bool=true) -> String @@ -669,6 +680,23 @@ function titlecase(s::AbstractString; wordsep::Function = !isletter, strict::Boo return String(take!(b)) end +# TODO: improve performance characteristics, room for a ~10x improvement. +function titlecase(s::AnnotatedString; wordsep::Function = !isletter, strict::Bool=true) + initial_state = (; startword = true, state = Ref{Int32}(0), + c0 = eltype(s)(zero(UInt32)), wordsep, strict) + annotated_chartransform(s, initial_state) do c, state + if isgraphemebreak!(state.state, state.c0, c) && state.wordsep(c) + state = Base.setindex(state, true, :startword) + cnew = c + else + cnew = state.startword ? titlecase(c) : state.strict ? lowercase(c) : c + state = Base.setindex(state, false, :startword) + end + state = Base.setindex(state, c, :c0) + cnew, state + end +end + """ uppercasefirst(s::AbstractString) -> String @@ -693,6 +721,17 @@ function uppercasefirst(s::AbstractString) string(c′, SubString(s, nextind(s, 1))) end +# TODO: improve performance characteristics, room for a ~5x improvement. 
+function uppercasefirst(s::AnnotatedString) + annotated_chartransform(s, true) do c, state + if state + (titlecase(c), false) + else + (c, state) + end + end +end + """ lowercasefirst(s::AbstractString) @@ -715,6 +754,17 @@ function lowercasefirst(s::AbstractString) string(c′, SubString(s, nextind(s, 1))) end +# TODO: improve performance characteristics, room for a ~5x improvement. +function lowercasefirst(s::AnnotatedString) + annotated_chartransform(s, true) do c, state + if state + (lowercase(c), false) + else + (c, state) + end + end +end + ############################################################################ # iterators for grapheme segmentation diff --git a/test/strings/annotated.jl b/test/strings/annotated.jl index fda583bf7f778..b70a2350757a2 100644 --- a/test/strings/annotated.jl +++ b/test/strings/annotated.jl @@ -108,6 +108,33 @@ end @test reverse(str2) == Base.AnnotatedString("esac", [(2:3, :label => "oomph")]) end +@testset "Unicode" begin + for words in (["ᲃase", "cɦɒnɡeȿ", "can", "CHⱯNGE", "Сodeunıts"], + ["Сodeunıts", "ᲃase", "cɦɒnɡeȿ", "can", "CHⱯNGE"]) + ann_words = [Base.AnnotatedString(w, [(1:ncodeunits(w), :i => i)]) + for (i, w) in enumerate(words)] + ann_str = join(ann_words, '-') + for transform in (lowercase, uppercase, titlecase) + t_words = map(transform, words) + ann_t_words = [Base.AnnotatedString(w, [(1:ncodeunits(w), :i => i)]) + for (i, w) in enumerate(t_words)] + ann_t_str = join(ann_t_words, '-') + t_ann_str = transform(ann_str) + @test String(ann_t_str) == String(t_ann_str) + @test Base.annotations(ann_t_str) == Base.annotations(t_ann_str) + end + for transform in (uppercasefirst, lowercasefirst) + t_words = vcat(transform(first(words)), words[2:end]) + ann_t_words = [Base.AnnotatedString(w, [(1:ncodeunits(w), :i => i)]) + for (i, w) in enumerate(t_words)] + ann_t_str = join(ann_t_words, '-') + t_ann_str = transform(ann_str) + @test String(ann_t_str) == String(t_ann_str) + @test Base.annotations(ann_t_str) == 
Base.annotations(t_ann_str) + end + end +end + @testset "AnnotatedIOBuffer" begin aio = Base.AnnotatedIOBuffer() # Append-only writing