diff --git a/Project.toml b/Project.toml index ef16280..47c5df1 100644 --- a/Project.toml +++ b/Project.toml @@ -2,7 +2,7 @@ name = "ShortStrings" uuid = "63221d1c-8677-4ff0-9126-0ff0817b4975" authors = ["Dai ZJ ", "ScottPJones ", "Lyndon White "] -version = "0.3.2" +version = "0.3.3" [deps] BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1" diff --git a/README.jmd b/README.jmd index 355cdb1..5f3cad5 100644 --- a/README.jmd +++ b/README.jmd @@ -1,7 +1,7 @@ ## ShortStrings This is an efficient string format for storing strings using integer types. For example, `UInt32` can hold 3 bytes of string with 1 byte to record the size of the string and a `UInt128` can hold a byte string with 1 byte to record the size of the string. -Using BitIntegers.jl, integer of larger size than `UInt128` can be defined. This package support string with up to 126 bytes in size. +Using BitIntegers.jl, integers of larger size than `UInt128` can be defined. This package supports strings of up to 255 bytes in size. 
## Quick Start ```julia @@ -27,6 +27,15 @@ ShortString3(randstring(3)) s15 = ss15"A short string" # ShortString15 === ShortString{Int128} s7 = ss7"shorter" # ShortString7 === ShortString{Int64} s3 = ss3"srt" # ShortString3 === ShortString{Int32} + +# The ShortString constructor can automatically select the shortest size that a string will fit in +ShortString("This is a long string") + +# The maximum length can also be added: +ShortString("Foo", 15) + +# The `ss` macro will also select the shortest size that will fit +s31 = ss"This also is a long string" ``` ## Benchmarks @@ -86,8 +95,12 @@ This is based on the discussion [here](https://discourse.julialang.org/t/progres # Build Status -[![Build Status](https://travis-ci.org/xiaodaigh/ShortStrings.jl.svg?branch=master)](https://travis-ci.org/xiaodaigh/ShortStrings.jl) - -[![Coverage Status](https://coveralls.io/repos/xiaodaigh/ShortStrings.jl/badge.svg?branch=master&service=github)](https://coveralls.io/github/xiaodaigh/ShortStrings.jl?branch=master) +[contrib]: https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat +[travis-url]: https://travis-ci.org/JuliaString/ShortStrings.jl +[travis-img]: https://travis-ci.org/JuliaString/ShortStrings.jl.svg?branch=master +[codecov-url]: https://codecov.io/gh/JuliaString/ShortStrings.jl?branch=master +[codecov-img]: https://codecov.io/gh/JuliaString/ShortStrings.jl/branch/master/graph/badge.svg -[![codecov.io](http://codecov.io/github/xiaodaigh/ShortStrings.jl/coverage.svg?branch=master)](http://codecov.io/github/xiaodaigh/ShortStrings.jl?branch=master) +[![contributions welcome][contrib]](https://github.com/JuliaString/ShortStrings.jl/issues) +[![][travis-img]][travis-url] +[![][codecov-img]][codecov-url] diff --git a/README.md b/README.md index d03f2a3..eae5d12 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,10 @@ ## ShortStrings This is an efficient string format for storing strings using integer types. 
For example, `UInt32` can hold 3 bytes of string with 1 byte to record the size of the string and a `UInt128` can hold a byte string with 1 byte to record the size of the string. -Using BitIntegers.jl, integer of larger size than `UInt128` can be defined. This package support string with up to 126 bytes in size. +Using BitIntegers.jl, integers of larger size than `UInt128` can be defined. This package supports strings of up to 255 bytes in size. ## Quick Start -````julia - +```julia using ShortStrings using SortingAlgorithms @@ -28,23 +27,32 @@ ShortString3(randstring(3)) s15 = ss15"A short string" # ShortString15 === ShortString{Int128} s7 = ss7"shorter" # ShortString7 === ShortString{Int64} s3 = ss3"srt" # ShortString3 === ShortString{Int32} -```` +# The ShortString constructor can automatically select the shortest size that a string will fit in +ShortString("This is a long string") -```` -0.305076 seconds (9 allocations: 11.445 MiB) - 0.166334 seconds (259.04 k allocations: 44.419 MiB) -"srt" -```` +# The maximum length can also be added: +ShortString("Foo", 15) +# The `ss` macro will also select the shortest size that will fit +s31 = ss"This also is a long string" +``` +``` +0.386383 seconds (126 allocations: 11.450 MiB, 18.62% gc time, 0.59% comp +ilation time) + 0.279547 seconds (742.26 k allocations: 74.320 MiB, 70.85% compilation ti +me) +"This also is a long string" +``` -## Benchmarks -````julia +## Benchmarks + +```julia using SortingLab, ShortStrings, SortingAlgorithms, BenchmarkTools; N = Int(1e6); svec = [randstring(rand(1:15)) for i=1:N]; @@ -59,7 +67,6 @@ sort(ssvec, by = x->x.size_content, alg=RadixSort) using RCall @rput svec; r_timings = R""" -memory.limit(2^31-1) replicate($(length(short_radixsort.times)), system.time(sort(svec, method="radix"))[3]) """; @@ -68,13 +75,11 @@ bar(["Base.sort","SortingLab.radixsort","ShortStrings radix sort", "R radix sort mean.([basesort.times./1e9, radixsort_timings.times./1e9, short_radixsort.times./1e9, 
r_timings]), title="String sort performance - len: 1m, variable size 15", label = "seconds") -```` - +``` ![](figures/README_2_1.png) -````julia - +```julia using SortingLab, ShortStrings, SortingAlgorithms, BenchmarkTools; N = Int(1e6); svec = rand([randstring(rand(1:15)) for i=1:N÷100],N) @@ -96,8 +101,7 @@ bar(["Base.sort","SortingLab.radixsort","ShortStrings radix sort", "R radix sort mean.([basesort.times./1e9, radixsort_timings.times./1e9, short_radixsort.times./1e9, r_timings]), title="String sort performance - len: $(N÷1_000_000)m, fixed size: 15", label = "seconds") -```` - +``` ![](figures/README_3_1.png) @@ -108,8 +112,12 @@ This is based on the discussion [here](https://discourse.julialang.org/t/progres # Build Status -[![Build Status](https://travis-ci.org/xiaodaigh/ShortStrings.jl.svg?branch=master)](https://travis-ci.org/xiaodaigh/ShortStrings.jl) - -[![Coverage Status](https://coveralls.io/repos/xiaodaigh/ShortStrings.jl/badge.svg?branch=master&service=github)](https://coveralls.io/github/xiaodaigh/ShortStrings.jl?branch=master) +[contrib]: https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat +[travis-url]: https://travis-ci.org/JuliaString/ShortStrings.jl +[travis-img]: https://travis-ci.org/JuliaString/ShortStrings.jl.svg?branch=master +[codecov-url]: https://codecov.io/gh/JuliaString/ShortStrings.jl?branch=master +[codecov-img]: https://codecov.io/gh/JuliaString/ShortStrings.jl/branch/master/graph/badge.svg -[![codecov.io](http://codecov.io/github/xiaodaigh/ShortStrings.jl/coverage.svg?branch=master)](http://codecov.io/github/xiaodaigh/ShortStrings.jl?branch=master) +[![contributions welcome][contrib]](https://github.com/JuliaString/ShortStrings.jl/issues) +[![][travis-img]][travis-url] +[![][codecov-img]][codecov-url] diff --git a/build-readme.jl b/build-readme.jl index 378f08c..8cb0c37 100644 --- a/build-readme.jl +++ b/build-readme.jl @@ -1,7 +1,6 @@ # Weave readme using Pkg -cd("c:/git/ShortStrings/") 
-Pkg.activate("c:/git/ShortStrings/readme-env") +Pkg.activate("./readme-env") using Weave diff --git a/figures/README_2_1.png b/figures/README_2_1.png index d973bc8..a716b13 100644 Binary files a/figures/README_2_1.png and b/figures/README_2_1.png differ diff --git a/figures/README_3_1.png b/figures/README_3_1.png index fb2f577..fc493c9 100644 Binary files a/figures/README_3_1.png and b/figures/README_3_1.png differ diff --git a/src/ShortStrings.jl b/src/ShortStrings.jl index 33ba514..076cb7e 100644 --- a/src/ShortStrings.jl +++ b/src/ShortStrings.jl @@ -5,7 +5,7 @@ using SortingAlgorithms export fsort, fsort!, ShortString, ShortString3, ShortString7, ShortString15 export ShortString31, ShortString63, ShortString127, ShortString255 -export @ss3_str, @ss7_str, @ss15_str, @ss31_str, @ss63_str, @ss127_str, @ss255_str +export @ss_str, @ss3_str, @ss7_str, @ss15_str, @ss31_str, @ss63_str, @ss127_str, @ss255_str include("base.jl") diff --git a/src/base.jl b/src/base.jl index b8b1375..59d9f98 100644 --- a/src/base.jl +++ b/src/base.jl @@ -10,7 +10,7 @@ struct ShortString{T} <: AbstractString where {T} size_content::T end -# check if a string of size `sz` can be stored in ShortString{T}` +"""Check if a string of `sz` bytes can be stored in `ShortString{T}`, whose capacity is `sizeof(T) - size_bytes(T)` bytes""" function check_size(T, sz) max_len = sizeof(T) - size_bytes(T) # the last few bytes are used to store the length if sz > max_len @@ -18,14 +18,46 @@ function check_size(T, sz) end end +"""Calculate the number of bytes required to store the size of the ShortString""" size_bytes(::Type{T}) where {T} = (count_ones(sizeof(T)-1)+7)>>3 +"""Calculate a mask of type `T` with the low `size_bytes(T)` bytes set, used to get the size stored in the ShortString""" size_mask(T) = T((1<<(size_bytes(T)*8)) - 1) size_mask(s::ShortString{T}) where {T} = size_mask(T) +"""The size in bytes (`sizeof(UInt)`) of the chunk used to process String values""" const CHUNKSZ = sizeof(UInt) + +"""The number of bits in the chunk type used to process String values""" const CHUNKBITS = sizeof(UInt) == 4 ? 
32 : 64 +"""Internal function to pick up a byte at the given index in a ShortString""" +@inline _get_byte(s::ShortString, i::Int) = (s.size_content >>> (8*(sizeof(s.size_content) - i)))%UInt8 + +""" +Internal function to pick up a UInt32 (i.e. to contain 1 Char) at the given index +in a ShortString +""" +@inline function _get_word(s::ShortString{T}, i::Int) where {T} + sz = sizeof(T) + if sz <= 4 + # Shift up by 0-3 bytes + (s.size_content%UInt32) << (8*(i + 3 - sz)) + else + (s.size_content >>> (8*(sz - i - 3)))%UInt32 + end +end + +"""Internal function to get the UInt32 representation of a Char from an index in a ShortString""" +@inline function _get_char(str::ShortString, pos::Int) + chr = _get_word(str, pos) + typ = chr >>> 28 + chr & ~ifelse(typ < 0x8, 0xffffff, + ifelse(typ < 0xe, 0x00ffff, + ifelse(typ < 0xf, 0x0000ff, 0x000000))) +end + +"""Internal function, given a String and its size in bytes, load it into a value of type T""" @inline function _ss(::Type{T}, str::String, sz) where {T} if sizeof(T) <= sizeof(UInt) unsafe_load(reinterpret(Ptr{T}, pointer(str))) @@ -80,6 +112,7 @@ end """Amount to shift ShortString value by for each UInt sized chunk""" const SHFT_INT = UInt === UInt32 ? 
2 : 3 +# Optimized conversion of a ShortString to a String function String(s::ShortString{T}) where {T} len = sizeof(s) len === 0 && return "" @@ -89,7 +122,7 @@ function String(s::ShortString{T}) where {T} pnt = reinterpret(Ptr{UInt}, pointer(sv)) for i = 1:(len + sizeof(UInt) - 1) >>> SHFT_INT unsafe_store!(pnt, val % UInt) - val >>>= SHFT_INT + val >>>= 8*sizeof(UInt) pnt += sizeof(UInt) end String(sv) @@ -106,21 +139,11 @@ Base.convert(::String, ss::ShortString) = String(ss) Base.sizeof(s::ShortString) = Int(s.size_content & size_mask(s)) Base.firstindex(::ShortString) = 1 -Base.isvalid(s::ShortString, i::Integer) = isvalid(String(s), i) Base.lastindex(s::ShortString) = sizeof(s) Base.ncodeunits(s::ShortString) = sizeof(s) -Base.show(io::IO, str::ShortString) = show(io, String(str)) - -@inline function _get_word(s::ShortString{T}, i::Int) where {T} - sz = sizeof(T) - if sz <= 4 - # Shift up by 0-3 bytes - (s.size_content%UInt32) << (8*(i + 3 - sz)) - else - (s.size_content >>> (8*(sz - i - 3)))%UInt32 - end -end +# Checks top two bits of the byte at index i: a position is valid unless it holds a UTF-8 continuation byte (10xxxxxx) +Base.isvalid(s::ShortString, i::Integer) = (0 < i <= sizeof(s)) && ((_get_byte(s, Int(i)) & 0xc0) != 0x80) @inline function Base.iterate(s::ShortString, i::Int=1) 0 < i <= ncodeunits(s) || return nothing @@ -168,14 +191,6 @@ function Base.length(s::ShortString{T}) where T return len end -@inline function _get_char(str::ShortString, pos::Int) - chr = _get_word(str, pos) - typ = chr >>> 28 - chr & ~ifelse(typ < 0x8, 0xffffff, - ifelse(typ < 0xe, 0x00ffff, - ifelse(typ < 0xf, 0x0000ff, 0x000000))) -end - @propagate_inbounds function Base.getindex(str::ShortString, pos::Int=1) @_inline_meta() @boundscheck checkbounds(str, pos) @@ -221,7 +236,10 @@ size_content(s::ShortString) = s.size_content @define_integers 2048 Int2048 UInt2048 -for T in (UInt2048, UInt1024, UInt512, UInt256, UInt128, UInt64, UInt32) +"""These are the default types used for selecting the size of a ShortString""" +const 
def_types = (UInt32, UInt64, UInt128, UInt256, UInt512, UInt1024, UInt2048) + +for T in def_types max_len = sizeof(T) - size_bytes(T) constructor_name = Symbol(:ShortString, max_len) macro_name = Symbol(:ss, max_len, :_str) @@ -232,6 +250,27 @@ for T in (UInt2048, UInt1024, UInt512, UInt256, UInt128, UInt64, UInt32) end end +""" +Return a ShortString type that can hold maxlen codeunits +The keyword parameter `types` can be used to pass a list of types +which can be used to store the string +If no type is large enough, then an `ArgumentError` is thrown +""" +function get_type(maxlen; types=def_types) + for T in types + maxlen <= sizeof(T) - size_bytes(T) && return ShortString{T} + end + throw(ArgumentError("$maxlen is too large to fit into any of the provided types: $types")) +end + +ShortString(str::Union{String,SubString{String}}, maxlen = sizeof(str); types=def_types) = + get_type(maxlen, types=types)(str) + +"""Create a ShortString from a string literal, using `max` (parsed as an `Int`) as the maximum length, or the size of the literal when `max` is zero""" +macro ss_str(str, max="0") + :( ShortString($str, $(let n = parse(Int, max); n > 0 ? n : sizeof(str) end)) ) +end + fsort(v::Vector{ShortString{T}}; rev = false) where {T} = sort(v, rev = rev, by = size_content, alg = RadixSort) fsort!(v::Vector{ShortString{T}}; rev = false) where {T} =