From 6059c89cb0a14eea79e96ab32e96454274ef9d4f Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Sat, 31 Oct 2020 13:43:25 -0400 Subject: [PATCH] Use MurmurHash3 to allow for fast in-memory hashing with no conversion --- Project.toml | 4 +++- src/ShortStrings.jl | 3 +-- src/base.jl | 42 ++++++++++++++++++++++++++---------------- src/hash.jl | 9 +++++---- 4 files changed, 35 insertions(+), 23 deletions(-) diff --git a/Project.toml b/Project.toml index 5d9a346..a439680 100644 --- a/Project.toml +++ b/Project.toml @@ -1,13 +1,15 @@ name = "ShortStrings" uuid = "63221d1c-8677-4ff0-9126-0ff0817b4975" authors = ["Dai ZJ "] -version = "0.2.6" +version = "0.2.7" [deps] BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1" SortingAlgorithms = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +MurmurHash3 = "b10b62ed-fbae-5ea5-b934-abaf0477b71d" [compat] +MurmurHash3 = "1.1" BitIntegers = "0.2" SortingAlgorithms = "0.3" julia = "1" diff --git a/src/ShortStrings.jl b/src/ShortStrings.jl index ddd07f9..3b3ba71 100644 --- a/src/ShortStrings.jl +++ b/src/ShortStrings.jl @@ -2,12 +2,11 @@ module ShortStrings using BitIntegers using SortingAlgorithms + export fsort, fsort!, ShortString, ShortString3, ShortString7, ShortString15, ShortString30, ShortString62, ShortString126, @ss3_str, @ss7_str, @ss15_str, @ss30_str, @ss62_str, @ss126_str -export hash # from hash.jl - include("base.jl") include("hash.jl") diff --git a/src/base.jl b/src/base.jl index 18dd9d0..958928e 100644 --- a/src/base.jl +++ b/src/base.jl @@ -2,7 +2,7 @@ import Base:unsafe_getindex, ==, show, promote_rule -struct ShortString{T} <: AbstractString where T +struct ShortString{T} <: AbstractString where {T} size_content::T end @@ -14,10 +14,16 @@ function check_size(T, sz) end end -function ShortString{T}(s::Union{String, SubString{String}}) where T +function ShortString{T}(s::Union{String, SubString{String}}) where {T} sz = sizeof(s) check_size(T, sz) bits_to_wipe = 8(sizeof(T) - sz) + + # Warning: if a SubString is at the very end of a string, which is at the end of allocated + # memory, this can cause an access violation, by trying to access past the end + # (for example, reading a 1 byte substring at the end of a length 119 string, could go past + # the end) + # TODO some times this can throw errors for longish strings # Exception: EXCEPTION_ACCESS_VIOLATION at 0x1e0b7afd -- bswap at C:\Users\RTX2080\.julia\packages\BitIntegers\xU40U\src\BitIntegers.jl:332 [inlined] # ntoh at .\io.jl:541 [inlined] @@ -25,7 +31,7 @@ function ShortString{T}(s::Union{String, SubString{String}}) where T ShortString{T}(content | T(sz)) end -ShortString{T}(s::ShortString{T}) where T = s +ShortString{T}(s::ShortString{T}) where {T} = s function ShortString{T}(s::ShortString{S}) where {T, S} sz = sizeof(s) check_size(T, sz) @@ -44,27 +50,30 @@ Base.codeunit(s::ShortString) = UInt8 Base.codeunit(s::ShortString, i) = codeunits(String(s), i) Base.codeunit(s::ShortString, i::Integer) = codeunit(String(s), i) Base.codeunits(s::ShortString) = codeunits(String(s)) -Base.convert(::ShortString{T}, s::String) where T = ShortString{T}(s) + +Base.convert(::ShortString{T}, s::String) where {T} = ShortString{T}(s) Base.convert(::String, ss::ShortString) = String(ss) -Base.display(s::ShortString) = display(String(s)) + +Base.sizeof(s::ShortString{T}) where {T} = Int(s.size_content & (size_mask(s) % UInt)) Base.firstindex(::ShortString) = 1 Base.isvalid(s::ShortString, i::Integer) = isvalid(String(s), i) Base.iterate(s::ShortString) = iterate(String(s)) Base.iterate(s::ShortString, i::Integer) = iterate(String(s), i) Base.lastindex(s::ShortString) = sizeof(s) Base.ncodeunits(s::ShortString) = sizeof(s) + +Base.display(s::ShortString) = display(String(s)) Base.print(s::ShortString) = print(String(s)) Base.show(io::IO, str::ShortString) = show(io, String(str)) -Base.sizeof(s::ShortString{T}) where T = Int(s.size_content & (size_mask(s) % UInt)) size_nibbles(::Type{<:Union{UInt16, UInt32, UInt64, UInt128}}) = 1 size_nibbles(::Type{<:Union{Int16, Int32, Int64, Int128}}) = 1 size_nibbles(::Type{<:Union{UInt256, UInt512, UInt1024}}) = 2 size_nibbles(::Type{<:Union{Int256, Int512, Int1024}}) = 2 -size_nibbles(::Type{T}) where T = ceil(log2(sizeof(T))/4) +size_nibbles(::Type{T}) where {T} = ceil(log2(sizeof(T))/4) size_mask(T) = T(exp2(4*size_nibbles(T)) - 1) -size_mask(s::ShortString{T}) where T = size_mask(T) +size_mask(s::ShortString{T}) where {T} = size_mask(T) # function Base.getindex(s::ShortString, i::Integer) @@ -77,7 +86,7 @@ size_mask(s::ShortString{T}) where T = size_mask(T) Base.collect(s::ShortString) = collect(String(s)) -function ==(s::ShortString{S}, b::Union{String, SubString{String}}) where S +function ==(s::ShortString{S}, b::Union{String, SubString{String}}) where {S} ncodeunits(b) == ncodeunits(s) || return false return s == ShortString{S}(b) end @@ -88,7 +97,7 @@ function ==(s::ShortString, b::AbstractString) end ==(a::AbstractString, b::ShortString) = b == a -function ==(a::ShortString{S}, b::ShortString{S}) where S +function ==(a::ShortString{S}, b::ShortString{S}) where {S} return a.size_content == b.size_content end function ==(a::ShortString{A}, b::ShortString{B}) where {A,B} @@ -98,12 +107,11 @@ function ==(a::ShortString{A}, b::ShortString{B}) where {A,B} ntoh(a.size_content & ~size_mask(A)) == ntoh(b.size_content & ~size_mask(B)) end - -function Base.cmp(a::ShortString{S}, b::ShortString{S}) where S +function Base.cmp(a::ShortString{S}, b::ShortString{S}) where {S} return cmp(a.size_content, b.size_content) end -promote_rule(::Type{String}, ::Type{ShortString{S}}) where S = String +promote_rule(::Type{String}, ::Type{ShortString{S}}) where {S} = String function promote_rule(::Type{ShortString{T}}, ::Type{ShortString{S}}) where {T,S} if sizeof(T) >= sizeof(S) @@ -126,7 +134,9 @@ for T in (UInt1024, UInt512, UInt256, UInt128, UInt64, UInt32) end end -fsort(v::Vector{ShortString{T}}; rev = false) where T = sort(v, rev = rev, by = size_content, alg = RadixSort) -fsort!(v::Vector{ShortString{T}}; rev = false) where T = sort!(v, rev = rev, by = size_content, alg = RadixSort) +fsort(v::Vector{ShortString{T}}; rev = false) where {T} = + sort(v, rev = rev, by = size_content, alg = RadixSort) +fsort!(v::Vector{ShortString{T}}; rev = false) where {T} = + sort!(v, rev = rev, by = size_content, alg = RadixSort) -fsortperm(v::Vector{ShortString{T}}; rev = false) where T = sortperm(v, rev = rev) +fsortperm(v::Vector{ShortString{T}}; rev = false) where {T} = sortperm(v, rev = rev) diff --git a/src/hash.jl b/src/hash.jl index 467dcac..7713864 100644 --- a/src/hash.jl +++ b/src/hash.jl @@ -1,5 +1,6 @@ -export hash +using MurmurHash3: mmhash128_a -import Base.hash - -Base.hash(x::ShortString, h::UInt) = hash(String(x), h) +function Base.hash(x::ShortString, h::UInt) + h += Base.memhash_seed + last(mmhash128_a(sizeof(x), bswap(x.size_content), h%UInt32)) + h +end