Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use MurmurHash3 to allow for fast in-memory hashing with no conversion #26

Merged
merged 1 commit into from
Nov 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
name = "ShortStrings"
uuid = "63221d1c-8677-4ff0-9126-0ff0817b4975"
authors = ["Dai ZJ <[email protected]>"]
version = "0.2.6"
version = "0.2.7"

[deps]
BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1"
SortingAlgorithms = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
MurmurHash3 = "b10b62ed-fbae-5ea5-b934-abaf0477b71d"

[compat]
MurmurHash3 = "1.1"
BitIntegers = "0.2"
SortingAlgorithms = "0.3"
julia = "1"
Expand Down
3 changes: 1 addition & 2 deletions src/ShortStrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@ module ShortStrings

using BitIntegers
using SortingAlgorithms

export fsort, fsort!, ShortString,
ShortString3, ShortString7, ShortString15, ShortString30, ShortString62, ShortString126,
@ss3_str, @ss7_str, @ss15_str, @ss30_str, @ss62_str, @ss126_str

export hash # from hash.jl

include("base.jl")
include("hash.jl")

Expand Down
42 changes: 26 additions & 16 deletions src/base.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import Base:unsafe_getindex, ==, show, promote_rule

struct ShortString{T} <: AbstractString where T
struct ShortString{T} <: AbstractString where {T}
size_content::T
end

Expand All @@ -14,18 +14,24 @@ function check_size(T, sz)
end
end

function ShortString{T}(s::Union{String, SubString{String}}) where T
function ShortString{T}(s::Union{String, SubString{String}}) where {T}
sz = sizeof(s)
check_size(T, sz)
bits_to_wipe = 8(sizeof(T) - sz)

# Warning: the `unsafe_load` below reads a full `T` starting at `pointer(s)`, even when `s`
# is shorter than `sizeof(T)`. If `s` is a SubString at the very end of its parent string,
# and the parent is at the end of allocated memory, the read can go past the end and cause
# an access violation (for example, reading a 1-byte substring at the end of a length-119
# string could go past the end).

Comment on lines +21 to +26
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need a proper fix for this.

# TODO: sometimes this can throw errors for longish strings
# Exception: EXCEPTION_ACCESS_VIOLATION at 0x1e0b7afd -- bswap at C:\Users\RTX2080\.julia\packages\BitIntegers\xU40U\src\BitIntegers.jl:332 [inlined]
# ntoh at .\io.jl:541 [inlined]
content = (T(s |> pointer |> Ptr{T} |> Base.unsafe_load |> ntoh) >> bits_to_wipe) << bits_to_wipe
ShortString{T}(content | T(sz))
end

ShortString{T}(s::ShortString{T}) where T = s
ShortString{T}(s::ShortString{T}) where {T} = s
function ShortString{T}(s::ShortString{S}) where {T, S}
sz = sizeof(s)
check_size(T, sz)
Expand All @@ -44,27 +50,30 @@ Base.codeunit(s::ShortString) = UInt8
Base.codeunit(s::ShortString, i) = codeunits(String(s), i)
Base.codeunit(s::ShortString, i::Integer) = codeunit(String(s), i)
Base.codeunits(s::ShortString) = codeunits(String(s))
Base.convert(::ShortString{T}, s::String) where T = ShortString{T}(s)

Base.convert(::ShortString{T}, s::String) where {T} = ShortString{T}(s)
Base.convert(::String, ss::ShortString) = String(ss)
Base.display(s::ShortString) = display(String(s))

Base.sizeof(s::ShortString{T}) where {T} = Int(s.size_content & (size_mask(s) % UInt))
# Indexing and iteration methods for ShortString.
# `isvalid` and `iterate` build a `String` from the ShortString and forward the call to it;
# `lastindex` and `ncodeunits` both return the stored byte length (`sizeof(s)`).
Base.firstindex(::ShortString) = 1
Base.isvalid(s::ShortString, i::Integer) = isvalid(String(s), i)
Base.iterate(s::ShortString) = iterate(String(s))
Base.iterate(s::ShortString, i::Integer) = iterate(String(s), i)
# NOTE(review): this returns the byte count, which is only a valid character index when the
# final character is a single code unit; for a string ending in a multi-byte character,
# `lastindex(String(s))` would be smaller than `sizeof(s)` -- confirm whether that is intended.
Base.lastindex(s::ShortString) = sizeof(s)
Base.ncodeunits(s::ShortString) = sizeof(s)

Base.display(s::ShortString) = display(String(s))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't `show` enough?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just moved that line to organize the methods a bit better, but I think you are right.

Base.print(s::ShortString) = print(String(s))
Base.show(io::IO, str::ShortString) = show(io, String(str))
Base.sizeof(s::ShortString{T}) where T = Int(s.size_content & (size_mask(s) % UInt))

# Number of 4-bit nibbles used for the size field of a backing integer type.
# `size_mask` turns this into a mask over the low `4 * size_nibbles(T)` bits, and `sizeof`
# reads the string's byte length from `size_content` through that mask.
# 16- to 128-bit integer types use 1 nibble; 256- to 1024-bit types use 2 nibbles.
size_nibbles(::Type{<:Union{UInt16, UInt32, UInt64, UInt128}}) = 1
size_nibbles(::Type{<:Union{Int16, Int32, Int64, Int128}}) = 1
size_nibbles(::Type{<:Union{UInt256, UInt512, UInt1024}}) = 2
size_nibbles(::Type{<:Union{Int256, Int512, Int1024}}) = 2
size_nibbles(::Type{T}) where T = ceil(log2(sizeof(T))/4)
size_nibbles(::Type{T}) where {T} = ceil(log2(sizeof(T))/4)

size_mask(T) = T(exp2(4*size_nibbles(T)) - 1)
size_mask(s::ShortString{T}) where T = size_mask(T)
size_mask(s::ShortString{T}) where {T} = size_mask(T)


# function Base.getindex(s::ShortString, i::Integer)
Expand All @@ -77,7 +86,7 @@ size_mask(s::ShortString{T}) where T = size_mask(T)

Base.collect(s::ShortString) = collect(String(s))

function ==(s::ShortString{S}, b::Union{String, SubString{String}}) where S
function ==(s::ShortString{S}, b::Union{String, SubString{String}}) where {S}
ncodeunits(b) == ncodeunits(s) || return false
return s == ShortString{S}(b)
end
Expand All @@ -88,7 +97,7 @@ function ==(s::ShortString, b::AbstractString)
end

==(a::AbstractString, b::ShortString) = b == a
function ==(a::ShortString{S}, b::ShortString{S}) where S
function ==(a::ShortString{S}, b::ShortString{S}) where {S}
return a.size_content == b.size_content
end
function ==(a::ShortString{A}, b::ShortString{B}) where {A,B}
Expand All @@ -98,12 +107,11 @@ function ==(a::ShortString{A}, b::ShortString{B}) where {A,B}
ntoh(a.size_content & ~size_mask(A)) == ntoh(b.size_content & ~size_mask(B))
end


function Base.cmp(a::ShortString{S}, b::ShortString{S}) where S
function Base.cmp(a::ShortString{S}, b::ShortString{S}) where {S}
return cmp(a.size_content, b.size_content)
end

promote_rule(::Type{String}, ::Type{ShortString{S}}) where S = String
promote_rule(::Type{String}, ::Type{ShortString{S}}) where {S} = String

function promote_rule(::Type{ShortString{T}}, ::Type{ShortString{S}}) where {T,S}
if sizeof(T) >= sizeof(S)
Expand All @@ -126,7 +134,9 @@ for T in (UInt1024, UInt512, UInt256, UInt128, UInt64, UInt32)
end
end

fsort(v::Vector{ShortString{T}}; rev = false) where T = sort(v, rev = rev, by = size_content, alg = RadixSort)
fsort!(v::Vector{ShortString{T}}; rev = false) where T = sort!(v, rev = rev, by = size_content, alg = RadixSort)
fsort(v::Vector{ShortString{T}}; rev = false) where {T} =
sort(v, rev = rev, by = size_content, alg = RadixSort)
fsort!(v::Vector{ShortString{T}}; rev = false) where {T} =
sort!(v, rev = rev, by = size_content, alg = RadixSort)

fsortperm(v::Vector{ShortString{T}}; rev = false) where T = sortperm(v, rev = rev)
fsortperm(v::Vector{ShortString{T}}; rev = false) where {T} = sortperm(v, rev = rev)
9 changes: 5 additions & 4 deletions src/hash.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
export hash
using MurmurHash3: mmhash128_a

import Base.hash

Base.hash(x::ShortString, h::UInt) = hash(String(x), h)
function Base.hash(x::ShortString, h::UInt)
h += Base.memhash_seed
last(mmhash128_a(sizeof(x), bswap(x.size_content), h%UInt32)) + h
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if this should be:

Suggested change
last(mmhash128_a(sizeof(x), bswap(x.size_content), h%UInt32)) + h
last(mmhash128_a(ncodeunits(x), bswap(x.size_content), h%UInt32)) + h

to be more semantically clear.

end