-
Notifications
You must be signed in to change notification settings - Fork 9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use MurmurHash3 to allow for fast in-memory hashing with no conversion #26
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,15 @@ | ||
name = "ShortStrings" | ||
uuid = "63221d1c-8677-4ff0-9126-0ff0817b4975" | ||
authors = ["Dai ZJ <[email protected]>"] | ||
version = "0.2.6" | ||
version = "0.2.7" | ||
|
||
[deps] | ||
BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1" | ||
SortingAlgorithms = "a2af1166-a08f-5f64-846c-94a0d3cef48c" | ||
MurmurHash3 = "b10b62ed-fbae-5ea5-b934-abaf0477b71d" | ||
|
||
[compat] | ||
MurmurHash3 = "1.1" | ||
BitIntegers = "0.2" | ||
SortingAlgorithms = "0.3" | ||
julia = "1" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ | |
|
||
import Base:unsafe_getindex, ==, show, promote_rule | ||
|
||
struct ShortString{T} <: AbstractString where T | ||
struct ShortString{T} <: AbstractString where {T} | ||
size_content::T | ||
end | ||
|
||
|
@@ -14,18 +14,24 @@ function check_size(T, sz) | |
end | ||
end | ||
|
||
function ShortString{T}(s::Union{String, SubString{String}}) where T | ||
function ShortString{T}(s::Union{String, SubString{String}}) where {T} | ||
sz = sizeof(s) | ||
check_size(T, sz) | ||
bits_to_wipe = 8(sizeof(T) - sz) | ||
|
||
# Warning: if a SubString is at the very end of a string, which is at the end of allocated | ||
# memory, this can cause an access violation, by trying to access past the end | ||
# (for example, reading a 1 byte substring at the end of a length 119 string, could go past | ||
# the end) | ||
|
||
# TODO sometimes this can throw errors for longish strings | ||
# Exception: EXCEPTION_ACCESS_VIOLATION at 0x1e0b7afd -- bswap at C:\Users\RTX2080\.julia\packages\BitIntegers\xU40U\src\BitIntegers.jl:332 [inlined] | ||
# ntoh at .\io.jl:541 [inlined] | ||
content = (T(s |> pointer |> Ptr{T} |> Base.unsafe_load |> ntoh) >> bits_to_wipe) << bits_to_wipe | ||
ShortString{T}(content | T(sz)) | ||
end | ||
|
||
ShortString{T}(s::ShortString{T}) where T = s | ||
ShortString{T}(s::ShortString{T}) where {T} = s | ||
function ShortString{T}(s::ShortString{S}) where {T, S} | ||
sz = sizeof(s) | ||
check_size(T, sz) | ||
|
@@ -44,27 +50,30 @@ Base.codeunit(s::ShortString) = UInt8 | |
Base.codeunit(s::ShortString, i) = codeunits(String(s), i) | ||
Base.codeunit(s::ShortString, i::Integer) = codeunit(String(s), i) | ||
Base.codeunits(s::ShortString) = codeunits(String(s)) | ||
Base.convert(::ShortString{T}, s::String) where T = ShortString{T}(s) | ||
|
||
Base.convert(::ShortString{T}, s::String) where {T} = ShortString{T}(s) | ||
Base.convert(::String, ss::ShortString) = String(ss) | ||
Base.display(s::ShortString) = display(String(s)) | ||
|
||
Base.sizeof(s::ShortString{T}) where {T} = Int(s.size_content & (size_mask(s) % UInt)) | ||
Base.firstindex(::ShortString) = 1 | ||
Base.isvalid(s::ShortString, i::Integer) = isvalid(String(s), i) | ||
Base.iterate(s::ShortString) = iterate(String(s)) | ||
Base.iterate(s::ShortString, i::Integer) = iterate(String(s), i) | ||
Base.lastindex(s::ShortString) = sizeof(s) | ||
Base.ncodeunits(s::ShortString) = sizeof(s) | ||
|
||
Base.display(s::ShortString) = display(String(s)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. isn't There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I just moved that to organize them a bit better, but I think you are right. |
||
Base.print(s::ShortString) = print(String(s)) | ||
Base.show(io::IO, str::ShortString) = show(io, String(str)) | ||
Base.sizeof(s::ShortString{T}) where T = Int(s.size_content & (size_mask(s) % UInt)) | ||
|
||
size_nibbles(::Type{<:Union{UInt16, UInt32, UInt64, UInt128}}) = 1 | ||
size_nibbles(::Type{<:Union{Int16, Int32, Int64, Int128}}) = 1 | ||
size_nibbles(::Type{<:Union{UInt256, UInt512, UInt1024}}) = 2 | ||
size_nibbles(::Type{<:Union{Int256, Int512, Int1024}}) = 2 | ||
size_nibbles(::Type{T}) where T = ceil(log2(sizeof(T))/4) | ||
size_nibbles(::Type{T}) where {T} = ceil(log2(sizeof(T))/4) | ||
|
||
size_mask(T) = T(exp2(4*size_nibbles(T)) - 1) | ||
size_mask(s::ShortString{T}) where T = size_mask(T) | ||
size_mask(s::ShortString{T}) where {T} = size_mask(T) | ||
|
||
|
||
# function Base.getindex(s::ShortString, i::Integer) | ||
|
@@ -77,7 +86,7 @@ size_mask(s::ShortString{T}) where T = size_mask(T) | |
|
||
Base.collect(s::ShortString) = collect(String(s)) | ||
|
||
function ==(s::ShortString{S}, b::Union{String, SubString{String}}) where S | ||
function ==(s::ShortString{S}, b::Union{String, SubString{String}}) where {S} | ||
ncodeunits(b) == ncodeunits(s) || return false | ||
return s == ShortString{S}(b) | ||
end | ||
|
@@ -88,7 +97,7 @@ function ==(s::ShortString, b::AbstractString) | |
end | ||
|
||
==(a::AbstractString, b::ShortString) = b == a | ||
function ==(a::ShortString{S}, b::ShortString{S}) where S | ||
function ==(a::ShortString{S}, b::ShortString{S}) where {S} | ||
return a.size_content == b.size_content | ||
end | ||
function ==(a::ShortString{A}, b::ShortString{B}) where {A,B} | ||
|
@@ -98,12 +107,11 @@ function ==(a::ShortString{A}, b::ShortString{B}) where {A,B} | |
ntoh(a.size_content & ~size_mask(A)) == ntoh(b.size_content & ~size_mask(B)) | ||
end | ||
|
||
|
||
function Base.cmp(a::ShortString{S}, b::ShortString{S}) where S | ||
function Base.cmp(a::ShortString{S}, b::ShortString{S}) where {S} | ||
return cmp(a.size_content, b.size_content) | ||
end | ||
|
||
promote_rule(::Type{String}, ::Type{ShortString{S}}) where S = String | ||
promote_rule(::Type{String}, ::Type{ShortString{S}}) where {S} = String | ||
|
||
function promote_rule(::Type{ShortString{T}}, ::Type{ShortString{S}}) where {T,S} | ||
if sizeof(T) >= sizeof(S) | ||
|
@@ -126,7 +134,9 @@ for T in (UInt1024, UInt512, UInt256, UInt128, UInt64, UInt32) | |
end | ||
end | ||
|
||
fsort(v::Vector{ShortString{T}}; rev = false) where T = sort(v, rev = rev, by = size_content, alg = RadixSort) | ||
fsort!(v::Vector{ShortString{T}}; rev = false) where T = sort!(v, rev = rev, by = size_content, alg = RadixSort) | ||
fsort(v::Vector{ShortString{T}}; rev = false) where {T} = | ||
sort(v, rev = rev, by = size_content, alg = RadixSort) | ||
fsort!(v::Vector{ShortString{T}}; rev = false) where {T} = | ||
sort!(v, rev = rev, by = size_content, alg = RadixSort) | ||
|
||
fsortperm(v::Vector{ShortString{T}}; rev = false) where T = sortperm(v, rev = rev) | ||
fsortperm(v::Vector{ShortString{T}}; rev = false) where {T} = sortperm(v, rev = rev) |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,5 +1,6 @@ | ||||||
export hash | ||||||
using MurmurHash3: mmhash128_a | ||||||
|
||||||
import Base.hash | ||||||
|
||||||
Base.hash(x::ShortString, h::UInt) = hash(String(x), h) | ||||||
function Base.hash(x::ShortString, h::UInt) | ||||||
h += Base.memhash_seed | ||||||
last(mmhash128_a(sizeof(x), bswap(x.size_content), h%UInt32)) + h | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I wonder if this should be:
Suggested change
to be more semantic |
||||||
end |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We need a proper fix for this.