Skip to content

Commit

Permalink
fixed corruption of beyond-UInt8 characters
Browse files Browse the repository at this point in the history
  • Loading branch information
ExpandingMan committed Sep 11, 2018
1 parent e02a232 commit f78ac6c
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 18 deletions.
5 changes: 4 additions & 1 deletion src/lists.jl
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ totalbytes(A::AbstractList) = valuesbytes(A) + minbitmaskbytes(A) + offsetsbytes

# Helper for computing offsets: the size contributed by a single element `x`
# when the values are encoded with code-unit type `C`.

# Non-string elements: `C` is ignored and the size is simply `sizeof(x)`.
_offsize(::Type{C}, x) where {C} = sizeof(x)

# Strings: one `C`-sized slot per code unit. Code units are counted rather than
# characters (`length`), so characters that occupy several code units (for
# example non-ASCII characters in a UTF-8 `String`) are not undercounted.
_offsize(::Type{C}, x::AbstractString) where {C} = sizeof(C) * ncodeunits(x)

# TODO: decide how to compute sizeof for Arrow objects such as lists.
# Note that this works fine with missing values because sizeof(missing) == 0.
Expand All @@ -307,6 +307,9 @@ function offsets(::Type{K}, ::Type{C}, v::AbstractVector) where {K<:Integer,C}
off
end
# Two-argument forms of `offsets`: pick the value type from the element type of
# `v` and forward to the three-argument method `offsets(K, C, v)`.

# Generic vectors: the element type `C` of `v` is passed through as the value type.
offsets(::Type{K}, v::AbstractVector{C}) where {K<:Integer,C} = offsets(K, C, v)

# `String` vectors: offsets are computed with `UInt8` as the value type.
# `K` carries the same `<:Integer` bound as the generic method above so that
# this method is strictly more specific. With an unbounded `K`, a call such as
# `offsets(Int32, ["a"])` matches both methods and neither is more specific.
function offsets(::Type{K}, v::AbstractVector{String}) where {K<:Integer}
    return offsets(K, UInt8, v)
end
# One-argument form for string vectors: always throws an `ArgumentError`
# telling the caller to specify an encoding type.
offsets(v::AbstractVector{<:AbstractString}) = throw(
    ArgumentError("must specify encoding type for computing string offsets"),
)
Expand Down
35 changes: 18 additions & 17 deletions test/indexing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,8 @@ end
@testset "indexing_List_buffer" begin
len = 5
offstype = rand(OFFSET_ELTYPES)
offs = convert(Vector{offstype}, [0,4,7,8,12,14])
vals = convert(Vector{UInt8}, codeunits("firewalkwithme"))
offs = convert(Vector{offstype}, [0,7,15,18,24,30])
vals = convert(Vector{UInt8}, codeunits("fireα walk∀ with🐱 me🐟"))
valspad = zeros(UInt8, Arrow.paddinglength(length(vals)))
lpad = randpad()
rpad = randpad()
Expand All @@ -154,14 +154,14 @@ end
len, UInt8, length(vals))
@test offsets(l)[:] == offs
@test values(l)[:] == vals
@test l[1] == "fire"
@test l[2] == "wal"
@test l[3] == "k"
@test l[4] == "with"
@test l[5] == "me"
@test l[[1,3,5]] == ["fire", "k", "me"]
@test l[[false,true,false,true,false]] == ["wal", "with"]
@test l[:] == ["fire", "wal", "k", "with", "me"]
@test l[1] == "fireα "
@test l[2] == "walk∀ "
@test l[3] == "wit"
@test l[4] == "h🐱 "
@test l[5] == "me🐟"
@test l[[1,3,5]] == ["fireα ", "wit", "me🐟"]
@test l[[false,true,false,true,false]] == ["walk∀ ", "h🐱 "]
@test l[:] == ["fireα ", "walk∀ ", "wit", "h🐱 ", "me🐟"]
end


Expand All @@ -187,7 +187,8 @@ end
len = 7
offstype = rand(OFFSET_ELTYPES)
offs = convert(Vector{offstype}, [0,4,9,9,14,14,17,21])
vals = convert(Vector{UInt8}, codeunits("kirkspockbonesncc1701"))
offs = convert(Vector{offstype}, [0,9,17,17,28,28,31,35])
vals = convert(Vector{UInt8}, codeunits("kirk 🚀η spockbones, 💀ncc1701"))
valspad = zeros(UInt8, Arrow.paddinglength(length(vals)))
pres = Bool[true,true,false,true,false,true,true]
mask = Arrow.bitpackpadded(pres)
Expand All @@ -199,16 +200,16 @@ end
len, UInt8, length(vals))
@test offsets(l)[:] == offs
@test values(l)[:] == vals
@test l[1] == "kirk"
@test l[2] == "spock"
@test l[1] == "kirk 🚀"
@test l[2] == "η spock"
@test ismissing(l[3])
@test l[4] == "bones"
@test l[4] == "bones, 💀"
@test ismissing(l[5])
@test l[6] == "ncc"
@test l[7] == "1701"
@test l[[2,5,4]] ["spock", missing, "bones"]
@test l[[true,false,true,false,false,false,false]] ["kirk", missing]
@test l[:] ["kirk", "spock", missing, "bones", missing, "ncc", "1701"]
@test l[[2,5,4]] ["η spock", missing, "bones, 💀"]
@test l[[true,false,true,false,false,false,false]] ["kirk 🚀", missing]
@test l[:] ["kirk 🚀", "η spock", missing, "bones, 💀", missing, "ncc", "1701"]
end


Expand Down

0 comments on commit f78ac6c

Please sign in to comment.