From b545cf8000c494713c55899a7dd75f1dbae811fd Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Thu, 15 Oct 2020 21:33:16 -0600 Subject: [PATCH] Improve inferrability of getproperty(::Row2, ::Symbol) Fixes #752. This is a case of us not providing just not quite enough information to the compiler, along with the compiler itself being too clever. The default for `CSV.Rows` is to treat each column as `Union{String, Missing}`, which results in the `V` type parameter of `CSV.Rows` being `CSV.PosLen`, instead of `Any`. If that's the case, we should get pretty good inferrability for `getproperty(::Row2, ::Symbol)`, because we should be able to know the return value will at least be `Union{String, Missing}`. This knowledge, however, was trapped in the "csv domain" and not expressed clearly enough to the compiler. It inspected `Tables.getcolumn(::Row2, nm::Symbol)` and saw that it called `Tables.getcolumn(::Row2, i::Int)`, which in turn called `Tables.getcolumn(::Row2, T, i, nm)`. This is all fine an expected, except that when we started supporting non-String types for `CSV.Rows` (i.e. you can pass in whatever type you want and we'll parse it directly from the file for each row), we added an additional typed `Tables.getcolumn` method that handled all the non-String columns. Oops. Now the compiler is confused because from `Tables.getcolumn(::Row2, nm::Symbol)` it knows that it can return `missing`, a `String`, or if we call this third method, it'll return an instance of our `V` type parameter, which, if you'll remember, in the default case is `CSV.PosLen`, or more simply, `UInt64`. So we ended up with a return type of `Union{Missing, UInt64, String}`, which makes downstream operations even trickier to figure out. Luckily, the solution here is to just help connect the dots for the compiler: i.e. define specialize methods that dispatch on `V`, specifically when `V === UInt64`. Then the compiler will see/know that we will only ever call the `Union{String, Missing}` method and can ignore the custom types codepath. This PR also rearranges a few `@inbounds` uses since we can avoid the bounds checks further down the stack once we've checked them higher up. --- src/rows.jl | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/rows.jl b/src/rows.jl index 1417af54..3e3fb6e9 100644 --- a/src/rows.jl +++ b/src/rows.jl @@ -278,6 +278,8 @@ Base.checkbounds(r::Row2, i) = 0 < i < length(r) Tables.getcolumn(r::Row2, nm::Symbol) = Tables.getcolumn(r, getlookup(r)[nm]) Tables.getcolumn(r::Row2, i::Int) = Tables.getcolumn(r, gettypes(r)[i], i, getnames(r)[i]) +Tables.getcolumn(r::Row2{O, O2, PosLen}, nm::Symbol) where {O, O2} = @inbounds Tables.getcolumn(r, getlookup(r)[nm]) + Base.@propagate_inbounds function Tables.getcolumn(r::Row2, ::Type{Missing}, i::Int, nm::Symbol) @boundscheck checkbounds(r, i) return missing @@ -290,6 +292,8 @@ Base.@propagate_inbounds function Tables.getcolumn(r::Row2, ::Type{T}, i::Int, n return x end +Base.@propagate_inbounds Tables.getcolumn(r::Row2{O, O2, PosLen}, ::Type{T}, i::Int, nm::Symbol) where {T, O, O2} = error("row values are string only; requested type $T not supported; see `Parsers.parse(row, $T, $i)`") + Base.@propagate_inbounds function Tables.getcolumn(r::Row2, ::Union{Type{Union{Missing, String}}, Type{String}}, i::Int, nm::Symbol) @boundscheck checkbounds(r, i) j = getcolumnmap(r)[i] @@ -303,6 +307,18 @@ Base.@propagate_inbounds function Tables.getcolumn(r::Row2, ::Union{Type{Union{M end end +Tables.getcolumn(r::Row2{O, O2, PosLen}, ::Union{Type{Union{Missing, String}}, Type{String}}, i::Int, nm::Symbol) where {O, O2} = Tables.getcolumn(r, i) +Base.@propagate_inbounds function Tables.getcolumn(r::Row2{O, O2, PosLen}, i::Int) where {O, O2} + @boundscheck checkbounds(r, i) + @inbounds j = getcolumnmap(r)[i] + @inbounds poslen = getvalues(r)[j] + if poslen isa Missing + return missing + else + return @inbounds str(getbuf(r), gete(r), poslen) + end +end + @noinline stringsonly() = error("Parsers.parse only allowed on String column types") Base.@propagate_inbounds function Parsers.parse(::Type{T}, r::Row2, i::Int) where {T}