Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert to using only symbols for column names. #509

Merged
merged 1 commit into from
Jan 29, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/RDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -306,4 +306,5 @@ function data(rc::RComplex)
BitArray(imag(rc.data) .== R_NA_FLOAT64))
end

DataFrame(rl::RList) = DataFrame(map(x->data(x), rl.data), rl.attr["names"].data)
DataFrame(rl::RList) = DataFrame(map(x->data(x), rl.data),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we're reading files in RDA format, we probably need to clean the column names before we can convert them to symbols, since there will almost certainly be columns with a `.` in their names.

Symbol[symbol(x) for x in rl.attr["names"].data])
27 changes: 13 additions & 14 deletions src/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ end
function DataFrame(;kwargs...)
result = DataFrame({}, Index())
for (k, v) in kwargs
result[string(k)] = v
result[k] = v
end
return result
end
Expand All @@ -73,16 +73,16 @@ end
#' @description
#'
#' Construct a DataFrame from a vector of columns and, optionally, specify
#' the names of the columns as a vector of strings.
#' the names of the columns as a vector of symbols.
#'
#' @returns df::DataFrame A newly constructed DataFrame.
#'
#' @examples
#'
#' df = DataFrame()
#' df = DataFrame(A = 1:3, B = ["x", "y", "z"])
function DataFrame{T <: String}(columns::Vector{Any},
cnames::Vector{T} = gennames(length(columns)))
function DataFrame(columns::Vector{Any},
cnames::Vector{Symbol} = gennames(length(columns)))
return DataFrame(columns, Index(cnames))
end

Expand Down Expand Up @@ -131,10 +131,10 @@ end

# Pandas' Dict of Vectors -> DataFrame constructor w/ explicit column names
function DataFrame(d::Dict)
cnames = sort(convert(Array{ByteString, 1}, collect(keys(d))))
cnames = sort(Symbol[x for x in keys(d)])
p = length(cnames)
if p == 0
DataFrame()
return DataFrame()
end
n = length(d[cnames[1]])
columns = Array(Any, p)
Expand Down Expand Up @@ -221,7 +221,7 @@ function DataFrame{D <: Associative}(ds::Vector{D})
end

# Initialize from a Vector of Associatives (aka list of dicts)
function DataFrame{D <: Associative, T <: String}(ds::Vector{D}, ks::Vector{T})
function DataFrame{D <: Associative}(ds::Vector{D}, ks::Vector{Symbol})
invoke(DataFrame, (Vector{D}, Vector), ds, ks)
end

Expand Down Expand Up @@ -340,7 +340,7 @@ index(df::DataFrame) = df.colindex
# Let getindex(df.columns[j], row_inds) from AbstractDataVector() handle
# the resolution of row indices

typealias ColumnIndex Union(Real, String, Symbol)
typealias ColumnIndex Union(Real, Symbol)

# df[SingleColumnIndex] => AbstractDataVector
function Base.getindex(df::DataFrame, col_ind::ColumnIndex)
Expand Down Expand Up @@ -401,14 +401,13 @@ function create_new_column_from_scalar(df::DataFrame, val::Any)
return DataArray(col_data, falses(n))
end

isnextcol(df::DataFrame, col_ind::String) = true
isnextcol(df::DataFrame, col_ind::Symbol) = true
function isnextcol(df::DataFrame, col_ind::Real)
return ncol(df) + 1 == int(col_ind)
end

function nextcolname(df::DataFrame)
return string("x", ncol(df) + 1)
return symbol(string("x", ncol(df) + 1))
end

# Will automatically add a new column if needed
Expand All @@ -427,7 +426,7 @@ function insert_single_column!(df::DataFrame,
j = df.colindex[col_ind]
df.columns[j] = dv
else
if typeof(col_ind) <: String || typeof(col_ind) <: Symbol
if typeof(col_ind) <: Symbol
push!(df.colindex, col_ind)
push!(df.columns, dv)
else
Expand Down Expand Up @@ -1277,7 +1276,7 @@ function dict(adf::AbstractDataFrame, flatten::Bool)
# TODO: Make flatten an option
# TODO: Provide a de-data option that makes Vectors, not
# DataVectors
res = Dict{UTF8String, Any}()
res = Dict{Symbol, Any}()
if flatten && nrow(adf) == 1
for colname in names(adf)
res[colname] = adf[colname][1]
Expand All @@ -1304,12 +1303,12 @@ dict(adf::AbstractDataFrame) = dict(adf, false)

pool(a::AbstractVector) = compact(PooledDataArray(a))

function pool!(df::AbstractDataFrame, cname::Union(Integer, String))
function pool!(df::AbstractDataFrame, cname::Union(Integer, Symbol))
df[cname] = pool(df[cname])
return
end

function pool!{T <: Union(Integer, String)}(df::AbstractDataFrame, cnames::Vector{T})
function pool!{T <: Union(Integer, Symbol)}(df::AbstractDataFrame, cnames::Vector{T})
for cname in cnames
df[cname] = pool(df[cname])
end
Expand Down
2 changes: 1 addition & 1 deletion src/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ colwise(fns::Vector{Function}, d::GroupedDataFrame, cn::Vector{String}) = map(co
colwise(fns::Vector{Function}) = x -> colwise(fns, x)

function colwise(d::AbstractDataFrame, s::Vector{Symbol}, cn::Vector)
header = [s2 * "_" * string(s1) for s1 in s, s2 in cn][:]
header = [symbol(string(s2) * "_" * string(s1)) for s1 in s, s2 in cn][:]
payload = colwise(map(eval, s), d)
df = DataFrame()
# TODO fix this to assign the longest column first or preallocate
Expand Down
31 changes: 14 additions & 17 deletions src/index.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,22 @@ typealias Indices Union(Real, AbstractVector{Real})
abstract AbstractIndex

type Index <: AbstractIndex # an OrderedDict would be nice here...
lookup::Dict{ByteString, Indices} # name => names array position
names::Vector{ByteString}
lookup::Dict{Symbol, Indices} # name => names array position
names::Vector{Symbol}
end
function Index{T <: ByteString}(x::Vector{T})
x = make_unique(convert(Vector{ByteString}, x))
Index(Dict{ByteString, Indices}(tuple(x...), tuple([1:length(x)]...)), x)
function Index{T <: Symbol}(x::Vector{T})
x = make_unique(convert(Vector{Symbol}, x))
Index(Dict{Symbol, Indices}(tuple(x...), tuple([1:length(x)]...)), x)
end
Index() = Index(Dict{ByteString, Indices}(), ByteString[])
Index() = Index(Dict{Symbol, Indices}(), Symbol[])
Base.length(x::Index) = length(x.names)
Base.names(x::Index) = copy(x.names)
Base.copy(x::Index) = Index(copy(x.lookup), copy(x.names))
Base.deepcopy(x::Index) = Index(deepcopy(x.lookup), deepcopy(x.names))
Base.isequal(x::Index, y::Index) = isequal(x.lookup, y.lookup) && isequal(x.names, y.names)

# I think this should be Vector{T <: String}
function names!(x::Index, nm::Vector)
# Column names must be supplied as a Vector{Symbol}
function names!(x::Index, nm::Vector{Symbol})
if length(nm) != length(x)
error("lengths don't match.")
end
Expand Down Expand Up @@ -51,17 +51,16 @@ function rename!(x::Index, nms)
end

rename!(x::Index, from, to) = rename!(x, zip(from, to))
rename!(x::Index, from::String, to::String) = rename!(x, ((from, to),))
rename!(x::Index, from::Symbol, to::Symbol) = rename!(x, ((from, to),))
rename!(x::Index, f::Function) = rename!(x, [(x,f(x)) for x in x.names])

rename(x::Index, args...) = rename!(copy(x), args...)

Base.haskey(x::Index, key::String) = haskey(x.lookup, key)
Base.haskey(x::Index, key::Symbol) = haskey(x.lookup, string(key))
Base.haskey(x::Index, key::Symbol) = haskey(x.lookup, key)
Base.haskey(x::Index, key::Real) = 1 <= key <= length(x.names)
Base.keys(x::Index) = names(x)

function Base.push!(x::Index, nm::String)
function Base.push!(x::Index, nm::Symbol)
x.lookup[nm] = length(x) + 1
push!(x.names, nm)
return x
Expand All @@ -77,24 +76,22 @@ function Base.delete!(x::Index, idx::Integer)
return x
end

function Base.delete!(x::Index, nm::String)
function Base.delete!(x::Index, nm::Symbol)
if !haskey(x.lookup, nm)
return x
end
idx = x.lookup[nm]
return delete!(x, idx)
end

Base.getindex(x::Index, idx::String) = x.lookup[idx]
Base.getindex(x::Index, idx::Symbol) = x.lookup[string(idx)]
Base.getindex(x::Index, idx::Symbol) = x.lookup[idx]
Base.getindex(x::AbstractIndex, idx::Real) = int(idx)
Base.getindex(x::AbstractIndex, idx::AbstractDataVector{Bool}) = getindex(x, array(idx, false))
Base.getindex{T}(x::AbstractIndex, idx::AbstractDataVector{T}) = getindex(x, dropna(idx))
Base.getindex(x::AbstractIndex, idx::AbstractVector{Bool}) = find(idx)
Base.getindex(x::AbstractIndex, idx::Ranges) = [idx]
Base.getindex{T <: Real}(x::AbstractIndex, idx::AbstractVector{T}) = convert(Vector{Int}, idx)
Base.getindex{T <: String}(x::AbstractIndex, idx::AbstractVector{T}) = [[x.lookup[i] for i in idx]...]
Base.getindex{T <: Symbol}(x::AbstractIndex, idx::AbstractVector{T}) = [[x.lookup[string(i)] for i in idx]...]
Base.getindex(x::AbstractIndex, idx::AbstractVector{Symbol}) = [[x.lookup[i] for i in idx]...]

type SimpleIndex <: AbstractIndex
length::Integer
Expand Down
12 changes: 6 additions & 6 deletions src/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ immutable ParsedCSV
quoted::BitVector # Was field quoted in text
end

immutable ParseOptions{S <: ByteString, T <: ByteString}
immutable ParseOptions{S <: ByteString}
header::Bool
separator::Char
quotemarks::Vector{Char}
Expand All @@ -16,7 +16,7 @@ immutable ParseOptions{S <: ByteString, T <: ByteString}
truestrings::Vector{S}
falsestrings::Vector{S}
makefactors::Bool
colnames::Vector{T}
colnames::Vector{Symbol}
cleannames::Bool
coltypes::Vector{DataType}
allowcomments::Bool
Expand Down Expand Up @@ -582,7 +582,7 @@ function builddf(rows::Integer,
end
end

function parsecolnames!(colnames::Vector{UTF8String},
function parsecolnames!(colnames::Vector{Symbol},
bytes::Vector{Uint8},
bounds::Vector{Int},
fields::Int)
Expand All @@ -596,9 +596,9 @@ function parsecolnames!(colnames::Vector{UTF8String},
left = bounds[j] + 2
right = bounds[j + 1]
if bytes[right] == '\r' || bytes[right] == '\n'
colnames[j] = bytestring(bytes[left:(right - 1)])
colnames[j] = symbol(bytestring(bytes[left:(right - 1)]))
else
colnames[j] = bytestring(bytes[left:right])
colnames[j] = symbol(bytestring(bytes[left:right]))
end
end

Expand Down Expand Up @@ -705,7 +705,7 @@ function readtable(pathname::String;
falsestrings::Vector = ASCIIString["F", "f", "FALSE", "false"],
makefactors::Bool = false,
nrows::Int = -1,
colnames::Vector = UTF8String[],
colnames::Vector = Symbol[],
cleannames::Bool = false,
coltypes::Vector{DataType} = DataType[],
allowcomments::Bool = false,
Expand Down
6 changes: 3 additions & 3 deletions src/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
##############################################################################

function stack(df::DataFrame, measure_vars::Vector{Int}, id_vars::Vector{Int})
res = DataFrame[insert!(df[[i, id_vars]], 1, names(df)[i], "variable") for i in measure_vars]
res = DataFrame[insert!(df[[i, id_vars]], 1, names(df)[i], :variable) for i in measure_vars]
# fix column names
map(x -> names!(x, ["variable", "value", names(df[id_vars])]), res)
map(x -> names!(x, [:variable, :value, names(df[id_vars])]), res)
res = vcat(res)
res
end
Expand Down Expand Up @@ -44,7 +44,7 @@ function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int)
Nrow = length(refkeycol.pool)
Ncol = length(keycol.pool)
# TODO make fillNA(type, length)
payload = DataFrame({DataArray([fill(valuecol[1],Nrow)], fill(true, Nrow)) for i in 1:Ncol}, map(string, keycol.pool))
payload = DataFrame({DataArray([fill(valuecol[1],Nrow)], fill(true, Nrow)) for i in 1:Ncol}, map(symbol, keycol.pool))
nowarning = true
for k in 1:nrow(df)
j = int(keycol.refs[k])
Expand Down
12 changes: 6 additions & 6 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ function _uniqueofsorted(x::Vector)
x[idx]
end

function make_unique{S<:ByteString}(names::Vector{S})
function make_unique(names::Vector{Symbol})
x = Index()
names = copy(names)
dups = Int[]
Expand All @@ -38,7 +38,7 @@ function make_unique{S<:ByteString}(names::Vector{S})
newnm = nm
k = 1
while true
newnm = "$(nm)_$k"
newnm = symbol("$(nm)_$k")
if !haskey(x, newnm)
push!(x, newnm)
break
Expand All @@ -53,19 +53,19 @@ end
#' @description
#'
#' Generate standardized names for columns of a DataFrame. The
#' first name will be "x1", the second "x2", etc.
#' first name will be :x1, the second :x2, etc.
#'
#' @field n::Integer The number of names to generate.
#'
#' @returns names::Vector{UTF8String} A vector of standardized column names.
#' @returns names::Vector{Symbol} A vector of standardized column names.
#'
#' @examples
#'
#' DataFrames.gennames(10)
function gennames(n::Integer)
res = Array(UTF8String, n)
res = Array(Symbol, n)
for i in 1:n
res[i] = @sprintf "x%d" i
res[i] = symbol(@sprintf "x%d" i)
end
return res
end
Expand Down
10 changes: 5 additions & 5 deletions test/RDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ module TestRDA
df = DataFrame(num = [1.1, 2.2])
@assert isequal(DataFrame(read_rda("test/data/RDA/minimal.rda")["df"]), df)

df["int"] = Int32[1, 2]
df["logi"] = [true, false]
df["chr"] = ["ab", "c"]
df["factor"] = pool(df["chr"])
df[:int] = Int32[1, 2]
df[:logi] = [true, false]
df[:chr] = ["ab", "c"]
df[:factor] = pool(df[:chr])
@assert isequal(DataFrame(read_rda("test/data/RDA/types.rda")["df"]), df)

df[2, :] = NA
df = df[:, ["num", "int", "logi", "factor"]] # (NA) chr breaks read_rda
df = df[:, [:num, :int, :logi, :factor]] # (NA) chr breaks read_rda
@assert isequal(DataFrame(read_rda("test/data/RDA/NAs.rda")["df"]), df)
end
17 changes: 8 additions & 9 deletions test/constructors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ module TestConstructors
@test isequal(df.colindex, Index())

df = DataFrame({data(zeros(3)), data(ones(3))},
Index(["x1", "x2"]))
Index([:x1, :x2]))
@test size(df, 1) == 3
@test size(df, 2) == 2

Expand All @@ -26,20 +26,19 @@ module TestConstructors
DataFrame([0.0 1.0;
0.0 1.0;
0.0 1.0],
["x1", "x2"]))
[:x1, :x2]))
@test isequal(df,
DataFrame([0.0 1.0;
0.0 1.0;
0.0 1.0]))
@test isequal(df,
DataFrame(data(zeros(3)), data(ones(3))))

@test isequal(df, DataFrame({"x1" => [0.0, 0.0, 0.0],
"x2" => [1.0, 1.0, 1.0]}))
@test isequal(df, DataFrame({"x1" => [0.0, 0.0, 0.0],
"x2" => [1.0, 1.0, 1.0],
"x3" => [2.0, 2.0, 2.0]},
["x1", "x2"]))
@test isequal(df, DataFrame(x1 = [0.0, 0.0, 0.0],
x2 = [1.0, 1.0, 1.0]))
@test isequal(df, DataFrame(x1 = [0.0, 0.0, 0.0],
x2 = [1.0, 1.0, 1.0],
x3 = [2.0, 2.0, 2.0])[[:x1, :x2]])

df = DataFrame(Int, 2, 2)
@test size(df) == (2, 2)
Expand All @@ -49,7 +48,7 @@ module TestConstructors
@test size(df) == (2, 2)
@test all(types(df) .== [Float64, Float64])

df = DataFrame([Int, Float64], ["x1", "x2"], 2)
df = DataFrame([Int, Float64], [:x1, :x2], 2)
@test size(df) == (2, 2)
@test all(types(df) .== {Int, Float64})

Expand Down
Loading