JuliaData · tshort · Jan 29, 2014 · Jan 28, 2014 · johnmyleswhite · Jan 28, 2014
diff --git a/src/RDA.jl b/src/RDA.jl
@@ -306,4 +306,5 @@ function data(rc::RComplex)
               BitArray(imag(rc.data) .== R_NA_FLOAT64))
 end
 
-DataFrame(rl::RList) = DataFrame(map(x->data(x), rl.data), rl.attr["names"].data)
+DataFrame(rl::RList) = DataFrame(map(data, rl.data),  # one converted column per entry of rl.data
+                                 Symbol[symbol(nm) for nm in rl.attr["names"].data])
diff --git a/src/dataframe.jl b/src/dataframe.jl
@@ -52,7 +52,7 @@ end
 function DataFrame(;kwargs...)
     result = DataFrame({}, Index())
     for (k, v) in kwargs
-        result[string(k)] = v
+        result[k] = v  # k is the keyword's name as a Symbol, used directly as the column name
     end
     return result
 end
@@ -73,16 +73,16 @@ end
 #' @description
 #'
 #' Construct a DataFrame from a vector of columns and, optionally, specify
-#' the names of the columns as a vector of strings.
+#' the names of the columns as a vector of symbols.
 #'
 #' @returns df::DataFrame A newly constructed DataFrame.
 #'
 #' @examples
 #'
 #' df = DataFrame()
 #' df = DataFrame(A = 1:3, B = ["x", "y", "z"])
-function DataFrame{T <: String}(columns::Vector{Any},
-                                cnames::Vector{T} = gennames(length(columns)))
+function DataFrame(columns::Vector{Any},
+                   cnames::Vector{Symbol} = gennames(length(columns)))  # default: one generated name per column
     return DataFrame(columns, Index(cnames))
 end
 
@@ -131,10 +131,10 @@ end
 
 # Pandas' Dict of Vectors -> DataFrame constructor w/ explicit column names
 function DataFrame(d::Dict)
-    cnames = sort(convert(Array{ByteString, 1}, collect(keys(d))))
+    cnames = sort(Symbol[x for x in keys(d)])
     p = length(cnames)
     if p == 0
-        DataFrame()
+        return DataFrame()
     end
     n = length(d[cnames[1]])
     columns = Array(Any, p)
@@ -221,7 +221,7 @@ function DataFrame{D <: Associative}(ds::Vector{D})
 end
 
 # Initialize from a Vector of Associatives (aka list of dicts)
-function DataFrame{D <: Associative, T <: String}(ds::Vector{D}, ks::Vector{T})
+function DataFrame{D <: Associative}(ds::Vector{D}, ks::Vector{Symbol})  # forwards to the (Vector{D}, Vector) method via invoke
     invoke(DataFrame, (Vector{D}, Vector), ds, ks)
 end
 
@@ -340,7 +340,7 @@ index(df::DataFrame) = df.colindex
 # Let getindex(df.columns[j], row_inds) from AbstractDataVector() handle
 #  the resolution of row indices
 
-typealias ColumnIndex Union(Real, String, Symbol)
+typealias ColumnIndex Union(Real, Symbol)  # a single column is selected by a Real or a Symbol; strings are no longer accepted
 
 # df[SingleColumnIndex] => AbstractDataVector
 function Base.getindex(df::DataFrame, col_ind::ColumnIndex)
@@ -401,14 +401,13 @@ function create_new_column_from_scalar(df::DataFrame, val::Any)
     return DataArray(col_data, falses(n))
 end
 
-isnextcol(df::DataFrame, col_ind::String) = true
 isnextcol(df::DataFrame, col_ind::Symbol) = true
 function isnextcol(df::DataFrame, col_ind::Real)
     return ncol(df) + 1 == int(col_ind)
 end
 
 function nextcolname(df::DataFrame)
-    return string("x", ncol(df) + 1)
+    return symbol("x$(ncol(df) + 1)")  # e.g. :x3 when ncol(df) == 2
 end
 
 # Will automatically add a new column if needed
@@ -427,7 +426,7 @@ function insert_single_column!(df::DataFrame,
         j = df.colindex[col_ind]
         df.columns[j] = dv
     else
-        if typeof(col_ind) <: String || typeof(col_ind) <: Symbol
+        if typeof(col_ind) <: Symbol
             push!(df.colindex, col_ind)
             push!(df.columns, dv)
         else
@@ -1277,7 +1276,7 @@ function dict(adf::AbstractDataFrame, flatten::Bool)
     # TODO: Make flatten an option
     # TODO: Provide a de-data option that makes Vector's, not
     #       DataVector's
-    res = Dict{UTF8String, Any}()
+    res = Dict{Symbol, Any}()
     if flatten && nrow(adf) == 1
         for colname in names(adf)
             res[colname] = adf[colname][1]
@@ -1304,12 +1303,12 @@ dict(adf::AbstractDataFrame) = dict(adf, false)
 
 pool(a::AbstractVector) = compact(PooledDataArray(a))
 
-function pool!(df::AbstractDataFrame, cname::Union(Integer, String))
+function pool!(df::AbstractDataFrame, cname::Union(Integer, Symbol))  # overwrites column cname with pool(df[cname]); returns nothing
     df[cname] = pool(df[cname])
     return
 end
 
-function pool!{T <: Union(Integer, String)}(df::AbstractDataFrame, cnames::Vector{T})
+function pool!{T <: Union(Integer, Symbol)}(df::AbstractDataFrame, cnames::Vector{T})
     for cname in cnames
         df[cname] = pool(df[cname])
     end

diff --git a/src/grouping.jl b/src/grouping.jl
@@ -160,7 +160,7 @@ colwise(fns::Vector{Function}, d::GroupedDataFrame, cn::Vector{String}) = map(co
 colwise(fns::Vector{Function}) = x -> colwise(fns, x)
 
 function colwise(d::AbstractDataFrame, s::Vector{Symbol}, cn::Vector)
-    header = [s2 * "_" * string(s1) for s1 in s, s2 in cn][:]
+    header = [symbol(string(s2) * "_" * string(s1)) for s1 in s, s2 in cn][:]
     payload = colwise(map(eval, s), d)
     df = DataFrame()
     # TODO fix this to assign the longest column first or preallocate

diff --git a/src/index.jl b/src/index.jl
@@ -8,22 +8,22 @@ typealias Indices Union(Real, AbstractVector{Real})
 abstract AbstractIndex
 
 type Index <: AbstractIndex   # an OrderedDict would be nice here...
-    lookup::Dict{ByteString, Indices}      # name => names array position
-    names::Vector{ByteString}
+    lookup::Dict{Symbol, Indices}      # name => names array position
+    names::Vector{Symbol}              # column names, in position order
 end
-function Index{T <: ByteString}(x::Vector{T})
-    x = make_unique(convert(Vector{ByteString}, x))
-    Index(Dict{ByteString, Indices}(tuple(x...), tuple([1:length(x)]...)), x)
+function Index{T <: Symbol}(x::Vector{T})
+    x = make_unique(convert(Vector{Symbol}, x))  # NOTE(review): make_unique presumably renames duplicate names — confirm
+    Index(Dict{Symbol, Indices}(tuple(x...), tuple([1:length(x)]...)), x)  # the i-th name maps to position i
 end
-Index() = Index(Dict{ByteString, Indices}(), ByteString[])
+Index() = Index(Dict{Symbol, Indices}(), Symbol[])  # empty index: empty lookup and no names
 Base.length(x::Index) = length(x.names)
 Base.names(x::Index) = copy(x.names)
 Base.copy(x::Index) = Index(copy(x.lookup), copy(x.names))
 Base.deepcopy(x::Index) = Index(deepcopy(x.lookup), deepcopy(x.names))
 Base.isequal(x::Index, y::Index) = isequal(x.lookup, y.lookup) && isequal(x.names, y.names)
 
-# I think this should be Vector{T <: String}
-function names!(x::Index, nm::Vector)
+# Names are restricted to Vector{Symbol} (previously any Vector was accepted).
+function names!(x::Index, nm::Vector{Symbol})
     if length(nm) != length(x)
         error("lengths don't match.")
     end
@@ -51,17 +51,16 @@ function rename!(x::Index, nms)
 end
 
 rename!(x::Index, from, to) = rename!(x, zip(from, to))
-rename!(x::Index, from::String, to::String) = rename!(x, ((from, to),))
+rename!(x::Index, from::Symbol, to::Symbol) = rename!(x, ((from, to),))  # single rename passed on as a one-pair tuple
 rename!(x::Index, f::Function) = rename!(x, [(x,f(x)) for x in x.names])
 
 rename(x::Index, args...) = rename!(copy(x), args...)
 
-Base.haskey(x::Index, key::String) = haskey(x.lookup, key)
-Base.haskey(x::Index, key::Symbol) = haskey(x.lookup, string(key))
+Base.haskey(x::Index, key::Symbol) = haskey(x.lookup, key)  # Symbol keys are looked up directly, no string conversion
 Base.haskey(x::Index, key::Real) = 1 <= key <= length(x.names)
 Base.keys(x::Index) = names(x)
 
-function Base.push!(x::Index, nm::String)
+function Base.push!(x::Index, nm::Symbol)
     x.lookup[nm] = length(x) + 1
     push!(x.names, nm)
     return x
@@ -77,24 +76,22 @@ function Base.delete!(x::Index, idx::Integer)
     return x
 end
 
-function Base.delete!(x::Index, nm::String)
+function Base.delete!(x::Index, nm::Symbol)  # unknown names leave x unchanged; known names are deleted via their stored position
     if !haskey(x.lookup, nm)
         return x
     end
     idx = x.lookup[nm]
     return delete!(x, idx)
 end
 
-Base.getindex(x::Index, idx::String) = x.lookup[idx]
-Base.getindex(x::Index, idx::Symbol) = x.lookup[string(idx)]
+Base.getindex(x::Index, idx::Symbol) = x.lookup[idx]  # name -> position stored in lookup
 Base.getindex(x::AbstractIndex, idx::Real) = int(idx)
 Base.getindex(x::AbstractIndex, idx::AbstractDataVector{Bool}) = getindex(x, array(idx, false))
 Base.getindex{T}(x::AbstractIndex, idx::AbstractDataVector{T}) = getindex(x, dropna(idx))
 Base.getindex(x::AbstractIndex, idx::AbstractVector{Bool}) = find(idx)
 Base.getindex(x::AbstractIndex, idx::Ranges) = [idx]
 Base.getindex{T <: Real}(x::AbstractIndex, idx::AbstractVector{T}) = convert(Vector{Int}, idx)
-Base.getindex{T <: String}(x::AbstractIndex, idx::AbstractVector{T}) = [[x.lookup[i] for i in idx]...]
-Base.getindex{T <: Symbol}(x::AbstractIndex, idx::AbstractVector{T}) = [[x.lookup[string(i)] for i in idx]...]
+Base.getindex(x::AbstractIndex, idx::AbstractVector{Symbol}) = [[x.lookup[i] for i in idx]...]  # NOTE(review): reads x.lookup on any AbstractIndex — confirm every subtype has that field
 
 type SimpleIndex <: AbstractIndex
     length::Integer

diff --git a/src/io.jl b/src/io.jl
@@ -7,7 +7,7 @@ immutable ParsedCSV
     quoted::BitVector    # Was field quoted in text
 end
 
-immutable ParseOptions{S <: ByteString, T <: ByteString}
+immutable ParseOptions{S <: ByteString}
     header::Bool
     separator::Char
     quotemarks::Vector{Char}
@@ -16,7 +16,7 @@ immutable ParseOptions{S <: ByteString, T <: ByteString}
     truestrings::Vector{S}
     falsestrings::Vector{S}
     makefactors::Bool
-    colnames::Vector{T}
+    colnames::Vector{Symbol}
     cleannames::Bool
     coltypes::Vector{DataType}
     allowcomments::Bool
@@ -582,7 +582,7 @@ function builddf(rows::Integer,
     end
 end
 
-function parsecolnames!(colnames::Vector{UTF8String},
+function parsecolnames!(colnames::Vector{Symbol},
                         bytes::Vector{Uint8},
                         bounds::Vector{Int},
                         fields::Int)
@@ -596,9 +596,9 @@ function parsecolnames!(colnames::Vector{UTF8String},
         left = bounds[j] + 2
         right = bounds[j + 1]
         if bytes[right] == '\r' || bytes[right] == '\n'
-            colnames[j] = bytestring(bytes[left:(right - 1)])
+            colnames[j] = symbol(bytestring(bytes[left:(right - 1)]))
         else
-            colnames[j] = bytestring(bytes[left:right])
+            colnames[j] = symbol(bytestring(bytes[left:right]))
         end
     end
 
@@ -705,7 +705,7 @@ function readtable(pathname::String;
                    falsestrings::Vector = ASCIIString["F", "f", "FALSE", "false"],
                    makefactors::Bool = false,
                    nrows::Int = -1,
-                   colnames::Vector = UTF8String[],
+                   colnames::Vector = Symbol[],
                    cleannames::Bool = false,
                    coltypes::Vector{DataType} = DataType[],
                    allowcomments::Bool = false,

diff --git a/src/reshape.jl b/src/reshape.jl
@@ -14,9 +14,9 @@
 ##############################################################################
 
 function stack(df::DataFrame, measure_vars::Vector{Int}, id_vars::Vector{Int})
-    res = DataFrame[insert!(df[[i, id_vars]], 1, names(df)[i], "variable") for i in measure_vars]
+    res = DataFrame[insert!(df[[i, id_vars]], 1, names(df)[i], :variable) for i in measure_vars]  # one frame per measure column
     # fix column names
-    map(x -> names!(x, ["variable", "value", names(df[id_vars])]), res)
+    map(x -> names!(x, [:variable, :value, names(df[id_vars])]), res)  # applied for its side effect; map's result is unused
     res = vcat(res)
     res
 end
@@ -44,7 +44,7 @@ function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int)
     Nrow = length(refkeycol.pool)
     Ncol = length(keycol.pool)
     # TODO make fillNA(type, length)
-    payload = DataFrame({DataArray([fill(valuecol[1],Nrow)], fill(true, Nrow))  for i in 1:Ncol}, map(string, keycol.pool))
+    payload = DataFrame({DataArray([fill(valuecol[1],Nrow)], fill(true, Nrow))  for i in 1:Ncol}, map(symbol, keycol.pool))
     nowarning = true
     for k in 1:nrow(df)
         j = int(keycol.refs[k])

diff --git a/src/utils.jl b/src/utils.jl
@@ -22,7 +22,7 @@ function _uniqueofsorted(x::Vector)
     x[idx]
 end
 
-function make_unique{S<:ByteString}(names::Vector{S})
+function make_unique(names::Vector{Symbol})
     x = Index()
     names = copy(names)
     dups = Int[]
@@ -38,7 +38,7 @@ function make_unique{S<:ByteString}(names::Vector{S})
         newnm = nm
         k = 1
         while true
-            newnm = "$(nm)_$k"
+            newnm = symbol("$(nm)_$k")
             if !haskey(x, newnm)
                 push!(x, newnm)
                 break
@@ -53,19 +53,19 @@ end
 #' @description
 #'
 #' Generate standardized names for columns of a DataFrame. The
-#' first name will be "x1", the second "x2", etc.
+#' first name will be :x1, the second :x2, etc.
 #'
 #' @field n::Integer The number of names to generate.
 #'
-#' @returns names::Vector{UTF8String} A vector of standardized column names.
+#' @returns names::Vector{Symbol} A vector of standardized column names.
 #'
 #' @examples
 #'
 #' DataFrames.gennames(10)
 function gennames(n::Integer)
-    res = Array(UTF8String, n)
+    res = Array(Symbol, n)  # uninitialized; every slot is assigned in the loop below
     for i in 1:n
-        res[i] = @sprintf "x%d" i
+        res[i] = symbol("x$i")  # :x1, :x2, ...
     end
     return res
 end

diff --git a/test/RDA.jl b/test/RDA.jl
@@ -21,13 +21,13 @@ module TestRDA
     df = DataFrame(num = [1.1, 2.2])
     @assert isequal(DataFrame(read_rda("test/data/RDA/minimal.rda")["df"]), df)
 
-    df["int"] = Int32[1, 2]
-    df["logi"] = [true, false]
-    df["chr"] = ["ab", "c"]
-    df["factor"] = pool(df["chr"])
+    df[:int] = Int32[1, 2]
+    df[:logi] = [true, false]
+    df[:chr] = ["ab", "c"]
+    df[:factor] = pool(df[:chr])
     @assert isequal(DataFrame(read_rda("test/data/RDA/types.rda")["df"]), df)
 
     df[2, :] = NA
-    df = df[:, ["num", "int", "logi", "factor"]]  # (NA) chr breaks read_rda
+    df = df[:, [:num, :int, :logi, :factor]]  # (NA) chr breaks read_rda
     @assert isequal(DataFrame(read_rda("test/data/RDA/NAs.rda")["df"]), df)
 end
diff --git a/test/constructors.jl b/test/constructors.jl
@@ -12,7 +12,7 @@ module TestConstructors
 	@test isequal(df.colindex, Index())
 
 	df = DataFrame({data(zeros(3)), data(ones(3))},
-		            Index(["x1", "x2"]))
+		            Index([:x1, :x2]))
 	@test size(df, 1) == 3
 	@test size(df, 2) == 2
 
@@ -26,20 +26,19 @@ module TestConstructors
 		          DataFrame([0.0 1.0;
 		          	         0.0 1.0;
 		          	         0.0 1.0],
-		          ["x1", "x2"]))
+		          [:x1, :x2]))
 	@test isequal(df,
 		          DataFrame([0.0 1.0;
 		          	         0.0 1.0;
 		          	         0.0 1.0]))
 	@test isequal(df,
 		          DataFrame(data(zeros(3)), data(ones(3))))
 
-	@test isequal(df, DataFrame({"x1" => [0.0, 0.0, 0.0],
-		                         "x2" => [1.0, 1.0, 1.0]}))
-	@test isequal(df, DataFrame({"x1" => [0.0, 0.0, 0.0],
-		                         "x2" => [1.0, 1.0, 1.0],
-		                         "x3" => [2.0, 2.0, 2.0]},
-		                        ["x1", "x2"]))
+	@test isequal(df, DataFrame(x1 = [0.0, 0.0, 0.0],
+		                        x2 = [1.0, 1.0, 1.0]))
+	@test isequal(df, DataFrame(x1 = [0.0, 0.0, 0.0],
+		                        x2 = [1.0, 1.0, 1.0],
+		                        x3 = [2.0, 2.0, 2.0])[[:x1, :x2]])
 
 	df = DataFrame(Int, 2, 2)
 	@test size(df) == (2, 2)
@@ -49,7 +48,7 @@ module TestConstructors
 	@test size(df) == (2, 2)
 	@test all(types(df) .== [Float64, Float64])
 
-	df = DataFrame([Int, Float64], ["x1", "x2"], 2)
+	df = DataFrame([Int, Float64], [:x1, :x2], 2)
 	@test size(df) == (2, 2)
 	@test all(types(df) .== {Int, Float64})