Skip to content

Commit

Permalink
feat: some improves of as_polars_df (#896)
Browse files Browse the repository at this point in the history
  • Loading branch information
eitsupi authored Mar 5, 2024
1 parent 1cb9a6e commit f56c92a
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 51 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ S3method(as_polars_df,RPolarsRollingGroupBy)
S3method(as_polars_df,RPolarsSeries)
S3method(as_polars_df,data.frame)
S3method(as_polars_df,default)
S3method(as_polars_df,nanoarrow_array)
S3method(as_polars_df,nanoarrow_array_stream)
S3method(as_polars_lf,RPolarsLazyFrame)
S3method(as_polars_lf,RPolarsLazyGroupBy)
Expand Down
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,18 @@

## Polars R Package (development version)

### New features

- `as_polars_df(<nanoarrow_array>)` is added (#893).

### Bug fixes

- The default value of the `format` of `$str$strptime()` is now correctly set (#892).

### Other improvements

- Performance of `as_polars_df(<nanoarrow_array_stream>)` is improved (#896).

## Polars R Package 0.15.0

### Breaking changes due to Rust-polars update
Expand Down
69 changes: 37 additions & 32 deletions R/as_polars.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@
#' # Convert an arrow Table, with renaming and casting all columns
#' as_polars_df(
#' at,
#' schema = list(a = pl$Int64, b = pl$String)
#' schema = list(b = pl$Int64, a = pl$String)
#' )
#'
#' # Convert an arrow Table, with renaming and casting some columns
#' # Convert an arrow Table, with casting some columns
#' as_polars_df(
#' at,
#' schema_overrides = list(y = pl$String) # cast some columns
Expand Down Expand Up @@ -202,8 +202,8 @@ as_polars_df.RPolarsLazyGroupBy = function(x, ...) {
#' @param rechunk A logical flag (default `TRUE`).
#' Make sure that all data of each column is in contiguous memory.
#' @param schema named list of DataTypes, or character vector of column names.
#' Should be the same length as the number of columns of `x`.
#' If schema names or types do not match `x`, the columns will be renamed/recast.
#' Should match the number of columns in `x` and correspond to each column in `x` by position.
#' If a column in `x` does not match the name or type at the same position, it will be renamed/recast.
#' If `NULL` (default), convert columns as is.
#' @param schema_overrides named list of DataTypes. Cast some columns to the DataType.
#' @export
Expand All @@ -224,40 +224,43 @@ as_polars_df.ArrowTabular = function(

#' @rdname as_polars_df
#' @export
as_polars_df.nanoarrow_array_stream = function(x, ...) {
on.exit(x$release())
as_polars_df.nanoarrow_array = function(x, ...) {
array_type = nanoarrow::infer_nanoarrow_schema(x) |>
nanoarrow::nanoarrow_schema_parse() |>
(\(x) x$type)()

if (!inherits(nanoarrow::infer_nanoarrow_ptype(x$get_schema()), "data.frame")) {
stop("Can't convert non-struct array stream to RPolarsDataFrame")
if (array_type != "struct") {
Err_plain("Can't convert non-struct array to RPolarsDataFrame") |>
unwrap("in as_polars_df(<nanoarrow_array>):")
}

list_of_struct_arrays = nanoarrow::collect_array_stream(x, validate = FALSE)
if (length(list_of_struct_arrays)) {
data_cols = list()
series = as_polars_series.nanoarrow_array(x, name = NULL)

struct_array = list_of_struct_arrays[[1L]]
list_of_arrays = struct_array$children
col_names = names(list_of_arrays)
if (length(series)) {
series$to_frame()$unnest("")
} else {
# TODO: support 0-length array
pl$DataFrame()
}
}

for (i in seq_along(list_of_arrays)) {
data_cols[[col_names[i]]] = as_polars_series.nanoarrow_array(list_of_arrays[[i]])
}

for (struct_array in list_of_struct_arrays[-1L]) {
list_of_arrays = struct_array$children
col_names = names(list_of_arrays)
for (i in seq_along(list_of_arrays)) {
.pr$Series$append_mut(data_cols[[col_names[i]]], as_polars_series.nanoarrow_array(list_of_arrays[[i]])) |>
unwrap("in as_polars_df(<nanoarrow_array_stream>):")
}
}
#' @rdname as_polars_df
#' @export
as_polars_df.nanoarrow_array_stream = function(x, ...) {
if (!inherits(nanoarrow::infer_nanoarrow_ptype(x$get_schema()), "data.frame")) {
Err_plain("Can't convert non-struct array stream to RPolarsDataFrame") |>
unwrap("in as_polars_df(<nanoarrow_array_stream>):")
}

out = do.call(pl$select, data_cols)
series = as_polars_series.nanoarrow_array_stream(x, name = NULL)

if (length(series)) {
series$to_frame()$unnest("")
} else {
out = pl$DataFrame() # TODO: support creating 0-row DataFrame
# TODO: support 0-length array stream
pl$DataFrame()
}

out
}


Expand Down Expand Up @@ -408,9 +411,11 @@ as_polars_series.nanoarrow_array_stream = function(x, name = NULL, ...) {
out = pl$Series(NULL, name = name)
} else {
out = as_polars_series.nanoarrow_array(list_of_arrays[[1L]], name = name)
for (array in list_of_arrays[-1L]) {
.pr$Series$append_mut(out, as_polars_series.nanoarrow_array(array))
}
lapply(
list_of_arrays[-1L],
\(array) .pr$Series$append_mut(out, as_polars_series.nanoarrow_array(array))
) |>
invisible()
}

out
Expand Down
16 changes: 3 additions & 13 deletions R/construction.R
Original file line number Diff line number Diff line change
Expand Up @@ -223,20 +223,10 @@ df_to_rpldf = function(x, ..., schema = NULL, schema_overrides = NULL) {
unwrap()
}

data_cols = list()
out = lapply(x, as_polars_series) |>
pl$select()

for (i in seq_len(n_cols)) {
column = as_polars_series(x[[i]])
col_name = col_names[i]

data_cols[[col_name]] = column
}

if (length(data_cols)) {
out = do.call(pl$select, data_cols)
} else {
out = pl$DataFrame()
}
out$columns = col_names

cast_these_fields = mapply(
new_schema,
Expand Down
11 changes: 7 additions & 4 deletions man/as_polars_df.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions tests/testthat/test-as_polars.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ if (requireNamespace("arrow", quietly = TRUE) && requireNamespace("nanoarrow", q
"polars_lazy_group_by_dynamic", pl$LazyFrame(test_df)$group_by_dynamic("col_int", every = "1i"),
"arrow Table", arrow::as_arrow_table(test_df),
"arrow RecordBatch", arrow::as_record_batch(test_df),
"nanoarrow_array", nanoarrow::as_nanoarrow_array(test_df),
"nanoarrow_array_stream", nanoarrow::as_nanoarrow_array_stream(test_df),
)
}
Expand Down Expand Up @@ -101,13 +102,13 @@ test_that("as_polars_df throws error when make_names_unique = FALSE and there ar

test_that("schema option and schema_overrides for as_polars_df.data.frame", {
df = data.frame(a = 1:3, b = 4:6)
pl_df_1 = as_polars_df(df, schema = list(a = pl$String, b = pl$Int32))
pl_df_1 = as_polars_df(df, schema = list(b = pl$String, y = pl$Int32))
pl_df_2 = as_polars_df(df, schema = c("x", "y"))
pl_df_3 = as_polars_df(df, schema_overrides = list(a = pl$String))

expect_equal(
pl_df_1$to_data_frame(),
data.frame(a = as.character(1:3), b = 4L:6L)
data.frame(b = as.character(1:3), y = 4L:6L)
)
expect_equal(
pl_df_2$to_data_frame(),
Expand Down

0 comments on commit f56c92a

Please sign in to comment.