-
Notifications
You must be signed in to change notification settings - Fork 159
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[CHORE] Populate previews only when show() or __repr__() is called #1889
Changes from 6 commits
5aa19d1
11f8fc2
10c5e56
5dfa3d2
e8ab252
d2093f6
38fa51b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,48 +24,3 @@ def test_show_some(make_df, valid_data, data_source): | |
elif variant == "arrow": | ||
assert df_display.preview.dataframe_num_rows == len(valid_data) | ||
assert df_display.num_rows == 1 | ||
|
||
|
||
def test_show_from_cached_collect(make_df, valid_data): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The concept of 'cached_collect' doesn't exist anymore with this PR, so I removed these tests. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess "cached" would be if we called |
||
df = make_df(valid_data) | ||
df = df.collect() | ||
collected_preview = df._preview | ||
df_display = df._construct_show_display(8) | ||
|
||
# Check that cached preview from df.collect() was used. | ||
assert df_display.preview is collected_preview | ||
assert df_display.schema == df.schema() | ||
assert len(df_display.preview.preview_partition) == len(valid_data) | ||
assert df_display.preview.dataframe_num_rows == 3 | ||
assert df_display.num_rows == 3 | ||
|
||
|
||
def test_show_from_cached_collect_prefix(make_df, valid_data): | ||
df = make_df(valid_data) | ||
df = df.collect(3) | ||
df_display = df._construct_show_display(2) | ||
|
||
assert df_display.schema == df.schema() | ||
assert len(df_display.preview.preview_partition) == 2 | ||
# Check that a prefix of the cached preview from df.collect() was used, so dataframe_num_rows should be set. | ||
assert df_display.preview.dataframe_num_rows == 3 | ||
assert df_display.num_rows == 2 | ||
|
||
|
||
def test_show_not_from_cached_collect(make_df, valid_data, data_source): | ||
df = make_df(valid_data) | ||
df = df.collect(2) | ||
collected_preview = df._preview | ||
df_display = df._construct_show_display(8) | ||
|
||
variant = data_source | ||
if variant == "parquet": | ||
# Cached preview from df.collect() is NOT USED because data was not materialized from parquet. | ||
assert df_display.preview != collected_preview | ||
elif variant == "arrow": | ||
# Cached preview from df.collect() is USED because data was materialized from arrow. | ||
assert df_display.preview == collected_preview | ||
assert df_display.schema == df.schema() | ||
assert len(df_display.preview.preview_partition) == len(valid_data) | ||
assert df_display.preview.dataframe_num_rows == 3 | ||
assert df_display.num_rows == 3 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was weird: if I didn't do the indent, it would fail with:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, I see: `.collect()` creates an unloaded micropartition, so it didn't actually read any of the parquet data.