-
Notifications
You must be signed in to change notification settings - Fork 159
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[FEAT] [New Query Planner] Refactor file globbing logic by exposing FileInfos
to Python
#1307
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -13,6 +13,7 @@ | |||||
from daft.daft import ( | ||||||
FileFormat, | ||||||
FileFormatConfig, | ||||||
FileInfos, | ||||||
JoinType, | ||||||
PartitionScheme, | ||||||
PartitionSpec, | ||||||
|
@@ -28,6 +29,7 @@ | |||||
from daft.logical.map_partition_ops import ExplodeOp, MapPartitionOp | ||||||
from daft.logical.schema import Schema | ||||||
from daft.runners.partitioning import PartitionCacheEntry | ||||||
from daft.runners.pyrunner import LocalPartitionSet | ||||||
from daft.table import Table | ||||||
|
||||||
if TYPE_CHECKING: | ||||||
|
@@ -111,39 +113,30 @@ | |||||
def from_tabular_scan( | ||||||
cls, | ||||||
*, | ||||||
paths: list[str], | ||||||
file_infos: FileInfos, | ||||||
schema: Schema, | ||||||
file_format_config: FileFormatConfig, | ||||||
schema_hint: Schema | None, | ||||||
fs: fsspec.AbstractFileSystem | None, | ||||||
) -> PyLogicalPlanBuilder: | ||||||
# Glob the path using the Runner | ||||||
runner_io = get_context().runner().runner_io() | ||||||
file_info_partition_set = runner_io.glob_paths_details(paths, file_format_config, fs) | ||||||
|
||||||
# Infer schema if no hints provided | ||||||
inferred_or_provided_schema = ( | ||||||
schema_hint | ||||||
if schema_hint is not None | ||||||
else runner_io.get_schema_from_first_filepath(file_info_partition_set, file_format_config, fs) | ||||||
) | ||||||
cache_entry = get_context().runner().put_partition_set_into_cache(file_info_partition_set) | ||||||
file_infos_table = Table._from_pytable(file_infos.to_table()) | ||||||
partition = LocalPartitionSet({0: file_infos_table}) | ||||||
cache_entry = get_context().runner().put_partition_set_into_cache(partition) | ||||||
filepath_plan = InMemoryScan( | ||||||
cache_entry=cache_entry, | ||||||
schema=runner_io.FS_LISTING_SCHEMA, | ||||||
partition_spec=PartitionSpec(PartitionScheme.Unknown, file_info_partition_set.num_partitions()), | ||||||
schema=file_infos_table.schema(), | ||||||
partition_spec=PartitionSpec(PartitionScheme.Unknown, len(file_infos)), | ||||||
) | ||||||
|
||||||
return TabularFilesScan( | ||||||
schema=inferred_or_provided_schema, | ||||||
schema=schema, | ||||||
predicate=None, | ||||||
columns=None, | ||||||
file_format_config=file_format_config, | ||||||
fs=fs, | ||||||
filepaths_child=filepath_plan, | ||||||
filepaths_column_name=runner_io.FS_LISTING_PATH_COLUMN_NAME, | ||||||
# WARNING: This is currently hardcoded to be the same number of partitions as rows!! This is because we emit | ||||||
# one partition per filepath. This will change in the future and our logic here should change accordingly. | ||||||
num_partitions=len(file_info_partition_set), | ||||||
num_partitions=len(file_infos), | ||||||
).to_builder() | ||||||
|
||||||
def project( | ||||||
|
@@ -445,7 +438,6 @@ | |||||
predicate: ExpressionsProjection | None = None, | ||||||
columns: list[str] | None = None, | ||||||
filepaths_child: LogicalPlan, | ||||||
filepaths_column_name: str, | ||||||
num_partitions: int | None = None, | ||||||
limit_rows: int | None = None, | ||||||
) -> None: | ||||||
|
@@ -472,12 +464,7 @@ | |||||
self._fs = fs | ||||||
self._limit_rows = limit_rows | ||||||
|
||||||
# TabularFilesScan has a single child node that provides the filepaths to read from. | ||||||
assert ( | ||||||
filepaths_child.schema()[filepaths_column_name] is not None | ||||||
), f"TabularFileScan requires a child with '{filepaths_column_name}' column" | ||||||
self._register_child(filepaths_child) | ||||||
self._filepaths_column_name = filepaths_column_name | ||||||
|
||||||
@property | ||||||
def _filepaths_child(self) -> LogicalPlan: | ||||||
|
@@ -493,7 +480,7 @@ | |||||
) | ||||||
|
||||||
def required_columns(self) -> list[set[str]]: | ||||||
return [{self._filepaths_column_name} | self._predicate.required_columns()] | ||||||
return [{"file_paths"} | self._predicate.required_columns()] | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Codecov comment is interesting, is this not covered by tests running on the old planner? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm it appears that it is not. The only place that it could happen is in the Daft/daft/logical/optimizer.py Lines 219 to 220 in 102de0f
|
||||||
|
||||||
def input_mapping(self) -> list[dict[str, str]]: | ||||||
return [dict()] | ||||||
|
@@ -505,7 +492,6 @@ | |||||
and self._predicate == other._predicate | ||||||
and self._columns == other._columns | ||||||
and self._file_format_config == other._file_format_config | ||||||
and self._filepaths_column_name == other._filepaths_column_name | ||||||
) | ||||||
|
||||||
def rebuild(self) -> LogicalPlan: | ||||||
|
@@ -517,7 +503,6 @@ | |||||
predicate=self._predicate if self._predicate is not None else None, | ||||||
columns=self._column_names, | ||||||
filepaths_child=child, | ||||||
filepaths_column_name=self._filepaths_column_name, | ||||||
) | ||||||
|
||||||
def copy_with_new_children(self, new_children: list[LogicalPlan]) -> LogicalPlan: | ||||||
|
@@ -529,7 +514,6 @@ | |||||
predicate=self._predicate, | ||||||
columns=self._column_names, | ||||||
filepaths_child=new_children[0], | ||||||
filepaths_column_name=self._filepaths_column_name, | ||||||
) | ||||||
|
||||||
|
||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we make this a constant somewhere?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Eh we weren't using a constant for
"file_sizes"
or "num_rows"
, and I personally think that we can keep the contract informal for now since we're going to continue to rework this as (1) we move to the new query planner, (2) we move to the Rust-native file globber, and (3) we move file globbing to execution time. If you feel strongly about this, I can try to expose the column names on
FileInfo
and FileInfos
somehow, and propagate them as sidecar data whenever we pass around the Table
representation.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Naw I don't feel strongly about this 😛 lgtm!