Skip to content

Commit

Permalink
REFACTOR-#5718: add columns parameter for get_dtypes function (#5717
Browse files Browse the repository at this point in the history
)

Signed-off-by: Anatoly Myachev <[email protected]>
  • Loading branch information
anmyachev authored Mar 1, 2023
1 parent 4db3e70 commit bb0950d
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 30 deletions.
10 changes: 3 additions & 7 deletions modin/core/io/text/csv_glob_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,18 +231,14 @@ def _read(cls, filepath_or_buffer, **kwargs):
new_index = index_objs[0].append(index_objs[1:])
new_index.name = pd_df_metadata.index.name

partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)

# Compute dtypes by getting collecting and combining all of the partitions. The
# reported dtypes from differing rows can be different based on the inference in
# the limited data seen by each worker. We use pandas to compute the exact dtype
# over the whole column for each column. The index is set below.
dtypes = cls.get_dtypes(dtypes_ids) if len(dtypes_ids) > 0 else None
dtypes = cls.get_dtypes(dtypes_ids, column_names)

partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
# Set the index for the dtypes to the column names
if isinstance(dtypes, pandas.Series):
dtypes.index = column_names
else:
dtypes = pandas.Series(dtypes, index=column_names)
new_frame = cls.frame_cls(
partition_ids,
new_index,
Expand Down
10 changes: 3 additions & 7 deletions modin/core/io/text/excel_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,18 +221,14 @@ def _read(cls, io, **kwargs):
row_lengths = [len(o) for o in index_objs]
new_index = index_objs[0].append(index_objs[1:])

data_ids = cls.build_partition(data_ids, row_lengths, column_widths)

# Compute dtypes by getting collecting and combining all of the partitions. The
# reported dtypes from differing rows can be different based on the inference in
# the limited data seen by each worker. We use pandas to compute the exact dtype
# over the whole column for each column. The index is set below.
dtypes = cls.get_dtypes(dtypes_ids)
dtypes = cls.get_dtypes(dtypes_ids, column_names)

data_ids = cls.build_partition(data_ids, row_lengths, column_widths)
# Set the index for the dtypes to the column names
if isinstance(dtypes, pandas.Series):
dtypes.index = column_names
else:
dtypes = pandas.Series(dtypes, index=column_names)
new_frame = cls.frame_cls(
data_ids,
new_index,
Expand Down
10 changes: 5 additions & 5 deletions modin/core/io/text/json_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,13 @@ def _read(cls, path_or_buf, **kwargs):
row_lengths = cls.materialize(index_ids)
new_index = pandas.RangeIndex(sum(row_lengths))

dtypes = cls.get_dtypes(dtypes_ids)
partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)

if isinstance(dtypes, pandas.Series):
dtypes.index = columns
else:
dtypes = pandas.Series(dtypes, index=columns)
# Compute dtypes by getting collecting and combining all of the partitions. The
# reported dtypes from differing rows can be different based on the inference in
# the limited data seen by each worker. We use pandas to compute the exact dtype
# over the whole column for each column. The index is set below.
dtypes = cls.get_dtypes(dtypes_ids, columns)

new_frame = cls.frame_cls(
np.array(partition_ids),
Expand Down
13 changes: 4 additions & 9 deletions modin/core/io/text/text_file_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -909,19 +909,14 @@ def _get_new_qc(
New query compiler, created from `new_frame`.
"""
new_index, row_lengths = cls._define_index(index_ids, index_name)
# Compose modin partitions from `partition_ids`
partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)

# Compute dtypes by collecting and combining all of the partition dtypes. The
# reported dtypes from differing rows can be different based on the inference in
# the limited data seen by each worker. We use pandas to compute the exact dtype
# over the whole column for each column. The index is set below.
dtypes = cls.get_dtypes(dtypes_ids) if len(dtypes_ids) > 0 else None
# Compose modin partitions from `partition_ids`
partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)

# Set the index for the dtypes to the column names
if isinstance(dtypes, pandas.Series):
dtypes.index = column_names
else:
dtypes = pandas.Series(dtypes, index=column_names)
dtypes = cls.get_dtypes(dtypes_ids, column_names)

new_frame = cls.frame_cls(
partition_ids,
Expand Down
15 changes: 13 additions & 2 deletions modin/core/storage_formats/pandas/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,22 +211,27 @@ def generic_parse(fname, **kwargs):
]

@classmethod
def get_dtypes(cls, dtypes_ids):
def get_dtypes(cls, dtypes_ids, columns):
"""
Get common for all partitions dtype for each of the columns.
Parameters
----------
dtypes_ids : list
Array with references to the partitions dtypes objects.
columns : array-like or Index (1d)
The names of the columns in this variable will be used
for dtypes creation.
Returns
-------
frame_dtypes : pandas.Series or dtype
frame_dtypes : pandas.Series, dtype or None
Resulting dtype or pandas.Series where column names are used as
index and types of columns are used as values for full resulting
frame.
"""
if len(dtypes_ids) == 0:
return None
# each element in `partitions_dtypes` is a Series, where column names are
# used as index and types of columns for different partitions are used as values
partitions_dtypes = cls.materialize(dtypes_ids)
Expand All @@ -251,6 +256,12 @@ def get_dtypes(cls, dtypes_ids):
axis=1,
).squeeze(axis=0)

# Set the index for the dtypes to the column names
if isinstance(frame_dtypes, pandas.Series):
frame_dtypes.index = columns
else:
frame_dtypes = pandas.Series(frame_dtypes, index=columns)

return frame_dtypes

@classmethod
Expand Down

0 comments on commit bb0950d

Please sign in to comment.