diff --git a/.travis.yml b/.travis.yml index 200ca15bf77..cacba8fbb21 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,6 +25,7 @@ matrix: env: LINT=1 script: - export PATH="$HOME/miniconda/bin:$PATH" + - yapf -dr modin/pandas - flake8 . install: diff --git a/.travis/install-dependencies.sh b/.travis/install-dependencies.sh index 436368598ef..7b30edb0c1d 100755 --- a/.travis/install-dependencies.sh +++ b/.travis/install-dependencies.sh @@ -55,7 +55,7 @@ elif [[ "$LINT" == "1" ]]; then bash miniconda.sh -b -p $HOME/miniconda export PATH="$HOME/miniconda/bin:$PATH" # Install Python linting tools. - pip install -q flake8 flake8-comprehensions + pip install -q flake8 flake8-comprehensions yapf else echo "Unrecognized environment." exit 1 diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index a6ea191ee72..374ea75d24e 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -15,8 +15,10 @@ try: if threading.current_thread().name == "MainThread": - ray.init(redirect_output=True, include_webui=False, - redirect_worker_output=True) + ray.init( + redirect_output=True, + include_webui=False, + redirect_worker_output=True) except AssertionError: pass @@ -41,17 +43,20 @@ def get_npartitions(): from .concat import concat # noqa: 402 from .dataframe import DataFrame # noqa: 402 from .datetimes import to_datetime # noqa: 402 -from .io import (read_csv, read_parquet, read_json, read_html, # noqa: 402 - read_clipboard, read_excel, read_hdf, read_feather, # noqa: 402 - read_msgpack, read_stata, read_sas, read_pickle, # noqa: 402 - read_sql) # noqa: 402 +from .io import ( # noqa: 402 + read_csv, read_parquet, read_json, read_html, read_clipboard, read_excel, + read_hdf, read_feather, read_msgpack, read_stata, read_sas, read_pickle, + read_sql) from .reshape import get_dummies # noqa: 402 __all__ = [ - "DataFrame", "Series", "read_csv", "read_parquet", "concat", "eval", - "unique", "value_counts", "cut", "to_numeric", "factorize", "test", "qcut", - "match", "to_datetime", "get_dummies", "Panel", "date_range", "Index", - "MultiIndex", "Series", "bdate_range", "DatetimeIndex", "to_timedelta", - "set_eng_float_format", "set_option", "CategoricalIndex", "Timedelta", - "Timestamp", "NaT", "PeriodIndex", "Categorical" + "DataFrame", "Series", "read_csv", "read_parquet", "read_json", + "read_html", "read_clipboard", "read_excel", "read_hdf", "read_feather", + "read_msgpack", "read_stata", "read_sas", "read_pickle", "read_sql", + "concat", "eval", "unique", "value_counts", "cut", "to_numeric", + "factorize", "test", "qcut", "match", "to_datetime", "get_dummies", + "Panel", "date_range", "Index", "MultiIndex", "Series", "bdate_range", + "DatetimeIndex", "to_timedelta", "set_eng_float_format", "set_option", + "CategoricalIndex", "Timedelta", "Timestamp", "NaT", "PeriodIndex", + "Categorical" ] diff --git a/modin/pandas/concat.py b/modin/pandas/concat.py index 6e35a4ae28d..ef7f345d011 100644 --- a/modin/pandas/concat.py +++ b/modin/pandas/concat.py @@ -10,8 +10,15 @@ from .utils import _reindex_helper -def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, - keys=None, levels=None, names=None, verify_integrity=False, +def concat(objs, + axis=0, + join='outer', + join_axes=None, + ignore_index=False, + keys=None, + levels=None, + names=None, + verify_integrity=False, copy=True): if keys is not None: @@ -28,24 +35,24 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, raise ValueError("All objects passed were None") try: - type_check = next(obj 
for obj in objs - if not isinstance(obj, (pandas.Series, - pandas.DataFrame, - DataFrame))) + type_check = next( + obj for obj in objs + if not isinstance(obj, (pandas.Series, pandas.DataFrame, + DataFrame))) except StopIteration: type_check = None if type_check is not None: - raise ValueError("cannot concatenate object of type \"{0}\"; only " - "pandas.Series, pandas.DataFrame, " - "and modin.pandas.DataFrame objs are " - "valid", type(type_check)) + raise ValueError( + "cannot concatenate object of type \"{0}\"; only " + "pandas.Series, pandas.DataFrame, " + "and modin.pandas.DataFrame objs are " + "valid", type(type_check)) - all_series = all(isinstance(obj, pandas.Series) - for obj in objs) + all_series = all(isinstance(obj, pandas.Series) for obj in objs) if all_series: - return DataFrame(pandas.concat(objs, axis, join, join_axes, - ignore_index, keys, levels, names, - verify_integrity, copy)) + return DataFrame( + pandas.concat(objs, axis, join, join_axes, ignore_index, keys, + levels, names, verify_integrity, copy)) if isinstance(objs, dict): raise NotImplementedError( @@ -59,8 +66,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, " other axis") # We need this in a list because we use it later. - all_index, all_columns = list(zip(*[(obj.index, obj.columns) - for obj in objs])) + all_index, all_columns = list( + zip(*[(obj.index, obj.columns) for obj in objs])) def series_to_df(series, columns): df = pandas.DataFrame(series) @@ -71,8 +78,10 @@ def series_to_df(series, columns): # true regardless of the existence of another column named 0 in the # concat. if axis == 0: - objs = [series_to_df(obj, [0]) - if isinstance(obj, pandas.Series) else obj for obj in objs] + objs = [ + series_to_df(obj, [0]) if isinstance(obj, pandas.Series) else obj + for obj in objs + ] else: # Pandas starts the count at 0 so this will increment the names as # long as there's a new nameless Series being added. @@ -82,9 +91,11 @@ def name_incrementer(i): return val i = [0] - objs = [series_to_df(obj, obj.name if obj.name is not None - else name_incrementer(i)) - if isinstance(obj, pandas.Series) else obj for obj in objs] + objs = [ + series_to_df( + obj, obj.name if obj.name is not None else name_incrementer(i)) + if isinstance(obj, pandas.Series) else obj for obj in objs + ] # Using concat on the columns and index is fast because they're empty, # and it forces the error checking. It also puts the columns in the @@ -105,31 +116,38 @@ def name_incrementer(i): # Put all of the DataFrames into Ray format # TODO just partition the DataFrames instead of building a new Ray DF. - objs = [DataFrame(obj) if isinstance(obj, (pandas.DataFrame, - pandas.Series)) else obj - for obj in objs] + objs = [ + DataFrame(obj) + if isinstance(obj, (pandas.DataFrame, pandas.Series)) else obj + for obj in objs + ] # Here we reuse all_columns/index so we don't have to materialize objects # from remote memory built in the previous line. In the future, we won't be # building new DataFrames, rather just partitioning the DataFrames. 
if axis == 0: - new_blocks = np.array([_reindex_helper._submit( - args=tuple([all_columns[i], final_columns, axis, - len(objs[0]._block_partitions)] + part.tolist()), - num_return_vals=len(objs[0]._block_partitions)) - for i in range(len(objs)) - for part in objs[i]._block_partitions]) + new_blocks = np.array([ + _reindex_helper._submit( + args=tuple([ + all_columns[i], final_columns, axis, + len(objs[0]._block_partitions) + ] + part.tolist()), + num_return_vals=len(objs[0]._block_partitions)) + for i in range(len(objs)) for part in objs[i]._block_partitions + ]) else: # Transposing the columns is necessary because the remote task treats # everything like rows and returns in row-major format. Luckily, this # operation is cheap in numpy. - new_blocks = np.array([_reindex_helper._submit( - args=tuple([all_index[i], final_index, axis, - len(objs[0]._block_partitions.T)] + part.tolist()), - num_return_vals=len(objs[0]._block_partitions.T)) - for i in range(len(objs)) - for part in objs[i]._block_partitions.T]).T - - return DataFrame(block_partitions=new_blocks, - columns=final_columns, - index=final_index) + new_blocks = np.array([ + _reindex_helper._submit( + args=tuple([ + all_index[i], final_index, axis, + len(objs[0]._block_partitions.T) + ] + part.tolist()), + num_return_vals=len(objs[0]._block_partitions.T)) + for i in range(len(objs)) for part in objs[i]._block_partitions.T + ]).T + + return DataFrame( + block_partitions=new_blocks, columns=final_columns, index=final_index) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 8bcdd9a5ae7..67118973227 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -7,12 +7,9 @@ from pandas.compat import lzip, to_str, string_types, cPickle as pkl import pandas.core.common as com from pandas.core.dtypes.cast import maybe_upcast_putmask -from pandas.core.dtypes.common import ( - _get_dtype_from_object, - is_bool_dtype, - is_list_like, - is_numeric_dtype, - is_timedelta64_dtype) +from pandas.core.dtypes.common import (_get_dtype_from_object, is_bool_dtype, + is_list_like, is_numeric_dtype, + is_timedelta64_dtype) from pandas.core.index import _ensure_index_from_sequences from pandas.core.indexing import check_bool_indexer from pandas.errors import MergeError @@ -29,34 +26,31 @@ import sys import warnings -from .utils import ( - to_pandas, - _blocks_to_col, - _blocks_to_row, - _compile_remote_dtypes, - _concat_index, - _co_op_helper, - _create_block_partitions, - _create_blocks_helper, - _deploy_func, - _fix_blocks_dimensions, - _inherit_docstrings, - _map_partitions, - _match_partitioning, - _partition_pandas_dataframe, - _reindex_helper) +from .utils import (to_pandas, _blocks_to_col, _blocks_to_row, + _compile_remote_dtypes, _concat_index, _co_op_helper, + _create_block_partitions, _create_blocks_helper, + _deploy_func, _fix_blocks_dimensions, _inherit_docstrings, + _map_partitions, _match_partitioning, + _partition_pandas_dataframe, _reindex_helper) from . 
import get_npartitions from .index_metadata import _IndexMetadata from .iterator import PartitionIterator -@_inherit_docstrings(pandas.DataFrame, - excluded=[pandas.DataFrame, pandas.DataFrame.__init__]) +@_inherit_docstrings( + pandas.DataFrame, excluded=[pandas.DataFrame, pandas.DataFrame.__init__]) class DataFrame(object): - - def __init__(self, data=None, index=None, columns=None, dtype=None, - copy=False, col_partitions=None, row_partitions=None, - block_partitions=None, row_metadata=None, col_metadata=None, + def __init__(self, + data=None, + index=None, + columns=None, + dtype=None, + copy=False, + col_partitions=None, + row_partitions=None, + block_partitions=None, + row_metadata=None, + col_metadata=None, dtypes_cache=None): """Distributed DataFrame object backed by Pandas dataframes. @@ -89,13 +83,16 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, self._dtypes_cache = dtypes_cache # Check type of data and use appropriate constructor - if data is not None or (col_partitions is None and - row_partitions is None and - block_partitions is None): + if data is not None or (col_partitions is None + and row_partitions is None + and block_partitions is None): - pandas_df = pandas.DataFrame(data=data, index=index, - columns=columns, dtype=dtype, - copy=copy) + pandas_df = pandas.DataFrame( + data=data, + index=index, + columns=columns, + dtype=dtype, + copy=copy) # Cache dtypes self._dtypes_cache = pandas_df.dtypes @@ -159,16 +156,16 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, if index is not None: self.index = index else: - self._row_metadata = _IndexMetadata(self._block_partitions[:, 0], - index=index, axis=0) + self._row_metadata = _IndexMetadata( + self._block_partitions[:, 0], index=index, axis=0) if col_metadata is not None: self._col_metadata = col_metadata.copy() if columns is not None: self.columns = columns else: - self._col_metadata = _IndexMetadata(self._block_partitions[0, :], - index=columns, axis=1) + self._col_metadata = _IndexMetadata( + self._block_partitions[0, :], index=columns, axis=1) if self._dtypes_cache is None: self._get_remote_dtypes() @@ -200,8 +197,10 @@ def _get_row_partitions(self): self._row_metadata._lengths = \ self._row_metadata._lengths[empty_rows_mask] self._block_partitions = self._block_partitions[empty_rows_mask, :] - return [_blocks_to_row.remote(*part) - for i, part in enumerate(self._block_partitions)] + return [ + _blocks_to_row.remote(*part) + for i, part in enumerate(self._block_partitions) + ] def _set_row_partitions(self, new_row_partitions): self._block_partitions = \ @@ -216,8 +215,10 @@ def _get_col_partitions(self): self._col_metadata._lengths = \ self._col_metadata._lengths[empty_cols_mask] self._block_partitions = self._block_partitions[:, empty_cols_mask] - return [_blocks_to_col.remote(*self._block_partitions[:, i]) - for i in range(self._block_partitions.shape[1])] + return [ + _blocks_to_col.remote(*self._block_partitions[:, i]) + for i in range(self._block_partitions.shape[1]) + ] def _set_col_partitions(self, new_col_partitions): self._block_partitions = \ @@ -374,8 +375,8 @@ def col_dots_builder(full_front, full_back): col_dots = pandas.Series(["..." for _ in range(len(full_front))]) col_dots.index = index_of_head col_dots.name = "..." 
- return pandas.concat([full_front, col_dots, full_back], - axis=1, copy=False) + return pandas.concat( + [full_front, col_dots, full_back], axis=1, copy=False) # If we don't exceed the maximum number of values on either dimension if len(self.index) <= 60 and len(self.columns) <= 20: @@ -428,8 +429,10 @@ def col_dots_builder(full_front, full_back): else: # Convert head_blocks into a pandas DataFrame - list_of_head_rows = [pandas.concat(ray.get(df.tolist()), axis=1) - for df in head_blocks] + list_of_head_rows = [ + pandas.concat(ray.get(df.tolist()), axis=1) + for df in head_blocks + ] full_head = pandas.concat(list_of_head_rows) full_head.columns = self.columns @@ -520,9 +523,10 @@ def _arithmetic_helper(self, remote_func, axis, level=None): axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \ else 0 - oid_series = ray.get(_map_partitions(remote_func, - self._col_partitions if axis == 0 - else self._row_partitions)) + oid_series = ray.get( + _map_partitions( + remote_func, self._col_partitions + if axis == 0 else self._row_partitions)) if axis == 0: # We use the index to get the internal index. @@ -535,8 +539,8 @@ def _arithmetic_helper(self, remote_func, axis, level=None): df.index = \ this_partition[this_partition.isin(df.index)].index - result_series = pandas.concat([obj[0] for obj in oid_series], - axis=0, copy=False) + result_series = pandas.concat( + [obj[0] for obj in oid_series], axis=0, copy=False) else: result_series = pandas.concat(oid_series, axis=0, copy=False) result_series.index = self.index @@ -578,8 +582,8 @@ def ndim(self): """ # The number of dimensions is common across all partitions. # The first partition will be enough. - return ray.get(_deploy_func.remote(lambda df: df.ndim, - self._row_partitions[0])) + return ray.get( + _deploy_func.remote(lambda df: df.ndim, self._row_partitions[0])) @property def ftypes(self): @@ -590,16 +594,18 @@ def ftypes(self): """ # The ftypes are common across all partitions. # The first partition will be enough. - result = ray.get(_deploy_func.remote(lambda df: df.ftypes, - self._row_partitions[0])) + result = ray.get( + _deploy_func.remote(lambda df: df.ftypes, self._row_partitions[0])) result.index = self.columns return result def _get_remote_dtypes(self): """Finds and caches ObjectIDs for the dtypes of each column partition. """ - self._dtypes_cache = [_compile_remote_dtypes.remote(*column) - for column in self._block_partitions.T] + self._dtypes_cache = [ + _compile_remote_dtypes.remote(*column) + for column in self._block_partitions.T + ] @property def dtypes(self): @@ -613,8 +619,8 @@ def dtypes(self): if isinstance(self._dtypes_cache, list) and \ isinstance(self._dtypes_cache[0], ray.ObjectID): - self._dtypes_cache = pandas.concat(ray.get(self._dtypes_cache), - copy=False) + self._dtypes_cache = pandas.concat( + ray.get(self._dtypes_cache), copy=False) self._dtypes_cache.index = self.columns return self._dtypes_cache @@ -636,8 +642,9 @@ def values(self): Returns: The numpy representation of this DataFrame. 
""" - return np.concatenate(ray.get(_map_partitions( - lambda df: df.values, self._row_partitions))) + return np.concatenate( + ray.get( + _map_partitions(lambda df: df.values, self._row_partitions))) @property def axes(self): @@ -657,9 +664,14 @@ def shape(self): """ return len(self.index), len(self.columns) - def _update_inplace(self, row_partitions=None, col_partitions=None, - block_partitions=None, columns=None, index=None, - col_metadata=None, row_metadata=None): + def _update_inplace(self, + row_partitions=None, + col_partitions=None, + block_partitions=None, + columns=None, + index=None, + col_metadata=None, + row_metadata=None): """updates the current DataFrame inplace. Behavior should be similar to the constructor, given the corresponding @@ -720,11 +732,12 @@ def add_prefix(self, prefix): A new DataFrame containing the new column names. """ new_cols = self.columns.map(lambda x: str(prefix) + str(x)) - return DataFrame(block_partitions=self._block_partitions, - columns=new_cols, - col_metadata=self._col_metadata, - row_metadata=self._row_metadata, - dtypes_cache=self._dtypes_cache) + return DataFrame( + block_partitions=self._block_partitions, + columns=new_cols, + col_metadata=self._col_metadata, + row_metadata=self._row_metadata, + dtypes_cache=self._dtypes_cache) def add_suffix(self, suffix): """Add a suffix to each of the column names. @@ -733,11 +746,12 @@ def add_suffix(self, suffix): A new DataFrame containing the new column names. """ new_cols = self.columns.map(lambda x: str(x) + str(suffix)) - return DataFrame(block_partitions=self._block_partitions, - columns=new_cols, - col_metadata=self._col_metadata, - row_metadata=self._row_metadata, - dtypes_cache=self._dtypes_cache) + return DataFrame( + block_partitions=self._block_partitions, + columns=new_cols, + col_metadata=self._col_metadata, + row_metadata=self._row_metadata, + dtypes_cache=self._dtypes_cache) def applymap(self, func): """Apply a function to a DataFrame elementwise. @@ -746,16 +760,18 @@ def applymap(self, func): func (callable): The function to apply. """ if not callable(func): - raise ValueError( - "\'{0}\' object is not callable".format(type(func))) + raise ValueError("\'{0}\' object is not callable".format( + type(func))) new_block_partitions = np.array([ _map_partitions(lambda df: df.applymap(func), block) - for block in self._block_partitions]) + for block in self._block_partitions + ]) - return DataFrame(block_partitions=new_block_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata) + return DataFrame( + block_partitions=new_block_partitions, + row_metadata=self._row_metadata, + col_metadata=self._col_metadata) def copy(self, deep=True): """Creates a shallow copy of the DataFrame. @@ -763,13 +779,21 @@ def copy(self, deep=True): Returns: A new DataFrame pointing to the same partitions as this one. """ - return DataFrame(block_partitions=self._block_partitions, - columns=self.columns, - index=self.index, - dtypes_cache=self._dtypes_cache) - - def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, - group_keys=True, squeeze=False, **kwargs): + return DataFrame( + block_partitions=self._block_partitions, + columns=self.columns, + index=self.index, + dtypes_cache=self._dtypes_cache) + + def groupby(self, + by=None, + axis=0, + level=None, + as_index=True, + sort=True, + group_keys=True, + squeeze=False, + **kwargs): """Apply a groupby to this DataFrame. See _groupby() remote task. Args: by: The value to groupby. 
@@ -804,8 +828,13 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, return DataFrameGroupBy(self, by, axis, level, as_index, sort, group_keys, squeeze, **kwargs) - def sum(self, axis=None, skipna=True, level=None, numeric_only=None, - min_count=1, **kwargs): + def sum(self, + axis=None, + skipna=True, + level=None, + numeric_only=None, + min_count=1, + **kwargs): """Perform a sum across the DataFrame. Args: @@ -815,10 +844,15 @@ def sum(self, axis=None, skipna=True, level=None, numeric_only=None, Returns: The sum of the DataFrame. """ + def remote_func(df): - return df.sum(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, min_count=min_count, - **kwargs) + return df.sum( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs) return self._arithmetic_helper(remote_func, axis, level) @@ -833,14 +867,16 @@ def abs(self): # TODO Give a more accurate error to Pandas raise TypeError("bad operand type for abs():", "str") - new_block_partitions = np.array([_map_partitions(lambda df: df.abs(), - block) - for block in self._block_partitions]) + new_block_partitions = np.array([ + _map_partitions(lambda df: df.abs(), block) + for block in self._block_partitions + ]) - return DataFrame(block_partitions=new_block_partitions, - columns=self.columns, - index=self.index, - dtypes_cache=self._dtypes_cache) + return DataFrame( + block_partitions=new_block_partitions, + columns=self.columns, + index=self.index, + dtypes_cache=self._dtypes_cache) def isin(self, values): """Fill a DataFrame with booleans for cells contained in values. @@ -854,13 +890,15 @@ def isin(self, values): True: cell is contained in values. False: otherwise """ - new_block_partitions = np.array([_map_partitions( - lambda df: df.isin(values), block) - for block in self._block_partitions]) + new_block_partitions = np.array([ + _map_partitions(lambda df: df.isin(values), block) + for block in self._block_partitions + ]) - return DataFrame(block_partitions=new_block_partitions, - columns=self.columns, - index=self.index) + return DataFrame( + block_partitions=new_block_partitions, + columns=self.columns, + index=self.index) def isna(self): """Fill a DataFrame with booleans for cells containing NA. @@ -871,16 +909,19 @@ def isna(self): True: cell contains NA. False: otherwise. """ - new_block_partitions = np.array([_map_partitions( - lambda df: df.isna(), block) for block in self._block_partitions]) + new_block_partitions = np.array([ + _map_partitions(lambda df: df.isna(), block) + for block in self._block_partitions + ]) - new_dtypes = pandas.Series([np.dtype("bool")] * len(self.columns), - index=self.columns) + new_dtypes = pandas.Series( + [np.dtype("bool")] * len(self.columns), index=self.columns) - return DataFrame(block_partitions=new_block_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata, - dtypes_cache=new_dtypes) + return DataFrame( + block_partitions=new_block_partitions, + row_metadata=self._row_metadata, + col_metadata=self._col_metadata, + dtypes_cache=new_dtypes) def isnull(self): """Fill a DataFrame with booleans for cells containing a null value. @@ -891,17 +932,19 @@ def isnull(self): True: cell contains null. False: otherwise. 
""" - new_block_partitions = np.array([_map_partitions( - lambda df: df.isnull(), block) - for block in self._block_partitions]) + new_block_partitions = np.array([ + _map_partitions(lambda df: df.isnull(), block) + for block in self._block_partitions + ]) - new_dtypes = pandas.Series([np.dtype("bool")] * len(self.columns), - index=self.columns) + new_dtypes = pandas.Series( + [np.dtype("bool")] * len(self.columns), index=self.columns) - return DataFrame(block_partitions=new_block_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata, - dtypes_cache=new_dtypes) + return DataFrame( + block_partitions=new_block_partitions, + row_metadata=self._row_metadata, + col_metadata=self._col_metadata, + dtypes_cache=new_dtypes) def keys(self): """Get the info axis for the DataFrame. @@ -918,16 +961,23 @@ def transpose(self, *args, **kwargs): Returns: A new DataFrame transposed from this DataFrame. """ - new_block_partitions = np.array([_map_partitions( - lambda df: df.T, block) for block in self._block_partitions]) + new_block_partitions = np.array([ + _map_partitions(lambda df: df.T, block) + for block in self._block_partitions + ]) - return DataFrame(block_partitions=new_block_partitions.T, - columns=self.index, - index=self.columns) + return DataFrame( + block_partitions=new_block_partitions.T, + columns=self.index, + index=self.columns) T = property(transpose) - def dropna(self, axis=0, how='any', thresh=None, subset=None, + def dropna(self, + axis=0, + how='any', + thresh=None, + subset=None, inplace=False): """Create a new DataFrame from the removed NA values from this one. @@ -960,9 +1010,10 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, if not inplace: return result - self._update_inplace(block_partitions=result._block_partitions, - columns=result.columns, - index=result.index) + self._update_inplace( + block_partitions=result._block_partitions, + columns=result.columns, + index=result.index) return None @@ -987,8 +1038,12 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, raise KeyError(list(np.compress(check, subset))) def dropna_helper(df): - new_df = df.dropna(axis=axis, how=how, thresh=thresh, - subset=indices, inplace=False) + new_df = df.dropna( + axis=axis, + how=how, + thresh=thresh, + subset=indices, + inplace=False) if axis == 1: new_index = new_df.columns @@ -1000,43 +1055,49 @@ def dropna_helper(df): return new_df, new_index parts = self._col_partitions if axis == 1 else self._row_partitions - result = [_deploy_func._submit(args=(dropna_helper, df), - num_return_vals=2) for df in parts] + result = [ + _deploy_func._submit(args=(dropna_helper, df), num_return_vals=2) + for df in parts + ] new_parts, new_vals = [list(t) for t in zip(*result)] if axis == 1: - new_vals = [self._col_metadata.get_global_indices(i, vals) - for i, vals in enumerate(ray.get(new_vals))] + new_vals = [ + self._col_metadata.get_global_indices(i, vals) + for i, vals in enumerate(ray.get(new_vals)) + ] # This flattens the 2d array to 1d new_vals = [i for j in new_vals for i in j] new_cols = self.columns[new_vals] if not inplace: - return DataFrame(col_partitions=new_parts, - columns=new_cols, - index=self.index) + return DataFrame( + col_partitions=new_parts, + columns=new_cols, + index=self.index) - self._update_inplace(col_partitions=new_parts, - columns=new_cols, - index=self.index) + self._update_inplace( + col_partitions=new_parts, columns=new_cols, index=self.index) else: - new_vals = [self._row_metadata.get_global_indices(i, vals) - for i, vals 
in enumerate(ray.get(new_vals))] + new_vals = [ + self._row_metadata.get_global_indices(i, vals) + for i, vals in enumerate(ray.get(new_vals)) + ] # This flattens the 2d array to 1d new_vals = [i for j in new_vals for i in j] new_rows = self.index[new_vals] if not inplace: - return DataFrame(row_partitions=new_parts, - index=new_rows, - columns=self.columns) + return DataFrame( + row_partitions=new_parts, + index=new_rows, + columns=self.columns) - self._update_inplace(row_partitions=new_parts, - index=new_rows, - columns=self.columns) + self._update_inplace( + row_partitions=new_parts, index=new_rows, columns=self.columns) return None @@ -1108,9 +1169,9 @@ def _string_function(self, func, *args, **kwargs): return f(*args, **kwargs) assert len(args) == 0 - assert len([kwarg - for kwarg in kwargs - if kwarg not in ['axis', '_level']]) == 0 + assert len([ + kwarg for kwarg in kwargs if kwarg not in ['axis', '_level'] + ]) == 0 return f f = getattr(np, func, None) @@ -1149,8 +1210,10 @@ def agg_helper(df, arg, index, columns, *args, **kwargs): if axis == 0: index = self.index - columns = [self._col_metadata.partition_series(i).index - for i in range(len(self._col_partitions))] + columns = [ + self._col_metadata.partition_series(i).index + for i in range(len(self._col_partitions)) + ] remote_result = \ [_deploy_func._submit(args=( @@ -1164,8 +1227,10 @@ def agg_helper(df, arg, index, columns, *args, **kwargs): for cols, part in zip(columns, self._col_partitions)] if axis == 1: - indexes = [self._row_metadata.partition_series(i).index - for i in range(len(self._row_partitions))] + indexes = [ + self._row_metadata.partition_series(i).index + for i in range(len(self._row_partitions)) + ] columns = self.columns remote_result = \ @@ -1204,21 +1269,27 @@ def agg_helper(df, arg, index, columns, *args, **kwargs): new_columns = ray.get(columns) new_columns = new_columns[0].append(new_columns[1:]) - return DataFrame(col_partitions=new_parts, - columns=new_columns, - index=new_index) + return DataFrame( + col_partitions=new_parts, columns=new_columns, index=new_index) else: new_columns = ray.get(columns[0]) # This does not handle the Multi Index case new_index = ray.get(index) new_index = new_index[0].append(new_index[1:]) - return DataFrame(row_partitions=new_parts, - columns=new_columns, - index=new_index) - - def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=None, method=None, limit=None, fill_axis=0, + return DataFrame( + row_partitions=new_parts, columns=new_columns, index=new_index) + + def align(self, + other, + join='outer', + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, broadcast_axis=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " @@ -1232,9 +1303,14 @@ def all(self, axis=None, bool_only=None, skipna=None, level=None, If axis=None or axis=0, this call applies df.all(axis=1) to the transpose of df. 
""" + def remote_func(df): - return df.all(axis=axis, bool_only=bool_only, skipna=skipna, - level=level, **kwargs) + return df.all( + axis=axis, + bool_only=bool_only, + skipna=skipna, + level=level, + **kwargs) return self._arithmetic_helper(remote_func, axis, level) @@ -1246,9 +1322,14 @@ def any(self, axis=None, bool_only=None, skipna=None, level=None, If axis=None or axis=0, this call applies on the column partitions, otherwise operates on row partitions """ + def remote_func(df): - return df.any(axis=axis, bool_only=bool_only, skipna=skipna, - level=level, **kwargs) + return df.any( + axis=axis, + bool_only=bool_only, + skipna=skipna, + level=level, + **kwargs) return self._arithmetic_helper(remote_func, axis, level) @@ -1280,9 +1361,10 @@ def append(self, other, ignore_index=False, verify_integrity=False): combined_columns = self.columns.tolist() + self.columns.union( other.index).difference(self.columns).tolist() other = other.reindex(combined_columns, copy=False) - other = pandas.DataFrame(other.values.reshape((1, len(other))), - index=index, - columns=combined_columns) + other = pandas.DataFrame( + other.values.reshape((1, len(other))), + index=index, + columns=combined_columns) other = other._convert(datetime=True, timedelta=True) elif isinstance(other, list) and not isinstance(other[0], DataFrame): other = pandas.DataFrame(other) @@ -1295,11 +1377,19 @@ def append(self, other, ignore_index=False, verify_integrity=False): else: to_concat = [self, other] - return concat(to_concat, ignore_index=ignore_index, - verify_integrity=verify_integrity) - - def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, - args=(), **kwds): + return concat( + to_concat, + ignore_index=ignore_index, + verify_integrity=verify_integrity) + + def apply(self, + func, + axis=0, + broadcast=False, + raw=False, + reduce=None, + args=(), + **kwds): """Apply a function along input axis of DataFrame. 
Args: @@ -1320,13 +1410,14 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, return getattr(self, func)(*args, **kwds) elif isinstance(func, dict): if axis == 1: - raise TypeError( - "(\"'dict' object is not callable\", " - "'occurred at index {0}'".format(self.index[0])) + raise TypeError("(\"'dict' object is not callable\", " + "'occurred at index {0}'".format( + self.index[0])) if len(self.columns) != len(set(self.columns)): warnings.warn( 'duplicate column names not supported with apply().', - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2) has_list = list in map(type, func.values()) part_ind_tuples = [(self._col_metadata[key], key) for key in func] @@ -1342,28 +1433,33 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, for (part, ind), key in part_ind_tuples] return pandas.concat(ray.get(result), axis=1, copy=False) else: - result = [_deploy_func.remote( - lambda df: df.iloc[:, ind].apply(func[key]), - self._col_partitions[part]) - for (part, ind), key in part_ind_tuples] + result = [ + _deploy_func.remote( + lambda df: df.iloc[:, ind].apply(func[key]), + self._col_partitions[part]) + for (part, ind), key in part_ind_tuples + ] return pandas.Series(ray.get(result), index=func.keys()) elif is_list_like(func): if axis == 1: - raise TypeError( - "(\"'list' object is not callable\", " - "'occurred at index {0}'".format(self.index[0])) + raise TypeError("(\"'list' object is not callable\", " + "'occurred at index {0}'".format( + self.index[0])) # TODO: some checking on functions that return Series or Dataframe new_cols = _map_partitions(lambda df: df.apply(func), self._col_partitions) # resolve function names for the DataFrame index - new_index = [f_name if isinstance(f_name, string_types) - else f_name.__name__ for f_name in func] - return DataFrame(col_partitions=new_cols, - columns=self.columns, - index=new_index, - col_metadata=self._col_metadata) + new_index = [ + f_name if isinstance(f_name, string_types) else f_name.__name__ + for f_name in func + ] + return DataFrame( + col_partitions=new_cols, + columns=self.columns, + index=new_index, + col_metadata=self._col_metadata) elif callable(func): return self._callable_function(func, axis=axis, *args, **kwds) @@ -1385,7 +1481,11 @@ def as_matrix(self, columns=None): # TODO this is very inefficient, also see __array__ return to_pandas(self).as_matrix(columns) - def asfreq(self, freq, method=None, how=None, normalize=False, + def asfreq(self, + freq, + method=None, + how=None, + normalize=False, fill_value=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " @@ -1403,15 +1503,13 @@ def assign(self, **kwargs): def astype(self, dtype, copy=True, errors='raise', **kwargs): if isinstance(dtype, dict): - if (not set(dtype.keys()).issubset(set(self.columns)) and - errors == 'raise'): - raise KeyError( - "Only a column name can be used for the key in" - "a dtype mappings argument.") + if (not set(dtype.keys()).issubset(set(self.columns)) + and errors == 'raise'): + raise KeyError("Only a column name can be used for the key in" + "a dtype mappings argument.") columns = list(dtype.keys()) - col_idx = [(self.columns.get_loc(columns[i]), columns[i]) - if columns[i] in self.columns - else (columns[i], columns[i]) + col_idx = [(self.columns.get_loc(columns[i]), columns[i]) if + columns[i] in self.columns else (columns[i], columns[i]) for i in range(len(columns))] new_dict = {} for idx, key in col_idx: @@ -1422,9 +1520,10 @@ def astype(self, dtype, copy=True, 
errors='raise', **kwargs): **kwargs), self._row_partitions, new_dict) if copy: - return DataFrame(row_partitions=new_rows, - columns=self.columns, - index=self.index) + return DataFrame( + row_partitions=new_rows, + columns=self.columns, + index=self.index) self._row_partitions = new_rows else: new_blocks = [_map_partitions(lambda d: d.astype(dtype=dtype, @@ -1434,9 +1533,10 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): block) for block in self._block_partitions] if copy: - return DataFrame(block_partitions=new_blocks, - columns=self.columns, - index=self.index) + return DataFrame( + block_partitions=new_blocks, + columns=self.columns, + index=self.index) self._block_partitions = new_blocks def at_time(self, time, asof=False): @@ -1444,7 +1544,10 @@ def at_time(self, time, asof=False): "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def between_time(self, start_time, end_time, include_start=True, + def between_time(self, + start_time, + end_time, + include_start=True, include_end=True): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " @@ -1453,11 +1556,12 @@ def between_time(self, start_time, end_time, include_start=True, def bfill(self, axis=None, inplace=False, limit=None, downcast=None): """Synonym for DataFrame.fillna(method='bfill') """ - new_df = self.fillna(method='bfill', - axis=axis, - limit=limit, - downcast=downcast, - inplace=inplace) + new_df = self.fillna( + method='bfill', + axis=axis, + limit=limit, + downcast=downcast, + inplace=inplace) if not inplace: return new_df @@ -1469,7 +1573,7 @@ def bool(self): element is not boolean """ shape = self.shape - if shape != (1,) and shape != (1, 1): + if shape != (1, ) and shape != (1, 1): raise ValueError("""The PandasObject does not have exactly 1 element. Return the bool of a single element PandasObject. The truth value is @@ -1478,14 +1582,27 @@ def bool(self): else: return to_pandas(self).bool() - def boxplot(self, column=None, by=None, ax=None, fontsize=None, rot=0, - grid=True, figsize=None, layout=None, return_type=None, + def boxplot(self, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, **kwds): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def clip(self, lower=None, upper=None, axis=None, inplace=False, *args, + def clip(self, + lower=None, + upper=None, + axis=None, + inplace=False, + *args, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " @@ -1521,8 +1638,11 @@ def consolidate(self, inplace=False): "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def convert_objects(self, convert_dates=True, convert_numeric=False, - convert_timedeltas=True, copy=True): + def convert_objects(self, + convert_dates=True, + convert_numeric=False, + convert_timedeltas=True, + copy=True): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") @@ -1549,6 +1669,7 @@ def count(self, axis=0, level=None, numeric_only=False): Returns: The count, in a Series (or DataFrame if level is specified). 
""" + def remote_func(df): return df.count(axis=axis, level=level, numeric_only=numeric_only) @@ -1565,14 +1686,16 @@ def _cumulative_helper(self, func, axis): if axis == 0: new_cols = _map_partitions(func, self._col_partitions) - return DataFrame(col_partitions=new_cols, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata) + return DataFrame( + col_partitions=new_cols, + row_metadata=self._row_metadata, + col_metadata=self._col_metadata) else: new_rows = _map_partitions(func, self._row_partitions) - return DataFrame(row_partitions=new_rows, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata) + return DataFrame( + row_partitions=new_rows, + row_metadata=self._row_metadata, + col_metadata=self._col_metadata) def cummax(self, axis=None, skipna=True, *args, **kwargs): """Perform a cumulative maximum across the DataFrame. @@ -1584,6 +1707,7 @@ def cummax(self, axis=None, skipna=True, *args, **kwargs): Returns: The cumulative maximum of the DataFrame. """ + def remote_func(df): return df.cummax(axis=axis, skipna=skipna, *args, **kwargs) @@ -1599,6 +1723,7 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): Returns: The cumulative minimum of the DataFrame. """ + def remote_func(df): return df.cummin(axis=axis, skipna=skipna, *args, **kwargs) @@ -1614,6 +1739,7 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): Returns: The cumulative product of the DataFrame. """ + def remote_func(df): return df.cumprod(axis=axis, skipna=skipna, *args, **kwargs) @@ -1629,6 +1755,7 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): Returns: The cumulative sum of the DataFrame. """ + def remote_func(df): return df.cumsum(axis=axis, skipna=skipna, *args, **kwargs) @@ -1647,13 +1774,12 @@ def describe(self, percentiles=None, include=None, exclude=None): Returns: Series/DataFrame of summary statistics """ + def describe_helper(df): """This to ensure nothing goes on with non-numeric columns""" try: return df.select_dtypes(exclude='object').describe( - percentiles=percentiles, - include=include, - exclude=exclude) + percentiles=percentiles, include=include, exclude=exclude) # This exception is thrown when there are only non-numeric columns # in this partition except ValueError: @@ -1683,21 +1809,18 @@ def diff(self, periods=1, axis=0): DataFrame with the diff applied """ axis = pandas.DataFrame()._get_axis_number(axis) - partitions = (self._col_partitions if - axis == 0 else self._row_partitions) + partitions = (self._col_partitions + if axis == 0 else self._row_partitions) - result = _map_partitions(lambda df: - df.diff(axis=axis, periods=periods), - partitions) + result = _map_partitions( + lambda df: df.diff(axis=axis, periods=periods), partitions) if (axis == 1): - return DataFrame(row_partitions=result, - columns=self.columns, - index=self.index) + return DataFrame( + row_partitions=result, columns=self.columns, index=self.index) if (axis == 0): - return DataFrame(col_partitions=result, - columns=self.columns, - index=self.index) + return DataFrame( + col_partitions=result, columns=self.columns, index=self.index) def div(self, other, axis='columns', level=None, fill_value=None): """Divides this DataFrame against another DataFrame/Series/scalar. 
@@ -1733,8 +1856,14 @@ def dot(self, other): "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def drop(self, labels=None, axis=0, index=None, columns=None, level=None, - inplace=False, errors='raise'): + def drop(self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors='raise'): """Return new object with labels in requested axis removed. Args: labels: Index or column labels to drop. @@ -1786,9 +1915,10 @@ def drop_helper(obj, axis, label): return obj if isinstance(coords, pandas.DataFrame): - drop_map = {part: list(df['index_within_partition']) - for part, df in - coords.copy().groupby('partition')} + drop_map = { + part: list(df['index_within_partition']) + for part, df in coords.copy().groupby('partition') + } else: partitions, indexes = coords drop_map = {partitions: indexes} @@ -1887,8 +2017,10 @@ def equals(self, other): # _match_partitioning (See _match_partitioning) new_zipped_parts = self._copartition(other, self.index) - equals_partitions = [_equals_helper.remote(left, right) - for left, right in new_zipped_parts] + equals_partitions = [ + _equals_helper.remote(left, right) + for left, right in new_zipped_parts + ] # To avoid getting all we use next notation. return next((False for eq in equals_partitions if not ray.get(eq)), @@ -1954,8 +2086,8 @@ def eval_helper(df): inplace = validate_bool_kwarg(inplace, "inplace") new_rows = _map_partitions(eval_helper, self._row_partitions) - result_type = ray.get(_deploy_func.remote(lambda df: type(df), - new_rows[0])) + result_type = ray.get( + _deploy_func.remote(lambda df: type(df), new_rows[0])) if result_type is pandas.Series: new_series = pandas.concat(ray.get(new_rows), axis=0, copy=False) new_series.index = self.index @@ -1966,13 +2098,21 @@ def eval_helper(df): columns = columns_copy.columns if inplace: - self._update_inplace(row_partitions=new_rows, columns=columns, - index=self.index) + self._update_inplace( + row_partitions=new_rows, columns=columns, index=self.index) else: return DataFrame(columns=columns, row_partitions=new_rows) - def ewm(self, com=None, span=None, halflife=None, alpha=None, - min_periods=0, freq=None, adjust=True, ignore_na=False, axis=0): + def ewm(self, + com=None, + span=None, + halflife=None, + alpha=None, + min_periods=0, + freq=None, + adjust=True, + ignore_na=False, + axis=0): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") @@ -1985,16 +2125,23 @@ def expanding(self, min_periods=1, freq=None, center=False, axis=0): def ffill(self, axis=None, inplace=False, limit=None, downcast=None): """Synonym for DataFrame.fillna(method='ffill') """ - new_df = self.fillna(method='ffill', - axis=axis, - limit=limit, - downcast=downcast, - inplace=inplace) + new_df = self.fillna( + method='ffill', + axis=axis, + limit=limit, + downcast=downcast, + inplace=inplace) if not inplace: return new_df - def fillna(self, value=None, method=None, axis=None, inplace=False, - limit=None, downcast=None, **kwargs): + def fillna(self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + **kwargs): """Fill NA/NaN values using the specified method. 
Args: @@ -2043,8 +2190,9 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, raise ValueError('must specify a fill method or value') if value is not None and method is not None: raise ValueError('cannot specify both a fill method and value') - if method is not None and method not in ['backfill', 'bfill', 'pad', - 'ffill']: + if method is not None and method not in [ + 'backfill', 'bfill', 'pad', 'ffill' + ]: expecting = 'pad (ffill) or backfill (bfill)' msg = 'Invalid fill method. Expecting {expecting}. Got {method}'\ .format(expecting=expecting, method=method) @@ -2088,8 +2236,10 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, # Not every partition was changed, so we put everything back that # was not changed and update those that were. - new_parts = [parts[i] if i not in new_vals else new_vals[i] - for i in range(len(parts))] + new_parts = [ + parts[i] if i not in new_vals else new_vals[i] + for i in range(len(parts)) + ] else: new_parts = _map_partitions(lambda df: df.fillna( value=value, @@ -2101,13 +2251,15 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, **kwargs), parts) if axis == 0: - new_obj._update_inplace(col_partitions=new_parts, - columns=self.columns, - index=self.index) + new_obj._update_inplace( + col_partitions=new_parts, + columns=self.columns, + index=self.index) else: - new_obj._update_inplace(row_partitions=new_parts, - columns=self.columns, - index=self.index) + new_obj._update_inplace( + row_partitions=new_parts, + columns=self.columns, + index=self.index) if not inplace: return new_obj @@ -2139,12 +2291,16 @@ def filter(self, items=None, like=None, regex=None, axis=None): if items is not None: bool_arr = labels.isin(items) elif like is not None: + def f(x): return like in to_str(x) + bool_arr = labels.map(f).tolist() else: + def f(x): return matcher.search(to_str(x)) is not None + matcher = re.compile(regex) bool_arr = labels.map(f).tolist() @@ -2181,8 +2337,14 @@ def floordiv(self, other, axis='columns', level=None, fill_value=None): level, fill_value) @classmethod - def from_csv(self, path, header=0, sep=', ', index_col=0, - parse_dates=True, encoding=None, tupleize_cols=None, + def from_csv(self, + path, + header=0, + sep=', ', + index_col=0, + parse_dates=True, + encoding=None, + tupleize_cols=None, infer_datetime_format=False): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " @@ -2201,8 +2363,13 @@ def from_items(self, items, columns=None, orient='columns'): "github.com/modin-project/modin.") @classmethod - def from_records(self, data, index=None, exclude=None, columns=None, - coerce_float=False, nrows=None): + def from_records(self, + data, + index=None, + exclude=None, + columns=None, + coerce_float=False, + nrows=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") @@ -2243,8 +2410,9 @@ def get_dtype_counts(self): Returns: The counts of dtypes in this object. """ - return ray.get(_deploy_func.remote(lambda df: df.get_dtype_counts(), - self._row_partitions[0])) + return ray.get( + _deploy_func.remote(lambda df: df.get_dtype_counts(), + self._row_partitions[0])) def get_ftype_counts(self): """Get the counts of ftypes in this object. @@ -2252,8 +2420,9 @@ def get_ftype_counts(self): Returns: The counts of ftypes in this object. 
""" - return ray.get(_deploy_func.remote(lambda df: df.get_ftype_counts(), - self._row_partitions[0])) + return ray.get( + _deploy_func.remote(lambda df: df.get_ftype_counts(), + self._row_partitions[0])) def get_value(self, index, col, takeable=False): raise NotImplementedError( @@ -2287,11 +2456,12 @@ def _head_block_builder(self, n): remaining = n - length_bins[idx - 1] else: remaining = n - return np.array([self._block_partitions[i] if i != idx - else [_deploy_func.remote(lambda df: - df.head(remaining), blk) - for blk in self._block_partitions[i]] - for i in range(idx + 1)]) + return np.array([ + self._block_partitions[i] if i != idx else [ + _deploy_func.remote(lambda df: df.head(remaining), blk) + for blk in self._block_partitions[i] + ] for i in range(idx + 1) + ]) def head(self, n=5): """Get the first n rows of the DataFrame. @@ -2309,14 +2479,28 @@ def head(self, n=5): index = self._row_metadata.index[:n] - return DataFrame(block_partitions=new_blocks, - col_metadata=self._col_metadata, - index=index, - dtypes_cache=self._dtypes_cache) - - def hist(self, data, column=None, by=None, grid=True, xlabelsize=None, - xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, - sharey=False, figsize=None, layout=None, bins=10, **kwds): + return DataFrame( + block_partitions=new_blocks, + col_metadata=self._col_metadata, + index=index, + dtypes_cache=self._dtypes_cache) + + def hist(self, + data, + column=None, + by=None, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + ax=None, + sharex=False, + sharey=False, + figsize=None, + layout=None, + bins=10, + **kwds): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") @@ -2370,21 +2554,25 @@ def infer_objects(self): "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, + def info(self, + verbose=None, + buf=None, + max_cols=None, + memory_usage=None, null_counts=None): - def info_helper(df): output_buffer = io.StringIO() - df.info(verbose=verbose, - buf=output_buffer, - max_cols=max_cols, - memory_usage=memory_usage, - null_counts=null_counts) + df.info( + verbose=verbose, + buf=output_buffer, + max_cols=max_cols, + memory_usage=memory_usage, + null_counts=null_counts) return output_buffer.getvalue() # Combine the per-partition info and split into lines - result = ''.join(ray.get(_map_partitions(info_helper, - self._col_partitions))) + result = ''.join( + ray.get(_map_partitions(info_helper, self._col_partitions))) lines = result.split('\n') # Class denoted in info() output @@ -2403,9 +2591,10 @@ def info_helper(df): col_lines = [prog.match(line) for line in lines] cols = [c.group(0) for c in col_lines if c is not None] # replace the partition columns names with real column names - columns = ["{0}\t{1}\n".format(self.columns[i], - cols[i].split(" ", 1)[1]) - for i in range(len(cols))] + columns = [ + "{0}\t{1}\n".format(self.columns[i], cols[i].split(" ", 1)[1]) + for i in range(len(cols)) + ] col_string = ''.join(columns) + '\n' # A summary of the dtypes in the dataframe @@ -2418,8 +2607,10 @@ def info_helper(df): # Parse lines for memory usage number prog = re.compile('^memory+.+') mems = [prog.match(line) for line in lines] - mem_vals = [float(re.search(r'\d+', m.group(0)).group()) - for m in mems if m is not None] + mem_vals = [ + float(re.search(r'\d+', m.group(0)).group()) for m in mems + if m is not None + ] memory_string = "" @@ -2432,8 
+2623,10 @@ def info_helper(df): memory_string = 'memory usage: {0} bytes'.format(sum(mem_vals)) # Combine all the components of the info() output - result = ''.join([class_string, index_string, col_header, - col_string, dtypes_string, memory_string]) + result = ''.join([ + class_string, index_string, col_header, col_string, dtypes_string, + memory_string + ]) # Write to specified output buffer if buf: @@ -2454,8 +2647,7 @@ def insert(self, loc, column, value, allow_duplicates=False): value = np.full(len(self.index), value) if len(value) != len(self.index): - raise ValueError( - "Length of values does not match length of index") + raise ValueError("Length of values does not match length of index") if not allow_duplicates and column in self.columns: raise ValueError( "cannot insert {0}, already exists".format(column)) @@ -2480,30 +2672,36 @@ def insert_col_part(df): # Need to set index to index of this dtype or inserted values # become NaT df.index = value - df.insert(index_within_partition, column, - value, allow_duplicates) + df.insert(index_within_partition, column, value, + allow_duplicates) df.index = pandas.RangeIndex(0, len(df)) else: df.index = index - df.insert(index_within_partition, column, - value, allow_duplicates) + df.insert(index_within_partition, column, value, + allow_duplicates) df.index = pandas.RangeIndex(0, len(df)) return df new_obj = _deploy_func.remote(insert_col_part, self._col_partitions[partition]) - new_cols = [self._col_partitions[i] - if i != partition - else new_obj - for i in range(len(self._col_partitions))] + new_cols = [ + self._col_partitions[i] if i != partition else new_obj + for i in range(len(self._col_partitions)) + ] new_col_names = self.columns.insert(loc, column) - self._update_inplace(col_partitions=new_cols, columns=new_col_names, - index=self.index) + self._update_inplace( + col_partitions=new_cols, columns=new_col_names, index=self.index) - def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - limit_direction='forward', downcast=None, **kwargs): + def interpolate(self, + method='linear', + axis=0, + limit=None, + inplace=False, + limit_direction='forward', + downcast=None, + **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") @@ -2602,7 +2800,12 @@ def itertuples_helper(part): for v in partition_iterator: yield v - def join(self, other, on=None, how='left', lsuffix='', rsuffix='', + def join(self, + other, + on=None, + how='left', + lsuffix='', + rsuffix='', sort=False): """Join two or more DataFrames, or a DataFrame with a collection. @@ -2639,25 +2842,27 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', .join(pandas.DataFrame(columns=other.columns), lsuffix=lsuffix, rsuffix=rsuffix).columns - new_partition_num = max(len(self._block_partitions.T), - len(other._block_partitions.T)) + new_partition_num = max( + len(self._block_partitions.T), len(other._block_partitions.T)) # Join is a concat once we have shuffled the data internally. # We shuffle the data by computing the correct order. # Another important thing to note: We set the current self index # to the index variable which may be 'on'. 
new_self = np.array([ - _reindex_helper._submit(args=tuple([index, new_index, 1, - new_partition_num] + - block.tolist()), - num_return_vals=new_partition_num) - for block in self._block_partitions.T]) + _reindex_helper._submit( + args=tuple([index, new_index, 1, new_partition_num] + + block.tolist()), + num_return_vals=new_partition_num) + for block in self._block_partitions.T + ]) new_other = np.array([ - _reindex_helper._submit(args=tuple([other.index, new_index, 1, - new_partition_num] + - block.tolist()), - num_return_vals=new_partition_num) - for block in other._block_partitions.T]) + _reindex_helper._submit( + args=tuple([other.index, new_index, 1, new_partition_num] + + block.tolist()), + num_return_vals=new_partition_num) + for block in other._block_partitions.T + ]) # Append the blocks together (i.e. concat) new_block_parts = np.concatenate((new_self, new_other)).T @@ -2667,9 +2872,10 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', new_index = None # TODO join the two metadata tables for performance. - return DataFrame(block_partitions=new_block_parts, - index=new_index, - columns=new_column_labels) + return DataFrame( + block_partitions=new_block_parts, + index=new_index, + columns=new_column_labels) else: # This constraint carried over from Pandas. if on is not None: @@ -2681,44 +2887,58 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', # would otherwise require a lot more logic. new_index = pandas.DataFrame(index=self.index).join( [pandas.DataFrame(index=obj.index) for obj in other], - how=how, sort=sort).index + how=how, + sort=sort).index new_column_labels = pandas.DataFrame(columns=self.columns).join( [pandas.DataFrame(columns=obj.columns) for obj in other], - lsuffix=lsuffix, rsuffix=rsuffix).columns + lsuffix=lsuffix, + rsuffix=rsuffix).columns - new_partition_num = max([len(self._block_partitions.T)] + - [len(obj._block_partitions.T) - for obj in other]) + new_partition_num = max( + [len(self._block_partitions.T)] + + [len(obj._block_partitions.T) for obj in other]) new_self = np.array([ - _reindex_helper._submit(args=tuple([self.index, new_index, 1, - new_partition_num] + - block.tolist()), - num_return_vals=new_partition_num) - for block in self._block_partitions.T]) - - new_others = np.array([_reindex_helper._submit( - args=tuple([obj.index, new_index, 1, new_partition_num] + - block.tolist()), - num_return_vals=new_partition_num - ) for obj in other for block in obj._block_partitions.T]) + _reindex_helper._submit( + args=tuple([self.index, new_index, 1, new_partition_num] + + block.tolist()), + num_return_vals=new_partition_num) + for block in self._block_partitions.T + ]) + + new_others = np.array([ + _reindex_helper._submit( + args=tuple([obj.index, new_index, 1, new_partition_num] + + block.tolist()), + num_return_vals=new_partition_num) for obj in other + for block in obj._block_partitions.T + ]) # Append the columns together (i.e. concat) new_block_parts = np.concatenate((new_self, new_others)).T # TODO join the two metadata tables for performance. 
- return DataFrame(block_partitions=new_block_parts, - index=new_index, - columns=new_column_labels) - - def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, + return DataFrame( + block_partitions=new_block_parts, + index=new_index, + columns=new_column_labels) + + def kurt(self, + axis=None, + skipna=None, + level=None, + numeric_only=None, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None, + def kurtosis(self, + axis=None, + skipna=None, + level=None, + numeric_only=None, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " @@ -2773,13 +2993,24 @@ def mad(self, axis=None, skipna=None, level=None): "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, - errors='raise', try_cast=False, raise_on_error=None): + def mask(self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors='raise', + try_cast=False, + raise_on_error=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def max(self, axis=None, skipna=None, level=None, numeric_only=None, + def max(self, + axis=None, + skipna=None, + level=None, + numeric_only=None, **kwargs): """Perform max across the DataFrame. @@ -2790,13 +3021,22 @@ def max(self, axis=None, skipna=None, level=None, numeric_only=None, Returns: The max of the DataFrame. """ + def remote_func(df): - return df.max(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, **kwargs) + return df.max( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs) return self._arithmetic_helper(remote_func, axis, level) - def mean(self, axis=None, skipna=None, level=None, numeric_only=None, + def mean(self, + axis=None, + skipna=None, + level=None, + numeric_only=None, **kwargs): """Computes mean across the DataFrame. @@ -2807,13 +3047,22 @@ def mean(self, axis=None, skipna=None, level=None, numeric_only=None, Returns: The mean of the DataFrame. (Pandas series) """ + def remote_func(df): - return df.mean(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, **kwargs) + return df.mean( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs) return self._arithmetic_helper(remote_func, axis, level) - def median(self, axis=None, skipna=None, level=None, numeric_only=None, + def median(self, + axis=None, + skipna=None, + level=None, + numeric_only=None, **kwargs): """Computes median across the DataFrame. @@ -2824,20 +3073,28 @@ def median(self, axis=None, skipna=None, level=None, numeric_only=None, Returns: The median of the DataFrame. 
(Pandas series) """ + def remote_func(df): - return df.median(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, **kwargs) + return df.median( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs) return self._arithmetic_helper(remote_func, axis, level) - def melt(self, id_vars=None, value_vars=None, var_name=None, - value_name='value', col_level=None): + def melt(self, + id_vars=None, + value_vars=None, + var_name=None, + value_name='value', + col_level=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") def memory_usage(self, index=True, deep=False): - def remote_func(df): return df.memory_usage(index=False, deep=deep) @@ -2850,9 +3107,18 @@ def remote_func(df): return result - def merge(self, right, how='inner', on=None, left_on=None, right_on=None, - left_index=False, right_index=False, sort=False, - suffixes=('_x', '_y'), copy=True, indicator=False, + def merge(self, + right, + how='inner', + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + sort=False, + suffixes=('_x', '_y'), + copy=True, + indicator=False, validate=None): """Database style join, where common columns in "on" are merged. @@ -2933,16 +3199,15 @@ def merge(self, right, how='inner', on=None, left_on=None, right_on=None, left_on = [left_on] if next((True for key in left_on if key not in self), False): - raise KeyError(next(key for key in left_on - if key not in self)) + raise KeyError(next(key for key in left_on if key not in self)) if right_on is not None: if not is_list_like(right_on): right_on = [right_on] if next((True for key in right_on if key not in right), False): - raise KeyError(next(key for key in right_on - if key not in right)) + raise KeyError( + next(key for key in right_on if key not in right)) # There's a small chance that our partitions are already perfect, but # if it's not, we need to adjust them. We adjust the right against the @@ -2952,10 +3217,12 @@ def merge(self, right, how='inner', on=None, left_on=None, right_on=None, if not np.array_equal(self._row_metadata._lengths, right._row_metadata._lengths) or right_index: - repartitioned_right = np.array([_match_partitioning._submit( - args=(df, self._row_metadata._lengths, right.index), - num_return_vals=len(self._row_metadata._lengths)) - for df in right._col_partitions]).T + repartitioned_right = np.array([ + _match_partitioning._submit( + args=(df, self._row_metadata._lengths, right.index), + num_return_vals=len(self._row_metadata._lengths)) + for df in right._col_partitions + ]).T else: repartitioned_right = right._block_partitions @@ -2995,11 +3262,14 @@ def merge(self, right, how='inner', on=None, left_on=None, right_on=None, new_index = _concat_index.remote(*new_index_parts) new_blocks = new_blocks[:, :-1] - return DataFrame(block_partitions=new_blocks, - columns=new_columns, - index=new_index) + return DataFrame( + block_partitions=new_blocks, columns=new_columns, index=new_index) - def min(self, axis=None, skipna=None, level=None, numeric_only=None, + def min(self, + axis=None, + skipna=None, + level=None, + numeric_only=None, **kwargs): """Perform min across the DataFrame. @@ -3010,9 +3280,14 @@ def min(self, axis=None, skipna=None, level=None, numeric_only=None, Returns: The min of the DataFrame. 
""" + def remote_func(df): - return df.min(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, **kwargs) + return df.min( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs) return self._arithmetic_helper(remote_func, axis, level) @@ -3054,22 +3329,23 @@ def fix_length(df, *lengths): parts = self._col_partitions if axis == 0 else self._row_partitions - result = [_deploy_func._submit(args=(lambda df: mode_helper(df), - part), num_return_vals=2) - for part in parts] + result = [ + _deploy_func._submit( + args=(lambda df: mode_helper(df), part), num_return_vals=2) + for part in parts + ] parts, lengths = [list(t) for t in zip(*result)] - parts = [_deploy_func.remote( - lambda df, *l: fix_length(df, l), part, *lengths) - for part in parts] + parts = [ + _deploy_func.remote(lambda df, *l: fix_length(df, l), part, + *lengths) for part in parts + ] if axis == 0: - return DataFrame(col_partitions=parts, - columns=self.columns) + return DataFrame(col_partitions=parts, columns=self.columns) else: - return DataFrame(row_partitions=parts, - index=self.index) + return DataFrame(row_partitions=parts, index=self.index) def mul(self, other, axis='columns', level=None, fill_value=None): """Multiplies this DataFrame against another DataFrame/Series/scalar. @@ -3128,16 +3404,19 @@ def notna(self): Boolean DataFrame where value is False if corresponding value is NaN, True otherwise """ - new_block_partitions = np.array([_map_partitions( - lambda df: df.notna(), block) for block in self._block_partitions]) + new_block_partitions = np.array([ + _map_partitions(lambda df: df.notna(), block) + for block in self._block_partitions + ]) - new_dtypes = pandas.Series([np.dtype("bool")] * len(self.columns), - index=self.columns) + new_dtypes = pandas.Series( + [np.dtype("bool")] * len(self.columns), index=self.columns) - return DataFrame(block_partitions=new_block_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata, - dtypes_cache=new_dtypes) + return DataFrame( + block_partitions=new_block_partitions, + row_metadata=self._row_metadata, + col_metadata=self._col_metadata, + dtypes_cache=new_dtypes) def notnull(self): """Perform notnull across the DataFrame. 
@@ -3149,17 +3428,19 @@ def notnull(self): Boolean DataFrame where value is False if corresponding value is NaN, True otherwise """ - new_block_partitions = np.array([_map_partitions( - lambda df: df.notnull(), block) - for block in self._block_partitions]) + new_block_partitions = np.array([ + _map_partitions(lambda df: df.notnull(), block) + for block in self._block_partitions + ]) - new_dtypes = pandas.Series([np.dtype("bool")] * len(self.columns), - index=self.columns) + new_dtypes = pandas.Series( + [np.dtype("bool")] * len(self.columns), index=self.columns) - return DataFrame(block_partitions=new_block_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata, - dtypes_cache=new_dtypes) + return DataFrame( + block_partitions=new_block_partitions, + row_metadata=self._row_metadata, + col_metadata=self._col_metadata, + dtypes_cache=new_dtypes) def nsmallest(self, n, columns, keep='first'): raise NotImplementedError( @@ -3177,12 +3458,17 @@ def nunique(self, axis=0, dropna=True): Returns: nunique : Series """ + def remote_func(df): return df.nunique(axis=axis, dropna=dropna) return self._arithmetic_helper(remote_func, axis) - def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, + def pct_change(self, + periods=1, + fill_method='pad', + limit=None, + freq=None, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " @@ -3206,20 +3492,50 @@ def pivot(self, index=None, columns=None, values=None): "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def pivot_table(self, values=None, index=None, columns=None, - aggfunc='mean', fill_value=None, margins=False, - dropna=True, margins_name='All'): + def pivot_table(self, + values=None, + index=None, + columns=None, + aggfunc='mean', + fill_value=None, + margins=False, + dropna=True, + margins_name='All'): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def plot(self, x=None, y=None, kind='line', ax=None, subplots=False, - sharex=None, sharey=False, layout=None, figsize=None, - use_index=True, title=None, grid=None, legend=True, style=None, - logx=False, logy=False, loglog=False, xticks=None, yticks=None, - xlim=None, ylim=None, rot=None, fontsize=None, colormap=None, - table=False, yerr=None, xerr=None, secondary_y=False, - sort_columns=False, **kwds): + def plot(self, + x=None, + y=None, + kind='line', + ax=None, + subplots=False, + sharex=None, + sharey=False, + layout=None, + figsize=None, + use_index=True, + title=None, + grid=None, + legend=True, + style=None, + logx=False, + logy=False, + loglog=False, + xticks=None, + yticks=None, + xlim=None, + ylim=None, + rot=None, + fontsize=None, + colormap=None, + table=False, + yerr=None, + xerr=None, + secondary_y=False, + sort_columns=False, + **kwds): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") @@ -3253,8 +3569,13 @@ def pow(self, other, axis='columns', level=None, fill_value=None): return self._operator_helper(pandas.DataFrame.pow, other, axis, level, fill_value) - def prod(self, axis=None, skipna=None, level=None, numeric_only=None, - min_count=1, **kwargs): + def prod(self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=1, + **kwargs): """Return the product of the values for the requested axis Args: @@ -3267,15 +3588,25 @@ def prod(self, axis=None, skipna=None, level=None, numeric_only=None, Returns: prod : Series or DataFrame (if 
level specified) """ + def remote_func(df): - return df.prod(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, min_count=min_count, - **kwargs) + return df.prod( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs) return self._arithmetic_helper(remote_func, axis, level) - def product(self, axis=None, skipna=None, level=None, numeric_only=None, - min_count=1, **kwargs): + def product(self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=1, + **kwargs): """Return the product of the values for the requested axis Args: @@ -3288,11 +3619,18 @@ def product(self, axis=None, skipna=None, level=None, numeric_only=None, Returns: product : Series or DataFrame (if level specified) """ - return self.prod(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, min_count=min_count, - **kwargs) - - def quantile(self, q=0.5, axis=0, numeric_only=True, + return self.prod( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs) + + def quantile(self, + q=0.5, + axis=0, + numeric_only=True, interpolation='linear'): """Return values at the given quantile over requested axis, a la numpy.percentile. @@ -3351,14 +3689,16 @@ def quantile_helper(df, base_object): A new Series or DataFrame depending on q. """ # This if call prevents ValueErrors with object only partitions - if (numeric_only and - all(dtype == np.dtype('O') or - is_timedelta64_dtype(dtype) - for dtype in df.dtypes)): + if (numeric_only and all( + dtype == np.dtype('O') or is_timedelta64_dtype(dtype) + for dtype in df.dtypes)): return base_object else: - return df.quantile(q=q, axis=axis, numeric_only=numeric_only, - interpolation=interpolation) + return df.quantile( + q=q, + axis=axis, + numeric_only=numeric_only, + interpolation=interpolation) axis = pandas.DataFrame()._get_axis_number(axis) @@ -3373,7 +3713,7 @@ def quantile_helper(df, base_object): # select only correct dtype columns new_columns = self.dtypes[self.dtypes.apply( - lambda x: is_numeric_dtype(x))].index + lambda x: is_numeric_dtype(x))].index else: new_partitions = _map_partitions( @@ -3381,9 +3721,10 @@ def quantile_helper(df, base_object): self._row_partitions) new_columns = self.index - return DataFrame(col_partitions=new_partitions, - index=q_index, - columns=new_columns) + return DataFrame( + col_partitions=new_partitions, + index=q_index, + columns=new_columns) else: # When q is a single float, we return a Series, so using @@ -3410,21 +3751,24 @@ def query_helper(df): df.columns = pandas.RangeIndex(0, len(df.columns)) return df - new_rows = _map_partitions(query_helper, - self._row_partitions) + new_rows = _map_partitions(query_helper, self._row_partitions) if inplace: self._update_inplace(row_partitions=new_rows, index=self.index) else: - return DataFrame(row_partitions=new_rows, - col_metadata=self._col_metadata) + return DataFrame( + row_partitions=new_rows, col_metadata=self._col_metadata) def radd(self, other, axis='columns', level=None, fill_value=None): return self.add(other, axis, level, fill_value) - def rank(self, axis=0, method='average', numeric_only=None, - na_option='keep', ascending=True, pct=False): - + def rank(self, + axis=0, + method='average', + numeric_only=None, + na_option='keep', + ascending=True, + pct=False): """ Compute numerical data ranks (1 through n) along axis. 
Equal values are assigned a rank that is the [method] of @@ -3448,37 +3792,44 @@ def rank(self, axis=0, method='average', numeric_only=None, """ def rank_helper(df): - return df.rank(axis=axis, method=method, - numeric_only=numeric_only, - na_option=na_option, - ascending=ascending, pct=pct) + return df.rank( + axis=axis, + method=method, + numeric_only=numeric_only, + na_option=na_option, + ascending=ascending, + pct=pct) axis = pandas.DataFrame()._get_axis_number(axis) if (axis == 1): new_cols = self.dtypes[self.dtypes.apply( - lambda x: is_numeric_dtype(x))].index - result = _map_partitions(rank_helper, - self._row_partitions) - return DataFrame(row_partitions=result, - columns=new_cols, - index=self.index) + lambda x: is_numeric_dtype(x))].index + result = _map_partitions(rank_helper, self._row_partitions) + return DataFrame( + row_partitions=result, columns=new_cols, index=self.index) if (axis == 0): - result = _map_partitions(rank_helper, - self._col_partitions) - return DataFrame(col_partitions=result, - columns=self.columns, - index=self.index) + result = _map_partitions(rank_helper, self._col_partitions) + return DataFrame( + col_partitions=result, columns=self.columns, index=self.index) def rdiv(self, other, axis='columns', level=None, fill_value=None): return self._single_df_op_helper( - lambda df: df.rdiv(other, axis, level, fill_value), - other, axis, level) - - def reindex(self, labels=None, index=None, columns=None, axis=None, - method=None, copy=True, level=None, fill_value=np.nan, - limit=None, tolerance=None): + lambda df: df.rdiv(other, axis, level, fill_value), other, axis, + level) + + def reindex(self, + labels=None, + index=None, + columns=None, + axis=None, + method=None, + copy=True, + level=None, + fill_value=np.nan, + limit=None, + tolerance=None): if level is not None: raise NotImplementedError( "Multilevel Index not Implemented. 
" @@ -3495,47 +3846,66 @@ def reindex(self, labels=None, index=None, columns=None, axis=None, new_blocks = self._block_partitions if index is not None: old_index = self.index - new_blocks = np.array([reindex_helper._submit( - args=(old_index, index, 1, len(new_blocks), method, - fill_value, limit, tolerance) + tuple(block.tolist()), - num_return_vals=len(new_blocks)) - for block in new_blocks.T]).T + new_blocks = np.array([ + reindex_helper._submit( + args=(old_index, index, 1, len(new_blocks), method, + fill_value, limit, tolerance) + tuple( + block.tolist()), + num_return_vals=len(new_blocks)) for block in new_blocks.T + ]).T else: index = self.index if columns is not None: old_columns = self.columns - new_blocks = np.array([reindex_helper._submit( - args=(old_columns, columns, 0, new_blocks.shape[1], method, - fill_value, limit, tolerance) + tuple(block.tolist()), - num_return_vals=new_blocks.shape[1]) - for block in new_blocks]) + new_blocks = np.array([ + reindex_helper._submit( + args=(old_columns, columns, 0, new_blocks.shape[1], method, + fill_value, limit, tolerance) + tuple( + block.tolist()), + num_return_vals=new_blocks.shape[1]) + for block in new_blocks + ]) else: columns = self.columns if copy: - return DataFrame(block_partitions=new_blocks, - index=index, - columns=columns) - - self._update_inplace(block_partitions=new_blocks, - index=index, - columns=columns) - - def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, - limit=None, fill_value=np.nan): + return DataFrame( + block_partitions=new_blocks, index=index, columns=columns) + + self._update_inplace( + block_partitions=new_blocks, index=index, columns=columns) + + def reindex_axis(self, + labels, + axis=0, + method=None, + level=None, + copy=True, + limit=None, + fill_value=np.nan): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def reindex_like(self, other, method=None, copy=True, limit=None, + def reindex_like(self, + other, + method=None, + copy=True, + limit=None, tolerance=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def rename(self, mapper=None, index=None, columns=None, axis=None, - copy=True, inplace=False, level=None): + def rename(self, + mapper=None, + index=None, + columns=None, + axis=None, + copy=True, + inplace=False, + level=None): """Alters axes labels. Args: @@ -3555,8 +3925,10 @@ def rename(self, mapper=None, index=None, columns=None, axis=None, # kwargs. It doesn't ignore None values passed in, so we have to filter # them ourselves. args = locals() - kwargs = {k: v for k, v in args.items() - if v is not None and k != "self"} + kwargs = { + k: v + for k, v in args.items() if v is not None and k != "self" + } # inplace should always be true because this is just a copy, and we # will use the results after. 
kwargs['inplace'] = True @@ -3611,20 +3983,41 @@ def reorder_levels(self, order, axis=0): "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def replace(self, to_replace=None, value=None, inplace=False, limit=None, - regex=False, method='pad', axis=None): + def replace(self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method='pad', + axis=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, - label=None, convention='start', kind=None, loffset=None, - limit=None, base=0, on=None, level=None): + def resample(self, + rule, + how=None, + axis=0, + fill_method=None, + closed=None, + label=None, + convention='start', + kind=None, + loffset=None, + limit=None, + base=0, + on=None, + level=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def reset_index(self, level=None, drop=False, inplace=False, col_level=0, + def reset_index(self, + level=None, + drop=False, + inplace=False, + col_level=0, col_fill=''): """Reset this index to default and create column from current index. @@ -3689,8 +4082,10 @@ def _maybe_casted_values(index, labels=None): if not drop: if isinstance(self.index, pandas.MultiIndex): - names = [n if n is not None else ('level_%d' % i) - for (i, n) in enumerate(self.index.names)] + names = [ + n if n is not None else ('level_%d' % i) + for (i, n) in enumerate(self.index.names) + ] to_insert = lzip(self.index.levels, self.index.labels) else: default = 'index' @@ -3699,9 +4094,9 @@ def _maybe_casted_values(index, labels=None): default = 'level_{}'.format(i) i += 1 - names = ([default] if self.index.name is None - else [self.index.name]) - to_insert = ((self.index, None),) + names = ([default] + if self.index.name is None else [self.index.name]) + to_insert = ((self.index, None), ) multi_col = isinstance(self.columns, pandas.MultiIndex) for i, (lev, lab) in reversed(list(enumerate(to_insert))): @@ -3709,8 +4104,8 @@ def _maybe_casted_values(index, labels=None): continue name = names[i] if multi_col: - col_name = (list(name) if isinstance(name, tuple) - else [name]) + col_name = (list(name) + if isinstance(name, tuple) else [name]) if col_fill is None: if len(col_name) not in (1, self.columns.nlevels): raise ValueError("col_fill=None is incompatible " @@ -3734,49 +4129,64 @@ def _maybe_casted_values(index, labels=None): def rfloordiv(self, other, axis='columns', level=None, fill_value=None): return self._single_df_op_helper( - lambda df: df.rfloordiv(other, axis, level, fill_value), - other, axis, level) + lambda df: df.rfloordiv(other, axis, level, fill_value), other, + axis, level) def rmod(self, other, axis='columns', level=None, fill_value=None): return self._single_df_op_helper( - lambda df: df.rmod(other, axis, level, fill_value), - other, axis, level) + lambda df: df.rmod(other, axis, level, fill_value), other, axis, + level) def rmul(self, other, axis='columns', level=None, fill_value=None): return self.mul(other, axis, level, fill_value) - def rolling(self, window, min_periods=None, freq=None, center=False, - win_type=None, on=None, axis=0, closed=None): + def rolling(self, + window, + min_periods=None, + freq=None, + center=False, + win_type=None, + on=None, + axis=0, + closed=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") def 
round(self, decimals=0, *args, **kwargs): - new_block_partitions = np.array([_map_partitions( - lambda df: df.round(decimals=decimals, *args, **kwargs), block) - for block in self._block_partitions]) + new_block_partitions = np.array([ + _map_partitions( + lambda df: df.round(decimals=decimals, *args, **kwargs), block) + for block in self._block_partitions + ]) - return DataFrame(block_partitions=new_block_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata) + return DataFrame( + block_partitions=new_block_partitions, + row_metadata=self._row_metadata, + col_metadata=self._col_metadata) def rpow(self, other, axis='columns', level=None, fill_value=None): return self._single_df_op_helper( - lambda df: df.rpow(other, axis, level, fill_value), - other, axis, level) + lambda df: df.rpow(other, axis, level, fill_value), other, axis, + level) def rsub(self, other, axis='columns', level=None, fill_value=None): return self._single_df_op_helper( - lambda df: df.rsub(other, axis, level, fill_value), - other, axis, level) + lambda df: df.rsub(other, axis, level, fill_value), other, axis, + level) def rtruediv(self, other, axis='columns', level=None, fill_value=None): return self._single_df_op_helper( - lambda df: df.rtruediv(other, axis, level, fill_value), - other, axis, level) - - def sample(self, n=None, frac=None, replace=False, weights=None, - random_state=None, axis=None): + lambda df: df.rtruediv(other, axis, level, fill_value), other, + axis, level) + + def sample(self, + n=None, + frac=None, + replace=False, + weights=None, + random_state=None, + axis=None): """Returns a random sample of items from an axis of object. Args: @@ -3882,8 +4292,9 @@ def sample(self, n=None, frac=None, replace=False, weights=None, # An Empty DataFrame is returned if the number of samples is 0. # The Empty Dataframe should have either columns or index specified # depending on which axis is passed in. 
- return DataFrame(columns=[] if axis == 1 else self.columns, - index=self.index if axis == 1 else []) + return DataFrame( + columns=[] if axis == 1 else self.columns, + index=self.index if axis == 1 else []) if axis == 1: axis_labels = self.columns @@ -3909,34 +4320,35 @@ def sample(self, n=None, frac=None, replace=False, weights=None, # choose random numbers and then get corresponding labels from # chosen axis sample_indices = random_num_gen.randint( - low=0, - high=len(partition_metadata), - size=n) + low=0, high=len(partition_metadata), size=n) samples = axis_labels[sample_indices] else: # randomly select labels from chosen axis - samples = np.random.choice(a=axis_labels, size=n, - replace=replace, p=weights) + samples = np.random.choice( + a=axis_labels, size=n, replace=replace, p=weights) # create an array of (partition, index_within_partition) tuples for # each sample - part_ind_tuples = [partition_metadata[sample] - for sample in samples] + part_ind_tuples = [partition_metadata[sample] for sample in samples] if axis == 1: # tup[0] refers to the partition number and tup[1] is the index # within that partition - new_cols = [_deploy_func.remote(lambda df: df.iloc[:, [tup[1]]], - partitions[tup[0]]) for tup in part_ind_tuples] - return DataFrame(col_partitions=new_cols, - columns=samples, - index=self.index) + new_cols = [ + _deploy_func.remote(lambda df: df.iloc[:, [tup[1]]], + partitions[tup[0]]) + for tup in part_ind_tuples + ] + return DataFrame( + col_partitions=new_cols, columns=samples, index=self.index) else: - new_rows = [_deploy_func.remote(lambda df: df.loc[[tup[1]]], - partitions[tup[0]]) for tup in part_ind_tuples] - return DataFrame(row_partitions=new_rows, - columns=self.columns, - index=samples) + new_rows = [ + _deploy_func.remote(lambda df: df.loc[[tup[1]]], + partitions[tup[0]]) + for tup in part_ind_tuples + ] + return DataFrame( + row_partitions=new_rows, columns=self.columns, index=samples) def select(self, crit, axis=0): raise NotImplementedError( @@ -3960,8 +4372,8 @@ def select_dtypes(self, include=None, exclude=None): sel = tuple(map(set, (include, exclude))) - include, exclude = map( - lambda x: set(map(_get_dtype_from_object, x)), sel) + include, exclude = map(lambda x: set(map(_get_dtype_from_object, x)), + sel) include_these = pandas.Series(not bool(include), index=self.columns) exclude_these = pandas.Series(not bool(exclude), index=self.columns) @@ -3977,12 +4389,19 @@ def is_dtype_instance_mapper(column, dtype): exclude_these[column] = not any(map(f, exclude)) dtype_indexer = include_these & exclude_these - indicate = [i for i in range(len(dtype_indexer.values)) - if not dtype_indexer.values[i]] + indicate = [ + i for i in range(len(dtype_indexer.values)) + if not dtype_indexer.values[i] + ] return self.drop(columns=self.columns[indicate], inplace=False) - def sem(self, axis=None, skipna=None, level=None, ddof=1, - numeric_only=None, **kwargs): + def sem(self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") @@ -4004,7 +4423,8 @@ def set_axis(self, labels, axis=0, inplace=None): '"axis" as named parameter. 
The old form, with "axis" as ' 'first parameter and \"labels\" as second, is still supported ' 'but will be deprecated in a future version of pandas.', - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2) labels, axis = axis, labels if inplace is None: @@ -4012,7 +4432,8 @@ def set_axis(self, labels, axis=0, inplace=None): 'set_axis currently defaults to operating inplace.\nThis ' 'will change in a future version of pandas, use ' 'inplace=True to avoid this warning.', - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2) inplace = True if inplace: setattr(self, pandas.DataFrame()._get_axis_name(axis), labels) @@ -4021,7 +4442,11 @@ def set_axis(self, labels, axis=0, inplace=None): obj.set_axis(labels, axis=axis, inplace=True) return obj - def set_index(self, keys, drop=True, append=False, inplace=False, + def set_index(self, + keys, + drop=True, + append=False, + inplace=False, verify_integrity=False): """Set the DataFrame index using one or more existing columns. @@ -4109,7 +4534,11 @@ def shift(self, periods=1, freq=None, axis=0): "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def skew(self, axis=None, skipna=None, level=None, numeric_only=None, + def skew(self, + axis=None, + skipna=None, + level=None, + numeric_only=None, **kwargs): """Return unbiased skew over requested axis Normalized by N-1 @@ -4123,9 +4552,14 @@ def skew(self, axis=None, skipna=None, level=None, numeric_only=None, Returns: skew : Series or DataFrame (if level specified) """ + def remote_func(df): - return df.skew(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, **kwargs) + return df.skew( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs) return self._arithmetic_helper(remote_func, axis, level) @@ -4134,8 +4568,14 @@ def slice_shift(self, periods=1, axis=0): "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def sort_index(self, axis=0, level=None, ascending=True, inplace=False, - kind='quicksort', na_position='last', sort_remaining=True, + def sort_index(self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind='quicksort', + na_position='last', + sort_remaining=True, by=None): """Sort a DataFrame by one of the indices (columns or index). 
@@ -4156,13 +4596,15 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, raise NotImplementedError("Multilevel index not yet implemented.") if by is not None: - warnings.warn("by argument to sort_index is deprecated, " - "please use .sort_values(by=...)", - FutureWarning, stacklevel=2) + warnings.warn( + "by argument to sort_index is deprecated, " + "please use .sort_values(by=...)", + FutureWarning, + stacklevel=2) if level is not None: raise ValueError("unable to simultaneously sort by and level") - return self.sort_values(by, axis=axis, ascending=ascending, - inplace=inplace) + return self.sort_values( + by, axis=axis, ascending=ascending, inplace=inplace) axis = pandas.DataFrame()._get_axis_number(axis) @@ -4200,18 +4642,25 @@ def _sort_helper(df, index, axis, *args): new_column_parts = None if not inplace: - return DataFrame(col_partitions=new_column_parts, - row_partitions=new_row_parts, - index=new_index, - columns=new_columns) + return DataFrame( + col_partitions=new_column_parts, + row_partitions=new_row_parts, + index=new_index, + columns=new_columns) else: - self._update_inplace(row_partitions=new_row_parts, - col_partitions=new_column_parts, - columns=new_columns, - index=new_index) - - def sort_values(self, by, axis=0, ascending=True, inplace=False, - kind='quicksort', na_position='last'): + self._update_inplace( + row_partitions=new_row_parts, + col_partitions=new_column_parts, + columns=new_columns, + index=new_index) + + def sort_values(self, + by, + axis=0, + ascending=True, + inplace=False, + kind='quicksort', + na_position='last'): """Sorts by a column/row or list of columns/rows. Args: @@ -4235,8 +4684,9 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, broadcast_value_dict = {str(col): self[col] for col in by} broadcast_values = pandas.DataFrame(broadcast_value_dict) else: - broadcast_value_list = [to_pandas(self[row::len(self.index)]) - for row in by] + broadcast_value_list = [ + to_pandas(self[row::len(self.index)]) for row in by + ] index_builder = list(zip(broadcast_value_list, by)) @@ -4298,18 +4748,24 @@ def _sort_helper(df, broadcast_values, axis, *args): new_index = self.index if inplace: - self._update_inplace(row_partitions=new_row_partitions, - col_partitions=new_column_partitions, - columns=new_columns, - index=new_index) + self._update_inplace( + row_partitions=new_row_partitions, + col_partitions=new_column_partitions, + columns=new_columns, + index=new_index) else: - return DataFrame(row_partitions=new_row_partitions, - col_partitions=new_column_partitions, - columns=new_columns, - index=new_index, - dtypes_cache=self._dtypes_cache) - - def sortlevel(self, level=0, axis=0, ascending=True, inplace=False, + return DataFrame( + row_partitions=new_row_partitions, + col_partitions=new_column_partitions, + columns=new_columns, + index=new_index, + dtypes_cache=self._dtypes_cache) + + def sortlevel(self, + level=0, + axis=0, + ascending=True, + inplace=False, sort_remaining=True): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " @@ -4325,8 +4781,13 @@ def stack(self, level=-1, dropna=True): "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def std(self, axis=None, skipna=None, level=None, ddof=1, - numeric_only=None, **kwargs): + def std(self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs): """Computes standard deviation across the DataFrame. 
Args: @@ -4337,9 +4798,15 @@ def std(self, axis=None, skipna=None, level=None, ddof=1, Returns: The std of the DataFrame (Pandas Series) """ + def remote_func(df): - return df.std(axis=axis, skipna=skipna, level=level, ddof=ddof, - numeric_only=numeric_only, **kwargs) + return df.std( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs) return self._arithmetic_helper(remote_func, axis, level) @@ -4396,11 +4863,12 @@ def _tail_block_builder(self, n): # We are building the blocks in reverse order, then reversing the # numpy array order - return np.array( - [self._block_partitions[npartitions - i] if i != idx - else [_deploy_func.remote(lambda df: df.tail(remaining), blk) - for blk in self._block_partitions[npartitions - i]] - for i in range(idx + 1)])[::-1] + return np.array([ + self._block_partitions[npartitions - i] if i != idx else [ + _deploy_func.remote(lambda df: df.tail(remaining), blk) + for blk in self._block_partitions[npartitions - i] + ] for i in range(idx + 1) + ])[::-1] def tail(self, n=5): """Get the last n rows of the DataFrame. @@ -4417,10 +4885,11 @@ def tail(self, n=5): new_blocks = self._tail_block_builder(n) index = self._row_metadata.index[-n:] - return DataFrame(block_partitions=new_blocks, - col_metadata=self._col_metadata, - index=index, - dtypes_cache=self._dtypes_cache) + return DataFrame( + block_partitions=new_blocks, + col_metadata=self._col_metadata, + index=index, + dtypes_cache=self._dtypes_cache) def take(self, indices, axis=0, convert=None, is_copy=True, **kwargs): raise NotImplementedError( @@ -4435,12 +4904,27 @@ def to_clipboard(self, excel=None, sep=None, **kwargs): port_frame = to_pandas(self) port_frame.to_clipboard(excel, sep, **kwargs) - def to_csv(self, path_or_buf=None, sep=",", na_rep="", float_format=None, - columns=None, header=True, index=True, index_label=None, - mode="w", encoding=None, compression=None, quoting=None, - quotechar='"', line_terminator="\n", chunksize=None, - tupleize_cols=None, date_format=None, doublequote=True, - escapechar=None, decimal="."): + def to_csv(self, + path_or_buf=None, + sep=",", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + mode="w", + encoding=None, + compression=None, + quoting=None, + quotechar='"', + line_terminator="\n", + chunksize=None, + tupleize_cols=None, + date_format=None, + doublequote=True, + escapechar=None, + decimal="."): kwargs = { 'path_or_buf': path_or_buf, @@ -4471,9 +4955,11 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep="", float_format=None, return to_pandas(self).to_csv(**kwargs) if tupleize_cols is not None: - warnings.warn("The 'tupleize_cols' parameter is deprecated and " - "will be removed in a future version", - FutureWarning, stacklevel=2) + warnings.warn( + "The 'tupleize_cols' parameter is deprecated and " + "will be removed in a future version", + FutureWarning, + stacklevel=2) else: tupleize_cols = False @@ -4487,8 +4973,10 @@ def get_csv_str(df, index, columns, header, kwargs): return df.to_csv(**kwargs) idxs = [0] + np.cumsum(self._row_metadata._lengths).tolist() - idx_args = [self.index[idxs[i]:idxs[i + 1]] - for i in range(len(self._row_partitions))] + idx_args = [ + self.index[idxs[i]:idxs[i + 1]] + for i in range(len(self._row_partitions)) + ] csv_str_ids = _map_partitions( get_csv_str, self._row_partitions, idx_args, [columns_id] * len(self._row_partitions), @@ -4524,21 +5012,32 @@ def to_dict(self, orient='dict', into=dict): "To contribute to Pandas on Ray, 
please visit " "github.com/modin-project/modin.") - def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', - float_format=None, columns=None, header=True, index=True, - index_label=None, startrow=0, startcol=0, engine=None, - merge_cells=True, encoding=None, inf_rep='inf', verbose=True, + def to_excel(self, + excel_writer, + sheet_name='Sheet1', + na_rep='', + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + startrow=0, + startcol=0, + engine=None, + merge_cells=True, + encoding=None, + inf_rep='inf', + verbose=True, freeze_panes=None): warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) - port_frame.to_excel(excel_writer, sheet_name, na_rep, - float_format, columns, header, index, - index_label, startrow, startcol, engine, - merge_cells, encoding, inf_rep, verbose, - freeze_panes) + port_frame.to_excel(excel_writer, sheet_name, na_rep, float_format, + columns, header, index, index_label, startrow, + startcol, engine, merge_cells, encoding, inf_rep, + verbose, freeze_panes) def to_feather(self, fname): @@ -4548,8 +5047,13 @@ def to_feather(self, fname): port_frame = to_pandas(self) port_frame.to_feather(fname) - def to_gbq(self, destination_table, project_id, chunksize=10000, - verbose=True, reauth=False, if_exists='fail', + def to_gbq(self, + destination_table, + project_id, + chunksize=10000, + verbose=True, + reauth=False, + if_exists='fail', private_key=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " @@ -4563,42 +5067,77 @@ def to_hdf(self, path_or_buf, key, **kwargs): port_frame = to_pandas(self) port_frame.to_hdf(path_or_buf, key, **kwargs) - def to_html(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='np.NaN', formatters=None, - float_format=None, sparsify=None, index_names=True, - justify=None, bold_rows=True, classes=None, escape=True, - max_rows=None, max_cols=None, show_dimensions=False, - notebook=False, decimal='.', border=None): + def to_html(self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep='np.NaN', + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + justify=None, + bold_rows=True, + classes=None, + escape=True, + max_rows=None, + max_cols=None, + show_dimensions=False, + notebook=False, + decimal='.', + border=None): warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) - port_frame.to_html(buf, columns, col_space, header, - index, na_rep, formatters, - float_format, sparsify, index_names, - justify, bold_rows, classes, escape, - max_rows, max_cols, show_dimensions, - notebook, decimal, border) - - def to_json(self, path_or_buf=None, orient=None, date_format=None, - double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None, lines=False, compression=None): + port_frame.to_html(buf, columns, col_space, header, index, na_rep, + formatters, float_format, sparsify, index_names, + justify, bold_rows, classes, escape, max_rows, + max_cols, show_dimensions, notebook, decimal, + border) + + def to_json(self, + path_or_buf=None, + orient=None, + date_format=None, + double_precision=10, + force_ascii=True, + date_unit='ms', + default_handler=None, + lines=False, + compression=None): warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) - port_frame.to_json(path_or_buf, orient, date_format, - double_precision, force_ascii, 
date_unit, - default_handler, lines, compression) - - def to_latex(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='np.NaN', formatters=None, - float_format=None, sparsify=None, index_names=True, - bold_rows=False, column_format=None, longtable=None, - escape=None, encoding=None, decimal='.', multicolumn=None, - multicolumn_format=None, multirow=None): + port_frame.to_json(path_or_buf, orient, date_format, double_precision, + force_ascii, date_unit, default_handler, lines, + compression) + + def to_latex(self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep='np.NaN', + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + bold_rows=False, + column_format=None, + longtable=None, + escape=None, + encoding=None, + decimal='.', + multicolumn=None, + multicolumn_format=None, + multirow=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") @@ -4616,8 +5155,7 @@ def to_panel(self): "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def to_parquet(self, fname, engine='auto', compression='snappy', - **kwargs): + def to_parquet(self, fname, engine='auto', compression='snappy', **kwargs): warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) @@ -4630,7 +5168,9 @@ def to_period(self, freq=None, axis=0, copy=True): "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def to_pickle(self, path, compression='infer', + def to_pickle(self, + path, + compression='infer', protocol=pkl.HIGHEST_PROTOCOL): warnings.warn("Defaulting to Pandas implementation", @@ -4649,32 +5189,56 @@ def to_sparse(self, fill_value=None, kind='block'): "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail', - index=True, index_label=None, chunksize=None, dtype=None): + def to_sql(self, + name, + con, + flavor=None, + schema=None, + if_exists='fail', + index=True, + index_label=None, + chunksize=None, + dtype=None): warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) - port_frame.to_sql(name, con, flavor, schema, if_exists, - index, index_label, chunksize, dtype) - - def to_stata(self, fname, convert_dates=None, write_index=True, - encoding='latin-1', byteorder=None, time_stamp=None, - data_label=None, variable_labels=None): + port_frame.to_sql(name, con, flavor, schema, if_exists, index, + index_label, chunksize, dtype) + + def to_stata(self, + fname, + convert_dates=None, + write_index=True, + encoding='latin-1', + byteorder=None, + time_stamp=None, + data_label=None, + variable_labels=None): warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) - port_frame.to_stata(fname, convert_dates, write_index, - encoding, byteorder, time_stamp, - data_label, variable_labels) - - def to_string(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='np.NaN', formatters=None, - float_format=None, sparsify=None, index_names=True, - justify=None, line_width=None, max_rows=None, max_cols=None, + port_frame.to_stata(fname, convert_dates, write_index, encoding, + byteorder, time_stamp, data_label, variable_labels) + + def to_string(self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep='np.NaN', + formatters=None, + float_format=None, + 
sparsify=None, + index_names=True, + justify=None, + line_width=None, + max_rows=None, + max_cols=None, show_dimensions=False): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " @@ -4741,7 +5305,11 @@ def unstack(self, level=-1, fill_value=None): "To contribute to Pandas on Ray, please visit " "github.com/modin-project/modin.") - def update(self, other, join='left', overwrite=True, filter_func=None, + def update(self, + other, + join='left', + overwrite=True, + filter_func=None, raise_conflict=False): """Modify DataFrame in place using non-NA values from other. @@ -4769,11 +5337,16 @@ def update_helper(x, y): x.update(y, join, overwrite, filter_func, False) return x - self._inter_df_op_helper(update_helper, other, join, 0, None, - inplace=True) + self._inter_df_op_helper( + update_helper, other, join, 0, None, inplace=True) - def var(self, axis=None, skipna=None, level=None, ddof=1, - numeric_only=None, **kwargs): + def var(self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs): """Computes variance across the DataFrame. Args: @@ -4784,14 +5357,27 @@ def var(self, axis=None, skipna=None, level=None, ddof=1, Returns: The variance of the DataFrame. """ + def remote_func(df): - return df.var(axis=axis, skipna=skipna, level=level, ddof=ddof, - numeric_only=numeric_only, **kwargs) + return df.var( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs) return self._arithmetic_helper(remote_func, axis, level) - def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, - errors='raise', try_cast=False, raise_on_error=None): + def where(self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors='raise', + try_cast=False, + raise_on_error=None): """Replaces values not meeting condition with values in other. Args: @@ -4836,13 +5422,13 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, args = (False, axis, level, errors, try_cast, raise_on_error) if isinstance(other, DataFrame): - other_zipped = (v for k, v in self._copartition(other, - self.index)) + other_zipped = (v for k, v in self._copartition(other, self.index)) - new_partitions = [_where_helper.remote(k, v, next(other_zipped), - self.columns, cond.columns, - other.columns, *args) - for k, v in zipped_partitions] + new_partitions = [ + _where_helper.remote(k, v, next(other_zipped), self.columns, + cond.columns, other.columns, *args) + for k, v in zipped_partitions + ] # Series has to be treated specially because we're operating on row # partitions from here on. 
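
The exporter hunks above (to_excel, to_html, to_json, to_sql, to_stata, and one branch of to_csv) all share a single fallback shape, which this reformatting leaves untouched: warn with PendingDeprecationWarning, materialize the distributed frame via to_pandas, and delegate to the pandas method of the same name. A minimal sketch of that shape, with a hypothetical TinyFrame standing in for the real DataFrame:

```python
import warnings
import pandas

class TinyFrame:
    """Hypothetical stand-in for the distributed DataFrame."""

    def __init__(self, row_partitions):
        self._row_partitions = row_partitions

    def _to_pandas(self):
        # Materialize every row partition into one pandas.DataFrame, the
        # way to_pandas(self) does in the hunks above.
        return pandas.concat(self._row_partitions, ignore_index=True)

    def to_html(self, *args, **kwargs):
        # Warn, materialize, delegate -- the shared exporter fallback.
        warnings.warn("Defaulting to Pandas implementation",
                      PendingDeprecationWarning)
        return self._to_pandas().to_html(*args, **kwargs)

df = TinyFrame([pandas.DataFrame({"a": [1, 2]}),
                pandas.DataFrame({"a": [3, 4]})])
print(df.to_html())  # returns the HTML string, exactly as pandas would
```

The warning makes the performance cliff explicit: every partition is pulled back into a single process before pandas does the actual work.
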
@@ -4865,34 +5451,35 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, other = (obj for obj in other_builder) - new_partitions = [_where_helper.remote(k, v, - next(other, - pandas.Series()), - self.columns, - cond.columns, - None, *args) - for k, v in zipped_partitions] + new_partitions = [ + _where_helper.remote(k, v, next(other, pandas.Series()), + self.columns, cond.columns, None, + *args) for k, v in zipped_partitions + ] else: other = other.reindex(self.columns) - new_partitions = [_where_helper.remote(k, v, other, - self.columns, - cond.columns, - None, *args) - for k, v in zipped_partitions] + new_partitions = [ + _where_helper.remote(k, v, other, self.columns, + cond.columns, None, *args) + for k, v in zipped_partitions + ] else: - new_partitions = [_where_helper.remote(k, v, other, self.columns, - cond.columns, None, *args) - for k, v in zipped_partitions] + new_partitions = [ + _where_helper.remote(k, v, other, self.columns, cond.columns, + None, *args) for k, v in zipped_partitions + ] if inplace: - self._update_inplace(row_partitions=new_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata) + self._update_inplace( + row_partitions=new_partitions, + row_metadata=self._row_metadata, + col_metadata=self._col_metadata) else: - return DataFrame(row_partitions=new_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata) + return DataFrame( + row_partitions=new_partitions, + row_metadata=self._row_metadata, + col_metadata=self._col_metadata) def xs(self, key, axis=0, level=None, drop_level=True): raise NotImplementedError( @@ -4948,11 +5535,14 @@ def _getitem_array(self, key): if com.is_bool_indexer(key): if isinstance(key, pandas.Series) and \ not key.index.equals(self.index): - warnings.warn("Boolean Series key will be reindexed to match " - "DataFrame index.", UserWarning, stacklevel=3) + warnings.warn( + "Boolean Series key will be reindexed to match " + "DataFrame index.", + UserWarning, + stacklevel=3) elif len(key) != len(self.index): raise ValueError('Item wrong length {} instead of {}.'.format( - len(key), len(self.index))) + len(key), len(self.index))) key = check_bool_indexer(self.index, key) new_parts = _map_partitions(lambda df: df[key], @@ -4960,9 +5550,8 @@ def _getitem_array(self, key): columns = self.columns index = self.index[key] - return DataFrame(col_partitions=new_parts, - columns=columns, - index=index) + return DataFrame( + col_partitions=new_parts, columns=columns, index=index) else: columns = self._col_metadata[key].index column_indices = {item: i for i, item in enumerate(self.columns)} @@ -4973,15 +5562,16 @@ def get_columns_partition(df): result.columns = pandas.RangeIndex(0, len(result.columns)) return result - new_parts = [_deploy_func.remote( - lambda df: df.__getitem__(indices_for_rows), - part) for part in self._row_partitions] + new_parts = [ + _deploy_func.remote( + lambda df: df.__getitem__(indices_for_rows), part) + for part in self._row_partitions + ] index = self.index - return DataFrame(row_partitions=new_parts, - columns=columns, - index=index) + return DataFrame( + row_partitions=new_parts, columns=columns, index=index) def _getitem_indiv_col(self, key, part): loc = self._col_metadata[key] @@ -4989,18 +5579,17 @@ def _getitem_indiv_col(self, key, part): index = loc[loc['partition'] == part] else: index = loc[loc['partition'] == part]['index_within_partition'] - return _deploy_func.remote( - lambda df: df.__getitem__(index), - self._col_partitions[part]) + return 
_deploy_func.remote(lambda df: df.__getitem__(index), + self._col_partitions[part]) def _getitem_slice(self, key): - new_cols = _map_partitions(lambda df: df[key], - self._col_partitions) + new_cols = _map_partitions(lambda df: df[key], self._col_partitions) index = self.index[key] - return DataFrame(col_partitions=new_cols, - col_metadata=self._col_metadata, - index=index) + return DataFrame( + col_partitions=new_cols, + col_metadata=self._col_metadata, + index=index) def __getattr__(self, key): """After regular attribute access, looks up the name in the columns @@ -5122,6 +5711,7 @@ def __delitem__(self, key): Args: key: key to delete """ + # Create helper method for deleting column(s) in row partition. def del_helper(df, to_delete): cols = df.columns[to_delete] # either int or an array of ints @@ -5302,19 +5892,20 @@ def __neg__(self): A modified DataFrame where every element is the negation of before """ for t in self.dtypes: - if not (is_bool_dtype(t) - or is_numeric_dtype(t) + if not (is_bool_dtype(t) or is_numeric_dtype(t) or is_timedelta64_dtype(t)): raise TypeError("Unary negative expects numeric dtype, not {}" .format(t)) - new_block_partitions = np.array([_map_partitions( - lambda df: df.__neg__(), block) - for block in self._block_partitions]) + new_block_partitions = np.array([ + _map_partitions(lambda df: df.__neg__(), block) + for block in self._block_partitions + ]) - return DataFrame(block_partitions=new_block_partitions, - col_metadata=self._col_metadata, - row_metadata=self._row_metadata) + return DataFrame( + block_partitions=new_block_partitions, + col_metadata=self._col_metadata, + row_metadata=self._row_metadata) def __sizeof__(self): raise NotImplementedError( @@ -5398,8 +5989,8 @@ def _copartition(self, other, new_index): new_index = ray.put(new_index) old_other_index = ray.put(other.index) - new_num_partitions = max(len(self._block_partitions.T), - len(other._block_partitions.T)) + new_num_partitions = max( + len(self._block_partitions.T), len(other._block_partitions.T)) new_partitions_self = \ np.array([_reindex_helper._submit( @@ -5421,12 +6012,12 @@ def _operator_helper(self, func, other, axis, level, *args): """Helper method for inter-DataFrame and scalar operations""" if isinstance(other, DataFrame): return self._inter_df_op_helper( - lambda x, y: func(x, y, axis, level, *args), - other, "outer", axis, level) + lambda x, y: func(x, y, axis, level, *args), other, "outer", + axis, level) else: return self._single_df_op_helper( - lambda df: func(df, other, axis, level, *args), - other, axis, level) + lambda df: func(df, other, axis, level, *args), other, axis, + level) def _inter_df_op_helper(self, func, other, how, axis, level, inplace=False): @@ -5449,13 +6040,15 @@ def _inter_df_op_helper(self, func, other, how, axis, level, if not inplace: # TODO join the Index Metadata objects together for performance. 
- return DataFrame(block_partitions=new_blocks, - columns=new_column_index, - index=new_index) + return DataFrame( + block_partitions=new_blocks, + columns=new_column_index, + index=new_index) else: - self._update_inplace(block_partitions=new_blocks, - columns=new_column_index, - index=new_index) + self._update_inplace( + block_partitions=new_blocks, + columns=new_column_index, + index=new_index) def _single_df_op_helper(self, func, other, axis, level): if level is not None: @@ -5486,8 +6079,10 @@ def _single_df_op_helper(self, func, other, axis, level): new_columns = None else: - new_blocks = np.array([_map_partitions(func, block) - for block in self._block_partitions]) + new_blocks = np.array([ + _map_partitions(func, block) + for block in self._block_partitions + ]) new_columns = None new_rows = None new_index = self.index @@ -5495,13 +6090,14 @@ def _single_df_op_helper(self, func, other, axis, level): new_col_metadata = self._col_metadata new_row_metadata = self._row_metadata - return DataFrame(col_partitions=new_columns, - row_partitions=new_rows, - block_partitions=new_blocks, - index=new_index, - columns=new_column_index, - col_metadata=new_col_metadata, - row_metadata=new_row_metadata) + return DataFrame( + col_partitions=new_columns, + row_partitions=new_rows, + block_partitions=new_blocks, + index=new_index, + columns=new_column_index, + col_metadata=new_col_metadata, + row_metadata=new_row_metadata) @ray.remote @@ -5522,8 +6118,8 @@ def _merge_columns(left_columns, right_columns, *args): @ray.remote -def _where_helper(left, cond, other, left_columns, cond_columns, - other_columns, *args): +def _where_helper(left, cond, other, left_columns, cond_columns, other_columns, + *args): left = pandas.concat(ray.get(left.tolist()), axis=1, copy=False) # We have to reset the index and columns here because we are coming @@ -5554,9 +6150,14 @@ def reindex_helper(old_index, new_index, axis, npartitions, method, fill_value, else: df.columns = old_index - df = df.reindex(new_index, copy=False, axis=axis ^ 1, - method=method, fill_value=fill_value, - limit=limit, tolerance=tolerance) + df = df.reindex( + new_index, + copy=False, + axis=axis ^ 1, + method=method, + fill_value=fill_value, + limit=limit, + tolerance=tolerance) return _create_blocks_helper(df, npartitions, axis) diff --git a/modin/pandas/datetimes.py b/modin/pandas/datetimes.py index a97d268b395..77a03de607c 100644 --- a/modin/pandas/datetimes.py +++ b/modin/pandas/datetimes.py @@ -10,9 +10,17 @@ from .utils import _map_partitions -def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, - box=True, format=None, exact=True, unit=None, - infer_datetime_format=False, origin='unix'): +def to_datetime(arg, + errors='raise', + dayfirst=False, + yearfirst=False, + utc=None, + box=True, + format=None, + exact=True, + unit=None, + infer_datetime_format=False, + origin='unix'): """Convert the arg to datetime format. If not Ray DataFrame, this falls back on pandas. 
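
_where_helper and reindex_helper above are module-level @ray.remote functions rather than methods, because partitions are shipped to workers carrying only positional RangeIndex labels; each helper therefore receives the real index or columns as explicit arguments, reattaches them, and only then calls the pandas operation. A stripped-down, hypothetical cousin of reindex_helper (assumes Ray is installed and no runtime is already attached in the process):

```python
import pandas
import ray

ray.init()  # assumption: Ray is installed and not yet initialized here

@ray.remote
def reindex_part(old_index, new_index, fill_value, df):
    # Partitions travel with positional labels, so reattach the real
    # index first, then let pandas perform the actual reindex.
    df = df.copy()
    df.index = old_index
    return df.reindex(new_index, fill_value=fill_value)

part = pandas.DataFrame({"a": [1.0, 2.0]})  # positional index 0..1
result = ray.get(
    reindex_part.remote(["x", "y"], ["x", "y", "z"], 0.0, part))
print(result)  # new row "z" filled with 0.0
```
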
@@ -36,26 +44,46 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None,
         - scalar: Timestamp
     """
     if not isinstance(arg, DataFrame):
-        return pandas.to_datetime(arg, errors=errors, dayfirst=dayfirst,
-                                  yearfirst=yearfirst, utc=utc, box=box,
-                                  format=format, exact=exact, unit=unit,
-                                  infer_datetime_format=infer_datetime_format,
-                                  origin=origin)
+        return pandas.to_datetime(
+            arg,
+            errors=errors,
+            dayfirst=dayfirst,
+            yearfirst=yearfirst,
+            utc=utc,
+            box=box,
+            format=format,
+            exact=exact,
+            unit=unit,
+            infer_datetime_format=infer_datetime_format,
+            origin=origin)
     if errors == 'raise':
-        pandas.to_datetime(pandas.DataFrame(columns=arg.columns),
-                           errors=errors, dayfirst=dayfirst,
-                           yearfirst=yearfirst, utc=utc, box=box,
-                           format=format, exact=exact, unit=unit,
-                           infer_datetime_format=infer_datetime_format,
-                           origin=origin)
+        pandas.to_datetime(
+            pandas.DataFrame(columns=arg.columns),
+            errors=errors,
+            dayfirst=dayfirst,
+            yearfirst=yearfirst,
+            utc=utc,
+            box=box,
+            format=format,
+            exact=exact,
+            unit=unit,
+            infer_datetime_format=infer_datetime_format,
+            origin=origin)

     def datetime_helper(df, cols):
         df.columns = cols
-        return pandas.to_datetime(df, errors=errors, dayfirst=dayfirst,
-                                  yearfirst=yearfirst, utc=utc, box=box,
-                                  format=format, exact=exact, unit=unit,
-                                  infer_datetime_format=infer_datetime_format,
-                                  origin=origin)
+        return pandas.to_datetime(
+            df,
+            errors=errors,
+            dayfirst=dayfirst,
+            yearfirst=yearfirst,
+            utc=utc,
+            box=box,
+            format=format,
+            exact=exact,
+            unit=unit,
+            infer_datetime_format=infer_datetime_format,
+            origin=origin)

     datetime_series = _map_partitions(datetime_helper, arg._row_partitions,
                                       arg.columns)
diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py
index a3dd31b9982..c07f0c4802e 100644
--- a/modin/pandas/groupby.py
+++ b/modin/pandas/groupby.py
@@ -15,11 +15,13 @@
 from .utils import _inherit_docstrings, _reindex_helper, post_task_gc


-@_inherit_docstrings(pandas.core.groupby.DataFrameGroupBy,
-                     excluded=[pandas.core.groupby.DataFrameGroupBy,
-                               pandas.core.groupby.DataFrameGroupBy.__init__])
+@_inherit_docstrings(
+    pandas.core.groupby.DataFrameGroupBy,
+    excluded=[
+        pandas.core.groupby.DataFrameGroupBy,
+        pandas.core.groupby.DataFrameGroupBy.__init__
+    ])
 class DataFrameGroupBy(object):
-
     def __init__(self, df, by, axis, level, as_index, sort, group_keys,
                  squeeze, **kwargs):
@@ -70,11 +72,13 @@ def _index_grouped(self):
         if self._axis == 0:
             self._index_grouped_cache = pandas.Series(
                 np.zeros(len(self._index), dtype=np.uint8),
-                index=self._index).groupby(by=self._by, sort=self._sort)
+                index=self._index).groupby(
+                    by=self._by, sort=self._sort)
         else:
             self._index_grouped_cache = pandas.Series(
                 np.zeros(len(self._columns), dtype=np.uint8),
-                index=self._columns).groupby(by=self._by, sort=self._sort)
+                index=self._columns).groupby(
+                    by=self._by, sort=self._sort)

         return self._index_grouped_cache

@@ -101,17 +105,11 @@ def _grouped_partitions(self):
                 .groupby(by='partition')]

         if len(self._index_grouped) > 1:
-            return zip(*(groupby._submit(args=(remote_index[i],
-                                               remote_by,
-                                               self._axis,
-                                               self._level,
-                                               self._as_index,
-                                               self._sort,
-                                               self._group_keys,
-                                               self._squeeze)
-                         + tuple(part.tolist()),
-                         num_return_vals=len(
-                             self._index_grouped))
+            return zip(*(groupby._submit(
+                args=(remote_index[i], remote_by, self._axis, self._level,
+                      self._as_index, self._sort, self._group_keys,
+                      self._squeeze) + tuple(part.tolist()),
+                num_return_vals=len(self._index_grouped))
                          for i, part in enumerate(self._partitions)))
         elif self._axis == 0:
             return [self._df._col_partitions]
@@ -124,17 +122,19 @@ def _iter(self):
         if self._axis == 0:
             return ((self._keys_and_values[i][0],
-                     DataFrame(col_partitions=part,
-                               columns=self._columns,
-                               index=self._keys_and_values[i][1].index,
-                               col_metadata=self._col_metadata))
+                     DataFrame(
+                         col_partitions=part,
+                         columns=self._columns,
+                         index=self._keys_and_values[i][1].index,
+                         col_metadata=self._col_metadata))
                     for i, part in enumerate(self._grouped_partitions))
         else:
             return ((self._keys_and_values[i][0],
-                     DataFrame(row_partitions=part,
-                               columns=self._keys_and_values[i][1].index,
-                               index=self._index,
-                               row_metadata=self._row_metadata))
+                     DataFrame(
+                         row_partitions=part,
+                         columns=self._keys_and_values[i][1].index,
+                         index=self._index,
+                         row_metadata=self._row_metadata))
                     for i, part in enumerate(self._grouped_partitions))

     @property
@@ -146,8 +146,8 @@ def skew(self, **kwargs):
             lambda df: _skew_remote.remote(df, self._axis, kwargs))

     def ffill(self, limit=None):
-        return self._apply_df_function(lambda df: df.ffill(axis=self._axis,
-                                                           limit=limit))
+        return self._apply_df_function(
+            lambda df: df.ffill(axis=self._axis, limit=limit))

     def sem(self, ddof=1):
         raise NotImplementedError(
@@ -212,9 +212,8 @@ def nth(self, n, dropna=None):
             "github.com/modin-project/modin.")

     def cumsum(self, axis=0, *args, **kwargs):
-        return self._apply_df_function(lambda df: df.cumsum(axis,
-                                                            *args,
-                                                            **kwargs))
+        return self._apply_df_function(
+            lambda df: df.cumsum(axis, *args, **kwargs))

     @property
     def indices(self):
@@ -231,8 +230,7 @@ def filter(self, func, dropna=True, *args, **kwargs):
             "github.com/modin-project/modin.")

     def cummax(self, axis=0, **kwargs):
-        return self._apply_df_function(lambda df: df.cummax(axis,
-                                                            **kwargs))
+        return self._apply_df_function(lambda df: df.cummax(axis, **kwargs))

     def apply(self, func, *args, **kwargs):
         def apply_helper(df):
@@ -247,12 +245,15 @@ def apply_helper(df):
                 new_df.index = [k for k, v in self._iter]
             else:
                 new_df = concat(result, axis=self._axis)
-                new_df._block_partitions = np.array([_reindex_helper._submit(
-                    args=tuple([new_df.index, self._index, self._axis ^ 1,
-                                len(new_df._block_partitions)]
-                               + block.tolist()),
-                    num_return_vals=len(new_df._block_partitions))
-                    for block in new_df._block_partitions.T]).T
+                new_df._block_partitions = np.array([
+                    _reindex_helper._submit(
+                        args=tuple([
+                            new_df.index, self._index, self._axis ^ 1,
+                            len(new_df._block_partitions)
+                        ] + block.tolist()),
+                        num_return_vals=len(new_df._block_partitions))
+                    for block in new_df._block_partitions.T
+                ]).T
                 new_df.index = self._index
                 new_df._row_metadata = \
                     _IndexMetadata(new_df._block_partitions[:, 0],
@@ -265,12 +266,15 @@ def apply_helper(df):
                 new_df.index = self._index
             else:
                 new_df = concat(result, axis=self._axis)
-                new_df._block_partitions = np.array([_reindex_helper._submit(
-                    args=tuple([new_df.columns, self._columns, self._axis ^ 1,
-                                new_df._block_partitions.shape[1]]
-                               + block.tolist()),
-                    num_return_vals=new_df._block_partitions.shape[1])
-                    for block in new_df._block_partitions])
+                new_df._block_partitions = np.array([
+                    _reindex_helper._submit(
+                        args=tuple([
+                            new_df.columns, self._columns, self._axis ^ 1,
+                            new_df._block_partitions.shape[1]
+                        ] + block.tolist()),
+                        num_return_vals=new_df._block_partitions.shape[1])
+                    for block in new_df._block_partitions
+                ])
                 new_df.columns = self._columns
                 new_df._col_metadata = \
                     _IndexMetadata(new_df._block_partitions[0, :],
@@ -298,12 +302,12 @@ def __getitem__(self, key):
             "github.com/modin-project/modin.")

     def cummin(self, axis=0, **kwargs):
-        return self._apply_df_function(lambda df: df.cummin(axis=axis,
-                                                            **kwargs))
+        return self._apply_df_function(
+            lambda df: df.cummin(axis=axis, **kwargs))

     def bfill(self, limit=None):
-        return self._apply_df_function(lambda df: df.bfill(axis=self._axis,
-                                                           limit=limit))
+        return self._apply_df_function(
+            lambda df: df.bfill(axis=self._axis, limit=limit))

     def idxmin(self):
         raise NotImplementedError(
@@ -393,8 +397,17 @@ def describe(self, **kwargs):
             "To contribute to Pandas on Ray, please visit "
             "github.com/modin-project/modin.")

-    def boxplot(self, grouped, subplots=True, column=None, fontsize=None,
-                rot=0, grid=True, ax=None, figsize=None, layout=None, **kwds):
+    def boxplot(self,
+                grouped,
+                subplots=True,
+                column=None,
+                fontsize=None,
+                rot=0,
+                grid=True,
+                ax=None,
+                figsize=None,
+                layout=None,
+                **kwds):
         raise NotImplementedError(
             "To contribute to Pandas on Ray, please visit "
             "github.com/modin-project/modin.")
@@ -421,9 +434,8 @@ def head(self, n=5):
             "github.com/modin-project/modin.")

     def cumprod(self, axis=0, *args, **kwargs):
-        return self._apply_df_function(lambda df: df.cumprod(axis,
-                                                             *args,
-                                                             **kwargs))
+        return self._apply_df_function(
+            lambda df: df.cumprod(axis, *args, **kwargs))

     def __iter__(self):
         return self._iter.__iter__()
@@ -437,9 +449,8 @@ def cov(self):
             "github.com/modin-project/modin.")

     def transform(self, func, *args, **kwargs):
-        return self._apply_df_function(lambda df: df.transform(func,
-                                                               *args,
-                                                               **kwargs))
+        return self._apply_df_function(
+            lambda df: df.transform(func, *args, **kwargs))

     def corr(self, **kwargs):
         raise NotImplementedError(
@@ -447,8 +458,8 @@ def corr(self, **kwargs):
             "github.com/modin-project/modin.")

     def fillna(self, **kwargs):
-        return self._apply_df_function(lambda df: df.fillna(axis=self._axis,
-                                                            **kwargs))
+        return self._apply_df_function(
+            lambda df: df.fillna(axis=self._axis, **kwargs))

     def count(self, **kwargs):
         return self._apply_agg_function(
@@ -519,13 +530,17 @@ def _apply_agg_function(self, f, index=None):
         from .dataframe import DataFrame

         if self._axis == 0:
-            return DataFrame(block_partitions=blocks, columns=self._columns,
-                             index=index if index is not None
-                             else [k for k, _ in self._index_grouped])
+            return DataFrame(
+                block_partitions=blocks,
+                columns=self._columns,
+                index=index
+                if index is not None else [k for k, _ in self._index_grouped])
         else:
-            return DataFrame(block_partitions=blocks.T, index=self._index,
-                             columns=index if index is not None
-                             else [k for k, _ in self._index_grouped])
+            return DataFrame(
+                block_partitions=blocks.T,
+                index=self._index,
+                columns=index
+                if index is not None else [k for k, _ in self._index_grouped])

     def _apply_df_function(self, f, concat_axis=None):
         assert callable(f), "\'{0}\' object is not callable".format(type(f))
@@ -536,22 +551,29 @@ def _apply_df_function(self, f, concat_axis=None):
         new_df = concat(result, axis=concat_axis)

         if self._axis == 0:
-            new_df._block_partitions = np.array([_reindex_helper._submit(
-                args=tuple([new_df.index, self._index, 1,
-                            len(new_df._block_partitions)] + block.tolist()),
-                num_return_vals=len(new_df._block_partitions))
-                for block in new_df._block_partitions.T]).T
+            new_df._block_partitions = np.array([
+                _reindex_helper._submit(
+                    args=tuple([
+                        new_df.index, self._index, 1,
+                        len(new_df._block_partitions)
+                    ] + block.tolist()),
+                    num_return_vals=len(new_df._block_partitions))
+                for block in new_df._block_partitions.T
+            ]).T
            new_df.index = self._index
            new_df._row_metadata = \
                 _IndexMetadata(new_df._block_partitions[:, 0],
                                index=new_df.index, axis=0)
         else:
-            new_df._block_partitions = np.array([_reindex_helper._submit(
-                args=tuple([new_df.columns, self._columns, 0,
-                            new_df._block_partitions.shape[1]]
-                           + block.tolist()),
-                num_return_vals=new_df._block_partitions.shape[1])
-                for block in new_df._block_partitions])
+            new_df._block_partitions = np.array([
+                _reindex_helper._submit(
+                    args=tuple([
+                        new_df.columns, self._columns, 0,
+                        new_df._block_partitions.shape[1]
+                    ] + block.tolist()),
+                    num_return_vals=new_df._block_partitions.shape[1])
+                for block in new_df._block_partitions
+            ])
             new_df.columns = self._columns
             new_df._col_metadata = \
                 _IndexMetadata(new_df._block_partitions[0, :],
@@ -570,13 +592,16 @@ def groupby(index, by, axis, level, as_index, sort, group_keys, squeeze, *df):
         df.columns = index
     else:
         df.index = index

-    return [v for k, v in df.groupby(by=by,
-                                     axis=axis,
-                                     level=level,
-                                     as_index=as_index,
-                                     sort=sort,
-                                     group_keys=group_keys,
-                                     squeeze=squeeze)]
+    return [
+        v for k, v in df.groupby(
+            by=by,
+            axis=axis,
+            level=level,
+            as_index=as_index,
+            sort=sort,
+            group_keys=group_keys,
+            squeeze=squeeze)
+    ]


 @ray.remote
diff --git a/modin/pandas/index_metadata.py b/modin/pandas/index_metadata.py
index ba10a2687a8..23fd716b838 100644
--- a/modin/pandas/index_metadata.py
+++ b/modin/pandas/index_metadata.py
@@ -7,10 +7,7 @@
 import numpy as np
 import ray

-from .utils import (
-    _build_row_lengths,
-    _build_col_widths,
-    _build_coord_df)
+from .utils import (_build_row_lengths, _build_col_widths, _build_coord_df)

 from pandas.core.indexing import convert_to_index_sliceable

@@ -31,7 +28,11 @@ class _IndexMetadata(object):
     lengths. Otherwise bad things might happen!
     """

-    def __init__(self, dfs=None, index=None, axis=0, lengths_oid=None,
+    def __init__(self,
+                 dfs=None,
+                 index=None,
+                 axis=0,
+                 lengths_oid=None,
                  coord_df_oid=None):
         """Inits a IndexMetadata from Ray DataFrame partitions
@@ -144,8 +145,7 @@ def _get_index_cache(self):
         """
         if self._index_cache_validator is None:
             self._index_cache_validator = pandas.RangeIndex(len(self))
-        elif isinstance(self._index_cache_validator,
-                        ray.ObjectID):
+        elif isinstance(self._index_cache_validator, ray.ObjectID):
             self._index_cache_validator = ray.get(self._index_cache_validator)

         return self._index_cache_validator
@@ -181,8 +181,15 @@ def coords_of(self, key):
         """
         return self._coord_df.loc[key]

-    def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
-                group_keys=True, squeeze=False, **kwargs):
+    def groupby(self,
+                by=None,
+                axis=0,
+                level=None,
+                as_index=True,
+                sort=True,
+                group_keys=True,
+                squeeze=False,
+                **kwargs):
         # TODO: Find out what this does, and write a docstring
         assignments_df = self._coord_df.groupby(by=by, axis=axis, level=level,
                                                 as_index=as_index, sort=sort,
@@ -218,7 +225,10 @@ def reset_partition_coords(self, partitions=None):
                       'index_within_partition'] = np.arange(
                           sum(partition_mask)).astype(int)

-    def insert(self, key, loc=None, partition=None,
+    def insert(self,
+               key,
+               loc=None,
+               partition=None,
                index_within_partition=None):
         """Inserts a key at a certain location in the index, or a certain
         coord in a partition. Called with either `loc` or `partition` and
@@ -269,9 +279,11 @@ def insert(self, key, loc=None, partition=None,
         # pandas, because this is very annoying/unsure of efficiency
         # Create new coord entry to insert
         coord_to_insert = pandas.DataFrame(
-            {'partition': partition,
-             'index_within_partition': index_within_partition},
-            index=[key])
+            {
+                'partition': partition,
+                'index_within_partition': index_within_partition
+            },
+            index=[key])

         # Insert into cached RangeIndex, and order by new column index
         self._coord_df = _coord_df_copy.append(coord_to_insert).loc[new_index]
@@ -315,9 +327,10 @@ def copy(self):
         if self._index_cache is not None:
             index_copy = self._index_cache.copy()

-        return _IndexMetadata(index=index_copy,
-                              coord_df_oid=coord_df_copy,
-                              lengths_oid=lengths_copy)
+        return _IndexMetadata(
+            index=index_copy,
+            coord_df_oid=coord_df_copy,
+            lengths_oid=lengths_copy)

     def __getitem__(self, key):
         """Returns the coordinates (partition, index_within_partition) of the
@@ -390,8 +403,9 @@ def drop(self, labels, errors='raise'):
             new_coord_df['partition'][new_coord_df['partition'] == i] \
                 -= num_dropped

-        new_coord_df['index_within_partition'] = [i for l in self._lengths
-                                                  for i in range(l)]
+        new_coord_df['index_within_partition'] = [
+            i for l in self._lengths for i in range(l)
+        ]

         self._coord_df = new_coord_df
         return dropped
@@ -418,6 +432,5 @@ def get_partition(self, partition_id):
         return self._coord_df[self._coord_df.partition == partition_id]

     def sorted_index(self):
-        return (self._coord_df
-                .sort_values(['partition', 'index_within_partition'])
-                .index)
+        return (self._coord_df.sort_values(
+            ['partition', 'index_within_partition']).index)
diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py
index d100eed260c..bc5dd61b066 100644
--- a/modin/pandas/indexing.py
+++ b/modin/pandas/indexing.py
@@ -11,11 +11,10 @@
 import ray
 from warnings import warn

-from .utils import (_get_nan_block_id, extractor,
-                    _mask_block_partitions, writer, _blocks_to_series)
+from .utils import (_get_nan_block_id, extractor, _mask_block_partitions,
+                    writer, _blocks_to_series)
 from .index_metadata import _IndexMetadata
 from .dataframe import DataFrame
-
 """Indexing Helper Class works as follows:

     _Location_Indexer_Base provide methods framework for __getitem__
@@ -33,16 +32,20 @@
 """


-def is_slice(x): return isinstance(x, slice)
+def is_slice(x):
+    return isinstance(x, slice)


-def is_2d(x): return is_list_like(x) or is_slice(x)
+def is_2d(x):
+    return is_list_like(x) or is_slice(x)


-def is_tuple(x): return isinstance(x, tuple)
+def is_tuple(x):
+    return isinstance(x, tuple)


-def is_boolean_array(x): return is_list_like(x) and all(map(is_bool, x))
+def is_boolean_array(x):
+    return is_list_like(x) and all(map(is_bool, x))


 def is_integer_slice(x):
@@ -241,10 +244,9 @@ def _broadcast_item(self, item, to_shape):
             return np.broadcast_to(item, to_shape)
         except ValueError:
             from_shape = np.array(item).shape
-            raise ValueError(
-                "could not broadcast input array from \
+            raise ValueError("could not broadcast input array from \
                 shape {from_shape} into shape {to_shape}".format(
-                    from_shape=from_shape, to_shape=to_shape))
+                from_shape=from_shape, to_shape=to_shape))

     def _write_items(self, row_lookup, col_lookup, item):
         """Perform remote write and replace blocks.
@@ -299,8 +301,7 @@ def __setitem__(self, key, item):
         row_loc, col_loc, _ = _parse_tuple(key)
         self._handle_enlargement(row_loc, col_loc)
         row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
-        super(_Loc_Indexer, self).__setitem__(row_lookup, col_lookup,
-                                              item)
+        super(_Loc_Indexer, self).__setitem__(row_lookup, col_lookup, item)

     def _handle_enlargement(self, row_loc, col_loc):
         """Handle Enlargement (if there is one).
@@ -353,11 +354,13 @@ def _enlarge_axis(self, locator, axis):
             [self.block_oids, nan_blks], axis=0 if row_based_bool else 1)

         # 3. Prepare metadata to return
-        nan_coord_df = pandas.DataFrame(data=[{
-            '': name,
-            'partition': blk_part_n_row if row_based_bool else blk_part_n_col,
-            'index_within_partition': i
-        } for name, i in zip(nan_labels, np.arange(num_nan_labels))
+        nan_coord_df = pandas.DataFrame(data=[
+            {
+                '': name,
+                'partition': blk_part_n_row
+                if row_based_bool else blk_part_n_col,
+                'index_within_partition': i
+            } for name, i in zip(nan_labels, np.arange(num_nan_labels))
         ]).set_index('')

         coord_df = pandas.concat([major_meta._coord_df, nan_coord_df])
@@ -433,8 +436,7 @@ def __setitem__(self, key, item):
             self._check_dtypes(col_loc)

         row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
-        super(_iLoc_Indexer, self).__setitem__(
-            row_lookup, col_lookup, item)
+        super(_iLoc_Indexer, self).__setitem__(row_lookup, col_lookup, item)

     def _compute_lookup(self, row_loc, col_loc):
         # We use reindex for list to avoid duplicates.
diff --git a/modin/pandas/io.py b/modin/pandas/io.py
index 4d18d5c2a45..97406892af3 100644
--- a/modin/pandas/io.py
+++ b/modin/pandas/io.py
@@ -118,14 +118,14 @@ def _read_csv_from_file(filepath, npartitions, kwargs={}):
         DataFrame or Series constructed from CSV file.
     """
     empty_pd_df = pandas.read_csv(
-            filepath, **dict(kwargs, nrows=0, skipfooter=0, skip_footer=0))
+        filepath, **dict(kwargs, nrows=0, skipfooter=0, skip_footer=0))
     names = empty_pd_df.columns

     skipfooter = kwargs["skipfooter"]
     skip_footer = kwargs["skip_footer"]

     partition_kwargs = dict(
-            kwargs, header=None, names=names, skipfooter=0, skip_footer=0)
+        kwargs, header=None, names=names, skipfooter=0, skip_footer=0)
     with open(filepath, "rb") as f:
         # Get the BOM if necessary
         prefix = b""
@@ -148,16 +148,16 @@ def _read_csv_from_file(filepath, npartitions, kwargs={}):
         while f.tell() < total_bytes:
             start = f.tell()
             f.seek(chunk_size, os.SEEK_CUR)
-            f.readline()    # Read a whole number of lines
+            f.readline()  # Read a whole number of lines

             if f.tell() >= total_bytes:
                 kwargs["skipfooter"] = skipfooter
                 kwargs["skip_footer"] = skip_footer

             partition_id, index_id = _read_csv_with_offset._submit(
-                    args=(filepath, start, f.tell(), partition_kwargs_id,
-                          prefix_id),
-                    num_return_vals=2)
+                args=(filepath, start, f.tell(), partition_kwargs_id,
+                      prefix_id),
+                num_return_vals=2)
             partition_ids.append(partition_id)
             index_ids.append(index_id)
diff --git a/modin/pandas/pandas_code_gen.py b/modin/pandas/pandas_code_gen.py
index 8a122e5300d..e5fdf91d573 100644
--- a/modin/pandas/pandas_code_gen.py
+++ b/modin/pandas/pandas_code_gen.py
@@ -24,9 +24,8 @@ def code_gen(pandas_obj, ray_obj, path):
             # let's not mess with these
             continue
         try:
-            outfile.write(
-                "\ndef " + func +
-                str(inspect.signature(getattr(pandas_obj, func))) + ":\n")
+            outfile.write("\ndef " + func + str(
+                inspect.signature(getattr(pandas_obj, func))) + ":\n")
         except TypeError:
             outfile.write("\n@property")
@@ -48,10 +47,10 @@ def code_gen_test(ray_obj, path, name):
             continue

         outfile.write("\n\ndef test_" + func + "():\n")
-        outfile.write(
-            "    ray_" + name + " = create_test_" + name + "()\n\n" +
-            "    with pytest.raises(NotImplementedError):\n" +
-            "        ray_" + name + "." + func)
+        outfile.write("    ray_" + name + " = create_test_" + name +
+                      "()\n\n" +
+                      "    with pytest.raises(NotImplementedError):\n" +
+                      "        ray_" + name + "." + func)

         try:
             first = True
             param_num = \
@@ -87,8 +86,8 @@ def pandas_ray_diff(pandas_obj, ray_obj):
     pandas_funcs = dir(pandas_obj)
     ray_funcs = dir(ray_obj)

-    pandas_funcs = set(filter(lambda f: f[0] != "_" or f[1] == "_",
-                              pandas_funcs))
+    pandas_funcs = set(
+        filter(lambda f: f[0] != "_" or f[1] == "_", pandas_funcs))

     diff = [x for x in pandas_funcs if x not in set(ray_funcs)]
     return diff
diff --git a/modin/pandas/reshape.py b/modin/pandas/reshape.py
index a3e14643bbf..2fbd0d2f418 100644
--- a/modin/pandas/reshape.py
+++ b/modin/pandas/reshape.py
@@ -14,8 +14,13 @@
 from .utils import _deploy_func


-def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
-                columns=None, sparse=False, drop_first=False):
+def get_dummies(data,
+                prefix=None,
+                prefix_sep='_',
+                dummy_na=False,
+                columns=None,
+                sparse=False,
+                drop_first=False):
     """Convert categorical variable into indicator variables.

     Args:
@@ -32,9 +37,14 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
         DataFrame or one-hot encoded data.
     """
     if not isinstance(data, DataFrame):
-        return pandas.get_dummies(data, prefix=prefix, prefix_sep=prefix_sep,
-                                  dummy_na=dummy_na, columns=columns,
-                                  sparse=sparse, drop_first=drop_first)
+        return pandas.get_dummies(
+            data,
+            prefix=prefix,
+            prefix_sep=prefix_sep,
+            dummy_na=dummy_na,
+            columns=columns,
+            sparse=sparse,
+            drop_first=drop_first)

     if sparse:
         raise NotImplementedError(
@@ -54,8 +64,10 @@ def check_len(item, name):

         if is_list_like(item):
             if not len(item) == len(columns_to_encode):
-                len_msg = len_msg.format(name=name, len_item=len(item),
-                                         len_enc=len(columns_to_encode))
+                len_msg = len_msg.format(
+                    name=name,
+                    len_item=len(item),
+                    len_enc=len(columns_to_encode))
                 raise ValueError(len_msg)

     check_len(prefix, 'prefix')
@@ -89,9 +101,14 @@ def get_dummies_remote(df, to_drop, prefix, prefix_sep):
         if df.size == 0:
             return df, df.columns

-        df = pandas.get_dummies(df, prefix=prefix, prefix_sep=prefix_sep,
-                                dummy_na=dummy_na, columns=None, sparse=sparse,
-                                drop_first=drop_first)
+        df = pandas.get_dummies(
+            df,
+            prefix=prefix,
+            prefix_sep=prefix_sep,
+            dummy_na=dummy_na,
+            columns=None,
+            sparse=sparse,
+            drop_first=drop_first)
         columns = df.columns
         df.columns = pandas.RangeIndex(0, len(df.columns))
         return df, columns
@@ -120,6 +137,5 @@ def get_dummies_remote(df, to_drop, prefix, prefix_sep):
         columns = ray.get(columns)
         dropped_columns = dropped_columns.append(columns)

-    return DataFrame(col_partitions=with_dummies,
-                     columns=dropped_columns,
-                     index=data.index)
+    return DataFrame(
+        col_partitions=with_dummies, columns=dropped_columns, index=data.index)
diff --git a/modin/pandas/series.py b/modin/pandas/series.py
index bc663e9a77c..30fba343995 100644
--- a/modin/pandas/series.py
+++ b/modin/pandas/series.py
@@ -15,10 +15,9 @@ def na_op():
     raise NotImplementedError("Not Yet implemented.")


-@_inherit_docstrings(pandas.Series, excluded=[pandas.Series,
-                                              pandas.Series.__init__])
+@_inherit_docstrings(
+    pandas.Series, excluded=[pandas.Series, pandas.Series.__init__])
 class Series(object):
-
     def __init__(self, series_oids):
         """Constructor for a Series object.
@@ -59,8 +58,13 @@ def __bool__(self):
     def __bytes__(self):
         raise NotImplementedError("Not Yet implemented.")

-    def __class__(self, data=None, index=None, dtype=None, name=None,
-                  copy=False, fastpath=False):
+    def __class__(self,
+                  data=None,
+                  index=None,
+                  dtype=None,
+                  name=None,
+                  copy=False,
+                  fastpath=False):
         raise NotImplementedError("Not Yet implemented.")

     def __contains__(self, key):
@@ -214,8 +218,16 @@ def agg(self, func, axis=0, *args, **kwargs):
     def aggregate(self, func, axis=0, *args, **kwargs):
         raise NotImplementedError("Not Yet implemented.")

-    def align(self, other, join='outer', axis=None, level=None, copy=True,
-              fill_value=None, method=None, limit=None, fill_axis=0,
+    def align(self,
+              other,
+              join='outer',
+              axis=None,
+              level=None,
+              copy=True,
+              fill_value=None,
+              method=None,
+              limit=None,
+              fill_axis=0,
               broadcast_axis=None):
         raise NotImplementedError("Not Yet implemented.")

@@ -248,7 +260,11 @@ def as_blocks(self, copy=True):
     def as_matrix(self, columns=None):
         raise NotImplementedError("Not Yet implemented.")

-    def asfreq(self, freq, method=None, how=None, normalize=False,
+    def asfreq(self,
+               freq,
+               method=None,
+               how=None,
+               normalize=False,
                fill_value=None):
         raise NotImplementedError("Not Yet implemented.")

@@ -270,7 +286,10 @@ def autocorr(self, lag=1):
     def between(self, left, right, inclusive=True):
         raise NotImplementedError("Not Yet implemented.")

-    def between_time(self, start_time, end_time, include_start=True,
+    def between_time(self,
+                     start_time,
+                     end_time,
+                     include_start=True,
                      include_end=True):
         raise NotImplementedError("Not Yet implemented.")

@@ -304,8 +323,11 @@ def compress(self, condition, *args, **kwargs):
     def consolidate(self, inplace=False):
         raise NotImplementedError("Not Yet implemented.")

-    def convert_objects(self, convert_dates=True, convert_numeric=False,
-                        convert_timedeltas=True, copy=True):
+    def convert_objects(self,
+                        convert_dates=True,
+                        convert_numeric=False,
+                        convert_timedeltas=True,
+                        copy=True):
         raise NotImplementedError("Not Yet implemented.")

     def copy(self, deep=True):
@@ -365,8 +387,16 @@ def eq(self, other, level=None, fill_value=None, axis=0):
     def equals(self, other):
         raise NotImplementedError("Not Yet implemented.")

-    def ewm(self, com=None, span=None, halflife=None, alpha=None,
-            min_periods=0, freq=None, adjust=True, ignore_na=False, axis=0):
+    def ewm(self,
+            com=None,
+            span=None,
+            halflife=None,
+            alpha=None,
+            min_periods=0,
+            freq=None,
+            adjust=True,
+            ignore_na=False,
+            axis=0):
         raise NotImplementedError("Not Yet implemented.")

     def expanding(self, min_periods=1, freq=None, center=False, axis=0):
@@ -378,8 +408,14 @@ def factorize(self, sort=False, na_sentinel=-1):
     def ffill(self, axis=None, inplace=False, limit=None, downcast=None):
         raise NotImplementedError("Not Yet implemented.")

-    def fillna(self, value=None, method=None, axis=None, inplace=False,
-               limit=None, downcast=None, **kwargs):
+    def fillna(self,
+               value=None,
+               method=None,
+               axis=None,
+               inplace=False,
+               limit=None,
+               downcast=None,
+               **kwargs):
         raise NotImplementedError("Not Yet implemented.")

     def filter(self, items=None, like=None, regex=None, axis=None):
@@ -394,12 +430,23 @@ def first_valid_index(self):
     def floordiv(self, other, level=None, fill_value=None, axis=0):
         raise NotImplementedError("Not Yet implemented.")

-    def from_array(self, arr, index=None, name=None, dtype=None, copy=False,
+    def from_array(self,
+                   arr,
+                   index=None,
+                   name=None,
+                   dtype=None,
+                   copy=False,
                    fastpath=False):
         raise NotImplementedError("Not Yet implemented.")

-    def from_csv(self, path, sep=',', parse_dates=True, header=None,
-                 index_col=0, encoding=None, infer_datetime_format=False):
+    def from_csv(self,
+                 path,
+                 sep=',',
+                 parse_dates=True,
+                 header=None,
+                 index_col=0,
+                 encoding=None,
+                 infer_datetime_format=False):
         raise NotImplementedError("Not Yet implemented.")

     def ge(self, other, level=None, fill_value=None, axis=0):
@@ -420,8 +467,15 @@ def get_value(self, label, takeable=False):
     def get_values(self):
         raise NotImplementedError("Not Yet implemented.")

-    def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
-                group_keys=True, squeeze=False, **kwargs):
+    def groupby(self,
+                by=None,
+                axis=0,
+                level=None,
+                as_index=True,
+                sort=True,
+                group_keys=True,
+                squeeze=False,
+                **kwargs):
         raise NotImplementedError("Not Yet implemented.")

     def gt(self, other, level=None, fill_value=None, axis=0):
@@ -430,8 +484,17 @@ def gt(self, other, level=None, fill_value=None, axis=0):
     def head(self, n=5):
         raise NotImplementedError("Not Yet implemented.")

-    def hist(self, by=None, ax=None, grid=True, xlabelsize=None, xrot=None,
-             ylabelsize=None, yrot=None, figsize=None, bins=10, **kwds):
+    def hist(self,
+             by=None,
+             ax=None,
+             grid=True,
+             xlabelsize=None,
+             xrot=None,
+             ylabelsize=None,
+             yrot=None,
+             figsize=None,
+             bins=10,
+             **kwds):
         raise NotImplementedError("Not Yet implemented.")

     def iat(self, axis=None):
@@ -446,8 +509,14 @@ def idxmin(self, axis=None, skipna=True, *args, **kwargs):
     def iloc(self, axis=None):
         raise NotImplementedError("Not Yet implemented.")

-    def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
-                    limit_direction='forward', downcast=None, **kwargs):
+    def interpolate(self,
+                    method='linear',
+                    axis=0,
+                    limit=None,
+                    inplace=False,
+                    limit_direction='forward',
+                    downcast=None,
+                    **kwargs):
         raise NotImplementedError("Not Yet implemented.")

     def isin(self, values):
@@ -471,11 +540,19 @@ def ix(self, axis=None):
     def keys(self):
         raise NotImplementedError("Not Yet implemented.")

-    def kurt(self, axis=None, skipna=None, level=None, numeric_only=None,
+    def kurt(self,
+             axis=None,
+             skipna=None,
+             level=None,
+             numeric_only=None,
              **kwargs):
         raise NotImplementedError("Not Yet implemented.")

-    def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None,
+    def kurtosis(self,
+                 axis=None,
+                 skipna=None,
+                 level=None,
+                 numeric_only=None,
                  **kwargs):
         raise NotImplementedError("Not Yet implemented.")

@@ -500,26 +577,48 @@ def mad(self, axis=None, skipna=None, level=None):
     def map(self, arg, na_action=None):
         raise NotImplementedError("Not Yet implemented.")

-    def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None,
-             try_cast=False, raise_on_error=True):
+    def mask(self,
+             cond,
+             other=np.nan,
+             inplace=False,
+             axis=None,
+             level=None,
+             try_cast=False,
+             raise_on_error=True):
         raise NotImplementedError("Not Yet implemented.")

-    def max(self, axis=None, skipna=None, level=None, numeric_only=None,
+    def max(self,
+            axis=None,
+            skipna=None,
+            level=None,
+            numeric_only=None,
             **kwargs):
         raise NotImplementedError("Not Yet implemented.")

-    def mean(self, axis=None, skipna=None, level=None, numeric_only=None,
+    def mean(self,
+             axis=None,
+             skipna=None,
+             level=None,
+             numeric_only=None,
              **kwargs):
         raise NotImplementedError("Not Yet implemented.")

-    def median(self, axis=None, skipna=None, level=None, numeric_only=None,
+    def median(self,
+               axis=None,
+               skipna=None,
+               level=None,
+               numeric_only=None,
                **kwargs):
         raise NotImplementedError("Not Yet implemented.")

     def memory_usage(self, index=True, deep=False):
         raise NotImplementedError("Not Yet implemented.")

-    def min(self, axis=None, skipna=None, level=None, numeric_only=None,
+    def min(self,
+            axis=None,
+            skipna=None,
+            level=None,
+            numeric_only=None,
             **kwargs):
         raise NotImplementedError("Not Yet implemented.")

@@ -553,18 +652,42 @@ def nsmallest(self, n=5, keep='first'):
     def nunique(self, dropna=True):
         raise NotImplementedError("Not Yet implemented.")

-    def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
+    def pct_change(self,
+                   periods=1,
+                   fill_method='pad',
+                   limit=None,
+                   freq=None,
                    **kwargs):
         raise NotImplementedError("Not Yet implemented.")

     def pipe(self, func, *args, **kwargs):
         raise NotImplementedError("Not Yet implemented.")

-    def plot(self, kind='line', ax=None, figsize=None, use_index=True,
-             title=None, grid=None, legend=False, style=None, logx=False,
-             logy=False, loglog=False, xticks=None, yticks=None, xlim=None,
-             ylim=None, rot=None, fontsize=None, colormap=None, table=False,
-             yerr=None, xerr=None, label=None, secondary_y=False, **kwds):
+    def plot(self,
+             kind='line',
+             ax=None,
+             figsize=None,
+             use_index=True,
+             title=None,
+             grid=None,
+             legend=False,
+             style=None,
+             logx=False,
+             logy=False,
+             loglog=False,
+             xticks=None,
+             yticks=None,
+             xlim=None,
+             ylim=None,
+             rot=None,
+             fontsize=None,
+             colormap=None,
+             table=False,
+             yerr=None,
+             xerr=None,
+             label=None,
+             secondary_y=False,
+             **kwds):
         raise NotImplementedError("Not Yet implemented.")

     def pop(self, item):
@@ -573,15 +696,27 @@ def pop(self, item):
     def pow(self, other, level=None, fill_value=None, axis=0):
         raise NotImplementedError("Not Yet implemented.")

-    def prod(self, axis=None, skipna=None, level=None, numeric_only=None,
+    def prod(self,
+             axis=None,
+             skipna=None,
+             level=None,
+             numeric_only=None,
              **kwargs):
         raise NotImplementedError("Not Yet implemented.")

-    def product(self, axis=None, skipna=None, level=None, numeric_only=None,
+    def product(self,
+                axis=None,
+                skipna=None,
+                level=None,
+                numeric_only=None,
                 **kwargs):
         raise NotImplementedError("Not Yet implemented.")

-    def ptp(self, axis=None, skipna=None, level=None, numeric_only=None,
+    def ptp(self,
+            axis=None,
+            skipna=None,
+            level=None,
+            numeric_only=None,
             **kwargs):
         raise NotImplementedError("Not Yet implemented.")

@@ -594,8 +729,13 @@ def quantile(self, q=0.5, interpolation='linear'):
     def radd(self, other, level=None, fill_value=None, axis=0):
         raise NotImplementedError("Not Yet implemented.")

-    def rank(self, axis=0, method='average', numeric_only=None,
-             na_option='keep', ascending=True, pct=False):
+    def rank(self,
+             axis=0,
+             method='average',
+             numeric_only=None,
+             na_option='keep',
+             ascending=True,
+             pct=False):
         raise NotImplementedError("Not Yet implemented.")

     def ravel(self, order='C'):
@@ -610,7 +750,11 @@ def reindex(self, index=None, **kwargs):
     def reindex_axis(self, labels, axis=0, **kwargs):
         raise NotImplementedError("Not Yet implemented.")

-    def reindex_like(self, other, method=None, copy=True, limit=None,
+    def reindex_like(self,
+                     other,
+                     method=None,
+                     copy=True,
+                     limit=None,
                      tolerance=None):
         raise NotImplementedError("Not Yet implemented.")

@@ -626,13 +770,30 @@ def reorder_levels(self, order):
     def repeat(self, repeats, *args, **kwargs):
         raise NotImplementedError("Not Yet implemented.")

-    def replace(self, to_replace=None, value=None, inplace=False, limit=None,
-                regex=False, method='pad', axis=None):
-        raise NotImplementedError("Not Yet implemented.")
-
-    def resample(self, rule, how=None, axis=0, fill_method=None, closed=None,
-                 label=None, convention='start', kind=None, loffset=None,
-                 limit=None, base=0, on=None, level=None):
+    def replace(self,
+                to_replace=None,
+                value=None,
+                inplace=False,
+                limit=None,
+                regex=False,
+                method='pad',
+                axis=None):
+        raise NotImplementedError("Not Yet implemented.")
+
+    def resample(self,
+                 rule,
+                 how=None,
+                 axis=0,
+                 fill_method=None,
+                 closed=None,
+                 label=None,
+                 convention='start',
+                 kind=None,
+                 loffset=None,
+                 limit=None,
+                 base=0,
+                 on=None,
+                 level=None):
         raise NotImplementedError("Not Yet implemented.")

     def reset_index(self, level=None, drop=False, name=None, inplace=False):
@@ -650,8 +811,15 @@ def rmod(self, other, level=None, fill_value=None, axis=0):
     def rmul(self, other, level=None, fill_value=None, axis=0):
         raise NotImplementedError("Not Yet implemented.")

-    def rolling(self, window, min_periods=None, freq=None, center=False,
-                win_type=None, on=None, axis=0, closed=None):
+    def rolling(self,
+                window,
+                min_periods=None,
+                freq=None,
+                center=False,
+                win_type=None,
+                on=None,
+                axis=0,
+                closed=None):
         raise NotImplementedError("Not Yet implemented.")

     def round(self, decimals=0, *args, **kwargs):
@@ -666,8 +834,13 @@ def rsub(self, other, level=None, fill_value=None, axis=0):
     def rtruediv(self, other, level=None, fill_value=None, axis=0):
         raise NotImplementedError("Not Yet implemented.")

-    def sample(self, n=None, frac=None, replace=False, weights=None,
-               random_state=None, axis=None):
+    def sample(self,
+               n=None,
+               frac=None,
+               replace=False,
+               weights=None,
+               random_state=None,
+               axis=None):
         raise NotImplementedError("Not Yet implemented.")

     def searchsorted(self, value, side='left', sorter=None):
@@ -676,8 +849,13 @@ def searchsorted(self, value, side='left', sorter=None):
     def select(self, crit, axis=0):
         raise NotImplementedError("Not Yet implemented.")

-    def sem(self, axis=None, skipna=None, level=None, ddof=1,
-            numeric_only=None, **kwargs):
+    def sem(self,
+            axis=None,
+            skipna=None,
+            level=None,
+            ddof=1,
+            numeric_only=None,
+            **kwargs):
         raise NotImplementedError("Not Yet implemented.")

     def set_axis(self, axis, labels):
@@ -689,19 +867,33 @@ def set_value(self, label, value, takeable=False):
     def shift(self, periods=1, freq=None, axis=0):
         raise NotImplementedError("Not Yet implemented.")

-    def skew(self, axis=None, skipna=None, level=None, numeric_only=None,
+    def skew(self,
+             axis=None,
+             skipna=None,
+             level=None,
+             numeric_only=None,
              **kwargs):
         raise NotImplementedError("Not Yet implemented.")

     def slice_shift(self, periods=1, axis=0):
         raise NotImplementedError("Not Yet implemented.")

-    def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
-                   kind='quicksort', na_position='last', sort_remaining=True):
+    def sort_index(self,
+                   axis=0,
+                   level=None,
+                   ascending=True,
+                   inplace=False,
+                   kind='quicksort',
+                   na_position='last',
+                   sort_remaining=True):
         raise NotImplementedError("Not Yet implemented.")

-    def sort_values(self, axis=0, ascending=True, inplace=False,
-                    kind='quicksort', na_position='last'):
+    def sort_values(self,
+                    axis=0,
+                    ascending=True,
+                    inplace=False,
+                    kind='quicksort',
+                    na_position='last'):
         raise NotImplementedError("Not Yet implemented.")

     def sortlevel(self, level=0, ascending=True, sort_remaining=True):
@@ -710,8 +902,13 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True):
     def squeeze(self, axis=None):
         raise NotImplementedError("Not Yet implemented.")

-    def std(self, axis=None, skipna=None, level=None, ddof=1,
-            numeric_only=None, **kwargs):
+    def std(self,
+            axis=None,
+            skipna=None,
+            level=None,
+            ddof=1,
+            numeric_only=None,
+            **kwargs):
         raise NotImplementedError("Not Yet implemented.")

     def sub(self, other, level=None, fill_value=None, axis=0):
@@ -720,7 +917,11 @@ def sub(self, other, level=None, fill_value=None, axis=0):
     def subtract(self, other, level=None, fill_value=None, axis=0):
         raise NotImplementedError("Not Yet implemented.")

-    def sum(self, axis=None, skipna=None, level=None, numeric_only=None,
+    def sum(self,
+            axis=None,
+            skipna=None,
+            level=None,
+            numeric_only=None,
             **kwargs):
         raise NotImplementedError("Not Yet implemented.")

@@ -739,9 +940,18 @@ def take(self, indices, axis=0, convert=True, is_copy=False, **kwargs):
     def to_clipboard(self, excel=None, sep=None, **kwargs):
         raise NotImplementedError("Not Yet implemented.")

-    def to_csv(self, path=None, index=True, sep=',', na_rep='',
-               float_format=None, header=False, index_label=None, mode='w',
-               encoding=None, date_format=None, decimal='.'):
+    def to_csv(self,
+               path=None,
+               index=True,
+               sep=',',
+               na_rep='',
+               float_format=None,
+               header=False,
+               index_label=None,
+               mode='w',
+               encoding=None,
+               date_format=None,
+               decimal='.'):
         raise NotImplementedError("Not Yet implemented.")

     def to_dense(self):
@@ -750,10 +960,21 @@ def to_dense(self):
     def to_dict(self):
         raise NotImplementedError("Not Yet implemented.")

-    def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
-                 float_format=None, columns=None, header=True, index=True,
-                 index_label=None, startrow=0, startcol=0, engine=None,
-                 merge_cells=True, encoding=None, inf_rep='inf',
+    def to_excel(self,
+                 excel_writer,
+                 sheet_name='Sheet1',
+                 na_rep='',
+                 float_format=None,
+                 columns=None,
+                 header=True,
+                 index=True,
+                 index_label=None,
+                 startrow=0,
+                 startcol=0,
+                 engine=None,
+                 merge_cells=True,
+                 encoding=None,
+                 inf_rep='inf',
                  verbose=True):
         raise NotImplementedError("Not Yet implemented.")

@@ -763,17 +984,37 @@ def to_frame(self, name=None):
     def to_hdf(self, path_or_buf, key, **kwargs):
         raise NotImplementedError("Not Yet implemented.")

-    def to_json(self, path_or_buf=None, orient=None, date_format=None,
-                double_precision=10, force_ascii=True, date_unit='ms',
-                default_handler=None, lines=False):
-        raise NotImplementedError("Not Yet implemented.")
-
-    def to_latex(self, buf=None, columns=None, col_space=None, header=True,
-                 index=True, na_rep='NaN', formatters=None, float_format=None,
-                 sparsify=None, index_names=True, bold_rows=False,
-                 column_format=None, longtable=None, escape=None,
-                 encoding=None, decimal='.', multicolumn=None,
-                 multicolumn_format=None, multirow=None):
+    def to_json(self,
+                path_or_buf=None,
+                orient=None,
+                date_format=None,
+                double_precision=10,
+                force_ascii=True,
+                date_unit='ms',
+                default_handler=None,
+                lines=False):
+        raise NotImplementedError("Not Yet implemented.")
+
+    def to_latex(self,
+                 buf=None,
+                 columns=None,
+                 col_space=None,
+                 header=True,
+                 index=True,
+                 na_rep='NaN',
+                 formatters=None,
+                 float_format=None,
+                 sparsify=None,
+                 index_names=True,
+                 bold_rows=False,
+                 column_format=None,
+                 longtable=None,
+                 escape=None,
+                 encoding=None,
+                 decimal='.',
+                 multicolumn=None,
+                 multicolumn_format=None,
+                 multirow=None):
         raise NotImplementedError("Not Yet implemented.")

     def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
@@ -788,13 +1029,28 @@ def to_pickle(self, path, compression='infer'):
     def to_sparse(self, kind='block', fill_value=None):
         raise NotImplementedError("Not Yet implemented.")

-    def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
-               index=True, index_label=None, chunksize=None, dtype=None):
+    def to_sql(self,
+               name,
+               con,
+               flavor=None,
+               schema=None,
+               if_exists='fail',
+               index=True,
+               index_label=None,
+               chunksize=None,
+               dtype=None):
         raise NotImplementedError("Not Yet implemented.")

-    def to_string(self, buf=None, na_rep='NaN', float_format=None,
-                  header=True, index=True, length=False, dtype=False,
-                  name=False, max_rows=None):
+    def to_string(self,
+                  buf=None,
+                  na_rep='NaN',
+                  float_format=None,
+                  header=True,
+                  index=True,
+                  length=False,
+                  dtype=False,
+                  name=False,
+                  max_rows=None):
         raise NotImplementedError("Not Yet implemented.")

     def to_timestamp(self, freq=None, how='start', copy=True):
@@ -840,19 +1096,34 @@ def upandasate(self, other):
     def valid(self, inplace=False, **kwargs):
         raise NotImplementedError("Not Yet implemented.")

-    def value_counts(self, normalize=False, sort=True, ascending=False,
-                     bins=None, dropna=True):
+    def value_counts(self,
+                     normalize=False,
+                     sort=True,
+                     ascending=False,
+                     bins=None,
+                     dropna=True):
         raise NotImplementedError("Not Yet implemented.")

-    def var(self, axis=None, skipna=None, level=None, ddof=1,
-            numeric_only=None, **kwargs):
+    def var(self,
+            axis=None,
+            skipna=None,
+            level=None,
+            ddof=1,
+            numeric_only=None,
+            **kwargs):
         raise NotImplementedError("Not Yet implemented.")

     def view(self, dtype=None):
         raise NotImplementedError("Not Yet implemented.")

-    def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
-              try_cast=False, raise_on_error=True):
+    def where(self,
+              cond,
+              other=np.nan,
+              inplace=False,
+              axis=None,
+              level=None,
+              try_cast=False,
+              raise_on_error=True):
         raise NotImplementedError("Not Yet implemented.")

     def xs(key, axis=0, level=None, drop_level=True):
diff --git a/modin/pandas/test/test_concat.py b/modin/pandas/test/test_concat.py
index 030b9cfd935..33d08ed43d9 100644
--- a/modin/pandas/test/test_concat.py
+++ b/modin/pandas/test/test_concat.py
@@ -5,10 +5,7 @@
 import pytest
 import pandas
 import modin.pandas as pd
-from modin.pandas.utils import (
-    to_pandas,
-    from_pandas
-)
+from modin.pandas.utils import (to_pandas, from_pandas)


 @pytest.fixture
@@ -18,33 +15,41 @@ def ray_df_equals_pandas(ray_df, pandas_df):

 @pytest.fixture
 def generate_dfs():
-    df = pandas.DataFrame({'col1': [0, 1, 2, 3],
-                           'col2': [4, 5, 6, 7],
-                           'col3': [8, 9, 10, 11],
-                           'col4': [12, 13, 14, 15],
-                           'col5': [0, 0, 0, 0]})
-
-    df2 = pandas.DataFrame({'col1': [0, 1, 2, 3],
-                            'col2': [4, 5, 6, 7],
-                            'col3': [8, 9, 10, 11],
-                            'col6': [12, 13, 14, 15],
-                            'col7': [0, 0, 0, 0]})
+    df = pandas.DataFrame({
+        'col1': [0, 1, 2, 3],
+        'col2': [4, 5, 6, 7],
+        'col3': [8, 9, 10, 11],
+        'col4': [12, 13, 14, 15],
+        'col5': [0, 0, 0, 0]
+    })
+
+    df2 = pandas.DataFrame({
+        'col1': [0, 1, 2, 3],
+        'col2': [4, 5, 6, 7],
+        'col3': [8, 9, 10, 11],
+        'col6': [12, 13, 14, 15],
+        'col7': [0, 0, 0, 0]
+    })
     return df, df2


 @pytest.fixture
 def generate_none_dfs():
-    df = pandas.DataFrame({'col1': [0, 1, 2, 3],
-                           'col2': [4, 5, None, 7],
-                           'col3': [8, 9, 10, 11],
-                           'col4': [12, 13, 14, 15],
-                           'col5': [None, None, None, None]})
-
-    df2 = pandas.DataFrame({'col1': [0, 1, 2, 3],
-                            'col2': [4, 5, 6, 7],
-                            'col3': [8, 9, 10, 11],
-                            'col6': [12, 13, 14, 15],
-                            'col7': [0, 0, 0, 0]})
+    df = pandas.DataFrame({
+        'col1': [0, 1, 2, 3],
+        'col2': [4, 5, None, 7],
+        'col3': [8, 9, 10, 11],
+        'col4': [12, 13, 14, 15],
+        'col5': [None, None, None, None]
+    })
+
+    df2 = pandas.DataFrame({
+        'col1': [0, 1, 2, 3],
+        'col2': [4, 5, 6, 7],
+        'col3': [8, 9, 10, 11],
+        'col6': [12, 13, 14, 15],
+        'col7': [0, 0, 0, 0]
+ }) return df, df2 @@ -52,41 +57,44 @@ def generate_none_dfs(): def test_df_concat(): df, df2 = generate_dfs() - assert(ray_df_equals_pandas(pd.concat([df, df2]), - pandas.concat([df, df2]))) + assert (ray_df_equals_pandas( + pd.concat([df, df2]), pandas.concat([df, df2]))) def test_ray_concat(): df, df2 = generate_dfs() ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2) - assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2]), - pandas.concat([df, df2])) + assert ray_df_equals_pandas( + pd.concat([ray_df, ray_df2]), pandas.concat([df, df2])) def test_ray_concat_on_index(): df, df2 = generate_dfs() ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2) - assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2], axis='index'), - pandas.concat([df, df2], axis='index')) + assert ray_df_equals_pandas( + pd.concat([ray_df, ray_df2], axis='index'), + pandas.concat([df, df2], axis='index')) - assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2], axis='rows'), - pandas.concat([df, df2], axis='rows')) + assert ray_df_equals_pandas( + pd.concat([ray_df, ray_df2], axis='rows'), + pandas.concat([df, df2], axis='rows')) - assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2], axis=0), - pandas.concat([df, df2], axis=0)) + assert ray_df_equals_pandas( + pd.concat([ray_df, ray_df2], axis=0), pandas.concat([df, df2], axis=0)) def test_ray_concat_on_column(): df, df2 = generate_dfs() ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2) - assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2], axis=1), - pandas.concat([df, df2], axis=1)) + assert ray_df_equals_pandas( + pd.concat([ray_df, ray_df2], axis=1), pandas.concat([df, df2], axis=1)) - assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2], axis="columns"), - pandas.concat([df, df2], axis="columns")) + assert ray_df_equals_pandas( + pd.concat([ray_df, ray_df2], axis="columns"), + pandas.concat([df, df2], axis="columns")) def test_invalid_axis_errors(): @@ -103,8 +111,8 @@ def test_mixed_concat(): mixed_dfs = [from_pandas(df, 2), from_pandas(df2, 2), df3] - assert(ray_df_equals_pandas(pd.concat(mixed_dfs), - pandas.concat([df, df2, df3]))) + assert (ray_df_equals_pandas( + pd.concat(mixed_dfs), pandas.concat([df, df2, df3]))) def test_mixed_inner_concat(): @@ -113,8 +121,9 @@ def test_mixed_inner_concat(): mixed_dfs = [from_pandas(df, 2), from_pandas(df2, 2), df3] - assert(ray_df_equals_pandas(pd.concat(mixed_dfs, join='inner'), - pandas.concat([df, df2, df3], join='inner'))) + assert (ray_df_equals_pandas( + pd.concat(mixed_dfs, join='inner'), + pandas.concat([df, df2, df3], join='inner'))) def test_mixed_none_concat(): @@ -123,5 +132,5 @@ def test_mixed_none_concat(): mixed_dfs = [from_pandas(df, 2), from_pandas(df2, 2), df3] - assert(ray_df_equals_pandas(pd.concat(mixed_dfs), - pandas.concat([df, df2, df3]))) + assert (ray_df_equals_pandas( + pd.concat(mixed_dfs), pandas.concat([df, df2, df3]))) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index ed26052b311..3b3e8aaf696 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -28,39 +28,39 @@ def ray_df_equals(ray_df1, ray_df2): @pytest.fixture def create_test_dataframe(): - return pd.DataFrame({'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0]}) + return pd.DataFrame({ + 'col1': [0, 1, 2, 3], + 'col2': [4, 5, 6, 7], + 'col3': [8, 9, 10, 11], + 'col4': [12, 13, 14, 15], + 'col5': [0, 0, 0, 0] + }) def test_int_dataframe(): - 
frame_data = {'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0]} + frame_data = { + 'col1': [0, 1, 2, 3], + 'col2': [4, 5, 6, 7], + 'col3': [8, 9, 10, 11], + 'col4': [12, 13, 14, 15], + 'col5': [0, 0, 0, 0] + } pandas_df = pandas.DataFrame(frame_data) ray_df = pd.DataFrame(frame_data) - testfuncs = [lambda x: x + 1, - lambda x: str(x), - lambda x: x * x, - lambda x: x, - lambda x: False] + testfuncs = [ + lambda x: x + 1, lambda x: str(x), lambda x: x * x, lambda x: x, + lambda x: False + ] - query_funcs = ['col1 < col2', 'col3 > col4', 'col1 == col2', - '(col2 > col1) and (col1 < col3)'] + query_funcs = [ + 'col1 < col2', 'col3 > col4', 'col1 == col2', + '(col2 > col1) and (col1 < col3)' + ] - keys = ['col1', - 'col2', - 'col3', - 'col4'] + keys = ['col1', 'col2', 'col3', 'col4'] - filter_by = {'items': ['col1', 'col5'], - 'regex': '4$|3$', - 'like': 'col'} + filter_by = {'items': ['col1', 'col5'], 'regex': '4$|3$', 'like': 'col'} test_filter(ray_df, pandas_df, filter_by) test_index(ray_df, pandas_df) @@ -171,8 +171,9 @@ def test_int_dataframe(): test___array__(ray_df, pandas_df) - apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'], - ['sum', 'sum']] + apply_agg_functions = [ + 'sum', lambda df: df.sum(), ['sum', 'mean'], ['sum', 'sum'] + ] for func in apply_agg_functions: test_apply(ray_df, pandas_df, func, 0) test_aggregate(ray_df, pandas_df, func, 0) @@ -206,32 +207,30 @@ def test_int_dataframe(): def test_float_dataframe(): - frame_data = {'col1': [0.0, 1.0, 2.0, 3.0], - 'col2': [4.0, 5.0, 6.0, 7.0], - 'col3': [8.0, 9.0, 10.0, 11.0], - 'col4': [12.0, 13.0, 14.0, 15.0], - 'col5': [0.0, 0.0, 0.0, 0.0]} + frame_data = { + 'col1': [0.0, 1.0, 2.0, 3.0], + 'col2': [4.0, 5.0, 6.0, 7.0], + 'col3': [8.0, 9.0, 10.0, 11.0], + 'col4': [12.0, 13.0, 14.0, 15.0], + 'col5': [0.0, 0.0, 0.0, 0.0] + } pandas_df = pandas.DataFrame(frame_data) ray_df = pd.DataFrame(frame_data) - testfuncs = [lambda x: x + 1, - lambda x: str(x), - lambda x: x * x, - lambda x: x, - lambda x: False] + testfuncs = [ + lambda x: x + 1, lambda x: str(x), lambda x: x * x, lambda x: x, + lambda x: False + ] - query_funcs = ['col1 < col2', 'col3 > col4', 'col1 == col2', - '(col2 > col1) and (col1 < col3)'] + query_funcs = [ + 'col1 < col2', 'col3 > col4', 'col1 == col2', + '(col2 > col1) and (col1 < col3)' + ] - keys = ['col1', - 'col2', - 'col3', - 'col4'] + keys = ['col1', 'col2', 'col3', 'col4'] - filter_by = {'items': ['col1', 'col5'], - 'regex': '4$|3$', - 'like': 'col'} + filter_by = {'items': ['col1', 'col5'], 'regex': '4$|3$', 'like': 'col'} test_filter(ray_df, pandas_df, filter_by) test_index(ray_df, pandas_df) @@ -344,8 +343,9 @@ def test_float_dataframe(): # TODO Nans are always not equal to each other, fix it # test___array__(ray_df, pandas_df) - apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'], - ['sum', 'sum']] + apply_agg_functions = [ + 'sum', lambda df: df.sum(), ['sum', 'mean'], ['sum', 'sum'] + ] for func in apply_agg_functions: test_apply(ray_df, pandas_df, func, 0) test_aggregate(ray_df, pandas_df, func, 0) @@ -379,30 +379,27 @@ def test_float_dataframe(): def test_mixed_dtype_dataframe(): - frame_data = {'col1': [1, 2, 3, 4], - 'col2': [4, 5, 6, 7], - 'col3': [8.0, 9.4, 10.1, 11.3], - 'col4': ['a', 'b', 'c', 'd']} + frame_data = { + 'col1': [1, 2, 3, 4], + 'col2': [4, 5, 6, 7], + 'col3': [8.0, 9.4, 10.1, 11.3], + 'col4': ['a', 'b', 'c', 'd'] + } pandas_df = pandas.DataFrame(frame_data) ray_df = 
pd.DataFrame(frame_data) - testfuncs = [lambda x: x + x, - lambda x: str(x), - lambda x: x, - lambda x: False] + testfuncs = [ + lambda x: x + x, lambda x: str(x), lambda x: x, lambda x: False + ] - query_funcs = ['col1 < col2', 'col1 == col2', - '(col2 > col1) and (col1 < col3)'] + query_funcs = [ + 'col1 < col2', 'col1 == col2', '(col2 > col1) and (col1 < col3)' + ] - keys = ['col1', - 'col2', - 'col3', - 'col4'] + keys = ['col1', 'col2', 'col3', 'col4'] - filter_by = {'items': ['col1', 'col5'], - 'regex': '4$|3$', - 'like': 'col'} + filter_by = {'items': ['col1', 'col5'], 'regex': '4$|3$', 'like': 'col'} test_filter(ray_df, pandas_df, filter_by) test_index(ray_df, pandas_df) @@ -548,30 +545,28 @@ def test_mixed_dtype_dataframe(): def test_nan_dataframe(): - frame_data = {'col1': [1, 2, 3, np.nan], - 'col2': [4, 5, np.nan, 7], - 'col3': [8, np.nan, 10, 11], - 'col4': [np.nan, 13, 14, 15]} + frame_data = { + 'col1': [1, 2, 3, np.nan], + 'col2': [4, 5, np.nan, 7], + 'col3': [8, np.nan, 10, 11], + 'col4': [np.nan, 13, 14, 15] + } pandas_df = pandas.DataFrame(frame_data) ray_df = pd.DataFrame(frame_data) - testfuncs = [lambda x: x + x, - lambda x: str(x), - lambda x: x, - lambda x: False] + testfuncs = [ + lambda x: x + x, lambda x: str(x), lambda x: x, lambda x: False + ] - query_funcs = ['col1 < col2', 'col3 > col4', 'col1 == col2', - '(col2 > col1) and (col1 < col3)'] + query_funcs = [ + 'col1 < col2', 'col3 > col4', 'col1 == col2', + '(col2 > col1) and (col1 < col3)' + ] - keys = ['col1', - 'col2', - 'col3', - 'col4'] + keys = ['col1', 'col2', 'col3', 'col4'] - filter_by = {'items': ['col1', 'col5'], - 'regex': '4$|3$', - 'like': 'col'} + filter_by = {'items': ['col1', 'col5'], 'regex': '4$|3$', 'like': 'col'} test_filter(ray_df, pandas_df, filter_by) test_index(ray_df, pandas_df) @@ -680,8 +675,9 @@ def test_nan_dataframe(): # TODO Nans are always not equal to each other, fix it # test___array__(ray_df, pandas_df) - apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'], - ['sum', 'sum']] + apply_agg_functions = [ + 'sum', lambda df: df.sum(), ['sum', 'mean'], ['sum', 'sum'] + ] for func in apply_agg_functions: test_apply(ray_df, pandas_df, func, 0) test_aggregate(ray_df, pandas_df, func, 0) @@ -753,8 +749,7 @@ def test_is_empty(df): def test_dense_nan_df(): - frame_data = [[np.nan, 2, np.nan, 0], - [3, 4, np.nan, 1], + frame_data = [[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1], [np.nan, np.nan, np.nan, 5]] ray_df = pd.DataFrame(frame_data, columns=list('ABCD')) @@ -773,72 +768,96 @@ def test_dense_nan_df(): @pytest.fixture def test_inter_df_math(op, simple=False): - frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} + frame_data = { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], + "col4": [2, 4, 5, 6] + } ray_df = pd.DataFrame(frame_data) pandas_df = pandas.DataFrame(frame_data) - assert ray_df_equals_pandas(getattr(ray_df, op)(ray_df), - getattr(pandas_df, op)(pandas_df)) - assert ray_df_equals_pandas(getattr(ray_df, op)(4), - getattr(pandas_df, op)(4)) - assert ray_df_equals_pandas(getattr(ray_df, op)(4.0), - getattr(pandas_df, op)(4.0)) + assert ray_df_equals_pandas( + getattr(ray_df, op)(ray_df), + getattr(pandas_df, op)(pandas_df)) + assert ray_df_equals_pandas( + getattr(ray_df, op)(4), + getattr(pandas_df, op)(4)) + assert ray_df_equals_pandas( + getattr(ray_df, op)(4.0), + getattr(pandas_df, op)(4.0)) frame_data = {"A": [0, 2], "col1": [0, 19], "col2": [1, 1]} ray_df2 = 
pd.DataFrame(frame_data) pandas_df2 = pandas.DataFrame(frame_data) - assert ray_df_equals_pandas(getattr(ray_df, op)(ray_df2), - getattr(pandas_df, op)(pandas_df2)) + assert ray_df_equals_pandas( + getattr(ray_df, op)(ray_df2), + getattr(pandas_df, op)(pandas_df2)) list_test = [0, 1, 2, 4] if not simple: - assert ray_df_equals_pandas(getattr(ray_df, op)(list_test, axis=1), - getattr(pandas_df, op)(list_test, axis=1)) + assert ray_df_equals_pandas( + getattr(ray_df, op)(list_test, axis=1), + getattr(pandas_df, op)(list_test, axis=1)) - assert ray_df_equals_pandas(getattr(ray_df, op)(list_test, axis=0), - getattr(pandas_df, op)(list_test, axis=0)) + assert ray_df_equals_pandas( + getattr(ray_df, op)(list_test, axis=0), + getattr(pandas_df, op)(list_test, axis=0)) @pytest.fixture def test_comparison_inter_ops(op): - frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} + frame_data = { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], + "col4": [2, 4, 5, 6] + } ray_df = pd.DataFrame(frame_data) pandas_df = pandas.DataFrame(frame_data) - assert ray_df_equals_pandas(getattr(ray_df, op)(ray_df), - getattr(pandas_df, op)(pandas_df)) - assert ray_df_equals_pandas(getattr(ray_df, op)(4), - getattr(pandas_df, op)(4)) - assert ray_df_equals_pandas(getattr(ray_df, op)(4.0), - getattr(pandas_df, op)(4.0)) + assert ray_df_equals_pandas( + getattr(ray_df, op)(ray_df), + getattr(pandas_df, op)(pandas_df)) + assert ray_df_equals_pandas( + getattr(ray_df, op)(4), + getattr(pandas_df, op)(4)) + assert ray_df_equals_pandas( + getattr(ray_df, op)(4.0), + getattr(pandas_df, op)(4.0)) frame_data = {"A": [0, 2], "col1": [0, 19], "col2": [1, 1]} ray_df2 = pd.DataFrame(frame_data) pandas_df2 = pandas.DataFrame(frame_data) - assert ray_df_equals_pandas(getattr(ray_df2, op)(ray_df2), - getattr(pandas_df2, op)(pandas_df2)) + assert ray_df_equals_pandas( + getattr(ray_df2, op)(ray_df2), + getattr(pandas_df2, op)(pandas_df2)) @pytest.fixture def test_inter_df_math_right_ops(op): - frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} + frame_data = { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], + "col4": [2, 4, 5, 6] + } ray_df = pd.DataFrame(frame_data) pandas_df = pandas.DataFrame(frame_data) - assert ray_df_equals_pandas(getattr(ray_df, op)(4), - getattr(pandas_df, op)(4)) - assert ray_df_equals_pandas(getattr(ray_df, op)(4.0), - getattr(pandas_df, op)(4.0)) + assert ray_df_equals_pandas( + getattr(ray_df, op)(4), + getattr(pandas_df, op)(4)) + assert ray_df_equals_pandas( + getattr(ray_df, op)(4.0), + getattr(pandas_df, op)(4.0)) @pytest.fixture @@ -1004,8 +1023,12 @@ def test_any(ray_df, pd_df): def test_append(): - frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} + frame_data = { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], + "col4": [2, 4, 5, 6] + } ray_df = pd.DataFrame(frame_data) pandas_df = pandas.DataFrame(frame_data) @@ -1015,8 +1038,8 @@ def test_append(): ray_df2 = pd.DataFrame(frame_data2) pandas_df2 = pandas.DataFrame(frame_data2) - assert ray_df_equals_pandas(ray_df.append(ray_df2), - pandas_df.append(pandas_df2)) + assert ray_df_equals_pandas( + ray_df.append(ray_df2), pandas_df.append(pandas_df2)) with pytest.raises(ValueError): ray_df.append(ray_df2, verify_integrity=True) @@ -1092,23 +1115,26 @@ def test_astype(): td = TestData() ray_df = pd.DataFrame(td.frame) 
     our_df_casted = ray_df.astype(np.int32)
-    expected_df_casted = pandas.DataFrame(td.frame.values.astype(np.int32),
-                                          index=td.frame.index,
-                                          columns=td.frame.columns)
+    expected_df_casted = pandas.DataFrame(
+        td.frame.values.astype(np.int32),
+        index=td.frame.index,
+        columns=td.frame.columns)
 
     assert ray_df_equals_pandas(our_df_casted, expected_df_casted)
 
     our_df_casted = ray_df.astype(np.float64)
-    expected_df_casted = pandas.DataFrame(td.frame.values.astype(np.float64),
-                                          index=td.frame.index,
-                                          columns=td.frame.columns)
+    expected_df_casted = pandas.DataFrame(
+        td.frame.values.astype(np.float64),
+        index=td.frame.index,
+        columns=td.frame.columns)
 
     assert ray_df_equals_pandas(our_df_casted, expected_df_casted)
 
     our_df_casted = ray_df.astype(str)
-    expected_df_casted = pandas.DataFrame(td.frame.values.astype(str),
-                                          index=td.frame.index,
-                                          columns=td.frame.columns)
+    expected_df_casted = pandas.DataFrame(
+        td.frame.values.astype(str),
+        index=td.frame.index,
+        columns=td.frame.columns)
 
     assert ray_df_equals_pandas(our_df_casted, expected_df_casted)
@@ -1267,8 +1293,8 @@ def test_describe(ray_df, pandas_df):
 def test_diff(ray_df, pandas_df):
     assert ray_df_equals_pandas(ray_df.diff(), pandas_df.diff())
     assert ray_df_equals_pandas(ray_df.diff(axis=1), pandas_df.diff(axis=1))
-    assert ray_df_equals_pandas(ray_df.diff(periods=1),
-                                pandas_df.diff(periods=1))
+    assert ray_df_equals_pandas(
+        ray_df.diff(periods=1), pandas_df.diff(periods=1))
 
 
 def test_div():
@@ -1291,12 +1317,12 @@ def test_drop():
     simple = pandas.DataFrame(frame_data)
     ray_simple = pd.DataFrame(frame_data)
     assert ray_df_equals_pandas(ray_simple.drop("A", axis=1), simple[['B']])
-    assert ray_df_equals_pandas(ray_simple.drop(["A", "B"], axis='columns'),
-                                simple[[]])
-    assert ray_df_equals_pandas(ray_simple.drop([0, 1, 3], axis=0),
-                                simple.loc[[2], :])
-    assert ray_df_equals_pandas(ray_simple.drop([0, 3], axis='index'),
-                                simple.loc[[1, 2], :])
+    assert ray_df_equals_pandas(
+        ray_simple.drop(["A", "B"], axis='columns'), simple[[]])
+    assert ray_df_equals_pandas(
+        ray_simple.drop([0, 1, 3], axis=0), simple.loc[[2], :])
+    assert ray_df_equals_pandas(
+        ray_simple.drop([0, 3], axis='index'), simple.loc[[1, 2], :])
 
     pytest.raises(ValueError, ray_simple.drop, 5)
     pytest.raises(ValueError, ray_simple.drop, 'C', 1)
@@ -1305,31 +1331,30 @@
     # errors = 'ignore'
     assert ray_df_equals_pandas(ray_simple.drop(5, errors='ignore'), simple)
-    assert ray_df_equals_pandas(ray_simple.drop([0, 5], errors='ignore'),
-                                simple.loc[[1, 2, 3], :])
-    assert ray_df_equals_pandas(ray_simple.drop('C', axis=1, errors='ignore'),
-                                simple)
-    assert ray_df_equals_pandas(ray_simple.drop(['A', 'C'], axis=1,
-                                                errors='ignore'),
-                                simple[['B']])
+    assert ray_df_equals_pandas(
+        ray_simple.drop([0, 5], errors='ignore'), simple.loc[[1, 2, 3], :])
+    assert ray_df_equals_pandas(
+        ray_simple.drop('C', axis=1, errors='ignore'), simple)
+    assert ray_df_equals_pandas(
+        ray_simple.drop(['A', 'C'], axis=1, errors='ignore'), simple[['B']])
 
     # non-unique
-    nu_df = pandas.DataFrame(pandas.compat.lzip(range(3), range(-3, 1),
-                                                list('abc')),
-                             columns=['a', 'a', 'b'])
+    nu_df = pandas.DataFrame(
+        pandas.compat.lzip(range(3), range(-3, 1), list('abc')),
+        columns=['a', 'a', 'b'])
     ray_nu_df = pd.DataFrame(nu_df)
     assert ray_df_equals_pandas(ray_nu_df.drop('a', axis=1), nu_df[['b']])
-    assert ray_df_equals_pandas(ray_nu_df.drop('b', axis='columns'),
-                                nu_df['a'])
+    assert ray_df_equals_pandas(
+        ray_nu_df.drop('b', axis='columns'), nu_df['a'])
     assert ray_df_equals_pandas(ray_nu_df.drop([]), nu_df)
 
     nu_df = nu_df.set_index(pandas.Index(['X', 'Y', 'X']))
     nu_df.columns = list('abc')
     ray_nu_df = pd.DataFrame(nu_df)
-    assert ray_df_equals_pandas(ray_nu_df.drop('X', axis='rows'),
-                                nu_df.loc[["Y"], :])
-    assert ray_df_equals_pandas(ray_nu_df.drop(['X', 'Y'], axis=0),
-                                nu_df.loc[[], :])
+    assert ray_df_equals_pandas(
+        ray_nu_df.drop('X', axis='rows'), nu_df.loc[["Y"], :])
+    assert ray_df_equals_pandas(
+        ray_nu_df.drop(['X', 'Y'], axis=0), nu_df.loc[[], :])
 
     # inplace cache issue
     frame_data = np.random.randn(10, 3)
@@ -1344,8 +1369,8 @@ def test_drop_api_equivalence():
     # equivalence of the labels/axis and index/columns API's
     frame_data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]]
 
-    ray_df = pd.DataFrame(frame_data, index=['a', 'b', 'c'],
-                          columns=['d', 'e', 'f'])
+    ray_df = pd.DataFrame(
+        frame_data, index=['a', 'b', 'c'], columns=['d', 'e', 'f'])
 
     ray_df1 = ray_df.drop('a')
     ray_df2 = ray_df.drop(index='a')
@@ -1386,17 +1411,17 @@ def test_drop_duplicates():
 
 @pytest.fixture
 def test_dropna(ray_df, pd_df):
-    assert ray_df_equals_pandas(ray_df.dropna(axis=1, how='all'),
-                                pd_df.dropna(axis=1, how='all'))
+    assert ray_df_equals_pandas(
+        ray_df.dropna(axis=1, how='all'), pd_df.dropna(axis=1, how='all'))
 
-    assert ray_df_equals_pandas(ray_df.dropna(axis=1, how='any'),
-                                pd_df.dropna(axis=1, how='any'))
+    assert ray_df_equals_pandas(
+        ray_df.dropna(axis=1, how='any'), pd_df.dropna(axis=1, how='any'))
 
-    assert ray_df_equals_pandas(ray_df.dropna(axis=0, how='all'),
-                                pd_df.dropna(axis=0, how='all'))
+    assert ray_df_equals_pandas(
+        ray_df.dropna(axis=0, how='all'), pd_df.dropna(axis=0, how='all'))
 
-    assert ray_df_equals_pandas(ray_df.dropna(thresh=2),
-                                pd_df.dropna(thresh=2))
+    assert ray_df_equals_pandas(
+        ray_df.dropna(thresh=2), pd_df.dropna(thresh=2))
 
 
 @pytest.fixture
@@ -1417,10 +1442,12 @@ def test_dropna_inplace(ray_df, pd_df):
 
 @pytest.fixture
 def test_dropna_multiple_axes(ray_df, pd_df):
-    assert ray_df_equals_pandas(ray_df.dropna(how='all', axis=[0, 1]),
-                                pd_df.dropna(how='all', axis=[0, 1]))
-    assert ray_df_equals_pandas(ray_df.dropna(how='all', axis=(0, 1)),
-                                pd_df.dropna(how='all', axis=(0, 1)))
+    assert ray_df_equals_pandas(
+        ray_df.dropna(how='all', axis=[0, 1]),
+        pd_df.dropna(how='all', axis=[0, 1]))
+    assert ray_df_equals_pandas(
+        ray_df.dropna(how='all', axis=(0, 1)),
+        pd_df.dropna(how='all', axis=(0, 1)))
 
 
 @pytest.fixture
@@ -1445,11 +1472,13 @@ def test_dropna_multiple_axes_inplace(ray_df, pd_df):
 
 @pytest.fixture
 def test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets):
     for subset in column_subsets:
-        assert ray_df_equals_pandas(ray_df.dropna(how='all', subset=subset),
-                                    pd_df.dropna(how='all', subset=subset))
+        assert ray_df_equals_pandas(
+            ray_df.dropna(how='all', subset=subset),
+            pd_df.dropna(how='all', subset=subset))
 
-        assert ray_df_equals_pandas(ray_df.dropna(how='any', subset=subset),
-                                    pd_df.dropna(how='any', subset=subset))
+        assert ray_df_equals_pandas(
+            ray_df.dropna(how='any', subset=subset),
+            pd_df.dropna(how='any', subset=subset))
 
     for subset in row_subsets:
         assert ray_df_equals_pandas(
@@ -1499,11 +1528,16 @@ def test_eval_df_use_case():
     frame_data = {'a': np.random.randn(10), 'b': np.random.randn(10)}
     df = pandas.DataFrame(frame_data)
     ray_df = pd.DataFrame(frame_data)
-    df.eval("e = arctan2(sin(a), b)",
-            engine='python', parser='pandas', inplace=True)
-    ray_df.eval("e = arctan2(sin(a), b)",
-                engine='python',
-                parser='pandas', inplace=True)
+    df.eval(
+        "e = arctan2(sin(a), b)",
+        engine='python',
+        parser='pandas',
+        inplace=True)
+    ray_df.eval(
+        "e = arctan2(sin(a), b)",
+        engine='python',
+        parser='pandas',
+        inplace=True)
 
     # TODO: Use a series equality validator.
     assert ray_df_equals_pandas(ray_df, df)
@@ -1512,10 +1546,10 @@ def test_eval_df_arithmetic_subexpression():
     frame_data = {'a': np.random.randn(10), 'b': np.random.randn(10)}
     df = pandas.DataFrame(frame_data)
     ray_df = pd.DataFrame(frame_data)
-    df.eval("not_e = sin(a + b)",
-            engine='python', parser='pandas', inplace=True)
-    ray_df.eval("not_e = sin(a + b)",
-                engine='python', parser='pandas', inplace=True)
+    df.eval(
+        "not_e = sin(a + b)", engine='python', parser='pandas', inplace=True)
+    ray_df.eval(
+        "not_e = sin(a + b)", engine='python', parser='pandas', inplace=True)
 
     # TODO: Use a series equality validator.
     assert ray_df_equals_pandas(ray_df, df)
@@ -1620,8 +1654,7 @@ def test_fillna_sanity():
     #     df.x.fillna(method=m)
 
     # with different dtype
-    frame_data = [['a', 'a', np.nan, 'a'],
-                  ['b', 'b', np.nan, 'b'],
+    frame_data = [['a', 'a', np.nan, 'a'], ['b', 'b', np.nan, 'b'],
                   ['c', 'c', np.nan, 'c']]
     df = pandas.DataFrame(frame_data)
 
@@ -1635,8 +1668,10 @@
     ray_df.fillna({2: 'foo'}, inplace=True)
     assert ray_df_equals_pandas(ray_df, result)
 
-    frame_data = {'Date': [pandas.NaT, pandas.Timestamp("2014-1-1")],
-                  'Date2': [pandas.Timestamp("2013-1-1"), pandas.NaT]}
+    frame_data = {
+        'Date': [pandas.NaT, pandas.Timestamp("2014-1-1")],
+        'Date2': [pandas.Timestamp("2013-1-1"), pandas.NaT]
+    }
     df = pandas.DataFrame(frame_data)
     result = df.fillna(value={'Date': df['Date2']})
     ray_df = pd.DataFrame(frame_data).fillna(value={'Date': df['Date2']})
@@ -1683,8 +1718,9 @@ def test_ffill2():
     test_data.tsframe['A'][:5] = np.nan
     test_data.tsframe['A'][-5:] = np.nan
     ray_df = pd.DataFrame(test_data.tsframe)
-    assert ray_df_equals_pandas(ray_df.fillna(method='ffill'),
-                                test_data.tsframe.fillna(method='ffill'))
+    assert ray_df_equals_pandas(
+        ray_df.fillna(method='ffill'),
+        test_data.tsframe.fillna(method='ffill'))
 
 
 @pytest.fixture
@@ -1693,8 +1729,9 @@ def test_bfill2():
     test_data.tsframe['A'][:5] = np.nan
     test_data.tsframe['A'][-5:] = np.nan
     ray_df = pd.DataFrame(test_data.tsframe)
-    assert ray_df_equals_pandas(ray_df.fillna(method='bfill'),
-                                test_data.tsframe.fillna(method='bfill'))
+    assert ray_df_equals_pandas(
+        ray_df.fillna(method='bfill'),
+        test_data.tsframe.fillna(method='bfill'))
 
 
 @pytest.fixture
@@ -1739,8 +1776,8 @@ def test_frame_fillna_limit():
 
     expected = df[-2:].reindex(index)
     expected = expected.fillna(method='backfill', limit=5)
-    ray_df = pd.DataFrame(df[-2:].reindex(index)).fillna(method='backfill',
-                                                         limit=5)
+    ray_df = pd.DataFrame(df[-2:].reindex(index)).fillna(
+        method='backfill', limit=5)
     assert ray_df_equals_pandas(ray_df, expected)
 
 
@@ -1752,13 +1789,15 @@ def test_frame_pad_backfill_limit():
 
     result = df[:2].reindex(index)
     ray_df = pd.DataFrame(result)
-    assert ray_df_equals_pandas(ray_df.fillna(method='pad', limit=5),
-                                result.fillna(method='pad', limit=5))
+    assert ray_df_equals_pandas(
+        ray_df.fillna(method='pad', limit=5),
+        result.fillna(method='pad', limit=5))
 
     result = df[-2:].reindex(index)
     ray_df = pd.DataFrame(result)
-    assert ray_df_equals_pandas(ray_df.fillna(method='backfill', limit=5),
-                                result.fillna(method='backfill', limit=5))
+    assert ray_df_equals_pandas(
+        ray_df.fillna(method='backfill', limit=5),
+        result.fillna(method='backfill', limit=5))
 
 
 @pytest.fixture
@@ -1788,17 +1827,33 @@ def test_fillna_skip_certain_blocks():
 
 @pytest.fixture
 def test_fillna_dict_series():
-    frame_data = {'a': [np.nan, 1, 2, np.nan, np.nan],
-                  'b': [1, 2, 3, np.nan, np.nan],
-                  'c': [np.nan, 1, 2, 3, 4]}
+    frame_data = {
+        'a': [np.nan, 1, 2, np.nan, np.nan],
+        'b': [1, 2, 3, np.nan, np.nan],
+        'c': [np.nan, 1, 2, 3, 4]
+    }
     df = pandas.DataFrame(frame_data)
     ray_df = pd.DataFrame(frame_data)
 
-    assert ray_df_equals_pandas(ray_df.fillna({'a': 0, 'b': 5}),
-                                df.fillna({'a': 0, 'b': 5}))
+    assert ray_df_equals_pandas(
+        ray_df.fillna({
+            'a': 0,
+            'b': 5
+        }), df.fillna({
+            'a': 0,
+            'b': 5
+        }))
 
-    assert ray_df_equals_pandas(ray_df.fillna({'a': 0, 'b': 5, 'd': 7}),
-                                df.fillna({'a': 0, 'b': 5, 'd': 7}))
+    assert ray_df_equals_pandas(
+        ray_df.fillna({
+            'a': 0,
+            'b': 5,
+            'd': 7
+        }), df.fillna({
+            'a': 0,
+            'b': 5,
+            'd': 7
+        }))
 
     # Series treated same as dict
     assert ray_df_equals_pandas(ray_df.fillna(df.max()), df.fillna(df.max()))
@@ -1806,17 +1861,22 @@ def test_fillna_dict_series():
 
 @pytest.fixture
 def test_fillna_dataframe():
-    frame_data = {'a': [np.nan, 1, 2, np.nan, np.nan],
-                  'b': [1, 2, 3, np.nan, np.nan],
-                  'c': [np.nan, 1, 2, 3, 4]}
+    frame_data = {
+        'a': [np.nan, 1, 2, np.nan, np.nan],
+        'b': [1, 2, 3, np.nan, np.nan],
+        'c': [np.nan, 1, 2, 3, 4]
+    }
     df = pandas.DataFrame(frame_data, index=list('VWXYZ'))
     ray_df = pd.DataFrame(frame_data, index=list('VWXYZ'))
 
     # df2 may have different index and columns
-    df2 = pandas.DataFrame({'a': [np.nan, 10, 20, 30, 40],
-                            'b': [50, 60, 70, 80, 90],
-                            'foo': ['bar'] * 5},
-                           index=list('VWXuZ'))
+    df2 = pandas.DataFrame(
+        {
+            'a': [np.nan, 10, 20, 30, 40],
+            'b': [50, 60, 70, 80, 90],
+            'foo': ['bar'] * 5
+        },
+        index=list('VWXuZ'))
 
     # only those columns and indices which are shared get filled
     assert ray_df_equals_pandas(ray_df.fillna(df2), df.fillna(df2))
@@ -1829,13 +1889,15 @@ def test_fillna_columns():
     df.values[:, ::2] = np.nan
     ray_df = pd.DataFrame(df)
 
-    assert ray_df_equals_pandas(ray_df.fillna(method='ffill', axis=1),
-                                df.fillna(method='ffill', axis=1))
+    assert ray_df_equals_pandas(
+        ray_df.fillna(method='ffill', axis=1), df.fillna(
+            method='ffill', axis=1))
 
     df.insert(6, 'foo', 5)
     ray_df = pd.DataFrame(df)
-    assert ray_df_equals_pandas(ray_df.fillna(method='ffill', axis=1),
-                                df.fillna(method='ffill', axis=1))
+    assert ray_df_equals_pandas(
+        ray_df.fillna(method='ffill', axis=1), df.fillna(
+            method='ffill', axis=1))
 
 
 @pytest.fixture
@@ -1865,8 +1927,8 @@ def test_fillna_col_reordering():
     data = np.random.rand(20, 5)
     df = pandas.DataFrame(index=range(20), columns=cols, data=data)
     ray_df = pd.DataFrame(index=range(20), columns=cols, data=data)
-    assert ray_df_equals_pandas(ray_df.fillna(method='ffill'),
-                                df.fillna(method='ffill'))
+    assert ray_df_equals_pandas(
+        ray_df.fillna(method='ffill'), df.fillna(method='ffill'))
 
 
 """
@@ -1895,14 +1957,14 @@ def test_fillna_datetime_columns():
 
 @pytest.fixture
 def test_filter(ray_df, pandas_df, by):
-    assert ray_df_equals_pandas(ray_df.filter(items=by['items']),
-                                pandas_df.filter(items=by['items']))
+    assert ray_df_equals_pandas(
+        ray_df.filter(items=by['items']), pandas_df.filter(items=by['items']))
 
-    assert ray_df_equals_pandas(ray_df.filter(regex=by['regex']),
-                                pandas_df.filter(regex=by['regex']))
+    assert ray_df_equals_pandas(
+        ray_df.filter(regex=by['regex']), pandas_df.filter(regex=by['regex']))
 
-    assert ray_df_equals_pandas(ray_df.filter(like=by['like']),
-                                pandas_df.filter(like=by['like']))
+    assert ray_df_equals_pandas(
+        ray_df.filter(like=by['like']), pandas_df.filter(like=by['like']))
 
 
 def test_first():
@@ -2072,8 +2134,12 @@ def test_itertuples(ray_df, pandas_df):
 
 def test_join():
-    frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7],
-                  "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}
+    frame_data = {
+        "col1": [0, 1, 2, 3],
+        "col2": [4, 5, 6, 7],
+        "col3": [8, 9, 0, 1],
+        "col4": [2, 4, 5, 6]
+    }
 
     ray_df = pd.DataFrame(frame_data)
     pandas_df = pandas.DataFrame(frame_data)
@@ -2187,8 +2253,12 @@ def test_memory_usage(ray_df):
 
 def test_merge():
-    frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7],
-                  "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}
+    frame_data = {
+        "col1": [0, 1, 2, 3],
+        "col2": [4, 5, 6, 7],
+        "col3": [8, 9, 0, 1],
+        "col4": [2, 4, 5, 6]
+    }
 
     ray_df = pd.DataFrame(frame_data)
     pandas_df = pandas.DataFrame(frame_data)
@@ -2206,38 +2276,38 @@
         ray_df_equals_pandas(ray_result, pandas_result)
 
         # left_on and right_index
-        ray_result = ray_df.merge(ray_df2, how=how, left_on='col1',
-                                  right_index=True)
-        pandas_result = pandas_df.merge(pandas_df2, how=how,
-                                        left_on='col1', right_index=True)
+        ray_result = ray_df.merge(
+            ray_df2, how=how, left_on='col1', right_index=True)
+        pandas_result = pandas_df.merge(
+            pandas_df2, how=how, left_on='col1', right_index=True)
         ray_df_equals_pandas(ray_result, pandas_result)
 
         # left_index and right_on
-        ray_result = ray_df.merge(ray_df2, how=how, left_index=True,
-                                  right_on='col1')
-        pandas_result = pandas_df.merge(pandas_df2, how=how,
-                                        left_index=True, right_on='col1')
+        ray_result = ray_df.merge(
+            ray_df2, how=how, left_index=True, right_on='col1')
+        pandas_result = pandas_df.merge(
+            pandas_df2, how=how, left_index=True, right_on='col1')
         ray_df_equals_pandas(ray_result, pandas_result)
 
         # left_on and right_on col1
-        ray_result = ray_df.merge(ray_df2, how=how, left_on='col1',
-                                  right_on='col1')
-        pandas_result = pandas_df.merge(pandas_df2, how=how,
-                                        left_on='col1', right_on='col1')
+        ray_result = ray_df.merge(
+            ray_df2, how=how, left_on='col1', right_on='col1')
+        pandas_result = pandas_df.merge(
+            pandas_df2, how=how, left_on='col1', right_on='col1')
         ray_df_equals_pandas(ray_result, pandas_result)
 
         # left_on and right_on col2
-        ray_result = ray_df.merge(ray_df2, how=how, left_on='col2',
-                                  right_on='col2')
-        pandas_result = pandas_df.merge(pandas_df2, how=how,
-                                        left_on='col2', right_on='col2')
+        ray_result = ray_df.merge(
+            ray_df2, how=how, left_on='col2', right_on='col2')
+        pandas_result = pandas_df.merge(
+            pandas_df2, how=how, left_on='col2', right_on='col2')
         ray_df_equals_pandas(ray_result, pandas_result)
 
         # left_index and right_index
-        ray_result = ray_df.merge(ray_df2, how=how, left_index=True,
-                                  right_index=True)
-        pandas_result = pandas_df.merge(pandas_df2, how=how, left_index=True,
-                                        right_index=True)
+        ray_result = ray_df.merge(
+            ray_df2, how=how, left_index=True, right_index=True)
+        pandas_result = pandas_df.merge(
+            pandas_df2, how=how, left_index=True, right_index=True)
         ray_df_equals_pandas(ray_result, pandas_result)
@@ -2254,8 +2324,8 @@ def test_mod():
 
 @pytest.fixture
 def test_mode(ray_df, pandas_df):
     assert ray_series_equals_pandas(ray_df.mode(), pandas_df.mode())
-    assert ray_series_equals_pandas(ray_df.mode(axis=1),
-                                    pandas_df.mode(axis=1))
+    assert ray_series_equals_pandas(
+        ray_df.mode(axis=1), pandas_df.mode(axis=1))
 
 
 def test_mul():
@@ -2297,8 +2367,8 @@ def test_nsmallest():
 
 @pytest.fixture
 def test_nunique(ray_df, pandas_df):
     assert ray_df_equals_pandas(ray_df.nunique(), pandas_df.nunique())
-    assert ray_df_equals_pandas(ray_df.nunique(axis=1),
-                                pandas_df.nunique(axis=1))
+    assert ray_df_equals_pandas(
+        ray_df.nunique(axis=1), pandas_df.nunique(axis=1))
 
 
 def test_pct_change():
@@ -2325,17 +2395,13 @@ def g(x, arg1=0):
     def f(x, arg2=0, arg3=0):
         return x.drop([arg2, arg3])
 
-    assert ray_df_equals(f(g(h(ray_df), arg1=a), arg2=b, arg3=c),
-                         (ray_df.pipe(h)
-                          .pipe(g, arg1=a)
-                          .pipe(f, arg2=b, arg3=c)))
+    assert ray_df_equals(
+        f(g(h(ray_df), arg1=a), arg2=b, arg3=c),
+        (ray_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)))
 
-    assert ray_df_equals_pandas((ray_df.pipe(h)
-                                 .pipe(g, arg1=a)
-                                 .pipe(f, arg2=b, arg3=c)),
-                                (pandas_df.pipe(h)
-                                 .pipe(g, arg1=a)
-                                 .pipe(f, arg2=b, arg3=c)))
+    assert ray_df_equals_pandas(
+        (ray_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)),
+        (pandas_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)))
 
 
 def test_pivot():
@@ -2410,11 +2476,13 @@ def test_rdiv():
 
 def test_reindex():
-    frame_data = {'col1': [0, 1, 2, 3],
-                  'col2': [4, 5, 6, 7],
-                  'col3': [8, 9, 10, 11],
-                  'col4': [12, 13, 14, 15],
-                  'col5': [0, 0, 0, 0]}
+    frame_data = {
+        'col1': [0, 1, 2, 3],
+        'col2': [4, 5, 6, 7],
+        'col3': [8, 9, 10, 11],
+        'col4': [12, 13, 14, 15],
+        'col5': [0, 0, 0, 0]
+    }
 
     pandas_df = pandas.DataFrame(frame_data)
     ray_df = pd.DataFrame(frame_data)
@@ -2433,10 +2501,10 @@ def test_reindex():
         pandas_df.reindex(['col1', 'col7', 'col4', 'col8'], axis=1))
 
     assert ray_df_equals_pandas(
-        ray_df.reindex(index=[0, 1, 5],
-                       columns=['col1', 'col7', 'col4', 'col8']),
-        pandas_df.reindex(index=[0, 1, 5],
-                          columns=['col1', 'col7', 'col4', 'col8']))
+        ray_df.reindex(
+            index=[0, 1, 5], columns=['col1', 'col7', 'col4', 'col8']),
+        pandas_df.reindex(
+            index=[0, 1, 5], columns=['col1', 'col7', 'col4', 'col8']))
 
 
 def test_reindex_axis():
@@ -2455,6 +2523,7 @@ def test_reindex_like():
 
 # Renaming
 
+
 def test_rename():
     test_rename_sanity()
     test_rename_multiindex()
@@ -2470,15 +2539,16 @@ def test_rename_sanity():
     mapping = {'A': 'a', 'B': 'b', 'C': 'c', 'D': 'd'}
 
     ray_df = pd.DataFrame(test_data.frame)
-    assert ray_df_equals_pandas(ray_df.rename(columns=mapping),
-                                test_data.frame.rename(columns=mapping))
+    assert ray_df_equals_pandas(
+        ray_df.rename(columns=mapping),
+        test_data.frame.rename(columns=mapping))
 
     renamed2 = test_data.frame.rename(columns=str.lower)
     assert ray_df_equals_pandas(ray_df.rename(columns=str.lower), renamed2)
 
     ray_df = pd.DataFrame(renamed2)
-    assert ray_df_equals_pandas(ray_df.rename(columns=str.upper),
-                                renamed2.rename(columns=str.upper))
+    assert ray_df_equals_pandas(
+        ray_df.rename(columns=str.upper), renamed2.rename(columns=str.upper))
 
     # index
     data = {'A': {'foo': 0, 'bar': 1}}
@@ -2487,11 +2557,17 @@
     df = pandas.DataFrame(data)
     ray_df = pd.DataFrame(data)
     tm.assert_index_equal(
-        ray_df.rename(index={'foo': 'bar', 'bar': 'foo'}).index,
-        df.rename(index={'foo': 'bar', 'bar': 'foo'}).index)
+        ray_df.rename(index={
+            'foo': 'bar',
+            'bar': 'foo'
+        }).index,
+        df.rename(index={
+            'foo': 'bar',
+            'bar': 'foo'
+        }).index)
 
-    tm.assert_index_equal(ray_df.rename(index=str.upper).index,
-                          df.rename(index=str.upper).index)
+    tm.assert_index_equal(
+        ray_df.rename(index=str.upper).index, df.rename(index=str.upper).index)
 
     # have to pass something
     pytest.raises(TypeError, ray_df.rename)
@@ -2500,8 +2576,14 @@
     renamed = test_data.frame.rename(columns={'C': 'foo', 'D': 'bar'})
     ray_df = pd.DataFrame(test_data.frame)
     tm.assert_index_equal(
-        ray_df.rename(columns={'C': 'foo', 'D': 'bar'}).index,
-        test_data.frame.rename(columns={'C': 'foo', 'D': 'bar'}).index)
+        ray_df.rename(columns={
+            'C': 'foo',
+            'D': 'bar'
+        }).index,
+        test_data.frame.rename(columns={
+            'C': 'foo',
+            'D': 'bar'
+        }).index)
 
     # TODO: Uncomment when transpose works
     # other axis
@@ -2536,14 +2618,35 @@ def test_rename_multiindex():
     #
     # without specifying level -> across all levels
-    renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'},
-                        columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'})
-    ray_renamed = ray_df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'},
-                                columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'})
+    renamed = df.rename(
+        index={
+            'foo1': 'foo3',
+            'bar2': 'bar3'
+        },
+        columns={
+            'fizz1': 'fizz3',
+            'buzz2': 'buzz3'
+        })
+    ray_renamed = ray_df.rename(
+        index={
+            'foo1': 'foo3',
+            'bar2': 'bar3'
+        },
+        columns={
+            'fizz1': 'fizz3',
+            'buzz2': 'buzz3'
+        })
     tm.assert_index_equal(renamed.index, ray_renamed.index)
 
-    renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'},
-                        columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'})
+    renamed = df.rename(
+        index={
+            'foo1': 'foo3',
+            'bar2': 'bar3'
+        },
+        columns={
+            'fizz1': 'fizz3',
+            'buzz2': 'buzz3'
+        })
     tm.assert_index_equal(renamed.columns, ray_renamed.columns)
     assert renamed.index.names == ray_renamed.index.names
     assert renamed.columns.names == ray_renamed.columns.names
@@ -2552,25 +2655,42 @@
 
     # with specifying a level
 
     # dict
-    renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
-                        level=0)
-    ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
-                                level=0)
+    renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, level=0)
+    ray_renamed = ray_df.rename(
+        columns={
+            'fizz1': 'fizz3',
+            'buzz2': 'buzz3'
+        }, level=0)
     tm.assert_index_equal(renamed.columns, ray_renamed.columns)
 
-    renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
-                        level='fizz')
-    ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
-                                level='fizz')
+    renamed = df.rename(
+        columns={
+            'fizz1': 'fizz3',
+            'buzz2': 'buzz3'
+        }, level='fizz')
+    ray_renamed = ray_df.rename(
+        columns={
+            'fizz1': 'fizz3',
+            'buzz2': 'buzz3'
+        }, level='fizz')
     tm.assert_index_equal(renamed.columns, ray_renamed.columns)
 
     renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, level=1)
-    ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
-                                level=1)
+    ray_renamed = ray_df.rename(
+        columns={
+            'fizz1': 'fizz3',
+            'buzz2': 'buzz3'
+        }, level=1)
    tm.assert_index_equal(renamed.columns, ray_renamed.columns)
 
-    renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
-                        level='buzz')
-    ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
-                                level='buzz')
+    renamed = df.rename(
+        columns={
+            'fizz1': 'fizz3',
+            'buzz2': 'buzz3'
+        }, level='buzz')
+    ray_renamed = ray_df.rename(
+        columns={
+            'fizz1': 'fizz3',
+            'buzz2': 'buzz3'
+        }, level='buzz')
     tm.assert_index_equal(renamed.columns, ray_renamed.columns)
 
     # function
@@ -2591,8 +2711,11 @@
 
     # index
     renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, level=0)
-    ray_renamed = ray_df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'},
-                                level=0)
+    ray_renamed = ray_df.rename(
+        index={
+            'foo1': 'foo3',
+            'bar2': 'bar3'
+        }, level=0)
     tm.assert_index_equal(ray_renamed.index, renamed.index)
 
 
@@ -2610,8 +2733,9 @@ def test_rename_inplace():
     test_data = TestData().frame
     ray_df = pd.DataFrame(test_data)
 
-    assert ray_df_equals_pandas(ray_df.rename(columns={'C': 'foo'}),
-                                test_data.rename(columns={'C': 'foo'}))
+    assert ray_df_equals_pandas(
+        ray_df.rename(columns={'C': 'foo'}),
+        test_data.rename(columns={'C': 'foo'}))
 
     frame = test_data.copy()
     ray_frame = ray_df.copy()
@@ -2698,13 +2822,13 @@ def test_reset_index(ray_df, pandas_df, inplace=False):
 
 @pytest.mark.skip(reason="dtypes on different partitions may not match up, "
-                  "no fix for this yet")
+                         "no fix for this yet")
 def test_rfloordiv():
     test_inter_df_math_right_ops("rfloordiv")
 
 
 @pytest.mark.skip(reason="dtypes on different partitions may not match up, "
-                  "no fix for this yet")
+                         "no fix for this yet")
 def test_rmod():
     test_inter_df_math_right_ops("rmod")
@@ -2735,7 +2859,7 @@ def test_rsub():
 
 @pytest.mark.skip(reason="dtypes on different partitions may not match up, "
-                  "no fix for this yet")
+                         "no fix for this yet")
 def test_rtruediv():
     test_inter_df_math_right_ops("rtruediv")
@@ -2754,12 +2878,14 @@ def test_select():
 
 def test_select_dtypes():
-    frame_data = {'test1': list('abc'),
-                  'test2': np.arange(3, 6).astype('u1'),
-                  'test3': np.arange(8.0, 11.0, dtype='float64'),
-                  'test4': [True, False, True],
-                  'test5': pandas.date_range('now', periods=3).values,
-                  'test6': list(range(5, 8))}
+    frame_data = {
+        'test1': list('abc'),
+        'test2': np.arange(3, 6).astype('u1'),
+        'test3': np.arange(8.0, 11.0, dtype='float64'),
+        'test4': [True, False, True],
+        'test5': pandas.date_range('now', periods=3).values,
+        'test6': list(range(5, 8))
+    }
     df = pandas.DataFrame(frame_data)
     rd = pd.DataFrame(frame_data)
@@ -2820,8 +2946,7 @@ def test_shift():
 
 @pytest.fixture
 def test_skew(ray_df, pandas_df):
     assert ray_df_equals_pandas(ray_df.skew(), pandas_df.skew())
-    assert ray_df_equals_pandas(ray_df.skew(axis=1),
-                                pandas_df.skew(axis=1))
+    assert ray_df_equals_pandas(ray_df.skew(axis=1), pandas_df.skew(axis=1))
 
 
 def test_slice_shift():
@@ -2970,10 +3095,11 @@ def test_to_xarray():
 
 @pytest.fixture
 def test_transform(ray_df, pandas_df):
-    assert ray_df_equals_pandas(ray_df.transform(lambda df: df.isna()),
-                                pandas_df.transform(lambda df: df.isna()))
-    assert ray_df_equals_pandas(ray_df.transform('isna'),
-                                pandas_df.transform('isna'))
+    assert ray_df_equals_pandas(
+        ray_df.transform(lambda df: df.isna()),
+        pandas_df.transform(lambda df: df.isna()))
+    assert ray_df_equals_pandas(
+        ray_df.transform('isna'), pandas_df.transform('isna'))
 
 
 def test_truediv():
@@ -3016,19 +3142,15 @@ def test_unstack():
 
 def test_update():
-    df = pd.DataFrame([[1.5, np.nan, 3.],
-                       [1.5, np.nan, 3.],
-                       [1.5, np.nan, 3],
+    df = pd.DataFrame([[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3],
                        [1.5, np.nan, 3]])
 
-    other = pd.DataFrame([[3.6, 2., np.nan],
-                          [np.nan, np.nan, 7]], index=[1, 3])
+    other = pd.DataFrame(
+        [[3.6, 2., np.nan], [np.nan, np.nan, 7]], index=[1, 3])
 
     df.update(other)
 
-    expected = pd.DataFrame([[1.5, np.nan, 3],
-                             [3.6, 2, 3],
-                             [1.5, np.nan, 3],
+    expected = pd.DataFrame([[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3],
                              [1.5, np.nan, 7.]])
     assert ray_df_equals(df, expected)
@@ -3375,9 +3497,7 @@ def test__doc__():
 
 def test_to_datetime():
-    frame_data = {'year': [2015, 2016],
-                  'month': [2, 3],
-                  'day': [4, 5]}
+    frame_data = {'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]}
     ray_df = pd.DataFrame(frame_data)
     pd_df = pandas.DataFrame(frame_data)
@@ -3385,11 +3505,9 @@
 
 def test_get_dummies():
-    frame_data = {'A': ['a', 'b', 'a'],
-                  'B': ['b', 'a', 'c'],
-                  'C': [1, 2, 3]}
+    frame_data = {'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 'C': [1, 2, 3]}
     ray_df = pd.DataFrame(frame_data)
     pd_df = pandas.DataFrame(frame_data)
 
-    assert ray_df_equals_pandas(pd.get_dummies(ray_df),
-                                pandas.get_dummies(pd_df))
+    assert ray_df_equals_pandas(
+        pd.get_dummies(ray_df), pandas.get_dummies(pd_df))
diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py
index 608d8f45c55..ed5653b60a3 100644
--- a/modin/pandas/test/test_groupby.py
+++ b/modin/pandas/test/test_groupby.py
@@ -7,9 +7,7 @@
 import pandas
 import numpy as np
 import modin.pandas as pd
-from modin.pandas.utils import (
-    from_pandas,
-    to_pandas)
+from modin.pandas.utils import (from_pandas, to_pandas)
 
 PY2 = False
 if sys.version_info.major < 3:
@@ -48,11 +46,13 @@ def ray_groupby_equals_pandas(ray_groupby, pandas_groupby):
 
 def test_simple_row_groupby():
-    pandas_df = pandas.DataFrame({'col1': [0, 1, 2, 3],
-                                  'col2': [4, 5, 6, 7],
-                                  'col3': [3, 8, 12, 10],
-                                  'col4': [17, 13, 16, 15],
-                                  'col5': [-4, -5, -6, -7]})
+    pandas_df = pandas.DataFrame({
+        'col1': [0, 1, 2, 3],
+        'col2': [4, 5, 6, 7],
+        'col3': [3, 8, 12, 10],
+        'col4': [17, 13, 16, 15],
+        'col5': [-4, -5, -6, -7]
+    })
 
     ray_df = from_pandas(pandas_df, 2)
@@ -125,11 +125,13 @@ def test_simple_row_groupby():
 
 def test_single_group_row_groupby():
-    pandas_df = pandas.DataFrame({'col1': [0, 1, 2, 3],
-                                  'col2': [4, 5, 36, 7],
-                                  'col3': [3, 8, 12, 10],
-                                  'col4': [17, 3, 16, 15],
-                                  'col5': [-4, 5, -6, -7]})
+    pandas_df = pandas.DataFrame({
+        'col1': [0, 1, 2, 3],
+        'col2': [4, 5, 36, 7],
+        'col3': [3, 8, 12, 10],
+        'col4': [17, 3, 16, 15],
+        'col5': [-4, 5, -6, -7]
+    })
 
     ray_df = from_pandas(pandas_df, 2)
@@ -203,8 +205,8 @@ def test_single_group_row_groupby():
 
 @pytest.mark.skip(reason="See Modin issue #21.")
 def test_large_row_groupby():
-    pandas_df = pandas.DataFrame(np.random.randint(0, 8, size=(100, 4)),
-                                 columns=list('ABCD'))
+    pandas_df = pandas.DataFrame(
+        np.random.randint(0, 8, size=(100, 4)), columns=list('ABCD'))
 
     ray_df = from_pandas(pandas_df, 2)
@@ -277,11 +279,13 @@ def test_large_row_groupby():
 
 def test_simple_col_groupby():
-    pandas_df = pandas.DataFrame({'col1': [0, 3, 2, 3],
-                                  'col2': [4, 1, 6, 7],
-                                  'col3': [3, 8, 2, 10],
-                                  'col4': [1, 13, 6, 15],
-                                  'col5': [-4, 5, 6, -7]})
+    pandas_df = pandas.DataFrame({
+        'col1': [0, 3, 2, 3],
+        'col2': [4, 1, 6, 7],
+        'col3': [3, 8, 2, 10],
+        'col4': [1, 13, 6, 15],
+        'col5': [-4, 5, 6, -7]
+    })
 
     ray_df = from_pandas(pandas_df, 2)
@@ -400,8 +404,8 @@ def test_ndim(ray_groupby, pandas_groupby):
 
 @pytest.fixture
 def test_cumsum(ray_groupby, pandas_groupby):
     ray_df_equals_pandas(ray_groupby.cumsum(), pandas_groupby.cumsum())
-    ray_df_equals_pandas(ray_groupby.cumsum(axis=1),
-                         pandas_groupby.cumsum(axis=1))
+    ray_df_equals_pandas(
+        ray_groupby.cumsum(axis=1), pandas_groupby.cumsum(axis=1))
 
 
 @pytest.fixture
@@ -413,8 +417,8 @@ def test_pct_change(ray_groupby, pandas_groupby):
 
 @pytest.fixture
 def test_cummax(ray_groupby, pandas_groupby):
     ray_df_equals_pandas(ray_groupby.cummax(), pandas_groupby.cummax())
-    ray_df_equals_pandas(ray_groupby.cummax(axis=1),
-                         pandas_groupby.cummax(axis=1))
+    ray_df_equals_pandas(
+        ray_groupby.cummax(axis=1), pandas_groupby.cummax(axis=1))
 
 
 @pytest.fixture
@@ -441,8 +445,8 @@ def test_backfill(ray_groupby, pandas_groupby):
 
 @pytest.fixture
 def test_cummin(ray_groupby, pandas_groupby):
     ray_df_equals_pandas(ray_groupby.cummin(), pandas_groupby.cummin())
-    ray_df_equals_pandas(ray_groupby.cummin(axis=1),
-                         pandas_groupby.cummin(axis=1))
+    ray_df_equals_pandas(
+        ray_groupby.cummin(axis=1), pandas_groupby.cummin(axis=1))
 
 
 @pytest.fixture
@@ -468,8 +472,8 @@ def test_std(ray_groupby, pandas_groupby):
 
 @pytest.fixture
 def test_aggregate(ray_groupby, pandas_groupby, func):
-    ray_df_equals_pandas(ray_groupby.aggregate(func),
-                         pandas_groupby.aggregate(func))
+    ray_df_equals_pandas(
+        ray_groupby.aggregate(func), pandas_groupby.aggregate(func))
 
 
 @pytest.fixture
@@ -538,8 +542,8 @@ def test_head(ray_groupby, pandas_groupby, n):
 
 @pytest.fixture
 def test_cumprod(ray_groupby, pandas_groupby):
     ray_df_equals_pandas(ray_groupby.cumprod(), pandas_groupby.cumprod())
-    ray_df_equals_pandas(ray_groupby.cumprod(axis=1),
-                         pandas_groupby.cumprod(axis=1))
+    ray_df_equals_pandas(
+        ray_groupby.cumprod(axis=1), pandas_groupby.cumprod(axis=1))
 
 
 @pytest.fixture
@@ -550,8 +554,8 @@ def test_cov(ray_groupby, pandas_groupby):
 
 @pytest.fixture
 def test_transform(ray_groupby, pandas_groupby, func):
-    ray_df_equals_pandas(ray_groupby.transform(func),
-                         pandas_groupby.transform(func))
+    ray_df_equals_pandas(
+        ray_groupby.transform(func), pandas_groupby.transform(func))
 
 
 @pytest.fixture
@@ -562,8 +566,9 @@ def test_corr(ray_groupby, pandas_groupby):
 
 @pytest.fixture
 def test_fillna(ray_groupby, pandas_groupby):
-    ray_df_equals_pandas(ray_groupby.fillna(method="ffill"),
-                         pandas_groupby.fillna(method="ffill"))
+    ray_df_equals_pandas(
+        ray_groupby.fillna(method="ffill"),
+        pandas_groupby.fillna(method="ffill"))
 
 
 @pytest.fixture
@@ -584,8 +589,8 @@ def test_tail(ray_groupby, pandas_groupby, n):
 
 @pytest.fixture
 def test_quantile(ray_groupby, pandas_groupby):
-    ray_df_equals_pandas(ray_groupby.quantile(q=0.4),
-                         pandas_groupby.quantile(q=0.4))
+    ray_df_equals_pandas(
+        ray_groupby.quantile(q=0.4), pandas_groupby.quantile(q=0.4))
 
 
 @pytest.fixture
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index d6e82fc9689..0def3731a3a 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -44,22 +44,26 @@ def setup_parquet_file(row_size, force=False):
 
 @pytest.fixture
 def create_test_ray_dataframe():
-    df = pd.DataFrame({'col1': [0, 1, 2, 3],
-                       'col2': [4, 5, 6, 7],
-                       'col3': [8, 9, 10, 11],
-                       'col4': [12, 13, 14, 15],
-                       'col5': [0, 0, 0, 0]})
+    df = pd.DataFrame({
+        'col1': [0, 1, 2, 3],
+        'col2': [4, 5, 6, 7],
+        'col3': [8, 9, 10, 11],
+        'col4': [12, 13, 14, 15],
+        'col5': [0, 0, 0, 0]
+    })
 
     return df
 
 
 @pytest.fixture
 def create_test_pandas_dataframe():
-    df = pandas.DataFrame({'col1': [0, 1, 2, 3],
-                           'col2': [4, 5, 6, 7],
-                           'col3': [8, 9, 10, 11],
-                           'col4': [12, 13, 14, 15],
-                           'col5': [0, 0, 0, 0]})
+    df = pandas.DataFrame({
+        'col1': [0, 1, 2, 3],
+        'col2': [4, 5, 6, 7],
+        'col3': [8, 9, 10, 11],
+        'col4': [12, 13, 14, 15],
+        'col5': [0, 0, 0, 0]
+    })
 
     return df
@@ -264,11 +268,13 @@ def setup_sql_file(conn, force=False):
     if os.path.exists(TEST_SQL_FILENAME) and not force:
         pass
     else:
-        df = pandas.DataFrame({'col1': [0, 1, 2, 3],
-                               'col2': [4, 5, 6, 7],
-                               'col3': [8, 9, 10, 11],
-                               'col4': [12, 13, 14, 15],
-                               'col5': [0, 0, 0, 0]})
+        df = pandas.DataFrame({
+            'col1': [0, 1, 2, 3],
+            'col2': [4, 5, 6, 7],
+            'col3': [8, 9, 10, 11],
+            'col4': [12, 13, 14, 15],
+            'col5': [0, 0, 0, 0]
+        })
         df.to_sql(TEST_SQL_FILENAME.split(".")[0], conn)
@@ -480,7 +486,7 @@ def test_to_clipboard():
     pandas_df.to_clipboard()
     pandas_as_clip = pandas.read_clipboard()
 
-    assert(ray_as_clip.equals(pandas_as_clip))
+    assert (ray_as_clip.equals(pandas_as_clip))
 
 
 def test_to_csv():
@@ -493,8 +499,7 @@ def test_to_csv():
     ray_df.to_csv(TEST_CSV_DF_FILENAME)
     pandas_df.to_csv(TEST_CSV_pandas_FILENAME)
 
-    assert(test_files_eq(TEST_CSV_DF_FILENAME,
-                         TEST_CSV_pandas_FILENAME))
+    assert (test_files_eq(TEST_CSV_DF_FILENAME, TEST_CSV_pandas_FILENAME))
 
     teardown_test_file(TEST_CSV_pandas_FILENAME)
     teardown_test_file(TEST_CSV_DF_FILENAME)
@@ -530,8 +535,7 @@ def test_to_excel():
     ray_writer.save()
     pandas_writer.save()
 
-    assert(test_files_eq(TEST_EXCEL_DF_FILENAME,
-                         TEST_EXCEL_pandas_FILENAME))
+    assert (test_files_eq(TEST_EXCEL_DF_FILENAME, TEST_EXCEL_pandas_FILENAME))
 
     teardown_test_file(TEST_EXCEL_DF_FILENAME)
     teardown_test_file(TEST_EXCEL_pandas_FILENAME)
@@ -547,8 +551,8 @@ def test_to_feather():
     ray_df.to_feather(TEST_FEATHER_DF_FILENAME)
     pandas_df.to_feather(TEST_FEATHER_pandas_FILENAME)
 
-    assert(test_files_eq(TEST_FEATHER_DF_FILENAME,
-                         TEST_FEATHER_pandas_FILENAME))
+    assert (test_files_eq(TEST_FEATHER_DF_FILENAME,
+                          TEST_FEATHER_pandas_FILENAME))
 
     teardown_test_file(TEST_FEATHER_pandas_FILENAME)
     teardown_test_file(TEST_FEATHER_DF_FILENAME)
@@ -572,8 +576,7 @@ def test_to_html():
     ray_df.to_html(TEST_HTML_DF_FILENAME)
     pandas_df.to_html(TEST_HTML_pandas_FILENAME)
 
-    assert(test_files_eq(TEST_HTML_DF_FILENAME,
-                         TEST_HTML_pandas_FILENAME))
+    assert (test_files_eq(TEST_HTML_DF_FILENAME, TEST_HTML_pandas_FILENAME))
 
     teardown_test_file(TEST_HTML_pandas_FILENAME)
     teardown_test_file(TEST_HTML_DF_FILENAME)
@@ -589,8 +592,7 @@ def test_to_json():
     ray_df.to_json(TEST_JSON_DF_FILENAME)
     pandas_df.to_json(TEST_JSON_pandas_FILENAME)
 
-    assert(test_files_eq(TEST_JSON_DF_FILENAME,
-                         TEST_JSON_pandas_FILENAME))
+    assert (test_files_eq(TEST_JSON_DF_FILENAME, TEST_JSON_pandas_FILENAME))
 
     teardown_test_file(TEST_JSON_pandas_FILENAME)
     teardown_test_file(TEST_JSON_DF_FILENAME)
@@ -613,8 +615,8 @@ def test_to_msgpack():
     ray_df.to_msgpack(TEST_MSGPACK_DF_FILENAME)
     pandas_df.to_msgpack(TEST_MSGPACK_pandas_FILENAME)
 
-    assert(test_files_eq(TEST_MSGPACK_DF_FILENAME,
-                         TEST_MSGPACK_pandas_FILENAME))
+    assert (test_files_eq(TEST_MSGPACK_DF_FILENAME,
+                          TEST_MSGPACK_pandas_FILENAME))
 
     teardown_test_file(TEST_MSGPACK_pandas_FILENAME)
     teardown_test_file(TEST_MSGPACK_DF_FILENAME)
@@ -637,8 +639,8 @@ def test_to_parquet():
     ray_df.to_parquet(TEST_PARQUET_DF_FILENAME)
     pandas_df.to_parquet(TEST_PARQUET_pandas_FILENAME)
 
-    assert(test_files_eq(TEST_PARQUET_DF_FILENAME,
-                         TEST_PARQUET_pandas_FILENAME))
+    assert (test_files_eq(TEST_PARQUET_DF_FILENAME,
+                          TEST_PARQUET_pandas_FILENAME))
 
     teardown_test_file(TEST_PARQUET_pandas_FILENAME)
     teardown_test_file(TEST_PARQUET_DF_FILENAME)
@@ -661,8 +663,8 @@ def test_to_pickle():
     ray_df.to_pickle(TEST_PICKLE_DF_FILENAME)
     pandas_df.to_pickle(TEST_PICKLE_pandas_FILENAME)
 
-    assert(test_files_eq(TEST_PICKLE_DF_FILENAME,
-                         TEST_PICKLE_pandas_FILENAME))
+    assert (test_files_eq(TEST_PICKLE_DF_FILENAME,
+                          TEST_PICKLE_pandas_FILENAME))
 
     teardown_test_file(TEST_PICKLE_pandas_FILENAME)
     teardown_test_file(TEST_PICKLE_DF_FILENAME)
@@ -678,8 +680,7 @@ def test_to_sql():
     ray_df.to_pickle(TEST_SQL_DF_FILENAME)
     pandas_df.to_pickle(TEST_SQL_pandas_FILENAME)
 
-    assert(test_files_eq(TEST_SQL_DF_FILENAME,
-                         TEST_SQL_pandas_FILENAME))
+    assert (test_files_eq(TEST_SQL_DF_FILENAME, TEST_SQL_pandas_FILENAME))
 
     teardown_test_file(TEST_SQL_DF_FILENAME)
     teardown_test_file(TEST_SQL_pandas_FILENAME)
@@ -695,8 +696,7 @@ def test_to_stata():
     ray_df.to_stata(TEST_STATA_DF_FILENAME)
     pandas_df.to_stata(TEST_STATA_pandas_FILENAME)
 
-    assert(test_files_eq(TEST_STATA_DF_FILENAME,
-                         TEST_STATA_pandas_FILENAME))
+    assert (test_files_eq(TEST_STATA_DF_FILENAME, TEST_STATA_pandas_FILENAME))
 
     teardown_test_file(TEST_STATA_pandas_FILENAME)
     teardown_test_file(TEST_STATA_DF_FILENAME)
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index 4cc4eaab145..1178056f9c3 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -2003,8 +2003,8 @@ def test_to_csv():
     ray_series = create_test_series()
 
     with pytest.raises(NotImplementedError):
-        ray_series.to_csv(None, None, None, None, None, None, None, None,
-                          None, None)
+        ray_series.to_csv(None, None, None, None, None, None, None, None, None,
+                          None)
 
 
 @pytest.mark.skip(reason="Using pandas Series.")
diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py
index f25361f8117..38754687dbb 100644
--- a/modin/pandas/utils.py
+++ b/modin/pandas/utils.py
@@ -12,7 +12,6 @@
 
 from . import get_npartitions
 
-
 _NAN_BLOCKS = {}
 _MEMOIZER_CAPACITY = 1000  # Capacity per function
@@ -123,6 +122,7 @@ def memory_hungry_op():
     speed and memory. If the task takes more than 500ms to run, we will do the
     GC.
     """
+
     def wrapped(*args):
         start_time = time.time()
@@ -134,6 +134,7 @@ def wrapped(*args):
             gc.collect()
 
         return result
+
     return wrapped
@@ -245,9 +246,8 @@ def from_pandas(df, num_partitions=None, chunksize=None):
     row_partitions = \
         _partition_pandas_dataframe(df, num_partitions, chunksize)
 
-    return DataFrame(row_partitions=row_partitions,
-                     columns=df.columns,
-                     index=df.index)
+    return DataFrame(
+        row_partitions=row_partitions, columns=df.columns, index=df.index)
 
 
 def to_pandas(df):
@@ -304,16 +304,20 @@ def _map_partitions(func, partitions, *argslists):
     if partitions is None:
         return None
 
-    assert(callable(func))
+    assert (callable(func))
     if len(argslists) == 0:
         return [_deploy_func.remote(func, part) for part in partitions]
     elif len(argslists) == 1:
-        return [_deploy_func.remote(func, part, argslists[0])
-                for part in partitions]
+        return [
+            _deploy_func.remote(func, part, argslists[0])
+            for part in partitions
+        ]
     else:
-        assert(all(len(args) == len(partitions) for args in argslists))
-        return [_deploy_func.remote(func, *args)
-                for args in zip(partitions, *argslists)]
+        assert (all(len(args) == len(partitions) for args in argslists))
+        return [
+            _deploy_func.remote(func, *args)
+            for args in zip(partitions, *argslists)
+        ]
 
 
 def _create_block_partitions(partitions, axis=0, length=None):
@@ -325,9 +329,11 @@ def _create_block_partitions(partitions, axis=0, length=None):
     else:
         npartitions = get_npartitions()
 
-    x = [create_blocks._submit(args=(partition, npartitions, axis),
-                               num_return_vals=npartitions)
-         for partition in partitions]
+    x = [
+        create_blocks._submit(
+            args=(partition, npartitions, axis), num_return_vals=npartitions)
+        for partition in partitions
+    ]
 
     # In the case that axis is 1 we have to transpose because we build the
    # columns into rows. Fortunately numpy is efficient at this.
@@ -352,10 +358,11 @@ def _create_blocks_helper(df, npartitions, axis):
     #     if not isinstance(df.columns, pandas.RangeIndex):
     #         df.columns = pandas.RangeIndex(0, len(df.columns))
 
-    blocks = [df.iloc[:, i * block_size: (i + 1) * block_size]
-              if axis == 0
-              else df.iloc[i * block_size: (i + 1) * block_size, :]
-              for i in range(npartitions)]
+    blocks = [
+        df.iloc[:, i * block_size:(i + 1) * block_size]
+        if axis == 0 else df.iloc[i * block_size:(i + 1) * block_size, :]
+        for i in range(npartitions)
+    ]
 
     for block in blocks:
         block.columns = pandas.RangeIndex(0, len(block.columns))
@@ -378,6 +385,7 @@ def _inherit_docstrings(parent, excluded=[]):
         function: decorator which replaces the decorated class' documentation
             parent's documentation.
""" + def decorator(cls): if parent not in excluded: cls.__doc__ = parent.__doc__ @@ -448,8 +456,8 @@ def writer(df_chunk, row_loc, col_loc, item): @ray.remote def _build_col_widths(df_col): """Compute widths (# of columns) for each partition.""" - widths = np.array(ray.get([_deploy_func.remote(_get_widths, d) - for d in df_col])) + widths = np.array( + ray.get([_deploy_func.remote(_get_widths, d) for d in df_col])) return widths @@ -457,8 +465,8 @@ def _build_col_widths(df_col): @ray.remote def _build_row_lengths(df_row): """Compute lengths (# of rows) for each partition.""" - lengths = np.array(ray.get([_deploy_func.remote(_get_lengths, d) - for d in df_row])) + lengths = np.array( + ray.get([_deploy_func.remote(_get_lengths, d) for d in df_row])) return lengths @@ -469,8 +477,10 @@ def _build_coord_df(lengths, index): filtered_lengths = [x for x in lengths if x > 0] coords = None if len(filtered_lengths) > 0: - coords = np.vstack([np.column_stack((np.full(l, i), np.arange(l))) - for i, l in enumerate(filtered_lengths)]) + coords = np.vstack([ + np.column_stack((np.full(l, i), np.arange(l))) + for i, l in enumerate(filtered_lengths) + ]) col_names = ("partition", "index_within_partition") return pandas.DataFrame(coords, index=index, columns=col_names)