From d61c478a75b6f1041d51cc93f42773bfa51dc327 Mon Sep 17 00:00:00 2001 From: Mohammed Kashif Date: Sun, 15 Nov 2020 04:41:48 +0530 Subject: [PATCH] DOCS-#2420: Changed documentation to numpydoc style Signed-off-by: Mohammed Kashif --- .github/workflows/ci.yml | 1 + modin/engines/base/frame/axis_partition.py | 57 +++-- modin/engines/base/frame/data.py | 220 +++++++++++------- modin/engines/base/frame/partition.py | 46 ++-- modin/engines/base/frame/partition_manager.py | 109 ++++++--- 5 files changed, 278 insertions(+), 155 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f0de02b29d3..062f32fe4b7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,6 +51,7 @@ jobs: - run: pydocstyle --convention=numpy --add-ignore=D101,D102 modin/pandas/series_utils.py - run: pydocstyle --convention=numpy --add-ignore=D103 modin/pandas/general.py - run: pydocstyle --convention=numpy modin/pandas/plotting.py modin/pandas/utils.py modin/pandas/iterator.py modin/pandas/indexing.py + - run: pydocstyle --convention=numpy --add-ignore=D100,D104 modin/engines/base/frame lint-flake8: name: lint (flake8) diff --git a/modin/engines/base/frame/axis_partition.py b/modin/engines/base/frame/axis_partition.py index 765ac55777a..cf3c9bfa511 100644 --- a/modin/engines/base/frame/axis_partition.py +++ b/modin/engines/base/frame/axis_partition.py @@ -18,9 +18,9 @@ class BaseFrameAxisPartition(object): # pragma: no cover - """This abstract class represents the Parent class for any - `ColumnPartition` or `RowPartition` class. This class is intended to - simplify the way that operations are performed + """An abstract class that represents the Parent class for any `ColumnPartition` or `RowPartition` class. + + This class is intended to simplify the way that operations are performed. Note 0: The procedures that use this class and its methods assume that they have some global knowledge about the entire axis. 
This may @@ -46,7 +46,7 @@ def apply( maintain_partitioning=True, **kwargs, ): - """Applies a function to a full axis. + """Apply a function to a full axis. Note: The procedures that invoke this method assume full axis knowledge. Implement this method accordingly. @@ -69,7 +69,8 @@ def apply( orientation (the lengths will remain the same). This is ignored between two axis partitions. - Returns: + Returns + ------- A list of `BaseFramePartition` objects. """ raise NotImplementedError(NOT_IMPLMENTED_MESSAGE) @@ -81,7 +82,8 @@ def shuffle(self, func, lengths, **kwargs): func: The function to apply before splitting. lengths: The list of partition lengths to split the result into. - Returns: + Returns + ------- A list of RemotePartition objects split by `lengths`. """ raise NotImplementedError(NOT_IMPLMENTED_MESSAGE) @@ -95,9 +97,9 @@ def _wrap_partitions(self, partitions): class PandasFrameAxisPartition(BaseFrameAxisPartition): - """This abstract class is created to simplify and consolidate the code for - AxisPartitions that run pandas. Because much of the code is similar, this allows - us to reuse this code. + """An abstract class is created to simplify and consolidate the code for AxisPartitions that run pandas. + + Because much of the code is similar, this allows us to reuse this code. Subclasses must implement `list_of_blocks` which unwraps the `RemotePartition` objects and creates something interpretable as a pandas DataFrame. @@ -115,23 +117,28 @@ def apply( maintain_partitioning=True, **kwargs, ): - """Applies func to the object in the plasma store. + """Apply func to the object in the plasma store. See notes in Parent class about this method. - Args: - func: The function to apply. - num_splits: The number of times to split the result object. - other_axis_partition: Another `PandasOnRayFrameAxisPartition` object to apply to - func with this one. - maintain_partitioning: Whether or not to keep the partitioning in the same - orientation as it was previously. 
This is important because we may be - operating on an individual AxisPartition and not touching the rest. - In this case, we have to return the partitioning to its previous - orientation (the lengths will remain the same). This is ignored between - two axis partitions. + Parameters + ---------- + func: callable + The function to apply. + num_splits: int + The number of times to split the result object. + other_axis_partition: PandasOnRayFrameAxisPartition object + Another `PandasOnRayFrameAxisPartition` object to apply to func with this one. + maintain_partitioning: boolean + Whether or not to keep the partitioning in the same + orientation as it was previously. This is important because we may be + operating on an individual AxisPartition and not touching the rest. + In this case, we have to return the partitioning to its previous + orientation (the lengths will remain the same). This is ignored between + two axis partitions. - Returns: + Returns + ------- A list of `RayRemotePartition` objects. """ if num_splits is None: @@ -177,7 +184,8 @@ def shuffle(self, func, lengths, **kwargs): func: The function to apply before splitting. lengths: The list of partition lengths to split the result into. - Returns: + Returns + ------- A list of RemotePartition objects split by `lengths`. """ num_splits = len(lengths) @@ -204,7 +212,8 @@ def deploy_axis_func( If False, create a new partition layout. partitions: All partitions that make up the full axis (row or column) - Returns: + Returns + ------- A list of Pandas DataFrames. """ # Pop these off first because they aren't expected by the function. diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index 3d57cd6b754..9bae5a85c92 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -25,13 +25,17 @@ class BasePandasFrame(object): + """An abstract class that represents the Parent class for any Pandas DataFrame class. 
+ + This class is intended to simplify the way that operations are performed + """ _frame_mgr_cls = None _query_compiler_cls = PandasQueryCompiler @property def __constructor__(self): - """The constructor for this object. A convenience method""" + """Create a new instance of this object.""" return type(self) def __init__( @@ -87,7 +91,8 @@ def __init__( def _row_lengths(self): """Compute the row lengths if they are not cached. - Returns: + Returns + ------- A list of row lengths. """ if self._row_lengths_cache is None: @@ -103,7 +108,8 @@ def _row_lengths(self): def _column_widths(self): """Compute the column widths if they are not cached. - Returns: + Returns + ------- A list of column widths. """ if self._column_widths_cache is None: @@ -115,14 +121,15 @@ def _column_widths(self): @property def _axes_lengths(self): - """The row lengths, column widths that can be accessed with an `axis` integer.""" + """Row lengths, column widths that can be accessed with an `axis` integer.""" return [self._row_lengths, self._column_widths] @property def dtypes(self): """Compute the data types if they are not cached. - Returns: + Returns + ------- A pandas Series containing the data types for this dataframe. """ if self._dtypes is None: @@ -132,7 +139,8 @@ def dtypes(self): def _compute_dtypes(self): """Compute the dtypes via MapReduce. - Returns: + Returns + ------- The data types of this dataframe. """ @@ -154,13 +162,17 @@ def dtype_builder(df): _columns_cache = None def _validate_set_axis(self, new_labels, old_labels): - """Validates the index or columns replacement against the old labels. + """Validate the index or columns replacement against the old labels. - Args: - new_labels: The labels to replace with. - old_labels: The labels to replace. + Parameters + ---------- + new_labels: list-like + The labels to replace with. + old_labels: list-like + The labels to replace. - Returns: + Returns + ------- The validated labels. 
""" new_labels = ensure_index(new_labels) @@ -174,26 +186,30 @@ def _validate_set_axis(self, new_labels, old_labels): return new_labels def _get_index(self): - """Gets the index from the cache object. + """Get the index from the cache object. - Returns: + Returns + ------- A pandas.Index object containing the row labels. """ return self._index_cache def _get_columns(self): - """Gets the columns from the cache object. + """Get the columns from the cache object. - Returns: + Returns + ------- A pandas.Index object containing the column labels. """ return self._columns_cache def _set_index(self, new_index): - """Replaces the current row labels with new labels. + """Replace the current row labels with new labels. - Args: - new_index: The replacement row labels. + Parameters + ---------- + new_index: list-like + The replacement row labels. """ if self._index_cache is None: self._index_cache = ensure_index(new_index) @@ -203,10 +219,12 @@ def _set_index(self, new_index): self._apply_index_objs(axis=0) def _set_columns(self, new_columns): - """Replaces the current column labels with new labels. + """Replace the current column labels with new labels. - Args: - new_columns: The replacement column labels. + Parameters + ---------- + new_columns: list-like + The replacement column labels. """ if self._columns_cache is None: self._columns_cache = ensure_index(new_columns) @@ -218,7 +236,7 @@ def _set_columns(self, new_columns): self._apply_index_objs(axis=1) def _set_axis(self, axis, new_axis, cache_only=False): - """Replaces the current labels at the specified axis with the new one + """Replace the current labels at the specified axis with the new one. 
Parameters ---------- @@ -246,12 +264,12 @@ def _set_axis(self, axis, new_axis, cache_only=False): @property def axes(self): - """The index, columns that can be accessed with an `axis` integer.""" + """Index, columns that can be accessed with an `axis` integer.""" return [self.index, self.columns] def _compute_axis_labels(self, axis: int, partitions=None): """ - Computes labels for specific `axis` + Compute the labels for specific `axis`. Parameters ---------- @@ -273,7 +291,7 @@ def _compute_axis_labels(self, axis: int, partitions=None): ) def _filter_empties(self): - """Removes empty partitions to avoid triggering excess computation.""" + """Remove empty partitions to avoid triggering excess computation.""" if len(self.axes[0]) == 0 or len(self.axes[1]) == 0: # This is the case for an empty frame. We don't want to completely remove # all metadata and partitions so for the moment, we won't prune if the frame @@ -296,7 +314,7 @@ def _filter_empties(self): def _validate_axis_equality(self, axis: int, force: bool = False): """ - Validates internal and external indices of modin_frame at the specified axis. + Validate internal and external indices of modin_frame at the specified axis. Parameters ---------- @@ -329,8 +347,9 @@ def _validate_axis_equality(self, axis: int, force: bool = False): def _validate_internal_indices(self, mode=None, **kwargs): """ - Validates and optionally updates internal and external indices - of modin_frame in specified mode. There is 4 modes supported: + Validate and optionally updates internal and external indices of modin_frame in specified mode. + + There are 4 modes supported: 1. "reduced" - validates on that axis where external indices is ["__reduced__"] for not force 2. "reduced+other" - validates on axis where external @@ -394,7 +413,8 @@ def _apply_index_objs(self, axis=None): Args: axis: The axis to apply to, None applies to both axes. 
- Returns: + Returns + ------- A new 2D array of partitions that have the index assignment added to the call queue. """ @@ -680,7 +700,8 @@ def reorder_labels(self, row_numeric_idx=None, col_numeric_idx=None): def copy(self): """Copy this object. - Returns: + Returns + ------- A copied version of this object. """ return self.__constructor__( @@ -694,13 +715,14 @@ def copy(self): @classmethod def combine_dtypes(cls, list_of_dtypes, column_names): - """Describes how data types should be combined when they do not match. + """Describe how data types should be combined when they do not match. Args: list_of_dtypes: A list of pandas Series with the data types. column_names: The names of the columns that the data types map to. - Returns: + Returns + ------- A pandas Series containing the finalized data types. """ # Compute dtypes by getting collecting and combining all of the partitions. The @@ -716,13 +738,14 @@ def combine_dtypes(cls, list_of_dtypes, column_names): return dtypes def astype(self, col_dtypes): - """Converts columns dtypes to given dtypes. + """Convert the columns dtypes to given dtypes. Args: col_dtypes: Dictionary of {col: dtype,...} where col is the column name and dtype is a numpy dtype. - Returns: + Returns + ------- dataframe with updated dtypes. """ columns = col_dtypes.keys() @@ -774,7 +797,8 @@ def add_prefix(self, prefix, axis): prefix: The prefix to add. axis: The axis to update. - Returns: + Returns + ------- A new dataframe with the updated labels. """ new_labels = self.axes[axis].map(lambda x: str(prefix) + str(x)) @@ -792,7 +816,8 @@ def add_suffix(self, suffix, axis): suffix: The suffix to add. axis: The axis to update. - Returns: + Returns + ------- A new dataframe with the updated labels. 
""" new_labels = self.axes[axis].map(lambda x: str(x) + str(suffix)) @@ -806,9 +831,10 @@ def add_suffix(self, suffix, axis): # END Metadata modification methods def _numeric_columns(self, include_bool=True): - """Returns the numeric columns of the Manager. + """Return the numeric columns of the Manager. - Returns: + Returns + ------- List of index names. """ columns = [] @@ -945,7 +971,7 @@ def internal(block_idx, global_index): def _join_index_objects(self, axis, other_index, how, sort): """ - Joins a pair of index objects (columns or rows) by a given strategy. + Join the pair of index objects (columns or rows) by a given strategy. Unlike Index.join() in Pandas, if axis is 1, the sort is False, and how is "outer", the result will _not_ be sorted. @@ -994,11 +1020,15 @@ def _build_mapreduce_func(self, axis, func): Note: This should be used for any MapReduce style operation that results in a reduced data dimensionality (dataframe -> series). - Args: - axis: The axis along which to apply the function. - func: The function to apply. + Parameters + ---------- + axis: int + The axis along which to apply the function. + func: callable + The function to apply. - Returns: + Returns + ------- A function to be shipped to the partitions to be executed. """ @@ -1020,7 +1050,7 @@ def _map_reduce_func(df, *args, **kwargs): def _compute_map_reduce_metadata(self, axis, new_parts, preserve_index=True): """ - Computes metadata for the result of reduce function. + Compute the metadata for the result of reduce function. Parameters ---------- @@ -1126,7 +1156,7 @@ def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True): def _map(self, func, dtypes=None, validate_index=False, validate_columns=False): """Perform a function that maps across the entire dataset. - Pamareters + Parameters ---------- func : callable The function to apply. 
@@ -1136,6 +1166,7 @@ def _map(self, func, dtypes=None, validate_index=False, validate_columns=False): type, and allows us to avoid (re)computing it. validate_index : bool, (default False) Is index validation required after performing `func` on partitions. + Returns ------- A new dataframe. @@ -1175,11 +1206,15 @@ def _fold(self, axis, func): Note: The data shape is not changed (length and width of the table). - Args: - axis: The axis to apply over. - func: The function to apply. + Parameters + ---------- + axis: int + The axis to apply over. + func: callable + The function to apply. - Returns: + Returns + ------- A new dataframe. """ new_partitions = self._frame_mgr_cls.map_axis_partitions( @@ -1196,12 +1231,16 @@ def _fold(self, axis, func): def filter_full_axis(self, axis, func): """Filter data based on the function provided along an entire axis. - Args: - axis: The axis to filter over. - func: The function to use for the filter. This function should filter the + Parameters + ---------- + axis: int + The axis to filter over. + func: callable + The function to use for the filter. This function should filter the data itself. - Returns: + Returns + ------- A new dataframe. """ new_partitions = self._frame_mgr_cls.map_axis_partitions( @@ -1280,18 +1319,27 @@ def _apply_full_axis_select_indices( ): """Apply a function across an entire axis for a subset of the data. - Args: - axis: The axis to apply over. - func: The function to apply - apply_indices: The labels to apply over. - numeric_indices: The indices to apply over. - new_index: (optional) The index of the result. We may know this in advance, + Parameters + ---------- + axis: int + The axis to apply over. + func: callable + The function to apply + apply_indices: list-like + The labels to apply over. + numeric_indices: list-like + The indices to apply over. + new_index: list-like (optional) + The index of the result. We may know this in advance, and if not provided it must be computed. 
- new_columns: (optional) The columns of the result. We may know this in + new_columns: list-like (optional) + The columns of the result. We may know this in advance, and if not provided it must be computed. - keep_remaining: Whether or not to drop the data that is not computed over. + keep_remaining: boolean + Whether or not to drop the data that is not computed over. - Returns: + Returns + ------- A new dataframe. """ assert apply_indices is not None or numeric_indices is not None @@ -1332,7 +1380,8 @@ def _apply_select_indices( ): """Apply a function for a subset of the data. - Args: + Parameters + ---------- axis: The axis to apply over. func: The function to apply apply_indices: (optional) The labels to apply over. Must be given if axis is @@ -1349,7 +1398,8 @@ def _apply_select_indices( item_to_distribute: (optional) The item to split up so it can be applied over both axes. - Returns: + Returns + ------- A new dataframe. """ # TODO Infer columns and index from `keep_remaining` and `apply_indices` @@ -1458,7 +1508,7 @@ def broadcast_apply( def _prepare_frame_to_broadcast(self, axis, indices, broadcast_all): """ - Computes indices to broadcast `self` with considering of `indices` + Compute the indices to broadcast `self` with considering of `indices`. Parameters ---------- @@ -1508,8 +1558,7 @@ def broadcast_apply_select_indices( new_columns=None, ): """ - Applyies `func` to select indices at specified axis and broadcasts - partitions of `other` frame. + Apply `func` to select indices at specified axis and broadcasts partitions of `other` frame. Parameters ---------- @@ -1811,13 +1860,19 @@ def _binary_op(self, op, right_frame, join_type="outer"): def _concat(self, axis, others, how, sort): """Concatenate this dataframe with one or more others. - Args: - axis: The axis to concatenate over. - others: The list of dataframes to concatenate with. - how: The type of join to use for the axis. - sort: Whether or not to sort the result. 
+ Parameters + ---------- + axis: int + The axis to concatenate over. + others: List of dataframes + The list of dataframes to concatenate with. + how: str + The type of join to use for the axis. + sort: boolean + Whether or not to sort the result. - Returns: + Returns + ------- A new dataframe. """ # Fast path for equivalent columns and partitioning @@ -1883,7 +1938,8 @@ def groupby_reduce( new_columns: (optional) The columns of the result. We may know this in advance, and if not provided it must be computed. - Returns: + Returns + ------- A new dataframe. """ new_partitions = self._frame_mgr_cls.groupby_reduce( @@ -1902,10 +1958,12 @@ def groupby_reduce( def from_pandas(cls, df): """Improve simple Pandas DataFrame to an advanced and superior Modin DataFrame. - Args: + Parameters + ---------- df: Pandas DataFrame object. - Returns: + Returns + ------- A new dataframe. """ new_index = df.index @@ -1961,9 +2019,10 @@ def _arrow_type_to_dtype(cls, arrow_type): return res def to_pandas(self): - """Converts Modin DataFrame to Pandas DataFrame. + """Convert a Modin DataFrame to Pandas DataFrame. - Returns: + Returns + ------- Pandas DataFrame. """ df = self._frame_mgr_cls.to_pandas(self._partitions) @@ -1985,7 +2044,7 @@ def to_pandas(self): def to_numpy(self, **kwargs): """ - Converts Modin DataFrame to a 2D NumPy array. + Convert a Modin DataFrame to a 2D NumPy array. Returns ------- @@ -1996,7 +2055,8 @@ def to_numpy(self, **kwargs): def transpose(self): """Transpose the index and columns of this dataframe. - Returns: + Returns + ------- A new dataframe. 
""" new_partitions = self._frame_mgr_cls.lazy_map_partitions( diff --git a/modin/engines/base/frame/partition.py b/modin/engines/base/frame/partition.py index 8854b346e77..6a3c9a49d8e 100644 --- a/modin/engines/base/frame/partition.py +++ b/modin/engines/base/frame/partition.py @@ -15,7 +15,8 @@ class BaseFramePartition(object): # pragma: no cover - """This abstract class holds the data and metadata for a single partition. + """An abstract class that holds the data and metadata for a single partition. + The methods required for implementing this abstract class are listed in the section immediately following this. @@ -36,7 +37,8 @@ def get(self): E.g. if you assign `x = BaseFramePartition.put(1)`, `x.get()` should always return 1. - Returns: + Returns + ------- The object that was `put`. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @@ -51,7 +53,8 @@ def apply(self, func, **kwargs): Args: func: The lambda to apply (may already be correctly formatted) - Returns: + Returns + ------- A new `BaseFramePartition` containing the object that has had `func` applied to it. """ @@ -74,7 +77,8 @@ def to_pandas(self): Note: If the underlying object is a Pandas DataFrame, this will likely only need to call `get` - Returns: + Returns + ------- A Pandas DataFrame. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @@ -85,7 +89,8 @@ def to_numpy(self, **kwargs): Note: If the underlying object is a Pandas DataFrame, this will return a 2D NumPy array. - Returns: + Returns + ------- A NumPy array. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @@ -97,19 +102,22 @@ def mask(self, row_indices, col_indices): row_indices: The indices for the rows to extract. col_indices: The indices for the columns to extract. - Returns: + Returns + ------- A `BaseFramePartition` object. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @classmethod def put(cls, obj): - """A factory classmethod to format a given object. + """Format a given object. 
- Args: + Parameters + ---------- obj: An object. - Returns: + Returns + ------- A `BaseFramePartition` object. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @@ -126,25 +134,28 @@ def preprocess_func(cls, func): Args: func: The function to preprocess. - Returns: + Returns + ------- An object that can be accepted by `apply`. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @classmethod def length_extraction_fn(cls): - """The function to compute the length of the object in this partition. + """Return the function that computes the length of the object in this partition. - Returns: + Returns + ------- A callable function. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @classmethod def width_extraction_fn(cls): - """The function to compute the width of the object in this partition. + """Return the function that computes the width of the object in this partition. - Returns: + Returns + ------- A callable function. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @@ -153,6 +164,7 @@ def width_extraction_fn(cls): _width_cache = None def length(self): + """Return the length of the partition.""" if self._length_cache is None: cls = type(self) func = cls.length_extraction_fn() @@ -161,6 +173,7 @@ def length(self): return self._length_cache def width(self): + """Return the width of the partition.""" if self._width_cache is None: cls = type(self) func = cls.width_extraction_fn() @@ -170,9 +183,10 @@ def width(self): @classmethod def empty(cls): - """Create an empty partition + """Create an empty partition. - Returns; + Returns + ------- An empty partition """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) diff --git a/modin/engines/base/frame/partition_manager.py b/modin/engines/base/frame/partition_manager.py index 5f23ffd98a0..b917ccedb13 100644 --- a/modin/engines/base/frame/partition_manager.py +++ b/modin/engines/base/frame/partition_manager.py @@ -20,8 +20,11 @@ class BaseFrameManager(object): - # Partition class is the class to use for storing each partition. 
It must - # extend the `BaseFramePartition` class. + """Base class for managing the dataframe data layout and operators. + + `_partition_class` is the class to use for storing each partition. It must extend the `BaseFramePartition` class. + """ + _partition_class = None # Column partitions class is the class to use to create the column partitions. _column_partitions_class = None @@ -43,6 +46,7 @@ def preprocess_func(cls, map_func): map_func: The function to be preprocessed. Returns + ------- The preprocessed version of the `map_func` provided. Note: This does not require any specific format, only that the `BaseFramePartition.apply` method will recognize it (For the subclass @@ -54,28 +58,33 @@ def preprocess_func(cls, map_func): @classmethod def column_partitions(cls, partitions): - """A list of `BaseFrameAxisPartition` objects. + """List of `BaseFrameAxisPartition` objects. Note: Each value in this list will be an `BaseFrameAxisPartition` object. `BaseFrameAxisPartition` is located in `axis_partition.py`. - Returns a list of `BaseFrameAxisPartition` objects. + Returns + ------- + a list of `BaseFrameAxisPartition` objects. """ return [cls._column_partitions_class(col) for col in partitions.T] @classmethod def row_partitions(cls, partitions): - """A list of `BaseFrameAxisPartition` objects, represents column partitions. + """List of `BaseFrameAxisPartition` objects, represents row partitions. Note: Each value in this list will an `BaseFrameAxisPartition` object. `BaseFrameAxisPartition` is located in `axis_partition.py`. - Returns a list of `BaseFrameAxisPartition` objects. + Returns + ------- + a list of `BaseFrameAxisPartition` objects. 
""" return [cls._row_partition_class(row) for row in partitions] @classmethod def axis_partition(cls, partitions, axis): + """Logically partition along either the columns or the rows.""" return ( cls.column_partitions(partitions) if not axis @@ -84,6 +93,7 @@ def axis_partition(cls, partitions, axis): @classmethod def groupby_reduce(cls, axis, partitions, by, map_func, reduce_func): + """Groupby data using the map_func provided along the axis over the partitions then reduce using reduce_func.""" mapped_partitions = cls.broadcast_apply( axis, map_func, left=partitions, right=by, other_name="other" ) @@ -101,7 +111,7 @@ def broadcast_apply_select_indices( keep_remaining=False, ): """ - Broadcast the right partitions to left and apply a function to selected indices + Broadcast the right partitions to left and apply a function to selected indices. Note: Your internal function must take this kwargs: [`internal_indices`, `other`, `internal_other_indices`] to work correctly @@ -272,12 +282,15 @@ def broadcast_axis_partitions( @classmethod def map_partitions(cls, partitions, map_func): - """Applies `map_func` to every partition. + """Apply `map_func` to every partition. - Args: - map_func: The function to apply. + Parameters + ---------- + map_func: callable + The function to apply. - Returns: + Returns + ------- A new BaseFrameManager object, the type of object that called this. """ preprocessed_map_func = cls.preprocess_func(map_func) @@ -290,6 +303,18 @@ def map_partitions(cls, partitions, map_func): @classmethod def lazy_map_partitions(cls, partitions, map_func): + """ + Apply `map_func` to every partition lazily. + + Parameters + ---------- + map_func: callable + The function to apply. + + Returns + ------- + A new BaseFrameManager object, the type of object that called this. + """ preprocessed_map_func = cls.preprocess_func(map_func) return np.array( [ @@ -308,7 +333,7 @@ def map_axis_partitions( lengths=None, ): """ - Applies `map_func` to every partition. 
+ Apply `map_func` to every partition. Parameters ---------- @@ -345,7 +370,7 @@ def map_axis_partitions( @classmethod def simple_shuffle(cls, axis, partitions, map_func, lengths): """ - Shuffle data using `lengths` via `map_func` + Shuffle data using `lengths` via `map_func`. Parameters ---------- @@ -395,7 +420,8 @@ def concat(cls, axis, left_parts, right_parts): right_parts: the other blocks to be concatenated. This is a BaseFrameManager object. - Returns: + Returns + ------- A new BaseFrameManager object, the type of object that called this. """ if type(right_parts) is list: @@ -410,7 +436,7 @@ def concat(cls, axis, left_parts, right_parts): @classmethod def concatenate(cls, dfs): """ - Concatenate Pandas DataFrames with saving 'category' dtype + Concatenate Pandas DataFrames with saving 'category' dtype. Parameters ---------- @@ -435,7 +461,8 @@ def concatenate(cls, dfs): def to_pandas(cls, partitions): """Convert this object into a Pandas DataFrame from the partitions. - Returns: + Returns + ------- A Pandas DataFrame """ retrieved_objects = [[obj.to_pandas() for obj in part] for part in partitions] @@ -476,6 +503,7 @@ def to_numpy(cls, partitions, **kwargs): @classmethod def from_pandas(cls, df, return_dims=False): + """Return the partitions from Pandas DataFrame.""" num_splits = cls._compute_num_partitions() put_func = cls._partition_class.put row_chunksize, col_chunksize = compute_chunksize(df, num_splits) @@ -505,11 +533,12 @@ def from_pandas(cls, df, return_dims=False): @classmethod def from_arrow(cls, at, return_dims=False): + """Return the partitions from Apache Arrow (PyArrow).""" return cls.from_pandas(at.to_pandas(), return_dims=return_dims) @classmethod def get_indices(cls, axis, partitions, index_func=None): - """This gets the internal indices stored in the partitions. + """Get the internal indices stored in the partitions. Note: These are the global indices of the object. 
This is mostly useful when you have deleted rows/columns internally, but do not know @@ -519,7 +548,8 @@ def get_indices(cls, axis, partitions, index_func=None): axis: This axis to extract the labels. (0 - index, 1 - columns). index_func: The function to be used to extract the function. - Returns: + Returns + ------- A Pandas Index object. """ ErrorMessage.catch_bugs_and_request_email(not callable(index_func)) @@ -541,10 +571,11 @@ def get_indices(cls, axis, partitions, index_func=None): @classmethod def _compute_num_partitions(cls): - """Currently, this method returns the default. In the future it will - estimate the optimal number of partitions. + """Return the default number of partitions; in the future it will estimate the optimal number of partitions. - :return: + Returns + ------- + Number of partitions. """ from modin.pandas import DEFAULT_NPARTITIONS @@ -564,7 +595,7 @@ def _apply_func_to_list_of_partitions_broadcast( @classmethod def _apply_func_to_list_of_partitions(cls, func, partitions, **kwargs): - """Applies a function to a list of remote partitions. + """Apply a function to a list of remote partitions. Note: The main use for this is to preprocess the func. @@ -572,7 +603,8 @@ def _apply_func_to_list_of_partitions(cls, func, partitions, **kwargs): func: The func to apply partitions: The list of partitions - Returns: + Returns + ------- A list of BaseFramePartition objects. """ preprocessed_func = cls.preprocess_func(func) @@ -584,7 +616,7 @@ def _apply_func_to_list_of_partitions(cls, func, partitions, **kwargs): def apply_func_to_select_indices( cls, axis, partitions, func, indices, keep_remaining=False ): - """Applies a function to select indices. + """Apply a function to select indices. Note: Your internal function must take a kwarg `internal_indices` for this to work correctly. 
This prevents information leakage of the @@ -598,7 +630,8 @@ def apply_func_to_select_indices( Some operations may want to drop the remaining partitions and keep only the results. - Returns: + Returns + ------- A new BaseFrameManager object, the type of object that called this. """ if partitions.size == 0: @@ -685,7 +718,7 @@ def apply_func_to_select_indices( def apply_func_to_select_indices_along_full_axis( cls, axis, partitions, func, indices, keep_remaining=False ): - """Applies a function to a select subset of full columns/rows. + """Apply a function to a select subset of full columns/rows. Note: This should be used when you need to apply a function that relies on some global information for the entire column/row, but only need @@ -694,15 +727,21 @@ def apply_func_to_select_indices_along_full_axis( Important: For your func to operate directly on the indices provided, it must use `internal_indices` as a keyword argument. - Args: - axis: The axis to apply the function over (0 - rows, 1 - columns) - func: The function to apply. - indices: The global indices to apply the func to. - keep_remaining: Whether or not to keep the other partitions. - Some operations may want to drop the remaining partitions and - keep only the results. + Parameters + ---------- + axis: int + The axis to apply the function over (0 - rows, 1 - columns) + func: callable + The function to apply. + indices: list-like + The global indices to apply the func to. + keep_remaining: boolean + Whether or not to keep the other partitions. + Some operations may want to drop the remaining partitions and + keep only the results. - Returns: + Returns + ------- A new BaseFrameManager object, the type of object that called this. """ if partitions.size == 0: @@ -794,7 +833,7 @@ def apply_func_to_indices_both_axis( item_to_distribute=None, ): """ - Apply a function to along both axis + Apply a function along both axes. 
Important: For your func to operate directly on the indices provided, it must use `row_internal_indices, col_internal_indices` as keyword