Skip to content

Commit

Permalink
FIX-modin-project#7238: Fix docstring inheritance for 'cached_property' and use it
Browse files Browse the repository at this point in the history

Signed-off-by: Anatoly Myachev <[email protected]>
  • Loading branch information
anmyachev committed May 8, 2024
1 parent bb43cd5 commit 4562e96
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 50 deletions.
76 changes: 30 additions & 46 deletions modin/core/io/column_stores/parquet_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from __future__ import annotations

import functools
import json
import os
import re
Expand Down Expand Up @@ -60,11 +61,6 @@ class ColumnStoreDataset:
dataset : ParquetDataset or ParquetFile
Underlying dataset implementation for PyArrow and fastparquet
respectively.
_row_groups_per_file : list
List that contains the number of row groups for each file in the
given parquet dataset.
_files : list
List that contains the full paths of the parquet files in the dataset.
"""

def __init__(self, path, storage_options): # noqa : PR01
Expand All @@ -73,8 +69,6 @@ def __init__(self, path, storage_options): # noqa : PR01
self._fs_path = None
self._fs = None
self.dataset = self._init_dataset()
self._row_groups_per_file = None
self._files = None

@property
def pandas_metadata(self):
Expand All @@ -91,14 +85,12 @@ def engine(self):
"""Return string representing what engine is being used."""
raise NotImplementedError

# TODO: make this cached_property after docstring inheritance is fixed.
@property
@functools.cached_property
def files(self):
"""Return the list of formatted file paths of the dataset."""
raise NotImplementedError

# TODO: make this cached_property after docstring inheritance is fixed.
@property
@functools.cached_property
def row_groups_per_file(self):
"""Return a list with the number of row groups per file."""
raise NotImplementedError
Expand Down Expand Up @@ -203,31 +195,27 @@ def columns(self):
def engine(self):
return "pyarrow"

@property
@functools.cached_property
def row_groups_per_file(self):
from pyarrow.parquet import ParquetFile

if self._row_groups_per_file is None:
row_groups_per_file = []
# Count up the total number of row groups across all files and
# keep track of row groups per file to use later.
for file in self.files:
with self.fs.open(file) as f:
row_groups = ParquetFile(f).num_row_groups
row_groups_per_file.append(row_groups)
self._row_groups_per_file = row_groups_per_file
return self._row_groups_per_file
row_groups_per_file = []
# Count up the total number of row groups across all files and
# keep track of row groups per file to use later.
for file in self.files:
with self.fs.open(file) as f:
row_groups = ParquetFile(f).num_row_groups
row_groups_per_file.append(row_groups)
return row_groups_per_file

@property
@functools.cached_property
def files(self):
if self._files is None:
try:
files = self.dataset.files
except AttributeError:
# compatibility at least with 3.0.0 <= pyarrow < 8.0.0
files = self.dataset._dataset.files
self._files = self._get_files(files)
return self._files
try:
files = self.dataset.files
except AttributeError:
# compatibility at least with 3.0.0 <= pyarrow < 8.0.0
files = self.dataset._dataset.files
return self._get_files(files)

def to_pandas_dataframe(
self,
Expand Down Expand Up @@ -261,26 +249,22 @@ def columns(self):
def engine(self):
return "fastparquet"

@property
@functools.cached_property
def row_groups_per_file(self):
from fastparquet import ParquetFile

if self._row_groups_per_file is None:
row_groups_per_file = []
# Count up the total number of row groups across all files and
# keep track of row groups per file to use later.
for file in self.files:
with self.fs.open(file) as f:
row_groups = ParquetFile(f).info["row_groups"]
row_groups_per_file.append(row_groups)
self._row_groups_per_file = row_groups_per_file
return self._row_groups_per_file
row_groups_per_file = []
# Count up the total number of row groups across all files and
# keep track of row groups per file to use later.
for file in self.files:
with self.fs.open(file) as f:
row_groups = ParquetFile(f).info["row_groups"]
row_groups_per_file.append(row_groups)
return row_groups_per_file

@property
@functools.cached_property
def files(self):
if self._files is None:
self._files = self._get_files(self._get_fastparquet_files())
return self._files
return self._get_files(self._get_fastparquet_files())

def to_pandas_dataframe(self, columns):
return self.dataset.to_pandas(columns=columns)
Expand Down
15 changes: 14 additions & 1 deletion modin/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ def _replace_doc(
parent_cls : class, optional
If `target_obj` is an attribute of a class, `parent_cls` should be that class.
This is used for generating the API URL as well as for handling special cases
like `target_obj` being a property.
like `target_obj` being a property or a cached_property.
attr_name : str, optional
Gives the name to `target_obj` if it's an attribute of `parent_cls`.
Needed to handle some special cases and in most cases could be determined automatically.
Expand All @@ -314,6 +314,8 @@ def _replace_doc(
if parent_cls and not attr_name:
if isinstance(target_obj, property):
attr_name = target_obj.fget.__name__ # type: ignore[union-attr]
elif isinstance(target_obj, functools.cached_property):
attr_name = target_obj.func.__name__
elif isinstance(target_obj, (staticmethod, classmethod)):
attr_name = target_obj.__func__.__name__
else:
Expand Down Expand Up @@ -355,6 +357,16 @@ def _replace_doc(
attr_name,
property(target_obj.fget, target_obj.fset, target_obj.fdel, doc),
)
elif parent_cls and isinstance(target_obj, functools.cached_property):
if overwrite:
target_obj.func.__doc_inherited__ = True # type: ignore[attr-defined]
assert attr_name is not None
target_obj.func.__doc__ = doc
setattr(
parent_cls,
attr_name,
functools.cached_property(target_obj.func),
)
else:
if overwrite:
target_obj.__doc_inherited__ = True # type: ignore[attr-defined]
Expand Down Expand Up @@ -421,6 +433,7 @@ def _documentable_obj(obj: object) -> bool:
return bool(
callable(obj)
or (isinstance(obj, property) and obj.fget)
or (isinstance(obj, functools.cached_property))
or (isinstance(obj, (staticmethod, classmethod)) and obj.__func__)
)

Expand Down
13 changes: 10 additions & 3 deletions scripts/doc_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,9 +367,16 @@ def validate_object(import_path: str) -> list:

errors = []
doc = construct_validator(import_path)
if getattr(doc.obj, "__doc_inherited__", False) or (
isinstance(doc.obj, property)
and getattr(doc.obj.fget, "__doc_inherited__", False)
if (
getattr(doc.obj, "__doc_inherited__", False)
or (
isinstance(doc.obj, property)
and getattr(doc.obj.fget, "__doc_inherited__", False)
)
or (
isinstance(doc.obj, functools.cached_property)
and getattr(doc.obj.func, "__doc_inherited__", False)
)
):
# do not check inherited docstrings
return errors
Expand Down

0 comments on commit 4562e96

Please sign in to comment.