From 0e7c8227f1288f6d42c6bb04137bb4bb957ab586 Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Thu, 3 Mar 2022 05:03:05 +0900 Subject: [PATCH] [refactor] Fix SparseMatrixType --> spmatrix and add ispandas --- autoPyTorch/data/base_feature_validator.py | 5 ++-- autoPyTorch/data/base_target_validator.py | 5 ++-- autoPyTorch/data/tabular_feature_validator.py | 11 ++++---- autoPyTorch/data/tabular_target_validator.py | 26 +++++++++---------- autoPyTorch/data/utils.py | 6 +++-- autoPyTorch/datasets/base_dataset.py | 4 +-- .../preprocessing/base_preprocessing.py | 4 +-- .../early_preprocessor/EarlyPreprocessing.py | 4 +-- .../network_backbone/base_network_backbone.py | 4 +-- autoPyTorch/utils/common.py | 22 +++++++--------- 10 files changed, 46 insertions(+), 45 deletions(-) diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 2c4ce4de9..11c6cf577 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -5,13 +5,14 @@ import pandas as pd +from scipy.sparse import spmatrix + from sklearn.base import BaseEstimator -from autoPyTorch.utils.common import SparseMatrixType from autoPyTorch.utils.logging_ import PicklableClientLogger -SupportedFeatTypes = Union[List, pd.DataFrame, np.ndarray, SparseMatrixType] +SupportedFeatTypes = Union[List, pd.DataFrame, np.ndarray, spmatrix] class BaseFeatureValidator(BaseEstimator): diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py index ddbe384cb..530675fbd 100644 --- a/autoPyTorch/data/base_target_validator.py +++ b/autoPyTorch/data/base_target_validator.py @@ -5,13 +5,14 @@ import pandas as pd +from scipy.sparse import spmatrix + from sklearn.base import BaseEstimator -from autoPyTorch.utils.common import SparseMatrixType from autoPyTorch.utils.logging_ import PicklableClientLogger -SupportedTargetTypes = Union[List, pd.Series, pd.DataFrame, np.ndarray, SparseMatrixType] +SupportedTargetTypes = Union[List, pd.Series, pd.DataFrame, np.ndarray, spmatrix] class BaseTargetValidator(BaseEstimator): diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 7da2bd8ed..3e8c316b0 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -23,6 +23,7 @@ DatasetDTypeContainerType, reduce_dataset_size_if_too_large ) +from autoPyTorch.utils.common import ispandas from autoPyTorch.utils.logging_ import PicklableClientLogger @@ -155,7 +156,7 @@ def _fit( if isinstance(X, np.ndarray): X = self.numpy_array_to_pandas(X) - if hasattr(X, "iloc") and not issparse(X): + if ispandas(X) and not issparse(X): X = cast(pd.DataFrame, X) # Treat a column with all instances a NaN as numerical # This will prevent doing encoding to a categorical column made completely @@ -245,7 +246,7 @@ def transform( if isinstance(X, np.ndarray): X = self.numpy_array_to_pandas(X) - if hasattr(X, "iloc") and not issparse(X): + if ispandas(X) and not issparse(X): if np.any(pd.isnull(X)): for column in X.columns: if X[column].isna().all(): @@ -259,7 +260,7 @@ def transform( self._check_data(X) # Pandas related transformations - if hasattr(X, "iloc") and self.column_transformer is not None: + if ispandas(X) and self.column_transformer is not None: if np.any(pd.isnull(X)): # After above check it means that if there is a NaN # the whole column must be NaN @@ -309,7 +310,7 @@ def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressio 
DatasetCompressionInputType: Compressed dataset. """ - is_dataframe = hasattr(X, 'iloc') + is_dataframe = ispandas(X) is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe if not is_reducible_type or self._dataset_compression is None: return X @@ -363,7 +364,7 @@ def _check_data( ) # Then for Pandas, we do not support Nan in categorical columns - if hasattr(X, "iloc"): + if ispandas(X): # If entered here, we have a pandas dataframe X = cast(pd.DataFrame, X) diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py index 67b6001f8..22cabb999 100644 --- a/autoPyTorch/data/tabular_target_validator.py +++ b/autoPyTorch/data/tabular_target_validator.py @@ -5,7 +5,7 @@ import pandas as pd from pandas.api.types import is_numeric_dtype -import scipy.sparse +from scipy.sparse import issparse, spmatrix import sklearn.utils from sklearn import preprocessing @@ -14,10 +14,10 @@ from sklearn.utils.multiclass import type_of_target from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes -from autoPyTorch.utils.common import SparseMatrixType +from autoPyTorch.utils.common import ispandas -ArrayType = Union[np.ndarray, SparseMatrixType] +ArrayType = Union[np.ndarray, spmatrix] def _check_and_to_array(y: SupportedTargetTypes) -> ArrayType: @@ -71,7 +71,7 @@ def _fit( return self if y_test is not None: - if hasattr(y_train, "iloc"): + if ispandas(y_train): y_train = pd.concat([y_train, y_test], ignore_index=True, sort=False) elif isinstance(y_train, list): y_train = y_train + y_test @@ -100,7 +100,7 @@ def _fit( if ndim > 1: self.encoder.fit(y_train) else: - if hasattr(y_train, 'iloc'): + if ispandas(y_train): y_train = cast(pd.DataFrame, y_train) self.encoder.fit(y_train.to_numpy().reshape(-1, 1)) else: @@ -131,7 +131,7 @@ def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray: shape = np.shape(y) if len(shape) > 1: y = self.encoder.transform(y) - elif hasattr(y, 'iloc'): + elif ispandas(y): # The Ordinal encoder expects a 2 dimensional input. # The targets are 1 dimensional, so reshape to match the expected shape y = cast(pd.DataFrame, y) @@ -192,7 +192,7 @@ def inverse_transform(self, y: SupportedTargetTypes) -> np.ndarray: y = self.encoder.inverse_transform(y) else: # The targets should be a flattened array, hence reshape with -1 - if hasattr(y, 'iloc'): + if ispandas(y): y = cast(pd.DataFrame, y) y = self.encoder.inverse_transform(y.to_numpy().reshape(-1, 1)).reshape(-1) else: @@ -216,7 +216,7 @@ def _check_data(self, y: SupportedTargetTypes) -> None: if not isinstance(y, (np.ndarray, pd.DataFrame, List, pd.Series)) \ - and not scipy.sparse.issparse(y): # type: ignore[misc] + and not issparse(y): # type: ignore[misc] raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames," " pd.Series, sparse data and Python Lists as targets, yet, " "the provided input is of type {}".format( @@ -225,8 +225,8 @@ def _check_data(self, y: SupportedTargetTypes) -> None: # Sparse data muss be numerical # Type ignore on attribute because sparse targets have a dtype - if scipy.sparse.issparse(y) and not np.issubdtype(y.dtype.type, # type: ignore[union-attr] - np.number): + if issparse(y) and not np.issubdtype(y.dtype.type, # type: ignore[union-attr] + np.number): raise ValueError("When providing a sparse matrix as targets, the only supported " "values are numerical. Please consider using a dense" " instead." 
@@ -245,10 +245,10 @@ def _check_data(self, y: SupportedTargetTypes) -> None:
 
         # No Nan is supported
         has_nan_values = False
-        if hasattr(y, 'iloc'):
+        if ispandas(y):
             has_nan_values = cast(pd.DataFrame, y).isnull().values.any()
-        if scipy.sparse.issparse(y):
-            y = cast(scipy.sparse.spmatrix, y)
+        if issparse(y):
+            y = cast(spmatrix, y)
             has_nan_values = not np.array_equal(y.data, y.data)
         else:
             # List and array like values are considered here
diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py
index 43dacf543..03375ce27 100644
--- a/autoPyTorch/data/utils.py
+++ b/autoPyTorch/data/utils.py
@@ -21,6 +21,8 @@
 from scipy.sparse import issparse, spmatrix
 
+from autoPyTorch.utils.common import ispandas
+
 
 # TODO: TypedDict with python 3.8
 #
@@ -246,7 +248,7 @@ def reduce_precision(
             reduced_dtypes = reduction_mapping[X.dtype]
             X = X.astype(reduced_dtypes)
 
-    elif hasattr(X, 'iloc'):
+    elif ispandas(X):
         dtypes = dict(X.dtypes)
 
         col_names = X.dtypes.index
@@ -270,7 +272,7 @@ def megabytes(arr: DatasetCompressionInputType) -> float:
         memory_in_bytes = arr.nbytes
     elif issparse(arr):
         memory_in_bytes = arr.data.nbytes
-    elif hasattr(arr, 'iloc'):
+    elif ispandas(arr):
         memory_in_bytes = arr.memory_usage(index=True, deep=True).sum()
     else:
         raise ValueError(f"Unrecognised data type of X, expected data type to "
diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py
index be17b945d..7761d07c2 100644
--- a/autoPyTorch/datasets/base_dataset.py
+++ b/autoPyTorch/datasets/base_dataset.py
@@ -27,7 +27,7 @@
     NoResamplingStrategyTypes,
     ResamplingStrategies
 )
-from autoPyTorch.utils.common import FitRequirement
+from autoPyTorch.utils.common import FitRequirement, ispandas
 
 BaseDatasetInputType = Union[Tuple[np.ndarray, np.ndarray], Dataset]
 BaseDatasetPropertiesType = Union[int, float, str, List, bool]
@@ -220,7 +220,7 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]
             A transformed single point prediction
         """
 
-        X = self.train_tensors[0].iloc[[index]] if hasattr(self.train_tensors[0], 'loc') \
+        X = self.train_tensors[0].iloc[[index]] if ispandas(self.train_tensors[0]) \
             else self.train_tensors[0][index]
 
         if self.train_transform is not None and train:
diff --git a/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py
index cfc1a890b..fb8bbdaa7 100644
--- a/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py
+++ b/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py
@@ -6,7 +6,7 @@
 
 import pandas as pd
 
-from scipy.sparse import csr_matrix
+from scipy.sparse import spmatrix
 
 import torch
 
@@ -24,7 +24,7 @@ def __init__(self) -> None:
         super().__init__()
         self.add_fit_requirements([
             FitRequirement('X_train',
-                           (np.ndarray, pd.DataFrame, csr_matrix),
+                           (np.ndarray, pd.DataFrame, spmatrix),
                            user_defined=True, dataset_property=False),
             FitRequirement('backend',
                            (Backend, ),
diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py
index 7fbf33f99..aa2b4c25f 100644
--- a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py
+++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py
@@ -6,7 +6,7 @@
 
 import pandas as pd
 
-from scipy.sparse import csr_matrix
+from scipy.sparse import spmatrix
 
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent
@@ -21,7 +21,7 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None
         self.random_state = random_state
         self.add_fit_requirements([
             FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True),
-            FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True,
+            FitRequirement('X_train', (np.ndarray, pd.DataFrame, spmatrix), user_defined=True,
                            dataset_property=False)])
 
     def fit(self, X: Dict[str, Any], y: Any = None) -> "EarlyPreprocessing":
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py
index 1a04d6645..7ff914a98 100644
--- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py
+++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py
@@ -5,7 +5,7 @@
 
 import pandas as pd
 
-from scipy.sparse import csr_matrix
+from scipy.sparse import spmatrix
 
 import torch
 from torch import nn
@@ -29,7 +29,7 @@ def __init__(self,
         super().__init__()
         self.add_fit_requirements([
             FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True),
-            FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True,
+            FitRequirement('X_train', (np.ndarray, pd.DataFrame, spmatrix), user_defined=True,
                            dataset_property=False),
             FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True),
             FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False),
diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py
index b0620a7db..48302bdee 100644
--- a/autoPyTorch/utils/common.py
+++ b/autoPyTorch/utils/common.py
@@ -14,21 +14,17 @@
 
 import pandas as pd
 
-import scipy.sparse
+from scipy.sparse import spmatrix
 
 import torch
 from torch.utils.data.dataloader import default_collate
 
 HyperparameterValueType = Union[int, str, float]
-SparseMatrixType = Union[
-    scipy.sparse.bsr_matrix,
-    scipy.sparse.coo_matrix,
-    scipy.sparse.csc_matrix,
-    scipy.sparse.csr_matrix,
-    scipy.sparse.dia_matrix,
-    scipy.sparse.dok_matrix,
-    scipy.sparse.lil_matrix,
-]
+
+
+def ispandas(X: Any) -> bool:
+    """ Whether X is pandas.DataFrame or pandas.Series """
+    return hasattr(X, "iloc")
 
 
 class FitRequirement(NamedTuple):
@@ -177,10 +173,10 @@ def get_device_from_fit_dictionary(X: Dict[str, Any]) -> torch.device:
     return torch.device(X.get("device", "cpu"))
 
 
-def subsampler(data: Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix],
+def subsampler(data: Union[np.ndarray, pd.DataFrame, spmatrix],
                x: Union[np.ndarray, List[int]]
-               ) -> Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix]:
-    return data[x] if isinstance(data, (np.ndarray, scipy.sparse.csr_matrix)) else data.iloc[x]
+               ) -> Union[np.ndarray, pd.DataFrame, spmatrix]:
+    return data[x] if isinstance(data, (np.ndarray, spmatrix)) else data.iloc[x]
 
 
 def get_hyperparameter(hyperparameter: HyperparameterSearchSpace,
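
A quick, non-normative illustration of the new `ispandas` helper introduced in `autoPyTorch/utils/common.py` above: it is a duck-type test on the `iloc` accessor, so it accepts both `pd.DataFrame` and `pd.Series` while rejecting NumPy arrays, plain lists, and scipy sparse matrices. The assertions below are a sketch of the intended behaviour, not part of the patch:

```python
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from autoPyTorch.utils.common import ispandas

# ispandas is a duck-type check: pandas containers expose `.iloc`.
assert ispandas(pd.DataFrame({"a": [1, 2]}))  # DataFrame -> True
assert ispandas(pd.Series([1, 2]))            # Series -> True
assert not ispandas(np.array([1, 2]))         # ndarray -> False
assert not ispandas([1, 2])                   # plain list -> False
assert not ispandas(csr_matrix((2, 2)))       # sparse matrix -> False
```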
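Why `scipy.sparse.spmatrix` can replace the hand-rolled `SparseMatrixType` union: it is the common base class of all seven matrix formats the union enumerated (`bsr`, `coo`, `csc`, `csr`, `dia`, `dok`, `lil`), so a single `isinstance` check covers them all. A hedged sketch of how `subsampler` behaves after the change, using made-up toy data:

```python
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix, csr_matrix, spmatrix

from autoPyTorch.utils.common import subsampler

# Every concrete scipy sparse matrix class derives from spmatrix,
# so one isinstance check replaces the seven-member Union.
assert isinstance(csr_matrix((3, 2)), spmatrix)
assert isinstance(coo_matrix((3, 2)), spmatrix)

rows = [0, 2]
dense = np.arange(6).reshape(3, 2)
frame = pd.DataFrame(dense, columns=["a", "b"])
sparse = csr_matrix(dense)

# ndarrays and sparse matrices are indexed positionally; pandas goes through .iloc.
assert subsampler(dense, rows).shape == (2, 2)
assert subsampler(frame, rows).shape == (2, 2)
assert subsampler(sparse, rows).shape == (2, 2)
```

A side effect of the wider `isinstance` check is that non-`csr` sparse inputs no longer fall through to the pandas `.iloc` branch, which would have raised an `AttributeError` before this patch.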
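The same `ispandas` dispatch pattern appears in `megabytes` in `autoPyTorch/data/utils.py`: ndarrays report size via `nbytes`, sparse matrices via the `nbytes` of their data buffer, and pandas objects via `memory_usage`. A rough illustration; the hunk above does not show the byte-to-megabyte conversion, so the printed values here are indicative only:

```python
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from autoPyTorch.data.utils import megabytes

dense = np.zeros((1000, 100), dtype=np.float64)  # 800,000 bytes of raw data
frame = pd.DataFrame(dense)
sparse = csr_matrix(dense)  # all-zero matrix: the data buffer is empty

print(megabytes(dense))   # ndarray: based on arr.nbytes
print(megabytes(frame))   # pandas: based on memory_usage(index=True, deep=True).sum()
print(megabytes(sparse))  # sparse: based on arr.data.nbytes -> ~0.0 here
```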