[refactor] Fix SparseMatrixType --> spmatrix and add ispandas #397

Merged
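The two ideas behind the refactor, as a minimal standalone sketch (this is illustration, not part of the diff): `scipy.sparse.spmatrix` is the common base class of every scipy sparse matrix format, so a single type can replace the old seven-way `Union`; and `ispandas` duck-types on `.iloc`, which `DataFrame` and `Series` share.

```python
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix, csr_matrix, spmatrix

# Every scipy sparse format (bsr, coo, csc, csr, dia, dok, lil) subclasses
# spmatrix, so one isinstance check covers them all.
assert isinstance(csr_matrix(np.eye(2)), spmatrix)
assert isinstance(coo_matrix(np.eye(2)), spmatrix)

# ispandas names the scattered hasattr(X, "iloc") checks; DataFrame and
# Series both expose .iloc, plain arrays and sparse matrices do not.
def ispandas(X) -> bool:
    """Whether X is pandas.DataFrame or pandas.Series"""
    return hasattr(X, "iloc")

assert ispandas(pd.DataFrame({"a": [1]}))
assert ispandas(pd.Series([1]))
assert not ispandas(np.array([1]))
```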
autoPyTorch/data/base_feature_validator.py (3 additions & 2 deletions)

```diff
@@ -5,13 +5,14 @@

 import pandas as pd

+from scipy.sparse import spmatrix
+
 from sklearn.base import BaseEstimator

-from autoPyTorch.utils.common import SparseMatrixType
 from autoPyTorch.utils.logging_ import PicklableClientLogger


-SupportedFeatTypes = Union[List, pd.DataFrame, np.ndarray, SparseMatrixType]
+SupportedFeatTypes = Union[List, pd.DataFrame, np.ndarray, spmatrix]


 class BaseFeatureValidator(BaseEstimator):
```
autoPyTorch/data/base_target_validator.py (3 additions & 2 deletions)

```diff
@@ -5,13 +5,14 @@

 import pandas as pd

+from scipy.sparse import spmatrix
+
 from sklearn.base import BaseEstimator

-from autoPyTorch.utils.common import SparseMatrixType
 from autoPyTorch.utils.logging_ import PicklableClientLogger


-SupportedTargetTypes = Union[List, pd.Series, pd.DataFrame, np.ndarray, SparseMatrixType]
+SupportedTargetTypes = Union[List, pd.Series, pd.DataFrame, np.ndarray, spmatrix]


 class BaseTargetValidator(BaseEstimator):
```
autoPyTorch/data/tabular_feature_validator.py (6 additions & 5 deletions)

```diff
@@ -23,6 +23,7 @@
     DatasetDTypeContainerType,
     reduce_dataset_size_if_too_large
 )
+from autoPyTorch.utils.common import ispandas
 from autoPyTorch.utils.logging_ import PicklableClientLogger

@@ -155,7 +156,7 @@ def _fit(
         if isinstance(X, np.ndarray):
             X = self.numpy_array_to_pandas(X)

-        if hasattr(X, "iloc") and not issparse(X):
+        if ispandas(X) and not issparse(X):
             X = cast(pd.DataFrame, X)
             # Treat a column with all instances a NaN as numerical
             # This will prevent doing encoding to a categorical column made completely
@@ -245,7 +246,7 @@ def transform(
         if isinstance(X, np.ndarray):
             X = self.numpy_array_to_pandas(X)

-        if hasattr(X, "iloc") and not issparse(X):
+        if ispandas(X) and not issparse(X):
             if np.any(pd.isnull(X)):
                 for column in X.columns:
                     if X[column].isna().all():
@@ -259,7 +260,7 @@
         self._check_data(X)

         # Pandas related transformations
-        if hasattr(X, "iloc") and self.column_transformer is not None:
+        if ispandas(X) and self.column_transformer is not None:
             if np.any(pd.isnull(X)):
                 # After above check it means that if there is a NaN
                 # the whole column must be NaN
@@ -309,7 +310,7 @@ def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressio
         DatasetCompressionInputType:
             Compressed dataset.
         """
-        is_dataframe = hasattr(X, 'iloc')
+        is_dataframe = ispandas(X)
         is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe
         if not is_reducible_type or self._dataset_compression is None:
             return X
@@ -363,7 +364,7 @@ def _check_data(
         )

         # Then for Pandas, we do not support Nan in categorical columns
-        if hasattr(X, "iloc"):
+        if ispandas(X):
             # If entered here, we have a pandas dataframe
             X = cast(pd.DataFrame, X)
```
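As a side note on the pandas branch guarded by `ispandas(X)` above: a column whose values are all NaN carries no category information, which is why the validator treats it as numerical rather than encoding it. A small illustration of the underlying check (the column names here are made up):

```python
import numpy as np
import pandas as pd

X = pd.DataFrame({"price": [1.0, 2.0], "unused": [np.nan, np.nan]})
for column in X.columns:
    # isna().all() is True only for the fully empty column, which the
    # validator then skips when deciding what to encode
    print(column, X[column].isna().all())  # price False, unused True
```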
autoPyTorch/data/tabular_target_validator.py (13 additions & 13 deletions)

```diff
@@ -5,7 +5,7 @@
 import pandas as pd
 from pandas.api.types import is_numeric_dtype

-import scipy.sparse
+from scipy.sparse import issparse, spmatrix

 import sklearn.utils
 from sklearn import preprocessing
@@ -14,10 +14,10 @@
 from sklearn.utils.multiclass import type_of_target

 from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes
-from autoPyTorch.utils.common import SparseMatrixType
+from autoPyTorch.utils.common import ispandas


-ArrayType = Union[np.ndarray, SparseMatrixType]
+ArrayType = Union[np.ndarray, spmatrix]


 def _check_and_to_array(y: SupportedTargetTypes) -> ArrayType:
@@ -71,7 +71,7 @@ def _fit(
             return self

         if y_test is not None:
-            if hasattr(y_train, "iloc"):
+            if ispandas(y_train):
                 y_train = pd.concat([y_train, y_test], ignore_index=True, sort=False)
             elif isinstance(y_train, list):
                 y_train = y_train + y_test
@@ -100,7 +100,7 @@ def _fit(
             if ndim > 1:
                 self.encoder.fit(y_train)
             else:
-                if hasattr(y_train, 'iloc'):
+                if ispandas(y_train):
                     y_train = cast(pd.DataFrame, y_train)
                     self.encoder.fit(y_train.to_numpy().reshape(-1, 1))
                 else:
@@ -131,7 +131,7 @@ def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray:
         shape = np.shape(y)
         if len(shape) > 1:
             y = self.encoder.transform(y)
-        elif hasattr(y, 'iloc'):
+        elif ispandas(y):
             # The Ordinal encoder expects a 2 dimensional input.
             # The targets are 1 dimensional, so reshape to match the expected shape
             y = cast(pd.DataFrame, y)
@@ -192,7 +192,7 @@ def inverse_transform(self, y: SupportedTargetTypes) -> np.ndarray:
             y = self.encoder.inverse_transform(y)
         else:
             # The targets should be a flattened array, hence reshape with -1
-            if hasattr(y, 'iloc'):
+            if ispandas(y):
                 y = cast(pd.DataFrame, y)
                 y = self.encoder.inverse_transform(y.to_numpy().reshape(-1, 1)).reshape(-1)
             else:
@@ -216,7 +216,7 @@ def _check_data(self, y: SupportedTargetTypes) -> None:
         if not isinstance(y, (np.ndarray, pd.DataFrame,
                               List, pd.Series)) \
-                and not scipy.sparse.issparse(y):  # type: ignore[misc]
+                and not issparse(y):  # type: ignore[misc]
             raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames,"
                              " pd.Series, sparse data and Python Lists as targets, yet, "
                              "the provided input is of type {}".format(
@@ -225,8 +225,8 @@ def _check_data(self, y: SupportedTargetTypes) -> None:
         # Sparse data muss be numerical
         # Type ignore on attribute because sparse targets have a dtype
-        if scipy.sparse.issparse(y) and not np.issubdtype(y.dtype.type,  # type: ignore[union-attr]
-                                                          np.number):
+        if issparse(y) and not np.issubdtype(y.dtype.type,  # type: ignore[union-attr]
+                                             np.number):
             raise ValueError("When providing a sparse matrix as targets, the only supported "
                              "values are numerical. Please consider using a dense"
                              " instead."
@@ -245,10 +245,10 @@ def _check_data(self, y: SupportedTargetTypes) -> None:
         # No Nan is supported
         has_nan_values = False
-        if hasattr(y, 'iloc'):
+        if ispandas(y):
             has_nan_values = cast(pd.DataFrame, y).isnull().values.any()
-        if scipy.sparse.issparse(y):
-            y = cast(scipy.sparse.spmatrix, y)
+        if issparse(y):
+            y = cast(spmatrix, y)
             has_nan_values = not np.array_equal(y.data, y.data)
         else:
             # List and array like values are considered here
```
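The `not np.array_equal(y.data, y.data)` line in the last hunk relies on NaN being the only value that compares unequal to itself, so a sparse matrix can be scanned for NaNs without densifying it. A quick demonstration of the trick on a toy matrix:

```python
import numpy as np
from scipy.sparse import csr_matrix

clean = csr_matrix(np.array([[1.0, 0.0, 2.0]]))
dirty = csr_matrix(np.array([[1.0, np.nan, 2.0]]))

# np.array_equal is False whenever a NaN is present, because NaN != NaN;
# only the explicitly stored .data buffer is compared, never a dense copy.
assert np.array_equal(clean.data, clean.data)
assert not np.array_equal(dirty.data, dirty.data)
```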
autoPyTorch/data/utils.py (4 additions & 2 deletions)

```diff
@@ -21,6 +21,8 @@

 from scipy.sparse import issparse, spmatrix

+from autoPyTorch.utils.common import ispandas
+

 # TODO: TypedDict with python 3.8
 #
@@ -246,7 +248,7 @@ def reduce_precision(
         reduced_dtypes = reduction_mapping[X.dtype]
         X = X.astype(reduced_dtypes)

-    elif hasattr(X, 'iloc'):
+    elif ispandas(X):
         dtypes = dict(X.dtypes)

         col_names = X.dtypes.index
@@ -270,7 +272,7 @@ def megabytes(arr: DatasetCompressionInputType) -> float:
         memory_in_bytes = arr.nbytes
     elif issparse(arr):
         memory_in_bytes = arr.data.nbytes
-    elif hasattr(arr, 'iloc'):
+    elif ispandas(arr):
         memory_in_bytes = arr.memory_usage(index=True, deep=True).sum()
     else:
         raise ValueError(f"Unrecognised data type of X, expected data type to "
```
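For reference, `megabytes` dispatches on exactly the three reducible types, one branch per memory accounting method. A usage sketch under the assumption that the function reports mebibytes (the sizes in the comments are illustrative):

```python
import numpy as np
import pandas as pd
from scipy.sparse import random as sparse_random

from autoPyTorch.data.utils import megabytes

X_dense = np.zeros((1024, 128), dtype=np.float64)                # .nbytes, 1.0 MiB
X_sparse = sparse_random(1024, 128, density=0.01, format="csr")  # .data.nbytes
X_frame = pd.DataFrame(X_dense)                                  # memory_usage(...).sum()

for X in (X_dense, X_sparse, X_frame):
    print(type(X).__name__, megabytes(X))
```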
autoPyTorch/datasets/base_dataset.py (2 additions & 2 deletions)

```diff
@@ -27,7 +27,7 @@
     NoResamplingStrategyTypes,
     ResamplingStrategies
 )
-from autoPyTorch.utils.common import FitRequirement
+from autoPyTorch.utils.common import FitRequirement, ispandas

 BaseDatasetInputType = Union[Tuple[np.ndarray, np.ndarray], Dataset]
 BaseDatasetPropertiesType = Union[int, float, str, List, bool]
@@ -220,7 +220,7 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]:
             A transformed single point prediction
         """

-        X = self.train_tensors[0].iloc[[index]] if hasattr(self.train_tensors[0], 'loc') \
+        X = self.train_tensors[0].iloc[[index]] if ispandas(self.train_tensors[0]) \
            else self.train_tensors[0][index]

         if self.train_transform is not None and train:
```
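A note on the `.iloc[[index]]` in `__getitem__`: the double brackets select a one-row DataFrame rather than a scalar Series, presumably so downstream transforms receive a 2-d frame with per-column dtypes intact. A small illustration:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

row_frame = df.iloc[[1]]   # one-row DataFrame, per-column dtypes preserved
row_series = df.iloc[1]    # Series, mixed columns collapse to object dtype
print(row_frame.shape, row_series.shape)  # (1, 2) (2,)
```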
(file path not shown in this capture)

```diff
@@ -6,7 +6,7 @@

 import pandas as pd

-from scipy.sparse import csr_matrix
+from scipy.sparse import spmatrix

 import torch

@@ -24,7 +24,7 @@ def __init__(self) -> None:
         super().__init__()
         self.add_fit_requirements([
             FitRequirement('X_train',
-                           (np.ndarray, pd.DataFrame, csr_matrix),
+                           (np.ndarray, pd.DataFrame, spmatrix),
                            user_defined=True, dataset_property=False),
             FitRequirement('backend',
                            (Backend, ),
```
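The practical effect of widening the `FitRequirement` from `csr_matrix` to `spmatrix`, here and in the two components below: any scipy sparse format now satisfies an isinstance-style check of the allowed `X_train` types. A sketch where the `allowed` tuple mirrors the requirement above (the tuple itself is the illustration, not autoPyTorch code):

```python
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix, spmatrix

allowed = (np.ndarray, pd.DataFrame, spmatrix)

# Under the old (np.ndarray, pd.DataFrame, csr_matrix) tuple this COO
# matrix would have failed the check; with spmatrix it passes.
assert isinstance(coo_matrix(np.eye(3)), allowed)
```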
(file path not shown in this capture)

```diff
@@ -6,7 +6,7 @@

 import pandas as pd

-from scipy.sparse import csr_matrix
+from scipy.sparse import spmatrix

 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent
@@ -21,7 +21,7 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None
         self.random_state = random_state
         self.add_fit_requirements([
             FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True),
-            FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True,
+            FitRequirement('X_train', (np.ndarray, pd.DataFrame, spmatrix), user_defined=True,
                            dataset_property=False)])

     def fit(self, X: Dict[str, Any], y: Any = None) -> "EarlyPreprocessing":
```
(file path not shown in this capture)

```diff
@@ -5,7 +5,7 @@

 import pandas as pd

-from scipy.sparse import csr_matrix
+from scipy.sparse import spmatrix

 import torch
 from torch import nn
@@ -29,7 +29,7 @@ def __init__(self,
         super().__init__()
         self.add_fit_requirements([
             FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True),
-            FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True,
+            FitRequirement('X_train', (np.ndarray, pd.DataFrame, spmatrix), user_defined=True,
                            dataset_property=False),
             FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True),
             FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False),
```
autoPyTorch/utils/common.py (9 additions & 13 deletions)

```diff
@@ -14,21 +14,17 @@

 import pandas as pd

-import scipy.sparse
+from scipy.sparse import spmatrix

 import torch
 from torch.utils.data.dataloader import default_collate

 HyperparameterValueType = Union[int, str, float]
-SparseMatrixType = Union[
-    scipy.sparse.bsr_matrix,
-    scipy.sparse.coo_matrix,
-    scipy.sparse.csc_matrix,
-    scipy.sparse.csr_matrix,
-    scipy.sparse.dia_matrix,
-    scipy.sparse.dok_matrix,
-    scipy.sparse.lil_matrix,
-]
+
+
+def ispandas(X: Any) -> bool:
+    """ Whether X is pandas.DataFrame or pandas.Series """
+    return hasattr(X, "iloc")


 class FitRequirement(NamedTuple):
@@ -177,10 +173,10 @@ def get_device_from_fit_dictionary(X: Dict[str, Any]) -> torch.device:
     return torch.device(X.get("device", "cpu"))


-def subsampler(data: Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix],
+def subsampler(data: Union[np.ndarray, pd.DataFrame, spmatrix],
               x: Union[np.ndarray, List[int]]
-              ) -> Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix]:
-    return data[x] if isinstance(data, (np.ndarray, scipy.sparse.csr_matrix)) else data.iloc[x]
+              ) -> Union[np.ndarray, pd.DataFrame, spmatrix]:
+    return data[x] if isinstance(data, (np.ndarray, spmatrix)) else data.iloc[x]


 def get_hyperparameter(hyperparameter: HyperparameterSearchSpace,
```
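Finally, a usage sketch for the widened `subsampler` signature, plus one caveat worth noting: not every `spmatrix` subclass supports row indexing (`coo_matrix`, for instance, does not), so callers still need to pass an indexable format such as CSR.

```python
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from autoPyTorch.utils.common import subsampler

idx = np.array([0, 2])
data_np = np.arange(12).reshape(4, 3)
data_df = pd.DataFrame(data_np)
data_sp = csr_matrix(data_np)

# ndarray and sparse inputs go through data[idx]; pandas goes through .iloc[idx]
assert subsampler(data_np, idx).shape == (2, 3)
assert subsampler(data_df, idx).shape == (2, 3)
assert subsampler(data_sp, idx).shape == (2, 3)
```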