Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] 018 add unfitted error pca & tests on IPCA #3272

Merged
merged 8 commits into from
Dec 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions python/cuml/decomposition/pca.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ from cuml.common.array_descriptor import CumlArrayDescriptor
from cuml.common import using_output_type
from cuml.prims.stats import cov
from cuml.common.input_utils import sparse_scipy_to_cp
from cuml.common.exceptions import NotFittedError


cdef extern from "cuml/decomposition/pca.hpp" namespace "ML":
Expand Down Expand Up @@ -554,6 +555,7 @@ class PCA(Base):

"""

self._check_is_fitted('components_')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see that in sklearn, they do sklearn.utils.validation.check_is_fitted in a similar way. Is there an advantage to having this as a method on base instead of a utility? I believe we can always add the function later as a wrapper if necessary, so probably not a major issue either way.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The advantage of a base method would be that there is no additional import needed, and it would be really easy to use. But then we would have two copies of this code, for the estimators with Dask.

if cupyx.scipy.sparse.issparse(X):
return self._sparse_inverse_transform(X,
return_sparse=return_sparse,
Expand Down Expand Up @@ -653,6 +655,7 @@ class PCA(Base):

"""

self._check_is_fitted('components_')
if cupyx.scipy.sparse.issparse(X):
return self._sparse_transform(X)
elif scipy.sparse.issparse(X):
Expand Down Expand Up @@ -736,3 +739,9 @@ class PCA(Base):
'X_types_gpu': ['2darray', 'sparse'],
'X_types': ['2darray', 'sparse']
}

def _check_is_fitted(self, attr):
lowener marked this conversation as resolved.
Show resolved Hide resolved
if not hasattr(self, attr) or (getattr(self, attr) is None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, agreed on the generalization - good suggestion from Dante

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will do the generalization of NotFittedError in a following PR

msg = ("This instance is not fitted yet. Call 'fit' "
"with appropriate arguments before using this estimator.")
raise NotFittedError(msg)
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,7 @@ def _svd_flip(u, v, u_based_decision=True):
if u_based_decision:
# columns of u, rows of v
max_abs_cols = cp.argmax(cp.abs(u), axis=0)
signs = cp.sign(u[max_abs_cols, range(u.shape[1])])
signs = cp.sign(u[max_abs_cols, list(range(u.shape[1]))])
u *= signs
v *= signs[:, cp.newaxis]
else:
Expand Down
34 changes: 34 additions & 0 deletions python/cuml/test/test_incremental_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@

from cuml.datasets import make_blobs
from cuml.experimental.decomposition import IncrementalPCA as cuIPCA
from cuml.experimental.decomposition.incremental_pca import _svd_flip

from cuml.test.utils import array_equal
from cuml.common.exceptions import NotFittedError


@pytest.mark.parametrize(
Expand Down Expand Up @@ -109,3 +111,35 @@ def test_partial_fit(nrows, ncols, n_components, density,

assert array_equal(cu_inv, sk_inv,
5e-5, with_sign=True)


def test_exceptions():
    # Unfitted estimator and a sparse identity matrix as input.
    ipca = cuIPCA()
    sparse_X = cupyx.scipy.sparse.eye(10)

    # partial_fit rejects sparse input outright.
    with pytest.raises(TypeError):
        ipca.partial_fit(sparse_X)

    dense_X = sparse_X.toarray()

    # Both transforms must refuse to run before any fit.
    for method in (ipca.transform, ipca.inverse_transform):
        with pytest.raises(NotFittedError):
            method(dense_X)

    # n_components exceeding either data dimension is invalid.
    for bad_X in (dense_X[:5], dense_X[:, :5]):
        with pytest.raises(ValueError):
            cuIPCA(n_components=8).fit(bad_X)


def test_svd_flip():
    # Fixed 9x10 integer matrix containing both negative and positive values.
    mat = cp.arange(-10, 80).reshape((9, 10))
    u, s, v = cp.linalg.svd(mat, full_matrices=False)

    # Whichever factor drives the sign convention, the flipped factors
    # must still reconstruct the original matrix exactly.
    for u_based in (True, False):
        u_flipped, v_flipped = _svd_flip(u, v, u_based_decision=u_based)
        reconstruction = cp.dot(u_flipped * s, v_flipped)
        assert array_equal(reconstruction, mat)
11 changes: 11 additions & 0 deletions python/cuml/test/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from sklearn.datasets import make_multilabel_classification
from sklearn.decomposition import PCA as skPCA
from sklearn.datasets import make_blobs
from cuml.common.exceptions import NotFittedError


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
Expand Down Expand Up @@ -233,3 +234,13 @@ def test_sparse_pca_inputs(nrows, ncols, whiten, return_sparse, cupy_input):
assert isinstance(i_sparse, cp.core.ndarray)

assert array_equal(i_sparse, X.todense(), 1e-1, with_sign=True)


def test_exceptions():
    """transform/inverse_transform on an unfitted PCA must raise NotFittedError."""
    # Build the input once, outside the raises-blocks, so that a failure in
    # data generation cannot be conflated with the exception under test.
    X = cp.random.random((10, 10))

    with pytest.raises(NotFittedError):
        cuPCA().transform(X)

    with pytest.raises(NotFittedError):
        cuPCA().inverse_transform(X)