From eddef3d5d722826248b9e9bb6dd90896f51263c3 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Thu, 4 Aug 2022 15:22:45 -0700 Subject: [PATCH 1/6] Initial commit --- python/ray/data/preprocessors/hasher.py | 11 ++---- python/ray/data/tests/test_preprocessors.py | 42 ++++++++++----------- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py index effe06967dfe..638490ea830c 100644 --- a/python/ray/data/preprocessors/hasher.py +++ b/python/ray/data/preprocessors/hasher.py @@ -33,17 +33,12 @@ def __init__(self, columns: List[str], num_features: int): def _transform_pandas(self, df: pd.DataFrame): # TODO(matt): Use sparse matrix for efficiency. - joined_columns = "_".join(self.columns) - def row_feature_hasher(row): hash_counts = collections.defaultdict(int) for column in self.columns: - hashed_value = simple_hash(row[column], self.num_features) - hash_counts[hashed_value] = hash_counts[hashed_value] + 1 - return { - f"hash_{joined_columns}_{i}": hash_counts[i] - for i in range(self.num_features) - } + hashed_value = simple_hash(column, self.num_features) + hash_counts[hashed_value] += row[column] + return {f"hash{i}": hash_counts[i] for i in range(self.num_features)} feature_columns = df.loc[:, self.columns].apply( row_feature_hasher, axis=1, result_type="expand" diff --git a/python/ray/data/tests/test_preprocessors.py b/python/ray/data/tests/test_preprocessors.py index 134ff667a65c..8249776bb0c5 100644 --- a/python/ray/data/tests/test_preprocessors.py +++ b/python/ray/data/tests/test_preprocessors.py @@ -1302,33 +1302,29 @@ def test_tokenizer(): def test_feature_hasher(): """Tests basic FeatureHasher functionality.""" + # This dataframe represents the counts from the documents "I like Python" and "I + # dislike Python". 
+ token_counts = pd.DataFrame( + {"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]} + ) - col_a = [0, "a", "b"] - col_b = [0, "a", "c"] - in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b}) - ds = ray.data.from_pandas(in_df) - - hasher = FeatureHasher(["A", "B"], num_features=5) - transformed = hasher.transform(ds) - out_df = transformed.to_pandas() + hasher = FeatureHasher(["I", "like", "dislike", "Python"], num_features=256) + document_term_matrix = hasher.fit_transform( + ray.data.from_pandas(token_counts) + ).to_pandas() - processed_col_0 = [0, 0, 1] - processed_col_1 = [0, 0, 1] - processed_col_2 = [0, 2, 0] - processed_col_3 = [2, 0, 0] - processed_col_4 = [0, 0, 0] + # Document-term matrix should have shape (# documents, # features) + assert document_term_matrix.shape == (2, 256) - expected_df = pd.DataFrame.from_dict( - { - "hash_A_B_0": processed_col_0, - "hash_A_B_1": processed_col_1, - "hash_A_B_2": processed_col_2, - "hash_A_B_3": processed_col_3, - "hash_A_B_4": processed_col_4, - } - ) + # The tokens "I", "like", and "Python" should be hashed to distinct indices + # for adequately large `num_features`. + assert document_term_matrix.iloc[0].sum() == 3 + assert all(document_term_matrix.iloc[0] <= 1) - assert out_df.equals(expected_df) + # The tokens "I", "dislike", and "Python" should be hashed to distinct + # indices for adequately large `num_features`. 
+ assert document_term_matrix.iloc[1].sum() == 3 + assert all(document_term_matrix.iloc[1] <= 1) def test_hashing_vectorizer(): From d9911c6244f05c6a8ce8cb1517b811bb590688e6 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Thu, 4 Aug 2022 17:29:27 -0700 Subject: [PATCH 2/6] Add docstring --- python/ray/data/preprocessors/hasher.py | 63 +++++++++++++++++++++---- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py index 638490ea830c..a5d235757169 100644 --- a/python/ray/data/preprocessors/hasher.py +++ b/python/ray/data/preprocessors/hasher.py @@ -9,18 +9,65 @@ class FeatureHasher(Preprocessor): - """Hash the features of the specified columns. + """Apply the `hashing trick `_ to a + table that describes token frequencies. - The created columns will have names in the format ``hash_{column_names}_{hash}``, - e.g. ``hash_column1_column2_0``, ``hash_column1_column2_1``, ... + :class:`FeatureHasher` creates ``num_features`` columns named ``hash_{index}``, + where ``index`` ranges from :math:`0` to ``num_features``:math:`- 1`. The column + ``hash_{index}`` describes the frequency of tokens that hash to ``index``. - Note: Currently sparse matrices are not supported. - Therefore, it is recommended to **not** use a large ``num_features``. + Distinct tokens can correspond to the same index. However, if ``num_features`` is large enough, + then columns probably correspond to a unique token. + + This preprocessor is memory efficient and quick to pickle. However, given a transformed + column, you can't know which tokens correspond to it. This might make it hard + to determine which tokens are important to your model. + + .. warning:: + Sparse matrices aren't supported. If you use a large ``num_features``, this + preprocessor might behave poorly. Args: - columns: The columns of features that should be projected - onto a single hashed feature vector. 
- num_features: The size of the hashed feature vector. + columns: The columns to apply the hashing trick to. Each column should describe + the frequency of a token. + num_features: The number of features used to represent the vocabulary. You + should choose a value large enough to prevent hash collisions between + distinct tokens. + + Examples: + + >>> import pandas as pd + >>> import ray + >>> from ray.data.preprocessors import FeatureHasher + + The data below describes the frequencies of tokens in ``"I like Python"`` and ``"I dislike Python"``. + + >>> df = pd.DataFrame({ + ... "I": [1, 1], + ... "like": [1, 0], + ... "dislike": [0, 1], + ... "Python": [1, 1] + ... }) + >>> ds = ray.data.from_pandas(df) + + :class:`FeatureHasher` hashes each token to determine its index. For example, + the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`. + + >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8) + >>> hasher.fit_transform(ds).to_pandas().to_numpy() + [[0 0 0 2 0 1 0 0] + [0 0 0 1 0 1 1 0]] + + Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index :math:`3`. You can + avoid hash collisions like these by increasing ``num_features``. + + .. seealso:: + :class:`~ray.data.preprocessors.CountVectorizer` + Use this preprocessor to generate inputs for :class:`FeatureHasher`. + + :class:`ray.data.preprocessors.HashingVectorizer` + If your input data describes documents rather than token frequencies, + use :class:`~ray.data.preprocessors.HashingVectorizer`. 
""" _is_fittable = False From c866ee2c3a5240f86564d013aaddbcf1642c1c09 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Fri, 5 Aug 2022 00:11:55 -0700 Subject: [PATCH 3/6] Appease lint --- python/ray/data/preprocessors/hasher.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py index a5d235757169..7d8b30a138f2 100644 --- a/python/ray/data/preprocessors/hasher.py +++ b/python/ray/data/preprocessors/hasher.py @@ -16,12 +16,12 @@ class FeatureHasher(Preprocessor): where ``index`` ranges from :math:`0` to ``num_features``:math:`- 1`. The column ``hash_{index}`` describes the frequency of tokens that hash to ``index``. - Distinct tokens can correspond to the same index. However, if ``num_features`` is large enough, - then columns probably correspond to a unique token. + Distinct tokens can correspond to the same index. However, if ``num_features`` is + large enough, then columns probably correspond to a unique token. - This preprocessor is memory efficient and quick to pickle. However, given a transformed - column, you can't know which tokens correspond to it. This might make it hard - to determine which tokens are important to your model. + This preprocessor is memory efficient and quick to pickle. However, given a + transformed column, you can't know which tokens correspond to it. This might make it + hard to determine which tokens are important to your model. .. warning:: Sparse matrices aren't supported. If you use a large ``num_features``, this @@ -40,7 +40,8 @@ class FeatureHasher(Preprocessor): >>> import ray >>> from ray.data.preprocessors import FeatureHasher - The data below describes the frequencies of tokens in ``"I like Python"`` and ``"I dislike Python"``. + The data below describes the frequencies of tokens in ``"I like Python"`` and + ``"I dislike Python"``. >>> df = pd.DataFrame({ ... 
"I": [1, 1], @@ -58,8 +59,9 @@ class FeatureHasher(Preprocessor): [[0 0 0 2 0 1 0 0] [0 0 0 1 0 1 1 0]] - Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index :math:`3`. You can - avoid hash collisions like these by increasing ``num_features``. + Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index + :math:`3`. You can avoid hash collisions like these by increasing + ``num_features``. .. seealso:: :class:`~ray.data.preprocessors.CountVectorizer` @@ -68,7 +70,7 @@ class FeatureHasher(Preprocessor): :class:`ray.data.preprocessors.HashingVectorizer` If your input data describes documents rather than token frequencies, use :class:`~ray.data.preprocessors.HashingVectorizer`. - """ + """ # noqa: E501 _is_fittable = False From 2504dd8cf78cd4f5e2e2e70b4cdd73f21c91c955 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Fri, 5 Aug 2022 00:12:01 -0700 Subject: [PATCH 4/6] Address review comment --- python/ray/data/preprocessors/hasher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py index 7d8b30a138f2..3243a6b31758 100644 --- a/python/ray/data/preprocessors/hasher.py +++ b/python/ray/data/preprocessors/hasher.py @@ -87,7 +87,7 @@ def row_feature_hasher(row): for column in self.columns: hashed_value = simple_hash(column, self.num_features) hash_counts[hashed_value] += row[column] - return {f"hash{i}": hash_counts[i] for i in range(self.num_features)} + return {f"hash_{i}": hash_counts[i] for i in range(self.num_features)} feature_columns = df.loc[:, self.columns].apply( row_feature_hasher, axis=1, result_type="expand" From bd283e651e8c573fd8e669dc178c436ca1a7844d Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Fri, 5 Aug 2022 02:58:01 -0700 Subject: [PATCH 5/6] Fix doctest --- python/ray/data/preprocessors/hasher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py index 3243a6b31758..8103a430f571 100644 --- a/python/ray/data/preprocessors/hasher.py +++ b/python/ray/data/preprocessors/hasher.py @@ -56,8 +56,8 @@ class FeatureHasher(Preprocessor): >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8) >>> hasher.fit_transform(ds).to_pandas().to_numpy() - [[0 0 0 2 0 1 0 0] - [0 0 0 1 0 1 1 0]] + array([[0, 0, 0, 2, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 1, 1, 0]]) Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index :math:`3`. You can avoid hash collisions like these by increasing From 8166ca830995caca6e1217de0ecca1f221f66ee6 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Fri, 5 Aug 2022 11:32:13 -0700 Subject: [PATCH 6/6] Skip doctests --- python/ray/data/preprocessors/hasher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py index 8103a430f571..66ac8e92dd9c 100644 --- a/python/ray/data/preprocessors/hasher.py +++ b/python/ray/data/preprocessors/hasher.py @@ -49,13 +49,13 @@ class FeatureHasher(Preprocessor): ... "dislike": [0, 1], ... "Python": [1, 1] ... }) - >>> ds = ray.data.from_pandas(df) + >>> ds = ray.data.from_pandas(df) # doctest: +SKIP :class:`FeatureHasher` hashes each token to determine its index. For example, the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`. >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8) - >>> hasher.fit_transform(ds).to_pandas().to_numpy() + >>> hasher.fit_transform(ds).to_pandas().to_numpy() # doctest: +SKIP array([[0, 0, 0, 2, 0, 1, 0, 0], [0, 0, 0, 1, 0, 1, 1, 0]])