From eddef3d5d722826248b9e9bb6dd90896f51263c3 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Thu, 4 Aug 2022 15:22:45 -0700 Subject: [PATCH 1/6] Initial commit --- python/ray/data/preprocessors/hasher.py | 11 ++---- python/ray/data/tests/test_preprocessors.py | 42 ++++++++++----------- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py index effe06967dfe..638490ea830c 100644 --- a/python/ray/data/preprocessors/hasher.py +++ b/python/ray/data/preprocessors/hasher.py @@ -33,17 +33,12 @@ def __init__(self, columns: List[str], num_features: int): def _transform_pandas(self, df: pd.DataFrame): # TODO(matt): Use sparse matrix for efficiency. - joined_columns = "_".join(self.columns) - def row_feature_hasher(row): hash_counts = collections.defaultdict(int) for column in self.columns: - hashed_value = simple_hash(row[column], self.num_features) - hash_counts[hashed_value] = hash_counts[hashed_value] + 1 - return { - f"hash_{joined_columns}_{i}": hash_counts[i] - for i in range(self.num_features) - } + hashed_value = simple_hash(column, self.num_features) + hash_counts[hashed_value] += row[column] + return {f"hash{i}": hash_counts[i] for i in range(self.num_features)} feature_columns = df.loc[:, self.columns].apply( row_feature_hasher, axis=1, result_type="expand" diff --git a/python/ray/data/tests/test_preprocessors.py b/python/ray/data/tests/test_preprocessors.py index 134ff667a65c..8249776bb0c5 100644 --- a/python/ray/data/tests/test_preprocessors.py +++ b/python/ray/data/tests/test_preprocessors.py @@ -1302,33 +1302,29 @@ def test_tokenizer(): def test_feature_hasher(): """Tests basic FeatureHasher functionality.""" + # This dataframe represents the counts from the documents "I like Python" and "I + # dislike Python". 
+ token_counts = pd.DataFrame( + {"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]} + ) - col_a = [0, "a", "b"] - col_b = [0, "a", "c"] - in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b}) - ds = ray.data.from_pandas(in_df) - - hasher = FeatureHasher(["A", "B"], num_features=5) - transformed = hasher.transform(ds) - out_df = transformed.to_pandas() + hasher = FeatureHasher(["I", "like", "dislike", "Python"], num_features=256) + document_term_matrix = hasher.fit_transform( + ray.data.from_pandas(token_counts) + ).to_pandas() - processed_col_0 = [0, 0, 1] - processed_col_1 = [0, 0, 1] - processed_col_2 = [0, 2, 0] - processed_col_3 = [2, 0, 0] - processed_col_4 = [0, 0, 0] + # Document-term matrix should have shape (# documents, # features) + assert document_term_matrix.shape == (2, 256) - expected_df = pd.DataFrame.from_dict( - { - "hash_A_B_0": processed_col_0, - "hash_A_B_1": processed_col_1, - "hash_A_B_2": processed_col_2, - "hash_A_B_3": processed_col_3, - "hash_A_B_4": processed_col_4, - } - ) + # The tokens "I", "like", and "Python" should be hashed to distinct indices + # for adequately large `num_features`. + assert document_term_matrix.iloc[0].sum() == 3 + assert all(document_term_matrix.iloc[0] <= 1) - assert out_df.equals(expected_df) + # The tokens "I", "dislike", and "Python" should be hashed to distinct + # indices for adequately large `num_features`. 
+ assert document_term_matrix.iloc[1].sum() == 3 + assert all(document_term_matrix.iloc[1] <= 1) def test_hashing_vectorizer(): From d9911c6244f05c6a8ce8cb1517b811bb590688e6 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Thu, 4 Aug 2022 17:29:27 -0700 Subject: [PATCH 2/6] Add docstring --- python/ray/data/preprocessors/hasher.py | 63 +++++++++++++++++++++---- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py index 638490ea830c..a5d235757169 100644 --- a/python/ray/data/preprocessors/hasher.py +++ b/python/ray/data/preprocessors/hasher.py @@ -9,18 +9,65 @@ class FeatureHasher(Preprocessor): - """Hash the features of the specified columns. + """Apply the `hashing trick `_ to a + table that describes token frequencies. - The created columns will have names in the format ``hash_{column_names}_{hash}``, - e.g. ``hash_column1_column2_0``, ``hash_column1_column2_1``, ... + :class:`FeatureHasher` creates ``num_features`` columns named ``hash_{index}``, + where ``index`` ranges from :math:`0` to ``num_features``:math:`- 1`. The column + ``hash_{index}`` describes the frequency of tokens that hash to ``index``. - Note: Currently sparse matrices are not supported. - Therefore, it is recommended to **not** use a large ``num_features``. + Distinct tokens can correspond to the same index. However, if ``num_features`` is large enough, + then columns probably correspond to a unique token. + + This preprocessor is memory efficient and quick to pickle. However, given a transformed + column, you can't know which tokens correspond to it. This might make it hard + to determine which tokens are important to your model. + + .. warning:: + Sparse matrices aren't supported. If you use a large ``num_features``, this + preprocessor might behave poorly. Args: - columns: The columns of features that should be projected - onto a single hashed feature vector. 
- num_features: The size of the hashed feature vector. + columns: The columns to apply the hashing trick to. Each column should describe + the frequency of a token. + num_features: The number of features used to represent the vocabulary. You + should choose a value large enough to prevent hash collisions between + distinct tokens. + + Examples: + + >>> import pandas as pd + >>> import ray + >>> from ray.data.preprocessors import FeatureHasher + + The data below describes the frequencies of tokens in ``"I like Python"`` and ``"I dislike Python"``. + + >>> df = pd.DataFrame({ + ... "I": [1, 1], + ... "like": [1, 0], + ... "dislike": [0, 1], + ... "Python": [1, 1] + ... }) + >>> ds = ray.data.from_pandas(df) + + :class:`FeatureHasher` hashes each token to determine its index. For example, + the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`. + + >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8) + >>> hasher.fit_transform(ds).to_pandas().to_numpy() + [[0 0 0 2 0 1 0 0] + [0 0 0 1 0 1 1 0]] + + Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index :math:`3`. You can + avoid hash collisions like these by increasing ``num_features``. + + .. seealso:: + :class:`~ray.data.preprocessors.CountVectorizer` + Use this preprocessor to generate inputs for :class:`FeatureHasher`. + + :class:`ray.data.preprocessors.HashingVectorizer` + If your input data describes documents rather than token frequencies, + use :class:`~ray.data.preprocessors.HashingVectorizer`. 
""" _is_fittable = False From c866ee2c3a5240f86564d013aaddbcf1642c1c09 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Fri, 5 Aug 2022 00:11:55 -0700 Subject: [PATCH 3/6] Appease lint --- python/ray/data/preprocessors/hasher.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py index a5d235757169..7d8b30a138f2 100644 --- a/python/ray/data/preprocessors/hasher.py +++ b/python/ray/data/preprocessors/hasher.py @@ -16,12 +16,12 @@ class FeatureHasher(Preprocessor): where ``index`` ranges from :math:`0` to ``num_features``:math:`- 1`. The column ``hash_{index}`` describes the frequency of tokens that hash to ``index``. - Distinct tokens can correspond to the same index. However, if ``num_features`` is large enough, - then columns probably correspond to a unique token. + Distinct tokens can correspond to the same index. However, if ``num_features`` is + large enough, then columns probably correspond to a unique token. - This preprocessor is memory efficient and quick to pickle. However, given a transformed - column, you can't know which tokens correspond to it. This might make it hard - to determine which tokens are important to your model. + This preprocessor is memory efficient and quick to pickle. However, given a + transformed column, you can't know which tokens correspond to it. This might make it + hard to determine which tokens are important to your model. .. warning:: Sparse matrices aren't supported. If you use a large ``num_features``, this @@ -40,7 +40,8 @@ class FeatureHasher(Preprocessor): >>> import ray >>> from ray.data.preprocessors import FeatureHasher - The data below describes the frequencies of tokens in ``"I like Python"`` and ``"I dislike Python"``. + The data below describes the frequencies of tokens in ``"I like Python"`` and + ``"I dislike Python"``. >>> df = pd.DataFrame({ ... 
"I": [1, 1], @@ -58,8 +59,9 @@ class FeatureHasher(Preprocessor): [[0 0 0 2 0 1 0 0] [0 0 0 1 0 1 1 0]] - Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index :math:`3`. You can - avoid hash collisions like these by increasing ``num_features``. + Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index + :math:`3`. You can avoid hash collisions like these by increasing + ``num_features``. .. seealso:: :class:`~ray.data.preprocessors.CountVectorizer` @@ -68,7 +70,7 @@ class FeatureHasher(Preprocessor): :class:`ray.data.preprocessors.HashingVectorizer` If your input data describes documents rather than token frequencies, use :class:`~ray.data.preprocessors.HashingVectorizer`. - """ + """ # noqa: E501 _is_fittable = False From 2504dd8cf78cd4f5e2e2e70b4cdd73f21c91c955 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Fri, 5 Aug 2022 00:12:01 -0700 Subject: [PATCH 4/6] Address review comment --- python/ray/data/preprocessors/hasher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py index 7d8b30a138f2..3243a6b31758 100644 --- a/python/ray/data/preprocessors/hasher.py +++ b/python/ray/data/preprocessors/hasher.py @@ -87,7 +87,7 @@ def row_feature_hasher(row): for column in self.columns: hashed_value = simple_hash(column, self.num_features) hash_counts[hashed_value] += row[column] - return {f"hash{i}": hash_counts[i] for i in range(self.num_features)} + return {f"hash_{i}": hash_counts[i] for i in range(self.num_features)} feature_columns = df.loc[:, self.columns].apply( row_feature_hasher, axis=1, result_type="expand" From bd283e651e8c573fd8e669dc178c436ca1a7844d Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Fri, 5 Aug 2022 02:58:01 -0700 Subject: [PATCH 5/6] Fix doctest --- python/ray/data/preprocessors/hasher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py index 3243a6b31758..8103a430f571 100644 --- a/python/ray/data/preprocessors/hasher.py +++ b/python/ray/data/preprocessors/hasher.py @@ -56,8 +56,8 @@ class FeatureHasher(Preprocessor): >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8) >>> hasher.fit_transform(ds).to_pandas().to_numpy() - [[0 0 0 2 0 1 0 0] - [0 0 0 1 0 1 1 0]] + array([[0, 0, 0, 2, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 1, 1, 0]]) Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index :math:`3`. You can avoid hash collisions like these by increasing From 8166ca830995caca6e1217de0ecca1f221f66ee6 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Fri, 5 Aug 2022 11:32:13 -0700 Subject: [PATCH 6/6] Skip doctests --- python/ray/data/preprocessors/hasher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py index 8103a430f571..66ac8e92dd9c 100644 --- a/python/ray/data/preprocessors/hasher.py +++ b/python/ray/data/preprocessors/hasher.py @@ -49,13 +49,13 @@ class FeatureHasher(Preprocessor): ... "dislike": [0, 1], ... "Python": [1, 1] ... }) - >>> ds = ray.data.from_pandas(df) + >>> ds = ray.data.from_pandas(df) # doctest: +SKIP :class:`FeatureHasher` hashes each token to determine its index. For example, the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`. >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8) - >>> hasher.fit_transform(ds).to_pandas().to_numpy() + >>> hasher.fit_transform(ds).to_pandas().to_numpy() # doctest: +SKIP array([[0, 0, 0, 2, 0, 1, 0, 0], [0, 0, 0, 1, 0, 1, 1, 0]])