[AIR] Change FeatureHasher input schema #27523

Merged
python/ray/data/preprocessors/hasher.py (78 changes: 61 additions & 17 deletions)
@@ -9,19 +9,68 @@


class FeatureHasher(Preprocessor):
"""Hash the features of the specified columns.
"""Apply the `hashing trick <https://en.wikipedia.org/wiki/Feature_hashing>`_ to a
table that describes token frequencies.

The created columns will have names in the format ``hash_{column_names}_{hash}``,
e.g. ``hash_column1_column2_0``, ``hash_column1_column2_1``, ...
:class:`FeatureHasher` creates ``num_features`` columns named ``hash_{index}``,
where ``index`` ranges from :math:`0` to ``num_features``:math:`- 1`. The column
``hash_{index}`` describes the frequency of tokens that hash to ``index``.

-    Note: Currently sparse matrices are not supported.
-    Therefore, it is recommended to **not** use a large ``num_features``.
+    Distinct tokens can hash to the same index. However, if ``num_features`` is
+    large enough, then each column probably corresponds to a unique token.
+
+    This preprocessor is memory efficient and quick to pickle. However, given a
+    transformed column, you can't know which tokens correspond to it. This might
+    make it hard to determine which tokens are important to your model.
+
+    .. warning::
+        Sparse matrices aren't supported. If you use a large ``num_features``, this
+        preprocessor might behave poorly.

Args:
-        columns: The columns of features that should be projected
-            onto a single hashed feature vector.
-        num_features: The size of the hashed feature vector.
-    """
+        columns: The columns to apply the hashing trick to. Each column should
+            describe the frequency of a token.
+        num_features: The number of features used to represent the vocabulary. You
+            should choose a value large enough to prevent hash collisions between
+            distinct tokens.

+    Examples:
+        >>> import pandas as pd
+        >>> import ray
+        >>> from ray.data.preprocessors import FeatureHasher
+
+        The data below describes the frequencies of tokens in ``"I like Python"`` and
+        ``"I dislike Python"``.
+
+        >>> df = pd.DataFrame({
+        ...     "I": [1, 1],
+        ...     "like": [1, 0],
+        ...     "dislike": [0, 1],
+        ...     "Python": [1, 1]
+        ... })
+        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
+
+        :class:`FeatureHasher` hashes each token to determine its index. For example,
+        the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`.
+
+        >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8)
+        >>> hasher.fit_transform(ds).to_pandas().to_numpy()  # doctest: +SKIP
+        array([[0, 0, 0, 2, 0, 1, 0, 0],
+               [0, 0, 0, 1, 0, 1, 1, 0]])
+
+        Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index
+        :math:`3`. You can avoid hash collisions like these by increasing
+        ``num_features``.
+
+    .. seealso::
+        :class:`~ray.data.preprocessors.CountVectorizer`
+            Use this preprocessor to generate inputs for :class:`FeatureHasher`.
+
+        :class:`~ray.data.preprocessors.HashingVectorizer`
+            If your input data describes documents rather than token frequencies,
+            use :class:`~ray.data.preprocessors.HashingVectorizer`.
+    """  # noqa: E501
Comment on lines +66 to +73
Contributor: nice! We should do this way more.


    _is_fittable = False

@@ -33,17 +82,12 @@ def __init__(self, columns: List[str], num_features: int):

    def _transform_pandas(self, df: pd.DataFrame):
        # TODO(matt): Use sparse matrix for efficiency.
-        joined_columns = "_".join(self.columns)
-
        def row_feature_hasher(row):
            hash_counts = collections.defaultdict(int)
            for column in self.columns:
-                hashed_value = simple_hash(row[column], self.num_features)
-                hash_counts[hashed_value] = hash_counts[hashed_value] + 1
-            return {
-                f"hash_{joined_columns}_{i}": hash_counts[i]
-                for i in range(self.num_features)
-            }
+                hashed_value = simple_hash(column, self.num_features)
+                hash_counts[hashed_value] += row[column]
+            return {f"hash_{i}": hash_counts[i] for i in range(self.num_features)}

        feature_columns = df.loc[:, self.columns].apply(
            row_feature_hasher, axis=1, result_type="expand"
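The semantic change in `row_feature_hasher` is easy to miss in the diff: the old code hashed each cell *value* and counted occurrences, while the new code hashes each column *name* (the token) and accumulates the frequency already stored in the cell. Below is a minimal standalone sketch of the new logic; `stable_hash` is an illustrative stand-in for Ray's internal `simple_hash`, whose definition isn't shown in this diff.

```python
import collections
import hashlib


def stable_hash(token: str, num_features: int) -> int:
    # Stand-in for Ray's internal `simple_hash`: map a token to a bucket
    # in [0, num_features) via hash-then-modulo.
    return int(hashlib.md5(token.encode("utf-8")).hexdigest(), 16) % num_features


def row_feature_hasher(row: dict, columns: list, num_features: int) -> dict:
    hash_counts = collections.defaultdict(int)
    for column in columns:
        # Hash the column *name* (the token)...
        index = stable_hash(column, num_features)
        # ...and add the token frequency stored in the cell, so tokens that
        # collide sum their counts.
        hash_counts[index] += row[column]
    return {f"hash_{i}": hash_counts[i] for i in range(num_features)}


# "I like Python" expressed as a row of token frequencies:
print(row_feature_hasher(
    {"I": 1, "like": 1, "dislike": 0, "Python": 1},
    columns=["I", "like", "dislike", "Python"],
    num_features=8,
))
```

The printed `hash_{i}` values sum to 3, the row's total token count; which indices are nonzero depends on the stand-in hash, so they won't exactly match the `simple_hash` output shown in the docstring example.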
python/ray/data/tests/test_preprocessors.py (42 changes: 19 additions & 23 deletions)
@@ -1302,33 +1302,29 @@ def test_tokenizer():

def test_feature_hasher():
"""Tests basic FeatureHasher functionality."""
# This dataframe represents the counts from the documents "I like Python" and "I
# dislike Python".
token_counts = pd.DataFrame(
{"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]}
)

col_a = [0, "a", "b"]
col_b = [0, "a", "c"]
in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b})
ds = ray.data.from_pandas(in_df)

hasher = FeatureHasher(["A", "B"], num_features=5)
transformed = hasher.transform(ds)
out_df = transformed.to_pandas()
hasher = FeatureHasher(["I", "like", "dislike", "Python"], num_features=256)
document_term_matrix = hasher.fit_transform(
ray.data.from_pandas(token_counts)
).to_pandas()

processed_col_0 = [0, 0, 1]
processed_col_1 = [0, 0, 1]
processed_col_2 = [0, 2, 0]
processed_col_3 = [2, 0, 0]
processed_col_4 = [0, 0, 0]
# Document-term matrix should have shape (# documents, # features)
assert document_term_matrix.shape == (2, 256)

expected_df = pd.DataFrame.from_dict(
{
"hash_A_B_0": processed_col_0,
"hash_A_B_1": processed_col_1,
"hash_A_B_2": processed_col_2,
"hash_A_B_3": processed_col_3,
"hash_A_B_4": processed_col_4,
}
)
# The tokens tokens "I", "like", and "Python" should be hashed to distinct indices
# for adequately large `num_features`.
assert document_term_matrix.iloc[0].sum() == 3
assert all(document_term_matrix.iloc[0] <= 1)

assert out_df.equals(expected_df)
# The tokens tokens "I", "dislike", and "Python" should be hashed to distinct
# indices for adequately large `num_features`.
assert document_term_matrix.iloc[1].sum() == 3
assert all(document_term_matrix.iloc[1] <= 1)


def test_hashing_vectorizer():
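The assertions above hinge on `num_features=256` being large enough for the four tokens to land on distinct indices. A quick way to sanity-check a choice of `num_features` against a known vocabulary, again using an illustrative stand-in for Ray's internal `simple_hash`:

```python
import hashlib


def stable_hash(token: str, num_features: int) -> int:
    # Illustrative stand-in for Ray's internal `simple_hash`.
    return int(hashlib.md5(token.encode("utf-8")).hexdigest(), 16) % num_features


vocabulary = ["I", "like", "dislike", "Python"]
for num_features in (8, 256):
    indices = {token: stable_hash(token, num_features) for token in vocabulary}
    collisions = len(vocabulary) - len(set(indices.values()))
    print(f"num_features={num_features}: {collisions} collision(s)")
```

As a rule of thumb, a well-mixed hash over `m` buckets produces about `n(n - 1) / (2m)` expected collisions among `n` distinct tokens, so keeping collisions rare requires `num_features` to grow roughly with the square of the vocabulary size.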