diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 00000000..06938b73 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,62 @@ +# This workflow will install the package and dependencies, run tests and lint with a single version of Python + +name: Python package build and test + +on: + + push: + pull_request: + branches: [ master, develop ] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-dev.txt + + - name: Lint with flake8 + if: always() + run: | + flake8 . + + - name: Lint with black + if: always() + run: | + black --version + black --check . + + - name: Test with pytest + if: always() + run: | + pytest . --cov=tubular/ --cov-report=html --cov-config=.coveragerc + + - name: Upload coverage pytest html test results to github + if: always() + uses: actions/upload-artifact@v2 + with: + name: coverage-html-pytest-results + path: htmlcov + + - name: Run Bandit tests + if: always() + run: | + bandit -c "bandit.yml" -r tubular -f html -o bandit-test-results.html + + - name: Upload bandit test results + if: always() + uses: actions/upload-artifact@v2 + with: + name: bandit-results + path: bandit-test-results.html + + + diff --git a/examples/imputers/NearestMeanResponseImputer.ipynb b/examples/imputers/NearestMeanResponseImputer.ipynb index 2a374e7e..a3349d20 100644 --- a/examples/imputers/NearestMeanResponseImputer.ipynb +++ b/examples/imputers/NearestMeanResponseImputer.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -21,9 +21,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + ], "source": [ "import tubular\n", "from tubular.imputers import NearestMeanResponseImputer" @@ -31,20 +32,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'0.2.8'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tubular.__version__" ] @@ -593,497 +583,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Alternate Usage\n", + "## Alternate Usage - DEPRECATED\n", "\n", - "We can also use this transformer in the event that we want to fill null values in our test set with impute values learned from our training set. In particular, if our training set contains no null values and our test set does, we can specify use_median_if_no_nulls as True in the fit stage so that our imputer will learn the median values of our training columns." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "df_train=boston_df[boston_df['CRIM'].notnull()]\n", - "df_test=boston_df[boston_df['CRIM'].isnull()]" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATtargetZN_catCHAS_catRAD_cat
3NaNNaN2.180.00.458NaN45.86.06223.0222.018.7NaNNaN33.4NaN0.03.0
6NaNNaN7.870.00.5246.01266.65.56055.0311.015.2395.6012.4322.9NaN0.05.0
13NaNNaN8.140.00.5385.94961.84.70754.0307.0NaN396.908.2620.4NaN0.04.0
14NaN0.08.140.00.5386.09684.5NaNNaN307.021.0380.02NaN18.20.00.0NaN
19NaN0.08.140.00.5385.727NaN3.79654.0307.021.0390.9511.2818.20.00.04.0
\n", - "
" - ], - "text/plain": [ - " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO \\\n", - "3 NaN NaN 2.18 0.0 0.458 NaN 45.8 6.0622 3.0 222.0 18.7 \n", - "6 NaN NaN 7.87 0.0 0.524 6.012 66.6 5.5605 5.0 311.0 15.2 \n", - "13 NaN NaN 8.14 0.0 0.538 5.949 61.8 4.7075 4.0 307.0 NaN \n", - "14 NaN 0.0 8.14 0.0 0.538 6.096 84.5 NaN NaN 307.0 21.0 \n", - "19 NaN 0.0 8.14 0.0 0.538 5.727 NaN 3.7965 4.0 307.0 21.0 \n", - "\n", - " B LSTAT target ZN_cat CHAS_cat RAD_cat \n", - "3 NaN NaN 33.4 NaN 0.0 3.0 \n", - "6 395.60 12.43 22.9 NaN 0.0 5.0 \n", - "13 396.90 8.26 20.4 NaN 0.0 4.0 \n", - "14 380.02 NaN 18.2 0.0 0.0 NaN \n", - "19 390.95 11.28 18.2 0.0 0.0 4.0 " - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_test.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.25199" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_train['CRIM'].median()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Initialising NearestMeanResponseImputer" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "BaseTransformer.__init__() called\n" - ] - } - ], - "source": [ - "imp_2=NearestMeanResponseImputer(\n", - " response_column='target',\n", - " columns='CRIM',\n", - " use_median_if_no_nulls=True,\n", - " copy=True,\n", - " verbose=True\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### NearestMeanResponseImputer Fit" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "BaseTransformer.fit() called\n" - ] - }, - { - "data": { - "text/plain": [ - "NearestMeanResponseImputer(columns=['CRIM'], response_column='target',\n", - " use_median_if_no_nulls=True)" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "imp_2.fit(df_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'CRIM': 0.25199}" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "imp_2.impute_values_" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### NearestMeanResponseImputer Transform" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "BaseTransformer.transform() called\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATtargetZN_catCHAS_catRAD_cat
30.25199NaN2.180.00.458NaN45.86.06223.0222.018.7NaNNaN33.4NaN0.03.0
60.25199NaN7.870.00.5246.01266.65.56055.0311.015.2395.6012.4322.9NaN0.05.0
130.25199NaN8.140.00.5385.94961.84.70754.0307.0NaN396.908.2620.4NaN0.04.0
140.251990.08.140.00.5386.09684.5NaNNaN307.021.0380.02NaN18.20.00.0NaN
190.251990.08.140.00.5385.727NaN3.79654.0307.021.0390.9511.2818.20.00.04.0
\n", - "
" - ], - "text/plain": [ - " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO \\\n", - "3 0.25199 NaN 2.18 0.0 0.458 NaN 45.8 6.0622 3.0 222.0 18.7 \n", - "6 0.25199 NaN 7.87 0.0 0.524 6.012 66.6 5.5605 5.0 311.0 15.2 \n", - "13 0.25199 NaN 8.14 0.0 0.538 5.949 61.8 4.7075 4.0 307.0 NaN \n", - "14 0.25199 0.0 8.14 0.0 0.538 6.096 84.5 NaN NaN 307.0 21.0 \n", - "19 0.25199 0.0 8.14 0.0 0.538 5.727 NaN 3.7965 4.0 307.0 21.0 \n", - "\n", - " B LSTAT target ZN_cat CHAS_cat RAD_cat \n", - "3 NaN NaN 33.4 NaN 0.0 3.0 \n", - "6 395.60 12.43 22.9 NaN 0.0 5.0 \n", - "13 396.90 8.26 20.4 NaN 0.0 4.0 \n", - "14 380.02 NaN 18.2 0.0 0.0 NaN \n", - "19 390.95 11.28 18.2 0.0 0.0 4.0 " - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_test2 = imp_2.transform(df_test)\n", - "df_test2.head()" + "We can also use this transformer in the event that we want to fill null values in our test set with impute values learned from our training set. In particular, if our training set contains no null values and our test set does, we can specify use_median_if_no_nulls as True in the fit stage so that our imputer will learn the median values of our training columns.\n", + "\n", + "**=> In this use case please use the MedianImputer**" ] } ], "metadata": { "kernelspec": { - "display_name": "python3", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -1097,7 +607,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.6.13" }, "toc": { "base_numbering": 1, diff --git a/requirements-dev.txt b/requirements-dev.txt index 9ca4323f..0fa436c4 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,6 +4,6 @@ pytest>=5.4.1 pytest-mock>=3.5.1 pytest-cov>=2.10.1 pytest-benchmark -black>=19.10b0 -flake8==3.8.4 +black>=21.9b0 +flake8==3.9.2 bandit>=1.7.0 \ No newline at end of file diff --git a/tests/dates/test_BetweenDatesTransformer.py b/tests/dates/test_BetweenDatesTransformer.py index dc0f057b..1d8f8a3e 100644 --- a/tests/dates/test_BetweenDatesTransformer.py +++ b/tests/dates/test_BetweenDatesTransformer.py @@ -44,11 +44,7 @@ def test_super_init_called(self, mocker): expected_call_args = { 0: { "args": (), - "kwargs": { - "columns": ["a", "b", "c"], - "verbose": False, - "copy": True, - }, + "kwargs": {"columns": ["a", "b", "c"], "verbose": False, "copy": True}, } } diff --git a/tests/imputers/test_NearestMeanResponseImputer.py b/tests/imputers/test_NearestMeanResponseImputer.py index ccd8367d..14a58884 100644 --- a/tests/imputers/test_NearestMeanResponseImputer.py +++ b/tests/imputers/test_NearestMeanResponseImputer.py @@ -16,21 +16,14 @@ def test_arguments(self): h.test_function_arguments( func=NearestMeanResponseImputer.__init__, - expected_arguments=[ - "self", - "response_column", - "use_median_if_no_nulls", - "columns", - ], - expected_default_values=(False, None), + expected_arguments=["self", "response_column", "columns"], + expected_default_values=(None,), ) def test_class_methods(self): """Test that NearestMeanResponseImputer has fit and transform methods.""" - x = NearestMeanResponseImputer( - response_column="c", use_median_if_no_nulls=False, columns=None - ) + x = NearestMeanResponseImputer(response_column="c", columns=None) h.test_object_method(obj=x, expected_method="fit", msg="fit") @@ -39,9 +32,7 @@ def test_class_methods(self): def test_inheritance(self): """Test that NearestMeanResponseImputer inherits from BaseImputer.""" - x = NearestMeanResponseImputer( - response_column="c", use_median_if_no_nulls=False, columns=None - ) + x = NearestMeanResponseImputer(response_column="c", columns=None) h.assert_inheritance(x, tubular.imputers.BaseImputer) @@ -58,7 +49,6 @@ def test_super_init_called(self, mocker): NearestMeanResponseImputer( response_column="c", - use_median_if_no_nulls=False, columns=None, verbose=True, copy=True, @@ -71,28 +61,17 @@ def test_response_column_not_str_error(self): NearestMeanResponseImputer(response_column=0) - def test_use_median_if_no_nulls_not_bool_error(self): - """Test that an exception is raised if use_median_if_no_nulls is not bool""" - - with pytest.raises(TypeError, match="use_median_if_no_nulls must be a bool"): - - NearestMeanResponseImputer( - response_column="a", use_median_if_no_nulls="abc" - ) - def test_values_passed_in_init_set_to_attribute(self): """Test that the values passed in init are saved in an attribute of the same name.""" x = NearestMeanResponseImputer( - response_column="c", columns="a", use_median_if_no_nulls=True + response_column="c", + columns="a", ) h.test_object_attributes( obj=x, - expected_attributes={ - "response_column": "c", - "use_median_if_no_nulls": True, - }, + expected_attributes={"response_column": "c"}, msg="Attributes for NearestMeanResponseImputer set in init", ) @@ -155,8 +134,8 @@ def test_null_values_in_response_error(self): x.fit(df) - def test_use_median_if_no_nulls_false_and_columns_with_no_nulls_error(self): - """Test an error is raised if a non-response column contains no nulls and use_median_if_no_nulls is false.""" + def test_columns_with_no_nulls_error(self): + """Test an error is raised if a non-response column contains no nulls.""" df = pd.DataFrame( {"a": [1, 2, 3, 4, 5], "b": [5, 4, 3, 2, 1], "c": [3, 2, 1, 4, 5]} @@ -211,38 +190,30 @@ def test_learnt_values(self): h.test_object_attributes( obj=x, expected_attributes={ - "impute_values_": { - "a": np.float64(2), - "b": np.float64(3), - } + "impute_values_": {"a": np.float64(2), "b": np.float64(3)} }, msg="impute_values_ attribute", ) def test_learnt_values2(self): - """Test that the nearest mean response values learnt during fit are expected - when values to be learnt include medians""" + """Test that the nearest mean response values learnt during fit are expected""" df = pd.DataFrame( { - "a": [1, 1, 2, 3, 3, 5], + "a": [1, 1, np.nan, np.nan, 3, 5], "b": [np.nan, np.nan, 1, 3, 3, 4], "c": [2, 3, 2, 1, 4, 1], } ) - x = NearestMeanResponseImputer( - response_column="c", columns=["a", "b"], use_median_if_no_nulls=True - ) + x = NearestMeanResponseImputer(response_column="c", columns=["a", "b"]) x.fit(df) h.test_object_attributes( obj=x, expected_attributes={ - "impute_values_": { - "a": np.float64(2.5), - "b": np.float64(3), - } + "impute_values_": {"a": np.float64(5), "b": np.float64(3)} }, msg="impute_values_ attribute", ) @@ -388,12 +359,10 @@ def test_nulls_imputed_correctly2(self, df, expected): pd.DataFrame({"a": [np.nan, 3, 4, 1, 4, np.nan]}), expected_df_3() ), ) - def test_nulls_imputed_correctly4(self, df, expected): + def test_nulls_imputed_correctly3(self, df, expected): """Test missing values are filled with the correct values - with median value from separate dataframe.""" - x = NearestMeanResponseImputer( - response_column="c", columns="a", use_median_if_no_nulls=True - ) + x = NearestMeanResponseImputer(response_column="c", columns="a") # set the impute values dict directly rather than fitting x on df so test works with helpers x.impute_values_ = {"a": 2.0} diff --git a/tubular/imputers.py b/tubular/imputers.py index 70d4fdbe..c035f6fd 100644 --- a/tubular/imputers.py +++ b/tubular/imputers.py @@ -280,25 +280,15 @@ class NearestMeanResponseImputer(BaseImputer): columns : None or str or list, default = None Columns to impute, if the default of None is supplied all columns in X are used when the transform method is called. - use_median_if_no_nulls : bool, default = False - If there are no nulls in a column on which the model is fitted, should the median of the column be learned instead? - If use_median_if_no_nulls = False and there are no nulls in the column to impute an error will be raised. """ - def __init__( - self, response_column, use_median_if_no_nulls=False, columns=None, **kwds - ): + def __init__(self, response_column, columns=None, **kwds): if not type(response_column) is str: raise TypeError("response_column must be a str") - if not type(use_median_if_no_nulls) is bool: - - raise TypeError("use_median_if_no_nulls must be a bool") - self.response_column = response_column - self.use_median_if_no_nulls = use_median_if_no_nulls super().__init__(columns=columns, **kwds) @@ -334,15 +324,9 @@ def fit(self, X, y=None): if c_nulls.sum() == 0: - if self.use_median_if_no_nulls: - - self.impute_values_[c] = X[c].median() - - else: - - raise ValueError( - f"Column {c} has no missing values, cannot use this transformer." - ) + raise ValueError( + f"Column {c} has no missing values, cannot use this transformer." + ) else: @@ -356,7 +340,7 @@ def fit(self, X, y=None): mean_response_by_levels[self.response_column] - mean_response_nulls ) - # take first value having the minimum difference in terms of average resposne + # take first value having the minimum difference in terms of average response self.impute_values_[c] = mean_response_by_levels.loc[ mean_response_by_levels["abs_diff_response"] == mean_response_by_levels["abs_diff_response"].min(),