From da1466272467377be87d9930c8ff04b82a738bdb Mon Sep 17 00:00:00 2001 From: Roman Shaptala Date: Tue, 9 Nov 2021 18:41:42 +0200 Subject: [PATCH 1/2] Faster categorical column names selection Replace the slow and redundant select_dtypes dataframe query with a list comprehension over dataframe.dtypes --- python-package/lightgbm/basic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 812fd82a5d97..6d46f0497323 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -16,6 +16,7 @@ import numpy as np import scipy.sparse +from pandas import CategoricalDtype from .compat import PANDAS_INSTALLED, concat, dt_DataTable, is_dtype_sparse, pd_DataFrame, pd_Series from .libpath import find_lib_path @@ -566,7 +567,7 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica raise ValueError('Input data must be 2 dimensional and non empty.') if feature_name == 'auto' or feature_name is None: data = data.rename(columns=str) - cat_cols = list(data.select_dtypes(include=['category']).columns) + cat_cols = [col for col, t in zip(data.columns, data.dtypes) if isinstance(t, CategoricalDtype)] cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered] if pandas_categorical is None: # train dataset pandas_categorical = [list(data[col].cat.categories) for col in cat_cols] From 8ee25d330e9e5efcebbcc33fc7cfb1e9e6c943bb Mon Sep 17 00:00:00 2001 From: Roman Shaptala Date: Tue, 9 Nov 2021 18:50:51 +0200 Subject: [PATCH 2/2] rename variable --- python-package/lightgbm/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 6d46f0497323..7617c738e168 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -567,7 +567,7 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica raise 
ValueError('Input data must be 2 dimensional and non empty.') if feature_name == 'auto' or feature_name is None: data = data.rename(columns=str) - cat_cols = [col for col, t in zip(data.columns, data.dtypes) if isinstance(t, CategoricalDtype)] + cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, CategoricalDtype)] cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered] if pandas_categorical is None: # train dataset pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]