From e65dc323f67f2fb50f8d612c70e292bf6edb95e8 Mon Sep 17 00:00:00 2001
From: Dongli He
Date: Fri, 3 Nov 2023 17:50:40 +0400
Subject: [PATCH 1/5] detect task type

---
 linear_trainer.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/linear_trainer.py b/linear_trainer.py
index e577cf0a..cbe95171 100644
--- a/linear_trainer.py
+++ b/linear_trainer.py
@@ -5,13 +5,15 @@
 from tqdm import tqdm

 import libmultilabel.linear as linear
-from libmultilabel.common_utils import dump_log
+from libmultilabel.common_utils import dump_log, is_multiclass_dataset
 from libmultilabel.linear.utils import LINEAR_TECHNIQUES


 def linear_test(config, model, datasets, label_mapping):
     metrics = linear.get_metrics(
-        config.monitor_metrics, datasets["test"]["y"].shape[1], multiclass=model.name == "binary_and_multiclass"
+        config.monitor_metrics,
+        datasets["test"]["y"].shape[1],
+        multiclass=is_multiclass_dataset(datasets["train"], "y"),
     )
     num_instance = datasets["test"]["x"].shape[0]
     k = config.save_k_predictions

From 8326176249e75601c6c2053fb909201ddf2164ee Mon Sep 17 00:00:00 2001
From: Dongli He
Date: Wed, 29 Nov 2023 16:54:05 +0400
Subject: [PATCH 2/5] rewrite the logic of task type detection

---
 libmultilabel/linear/linear.py | 90 +++++++++++++++++++++++++++++-----
 linear_trainer.py              | 24 ++++++++-
 2 files changed, 101 insertions(+), 13 deletions(-)

diff --git a/libmultilabel/linear/linear.py b/libmultilabel/linear/linear.py
index f9b4fb84..00799f26 100644
--- a/libmultilabel/linear/linear.py
+++ b/libmultilabel/linear/linear.py
@@ -27,11 +27,13 @@ def __init__(
         weights: np.matrix,
         bias: float,
         thresholds: float | np.ndarray,
+        is_multilabel: bool,
     ):
         self.name = name
         self.weights = weights
         self.bias = bias
         self.thresholds = thresholds
+        self.is_multilabel = is_multilabel

     def predict_values(self, x: sparse.csr_matrix) -> np.ndarray:
         """Calculates the decision values associated with x.
@@ -67,12 +69,19 @@ def predict_values(self, x: sparse.csr_matrix) -> np.ndarray:
         return (x * self.weights).A + self.thresholds


-def train_1vsrest(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True) -> FlatModel:
-    """Trains a linear model for multiabel data using a one-vs-rest strategy.
+def train_1vsrest(
+    y: sparse.csr_matrix,
+    x: sparse.csr_matrix,
+    is_multilabel: bool,
+    options: str = "",
+    verbose: bool = True,
+) -> FlatModel:
+    """Trains a linear classification model with a one-vs-rest strategy.

     Args:
         y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
         x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
+        is_multilabel (bool): A flag indicating if the dataset is multilabel.
         options (str, optional): The option string passed to liblinear. Defaults to ''.
         verbose (bool, optional): Output extra progress information. Defaults to True.
@@ -93,7 +102,13 @@ def train_1vsrest(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "",
         yi = y[:, i].toarray().reshape(-1)
         weights[:, i] = _do_train(2 * yi - 1, x, options).ravel()

-    return FlatModel(name="1vsrest", weights=np.asmatrix(weights), bias=bias, thresholds=0)
+    return FlatModel(
+        name="1vsrest",
+        weights=np.asmatrix(weights),
+        bias=bias,
+        thresholds=0,
+        is_multilabel=is_multilabel,
+    )


 def _prepare_options(x: sparse.csr_matrix, options: str) -> tuple[sparse.csr_matrix, str, float]:
@@ -145,7 +160,11 @@ def _prepare_options(x: sparse.csr_mat


 def train_thresholding(
-    y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
+    y: sparse.csr_matrix,
+    x: sparse.csr_matrix,
+    is_multilabel: bool,
+    options: str = "",
+    verbose: bool = True,
 ) -> FlatModel:
     """Trains a linear model for multi-label data using a one-vs-rest strategy
     and cross-validation to pick decision thresholds optimizing the sum of Macro-F1 and Micro-F1.
@@ -160,12 +179,16 @@ def train_thresholding(
     Args:
         y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
         x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
+        is_multilabel (bool): A flag indicating if the dataset is multilabel.
         options (str, optional): The option string passed to liblinear. Defaults to ''.
         verbose (bool, optional): Output extra progress information. Defaults to True.

     Returns:
         A model which can be used in predict_values.
     """
+    if not is_multilabel:
+        raise ValueError("thresholding method doesn't support binary/multiclass datasets.")
+
     x, options, bias = _prepare_options(x, options)

     y = y.tocsc()
@@ -189,7 +212,13 @@
         weights[:, i] = w.ravel()
         thresholds[i] = t

-    return FlatModel(name="thresholding", weights=np.asmatrix(weights), bias=bias, thresholds=thresholds)
+    return FlatModel(
+        name="thresholding",
+        weights=np.asmatrix(weights),
+        bias=bias,
+        thresholds=thresholds,
+        is_multilabel=is_multilabel,
+    )


 def _micromacro_one_label(
@@ -361,7 +390,11 @@ def _fmeasure(y_true: np.ndarray, y_pred: np.ndarray) -> float:


 def train_cost_sensitive(
-    y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
+    y: sparse.csr_matrix,
+    x: sparse.csr_matrix,
+    is_multilabel: bool,
+    options: str = "",
+    verbose: bool = True,
 ) -> FlatModel:
     """Trains a linear model for multilabel data using a one-vs-rest strategy
     and cross-validation to pick an optimal asymmetric misclassification cost
@@ -373,12 +406,16 @@
     Args:
         y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
         x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
+        is_multilabel (bool): A flag indicating if the dataset is multilabel.
         options (str, optional): The option string passed to liblinear. Defaults to ''.
         verbose (bool, optional): Output extra progress information. Defaults to True.

     Returns:
         A model which can be used in predict_values.
""" + if not is_multilabel: + raise ValueError("cost_sensitive method doesn't support binary/multiclass datasets.") + # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/ x, options, bias = _prepare_options(x, options) @@ -394,7 +431,13 @@ def train_cost_sensitive( w = _cost_sensitive_one_label(2 * yi - 1, x, options) weights[:, i] = w.ravel() - return FlatModel(name="cost_sensitive", weights=np.asmatrix(weights), bias=bias, thresholds=0) + return FlatModel( + name="cost_sensitive", + weights=np.asmatrix(weights), + bias=bias, + thresholds=0, + is_multilabel=is_multilabel, + ) def _cost_sensitive_one_label(y: np.ndarray, x: sparse.csr_matrix, options: str) -> np.ndarray: @@ -454,7 +497,11 @@ def _cross_validate(y: np.ndarray, x: sparse.csr_matrix, options: str, perm: np. def train_cost_sensitive_micro( - y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True + y: sparse.csr_matrix, + x: sparse.csr_matrix, + is_multilabel: bool, + options: str = "", + verbose: bool = True, ) -> FlatModel: """Trains a linear model for multilabel data using a one-vs-rest strategy and cross-validation to pick an optimal asymmetric misclassification cost @@ -466,12 +513,16 @@ def train_cost_sensitive_micro( Args: y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes. x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. + is_multilabel (bool): A flag indicating if the dataset is multilabel. options (str, optional): The option string passed to liblinear. Defaults to ''. verbose (bool, optional): Output extra progress information. Defaults to True. Returns: A model which can be used in predict_values. """ + if not is_multilabel: + raise ValueError("cost_sensitive_micro method doesn't support binary/multiclass datasets.") + # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/ x, options, bias = _prepare_options(x, options) @@ -510,17 +561,28 @@ def train_cost_sensitive_micro( w = _do_train(2 * yi - 1, x, final_options) weights[:, i] = w.ravel() - return FlatModel(name="cost_sensitive_micro", weights=np.asmatrix(weights), bias=bias, thresholds=0) + return FlatModel( + name="cost_sensitive_micro", + weights=np.asmatrix(weights), + bias=bias, + thresholds=0, + is_multilabel=is_multilabel, + ) def train_binary_and_multiclass( - y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True + y: sparse.csr_matrix, + x: sparse.csr_matrix, + is_multilabel: bool, + options: str = "", + verbose: bool = True, ) -> FlatModel: """Trains a linear model for binary and multi-class data. Args: y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes. x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. + is_multilabel (bool): A flag indicating if the dataset is multilabel. options (str, optional): The option string passed to liblinear. Defaults to ''. verbose (bool, optional): Output extra progress information. Defaults to True. @@ -556,7 +618,13 @@ def train_binary_and_multiclass( # For labels not appeared in training, assign thresholds to -inf so they won't be predicted. 
     thresholds = np.full(num_labels, -np.inf)
     thresholds[train_labels] = 0
-    return FlatModel(name="binary_and_multiclass", weights=np.asmatrix(weights), bias=bias, thresholds=thresholds)
+    return FlatModel(
+        name="binary_and_multiclass",
+        weights=np.asmatrix(weights),
+        bias=bias,
+        thresholds=thresholds,
+        is_multilabel=is_multilabel,
+    )


 def predict_values(model, x: sparse.csr_matrix) -> np.ndarray:
diff --git a/linear_trainer.py b/linear_trainer.py
index cbe95171..d0602eab 100644
--- a/linear_trainer.py
+++ b/linear_trainer.py
@@ -13,7 +13,7 @@ def linear_test(config, model, datasets, label_mapping):
     metrics = linear.get_metrics(
         config.monitor_metrics,
         datasets["test"]["y"].shape[1],
-        multiclass=is_multiclass_dataset(datasets["train"], "y"),
+        multiclass=not model.is_multilabel,
     )
     num_instance = datasets["test"]["x"].shape[0]
     k = config.save_k_predictions
@@ -39,7 +39,26 @@ def linear_test(config, model, datasets, label_mapping):


 def linear_train(datasets, config):
+    # detect task type
+    is_multilabel = config.get("is_multilabel", "auto")
+    if is_multilabel == "auto":
+        is_multilabel = not is_multiclass_dataset(datasets["train"], "y")
+    elif not isinstance(is_multilabel, bool):
+        raise ValueError(
+            f'"is_multilabel" is expected to be either "auto", "True", or "False". But got "{is_multilabel}" instead.'
+        )
+
+    task_type = "multilabel" if is_multilabel else "binary/multiclass"
+    logging.info(
+        f'is_multilabel is set to "{config.get("is_multilabel", "auto")}". '
+        f"Model will be trained in {task_type} mode."
+    )
+
+    # train
     if config.linear_technique == "tree":
+        if not is_multilabel:
+            raise ValueError("Tree model should only be used with multilabel datasets.")
+
         model = LINEAR_TECHNIQUES[config.linear_technique](
             datasets["train"]["y"],
             datasets["train"]["x"],
@@ -51,7 +70,8 @@
         model = LINEAR_TECHNIQUES[config.linear_technique](
             datasets["train"]["y"],
             datasets["train"]["x"],
-            config.liblinear_options,
+            is_multilabel=is_multilabel,
+            options=config.liblinear_options,
         )
     return model

From 8fa186e5bd8e6733e7180287178f8b04929ba972 Mon Sep 17 00:00:00 2001
From: Dongli He
Date: Wed, 29 Nov 2023 17:37:10 +0400
Subject: [PATCH 3/5] fix missing argument issue

---
 libmultilabel/linear/tree.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py
index 61c585db..59a0ee4c 100644
--- a/libmultilabel/linear/tree.py
+++ b/libmultilabel/linear/tree.py
@@ -48,6 +48,7 @@ def __init__(
         self.root = root
         self.flat_model = flat_model
         self.weight_map = weight_map
+        self.is_multilabel = True

     def predict_values(
         self,
@@ -203,14 +204,14 @@ def _train_node(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str, node:
         node (Node): Node to be trained.
     """
     if node.isLeaf():
-        node.model = linear.train_1vsrest(y[:, node.label_map], x, options, False)
+        node.model = linear.train_1vsrest(y[:, node.label_map], x, True, options, False)
     else:
         # meta_y[i, j] is 1 if the ith instance is relevant to the jth child.
         # getnnz returns an ndarray of shape number of instances.
         # This must be reshaped into number of instances * 1 to be interpreted as a column.
         meta_y = [y[:, child.label_map].getnnz(axis=1)[:, np.newaxis] > 0 for child in node.children]
         meta_y = sparse.csr_matrix(np.hstack(meta_y))
-        node.model = linear.train_1vsrest(meta_y, x, options, False)
+        node.model = linear.train_1vsrest(meta_y, x, True, options, False)

     node.model.weights = sparse.csc_matrix(node.model.weights)

@@ -250,6 +251,7 @@ def visit(node):
         weights=sparse.hstack(weights, "csr"),
         bias=bias,
         thresholds=0,
+        is_multilabel=True,
     )

     # w.shape[1] is the number of labels/metalabels of each node

From 09ca71a4317ad689519e305d964650d41f3fe06b Mon Sep 17 00:00:00 2001
From: Dongli He
Date: Thu, 30 Nov 2023 23:12:20 +0400
Subject: [PATCH 4/5] rewrite for reproduction purposes only

---
 libmultilabel/linear/linear.py | 11 ++---------
 linear_trainer.py              | 14 +-------------
 2 files changed, 3 insertions(+), 22 deletions(-)

diff --git a/libmultilabel/linear/linear.py b/libmultilabel/linear/linear.py
index 00799f26..3d430e69 100644
--- a/libmultilabel/linear/linear.py
+++ b/libmultilabel/linear/linear.py
@@ -186,9 +186,6 @@ def train_thresholding(
     Returns:
         A model which can be used in predict_values.
     """
-    if not is_multilabel:
-        raise ValueError("thresholding method doesn't support binary/multiclass datasets.")
-
     x, options, bias = _prepare_options(x, options)

     y = y.tocsc()
@@ -413,9 +410,6 @@ def train_cost_sensitive(
     Returns:
         A model which can be used in predict_values.
     """
-    if not is_multilabel:
-        raise ValueError("cost_sensitive method doesn't support binary/multiclass datasets.")
-
     # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
     x, options, bias = _prepare_options(x, options)

@@ -520,9 +514,6 @@ def train_cost_sensitive_micro(
     Returns:
         A model which can be used in predict_values.
     """
-    if not is_multilabel:
-        raise ValueError("cost_sensitive_micro method doesn't support binary/multiclass datasets.")
-
     # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
     x, options, bias = _prepare_options(x, options)

@@ -589,6 +580,8 @@ def train_binary_and_multiclass(
     Returns:
         A model which can be used in predict_values.
     """
+    if is_multilabel:
+        raise ValueError("binary_and_multiclass doesn't support multilabel data.")
     x, options, bias = _prepare_options(x, options)
     num_instances, num_labels = y.shape
     nonzero_instance_ids, nonzero_label_ids = y.nonzero()
diff --git a/linear_trainer.py b/linear_trainer.py
index d0602eab..449868be 100644
--- a/linear_trainer.py
+++ b/linear_trainer.py
@@ -40,19 +40,7 @@ def linear_test(config, model, datasets, label_mapping):

 def linear_train(datasets, config):
     # detect task type
-    is_multilabel = config.get("is_multilabel", "auto")
-    if is_multilabel == "auto":
-        is_multilabel = not is_multiclass_dataset(datasets["train"], "y")
-    elif not isinstance(is_multilabel, bool):
-        raise ValueError(
-            f'"is_multilabel" is expected to be either "auto", "True", or "False". But got "{is_multilabel}" instead.'
-        )
-
-    task_type = "multilabel" if is_multilabel else "binary/multiclass"
-    logging.info(
-        f'is_multilabel is set to "{config.get("is_multilabel", "auto")}". '
-        f"Model will be trained in {task_type} mode."
-    )
+    is_multilabel = not is_multiclass_dataset(datasets["train"], "y")

     # train
     if config.linear_technique == "tree":

From a4e96921e2ec50ee8dbbd8551d6712302d3889b0 Mon Sep 17 00:00:00 2001
From: Dongli He
Date: Thu, 30 Nov 2023 23:34:53 +0400
Subject: [PATCH 5/5] allow users to set the task type manually

---
 linear_trainer.py |  8 +++++++-
 torch_trainer.py  | 10 +++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/linear_trainer.py b/linear_trainer.py
index 449868be..6aafda87 100644
--- a/linear_trainer.py
+++ b/linear_trainer.py
@@ -40,7 +40,13 @@ def linear_test(config, model, datasets, label_mapping):

 def linear_train(datasets, config):
     # detect task type
-    is_multilabel = not is_multiclass_dataset(datasets["train"], "y")
+    is_multilabel = config.get("is_multilabel", "auto")
+    if is_multilabel == "auto":
+        is_multilabel = not is_multiclass_dataset(datasets["train"], "y")
+    elif not isinstance(is_multilabel, bool):
+        raise ValueError(
+            f'"is_multilabel" is expected to be either "auto", "True", or "False". But got "{is_multilabel}".'
+        )

     # train
     if config.linear_technique == "tree":
diff --git a/torch_trainer.py b/torch_trainer.py
index 0c7cf4f9..da904aea 100644
--- a/torch_trainer.py
+++ b/torch_trainer.py
@@ -65,7 +65,15 @@ def __init__(
         else:
             self.datasets = datasets

-        self.config.multiclass = is_multiclass_dataset(self.datasets["train"] + self.datasets.get("val", list()))
+        # detect task type
+        is_multilabel = self.config.get("is_multilabel", "auto")
+        if is_multilabel == "auto":
+            self.config.multiclass = is_multiclass_dataset(self.datasets["train"] + self.datasets.get("val", list()))
+        elif not isinstance(is_multilabel, bool):
+            raise ValueError(
+                f'"is_multilabel" is expected to be either "auto", "True", or "False". But got "{is_multilabel}".'
+            )
+
         self._setup_model(
             classes=classes,
             word_dict=word_dict,
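
Note: every auto-detection step in this series goes through libmultilabel.common_utils.is_multiclass_dataset, which the patches import but never show. Below is a minimal, hypothetical sketch of such a check for the linear-trainer case only; the real helper's signature and heuristics may differ. It treats a dataset as multiclass when every instance carries exactly one positive label, which is the behaviour the detection code above relies on, and the trailing comments show how the patched train_1vsrest signature would then be called.

# Simplified, hypothetical sketch of the task-type check used by this series.
# The actual libmultilabel.common_utils.is_multiclass_dataset may differ; this
# version only handles the case where dataset[label_key] is a 0/1 label matrix.
import scipy.sparse as sparse


def is_multiclass_dataset(dataset, label_key="y"):
    """Return True when every instance has exactly one positive label."""
    y = dataset[label_key]
    labels_per_instance = y.getnnz(axis=1) if sparse.issparse(y) else (y != 0).sum(axis=1)
    return bool((labels_per_instance == 1).all())


# With the API introduced by these patches, a linear technique is then invoked as:
#     is_multilabel = not is_multiclass_dataset(datasets["train"], "y")
#     model = linear.train_1vsrest(datasets["train"]["y"], datasets["train"]["x"], is_multilabel, options)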