From e65dc323f67f2fb50f8d612c70e292bf6edb95e8 Mon Sep 17 00:00:00 2001
From: Dongli He
Date: Fri, 3 Nov 2023 17:50:40 +0400
Subject: [PATCH 1/5] detect task type

---
 linear_trainer.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/linear_trainer.py b/linear_trainer.py
index e577cf0a..cbe95171 100644
--- a/linear_trainer.py
+++ b/linear_trainer.py
@@ -5,13 +5,15 @@
 from tqdm import tqdm

 import libmultilabel.linear as linear
-from libmultilabel.common_utils import dump_log
+from libmultilabel.common_utils import dump_log, is_multiclass_dataset
 from libmultilabel.linear.utils import LINEAR_TECHNIQUES


 def linear_test(config, model, datasets, label_mapping):
     metrics = linear.get_metrics(
-        config.monitor_metrics, datasets["test"]["y"].shape[1], multiclass=model.name == "binary_and_multiclass"
+        config.monitor_metrics,
+        datasets["test"]["y"].shape[1],
+        multiclass=is_multiclass_dataset(datasets["train"], "y"),
     )
     num_instance = datasets["test"]["x"].shape[0]
     k = config.save_k_predictions

From 8326176249e75601c6c2053fb909201ddf2164ee Mon Sep 17 00:00:00 2001
From: Dongli He
Date: Wed, 29 Nov 2023 16:54:05 +0400
Subject: [PATCH 2/5] rewrite the logic of task type detection

---
 libmultilabel/linear/linear.py | 90 +++++++++++++++++++++++++++++-----
 linear_trainer.py              | 24 ++++++++-
 2 files changed, 101 insertions(+), 13 deletions(-)

diff --git a/libmultilabel/linear/linear.py b/libmultilabel/linear/linear.py
index f9b4fb84..00799f26 100644
--- a/libmultilabel/linear/linear.py
+++ b/libmultilabel/linear/linear.py
@@ -27,11 +27,13 @@ def __init__(
         weights: np.matrix,
         bias: float,
         thresholds: float | np.ndarray,
+        is_multilabel: bool,
     ):
         self.name = name
         self.weights = weights
         self.bias = bias
         self.thresholds = thresholds
+        self.is_multilabel = is_multilabel

     def predict_values(self, x: sparse.csr_matrix) -> np.ndarray:
         """Calculates the decision values associated with x.
@@ -67,12 +69,19 @@ def predict_values(self, x: sparse.csr_matrix) -> np.ndarray:
         return (x * self.weights).A + self.thresholds


-def train_1vsrest(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True) -> FlatModel:
-    """Trains a linear model for multiabel data using a one-vs-rest strategy.
+def train_1vsrest(
+    y: sparse.csr_matrix,
+    x: sparse.csr_matrix,
+    is_multilabel: bool,
+    options: str = "",
+    verbose: bool = True,
+) -> FlatModel:
+    """Trains a linear classification model with a one-vs-rest strategy.

     Args:
         y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
         x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
+        is_multilabel (bool): A flag indicating if the dataset is multilabel.
         options (str, optional): The option string passed to liblinear. Defaults to ''.
         verbose (bool, optional): Output extra progress information. Defaults to True.
@@ -93,7 +102,13 @@ def train_1vsrest(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "",
         yi = y[:, i].toarray().reshape(-1)
         weights[:, i] = _do_train(2 * yi - 1, x, options).ravel()

-    return FlatModel(name="1vsrest", weights=np.asmatrix(weights), bias=bias, thresholds=0)
+    return FlatModel(
+        name="1vsrest",
+        weights=np.asmatrix(weights),
+        bias=bias,
+        thresholds=0,
+        is_multilabel=is_multilabel,
+    )


 def _prepare_options(x: sparse.csr_matrix, options: str) -> tuple[sparse.csr_matrix, str, float]:
@@ -145,7 +160,11 @@ def _prepare_options(x: sparse.csr_mat


 def train_thresholding(
-    y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
+    y: sparse.csr_matrix,
+    x: sparse.csr_matrix,
+    is_multilabel: bool,
+    options: str = "",
+    verbose: bool = True,
 ) -> FlatModel:
     """Trains a linear model for multi-label data using a one-vs-rest strategy
     and cross-validation to pick decision thresholds optimizing the sum of Macro-F1 and Micro-F1.
@@ -160,12 +179,16 @@ def train_thresholding(
     Args:
         y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
         x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
+        is_multilabel (bool): A flag indicating if the dataset is multilabel.
         options (str, optional): The option string passed to liblinear. Defaults to ''.
         verbose (bool, optional): Output extra progress information. Defaults to True.

     Returns:
         A model which can be used in predict_values.
     """
+    if not is_multilabel:
+        raise ValueError("thresholding method doesn't support binary/multiclass datasets.")
+
     x, options, bias = _prepare_options(x, options)

     y = y.tocsc()
@@ -189,7 +212,13 @@
         weights[:, i] = w.ravel()
         thresholds[i] = t

-    return FlatModel(name="thresholding", weights=np.asmatrix(weights), bias=bias, thresholds=thresholds)
+    return FlatModel(
+        name="thresholding",
+        weights=np.asmatrix(weights),
+        bias=bias,
+        thresholds=thresholds,
+        is_multilabel=is_multilabel,
+    )


 def _micromacro_one_label(
@@ -361,7 +390,11 @@ def _fmeasure(y_true: np.ndarray, y_pred: np.ndarray) -> float:


 def train_cost_sensitive(
-    y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
+    y: sparse.csr_matrix,
+    x: sparse.csr_matrix,
+    is_multilabel: bool,
+    options: str = "",
+    verbose: bool = True,
 ) -> FlatModel:
     """Trains a linear model for multilabel data using a one-vs-rest strategy
     and cross-validation to pick an optimal asymmetric misclassification cost
@@ -373,12 +406,16 @@
     Args:
         y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
         x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
+        is_multilabel (bool): A flag indicating if the dataset is multilabel.
         options (str, optional): The option string passed to liblinear. Defaults to ''.
         verbose (bool, optional): Output extra progress information. Defaults to True.

     Returns:
         A model which can be used in predict_values.
""" + if not is_multilabel: + raise ValueError("cost_sensitive method doesn't support binary/multiclass datasets.") + # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/ x, options, bias = _prepare_options(x, options) @@ -394,7 +431,13 @@ def train_cost_sensitive( w = _cost_sensitive_one_label(2 * yi - 1, x, options) weights[:, i] = w.ravel() - return FlatModel(name="cost_sensitive", weights=np.asmatrix(weights), bias=bias, thresholds=0) + return FlatModel( + name="cost_sensitive", + weights=np.asmatrix(weights), + bias=bias, + thresholds=0, + is_multilabel=is_multilabel, + ) def _cost_sensitive_one_label(y: np.ndarray, x: sparse.csr_matrix, options: str) -> np.ndarray: @@ -454,7 +497,11 @@ def _cross_validate(y: np.ndarray, x: sparse.csr_matrix, options: str, perm: np. def train_cost_sensitive_micro( - y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True + y: sparse.csr_matrix, + x: sparse.csr_matrix, + is_multilabel: bool, + options: str = "", + verbose: bool = True, ) -> FlatModel: """Trains a linear model for multilabel data using a one-vs-rest strategy and cross-validation to pick an optimal asymmetric misclassification cost @@ -466,12 +513,16 @@ def train_cost_sensitive_micro( Args: y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes. x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. + is_multilabel (bool): A flag indicating if the dataset is multilabel. options (str, optional): The option string passed to liblinear. Defaults to ''. verbose (bool, optional): Output extra progress information. Defaults to True. Returns: A model which can be used in predict_values. """ + if not is_multilabel: + raise ValueError("cost_sensitive_micro method doesn't support binary/multiclass datasets.") + # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/ x, options, bias = _prepare_options(x, options) @@ -510,17 +561,28 @@ def train_cost_sensitive_micro( w = _do_train(2 * yi - 1, x, final_options) weights[:, i] = w.ravel() - return FlatModel(name="cost_sensitive_micro", weights=np.asmatrix(weights), bias=bias, thresholds=0) + return FlatModel( + name="cost_sensitive_micro", + weights=np.asmatrix(weights), + bias=bias, + thresholds=0, + is_multilabel=is_multilabel, + ) def train_binary_and_multiclass( - y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True + y: sparse.csr_matrix, + x: sparse.csr_matrix, + is_multilabel: bool, + options: str = "", + verbose: bool = True, ) -> FlatModel: """Trains a linear model for binary and multi-class data. Args: y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes. x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. + is_multilabel (bool): A flag indicating if the dataset is multilabel. options (str, optional): The option string passed to liblinear. Defaults to ''. verbose (bool, optional): Output extra progress information. Defaults to True. @@ -556,7 +618,13 @@ def train_binary_and_multiclass( # For labels not appeared in training, assign thresholds to -inf so they won't be predicted. 
     thresholds = np.full(num_labels, -np.inf)
     thresholds[train_labels] = 0
-    return FlatModel(name="binary_and_multiclass", weights=np.asmatrix(weights), bias=bias, thresholds=thresholds)
+    return FlatModel(
+        name="binary_and_multiclass",
+        weights=np.asmatrix(weights),
+        bias=bias,
+        thresholds=thresholds,
+        is_multilabel=is_multilabel,
+    )


 def predict_values(model, x: sparse.csr_matrix) -> np.ndarray:
diff --git a/linear_trainer.py b/linear_trainer.py
index cbe95171..d0602eab 100644
--- a/linear_trainer.py
+++ b/linear_trainer.py
@@ -13,7 +13,7 @@ def linear_test(config, model, datasets, label_mapping):
     metrics = linear.get_metrics(
         config.monitor_metrics,
         datasets["test"]["y"].shape[1],
-        multiclass=is_multiclass_dataset(datasets["train"], "y"),
+        multiclass=not model.is_multilabel,
     )
     num_instance = datasets["test"]["x"].shape[0]
     k = config.save_k_predictions
@@ -39,7 +39,26 @@ def linear_test(config, model, datasets, label_mapping):


 def linear_train(datasets, config):
+    # detect task type
+    is_multilabel = config.get("is_multilabel", "auto")
+    if is_multilabel == "auto":
+        is_multilabel = not is_multiclass_dataset(datasets["train"], "y")
+    elif not isinstance(is_multilabel, bool):
+        raise ValueError(
+            f'"is_multilabel" is expected to be either "auto", "True", or "False". But got "{is_multilabel}" instead.'
+        )
+
+    task_type = "multilabel" if is_multilabel else "binary/multiclass"
+    logging.info(
+        f'is_multilabel is set to "{config.get("is_multilabel", "auto")}". '
+        f"Model will be trained in {task_type} mode."
+    )
+
+    # train
     if config.linear_technique == "tree":
+        if not is_multilabel:
+            raise ValueError("Tree model should only be used with multilabel datasets.")
+
         model = LINEAR_TECHNIQUES[config.linear_technique](
             datasets["train"]["y"],
             datasets["train"]["x"],
@@ -51,7 +70,8 @@
         model = LINEAR_TECHNIQUES[config.linear_technique](
             datasets["train"]["y"],
             datasets["train"]["x"],
-            config.liblinear_options,
+            is_multilabel=is_multilabel,
+            options=config.liblinear_options,
         )
     return model

From 8fa186e5bd8e6733e7180287178f8b04929ba972 Mon Sep 17 00:00:00 2001
From: Dongli He
Date: Wed, 29 Nov 2023 17:37:10 +0400
Subject: [PATCH 3/5] fix missing argument issue

---
 libmultilabel/linear/tree.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py
index 61c585db..59a0ee4c 100644
--- a/libmultilabel/linear/tree.py
+++ b/libmultilabel/linear/tree.py
@@ -48,6 +48,7 @@ def __init__(
         self.root = root
         self.flat_model = flat_model
         self.weight_map = weight_map
+        self.is_multilabel = True

     def predict_values(
         self,
@@ -203,14 +204,14 @@ def _train_node(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str, node:
         node (Node): Node to be trained.
     """
     if node.isLeaf():
-        node.model = linear.train_1vsrest(y[:, node.label_map], x, options, False)
+        node.model = linear.train_1vsrest(y[:, node.label_map], x, True, options, False)
     else:
         # meta_y[i, j] is 1 if the ith instance is relevant to the jth child.
         # getnnz returns an ndarray of shape number of instances.
         # This must be reshaped into number of instances * 1 to be interpreted as a column.
         meta_y = [y[:, child.label_map].getnnz(axis=1)[:, np.newaxis] > 0 for child in node.children]
         meta_y = sparse.csr_matrix(np.hstack(meta_y))
-        node.model = linear.train_1vsrest(meta_y, x, options, False)
+        node.model = linear.train_1vsrest(meta_y, x, True, options, False)

     node.model.weights = sparse.csc_matrix(node.model.weights)

@@ -250,6 +251,7 @@ def visit(node):
         weights=sparse.hstack(weights, "csr"),
         bias=bias,
         thresholds=0,
+        is_multilabel=True,
     )

     # w.shape[1] is the number of labels/metalabels of each node

From 09ca71a4317ad689519e305d964650d41f3fe06b Mon Sep 17 00:00:00 2001
From: Dongli He
Date: Thu, 30 Nov 2023 23:12:20 +0400
Subject: [PATCH 4/5] rewrite for reproduction purposes only

---
 libmultilabel/linear/linear.py | 11 ++---------
 linear_trainer.py              | 14 +-------------
 2 files changed, 3 insertions(+), 22 deletions(-)

diff --git a/libmultilabel/linear/linear.py b/libmultilabel/linear/linear.py
index 00799f26..3d430e69 100644
--- a/libmultilabel/linear/linear.py
+++ b/libmultilabel/linear/linear.py
@@ -186,9 +186,6 @@ def train_thresholding(
     Returns:
         A model which can be used in predict_values.
     """
-    if not is_multilabel:
-        raise ValueError("thresholding method doesn't support binary/multiclass datasets.")
-
     x, options, bias = _prepare_options(x, options)

     y = y.tocsc()
@@ -413,9 +410,6 @@ def train_cost_sensitive(
     Returns:
         A model which can be used in predict_values.
     """
-    if not is_multilabel:
-        raise ValueError("cost_sensitive method doesn't support binary/multiclass datasets.")
-
     # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
     x, options, bias = _prepare_options(x, options)

@@ -520,9 +514,6 @@ def train_cost_sensitive_micro(
     Returns:
         A model which can be used in predict_values.
     """
-    if not is_multilabel:
-        raise ValueError("cost_sensitive_micro method doesn't support binary/multiclass datasets.")
-
     # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
     x, options, bias = _prepare_options(x, options)

@@ -589,6 +580,8 @@ def train_binary_and_multiclass(
     Returns:
         A model which can be used in predict_values.
     """
+    if is_multilabel:
+        raise ValueError("binary_and_multiclass doesn't support multilabel data.")
     x, options, bias = _prepare_options(x, options)
     num_instances, num_labels = y.shape
     nonzero_instance_ids, nonzero_label_ids = y.nonzero()
diff --git a/linear_trainer.py b/linear_trainer.py
index d0602eab..449868be 100644
--- a/linear_trainer.py
+++ b/linear_trainer.py
@@ -40,19 +40,7 @@ def linear_test(config, model, datasets, label_mapping):

 def linear_train(datasets, config):
     # detect task type
-    is_multilabel = config.get("is_multilabel", "auto")
-    if is_multilabel == "auto":
-        is_multilabel = not is_multiclass_dataset(datasets["train"], "y")
-    elif not isinstance(is_multilabel, bool):
-        raise ValueError(
-            f'"is_multilabel" is expected to be either "auto", "True", or "False". But got "{is_multilabel}" instead.'
-        )
-
-    task_type = "multilabel" if is_multilabel else "binary/multiclass"
-    logging.info(
-        f'is_multilabel is set to "{config.get("is_multilabel", "auto")}". '
-        f"Model will be trained in {task_type} mode."
-    )
+    is_multilabel = not is_multiclass_dataset(datasets["train"], "y")

     # train
     if config.linear_technique == "tree":

From a4e96921e2ec50ee8dbbd8551d6712302d3889b0 Mon Sep 17 00:00:00 2001
From: Dongli He
Date: Thu, 30 Nov 2023 23:34:53 +0400
Subject: [PATCH 5/5] allow users to set the task type manually

---
 linear_trainer.py |  8 +++++++-
 torch_trainer.py  | 10 +++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/linear_trainer.py b/linear_trainer.py
index 449868be..6aafda87 100644
--- a/linear_trainer.py
+++ b/linear_trainer.py
@@ -40,7 +40,13 @@ def linear_test(config, model, datasets, label_mapping):

 def linear_train(datasets, config):
     # detect task type
-    is_multilabel = not is_multiclass_dataset(datasets["train"], "y")
+    is_multilabel = config.get("is_multilabel", "auto")
+    if is_multilabel == "auto":
+        is_multilabel = not is_multiclass_dataset(datasets["train"], "y")
+    elif not isinstance(is_multilabel, bool):
+        raise ValueError(
+            f'"is_multilabel" is expected to be either "auto", "True", or "False". But got "{is_multilabel}".'
+        )

     # train
     if config.linear_technique == "tree":
diff --git a/torch_trainer.py b/torch_trainer.py
index 0c7cf4f9..da904aea 100644
--- a/torch_trainer.py
+++ b/torch_trainer.py
@@ -65,7 +65,15 @@ def __init__(
         else:
             self.datasets = datasets

-        self.config.multiclass = is_multiclass_dataset(self.datasets["train"] + self.datasets.get("val", list()))
+        # detect task type
+        is_multilabel = self.config.get("is_multilabel", "auto")
+        if is_multilabel == "auto":
+            self.config.multiclass = is_multiclass_dataset(self.datasets["train"] + self.datasets.get("val", list()))
+        elif not isinstance(is_multilabel, bool):
+            raise ValueError(
+                f'"is_multilabel" is expected to be either "auto", "True", or "False". But got "{is_multilabel}".'
+            )
+
         self._setup_model(
             classes=classes,
             word_dict=word_dict,
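
Note: every auto-detection step in this series goes through libmultilabel.common_utils.is_multiclass_dataset, which the patches import but never show. Below is a minimal, hypothetical sketch of such a check for the linear-trainer case only; the real helper's signature and heuristics may differ. It treats a dataset as multiclass when every instance carries exactly one positive label, which is the behaviour the detection code above relies on, and the trailing comments show how the patched train_1vsrest signature would then be called.

# Simplified, hypothetical sketch of the task-type check used by this series.
# The actual libmultilabel.common_utils.is_multiclass_dataset may differ; this
# version only handles the case where dataset[label_key] is a 0/1 label matrix.
import scipy.sparse as sparse


def is_multiclass_dataset(dataset, label_key="y"):
    """Return True when every instance has exactly one positive label."""
    y = dataset[label_key]
    labels_per_instance = y.getnnz(axis=1) if sparse.issparse(y) else (y != 0).sum(axis=1)
    return bool((labels_per_instance == 1).all())


# With the API introduced by these patches, a linear technique is then invoked as:
#     is_multilabel = not is_multiclass_dataset(datasets["train"], "y")
#     model = linear.train_1vsrest(datasets["train"]["y"], datasets["train"]["x"], is_multilabel, options)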